diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,196384 @@ +{ + "best_global_step": 30650, + "best_metric": 0.12099920213222504, + "best_model_checkpoint": "saves_multiple/p-tuning/llama-3-8b-instruct/train_multirc_42_1762240404/checkpoint-30650", + "epoch": 20.0, + "eval_steps": 6130, + "global_step": 122600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008156606851549756, + "grad_norm": 314.6358337402344, + "learning_rate": 3.262642740619902e-07, + "loss": 8.3174, + "num_input_tokens_seen": 10144, + "step": 5 + }, + { + "epoch": 0.0016313213703099511, + "grad_norm": 264.6385803222656, + "learning_rate": 7.34094616639478e-07, + "loss": 7.4675, + "num_input_tokens_seen": 20704, + "step": 10 + }, + { + "epoch": 0.0024469820554649264, + "grad_norm": 224.40530395507812, + "learning_rate": 1.1419249592169658e-06, + "loss": 6.5126, + "num_input_tokens_seen": 32096, + "step": 15 + }, + { + "epoch": 0.0032626427406199023, + "grad_norm": 172.0708770751953, + "learning_rate": 1.5497553017944535e-06, + "loss": 5.3189, + "num_input_tokens_seen": 42272, + "step": 20 + }, + { + "epoch": 0.004078303425774877, + "grad_norm": 135.7971649169922, + "learning_rate": 1.957585644371941e-06, + "loss": 3.7183, + "num_input_tokens_seen": 53792, + "step": 25 + }, + { + "epoch": 0.004893964110929853, + "grad_norm": 92.28217315673828, + "learning_rate": 2.365415986949429e-06, + "loss": 3.0498, + "num_input_tokens_seen": 64864, + "step": 30 + }, + { + "epoch": 0.005709624796084829, + "grad_norm": 85.29937744140625, + "learning_rate": 2.7732463295269165e-06, + "loss": 2.1791, + "num_input_tokens_seen": 76480, + "step": 35 + }, + { + "epoch": 0.0065252854812398045, + "grad_norm": 59.62224578857422, + "learning_rate": 3.1810766721044044e-06, + "loss": 1.7123, + "num_input_tokens_seen": 87616, + "step": 40 + }, + { + "epoch": 0.00734094616639478, + "grad_norm": 110.24623107910156, + "learning_rate": 3.5889070146818927e-06, + "loss": 1.2501, + "num_input_tokens_seen": 98624, + "step": 45 + }, + { + "epoch": 0.008156606851549755, + "grad_norm": 30.345088958740234, + "learning_rate": 3.99673735725938e-06, + "loss": 0.7328, + "num_input_tokens_seen": 108896, + "step": 50 + }, + { + "epoch": 0.00897226753670473, + "grad_norm": 88.82992553710938, + "learning_rate": 4.404567699836868e-06, + "loss": 0.6449, + "num_input_tokens_seen": 119936, + "step": 55 + }, + { + "epoch": 0.009787928221859706, + "grad_norm": 21.83713722229004, + "learning_rate": 4.812398042414356e-06, + "loss": 0.4664, + "num_input_tokens_seen": 131520, + "step": 60 + }, + { + "epoch": 0.010603588907014683, + "grad_norm": 46.5600471496582, + "learning_rate": 5.2202283849918435e-06, + "loss": 0.5114, + "num_input_tokens_seen": 142912, + "step": 65 + }, + { + "epoch": 0.011419249592169658, + "grad_norm": 44.04116439819336, + "learning_rate": 5.628058727569331e-06, + "loss": 0.4, + "num_input_tokens_seen": 153600, + "step": 70 + }, + { + "epoch": 0.012234910277324634, + "grad_norm": 31.065277099609375, + "learning_rate": 6.035889070146819e-06, + "loss": 0.4806, + "num_input_tokens_seen": 164576, + "step": 75 + }, + { + "epoch": 0.013050570962479609, + "grad_norm": 58.715877532958984, + "learning_rate": 6.443719412724307e-06, + "loss": 0.4291, + "num_input_tokens_seen": 174912, + "step": 80 + }, + { + "epoch": 0.013866231647634585, + "grad_norm": 36.94560623168945, + "learning_rate": 6.851549755301794e-06, + "loss": 0.3929, + "num_input_tokens_seen": 185312, + "step": 85 + }, + { + "epoch": 0.01468189233278956, + "grad_norm": 41.26664733886719, + "learning_rate": 7.2593800978792825e-06, + "loss": 0.3971, + "num_input_tokens_seen": 196608, + "step": 90 + }, + { + "epoch": 0.015497553017944535, + "grad_norm": 52.26409149169922, + "learning_rate": 7.66721044045677e-06, + "loss": 0.5684, + "num_input_tokens_seen": 209408, + "step": 95 + }, + { + "epoch": 0.01631321370309951, + "grad_norm": 42.45866012573242, + "learning_rate": 8.075040783034257e-06, + "loss": 0.476, + "num_input_tokens_seen": 220192, + "step": 100 + }, + { + "epoch": 0.017128874388254486, + "grad_norm": 37.38993453979492, + "learning_rate": 8.482871125611746e-06, + "loss": 0.3937, + "num_input_tokens_seen": 230464, + "step": 105 + }, + { + "epoch": 0.01794453507340946, + "grad_norm": 12.51401424407959, + "learning_rate": 8.890701468189234e-06, + "loss": 0.3869, + "num_input_tokens_seen": 241664, + "step": 110 + }, + { + "epoch": 0.018760195758564437, + "grad_norm": 51.17809295654297, + "learning_rate": 9.298531810766722e-06, + "loss": 0.4125, + "num_input_tokens_seen": 253248, + "step": 115 + }, + { + "epoch": 0.01957585644371941, + "grad_norm": 34.78268814086914, + "learning_rate": 9.706362153344209e-06, + "loss": 0.3685, + "num_input_tokens_seen": 263072, + "step": 120 + }, + { + "epoch": 0.020391517128874388, + "grad_norm": 18.864459991455078, + "learning_rate": 1.0114192495921697e-05, + "loss": 0.352, + "num_input_tokens_seen": 274944, + "step": 125 + }, + { + "epoch": 0.021207177814029365, + "grad_norm": 17.730932235717773, + "learning_rate": 1.0522022838499184e-05, + "loss": 0.3605, + "num_input_tokens_seen": 285056, + "step": 130 + }, + { + "epoch": 0.02202283849918434, + "grad_norm": 33.00218200683594, + "learning_rate": 1.0929853181076672e-05, + "loss": 0.3986, + "num_input_tokens_seen": 296064, + "step": 135 + }, + { + "epoch": 0.022838499184339316, + "grad_norm": 20.735210418701172, + "learning_rate": 1.1337683523654159e-05, + "loss": 0.4069, + "num_input_tokens_seen": 307136, + "step": 140 + }, + { + "epoch": 0.02365415986949429, + "grad_norm": 21.635116577148438, + "learning_rate": 1.1745513866231649e-05, + "loss": 0.4322, + "num_input_tokens_seen": 317152, + "step": 145 + }, + { + "epoch": 0.024469820554649267, + "grad_norm": 28.22029685974121, + "learning_rate": 1.2153344208809135e-05, + "loss": 0.4022, + "num_input_tokens_seen": 327520, + "step": 150 + }, + { + "epoch": 0.02528548123980424, + "grad_norm": 13.397923469543457, + "learning_rate": 1.2561174551386624e-05, + "loss": 0.5004, + "num_input_tokens_seen": 338720, + "step": 155 + }, + { + "epoch": 0.026101141924959218, + "grad_norm": 16.55824089050293, + "learning_rate": 1.296900489396411e-05, + "loss": 0.3486, + "num_input_tokens_seen": 349824, + "step": 160 + }, + { + "epoch": 0.026916802610114192, + "grad_norm": 39.6685676574707, + "learning_rate": 1.3376835236541599e-05, + "loss": 0.3352, + "num_input_tokens_seen": 360576, + "step": 165 + }, + { + "epoch": 0.02773246329526917, + "grad_norm": 57.711448669433594, + "learning_rate": 1.3784665579119085e-05, + "loss": 0.3404, + "num_input_tokens_seen": 371328, + "step": 170 + }, + { + "epoch": 0.028548123980424143, + "grad_norm": 63.481502532958984, + "learning_rate": 1.4192495921696575e-05, + "loss": 0.4694, + "num_input_tokens_seen": 382624, + "step": 175 + }, + { + "epoch": 0.02936378466557912, + "grad_norm": 175.08531188964844, + "learning_rate": 1.4600326264274062e-05, + "loss": 0.4874, + "num_input_tokens_seen": 395232, + "step": 180 + }, + { + "epoch": 0.030179445350734094, + "grad_norm": 53.07209777832031, + "learning_rate": 1.500815660685155e-05, + "loss": 0.7018, + "num_input_tokens_seen": 406784, + "step": 185 + }, + { + "epoch": 0.03099510603588907, + "grad_norm": 16.1356143951416, + "learning_rate": 1.5415986949429037e-05, + "loss": 0.416, + "num_input_tokens_seen": 417248, + "step": 190 + }, + { + "epoch": 0.03181076672104405, + "grad_norm": 11.130064010620117, + "learning_rate": 1.5823817292006523e-05, + "loss": 0.4056, + "num_input_tokens_seen": 428224, + "step": 195 + }, + { + "epoch": 0.03262642740619902, + "grad_norm": 8.214327812194824, + "learning_rate": 1.6231647634584013e-05, + "loss": 0.3701, + "num_input_tokens_seen": 437312, + "step": 200 + }, + { + "epoch": 0.033442088091353996, + "grad_norm": 19.780445098876953, + "learning_rate": 1.66394779771615e-05, + "loss": 0.3989, + "num_input_tokens_seen": 447776, + "step": 205 + }, + { + "epoch": 0.03425774877650897, + "grad_norm": 23.08263397216797, + "learning_rate": 1.704730831973899e-05, + "loss": 0.3959, + "num_input_tokens_seen": 459040, + "step": 210 + }, + { + "epoch": 0.03507340946166395, + "grad_norm": 24.81744384765625, + "learning_rate": 1.7455138662316477e-05, + "loss": 0.3368, + "num_input_tokens_seen": 470656, + "step": 215 + }, + { + "epoch": 0.03588907014681892, + "grad_norm": 10.550070762634277, + "learning_rate": 1.7862969004893963e-05, + "loss": 0.3761, + "num_input_tokens_seen": 480864, + "step": 220 + }, + { + "epoch": 0.0367047308319739, + "grad_norm": 13.143187522888184, + "learning_rate": 1.8270799347471453e-05, + "loss": 0.349, + "num_input_tokens_seen": 491680, + "step": 225 + }, + { + "epoch": 0.037520391517128875, + "grad_norm": 13.331697463989258, + "learning_rate": 1.867862969004894e-05, + "loss": 0.3603, + "num_input_tokens_seen": 502432, + "step": 230 + }, + { + "epoch": 0.03833605220228385, + "grad_norm": 7.329267501831055, + "learning_rate": 1.908646003262643e-05, + "loss": 0.3927, + "num_input_tokens_seen": 512800, + "step": 235 + }, + { + "epoch": 0.03915171288743882, + "grad_norm": 15.567852973937988, + "learning_rate": 1.9494290375203913e-05, + "loss": 0.4135, + "num_input_tokens_seen": 523424, + "step": 240 + }, + { + "epoch": 0.0399673735725938, + "grad_norm": 7.734094142913818, + "learning_rate": 1.9902120717781403e-05, + "loss": 0.3361, + "num_input_tokens_seen": 533600, + "step": 245 + }, + { + "epoch": 0.040783034257748776, + "grad_norm": 36.39061737060547, + "learning_rate": 2.0309951060358893e-05, + "loss": 0.411, + "num_input_tokens_seen": 544576, + "step": 250 + }, + { + "epoch": 0.041598694942903754, + "grad_norm": 18.676137924194336, + "learning_rate": 2.071778140293638e-05, + "loss": 0.41, + "num_input_tokens_seen": 554784, + "step": 255 + }, + { + "epoch": 0.04241435562805873, + "grad_norm": 7.464500427246094, + "learning_rate": 2.1125611745513866e-05, + "loss": 0.39, + "num_input_tokens_seen": 565632, + "step": 260 + }, + { + "epoch": 0.0432300163132137, + "grad_norm": 9.962908744812012, + "learning_rate": 2.1533442088091353e-05, + "loss": 0.3545, + "num_input_tokens_seen": 576000, + "step": 265 + }, + { + "epoch": 0.04404567699836868, + "grad_norm": 9.480332374572754, + "learning_rate": 2.1941272430668843e-05, + "loss": 0.3468, + "num_input_tokens_seen": 586816, + "step": 270 + }, + { + "epoch": 0.044861337683523655, + "grad_norm": 13.83477783203125, + "learning_rate": 2.234910277324633e-05, + "loss": 0.3342, + "num_input_tokens_seen": 597568, + "step": 275 + }, + { + "epoch": 0.04567699836867863, + "grad_norm": 7.380682468414307, + "learning_rate": 2.2756933115823816e-05, + "loss": 0.36, + "num_input_tokens_seen": 608064, + "step": 280 + }, + { + "epoch": 0.0464926590538336, + "grad_norm": 14.0297269821167, + "learning_rate": 2.3164763458401306e-05, + "loss": 0.3185, + "num_input_tokens_seen": 619904, + "step": 285 + }, + { + "epoch": 0.04730831973898858, + "grad_norm": 43.77539825439453, + "learning_rate": 2.3572593800978793e-05, + "loss": 0.484, + "num_input_tokens_seen": 630560, + "step": 290 + }, + { + "epoch": 0.04812398042414356, + "grad_norm": 30.18943977355957, + "learning_rate": 2.3980424143556283e-05, + "loss": 0.3715, + "num_input_tokens_seen": 641184, + "step": 295 + }, + { + "epoch": 0.048939641109298535, + "grad_norm": 12.309725761413574, + "learning_rate": 2.4388254486133766e-05, + "loss": 0.4153, + "num_input_tokens_seen": 651424, + "step": 300 + }, + { + "epoch": 0.049755301794453505, + "grad_norm": 14.924318313598633, + "learning_rate": 2.4796084828711256e-05, + "loss": 0.3906, + "num_input_tokens_seen": 663072, + "step": 305 + }, + { + "epoch": 0.05057096247960848, + "grad_norm": 9.72663688659668, + "learning_rate": 2.5203915171288743e-05, + "loss": 0.396, + "num_input_tokens_seen": 672416, + "step": 310 + }, + { + "epoch": 0.05138662316476346, + "grad_norm": 11.389423370361328, + "learning_rate": 2.5611745513866233e-05, + "loss": 0.4705, + "num_input_tokens_seen": 683008, + "step": 315 + }, + { + "epoch": 0.052202283849918436, + "grad_norm": 52.0793571472168, + "learning_rate": 2.6019575856443723e-05, + "loss": 0.4732, + "num_input_tokens_seen": 694976, + "step": 320 + }, + { + "epoch": 0.05301794453507341, + "grad_norm": 10.719328880310059, + "learning_rate": 2.6427406199021206e-05, + "loss": 0.4089, + "num_input_tokens_seen": 705824, + "step": 325 + }, + { + "epoch": 0.053833605220228384, + "grad_norm": 6.3341474533081055, + "learning_rate": 2.6835236541598696e-05, + "loss": 0.4339, + "num_input_tokens_seen": 717088, + "step": 330 + }, + { + "epoch": 0.05464926590538336, + "grad_norm": 5.4407782554626465, + "learning_rate": 2.7243066884176183e-05, + "loss": 0.4164, + "num_input_tokens_seen": 728768, + "step": 335 + }, + { + "epoch": 0.05546492659053834, + "grad_norm": 4.411542892456055, + "learning_rate": 2.7650897226753673e-05, + "loss": 0.4142, + "num_input_tokens_seen": 738528, + "step": 340 + }, + { + "epoch": 0.05628058727569331, + "grad_norm": 10.264647483825684, + "learning_rate": 2.805872756933116e-05, + "loss": 0.3912, + "num_input_tokens_seen": 749696, + "step": 345 + }, + { + "epoch": 0.057096247960848286, + "grad_norm": 6.780503273010254, + "learning_rate": 2.8466557911908646e-05, + "loss": 0.3433, + "num_input_tokens_seen": 760672, + "step": 350 + }, + { + "epoch": 0.05791190864600326, + "grad_norm": 2.577894449234009, + "learning_rate": 2.8874388254486136e-05, + "loss": 0.3593, + "num_input_tokens_seen": 771712, + "step": 355 + }, + { + "epoch": 0.05872756933115824, + "grad_norm": 5.244052410125732, + "learning_rate": 2.9282218597063623e-05, + "loss": 0.3646, + "num_input_tokens_seen": 782912, + "step": 360 + }, + { + "epoch": 0.05954323001631321, + "grad_norm": 9.439083099365234, + "learning_rate": 2.969004893964111e-05, + "loss": 0.3592, + "num_input_tokens_seen": 794368, + "step": 365 + }, + { + "epoch": 0.06035889070146819, + "grad_norm": 2.9597246646881104, + "learning_rate": 3.0097879282218596e-05, + "loss": 0.3586, + "num_input_tokens_seen": 804704, + "step": 370 + }, + { + "epoch": 0.061174551386623165, + "grad_norm": 4.3669633865356445, + "learning_rate": 3.0505709624796086e-05, + "loss": 0.3525, + "num_input_tokens_seen": 815872, + "step": 375 + }, + { + "epoch": 0.06199021207177814, + "grad_norm": 4.579859733581543, + "learning_rate": 3.0913539967373576e-05, + "loss": 0.3348, + "num_input_tokens_seen": 826912, + "step": 380 + }, + { + "epoch": 0.06280587275693311, + "grad_norm": 6.14414119720459, + "learning_rate": 3.132137030995106e-05, + "loss": 0.4138, + "num_input_tokens_seen": 836864, + "step": 385 + }, + { + "epoch": 0.0636215334420881, + "grad_norm": 14.99302864074707, + "learning_rate": 3.172920065252855e-05, + "loss": 0.4467, + "num_input_tokens_seen": 848448, + "step": 390 + }, + { + "epoch": 0.06443719412724307, + "grad_norm": 8.607397079467773, + "learning_rate": 3.213703099510604e-05, + "loss": 0.3861, + "num_input_tokens_seen": 858976, + "step": 395 + }, + { + "epoch": 0.06525285481239804, + "grad_norm": 12.14357852935791, + "learning_rate": 3.254486133768352e-05, + "loss": 0.4364, + "num_input_tokens_seen": 869664, + "step": 400 + }, + { + "epoch": 0.06606851549755302, + "grad_norm": 7.592681884765625, + "learning_rate": 3.295269168026101e-05, + "loss": 0.3747, + "num_input_tokens_seen": 881696, + "step": 405 + }, + { + "epoch": 0.06688417618270799, + "grad_norm": 9.035486221313477, + "learning_rate": 3.33605220228385e-05, + "loss": 0.5381, + "num_input_tokens_seen": 893440, + "step": 410 + }, + { + "epoch": 0.06769983686786298, + "grad_norm": 4.473764896392822, + "learning_rate": 3.3768352365415986e-05, + "loss": 0.3439, + "num_input_tokens_seen": 904224, + "step": 415 + }, + { + "epoch": 0.06851549755301795, + "grad_norm": 4.291823863983154, + "learning_rate": 3.4176182707993476e-05, + "loss": 0.4344, + "num_input_tokens_seen": 915776, + "step": 420 + }, + { + "epoch": 0.06933115823817292, + "grad_norm": 24.67149543762207, + "learning_rate": 3.458401305057096e-05, + "loss": 0.383, + "num_input_tokens_seen": 926656, + "step": 425 + }, + { + "epoch": 0.0701468189233279, + "grad_norm": 27.043270111083984, + "learning_rate": 3.4991843393148456e-05, + "loss": 0.4272, + "num_input_tokens_seen": 937728, + "step": 430 + }, + { + "epoch": 0.07096247960848287, + "grad_norm": 13.670089721679688, + "learning_rate": 3.539967373572594e-05, + "loss": 0.4359, + "num_input_tokens_seen": 948800, + "step": 435 + }, + { + "epoch": 0.07177814029363784, + "grad_norm": 17.373754501342773, + "learning_rate": 3.580750407830342e-05, + "loss": 0.3757, + "num_input_tokens_seen": 959712, + "step": 440 + }, + { + "epoch": 0.07259380097879282, + "grad_norm": 7.593050479888916, + "learning_rate": 3.621533442088092e-05, + "loss": 0.2887, + "num_input_tokens_seen": 970592, + "step": 445 + }, + { + "epoch": 0.0734094616639478, + "grad_norm": 19.885852813720703, + "learning_rate": 3.66231647634584e-05, + "loss": 0.3668, + "num_input_tokens_seen": 981632, + "step": 450 + }, + { + "epoch": 0.07422512234910278, + "grad_norm": 3.8135032653808594, + "learning_rate": 3.703099510603589e-05, + "loss": 0.3811, + "num_input_tokens_seen": 991680, + "step": 455 + }, + { + "epoch": 0.07504078303425775, + "grad_norm": 6.083225250244141, + "learning_rate": 3.7438825448613375e-05, + "loss": 0.3688, + "num_input_tokens_seen": 1002880, + "step": 460 + }, + { + "epoch": 0.07585644371941272, + "grad_norm": 4.485177993774414, + "learning_rate": 3.7846655791190865e-05, + "loss": 0.3793, + "num_input_tokens_seen": 1015072, + "step": 465 + }, + { + "epoch": 0.0766721044045677, + "grad_norm": 2.079634666442871, + "learning_rate": 3.8254486133768355e-05, + "loss": 0.3835, + "num_input_tokens_seen": 1026176, + "step": 470 + }, + { + "epoch": 0.07748776508972267, + "grad_norm": 5.577203750610352, + "learning_rate": 3.866231647634584e-05, + "loss": 0.3721, + "num_input_tokens_seen": 1037632, + "step": 475 + }, + { + "epoch": 0.07830342577487764, + "grad_norm": 6.679996967315674, + "learning_rate": 3.907014681892333e-05, + "loss": 0.3572, + "num_input_tokens_seen": 1047040, + "step": 480 + }, + { + "epoch": 0.07911908646003263, + "grad_norm": 75.6471176147461, + "learning_rate": 3.947797716150082e-05, + "loss": 0.332, + "num_input_tokens_seen": 1057344, + "step": 485 + }, + { + "epoch": 0.0799347471451876, + "grad_norm": 1.4969229698181152, + "learning_rate": 3.98858075040783e-05, + "loss": 0.3001, + "num_input_tokens_seen": 1068064, + "step": 490 + }, + { + "epoch": 0.08075040783034258, + "grad_norm": 79.28744506835938, + "learning_rate": 4.029363784665579e-05, + "loss": 0.5147, + "num_input_tokens_seen": 1078560, + "step": 495 + }, + { + "epoch": 0.08156606851549755, + "grad_norm": 54.21046829223633, + "learning_rate": 4.070146818923328e-05, + "loss": 0.2397, + "num_input_tokens_seen": 1089824, + "step": 500 + }, + { + "epoch": 0.08238172920065252, + "grad_norm": 14.4324369430542, + "learning_rate": 4.1109298531810765e-05, + "loss": 0.6049, + "num_input_tokens_seen": 1101120, + "step": 505 + }, + { + "epoch": 0.08319738988580751, + "grad_norm": 21.910572052001953, + "learning_rate": 4.1517128874388255e-05, + "loss": 0.6272, + "num_input_tokens_seen": 1111840, + "step": 510 + }, + { + "epoch": 0.08401305057096248, + "grad_norm": 12.105114936828613, + "learning_rate": 4.1924959216965745e-05, + "loss": 0.5606, + "num_input_tokens_seen": 1122720, + "step": 515 + }, + { + "epoch": 0.08482871125611746, + "grad_norm": 0.32360613346099854, + "learning_rate": 4.233278955954323e-05, + "loss": 0.1157, + "num_input_tokens_seen": 1131808, + "step": 520 + }, + { + "epoch": 0.08564437194127243, + "grad_norm": 1.4238195419311523, + "learning_rate": 4.274061990212072e-05, + "loss": 0.2238, + "num_input_tokens_seen": 1142624, + "step": 525 + }, + { + "epoch": 0.0864600326264274, + "grad_norm": 10.996274948120117, + "learning_rate": 4.314845024469821e-05, + "loss": 0.2579, + "num_input_tokens_seen": 1153536, + "step": 530 + }, + { + "epoch": 0.08727569331158239, + "grad_norm": 0.11020765453577042, + "learning_rate": 4.35562805872757e-05, + "loss": 0.1138, + "num_input_tokens_seen": 1164960, + "step": 535 + }, + { + "epoch": 0.08809135399673736, + "grad_norm": 4.610954761505127, + "learning_rate": 4.396411092985318e-05, + "loss": 0.102, + "num_input_tokens_seen": 1175552, + "step": 540 + }, + { + "epoch": 0.08890701468189233, + "grad_norm": 25.28290367126465, + "learning_rate": 4.4371941272430665e-05, + "loss": 0.3639, + "num_input_tokens_seen": 1186624, + "step": 545 + }, + { + "epoch": 0.08972267536704731, + "grad_norm": 10.82431411743164, + "learning_rate": 4.477977161500816e-05, + "loss": 1.4397, + "num_input_tokens_seen": 1198336, + "step": 550 + }, + { + "epoch": 0.09053833605220228, + "grad_norm": 6.7620134353637695, + "learning_rate": 4.5187601957585645e-05, + "loss": 0.3143, + "num_input_tokens_seen": 1209760, + "step": 555 + }, + { + "epoch": 0.09135399673735727, + "grad_norm": 5.035055160522461, + "learning_rate": 4.559543230016313e-05, + "loss": 0.3098, + "num_input_tokens_seen": 1219936, + "step": 560 + }, + { + "epoch": 0.09216965742251224, + "grad_norm": 0.12749716639518738, + "learning_rate": 4.6003262642740625e-05, + "loss": 0.194, + "num_input_tokens_seen": 1231456, + "step": 565 + }, + { + "epoch": 0.0929853181076672, + "grad_norm": 7.260269641876221, + "learning_rate": 4.641109298531811e-05, + "loss": 0.1057, + "num_input_tokens_seen": 1241952, + "step": 570 + }, + { + "epoch": 0.09380097879282219, + "grad_norm": 0.058566510677337646, + "learning_rate": 4.68189233278956e-05, + "loss": 0.3255, + "num_input_tokens_seen": 1253920, + "step": 575 + }, + { + "epoch": 0.09461663947797716, + "grad_norm": 14.459625244140625, + "learning_rate": 4.722675367047308e-05, + "loss": 0.1501, + "num_input_tokens_seen": 1265632, + "step": 580 + }, + { + "epoch": 0.09543230016313213, + "grad_norm": 22.226320266723633, + "learning_rate": 4.763458401305057e-05, + "loss": 0.692, + "num_input_tokens_seen": 1276256, + "step": 585 + }, + { + "epoch": 0.09624796084828711, + "grad_norm": 1.782360553741455, + "learning_rate": 4.804241435562806e-05, + "loss": 0.2242, + "num_input_tokens_seen": 1287424, + "step": 590 + }, + { + "epoch": 0.09706362153344208, + "grad_norm": 0.1359458863735199, + "learning_rate": 4.8450244698205544e-05, + "loss": 0.0625, + "num_input_tokens_seen": 1298272, + "step": 595 + }, + { + "epoch": 0.09787928221859707, + "grad_norm": 7.373677730560303, + "learning_rate": 4.885807504078304e-05, + "loss": 0.2317, + "num_input_tokens_seen": 1309632, + "step": 600 + }, + { + "epoch": 0.09869494290375204, + "grad_norm": 0.5091097950935364, + "learning_rate": 4.9265905383360524e-05, + "loss": 0.4314, + "num_input_tokens_seen": 1319264, + "step": 605 + }, + { + "epoch": 0.09951060358890701, + "grad_norm": 11.11815071105957, + "learning_rate": 4.967373572593801e-05, + "loss": 0.3954, + "num_input_tokens_seen": 1330144, + "step": 610 + }, + { + "epoch": 0.100326264274062, + "grad_norm": 6.977790832519531, + "learning_rate": 5.00815660685155e-05, + "loss": 0.2213, + "num_input_tokens_seen": 1340640, + "step": 615 + }, + { + "epoch": 0.10114192495921696, + "grad_norm": 9.725613594055176, + "learning_rate": 5.048939641109299e-05, + "loss": 0.4465, + "num_input_tokens_seen": 1351136, + "step": 620 + }, + { + "epoch": 0.10195758564437195, + "grad_norm": 3.7128005027770996, + "learning_rate": 5.089722675367047e-05, + "loss": 0.2263, + "num_input_tokens_seen": 1363104, + "step": 625 + }, + { + "epoch": 0.10277324632952692, + "grad_norm": 3.203340530395508, + "learning_rate": 5.130505709624796e-05, + "loss": 0.235, + "num_input_tokens_seen": 1373856, + "step": 630 + }, + { + "epoch": 0.10358890701468189, + "grad_norm": 1.902295708656311, + "learning_rate": 5.171288743882545e-05, + "loss": 0.1787, + "num_input_tokens_seen": 1385984, + "step": 635 + }, + { + "epoch": 0.10440456769983687, + "grad_norm": 4.927067279815674, + "learning_rate": 5.212071778140294e-05, + "loss": 0.1436, + "num_input_tokens_seen": 1397472, + "step": 640 + }, + { + "epoch": 0.10522022838499184, + "grad_norm": 2.7606987953186035, + "learning_rate": 5.2528548123980424e-05, + "loss": 0.184, + "num_input_tokens_seen": 1407744, + "step": 645 + }, + { + "epoch": 0.10603588907014681, + "grad_norm": 7.337327480316162, + "learning_rate": 5.293637846655791e-05, + "loss": 0.4137, + "num_input_tokens_seen": 1418688, + "step": 650 + }, + { + "epoch": 0.1068515497553018, + "grad_norm": 0.5223647952079773, + "learning_rate": 5.3344208809135404e-05, + "loss": 0.2377, + "num_input_tokens_seen": 1429056, + "step": 655 + }, + { + "epoch": 0.10766721044045677, + "grad_norm": 20.811302185058594, + "learning_rate": 5.375203915171289e-05, + "loss": 0.3604, + "num_input_tokens_seen": 1439264, + "step": 660 + }, + { + "epoch": 0.10848287112561175, + "grad_norm": 12.334787368774414, + "learning_rate": 5.415986949429037e-05, + "loss": 0.4648, + "num_input_tokens_seen": 1451072, + "step": 665 + }, + { + "epoch": 0.10929853181076672, + "grad_norm": 0.2776586413383484, + "learning_rate": 5.456769983686787e-05, + "loss": 0.2371, + "num_input_tokens_seen": 1460640, + "step": 670 + }, + { + "epoch": 0.11011419249592169, + "grad_norm": 5.741067886352539, + "learning_rate": 5.497553017944535e-05, + "loss": 0.2627, + "num_input_tokens_seen": 1472224, + "step": 675 + }, + { + "epoch": 0.11092985318107668, + "grad_norm": 4.412654876708984, + "learning_rate": 5.538336052202284e-05, + "loss": 0.4448, + "num_input_tokens_seen": 1483744, + "step": 680 + }, + { + "epoch": 0.11174551386623165, + "grad_norm": 5.101598739624023, + "learning_rate": 5.579119086460033e-05, + "loss": 0.2309, + "num_input_tokens_seen": 1495552, + "step": 685 + }, + { + "epoch": 0.11256117455138662, + "grad_norm": 4.5909271240234375, + "learning_rate": 5.6199021207177814e-05, + "loss": 0.1059, + "num_input_tokens_seen": 1506880, + "step": 690 + }, + { + "epoch": 0.1133768352365416, + "grad_norm": 9.4280366897583, + "learning_rate": 5.6606851549755304e-05, + "loss": 0.4037, + "num_input_tokens_seen": 1518336, + "step": 695 + }, + { + "epoch": 0.11419249592169657, + "grad_norm": 2.6467490196228027, + "learning_rate": 5.701468189233279e-05, + "loss": 0.169, + "num_input_tokens_seen": 1528704, + "step": 700 + }, + { + "epoch": 0.11500815660685156, + "grad_norm": 8.759927749633789, + "learning_rate": 5.7422512234910284e-05, + "loss": 0.2088, + "num_input_tokens_seen": 1539552, + "step": 705 + }, + { + "epoch": 0.11582381729200653, + "grad_norm": 1.059167742729187, + "learning_rate": 5.783034257748777e-05, + "loss": 0.1942, + "num_input_tokens_seen": 1549696, + "step": 710 + }, + { + "epoch": 0.1166394779771615, + "grad_norm": 4.634883403778076, + "learning_rate": 5.823817292006525e-05, + "loss": 0.1503, + "num_input_tokens_seen": 1559040, + "step": 715 + }, + { + "epoch": 0.11745513866231648, + "grad_norm": 1.964306116104126, + "learning_rate": 5.864600326264275e-05, + "loss": 0.3159, + "num_input_tokens_seen": 1570144, + "step": 720 + }, + { + "epoch": 0.11827079934747145, + "grad_norm": 3.4446094036102295, + "learning_rate": 5.905383360522023e-05, + "loss": 0.2283, + "num_input_tokens_seen": 1579776, + "step": 725 + }, + { + "epoch": 0.11908646003262642, + "grad_norm": 1069.54541015625, + "learning_rate": 5.9461663947797714e-05, + "loss": 3.0837, + "num_input_tokens_seen": 1589120, + "step": 730 + }, + { + "epoch": 0.1199021207177814, + "grad_norm": 227.7669219970703, + "learning_rate": 5.9869494290375204e-05, + "loss": 2.2193, + "num_input_tokens_seen": 1600512, + "step": 735 + }, + { + "epoch": 0.12071778140293637, + "grad_norm": 2.590147018432617, + "learning_rate": 6.0277324632952694e-05, + "loss": 0.7939, + "num_input_tokens_seen": 1610880, + "step": 740 + }, + { + "epoch": 0.12153344208809136, + "grad_norm": 16.200777053833008, + "learning_rate": 6.0685154975530184e-05, + "loss": 0.2326, + "num_input_tokens_seen": 1620896, + "step": 745 + }, + { + "epoch": 0.12234910277324633, + "grad_norm": 6.109814643859863, + "learning_rate": 6.109298531810767e-05, + "loss": 0.131, + "num_input_tokens_seen": 1631232, + "step": 750 + }, + { + "epoch": 0.1231647634584013, + "grad_norm": 1.4488592147827148, + "learning_rate": 6.150081566068516e-05, + "loss": 0.1412, + "num_input_tokens_seen": 1642688, + "step": 755 + }, + { + "epoch": 0.12398042414355628, + "grad_norm": 12.674674034118652, + "learning_rate": 6.190864600326265e-05, + "loss": 0.4573, + "num_input_tokens_seen": 1653792, + "step": 760 + }, + { + "epoch": 0.12479608482871125, + "grad_norm": 75.09253692626953, + "learning_rate": 6.231647634584014e-05, + "loss": 2.002, + "num_input_tokens_seen": 1664160, + "step": 765 + }, + { + "epoch": 0.12561174551386622, + "grad_norm": 77.74810028076172, + "learning_rate": 6.272430668841763e-05, + "loss": 6.3921, + "num_input_tokens_seen": 1675232, + "step": 770 + }, + { + "epoch": 0.1264274061990212, + "grad_norm": 37.449378967285156, + "learning_rate": 6.31321370309951e-05, + "loss": 4.1665, + "num_input_tokens_seen": 1685280, + "step": 775 + }, + { + "epoch": 0.1272430668841762, + "grad_norm": 11.413688659667969, + "learning_rate": 6.35399673735726e-05, + "loss": 1.2492, + "num_input_tokens_seen": 1694848, + "step": 780 + }, + { + "epoch": 0.12805872756933115, + "grad_norm": 16.399850845336914, + "learning_rate": 6.394779771615008e-05, + "loss": 0.4529, + "num_input_tokens_seen": 1705600, + "step": 785 + }, + { + "epoch": 0.12887438825448613, + "grad_norm": 8.579042434692383, + "learning_rate": 6.435562805872756e-05, + "loss": 0.5815, + "num_input_tokens_seen": 1717152, + "step": 790 + }, + { + "epoch": 0.12969004893964112, + "grad_norm": 293.99151611328125, + "learning_rate": 6.476345840130505e-05, + "loss": 0.5153, + "num_input_tokens_seen": 1728288, + "step": 795 + }, + { + "epoch": 0.13050570962479607, + "grad_norm": 9.329829216003418, + "learning_rate": 6.517128874388255e-05, + "loss": 0.2736, + "num_input_tokens_seen": 1739936, + "step": 800 + }, + { + "epoch": 0.13132137030995106, + "grad_norm": 7.285165309906006, + "learning_rate": 6.557911908646004e-05, + "loss": 0.1994, + "num_input_tokens_seen": 1750656, + "step": 805 + }, + { + "epoch": 0.13213703099510604, + "grad_norm": 12.667640686035156, + "learning_rate": 6.598694942903752e-05, + "loss": 0.3218, + "num_input_tokens_seen": 1759392, + "step": 810 + }, + { + "epoch": 0.132952691680261, + "grad_norm": 5.781430244445801, + "learning_rate": 6.639477977161501e-05, + "loss": 0.2163, + "num_input_tokens_seen": 1770816, + "step": 815 + }, + { + "epoch": 0.13376835236541598, + "grad_norm": 1.2714248895645142, + "learning_rate": 6.68026101141925e-05, + "loss": 0.155, + "num_input_tokens_seen": 1781088, + "step": 820 + }, + { + "epoch": 0.13458401305057097, + "grad_norm": 25.848276138305664, + "learning_rate": 6.721044045676998e-05, + "loss": 0.2787, + "num_input_tokens_seen": 1790912, + "step": 825 + }, + { + "epoch": 0.13539967373572595, + "grad_norm": 0.10717128962278366, + "learning_rate": 6.761827079934747e-05, + "loss": 0.0285, + "num_input_tokens_seen": 1800736, + "step": 830 + }, + { + "epoch": 0.1362153344208809, + "grad_norm": 5.4765625, + "learning_rate": 6.802610114192497e-05, + "loss": 0.2213, + "num_input_tokens_seen": 1810016, + "step": 835 + }, + { + "epoch": 0.1370309951060359, + "grad_norm": 5.046440601348877, + "learning_rate": 6.843393148450245e-05, + "loss": 0.3239, + "num_input_tokens_seen": 1820032, + "step": 840 + }, + { + "epoch": 0.13784665579119088, + "grad_norm": 0.4740993082523346, + "learning_rate": 6.884176182707994e-05, + "loss": 0.2064, + "num_input_tokens_seen": 1830080, + "step": 845 + }, + { + "epoch": 0.13866231647634583, + "grad_norm": 6.393681049346924, + "learning_rate": 6.924959216965743e-05, + "loss": 0.3382, + "num_input_tokens_seen": 1840832, + "step": 850 + }, + { + "epoch": 0.13947797716150082, + "grad_norm": 4.807803630828857, + "learning_rate": 6.96574225122349e-05, + "loss": 0.4432, + "num_input_tokens_seen": 1852000, + "step": 855 + }, + { + "epoch": 0.1402936378466558, + "grad_norm": 0.19360409677028656, + "learning_rate": 7.006525285481239e-05, + "loss": 0.2847, + "num_input_tokens_seen": 1863264, + "step": 860 + }, + { + "epoch": 0.14110929853181076, + "grad_norm": 3.193110227584839, + "learning_rate": 7.047308319738988e-05, + "loss": 0.2202, + "num_input_tokens_seen": 1875104, + "step": 865 + }, + { + "epoch": 0.14192495921696574, + "grad_norm": 1.3773865699768066, + "learning_rate": 7.088091353996739e-05, + "loss": 0.143, + "num_input_tokens_seen": 1885088, + "step": 870 + }, + { + "epoch": 0.14274061990212072, + "grad_norm": 2.0674312114715576, + "learning_rate": 7.128874388254486e-05, + "loss": 0.2554, + "num_input_tokens_seen": 1895424, + "step": 875 + }, + { + "epoch": 0.14355628058727568, + "grad_norm": 3.5503621101379395, + "learning_rate": 7.169657422512235e-05, + "loss": 0.0546, + "num_input_tokens_seen": 1906048, + "step": 880 + }, + { + "epoch": 0.14437194127243066, + "grad_norm": 1.0717246532440186, + "learning_rate": 7.210440456769984e-05, + "loss": 0.017, + "num_input_tokens_seen": 1916320, + "step": 885 + }, + { + "epoch": 0.14518760195758565, + "grad_norm": 8.777630805969238, + "learning_rate": 7.251223491027732e-05, + "loss": 0.1349, + "num_input_tokens_seen": 1926880, + "step": 890 + }, + { + "epoch": 0.14600326264274063, + "grad_norm": 0.08770612627267838, + "learning_rate": 7.292006525285481e-05, + "loss": 0.1599, + "num_input_tokens_seen": 1937792, + "step": 895 + }, + { + "epoch": 0.1468189233278956, + "grad_norm": 1.7737507820129395, + "learning_rate": 7.332789559543231e-05, + "loss": 0.4864, + "num_input_tokens_seen": 1950048, + "step": 900 + }, + { + "epoch": 0.14763458401305057, + "grad_norm": 14.238997459411621, + "learning_rate": 7.373572593800979e-05, + "loss": 0.5282, + "num_input_tokens_seen": 1960352, + "step": 905 + }, + { + "epoch": 0.14845024469820556, + "grad_norm": 7.535554885864258, + "learning_rate": 7.414355628058728e-05, + "loss": 0.2474, + "num_input_tokens_seen": 1970784, + "step": 910 + }, + { + "epoch": 0.14926590538336051, + "grad_norm": 2.2550277709960938, + "learning_rate": 7.455138662316477e-05, + "loss": 0.3021, + "num_input_tokens_seen": 1982528, + "step": 915 + }, + { + "epoch": 0.1500815660685155, + "grad_norm": 0.23242764174938202, + "learning_rate": 7.495921696574225e-05, + "loss": 0.23, + "num_input_tokens_seen": 1992704, + "step": 920 + }, + { + "epoch": 0.15089722675367048, + "grad_norm": 4.0261101722717285, + "learning_rate": 7.536704730831974e-05, + "loss": 0.2117, + "num_input_tokens_seen": 2003328, + "step": 925 + }, + { + "epoch": 0.15171288743882544, + "grad_norm": 1.9249851703643799, + "learning_rate": 7.577487765089723e-05, + "loss": 0.1221, + "num_input_tokens_seen": 2013696, + "step": 930 + }, + { + "epoch": 0.15252854812398042, + "grad_norm": 0.0670514702796936, + "learning_rate": 7.618270799347473e-05, + "loss": 0.1289, + "num_input_tokens_seen": 2024288, + "step": 935 + }, + { + "epoch": 0.1533442088091354, + "grad_norm": 4.352898597717285, + "learning_rate": 7.65905383360522e-05, + "loss": 0.1985, + "num_input_tokens_seen": 2035296, + "step": 940 + }, + { + "epoch": 0.15415986949429036, + "grad_norm": 3.5093870162963867, + "learning_rate": 7.69983686786297e-05, + "loss": 0.1994, + "num_input_tokens_seen": 2047680, + "step": 945 + }, + { + "epoch": 0.15497553017944535, + "grad_norm": 2.5943655967712402, + "learning_rate": 7.740619902120719e-05, + "loss": 0.3784, + "num_input_tokens_seen": 2058208, + "step": 950 + }, + { + "epoch": 0.15579119086460033, + "grad_norm": 0.6778724789619446, + "learning_rate": 7.781402936378466e-05, + "loss": 0.1371, + "num_input_tokens_seen": 2069504, + "step": 955 + }, + { + "epoch": 0.1566068515497553, + "grad_norm": 2.2445712089538574, + "learning_rate": 7.822185970636215e-05, + "loss": 0.1568, + "num_input_tokens_seen": 2080416, + "step": 960 + }, + { + "epoch": 0.15742251223491027, + "grad_norm": 5.930630207061768, + "learning_rate": 7.862969004893964e-05, + "loss": 0.1437, + "num_input_tokens_seen": 2090880, + "step": 965 + }, + { + "epoch": 0.15823817292006526, + "grad_norm": 2.420844793319702, + "learning_rate": 7.903752039151713e-05, + "loss": 0.2885, + "num_input_tokens_seen": 2101440, + "step": 970 + }, + { + "epoch": 0.15905383360522024, + "grad_norm": 3.8338279724121094, + "learning_rate": 7.944535073409462e-05, + "loss": 0.2561, + "num_input_tokens_seen": 2111488, + "step": 975 + }, + { + "epoch": 0.1598694942903752, + "grad_norm": 0.34937554597854614, + "learning_rate": 7.985318107667211e-05, + "loss": 0.08, + "num_input_tokens_seen": 2123392, + "step": 980 + }, + { + "epoch": 0.16068515497553018, + "grad_norm": 8.5946683883667, + "learning_rate": 8.026101141924959e-05, + "loss": 0.0638, + "num_input_tokens_seen": 2134016, + "step": 985 + }, + { + "epoch": 0.16150081566068517, + "grad_norm": 6.2617669105529785, + "learning_rate": 8.066884176182708e-05, + "loss": 0.2836, + "num_input_tokens_seen": 2144736, + "step": 990 + }, + { + "epoch": 0.16231647634584012, + "grad_norm": 1.5171515941619873, + "learning_rate": 8.107667210440457e-05, + "loss": 0.1919, + "num_input_tokens_seen": 2155520, + "step": 995 + }, + { + "epoch": 0.1631321370309951, + "grad_norm": 4.9571428298950195, + "learning_rate": 8.148450244698205e-05, + "loss": 0.276, + "num_input_tokens_seen": 2165312, + "step": 1000 + }, + { + "epoch": 0.1639477977161501, + "grad_norm": 5.026040077209473, + "learning_rate": 8.189233278955955e-05, + "loss": 0.2269, + "num_input_tokens_seen": 2175040, + "step": 1005 + }, + { + "epoch": 0.16476345840130505, + "grad_norm": 1.8221598863601685, + "learning_rate": 8.230016313213704e-05, + "loss": 0.2278, + "num_input_tokens_seen": 2186176, + "step": 1010 + }, + { + "epoch": 0.16557911908646003, + "grad_norm": 3.4954538345336914, + "learning_rate": 8.270799347471453e-05, + "loss": 0.1486, + "num_input_tokens_seen": 2195808, + "step": 1015 + }, + { + "epoch": 0.16639477977161501, + "grad_norm": 0.697935938835144, + "learning_rate": 8.3115823817292e-05, + "loss": 0.0983, + "num_input_tokens_seen": 2207232, + "step": 1020 + }, + { + "epoch": 0.16721044045676997, + "grad_norm": 0.5946976542472839, + "learning_rate": 8.35236541598695e-05, + "loss": 0.1653, + "num_input_tokens_seen": 2218912, + "step": 1025 + }, + { + "epoch": 0.16802610114192496, + "grad_norm": 5.24078893661499, + "learning_rate": 8.393148450244699e-05, + "loss": 0.1919, + "num_input_tokens_seen": 2229536, + "step": 1030 + }, + { + "epoch": 0.16884176182707994, + "grad_norm": 4.404401779174805, + "learning_rate": 8.433931484502446e-05, + "loss": 0.1526, + "num_input_tokens_seen": 2240608, + "step": 1035 + }, + { + "epoch": 0.16965742251223492, + "grad_norm": 7.255613803863525, + "learning_rate": 8.474714518760197e-05, + "loss": 0.3607, + "num_input_tokens_seen": 2250176, + "step": 1040 + }, + { + "epoch": 0.17047308319738988, + "grad_norm": 2.6354668140411377, + "learning_rate": 8.515497553017946e-05, + "loss": 0.1367, + "num_input_tokens_seen": 2262080, + "step": 1045 + }, + { + "epoch": 0.17128874388254486, + "grad_norm": 0.04987457022070885, + "learning_rate": 8.556280587275693e-05, + "loss": 0.0759, + "num_input_tokens_seen": 2271968, + "step": 1050 + }, + { + "epoch": 0.17210440456769985, + "grad_norm": 3.4202687740325928, + "learning_rate": 8.597063621533442e-05, + "loss": 0.1543, + "num_input_tokens_seen": 2282080, + "step": 1055 + }, + { + "epoch": 0.1729200652528548, + "grad_norm": 6.735346794128418, + "learning_rate": 8.637846655791191e-05, + "loss": 0.1822, + "num_input_tokens_seen": 2291744, + "step": 1060 + }, + { + "epoch": 0.1737357259380098, + "grad_norm": 2.105976104736328, + "learning_rate": 8.678629690048939e-05, + "loss": 0.0599, + "num_input_tokens_seen": 2302368, + "step": 1065 + }, + { + "epoch": 0.17455138662316477, + "grad_norm": 18.188085556030273, + "learning_rate": 8.719412724306688e-05, + "loss": 0.1898, + "num_input_tokens_seen": 2313664, + "step": 1070 + }, + { + "epoch": 0.17536704730831973, + "grad_norm": 3.8808937072753906, + "learning_rate": 8.760195758564438e-05, + "loss": 0.0623, + "num_input_tokens_seen": 2325184, + "step": 1075 + }, + { + "epoch": 0.1761827079934747, + "grad_norm": 7.137500286102295, + "learning_rate": 8.800978792822187e-05, + "loss": 0.2835, + "num_input_tokens_seen": 2336352, + "step": 1080 + }, + { + "epoch": 0.1769983686786297, + "grad_norm": 1.505784034729004, + "learning_rate": 8.841761827079935e-05, + "loss": 0.0245, + "num_input_tokens_seen": 2346816, + "step": 1085 + }, + { + "epoch": 0.17781402936378465, + "grad_norm": 0.14202602207660675, + "learning_rate": 8.882544861337684e-05, + "loss": 0.0903, + "num_input_tokens_seen": 2356672, + "step": 1090 + }, + { + "epoch": 0.17862969004893964, + "grad_norm": 5.891081809997559, + "learning_rate": 8.923327895595433e-05, + "loss": 0.2582, + "num_input_tokens_seen": 2367168, + "step": 1095 + }, + { + "epoch": 0.17944535073409462, + "grad_norm": 5.1646904945373535, + "learning_rate": 8.96411092985318e-05, + "loss": 0.2548, + "num_input_tokens_seen": 2378912, + "step": 1100 + }, + { + "epoch": 0.1802610114192496, + "grad_norm": 1.1886264085769653, + "learning_rate": 9.00489396411093e-05, + "loss": 0.3341, + "num_input_tokens_seen": 2390848, + "step": 1105 + }, + { + "epoch": 0.18107667210440456, + "grad_norm": 1.5492854118347168, + "learning_rate": 9.04567699836868e-05, + "loss": 0.199, + "num_input_tokens_seen": 2401344, + "step": 1110 + }, + { + "epoch": 0.18189233278955955, + "grad_norm": 0.5389428734779358, + "learning_rate": 9.086460032626427e-05, + "loss": 0.1403, + "num_input_tokens_seen": 2412224, + "step": 1115 + }, + { + "epoch": 0.18270799347471453, + "grad_norm": 1.1149766445159912, + "learning_rate": 9.127243066884176e-05, + "loss": 0.0222, + "num_input_tokens_seen": 2423072, + "step": 1120 + }, + { + "epoch": 0.1835236541598695, + "grad_norm": 6.989159107208252, + "learning_rate": 9.168026101141925e-05, + "loss": 0.1462, + "num_input_tokens_seen": 2434368, + "step": 1125 + }, + { + "epoch": 0.18433931484502447, + "grad_norm": 0.5632582902908325, + "learning_rate": 9.208809135399673e-05, + "loss": 0.3705, + "num_input_tokens_seen": 2444544, + "step": 1130 + }, + { + "epoch": 0.18515497553017946, + "grad_norm": 4.612658977508545, + "learning_rate": 9.249592169657422e-05, + "loss": 0.1005, + "num_input_tokens_seen": 2454048, + "step": 1135 + }, + { + "epoch": 0.1859706362153344, + "grad_norm": 13.68948745727539, + "learning_rate": 9.290375203915171e-05, + "loss": 0.2374, + "num_input_tokens_seen": 2464704, + "step": 1140 + }, + { + "epoch": 0.1867862969004894, + "grad_norm": 4.448404788970947, + "learning_rate": 9.33115823817292e-05, + "loss": 0.359, + "num_input_tokens_seen": 2475712, + "step": 1145 + }, + { + "epoch": 0.18760195758564438, + "grad_norm": 0.15590998530387878, + "learning_rate": 9.371941272430669e-05, + "loss": 0.3164, + "num_input_tokens_seen": 2488352, + "step": 1150 + }, + { + "epoch": 0.18841761827079934, + "grad_norm": 2.397275924682617, + "learning_rate": 9.412724306688418e-05, + "loss": 0.206, + "num_input_tokens_seen": 2499232, + "step": 1155 + }, + { + "epoch": 0.18923327895595432, + "grad_norm": 0.36472228169441223, + "learning_rate": 9.453507340946167e-05, + "loss": 0.0639, + "num_input_tokens_seen": 2509376, + "step": 1160 + }, + { + "epoch": 0.1900489396411093, + "grad_norm": 2.360445022583008, + "learning_rate": 9.494290375203915e-05, + "loss": 0.178, + "num_input_tokens_seen": 2520352, + "step": 1165 + }, + { + "epoch": 0.19086460032626426, + "grad_norm": 0.22225035727024078, + "learning_rate": 9.535073409461664e-05, + "loss": 0.1206, + "num_input_tokens_seen": 2530752, + "step": 1170 + }, + { + "epoch": 0.19168026101141925, + "grad_norm": 0.07478147000074387, + "learning_rate": 9.575856443719413e-05, + "loss": 0.4729, + "num_input_tokens_seen": 2539872, + "step": 1175 + }, + { + "epoch": 0.19249592169657423, + "grad_norm": 0.3705507516860962, + "learning_rate": 9.616639477977162e-05, + "loss": 0.2679, + "num_input_tokens_seen": 2551456, + "step": 1180 + }, + { + "epoch": 0.1933115823817292, + "grad_norm": 0.7973094582557678, + "learning_rate": 9.657422512234911e-05, + "loss": 0.119, + "num_input_tokens_seen": 2563136, + "step": 1185 + }, + { + "epoch": 0.19412724306688417, + "grad_norm": 0.2963646650314331, + "learning_rate": 9.69820554649266e-05, + "loss": 0.0195, + "num_input_tokens_seen": 2573984, + "step": 1190 + }, + { + "epoch": 0.19494290375203915, + "grad_norm": 0.3415137529373169, + "learning_rate": 9.738988580750407e-05, + "loss": 0.2677, + "num_input_tokens_seen": 2585344, + "step": 1195 + }, + { + "epoch": 0.19575856443719414, + "grad_norm": 0.12055652588605881, + "learning_rate": 9.779771615008156e-05, + "loss": 0.2518, + "num_input_tokens_seen": 2596448, + "step": 1200 + }, + { + "epoch": 0.1965742251223491, + "grad_norm": 8.142424583435059, + "learning_rate": 9.820554649265905e-05, + "loss": 0.5066, + "num_input_tokens_seen": 2606848, + "step": 1205 + }, + { + "epoch": 0.19738988580750408, + "grad_norm": 1.2064582109451294, + "learning_rate": 9.861337683523653e-05, + "loss": 0.1164, + "num_input_tokens_seen": 2617952, + "step": 1210 + }, + { + "epoch": 0.19820554649265906, + "grad_norm": 0.07324664294719696, + "learning_rate": 9.902120717781403e-05, + "loss": 0.0511, + "num_input_tokens_seen": 2628448, + "step": 1215 + }, + { + "epoch": 0.19902120717781402, + "grad_norm": 3.5778586864471436, + "learning_rate": 9.942903752039152e-05, + "loss": 0.2794, + "num_input_tokens_seen": 2639648, + "step": 1220 + }, + { + "epoch": 0.199836867862969, + "grad_norm": 3.4566075801849365, + "learning_rate": 9.983686786296901e-05, + "loss": 0.0933, + "num_input_tokens_seen": 2650112, + "step": 1225 + }, + { + "epoch": 0.200652528548124, + "grad_norm": 3.4942331314086914, + "learning_rate": 0.00010024469820554649, + "loss": 0.5332, + "num_input_tokens_seen": 2660576, + "step": 1230 + }, + { + "epoch": 0.20146818923327894, + "grad_norm": 0.2953527271747589, + "learning_rate": 0.00010065252854812398, + "loss": 0.0976, + "num_input_tokens_seen": 2669440, + "step": 1235 + }, + { + "epoch": 0.20228384991843393, + "grad_norm": 1.5927826166152954, + "learning_rate": 0.00010106035889070147, + "loss": 0.2455, + "num_input_tokens_seen": 2680544, + "step": 1240 + }, + { + "epoch": 0.2030995106035889, + "grad_norm": 2.855224609375, + "learning_rate": 0.00010146818923327896, + "loss": 0.2392, + "num_input_tokens_seen": 2692608, + "step": 1245 + }, + { + "epoch": 0.2039151712887439, + "grad_norm": 2.0778863430023193, + "learning_rate": 0.00010187601957585645, + "loss": 0.1251, + "num_input_tokens_seen": 2701760, + "step": 1250 + }, + { + "epoch": 0.20473083197389885, + "grad_norm": 3.7085254192352295, + "learning_rate": 0.00010228384991843394, + "loss": 0.2331, + "num_input_tokens_seen": 2712288, + "step": 1255 + }, + { + "epoch": 0.20554649265905384, + "grad_norm": 2.640923500061035, + "learning_rate": 0.00010269168026101142, + "loss": 0.1405, + "num_input_tokens_seen": 2723488, + "step": 1260 + }, + { + "epoch": 0.20636215334420882, + "grad_norm": 0.7595841884613037, + "learning_rate": 0.00010309951060358891, + "loss": 0.1611, + "num_input_tokens_seen": 2734400, + "step": 1265 + }, + { + "epoch": 0.20717781402936378, + "grad_norm": 0.3323002755641937, + "learning_rate": 0.0001035073409461664, + "loss": 0.0854, + "num_input_tokens_seen": 2746208, + "step": 1270 + }, + { + "epoch": 0.20799347471451876, + "grad_norm": 6.0016303062438965, + "learning_rate": 0.00010391517128874387, + "loss": 0.1824, + "num_input_tokens_seen": 2757216, + "step": 1275 + }, + { + "epoch": 0.20880913539967375, + "grad_norm": 0.13172432780265808, + "learning_rate": 0.00010432300163132138, + "loss": 0.1912, + "num_input_tokens_seen": 2768192, + "step": 1280 + }, + { + "epoch": 0.2096247960848287, + "grad_norm": 3.026542901992798, + "learning_rate": 0.00010473083197389887, + "loss": 0.0904, + "num_input_tokens_seen": 2778400, + "step": 1285 + }, + { + "epoch": 0.21044045676998369, + "grad_norm": 4.948746681213379, + "learning_rate": 0.00010513866231647634, + "loss": 0.3657, + "num_input_tokens_seen": 2788416, + "step": 1290 + }, + { + "epoch": 0.21125611745513867, + "grad_norm": 0.26640602946281433, + "learning_rate": 0.00010554649265905383, + "loss": 0.3439, + "num_input_tokens_seen": 2799360, + "step": 1295 + }, + { + "epoch": 0.21207177814029363, + "grad_norm": 0.36822858452796936, + "learning_rate": 0.00010595432300163132, + "loss": 0.199, + "num_input_tokens_seen": 2809728, + "step": 1300 + }, + { + "epoch": 0.2128874388254486, + "grad_norm": 2.3697075843811035, + "learning_rate": 0.00010636215334420881, + "loss": 0.2865, + "num_input_tokens_seen": 2819776, + "step": 1305 + }, + { + "epoch": 0.2137030995106036, + "grad_norm": 0.7037314772605896, + "learning_rate": 0.00010676998368678629, + "loss": 0.1692, + "num_input_tokens_seen": 2831360, + "step": 1310 + }, + { + "epoch": 0.21451876019575855, + "grad_norm": 1.2460227012634277, + "learning_rate": 0.0001071778140293638, + "loss": 0.1563, + "num_input_tokens_seen": 2842432, + "step": 1315 + }, + { + "epoch": 0.21533442088091354, + "grad_norm": 0.42761459946632385, + "learning_rate": 0.00010758564437194128, + "loss": 0.122, + "num_input_tokens_seen": 2852448, + "step": 1320 + }, + { + "epoch": 0.21615008156606852, + "grad_norm": 0.9030336737632751, + "learning_rate": 0.00010799347471451876, + "loss": 0.088, + "num_input_tokens_seen": 2862912, + "step": 1325 + }, + { + "epoch": 0.2169657422512235, + "grad_norm": 1.067244052886963, + "learning_rate": 0.00010840130505709625, + "loss": 0.2082, + "num_input_tokens_seen": 2873888, + "step": 1330 + }, + { + "epoch": 0.21778140293637846, + "grad_norm": 0.8108012676239014, + "learning_rate": 0.00010880913539967374, + "loss": 0.1894, + "num_input_tokens_seen": 2885664, + "step": 1335 + }, + { + "epoch": 0.21859706362153344, + "grad_norm": 0.14770320057868958, + "learning_rate": 0.00010921696574225122, + "loss": 0.367, + "num_input_tokens_seen": 2896288, + "step": 1340 + }, + { + "epoch": 0.21941272430668843, + "grad_norm": 2.9352149963378906, + "learning_rate": 0.0001096247960848287, + "loss": 0.1078, + "num_input_tokens_seen": 2907264, + "step": 1345 + }, + { + "epoch": 0.22022838499184338, + "grad_norm": 0.31926628947257996, + "learning_rate": 0.00011003262642740621, + "loss": 0.3341, + "num_input_tokens_seen": 2918464, + "step": 1350 + }, + { + "epoch": 0.22104404567699837, + "grad_norm": 7.357073783874512, + "learning_rate": 0.00011044045676998369, + "loss": 0.4854, + "num_input_tokens_seen": 2929792, + "step": 1355 + }, + { + "epoch": 0.22185970636215335, + "grad_norm": 4.365267753601074, + "learning_rate": 0.00011084828711256118, + "loss": 0.1755, + "num_input_tokens_seen": 2940064, + "step": 1360 + }, + { + "epoch": 0.2226753670473083, + "grad_norm": 0.6887586116790771, + "learning_rate": 0.00011125611745513867, + "loss": 0.0783, + "num_input_tokens_seen": 2950368, + "step": 1365 + }, + { + "epoch": 0.2234910277324633, + "grad_norm": 4.315596580505371, + "learning_rate": 0.00011166394779771616, + "loss": 0.3597, + "num_input_tokens_seen": 2960448, + "step": 1370 + }, + { + "epoch": 0.22430668841761828, + "grad_norm": 0.5510193109512329, + "learning_rate": 0.00011207177814029363, + "loss": 0.1461, + "num_input_tokens_seen": 2972032, + "step": 1375 + }, + { + "epoch": 0.22512234910277323, + "grad_norm": 0.8004043102264404, + "learning_rate": 0.00011247960848287112, + "loss": 0.1338, + "num_input_tokens_seen": 2984160, + "step": 1380 + }, + { + "epoch": 0.22593800978792822, + "grad_norm": 0.13799701631069183, + "learning_rate": 0.00011288743882544863, + "loss": 0.0982, + "num_input_tokens_seen": 2994752, + "step": 1385 + }, + { + "epoch": 0.2267536704730832, + "grad_norm": 1.3625370264053345, + "learning_rate": 0.0001132952691680261, + "loss": 0.1263, + "num_input_tokens_seen": 3005376, + "step": 1390 + }, + { + "epoch": 0.2275693311582382, + "grad_norm": 7.915112018585205, + "learning_rate": 0.00011370309951060359, + "loss": 0.2681, + "num_input_tokens_seen": 3016576, + "step": 1395 + }, + { + "epoch": 0.22838499184339314, + "grad_norm": 35.910255432128906, + "learning_rate": 0.00011411092985318108, + "loss": 0.3882, + "num_input_tokens_seen": 3028704, + "step": 1400 + }, + { + "epoch": 0.22920065252854813, + "grad_norm": 3.048297166824341, + "learning_rate": 0.00011451876019575856, + "loss": 0.3004, + "num_input_tokens_seen": 3039456, + "step": 1405 + }, + { + "epoch": 0.2300163132137031, + "grad_norm": 3.245952606201172, + "learning_rate": 0.00011492659053833605, + "loss": 0.2714, + "num_input_tokens_seen": 3050816, + "step": 1410 + }, + { + "epoch": 0.23083197389885807, + "grad_norm": 0.7663965821266174, + "learning_rate": 0.00011533442088091354, + "loss": 0.1503, + "num_input_tokens_seen": 3060064, + "step": 1415 + }, + { + "epoch": 0.23164763458401305, + "grad_norm": 1.7909114360809326, + "learning_rate": 0.00011574225122349103, + "loss": 0.2217, + "num_input_tokens_seen": 3071936, + "step": 1420 + }, + { + "epoch": 0.23246329526916804, + "grad_norm": 1.2014027833938599, + "learning_rate": 0.00011615008156606852, + "loss": 0.0531, + "num_input_tokens_seen": 3082752, + "step": 1425 + }, + { + "epoch": 0.233278955954323, + "grad_norm": 0.3540492653846741, + "learning_rate": 0.00011655791190864601, + "loss": 0.0955, + "num_input_tokens_seen": 3094592, + "step": 1430 + }, + { + "epoch": 0.23409461663947798, + "grad_norm": 1.3756060600280762, + "learning_rate": 0.0001169657422512235, + "loss": 0.2884, + "num_input_tokens_seen": 3105312, + "step": 1435 + }, + { + "epoch": 0.23491027732463296, + "grad_norm": 0.6397413611412048, + "learning_rate": 0.00011737357259380098, + "loss": 0.0951, + "num_input_tokens_seen": 3115776, + "step": 1440 + }, + { + "epoch": 0.23572593800978792, + "grad_norm": 0.5612933039665222, + "learning_rate": 0.00011778140293637847, + "loss": 0.0828, + "num_input_tokens_seen": 3127392, + "step": 1445 + }, + { + "epoch": 0.2365415986949429, + "grad_norm": 2.8829073905944824, + "learning_rate": 0.00011818923327895596, + "loss": 0.1222, + "num_input_tokens_seen": 3137760, + "step": 1450 + }, + { + "epoch": 0.23735725938009788, + "grad_norm": 6.076878547668457, + "learning_rate": 0.00011859706362153345, + "loss": 0.2851, + "num_input_tokens_seen": 3148192, + "step": 1455 + }, + { + "epoch": 0.23817292006525284, + "grad_norm": 0.2625199854373932, + "learning_rate": 0.00011900489396411094, + "loss": 0.1418, + "num_input_tokens_seen": 3159008, + "step": 1460 + }, + { + "epoch": 0.23898858075040783, + "grad_norm": 0.1276540756225586, + "learning_rate": 0.00011941272430668843, + "loss": 0.0804, + "num_input_tokens_seen": 3170784, + "step": 1465 + }, + { + "epoch": 0.2398042414355628, + "grad_norm": 2.398977041244507, + "learning_rate": 0.0001198205546492659, + "loss": 0.1997, + "num_input_tokens_seen": 3182016, + "step": 1470 + }, + { + "epoch": 0.2406199021207178, + "grad_norm": 0.0415484681725502, + "learning_rate": 0.00012022838499184339, + "loss": 0.2176, + "num_input_tokens_seen": 3192288, + "step": 1475 + }, + { + "epoch": 0.24143556280587275, + "grad_norm": 0.10787076503038406, + "learning_rate": 0.00012063621533442088, + "loss": 0.1662, + "num_input_tokens_seen": 3202560, + "step": 1480 + }, + { + "epoch": 0.24225122349102773, + "grad_norm": 2.47776460647583, + "learning_rate": 0.00012104404567699836, + "loss": 0.2372, + "num_input_tokens_seen": 3213888, + "step": 1485 + }, + { + "epoch": 0.24306688417618272, + "grad_norm": 0.6052817702293396, + "learning_rate": 0.00012145187601957586, + "loss": 0.2751, + "num_input_tokens_seen": 3224032, + "step": 1490 + }, + { + "epoch": 0.24388254486133767, + "grad_norm": 0.7725127339363098, + "learning_rate": 0.00012185970636215335, + "loss": 0.0815, + "num_input_tokens_seen": 3234720, + "step": 1495 + }, + { + "epoch": 0.24469820554649266, + "grad_norm": 2.9930591583251953, + "learning_rate": 0.00012226753670473083, + "loss": 0.2188, + "num_input_tokens_seen": 3244928, + "step": 1500 + }, + { + "epoch": 0.24551386623164764, + "grad_norm": 3.0129828453063965, + "learning_rate": 0.00012267536704730833, + "loss": 0.3815, + "num_input_tokens_seen": 3254528, + "step": 1505 + }, + { + "epoch": 0.2463295269168026, + "grad_norm": 0.08210022002458572, + "learning_rate": 0.0001230831973898858, + "loss": 0.0437, + "num_input_tokens_seen": 3264768, + "step": 1510 + }, + { + "epoch": 0.24714518760195758, + "grad_norm": 1.515202522277832, + "learning_rate": 0.0001234910277324633, + "loss": 0.0471, + "num_input_tokens_seen": 3276736, + "step": 1515 + }, + { + "epoch": 0.24796084828711257, + "grad_norm": 2.027247667312622, + "learning_rate": 0.0001238988580750408, + "loss": 0.5108, + "num_input_tokens_seen": 3287264, + "step": 1520 + }, + { + "epoch": 0.24877650897226752, + "grad_norm": 2.5827109813690186, + "learning_rate": 0.00012430668841761827, + "loss": 0.3454, + "num_input_tokens_seen": 3298048, + "step": 1525 + }, + { + "epoch": 0.2495921696574225, + "grad_norm": 1.8741847276687622, + "learning_rate": 0.00012471451876019577, + "loss": 0.2622, + "num_input_tokens_seen": 3308640, + "step": 1530 + }, + { + "epoch": 0.25040783034257746, + "grad_norm": 0.7166083455085754, + "learning_rate": 0.00012512234910277325, + "loss": 0.1249, + "num_input_tokens_seen": 3319040, + "step": 1535 + }, + { + "epoch": 0.25122349102773245, + "grad_norm": 0.20941627025604248, + "learning_rate": 0.00012553017944535072, + "loss": 0.0722, + "num_input_tokens_seen": 3330720, + "step": 1540 + }, + { + "epoch": 0.25203915171288743, + "grad_norm": 8.934325218200684, + "learning_rate": 0.00012593800978792823, + "loss": 0.3257, + "num_input_tokens_seen": 3340352, + "step": 1545 + }, + { + "epoch": 0.2528548123980424, + "grad_norm": 0.5297455787658691, + "learning_rate": 0.0001263458401305057, + "loss": 0.2996, + "num_input_tokens_seen": 3350272, + "step": 1550 + }, + { + "epoch": 0.2536704730831974, + "grad_norm": 2.1738781929016113, + "learning_rate": 0.0001267536704730832, + "loss": 0.2076, + "num_input_tokens_seen": 3361440, + "step": 1555 + }, + { + "epoch": 0.2544861337683524, + "grad_norm": 6.6399455070495605, + "learning_rate": 0.00012716150081566068, + "loss": 0.2664, + "num_input_tokens_seen": 3373600, + "step": 1560 + }, + { + "epoch": 0.2553017944535073, + "grad_norm": 5.0540385246276855, + "learning_rate": 0.00012756933115823819, + "loss": 0.2678, + "num_input_tokens_seen": 3384160, + "step": 1565 + }, + { + "epoch": 0.2561174551386623, + "grad_norm": 18.535308837890625, + "learning_rate": 0.00012797716150081566, + "loss": 0.6454, + "num_input_tokens_seen": 3393792, + "step": 1570 + }, + { + "epoch": 0.2569331158238173, + "grad_norm": 3.7608978748321533, + "learning_rate": 0.00012838499184339314, + "loss": 0.4939, + "num_input_tokens_seen": 3404576, + "step": 1575 + }, + { + "epoch": 0.25774877650897227, + "grad_norm": 3.712184190750122, + "learning_rate": 0.00012879282218597064, + "loss": 0.2159, + "num_input_tokens_seen": 3414816, + "step": 1580 + }, + { + "epoch": 0.25856443719412725, + "grad_norm": 3.3276753425598145, + "learning_rate": 0.00012920065252854812, + "loss": 0.1312, + "num_input_tokens_seen": 3425248, + "step": 1585 + }, + { + "epoch": 0.25938009787928223, + "grad_norm": 0.4005642831325531, + "learning_rate": 0.00012960848287112562, + "loss": 0.069, + "num_input_tokens_seen": 3435680, + "step": 1590 + }, + { + "epoch": 0.2601957585644372, + "grad_norm": 1.2974168062210083, + "learning_rate": 0.0001300163132137031, + "loss": 0.0704, + "num_input_tokens_seen": 3447488, + "step": 1595 + }, + { + "epoch": 0.26101141924959215, + "grad_norm": 0.48080500960350037, + "learning_rate": 0.0001304241435562806, + "loss": 0.1689, + "num_input_tokens_seen": 3458336, + "step": 1600 + }, + { + "epoch": 0.26182707993474713, + "grad_norm": 2.298285961151123, + "learning_rate": 0.00013083197389885805, + "loss": 0.2302, + "num_input_tokens_seen": 3467680, + "step": 1605 + }, + { + "epoch": 0.2626427406199021, + "grad_norm": 0.040536798536777496, + "learning_rate": 0.00013123980424143555, + "loss": 0.3058, + "num_input_tokens_seen": 3478400, + "step": 1610 + }, + { + "epoch": 0.2634584013050571, + "grad_norm": 0.12443608790636063, + "learning_rate": 0.00013164763458401306, + "loss": 0.0356, + "num_input_tokens_seen": 3489280, + "step": 1615 + }, + { + "epoch": 0.2642740619902121, + "grad_norm": 3.2961642742156982, + "learning_rate": 0.00013205546492659053, + "loss": 0.3348, + "num_input_tokens_seen": 3500800, + "step": 1620 + }, + { + "epoch": 0.26508972267536707, + "grad_norm": 0.34571152925491333, + "learning_rate": 0.00013246329526916804, + "loss": 0.1261, + "num_input_tokens_seen": 3510656, + "step": 1625 + }, + { + "epoch": 0.265905383360522, + "grad_norm": 0.5007117986679077, + "learning_rate": 0.00013287112561174552, + "loss": 0.1386, + "num_input_tokens_seen": 3521696, + "step": 1630 + }, + { + "epoch": 0.266721044045677, + "grad_norm": 0.8755322098731995, + "learning_rate": 0.00013327895595432302, + "loss": 0.0952, + "num_input_tokens_seen": 3534304, + "step": 1635 + }, + { + "epoch": 0.26753670473083196, + "grad_norm": 2.3326218128204346, + "learning_rate": 0.00013368678629690047, + "loss": 0.2073, + "num_input_tokens_seen": 3544416, + "step": 1640 + }, + { + "epoch": 0.26835236541598695, + "grad_norm": 0.17495253682136536, + "learning_rate": 0.00013409461663947797, + "loss": 0.1812, + "num_input_tokens_seen": 3553760, + "step": 1645 + }, + { + "epoch": 0.26916802610114193, + "grad_norm": 0.30354925990104675, + "learning_rate": 0.00013450244698205548, + "loss": 0.1423, + "num_input_tokens_seen": 3563776, + "step": 1650 + }, + { + "epoch": 0.2699836867862969, + "grad_norm": 0.07377764582633972, + "learning_rate": 0.00013491027732463295, + "loss": 0.1907, + "num_input_tokens_seen": 3575200, + "step": 1655 + }, + { + "epoch": 0.2707993474714519, + "grad_norm": 0.5978612899780273, + "learning_rate": 0.00013531810766721046, + "loss": 0.1143, + "num_input_tokens_seen": 3587328, + "step": 1660 + }, + { + "epoch": 0.27161500815660683, + "grad_norm": 3.454429864883423, + "learning_rate": 0.00013572593800978793, + "loss": 0.15, + "num_input_tokens_seen": 3597056, + "step": 1665 + }, + { + "epoch": 0.2724306688417618, + "grad_norm": 6.486354827880859, + "learning_rate": 0.0001361337683523654, + "loss": 0.0857, + "num_input_tokens_seen": 3608064, + "step": 1670 + }, + { + "epoch": 0.2732463295269168, + "grad_norm": 6.622701168060303, + "learning_rate": 0.00013654159869494288, + "loss": 0.2567, + "num_input_tokens_seen": 3618208, + "step": 1675 + }, + { + "epoch": 0.2740619902120718, + "grad_norm": 3.2912650108337402, + "learning_rate": 0.0001369494290375204, + "loss": 0.4063, + "num_input_tokens_seen": 3629600, + "step": 1680 + }, + { + "epoch": 0.27487765089722677, + "grad_norm": 1.7371718883514404, + "learning_rate": 0.0001373572593800979, + "loss": 0.1756, + "num_input_tokens_seen": 3640160, + "step": 1685 + }, + { + "epoch": 0.27569331158238175, + "grad_norm": 1.5828455686569214, + "learning_rate": 0.00013776508972267537, + "loss": 0.1398, + "num_input_tokens_seen": 3649984, + "step": 1690 + }, + { + "epoch": 0.2765089722675367, + "grad_norm": 0.8824554681777954, + "learning_rate": 0.00013817292006525287, + "loss": 0.041, + "num_input_tokens_seen": 3660544, + "step": 1695 + }, + { + "epoch": 0.27732463295269166, + "grad_norm": 0.31346526741981506, + "learning_rate": 0.00013858075040783035, + "loss": 0.2259, + "num_input_tokens_seen": 3670688, + "step": 1700 + }, + { + "epoch": 0.27814029363784665, + "grad_norm": 0.8705945014953613, + "learning_rate": 0.00013898858075040782, + "loss": 0.1137, + "num_input_tokens_seen": 3680896, + "step": 1705 + }, + { + "epoch": 0.27895595432300163, + "grad_norm": 0.12432600557804108, + "learning_rate": 0.00013939641109298533, + "loss": 0.1308, + "num_input_tokens_seen": 3691648, + "step": 1710 + }, + { + "epoch": 0.2797716150081566, + "grad_norm": 0.15383680164813995, + "learning_rate": 0.0001398042414355628, + "loss": 0.2624, + "num_input_tokens_seen": 3701920, + "step": 1715 + }, + { + "epoch": 0.2805872756933116, + "grad_norm": 0.3748222291469574, + "learning_rate": 0.0001402120717781403, + "loss": 0.3476, + "num_input_tokens_seen": 3711648, + "step": 1720 + }, + { + "epoch": 0.2814029363784666, + "grad_norm": 1.4904766082763672, + "learning_rate": 0.00014061990212071778, + "loss": 0.1822, + "num_input_tokens_seen": 3721312, + "step": 1725 + }, + { + "epoch": 0.2822185970636215, + "grad_norm": 0.6280257105827332, + "learning_rate": 0.0001410277324632953, + "loss": 0.1454, + "num_input_tokens_seen": 3732192, + "step": 1730 + }, + { + "epoch": 0.2830342577487765, + "grad_norm": 1.832024097442627, + "learning_rate": 0.00014143556280587274, + "loss": 0.2084, + "num_input_tokens_seen": 3742368, + "step": 1735 + }, + { + "epoch": 0.2838499184339315, + "grad_norm": 1.173895001411438, + "learning_rate": 0.00014184339314845024, + "loss": 0.2289, + "num_input_tokens_seen": 3753536, + "step": 1740 + }, + { + "epoch": 0.28466557911908646, + "grad_norm": 0.7341709733009338, + "learning_rate": 0.00014225122349102774, + "loss": 0.0779, + "num_input_tokens_seen": 3764960, + "step": 1745 + }, + { + "epoch": 0.28548123980424145, + "grad_norm": 0.40611112117767334, + "learning_rate": 0.00014265905383360522, + "loss": 0.0627, + "num_input_tokens_seen": 3776224, + "step": 1750 + }, + { + "epoch": 0.28629690048939643, + "grad_norm": 0.38497471809387207, + "learning_rate": 0.00014306688417618272, + "loss": 0.3102, + "num_input_tokens_seen": 3786208, + "step": 1755 + }, + { + "epoch": 0.28711256117455136, + "grad_norm": 1.9566757678985596, + "learning_rate": 0.0001434747145187602, + "loss": 0.1619, + "num_input_tokens_seen": 3796768, + "step": 1760 + }, + { + "epoch": 0.28792822185970635, + "grad_norm": 0.3571220338344574, + "learning_rate": 0.0001438825448613377, + "loss": 0.1974, + "num_input_tokens_seen": 3808928, + "step": 1765 + }, + { + "epoch": 0.28874388254486133, + "grad_norm": 1.0718194246292114, + "learning_rate": 0.00014429037520391515, + "loss": 0.2318, + "num_input_tokens_seen": 3820480, + "step": 1770 + }, + { + "epoch": 0.2895595432300163, + "grad_norm": 0.08225858211517334, + "learning_rate": 0.00014469820554649266, + "loss": 0.1204, + "num_input_tokens_seen": 3832352, + "step": 1775 + }, + { + "epoch": 0.2903752039151713, + "grad_norm": 1.1732842922210693, + "learning_rate": 0.00014510603588907016, + "loss": 0.1203, + "num_input_tokens_seen": 3842560, + "step": 1780 + }, + { + "epoch": 0.2911908646003263, + "grad_norm": 1.7823255062103271, + "learning_rate": 0.00014551386623164764, + "loss": 0.3127, + "num_input_tokens_seen": 3853824, + "step": 1785 + }, + { + "epoch": 0.29200652528548127, + "grad_norm": 0.37060829997062683, + "learning_rate": 0.00014592169657422514, + "loss": 0.1092, + "num_input_tokens_seen": 3865280, + "step": 1790 + }, + { + "epoch": 0.2928221859706362, + "grad_norm": 2.0758445262908936, + "learning_rate": 0.00014632952691680262, + "loss": 0.1014, + "num_input_tokens_seen": 3874944, + "step": 1795 + }, + { + "epoch": 0.2936378466557912, + "grad_norm": 0.1883598268032074, + "learning_rate": 0.0001467373572593801, + "loss": 0.125, + "num_input_tokens_seen": 3885824, + "step": 1800 + }, + { + "epoch": 0.29445350734094616, + "grad_norm": 3.2518420219421387, + "learning_rate": 0.00014714518760195757, + "loss": 0.2931, + "num_input_tokens_seen": 3896480, + "step": 1805 + }, + { + "epoch": 0.29526916802610115, + "grad_norm": 2.0988779067993164, + "learning_rate": 0.00014755301794453507, + "loss": 0.2474, + "num_input_tokens_seen": 3908256, + "step": 1810 + }, + { + "epoch": 0.29608482871125613, + "grad_norm": 0.1773896962404251, + "learning_rate": 0.00014796084828711258, + "loss": 0.0975, + "num_input_tokens_seen": 3919584, + "step": 1815 + }, + { + "epoch": 0.2969004893964111, + "grad_norm": 1.0337015390396118, + "learning_rate": 0.00014836867862969005, + "loss": 0.1539, + "num_input_tokens_seen": 3930016, + "step": 1820 + }, + { + "epoch": 0.29771615008156604, + "grad_norm": 0.4843730032444, + "learning_rate": 0.00014877650897226756, + "loss": 0.0723, + "num_input_tokens_seen": 3940800, + "step": 1825 + }, + { + "epoch": 0.29853181076672103, + "grad_norm": 0.14410769939422607, + "learning_rate": 0.00014918433931484503, + "loss": 0.0975, + "num_input_tokens_seen": 3952192, + "step": 1830 + }, + { + "epoch": 0.299347471451876, + "grad_norm": 0.07089443504810333, + "learning_rate": 0.0001495921696574225, + "loss": 0.0374, + "num_input_tokens_seen": 3963936, + "step": 1835 + }, + { + "epoch": 0.300163132137031, + "grad_norm": 2.6121270656585693, + "learning_rate": 0.00015, + "loss": 0.145, + "num_input_tokens_seen": 3973824, + "step": 1840 + }, + { + "epoch": 0.300978792822186, + "grad_norm": 0.0231600571423769, + "learning_rate": 0.0001504078303425775, + "loss": 0.0033, + "num_input_tokens_seen": 3984576, + "step": 1845 + }, + { + "epoch": 0.30179445350734097, + "grad_norm": 0.015244451351463795, + "learning_rate": 0.000150815660685155, + "loss": 0.0119, + "num_input_tokens_seen": 3997024, + "step": 1850 + }, + { + "epoch": 0.30261011419249595, + "grad_norm": 0.007247697561979294, + "learning_rate": 0.00015122349102773247, + "loss": 0.1895, + "num_input_tokens_seen": 4007808, + "step": 1855 + }, + { + "epoch": 0.3034257748776509, + "grad_norm": 0.05449846014380455, + "learning_rate": 0.00015163132137030997, + "loss": 0.188, + "num_input_tokens_seen": 4018944, + "step": 1860 + }, + { + "epoch": 0.30424143556280586, + "grad_norm": 2.1363980770111084, + "learning_rate": 0.00015203915171288742, + "loss": 0.6462, + "num_input_tokens_seen": 4029824, + "step": 1865 + }, + { + "epoch": 0.30505709624796085, + "grad_norm": 1.8556227684020996, + "learning_rate": 0.00015244698205546493, + "loss": 0.3281, + "num_input_tokens_seen": 4040800, + "step": 1870 + }, + { + "epoch": 0.30587275693311583, + "grad_norm": 1.4034215211868286, + "learning_rate": 0.0001528548123980424, + "loss": 0.2342, + "num_input_tokens_seen": 4050912, + "step": 1875 + }, + { + "epoch": 0.3066884176182708, + "grad_norm": 1.7816450595855713, + "learning_rate": 0.0001532626427406199, + "loss": 0.1647, + "num_input_tokens_seen": 4061696, + "step": 1880 + }, + { + "epoch": 0.3075040783034258, + "grad_norm": 2.1075515747070312, + "learning_rate": 0.0001536704730831974, + "loss": 0.2302, + "num_input_tokens_seen": 4071264, + "step": 1885 + }, + { + "epoch": 0.3083197389885807, + "grad_norm": 1.625557780265808, + "learning_rate": 0.0001540783034257749, + "loss": 0.1746, + "num_input_tokens_seen": 4081152, + "step": 1890 + }, + { + "epoch": 0.3091353996737357, + "grad_norm": 0.2660638988018036, + "learning_rate": 0.00015448613376835236, + "loss": 0.1498, + "num_input_tokens_seen": 4091584, + "step": 1895 + }, + { + "epoch": 0.3099510603588907, + "grad_norm": 0.2802708148956299, + "learning_rate": 0.00015489396411092984, + "loss": 0.0782, + "num_input_tokens_seen": 4102560, + "step": 1900 + }, + { + "epoch": 0.3107667210440457, + "grad_norm": 0.36759790778160095, + "learning_rate": 0.00015530179445350734, + "loss": 0.1151, + "num_input_tokens_seen": 4113920, + "step": 1905 + }, + { + "epoch": 0.31158238172920066, + "grad_norm": 1.4215971231460571, + "learning_rate": 0.00015570962479608482, + "loss": 0.1237, + "num_input_tokens_seen": 4125472, + "step": 1910 + }, + { + "epoch": 0.31239804241435565, + "grad_norm": 0.3487188220024109, + "learning_rate": 0.00015611745513866232, + "loss": 0.2935, + "num_input_tokens_seen": 4136032, + "step": 1915 + }, + { + "epoch": 0.3132137030995106, + "grad_norm": 0.13971523940563202, + "learning_rate": 0.00015652528548123983, + "loss": 0.0692, + "num_input_tokens_seen": 4145792, + "step": 1920 + }, + { + "epoch": 0.31402936378466556, + "grad_norm": 0.19864986836910248, + "learning_rate": 0.0001569331158238173, + "loss": 0.2522, + "num_input_tokens_seen": 4156832, + "step": 1925 + }, + { + "epoch": 0.31484502446982054, + "grad_norm": 2.3408796787261963, + "learning_rate": 0.00015734094616639478, + "loss": 0.1961, + "num_input_tokens_seen": 4167424, + "step": 1930 + }, + { + "epoch": 0.31566068515497553, + "grad_norm": 1.4813441038131714, + "learning_rate": 0.00015774877650897226, + "loss": 0.0822, + "num_input_tokens_seen": 4177248, + "step": 1935 + }, + { + "epoch": 0.3164763458401305, + "grad_norm": 0.1526590883731842, + "learning_rate": 0.00015815660685154976, + "loss": 0.1321, + "num_input_tokens_seen": 4188064, + "step": 1940 + }, + { + "epoch": 0.3172920065252855, + "grad_norm": 0.12207305431365967, + "learning_rate": 0.00015856443719412724, + "loss": 0.0615, + "num_input_tokens_seen": 4198144, + "step": 1945 + }, + { + "epoch": 0.3181076672104405, + "grad_norm": 0.2033795267343521, + "learning_rate": 0.00015897226753670474, + "loss": 0.1123, + "num_input_tokens_seen": 4209952, + "step": 1950 + }, + { + "epoch": 0.3189233278955954, + "grad_norm": 1.4362683296203613, + "learning_rate": 0.00015938009787928224, + "loss": 0.025, + "num_input_tokens_seen": 4219360, + "step": 1955 + }, + { + "epoch": 0.3197389885807504, + "grad_norm": 0.9837049245834351, + "learning_rate": 0.0001597879282218597, + "loss": 0.2129, + "num_input_tokens_seen": 4230080, + "step": 1960 + }, + { + "epoch": 0.3205546492659054, + "grad_norm": 0.08906455338001251, + "learning_rate": 0.0001601957585644372, + "loss": 0.1465, + "num_input_tokens_seen": 4240768, + "step": 1965 + }, + { + "epoch": 0.32137030995106036, + "grad_norm": 1.3643310070037842, + "learning_rate": 0.00016060358890701467, + "loss": 0.3903, + "num_input_tokens_seen": 4252224, + "step": 1970 + }, + { + "epoch": 0.32218597063621535, + "grad_norm": 0.5359205603599548, + "learning_rate": 0.00016101141924959218, + "loss": 0.1853, + "num_input_tokens_seen": 4264160, + "step": 1975 + }, + { + "epoch": 0.32300163132137033, + "grad_norm": 1.2164024114608765, + "learning_rate": 0.00016141924959216965, + "loss": 0.2022, + "num_input_tokens_seen": 4274368, + "step": 1980 + }, + { + "epoch": 0.32381729200652526, + "grad_norm": 0.5487558841705322, + "learning_rate": 0.00016182707993474716, + "loss": 0.1015, + "num_input_tokens_seen": 4283936, + "step": 1985 + }, + { + "epoch": 0.32463295269168024, + "grad_norm": 0.5527929663658142, + "learning_rate": 0.00016223491027732466, + "loss": 0.1015, + "num_input_tokens_seen": 4294368, + "step": 1990 + }, + { + "epoch": 0.3254486133768352, + "grad_norm": 1.1117502450942993, + "learning_rate": 0.0001626427406199021, + "loss": 0.0639, + "num_input_tokens_seen": 4303328, + "step": 1995 + }, + { + "epoch": 0.3262642740619902, + "grad_norm": 2.3114776611328125, + "learning_rate": 0.0001630505709624796, + "loss": 0.1864, + "num_input_tokens_seen": 4315072, + "step": 2000 + }, + { + "epoch": 0.3270799347471452, + "grad_norm": 0.029865602031350136, + "learning_rate": 0.0001634584013050571, + "loss": 0.0466, + "num_input_tokens_seen": 4326816, + "step": 2005 + }, + { + "epoch": 0.3278955954323002, + "grad_norm": 2.729114055633545, + "learning_rate": 0.0001638662316476346, + "loss": 0.0934, + "num_input_tokens_seen": 4336896, + "step": 2010 + }, + { + "epoch": 0.32871125611745516, + "grad_norm": 1.5304375886917114, + "learning_rate": 0.00016427406199021207, + "loss": 0.0852, + "num_input_tokens_seen": 4347328, + "step": 2015 + }, + { + "epoch": 0.3295269168026101, + "grad_norm": 0.3901952803134918, + "learning_rate": 0.00016468189233278957, + "loss": 0.1983, + "num_input_tokens_seen": 4358208, + "step": 2020 + }, + { + "epoch": 0.3303425774877651, + "grad_norm": 2.25579571723938, + "learning_rate": 0.00016508972267536705, + "loss": 0.1892, + "num_input_tokens_seen": 4368672, + "step": 2025 + }, + { + "epoch": 0.33115823817292006, + "grad_norm": 3.3735718727111816, + "learning_rate": 0.00016549755301794453, + "loss": 0.1723, + "num_input_tokens_seen": 4379808, + "step": 2030 + }, + { + "epoch": 0.33197389885807504, + "grad_norm": 1.8546075820922852, + "learning_rate": 0.00016590538336052203, + "loss": 0.1136, + "num_input_tokens_seen": 4389728, + "step": 2035 + }, + { + "epoch": 0.33278955954323003, + "grad_norm": 0.03245149180293083, + "learning_rate": 0.0001663132137030995, + "loss": 0.0093, + "num_input_tokens_seen": 4400704, + "step": 2040 + }, + { + "epoch": 0.333605220228385, + "grad_norm": 2.443859577178955, + "learning_rate": 0.000166721044045677, + "loss": 0.2122, + "num_input_tokens_seen": 4411648, + "step": 2045 + }, + { + "epoch": 0.33442088091353994, + "grad_norm": 0.18161454796791077, + "learning_rate": 0.00016712887438825449, + "loss": 0.1275, + "num_input_tokens_seen": 4423328, + "step": 2050 + }, + { + "epoch": 0.3352365415986949, + "grad_norm": 0.296201229095459, + "learning_rate": 0.000167536704730832, + "loss": 0.3971, + "num_input_tokens_seen": 4434336, + "step": 2055 + }, + { + "epoch": 0.3360522022838499, + "grad_norm": 0.04536137357354164, + "learning_rate": 0.00016794453507340947, + "loss": 0.1908, + "num_input_tokens_seen": 4444672, + "step": 2060 + }, + { + "epoch": 0.3368678629690049, + "grad_norm": 0.1336517184972763, + "learning_rate": 0.00016835236541598694, + "loss": 0.0652, + "num_input_tokens_seen": 4455872, + "step": 2065 + }, + { + "epoch": 0.3376835236541599, + "grad_norm": 0.3583919405937195, + "learning_rate": 0.00016876019575856445, + "loss": 0.129, + "num_input_tokens_seen": 4467616, + "step": 2070 + }, + { + "epoch": 0.33849918433931486, + "grad_norm": 1.1324158906936646, + "learning_rate": 0.00016916802610114192, + "loss": 0.143, + "num_input_tokens_seen": 4478656, + "step": 2075 + }, + { + "epoch": 0.33931484502446985, + "grad_norm": 0.07469242811203003, + "learning_rate": 0.00016957585644371943, + "loss": 0.0389, + "num_input_tokens_seen": 4488800, + "step": 2080 + }, + { + "epoch": 0.3401305057096248, + "grad_norm": 1.5802353620529175, + "learning_rate": 0.0001699836867862969, + "loss": 0.0959, + "num_input_tokens_seen": 4498496, + "step": 2085 + }, + { + "epoch": 0.34094616639477976, + "grad_norm": 1.5060569047927856, + "learning_rate": 0.00017039151712887438, + "loss": 0.2579, + "num_input_tokens_seen": 4509152, + "step": 2090 + }, + { + "epoch": 0.34176182707993474, + "grad_norm": 0.15717989206314087, + "learning_rate": 0.00017079934747145188, + "loss": 0.0914, + "num_input_tokens_seen": 4521120, + "step": 2095 + }, + { + "epoch": 0.3425774877650897, + "grad_norm": 2.0099587440490723, + "learning_rate": 0.00017120717781402936, + "loss": 0.1256, + "num_input_tokens_seen": 4531744, + "step": 2100 + }, + { + "epoch": 0.3433931484502447, + "grad_norm": 0.8347704410552979, + "learning_rate": 0.00017161500815660686, + "loss": 0.2788, + "num_input_tokens_seen": 4540704, + "step": 2105 + }, + { + "epoch": 0.3442088091353997, + "grad_norm": 1.3534941673278809, + "learning_rate": 0.00017202283849918434, + "loss": 0.1521, + "num_input_tokens_seen": 4551552, + "step": 2110 + }, + { + "epoch": 0.3450244698205546, + "grad_norm": 0.24349333345890045, + "learning_rate": 0.00017243066884176184, + "loss": 0.2539, + "num_input_tokens_seen": 4562976, + "step": 2115 + }, + { + "epoch": 0.3458401305057096, + "grad_norm": 0.4490703046321869, + "learning_rate": 0.00017283849918433932, + "loss": 0.1712, + "num_input_tokens_seen": 4573920, + "step": 2120 + }, + { + "epoch": 0.3466557911908646, + "grad_norm": 1.9330402612686157, + "learning_rate": 0.0001732463295269168, + "loss": 0.2538, + "num_input_tokens_seen": 4585088, + "step": 2125 + }, + { + "epoch": 0.3474714518760196, + "grad_norm": 0.46054011583328247, + "learning_rate": 0.0001736541598694943, + "loss": 0.1483, + "num_input_tokens_seen": 4594752, + "step": 2130 + }, + { + "epoch": 0.34828711256117456, + "grad_norm": 0.2638119161128998, + "learning_rate": 0.00017406199021207178, + "loss": 0.1508, + "num_input_tokens_seen": 4604512, + "step": 2135 + }, + { + "epoch": 0.34910277324632955, + "grad_norm": 0.5201454758644104, + "learning_rate": 0.00017446982055464928, + "loss": 0.0691, + "num_input_tokens_seen": 4615616, + "step": 2140 + }, + { + "epoch": 0.34991843393148453, + "grad_norm": 12.082786560058594, + "learning_rate": 0.00017487765089722676, + "loss": 0.2726, + "num_input_tokens_seen": 4625984, + "step": 2145 + }, + { + "epoch": 0.35073409461663946, + "grad_norm": 0.30344992876052856, + "learning_rate": 0.00017528548123980426, + "loss": 0.1277, + "num_input_tokens_seen": 4636992, + "step": 2150 + }, + { + "epoch": 0.35154975530179444, + "grad_norm": 0.7207813858985901, + "learning_rate": 0.0001756933115823817, + "loss": 0.0274, + "num_input_tokens_seen": 4647424, + "step": 2155 + }, + { + "epoch": 0.3523654159869494, + "grad_norm": 0.2960370182991028, + "learning_rate": 0.0001761011419249592, + "loss": 0.1646, + "num_input_tokens_seen": 4659360, + "step": 2160 + }, + { + "epoch": 0.3531810766721044, + "grad_norm": 0.4028482139110565, + "learning_rate": 0.00017650897226753672, + "loss": 0.1761, + "num_input_tokens_seen": 4669248, + "step": 2165 + }, + { + "epoch": 0.3539967373572594, + "grad_norm": 0.2826700210571289, + "learning_rate": 0.0001769168026101142, + "loss": 0.0855, + "num_input_tokens_seen": 4679360, + "step": 2170 + }, + { + "epoch": 0.3548123980424144, + "grad_norm": 1.6123909950256348, + "learning_rate": 0.0001773246329526917, + "loss": 0.3686, + "num_input_tokens_seen": 4691104, + "step": 2175 + }, + { + "epoch": 0.3556280587275693, + "grad_norm": 2.6291587352752686, + "learning_rate": 0.00017773246329526917, + "loss": 0.1182, + "num_input_tokens_seen": 4700416, + "step": 2180 + }, + { + "epoch": 0.3564437194127243, + "grad_norm": 0.5103549957275391, + "learning_rate": 0.00017814029363784668, + "loss": 0.3337, + "num_input_tokens_seen": 4711072, + "step": 2185 + }, + { + "epoch": 0.3572593800978793, + "grad_norm": 0.25698915123939514, + "learning_rate": 0.00017854812398042412, + "loss": 0.2498, + "num_input_tokens_seen": 4722496, + "step": 2190 + }, + { + "epoch": 0.35807504078303426, + "grad_norm": 0.3679829239845276, + "learning_rate": 0.00017895595432300163, + "loss": 0.1047, + "num_input_tokens_seen": 4730784, + "step": 2195 + }, + { + "epoch": 0.35889070146818924, + "grad_norm": 0.2273644059896469, + "learning_rate": 0.00017936378466557913, + "loss": 0.1373, + "num_input_tokens_seen": 4740896, + "step": 2200 + }, + { + "epoch": 0.35970636215334423, + "grad_norm": 0.28911730647087097, + "learning_rate": 0.0001797716150081566, + "loss": 0.1279, + "num_input_tokens_seen": 4752448, + "step": 2205 + }, + { + "epoch": 0.3605220228384992, + "grad_norm": 0.657588005065918, + "learning_rate": 0.0001801794453507341, + "loss": 0.1135, + "num_input_tokens_seen": 4763904, + "step": 2210 + }, + { + "epoch": 0.36133768352365414, + "grad_norm": 0.7489591836929321, + "learning_rate": 0.0001805872756933116, + "loss": 0.092, + "num_input_tokens_seen": 4773952, + "step": 2215 + }, + { + "epoch": 0.3621533442088091, + "grad_norm": 0.6419491171836853, + "learning_rate": 0.00018099510603588906, + "loss": 0.1956, + "num_input_tokens_seen": 4784672, + "step": 2220 + }, + { + "epoch": 0.3629690048939641, + "grad_norm": 1.2978434562683105, + "learning_rate": 0.00018140293637846654, + "loss": 0.1935, + "num_input_tokens_seen": 4796480, + "step": 2225 + }, + { + "epoch": 0.3637846655791191, + "grad_norm": 0.9570780396461487, + "learning_rate": 0.00018181076672104404, + "loss": 0.2541, + "num_input_tokens_seen": 4807328, + "step": 2230 + }, + { + "epoch": 0.3646003262642741, + "grad_norm": 0.6393945813179016, + "learning_rate": 0.00018221859706362155, + "loss": 0.1581, + "num_input_tokens_seen": 4818848, + "step": 2235 + }, + { + "epoch": 0.36541598694942906, + "grad_norm": 0.6659722924232483, + "learning_rate": 0.00018262642740619902, + "loss": 0.1676, + "num_input_tokens_seen": 4829024, + "step": 2240 + }, + { + "epoch": 0.366231647634584, + "grad_norm": 0.46172747015953064, + "learning_rate": 0.00018303425774877653, + "loss": 0.1468, + "num_input_tokens_seen": 4839936, + "step": 2245 + }, + { + "epoch": 0.367047308319739, + "grad_norm": 0.5429582595825195, + "learning_rate": 0.00018344208809135398, + "loss": 0.2178, + "num_input_tokens_seen": 4851424, + "step": 2250 + }, + { + "epoch": 0.36786296900489396, + "grad_norm": 0.18980517983436584, + "learning_rate": 0.00018384991843393148, + "loss": 0.1108, + "num_input_tokens_seen": 4861888, + "step": 2255 + }, + { + "epoch": 0.36867862969004894, + "grad_norm": 0.52447509765625, + "learning_rate": 0.00018425774877650896, + "loss": 0.0889, + "num_input_tokens_seen": 4871136, + "step": 2260 + }, + { + "epoch": 0.3694942903752039, + "grad_norm": 0.4742538034915924, + "learning_rate": 0.00018466557911908646, + "loss": 0.0765, + "num_input_tokens_seen": 4882976, + "step": 2265 + }, + { + "epoch": 0.3703099510603589, + "grad_norm": 0.05093076080083847, + "learning_rate": 0.00018507340946166396, + "loss": 0.0865, + "num_input_tokens_seen": 4893536, + "step": 2270 + }, + { + "epoch": 0.37112561174551384, + "grad_norm": 0.5098185539245605, + "learning_rate": 0.00018548123980424144, + "loss": 0.2369, + "num_input_tokens_seen": 4904384, + "step": 2275 + }, + { + "epoch": 0.3719412724306688, + "grad_norm": 0.00874658115208149, + "learning_rate": 0.00018588907014681894, + "loss": 0.2179, + "num_input_tokens_seen": 4915008, + "step": 2280 + }, + { + "epoch": 0.3727569331158238, + "grad_norm": 0.49980396032333374, + "learning_rate": 0.0001862969004893964, + "loss": 0.081, + "num_input_tokens_seen": 4925632, + "step": 2285 + }, + { + "epoch": 0.3735725938009788, + "grad_norm": 0.029526453465223312, + "learning_rate": 0.0001867047308319739, + "loss": 0.0769, + "num_input_tokens_seen": 4936224, + "step": 2290 + }, + { + "epoch": 0.3743882544861338, + "grad_norm": 0.04340076446533203, + "learning_rate": 0.0001871125611745514, + "loss": 0.1736, + "num_input_tokens_seen": 4947072, + "step": 2295 + }, + { + "epoch": 0.37520391517128876, + "grad_norm": 2.040459156036377, + "learning_rate": 0.00018752039151712888, + "loss": 0.1318, + "num_input_tokens_seen": 4957024, + "step": 2300 + }, + { + "epoch": 0.37601957585644374, + "grad_norm": 0.05700768902897835, + "learning_rate": 0.00018792822185970638, + "loss": 0.0439, + "num_input_tokens_seen": 4966528, + "step": 2305 + }, + { + "epoch": 0.3768352365415987, + "grad_norm": 0.07470440119504929, + "learning_rate": 0.00018833605220228386, + "loss": 0.1136, + "num_input_tokens_seen": 4976928, + "step": 2310 + }, + { + "epoch": 0.37765089722675366, + "grad_norm": 1.5700215101242065, + "learning_rate": 0.00018874388254486133, + "loss": 0.2085, + "num_input_tokens_seen": 4987296, + "step": 2315 + }, + { + "epoch": 0.37846655791190864, + "grad_norm": 0.786880373954773, + "learning_rate": 0.0001891517128874388, + "loss": 0.0911, + "num_input_tokens_seen": 4998336, + "step": 2320 + }, + { + "epoch": 0.3792822185970636, + "grad_norm": 0.19122718274593353, + "learning_rate": 0.00018955954323001631, + "loss": 0.1669, + "num_input_tokens_seen": 5008256, + "step": 2325 + }, + { + "epoch": 0.3800978792822186, + "grad_norm": 1.7962323427200317, + "learning_rate": 0.00018996737357259382, + "loss": 0.195, + "num_input_tokens_seen": 5018688, + "step": 2330 + }, + { + "epoch": 0.3809135399673736, + "grad_norm": 0.29662618041038513, + "learning_rate": 0.0001903752039151713, + "loss": 0.1354, + "num_input_tokens_seen": 5029984, + "step": 2335 + }, + { + "epoch": 0.3817292006525285, + "grad_norm": 0.5231503248214722, + "learning_rate": 0.0001907830342577488, + "loss": 0.0567, + "num_input_tokens_seen": 5040160, + "step": 2340 + }, + { + "epoch": 0.3825448613376835, + "grad_norm": 1.3572986125946045, + "learning_rate": 0.00019119086460032627, + "loss": 0.1691, + "num_input_tokens_seen": 5052640, + "step": 2345 + }, + { + "epoch": 0.3833605220228385, + "grad_norm": 0.11215253174304962, + "learning_rate": 0.00019159869494290375, + "loss": 0.1856, + "num_input_tokens_seen": 5063616, + "step": 2350 + }, + { + "epoch": 0.3841761827079935, + "grad_norm": 1.3003672361373901, + "learning_rate": 0.00019200652528548123, + "loss": 0.1788, + "num_input_tokens_seen": 5075584, + "step": 2355 + }, + { + "epoch": 0.38499184339314846, + "grad_norm": 0.8807132840156555, + "learning_rate": 0.00019241435562805873, + "loss": 0.2466, + "num_input_tokens_seen": 5086272, + "step": 2360 + }, + { + "epoch": 0.38580750407830344, + "grad_norm": 0.5251150131225586, + "learning_rate": 0.00019282218597063623, + "loss": 0.2486, + "num_input_tokens_seen": 5098560, + "step": 2365 + }, + { + "epoch": 0.3866231647634584, + "grad_norm": 0.2054545283317566, + "learning_rate": 0.0001932300163132137, + "loss": 0.1211, + "num_input_tokens_seen": 5109792, + "step": 2370 + }, + { + "epoch": 0.38743882544861336, + "grad_norm": 1.0482416152954102, + "learning_rate": 0.00019363784665579121, + "loss": 0.2811, + "num_input_tokens_seen": 5119328, + "step": 2375 + }, + { + "epoch": 0.38825448613376834, + "grad_norm": 0.46562135219573975, + "learning_rate": 0.00019404567699836866, + "loss": 0.1733, + "num_input_tokens_seen": 5129824, + "step": 2380 + }, + { + "epoch": 0.3890701468189233, + "grad_norm": 1.1354540586471558, + "learning_rate": 0.00019445350734094617, + "loss": 0.2479, + "num_input_tokens_seen": 5140800, + "step": 2385 + }, + { + "epoch": 0.3898858075040783, + "grad_norm": 0.5768679976463318, + "learning_rate": 0.00019486133768352364, + "loss": 0.1671, + "num_input_tokens_seen": 5152064, + "step": 2390 + }, + { + "epoch": 0.3907014681892333, + "grad_norm": 0.4160184860229492, + "learning_rate": 0.00019526916802610115, + "loss": 0.1038, + "num_input_tokens_seen": 5164288, + "step": 2395 + }, + { + "epoch": 0.3915171288743883, + "grad_norm": 0.5931136012077332, + "learning_rate": 0.00019567699836867865, + "loss": 0.0697, + "num_input_tokens_seen": 5174784, + "step": 2400 + }, + { + "epoch": 0.3923327895595432, + "grad_norm": 0.5052030086517334, + "learning_rate": 0.00019608482871125613, + "loss": 0.1027, + "num_input_tokens_seen": 5185792, + "step": 2405 + }, + { + "epoch": 0.3931484502446982, + "grad_norm": 0.699626088142395, + "learning_rate": 0.00019649265905383363, + "loss": 0.2045, + "num_input_tokens_seen": 5196832, + "step": 2410 + }, + { + "epoch": 0.3939641109298532, + "grad_norm": 0.30739665031433105, + "learning_rate": 0.00019690048939641108, + "loss": 0.3627, + "num_input_tokens_seen": 5208288, + "step": 2415 + }, + { + "epoch": 0.39477977161500816, + "grad_norm": 0.7836719155311584, + "learning_rate": 0.00019730831973898858, + "loss": 0.0812, + "num_input_tokens_seen": 5219008, + "step": 2420 + }, + { + "epoch": 0.39559543230016314, + "grad_norm": 1.719211220741272, + "learning_rate": 0.00019771615008156606, + "loss": 0.1785, + "num_input_tokens_seen": 5230400, + "step": 2425 + }, + { + "epoch": 0.3964110929853181, + "grad_norm": 3.8855228424072266, + "learning_rate": 0.00019812398042414356, + "loss": 0.1409, + "num_input_tokens_seen": 5241920, + "step": 2430 + }, + { + "epoch": 0.3972267536704731, + "grad_norm": 0.14320261776447296, + "learning_rate": 0.00019853181076672107, + "loss": 0.1043, + "num_input_tokens_seen": 5252768, + "step": 2435 + }, + { + "epoch": 0.39804241435562804, + "grad_norm": 0.2717430591583252, + "learning_rate": 0.00019893964110929854, + "loss": 0.0898, + "num_input_tokens_seen": 5263840, + "step": 2440 + }, + { + "epoch": 0.398858075040783, + "grad_norm": 0.1439686268568039, + "learning_rate": 0.00019934747145187602, + "loss": 0.2313, + "num_input_tokens_seen": 5274656, + "step": 2445 + }, + { + "epoch": 0.399673735725938, + "grad_norm": 0.17499934136867523, + "learning_rate": 0.0001997553017944535, + "loss": 0.0766, + "num_input_tokens_seen": 5285504, + "step": 2450 + }, + { + "epoch": 0.400489396411093, + "grad_norm": 0.2679937779903412, + "learning_rate": 0.000200163132137031, + "loss": 0.2409, + "num_input_tokens_seen": 5297024, + "step": 2455 + }, + { + "epoch": 0.401305057096248, + "grad_norm": 0.47787871956825256, + "learning_rate": 0.00020057096247960848, + "loss": 0.2762, + "num_input_tokens_seen": 5308064, + "step": 2460 + }, + { + "epoch": 0.40212071778140296, + "grad_norm": 0.14121407270431519, + "learning_rate": 0.00020097879282218598, + "loss": 0.1081, + "num_input_tokens_seen": 5318880, + "step": 2465 + }, + { + "epoch": 0.4029363784665579, + "grad_norm": 0.4858434200286865, + "learning_rate": 0.00020138662316476348, + "loss": 0.1588, + "num_input_tokens_seen": 5329440, + "step": 2470 + }, + { + "epoch": 0.40375203915171287, + "grad_norm": 0.30492958426475525, + "learning_rate": 0.00020179445350734096, + "loss": 0.0775, + "num_input_tokens_seen": 5340000, + "step": 2475 + }, + { + "epoch": 0.40456769983686786, + "grad_norm": 5.706164836883545, + "learning_rate": 0.00020220228384991844, + "loss": 0.1216, + "num_input_tokens_seen": 5350944, + "step": 2480 + }, + { + "epoch": 0.40538336052202284, + "grad_norm": 0.20894655585289001, + "learning_rate": 0.0002026101141924959, + "loss": 0.257, + "num_input_tokens_seen": 5362016, + "step": 2485 + }, + { + "epoch": 0.4061990212071778, + "grad_norm": 0.1500648409128189, + "learning_rate": 0.00020301794453507342, + "loss": 0.1702, + "num_input_tokens_seen": 5373024, + "step": 2490 + }, + { + "epoch": 0.4070146818923328, + "grad_norm": 0.25598615407943726, + "learning_rate": 0.0002034257748776509, + "loss": 0.1976, + "num_input_tokens_seen": 5383840, + "step": 2495 + }, + { + "epoch": 0.4078303425774878, + "grad_norm": 0.2832091748714447, + "learning_rate": 0.0002038336052202284, + "loss": 0.0945, + "num_input_tokens_seen": 5394688, + "step": 2500 + }, + { + "epoch": 0.4086460032626427, + "grad_norm": 0.49175241589546204, + "learning_rate": 0.0002042414355628059, + "loss": 0.0589, + "num_input_tokens_seen": 5405760, + "step": 2505 + }, + { + "epoch": 0.4094616639477977, + "grad_norm": 2.1446218490600586, + "learning_rate": 0.00020464926590538335, + "loss": 0.1422, + "num_input_tokens_seen": 5416736, + "step": 2510 + }, + { + "epoch": 0.4102773246329527, + "grad_norm": 1.993091106414795, + "learning_rate": 0.00020505709624796085, + "loss": 0.1631, + "num_input_tokens_seen": 5428160, + "step": 2515 + }, + { + "epoch": 0.4110929853181077, + "grad_norm": 0.05304631590843201, + "learning_rate": 0.00020546492659053833, + "loss": 0.0302, + "num_input_tokens_seen": 5439616, + "step": 2520 + }, + { + "epoch": 0.41190864600326266, + "grad_norm": 2.1700851917266846, + "learning_rate": 0.00020587275693311583, + "loss": 0.1358, + "num_input_tokens_seen": 5450048, + "step": 2525 + }, + { + "epoch": 0.41272430668841764, + "grad_norm": 2.9487311840057373, + "learning_rate": 0.0002062805872756933, + "loss": 0.1655, + "num_input_tokens_seen": 5460512, + "step": 2530 + }, + { + "epoch": 0.41353996737357257, + "grad_norm": 1.1737804412841797, + "learning_rate": 0.0002066884176182708, + "loss": 0.1076, + "num_input_tokens_seen": 5470496, + "step": 2535 + }, + { + "epoch": 0.41435562805872755, + "grad_norm": 8.738869667053223, + "learning_rate": 0.00020709624796084832, + "loss": 0.4035, + "num_input_tokens_seen": 5482112, + "step": 2540 + }, + { + "epoch": 0.41517128874388254, + "grad_norm": 2.1070964336395264, + "learning_rate": 0.00020750407830342577, + "loss": 0.0887, + "num_input_tokens_seen": 5493984, + "step": 2545 + }, + { + "epoch": 0.4159869494290375, + "grad_norm": 0.5695657134056091, + "learning_rate": 0.00020791190864600327, + "loss": 0.0843, + "num_input_tokens_seen": 5505440, + "step": 2550 + }, + { + "epoch": 0.4168026101141925, + "grad_norm": 0.01275833323597908, + "learning_rate": 0.00020831973898858075, + "loss": 0.1324, + "num_input_tokens_seen": 5516800, + "step": 2555 + }, + { + "epoch": 0.4176182707993475, + "grad_norm": 0.055172014981508255, + "learning_rate": 0.00020872756933115825, + "loss": 0.0315, + "num_input_tokens_seen": 5527360, + "step": 2560 + }, + { + "epoch": 0.4184339314845024, + "grad_norm": 1.583932638168335, + "learning_rate": 0.00020913539967373573, + "loss": 0.1176, + "num_input_tokens_seen": 5537792, + "step": 2565 + }, + { + "epoch": 0.4192495921696574, + "grad_norm": 0.4243050217628479, + "learning_rate": 0.00020954323001631323, + "loss": 0.269, + "num_input_tokens_seen": 5548864, + "step": 2570 + }, + { + "epoch": 0.4200652528548124, + "grad_norm": 2.5984838008880615, + "learning_rate": 0.0002099510603588907, + "loss": 0.0807, + "num_input_tokens_seen": 5559072, + "step": 2575 + }, + { + "epoch": 0.42088091353996737, + "grad_norm": 0.3416847288608551, + "learning_rate": 0.00021035889070146818, + "loss": 0.2578, + "num_input_tokens_seen": 5568864, + "step": 2580 + }, + { + "epoch": 0.42169657422512236, + "grad_norm": 0.1450396478176117, + "learning_rate": 0.00021076672104404569, + "loss": 0.1429, + "num_input_tokens_seen": 5579744, + "step": 2585 + }, + { + "epoch": 0.42251223491027734, + "grad_norm": 0.7415868639945984, + "learning_rate": 0.00021117455138662316, + "loss": 0.2167, + "num_input_tokens_seen": 5589824, + "step": 2590 + }, + { + "epoch": 0.4233278955954323, + "grad_norm": 0.3233489990234375, + "learning_rate": 0.00021158238172920067, + "loss": 0.143, + "num_input_tokens_seen": 5601056, + "step": 2595 + }, + { + "epoch": 0.42414355628058725, + "grad_norm": 0.10952405631542206, + "learning_rate": 0.00021199021207177814, + "loss": 0.2179, + "num_input_tokens_seen": 5611744, + "step": 2600 + }, + { + "epoch": 0.42495921696574224, + "grad_norm": 0.619699239730835, + "learning_rate": 0.00021239804241435562, + "loss": 0.2343, + "num_input_tokens_seen": 5621600, + "step": 2605 + }, + { + "epoch": 0.4257748776508972, + "grad_norm": 0.3699929118156433, + "learning_rate": 0.00021280587275693312, + "loss": 0.143, + "num_input_tokens_seen": 5633088, + "step": 2610 + }, + { + "epoch": 0.4265905383360522, + "grad_norm": 1.0430079698562622, + "learning_rate": 0.0002132137030995106, + "loss": 0.1942, + "num_input_tokens_seen": 5644352, + "step": 2615 + }, + { + "epoch": 0.4274061990212072, + "grad_norm": 0.596524178981781, + "learning_rate": 0.0002136215334420881, + "loss": 0.1614, + "num_input_tokens_seen": 5653600, + "step": 2620 + }, + { + "epoch": 0.4282218597063622, + "grad_norm": 0.615421712398529, + "learning_rate": 0.00021402936378466558, + "loss": 0.1703, + "num_input_tokens_seen": 5665152, + "step": 2625 + }, + { + "epoch": 0.4290375203915171, + "grad_norm": 1.0820027589797974, + "learning_rate": 0.00021443719412724308, + "loss": 0.1177, + "num_input_tokens_seen": 5676640, + "step": 2630 + }, + { + "epoch": 0.4298531810766721, + "grad_norm": 0.33138588070869446, + "learning_rate": 0.00021484502446982056, + "loss": 0.0626, + "num_input_tokens_seen": 5687360, + "step": 2635 + }, + { + "epoch": 0.43066884176182707, + "grad_norm": 1.8373628854751587, + "learning_rate": 0.00021525285481239804, + "loss": 0.1724, + "num_input_tokens_seen": 5698112, + "step": 2640 + }, + { + "epoch": 0.43148450244698205, + "grad_norm": 0.3439456820487976, + "learning_rate": 0.00021566068515497554, + "loss": 0.2369, + "num_input_tokens_seen": 5708480, + "step": 2645 + }, + { + "epoch": 0.43230016313213704, + "grad_norm": 1.2714189291000366, + "learning_rate": 0.00021606851549755302, + "loss": 0.1843, + "num_input_tokens_seen": 5718144, + "step": 2650 + }, + { + "epoch": 0.433115823817292, + "grad_norm": 0.7776082754135132, + "learning_rate": 0.00021647634584013052, + "loss": 0.2882, + "num_input_tokens_seen": 5728832, + "step": 2655 + }, + { + "epoch": 0.433931484502447, + "grad_norm": 0.16278807818889618, + "learning_rate": 0.000216884176182708, + "loss": 0.1196, + "num_input_tokens_seen": 5739584, + "step": 2660 + }, + { + "epoch": 0.43474714518760194, + "grad_norm": 0.8418310284614563, + "learning_rate": 0.0002172920065252855, + "loss": 0.145, + "num_input_tokens_seen": 5749632, + "step": 2665 + }, + { + "epoch": 0.4355628058727569, + "grad_norm": 0.3150191903114319, + "learning_rate": 0.00021769983686786295, + "loss": 0.0556, + "num_input_tokens_seen": 5760480, + "step": 2670 + }, + { + "epoch": 0.4363784665579119, + "grad_norm": 0.9902245998382568, + "learning_rate": 0.00021810766721044045, + "loss": 0.1921, + "num_input_tokens_seen": 5770560, + "step": 2675 + }, + { + "epoch": 0.4371941272430669, + "grad_norm": 1.708687424659729, + "learning_rate": 0.00021851549755301796, + "loss": 0.1473, + "num_input_tokens_seen": 5780416, + "step": 2680 + }, + { + "epoch": 0.43800978792822187, + "grad_norm": 0.952900767326355, + "learning_rate": 0.00021892332789559543, + "loss": 0.2966, + "num_input_tokens_seen": 5791136, + "step": 2685 + }, + { + "epoch": 0.43882544861337686, + "grad_norm": 1.0747466087341309, + "learning_rate": 0.00021933115823817294, + "loss": 0.1756, + "num_input_tokens_seen": 5801600, + "step": 2690 + }, + { + "epoch": 0.4396411092985318, + "grad_norm": 0.06641166657209396, + "learning_rate": 0.0002197389885807504, + "loss": 0.19, + "num_input_tokens_seen": 5813632, + "step": 2695 + }, + { + "epoch": 0.44045676998368677, + "grad_norm": 0.29006800055503845, + "learning_rate": 0.00022014681892332792, + "loss": 0.0424, + "num_input_tokens_seen": 5823680, + "step": 2700 + }, + { + "epoch": 0.44127243066884175, + "grad_norm": 0.15687257051467896, + "learning_rate": 0.00022055464926590536, + "loss": 0.1069, + "num_input_tokens_seen": 5833760, + "step": 2705 + }, + { + "epoch": 0.44208809135399674, + "grad_norm": 0.32435062527656555, + "learning_rate": 0.00022096247960848287, + "loss": 0.184, + "num_input_tokens_seen": 5844352, + "step": 2710 + }, + { + "epoch": 0.4429037520391517, + "grad_norm": 0.9220103621482849, + "learning_rate": 0.00022137030995106037, + "loss": 0.1101, + "num_input_tokens_seen": 5854624, + "step": 2715 + }, + { + "epoch": 0.4437194127243067, + "grad_norm": 0.1260593682527542, + "learning_rate": 0.00022177814029363785, + "loss": 0.1279, + "num_input_tokens_seen": 5865280, + "step": 2720 + }, + { + "epoch": 0.4445350734094617, + "grad_norm": 0.10509152710437775, + "learning_rate": 0.00022218597063621535, + "loss": 0.128, + "num_input_tokens_seen": 5875552, + "step": 2725 + }, + { + "epoch": 0.4453507340946166, + "grad_norm": 0.3267362117767334, + "learning_rate": 0.00022259380097879283, + "loss": 0.2118, + "num_input_tokens_seen": 5886592, + "step": 2730 + }, + { + "epoch": 0.4461663947797716, + "grad_norm": 0.5570999383926392, + "learning_rate": 0.0002230016313213703, + "loss": 0.1433, + "num_input_tokens_seen": 5896864, + "step": 2735 + }, + { + "epoch": 0.4469820554649266, + "grad_norm": 0.09635140746831894, + "learning_rate": 0.00022340946166394778, + "loss": 0.1857, + "num_input_tokens_seen": 5907360, + "step": 2740 + }, + { + "epoch": 0.44779771615008157, + "grad_norm": 0.42166176438331604, + "learning_rate": 0.00022381729200652529, + "loss": 0.1385, + "num_input_tokens_seen": 5917344, + "step": 2745 + }, + { + "epoch": 0.44861337683523655, + "grad_norm": 0.3304622173309326, + "learning_rate": 0.0002242251223491028, + "loss": 0.2181, + "num_input_tokens_seen": 5929536, + "step": 2750 + }, + { + "epoch": 0.44942903752039154, + "grad_norm": 0.3093664050102234, + "learning_rate": 0.00022463295269168027, + "loss": 0.155, + "num_input_tokens_seen": 5940992, + "step": 2755 + }, + { + "epoch": 0.45024469820554647, + "grad_norm": 0.12017809599637985, + "learning_rate": 0.00022504078303425777, + "loss": 0.1618, + "num_input_tokens_seen": 5951520, + "step": 2760 + }, + { + "epoch": 0.45106035889070145, + "grad_norm": 0.5987087488174438, + "learning_rate": 0.00022544861337683525, + "loss": 0.1302, + "num_input_tokens_seen": 5961952, + "step": 2765 + }, + { + "epoch": 0.45187601957585644, + "grad_norm": 1.5060359239578247, + "learning_rate": 0.00022585644371941272, + "loss": 0.2578, + "num_input_tokens_seen": 5972000, + "step": 2770 + }, + { + "epoch": 0.4526916802610114, + "grad_norm": 0.19394594430923462, + "learning_rate": 0.0002262642740619902, + "loss": 0.1126, + "num_input_tokens_seen": 5981984, + "step": 2775 + }, + { + "epoch": 0.4535073409461664, + "grad_norm": 0.22451826930046082, + "learning_rate": 0.0002266721044045677, + "loss": 0.0887, + "num_input_tokens_seen": 5992800, + "step": 2780 + }, + { + "epoch": 0.4543230016313214, + "grad_norm": 0.9931198954582214, + "learning_rate": 0.0002270799347471452, + "loss": 0.0777, + "num_input_tokens_seen": 6003808, + "step": 2785 + }, + { + "epoch": 0.4551386623164764, + "grad_norm": 2.576740026473999, + "learning_rate": 0.00022748776508972268, + "loss": 0.1955, + "num_input_tokens_seen": 6014336, + "step": 2790 + }, + { + "epoch": 0.4559543230016313, + "grad_norm": 0.04945773258805275, + "learning_rate": 0.00022789559543230019, + "loss": 0.2142, + "num_input_tokens_seen": 6025792, + "step": 2795 + }, + { + "epoch": 0.4567699836867863, + "grad_norm": 0.8532063364982605, + "learning_rate": 0.00022830342577487763, + "loss": 0.1424, + "num_input_tokens_seen": 6035968, + "step": 2800 + }, + { + "epoch": 0.45758564437194127, + "grad_norm": 0.15329685807228088, + "learning_rate": 0.00022871125611745514, + "loss": 0.1321, + "num_input_tokens_seen": 6047040, + "step": 2805 + }, + { + "epoch": 0.45840130505709625, + "grad_norm": 0.15588818490505219, + "learning_rate": 0.00022911908646003261, + "loss": 0.0808, + "num_input_tokens_seen": 6057728, + "step": 2810 + }, + { + "epoch": 0.45921696574225124, + "grad_norm": 0.7118588089942932, + "learning_rate": 0.00022952691680261012, + "loss": 0.1649, + "num_input_tokens_seen": 6070208, + "step": 2815 + }, + { + "epoch": 0.4600326264274062, + "grad_norm": 0.23197035491466522, + "learning_rate": 0.00022993474714518762, + "loss": 0.1928, + "num_input_tokens_seen": 6082144, + "step": 2820 + }, + { + "epoch": 0.46084828711256115, + "grad_norm": 0.136517733335495, + "learning_rate": 0.0002303425774877651, + "loss": 0.1365, + "num_input_tokens_seen": 6094112, + "step": 2825 + }, + { + "epoch": 0.46166394779771613, + "grad_norm": 0.056639283895492554, + "learning_rate": 0.0002307504078303426, + "loss": 0.1038, + "num_input_tokens_seen": 6104608, + "step": 2830 + }, + { + "epoch": 0.4624796084828711, + "grad_norm": 0.2406209260225296, + "learning_rate": 0.00023115823817292005, + "loss": 0.0966, + "num_input_tokens_seen": 6114816, + "step": 2835 + }, + { + "epoch": 0.4632952691680261, + "grad_norm": 0.0827520340681076, + "learning_rate": 0.00023156606851549755, + "loss": 0.1219, + "num_input_tokens_seen": 6125952, + "step": 2840 + }, + { + "epoch": 0.4641109298531811, + "grad_norm": 0.08483751118183136, + "learning_rate": 0.00023197389885807503, + "loss": 0.2323, + "num_input_tokens_seen": 6137056, + "step": 2845 + }, + { + "epoch": 0.46492659053833607, + "grad_norm": 1.0944316387176514, + "learning_rate": 0.00023238172920065253, + "loss": 0.0779, + "num_input_tokens_seen": 6147840, + "step": 2850 + }, + { + "epoch": 0.46574225122349105, + "grad_norm": 0.10864396393299103, + "learning_rate": 0.00023278955954323004, + "loss": 0.222, + "num_input_tokens_seen": 6159808, + "step": 2855 + }, + { + "epoch": 0.466557911908646, + "grad_norm": 0.804469108581543, + "learning_rate": 0.00023319738988580751, + "loss": 0.1266, + "num_input_tokens_seen": 6171520, + "step": 2860 + }, + { + "epoch": 0.46737357259380097, + "grad_norm": 0.7416703701019287, + "learning_rate": 0.000233605220228385, + "loss": 0.1842, + "num_input_tokens_seen": 6182656, + "step": 2865 + }, + { + "epoch": 0.46818923327895595, + "grad_norm": 0.2062879502773285, + "learning_rate": 0.00023401305057096247, + "loss": 0.2398, + "num_input_tokens_seen": 6193696, + "step": 2870 + }, + { + "epoch": 0.46900489396411094, + "grad_norm": 0.3211911618709564, + "learning_rate": 0.00023442088091353997, + "loss": 0.1797, + "num_input_tokens_seen": 6204192, + "step": 2875 + }, + { + "epoch": 0.4698205546492659, + "grad_norm": 0.5380843877792358, + "learning_rate": 0.00023482871125611747, + "loss": 0.1488, + "num_input_tokens_seen": 6215136, + "step": 2880 + }, + { + "epoch": 0.4706362153344209, + "grad_norm": 0.6130079627037048, + "learning_rate": 0.00023523654159869495, + "loss": 0.0896, + "num_input_tokens_seen": 6225952, + "step": 2885 + }, + { + "epoch": 0.47145187601957583, + "grad_norm": 1.6829217672348022, + "learning_rate": 0.00023564437194127245, + "loss": 0.2225, + "num_input_tokens_seen": 6237152, + "step": 2890 + }, + { + "epoch": 0.4722675367047308, + "grad_norm": 0.5651580691337585, + "learning_rate": 0.00023605220228384993, + "loss": 0.1513, + "num_input_tokens_seen": 6248416, + "step": 2895 + }, + { + "epoch": 0.4730831973898858, + "grad_norm": 1.282302737236023, + "learning_rate": 0.0002364600326264274, + "loss": 0.1621, + "num_input_tokens_seen": 6259840, + "step": 2900 + }, + { + "epoch": 0.4738988580750408, + "grad_norm": 0.22257353365421295, + "learning_rate": 0.00023686786296900488, + "loss": 0.1043, + "num_input_tokens_seen": 6271104, + "step": 2905 + }, + { + "epoch": 0.47471451876019577, + "grad_norm": 0.780252993106842, + "learning_rate": 0.0002372756933115824, + "loss": 0.1725, + "num_input_tokens_seen": 6281696, + "step": 2910 + }, + { + "epoch": 0.47553017944535075, + "grad_norm": 1.1723055839538574, + "learning_rate": 0.0002376835236541599, + "loss": 0.1408, + "num_input_tokens_seen": 6293760, + "step": 2915 + }, + { + "epoch": 0.4763458401305057, + "grad_norm": 0.23256178200244904, + "learning_rate": 0.00023809135399673737, + "loss": 0.091, + "num_input_tokens_seen": 6305376, + "step": 2920 + }, + { + "epoch": 0.47716150081566067, + "grad_norm": 0.22261440753936768, + "learning_rate": 0.00023849918433931487, + "loss": 0.1022, + "num_input_tokens_seen": 6316032, + "step": 2925 + }, + { + "epoch": 0.47797716150081565, + "grad_norm": 1.2297919988632202, + "learning_rate": 0.00023890701468189232, + "loss": 0.2899, + "num_input_tokens_seen": 6327552, + "step": 2930 + }, + { + "epoch": 0.47879282218597063, + "grad_norm": 0.4029012620449066, + "learning_rate": 0.00023931484502446982, + "loss": 0.1099, + "num_input_tokens_seen": 6337344, + "step": 2935 + }, + { + "epoch": 0.4796084828711256, + "grad_norm": 0.3078548312187195, + "learning_rate": 0.0002397226753670473, + "loss": 0.1346, + "num_input_tokens_seen": 6349120, + "step": 2940 + }, + { + "epoch": 0.4804241435562806, + "grad_norm": 0.18580852448940277, + "learning_rate": 0.0002401305057096248, + "loss": 0.1392, + "num_input_tokens_seen": 6359584, + "step": 2945 + }, + { + "epoch": 0.4812398042414356, + "grad_norm": 0.7232683897018433, + "learning_rate": 0.0002405383360522023, + "loss": 0.1193, + "num_input_tokens_seen": 6370112, + "step": 2950 + }, + { + "epoch": 0.4820554649265905, + "grad_norm": 0.07306995987892151, + "learning_rate": 0.00024094616639477978, + "loss": 0.0465, + "num_input_tokens_seen": 6381248, + "step": 2955 + }, + { + "epoch": 0.4828711256117455, + "grad_norm": 1.1193236112594604, + "learning_rate": 0.00024135399673735726, + "loss": 0.1825, + "num_input_tokens_seen": 6392224, + "step": 2960 + }, + { + "epoch": 0.4836867862969005, + "grad_norm": 0.22385838627815247, + "learning_rate": 0.00024176182707993474, + "loss": 0.2347, + "num_input_tokens_seen": 6401888, + "step": 2965 + }, + { + "epoch": 0.48450244698205547, + "grad_norm": 0.5825753808021545, + "learning_rate": 0.00024216965742251224, + "loss": 0.1298, + "num_input_tokens_seen": 6412352, + "step": 2970 + }, + { + "epoch": 0.48531810766721045, + "grad_norm": 1.0909613370895386, + "learning_rate": 0.00024257748776508972, + "loss": 0.115, + "num_input_tokens_seen": 6422880, + "step": 2975 + }, + { + "epoch": 0.48613376835236544, + "grad_norm": 0.23585692048072815, + "learning_rate": 0.00024298531810766722, + "loss": 0.1675, + "num_input_tokens_seen": 6433696, + "step": 2980 + }, + { + "epoch": 0.48694942903752036, + "grad_norm": 0.722490131855011, + "learning_rate": 0.00024339314845024472, + "loss": 0.2255, + "num_input_tokens_seen": 6444576, + "step": 2985 + }, + { + "epoch": 0.48776508972267535, + "grad_norm": 0.2733224630355835, + "learning_rate": 0.0002438009787928222, + "loss": 0.0739, + "num_input_tokens_seen": 6455616, + "step": 2990 + }, + { + "epoch": 0.48858075040783033, + "grad_norm": 0.12696190178394318, + "learning_rate": 0.0002442088091353997, + "loss": 0.1702, + "num_input_tokens_seen": 6465632, + "step": 2995 + }, + { + "epoch": 0.4893964110929853, + "grad_norm": 1.2236684560775757, + "learning_rate": 0.00024461663947797715, + "loss": 0.1988, + "num_input_tokens_seen": 6476320, + "step": 3000 + }, + { + "epoch": 0.4902120717781403, + "grad_norm": 0.11306619644165039, + "learning_rate": 0.00024502446982055463, + "loss": 0.1749, + "num_input_tokens_seen": 6486560, + "step": 3005 + }, + { + "epoch": 0.4910277324632953, + "grad_norm": 0.06194991618394852, + "learning_rate": 0.00024543230016313216, + "loss": 0.06, + "num_input_tokens_seen": 6496448, + "step": 3010 + }, + { + "epoch": 0.49184339314845027, + "grad_norm": 0.1334661990404129, + "learning_rate": 0.00024584013050570964, + "loss": 0.1295, + "num_input_tokens_seen": 6506624, + "step": 3015 + }, + { + "epoch": 0.4926590538336052, + "grad_norm": 0.09926887601613998, + "learning_rate": 0.0002462479608482871, + "loss": 0.1661, + "num_input_tokens_seen": 6516960, + "step": 3020 + }, + { + "epoch": 0.4934747145187602, + "grad_norm": 1.0292459726333618, + "learning_rate": 0.0002466557911908646, + "loss": 0.1348, + "num_input_tokens_seen": 6528896, + "step": 3025 + }, + { + "epoch": 0.49429037520391517, + "grad_norm": 0.5590057969093323, + "learning_rate": 0.00024706362153344207, + "loss": 0.0731, + "num_input_tokens_seen": 6540576, + "step": 3030 + }, + { + "epoch": 0.49510603588907015, + "grad_norm": 0.3860446512699127, + "learning_rate": 0.0002474714518760196, + "loss": 0.0626, + "num_input_tokens_seen": 6551424, + "step": 3035 + }, + { + "epoch": 0.49592169657422513, + "grad_norm": 0.12069137394428253, + "learning_rate": 0.0002478792822185971, + "loss": 0.0862, + "num_input_tokens_seen": 6562176, + "step": 3040 + }, + { + "epoch": 0.4967373572593801, + "grad_norm": 0.0766163021326065, + "learning_rate": 0.00024828711256117455, + "loss": 0.0904, + "num_input_tokens_seen": 6572384, + "step": 3045 + }, + { + "epoch": 0.49755301794453505, + "grad_norm": 0.1165001317858696, + "learning_rate": 0.000248694942903752, + "loss": 0.2203, + "num_input_tokens_seen": 6583424, + "step": 3050 + }, + { + "epoch": 0.49836867862969003, + "grad_norm": 0.015077603980898857, + "learning_rate": 0.00024910277324632956, + "loss": 0.0513, + "num_input_tokens_seen": 6594144, + "step": 3055 + }, + { + "epoch": 0.499184339314845, + "grad_norm": 0.4812507629394531, + "learning_rate": 0.00024951060358890703, + "loss": 0.1248, + "num_input_tokens_seen": 6605760, + "step": 3060 + }, + { + "epoch": 0.5, + "grad_norm": 0.2543140649795532, + "learning_rate": 0.0002499184339314845, + "loss": 0.1945, + "num_input_tokens_seen": 6616832, + "step": 3065 + }, + { + "epoch": 0.5008156606851549, + "grad_norm": 0.787386417388916, + "learning_rate": 0.00025032626427406204, + "loss": 0.159, + "num_input_tokens_seen": 6628384, + "step": 3070 + }, + { + "epoch": 0.50163132137031, + "grad_norm": 0.19998139142990112, + "learning_rate": 0.00025073409461663946, + "loss": 0.0486, + "num_input_tokens_seen": 6639456, + "step": 3075 + }, + { + "epoch": 0.5024469820554649, + "grad_norm": 0.042470287531614304, + "learning_rate": 0.00025114192495921694, + "loss": 0.0311, + "num_input_tokens_seen": 6651520, + "step": 3080 + }, + { + "epoch": 0.5032626427406199, + "grad_norm": 0.04640533775091171, + "learning_rate": 0.00025154975530179447, + "loss": 0.1245, + "num_input_tokens_seen": 6661664, + "step": 3085 + }, + { + "epoch": 0.5040783034257749, + "grad_norm": 1.1446317434310913, + "learning_rate": 0.00025195758564437195, + "loss": 0.2796, + "num_input_tokens_seen": 6673024, + "step": 3090 + }, + { + "epoch": 0.5048939641109299, + "grad_norm": 0.34324145317077637, + "learning_rate": 0.0002523654159869495, + "loss": 0.1932, + "num_input_tokens_seen": 6685216, + "step": 3095 + }, + { + "epoch": 0.5057096247960848, + "grad_norm": 1.195542812347412, + "learning_rate": 0.0002527732463295269, + "loss": 0.1647, + "num_input_tokens_seen": 6696096, + "step": 3100 + }, + { + "epoch": 0.5065252854812398, + "grad_norm": 0.16621847450733185, + "learning_rate": 0.0002531810766721044, + "loss": 0.2937, + "num_input_tokens_seen": 6706656, + "step": 3105 + }, + { + "epoch": 0.5073409461663948, + "grad_norm": 0.3265911042690277, + "learning_rate": 0.0002535889070146819, + "loss": 0.1488, + "num_input_tokens_seen": 6716704, + "step": 3110 + }, + { + "epoch": 0.5081566068515497, + "grad_norm": 0.47061917185783386, + "learning_rate": 0.0002539967373572594, + "loss": 0.1496, + "num_input_tokens_seen": 6728352, + "step": 3115 + }, + { + "epoch": 0.5089722675367048, + "grad_norm": 0.8664241433143616, + "learning_rate": 0.00025440456769983686, + "loss": 0.1336, + "num_input_tokens_seen": 6739296, + "step": 3120 + }, + { + "epoch": 0.5097879282218597, + "grad_norm": 0.4536451995372772, + "learning_rate": 0.00025481239804241434, + "loss": 0.1314, + "num_input_tokens_seen": 6750176, + "step": 3125 + }, + { + "epoch": 0.5106035889070146, + "grad_norm": 0.8622775673866272, + "learning_rate": 0.00025522022838499187, + "loss": 0.1824, + "num_input_tokens_seen": 6760288, + "step": 3130 + }, + { + "epoch": 0.5114192495921697, + "grad_norm": 0.4697278141975403, + "learning_rate": 0.00025562805872756934, + "loss": 0.1319, + "num_input_tokens_seen": 6769792, + "step": 3135 + }, + { + "epoch": 0.5122349102773246, + "grad_norm": 0.8493194580078125, + "learning_rate": 0.0002560358890701468, + "loss": 0.169, + "num_input_tokens_seen": 6779712, + "step": 3140 + }, + { + "epoch": 0.5130505709624796, + "grad_norm": 0.7845749258995056, + "learning_rate": 0.0002564437194127243, + "loss": 0.1126, + "num_input_tokens_seen": 6791232, + "step": 3145 + }, + { + "epoch": 0.5138662316476346, + "grad_norm": 0.11098422855138779, + "learning_rate": 0.00025685154975530177, + "loss": 0.0951, + "num_input_tokens_seen": 6801696, + "step": 3150 + }, + { + "epoch": 0.5146818923327896, + "grad_norm": 0.016223762184381485, + "learning_rate": 0.0002572593800978793, + "loss": 0.0867, + "num_input_tokens_seen": 6812672, + "step": 3155 + }, + { + "epoch": 0.5154975530179445, + "grad_norm": 1.0923768281936646, + "learning_rate": 0.0002576672104404568, + "loss": 0.2778, + "num_input_tokens_seen": 6822976, + "step": 3160 + }, + { + "epoch": 0.5163132137030995, + "grad_norm": 0.14902295172214508, + "learning_rate": 0.0002580750407830343, + "loss": 0.179, + "num_input_tokens_seen": 6833888, + "step": 3165 + }, + { + "epoch": 0.5171288743882545, + "grad_norm": 0.7654731869697571, + "learning_rate": 0.00025848287112561173, + "loss": 0.3424, + "num_input_tokens_seen": 6845632, + "step": 3170 + }, + { + "epoch": 0.5179445350734094, + "grad_norm": 0.13922156393527985, + "learning_rate": 0.0002588907014681892, + "loss": 0.1384, + "num_input_tokens_seen": 6856128, + "step": 3175 + }, + { + "epoch": 0.5187601957585645, + "grad_norm": 0.2902519404888153, + "learning_rate": 0.00025929853181076674, + "loss": 0.0676, + "num_input_tokens_seen": 6867840, + "step": 3180 + }, + { + "epoch": 0.5195758564437194, + "grad_norm": 0.7630822658538818, + "learning_rate": 0.0002597063621533442, + "loss": 0.1326, + "num_input_tokens_seen": 6878112, + "step": 3185 + }, + { + "epoch": 0.5203915171288744, + "grad_norm": 0.21264766156673431, + "learning_rate": 0.00026011419249592175, + "loss": 0.1368, + "num_input_tokens_seen": 6889312, + "step": 3190 + }, + { + "epoch": 0.5212071778140294, + "grad_norm": 0.3382579982280731, + "learning_rate": 0.00026052202283849917, + "loss": 0.2752, + "num_input_tokens_seen": 6899968, + "step": 3195 + }, + { + "epoch": 0.5220228384991843, + "grad_norm": 0.5595989227294922, + "learning_rate": 0.0002609298531810767, + "loss": 0.106, + "num_input_tokens_seen": 6909568, + "step": 3200 + }, + { + "epoch": 0.5228384991843393, + "grad_norm": 0.2680160403251648, + "learning_rate": 0.0002613376835236542, + "loss": 0.0982, + "num_input_tokens_seen": 6920928, + "step": 3205 + }, + { + "epoch": 0.5236541598694943, + "grad_norm": 0.43840864300727844, + "learning_rate": 0.00026174551386623165, + "loss": 0.059, + "num_input_tokens_seen": 6930144, + "step": 3210 + }, + { + "epoch": 0.5244698205546493, + "grad_norm": 0.0211274903267622, + "learning_rate": 0.00026215334420880913, + "loss": 0.1161, + "num_input_tokens_seen": 6940320, + "step": 3215 + }, + { + "epoch": 0.5252854812398042, + "grad_norm": 0.3852957487106323, + "learning_rate": 0.0002625611745513866, + "loss": 0.2524, + "num_input_tokens_seen": 6950656, + "step": 3220 + }, + { + "epoch": 0.5261011419249593, + "grad_norm": 0.7833412885665894, + "learning_rate": 0.00026296900489396414, + "loss": 0.3218, + "num_input_tokens_seen": 6961216, + "step": 3225 + }, + { + "epoch": 0.5269168026101142, + "grad_norm": 0.1404338926076889, + "learning_rate": 0.0002633768352365416, + "loss": 0.1797, + "num_input_tokens_seen": 6971808, + "step": 3230 + }, + { + "epoch": 0.5277324632952691, + "grad_norm": 0.2573801577091217, + "learning_rate": 0.0002637846655791191, + "loss": 0.1472, + "num_input_tokens_seen": 6984000, + "step": 3235 + }, + { + "epoch": 0.5285481239804242, + "grad_norm": 0.347644180059433, + "learning_rate": 0.00026419249592169657, + "loss": 0.1489, + "num_input_tokens_seen": 6994112, + "step": 3240 + }, + { + "epoch": 0.5293637846655791, + "grad_norm": 0.2416415959596634, + "learning_rate": 0.00026460032626427404, + "loss": 0.1576, + "num_input_tokens_seen": 7005184, + "step": 3245 + }, + { + "epoch": 0.5301794453507341, + "grad_norm": 0.15647706389427185, + "learning_rate": 0.00026500815660685157, + "loss": 0.1407, + "num_input_tokens_seen": 7015968, + "step": 3250 + }, + { + "epoch": 0.5309951060358891, + "grad_norm": 0.6454426646232605, + "learning_rate": 0.00026541598694942905, + "loss": 0.1001, + "num_input_tokens_seen": 7027584, + "step": 3255 + }, + { + "epoch": 0.531810766721044, + "grad_norm": 0.3946031630039215, + "learning_rate": 0.0002658238172920066, + "loss": 0.1434, + "num_input_tokens_seen": 7039424, + "step": 3260 + }, + { + "epoch": 0.532626427406199, + "grad_norm": 0.1988263875246048, + "learning_rate": 0.000266231647634584, + "loss": 0.2385, + "num_input_tokens_seen": 7049824, + "step": 3265 + }, + { + "epoch": 0.533442088091354, + "grad_norm": 0.3768634498119354, + "learning_rate": 0.0002666394779771615, + "loss": 0.2721, + "num_input_tokens_seen": 7061472, + "step": 3270 + }, + { + "epoch": 0.534257748776509, + "grad_norm": 0.9409236311912537, + "learning_rate": 0.000267047308319739, + "loss": 0.1761, + "num_input_tokens_seen": 7073152, + "step": 3275 + }, + { + "epoch": 0.5350734094616639, + "grad_norm": 0.7082731127738953, + "learning_rate": 0.0002674551386623165, + "loss": 0.192, + "num_input_tokens_seen": 7084640, + "step": 3280 + }, + { + "epoch": 0.535889070146819, + "grad_norm": 0.22120733559131622, + "learning_rate": 0.00026786296900489396, + "loss": 0.0733, + "num_input_tokens_seen": 7093088, + "step": 3285 + }, + { + "epoch": 0.5367047308319739, + "grad_norm": 0.10793591290712357, + "learning_rate": 0.00026827079934747144, + "loss": 0.0965, + "num_input_tokens_seen": 7103072, + "step": 3290 + }, + { + "epoch": 0.5375203915171288, + "grad_norm": 0.1707492172718048, + "learning_rate": 0.00026867862969004897, + "loss": 0.0664, + "num_input_tokens_seen": 7113792, + "step": 3295 + }, + { + "epoch": 0.5383360522022839, + "grad_norm": 0.06590881943702698, + "learning_rate": 0.00026908646003262645, + "loss": 0.109, + "num_input_tokens_seen": 7124800, + "step": 3300 + }, + { + "epoch": 0.5391517128874388, + "grad_norm": 0.35956960916519165, + "learning_rate": 0.0002694942903752039, + "loss": 0.0801, + "num_input_tokens_seen": 7137280, + "step": 3305 + }, + { + "epoch": 0.5399673735725938, + "grad_norm": 0.5606528520584106, + "learning_rate": 0.0002699021207177814, + "loss": 0.0672, + "num_input_tokens_seen": 7147264, + "step": 3310 + }, + { + "epoch": 0.5407830342577488, + "grad_norm": 0.4383194148540497, + "learning_rate": 0.0002703099510603589, + "loss": 0.1905, + "num_input_tokens_seen": 7157408, + "step": 3315 + }, + { + "epoch": 0.5415986949429038, + "grad_norm": 0.6812806129455566, + "learning_rate": 0.0002707177814029364, + "loss": 0.0451, + "num_input_tokens_seen": 7168224, + "step": 3320 + }, + { + "epoch": 0.5424143556280587, + "grad_norm": 1.3847594261169434, + "learning_rate": 0.0002711256117455139, + "loss": 0.1935, + "num_input_tokens_seen": 7179680, + "step": 3325 + }, + { + "epoch": 0.5432300163132137, + "grad_norm": 0.5613686442375183, + "learning_rate": 0.0002715334420880914, + "loss": 0.0687, + "num_input_tokens_seen": 7190944, + "step": 3330 + }, + { + "epoch": 0.5440456769983687, + "grad_norm": 0.03551279008388519, + "learning_rate": 0.00027194127243066883, + "loss": 0.2025, + "num_input_tokens_seen": 7200928, + "step": 3335 + }, + { + "epoch": 0.5448613376835236, + "grad_norm": 0.08977734297513962, + "learning_rate": 0.0002723491027732463, + "loss": 0.0574, + "num_input_tokens_seen": 7210400, + "step": 3340 + }, + { + "epoch": 0.5456769983686787, + "grad_norm": 0.0749269425868988, + "learning_rate": 0.00027275693311582384, + "loss": 0.0494, + "num_input_tokens_seen": 7221696, + "step": 3345 + }, + { + "epoch": 0.5464926590538336, + "grad_norm": 1.869079828262329, + "learning_rate": 0.0002731647634584013, + "loss": 0.3457, + "num_input_tokens_seen": 7232288, + "step": 3350 + }, + { + "epoch": 0.5473083197389886, + "grad_norm": 0.035522375255823135, + "learning_rate": 0.0002735725938009788, + "loss": 0.0976, + "num_input_tokens_seen": 7243808, + "step": 3355 + }, + { + "epoch": 0.5481239804241436, + "grad_norm": 0.18078352510929108, + "learning_rate": 0.00027398042414355627, + "loss": 0.1636, + "num_input_tokens_seen": 7254880, + "step": 3360 + }, + { + "epoch": 0.5489396411092985, + "grad_norm": 0.5510651469230652, + "learning_rate": 0.00027438825448613375, + "loss": 0.0648, + "num_input_tokens_seen": 7265664, + "step": 3365 + }, + { + "epoch": 0.5497553017944535, + "grad_norm": 0.2560504972934723, + "learning_rate": 0.0002747960848287113, + "loss": 0.2246, + "num_input_tokens_seen": 7275424, + "step": 3370 + }, + { + "epoch": 0.5505709624796085, + "grad_norm": 0.26291438937187195, + "learning_rate": 0.00027520391517128875, + "loss": 0.1724, + "num_input_tokens_seen": 7286496, + "step": 3375 + }, + { + "epoch": 0.5513866231647635, + "grad_norm": 0.4947168529033661, + "learning_rate": 0.00027561174551386623, + "loss": 0.2398, + "num_input_tokens_seen": 7297152, + "step": 3380 + }, + { + "epoch": 0.5522022838499184, + "grad_norm": 0.21689368784427643, + "learning_rate": 0.0002760195758564437, + "loss": 0.1214, + "num_input_tokens_seen": 7306816, + "step": 3385 + }, + { + "epoch": 0.5530179445350734, + "grad_norm": 0.08921483159065247, + "learning_rate": 0.00027642740619902124, + "loss": 0.0529, + "num_input_tokens_seen": 7317888, + "step": 3390 + }, + { + "epoch": 0.5538336052202284, + "grad_norm": 0.8110567927360535, + "learning_rate": 0.0002768352365415987, + "loss": 0.151, + "num_input_tokens_seen": 7329056, + "step": 3395 + }, + { + "epoch": 0.5546492659053833, + "grad_norm": 0.1971195936203003, + "learning_rate": 0.0002772430668841762, + "loss": 0.0376, + "num_input_tokens_seen": 7340192, + "step": 3400 + }, + { + "epoch": 0.5554649265905384, + "grad_norm": 0.5013919472694397, + "learning_rate": 0.00027765089722675367, + "loss": 0.0647, + "num_input_tokens_seen": 7351584, + "step": 3405 + }, + { + "epoch": 0.5562805872756933, + "grad_norm": 0.2764725387096405, + "learning_rate": 0.00027805872756933114, + "loss": 0.2417, + "num_input_tokens_seen": 7361632, + "step": 3410 + }, + { + "epoch": 0.5570962479608483, + "grad_norm": 0.15180736780166626, + "learning_rate": 0.0002784665579119087, + "loss": 0.0949, + "num_input_tokens_seen": 7371616, + "step": 3415 + }, + { + "epoch": 0.5579119086460033, + "grad_norm": 0.03513738512992859, + "learning_rate": 0.00027887438825448615, + "loss": 0.1879, + "num_input_tokens_seen": 7383488, + "step": 3420 + }, + { + "epoch": 0.5587275693311582, + "grad_norm": 0.05455316975712776, + "learning_rate": 0.00027928221859706363, + "loss": 0.1062, + "num_input_tokens_seen": 7394720, + "step": 3425 + }, + { + "epoch": 0.5595432300163132, + "grad_norm": 0.369393527507782, + "learning_rate": 0.0002796900489396411, + "loss": 0.0697, + "num_input_tokens_seen": 7405312, + "step": 3430 + }, + { + "epoch": 0.5603588907014682, + "grad_norm": 0.5221443176269531, + "learning_rate": 0.0002800978792822186, + "loss": 0.0896, + "num_input_tokens_seen": 7415040, + "step": 3435 + }, + { + "epoch": 0.5611745513866232, + "grad_norm": 0.04115762189030647, + "learning_rate": 0.0002805057096247961, + "loss": 0.0852, + "num_input_tokens_seen": 7426144, + "step": 3440 + }, + { + "epoch": 0.5619902120717781, + "grad_norm": 0.5287455320358276, + "learning_rate": 0.0002809135399673736, + "loss": 0.1836, + "num_input_tokens_seen": 7437024, + "step": 3445 + }, + { + "epoch": 0.5628058727569332, + "grad_norm": 0.0422709584236145, + "learning_rate": 0.00028132137030995106, + "loss": 0.0583, + "num_input_tokens_seen": 7447552, + "step": 3450 + }, + { + "epoch": 0.5636215334420881, + "grad_norm": 1.0483263731002808, + "learning_rate": 0.00028172920065252854, + "loss": 0.2494, + "num_input_tokens_seen": 7459040, + "step": 3455 + }, + { + "epoch": 0.564437194127243, + "grad_norm": 0.06323757022619247, + "learning_rate": 0.000282137030995106, + "loss": 0.3266, + "num_input_tokens_seen": 7469248, + "step": 3460 + }, + { + "epoch": 0.5652528548123981, + "grad_norm": 0.149562269449234, + "learning_rate": 0.00028254486133768355, + "loss": 0.1012, + "num_input_tokens_seen": 7480832, + "step": 3465 + }, + { + "epoch": 0.566068515497553, + "grad_norm": 0.4498364329338074, + "learning_rate": 0.000282952691680261, + "loss": 0.1335, + "num_input_tokens_seen": 7490016, + "step": 3470 + }, + { + "epoch": 0.566884176182708, + "grad_norm": 0.18582139909267426, + "learning_rate": 0.0002833605220228385, + "loss": 0.0739, + "num_input_tokens_seen": 7501184, + "step": 3475 + }, + { + "epoch": 0.567699836867863, + "grad_norm": 0.42618072032928467, + "learning_rate": 0.000283768352365416, + "loss": 0.1605, + "num_input_tokens_seen": 7511648, + "step": 3480 + }, + { + "epoch": 0.5685154975530179, + "grad_norm": 0.24001431465148926, + "learning_rate": 0.0002841761827079935, + "loss": 0.0847, + "num_input_tokens_seen": 7523104, + "step": 3485 + }, + { + "epoch": 0.5693311582381729, + "grad_norm": 0.15262551605701447, + "learning_rate": 0.000284584013050571, + "loss": 0.2082, + "num_input_tokens_seen": 7534496, + "step": 3490 + }, + { + "epoch": 0.5701468189233279, + "grad_norm": 1.1047290563583374, + "learning_rate": 0.0002849918433931484, + "loss": 0.2414, + "num_input_tokens_seen": 7545568, + "step": 3495 + }, + { + "epoch": 0.5709624796084829, + "grad_norm": 0.1071564257144928, + "learning_rate": 0.00028539967373572594, + "loss": 0.0974, + "num_input_tokens_seen": 7555808, + "step": 3500 + }, + { + "epoch": 0.5717781402936378, + "grad_norm": 0.6726254224777222, + "learning_rate": 0.0002858075040783034, + "loss": 0.1989, + "num_input_tokens_seen": 7567296, + "step": 3505 + }, + { + "epoch": 0.5725938009787929, + "grad_norm": 0.18670007586479187, + "learning_rate": 0.00028621533442088094, + "loss": 0.1782, + "num_input_tokens_seen": 7577824, + "step": 3510 + }, + { + "epoch": 0.5734094616639478, + "grad_norm": 0.7367821931838989, + "learning_rate": 0.0002866231647634584, + "loss": 0.1201, + "num_input_tokens_seen": 7588448, + "step": 3515 + }, + { + "epoch": 0.5742251223491027, + "grad_norm": 0.11420662701129913, + "learning_rate": 0.0002870309951060359, + "loss": 0.1526, + "num_input_tokens_seen": 7599264, + "step": 3520 + }, + { + "epoch": 0.5750407830342578, + "grad_norm": 1.1549158096313477, + "learning_rate": 0.0002874388254486134, + "loss": 0.2732, + "num_input_tokens_seen": 7611264, + "step": 3525 + }, + { + "epoch": 0.5758564437194127, + "grad_norm": 1.2724305391311646, + "learning_rate": 0.00028784665579119085, + "loss": 0.1366, + "num_input_tokens_seen": 7621312, + "step": 3530 + }, + { + "epoch": 0.5766721044045677, + "grad_norm": 0.13163702189922333, + "learning_rate": 0.0002882544861337684, + "loss": 0.2843, + "num_input_tokens_seen": 7632736, + "step": 3535 + }, + { + "epoch": 0.5774877650897227, + "grad_norm": 0.22550056874752045, + "learning_rate": 0.00028866231647634586, + "loss": 0.2393, + "num_input_tokens_seen": 7643296, + "step": 3540 + }, + { + "epoch": 0.5783034257748777, + "grad_norm": 0.5635867714881897, + "learning_rate": 0.00028907014681892333, + "loss": 0.1396, + "num_input_tokens_seen": 7653888, + "step": 3545 + }, + { + "epoch": 0.5791190864600326, + "grad_norm": 0.1705874353647232, + "learning_rate": 0.0002894779771615008, + "loss": 0.089, + "num_input_tokens_seen": 7664448, + "step": 3550 + }, + { + "epoch": 0.5799347471451876, + "grad_norm": 0.32659459114074707, + "learning_rate": 0.00028988580750407834, + "loss": 0.0754, + "num_input_tokens_seen": 7674176, + "step": 3555 + }, + { + "epoch": 0.5807504078303426, + "grad_norm": 0.4205467998981476, + "learning_rate": 0.0002902936378466558, + "loss": 0.1165, + "num_input_tokens_seen": 7685184, + "step": 3560 + }, + { + "epoch": 0.5815660685154975, + "grad_norm": 1.7951291799545288, + "learning_rate": 0.00029070146818923324, + "loss": 0.2567, + "num_input_tokens_seen": 7696064, + "step": 3565 + }, + { + "epoch": 0.5823817292006526, + "grad_norm": 0.15837207436561584, + "learning_rate": 0.00029110929853181077, + "loss": 0.2791, + "num_input_tokens_seen": 7707328, + "step": 3570 + }, + { + "epoch": 0.5831973898858075, + "grad_norm": 0.20898975431919098, + "learning_rate": 0.00029151712887438825, + "loss": 0.0736, + "num_input_tokens_seen": 7718368, + "step": 3575 + }, + { + "epoch": 0.5840130505709625, + "grad_norm": 1.0188244581222534, + "learning_rate": 0.0002919249592169658, + "loss": 0.1135, + "num_input_tokens_seen": 7729888, + "step": 3580 + }, + { + "epoch": 0.5848287112561175, + "grad_norm": 1.0055124759674072, + "learning_rate": 0.00029233278955954325, + "loss": 0.2485, + "num_input_tokens_seen": 7739424, + "step": 3585 + }, + { + "epoch": 0.5856443719412724, + "grad_norm": 1.2235937118530273, + "learning_rate": 0.0002927406199021207, + "loss": 0.1966, + "num_input_tokens_seen": 7748832, + "step": 3590 + }, + { + "epoch": 0.5864600326264274, + "grad_norm": 0.947248637676239, + "learning_rate": 0.0002931484502446982, + "loss": 0.0973, + "num_input_tokens_seen": 7760128, + "step": 3595 + }, + { + "epoch": 0.5872756933115824, + "grad_norm": 0.5701817870140076, + "learning_rate": 0.0002935562805872757, + "loss": 0.1069, + "num_input_tokens_seen": 7770688, + "step": 3600 + }, + { + "epoch": 0.5880913539967374, + "grad_norm": 0.41673779487609863, + "learning_rate": 0.0002939641109298532, + "loss": 0.1512, + "num_input_tokens_seen": 7782304, + "step": 3605 + }, + { + "epoch": 0.5889070146818923, + "grad_norm": 0.6106691360473633, + "learning_rate": 0.0002943719412724307, + "loss": 0.1987, + "num_input_tokens_seen": 7793184, + "step": 3610 + }, + { + "epoch": 0.5897226753670473, + "grad_norm": 0.19900889694690704, + "learning_rate": 0.00029477977161500817, + "loss": 0.1869, + "num_input_tokens_seen": 7803648, + "step": 3615 + }, + { + "epoch": 0.5905383360522023, + "grad_norm": 0.19333691895008087, + "learning_rate": 0.00029518760195758564, + "loss": 0.1128, + "num_input_tokens_seen": 7814272, + "step": 3620 + }, + { + "epoch": 0.5913539967373572, + "grad_norm": 0.2754856050014496, + "learning_rate": 0.0002955954323001631, + "loss": 0.1743, + "num_input_tokens_seen": 7825120, + "step": 3625 + }, + { + "epoch": 0.5921696574225123, + "grad_norm": 0.9911066889762878, + "learning_rate": 0.00029600326264274065, + "loss": 0.135, + "num_input_tokens_seen": 7837440, + "step": 3630 + }, + { + "epoch": 0.5929853181076672, + "grad_norm": 0.45625540614128113, + "learning_rate": 0.00029641109298531807, + "loss": 0.0608, + "num_input_tokens_seen": 7848064, + "step": 3635 + }, + { + "epoch": 0.5938009787928222, + "grad_norm": 0.22430795431137085, + "learning_rate": 0.0002968189233278956, + "loss": 0.1347, + "num_input_tokens_seen": 7858816, + "step": 3640 + }, + { + "epoch": 0.5946166394779772, + "grad_norm": 1.5107712745666504, + "learning_rate": 0.0002972267536704731, + "loss": 0.2535, + "num_input_tokens_seen": 7869504, + "step": 3645 + }, + { + "epoch": 0.5954323001631321, + "grad_norm": 0.162008136510849, + "learning_rate": 0.0002976345840130506, + "loss": 0.0354, + "num_input_tokens_seen": 7879712, + "step": 3650 + }, + { + "epoch": 0.5962479608482871, + "grad_norm": 0.5064948201179504, + "learning_rate": 0.0002980424143556281, + "loss": 0.0763, + "num_input_tokens_seen": 7890368, + "step": 3655 + }, + { + "epoch": 0.5970636215334421, + "grad_norm": 0.48915374279022217, + "learning_rate": 0.0002984502446982055, + "loss": 0.0591, + "num_input_tokens_seen": 7900704, + "step": 3660 + }, + { + "epoch": 0.5978792822185971, + "grad_norm": 1.123414158821106, + "learning_rate": 0.00029885807504078304, + "loss": 0.2029, + "num_input_tokens_seen": 7911776, + "step": 3665 + }, + { + "epoch": 0.598694942903752, + "grad_norm": 0.43261805176734924, + "learning_rate": 0.0002992659053833605, + "loss": 0.2295, + "num_input_tokens_seen": 7921984, + "step": 3670 + }, + { + "epoch": 0.5995106035889071, + "grad_norm": 0.5339052677154541, + "learning_rate": 0.00029967373572593805, + "loss": 0.2408, + "num_input_tokens_seen": 7934080, + "step": 3675 + }, + { + "epoch": 0.600326264274062, + "grad_norm": 0.15146224200725555, + "learning_rate": 0.0003000815660685155, + "loss": 0.1547, + "num_input_tokens_seen": 7944736, + "step": 3680 + }, + { + "epoch": 0.6011419249592169, + "grad_norm": 0.11091198772192001, + "learning_rate": 0.000300489396411093, + "loss": 0.1293, + "num_input_tokens_seen": 7956256, + "step": 3685 + }, + { + "epoch": 0.601957585644372, + "grad_norm": 0.07787430286407471, + "learning_rate": 0.0003008972267536705, + "loss": 0.1261, + "num_input_tokens_seen": 7967808, + "step": 3690 + }, + { + "epoch": 0.6027732463295269, + "grad_norm": 0.16613641381263733, + "learning_rate": 0.00030130505709624795, + "loss": 0.0563, + "num_input_tokens_seen": 7979648, + "step": 3695 + }, + { + "epoch": 0.6035889070146819, + "grad_norm": 0.32812386751174927, + "learning_rate": 0.0003017128874388255, + "loss": 0.0928, + "num_input_tokens_seen": 7990944, + "step": 3700 + }, + { + "epoch": 0.6044045676998369, + "grad_norm": 0.3859018385410309, + "learning_rate": 0.0003021207177814029, + "loss": 0.1239, + "num_input_tokens_seen": 8001568, + "step": 3705 + }, + { + "epoch": 0.6052202283849919, + "grad_norm": 0.14347811043262482, + "learning_rate": 0.00030252854812398044, + "loss": 0.2942, + "num_input_tokens_seen": 8013376, + "step": 3710 + }, + { + "epoch": 0.6060358890701468, + "grad_norm": 0.922331690788269, + "learning_rate": 0.0003029363784665579, + "loss": 0.2152, + "num_input_tokens_seen": 8024512, + "step": 3715 + }, + { + "epoch": 0.6068515497553018, + "grad_norm": 0.7256356477737427, + "learning_rate": 0.0003033442088091354, + "loss": 0.1756, + "num_input_tokens_seen": 8035520, + "step": 3720 + }, + { + "epoch": 0.6076672104404568, + "grad_norm": 0.32077616453170776, + "learning_rate": 0.0003037520391517129, + "loss": 0.1294, + "num_input_tokens_seen": 8046912, + "step": 3725 + }, + { + "epoch": 0.6084828711256117, + "grad_norm": 0.3042055666446686, + "learning_rate": 0.00030415986949429034, + "loss": 0.1097, + "num_input_tokens_seen": 8057472, + "step": 3730 + }, + { + "epoch": 0.6092985318107668, + "grad_norm": 1.0995193719863892, + "learning_rate": 0.00030456769983686787, + "loss": 0.3589, + "num_input_tokens_seen": 8068576, + "step": 3735 + }, + { + "epoch": 0.6101141924959217, + "grad_norm": 0.34954649209976196, + "learning_rate": 0.00030497553017944535, + "loss": 0.1123, + "num_input_tokens_seen": 8080320, + "step": 3740 + }, + { + "epoch": 0.6109298531810766, + "grad_norm": 0.32164424657821655, + "learning_rate": 0.0003053833605220229, + "loss": 0.2047, + "num_input_tokens_seen": 8090144, + "step": 3745 + }, + { + "epoch": 0.6117455138662317, + "grad_norm": 0.20417018234729767, + "learning_rate": 0.00030579119086460036, + "loss": 0.0666, + "num_input_tokens_seen": 8101472, + "step": 3750 + }, + { + "epoch": 0.6125611745513866, + "grad_norm": 0.8525700569152832, + "learning_rate": 0.0003061990212071778, + "loss": 0.1242, + "num_input_tokens_seen": 8112736, + "step": 3755 + }, + { + "epoch": 0.6133768352365416, + "grad_norm": 0.8774811625480652, + "learning_rate": 0.0003066068515497553, + "loss": 0.1184, + "num_input_tokens_seen": 8123296, + "step": 3760 + }, + { + "epoch": 0.6141924959216966, + "grad_norm": 0.4467347264289856, + "learning_rate": 0.0003070146818923328, + "loss": 0.1969, + "num_input_tokens_seen": 8133248, + "step": 3765 + }, + { + "epoch": 0.6150081566068516, + "grad_norm": 0.09053094685077667, + "learning_rate": 0.0003074225122349103, + "loss": 0.1198, + "num_input_tokens_seen": 8144448, + "step": 3770 + }, + { + "epoch": 0.6158238172920065, + "grad_norm": 0.6725847721099854, + "learning_rate": 0.00030783034257748774, + "loss": 0.1982, + "num_input_tokens_seen": 8154656, + "step": 3775 + }, + { + "epoch": 0.6166394779771615, + "grad_norm": 0.3370138108730316, + "learning_rate": 0.00030823817292006527, + "loss": 0.0528, + "num_input_tokens_seen": 8165888, + "step": 3780 + }, + { + "epoch": 0.6174551386623165, + "grad_norm": 1.0672154426574707, + "learning_rate": 0.00030864600326264275, + "loss": 0.0759, + "num_input_tokens_seen": 8175744, + "step": 3785 + }, + { + "epoch": 0.6182707993474714, + "grad_norm": 0.0875239148736, + "learning_rate": 0.0003090538336052202, + "loss": 0.1605, + "num_input_tokens_seen": 8186880, + "step": 3790 + }, + { + "epoch": 0.6190864600326265, + "grad_norm": 0.2635380029678345, + "learning_rate": 0.00030946166394779775, + "loss": 0.1947, + "num_input_tokens_seen": 8196192, + "step": 3795 + }, + { + "epoch": 0.6199021207177814, + "grad_norm": 0.2070256769657135, + "learning_rate": 0.0003098694942903752, + "loss": 0.1919, + "num_input_tokens_seen": 8205632, + "step": 3800 + }, + { + "epoch": 0.6207177814029364, + "grad_norm": 0.3485415577888489, + "learning_rate": 0.0003102773246329527, + "loss": 0.1056, + "num_input_tokens_seen": 8216032, + "step": 3805 + }, + { + "epoch": 0.6215334420880914, + "grad_norm": 0.5896238684654236, + "learning_rate": 0.0003106851549755302, + "loss": 0.1347, + "num_input_tokens_seen": 8228192, + "step": 3810 + }, + { + "epoch": 0.6223491027732463, + "grad_norm": 0.1791100800037384, + "learning_rate": 0.00031109298531810766, + "loss": 0.143, + "num_input_tokens_seen": 8239136, + "step": 3815 + }, + { + "epoch": 0.6231647634584013, + "grad_norm": 0.2757539451122284, + "learning_rate": 0.0003115008156606852, + "loss": 0.0687, + "num_input_tokens_seen": 8249184, + "step": 3820 + }, + { + "epoch": 0.6239804241435563, + "grad_norm": 0.47659072279930115, + "learning_rate": 0.0003119086460032626, + "loss": 0.1604, + "num_input_tokens_seen": 8260288, + "step": 3825 + }, + { + "epoch": 0.6247960848287113, + "grad_norm": 0.1077791154384613, + "learning_rate": 0.00031231647634584014, + "loss": 0.2936, + "num_input_tokens_seen": 8271232, + "step": 3830 + }, + { + "epoch": 0.6256117455138662, + "grad_norm": 0.2414446771144867, + "learning_rate": 0.0003127243066884176, + "loss": 0.1355, + "num_input_tokens_seen": 8281248, + "step": 3835 + }, + { + "epoch": 0.6264274061990212, + "grad_norm": 0.13507677614688873, + "learning_rate": 0.00031313213703099515, + "loss": 0.1825, + "num_input_tokens_seen": 8292864, + "step": 3840 + }, + { + "epoch": 0.6272430668841762, + "grad_norm": 0.26294106245040894, + "learning_rate": 0.0003135399673735726, + "loss": 0.1879, + "num_input_tokens_seen": 8303488, + "step": 3845 + }, + { + "epoch": 0.6280587275693311, + "grad_norm": 0.3151414096355438, + "learning_rate": 0.00031394779771615005, + "loss": 0.217, + "num_input_tokens_seen": 8315008, + "step": 3850 + }, + { + "epoch": 0.6288743882544862, + "grad_norm": 0.38112303614616394, + "learning_rate": 0.0003143556280587276, + "loss": 0.1261, + "num_input_tokens_seen": 8325696, + "step": 3855 + }, + { + "epoch": 0.6296900489396411, + "grad_norm": 0.07693363726139069, + "learning_rate": 0.00031476345840130506, + "loss": 0.1513, + "num_input_tokens_seen": 8336960, + "step": 3860 + }, + { + "epoch": 0.6305057096247961, + "grad_norm": 0.24605275690555573, + "learning_rate": 0.0003151712887438826, + "loss": 0.1153, + "num_input_tokens_seen": 8348544, + "step": 3865 + }, + { + "epoch": 0.6313213703099511, + "grad_norm": 0.48214077949523926, + "learning_rate": 0.00031557911908646, + "loss": 0.1825, + "num_input_tokens_seen": 8359072, + "step": 3870 + }, + { + "epoch": 0.632137030995106, + "grad_norm": 0.4101504385471344, + "learning_rate": 0.00031598694942903754, + "loss": 0.1694, + "num_input_tokens_seen": 8369184, + "step": 3875 + }, + { + "epoch": 0.632952691680261, + "grad_norm": 0.06815630197525024, + "learning_rate": 0.000316394779771615, + "loss": 0.0905, + "num_input_tokens_seen": 8380352, + "step": 3880 + }, + { + "epoch": 0.633768352365416, + "grad_norm": 0.11706419289112091, + "learning_rate": 0.0003168026101141925, + "loss": 0.0495, + "num_input_tokens_seen": 8390880, + "step": 3885 + }, + { + "epoch": 0.634584013050571, + "grad_norm": 0.41242027282714844, + "learning_rate": 0.00031721044045677, + "loss": 0.1514, + "num_input_tokens_seen": 8402176, + "step": 3890 + }, + { + "epoch": 0.6353996737357259, + "grad_norm": 0.09979145973920822, + "learning_rate": 0.00031761827079934744, + "loss": 0.0868, + "num_input_tokens_seen": 8413280, + "step": 3895 + }, + { + "epoch": 0.636215334420881, + "grad_norm": 0.043393541127443314, + "learning_rate": 0.000318026101141925, + "loss": 0.046, + "num_input_tokens_seen": 8424960, + "step": 3900 + }, + { + "epoch": 0.6370309951060359, + "grad_norm": 0.12201043963432312, + "learning_rate": 0.00031843393148450245, + "loss": 0.0594, + "num_input_tokens_seen": 8436160, + "step": 3905 + }, + { + "epoch": 0.6378466557911908, + "grad_norm": 1.5243480205535889, + "learning_rate": 0.00031884176182708, + "loss": 0.1185, + "num_input_tokens_seen": 8445856, + "step": 3910 + }, + { + "epoch": 0.6386623164763459, + "grad_norm": 1.0513534545898438, + "learning_rate": 0.00031924959216965746, + "loss": 0.5507, + "num_input_tokens_seen": 8455264, + "step": 3915 + }, + { + "epoch": 0.6394779771615008, + "grad_norm": 0.4103231132030487, + "learning_rate": 0.0003196574225122349, + "loss": 0.1775, + "num_input_tokens_seen": 8465984, + "step": 3920 + }, + { + "epoch": 0.6402936378466558, + "grad_norm": 1.263214349746704, + "learning_rate": 0.0003200652528548124, + "loss": 0.2106, + "num_input_tokens_seen": 8477344, + "step": 3925 + }, + { + "epoch": 0.6411092985318108, + "grad_norm": 0.15126630663871765, + "learning_rate": 0.0003204730831973899, + "loss": 0.1166, + "num_input_tokens_seen": 8488544, + "step": 3930 + }, + { + "epoch": 0.6419249592169658, + "grad_norm": 0.36729708313941956, + "learning_rate": 0.0003208809135399674, + "loss": 0.1048, + "num_input_tokens_seen": 8499296, + "step": 3935 + }, + { + "epoch": 0.6427406199021207, + "grad_norm": 0.9425373673439026, + "learning_rate": 0.00032128874388254484, + "loss": 0.1382, + "num_input_tokens_seen": 8510912, + "step": 3940 + }, + { + "epoch": 0.6435562805872757, + "grad_norm": 0.2685391306877136, + "learning_rate": 0.0003216965742251223, + "loss": 0.1376, + "num_input_tokens_seen": 8521920, + "step": 3945 + }, + { + "epoch": 0.6443719412724307, + "grad_norm": 0.489003986120224, + "learning_rate": 0.00032210440456769985, + "loss": 0.1312, + "num_input_tokens_seen": 8532448, + "step": 3950 + }, + { + "epoch": 0.6451876019575856, + "grad_norm": 0.14087380468845367, + "learning_rate": 0.0003225122349102773, + "loss": 0.1139, + "num_input_tokens_seen": 8543936, + "step": 3955 + }, + { + "epoch": 0.6460032626427407, + "grad_norm": 0.11659581959247589, + "learning_rate": 0.00032292006525285486, + "loss": 0.0836, + "num_input_tokens_seen": 8556160, + "step": 3960 + }, + { + "epoch": 0.6468189233278956, + "grad_norm": 0.15926118195056915, + "learning_rate": 0.0003233278955954323, + "loss": 0.0522, + "num_input_tokens_seen": 8566816, + "step": 3965 + }, + { + "epoch": 0.6476345840130505, + "grad_norm": 0.35616812109947205, + "learning_rate": 0.0003237357259380098, + "loss": 0.2619, + "num_input_tokens_seen": 8576992, + "step": 3970 + }, + { + "epoch": 0.6484502446982056, + "grad_norm": 0.46962714195251465, + "learning_rate": 0.0003241435562805873, + "loss": 0.2148, + "num_input_tokens_seen": 8588224, + "step": 3975 + }, + { + "epoch": 0.6492659053833605, + "grad_norm": 0.061958249658346176, + "learning_rate": 0.00032455138662316476, + "loss": 0.0982, + "num_input_tokens_seen": 8598976, + "step": 3980 + }, + { + "epoch": 0.6500815660685155, + "grad_norm": 0.13584494590759277, + "learning_rate": 0.0003249592169657423, + "loss": 0.0892, + "num_input_tokens_seen": 8609216, + "step": 3985 + }, + { + "epoch": 0.6508972267536705, + "grad_norm": 0.8373795747756958, + "learning_rate": 0.0003253670473083197, + "loss": 0.1666, + "num_input_tokens_seen": 8620032, + "step": 3990 + }, + { + "epoch": 0.6517128874388255, + "grad_norm": 0.06975753605365753, + "learning_rate": 0.00032577487765089724, + "loss": 0.2013, + "num_input_tokens_seen": 8631456, + "step": 3995 + }, + { + "epoch": 0.6525285481239804, + "grad_norm": 0.164698988199234, + "learning_rate": 0.0003261827079934747, + "loss": 0.0616, + "num_input_tokens_seen": 8641696, + "step": 4000 + }, + { + "epoch": 0.6533442088091354, + "grad_norm": 0.8426600098609924, + "learning_rate": 0.00032659053833605225, + "loss": 0.1575, + "num_input_tokens_seen": 8652576, + "step": 4005 + }, + { + "epoch": 0.6541598694942904, + "grad_norm": 0.6252540349960327, + "learning_rate": 0.0003269983686786297, + "loss": 0.2312, + "num_input_tokens_seen": 8662464, + "step": 4010 + }, + { + "epoch": 0.6549755301794453, + "grad_norm": 0.43457654118537903, + "learning_rate": 0.00032740619902120715, + "loss": 0.0936, + "num_input_tokens_seen": 8673312, + "step": 4015 + }, + { + "epoch": 0.6557911908646004, + "grad_norm": 0.4076187312602997, + "learning_rate": 0.0003278140293637847, + "loss": 0.1401, + "num_input_tokens_seen": 8683904, + "step": 4020 + }, + { + "epoch": 0.6566068515497553, + "grad_norm": 0.28343382477760315, + "learning_rate": 0.00032822185970636216, + "loss": 0.0753, + "num_input_tokens_seen": 8694944, + "step": 4025 + }, + { + "epoch": 0.6574225122349103, + "grad_norm": 0.19631558656692505, + "learning_rate": 0.0003286296900489397, + "loss": 0.0465, + "num_input_tokens_seen": 8706400, + "step": 4030 + }, + { + "epoch": 0.6582381729200653, + "grad_norm": 0.06990889459848404, + "learning_rate": 0.0003290375203915171, + "loss": 0.1025, + "num_input_tokens_seen": 8717504, + "step": 4035 + }, + { + "epoch": 0.6590538336052202, + "grad_norm": 0.39275580644607544, + "learning_rate": 0.00032944535073409464, + "loss": 0.0554, + "num_input_tokens_seen": 8728736, + "step": 4040 + }, + { + "epoch": 0.6598694942903752, + "grad_norm": 0.7639222741127014, + "learning_rate": 0.0003298531810766721, + "loss": 0.3712, + "num_input_tokens_seen": 8740032, + "step": 4045 + }, + { + "epoch": 0.6606851549755302, + "grad_norm": 0.0512065626680851, + "learning_rate": 0.0003302610114192496, + "loss": 0.1152, + "num_input_tokens_seen": 8749280, + "step": 4050 + }, + { + "epoch": 0.6615008156606852, + "grad_norm": 0.07373015582561493, + "learning_rate": 0.0003306688417618271, + "loss": 0.0288, + "num_input_tokens_seen": 8760320, + "step": 4055 + }, + { + "epoch": 0.6623164763458401, + "grad_norm": 0.5321258902549744, + "learning_rate": 0.00033107667210440455, + "loss": 0.1337, + "num_input_tokens_seen": 8771104, + "step": 4060 + }, + { + "epoch": 0.6631321370309952, + "grad_norm": 0.03265725448727608, + "learning_rate": 0.0003314845024469821, + "loss": 0.0709, + "num_input_tokens_seen": 8781664, + "step": 4065 + }, + { + "epoch": 0.6639477977161501, + "grad_norm": 0.15506812930107117, + "learning_rate": 0.00033189233278955955, + "loss": 0.0715, + "num_input_tokens_seen": 8792128, + "step": 4070 + }, + { + "epoch": 0.664763458401305, + "grad_norm": 0.3224940896034241, + "learning_rate": 0.00033230016313213703, + "loss": 0.2244, + "num_input_tokens_seen": 8803616, + "step": 4075 + }, + { + "epoch": 0.6655791190864601, + "grad_norm": 0.6347690224647522, + "learning_rate": 0.0003327079934747145, + "loss": 0.0932, + "num_input_tokens_seen": 8813696, + "step": 4080 + }, + { + "epoch": 0.666394779771615, + "grad_norm": 0.6844305396080017, + "learning_rate": 0.000333115823817292, + "loss": 0.1977, + "num_input_tokens_seen": 8824672, + "step": 4085 + }, + { + "epoch": 0.66721044045677, + "grad_norm": 0.053750500082969666, + "learning_rate": 0.0003335236541598695, + "loss": 0.0225, + "num_input_tokens_seen": 8836256, + "step": 4090 + }, + { + "epoch": 0.668026101141925, + "grad_norm": 0.0979921966791153, + "learning_rate": 0.000333931484502447, + "loss": 0.1962, + "num_input_tokens_seen": 8847168, + "step": 4095 + }, + { + "epoch": 0.6688417618270799, + "grad_norm": 0.7607890367507935, + "learning_rate": 0.0003343393148450245, + "loss": 0.1747, + "num_input_tokens_seen": 8855008, + "step": 4100 + }, + { + "epoch": 0.6696574225122349, + "grad_norm": 0.2811325490474701, + "learning_rate": 0.00033474714518760194, + "loss": 0.1966, + "num_input_tokens_seen": 8865728, + "step": 4105 + }, + { + "epoch": 0.6704730831973899, + "grad_norm": 0.14467936754226685, + "learning_rate": 0.0003351549755301794, + "loss": 0.1633, + "num_input_tokens_seen": 8877440, + "step": 4110 + }, + { + "epoch": 0.6712887438825449, + "grad_norm": 0.5608596205711365, + "learning_rate": 0.00033556280587275695, + "loss": 0.2213, + "num_input_tokens_seen": 8889248, + "step": 4115 + }, + { + "epoch": 0.6721044045676998, + "grad_norm": 0.36362361907958984, + "learning_rate": 0.0003359706362153344, + "loss": 0.0987, + "num_input_tokens_seen": 8900640, + "step": 4120 + }, + { + "epoch": 0.6729200652528549, + "grad_norm": 0.05654023960232735, + "learning_rate": 0.00033637846655791196, + "loss": 0.1707, + "num_input_tokens_seen": 8911232, + "step": 4125 + }, + { + "epoch": 0.6737357259380098, + "grad_norm": 0.09752820432186127, + "learning_rate": 0.0003367862969004894, + "loss": 0.1078, + "num_input_tokens_seen": 8921952, + "step": 4130 + }, + { + "epoch": 0.6745513866231647, + "grad_norm": 0.08624225109815598, + "learning_rate": 0.0003371941272430669, + "loss": 0.1293, + "num_input_tokens_seen": 8933856, + "step": 4135 + }, + { + "epoch": 0.6753670473083198, + "grad_norm": 0.14989924430847168, + "learning_rate": 0.0003376019575856444, + "loss": 0.061, + "num_input_tokens_seen": 8944800, + "step": 4140 + }, + { + "epoch": 0.6761827079934747, + "grad_norm": 0.11734739691019058, + "learning_rate": 0.00033800978792822186, + "loss": 0.0411, + "num_input_tokens_seen": 8956352, + "step": 4145 + }, + { + "epoch": 0.6769983686786297, + "grad_norm": 0.35162967443466187, + "learning_rate": 0.00033841761827079934, + "loss": 0.0789, + "num_input_tokens_seen": 8967520, + "step": 4150 + }, + { + "epoch": 0.6778140293637847, + "grad_norm": 0.03362584114074707, + "learning_rate": 0.0003388254486133768, + "loss": 0.1638, + "num_input_tokens_seen": 8977888, + "step": 4155 + }, + { + "epoch": 0.6786296900489397, + "grad_norm": 0.1328830122947693, + "learning_rate": 0.00033923327895595435, + "loss": 0.0441, + "num_input_tokens_seen": 8989440, + "step": 4160 + }, + { + "epoch": 0.6794453507340946, + "grad_norm": 0.18508820235729218, + "learning_rate": 0.0003396411092985318, + "loss": 0.0742, + "num_input_tokens_seen": 9000896, + "step": 4165 + }, + { + "epoch": 0.6802610114192496, + "grad_norm": 0.026474563404917717, + "learning_rate": 0.0003400489396411093, + "loss": 0.1311, + "num_input_tokens_seen": 9011808, + "step": 4170 + }, + { + "epoch": 0.6810766721044046, + "grad_norm": 0.793641984462738, + "learning_rate": 0.0003404567699836868, + "loss": 0.1348, + "num_input_tokens_seen": 9024096, + "step": 4175 + }, + { + "epoch": 0.6818923327895595, + "grad_norm": 0.07803583890199661, + "learning_rate": 0.00034086460032626425, + "loss": 0.2088, + "num_input_tokens_seen": 9035648, + "step": 4180 + }, + { + "epoch": 0.6827079934747146, + "grad_norm": 0.08670012652873993, + "learning_rate": 0.0003412724306688418, + "loss": 0.0742, + "num_input_tokens_seen": 9045920, + "step": 4185 + }, + { + "epoch": 0.6835236541598695, + "grad_norm": 0.4543367028236389, + "learning_rate": 0.00034168026101141926, + "loss": 0.0632, + "num_input_tokens_seen": 9057088, + "step": 4190 + }, + { + "epoch": 0.6843393148450244, + "grad_norm": 0.41005179286003113, + "learning_rate": 0.0003420880913539968, + "loss": 0.1719, + "num_input_tokens_seen": 9066208, + "step": 4195 + }, + { + "epoch": 0.6851549755301795, + "grad_norm": 0.7371568083763123, + "learning_rate": 0.0003424959216965742, + "loss": 0.1812, + "num_input_tokens_seen": 9077120, + "step": 4200 + }, + { + "epoch": 0.6859706362153344, + "grad_norm": 0.340640127658844, + "learning_rate": 0.0003429037520391517, + "loss": 0.2524, + "num_input_tokens_seen": 9086592, + "step": 4205 + }, + { + "epoch": 0.6867862969004894, + "grad_norm": 0.18895219266414642, + "learning_rate": 0.0003433115823817292, + "loss": 0.1456, + "num_input_tokens_seen": 9096864, + "step": 4210 + }, + { + "epoch": 0.6876019575856444, + "grad_norm": 0.18842971324920654, + "learning_rate": 0.0003437194127243067, + "loss": 0.1622, + "num_input_tokens_seen": 9107424, + "step": 4215 + }, + { + "epoch": 0.6884176182707994, + "grad_norm": 0.0588395819067955, + "learning_rate": 0.00034412724306688417, + "loss": 0.0809, + "num_input_tokens_seen": 9117696, + "step": 4220 + }, + { + "epoch": 0.6892332789559543, + "grad_norm": 0.08728792518377304, + "learning_rate": 0.00034453507340946165, + "loss": 0.0914, + "num_input_tokens_seen": 9128096, + "step": 4225 + }, + { + "epoch": 0.6900489396411092, + "grad_norm": 1.0194220542907715, + "learning_rate": 0.0003449429037520392, + "loss": 0.1346, + "num_input_tokens_seen": 9139104, + "step": 4230 + }, + { + "epoch": 0.6908646003262643, + "grad_norm": 0.4258745014667511, + "learning_rate": 0.00034535073409461666, + "loss": 0.1032, + "num_input_tokens_seen": 9149408, + "step": 4235 + }, + { + "epoch": 0.6916802610114192, + "grad_norm": 0.0936698392033577, + "learning_rate": 0.00034575856443719413, + "loss": 0.064, + "num_input_tokens_seen": 9160672, + "step": 4240 + }, + { + "epoch": 0.6924959216965743, + "grad_norm": 0.8383188843727112, + "learning_rate": 0.0003461663947797716, + "loss": 0.2815, + "num_input_tokens_seen": 9171104, + "step": 4245 + }, + { + "epoch": 0.6933115823817292, + "grad_norm": 0.05329615995287895, + "learning_rate": 0.0003465742251223491, + "loss": 0.0785, + "num_input_tokens_seen": 9181440, + "step": 4250 + }, + { + "epoch": 0.6941272430668842, + "grad_norm": 0.044270992279052734, + "learning_rate": 0.0003469820554649266, + "loss": 0.0703, + "num_input_tokens_seen": 9191488, + "step": 4255 + }, + { + "epoch": 0.6949429037520392, + "grad_norm": 1.0437971353530884, + "learning_rate": 0.0003473898858075041, + "loss": 0.2139, + "num_input_tokens_seen": 9203392, + "step": 4260 + }, + { + "epoch": 0.6957585644371941, + "grad_norm": 0.3245795667171478, + "learning_rate": 0.0003477977161500816, + "loss": 0.2403, + "num_input_tokens_seen": 9214368, + "step": 4265 + }, + { + "epoch": 0.6965742251223491, + "grad_norm": 0.08259432762861252, + "learning_rate": 0.00034820554649265905, + "loss": 0.062, + "num_input_tokens_seen": 9225248, + "step": 4270 + }, + { + "epoch": 0.697389885807504, + "grad_norm": 0.269199401140213, + "learning_rate": 0.0003486133768352365, + "loss": 0.1181, + "num_input_tokens_seen": 9236640, + "step": 4275 + }, + { + "epoch": 0.6982055464926591, + "grad_norm": 0.38677653670310974, + "learning_rate": 0.00034902120717781405, + "loss": 0.0849, + "num_input_tokens_seen": 9248000, + "step": 4280 + }, + { + "epoch": 0.699021207177814, + "grad_norm": 0.1369486302137375, + "learning_rate": 0.00034942903752039153, + "loss": 0.267, + "num_input_tokens_seen": 9258752, + "step": 4285 + }, + { + "epoch": 0.6998368678629691, + "grad_norm": 0.44952574372291565, + "learning_rate": 0.000349836867862969, + "loss": 0.1027, + "num_input_tokens_seen": 9269376, + "step": 4290 + }, + { + "epoch": 0.700652528548124, + "grad_norm": 0.12477151304483414, + "learning_rate": 0.0003502446982055465, + "loss": 0.2004, + "num_input_tokens_seen": 9281312, + "step": 4295 + }, + { + "epoch": 0.7014681892332789, + "grad_norm": 0.11125738173723221, + "learning_rate": 0.00035065252854812396, + "loss": 0.0653, + "num_input_tokens_seen": 9291936, + "step": 4300 + }, + { + "epoch": 0.702283849918434, + "grad_norm": 0.1944471299648285, + "learning_rate": 0.0003510603588907015, + "loss": 0.1115, + "num_input_tokens_seen": 9302528, + "step": 4305 + }, + { + "epoch": 0.7030995106035889, + "grad_norm": 0.08961895108222961, + "learning_rate": 0.00035146818923327897, + "loss": 0.0653, + "num_input_tokens_seen": 9313536, + "step": 4310 + }, + { + "epoch": 0.7039151712887439, + "grad_norm": 0.043053120374679565, + "learning_rate": 0.00035187601957585644, + "loss": 0.1076, + "num_input_tokens_seen": 9324384, + "step": 4315 + }, + { + "epoch": 0.7047308319738989, + "grad_norm": 0.5048277378082275, + "learning_rate": 0.0003522838499184339, + "loss": 0.2702, + "num_input_tokens_seen": 9335104, + "step": 4320 + }, + { + "epoch": 0.7055464926590538, + "grad_norm": 0.516410768032074, + "learning_rate": 0.00035269168026101145, + "loss": 0.2341, + "num_input_tokens_seen": 9345824, + "step": 4325 + }, + { + "epoch": 0.7063621533442088, + "grad_norm": 0.15441341698169708, + "learning_rate": 0.0003530995106035889, + "loss": 0.0657, + "num_input_tokens_seen": 9357088, + "step": 4330 + }, + { + "epoch": 0.7071778140293637, + "grad_norm": 0.7144105434417725, + "learning_rate": 0.0003535073409461664, + "loss": 0.2489, + "num_input_tokens_seen": 9366784, + "step": 4335 + }, + { + "epoch": 0.7079934747145188, + "grad_norm": 0.1695648729801178, + "learning_rate": 0.0003539151712887439, + "loss": 0.1474, + "num_input_tokens_seen": 9377024, + "step": 4340 + }, + { + "epoch": 0.7088091353996737, + "grad_norm": 0.2768016457557678, + "learning_rate": 0.00035432300163132136, + "loss": 0.1546, + "num_input_tokens_seen": 9389152, + "step": 4345 + }, + { + "epoch": 0.7096247960848288, + "grad_norm": 0.1949160099029541, + "learning_rate": 0.0003547308319738989, + "loss": 0.1935, + "num_input_tokens_seen": 9399616, + "step": 4350 + }, + { + "epoch": 0.7104404567699837, + "grad_norm": 0.09738589823246002, + "learning_rate": 0.00035513866231647636, + "loss": 0.0919, + "num_input_tokens_seen": 9410176, + "step": 4355 + }, + { + "epoch": 0.7112561174551386, + "grad_norm": 0.14508315920829773, + "learning_rate": 0.0003555464926590539, + "loss": 0.1165, + "num_input_tokens_seen": 9421760, + "step": 4360 + }, + { + "epoch": 0.7120717781402937, + "grad_norm": 0.07993219792842865, + "learning_rate": 0.0003559543230016313, + "loss": 0.0539, + "num_input_tokens_seen": 9432960, + "step": 4365 + }, + { + "epoch": 0.7128874388254486, + "grad_norm": 0.3040957450866699, + "learning_rate": 0.0003563621533442088, + "loss": 0.3573, + "num_input_tokens_seen": 9443936, + "step": 4370 + }, + { + "epoch": 0.7137030995106036, + "grad_norm": 0.043079450726509094, + "learning_rate": 0.0003567699836867863, + "loss": 0.0536, + "num_input_tokens_seen": 9453120, + "step": 4375 + }, + { + "epoch": 0.7145187601957586, + "grad_norm": 0.12413550168275833, + "learning_rate": 0.0003571778140293638, + "loss": 0.1671, + "num_input_tokens_seen": 9464480, + "step": 4380 + }, + { + "epoch": 0.7153344208809136, + "grad_norm": 0.45056381821632385, + "learning_rate": 0.0003575856443719413, + "loss": 0.1845, + "num_input_tokens_seen": 9475360, + "step": 4385 + }, + { + "epoch": 0.7161500815660685, + "grad_norm": 0.3023238182067871, + "learning_rate": 0.00035799347471451875, + "loss": 0.2689, + "num_input_tokens_seen": 9484992, + "step": 4390 + }, + { + "epoch": 0.7169657422512234, + "grad_norm": 0.5121544003486633, + "learning_rate": 0.0003584013050570963, + "loss": 0.0575, + "num_input_tokens_seen": 9495296, + "step": 4395 + }, + { + "epoch": 0.7177814029363785, + "grad_norm": 0.2982773184776306, + "learning_rate": 0.00035880913539967376, + "loss": 0.0646, + "num_input_tokens_seen": 9506592, + "step": 4400 + }, + { + "epoch": 0.7185970636215334, + "grad_norm": 0.4488369822502136, + "learning_rate": 0.00035921696574225124, + "loss": 0.1364, + "num_input_tokens_seen": 9517056, + "step": 4405 + }, + { + "epoch": 0.7194127243066885, + "grad_norm": 0.037081990391016006, + "learning_rate": 0.0003596247960848287, + "loss": 0.0476, + "num_input_tokens_seen": 9527808, + "step": 4410 + }, + { + "epoch": 0.7202283849918434, + "grad_norm": 0.4101148545742035, + "learning_rate": 0.0003600326264274062, + "loss": 0.0485, + "num_input_tokens_seen": 9538624, + "step": 4415 + }, + { + "epoch": 0.7210440456769984, + "grad_norm": 0.6088188290596008, + "learning_rate": 0.0003604404567699837, + "loss": 0.1898, + "num_input_tokens_seen": 9547744, + "step": 4420 + }, + { + "epoch": 0.7218597063621534, + "grad_norm": 0.4636387526988983, + "learning_rate": 0.0003608482871125612, + "loss": 0.0678, + "num_input_tokens_seen": 9559072, + "step": 4425 + }, + { + "epoch": 0.7226753670473083, + "grad_norm": 0.4632618725299835, + "learning_rate": 0.0003612561174551386, + "loss": 0.1456, + "num_input_tokens_seen": 9570528, + "step": 4430 + }, + { + "epoch": 0.7234910277324633, + "grad_norm": 0.5356050729751587, + "learning_rate": 0.00036166394779771615, + "loss": 0.1853, + "num_input_tokens_seen": 9580800, + "step": 4435 + }, + { + "epoch": 0.7243066884176182, + "grad_norm": 0.06347585469484329, + "learning_rate": 0.0003620717781402936, + "loss": 0.1803, + "num_input_tokens_seen": 9592064, + "step": 4440 + }, + { + "epoch": 0.7251223491027733, + "grad_norm": 0.03401469439268112, + "learning_rate": 0.00036247960848287116, + "loss": 0.1577, + "num_input_tokens_seen": 9603744, + "step": 4445 + }, + { + "epoch": 0.7259380097879282, + "grad_norm": 0.08639135956764221, + "learning_rate": 0.00036288743882544863, + "loss": 0.247, + "num_input_tokens_seen": 9615008, + "step": 4450 + }, + { + "epoch": 0.7267536704730831, + "grad_norm": 0.52489173412323, + "learning_rate": 0.0003632952691680261, + "loss": 0.198, + "num_input_tokens_seen": 9625376, + "step": 4455 + }, + { + "epoch": 0.7275693311582382, + "grad_norm": 0.23638580739498138, + "learning_rate": 0.0003637030995106036, + "loss": 0.1478, + "num_input_tokens_seen": 9636128, + "step": 4460 + }, + { + "epoch": 0.7283849918433931, + "grad_norm": 0.2670087218284607, + "learning_rate": 0.00036411092985318106, + "loss": 0.2096, + "num_input_tokens_seen": 9647808, + "step": 4465 + }, + { + "epoch": 0.7292006525285482, + "grad_norm": 0.5415324568748474, + "learning_rate": 0.0003645187601957586, + "loss": 0.1935, + "num_input_tokens_seen": 9658496, + "step": 4470 + }, + { + "epoch": 0.7300163132137031, + "grad_norm": 0.27866536378860474, + "learning_rate": 0.00036492659053833607, + "loss": 0.1422, + "num_input_tokens_seen": 9669120, + "step": 4475 + }, + { + "epoch": 0.7308319738988581, + "grad_norm": 0.20106881856918335, + "learning_rate": 0.00036533442088091354, + "loss": 0.1005, + "num_input_tokens_seen": 9680448, + "step": 4480 + }, + { + "epoch": 0.731647634584013, + "grad_norm": 0.2943683862686157, + "learning_rate": 0.000365742251223491, + "loss": 0.0592, + "num_input_tokens_seen": 9690592, + "step": 4485 + }, + { + "epoch": 0.732463295269168, + "grad_norm": 0.8741294741630554, + "learning_rate": 0.00036615008156606855, + "loss": 0.1675, + "num_input_tokens_seen": 9701824, + "step": 4490 + }, + { + "epoch": 0.733278955954323, + "grad_norm": 0.7757192254066467, + "learning_rate": 0.00036655791190864603, + "loss": 0.2252, + "num_input_tokens_seen": 9712384, + "step": 4495 + }, + { + "epoch": 0.734094616639478, + "grad_norm": 0.24651999771595, + "learning_rate": 0.0003669657422512235, + "loss": 0.2337, + "num_input_tokens_seen": 9723200, + "step": 4500 + }, + { + "epoch": 0.734910277324633, + "grad_norm": 0.1742609441280365, + "learning_rate": 0.000367373572593801, + "loss": 0.138, + "num_input_tokens_seen": 9733536, + "step": 4505 + }, + { + "epoch": 0.7357259380097879, + "grad_norm": 0.20956604182720184, + "learning_rate": 0.00036778140293637846, + "loss": 0.2098, + "num_input_tokens_seen": 9744832, + "step": 4510 + }, + { + "epoch": 0.736541598694943, + "grad_norm": 0.4425009489059448, + "learning_rate": 0.000368189233278956, + "loss": 0.1022, + "num_input_tokens_seen": 9755520, + "step": 4515 + }, + { + "epoch": 0.7373572593800979, + "grad_norm": 0.9309787750244141, + "learning_rate": 0.00036859706362153346, + "loss": 0.1647, + "num_input_tokens_seen": 9766208, + "step": 4520 + }, + { + "epoch": 0.7381729200652528, + "grad_norm": 0.6328949332237244, + "learning_rate": 0.0003690048939641109, + "loss": 0.1981, + "num_input_tokens_seen": 9778272, + "step": 4525 + }, + { + "epoch": 0.7389885807504079, + "grad_norm": 0.8695969581604004, + "learning_rate": 0.0003694127243066884, + "loss": 0.1662, + "num_input_tokens_seen": 9789760, + "step": 4530 + }, + { + "epoch": 0.7398042414355628, + "grad_norm": 0.7203797101974487, + "learning_rate": 0.0003698205546492659, + "loss": 0.1121, + "num_input_tokens_seen": 9800800, + "step": 4535 + }, + { + "epoch": 0.7406199021207178, + "grad_norm": 1.077952265739441, + "learning_rate": 0.0003702283849918434, + "loss": 0.1574, + "num_input_tokens_seen": 9812672, + "step": 4540 + }, + { + "epoch": 0.7414355628058727, + "grad_norm": 0.6627715229988098, + "learning_rate": 0.0003706362153344209, + "loss": 0.2866, + "num_input_tokens_seen": 9823232, + "step": 4545 + }, + { + "epoch": 0.7422512234910277, + "grad_norm": 0.1315276026725769, + "learning_rate": 0.0003710440456769984, + "loss": 0.1755, + "num_input_tokens_seen": 9834848, + "step": 4550 + }, + { + "epoch": 0.7430668841761827, + "grad_norm": 0.05795247107744217, + "learning_rate": 0.00037145187601957585, + "loss": 0.1221, + "num_input_tokens_seen": 9846304, + "step": 4555 + }, + { + "epoch": 0.7438825448613376, + "grad_norm": 0.21074354648590088, + "learning_rate": 0.00037185970636215333, + "loss": 0.0962, + "num_input_tokens_seen": 9857472, + "step": 4560 + }, + { + "epoch": 0.7446982055464927, + "grad_norm": 0.11915198713541031, + "learning_rate": 0.00037226753670473086, + "loss": 0.0883, + "num_input_tokens_seen": 9869632, + "step": 4565 + }, + { + "epoch": 0.7455138662316476, + "grad_norm": 0.12031367421150208, + "learning_rate": 0.00037267536704730834, + "loss": 0.0608, + "num_input_tokens_seen": 9881344, + "step": 4570 + }, + { + "epoch": 0.7463295269168027, + "grad_norm": 0.14942015707492828, + "learning_rate": 0.0003730831973898858, + "loss": 0.1219, + "num_input_tokens_seen": 9892640, + "step": 4575 + }, + { + "epoch": 0.7471451876019576, + "grad_norm": 0.525719165802002, + "learning_rate": 0.0003734910277324633, + "loss": 0.0878, + "num_input_tokens_seen": 9903296, + "step": 4580 + }, + { + "epoch": 0.7479608482871125, + "grad_norm": 0.03644242137670517, + "learning_rate": 0.0003738988580750408, + "loss": 0.0736, + "num_input_tokens_seen": 9913952, + "step": 4585 + }, + { + "epoch": 0.7487765089722676, + "grad_norm": 0.02505657821893692, + "learning_rate": 0.0003743066884176183, + "loss": 0.0757, + "num_input_tokens_seen": 9924928, + "step": 4590 + }, + { + "epoch": 0.7495921696574225, + "grad_norm": 0.08508000522851944, + "learning_rate": 0.0003747145187601957, + "loss": 0.0751, + "num_input_tokens_seen": 9936576, + "step": 4595 + }, + { + "epoch": 0.7504078303425775, + "grad_norm": 0.7894995212554932, + "learning_rate": 0.00037512234910277325, + "loss": 0.2172, + "num_input_tokens_seen": 9948288, + "step": 4600 + }, + { + "epoch": 0.7512234910277324, + "grad_norm": 0.46785202622413635, + "learning_rate": 0.00037553017944535073, + "loss": 0.1958, + "num_input_tokens_seen": 9958976, + "step": 4605 + }, + { + "epoch": 0.7520391517128875, + "grad_norm": 0.3128347098827362, + "learning_rate": 0.00037593800978792826, + "loss": 0.2062, + "num_input_tokens_seen": 9969856, + "step": 4610 + }, + { + "epoch": 0.7528548123980424, + "grad_norm": 0.21548837423324585, + "learning_rate": 0.00037634584013050573, + "loss": 0.1369, + "num_input_tokens_seen": 9982176, + "step": 4615 + }, + { + "epoch": 0.7536704730831973, + "grad_norm": 0.61204993724823, + "learning_rate": 0.0003767536704730832, + "loss": 0.1607, + "num_input_tokens_seen": 9994176, + "step": 4620 + }, + { + "epoch": 0.7544861337683524, + "grad_norm": 0.2929581105709076, + "learning_rate": 0.0003771615008156607, + "loss": 0.127, + "num_input_tokens_seen": 10005504, + "step": 4625 + }, + { + "epoch": 0.7553017944535073, + "grad_norm": 0.2728572487831116, + "learning_rate": 0.00037756933115823816, + "loss": 0.1428, + "num_input_tokens_seen": 10016800, + "step": 4630 + }, + { + "epoch": 0.7561174551386624, + "grad_norm": 0.5226534008979797, + "learning_rate": 0.0003779771615008157, + "loss": 0.2997, + "num_input_tokens_seen": 10026720, + "step": 4635 + }, + { + "epoch": 0.7569331158238173, + "grad_norm": 0.32955631613731384, + "learning_rate": 0.00037838499184339317, + "loss": 0.1112, + "num_input_tokens_seen": 10037664, + "step": 4640 + }, + { + "epoch": 0.7577487765089723, + "grad_norm": 0.2558460235595703, + "learning_rate": 0.00037879282218597065, + "loss": 0.1725, + "num_input_tokens_seen": 10049216, + "step": 4645 + }, + { + "epoch": 0.7585644371941273, + "grad_norm": 0.34898290038108826, + "learning_rate": 0.0003792006525285481, + "loss": 0.0869, + "num_input_tokens_seen": 10060960, + "step": 4650 + }, + { + "epoch": 0.7593800978792822, + "grad_norm": 0.2750975489616394, + "learning_rate": 0.0003796084828711256, + "loss": 0.1028, + "num_input_tokens_seen": 10072096, + "step": 4655 + }, + { + "epoch": 0.7601957585644372, + "grad_norm": 0.4596557319164276, + "learning_rate": 0.00038001631321370313, + "loss": 0.0883, + "num_input_tokens_seen": 10083712, + "step": 4660 + }, + { + "epoch": 0.7610114192495921, + "grad_norm": 0.21442389488220215, + "learning_rate": 0.00038042414355628055, + "loss": 0.1009, + "num_input_tokens_seen": 10094656, + "step": 4665 + }, + { + "epoch": 0.7618270799347472, + "grad_norm": 0.41847553849220276, + "learning_rate": 0.0003808319738988581, + "loss": 0.0981, + "num_input_tokens_seen": 10105568, + "step": 4670 + }, + { + "epoch": 0.7626427406199021, + "grad_norm": 0.258605033159256, + "learning_rate": 0.00038123980424143556, + "loss": 0.0465, + "num_input_tokens_seen": 10116800, + "step": 4675 + }, + { + "epoch": 0.763458401305057, + "grad_norm": 0.08785971254110336, + "learning_rate": 0.0003816476345840131, + "loss": 0.151, + "num_input_tokens_seen": 10127360, + "step": 4680 + }, + { + "epoch": 0.7642740619902121, + "grad_norm": 0.44143345952033997, + "learning_rate": 0.00038205546492659057, + "loss": 0.0895, + "num_input_tokens_seen": 10137312, + "step": 4685 + }, + { + "epoch": 0.765089722675367, + "grad_norm": 1.2000739574432373, + "learning_rate": 0.000382463295269168, + "loss": 0.2656, + "num_input_tokens_seen": 10148960, + "step": 4690 + }, + { + "epoch": 0.765905383360522, + "grad_norm": 0.045616984367370605, + "learning_rate": 0.0003828711256117455, + "loss": 0.0484, + "num_input_tokens_seen": 10157856, + "step": 4695 + }, + { + "epoch": 0.766721044045677, + "grad_norm": 0.29947271943092346, + "learning_rate": 0.000383278955954323, + "loss": 0.0808, + "num_input_tokens_seen": 10168672, + "step": 4700 + }, + { + "epoch": 0.767536704730832, + "grad_norm": 0.0342426560819149, + "learning_rate": 0.00038368678629690053, + "loss": 0.0645, + "num_input_tokens_seen": 10178272, + "step": 4705 + }, + { + "epoch": 0.768352365415987, + "grad_norm": 0.40731778740882874, + "learning_rate": 0.000384094616639478, + "loss": 0.0414, + "num_input_tokens_seen": 10188896, + "step": 4710 + }, + { + "epoch": 0.7691680261011419, + "grad_norm": 0.35202035307884216, + "learning_rate": 0.0003845024469820555, + "loss": 0.1535, + "num_input_tokens_seen": 10199488, + "step": 4715 + }, + { + "epoch": 0.7699836867862969, + "grad_norm": 0.38114434480667114, + "learning_rate": 0.00038491027732463296, + "loss": 0.1137, + "num_input_tokens_seen": 10210720, + "step": 4720 + }, + { + "epoch": 0.7707993474714518, + "grad_norm": 0.16754403710365295, + "learning_rate": 0.00038531810766721043, + "loss": 0.0798, + "num_input_tokens_seen": 10221472, + "step": 4725 + }, + { + "epoch": 0.7716150081566069, + "grad_norm": 0.761899471282959, + "learning_rate": 0.00038572593800978796, + "loss": 0.1995, + "num_input_tokens_seen": 10232288, + "step": 4730 + }, + { + "epoch": 0.7724306688417618, + "grad_norm": 0.18429462611675262, + "learning_rate": 0.0003861337683523654, + "loss": 0.1711, + "num_input_tokens_seen": 10243616, + "step": 4735 + }, + { + "epoch": 0.7732463295269169, + "grad_norm": 0.5064928531646729, + "learning_rate": 0.0003865415986949429, + "loss": 0.1026, + "num_input_tokens_seen": 10254464, + "step": 4740 + }, + { + "epoch": 0.7740619902120718, + "grad_norm": 0.21989291906356812, + "learning_rate": 0.0003869494290375204, + "loss": 0.065, + "num_input_tokens_seen": 10265472, + "step": 4745 + }, + { + "epoch": 0.7748776508972267, + "grad_norm": 0.2017669528722763, + "learning_rate": 0.0003873572593800979, + "loss": 0.1856, + "num_input_tokens_seen": 10277184, + "step": 4750 + }, + { + "epoch": 0.7756933115823818, + "grad_norm": 0.43002089858055115, + "learning_rate": 0.0003877650897226754, + "loss": 0.1439, + "num_input_tokens_seen": 10287104, + "step": 4755 + }, + { + "epoch": 0.7765089722675367, + "grad_norm": 0.11221319437026978, + "learning_rate": 0.0003881729200652528, + "loss": 0.1302, + "num_input_tokens_seen": 10297952, + "step": 4760 + }, + { + "epoch": 0.7773246329526917, + "grad_norm": 0.12216249108314514, + "learning_rate": 0.00038858075040783035, + "loss": 0.0866, + "num_input_tokens_seen": 10309440, + "step": 4765 + }, + { + "epoch": 0.7781402936378466, + "grad_norm": 0.12690703570842743, + "learning_rate": 0.00038898858075040783, + "loss": 0.2505, + "num_input_tokens_seen": 10319680, + "step": 4770 + }, + { + "epoch": 0.7789559543230016, + "grad_norm": 0.2630586624145508, + "learning_rate": 0.00038939641109298536, + "loss": 0.1576, + "num_input_tokens_seen": 10331424, + "step": 4775 + }, + { + "epoch": 0.7797716150081566, + "grad_norm": 0.5038022398948669, + "learning_rate": 0.00038980424143556284, + "loss": 0.2242, + "num_input_tokens_seen": 10341600, + "step": 4780 + }, + { + "epoch": 0.7805872756933115, + "grad_norm": 0.30624839663505554, + "learning_rate": 0.00039021207177814026, + "loss": 0.2602, + "num_input_tokens_seen": 10353504, + "step": 4785 + }, + { + "epoch": 0.7814029363784666, + "grad_norm": 0.10484899580478668, + "learning_rate": 0.0003906199021207178, + "loss": 0.0848, + "num_input_tokens_seen": 10365536, + "step": 4790 + }, + { + "epoch": 0.7822185970636215, + "grad_norm": 0.285604327917099, + "learning_rate": 0.00039102773246329527, + "loss": 0.1797, + "num_input_tokens_seen": 10375456, + "step": 4795 + }, + { + "epoch": 0.7830342577487766, + "grad_norm": 0.14094938337802887, + "learning_rate": 0.0003914355628058728, + "loss": 0.1878, + "num_input_tokens_seen": 10386336, + "step": 4800 + }, + { + "epoch": 0.7838499184339315, + "grad_norm": 0.04346349090337753, + "learning_rate": 0.0003918433931484502, + "loss": 0.1707, + "num_input_tokens_seen": 10397504, + "step": 4805 + }, + { + "epoch": 0.7846655791190864, + "grad_norm": 0.128965362906456, + "learning_rate": 0.00039225122349102775, + "loss": 0.0681, + "num_input_tokens_seen": 10407520, + "step": 4810 + }, + { + "epoch": 0.7854812398042414, + "grad_norm": 0.3891755938529968, + "learning_rate": 0.0003926590538336052, + "loss": 0.1559, + "num_input_tokens_seen": 10418368, + "step": 4815 + }, + { + "epoch": 0.7862969004893964, + "grad_norm": 0.0359419621527195, + "learning_rate": 0.0003930668841761827, + "loss": 0.1942, + "num_input_tokens_seen": 10428864, + "step": 4820 + }, + { + "epoch": 0.7871125611745514, + "grad_norm": 0.36615094542503357, + "learning_rate": 0.00039347471451876023, + "loss": 0.1266, + "num_input_tokens_seen": 10440224, + "step": 4825 + }, + { + "epoch": 0.7879282218597063, + "grad_norm": 0.2508382499217987, + "learning_rate": 0.00039388254486133766, + "loss": 0.1644, + "num_input_tokens_seen": 10450912, + "step": 4830 + }, + { + "epoch": 0.7887438825448614, + "grad_norm": 0.19512003660202026, + "learning_rate": 0.0003942903752039152, + "loss": 0.1277, + "num_input_tokens_seen": 10460352, + "step": 4835 + }, + { + "epoch": 0.7895595432300163, + "grad_norm": 0.23638033866882324, + "learning_rate": 0.00039469820554649266, + "loss": 0.242, + "num_input_tokens_seen": 10471968, + "step": 4840 + }, + { + "epoch": 0.7903752039151712, + "grad_norm": 0.43455421924591064, + "learning_rate": 0.0003951060358890702, + "loss": 0.1504, + "num_input_tokens_seen": 10481312, + "step": 4845 + }, + { + "epoch": 0.7911908646003263, + "grad_norm": 0.13896767795085907, + "learning_rate": 0.00039551386623164767, + "loss": 0.1634, + "num_input_tokens_seen": 10491136, + "step": 4850 + }, + { + "epoch": 0.7920065252854812, + "grad_norm": 0.31423068046569824, + "learning_rate": 0.0003959216965742251, + "loss": 0.1022, + "num_input_tokens_seen": 10501344, + "step": 4855 + }, + { + "epoch": 0.7928221859706363, + "grad_norm": 0.08108766376972198, + "learning_rate": 0.0003963295269168026, + "loss": 0.0808, + "num_input_tokens_seen": 10511968, + "step": 4860 + }, + { + "epoch": 0.7936378466557912, + "grad_norm": 0.3036273717880249, + "learning_rate": 0.0003967373572593801, + "loss": 0.139, + "num_input_tokens_seen": 10523296, + "step": 4865 + }, + { + "epoch": 0.7944535073409462, + "grad_norm": 0.18951745331287384, + "learning_rate": 0.00039714518760195763, + "loss": 0.068, + "num_input_tokens_seen": 10532992, + "step": 4870 + }, + { + "epoch": 0.7952691680261011, + "grad_norm": 0.030269593000411987, + "learning_rate": 0.00039755301794453505, + "loss": 0.1123, + "num_input_tokens_seen": 10544768, + "step": 4875 + }, + { + "epoch": 0.7960848287112561, + "grad_norm": 0.29710137844085693, + "learning_rate": 0.00039796084828711253, + "loss": 0.049, + "num_input_tokens_seen": 10555680, + "step": 4880 + }, + { + "epoch": 0.7969004893964111, + "grad_norm": 0.21235564351081848, + "learning_rate": 0.00039836867862969006, + "loss": 0.0519, + "num_input_tokens_seen": 10566304, + "step": 4885 + }, + { + "epoch": 0.797716150081566, + "grad_norm": 0.6146203875541687, + "learning_rate": 0.00039877650897226754, + "loss": 0.0219, + "num_input_tokens_seen": 10576128, + "step": 4890 + }, + { + "epoch": 0.7985318107667211, + "grad_norm": 0.18994970619678497, + "learning_rate": 0.00039918433931484507, + "loss": 0.032, + "num_input_tokens_seen": 10587072, + "step": 4895 + }, + { + "epoch": 0.799347471451876, + "grad_norm": 1.0278782844543457, + "learning_rate": 0.0003995921696574225, + "loss": 0.087, + "num_input_tokens_seen": 10597696, + "step": 4900 + }, + { + "epoch": 0.8001631321370309, + "grad_norm": 0.9544143676757812, + "learning_rate": 0.0004, + "loss": 0.1629, + "num_input_tokens_seen": 10608928, + "step": 4905 + }, + { + "epoch": 0.800978792822186, + "grad_norm": 0.12461934238672256, + "learning_rate": 0.0004004078303425775, + "loss": 0.04, + "num_input_tokens_seen": 10619872, + "step": 4910 + }, + { + "epoch": 0.8017944535073409, + "grad_norm": 0.027850087732076645, + "learning_rate": 0.00040081566068515497, + "loss": 0.1131, + "num_input_tokens_seen": 10631360, + "step": 4915 + }, + { + "epoch": 0.802610114192496, + "grad_norm": 0.21807579696178436, + "learning_rate": 0.0004012234910277325, + "loss": 0.0874, + "num_input_tokens_seen": 10642880, + "step": 4920 + }, + { + "epoch": 0.8034257748776509, + "grad_norm": 0.40161916613578796, + "learning_rate": 0.0004016313213703099, + "loss": 0.0408, + "num_input_tokens_seen": 10651968, + "step": 4925 + }, + { + "epoch": 0.8042414355628059, + "grad_norm": 0.8437064290046692, + "learning_rate": 0.00040203915171288746, + "loss": 0.246, + "num_input_tokens_seen": 10662240, + "step": 4930 + }, + { + "epoch": 0.8050570962479608, + "grad_norm": 0.15779760479927063, + "learning_rate": 0.00040244698205546493, + "loss": 0.0533, + "num_input_tokens_seen": 10672864, + "step": 4935 + }, + { + "epoch": 0.8058727569331158, + "grad_norm": 0.08539305627346039, + "learning_rate": 0.00040285481239804246, + "loss": 0.1148, + "num_input_tokens_seen": 10683936, + "step": 4940 + }, + { + "epoch": 0.8066884176182708, + "grad_norm": 0.8961646556854248, + "learning_rate": 0.0004032626427406199, + "loss": 0.2105, + "num_input_tokens_seen": 10695104, + "step": 4945 + }, + { + "epoch": 0.8075040783034257, + "grad_norm": 0.11104848980903625, + "learning_rate": 0.00040367047308319736, + "loss": 0.1674, + "num_input_tokens_seen": 10706528, + "step": 4950 + }, + { + "epoch": 0.8083197389885808, + "grad_norm": 0.26247522234916687, + "learning_rate": 0.0004040783034257749, + "loss": 0.1362, + "num_input_tokens_seen": 10717728, + "step": 4955 + }, + { + "epoch": 0.8091353996737357, + "grad_norm": 0.030089763924479485, + "learning_rate": 0.00040448613376835237, + "loss": 0.1047, + "num_input_tokens_seen": 10727104, + "step": 4960 + }, + { + "epoch": 0.8099510603588908, + "grad_norm": 0.11344542354345322, + "learning_rate": 0.0004048939641109299, + "loss": 0.1412, + "num_input_tokens_seen": 10737952, + "step": 4965 + }, + { + "epoch": 0.8107667210440457, + "grad_norm": 0.3622676134109497, + "learning_rate": 0.0004053017944535073, + "loss": 0.2602, + "num_input_tokens_seen": 10747744, + "step": 4970 + }, + { + "epoch": 0.8115823817292006, + "grad_norm": 0.08006960898637772, + "learning_rate": 0.00040570962479608485, + "loss": 0.1059, + "num_input_tokens_seen": 10757920, + "step": 4975 + }, + { + "epoch": 0.8123980424143556, + "grad_norm": 0.12218235433101654, + "learning_rate": 0.00040611745513866233, + "loss": 0.0414, + "num_input_tokens_seen": 10768896, + "step": 4980 + }, + { + "epoch": 0.8132137030995106, + "grad_norm": 0.0940176248550415, + "learning_rate": 0.0004065252854812398, + "loss": 0.0599, + "num_input_tokens_seen": 10779136, + "step": 4985 + }, + { + "epoch": 0.8140293637846656, + "grad_norm": 0.17066459357738495, + "learning_rate": 0.00040693311582381734, + "loss": 0.1839, + "num_input_tokens_seen": 10789280, + "step": 4990 + }, + { + "epoch": 0.8148450244698205, + "grad_norm": 0.17495228350162506, + "learning_rate": 0.00040734094616639476, + "loss": 0.1297, + "num_input_tokens_seen": 10800192, + "step": 4995 + }, + { + "epoch": 0.8156606851549756, + "grad_norm": 0.4211640954017639, + "learning_rate": 0.0004077487765089723, + "loss": 0.0795, + "num_input_tokens_seen": 10811296, + "step": 5000 + }, + { + "epoch": 0.8164763458401305, + "grad_norm": 0.1242627203464508, + "learning_rate": 0.00040815660685154977, + "loss": 0.0298, + "num_input_tokens_seen": 10822272, + "step": 5005 + }, + { + "epoch": 0.8172920065252854, + "grad_norm": 0.684248149394989, + "learning_rate": 0.00040856443719412724, + "loss": 0.0643, + "num_input_tokens_seen": 10832832, + "step": 5010 + }, + { + "epoch": 0.8181076672104405, + "grad_norm": 0.5153582692146301, + "learning_rate": 0.00040897226753670477, + "loss": 0.2636, + "num_input_tokens_seen": 10843008, + "step": 5015 + }, + { + "epoch": 0.8189233278955954, + "grad_norm": 0.13736863434314728, + "learning_rate": 0.0004093800978792822, + "loss": 0.0742, + "num_input_tokens_seen": 10856000, + "step": 5020 + }, + { + "epoch": 0.8197389885807504, + "grad_norm": 0.07956301420927048, + "learning_rate": 0.0004097879282218597, + "loss": 0.0642, + "num_input_tokens_seen": 10866144, + "step": 5025 + }, + { + "epoch": 0.8205546492659054, + "grad_norm": 0.7332919239997864, + "learning_rate": 0.0004101957585644372, + "loss": 0.3499, + "num_input_tokens_seen": 10877312, + "step": 5030 + }, + { + "epoch": 0.8213703099510603, + "grad_norm": 0.054674167186021805, + "learning_rate": 0.00041060358890701473, + "loss": 0.1298, + "num_input_tokens_seen": 10887040, + "step": 5035 + }, + { + "epoch": 0.8221859706362153, + "grad_norm": 0.20225413143634796, + "learning_rate": 0.00041101141924959215, + "loss": 0.0691, + "num_input_tokens_seen": 10897440, + "step": 5040 + }, + { + "epoch": 0.8230016313213703, + "grad_norm": 0.6315981149673462, + "learning_rate": 0.00041141924959216963, + "loss": 0.0708, + "num_input_tokens_seen": 10909504, + "step": 5045 + }, + { + "epoch": 0.8238172920065253, + "grad_norm": 0.7156874537467957, + "learning_rate": 0.00041182707993474716, + "loss": 0.3704, + "num_input_tokens_seen": 10921728, + "step": 5050 + }, + { + "epoch": 0.8246329526916802, + "grad_norm": 0.16364993155002594, + "learning_rate": 0.00041223491027732464, + "loss": 0.0518, + "num_input_tokens_seen": 10931776, + "step": 5055 + }, + { + "epoch": 0.8254486133768353, + "grad_norm": 0.12342008948326111, + "learning_rate": 0.00041264274061990217, + "loss": 0.0695, + "num_input_tokens_seen": 10942784, + "step": 5060 + }, + { + "epoch": 0.8262642740619902, + "grad_norm": 0.04900471493601799, + "learning_rate": 0.0004130505709624796, + "loss": 0.0299, + "num_input_tokens_seen": 10954272, + "step": 5065 + }, + { + "epoch": 0.8270799347471451, + "grad_norm": 0.40814009308815, + "learning_rate": 0.0004134584013050571, + "loss": 0.1194, + "num_input_tokens_seen": 10964864, + "step": 5070 + }, + { + "epoch": 0.8278955954323002, + "grad_norm": 0.0919327363371849, + "learning_rate": 0.0004138662316476346, + "loss": 0.1849, + "num_input_tokens_seen": 10975104, + "step": 5075 + }, + { + "epoch": 0.8287112561174551, + "grad_norm": 0.6414536237716675, + "learning_rate": 0.0004142740619902121, + "loss": 0.0728, + "num_input_tokens_seen": 10985152, + "step": 5080 + }, + { + "epoch": 0.8295269168026101, + "grad_norm": 0.05294128879904747, + "learning_rate": 0.0004146818923327896, + "loss": 0.0663, + "num_input_tokens_seen": 10996640, + "step": 5085 + }, + { + "epoch": 0.8303425774877651, + "grad_norm": 0.0711590051651001, + "learning_rate": 0.00041508972267536703, + "loss": 0.1359, + "num_input_tokens_seen": 11007328, + "step": 5090 + }, + { + "epoch": 0.8311582381729201, + "grad_norm": 0.27603647112846375, + "learning_rate": 0.00041549755301794456, + "loss": 0.1779, + "num_input_tokens_seen": 11017760, + "step": 5095 + }, + { + "epoch": 0.831973898858075, + "grad_norm": 0.6225929856300354, + "learning_rate": 0.00041590538336052203, + "loss": 0.0765, + "num_input_tokens_seen": 11028768, + "step": 5100 + }, + { + "epoch": 0.83278955954323, + "grad_norm": 0.2413845807313919, + "learning_rate": 0.00041631321370309957, + "loss": 0.269, + "num_input_tokens_seen": 11039360, + "step": 5105 + }, + { + "epoch": 0.833605220228385, + "grad_norm": 0.3218750059604645, + "learning_rate": 0.000416721044045677, + "loss": 0.0634, + "num_input_tokens_seen": 11050272, + "step": 5110 + }, + { + "epoch": 0.8344208809135399, + "grad_norm": 0.42538219690322876, + "learning_rate": 0.00041712887438825446, + "loss": 0.3043, + "num_input_tokens_seen": 11060736, + "step": 5115 + }, + { + "epoch": 0.835236541598695, + "grad_norm": 0.951379656791687, + "learning_rate": 0.000417536704730832, + "loss": 0.1503, + "num_input_tokens_seen": 11070688, + "step": 5120 + }, + { + "epoch": 0.8360522022838499, + "grad_norm": 0.5826851725578308, + "learning_rate": 0.00041794453507340947, + "loss": 0.2831, + "num_input_tokens_seen": 11080800, + "step": 5125 + }, + { + "epoch": 0.8368678629690048, + "grad_norm": 0.1050708070397377, + "learning_rate": 0.000418352365415987, + "loss": 0.0262, + "num_input_tokens_seen": 11091680, + "step": 5130 + }, + { + "epoch": 0.8376835236541599, + "grad_norm": 0.08830724656581879, + "learning_rate": 0.0004187601957585644, + "loss": 0.1475, + "num_input_tokens_seen": 11102656, + "step": 5135 + }, + { + "epoch": 0.8384991843393148, + "grad_norm": 0.21703684329986572, + "learning_rate": 0.0004191680261011419, + "loss": 0.0642, + "num_input_tokens_seen": 11112960, + "step": 5140 + }, + { + "epoch": 0.8393148450244698, + "grad_norm": 0.07175783067941666, + "learning_rate": 0.00041957585644371943, + "loss": 0.098, + "num_input_tokens_seen": 11124512, + "step": 5145 + }, + { + "epoch": 0.8401305057096248, + "grad_norm": 0.33142393827438354, + "learning_rate": 0.0004199836867862969, + "loss": 0.2984, + "num_input_tokens_seen": 11134976, + "step": 5150 + }, + { + "epoch": 0.8409461663947798, + "grad_norm": 0.46356773376464844, + "learning_rate": 0.00042039151712887444, + "loss": 0.1555, + "num_input_tokens_seen": 11146144, + "step": 5155 + }, + { + "epoch": 0.8417618270799347, + "grad_norm": 0.36611565947532654, + "learning_rate": 0.00042079934747145186, + "loss": 0.0533, + "num_input_tokens_seen": 11157152, + "step": 5160 + }, + { + "epoch": 0.8425774877650897, + "grad_norm": 0.11518274247646332, + "learning_rate": 0.0004212071778140294, + "loss": 0.0976, + "num_input_tokens_seen": 11166400, + "step": 5165 + }, + { + "epoch": 0.8433931484502447, + "grad_norm": 0.3951309025287628, + "learning_rate": 0.00042161500815660687, + "loss": 0.0887, + "num_input_tokens_seen": 11178016, + "step": 5170 + }, + { + "epoch": 0.8442088091353996, + "grad_norm": 0.4681323766708374, + "learning_rate": 0.00042202283849918434, + "loss": 0.1065, + "num_input_tokens_seen": 11188992, + "step": 5175 + }, + { + "epoch": 0.8450244698205547, + "grad_norm": 0.26340165734291077, + "learning_rate": 0.0004224306688417618, + "loss": 0.0694, + "num_input_tokens_seen": 11200160, + "step": 5180 + }, + { + "epoch": 0.8458401305057096, + "grad_norm": 0.562423586845398, + "learning_rate": 0.0004228384991843393, + "loss": 0.2449, + "num_input_tokens_seen": 11211776, + "step": 5185 + }, + { + "epoch": 0.8466557911908646, + "grad_norm": 0.7215205430984497, + "learning_rate": 0.00042324632952691683, + "loss": 0.2644, + "num_input_tokens_seen": 11223328, + "step": 5190 + }, + { + "epoch": 0.8474714518760196, + "grad_norm": 0.04237314313650131, + "learning_rate": 0.0004236541598694943, + "loss": 0.1336, + "num_input_tokens_seen": 11234528, + "step": 5195 + }, + { + "epoch": 0.8482871125611745, + "grad_norm": 0.09171731770038605, + "learning_rate": 0.00042406199021207183, + "loss": 0.1566, + "num_input_tokens_seen": 11245920, + "step": 5200 + }, + { + "epoch": 0.8491027732463295, + "grad_norm": 0.135093554854393, + "learning_rate": 0.00042446982055464926, + "loss": 0.1481, + "num_input_tokens_seen": 11258080, + "step": 5205 + }, + { + "epoch": 0.8499184339314845, + "grad_norm": 0.10724607855081558, + "learning_rate": 0.00042487765089722673, + "loss": 0.1028, + "num_input_tokens_seen": 11270176, + "step": 5210 + }, + { + "epoch": 0.8507340946166395, + "grad_norm": 0.08377696573734283, + "learning_rate": 0.00042528548123980426, + "loss": 0.0622, + "num_input_tokens_seen": 11282016, + "step": 5215 + }, + { + "epoch": 0.8515497553017944, + "grad_norm": 0.12989285588264465, + "learning_rate": 0.00042569331158238174, + "loss": 0.0544, + "num_input_tokens_seen": 11292320, + "step": 5220 + }, + { + "epoch": 0.8523654159869495, + "grad_norm": 0.13682430982589722, + "learning_rate": 0.00042610114192495927, + "loss": 0.1094, + "num_input_tokens_seen": 11303040, + "step": 5225 + }, + { + "epoch": 0.8531810766721044, + "grad_norm": 0.7620516419410706, + "learning_rate": 0.0004265089722675367, + "loss": 0.1155, + "num_input_tokens_seen": 11312672, + "step": 5230 + }, + { + "epoch": 0.8539967373572593, + "grad_norm": 0.033026549965143204, + "learning_rate": 0.00042691680261011417, + "loss": 0.0373, + "num_input_tokens_seen": 11323488, + "step": 5235 + }, + { + "epoch": 0.8548123980424144, + "grad_norm": 0.16827772557735443, + "learning_rate": 0.0004273246329526917, + "loss": 0.0487, + "num_input_tokens_seen": 11334176, + "step": 5240 + }, + { + "epoch": 0.8556280587275693, + "grad_norm": 0.16478866338729858, + "learning_rate": 0.0004277324632952692, + "loss": 0.1613, + "num_input_tokens_seen": 11345632, + "step": 5245 + }, + { + "epoch": 0.8564437194127243, + "grad_norm": 0.2556282579898834, + "learning_rate": 0.00042814029363784665, + "loss": 0.0364, + "num_input_tokens_seen": 11355552, + "step": 5250 + }, + { + "epoch": 0.8572593800978793, + "grad_norm": 0.02794981375336647, + "learning_rate": 0.00042854812398042413, + "loss": 0.1908, + "num_input_tokens_seen": 11367072, + "step": 5255 + }, + { + "epoch": 0.8580750407830342, + "grad_norm": 0.5714795589447021, + "learning_rate": 0.00042895595432300166, + "loss": 0.2137, + "num_input_tokens_seen": 11377120, + "step": 5260 + }, + { + "epoch": 0.8588907014681892, + "grad_norm": 0.11488376557826996, + "learning_rate": 0.00042936378466557914, + "loss": 0.0982, + "num_input_tokens_seen": 11388416, + "step": 5265 + }, + { + "epoch": 0.8597063621533442, + "grad_norm": 0.08155690133571625, + "learning_rate": 0.0004297716150081566, + "loss": 0.1723, + "num_input_tokens_seen": 11398528, + "step": 5270 + }, + { + "epoch": 0.8605220228384992, + "grad_norm": 0.10925207287073135, + "learning_rate": 0.0004301794453507341, + "loss": 0.0976, + "num_input_tokens_seen": 11408576, + "step": 5275 + }, + { + "epoch": 0.8613376835236541, + "grad_norm": 0.23385116457939148, + "learning_rate": 0.00043058727569331157, + "loss": 0.0709, + "num_input_tokens_seen": 11420000, + "step": 5280 + }, + { + "epoch": 0.8621533442088092, + "grad_norm": 0.510635495185852, + "learning_rate": 0.0004309951060358891, + "loss": 0.1581, + "num_input_tokens_seen": 11430528, + "step": 5285 + }, + { + "epoch": 0.8629690048939641, + "grad_norm": 0.08769966661930084, + "learning_rate": 0.0004314029363784666, + "loss": 0.0703, + "num_input_tokens_seen": 11440992, + "step": 5290 + }, + { + "epoch": 0.863784665579119, + "grad_norm": 0.09656643867492676, + "learning_rate": 0.0004318107667210441, + "loss": 0.1036, + "num_input_tokens_seen": 11451712, + "step": 5295 + }, + { + "epoch": 0.8646003262642741, + "grad_norm": 0.5707296133041382, + "learning_rate": 0.0004322185970636215, + "loss": 0.1564, + "num_input_tokens_seen": 11463584, + "step": 5300 + }, + { + "epoch": 0.865415986949429, + "grad_norm": 0.18754911422729492, + "learning_rate": 0.000432626427406199, + "loss": 0.0607, + "num_input_tokens_seen": 11474560, + "step": 5305 + }, + { + "epoch": 0.866231647634584, + "grad_norm": 0.0587138757109642, + "learning_rate": 0.00043303425774877653, + "loss": 0.0883, + "num_input_tokens_seen": 11486080, + "step": 5310 + }, + { + "epoch": 0.867047308319739, + "grad_norm": 0.026419376954436302, + "learning_rate": 0.000433442088091354, + "loss": 0.1829, + "num_input_tokens_seen": 11498464, + "step": 5315 + }, + { + "epoch": 0.867862969004894, + "grad_norm": 0.5190200209617615, + "learning_rate": 0.0004338499184339315, + "loss": 0.2018, + "num_input_tokens_seen": 11509376, + "step": 5320 + }, + { + "epoch": 0.8686786296900489, + "grad_norm": 0.07488425821065903, + "learning_rate": 0.00043425774877650896, + "loss": 0.202, + "num_input_tokens_seen": 11520480, + "step": 5325 + }, + { + "epoch": 0.8694942903752039, + "grad_norm": 0.18614496290683746, + "learning_rate": 0.0004346655791190865, + "loss": 0.0892, + "num_input_tokens_seen": 11531136, + "step": 5330 + }, + { + "epoch": 0.8703099510603589, + "grad_norm": 0.05950484424829483, + "learning_rate": 0.00043507340946166397, + "loss": 0.0623, + "num_input_tokens_seen": 11541408, + "step": 5335 + }, + { + "epoch": 0.8711256117455138, + "grad_norm": 0.21500875055789948, + "learning_rate": 0.00043548123980424145, + "loss": 0.2217, + "num_input_tokens_seen": 11552320, + "step": 5340 + }, + { + "epoch": 0.8719412724306689, + "grad_norm": 0.773648202419281, + "learning_rate": 0.0004358890701468189, + "loss": 0.2593, + "num_input_tokens_seen": 11563232, + "step": 5345 + }, + { + "epoch": 0.8727569331158238, + "grad_norm": 0.14433734118938446, + "learning_rate": 0.0004362969004893964, + "loss": 0.0955, + "num_input_tokens_seen": 11573856, + "step": 5350 + }, + { + "epoch": 0.8735725938009788, + "grad_norm": 0.10043247789144516, + "learning_rate": 0.00043670473083197393, + "loss": 0.1101, + "num_input_tokens_seen": 11584960, + "step": 5355 + }, + { + "epoch": 0.8743882544861338, + "grad_norm": 0.03573020547628403, + "learning_rate": 0.0004371125611745514, + "loss": 0.1191, + "num_input_tokens_seen": 11595168, + "step": 5360 + }, + { + "epoch": 0.8752039151712887, + "grad_norm": 0.25010839104652405, + "learning_rate": 0.0004375203915171289, + "loss": 0.1045, + "num_input_tokens_seen": 11605472, + "step": 5365 + }, + { + "epoch": 0.8760195758564437, + "grad_norm": 0.22654956579208374, + "learning_rate": 0.00043792822185970636, + "loss": 0.035, + "num_input_tokens_seen": 11616224, + "step": 5370 + }, + { + "epoch": 0.8768352365415987, + "grad_norm": 0.20199733972549438, + "learning_rate": 0.00043833605220228384, + "loss": 0.077, + "num_input_tokens_seen": 11626816, + "step": 5375 + }, + { + "epoch": 0.8776508972267537, + "grad_norm": 0.20932228863239288, + "learning_rate": 0.00043874388254486137, + "loss": 0.0706, + "num_input_tokens_seen": 11637152, + "step": 5380 + }, + { + "epoch": 0.8784665579119086, + "grad_norm": 0.268714964389801, + "learning_rate": 0.00043915171288743884, + "loss": 0.1373, + "num_input_tokens_seen": 11648160, + "step": 5385 + }, + { + "epoch": 0.8792822185970636, + "grad_norm": 0.25124499201774597, + "learning_rate": 0.0004395595432300163, + "loss": 0.2045, + "num_input_tokens_seen": 11660320, + "step": 5390 + }, + { + "epoch": 0.8800978792822186, + "grad_norm": 0.030649229884147644, + "learning_rate": 0.0004399673735725938, + "loss": 0.1744, + "num_input_tokens_seen": 11670816, + "step": 5395 + }, + { + "epoch": 0.8809135399673735, + "grad_norm": 0.06209159642457962, + "learning_rate": 0.00044037520391517127, + "loss": 0.0314, + "num_input_tokens_seen": 11682560, + "step": 5400 + }, + { + "epoch": 0.8817292006525286, + "grad_norm": 0.13285031914710999, + "learning_rate": 0.0004407830342577488, + "loss": 0.3882, + "num_input_tokens_seen": 11694400, + "step": 5405 + }, + { + "epoch": 0.8825448613376835, + "grad_norm": 0.14116713404655457, + "learning_rate": 0.0004411908646003263, + "loss": 0.0998, + "num_input_tokens_seen": 11705728, + "step": 5410 + }, + { + "epoch": 0.8833605220228385, + "grad_norm": 0.23796717822551727, + "learning_rate": 0.00044159869494290376, + "loss": 0.1255, + "num_input_tokens_seen": 11716736, + "step": 5415 + }, + { + "epoch": 0.8841761827079935, + "grad_norm": 0.18344132602214813, + "learning_rate": 0.00044200652528548123, + "loss": 0.0871, + "num_input_tokens_seen": 11726208, + "step": 5420 + }, + { + "epoch": 0.8849918433931484, + "grad_norm": 0.10778245329856873, + "learning_rate": 0.00044241435562805876, + "loss": 0.0952, + "num_input_tokens_seen": 11737664, + "step": 5425 + }, + { + "epoch": 0.8858075040783034, + "grad_norm": 0.22092236578464508, + "learning_rate": 0.00044282218597063624, + "loss": 0.1987, + "num_input_tokens_seen": 11749312, + "step": 5430 + }, + { + "epoch": 0.8866231647634584, + "grad_norm": 0.6242573261260986, + "learning_rate": 0.0004432300163132137, + "loss": 0.2475, + "num_input_tokens_seen": 11760192, + "step": 5435 + }, + { + "epoch": 0.8874388254486134, + "grad_norm": 0.2279716432094574, + "learning_rate": 0.0004436378466557912, + "loss": 0.164, + "num_input_tokens_seen": 11770464, + "step": 5440 + }, + { + "epoch": 0.8882544861337683, + "grad_norm": 0.06530027091503143, + "learning_rate": 0.00044404567699836867, + "loss": 0.1861, + "num_input_tokens_seen": 11781344, + "step": 5445 + }, + { + "epoch": 0.8890701468189234, + "grad_norm": 0.14814841747283936, + "learning_rate": 0.0004444535073409462, + "loss": 0.116, + "num_input_tokens_seen": 11791968, + "step": 5450 + }, + { + "epoch": 0.8898858075040783, + "grad_norm": 0.3580428659915924, + "learning_rate": 0.0004448613376835237, + "loss": 0.1099, + "num_input_tokens_seen": 11802624, + "step": 5455 + }, + { + "epoch": 0.8907014681892332, + "grad_norm": 0.944884717464447, + "learning_rate": 0.0004452691680261011, + "loss": 0.2742, + "num_input_tokens_seen": 11813248, + "step": 5460 + }, + { + "epoch": 0.8915171288743883, + "grad_norm": 0.2670440077781677, + "learning_rate": 0.00044567699836867863, + "loss": 0.1914, + "num_input_tokens_seen": 11823040, + "step": 5465 + }, + { + "epoch": 0.8923327895595432, + "grad_norm": 0.3020407259464264, + "learning_rate": 0.0004460848287112561, + "loss": 0.1478, + "num_input_tokens_seen": 11834528, + "step": 5470 + }, + { + "epoch": 0.8931484502446982, + "grad_norm": 0.21428096294403076, + "learning_rate": 0.00044649265905383364, + "loss": 0.1098, + "num_input_tokens_seen": 11844896, + "step": 5475 + }, + { + "epoch": 0.8939641109298532, + "grad_norm": 0.2839694321155548, + "learning_rate": 0.0004469004893964111, + "loss": 0.0678, + "num_input_tokens_seen": 11855392, + "step": 5480 + }, + { + "epoch": 0.8947797716150081, + "grad_norm": 0.6894422769546509, + "learning_rate": 0.0004473083197389886, + "loss": 0.2677, + "num_input_tokens_seen": 11864608, + "step": 5485 + }, + { + "epoch": 0.8955954323001631, + "grad_norm": 0.20967309176921844, + "learning_rate": 0.00044771615008156607, + "loss": 0.2639, + "num_input_tokens_seen": 11875776, + "step": 5490 + }, + { + "epoch": 0.8964110929853181, + "grad_norm": 0.37381711602211, + "learning_rate": 0.00044812398042414354, + "loss": 0.0903, + "num_input_tokens_seen": 11885472, + "step": 5495 + }, + { + "epoch": 0.8972267536704731, + "grad_norm": 0.6217723488807678, + "learning_rate": 0.00044853181076672107, + "loss": 0.178, + "num_input_tokens_seen": 11897056, + "step": 5500 + }, + { + "epoch": 0.898042414355628, + "grad_norm": 0.4900282323360443, + "learning_rate": 0.00044893964110929855, + "loss": 0.1818, + "num_input_tokens_seen": 11907104, + "step": 5505 + }, + { + "epoch": 0.8988580750407831, + "grad_norm": 0.05841980502009392, + "learning_rate": 0.000449347471451876, + "loss": 0.1849, + "num_input_tokens_seen": 11916960, + "step": 5510 + }, + { + "epoch": 0.899673735725938, + "grad_norm": 0.0654044821858406, + "learning_rate": 0.0004497553017944535, + "loss": 0.0843, + "num_input_tokens_seen": 11927232, + "step": 5515 + }, + { + "epoch": 0.9004893964110929, + "grad_norm": 0.1382654458284378, + "learning_rate": 0.00045016313213703103, + "loss": 0.2114, + "num_input_tokens_seen": 11938272, + "step": 5520 + }, + { + "epoch": 0.901305057096248, + "grad_norm": 0.2675796449184418, + "learning_rate": 0.0004505709624796085, + "loss": 0.2142, + "num_input_tokens_seen": 11949632, + "step": 5525 + }, + { + "epoch": 0.9021207177814029, + "grad_norm": 0.05411672219634056, + "learning_rate": 0.00045097879282218593, + "loss": 0.1789, + "num_input_tokens_seen": 11959232, + "step": 5530 + }, + { + "epoch": 0.9029363784665579, + "grad_norm": 0.3251686096191406, + "learning_rate": 0.00045138662316476346, + "loss": 0.1218, + "num_input_tokens_seen": 11970304, + "step": 5535 + }, + { + "epoch": 0.9037520391517129, + "grad_norm": 0.15985806286334991, + "learning_rate": 0.00045179445350734094, + "loss": 0.1069, + "num_input_tokens_seen": 11980864, + "step": 5540 + }, + { + "epoch": 0.9045676998368679, + "grad_norm": 0.5636354684829712, + "learning_rate": 0.00045220228384991847, + "loss": 0.1869, + "num_input_tokens_seen": 11990592, + "step": 5545 + }, + { + "epoch": 0.9053833605220228, + "grad_norm": 0.07867056131362915, + "learning_rate": 0.00045261011419249595, + "loss": 0.0485, + "num_input_tokens_seen": 12002720, + "step": 5550 + }, + { + "epoch": 0.9061990212071778, + "grad_norm": 0.7367706298828125, + "learning_rate": 0.0004530179445350734, + "loss": 0.126, + "num_input_tokens_seen": 12012896, + "step": 5555 + }, + { + "epoch": 0.9070146818923328, + "grad_norm": 0.75595623254776, + "learning_rate": 0.0004534257748776509, + "loss": 0.1054, + "num_input_tokens_seen": 12025120, + "step": 5560 + }, + { + "epoch": 0.9078303425774877, + "grad_norm": 0.9178465008735657, + "learning_rate": 0.0004538336052202284, + "loss": 0.3257, + "num_input_tokens_seen": 12036384, + "step": 5565 + }, + { + "epoch": 0.9086460032626428, + "grad_norm": 0.47468215227127075, + "learning_rate": 0.0004542414355628059, + "loss": 0.2105, + "num_input_tokens_seen": 12047296, + "step": 5570 + }, + { + "epoch": 0.9094616639477977, + "grad_norm": 0.175617977976799, + "learning_rate": 0.0004546492659053834, + "loss": 0.1081, + "num_input_tokens_seen": 12057664, + "step": 5575 + }, + { + "epoch": 0.9102773246329527, + "grad_norm": 0.21897639334201813, + "learning_rate": 0.00045505709624796086, + "loss": 0.1315, + "num_input_tokens_seen": 12069344, + "step": 5580 + }, + { + "epoch": 0.9110929853181077, + "grad_norm": 0.10874702781438828, + "learning_rate": 0.00045546492659053833, + "loss": 0.0992, + "num_input_tokens_seen": 12080224, + "step": 5585 + }, + { + "epoch": 0.9119086460032626, + "grad_norm": 0.36471787095069885, + "learning_rate": 0.0004558727569331158, + "loss": 0.0847, + "num_input_tokens_seen": 12089824, + "step": 5590 + }, + { + "epoch": 0.9127243066884176, + "grad_norm": 0.024809151887893677, + "learning_rate": 0.00045628058727569334, + "loss": 0.0778, + "num_input_tokens_seen": 12101152, + "step": 5595 + }, + { + "epoch": 0.9135399673735726, + "grad_norm": 0.024391191080212593, + "learning_rate": 0.00045668841761827076, + "loss": 0.1407, + "num_input_tokens_seen": 12111936, + "step": 5600 + }, + { + "epoch": 0.9143556280587276, + "grad_norm": 0.18645010888576508, + "learning_rate": 0.0004570962479608483, + "loss": 0.1784, + "num_input_tokens_seen": 12122528, + "step": 5605 + }, + { + "epoch": 0.9151712887438825, + "grad_norm": 0.03236968070268631, + "learning_rate": 0.00045750407830342577, + "loss": 0.2051, + "num_input_tokens_seen": 12132128, + "step": 5610 + }, + { + "epoch": 0.9159869494290375, + "grad_norm": 0.036100562661886215, + "learning_rate": 0.0004579119086460033, + "loss": 0.2114, + "num_input_tokens_seen": 12143552, + "step": 5615 + }, + { + "epoch": 0.9168026101141925, + "grad_norm": 0.16130702197551727, + "learning_rate": 0.0004583197389885808, + "loss": 0.1329, + "num_input_tokens_seen": 12152992, + "step": 5620 + }, + { + "epoch": 0.9176182707993474, + "grad_norm": 0.23771022260189056, + "learning_rate": 0.0004587275693311582, + "loss": 0.1475, + "num_input_tokens_seen": 12163488, + "step": 5625 + }, + { + "epoch": 0.9184339314845025, + "grad_norm": 0.2382916957139969, + "learning_rate": 0.00045913539967373573, + "loss": 0.0997, + "num_input_tokens_seen": 12174976, + "step": 5630 + }, + { + "epoch": 0.9192495921696574, + "grad_norm": 0.3301408588886261, + "learning_rate": 0.0004595432300163132, + "loss": 0.0692, + "num_input_tokens_seen": 12185984, + "step": 5635 + }, + { + "epoch": 0.9200652528548124, + "grad_norm": 0.7716155648231506, + "learning_rate": 0.00045995106035889074, + "loss": 0.2212, + "num_input_tokens_seen": 12196768, + "step": 5640 + }, + { + "epoch": 0.9208809135399674, + "grad_norm": 0.11273916065692902, + "learning_rate": 0.0004603588907014682, + "loss": 0.0561, + "num_input_tokens_seen": 12209120, + "step": 5645 + }, + { + "epoch": 0.9216965742251223, + "grad_norm": 0.07322194427251816, + "learning_rate": 0.0004607667210440457, + "loss": 0.0937, + "num_input_tokens_seen": 12220768, + "step": 5650 + }, + { + "epoch": 0.9225122349102773, + "grad_norm": 0.1523495316505432, + "learning_rate": 0.00046117455138662317, + "loss": 0.047, + "num_input_tokens_seen": 12230720, + "step": 5655 + }, + { + "epoch": 0.9233278955954323, + "grad_norm": 0.6982433795928955, + "learning_rate": 0.00046158238172920064, + "loss": 0.1112, + "num_input_tokens_seen": 12242880, + "step": 5660 + }, + { + "epoch": 0.9241435562805873, + "grad_norm": 0.05697758495807648, + "learning_rate": 0.0004619902120717782, + "loss": 0.0375, + "num_input_tokens_seen": 12253760, + "step": 5665 + }, + { + "epoch": 0.9249592169657422, + "grad_norm": 0.05087430030107498, + "learning_rate": 0.0004623980424143556, + "loss": 0.1231, + "num_input_tokens_seen": 12263712, + "step": 5670 + }, + { + "epoch": 0.9257748776508973, + "grad_norm": 0.04257337376475334, + "learning_rate": 0.00046280587275693313, + "loss": 0.145, + "num_input_tokens_seen": 12274560, + "step": 5675 + }, + { + "epoch": 0.9265905383360522, + "grad_norm": 0.3906922936439514, + "learning_rate": 0.0004632137030995106, + "loss": 0.159, + "num_input_tokens_seen": 12285024, + "step": 5680 + }, + { + "epoch": 0.9274061990212071, + "grad_norm": 0.21243229508399963, + "learning_rate": 0.00046362153344208813, + "loss": 0.1451, + "num_input_tokens_seen": 12295712, + "step": 5685 + }, + { + "epoch": 0.9282218597063622, + "grad_norm": 0.46332404017448425, + "learning_rate": 0.0004640293637846656, + "loss": 0.1199, + "num_input_tokens_seen": 12305024, + "step": 5690 + }, + { + "epoch": 0.9290375203915171, + "grad_norm": 0.20840857923030853, + "learning_rate": 0.00046443719412724303, + "loss": 0.1054, + "num_input_tokens_seen": 12315360, + "step": 5695 + }, + { + "epoch": 0.9298531810766721, + "grad_norm": 0.39110761880874634, + "learning_rate": 0.00046484502446982056, + "loss": 0.0757, + "num_input_tokens_seen": 12326464, + "step": 5700 + }, + { + "epoch": 0.9306688417618271, + "grad_norm": 0.3890259861946106, + "learning_rate": 0.00046525285481239804, + "loss": 0.0945, + "num_input_tokens_seen": 12337600, + "step": 5705 + }, + { + "epoch": 0.9314845024469821, + "grad_norm": 0.19361518323421478, + "learning_rate": 0.00046566068515497557, + "loss": 0.2785, + "num_input_tokens_seen": 12347904, + "step": 5710 + }, + { + "epoch": 0.932300163132137, + "grad_norm": 0.15051914751529694, + "learning_rate": 0.00046606851549755305, + "loss": 0.1508, + "num_input_tokens_seen": 12359712, + "step": 5715 + }, + { + "epoch": 0.933115823817292, + "grad_norm": 0.12826696038246155, + "learning_rate": 0.00046647634584013047, + "loss": 0.1295, + "num_input_tokens_seen": 12369952, + "step": 5720 + }, + { + "epoch": 0.933931484502447, + "grad_norm": 0.20581692457199097, + "learning_rate": 0.000466884176182708, + "loss": 0.1429, + "num_input_tokens_seen": 12380384, + "step": 5725 + }, + { + "epoch": 0.9347471451876019, + "grad_norm": 0.0365518257021904, + "learning_rate": 0.0004672920065252855, + "loss": 0.1297, + "num_input_tokens_seen": 12391904, + "step": 5730 + }, + { + "epoch": 0.935562805872757, + "grad_norm": 0.04881615564227104, + "learning_rate": 0.000467699836867863, + "loss": 0.0857, + "num_input_tokens_seen": 12402944, + "step": 5735 + }, + { + "epoch": 0.9363784665579119, + "grad_norm": 0.26118239760398865, + "learning_rate": 0.0004681076672104405, + "loss": 0.2689, + "num_input_tokens_seen": 12414432, + "step": 5740 + }, + { + "epoch": 0.9371941272430668, + "grad_norm": 0.18851740658283234, + "learning_rate": 0.00046851549755301796, + "loss": 0.2165, + "num_input_tokens_seen": 12425024, + "step": 5745 + }, + { + "epoch": 0.9380097879282219, + "grad_norm": 0.5728957056999207, + "learning_rate": 0.00046892332789559544, + "loss": 0.2883, + "num_input_tokens_seen": 12435232, + "step": 5750 + }, + { + "epoch": 0.9388254486133768, + "grad_norm": 0.13215041160583496, + "learning_rate": 0.0004693311582381729, + "loss": 0.1216, + "num_input_tokens_seen": 12445984, + "step": 5755 + }, + { + "epoch": 0.9396411092985318, + "grad_norm": 0.2319490611553192, + "learning_rate": 0.00046973898858075044, + "loss": 0.1076, + "num_input_tokens_seen": 12456416, + "step": 5760 + }, + { + "epoch": 0.9404567699836868, + "grad_norm": 0.16915561258792877, + "learning_rate": 0.00047014681892332787, + "loss": 0.2011, + "num_input_tokens_seen": 12466656, + "step": 5765 + }, + { + "epoch": 0.9412724306688418, + "grad_norm": 0.09713034331798553, + "learning_rate": 0.0004705546492659054, + "loss": 0.0862, + "num_input_tokens_seen": 12477696, + "step": 5770 + }, + { + "epoch": 0.9420880913539967, + "grad_norm": 0.7483925223350525, + "learning_rate": 0.0004709624796084829, + "loss": 0.2129, + "num_input_tokens_seen": 12488896, + "step": 5775 + }, + { + "epoch": 0.9429037520391517, + "grad_norm": 0.5672449469566345, + "learning_rate": 0.0004713703099510604, + "loss": 0.1227, + "num_input_tokens_seen": 12499040, + "step": 5780 + }, + { + "epoch": 0.9437194127243067, + "grad_norm": 0.6804947257041931, + "learning_rate": 0.0004717781402936379, + "loss": 0.1309, + "num_input_tokens_seen": 12509760, + "step": 5785 + }, + { + "epoch": 0.9445350734094616, + "grad_norm": 0.24877053499221802, + "learning_rate": 0.0004721859706362153, + "loss": 0.1011, + "num_input_tokens_seen": 12521440, + "step": 5790 + }, + { + "epoch": 0.9453507340946167, + "grad_norm": 0.5431973338127136, + "learning_rate": 0.00047259380097879283, + "loss": 0.2528, + "num_input_tokens_seen": 12531840, + "step": 5795 + }, + { + "epoch": 0.9461663947797716, + "grad_norm": 0.1831020563840866, + "learning_rate": 0.0004730016313213703, + "loss": 0.2248, + "num_input_tokens_seen": 12543136, + "step": 5800 + }, + { + "epoch": 0.9469820554649266, + "grad_norm": 0.33074751496315, + "learning_rate": 0.00047340946166394784, + "loss": 0.2949, + "num_input_tokens_seen": 12554240, + "step": 5805 + }, + { + "epoch": 0.9477977161500816, + "grad_norm": 0.10430339723825455, + "learning_rate": 0.0004738172920065253, + "loss": 0.1823, + "num_input_tokens_seen": 12564640, + "step": 5810 + }, + { + "epoch": 0.9486133768352365, + "grad_norm": 0.17456623911857605, + "learning_rate": 0.00047422512234910274, + "loss": 0.2085, + "num_input_tokens_seen": 12574976, + "step": 5815 + }, + { + "epoch": 0.9494290375203915, + "grad_norm": 0.15890410542488098, + "learning_rate": 0.00047463295269168027, + "loss": 0.1755, + "num_input_tokens_seen": 12585632, + "step": 5820 + }, + { + "epoch": 0.9502446982055465, + "grad_norm": 0.4678882658481598, + "learning_rate": 0.00047504078303425775, + "loss": 0.2283, + "num_input_tokens_seen": 12596096, + "step": 5825 + }, + { + "epoch": 0.9510603588907015, + "grad_norm": 0.10083203762769699, + "learning_rate": 0.0004754486133768353, + "loss": 0.0988, + "num_input_tokens_seen": 12607712, + "step": 5830 + }, + { + "epoch": 0.9518760195758564, + "grad_norm": 0.054982513189315796, + "learning_rate": 0.0004758564437194127, + "loss": 0.0766, + "num_input_tokens_seen": 12618272, + "step": 5835 + }, + { + "epoch": 0.9526916802610114, + "grad_norm": 0.08998695760965347, + "learning_rate": 0.00047626427406199023, + "loss": 0.0792, + "num_input_tokens_seen": 12629856, + "step": 5840 + }, + { + "epoch": 0.9535073409461664, + "grad_norm": 0.19011932611465454, + "learning_rate": 0.0004766721044045677, + "loss": 0.1799, + "num_input_tokens_seen": 12639456, + "step": 5845 + }, + { + "epoch": 0.9543230016313213, + "grad_norm": 0.5274903774261475, + "learning_rate": 0.0004770799347471452, + "loss": 0.1551, + "num_input_tokens_seen": 12649088, + "step": 5850 + }, + { + "epoch": 0.9551386623164764, + "grad_norm": 0.065676748752594, + "learning_rate": 0.0004774877650897227, + "loss": 0.0948, + "num_input_tokens_seen": 12659680, + "step": 5855 + }, + { + "epoch": 0.9559543230016313, + "grad_norm": 0.25573477149009705, + "learning_rate": 0.00047789559543230014, + "loss": 0.1348, + "num_input_tokens_seen": 12670848, + "step": 5860 + }, + { + "epoch": 0.9567699836867863, + "grad_norm": 0.11124901473522186, + "learning_rate": 0.00047830342577487767, + "loss": 0.0595, + "num_input_tokens_seen": 12681568, + "step": 5865 + }, + { + "epoch": 0.9575856443719413, + "grad_norm": 0.12954628467559814, + "learning_rate": 0.00047871125611745514, + "loss": 0.2106, + "num_input_tokens_seen": 12691616, + "step": 5870 + }, + { + "epoch": 0.9584013050570962, + "grad_norm": 0.1547648310661316, + "learning_rate": 0.0004791190864600327, + "loss": 0.1245, + "num_input_tokens_seen": 12701728, + "step": 5875 + }, + { + "epoch": 0.9592169657422512, + "grad_norm": 0.18160319328308105, + "learning_rate": 0.00047952691680261015, + "loss": 0.1439, + "num_input_tokens_seen": 12712480, + "step": 5880 + }, + { + "epoch": 0.9600326264274062, + "grad_norm": 0.12603481113910675, + "learning_rate": 0.00047993474714518757, + "loss": 0.2585, + "num_input_tokens_seen": 12724544, + "step": 5885 + }, + { + "epoch": 0.9608482871125612, + "grad_norm": 0.12232507020235062, + "learning_rate": 0.0004803425774877651, + "loss": 0.0721, + "num_input_tokens_seen": 12735552, + "step": 5890 + }, + { + "epoch": 0.9616639477977161, + "grad_norm": 0.8182587623596191, + "learning_rate": 0.0004807504078303426, + "loss": 0.2742, + "num_input_tokens_seen": 12747072, + "step": 5895 + }, + { + "epoch": 0.9624796084828712, + "grad_norm": 0.2686678171157837, + "learning_rate": 0.0004811582381729201, + "loss": 0.2193, + "num_input_tokens_seen": 12757504, + "step": 5900 + }, + { + "epoch": 0.9632952691680261, + "grad_norm": 0.10753358900547028, + "learning_rate": 0.00048156606851549753, + "loss": 0.1594, + "num_input_tokens_seen": 12768544, + "step": 5905 + }, + { + "epoch": 0.964110929853181, + "grad_norm": 0.24111835658550262, + "learning_rate": 0.00048197389885807506, + "loss": 0.069, + "num_input_tokens_seen": 12778464, + "step": 5910 + }, + { + "epoch": 0.9649265905383361, + "grad_norm": 0.1870940625667572, + "learning_rate": 0.00048238172920065254, + "loss": 0.1426, + "num_input_tokens_seen": 12788352, + "step": 5915 + }, + { + "epoch": 0.965742251223491, + "grad_norm": 0.08543268591165543, + "learning_rate": 0.00048278955954323, + "loss": 0.1739, + "num_input_tokens_seen": 12797344, + "step": 5920 + }, + { + "epoch": 0.966557911908646, + "grad_norm": 0.17103126645088196, + "learning_rate": 0.00048319738988580755, + "loss": 0.1457, + "num_input_tokens_seen": 12807264, + "step": 5925 + }, + { + "epoch": 0.967373572593801, + "grad_norm": 0.037070151418447495, + "learning_rate": 0.00048360522022838497, + "loss": 0.1057, + "num_input_tokens_seen": 12817408, + "step": 5930 + }, + { + "epoch": 0.968189233278956, + "grad_norm": 0.34985965490341187, + "learning_rate": 0.0004840130505709625, + "loss": 0.1548, + "num_input_tokens_seen": 12827616, + "step": 5935 + }, + { + "epoch": 0.9690048939641109, + "grad_norm": 0.42751795053482056, + "learning_rate": 0.00048442088091354, + "loss": 0.181, + "num_input_tokens_seen": 12837696, + "step": 5940 + }, + { + "epoch": 0.9698205546492659, + "grad_norm": 0.12427811324596405, + "learning_rate": 0.00048482871125611745, + "loss": 0.0716, + "num_input_tokens_seen": 12848928, + "step": 5945 + }, + { + "epoch": 0.9706362153344209, + "grad_norm": 0.15662795305252075, + "learning_rate": 0.000485236541598695, + "loss": 0.2212, + "num_input_tokens_seen": 12859264, + "step": 5950 + }, + { + "epoch": 0.9714518760195758, + "grad_norm": 0.13130423426628113, + "learning_rate": 0.0004856443719412724, + "loss": 0.1212, + "num_input_tokens_seen": 12870336, + "step": 5955 + }, + { + "epoch": 0.9722675367047309, + "grad_norm": 0.5949497222900391, + "learning_rate": 0.00048605220228384994, + "loss": 0.308, + "num_input_tokens_seen": 12881280, + "step": 5960 + }, + { + "epoch": 0.9730831973898858, + "grad_norm": 0.24126264452934265, + "learning_rate": 0.0004864600326264274, + "loss": 0.1112, + "num_input_tokens_seen": 12892544, + "step": 5965 + }, + { + "epoch": 0.9738988580750407, + "grad_norm": 0.18657752871513367, + "learning_rate": 0.00048686786296900494, + "loss": 0.1458, + "num_input_tokens_seen": 12903680, + "step": 5970 + }, + { + "epoch": 0.9747145187601958, + "grad_norm": 0.2861552834510803, + "learning_rate": 0.00048727569331158237, + "loss": 0.1987, + "num_input_tokens_seen": 12914400, + "step": 5975 + }, + { + "epoch": 0.9755301794453507, + "grad_norm": 0.11565116047859192, + "learning_rate": 0.00048768352365415984, + "loss": 0.1782, + "num_input_tokens_seen": 12925920, + "step": 5980 + }, + { + "epoch": 0.9763458401305057, + "grad_norm": 0.09785725176334381, + "learning_rate": 0.00048809135399673737, + "loss": 0.1816, + "num_input_tokens_seen": 12936640, + "step": 5985 + }, + { + "epoch": 0.9771615008156607, + "grad_norm": 0.4810200035572052, + "learning_rate": 0.0004884991843393148, + "loss": 0.177, + "num_input_tokens_seen": 12948160, + "step": 5990 + }, + { + "epoch": 0.9779771615008157, + "grad_norm": 0.16747428476810455, + "learning_rate": 0.0004889070146818923, + "loss": 0.1374, + "num_input_tokens_seen": 12959296, + "step": 5995 + }, + { + "epoch": 0.9787928221859706, + "grad_norm": 0.12989793717861176, + "learning_rate": 0.0004893148450244698, + "loss": 0.1107, + "num_input_tokens_seen": 12970624, + "step": 6000 + }, + { + "epoch": 0.9796084828711256, + "grad_norm": 0.08526536822319031, + "learning_rate": 0.0004897226753670474, + "loss": 0.2367, + "num_input_tokens_seen": 12981664, + "step": 6005 + }, + { + "epoch": 0.9804241435562806, + "grad_norm": 0.15927903354167938, + "learning_rate": 0.0004901305057096248, + "loss": 0.2274, + "num_input_tokens_seen": 12991328, + "step": 6010 + }, + { + "epoch": 0.9812398042414355, + "grad_norm": 0.15431569516658783, + "learning_rate": 0.0004905383360522022, + "loss": 0.046, + "num_input_tokens_seen": 13001632, + "step": 6015 + }, + { + "epoch": 0.9820554649265906, + "grad_norm": 0.6077148914337158, + "learning_rate": 0.0004909461663947798, + "loss": 0.1611, + "num_input_tokens_seen": 13012256, + "step": 6020 + }, + { + "epoch": 0.9828711256117455, + "grad_norm": 0.36628520488739014, + "learning_rate": 0.0004913539967373573, + "loss": 0.1385, + "num_input_tokens_seen": 13024480, + "step": 6025 + }, + { + "epoch": 0.9836867862969005, + "grad_norm": 0.052024632692337036, + "learning_rate": 0.0004917618270799348, + "loss": 0.0512, + "num_input_tokens_seen": 13034432, + "step": 6030 + }, + { + "epoch": 0.9845024469820555, + "grad_norm": 0.08394888788461685, + "learning_rate": 0.0004921696574225122, + "loss": 0.163, + "num_input_tokens_seen": 13045824, + "step": 6035 + }, + { + "epoch": 0.9853181076672104, + "grad_norm": 0.21150889992713928, + "learning_rate": 0.0004925774877650897, + "loss": 0.1086, + "num_input_tokens_seen": 13057536, + "step": 6040 + }, + { + "epoch": 0.9861337683523654, + "grad_norm": 0.09327172487974167, + "learning_rate": 0.0004929853181076672, + "loss": 0.1038, + "num_input_tokens_seen": 13069120, + "step": 6045 + }, + { + "epoch": 0.9869494290375204, + "grad_norm": 0.2453579306602478, + "learning_rate": 0.0004933931484502447, + "loss": 0.0449, + "num_input_tokens_seen": 13080192, + "step": 6050 + }, + { + "epoch": 0.9877650897226754, + "grad_norm": 0.04804931953549385, + "learning_rate": 0.0004938009787928223, + "loss": 0.1282, + "num_input_tokens_seen": 13091040, + "step": 6055 + }, + { + "epoch": 0.9885807504078303, + "grad_norm": 0.01482168585062027, + "learning_rate": 0.0004942088091353996, + "loss": 0.0451, + "num_input_tokens_seen": 13102976, + "step": 6060 + }, + { + "epoch": 0.9893964110929854, + "grad_norm": 0.13239941000938416, + "learning_rate": 0.0004946166394779772, + "loss": 0.3, + "num_input_tokens_seen": 13114688, + "step": 6065 + }, + { + "epoch": 0.9902120717781403, + "grad_norm": 0.10520962625741959, + "learning_rate": 0.0004950244698205547, + "loss": 0.0628, + "num_input_tokens_seen": 13126432, + "step": 6070 + }, + { + "epoch": 0.9910277324632952, + "grad_norm": 0.21707811951637268, + "learning_rate": 0.0004954323001631322, + "loss": 0.1958, + "num_input_tokens_seen": 13137344, + "step": 6075 + }, + { + "epoch": 0.9918433931484503, + "grad_norm": 0.11555507779121399, + "learning_rate": 0.0004958401305057096, + "loss": 0.1382, + "num_input_tokens_seen": 13148608, + "step": 6080 + }, + { + "epoch": 0.9926590538336052, + "grad_norm": 0.5580472350120544, + "learning_rate": 0.0004962479608482871, + "loss": 0.1019, + "num_input_tokens_seen": 13159840, + "step": 6085 + }, + { + "epoch": 0.9934747145187602, + "grad_norm": 0.4591805934906006, + "learning_rate": 0.0004966557911908646, + "loss": 0.1872, + "num_input_tokens_seen": 13169856, + "step": 6090 + }, + { + "epoch": 0.9942903752039152, + "grad_norm": 0.3633251190185547, + "learning_rate": 0.0004970636215334421, + "loss": 0.1258, + "num_input_tokens_seen": 13181728, + "step": 6095 + }, + { + "epoch": 0.9951060358890701, + "grad_norm": 0.22325961291790009, + "learning_rate": 0.0004974714518760197, + "loss": 0.1266, + "num_input_tokens_seen": 13193760, + "step": 6100 + }, + { + "epoch": 0.9959216965742251, + "grad_norm": 0.09583016484975815, + "learning_rate": 0.000497879282218597, + "loss": 0.0894, + "num_input_tokens_seen": 13205280, + "step": 6105 + }, + { + "epoch": 0.9967373572593801, + "grad_norm": 0.8459984064102173, + "learning_rate": 0.0004982871125611745, + "loss": 0.2232, + "num_input_tokens_seen": 13215584, + "step": 6110 + }, + { + "epoch": 0.9975530179445351, + "grad_norm": 0.24980811774730682, + "learning_rate": 0.0004986949429037521, + "loss": 0.1846, + "num_input_tokens_seen": 13226496, + "step": 6115 + }, + { + "epoch": 0.99836867862969, + "grad_norm": 0.04283153638243675, + "learning_rate": 0.0004991027732463296, + "loss": 0.1192, + "num_input_tokens_seen": 13236992, + "step": 6120 + }, + { + "epoch": 0.9991843393148451, + "grad_norm": 0.5137706995010376, + "learning_rate": 0.000499510603588907, + "loss": 0.2165, + "num_input_tokens_seen": 13247424, + "step": 6125 + }, + { + "epoch": 1.0, + "grad_norm": 0.39642074704170227, + "learning_rate": 0.0004999184339314845, + "loss": 0.1799, + "num_input_tokens_seen": 13256608, + "step": 6130 + }, + { + "epoch": 1.0, + "eval_loss": 0.1398823857307434, + "eval_runtime": 103.3977, + "eval_samples_per_second": 26.355, + "eval_steps_per_second": 6.596, + "num_input_tokens_seen": 13256608, + "step": 6130 + }, + { + "epoch": 1.000815660685155, + "grad_norm": 0.11160384863615036, + "learning_rate": 0.000500326264274062, + "loss": 0.1257, + "num_input_tokens_seen": 13268544, + "step": 6135 + }, + { + "epoch": 1.0016313213703099, + "grad_norm": 0.23075062036514282, + "learning_rate": 0.0005007340946166395, + "loss": 0.1004, + "num_input_tokens_seen": 13279712, + "step": 6140 + }, + { + "epoch": 1.002446982055465, + "grad_norm": 0.18675552308559418, + "learning_rate": 0.0005011419249592169, + "loss": 0.1524, + "num_input_tokens_seen": 13289344, + "step": 6145 + }, + { + "epoch": 1.00326264274062, + "grad_norm": 0.2517428398132324, + "learning_rate": 0.0005015497553017944, + "loss": 0.1255, + "num_input_tokens_seen": 13300032, + "step": 6150 + }, + { + "epoch": 1.004078303425775, + "grad_norm": 0.18479129672050476, + "learning_rate": 0.000501957585644372, + "loss": 0.0235, + "num_input_tokens_seen": 13310176, + "step": 6155 + }, + { + "epoch": 1.0048939641109298, + "grad_norm": 0.09900052845478058, + "learning_rate": 0.0005023654159869494, + "loss": 0.1479, + "num_input_tokens_seen": 13320256, + "step": 6160 + }, + { + "epoch": 1.0057096247960848, + "grad_norm": 0.2950863838195801, + "learning_rate": 0.000502773246329527, + "loss": 0.2501, + "num_input_tokens_seen": 13330560, + "step": 6165 + }, + { + "epoch": 1.0065252854812399, + "grad_norm": 0.0240387711673975, + "learning_rate": 0.0005031810766721044, + "loss": 0.1392, + "num_input_tokens_seen": 13341120, + "step": 6170 + }, + { + "epoch": 1.0073409461663947, + "grad_norm": 0.08495213836431503, + "learning_rate": 0.0005035889070146818, + "loss": 0.1341, + "num_input_tokens_seen": 13352416, + "step": 6175 + }, + { + "epoch": 1.0081566068515497, + "grad_norm": 0.23577626049518585, + "learning_rate": 0.0005039967373572594, + "loss": 0.148, + "num_input_tokens_seen": 13363392, + "step": 6180 + }, + { + "epoch": 1.0089722675367048, + "grad_norm": 0.10276693105697632, + "learning_rate": 0.0005044045676998369, + "loss": 0.1369, + "num_input_tokens_seen": 13373056, + "step": 6185 + }, + { + "epoch": 1.0097879282218598, + "grad_norm": 0.08981849253177643, + "learning_rate": 0.0005048123980424144, + "loss": 0.1007, + "num_input_tokens_seen": 13384288, + "step": 6190 + }, + { + "epoch": 1.0106035889070146, + "grad_norm": 0.1138916164636612, + "learning_rate": 0.0005052202283849918, + "loss": 0.2059, + "num_input_tokens_seen": 13394208, + "step": 6195 + }, + { + "epoch": 1.0114192495921697, + "grad_norm": 0.1942625790834427, + "learning_rate": 0.0005056280587275693, + "loss": 0.0768, + "num_input_tokens_seen": 13404576, + "step": 6200 + }, + { + "epoch": 1.0122349102773247, + "grad_norm": 0.09755151718854904, + "learning_rate": 0.0005060358890701469, + "loss": 0.1849, + "num_input_tokens_seen": 13415296, + "step": 6205 + }, + { + "epoch": 1.0130505709624795, + "grad_norm": 0.06156241148710251, + "learning_rate": 0.0005064437194127242, + "loss": 0.1902, + "num_input_tokens_seen": 13426112, + "step": 6210 + }, + { + "epoch": 1.0138662316476346, + "grad_norm": 0.10398396849632263, + "learning_rate": 0.0005068515497553018, + "loss": 0.2426, + "num_input_tokens_seen": 13436416, + "step": 6215 + }, + { + "epoch": 1.0146818923327896, + "grad_norm": 0.20540976524353027, + "learning_rate": 0.0005072593800978793, + "loss": 0.2627, + "num_input_tokens_seen": 13446624, + "step": 6220 + }, + { + "epoch": 1.0154975530179446, + "grad_norm": 0.15334442257881165, + "learning_rate": 0.0005076672104404568, + "loss": 0.1482, + "num_input_tokens_seen": 13457440, + "step": 6225 + }, + { + "epoch": 1.0163132137030995, + "grad_norm": 0.34090787172317505, + "learning_rate": 0.0005080750407830343, + "loss": 0.1779, + "num_input_tokens_seen": 13468800, + "step": 6230 + }, + { + "epoch": 1.0171288743882545, + "grad_norm": 0.15002760291099548, + "learning_rate": 0.0005084828711256117, + "loss": 0.0614, + "num_input_tokens_seen": 13480512, + "step": 6235 + }, + { + "epoch": 1.0179445350734095, + "grad_norm": 0.2431073635816574, + "learning_rate": 0.0005088907014681893, + "loss": 0.133, + "num_input_tokens_seen": 13491232, + "step": 6240 + }, + { + "epoch": 1.0187601957585644, + "grad_norm": 0.14370810985565186, + "learning_rate": 0.0005092985318107667, + "loss": 0.0539, + "num_input_tokens_seen": 13501920, + "step": 6245 + }, + { + "epoch": 1.0195758564437194, + "grad_norm": 0.07355213165283203, + "learning_rate": 0.0005097063621533442, + "loss": 0.0994, + "num_input_tokens_seen": 13510016, + "step": 6250 + }, + { + "epoch": 1.0203915171288744, + "grad_norm": 0.2566724419593811, + "learning_rate": 0.0005101141924959218, + "loss": 0.2044, + "num_input_tokens_seen": 13521280, + "step": 6255 + }, + { + "epoch": 1.0212071778140293, + "grad_norm": 0.08346903324127197, + "learning_rate": 0.0005105220228384992, + "loss": 0.1983, + "num_input_tokens_seen": 13532032, + "step": 6260 + }, + { + "epoch": 1.0220228384991843, + "grad_norm": 0.28570470213890076, + "learning_rate": 0.0005109298531810767, + "loss": 0.1358, + "num_input_tokens_seen": 13541568, + "step": 6265 + }, + { + "epoch": 1.0228384991843393, + "grad_norm": 0.41711992025375366, + "learning_rate": 0.0005113376835236542, + "loss": 0.1472, + "num_input_tokens_seen": 13552064, + "step": 6270 + }, + { + "epoch": 1.0236541598694944, + "grad_norm": 0.10956079512834549, + "learning_rate": 0.0005117455138662317, + "loss": 0.1441, + "num_input_tokens_seen": 13560928, + "step": 6275 + }, + { + "epoch": 1.0244698205546492, + "grad_norm": 0.13065916299819946, + "learning_rate": 0.0005121533442088091, + "loss": 0.0472, + "num_input_tokens_seen": 13572096, + "step": 6280 + }, + { + "epoch": 1.0252854812398042, + "grad_norm": 0.5665595531463623, + "learning_rate": 0.0005125611745513866, + "loss": 0.1187, + "num_input_tokens_seen": 13584448, + "step": 6285 + }, + { + "epoch": 1.0261011419249593, + "grad_norm": 0.18895985186100006, + "learning_rate": 0.0005129690048939642, + "loss": 0.1032, + "num_input_tokens_seen": 13596352, + "step": 6290 + }, + { + "epoch": 1.026916802610114, + "grad_norm": 0.1723942905664444, + "learning_rate": 0.0005133768352365417, + "loss": 0.1629, + "num_input_tokens_seen": 13606688, + "step": 6295 + }, + { + "epoch": 1.0277324632952691, + "grad_norm": 0.5321813821792603, + "learning_rate": 0.000513784665579119, + "loss": 0.1551, + "num_input_tokens_seen": 13618400, + "step": 6300 + }, + { + "epoch": 1.0285481239804242, + "grad_norm": 0.014531032182276249, + "learning_rate": 0.0005141924959216966, + "loss": 0.0871, + "num_input_tokens_seen": 13629376, + "step": 6305 + }, + { + "epoch": 1.0293637846655792, + "grad_norm": 0.11209902167320251, + "learning_rate": 0.0005146003262642741, + "loss": 0.0521, + "num_input_tokens_seen": 13640896, + "step": 6310 + }, + { + "epoch": 1.030179445350734, + "grad_norm": 0.2015608698129654, + "learning_rate": 0.0005150081566068515, + "loss": 0.143, + "num_input_tokens_seen": 13652128, + "step": 6315 + }, + { + "epoch": 1.030995106035889, + "grad_norm": 0.09540046751499176, + "learning_rate": 0.000515415986949429, + "loss": 0.019, + "num_input_tokens_seen": 13662048, + "step": 6320 + }, + { + "epoch": 1.031810766721044, + "grad_norm": 0.579704225063324, + "learning_rate": 0.0005158238172920065, + "loss": 0.1019, + "num_input_tokens_seen": 13672160, + "step": 6325 + }, + { + "epoch": 1.032626427406199, + "grad_norm": 0.1127452477812767, + "learning_rate": 0.0005162316476345841, + "loss": 0.0413, + "num_input_tokens_seen": 13682400, + "step": 6330 + }, + { + "epoch": 1.033442088091354, + "grad_norm": 0.6191928386688232, + "learning_rate": 0.0005166394779771615, + "loss": 0.3745, + "num_input_tokens_seen": 13692608, + "step": 6335 + }, + { + "epoch": 1.034257748776509, + "grad_norm": 0.23995548486709595, + "learning_rate": 0.000517047308319739, + "loss": 0.2638, + "num_input_tokens_seen": 13703008, + "step": 6340 + }, + { + "epoch": 1.035073409461664, + "grad_norm": 0.05872446298599243, + "learning_rate": 0.0005174551386623165, + "loss": 0.1718, + "num_input_tokens_seen": 13714496, + "step": 6345 + }, + { + "epoch": 1.0358890701468189, + "grad_norm": 0.1604536920785904, + "learning_rate": 0.0005178629690048939, + "loss": 0.1295, + "num_input_tokens_seen": 13725120, + "step": 6350 + }, + { + "epoch": 1.036704730831974, + "grad_norm": 0.1121409609913826, + "learning_rate": 0.0005182707993474715, + "loss": 0.049, + "num_input_tokens_seen": 13734560, + "step": 6355 + }, + { + "epoch": 1.037520391517129, + "grad_norm": 0.1634875386953354, + "learning_rate": 0.000518678629690049, + "loss": 0.1074, + "num_input_tokens_seen": 13745248, + "step": 6360 + }, + { + "epoch": 1.0383360522022838, + "grad_norm": 0.20854128897190094, + "learning_rate": 0.0005190864600326263, + "loss": 0.1673, + "num_input_tokens_seen": 13755584, + "step": 6365 + }, + { + "epoch": 1.0391517128874388, + "grad_norm": 0.08223803341388702, + "learning_rate": 0.0005194942903752039, + "loss": 0.0784, + "num_input_tokens_seen": 13767456, + "step": 6370 + }, + { + "epoch": 1.0399673735725938, + "grad_norm": 0.3701033294200897, + "learning_rate": 0.0005199021207177814, + "loss": 0.2891, + "num_input_tokens_seen": 13778176, + "step": 6375 + }, + { + "epoch": 1.0407830342577489, + "grad_norm": 0.042385704815387726, + "learning_rate": 0.000520309951060359, + "loss": 0.0874, + "num_input_tokens_seen": 13788576, + "step": 6380 + }, + { + "epoch": 1.0415986949429037, + "grad_norm": 0.04089699313044548, + "learning_rate": 0.0005207177814029364, + "loss": 0.1409, + "num_input_tokens_seen": 13798432, + "step": 6385 + }, + { + "epoch": 1.0424143556280587, + "grad_norm": 0.1607908308506012, + "learning_rate": 0.0005211256117455138, + "loss": 0.1119, + "num_input_tokens_seen": 13809408, + "step": 6390 + }, + { + "epoch": 1.0432300163132138, + "grad_norm": 0.2969203591346741, + "learning_rate": 0.0005215334420880914, + "loss": 0.1187, + "num_input_tokens_seen": 13821152, + "step": 6395 + }, + { + "epoch": 1.0440456769983686, + "grad_norm": 0.41446980834007263, + "learning_rate": 0.0005219412724306688, + "loss": 0.254, + "num_input_tokens_seen": 13832000, + "step": 6400 + }, + { + "epoch": 1.0448613376835236, + "grad_norm": 0.1455860584974289, + "learning_rate": 0.0005223491027732464, + "loss": 0.1489, + "num_input_tokens_seen": 13842208, + "step": 6405 + }, + { + "epoch": 1.0456769983686787, + "grad_norm": 0.3523208796977997, + "learning_rate": 0.0005227569331158238, + "loss": 0.1161, + "num_input_tokens_seen": 13852256, + "step": 6410 + }, + { + "epoch": 1.0464926590538337, + "grad_norm": 0.14199502766132355, + "learning_rate": 0.0005231647634584013, + "loss": 0.0589, + "num_input_tokens_seen": 13864064, + "step": 6415 + }, + { + "epoch": 1.0473083197389885, + "grad_norm": 0.09805894643068314, + "learning_rate": 0.0005235725938009788, + "loss": 0.1358, + "num_input_tokens_seen": 13874368, + "step": 6420 + }, + { + "epoch": 1.0481239804241436, + "grad_norm": 0.2984837293624878, + "learning_rate": 0.0005239804241435563, + "loss": 0.0741, + "num_input_tokens_seen": 13885984, + "step": 6425 + }, + { + "epoch": 1.0489396411092986, + "grad_norm": 0.03853955864906311, + "learning_rate": 0.0005243882544861339, + "loss": 0.0288, + "num_input_tokens_seen": 13896000, + "step": 6430 + }, + { + "epoch": 1.0497553017944534, + "grad_norm": 0.010538961738348007, + "learning_rate": 0.0005247960848287112, + "loss": 0.2701, + "num_input_tokens_seen": 13906848, + "step": 6435 + }, + { + "epoch": 1.0505709624796085, + "grad_norm": 0.04958980530500412, + "learning_rate": 0.0005252039151712887, + "loss": 0.1826, + "num_input_tokens_seen": 13916864, + "step": 6440 + }, + { + "epoch": 1.0513866231647635, + "grad_norm": 0.10969033092260361, + "learning_rate": 0.0005256117455138663, + "loss": 0.1165, + "num_input_tokens_seen": 13926592, + "step": 6445 + }, + { + "epoch": 1.0522022838499185, + "grad_norm": 0.08144375681877136, + "learning_rate": 0.0005260195758564438, + "loss": 0.0327, + "num_input_tokens_seen": 13937120, + "step": 6450 + }, + { + "epoch": 1.0530179445350734, + "grad_norm": 0.399844229221344, + "learning_rate": 0.0005264274061990211, + "loss": 0.1252, + "num_input_tokens_seen": 13949120, + "step": 6455 + }, + { + "epoch": 1.0538336052202284, + "grad_norm": 0.28734290599823, + "learning_rate": 0.0005268352365415987, + "loss": 0.0848, + "num_input_tokens_seen": 13959264, + "step": 6460 + }, + { + "epoch": 1.0546492659053834, + "grad_norm": 0.4673934876918793, + "learning_rate": 0.0005272430668841762, + "loss": 0.1745, + "num_input_tokens_seen": 13970304, + "step": 6465 + }, + { + "epoch": 1.0554649265905383, + "grad_norm": 0.15282650291919708, + "learning_rate": 0.0005276508972267537, + "loss": 0.1067, + "num_input_tokens_seen": 13980800, + "step": 6470 + }, + { + "epoch": 1.0562805872756933, + "grad_norm": 0.11581412702798843, + "learning_rate": 0.0005280587275693311, + "loss": 0.0991, + "num_input_tokens_seen": 13991040, + "step": 6475 + }, + { + "epoch": 1.0570962479608483, + "grad_norm": 0.023843001574277878, + "learning_rate": 0.0005284665579119086, + "loss": 0.1086, + "num_input_tokens_seen": 14001216, + "step": 6480 + }, + { + "epoch": 1.0579119086460032, + "grad_norm": 0.027091102674603462, + "learning_rate": 0.0005288743882544862, + "loss": 0.0865, + "num_input_tokens_seen": 14012672, + "step": 6485 + }, + { + "epoch": 1.0587275693311582, + "grad_norm": 0.051208313554525375, + "learning_rate": 0.0005292822185970636, + "loss": 0.119, + "num_input_tokens_seen": 14023872, + "step": 6490 + }, + { + "epoch": 1.0595432300163132, + "grad_norm": 0.061948299407958984, + "learning_rate": 0.0005296900489396412, + "loss": 0.0983, + "num_input_tokens_seen": 14035136, + "step": 6495 + }, + { + "epoch": 1.0603588907014683, + "grad_norm": 0.5826138257980347, + "learning_rate": 0.0005300978792822186, + "loss": 0.1936, + "num_input_tokens_seen": 14046240, + "step": 6500 + }, + { + "epoch": 1.061174551386623, + "grad_norm": 0.025136886164546013, + "learning_rate": 0.000530505709624796, + "loss": 0.0611, + "num_input_tokens_seen": 14056512, + "step": 6505 + }, + { + "epoch": 1.0619902120717781, + "grad_norm": 0.3028709888458252, + "learning_rate": 0.0005309135399673736, + "loss": 0.1867, + "num_input_tokens_seen": 14068000, + "step": 6510 + }, + { + "epoch": 1.0628058727569332, + "grad_norm": 0.04569149389863014, + "learning_rate": 0.0005313213703099511, + "loss": 0.1099, + "num_input_tokens_seen": 14078336, + "step": 6515 + }, + { + "epoch": 1.0636215334420882, + "grad_norm": 0.48968860507011414, + "learning_rate": 0.0005317292006525287, + "loss": 0.1447, + "num_input_tokens_seen": 14089632, + "step": 6520 + }, + { + "epoch": 1.064437194127243, + "grad_norm": 0.06684679538011551, + "learning_rate": 0.000532137030995106, + "loss": 0.1157, + "num_input_tokens_seen": 14099104, + "step": 6525 + }, + { + "epoch": 1.065252854812398, + "grad_norm": 0.5205869078636169, + "learning_rate": 0.0005325448613376835, + "loss": 0.1275, + "num_input_tokens_seen": 14109408, + "step": 6530 + }, + { + "epoch": 1.066068515497553, + "grad_norm": 0.07348957657814026, + "learning_rate": 0.0005329526916802611, + "loss": 0.2033, + "num_input_tokens_seen": 14119616, + "step": 6535 + }, + { + "epoch": 1.066884176182708, + "grad_norm": 0.12224670499563217, + "learning_rate": 0.0005333605220228385, + "loss": 0.0612, + "num_input_tokens_seen": 14130112, + "step": 6540 + }, + { + "epoch": 1.067699836867863, + "grad_norm": 0.03272275999188423, + "learning_rate": 0.000533768352365416, + "loss": 0.0502, + "num_input_tokens_seen": 14140672, + "step": 6545 + }, + { + "epoch": 1.068515497553018, + "grad_norm": 0.4673844277858734, + "learning_rate": 0.0005341761827079935, + "loss": 0.1442, + "num_input_tokens_seen": 14152128, + "step": 6550 + }, + { + "epoch": 1.0693311582381728, + "grad_norm": 0.12728549540042877, + "learning_rate": 0.000534584013050571, + "loss": 0.0641, + "num_input_tokens_seen": 14164096, + "step": 6555 + }, + { + "epoch": 1.0701468189233279, + "grad_norm": 0.48025938868522644, + "learning_rate": 0.0005349918433931485, + "loss": 0.1081, + "num_input_tokens_seen": 14174624, + "step": 6560 + }, + { + "epoch": 1.070962479608483, + "grad_norm": 0.06705355644226074, + "learning_rate": 0.0005353996737357259, + "loss": 0.1736, + "num_input_tokens_seen": 14186784, + "step": 6565 + }, + { + "epoch": 1.071778140293638, + "grad_norm": 0.043406713753938675, + "learning_rate": 0.0005358075040783035, + "loss": 0.0601, + "num_input_tokens_seen": 14196064, + "step": 6570 + }, + { + "epoch": 1.0725938009787928, + "grad_norm": 0.3859328031539917, + "learning_rate": 0.0005362153344208809, + "loss": 0.0934, + "num_input_tokens_seen": 14207232, + "step": 6575 + }, + { + "epoch": 1.0734094616639478, + "grad_norm": 0.015095439739525318, + "learning_rate": 0.0005366231647634584, + "loss": 0.0868, + "num_input_tokens_seen": 14217920, + "step": 6580 + }, + { + "epoch": 1.0742251223491028, + "grad_norm": 0.8526964783668518, + "learning_rate": 0.000537030995106036, + "loss": 0.2762, + "num_input_tokens_seen": 14228896, + "step": 6585 + }, + { + "epoch": 1.0750407830342577, + "grad_norm": 0.045150063931941986, + "learning_rate": 0.0005374388254486133, + "loss": 0.2702, + "num_input_tokens_seen": 14239712, + "step": 6590 + }, + { + "epoch": 1.0758564437194127, + "grad_norm": 0.4150654077529907, + "learning_rate": 0.0005378466557911908, + "loss": 0.1477, + "num_input_tokens_seen": 14249760, + "step": 6595 + }, + { + "epoch": 1.0766721044045677, + "grad_norm": 0.23948174715042114, + "learning_rate": 0.0005382544861337684, + "loss": 0.1037, + "num_input_tokens_seen": 14259552, + "step": 6600 + }, + { + "epoch": 1.0774877650897228, + "grad_norm": 0.2983197271823883, + "learning_rate": 0.0005386623164763459, + "loss": 0.1766, + "num_input_tokens_seen": 14270880, + "step": 6605 + }, + { + "epoch": 1.0783034257748776, + "grad_norm": 0.08725722879171371, + "learning_rate": 0.0005390701468189233, + "loss": 0.0909, + "num_input_tokens_seen": 14281856, + "step": 6610 + }, + { + "epoch": 1.0791190864600326, + "grad_norm": 0.17761683464050293, + "learning_rate": 0.0005394779771615008, + "loss": 0.133, + "num_input_tokens_seen": 14293536, + "step": 6615 + }, + { + "epoch": 1.0799347471451877, + "grad_norm": 0.4125170111656189, + "learning_rate": 0.0005398858075040783, + "loss": 0.1336, + "num_input_tokens_seen": 14303968, + "step": 6620 + }, + { + "epoch": 1.0807504078303425, + "grad_norm": 0.1900666207075119, + "learning_rate": 0.0005402936378466558, + "loss": 0.1574, + "num_input_tokens_seen": 14315296, + "step": 6625 + }, + { + "epoch": 1.0815660685154975, + "grad_norm": 0.28400659561157227, + "learning_rate": 0.0005407014681892332, + "loss": 0.1111, + "num_input_tokens_seen": 14326528, + "step": 6630 + }, + { + "epoch": 1.0823817292006526, + "grad_norm": 0.04672158136963844, + "learning_rate": 0.0005411092985318108, + "loss": 0.0592, + "num_input_tokens_seen": 14337248, + "step": 6635 + }, + { + "epoch": 1.0831973898858076, + "grad_norm": 0.4267369508743286, + "learning_rate": 0.0005415171288743883, + "loss": 0.0632, + "num_input_tokens_seen": 14347392, + "step": 6640 + }, + { + "epoch": 1.0840130505709624, + "grad_norm": 0.14387011528015137, + "learning_rate": 0.0005419249592169657, + "loss": 0.032, + "num_input_tokens_seen": 14359008, + "step": 6645 + }, + { + "epoch": 1.0848287112561175, + "grad_norm": 0.2627013325691223, + "learning_rate": 0.0005423327895595433, + "loss": 0.1407, + "num_input_tokens_seen": 14370464, + "step": 6650 + }, + { + "epoch": 1.0856443719412725, + "grad_norm": 0.7487311363220215, + "learning_rate": 0.0005427406199021207, + "loss": 0.1933, + "num_input_tokens_seen": 14382304, + "step": 6655 + }, + { + "epoch": 1.0864600326264273, + "grad_norm": 0.2547962963581085, + "learning_rate": 0.0005431484502446982, + "loss": 0.1144, + "num_input_tokens_seen": 14392288, + "step": 6660 + }, + { + "epoch": 1.0872756933115824, + "grad_norm": 0.5006257891654968, + "learning_rate": 0.0005435562805872757, + "loss": 0.117, + "num_input_tokens_seen": 14403200, + "step": 6665 + }, + { + "epoch": 1.0880913539967374, + "grad_norm": 0.10363472998142242, + "learning_rate": 0.0005439641109298532, + "loss": 0.253, + "num_input_tokens_seen": 14414944, + "step": 6670 + }, + { + "epoch": 1.0889070146818924, + "grad_norm": 0.7664408087730408, + "learning_rate": 0.0005443719412724307, + "loss": 0.1454, + "num_input_tokens_seen": 14426336, + "step": 6675 + }, + { + "epoch": 1.0897226753670473, + "grad_norm": 0.11776512116193771, + "learning_rate": 0.0005447797716150081, + "loss": 0.0555, + "num_input_tokens_seen": 14437760, + "step": 6680 + }, + { + "epoch": 1.0905383360522023, + "grad_norm": 0.13282710313796997, + "learning_rate": 0.0005451876019575857, + "loss": 0.1458, + "num_input_tokens_seen": 14449120, + "step": 6685 + }, + { + "epoch": 1.0913539967373573, + "grad_norm": 0.3319006562232971, + "learning_rate": 0.0005455954323001632, + "loss": 0.1105, + "num_input_tokens_seen": 14460544, + "step": 6690 + }, + { + "epoch": 1.0921696574225122, + "grad_norm": 0.24569126963615417, + "learning_rate": 0.0005460032626427405, + "loss": 0.1459, + "num_input_tokens_seen": 14472608, + "step": 6695 + }, + { + "epoch": 1.0929853181076672, + "grad_norm": 0.12690795958042145, + "learning_rate": 0.0005464110929853181, + "loss": 0.111, + "num_input_tokens_seen": 14482176, + "step": 6700 + }, + { + "epoch": 1.0938009787928222, + "grad_norm": 0.24353094398975372, + "learning_rate": 0.0005468189233278956, + "loss": 0.0743, + "num_input_tokens_seen": 14492768, + "step": 6705 + }, + { + "epoch": 1.094616639477977, + "grad_norm": 0.3623286187648773, + "learning_rate": 0.0005472267536704732, + "loss": 0.0524, + "num_input_tokens_seen": 14505024, + "step": 6710 + }, + { + "epoch": 1.095432300163132, + "grad_norm": 0.5367739200592041, + "learning_rate": 0.0005476345840130506, + "loss": 0.0445, + "num_input_tokens_seen": 14515744, + "step": 6715 + }, + { + "epoch": 1.0962479608482871, + "grad_norm": 0.2832864820957184, + "learning_rate": 0.000548042414355628, + "loss": 0.0292, + "num_input_tokens_seen": 14526784, + "step": 6720 + }, + { + "epoch": 1.0970636215334422, + "grad_norm": 0.2894558906555176, + "learning_rate": 0.0005484502446982056, + "loss": 0.0782, + "num_input_tokens_seen": 14536960, + "step": 6725 + }, + { + "epoch": 1.097879282218597, + "grad_norm": 0.07219669222831726, + "learning_rate": 0.000548858075040783, + "loss": 0.0159, + "num_input_tokens_seen": 14549344, + "step": 6730 + }, + { + "epoch": 1.098694942903752, + "grad_norm": 0.04819165915250778, + "learning_rate": 0.0005492659053833605, + "loss": 0.0529, + "num_input_tokens_seen": 14560224, + "step": 6735 + }, + { + "epoch": 1.099510603588907, + "grad_norm": 0.08893704414367676, + "learning_rate": 0.000549673735725938, + "loss": 0.288, + "num_input_tokens_seen": 14569440, + "step": 6740 + }, + { + "epoch": 1.100326264274062, + "grad_norm": 0.5870652794837952, + "learning_rate": 0.0005500815660685155, + "loss": 0.1119, + "num_input_tokens_seen": 14579648, + "step": 6745 + }, + { + "epoch": 1.101141924959217, + "grad_norm": 0.03373727947473526, + "learning_rate": 0.000550489396411093, + "loss": 0.0401, + "num_input_tokens_seen": 14590528, + "step": 6750 + }, + { + "epoch": 1.101957585644372, + "grad_norm": 0.10977505147457123, + "learning_rate": 0.0005508972267536705, + "loss": 0.1862, + "num_input_tokens_seen": 14600864, + "step": 6755 + }, + { + "epoch": 1.102773246329527, + "grad_norm": 1.0331140756607056, + "learning_rate": 0.000551305057096248, + "loss": 0.186, + "num_input_tokens_seen": 14612096, + "step": 6760 + }, + { + "epoch": 1.1035889070146818, + "grad_norm": 0.3095877170562744, + "learning_rate": 0.0005517128874388254, + "loss": 0.059, + "num_input_tokens_seen": 14623392, + "step": 6765 + }, + { + "epoch": 1.1044045676998369, + "grad_norm": 0.19286680221557617, + "learning_rate": 0.0005521207177814029, + "loss": 0.0896, + "num_input_tokens_seen": 14634016, + "step": 6770 + }, + { + "epoch": 1.105220228384992, + "grad_norm": 0.22141914069652557, + "learning_rate": 0.0005525285481239805, + "loss": 0.2321, + "num_input_tokens_seen": 14645216, + "step": 6775 + }, + { + "epoch": 1.1060358890701467, + "grad_norm": 0.48826268315315247, + "learning_rate": 0.000552936378466558, + "loss": 0.2204, + "num_input_tokens_seen": 14656576, + "step": 6780 + }, + { + "epoch": 1.1068515497553018, + "grad_norm": 0.13619111478328705, + "learning_rate": 0.0005533442088091353, + "loss": 0.0632, + "num_input_tokens_seen": 14667168, + "step": 6785 + }, + { + "epoch": 1.1076672104404568, + "grad_norm": 0.5033240914344788, + "learning_rate": 0.0005537520391517129, + "loss": 0.199, + "num_input_tokens_seen": 14678016, + "step": 6790 + }, + { + "epoch": 1.1084828711256118, + "grad_norm": 0.08465439826250076, + "learning_rate": 0.0005541598694942904, + "loss": 0.0638, + "num_input_tokens_seen": 14688960, + "step": 6795 + }, + { + "epoch": 1.1092985318107667, + "grad_norm": 0.2793862521648407, + "learning_rate": 0.0005545676998368679, + "loss": 0.1144, + "num_input_tokens_seen": 14700480, + "step": 6800 + }, + { + "epoch": 1.1101141924959217, + "grad_norm": 0.4719338119029999, + "learning_rate": 0.0005549755301794454, + "loss": 0.1082, + "num_input_tokens_seen": 14711104, + "step": 6805 + }, + { + "epoch": 1.1109298531810767, + "grad_norm": 0.12284115701913834, + "learning_rate": 0.0005553833605220228, + "loss": 0.0624, + "num_input_tokens_seen": 14722176, + "step": 6810 + }, + { + "epoch": 1.1117455138662315, + "grad_norm": 0.1608635038137436, + "learning_rate": 0.0005557911908646003, + "loss": 0.2192, + "num_input_tokens_seen": 14733600, + "step": 6815 + }, + { + "epoch": 1.1125611745513866, + "grad_norm": 0.1508731096982956, + "learning_rate": 0.0005561990212071778, + "loss": 0.1064, + "num_input_tokens_seen": 14744352, + "step": 6820 + }, + { + "epoch": 1.1133768352365416, + "grad_norm": 0.2695777714252472, + "learning_rate": 0.0005566068515497554, + "loss": 0.1752, + "num_input_tokens_seen": 14755040, + "step": 6825 + }, + { + "epoch": 1.1141924959216967, + "grad_norm": 0.39584463834762573, + "learning_rate": 0.0005570146818923328, + "loss": 0.1059, + "num_input_tokens_seen": 14766112, + "step": 6830 + }, + { + "epoch": 1.1150081566068515, + "grad_norm": 0.11579836159944534, + "learning_rate": 0.0005574225122349102, + "loss": 0.0326, + "num_input_tokens_seen": 14776192, + "step": 6835 + }, + { + "epoch": 1.1158238172920065, + "grad_norm": 0.013752263970673084, + "learning_rate": 0.0005578303425774878, + "loss": 0.0487, + "num_input_tokens_seen": 14786240, + "step": 6840 + }, + { + "epoch": 1.1166394779771616, + "grad_norm": 0.7901402115821838, + "learning_rate": 0.0005582381729200653, + "loss": 0.1736, + "num_input_tokens_seen": 14798048, + "step": 6845 + }, + { + "epoch": 1.1174551386623164, + "grad_norm": 0.37859535217285156, + "learning_rate": 0.0005586460032626428, + "loss": 0.2425, + "num_input_tokens_seen": 14808928, + "step": 6850 + }, + { + "epoch": 1.1182707993474714, + "grad_norm": 0.21328096091747284, + "learning_rate": 0.0005590538336052202, + "loss": 0.1267, + "num_input_tokens_seen": 14820384, + "step": 6855 + }, + { + "epoch": 1.1190864600326265, + "grad_norm": 0.11841249465942383, + "learning_rate": 0.0005594616639477977, + "loss": 0.0939, + "num_input_tokens_seen": 14831168, + "step": 6860 + }, + { + "epoch": 1.1199021207177815, + "grad_norm": 0.5339210629463196, + "learning_rate": 0.0005598694942903753, + "loss": 0.3598, + "num_input_tokens_seen": 14842112, + "step": 6865 + }, + { + "epoch": 1.1207177814029363, + "grad_norm": 0.2543666958808899, + "learning_rate": 0.0005602773246329527, + "loss": 0.1049, + "num_input_tokens_seen": 14852096, + "step": 6870 + }, + { + "epoch": 1.1215334420880914, + "grad_norm": 0.06985493749380112, + "learning_rate": 0.0005606851549755301, + "loss": 0.131, + "num_input_tokens_seen": 14862240, + "step": 6875 + }, + { + "epoch": 1.1223491027732464, + "grad_norm": 0.1255275011062622, + "learning_rate": 0.0005610929853181077, + "loss": 0.0738, + "num_input_tokens_seen": 14872448, + "step": 6880 + }, + { + "epoch": 1.1231647634584012, + "grad_norm": 0.06869206577539444, + "learning_rate": 0.0005615008156606851, + "loss": 0.1663, + "num_input_tokens_seen": 14884000, + "step": 6885 + }, + { + "epoch": 1.1239804241435563, + "grad_norm": 0.20043504238128662, + "learning_rate": 0.0005619086460032627, + "loss": 0.2337, + "num_input_tokens_seen": 14895552, + "step": 6890 + }, + { + "epoch": 1.1247960848287113, + "grad_norm": 0.1891249418258667, + "learning_rate": 0.0005623164763458401, + "loss": 0.0987, + "num_input_tokens_seen": 14904896, + "step": 6895 + }, + { + "epoch": 1.1256117455138663, + "grad_norm": 0.05826778709888458, + "learning_rate": 0.0005627243066884176, + "loss": 0.1315, + "num_input_tokens_seen": 14917184, + "step": 6900 + }, + { + "epoch": 1.1264274061990212, + "grad_norm": 0.15955139696598053, + "learning_rate": 0.0005631321370309951, + "loss": 0.1136, + "num_input_tokens_seen": 14928064, + "step": 6905 + }, + { + "epoch": 1.1272430668841762, + "grad_norm": 0.19315731525421143, + "learning_rate": 0.0005635399673735726, + "loss": 0.0669, + "num_input_tokens_seen": 14939008, + "step": 6910 + }, + { + "epoch": 1.1280587275693312, + "grad_norm": 0.24480724334716797, + "learning_rate": 0.0005639477977161502, + "loss": 0.1961, + "num_input_tokens_seen": 14950464, + "step": 6915 + }, + { + "epoch": 1.128874388254486, + "grad_norm": 0.4513859450817108, + "learning_rate": 0.0005643556280587275, + "loss": 0.1275, + "num_input_tokens_seen": 14960736, + "step": 6920 + }, + { + "epoch": 1.129690048939641, + "grad_norm": 0.4298337399959564, + "learning_rate": 0.000564763458401305, + "loss": 0.2811, + "num_input_tokens_seen": 14972128, + "step": 6925 + }, + { + "epoch": 1.1305057096247961, + "grad_norm": 0.20439466834068298, + "learning_rate": 0.0005651712887438826, + "loss": 0.1381, + "num_input_tokens_seen": 14982400, + "step": 6930 + }, + { + "epoch": 1.131321370309951, + "grad_norm": 0.25064659118652344, + "learning_rate": 0.0005655791190864601, + "loss": 0.1285, + "num_input_tokens_seen": 14992320, + "step": 6935 + }, + { + "epoch": 1.132137030995106, + "grad_norm": 0.42060086131095886, + "learning_rate": 0.0005659869494290375, + "loss": 0.2396, + "num_input_tokens_seen": 15003520, + "step": 6940 + }, + { + "epoch": 1.132952691680261, + "grad_norm": 0.044980768114328384, + "learning_rate": 0.000566394779771615, + "loss": 0.0594, + "num_input_tokens_seen": 15014656, + "step": 6945 + }, + { + "epoch": 1.133768352365416, + "grad_norm": 0.07436365634202957, + "learning_rate": 0.0005668026101141925, + "loss": 0.0619, + "num_input_tokens_seen": 15025408, + "step": 6950 + }, + { + "epoch": 1.1345840130505709, + "grad_norm": 0.47422927618026733, + "learning_rate": 0.00056721044045677, + "loss": 0.1253, + "num_input_tokens_seen": 15036672, + "step": 6955 + }, + { + "epoch": 1.135399673735726, + "grad_norm": 0.29630041122436523, + "learning_rate": 0.0005676182707993474, + "loss": 0.1597, + "num_input_tokens_seen": 15047136, + "step": 6960 + }, + { + "epoch": 1.136215334420881, + "grad_norm": 0.16470947861671448, + "learning_rate": 0.000568026101141925, + "loss": 0.134, + "num_input_tokens_seen": 15058976, + "step": 6965 + }, + { + "epoch": 1.137030995106036, + "grad_norm": 0.28484222292900085, + "learning_rate": 0.0005684339314845025, + "loss": 0.1313, + "num_input_tokens_seen": 15069312, + "step": 6970 + }, + { + "epoch": 1.1378466557911908, + "grad_norm": 0.03445442393422127, + "learning_rate": 0.0005688417618270799, + "loss": 0.0632, + "num_input_tokens_seen": 15080832, + "step": 6975 + }, + { + "epoch": 1.1386623164763459, + "grad_norm": 0.05196113511919975, + "learning_rate": 0.0005692495921696575, + "loss": 0.0529, + "num_input_tokens_seen": 15092288, + "step": 6980 + }, + { + "epoch": 1.139477977161501, + "grad_norm": 0.0929301381111145, + "learning_rate": 0.0005696574225122349, + "loss": 0.1277, + "num_input_tokens_seen": 15102432, + "step": 6985 + }, + { + "epoch": 1.1402936378466557, + "grad_norm": 0.11680106818675995, + "learning_rate": 0.0005700652528548124, + "loss": 0.1757, + "num_input_tokens_seen": 15112256, + "step": 6990 + }, + { + "epoch": 1.1411092985318108, + "grad_norm": 0.19014939665794373, + "learning_rate": 0.0005704730831973899, + "loss": 0.2152, + "num_input_tokens_seen": 15123808, + "step": 6995 + }, + { + "epoch": 1.1419249592169658, + "grad_norm": 0.1377144306898117, + "learning_rate": 0.0005708809135399674, + "loss": 0.0885, + "num_input_tokens_seen": 15133184, + "step": 7000 + }, + { + "epoch": 1.1427406199021206, + "grad_norm": 0.5549722909927368, + "learning_rate": 0.000571288743882545, + "loss": 0.2517, + "num_input_tokens_seen": 15144704, + "step": 7005 + }, + { + "epoch": 1.1435562805872757, + "grad_norm": 0.24352173507213593, + "learning_rate": 0.0005716965742251223, + "loss": 0.1248, + "num_input_tokens_seen": 15155456, + "step": 7010 + }, + { + "epoch": 1.1443719412724307, + "grad_norm": 0.11551082879304886, + "learning_rate": 0.0005721044045676999, + "loss": 0.0901, + "num_input_tokens_seen": 15165696, + "step": 7015 + }, + { + "epoch": 1.1451876019575857, + "grad_norm": 0.278942734003067, + "learning_rate": 0.0005725122349102774, + "loss": 0.1059, + "num_input_tokens_seen": 15176320, + "step": 7020 + }, + { + "epoch": 1.1460032626427405, + "grad_norm": 0.07334481179714203, + "learning_rate": 0.0005729200652528548, + "loss": 0.2575, + "num_input_tokens_seen": 15187488, + "step": 7025 + }, + { + "epoch": 1.1468189233278956, + "grad_norm": 0.11356133967638016, + "learning_rate": 0.0005733278955954323, + "loss": 0.0769, + "num_input_tokens_seen": 15198528, + "step": 7030 + }, + { + "epoch": 1.1476345840130506, + "grad_norm": 0.31060490012168884, + "learning_rate": 0.0005737357259380098, + "loss": 0.1982, + "num_input_tokens_seen": 15209376, + "step": 7035 + }, + { + "epoch": 1.1484502446982057, + "grad_norm": 0.10984083265066147, + "learning_rate": 0.0005741435562805873, + "loss": 0.0814, + "num_input_tokens_seen": 15220128, + "step": 7040 + }, + { + "epoch": 1.1492659053833605, + "grad_norm": 0.08647423982620239, + "learning_rate": 0.0005745513866231648, + "loss": 0.2202, + "num_input_tokens_seen": 15231552, + "step": 7045 + }, + { + "epoch": 1.1500815660685155, + "grad_norm": 0.32499903440475464, + "learning_rate": 0.0005749592169657422, + "loss": 0.2185, + "num_input_tokens_seen": 15241664, + "step": 7050 + }, + { + "epoch": 1.1508972267536706, + "grad_norm": 0.374118834733963, + "learning_rate": 0.0005753670473083198, + "loss": 0.112, + "num_input_tokens_seen": 15253536, + "step": 7055 + }, + { + "epoch": 1.1517128874388254, + "grad_norm": 0.4085756242275238, + "learning_rate": 0.0005757748776508972, + "loss": 0.1844, + "num_input_tokens_seen": 15263872, + "step": 7060 + }, + { + "epoch": 1.1525285481239804, + "grad_norm": 0.08538088202476501, + "learning_rate": 0.0005761827079934747, + "loss": 0.0696, + "num_input_tokens_seen": 15275264, + "step": 7065 + }, + { + "epoch": 1.1533442088091355, + "grad_norm": 0.37617227435112, + "learning_rate": 0.0005765905383360523, + "loss": 0.2125, + "num_input_tokens_seen": 15286848, + "step": 7070 + }, + { + "epoch": 1.1541598694942903, + "grad_norm": 0.15897266566753387, + "learning_rate": 0.0005769983686786296, + "loss": 0.2071, + "num_input_tokens_seen": 15297472, + "step": 7075 + }, + { + "epoch": 1.1549755301794453, + "grad_norm": 0.22269028425216675, + "learning_rate": 0.0005774061990212072, + "loss": 0.0877, + "num_input_tokens_seen": 15308224, + "step": 7080 + }, + { + "epoch": 1.1557911908646004, + "grad_norm": 0.2394644170999527, + "learning_rate": 0.0005778140293637847, + "loss": 0.2109, + "num_input_tokens_seen": 15319904, + "step": 7085 + }, + { + "epoch": 1.1566068515497552, + "grad_norm": 0.2885288596153259, + "learning_rate": 0.0005782218597063622, + "loss": 0.1601, + "num_input_tokens_seen": 15329216, + "step": 7090 + }, + { + "epoch": 1.1574225122349102, + "grad_norm": 0.062343530356884, + "learning_rate": 0.0005786296900489396, + "loss": 0.1017, + "num_input_tokens_seen": 15340288, + "step": 7095 + }, + { + "epoch": 1.1582381729200653, + "grad_norm": 0.30550217628479004, + "learning_rate": 0.0005790375203915171, + "loss": 0.1342, + "num_input_tokens_seen": 15351648, + "step": 7100 + }, + { + "epoch": 1.1590538336052203, + "grad_norm": 0.022208329290151596, + "learning_rate": 0.0005794453507340947, + "loss": 0.1672, + "num_input_tokens_seen": 15363488, + "step": 7105 + }, + { + "epoch": 1.1598694942903751, + "grad_norm": 0.06365969777107239, + "learning_rate": 0.0005798531810766721, + "loss": 0.1039, + "num_input_tokens_seen": 15373792, + "step": 7110 + }, + { + "epoch": 1.1606851549755302, + "grad_norm": 0.30073338747024536, + "learning_rate": 0.0005802610114192495, + "loss": 0.1141, + "num_input_tokens_seen": 15385120, + "step": 7115 + }, + { + "epoch": 1.1615008156606852, + "grad_norm": 0.18821366131305695, + "learning_rate": 0.0005806688417618271, + "loss": 0.1638, + "num_input_tokens_seen": 15395264, + "step": 7120 + }, + { + "epoch": 1.1623164763458402, + "grad_norm": 0.3371999263763428, + "learning_rate": 0.0005810766721044046, + "loss": 0.2989, + "num_input_tokens_seen": 15405696, + "step": 7125 + }, + { + "epoch": 1.163132137030995, + "grad_norm": 0.3867366909980774, + "learning_rate": 0.0005814845024469821, + "loss": 0.0885, + "num_input_tokens_seen": 15416928, + "step": 7130 + }, + { + "epoch": 1.16394779771615, + "grad_norm": 0.48066145181655884, + "learning_rate": 0.0005818923327895596, + "loss": 0.2603, + "num_input_tokens_seen": 15427456, + "step": 7135 + }, + { + "epoch": 1.1647634584013051, + "grad_norm": 0.11666944622993469, + "learning_rate": 0.000582300163132137, + "loss": 0.0898, + "num_input_tokens_seen": 15438336, + "step": 7140 + }, + { + "epoch": 1.16557911908646, + "grad_norm": 0.15594574809074402, + "learning_rate": 0.0005827079934747145, + "loss": 0.2309, + "num_input_tokens_seen": 15449088, + "step": 7145 + }, + { + "epoch": 1.166394779771615, + "grad_norm": 0.2819611132144928, + "learning_rate": 0.000583115823817292, + "loss": 0.1877, + "num_input_tokens_seen": 15460256, + "step": 7150 + }, + { + "epoch": 1.16721044045677, + "grad_norm": 0.1727602779865265, + "learning_rate": 0.0005835236541598696, + "loss": 0.0619, + "num_input_tokens_seen": 15470560, + "step": 7155 + }, + { + "epoch": 1.1680261011419248, + "grad_norm": 0.19856815040111542, + "learning_rate": 0.000583931484502447, + "loss": 0.1137, + "num_input_tokens_seen": 15482112, + "step": 7160 + }, + { + "epoch": 1.1688417618270799, + "grad_norm": 0.034124091267585754, + "learning_rate": 0.0005843393148450244, + "loss": 0.074, + "num_input_tokens_seen": 15493312, + "step": 7165 + }, + { + "epoch": 1.169657422512235, + "grad_norm": 0.05666607618331909, + "learning_rate": 0.000584747145187602, + "loss": 0.1201, + "num_input_tokens_seen": 15503424, + "step": 7170 + }, + { + "epoch": 1.17047308319739, + "grad_norm": 0.05534420162439346, + "learning_rate": 0.0005851549755301795, + "loss": 0.1046, + "num_input_tokens_seen": 15514400, + "step": 7175 + }, + { + "epoch": 1.1712887438825448, + "grad_norm": 0.014158536680042744, + "learning_rate": 0.0005855628058727568, + "loss": 0.0807, + "num_input_tokens_seen": 15525760, + "step": 7180 + }, + { + "epoch": 1.1721044045676998, + "grad_norm": 0.44807717204093933, + "learning_rate": 0.0005859706362153344, + "loss": 0.179, + "num_input_tokens_seen": 15536480, + "step": 7185 + }, + { + "epoch": 1.1729200652528549, + "grad_norm": 0.06306151300668716, + "learning_rate": 0.0005863784665579119, + "loss": 0.129, + "num_input_tokens_seen": 15546720, + "step": 7190 + }, + { + "epoch": 1.17373572593801, + "grad_norm": 0.37018027901649475, + "learning_rate": 0.0005867862969004895, + "loss": 0.0538, + "num_input_tokens_seen": 15556704, + "step": 7195 + }, + { + "epoch": 1.1745513866231647, + "grad_norm": 0.03759448975324631, + "learning_rate": 0.0005871941272430669, + "loss": 0.0436, + "num_input_tokens_seen": 15567776, + "step": 7200 + }, + { + "epoch": 1.1753670473083198, + "grad_norm": 0.07634948194026947, + "learning_rate": 0.0005876019575856443, + "loss": 0.1187, + "num_input_tokens_seen": 15578976, + "step": 7205 + }, + { + "epoch": 1.1761827079934748, + "grad_norm": 0.23645785450935364, + "learning_rate": 0.0005880097879282219, + "loss": 0.1072, + "num_input_tokens_seen": 15590816, + "step": 7210 + }, + { + "epoch": 1.1769983686786296, + "grad_norm": 0.02600211650133133, + "learning_rate": 0.0005884176182707993, + "loss": 0.0452, + "num_input_tokens_seen": 15601120, + "step": 7215 + }, + { + "epoch": 1.1778140293637847, + "grad_norm": 0.062186602503061295, + "learning_rate": 0.0005888254486133769, + "loss": 0.1061, + "num_input_tokens_seen": 15610816, + "step": 7220 + }, + { + "epoch": 1.1786296900489397, + "grad_norm": 0.015181249938905239, + "learning_rate": 0.0005892332789559544, + "loss": 0.1301, + "num_input_tokens_seen": 15621664, + "step": 7225 + }, + { + "epoch": 1.1794453507340945, + "grad_norm": 0.06925445795059204, + "learning_rate": 0.0005896411092985318, + "loss": 0.1576, + "num_input_tokens_seen": 15632768, + "step": 7230 + }, + { + "epoch": 1.1802610114192496, + "grad_norm": 0.008259747177362442, + "learning_rate": 0.0005900489396411093, + "loss": 0.1392, + "num_input_tokens_seen": 15643872, + "step": 7235 + }, + { + "epoch": 1.1810766721044046, + "grad_norm": 0.12475767731666565, + "learning_rate": 0.0005904567699836868, + "loss": 0.0776, + "num_input_tokens_seen": 15655200, + "step": 7240 + }, + { + "epoch": 1.1818923327895596, + "grad_norm": 0.0850701779127121, + "learning_rate": 0.0005908646003262644, + "loss": 0.2109, + "num_input_tokens_seen": 15664512, + "step": 7245 + }, + { + "epoch": 1.1827079934747144, + "grad_norm": 0.7401888370513916, + "learning_rate": 0.0005912724306688417, + "loss": 0.2835, + "num_input_tokens_seen": 15675072, + "step": 7250 + }, + { + "epoch": 1.1835236541598695, + "grad_norm": 0.1192028746008873, + "learning_rate": 0.0005916802610114192, + "loss": 0.0668, + "num_input_tokens_seen": 15685152, + "step": 7255 + }, + { + "epoch": 1.1843393148450245, + "grad_norm": 0.1462324559688568, + "learning_rate": 0.0005920880913539968, + "loss": 0.1597, + "num_input_tokens_seen": 15696064, + "step": 7260 + }, + { + "epoch": 1.1851549755301796, + "grad_norm": 0.21007820963859558, + "learning_rate": 0.0005924959216965743, + "loss": 0.1158, + "num_input_tokens_seen": 15707296, + "step": 7265 + }, + { + "epoch": 1.1859706362153344, + "grad_norm": 0.13377432525157928, + "learning_rate": 0.0005929037520391517, + "loss": 0.095, + "num_input_tokens_seen": 15718496, + "step": 7270 + }, + { + "epoch": 1.1867862969004894, + "grad_norm": 0.23630847036838531, + "learning_rate": 0.0005933115823817292, + "loss": 0.1359, + "num_input_tokens_seen": 15730080, + "step": 7275 + }, + { + "epoch": 1.1876019575856445, + "grad_norm": 0.05261866748332977, + "learning_rate": 0.0005937194127243067, + "loss": 0.0328, + "num_input_tokens_seen": 15741248, + "step": 7280 + }, + { + "epoch": 1.1884176182707993, + "grad_norm": 0.09066125005483627, + "learning_rate": 0.0005941272430668842, + "loss": 0.0561, + "num_input_tokens_seen": 15752736, + "step": 7285 + }, + { + "epoch": 1.1892332789559543, + "grad_norm": 0.5383073687553406, + "learning_rate": 0.0005945350734094617, + "loss": 0.2609, + "num_input_tokens_seen": 15763648, + "step": 7290 + }, + { + "epoch": 1.1900489396411094, + "grad_norm": 0.2469019889831543, + "learning_rate": 0.0005949429037520392, + "loss": 0.2318, + "num_input_tokens_seen": 15774016, + "step": 7295 + }, + { + "epoch": 1.1908646003262642, + "grad_norm": 0.10927631705999374, + "learning_rate": 0.0005953507340946166, + "loss": 0.1849, + "num_input_tokens_seen": 15783232, + "step": 7300 + }, + { + "epoch": 1.1916802610114192, + "grad_norm": 0.14146047830581665, + "learning_rate": 0.0005957585644371941, + "loss": 0.226, + "num_input_tokens_seen": 15794304, + "step": 7305 + }, + { + "epoch": 1.1924959216965743, + "grad_norm": 0.1634165346622467, + "learning_rate": 0.0005961663947797717, + "loss": 0.0985, + "num_input_tokens_seen": 15804960, + "step": 7310 + }, + { + "epoch": 1.1933115823817293, + "grad_norm": 0.5072509050369263, + "learning_rate": 0.0005965742251223491, + "loss": 0.2341, + "num_input_tokens_seen": 15815808, + "step": 7315 + }, + { + "epoch": 1.1941272430668841, + "grad_norm": 0.2947240471839905, + "learning_rate": 0.0005969820554649265, + "loss": 0.1681, + "num_input_tokens_seen": 15826528, + "step": 7320 + }, + { + "epoch": 1.1949429037520392, + "grad_norm": 0.34804099798202515, + "learning_rate": 0.0005973898858075041, + "loss": 0.1394, + "num_input_tokens_seen": 15837920, + "step": 7325 + }, + { + "epoch": 1.1957585644371942, + "grad_norm": 0.09488537162542343, + "learning_rate": 0.0005977977161500816, + "loss": 0.1016, + "num_input_tokens_seen": 15849152, + "step": 7330 + }, + { + "epoch": 1.196574225122349, + "grad_norm": 0.012553676031529903, + "learning_rate": 0.000598205546492659, + "loss": 0.0936, + "num_input_tokens_seen": 15861152, + "step": 7335 + }, + { + "epoch": 1.197389885807504, + "grad_norm": 0.1785455197095871, + "learning_rate": 0.0005986133768352365, + "loss": 0.1073, + "num_input_tokens_seen": 15872352, + "step": 7340 + }, + { + "epoch": 1.198205546492659, + "grad_norm": 0.09887787699699402, + "learning_rate": 0.000599021207177814, + "loss": 0.0886, + "num_input_tokens_seen": 15881984, + "step": 7345 + }, + { + "epoch": 1.1990212071778141, + "grad_norm": 0.5077795386314392, + "learning_rate": 0.0005994290375203916, + "loss": 0.1468, + "num_input_tokens_seen": 15892800, + "step": 7350 + }, + { + "epoch": 1.199836867862969, + "grad_norm": 0.015568344853818417, + "learning_rate": 0.000599836867862969, + "loss": 0.2378, + "num_input_tokens_seen": 15902912, + "step": 7355 + }, + { + "epoch": 1.200652528548124, + "grad_norm": 0.02570744976401329, + "learning_rate": 0.0006002446982055465, + "loss": 0.0258, + "num_input_tokens_seen": 15914656, + "step": 7360 + }, + { + "epoch": 1.201468189233279, + "grad_norm": 0.01589066907763481, + "learning_rate": 0.000600652528548124, + "loss": 0.0614, + "num_input_tokens_seen": 15924960, + "step": 7365 + }, + { + "epoch": 1.2022838499184338, + "grad_norm": 0.5855007171630859, + "learning_rate": 0.0006010603588907014, + "loss": 0.22, + "num_input_tokens_seen": 15935808, + "step": 7370 + }, + { + "epoch": 1.2030995106035889, + "grad_norm": 0.19280865788459778, + "learning_rate": 0.000601468189233279, + "loss": 0.1349, + "num_input_tokens_seen": 15945184, + "step": 7375 + }, + { + "epoch": 1.203915171288744, + "grad_norm": 0.18036264181137085, + "learning_rate": 0.0006018760195758564, + "loss": 0.0681, + "num_input_tokens_seen": 15956000, + "step": 7380 + }, + { + "epoch": 1.2047308319738987, + "grad_norm": 0.27529698610305786, + "learning_rate": 0.000602283849918434, + "loss": 0.1275, + "num_input_tokens_seen": 15967712, + "step": 7385 + }, + { + "epoch": 1.2055464926590538, + "grad_norm": 0.6813198924064636, + "learning_rate": 0.0006026916802610114, + "loss": 0.2154, + "num_input_tokens_seen": 15977312, + "step": 7390 + }, + { + "epoch": 1.2063621533442088, + "grad_norm": 0.6599311232566833, + "learning_rate": 0.0006030995106035889, + "loss": 0.1632, + "num_input_tokens_seen": 15986048, + "step": 7395 + }, + { + "epoch": 1.2071778140293639, + "grad_norm": 0.2776208817958832, + "learning_rate": 0.0006035073409461665, + "loss": 0.2229, + "num_input_tokens_seen": 15998080, + "step": 7400 + }, + { + "epoch": 1.2079934747145187, + "grad_norm": 0.08681552112102509, + "learning_rate": 0.0006039151712887438, + "loss": 0.097, + "num_input_tokens_seen": 16008704, + "step": 7405 + }, + { + "epoch": 1.2088091353996737, + "grad_norm": 0.13826783001422882, + "learning_rate": 0.0006043230016313214, + "loss": 0.0846, + "num_input_tokens_seen": 16020128, + "step": 7410 + }, + { + "epoch": 1.2096247960848288, + "grad_norm": 0.2015797346830368, + "learning_rate": 0.0006047308319738989, + "loss": 0.1688, + "num_input_tokens_seen": 16031680, + "step": 7415 + }, + { + "epoch": 1.2104404567699838, + "grad_norm": 0.27414339780807495, + "learning_rate": 0.0006051386623164764, + "loss": 0.1624, + "num_input_tokens_seen": 16042400, + "step": 7420 + }, + { + "epoch": 1.2112561174551386, + "grad_norm": 0.4146776795387268, + "learning_rate": 0.0006055464926590538, + "loss": 0.0825, + "num_input_tokens_seen": 16052832, + "step": 7425 + }, + { + "epoch": 1.2120717781402937, + "grad_norm": 0.5340694785118103, + "learning_rate": 0.0006059543230016313, + "loss": 0.2615, + "num_input_tokens_seen": 16062688, + "step": 7430 + }, + { + "epoch": 1.2128874388254487, + "grad_norm": 0.15199506282806396, + "learning_rate": 0.0006063621533442089, + "loss": 0.1463, + "num_input_tokens_seen": 16073440, + "step": 7435 + }, + { + "epoch": 1.2137030995106035, + "grad_norm": 0.45297807455062866, + "learning_rate": 0.0006067699836867863, + "loss": 0.2037, + "num_input_tokens_seen": 16085120, + "step": 7440 + }, + { + "epoch": 1.2145187601957586, + "grad_norm": 0.23709626495838165, + "learning_rate": 0.0006071778140293637, + "loss": 0.1115, + "num_input_tokens_seen": 16095296, + "step": 7445 + }, + { + "epoch": 1.2153344208809136, + "grad_norm": 0.3890670835971832, + "learning_rate": 0.0006075856443719413, + "loss": 0.2847, + "num_input_tokens_seen": 16106112, + "step": 7450 + }, + { + "epoch": 1.2161500815660684, + "grad_norm": 0.1603914052248001, + "learning_rate": 0.0006079934747145188, + "loss": 0.1126, + "num_input_tokens_seen": 16115136, + "step": 7455 + }, + { + "epoch": 1.2169657422512234, + "grad_norm": 0.06594960391521454, + "learning_rate": 0.0006084013050570962, + "loss": 0.083, + "num_input_tokens_seen": 16125344, + "step": 7460 + }, + { + "epoch": 1.2177814029363785, + "grad_norm": 0.12942712008953094, + "learning_rate": 0.0006088091353996738, + "loss": 0.1093, + "num_input_tokens_seen": 16136928, + "step": 7465 + }, + { + "epoch": 1.2185970636215335, + "grad_norm": 0.2689228951931, + "learning_rate": 0.0006092169657422512, + "loss": 0.0951, + "num_input_tokens_seen": 16147392, + "step": 7470 + }, + { + "epoch": 1.2194127243066883, + "grad_norm": 0.142789825797081, + "learning_rate": 0.0006096247960848287, + "loss": 0.117, + "num_input_tokens_seen": 16158080, + "step": 7475 + }, + { + "epoch": 1.2202283849918434, + "grad_norm": 0.06195086985826492, + "learning_rate": 0.0006100326264274062, + "loss": 0.1366, + "num_input_tokens_seen": 16169568, + "step": 7480 + }, + { + "epoch": 1.2210440456769984, + "grad_norm": 0.1662866175174713, + "learning_rate": 0.0006104404567699837, + "loss": 0.0804, + "num_input_tokens_seen": 16180768, + "step": 7485 + }, + { + "epoch": 1.2218597063621535, + "grad_norm": 0.21019388735294342, + "learning_rate": 0.0006108482871125613, + "loss": 0.0708, + "num_input_tokens_seen": 16191520, + "step": 7490 + }, + { + "epoch": 1.2226753670473083, + "grad_norm": 0.20374208688735962, + "learning_rate": 0.0006112561174551386, + "loss": 0.1439, + "num_input_tokens_seen": 16202624, + "step": 7495 + }, + { + "epoch": 1.2234910277324633, + "grad_norm": 0.0657731369137764, + "learning_rate": 0.0006116639477977162, + "loss": 0.0694, + "num_input_tokens_seen": 16212192, + "step": 7500 + }, + { + "epoch": 1.2243066884176184, + "grad_norm": 0.5109583735466003, + "learning_rate": 0.0006120717781402937, + "loss": 0.0916, + "num_input_tokens_seen": 16224416, + "step": 7505 + }, + { + "epoch": 1.2251223491027732, + "grad_norm": 0.04669109731912613, + "learning_rate": 0.000612479608482871, + "loss": 0.1749, + "num_input_tokens_seen": 16235232, + "step": 7510 + }, + { + "epoch": 1.2259380097879282, + "grad_norm": 0.04343324527144432, + "learning_rate": 0.0006128874388254486, + "loss": 0.0244, + "num_input_tokens_seen": 16245696, + "step": 7515 + }, + { + "epoch": 1.2267536704730833, + "grad_norm": 0.124315544962883, + "learning_rate": 0.0006132952691680261, + "loss": 0.1668, + "num_input_tokens_seen": 16257216, + "step": 7520 + }, + { + "epoch": 1.227569331158238, + "grad_norm": 0.4877174496650696, + "learning_rate": 0.0006137030995106036, + "loss": 0.1787, + "num_input_tokens_seen": 16268896, + "step": 7525 + }, + { + "epoch": 1.2283849918433931, + "grad_norm": 0.0646728053689003, + "learning_rate": 0.0006141109298531811, + "loss": 0.074, + "num_input_tokens_seen": 16280128, + "step": 7530 + }, + { + "epoch": 1.2292006525285482, + "grad_norm": 0.07480008155107498, + "learning_rate": 0.0006145187601957585, + "loss": 0.0611, + "num_input_tokens_seen": 16292128, + "step": 7535 + }, + { + "epoch": 1.2300163132137032, + "grad_norm": 0.25711551308631897, + "learning_rate": 0.0006149265905383361, + "loss": 0.0798, + "num_input_tokens_seen": 16302048, + "step": 7540 + }, + { + "epoch": 1.230831973898858, + "grad_norm": 0.48960769176483154, + "learning_rate": 0.0006153344208809135, + "loss": 0.2378, + "num_input_tokens_seen": 16312672, + "step": 7545 + }, + { + "epoch": 1.231647634584013, + "grad_norm": 0.21101155877113342, + "learning_rate": 0.0006157422512234911, + "loss": 0.0516, + "num_input_tokens_seen": 16322528, + "step": 7550 + }, + { + "epoch": 1.232463295269168, + "grad_norm": 0.6068270206451416, + "learning_rate": 0.0006161500815660686, + "loss": 0.3582, + "num_input_tokens_seen": 16334720, + "step": 7555 + }, + { + "epoch": 1.233278955954323, + "grad_norm": 0.21375367045402527, + "learning_rate": 0.0006165579119086459, + "loss": 0.059, + "num_input_tokens_seen": 16346272, + "step": 7560 + }, + { + "epoch": 1.234094616639478, + "grad_norm": 0.13177448511123657, + "learning_rate": 0.0006169657422512235, + "loss": 0.094, + "num_input_tokens_seen": 16357280, + "step": 7565 + }, + { + "epoch": 1.234910277324633, + "grad_norm": 0.2921614646911621, + "learning_rate": 0.000617373572593801, + "loss": 0.1991, + "num_input_tokens_seen": 16368704, + "step": 7570 + }, + { + "epoch": 1.235725938009788, + "grad_norm": 0.3497111201286316, + "learning_rate": 0.0006177814029363786, + "loss": 0.1153, + "num_input_tokens_seen": 16379904, + "step": 7575 + }, + { + "epoch": 1.2365415986949428, + "grad_norm": 0.056018609553575516, + "learning_rate": 0.0006181892332789559, + "loss": 0.129, + "num_input_tokens_seen": 16390144, + "step": 7580 + }, + { + "epoch": 1.2373572593800979, + "grad_norm": 0.15077915787696838, + "learning_rate": 0.0006185970636215334, + "loss": 0.0764, + "num_input_tokens_seen": 16401024, + "step": 7585 + }, + { + "epoch": 1.238172920065253, + "grad_norm": 0.13833118975162506, + "learning_rate": 0.000619004893964111, + "loss": 0.1811, + "num_input_tokens_seen": 16410368, + "step": 7590 + }, + { + "epoch": 1.2389885807504077, + "grad_norm": 0.2312982827425003, + "learning_rate": 0.0006194127243066884, + "loss": 0.1477, + "num_input_tokens_seen": 16420000, + "step": 7595 + }, + { + "epoch": 1.2398042414355628, + "grad_norm": 0.11060801148414612, + "learning_rate": 0.000619820554649266, + "loss": 0.08, + "num_input_tokens_seen": 16430240, + "step": 7600 + }, + { + "epoch": 1.2406199021207178, + "grad_norm": 0.1415390968322754, + "learning_rate": 0.0006202283849918434, + "loss": 0.2042, + "num_input_tokens_seen": 16440384, + "step": 7605 + }, + { + "epoch": 1.2414355628058726, + "grad_norm": 0.36229464411735535, + "learning_rate": 0.0006206362153344209, + "loss": 0.1486, + "num_input_tokens_seen": 16451296, + "step": 7610 + }, + { + "epoch": 1.2422512234910277, + "grad_norm": 0.13426244258880615, + "learning_rate": 0.0006210440456769984, + "loss": 0.0735, + "num_input_tokens_seen": 16463584, + "step": 7615 + }, + { + "epoch": 1.2430668841761827, + "grad_norm": 0.24502316117286682, + "learning_rate": 0.0006214518760195759, + "loss": 0.1116, + "num_input_tokens_seen": 16474656, + "step": 7620 + }, + { + "epoch": 1.2438825448613378, + "grad_norm": 0.39227384328842163, + "learning_rate": 0.0006218597063621533, + "loss": 0.213, + "num_input_tokens_seen": 16485632, + "step": 7625 + }, + { + "epoch": 1.2446982055464926, + "grad_norm": 0.24334146082401276, + "learning_rate": 0.0006222675367047308, + "loss": 0.1499, + "num_input_tokens_seen": 16496160, + "step": 7630 + }, + { + "epoch": 1.2455138662316476, + "grad_norm": 0.4462047517299652, + "learning_rate": 0.0006226753670473083, + "loss": 0.1664, + "num_input_tokens_seen": 16506016, + "step": 7635 + }, + { + "epoch": 1.2463295269168027, + "grad_norm": 0.14899565279483795, + "learning_rate": 0.0006230831973898859, + "loss": 0.0756, + "num_input_tokens_seen": 16516832, + "step": 7640 + }, + { + "epoch": 1.2471451876019577, + "grad_norm": 0.043367356061935425, + "learning_rate": 0.0006234910277324634, + "loss": 0.0656, + "num_input_tokens_seen": 16528128, + "step": 7645 + }, + { + "epoch": 1.2479608482871125, + "grad_norm": 0.019222905859351158, + "learning_rate": 0.0006238988580750407, + "loss": 0.039, + "num_input_tokens_seen": 16539168, + "step": 7650 + }, + { + "epoch": 1.2487765089722676, + "grad_norm": 0.16319647431373596, + "learning_rate": 0.0006243066884176183, + "loss": 0.1972, + "num_input_tokens_seen": 16549280, + "step": 7655 + }, + { + "epoch": 1.2495921696574226, + "grad_norm": 0.17091651260852814, + "learning_rate": 0.0006247145187601958, + "loss": 0.0665, + "num_input_tokens_seen": 16559680, + "step": 7660 + }, + { + "epoch": 1.2504078303425774, + "grad_norm": 0.055152345448732376, + "learning_rate": 0.0006251223491027733, + "loss": 0.0209, + "num_input_tokens_seen": 16570048, + "step": 7665 + }, + { + "epoch": 1.2512234910277324, + "grad_norm": 0.35145047307014465, + "learning_rate": 0.0006255301794453507, + "loss": 0.123, + "num_input_tokens_seen": 16581344, + "step": 7670 + }, + { + "epoch": 1.2520391517128875, + "grad_norm": 0.08169589936733246, + "learning_rate": 0.0006259380097879282, + "loss": 0.0798, + "num_input_tokens_seen": 16591872, + "step": 7675 + }, + { + "epoch": 1.2528548123980423, + "grad_norm": 0.20706957578659058, + "learning_rate": 0.0006263458401305058, + "loss": 0.264, + "num_input_tokens_seen": 16602400, + "step": 7680 + }, + { + "epoch": 1.2536704730831973, + "grad_norm": 0.41825392842292786, + "learning_rate": 0.0006267536704730832, + "loss": 0.0811, + "num_input_tokens_seen": 16614272, + "step": 7685 + }, + { + "epoch": 1.2544861337683524, + "grad_norm": 0.017080556601285934, + "learning_rate": 0.0006271615008156607, + "loss": 0.0979, + "num_input_tokens_seen": 16625728, + "step": 7690 + }, + { + "epoch": 1.2553017944535072, + "grad_norm": 0.30033621191978455, + "learning_rate": 0.0006275693311582382, + "loss": 0.0881, + "num_input_tokens_seen": 16637056, + "step": 7695 + }, + { + "epoch": 1.2561174551386622, + "grad_norm": 0.02396804839372635, + "learning_rate": 0.0006279771615008156, + "loss": 0.0527, + "num_input_tokens_seen": 16648448, + "step": 7700 + }, + { + "epoch": 1.2569331158238173, + "grad_norm": 0.18351727724075317, + "learning_rate": 0.0006283849918433932, + "loss": 0.1394, + "num_input_tokens_seen": 16659200, + "step": 7705 + }, + { + "epoch": 1.2577487765089723, + "grad_norm": 0.008948463946580887, + "learning_rate": 0.0006287928221859707, + "loss": 0.1612, + "num_input_tokens_seen": 16668992, + "step": 7710 + }, + { + "epoch": 1.2585644371941274, + "grad_norm": 0.2766994535923004, + "learning_rate": 0.0006292006525285482, + "loss": 0.068, + "num_input_tokens_seen": 16678720, + "step": 7715 + }, + { + "epoch": 1.2593800978792822, + "grad_norm": 0.0534847155213356, + "learning_rate": 0.0006296084828711256, + "loss": 0.0561, + "num_input_tokens_seen": 16690656, + "step": 7720 + }, + { + "epoch": 1.2601957585644372, + "grad_norm": 0.026031237095594406, + "learning_rate": 0.0006300163132137031, + "loss": 0.0931, + "num_input_tokens_seen": 16701536, + "step": 7725 + }, + { + "epoch": 1.2610114192495923, + "grad_norm": 0.0823201909661293, + "learning_rate": 0.0006304241435562807, + "loss": 0.1381, + "num_input_tokens_seen": 16712096, + "step": 7730 + }, + { + "epoch": 1.261827079934747, + "grad_norm": 0.058362994343042374, + "learning_rate": 0.000630831973898858, + "loss": 0.0968, + "num_input_tokens_seen": 16724544, + "step": 7735 + }, + { + "epoch": 1.2626427406199021, + "grad_norm": 0.30868566036224365, + "learning_rate": 0.0006312398042414356, + "loss": 0.0493, + "num_input_tokens_seen": 16736128, + "step": 7740 + }, + { + "epoch": 1.2634584013050572, + "grad_norm": 0.03179110214114189, + "learning_rate": 0.0006316476345840131, + "loss": 0.1463, + "num_input_tokens_seen": 16746464, + "step": 7745 + }, + { + "epoch": 1.264274061990212, + "grad_norm": 0.10827480256557465, + "learning_rate": 0.0006320554649265906, + "loss": 0.0504, + "num_input_tokens_seen": 16757632, + "step": 7750 + }, + { + "epoch": 1.265089722675367, + "grad_norm": 0.4119908809661865, + "learning_rate": 0.000632463295269168, + "loss": 0.1013, + "num_input_tokens_seen": 16767296, + "step": 7755 + }, + { + "epoch": 1.265905383360522, + "grad_norm": 0.0076904455199837685, + "learning_rate": 0.0006328711256117455, + "loss": 0.2405, + "num_input_tokens_seen": 16779616, + "step": 7760 + }, + { + "epoch": 1.2667210440456769, + "grad_norm": 0.21104197204113007, + "learning_rate": 0.000633278955954323, + "loss": 0.0598, + "num_input_tokens_seen": 16789216, + "step": 7765 + }, + { + "epoch": 1.267536704730832, + "grad_norm": 0.47181203961372375, + "learning_rate": 0.0006336867862969005, + "loss": 0.0737, + "num_input_tokens_seen": 16800192, + "step": 7770 + }, + { + "epoch": 1.268352365415987, + "grad_norm": 0.2550259828567505, + "learning_rate": 0.000634094616639478, + "loss": 0.088, + "num_input_tokens_seen": 16810624, + "step": 7775 + }, + { + "epoch": 1.269168026101142, + "grad_norm": 0.09608176350593567, + "learning_rate": 0.0006345024469820555, + "loss": 0.213, + "num_input_tokens_seen": 16821280, + "step": 7780 + }, + { + "epoch": 1.269983686786297, + "grad_norm": 0.026034316048026085, + "learning_rate": 0.0006349102773246329, + "loss": 0.1053, + "num_input_tokens_seen": 16832384, + "step": 7785 + }, + { + "epoch": 1.2707993474714518, + "grad_norm": 0.12941612303256989, + "learning_rate": 0.0006353181076672104, + "loss": 0.06, + "num_input_tokens_seen": 16843360, + "step": 7790 + }, + { + "epoch": 1.2716150081566069, + "grad_norm": 0.10202895104885101, + "learning_rate": 0.000635725938009788, + "loss": 0.1224, + "num_input_tokens_seen": 16853216, + "step": 7795 + }, + { + "epoch": 1.272430668841762, + "grad_norm": 0.3297278583049774, + "learning_rate": 0.0006361337683523654, + "loss": 0.1465, + "num_input_tokens_seen": 16864704, + "step": 7800 + }, + { + "epoch": 1.2732463295269167, + "grad_norm": 0.5877017974853516, + "learning_rate": 0.0006365415986949429, + "loss": 0.1138, + "num_input_tokens_seen": 16876928, + "step": 7805 + }, + { + "epoch": 1.2740619902120718, + "grad_norm": 0.4183492362499237, + "learning_rate": 0.0006369494290375204, + "loss": 0.1534, + "num_input_tokens_seen": 16888224, + "step": 7810 + }, + { + "epoch": 1.2748776508972268, + "grad_norm": 0.31846538186073303, + "learning_rate": 0.0006373572593800979, + "loss": 0.0547, + "num_input_tokens_seen": 16899648, + "step": 7815 + }, + { + "epoch": 1.2756933115823816, + "grad_norm": 0.38586241006851196, + "learning_rate": 0.0006377650897226754, + "loss": 0.1134, + "num_input_tokens_seen": 16909472, + "step": 7820 + }, + { + "epoch": 1.2765089722675367, + "grad_norm": 0.15391522645950317, + "learning_rate": 0.0006381729200652528, + "loss": 0.0538, + "num_input_tokens_seen": 16921344, + "step": 7825 + }, + { + "epoch": 1.2773246329526917, + "grad_norm": 0.2430095225572586, + "learning_rate": 0.0006385807504078304, + "loss": 0.1289, + "num_input_tokens_seen": 16931648, + "step": 7830 + }, + { + "epoch": 1.2781402936378465, + "grad_norm": 0.10504290461540222, + "learning_rate": 0.0006389885807504079, + "loss": 0.1039, + "num_input_tokens_seen": 16942304, + "step": 7835 + }, + { + "epoch": 1.2789559543230016, + "grad_norm": 0.10132510960102081, + "learning_rate": 0.0006393964110929853, + "loss": 0.1656, + "num_input_tokens_seen": 16953056, + "step": 7840 + }, + { + "epoch": 1.2797716150081566, + "grad_norm": 0.14680597186088562, + "learning_rate": 0.0006398042414355628, + "loss": 0.0431, + "num_input_tokens_seen": 16962944, + "step": 7845 + }, + { + "epoch": 1.2805872756933117, + "grad_norm": 0.04616044834256172, + "learning_rate": 0.0006402120717781403, + "loss": 0.1193, + "num_input_tokens_seen": 16973408, + "step": 7850 + }, + { + "epoch": 1.2814029363784667, + "grad_norm": 0.03202705457806587, + "learning_rate": 0.0006406199021207178, + "loss": 0.0673, + "num_input_tokens_seen": 16983904, + "step": 7855 + }, + { + "epoch": 1.2822185970636215, + "grad_norm": 0.0769016444683075, + "learning_rate": 0.0006410277324632953, + "loss": 0.0692, + "num_input_tokens_seen": 16995584, + "step": 7860 + }, + { + "epoch": 1.2830342577487766, + "grad_norm": 0.2932111620903015, + "learning_rate": 0.0006414355628058727, + "loss": 0.2171, + "num_input_tokens_seen": 17006912, + "step": 7865 + }, + { + "epoch": 1.2838499184339316, + "grad_norm": 0.0873182862997055, + "learning_rate": 0.0006418433931484503, + "loss": 0.2605, + "num_input_tokens_seen": 17018240, + "step": 7870 + }, + { + "epoch": 1.2846655791190864, + "grad_norm": 0.1578231006860733, + "learning_rate": 0.0006422512234910277, + "loss": 0.1284, + "num_input_tokens_seen": 17028832, + "step": 7875 + }, + { + "epoch": 1.2854812398042414, + "grad_norm": 0.27968907356262207, + "learning_rate": 0.0006426590538336053, + "loss": 0.1061, + "num_input_tokens_seen": 17040704, + "step": 7880 + }, + { + "epoch": 1.2862969004893965, + "grad_norm": 0.04651603102684021, + "learning_rate": 0.0006430668841761828, + "loss": 0.1679, + "num_input_tokens_seen": 17052224, + "step": 7885 + }, + { + "epoch": 1.2871125611745513, + "grad_norm": 0.5027137398719788, + "learning_rate": 0.0006434747145187601, + "loss": 0.4001, + "num_input_tokens_seen": 17062272, + "step": 7890 + }, + { + "epoch": 1.2879282218597063, + "grad_norm": 0.08611617237329483, + "learning_rate": 0.0006438825448613377, + "loss": 0.0792, + "num_input_tokens_seen": 17073728, + "step": 7895 + }, + { + "epoch": 1.2887438825448614, + "grad_norm": 0.29694536328315735, + "learning_rate": 0.0006442903752039152, + "loss": 0.241, + "num_input_tokens_seen": 17084032, + "step": 7900 + }, + { + "epoch": 1.2895595432300162, + "grad_norm": 0.023007987067103386, + "learning_rate": 0.0006446982055464927, + "loss": 0.2715, + "num_input_tokens_seen": 17095616, + "step": 7905 + }, + { + "epoch": 1.2903752039151712, + "grad_norm": 0.23659998178482056, + "learning_rate": 0.0006451060358890701, + "loss": 0.1089, + "num_input_tokens_seen": 17105952, + "step": 7910 + }, + { + "epoch": 1.2911908646003263, + "grad_norm": 0.28056174516677856, + "learning_rate": 0.0006455138662316476, + "loss": 0.161, + "num_input_tokens_seen": 17116064, + "step": 7915 + }, + { + "epoch": 1.2920065252854813, + "grad_norm": 0.22875936329364777, + "learning_rate": 0.0006459216965742252, + "loss": 0.1918, + "num_input_tokens_seen": 17127456, + "step": 7920 + }, + { + "epoch": 1.2928221859706361, + "grad_norm": 0.5045974254608154, + "learning_rate": 0.0006463295269168026, + "loss": 0.1348, + "num_input_tokens_seen": 17139168, + "step": 7925 + }, + { + "epoch": 1.2936378466557912, + "grad_norm": 0.24506209790706635, + "learning_rate": 0.00064673735725938, + "loss": 0.078, + "num_input_tokens_seen": 17150848, + "step": 7930 + }, + { + "epoch": 1.2944535073409462, + "grad_norm": 0.04896121099591255, + "learning_rate": 0.0006471451876019576, + "loss": 0.0384, + "num_input_tokens_seen": 17162080, + "step": 7935 + }, + { + "epoch": 1.2952691680261013, + "grad_norm": 0.2526971995830536, + "learning_rate": 0.0006475530179445351, + "loss": 0.0936, + "num_input_tokens_seen": 17172544, + "step": 7940 + }, + { + "epoch": 1.296084828711256, + "grad_norm": 0.10988382250070572, + "learning_rate": 0.0006479608482871126, + "loss": 0.0965, + "num_input_tokens_seen": 17182848, + "step": 7945 + }, + { + "epoch": 1.2969004893964111, + "grad_norm": 0.27198734879493713, + "learning_rate": 0.0006483686786296901, + "loss": 0.0372, + "num_input_tokens_seen": 17193408, + "step": 7950 + }, + { + "epoch": 1.2977161500815662, + "grad_norm": 0.3824102580547333, + "learning_rate": 0.0006487765089722675, + "loss": 0.176, + "num_input_tokens_seen": 17204384, + "step": 7955 + }, + { + "epoch": 1.298531810766721, + "grad_norm": 0.1732729822397232, + "learning_rate": 0.000649184339314845, + "loss": 0.1326, + "num_input_tokens_seen": 17214656, + "step": 7960 + }, + { + "epoch": 1.299347471451876, + "grad_norm": 0.2566794753074646, + "learning_rate": 0.0006495921696574225, + "loss": 0.0654, + "num_input_tokens_seen": 17224320, + "step": 7965 + }, + { + "epoch": 1.300163132137031, + "grad_norm": 0.09076650440692902, + "learning_rate": 0.0006500000000000001, + "loss": 0.1901, + "num_input_tokens_seen": 17236512, + "step": 7970 + }, + { + "epoch": 1.3009787928221859, + "grad_norm": 0.18000783026218414, + "learning_rate": 0.0006504078303425776, + "loss": 0.0926, + "num_input_tokens_seen": 17247520, + "step": 7975 + }, + { + "epoch": 1.301794453507341, + "grad_norm": 0.23230217397212982, + "learning_rate": 0.0006508156606851549, + "loss": 0.1648, + "num_input_tokens_seen": 17258752, + "step": 7980 + }, + { + "epoch": 1.302610114192496, + "grad_norm": 0.11019614338874817, + "learning_rate": 0.0006512234910277325, + "loss": 0.0771, + "num_input_tokens_seen": 17269632, + "step": 7985 + }, + { + "epoch": 1.3034257748776508, + "grad_norm": 0.04341624677181244, + "learning_rate": 0.00065163132137031, + "loss": 0.2333, + "num_input_tokens_seen": 17280192, + "step": 7990 + }, + { + "epoch": 1.3042414355628058, + "grad_norm": 0.4747850298881531, + "learning_rate": 0.0006520391517128875, + "loss": 0.1317, + "num_input_tokens_seen": 17291712, + "step": 7995 + }, + { + "epoch": 1.3050570962479608, + "grad_norm": 0.7965296506881714, + "learning_rate": 0.0006524469820554649, + "loss": 0.22, + "num_input_tokens_seen": 17302272, + "step": 8000 + }, + { + "epoch": 1.3058727569331159, + "grad_norm": 0.29167118668556213, + "learning_rate": 0.0006528548123980424, + "loss": 0.1517, + "num_input_tokens_seen": 17314304, + "step": 8005 + }, + { + "epoch": 1.306688417618271, + "grad_norm": 0.3655271828174591, + "learning_rate": 0.0006532626427406199, + "loss": 0.1185, + "num_input_tokens_seen": 17325248, + "step": 8010 + }, + { + "epoch": 1.3075040783034257, + "grad_norm": 0.26490768790245056, + "learning_rate": 0.0006536704730831974, + "loss": 0.1989, + "num_input_tokens_seen": 17335936, + "step": 8015 + }, + { + "epoch": 1.3083197389885808, + "grad_norm": 0.3598152995109558, + "learning_rate": 0.000654078303425775, + "loss": 0.2269, + "num_input_tokens_seen": 17346016, + "step": 8020 + }, + { + "epoch": 1.3091353996737358, + "grad_norm": 0.10805067420005798, + "learning_rate": 0.0006544861337683524, + "loss": 0.1456, + "num_input_tokens_seen": 17357728, + "step": 8025 + }, + { + "epoch": 1.3099510603588906, + "grad_norm": 0.07780245691537857, + "learning_rate": 0.0006548939641109298, + "loss": 0.094, + "num_input_tokens_seen": 17368032, + "step": 8030 + }, + { + "epoch": 1.3107667210440457, + "grad_norm": 0.41313520073890686, + "learning_rate": 0.0006553017944535074, + "loss": 0.1353, + "num_input_tokens_seen": 17378816, + "step": 8035 + }, + { + "epoch": 1.3115823817292007, + "grad_norm": 0.10756899416446686, + "learning_rate": 0.0006557096247960849, + "loss": 0.1648, + "num_input_tokens_seen": 17390048, + "step": 8040 + }, + { + "epoch": 1.3123980424143555, + "grad_norm": 0.5034075379371643, + "learning_rate": 0.0006561174551386622, + "loss": 0.1469, + "num_input_tokens_seen": 17399680, + "step": 8045 + }, + { + "epoch": 1.3132137030995106, + "grad_norm": 0.08964333683252335, + "learning_rate": 0.0006565252854812398, + "loss": 0.0941, + "num_input_tokens_seen": 17411424, + "step": 8050 + }, + { + "epoch": 1.3140293637846656, + "grad_norm": 0.029598180204629898, + "learning_rate": 0.0006569331158238173, + "loss": 0.0703, + "num_input_tokens_seen": 17422272, + "step": 8055 + }, + { + "epoch": 1.3148450244698204, + "grad_norm": 0.2906266450881958, + "learning_rate": 0.0006573409461663949, + "loss": 0.1844, + "num_input_tokens_seen": 17433216, + "step": 8060 + }, + { + "epoch": 1.3156606851549755, + "grad_norm": 0.033766523003578186, + "learning_rate": 0.0006577487765089722, + "loss": 0.0721, + "num_input_tokens_seen": 17443200, + "step": 8065 + }, + { + "epoch": 1.3164763458401305, + "grad_norm": 0.08947774022817612, + "learning_rate": 0.0006581566068515497, + "loss": 0.1417, + "num_input_tokens_seen": 17453792, + "step": 8070 + }, + { + "epoch": 1.3172920065252856, + "grad_norm": 0.14372272789478302, + "learning_rate": 0.0006585644371941273, + "loss": 0.1788, + "num_input_tokens_seen": 17464192, + "step": 8075 + }, + { + "epoch": 1.3181076672104406, + "grad_norm": 0.029002483934164047, + "learning_rate": 0.0006589722675367047, + "loss": 0.1642, + "num_input_tokens_seen": 17475200, + "step": 8080 + }, + { + "epoch": 1.3189233278955954, + "grad_norm": 0.2958846092224121, + "learning_rate": 0.0006593800978792823, + "loss": 0.1529, + "num_input_tokens_seen": 17486176, + "step": 8085 + }, + { + "epoch": 1.3197389885807504, + "grad_norm": 0.14247475564479828, + "learning_rate": 0.0006597879282218597, + "loss": 0.2228, + "num_input_tokens_seen": 17496416, + "step": 8090 + }, + { + "epoch": 1.3205546492659055, + "grad_norm": 0.2910315990447998, + "learning_rate": 0.0006601957585644372, + "loss": 0.1838, + "num_input_tokens_seen": 17507200, + "step": 8095 + }, + { + "epoch": 1.3213703099510603, + "grad_norm": 0.08594219386577606, + "learning_rate": 0.0006606035889070147, + "loss": 0.1321, + "num_input_tokens_seen": 17517792, + "step": 8100 + }, + { + "epoch": 1.3221859706362153, + "grad_norm": 0.026004578918218613, + "learning_rate": 0.0006610114192495922, + "loss": 0.12, + "num_input_tokens_seen": 17527808, + "step": 8105 + }, + { + "epoch": 1.3230016313213704, + "grad_norm": 0.030598606914281845, + "learning_rate": 0.0006614192495921697, + "loss": 0.124, + "num_input_tokens_seen": 17538560, + "step": 8110 + }, + { + "epoch": 1.3238172920065252, + "grad_norm": 0.12475190311670303, + "learning_rate": 0.0006618270799347471, + "loss": 0.1772, + "num_input_tokens_seen": 17550656, + "step": 8115 + }, + { + "epoch": 1.3246329526916802, + "grad_norm": 0.05746564269065857, + "learning_rate": 0.0006622349102773246, + "loss": 0.1185, + "num_input_tokens_seen": 17561824, + "step": 8120 + }, + { + "epoch": 1.3254486133768353, + "grad_norm": 0.2370694875717163, + "learning_rate": 0.0006626427406199022, + "loss": 0.1014, + "num_input_tokens_seen": 17573120, + "step": 8125 + }, + { + "epoch": 1.32626427406199, + "grad_norm": 0.3231610655784607, + "learning_rate": 0.0006630505709624797, + "loss": 0.1533, + "num_input_tokens_seen": 17584224, + "step": 8130 + }, + { + "epoch": 1.3270799347471451, + "grad_norm": 0.025710172951221466, + "learning_rate": 0.0006634584013050571, + "loss": 0.0276, + "num_input_tokens_seen": 17595232, + "step": 8135 + }, + { + "epoch": 1.3278955954323002, + "grad_norm": 0.22752685844898224, + "learning_rate": 0.0006638662316476346, + "loss": 0.1328, + "num_input_tokens_seen": 17606624, + "step": 8140 + }, + { + "epoch": 1.3287112561174552, + "grad_norm": 0.07197009772062302, + "learning_rate": 0.0006642740619902121, + "loss": 0.0577, + "num_input_tokens_seen": 17616864, + "step": 8145 + }, + { + "epoch": 1.32952691680261, + "grad_norm": 0.11145736277103424, + "learning_rate": 0.0006646818923327896, + "loss": 0.2121, + "num_input_tokens_seen": 17628416, + "step": 8150 + }, + { + "epoch": 1.330342577487765, + "grad_norm": 0.06519704312086105, + "learning_rate": 0.000665089722675367, + "loss": 0.1524, + "num_input_tokens_seen": 17637472, + "step": 8155 + }, + { + "epoch": 1.3311582381729201, + "grad_norm": 0.0630141869187355, + "learning_rate": 0.0006654975530179446, + "loss": 0.1842, + "num_input_tokens_seen": 17647744, + "step": 8160 + }, + { + "epoch": 1.3319738988580752, + "grad_norm": 0.10813954472541809, + "learning_rate": 0.0006659053833605221, + "loss": 0.0914, + "num_input_tokens_seen": 17657824, + "step": 8165 + }, + { + "epoch": 1.33278955954323, + "grad_norm": 0.0244632288813591, + "learning_rate": 0.0006663132137030995, + "loss": 0.0866, + "num_input_tokens_seen": 17669344, + "step": 8170 + }, + { + "epoch": 1.333605220228385, + "grad_norm": 0.019405458122491837, + "learning_rate": 0.000666721044045677, + "loss": 0.0706, + "num_input_tokens_seen": 17680000, + "step": 8175 + }, + { + "epoch": 1.33442088091354, + "grad_norm": 0.013781199231743813, + "learning_rate": 0.0006671288743882545, + "loss": 0.1345, + "num_input_tokens_seen": 17690368, + "step": 8180 + }, + { + "epoch": 1.3352365415986949, + "grad_norm": 0.012108061462640762, + "learning_rate": 0.0006675367047308319, + "loss": 0.0377, + "num_input_tokens_seen": 17701216, + "step": 8185 + }, + { + "epoch": 1.33605220228385, + "grad_norm": 0.10047098994255066, + "learning_rate": 0.0006679445350734095, + "loss": 0.2312, + "num_input_tokens_seen": 17712800, + "step": 8190 + }, + { + "epoch": 1.336867862969005, + "grad_norm": 0.10426725447177887, + "learning_rate": 0.000668352365415987, + "loss": 0.218, + "num_input_tokens_seen": 17723136, + "step": 8195 + }, + { + "epoch": 1.3376835236541598, + "grad_norm": 0.21119491755962372, + "learning_rate": 0.0006687601957585645, + "loss": 0.1494, + "num_input_tokens_seen": 17733792, + "step": 8200 + }, + { + "epoch": 1.3384991843393148, + "grad_norm": 0.060859113931655884, + "learning_rate": 0.0006691680261011419, + "loss": 0.1225, + "num_input_tokens_seen": 17744032, + "step": 8205 + }, + { + "epoch": 1.3393148450244698, + "grad_norm": 0.04675585404038429, + "learning_rate": 0.0006695758564437194, + "loss": 0.1787, + "num_input_tokens_seen": 17753856, + "step": 8210 + }, + { + "epoch": 1.3401305057096247, + "grad_norm": 0.09033242613077164, + "learning_rate": 0.000669983686786297, + "loss": 0.2053, + "num_input_tokens_seen": 17765184, + "step": 8215 + }, + { + "epoch": 1.3409461663947797, + "grad_norm": 0.2503712773323059, + "learning_rate": 0.0006703915171288743, + "loss": 0.1107, + "num_input_tokens_seen": 17776032, + "step": 8220 + }, + { + "epoch": 1.3417618270799347, + "grad_norm": 0.03463561087846756, + "learning_rate": 0.0006707993474714519, + "loss": 0.0904, + "num_input_tokens_seen": 17785472, + "step": 8225 + }, + { + "epoch": 1.3425774877650898, + "grad_norm": 0.6294701099395752, + "learning_rate": 0.0006712071778140294, + "loss": 0.1538, + "num_input_tokens_seen": 17796832, + "step": 8230 + }, + { + "epoch": 1.3433931484502448, + "grad_norm": 0.3345804214477539, + "learning_rate": 0.0006716150081566068, + "loss": 0.1497, + "num_input_tokens_seen": 17809344, + "step": 8235 + }, + { + "epoch": 1.3442088091353996, + "grad_norm": 0.47858479619026184, + "learning_rate": 0.0006720228384991843, + "loss": 0.3782, + "num_input_tokens_seen": 17820000, + "step": 8240 + }, + { + "epoch": 1.3450244698205547, + "grad_norm": 0.38808053731918335, + "learning_rate": 0.0006724306688417618, + "loss": 0.1865, + "num_input_tokens_seen": 17830912, + "step": 8245 + }, + { + "epoch": 1.3458401305057097, + "grad_norm": 0.24597151577472687, + "learning_rate": 0.0006728384991843394, + "loss": 0.1407, + "num_input_tokens_seen": 17841856, + "step": 8250 + }, + { + "epoch": 1.3466557911908645, + "grad_norm": 0.29200631380081177, + "learning_rate": 0.0006732463295269168, + "loss": 0.1813, + "num_input_tokens_seen": 17852320, + "step": 8255 + }, + { + "epoch": 1.3474714518760196, + "grad_norm": 0.02408430352807045, + "learning_rate": 0.0006736541598694943, + "loss": 0.1287, + "num_input_tokens_seen": 17862624, + "step": 8260 + }, + { + "epoch": 1.3482871125611746, + "grad_norm": 0.10565165430307388, + "learning_rate": 0.0006740619902120718, + "loss": 0.2301, + "num_input_tokens_seen": 17873280, + "step": 8265 + }, + { + "epoch": 1.3491027732463294, + "grad_norm": 0.07163897156715393, + "learning_rate": 0.0006744698205546492, + "loss": 0.1216, + "num_input_tokens_seen": 17884608, + "step": 8270 + }, + { + "epoch": 1.3499184339314845, + "grad_norm": 0.17496277391910553, + "learning_rate": 0.0006748776508972268, + "loss": 0.0676, + "num_input_tokens_seen": 17895072, + "step": 8275 + }, + { + "epoch": 1.3507340946166395, + "grad_norm": 0.14802797138690948, + "learning_rate": 0.0006752854812398043, + "loss": 0.0963, + "num_input_tokens_seen": 17905248, + "step": 8280 + }, + { + "epoch": 1.3515497553017943, + "grad_norm": 0.01929861307144165, + "learning_rate": 0.0006756933115823817, + "loss": 0.0899, + "num_input_tokens_seen": 17914336, + "step": 8285 + }, + { + "epoch": 1.3523654159869494, + "grad_norm": 0.129594624042511, + "learning_rate": 0.0006761011419249592, + "loss": 0.1605, + "num_input_tokens_seen": 17926112, + "step": 8290 + }, + { + "epoch": 1.3531810766721044, + "grad_norm": 0.23017330467700958, + "learning_rate": 0.0006765089722675367, + "loss": 0.0561, + "num_input_tokens_seen": 17937472, + "step": 8295 + }, + { + "epoch": 1.3539967373572595, + "grad_norm": 0.044130630791187286, + "learning_rate": 0.0006769168026101143, + "loss": 0.0399, + "num_input_tokens_seen": 17948960, + "step": 8300 + }, + { + "epoch": 1.3548123980424145, + "grad_norm": 0.32337111234664917, + "learning_rate": 0.0006773246329526917, + "loss": 0.0665, + "num_input_tokens_seen": 17959808, + "step": 8305 + }, + { + "epoch": 1.3556280587275693, + "grad_norm": 0.5042018890380859, + "learning_rate": 0.0006777324632952691, + "loss": 0.069, + "num_input_tokens_seen": 17972224, + "step": 8310 + }, + { + "epoch": 1.3564437194127243, + "grad_norm": 0.057626839727163315, + "learning_rate": 0.0006781402936378467, + "loss": 0.0513, + "num_input_tokens_seen": 17982528, + "step": 8315 + }, + { + "epoch": 1.3572593800978794, + "grad_norm": 0.011388307437300682, + "learning_rate": 0.0006785481239804242, + "loss": 0.192, + "num_input_tokens_seen": 17993408, + "step": 8320 + }, + { + "epoch": 1.3580750407830342, + "grad_norm": 0.13212941586971283, + "learning_rate": 0.0006789559543230017, + "loss": 0.054, + "num_input_tokens_seen": 18004288, + "step": 8325 + }, + { + "epoch": 1.3588907014681892, + "grad_norm": 0.025670086964964867, + "learning_rate": 0.0006793637846655791, + "loss": 0.2387, + "num_input_tokens_seen": 18015552, + "step": 8330 + }, + { + "epoch": 1.3597063621533443, + "grad_norm": 0.37728649377822876, + "learning_rate": 0.0006797716150081566, + "loss": 0.1111, + "num_input_tokens_seen": 18024800, + "step": 8335 + }, + { + "epoch": 1.360522022838499, + "grad_norm": 0.018351459875702858, + "learning_rate": 0.0006801794453507341, + "loss": 0.0376, + "num_input_tokens_seen": 18035072, + "step": 8340 + }, + { + "epoch": 1.3613376835236541, + "grad_norm": 0.08784783631563187, + "learning_rate": 0.0006805872756933116, + "loss": 0.057, + "num_input_tokens_seen": 18045728, + "step": 8345 + }, + { + "epoch": 1.3621533442088092, + "grad_norm": 0.06758838891983032, + "learning_rate": 0.000680995106035889, + "loss": 0.1178, + "num_input_tokens_seen": 18055936, + "step": 8350 + }, + { + "epoch": 1.362969004893964, + "grad_norm": 0.5933988690376282, + "learning_rate": 0.0006814029363784666, + "loss": 0.2268, + "num_input_tokens_seen": 18067680, + "step": 8355 + }, + { + "epoch": 1.363784665579119, + "grad_norm": 0.05332661420106888, + "learning_rate": 0.000681810766721044, + "loss": 0.1218, + "num_input_tokens_seen": 18078400, + "step": 8360 + }, + { + "epoch": 1.364600326264274, + "grad_norm": 0.16641493141651154, + "learning_rate": 0.0006822185970636216, + "loss": 0.199, + "num_input_tokens_seen": 18089568, + "step": 8365 + }, + { + "epoch": 1.3654159869494291, + "grad_norm": 0.14251184463500977, + "learning_rate": 0.0006826264274061991, + "loss": 0.1338, + "num_input_tokens_seen": 18100928, + "step": 8370 + }, + { + "epoch": 1.366231647634584, + "grad_norm": 0.21270228922367096, + "learning_rate": 0.0006830342577487764, + "loss": 0.0976, + "num_input_tokens_seen": 18111264, + "step": 8375 + }, + { + "epoch": 1.367047308319739, + "grad_norm": 0.0460171140730381, + "learning_rate": 0.000683442088091354, + "loss": 0.0314, + "num_input_tokens_seen": 18122080, + "step": 8380 + }, + { + "epoch": 1.367862969004894, + "grad_norm": 0.4137776792049408, + "learning_rate": 0.0006838499184339315, + "loss": 0.2193, + "num_input_tokens_seen": 18132416, + "step": 8385 + }, + { + "epoch": 1.368678629690049, + "grad_norm": 0.10444167256355286, + "learning_rate": 0.0006842577487765091, + "loss": 0.1293, + "num_input_tokens_seen": 18143584, + "step": 8390 + }, + { + "epoch": 1.3694942903752039, + "grad_norm": 0.06799294054508209, + "learning_rate": 0.0006846655791190864, + "loss": 0.2084, + "num_input_tokens_seen": 18153760, + "step": 8395 + }, + { + "epoch": 1.370309951060359, + "grad_norm": 0.0833783745765686, + "learning_rate": 0.0006850734094616639, + "loss": 0.1257, + "num_input_tokens_seen": 18165216, + "step": 8400 + }, + { + "epoch": 1.371125611745514, + "grad_norm": 0.031502969563007355, + "learning_rate": 0.0006854812398042415, + "loss": 0.1021, + "num_input_tokens_seen": 18176480, + "step": 8405 + }, + { + "epoch": 1.3719412724306688, + "grad_norm": 0.3506411910057068, + "learning_rate": 0.0006858890701468189, + "loss": 0.0863, + "num_input_tokens_seen": 18187648, + "step": 8410 + }, + { + "epoch": 1.3727569331158238, + "grad_norm": 0.45229458808898926, + "learning_rate": 0.0006862969004893965, + "loss": 0.068, + "num_input_tokens_seen": 18198752, + "step": 8415 + }, + { + "epoch": 1.3735725938009788, + "grad_norm": 0.2910847067832947, + "learning_rate": 0.0006867047308319739, + "loss": 0.1658, + "num_input_tokens_seen": 18210880, + "step": 8420 + }, + { + "epoch": 1.3743882544861337, + "grad_norm": 0.31154388189315796, + "learning_rate": 0.0006871125611745514, + "loss": 0.0971, + "num_input_tokens_seen": 18222464, + "step": 8425 + }, + { + "epoch": 1.3752039151712887, + "grad_norm": 0.03552580624818802, + "learning_rate": 0.0006875203915171289, + "loss": 0.122, + "num_input_tokens_seen": 18233216, + "step": 8430 + }, + { + "epoch": 1.3760195758564437, + "grad_norm": 0.1600065529346466, + "learning_rate": 0.0006879282218597064, + "loss": 0.0669, + "num_input_tokens_seen": 18244832, + "step": 8435 + }, + { + "epoch": 1.3768352365415986, + "grad_norm": 0.07445048540830612, + "learning_rate": 0.000688336052202284, + "loss": 0.1999, + "num_input_tokens_seen": 18254176, + "step": 8440 + }, + { + "epoch": 1.3776508972267536, + "grad_norm": 0.47163334488868713, + "learning_rate": 0.0006887438825448613, + "loss": 0.22, + "num_input_tokens_seen": 18265184, + "step": 8445 + }, + { + "epoch": 1.3784665579119086, + "grad_norm": 0.13729801774024963, + "learning_rate": 0.0006891517128874388, + "loss": 0.1316, + "num_input_tokens_seen": 18275424, + "step": 8450 + }, + { + "epoch": 1.3792822185970637, + "grad_norm": 0.39802855253219604, + "learning_rate": 0.0006895595432300164, + "loss": 0.1697, + "num_input_tokens_seen": 18285632, + "step": 8455 + }, + { + "epoch": 1.3800978792822187, + "grad_norm": 0.02712981216609478, + "learning_rate": 0.0006899673735725939, + "loss": 0.0611, + "num_input_tokens_seen": 18297536, + "step": 8460 + }, + { + "epoch": 1.3809135399673735, + "grad_norm": 0.1002969890832901, + "learning_rate": 0.0006903752039151713, + "loss": 0.0591, + "num_input_tokens_seen": 18307360, + "step": 8465 + }, + { + "epoch": 1.3817292006525286, + "grad_norm": 0.4240610599517822, + "learning_rate": 0.0006907830342577488, + "loss": 0.2264, + "num_input_tokens_seen": 18317792, + "step": 8470 + }, + { + "epoch": 1.3825448613376836, + "grad_norm": 0.41298383474349976, + "learning_rate": 0.0006911908646003263, + "loss": 0.2458, + "num_input_tokens_seen": 18329184, + "step": 8475 + }, + { + "epoch": 1.3833605220228384, + "grad_norm": 0.08825691044330597, + "learning_rate": 0.0006915986949429038, + "loss": 0.0931, + "num_input_tokens_seen": 18340224, + "step": 8480 + }, + { + "epoch": 1.3841761827079935, + "grad_norm": 0.31725960969924927, + "learning_rate": 0.0006920065252854812, + "loss": 0.1657, + "num_input_tokens_seen": 18352032, + "step": 8485 + }, + { + "epoch": 1.3849918433931485, + "grad_norm": 0.2807118892669678, + "learning_rate": 0.0006924143556280587, + "loss": 0.1332, + "num_input_tokens_seen": 18361728, + "step": 8490 + }, + { + "epoch": 1.3858075040783033, + "grad_norm": 0.20829269289970398, + "learning_rate": 0.0006928221859706362, + "loss": 0.0705, + "num_input_tokens_seen": 18371744, + "step": 8495 + }, + { + "epoch": 1.3866231647634584, + "grad_norm": 0.11227507889270782, + "learning_rate": 0.0006932300163132137, + "loss": 0.147, + "num_input_tokens_seen": 18382752, + "step": 8500 + }, + { + "epoch": 1.3874388254486134, + "grad_norm": 0.0697614997625351, + "learning_rate": 0.0006936378466557913, + "loss": 0.1329, + "num_input_tokens_seen": 18393152, + "step": 8505 + }, + { + "epoch": 1.3882544861337682, + "grad_norm": 0.23251527547836304, + "learning_rate": 0.0006940456769983687, + "loss": 0.0789, + "num_input_tokens_seen": 18403168, + "step": 8510 + }, + { + "epoch": 1.3890701468189233, + "grad_norm": 0.37753644585609436, + "learning_rate": 0.0006944535073409461, + "loss": 0.3207, + "num_input_tokens_seen": 18413664, + "step": 8515 + }, + { + "epoch": 1.3898858075040783, + "grad_norm": 0.17351676523685455, + "learning_rate": 0.0006948613376835237, + "loss": 0.112, + "num_input_tokens_seen": 18424288, + "step": 8520 + }, + { + "epoch": 1.3907014681892333, + "grad_norm": 0.21399036049842834, + "learning_rate": 0.0006952691680261012, + "loss": 0.0894, + "num_input_tokens_seen": 18434976, + "step": 8525 + }, + { + "epoch": 1.3915171288743884, + "grad_norm": 0.20291267335414886, + "learning_rate": 0.0006956769983686786, + "loss": 0.0709, + "num_input_tokens_seen": 18445440, + "step": 8530 + }, + { + "epoch": 1.3923327895595432, + "grad_norm": 0.04411192238330841, + "learning_rate": 0.0006960848287112561, + "loss": 0.0533, + "num_input_tokens_seen": 18455904, + "step": 8535 + }, + { + "epoch": 1.3931484502446982, + "grad_norm": 0.0695725828409195, + "learning_rate": 0.0006964926590538336, + "loss": 0.2081, + "num_input_tokens_seen": 18467200, + "step": 8540 + }, + { + "epoch": 1.3939641109298533, + "grad_norm": 0.0445864163339138, + "learning_rate": 0.0006969004893964112, + "loss": 0.1561, + "num_input_tokens_seen": 18478304, + "step": 8545 + }, + { + "epoch": 1.394779771615008, + "grad_norm": 0.4487917721271515, + "learning_rate": 0.0006973083197389885, + "loss": 0.1422, + "num_input_tokens_seen": 18489664, + "step": 8550 + }, + { + "epoch": 1.3955954323001631, + "grad_norm": 0.19967851042747498, + "learning_rate": 0.0006977161500815661, + "loss": 0.0737, + "num_input_tokens_seen": 18501568, + "step": 8555 + }, + { + "epoch": 1.3964110929853182, + "grad_norm": 0.35850760340690613, + "learning_rate": 0.0006981239804241436, + "loss": 0.1543, + "num_input_tokens_seen": 18513504, + "step": 8560 + }, + { + "epoch": 1.397226753670473, + "grad_norm": 0.15785285830497742, + "learning_rate": 0.000698531810766721, + "loss": 0.0517, + "num_input_tokens_seen": 18523616, + "step": 8565 + }, + { + "epoch": 1.398042414355628, + "grad_norm": 0.5196748971939087, + "learning_rate": 0.0006989396411092986, + "loss": 0.1858, + "num_input_tokens_seen": 18534464, + "step": 8570 + }, + { + "epoch": 1.398858075040783, + "grad_norm": 0.40732455253601074, + "learning_rate": 0.000699347471451876, + "loss": 0.1775, + "num_input_tokens_seen": 18545536, + "step": 8575 + }, + { + "epoch": 1.399673735725938, + "grad_norm": 0.3885939121246338, + "learning_rate": 0.0006997553017944536, + "loss": 0.0849, + "num_input_tokens_seen": 18556128, + "step": 8580 + }, + { + "epoch": 1.400489396411093, + "grad_norm": 0.27319011092185974, + "learning_rate": 0.000700163132137031, + "loss": 0.0458, + "num_input_tokens_seen": 18567232, + "step": 8585 + }, + { + "epoch": 1.401305057096248, + "grad_norm": 0.13056805729866028, + "learning_rate": 0.0007005709624796085, + "loss": 0.2172, + "num_input_tokens_seen": 18577824, + "step": 8590 + }, + { + "epoch": 1.402120717781403, + "grad_norm": 0.07697630673646927, + "learning_rate": 0.000700978792822186, + "loss": 0.1189, + "num_input_tokens_seen": 18588096, + "step": 8595 + }, + { + "epoch": 1.4029363784665578, + "grad_norm": 0.18461918830871582, + "learning_rate": 0.0007013866231647634, + "loss": 0.0482, + "num_input_tokens_seen": 18599744, + "step": 8600 + }, + { + "epoch": 1.4037520391517129, + "grad_norm": 0.3856184482574463, + "learning_rate": 0.000701794453507341, + "loss": 0.206, + "num_input_tokens_seen": 18610592, + "step": 8605 + }, + { + "epoch": 1.404567699836868, + "grad_norm": 0.06395883858203888, + "learning_rate": 0.0007022022838499185, + "loss": 0.1081, + "num_input_tokens_seen": 18620896, + "step": 8610 + }, + { + "epoch": 1.405383360522023, + "grad_norm": 0.011114265769720078, + "learning_rate": 0.000702610114192496, + "loss": 0.0479, + "num_input_tokens_seen": 18630848, + "step": 8615 + }, + { + "epoch": 1.4061990212071778, + "grad_norm": 0.024176809936761856, + "learning_rate": 0.0007030179445350734, + "loss": 0.0549, + "num_input_tokens_seen": 18641600, + "step": 8620 + }, + { + "epoch": 1.4070146818923328, + "grad_norm": 0.6192775964736938, + "learning_rate": 0.0007034257748776509, + "loss": 0.1453, + "num_input_tokens_seen": 18652448, + "step": 8625 + }, + { + "epoch": 1.4078303425774878, + "grad_norm": 0.04966207593679428, + "learning_rate": 0.0007038336052202285, + "loss": 0.1306, + "num_input_tokens_seen": 18663392, + "step": 8630 + }, + { + "epoch": 1.4086460032626427, + "grad_norm": 0.016872374340891838, + "learning_rate": 0.0007042414355628059, + "loss": 0.063, + "num_input_tokens_seen": 18674528, + "step": 8635 + }, + { + "epoch": 1.4094616639477977, + "grad_norm": 0.02471073530614376, + "learning_rate": 0.0007046492659053833, + "loss": 0.0279, + "num_input_tokens_seen": 18685632, + "step": 8640 + }, + { + "epoch": 1.4102773246329527, + "grad_norm": 0.012303034774959087, + "learning_rate": 0.0007050570962479609, + "loss": 0.071, + "num_input_tokens_seen": 18695776, + "step": 8645 + }, + { + "epoch": 1.4110929853181076, + "grad_norm": 0.098455511033535, + "learning_rate": 0.0007054649265905384, + "loss": 0.249, + "num_input_tokens_seen": 18705056, + "step": 8650 + }, + { + "epoch": 1.4119086460032626, + "grad_norm": 0.3847644627094269, + "learning_rate": 0.0007058727569331158, + "loss": 0.1436, + "num_input_tokens_seen": 18714688, + "step": 8655 + }, + { + "epoch": 1.4127243066884176, + "grad_norm": 0.024373041465878487, + "learning_rate": 0.0007062805872756933, + "loss": 0.102, + "num_input_tokens_seen": 18725984, + "step": 8660 + }, + { + "epoch": 1.4135399673735725, + "grad_norm": 0.15685175359249115, + "learning_rate": 0.0007066884176182708, + "loss": 0.1985, + "num_input_tokens_seen": 18736224, + "step": 8665 + }, + { + "epoch": 1.4143556280587275, + "grad_norm": 0.1535254567861557, + "learning_rate": 0.0007070962479608483, + "loss": 0.0492, + "num_input_tokens_seen": 18748832, + "step": 8670 + }, + { + "epoch": 1.4151712887438825, + "grad_norm": 0.3047678470611572, + "learning_rate": 0.0007075040783034258, + "loss": 0.1792, + "num_input_tokens_seen": 18757792, + "step": 8675 + }, + { + "epoch": 1.4159869494290376, + "grad_norm": 0.28612783551216125, + "learning_rate": 0.0007079119086460033, + "loss": 0.249, + "num_input_tokens_seen": 18768160, + "step": 8680 + }, + { + "epoch": 1.4168026101141926, + "grad_norm": 0.059516116976737976, + "learning_rate": 0.0007083197389885808, + "loss": 0.0793, + "num_input_tokens_seen": 18778336, + "step": 8685 + }, + { + "epoch": 1.4176182707993474, + "grad_norm": 0.2646983861923218, + "learning_rate": 0.0007087275693311582, + "loss": 0.1472, + "num_input_tokens_seen": 18789632, + "step": 8690 + }, + { + "epoch": 1.4184339314845025, + "grad_norm": 0.15517950057983398, + "learning_rate": 0.0007091353996737358, + "loss": 0.2406, + "num_input_tokens_seen": 18800224, + "step": 8695 + }, + { + "epoch": 1.4192495921696575, + "grad_norm": 0.04616353288292885, + "learning_rate": 0.0007095432300163133, + "loss": 0.108, + "num_input_tokens_seen": 18810208, + "step": 8700 + }, + { + "epoch": 1.4200652528548123, + "grad_norm": 0.04655442386865616, + "learning_rate": 0.0007099510603588906, + "loss": 0.065, + "num_input_tokens_seen": 18821760, + "step": 8705 + }, + { + "epoch": 1.4208809135399674, + "grad_norm": 0.2469894289970398, + "learning_rate": 0.0007103588907014682, + "loss": 0.2837, + "num_input_tokens_seen": 18833536, + "step": 8710 + }, + { + "epoch": 1.4216965742251224, + "grad_norm": 0.11056669801473618, + "learning_rate": 0.0007107667210440457, + "loss": 0.1823, + "num_input_tokens_seen": 18844576, + "step": 8715 + }, + { + "epoch": 1.4225122349102772, + "grad_norm": 0.01934995874762535, + "learning_rate": 0.0007111745513866232, + "loss": 0.0429, + "num_input_tokens_seen": 18856096, + "step": 8720 + }, + { + "epoch": 1.4233278955954323, + "grad_norm": 0.12118564546108246, + "learning_rate": 0.0007115823817292006, + "loss": 0.134, + "num_input_tokens_seen": 18866944, + "step": 8725 + }, + { + "epoch": 1.4241435562805873, + "grad_norm": 0.09233374893665314, + "learning_rate": 0.0007119902120717781, + "loss": 0.189, + "num_input_tokens_seen": 18878464, + "step": 8730 + }, + { + "epoch": 1.4249592169657421, + "grad_norm": 0.10285761952400208, + "learning_rate": 0.0007123980424143557, + "loss": 0.0487, + "num_input_tokens_seen": 18888992, + "step": 8735 + }, + { + "epoch": 1.4257748776508972, + "grad_norm": 0.03624412789940834, + "learning_rate": 0.0007128058727569331, + "loss": 0.1002, + "num_input_tokens_seen": 18899776, + "step": 8740 + }, + { + "epoch": 1.4265905383360522, + "grad_norm": 0.6590876579284668, + "learning_rate": 0.0007132137030995107, + "loss": 0.0809, + "num_input_tokens_seen": 18910528, + "step": 8745 + }, + { + "epoch": 1.4274061990212072, + "grad_norm": 0.06478895992040634, + "learning_rate": 0.0007136215334420881, + "loss": 0.0835, + "num_input_tokens_seen": 18922048, + "step": 8750 + }, + { + "epoch": 1.4282218597063623, + "grad_norm": 0.020453322678804398, + "learning_rate": 0.0007140293637846655, + "loss": 0.0671, + "num_input_tokens_seen": 18933472, + "step": 8755 + }, + { + "epoch": 1.429037520391517, + "grad_norm": 0.46330201625823975, + "learning_rate": 0.0007144371941272431, + "loss": 0.1484, + "num_input_tokens_seen": 18944160, + "step": 8760 + }, + { + "epoch": 1.4298531810766721, + "grad_norm": 0.5292882323265076, + "learning_rate": 0.0007148450244698206, + "loss": 0.1076, + "num_input_tokens_seen": 18954688, + "step": 8765 + }, + { + "epoch": 1.4306688417618272, + "grad_norm": 0.12839145958423615, + "learning_rate": 0.0007152528548123982, + "loss": 0.0256, + "num_input_tokens_seen": 18965728, + "step": 8770 + }, + { + "epoch": 1.431484502446982, + "grad_norm": 0.026854708790779114, + "learning_rate": 0.0007156606851549755, + "loss": 0.1457, + "num_input_tokens_seen": 18977088, + "step": 8775 + }, + { + "epoch": 1.432300163132137, + "grad_norm": 0.0423266664147377, + "learning_rate": 0.000716068515497553, + "loss": 0.14, + "num_input_tokens_seen": 18989056, + "step": 8780 + }, + { + "epoch": 1.433115823817292, + "grad_norm": 0.022482803091406822, + "learning_rate": 0.0007164763458401306, + "loss": 0.0264, + "num_input_tokens_seen": 18999936, + "step": 8785 + }, + { + "epoch": 1.433931484502447, + "grad_norm": 0.009021877311170101, + "learning_rate": 0.000716884176182708, + "loss": 0.0196, + "num_input_tokens_seen": 19011328, + "step": 8790 + }, + { + "epoch": 1.434747145187602, + "grad_norm": 0.34267958998680115, + "learning_rate": 0.0007172920065252854, + "loss": 0.0449, + "num_input_tokens_seen": 19022176, + "step": 8795 + }, + { + "epoch": 1.435562805872757, + "grad_norm": 0.0407864935696125, + "learning_rate": 0.000717699836867863, + "loss": 0.2754, + "num_input_tokens_seen": 19032832, + "step": 8800 + }, + { + "epoch": 1.4363784665579118, + "grad_norm": 0.06768330931663513, + "learning_rate": 0.0007181076672104405, + "loss": 0.0412, + "num_input_tokens_seen": 19044384, + "step": 8805 + }, + { + "epoch": 1.4371941272430668, + "grad_norm": 0.03693476691842079, + "learning_rate": 0.000718515497553018, + "loss": 0.1054, + "num_input_tokens_seen": 19054816, + "step": 8810 + }, + { + "epoch": 1.4380097879282219, + "grad_norm": 0.05763913691043854, + "learning_rate": 0.0007189233278955954, + "loss": 0.031, + "num_input_tokens_seen": 19065120, + "step": 8815 + }, + { + "epoch": 1.438825448613377, + "grad_norm": 0.26617172360420227, + "learning_rate": 0.0007193311582381729, + "loss": 0.389, + "num_input_tokens_seen": 19075872, + "step": 8820 + }, + { + "epoch": 1.4396411092985317, + "grad_norm": 0.06919386237859726, + "learning_rate": 0.0007197389885807504, + "loss": 0.0492, + "num_input_tokens_seen": 19087808, + "step": 8825 + }, + { + "epoch": 1.4404567699836868, + "grad_norm": 0.2736736238002777, + "learning_rate": 0.0007201468189233279, + "loss": 0.3008, + "num_input_tokens_seen": 19099488, + "step": 8830 + }, + { + "epoch": 1.4412724306688418, + "grad_norm": 0.05337755009531975, + "learning_rate": 0.0007205546492659055, + "loss": 0.0528, + "num_input_tokens_seen": 19111104, + "step": 8835 + }, + { + "epoch": 1.4420880913539968, + "grad_norm": 0.03262023627758026, + "learning_rate": 0.0007209624796084829, + "loss": 0.0659, + "num_input_tokens_seen": 19122656, + "step": 8840 + }, + { + "epoch": 1.4429037520391517, + "grad_norm": 0.19894321262836456, + "learning_rate": 0.0007213703099510603, + "loss": 0.2232, + "num_input_tokens_seen": 19133728, + "step": 8845 + }, + { + "epoch": 1.4437194127243067, + "grad_norm": 0.031567685306072235, + "learning_rate": 0.0007217781402936379, + "loss": 0.0757, + "num_input_tokens_seen": 19144896, + "step": 8850 + }, + { + "epoch": 1.4445350734094617, + "grad_norm": 0.050865061581134796, + "learning_rate": 0.0007221859706362154, + "loss": 0.0726, + "num_input_tokens_seen": 19155040, + "step": 8855 + }, + { + "epoch": 1.4453507340946166, + "grad_norm": 0.3916897177696228, + "learning_rate": 0.0007225938009787928, + "loss": 0.1051, + "num_input_tokens_seen": 19165216, + "step": 8860 + }, + { + "epoch": 1.4461663947797716, + "grad_norm": 0.02855168841779232, + "learning_rate": 0.0007230016313213703, + "loss": 0.0636, + "num_input_tokens_seen": 19175904, + "step": 8865 + }, + { + "epoch": 1.4469820554649266, + "grad_norm": 0.009972508065402508, + "learning_rate": 0.0007234094616639478, + "loss": 0.1216, + "num_input_tokens_seen": 19185312, + "step": 8870 + }, + { + "epoch": 1.4477977161500815, + "grad_norm": 0.05328962206840515, + "learning_rate": 0.0007238172920065254, + "loss": 0.1923, + "num_input_tokens_seen": 19196768, + "step": 8875 + }, + { + "epoch": 1.4486133768352365, + "grad_norm": 0.20842105150222778, + "learning_rate": 0.0007242251223491027, + "loss": 0.1206, + "num_input_tokens_seen": 19207360, + "step": 8880 + }, + { + "epoch": 1.4494290375203915, + "grad_norm": 0.298380047082901, + "learning_rate": 0.0007246329526916803, + "loss": 0.097, + "num_input_tokens_seen": 19217792, + "step": 8885 + }, + { + "epoch": 1.4502446982055464, + "grad_norm": 0.056603509932756424, + "learning_rate": 0.0007250407830342578, + "loss": 0.0887, + "num_input_tokens_seen": 19229920, + "step": 8890 + }, + { + "epoch": 1.4510603588907014, + "grad_norm": 0.028075775131583214, + "learning_rate": 0.0007254486133768352, + "loss": 0.0862, + "num_input_tokens_seen": 19241952, + "step": 8895 + }, + { + "epoch": 1.4518760195758564, + "grad_norm": 0.04573468863964081, + "learning_rate": 0.0007258564437194128, + "loss": 0.157, + "num_input_tokens_seen": 19252320, + "step": 8900 + }, + { + "epoch": 1.4526916802610115, + "grad_norm": 0.06405018270015717, + "learning_rate": 0.0007262642740619902, + "loss": 0.0865, + "num_input_tokens_seen": 19262912, + "step": 8905 + }, + { + "epoch": 1.4535073409461665, + "grad_norm": 0.17091864347457886, + "learning_rate": 0.0007266721044045678, + "loss": 0.2396, + "num_input_tokens_seen": 19273792, + "step": 8910 + }, + { + "epoch": 1.4543230016313213, + "grad_norm": 0.15708409249782562, + "learning_rate": 0.0007270799347471452, + "loss": 0.2161, + "num_input_tokens_seen": 19284448, + "step": 8915 + }, + { + "epoch": 1.4551386623164764, + "grad_norm": 0.05929143726825714, + "learning_rate": 0.0007274877650897227, + "loss": 0.1374, + "num_input_tokens_seen": 19295712, + "step": 8920 + }, + { + "epoch": 1.4559543230016314, + "grad_norm": 0.03152371942996979, + "learning_rate": 0.0007278955954323002, + "loss": 0.0991, + "num_input_tokens_seen": 19307520, + "step": 8925 + }, + { + "epoch": 1.4567699836867862, + "grad_norm": 0.06505519896745682, + "learning_rate": 0.0007283034257748776, + "loss": 0.0437, + "num_input_tokens_seen": 19319008, + "step": 8930 + }, + { + "epoch": 1.4575856443719413, + "grad_norm": 0.1482236683368683, + "learning_rate": 0.0007287112561174551, + "loss": 0.0816, + "num_input_tokens_seen": 19329440, + "step": 8935 + }, + { + "epoch": 1.4584013050570963, + "grad_norm": 0.10577386617660522, + "learning_rate": 0.0007291190864600327, + "loss": 0.2586, + "num_input_tokens_seen": 19339616, + "step": 8940 + }, + { + "epoch": 1.4592169657422511, + "grad_norm": 0.18734675645828247, + "learning_rate": 0.00072952691680261, + "loss": 0.0709, + "num_input_tokens_seen": 19351136, + "step": 8945 + }, + { + "epoch": 1.4600326264274062, + "grad_norm": 0.052861470729112625, + "learning_rate": 0.0007299347471451876, + "loss": 0.1096, + "num_input_tokens_seen": 19360768, + "step": 8950 + }, + { + "epoch": 1.4608482871125612, + "grad_norm": 0.02653975412249565, + "learning_rate": 0.0007303425774877651, + "loss": 0.0464, + "num_input_tokens_seen": 19370688, + "step": 8955 + }, + { + "epoch": 1.461663947797716, + "grad_norm": 0.04796244576573372, + "learning_rate": 0.0007307504078303426, + "loss": 0.0705, + "num_input_tokens_seen": 19382336, + "step": 8960 + }, + { + "epoch": 1.462479608482871, + "grad_norm": 0.02996446006000042, + "learning_rate": 0.0007311582381729201, + "loss": 0.0502, + "num_input_tokens_seen": 19393152, + "step": 8965 + }, + { + "epoch": 1.463295269168026, + "grad_norm": 0.13484928011894226, + "learning_rate": 0.0007315660685154975, + "loss": 0.116, + "num_input_tokens_seen": 19403744, + "step": 8970 + }, + { + "epoch": 1.4641109298531811, + "grad_norm": 0.0562857910990715, + "learning_rate": 0.0007319738988580751, + "loss": 0.0506, + "num_input_tokens_seen": 19415008, + "step": 8975 + }, + { + "epoch": 1.4649265905383362, + "grad_norm": 0.11413650959730148, + "learning_rate": 0.0007323817292006525, + "loss": 0.1563, + "num_input_tokens_seen": 19425888, + "step": 8980 + }, + { + "epoch": 1.465742251223491, + "grad_norm": 0.2681230902671814, + "learning_rate": 0.00073278955954323, + "loss": 0.1464, + "num_input_tokens_seen": 19437280, + "step": 8985 + }, + { + "epoch": 1.466557911908646, + "grad_norm": 0.055149346590042114, + "learning_rate": 0.0007331973898858076, + "loss": 0.2557, + "num_input_tokens_seen": 19448544, + "step": 8990 + }, + { + "epoch": 1.467373572593801, + "grad_norm": 0.07654145359992981, + "learning_rate": 0.000733605220228385, + "loss": 0.1016, + "num_input_tokens_seen": 19459264, + "step": 8995 + }, + { + "epoch": 1.468189233278956, + "grad_norm": 0.07035654783248901, + "learning_rate": 0.0007340130505709625, + "loss": 0.1717, + "num_input_tokens_seen": 19468896, + "step": 9000 + }, + { + "epoch": 1.469004893964111, + "grad_norm": 0.3256077170372009, + "learning_rate": 0.00073442088091354, + "loss": 0.2185, + "num_input_tokens_seen": 19480032, + "step": 9005 + }, + { + "epoch": 1.469820554649266, + "grad_norm": 0.08641856163740158, + "learning_rate": 0.0007348287112561175, + "loss": 0.1296, + "num_input_tokens_seen": 19490816, + "step": 9010 + }, + { + "epoch": 1.4706362153344208, + "grad_norm": 0.4245319068431854, + "learning_rate": 0.0007352365415986949, + "loss": 0.1821, + "num_input_tokens_seen": 19501248, + "step": 9015 + }, + { + "epoch": 1.4714518760195758, + "grad_norm": 0.057510893791913986, + "learning_rate": 0.0007356443719412724, + "loss": 0.0404, + "num_input_tokens_seen": 19513152, + "step": 9020 + }, + { + "epoch": 1.4722675367047309, + "grad_norm": 0.23167626559734344, + "learning_rate": 0.00073605220228385, + "loss": 0.1153, + "num_input_tokens_seen": 19523776, + "step": 9025 + }, + { + "epoch": 1.4730831973898857, + "grad_norm": 0.03532523289322853, + "learning_rate": 0.0007364600326264275, + "loss": 0.1117, + "num_input_tokens_seen": 19535072, + "step": 9030 + }, + { + "epoch": 1.4738988580750407, + "grad_norm": 0.015089893713593483, + "learning_rate": 0.0007368678629690048, + "loss": 0.0937, + "num_input_tokens_seen": 19545696, + "step": 9035 + }, + { + "epoch": 1.4747145187601958, + "grad_norm": 0.4665493667125702, + "learning_rate": 0.0007372756933115824, + "loss": 0.1995, + "num_input_tokens_seen": 19554944, + "step": 9040 + }, + { + "epoch": 1.4755301794453508, + "grad_norm": 0.21652501821517944, + "learning_rate": 0.0007376835236541599, + "loss": 0.2747, + "num_input_tokens_seen": 19566144, + "step": 9045 + }, + { + "epoch": 1.4763458401305056, + "grad_norm": 0.329572468996048, + "learning_rate": 0.0007380913539967374, + "loss": 0.1159, + "num_input_tokens_seen": 19576256, + "step": 9050 + }, + { + "epoch": 1.4771615008156607, + "grad_norm": 0.13731348514556885, + "learning_rate": 0.0007384991843393149, + "loss": 0.0869, + "num_input_tokens_seen": 19587264, + "step": 9055 + }, + { + "epoch": 1.4779771615008157, + "grad_norm": 0.08550713956356049, + "learning_rate": 0.0007389070146818923, + "loss": 0.087, + "num_input_tokens_seen": 19599904, + "step": 9060 + }, + { + "epoch": 1.4787928221859707, + "grad_norm": 0.08178147673606873, + "learning_rate": 0.0007393148450244699, + "loss": 0.1437, + "num_input_tokens_seen": 19610848, + "step": 9065 + }, + { + "epoch": 1.4796084828711256, + "grad_norm": 0.28226912021636963, + "learning_rate": 0.0007397226753670473, + "loss": 0.1054, + "num_input_tokens_seen": 19621344, + "step": 9070 + }, + { + "epoch": 1.4804241435562806, + "grad_norm": 0.05449576675891876, + "learning_rate": 0.0007401305057096248, + "loss": 0.1941, + "num_input_tokens_seen": 19631616, + "step": 9075 + }, + { + "epoch": 1.4812398042414356, + "grad_norm": 0.24325771629810333, + "learning_rate": 0.0007405383360522023, + "loss": 0.2353, + "num_input_tokens_seen": 19642528, + "step": 9080 + }, + { + "epoch": 1.4820554649265905, + "grad_norm": 0.07389519363641739, + "learning_rate": 0.0007409461663947797, + "loss": 0.1516, + "num_input_tokens_seen": 19651840, + "step": 9085 + }, + { + "epoch": 1.4828711256117455, + "grad_norm": 0.08990222960710526, + "learning_rate": 0.0007413539967373573, + "loss": 0.0684, + "num_input_tokens_seen": 19662848, + "step": 9090 + }, + { + "epoch": 1.4836867862969005, + "grad_norm": 0.04385393112897873, + "learning_rate": 0.0007417618270799348, + "loss": 0.1173, + "num_input_tokens_seen": 19673984, + "step": 9095 + }, + { + "epoch": 1.4845024469820554, + "grad_norm": 0.06532658636569977, + "learning_rate": 0.0007421696574225123, + "loss": 0.0915, + "num_input_tokens_seen": 19685504, + "step": 9100 + }, + { + "epoch": 1.4853181076672104, + "grad_norm": 0.025229470804333687, + "learning_rate": 0.0007425774877650897, + "loss": 0.0416, + "num_input_tokens_seen": 19696832, + "step": 9105 + }, + { + "epoch": 1.4861337683523654, + "grad_norm": 0.35470104217529297, + "learning_rate": 0.0007429853181076672, + "loss": 0.1424, + "num_input_tokens_seen": 19708064, + "step": 9110 + }, + { + "epoch": 1.4869494290375203, + "grad_norm": 0.34158971905708313, + "learning_rate": 0.0007433931484502448, + "loss": 0.2832, + "num_input_tokens_seen": 19719264, + "step": 9115 + }, + { + "epoch": 1.4877650897226753, + "grad_norm": 0.041290439665317535, + "learning_rate": 0.0007438009787928222, + "loss": 0.0405, + "num_input_tokens_seen": 19729280, + "step": 9120 + }, + { + "epoch": 1.4885807504078303, + "grad_norm": 0.03599981218576431, + "learning_rate": 0.0007442088091353996, + "loss": 0.0781, + "num_input_tokens_seen": 19739840, + "step": 9125 + }, + { + "epoch": 1.4893964110929854, + "grad_norm": 0.11953336000442505, + "learning_rate": 0.0007446166394779772, + "loss": 0.0411, + "num_input_tokens_seen": 19750304, + "step": 9130 + }, + { + "epoch": 1.4902120717781404, + "grad_norm": 0.14723843336105347, + "learning_rate": 0.0007450244698205547, + "loss": 0.1182, + "num_input_tokens_seen": 19760288, + "step": 9135 + }, + { + "epoch": 1.4910277324632952, + "grad_norm": 0.07382355630397797, + "learning_rate": 0.0007454323001631322, + "loss": 0.0367, + "num_input_tokens_seen": 19770080, + "step": 9140 + }, + { + "epoch": 1.4918433931484503, + "grad_norm": 0.32175618410110474, + "learning_rate": 0.0007458401305057096, + "loss": 0.1103, + "num_input_tokens_seen": 19780512, + "step": 9145 + }, + { + "epoch": 1.4926590538336053, + "grad_norm": 0.28489458560943604, + "learning_rate": 0.0007462479608482871, + "loss": 0.1126, + "num_input_tokens_seen": 19792032, + "step": 9150 + }, + { + "epoch": 1.4934747145187601, + "grad_norm": 0.07463068515062332, + "learning_rate": 0.0007466557911908646, + "loss": 0.1008, + "num_input_tokens_seen": 19802496, + "step": 9155 + }, + { + "epoch": 1.4942903752039152, + "grad_norm": 0.2966947853565216, + "learning_rate": 0.0007470636215334421, + "loss": 0.1891, + "num_input_tokens_seen": 19813024, + "step": 9160 + }, + { + "epoch": 1.4951060358890702, + "grad_norm": 0.05433628708124161, + "learning_rate": 0.0007474714518760197, + "loss": 0.1489, + "num_input_tokens_seen": 19823968, + "step": 9165 + }, + { + "epoch": 1.495921696574225, + "grad_norm": 0.10637657344341278, + "learning_rate": 0.0007478792822185971, + "loss": 0.0561, + "num_input_tokens_seen": 19833856, + "step": 9170 + }, + { + "epoch": 1.49673735725938, + "grad_norm": 0.36320605874061584, + "learning_rate": 0.0007482871125611745, + "loss": 0.1962, + "num_input_tokens_seen": 19843872, + "step": 9175 + }, + { + "epoch": 1.497553017944535, + "grad_norm": 0.04385019838809967, + "learning_rate": 0.0007486949429037521, + "loss": 0.0605, + "num_input_tokens_seen": 19854144, + "step": 9180 + }, + { + "epoch": 1.49836867862969, + "grad_norm": 0.04330093413591385, + "learning_rate": 0.0007491027732463296, + "loss": 0.0296, + "num_input_tokens_seen": 19865184, + "step": 9185 + }, + { + "epoch": 1.499184339314845, + "grad_norm": 0.32716479897499084, + "learning_rate": 0.000749510603588907, + "loss": 0.2863, + "num_input_tokens_seen": 19874528, + "step": 9190 + }, + { + "epoch": 1.5, + "grad_norm": 0.042429886758327484, + "learning_rate": 0.0007499184339314845, + "loss": 0.1687, + "num_input_tokens_seen": 19885184, + "step": 9195 + }, + { + "epoch": 1.5008156606851548, + "grad_norm": 0.16900426149368286, + "learning_rate": 0.000750326264274062, + "loss": 0.1278, + "num_input_tokens_seen": 19896768, + "step": 9200 + }, + { + "epoch": 1.50163132137031, + "grad_norm": 0.04221004992723465, + "learning_rate": 0.0007507340946166395, + "loss": 0.153, + "num_input_tokens_seen": 19906848, + "step": 9205 + }, + { + "epoch": 1.502446982055465, + "grad_norm": 0.038230542093515396, + "learning_rate": 0.000751141924959217, + "loss": 0.1196, + "num_input_tokens_seen": 19917728, + "step": 9210 + }, + { + "epoch": 1.50326264274062, + "grad_norm": 0.20852527022361755, + "learning_rate": 0.0007515497553017944, + "loss": 0.343, + "num_input_tokens_seen": 19928224, + "step": 9215 + }, + { + "epoch": 1.504078303425775, + "grad_norm": 0.134451761841774, + "learning_rate": 0.000751957585644372, + "loss": 0.1256, + "num_input_tokens_seen": 19939904, + "step": 9220 + }, + { + "epoch": 1.5048939641109298, + "grad_norm": 0.071271613240242, + "learning_rate": 0.0007523654159869494, + "loss": 0.1314, + "num_input_tokens_seen": 19951360, + "step": 9225 + }, + { + "epoch": 1.5057096247960848, + "grad_norm": 0.12655316293239594, + "learning_rate": 0.000752773246329527, + "loss": 0.261, + "num_input_tokens_seen": 19961792, + "step": 9230 + }, + { + "epoch": 1.5065252854812399, + "grad_norm": 0.04052901268005371, + "learning_rate": 0.0007531810766721044, + "loss": 0.0979, + "num_input_tokens_seen": 19972288, + "step": 9235 + }, + { + "epoch": 1.5073409461663947, + "grad_norm": 0.6805994510650635, + "learning_rate": 0.0007535889070146818, + "loss": 0.1843, + "num_input_tokens_seen": 19982688, + "step": 9240 + }, + { + "epoch": 1.5081566068515497, + "grad_norm": 0.07349798828363419, + "learning_rate": 0.0007539967373572594, + "loss": 0.1171, + "num_input_tokens_seen": 19992992, + "step": 9245 + }, + { + "epoch": 1.5089722675367048, + "grad_norm": 0.3114171028137207, + "learning_rate": 0.0007544045676998369, + "loss": 0.1579, + "num_input_tokens_seen": 20004384, + "step": 9250 + }, + { + "epoch": 1.5097879282218596, + "grad_norm": 0.045912064611911774, + "learning_rate": 0.0007548123980424145, + "loss": 0.2534, + "num_input_tokens_seen": 20014784, + "step": 9255 + }, + { + "epoch": 1.5106035889070146, + "grad_norm": 0.06314994394779205, + "learning_rate": 0.0007552202283849918, + "loss": 0.0856, + "num_input_tokens_seen": 20025536, + "step": 9260 + }, + { + "epoch": 1.5114192495921697, + "grad_norm": 0.07371212542057037, + "learning_rate": 0.0007556280587275693, + "loss": 0.0728, + "num_input_tokens_seen": 20035712, + "step": 9265 + }, + { + "epoch": 1.5122349102773245, + "grad_norm": 0.08201035857200623, + "learning_rate": 0.0007560358890701469, + "loss": 0.1516, + "num_input_tokens_seen": 20046912, + "step": 9270 + }, + { + "epoch": 1.5130505709624797, + "grad_norm": 0.05865674465894699, + "learning_rate": 0.0007564437194127243, + "loss": 0.0503, + "num_input_tokens_seen": 20058304, + "step": 9275 + }, + { + "epoch": 1.5138662316476346, + "grad_norm": 0.26975804567337036, + "learning_rate": 0.0007568515497553018, + "loss": 0.0944, + "num_input_tokens_seen": 20068256, + "step": 9280 + }, + { + "epoch": 1.5146818923327896, + "grad_norm": 0.02460741624236107, + "learning_rate": 0.0007572593800978793, + "loss": 0.1286, + "num_input_tokens_seen": 20079232, + "step": 9285 + }, + { + "epoch": 1.5154975530179446, + "grad_norm": 0.010626477189362049, + "learning_rate": 0.0007576672104404568, + "loss": 0.0225, + "num_input_tokens_seen": 20090176, + "step": 9290 + }, + { + "epoch": 1.5163132137030995, + "grad_norm": 0.21808360517024994, + "learning_rate": 0.0007580750407830343, + "loss": 0.09, + "num_input_tokens_seen": 20100640, + "step": 9295 + }, + { + "epoch": 1.5171288743882545, + "grad_norm": 0.31197798252105713, + "learning_rate": 0.0007584828711256117, + "loss": 0.1233, + "num_input_tokens_seen": 20110656, + "step": 9300 + }, + { + "epoch": 1.5179445350734095, + "grad_norm": 0.07159364223480225, + "learning_rate": 0.0007588907014681893, + "loss": 0.1392, + "num_input_tokens_seen": 20122240, + "step": 9305 + }, + { + "epoch": 1.5187601957585644, + "grad_norm": 0.011855943128466606, + "learning_rate": 0.0007592985318107667, + "loss": 0.1328, + "num_input_tokens_seen": 20133344, + "step": 9310 + }, + { + "epoch": 1.5195758564437194, + "grad_norm": 0.39659756422042847, + "learning_rate": 0.0007597063621533442, + "loss": 0.1872, + "num_input_tokens_seen": 20142944, + "step": 9315 + }, + { + "epoch": 1.5203915171288744, + "grad_norm": 0.23843838274478912, + "learning_rate": 0.0007601141924959218, + "loss": 0.1653, + "num_input_tokens_seen": 20153728, + "step": 9320 + }, + { + "epoch": 1.5212071778140293, + "grad_norm": 0.17858515679836273, + "learning_rate": 0.0007605220228384992, + "loss": 0.1768, + "num_input_tokens_seen": 20163904, + "step": 9325 + }, + { + "epoch": 1.5220228384991843, + "grad_norm": 0.07030331343412399, + "learning_rate": 0.0007609298531810767, + "loss": 0.0681, + "num_input_tokens_seen": 20175040, + "step": 9330 + }, + { + "epoch": 1.5228384991843393, + "grad_norm": 0.09889055788516998, + "learning_rate": 0.0007613376835236542, + "loss": 0.1594, + "num_input_tokens_seen": 20185696, + "step": 9335 + }, + { + "epoch": 1.5236541598694942, + "grad_norm": 0.08910706639289856, + "learning_rate": 0.0007617455138662317, + "loss": 0.1579, + "num_input_tokens_seen": 20196288, + "step": 9340 + }, + { + "epoch": 1.5244698205546494, + "grad_norm": 0.11903411149978638, + "learning_rate": 0.0007621533442088091, + "loss": 0.1586, + "num_input_tokens_seen": 20206688, + "step": 9345 + }, + { + "epoch": 1.5252854812398042, + "grad_norm": 0.2565356194972992, + "learning_rate": 0.0007625611745513866, + "loss": 0.206, + "num_input_tokens_seen": 20217824, + "step": 9350 + }, + { + "epoch": 1.5261011419249593, + "grad_norm": 0.09693357348442078, + "learning_rate": 0.0007629690048939642, + "loss": 0.1122, + "num_input_tokens_seen": 20227552, + "step": 9355 + }, + { + "epoch": 1.5269168026101143, + "grad_norm": 0.2581535279750824, + "learning_rate": 0.0007633768352365417, + "loss": 0.2088, + "num_input_tokens_seen": 20237920, + "step": 9360 + }, + { + "epoch": 1.5277324632952691, + "grad_norm": 0.14089787006378174, + "learning_rate": 0.000763784665579119, + "loss": 0.1094, + "num_input_tokens_seen": 20248928, + "step": 9365 + }, + { + "epoch": 1.5285481239804242, + "grad_norm": 0.2328333854675293, + "learning_rate": 0.0007641924959216966, + "loss": 0.1387, + "num_input_tokens_seen": 20260448, + "step": 9370 + }, + { + "epoch": 1.5293637846655792, + "grad_norm": 0.018458805978298187, + "learning_rate": 0.0007646003262642741, + "loss": 0.06, + "num_input_tokens_seen": 20271136, + "step": 9375 + }, + { + "epoch": 1.530179445350734, + "grad_norm": 0.13734719157218933, + "learning_rate": 0.0007650081566068515, + "loss": 0.1299, + "num_input_tokens_seen": 20282048, + "step": 9380 + }, + { + "epoch": 1.530995106035889, + "grad_norm": 0.18070201575756073, + "learning_rate": 0.0007654159869494291, + "loss": 0.1086, + "num_input_tokens_seen": 20291936, + "step": 9385 + }, + { + "epoch": 1.531810766721044, + "grad_norm": 0.09469042718410492, + "learning_rate": 0.0007658238172920065, + "loss": 0.0774, + "num_input_tokens_seen": 20302976, + "step": 9390 + }, + { + "epoch": 1.532626427406199, + "grad_norm": 0.02390364743769169, + "learning_rate": 0.0007662316476345841, + "loss": 0.0631, + "num_input_tokens_seen": 20314336, + "step": 9395 + }, + { + "epoch": 1.533442088091354, + "grad_norm": 0.2972950339317322, + "learning_rate": 0.0007666394779771615, + "loss": 0.1546, + "num_input_tokens_seen": 20325248, + "step": 9400 + }, + { + "epoch": 1.534257748776509, + "grad_norm": 0.39182689785957336, + "learning_rate": 0.000767047308319739, + "loss": 0.388, + "num_input_tokens_seen": 20335936, + "step": 9405 + }, + { + "epoch": 1.5350734094616638, + "grad_norm": 0.03537694737315178, + "learning_rate": 0.0007674551386623165, + "loss": 0.0757, + "num_input_tokens_seen": 20347424, + "step": 9410 + }, + { + "epoch": 1.535889070146819, + "grad_norm": 0.06817349791526794, + "learning_rate": 0.0007678629690048939, + "loss": 0.1584, + "num_input_tokens_seen": 20358240, + "step": 9415 + }, + { + "epoch": 1.536704730831974, + "grad_norm": 0.21510159969329834, + "learning_rate": 0.0007682707993474715, + "loss": 0.172, + "num_input_tokens_seen": 20369184, + "step": 9420 + }, + { + "epoch": 1.5375203915171287, + "grad_norm": 0.17005428671836853, + "learning_rate": 0.000768678629690049, + "loss": 0.2609, + "num_input_tokens_seen": 20380480, + "step": 9425 + }, + { + "epoch": 1.538336052202284, + "grad_norm": 0.10062456130981445, + "learning_rate": 0.0007690864600326263, + "loss": 0.2049, + "num_input_tokens_seen": 20389600, + "step": 9430 + }, + { + "epoch": 1.5391517128874388, + "grad_norm": 0.08677083998918533, + "learning_rate": 0.0007694942903752039, + "loss": 0.1215, + "num_input_tokens_seen": 20399712, + "step": 9435 + }, + { + "epoch": 1.5399673735725938, + "grad_norm": 0.21858160197734833, + "learning_rate": 0.0007699021207177814, + "loss": 0.2001, + "num_input_tokens_seen": 20410816, + "step": 9440 + }, + { + "epoch": 1.5407830342577489, + "grad_norm": 0.13692283630371094, + "learning_rate": 0.000770309951060359, + "loss": 0.08, + "num_input_tokens_seen": 20421952, + "step": 9445 + }, + { + "epoch": 1.5415986949429037, + "grad_norm": 0.07183019816875458, + "learning_rate": 0.0007707177814029364, + "loss": 0.0549, + "num_input_tokens_seen": 20432480, + "step": 9450 + }, + { + "epoch": 1.5424143556280587, + "grad_norm": 0.04967404529452324, + "learning_rate": 0.0007711256117455138, + "loss": 0.1739, + "num_input_tokens_seen": 20444032, + "step": 9455 + }, + { + "epoch": 1.5432300163132138, + "grad_norm": 0.05237607657909393, + "learning_rate": 0.0007715334420880914, + "loss": 0.2333, + "num_input_tokens_seen": 20455648, + "step": 9460 + }, + { + "epoch": 1.5440456769983686, + "grad_norm": 0.2099936455488205, + "learning_rate": 0.0007719412724306688, + "loss": 0.139, + "num_input_tokens_seen": 20466112, + "step": 9465 + }, + { + "epoch": 1.5448613376835236, + "grad_norm": 0.04406539723277092, + "learning_rate": 0.0007723491027732464, + "loss": 0.1943, + "num_input_tokens_seen": 20477280, + "step": 9470 + }, + { + "epoch": 1.5456769983686787, + "grad_norm": 0.0637565404176712, + "learning_rate": 0.0007727569331158239, + "loss": 0.0561, + "num_input_tokens_seen": 20488704, + "step": 9475 + }, + { + "epoch": 1.5464926590538335, + "grad_norm": 0.14393645524978638, + "learning_rate": 0.0007731647634584013, + "loss": 0.146, + "num_input_tokens_seen": 20499168, + "step": 9480 + }, + { + "epoch": 1.5473083197389887, + "grad_norm": 0.032931309193372726, + "learning_rate": 0.0007735725938009788, + "loss": 0.0926, + "num_input_tokens_seen": 20509696, + "step": 9485 + }, + { + "epoch": 1.5481239804241436, + "grad_norm": 0.11536554247140884, + "learning_rate": 0.0007739804241435563, + "loss": 0.1343, + "num_input_tokens_seen": 20519424, + "step": 9490 + }, + { + "epoch": 1.5489396411092984, + "grad_norm": 0.017625289037823677, + "learning_rate": 0.0007743882544861339, + "loss": 0.1249, + "num_input_tokens_seen": 20530304, + "step": 9495 + }, + { + "epoch": 1.5497553017944536, + "grad_norm": 0.15197528898715973, + "learning_rate": 0.0007747960848287112, + "loss": 0.1542, + "num_input_tokens_seen": 20540384, + "step": 9500 + }, + { + "epoch": 1.5505709624796085, + "grad_norm": 0.2158111035823822, + "learning_rate": 0.0007752039151712887, + "loss": 0.096, + "num_input_tokens_seen": 20550912, + "step": 9505 + }, + { + "epoch": 1.5513866231647635, + "grad_norm": 0.04284696653485298, + "learning_rate": 0.0007756117455138663, + "loss": 0.0918, + "num_input_tokens_seen": 20561280, + "step": 9510 + }, + { + "epoch": 1.5522022838499185, + "grad_norm": 0.1303076297044754, + "learning_rate": 0.0007760195758564438, + "loss": 0.2241, + "num_input_tokens_seen": 20572896, + "step": 9515 + }, + { + "epoch": 1.5530179445350734, + "grad_norm": 0.044415369629859924, + "learning_rate": 0.0007764274061990211, + "loss": 0.0969, + "num_input_tokens_seen": 20583200, + "step": 9520 + }, + { + "epoch": 1.5538336052202284, + "grad_norm": 0.09820342063903809, + "learning_rate": 0.0007768352365415987, + "loss": 0.0713, + "num_input_tokens_seen": 20593824, + "step": 9525 + }, + { + "epoch": 1.5546492659053834, + "grad_norm": 0.0766250267624855, + "learning_rate": 0.0007772430668841762, + "loss": 0.1066, + "num_input_tokens_seen": 20605696, + "step": 9530 + }, + { + "epoch": 1.5554649265905383, + "grad_norm": 0.11463528871536255, + "learning_rate": 0.0007776508972267537, + "loss": 0.0916, + "num_input_tokens_seen": 20616672, + "step": 9535 + }, + { + "epoch": 1.5562805872756933, + "grad_norm": 0.3298501968383789, + "learning_rate": 0.0007780587275693312, + "loss": 0.0981, + "num_input_tokens_seen": 20629216, + "step": 9540 + }, + { + "epoch": 1.5570962479608483, + "grad_norm": 0.15438921749591827, + "learning_rate": 0.0007784665579119086, + "loss": 0.046, + "num_input_tokens_seen": 20640544, + "step": 9545 + }, + { + "epoch": 1.5579119086460032, + "grad_norm": 0.03456113860011101, + "learning_rate": 0.0007788743882544862, + "loss": 0.1043, + "num_input_tokens_seen": 20651808, + "step": 9550 + }, + { + "epoch": 1.5587275693311582, + "grad_norm": 0.044604893773794174, + "learning_rate": 0.0007792822185970636, + "loss": 0.0403, + "num_input_tokens_seen": 20663360, + "step": 9555 + }, + { + "epoch": 1.5595432300163132, + "grad_norm": 0.015352782793343067, + "learning_rate": 0.0007796900489396412, + "loss": 0.1858, + "num_input_tokens_seen": 20674336, + "step": 9560 + }, + { + "epoch": 1.560358890701468, + "grad_norm": 0.12856276333332062, + "learning_rate": 0.0007800978792822186, + "loss": 0.0773, + "num_input_tokens_seen": 20684960, + "step": 9565 + }, + { + "epoch": 1.5611745513866233, + "grad_norm": 0.15705958008766174, + "learning_rate": 0.000780505709624796, + "loss": 0.1627, + "num_input_tokens_seen": 20695648, + "step": 9570 + }, + { + "epoch": 1.5619902120717781, + "grad_norm": 0.26499855518341064, + "learning_rate": 0.0007809135399673736, + "loss": 0.1323, + "num_input_tokens_seen": 20707008, + "step": 9575 + }, + { + "epoch": 1.5628058727569332, + "grad_norm": 0.44296300411224365, + "learning_rate": 0.0007813213703099511, + "loss": 0.2125, + "num_input_tokens_seen": 20717024, + "step": 9580 + }, + { + "epoch": 1.5636215334420882, + "grad_norm": 0.10148364305496216, + "learning_rate": 0.0007817292006525287, + "loss": 0.1699, + "num_input_tokens_seen": 20727968, + "step": 9585 + }, + { + "epoch": 1.564437194127243, + "grad_norm": 0.7346378564834595, + "learning_rate": 0.000782137030995106, + "loss": 0.3014, + "num_input_tokens_seen": 20739520, + "step": 9590 + }, + { + "epoch": 1.565252854812398, + "grad_norm": 0.25575461983680725, + "learning_rate": 0.0007825448613376835, + "loss": 0.099, + "num_input_tokens_seen": 20751104, + "step": 9595 + }, + { + "epoch": 1.566068515497553, + "grad_norm": 0.041789885610342026, + "learning_rate": 0.0007829526916802611, + "loss": 0.0537, + "num_input_tokens_seen": 20760672, + "step": 9600 + }, + { + "epoch": 1.566884176182708, + "grad_norm": 0.2482481747865677, + "learning_rate": 0.0007833605220228385, + "loss": 0.1595, + "num_input_tokens_seen": 20772160, + "step": 9605 + }, + { + "epoch": 1.567699836867863, + "grad_norm": 0.32589903473854065, + "learning_rate": 0.000783768352365416, + "loss": 0.076, + "num_input_tokens_seen": 20783840, + "step": 9610 + }, + { + "epoch": 1.568515497553018, + "grad_norm": 0.056545983999967575, + "learning_rate": 0.0007841761827079935, + "loss": 0.0829, + "num_input_tokens_seen": 20795488, + "step": 9615 + }, + { + "epoch": 1.5693311582381728, + "grad_norm": 0.02571425400674343, + "learning_rate": 0.000784584013050571, + "loss": 0.1417, + "num_input_tokens_seen": 20806272, + "step": 9620 + }, + { + "epoch": 1.5701468189233279, + "grad_norm": 0.12023447453975677, + "learning_rate": 0.0007849918433931485, + "loss": 0.1562, + "num_input_tokens_seen": 20817472, + "step": 9625 + }, + { + "epoch": 1.570962479608483, + "grad_norm": 0.17629219591617584, + "learning_rate": 0.000785399673735726, + "loss": 0.1079, + "num_input_tokens_seen": 20828992, + "step": 9630 + }, + { + "epoch": 1.5717781402936377, + "grad_norm": 0.20702822506427765, + "learning_rate": 0.0007858075040783035, + "loss": 0.1637, + "num_input_tokens_seen": 20841184, + "step": 9635 + }, + { + "epoch": 1.572593800978793, + "grad_norm": 0.10118906199932098, + "learning_rate": 0.0007862153344208809, + "loss": 0.1106, + "num_input_tokens_seen": 20852096, + "step": 9640 + }, + { + "epoch": 1.5734094616639478, + "grad_norm": 0.016205577179789543, + "learning_rate": 0.0007866231647634584, + "loss": 0.1564, + "num_input_tokens_seen": 20862784, + "step": 9645 + }, + { + "epoch": 1.5742251223491026, + "grad_norm": 0.3246265947818756, + "learning_rate": 0.000787030995106036, + "loss": 0.2121, + "num_input_tokens_seen": 20872320, + "step": 9650 + }, + { + "epoch": 1.5750407830342579, + "grad_norm": 0.12944892048835754, + "learning_rate": 0.0007874388254486133, + "loss": 0.1364, + "num_input_tokens_seen": 20883648, + "step": 9655 + }, + { + "epoch": 1.5758564437194127, + "grad_norm": 0.2590799629688263, + "learning_rate": 0.0007878466557911908, + "loss": 0.1175, + "num_input_tokens_seen": 20892992, + "step": 9660 + }, + { + "epoch": 1.5766721044045677, + "grad_norm": 0.049245089292526245, + "learning_rate": 0.0007882544861337684, + "loss": 0.1526, + "num_input_tokens_seen": 20903552, + "step": 9665 + }, + { + "epoch": 1.5774877650897228, + "grad_norm": 0.15859724581241608, + "learning_rate": 0.0007886623164763459, + "loss": 0.118, + "num_input_tokens_seen": 20914400, + "step": 9670 + }, + { + "epoch": 1.5783034257748776, + "grad_norm": 0.08111312985420227, + "learning_rate": 0.0007890701468189233, + "loss": 0.0824, + "num_input_tokens_seen": 20925280, + "step": 9675 + }, + { + "epoch": 1.5791190864600326, + "grad_norm": 0.2263268232345581, + "learning_rate": 0.0007894779771615008, + "loss": 0.1133, + "num_input_tokens_seen": 20936256, + "step": 9680 + }, + { + "epoch": 1.5799347471451877, + "grad_norm": 0.24854174256324768, + "learning_rate": 0.0007898858075040783, + "loss": 0.2001, + "num_input_tokens_seen": 20946432, + "step": 9685 + }, + { + "epoch": 1.5807504078303425, + "grad_norm": 0.3115064203739166, + "learning_rate": 0.0007902936378466558, + "loss": 0.2652, + "num_input_tokens_seen": 20956576, + "step": 9690 + }, + { + "epoch": 1.5815660685154975, + "grad_norm": 0.054324883967638016, + "learning_rate": 0.0007907014681892332, + "loss": 0.2013, + "num_input_tokens_seen": 20967296, + "step": 9695 + }, + { + "epoch": 1.5823817292006526, + "grad_norm": 0.15358050167560577, + "learning_rate": 0.0007911092985318108, + "loss": 0.1248, + "num_input_tokens_seen": 20978176, + "step": 9700 + }, + { + "epoch": 1.5831973898858074, + "grad_norm": 0.05537407100200653, + "learning_rate": 0.0007915171288743883, + "loss": 0.0842, + "num_input_tokens_seen": 20989824, + "step": 9705 + }, + { + "epoch": 1.5840130505709626, + "grad_norm": 0.1706741601228714, + "learning_rate": 0.0007919249592169657, + "loss": 0.1579, + "num_input_tokens_seen": 21000992, + "step": 9710 + }, + { + "epoch": 1.5848287112561175, + "grad_norm": 0.179428368806839, + "learning_rate": 0.0007923327895595433, + "loss": 0.086, + "num_input_tokens_seen": 21010624, + "step": 9715 + }, + { + "epoch": 1.5856443719412723, + "grad_norm": 0.19192343950271606, + "learning_rate": 0.0007927406199021207, + "loss": 0.224, + "num_input_tokens_seen": 21021856, + "step": 9720 + }, + { + "epoch": 1.5864600326264275, + "grad_norm": 0.2749413847923279, + "learning_rate": 0.0007931484502446982, + "loss": 0.2647, + "num_input_tokens_seen": 21032864, + "step": 9725 + }, + { + "epoch": 1.5872756933115824, + "grad_norm": 0.020634248852729797, + "learning_rate": 0.0007935562805872757, + "loss": 0.0556, + "num_input_tokens_seen": 21044800, + "step": 9730 + }, + { + "epoch": 1.5880913539967374, + "grad_norm": 0.036429792642593384, + "learning_rate": 0.0007939641109298532, + "loss": 0.1549, + "num_input_tokens_seen": 21055936, + "step": 9735 + }, + { + "epoch": 1.5889070146818924, + "grad_norm": 0.11412417143583298, + "learning_rate": 0.0007943719412724308, + "loss": 0.0468, + "num_input_tokens_seen": 21066944, + "step": 9740 + }, + { + "epoch": 1.5897226753670473, + "grad_norm": 0.34290286898612976, + "learning_rate": 0.0007947797716150081, + "loss": 0.2389, + "num_input_tokens_seen": 21077440, + "step": 9745 + }, + { + "epoch": 1.5905383360522023, + "grad_norm": 0.07320336997509003, + "learning_rate": 0.0007951876019575857, + "loss": 0.2007, + "num_input_tokens_seen": 21087616, + "step": 9750 + }, + { + "epoch": 1.5913539967373573, + "grad_norm": 0.13163729012012482, + "learning_rate": 0.0007955954323001632, + "loss": 0.1386, + "num_input_tokens_seen": 21099424, + "step": 9755 + }, + { + "epoch": 1.5921696574225122, + "grad_norm": 0.08553323149681091, + "learning_rate": 0.0007960032626427406, + "loss": 0.1947, + "num_input_tokens_seen": 21109696, + "step": 9760 + }, + { + "epoch": 1.5929853181076672, + "grad_norm": 0.08144395798444748, + "learning_rate": 0.0007964110929853181, + "loss": 0.1125, + "num_input_tokens_seen": 21120608, + "step": 9765 + }, + { + "epoch": 1.5938009787928222, + "grad_norm": 0.27438557147979736, + "learning_rate": 0.0007968189233278956, + "loss": 0.1317, + "num_input_tokens_seen": 21131552, + "step": 9770 + }, + { + "epoch": 1.594616639477977, + "grad_norm": 0.13903331756591797, + "learning_rate": 0.0007972267536704732, + "loss": 0.116, + "num_input_tokens_seen": 21141920, + "step": 9775 + }, + { + "epoch": 1.595432300163132, + "grad_norm": 0.07145722210407257, + "learning_rate": 0.0007976345840130506, + "loss": 0.0591, + "num_input_tokens_seen": 21152768, + "step": 9780 + }, + { + "epoch": 1.5962479608482871, + "grad_norm": 0.19481636583805084, + "learning_rate": 0.000798042414355628, + "loss": 0.0639, + "num_input_tokens_seen": 21162816, + "step": 9785 + }, + { + "epoch": 1.597063621533442, + "grad_norm": 0.1090518906712532, + "learning_rate": 0.0007984502446982056, + "loss": 0.1006, + "num_input_tokens_seen": 21173376, + "step": 9790 + }, + { + "epoch": 1.5978792822185972, + "grad_norm": 0.07812517881393433, + "learning_rate": 0.000798858075040783, + "loss": 0.0791, + "num_input_tokens_seen": 21184064, + "step": 9795 + }, + { + "epoch": 1.598694942903752, + "grad_norm": 0.10007583349943161, + "learning_rate": 0.0007992659053833605, + "loss": 0.0489, + "num_input_tokens_seen": 21195136, + "step": 9800 + }, + { + "epoch": 1.599510603588907, + "grad_norm": 0.00513321440666914, + "learning_rate": 0.0007996737357259381, + "loss": 0.1956, + "num_input_tokens_seen": 21205824, + "step": 9805 + }, + { + "epoch": 1.600326264274062, + "grad_norm": 0.005637241993099451, + "learning_rate": 0.0008000815660685155, + "loss": 0.0374, + "num_input_tokens_seen": 21215456, + "step": 9810 + }, + { + "epoch": 1.601141924959217, + "grad_norm": 0.0892510935664177, + "learning_rate": 0.000800489396411093, + "loss": 0.0899, + "num_input_tokens_seen": 21226368, + "step": 9815 + }, + { + "epoch": 1.601957585644372, + "grad_norm": 0.376203328371048, + "learning_rate": 0.0008008972267536705, + "loss": 0.1367, + "num_input_tokens_seen": 21238240, + "step": 9820 + }, + { + "epoch": 1.602773246329527, + "grad_norm": 0.36713284254074097, + "learning_rate": 0.000801305057096248, + "loss": 0.1611, + "num_input_tokens_seen": 21248896, + "step": 9825 + }, + { + "epoch": 1.6035889070146818, + "grad_norm": 0.0800262987613678, + "learning_rate": 0.0008017128874388254, + "loss": 0.1063, + "num_input_tokens_seen": 21259264, + "step": 9830 + }, + { + "epoch": 1.6044045676998369, + "grad_norm": 0.0729251429438591, + "learning_rate": 0.0008021207177814029, + "loss": 0.0827, + "num_input_tokens_seen": 21269472, + "step": 9835 + }, + { + "epoch": 1.605220228384992, + "grad_norm": 0.4738904535770416, + "learning_rate": 0.0008025285481239805, + "loss": 0.1909, + "num_input_tokens_seen": 21281280, + "step": 9840 + }, + { + "epoch": 1.6060358890701467, + "grad_norm": 0.03555934503674507, + "learning_rate": 0.000802936378466558, + "loss": 0.1532, + "num_input_tokens_seen": 21290752, + "step": 9845 + }, + { + "epoch": 1.6068515497553018, + "grad_norm": 0.2605329751968384, + "learning_rate": 0.0008033442088091353, + "loss": 0.2615, + "num_input_tokens_seen": 21299520, + "step": 9850 + }, + { + "epoch": 1.6076672104404568, + "grad_norm": 0.21877489984035492, + "learning_rate": 0.0008037520391517129, + "loss": 0.1428, + "num_input_tokens_seen": 21309632, + "step": 9855 + }, + { + "epoch": 1.6084828711256116, + "grad_norm": 0.2073116898536682, + "learning_rate": 0.0008041598694942904, + "loss": 0.079, + "num_input_tokens_seen": 21320416, + "step": 9860 + }, + { + "epoch": 1.6092985318107669, + "grad_norm": 0.23353806138038635, + "learning_rate": 0.0008045676998368679, + "loss": 0.1914, + "num_input_tokens_seen": 21330432, + "step": 9865 + }, + { + "epoch": 1.6101141924959217, + "grad_norm": 0.06838128715753555, + "learning_rate": 0.0008049755301794454, + "loss": 0.0641, + "num_input_tokens_seen": 21341056, + "step": 9870 + }, + { + "epoch": 1.6109298531810765, + "grad_norm": 0.08493325114250183, + "learning_rate": 0.0008053833605220228, + "loss": 0.1241, + "num_input_tokens_seen": 21352864, + "step": 9875 + }, + { + "epoch": 1.6117455138662318, + "grad_norm": 0.1823084056377411, + "learning_rate": 0.0008057911908646003, + "loss": 0.1849, + "num_input_tokens_seen": 21364512, + "step": 9880 + }, + { + "epoch": 1.6125611745513866, + "grad_norm": 0.03267619386315346, + "learning_rate": 0.0008061990212071778, + "loss": 0.1082, + "num_input_tokens_seen": 21374656, + "step": 9885 + }, + { + "epoch": 1.6133768352365416, + "grad_norm": 0.22920729219913483, + "learning_rate": 0.0008066068515497554, + "loss": 0.1155, + "num_input_tokens_seen": 21384928, + "step": 9890 + }, + { + "epoch": 1.6141924959216967, + "grad_norm": 0.009968969970941544, + "learning_rate": 0.0008070146818923329, + "loss": 0.0435, + "num_input_tokens_seen": 21395488, + "step": 9895 + }, + { + "epoch": 1.6150081566068515, + "grad_norm": 0.06347603350877762, + "learning_rate": 0.0008074225122349102, + "loss": 0.0536, + "num_input_tokens_seen": 21405760, + "step": 9900 + }, + { + "epoch": 1.6158238172920065, + "grad_norm": 0.03512804955244064, + "learning_rate": 0.0008078303425774878, + "loss": 0.3131, + "num_input_tokens_seen": 21416096, + "step": 9905 + }, + { + "epoch": 1.6166394779771616, + "grad_norm": 0.25602778792381287, + "learning_rate": 0.0008082381729200653, + "loss": 0.1505, + "num_input_tokens_seen": 21426176, + "step": 9910 + }, + { + "epoch": 1.6174551386623164, + "grad_norm": 0.036331601440906525, + "learning_rate": 0.0008086460032626428, + "loss": 0.0997, + "num_input_tokens_seen": 21437024, + "step": 9915 + }, + { + "epoch": 1.6182707993474714, + "grad_norm": 0.06043427065014839, + "learning_rate": 0.0008090538336052202, + "loss": 0.1313, + "num_input_tokens_seen": 21447168, + "step": 9920 + }, + { + "epoch": 1.6190864600326265, + "grad_norm": 0.19334854185581207, + "learning_rate": 0.0008094616639477977, + "loss": 0.2161, + "num_input_tokens_seen": 21457568, + "step": 9925 + }, + { + "epoch": 1.6199021207177813, + "grad_norm": 0.08765456825494766, + "learning_rate": 0.0008098694942903753, + "loss": 0.0758, + "num_input_tokens_seen": 21467264, + "step": 9930 + }, + { + "epoch": 1.6207177814029365, + "grad_norm": 0.1640249788761139, + "learning_rate": 0.0008102773246329527, + "loss": 0.0498, + "num_input_tokens_seen": 21479360, + "step": 9935 + }, + { + "epoch": 1.6215334420880914, + "grad_norm": 0.1942995935678482, + "learning_rate": 0.0008106851549755301, + "loss": 0.0642, + "num_input_tokens_seen": 21490208, + "step": 9940 + }, + { + "epoch": 1.6223491027732462, + "grad_norm": 0.1993321031332016, + "learning_rate": 0.0008110929853181077, + "loss": 0.138, + "num_input_tokens_seen": 21501472, + "step": 9945 + }, + { + "epoch": 1.6231647634584014, + "grad_norm": 0.09414301067590714, + "learning_rate": 0.0008115008156606851, + "loss": 0.1241, + "num_input_tokens_seen": 21512896, + "step": 9950 + }, + { + "epoch": 1.6239804241435563, + "grad_norm": 0.07336173206567764, + "learning_rate": 0.0008119086460032627, + "loss": 0.1602, + "num_input_tokens_seen": 21523744, + "step": 9955 + }, + { + "epoch": 1.6247960848287113, + "grad_norm": 0.23218612372875214, + "learning_rate": 0.0008123164763458402, + "loss": 0.0861, + "num_input_tokens_seen": 21534592, + "step": 9960 + }, + { + "epoch": 1.6256117455138663, + "grad_norm": 0.0703354924917221, + "learning_rate": 0.0008127243066884176, + "loss": 0.0807, + "num_input_tokens_seen": 21546784, + "step": 9965 + }, + { + "epoch": 1.6264274061990212, + "grad_norm": 0.5143048763275146, + "learning_rate": 0.0008131321370309951, + "loss": 0.1959, + "num_input_tokens_seen": 21557792, + "step": 9970 + }, + { + "epoch": 1.6272430668841762, + "grad_norm": 0.26658895611763, + "learning_rate": 0.0008135399673735726, + "loss": 0.1729, + "num_input_tokens_seen": 21566496, + "step": 9975 + }, + { + "epoch": 1.6280587275693312, + "grad_norm": 0.007412992883473635, + "learning_rate": 0.0008139477977161502, + "loss": 0.0649, + "num_input_tokens_seen": 21576800, + "step": 9980 + }, + { + "epoch": 1.628874388254486, + "grad_norm": 0.14374643564224243, + "learning_rate": 0.0008143556280587275, + "loss": 0.1996, + "num_input_tokens_seen": 21587360, + "step": 9985 + }, + { + "epoch": 1.629690048939641, + "grad_norm": 0.23667171597480774, + "learning_rate": 0.000814763458401305, + "loss": 0.1, + "num_input_tokens_seen": 21597824, + "step": 9990 + }, + { + "epoch": 1.6305057096247961, + "grad_norm": 0.02756788767874241, + "learning_rate": 0.0008151712887438826, + "loss": 0.0974, + "num_input_tokens_seen": 21608064, + "step": 9995 + }, + { + "epoch": 1.631321370309951, + "grad_norm": 0.03173388913273811, + "learning_rate": 0.0008155791190864601, + "loss": 0.129, + "num_input_tokens_seen": 21617568, + "step": 10000 + }, + { + "epoch": 1.632137030995106, + "grad_norm": 0.0286561269313097, + "learning_rate": 0.0008159869494290375, + "loss": 0.1911, + "num_input_tokens_seen": 21628544, + "step": 10005 + }, + { + "epoch": 1.632952691680261, + "grad_norm": 0.02535759098827839, + "learning_rate": 0.000816394779771615, + "loss": 0.0685, + "num_input_tokens_seen": 21637440, + "step": 10010 + }, + { + "epoch": 1.6337683523654158, + "grad_norm": 0.26951324939727783, + "learning_rate": 0.0008168026101141925, + "loss": 0.1693, + "num_input_tokens_seen": 21647200, + "step": 10015 + }, + { + "epoch": 1.634584013050571, + "grad_norm": 0.09645380079746246, + "learning_rate": 0.00081721044045677, + "loss": 0.181, + "num_input_tokens_seen": 21657888, + "step": 10020 + }, + { + "epoch": 1.635399673735726, + "grad_norm": 0.19887897372245789, + "learning_rate": 0.0008176182707993475, + "loss": 0.1373, + "num_input_tokens_seen": 21668640, + "step": 10025 + }, + { + "epoch": 1.636215334420881, + "grad_norm": 0.18069353699684143, + "learning_rate": 0.000818026101141925, + "loss": 0.1123, + "num_input_tokens_seen": 21679008, + "step": 10030 + }, + { + "epoch": 1.637030995106036, + "grad_norm": 0.09430285543203354, + "learning_rate": 0.0008184339314845025, + "loss": 0.2209, + "num_input_tokens_seen": 21689984, + "step": 10035 + }, + { + "epoch": 1.6378466557911908, + "grad_norm": 0.050288546830415726, + "learning_rate": 0.0008188417618270799, + "loss": 0.1313, + "num_input_tokens_seen": 21700960, + "step": 10040 + }, + { + "epoch": 1.6386623164763459, + "grad_norm": 0.11963741481304169, + "learning_rate": 0.0008192495921696575, + "loss": 0.1268, + "num_input_tokens_seen": 21711680, + "step": 10045 + }, + { + "epoch": 1.639477977161501, + "grad_norm": 0.18857711553573608, + "learning_rate": 0.0008196574225122349, + "loss": 0.1761, + "num_input_tokens_seen": 21722432, + "step": 10050 + }, + { + "epoch": 1.6402936378466557, + "grad_norm": 0.2952326238155365, + "learning_rate": 0.0008200652528548124, + "loss": 0.0944, + "num_input_tokens_seen": 21732608, + "step": 10055 + }, + { + "epoch": 1.6411092985318108, + "grad_norm": 0.06583889573812485, + "learning_rate": 0.0008204730831973899, + "loss": 0.1298, + "num_input_tokens_seen": 21741760, + "step": 10060 + }, + { + "epoch": 1.6419249592169658, + "grad_norm": 0.23590274155139923, + "learning_rate": 0.0008208809135399674, + "loss": 0.1632, + "num_input_tokens_seen": 21753472, + "step": 10065 + }, + { + "epoch": 1.6427406199021206, + "grad_norm": 0.11746193468570709, + "learning_rate": 0.000821288743882545, + "loss": 0.1314, + "num_input_tokens_seen": 21764480, + "step": 10070 + }, + { + "epoch": 1.6435562805872757, + "grad_norm": 0.11129488050937653, + "learning_rate": 0.0008216965742251223, + "loss": 0.1275, + "num_input_tokens_seen": 21775296, + "step": 10075 + }, + { + "epoch": 1.6443719412724307, + "grad_norm": 0.25786536931991577, + "learning_rate": 0.0008221044045676999, + "loss": 0.2348, + "num_input_tokens_seen": 21786240, + "step": 10080 + }, + { + "epoch": 1.6451876019575855, + "grad_norm": 0.1905505508184433, + "learning_rate": 0.0008225122349102774, + "loss": 0.171, + "num_input_tokens_seen": 21797440, + "step": 10085 + }, + { + "epoch": 1.6460032626427408, + "grad_norm": 0.10690456628799438, + "learning_rate": 0.0008229200652528548, + "loss": 0.069, + "num_input_tokens_seen": 21807456, + "step": 10090 + }, + { + "epoch": 1.6468189233278956, + "grad_norm": 0.015939027070999146, + "learning_rate": 0.0008233278955954323, + "loss": 0.0365, + "num_input_tokens_seen": 21819968, + "step": 10095 + }, + { + "epoch": 1.6476345840130504, + "grad_norm": 0.03100682608783245, + "learning_rate": 0.0008237357259380098, + "loss": 0.1481, + "num_input_tokens_seen": 21831328, + "step": 10100 + }, + { + "epoch": 1.6484502446982057, + "grad_norm": 0.2114262878894806, + "learning_rate": 0.0008241435562805873, + "loss": 0.2003, + "num_input_tokens_seen": 21843648, + "step": 10105 + }, + { + "epoch": 1.6492659053833605, + "grad_norm": 0.1278071254491806, + "learning_rate": 0.0008245513866231648, + "loss": 0.0986, + "num_input_tokens_seen": 21853120, + "step": 10110 + }, + { + "epoch": 1.6500815660685155, + "grad_norm": 0.06103214994072914, + "learning_rate": 0.0008249592169657422, + "loss": 0.129, + "num_input_tokens_seen": 21863424, + "step": 10115 + }, + { + "epoch": 1.6508972267536706, + "grad_norm": 0.048976849764585495, + "learning_rate": 0.0008253670473083198, + "loss": 0.173, + "num_input_tokens_seen": 21873952, + "step": 10120 + }, + { + "epoch": 1.6517128874388254, + "grad_norm": 0.07527100294828415, + "learning_rate": 0.0008257748776508972, + "loss": 0.1356, + "num_input_tokens_seen": 21884736, + "step": 10125 + }, + { + "epoch": 1.6525285481239804, + "grad_norm": 0.038339171558618546, + "learning_rate": 0.0008261827079934747, + "loss": 0.0748, + "num_input_tokens_seen": 21895264, + "step": 10130 + }, + { + "epoch": 1.6533442088091355, + "grad_norm": 0.15123924612998962, + "learning_rate": 0.0008265905383360523, + "loss": 0.2225, + "num_input_tokens_seen": 21906720, + "step": 10135 + }, + { + "epoch": 1.6541598694942903, + "grad_norm": 0.19199934601783752, + "learning_rate": 0.0008269983686786296, + "loss": 0.1511, + "num_input_tokens_seen": 21916416, + "step": 10140 + }, + { + "epoch": 1.6549755301794453, + "grad_norm": 0.1354297697544098, + "learning_rate": 0.0008274061990212072, + "loss": 0.1065, + "num_input_tokens_seen": 21927328, + "step": 10145 + }, + { + "epoch": 1.6557911908646004, + "grad_norm": 0.16701364517211914, + "learning_rate": 0.0008278140293637847, + "loss": 0.1901, + "num_input_tokens_seen": 21938112, + "step": 10150 + }, + { + "epoch": 1.6566068515497552, + "grad_norm": 0.10553938150405884, + "learning_rate": 0.0008282218597063622, + "loss": 0.123, + "num_input_tokens_seen": 21949472, + "step": 10155 + }, + { + "epoch": 1.6574225122349104, + "grad_norm": 0.031392499804496765, + "learning_rate": 0.0008286296900489396, + "loss": 0.0626, + "num_input_tokens_seen": 21960384, + "step": 10160 + }, + { + "epoch": 1.6582381729200653, + "grad_norm": 0.15648214519023895, + "learning_rate": 0.0008290375203915171, + "loss": 0.1233, + "num_input_tokens_seen": 21971104, + "step": 10165 + }, + { + "epoch": 1.65905383360522, + "grad_norm": 0.06735506653785706, + "learning_rate": 0.0008294453507340947, + "loss": 0.0851, + "num_input_tokens_seen": 21982400, + "step": 10170 + }, + { + "epoch": 1.6598694942903753, + "grad_norm": 0.017349006608128548, + "learning_rate": 0.0008298531810766721, + "loss": 0.1242, + "num_input_tokens_seen": 21994080, + "step": 10175 + }, + { + "epoch": 1.6606851549755302, + "grad_norm": 0.1527702659368515, + "learning_rate": 0.0008302610114192496, + "loss": 0.0567, + "num_input_tokens_seen": 22005984, + "step": 10180 + }, + { + "epoch": 1.6615008156606852, + "grad_norm": 0.07216469943523407, + "learning_rate": 0.0008306688417618271, + "loss": 0.1481, + "num_input_tokens_seen": 22015744, + "step": 10185 + }, + { + "epoch": 1.6623164763458402, + "grad_norm": 0.15371765196323395, + "learning_rate": 0.0008310766721044046, + "loss": 0.2624, + "num_input_tokens_seen": 22025248, + "step": 10190 + }, + { + "epoch": 1.663132137030995, + "grad_norm": 0.06531374156475067, + "learning_rate": 0.0008314845024469821, + "loss": 0.0583, + "num_input_tokens_seen": 22035456, + "step": 10195 + }, + { + "epoch": 1.66394779771615, + "grad_norm": 0.06966865062713623, + "learning_rate": 0.0008318923327895596, + "loss": 0.1549, + "num_input_tokens_seen": 22045568, + "step": 10200 + }, + { + "epoch": 1.6647634584013051, + "grad_norm": 0.036182425916194916, + "learning_rate": 0.000832300163132137, + "loss": 0.0536, + "num_input_tokens_seen": 22055360, + "step": 10205 + }, + { + "epoch": 1.66557911908646, + "grad_norm": 0.040477022528648376, + "learning_rate": 0.0008327079934747145, + "loss": 0.0946, + "num_input_tokens_seen": 22066624, + "step": 10210 + }, + { + "epoch": 1.666394779771615, + "grad_norm": 0.06451041251420975, + "learning_rate": 0.000833115823817292, + "loss": 0.1723, + "num_input_tokens_seen": 22078528, + "step": 10215 + }, + { + "epoch": 1.66721044045677, + "grad_norm": 0.27383536100387573, + "learning_rate": 0.0008335236541598696, + "loss": 0.1661, + "num_input_tokens_seen": 22089824, + "step": 10220 + }, + { + "epoch": 1.6680261011419248, + "grad_norm": 0.09901408106088638, + "learning_rate": 0.0008339314845024471, + "loss": 0.1523, + "num_input_tokens_seen": 22099744, + "step": 10225 + }, + { + "epoch": 1.6688417618270799, + "grad_norm": 0.3080720901489258, + "learning_rate": 0.0008343393148450244, + "loss": 0.2365, + "num_input_tokens_seen": 22110432, + "step": 10230 + }, + { + "epoch": 1.669657422512235, + "grad_norm": 0.1689985692501068, + "learning_rate": 0.000834747145187602, + "loss": 0.1776, + "num_input_tokens_seen": 22120992, + "step": 10235 + }, + { + "epoch": 1.6704730831973897, + "grad_norm": 0.36505305767059326, + "learning_rate": 0.0008351549755301795, + "loss": 0.2088, + "num_input_tokens_seen": 22130816, + "step": 10240 + }, + { + "epoch": 1.671288743882545, + "grad_norm": 0.11974579095840454, + "learning_rate": 0.0008355628058727569, + "loss": 0.0763, + "num_input_tokens_seen": 22142144, + "step": 10245 + }, + { + "epoch": 1.6721044045676998, + "grad_norm": 0.21502956748008728, + "learning_rate": 0.0008359706362153344, + "loss": 0.1549, + "num_input_tokens_seen": 22152544, + "step": 10250 + }, + { + "epoch": 1.6729200652528549, + "grad_norm": 0.29304221272468567, + "learning_rate": 0.0008363784665579119, + "loss": 0.1516, + "num_input_tokens_seen": 22162720, + "step": 10255 + }, + { + "epoch": 1.67373572593801, + "grad_norm": 0.11507556587457657, + "learning_rate": 0.0008367862969004895, + "loss": 0.1327, + "num_input_tokens_seen": 22172832, + "step": 10260 + }, + { + "epoch": 1.6745513866231647, + "grad_norm": 0.0777512788772583, + "learning_rate": 0.0008371941272430669, + "loss": 0.0519, + "num_input_tokens_seen": 22183360, + "step": 10265 + }, + { + "epoch": 1.6753670473083198, + "grad_norm": 0.02606668882071972, + "learning_rate": 0.0008376019575856443, + "loss": 0.1347, + "num_input_tokens_seen": 22192640, + "step": 10270 + }, + { + "epoch": 1.6761827079934748, + "grad_norm": 0.19271859526634216, + "learning_rate": 0.0008380097879282219, + "loss": 0.2345, + "num_input_tokens_seen": 22204416, + "step": 10275 + }, + { + "epoch": 1.6769983686786296, + "grad_norm": 0.05584167316555977, + "learning_rate": 0.0008384176182707993, + "loss": 0.0319, + "num_input_tokens_seen": 22216288, + "step": 10280 + }, + { + "epoch": 1.6778140293637847, + "grad_norm": 0.052766405045986176, + "learning_rate": 0.0008388254486133769, + "loss": 0.2267, + "num_input_tokens_seen": 22226752, + "step": 10285 + }, + { + "epoch": 1.6786296900489397, + "grad_norm": 0.10538561642169952, + "learning_rate": 0.0008392332789559544, + "loss": 0.225, + "num_input_tokens_seen": 22237952, + "step": 10290 + }, + { + "epoch": 1.6794453507340945, + "grad_norm": 0.06835313141345978, + "learning_rate": 0.0008396411092985318, + "loss": 0.1137, + "num_input_tokens_seen": 22248032, + "step": 10295 + }, + { + "epoch": 1.6802610114192496, + "grad_norm": 0.27471956610679626, + "learning_rate": 0.0008400489396411093, + "loss": 0.1408, + "num_input_tokens_seen": 22258816, + "step": 10300 + }, + { + "epoch": 1.6810766721044046, + "grad_norm": 0.07462483644485474, + "learning_rate": 0.0008404567699836868, + "loss": 0.1832, + "num_input_tokens_seen": 22270720, + "step": 10305 + }, + { + "epoch": 1.6818923327895594, + "grad_norm": 0.22200042009353638, + "learning_rate": 0.0008408646003262644, + "loss": 0.1152, + "num_input_tokens_seen": 22280160, + "step": 10310 + }, + { + "epoch": 1.6827079934747147, + "grad_norm": 0.018682435154914856, + "learning_rate": 0.0008412724306688417, + "loss": 0.0519, + "num_input_tokens_seen": 22291264, + "step": 10315 + }, + { + "epoch": 1.6835236541598695, + "grad_norm": 0.1510961651802063, + "learning_rate": 0.0008416802610114192, + "loss": 0.1112, + "num_input_tokens_seen": 22302080, + "step": 10320 + }, + { + "epoch": 1.6843393148450243, + "grad_norm": 0.46816954016685486, + "learning_rate": 0.0008420880913539968, + "loss": 0.2358, + "num_input_tokens_seen": 22313344, + "step": 10325 + }, + { + "epoch": 1.6851549755301796, + "grad_norm": 0.07551740109920502, + "learning_rate": 0.0008424959216965743, + "loss": 0.1266, + "num_input_tokens_seen": 22324416, + "step": 10330 + }, + { + "epoch": 1.6859706362153344, + "grad_norm": 0.0401376448571682, + "learning_rate": 0.0008429037520391518, + "loss": 0.1085, + "num_input_tokens_seen": 22334528, + "step": 10335 + }, + { + "epoch": 1.6867862969004894, + "grad_norm": 0.0454762764275074, + "learning_rate": 0.0008433115823817292, + "loss": 0.0972, + "num_input_tokens_seen": 22344864, + "step": 10340 + }, + { + "epoch": 1.6876019575856445, + "grad_norm": 0.0823223739862442, + "learning_rate": 0.0008437194127243067, + "loss": 0.1238, + "num_input_tokens_seen": 22355360, + "step": 10345 + }, + { + "epoch": 1.6884176182707993, + "grad_norm": 0.043828509747982025, + "learning_rate": 0.0008441272430668842, + "loss": 0.0374, + "num_input_tokens_seen": 22366016, + "step": 10350 + }, + { + "epoch": 1.6892332789559543, + "grad_norm": 0.21091154217720032, + "learning_rate": 0.0008445350734094617, + "loss": 0.0975, + "num_input_tokens_seen": 22376128, + "step": 10355 + }, + { + "epoch": 1.6900489396411094, + "grad_norm": 0.19217798113822937, + "learning_rate": 0.0008449429037520392, + "loss": 0.2432, + "num_input_tokens_seen": 22385632, + "step": 10360 + }, + { + "epoch": 1.6908646003262642, + "grad_norm": 0.17423127591609955, + "learning_rate": 0.0008453507340946166, + "loss": 0.2182, + "num_input_tokens_seen": 22397152, + "step": 10365 + }, + { + "epoch": 1.6916802610114192, + "grad_norm": 0.06854557245969772, + "learning_rate": 0.0008457585644371941, + "loss": 0.1772, + "num_input_tokens_seen": 22409952, + "step": 10370 + }, + { + "epoch": 1.6924959216965743, + "grad_norm": 0.05031463876366615, + "learning_rate": 0.0008461663947797717, + "loss": 0.1286, + "num_input_tokens_seen": 22422016, + "step": 10375 + }, + { + "epoch": 1.693311582381729, + "grad_norm": 0.10461442172527313, + "learning_rate": 0.0008465742251223492, + "loss": 0.1723, + "num_input_tokens_seen": 22432960, + "step": 10380 + }, + { + "epoch": 1.6941272430668843, + "grad_norm": 0.06757992506027222, + "learning_rate": 0.0008469820554649265, + "loss": 0.082, + "num_input_tokens_seen": 22443648, + "step": 10385 + }, + { + "epoch": 1.6949429037520392, + "grad_norm": 0.09296396374702454, + "learning_rate": 0.0008473898858075041, + "loss": 0.1124, + "num_input_tokens_seen": 22454560, + "step": 10390 + }, + { + "epoch": 1.695758564437194, + "grad_norm": 0.16991855204105377, + "learning_rate": 0.0008477977161500816, + "loss": 0.127, + "num_input_tokens_seen": 22466400, + "step": 10395 + }, + { + "epoch": 1.6965742251223492, + "grad_norm": 0.05406171828508377, + "learning_rate": 0.0008482055464926591, + "loss": 0.0847, + "num_input_tokens_seen": 22476800, + "step": 10400 + }, + { + "epoch": 1.697389885807504, + "grad_norm": 0.12532663345336914, + "learning_rate": 0.0008486133768352365, + "loss": 0.0764, + "num_input_tokens_seen": 22488288, + "step": 10405 + }, + { + "epoch": 1.698205546492659, + "grad_norm": 0.23430512845516205, + "learning_rate": 0.000849021207177814, + "loss": 0.1187, + "num_input_tokens_seen": 22499264, + "step": 10410 + }, + { + "epoch": 1.6990212071778141, + "grad_norm": 0.08874372392892838, + "learning_rate": 0.0008494290375203916, + "loss": 0.1074, + "num_input_tokens_seen": 22510080, + "step": 10415 + }, + { + "epoch": 1.699836867862969, + "grad_norm": 0.1635916829109192, + "learning_rate": 0.000849836867862969, + "loss": 0.0719, + "num_input_tokens_seen": 22520096, + "step": 10420 + }, + { + "epoch": 1.700652528548124, + "grad_norm": 0.07796313613653183, + "learning_rate": 0.0008502446982055465, + "loss": 0.0887, + "num_input_tokens_seen": 22531168, + "step": 10425 + }, + { + "epoch": 1.701468189233279, + "grad_norm": 0.2718281149864197, + "learning_rate": 0.000850652528548124, + "loss": 0.1157, + "num_input_tokens_seen": 22541600, + "step": 10430 + }, + { + "epoch": 1.7022838499184338, + "grad_norm": 0.17568756639957428, + "learning_rate": 0.0008510603588907014, + "loss": 0.0538, + "num_input_tokens_seen": 22552864, + "step": 10435 + }, + { + "epoch": 1.7030995106035889, + "grad_norm": 0.006328089628368616, + "learning_rate": 0.000851468189233279, + "loss": 0.2101, + "num_input_tokens_seen": 22563776, + "step": 10440 + }, + { + "epoch": 1.703915171288744, + "grad_norm": 0.20163559913635254, + "learning_rate": 0.0008518760195758565, + "loss": 0.0462, + "num_input_tokens_seen": 22575328, + "step": 10445 + }, + { + "epoch": 1.7047308319738987, + "grad_norm": 0.3907875418663025, + "learning_rate": 0.000852283849918434, + "loss": 0.1548, + "num_input_tokens_seen": 22586240, + "step": 10450 + }, + { + "epoch": 1.7055464926590538, + "grad_norm": 0.010955499485135078, + "learning_rate": 0.0008526916802610114, + "loss": 0.0301, + "num_input_tokens_seen": 22596512, + "step": 10455 + }, + { + "epoch": 1.7063621533442088, + "grad_norm": 0.4692709147930145, + "learning_rate": 0.0008530995106035889, + "loss": 0.279, + "num_input_tokens_seen": 22607200, + "step": 10460 + }, + { + "epoch": 1.7071778140293636, + "grad_norm": 0.17287231981754303, + "learning_rate": 0.0008535073409461665, + "loss": 0.0531, + "num_input_tokens_seen": 22618368, + "step": 10465 + }, + { + "epoch": 1.707993474714519, + "grad_norm": 0.022568654268980026, + "learning_rate": 0.0008539151712887438, + "loss": 0.1089, + "num_input_tokens_seen": 22629312, + "step": 10470 + }, + { + "epoch": 1.7088091353996737, + "grad_norm": 0.09624893218278885, + "learning_rate": 0.0008543230016313214, + "loss": 0.1738, + "num_input_tokens_seen": 22641568, + "step": 10475 + }, + { + "epoch": 1.7096247960848288, + "grad_norm": 0.12691551446914673, + "learning_rate": 0.0008547308319738989, + "loss": 0.0738, + "num_input_tokens_seen": 22652224, + "step": 10480 + }, + { + "epoch": 1.7104404567699838, + "grad_norm": 0.17793002724647522, + "learning_rate": 0.0008551386623164764, + "loss": 0.057, + "num_input_tokens_seen": 22662816, + "step": 10485 + }, + { + "epoch": 1.7112561174551386, + "grad_norm": 0.07302019745111465, + "learning_rate": 0.0008555464926590538, + "loss": 0.1733, + "num_input_tokens_seen": 22674368, + "step": 10490 + }, + { + "epoch": 1.7120717781402937, + "grad_norm": 0.15017388761043549, + "learning_rate": 0.0008559543230016313, + "loss": 0.1191, + "num_input_tokens_seen": 22686528, + "step": 10495 + }, + { + "epoch": 1.7128874388254487, + "grad_norm": 0.2169235348701477, + "learning_rate": 0.0008563621533442089, + "loss": 0.1601, + "num_input_tokens_seen": 22696384, + "step": 10500 + }, + { + "epoch": 1.7137030995106035, + "grad_norm": 0.12006626278162003, + "learning_rate": 0.0008567699836867863, + "loss": 0.2777, + "num_input_tokens_seen": 22707552, + "step": 10505 + }, + { + "epoch": 1.7145187601957586, + "grad_norm": 0.1256376951932907, + "learning_rate": 0.0008571778140293638, + "loss": 0.2857, + "num_input_tokens_seen": 22718496, + "step": 10510 + }, + { + "epoch": 1.7153344208809136, + "grad_norm": 0.24016325175762177, + "learning_rate": 0.0008575856443719413, + "loss": 0.2836, + "num_input_tokens_seen": 22729088, + "step": 10515 + }, + { + "epoch": 1.7161500815660684, + "grad_norm": 0.08173760026693344, + "learning_rate": 0.0008579934747145188, + "loss": 0.1304, + "num_input_tokens_seen": 22738944, + "step": 10520 + }, + { + "epoch": 1.7169657422512234, + "grad_norm": 0.11104224622249603, + "learning_rate": 0.0008584013050570962, + "loss": 0.1197, + "num_input_tokens_seen": 22749952, + "step": 10525 + }, + { + "epoch": 1.7177814029363785, + "grad_norm": 0.05892535671591759, + "learning_rate": 0.0008588091353996738, + "loss": 0.0683, + "num_input_tokens_seen": 22759200, + "step": 10530 + }, + { + "epoch": 1.7185970636215333, + "grad_norm": 0.14461533725261688, + "learning_rate": 0.0008592169657422512, + "loss": 0.1721, + "num_input_tokens_seen": 22769632, + "step": 10535 + }, + { + "epoch": 1.7194127243066886, + "grad_norm": 0.05228950455784798, + "learning_rate": 0.0008596247960848287, + "loss": 0.0866, + "num_input_tokens_seen": 22781056, + "step": 10540 + }, + { + "epoch": 1.7202283849918434, + "grad_norm": 0.29761654138565063, + "learning_rate": 0.0008600326264274062, + "loss": 0.1655, + "num_input_tokens_seen": 22791776, + "step": 10545 + }, + { + "epoch": 1.7210440456769984, + "grad_norm": 0.018953580409288406, + "learning_rate": 0.0008604404567699837, + "loss": 0.0932, + "num_input_tokens_seen": 22801952, + "step": 10550 + }, + { + "epoch": 1.7218597063621535, + "grad_norm": 0.05711786821484566, + "learning_rate": 0.0008608482871125613, + "loss": 0.0713, + "num_input_tokens_seen": 22813024, + "step": 10555 + }, + { + "epoch": 1.7226753670473083, + "grad_norm": 0.08064857870340347, + "learning_rate": 0.0008612561174551386, + "loss": 0.1857, + "num_input_tokens_seen": 22823904, + "step": 10560 + }, + { + "epoch": 1.7234910277324633, + "grad_norm": 0.03695710375905037, + "learning_rate": 0.0008616639477977162, + "loss": 0.136, + "num_input_tokens_seen": 22834400, + "step": 10565 + }, + { + "epoch": 1.7243066884176184, + "grad_norm": 0.132895827293396, + "learning_rate": 0.0008620717781402937, + "loss": 0.0797, + "num_input_tokens_seen": 22845856, + "step": 10570 + }, + { + "epoch": 1.7251223491027732, + "grad_norm": 0.07648682594299316, + "learning_rate": 0.0008624796084828711, + "loss": 0.1009, + "num_input_tokens_seen": 22856608, + "step": 10575 + }, + { + "epoch": 1.7259380097879282, + "grad_norm": 0.2530839741230011, + "learning_rate": 0.0008628874388254486, + "loss": 0.1728, + "num_input_tokens_seen": 22867168, + "step": 10580 + }, + { + "epoch": 1.7267536704730833, + "grad_norm": 0.14033333957195282, + "learning_rate": 0.0008632952691680261, + "loss": 0.1443, + "num_input_tokens_seen": 22877312, + "step": 10585 + }, + { + "epoch": 1.727569331158238, + "grad_norm": 0.12076137959957123, + "learning_rate": 0.0008637030995106036, + "loss": 0.0775, + "num_input_tokens_seen": 22888832, + "step": 10590 + }, + { + "epoch": 1.7283849918433931, + "grad_norm": 0.05201772227883339, + "learning_rate": 0.0008641109298531811, + "loss": 0.1281, + "num_input_tokens_seen": 22898464, + "step": 10595 + }, + { + "epoch": 1.7292006525285482, + "grad_norm": 0.05368548259139061, + "learning_rate": 0.0008645187601957585, + "loss": 0.0989, + "num_input_tokens_seen": 22909568, + "step": 10600 + }, + { + "epoch": 1.730016313213703, + "grad_norm": 0.21361590921878815, + "learning_rate": 0.0008649265905383361, + "loss": 0.1372, + "num_input_tokens_seen": 22920992, + "step": 10605 + }, + { + "epoch": 1.7308319738988582, + "grad_norm": 0.18085242807865143, + "learning_rate": 0.0008653344208809135, + "loss": 0.1765, + "num_input_tokens_seen": 22930880, + "step": 10610 + }, + { + "epoch": 1.731647634584013, + "grad_norm": 0.060455434024333954, + "learning_rate": 0.0008657422512234911, + "loss": 0.1894, + "num_input_tokens_seen": 22941568, + "step": 10615 + }, + { + "epoch": 1.7324632952691679, + "grad_norm": 0.12104543298482895, + "learning_rate": 0.0008661500815660686, + "loss": 0.0868, + "num_input_tokens_seen": 22952576, + "step": 10620 + }, + { + "epoch": 1.7332789559543231, + "grad_norm": 0.16045480966567993, + "learning_rate": 0.0008665579119086459, + "loss": 0.1066, + "num_input_tokens_seen": 22962144, + "step": 10625 + }, + { + "epoch": 1.734094616639478, + "grad_norm": 0.012985051609575748, + "learning_rate": 0.0008669657422512235, + "loss": 0.1631, + "num_input_tokens_seen": 22973312, + "step": 10630 + }, + { + "epoch": 1.734910277324633, + "grad_norm": 0.07617738097906113, + "learning_rate": 0.000867373572593801, + "loss": 0.1551, + "num_input_tokens_seen": 22983424, + "step": 10635 + }, + { + "epoch": 1.735725938009788, + "grad_norm": 0.16659033298492432, + "learning_rate": 0.0008677814029363786, + "loss": 0.0858, + "num_input_tokens_seen": 22992800, + "step": 10640 + }, + { + "epoch": 1.7365415986949428, + "grad_norm": 0.012401281856000423, + "learning_rate": 0.0008681892332789559, + "loss": 0.0629, + "num_input_tokens_seen": 23003552, + "step": 10645 + }, + { + "epoch": 1.7373572593800979, + "grad_norm": 0.010529414750635624, + "learning_rate": 0.0008685970636215334, + "loss": 0.1084, + "num_input_tokens_seen": 23014688, + "step": 10650 + }, + { + "epoch": 1.738172920065253, + "grad_norm": 0.006768247112631798, + "learning_rate": 0.000869004893964111, + "loss": 0.0348, + "num_input_tokens_seen": 23025344, + "step": 10655 + }, + { + "epoch": 1.7389885807504077, + "grad_norm": 0.058623116463422775, + "learning_rate": 0.0008694127243066884, + "loss": 0.0389, + "num_input_tokens_seen": 23035328, + "step": 10660 + }, + { + "epoch": 1.7398042414355628, + "grad_norm": 0.007725914474576712, + "learning_rate": 0.000869820554649266, + "loss": 0.1048, + "num_input_tokens_seen": 23045504, + "step": 10665 + }, + { + "epoch": 1.7406199021207178, + "grad_norm": 0.34186941385269165, + "learning_rate": 0.0008702283849918434, + "loss": 0.1211, + "num_input_tokens_seen": 23056736, + "step": 10670 + }, + { + "epoch": 1.7414355628058726, + "grad_norm": 0.47554755210876465, + "learning_rate": 0.0008706362153344209, + "loss": 0.1373, + "num_input_tokens_seen": 23065760, + "step": 10675 + }, + { + "epoch": 1.7422512234910277, + "grad_norm": 0.206298828125, + "learning_rate": 0.0008710440456769984, + "loss": 0.1016, + "num_input_tokens_seen": 23077376, + "step": 10680 + }, + { + "epoch": 1.7430668841761827, + "grad_norm": 0.016806311905384064, + "learning_rate": 0.0008714518760195759, + "loss": 0.0164, + "num_input_tokens_seen": 23088736, + "step": 10685 + }, + { + "epoch": 1.7438825448613375, + "grad_norm": 0.05761105194687843, + "learning_rate": 0.0008718597063621533, + "loss": 0.2616, + "num_input_tokens_seen": 23101088, + "step": 10690 + }, + { + "epoch": 1.7446982055464928, + "grad_norm": 0.03953443840146065, + "learning_rate": 0.0008722675367047308, + "loss": 0.2475, + "num_input_tokens_seen": 23112576, + "step": 10695 + }, + { + "epoch": 1.7455138662316476, + "grad_norm": 0.36860281229019165, + "learning_rate": 0.0008726753670473083, + "loss": 0.1334, + "num_input_tokens_seen": 23123744, + "step": 10700 + }, + { + "epoch": 1.7463295269168027, + "grad_norm": 0.23439127206802368, + "learning_rate": 0.0008730831973898859, + "loss": 0.1546, + "num_input_tokens_seen": 23134784, + "step": 10705 + }, + { + "epoch": 1.7471451876019577, + "grad_norm": 0.04523128643631935, + "learning_rate": 0.0008734910277324634, + "loss": 0.1541, + "num_input_tokens_seen": 23146240, + "step": 10710 + }, + { + "epoch": 1.7479608482871125, + "grad_norm": 0.2866004705429077, + "learning_rate": 0.0008738988580750407, + "loss": 0.1121, + "num_input_tokens_seen": 23158208, + "step": 10715 + }, + { + "epoch": 1.7487765089722676, + "grad_norm": 0.0991467610001564, + "learning_rate": 0.0008743066884176183, + "loss": 0.0889, + "num_input_tokens_seen": 23169280, + "step": 10720 + }, + { + "epoch": 1.7495921696574226, + "grad_norm": 0.05290234833955765, + "learning_rate": 0.0008747145187601958, + "loss": 0.0835, + "num_input_tokens_seen": 23181728, + "step": 10725 + }, + { + "epoch": 1.7504078303425774, + "grad_norm": 0.2909699082374573, + "learning_rate": 0.0008751223491027733, + "loss": 0.1154, + "num_input_tokens_seen": 23191328, + "step": 10730 + }, + { + "epoch": 1.7512234910277324, + "grad_norm": 0.08580849319696426, + "learning_rate": 0.0008755301794453507, + "loss": 0.0663, + "num_input_tokens_seen": 23200832, + "step": 10735 + }, + { + "epoch": 1.7520391517128875, + "grad_norm": 0.3593541085720062, + "learning_rate": 0.0008759380097879282, + "loss": 0.2485, + "num_input_tokens_seen": 23211456, + "step": 10740 + }, + { + "epoch": 1.7528548123980423, + "grad_norm": 0.09344206005334854, + "learning_rate": 0.0008763458401305058, + "loss": 0.111, + "num_input_tokens_seen": 23223104, + "step": 10745 + }, + { + "epoch": 1.7536704730831973, + "grad_norm": 0.18612459301948547, + "learning_rate": 0.0008767536704730832, + "loss": 0.2546, + "num_input_tokens_seen": 23234752, + "step": 10750 + }, + { + "epoch": 1.7544861337683524, + "grad_norm": 0.4155328869819641, + "learning_rate": 0.0008771615008156608, + "loss": 0.1174, + "num_input_tokens_seen": 23246944, + "step": 10755 + }, + { + "epoch": 1.7553017944535072, + "grad_norm": 0.02744222804903984, + "learning_rate": 0.0008775693311582382, + "loss": 0.1136, + "num_input_tokens_seen": 23258848, + "step": 10760 + }, + { + "epoch": 1.7561174551386625, + "grad_norm": 0.060500990599393845, + "learning_rate": 0.0008779771615008156, + "loss": 0.1549, + "num_input_tokens_seen": 23269056, + "step": 10765 + }, + { + "epoch": 1.7569331158238173, + "grad_norm": 0.06255891919136047, + "learning_rate": 0.0008783849918433932, + "loss": 0.1089, + "num_input_tokens_seen": 23279328, + "step": 10770 + }, + { + "epoch": 1.7577487765089723, + "grad_norm": 0.05365055426955223, + "learning_rate": 0.0008787928221859707, + "loss": 0.0844, + "num_input_tokens_seen": 23288192, + "step": 10775 + }, + { + "epoch": 1.7585644371941274, + "grad_norm": 0.02608959935605526, + "learning_rate": 0.0008792006525285482, + "loss": 0.0549, + "num_input_tokens_seen": 23299232, + "step": 10780 + }, + { + "epoch": 1.7593800978792822, + "grad_norm": 0.0558866485953331, + "learning_rate": 0.0008796084828711256, + "loss": 0.0902, + "num_input_tokens_seen": 23309536, + "step": 10785 + }, + { + "epoch": 1.7601957585644372, + "grad_norm": 0.09061188995838165, + "learning_rate": 0.0008800163132137031, + "loss": 0.0936, + "num_input_tokens_seen": 23321120, + "step": 10790 + }, + { + "epoch": 1.7610114192495923, + "grad_norm": 0.025436315685510635, + "learning_rate": 0.0008804241435562807, + "loss": 0.0201, + "num_input_tokens_seen": 23331904, + "step": 10795 + }, + { + "epoch": 1.761827079934747, + "grad_norm": 0.021344909444451332, + "learning_rate": 0.000880831973898858, + "loss": 0.1558, + "num_input_tokens_seen": 23342016, + "step": 10800 + }, + { + "epoch": 1.7626427406199021, + "grad_norm": 0.23954921960830688, + "learning_rate": 0.0008812398042414356, + "loss": 0.2992, + "num_input_tokens_seen": 23352320, + "step": 10805 + }, + { + "epoch": 1.7634584013050572, + "grad_norm": 0.353021502494812, + "learning_rate": 0.0008816476345840131, + "loss": 0.2506, + "num_input_tokens_seen": 23363840, + "step": 10810 + }, + { + "epoch": 1.764274061990212, + "grad_norm": 0.2084723562002182, + "learning_rate": 0.0008820554649265906, + "loss": 0.2649, + "num_input_tokens_seen": 23375040, + "step": 10815 + }, + { + "epoch": 1.765089722675367, + "grad_norm": 0.07466694712638855, + "learning_rate": 0.000882463295269168, + "loss": 0.19, + "num_input_tokens_seen": 23386752, + "step": 10820 + }, + { + "epoch": 1.765905383360522, + "grad_norm": 0.08622830361127853, + "learning_rate": 0.0008828711256117455, + "loss": 0.0898, + "num_input_tokens_seen": 23398496, + "step": 10825 + }, + { + "epoch": 1.7667210440456769, + "grad_norm": 0.168392151594162, + "learning_rate": 0.000883278955954323, + "loss": 0.1449, + "num_input_tokens_seen": 23409984, + "step": 10830 + }, + { + "epoch": 1.7675367047308321, + "grad_norm": 0.18816818296909332, + "learning_rate": 0.0008836867862969005, + "loss": 0.2378, + "num_input_tokens_seen": 23419968, + "step": 10835 + }, + { + "epoch": 1.768352365415987, + "grad_norm": 0.08224303275346756, + "learning_rate": 0.000884094616639478, + "loss": 0.0756, + "num_input_tokens_seen": 23430720, + "step": 10840 + }, + { + "epoch": 1.7691680261011418, + "grad_norm": 0.07841552048921585, + "learning_rate": 0.0008845024469820555, + "loss": 0.1262, + "num_input_tokens_seen": 23440960, + "step": 10845 + }, + { + "epoch": 1.769983686786297, + "grad_norm": 0.03823342174291611, + "learning_rate": 0.0008849102773246329, + "loss": 0.0482, + "num_input_tokens_seen": 23451040, + "step": 10850 + }, + { + "epoch": 1.7707993474714518, + "grad_norm": 0.369051456451416, + "learning_rate": 0.0008853181076672104, + "loss": 0.2453, + "num_input_tokens_seen": 23461024, + "step": 10855 + }, + { + "epoch": 1.7716150081566069, + "grad_norm": 0.08475756645202637, + "learning_rate": 0.000885725938009788, + "loss": 0.0929, + "num_input_tokens_seen": 23472192, + "step": 10860 + }, + { + "epoch": 1.772430668841762, + "grad_norm": 0.2269689291715622, + "learning_rate": 0.0008861337683523655, + "loss": 0.2801, + "num_input_tokens_seen": 23482624, + "step": 10865 + }, + { + "epoch": 1.7732463295269167, + "grad_norm": 0.10074033588171005, + "learning_rate": 0.0008865415986949429, + "loss": 0.0721, + "num_input_tokens_seen": 23492928, + "step": 10870 + }, + { + "epoch": 1.7740619902120718, + "grad_norm": 0.04861301928758621, + "learning_rate": 0.0008869494290375204, + "loss": 0.1541, + "num_input_tokens_seen": 23505312, + "step": 10875 + }, + { + "epoch": 1.7748776508972268, + "grad_norm": 0.05188162997364998, + "learning_rate": 0.0008873572593800979, + "loss": 0.1303, + "num_input_tokens_seen": 23516608, + "step": 10880 + }, + { + "epoch": 1.7756933115823816, + "grad_norm": 0.21070732176303864, + "learning_rate": 0.0008877650897226754, + "loss": 0.2544, + "num_input_tokens_seen": 23527552, + "step": 10885 + }, + { + "epoch": 1.7765089722675367, + "grad_norm": 0.04607458412647247, + "learning_rate": 0.0008881729200652528, + "loss": 0.0782, + "num_input_tokens_seen": 23537888, + "step": 10890 + }, + { + "epoch": 1.7773246329526917, + "grad_norm": 0.09887990355491638, + "learning_rate": 0.0008885807504078304, + "loss": 0.1516, + "num_input_tokens_seen": 23548480, + "step": 10895 + }, + { + "epoch": 1.7781402936378465, + "grad_norm": 0.13077348470687866, + "learning_rate": 0.0008889885807504079, + "loss": 0.1197, + "num_input_tokens_seen": 23560192, + "step": 10900 + }, + { + "epoch": 1.7789559543230016, + "grad_norm": 0.176055908203125, + "learning_rate": 0.0008893964110929853, + "loss": 0.1321, + "num_input_tokens_seen": 23570432, + "step": 10905 + }, + { + "epoch": 1.7797716150081566, + "grad_norm": 0.2664521634578705, + "learning_rate": 0.0008898042414355628, + "loss": 0.1728, + "num_input_tokens_seen": 23582080, + "step": 10910 + }, + { + "epoch": 1.7805872756933114, + "grad_norm": 0.17070364952087402, + "learning_rate": 0.0008902120717781403, + "loss": 0.1953, + "num_input_tokens_seen": 23592096, + "step": 10915 + }, + { + "epoch": 1.7814029363784667, + "grad_norm": 0.31275373697280884, + "learning_rate": 0.0008906199021207178, + "loss": 0.0856, + "num_input_tokens_seen": 23603712, + "step": 10920 + }, + { + "epoch": 1.7822185970636215, + "grad_norm": 0.04737719148397446, + "learning_rate": 0.0008910277324632953, + "loss": 0.1324, + "num_input_tokens_seen": 23615168, + "step": 10925 + }, + { + "epoch": 1.7830342577487766, + "grad_norm": 0.0940323919057846, + "learning_rate": 0.0008914355628058728, + "loss": 0.1321, + "num_input_tokens_seen": 23623936, + "step": 10930 + }, + { + "epoch": 1.7838499184339316, + "grad_norm": 0.23389218747615814, + "learning_rate": 0.0008918433931484503, + "loss": 0.1104, + "num_input_tokens_seen": 23634656, + "step": 10935 + }, + { + "epoch": 1.7846655791190864, + "grad_norm": 0.16723588109016418, + "learning_rate": 0.0008922512234910277, + "loss": 0.1185, + "num_input_tokens_seen": 23644992, + "step": 10940 + }, + { + "epoch": 1.7854812398042414, + "grad_norm": 0.3634952902793884, + "learning_rate": 0.0008926590538336053, + "loss": 0.2453, + "num_input_tokens_seen": 23656800, + "step": 10945 + }, + { + "epoch": 1.7862969004893965, + "grad_norm": 0.2302580624818802, + "learning_rate": 0.0008930668841761828, + "loss": 0.0792, + "num_input_tokens_seen": 23668832, + "step": 10950 + }, + { + "epoch": 1.7871125611745513, + "grad_norm": 0.34366220235824585, + "learning_rate": 0.0008934747145187601, + "loss": 0.2055, + "num_input_tokens_seen": 23678464, + "step": 10955 + }, + { + "epoch": 1.7879282218597063, + "grad_norm": 0.041612409055233, + "learning_rate": 0.0008938825448613377, + "loss": 0.0343, + "num_input_tokens_seen": 23689376, + "step": 10960 + }, + { + "epoch": 1.7887438825448614, + "grad_norm": 0.02797471545636654, + "learning_rate": 0.0008942903752039152, + "loss": 0.0632, + "num_input_tokens_seen": 23700096, + "step": 10965 + }, + { + "epoch": 1.7895595432300162, + "grad_norm": 0.19312946498394012, + "learning_rate": 0.0008946982055464927, + "loss": 0.0759, + "num_input_tokens_seen": 23711552, + "step": 10970 + }, + { + "epoch": 1.7903752039151712, + "grad_norm": 0.18493857979774475, + "learning_rate": 0.0008951060358890701, + "loss": 0.1243, + "num_input_tokens_seen": 23722912, + "step": 10975 + }, + { + "epoch": 1.7911908646003263, + "grad_norm": 0.026496220380067825, + "learning_rate": 0.0008955138662316476, + "loss": 0.2482, + "num_input_tokens_seen": 23733248, + "step": 10980 + }, + { + "epoch": 1.792006525285481, + "grad_norm": 0.2228316068649292, + "learning_rate": 0.0008959216965742252, + "loss": 0.1109, + "num_input_tokens_seen": 23744384, + "step": 10985 + }, + { + "epoch": 1.7928221859706364, + "grad_norm": 0.40756627917289734, + "learning_rate": 0.0008963295269168026, + "loss": 0.1965, + "num_input_tokens_seen": 23755040, + "step": 10990 + }, + { + "epoch": 1.7936378466557912, + "grad_norm": 0.30325761437416077, + "learning_rate": 0.0008967373572593801, + "loss": 0.1768, + "num_input_tokens_seen": 23766240, + "step": 10995 + }, + { + "epoch": 1.7944535073409462, + "grad_norm": 0.19416655600070953, + "learning_rate": 0.0008971451876019576, + "loss": 0.1114, + "num_input_tokens_seen": 23777024, + "step": 11000 + }, + { + "epoch": 1.7952691680261013, + "grad_norm": 0.0678112730383873, + "learning_rate": 0.0008975530179445351, + "loss": 0.0636, + "num_input_tokens_seen": 23789920, + "step": 11005 + }, + { + "epoch": 1.796084828711256, + "grad_norm": 0.2991323471069336, + "learning_rate": 0.0008979608482871126, + "loss": 0.1564, + "num_input_tokens_seen": 23801600, + "step": 11010 + }, + { + "epoch": 1.7969004893964111, + "grad_norm": 0.2331552803516388, + "learning_rate": 0.0008983686786296901, + "loss": 0.1613, + "num_input_tokens_seen": 23812032, + "step": 11015 + }, + { + "epoch": 1.7977161500815662, + "grad_norm": 0.12368535250425339, + "learning_rate": 0.0008987765089722675, + "loss": 0.1444, + "num_input_tokens_seen": 23822752, + "step": 11020 + }, + { + "epoch": 1.798531810766721, + "grad_norm": 0.2614479959011078, + "learning_rate": 0.000899184339314845, + "loss": 0.1571, + "num_input_tokens_seen": 23833024, + "step": 11025 + }, + { + "epoch": 1.799347471451876, + "grad_norm": 0.2386416345834732, + "learning_rate": 0.0008995921696574225, + "loss": 0.1573, + "num_input_tokens_seen": 23843328, + "step": 11030 + }, + { + "epoch": 1.800163132137031, + "grad_norm": 0.04170841723680496, + "learning_rate": 0.0009000000000000001, + "loss": 0.0955, + "num_input_tokens_seen": 23853568, + "step": 11035 + }, + { + "epoch": 1.8009787928221859, + "grad_norm": 0.21823978424072266, + "learning_rate": 0.0009004078303425776, + "loss": 0.1304, + "num_input_tokens_seen": 23864000, + "step": 11040 + }, + { + "epoch": 1.801794453507341, + "grad_norm": 0.1484844982624054, + "learning_rate": 0.0009008156606851549, + "loss": 0.1727, + "num_input_tokens_seen": 23875136, + "step": 11045 + }, + { + "epoch": 1.802610114192496, + "grad_norm": 0.09188838303089142, + "learning_rate": 0.0009012234910277325, + "loss": 0.0887, + "num_input_tokens_seen": 23886784, + "step": 11050 + }, + { + "epoch": 1.8034257748776508, + "grad_norm": 0.06821305304765701, + "learning_rate": 0.00090163132137031, + "loss": 0.0782, + "num_input_tokens_seen": 23897440, + "step": 11055 + }, + { + "epoch": 1.804241435562806, + "grad_norm": 0.08660285174846649, + "learning_rate": 0.0009020391517128875, + "loss": 0.0435, + "num_input_tokens_seen": 23909248, + "step": 11060 + }, + { + "epoch": 1.8050570962479608, + "grad_norm": 0.10734372586011887, + "learning_rate": 0.0009024469820554649, + "loss": 0.075, + "num_input_tokens_seen": 23920224, + "step": 11065 + }, + { + "epoch": 1.8058727569331157, + "grad_norm": 0.009611149318516254, + "learning_rate": 0.0009028548123980424, + "loss": 0.0941, + "num_input_tokens_seen": 23930048, + "step": 11070 + }, + { + "epoch": 1.806688417618271, + "grad_norm": 0.039050959050655365, + "learning_rate": 0.0009032626427406199, + "loss": 0.1033, + "num_input_tokens_seen": 23940992, + "step": 11075 + }, + { + "epoch": 1.8075040783034257, + "grad_norm": 0.0733620673418045, + "learning_rate": 0.0009036704730831974, + "loss": 0.0577, + "num_input_tokens_seen": 23952288, + "step": 11080 + }, + { + "epoch": 1.8083197389885808, + "grad_norm": 0.09018149226903915, + "learning_rate": 0.000904078303425775, + "loss": 0.085, + "num_input_tokens_seen": 23962848, + "step": 11085 + }, + { + "epoch": 1.8091353996737358, + "grad_norm": 0.005015532020479441, + "learning_rate": 0.0009044861337683524, + "loss": 0.0439, + "num_input_tokens_seen": 23973504, + "step": 11090 + }, + { + "epoch": 1.8099510603588906, + "grad_norm": 0.11613103747367859, + "learning_rate": 0.0009048939641109298, + "loss": 0.0687, + "num_input_tokens_seen": 23983840, + "step": 11095 + }, + { + "epoch": 1.8107667210440457, + "grad_norm": 0.14860443770885468, + "learning_rate": 0.0009053017944535074, + "loss": 0.1707, + "num_input_tokens_seen": 23994720, + "step": 11100 + }, + { + "epoch": 1.8115823817292007, + "grad_norm": 0.6425272226333618, + "learning_rate": 0.0009057096247960849, + "loss": 0.277, + "num_input_tokens_seen": 24003488, + "step": 11105 + }, + { + "epoch": 1.8123980424143555, + "grad_norm": 0.22874023020267487, + "learning_rate": 0.0009061174551386622, + "loss": 0.19, + "num_input_tokens_seen": 24014336, + "step": 11110 + }, + { + "epoch": 1.8132137030995106, + "grad_norm": 0.15151821076869965, + "learning_rate": 0.0009065252854812398, + "loss": 0.1462, + "num_input_tokens_seen": 24025600, + "step": 11115 + }, + { + "epoch": 1.8140293637846656, + "grad_norm": 0.11738032102584839, + "learning_rate": 0.0009069331158238173, + "loss": 0.1265, + "num_input_tokens_seen": 24035392, + "step": 11120 + }, + { + "epoch": 1.8148450244698204, + "grad_norm": 0.6219301819801331, + "learning_rate": 0.0009073409461663949, + "loss": 0.3286, + "num_input_tokens_seen": 24046080, + "step": 11125 + }, + { + "epoch": 1.8156606851549757, + "grad_norm": 0.08909933269023895, + "learning_rate": 0.0009077487765089722, + "loss": 0.1256, + "num_input_tokens_seen": 24057536, + "step": 11130 + }, + { + "epoch": 1.8164763458401305, + "grad_norm": 0.04366849735379219, + "learning_rate": 0.0009081566068515497, + "loss": 0.0288, + "num_input_tokens_seen": 24068736, + "step": 11135 + }, + { + "epoch": 1.8172920065252853, + "grad_norm": 0.11872971057891846, + "learning_rate": 0.0009085644371941273, + "loss": 0.1215, + "num_input_tokens_seen": 24079360, + "step": 11140 + }, + { + "epoch": 1.8181076672104406, + "grad_norm": 0.3688180446624756, + "learning_rate": 0.0009089722675367047, + "loss": 0.1734, + "num_input_tokens_seen": 24090656, + "step": 11145 + }, + { + "epoch": 1.8189233278955954, + "grad_norm": 0.01831037551164627, + "learning_rate": 0.0009093800978792823, + "loss": 0.1089, + "num_input_tokens_seen": 24102336, + "step": 11150 + }, + { + "epoch": 1.8197389885807504, + "grad_norm": 0.0956871286034584, + "learning_rate": 0.0009097879282218597, + "loss": 0.1478, + "num_input_tokens_seen": 24113120, + "step": 11155 + }, + { + "epoch": 1.8205546492659055, + "grad_norm": 0.04308653995394707, + "learning_rate": 0.0009101957585644372, + "loss": 0.0524, + "num_input_tokens_seen": 24122688, + "step": 11160 + }, + { + "epoch": 1.8213703099510603, + "grad_norm": 0.21079613268375397, + "learning_rate": 0.0009106035889070147, + "loss": 0.1034, + "num_input_tokens_seen": 24133984, + "step": 11165 + }, + { + "epoch": 1.8221859706362153, + "grad_norm": 0.01858745887875557, + "learning_rate": 0.0009110114192495922, + "loss": 0.1008, + "num_input_tokens_seen": 24144928, + "step": 11170 + }, + { + "epoch": 1.8230016313213704, + "grad_norm": 0.0403125137090683, + "learning_rate": 0.0009114192495921697, + "loss": 0.0812, + "num_input_tokens_seen": 24154240, + "step": 11175 + }, + { + "epoch": 1.8238172920065252, + "grad_norm": 0.027002638205885887, + "learning_rate": 0.0009118270799347471, + "loss": 0.1107, + "num_input_tokens_seen": 24163584, + "step": 11180 + }, + { + "epoch": 1.8246329526916802, + "grad_norm": 0.042410269379615784, + "learning_rate": 0.0009122349102773246, + "loss": 0.1227, + "num_input_tokens_seen": 24175136, + "step": 11185 + }, + { + "epoch": 1.8254486133768353, + "grad_norm": 0.12024813145399094, + "learning_rate": 0.0009126427406199022, + "loss": 0.0818, + "num_input_tokens_seen": 24185856, + "step": 11190 + }, + { + "epoch": 1.82626427406199, + "grad_norm": 0.054859358817338943, + "learning_rate": 0.0009130505709624797, + "loss": 0.0702, + "num_input_tokens_seen": 24197152, + "step": 11195 + }, + { + "epoch": 1.8270799347471451, + "grad_norm": 0.14947378635406494, + "learning_rate": 0.0009134584013050571, + "loss": 0.2348, + "num_input_tokens_seen": 24206720, + "step": 11200 + }, + { + "epoch": 1.8278955954323002, + "grad_norm": 0.01208100188523531, + "learning_rate": 0.0009138662316476346, + "loss": 0.0458, + "num_input_tokens_seen": 24217696, + "step": 11205 + }, + { + "epoch": 1.828711256117455, + "grad_norm": 0.03552878648042679, + "learning_rate": 0.0009142740619902121, + "loss": 0.0615, + "num_input_tokens_seen": 24229088, + "step": 11210 + }, + { + "epoch": 1.8295269168026103, + "grad_norm": 0.12704972922801971, + "learning_rate": 0.0009146818923327896, + "loss": 0.1118, + "num_input_tokens_seen": 24240128, + "step": 11215 + }, + { + "epoch": 1.830342577487765, + "grad_norm": 0.049700263887643814, + "learning_rate": 0.000915089722675367, + "loss": 0.0909, + "num_input_tokens_seen": 24251520, + "step": 11220 + }, + { + "epoch": 1.8311582381729201, + "grad_norm": 0.05248570069670677, + "learning_rate": 0.0009154975530179446, + "loss": 0.0693, + "num_input_tokens_seen": 24262048, + "step": 11225 + }, + { + "epoch": 1.8319738988580752, + "grad_norm": 0.040520548820495605, + "learning_rate": 0.0009159053833605221, + "loss": 0.1184, + "num_input_tokens_seen": 24272480, + "step": 11230 + }, + { + "epoch": 1.83278955954323, + "grad_norm": 0.123872309923172, + "learning_rate": 0.0009163132137030995, + "loss": 0.1531, + "num_input_tokens_seen": 24282848, + "step": 11235 + }, + { + "epoch": 1.833605220228385, + "grad_norm": 0.050551868975162506, + "learning_rate": 0.000916721044045677, + "loss": 0.0674, + "num_input_tokens_seen": 24295072, + "step": 11240 + }, + { + "epoch": 1.83442088091354, + "grad_norm": 0.014776119962334633, + "learning_rate": 0.0009171288743882545, + "loss": 0.2557, + "num_input_tokens_seen": 24305920, + "step": 11245 + }, + { + "epoch": 1.8352365415986949, + "grad_norm": 0.08579311519861221, + "learning_rate": 0.0009175367047308319, + "loss": 0.1091, + "num_input_tokens_seen": 24317056, + "step": 11250 + }, + { + "epoch": 1.83605220228385, + "grad_norm": 0.05025889351963997, + "learning_rate": 0.0009179445350734095, + "loss": 0.1928, + "num_input_tokens_seen": 24327488, + "step": 11255 + }, + { + "epoch": 1.836867862969005, + "grad_norm": 0.25533077120780945, + "learning_rate": 0.000918352365415987, + "loss": 0.1287, + "num_input_tokens_seen": 24336896, + "step": 11260 + }, + { + "epoch": 1.8376835236541598, + "grad_norm": 0.14637209475040436, + "learning_rate": 0.0009187601957585645, + "loss": 0.0772, + "num_input_tokens_seen": 24348864, + "step": 11265 + }, + { + "epoch": 1.8384991843393148, + "grad_norm": 0.02450774982571602, + "learning_rate": 0.0009191680261011419, + "loss": 0.1115, + "num_input_tokens_seen": 24359520, + "step": 11270 + }, + { + "epoch": 1.8393148450244698, + "grad_norm": 0.1265328973531723, + "learning_rate": 0.0009195758564437194, + "loss": 0.128, + "num_input_tokens_seen": 24370304, + "step": 11275 + }, + { + "epoch": 1.8401305057096247, + "grad_norm": 0.06586892902851105, + "learning_rate": 0.000919983686786297, + "loss": 0.1629, + "num_input_tokens_seen": 24381312, + "step": 11280 + }, + { + "epoch": 1.84094616639478, + "grad_norm": 0.2524750530719757, + "learning_rate": 0.0009203915171288743, + "loss": 0.0922, + "num_input_tokens_seen": 24391552, + "step": 11285 + }, + { + "epoch": 1.8417618270799347, + "grad_norm": 0.07396470755338669, + "learning_rate": 0.0009207993474714519, + "loss": 0.0868, + "num_input_tokens_seen": 24402112, + "step": 11290 + }, + { + "epoch": 1.8425774877650896, + "grad_norm": 0.20884621143341064, + "learning_rate": 0.0009212071778140294, + "loss": 0.0339, + "num_input_tokens_seen": 24413600, + "step": 11295 + }, + { + "epoch": 1.8433931484502448, + "grad_norm": 0.009276431985199451, + "learning_rate": 0.0009216150081566068, + "loss": 0.0742, + "num_input_tokens_seen": 24423968, + "step": 11300 + }, + { + "epoch": 1.8442088091353996, + "grad_norm": 0.17926661670207977, + "learning_rate": 0.0009220228384991844, + "loss": 0.0935, + "num_input_tokens_seen": 24434688, + "step": 11305 + }, + { + "epoch": 1.8450244698205547, + "grad_norm": 0.4188963770866394, + "learning_rate": 0.0009224306688417618, + "loss": 0.1193, + "num_input_tokens_seen": 24444992, + "step": 11310 + }, + { + "epoch": 1.8458401305057097, + "grad_norm": 0.015882406383752823, + "learning_rate": 0.0009228384991843394, + "loss": 0.1212, + "num_input_tokens_seen": 24456512, + "step": 11315 + }, + { + "epoch": 1.8466557911908645, + "grad_norm": 0.01852530613541603, + "learning_rate": 0.0009232463295269168, + "loss": 0.0301, + "num_input_tokens_seen": 24467424, + "step": 11320 + }, + { + "epoch": 1.8474714518760196, + "grad_norm": 0.04548301175236702, + "learning_rate": 0.0009236541598694943, + "loss": 0.1594, + "num_input_tokens_seen": 24478720, + "step": 11325 + }, + { + "epoch": 1.8482871125611746, + "grad_norm": 0.24560341238975525, + "learning_rate": 0.0009240619902120718, + "loss": 0.0453, + "num_input_tokens_seen": 24489792, + "step": 11330 + }, + { + "epoch": 1.8491027732463294, + "grad_norm": 0.017956508323550224, + "learning_rate": 0.0009244698205546492, + "loss": 0.1174, + "num_input_tokens_seen": 24501760, + "step": 11335 + }, + { + "epoch": 1.8499184339314845, + "grad_norm": 0.13720254600048065, + "learning_rate": 0.0009248776508972268, + "loss": 0.1975, + "num_input_tokens_seen": 24511584, + "step": 11340 + }, + { + "epoch": 1.8507340946166395, + "grad_norm": 0.03751807659864426, + "learning_rate": 0.0009252854812398043, + "loss": 0.0923, + "num_input_tokens_seen": 24522432, + "step": 11345 + }, + { + "epoch": 1.8515497553017943, + "grad_norm": 0.310740202665329, + "learning_rate": 0.0009256933115823818, + "loss": 0.2647, + "num_input_tokens_seen": 24532544, + "step": 11350 + }, + { + "epoch": 1.8523654159869496, + "grad_norm": 0.13542063534259796, + "learning_rate": 0.0009261011419249592, + "loss": 0.1742, + "num_input_tokens_seen": 24543488, + "step": 11355 + }, + { + "epoch": 1.8531810766721044, + "grad_norm": 0.2352442741394043, + "learning_rate": 0.0009265089722675367, + "loss": 0.3112, + "num_input_tokens_seen": 24554080, + "step": 11360 + }, + { + "epoch": 1.8539967373572592, + "grad_norm": 0.08593737334012985, + "learning_rate": 0.0009269168026101143, + "loss": 0.081, + "num_input_tokens_seen": 24564992, + "step": 11365 + }, + { + "epoch": 1.8548123980424145, + "grad_norm": 0.07849381864070892, + "learning_rate": 0.0009273246329526917, + "loss": 0.0747, + "num_input_tokens_seen": 24576608, + "step": 11370 + }, + { + "epoch": 1.8556280587275693, + "grad_norm": 0.0883590504527092, + "learning_rate": 0.0009277324632952691, + "loss": 0.134, + "num_input_tokens_seen": 24587872, + "step": 11375 + }, + { + "epoch": 1.8564437194127243, + "grad_norm": 0.037716735154390335, + "learning_rate": 0.0009281402936378467, + "loss": 0.1817, + "num_input_tokens_seen": 24599136, + "step": 11380 + }, + { + "epoch": 1.8572593800978794, + "grad_norm": 0.08465716242790222, + "learning_rate": 0.0009285481239804242, + "loss": 0.1005, + "num_input_tokens_seen": 24610080, + "step": 11385 + }, + { + "epoch": 1.8580750407830342, + "grad_norm": 0.10003326833248138, + "learning_rate": 0.0009289559543230017, + "loss": 0.0937, + "num_input_tokens_seen": 24621568, + "step": 11390 + }, + { + "epoch": 1.8588907014681892, + "grad_norm": 0.08781938254833221, + "learning_rate": 0.0009293637846655791, + "loss": 0.112, + "num_input_tokens_seen": 24632384, + "step": 11395 + }, + { + "epoch": 1.8597063621533443, + "grad_norm": 0.1371522694826126, + "learning_rate": 0.0009297716150081566, + "loss": 0.2002, + "num_input_tokens_seen": 24643648, + "step": 11400 + }, + { + "epoch": 1.860522022838499, + "grad_norm": 0.18471182882785797, + "learning_rate": 0.0009301794453507341, + "loss": 0.0949, + "num_input_tokens_seen": 24655392, + "step": 11405 + }, + { + "epoch": 1.8613376835236541, + "grad_norm": 0.11153913289308548, + "learning_rate": 0.0009305872756933116, + "loss": 0.1083, + "num_input_tokens_seen": 24666112, + "step": 11410 + }, + { + "epoch": 1.8621533442088092, + "grad_norm": 0.2752339243888855, + "learning_rate": 0.000930995106035889, + "loss": 0.3004, + "num_input_tokens_seen": 24677248, + "step": 11415 + }, + { + "epoch": 1.862969004893964, + "grad_norm": 0.11731915175914764, + "learning_rate": 0.0009314029363784666, + "loss": 0.0804, + "num_input_tokens_seen": 24688192, + "step": 11420 + }, + { + "epoch": 1.863784665579119, + "grad_norm": 0.3206159770488739, + "learning_rate": 0.000931810766721044, + "loss": 0.1983, + "num_input_tokens_seen": 24699424, + "step": 11425 + }, + { + "epoch": 1.864600326264274, + "grad_norm": 0.19717612862586975, + "learning_rate": 0.0009322185970636216, + "loss": 0.2276, + "num_input_tokens_seen": 24709760, + "step": 11430 + }, + { + "epoch": 1.865415986949429, + "grad_norm": 0.06198367476463318, + "learning_rate": 0.0009326264274061991, + "loss": 0.1171, + "num_input_tokens_seen": 24720064, + "step": 11435 + }, + { + "epoch": 1.8662316476345842, + "grad_norm": 0.14472417533397675, + "learning_rate": 0.0009330342577487764, + "loss": 0.1029, + "num_input_tokens_seen": 24730912, + "step": 11440 + }, + { + "epoch": 1.867047308319739, + "grad_norm": 0.06060084328055382, + "learning_rate": 0.000933442088091354, + "loss": 0.045, + "num_input_tokens_seen": 24739808, + "step": 11445 + }, + { + "epoch": 1.867862969004894, + "grad_norm": 0.26887786388397217, + "learning_rate": 0.0009338499184339315, + "loss": 0.0618, + "num_input_tokens_seen": 24750976, + "step": 11450 + }, + { + "epoch": 1.868678629690049, + "grad_norm": 0.07969934493303299, + "learning_rate": 0.0009342577487765091, + "loss": 0.1042, + "num_input_tokens_seen": 24760672, + "step": 11455 + }, + { + "epoch": 1.8694942903752039, + "grad_norm": 0.025502964854240417, + "learning_rate": 0.0009346655791190864, + "loss": 0.083, + "num_input_tokens_seen": 24771968, + "step": 11460 + }, + { + "epoch": 1.870309951060359, + "grad_norm": 0.10868193954229355, + "learning_rate": 0.0009350734094616639, + "loss": 0.1087, + "num_input_tokens_seen": 24783776, + "step": 11465 + }, + { + "epoch": 1.871125611745514, + "grad_norm": 0.03004133701324463, + "learning_rate": 0.0009354812398042415, + "loss": 0.0305, + "num_input_tokens_seen": 24793472, + "step": 11470 + }, + { + "epoch": 1.8719412724306688, + "grad_norm": 0.12336234748363495, + "learning_rate": 0.0009358890701468189, + "loss": 0.0977, + "num_input_tokens_seen": 24802912, + "step": 11475 + }, + { + "epoch": 1.8727569331158238, + "grad_norm": 0.07441865652799606, + "learning_rate": 0.0009362969004893965, + "loss": 0.0828, + "num_input_tokens_seen": 24812416, + "step": 11480 + }, + { + "epoch": 1.8735725938009788, + "grad_norm": 0.08542287349700928, + "learning_rate": 0.0009367047308319739, + "loss": 0.2089, + "num_input_tokens_seen": 24823616, + "step": 11485 + }, + { + "epoch": 1.8743882544861337, + "grad_norm": 0.055280644446611404, + "learning_rate": 0.0009371125611745514, + "loss": 0.1477, + "num_input_tokens_seen": 24834656, + "step": 11490 + }, + { + "epoch": 1.8752039151712887, + "grad_norm": 0.051045581698417664, + "learning_rate": 0.0009375203915171289, + "loss": 0.1889, + "num_input_tokens_seen": 24845600, + "step": 11495 + }, + { + "epoch": 1.8760195758564437, + "grad_norm": 0.1657111495733261, + "learning_rate": 0.0009379282218597064, + "loss": 0.0902, + "num_input_tokens_seen": 24858208, + "step": 11500 + }, + { + "epoch": 1.8768352365415986, + "grad_norm": 0.2820875942707062, + "learning_rate": 0.000938336052202284, + "loss": 0.1677, + "num_input_tokens_seen": 24868416, + "step": 11505 + }, + { + "epoch": 1.8776508972267538, + "grad_norm": 0.07757575809955597, + "learning_rate": 0.0009387438825448613, + "loss": 0.0578, + "num_input_tokens_seen": 24879616, + "step": 11510 + }, + { + "epoch": 1.8784665579119086, + "grad_norm": 0.0790976956486702, + "learning_rate": 0.0009391517128874388, + "loss": 0.09, + "num_input_tokens_seen": 24890336, + "step": 11515 + }, + { + "epoch": 1.8792822185970635, + "grad_norm": 0.2218760997056961, + "learning_rate": 0.0009395595432300164, + "loss": 0.1424, + "num_input_tokens_seen": 24901760, + "step": 11520 + }, + { + "epoch": 1.8800978792822187, + "grad_norm": 0.03744116052985191, + "learning_rate": 0.0009399673735725939, + "loss": 0.1792, + "num_input_tokens_seen": 24911520, + "step": 11525 + }, + { + "epoch": 1.8809135399673735, + "grad_norm": 0.16890190541744232, + "learning_rate": 0.0009403752039151713, + "loss": 0.1157, + "num_input_tokens_seen": 24922016, + "step": 11530 + }, + { + "epoch": 1.8817292006525286, + "grad_norm": 0.031172795221209526, + "learning_rate": 0.0009407830342577488, + "loss": 0.0411, + "num_input_tokens_seen": 24932832, + "step": 11535 + }, + { + "epoch": 1.8825448613376836, + "grad_norm": 0.03609168902039528, + "learning_rate": 0.0009411908646003263, + "loss": 0.1289, + "num_input_tokens_seen": 24945632, + "step": 11540 + }, + { + "epoch": 1.8833605220228384, + "grad_norm": 0.4785745441913605, + "learning_rate": 0.0009415986949429038, + "loss": 0.3079, + "num_input_tokens_seen": 24954176, + "step": 11545 + }, + { + "epoch": 1.8841761827079935, + "grad_norm": 0.06320811808109283, + "learning_rate": 0.0009420065252854812, + "loss": 0.1004, + "num_input_tokens_seen": 24964512, + "step": 11550 + }, + { + "epoch": 1.8849918433931485, + "grad_norm": 0.0489787794649601, + "learning_rate": 0.0009424143556280587, + "loss": 0.0694, + "num_input_tokens_seen": 24976416, + "step": 11555 + }, + { + "epoch": 1.8858075040783033, + "grad_norm": 0.20213648676872253, + "learning_rate": 0.0009428221859706362, + "loss": 0.3237, + "num_input_tokens_seen": 24986656, + "step": 11560 + }, + { + "epoch": 1.8866231647634584, + "grad_norm": 0.14317026734352112, + "learning_rate": 0.0009432300163132137, + "loss": 0.1493, + "num_input_tokens_seen": 24996384, + "step": 11565 + }, + { + "epoch": 1.8874388254486134, + "grad_norm": 0.04424556717276573, + "learning_rate": 0.0009436378466557913, + "loss": 0.0939, + "num_input_tokens_seen": 25007808, + "step": 11570 + }, + { + "epoch": 1.8882544861337682, + "grad_norm": 0.08489039540290833, + "learning_rate": 0.0009440456769983687, + "loss": 0.1333, + "num_input_tokens_seen": 25018816, + "step": 11575 + }, + { + "epoch": 1.8890701468189235, + "grad_norm": 0.30416834354400635, + "learning_rate": 0.0009444535073409461, + "loss": 0.1223, + "num_input_tokens_seen": 25029472, + "step": 11580 + }, + { + "epoch": 1.8898858075040783, + "grad_norm": 0.031548064202070236, + "learning_rate": 0.0009448613376835237, + "loss": 0.1407, + "num_input_tokens_seen": 25041280, + "step": 11585 + }, + { + "epoch": 1.8907014681892331, + "grad_norm": 0.05067252740263939, + "learning_rate": 0.0009452691680261012, + "loss": 0.1139, + "num_input_tokens_seen": 25052256, + "step": 11590 + }, + { + "epoch": 1.8915171288743884, + "grad_norm": 0.18682821094989777, + "learning_rate": 0.0009456769983686786, + "loss": 0.1019, + "num_input_tokens_seen": 25063328, + "step": 11595 + }, + { + "epoch": 1.8923327895595432, + "grad_norm": 0.04457815736532211, + "learning_rate": 0.0009460848287112561, + "loss": 0.0436, + "num_input_tokens_seen": 25075296, + "step": 11600 + }, + { + "epoch": 1.8931484502446982, + "grad_norm": 0.09983167052268982, + "learning_rate": 0.0009464926590538336, + "loss": 0.1379, + "num_input_tokens_seen": 25086144, + "step": 11605 + }, + { + "epoch": 1.8939641109298533, + "grad_norm": 0.19143344461917877, + "learning_rate": 0.0009469004893964112, + "loss": 0.1299, + "num_input_tokens_seen": 25096736, + "step": 11610 + }, + { + "epoch": 1.894779771615008, + "grad_norm": 0.06157934293150902, + "learning_rate": 0.0009473083197389885, + "loss": 0.0483, + "num_input_tokens_seen": 25107296, + "step": 11615 + }, + { + "epoch": 1.8955954323001631, + "grad_norm": 0.016641128808259964, + "learning_rate": 0.0009477161500815661, + "loss": 0.0616, + "num_input_tokens_seen": 25118784, + "step": 11620 + }, + { + "epoch": 1.8964110929853182, + "grad_norm": 0.049104683101177216, + "learning_rate": 0.0009481239804241436, + "loss": 0.2264, + "num_input_tokens_seen": 25128448, + "step": 11625 + }, + { + "epoch": 1.897226753670473, + "grad_norm": 0.29125604033470154, + "learning_rate": 0.000948531810766721, + "loss": 0.0607, + "num_input_tokens_seen": 25139040, + "step": 11630 + }, + { + "epoch": 1.898042414355628, + "grad_norm": 0.011295678094029427, + "learning_rate": 0.0009489396411092986, + "loss": 0.1261, + "num_input_tokens_seen": 25149312, + "step": 11635 + }, + { + "epoch": 1.898858075040783, + "grad_norm": 0.27017614245414734, + "learning_rate": 0.000949347471451876, + "loss": 0.1949, + "num_input_tokens_seen": 25160544, + "step": 11640 + }, + { + "epoch": 1.899673735725938, + "grad_norm": 0.09900322556495667, + "learning_rate": 0.0009497553017944536, + "loss": 0.1791, + "num_input_tokens_seen": 25171168, + "step": 11645 + }, + { + "epoch": 1.900489396411093, + "grad_norm": 0.015560412779450417, + "learning_rate": 0.000950163132137031, + "loss": 0.1213, + "num_input_tokens_seen": 25181920, + "step": 11650 + }, + { + "epoch": 1.901305057096248, + "grad_norm": 0.13649839162826538, + "learning_rate": 0.0009505709624796085, + "loss": 0.0536, + "num_input_tokens_seen": 25192704, + "step": 11655 + }, + { + "epoch": 1.9021207177814028, + "grad_norm": 0.04336768016219139, + "learning_rate": 0.000950978792822186, + "loss": 0.0182, + "num_input_tokens_seen": 25204544, + "step": 11660 + }, + { + "epoch": 1.902936378466558, + "grad_norm": 0.20540879666805267, + "learning_rate": 0.0009513866231647634, + "loss": 0.2157, + "num_input_tokens_seen": 25215488, + "step": 11665 + }, + { + "epoch": 1.9037520391517129, + "grad_norm": 0.31003397703170776, + "learning_rate": 0.000951794453507341, + "loss": 0.2263, + "num_input_tokens_seen": 25225824, + "step": 11670 + }, + { + "epoch": 1.904567699836868, + "grad_norm": 0.37362805008888245, + "learning_rate": 0.0009522022838499185, + "loss": 0.2027, + "num_input_tokens_seen": 25236096, + "step": 11675 + }, + { + "epoch": 1.905383360522023, + "grad_norm": 0.14676491916179657, + "learning_rate": 0.000952610114192496, + "loss": 0.219, + "num_input_tokens_seen": 25246656, + "step": 11680 + }, + { + "epoch": 1.9061990212071778, + "grad_norm": 0.2860686779022217, + "learning_rate": 0.0009530179445350734, + "loss": 0.2504, + "num_input_tokens_seen": 25258048, + "step": 11685 + }, + { + "epoch": 1.9070146818923328, + "grad_norm": 0.12542608380317688, + "learning_rate": 0.0009534257748776509, + "loss": 0.12, + "num_input_tokens_seen": 25269088, + "step": 11690 + }, + { + "epoch": 1.9078303425774878, + "grad_norm": 0.040885381400585175, + "learning_rate": 0.0009538336052202285, + "loss": 0.1084, + "num_input_tokens_seen": 25280544, + "step": 11695 + }, + { + "epoch": 1.9086460032626427, + "grad_norm": 0.053815603256225586, + "learning_rate": 0.0009542414355628059, + "loss": 0.1517, + "num_input_tokens_seen": 25288896, + "step": 11700 + }, + { + "epoch": 1.9094616639477977, + "grad_norm": 0.15390309691429138, + "learning_rate": 0.0009546492659053833, + "loss": 0.2707, + "num_input_tokens_seen": 25299200, + "step": 11705 + }, + { + "epoch": 1.9102773246329527, + "grad_norm": 0.046468086540699005, + "learning_rate": 0.0009550570962479609, + "loss": 0.0945, + "num_input_tokens_seen": 25310240, + "step": 11710 + }, + { + "epoch": 1.9110929853181076, + "grad_norm": 0.11353089660406113, + "learning_rate": 0.0009554649265905384, + "loss": 0.0926, + "num_input_tokens_seen": 25320992, + "step": 11715 + }, + { + "epoch": 1.9119086460032626, + "grad_norm": 0.04293264448642731, + "learning_rate": 0.0009558727569331158, + "loss": 0.0586, + "num_input_tokens_seen": 25333280, + "step": 11720 + }, + { + "epoch": 1.9127243066884176, + "grad_norm": 0.030097907409071922, + "learning_rate": 0.0009562805872756934, + "loss": 0.0627, + "num_input_tokens_seen": 25344608, + "step": 11725 + }, + { + "epoch": 1.9135399673735725, + "grad_norm": 0.10203356295824051, + "learning_rate": 0.0009566884176182708, + "loss": 0.0644, + "num_input_tokens_seen": 25355264, + "step": 11730 + }, + { + "epoch": 1.9143556280587277, + "grad_norm": 0.33191055059432983, + "learning_rate": 0.0009570962479608483, + "loss": 0.0924, + "num_input_tokens_seen": 25365600, + "step": 11735 + }, + { + "epoch": 1.9151712887438825, + "grad_norm": 0.33127424120903015, + "learning_rate": 0.0009575040783034258, + "loss": 0.2759, + "num_input_tokens_seen": 25376416, + "step": 11740 + }, + { + "epoch": 1.9159869494290374, + "grad_norm": 0.2595071792602539, + "learning_rate": 0.0009579119086460033, + "loss": 0.2841, + "num_input_tokens_seen": 25387168, + "step": 11745 + }, + { + "epoch": 1.9168026101141926, + "grad_norm": 0.03523773327469826, + "learning_rate": 0.0009583197389885808, + "loss": 0.1388, + "num_input_tokens_seen": 25399008, + "step": 11750 + }, + { + "epoch": 1.9176182707993474, + "grad_norm": 0.06833466142416, + "learning_rate": 0.0009587275693311582, + "loss": 0.0641, + "num_input_tokens_seen": 25409952, + "step": 11755 + }, + { + "epoch": 1.9184339314845025, + "grad_norm": 0.10483044385910034, + "learning_rate": 0.0009591353996737358, + "loss": 0.1451, + "num_input_tokens_seen": 25421984, + "step": 11760 + }, + { + "epoch": 1.9192495921696575, + "grad_norm": 0.04970962181687355, + "learning_rate": 0.0009595432300163133, + "loss": 0.1608, + "num_input_tokens_seen": 25432416, + "step": 11765 + }, + { + "epoch": 1.9200652528548123, + "grad_norm": 0.16871914267539978, + "learning_rate": 0.0009599510603588906, + "loss": 0.3064, + "num_input_tokens_seen": 25444192, + "step": 11770 + }, + { + "epoch": 1.9208809135399674, + "grad_norm": 0.03458542376756668, + "learning_rate": 0.0009603588907014682, + "loss": 0.0789, + "num_input_tokens_seen": 25455104, + "step": 11775 + }, + { + "epoch": 1.9216965742251224, + "grad_norm": 0.04260988160967827, + "learning_rate": 0.0009607667210440457, + "loss": 0.1354, + "num_input_tokens_seen": 25465856, + "step": 11780 + }, + { + "epoch": 1.9225122349102772, + "grad_norm": 0.05124415084719658, + "learning_rate": 0.0009611745513866232, + "loss": 0.1117, + "num_input_tokens_seen": 25476000, + "step": 11785 + }, + { + "epoch": 1.9233278955954323, + "grad_norm": 0.07688180357217789, + "learning_rate": 0.0009615823817292007, + "loss": 0.08, + "num_input_tokens_seen": 25488160, + "step": 11790 + }, + { + "epoch": 1.9241435562805873, + "grad_norm": 0.11089295148849487, + "learning_rate": 0.0009619902120717781, + "loss": 0.1051, + "num_input_tokens_seen": 25499328, + "step": 11795 + }, + { + "epoch": 1.9249592169657421, + "grad_norm": 0.04999072477221489, + "learning_rate": 0.0009623980424143557, + "loss": 0.1119, + "num_input_tokens_seen": 25510592, + "step": 11800 + }, + { + "epoch": 1.9257748776508974, + "grad_norm": 0.06794946640729904, + "learning_rate": 0.0009628058727569331, + "loss": 0.0912, + "num_input_tokens_seen": 25521184, + "step": 11805 + }, + { + "epoch": 1.9265905383360522, + "grad_norm": 0.21613682806491852, + "learning_rate": 0.0009632137030995107, + "loss": 0.0978, + "num_input_tokens_seen": 25532608, + "step": 11810 + }, + { + "epoch": 1.927406199021207, + "grad_norm": 0.034385792911052704, + "learning_rate": 0.0009636215334420881, + "loss": 0.1519, + "num_input_tokens_seen": 25543936, + "step": 11815 + }, + { + "epoch": 1.9282218597063623, + "grad_norm": 0.19377191364765167, + "learning_rate": 0.0009640293637846655, + "loss": 0.1283, + "num_input_tokens_seen": 25555104, + "step": 11820 + }, + { + "epoch": 1.929037520391517, + "grad_norm": 0.16980724036693573, + "learning_rate": 0.0009644371941272431, + "loss": 0.061, + "num_input_tokens_seen": 25565472, + "step": 11825 + }, + { + "epoch": 1.9298531810766721, + "grad_norm": 0.014253470115363598, + "learning_rate": 0.0009648450244698206, + "loss": 0.1105, + "num_input_tokens_seen": 25576832, + "step": 11830 + }, + { + "epoch": 1.9306688417618272, + "grad_norm": 0.03056260570883751, + "learning_rate": 0.0009652528548123982, + "loss": 0.0344, + "num_input_tokens_seen": 25588256, + "step": 11835 + }, + { + "epoch": 1.931484502446982, + "grad_norm": 0.2445419281721115, + "learning_rate": 0.0009656606851549755, + "loss": 0.274, + "num_input_tokens_seen": 25599072, + "step": 11840 + }, + { + "epoch": 1.932300163132137, + "grad_norm": 0.4470367729663849, + "learning_rate": 0.000966068515497553, + "loss": 0.1761, + "num_input_tokens_seen": 25609696, + "step": 11845 + }, + { + "epoch": 1.933115823817292, + "grad_norm": 0.03975436091423035, + "learning_rate": 0.0009664763458401306, + "loss": 0.0952, + "num_input_tokens_seen": 25621984, + "step": 11850 + }, + { + "epoch": 1.933931484502447, + "grad_norm": 0.035750702023506165, + "learning_rate": 0.000966884176182708, + "loss": 0.1447, + "num_input_tokens_seen": 25633024, + "step": 11855 + }, + { + "epoch": 1.934747145187602, + "grad_norm": 0.05805574357509613, + "learning_rate": 0.0009672920065252854, + "loss": 0.278, + "num_input_tokens_seen": 25644320, + "step": 11860 + }, + { + "epoch": 1.935562805872757, + "grad_norm": 0.246421679854393, + "learning_rate": 0.000967699836867863, + "loss": 0.1166, + "num_input_tokens_seen": 25655456, + "step": 11865 + }, + { + "epoch": 1.9363784665579118, + "grad_norm": 0.1580600142478943, + "learning_rate": 0.0009681076672104405, + "loss": 0.1623, + "num_input_tokens_seen": 25666592, + "step": 11870 + }, + { + "epoch": 1.9371941272430668, + "grad_norm": 0.14604364335536957, + "learning_rate": 0.000968515497553018, + "loss": 0.1526, + "num_input_tokens_seen": 25676800, + "step": 11875 + }, + { + "epoch": 1.9380097879282219, + "grad_norm": 0.10736247897148132, + "learning_rate": 0.0009689233278955954, + "loss": 0.0648, + "num_input_tokens_seen": 25686560, + "step": 11880 + }, + { + "epoch": 1.9388254486133767, + "grad_norm": 0.15189337730407715, + "learning_rate": 0.0009693311582381729, + "loss": 0.1006, + "num_input_tokens_seen": 25697120, + "step": 11885 + }, + { + "epoch": 1.939641109298532, + "grad_norm": 0.06999967247247696, + "learning_rate": 0.0009697389885807504, + "loss": 0.0706, + "num_input_tokens_seen": 25708736, + "step": 11890 + }, + { + "epoch": 1.9404567699836868, + "grad_norm": 0.06257314234972, + "learning_rate": 0.0009701468189233279, + "loss": 0.1763, + "num_input_tokens_seen": 25720064, + "step": 11895 + }, + { + "epoch": 1.9412724306688418, + "grad_norm": 0.23246027529239655, + "learning_rate": 0.0009705546492659055, + "loss": 0.2333, + "num_input_tokens_seen": 25731616, + "step": 11900 + }, + { + "epoch": 1.9420880913539968, + "grad_norm": 0.2732833921909332, + "learning_rate": 0.0009709624796084829, + "loss": 0.1856, + "num_input_tokens_seen": 25743072, + "step": 11905 + }, + { + "epoch": 1.9429037520391517, + "grad_norm": 0.054060909897089005, + "learning_rate": 0.0009713703099510603, + "loss": 0.1246, + "num_input_tokens_seen": 25753920, + "step": 11910 + }, + { + "epoch": 1.9437194127243067, + "grad_norm": 0.10430661588907242, + "learning_rate": 0.0009717781402936379, + "loss": 0.087, + "num_input_tokens_seen": 25764640, + "step": 11915 + }, + { + "epoch": 1.9445350734094617, + "grad_norm": 0.030959485098719597, + "learning_rate": 0.0009721859706362154, + "loss": 0.0374, + "num_input_tokens_seen": 25777152, + "step": 11920 + }, + { + "epoch": 1.9453507340946166, + "grad_norm": 0.03141267970204353, + "learning_rate": 0.0009725938009787928, + "loss": 0.0367, + "num_input_tokens_seen": 25787040, + "step": 11925 + }, + { + "epoch": 1.9461663947797716, + "grad_norm": 0.1135735735297203, + "learning_rate": 0.0009730016313213703, + "loss": 0.1172, + "num_input_tokens_seen": 25796544, + "step": 11930 + }, + { + "epoch": 1.9469820554649266, + "grad_norm": 0.1543119102716446, + "learning_rate": 0.0009734094616639478, + "loss": 0.1302, + "num_input_tokens_seen": 25807392, + "step": 11935 + }, + { + "epoch": 1.9477977161500815, + "grad_norm": 0.008437680080533028, + "learning_rate": 0.0009738172920065254, + "loss": 0.06, + "num_input_tokens_seen": 25817952, + "step": 11940 + }, + { + "epoch": 1.9486133768352365, + "grad_norm": 0.0668170228600502, + "learning_rate": 0.0009742251223491027, + "loss": 0.0682, + "num_input_tokens_seen": 25829568, + "step": 11945 + }, + { + "epoch": 1.9494290375203915, + "grad_norm": 0.10054466128349304, + "learning_rate": 0.0009746329526916803, + "loss": 0.0361, + "num_input_tokens_seen": 25840160, + "step": 11950 + }, + { + "epoch": 1.9502446982055464, + "grad_norm": 0.16265834867954254, + "learning_rate": 0.0009750407830342578, + "loss": 0.2625, + "num_input_tokens_seen": 25850816, + "step": 11955 + }, + { + "epoch": 1.9510603588907016, + "grad_norm": 0.32785764336586, + "learning_rate": 0.0009754486133768352, + "loss": 0.3579, + "num_input_tokens_seen": 25862400, + "step": 11960 + }, + { + "epoch": 1.9518760195758564, + "grad_norm": 0.06142808124423027, + "learning_rate": 0.0009758564437194128, + "loss": 0.203, + "num_input_tokens_seen": 25872096, + "step": 11965 + }, + { + "epoch": 1.9526916802610113, + "grad_norm": 0.04707051441073418, + "learning_rate": 0.0009762642740619902, + "loss": 0.1397, + "num_input_tokens_seen": 25884000, + "step": 11970 + }, + { + "epoch": 1.9535073409461665, + "grad_norm": 0.03933320939540863, + "learning_rate": 0.0009766721044045677, + "loss": 0.1543, + "num_input_tokens_seen": 25895776, + "step": 11975 + }, + { + "epoch": 1.9543230016313213, + "grad_norm": 0.2816435992717743, + "learning_rate": 0.0009770799347471452, + "loss": 0.2767, + "num_input_tokens_seen": 25906624, + "step": 11980 + }, + { + "epoch": 1.9551386623164764, + "grad_norm": 0.2596624791622162, + "learning_rate": 0.0009774877650897227, + "loss": 0.1901, + "num_input_tokens_seen": 25918592, + "step": 11985 + }, + { + "epoch": 1.9559543230016314, + "grad_norm": 0.12683138251304626, + "learning_rate": 0.0009778955954323001, + "loss": 0.1409, + "num_input_tokens_seen": 25929056, + "step": 11990 + }, + { + "epoch": 1.9567699836867862, + "grad_norm": 0.106838159263134, + "learning_rate": 0.0009783034257748776, + "loss": 0.1512, + "num_input_tokens_seen": 25938912, + "step": 11995 + }, + { + "epoch": 1.9575856443719413, + "grad_norm": 0.17070820927619934, + "learning_rate": 0.000978711256117455, + "loss": 0.1901, + "num_input_tokens_seen": 25948192, + "step": 12000 + }, + { + "epoch": 1.9584013050570963, + "grad_norm": 0.19078631699085236, + "learning_rate": 0.0009791190864600326, + "loss": 0.224, + "num_input_tokens_seen": 25959552, + "step": 12005 + }, + { + "epoch": 1.9592169657422511, + "grad_norm": 0.04365871846675873, + "learning_rate": 0.00097952691680261, + "loss": 0.2889, + "num_input_tokens_seen": 25971232, + "step": 12010 + }, + { + "epoch": 1.9600326264274062, + "grad_norm": 0.06310081481933594, + "learning_rate": 0.0009799347471451875, + "loss": 0.1247, + "num_input_tokens_seen": 25982080, + "step": 12015 + }, + { + "epoch": 1.9608482871125612, + "grad_norm": 0.09681393951177597, + "learning_rate": 0.0009803425774877652, + "loss": 0.1131, + "num_input_tokens_seen": 25993824, + "step": 12020 + }, + { + "epoch": 1.961663947797716, + "grad_norm": 0.06323209404945374, + "learning_rate": 0.0009807504078303427, + "loss": 0.0543, + "num_input_tokens_seen": 26004512, + "step": 12025 + }, + { + "epoch": 1.9624796084828713, + "grad_norm": 0.10774081945419312, + "learning_rate": 0.00098115823817292, + "loss": 0.2355, + "num_input_tokens_seen": 26016064, + "step": 12030 + }, + { + "epoch": 1.963295269168026, + "grad_norm": 0.0556272454559803, + "learning_rate": 0.0009815660685154977, + "loss": 0.1216, + "num_input_tokens_seen": 26027392, + "step": 12035 + }, + { + "epoch": 1.964110929853181, + "grad_norm": 0.007031048182398081, + "learning_rate": 0.0009819738988580751, + "loss": 0.0434, + "num_input_tokens_seen": 26038112, + "step": 12040 + }, + { + "epoch": 1.9649265905383362, + "grad_norm": 0.24994535744190216, + "learning_rate": 0.0009823817292006526, + "loss": 0.1626, + "num_input_tokens_seen": 26049792, + "step": 12045 + }, + { + "epoch": 1.965742251223491, + "grad_norm": 0.007116135209798813, + "learning_rate": 0.00098278955954323, + "loss": 0.1201, + "num_input_tokens_seen": 26060352, + "step": 12050 + }, + { + "epoch": 1.966557911908646, + "grad_norm": 0.13843262195587158, + "learning_rate": 0.0009831973898858076, + "loss": 0.1493, + "num_input_tokens_seen": 26071520, + "step": 12055 + }, + { + "epoch": 1.967373572593801, + "grad_norm": 0.047937799245119095, + "learning_rate": 0.000983605220228385, + "loss": 0.0618, + "num_input_tokens_seen": 26082976, + "step": 12060 + }, + { + "epoch": 1.968189233278956, + "grad_norm": 0.21617697179317474, + "learning_rate": 0.0009840130505709625, + "loss": 0.1505, + "num_input_tokens_seen": 26092960, + "step": 12065 + }, + { + "epoch": 1.969004893964111, + "grad_norm": 0.1679522842168808, + "learning_rate": 0.00098442088091354, + "loss": 0.1382, + "num_input_tokens_seen": 26103776, + "step": 12070 + }, + { + "epoch": 1.969820554649266, + "grad_norm": 0.09036083519458771, + "learning_rate": 0.0009848287112561175, + "loss": 0.0951, + "num_input_tokens_seen": 26115040, + "step": 12075 + }, + { + "epoch": 1.9706362153344208, + "grad_norm": 0.03721405193209648, + "learning_rate": 0.000985236541598695, + "loss": 0.2005, + "num_input_tokens_seen": 26125952, + "step": 12080 + }, + { + "epoch": 1.9714518760195758, + "grad_norm": 0.040085360407829285, + "learning_rate": 0.0009856443719412724, + "loss": 0.1522, + "num_input_tokens_seen": 26136512, + "step": 12085 + }, + { + "epoch": 1.9722675367047309, + "grad_norm": 0.16589096188545227, + "learning_rate": 0.00098605220228385, + "loss": 0.2404, + "num_input_tokens_seen": 26146944, + "step": 12090 + }, + { + "epoch": 1.9730831973898857, + "grad_norm": 0.11659594625234604, + "learning_rate": 0.0009864600326264274, + "loss": 0.0901, + "num_input_tokens_seen": 26157920, + "step": 12095 + }, + { + "epoch": 1.9738988580750407, + "grad_norm": 0.16950438916683197, + "learning_rate": 0.0009868678629690048, + "loss": 0.0963, + "num_input_tokens_seen": 26168864, + "step": 12100 + }, + { + "epoch": 1.9747145187601958, + "grad_norm": 0.09831640124320984, + "learning_rate": 0.0009872756933115823, + "loss": 0.0675, + "num_input_tokens_seen": 26179968, + "step": 12105 + }, + { + "epoch": 1.9755301794453506, + "grad_norm": 0.21385249495506287, + "learning_rate": 0.00098768352365416, + "loss": 0.1444, + "num_input_tokens_seen": 26192096, + "step": 12110 + }, + { + "epoch": 1.9763458401305058, + "grad_norm": 0.0602976493537426, + "learning_rate": 0.0009880913539967373, + "loss": 0.1203, + "num_input_tokens_seen": 26202400, + "step": 12115 + }, + { + "epoch": 1.9771615008156607, + "grad_norm": 0.2599957287311554, + "learning_rate": 0.0009884991843393148, + "loss": 0.0832, + "num_input_tokens_seen": 26212608, + "step": 12120 + }, + { + "epoch": 1.9779771615008157, + "grad_norm": 0.04638943821191788, + "learning_rate": 0.0009889070146818924, + "loss": 0.1196, + "num_input_tokens_seen": 26222912, + "step": 12125 + }, + { + "epoch": 1.9787928221859707, + "grad_norm": 0.4077025353908539, + "learning_rate": 0.00098931484502447, + "loss": 0.2744, + "num_input_tokens_seen": 26233856, + "step": 12130 + }, + { + "epoch": 1.9796084828711256, + "grad_norm": 0.18510301411151886, + "learning_rate": 0.0009897226753670474, + "loss": 0.1087, + "num_input_tokens_seen": 26242976, + "step": 12135 + }, + { + "epoch": 1.9804241435562806, + "grad_norm": 0.04004143178462982, + "learning_rate": 0.0009901305057096249, + "loss": 0.2139, + "num_input_tokens_seen": 26253600, + "step": 12140 + }, + { + "epoch": 1.9812398042414356, + "grad_norm": 0.16456164419651031, + "learning_rate": 0.0009905383360522024, + "loss": 0.1437, + "num_input_tokens_seen": 26263296, + "step": 12145 + }, + { + "epoch": 1.9820554649265905, + "grad_norm": 0.05468539148569107, + "learning_rate": 0.0009909461663947798, + "loss": 0.1029, + "num_input_tokens_seen": 26274656, + "step": 12150 + }, + { + "epoch": 1.9828711256117455, + "grad_norm": 0.11041852831840515, + "learning_rate": 0.0009913539967373573, + "loss": 0.0795, + "num_input_tokens_seen": 26286368, + "step": 12155 + }, + { + "epoch": 1.9836867862969005, + "grad_norm": 0.1570504903793335, + "learning_rate": 0.0009917618270799348, + "loss": 0.0905, + "num_input_tokens_seen": 26297792, + "step": 12160 + }, + { + "epoch": 1.9845024469820554, + "grad_norm": 0.02780609205365181, + "learning_rate": 0.0009921696574225123, + "loss": 0.0639, + "num_input_tokens_seen": 26309024, + "step": 12165 + }, + { + "epoch": 1.9853181076672104, + "grad_norm": 0.0053548384457826614, + "learning_rate": 0.0009925774877650897, + "loss": 0.1691, + "num_input_tokens_seen": 26319360, + "step": 12170 + }, + { + "epoch": 1.9861337683523654, + "grad_norm": 0.20225311815738678, + "learning_rate": 0.0009929853181076672, + "loss": 0.1344, + "num_input_tokens_seen": 26328256, + "step": 12175 + }, + { + "epoch": 1.9869494290375203, + "grad_norm": 0.02028091996908188, + "learning_rate": 0.0009933931484502447, + "loss": 0.0445, + "num_input_tokens_seen": 26339648, + "step": 12180 + }, + { + "epoch": 1.9877650897226755, + "grad_norm": 0.475198894739151, + "learning_rate": 0.0009938009787928222, + "loss": 0.1998, + "num_input_tokens_seen": 26350528, + "step": 12185 + }, + { + "epoch": 1.9885807504078303, + "grad_norm": 0.10536182671785355, + "learning_rate": 0.0009942088091353996, + "loss": 0.1896, + "num_input_tokens_seen": 26361568, + "step": 12190 + }, + { + "epoch": 1.9893964110929854, + "grad_norm": 0.06540945172309875, + "learning_rate": 0.0009946166394779771, + "loss": 0.1007, + "num_input_tokens_seen": 26372832, + "step": 12195 + }, + { + "epoch": 1.9902120717781404, + "grad_norm": 0.010280012153089046, + "learning_rate": 0.0009950244698205548, + "loss": 0.0371, + "num_input_tokens_seen": 26384000, + "step": 12200 + }, + { + "epoch": 1.9910277324632952, + "grad_norm": 0.2649214267730713, + "learning_rate": 0.000995432300163132, + "loss": 0.0944, + "num_input_tokens_seen": 26395008, + "step": 12205 + }, + { + "epoch": 1.9918433931484503, + "grad_norm": 0.05911831930279732, + "learning_rate": 0.0009958401305057095, + "loss": 0.0362, + "num_input_tokens_seen": 26406304, + "step": 12210 + }, + { + "epoch": 1.9926590538336053, + "grad_norm": 0.2364213466644287, + "learning_rate": 0.0009962479608482872, + "loss": 0.164, + "num_input_tokens_seen": 26416576, + "step": 12215 + }, + { + "epoch": 1.9934747145187601, + "grad_norm": 0.07283175736665726, + "learning_rate": 0.0009966557911908645, + "loss": 0.0485, + "num_input_tokens_seen": 26426784, + "step": 12220 + }, + { + "epoch": 1.9942903752039152, + "grad_norm": 0.1471281498670578, + "learning_rate": 0.0009970636215334422, + "loss": 0.0946, + "num_input_tokens_seen": 26436512, + "step": 12225 + }, + { + "epoch": 1.9951060358890702, + "grad_norm": 0.0576576367020607, + "learning_rate": 0.0009974714518760197, + "loss": 0.0836, + "num_input_tokens_seen": 26446880, + "step": 12230 + }, + { + "epoch": 1.995921696574225, + "grad_norm": 0.016541773453354836, + "learning_rate": 0.0009978792822185971, + "loss": 0.1503, + "num_input_tokens_seen": 26457984, + "step": 12235 + }, + { + "epoch": 1.99673735725938, + "grad_norm": 0.26312413811683655, + "learning_rate": 0.0009982871125611746, + "loss": 0.1184, + "num_input_tokens_seen": 26469440, + "step": 12240 + }, + { + "epoch": 1.997553017944535, + "grad_norm": 0.027293941006064415, + "learning_rate": 0.000998694942903752, + "loss": 0.0629, + "num_input_tokens_seen": 26479648, + "step": 12245 + }, + { + "epoch": 1.99836867862969, + "grad_norm": 0.1137554869055748, + "learning_rate": 0.0009991027732463296, + "loss": 0.1678, + "num_input_tokens_seen": 26489920, + "step": 12250 + }, + { + "epoch": 1.9991843393148452, + "grad_norm": 0.05962604284286499, + "learning_rate": 0.000999510603588907, + "loss": 0.0611, + "num_input_tokens_seen": 26501248, + "step": 12255 + }, + { + "epoch": 2.0, + "grad_norm": 0.03831448405981064, + "learning_rate": 0.0009999184339314845, + "loss": 0.0262, + "num_input_tokens_seen": 26510112, + "step": 12260 + }, + { + "epoch": 2.0, + "eval_loss": 0.13076965510845184, + "eval_runtime": 103.3641, + "eval_samples_per_second": 26.363, + "eval_steps_per_second": 6.598, + "num_input_tokens_seen": 26510112, + "step": 12260 + }, + { + "epoch": 2.000815660685155, + "grad_norm": 0.02669798582792282, + "learning_rate": 0.000999999996757397, + "loss": 0.0441, + "num_input_tokens_seen": 26521088, + "step": 12265 + }, + { + "epoch": 2.00163132137031, + "grad_norm": 0.09440121054649353, + "learning_rate": 0.0009999999835843226, + "loss": 0.0782, + "num_input_tokens_seen": 26530976, + "step": 12270 + }, + { + "epoch": 2.002446982055465, + "grad_norm": 0.013201319612562656, + "learning_rate": 0.000999999960278114, + "loss": 0.0368, + "num_input_tokens_seen": 26541536, + "step": 12275 + }, + { + "epoch": 2.0032626427406197, + "grad_norm": 0.3132156729698181, + "learning_rate": 0.000999999926838772, + "loss": 0.24, + "num_input_tokens_seen": 26551776, + "step": 12280 + }, + { + "epoch": 2.004078303425775, + "grad_norm": 0.3435702621936798, + "learning_rate": 0.0009999998832662972, + "loss": 0.2266, + "num_input_tokens_seen": 26562528, + "step": 12285 + }, + { + "epoch": 2.00489396411093, + "grad_norm": 0.07973910868167877, + "learning_rate": 0.0009999998295606907, + "loss": 0.1712, + "num_input_tokens_seen": 26572480, + "step": 12290 + }, + { + "epoch": 2.0057096247960846, + "grad_norm": 0.09572141617536545, + "learning_rate": 0.000999999765721953, + "loss": 0.0815, + "num_input_tokens_seen": 26582400, + "step": 12295 + }, + { + "epoch": 2.00652528548124, + "grad_norm": 0.10497356951236725, + "learning_rate": 0.000999999691750086, + "loss": 0.0754, + "num_input_tokens_seen": 26593248, + "step": 12300 + }, + { + "epoch": 2.0073409461663947, + "grad_norm": 0.021210532635450363, + "learning_rate": 0.0009999996076450908, + "loss": 0.1709, + "num_input_tokens_seen": 26603616, + "step": 12305 + }, + { + "epoch": 2.00815660685155, + "grad_norm": 0.08911927789449692, + "learning_rate": 0.0009999995134069692, + "loss": 0.0257, + "num_input_tokens_seen": 26614560, + "step": 12310 + }, + { + "epoch": 2.0089722675367048, + "grad_norm": 0.043773408979177475, + "learning_rate": 0.0009999994090357234, + "loss": 0.281, + "num_input_tokens_seen": 26624672, + "step": 12315 + }, + { + "epoch": 2.0097879282218596, + "grad_norm": 0.12832224369049072, + "learning_rate": 0.0009999992945313551, + "loss": 0.1412, + "num_input_tokens_seen": 26635552, + "step": 12320 + }, + { + "epoch": 2.010603588907015, + "grad_norm": 0.078923799097538, + "learning_rate": 0.0009999991698938669, + "loss": 0.1098, + "num_input_tokens_seen": 26645984, + "step": 12325 + }, + { + "epoch": 2.0114192495921697, + "grad_norm": 0.13948754966259003, + "learning_rate": 0.000999999035123261, + "loss": 0.2246, + "num_input_tokens_seen": 26656192, + "step": 12330 + }, + { + "epoch": 2.0122349102773245, + "grad_norm": 0.06741499155759811, + "learning_rate": 0.0009999988902195407, + "loss": 0.0859, + "num_input_tokens_seen": 26667200, + "step": 12335 + }, + { + "epoch": 2.0130505709624797, + "grad_norm": 0.21579128503799438, + "learning_rate": 0.0009999987351827085, + "loss": 0.2156, + "num_input_tokens_seen": 26678080, + "step": 12340 + }, + { + "epoch": 2.0138662316476346, + "grad_norm": 0.045807160437107086, + "learning_rate": 0.0009999985700127674, + "loss": 0.0783, + "num_input_tokens_seen": 26690272, + "step": 12345 + }, + { + "epoch": 2.0146818923327894, + "grad_norm": 0.06333373486995697, + "learning_rate": 0.0009999983947097213, + "loss": 0.104, + "num_input_tokens_seen": 26701408, + "step": 12350 + }, + { + "epoch": 2.0154975530179446, + "grad_norm": 0.16630201041698456, + "learning_rate": 0.0009999982092735733, + "loss": 0.1589, + "num_input_tokens_seen": 26711680, + "step": 12355 + }, + { + "epoch": 2.0163132137030995, + "grad_norm": 0.1822613775730133, + "learning_rate": 0.0009999980137043274, + "loss": 0.1364, + "num_input_tokens_seen": 26722336, + "step": 12360 + }, + { + "epoch": 2.0171288743882543, + "grad_norm": 0.073283351957798, + "learning_rate": 0.0009999978080019872, + "loss": 0.1224, + "num_input_tokens_seen": 26732832, + "step": 12365 + }, + { + "epoch": 2.0179445350734095, + "grad_norm": 0.37038156390190125, + "learning_rate": 0.0009999975921665574, + "loss": 0.2476, + "num_input_tokens_seen": 26744608, + "step": 12370 + }, + { + "epoch": 2.0187601957585644, + "grad_norm": 0.020198166370391846, + "learning_rate": 0.000999997366198042, + "loss": 0.0229, + "num_input_tokens_seen": 26755776, + "step": 12375 + }, + { + "epoch": 2.0195758564437196, + "grad_norm": 0.0995674729347229, + "learning_rate": 0.0009999971300964456, + "loss": 0.1549, + "num_input_tokens_seen": 26765472, + "step": 12380 + }, + { + "epoch": 2.0203915171288744, + "grad_norm": 0.0052305120043456554, + "learning_rate": 0.0009999968838617732, + "loss": 0.0756, + "num_input_tokens_seen": 26776224, + "step": 12385 + }, + { + "epoch": 2.0212071778140293, + "grad_norm": 0.12994590401649475, + "learning_rate": 0.0009999966274940296, + "loss": 0.2399, + "num_input_tokens_seen": 26787520, + "step": 12390 + }, + { + "epoch": 2.0220228384991845, + "grad_norm": 0.019414570182561874, + "learning_rate": 0.00099999636099322, + "loss": 0.0252, + "num_input_tokens_seen": 26798048, + "step": 12395 + }, + { + "epoch": 2.0228384991843393, + "grad_norm": 0.2989323139190674, + "learning_rate": 0.0009999960843593498, + "loss": 0.2631, + "num_input_tokens_seen": 26808064, + "step": 12400 + }, + { + "epoch": 2.023654159869494, + "grad_norm": 0.03400423377752304, + "learning_rate": 0.0009999957975924249, + "loss": 0.0305, + "num_input_tokens_seen": 26818304, + "step": 12405 + }, + { + "epoch": 2.0244698205546494, + "grad_norm": 0.03117789700627327, + "learning_rate": 0.0009999955006924507, + "loss": 0.0789, + "num_input_tokens_seen": 26829632, + "step": 12410 + }, + { + "epoch": 2.0252854812398042, + "grad_norm": 0.01920362561941147, + "learning_rate": 0.0009999951936594334, + "loss": 0.0514, + "num_input_tokens_seen": 26839264, + "step": 12415 + }, + { + "epoch": 2.026101141924959, + "grad_norm": 0.1514592468738556, + "learning_rate": 0.0009999948764933793, + "loss": 0.1578, + "num_input_tokens_seen": 26850688, + "step": 12420 + }, + { + "epoch": 2.0269168026101143, + "grad_norm": 0.023600779473781586, + "learning_rate": 0.0009999945491942946, + "loss": 0.0543, + "num_input_tokens_seen": 26861376, + "step": 12425 + }, + { + "epoch": 2.027732463295269, + "grad_norm": 0.14419586956501007, + "learning_rate": 0.0009999942117621863, + "loss": 0.0864, + "num_input_tokens_seen": 26872288, + "step": 12430 + }, + { + "epoch": 2.028548123980424, + "grad_norm": 0.04900144413113594, + "learning_rate": 0.0009999938641970607, + "loss": 0.0924, + "num_input_tokens_seen": 26883360, + "step": 12435 + }, + { + "epoch": 2.029363784665579, + "grad_norm": 0.10072429478168488, + "learning_rate": 0.0009999935064989255, + "loss": 0.1094, + "num_input_tokens_seen": 26894816, + "step": 12440 + }, + { + "epoch": 2.030179445350734, + "grad_norm": 0.024320529773831367, + "learning_rate": 0.0009999931386677873, + "loss": 0.0679, + "num_input_tokens_seen": 26905344, + "step": 12445 + }, + { + "epoch": 2.0309951060358893, + "grad_norm": 0.15190035104751587, + "learning_rate": 0.000999992760703654, + "loss": 0.1935, + "num_input_tokens_seen": 26916960, + "step": 12450 + }, + { + "epoch": 2.031810766721044, + "grad_norm": 0.1964467167854309, + "learning_rate": 0.000999992372606533, + "loss": 0.1, + "num_input_tokens_seen": 26926304, + "step": 12455 + }, + { + "epoch": 2.032626427406199, + "grad_norm": 0.12315433472394943, + "learning_rate": 0.0009999919743764324, + "loss": 0.2653, + "num_input_tokens_seen": 26935776, + "step": 12460 + }, + { + "epoch": 2.033442088091354, + "grad_norm": 0.18422943353652954, + "learning_rate": 0.00099999156601336, + "loss": 0.0925, + "num_input_tokens_seen": 26945792, + "step": 12465 + }, + { + "epoch": 2.034257748776509, + "grad_norm": 0.296763151884079, + "learning_rate": 0.0009999911475173245, + "loss": 0.15, + "num_input_tokens_seen": 26958336, + "step": 12470 + }, + { + "epoch": 2.035073409461664, + "grad_norm": 0.16706916689872742, + "learning_rate": 0.000999990718888334, + "loss": 0.1491, + "num_input_tokens_seen": 26969184, + "step": 12475 + }, + { + "epoch": 2.035889070146819, + "grad_norm": 0.08804943412542343, + "learning_rate": 0.0009999902801263974, + "loss": 0.1627, + "num_input_tokens_seen": 26979328, + "step": 12480 + }, + { + "epoch": 2.036704730831974, + "grad_norm": 0.08816880732774734, + "learning_rate": 0.0009999898312315232, + "loss": 0.0732, + "num_input_tokens_seen": 26989920, + "step": 12485 + }, + { + "epoch": 2.0375203915171287, + "grad_norm": 0.021312927827239037, + "learning_rate": 0.000999989372203721, + "loss": 0.1219, + "num_input_tokens_seen": 27000544, + "step": 12490 + }, + { + "epoch": 2.038336052202284, + "grad_norm": 0.0042571392841637135, + "learning_rate": 0.0009999889030429998, + "loss": 0.0286, + "num_input_tokens_seen": 27011360, + "step": 12495 + }, + { + "epoch": 2.039151712887439, + "grad_norm": 0.09083344042301178, + "learning_rate": 0.0009999884237493692, + "loss": 0.0313, + "num_input_tokens_seen": 27021312, + "step": 12500 + }, + { + "epoch": 2.0399673735725936, + "grad_norm": 0.028655050322413445, + "learning_rate": 0.000999987934322839, + "loss": 0.0255, + "num_input_tokens_seen": 27032640, + "step": 12505 + }, + { + "epoch": 2.040783034257749, + "grad_norm": 0.047613725066185, + "learning_rate": 0.000999987434763419, + "loss": 0.1954, + "num_input_tokens_seen": 27043648, + "step": 12510 + }, + { + "epoch": 2.0415986949429037, + "grad_norm": 0.3595113158226013, + "learning_rate": 0.0009999869250711193, + "loss": 0.1717, + "num_input_tokens_seen": 27055232, + "step": 12515 + }, + { + "epoch": 2.0424143556280585, + "grad_norm": 0.05471295118331909, + "learning_rate": 0.0009999864052459503, + "loss": 0.2655, + "num_input_tokens_seen": 27066464, + "step": 12520 + }, + { + "epoch": 2.0432300163132138, + "grad_norm": 0.14621756970882416, + "learning_rate": 0.0009999858752879228, + "loss": 0.1095, + "num_input_tokens_seen": 27077888, + "step": 12525 + }, + { + "epoch": 2.0440456769983686, + "grad_norm": 0.09596377611160278, + "learning_rate": 0.0009999853351970469, + "loss": 0.0755, + "num_input_tokens_seen": 27088064, + "step": 12530 + }, + { + "epoch": 2.044861337683524, + "grad_norm": 0.05375010520219803, + "learning_rate": 0.000999984784973334, + "loss": 0.2147, + "num_input_tokens_seen": 27099296, + "step": 12535 + }, + { + "epoch": 2.0456769983686787, + "grad_norm": 0.0923907533288002, + "learning_rate": 0.0009999842246167952, + "loss": 0.0616, + "num_input_tokens_seen": 27110176, + "step": 12540 + }, + { + "epoch": 2.0464926590538335, + "grad_norm": 0.019899077713489532, + "learning_rate": 0.0009999836541274417, + "loss": 0.0729, + "num_input_tokens_seen": 27120768, + "step": 12545 + }, + { + "epoch": 2.0473083197389887, + "grad_norm": 0.13260437548160553, + "learning_rate": 0.0009999830735052853, + "loss": 0.0526, + "num_input_tokens_seen": 27131904, + "step": 12550 + }, + { + "epoch": 2.0481239804241436, + "grad_norm": 0.009612596593797207, + "learning_rate": 0.0009999824827503377, + "loss": 0.0405, + "num_input_tokens_seen": 27142976, + "step": 12555 + }, + { + "epoch": 2.0489396411092984, + "grad_norm": 0.010959037579596043, + "learning_rate": 0.0009999818818626105, + "loss": 0.0632, + "num_input_tokens_seen": 27154656, + "step": 12560 + }, + { + "epoch": 2.0497553017944536, + "grad_norm": 0.03808634728193283, + "learning_rate": 0.0009999812708421166, + "loss": 0.06, + "num_input_tokens_seen": 27165344, + "step": 12565 + }, + { + "epoch": 2.0505709624796085, + "grad_norm": 0.006856338586658239, + "learning_rate": 0.0009999806496888677, + "loss": 0.0447, + "num_input_tokens_seen": 27175104, + "step": 12570 + }, + { + "epoch": 2.0513866231647633, + "grad_norm": 0.10554108023643494, + "learning_rate": 0.0009999800184028766, + "loss": 0.0212, + "num_input_tokens_seen": 27186528, + "step": 12575 + }, + { + "epoch": 2.0522022838499185, + "grad_norm": 0.0567990280687809, + "learning_rate": 0.0009999793769841564, + "loss": 0.165, + "num_input_tokens_seen": 27197664, + "step": 12580 + }, + { + "epoch": 2.0530179445350734, + "grad_norm": 0.0033737735357135534, + "learning_rate": 0.0009999787254327196, + "loss": 0.3187, + "num_input_tokens_seen": 27209440, + "step": 12585 + }, + { + "epoch": 2.053833605220228, + "grad_norm": 0.2872142791748047, + "learning_rate": 0.00099997806374858, + "loss": 0.229, + "num_input_tokens_seen": 27222464, + "step": 12590 + }, + { + "epoch": 2.0546492659053834, + "grad_norm": 0.1510654091835022, + "learning_rate": 0.0009999773919317505, + "loss": 0.1552, + "num_input_tokens_seen": 27233664, + "step": 12595 + }, + { + "epoch": 2.0554649265905383, + "grad_norm": 0.1906013935804367, + "learning_rate": 0.000999976709982245, + "loss": 0.2621, + "num_input_tokens_seen": 27245632, + "step": 12600 + }, + { + "epoch": 2.0562805872756935, + "grad_norm": 0.10470928251743317, + "learning_rate": 0.000999976017900077, + "loss": 0.0961, + "num_input_tokens_seen": 27255872, + "step": 12605 + }, + { + "epoch": 2.0570962479608483, + "grad_norm": 0.08966370671987534, + "learning_rate": 0.0009999753156852609, + "loss": 0.0501, + "num_input_tokens_seen": 27266368, + "step": 12610 + }, + { + "epoch": 2.057911908646003, + "grad_norm": 0.12156267464160919, + "learning_rate": 0.0009999746033378105, + "loss": 0.0726, + "num_input_tokens_seen": 27276960, + "step": 12615 + }, + { + "epoch": 2.0587275693311584, + "grad_norm": 0.2356519103050232, + "learning_rate": 0.0009999738808577408, + "loss": 0.1518, + "num_input_tokens_seen": 27288416, + "step": 12620 + }, + { + "epoch": 2.0595432300163132, + "grad_norm": 0.12507210671901703, + "learning_rate": 0.000999973148245066, + "loss": 0.0652, + "num_input_tokens_seen": 27300032, + "step": 12625 + }, + { + "epoch": 2.060358890701468, + "grad_norm": 0.20876716077327728, + "learning_rate": 0.000999972405499801, + "loss": 0.1517, + "num_input_tokens_seen": 27310688, + "step": 12630 + }, + { + "epoch": 2.0611745513866233, + "grad_norm": 0.09181343019008636, + "learning_rate": 0.0009999716526219611, + "loss": 0.0955, + "num_input_tokens_seen": 27323424, + "step": 12635 + }, + { + "epoch": 2.061990212071778, + "grad_norm": 0.134469673037529, + "learning_rate": 0.0009999708896115613, + "loss": 0.1265, + "num_input_tokens_seen": 27333024, + "step": 12640 + }, + { + "epoch": 2.062805872756933, + "grad_norm": 0.06973686814308167, + "learning_rate": 0.0009999701164686173, + "loss": 0.0691, + "num_input_tokens_seen": 27344512, + "step": 12645 + }, + { + "epoch": 2.063621533442088, + "grad_norm": 0.27596315741539, + "learning_rate": 0.0009999693331931446, + "loss": 0.0942, + "num_input_tokens_seen": 27355328, + "step": 12650 + }, + { + "epoch": 2.064437194127243, + "grad_norm": 0.029866395518183708, + "learning_rate": 0.000999968539785159, + "loss": 0.189, + "num_input_tokens_seen": 27366688, + "step": 12655 + }, + { + "epoch": 2.065252854812398, + "grad_norm": 0.07480645924806595, + "learning_rate": 0.0009999677362446768, + "loss": 0.1079, + "num_input_tokens_seen": 27377376, + "step": 12660 + }, + { + "epoch": 2.066068515497553, + "grad_norm": 0.11446642875671387, + "learning_rate": 0.000999966922571714, + "loss": 0.1362, + "num_input_tokens_seen": 27387328, + "step": 12665 + }, + { + "epoch": 2.066884176182708, + "grad_norm": 0.20092760026454926, + "learning_rate": 0.0009999660987662876, + "loss": 0.0884, + "num_input_tokens_seen": 27399136, + "step": 12670 + }, + { + "epoch": 2.067699836867863, + "grad_norm": 0.48023906350135803, + "learning_rate": 0.0009999652648284136, + "loss": 0.1254, + "num_input_tokens_seen": 27409984, + "step": 12675 + }, + { + "epoch": 2.068515497553018, + "grad_norm": 0.3658871352672577, + "learning_rate": 0.0009999644207581092, + "loss": 0.2625, + "num_input_tokens_seen": 27421088, + "step": 12680 + }, + { + "epoch": 2.069331158238173, + "grad_norm": 0.10626986622810364, + "learning_rate": 0.000999963566555392, + "loss": 0.0983, + "num_input_tokens_seen": 27431648, + "step": 12685 + }, + { + "epoch": 2.070146818923328, + "grad_norm": 0.19185417890548706, + "learning_rate": 0.0009999627022202785, + "loss": 0.2449, + "num_input_tokens_seen": 27441440, + "step": 12690 + }, + { + "epoch": 2.070962479608483, + "grad_norm": 0.2664777338504791, + "learning_rate": 0.0009999618277527868, + "loss": 0.1815, + "num_input_tokens_seen": 27450496, + "step": 12695 + }, + { + "epoch": 2.0717781402936377, + "grad_norm": 0.0740237906575203, + "learning_rate": 0.0009999609431529345, + "loss": 0.1454, + "num_input_tokens_seen": 27461248, + "step": 12700 + }, + { + "epoch": 2.072593800978793, + "grad_norm": 0.112830251455307, + "learning_rate": 0.0009999600484207392, + "loss": 0.1482, + "num_input_tokens_seen": 27471104, + "step": 12705 + }, + { + "epoch": 2.073409461663948, + "grad_norm": 0.07632242888212204, + "learning_rate": 0.0009999591435562193, + "loss": 0.1155, + "num_input_tokens_seen": 27481376, + "step": 12710 + }, + { + "epoch": 2.0742251223491026, + "grad_norm": 0.06909924000501633, + "learning_rate": 0.0009999582285593932, + "loss": 0.031, + "num_input_tokens_seen": 27491872, + "step": 12715 + }, + { + "epoch": 2.075040783034258, + "grad_norm": 0.046119239181280136, + "learning_rate": 0.0009999573034302793, + "loss": 0.0963, + "num_input_tokens_seen": 27503712, + "step": 12720 + }, + { + "epoch": 2.0758564437194127, + "grad_norm": 0.0311514250934124, + "learning_rate": 0.0009999563681688964, + "loss": 0.1303, + "num_input_tokens_seen": 27514720, + "step": 12725 + }, + { + "epoch": 2.0766721044045675, + "grad_norm": 0.030699364840984344, + "learning_rate": 0.0009999554227752634, + "loss": 0.0915, + "num_input_tokens_seen": 27526016, + "step": 12730 + }, + { + "epoch": 2.0774877650897228, + "grad_norm": 0.16419100761413574, + "learning_rate": 0.0009999544672493997, + "loss": 0.0642, + "num_input_tokens_seen": 27536160, + "step": 12735 + }, + { + "epoch": 2.0783034257748776, + "grad_norm": 0.08112215995788574, + "learning_rate": 0.0009999535015913243, + "loss": 0.1317, + "num_input_tokens_seen": 27547296, + "step": 12740 + }, + { + "epoch": 2.0791190864600324, + "grad_norm": 0.04285610839724541, + "learning_rate": 0.0009999525258010571, + "loss": 0.26, + "num_input_tokens_seen": 27558688, + "step": 12745 + }, + { + "epoch": 2.0799347471451877, + "grad_norm": 0.05731838569045067, + "learning_rate": 0.0009999515398786177, + "loss": 0.2337, + "num_input_tokens_seen": 27570144, + "step": 12750 + }, + { + "epoch": 2.0807504078303425, + "grad_norm": 0.02599678374826908, + "learning_rate": 0.000999950543824026, + "loss": 0.0605, + "num_input_tokens_seen": 27581376, + "step": 12755 + }, + { + "epoch": 2.0815660685154977, + "grad_norm": 0.05650242790579796, + "learning_rate": 0.0009999495376373025, + "loss": 0.1582, + "num_input_tokens_seen": 27592480, + "step": 12760 + }, + { + "epoch": 2.0823817292006526, + "grad_norm": 0.10208354890346527, + "learning_rate": 0.0009999485213184672, + "loss": 0.0866, + "num_input_tokens_seen": 27603584, + "step": 12765 + }, + { + "epoch": 2.0831973898858074, + "grad_norm": 0.06093911826610565, + "learning_rate": 0.000999947494867541, + "loss": 0.1478, + "num_input_tokens_seen": 27615264, + "step": 12770 + }, + { + "epoch": 2.0840130505709626, + "grad_norm": 0.15886317193508148, + "learning_rate": 0.0009999464582845445, + "loss": 0.1507, + "num_input_tokens_seen": 27624800, + "step": 12775 + }, + { + "epoch": 2.0848287112561175, + "grad_norm": 0.1170632541179657, + "learning_rate": 0.0009999454115694989, + "loss": 0.132, + "num_input_tokens_seen": 27635456, + "step": 12780 + }, + { + "epoch": 2.0856443719412723, + "grad_norm": 0.23480327427387238, + "learning_rate": 0.0009999443547224253, + "loss": 0.0773, + "num_input_tokens_seen": 27646304, + "step": 12785 + }, + { + "epoch": 2.0864600326264275, + "grad_norm": 0.051176343113183975, + "learning_rate": 0.0009999432877433449, + "loss": 0.1447, + "num_input_tokens_seen": 27655456, + "step": 12790 + }, + { + "epoch": 2.0872756933115824, + "grad_norm": 0.11672715097665787, + "learning_rate": 0.0009999422106322798, + "loss": 0.0529, + "num_input_tokens_seen": 27665632, + "step": 12795 + }, + { + "epoch": 2.088091353996737, + "grad_norm": 0.15267214179039001, + "learning_rate": 0.0009999411233892516, + "loss": 0.1734, + "num_input_tokens_seen": 27677504, + "step": 12800 + }, + { + "epoch": 2.0889070146818924, + "grad_norm": 0.011678201146423817, + "learning_rate": 0.000999940026014282, + "loss": 0.0411, + "num_input_tokens_seen": 27688768, + "step": 12805 + }, + { + "epoch": 2.0897226753670473, + "grad_norm": 0.06737668067216873, + "learning_rate": 0.000999938918507394, + "loss": 0.0999, + "num_input_tokens_seen": 27699680, + "step": 12810 + }, + { + "epoch": 2.090538336052202, + "grad_norm": 0.03292597457766533, + "learning_rate": 0.0009999378008686093, + "loss": 0.1877, + "num_input_tokens_seen": 27709504, + "step": 12815 + }, + { + "epoch": 2.0913539967373573, + "grad_norm": 0.12016480416059494, + "learning_rate": 0.000999936673097951, + "loss": 0.1935, + "num_input_tokens_seen": 27720704, + "step": 12820 + }, + { + "epoch": 2.092169657422512, + "grad_norm": 0.018072044476866722, + "learning_rate": 0.0009999355351954418, + "loss": 0.178, + "num_input_tokens_seen": 27730656, + "step": 12825 + }, + { + "epoch": 2.0929853181076674, + "grad_norm": 0.1713826060295105, + "learning_rate": 0.0009999343871611045, + "loss": 0.1951, + "num_input_tokens_seen": 27740864, + "step": 12830 + }, + { + "epoch": 2.0938009787928222, + "grad_norm": 0.10080626606941223, + "learning_rate": 0.000999933228994963, + "loss": 0.1041, + "num_input_tokens_seen": 27751104, + "step": 12835 + }, + { + "epoch": 2.094616639477977, + "grad_norm": 0.10050133615732193, + "learning_rate": 0.00099993206069704, + "loss": 0.0784, + "num_input_tokens_seen": 27761760, + "step": 12840 + }, + { + "epoch": 2.0954323001631323, + "grad_norm": 0.02649875171482563, + "learning_rate": 0.0009999308822673599, + "loss": 0.2457, + "num_input_tokens_seen": 27772864, + "step": 12845 + }, + { + "epoch": 2.096247960848287, + "grad_norm": 0.09067980200052261, + "learning_rate": 0.000999929693705946, + "loss": 0.1051, + "num_input_tokens_seen": 27783936, + "step": 12850 + }, + { + "epoch": 2.097063621533442, + "grad_norm": 0.13061358034610748, + "learning_rate": 0.000999928495012823, + "loss": 0.1225, + "num_input_tokens_seen": 27794368, + "step": 12855 + }, + { + "epoch": 2.097879282218597, + "grad_norm": 0.11418254673480988, + "learning_rate": 0.0009999272861880148, + "loss": 0.0877, + "num_input_tokens_seen": 27804224, + "step": 12860 + }, + { + "epoch": 2.098694942903752, + "grad_norm": 0.1471940577030182, + "learning_rate": 0.0009999260672315456, + "loss": 0.1568, + "num_input_tokens_seen": 27815168, + "step": 12865 + }, + { + "epoch": 2.099510603588907, + "grad_norm": 0.45913568139076233, + "learning_rate": 0.0009999248381434406, + "loss": 0.1507, + "num_input_tokens_seen": 27825088, + "step": 12870 + }, + { + "epoch": 2.100326264274062, + "grad_norm": 0.21608853340148926, + "learning_rate": 0.0009999235989237249, + "loss": 0.2859, + "num_input_tokens_seen": 27834880, + "step": 12875 + }, + { + "epoch": 2.101141924959217, + "grad_norm": 0.06812023371458054, + "learning_rate": 0.0009999223495724228, + "loss": 0.1004, + "num_input_tokens_seen": 27845248, + "step": 12880 + }, + { + "epoch": 2.1019575856443717, + "grad_norm": 0.02336515672504902, + "learning_rate": 0.0009999210900895603, + "loss": 0.1676, + "num_input_tokens_seen": 27854144, + "step": 12885 + }, + { + "epoch": 2.102773246329527, + "grad_norm": 0.10959352552890778, + "learning_rate": 0.0009999198204751628, + "loss": 0.2141, + "num_input_tokens_seen": 27864000, + "step": 12890 + }, + { + "epoch": 2.103588907014682, + "grad_norm": 0.10948460549116135, + "learning_rate": 0.0009999185407292557, + "loss": 0.1374, + "num_input_tokens_seen": 27873856, + "step": 12895 + }, + { + "epoch": 2.104404567699837, + "grad_norm": 0.05279287323355675, + "learning_rate": 0.0009999172508518654, + "loss": 0.0531, + "num_input_tokens_seen": 27885088, + "step": 12900 + }, + { + "epoch": 2.105220228384992, + "grad_norm": 0.11285223066806793, + "learning_rate": 0.0009999159508430177, + "loss": 0.1392, + "num_input_tokens_seen": 27896256, + "step": 12905 + }, + { + "epoch": 2.1060358890701467, + "grad_norm": 0.12476310133934021, + "learning_rate": 0.000999914640702739, + "loss": 0.169, + "num_input_tokens_seen": 27905696, + "step": 12910 + }, + { + "epoch": 2.106851549755302, + "grad_norm": 0.21431344747543335, + "learning_rate": 0.000999913320431056, + "loss": 0.0857, + "num_input_tokens_seen": 27915808, + "step": 12915 + }, + { + "epoch": 2.107667210440457, + "grad_norm": 0.04303182661533356, + "learning_rate": 0.0009999119900279956, + "loss": 0.0545, + "num_input_tokens_seen": 27926528, + "step": 12920 + }, + { + "epoch": 2.1084828711256116, + "grad_norm": 0.029864931479096413, + "learning_rate": 0.0009999106494935843, + "loss": 0.162, + "num_input_tokens_seen": 27937120, + "step": 12925 + }, + { + "epoch": 2.109298531810767, + "grad_norm": 0.05805061757564545, + "learning_rate": 0.0009999092988278496, + "loss": 0.0684, + "num_input_tokens_seen": 27947648, + "step": 12930 + }, + { + "epoch": 2.1101141924959217, + "grad_norm": 0.012730555608868599, + "learning_rate": 0.0009999079380308186, + "loss": 0.0532, + "num_input_tokens_seen": 27958752, + "step": 12935 + }, + { + "epoch": 2.1109298531810765, + "grad_norm": 0.05740470066666603, + "learning_rate": 0.000999906567102519, + "loss": 0.1548, + "num_input_tokens_seen": 27969248, + "step": 12940 + }, + { + "epoch": 2.1117455138662318, + "grad_norm": 0.2081340104341507, + "learning_rate": 0.0009999051860429791, + "loss": 0.2038, + "num_input_tokens_seen": 27980256, + "step": 12945 + }, + { + "epoch": 2.1125611745513866, + "grad_norm": 0.057117413729429245, + "learning_rate": 0.000999903794852226, + "loss": 0.0831, + "num_input_tokens_seen": 27990016, + "step": 12950 + }, + { + "epoch": 2.1133768352365414, + "grad_norm": 0.03829582408070564, + "learning_rate": 0.0009999023935302886, + "loss": 0.0685, + "num_input_tokens_seen": 28000928, + "step": 12955 + }, + { + "epoch": 2.1141924959216967, + "grad_norm": 0.05271393433213234, + "learning_rate": 0.000999900982077195, + "loss": 0.0568, + "num_input_tokens_seen": 28009984, + "step": 12960 + }, + { + "epoch": 2.1150081566068515, + "grad_norm": 0.050736140459775925, + "learning_rate": 0.0009998995604929735, + "loss": 0.0266, + "num_input_tokens_seen": 28022560, + "step": 12965 + }, + { + "epoch": 2.1158238172920063, + "grad_norm": 0.018366431817412376, + "learning_rate": 0.0009998981287776536, + "loss": 0.0346, + "num_input_tokens_seen": 28032960, + "step": 12970 + }, + { + "epoch": 2.1166394779771616, + "grad_norm": 0.18964123725891113, + "learning_rate": 0.0009998966869312637, + "loss": 0.0733, + "num_input_tokens_seen": 28044512, + "step": 12975 + }, + { + "epoch": 2.1174551386623164, + "grad_norm": 0.14065471291542053, + "learning_rate": 0.0009998952349538335, + "loss": 0.103, + "num_input_tokens_seen": 28055040, + "step": 12980 + }, + { + "epoch": 2.1182707993474716, + "grad_norm": 0.022199753671884537, + "learning_rate": 0.000999893772845392, + "loss": 0.0426, + "num_input_tokens_seen": 28065952, + "step": 12985 + }, + { + "epoch": 2.1190864600326265, + "grad_norm": 0.06245112419128418, + "learning_rate": 0.0009998923006059692, + "loss": 0.0593, + "num_input_tokens_seen": 28077280, + "step": 12990 + }, + { + "epoch": 2.1199021207177813, + "grad_norm": 0.08001261949539185, + "learning_rate": 0.0009998908182355948, + "loss": 0.0577, + "num_input_tokens_seen": 28088448, + "step": 12995 + }, + { + "epoch": 2.1207177814029365, + "grad_norm": 0.04466702789068222, + "learning_rate": 0.0009998893257342986, + "loss": 0.0837, + "num_input_tokens_seen": 28099584, + "step": 13000 + }, + { + "epoch": 2.1215334420880914, + "grad_norm": 0.0033461377024650574, + "learning_rate": 0.000999887823102111, + "loss": 0.1053, + "num_input_tokens_seen": 28110496, + "step": 13005 + }, + { + "epoch": 2.122349102773246, + "grad_norm": 0.029360774904489517, + "learning_rate": 0.0009998863103390628, + "loss": 0.2003, + "num_input_tokens_seen": 28119616, + "step": 13010 + }, + { + "epoch": 2.1231647634584014, + "grad_norm": 0.047707974910736084, + "learning_rate": 0.0009998847874451843, + "loss": 0.1026, + "num_input_tokens_seen": 28129664, + "step": 13015 + }, + { + "epoch": 2.1239804241435563, + "grad_norm": 0.39130014181137085, + "learning_rate": 0.0009998832544205064, + "loss": 0.273, + "num_input_tokens_seen": 28140576, + "step": 13020 + }, + { + "epoch": 2.124796084828711, + "grad_norm": 0.1589263379573822, + "learning_rate": 0.0009998817112650603, + "loss": 0.1321, + "num_input_tokens_seen": 28150976, + "step": 13025 + }, + { + "epoch": 2.1256117455138663, + "grad_norm": 0.03926459699869156, + "learning_rate": 0.000999880157978877, + "loss": 0.0811, + "num_input_tokens_seen": 28161728, + "step": 13030 + }, + { + "epoch": 2.126427406199021, + "grad_norm": 0.09237375855445862, + "learning_rate": 0.0009998785945619882, + "loss": 0.0559, + "num_input_tokens_seen": 28172896, + "step": 13035 + }, + { + "epoch": 2.1272430668841764, + "grad_norm": 0.06005003675818443, + "learning_rate": 0.0009998770210144256, + "loss": 0.1213, + "num_input_tokens_seen": 28181184, + "step": 13040 + }, + { + "epoch": 2.1280587275693312, + "grad_norm": 0.15633663535118103, + "learning_rate": 0.000999875437336221, + "loss": 0.1371, + "num_input_tokens_seen": 28191552, + "step": 13045 + }, + { + "epoch": 2.128874388254486, + "grad_norm": 0.12029604613780975, + "learning_rate": 0.0009998738435274064, + "loss": 0.194, + "num_input_tokens_seen": 28203168, + "step": 13050 + }, + { + "epoch": 2.1296900489396413, + "grad_norm": 0.10934768617153168, + "learning_rate": 0.0009998722395880145, + "loss": 0.056, + "num_input_tokens_seen": 28214720, + "step": 13055 + }, + { + "epoch": 2.130505709624796, + "grad_norm": 0.016308283433318138, + "learning_rate": 0.0009998706255180774, + "loss": 0.1291, + "num_input_tokens_seen": 28225472, + "step": 13060 + }, + { + "epoch": 2.131321370309951, + "grad_norm": 0.35608285665512085, + "learning_rate": 0.0009998690013176279, + "loss": 0.2248, + "num_input_tokens_seen": 28235328, + "step": 13065 + }, + { + "epoch": 2.132137030995106, + "grad_norm": 0.29491209983825684, + "learning_rate": 0.0009998673669866988, + "loss": 0.3122, + "num_input_tokens_seen": 28246912, + "step": 13070 + }, + { + "epoch": 2.132952691680261, + "grad_norm": 0.09003280103206635, + "learning_rate": 0.0009998657225253236, + "loss": 0.2082, + "num_input_tokens_seen": 28258368, + "step": 13075 + }, + { + "epoch": 2.133768352365416, + "grad_norm": 0.11987301707267761, + "learning_rate": 0.0009998640679335354, + "loss": 0.0845, + "num_input_tokens_seen": 28268544, + "step": 13080 + }, + { + "epoch": 2.134584013050571, + "grad_norm": 0.031110180541872978, + "learning_rate": 0.0009998624032113677, + "loss": 0.0505, + "num_input_tokens_seen": 28280576, + "step": 13085 + }, + { + "epoch": 2.135399673735726, + "grad_norm": 0.11604081839323044, + "learning_rate": 0.0009998607283588543, + "loss": 0.0855, + "num_input_tokens_seen": 28290944, + "step": 13090 + }, + { + "epoch": 2.1362153344208807, + "grad_norm": 0.09189382195472717, + "learning_rate": 0.000999859043376029, + "loss": 0.1483, + "num_input_tokens_seen": 28302176, + "step": 13095 + }, + { + "epoch": 2.137030995106036, + "grad_norm": 0.026161260902881622, + "learning_rate": 0.0009998573482629264, + "loss": 0.0861, + "num_input_tokens_seen": 28312864, + "step": 13100 + }, + { + "epoch": 2.137846655791191, + "grad_norm": 0.10673241317272186, + "learning_rate": 0.0009998556430195803, + "loss": 0.0502, + "num_input_tokens_seen": 28323328, + "step": 13105 + }, + { + "epoch": 2.1386623164763456, + "grad_norm": 0.06484927237033844, + "learning_rate": 0.0009998539276460255, + "loss": 0.0325, + "num_input_tokens_seen": 28335264, + "step": 13110 + }, + { + "epoch": 2.139477977161501, + "grad_norm": 0.08133813738822937, + "learning_rate": 0.0009998522021422967, + "loss": 0.2518, + "num_input_tokens_seen": 28346400, + "step": 13115 + }, + { + "epoch": 2.1402936378466557, + "grad_norm": 0.2059800624847412, + "learning_rate": 0.000999850466508429, + "loss": 0.1144, + "num_input_tokens_seen": 28357888, + "step": 13120 + }, + { + "epoch": 2.141109298531811, + "grad_norm": 0.008014354854822159, + "learning_rate": 0.0009998487207444574, + "loss": 0.0368, + "num_input_tokens_seen": 28369888, + "step": 13125 + }, + { + "epoch": 2.141924959216966, + "grad_norm": 0.0397895947098732, + "learning_rate": 0.0009998469648504174, + "loss": 0.0487, + "num_input_tokens_seen": 28380736, + "step": 13130 + }, + { + "epoch": 2.1427406199021206, + "grad_norm": 0.05694444105029106, + "learning_rate": 0.0009998451988263444, + "loss": 0.238, + "num_input_tokens_seen": 28390816, + "step": 13135 + }, + { + "epoch": 2.143556280587276, + "grad_norm": 0.1789853572845459, + "learning_rate": 0.0009998434226722746, + "loss": 0.0606, + "num_input_tokens_seen": 28402080, + "step": 13140 + }, + { + "epoch": 2.1443719412724307, + "grad_norm": 0.24429011344909668, + "learning_rate": 0.0009998416363882438, + "loss": 0.1706, + "num_input_tokens_seen": 28412768, + "step": 13145 + }, + { + "epoch": 2.1451876019575855, + "grad_norm": 0.032388217747211456, + "learning_rate": 0.0009998398399742878, + "loss": 0.0746, + "num_input_tokens_seen": 28425056, + "step": 13150 + }, + { + "epoch": 2.1460032626427408, + "grad_norm": 0.08640412241220474, + "learning_rate": 0.0009998380334304436, + "loss": 0.1357, + "num_input_tokens_seen": 28436800, + "step": 13155 + }, + { + "epoch": 2.1468189233278956, + "grad_norm": 0.08437643945217133, + "learning_rate": 0.0009998362167567476, + "loss": 0.0239, + "num_input_tokens_seen": 28447360, + "step": 13160 + }, + { + "epoch": 2.1476345840130504, + "grad_norm": 0.03582681342959404, + "learning_rate": 0.0009998343899532364, + "loss": 0.0438, + "num_input_tokens_seen": 28457888, + "step": 13165 + }, + { + "epoch": 2.1484502446982057, + "grad_norm": 0.4298367202281952, + "learning_rate": 0.0009998325530199473, + "loss": 0.1974, + "num_input_tokens_seen": 28469472, + "step": 13170 + }, + { + "epoch": 2.1492659053833605, + "grad_norm": 0.034253429621458054, + "learning_rate": 0.0009998307059569174, + "loss": 0.2651, + "num_input_tokens_seen": 28479584, + "step": 13175 + }, + { + "epoch": 2.1500815660685153, + "grad_norm": 0.044094908982515335, + "learning_rate": 0.0009998288487641843, + "loss": 0.0908, + "num_input_tokens_seen": 28491520, + "step": 13180 + }, + { + "epoch": 2.1508972267536706, + "grad_norm": 0.020100802183151245, + "learning_rate": 0.0009998269814417854, + "loss": 0.1055, + "num_input_tokens_seen": 28501472, + "step": 13185 + }, + { + "epoch": 2.1517128874388254, + "grad_norm": 0.05620182305574417, + "learning_rate": 0.0009998251039897586, + "loss": 0.1607, + "num_input_tokens_seen": 28512320, + "step": 13190 + }, + { + "epoch": 2.15252854812398, + "grad_norm": 0.27030977606773376, + "learning_rate": 0.000999823216408142, + "loss": 0.1351, + "num_input_tokens_seen": 28523776, + "step": 13195 + }, + { + "epoch": 2.1533442088091355, + "grad_norm": 0.1298169493675232, + "learning_rate": 0.0009998213186969739, + "loss": 0.157, + "num_input_tokens_seen": 28534016, + "step": 13200 + }, + { + "epoch": 2.1541598694942903, + "grad_norm": 0.06606268137693405, + "learning_rate": 0.0009998194108562927, + "loss": 0.0791, + "num_input_tokens_seen": 28544096, + "step": 13205 + }, + { + "epoch": 2.1549755301794455, + "grad_norm": 0.07334471493959427, + "learning_rate": 0.000999817492886137, + "loss": 0.0535, + "num_input_tokens_seen": 28554912, + "step": 13210 + }, + { + "epoch": 2.1557911908646004, + "grad_norm": 0.08001653105020523, + "learning_rate": 0.000999815564786546, + "loss": 0.0618, + "num_input_tokens_seen": 28564416, + "step": 13215 + }, + { + "epoch": 2.156606851549755, + "grad_norm": 0.09251904487609863, + "learning_rate": 0.0009998136265575582, + "loss": 0.1092, + "num_input_tokens_seen": 28575328, + "step": 13220 + }, + { + "epoch": 2.1574225122349104, + "grad_norm": 0.16219399869441986, + "learning_rate": 0.0009998116781992133, + "loss": 0.176, + "num_input_tokens_seen": 28584992, + "step": 13225 + }, + { + "epoch": 2.1582381729200653, + "grad_norm": 0.1895899772644043, + "learning_rate": 0.0009998097197115507, + "loss": 0.0768, + "num_input_tokens_seen": 28594720, + "step": 13230 + }, + { + "epoch": 2.15905383360522, + "grad_norm": 0.10811378061771393, + "learning_rate": 0.00099980775109461, + "loss": 0.0716, + "num_input_tokens_seen": 28606560, + "step": 13235 + }, + { + "epoch": 2.1598694942903753, + "grad_norm": 0.006182703655213118, + "learning_rate": 0.0009998057723484312, + "loss": 0.0508, + "num_input_tokens_seen": 28617312, + "step": 13240 + }, + { + "epoch": 2.16068515497553, + "grad_norm": 0.04205413907766342, + "learning_rate": 0.0009998037834730545, + "loss": 0.2351, + "num_input_tokens_seen": 28629696, + "step": 13245 + }, + { + "epoch": 2.161500815660685, + "grad_norm": 0.05634569004178047, + "learning_rate": 0.0009998017844685201, + "loss": 0.0308, + "num_input_tokens_seen": 28640704, + "step": 13250 + }, + { + "epoch": 2.1623164763458402, + "grad_norm": 0.006111837457865477, + "learning_rate": 0.0009997997753348684, + "loss": 0.2064, + "num_input_tokens_seen": 28650432, + "step": 13255 + }, + { + "epoch": 2.163132137030995, + "grad_norm": 0.23091314733028412, + "learning_rate": 0.0009997977560721402, + "loss": 0.134, + "num_input_tokens_seen": 28662848, + "step": 13260 + }, + { + "epoch": 2.1639477977161503, + "grad_norm": 0.07334164530038834, + "learning_rate": 0.0009997957266803766, + "loss": 0.1186, + "num_input_tokens_seen": 28673568, + "step": 13265 + }, + { + "epoch": 2.164763458401305, + "grad_norm": 0.0321493074297905, + "learning_rate": 0.0009997936871596182, + "loss": 0.1529, + "num_input_tokens_seen": 28682784, + "step": 13270 + }, + { + "epoch": 2.16557911908646, + "grad_norm": 0.055742476135492325, + "learning_rate": 0.000999791637509907, + "loss": 0.1244, + "num_input_tokens_seen": 28692480, + "step": 13275 + }, + { + "epoch": 2.166394779771615, + "grad_norm": 0.19681106507778168, + "learning_rate": 0.0009997895777312843, + "loss": 0.1311, + "num_input_tokens_seen": 28702144, + "step": 13280 + }, + { + "epoch": 2.16721044045677, + "grad_norm": 0.059670474380254745, + "learning_rate": 0.0009997875078237915, + "loss": 0.1195, + "num_input_tokens_seen": 28711072, + "step": 13285 + }, + { + "epoch": 2.168026101141925, + "grad_norm": 0.1475917249917984, + "learning_rate": 0.000999785427787471, + "loss": 0.0813, + "num_input_tokens_seen": 28721504, + "step": 13290 + }, + { + "epoch": 2.16884176182708, + "grad_norm": 0.005357819609344006, + "learning_rate": 0.0009997833376223647, + "loss": 0.0831, + "num_input_tokens_seen": 28733184, + "step": 13295 + }, + { + "epoch": 2.169657422512235, + "grad_norm": 0.25688982009887695, + "learning_rate": 0.000999781237328515, + "loss": 0.1804, + "num_input_tokens_seen": 28743584, + "step": 13300 + }, + { + "epoch": 2.1704730831973897, + "grad_norm": 0.15253715217113495, + "learning_rate": 0.0009997791269059646, + "loss": 0.1502, + "num_input_tokens_seen": 28754432, + "step": 13305 + }, + { + "epoch": 2.171288743882545, + "grad_norm": 0.050005823373794556, + "learning_rate": 0.0009997770063547562, + "loss": 0.0221, + "num_input_tokens_seen": 28764928, + "step": 13310 + }, + { + "epoch": 2.1721044045677, + "grad_norm": 0.09376704692840576, + "learning_rate": 0.0009997748756749327, + "loss": 0.053, + "num_input_tokens_seen": 28775648, + "step": 13315 + }, + { + "epoch": 2.1729200652528546, + "grad_norm": 0.1947200894355774, + "learning_rate": 0.0009997727348665373, + "loss": 0.07, + "num_input_tokens_seen": 28787040, + "step": 13320 + }, + { + "epoch": 2.17373572593801, + "grad_norm": 0.04367386922240257, + "learning_rate": 0.0009997705839296135, + "loss": 0.0663, + "num_input_tokens_seen": 28798304, + "step": 13325 + }, + { + "epoch": 2.1745513866231647, + "grad_norm": 0.15603747963905334, + "learning_rate": 0.0009997684228642049, + "loss": 0.1732, + "num_input_tokens_seen": 28808608, + "step": 13330 + }, + { + "epoch": 2.1753670473083195, + "grad_norm": 0.052698567509651184, + "learning_rate": 0.0009997662516703552, + "loss": 0.0443, + "num_input_tokens_seen": 28819520, + "step": 13335 + }, + { + "epoch": 2.176182707993475, + "grad_norm": 0.016307028010487556, + "learning_rate": 0.0009997640703481082, + "loss": 0.1428, + "num_input_tokens_seen": 28830912, + "step": 13340 + }, + { + "epoch": 2.1769983686786296, + "grad_norm": 0.029360493645071983, + "learning_rate": 0.0009997618788975084, + "loss": 0.0631, + "num_input_tokens_seen": 28842208, + "step": 13345 + }, + { + "epoch": 2.177814029363785, + "grad_norm": 0.061105113476514816, + "learning_rate": 0.0009997596773186, + "loss": 0.0664, + "num_input_tokens_seen": 28852864, + "step": 13350 + }, + { + "epoch": 2.1786296900489397, + "grad_norm": 0.024250203743577003, + "learning_rate": 0.000999757465611428, + "loss": 0.0284, + "num_input_tokens_seen": 28862400, + "step": 13355 + }, + { + "epoch": 2.1794453507340945, + "grad_norm": 0.22527538239955902, + "learning_rate": 0.000999755243776037, + "loss": 0.0893, + "num_input_tokens_seen": 28872832, + "step": 13360 + }, + { + "epoch": 2.1802610114192498, + "grad_norm": 0.27681607007980347, + "learning_rate": 0.000999753011812472, + "loss": 0.0944, + "num_input_tokens_seen": 28883008, + "step": 13365 + }, + { + "epoch": 2.1810766721044046, + "grad_norm": 0.020333116874098778, + "learning_rate": 0.000999750769720778, + "loss": 0.0301, + "num_input_tokens_seen": 28894976, + "step": 13370 + }, + { + "epoch": 2.1818923327895594, + "grad_norm": 0.2983275055885315, + "learning_rate": 0.0009997485175010008, + "loss": 0.1387, + "num_input_tokens_seen": 28905888, + "step": 13375 + }, + { + "epoch": 2.1827079934747147, + "grad_norm": 0.15638279914855957, + "learning_rate": 0.000999746255153186, + "loss": 0.1873, + "num_input_tokens_seen": 28916640, + "step": 13380 + }, + { + "epoch": 2.1835236541598695, + "grad_norm": 0.34206530451774597, + "learning_rate": 0.0009997439826773791, + "loss": 0.175, + "num_input_tokens_seen": 28927424, + "step": 13385 + }, + { + "epoch": 2.1843393148450243, + "grad_norm": 0.013786377385258675, + "learning_rate": 0.0009997417000736266, + "loss": 0.1524, + "num_input_tokens_seen": 28938240, + "step": 13390 + }, + { + "epoch": 2.1851549755301796, + "grad_norm": 0.02515227347612381, + "learning_rate": 0.0009997394073419747, + "loss": 0.0392, + "num_input_tokens_seen": 28948192, + "step": 13395 + }, + { + "epoch": 2.1859706362153344, + "grad_norm": 0.24523992836475372, + "learning_rate": 0.0009997371044824697, + "loss": 0.0854, + "num_input_tokens_seen": 28958784, + "step": 13400 + }, + { + "epoch": 2.186786296900489, + "grad_norm": 0.011088637635111809, + "learning_rate": 0.0009997347914951582, + "loss": 0.1012, + "num_input_tokens_seen": 28969568, + "step": 13405 + }, + { + "epoch": 2.1876019575856445, + "grad_norm": 0.21824337542057037, + "learning_rate": 0.0009997324683800872, + "loss": 0.1296, + "num_input_tokens_seen": 28980096, + "step": 13410 + }, + { + "epoch": 2.1884176182707993, + "grad_norm": 0.07013077288866043, + "learning_rate": 0.0009997301351373038, + "loss": 0.0685, + "num_input_tokens_seen": 28990752, + "step": 13415 + }, + { + "epoch": 2.189233278955954, + "grad_norm": 0.008021237328648567, + "learning_rate": 0.0009997277917668552, + "loss": 0.1336, + "num_input_tokens_seen": 29001248, + "step": 13420 + }, + { + "epoch": 2.1900489396411094, + "grad_norm": 0.3219771087169647, + "learning_rate": 0.000999725438268789, + "loss": 0.1138, + "num_input_tokens_seen": 29011520, + "step": 13425 + }, + { + "epoch": 2.190864600326264, + "grad_norm": 0.05944007635116577, + "learning_rate": 0.0009997230746431529, + "loss": 0.0874, + "num_input_tokens_seen": 29022336, + "step": 13430 + }, + { + "epoch": 2.1916802610114194, + "grad_norm": 0.24976307153701782, + "learning_rate": 0.0009997207008899946, + "loss": 0.1174, + "num_input_tokens_seen": 29034464, + "step": 13435 + }, + { + "epoch": 2.1924959216965743, + "grad_norm": 0.09191533923149109, + "learning_rate": 0.0009997183170093625, + "loss": 0.1743, + "num_input_tokens_seen": 29045728, + "step": 13440 + }, + { + "epoch": 2.193311582381729, + "grad_norm": 0.01558864489197731, + "learning_rate": 0.000999715923001305, + "loss": 0.2273, + "num_input_tokens_seen": 29055392, + "step": 13445 + }, + { + "epoch": 2.1941272430668843, + "grad_norm": 0.03274226933717728, + "learning_rate": 0.00099971351886587, + "loss": 0.0776, + "num_input_tokens_seen": 29066304, + "step": 13450 + }, + { + "epoch": 2.194942903752039, + "grad_norm": 0.08572755008935928, + "learning_rate": 0.0009997111046031067, + "loss": 0.0711, + "num_input_tokens_seen": 29077344, + "step": 13455 + }, + { + "epoch": 2.195758564437194, + "grad_norm": 0.033109501004219055, + "learning_rate": 0.000999708680213064, + "loss": 0.0327, + "num_input_tokens_seen": 29088672, + "step": 13460 + }, + { + "epoch": 2.1965742251223492, + "grad_norm": 0.15404462814331055, + "learning_rate": 0.000999706245695791, + "loss": 0.1834, + "num_input_tokens_seen": 29100224, + "step": 13465 + }, + { + "epoch": 2.197389885807504, + "grad_norm": 0.1067809909582138, + "learning_rate": 0.0009997038010513368, + "loss": 0.0657, + "num_input_tokens_seen": 29110336, + "step": 13470 + }, + { + "epoch": 2.198205546492659, + "grad_norm": 0.3593546450138092, + "learning_rate": 0.0009997013462797514, + "loss": 0.1793, + "num_input_tokens_seen": 29121280, + "step": 13475 + }, + { + "epoch": 2.199021207177814, + "grad_norm": 0.13226577639579773, + "learning_rate": 0.000999698881381084, + "loss": 0.234, + "num_input_tokens_seen": 29133056, + "step": 13480 + }, + { + "epoch": 2.199836867862969, + "grad_norm": 0.037532739341259, + "learning_rate": 0.0009996964063553851, + "loss": 0.0819, + "num_input_tokens_seen": 29144544, + "step": 13485 + }, + { + "epoch": 2.200652528548124, + "grad_norm": 0.03458193317055702, + "learning_rate": 0.0009996939212027045, + "loss": 0.126, + "num_input_tokens_seen": 29155936, + "step": 13490 + }, + { + "epoch": 2.201468189233279, + "grad_norm": 0.11294587701559067, + "learning_rate": 0.0009996914259230928, + "loss": 0.1229, + "num_input_tokens_seen": 29166688, + "step": 13495 + }, + { + "epoch": 2.202283849918434, + "grad_norm": 0.1393410861492157, + "learning_rate": 0.0009996889205166003, + "loss": 0.124, + "num_input_tokens_seen": 29177568, + "step": 13500 + }, + { + "epoch": 2.203099510603589, + "grad_norm": 0.06924308836460114, + "learning_rate": 0.000999686404983278, + "loss": 0.1475, + "num_input_tokens_seen": 29187936, + "step": 13505 + }, + { + "epoch": 2.203915171288744, + "grad_norm": 0.023421689867973328, + "learning_rate": 0.0009996838793231771, + "loss": 0.155, + "num_input_tokens_seen": 29198208, + "step": 13510 + }, + { + "epoch": 2.2047308319738987, + "grad_norm": 0.1216554045677185, + "learning_rate": 0.0009996813435363481, + "loss": 0.1129, + "num_input_tokens_seen": 29209248, + "step": 13515 + }, + { + "epoch": 2.205546492659054, + "grad_norm": 0.29664355516433716, + "learning_rate": 0.000999678797622843, + "loss": 0.2195, + "num_input_tokens_seen": 29220032, + "step": 13520 + }, + { + "epoch": 2.206362153344209, + "grad_norm": 0.22446954250335693, + "learning_rate": 0.000999676241582713, + "loss": 0.1816, + "num_input_tokens_seen": 29230432, + "step": 13525 + }, + { + "epoch": 2.2071778140293636, + "grad_norm": 0.077766552567482, + "learning_rate": 0.0009996736754160102, + "loss": 0.0407, + "num_input_tokens_seen": 29241024, + "step": 13530 + }, + { + "epoch": 2.207993474714519, + "grad_norm": 0.12766964733600616, + "learning_rate": 0.0009996710991227865, + "loss": 0.1988, + "num_input_tokens_seen": 29252384, + "step": 13535 + }, + { + "epoch": 2.2088091353996737, + "grad_norm": 0.04737265780568123, + "learning_rate": 0.000999668512703094, + "loss": 0.13, + "num_input_tokens_seen": 29263648, + "step": 13540 + }, + { + "epoch": 2.2096247960848285, + "grad_norm": 0.1462884545326233, + "learning_rate": 0.0009996659161569852, + "loss": 0.1695, + "num_input_tokens_seen": 29274016, + "step": 13545 + }, + { + "epoch": 2.210440456769984, + "grad_norm": 0.026576614007353783, + "learning_rate": 0.0009996633094845127, + "loss": 0.163, + "num_input_tokens_seen": 29284128, + "step": 13550 + }, + { + "epoch": 2.2112561174551386, + "grad_norm": 0.16052138805389404, + "learning_rate": 0.0009996606926857296, + "loss": 0.0733, + "num_input_tokens_seen": 29294976, + "step": 13555 + }, + { + "epoch": 2.2120717781402934, + "grad_norm": 0.29137226939201355, + "learning_rate": 0.0009996580657606886, + "loss": 0.2453, + "num_input_tokens_seen": 29305696, + "step": 13560 + }, + { + "epoch": 2.2128874388254487, + "grad_norm": 0.14441095292568207, + "learning_rate": 0.0009996554287094428, + "loss": 0.1043, + "num_input_tokens_seen": 29316000, + "step": 13565 + }, + { + "epoch": 2.2137030995106035, + "grad_norm": 0.10074219852685928, + "learning_rate": 0.0009996527815320463, + "loss": 0.1148, + "num_input_tokens_seen": 29327872, + "step": 13570 + }, + { + "epoch": 2.2145187601957588, + "grad_norm": 0.043885741382837296, + "learning_rate": 0.000999650124228552, + "loss": 0.0642, + "num_input_tokens_seen": 29338528, + "step": 13575 + }, + { + "epoch": 2.2153344208809136, + "grad_norm": 0.29121461510658264, + "learning_rate": 0.0009996474567990142, + "loss": 0.2077, + "num_input_tokens_seen": 29349568, + "step": 13580 + }, + { + "epoch": 2.2161500815660684, + "grad_norm": 0.026214681565761566, + "learning_rate": 0.0009996447792434868, + "loss": 0.1421, + "num_input_tokens_seen": 29360544, + "step": 13585 + }, + { + "epoch": 2.2169657422512237, + "grad_norm": 0.11622193455696106, + "learning_rate": 0.000999642091562024, + "loss": 0.129, + "num_input_tokens_seen": 29371424, + "step": 13590 + }, + { + "epoch": 2.2177814029363785, + "grad_norm": 0.09396478533744812, + "learning_rate": 0.0009996393937546806, + "loss": 0.1663, + "num_input_tokens_seen": 29382176, + "step": 13595 + }, + { + "epoch": 2.2185970636215333, + "grad_norm": 0.19870883226394653, + "learning_rate": 0.000999636685821511, + "loss": 0.0664, + "num_input_tokens_seen": 29392672, + "step": 13600 + }, + { + "epoch": 2.2194127243066886, + "grad_norm": 0.013889658264815807, + "learning_rate": 0.0009996339677625702, + "loss": 0.0608, + "num_input_tokens_seen": 29402816, + "step": 13605 + }, + { + "epoch": 2.2202283849918434, + "grad_norm": 0.20676304399967194, + "learning_rate": 0.000999631239577913, + "loss": 0.1154, + "num_input_tokens_seen": 29415104, + "step": 13610 + }, + { + "epoch": 2.221044045676998, + "grad_norm": 0.3030164837837219, + "learning_rate": 0.000999628501267595, + "loss": 0.158, + "num_input_tokens_seen": 29425888, + "step": 13615 + }, + { + "epoch": 2.2218597063621535, + "grad_norm": 0.0975450873374939, + "learning_rate": 0.0009996257528316716, + "loss": 0.0996, + "num_input_tokens_seen": 29436672, + "step": 13620 + }, + { + "epoch": 2.2226753670473083, + "grad_norm": 0.027641698718070984, + "learning_rate": 0.0009996229942701984, + "loss": 0.2582, + "num_input_tokens_seen": 29447744, + "step": 13625 + }, + { + "epoch": 2.223491027732463, + "grad_norm": 0.3274204730987549, + "learning_rate": 0.0009996202255832317, + "loss": 0.1159, + "num_input_tokens_seen": 29458080, + "step": 13630 + }, + { + "epoch": 2.2243066884176184, + "grad_norm": 0.1280737817287445, + "learning_rate": 0.000999617446770827, + "loss": 0.1324, + "num_input_tokens_seen": 29468928, + "step": 13635 + }, + { + "epoch": 2.225122349102773, + "grad_norm": 0.027391565963625908, + "learning_rate": 0.0009996146578330409, + "loss": 0.0873, + "num_input_tokens_seen": 29478624, + "step": 13640 + }, + { + "epoch": 2.225938009787928, + "grad_norm": 0.03806430846452713, + "learning_rate": 0.0009996118587699302, + "loss": 0.0518, + "num_input_tokens_seen": 29489408, + "step": 13645 + }, + { + "epoch": 2.2267536704730833, + "grad_norm": 0.10042430460453033, + "learning_rate": 0.0009996090495815514, + "loss": 0.0661, + "num_input_tokens_seen": 29498912, + "step": 13650 + }, + { + "epoch": 2.227569331158238, + "grad_norm": 0.2514718770980835, + "learning_rate": 0.000999606230267961, + "loss": 0.1197, + "num_input_tokens_seen": 29509824, + "step": 13655 + }, + { + "epoch": 2.2283849918433933, + "grad_norm": 0.39634665846824646, + "learning_rate": 0.000999603400829217, + "loss": 0.2653, + "num_input_tokens_seen": 29521344, + "step": 13660 + }, + { + "epoch": 2.229200652528548, + "grad_norm": 0.3142750561237335, + "learning_rate": 0.0009996005612653762, + "loss": 0.1665, + "num_input_tokens_seen": 29533152, + "step": 13665 + }, + { + "epoch": 2.230016313213703, + "grad_norm": 0.06737440079450607, + "learning_rate": 0.000999597711576496, + "loss": 0.0555, + "num_input_tokens_seen": 29543360, + "step": 13670 + }, + { + "epoch": 2.2308319738988582, + "grad_norm": 0.06384480744600296, + "learning_rate": 0.0009995948517626347, + "loss": 0.1319, + "num_input_tokens_seen": 29555520, + "step": 13675 + }, + { + "epoch": 2.231647634584013, + "grad_norm": 0.057452812790870667, + "learning_rate": 0.0009995919818238496, + "loss": 0.0955, + "num_input_tokens_seen": 29566528, + "step": 13680 + }, + { + "epoch": 2.232463295269168, + "grad_norm": 0.17521750926971436, + "learning_rate": 0.0009995891017601996, + "loss": 0.0957, + "num_input_tokens_seen": 29578400, + "step": 13685 + }, + { + "epoch": 2.233278955954323, + "grad_norm": 0.017670799046754837, + "learning_rate": 0.0009995862115717426, + "loss": 0.116, + "num_input_tokens_seen": 29590528, + "step": 13690 + }, + { + "epoch": 2.234094616639478, + "grad_norm": 0.03287632763385773, + "learning_rate": 0.000999583311258537, + "loss": 0.1225, + "num_input_tokens_seen": 29601856, + "step": 13695 + }, + { + "epoch": 2.2349102773246328, + "grad_norm": 0.12917554378509521, + "learning_rate": 0.000999580400820642, + "loss": 0.118, + "num_input_tokens_seen": 29613344, + "step": 13700 + }, + { + "epoch": 2.235725938009788, + "grad_norm": 0.060185208916664124, + "learning_rate": 0.0009995774802581165, + "loss": 0.0905, + "num_input_tokens_seen": 29622656, + "step": 13705 + }, + { + "epoch": 2.236541598694943, + "grad_norm": 0.08099085092544556, + "learning_rate": 0.0009995745495710194, + "loss": 0.1697, + "num_input_tokens_seen": 29633824, + "step": 13710 + }, + { + "epoch": 2.237357259380098, + "grad_norm": 0.015069660730659962, + "learning_rate": 0.0009995716087594104, + "loss": 0.0533, + "num_input_tokens_seen": 29644608, + "step": 13715 + }, + { + "epoch": 2.238172920065253, + "grad_norm": 0.1912614107131958, + "learning_rate": 0.000999568657823349, + "loss": 0.1524, + "num_input_tokens_seen": 29655648, + "step": 13720 + }, + { + "epoch": 2.2389885807504077, + "grad_norm": 0.07988587021827698, + "learning_rate": 0.000999565696762895, + "loss": 0.0696, + "num_input_tokens_seen": 29666336, + "step": 13725 + }, + { + "epoch": 2.239804241435563, + "grad_norm": 0.10382115095853806, + "learning_rate": 0.0009995627255781083, + "loss": 0.1417, + "num_input_tokens_seen": 29677792, + "step": 13730 + }, + { + "epoch": 2.240619902120718, + "grad_norm": 0.004169138614088297, + "learning_rate": 0.0009995597442690493, + "loss": 0.0632, + "num_input_tokens_seen": 29687840, + "step": 13735 + }, + { + "epoch": 2.2414355628058726, + "grad_norm": 0.08007065951824188, + "learning_rate": 0.0009995567528357785, + "loss": 0.0196, + "num_input_tokens_seen": 29698880, + "step": 13740 + }, + { + "epoch": 2.242251223491028, + "grad_norm": 0.04466477409005165, + "learning_rate": 0.0009995537512783562, + "loss": 0.0859, + "num_input_tokens_seen": 29709408, + "step": 13745 + }, + { + "epoch": 2.2430668841761827, + "grad_norm": 0.003576258197426796, + "learning_rate": 0.0009995507395968435, + "loss": 0.0557, + "num_input_tokens_seen": 29719712, + "step": 13750 + }, + { + "epoch": 2.2438825448613375, + "grad_norm": 0.2553325593471527, + "learning_rate": 0.0009995477177913014, + "loss": 0.2313, + "num_input_tokens_seen": 29730144, + "step": 13755 + }, + { + "epoch": 2.244698205546493, + "grad_norm": 0.10004813224077225, + "learning_rate": 0.0009995446858617908, + "loss": 0.1032, + "num_input_tokens_seen": 29741408, + "step": 13760 + }, + { + "epoch": 2.2455138662316476, + "grad_norm": 0.08300987631082535, + "learning_rate": 0.0009995416438083736, + "loss": 0.169, + "num_input_tokens_seen": 29752736, + "step": 13765 + }, + { + "epoch": 2.2463295269168024, + "grad_norm": 0.11108089983463287, + "learning_rate": 0.0009995385916311112, + "loss": 0.0882, + "num_input_tokens_seen": 29763680, + "step": 13770 + }, + { + "epoch": 2.2471451876019577, + "grad_norm": 0.21327409148216248, + "learning_rate": 0.0009995355293300656, + "loss": 0.1918, + "num_input_tokens_seen": 29773472, + "step": 13775 + }, + { + "epoch": 2.2479608482871125, + "grad_norm": 0.0419314019382, + "learning_rate": 0.0009995324569052988, + "loss": 0.3502, + "num_input_tokens_seen": 29783776, + "step": 13780 + }, + { + "epoch": 2.2487765089722673, + "grad_norm": 0.06031573563814163, + "learning_rate": 0.000999529374356873, + "loss": 0.1638, + "num_input_tokens_seen": 29794752, + "step": 13785 + }, + { + "epoch": 2.2495921696574226, + "grad_norm": 0.03626589849591255, + "learning_rate": 0.0009995262816848507, + "loss": 0.0725, + "num_input_tokens_seen": 29805920, + "step": 13790 + }, + { + "epoch": 2.2504078303425774, + "grad_norm": 0.009249306283891201, + "learning_rate": 0.0009995231788892949, + "loss": 0.055, + "num_input_tokens_seen": 29817184, + "step": 13795 + }, + { + "epoch": 2.2512234910277327, + "grad_norm": 0.015564526431262493, + "learning_rate": 0.000999520065970268, + "loss": 0.0539, + "num_input_tokens_seen": 29828448, + "step": 13800 + }, + { + "epoch": 2.2520391517128875, + "grad_norm": 0.13924640417099, + "learning_rate": 0.000999516942927833, + "loss": 0.1173, + "num_input_tokens_seen": 29840512, + "step": 13805 + }, + { + "epoch": 2.2528548123980423, + "grad_norm": 0.09099038690328598, + "learning_rate": 0.0009995138097620537, + "loss": 0.0805, + "num_input_tokens_seen": 29852608, + "step": 13810 + }, + { + "epoch": 2.2536704730831976, + "grad_norm": 0.2963365912437439, + "learning_rate": 0.0009995106664729934, + "loss": 0.1063, + "num_input_tokens_seen": 29863488, + "step": 13815 + }, + { + "epoch": 2.2544861337683524, + "grad_norm": 0.11496601998806, + "learning_rate": 0.0009995075130607158, + "loss": 0.1043, + "num_input_tokens_seen": 29874752, + "step": 13820 + }, + { + "epoch": 2.255301794453507, + "grad_norm": 0.09994488954544067, + "learning_rate": 0.0009995043495252848, + "loss": 0.2477, + "num_input_tokens_seen": 29886144, + "step": 13825 + }, + { + "epoch": 2.2561174551386625, + "grad_norm": 0.11016276478767395, + "learning_rate": 0.0009995011758667644, + "loss": 0.3073, + "num_input_tokens_seen": 29896960, + "step": 13830 + }, + { + "epoch": 2.2569331158238173, + "grad_norm": 0.036299578845500946, + "learning_rate": 0.000999497992085219, + "loss": 0.0813, + "num_input_tokens_seen": 29908096, + "step": 13835 + }, + { + "epoch": 2.257748776508972, + "grad_norm": 0.0857962816953659, + "learning_rate": 0.0009994947981807132, + "loss": 0.1517, + "num_input_tokens_seen": 29918880, + "step": 13840 + }, + { + "epoch": 2.2585644371941274, + "grad_norm": 0.37297701835632324, + "learning_rate": 0.0009994915941533115, + "loss": 0.2363, + "num_input_tokens_seen": 29929152, + "step": 13845 + }, + { + "epoch": 2.259380097879282, + "grad_norm": 0.08345893025398254, + "learning_rate": 0.0009994883800030791, + "loss": 0.0882, + "num_input_tokens_seen": 29938560, + "step": 13850 + }, + { + "epoch": 2.2601957585644374, + "grad_norm": 0.006022712681442499, + "learning_rate": 0.0009994851557300812, + "loss": 0.1289, + "num_input_tokens_seen": 29948512, + "step": 13855 + }, + { + "epoch": 2.2610114192495923, + "grad_norm": 0.04013175144791603, + "learning_rate": 0.000999481921334383, + "loss": 0.2283, + "num_input_tokens_seen": 29958944, + "step": 13860 + }, + { + "epoch": 2.261827079934747, + "grad_norm": 0.07237616926431656, + "learning_rate": 0.0009994786768160496, + "loss": 0.2818, + "num_input_tokens_seen": 29970080, + "step": 13865 + }, + { + "epoch": 2.262642740619902, + "grad_norm": 0.1106957495212555, + "learning_rate": 0.0009994754221751474, + "loss": 0.087, + "num_input_tokens_seen": 29980256, + "step": 13870 + }, + { + "epoch": 2.263458401305057, + "grad_norm": 0.12279194593429565, + "learning_rate": 0.0009994721574117422, + "loss": 0.1116, + "num_input_tokens_seen": 29991776, + "step": 13875 + }, + { + "epoch": 2.264274061990212, + "grad_norm": 0.09282184392213821, + "learning_rate": 0.0009994688825259001, + "loss": 0.1075, + "num_input_tokens_seen": 30003232, + "step": 13880 + }, + { + "epoch": 2.2650897226753672, + "grad_norm": 0.036214679479599, + "learning_rate": 0.0009994655975176874, + "loss": 0.0976, + "num_input_tokens_seen": 30013504, + "step": 13885 + }, + { + "epoch": 2.265905383360522, + "grad_norm": 0.19235451519489288, + "learning_rate": 0.0009994623023871709, + "loss": 0.1041, + "num_input_tokens_seen": 30024256, + "step": 13890 + }, + { + "epoch": 2.266721044045677, + "grad_norm": 0.19567914307117462, + "learning_rate": 0.000999458997134417, + "loss": 0.1421, + "num_input_tokens_seen": 30034656, + "step": 13895 + }, + { + "epoch": 2.267536704730832, + "grad_norm": 0.3461471199989319, + "learning_rate": 0.000999455681759493, + "loss": 0.2955, + "num_input_tokens_seen": 30045856, + "step": 13900 + }, + { + "epoch": 2.268352365415987, + "grad_norm": 0.10061849653720856, + "learning_rate": 0.0009994523562624662, + "loss": 0.2066, + "num_input_tokens_seen": 30056032, + "step": 13905 + }, + { + "epoch": 2.2691680261011418, + "grad_norm": 0.23414430022239685, + "learning_rate": 0.0009994490206434038, + "loss": 0.0969, + "num_input_tokens_seen": 30067456, + "step": 13910 + }, + { + "epoch": 2.269983686786297, + "grad_norm": 0.16487205028533936, + "learning_rate": 0.000999445674902373, + "loss": 0.14, + "num_input_tokens_seen": 30078944, + "step": 13915 + }, + { + "epoch": 2.270799347471452, + "grad_norm": 0.1677953600883484, + "learning_rate": 0.0009994423190394423, + "loss": 0.1317, + "num_input_tokens_seen": 30089536, + "step": 13920 + }, + { + "epoch": 2.2716150081566067, + "grad_norm": 0.10743577778339386, + "learning_rate": 0.0009994389530546795, + "loss": 0.0725, + "num_input_tokens_seen": 30100256, + "step": 13925 + }, + { + "epoch": 2.272430668841762, + "grad_norm": 0.17737969756126404, + "learning_rate": 0.0009994355769481524, + "loss": 0.14, + "num_input_tokens_seen": 30110368, + "step": 13930 + }, + { + "epoch": 2.2732463295269167, + "grad_norm": 0.1521010547876358, + "learning_rate": 0.00099943219071993, + "loss": 0.2311, + "num_input_tokens_seen": 30122080, + "step": 13935 + }, + { + "epoch": 2.274061990212072, + "grad_norm": 0.12245091050863266, + "learning_rate": 0.0009994287943700807, + "loss": 0.0706, + "num_input_tokens_seen": 30132736, + "step": 13940 + }, + { + "epoch": 2.274877650897227, + "grad_norm": 0.13133811950683594, + "learning_rate": 0.0009994253878986732, + "loss": 0.1867, + "num_input_tokens_seen": 30142624, + "step": 13945 + }, + { + "epoch": 2.2756933115823816, + "grad_norm": 0.13661529123783112, + "learning_rate": 0.0009994219713057768, + "loss": 0.1389, + "num_input_tokens_seen": 30153632, + "step": 13950 + }, + { + "epoch": 2.2765089722675365, + "grad_norm": 0.08187350630760193, + "learning_rate": 0.0009994185445914604, + "loss": 0.0999, + "num_input_tokens_seen": 30165056, + "step": 13955 + }, + { + "epoch": 2.2773246329526917, + "grad_norm": 0.12019728869199753, + "learning_rate": 0.000999415107755794, + "loss": 0.0526, + "num_input_tokens_seen": 30176480, + "step": 13960 + }, + { + "epoch": 2.2781402936378465, + "grad_norm": 0.04871026799082756, + "learning_rate": 0.0009994116607988464, + "loss": 0.2142, + "num_input_tokens_seen": 30187200, + "step": 13965 + }, + { + "epoch": 2.278955954323002, + "grad_norm": 0.03447539359331131, + "learning_rate": 0.0009994082037206881, + "loss": 0.0814, + "num_input_tokens_seen": 30198336, + "step": 13970 + }, + { + "epoch": 2.2797716150081566, + "grad_norm": 0.09334293007850647, + "learning_rate": 0.0009994047365213892, + "loss": 0.1331, + "num_input_tokens_seen": 30208416, + "step": 13975 + }, + { + "epoch": 2.2805872756933114, + "grad_norm": 0.07840663939714432, + "learning_rate": 0.0009994012592010196, + "loss": 0.0942, + "num_input_tokens_seen": 30219424, + "step": 13980 + }, + { + "epoch": 2.2814029363784667, + "grad_norm": 0.24991539120674133, + "learning_rate": 0.00099939777175965, + "loss": 0.1466, + "num_input_tokens_seen": 30229440, + "step": 13985 + }, + { + "epoch": 2.2822185970636215, + "grad_norm": 0.02840086817741394, + "learning_rate": 0.000999394274197351, + "loss": 0.1704, + "num_input_tokens_seen": 30241088, + "step": 13990 + }, + { + "epoch": 2.2830342577487763, + "grad_norm": 0.11454634368419647, + "learning_rate": 0.0009993907665141934, + "loss": 0.0365, + "num_input_tokens_seen": 30252672, + "step": 13995 + }, + { + "epoch": 2.2838499184339316, + "grad_norm": 0.43254274129867554, + "learning_rate": 0.0009993872487102486, + "loss": 0.1782, + "num_input_tokens_seen": 30263712, + "step": 14000 + }, + { + "epoch": 2.2846655791190864, + "grad_norm": 0.27640798687934875, + "learning_rate": 0.0009993837207855876, + "loss": 0.2381, + "num_input_tokens_seen": 30275200, + "step": 14005 + }, + { + "epoch": 2.2854812398042412, + "grad_norm": 0.36231812834739685, + "learning_rate": 0.000999380182740282, + "loss": 0.1693, + "num_input_tokens_seen": 30286080, + "step": 14010 + }, + { + "epoch": 2.2862969004893965, + "grad_norm": 0.07490170747041702, + "learning_rate": 0.0009993766345744036, + "loss": 0.0939, + "num_input_tokens_seen": 30297504, + "step": 14015 + }, + { + "epoch": 2.2871125611745513, + "grad_norm": 0.07344070076942444, + "learning_rate": 0.000999373076288024, + "loss": 0.0515, + "num_input_tokens_seen": 30308704, + "step": 14020 + }, + { + "epoch": 2.2879282218597066, + "grad_norm": 0.027155442163348198, + "learning_rate": 0.0009993695078812156, + "loss": 0.064, + "num_input_tokens_seen": 30318208, + "step": 14025 + }, + { + "epoch": 2.2887438825448614, + "grad_norm": 0.04195243865251541, + "learning_rate": 0.0009993659293540506, + "loss": 0.3038, + "num_input_tokens_seen": 30328480, + "step": 14030 + }, + { + "epoch": 2.289559543230016, + "grad_norm": 0.05106004700064659, + "learning_rate": 0.0009993623407066016, + "loss": 0.1349, + "num_input_tokens_seen": 30339200, + "step": 14035 + }, + { + "epoch": 2.2903752039151715, + "grad_norm": 0.032931577414274216, + "learning_rate": 0.0009993587419389412, + "loss": 0.0565, + "num_input_tokens_seen": 30348864, + "step": 14040 + }, + { + "epoch": 2.2911908646003263, + "grad_norm": 0.1499442309141159, + "learning_rate": 0.0009993551330511423, + "loss": 0.0761, + "num_input_tokens_seen": 30360064, + "step": 14045 + }, + { + "epoch": 2.292006525285481, + "grad_norm": 0.07077664136886597, + "learning_rate": 0.0009993515140432783, + "loss": 0.1002, + "num_input_tokens_seen": 30371616, + "step": 14050 + }, + { + "epoch": 2.2928221859706364, + "grad_norm": 0.11122244596481323, + "learning_rate": 0.0009993478849154224, + "loss": 0.0986, + "num_input_tokens_seen": 30382976, + "step": 14055 + }, + { + "epoch": 2.293637846655791, + "grad_norm": 0.016836611554026604, + "learning_rate": 0.0009993442456676482, + "loss": 0.1041, + "num_input_tokens_seen": 30395040, + "step": 14060 + }, + { + "epoch": 2.294453507340946, + "grad_norm": 0.1562366485595703, + "learning_rate": 0.0009993405963000294, + "loss": 0.083, + "num_input_tokens_seen": 30405792, + "step": 14065 + }, + { + "epoch": 2.2952691680261013, + "grad_norm": 0.07218848168849945, + "learning_rate": 0.00099933693681264, + "loss": 0.1354, + "num_input_tokens_seen": 30418272, + "step": 14070 + }, + { + "epoch": 2.296084828711256, + "grad_norm": 0.2335197776556015, + "learning_rate": 0.000999333267205554, + "loss": 0.0964, + "num_input_tokens_seen": 30429472, + "step": 14075 + }, + { + "epoch": 2.2969004893964113, + "grad_norm": 0.0899466797709465, + "learning_rate": 0.000999329587478846, + "loss": 0.0773, + "num_input_tokens_seen": 30439456, + "step": 14080 + }, + { + "epoch": 2.297716150081566, + "grad_norm": 0.14783619344234467, + "learning_rate": 0.0009993258976325903, + "loss": 0.1321, + "num_input_tokens_seen": 30451168, + "step": 14085 + }, + { + "epoch": 2.298531810766721, + "grad_norm": 0.08867207169532776, + "learning_rate": 0.0009993221976668618, + "loss": 0.1167, + "num_input_tokens_seen": 30461888, + "step": 14090 + }, + { + "epoch": 2.299347471451876, + "grad_norm": 0.09165129065513611, + "learning_rate": 0.0009993184875817357, + "loss": 0.1071, + "num_input_tokens_seen": 30472128, + "step": 14095 + }, + { + "epoch": 2.300163132137031, + "grad_norm": 0.5241231918334961, + "learning_rate": 0.0009993147673772868, + "loss": 0.3387, + "num_input_tokens_seen": 30482624, + "step": 14100 + }, + { + "epoch": 2.300978792822186, + "grad_norm": 0.1067451611161232, + "learning_rate": 0.000999311037053591, + "loss": 0.0684, + "num_input_tokens_seen": 30495104, + "step": 14105 + }, + { + "epoch": 2.301794453507341, + "grad_norm": 0.19375212490558624, + "learning_rate": 0.0009993072966107235, + "loss": 0.1801, + "num_input_tokens_seen": 30506368, + "step": 14110 + }, + { + "epoch": 2.302610114192496, + "grad_norm": 0.05557816103100777, + "learning_rate": 0.0009993035460487602, + "loss": 0.1223, + "num_input_tokens_seen": 30516608, + "step": 14115 + }, + { + "epoch": 2.3034257748776508, + "grad_norm": 0.15730692446231842, + "learning_rate": 0.0009992997853677773, + "loss": 0.1223, + "num_input_tokens_seen": 30526944, + "step": 14120 + }, + { + "epoch": 2.304241435562806, + "grad_norm": 0.07870891690254211, + "learning_rate": 0.0009992960145678506, + "loss": 0.0757, + "num_input_tokens_seen": 30537216, + "step": 14125 + }, + { + "epoch": 2.305057096247961, + "grad_norm": 0.19272860884666443, + "learning_rate": 0.0009992922336490568, + "loss": 0.1123, + "num_input_tokens_seen": 30547904, + "step": 14130 + }, + { + "epoch": 2.3058727569331157, + "grad_norm": 0.13096141815185547, + "learning_rate": 0.0009992884426114725, + "loss": 0.2245, + "num_input_tokens_seen": 30559328, + "step": 14135 + }, + { + "epoch": 2.306688417618271, + "grad_norm": 0.12387151271104813, + "learning_rate": 0.0009992846414551746, + "loss": 0.2344, + "num_input_tokens_seen": 30571424, + "step": 14140 + }, + { + "epoch": 2.3075040783034257, + "grad_norm": 0.06615098565816879, + "learning_rate": 0.00099928083018024, + "loss": 0.1153, + "num_input_tokens_seen": 30582560, + "step": 14145 + }, + { + "epoch": 2.3083197389885806, + "grad_norm": 0.156528040766716, + "learning_rate": 0.000999277008786746, + "loss": 0.1359, + "num_input_tokens_seen": 30593568, + "step": 14150 + }, + { + "epoch": 2.309135399673736, + "grad_norm": 0.0884905606508255, + "learning_rate": 0.0009992731772747701, + "loss": 0.188, + "num_input_tokens_seen": 30604704, + "step": 14155 + }, + { + "epoch": 2.3099510603588906, + "grad_norm": 0.1028173565864563, + "learning_rate": 0.0009992693356443898, + "loss": 0.0918, + "num_input_tokens_seen": 30615200, + "step": 14160 + }, + { + "epoch": 2.310766721044046, + "grad_norm": 0.06769051402807236, + "learning_rate": 0.0009992654838956831, + "loss": 0.0695, + "num_input_tokens_seen": 30626016, + "step": 14165 + }, + { + "epoch": 2.3115823817292007, + "grad_norm": 0.06213633716106415, + "learning_rate": 0.000999261622028728, + "loss": 0.1648, + "num_input_tokens_seen": 30637056, + "step": 14170 + }, + { + "epoch": 2.3123980424143555, + "grad_norm": 0.1374928504228592, + "learning_rate": 0.0009992577500436027, + "loss": 0.0828, + "num_input_tokens_seen": 30647616, + "step": 14175 + }, + { + "epoch": 2.3132137030995104, + "grad_norm": 0.027195928618311882, + "learning_rate": 0.0009992538679403857, + "loss": 0.1721, + "num_input_tokens_seen": 30658848, + "step": 14180 + }, + { + "epoch": 2.3140293637846656, + "grad_norm": 0.07334202527999878, + "learning_rate": 0.0009992499757191559, + "loss": 0.129, + "num_input_tokens_seen": 30669952, + "step": 14185 + }, + { + "epoch": 2.3148450244698204, + "grad_norm": 0.16840721666812897, + "learning_rate": 0.000999246073379992, + "loss": 0.1011, + "num_input_tokens_seen": 30680576, + "step": 14190 + }, + { + "epoch": 2.3156606851549757, + "grad_norm": 0.006261528003960848, + "learning_rate": 0.0009992421609229729, + "loss": 0.1487, + "num_input_tokens_seen": 30689984, + "step": 14195 + }, + { + "epoch": 2.3164763458401305, + "grad_norm": 0.11172261834144592, + "learning_rate": 0.0009992382383481782, + "loss": 0.0794, + "num_input_tokens_seen": 30701696, + "step": 14200 + }, + { + "epoch": 2.3172920065252853, + "grad_norm": 0.21091987192630768, + "learning_rate": 0.0009992343056556873, + "loss": 0.1625, + "num_input_tokens_seen": 30712064, + "step": 14205 + }, + { + "epoch": 2.3181076672104406, + "grad_norm": 0.19095903635025024, + "learning_rate": 0.0009992303628455796, + "loss": 0.154, + "num_input_tokens_seen": 30723040, + "step": 14210 + }, + { + "epoch": 2.3189233278955954, + "grad_norm": 0.14958035945892334, + "learning_rate": 0.0009992264099179355, + "loss": 0.1231, + "num_input_tokens_seen": 30734720, + "step": 14215 + }, + { + "epoch": 2.3197389885807502, + "grad_norm": 0.207402765750885, + "learning_rate": 0.000999222446872835, + "loss": 0.0863, + "num_input_tokens_seen": 30747008, + "step": 14220 + }, + { + "epoch": 2.3205546492659055, + "grad_norm": 0.07774131745100021, + "learning_rate": 0.0009992184737103583, + "loss": 0.0618, + "num_input_tokens_seen": 30757248, + "step": 14225 + }, + { + "epoch": 2.3213703099510603, + "grad_norm": 0.06501632183790207, + "learning_rate": 0.0009992144904305857, + "loss": 0.119, + "num_input_tokens_seen": 30768736, + "step": 14230 + }, + { + "epoch": 2.322185970636215, + "grad_norm": 0.19293227791786194, + "learning_rate": 0.0009992104970335982, + "loss": 0.0865, + "num_input_tokens_seen": 30779264, + "step": 14235 + }, + { + "epoch": 2.3230016313213704, + "grad_norm": 0.32836616039276123, + "learning_rate": 0.0009992064935194767, + "loss": 0.2237, + "num_input_tokens_seen": 30790592, + "step": 14240 + }, + { + "epoch": 2.323817292006525, + "grad_norm": 0.04832938686013222, + "learning_rate": 0.0009992024798883025, + "loss": 0.1201, + "num_input_tokens_seen": 30802208, + "step": 14245 + }, + { + "epoch": 2.3246329526916805, + "grad_norm": 0.20137490332126617, + "learning_rate": 0.0009991984561401566, + "loss": 0.1799, + "num_input_tokens_seen": 30812160, + "step": 14250 + }, + { + "epoch": 2.3254486133768353, + "grad_norm": 0.12982739508152008, + "learning_rate": 0.0009991944222751208, + "loss": 0.0397, + "num_input_tokens_seen": 30823040, + "step": 14255 + }, + { + "epoch": 2.32626427406199, + "grad_norm": 0.11527493596076965, + "learning_rate": 0.0009991903782932765, + "loss": 0.1478, + "num_input_tokens_seen": 30833760, + "step": 14260 + }, + { + "epoch": 2.3270799347471454, + "grad_norm": 0.009630398824810982, + "learning_rate": 0.0009991863241947062, + "loss": 0.1292, + "num_input_tokens_seen": 30845216, + "step": 14265 + }, + { + "epoch": 2.3278955954323, + "grad_norm": 0.2665800154209137, + "learning_rate": 0.0009991822599794916, + "loss": 0.1223, + "num_input_tokens_seen": 30855200, + "step": 14270 + }, + { + "epoch": 2.328711256117455, + "grad_norm": 0.21644163131713867, + "learning_rate": 0.0009991781856477156, + "loss": 0.0993, + "num_input_tokens_seen": 30865696, + "step": 14275 + }, + { + "epoch": 2.3295269168026103, + "grad_norm": 0.11290088295936584, + "learning_rate": 0.00099917410119946, + "loss": 0.0717, + "num_input_tokens_seen": 30877472, + "step": 14280 + }, + { + "epoch": 2.330342577487765, + "grad_norm": 0.09280723333358765, + "learning_rate": 0.0009991700066348081, + "loss": 0.0898, + "num_input_tokens_seen": 30887552, + "step": 14285 + }, + { + "epoch": 2.33115823817292, + "grad_norm": 0.04525647312402725, + "learning_rate": 0.000999165901953843, + "loss": 0.0806, + "num_input_tokens_seen": 30898432, + "step": 14290 + }, + { + "epoch": 2.331973898858075, + "grad_norm": 0.1614445596933365, + "learning_rate": 0.0009991617871566473, + "loss": 0.0871, + "num_input_tokens_seen": 30908000, + "step": 14295 + }, + { + "epoch": 2.33278955954323, + "grad_norm": 0.08257835358381271, + "learning_rate": 0.000999157662243305, + "loss": 0.1466, + "num_input_tokens_seen": 30920256, + "step": 14300 + }, + { + "epoch": 2.3336052202283852, + "grad_norm": 0.010302538052201271, + "learning_rate": 0.0009991535272138995, + "loss": 0.3769, + "num_input_tokens_seen": 30931232, + "step": 14305 + }, + { + "epoch": 2.33442088091354, + "grad_norm": 0.05618816241621971, + "learning_rate": 0.0009991493820685142, + "loss": 0.0733, + "num_input_tokens_seen": 30941536, + "step": 14310 + }, + { + "epoch": 2.335236541598695, + "grad_norm": 0.04763595759868622, + "learning_rate": 0.000999145226807234, + "loss": 0.0348, + "num_input_tokens_seen": 30951872, + "step": 14315 + }, + { + "epoch": 2.3360522022838497, + "grad_norm": 0.028318610042333603, + "learning_rate": 0.000999141061430142, + "loss": 0.1134, + "num_input_tokens_seen": 30961792, + "step": 14320 + }, + { + "epoch": 2.336867862969005, + "grad_norm": 0.20109055936336517, + "learning_rate": 0.0009991368859373236, + "loss": 0.1174, + "num_input_tokens_seen": 30971968, + "step": 14325 + }, + { + "epoch": 2.3376835236541598, + "grad_norm": 0.057134952396154404, + "learning_rate": 0.0009991327003288626, + "loss": 0.0911, + "num_input_tokens_seen": 30983456, + "step": 14330 + }, + { + "epoch": 2.338499184339315, + "grad_norm": 0.07061900943517685, + "learning_rate": 0.0009991285046048446, + "loss": 0.1042, + "num_input_tokens_seen": 30994400, + "step": 14335 + }, + { + "epoch": 2.33931484502447, + "grad_norm": 0.06494476646184921, + "learning_rate": 0.0009991242987653541, + "loss": 0.2292, + "num_input_tokens_seen": 31005280, + "step": 14340 + }, + { + "epoch": 2.3401305057096247, + "grad_norm": 0.18833774328231812, + "learning_rate": 0.0009991200828104766, + "loss": 0.1026, + "num_input_tokens_seen": 31015264, + "step": 14345 + }, + { + "epoch": 2.34094616639478, + "grad_norm": 0.06600786000490189, + "learning_rate": 0.0009991158567402973, + "loss": 0.1364, + "num_input_tokens_seen": 31023776, + "step": 14350 + }, + { + "epoch": 2.3417618270799347, + "grad_norm": 0.3423004746437073, + "learning_rate": 0.0009991116205549022, + "loss": 0.3825, + "num_input_tokens_seen": 31035232, + "step": 14355 + }, + { + "epoch": 2.3425774877650896, + "grad_norm": 0.21730433404445648, + "learning_rate": 0.0009991073742543768, + "loss": 0.1815, + "num_input_tokens_seen": 31046688, + "step": 14360 + }, + { + "epoch": 2.343393148450245, + "grad_norm": 0.03402172401547432, + "learning_rate": 0.0009991031178388072, + "loss": 0.086, + "num_input_tokens_seen": 31056928, + "step": 14365 + }, + { + "epoch": 2.3442088091353996, + "grad_norm": 0.05237003415822983, + "learning_rate": 0.0009990988513082799, + "loss": 0.1465, + "num_input_tokens_seen": 31067616, + "step": 14370 + }, + { + "epoch": 2.3450244698205545, + "grad_norm": 0.06425706297159195, + "learning_rate": 0.0009990945746628812, + "loss": 0.0777, + "num_input_tokens_seen": 31079648, + "step": 14375 + }, + { + "epoch": 2.3458401305057097, + "grad_norm": 0.07050355523824692, + "learning_rate": 0.0009990902879026978, + "loss": 0.1368, + "num_input_tokens_seen": 31091040, + "step": 14380 + }, + { + "epoch": 2.3466557911908645, + "grad_norm": 0.09159641712903976, + "learning_rate": 0.0009990859910278167, + "loss": 0.0728, + "num_input_tokens_seen": 31101024, + "step": 14385 + }, + { + "epoch": 2.34747145187602, + "grad_norm": 0.17783799767494202, + "learning_rate": 0.0009990816840383247, + "loss": 0.113, + "num_input_tokens_seen": 31111520, + "step": 14390 + }, + { + "epoch": 2.3482871125611746, + "grad_norm": 0.12338680028915405, + "learning_rate": 0.0009990773669343092, + "loss": 0.136, + "num_input_tokens_seen": 31123680, + "step": 14395 + }, + { + "epoch": 2.3491027732463294, + "grad_norm": 0.08523198962211609, + "learning_rate": 0.0009990730397158578, + "loss": 0.1999, + "num_input_tokens_seen": 31132768, + "step": 14400 + }, + { + "epoch": 2.3499184339314847, + "grad_norm": 0.27966123819351196, + "learning_rate": 0.0009990687023830583, + "loss": 0.0596, + "num_input_tokens_seen": 31144960, + "step": 14405 + }, + { + "epoch": 2.3507340946166395, + "grad_norm": 0.06837616860866547, + "learning_rate": 0.0009990643549359982, + "loss": 0.06, + "num_input_tokens_seen": 31155872, + "step": 14410 + }, + { + "epoch": 2.3515497553017943, + "grad_norm": 0.034586962312459946, + "learning_rate": 0.0009990599973747657, + "loss": 0.0702, + "num_input_tokens_seen": 31166912, + "step": 14415 + }, + { + "epoch": 2.3523654159869496, + "grad_norm": 0.1494152545928955, + "learning_rate": 0.0009990556296994497, + "loss": 0.1052, + "num_input_tokens_seen": 31177504, + "step": 14420 + }, + { + "epoch": 2.3531810766721044, + "grad_norm": 0.03763037547469139, + "learning_rate": 0.000999051251910138, + "loss": 0.1022, + "num_input_tokens_seen": 31188576, + "step": 14425 + }, + { + "epoch": 2.3539967373572592, + "grad_norm": 0.05461564660072327, + "learning_rate": 0.0009990468640069196, + "loss": 0.085, + "num_input_tokens_seen": 31199136, + "step": 14430 + }, + { + "epoch": 2.3548123980424145, + "grad_norm": 0.02698604017496109, + "learning_rate": 0.0009990424659898833, + "loss": 0.0803, + "num_input_tokens_seen": 31209408, + "step": 14435 + }, + { + "epoch": 2.3556280587275693, + "grad_norm": 0.05417114123702049, + "learning_rate": 0.0009990380578591186, + "loss": 0.1756, + "num_input_tokens_seen": 31220128, + "step": 14440 + }, + { + "epoch": 2.356443719412724, + "grad_norm": 0.1396542489528656, + "learning_rate": 0.0009990336396147144, + "loss": 0.232, + "num_input_tokens_seen": 31230944, + "step": 14445 + }, + { + "epoch": 2.3572593800978794, + "grad_norm": 0.07993250340223312, + "learning_rate": 0.0009990292112567606, + "loss": 0.1641, + "num_input_tokens_seen": 31241888, + "step": 14450 + }, + { + "epoch": 2.358075040783034, + "grad_norm": 0.22863461077213287, + "learning_rate": 0.0009990247727853466, + "loss": 0.114, + "num_input_tokens_seen": 31252480, + "step": 14455 + }, + { + "epoch": 2.358890701468189, + "grad_norm": 0.039005108177661896, + "learning_rate": 0.0009990203242005626, + "loss": 0.1857, + "num_input_tokens_seen": 31263296, + "step": 14460 + }, + { + "epoch": 2.3597063621533443, + "grad_norm": 0.06166834011673927, + "learning_rate": 0.0009990158655024985, + "loss": 0.3229, + "num_input_tokens_seen": 31274880, + "step": 14465 + }, + { + "epoch": 2.360522022838499, + "grad_norm": 0.2376435250043869, + "learning_rate": 0.0009990113966912451, + "loss": 0.2014, + "num_input_tokens_seen": 31285600, + "step": 14470 + }, + { + "epoch": 2.3613376835236544, + "grad_norm": 0.17259903252124786, + "learning_rate": 0.0009990069177668926, + "loss": 0.1741, + "num_input_tokens_seen": 31296288, + "step": 14475 + }, + { + "epoch": 2.362153344208809, + "grad_norm": 0.06873729079961777, + "learning_rate": 0.0009990024287295318, + "loss": 0.1471, + "num_input_tokens_seen": 31307264, + "step": 14480 + }, + { + "epoch": 2.362969004893964, + "grad_norm": 0.042510055005550385, + "learning_rate": 0.000998997929579254, + "loss": 0.083, + "num_input_tokens_seen": 31317568, + "step": 14485 + }, + { + "epoch": 2.3637846655791193, + "grad_norm": 0.02149435691535473, + "learning_rate": 0.0009989934203161498, + "loss": 0.0877, + "num_input_tokens_seen": 31329792, + "step": 14490 + }, + { + "epoch": 2.364600326264274, + "grad_norm": 0.07500947266817093, + "learning_rate": 0.0009989889009403112, + "loss": 0.0941, + "num_input_tokens_seen": 31341824, + "step": 14495 + }, + { + "epoch": 2.365415986949429, + "grad_norm": 0.16817909479141235, + "learning_rate": 0.0009989843714518294, + "loss": 0.2337, + "num_input_tokens_seen": 31354112, + "step": 14500 + }, + { + "epoch": 2.366231647634584, + "grad_norm": 0.12466907501220703, + "learning_rate": 0.0009989798318507962, + "loss": 0.0941, + "num_input_tokens_seen": 31365664, + "step": 14505 + }, + { + "epoch": 2.367047308319739, + "grad_norm": 0.020459629595279694, + "learning_rate": 0.0009989752821373038, + "loss": 0.1951, + "num_input_tokens_seen": 31376864, + "step": 14510 + }, + { + "epoch": 2.367862969004894, + "grad_norm": 0.11784857511520386, + "learning_rate": 0.0009989707223114444, + "loss": 0.1683, + "num_input_tokens_seen": 31386816, + "step": 14515 + }, + { + "epoch": 2.368678629690049, + "grad_norm": 0.008937754668295383, + "learning_rate": 0.0009989661523733102, + "loss": 0.1142, + "num_input_tokens_seen": 31398368, + "step": 14520 + }, + { + "epoch": 2.369494290375204, + "grad_norm": 0.09518945962190628, + "learning_rate": 0.000998961572322994, + "loss": 0.3046, + "num_input_tokens_seen": 31408864, + "step": 14525 + }, + { + "epoch": 2.370309951060359, + "grad_norm": 0.061172470450401306, + "learning_rate": 0.0009989569821605886, + "loss": 0.2233, + "num_input_tokens_seen": 31420256, + "step": 14530 + }, + { + "epoch": 2.371125611745514, + "grad_norm": 0.12996791303157806, + "learning_rate": 0.0009989523818861867, + "loss": 0.2008, + "num_input_tokens_seen": 31430016, + "step": 14535 + }, + { + "epoch": 2.3719412724306688, + "grad_norm": 0.04465119168162346, + "learning_rate": 0.0009989477714998822, + "loss": 0.0848, + "num_input_tokens_seen": 31440128, + "step": 14540 + }, + { + "epoch": 2.3727569331158236, + "grad_norm": 0.17468681931495667, + "learning_rate": 0.000998943151001768, + "loss": 0.1192, + "num_input_tokens_seen": 31451872, + "step": 14545 + }, + { + "epoch": 2.373572593800979, + "grad_norm": 0.04149039462208748, + "learning_rate": 0.0009989385203919379, + "loss": 0.1115, + "num_input_tokens_seen": 31463776, + "step": 14550 + }, + { + "epoch": 2.3743882544861337, + "grad_norm": 0.1784275621175766, + "learning_rate": 0.0009989338796704856, + "loss": 0.1233, + "num_input_tokens_seen": 31475136, + "step": 14555 + }, + { + "epoch": 2.375203915171289, + "grad_norm": 0.06148466467857361, + "learning_rate": 0.0009989292288375053, + "loss": 0.1171, + "num_input_tokens_seen": 31486176, + "step": 14560 + }, + { + "epoch": 2.3760195758564437, + "grad_norm": 0.054308172315359116, + "learning_rate": 0.0009989245678930915, + "loss": 0.0486, + "num_input_tokens_seen": 31497504, + "step": 14565 + }, + { + "epoch": 2.3768352365415986, + "grad_norm": 0.23977619409561157, + "learning_rate": 0.0009989198968373381, + "loss": 0.1074, + "num_input_tokens_seen": 31509664, + "step": 14570 + }, + { + "epoch": 2.377650897226754, + "grad_norm": 0.59047532081604, + "learning_rate": 0.0009989152156703403, + "loss": 0.1587, + "num_input_tokens_seen": 31520352, + "step": 14575 + }, + { + "epoch": 2.3784665579119086, + "grad_norm": 0.14900071918964386, + "learning_rate": 0.0009989105243921926, + "loss": 0.1093, + "num_input_tokens_seen": 31530496, + "step": 14580 + }, + { + "epoch": 2.3792822185970635, + "grad_norm": 0.012715587392449379, + "learning_rate": 0.0009989058230029904, + "loss": 0.067, + "num_input_tokens_seen": 31540480, + "step": 14585 + }, + { + "epoch": 2.3800978792822187, + "grad_norm": 0.06799346208572388, + "learning_rate": 0.0009989011115028286, + "loss": 0.1453, + "num_input_tokens_seen": 31551808, + "step": 14590 + }, + { + "epoch": 2.3809135399673735, + "grad_norm": 0.01626054011285305, + "learning_rate": 0.0009988963898918029, + "loss": 0.0401, + "num_input_tokens_seen": 31562752, + "step": 14595 + }, + { + "epoch": 2.3817292006525284, + "grad_norm": 0.0843578651547432, + "learning_rate": 0.000998891658170009, + "loss": 0.1734, + "num_input_tokens_seen": 31573600, + "step": 14600 + }, + { + "epoch": 2.3825448613376836, + "grad_norm": 0.09764565527439117, + "learning_rate": 0.0009988869163375428, + "loss": 0.0758, + "num_input_tokens_seen": 31584128, + "step": 14605 + }, + { + "epoch": 2.3833605220228384, + "grad_norm": 0.10026843100786209, + "learning_rate": 0.0009988821643945002, + "loss": 0.107, + "num_input_tokens_seen": 31594880, + "step": 14610 + }, + { + "epoch": 2.3841761827079937, + "grad_norm": 0.07060685753822327, + "learning_rate": 0.0009988774023409776, + "loss": 0.1273, + "num_input_tokens_seen": 31605984, + "step": 14615 + }, + { + "epoch": 2.3849918433931485, + "grad_norm": 0.013278050348162651, + "learning_rate": 0.0009988726301770718, + "loss": 0.2176, + "num_input_tokens_seen": 31616960, + "step": 14620 + }, + { + "epoch": 2.3858075040783033, + "grad_norm": 0.05854358151555061, + "learning_rate": 0.0009988678479028793, + "loss": 0.025, + "num_input_tokens_seen": 31628896, + "step": 14625 + }, + { + "epoch": 2.3866231647634586, + "grad_norm": 0.27456170320510864, + "learning_rate": 0.000998863055518497, + "loss": 0.1917, + "num_input_tokens_seen": 31640480, + "step": 14630 + }, + { + "epoch": 2.3874388254486134, + "grad_norm": 0.016536332666873932, + "learning_rate": 0.0009988582530240217, + "loss": 0.0856, + "num_input_tokens_seen": 31651392, + "step": 14635 + }, + { + "epoch": 2.3882544861337682, + "grad_norm": 0.09446101635694504, + "learning_rate": 0.0009988534404195516, + "loss": 0.1164, + "num_input_tokens_seen": 31661472, + "step": 14640 + }, + { + "epoch": 2.3890701468189235, + "grad_norm": 0.5649963021278381, + "learning_rate": 0.000998848617705183, + "loss": 0.1488, + "num_input_tokens_seen": 31672224, + "step": 14645 + }, + { + "epoch": 2.3898858075040783, + "grad_norm": 0.03826959431171417, + "learning_rate": 0.000998843784881015, + "loss": 0.2156, + "num_input_tokens_seen": 31683392, + "step": 14650 + }, + { + "epoch": 2.390701468189233, + "grad_norm": 0.12697307765483856, + "learning_rate": 0.0009988389419471446, + "loss": 0.0878, + "num_input_tokens_seen": 31694720, + "step": 14655 + }, + { + "epoch": 2.3915171288743884, + "grad_norm": 0.10490674525499344, + "learning_rate": 0.0009988340889036701, + "loss": 0.1254, + "num_input_tokens_seen": 31705024, + "step": 14660 + }, + { + "epoch": 2.392332789559543, + "grad_norm": 0.29427623748779297, + "learning_rate": 0.0009988292257506902, + "loss": 0.3219, + "num_input_tokens_seen": 31717440, + "step": 14665 + }, + { + "epoch": 2.393148450244698, + "grad_norm": 0.20007169246673584, + "learning_rate": 0.000998824352488303, + "loss": 0.1459, + "num_input_tokens_seen": 31729632, + "step": 14670 + }, + { + "epoch": 2.3939641109298533, + "grad_norm": 0.19052647054195404, + "learning_rate": 0.0009988194691166077, + "loss": 0.155, + "num_input_tokens_seen": 31739648, + "step": 14675 + }, + { + "epoch": 2.394779771615008, + "grad_norm": 0.060261037200689316, + "learning_rate": 0.000998814575635703, + "loss": 0.1356, + "num_input_tokens_seen": 31750944, + "step": 14680 + }, + { + "epoch": 2.395595432300163, + "grad_norm": 0.11980581283569336, + "learning_rate": 0.000998809672045688, + "loss": 0.1422, + "num_input_tokens_seen": 31760416, + "step": 14685 + }, + { + "epoch": 2.396411092985318, + "grad_norm": 0.0499906986951828, + "learning_rate": 0.0009988047583466622, + "loss": 0.1829, + "num_input_tokens_seen": 31770336, + "step": 14690 + }, + { + "epoch": 2.397226753670473, + "grad_norm": 0.09711778163909912, + "learning_rate": 0.0009987998345387255, + "loss": 0.0789, + "num_input_tokens_seen": 31782208, + "step": 14695 + }, + { + "epoch": 2.3980424143556283, + "grad_norm": 0.171736478805542, + "learning_rate": 0.000998794900621977, + "loss": 0.1197, + "num_input_tokens_seen": 31792576, + "step": 14700 + }, + { + "epoch": 2.398858075040783, + "grad_norm": 0.27212202548980713, + "learning_rate": 0.0009987899565965172, + "loss": 0.104, + "num_input_tokens_seen": 31802656, + "step": 14705 + }, + { + "epoch": 2.399673735725938, + "grad_norm": 0.01339148823171854, + "learning_rate": 0.0009987850024624463, + "loss": 0.0807, + "num_input_tokens_seen": 31812896, + "step": 14710 + }, + { + "epoch": 2.400489396411093, + "grad_norm": 0.05710975453257561, + "learning_rate": 0.0009987800382198647, + "loss": 0.0603, + "num_input_tokens_seen": 31823872, + "step": 14715 + }, + { + "epoch": 2.401305057096248, + "grad_norm": 0.027438897639513016, + "learning_rate": 0.0009987750638688726, + "loss": 0.0612, + "num_input_tokens_seen": 31835840, + "step": 14720 + }, + { + "epoch": 2.402120717781403, + "grad_norm": 0.011026641353964806, + "learning_rate": 0.000998770079409571, + "loss": 0.1365, + "num_input_tokens_seen": 31847008, + "step": 14725 + }, + { + "epoch": 2.402936378466558, + "grad_norm": 0.15247893333435059, + "learning_rate": 0.0009987650848420613, + "loss": 0.0311, + "num_input_tokens_seen": 31857888, + "step": 14730 + }, + { + "epoch": 2.403752039151713, + "grad_norm": 0.561492919921875, + "learning_rate": 0.0009987600801664442, + "loss": 0.2133, + "num_input_tokens_seen": 31867712, + "step": 14735 + }, + { + "epoch": 2.4045676998368677, + "grad_norm": 0.04680028185248375, + "learning_rate": 0.0009987550653828214, + "loss": 0.1001, + "num_input_tokens_seen": 31878656, + "step": 14740 + }, + { + "epoch": 2.405383360522023, + "grad_norm": 0.018265612423419952, + "learning_rate": 0.0009987500404912946, + "loss": 0.0544, + "num_input_tokens_seen": 31889792, + "step": 14745 + }, + { + "epoch": 2.4061990212071778, + "grad_norm": 0.09335828572511673, + "learning_rate": 0.0009987450054919655, + "loss": 0.0635, + "num_input_tokens_seen": 31901216, + "step": 14750 + }, + { + "epoch": 2.407014681892333, + "grad_norm": 0.04732209071516991, + "learning_rate": 0.000998739960384936, + "loss": 0.1827, + "num_input_tokens_seen": 31912256, + "step": 14755 + }, + { + "epoch": 2.407830342577488, + "grad_norm": 0.07890065014362335, + "learning_rate": 0.0009987349051703088, + "loss": 0.0249, + "num_input_tokens_seen": 31922336, + "step": 14760 + }, + { + "epoch": 2.4086460032626427, + "grad_norm": 0.20765246450901031, + "learning_rate": 0.0009987298398481859, + "loss": 0.1302, + "num_input_tokens_seen": 31933664, + "step": 14765 + }, + { + "epoch": 2.4094616639477975, + "grad_norm": 0.1883508414030075, + "learning_rate": 0.00099872476441867, + "loss": 0.0663, + "num_input_tokens_seen": 31945856, + "step": 14770 + }, + { + "epoch": 2.4102773246329527, + "grad_norm": 0.01676262356340885, + "learning_rate": 0.0009987196788818643, + "loss": 0.0757, + "num_input_tokens_seen": 31955872, + "step": 14775 + }, + { + "epoch": 2.4110929853181076, + "grad_norm": 0.1437556892633438, + "learning_rate": 0.0009987145832378713, + "loss": 0.0811, + "num_input_tokens_seen": 31965952, + "step": 14780 + }, + { + "epoch": 2.411908646003263, + "grad_norm": 0.082915298640728, + "learning_rate": 0.0009987094774867949, + "loss": 0.0544, + "num_input_tokens_seen": 31977024, + "step": 14785 + }, + { + "epoch": 2.4127243066884176, + "grad_norm": 0.012319295667111874, + "learning_rate": 0.000998704361628738, + "loss": 0.0436, + "num_input_tokens_seen": 31987232, + "step": 14790 + }, + { + "epoch": 2.4135399673735725, + "grad_norm": 0.027199752628803253, + "learning_rate": 0.000998699235663805, + "loss": 0.1137, + "num_input_tokens_seen": 31997952, + "step": 14795 + }, + { + "epoch": 2.4143556280587277, + "grad_norm": 0.21190786361694336, + "learning_rate": 0.000998694099592099, + "loss": 0.1201, + "num_input_tokens_seen": 32008480, + "step": 14800 + }, + { + "epoch": 2.4151712887438825, + "grad_norm": 0.16720622777938843, + "learning_rate": 0.0009986889534137245, + "loss": 0.0574, + "num_input_tokens_seen": 32019360, + "step": 14805 + }, + { + "epoch": 2.4159869494290374, + "grad_norm": 0.4074651300907135, + "learning_rate": 0.0009986837971287857, + "loss": 0.1117, + "num_input_tokens_seen": 32030752, + "step": 14810 + }, + { + "epoch": 2.4168026101141926, + "grad_norm": 0.14166052639484406, + "learning_rate": 0.0009986786307373873, + "loss": 0.1184, + "num_input_tokens_seen": 32042688, + "step": 14815 + }, + { + "epoch": 2.4176182707993474, + "grad_norm": 0.04240216687321663, + "learning_rate": 0.0009986734542396336, + "loss": 0.0605, + "num_input_tokens_seen": 32053184, + "step": 14820 + }, + { + "epoch": 2.4184339314845023, + "grad_norm": 0.024739380925893784, + "learning_rate": 0.0009986682676356299, + "loss": 0.1312, + "num_input_tokens_seen": 32063840, + "step": 14825 + }, + { + "epoch": 2.4192495921696575, + "grad_norm": 0.060678571462631226, + "learning_rate": 0.000998663070925481, + "loss": 0.1352, + "num_input_tokens_seen": 32075584, + "step": 14830 + }, + { + "epoch": 2.4200652528548123, + "grad_norm": 0.012930831871926785, + "learning_rate": 0.0009986578641092924, + "loss": 0.3656, + "num_input_tokens_seen": 32085504, + "step": 14835 + }, + { + "epoch": 2.4208809135399676, + "grad_norm": 0.03932429105043411, + "learning_rate": 0.0009986526471871698, + "loss": 0.0843, + "num_input_tokens_seen": 32096992, + "step": 14840 + }, + { + "epoch": 2.4216965742251224, + "grad_norm": 0.131380096077919, + "learning_rate": 0.0009986474201592187, + "loss": 0.1235, + "num_input_tokens_seen": 32107072, + "step": 14845 + }, + { + "epoch": 2.4225122349102772, + "grad_norm": 0.10655047744512558, + "learning_rate": 0.0009986421830255447, + "loss": 0.2237, + "num_input_tokens_seen": 32118560, + "step": 14850 + }, + { + "epoch": 2.4233278955954325, + "grad_norm": 0.2773621678352356, + "learning_rate": 0.0009986369357862545, + "loss": 0.1452, + "num_input_tokens_seen": 32130080, + "step": 14855 + }, + { + "epoch": 2.4241435562805873, + "grad_norm": 0.06350556761026382, + "learning_rate": 0.0009986316784414543, + "loss": 0.1017, + "num_input_tokens_seen": 32139456, + "step": 14860 + }, + { + "epoch": 2.424959216965742, + "grad_norm": 0.036143578588962555, + "learning_rate": 0.0009986264109912507, + "loss": 0.1017, + "num_input_tokens_seen": 32151552, + "step": 14865 + }, + { + "epoch": 2.4257748776508974, + "grad_norm": 0.13741706311702728, + "learning_rate": 0.00099862113343575, + "loss": 0.0588, + "num_input_tokens_seen": 32163456, + "step": 14870 + }, + { + "epoch": 2.426590538336052, + "grad_norm": 0.03913474828004837, + "learning_rate": 0.0009986158457750596, + "loss": 0.0753, + "num_input_tokens_seen": 32173888, + "step": 14875 + }, + { + "epoch": 2.427406199021207, + "grad_norm": 0.01649622619152069, + "learning_rate": 0.0009986105480092866, + "loss": 0.1942, + "num_input_tokens_seen": 32185504, + "step": 14880 + }, + { + "epoch": 2.4282218597063623, + "grad_norm": 0.02170804888010025, + "learning_rate": 0.0009986052401385385, + "loss": 0.1132, + "num_input_tokens_seen": 32195968, + "step": 14885 + }, + { + "epoch": 2.429037520391517, + "grad_norm": 0.0716899037361145, + "learning_rate": 0.0009985999221629224, + "loss": 0.2186, + "num_input_tokens_seen": 32205312, + "step": 14890 + }, + { + "epoch": 2.429853181076672, + "grad_norm": 0.02708481065928936, + "learning_rate": 0.0009985945940825464, + "loss": 0.0402, + "num_input_tokens_seen": 32216576, + "step": 14895 + }, + { + "epoch": 2.430668841761827, + "grad_norm": 0.13017131388187408, + "learning_rate": 0.0009985892558975185, + "loss": 0.0907, + "num_input_tokens_seen": 32227200, + "step": 14900 + }, + { + "epoch": 2.431484502446982, + "grad_norm": 0.04850441962480545, + "learning_rate": 0.0009985839076079469, + "loss": 0.1175, + "num_input_tokens_seen": 32237152, + "step": 14905 + }, + { + "epoch": 2.432300163132137, + "grad_norm": 0.11114905774593353, + "learning_rate": 0.0009985785492139397, + "loss": 0.1311, + "num_input_tokens_seen": 32247520, + "step": 14910 + }, + { + "epoch": 2.433115823817292, + "grad_norm": 0.07037919014692307, + "learning_rate": 0.0009985731807156057, + "loss": 0.1295, + "num_input_tokens_seen": 32257632, + "step": 14915 + }, + { + "epoch": 2.433931484502447, + "grad_norm": 0.08659728616476059, + "learning_rate": 0.0009985678021130538, + "loss": 0.2184, + "num_input_tokens_seen": 32267808, + "step": 14920 + }, + { + "epoch": 2.434747145187602, + "grad_norm": 0.06102270260453224, + "learning_rate": 0.000998562413406393, + "loss": 0.1491, + "num_input_tokens_seen": 32278752, + "step": 14925 + }, + { + "epoch": 2.435562805872757, + "grad_norm": 0.010404076427221298, + "learning_rate": 0.0009985570145957324, + "loss": 0.2626, + "num_input_tokens_seen": 32289696, + "step": 14930 + }, + { + "epoch": 2.436378466557912, + "grad_norm": 0.06372539699077606, + "learning_rate": 0.0009985516056811815, + "loss": 0.0724, + "num_input_tokens_seen": 32299392, + "step": 14935 + }, + { + "epoch": 2.437194127243067, + "grad_norm": 0.07917524129152298, + "learning_rate": 0.0009985461866628496, + "loss": 0.0539, + "num_input_tokens_seen": 32310592, + "step": 14940 + }, + { + "epoch": 2.438009787928222, + "grad_norm": 0.20440097153186798, + "learning_rate": 0.000998540757540847, + "loss": 0.1825, + "num_input_tokens_seen": 32319968, + "step": 14945 + }, + { + "epoch": 2.4388254486133767, + "grad_norm": 0.007162266410887241, + "learning_rate": 0.0009985353183152835, + "loss": 0.088, + "num_input_tokens_seen": 32330272, + "step": 14950 + }, + { + "epoch": 2.439641109298532, + "grad_norm": 0.11462079733610153, + "learning_rate": 0.0009985298689862692, + "loss": 0.1583, + "num_input_tokens_seen": 32341376, + "step": 14955 + }, + { + "epoch": 2.4404567699836868, + "grad_norm": 0.07103787362575531, + "learning_rate": 0.0009985244095539149, + "loss": 0.1189, + "num_input_tokens_seen": 32352704, + "step": 14960 + }, + { + "epoch": 2.4412724306688416, + "grad_norm": 0.11806105077266693, + "learning_rate": 0.0009985189400183306, + "loss": 0.1283, + "num_input_tokens_seen": 32363616, + "step": 14965 + }, + { + "epoch": 2.442088091353997, + "grad_norm": 0.061703894287347794, + "learning_rate": 0.0009985134603796278, + "loss": 0.1217, + "num_input_tokens_seen": 32372960, + "step": 14970 + }, + { + "epoch": 2.4429037520391517, + "grad_norm": 0.03348749130964279, + "learning_rate": 0.0009985079706379175, + "loss": 0.1353, + "num_input_tokens_seen": 32383456, + "step": 14975 + }, + { + "epoch": 2.443719412724307, + "grad_norm": 0.03957496955990791, + "learning_rate": 0.0009985024707933107, + "loss": 0.0475, + "num_input_tokens_seen": 32395136, + "step": 14980 + }, + { + "epoch": 2.4445350734094617, + "grad_norm": 0.06971059739589691, + "learning_rate": 0.0009984969608459186, + "loss": 0.0367, + "num_input_tokens_seen": 32403744, + "step": 14985 + }, + { + "epoch": 2.4453507340946166, + "grad_norm": 0.07785134762525558, + "learning_rate": 0.0009984914407958536, + "loss": 0.1242, + "num_input_tokens_seen": 32414848, + "step": 14990 + }, + { + "epoch": 2.4461663947797714, + "grad_norm": 0.15177929401397705, + "learning_rate": 0.000998485910643227, + "loss": 0.2226, + "num_input_tokens_seen": 32426112, + "step": 14995 + }, + { + "epoch": 2.4469820554649266, + "grad_norm": 0.1218811422586441, + "learning_rate": 0.000998480370388151, + "loss": 0.113, + "num_input_tokens_seen": 32436640, + "step": 15000 + }, + { + "epoch": 2.4477977161500815, + "grad_norm": 0.18148620426654816, + "learning_rate": 0.000998474820030738, + "loss": 0.0694, + "num_input_tokens_seen": 32447584, + "step": 15005 + }, + { + "epoch": 2.4486133768352367, + "grad_norm": 0.1163652166724205, + "learning_rate": 0.0009984692595711004, + "loss": 0.0809, + "num_input_tokens_seen": 32458272, + "step": 15010 + }, + { + "epoch": 2.4494290375203915, + "grad_norm": 0.034473199397325516, + "learning_rate": 0.0009984636890093509, + "loss": 0.0824, + "num_input_tokens_seen": 32469152, + "step": 15015 + }, + { + "epoch": 2.4502446982055464, + "grad_norm": 0.13606807589530945, + "learning_rate": 0.0009984581083456023, + "loss": 0.14, + "num_input_tokens_seen": 32480576, + "step": 15020 + }, + { + "epoch": 2.4510603588907016, + "grad_norm": 0.16098394989967346, + "learning_rate": 0.000998452517579968, + "loss": 0.0327, + "num_input_tokens_seen": 32490976, + "step": 15025 + }, + { + "epoch": 2.4518760195758564, + "grad_norm": 0.04360827058553696, + "learning_rate": 0.000998446916712561, + "loss": 0.0645, + "num_input_tokens_seen": 32501216, + "step": 15030 + }, + { + "epoch": 2.4526916802610113, + "grad_norm": 0.10341744124889374, + "learning_rate": 0.0009984413057434948, + "loss": 0.0638, + "num_input_tokens_seen": 32513120, + "step": 15035 + }, + { + "epoch": 2.4535073409461665, + "grad_norm": 0.15271392464637756, + "learning_rate": 0.0009984356846728835, + "loss": 0.2005, + "num_input_tokens_seen": 32524320, + "step": 15040 + }, + { + "epoch": 2.4543230016313213, + "grad_norm": 0.20539811253547668, + "learning_rate": 0.0009984300535008405, + "loss": 0.1879, + "num_input_tokens_seen": 32534208, + "step": 15045 + }, + { + "epoch": 2.455138662316476, + "grad_norm": 0.11480668932199478, + "learning_rate": 0.0009984244122274802, + "loss": 0.1111, + "num_input_tokens_seen": 32545792, + "step": 15050 + }, + { + "epoch": 2.4559543230016314, + "grad_norm": 0.050684988498687744, + "learning_rate": 0.000998418760852917, + "loss": 0.0521, + "num_input_tokens_seen": 32556576, + "step": 15055 + }, + { + "epoch": 2.4567699836867862, + "grad_norm": 0.014801833778619766, + "learning_rate": 0.0009984130993772652, + "loss": 0.0767, + "num_input_tokens_seen": 32568000, + "step": 15060 + }, + { + "epoch": 2.4575856443719415, + "grad_norm": 0.009958263486623764, + "learning_rate": 0.0009984074278006397, + "loss": 0.0668, + "num_input_tokens_seen": 32577472, + "step": 15065 + }, + { + "epoch": 2.4584013050570963, + "grad_norm": 0.3374749422073364, + "learning_rate": 0.0009984017461231553, + "loss": 0.1985, + "num_input_tokens_seen": 32588096, + "step": 15070 + }, + { + "epoch": 2.459216965742251, + "grad_norm": 0.07289399951696396, + "learning_rate": 0.0009983960543449276, + "loss": 0.0832, + "num_input_tokens_seen": 32599328, + "step": 15075 + }, + { + "epoch": 2.4600326264274064, + "grad_norm": 0.11521682888269424, + "learning_rate": 0.0009983903524660711, + "loss": 0.0833, + "num_input_tokens_seen": 32609216, + "step": 15080 + }, + { + "epoch": 2.460848287112561, + "grad_norm": 0.4067881107330322, + "learning_rate": 0.0009983846404867022, + "loss": 0.2918, + "num_input_tokens_seen": 32620416, + "step": 15085 + }, + { + "epoch": 2.461663947797716, + "grad_norm": 0.028215084224939346, + "learning_rate": 0.0009983789184069363, + "loss": 0.0462, + "num_input_tokens_seen": 32631776, + "step": 15090 + }, + { + "epoch": 2.4624796084828713, + "grad_norm": 0.18957918882369995, + "learning_rate": 0.0009983731862268893, + "loss": 0.1796, + "num_input_tokens_seen": 32642304, + "step": 15095 + }, + { + "epoch": 2.463295269168026, + "grad_norm": 0.009655492380261421, + "learning_rate": 0.0009983674439466774, + "loss": 0.0251, + "num_input_tokens_seen": 32653088, + "step": 15100 + }, + { + "epoch": 2.464110929853181, + "grad_norm": 0.06055706366896629, + "learning_rate": 0.000998361691566417, + "loss": 0.1191, + "num_input_tokens_seen": 32662464, + "step": 15105 + }, + { + "epoch": 2.464926590538336, + "grad_norm": 0.03933952748775482, + "learning_rate": 0.0009983559290862247, + "loss": 0.0662, + "num_input_tokens_seen": 32672992, + "step": 15110 + }, + { + "epoch": 2.465742251223491, + "grad_norm": 0.10756219178438187, + "learning_rate": 0.0009983501565062173, + "loss": 0.0942, + "num_input_tokens_seen": 32684256, + "step": 15115 + }, + { + "epoch": 2.466557911908646, + "grad_norm": 0.04777355492115021, + "learning_rate": 0.000998344373826512, + "loss": 0.1098, + "num_input_tokens_seen": 32694816, + "step": 15120 + }, + { + "epoch": 2.467373572593801, + "grad_norm": 0.3761058747768402, + "learning_rate": 0.0009983385810472256, + "loss": 0.3218, + "num_input_tokens_seen": 32705568, + "step": 15125 + }, + { + "epoch": 2.468189233278956, + "grad_norm": 0.19796237349510193, + "learning_rate": 0.0009983327781684756, + "loss": 0.1266, + "num_input_tokens_seen": 32716128, + "step": 15130 + }, + { + "epoch": 2.4690048939641107, + "grad_norm": 0.016280511394143105, + "learning_rate": 0.0009983269651903798, + "loss": 0.1654, + "num_input_tokens_seen": 32728480, + "step": 15135 + }, + { + "epoch": 2.469820554649266, + "grad_norm": 0.11683381348848343, + "learning_rate": 0.0009983211421130558, + "loss": 0.2111, + "num_input_tokens_seen": 32738848, + "step": 15140 + }, + { + "epoch": 2.470636215334421, + "grad_norm": 0.12451004981994629, + "learning_rate": 0.0009983153089366218, + "loss": 0.1187, + "num_input_tokens_seen": 32749728, + "step": 15145 + }, + { + "epoch": 2.471451876019576, + "grad_norm": 0.11421272903680801, + "learning_rate": 0.0009983094656611958, + "loss": 0.1476, + "num_input_tokens_seen": 32761824, + "step": 15150 + }, + { + "epoch": 2.472267536704731, + "grad_norm": 0.20336616039276123, + "learning_rate": 0.0009983036122868962, + "loss": 0.1398, + "num_input_tokens_seen": 32773216, + "step": 15155 + }, + { + "epoch": 2.4730831973898857, + "grad_norm": 0.013980901800096035, + "learning_rate": 0.000998297748813842, + "loss": 0.1411, + "num_input_tokens_seen": 32785344, + "step": 15160 + }, + { + "epoch": 2.473898858075041, + "grad_norm": 0.026078760623931885, + "learning_rate": 0.0009982918752421516, + "loss": 0.0482, + "num_input_tokens_seen": 32795648, + "step": 15165 + }, + { + "epoch": 2.4747145187601958, + "grad_norm": 0.07515005022287369, + "learning_rate": 0.0009982859915719444, + "loss": 0.04, + "num_input_tokens_seen": 32806848, + "step": 15170 + }, + { + "epoch": 2.4755301794453506, + "grad_norm": 0.1675223857164383, + "learning_rate": 0.0009982800978033395, + "loss": 0.1084, + "num_input_tokens_seen": 32819552, + "step": 15175 + }, + { + "epoch": 2.476345840130506, + "grad_norm": 0.020948603749275208, + "learning_rate": 0.000998274193936456, + "loss": 0.117, + "num_input_tokens_seen": 32830944, + "step": 15180 + }, + { + "epoch": 2.4771615008156607, + "grad_norm": 0.12657499313354492, + "learning_rate": 0.000998268279971414, + "loss": 0.1146, + "num_input_tokens_seen": 32840512, + "step": 15185 + }, + { + "epoch": 2.4779771615008155, + "grad_norm": 0.2049468606710434, + "learning_rate": 0.0009982623559083332, + "loss": 0.0701, + "num_input_tokens_seen": 32851424, + "step": 15190 + }, + { + "epoch": 2.4787928221859707, + "grad_norm": 0.19748122990131378, + "learning_rate": 0.0009982564217473338, + "loss": 0.183, + "num_input_tokens_seen": 32861856, + "step": 15195 + }, + { + "epoch": 2.4796084828711256, + "grad_norm": 0.35869279503822327, + "learning_rate": 0.000998250477488536, + "loss": 0.1523, + "num_input_tokens_seen": 32873696, + "step": 15200 + }, + { + "epoch": 2.480424143556281, + "grad_norm": 0.03681536018848419, + "learning_rate": 0.0009982445231320597, + "loss": 0.2374, + "num_input_tokens_seen": 32883488, + "step": 15205 + }, + { + "epoch": 2.4812398042414356, + "grad_norm": 0.022419409826397896, + "learning_rate": 0.0009982385586780264, + "loss": 0.1162, + "num_input_tokens_seen": 32894720, + "step": 15210 + }, + { + "epoch": 2.4820554649265905, + "grad_norm": 0.2048080563545227, + "learning_rate": 0.0009982325841265567, + "loss": 0.1677, + "num_input_tokens_seen": 32905696, + "step": 15215 + }, + { + "epoch": 2.4828711256117453, + "grad_norm": 0.04616566747426987, + "learning_rate": 0.0009982265994777717, + "loss": 0.1081, + "num_input_tokens_seen": 32916704, + "step": 15220 + }, + { + "epoch": 2.4836867862969005, + "grad_norm": 0.04768180847167969, + "learning_rate": 0.0009982206047317926, + "loss": 0.0489, + "num_input_tokens_seen": 32927392, + "step": 15225 + }, + { + "epoch": 2.4845024469820554, + "grad_norm": 0.034913014620542526, + "learning_rate": 0.0009982145998887406, + "loss": 0.0482, + "num_input_tokens_seen": 32939232, + "step": 15230 + }, + { + "epoch": 2.4853181076672106, + "grad_norm": 0.12986132502555847, + "learning_rate": 0.000998208584948738, + "loss": 0.1679, + "num_input_tokens_seen": 32950144, + "step": 15235 + }, + { + "epoch": 2.4861337683523654, + "grad_norm": 0.06647571176290512, + "learning_rate": 0.0009982025599119062, + "loss": 0.1219, + "num_input_tokens_seen": 32961184, + "step": 15240 + }, + { + "epoch": 2.4869494290375203, + "grad_norm": 0.08212022483348846, + "learning_rate": 0.0009981965247783677, + "loss": 0.0928, + "num_input_tokens_seen": 32971584, + "step": 15245 + }, + { + "epoch": 2.4877650897226755, + "grad_norm": 0.05855432525277138, + "learning_rate": 0.0009981904795482446, + "loss": 0.1524, + "num_input_tokens_seen": 32982496, + "step": 15250 + }, + { + "epoch": 2.4885807504078303, + "grad_norm": 2.2332639694213867, + "learning_rate": 0.0009981844242216594, + "loss": 0.2134, + "num_input_tokens_seen": 32992192, + "step": 15255 + }, + { + "epoch": 2.489396411092985, + "grad_norm": 0.030122999101877213, + "learning_rate": 0.0009981783587987348, + "loss": 0.0564, + "num_input_tokens_seen": 33003136, + "step": 15260 + }, + { + "epoch": 2.4902120717781404, + "grad_norm": 0.09303940832614899, + "learning_rate": 0.0009981722832795937, + "loss": 0.0588, + "num_input_tokens_seen": 33014592, + "step": 15265 + }, + { + "epoch": 2.4910277324632952, + "grad_norm": 0.019270701333880424, + "learning_rate": 0.0009981661976643595, + "loss": 0.1582, + "num_input_tokens_seen": 33025568, + "step": 15270 + }, + { + "epoch": 2.49184339314845, + "grad_norm": 0.16244956851005554, + "learning_rate": 0.0009981601019531552, + "loss": 0.1883, + "num_input_tokens_seen": 33037632, + "step": 15275 + }, + { + "epoch": 2.4926590538336053, + "grad_norm": 0.06669965386390686, + "learning_rate": 0.0009981539961461045, + "loss": 0.0413, + "num_input_tokens_seen": 33047840, + "step": 15280 + }, + { + "epoch": 2.49347471451876, + "grad_norm": 0.027973853051662445, + "learning_rate": 0.000998147880243331, + "loss": 0.1188, + "num_input_tokens_seen": 33057824, + "step": 15285 + }, + { + "epoch": 2.4942903752039154, + "grad_norm": 0.23098739981651306, + "learning_rate": 0.000998141754244959, + "loss": 0.2423, + "num_input_tokens_seen": 33069056, + "step": 15290 + }, + { + "epoch": 2.49510603588907, + "grad_norm": 0.06371300667524338, + "learning_rate": 0.0009981356181511124, + "loss": 0.0366, + "num_input_tokens_seen": 33080352, + "step": 15295 + }, + { + "epoch": 2.495921696574225, + "grad_norm": 0.056606777012348175, + "learning_rate": 0.0009981294719619152, + "loss": 0.064, + "num_input_tokens_seen": 33090592, + "step": 15300 + }, + { + "epoch": 2.4967373572593803, + "grad_norm": 0.05140992999076843, + "learning_rate": 0.0009981233156774927, + "loss": 0.0346, + "num_input_tokens_seen": 33102304, + "step": 15305 + }, + { + "epoch": 2.497553017944535, + "grad_norm": 0.101639524102211, + "learning_rate": 0.0009981171492979691, + "loss": 0.0721, + "num_input_tokens_seen": 33112192, + "step": 15310 + }, + { + "epoch": 2.49836867862969, + "grad_norm": 0.012121300213038921, + "learning_rate": 0.0009981109728234698, + "loss": 0.2219, + "num_input_tokens_seen": 33122336, + "step": 15315 + }, + { + "epoch": 2.499184339314845, + "grad_norm": 0.11129625886678696, + "learning_rate": 0.0009981047862541194, + "loss": 0.1776, + "num_input_tokens_seen": 33133376, + "step": 15320 + }, + { + "epoch": 2.5, + "grad_norm": 0.2766715884208679, + "learning_rate": 0.0009980985895900439, + "loss": 0.2495, + "num_input_tokens_seen": 33145248, + "step": 15325 + }, + { + "epoch": 2.500815660685155, + "grad_norm": 0.091251902282238, + "learning_rate": 0.0009980923828313685, + "loss": 0.0637, + "num_input_tokens_seen": 33156320, + "step": 15330 + }, + { + "epoch": 2.50163132137031, + "grad_norm": 0.025461995974183083, + "learning_rate": 0.000998086165978219, + "loss": 0.1226, + "num_input_tokens_seen": 33165600, + "step": 15335 + }, + { + "epoch": 2.502446982055465, + "grad_norm": 0.028699345886707306, + "learning_rate": 0.0009980799390307215, + "loss": 0.0736, + "num_input_tokens_seen": 33176992, + "step": 15340 + }, + { + "epoch": 2.50326264274062, + "grad_norm": 0.06483875215053558, + "learning_rate": 0.0009980737019890024, + "loss": 0.0747, + "num_input_tokens_seen": 33188800, + "step": 15345 + }, + { + "epoch": 2.504078303425775, + "grad_norm": 0.2196149379014969, + "learning_rate": 0.0009980674548531877, + "loss": 0.1588, + "num_input_tokens_seen": 33198784, + "step": 15350 + }, + { + "epoch": 2.50489396411093, + "grad_norm": 0.01191483624279499, + "learning_rate": 0.0009980611976234041, + "loss": 0.0171, + "num_input_tokens_seen": 33210336, + "step": 15355 + }, + { + "epoch": 2.5057096247960846, + "grad_norm": 0.010378982871770859, + "learning_rate": 0.0009980549302997788, + "loss": 0.0217, + "num_input_tokens_seen": 33221088, + "step": 15360 + }, + { + "epoch": 2.50652528548124, + "grad_norm": 0.05322907119989395, + "learning_rate": 0.000998048652882438, + "loss": 0.143, + "num_input_tokens_seen": 33231232, + "step": 15365 + }, + { + "epoch": 2.5073409461663947, + "grad_norm": 0.13478170335292816, + "learning_rate": 0.00099804236537151, + "loss": 0.1536, + "num_input_tokens_seen": 33243776, + "step": 15370 + }, + { + "epoch": 2.50815660685155, + "grad_norm": 0.07025640457868576, + "learning_rate": 0.0009980360677671214, + "loss": 0.1515, + "num_input_tokens_seen": 33253632, + "step": 15375 + }, + { + "epoch": 2.5089722675367048, + "grad_norm": 0.08173404633998871, + "learning_rate": 0.0009980297600694, + "loss": 0.0829, + "num_input_tokens_seen": 33264000, + "step": 15380 + }, + { + "epoch": 2.5097879282218596, + "grad_norm": 0.041357748210430145, + "learning_rate": 0.0009980234422784738, + "loss": 0.1647, + "num_input_tokens_seen": 33275008, + "step": 15385 + }, + { + "epoch": 2.5106035889070144, + "grad_norm": 0.14917460083961487, + "learning_rate": 0.0009980171143944708, + "loss": 0.2091, + "num_input_tokens_seen": 33285632, + "step": 15390 + }, + { + "epoch": 2.5114192495921697, + "grad_norm": 0.0907067358493805, + "learning_rate": 0.000998010776417519, + "loss": 0.1932, + "num_input_tokens_seen": 33297376, + "step": 15395 + }, + { + "epoch": 2.5122349102773245, + "grad_norm": 0.060952670872211456, + "learning_rate": 0.0009980044283477473, + "loss": 0.0709, + "num_input_tokens_seen": 33307712, + "step": 15400 + }, + { + "epoch": 2.5130505709624797, + "grad_norm": 0.2798716425895691, + "learning_rate": 0.000997998070185284, + "loss": 0.0892, + "num_input_tokens_seen": 33318304, + "step": 15405 + }, + { + "epoch": 2.5138662316476346, + "grad_norm": 0.04096159338951111, + "learning_rate": 0.000997991701930258, + "loss": 0.1341, + "num_input_tokens_seen": 33328544, + "step": 15410 + }, + { + "epoch": 2.5146818923327894, + "grad_norm": 0.053559333086013794, + "learning_rate": 0.0009979853235827984, + "loss": 0.1205, + "num_input_tokens_seen": 33338016, + "step": 15415 + }, + { + "epoch": 2.5154975530179446, + "grad_norm": 0.06617650389671326, + "learning_rate": 0.0009979789351430347, + "loss": 0.0993, + "num_input_tokens_seen": 33346496, + "step": 15420 + }, + { + "epoch": 2.5163132137030995, + "grad_norm": 0.05221320688724518, + "learning_rate": 0.0009979725366110958, + "loss": 0.0364, + "num_input_tokens_seen": 33356448, + "step": 15425 + }, + { + "epoch": 2.5171288743882547, + "grad_norm": 0.030037062242627144, + "learning_rate": 0.0009979661279871119, + "loss": 0.0951, + "num_input_tokens_seen": 33366432, + "step": 15430 + }, + { + "epoch": 2.5179445350734095, + "grad_norm": 0.11865063011646271, + "learning_rate": 0.0009979597092712128, + "loss": 0.0728, + "num_input_tokens_seen": 33377344, + "step": 15435 + }, + { + "epoch": 2.5187601957585644, + "grad_norm": 0.08982488512992859, + "learning_rate": 0.0009979532804635283, + "loss": 0.0634, + "num_input_tokens_seen": 33387776, + "step": 15440 + }, + { + "epoch": 2.519575856443719, + "grad_norm": 0.0816715732216835, + "learning_rate": 0.000997946841564189, + "loss": 0.1627, + "num_input_tokens_seen": 33398208, + "step": 15445 + }, + { + "epoch": 2.5203915171288744, + "grad_norm": 0.0918974056839943, + "learning_rate": 0.0009979403925733253, + "loss": 0.0532, + "num_input_tokens_seen": 33408864, + "step": 15450 + }, + { + "epoch": 2.5212071778140293, + "grad_norm": 0.0044286069460213184, + "learning_rate": 0.0009979339334910678, + "loss": 0.1216, + "num_input_tokens_seen": 33419040, + "step": 15455 + }, + { + "epoch": 2.5220228384991845, + "grad_norm": 0.05251329392194748, + "learning_rate": 0.0009979274643175473, + "loss": 0.0569, + "num_input_tokens_seen": 33430048, + "step": 15460 + }, + { + "epoch": 2.5228384991843393, + "grad_norm": 0.3440323770046234, + "learning_rate": 0.0009979209850528954, + "loss": 0.1986, + "num_input_tokens_seen": 33440288, + "step": 15465 + }, + { + "epoch": 2.523654159869494, + "grad_norm": 0.1386038362979889, + "learning_rate": 0.0009979144956972427, + "loss": 0.0708, + "num_input_tokens_seen": 33451616, + "step": 15470 + }, + { + "epoch": 2.5244698205546494, + "grad_norm": 0.0037523547653108835, + "learning_rate": 0.0009979079962507214, + "loss": 0.1202, + "num_input_tokens_seen": 33463520, + "step": 15475 + }, + { + "epoch": 2.5252854812398042, + "grad_norm": 0.05357692763209343, + "learning_rate": 0.0009979014867134628, + "loss": 0.116, + "num_input_tokens_seen": 33474272, + "step": 15480 + }, + { + "epoch": 2.5261011419249595, + "grad_norm": 0.13688050210475922, + "learning_rate": 0.000997894967085599, + "loss": 0.128, + "num_input_tokens_seen": 33485024, + "step": 15485 + }, + { + "epoch": 2.5269168026101143, + "grad_norm": 0.13154840469360352, + "learning_rate": 0.000997888437367262, + "loss": 0.1607, + "num_input_tokens_seen": 33494912, + "step": 15490 + }, + { + "epoch": 2.527732463295269, + "grad_norm": 0.0538487546145916, + "learning_rate": 0.0009978818975585843, + "loss": 0.1917, + "num_input_tokens_seen": 33506560, + "step": 15495 + }, + { + "epoch": 2.528548123980424, + "grad_norm": 0.048410579562187195, + "learning_rate": 0.0009978753476596982, + "loss": 0.1062, + "num_input_tokens_seen": 33518752, + "step": 15500 + }, + { + "epoch": 2.529363784665579, + "grad_norm": 0.2545541226863861, + "learning_rate": 0.0009978687876707366, + "loss": 0.0737, + "num_input_tokens_seen": 33529856, + "step": 15505 + }, + { + "epoch": 2.530179445350734, + "grad_norm": 0.21288903057575226, + "learning_rate": 0.0009978622175918323, + "loss": 0.2061, + "num_input_tokens_seen": 33540896, + "step": 15510 + }, + { + "epoch": 2.5309951060358893, + "grad_norm": 0.06659834831953049, + "learning_rate": 0.0009978556374231188, + "loss": 0.0653, + "num_input_tokens_seen": 33552224, + "step": 15515 + }, + { + "epoch": 2.531810766721044, + "grad_norm": 0.056002382189035416, + "learning_rate": 0.0009978490471647292, + "loss": 0.1134, + "num_input_tokens_seen": 33562176, + "step": 15520 + }, + { + "epoch": 2.532626427406199, + "grad_norm": 0.026309235021471977, + "learning_rate": 0.000997842446816797, + "loss": 0.0851, + "num_input_tokens_seen": 33573984, + "step": 15525 + }, + { + "epoch": 2.5334420880913537, + "grad_norm": 0.13522249460220337, + "learning_rate": 0.0009978358363794562, + "loss": 0.1014, + "num_input_tokens_seen": 33584032, + "step": 15530 + }, + { + "epoch": 2.534257748776509, + "grad_norm": 0.35306116938591003, + "learning_rate": 0.0009978292158528406, + "loss": 0.1917, + "num_input_tokens_seen": 33595744, + "step": 15535 + }, + { + "epoch": 2.535073409461664, + "grad_norm": 0.1873067319393158, + "learning_rate": 0.0009978225852370843, + "loss": 0.2297, + "num_input_tokens_seen": 33606304, + "step": 15540 + }, + { + "epoch": 2.535889070146819, + "grad_norm": 0.0513768270611763, + "learning_rate": 0.000997815944532322, + "loss": 0.1684, + "num_input_tokens_seen": 33616224, + "step": 15545 + }, + { + "epoch": 2.536704730831974, + "grad_norm": 0.30809465050697327, + "learning_rate": 0.0009978092937386878, + "loss": 0.1936, + "num_input_tokens_seen": 33628000, + "step": 15550 + }, + { + "epoch": 2.5375203915171287, + "grad_norm": 0.10597193241119385, + "learning_rate": 0.0009978026328563167, + "loss": 0.1492, + "num_input_tokens_seen": 33639296, + "step": 15555 + }, + { + "epoch": 2.538336052202284, + "grad_norm": 0.0677858367562294, + "learning_rate": 0.0009977959618853438, + "loss": 0.3077, + "num_input_tokens_seen": 33650496, + "step": 15560 + }, + { + "epoch": 2.539151712887439, + "grad_norm": 0.09901798516511917, + "learning_rate": 0.0009977892808259044, + "loss": 0.1203, + "num_input_tokens_seen": 33659712, + "step": 15565 + }, + { + "epoch": 2.539967373572594, + "grad_norm": 0.10196952521800995, + "learning_rate": 0.0009977825896781336, + "loss": 0.2513, + "num_input_tokens_seen": 33670656, + "step": 15570 + }, + { + "epoch": 2.540783034257749, + "grad_norm": 0.1093427911400795, + "learning_rate": 0.0009977758884421673, + "loss": 0.1093, + "num_input_tokens_seen": 33682176, + "step": 15575 + }, + { + "epoch": 2.5415986949429037, + "grad_norm": 0.08146216720342636, + "learning_rate": 0.000997769177118141, + "loss": 0.0914, + "num_input_tokens_seen": 33693280, + "step": 15580 + }, + { + "epoch": 2.5424143556280585, + "grad_norm": 0.02189205028116703, + "learning_rate": 0.0009977624557061908, + "loss": 0.0368, + "num_input_tokens_seen": 33704032, + "step": 15585 + }, + { + "epoch": 2.5432300163132138, + "grad_norm": 0.039141327142715454, + "learning_rate": 0.000997755724206453, + "loss": 0.0615, + "num_input_tokens_seen": 33715168, + "step": 15590 + }, + { + "epoch": 2.5440456769983686, + "grad_norm": 0.08967316895723343, + "learning_rate": 0.0009977489826190641, + "loss": 0.0734, + "num_input_tokens_seen": 33724896, + "step": 15595 + }, + { + "epoch": 2.544861337683524, + "grad_norm": 0.18973685801029205, + "learning_rate": 0.0009977422309441605, + "loss": 0.1065, + "num_input_tokens_seen": 33735904, + "step": 15600 + }, + { + "epoch": 2.5456769983686787, + "grad_norm": 0.011004339903593063, + "learning_rate": 0.0009977354691818794, + "loss": 0.3357, + "num_input_tokens_seen": 33746752, + "step": 15605 + }, + { + "epoch": 2.5464926590538335, + "grad_norm": 0.089126817882061, + "learning_rate": 0.0009977286973323575, + "loss": 0.1619, + "num_input_tokens_seen": 33757056, + "step": 15610 + }, + { + "epoch": 2.5473083197389887, + "grad_norm": 0.10468865931034088, + "learning_rate": 0.000997721915395732, + "loss": 0.0873, + "num_input_tokens_seen": 33767616, + "step": 15615 + }, + { + "epoch": 2.5481239804241436, + "grad_norm": 0.1721217781305313, + "learning_rate": 0.0009977151233721406, + "loss": 0.0854, + "num_input_tokens_seen": 33777152, + "step": 15620 + }, + { + "epoch": 2.5489396411092984, + "grad_norm": 0.09493706375360489, + "learning_rate": 0.0009977083212617207, + "loss": 0.2463, + "num_input_tokens_seen": 33787904, + "step": 15625 + }, + { + "epoch": 2.5497553017944536, + "grad_norm": 0.2843787968158722, + "learning_rate": 0.0009977015090646105, + "loss": 0.1284, + "num_input_tokens_seen": 33798464, + "step": 15630 + }, + { + "epoch": 2.5505709624796085, + "grad_norm": 0.02873399294912815, + "learning_rate": 0.0009976946867809476, + "loss": 0.0721, + "num_input_tokens_seen": 33809824, + "step": 15635 + }, + { + "epoch": 2.5513866231647633, + "grad_norm": 0.0776677280664444, + "learning_rate": 0.0009976878544108705, + "loss": 0.0718, + "num_input_tokens_seen": 33819936, + "step": 15640 + }, + { + "epoch": 2.5522022838499185, + "grad_norm": 0.05906492844223976, + "learning_rate": 0.000997681011954518, + "loss": 0.1859, + "num_input_tokens_seen": 33830784, + "step": 15645 + }, + { + "epoch": 2.5530179445350734, + "grad_norm": 0.08871506154537201, + "learning_rate": 0.0009976741594120281, + "loss": 0.078, + "num_input_tokens_seen": 33842272, + "step": 15650 + }, + { + "epoch": 2.5538336052202286, + "grad_norm": 0.172580748796463, + "learning_rate": 0.00099766729678354, + "loss": 0.1249, + "num_input_tokens_seen": 33853184, + "step": 15655 + }, + { + "epoch": 2.5546492659053834, + "grad_norm": 0.2699727416038513, + "learning_rate": 0.0009976604240691932, + "loss": 0.181, + "num_input_tokens_seen": 33865248, + "step": 15660 + }, + { + "epoch": 2.5554649265905383, + "grad_norm": 0.2737541198730469, + "learning_rate": 0.0009976535412691261, + "loss": 0.2657, + "num_input_tokens_seen": 33876544, + "step": 15665 + }, + { + "epoch": 2.556280587275693, + "grad_norm": 0.056653060019016266, + "learning_rate": 0.0009976466483834789, + "loss": 0.0694, + "num_input_tokens_seen": 33886816, + "step": 15670 + }, + { + "epoch": 2.5570962479608483, + "grad_norm": 0.060865480452775955, + "learning_rate": 0.0009976397454123911, + "loss": 0.3578, + "num_input_tokens_seen": 33897728, + "step": 15675 + }, + { + "epoch": 2.557911908646003, + "grad_norm": 0.1835509091615677, + "learning_rate": 0.0009976328323560025, + "loss": 0.067, + "num_input_tokens_seen": 33907936, + "step": 15680 + }, + { + "epoch": 2.5587275693311584, + "grad_norm": 0.07976268976926804, + "learning_rate": 0.0009976259092144533, + "loss": 0.0566, + "num_input_tokens_seen": 33917280, + "step": 15685 + }, + { + "epoch": 2.5595432300163132, + "grad_norm": 0.05880413204431534, + "learning_rate": 0.0009976189759878836, + "loss": 0.148, + "num_input_tokens_seen": 33928896, + "step": 15690 + }, + { + "epoch": 2.560358890701468, + "grad_norm": 0.04047093167901039, + "learning_rate": 0.0009976120326764342, + "loss": 0.1329, + "num_input_tokens_seen": 33939008, + "step": 15695 + }, + { + "epoch": 2.5611745513866233, + "grad_norm": 0.036515820771455765, + "learning_rate": 0.0009976050792802457, + "loss": 0.1013, + "num_input_tokens_seen": 33950048, + "step": 15700 + }, + { + "epoch": 2.561990212071778, + "grad_norm": 0.03752981126308441, + "learning_rate": 0.000997598115799459, + "loss": 0.0863, + "num_input_tokens_seen": 33960992, + "step": 15705 + }, + { + "epoch": 2.5628058727569334, + "grad_norm": 0.1627136617898941, + "learning_rate": 0.0009975911422342152, + "loss": 0.0564, + "num_input_tokens_seen": 33972288, + "step": 15710 + }, + { + "epoch": 2.563621533442088, + "grad_norm": 0.16130013763904572, + "learning_rate": 0.0009975841585846558, + "loss": 0.0638, + "num_input_tokens_seen": 33983264, + "step": 15715 + }, + { + "epoch": 2.564437194127243, + "grad_norm": 0.05317399650812149, + "learning_rate": 0.000997577164850922, + "loss": 0.1662, + "num_input_tokens_seen": 33993152, + "step": 15720 + }, + { + "epoch": 2.565252854812398, + "grad_norm": 0.14853844046592712, + "learning_rate": 0.000997570161033156, + "loss": 0.071, + "num_input_tokens_seen": 34003296, + "step": 15725 + }, + { + "epoch": 2.566068515497553, + "grad_norm": 0.07281967252492905, + "learning_rate": 0.0009975631471314992, + "loss": 0.1886, + "num_input_tokens_seen": 34015168, + "step": 15730 + }, + { + "epoch": 2.566884176182708, + "grad_norm": 0.053378649055957794, + "learning_rate": 0.0009975561231460942, + "loss": 0.0543, + "num_input_tokens_seen": 34024128, + "step": 15735 + }, + { + "epoch": 2.567699836867863, + "grad_norm": 0.29613426327705383, + "learning_rate": 0.000997549089077083, + "loss": 0.1518, + "num_input_tokens_seen": 34033888, + "step": 15740 + }, + { + "epoch": 2.568515497553018, + "grad_norm": 0.14543229341506958, + "learning_rate": 0.0009975420449246084, + "loss": 0.2044, + "num_input_tokens_seen": 34044128, + "step": 15745 + }, + { + "epoch": 2.569331158238173, + "grad_norm": 0.019933458417654037, + "learning_rate": 0.0009975349906888131, + "loss": 0.1099, + "num_input_tokens_seen": 34055520, + "step": 15750 + }, + { + "epoch": 2.5701468189233276, + "grad_norm": 0.06531205028295517, + "learning_rate": 0.00099752792636984, + "loss": 0.1278, + "num_input_tokens_seen": 34066048, + "step": 15755 + }, + { + "epoch": 2.570962479608483, + "grad_norm": 0.14804935455322266, + "learning_rate": 0.0009975208519678324, + "loss": 0.0681, + "num_input_tokens_seen": 34077632, + "step": 15760 + }, + { + "epoch": 2.5717781402936377, + "grad_norm": 0.12645001709461212, + "learning_rate": 0.0009975137674829335, + "loss": 0.2039, + "num_input_tokens_seen": 34088384, + "step": 15765 + }, + { + "epoch": 2.572593800978793, + "grad_norm": 0.0323776975274086, + "learning_rate": 0.000997506672915287, + "loss": 0.04, + "num_input_tokens_seen": 34098880, + "step": 15770 + }, + { + "epoch": 2.573409461663948, + "grad_norm": 0.025035852566361427, + "learning_rate": 0.0009974995682650368, + "loss": 0.1377, + "num_input_tokens_seen": 34109952, + "step": 15775 + }, + { + "epoch": 2.5742251223491026, + "grad_norm": 0.13843075931072235, + "learning_rate": 0.0009974924535323265, + "loss": 0.0768, + "num_input_tokens_seen": 34120576, + "step": 15780 + }, + { + "epoch": 2.575040783034258, + "grad_norm": 0.2659997045993805, + "learning_rate": 0.0009974853287173006, + "loss": 0.1793, + "num_input_tokens_seen": 34131168, + "step": 15785 + }, + { + "epoch": 2.5758564437194127, + "grad_norm": 0.045313864946365356, + "learning_rate": 0.0009974781938201034, + "loss": 0.1267, + "num_input_tokens_seen": 34141984, + "step": 15790 + }, + { + "epoch": 2.576672104404568, + "grad_norm": 0.23315565288066864, + "learning_rate": 0.0009974710488408795, + "loss": 0.1438, + "num_input_tokens_seen": 34153056, + "step": 15795 + }, + { + "epoch": 2.5774877650897228, + "grad_norm": 0.10043656826019287, + "learning_rate": 0.0009974638937797736, + "loss": 0.165, + "num_input_tokens_seen": 34163424, + "step": 15800 + }, + { + "epoch": 2.5783034257748776, + "grad_norm": 0.05951232835650444, + "learning_rate": 0.000997456728636931, + "loss": 0.2315, + "num_input_tokens_seen": 34175712, + "step": 15805 + }, + { + "epoch": 2.5791190864600324, + "grad_norm": 1.1597989797592163, + "learning_rate": 0.0009974495534124967, + "loss": 0.1146, + "num_input_tokens_seen": 34186784, + "step": 15810 + }, + { + "epoch": 2.5799347471451877, + "grad_norm": 0.058770764619112015, + "learning_rate": 0.000997442368106616, + "loss": 0.0759, + "num_input_tokens_seen": 34196224, + "step": 15815 + }, + { + "epoch": 2.5807504078303425, + "grad_norm": 0.23356138169765472, + "learning_rate": 0.0009974351727194347, + "loss": 0.1931, + "num_input_tokens_seen": 34206720, + "step": 15820 + }, + { + "epoch": 2.5815660685154977, + "grad_norm": 0.02992885187268257, + "learning_rate": 0.0009974279672510986, + "loss": 0.107, + "num_input_tokens_seen": 34217120, + "step": 15825 + }, + { + "epoch": 2.5823817292006526, + "grad_norm": 0.2092907577753067, + "learning_rate": 0.0009974207517017537, + "loss": 0.1395, + "num_input_tokens_seen": 34226592, + "step": 15830 + }, + { + "epoch": 2.5831973898858074, + "grad_norm": 0.046701934188604355, + "learning_rate": 0.0009974135260715465, + "loss": 0.0898, + "num_input_tokens_seen": 34237440, + "step": 15835 + }, + { + "epoch": 2.5840130505709626, + "grad_norm": 0.1012524962425232, + "learning_rate": 0.0009974062903606229, + "loss": 0.1047, + "num_input_tokens_seen": 34248896, + "step": 15840 + }, + { + "epoch": 2.5848287112561175, + "grad_norm": 0.19864198565483093, + "learning_rate": 0.0009973990445691298, + "loss": 0.1758, + "num_input_tokens_seen": 34260512, + "step": 15845 + }, + { + "epoch": 2.5856443719412723, + "grad_norm": 0.14162364602088928, + "learning_rate": 0.0009973917886972143, + "loss": 0.1472, + "num_input_tokens_seen": 34271104, + "step": 15850 + }, + { + "epoch": 2.5864600326264275, + "grad_norm": 0.1076699048280716, + "learning_rate": 0.000997384522745023, + "loss": 0.0957, + "num_input_tokens_seen": 34281472, + "step": 15855 + }, + { + "epoch": 2.5872756933115824, + "grad_norm": 0.05362573638558388, + "learning_rate": 0.0009973772467127035, + "loss": 0.1355, + "num_input_tokens_seen": 34293216, + "step": 15860 + }, + { + "epoch": 2.588091353996737, + "grad_norm": 0.028096288442611694, + "learning_rate": 0.000997369960600403, + "loss": 0.0984, + "num_input_tokens_seen": 34304448, + "step": 15865 + }, + { + "epoch": 2.5889070146818924, + "grad_norm": 0.06138193607330322, + "learning_rate": 0.0009973626644082694, + "loss": 0.0566, + "num_input_tokens_seen": 34316352, + "step": 15870 + }, + { + "epoch": 2.5897226753670473, + "grad_norm": 0.05653262510895729, + "learning_rate": 0.0009973553581364503, + "loss": 0.0578, + "num_input_tokens_seen": 34326688, + "step": 15875 + }, + { + "epoch": 2.5905383360522025, + "grad_norm": 0.10075034201145172, + "learning_rate": 0.0009973480417850942, + "loss": 0.1513, + "num_input_tokens_seen": 34338848, + "step": 15880 + }, + { + "epoch": 2.5913539967373573, + "grad_norm": 0.10138165205717087, + "learning_rate": 0.0009973407153543489, + "loss": 0.1257, + "num_input_tokens_seen": 34349472, + "step": 15885 + }, + { + "epoch": 2.592169657422512, + "grad_norm": 0.12078166007995605, + "learning_rate": 0.0009973333788443632, + "loss": 0.1139, + "num_input_tokens_seen": 34359712, + "step": 15890 + }, + { + "epoch": 2.592985318107667, + "grad_norm": 0.056835684925317764, + "learning_rate": 0.0009973260322552855, + "loss": 0.0601, + "num_input_tokens_seen": 34371136, + "step": 15895 + }, + { + "epoch": 2.5938009787928222, + "grad_norm": 0.2918012738227844, + "learning_rate": 0.000997318675587265, + "loss": 0.1126, + "num_input_tokens_seen": 34381120, + "step": 15900 + }, + { + "epoch": 2.594616639477977, + "grad_norm": 0.04708686098456383, + "learning_rate": 0.0009973113088404507, + "loss": 0.1877, + "num_input_tokens_seen": 34392384, + "step": 15905 + }, + { + "epoch": 2.5954323001631323, + "grad_norm": 0.024804405868053436, + "learning_rate": 0.0009973039320149916, + "loss": 0.128, + "num_input_tokens_seen": 34402688, + "step": 15910 + }, + { + "epoch": 2.596247960848287, + "grad_norm": 0.12013711035251617, + "learning_rate": 0.0009972965451110376, + "loss": 0.0896, + "num_input_tokens_seen": 34413280, + "step": 15915 + }, + { + "epoch": 2.597063621533442, + "grad_norm": 0.030978182330727577, + "learning_rate": 0.0009972891481287382, + "loss": 0.124, + "num_input_tokens_seen": 34424224, + "step": 15920 + }, + { + "epoch": 2.597879282218597, + "grad_norm": 0.03300139680504799, + "learning_rate": 0.0009972817410682433, + "loss": 0.0973, + "num_input_tokens_seen": 34433888, + "step": 15925 + }, + { + "epoch": 2.598694942903752, + "grad_norm": 0.009529628790915012, + "learning_rate": 0.0009972743239297032, + "loss": 0.0316, + "num_input_tokens_seen": 34444576, + "step": 15930 + }, + { + "epoch": 2.5995106035889073, + "grad_norm": 0.21190616488456726, + "learning_rate": 0.000997266896713268, + "loss": 0.123, + "num_input_tokens_seen": 34455232, + "step": 15935 + }, + { + "epoch": 2.600326264274062, + "grad_norm": 0.1390226036310196, + "learning_rate": 0.0009972594594190884, + "loss": 0.2094, + "num_input_tokens_seen": 34465792, + "step": 15940 + }, + { + "epoch": 2.601141924959217, + "grad_norm": 0.046548616141080856, + "learning_rate": 0.0009972520120473149, + "loss": 0.0458, + "num_input_tokens_seen": 34477024, + "step": 15945 + }, + { + "epoch": 2.6019575856443717, + "grad_norm": 0.053153183311223984, + "learning_rate": 0.0009972445545980988, + "loss": 0.1465, + "num_input_tokens_seen": 34487808, + "step": 15950 + }, + { + "epoch": 2.602773246329527, + "grad_norm": 0.016847344115376472, + "learning_rate": 0.0009972370870715908, + "loss": 0.0365, + "num_input_tokens_seen": 34498848, + "step": 15955 + }, + { + "epoch": 2.603588907014682, + "grad_norm": 0.05434371903538704, + "learning_rate": 0.0009972296094679426, + "loss": 0.1182, + "num_input_tokens_seen": 34509664, + "step": 15960 + }, + { + "epoch": 2.604404567699837, + "grad_norm": 0.02624763920903206, + "learning_rate": 0.0009972221217873054, + "loss": 0.0167, + "num_input_tokens_seen": 34518720, + "step": 15965 + }, + { + "epoch": 2.605220228384992, + "grad_norm": 0.03936131298542023, + "learning_rate": 0.0009972146240298312, + "loss": 0.0485, + "num_input_tokens_seen": 34530272, + "step": 15970 + }, + { + "epoch": 2.6060358890701467, + "grad_norm": 0.06856126338243484, + "learning_rate": 0.000997207116195672, + "loss": 0.0453, + "num_input_tokens_seen": 34541152, + "step": 15975 + }, + { + "epoch": 2.6068515497553015, + "grad_norm": 0.18589448928833008, + "learning_rate": 0.0009971995982849795, + "loss": 0.1929, + "num_input_tokens_seen": 34551872, + "step": 15980 + }, + { + "epoch": 2.607667210440457, + "grad_norm": 0.1637170910835266, + "learning_rate": 0.0009971920702979066, + "loss": 0.0866, + "num_input_tokens_seen": 34563168, + "step": 15985 + }, + { + "epoch": 2.6084828711256116, + "grad_norm": 0.25298258662223816, + "learning_rate": 0.000997184532234606, + "loss": 0.2414, + "num_input_tokens_seen": 34572832, + "step": 15990 + }, + { + "epoch": 2.609298531810767, + "grad_norm": 0.10075201094150543, + "learning_rate": 0.0009971769840952296, + "loss": 0.0439, + "num_input_tokens_seen": 34584704, + "step": 15995 + }, + { + "epoch": 2.6101141924959217, + "grad_norm": 0.03244736045598984, + "learning_rate": 0.0009971694258799312, + "loss": 0.1003, + "num_input_tokens_seen": 34594912, + "step": 16000 + }, + { + "epoch": 2.6109298531810765, + "grad_norm": 0.10081294924020767, + "learning_rate": 0.0009971618575888637, + "loss": 0.1658, + "num_input_tokens_seen": 34604480, + "step": 16005 + }, + { + "epoch": 2.6117455138662318, + "grad_norm": 0.010015531443059444, + "learning_rate": 0.0009971542792221802, + "loss": 0.1819, + "num_input_tokens_seen": 34615072, + "step": 16010 + }, + { + "epoch": 2.6125611745513866, + "grad_norm": 0.15907056629657745, + "learning_rate": 0.000997146690780035, + "loss": 0.104, + "num_input_tokens_seen": 34626016, + "step": 16015 + }, + { + "epoch": 2.613376835236542, + "grad_norm": 0.03764641657471657, + "learning_rate": 0.000997139092262581, + "loss": 0.0564, + "num_input_tokens_seen": 34637600, + "step": 16020 + }, + { + "epoch": 2.6141924959216967, + "grad_norm": 0.03706960752606392, + "learning_rate": 0.0009971314836699728, + "loss": 0.1023, + "num_input_tokens_seen": 34648736, + "step": 16025 + }, + { + "epoch": 2.6150081566068515, + "grad_norm": 0.03933669254183769, + "learning_rate": 0.0009971238650023644, + "loss": 0.0428, + "num_input_tokens_seen": 34658528, + "step": 16030 + }, + { + "epoch": 2.6158238172920063, + "grad_norm": 0.07986725121736526, + "learning_rate": 0.0009971162362599102, + "loss": 0.0671, + "num_input_tokens_seen": 34669088, + "step": 16035 + }, + { + "epoch": 2.6166394779771616, + "grad_norm": 0.29692548513412476, + "learning_rate": 0.000997108597442765, + "loss": 0.0939, + "num_input_tokens_seen": 34681632, + "step": 16040 + }, + { + "epoch": 2.6174551386623164, + "grad_norm": 0.10401139408349991, + "learning_rate": 0.000997100948551083, + "loss": 0.0722, + "num_input_tokens_seen": 34692768, + "step": 16045 + }, + { + "epoch": 2.6182707993474716, + "grad_norm": 0.04141692817211151, + "learning_rate": 0.0009970932895850201, + "loss": 0.0579, + "num_input_tokens_seen": 34702656, + "step": 16050 + }, + { + "epoch": 2.6190864600326265, + "grad_norm": 0.017853064462542534, + "learning_rate": 0.000997085620544731, + "loss": 0.0688, + "num_input_tokens_seen": 34713248, + "step": 16055 + }, + { + "epoch": 2.6199021207177813, + "grad_norm": 0.018851248547434807, + "learning_rate": 0.0009970779414303712, + "loss": 0.2701, + "num_input_tokens_seen": 34723808, + "step": 16060 + }, + { + "epoch": 2.6207177814029365, + "grad_norm": 0.06940672546625137, + "learning_rate": 0.0009970702522420962, + "loss": 0.0288, + "num_input_tokens_seen": 34735616, + "step": 16065 + }, + { + "epoch": 2.6215334420880914, + "grad_norm": 0.2243514209985733, + "learning_rate": 0.000997062552980062, + "loss": 0.3053, + "num_input_tokens_seen": 34746048, + "step": 16070 + }, + { + "epoch": 2.622349102773246, + "grad_norm": 0.1178874745965004, + "learning_rate": 0.0009970548436444248, + "loss": 0.07, + "num_input_tokens_seen": 34755488, + "step": 16075 + }, + { + "epoch": 2.6231647634584014, + "grad_norm": 0.03382722660899162, + "learning_rate": 0.0009970471242353406, + "loss": 0.1779, + "num_input_tokens_seen": 34766048, + "step": 16080 + }, + { + "epoch": 2.6239804241435563, + "grad_norm": 0.017402131110429764, + "learning_rate": 0.0009970393947529657, + "loss": 0.1642, + "num_input_tokens_seen": 34776992, + "step": 16085 + }, + { + "epoch": 2.624796084828711, + "grad_norm": 0.1997496634721756, + "learning_rate": 0.0009970316551974568, + "loss": 0.1137, + "num_input_tokens_seen": 34788640, + "step": 16090 + }, + { + "epoch": 2.6256117455138663, + "grad_norm": 0.032988984137773514, + "learning_rate": 0.0009970239055689712, + "loss": 0.1022, + "num_input_tokens_seen": 34797536, + "step": 16095 + }, + { + "epoch": 2.626427406199021, + "grad_norm": 0.063567616045475, + "learning_rate": 0.0009970161458676655, + "loss": 0.0633, + "num_input_tokens_seen": 34809152, + "step": 16100 + }, + { + "epoch": 2.6272430668841764, + "grad_norm": 0.11862125992774963, + "learning_rate": 0.000997008376093697, + "loss": 0.1626, + "num_input_tokens_seen": 34820160, + "step": 16105 + }, + { + "epoch": 2.6280587275693312, + "grad_norm": 0.1909906566143036, + "learning_rate": 0.0009970005962472233, + "loss": 0.1546, + "num_input_tokens_seen": 34833312, + "step": 16110 + }, + { + "epoch": 2.628874388254486, + "grad_norm": 0.22565732896327972, + "learning_rate": 0.0009969928063284022, + "loss": 0.1844, + "num_input_tokens_seen": 34842368, + "step": 16115 + }, + { + "epoch": 2.629690048939641, + "grad_norm": 0.04123775288462639, + "learning_rate": 0.0009969850063373913, + "loss": 0.197, + "num_input_tokens_seen": 34852320, + "step": 16120 + }, + { + "epoch": 2.630505709624796, + "grad_norm": 0.038814183324575424, + "learning_rate": 0.0009969771962743488, + "loss": 0.0386, + "num_input_tokens_seen": 34863680, + "step": 16125 + }, + { + "epoch": 2.631321370309951, + "grad_norm": 0.007476178463548422, + "learning_rate": 0.0009969693761394326, + "loss": 0.1201, + "num_input_tokens_seen": 34874880, + "step": 16130 + }, + { + "epoch": 2.632137030995106, + "grad_norm": 0.11543486267328262, + "learning_rate": 0.000996961545932802, + "loss": 0.1336, + "num_input_tokens_seen": 34884192, + "step": 16135 + }, + { + "epoch": 2.632952691680261, + "grad_norm": 0.038836341351270676, + "learning_rate": 0.0009969537056546151, + "loss": 0.1078, + "num_input_tokens_seen": 34894688, + "step": 16140 + }, + { + "epoch": 2.633768352365416, + "grad_norm": 0.062395110726356506, + "learning_rate": 0.000996945855305031, + "loss": 0.1237, + "num_input_tokens_seen": 34904256, + "step": 16145 + }, + { + "epoch": 2.634584013050571, + "grad_norm": 0.06816502660512924, + "learning_rate": 0.0009969379948842085, + "loss": 0.0695, + "num_input_tokens_seen": 34916864, + "step": 16150 + }, + { + "epoch": 2.635399673735726, + "grad_norm": 0.028660116717219353, + "learning_rate": 0.0009969301243923073, + "loss": 0.0581, + "num_input_tokens_seen": 34927200, + "step": 16155 + }, + { + "epoch": 2.636215334420881, + "grad_norm": 0.054395247250795364, + "learning_rate": 0.0009969222438294867, + "loss": 0.0342, + "num_input_tokens_seen": 34938656, + "step": 16160 + }, + { + "epoch": 2.637030995106036, + "grad_norm": 0.12486924231052399, + "learning_rate": 0.0009969143531959063, + "loss": 0.0679, + "num_input_tokens_seen": 34949600, + "step": 16165 + }, + { + "epoch": 2.637846655791191, + "grad_norm": 0.017010482028126717, + "learning_rate": 0.0009969064524917265, + "loss": 0.0674, + "num_input_tokens_seen": 34961696, + "step": 16170 + }, + { + "epoch": 2.6386623164763456, + "grad_norm": 0.009399576112627983, + "learning_rate": 0.000996898541717107, + "loss": 0.0415, + "num_input_tokens_seen": 34972320, + "step": 16175 + }, + { + "epoch": 2.639477977161501, + "grad_norm": 0.2002403438091278, + "learning_rate": 0.0009968906208722077, + "loss": 0.0695, + "num_input_tokens_seen": 34983744, + "step": 16180 + }, + { + "epoch": 2.6402936378466557, + "grad_norm": 0.0805182009935379, + "learning_rate": 0.00099688268995719, + "loss": 0.1237, + "num_input_tokens_seen": 34994880, + "step": 16185 + }, + { + "epoch": 2.641109298531811, + "grad_norm": 0.14909473061561584, + "learning_rate": 0.0009968747489722141, + "loss": 0.0621, + "num_input_tokens_seen": 35004928, + "step": 16190 + }, + { + "epoch": 2.641924959216966, + "grad_norm": 0.2769220471382141, + "learning_rate": 0.0009968667979174412, + "loss": 0.1966, + "num_input_tokens_seen": 35015136, + "step": 16195 + }, + { + "epoch": 2.6427406199021206, + "grad_norm": 0.2111338973045349, + "learning_rate": 0.0009968588367930324, + "loss": 0.1392, + "num_input_tokens_seen": 35024736, + "step": 16200 + }, + { + "epoch": 2.6435562805872754, + "grad_norm": 0.003953300416469574, + "learning_rate": 0.0009968508655991489, + "loss": 0.2273, + "num_input_tokens_seen": 35035872, + "step": 16205 + }, + { + "epoch": 2.6443719412724307, + "grad_norm": 0.03729906305670738, + "learning_rate": 0.0009968428843359523, + "loss": 0.0513, + "num_input_tokens_seen": 35047424, + "step": 16210 + }, + { + "epoch": 2.6451876019575855, + "grad_norm": 0.2112385332584381, + "learning_rate": 0.0009968348930036043, + "loss": 0.097, + "num_input_tokens_seen": 35057696, + "step": 16215 + }, + { + "epoch": 2.6460032626427408, + "grad_norm": 0.12490873783826828, + "learning_rate": 0.000996826891602267, + "loss": 0.0654, + "num_input_tokens_seen": 35069440, + "step": 16220 + }, + { + "epoch": 2.6468189233278956, + "grad_norm": 0.03835887461900711, + "learning_rate": 0.0009968188801321024, + "loss": 0.1038, + "num_input_tokens_seen": 35080064, + "step": 16225 + }, + { + "epoch": 2.6476345840130504, + "grad_norm": 0.12459404021501541, + "learning_rate": 0.000996810858593273, + "loss": 0.1034, + "num_input_tokens_seen": 35090944, + "step": 16230 + }, + { + "epoch": 2.6484502446982057, + "grad_norm": 0.21367542445659637, + "learning_rate": 0.000996802826985941, + "loss": 0.1437, + "num_input_tokens_seen": 35100896, + "step": 16235 + }, + { + "epoch": 2.6492659053833605, + "grad_norm": 0.03609883412718773, + "learning_rate": 0.0009967947853102698, + "loss": 0.1104, + "num_input_tokens_seen": 35111488, + "step": 16240 + }, + { + "epoch": 2.6500815660685157, + "grad_norm": 0.010214082896709442, + "learning_rate": 0.000996786733566422, + "loss": 0.1245, + "num_input_tokens_seen": 35123360, + "step": 16245 + }, + { + "epoch": 2.6508972267536706, + "grad_norm": 0.11889711022377014, + "learning_rate": 0.0009967786717545609, + "loss": 0.1321, + "num_input_tokens_seen": 35133952, + "step": 16250 + }, + { + "epoch": 2.6517128874388254, + "grad_norm": 0.4400777220726013, + "learning_rate": 0.0009967705998748496, + "loss": 0.1365, + "num_input_tokens_seen": 35144800, + "step": 16255 + }, + { + "epoch": 2.65252854812398, + "grad_norm": 0.023491906002163887, + "learning_rate": 0.000996762517927452, + "loss": 0.0422, + "num_input_tokens_seen": 35154624, + "step": 16260 + }, + { + "epoch": 2.6533442088091355, + "grad_norm": 0.2003888189792633, + "learning_rate": 0.0009967544259125317, + "loss": 0.1334, + "num_input_tokens_seen": 35163648, + "step": 16265 + }, + { + "epoch": 2.6541598694942903, + "grad_norm": 0.19598814845085144, + "learning_rate": 0.000996746323830253, + "loss": 0.2184, + "num_input_tokens_seen": 35174976, + "step": 16270 + }, + { + "epoch": 2.6549755301794455, + "grad_norm": 0.050365347415208817, + "learning_rate": 0.0009967382116807797, + "loss": 0.1016, + "num_input_tokens_seen": 35186432, + "step": 16275 + }, + { + "epoch": 2.6557911908646004, + "grad_norm": 0.05750131234526634, + "learning_rate": 0.0009967300894642764, + "loss": 0.0348, + "num_input_tokens_seen": 35196896, + "step": 16280 + }, + { + "epoch": 2.656606851549755, + "grad_norm": 0.04523392394185066, + "learning_rate": 0.0009967219571809076, + "loss": 0.0938, + "num_input_tokens_seen": 35207360, + "step": 16285 + }, + { + "epoch": 2.6574225122349104, + "grad_norm": 0.0062296329997479916, + "learning_rate": 0.0009967138148308384, + "loss": 0.0612, + "num_input_tokens_seen": 35217664, + "step": 16290 + }, + { + "epoch": 2.6582381729200653, + "grad_norm": 0.0448344424366951, + "learning_rate": 0.0009967056624142336, + "loss": 0.0749, + "num_input_tokens_seen": 35228128, + "step": 16295 + }, + { + "epoch": 2.65905383360522, + "grad_norm": 0.046069227159023285, + "learning_rate": 0.0009966974999312584, + "loss": 0.1693, + "num_input_tokens_seen": 35237728, + "step": 16300 + }, + { + "epoch": 2.6598694942903753, + "grad_norm": 0.04713843762874603, + "learning_rate": 0.000996689327382078, + "loss": 0.0631, + "num_input_tokens_seen": 35247968, + "step": 16305 + }, + { + "epoch": 2.66068515497553, + "grad_norm": 0.088753342628479, + "learning_rate": 0.0009966811447668586, + "loss": 0.1018, + "num_input_tokens_seen": 35258496, + "step": 16310 + }, + { + "epoch": 2.661500815660685, + "grad_norm": 0.23398029804229736, + "learning_rate": 0.0009966729520857658, + "loss": 0.1089, + "num_input_tokens_seen": 35268448, + "step": 16315 + }, + { + "epoch": 2.6623164763458402, + "grad_norm": 0.02539239078760147, + "learning_rate": 0.0009966647493389654, + "loss": 0.2314, + "num_input_tokens_seen": 35279680, + "step": 16320 + }, + { + "epoch": 2.663132137030995, + "grad_norm": 0.05267626419663429, + "learning_rate": 0.0009966565365266238, + "loss": 0.1049, + "num_input_tokens_seen": 35290080, + "step": 16325 + }, + { + "epoch": 2.6639477977161503, + "grad_norm": 0.01566062681376934, + "learning_rate": 0.0009966483136489073, + "loss": 0.0362, + "num_input_tokens_seen": 35301760, + "step": 16330 + }, + { + "epoch": 2.664763458401305, + "grad_norm": 0.06919416785240173, + "learning_rate": 0.0009966400807059827, + "loss": 0.0646, + "num_input_tokens_seen": 35313376, + "step": 16335 + }, + { + "epoch": 2.66557911908646, + "grad_norm": 0.04618493840098381, + "learning_rate": 0.000996631837698017, + "loss": 0.0782, + "num_input_tokens_seen": 35323968, + "step": 16340 + }, + { + "epoch": 2.6663947797716148, + "grad_norm": 0.2246595174074173, + "learning_rate": 0.000996623584625177, + "loss": 0.3272, + "num_input_tokens_seen": 35334592, + "step": 16345 + }, + { + "epoch": 2.66721044045677, + "grad_norm": 0.02475031651556492, + "learning_rate": 0.00099661532148763, + "loss": 0.0636, + "num_input_tokens_seen": 35346208, + "step": 16350 + }, + { + "epoch": 2.668026101141925, + "grad_norm": 0.1650330275297165, + "learning_rate": 0.0009966070482855436, + "loss": 0.1049, + "num_input_tokens_seen": 35357696, + "step": 16355 + }, + { + "epoch": 2.66884176182708, + "grad_norm": 0.0800071507692337, + "learning_rate": 0.0009965987650190852, + "loss": 0.2077, + "num_input_tokens_seen": 35368640, + "step": 16360 + }, + { + "epoch": 2.669657422512235, + "grad_norm": 0.03208388015627861, + "learning_rate": 0.000996590471688423, + "loss": 0.1451, + "num_input_tokens_seen": 35380480, + "step": 16365 + }, + { + "epoch": 2.6704730831973897, + "grad_norm": 0.15314123034477234, + "learning_rate": 0.000996582168293725, + "loss": 0.0715, + "num_input_tokens_seen": 35393344, + "step": 16370 + }, + { + "epoch": 2.671288743882545, + "grad_norm": 0.20598310232162476, + "learning_rate": 0.0009965738548351592, + "loss": 0.1603, + "num_input_tokens_seen": 35402656, + "step": 16375 + }, + { + "epoch": 2.6721044045677, + "grad_norm": 0.06572859734296799, + "learning_rate": 0.0009965655313128945, + "loss": 0.0563, + "num_input_tokens_seen": 35413920, + "step": 16380 + }, + { + "epoch": 2.672920065252855, + "grad_norm": 0.10475694388151169, + "learning_rate": 0.0009965571977270994, + "loss": 0.0779, + "num_input_tokens_seen": 35422688, + "step": 16385 + }, + { + "epoch": 2.67373572593801, + "grad_norm": 0.0855659693479538, + "learning_rate": 0.0009965488540779426, + "loss": 0.1929, + "num_input_tokens_seen": 35432896, + "step": 16390 + }, + { + "epoch": 2.6745513866231647, + "grad_norm": 0.0473497100174427, + "learning_rate": 0.0009965405003655933, + "loss": 0.052, + "num_input_tokens_seen": 35443328, + "step": 16395 + }, + { + "epoch": 2.6753670473083195, + "grad_norm": 0.040079183876514435, + "learning_rate": 0.000996532136590221, + "loss": 0.1367, + "num_input_tokens_seen": 35452992, + "step": 16400 + }, + { + "epoch": 2.676182707993475, + "grad_norm": 0.1019391268491745, + "learning_rate": 0.000996523762751995, + "loss": 0.15, + "num_input_tokens_seen": 35464544, + "step": 16405 + }, + { + "epoch": 2.6769983686786296, + "grad_norm": 0.038980767130851746, + "learning_rate": 0.000996515378851085, + "loss": 0.1345, + "num_input_tokens_seen": 35475488, + "step": 16410 + }, + { + "epoch": 2.677814029363785, + "grad_norm": 0.12611663341522217, + "learning_rate": 0.0009965069848876609, + "loss": 0.0944, + "num_input_tokens_seen": 35486272, + "step": 16415 + }, + { + "epoch": 2.6786296900489397, + "grad_norm": 0.2920730412006378, + "learning_rate": 0.000996498580861893, + "loss": 0.1352, + "num_input_tokens_seen": 35497312, + "step": 16420 + }, + { + "epoch": 2.6794453507340945, + "grad_norm": 0.2523801624774933, + "learning_rate": 0.0009964901667739517, + "loss": 0.2546, + "num_input_tokens_seen": 35506624, + "step": 16425 + }, + { + "epoch": 2.6802610114192493, + "grad_norm": 0.4290613532066345, + "learning_rate": 0.000996481742624007, + "loss": 0.2409, + "num_input_tokens_seen": 35516352, + "step": 16430 + }, + { + "epoch": 2.6810766721044046, + "grad_norm": 0.029356911778450012, + "learning_rate": 0.00099647330841223, + "loss": 0.0954, + "num_input_tokens_seen": 35527392, + "step": 16435 + }, + { + "epoch": 2.6818923327895594, + "grad_norm": 0.10720787197351456, + "learning_rate": 0.0009964648641387918, + "loss": 0.2365, + "num_input_tokens_seen": 35537888, + "step": 16440 + }, + { + "epoch": 2.6827079934747147, + "grad_norm": 0.026292763650417328, + "learning_rate": 0.000996456409803863, + "loss": 0.1337, + "num_input_tokens_seen": 35547680, + "step": 16445 + }, + { + "epoch": 2.6835236541598695, + "grad_norm": 0.030141742900013924, + "learning_rate": 0.0009964479454076156, + "loss": 0.1958, + "num_input_tokens_seen": 35557920, + "step": 16450 + }, + { + "epoch": 2.6843393148450243, + "grad_norm": 0.09774786978960037, + "learning_rate": 0.0009964394709502207, + "loss": 0.1538, + "num_input_tokens_seen": 35568064, + "step": 16455 + }, + { + "epoch": 2.6851549755301796, + "grad_norm": 0.03949353098869324, + "learning_rate": 0.0009964309864318502, + "loss": 0.0572, + "num_input_tokens_seen": 35579200, + "step": 16460 + }, + { + "epoch": 2.6859706362153344, + "grad_norm": 0.0355185829102993, + "learning_rate": 0.0009964224918526758, + "loss": 0.1612, + "num_input_tokens_seen": 35588928, + "step": 16465 + }, + { + "epoch": 2.6867862969004896, + "grad_norm": 0.03025023639202118, + "learning_rate": 0.0009964139872128699, + "loss": 0.0876, + "num_input_tokens_seen": 35600576, + "step": 16470 + }, + { + "epoch": 2.6876019575856445, + "grad_norm": 0.23995541036128998, + "learning_rate": 0.000996405472512605, + "loss": 0.1761, + "num_input_tokens_seen": 35611296, + "step": 16475 + }, + { + "epoch": 2.6884176182707993, + "grad_norm": 0.26141855120658875, + "learning_rate": 0.0009963969477520531, + "loss": 0.1881, + "num_input_tokens_seen": 35622592, + "step": 16480 + }, + { + "epoch": 2.689233278955954, + "grad_norm": 0.244488924741745, + "learning_rate": 0.0009963884129313876, + "loss": 0.1601, + "num_input_tokens_seen": 35632704, + "step": 16485 + }, + { + "epoch": 2.6900489396411094, + "grad_norm": 0.0766143873333931, + "learning_rate": 0.0009963798680507811, + "loss": 0.0869, + "num_input_tokens_seen": 35644000, + "step": 16490 + }, + { + "epoch": 2.690864600326264, + "grad_norm": 0.02519366890192032, + "learning_rate": 0.0009963713131104068, + "loss": 0.0949, + "num_input_tokens_seen": 35653632, + "step": 16495 + }, + { + "epoch": 2.6916802610114194, + "grad_norm": 0.07979767769575119, + "learning_rate": 0.0009963627481104384, + "loss": 0.1554, + "num_input_tokens_seen": 35664704, + "step": 16500 + }, + { + "epoch": 2.6924959216965743, + "grad_norm": 0.1658993363380432, + "learning_rate": 0.000996354173051049, + "loss": 0.1786, + "num_input_tokens_seen": 35674944, + "step": 16505 + }, + { + "epoch": 2.693311582381729, + "grad_norm": 0.12059544771909714, + "learning_rate": 0.0009963455879324129, + "loss": 0.0858, + "num_input_tokens_seen": 35685856, + "step": 16510 + }, + { + "epoch": 2.6941272430668843, + "grad_norm": 0.1469067484140396, + "learning_rate": 0.0009963369927547035, + "loss": 0.092, + "num_input_tokens_seen": 35697376, + "step": 16515 + }, + { + "epoch": 2.694942903752039, + "grad_norm": 0.12039193511009216, + "learning_rate": 0.0009963283875180952, + "loss": 0.0749, + "num_input_tokens_seen": 35709184, + "step": 16520 + }, + { + "epoch": 2.695758564437194, + "grad_norm": 0.03307841345667839, + "learning_rate": 0.0009963197722227628, + "loss": 0.0792, + "num_input_tokens_seen": 35719168, + "step": 16525 + }, + { + "epoch": 2.6965742251223492, + "grad_norm": 0.13452056050300598, + "learning_rate": 0.0009963111468688805, + "loss": 0.0751, + "num_input_tokens_seen": 35729184, + "step": 16530 + }, + { + "epoch": 2.697389885807504, + "grad_norm": 0.024676907807588577, + "learning_rate": 0.000996302511456623, + "loss": 0.0837, + "num_input_tokens_seen": 35739968, + "step": 16535 + }, + { + "epoch": 2.698205546492659, + "grad_norm": 0.12530513107776642, + "learning_rate": 0.0009962938659861657, + "loss": 0.142, + "num_input_tokens_seen": 35752384, + "step": 16540 + }, + { + "epoch": 2.699021207177814, + "grad_norm": 0.20160333812236786, + "learning_rate": 0.0009962852104576836, + "loss": 0.0889, + "num_input_tokens_seen": 35762976, + "step": 16545 + }, + { + "epoch": 2.699836867862969, + "grad_norm": 0.02842654101550579, + "learning_rate": 0.0009962765448713522, + "loss": 0.0224, + "num_input_tokens_seen": 35773536, + "step": 16550 + }, + { + "epoch": 2.700652528548124, + "grad_norm": 0.010374743491411209, + "learning_rate": 0.000996267869227347, + "loss": 0.2177, + "num_input_tokens_seen": 35784480, + "step": 16555 + }, + { + "epoch": 2.701468189233279, + "grad_norm": 0.04730985313653946, + "learning_rate": 0.0009962591835258436, + "loss": 0.0501, + "num_input_tokens_seen": 35794720, + "step": 16560 + }, + { + "epoch": 2.702283849918434, + "grad_norm": 0.14314627647399902, + "learning_rate": 0.0009962504877670186, + "loss": 0.1492, + "num_input_tokens_seen": 35805280, + "step": 16565 + }, + { + "epoch": 2.7030995106035887, + "grad_norm": 0.01593198999762535, + "learning_rate": 0.0009962417819510479, + "loss": 0.0565, + "num_input_tokens_seen": 35814752, + "step": 16570 + }, + { + "epoch": 2.703915171288744, + "grad_norm": 0.022026842460036278, + "learning_rate": 0.0009962330660781078, + "loss": 0.11, + "num_input_tokens_seen": 35825344, + "step": 16575 + }, + { + "epoch": 2.7047308319738987, + "grad_norm": 0.07531365007162094, + "learning_rate": 0.0009962243401483752, + "loss": 0.313, + "num_input_tokens_seen": 35835840, + "step": 16580 + }, + { + "epoch": 2.705546492659054, + "grad_norm": 0.0568210706114769, + "learning_rate": 0.000996215604162027, + "loss": 0.1396, + "num_input_tokens_seen": 35845856, + "step": 16585 + }, + { + "epoch": 2.706362153344209, + "grad_norm": 0.038479603826999664, + "learning_rate": 0.0009962068581192399, + "loss": 0.1486, + "num_input_tokens_seen": 35856928, + "step": 16590 + }, + { + "epoch": 2.7071778140293636, + "grad_norm": 0.03605350852012634, + "learning_rate": 0.0009961981020201913, + "loss": 0.0991, + "num_input_tokens_seen": 35867232, + "step": 16595 + }, + { + "epoch": 2.707993474714519, + "grad_norm": 0.12278129905462265, + "learning_rate": 0.0009961893358650586, + "loss": 0.1502, + "num_input_tokens_seen": 35878272, + "step": 16600 + }, + { + "epoch": 2.7088091353996737, + "grad_norm": 0.031820762902498245, + "learning_rate": 0.00099618055965402, + "loss": 0.1481, + "num_input_tokens_seen": 35889088, + "step": 16605 + }, + { + "epoch": 2.709624796084829, + "grad_norm": 0.15299761295318604, + "learning_rate": 0.0009961717733872524, + "loss": 0.1214, + "num_input_tokens_seen": 35899328, + "step": 16610 + }, + { + "epoch": 2.710440456769984, + "grad_norm": 0.08476471900939941, + "learning_rate": 0.0009961629770649347, + "loss": 0.1204, + "num_input_tokens_seen": 35911008, + "step": 16615 + }, + { + "epoch": 2.7112561174551386, + "grad_norm": 0.1937631070613861, + "learning_rate": 0.0009961541706872447, + "loss": 0.2569, + "num_input_tokens_seen": 35921440, + "step": 16620 + }, + { + "epoch": 2.7120717781402934, + "grad_norm": 0.03174687549471855, + "learning_rate": 0.000996145354254361, + "loss": 0.0654, + "num_input_tokens_seen": 35933792, + "step": 16625 + }, + { + "epoch": 2.7128874388254487, + "grad_norm": 0.021128924563527107, + "learning_rate": 0.0009961365277664624, + "loss": 0.2715, + "num_input_tokens_seen": 35945920, + "step": 16630 + }, + { + "epoch": 2.7137030995106035, + "grad_norm": 0.06190233677625656, + "learning_rate": 0.0009961276912237276, + "loss": 0.1118, + "num_input_tokens_seen": 35957536, + "step": 16635 + }, + { + "epoch": 2.7145187601957588, + "grad_norm": 0.25115782022476196, + "learning_rate": 0.0009961188446263357, + "loss": 0.2473, + "num_input_tokens_seen": 35968512, + "step": 16640 + }, + { + "epoch": 2.7153344208809136, + "grad_norm": 0.08620987087488174, + "learning_rate": 0.0009961099879744661, + "loss": 0.0696, + "num_input_tokens_seen": 35977760, + "step": 16645 + }, + { + "epoch": 2.7161500815660684, + "grad_norm": 0.16655907034873962, + "learning_rate": 0.0009961011212682982, + "loss": 0.1499, + "num_input_tokens_seen": 35989056, + "step": 16650 + }, + { + "epoch": 2.7169657422512232, + "grad_norm": 0.038287725299596786, + "learning_rate": 0.0009960922445080118, + "loss": 0.1075, + "num_input_tokens_seen": 36000096, + "step": 16655 + }, + { + "epoch": 2.7177814029363785, + "grad_norm": 0.021945785731077194, + "learning_rate": 0.0009960833576937867, + "loss": 0.1108, + "num_input_tokens_seen": 36010624, + "step": 16660 + }, + { + "epoch": 2.7185970636215333, + "grad_norm": 0.0757167786359787, + "learning_rate": 0.000996074460825803, + "loss": 0.081, + "num_input_tokens_seen": 36022144, + "step": 16665 + }, + { + "epoch": 2.7194127243066886, + "grad_norm": 0.15707460045814514, + "learning_rate": 0.0009960655539042412, + "loss": 0.0851, + "num_input_tokens_seen": 36033216, + "step": 16670 + }, + { + "epoch": 2.7202283849918434, + "grad_norm": 0.08880306780338287, + "learning_rate": 0.0009960566369292814, + "loss": 0.0704, + "num_input_tokens_seen": 36044224, + "step": 16675 + }, + { + "epoch": 2.721044045676998, + "grad_norm": 0.04351628199219704, + "learning_rate": 0.0009960477099011048, + "loss": 0.1569, + "num_input_tokens_seen": 36056768, + "step": 16680 + }, + { + "epoch": 2.7218597063621535, + "grad_norm": 0.010717559605836868, + "learning_rate": 0.000996038772819892, + "loss": 0.1071, + "num_input_tokens_seen": 36067584, + "step": 16685 + }, + { + "epoch": 2.7226753670473083, + "grad_norm": 0.02620554156601429, + "learning_rate": 0.0009960298256858238, + "loss": 0.0415, + "num_input_tokens_seen": 36078528, + "step": 16690 + }, + { + "epoch": 2.7234910277324635, + "grad_norm": 0.02566135860979557, + "learning_rate": 0.0009960208684990824, + "loss": 0.0877, + "num_input_tokens_seen": 36090400, + "step": 16695 + }, + { + "epoch": 2.7243066884176184, + "grad_norm": 0.21687543392181396, + "learning_rate": 0.0009960119012598489, + "loss": 0.0928, + "num_input_tokens_seen": 36101792, + "step": 16700 + }, + { + "epoch": 2.725122349102773, + "grad_norm": 0.19889047741889954, + "learning_rate": 0.0009960029239683046, + "loss": 0.3088, + "num_input_tokens_seen": 36112160, + "step": 16705 + }, + { + "epoch": 2.725938009787928, + "grad_norm": 0.14541514217853546, + "learning_rate": 0.000995993936624632, + "loss": 0.1052, + "num_input_tokens_seen": 36123232, + "step": 16710 + }, + { + "epoch": 2.7267536704730833, + "grad_norm": 0.12091077119112015, + "learning_rate": 0.000995984939229013, + "loss": 0.0878, + "num_input_tokens_seen": 36135232, + "step": 16715 + }, + { + "epoch": 2.727569331158238, + "grad_norm": 0.02517981454730034, + "learning_rate": 0.0009959759317816302, + "loss": 0.0899, + "num_input_tokens_seen": 36147104, + "step": 16720 + }, + { + "epoch": 2.7283849918433933, + "grad_norm": 0.03287290409207344, + "learning_rate": 0.0009959669142826659, + "loss": 0.1489, + "num_input_tokens_seen": 36158144, + "step": 16725 + }, + { + "epoch": 2.729200652528548, + "grad_norm": 0.04140019416809082, + "learning_rate": 0.0009959578867323028, + "loss": 0.1108, + "num_input_tokens_seen": 36168416, + "step": 16730 + }, + { + "epoch": 2.730016313213703, + "grad_norm": 0.07255889475345612, + "learning_rate": 0.000995948849130724, + "loss": 0.111, + "num_input_tokens_seen": 36179488, + "step": 16735 + }, + { + "epoch": 2.7308319738988582, + "grad_norm": 0.04503436014056206, + "learning_rate": 0.0009959398014781128, + "loss": 0.1488, + "num_input_tokens_seen": 36190336, + "step": 16740 + }, + { + "epoch": 2.731647634584013, + "grad_norm": 0.08020366728305817, + "learning_rate": 0.000995930743774652, + "loss": 0.1268, + "num_input_tokens_seen": 36201280, + "step": 16745 + }, + { + "epoch": 2.732463295269168, + "grad_norm": 0.029173359274864197, + "learning_rate": 0.0009959216760205257, + "loss": 0.1225, + "num_input_tokens_seen": 36211904, + "step": 16750 + }, + { + "epoch": 2.733278955954323, + "grad_norm": 0.14357851445674896, + "learning_rate": 0.0009959125982159176, + "loss": 0.2048, + "num_input_tokens_seen": 36222912, + "step": 16755 + }, + { + "epoch": 2.734094616639478, + "grad_norm": 0.17278997600078583, + "learning_rate": 0.0009959035103610115, + "loss": 0.1939, + "num_input_tokens_seen": 36234176, + "step": 16760 + }, + { + "epoch": 2.7349102773246328, + "grad_norm": 0.09782871603965759, + "learning_rate": 0.0009958944124559919, + "loss": 0.0862, + "num_input_tokens_seen": 36244480, + "step": 16765 + }, + { + "epoch": 2.735725938009788, + "grad_norm": 0.16363048553466797, + "learning_rate": 0.0009958853045010426, + "loss": 0.1488, + "num_input_tokens_seen": 36255520, + "step": 16770 + }, + { + "epoch": 2.736541598694943, + "grad_norm": 0.050788093358278275, + "learning_rate": 0.0009958761864963487, + "loss": 0.0555, + "num_input_tokens_seen": 36265952, + "step": 16775 + }, + { + "epoch": 2.737357259380098, + "grad_norm": 0.08968115597963333, + "learning_rate": 0.0009958670584420948, + "loss": 0.1237, + "num_input_tokens_seen": 36277248, + "step": 16780 + }, + { + "epoch": 2.738172920065253, + "grad_norm": 0.06076105311512947, + "learning_rate": 0.000995857920338466, + "loss": 0.0688, + "num_input_tokens_seen": 36287424, + "step": 16785 + }, + { + "epoch": 2.7389885807504077, + "grad_norm": 0.19070397317409515, + "learning_rate": 0.0009958487721856474, + "loss": 0.1682, + "num_input_tokens_seen": 36297632, + "step": 16790 + }, + { + "epoch": 2.7398042414355626, + "grad_norm": 0.002982828998938203, + "learning_rate": 0.0009958396139838242, + "loss": 0.0469, + "num_input_tokens_seen": 36307008, + "step": 16795 + }, + { + "epoch": 2.740619902120718, + "grad_norm": 0.015070038847625256, + "learning_rate": 0.0009958304457331822, + "loss": 0.0126, + "num_input_tokens_seen": 36317952, + "step": 16800 + }, + { + "epoch": 2.7414355628058726, + "grad_norm": 0.19135338068008423, + "learning_rate": 0.0009958212674339075, + "loss": 0.059, + "num_input_tokens_seen": 36328128, + "step": 16805 + }, + { + "epoch": 2.742251223491028, + "grad_norm": 0.045779090374708176, + "learning_rate": 0.0009958120790861855, + "loss": 0.1217, + "num_input_tokens_seen": 36337440, + "step": 16810 + }, + { + "epoch": 2.7430668841761827, + "grad_norm": 0.15114933252334595, + "learning_rate": 0.000995802880690203, + "loss": 0.1546, + "num_input_tokens_seen": 36347776, + "step": 16815 + }, + { + "epoch": 2.7438825448613375, + "grad_norm": 0.014411961659789085, + "learning_rate": 0.000995793672246146, + "loss": 0.1026, + "num_input_tokens_seen": 36359360, + "step": 16820 + }, + { + "epoch": 2.744698205546493, + "grad_norm": 0.011926224455237389, + "learning_rate": 0.0009957844537542013, + "loss": 0.0833, + "num_input_tokens_seen": 36370560, + "step": 16825 + }, + { + "epoch": 2.7455138662316476, + "grad_norm": 0.16889113187789917, + "learning_rate": 0.0009957752252145557, + "loss": 0.1278, + "num_input_tokens_seen": 36381024, + "step": 16830 + }, + { + "epoch": 2.746329526916803, + "grad_norm": 0.042039863765239716, + "learning_rate": 0.0009957659866273963, + "loss": 0.0866, + "num_input_tokens_seen": 36392224, + "step": 16835 + }, + { + "epoch": 2.7471451876019577, + "grad_norm": 0.013748877681791782, + "learning_rate": 0.0009957567379929103, + "loss": 0.0833, + "num_input_tokens_seen": 36402880, + "step": 16840 + }, + { + "epoch": 2.7479608482871125, + "grad_norm": 0.05248915031552315, + "learning_rate": 0.0009957474793112848, + "loss": 0.0354, + "num_input_tokens_seen": 36412288, + "step": 16845 + }, + { + "epoch": 2.7487765089722673, + "grad_norm": 0.030665088444948196, + "learning_rate": 0.0009957382105827079, + "loss": 0.0658, + "num_input_tokens_seen": 36422976, + "step": 16850 + }, + { + "epoch": 2.7495921696574226, + "grad_norm": 0.009233402088284492, + "learning_rate": 0.0009957289318073674, + "loss": 0.0312, + "num_input_tokens_seen": 36433984, + "step": 16855 + }, + { + "epoch": 2.7504078303425774, + "grad_norm": 0.18588995933532715, + "learning_rate": 0.000995719642985451, + "loss": 0.1046, + "num_input_tokens_seen": 36444640, + "step": 16860 + }, + { + "epoch": 2.7512234910277327, + "grad_norm": 0.04793168231844902, + "learning_rate": 0.0009957103441171472, + "loss": 0.0486, + "num_input_tokens_seen": 36456256, + "step": 16865 + }, + { + "epoch": 2.7520391517128875, + "grad_norm": 0.0442626029253006, + "learning_rate": 0.0009957010352026447, + "loss": 0.0761, + "num_input_tokens_seen": 36467456, + "step": 16870 + }, + { + "epoch": 2.7528548123980423, + "grad_norm": 0.07995634526014328, + "learning_rate": 0.0009956917162421317, + "loss": 0.1783, + "num_input_tokens_seen": 36477472, + "step": 16875 + }, + { + "epoch": 2.753670473083197, + "grad_norm": 0.025483250617980957, + "learning_rate": 0.0009956823872357972, + "loss": 0.222, + "num_input_tokens_seen": 36489728, + "step": 16880 + }, + { + "epoch": 2.7544861337683524, + "grad_norm": 0.07462822645902634, + "learning_rate": 0.0009956730481838303, + "loss": 0.0635, + "num_input_tokens_seen": 36500384, + "step": 16885 + }, + { + "epoch": 2.755301794453507, + "grad_norm": 0.029625002294778824, + "learning_rate": 0.0009956636990864202, + "loss": 0.0823, + "num_input_tokens_seen": 36512192, + "step": 16890 + }, + { + "epoch": 2.7561174551386625, + "grad_norm": 0.04722673445940018, + "learning_rate": 0.0009956543399437569, + "loss": 0.0857, + "num_input_tokens_seen": 36521600, + "step": 16895 + }, + { + "epoch": 2.7569331158238173, + "grad_norm": 0.009356287308037281, + "learning_rate": 0.0009956449707560291, + "loss": 0.0197, + "num_input_tokens_seen": 36532192, + "step": 16900 + }, + { + "epoch": 2.757748776508972, + "grad_norm": 0.023361343890428543, + "learning_rate": 0.0009956355915234274, + "loss": 0.1314, + "num_input_tokens_seen": 36542816, + "step": 16905 + }, + { + "epoch": 2.7585644371941274, + "grad_norm": 0.12797385454177856, + "learning_rate": 0.0009956262022461416, + "loss": 0.223, + "num_input_tokens_seen": 36553344, + "step": 16910 + }, + { + "epoch": 2.759380097879282, + "grad_norm": 0.18771253526210785, + "learning_rate": 0.0009956168029243621, + "loss": 0.3599, + "num_input_tokens_seen": 36564320, + "step": 16915 + }, + { + "epoch": 2.7601957585644374, + "grad_norm": 0.039339229464530945, + "learning_rate": 0.0009956073935582794, + "loss": 0.0463, + "num_input_tokens_seen": 36575744, + "step": 16920 + }, + { + "epoch": 2.7610114192495923, + "grad_norm": 0.16050797700881958, + "learning_rate": 0.000995597974148084, + "loss": 0.1364, + "num_input_tokens_seen": 36587328, + "step": 16925 + }, + { + "epoch": 2.761827079934747, + "grad_norm": 0.04854540526866913, + "learning_rate": 0.0009955885446939672, + "loss": 0.1264, + "num_input_tokens_seen": 36598304, + "step": 16930 + }, + { + "epoch": 2.762642740619902, + "grad_norm": 0.03333837911486626, + "learning_rate": 0.0009955791051961195, + "loss": 0.0639, + "num_input_tokens_seen": 36609856, + "step": 16935 + }, + { + "epoch": 2.763458401305057, + "grad_norm": 0.015216774307191372, + "learning_rate": 0.000995569655654733, + "loss": 0.0753, + "num_input_tokens_seen": 36619648, + "step": 16940 + }, + { + "epoch": 2.764274061990212, + "grad_norm": 0.04915167763829231, + "learning_rate": 0.0009955601960699983, + "loss": 0.0694, + "num_input_tokens_seen": 36629440, + "step": 16945 + }, + { + "epoch": 2.7650897226753672, + "grad_norm": 0.07696164399385452, + "learning_rate": 0.0009955507264421079, + "loss": 0.0829, + "num_input_tokens_seen": 36640224, + "step": 16950 + }, + { + "epoch": 2.765905383360522, + "grad_norm": 0.17016223073005676, + "learning_rate": 0.0009955412467712531, + "loss": 0.1458, + "num_input_tokens_seen": 36652288, + "step": 16955 + }, + { + "epoch": 2.766721044045677, + "grad_norm": 0.07087530940771103, + "learning_rate": 0.0009955317570576265, + "loss": 0.024, + "num_input_tokens_seen": 36663776, + "step": 16960 + }, + { + "epoch": 2.767536704730832, + "grad_norm": 0.015933789312839508, + "learning_rate": 0.0009955222573014202, + "loss": 0.0209, + "num_input_tokens_seen": 36674560, + "step": 16965 + }, + { + "epoch": 2.768352365415987, + "grad_norm": 0.05921601876616478, + "learning_rate": 0.0009955127475028266, + "loss": 0.0991, + "num_input_tokens_seen": 36685664, + "step": 16970 + }, + { + "epoch": 2.7691680261011418, + "grad_norm": 0.03298629820346832, + "learning_rate": 0.0009955032276620388, + "loss": 0.0534, + "num_input_tokens_seen": 36697568, + "step": 16975 + }, + { + "epoch": 2.769983686786297, + "grad_norm": 0.08991827815771103, + "learning_rate": 0.0009954936977792492, + "loss": 0.0566, + "num_input_tokens_seen": 36708288, + "step": 16980 + }, + { + "epoch": 2.770799347471452, + "grad_norm": 0.04772263765335083, + "learning_rate": 0.0009954841578546515, + "loss": 0.0423, + "num_input_tokens_seen": 36718976, + "step": 16985 + }, + { + "epoch": 2.7716150081566067, + "grad_norm": 0.32339712977409363, + "learning_rate": 0.0009954746078884387, + "loss": 0.1995, + "num_input_tokens_seen": 36730688, + "step": 16990 + }, + { + "epoch": 2.772430668841762, + "grad_norm": 0.10802032053470612, + "learning_rate": 0.0009954650478808042, + "loss": 0.1608, + "num_input_tokens_seen": 36741120, + "step": 16995 + }, + { + "epoch": 2.7732463295269167, + "grad_norm": 0.006154247093945742, + "learning_rate": 0.0009954554778319423, + "loss": 0.0105, + "num_input_tokens_seen": 36751904, + "step": 17000 + }, + { + "epoch": 2.774061990212072, + "grad_norm": 0.0152150709182024, + "learning_rate": 0.0009954458977420465, + "loss": 0.0958, + "num_input_tokens_seen": 36761952, + "step": 17005 + }, + { + "epoch": 2.774877650897227, + "grad_norm": 0.0315382294356823, + "learning_rate": 0.000995436307611311, + "loss": 0.1436, + "num_input_tokens_seen": 36772800, + "step": 17010 + }, + { + "epoch": 2.7756933115823816, + "grad_norm": 0.13253070414066315, + "learning_rate": 0.0009954267074399302, + "loss": 0.1868, + "num_input_tokens_seen": 36783168, + "step": 17015 + }, + { + "epoch": 2.7765089722675365, + "grad_norm": 0.052694886922836304, + "learning_rate": 0.0009954170972280988, + "loss": 0.0201, + "num_input_tokens_seen": 36794464, + "step": 17020 + }, + { + "epoch": 2.7773246329526917, + "grad_norm": 0.011267283000051975, + "learning_rate": 0.0009954074769760112, + "loss": 0.0306, + "num_input_tokens_seen": 36805344, + "step": 17025 + }, + { + "epoch": 2.7781402936378465, + "grad_norm": 0.031975969672203064, + "learning_rate": 0.0009953978466838629, + "loss": 0.1541, + "num_input_tokens_seen": 36816192, + "step": 17030 + }, + { + "epoch": 2.778955954323002, + "grad_norm": 0.20794735848903656, + "learning_rate": 0.0009953882063518486, + "loss": 0.0352, + "num_input_tokens_seen": 36827968, + "step": 17035 + }, + { + "epoch": 2.7797716150081566, + "grad_norm": 0.013200036250054836, + "learning_rate": 0.000995378555980164, + "loss": 0.1362, + "num_input_tokens_seen": 36839392, + "step": 17040 + }, + { + "epoch": 2.7805872756933114, + "grad_norm": 0.3909376859664917, + "learning_rate": 0.0009953688955690045, + "loss": 0.2137, + "num_input_tokens_seen": 36850208, + "step": 17045 + }, + { + "epoch": 2.7814029363784667, + "grad_norm": 0.09196203947067261, + "learning_rate": 0.0009953592251185658, + "loss": 0.136, + "num_input_tokens_seen": 36861504, + "step": 17050 + }, + { + "epoch": 2.7822185970636215, + "grad_norm": 0.03299575299024582, + "learning_rate": 0.000995349544629044, + "loss": 0.0893, + "num_input_tokens_seen": 36871616, + "step": 17055 + }, + { + "epoch": 2.7830342577487768, + "grad_norm": 0.0684279352426529, + "learning_rate": 0.0009953398541006353, + "loss": 0.0477, + "num_input_tokens_seen": 36882816, + "step": 17060 + }, + { + "epoch": 2.7838499184339316, + "grad_norm": 0.06969677656888962, + "learning_rate": 0.0009953301535335361, + "loss": 0.211, + "num_input_tokens_seen": 36894208, + "step": 17065 + }, + { + "epoch": 2.7846655791190864, + "grad_norm": 0.05764150246977806, + "learning_rate": 0.000995320442927943, + "loss": 0.2221, + "num_input_tokens_seen": 36906560, + "step": 17070 + }, + { + "epoch": 2.7854812398042412, + "grad_norm": 0.03223137930035591, + "learning_rate": 0.0009953107222840528, + "loss": 0.0646, + "num_input_tokens_seen": 36916672, + "step": 17075 + }, + { + "epoch": 2.7862969004893965, + "grad_norm": 0.1956624835729599, + "learning_rate": 0.0009953009916020624, + "loss": 0.1274, + "num_input_tokens_seen": 36927776, + "step": 17080 + }, + { + "epoch": 2.7871125611745513, + "grad_norm": 0.042754679918289185, + "learning_rate": 0.0009952912508821691, + "loss": 0.1977, + "num_input_tokens_seen": 36939392, + "step": 17085 + }, + { + "epoch": 2.7879282218597066, + "grad_norm": 0.061387669295072556, + "learning_rate": 0.0009952815001245702, + "loss": 0.0669, + "num_input_tokens_seen": 36951168, + "step": 17090 + }, + { + "epoch": 2.7887438825448614, + "grad_norm": 0.03364640846848488, + "learning_rate": 0.0009952717393294636, + "loss": 0.0622, + "num_input_tokens_seen": 36962112, + "step": 17095 + }, + { + "epoch": 2.789559543230016, + "grad_norm": 0.1411299854516983, + "learning_rate": 0.0009952619684970468, + "loss": 0.1288, + "num_input_tokens_seen": 36973184, + "step": 17100 + }, + { + "epoch": 2.790375203915171, + "grad_norm": 0.02755509875714779, + "learning_rate": 0.0009952521876275178, + "loss": 0.1578, + "num_input_tokens_seen": 36983232, + "step": 17105 + }, + { + "epoch": 2.7911908646003263, + "grad_norm": 0.043589744716882706, + "learning_rate": 0.0009952423967210752, + "loss": 0.1919, + "num_input_tokens_seen": 36993824, + "step": 17110 + }, + { + "epoch": 2.792006525285481, + "grad_norm": 0.032452911138534546, + "learning_rate": 0.0009952325957779168, + "loss": 0.0787, + "num_input_tokens_seen": 37005248, + "step": 17115 + }, + { + "epoch": 2.7928221859706364, + "grad_norm": 0.11344970762729645, + "learning_rate": 0.0009952227847982418, + "loss": 0.1194, + "num_input_tokens_seen": 37015808, + "step": 17120 + }, + { + "epoch": 2.793637846655791, + "grad_norm": 0.010948172770440578, + "learning_rate": 0.000995212963782249, + "loss": 0.0596, + "num_input_tokens_seen": 37026176, + "step": 17125 + }, + { + "epoch": 2.794453507340946, + "grad_norm": 0.05834111198782921, + "learning_rate": 0.000995203132730137, + "loss": 0.1682, + "num_input_tokens_seen": 37036864, + "step": 17130 + }, + { + "epoch": 2.7952691680261013, + "grad_norm": 0.12609679996967316, + "learning_rate": 0.0009951932916421053, + "loss": 0.2157, + "num_input_tokens_seen": 37046688, + "step": 17135 + }, + { + "epoch": 2.796084828711256, + "grad_norm": 0.017100023105740547, + "learning_rate": 0.0009951834405183535, + "loss": 0.1031, + "num_input_tokens_seen": 37056992, + "step": 17140 + }, + { + "epoch": 2.7969004893964113, + "grad_norm": 0.17199130356311798, + "learning_rate": 0.0009951735793590811, + "loss": 0.0784, + "num_input_tokens_seen": 37068064, + "step": 17145 + }, + { + "epoch": 2.797716150081566, + "grad_norm": 0.0340568870306015, + "learning_rate": 0.0009951637081644879, + "loss": 0.2099, + "num_input_tokens_seen": 37078752, + "step": 17150 + }, + { + "epoch": 2.798531810766721, + "grad_norm": 0.10646383464336395, + "learning_rate": 0.000995153826934774, + "loss": 0.0641, + "num_input_tokens_seen": 37089984, + "step": 17155 + }, + { + "epoch": 2.799347471451876, + "grad_norm": 0.17747409641742706, + "learning_rate": 0.0009951439356701394, + "loss": 0.1234, + "num_input_tokens_seen": 37100320, + "step": 17160 + }, + { + "epoch": 2.800163132137031, + "grad_norm": 0.2767079770565033, + "learning_rate": 0.0009951340343707852, + "loss": 0.3425, + "num_input_tokens_seen": 37111648, + "step": 17165 + }, + { + "epoch": 2.800978792822186, + "grad_norm": 0.02862684801220894, + "learning_rate": 0.0009951241230369114, + "loss": 0.0344, + "num_input_tokens_seen": 37121376, + "step": 17170 + }, + { + "epoch": 2.801794453507341, + "grad_norm": 0.07905431091785431, + "learning_rate": 0.0009951142016687193, + "loss": 0.0706, + "num_input_tokens_seen": 37131008, + "step": 17175 + }, + { + "epoch": 2.802610114192496, + "grad_norm": 0.06610142439603806, + "learning_rate": 0.0009951042702664099, + "loss": 0.2177, + "num_input_tokens_seen": 37141664, + "step": 17180 + }, + { + "epoch": 2.8034257748776508, + "grad_norm": 0.041321273893117905, + "learning_rate": 0.0009950943288301842, + "loss": 0.055, + "num_input_tokens_seen": 37153344, + "step": 17185 + }, + { + "epoch": 2.804241435562806, + "grad_norm": 0.1373293399810791, + "learning_rate": 0.0009950843773602438, + "loss": 0.2467, + "num_input_tokens_seen": 37163840, + "step": 17190 + }, + { + "epoch": 2.805057096247961, + "grad_norm": 0.08510863035917282, + "learning_rate": 0.0009950744158567905, + "loss": 0.0857, + "num_input_tokens_seen": 37175008, + "step": 17195 + }, + { + "epoch": 2.8058727569331157, + "grad_norm": 0.039745964109897614, + "learning_rate": 0.0009950644443200262, + "loss": 0.102, + "num_input_tokens_seen": 37185888, + "step": 17200 + }, + { + "epoch": 2.806688417618271, + "grad_norm": 0.03634534031152725, + "learning_rate": 0.0009950544627501529, + "loss": 0.1042, + "num_input_tokens_seen": 37196864, + "step": 17205 + }, + { + "epoch": 2.8075040783034257, + "grad_norm": 0.004250263329595327, + "learning_rate": 0.0009950444711473727, + "loss": 0.1475, + "num_input_tokens_seen": 37208608, + "step": 17210 + }, + { + "epoch": 2.8083197389885806, + "grad_norm": 0.07612774521112442, + "learning_rate": 0.0009950344695118885, + "loss": 0.1043, + "num_input_tokens_seen": 37218464, + "step": 17215 + }, + { + "epoch": 2.809135399673736, + "grad_norm": 0.05913073569536209, + "learning_rate": 0.0009950244578439027, + "loss": 0.108, + "num_input_tokens_seen": 37229536, + "step": 17220 + }, + { + "epoch": 2.8099510603588906, + "grad_norm": 0.06495174020528793, + "learning_rate": 0.0009950144361436182, + "loss": 0.2674, + "num_input_tokens_seen": 37239744, + "step": 17225 + }, + { + "epoch": 2.810766721044046, + "grad_norm": 0.04763927310705185, + "learning_rate": 0.0009950044044112383, + "loss": 0.0752, + "num_input_tokens_seen": 37250912, + "step": 17230 + }, + { + "epoch": 2.8115823817292007, + "grad_norm": 0.23130439221858978, + "learning_rate": 0.000994994362646966, + "loss": 0.1877, + "num_input_tokens_seen": 37261920, + "step": 17235 + }, + { + "epoch": 2.8123980424143555, + "grad_norm": 0.084737628698349, + "learning_rate": 0.0009949843108510053, + "loss": 0.0475, + "num_input_tokens_seen": 37273184, + "step": 17240 + }, + { + "epoch": 2.8132137030995104, + "grad_norm": 0.030352629721164703, + "learning_rate": 0.0009949742490235594, + "loss": 0.0357, + "num_input_tokens_seen": 37284992, + "step": 17245 + }, + { + "epoch": 2.8140293637846656, + "grad_norm": 0.08658528327941895, + "learning_rate": 0.0009949641771648324, + "loss": 0.0821, + "num_input_tokens_seen": 37295872, + "step": 17250 + }, + { + "epoch": 2.8148450244698204, + "grad_norm": 0.10219324380159378, + "learning_rate": 0.0009949540952750285, + "loss": 0.0533, + "num_input_tokens_seen": 37305696, + "step": 17255 + }, + { + "epoch": 2.8156606851549757, + "grad_norm": 0.015360584482550621, + "learning_rate": 0.000994944003354352, + "loss": 0.1204, + "num_input_tokens_seen": 37315712, + "step": 17260 + }, + { + "epoch": 2.8164763458401305, + "grad_norm": 0.11603693664073944, + "learning_rate": 0.0009949339014030075, + "loss": 0.1377, + "num_input_tokens_seen": 37325856, + "step": 17265 + }, + { + "epoch": 2.8172920065252853, + "grad_norm": 0.09709656238555908, + "learning_rate": 0.0009949237894211994, + "loss": 0.0453, + "num_input_tokens_seen": 37336160, + "step": 17270 + }, + { + "epoch": 2.8181076672104406, + "grad_norm": 0.07709289342164993, + "learning_rate": 0.000994913667409133, + "loss": 0.024, + "num_input_tokens_seen": 37346912, + "step": 17275 + }, + { + "epoch": 2.8189233278955954, + "grad_norm": 0.015501154586672783, + "learning_rate": 0.0009949035353670132, + "loss": 0.0947, + "num_input_tokens_seen": 37357184, + "step": 17280 + }, + { + "epoch": 2.8197389885807507, + "grad_norm": 0.15367379784584045, + "learning_rate": 0.0009948933932950456, + "loss": 0.0817, + "num_input_tokens_seen": 37367232, + "step": 17285 + }, + { + "epoch": 2.8205546492659055, + "grad_norm": 0.10089415311813354, + "learning_rate": 0.0009948832411934352, + "loss": 0.0574, + "num_input_tokens_seen": 37378848, + "step": 17290 + }, + { + "epoch": 2.8213703099510603, + "grad_norm": 0.2953617572784424, + "learning_rate": 0.0009948730790623884, + "loss": 0.2016, + "num_input_tokens_seen": 37389760, + "step": 17295 + }, + { + "epoch": 2.822185970636215, + "grad_norm": 0.13659532368183136, + "learning_rate": 0.0009948629069021107, + "loss": 0.1575, + "num_input_tokens_seen": 37400512, + "step": 17300 + }, + { + "epoch": 2.8230016313213704, + "grad_norm": 0.04431614279747009, + "learning_rate": 0.0009948527247128085, + "loss": 0.1449, + "num_input_tokens_seen": 37409408, + "step": 17305 + }, + { + "epoch": 2.823817292006525, + "grad_norm": 0.022543810307979584, + "learning_rate": 0.0009948425324946882, + "loss": 0.1717, + "num_input_tokens_seen": 37419456, + "step": 17310 + }, + { + "epoch": 2.8246329526916805, + "grad_norm": 0.16338996589183807, + "learning_rate": 0.0009948323302479561, + "loss": 0.0756, + "num_input_tokens_seen": 37430336, + "step": 17315 + }, + { + "epoch": 2.8254486133768353, + "grad_norm": 0.06958018243312836, + "learning_rate": 0.000994822117972819, + "loss": 0.0902, + "num_input_tokens_seen": 37441856, + "step": 17320 + }, + { + "epoch": 2.82626427406199, + "grad_norm": 0.08969177305698395, + "learning_rate": 0.000994811895669484, + "loss": 0.0669, + "num_input_tokens_seen": 37451808, + "step": 17325 + }, + { + "epoch": 2.827079934747145, + "grad_norm": 0.04363901913166046, + "learning_rate": 0.0009948016633381583, + "loss": 0.1059, + "num_input_tokens_seen": 37462784, + "step": 17330 + }, + { + "epoch": 2.8278955954323, + "grad_norm": 0.018658151850104332, + "learning_rate": 0.0009947914209790492, + "loss": 0.0434, + "num_input_tokens_seen": 37474592, + "step": 17335 + }, + { + "epoch": 2.828711256117455, + "grad_norm": 0.01596796326339245, + "learning_rate": 0.0009947811685923642, + "loss": 0.1768, + "num_input_tokens_seen": 37484000, + "step": 17340 + }, + { + "epoch": 2.8295269168026103, + "grad_norm": 0.061537813395261765, + "learning_rate": 0.0009947709061783113, + "loss": 0.1124, + "num_input_tokens_seen": 37493856, + "step": 17345 + }, + { + "epoch": 2.830342577487765, + "grad_norm": 0.06127791479229927, + "learning_rate": 0.000994760633737098, + "loss": 0.0803, + "num_input_tokens_seen": 37504736, + "step": 17350 + }, + { + "epoch": 2.83115823817292, + "grad_norm": 0.06189500913023949, + "learning_rate": 0.0009947503512689332, + "loss": 0.0975, + "num_input_tokens_seen": 37515744, + "step": 17355 + }, + { + "epoch": 2.831973898858075, + "grad_norm": 0.14593607187271118, + "learning_rate": 0.0009947400587740245, + "loss": 0.1168, + "num_input_tokens_seen": 37526240, + "step": 17360 + }, + { + "epoch": 2.83278955954323, + "grad_norm": 0.11315637826919556, + "learning_rate": 0.0009947297562525811, + "loss": 0.131, + "num_input_tokens_seen": 37537088, + "step": 17365 + }, + { + "epoch": 2.8336052202283852, + "grad_norm": 0.006786394864320755, + "learning_rate": 0.0009947194437048116, + "loss": 0.1604, + "num_input_tokens_seen": 37548352, + "step": 17370 + }, + { + "epoch": 2.83442088091354, + "grad_norm": 0.16215111315250397, + "learning_rate": 0.000994709121130925, + "loss": 0.1263, + "num_input_tokens_seen": 37560128, + "step": 17375 + }, + { + "epoch": 2.835236541598695, + "grad_norm": 0.05200238898396492, + "learning_rate": 0.0009946987885311304, + "loss": 0.0564, + "num_input_tokens_seen": 37571264, + "step": 17380 + }, + { + "epoch": 2.8360522022838497, + "grad_norm": 0.049134548753499985, + "learning_rate": 0.0009946884459056374, + "loss": 0.0917, + "num_input_tokens_seen": 37580992, + "step": 17385 + }, + { + "epoch": 2.836867862969005, + "grad_norm": 0.08593729138374329, + "learning_rate": 0.0009946780932546552, + "loss": 0.1033, + "num_input_tokens_seen": 37591488, + "step": 17390 + }, + { + "epoch": 2.8376835236541598, + "grad_norm": 0.2163233906030655, + "learning_rate": 0.0009946677305783943, + "loss": 0.1685, + "num_input_tokens_seen": 37603200, + "step": 17395 + }, + { + "epoch": 2.838499184339315, + "grad_norm": 0.05750100314617157, + "learning_rate": 0.000994657357877064, + "loss": 0.0956, + "num_input_tokens_seen": 37614944, + "step": 17400 + }, + { + "epoch": 2.83931484502447, + "grad_norm": 0.02084900252521038, + "learning_rate": 0.0009946469751508748, + "loss": 0.0773, + "num_input_tokens_seen": 37624960, + "step": 17405 + }, + { + "epoch": 2.8401305057096247, + "grad_norm": 0.08458512276411057, + "learning_rate": 0.0009946365824000374, + "loss": 0.1109, + "num_input_tokens_seen": 37636352, + "step": 17410 + }, + { + "epoch": 2.84094616639478, + "grad_norm": 0.045834679156541824, + "learning_rate": 0.000994626179624762, + "loss": 0.1966, + "num_input_tokens_seen": 37647872, + "step": 17415 + }, + { + "epoch": 2.8417618270799347, + "grad_norm": 0.14066869020462036, + "learning_rate": 0.0009946157668252597, + "loss": 0.098, + "num_input_tokens_seen": 37659040, + "step": 17420 + }, + { + "epoch": 2.8425774877650896, + "grad_norm": 0.08382485061883926, + "learning_rate": 0.0009946053440017413, + "loss": 0.2024, + "num_input_tokens_seen": 37669088, + "step": 17425 + }, + { + "epoch": 2.843393148450245, + "grad_norm": 0.0480208620429039, + "learning_rate": 0.000994594911154418, + "loss": 0.0692, + "num_input_tokens_seen": 37680768, + "step": 17430 + }, + { + "epoch": 2.8442088091353996, + "grad_norm": 0.15396052598953247, + "learning_rate": 0.0009945844682835018, + "loss": 0.2609, + "num_input_tokens_seen": 37692416, + "step": 17435 + }, + { + "epoch": 2.8450244698205545, + "grad_norm": 0.3378278315067291, + "learning_rate": 0.0009945740153892036, + "loss": 0.2769, + "num_input_tokens_seen": 37702080, + "step": 17440 + }, + { + "epoch": 2.8458401305057097, + "grad_norm": 0.04631584882736206, + "learning_rate": 0.0009945635524717359, + "loss": 0.0574, + "num_input_tokens_seen": 37712480, + "step": 17445 + }, + { + "epoch": 2.8466557911908645, + "grad_norm": 0.01898629404604435, + "learning_rate": 0.00099455307953131, + "loss": 0.0857, + "num_input_tokens_seen": 37724032, + "step": 17450 + }, + { + "epoch": 2.84747145187602, + "grad_norm": 0.0465041846036911, + "learning_rate": 0.0009945425965681388, + "loss": 0.0883, + "num_input_tokens_seen": 37734272, + "step": 17455 + }, + { + "epoch": 2.8482871125611746, + "grad_norm": 0.027010783553123474, + "learning_rate": 0.0009945321035824343, + "loss": 0.1646, + "num_input_tokens_seen": 37745760, + "step": 17460 + }, + { + "epoch": 2.8491027732463294, + "grad_norm": 0.1783338189125061, + "learning_rate": 0.0009945216005744096, + "loss": 0.3157, + "num_input_tokens_seen": 37756160, + "step": 17465 + }, + { + "epoch": 2.8499184339314843, + "grad_norm": 0.024316715076565742, + "learning_rate": 0.0009945110875442774, + "loss": 0.0998, + "num_input_tokens_seen": 37768224, + "step": 17470 + }, + { + "epoch": 2.8507340946166395, + "grad_norm": 0.02121208980679512, + "learning_rate": 0.0009945005644922504, + "loss": 0.125, + "num_input_tokens_seen": 37778880, + "step": 17475 + }, + { + "epoch": 2.8515497553017943, + "grad_norm": 0.0659647062420845, + "learning_rate": 0.0009944900314185422, + "loss": 0.1498, + "num_input_tokens_seen": 37790944, + "step": 17480 + }, + { + "epoch": 2.8523654159869496, + "grad_norm": 0.02814595028758049, + "learning_rate": 0.0009944794883233663, + "loss": 0.1876, + "num_input_tokens_seen": 37801792, + "step": 17485 + }, + { + "epoch": 2.8531810766721044, + "grad_norm": 0.053324826061725616, + "learning_rate": 0.0009944689352069363, + "loss": 0.1417, + "num_input_tokens_seen": 37813504, + "step": 17490 + }, + { + "epoch": 2.8539967373572592, + "grad_norm": 0.10937617719173431, + "learning_rate": 0.000994458372069466, + "loss": 0.0979, + "num_input_tokens_seen": 37823136, + "step": 17495 + }, + { + "epoch": 2.8548123980424145, + "grad_norm": 0.04137161374092102, + "learning_rate": 0.0009944477989111695, + "loss": 0.0794, + "num_input_tokens_seen": 37833568, + "step": 17500 + }, + { + "epoch": 2.8556280587275693, + "grad_norm": 0.03246806561946869, + "learning_rate": 0.0009944372157322612, + "loss": 0.0449, + "num_input_tokens_seen": 37845120, + "step": 17505 + }, + { + "epoch": 2.8564437194127246, + "grad_norm": 0.21855475008487701, + "learning_rate": 0.0009944266225329552, + "loss": 0.2053, + "num_input_tokens_seen": 37854368, + "step": 17510 + }, + { + "epoch": 2.8572593800978794, + "grad_norm": 0.21136105060577393, + "learning_rate": 0.0009944160193134668, + "loss": 0.1399, + "num_input_tokens_seen": 37865568, + "step": 17515 + }, + { + "epoch": 2.858075040783034, + "grad_norm": 0.11021832376718521, + "learning_rate": 0.0009944054060740104, + "loss": 0.0635, + "num_input_tokens_seen": 37876128, + "step": 17520 + }, + { + "epoch": 2.858890701468189, + "grad_norm": 0.32767796516418457, + "learning_rate": 0.0009943947828148013, + "loss": 0.1833, + "num_input_tokens_seen": 37887040, + "step": 17525 + }, + { + "epoch": 2.8597063621533443, + "grad_norm": 0.0583861842751503, + "learning_rate": 0.0009943841495360546, + "loss": 0.1344, + "num_input_tokens_seen": 37897088, + "step": 17530 + }, + { + "epoch": 2.860522022838499, + "grad_norm": 0.10598666220903397, + "learning_rate": 0.0009943735062379862, + "loss": 0.0413, + "num_input_tokens_seen": 37908096, + "step": 17535 + }, + { + "epoch": 2.8613376835236544, + "grad_norm": 0.10326528549194336, + "learning_rate": 0.0009943628529208114, + "loss": 0.0641, + "num_input_tokens_seen": 37918432, + "step": 17540 + }, + { + "epoch": 2.862153344208809, + "grad_norm": 0.07112540304660797, + "learning_rate": 0.0009943521895847461, + "loss": 0.114, + "num_input_tokens_seen": 37928320, + "step": 17545 + }, + { + "epoch": 2.862969004893964, + "grad_norm": 0.25001704692840576, + "learning_rate": 0.0009943415162300066, + "loss": 0.1949, + "num_input_tokens_seen": 37939520, + "step": 17550 + }, + { + "epoch": 2.863784665579119, + "grad_norm": 0.05746942013502121, + "learning_rate": 0.0009943308328568094, + "loss": 0.1526, + "num_input_tokens_seen": 37949728, + "step": 17555 + }, + { + "epoch": 2.864600326264274, + "grad_norm": 0.12497828155755997, + "learning_rate": 0.0009943201394653706, + "loss": 0.0897, + "num_input_tokens_seen": 37961248, + "step": 17560 + }, + { + "epoch": 2.865415986949429, + "grad_norm": 0.04214569926261902, + "learning_rate": 0.0009943094360559072, + "loss": 0.1186, + "num_input_tokens_seen": 37972320, + "step": 17565 + }, + { + "epoch": 2.866231647634584, + "grad_norm": 0.01726198010146618, + "learning_rate": 0.0009942987226286358, + "loss": 0.082, + "num_input_tokens_seen": 37983520, + "step": 17570 + }, + { + "epoch": 2.867047308319739, + "grad_norm": 0.006213213782757521, + "learning_rate": 0.0009942879991837739, + "loss": 0.0446, + "num_input_tokens_seen": 37995104, + "step": 17575 + }, + { + "epoch": 2.867862969004894, + "grad_norm": 0.07860016822814941, + "learning_rate": 0.0009942772657215385, + "loss": 0.0319, + "num_input_tokens_seen": 38006240, + "step": 17580 + }, + { + "epoch": 2.868678629690049, + "grad_norm": 0.19524259865283966, + "learning_rate": 0.0009942665222421475, + "loss": 0.2527, + "num_input_tokens_seen": 38017504, + "step": 17585 + }, + { + "epoch": 2.869494290375204, + "grad_norm": 0.021658936515450478, + "learning_rate": 0.0009942557687458182, + "loss": 0.0806, + "num_input_tokens_seen": 38029248, + "step": 17590 + }, + { + "epoch": 2.870309951060359, + "grad_norm": 0.0713447779417038, + "learning_rate": 0.0009942450052327688, + "loss": 0.0292, + "num_input_tokens_seen": 38039424, + "step": 17595 + }, + { + "epoch": 2.871125611745514, + "grad_norm": 0.09926056861877441, + "learning_rate": 0.0009942342317032172, + "loss": 0.0932, + "num_input_tokens_seen": 38048608, + "step": 17600 + }, + { + "epoch": 2.8719412724306688, + "grad_norm": 0.023149937391281128, + "learning_rate": 0.000994223448157382, + "loss": 0.0098, + "num_input_tokens_seen": 38060000, + "step": 17605 + }, + { + "epoch": 2.8727569331158236, + "grad_norm": 0.07217232882976532, + "learning_rate": 0.000994212654595482, + "loss": 0.1114, + "num_input_tokens_seen": 38070848, + "step": 17610 + }, + { + "epoch": 2.873572593800979, + "grad_norm": 0.20313020050525665, + "learning_rate": 0.0009942018510177351, + "loss": 0.1052, + "num_input_tokens_seen": 38082016, + "step": 17615 + }, + { + "epoch": 2.8743882544861337, + "grad_norm": 0.15015803277492523, + "learning_rate": 0.000994191037424361, + "loss": 0.1398, + "num_input_tokens_seen": 38092672, + "step": 17620 + }, + { + "epoch": 2.875203915171289, + "grad_norm": 0.0243154838681221, + "learning_rate": 0.0009941802138155786, + "loss": 0.1759, + "num_input_tokens_seen": 38103328, + "step": 17625 + }, + { + "epoch": 2.8760195758564437, + "grad_norm": 0.03959393873810768, + "learning_rate": 0.0009941693801916074, + "loss": 0.2552, + "num_input_tokens_seen": 38114464, + "step": 17630 + }, + { + "epoch": 2.8768352365415986, + "grad_norm": 0.13363416492938995, + "learning_rate": 0.0009941585365526666, + "loss": 0.2813, + "num_input_tokens_seen": 38124800, + "step": 17635 + }, + { + "epoch": 2.877650897226754, + "grad_norm": 0.0363851822912693, + "learning_rate": 0.0009941476828989762, + "loss": 0.1221, + "num_input_tokens_seen": 38135456, + "step": 17640 + }, + { + "epoch": 2.8784665579119086, + "grad_norm": 0.0372467003762722, + "learning_rate": 0.0009941368192307562, + "loss": 0.0719, + "num_input_tokens_seen": 38146432, + "step": 17645 + }, + { + "epoch": 2.8792822185970635, + "grad_norm": 0.06116746738553047, + "learning_rate": 0.0009941259455482267, + "loss": 0.1476, + "num_input_tokens_seen": 38157952, + "step": 17650 + }, + { + "epoch": 2.8800978792822187, + "grad_norm": 0.028371773660182953, + "learning_rate": 0.0009941150618516079, + "loss": 0.1191, + "num_input_tokens_seen": 38168224, + "step": 17655 + }, + { + "epoch": 2.8809135399673735, + "grad_norm": 0.10169167071580887, + "learning_rate": 0.0009941041681411206, + "loss": 0.1522, + "num_input_tokens_seen": 38179104, + "step": 17660 + }, + { + "epoch": 2.8817292006525284, + "grad_norm": 0.022912686690688133, + "learning_rate": 0.0009940932644169858, + "loss": 0.0693, + "num_input_tokens_seen": 38189536, + "step": 17665 + }, + { + "epoch": 2.8825448613376836, + "grad_norm": 0.10834155231714249, + "learning_rate": 0.000994082350679424, + "loss": 0.1066, + "num_input_tokens_seen": 38200480, + "step": 17670 + }, + { + "epoch": 2.8833605220228384, + "grad_norm": 0.021890873089432716, + "learning_rate": 0.0009940714269286565, + "loss": 0.168, + "num_input_tokens_seen": 38211872, + "step": 17675 + }, + { + "epoch": 2.8841761827079937, + "grad_norm": 0.05110299587249756, + "learning_rate": 0.000994060493164905, + "loss": 0.0401, + "num_input_tokens_seen": 38223680, + "step": 17680 + }, + { + "epoch": 2.8849918433931485, + "grad_norm": 0.036153409630060196, + "learning_rate": 0.0009940495493883906, + "loss": 0.0998, + "num_input_tokens_seen": 38234464, + "step": 17685 + }, + { + "epoch": 2.8858075040783033, + "grad_norm": 0.16665154695510864, + "learning_rate": 0.0009940385955993353, + "loss": 0.0915, + "num_input_tokens_seen": 38245952, + "step": 17690 + }, + { + "epoch": 2.886623164763458, + "grad_norm": 0.18631958961486816, + "learning_rate": 0.0009940276317979611, + "loss": 0.1663, + "num_input_tokens_seen": 38256640, + "step": 17695 + }, + { + "epoch": 2.8874388254486134, + "grad_norm": 0.028646018356084824, + "learning_rate": 0.0009940166579844906, + "loss": 0.0803, + "num_input_tokens_seen": 38267136, + "step": 17700 + }, + { + "epoch": 2.8882544861337682, + "grad_norm": 0.2977273166179657, + "learning_rate": 0.0009940056741591455, + "loss": 0.1029, + "num_input_tokens_seen": 38277056, + "step": 17705 + }, + { + "epoch": 2.8890701468189235, + "grad_norm": 0.01933087781071663, + "learning_rate": 0.0009939946803221487, + "loss": 0.1755, + "num_input_tokens_seen": 38287936, + "step": 17710 + }, + { + "epoch": 2.8898858075040783, + "grad_norm": 0.09155084192752838, + "learning_rate": 0.000993983676473723, + "loss": 0.1827, + "num_input_tokens_seen": 38299040, + "step": 17715 + }, + { + "epoch": 2.890701468189233, + "grad_norm": 0.04351414740085602, + "learning_rate": 0.0009939726626140917, + "loss": 0.0792, + "num_input_tokens_seen": 38309696, + "step": 17720 + }, + { + "epoch": 2.8915171288743884, + "grad_norm": 0.15587954223155975, + "learning_rate": 0.0009939616387434776, + "loss": 0.0669, + "num_input_tokens_seen": 38321568, + "step": 17725 + }, + { + "epoch": 2.892332789559543, + "grad_norm": 0.08067210018634796, + "learning_rate": 0.0009939506048621044, + "loss": 0.0425, + "num_input_tokens_seen": 38331392, + "step": 17730 + }, + { + "epoch": 2.8931484502446985, + "grad_norm": 0.3965483605861664, + "learning_rate": 0.0009939395609701953, + "loss": 0.1558, + "num_input_tokens_seen": 38340800, + "step": 17735 + }, + { + "epoch": 2.8939641109298533, + "grad_norm": 0.06495457887649536, + "learning_rate": 0.0009939285070679745, + "loss": 0.0168, + "num_input_tokens_seen": 38351296, + "step": 17740 + }, + { + "epoch": 2.894779771615008, + "grad_norm": 0.10801254212856293, + "learning_rate": 0.000993917443155666, + "loss": 0.0849, + "num_input_tokens_seen": 38362304, + "step": 17745 + }, + { + "epoch": 2.895595432300163, + "grad_norm": 0.006482311058789492, + "learning_rate": 0.0009939063692334937, + "loss": 0.1352, + "num_input_tokens_seen": 38372064, + "step": 17750 + }, + { + "epoch": 2.896411092985318, + "grad_norm": 0.11562133580446243, + "learning_rate": 0.0009938952853016825, + "loss": 0.1764, + "num_input_tokens_seen": 38382272, + "step": 17755 + }, + { + "epoch": 2.897226753670473, + "grad_norm": 0.0724702775478363, + "learning_rate": 0.0009938841913604568, + "loss": 0.0763, + "num_input_tokens_seen": 38392384, + "step": 17760 + }, + { + "epoch": 2.8980424143556283, + "grad_norm": 0.029041165485978127, + "learning_rate": 0.0009938730874100412, + "loss": 0.0394, + "num_input_tokens_seen": 38403200, + "step": 17765 + }, + { + "epoch": 2.898858075040783, + "grad_norm": 0.025574803352355957, + "learning_rate": 0.0009938619734506612, + "loss": 0.2273, + "num_input_tokens_seen": 38413408, + "step": 17770 + }, + { + "epoch": 2.899673735725938, + "grad_norm": 0.012726670131087303, + "learning_rate": 0.0009938508494825417, + "loss": 0.1231, + "num_input_tokens_seen": 38423904, + "step": 17775 + }, + { + "epoch": 2.9004893964110927, + "grad_norm": 0.023426564410328865, + "learning_rate": 0.0009938397155059083, + "loss": 0.1429, + "num_input_tokens_seen": 38435584, + "step": 17780 + }, + { + "epoch": 2.901305057096248, + "grad_norm": 0.1236015111207962, + "learning_rate": 0.0009938285715209866, + "loss": 0.1968, + "num_input_tokens_seen": 38446272, + "step": 17785 + }, + { + "epoch": 2.902120717781403, + "grad_norm": 0.11476442217826843, + "learning_rate": 0.0009938174175280023, + "loss": 0.1609, + "num_input_tokens_seen": 38457312, + "step": 17790 + }, + { + "epoch": 2.902936378466558, + "grad_norm": 0.20464342832565308, + "learning_rate": 0.0009938062535271817, + "loss": 0.1929, + "num_input_tokens_seen": 38467616, + "step": 17795 + }, + { + "epoch": 2.903752039151713, + "grad_norm": 0.18224021792411804, + "learning_rate": 0.0009937950795187508, + "loss": 0.1864, + "num_input_tokens_seen": 38478528, + "step": 17800 + }, + { + "epoch": 2.9045676998368677, + "grad_norm": 0.07501034438610077, + "learning_rate": 0.0009937838955029362, + "loss": 0.1231, + "num_input_tokens_seen": 38490208, + "step": 17805 + }, + { + "epoch": 2.905383360522023, + "grad_norm": 0.020777180790901184, + "learning_rate": 0.0009937727014799646, + "loss": 0.0777, + "num_input_tokens_seen": 38500672, + "step": 17810 + }, + { + "epoch": 2.9061990212071778, + "grad_norm": 0.11070007085800171, + "learning_rate": 0.0009937614974500628, + "loss": 0.1563, + "num_input_tokens_seen": 38511296, + "step": 17815 + }, + { + "epoch": 2.907014681892333, + "grad_norm": 0.031207676976919174, + "learning_rate": 0.000993750283413458, + "loss": 0.1357, + "num_input_tokens_seen": 38521344, + "step": 17820 + }, + { + "epoch": 2.907830342577488, + "grad_norm": 0.012583895586431026, + "learning_rate": 0.0009937390593703773, + "loss": 0.0796, + "num_input_tokens_seen": 38531296, + "step": 17825 + }, + { + "epoch": 2.9086460032626427, + "grad_norm": 0.10093329101800919, + "learning_rate": 0.000993727825321048, + "loss": 0.0785, + "num_input_tokens_seen": 38542912, + "step": 17830 + }, + { + "epoch": 2.9094616639477975, + "grad_norm": 0.13077743351459503, + "learning_rate": 0.0009937165812656983, + "loss": 0.0848, + "num_input_tokens_seen": 38554848, + "step": 17835 + }, + { + "epoch": 2.9102773246329527, + "grad_norm": 0.021879076957702637, + "learning_rate": 0.0009937053272045554, + "loss": 0.0666, + "num_input_tokens_seen": 38565280, + "step": 17840 + }, + { + "epoch": 2.9110929853181076, + "grad_norm": 0.035172972828149796, + "learning_rate": 0.000993694063137848, + "loss": 0.1404, + "num_input_tokens_seen": 38575168, + "step": 17845 + }, + { + "epoch": 2.911908646003263, + "grad_norm": 0.27714836597442627, + "learning_rate": 0.000993682789065804, + "loss": 0.1276, + "num_input_tokens_seen": 38586624, + "step": 17850 + }, + { + "epoch": 2.9127243066884176, + "grad_norm": 0.010750222019851208, + "learning_rate": 0.0009936715049886522, + "loss": 0.1634, + "num_input_tokens_seen": 38596896, + "step": 17855 + }, + { + "epoch": 2.9135399673735725, + "grad_norm": 0.08408083766698837, + "learning_rate": 0.0009936602109066209, + "loss": 0.0561, + "num_input_tokens_seen": 38609152, + "step": 17860 + }, + { + "epoch": 2.9143556280587277, + "grad_norm": 0.07528946548700333, + "learning_rate": 0.0009936489068199392, + "loss": 0.071, + "num_input_tokens_seen": 38619776, + "step": 17865 + }, + { + "epoch": 2.9151712887438825, + "grad_norm": 0.05108056589961052, + "learning_rate": 0.0009936375927288362, + "loss": 0.1126, + "num_input_tokens_seen": 38630752, + "step": 17870 + }, + { + "epoch": 2.9159869494290374, + "grad_norm": 0.08132991939783096, + "learning_rate": 0.000993626268633541, + "loss": 0.1087, + "num_input_tokens_seen": 38642112, + "step": 17875 + }, + { + "epoch": 2.9168026101141926, + "grad_norm": 0.031875815242528915, + "learning_rate": 0.0009936149345342834, + "loss": 0.0846, + "num_input_tokens_seen": 38652096, + "step": 17880 + }, + { + "epoch": 2.9176182707993474, + "grad_norm": 0.08545435965061188, + "learning_rate": 0.000993603590431293, + "loss": 0.0701, + "num_input_tokens_seen": 38663456, + "step": 17885 + }, + { + "epoch": 2.9184339314845023, + "grad_norm": 0.015088371001183987, + "learning_rate": 0.0009935922363247995, + "loss": 0.0486, + "num_input_tokens_seen": 38674368, + "step": 17890 + }, + { + "epoch": 2.9192495921696575, + "grad_norm": 0.0254862941801548, + "learning_rate": 0.0009935808722150333, + "loss": 0.0544, + "num_input_tokens_seen": 38685888, + "step": 17895 + }, + { + "epoch": 2.9200652528548123, + "grad_norm": 0.017295166850090027, + "learning_rate": 0.0009935694981022245, + "loss": 0.0302, + "num_input_tokens_seen": 38697408, + "step": 17900 + }, + { + "epoch": 2.9208809135399676, + "grad_norm": 0.06552346795797348, + "learning_rate": 0.0009935581139866039, + "loss": 0.1045, + "num_input_tokens_seen": 38707616, + "step": 17905 + }, + { + "epoch": 2.9216965742251224, + "grad_norm": 0.02178419567644596, + "learning_rate": 0.0009935467198684015, + "loss": 0.1506, + "num_input_tokens_seen": 38719840, + "step": 17910 + }, + { + "epoch": 2.9225122349102772, + "grad_norm": 0.0072307041846215725, + "learning_rate": 0.0009935353157478493, + "loss": 0.1003, + "num_input_tokens_seen": 38732064, + "step": 17915 + }, + { + "epoch": 2.923327895595432, + "grad_norm": 0.09286753833293915, + "learning_rate": 0.0009935239016251776, + "loss": 0.0853, + "num_input_tokens_seen": 38743808, + "step": 17920 + }, + { + "epoch": 2.9241435562805873, + "grad_norm": 0.023326946422457695, + "learning_rate": 0.0009935124775006178, + "loss": 0.0571, + "num_input_tokens_seen": 38755264, + "step": 17925 + }, + { + "epoch": 2.924959216965742, + "grad_norm": 0.32947757840156555, + "learning_rate": 0.0009935010433744017, + "loss": 0.1625, + "num_input_tokens_seen": 38766944, + "step": 17930 + }, + { + "epoch": 2.9257748776508974, + "grad_norm": 0.06605410575866699, + "learning_rate": 0.000993489599246761, + "loss": 0.1863, + "num_input_tokens_seen": 38778304, + "step": 17935 + }, + { + "epoch": 2.926590538336052, + "grad_norm": 0.13709475100040436, + "learning_rate": 0.0009934781451179273, + "loss": 0.0641, + "num_input_tokens_seen": 38790816, + "step": 17940 + }, + { + "epoch": 2.927406199021207, + "grad_norm": 0.2378200739622116, + "learning_rate": 0.000993466680988133, + "loss": 0.1579, + "num_input_tokens_seen": 38801088, + "step": 17945 + }, + { + "epoch": 2.9282218597063623, + "grad_norm": 0.036661747843027115, + "learning_rate": 0.0009934552068576105, + "loss": 0.0297, + "num_input_tokens_seen": 38811296, + "step": 17950 + }, + { + "epoch": 2.929037520391517, + "grad_norm": 0.018623948097229004, + "learning_rate": 0.0009934437227265924, + "loss": 0.1375, + "num_input_tokens_seen": 38822048, + "step": 17955 + }, + { + "epoch": 2.9298531810766724, + "grad_norm": 0.15882422029972076, + "learning_rate": 0.0009934322285953111, + "loss": 0.0939, + "num_input_tokens_seen": 38831968, + "step": 17960 + }, + { + "epoch": 2.930668841761827, + "grad_norm": 0.0032852909062057734, + "learning_rate": 0.0009934207244639997, + "loss": 0.1332, + "num_input_tokens_seen": 38841440, + "step": 17965 + }, + { + "epoch": 2.931484502446982, + "grad_norm": 0.2479127198457718, + "learning_rate": 0.0009934092103328915, + "loss": 0.1153, + "num_input_tokens_seen": 38851168, + "step": 17970 + }, + { + "epoch": 2.932300163132137, + "grad_norm": 0.010026328265666962, + "learning_rate": 0.0009933976862022196, + "loss": 0.1458, + "num_input_tokens_seen": 38862848, + "step": 17975 + }, + { + "epoch": 2.933115823817292, + "grad_norm": 0.014436143450438976, + "learning_rate": 0.0009933861520722176, + "loss": 0.066, + "num_input_tokens_seen": 38874176, + "step": 17980 + }, + { + "epoch": 2.933931484502447, + "grad_norm": 0.028659898787736893, + "learning_rate": 0.0009933746079431195, + "loss": 0.1446, + "num_input_tokens_seen": 38885056, + "step": 17985 + }, + { + "epoch": 2.934747145187602, + "grad_norm": 0.02407947927713394, + "learning_rate": 0.000993363053815159, + "loss": 0.0667, + "num_input_tokens_seen": 38895968, + "step": 17990 + }, + { + "epoch": 2.935562805872757, + "grad_norm": 0.1150076687335968, + "learning_rate": 0.0009933514896885705, + "loss": 0.2417, + "num_input_tokens_seen": 38907584, + "step": 17995 + }, + { + "epoch": 2.936378466557912, + "grad_norm": 0.15142081677913666, + "learning_rate": 0.000993339915563588, + "loss": 0.1271, + "num_input_tokens_seen": 38919296, + "step": 18000 + }, + { + "epoch": 2.9371941272430666, + "grad_norm": 0.36399489641189575, + "learning_rate": 0.0009933283314404462, + "loss": 0.1575, + "num_input_tokens_seen": 38930016, + "step": 18005 + }, + { + "epoch": 2.938009787928222, + "grad_norm": 0.17013728618621826, + "learning_rate": 0.0009933167373193802, + "loss": 0.0984, + "num_input_tokens_seen": 38941056, + "step": 18010 + }, + { + "epoch": 2.9388254486133767, + "grad_norm": 0.1846058964729309, + "learning_rate": 0.0009933051332006245, + "loss": 0.1026, + "num_input_tokens_seen": 38952480, + "step": 18015 + }, + { + "epoch": 2.939641109298532, + "grad_norm": 0.006210431456565857, + "learning_rate": 0.0009932935190844145, + "loss": 0.038, + "num_input_tokens_seen": 38962272, + "step": 18020 + }, + { + "epoch": 2.9404567699836868, + "grad_norm": 0.243395134806633, + "learning_rate": 0.0009932818949709855, + "loss": 0.2279, + "num_input_tokens_seen": 38971712, + "step": 18025 + }, + { + "epoch": 2.9412724306688416, + "grad_norm": 0.24012702703475952, + "learning_rate": 0.0009932702608605733, + "loss": 0.1315, + "num_input_tokens_seen": 38982688, + "step": 18030 + }, + { + "epoch": 2.942088091353997, + "grad_norm": 0.024320699274539948, + "learning_rate": 0.0009932586167534134, + "loss": 0.138, + "num_input_tokens_seen": 38993152, + "step": 18035 + }, + { + "epoch": 2.9429037520391517, + "grad_norm": 0.23949933052062988, + "learning_rate": 0.0009932469626497418, + "loss": 0.0934, + "num_input_tokens_seen": 39004640, + "step": 18040 + }, + { + "epoch": 2.943719412724307, + "grad_norm": 0.1403769850730896, + "learning_rate": 0.000993235298549795, + "loss": 0.1009, + "num_input_tokens_seen": 39015040, + "step": 18045 + }, + { + "epoch": 2.9445350734094617, + "grad_norm": 0.42424431443214417, + "learning_rate": 0.0009932236244538089, + "loss": 0.217, + "num_input_tokens_seen": 39025792, + "step": 18050 + }, + { + "epoch": 2.9453507340946166, + "grad_norm": 0.2443961650133133, + "learning_rate": 0.0009932119403620206, + "loss": 0.1784, + "num_input_tokens_seen": 39036512, + "step": 18055 + }, + { + "epoch": 2.9461663947797714, + "grad_norm": 0.1767474114894867, + "learning_rate": 0.0009932002462746665, + "loss": 0.1867, + "num_input_tokens_seen": 39048224, + "step": 18060 + }, + { + "epoch": 2.9469820554649266, + "grad_norm": 0.1322534829378128, + "learning_rate": 0.0009931885421919837, + "loss": 0.0662, + "num_input_tokens_seen": 39060288, + "step": 18065 + }, + { + "epoch": 2.9477977161500815, + "grad_norm": 0.07639496773481369, + "learning_rate": 0.0009931768281142095, + "loss": 0.0996, + "num_input_tokens_seen": 39071360, + "step": 18070 + }, + { + "epoch": 2.9486133768352367, + "grad_norm": 0.22929255664348602, + "learning_rate": 0.0009931651040415812, + "loss": 0.2103, + "num_input_tokens_seen": 39082336, + "step": 18075 + }, + { + "epoch": 2.9494290375203915, + "grad_norm": 0.10765232890844345, + "learning_rate": 0.0009931533699743364, + "loss": 0.1251, + "num_input_tokens_seen": 39093984, + "step": 18080 + }, + { + "epoch": 2.9502446982055464, + "grad_norm": 0.08604505658149719, + "learning_rate": 0.000993141625912713, + "loss": 0.0533, + "num_input_tokens_seen": 39104992, + "step": 18085 + }, + { + "epoch": 2.9510603588907016, + "grad_norm": 0.42062729597091675, + "learning_rate": 0.0009931298718569492, + "loss": 0.1695, + "num_input_tokens_seen": 39115840, + "step": 18090 + }, + { + "epoch": 2.9518760195758564, + "grad_norm": 0.1661776304244995, + "learning_rate": 0.0009931181078072827, + "loss": 0.1415, + "num_input_tokens_seen": 39126784, + "step": 18095 + }, + { + "epoch": 2.9526916802610113, + "grad_norm": 0.030810121446847916, + "learning_rate": 0.0009931063337639521, + "loss": 0.0517, + "num_input_tokens_seen": 39136448, + "step": 18100 + }, + { + "epoch": 2.9535073409461665, + "grad_norm": 0.12106618285179138, + "learning_rate": 0.0009930945497271964, + "loss": 0.0673, + "num_input_tokens_seen": 39146688, + "step": 18105 + }, + { + "epoch": 2.9543230016313213, + "grad_norm": 0.07541876286268234, + "learning_rate": 0.0009930827556972539, + "loss": 0.233, + "num_input_tokens_seen": 39157568, + "step": 18110 + }, + { + "epoch": 2.955138662316476, + "grad_norm": 0.25600165128707886, + "learning_rate": 0.0009930709516743639, + "loss": 0.162, + "num_input_tokens_seen": 39168960, + "step": 18115 + }, + { + "epoch": 2.9559543230016314, + "grad_norm": 0.02245187573134899, + "learning_rate": 0.0009930591376587654, + "loss": 0.2354, + "num_input_tokens_seen": 39178848, + "step": 18120 + }, + { + "epoch": 2.9567699836867862, + "grad_norm": 0.21825523674488068, + "learning_rate": 0.0009930473136506982, + "loss": 0.1749, + "num_input_tokens_seen": 39190432, + "step": 18125 + }, + { + "epoch": 2.9575856443719415, + "grad_norm": 0.10119245946407318, + "learning_rate": 0.0009930354796504018, + "loss": 0.0423, + "num_input_tokens_seen": 39200896, + "step": 18130 + }, + { + "epoch": 2.9584013050570963, + "grad_norm": 0.021194253116846085, + "learning_rate": 0.0009930236356581158, + "loss": 0.2261, + "num_input_tokens_seen": 39211872, + "step": 18135 + }, + { + "epoch": 2.959216965742251, + "grad_norm": 0.06642644107341766, + "learning_rate": 0.0009930117816740803, + "loss": 0.0689, + "num_input_tokens_seen": 39221472, + "step": 18140 + }, + { + "epoch": 2.960032626427406, + "grad_norm": 0.014469392597675323, + "learning_rate": 0.0009929999176985355, + "loss": 0.1492, + "num_input_tokens_seen": 39232832, + "step": 18145 + }, + { + "epoch": 2.960848287112561, + "grad_norm": 0.15131227672100067, + "learning_rate": 0.0009929880437317222, + "loss": 0.0935, + "num_input_tokens_seen": 39243264, + "step": 18150 + }, + { + "epoch": 2.961663947797716, + "grad_norm": 0.16809213161468506, + "learning_rate": 0.0009929761597738808, + "loss": 0.3091, + "num_input_tokens_seen": 39254784, + "step": 18155 + }, + { + "epoch": 2.9624796084828713, + "grad_norm": 0.07613056898117065, + "learning_rate": 0.000992964265825252, + "loss": 0.1164, + "num_input_tokens_seen": 39264608, + "step": 18160 + }, + { + "epoch": 2.963295269168026, + "grad_norm": 0.1471426635980606, + "learning_rate": 0.0009929523618860772, + "loss": 0.1588, + "num_input_tokens_seen": 39274880, + "step": 18165 + }, + { + "epoch": 2.964110929853181, + "grad_norm": 0.18869291245937347, + "learning_rate": 0.000992940447956597, + "loss": 0.2073, + "num_input_tokens_seen": 39285696, + "step": 18170 + }, + { + "epoch": 2.964926590538336, + "grad_norm": 0.0686211809515953, + "learning_rate": 0.000992928524037054, + "loss": 0.0567, + "num_input_tokens_seen": 39297728, + "step": 18175 + }, + { + "epoch": 2.965742251223491, + "grad_norm": 0.09575016796588898, + "learning_rate": 0.0009929165901276884, + "loss": 0.1175, + "num_input_tokens_seen": 39309248, + "step": 18180 + }, + { + "epoch": 2.9665579119086463, + "grad_norm": 0.03930651396512985, + "learning_rate": 0.000992904646228743, + "loss": 0.0464, + "num_input_tokens_seen": 39318848, + "step": 18185 + }, + { + "epoch": 2.967373572593801, + "grad_norm": 0.018661662936210632, + "learning_rate": 0.00099289269234046, + "loss": 0.0403, + "num_input_tokens_seen": 39329504, + "step": 18190 + }, + { + "epoch": 2.968189233278956, + "grad_norm": 0.17350107431411743, + "learning_rate": 0.000992880728463081, + "loss": 0.1852, + "num_input_tokens_seen": 39339744, + "step": 18195 + }, + { + "epoch": 2.9690048939641107, + "grad_norm": 0.3067820072174072, + "learning_rate": 0.0009928687545968486, + "loss": 0.1869, + "num_input_tokens_seen": 39349600, + "step": 18200 + }, + { + "epoch": 2.969820554649266, + "grad_norm": 0.24880094826221466, + "learning_rate": 0.0009928567707420059, + "loss": 0.213, + "num_input_tokens_seen": 39360480, + "step": 18205 + }, + { + "epoch": 2.970636215334421, + "grad_norm": 0.05344460904598236, + "learning_rate": 0.0009928447768987956, + "loss": 0.1125, + "num_input_tokens_seen": 39371168, + "step": 18210 + }, + { + "epoch": 2.971451876019576, + "grad_norm": 0.06595193594694138, + "learning_rate": 0.0009928327730674604, + "loss": 0.0714, + "num_input_tokens_seen": 39382272, + "step": 18215 + }, + { + "epoch": 2.972267536704731, + "grad_norm": 0.07218914479017258, + "learning_rate": 0.000992820759248244, + "loss": 0.2494, + "num_input_tokens_seen": 39392544, + "step": 18220 + }, + { + "epoch": 2.9730831973898857, + "grad_norm": 0.23639288544654846, + "learning_rate": 0.00099280873544139, + "loss": 0.2001, + "num_input_tokens_seen": 39401760, + "step": 18225 + }, + { + "epoch": 2.9738988580750405, + "grad_norm": 0.1368018090724945, + "learning_rate": 0.0009927967016471414, + "loss": 0.1813, + "num_input_tokens_seen": 39411904, + "step": 18230 + }, + { + "epoch": 2.9747145187601958, + "grad_norm": 0.029314961284399033, + "learning_rate": 0.0009927846578657426, + "loss": 0.1154, + "num_input_tokens_seen": 39422368, + "step": 18235 + }, + { + "epoch": 2.9755301794453506, + "grad_norm": 0.07671564072370529, + "learning_rate": 0.0009927726040974377, + "loss": 0.1119, + "num_input_tokens_seen": 39432672, + "step": 18240 + }, + { + "epoch": 2.976345840130506, + "grad_norm": 0.01595219224691391, + "learning_rate": 0.0009927605403424707, + "loss": 0.0523, + "num_input_tokens_seen": 39442944, + "step": 18245 + }, + { + "epoch": 2.9771615008156607, + "grad_norm": 0.26665130257606506, + "learning_rate": 0.0009927484666010862, + "loss": 0.3434, + "num_input_tokens_seen": 39453920, + "step": 18250 + }, + { + "epoch": 2.9779771615008155, + "grad_norm": 0.01717739924788475, + "learning_rate": 0.000992736382873529, + "loss": 0.0542, + "num_input_tokens_seen": 39464768, + "step": 18255 + }, + { + "epoch": 2.9787928221859707, + "grad_norm": 0.08836905658245087, + "learning_rate": 0.000992724289160044, + "loss": 0.123, + "num_input_tokens_seen": 39476544, + "step": 18260 + }, + { + "epoch": 2.9796084828711256, + "grad_norm": 0.18800336122512817, + "learning_rate": 0.000992712185460876, + "loss": 0.0685, + "num_input_tokens_seen": 39487008, + "step": 18265 + }, + { + "epoch": 2.980424143556281, + "grad_norm": 0.21052786707878113, + "learning_rate": 0.0009927000717762707, + "loss": 0.2213, + "num_input_tokens_seen": 39497408, + "step": 18270 + }, + { + "epoch": 2.9812398042414356, + "grad_norm": 0.0330355167388916, + "learning_rate": 0.0009926879481064734, + "loss": 0.0982, + "num_input_tokens_seen": 39506400, + "step": 18275 + }, + { + "epoch": 2.9820554649265905, + "grad_norm": 0.04376707971096039, + "learning_rate": 0.0009926758144517297, + "loss": 0.1352, + "num_input_tokens_seen": 39517376, + "step": 18280 + }, + { + "epoch": 2.9828711256117453, + "grad_norm": 0.03611741214990616, + "learning_rate": 0.000992663670812286, + "loss": 0.0615, + "num_input_tokens_seen": 39528096, + "step": 18285 + }, + { + "epoch": 2.9836867862969005, + "grad_norm": 0.10370776802301407, + "learning_rate": 0.0009926515171883874, + "loss": 0.0593, + "num_input_tokens_seen": 39539552, + "step": 18290 + }, + { + "epoch": 2.9845024469820554, + "grad_norm": 0.006939777173101902, + "learning_rate": 0.0009926393535802812, + "loss": 0.0796, + "num_input_tokens_seen": 39550560, + "step": 18295 + }, + { + "epoch": 2.9853181076672106, + "grad_norm": 0.042476359754800797, + "learning_rate": 0.0009926271799882134, + "loss": 0.1941, + "num_input_tokens_seen": 39561728, + "step": 18300 + }, + { + "epoch": 2.9861337683523654, + "grad_norm": 0.042153820395469666, + "learning_rate": 0.000992614996412431, + "loss": 0.1872, + "num_input_tokens_seen": 39572608, + "step": 18305 + }, + { + "epoch": 2.9869494290375203, + "grad_norm": 0.1687641739845276, + "learning_rate": 0.0009926028028531808, + "loss": 0.1653, + "num_input_tokens_seen": 39583008, + "step": 18310 + }, + { + "epoch": 2.9877650897226755, + "grad_norm": 0.09225862473249435, + "learning_rate": 0.0009925905993107098, + "loss": 0.0366, + "num_input_tokens_seen": 39594688, + "step": 18315 + }, + { + "epoch": 2.9885807504078303, + "grad_norm": 0.3248561918735504, + "learning_rate": 0.0009925783857852653, + "loss": 0.1379, + "num_input_tokens_seen": 39605920, + "step": 18320 + }, + { + "epoch": 2.9893964110929856, + "grad_norm": 0.019485827535390854, + "learning_rate": 0.0009925661622770953, + "loss": 0.2786, + "num_input_tokens_seen": 39617088, + "step": 18325 + }, + { + "epoch": 2.9902120717781404, + "grad_norm": 0.09611544013023376, + "learning_rate": 0.0009925539287864466, + "loss": 0.1963, + "num_input_tokens_seen": 39627232, + "step": 18330 + }, + { + "epoch": 2.9910277324632952, + "grad_norm": 0.09955920279026031, + "learning_rate": 0.000992541685313568, + "loss": 0.0978, + "num_input_tokens_seen": 39637600, + "step": 18335 + }, + { + "epoch": 2.99184339314845, + "grad_norm": 0.017093650996685028, + "learning_rate": 0.0009925294318587075, + "loss": 0.0625, + "num_input_tokens_seen": 39648192, + "step": 18340 + }, + { + "epoch": 2.9926590538336053, + "grad_norm": 0.02634618431329727, + "learning_rate": 0.000992517168422113, + "loss": 0.1107, + "num_input_tokens_seen": 39658464, + "step": 18345 + }, + { + "epoch": 2.99347471451876, + "grad_norm": 0.043731238692998886, + "learning_rate": 0.0009925048950040333, + "loss": 0.175, + "num_input_tokens_seen": 39668800, + "step": 18350 + }, + { + "epoch": 2.9942903752039154, + "grad_norm": 0.027921607717871666, + "learning_rate": 0.000992492611604717, + "loss": 0.108, + "num_input_tokens_seen": 39680224, + "step": 18355 + }, + { + "epoch": 2.99510603588907, + "grad_norm": 0.07438381016254425, + "learning_rate": 0.0009924803182244134, + "loss": 0.0632, + "num_input_tokens_seen": 39691584, + "step": 18360 + }, + { + "epoch": 2.995921696574225, + "grad_norm": 0.13787518441677094, + "learning_rate": 0.0009924680148633714, + "loss": 0.0801, + "num_input_tokens_seen": 39702816, + "step": 18365 + }, + { + "epoch": 2.99673735725938, + "grad_norm": 0.009611738845705986, + "learning_rate": 0.0009924557015218401, + "loss": 0.0202, + "num_input_tokens_seen": 39713504, + "step": 18370 + }, + { + "epoch": 2.997553017944535, + "grad_norm": 0.1483236849308014, + "learning_rate": 0.0009924433782000695, + "loss": 0.0886, + "num_input_tokens_seen": 39724320, + "step": 18375 + }, + { + "epoch": 2.99836867862969, + "grad_norm": 0.015144513919949532, + "learning_rate": 0.000992431044898309, + "loss": 0.0932, + "num_input_tokens_seen": 39735648, + "step": 18380 + }, + { + "epoch": 2.999184339314845, + "grad_norm": 0.2547537684440613, + "learning_rate": 0.0009924187016168086, + "loss": 0.1138, + "num_input_tokens_seen": 39745408, + "step": 18385 + }, + { + "epoch": 3.0, + "grad_norm": 0.01212351769208908, + "learning_rate": 0.0009924063483558187, + "loss": 0.0511, + "num_input_tokens_seen": 39755376, + "step": 18390 + }, + { + "epoch": 3.0, + "eval_loss": 0.12335383147001266, + "eval_runtime": 103.2218, + "eval_samples_per_second": 26.399, + "eval_steps_per_second": 6.607, + "num_input_tokens_seen": 39755376, + "step": 18390 + }, + { + "epoch": 3.000815660685155, + "grad_norm": 0.014516279101371765, + "learning_rate": 0.0009923939851155896, + "loss": 0.0607, + "num_input_tokens_seen": 39765936, + "step": 18395 + }, + { + "epoch": 3.00163132137031, + "grad_norm": 0.01939479261636734, + "learning_rate": 0.0009923816118963715, + "loss": 0.1182, + "num_input_tokens_seen": 39777104, + "step": 18400 + }, + { + "epoch": 3.002446982055465, + "grad_norm": 0.276183158159256, + "learning_rate": 0.0009923692286984156, + "loss": 0.3429, + "num_input_tokens_seen": 39787280, + "step": 18405 + }, + { + "epoch": 3.0032626427406197, + "grad_norm": 0.021763058379292488, + "learning_rate": 0.0009923568355219726, + "loss": 0.1081, + "num_input_tokens_seen": 39797328, + "step": 18410 + }, + { + "epoch": 3.004078303425775, + "grad_norm": 0.033986590802669525, + "learning_rate": 0.0009923444323672937, + "loss": 0.0204, + "num_input_tokens_seen": 39809424, + "step": 18415 + }, + { + "epoch": 3.00489396411093, + "grad_norm": 0.12651905417442322, + "learning_rate": 0.0009923320192346302, + "loss": 0.162, + "num_input_tokens_seen": 39820464, + "step": 18420 + }, + { + "epoch": 3.0057096247960846, + "grad_norm": 0.07450538873672485, + "learning_rate": 0.000992319596124234, + "loss": 0.0964, + "num_input_tokens_seen": 39831152, + "step": 18425 + }, + { + "epoch": 3.00652528548124, + "grad_norm": 0.13451939821243286, + "learning_rate": 0.0009923071630363563, + "loss": 0.1113, + "num_input_tokens_seen": 39842480, + "step": 18430 + }, + { + "epoch": 3.0073409461663947, + "grad_norm": 0.014959607273340225, + "learning_rate": 0.0009922947199712496, + "loss": 0.0608, + "num_input_tokens_seen": 39852240, + "step": 18435 + }, + { + "epoch": 3.00815660685155, + "grad_norm": 0.38718149065971375, + "learning_rate": 0.0009922822669291658, + "loss": 0.2369, + "num_input_tokens_seen": 39863440, + "step": 18440 + }, + { + "epoch": 3.0089722675367048, + "grad_norm": 0.020501941442489624, + "learning_rate": 0.0009922698039103574, + "loss": 0.0728, + "num_input_tokens_seen": 39874032, + "step": 18445 + }, + { + "epoch": 3.0097879282218596, + "grad_norm": 0.07462462037801743, + "learning_rate": 0.000992257330915077, + "loss": 0.0733, + "num_input_tokens_seen": 39885392, + "step": 18450 + }, + { + "epoch": 3.010603588907015, + "grad_norm": 0.030257759615778923, + "learning_rate": 0.0009922448479435773, + "loss": 0.2009, + "num_input_tokens_seen": 39896272, + "step": 18455 + }, + { + "epoch": 3.0114192495921697, + "grad_norm": 0.07138212770223618, + "learning_rate": 0.0009922323549961113, + "loss": 0.0512, + "num_input_tokens_seen": 39907664, + "step": 18460 + }, + { + "epoch": 3.0122349102773245, + "grad_norm": 0.035266272723674774, + "learning_rate": 0.000992219852072932, + "loss": 0.0572, + "num_input_tokens_seen": 39918992, + "step": 18465 + }, + { + "epoch": 3.0130505709624797, + "grad_norm": 0.13974876701831818, + "learning_rate": 0.0009922073391742932, + "loss": 0.1032, + "num_input_tokens_seen": 39929424, + "step": 18470 + }, + { + "epoch": 3.0138662316476346, + "grad_norm": 0.23525859415531158, + "learning_rate": 0.0009921948163004483, + "loss": 0.1119, + "num_input_tokens_seen": 39939856, + "step": 18475 + }, + { + "epoch": 3.0146818923327894, + "grad_norm": 0.06922031193971634, + "learning_rate": 0.000992182283451651, + "loss": 0.0607, + "num_input_tokens_seen": 39949392, + "step": 18480 + }, + { + "epoch": 3.0154975530179446, + "grad_norm": 0.007163001224398613, + "learning_rate": 0.0009921697406281554, + "loss": 0.1979, + "num_input_tokens_seen": 39959056, + "step": 18485 + }, + { + "epoch": 3.0163132137030995, + "grad_norm": 0.008242499083280563, + "learning_rate": 0.0009921571878302154, + "loss": 0.0617, + "num_input_tokens_seen": 39969360, + "step": 18490 + }, + { + "epoch": 3.0171288743882543, + "grad_norm": 0.04228367283940315, + "learning_rate": 0.0009921446250580857, + "loss": 0.0403, + "num_input_tokens_seen": 39979664, + "step": 18495 + }, + { + "epoch": 3.0179445350734095, + "grad_norm": 0.03732677921652794, + "learning_rate": 0.000992132052312021, + "loss": 0.2385, + "num_input_tokens_seen": 39990480, + "step": 18500 + }, + { + "epoch": 3.0187601957585644, + "grad_norm": 0.12380467355251312, + "learning_rate": 0.000992119469592276, + "loss": 0.1255, + "num_input_tokens_seen": 40000976, + "step": 18505 + }, + { + "epoch": 3.0195758564437196, + "grad_norm": 0.18119105696678162, + "learning_rate": 0.0009921068768991056, + "loss": 0.0736, + "num_input_tokens_seen": 40011472, + "step": 18510 + }, + { + "epoch": 3.0203915171288744, + "grad_norm": 0.12756581604480743, + "learning_rate": 0.0009920942742327648, + "loss": 0.0755, + "num_input_tokens_seen": 40022096, + "step": 18515 + }, + { + "epoch": 3.0212071778140293, + "grad_norm": 0.03765145689249039, + "learning_rate": 0.0009920816615935095, + "loss": 0.1434, + "num_input_tokens_seen": 40032848, + "step": 18520 + }, + { + "epoch": 3.0220228384991845, + "grad_norm": 0.009554882533848286, + "learning_rate": 0.000992069038981595, + "loss": 0.0725, + "num_input_tokens_seen": 40043856, + "step": 18525 + }, + { + "epoch": 3.0228384991843393, + "grad_norm": 0.04768422991037369, + "learning_rate": 0.0009920564063972772, + "loss": 0.0575, + "num_input_tokens_seen": 40053648, + "step": 18530 + }, + { + "epoch": 3.023654159869494, + "grad_norm": 0.24854423105716705, + "learning_rate": 0.0009920437638408122, + "loss": 0.0433, + "num_input_tokens_seen": 40064944, + "step": 18535 + }, + { + "epoch": 3.0244698205546494, + "grad_norm": 0.20471757650375366, + "learning_rate": 0.000992031111312456, + "loss": 0.1138, + "num_input_tokens_seen": 40075536, + "step": 18540 + }, + { + "epoch": 3.0252854812398042, + "grad_norm": 0.04740242287516594, + "learning_rate": 0.0009920184488124654, + "loss": 0.2227, + "num_input_tokens_seen": 40085712, + "step": 18545 + }, + { + "epoch": 3.026101141924959, + "grad_norm": 0.01797325722873211, + "learning_rate": 0.0009920057763410962, + "loss": 0.0672, + "num_input_tokens_seen": 40096688, + "step": 18550 + }, + { + "epoch": 3.0269168026101143, + "grad_norm": 0.07381663471460342, + "learning_rate": 0.0009919930938986064, + "loss": 0.1917, + "num_input_tokens_seen": 40107440, + "step": 18555 + }, + { + "epoch": 3.027732463295269, + "grad_norm": 0.21422551572322845, + "learning_rate": 0.000991980401485252, + "loss": 0.103, + "num_input_tokens_seen": 40119216, + "step": 18560 + }, + { + "epoch": 3.028548123980424, + "grad_norm": 0.21496577560901642, + "learning_rate": 0.000991967699101291, + "loss": 0.109, + "num_input_tokens_seen": 40130672, + "step": 18565 + }, + { + "epoch": 3.029363784665579, + "grad_norm": 0.04606503248214722, + "learning_rate": 0.00099195498674698, + "loss": 0.0676, + "num_input_tokens_seen": 40142032, + "step": 18570 + }, + { + "epoch": 3.030179445350734, + "grad_norm": 0.00870492309331894, + "learning_rate": 0.0009919422644225776, + "loss": 0.11, + "num_input_tokens_seen": 40152528, + "step": 18575 + }, + { + "epoch": 3.0309951060358893, + "grad_norm": 0.023363051936030388, + "learning_rate": 0.0009919295321283409, + "loss": 0.1298, + "num_input_tokens_seen": 40162064, + "step": 18580 + }, + { + "epoch": 3.031810766721044, + "grad_norm": 0.08522692322731018, + "learning_rate": 0.0009919167898645282, + "loss": 0.116, + "num_input_tokens_seen": 40173872, + "step": 18585 + }, + { + "epoch": 3.032626427406199, + "grad_norm": 0.014564625918865204, + "learning_rate": 0.0009919040376313976, + "loss": 0.0727, + "num_input_tokens_seen": 40184976, + "step": 18590 + }, + { + "epoch": 3.033442088091354, + "grad_norm": 0.09863422065973282, + "learning_rate": 0.0009918912754292078, + "loss": 0.1579, + "num_input_tokens_seen": 40196720, + "step": 18595 + }, + { + "epoch": 3.034257748776509, + "grad_norm": 0.168905109167099, + "learning_rate": 0.0009918785032582173, + "loss": 0.0404, + "num_input_tokens_seen": 40207344, + "step": 18600 + }, + { + "epoch": 3.035073409461664, + "grad_norm": 0.07370392233133316, + "learning_rate": 0.000991865721118685, + "loss": 0.0626, + "num_input_tokens_seen": 40219632, + "step": 18605 + }, + { + "epoch": 3.035889070146819, + "grad_norm": 0.12834832072257996, + "learning_rate": 0.0009918529290108696, + "loss": 0.0315, + "num_input_tokens_seen": 40230896, + "step": 18610 + }, + { + "epoch": 3.036704730831974, + "grad_norm": 0.18769440054893494, + "learning_rate": 0.000991840126935031, + "loss": 0.0714, + "num_input_tokens_seen": 40241264, + "step": 18615 + }, + { + "epoch": 3.0375203915171287, + "grad_norm": 0.029713381081819534, + "learning_rate": 0.000991827314891428, + "loss": 0.1725, + "num_input_tokens_seen": 40252272, + "step": 18620 + }, + { + "epoch": 3.038336052202284, + "grad_norm": 0.25868767499923706, + "learning_rate": 0.0009918144928803205, + "loss": 0.1248, + "num_input_tokens_seen": 40262768, + "step": 18625 + }, + { + "epoch": 3.039151712887439, + "grad_norm": 0.02136482112109661, + "learning_rate": 0.0009918016609019686, + "loss": 0.0209, + "num_input_tokens_seen": 40273552, + "step": 18630 + }, + { + "epoch": 3.0399673735725936, + "grad_norm": 0.19516494870185852, + "learning_rate": 0.0009917888189566323, + "loss": 0.2173, + "num_input_tokens_seen": 40283664, + "step": 18635 + }, + { + "epoch": 3.040783034257749, + "grad_norm": 0.005416174419224262, + "learning_rate": 0.0009917759670445712, + "loss": 0.0397, + "num_input_tokens_seen": 40293904, + "step": 18640 + }, + { + "epoch": 3.0415986949429037, + "grad_norm": 0.0065343305468559265, + "learning_rate": 0.0009917631051660468, + "loss": 0.0655, + "num_input_tokens_seen": 40304912, + "step": 18645 + }, + { + "epoch": 3.0424143556280585, + "grad_norm": 0.034539107233285904, + "learning_rate": 0.0009917502333213189, + "loss": 0.1223, + "num_input_tokens_seen": 40315856, + "step": 18650 + }, + { + "epoch": 3.0432300163132138, + "grad_norm": 0.008732376620173454, + "learning_rate": 0.0009917373515106486, + "loss": 0.0421, + "num_input_tokens_seen": 40325936, + "step": 18655 + }, + { + "epoch": 3.0440456769983686, + "grad_norm": 0.1251506805419922, + "learning_rate": 0.0009917244597342973, + "loss": 0.0945, + "num_input_tokens_seen": 40336880, + "step": 18660 + }, + { + "epoch": 3.044861337683524, + "grad_norm": 0.009798316285014153, + "learning_rate": 0.000991711557992526, + "loss": 0.0674, + "num_input_tokens_seen": 40347568, + "step": 18665 + }, + { + "epoch": 3.0456769983686787, + "grad_norm": 0.015984924510121346, + "learning_rate": 0.000991698646285596, + "loss": 0.0855, + "num_input_tokens_seen": 40358736, + "step": 18670 + }, + { + "epoch": 3.0464926590538335, + "grad_norm": 0.37127813696861267, + "learning_rate": 0.0009916857246137693, + "loss": 0.2148, + "num_input_tokens_seen": 40370480, + "step": 18675 + }, + { + "epoch": 3.0473083197389887, + "grad_norm": 0.16755881905555725, + "learning_rate": 0.0009916727929773078, + "loss": 0.0697, + "num_input_tokens_seen": 40380336, + "step": 18680 + }, + { + "epoch": 3.0481239804241436, + "grad_norm": 0.04090609401464462, + "learning_rate": 0.0009916598513764732, + "loss": 0.122, + "num_input_tokens_seen": 40391824, + "step": 18685 + }, + { + "epoch": 3.0489396411092984, + "grad_norm": 0.05720209330320358, + "learning_rate": 0.0009916468998115281, + "loss": 0.1412, + "num_input_tokens_seen": 40403088, + "step": 18690 + }, + { + "epoch": 3.0497553017944536, + "grad_norm": 0.1086057499051094, + "learning_rate": 0.000991633938282735, + "loss": 0.1968, + "num_input_tokens_seen": 40414480, + "step": 18695 + }, + { + "epoch": 3.0505709624796085, + "grad_norm": 0.032246749848127365, + "learning_rate": 0.0009916209667903562, + "loss": 0.0715, + "num_input_tokens_seen": 40425968, + "step": 18700 + }, + { + "epoch": 3.0513866231647633, + "grad_norm": 0.03548622503876686, + "learning_rate": 0.0009916079853346548, + "loss": 0.0328, + "num_input_tokens_seen": 40436912, + "step": 18705 + }, + { + "epoch": 3.0522022838499185, + "grad_norm": 0.06208382919430733, + "learning_rate": 0.0009915949939158942, + "loss": 0.0605, + "num_input_tokens_seen": 40447472, + "step": 18710 + }, + { + "epoch": 3.0530179445350734, + "grad_norm": 0.010785914957523346, + "learning_rate": 0.0009915819925343373, + "loss": 0.1707, + "num_input_tokens_seen": 40458192, + "step": 18715 + }, + { + "epoch": 3.053833605220228, + "grad_norm": 0.01436684001237154, + "learning_rate": 0.0009915689811902477, + "loss": 0.0577, + "num_input_tokens_seen": 40468016, + "step": 18720 + }, + { + "epoch": 3.0546492659053834, + "grad_norm": 0.2570911943912506, + "learning_rate": 0.000991555959883889, + "loss": 0.0871, + "num_input_tokens_seen": 40478448, + "step": 18725 + }, + { + "epoch": 3.0554649265905383, + "grad_norm": 0.012195185758173466, + "learning_rate": 0.0009915429286155254, + "loss": 0.0189, + "num_input_tokens_seen": 40489488, + "step": 18730 + }, + { + "epoch": 3.0562805872756935, + "grad_norm": 0.007231024093925953, + "learning_rate": 0.0009915298873854207, + "loss": 0.0101, + "num_input_tokens_seen": 40499888, + "step": 18735 + }, + { + "epoch": 3.0570962479608483, + "grad_norm": 0.054103195667266846, + "learning_rate": 0.0009915168361938392, + "loss": 0.1771, + "num_input_tokens_seen": 40509744, + "step": 18740 + }, + { + "epoch": 3.057911908646003, + "grad_norm": 0.22565001249313354, + "learning_rate": 0.0009915037750410456, + "loss": 0.2337, + "num_input_tokens_seen": 40520176, + "step": 18745 + }, + { + "epoch": 3.0587275693311584, + "grad_norm": 0.1788310408592224, + "learning_rate": 0.0009914907039273045, + "loss": 0.1327, + "num_input_tokens_seen": 40530896, + "step": 18750 + }, + { + "epoch": 3.0595432300163132, + "grad_norm": 0.03100595623254776, + "learning_rate": 0.0009914776228528805, + "loss": 0.3058, + "num_input_tokens_seen": 40541456, + "step": 18755 + }, + { + "epoch": 3.060358890701468, + "grad_norm": 0.3577895164489746, + "learning_rate": 0.0009914645318180393, + "loss": 0.1918, + "num_input_tokens_seen": 40552912, + "step": 18760 + }, + { + "epoch": 3.0611745513866233, + "grad_norm": 0.12759797275066376, + "learning_rate": 0.0009914514308230458, + "loss": 0.0771, + "num_input_tokens_seen": 40563824, + "step": 18765 + }, + { + "epoch": 3.061990212071778, + "grad_norm": 0.08027143776416779, + "learning_rate": 0.0009914383198681657, + "loss": 0.044, + "num_input_tokens_seen": 40575120, + "step": 18770 + }, + { + "epoch": 3.062805872756933, + "grad_norm": 0.20178310573101044, + "learning_rate": 0.0009914251989536645, + "loss": 0.0979, + "num_input_tokens_seen": 40586448, + "step": 18775 + }, + { + "epoch": 3.063621533442088, + "grad_norm": 0.043073415756225586, + "learning_rate": 0.0009914120680798082, + "loss": 0.0983, + "num_input_tokens_seen": 40597936, + "step": 18780 + }, + { + "epoch": 3.064437194127243, + "grad_norm": 0.14184482395648956, + "learning_rate": 0.000991398927246863, + "loss": 0.0433, + "num_input_tokens_seen": 40607984, + "step": 18785 + }, + { + "epoch": 3.065252854812398, + "grad_norm": 0.07433011382818222, + "learning_rate": 0.000991385776455095, + "loss": 0.0627, + "num_input_tokens_seen": 40618928, + "step": 18790 + }, + { + "epoch": 3.066068515497553, + "grad_norm": 0.030221259221434593, + "learning_rate": 0.0009913726157047712, + "loss": 0.1432, + "num_input_tokens_seen": 40628432, + "step": 18795 + }, + { + "epoch": 3.066884176182708, + "grad_norm": 0.015028982423245907, + "learning_rate": 0.0009913594449961576, + "loss": 0.0488, + "num_input_tokens_seen": 40639920, + "step": 18800 + }, + { + "epoch": 3.067699836867863, + "grad_norm": 0.08033467084169388, + "learning_rate": 0.0009913462643295217, + "loss": 0.1466, + "num_input_tokens_seen": 40651120, + "step": 18805 + }, + { + "epoch": 3.068515497553018, + "grad_norm": 0.1094004362821579, + "learning_rate": 0.0009913330737051304, + "loss": 0.221, + "num_input_tokens_seen": 40661008, + "step": 18810 + }, + { + "epoch": 3.069331158238173, + "grad_norm": 0.1347908228635788, + "learning_rate": 0.0009913198731232513, + "loss": 0.123, + "num_input_tokens_seen": 40671920, + "step": 18815 + }, + { + "epoch": 3.070146818923328, + "grad_norm": 0.04396280273795128, + "learning_rate": 0.0009913066625841513, + "loss": 0.0832, + "num_input_tokens_seen": 40681552, + "step": 18820 + }, + { + "epoch": 3.070962479608483, + "grad_norm": 0.01838274672627449, + "learning_rate": 0.0009912934420880988, + "loss": 0.0323, + "num_input_tokens_seen": 40692368, + "step": 18825 + }, + { + "epoch": 3.0717781402936377, + "grad_norm": 0.007258755154907703, + "learning_rate": 0.0009912802116353613, + "loss": 0.0615, + "num_input_tokens_seen": 40701392, + "step": 18830 + }, + { + "epoch": 3.072593800978793, + "grad_norm": 0.10661440342664719, + "learning_rate": 0.0009912669712262073, + "loss": 0.1358, + "num_input_tokens_seen": 40712272, + "step": 18835 + }, + { + "epoch": 3.073409461663948, + "grad_norm": 0.10035596787929535, + "learning_rate": 0.0009912537208609047, + "loss": 0.0314, + "num_input_tokens_seen": 40724496, + "step": 18840 + }, + { + "epoch": 3.0742251223491026, + "grad_norm": 0.009967050515115261, + "learning_rate": 0.0009912404605397222, + "loss": 0.1524, + "num_input_tokens_seen": 40734928, + "step": 18845 + }, + { + "epoch": 3.075040783034258, + "grad_norm": 0.13334870338439941, + "learning_rate": 0.0009912271902629288, + "loss": 0.0507, + "num_input_tokens_seen": 40744240, + "step": 18850 + }, + { + "epoch": 3.0758564437194127, + "grad_norm": 0.13278226554393768, + "learning_rate": 0.000991213910030793, + "loss": 0.0939, + "num_input_tokens_seen": 40754096, + "step": 18855 + }, + { + "epoch": 3.0766721044045675, + "grad_norm": 0.22139577567577362, + "learning_rate": 0.0009912006198435843, + "loss": 0.1393, + "num_input_tokens_seen": 40765328, + "step": 18860 + }, + { + "epoch": 3.0774877650897228, + "grad_norm": 0.00508854491636157, + "learning_rate": 0.000991187319701572, + "loss": 0.0157, + "num_input_tokens_seen": 40776304, + "step": 18865 + }, + { + "epoch": 3.0783034257748776, + "grad_norm": 0.31696730852127075, + "learning_rate": 0.0009911740096050252, + "loss": 0.1645, + "num_input_tokens_seen": 40787376, + "step": 18870 + }, + { + "epoch": 3.0791190864600324, + "grad_norm": 0.08231733739376068, + "learning_rate": 0.0009911606895542143, + "loss": 0.0805, + "num_input_tokens_seen": 40798032, + "step": 18875 + }, + { + "epoch": 3.0799347471451877, + "grad_norm": 0.2622213363647461, + "learning_rate": 0.0009911473595494089, + "loss": 0.109, + "num_input_tokens_seen": 40809456, + "step": 18880 + }, + { + "epoch": 3.0807504078303425, + "grad_norm": 0.026339467614889145, + "learning_rate": 0.0009911340195908791, + "loss": 0.0945, + "num_input_tokens_seen": 40819184, + "step": 18885 + }, + { + "epoch": 3.0815660685154977, + "grad_norm": 0.022269627079367638, + "learning_rate": 0.0009911206696788955, + "loss": 0.096, + "num_input_tokens_seen": 40831088, + "step": 18890 + }, + { + "epoch": 3.0823817292006526, + "grad_norm": 0.13229554891586304, + "learning_rate": 0.0009911073098137285, + "loss": 0.113, + "num_input_tokens_seen": 40842160, + "step": 18895 + }, + { + "epoch": 3.0831973898858074, + "grad_norm": 0.15041932463645935, + "learning_rate": 0.0009910939399956488, + "loss": 0.2426, + "num_input_tokens_seen": 40852304, + "step": 18900 + }, + { + "epoch": 3.0840130505709626, + "grad_norm": 0.10796932131052017, + "learning_rate": 0.0009910805602249273, + "loss": 0.0974, + "num_input_tokens_seen": 40864816, + "step": 18905 + }, + { + "epoch": 3.0848287112561175, + "grad_norm": 0.30564913153648376, + "learning_rate": 0.0009910671705018353, + "loss": 0.151, + "num_input_tokens_seen": 40875632, + "step": 18910 + }, + { + "epoch": 3.0856443719412723, + "grad_norm": 0.08066358417272568, + "learning_rate": 0.000991053770826644, + "loss": 0.0565, + "num_input_tokens_seen": 40886896, + "step": 18915 + }, + { + "epoch": 3.0864600326264275, + "grad_norm": 0.07252980023622513, + "learning_rate": 0.0009910403611996252, + "loss": 0.2728, + "num_input_tokens_seen": 40897744, + "step": 18920 + }, + { + "epoch": 3.0872756933115824, + "grad_norm": 0.046622633934020996, + "learning_rate": 0.0009910269416210508, + "loss": 0.1146, + "num_input_tokens_seen": 40907184, + "step": 18925 + }, + { + "epoch": 3.088091353996737, + "grad_norm": 0.13025681674480438, + "learning_rate": 0.0009910135120911924, + "loss": 0.0651, + "num_input_tokens_seen": 40918160, + "step": 18930 + }, + { + "epoch": 3.0889070146818924, + "grad_norm": 0.06960994750261307, + "learning_rate": 0.0009910000726103222, + "loss": 0.2516, + "num_input_tokens_seen": 40927856, + "step": 18935 + }, + { + "epoch": 3.0897226753670473, + "grad_norm": 0.1453385353088379, + "learning_rate": 0.0009909866231787125, + "loss": 0.0677, + "num_input_tokens_seen": 40939088, + "step": 18940 + }, + { + "epoch": 3.090538336052202, + "grad_norm": 0.07813244313001633, + "learning_rate": 0.0009909731637966362, + "loss": 0.0854, + "num_input_tokens_seen": 40949008, + "step": 18945 + }, + { + "epoch": 3.0913539967373573, + "grad_norm": 0.1851913183927536, + "learning_rate": 0.0009909596944643658, + "loss": 0.1805, + "num_input_tokens_seen": 40959984, + "step": 18950 + }, + { + "epoch": 3.092169657422512, + "grad_norm": 0.04275398701429367, + "learning_rate": 0.0009909462151821745, + "loss": 0.1174, + "num_input_tokens_seen": 40971024, + "step": 18955 + }, + { + "epoch": 3.0929853181076674, + "grad_norm": 0.0939917117357254, + "learning_rate": 0.0009909327259503351, + "loss": 0.0721, + "num_input_tokens_seen": 40984112, + "step": 18960 + }, + { + "epoch": 3.0938009787928222, + "grad_norm": 0.006666467059403658, + "learning_rate": 0.0009909192267691215, + "loss": 0.1091, + "num_input_tokens_seen": 40995504, + "step": 18965 + }, + { + "epoch": 3.094616639477977, + "grad_norm": 0.008724762126803398, + "learning_rate": 0.000990905717638807, + "loss": 0.0256, + "num_input_tokens_seen": 41006960, + "step": 18970 + }, + { + "epoch": 3.0954323001631323, + "grad_norm": 0.02470196783542633, + "learning_rate": 0.000990892198559665, + "loss": 0.1204, + "num_input_tokens_seen": 41017104, + "step": 18975 + }, + { + "epoch": 3.096247960848287, + "grad_norm": 0.21092841029167175, + "learning_rate": 0.0009908786695319702, + "loss": 0.114, + "num_input_tokens_seen": 41028368, + "step": 18980 + }, + { + "epoch": 3.097063621533442, + "grad_norm": 0.04528717324137688, + "learning_rate": 0.0009908651305559964, + "loss": 0.0845, + "num_input_tokens_seen": 41038960, + "step": 18985 + }, + { + "epoch": 3.097879282218597, + "grad_norm": 0.10174711793661118, + "learning_rate": 0.000990851581632018, + "loss": 0.0807, + "num_input_tokens_seen": 41049008, + "step": 18990 + }, + { + "epoch": 3.098694942903752, + "grad_norm": 0.42217883467674255, + "learning_rate": 0.0009908380227603094, + "loss": 0.2884, + "num_input_tokens_seen": 41060400, + "step": 18995 + }, + { + "epoch": 3.099510603588907, + "grad_norm": 0.020816409960389137, + "learning_rate": 0.000990824453941146, + "loss": 0.0787, + "num_input_tokens_seen": 41072112, + "step": 19000 + }, + { + "epoch": 3.100326264274062, + "grad_norm": 0.035774942487478256, + "learning_rate": 0.000990810875174802, + "loss": 0.0627, + "num_input_tokens_seen": 41083664, + "step": 19005 + }, + { + "epoch": 3.101141924959217, + "grad_norm": 0.015757689252495766, + "learning_rate": 0.0009907972864615531, + "loss": 0.1523, + "num_input_tokens_seen": 41093232, + "step": 19010 + }, + { + "epoch": 3.1019575856443717, + "grad_norm": 0.12344611436128616, + "learning_rate": 0.0009907836878016746, + "loss": 0.1165, + "num_input_tokens_seen": 41105328, + "step": 19015 + }, + { + "epoch": 3.102773246329527, + "grad_norm": 0.08110906928777695, + "learning_rate": 0.000990770079195442, + "loss": 0.0464, + "num_input_tokens_seen": 41115504, + "step": 19020 + }, + { + "epoch": 3.103588907014682, + "grad_norm": 0.07154972851276398, + "learning_rate": 0.0009907564606431315, + "loss": 0.1205, + "num_input_tokens_seen": 41126480, + "step": 19025 + }, + { + "epoch": 3.104404567699837, + "grad_norm": 0.007094620727002621, + "learning_rate": 0.0009907428321450182, + "loss": 0.0812, + "num_input_tokens_seen": 41138512, + "step": 19030 + }, + { + "epoch": 3.105220228384992, + "grad_norm": 0.02809945121407509, + "learning_rate": 0.0009907291937013792, + "loss": 0.0625, + "num_input_tokens_seen": 41150224, + "step": 19035 + }, + { + "epoch": 3.1060358890701467, + "grad_norm": 0.06788235157728195, + "learning_rate": 0.0009907155453124906, + "loss": 0.0248, + "num_input_tokens_seen": 41161392, + "step": 19040 + }, + { + "epoch": 3.106851549755302, + "grad_norm": 0.01055830903351307, + "learning_rate": 0.0009907018869786289, + "loss": 0.0953, + "num_input_tokens_seen": 41172592, + "step": 19045 + }, + { + "epoch": 3.107667210440457, + "grad_norm": 0.07226168364286423, + "learning_rate": 0.0009906882187000708, + "loss": 0.1012, + "num_input_tokens_seen": 41183792, + "step": 19050 + }, + { + "epoch": 3.1084828711256116, + "grad_norm": 0.008402747102081776, + "learning_rate": 0.0009906745404770936, + "loss": 0.0609, + "num_input_tokens_seen": 41192880, + "step": 19055 + }, + { + "epoch": 3.109298531810767, + "grad_norm": 0.015082316473126411, + "learning_rate": 0.0009906608523099743, + "loss": 0.0394, + "num_input_tokens_seen": 41202768, + "step": 19060 + }, + { + "epoch": 3.1101141924959217, + "grad_norm": 0.0037719886749982834, + "learning_rate": 0.0009906471541989905, + "loss": 0.0603, + "num_input_tokens_seen": 41213552, + "step": 19065 + }, + { + "epoch": 3.1109298531810765, + "grad_norm": 0.2002812772989273, + "learning_rate": 0.0009906334461444195, + "loss": 0.1401, + "num_input_tokens_seen": 41223920, + "step": 19070 + }, + { + "epoch": 3.1117455138662318, + "grad_norm": 0.05566996708512306, + "learning_rate": 0.0009906197281465395, + "loss": 0.0523, + "num_input_tokens_seen": 41235440, + "step": 19075 + }, + { + "epoch": 3.1125611745513866, + "grad_norm": 0.09809504449367523, + "learning_rate": 0.0009906060002056283, + "loss": 0.0989, + "num_input_tokens_seen": 41247184, + "step": 19080 + }, + { + "epoch": 3.1133768352365414, + "grad_norm": 0.05056000500917435, + "learning_rate": 0.000990592262321964, + "loss": 0.2402, + "num_input_tokens_seen": 41257552, + "step": 19085 + }, + { + "epoch": 3.1141924959216967, + "grad_norm": 0.22058925032615662, + "learning_rate": 0.0009905785144958253, + "loss": 0.2654, + "num_input_tokens_seen": 41268496, + "step": 19090 + }, + { + "epoch": 3.1150081566068515, + "grad_norm": 0.2546690106391907, + "learning_rate": 0.0009905647567274905, + "loss": 0.1255, + "num_input_tokens_seen": 41278640, + "step": 19095 + }, + { + "epoch": 3.1158238172920063, + "grad_norm": 0.11807727813720703, + "learning_rate": 0.0009905509890172385, + "loss": 0.0706, + "num_input_tokens_seen": 41288976, + "step": 19100 + }, + { + "epoch": 3.1166394779771616, + "grad_norm": 0.06996791809797287, + "learning_rate": 0.0009905372113653487, + "loss": 0.1316, + "num_input_tokens_seen": 41300912, + "step": 19105 + }, + { + "epoch": 3.1174551386623164, + "grad_norm": 0.1596715748310089, + "learning_rate": 0.0009905234237721, + "loss": 0.3419, + "num_input_tokens_seen": 41311088, + "step": 19110 + }, + { + "epoch": 3.1182707993474716, + "grad_norm": 0.03004053235054016, + "learning_rate": 0.0009905096262377716, + "loss": 0.0519, + "num_input_tokens_seen": 41321488, + "step": 19115 + }, + { + "epoch": 3.1190864600326265, + "grad_norm": 0.16573455929756165, + "learning_rate": 0.0009904958187626433, + "loss": 0.1674, + "num_input_tokens_seen": 41333008, + "step": 19120 + }, + { + "epoch": 3.1199021207177813, + "grad_norm": 0.10211943089962006, + "learning_rate": 0.0009904820013469952, + "loss": 0.2155, + "num_input_tokens_seen": 41343920, + "step": 19125 + }, + { + "epoch": 3.1207177814029365, + "grad_norm": 0.08475879579782486, + "learning_rate": 0.0009904681739911073, + "loss": 0.0721, + "num_input_tokens_seen": 41353488, + "step": 19130 + }, + { + "epoch": 3.1215334420880914, + "grad_norm": 0.2866770923137665, + "learning_rate": 0.0009904543366952593, + "loss": 0.1097, + "num_input_tokens_seen": 41364112, + "step": 19135 + }, + { + "epoch": 3.122349102773246, + "grad_norm": 0.033991675823926926, + "learning_rate": 0.0009904404894597323, + "loss": 0.0394, + "num_input_tokens_seen": 41375664, + "step": 19140 + }, + { + "epoch": 3.1231647634584014, + "grad_norm": 0.009995969012379646, + "learning_rate": 0.0009904266322848063, + "loss": 0.043, + "num_input_tokens_seen": 41387440, + "step": 19145 + }, + { + "epoch": 3.1239804241435563, + "grad_norm": 0.05611064285039902, + "learning_rate": 0.0009904127651707627, + "loss": 0.0443, + "num_input_tokens_seen": 41399664, + "step": 19150 + }, + { + "epoch": 3.124796084828711, + "grad_norm": 0.07509331405162811, + "learning_rate": 0.000990398888117882, + "loss": 0.2621, + "num_input_tokens_seen": 41409296, + "step": 19155 + }, + { + "epoch": 3.1256117455138663, + "grad_norm": 0.14326560497283936, + "learning_rate": 0.0009903850011264458, + "loss": 0.1059, + "num_input_tokens_seen": 41421008, + "step": 19160 + }, + { + "epoch": 3.126427406199021, + "grad_norm": 0.1420290321111679, + "learning_rate": 0.0009903711041967357, + "loss": 0.1972, + "num_input_tokens_seen": 41431984, + "step": 19165 + }, + { + "epoch": 3.1272430668841764, + "grad_norm": 0.018591681495308876, + "learning_rate": 0.000990357197329033, + "loss": 0.0265, + "num_input_tokens_seen": 41442768, + "step": 19170 + }, + { + "epoch": 3.1280587275693312, + "grad_norm": 0.017070859670639038, + "learning_rate": 0.0009903432805236194, + "loss": 0.05, + "num_input_tokens_seen": 41452240, + "step": 19175 + }, + { + "epoch": 3.128874388254486, + "grad_norm": 0.22679705917835236, + "learning_rate": 0.0009903293537807773, + "loss": 0.2145, + "num_input_tokens_seen": 41463696, + "step": 19180 + }, + { + "epoch": 3.1296900489396413, + "grad_norm": 0.10408644378185272, + "learning_rate": 0.0009903154171007889, + "loss": 0.0406, + "num_input_tokens_seen": 41474608, + "step": 19185 + }, + { + "epoch": 3.130505709624796, + "grad_norm": 0.00605523819103837, + "learning_rate": 0.0009903014704839366, + "loss": 0.067, + "num_input_tokens_seen": 41485968, + "step": 19190 + }, + { + "epoch": 3.131321370309951, + "grad_norm": 0.11108432710170746, + "learning_rate": 0.000990287513930503, + "loss": 0.0651, + "num_input_tokens_seen": 41496688, + "step": 19195 + }, + { + "epoch": 3.132137030995106, + "grad_norm": 0.004489239305257797, + "learning_rate": 0.000990273547440771, + "loss": 0.0742, + "num_input_tokens_seen": 41507824, + "step": 19200 + }, + { + "epoch": 3.132952691680261, + "grad_norm": 0.0032085098791867495, + "learning_rate": 0.0009902595710150233, + "loss": 0.0714, + "num_input_tokens_seen": 41519440, + "step": 19205 + }, + { + "epoch": 3.133768352365416, + "grad_norm": 0.004664691165089607, + "learning_rate": 0.0009902455846535437, + "loss": 0.0315, + "num_input_tokens_seen": 41530768, + "step": 19210 + }, + { + "epoch": 3.134584013050571, + "grad_norm": 0.2081509232521057, + "learning_rate": 0.0009902315883566152, + "loss": 0.2124, + "num_input_tokens_seen": 41541808, + "step": 19215 + }, + { + "epoch": 3.135399673735726, + "grad_norm": 0.055930253118276596, + "learning_rate": 0.000990217582124522, + "loss": 0.0733, + "num_input_tokens_seen": 41552400, + "step": 19220 + }, + { + "epoch": 3.1362153344208807, + "grad_norm": 0.023356657475233078, + "learning_rate": 0.0009902035659575474, + "loss": 0.0448, + "num_input_tokens_seen": 41562416, + "step": 19225 + }, + { + "epoch": 3.137030995106036, + "grad_norm": 0.010270596481859684, + "learning_rate": 0.0009901895398559757, + "loss": 0.0487, + "num_input_tokens_seen": 41574000, + "step": 19230 + }, + { + "epoch": 3.137846655791191, + "grad_norm": 0.06142498180270195, + "learning_rate": 0.0009901755038200912, + "loss": 0.1271, + "num_input_tokens_seen": 41584400, + "step": 19235 + }, + { + "epoch": 3.1386623164763456, + "grad_norm": 0.059650782495737076, + "learning_rate": 0.0009901614578501782, + "loss": 0.1603, + "num_input_tokens_seen": 41596656, + "step": 19240 + }, + { + "epoch": 3.139477977161501, + "grad_norm": 0.056524645537137985, + "learning_rate": 0.0009901474019465215, + "loss": 0.0678, + "num_input_tokens_seen": 41606960, + "step": 19245 + }, + { + "epoch": 3.1402936378466557, + "grad_norm": 0.038970183581113815, + "learning_rate": 0.0009901333361094057, + "loss": 0.0453, + "num_input_tokens_seen": 41617936, + "step": 19250 + }, + { + "epoch": 3.141109298531811, + "grad_norm": 0.3043142855167389, + "learning_rate": 0.0009901192603391162, + "loss": 0.1262, + "num_input_tokens_seen": 41628720, + "step": 19255 + }, + { + "epoch": 3.141924959216966, + "grad_norm": 0.2813945710659027, + "learning_rate": 0.0009901051746359381, + "loss": 0.0738, + "num_input_tokens_seen": 41639408, + "step": 19260 + }, + { + "epoch": 3.1427406199021206, + "grad_norm": 0.024280589073896408, + "learning_rate": 0.0009900910790001571, + "loss": 0.0133, + "num_input_tokens_seen": 41650128, + "step": 19265 + }, + { + "epoch": 3.143556280587276, + "grad_norm": 0.25504744052886963, + "learning_rate": 0.0009900769734320586, + "loss": 0.1879, + "num_input_tokens_seen": 41661168, + "step": 19270 + }, + { + "epoch": 3.1443719412724307, + "grad_norm": 0.08303016424179077, + "learning_rate": 0.0009900628579319283, + "loss": 0.1217, + "num_input_tokens_seen": 41672656, + "step": 19275 + }, + { + "epoch": 3.1451876019575855, + "grad_norm": 0.04942861944437027, + "learning_rate": 0.0009900487325000527, + "loss": 0.1128, + "num_input_tokens_seen": 41684176, + "step": 19280 + }, + { + "epoch": 3.1460032626427408, + "grad_norm": 0.40202391147613525, + "learning_rate": 0.0009900345971367178, + "loss": 0.1924, + "num_input_tokens_seen": 41694896, + "step": 19285 + }, + { + "epoch": 3.1468189233278956, + "grad_norm": 0.013745547272264957, + "learning_rate": 0.00099002045184221, + "loss": 0.08, + "num_input_tokens_seen": 41705840, + "step": 19290 + }, + { + "epoch": 3.1476345840130504, + "grad_norm": 0.23511438071727753, + "learning_rate": 0.0009900062966168163, + "loss": 0.1777, + "num_input_tokens_seen": 41717104, + "step": 19295 + }, + { + "epoch": 3.1484502446982057, + "grad_norm": 0.5526800155639648, + "learning_rate": 0.0009899921314608232, + "loss": 0.1843, + "num_input_tokens_seen": 41727632, + "step": 19300 + }, + { + "epoch": 3.1492659053833605, + "grad_norm": 0.022193720564246178, + "learning_rate": 0.0009899779563745182, + "loss": 0.0536, + "num_input_tokens_seen": 41738608, + "step": 19305 + }, + { + "epoch": 3.1500815660685153, + "grad_norm": 0.4144640564918518, + "learning_rate": 0.0009899637713581882, + "loss": 0.0486, + "num_input_tokens_seen": 41749072, + "step": 19310 + }, + { + "epoch": 3.1508972267536706, + "grad_norm": 0.060778357088565826, + "learning_rate": 0.0009899495764121207, + "loss": 0.2802, + "num_input_tokens_seen": 41760400, + "step": 19315 + }, + { + "epoch": 3.1517128874388254, + "grad_norm": 0.17989078164100647, + "learning_rate": 0.0009899353715366037, + "loss": 0.1911, + "num_input_tokens_seen": 41771184, + "step": 19320 + }, + { + "epoch": 3.15252854812398, + "grad_norm": 0.025782205164432526, + "learning_rate": 0.0009899211567319247, + "loss": 0.0811, + "num_input_tokens_seen": 41782896, + "step": 19325 + }, + { + "epoch": 3.1533442088091355, + "grad_norm": 0.016936376690864563, + "learning_rate": 0.000989906931998372, + "loss": 0.1149, + "num_input_tokens_seen": 41793616, + "step": 19330 + }, + { + "epoch": 3.1541598694942903, + "grad_norm": 0.035956088453531265, + "learning_rate": 0.000989892697336234, + "loss": 0.0452, + "num_input_tokens_seen": 41806000, + "step": 19335 + }, + { + "epoch": 3.1549755301794455, + "grad_norm": 0.06265423446893692, + "learning_rate": 0.0009898784527457988, + "loss": 0.0659, + "num_input_tokens_seen": 41816016, + "step": 19340 + }, + { + "epoch": 3.1557911908646004, + "grad_norm": 0.048211682587862015, + "learning_rate": 0.0009898641982273553, + "loss": 0.0343, + "num_input_tokens_seen": 41826320, + "step": 19345 + }, + { + "epoch": 3.156606851549755, + "grad_norm": 0.10827599465847015, + "learning_rate": 0.0009898499337811925, + "loss": 0.1373, + "num_input_tokens_seen": 41837328, + "step": 19350 + }, + { + "epoch": 3.1574225122349104, + "grad_norm": 0.012927313335239887, + "learning_rate": 0.0009898356594075992, + "loss": 0.1189, + "num_input_tokens_seen": 41847856, + "step": 19355 + }, + { + "epoch": 3.1582381729200653, + "grad_norm": 0.08092676848173141, + "learning_rate": 0.0009898213751068652, + "loss": 0.0205, + "num_input_tokens_seen": 41858288, + "step": 19360 + }, + { + "epoch": 3.15905383360522, + "grad_norm": 0.12177547812461853, + "learning_rate": 0.0009898070808792795, + "loss": 0.1304, + "num_input_tokens_seen": 41868496, + "step": 19365 + }, + { + "epoch": 3.1598694942903753, + "grad_norm": 0.064247727394104, + "learning_rate": 0.0009897927767251319, + "loss": 0.0914, + "num_input_tokens_seen": 41879312, + "step": 19370 + }, + { + "epoch": 3.16068515497553, + "grad_norm": 0.05906981602311134, + "learning_rate": 0.0009897784626447122, + "loss": 0.0883, + "num_input_tokens_seen": 41890736, + "step": 19375 + }, + { + "epoch": 3.161500815660685, + "grad_norm": 0.010267877951264381, + "learning_rate": 0.0009897641386383106, + "loss": 0.0268, + "num_input_tokens_seen": 41902704, + "step": 19380 + }, + { + "epoch": 3.1623164763458402, + "grad_norm": 0.006765549536794424, + "learning_rate": 0.0009897498047062177, + "loss": 0.1427, + "num_input_tokens_seen": 41913200, + "step": 19385 + }, + { + "epoch": 3.163132137030995, + "grad_norm": 0.035863492637872696, + "learning_rate": 0.0009897354608487234, + "loss": 0.1432, + "num_input_tokens_seen": 41923792, + "step": 19390 + }, + { + "epoch": 3.1639477977161503, + "grad_norm": 0.004194718785583973, + "learning_rate": 0.000989721107066119, + "loss": 0.0884, + "num_input_tokens_seen": 41934960, + "step": 19395 + }, + { + "epoch": 3.164763458401305, + "grad_norm": 0.010995978489518166, + "learning_rate": 0.000989706743358695, + "loss": 0.0438, + "num_input_tokens_seen": 41945904, + "step": 19400 + }, + { + "epoch": 3.16557911908646, + "grad_norm": 0.28057172894477844, + "learning_rate": 0.0009896923697267426, + "loss": 0.252, + "num_input_tokens_seen": 41957488, + "step": 19405 + }, + { + "epoch": 3.166394779771615, + "grad_norm": 0.09755899012088776, + "learning_rate": 0.0009896779861705532, + "loss": 0.1373, + "num_input_tokens_seen": 41968784, + "step": 19410 + }, + { + "epoch": 3.16721044045677, + "grad_norm": 0.04553823918104172, + "learning_rate": 0.000989663592690418, + "loss": 0.2561, + "num_input_tokens_seen": 41980048, + "step": 19415 + }, + { + "epoch": 3.168026101141925, + "grad_norm": 0.05321956053376198, + "learning_rate": 0.0009896491892866291, + "loss": 0.0508, + "num_input_tokens_seen": 41989904, + "step": 19420 + }, + { + "epoch": 3.16884176182708, + "grad_norm": 0.24264544248580933, + "learning_rate": 0.0009896347759594782, + "loss": 0.2203, + "num_input_tokens_seen": 42000432, + "step": 19425 + }, + { + "epoch": 3.169657422512235, + "grad_norm": 0.10809178650379181, + "learning_rate": 0.0009896203527092573, + "loss": 0.1427, + "num_input_tokens_seen": 42010576, + "step": 19430 + }, + { + "epoch": 3.1704730831973897, + "grad_norm": 0.046370748430490494, + "learning_rate": 0.000989605919536259, + "loss": 0.0368, + "num_input_tokens_seen": 42023536, + "step": 19435 + }, + { + "epoch": 3.171288743882545, + "grad_norm": 0.02008720114827156, + "learning_rate": 0.0009895914764407755, + "loss": 0.1091, + "num_input_tokens_seen": 42035792, + "step": 19440 + }, + { + "epoch": 3.1721044045677, + "grad_norm": 0.12499675899744034, + "learning_rate": 0.0009895770234230996, + "loss": 0.052, + "num_input_tokens_seen": 42045968, + "step": 19445 + }, + { + "epoch": 3.1729200652528546, + "grad_norm": 0.02297365851700306, + "learning_rate": 0.0009895625604835244, + "loss": 0.1416, + "num_input_tokens_seen": 42057616, + "step": 19450 + }, + { + "epoch": 3.17373572593801, + "grad_norm": 0.02101576328277588, + "learning_rate": 0.0009895480876223428, + "loss": 0.0367, + "num_input_tokens_seen": 42067536, + "step": 19455 + }, + { + "epoch": 3.1745513866231647, + "grad_norm": 0.05291339010000229, + "learning_rate": 0.000989533604839848, + "loss": 0.101, + "num_input_tokens_seen": 42079024, + "step": 19460 + }, + { + "epoch": 3.1753670473083195, + "grad_norm": 0.03308214247226715, + "learning_rate": 0.0009895191121363338, + "loss": 0.0384, + "num_input_tokens_seen": 42090864, + "step": 19465 + }, + { + "epoch": 3.176182707993475, + "grad_norm": 0.007731168996542692, + "learning_rate": 0.0009895046095120938, + "loss": 0.026, + "num_input_tokens_seen": 42101776, + "step": 19470 + }, + { + "epoch": 3.1769983686786296, + "grad_norm": 0.016055205836892128, + "learning_rate": 0.0009894900969674221, + "loss": 0.1852, + "num_input_tokens_seen": 42112752, + "step": 19475 + }, + { + "epoch": 3.177814029363785, + "grad_norm": 0.02115645818412304, + "learning_rate": 0.0009894755745026124, + "loss": 0.2081, + "num_input_tokens_seen": 42125072, + "step": 19480 + }, + { + "epoch": 3.1786296900489397, + "grad_norm": 0.0125564094632864, + "learning_rate": 0.0009894610421179594, + "loss": 0.0768, + "num_input_tokens_seen": 42136368, + "step": 19485 + }, + { + "epoch": 3.1794453507340945, + "grad_norm": 0.00938443560153246, + "learning_rate": 0.0009894464998137572, + "loss": 0.0911, + "num_input_tokens_seen": 42146576, + "step": 19490 + }, + { + "epoch": 3.1802610114192498, + "grad_norm": 0.2912297546863556, + "learning_rate": 0.000989431947590301, + "loss": 0.0781, + "num_input_tokens_seen": 42157680, + "step": 19495 + }, + { + "epoch": 3.1810766721044046, + "grad_norm": 0.08204010128974915, + "learning_rate": 0.0009894173854478854, + "loss": 0.0977, + "num_input_tokens_seen": 42168720, + "step": 19500 + }, + { + "epoch": 3.1818923327895594, + "grad_norm": 0.041957221925258636, + "learning_rate": 0.0009894028133868055, + "loss": 0.1634, + "num_input_tokens_seen": 42179376, + "step": 19505 + }, + { + "epoch": 3.1827079934747147, + "grad_norm": 0.2277340292930603, + "learning_rate": 0.000989388231407357, + "loss": 0.0766, + "num_input_tokens_seen": 42190416, + "step": 19510 + }, + { + "epoch": 3.1835236541598695, + "grad_norm": 0.1018163338303566, + "learning_rate": 0.000989373639509835, + "loss": 0.0571, + "num_input_tokens_seen": 42200752, + "step": 19515 + }, + { + "epoch": 3.1843393148450243, + "grad_norm": 0.023954367265105247, + "learning_rate": 0.0009893590376945354, + "loss": 0.0406, + "num_input_tokens_seen": 42210352, + "step": 19520 + }, + { + "epoch": 3.1851549755301796, + "grad_norm": 0.18598498404026031, + "learning_rate": 0.000989344425961754, + "loss": 0.2032, + "num_input_tokens_seen": 42222256, + "step": 19525 + }, + { + "epoch": 3.1859706362153344, + "grad_norm": 0.2285570353269577, + "learning_rate": 0.000989329804311787, + "loss": 0.1581, + "num_input_tokens_seen": 42234320, + "step": 19530 + }, + { + "epoch": 3.186786296900489, + "grad_norm": 0.03166361153125763, + "learning_rate": 0.000989315172744931, + "loss": 0.231, + "num_input_tokens_seen": 42244688, + "step": 19535 + }, + { + "epoch": 3.1876019575856445, + "grad_norm": 0.253466933965683, + "learning_rate": 0.0009893005312614823, + "loss": 0.1098, + "num_input_tokens_seen": 42254736, + "step": 19540 + }, + { + "epoch": 3.1884176182707993, + "grad_norm": 0.016121881082654, + "learning_rate": 0.0009892858798617374, + "loss": 0.073, + "num_input_tokens_seen": 42266992, + "step": 19545 + }, + { + "epoch": 3.189233278955954, + "grad_norm": 0.10315223783254623, + "learning_rate": 0.0009892712185459935, + "loss": 0.1024, + "num_input_tokens_seen": 42277968, + "step": 19550 + }, + { + "epoch": 3.1900489396411094, + "grad_norm": 0.2475394308567047, + "learning_rate": 0.0009892565473145476, + "loss": 0.1473, + "num_input_tokens_seen": 42289680, + "step": 19555 + }, + { + "epoch": 3.190864600326264, + "grad_norm": 0.037393637001514435, + "learning_rate": 0.0009892418661676973, + "loss": 0.127, + "num_input_tokens_seen": 42299696, + "step": 19560 + }, + { + "epoch": 3.1916802610114194, + "grad_norm": 0.019417136907577515, + "learning_rate": 0.0009892271751057399, + "loss": 0.0779, + "num_input_tokens_seen": 42309968, + "step": 19565 + }, + { + "epoch": 3.1924959216965743, + "grad_norm": 0.23461590707302094, + "learning_rate": 0.000989212474128973, + "loss": 0.1245, + "num_input_tokens_seen": 42319952, + "step": 19570 + }, + { + "epoch": 3.193311582381729, + "grad_norm": 0.2393941581249237, + "learning_rate": 0.0009891977632376949, + "loss": 0.1614, + "num_input_tokens_seen": 42330704, + "step": 19575 + }, + { + "epoch": 3.1941272430668843, + "grad_norm": 0.11234183609485626, + "learning_rate": 0.0009891830424322034, + "loss": 0.1503, + "num_input_tokens_seen": 42340944, + "step": 19580 + }, + { + "epoch": 3.194942903752039, + "grad_norm": 0.00959884561598301, + "learning_rate": 0.000989168311712797, + "loss": 0.0682, + "num_input_tokens_seen": 42351696, + "step": 19585 + }, + { + "epoch": 3.195758564437194, + "grad_norm": 0.04411447048187256, + "learning_rate": 0.0009891535710797744, + "loss": 0.0312, + "num_input_tokens_seen": 42363728, + "step": 19590 + }, + { + "epoch": 3.1965742251223492, + "grad_norm": 0.1372920125722885, + "learning_rate": 0.0009891388205334338, + "loss": 0.3343, + "num_input_tokens_seen": 42374992, + "step": 19595 + }, + { + "epoch": 3.197389885807504, + "grad_norm": 0.0299091674387455, + "learning_rate": 0.0009891240600740747, + "loss": 0.0688, + "num_input_tokens_seen": 42385232, + "step": 19600 + }, + { + "epoch": 3.198205546492659, + "grad_norm": 0.012968046590685844, + "learning_rate": 0.000989109289701996, + "loss": 0.0465, + "num_input_tokens_seen": 42396176, + "step": 19605 + }, + { + "epoch": 3.199021207177814, + "grad_norm": 0.2268250286579132, + "learning_rate": 0.000989094509417497, + "loss": 0.1662, + "num_input_tokens_seen": 42407632, + "step": 19610 + }, + { + "epoch": 3.199836867862969, + "grad_norm": 0.025382978841662407, + "learning_rate": 0.0009890797192208774, + "loss": 0.0664, + "num_input_tokens_seen": 42417776, + "step": 19615 + }, + { + "epoch": 3.200652528548124, + "grad_norm": 0.07793654501438141, + "learning_rate": 0.0009890649191124368, + "loss": 0.1867, + "num_input_tokens_seen": 42428400, + "step": 19620 + }, + { + "epoch": 3.201468189233279, + "grad_norm": 0.11532403528690338, + "learning_rate": 0.000989050109092475, + "loss": 0.0966, + "num_input_tokens_seen": 42440176, + "step": 19625 + }, + { + "epoch": 3.202283849918434, + "grad_norm": 0.15272283554077148, + "learning_rate": 0.0009890352891612927, + "loss": 0.1024, + "num_input_tokens_seen": 42451632, + "step": 19630 + }, + { + "epoch": 3.203099510603589, + "grad_norm": 0.06868071109056473, + "learning_rate": 0.0009890204593191896, + "loss": 0.1599, + "num_input_tokens_seen": 42462672, + "step": 19635 + }, + { + "epoch": 3.203915171288744, + "grad_norm": 0.17394308745861053, + "learning_rate": 0.0009890056195664668, + "loss": 0.0878, + "num_input_tokens_seen": 42473040, + "step": 19640 + }, + { + "epoch": 3.2047308319738987, + "grad_norm": 0.060149114578962326, + "learning_rate": 0.0009889907699034246, + "loss": 0.045, + "num_input_tokens_seen": 42484816, + "step": 19645 + }, + { + "epoch": 3.205546492659054, + "grad_norm": 0.015076788142323494, + "learning_rate": 0.000988975910330364, + "loss": 0.0669, + "num_input_tokens_seen": 42495216, + "step": 19650 + }, + { + "epoch": 3.206362153344209, + "grad_norm": 0.088878333568573, + "learning_rate": 0.0009889610408475864, + "loss": 0.19, + "num_input_tokens_seen": 42506160, + "step": 19655 + }, + { + "epoch": 3.2071778140293636, + "grad_norm": 0.15855666995048523, + "learning_rate": 0.000988946161455393, + "loss": 0.1424, + "num_input_tokens_seen": 42516336, + "step": 19660 + }, + { + "epoch": 3.207993474714519, + "grad_norm": 0.02387774921953678, + "learning_rate": 0.0009889312721540855, + "loss": 0.077, + "num_input_tokens_seen": 42525776, + "step": 19665 + }, + { + "epoch": 3.2088091353996737, + "grad_norm": 0.05558675155043602, + "learning_rate": 0.0009889163729439653, + "loss": 0.1243, + "num_input_tokens_seen": 42536624, + "step": 19670 + }, + { + "epoch": 3.2096247960848285, + "grad_norm": 0.0066003985702991486, + "learning_rate": 0.0009889014638253346, + "loss": 0.0481, + "num_input_tokens_seen": 42547216, + "step": 19675 + }, + { + "epoch": 3.210440456769984, + "grad_norm": 0.16620665788650513, + "learning_rate": 0.0009888865447984956, + "loss": 0.0817, + "num_input_tokens_seen": 42557328, + "step": 19680 + }, + { + "epoch": 3.2112561174551386, + "grad_norm": 0.05774753913283348, + "learning_rate": 0.0009888716158637505, + "loss": 0.068, + "num_input_tokens_seen": 42568016, + "step": 19685 + }, + { + "epoch": 3.2120717781402934, + "grad_norm": 0.02686813473701477, + "learning_rate": 0.000988856677021402, + "loss": 0.0304, + "num_input_tokens_seen": 42580240, + "step": 19690 + }, + { + "epoch": 3.2128874388254487, + "grad_norm": 0.018097640946507454, + "learning_rate": 0.0009888417282717529, + "loss": 0.1011, + "num_input_tokens_seen": 42590928, + "step": 19695 + }, + { + "epoch": 3.2137030995106035, + "grad_norm": 0.002314778044819832, + "learning_rate": 0.000988826769615106, + "loss": 0.1185, + "num_input_tokens_seen": 42602384, + "step": 19700 + }, + { + "epoch": 3.2145187601957588, + "grad_norm": 0.0739186555147171, + "learning_rate": 0.0009888118010517642, + "loss": 0.2168, + "num_input_tokens_seen": 42612240, + "step": 19705 + }, + { + "epoch": 3.2153344208809136, + "grad_norm": 0.054093651473522186, + "learning_rate": 0.0009887968225820315, + "loss": 0.0316, + "num_input_tokens_seen": 42624336, + "step": 19710 + }, + { + "epoch": 3.2161500815660684, + "grad_norm": 0.07303130626678467, + "learning_rate": 0.0009887818342062106, + "loss": 0.1457, + "num_input_tokens_seen": 42634672, + "step": 19715 + }, + { + "epoch": 3.2169657422512237, + "grad_norm": 0.07583710551261902, + "learning_rate": 0.0009887668359246063, + "loss": 0.0942, + "num_input_tokens_seen": 42646032, + "step": 19720 + }, + { + "epoch": 3.2177814029363785, + "grad_norm": 0.1961059272289276, + "learning_rate": 0.0009887518277375217, + "loss": 0.0917, + "num_input_tokens_seen": 42656848, + "step": 19725 + }, + { + "epoch": 3.2185970636215333, + "grad_norm": 0.01787407509982586, + "learning_rate": 0.0009887368096452617, + "loss": 0.036, + "num_input_tokens_seen": 42666800, + "step": 19730 + }, + { + "epoch": 3.2194127243066886, + "grad_norm": 0.1734134703874588, + "learning_rate": 0.0009887217816481298, + "loss": 0.0505, + "num_input_tokens_seen": 42678032, + "step": 19735 + }, + { + "epoch": 3.2202283849918434, + "grad_norm": 0.13812491297721863, + "learning_rate": 0.0009887067437464312, + "loss": 0.1628, + "num_input_tokens_seen": 42687984, + "step": 19740 + }, + { + "epoch": 3.221044045676998, + "grad_norm": 0.17805027961730957, + "learning_rate": 0.0009886916959404703, + "loss": 0.1175, + "num_input_tokens_seen": 42698896, + "step": 19745 + }, + { + "epoch": 3.2218597063621535, + "grad_norm": 0.012442238628864288, + "learning_rate": 0.0009886766382305526, + "loss": 0.0697, + "num_input_tokens_seen": 42709040, + "step": 19750 + }, + { + "epoch": 3.2226753670473083, + "grad_norm": 0.2859947085380554, + "learning_rate": 0.0009886615706169825, + "loss": 0.0914, + "num_input_tokens_seen": 42719088, + "step": 19755 + }, + { + "epoch": 3.223491027732463, + "grad_norm": 0.037967607378959656, + "learning_rate": 0.0009886464931000661, + "loss": 0.0984, + "num_input_tokens_seen": 42729616, + "step": 19760 + }, + { + "epoch": 3.2243066884176184, + "grad_norm": 0.006067187059670687, + "learning_rate": 0.0009886314056801084, + "loss": 0.0457, + "num_input_tokens_seen": 42740592, + "step": 19765 + }, + { + "epoch": 3.225122349102773, + "grad_norm": 0.2507992386817932, + "learning_rate": 0.0009886163083574154, + "loss": 0.1842, + "num_input_tokens_seen": 42751376, + "step": 19770 + }, + { + "epoch": 3.225938009787928, + "grad_norm": 0.006261878181248903, + "learning_rate": 0.000988601201132293, + "loss": 0.141, + "num_input_tokens_seen": 42763312, + "step": 19775 + }, + { + "epoch": 3.2267536704730833, + "grad_norm": 0.004393375013023615, + "learning_rate": 0.0009885860840050478, + "loss": 0.0486, + "num_input_tokens_seen": 42774896, + "step": 19780 + }, + { + "epoch": 3.227569331158238, + "grad_norm": 0.041983719915151596, + "learning_rate": 0.0009885709569759852, + "loss": 0.107, + "num_input_tokens_seen": 42786992, + "step": 19785 + }, + { + "epoch": 3.2283849918433933, + "grad_norm": 0.03765320032835007, + "learning_rate": 0.0009885558200454128, + "loss": 0.0924, + "num_input_tokens_seen": 42797776, + "step": 19790 + }, + { + "epoch": 3.229200652528548, + "grad_norm": 0.06044596806168556, + "learning_rate": 0.0009885406732136367, + "loss": 0.1155, + "num_input_tokens_seen": 42808848, + "step": 19795 + }, + { + "epoch": 3.230016313213703, + "grad_norm": 0.1069084033370018, + "learning_rate": 0.0009885255164809644, + "loss": 0.0783, + "num_input_tokens_seen": 42820784, + "step": 19800 + }, + { + "epoch": 3.2308319738988582, + "grad_norm": 0.14403347671031952, + "learning_rate": 0.0009885103498477026, + "loss": 0.0549, + "num_input_tokens_seen": 42832208, + "step": 19805 + }, + { + "epoch": 3.231647634584013, + "grad_norm": 0.10448440164327621, + "learning_rate": 0.0009884951733141586, + "loss": 0.2749, + "num_input_tokens_seen": 42843312, + "step": 19810 + }, + { + "epoch": 3.232463295269168, + "grad_norm": 0.24592241644859314, + "learning_rate": 0.0009884799868806406, + "loss": 0.1802, + "num_input_tokens_seen": 42854736, + "step": 19815 + }, + { + "epoch": 3.233278955954323, + "grad_norm": 0.046691689640283585, + "learning_rate": 0.000988464790547456, + "loss": 0.0578, + "num_input_tokens_seen": 42864656, + "step": 19820 + }, + { + "epoch": 3.234094616639478, + "grad_norm": 0.053699150681495667, + "learning_rate": 0.0009884495843149124, + "loss": 0.1522, + "num_input_tokens_seen": 42875344, + "step": 19825 + }, + { + "epoch": 3.2349102773246328, + "grad_norm": 0.04752179607748985, + "learning_rate": 0.0009884343681833185, + "loss": 0.1695, + "num_input_tokens_seen": 42885712, + "step": 19830 + }, + { + "epoch": 3.235725938009788, + "grad_norm": 0.03858195245265961, + "learning_rate": 0.0009884191421529825, + "loss": 0.1487, + "num_input_tokens_seen": 42898352, + "step": 19835 + }, + { + "epoch": 3.236541598694943, + "grad_norm": 0.10333125293254852, + "learning_rate": 0.000988403906224213, + "loss": 0.0636, + "num_input_tokens_seen": 42908496, + "step": 19840 + }, + { + "epoch": 3.237357259380098, + "grad_norm": 0.18015356361865997, + "learning_rate": 0.0009883886603973188, + "loss": 0.1176, + "num_input_tokens_seen": 42919280, + "step": 19845 + }, + { + "epoch": 3.238172920065253, + "grad_norm": 0.07732725888490677, + "learning_rate": 0.0009883734046726086, + "loss": 0.1254, + "num_input_tokens_seen": 42930160, + "step": 19850 + }, + { + "epoch": 3.2389885807504077, + "grad_norm": 0.09112097322940826, + "learning_rate": 0.0009883581390503922, + "loss": 0.0903, + "num_input_tokens_seen": 42941616, + "step": 19855 + }, + { + "epoch": 3.239804241435563, + "grad_norm": 0.16242296993732452, + "learning_rate": 0.0009883428635309784, + "loss": 0.1832, + "num_input_tokens_seen": 42952240, + "step": 19860 + }, + { + "epoch": 3.240619902120718, + "grad_norm": 0.14074952900409698, + "learning_rate": 0.0009883275781146768, + "loss": 0.1466, + "num_input_tokens_seen": 42962672, + "step": 19865 + }, + { + "epoch": 3.2414355628058726, + "grad_norm": 0.14547309279441833, + "learning_rate": 0.0009883122828017977, + "loss": 0.0943, + "num_input_tokens_seen": 42974800, + "step": 19870 + }, + { + "epoch": 3.242251223491028, + "grad_norm": 0.13024355471134186, + "learning_rate": 0.0009882969775926505, + "loss": 0.0502, + "num_input_tokens_seen": 42985648, + "step": 19875 + }, + { + "epoch": 3.2430668841761827, + "grad_norm": 0.022221842780709267, + "learning_rate": 0.0009882816624875454, + "loss": 0.019, + "num_input_tokens_seen": 42995824, + "step": 19880 + }, + { + "epoch": 3.2438825448613375, + "grad_norm": 0.04413722828030586, + "learning_rate": 0.0009882663374867933, + "loss": 0.0398, + "num_input_tokens_seen": 43007248, + "step": 19885 + }, + { + "epoch": 3.244698205546493, + "grad_norm": 0.022984053939580917, + "learning_rate": 0.0009882510025907042, + "loss": 0.0813, + "num_input_tokens_seen": 43018512, + "step": 19890 + }, + { + "epoch": 3.2455138662316476, + "grad_norm": 0.08174191415309906, + "learning_rate": 0.0009882356577995894, + "loss": 0.1128, + "num_input_tokens_seen": 43028368, + "step": 19895 + }, + { + "epoch": 3.2463295269168024, + "grad_norm": 0.11336220800876617, + "learning_rate": 0.0009882203031137595, + "loss": 0.042, + "num_input_tokens_seen": 43039120, + "step": 19900 + }, + { + "epoch": 3.2471451876019577, + "grad_norm": 0.181237131357193, + "learning_rate": 0.000988204938533526, + "loss": 0.1334, + "num_input_tokens_seen": 43049936, + "step": 19905 + }, + { + "epoch": 3.2479608482871125, + "grad_norm": 0.14684025943279266, + "learning_rate": 0.0009881895640591997, + "loss": 0.0507, + "num_input_tokens_seen": 43061808, + "step": 19910 + }, + { + "epoch": 3.2487765089722673, + "grad_norm": 0.028737680986523628, + "learning_rate": 0.0009881741796910928, + "loss": 0.078, + "num_input_tokens_seen": 43073456, + "step": 19915 + }, + { + "epoch": 3.2495921696574226, + "grad_norm": 0.17885787785053253, + "learning_rate": 0.0009881587854295168, + "loss": 0.2179, + "num_input_tokens_seen": 43084688, + "step": 19920 + }, + { + "epoch": 3.2504078303425774, + "grad_norm": 0.038553569465875626, + "learning_rate": 0.0009881433812747838, + "loss": 0.1366, + "num_input_tokens_seen": 43095504, + "step": 19925 + }, + { + "epoch": 3.2512234910277327, + "grad_norm": 0.02528173290193081, + "learning_rate": 0.000988127967227206, + "loss": 0.0567, + "num_input_tokens_seen": 43105648, + "step": 19930 + }, + { + "epoch": 3.2520391517128875, + "grad_norm": 0.021175552159547806, + "learning_rate": 0.0009881125432870956, + "loss": 0.1676, + "num_input_tokens_seen": 43117264, + "step": 19935 + }, + { + "epoch": 3.2528548123980423, + "grad_norm": 0.07172536849975586, + "learning_rate": 0.0009880971094547652, + "loss": 0.0924, + "num_input_tokens_seen": 43128016, + "step": 19940 + }, + { + "epoch": 3.2536704730831976, + "grad_norm": 0.14829133450984955, + "learning_rate": 0.0009880816657305278, + "loss": 0.0931, + "num_input_tokens_seen": 43139920, + "step": 19945 + }, + { + "epoch": 3.2544861337683524, + "grad_norm": 0.08503178507089615, + "learning_rate": 0.0009880662121146964, + "loss": 0.1716, + "num_input_tokens_seen": 43149808, + "step": 19950 + }, + { + "epoch": 3.255301794453507, + "grad_norm": 0.06452854722738266, + "learning_rate": 0.0009880507486075838, + "loss": 0.0955, + "num_input_tokens_seen": 43160944, + "step": 19955 + }, + { + "epoch": 3.2561174551386625, + "grad_norm": 0.06388487666845322, + "learning_rate": 0.0009880352752095038, + "loss": 0.035, + "num_input_tokens_seen": 43170032, + "step": 19960 + }, + { + "epoch": 3.2569331158238173, + "grad_norm": 0.14313308894634247, + "learning_rate": 0.0009880197919207698, + "loss": 0.0736, + "num_input_tokens_seen": 43179920, + "step": 19965 + }, + { + "epoch": 3.257748776508972, + "grad_norm": 0.0758235901594162, + "learning_rate": 0.0009880042987416957, + "loss": 0.0517, + "num_input_tokens_seen": 43190736, + "step": 19970 + }, + { + "epoch": 3.2585644371941274, + "grad_norm": 0.0018469083588570356, + "learning_rate": 0.0009879887956725953, + "loss": 0.3477, + "num_input_tokens_seen": 43201648, + "step": 19975 + }, + { + "epoch": 3.259380097879282, + "grad_norm": 0.09347787499427795, + "learning_rate": 0.0009879732827137828, + "loss": 0.1401, + "num_input_tokens_seen": 43211984, + "step": 19980 + }, + { + "epoch": 3.2601957585644374, + "grad_norm": 0.16566364467144012, + "learning_rate": 0.0009879577598655728, + "loss": 0.3083, + "num_input_tokens_seen": 43223120, + "step": 19985 + }, + { + "epoch": 3.2610114192495923, + "grad_norm": 0.10149620473384857, + "learning_rate": 0.0009879422271282798, + "loss": 0.1296, + "num_input_tokens_seen": 43233840, + "step": 19990 + }, + { + "epoch": 3.261827079934747, + "grad_norm": 0.07267143577337265, + "learning_rate": 0.0009879266845022187, + "loss": 0.1318, + "num_input_tokens_seen": 43245200, + "step": 19995 + }, + { + "epoch": 3.262642740619902, + "grad_norm": 0.1777501404285431, + "learning_rate": 0.0009879111319877041, + "loss": 0.112, + "num_input_tokens_seen": 43256944, + "step": 20000 + }, + { + "epoch": 3.263458401305057, + "grad_norm": 0.04019446298480034, + "learning_rate": 0.0009878955695850516, + "loss": 0.1542, + "num_input_tokens_seen": 43268432, + "step": 20005 + }, + { + "epoch": 3.264274061990212, + "grad_norm": 0.07024483382701874, + "learning_rate": 0.0009878799972945762, + "loss": 0.1052, + "num_input_tokens_seen": 43279792, + "step": 20010 + }, + { + "epoch": 3.2650897226753672, + "grad_norm": 0.11250631511211395, + "learning_rate": 0.000987864415116594, + "loss": 0.169, + "num_input_tokens_seen": 43290288, + "step": 20015 + }, + { + "epoch": 3.265905383360522, + "grad_norm": 0.1379203498363495, + "learning_rate": 0.0009878488230514206, + "loss": 0.1573, + "num_input_tokens_seen": 43299920, + "step": 20020 + }, + { + "epoch": 3.266721044045677, + "grad_norm": 0.17190445959568024, + "learning_rate": 0.0009878332210993717, + "loss": 0.0863, + "num_input_tokens_seen": 43311248, + "step": 20025 + }, + { + "epoch": 3.267536704730832, + "grad_norm": 0.28690385818481445, + "learning_rate": 0.0009878176092607638, + "loss": 0.1576, + "num_input_tokens_seen": 43320976, + "step": 20030 + }, + { + "epoch": 3.268352365415987, + "grad_norm": 0.08246587961912155, + "learning_rate": 0.0009878019875359132, + "loss": 0.0845, + "num_input_tokens_seen": 43331440, + "step": 20035 + }, + { + "epoch": 3.2691680261011418, + "grad_norm": 0.039607934653759, + "learning_rate": 0.0009877863559251366, + "loss": 0.1099, + "num_input_tokens_seen": 43343024, + "step": 20040 + }, + { + "epoch": 3.269983686786297, + "grad_norm": 0.017975594848394394, + "learning_rate": 0.0009877707144287505, + "loss": 0.0499, + "num_input_tokens_seen": 43353840, + "step": 20045 + }, + { + "epoch": 3.270799347471452, + "grad_norm": 0.0691947340965271, + "learning_rate": 0.0009877550630470722, + "loss": 0.049, + "num_input_tokens_seen": 43364656, + "step": 20050 + }, + { + "epoch": 3.2716150081566067, + "grad_norm": 0.3025503158569336, + "learning_rate": 0.000987739401780419, + "loss": 0.24, + "num_input_tokens_seen": 43375920, + "step": 20055 + }, + { + "epoch": 3.272430668841762, + "grad_norm": 0.022506562992930412, + "learning_rate": 0.0009877237306291076, + "loss": 0.155, + "num_input_tokens_seen": 43386608, + "step": 20060 + }, + { + "epoch": 3.2732463295269167, + "grad_norm": 0.12372944504022598, + "learning_rate": 0.0009877080495934564, + "loss": 0.0375, + "num_input_tokens_seen": 43397840, + "step": 20065 + }, + { + "epoch": 3.274061990212072, + "grad_norm": 0.03336038067936897, + "learning_rate": 0.0009876923586737828, + "loss": 0.1941, + "num_input_tokens_seen": 43409136, + "step": 20070 + }, + { + "epoch": 3.274877650897227, + "grad_norm": 0.05050639435648918, + "learning_rate": 0.000987676657870405, + "loss": 0.0498, + "num_input_tokens_seen": 43418864, + "step": 20075 + }, + { + "epoch": 3.2756933115823816, + "grad_norm": 0.0558999739587307, + "learning_rate": 0.0009876609471836408, + "loss": 0.1609, + "num_input_tokens_seen": 43430032, + "step": 20080 + }, + { + "epoch": 3.2765089722675365, + "grad_norm": 0.02081811986863613, + "learning_rate": 0.000987645226613809, + "loss": 0.1736, + "num_input_tokens_seen": 43441712, + "step": 20085 + }, + { + "epoch": 3.2773246329526917, + "grad_norm": 0.09108427911996841, + "learning_rate": 0.0009876294961612283, + "loss": 0.0837, + "num_input_tokens_seen": 43453200, + "step": 20090 + }, + { + "epoch": 3.2781402936378465, + "grad_norm": 0.059019628912210464, + "learning_rate": 0.0009876137558262168, + "loss": 0.1473, + "num_input_tokens_seen": 43464368, + "step": 20095 + }, + { + "epoch": 3.278955954323002, + "grad_norm": 0.049325328320264816, + "learning_rate": 0.0009875980056090943, + "loss": 0.1024, + "num_input_tokens_seen": 43475280, + "step": 20100 + }, + { + "epoch": 3.2797716150081566, + "grad_norm": 0.12276256829500198, + "learning_rate": 0.0009875822455101795, + "loss": 0.0626, + "num_input_tokens_seen": 43485904, + "step": 20105 + }, + { + "epoch": 3.2805872756933114, + "grad_norm": 0.2100994735956192, + "learning_rate": 0.000987566475529792, + "loss": 0.123, + "num_input_tokens_seen": 43496720, + "step": 20110 + }, + { + "epoch": 3.2814029363784667, + "grad_norm": 0.018480392172932625, + "learning_rate": 0.0009875506956682513, + "loss": 0.0791, + "num_input_tokens_seen": 43508080, + "step": 20115 + }, + { + "epoch": 3.2822185970636215, + "grad_norm": 0.06958062946796417, + "learning_rate": 0.0009875349059258773, + "loss": 0.0787, + "num_input_tokens_seen": 43519280, + "step": 20120 + }, + { + "epoch": 3.2830342577487763, + "grad_norm": 0.22425442934036255, + "learning_rate": 0.00098751910630299, + "loss": 0.1221, + "num_input_tokens_seen": 43529552, + "step": 20125 + }, + { + "epoch": 3.2838499184339316, + "grad_norm": 0.014040003530681133, + "learning_rate": 0.0009875032967999096, + "loss": 0.1182, + "num_input_tokens_seen": 43541008, + "step": 20130 + }, + { + "epoch": 3.2846655791190864, + "grad_norm": 0.2088811844587326, + "learning_rate": 0.0009874874774169562, + "loss": 0.1187, + "num_input_tokens_seen": 43551152, + "step": 20135 + }, + { + "epoch": 3.2854812398042412, + "grad_norm": 0.1166122779250145, + "learning_rate": 0.0009874716481544509, + "loss": 0.226, + "num_input_tokens_seen": 43562672, + "step": 20140 + }, + { + "epoch": 3.2862969004893965, + "grad_norm": 0.1853681206703186, + "learning_rate": 0.0009874558090127142, + "loss": 0.1362, + "num_input_tokens_seen": 43572944, + "step": 20145 + }, + { + "epoch": 3.2871125611745513, + "grad_norm": 0.07401765882968903, + "learning_rate": 0.0009874399599920669, + "loss": 0.0824, + "num_input_tokens_seen": 43585072, + "step": 20150 + }, + { + "epoch": 3.2879282218597066, + "grad_norm": 0.04394443705677986, + "learning_rate": 0.0009874241010928307, + "loss": 0.0714, + "num_input_tokens_seen": 43595920, + "step": 20155 + }, + { + "epoch": 3.2887438825448614, + "grad_norm": 0.10552657395601273, + "learning_rate": 0.0009874082323153266, + "loss": 0.0889, + "num_input_tokens_seen": 43605232, + "step": 20160 + }, + { + "epoch": 3.289559543230016, + "grad_norm": 0.041619252413511276, + "learning_rate": 0.0009873923536598765, + "loss": 0.0367, + "num_input_tokens_seen": 43616624, + "step": 20165 + }, + { + "epoch": 3.2903752039151715, + "grad_norm": 0.16440622508525848, + "learning_rate": 0.000987376465126802, + "loss": 0.0673, + "num_input_tokens_seen": 43627504, + "step": 20170 + }, + { + "epoch": 3.2911908646003263, + "grad_norm": 0.016031792387366295, + "learning_rate": 0.0009873605667164252, + "loss": 0.1093, + "num_input_tokens_seen": 43637776, + "step": 20175 + }, + { + "epoch": 3.292006525285481, + "grad_norm": 0.24192920327186584, + "learning_rate": 0.0009873446584290682, + "loss": 0.1444, + "num_input_tokens_seen": 43648784, + "step": 20180 + }, + { + "epoch": 3.2928221859706364, + "grad_norm": 0.01155361719429493, + "learning_rate": 0.0009873287402650535, + "loss": 0.0349, + "num_input_tokens_seen": 43658800, + "step": 20185 + }, + { + "epoch": 3.293637846655791, + "grad_norm": 0.017669612541794777, + "learning_rate": 0.0009873128122247035, + "loss": 0.1177, + "num_input_tokens_seen": 43669936, + "step": 20190 + }, + { + "epoch": 3.294453507340946, + "grad_norm": 0.04354847967624664, + "learning_rate": 0.0009872968743083414, + "loss": 0.0945, + "num_input_tokens_seen": 43680976, + "step": 20195 + }, + { + "epoch": 3.2952691680261013, + "grad_norm": 0.05074286088347435, + "learning_rate": 0.0009872809265162898, + "loss": 0.1438, + "num_input_tokens_seen": 43692112, + "step": 20200 + }, + { + "epoch": 3.296084828711256, + "grad_norm": 0.024301722645759583, + "learning_rate": 0.000987264968848872, + "loss": 0.1897, + "num_input_tokens_seen": 43702224, + "step": 20205 + }, + { + "epoch": 3.2969004893964113, + "grad_norm": 0.11771446466445923, + "learning_rate": 0.0009872490013064117, + "loss": 0.1187, + "num_input_tokens_seen": 43712720, + "step": 20210 + }, + { + "epoch": 3.297716150081566, + "grad_norm": 0.07622315734624863, + "learning_rate": 0.000987233023889232, + "loss": 0.2055, + "num_input_tokens_seen": 43724176, + "step": 20215 + }, + { + "epoch": 3.298531810766721, + "grad_norm": 0.06392424553632736, + "learning_rate": 0.000987217036597657, + "loss": 0.0286, + "num_input_tokens_seen": 43734032, + "step": 20220 + }, + { + "epoch": 3.299347471451876, + "grad_norm": 0.10581985116004944, + "learning_rate": 0.000987201039432011, + "loss": 0.1012, + "num_input_tokens_seen": 43744304, + "step": 20225 + }, + { + "epoch": 3.300163132137031, + "grad_norm": 0.012859735637903214, + "learning_rate": 0.0009871850323926177, + "loss": 0.115, + "num_input_tokens_seen": 43755888, + "step": 20230 + }, + { + "epoch": 3.300978792822186, + "grad_norm": 0.03773471340537071, + "learning_rate": 0.0009871690154798017, + "loss": 0.1641, + "num_input_tokens_seen": 43767056, + "step": 20235 + }, + { + "epoch": 3.301794453507341, + "grad_norm": 0.03229673206806183, + "learning_rate": 0.0009871529886938874, + "loss": 0.15, + "num_input_tokens_seen": 43778480, + "step": 20240 + }, + { + "epoch": 3.302610114192496, + "grad_norm": 0.11208148300647736, + "learning_rate": 0.0009871369520352, + "loss": 0.093, + "num_input_tokens_seen": 43788624, + "step": 20245 + }, + { + "epoch": 3.3034257748776508, + "grad_norm": 0.07075408846139908, + "learning_rate": 0.0009871209055040643, + "loss": 0.1083, + "num_input_tokens_seen": 43799216, + "step": 20250 + }, + { + "epoch": 3.304241435562806, + "grad_norm": 0.008634911850094795, + "learning_rate": 0.0009871048491008052, + "loss": 0.1022, + "num_input_tokens_seen": 43810480, + "step": 20255 + }, + { + "epoch": 3.305057096247961, + "grad_norm": 0.028035888448357582, + "learning_rate": 0.0009870887828257486, + "loss": 0.0718, + "num_input_tokens_seen": 43820688, + "step": 20260 + }, + { + "epoch": 3.3058727569331157, + "grad_norm": 0.05893208459019661, + "learning_rate": 0.00098707270667922, + "loss": 0.1905, + "num_input_tokens_seen": 43830256, + "step": 20265 + }, + { + "epoch": 3.306688417618271, + "grad_norm": 0.11259466409683228, + "learning_rate": 0.000987056620661545, + "loss": 0.1289, + "num_input_tokens_seen": 43841360, + "step": 20270 + }, + { + "epoch": 3.3075040783034257, + "grad_norm": 0.23528766632080078, + "learning_rate": 0.0009870405247730497, + "loss": 0.0641, + "num_input_tokens_seen": 43852848, + "step": 20275 + }, + { + "epoch": 3.3083197389885806, + "grad_norm": 0.0631113052368164, + "learning_rate": 0.0009870244190140602, + "loss": 0.0684, + "num_input_tokens_seen": 43863600, + "step": 20280 + }, + { + "epoch": 3.309135399673736, + "grad_norm": 0.018527382984757423, + "learning_rate": 0.000987008303384903, + "loss": 0.1043, + "num_input_tokens_seen": 43874512, + "step": 20285 + }, + { + "epoch": 3.3099510603588906, + "grad_norm": 0.07803243398666382, + "learning_rate": 0.000986992177885905, + "loss": 0.0825, + "num_input_tokens_seen": 43885808, + "step": 20290 + }, + { + "epoch": 3.310766721044046, + "grad_norm": 0.054975420236587524, + "learning_rate": 0.0009869760425173927, + "loss": 0.0348, + "num_input_tokens_seen": 43896816, + "step": 20295 + }, + { + "epoch": 3.3115823817292007, + "grad_norm": 0.0050786943174898624, + "learning_rate": 0.000986959897279693, + "loss": 0.0825, + "num_input_tokens_seen": 43908368, + "step": 20300 + }, + { + "epoch": 3.3123980424143555, + "grad_norm": 0.2176738679409027, + "learning_rate": 0.0009869437421731332, + "loss": 0.1334, + "num_input_tokens_seen": 43918992, + "step": 20305 + }, + { + "epoch": 3.3132137030995104, + "grad_norm": 0.1001739352941513, + "learning_rate": 0.0009869275771980405, + "loss": 0.0686, + "num_input_tokens_seen": 43930256, + "step": 20310 + }, + { + "epoch": 3.3140293637846656, + "grad_norm": 0.14070114493370056, + "learning_rate": 0.000986911402354743, + "loss": 0.3225, + "num_input_tokens_seen": 43941168, + "step": 20315 + }, + { + "epoch": 3.3148450244698204, + "grad_norm": 0.1104494109749794, + "learning_rate": 0.0009868952176435683, + "loss": 0.1142, + "num_input_tokens_seen": 43952368, + "step": 20320 + }, + { + "epoch": 3.3156606851549757, + "grad_norm": 0.02809176966547966, + "learning_rate": 0.0009868790230648443, + "loss": 0.0501, + "num_input_tokens_seen": 43963632, + "step": 20325 + }, + { + "epoch": 3.3164763458401305, + "grad_norm": 0.0439760759472847, + "learning_rate": 0.0009868628186188993, + "loss": 0.091, + "num_input_tokens_seen": 43975024, + "step": 20330 + }, + { + "epoch": 3.3172920065252853, + "grad_norm": 0.012558380141854286, + "learning_rate": 0.0009868466043060616, + "loss": 0.0471, + "num_input_tokens_seen": 43986608, + "step": 20335 + }, + { + "epoch": 3.3181076672104406, + "grad_norm": 0.016795523464679718, + "learning_rate": 0.00098683038012666, + "loss": 0.0691, + "num_input_tokens_seen": 43997968, + "step": 20340 + }, + { + "epoch": 3.3189233278955954, + "grad_norm": 0.05473247915506363, + "learning_rate": 0.0009868141460810226, + "loss": 0.1192, + "num_input_tokens_seen": 44010032, + "step": 20345 + }, + { + "epoch": 3.3197389885807502, + "grad_norm": 0.03609495982527733, + "learning_rate": 0.0009867979021694795, + "loss": 0.0272, + "num_input_tokens_seen": 44021104, + "step": 20350 + }, + { + "epoch": 3.3205546492659055, + "grad_norm": 0.101466603577137, + "learning_rate": 0.0009867816483923593, + "loss": 0.0828, + "num_input_tokens_seen": 44031952, + "step": 20355 + }, + { + "epoch": 3.3213703099510603, + "grad_norm": 0.34413954615592957, + "learning_rate": 0.0009867653847499913, + "loss": 0.1781, + "num_input_tokens_seen": 44042704, + "step": 20360 + }, + { + "epoch": 3.322185970636215, + "grad_norm": 0.04273676499724388, + "learning_rate": 0.0009867491112427055, + "loss": 0.0603, + "num_input_tokens_seen": 44052592, + "step": 20365 + }, + { + "epoch": 3.3230016313213704, + "grad_norm": 0.35218381881713867, + "learning_rate": 0.0009867328278708313, + "loss": 0.276, + "num_input_tokens_seen": 44065328, + "step": 20370 + }, + { + "epoch": 3.323817292006525, + "grad_norm": 0.229897141456604, + "learning_rate": 0.0009867165346346988, + "loss": 0.0778, + "num_input_tokens_seen": 44077552, + "step": 20375 + }, + { + "epoch": 3.3246329526916805, + "grad_norm": 0.04176846519112587, + "learning_rate": 0.0009867002315346383, + "loss": 0.1624, + "num_input_tokens_seen": 44088752, + "step": 20380 + }, + { + "epoch": 3.3254486133768353, + "grad_norm": 0.17083647847175598, + "learning_rate": 0.0009866839185709805, + "loss": 0.0944, + "num_input_tokens_seen": 44099376, + "step": 20385 + }, + { + "epoch": 3.32626427406199, + "grad_norm": 0.1816277652978897, + "learning_rate": 0.0009866675957440553, + "loss": 0.1169, + "num_input_tokens_seen": 44109488, + "step": 20390 + }, + { + "epoch": 3.3270799347471454, + "grad_norm": 0.06937110424041748, + "learning_rate": 0.0009866512630541942, + "loss": 0.1128, + "num_input_tokens_seen": 44119568, + "step": 20395 + }, + { + "epoch": 3.3278955954323, + "grad_norm": 0.1718575358390808, + "learning_rate": 0.0009866349205017277, + "loss": 0.0818, + "num_input_tokens_seen": 44130000, + "step": 20400 + }, + { + "epoch": 3.328711256117455, + "grad_norm": 0.1907864212989807, + "learning_rate": 0.0009866185680869873, + "loss": 0.0938, + "num_input_tokens_seen": 44140208, + "step": 20405 + }, + { + "epoch": 3.3295269168026103, + "grad_norm": 0.01826365478336811, + "learning_rate": 0.0009866022058103042, + "loss": 0.0715, + "num_input_tokens_seen": 44150160, + "step": 20410 + }, + { + "epoch": 3.330342577487765, + "grad_norm": 0.018938470631837845, + "learning_rate": 0.0009865858336720102, + "loss": 0.0622, + "num_input_tokens_seen": 44161008, + "step": 20415 + }, + { + "epoch": 3.33115823817292, + "grad_norm": 0.1174740418791771, + "learning_rate": 0.000986569451672437, + "loss": 0.1543, + "num_input_tokens_seen": 44170768, + "step": 20420 + }, + { + "epoch": 3.331973898858075, + "grad_norm": 0.24229373037815094, + "learning_rate": 0.0009865530598119163, + "loss": 0.1628, + "num_input_tokens_seen": 44180080, + "step": 20425 + }, + { + "epoch": 3.33278955954323, + "grad_norm": 0.00437184190377593, + "learning_rate": 0.000986536658090781, + "loss": 0.1034, + "num_input_tokens_seen": 44191216, + "step": 20430 + }, + { + "epoch": 3.3336052202283852, + "grad_norm": 0.035282671451568604, + "learning_rate": 0.0009865202465093631, + "loss": 0.1143, + "num_input_tokens_seen": 44201584, + "step": 20435 + }, + { + "epoch": 3.33442088091354, + "grad_norm": 0.0404328852891922, + "learning_rate": 0.000986503825067995, + "loss": 0.1257, + "num_input_tokens_seen": 44213840, + "step": 20440 + }, + { + "epoch": 3.335236541598695, + "grad_norm": 0.1707407385110855, + "learning_rate": 0.0009864873937670098, + "loss": 0.145, + "num_input_tokens_seen": 44224944, + "step": 20445 + }, + { + "epoch": 3.3360522022838497, + "grad_norm": 0.0483018197119236, + "learning_rate": 0.0009864709526067404, + "loss": 0.0343, + "num_input_tokens_seen": 44235536, + "step": 20450 + }, + { + "epoch": 3.336867862969005, + "grad_norm": 0.069381944835186, + "learning_rate": 0.0009864545015875199, + "loss": 0.0415, + "num_input_tokens_seen": 44245776, + "step": 20455 + }, + { + "epoch": 3.3376835236541598, + "grad_norm": 0.012071680277585983, + "learning_rate": 0.000986438040709682, + "loss": 0.1051, + "num_input_tokens_seen": 44255472, + "step": 20460 + }, + { + "epoch": 3.338499184339315, + "grad_norm": 0.2652641534805298, + "learning_rate": 0.00098642156997356, + "loss": 0.2206, + "num_input_tokens_seen": 44266000, + "step": 20465 + }, + { + "epoch": 3.33931484502447, + "grad_norm": 0.04244496300816536, + "learning_rate": 0.0009864050893794878, + "loss": 0.2189, + "num_input_tokens_seen": 44276624, + "step": 20470 + }, + { + "epoch": 3.3401305057096247, + "grad_norm": 0.11729129403829575, + "learning_rate": 0.0009863885989277994, + "loss": 0.0754, + "num_input_tokens_seen": 44287472, + "step": 20475 + }, + { + "epoch": 3.34094616639478, + "grad_norm": 0.04826750606298447, + "learning_rate": 0.0009863720986188291, + "loss": 0.1146, + "num_input_tokens_seen": 44296944, + "step": 20480 + }, + { + "epoch": 3.3417618270799347, + "grad_norm": 0.2376585453748703, + "learning_rate": 0.0009863555884529114, + "loss": 0.1758, + "num_input_tokens_seen": 44307952, + "step": 20485 + }, + { + "epoch": 3.3425774877650896, + "grad_norm": 0.08156166970729828, + "learning_rate": 0.0009863390684303804, + "loss": 0.1008, + "num_input_tokens_seen": 44318640, + "step": 20490 + }, + { + "epoch": 3.343393148450245, + "grad_norm": 0.12889395654201508, + "learning_rate": 0.0009863225385515714, + "loss": 0.0658, + "num_input_tokens_seen": 44329008, + "step": 20495 + }, + { + "epoch": 3.3442088091353996, + "grad_norm": 0.0919957235455513, + "learning_rate": 0.000986305998816819, + "loss": 0.1266, + "num_input_tokens_seen": 44340304, + "step": 20500 + }, + { + "epoch": 3.3450244698205545, + "grad_norm": 0.1220528781414032, + "learning_rate": 0.000986289449226459, + "loss": 0.1175, + "num_input_tokens_seen": 44351664, + "step": 20505 + }, + { + "epoch": 3.3458401305057097, + "grad_norm": 0.020937541499733925, + "learning_rate": 0.000986272889780826, + "loss": 0.0989, + "num_input_tokens_seen": 44362448, + "step": 20510 + }, + { + "epoch": 3.3466557911908645, + "grad_norm": 0.01079709641635418, + "learning_rate": 0.000986256320480256, + "loss": 0.0785, + "num_input_tokens_seen": 44373712, + "step": 20515 + }, + { + "epoch": 3.34747145187602, + "grad_norm": 0.09133608639240265, + "learning_rate": 0.0009862397413250852, + "loss": 0.0963, + "num_input_tokens_seen": 44384496, + "step": 20520 + }, + { + "epoch": 3.3482871125611746, + "grad_norm": 0.11672952771186829, + "learning_rate": 0.0009862231523156489, + "loss": 0.1958, + "num_input_tokens_seen": 44395952, + "step": 20525 + }, + { + "epoch": 3.3491027732463294, + "grad_norm": 0.08248498290777206, + "learning_rate": 0.0009862065534522837, + "loss": 0.1382, + "num_input_tokens_seen": 44407792, + "step": 20530 + }, + { + "epoch": 3.3499184339314847, + "grad_norm": 0.08188489824533463, + "learning_rate": 0.000986189944735326, + "loss": 0.1545, + "num_input_tokens_seen": 44419568, + "step": 20535 + }, + { + "epoch": 3.3507340946166395, + "grad_norm": 0.08767145872116089, + "learning_rate": 0.000986173326165112, + "loss": 0.1524, + "num_input_tokens_seen": 44429872, + "step": 20540 + }, + { + "epoch": 3.3515497553017943, + "grad_norm": 0.21928314864635468, + "learning_rate": 0.000986156697741979, + "loss": 0.3474, + "num_input_tokens_seen": 44442160, + "step": 20545 + }, + { + "epoch": 3.3523654159869496, + "grad_norm": 0.01372506469488144, + "learning_rate": 0.0009861400594662637, + "loss": 0.2471, + "num_input_tokens_seen": 44452336, + "step": 20550 + }, + { + "epoch": 3.3531810766721044, + "grad_norm": 0.1195288896560669, + "learning_rate": 0.0009861234113383035, + "loss": 0.1279, + "num_input_tokens_seen": 44464112, + "step": 20555 + }, + { + "epoch": 3.3539967373572592, + "grad_norm": 0.01491206232458353, + "learning_rate": 0.0009861067533584356, + "loss": 0.1224, + "num_input_tokens_seen": 44475024, + "step": 20560 + }, + { + "epoch": 3.3548123980424145, + "grad_norm": 0.02331310696899891, + "learning_rate": 0.0009860900855269976, + "loss": 0.0526, + "num_input_tokens_seen": 44486128, + "step": 20565 + }, + { + "epoch": 3.3556280587275693, + "grad_norm": 0.10006996989250183, + "learning_rate": 0.0009860734078443276, + "loss": 0.1613, + "num_input_tokens_seen": 44495568, + "step": 20570 + }, + { + "epoch": 3.356443719412724, + "grad_norm": 0.06204470619559288, + "learning_rate": 0.0009860567203107632, + "loss": 0.1667, + "num_input_tokens_seen": 44506064, + "step": 20575 + }, + { + "epoch": 3.3572593800978794, + "grad_norm": 0.029366256669163704, + "learning_rate": 0.0009860400229266427, + "loss": 0.0747, + "num_input_tokens_seen": 44517808, + "step": 20580 + }, + { + "epoch": 3.358075040783034, + "grad_norm": 0.04264573007822037, + "learning_rate": 0.0009860233156923047, + "loss": 0.0994, + "num_input_tokens_seen": 44529104, + "step": 20585 + }, + { + "epoch": 3.358890701468189, + "grad_norm": 0.056957364082336426, + "learning_rate": 0.0009860065986080876, + "loss": 0.195, + "num_input_tokens_seen": 44540432, + "step": 20590 + }, + { + "epoch": 3.3597063621533443, + "grad_norm": 0.06419949233531952, + "learning_rate": 0.00098598987167433, + "loss": 0.0677, + "num_input_tokens_seen": 44550128, + "step": 20595 + }, + { + "epoch": 3.360522022838499, + "grad_norm": 0.11103334277868271, + "learning_rate": 0.0009859731348913713, + "loss": 0.0612, + "num_input_tokens_seen": 44560880, + "step": 20600 + }, + { + "epoch": 3.3613376835236544, + "grad_norm": 0.054705556482076645, + "learning_rate": 0.0009859563882595507, + "loss": 0.1947, + "num_input_tokens_seen": 44571216, + "step": 20605 + }, + { + "epoch": 3.362153344208809, + "grad_norm": 0.16497164964675903, + "learning_rate": 0.0009859396317792074, + "loss": 0.2826, + "num_input_tokens_seen": 44581776, + "step": 20610 + }, + { + "epoch": 3.362969004893964, + "grad_norm": 0.016342537477612495, + "learning_rate": 0.0009859228654506807, + "loss": 0.0585, + "num_input_tokens_seen": 44591600, + "step": 20615 + }, + { + "epoch": 3.3637846655791193, + "grad_norm": 0.029464807361364365, + "learning_rate": 0.0009859060892743108, + "loss": 0.0535, + "num_input_tokens_seen": 44602544, + "step": 20620 + }, + { + "epoch": 3.364600326264274, + "grad_norm": 0.15060758590698242, + "learning_rate": 0.0009858893032504378, + "loss": 0.1205, + "num_input_tokens_seen": 44613584, + "step": 20625 + }, + { + "epoch": 3.365415986949429, + "grad_norm": 0.014383463189005852, + "learning_rate": 0.0009858725073794016, + "loss": 0.1241, + "num_input_tokens_seen": 44623248, + "step": 20630 + }, + { + "epoch": 3.366231647634584, + "grad_norm": 0.020124254748225212, + "learning_rate": 0.0009858557016615423, + "loss": 0.0502, + "num_input_tokens_seen": 44633232, + "step": 20635 + }, + { + "epoch": 3.367047308319739, + "grad_norm": 0.16115230321884155, + "learning_rate": 0.0009858388860972012, + "loss": 0.1436, + "num_input_tokens_seen": 44644016, + "step": 20640 + }, + { + "epoch": 3.367862969004894, + "grad_norm": 0.013520710170269012, + "learning_rate": 0.0009858220606867188, + "loss": 0.022, + "num_input_tokens_seen": 44654672, + "step": 20645 + }, + { + "epoch": 3.368678629690049, + "grad_norm": 0.009846985340118408, + "learning_rate": 0.000985805225430436, + "loss": 0.0319, + "num_input_tokens_seen": 44666768, + "step": 20650 + }, + { + "epoch": 3.369494290375204, + "grad_norm": 0.089094378054142, + "learning_rate": 0.0009857883803286937, + "loss": 0.0989, + "num_input_tokens_seen": 44677520, + "step": 20655 + }, + { + "epoch": 3.370309951060359, + "grad_norm": 0.1536937952041626, + "learning_rate": 0.0009857715253818338, + "loss": 0.0803, + "num_input_tokens_seen": 44688080, + "step": 20660 + }, + { + "epoch": 3.371125611745514, + "grad_norm": 0.07965698093175888, + "learning_rate": 0.000985754660590198, + "loss": 0.0663, + "num_input_tokens_seen": 44698288, + "step": 20665 + }, + { + "epoch": 3.3719412724306688, + "grad_norm": 0.028182541951537132, + "learning_rate": 0.0009857377859541275, + "loss": 0.1655, + "num_input_tokens_seen": 44710160, + "step": 20670 + }, + { + "epoch": 3.3727569331158236, + "grad_norm": 0.10902436077594757, + "learning_rate": 0.0009857209014739645, + "loss": 0.0584, + "num_input_tokens_seen": 44720592, + "step": 20675 + }, + { + "epoch": 3.373572593800979, + "grad_norm": 0.07185492664575577, + "learning_rate": 0.0009857040071500512, + "loss": 0.171, + "num_input_tokens_seen": 44731024, + "step": 20680 + }, + { + "epoch": 3.3743882544861337, + "grad_norm": 0.051205482333898544, + "learning_rate": 0.0009856871029827303, + "loss": 0.1979, + "num_input_tokens_seen": 44742352, + "step": 20685 + }, + { + "epoch": 3.375203915171289, + "grad_norm": 0.02235202118754387, + "learning_rate": 0.0009856701889723438, + "loss": 0.0427, + "num_input_tokens_seen": 44751856, + "step": 20690 + }, + { + "epoch": 3.3760195758564437, + "grad_norm": 0.10820963233709335, + "learning_rate": 0.0009856532651192351, + "loss": 0.1165, + "num_input_tokens_seen": 44763728, + "step": 20695 + }, + { + "epoch": 3.3768352365415986, + "grad_norm": 0.11374247819185257, + "learning_rate": 0.0009856363314237468, + "loss": 0.1476, + "num_input_tokens_seen": 44775440, + "step": 20700 + }, + { + "epoch": 3.377650897226754, + "grad_norm": 0.014920140616595745, + "learning_rate": 0.0009856193878862221, + "loss": 0.1563, + "num_input_tokens_seen": 44787472, + "step": 20705 + }, + { + "epoch": 3.3784665579119086, + "grad_norm": 0.1235361248254776, + "learning_rate": 0.0009856024345070045, + "loss": 0.158, + "num_input_tokens_seen": 44799056, + "step": 20710 + }, + { + "epoch": 3.3792822185970635, + "grad_norm": 0.00849025510251522, + "learning_rate": 0.0009855854712864376, + "loss": 0.0113, + "num_input_tokens_seen": 44810672, + "step": 20715 + }, + { + "epoch": 3.3800978792822187, + "grad_norm": 0.2491769641637802, + "learning_rate": 0.000985568498224865, + "loss": 0.246, + "num_input_tokens_seen": 44822224, + "step": 20720 + }, + { + "epoch": 3.3809135399673735, + "grad_norm": 0.03148525208234787, + "learning_rate": 0.0009855515153226308, + "loss": 0.1343, + "num_input_tokens_seen": 44832496, + "step": 20725 + }, + { + "epoch": 3.3817292006525284, + "grad_norm": 0.17702309787273407, + "learning_rate": 0.0009855345225800792, + "loss": 0.0784, + "num_input_tokens_seen": 44840464, + "step": 20730 + }, + { + "epoch": 3.3825448613376836, + "grad_norm": 0.11792438477277756, + "learning_rate": 0.0009855175199975546, + "loss": 0.0817, + "num_input_tokens_seen": 44850768, + "step": 20735 + }, + { + "epoch": 3.3833605220228384, + "grad_norm": 0.046677011996507645, + "learning_rate": 0.0009855005075754015, + "loss": 0.1386, + "num_input_tokens_seen": 44861904, + "step": 20740 + }, + { + "epoch": 3.3841761827079937, + "grad_norm": 0.09238780289888382, + "learning_rate": 0.0009854834853139647, + "loss": 0.2265, + "num_input_tokens_seen": 44871984, + "step": 20745 + }, + { + "epoch": 3.3849918433931485, + "grad_norm": 0.09034372121095657, + "learning_rate": 0.0009854664532135892, + "loss": 0.226, + "num_input_tokens_seen": 44882960, + "step": 20750 + }, + { + "epoch": 3.3858075040783033, + "grad_norm": 0.16702663898468018, + "learning_rate": 0.0009854494112746203, + "loss": 0.0946, + "num_input_tokens_seen": 44894640, + "step": 20755 + }, + { + "epoch": 3.3866231647634586, + "grad_norm": 0.055394161492586136, + "learning_rate": 0.000985432359497403, + "loss": 0.0685, + "num_input_tokens_seen": 44906128, + "step": 20760 + }, + { + "epoch": 3.3874388254486134, + "grad_norm": 0.013766895048320293, + "learning_rate": 0.0009854152978822834, + "loss": 0.0934, + "num_input_tokens_seen": 44915824, + "step": 20765 + }, + { + "epoch": 3.3882544861337682, + "grad_norm": 0.2671952545642853, + "learning_rate": 0.0009853982264296068, + "loss": 0.0708, + "num_input_tokens_seen": 44925840, + "step": 20770 + }, + { + "epoch": 3.3890701468189235, + "grad_norm": 0.012830116786062717, + "learning_rate": 0.0009853811451397195, + "loss": 0.0483, + "num_input_tokens_seen": 44936592, + "step": 20775 + }, + { + "epoch": 3.3898858075040783, + "grad_norm": 0.01821967586874962, + "learning_rate": 0.0009853640540129674, + "loss": 0.205, + "num_input_tokens_seen": 44947600, + "step": 20780 + }, + { + "epoch": 3.390701468189233, + "grad_norm": 0.11865301430225372, + "learning_rate": 0.0009853469530496971, + "loss": 0.1086, + "num_input_tokens_seen": 44957968, + "step": 20785 + }, + { + "epoch": 3.3915171288743884, + "grad_norm": 0.21843115985393524, + "learning_rate": 0.000985329842250255, + "loss": 0.0617, + "num_input_tokens_seen": 44970032, + "step": 20790 + }, + { + "epoch": 3.392332789559543, + "grad_norm": 0.004521653056144714, + "learning_rate": 0.000985312721614988, + "loss": 0.0306, + "num_input_tokens_seen": 44980400, + "step": 20795 + }, + { + "epoch": 3.393148450244698, + "grad_norm": 0.17764700949192047, + "learning_rate": 0.0009852955911442431, + "loss": 0.1299, + "num_input_tokens_seen": 44990480, + "step": 20800 + }, + { + "epoch": 3.3939641109298533, + "grad_norm": 0.04476391151547432, + "learning_rate": 0.0009852784508383673, + "loss": 0.0808, + "num_input_tokens_seen": 45001584, + "step": 20805 + }, + { + "epoch": 3.394779771615008, + "grad_norm": 0.32288724184036255, + "learning_rate": 0.0009852613006977081, + "loss": 0.2153, + "num_input_tokens_seen": 45012720, + "step": 20810 + }, + { + "epoch": 3.395595432300163, + "grad_norm": 0.012930216267704964, + "learning_rate": 0.0009852441407226132, + "loss": 0.0305, + "num_input_tokens_seen": 45022832, + "step": 20815 + }, + { + "epoch": 3.396411092985318, + "grad_norm": 0.09052237868309021, + "learning_rate": 0.00098522697091343, + "loss": 0.1351, + "num_input_tokens_seen": 45033648, + "step": 20820 + }, + { + "epoch": 3.397226753670473, + "grad_norm": 0.02524031139910221, + "learning_rate": 0.0009852097912705067, + "loss": 0.1472, + "num_input_tokens_seen": 45044592, + "step": 20825 + }, + { + "epoch": 3.3980424143556283, + "grad_norm": 0.015985824167728424, + "learning_rate": 0.0009851926017941917, + "loss": 0.0751, + "num_input_tokens_seen": 45055056, + "step": 20830 + }, + { + "epoch": 3.398858075040783, + "grad_norm": 0.023439688608050346, + "learning_rate": 0.0009851754024848328, + "loss": 0.0788, + "num_input_tokens_seen": 45065840, + "step": 20835 + }, + { + "epoch": 3.399673735725938, + "grad_norm": 0.07412150502204895, + "learning_rate": 0.0009851581933427792, + "loss": 0.186, + "num_input_tokens_seen": 45077200, + "step": 20840 + }, + { + "epoch": 3.400489396411093, + "grad_norm": 0.01010909117758274, + "learning_rate": 0.000985140974368379, + "loss": 0.1162, + "num_input_tokens_seen": 45088432, + "step": 20845 + }, + { + "epoch": 3.401305057096248, + "grad_norm": 0.0620625801384449, + "learning_rate": 0.0009851237455619818, + "loss": 0.0669, + "num_input_tokens_seen": 45099248, + "step": 20850 + }, + { + "epoch": 3.402120717781403, + "grad_norm": 0.054616160690784454, + "learning_rate": 0.0009851065069239361, + "loss": 0.0995, + "num_input_tokens_seen": 45109968, + "step": 20855 + }, + { + "epoch": 3.402936378466558, + "grad_norm": 0.053330112248659134, + "learning_rate": 0.0009850892584545921, + "loss": 0.0316, + "num_input_tokens_seen": 45121840, + "step": 20860 + }, + { + "epoch": 3.403752039151713, + "grad_norm": 0.12266937643289566, + "learning_rate": 0.0009850720001542985, + "loss": 0.1227, + "num_input_tokens_seen": 45132720, + "step": 20865 + }, + { + "epoch": 3.4045676998368677, + "grad_norm": 0.02327810972929001, + "learning_rate": 0.0009850547320234058, + "loss": 0.0596, + "num_input_tokens_seen": 45143472, + "step": 20870 + }, + { + "epoch": 3.405383360522023, + "grad_norm": 0.006088678725063801, + "learning_rate": 0.0009850374540622633, + "loss": 0.026, + "num_input_tokens_seen": 45153744, + "step": 20875 + }, + { + "epoch": 3.4061990212071778, + "grad_norm": 0.005732911638915539, + "learning_rate": 0.0009850201662712217, + "loss": 0.1386, + "num_input_tokens_seen": 45163920, + "step": 20880 + }, + { + "epoch": 3.407014681892333, + "grad_norm": 0.013120281510055065, + "learning_rate": 0.0009850028686506313, + "loss": 0.0312, + "num_input_tokens_seen": 45174928, + "step": 20885 + }, + { + "epoch": 3.407830342577488, + "grad_norm": 0.014896417036652565, + "learning_rate": 0.000984985561200842, + "loss": 0.0457, + "num_input_tokens_seen": 45185488, + "step": 20890 + }, + { + "epoch": 3.4086460032626427, + "grad_norm": 0.14158938825130463, + "learning_rate": 0.0009849682439222055, + "loss": 0.0753, + "num_input_tokens_seen": 45197200, + "step": 20895 + }, + { + "epoch": 3.4094616639477975, + "grad_norm": 0.2206645905971527, + "learning_rate": 0.000984950916815072, + "loss": 0.0672, + "num_input_tokens_seen": 45207600, + "step": 20900 + }, + { + "epoch": 3.4102773246329527, + "grad_norm": 0.015935998409986496, + "learning_rate": 0.0009849335798797932, + "loss": 0.0726, + "num_input_tokens_seen": 45218800, + "step": 20905 + }, + { + "epoch": 3.4110929853181076, + "grad_norm": 0.006892753764986992, + "learning_rate": 0.0009849162331167201, + "loss": 0.1614, + "num_input_tokens_seen": 45230224, + "step": 20910 + }, + { + "epoch": 3.411908646003263, + "grad_norm": 0.35190969705581665, + "learning_rate": 0.0009848988765262044, + "loss": 0.1044, + "num_input_tokens_seen": 45241840, + "step": 20915 + }, + { + "epoch": 3.4127243066884176, + "grad_norm": 0.3081585764884949, + "learning_rate": 0.0009848815101085977, + "loss": 0.1927, + "num_input_tokens_seen": 45254032, + "step": 20920 + }, + { + "epoch": 3.4135399673735725, + "grad_norm": 0.008788962848484516, + "learning_rate": 0.0009848641338642524, + "loss": 0.1713, + "num_input_tokens_seen": 45263696, + "step": 20925 + }, + { + "epoch": 3.4143556280587277, + "grad_norm": 0.004804656840860844, + "learning_rate": 0.00098484674779352, + "loss": 0.1061, + "num_input_tokens_seen": 45275120, + "step": 20930 + }, + { + "epoch": 3.4151712887438825, + "grad_norm": 0.09571245312690735, + "learning_rate": 0.0009848293518967533, + "loss": 0.0516, + "num_input_tokens_seen": 45285744, + "step": 20935 + }, + { + "epoch": 3.4159869494290374, + "grad_norm": 0.05200956016778946, + "learning_rate": 0.0009848119461743049, + "loss": 0.3109, + "num_input_tokens_seen": 45295760, + "step": 20940 + }, + { + "epoch": 3.4168026101141926, + "grad_norm": 0.16579431295394897, + "learning_rate": 0.000984794530626527, + "loss": 0.0964, + "num_input_tokens_seen": 45306288, + "step": 20945 + }, + { + "epoch": 3.4176182707993474, + "grad_norm": 0.11425749212503433, + "learning_rate": 0.0009847771052537732, + "loss": 0.074, + "num_input_tokens_seen": 45318352, + "step": 20950 + }, + { + "epoch": 3.4184339314845023, + "grad_norm": 0.057450488209724426, + "learning_rate": 0.0009847596700563966, + "loss": 0.0443, + "num_input_tokens_seen": 45328368, + "step": 20955 + }, + { + "epoch": 3.4192495921696575, + "grad_norm": 0.09544433653354645, + "learning_rate": 0.00098474222503475, + "loss": 0.0842, + "num_input_tokens_seen": 45339920, + "step": 20960 + }, + { + "epoch": 3.4200652528548123, + "grad_norm": 0.08446510136127472, + "learning_rate": 0.0009847247701891874, + "loss": 0.0987, + "num_input_tokens_seen": 45351632, + "step": 20965 + }, + { + "epoch": 3.4208809135399676, + "grad_norm": 0.03102685697376728, + "learning_rate": 0.0009847073055200624, + "loss": 0.0527, + "num_input_tokens_seen": 45362224, + "step": 20970 + }, + { + "epoch": 3.4216965742251224, + "grad_norm": 0.030437711626291275, + "learning_rate": 0.0009846898310277288, + "loss": 0.1066, + "num_input_tokens_seen": 45373488, + "step": 20975 + }, + { + "epoch": 3.4225122349102772, + "grad_norm": 0.04457832872867584, + "learning_rate": 0.000984672346712541, + "loss": 0.0861, + "num_input_tokens_seen": 45384560, + "step": 20980 + }, + { + "epoch": 3.4233278955954325, + "grad_norm": 0.04948972165584564, + "learning_rate": 0.0009846548525748533, + "loss": 0.07, + "num_input_tokens_seen": 45394256, + "step": 20985 + }, + { + "epoch": 3.4241435562805873, + "grad_norm": 0.0155490068718791, + "learning_rate": 0.0009846373486150201, + "loss": 0.1207, + "num_input_tokens_seen": 45405936, + "step": 20990 + }, + { + "epoch": 3.424959216965742, + "grad_norm": 0.054467808455228806, + "learning_rate": 0.0009846198348333964, + "loss": 0.1241, + "num_input_tokens_seen": 45415696, + "step": 20995 + }, + { + "epoch": 3.4257748776508974, + "grad_norm": 0.023017987608909607, + "learning_rate": 0.0009846023112303369, + "loss": 0.1182, + "num_input_tokens_seen": 45426800, + "step": 21000 + }, + { + "epoch": 3.426590538336052, + "grad_norm": 0.036964334547519684, + "learning_rate": 0.0009845847778061968, + "loss": 0.0828, + "num_input_tokens_seen": 45437168, + "step": 21005 + }, + { + "epoch": 3.427406199021207, + "grad_norm": 0.23528705537319183, + "learning_rate": 0.0009845672345613313, + "loss": 0.2005, + "num_input_tokens_seen": 45447888, + "step": 21010 + }, + { + "epoch": 3.4282218597063623, + "grad_norm": 0.22169376909732819, + "learning_rate": 0.0009845496814960962, + "loss": 0.1643, + "num_input_tokens_seen": 45459568, + "step": 21015 + }, + { + "epoch": 3.429037520391517, + "grad_norm": 0.00393277732655406, + "learning_rate": 0.0009845321186108468, + "loss": 0.083, + "num_input_tokens_seen": 45470768, + "step": 21020 + }, + { + "epoch": 3.429853181076672, + "grad_norm": 0.11375081539154053, + "learning_rate": 0.0009845145459059397, + "loss": 0.0762, + "num_input_tokens_seen": 45481456, + "step": 21025 + }, + { + "epoch": 3.430668841761827, + "grad_norm": 0.007324701175093651, + "learning_rate": 0.0009844969633817306, + "loss": 0.1264, + "num_input_tokens_seen": 45492592, + "step": 21030 + }, + { + "epoch": 3.431484502446982, + "grad_norm": 0.031020818278193474, + "learning_rate": 0.000984479371038576, + "loss": 0.1417, + "num_input_tokens_seen": 45503792, + "step": 21035 + }, + { + "epoch": 3.432300163132137, + "grad_norm": 0.015361804515123367, + "learning_rate": 0.0009844617688768323, + "loss": 0.0588, + "num_input_tokens_seen": 45514800, + "step": 21040 + }, + { + "epoch": 3.433115823817292, + "grad_norm": 0.04145984724164009, + "learning_rate": 0.000984444156896856, + "loss": 0.1141, + "num_input_tokens_seen": 45525168, + "step": 21045 + }, + { + "epoch": 3.433931484502447, + "grad_norm": 0.05199075862765312, + "learning_rate": 0.0009844265350990047, + "loss": 0.1007, + "num_input_tokens_seen": 45536080, + "step": 21050 + }, + { + "epoch": 3.434747145187602, + "grad_norm": 0.19810503721237183, + "learning_rate": 0.000984408903483635, + "loss": 0.1237, + "num_input_tokens_seen": 45545808, + "step": 21055 + }, + { + "epoch": 3.435562805872757, + "grad_norm": 0.009147719480097294, + "learning_rate": 0.0009843912620511042, + "loss": 0.1346, + "num_input_tokens_seen": 45557232, + "step": 21060 + }, + { + "epoch": 3.436378466557912, + "grad_norm": 0.0113412756472826, + "learning_rate": 0.00098437361080177, + "loss": 0.0346, + "num_input_tokens_seen": 45568336, + "step": 21065 + }, + { + "epoch": 3.437194127243067, + "grad_norm": 0.014199744910001755, + "learning_rate": 0.0009843559497359903, + "loss": 0.1464, + "num_input_tokens_seen": 45578544, + "step": 21070 + }, + { + "epoch": 3.438009787928222, + "grad_norm": 0.03397858887910843, + "learning_rate": 0.0009843382788541227, + "loss": 0.0896, + "num_input_tokens_seen": 45590608, + "step": 21075 + }, + { + "epoch": 3.4388254486133767, + "grad_norm": 0.20899049937725067, + "learning_rate": 0.0009843205981565253, + "loss": 0.1218, + "num_input_tokens_seen": 45600048, + "step": 21080 + }, + { + "epoch": 3.439641109298532, + "grad_norm": 0.07323971390724182, + "learning_rate": 0.0009843029076435567, + "loss": 0.1632, + "num_input_tokens_seen": 45608944, + "step": 21085 + }, + { + "epoch": 3.4404567699836868, + "grad_norm": 0.022303447127342224, + "learning_rate": 0.0009842852073155754, + "loss": 0.1182, + "num_input_tokens_seen": 45618832, + "step": 21090 + }, + { + "epoch": 3.4412724306688416, + "grad_norm": 0.23006023466587067, + "learning_rate": 0.00098426749717294, + "loss": 0.1579, + "num_input_tokens_seen": 45628144, + "step": 21095 + }, + { + "epoch": 3.442088091353997, + "grad_norm": 0.0528857558965683, + "learning_rate": 0.0009842497772160092, + "loss": 0.1934, + "num_input_tokens_seen": 45638480, + "step": 21100 + }, + { + "epoch": 3.4429037520391517, + "grad_norm": 0.057266563177108765, + "learning_rate": 0.0009842320474451427, + "loss": 0.081, + "num_input_tokens_seen": 45649648, + "step": 21105 + }, + { + "epoch": 3.443719412724307, + "grad_norm": 0.041887782514095306, + "learning_rate": 0.0009842143078606991, + "loss": 0.0751, + "num_input_tokens_seen": 45661168, + "step": 21110 + }, + { + "epoch": 3.4445350734094617, + "grad_norm": 0.0473744235932827, + "learning_rate": 0.0009841965584630385, + "loss": 0.115, + "num_input_tokens_seen": 45672432, + "step": 21115 + }, + { + "epoch": 3.4453507340946166, + "grad_norm": 0.1000686064362526, + "learning_rate": 0.0009841787992525203, + "loss": 0.2541, + "num_input_tokens_seen": 45683472, + "step": 21120 + }, + { + "epoch": 3.4461663947797714, + "grad_norm": 0.029857605695724487, + "learning_rate": 0.0009841610302295048, + "loss": 0.0619, + "num_input_tokens_seen": 45694704, + "step": 21125 + }, + { + "epoch": 3.4469820554649266, + "grad_norm": 0.2285618633031845, + "learning_rate": 0.0009841432513943516, + "loss": 0.1447, + "num_input_tokens_seen": 45705168, + "step": 21130 + }, + { + "epoch": 3.4477977161500815, + "grad_norm": 0.09792362153530121, + "learning_rate": 0.0009841254627474213, + "loss": 0.1114, + "num_input_tokens_seen": 45715792, + "step": 21135 + }, + { + "epoch": 3.4486133768352367, + "grad_norm": 0.054127782583236694, + "learning_rate": 0.000984107664289074, + "loss": 0.0916, + "num_input_tokens_seen": 45726576, + "step": 21140 + }, + { + "epoch": 3.4494290375203915, + "grad_norm": 0.011056124232709408, + "learning_rate": 0.0009840898560196712, + "loss": 0.0446, + "num_input_tokens_seen": 45737520, + "step": 21145 + }, + { + "epoch": 3.4502446982055464, + "grad_norm": 0.1722433865070343, + "learning_rate": 0.000984072037939573, + "loss": 0.14, + "num_input_tokens_seen": 45748176, + "step": 21150 + }, + { + "epoch": 3.4510603588907016, + "grad_norm": 0.014301082119345665, + "learning_rate": 0.000984054210049141, + "loss": 0.0681, + "num_input_tokens_seen": 45758864, + "step": 21155 + }, + { + "epoch": 3.4518760195758564, + "grad_norm": 0.017598386853933334, + "learning_rate": 0.0009840363723487365, + "loss": 0.2171, + "num_input_tokens_seen": 45769744, + "step": 21160 + }, + { + "epoch": 3.4526916802610113, + "grad_norm": 0.21228720247745514, + "learning_rate": 0.0009840185248387208, + "loss": 0.2491, + "num_input_tokens_seen": 45780400, + "step": 21165 + }, + { + "epoch": 3.4535073409461665, + "grad_norm": 0.11573278903961182, + "learning_rate": 0.0009840006675194558, + "loss": 0.1264, + "num_input_tokens_seen": 45790416, + "step": 21170 + }, + { + "epoch": 3.4543230016313213, + "grad_norm": 0.05527227371931076, + "learning_rate": 0.000983982800391303, + "loss": 0.1217, + "num_input_tokens_seen": 45801392, + "step": 21175 + }, + { + "epoch": 3.455138662316476, + "grad_norm": 0.06245320290327072, + "learning_rate": 0.0009839649234546248, + "loss": 0.0428, + "num_input_tokens_seen": 45812464, + "step": 21180 + }, + { + "epoch": 3.4559543230016314, + "grad_norm": 0.05715889483690262, + "learning_rate": 0.0009839470367097836, + "loss": 0.1378, + "num_input_tokens_seen": 45823056, + "step": 21185 + }, + { + "epoch": 3.4567699836867862, + "grad_norm": 0.02741180546581745, + "learning_rate": 0.0009839291401571417, + "loss": 0.0969, + "num_input_tokens_seen": 45832336, + "step": 21190 + }, + { + "epoch": 3.4575856443719415, + "grad_norm": 0.009015440940856934, + "learning_rate": 0.0009839112337970619, + "loss": 0.0465, + "num_input_tokens_seen": 45842864, + "step": 21195 + }, + { + "epoch": 3.4584013050570963, + "grad_norm": 0.22681792080402374, + "learning_rate": 0.0009838933176299072, + "loss": 0.2291, + "num_input_tokens_seen": 45854192, + "step": 21200 + }, + { + "epoch": 3.459216965742251, + "grad_norm": 0.03709195926785469, + "learning_rate": 0.0009838753916560404, + "loss": 0.1055, + "num_input_tokens_seen": 45865296, + "step": 21205 + }, + { + "epoch": 3.4600326264274064, + "grad_norm": 0.0438421331346035, + "learning_rate": 0.000983857455875825, + "loss": 0.0986, + "num_input_tokens_seen": 45876336, + "step": 21210 + }, + { + "epoch": 3.460848287112561, + "grad_norm": 0.056209295988082886, + "learning_rate": 0.0009838395102896244, + "loss": 0.0809, + "num_input_tokens_seen": 45886768, + "step": 21215 + }, + { + "epoch": 3.461663947797716, + "grad_norm": 0.07024645805358887, + "learning_rate": 0.0009838215548978024, + "loss": 0.1117, + "num_input_tokens_seen": 45896688, + "step": 21220 + }, + { + "epoch": 3.4624796084828713, + "grad_norm": 0.011782780289649963, + "learning_rate": 0.0009838035897007226, + "loss": 0.0833, + "num_input_tokens_seen": 45908496, + "step": 21225 + }, + { + "epoch": 3.463295269168026, + "grad_norm": 0.007954503409564495, + "learning_rate": 0.0009837856146987496, + "loss": 0.0861, + "num_input_tokens_seen": 45920144, + "step": 21230 + }, + { + "epoch": 3.464110929853181, + "grad_norm": 0.01540245022624731, + "learning_rate": 0.0009837676298922473, + "loss": 0.0652, + "num_input_tokens_seen": 45930480, + "step": 21235 + }, + { + "epoch": 3.464926590538336, + "grad_norm": 0.008227720856666565, + "learning_rate": 0.0009837496352815803, + "loss": 0.1849, + "num_input_tokens_seen": 45942416, + "step": 21240 + }, + { + "epoch": 3.465742251223491, + "grad_norm": 0.025327688083052635, + "learning_rate": 0.000983731630867113, + "loss": 0.1903, + "num_input_tokens_seen": 45953616, + "step": 21245 + }, + { + "epoch": 3.466557911908646, + "grad_norm": 0.022125469520688057, + "learning_rate": 0.0009837136166492109, + "loss": 0.1485, + "num_input_tokens_seen": 45965584, + "step": 21250 + }, + { + "epoch": 3.467373572593801, + "grad_norm": 0.2036154568195343, + "learning_rate": 0.0009836955926282385, + "loss": 0.1585, + "num_input_tokens_seen": 45977424, + "step": 21255 + }, + { + "epoch": 3.468189233278956, + "grad_norm": 0.061766836792230606, + "learning_rate": 0.0009836775588045613, + "loss": 0.0947, + "num_input_tokens_seen": 45988752, + "step": 21260 + }, + { + "epoch": 3.4690048939641107, + "grad_norm": 0.08904801309108734, + "learning_rate": 0.0009836595151785448, + "loss": 0.1983, + "num_input_tokens_seen": 45999184, + "step": 21265 + }, + { + "epoch": 3.469820554649266, + "grad_norm": 0.04244118183851242, + "learning_rate": 0.0009836414617505548, + "loss": 0.0285, + "num_input_tokens_seen": 46010544, + "step": 21270 + }, + { + "epoch": 3.470636215334421, + "grad_norm": 0.07451055198907852, + "learning_rate": 0.000983623398520957, + "loss": 0.0926, + "num_input_tokens_seen": 46021360, + "step": 21275 + }, + { + "epoch": 3.471451876019576, + "grad_norm": 0.15908612310886383, + "learning_rate": 0.0009836053254901173, + "loss": 0.1483, + "num_input_tokens_seen": 46030640, + "step": 21280 + }, + { + "epoch": 3.472267536704731, + "grad_norm": 0.15909342467784882, + "learning_rate": 0.0009835872426584024, + "loss": 0.0829, + "num_input_tokens_seen": 46041200, + "step": 21285 + }, + { + "epoch": 3.4730831973898857, + "grad_norm": 0.2042553573846817, + "learning_rate": 0.0009835691500261784, + "loss": 0.1192, + "num_input_tokens_seen": 46052208, + "step": 21290 + }, + { + "epoch": 3.473898858075041, + "grad_norm": 0.06767347455024719, + "learning_rate": 0.0009835510475938124, + "loss": 0.0646, + "num_input_tokens_seen": 46063536, + "step": 21295 + }, + { + "epoch": 3.4747145187601958, + "grad_norm": 0.017927464097738266, + "learning_rate": 0.0009835329353616708, + "loss": 0.0989, + "num_input_tokens_seen": 46075024, + "step": 21300 + }, + { + "epoch": 3.4755301794453506, + "grad_norm": 0.23534156382083893, + "learning_rate": 0.000983514813330121, + "loss": 0.1022, + "num_input_tokens_seen": 46084432, + "step": 21305 + }, + { + "epoch": 3.476345840130506, + "grad_norm": 0.1075979694724083, + "learning_rate": 0.00098349668149953, + "loss": 0.0794, + "num_input_tokens_seen": 46095504, + "step": 21310 + }, + { + "epoch": 3.4771615008156607, + "grad_norm": 0.02730988711118698, + "learning_rate": 0.0009834785398702653, + "loss": 0.024, + "num_input_tokens_seen": 46105520, + "step": 21315 + }, + { + "epoch": 3.4779771615008155, + "grad_norm": 0.158670112490654, + "learning_rate": 0.0009834603884426947, + "loss": 0.1295, + "num_input_tokens_seen": 46115344, + "step": 21320 + }, + { + "epoch": 3.4787928221859707, + "grad_norm": 0.008138585835695267, + "learning_rate": 0.000983442227217186, + "loss": 0.1815, + "num_input_tokens_seen": 46124784, + "step": 21325 + }, + { + "epoch": 3.4796084828711256, + "grad_norm": 0.12669169902801514, + "learning_rate": 0.0009834240561941072, + "loss": 0.1283, + "num_input_tokens_seen": 46135856, + "step": 21330 + }, + { + "epoch": 3.480424143556281, + "grad_norm": 0.06928084045648575, + "learning_rate": 0.000983405875373827, + "loss": 0.0421, + "num_input_tokens_seen": 46146960, + "step": 21335 + }, + { + "epoch": 3.4812398042414356, + "grad_norm": 0.05004847049713135, + "learning_rate": 0.0009833876847567132, + "loss": 0.0972, + "num_input_tokens_seen": 46158544, + "step": 21340 + }, + { + "epoch": 3.4820554649265905, + "grad_norm": 0.2124181091785431, + "learning_rate": 0.0009833694843431346, + "loss": 0.109, + "num_input_tokens_seen": 46168336, + "step": 21345 + }, + { + "epoch": 3.4828711256117453, + "grad_norm": 0.20705543458461761, + "learning_rate": 0.0009833512741334604, + "loss": 0.2405, + "num_input_tokens_seen": 46178128, + "step": 21350 + }, + { + "epoch": 3.4836867862969005, + "grad_norm": 0.13747872412204742, + "learning_rate": 0.0009833330541280595, + "loss": 0.0734, + "num_input_tokens_seen": 46188144, + "step": 21355 + }, + { + "epoch": 3.4845024469820554, + "grad_norm": 0.05297991633415222, + "learning_rate": 0.0009833148243273012, + "loss": 0.0458, + "num_input_tokens_seen": 46198768, + "step": 21360 + }, + { + "epoch": 3.4853181076672106, + "grad_norm": 0.11282984167337418, + "learning_rate": 0.0009832965847315547, + "loss": 0.1525, + "num_input_tokens_seen": 46209360, + "step": 21365 + }, + { + "epoch": 3.4861337683523654, + "grad_norm": 0.06479454785585403, + "learning_rate": 0.00098327833534119, + "loss": 0.0563, + "num_input_tokens_seen": 46219792, + "step": 21370 + }, + { + "epoch": 3.4869494290375203, + "grad_norm": 0.03795412927865982, + "learning_rate": 0.0009832600761565764, + "loss": 0.0519, + "num_input_tokens_seen": 46229424, + "step": 21375 + }, + { + "epoch": 3.4877650897226755, + "grad_norm": 0.017168574035167694, + "learning_rate": 0.0009832418071780845, + "loss": 0.1306, + "num_input_tokens_seen": 46240656, + "step": 21380 + }, + { + "epoch": 3.4885807504078303, + "grad_norm": 0.05710841342806816, + "learning_rate": 0.0009832235284060842, + "loss": 0.1015, + "num_input_tokens_seen": 46251312, + "step": 21385 + }, + { + "epoch": 3.489396411092985, + "grad_norm": 0.1470293253660202, + "learning_rate": 0.0009832052398409464, + "loss": 0.0598, + "num_input_tokens_seen": 46262832, + "step": 21390 + }, + { + "epoch": 3.4902120717781404, + "grad_norm": 0.015464311465620995, + "learning_rate": 0.000983186941483041, + "loss": 0.0205, + "num_input_tokens_seen": 46273616, + "step": 21395 + }, + { + "epoch": 3.4910277324632952, + "grad_norm": 0.0574021190404892, + "learning_rate": 0.0009831686333327397, + "loss": 0.154, + "num_input_tokens_seen": 46285456, + "step": 21400 + }, + { + "epoch": 3.49184339314845, + "grad_norm": 0.04269903153181076, + "learning_rate": 0.0009831503153904127, + "loss": 0.0562, + "num_input_tokens_seen": 46296848, + "step": 21405 + }, + { + "epoch": 3.4926590538336053, + "grad_norm": 0.5759614706039429, + "learning_rate": 0.000983131987656432, + "loss": 0.2359, + "num_input_tokens_seen": 46308400, + "step": 21410 + }, + { + "epoch": 3.49347471451876, + "grad_norm": 0.02763795293867588, + "learning_rate": 0.0009831136501311684, + "loss": 0.0369, + "num_input_tokens_seen": 46319312, + "step": 21415 + }, + { + "epoch": 3.4942903752039154, + "grad_norm": 0.26870211958885193, + "learning_rate": 0.000983095302814994, + "loss": 0.2007, + "num_input_tokens_seen": 46331184, + "step": 21420 + }, + { + "epoch": 3.49510603588907, + "grad_norm": 0.10838611423969269, + "learning_rate": 0.0009830769457082804, + "loss": 0.0913, + "num_input_tokens_seen": 46341264, + "step": 21425 + }, + { + "epoch": 3.495921696574225, + "grad_norm": 0.325128972530365, + "learning_rate": 0.0009830585788113994, + "loss": 0.1205, + "num_input_tokens_seen": 46352432, + "step": 21430 + }, + { + "epoch": 3.4967373572593803, + "grad_norm": 0.09117142111063004, + "learning_rate": 0.0009830402021247238, + "loss": 0.0678, + "num_input_tokens_seen": 46363280, + "step": 21435 + }, + { + "epoch": 3.497553017944535, + "grad_norm": 0.09155073761940002, + "learning_rate": 0.0009830218156486256, + "loss": 0.2447, + "num_input_tokens_seen": 46372848, + "step": 21440 + }, + { + "epoch": 3.49836867862969, + "grad_norm": 0.022856619209051132, + "learning_rate": 0.0009830034193834777, + "loss": 0.0627, + "num_input_tokens_seen": 46384976, + "step": 21445 + }, + { + "epoch": 3.499184339314845, + "grad_norm": 0.4519825279712677, + "learning_rate": 0.0009829850133296527, + "loss": 0.1388, + "num_input_tokens_seen": 46396560, + "step": 21450 + }, + { + "epoch": 3.5, + "grad_norm": 0.06419207900762558, + "learning_rate": 0.0009829665974875237, + "loss": 0.1412, + "num_input_tokens_seen": 46407472, + "step": 21455 + }, + { + "epoch": 3.500815660685155, + "grad_norm": 0.19768834114074707, + "learning_rate": 0.0009829481718574638, + "loss": 0.1633, + "num_input_tokens_seen": 46417872, + "step": 21460 + }, + { + "epoch": 3.50163132137031, + "grad_norm": 0.04964315891265869, + "learning_rate": 0.0009829297364398466, + "loss": 0.0879, + "num_input_tokens_seen": 46428432, + "step": 21465 + }, + { + "epoch": 3.502446982055465, + "grad_norm": 0.020760485902428627, + "learning_rate": 0.0009829112912350456, + "loss": 0.0781, + "num_input_tokens_seen": 46439856, + "step": 21470 + }, + { + "epoch": 3.50326264274062, + "grad_norm": 0.020568421110510826, + "learning_rate": 0.000982892836243435, + "loss": 0.0753, + "num_input_tokens_seen": 46451376, + "step": 21475 + }, + { + "epoch": 3.504078303425775, + "grad_norm": 0.04842434450984001, + "learning_rate": 0.000982874371465388, + "loss": 0.0795, + "num_input_tokens_seen": 46461456, + "step": 21480 + }, + { + "epoch": 3.50489396411093, + "grad_norm": 0.09125185012817383, + "learning_rate": 0.0009828558969012795, + "loss": 0.0588, + "num_input_tokens_seen": 46471600, + "step": 21485 + }, + { + "epoch": 3.5057096247960846, + "grad_norm": 0.02235431969165802, + "learning_rate": 0.0009828374125514837, + "loss": 0.1059, + "num_input_tokens_seen": 46482224, + "step": 21490 + }, + { + "epoch": 3.50652528548124, + "grad_norm": 0.22898629307746887, + "learning_rate": 0.0009828189184163752, + "loss": 0.382, + "num_input_tokens_seen": 46492496, + "step": 21495 + }, + { + "epoch": 3.5073409461663947, + "grad_norm": 0.15465903282165527, + "learning_rate": 0.0009828004144963288, + "loss": 0.1664, + "num_input_tokens_seen": 46503120, + "step": 21500 + }, + { + "epoch": 3.50815660685155, + "grad_norm": 0.03542419150471687, + "learning_rate": 0.0009827819007917195, + "loss": 0.2168, + "num_input_tokens_seen": 46513680, + "step": 21505 + }, + { + "epoch": 3.5089722675367048, + "grad_norm": 0.127181276679039, + "learning_rate": 0.0009827633773029228, + "loss": 0.0798, + "num_input_tokens_seen": 46523632, + "step": 21510 + }, + { + "epoch": 3.5097879282218596, + "grad_norm": 0.039647456258535385, + "learning_rate": 0.0009827448440303135, + "loss": 0.1077, + "num_input_tokens_seen": 46534352, + "step": 21515 + }, + { + "epoch": 3.5106035889070144, + "grad_norm": 0.024184754118323326, + "learning_rate": 0.0009827263009742678, + "loss": 0.1848, + "num_input_tokens_seen": 46547056, + "step": 21520 + }, + { + "epoch": 3.5114192495921697, + "grad_norm": 0.054916176944971085, + "learning_rate": 0.000982707748135161, + "loss": 0.0938, + "num_input_tokens_seen": 46557648, + "step": 21525 + }, + { + "epoch": 3.5122349102773245, + "grad_norm": 0.25888592004776, + "learning_rate": 0.0009826891855133693, + "loss": 0.1358, + "num_input_tokens_seen": 46567888, + "step": 21530 + }, + { + "epoch": 3.5130505709624797, + "grad_norm": 0.25864580273628235, + "learning_rate": 0.000982670613109269, + "loss": 0.1325, + "num_input_tokens_seen": 46578096, + "step": 21535 + }, + { + "epoch": 3.5138662316476346, + "grad_norm": 0.1897091567516327, + "learning_rate": 0.0009826520309232365, + "loss": 0.2476, + "num_input_tokens_seen": 46589104, + "step": 21540 + }, + { + "epoch": 3.5146818923327894, + "grad_norm": 0.10447441786527634, + "learning_rate": 0.0009826334389556482, + "loss": 0.0694, + "num_input_tokens_seen": 46600080, + "step": 21545 + }, + { + "epoch": 3.5154975530179446, + "grad_norm": 0.023850787431001663, + "learning_rate": 0.000982614837206881, + "loss": 0.1558, + "num_input_tokens_seen": 46610096, + "step": 21550 + }, + { + "epoch": 3.5163132137030995, + "grad_norm": 0.036812786012887955, + "learning_rate": 0.000982596225677312, + "loss": 0.0632, + "num_input_tokens_seen": 46620688, + "step": 21555 + }, + { + "epoch": 3.5171288743882547, + "grad_norm": 0.16952641308307648, + "learning_rate": 0.0009825776043673182, + "loss": 0.0788, + "num_input_tokens_seen": 46631824, + "step": 21560 + }, + { + "epoch": 3.5179445350734095, + "grad_norm": 0.024134181439876556, + "learning_rate": 0.000982558973277277, + "loss": 0.1133, + "num_input_tokens_seen": 46642832, + "step": 21565 + }, + { + "epoch": 3.5187601957585644, + "grad_norm": 0.1285693347454071, + "learning_rate": 0.0009825403324075662, + "loss": 0.0894, + "num_input_tokens_seen": 46653456, + "step": 21570 + }, + { + "epoch": 3.519575856443719, + "grad_norm": 0.009325760416686535, + "learning_rate": 0.0009825216817585633, + "loss": 0.1061, + "num_input_tokens_seen": 46665136, + "step": 21575 + }, + { + "epoch": 3.5203915171288744, + "grad_norm": 0.1657373160123825, + "learning_rate": 0.0009825030213306463, + "loss": 0.1963, + "num_input_tokens_seen": 46674544, + "step": 21580 + }, + { + "epoch": 3.5212071778140293, + "grad_norm": 0.10782121121883392, + "learning_rate": 0.0009824843511241936, + "loss": 0.117, + "num_input_tokens_seen": 46685264, + "step": 21585 + }, + { + "epoch": 3.5220228384991845, + "grad_norm": 0.11278649419546127, + "learning_rate": 0.0009824656711395834, + "loss": 0.2149, + "num_input_tokens_seen": 46696528, + "step": 21590 + }, + { + "epoch": 3.5228384991843393, + "grad_norm": 0.11115585267543793, + "learning_rate": 0.0009824469813771945, + "loss": 0.1136, + "num_input_tokens_seen": 46708176, + "step": 21595 + }, + { + "epoch": 3.523654159869494, + "grad_norm": 0.030352916568517685, + "learning_rate": 0.0009824282818374052, + "loss": 0.0661, + "num_input_tokens_seen": 46717328, + "step": 21600 + }, + { + "epoch": 3.5244698205546494, + "grad_norm": 0.07619695365428925, + "learning_rate": 0.000982409572520595, + "loss": 0.2937, + "num_input_tokens_seen": 46727568, + "step": 21605 + }, + { + "epoch": 3.5252854812398042, + "grad_norm": 0.08703344315290451, + "learning_rate": 0.0009823908534271426, + "loss": 0.2221, + "num_input_tokens_seen": 46738928, + "step": 21610 + }, + { + "epoch": 3.5261011419249595, + "grad_norm": 0.07479031383991241, + "learning_rate": 0.0009823721245574278, + "loss": 0.1035, + "num_input_tokens_seen": 46748752, + "step": 21615 + }, + { + "epoch": 3.5269168026101143, + "grad_norm": 0.04100106284022331, + "learning_rate": 0.0009823533859118299, + "loss": 0.1089, + "num_input_tokens_seen": 46760240, + "step": 21620 + }, + { + "epoch": 3.527732463295269, + "grad_norm": 0.08685865998268127, + "learning_rate": 0.0009823346374907287, + "loss": 0.1158, + "num_input_tokens_seen": 46769936, + "step": 21625 + }, + { + "epoch": 3.528548123980424, + "grad_norm": 0.03426363319158554, + "learning_rate": 0.000982315879294504, + "loss": 0.0956, + "num_input_tokens_seen": 46779888, + "step": 21630 + }, + { + "epoch": 3.529363784665579, + "grad_norm": 0.1270504593849182, + "learning_rate": 0.0009822971113235366, + "loss": 0.1797, + "num_input_tokens_seen": 46791504, + "step": 21635 + }, + { + "epoch": 3.530179445350734, + "grad_norm": 0.10011230409145355, + "learning_rate": 0.0009822783335782061, + "loss": 0.1347, + "num_input_tokens_seen": 46802096, + "step": 21640 + }, + { + "epoch": 3.5309951060358893, + "grad_norm": 0.11784350126981735, + "learning_rate": 0.0009822595460588935, + "loss": 0.1285, + "num_input_tokens_seen": 46813616, + "step": 21645 + }, + { + "epoch": 3.531810766721044, + "grad_norm": 0.23490601778030396, + "learning_rate": 0.0009822407487659792, + "loss": 0.1382, + "num_input_tokens_seen": 46824816, + "step": 21650 + }, + { + "epoch": 3.532626427406199, + "grad_norm": 0.14736691117286682, + "learning_rate": 0.0009822219416998445, + "loss": 0.2027, + "num_input_tokens_seen": 46835536, + "step": 21655 + }, + { + "epoch": 3.5334420880913537, + "grad_norm": 0.20881640911102295, + "learning_rate": 0.0009822031248608704, + "loss": 0.2451, + "num_input_tokens_seen": 46847280, + "step": 21660 + }, + { + "epoch": 3.534257748776509, + "grad_norm": 0.06078454852104187, + "learning_rate": 0.0009821842982494383, + "loss": 0.085, + "num_input_tokens_seen": 46859152, + "step": 21665 + }, + { + "epoch": 3.535073409461664, + "grad_norm": 0.03408531844615936, + "learning_rate": 0.0009821654618659297, + "loss": 0.0769, + "num_input_tokens_seen": 46869584, + "step": 21670 + }, + { + "epoch": 3.535889070146819, + "grad_norm": 0.06222820654511452, + "learning_rate": 0.0009821466157107263, + "loss": 0.208, + "num_input_tokens_seen": 46880432, + "step": 21675 + }, + { + "epoch": 3.536704730831974, + "grad_norm": 0.09785876423120499, + "learning_rate": 0.0009821277597842101, + "loss": 0.0608, + "num_input_tokens_seen": 46890768, + "step": 21680 + }, + { + "epoch": 3.5375203915171287, + "grad_norm": 0.010254275985062122, + "learning_rate": 0.0009821088940867632, + "loss": 0.0897, + "num_input_tokens_seen": 46902352, + "step": 21685 + }, + { + "epoch": 3.538336052202284, + "grad_norm": 0.07269848138093948, + "learning_rate": 0.0009820900186187681, + "loss": 0.1248, + "num_input_tokens_seen": 46912816, + "step": 21690 + }, + { + "epoch": 3.539151712887439, + "grad_norm": 0.06444855034351349, + "learning_rate": 0.0009820711333806068, + "loss": 0.0661, + "num_input_tokens_seen": 46923888, + "step": 21695 + }, + { + "epoch": 3.539967373572594, + "grad_norm": 0.04911039397120476, + "learning_rate": 0.000982052238372663, + "loss": 0.0567, + "num_input_tokens_seen": 46934832, + "step": 21700 + }, + { + "epoch": 3.540783034257749, + "grad_norm": 0.19150716066360474, + "learning_rate": 0.0009820333335953187, + "loss": 0.1368, + "num_input_tokens_seen": 46944784, + "step": 21705 + }, + { + "epoch": 3.5415986949429037, + "grad_norm": 0.18624483048915863, + "learning_rate": 0.0009820144190489574, + "loss": 0.1731, + "num_input_tokens_seen": 46955376, + "step": 21710 + }, + { + "epoch": 3.5424143556280585, + "grad_norm": 0.010764437727630138, + "learning_rate": 0.0009819954947339624, + "loss": 0.1629, + "num_input_tokens_seen": 46964944, + "step": 21715 + }, + { + "epoch": 3.5432300163132138, + "grad_norm": 0.07276313006877899, + "learning_rate": 0.0009819765606507173, + "loss": 0.0411, + "num_input_tokens_seen": 46974960, + "step": 21720 + }, + { + "epoch": 3.5440456769983686, + "grad_norm": 0.09438583999872208, + "learning_rate": 0.0009819576167996058, + "loss": 0.1368, + "num_input_tokens_seen": 46986416, + "step": 21725 + }, + { + "epoch": 3.544861337683524, + "grad_norm": 0.005585776641964912, + "learning_rate": 0.000981938663181012, + "loss": 0.1781, + "num_input_tokens_seen": 46996240, + "step": 21730 + }, + { + "epoch": 3.5456769983686787, + "grad_norm": 0.08937297016382217, + "learning_rate": 0.0009819196997953195, + "loss": 0.1255, + "num_input_tokens_seen": 47007472, + "step": 21735 + }, + { + "epoch": 3.5464926590538335, + "grad_norm": 0.026922032237052917, + "learning_rate": 0.000981900726642913, + "loss": 0.0574, + "num_input_tokens_seen": 47018128, + "step": 21740 + }, + { + "epoch": 3.5473083197389887, + "grad_norm": 0.017074687406420708, + "learning_rate": 0.0009818817437241768, + "loss": 0.0855, + "num_input_tokens_seen": 47029456, + "step": 21745 + }, + { + "epoch": 3.5481239804241436, + "grad_norm": 0.03096526488661766, + "learning_rate": 0.000981862751039496, + "loss": 0.1789, + "num_input_tokens_seen": 47039792, + "step": 21750 + }, + { + "epoch": 3.5489396411092984, + "grad_norm": 0.06783930957317352, + "learning_rate": 0.000981843748589255, + "loss": 0.0915, + "num_input_tokens_seen": 47050032, + "step": 21755 + }, + { + "epoch": 3.5497553017944536, + "grad_norm": 0.08572400361299515, + "learning_rate": 0.0009818247363738396, + "loss": 0.1358, + "num_input_tokens_seen": 47060336, + "step": 21760 + }, + { + "epoch": 3.5505709624796085, + "grad_norm": 0.08011411875486374, + "learning_rate": 0.0009818057143936344, + "loss": 0.104, + "num_input_tokens_seen": 47070160, + "step": 21765 + }, + { + "epoch": 3.5513866231647633, + "grad_norm": 0.029326729476451874, + "learning_rate": 0.000981786682649025, + "loss": 0.0554, + "num_input_tokens_seen": 47081008, + "step": 21770 + }, + { + "epoch": 3.5522022838499185, + "grad_norm": 0.008124127052724361, + "learning_rate": 0.0009817676411403976, + "loss": 0.1112, + "num_input_tokens_seen": 47091088, + "step": 21775 + }, + { + "epoch": 3.5530179445350734, + "grad_norm": 0.10163454711437225, + "learning_rate": 0.0009817485898681378, + "loss": 0.0836, + "num_input_tokens_seen": 47102384, + "step": 21780 + }, + { + "epoch": 3.5538336052202286, + "grad_norm": 0.04621696472167969, + "learning_rate": 0.0009817295288326315, + "loss": 0.0513, + "num_input_tokens_seen": 47114064, + "step": 21785 + }, + { + "epoch": 3.5546492659053834, + "grad_norm": 0.09239888191223145, + "learning_rate": 0.0009817104580342653, + "loss": 0.103, + "num_input_tokens_seen": 47123824, + "step": 21790 + }, + { + "epoch": 3.5554649265905383, + "grad_norm": 0.017387012019753456, + "learning_rate": 0.0009816913774734254, + "loss": 0.048, + "num_input_tokens_seen": 47136432, + "step": 21795 + }, + { + "epoch": 3.556280587275693, + "grad_norm": 0.00779850734397769, + "learning_rate": 0.0009816722871504987, + "loss": 0.025, + "num_input_tokens_seen": 47148496, + "step": 21800 + }, + { + "epoch": 3.5570962479608483, + "grad_norm": 0.00785167794674635, + "learning_rate": 0.0009816531870658722, + "loss": 0.0417, + "num_input_tokens_seen": 47159600, + "step": 21805 + }, + { + "epoch": 3.557911908646003, + "grad_norm": 0.03706509619951248, + "learning_rate": 0.0009816340772199328, + "loss": 0.1361, + "num_input_tokens_seen": 47170224, + "step": 21810 + }, + { + "epoch": 3.5587275693311584, + "grad_norm": 0.04010167345404625, + "learning_rate": 0.0009816149576130678, + "loss": 0.0308, + "num_input_tokens_seen": 47181360, + "step": 21815 + }, + { + "epoch": 3.5595432300163132, + "grad_norm": 0.04599921405315399, + "learning_rate": 0.0009815958282456648, + "loss": 0.2459, + "num_input_tokens_seen": 47192176, + "step": 21820 + }, + { + "epoch": 3.560358890701468, + "grad_norm": 0.037334144115448, + "learning_rate": 0.0009815766891181112, + "loss": 0.1097, + "num_input_tokens_seen": 47201616, + "step": 21825 + }, + { + "epoch": 3.5611745513866233, + "grad_norm": 0.10492201894521713, + "learning_rate": 0.0009815575402307953, + "loss": 0.0899, + "num_input_tokens_seen": 47211280, + "step": 21830 + }, + { + "epoch": 3.561990212071778, + "grad_norm": 0.0777861624956131, + "learning_rate": 0.0009815383815841047, + "loss": 0.0296, + "num_input_tokens_seen": 47221968, + "step": 21835 + }, + { + "epoch": 3.5628058727569334, + "grad_norm": 0.08785879611968994, + "learning_rate": 0.0009815192131784282, + "loss": 0.3332, + "num_input_tokens_seen": 47233136, + "step": 21840 + }, + { + "epoch": 3.563621533442088, + "grad_norm": 0.009797224774956703, + "learning_rate": 0.0009815000350141539, + "loss": 0.1732, + "num_input_tokens_seen": 47242672, + "step": 21845 + }, + { + "epoch": 3.564437194127243, + "grad_norm": 0.02497190050780773, + "learning_rate": 0.0009814808470916705, + "loss": 0.1779, + "num_input_tokens_seen": 47253552, + "step": 21850 + }, + { + "epoch": 3.565252854812398, + "grad_norm": 0.08281011134386063, + "learning_rate": 0.0009814616494113668, + "loss": 0.1092, + "num_input_tokens_seen": 47265680, + "step": 21855 + }, + { + "epoch": 3.566068515497553, + "grad_norm": 0.03524525463581085, + "learning_rate": 0.0009814424419736323, + "loss": 0.0513, + "num_input_tokens_seen": 47277200, + "step": 21860 + }, + { + "epoch": 3.566884176182708, + "grad_norm": 0.022290315479040146, + "learning_rate": 0.0009814232247788556, + "loss": 0.1073, + "num_input_tokens_seen": 47288240, + "step": 21865 + }, + { + "epoch": 3.567699836867863, + "grad_norm": 0.03501790761947632, + "learning_rate": 0.0009814039978274269, + "loss": 0.0556, + "num_input_tokens_seen": 47297808, + "step": 21870 + }, + { + "epoch": 3.568515497553018, + "grad_norm": 0.06423972547054291, + "learning_rate": 0.0009813847611197352, + "loss": 0.1121, + "num_input_tokens_seen": 47308304, + "step": 21875 + }, + { + "epoch": 3.569331158238173, + "grad_norm": 0.16512851417064667, + "learning_rate": 0.0009813655146561709, + "loss": 0.0808, + "num_input_tokens_seen": 47320080, + "step": 21880 + }, + { + "epoch": 3.5701468189233276, + "grad_norm": 0.1547977477312088, + "learning_rate": 0.0009813462584371236, + "loss": 0.1394, + "num_input_tokens_seen": 47331952, + "step": 21885 + }, + { + "epoch": 3.570962479608483, + "grad_norm": 0.10016020387411118, + "learning_rate": 0.0009813269924629838, + "loss": 0.0572, + "num_input_tokens_seen": 47342864, + "step": 21890 + }, + { + "epoch": 3.5717781402936377, + "grad_norm": 0.12822580337524414, + "learning_rate": 0.000981307716734142, + "loss": 0.0958, + "num_input_tokens_seen": 47353296, + "step": 21895 + }, + { + "epoch": 3.572593800978793, + "grad_norm": 0.14780253171920776, + "learning_rate": 0.0009812884312509883, + "loss": 0.132, + "num_input_tokens_seen": 47364720, + "step": 21900 + }, + { + "epoch": 3.573409461663948, + "grad_norm": 0.0791921615600586, + "learning_rate": 0.0009812691360139144, + "loss": 0.0664, + "num_input_tokens_seen": 47375920, + "step": 21905 + }, + { + "epoch": 3.5742251223491026, + "grad_norm": 0.005884220823645592, + "learning_rate": 0.000981249831023311, + "loss": 0.1976, + "num_input_tokens_seen": 47388144, + "step": 21910 + }, + { + "epoch": 3.575040783034258, + "grad_norm": 0.052293986082077026, + "learning_rate": 0.000981230516279569, + "loss": 0.0787, + "num_input_tokens_seen": 47398640, + "step": 21915 + }, + { + "epoch": 3.5758564437194127, + "grad_norm": 0.047704145312309265, + "learning_rate": 0.0009812111917830801, + "loss": 0.0788, + "num_input_tokens_seen": 47410000, + "step": 21920 + }, + { + "epoch": 3.576672104404568, + "grad_norm": 0.12634558975696564, + "learning_rate": 0.000981191857534236, + "loss": 0.1064, + "num_input_tokens_seen": 47419824, + "step": 21925 + }, + { + "epoch": 3.5774877650897228, + "grad_norm": 0.27708667516708374, + "learning_rate": 0.0009811725135334287, + "loss": 0.2807, + "num_input_tokens_seen": 47431120, + "step": 21930 + }, + { + "epoch": 3.5783034257748776, + "grad_norm": 0.05823507532477379, + "learning_rate": 0.0009811531597810497, + "loss": 0.1329, + "num_input_tokens_seen": 47441232, + "step": 21935 + }, + { + "epoch": 3.5791190864600324, + "grad_norm": 0.034099794924259186, + "learning_rate": 0.0009811337962774916, + "loss": 0.1188, + "num_input_tokens_seen": 47451504, + "step": 21940 + }, + { + "epoch": 3.5799347471451877, + "grad_norm": 0.1668100655078888, + "learning_rate": 0.0009811144230231468, + "loss": 0.1736, + "num_input_tokens_seen": 47463472, + "step": 21945 + }, + { + "epoch": 3.5807504078303425, + "grad_norm": 0.013046424835920334, + "learning_rate": 0.0009810950400184078, + "loss": 0.145, + "num_input_tokens_seen": 47474416, + "step": 21950 + }, + { + "epoch": 3.5815660685154977, + "grad_norm": 0.05839097872376442, + "learning_rate": 0.0009810756472636677, + "loss": 0.1073, + "num_input_tokens_seen": 47486000, + "step": 21955 + }, + { + "epoch": 3.5823817292006526, + "grad_norm": 0.026946526020765305, + "learning_rate": 0.000981056244759319, + "loss": 0.0967, + "num_input_tokens_seen": 47496464, + "step": 21960 + }, + { + "epoch": 3.5831973898858074, + "grad_norm": 0.02627391740679741, + "learning_rate": 0.0009810368325057555, + "loss": 0.0605, + "num_input_tokens_seen": 47506800, + "step": 21965 + }, + { + "epoch": 3.5840130505709626, + "grad_norm": 0.01618942618370056, + "learning_rate": 0.0009810174105033703, + "loss": 0.1714, + "num_input_tokens_seen": 47517008, + "step": 21970 + }, + { + "epoch": 3.5848287112561175, + "grad_norm": 0.05231276527047157, + "learning_rate": 0.000980997978752557, + "loss": 0.1563, + "num_input_tokens_seen": 47528176, + "step": 21975 + }, + { + "epoch": 3.5856443719412723, + "grad_norm": 0.08935698866844177, + "learning_rate": 0.0009809785372537094, + "loss": 0.1748, + "num_input_tokens_seen": 47539248, + "step": 21980 + }, + { + "epoch": 3.5864600326264275, + "grad_norm": 0.015321357175707817, + "learning_rate": 0.0009809590860072217, + "loss": 0.0311, + "num_input_tokens_seen": 47549712, + "step": 21985 + }, + { + "epoch": 3.5872756933115824, + "grad_norm": 0.08399423211812973, + "learning_rate": 0.0009809396250134881, + "loss": 0.0744, + "num_input_tokens_seen": 47560432, + "step": 21990 + }, + { + "epoch": 3.588091353996737, + "grad_norm": 0.06792002171278, + "learning_rate": 0.0009809201542729028, + "loss": 0.0799, + "num_input_tokens_seen": 47572112, + "step": 21995 + }, + { + "epoch": 3.5889070146818924, + "grad_norm": 0.008914710953831673, + "learning_rate": 0.0009809006737858603, + "loss": 0.1498, + "num_input_tokens_seen": 47583184, + "step": 22000 + }, + { + "epoch": 3.5897226753670473, + "grad_norm": 0.004703295882791281, + "learning_rate": 0.0009808811835527557, + "loss": 0.1542, + "num_input_tokens_seen": 47594544, + "step": 22005 + }, + { + "epoch": 3.5905383360522025, + "grad_norm": 0.029907135292887688, + "learning_rate": 0.000980861683573984, + "loss": 0.0412, + "num_input_tokens_seen": 47605136, + "step": 22010 + }, + { + "epoch": 3.5913539967373573, + "grad_norm": 0.23002728819847107, + "learning_rate": 0.00098084217384994, + "loss": 0.137, + "num_input_tokens_seen": 47615440, + "step": 22015 + }, + { + "epoch": 3.592169657422512, + "grad_norm": 0.06879785656929016, + "learning_rate": 0.0009808226543810198, + "loss": 0.05, + "num_input_tokens_seen": 47626128, + "step": 22020 + }, + { + "epoch": 3.592985318107667, + "grad_norm": 0.10355670005083084, + "learning_rate": 0.0009808031251676182, + "loss": 0.1418, + "num_input_tokens_seen": 47636784, + "step": 22025 + }, + { + "epoch": 3.5938009787928222, + "grad_norm": 0.020710989832878113, + "learning_rate": 0.0009807835862101313, + "loss": 0.1934, + "num_input_tokens_seen": 47648624, + "step": 22030 + }, + { + "epoch": 3.594616639477977, + "grad_norm": 0.21865439414978027, + "learning_rate": 0.0009807640375089552, + "loss": 0.1058, + "num_input_tokens_seen": 47659344, + "step": 22035 + }, + { + "epoch": 3.5954323001631323, + "grad_norm": 0.047853246331214905, + "learning_rate": 0.000980744479064486, + "loss": 0.1016, + "num_input_tokens_seen": 47670064, + "step": 22040 + }, + { + "epoch": 3.596247960848287, + "grad_norm": 0.013760424219071865, + "learning_rate": 0.00098072491087712, + "loss": 0.1097, + "num_input_tokens_seen": 47681264, + "step": 22045 + }, + { + "epoch": 3.597063621533442, + "grad_norm": 0.17822307348251343, + "learning_rate": 0.0009807053329472539, + "loss": 0.3183, + "num_input_tokens_seen": 47692304, + "step": 22050 + }, + { + "epoch": 3.597879282218597, + "grad_norm": 0.15463876724243164, + "learning_rate": 0.0009806857452752844, + "loss": 0.1162, + "num_input_tokens_seen": 47703600, + "step": 22055 + }, + { + "epoch": 3.598694942903752, + "grad_norm": 0.025757692754268646, + "learning_rate": 0.0009806661478616084, + "loss": 0.0322, + "num_input_tokens_seen": 47713520, + "step": 22060 + }, + { + "epoch": 3.5995106035889073, + "grad_norm": 0.026838814839720726, + "learning_rate": 0.000980646540706623, + "loss": 0.0998, + "num_input_tokens_seen": 47723600, + "step": 22065 + }, + { + "epoch": 3.600326264274062, + "grad_norm": 0.021702522411942482, + "learning_rate": 0.0009806269238107261, + "loss": 0.1855, + "num_input_tokens_seen": 47733808, + "step": 22070 + }, + { + "epoch": 3.601141924959217, + "grad_norm": 0.06848857551813126, + "learning_rate": 0.0009806072971743148, + "loss": 0.0631, + "num_input_tokens_seen": 47745296, + "step": 22075 + }, + { + "epoch": 3.6019575856443717, + "grad_norm": 0.03980998322367668, + "learning_rate": 0.000980587660797787, + "loss": 0.0783, + "num_input_tokens_seen": 47754736, + "step": 22080 + }, + { + "epoch": 3.602773246329527, + "grad_norm": 0.01532980240881443, + "learning_rate": 0.00098056801468154, + "loss": 0.0586, + "num_input_tokens_seen": 47766160, + "step": 22085 + }, + { + "epoch": 3.603588907014682, + "grad_norm": 0.010872176848351955, + "learning_rate": 0.0009805483588259732, + "loss": 0.0167, + "num_input_tokens_seen": 47777904, + "step": 22090 + }, + { + "epoch": 3.604404567699837, + "grad_norm": 0.04009169712662697, + "learning_rate": 0.000980528693231484, + "loss": 0.1523, + "num_input_tokens_seen": 47789328, + "step": 22095 + }, + { + "epoch": 3.605220228384992, + "grad_norm": 0.010094402357935905, + "learning_rate": 0.0009805090178984712, + "loss": 0.0779, + "num_input_tokens_seen": 47799472, + "step": 22100 + }, + { + "epoch": 3.6060358890701467, + "grad_norm": 0.06072307005524635, + "learning_rate": 0.0009804893328273336, + "loss": 0.1104, + "num_input_tokens_seen": 47811248, + "step": 22105 + }, + { + "epoch": 3.6068515497553015, + "grad_norm": 0.05516495928168297, + "learning_rate": 0.0009804696380184704, + "loss": 0.0625, + "num_input_tokens_seen": 47823152, + "step": 22110 + }, + { + "epoch": 3.607667210440457, + "grad_norm": 0.2074514776468277, + "learning_rate": 0.0009804499334722801, + "loss": 0.0722, + "num_input_tokens_seen": 47834448, + "step": 22115 + }, + { + "epoch": 3.6084828711256116, + "grad_norm": 0.01784053072333336, + "learning_rate": 0.0009804302191891625, + "loss": 0.1881, + "num_input_tokens_seen": 47845360, + "step": 22120 + }, + { + "epoch": 3.609298531810767, + "grad_norm": 0.005364630371332169, + "learning_rate": 0.0009804104951695173, + "loss": 0.0771, + "num_input_tokens_seen": 47854736, + "step": 22125 + }, + { + "epoch": 3.6101141924959217, + "grad_norm": 0.2547522187232971, + "learning_rate": 0.0009803907614137435, + "loss": 0.1743, + "num_input_tokens_seen": 47866864, + "step": 22130 + }, + { + "epoch": 3.6109298531810765, + "grad_norm": 0.0751950666308403, + "learning_rate": 0.0009803710179222419, + "loss": 0.1087, + "num_input_tokens_seen": 47877456, + "step": 22135 + }, + { + "epoch": 3.6117455138662318, + "grad_norm": 0.002935384400188923, + "learning_rate": 0.000980351264695412, + "loss": 0.1391, + "num_input_tokens_seen": 47889392, + "step": 22140 + }, + { + "epoch": 3.6125611745513866, + "grad_norm": 0.026649711653590202, + "learning_rate": 0.0009803315017336545, + "loss": 0.0165, + "num_input_tokens_seen": 47900016, + "step": 22145 + }, + { + "epoch": 3.613376835236542, + "grad_norm": 0.020435314625501633, + "learning_rate": 0.0009803117290373697, + "loss": 0.185, + "num_input_tokens_seen": 47910416, + "step": 22150 + }, + { + "epoch": 3.6141924959216967, + "grad_norm": 0.15744924545288086, + "learning_rate": 0.0009802919466069585, + "loss": 0.1078, + "num_input_tokens_seen": 47921584, + "step": 22155 + }, + { + "epoch": 3.6150081566068515, + "grad_norm": 0.16279421746730804, + "learning_rate": 0.0009802721544428215, + "loss": 0.2165, + "num_input_tokens_seen": 47932016, + "step": 22160 + }, + { + "epoch": 3.6158238172920063, + "grad_norm": 0.0843036100268364, + "learning_rate": 0.0009802523525453601, + "loss": 0.2297, + "num_input_tokens_seen": 47942864, + "step": 22165 + }, + { + "epoch": 3.6166394779771616, + "grad_norm": 0.18664637207984924, + "learning_rate": 0.0009802325409149757, + "loss": 0.0993, + "num_input_tokens_seen": 47953968, + "step": 22170 + }, + { + "epoch": 3.6174551386623164, + "grad_norm": 0.00876756850630045, + "learning_rate": 0.0009802127195520697, + "loss": 0.0728, + "num_input_tokens_seen": 47964688, + "step": 22175 + }, + { + "epoch": 3.6182707993474716, + "grad_norm": 0.015477425418794155, + "learning_rate": 0.0009801928884570434, + "loss": 0.0471, + "num_input_tokens_seen": 47974992, + "step": 22180 + }, + { + "epoch": 3.6190864600326265, + "grad_norm": 0.07113178819417953, + "learning_rate": 0.0009801730476302992, + "loss": 0.1995, + "num_input_tokens_seen": 47984688, + "step": 22185 + }, + { + "epoch": 3.6199021207177813, + "grad_norm": 0.13852275907993317, + "learning_rate": 0.000980153197072239, + "loss": 0.1148, + "num_input_tokens_seen": 47995792, + "step": 22190 + }, + { + "epoch": 3.6207177814029365, + "grad_norm": 0.09939780831336975, + "learning_rate": 0.0009801333367832651, + "loss": 0.1039, + "num_input_tokens_seen": 48005424, + "step": 22195 + }, + { + "epoch": 3.6215334420880914, + "grad_norm": 0.07844390720129013, + "learning_rate": 0.0009801134667637803, + "loss": 0.0971, + "num_input_tokens_seen": 48016368, + "step": 22200 + }, + { + "epoch": 3.622349102773246, + "grad_norm": 0.16363392770290375, + "learning_rate": 0.0009800935870141868, + "loss": 0.0741, + "num_input_tokens_seen": 48027568, + "step": 22205 + }, + { + "epoch": 3.6231647634584014, + "grad_norm": 0.08116459101438522, + "learning_rate": 0.0009800736975348878, + "loss": 0.1015, + "num_input_tokens_seen": 48039056, + "step": 22210 + }, + { + "epoch": 3.6239804241435563, + "grad_norm": 0.025155600160360336, + "learning_rate": 0.0009800537983262862, + "loss": 0.0714, + "num_input_tokens_seen": 48049936, + "step": 22215 + }, + { + "epoch": 3.624796084828711, + "grad_norm": 0.11358506232500076, + "learning_rate": 0.0009800338893887857, + "loss": 0.0955, + "num_input_tokens_seen": 48061232, + "step": 22220 + }, + { + "epoch": 3.6256117455138663, + "grad_norm": 0.16867463290691376, + "learning_rate": 0.000980013970722789, + "loss": 0.0756, + "num_input_tokens_seen": 48072336, + "step": 22225 + }, + { + "epoch": 3.626427406199021, + "grad_norm": 0.12615631520748138, + "learning_rate": 0.0009799940423287005, + "loss": 0.188, + "num_input_tokens_seen": 48083792, + "step": 22230 + }, + { + "epoch": 3.6272430668841764, + "grad_norm": 0.027677416801452637, + "learning_rate": 0.000979974104206924, + "loss": 0.0424, + "num_input_tokens_seen": 48094512, + "step": 22235 + }, + { + "epoch": 3.6280587275693312, + "grad_norm": 0.038241248577833176, + "learning_rate": 0.0009799541563578632, + "loss": 0.1908, + "num_input_tokens_seen": 48106416, + "step": 22240 + }, + { + "epoch": 3.628874388254486, + "grad_norm": 0.04931412637233734, + "learning_rate": 0.0009799341987819224, + "loss": 0.1333, + "num_input_tokens_seen": 48117840, + "step": 22245 + }, + { + "epoch": 3.629690048939641, + "grad_norm": 0.028640341013669968, + "learning_rate": 0.0009799142314795065, + "loss": 0.0577, + "num_input_tokens_seen": 48127888, + "step": 22250 + }, + { + "epoch": 3.630505709624796, + "grad_norm": 0.03567443788051605, + "learning_rate": 0.0009798942544510198, + "loss": 0.0432, + "num_input_tokens_seen": 48137200, + "step": 22255 + }, + { + "epoch": 3.631321370309951, + "grad_norm": 0.06819088757038116, + "learning_rate": 0.000979874267696867, + "loss": 0.057, + "num_input_tokens_seen": 48147632, + "step": 22260 + }, + { + "epoch": 3.632137030995106, + "grad_norm": 0.1777115911245346, + "learning_rate": 0.0009798542712174537, + "loss": 0.1266, + "num_input_tokens_seen": 48158064, + "step": 22265 + }, + { + "epoch": 3.632952691680261, + "grad_norm": 0.013221224769949913, + "learning_rate": 0.0009798342650131845, + "loss": 0.0276, + "num_input_tokens_seen": 48168176, + "step": 22270 + }, + { + "epoch": 3.633768352365416, + "grad_norm": 0.09064479172229767, + "learning_rate": 0.0009798142490844656, + "loss": 0.2523, + "num_input_tokens_seen": 48179472, + "step": 22275 + }, + { + "epoch": 3.634584013050571, + "grad_norm": 0.09441931545734406, + "learning_rate": 0.0009797942234317022, + "loss": 0.1527, + "num_input_tokens_seen": 48189232, + "step": 22280 + }, + { + "epoch": 3.635399673735726, + "grad_norm": 0.1492157280445099, + "learning_rate": 0.0009797741880553, + "loss": 0.3185, + "num_input_tokens_seen": 48200496, + "step": 22285 + }, + { + "epoch": 3.636215334420881, + "grad_norm": 0.12546101212501526, + "learning_rate": 0.0009797541429556653, + "loss": 0.2057, + "num_input_tokens_seen": 48211280, + "step": 22290 + }, + { + "epoch": 3.637030995106036, + "grad_norm": 0.027052100747823715, + "learning_rate": 0.0009797340881332044, + "loss": 0.0607, + "num_input_tokens_seen": 48221872, + "step": 22295 + }, + { + "epoch": 3.637846655791191, + "grad_norm": 0.04728303104639053, + "learning_rate": 0.0009797140235883236, + "loss": 0.1707, + "num_input_tokens_seen": 48233616, + "step": 22300 + }, + { + "epoch": 3.6386623164763456, + "grad_norm": 0.10751637816429138, + "learning_rate": 0.0009796939493214294, + "loss": 0.1243, + "num_input_tokens_seen": 48244976, + "step": 22305 + }, + { + "epoch": 3.639477977161501, + "grad_norm": 0.17382705211639404, + "learning_rate": 0.000979673865332929, + "loss": 0.1327, + "num_input_tokens_seen": 48256016, + "step": 22310 + }, + { + "epoch": 3.6402936378466557, + "grad_norm": 0.0284186452627182, + "learning_rate": 0.0009796537716232289, + "loss": 0.0715, + "num_input_tokens_seen": 48267632, + "step": 22315 + }, + { + "epoch": 3.641109298531811, + "grad_norm": 0.028015002608299255, + "learning_rate": 0.000979633668192737, + "loss": 0.1001, + "num_input_tokens_seen": 48278544, + "step": 22320 + }, + { + "epoch": 3.641924959216966, + "grad_norm": 0.026279503479599953, + "learning_rate": 0.0009796135550418602, + "loss": 0.0556, + "num_input_tokens_seen": 48289616, + "step": 22325 + }, + { + "epoch": 3.6427406199021206, + "grad_norm": 0.07529424875974655, + "learning_rate": 0.0009795934321710062, + "loss": 0.1005, + "num_input_tokens_seen": 48300400, + "step": 22330 + }, + { + "epoch": 3.6435562805872754, + "grad_norm": 0.0067391046322882175, + "learning_rate": 0.0009795732995805829, + "loss": 0.1522, + "num_input_tokens_seen": 48310448, + "step": 22335 + }, + { + "epoch": 3.6443719412724307, + "grad_norm": 0.24121522903442383, + "learning_rate": 0.0009795531572709983, + "loss": 0.3409, + "num_input_tokens_seen": 48320688, + "step": 22340 + }, + { + "epoch": 3.6451876019575855, + "grad_norm": 0.06484576314687729, + "learning_rate": 0.0009795330052426608, + "loss": 0.0733, + "num_input_tokens_seen": 48331376, + "step": 22345 + }, + { + "epoch": 3.6460032626427408, + "grad_norm": 0.05939403548836708, + "learning_rate": 0.0009795128434959785, + "loss": 0.1592, + "num_input_tokens_seen": 48342288, + "step": 22350 + }, + { + "epoch": 3.6468189233278956, + "grad_norm": 0.06702617555856705, + "learning_rate": 0.00097949267203136, + "loss": 0.1425, + "num_input_tokens_seen": 48351440, + "step": 22355 + }, + { + "epoch": 3.6476345840130504, + "grad_norm": 0.04426341503858566, + "learning_rate": 0.0009794724908492143, + "loss": 0.0851, + "num_input_tokens_seen": 48363632, + "step": 22360 + }, + { + "epoch": 3.6484502446982057, + "grad_norm": 0.0624653585255146, + "learning_rate": 0.0009794522999499503, + "loss": 0.0316, + "num_input_tokens_seen": 48374512, + "step": 22365 + }, + { + "epoch": 3.6492659053833605, + "grad_norm": 0.10191851109266281, + "learning_rate": 0.0009794320993339772, + "loss": 0.0799, + "num_input_tokens_seen": 48385232, + "step": 22370 + }, + { + "epoch": 3.6500815660685157, + "grad_norm": 0.2434154897928238, + "learning_rate": 0.0009794118890017046, + "loss": 0.1618, + "num_input_tokens_seen": 48396304, + "step": 22375 + }, + { + "epoch": 3.6508972267536706, + "grad_norm": 0.1698484867811203, + "learning_rate": 0.0009793916689535417, + "loss": 0.0629, + "num_input_tokens_seen": 48406384, + "step": 22380 + }, + { + "epoch": 3.6517128874388254, + "grad_norm": 0.12525101006031036, + "learning_rate": 0.0009793714391898984, + "loss": 0.1334, + "num_input_tokens_seen": 48416944, + "step": 22385 + }, + { + "epoch": 3.65252854812398, + "grad_norm": 0.14747369289398193, + "learning_rate": 0.000979351199711185, + "loss": 0.0558, + "num_input_tokens_seen": 48426928, + "step": 22390 + }, + { + "epoch": 3.6533442088091355, + "grad_norm": 0.14484144747257233, + "learning_rate": 0.0009793309505178112, + "loss": 0.0951, + "num_input_tokens_seen": 48437264, + "step": 22395 + }, + { + "epoch": 3.6541598694942903, + "grad_norm": 0.03382772579789162, + "learning_rate": 0.000979310691610188, + "loss": 0.0677, + "num_input_tokens_seen": 48446160, + "step": 22400 + }, + { + "epoch": 3.6549755301794455, + "grad_norm": 0.06506810337305069, + "learning_rate": 0.0009792904229887253, + "loss": 0.102, + "num_input_tokens_seen": 48456944, + "step": 22405 + }, + { + "epoch": 3.6557911908646004, + "grad_norm": 0.2006702721118927, + "learning_rate": 0.0009792701446538342, + "loss": 0.2902, + "num_input_tokens_seen": 48468464, + "step": 22410 + }, + { + "epoch": 3.656606851549755, + "grad_norm": 0.04025622457265854, + "learning_rate": 0.0009792498566059255, + "loss": 0.1454, + "num_input_tokens_seen": 48478864, + "step": 22415 + }, + { + "epoch": 3.6574225122349104, + "grad_norm": 0.04909409210085869, + "learning_rate": 0.0009792295588454106, + "loss": 0.0857, + "num_input_tokens_seen": 48489456, + "step": 22420 + }, + { + "epoch": 3.6582381729200653, + "grad_norm": 0.2315702885389328, + "learning_rate": 0.0009792092513727006, + "loss": 0.1617, + "num_input_tokens_seen": 48500496, + "step": 22425 + }, + { + "epoch": 3.65905383360522, + "grad_norm": 0.059401609003543854, + "learning_rate": 0.0009791889341882075, + "loss": 0.0676, + "num_input_tokens_seen": 48511888, + "step": 22430 + }, + { + "epoch": 3.6598694942903753, + "grad_norm": 0.1892523616552353, + "learning_rate": 0.0009791686072923424, + "loss": 0.1063, + "num_input_tokens_seen": 48522160, + "step": 22435 + }, + { + "epoch": 3.66068515497553, + "grad_norm": 0.006050454918295145, + "learning_rate": 0.0009791482706855178, + "loss": 0.0272, + "num_input_tokens_seen": 48532272, + "step": 22440 + }, + { + "epoch": 3.661500815660685, + "grad_norm": 0.0315130352973938, + "learning_rate": 0.0009791279243681456, + "loss": 0.1565, + "num_input_tokens_seen": 48543120, + "step": 22445 + }, + { + "epoch": 3.6623164763458402, + "grad_norm": 0.015598422847688198, + "learning_rate": 0.0009791075683406383, + "loss": 0.0948, + "num_input_tokens_seen": 48553648, + "step": 22450 + }, + { + "epoch": 3.663132137030995, + "grad_norm": 0.08075518906116486, + "learning_rate": 0.0009790872026034082, + "loss": 0.1476, + "num_input_tokens_seen": 48564240, + "step": 22455 + }, + { + "epoch": 3.6639477977161503, + "grad_norm": 0.035409845411777496, + "learning_rate": 0.0009790668271568684, + "loss": 0.0687, + "num_input_tokens_seen": 48574320, + "step": 22460 + }, + { + "epoch": 3.664763458401305, + "grad_norm": 0.25173941254615784, + "learning_rate": 0.0009790464420014312, + "loss": 0.0941, + "num_input_tokens_seen": 48584784, + "step": 22465 + }, + { + "epoch": 3.66557911908646, + "grad_norm": 0.056600235402584076, + "learning_rate": 0.0009790260471375105, + "loss": 0.0999, + "num_input_tokens_seen": 48595440, + "step": 22470 + }, + { + "epoch": 3.6663947797716148, + "grad_norm": 0.12601543962955475, + "learning_rate": 0.0009790056425655193, + "loss": 0.0929, + "num_input_tokens_seen": 48607920, + "step": 22475 + }, + { + "epoch": 3.66721044045677, + "grad_norm": 0.044781044125556946, + "learning_rate": 0.0009789852282858708, + "loss": 0.1691, + "num_input_tokens_seen": 48619696, + "step": 22480 + }, + { + "epoch": 3.668026101141925, + "grad_norm": 0.05876341462135315, + "learning_rate": 0.0009789648042989793, + "loss": 0.1129, + "num_input_tokens_seen": 48630544, + "step": 22485 + }, + { + "epoch": 3.66884176182708, + "grad_norm": 0.007882521487772465, + "learning_rate": 0.0009789443706052583, + "loss": 0.1321, + "num_input_tokens_seen": 48641072, + "step": 22490 + }, + { + "epoch": 3.669657422512235, + "grad_norm": 0.010897364467382431, + "learning_rate": 0.000978923927205122, + "loss": 0.0425, + "num_input_tokens_seen": 48651792, + "step": 22495 + }, + { + "epoch": 3.6704730831973897, + "grad_norm": 0.04307050630450249, + "learning_rate": 0.0009789034740989848, + "loss": 0.1143, + "num_input_tokens_seen": 48661296, + "step": 22500 + }, + { + "epoch": 3.671288743882545, + "grad_norm": 0.4440848231315613, + "learning_rate": 0.0009788830112872611, + "loss": 0.099, + "num_input_tokens_seen": 48671312, + "step": 22505 + }, + { + "epoch": 3.6721044045677, + "grad_norm": 0.22522272169589996, + "learning_rate": 0.0009788625387703658, + "loss": 0.1167, + "num_input_tokens_seen": 48681680, + "step": 22510 + }, + { + "epoch": 3.672920065252855, + "grad_norm": 0.23165500164031982, + "learning_rate": 0.0009788420565487136, + "loss": 0.2092, + "num_input_tokens_seen": 48692912, + "step": 22515 + }, + { + "epoch": 3.67373572593801, + "grad_norm": 0.07002677768468857, + "learning_rate": 0.0009788215646227196, + "loss": 0.2015, + "num_input_tokens_seen": 48704912, + "step": 22520 + }, + { + "epoch": 3.6745513866231647, + "grad_norm": 0.09665162861347198, + "learning_rate": 0.0009788010629927992, + "loss": 0.0611, + "num_input_tokens_seen": 48713232, + "step": 22525 + }, + { + "epoch": 3.6753670473083195, + "grad_norm": 0.038766391575336456, + "learning_rate": 0.000978780551659368, + "loss": 0.0261, + "num_input_tokens_seen": 48724976, + "step": 22530 + }, + { + "epoch": 3.676182707993475, + "grad_norm": 0.05711549147963524, + "learning_rate": 0.0009787600306228415, + "loss": 0.2448, + "num_input_tokens_seen": 48736624, + "step": 22535 + }, + { + "epoch": 3.6769983686786296, + "grad_norm": 0.09710178524255753, + "learning_rate": 0.0009787394998836355, + "loss": 0.0471, + "num_input_tokens_seen": 48747696, + "step": 22540 + }, + { + "epoch": 3.677814029363785, + "grad_norm": 0.2320680022239685, + "learning_rate": 0.0009787189594421663, + "loss": 0.117, + "num_input_tokens_seen": 48759632, + "step": 22545 + }, + { + "epoch": 3.6786296900489397, + "grad_norm": 0.08304693549871445, + "learning_rate": 0.00097869840929885, + "loss": 0.1196, + "num_input_tokens_seen": 48771152, + "step": 22550 + }, + { + "epoch": 3.6794453507340945, + "grad_norm": 0.13316458463668823, + "learning_rate": 0.0009786778494541033, + "loss": 0.1519, + "num_input_tokens_seen": 48783312, + "step": 22555 + }, + { + "epoch": 3.6802610114192493, + "grad_norm": 0.009458227083086967, + "learning_rate": 0.0009786572799083426, + "loss": 0.0734, + "num_input_tokens_seen": 48795728, + "step": 22560 + }, + { + "epoch": 3.6810766721044046, + "grad_norm": 0.1085597574710846, + "learning_rate": 0.000978636700661985, + "loss": 0.1794, + "num_input_tokens_seen": 48807152, + "step": 22565 + }, + { + "epoch": 3.6818923327895594, + "grad_norm": 0.11425057798624039, + "learning_rate": 0.0009786161117154475, + "loss": 0.1324, + "num_input_tokens_seen": 48817072, + "step": 22570 + }, + { + "epoch": 3.6827079934747147, + "grad_norm": 0.011296875774860382, + "learning_rate": 0.0009785955130691471, + "loss": 0.0387, + "num_input_tokens_seen": 48828080, + "step": 22575 + }, + { + "epoch": 3.6835236541598695, + "grad_norm": 0.027069205418229103, + "learning_rate": 0.0009785749047235017, + "loss": 0.0839, + "num_input_tokens_seen": 48838192, + "step": 22580 + }, + { + "epoch": 3.6843393148450243, + "grad_norm": 0.043377745896577835, + "learning_rate": 0.0009785542866789288, + "loss": 0.04, + "num_input_tokens_seen": 48849904, + "step": 22585 + }, + { + "epoch": 3.6851549755301796, + "grad_norm": 0.017865043133497238, + "learning_rate": 0.000978533658935846, + "loss": 0.3016, + "num_input_tokens_seen": 48861200, + "step": 22590 + }, + { + "epoch": 3.6859706362153344, + "grad_norm": 0.11259305477142334, + "learning_rate": 0.0009785130214946716, + "loss": 0.09, + "num_input_tokens_seen": 48871760, + "step": 22595 + }, + { + "epoch": 3.6867862969004896, + "grad_norm": 0.023715078830718994, + "learning_rate": 0.0009784923743558238, + "loss": 0.0522, + "num_input_tokens_seen": 48882128, + "step": 22600 + }, + { + "epoch": 3.6876019575856445, + "grad_norm": 0.021904323250055313, + "learning_rate": 0.000978471717519721, + "loss": 0.0444, + "num_input_tokens_seen": 48891824, + "step": 22605 + }, + { + "epoch": 3.6884176182707993, + "grad_norm": 0.13719536364078522, + "learning_rate": 0.0009784510509867818, + "loss": 0.0905, + "num_input_tokens_seen": 48903888, + "step": 22610 + }, + { + "epoch": 3.689233278955954, + "grad_norm": 0.08701768517494202, + "learning_rate": 0.0009784303747574254, + "loss": 0.0619, + "num_input_tokens_seen": 48914480, + "step": 22615 + }, + { + "epoch": 3.6900489396411094, + "grad_norm": 0.13661521673202515, + "learning_rate": 0.0009784096888320703, + "loss": 0.1242, + "num_input_tokens_seen": 48925680, + "step": 22620 + }, + { + "epoch": 3.690864600326264, + "grad_norm": 0.009991122409701347, + "learning_rate": 0.000978388993211136, + "loss": 0.0959, + "num_input_tokens_seen": 48937072, + "step": 22625 + }, + { + "epoch": 3.6916802610114194, + "grad_norm": 0.08614683896303177, + "learning_rate": 0.0009783682878950416, + "loss": 0.0987, + "num_input_tokens_seen": 48948240, + "step": 22630 + }, + { + "epoch": 3.6924959216965743, + "grad_norm": 0.10593032091856003, + "learning_rate": 0.0009783475728842074, + "loss": 0.1475, + "num_input_tokens_seen": 48958320, + "step": 22635 + }, + { + "epoch": 3.693311582381729, + "grad_norm": 0.20387686789035797, + "learning_rate": 0.0009783268481790527, + "loss": 0.1945, + "num_input_tokens_seen": 48969840, + "step": 22640 + }, + { + "epoch": 3.6941272430668843, + "grad_norm": 0.059103433042764664, + "learning_rate": 0.0009783061137799975, + "loss": 0.124, + "num_input_tokens_seen": 48981744, + "step": 22645 + }, + { + "epoch": 3.694942903752039, + "grad_norm": 0.08534158766269684, + "learning_rate": 0.000978285369687462, + "loss": 0.1068, + "num_input_tokens_seen": 48992368, + "step": 22650 + }, + { + "epoch": 3.695758564437194, + "grad_norm": 0.025781484320759773, + "learning_rate": 0.000978264615901867, + "loss": 0.0279, + "num_input_tokens_seen": 49003120, + "step": 22655 + }, + { + "epoch": 3.6965742251223492, + "grad_norm": 0.19997680187225342, + "learning_rate": 0.0009782438524236327, + "loss": 0.1823, + "num_input_tokens_seen": 49014256, + "step": 22660 + }, + { + "epoch": 3.697389885807504, + "grad_norm": 0.03971175104379654, + "learning_rate": 0.00097822307925318, + "loss": 0.1205, + "num_input_tokens_seen": 49025232, + "step": 22665 + }, + { + "epoch": 3.698205546492659, + "grad_norm": 0.32710427045822144, + "learning_rate": 0.00097820229639093, + "loss": 0.1444, + "num_input_tokens_seen": 49036176, + "step": 22670 + }, + { + "epoch": 3.699021207177814, + "grad_norm": 0.1855972707271576, + "learning_rate": 0.0009781815038373042, + "loss": 0.1353, + "num_input_tokens_seen": 49046832, + "step": 22675 + }, + { + "epoch": 3.699836867862969, + "grad_norm": 0.028964513912796974, + "learning_rate": 0.000978160701592723, + "loss": 0.1412, + "num_input_tokens_seen": 49057264, + "step": 22680 + }, + { + "epoch": 3.700652528548124, + "grad_norm": 0.04488237202167511, + "learning_rate": 0.000978139889657609, + "loss": 0.0243, + "num_input_tokens_seen": 49068592, + "step": 22685 + }, + { + "epoch": 3.701468189233279, + "grad_norm": 0.00563571834936738, + "learning_rate": 0.0009781190680323833, + "loss": 0.1421, + "num_input_tokens_seen": 49078640, + "step": 22690 + }, + { + "epoch": 3.702283849918434, + "grad_norm": 0.10360507667064667, + "learning_rate": 0.0009780982367174683, + "loss": 0.0797, + "num_input_tokens_seen": 49089616, + "step": 22695 + }, + { + "epoch": 3.7030995106035887, + "grad_norm": 0.1571696400642395, + "learning_rate": 0.000978077395713286, + "loss": 0.1266, + "num_input_tokens_seen": 49099312, + "step": 22700 + }, + { + "epoch": 3.703915171288744, + "grad_norm": 0.04751509800553322, + "learning_rate": 0.0009780565450202587, + "loss": 0.1661, + "num_input_tokens_seen": 49111056, + "step": 22705 + }, + { + "epoch": 3.7047308319738987, + "grad_norm": 0.2686530351638794, + "learning_rate": 0.0009780356846388091, + "loss": 0.1756, + "num_input_tokens_seen": 49122352, + "step": 22710 + }, + { + "epoch": 3.705546492659054, + "grad_norm": 0.11259738355875015, + "learning_rate": 0.00097801481456936, + "loss": 0.0504, + "num_input_tokens_seen": 49133136, + "step": 22715 + }, + { + "epoch": 3.706362153344209, + "grad_norm": 0.1789553016424179, + "learning_rate": 0.0009779939348123342, + "loss": 0.0762, + "num_input_tokens_seen": 49143952, + "step": 22720 + }, + { + "epoch": 3.7071778140293636, + "grad_norm": 0.01377193909138441, + "learning_rate": 0.000977973045368155, + "loss": 0.1501, + "num_input_tokens_seen": 49154864, + "step": 22725 + }, + { + "epoch": 3.707993474714519, + "grad_norm": 0.1669149249792099, + "learning_rate": 0.0009779521462372457, + "loss": 0.1535, + "num_input_tokens_seen": 49166192, + "step": 22730 + }, + { + "epoch": 3.7088091353996737, + "grad_norm": 0.010271217674016953, + "learning_rate": 0.0009779312374200298, + "loss": 0.0722, + "num_input_tokens_seen": 49177136, + "step": 22735 + }, + { + "epoch": 3.709624796084829, + "grad_norm": 0.11398719996213913, + "learning_rate": 0.0009779103189169309, + "loss": 0.0827, + "num_input_tokens_seen": 49187440, + "step": 22740 + }, + { + "epoch": 3.710440456769984, + "grad_norm": 0.07981985062360764, + "learning_rate": 0.0009778893907283733, + "loss": 0.1428, + "num_input_tokens_seen": 49199056, + "step": 22745 + }, + { + "epoch": 3.7112561174551386, + "grad_norm": 0.25845813751220703, + "learning_rate": 0.000977868452854781, + "loss": 0.1377, + "num_input_tokens_seen": 49208720, + "step": 22750 + }, + { + "epoch": 3.7120717781402934, + "grad_norm": 0.15199466049671173, + "learning_rate": 0.000977847505296578, + "loss": 0.0882, + "num_input_tokens_seen": 49219056, + "step": 22755 + }, + { + "epoch": 3.7128874388254487, + "grad_norm": 0.14639122784137726, + "learning_rate": 0.0009778265480541895, + "loss": 0.1336, + "num_input_tokens_seen": 49229232, + "step": 22760 + }, + { + "epoch": 3.7137030995106035, + "grad_norm": 0.011554457247257233, + "learning_rate": 0.0009778055811280396, + "loss": 0.0255, + "num_input_tokens_seen": 49241456, + "step": 22765 + }, + { + "epoch": 3.7145187601957588, + "grad_norm": 0.009976423345506191, + "learning_rate": 0.0009777846045185535, + "loss": 0.1987, + "num_input_tokens_seen": 49252144, + "step": 22770 + }, + { + "epoch": 3.7153344208809136, + "grad_norm": 0.07433804869651794, + "learning_rate": 0.0009777636182261562, + "loss": 0.0468, + "num_input_tokens_seen": 49262416, + "step": 22775 + }, + { + "epoch": 3.7161500815660684, + "grad_norm": 0.02024008147418499, + "learning_rate": 0.0009777426222512733, + "loss": 0.0305, + "num_input_tokens_seen": 49274224, + "step": 22780 + }, + { + "epoch": 3.7169657422512232, + "grad_norm": 0.09279941767454147, + "learning_rate": 0.0009777216165943298, + "loss": 0.1684, + "num_input_tokens_seen": 49284656, + "step": 22785 + }, + { + "epoch": 3.7177814029363785, + "grad_norm": 0.07832145690917969, + "learning_rate": 0.0009777006012557522, + "loss": 0.1519, + "num_input_tokens_seen": 49294896, + "step": 22790 + }, + { + "epoch": 3.7185970636215333, + "grad_norm": 0.06239181011915207, + "learning_rate": 0.0009776795762359654, + "loss": 0.164, + "num_input_tokens_seen": 49305200, + "step": 22795 + }, + { + "epoch": 3.7194127243066886, + "grad_norm": 0.05114758387207985, + "learning_rate": 0.0009776585415353963, + "loss": 0.0504, + "num_input_tokens_seen": 49316080, + "step": 22800 + }, + { + "epoch": 3.7202283849918434, + "grad_norm": 0.017731616273522377, + "learning_rate": 0.0009776374971544708, + "loss": 0.0382, + "num_input_tokens_seen": 49327024, + "step": 22805 + }, + { + "epoch": 3.721044045676998, + "grad_norm": 0.06036202982068062, + "learning_rate": 0.0009776164430936153, + "loss": 0.0778, + "num_input_tokens_seen": 49338224, + "step": 22810 + }, + { + "epoch": 3.7218597063621535, + "grad_norm": 0.013572810217738152, + "learning_rate": 0.000977595379353257, + "loss": 0.0984, + "num_input_tokens_seen": 49349936, + "step": 22815 + }, + { + "epoch": 3.7226753670473083, + "grad_norm": 0.015775786712765694, + "learning_rate": 0.0009775743059338223, + "loss": 0.122, + "num_input_tokens_seen": 49361200, + "step": 22820 + }, + { + "epoch": 3.7234910277324635, + "grad_norm": 0.030214644968509674, + "learning_rate": 0.0009775532228357385, + "loss": 0.3579, + "num_input_tokens_seen": 49372976, + "step": 22825 + }, + { + "epoch": 3.7243066884176184, + "grad_norm": 0.041531722992658615, + "learning_rate": 0.0009775321300594328, + "loss": 0.0512, + "num_input_tokens_seen": 49383824, + "step": 22830 + }, + { + "epoch": 3.725122349102773, + "grad_norm": 0.08106119930744171, + "learning_rate": 0.0009775110276053327, + "loss": 0.1839, + "num_input_tokens_seen": 49395504, + "step": 22835 + }, + { + "epoch": 3.725938009787928, + "grad_norm": 0.020309995859861374, + "learning_rate": 0.000977489915473866, + "loss": 0.0665, + "num_input_tokens_seen": 49404176, + "step": 22840 + }, + { + "epoch": 3.7267536704730833, + "grad_norm": 0.04128960520029068, + "learning_rate": 0.0009774687936654602, + "loss": 0.0401, + "num_input_tokens_seen": 49413776, + "step": 22845 + }, + { + "epoch": 3.727569331158238, + "grad_norm": 0.024078309535980225, + "learning_rate": 0.0009774476621805437, + "loss": 0.1105, + "num_input_tokens_seen": 49424848, + "step": 22850 + }, + { + "epoch": 3.7283849918433933, + "grad_norm": 0.0830194428563118, + "learning_rate": 0.0009774265210195446, + "loss": 0.0731, + "num_input_tokens_seen": 49436880, + "step": 22855 + }, + { + "epoch": 3.729200652528548, + "grad_norm": 0.16092805564403534, + "learning_rate": 0.0009774053701828913, + "loss": 0.2289, + "num_input_tokens_seen": 49448048, + "step": 22860 + }, + { + "epoch": 3.730016313213703, + "grad_norm": 0.20131917297840118, + "learning_rate": 0.0009773842096710127, + "loss": 0.1552, + "num_input_tokens_seen": 49458544, + "step": 22865 + }, + { + "epoch": 3.7308319738988582, + "grad_norm": 0.21859797835350037, + "learning_rate": 0.0009773630394843374, + "loss": 0.1336, + "num_input_tokens_seen": 49467344, + "step": 22870 + }, + { + "epoch": 3.731647634584013, + "grad_norm": 0.18565644323825836, + "learning_rate": 0.0009773418596232945, + "loss": 0.1809, + "num_input_tokens_seen": 49478064, + "step": 22875 + }, + { + "epoch": 3.732463295269168, + "grad_norm": 0.051977213472127914, + "learning_rate": 0.0009773206700883135, + "loss": 0.1254, + "num_input_tokens_seen": 49488464, + "step": 22880 + }, + { + "epoch": 3.733278955954323, + "grad_norm": 0.027007022872567177, + "learning_rate": 0.0009772994708798232, + "loss": 0.1159, + "num_input_tokens_seen": 49501264, + "step": 22885 + }, + { + "epoch": 3.734094616639478, + "grad_norm": 0.19858905673027039, + "learning_rate": 0.000977278261998254, + "loss": 0.1456, + "num_input_tokens_seen": 49511408, + "step": 22890 + }, + { + "epoch": 3.7349102773246328, + "grad_norm": 0.09167278558015823, + "learning_rate": 0.0009772570434440353, + "loss": 0.1164, + "num_input_tokens_seen": 49522896, + "step": 22895 + }, + { + "epoch": 3.735725938009788, + "grad_norm": 0.03391212224960327, + "learning_rate": 0.000977235815217597, + "loss": 0.0852, + "num_input_tokens_seen": 49535120, + "step": 22900 + }, + { + "epoch": 3.736541598694943, + "grad_norm": 0.0225540641695261, + "learning_rate": 0.0009772145773193695, + "loss": 0.1699, + "num_input_tokens_seen": 49546448, + "step": 22905 + }, + { + "epoch": 3.737357259380098, + "grad_norm": 0.06987614184617996, + "learning_rate": 0.0009771933297497831, + "loss": 0.0708, + "num_input_tokens_seen": 49557488, + "step": 22910 + }, + { + "epoch": 3.738172920065253, + "grad_norm": 0.04545498266816139, + "learning_rate": 0.0009771720725092687, + "loss": 0.0383, + "num_input_tokens_seen": 49567088, + "step": 22915 + }, + { + "epoch": 3.7389885807504077, + "grad_norm": 0.015980003401637077, + "learning_rate": 0.000977150805598257, + "loss": 0.2011, + "num_input_tokens_seen": 49578544, + "step": 22920 + }, + { + "epoch": 3.7398042414355626, + "grad_norm": 0.12308619171380997, + "learning_rate": 0.0009771295290171788, + "loss": 0.1587, + "num_input_tokens_seen": 49590000, + "step": 22925 + }, + { + "epoch": 3.740619902120718, + "grad_norm": 0.022953316569328308, + "learning_rate": 0.0009771082427664655, + "loss": 0.3003, + "num_input_tokens_seen": 49601328, + "step": 22930 + }, + { + "epoch": 3.7414355628058726, + "grad_norm": 0.020980946719646454, + "learning_rate": 0.0009770869468465483, + "loss": 0.066, + "num_input_tokens_seen": 49613264, + "step": 22935 + }, + { + "epoch": 3.742251223491028, + "grad_norm": 0.08716975897550583, + "learning_rate": 0.000977065641257859, + "loss": 0.117, + "num_input_tokens_seen": 49622352, + "step": 22940 + }, + { + "epoch": 3.7430668841761827, + "grad_norm": 0.2300471067428589, + "learning_rate": 0.000977044326000829, + "loss": 0.3114, + "num_input_tokens_seen": 49633264, + "step": 22945 + }, + { + "epoch": 3.7438825448613375, + "grad_norm": 0.08867449313402176, + "learning_rate": 0.0009770230010758907, + "loss": 0.0752, + "num_input_tokens_seen": 49644432, + "step": 22950 + }, + { + "epoch": 3.744698205546493, + "grad_norm": 0.19246956706047058, + "learning_rate": 0.0009770016664834762, + "loss": 0.1718, + "num_input_tokens_seen": 49656432, + "step": 22955 + }, + { + "epoch": 3.7455138662316476, + "grad_norm": 0.021392393857240677, + "learning_rate": 0.000976980322224018, + "loss": 0.0801, + "num_input_tokens_seen": 49667600, + "step": 22960 + }, + { + "epoch": 3.746329526916803, + "grad_norm": 0.22876377403736115, + "learning_rate": 0.0009769589682979481, + "loss": 0.0842, + "num_input_tokens_seen": 49678576, + "step": 22965 + }, + { + "epoch": 3.7471451876019577, + "grad_norm": 0.12781397998332977, + "learning_rate": 0.0009769376047056998, + "loss": 0.1321, + "num_input_tokens_seen": 49689296, + "step": 22970 + }, + { + "epoch": 3.7479608482871125, + "grad_norm": 0.17140617966651917, + "learning_rate": 0.0009769162314477058, + "loss": 0.2144, + "num_input_tokens_seen": 49700208, + "step": 22975 + }, + { + "epoch": 3.7487765089722673, + "grad_norm": 0.013391259126365185, + "learning_rate": 0.0009768948485243997, + "loss": 0.0628, + "num_input_tokens_seen": 49711696, + "step": 22980 + }, + { + "epoch": 3.7495921696574226, + "grad_norm": 0.04102994501590729, + "learning_rate": 0.0009768734559362142, + "loss": 0.1631, + "num_input_tokens_seen": 49721904, + "step": 22985 + }, + { + "epoch": 3.7504078303425774, + "grad_norm": 0.08313404023647308, + "learning_rate": 0.0009768520536835832, + "loss": 0.2573, + "num_input_tokens_seen": 49733328, + "step": 22990 + }, + { + "epoch": 3.7512234910277327, + "grad_norm": 0.21866433322429657, + "learning_rate": 0.0009768306417669405, + "loss": 0.1553, + "num_input_tokens_seen": 49744560, + "step": 22995 + }, + { + "epoch": 3.7520391517128875, + "grad_norm": 0.027432316914200783, + "learning_rate": 0.00097680922018672, + "loss": 0.1966, + "num_input_tokens_seen": 49756944, + "step": 23000 + }, + { + "epoch": 3.7528548123980423, + "grad_norm": 0.03837420046329498, + "learning_rate": 0.0009767877889433555, + "loss": 0.0872, + "num_input_tokens_seen": 49765776, + "step": 23005 + }, + { + "epoch": 3.753670473083197, + "grad_norm": 0.09529531747102737, + "learning_rate": 0.0009767663480372817, + "loss": 0.0804, + "num_input_tokens_seen": 49776400, + "step": 23010 + }, + { + "epoch": 3.7544861337683524, + "grad_norm": 0.10651466995477676, + "learning_rate": 0.0009767448974689332, + "loss": 0.0706, + "num_input_tokens_seen": 49787408, + "step": 23015 + }, + { + "epoch": 3.755301794453507, + "grad_norm": 0.02113129198551178, + "learning_rate": 0.0009767234372387444, + "loss": 0.1721, + "num_input_tokens_seen": 49797456, + "step": 23020 + }, + { + "epoch": 3.7561174551386625, + "grad_norm": 0.14945995807647705, + "learning_rate": 0.0009767019673471505, + "loss": 0.1206, + "num_input_tokens_seen": 49809328, + "step": 23025 + }, + { + "epoch": 3.7569331158238173, + "grad_norm": 0.021181615069508553, + "learning_rate": 0.0009766804877945864, + "loss": 0.0364, + "num_input_tokens_seen": 49819664, + "step": 23030 + }, + { + "epoch": 3.757748776508972, + "grad_norm": 0.07378098368644714, + "learning_rate": 0.0009766589985814875, + "loss": 0.1234, + "num_input_tokens_seen": 49831088, + "step": 23035 + }, + { + "epoch": 3.7585644371941274, + "grad_norm": 0.011933263391256332, + "learning_rate": 0.0009766374997082893, + "loss": 0.138, + "num_input_tokens_seen": 49842864, + "step": 23040 + }, + { + "epoch": 3.759380097879282, + "grad_norm": 0.07167614996433258, + "learning_rate": 0.0009766159911754277, + "loss": 0.1442, + "num_input_tokens_seen": 49852848, + "step": 23045 + }, + { + "epoch": 3.7601957585644374, + "grad_norm": 0.012339459732174873, + "learning_rate": 0.0009765944729833382, + "loss": 0.0924, + "num_input_tokens_seen": 49863088, + "step": 23050 + }, + { + "epoch": 3.7610114192495923, + "grad_norm": 0.1635027825832367, + "learning_rate": 0.0009765729451324573, + "loss": 0.1175, + "num_input_tokens_seen": 49873840, + "step": 23055 + }, + { + "epoch": 3.761827079934747, + "grad_norm": 0.1554376184940338, + "learning_rate": 0.000976551407623221, + "loss": 0.1576, + "num_input_tokens_seen": 49883888, + "step": 23060 + }, + { + "epoch": 3.762642740619902, + "grad_norm": 0.011004294268786907, + "learning_rate": 0.0009765298604560657, + "loss": 0.1131, + "num_input_tokens_seen": 49894448, + "step": 23065 + }, + { + "epoch": 3.763458401305057, + "grad_norm": 0.014040950685739517, + "learning_rate": 0.0009765083036314284, + "loss": 0.1647, + "num_input_tokens_seen": 49904400, + "step": 23070 + }, + { + "epoch": 3.764274061990212, + "grad_norm": 0.017209839075803757, + "learning_rate": 0.0009764867371497459, + "loss": 0.0778, + "num_input_tokens_seen": 49915184, + "step": 23075 + }, + { + "epoch": 3.7650897226753672, + "grad_norm": 0.24092888832092285, + "learning_rate": 0.000976465161011455, + "loss": 0.0861, + "num_input_tokens_seen": 49926128, + "step": 23080 + }, + { + "epoch": 3.765905383360522, + "grad_norm": 0.10889220982789993, + "learning_rate": 0.0009764435752169933, + "loss": 0.0448, + "num_input_tokens_seen": 49936432, + "step": 23085 + }, + { + "epoch": 3.766721044045677, + "grad_norm": 0.0978965014219284, + "learning_rate": 0.0009764219797667982, + "loss": 0.0422, + "num_input_tokens_seen": 49947760, + "step": 23090 + }, + { + "epoch": 3.767536704730832, + "grad_norm": 0.3926611542701721, + "learning_rate": 0.0009764003746613073, + "loss": 0.0791, + "num_input_tokens_seen": 49958704, + "step": 23095 + }, + { + "epoch": 3.768352365415987, + "grad_norm": 0.2822864055633545, + "learning_rate": 0.0009763787599009583, + "loss": 0.1521, + "num_input_tokens_seen": 49970064, + "step": 23100 + }, + { + "epoch": 3.7691680261011418, + "grad_norm": 0.014437035657465458, + "learning_rate": 0.0009763571354861895, + "loss": 0.0534, + "num_input_tokens_seen": 49980976, + "step": 23105 + }, + { + "epoch": 3.769983686786297, + "grad_norm": 0.17819297313690186, + "learning_rate": 0.0009763355014174391, + "loss": 0.2183, + "num_input_tokens_seen": 49991024, + "step": 23110 + }, + { + "epoch": 3.770799347471452, + "grad_norm": 0.016189444810152054, + "learning_rate": 0.0009763138576951454, + "loss": 0.0935, + "num_input_tokens_seen": 50001680, + "step": 23115 + }, + { + "epoch": 3.7716150081566067, + "grad_norm": 0.03998810052871704, + "learning_rate": 0.0009762922043197471, + "loss": 0.0357, + "num_input_tokens_seen": 50012112, + "step": 23120 + }, + { + "epoch": 3.772430668841762, + "grad_norm": 0.02609841711819172, + "learning_rate": 0.0009762705412916831, + "loss": 0.0961, + "num_input_tokens_seen": 50022768, + "step": 23125 + }, + { + "epoch": 3.7732463295269167, + "grad_norm": 0.006209354382008314, + "learning_rate": 0.0009762488686113924, + "loss": 0.0296, + "num_input_tokens_seen": 50034320, + "step": 23130 + }, + { + "epoch": 3.774061990212072, + "grad_norm": 0.04003912955522537, + "learning_rate": 0.0009762271862793143, + "loss": 0.0711, + "num_input_tokens_seen": 50044304, + "step": 23135 + }, + { + "epoch": 3.774877650897227, + "grad_norm": 0.021071631461381912, + "learning_rate": 0.000976205494295888, + "loss": 0.1904, + "num_input_tokens_seen": 50052976, + "step": 23140 + }, + { + "epoch": 3.7756933115823816, + "grad_norm": 0.07991696894168854, + "learning_rate": 0.0009761837926615533, + "loss": 0.0411, + "num_input_tokens_seen": 50064624, + "step": 23145 + }, + { + "epoch": 3.7765089722675365, + "grad_norm": 0.0656970888376236, + "learning_rate": 0.00097616208137675, + "loss": 0.1568, + "num_input_tokens_seen": 50075696, + "step": 23150 + }, + { + "epoch": 3.7773246329526917, + "grad_norm": 0.22373802959918976, + "learning_rate": 0.000976140360441918, + "loss": 0.1426, + "num_input_tokens_seen": 50086096, + "step": 23155 + }, + { + "epoch": 3.7781402936378465, + "grad_norm": 0.034465014934539795, + "learning_rate": 0.0009761186298574975, + "loss": 0.1237, + "num_input_tokens_seen": 50097520, + "step": 23160 + }, + { + "epoch": 3.778955954323002, + "grad_norm": 0.041891466826200485, + "learning_rate": 0.0009760968896239291, + "loss": 0.0389, + "num_input_tokens_seen": 50108880, + "step": 23165 + }, + { + "epoch": 3.7797716150081566, + "grad_norm": 0.06839856505393982, + "learning_rate": 0.0009760751397416532, + "loss": 0.089, + "num_input_tokens_seen": 50119504, + "step": 23170 + }, + { + "epoch": 3.7805872756933114, + "grad_norm": 0.014309341087937355, + "learning_rate": 0.0009760533802111107, + "loss": 0.0488, + "num_input_tokens_seen": 50130640, + "step": 23175 + }, + { + "epoch": 3.7814029363784667, + "grad_norm": 0.15914183855056763, + "learning_rate": 0.0009760316110327426, + "loss": 0.0888, + "num_input_tokens_seen": 50141616, + "step": 23180 + }, + { + "epoch": 3.7822185970636215, + "grad_norm": 0.3128720223903656, + "learning_rate": 0.00097600983220699, + "loss": 0.1164, + "num_input_tokens_seen": 50152336, + "step": 23185 + }, + { + "epoch": 3.7830342577487768, + "grad_norm": 0.09936369210481644, + "learning_rate": 0.0009759880437342941, + "loss": 0.0577, + "num_input_tokens_seen": 50163088, + "step": 23190 + }, + { + "epoch": 3.7838499184339316, + "grad_norm": 0.2154925912618637, + "learning_rate": 0.0009759662456150967, + "loss": 0.1918, + "num_input_tokens_seen": 50173648, + "step": 23195 + }, + { + "epoch": 3.7846655791190864, + "grad_norm": 0.01144491694867611, + "learning_rate": 0.0009759444378498397, + "loss": 0.0797, + "num_input_tokens_seen": 50184816, + "step": 23200 + }, + { + "epoch": 3.7854812398042412, + "grad_norm": 0.014319537207484245, + "learning_rate": 0.0009759226204389646, + "loss": 0.0992, + "num_input_tokens_seen": 50196560, + "step": 23205 + }, + { + "epoch": 3.7862969004893965, + "grad_norm": 0.025292597711086273, + "learning_rate": 0.0009759007933829141, + "loss": 0.0594, + "num_input_tokens_seen": 50207824, + "step": 23210 + }, + { + "epoch": 3.7871125611745513, + "grad_norm": 0.08496753871440887, + "learning_rate": 0.0009758789566821302, + "loss": 0.1291, + "num_input_tokens_seen": 50219216, + "step": 23215 + }, + { + "epoch": 3.7879282218597066, + "grad_norm": 0.012534767389297485, + "learning_rate": 0.0009758571103370556, + "loss": 0.0247, + "num_input_tokens_seen": 50229648, + "step": 23220 + }, + { + "epoch": 3.7887438825448614, + "grad_norm": 0.048558078706264496, + "learning_rate": 0.000975835254348133, + "loss": 0.0374, + "num_input_tokens_seen": 50240368, + "step": 23225 + }, + { + "epoch": 3.789559543230016, + "grad_norm": 0.038768526166677475, + "learning_rate": 0.0009758133887158053, + "loss": 0.1518, + "num_input_tokens_seen": 50251312, + "step": 23230 + }, + { + "epoch": 3.790375203915171, + "grad_norm": 0.011934944428503513, + "learning_rate": 0.0009757915134405155, + "loss": 0.0362, + "num_input_tokens_seen": 50262320, + "step": 23235 + }, + { + "epoch": 3.7911908646003263, + "grad_norm": 0.02086496911942959, + "learning_rate": 0.0009757696285227073, + "loss": 0.1667, + "num_input_tokens_seen": 50272080, + "step": 23240 + }, + { + "epoch": 3.792006525285481, + "grad_norm": 0.01079615205526352, + "learning_rate": 0.000975747733962824, + "loss": 0.0395, + "num_input_tokens_seen": 50283056, + "step": 23245 + }, + { + "epoch": 3.7928221859706364, + "grad_norm": 0.005180465057492256, + "learning_rate": 0.0009757258297613095, + "loss": 0.2372, + "num_input_tokens_seen": 50294800, + "step": 23250 + }, + { + "epoch": 3.793637846655791, + "grad_norm": 0.11687880754470825, + "learning_rate": 0.0009757039159186072, + "loss": 0.1172, + "num_input_tokens_seen": 50306512, + "step": 23255 + }, + { + "epoch": 3.794453507340946, + "grad_norm": 0.12655872106552124, + "learning_rate": 0.0009756819924351618, + "loss": 0.1006, + "num_input_tokens_seen": 50316912, + "step": 23260 + }, + { + "epoch": 3.7952691680261013, + "grad_norm": 0.20242717862129211, + "learning_rate": 0.0009756600593114174, + "loss": 0.0794, + "num_input_tokens_seen": 50328496, + "step": 23265 + }, + { + "epoch": 3.796084828711256, + "grad_norm": 0.01801212504506111, + "learning_rate": 0.0009756381165478183, + "loss": 0.2224, + "num_input_tokens_seen": 50338128, + "step": 23270 + }, + { + "epoch": 3.7969004893964113, + "grad_norm": 0.08773397654294968, + "learning_rate": 0.0009756161641448095, + "loss": 0.1753, + "num_input_tokens_seen": 50349328, + "step": 23275 + }, + { + "epoch": 3.797716150081566, + "grad_norm": 0.18493768572807312, + "learning_rate": 0.0009755942021028356, + "loss": 0.2975, + "num_input_tokens_seen": 50361232, + "step": 23280 + }, + { + "epoch": 3.798531810766721, + "grad_norm": 0.2676745355129242, + "learning_rate": 0.0009755722304223422, + "loss": 0.2993, + "num_input_tokens_seen": 50371632, + "step": 23285 + }, + { + "epoch": 3.799347471451876, + "grad_norm": 0.08383552730083466, + "learning_rate": 0.000975550249103774, + "loss": 0.0929, + "num_input_tokens_seen": 50381616, + "step": 23290 + }, + { + "epoch": 3.800163132137031, + "grad_norm": 0.03349275887012482, + "learning_rate": 0.0009755282581475768, + "loss": 0.1189, + "num_input_tokens_seen": 50392816, + "step": 23295 + }, + { + "epoch": 3.800978792822186, + "grad_norm": 0.12421400845050812, + "learning_rate": 0.0009755062575541962, + "loss": 0.1084, + "num_input_tokens_seen": 50403280, + "step": 23300 + }, + { + "epoch": 3.801794453507341, + "grad_norm": 0.075412318110466, + "learning_rate": 0.000975484247324078, + "loss": 0.0887, + "num_input_tokens_seen": 50414480, + "step": 23305 + }, + { + "epoch": 3.802610114192496, + "grad_norm": 0.16242346167564392, + "learning_rate": 0.0009754622274576684, + "loss": 0.1853, + "num_input_tokens_seen": 50424368, + "step": 23310 + }, + { + "epoch": 3.8034257748776508, + "grad_norm": 0.29253003001213074, + "learning_rate": 0.0009754401979554136, + "loss": 0.2353, + "num_input_tokens_seen": 50434608, + "step": 23315 + }, + { + "epoch": 3.804241435562806, + "grad_norm": 0.03192012012004852, + "learning_rate": 0.00097541815881776, + "loss": 0.0414, + "num_input_tokens_seen": 50445552, + "step": 23320 + }, + { + "epoch": 3.805057096247961, + "grad_norm": 0.03622366860508919, + "learning_rate": 0.0009753961100451544, + "loss": 0.1547, + "num_input_tokens_seen": 50456240, + "step": 23325 + }, + { + "epoch": 3.8058727569331157, + "grad_norm": 0.007690200116485357, + "learning_rate": 0.0009753740516380433, + "loss": 0.1217, + "num_input_tokens_seen": 50466160, + "step": 23330 + }, + { + "epoch": 3.806688417618271, + "grad_norm": 0.01823308691382408, + "learning_rate": 0.0009753519835968743, + "loss": 0.1037, + "num_input_tokens_seen": 50475024, + "step": 23335 + }, + { + "epoch": 3.8075040783034257, + "grad_norm": 0.03313617780804634, + "learning_rate": 0.0009753299059220941, + "loss": 0.0663, + "num_input_tokens_seen": 50487600, + "step": 23340 + }, + { + "epoch": 3.8083197389885806, + "grad_norm": 0.08058440685272217, + "learning_rate": 0.0009753078186141506, + "loss": 0.1118, + "num_input_tokens_seen": 50497488, + "step": 23345 + }, + { + "epoch": 3.809135399673736, + "grad_norm": 0.0680818036198616, + "learning_rate": 0.0009752857216734909, + "loss": 0.0484, + "num_input_tokens_seen": 50508336, + "step": 23350 + }, + { + "epoch": 3.8099510603588906, + "grad_norm": 0.22939538955688477, + "learning_rate": 0.0009752636151005633, + "loss": 0.1434, + "num_input_tokens_seen": 50518192, + "step": 23355 + }, + { + "epoch": 3.810766721044046, + "grad_norm": 0.02211681194603443, + "learning_rate": 0.0009752414988958156, + "loss": 0.0673, + "num_input_tokens_seen": 50529200, + "step": 23360 + }, + { + "epoch": 3.8115823817292007, + "grad_norm": 0.2066287249326706, + "learning_rate": 0.000975219373059696, + "loss": 0.2095, + "num_input_tokens_seen": 50539920, + "step": 23365 + }, + { + "epoch": 3.8123980424143555, + "grad_norm": 0.07064596563577652, + "learning_rate": 0.000975197237592653, + "loss": 0.0655, + "num_input_tokens_seen": 50550416, + "step": 23370 + }, + { + "epoch": 3.8132137030995104, + "grad_norm": 0.01532050408422947, + "learning_rate": 0.000975175092495135, + "loss": 0.1925, + "num_input_tokens_seen": 50561808, + "step": 23375 + }, + { + "epoch": 3.8140293637846656, + "grad_norm": 0.02331993728876114, + "learning_rate": 0.0009751529377675911, + "loss": 0.1121, + "num_input_tokens_seen": 50571792, + "step": 23380 + }, + { + "epoch": 3.8148450244698204, + "grad_norm": 0.166970893740654, + "learning_rate": 0.00097513077341047, + "loss": 0.1895, + "num_input_tokens_seen": 50583280, + "step": 23385 + }, + { + "epoch": 3.8156606851549757, + "grad_norm": 0.20115573704242706, + "learning_rate": 0.0009751085994242212, + "loss": 0.18, + "num_input_tokens_seen": 50593872, + "step": 23390 + }, + { + "epoch": 3.8164763458401305, + "grad_norm": 0.0661921575665474, + "learning_rate": 0.0009750864158092938, + "loss": 0.1162, + "num_input_tokens_seen": 50603600, + "step": 23395 + }, + { + "epoch": 3.8172920065252853, + "grad_norm": 0.17341428995132446, + "learning_rate": 0.0009750642225661375, + "loss": 0.1361, + "num_input_tokens_seen": 50614608, + "step": 23400 + }, + { + "epoch": 3.8181076672104406, + "grad_norm": 0.05718767270445824, + "learning_rate": 0.0009750420196952021, + "loss": 0.1265, + "num_input_tokens_seen": 50625552, + "step": 23405 + }, + { + "epoch": 3.8189233278955954, + "grad_norm": 0.014998821541666985, + "learning_rate": 0.0009750198071969376, + "loss": 0.0404, + "num_input_tokens_seen": 50636176, + "step": 23410 + }, + { + "epoch": 3.8197389885807507, + "grad_norm": 0.016761574894189835, + "learning_rate": 0.0009749975850717941, + "loss": 0.0604, + "num_input_tokens_seen": 50647408, + "step": 23415 + }, + { + "epoch": 3.8205546492659055, + "grad_norm": 0.023346178233623505, + "learning_rate": 0.0009749753533202218, + "loss": 0.1211, + "num_input_tokens_seen": 50658896, + "step": 23420 + }, + { + "epoch": 3.8213703099510603, + "grad_norm": 0.09524130076169968, + "learning_rate": 0.0009749531119426716, + "loss": 0.0637, + "num_input_tokens_seen": 50670160, + "step": 23425 + }, + { + "epoch": 3.822185970636215, + "grad_norm": 0.04625507444143295, + "learning_rate": 0.000974930860939594, + "loss": 0.0956, + "num_input_tokens_seen": 50680240, + "step": 23430 + }, + { + "epoch": 3.8230016313213704, + "grad_norm": 0.10155382752418518, + "learning_rate": 0.0009749086003114399, + "loss": 0.0425, + "num_input_tokens_seen": 50691376, + "step": 23435 + }, + { + "epoch": 3.823817292006525, + "grad_norm": 0.05376620218157768, + "learning_rate": 0.0009748863300586605, + "loss": 0.0676, + "num_input_tokens_seen": 50701808, + "step": 23440 + }, + { + "epoch": 3.8246329526916805, + "grad_norm": 0.014472625218331814, + "learning_rate": 0.0009748640501817074, + "loss": 0.1025, + "num_input_tokens_seen": 50712592, + "step": 23445 + }, + { + "epoch": 3.8254486133768353, + "grad_norm": 0.022697031497955322, + "learning_rate": 0.0009748417606810319, + "loss": 0.0574, + "num_input_tokens_seen": 50723088, + "step": 23450 + }, + { + "epoch": 3.82626427406199, + "grad_norm": 0.25425323843955994, + "learning_rate": 0.0009748194615570857, + "loss": 0.217, + "num_input_tokens_seen": 50733328, + "step": 23455 + }, + { + "epoch": 3.827079934747145, + "grad_norm": 0.10280811786651611, + "learning_rate": 0.0009747971528103207, + "loss": 0.1424, + "num_input_tokens_seen": 50744976, + "step": 23460 + }, + { + "epoch": 3.8278955954323, + "grad_norm": 0.12097106873989105, + "learning_rate": 0.0009747748344411891, + "loss": 0.1551, + "num_input_tokens_seen": 50755952, + "step": 23465 + }, + { + "epoch": 3.828711256117455, + "grad_norm": 0.15204603970050812, + "learning_rate": 0.0009747525064501433, + "loss": 0.0997, + "num_input_tokens_seen": 50766480, + "step": 23470 + }, + { + "epoch": 3.8295269168026103, + "grad_norm": 0.027186892926692963, + "learning_rate": 0.0009747301688376355, + "loss": 0.1228, + "num_input_tokens_seen": 50777808, + "step": 23475 + }, + { + "epoch": 3.830342577487765, + "grad_norm": 0.03366658836603165, + "learning_rate": 0.0009747078216041187, + "loss": 0.0903, + "num_input_tokens_seen": 50789552, + "step": 23480 + }, + { + "epoch": 3.83115823817292, + "grad_norm": 0.25274983048439026, + "learning_rate": 0.0009746854647500457, + "loss": 0.1051, + "num_input_tokens_seen": 50799920, + "step": 23485 + }, + { + "epoch": 3.831973898858075, + "grad_norm": 0.006881623528897762, + "learning_rate": 0.0009746630982758695, + "loss": 0.0453, + "num_input_tokens_seen": 50811792, + "step": 23490 + }, + { + "epoch": 3.83278955954323, + "grad_norm": 0.002175088506191969, + "learning_rate": 0.0009746407221820435, + "loss": 0.0442, + "num_input_tokens_seen": 50822960, + "step": 23495 + }, + { + "epoch": 3.8336052202283852, + "grad_norm": 0.37563708424568176, + "learning_rate": 0.0009746183364690212, + "loss": 0.1254, + "num_input_tokens_seen": 50833328, + "step": 23500 + }, + { + "epoch": 3.83442088091354, + "grad_norm": 0.058402448892593384, + "learning_rate": 0.0009745959411372561, + "loss": 0.0492, + "num_input_tokens_seen": 50844144, + "step": 23505 + }, + { + "epoch": 3.835236541598695, + "grad_norm": 0.23560450971126556, + "learning_rate": 0.0009745735361872023, + "loss": 0.1266, + "num_input_tokens_seen": 50855440, + "step": 23510 + }, + { + "epoch": 3.8360522022838497, + "grad_norm": 0.13727842271327972, + "learning_rate": 0.0009745511216193137, + "loss": 0.1043, + "num_input_tokens_seen": 50866288, + "step": 23515 + }, + { + "epoch": 3.836867862969005, + "grad_norm": 0.031131109222769737, + "learning_rate": 0.0009745286974340445, + "loss": 0.1187, + "num_input_tokens_seen": 50877360, + "step": 23520 + }, + { + "epoch": 3.8376835236541598, + "grad_norm": 0.0038565329741686583, + "learning_rate": 0.0009745062636318495, + "loss": 0.0595, + "num_input_tokens_seen": 50889424, + "step": 23525 + }, + { + "epoch": 3.838499184339315, + "grad_norm": 0.13137976825237274, + "learning_rate": 0.0009744838202131829, + "loss": 0.1308, + "num_input_tokens_seen": 50900560, + "step": 23530 + }, + { + "epoch": 3.83931484502447, + "grad_norm": 0.0036349627189338207, + "learning_rate": 0.0009744613671784999, + "loss": 0.0695, + "num_input_tokens_seen": 50910256, + "step": 23535 + }, + { + "epoch": 3.8401305057096247, + "grad_norm": 0.07140156626701355, + "learning_rate": 0.0009744389045282554, + "loss": 0.1406, + "num_input_tokens_seen": 50920208, + "step": 23540 + }, + { + "epoch": 3.84094616639478, + "grad_norm": 0.06446415930986404, + "learning_rate": 0.0009744164322629046, + "loss": 0.0901, + "num_input_tokens_seen": 50930832, + "step": 23545 + }, + { + "epoch": 3.8417618270799347, + "grad_norm": 0.055119890719652176, + "learning_rate": 0.0009743939503829027, + "loss": 0.0645, + "num_input_tokens_seen": 50942064, + "step": 23550 + }, + { + "epoch": 3.8425774877650896, + "grad_norm": 0.18346649408340454, + "learning_rate": 0.0009743714588887059, + "loss": 0.377, + "num_input_tokens_seen": 50953008, + "step": 23555 + }, + { + "epoch": 3.843393148450245, + "grad_norm": 0.22450843453407288, + "learning_rate": 0.0009743489577807696, + "loss": 0.1515, + "num_input_tokens_seen": 50964720, + "step": 23560 + }, + { + "epoch": 3.8442088091353996, + "grad_norm": 0.015807831659913063, + "learning_rate": 0.0009743264470595499, + "loss": 0.1297, + "num_input_tokens_seen": 50975088, + "step": 23565 + }, + { + "epoch": 3.8450244698205545, + "grad_norm": 0.10449165105819702, + "learning_rate": 0.0009743039267255031, + "loss": 0.0935, + "num_input_tokens_seen": 50984912, + "step": 23570 + }, + { + "epoch": 3.8458401305057097, + "grad_norm": 0.005697912070900202, + "learning_rate": 0.0009742813967790855, + "loss": 0.0575, + "num_input_tokens_seen": 50996304, + "step": 23575 + }, + { + "epoch": 3.8466557911908645, + "grad_norm": 0.06416153907775879, + "learning_rate": 0.0009742588572207538, + "loss": 0.0658, + "num_input_tokens_seen": 51007376, + "step": 23580 + }, + { + "epoch": 3.84747145187602, + "grad_norm": 0.10796103626489639, + "learning_rate": 0.0009742363080509647, + "loss": 0.1991, + "num_input_tokens_seen": 51018736, + "step": 23585 + }, + { + "epoch": 3.8482871125611746, + "grad_norm": 0.012265535071492195, + "learning_rate": 0.000974213749270175, + "loss": 0.0476, + "num_input_tokens_seen": 51029648, + "step": 23590 + }, + { + "epoch": 3.8491027732463294, + "grad_norm": 0.0124428141862154, + "learning_rate": 0.0009741911808788422, + "loss": 0.1514, + "num_input_tokens_seen": 51041104, + "step": 23595 + }, + { + "epoch": 3.8499184339314843, + "grad_norm": 0.005789014510810375, + "learning_rate": 0.0009741686028774236, + "loss": 0.1404, + "num_input_tokens_seen": 51052432, + "step": 23600 + }, + { + "epoch": 3.8507340946166395, + "grad_norm": 0.05271648243069649, + "learning_rate": 0.0009741460152663768, + "loss": 0.0728, + "num_input_tokens_seen": 51063440, + "step": 23605 + }, + { + "epoch": 3.8515497553017943, + "grad_norm": 0.008030619472265244, + "learning_rate": 0.0009741234180461593, + "loss": 0.1906, + "num_input_tokens_seen": 51074864, + "step": 23610 + }, + { + "epoch": 3.8523654159869496, + "grad_norm": 0.1799931675195694, + "learning_rate": 0.0009741008112172293, + "loss": 0.1197, + "num_input_tokens_seen": 51084848, + "step": 23615 + }, + { + "epoch": 3.8531810766721044, + "grad_norm": 0.24197715520858765, + "learning_rate": 0.0009740781947800452, + "loss": 0.2887, + "num_input_tokens_seen": 51095312, + "step": 23620 + }, + { + "epoch": 3.8539967373572592, + "grad_norm": 0.06715114414691925, + "learning_rate": 0.0009740555687350648, + "loss": 0.078, + "num_input_tokens_seen": 51106800, + "step": 23625 + }, + { + "epoch": 3.8548123980424145, + "grad_norm": 0.05898895114660263, + "learning_rate": 0.0009740329330827471, + "loss": 0.1036, + "num_input_tokens_seen": 51117648, + "step": 23630 + }, + { + "epoch": 3.8556280587275693, + "grad_norm": 0.13618339598178864, + "learning_rate": 0.0009740102878235505, + "loss": 0.0797, + "num_input_tokens_seen": 51129200, + "step": 23635 + }, + { + "epoch": 3.8564437194127246, + "grad_norm": 0.22078992426395416, + "learning_rate": 0.0009739876329579343, + "loss": 0.1101, + "num_input_tokens_seen": 51140080, + "step": 23640 + }, + { + "epoch": 3.8572593800978794, + "grad_norm": 0.25214922428131104, + "learning_rate": 0.0009739649684863572, + "loss": 0.1807, + "num_input_tokens_seen": 51151440, + "step": 23645 + }, + { + "epoch": 3.858075040783034, + "grad_norm": 0.032653022557497025, + "learning_rate": 0.0009739422944092789, + "loss": 0.0547, + "num_input_tokens_seen": 51162896, + "step": 23650 + }, + { + "epoch": 3.858890701468189, + "grad_norm": 0.06532101333141327, + "learning_rate": 0.0009739196107271586, + "loss": 0.109, + "num_input_tokens_seen": 51173232, + "step": 23655 + }, + { + "epoch": 3.8597063621533443, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0009738969174404562, + "loss": 0.0568, + "num_input_tokens_seen": 51184336, + "step": 23660 + }, + { + "epoch": 3.860522022838499, + "grad_norm": 0.023910559713840485, + "learning_rate": 0.0009738742145496318, + "loss": 0.18, + "num_input_tokens_seen": 51195280, + "step": 23665 + }, + { + "epoch": 3.8613376835236544, + "grad_norm": 0.05344879627227783, + "learning_rate": 0.000973851502055145, + "loss": 0.0418, + "num_input_tokens_seen": 51204976, + "step": 23670 + }, + { + "epoch": 3.862153344208809, + "grad_norm": 0.16817040741443634, + "learning_rate": 0.0009738287799574565, + "loss": 0.1508, + "num_input_tokens_seen": 51217328, + "step": 23675 + }, + { + "epoch": 3.862969004893964, + "grad_norm": 0.07454685121774673, + "learning_rate": 0.0009738060482570268, + "loss": 0.0351, + "num_input_tokens_seen": 51228400, + "step": 23680 + }, + { + "epoch": 3.863784665579119, + "grad_norm": 0.1946832686662674, + "learning_rate": 0.0009737833069543163, + "loss": 0.1932, + "num_input_tokens_seen": 51238640, + "step": 23685 + }, + { + "epoch": 3.864600326264274, + "grad_norm": 0.2734370231628418, + "learning_rate": 0.0009737605560497862, + "loss": 0.2889, + "num_input_tokens_seen": 51249136, + "step": 23690 + }, + { + "epoch": 3.865415986949429, + "grad_norm": 0.07125987857580185, + "learning_rate": 0.0009737377955438973, + "loss": 0.051, + "num_input_tokens_seen": 51260112, + "step": 23695 + }, + { + "epoch": 3.866231647634584, + "grad_norm": 0.03408673778176308, + "learning_rate": 0.000973715025437111, + "loss": 0.1153, + "num_input_tokens_seen": 51269616, + "step": 23700 + }, + { + "epoch": 3.867047308319739, + "grad_norm": 0.06401803344488144, + "learning_rate": 0.0009736922457298889, + "loss": 0.131, + "num_input_tokens_seen": 51280624, + "step": 23705 + }, + { + "epoch": 3.867862969004894, + "grad_norm": 0.03508564084768295, + "learning_rate": 0.0009736694564226924, + "loss": 0.13, + "num_input_tokens_seen": 51291952, + "step": 23710 + }, + { + "epoch": 3.868678629690049, + "grad_norm": 0.23152831196784973, + "learning_rate": 0.0009736466575159835, + "loss": 0.1823, + "num_input_tokens_seen": 51302672, + "step": 23715 + }, + { + "epoch": 3.869494290375204, + "grad_norm": 0.008906074799597263, + "learning_rate": 0.0009736238490102243, + "loss": 0.1114, + "num_input_tokens_seen": 51312848, + "step": 23720 + }, + { + "epoch": 3.870309951060359, + "grad_norm": 0.008829626254737377, + "learning_rate": 0.0009736010309058769, + "loss": 0.0331, + "num_input_tokens_seen": 51323376, + "step": 23725 + }, + { + "epoch": 3.871125611745514, + "grad_norm": 0.11298642307519913, + "learning_rate": 0.0009735782032034038, + "loss": 0.1043, + "num_input_tokens_seen": 51334192, + "step": 23730 + }, + { + "epoch": 3.8719412724306688, + "grad_norm": 0.1311386078596115, + "learning_rate": 0.0009735553659032674, + "loss": 0.1297, + "num_input_tokens_seen": 51345072, + "step": 23735 + }, + { + "epoch": 3.8727569331158236, + "grad_norm": 0.09266921132802963, + "learning_rate": 0.000973532519005931, + "loss": 0.049, + "num_input_tokens_seen": 51355280, + "step": 23740 + }, + { + "epoch": 3.873572593800979, + "grad_norm": 0.015555300749838352, + "learning_rate": 0.0009735096625118574, + "loss": 0.0823, + "num_input_tokens_seen": 51366608, + "step": 23745 + }, + { + "epoch": 3.8743882544861337, + "grad_norm": 0.04085804149508476, + "learning_rate": 0.0009734867964215099, + "loss": 0.0795, + "num_input_tokens_seen": 51377488, + "step": 23750 + }, + { + "epoch": 3.875203915171289, + "grad_norm": 0.16222117841243744, + "learning_rate": 0.0009734639207353516, + "loss": 0.1495, + "num_input_tokens_seen": 51389168, + "step": 23755 + }, + { + "epoch": 3.8760195758564437, + "grad_norm": 0.06792500615119934, + "learning_rate": 0.0009734410354538464, + "loss": 0.205, + "num_input_tokens_seen": 51400368, + "step": 23760 + }, + { + "epoch": 3.8768352365415986, + "grad_norm": 0.10864797234535217, + "learning_rate": 0.0009734181405774581, + "loss": 0.0554, + "num_input_tokens_seen": 51410288, + "step": 23765 + }, + { + "epoch": 3.877650897226754, + "grad_norm": 0.013939526863396168, + "learning_rate": 0.0009733952361066505, + "loss": 0.0239, + "num_input_tokens_seen": 51420432, + "step": 23770 + }, + { + "epoch": 3.8784665579119086, + "grad_norm": 0.07833394408226013, + "learning_rate": 0.0009733723220418877, + "loss": 0.3111, + "num_input_tokens_seen": 51429968, + "step": 23775 + }, + { + "epoch": 3.8792822185970635, + "grad_norm": 0.007237799931317568, + "learning_rate": 0.0009733493983836345, + "loss": 0.0471, + "num_input_tokens_seen": 51439312, + "step": 23780 + }, + { + "epoch": 3.8800978792822187, + "grad_norm": 0.09092199802398682, + "learning_rate": 0.0009733264651323553, + "loss": 0.1733, + "num_input_tokens_seen": 51449808, + "step": 23785 + }, + { + "epoch": 3.8809135399673735, + "grad_norm": 0.015112137421965599, + "learning_rate": 0.0009733035222885149, + "loss": 0.0582, + "num_input_tokens_seen": 51460496, + "step": 23790 + }, + { + "epoch": 3.8817292006525284, + "grad_norm": 0.0755823478102684, + "learning_rate": 0.000973280569852578, + "loss": 0.1515, + "num_input_tokens_seen": 51469424, + "step": 23795 + }, + { + "epoch": 3.8825448613376836, + "grad_norm": 0.14067342877388, + "learning_rate": 0.00097325760782501, + "loss": 0.0405, + "num_input_tokens_seen": 51480656, + "step": 23800 + }, + { + "epoch": 3.8833605220228384, + "grad_norm": 0.03899535536766052, + "learning_rate": 0.0009732346362062763, + "loss": 0.1319, + "num_input_tokens_seen": 51491696, + "step": 23805 + }, + { + "epoch": 3.8841761827079937, + "grad_norm": 0.0993603765964508, + "learning_rate": 0.0009732116549968421, + "loss": 0.0957, + "num_input_tokens_seen": 51501584, + "step": 23810 + }, + { + "epoch": 3.8849918433931485, + "grad_norm": 0.19096675515174866, + "learning_rate": 0.0009731886641971737, + "loss": 0.1519, + "num_input_tokens_seen": 51512080, + "step": 23815 + }, + { + "epoch": 3.8858075040783033, + "grad_norm": 0.24470771849155426, + "learning_rate": 0.0009731656638077367, + "loss": 0.1327, + "num_input_tokens_seen": 51522640, + "step": 23820 + }, + { + "epoch": 3.886623164763458, + "grad_norm": 0.16861572861671448, + "learning_rate": 0.0009731426538289971, + "loss": 0.0593, + "num_input_tokens_seen": 51534288, + "step": 23825 + }, + { + "epoch": 3.8874388254486134, + "grad_norm": 0.10015416890382767, + "learning_rate": 0.0009731196342614214, + "loss": 0.0965, + "num_input_tokens_seen": 51544720, + "step": 23830 + }, + { + "epoch": 3.8882544861337682, + "grad_norm": 0.16817638278007507, + "learning_rate": 0.0009730966051054763, + "loss": 0.1047, + "num_input_tokens_seen": 51555376, + "step": 23835 + }, + { + "epoch": 3.8890701468189235, + "grad_norm": 0.10419394820928574, + "learning_rate": 0.0009730735663616281, + "loss": 0.2183, + "num_input_tokens_seen": 51567568, + "step": 23840 + }, + { + "epoch": 3.8898858075040783, + "grad_norm": 0.15064406394958496, + "learning_rate": 0.0009730505180303441, + "loss": 0.201, + "num_input_tokens_seen": 51578384, + "step": 23845 + }, + { + "epoch": 3.890701468189233, + "grad_norm": 0.22134579718112946, + "learning_rate": 0.0009730274601120913, + "loss": 0.1304, + "num_input_tokens_seen": 51588656, + "step": 23850 + }, + { + "epoch": 3.8915171288743884, + "grad_norm": 0.1771828681230545, + "learning_rate": 0.0009730043926073369, + "loss": 0.1495, + "num_input_tokens_seen": 51600112, + "step": 23855 + }, + { + "epoch": 3.892332789559543, + "grad_norm": 0.08342130482196808, + "learning_rate": 0.0009729813155165484, + "loss": 0.171, + "num_input_tokens_seen": 51610864, + "step": 23860 + }, + { + "epoch": 3.8931484502446985, + "grad_norm": 0.046608809381723404, + "learning_rate": 0.0009729582288401934, + "loss": 0.1579, + "num_input_tokens_seen": 51621616, + "step": 23865 + }, + { + "epoch": 3.8939641109298533, + "grad_norm": 0.15302029252052307, + "learning_rate": 0.0009729351325787402, + "loss": 0.1561, + "num_input_tokens_seen": 51633264, + "step": 23870 + }, + { + "epoch": 3.894779771615008, + "grad_norm": 0.04558643698692322, + "learning_rate": 0.0009729120267326564, + "loss": 0.0339, + "num_input_tokens_seen": 51643056, + "step": 23875 + }, + { + "epoch": 3.895595432300163, + "grad_norm": 0.032037895172834396, + "learning_rate": 0.0009728889113024103, + "loss": 0.0536, + "num_input_tokens_seen": 51653200, + "step": 23880 + }, + { + "epoch": 3.896411092985318, + "grad_norm": 0.022089485079050064, + "learning_rate": 0.0009728657862884707, + "loss": 0.102, + "num_input_tokens_seen": 51664048, + "step": 23885 + }, + { + "epoch": 3.897226753670473, + "grad_norm": 0.044861339032649994, + "learning_rate": 0.0009728426516913061, + "loss": 0.0744, + "num_input_tokens_seen": 51674448, + "step": 23890 + }, + { + "epoch": 3.8980424143556283, + "grad_norm": 0.04996919259428978, + "learning_rate": 0.0009728195075113851, + "loss": 0.075, + "num_input_tokens_seen": 51683440, + "step": 23895 + }, + { + "epoch": 3.898858075040783, + "grad_norm": 0.03168988600373268, + "learning_rate": 0.000972796353749177, + "loss": 0.0557, + "num_input_tokens_seen": 51693552, + "step": 23900 + }, + { + "epoch": 3.899673735725938, + "grad_norm": 0.06277398020029068, + "learning_rate": 0.0009727731904051513, + "loss": 0.0248, + "num_input_tokens_seen": 51704048, + "step": 23905 + }, + { + "epoch": 3.9004893964110927, + "grad_norm": 0.413730651140213, + "learning_rate": 0.0009727500174797769, + "loss": 0.2979, + "num_input_tokens_seen": 51713264, + "step": 23910 + }, + { + "epoch": 3.901305057096248, + "grad_norm": 0.02978765405714512, + "learning_rate": 0.0009727268349735237, + "loss": 0.0508, + "num_input_tokens_seen": 51724304, + "step": 23915 + }, + { + "epoch": 3.902120717781403, + "grad_norm": 0.03769057244062424, + "learning_rate": 0.0009727036428868616, + "loss": 0.1514, + "num_input_tokens_seen": 51734832, + "step": 23920 + }, + { + "epoch": 3.902936378466558, + "grad_norm": 0.12522292137145996, + "learning_rate": 0.0009726804412202604, + "loss": 0.0921, + "num_input_tokens_seen": 51745584, + "step": 23925 + }, + { + "epoch": 3.903752039151713, + "grad_norm": 0.056543026119470596, + "learning_rate": 0.0009726572299741904, + "loss": 0.0798, + "num_input_tokens_seen": 51755856, + "step": 23930 + }, + { + "epoch": 3.9045676998368677, + "grad_norm": 0.19713228940963745, + "learning_rate": 0.0009726340091491221, + "loss": 0.2008, + "num_input_tokens_seen": 51766704, + "step": 23935 + }, + { + "epoch": 3.905383360522023, + "grad_norm": 0.05433223396539688, + "learning_rate": 0.000972610778745526, + "loss": 0.0572, + "num_input_tokens_seen": 51778288, + "step": 23940 + }, + { + "epoch": 3.9061990212071778, + "grad_norm": 0.10950616747140884, + "learning_rate": 0.0009725875387638729, + "loss": 0.1825, + "num_input_tokens_seen": 51788464, + "step": 23945 + }, + { + "epoch": 3.907014681892333, + "grad_norm": 0.03161918371915817, + "learning_rate": 0.0009725642892046339, + "loss": 0.0664, + "num_input_tokens_seen": 51798480, + "step": 23950 + }, + { + "epoch": 3.907830342577488, + "grad_norm": 0.05995155870914459, + "learning_rate": 0.00097254103006828, + "loss": 0.0404, + "num_input_tokens_seen": 51808912, + "step": 23955 + }, + { + "epoch": 3.9086460032626427, + "grad_norm": 0.1624925136566162, + "learning_rate": 0.0009725177613552827, + "loss": 0.0665, + "num_input_tokens_seen": 51819600, + "step": 23960 + }, + { + "epoch": 3.9094616639477975, + "grad_norm": 0.10385416448116302, + "learning_rate": 0.0009724944830661135, + "loss": 0.1225, + "num_input_tokens_seen": 51831824, + "step": 23965 + }, + { + "epoch": 3.9102773246329527, + "grad_norm": 0.11179591715335846, + "learning_rate": 0.0009724711952012442, + "loss": 0.0817, + "num_input_tokens_seen": 51842256, + "step": 23970 + }, + { + "epoch": 3.9110929853181076, + "grad_norm": 0.35106027126312256, + "learning_rate": 0.0009724478977611469, + "loss": 0.1216, + "num_input_tokens_seen": 51853840, + "step": 23975 + }, + { + "epoch": 3.911908646003263, + "grad_norm": 0.047677185386419296, + "learning_rate": 0.0009724245907462934, + "loss": 0.3742, + "num_input_tokens_seen": 51864464, + "step": 23980 + }, + { + "epoch": 3.9127243066884176, + "grad_norm": 0.05719529092311859, + "learning_rate": 0.0009724012741571563, + "loss": 0.2108, + "num_input_tokens_seen": 51874224, + "step": 23985 + }, + { + "epoch": 3.9135399673735725, + "grad_norm": 0.14676021039485931, + "learning_rate": 0.000972377947994208, + "loss": 0.0799, + "num_input_tokens_seen": 51886064, + "step": 23990 + }, + { + "epoch": 3.9143556280587277, + "grad_norm": 0.026104595512151718, + "learning_rate": 0.0009723546122579217, + "loss": 0.0623, + "num_input_tokens_seen": 51896880, + "step": 23995 + }, + { + "epoch": 3.9151712887438825, + "grad_norm": 0.07807918637990952, + "learning_rate": 0.0009723312669487696, + "loss": 0.0688, + "num_input_tokens_seen": 51907952, + "step": 24000 + }, + { + "epoch": 3.9159869494290374, + "grad_norm": 0.11770875006914139, + "learning_rate": 0.0009723079120672254, + "loss": 0.1049, + "num_input_tokens_seen": 51918352, + "step": 24005 + }, + { + "epoch": 3.9168026101141926, + "grad_norm": 0.09638605266809464, + "learning_rate": 0.0009722845476137621, + "loss": 0.0385, + "num_input_tokens_seen": 51928976, + "step": 24010 + }, + { + "epoch": 3.9176182707993474, + "grad_norm": 0.02468789368867874, + "learning_rate": 0.0009722611735888532, + "loss": 0.188, + "num_input_tokens_seen": 51940656, + "step": 24015 + }, + { + "epoch": 3.9184339314845023, + "grad_norm": 0.09898632019758224, + "learning_rate": 0.0009722377899929727, + "loss": 0.053, + "num_input_tokens_seen": 51951600, + "step": 24020 + }, + { + "epoch": 3.9192495921696575, + "grad_norm": 0.010132327675819397, + "learning_rate": 0.0009722143968265942, + "loss": 0.0106, + "num_input_tokens_seen": 51962608, + "step": 24025 + }, + { + "epoch": 3.9200652528548123, + "grad_norm": 0.07563593238592148, + "learning_rate": 0.0009721909940901918, + "loss": 0.0415, + "num_input_tokens_seen": 51972816, + "step": 24030 + }, + { + "epoch": 3.9208809135399676, + "grad_norm": 0.01588386856019497, + "learning_rate": 0.0009721675817842402, + "loss": 0.1785, + "num_input_tokens_seen": 51983824, + "step": 24035 + }, + { + "epoch": 3.9216965742251224, + "grad_norm": 0.06717728078365326, + "learning_rate": 0.0009721441599092133, + "loss": 0.177, + "num_input_tokens_seen": 51995120, + "step": 24040 + }, + { + "epoch": 3.9225122349102772, + "grad_norm": 0.1424424946308136, + "learning_rate": 0.0009721207284655862, + "loss": 0.0805, + "num_input_tokens_seen": 52004400, + "step": 24045 + }, + { + "epoch": 3.923327895595432, + "grad_norm": 0.027268648147583008, + "learning_rate": 0.0009720972874538334, + "loss": 0.1578, + "num_input_tokens_seen": 52014896, + "step": 24050 + }, + { + "epoch": 3.9241435562805873, + "grad_norm": 0.019477305933833122, + "learning_rate": 0.0009720738368744304, + "loss": 0.1404, + "num_input_tokens_seen": 52026160, + "step": 24055 + }, + { + "epoch": 3.924959216965742, + "grad_norm": 0.035887766629457474, + "learning_rate": 0.0009720503767278522, + "loss": 0.0717, + "num_input_tokens_seen": 52036784, + "step": 24060 + }, + { + "epoch": 3.9257748776508974, + "grad_norm": 0.11439003050327301, + "learning_rate": 0.0009720269070145742, + "loss": 0.0399, + "num_input_tokens_seen": 52046512, + "step": 24065 + }, + { + "epoch": 3.926590538336052, + "grad_norm": 0.29793015122413635, + "learning_rate": 0.000972003427735072, + "loss": 0.1544, + "num_input_tokens_seen": 52055664, + "step": 24070 + }, + { + "epoch": 3.927406199021207, + "grad_norm": 0.23559176921844482, + "learning_rate": 0.0009719799388898219, + "loss": 0.1592, + "num_input_tokens_seen": 52065424, + "step": 24075 + }, + { + "epoch": 3.9282218597063623, + "grad_norm": 0.02124435268342495, + "learning_rate": 0.0009719564404792993, + "loss": 0.1231, + "num_input_tokens_seen": 52074288, + "step": 24080 + }, + { + "epoch": 3.929037520391517, + "grad_norm": 0.10373537242412567, + "learning_rate": 0.0009719329325039807, + "loss": 0.0768, + "num_input_tokens_seen": 52086064, + "step": 24085 + }, + { + "epoch": 3.9298531810766724, + "grad_norm": 0.18443343043327332, + "learning_rate": 0.0009719094149643426, + "loss": 0.0996, + "num_input_tokens_seen": 52096560, + "step": 24090 + }, + { + "epoch": 3.930668841761827, + "grad_norm": 0.10123252868652344, + "learning_rate": 0.0009718858878608617, + "loss": 0.0788, + "num_input_tokens_seen": 52107280, + "step": 24095 + }, + { + "epoch": 3.931484502446982, + "grad_norm": 0.20112597942352295, + "learning_rate": 0.0009718623511940145, + "loss": 0.1425, + "num_input_tokens_seen": 52118064, + "step": 24100 + }, + { + "epoch": 3.932300163132137, + "grad_norm": 0.01606622524559498, + "learning_rate": 0.0009718388049642781, + "loss": 0.0863, + "num_input_tokens_seen": 52129104, + "step": 24105 + }, + { + "epoch": 3.933115823817292, + "grad_norm": 0.08770886808633804, + "learning_rate": 0.00097181524917213, + "loss": 0.041, + "num_input_tokens_seen": 52139600, + "step": 24110 + }, + { + "epoch": 3.933931484502447, + "grad_norm": 0.005683081690222025, + "learning_rate": 0.0009717916838180471, + "loss": 0.0594, + "num_input_tokens_seen": 52149744, + "step": 24115 + }, + { + "epoch": 3.934747145187602, + "grad_norm": 0.06411273777484894, + "learning_rate": 0.0009717681089025073, + "loss": 0.0694, + "num_input_tokens_seen": 52160592, + "step": 24120 + }, + { + "epoch": 3.935562805872757, + "grad_norm": 0.08481893688440323, + "learning_rate": 0.0009717445244259882, + "loss": 0.052, + "num_input_tokens_seen": 52170576, + "step": 24125 + }, + { + "epoch": 3.936378466557912, + "grad_norm": 0.1713237464427948, + "learning_rate": 0.0009717209303889679, + "loss": 0.0488, + "num_input_tokens_seen": 52181264, + "step": 24130 + }, + { + "epoch": 3.9371941272430666, + "grad_norm": 0.10465515404939651, + "learning_rate": 0.0009716973267919246, + "loss": 0.0216, + "num_input_tokens_seen": 52192432, + "step": 24135 + }, + { + "epoch": 3.938009787928222, + "grad_norm": 0.25784167647361755, + "learning_rate": 0.0009716737136353365, + "loss": 0.2222, + "num_input_tokens_seen": 52202896, + "step": 24140 + }, + { + "epoch": 3.9388254486133767, + "grad_norm": 0.03767762333154678, + "learning_rate": 0.0009716500909196824, + "loss": 0.1513, + "num_input_tokens_seen": 52214608, + "step": 24145 + }, + { + "epoch": 3.939641109298532, + "grad_norm": 0.0148626072332263, + "learning_rate": 0.0009716264586454406, + "loss": 0.0515, + "num_input_tokens_seen": 52226224, + "step": 24150 + }, + { + "epoch": 3.9404567699836868, + "grad_norm": 0.1550069898366928, + "learning_rate": 0.0009716028168130906, + "loss": 0.224, + "num_input_tokens_seen": 52237072, + "step": 24155 + }, + { + "epoch": 3.9412724306688416, + "grad_norm": 0.013852902688086033, + "learning_rate": 0.000971579165423111, + "loss": 0.0268, + "num_input_tokens_seen": 52247696, + "step": 24160 + }, + { + "epoch": 3.942088091353997, + "grad_norm": 0.03343448415398598, + "learning_rate": 0.0009715555044759815, + "loss": 0.0365, + "num_input_tokens_seen": 52259120, + "step": 24165 + }, + { + "epoch": 3.9429037520391517, + "grad_norm": 0.03657007962465286, + "learning_rate": 0.0009715318339721814, + "loss": 0.2922, + "num_input_tokens_seen": 52269136, + "step": 24170 + }, + { + "epoch": 3.943719412724307, + "grad_norm": 0.1288510113954544, + "learning_rate": 0.0009715081539121908, + "loss": 0.0949, + "num_input_tokens_seen": 52280560, + "step": 24175 + }, + { + "epoch": 3.9445350734094617, + "grad_norm": 0.07137199491262436, + "learning_rate": 0.0009714844642964891, + "loss": 0.0433, + "num_input_tokens_seen": 52289072, + "step": 24180 + }, + { + "epoch": 3.9453507340946166, + "grad_norm": 0.04436810687184334, + "learning_rate": 0.0009714607651255565, + "loss": 0.1238, + "num_input_tokens_seen": 52301264, + "step": 24185 + }, + { + "epoch": 3.9461663947797714, + "grad_norm": 0.037092193961143494, + "learning_rate": 0.0009714370563998736, + "loss": 0.0356, + "num_input_tokens_seen": 52311760, + "step": 24190 + }, + { + "epoch": 3.9469820554649266, + "grad_norm": 0.03281337395310402, + "learning_rate": 0.0009714133381199205, + "loss": 0.0545, + "num_input_tokens_seen": 52322160, + "step": 24195 + }, + { + "epoch": 3.9477977161500815, + "grad_norm": 0.09527835249900818, + "learning_rate": 0.0009713896102861782, + "loss": 0.0856, + "num_input_tokens_seen": 52331760, + "step": 24200 + }, + { + "epoch": 3.9486133768352367, + "grad_norm": 0.13394109904766083, + "learning_rate": 0.0009713658728991274, + "loss": 0.1299, + "num_input_tokens_seen": 52342512, + "step": 24205 + }, + { + "epoch": 3.9494290375203915, + "grad_norm": 0.1500953584909439, + "learning_rate": 0.0009713421259592493, + "loss": 0.0533, + "num_input_tokens_seen": 52352784, + "step": 24210 + }, + { + "epoch": 3.9502446982055464, + "grad_norm": 0.1007457748055458, + "learning_rate": 0.0009713183694670249, + "loss": 0.0409, + "num_input_tokens_seen": 52363920, + "step": 24215 + }, + { + "epoch": 3.9510603588907016, + "grad_norm": 0.020904328674077988, + "learning_rate": 0.000971294603422936, + "loss": 0.0296, + "num_input_tokens_seen": 52373904, + "step": 24220 + }, + { + "epoch": 3.9518760195758564, + "grad_norm": 0.17763468623161316, + "learning_rate": 0.000971270827827464, + "loss": 0.1151, + "num_input_tokens_seen": 52383152, + "step": 24225 + }, + { + "epoch": 3.9526916802610113, + "grad_norm": 0.003645398421213031, + "learning_rate": 0.0009712470426810909, + "loss": 0.0381, + "num_input_tokens_seen": 52393968, + "step": 24230 + }, + { + "epoch": 3.9535073409461665, + "grad_norm": 0.004083213862031698, + "learning_rate": 0.0009712232479842986, + "loss": 0.1002, + "num_input_tokens_seen": 52404368, + "step": 24235 + }, + { + "epoch": 3.9543230016313213, + "grad_norm": 0.08057636767625809, + "learning_rate": 0.0009711994437375693, + "loss": 0.0359, + "num_input_tokens_seen": 52415824, + "step": 24240 + }, + { + "epoch": 3.955138662316476, + "grad_norm": 0.06122511997818947, + "learning_rate": 0.0009711756299413856, + "loss": 0.0688, + "num_input_tokens_seen": 52426576, + "step": 24245 + }, + { + "epoch": 3.9559543230016314, + "grad_norm": 0.014753523282706738, + "learning_rate": 0.0009711518065962302, + "loss": 0.1354, + "num_input_tokens_seen": 52437456, + "step": 24250 + }, + { + "epoch": 3.9567699836867862, + "grad_norm": 0.2708718180656433, + "learning_rate": 0.0009711279737025856, + "loss": 0.1888, + "num_input_tokens_seen": 52448720, + "step": 24255 + }, + { + "epoch": 3.9575856443719415, + "grad_norm": 0.18782684206962585, + "learning_rate": 0.0009711041312609349, + "loss": 0.1408, + "num_input_tokens_seen": 52459024, + "step": 24260 + }, + { + "epoch": 3.9584013050570963, + "grad_norm": 0.017271332442760468, + "learning_rate": 0.0009710802792717613, + "loss": 0.0117, + "num_input_tokens_seen": 52469872, + "step": 24265 + }, + { + "epoch": 3.959216965742251, + "grad_norm": 0.13320668041706085, + "learning_rate": 0.0009710564177355483, + "loss": 0.1292, + "num_input_tokens_seen": 52481264, + "step": 24270 + }, + { + "epoch": 3.960032626427406, + "grad_norm": 0.06218891963362694, + "learning_rate": 0.0009710325466527794, + "loss": 0.054, + "num_input_tokens_seen": 52492240, + "step": 24275 + }, + { + "epoch": 3.960848287112561, + "grad_norm": 0.002708859508857131, + "learning_rate": 0.0009710086660239386, + "loss": 0.105, + "num_input_tokens_seen": 52502320, + "step": 24280 + }, + { + "epoch": 3.961663947797716, + "grad_norm": 0.004188997205346823, + "learning_rate": 0.0009709847758495094, + "loss": 0.0715, + "num_input_tokens_seen": 52512336, + "step": 24285 + }, + { + "epoch": 3.9624796084828713, + "grad_norm": 0.009143562987446785, + "learning_rate": 0.0009709608761299763, + "loss": 0.1352, + "num_input_tokens_seen": 52522192, + "step": 24290 + }, + { + "epoch": 3.963295269168026, + "grad_norm": 0.10898889601230621, + "learning_rate": 0.0009709369668658237, + "loss": 0.084, + "num_input_tokens_seen": 52532144, + "step": 24295 + }, + { + "epoch": 3.964110929853181, + "grad_norm": 0.061856959015131, + "learning_rate": 0.0009709130480575359, + "loss": 0.1064, + "num_input_tokens_seen": 52543824, + "step": 24300 + }, + { + "epoch": 3.964926590538336, + "grad_norm": 0.16867460310459137, + "learning_rate": 0.0009708891197055978, + "loss": 0.1489, + "num_input_tokens_seen": 52553552, + "step": 24305 + }, + { + "epoch": 3.965742251223491, + "grad_norm": 0.03134358301758766, + "learning_rate": 0.0009708651818104943, + "loss": 0.1136, + "num_input_tokens_seen": 52564528, + "step": 24310 + }, + { + "epoch": 3.9665579119086463, + "grad_norm": 0.0166754350066185, + "learning_rate": 0.0009708412343727106, + "loss": 0.0494, + "num_input_tokens_seen": 52574160, + "step": 24315 + }, + { + "epoch": 3.967373572593801, + "grad_norm": 0.014139552600681782, + "learning_rate": 0.000970817277392732, + "loss": 0.2645, + "num_input_tokens_seen": 52584368, + "step": 24320 + }, + { + "epoch": 3.968189233278956, + "grad_norm": 0.20456832647323608, + "learning_rate": 0.000970793310871044, + "loss": 0.1731, + "num_input_tokens_seen": 52595664, + "step": 24325 + }, + { + "epoch": 3.9690048939641107, + "grad_norm": 0.05795443058013916, + "learning_rate": 0.0009707693348081323, + "loss": 0.1666, + "num_input_tokens_seen": 52607472, + "step": 24330 + }, + { + "epoch": 3.969820554649266, + "grad_norm": 0.031290389597415924, + "learning_rate": 0.0009707453492044829, + "loss": 0.0734, + "num_input_tokens_seen": 52618864, + "step": 24335 + }, + { + "epoch": 3.970636215334421, + "grad_norm": 0.0793827474117279, + "learning_rate": 0.0009707213540605817, + "loss": 0.1576, + "num_input_tokens_seen": 52628656, + "step": 24340 + }, + { + "epoch": 3.971451876019576, + "grad_norm": 0.055470243096351624, + "learning_rate": 0.0009706973493769152, + "loss": 0.1762, + "num_input_tokens_seen": 52638832, + "step": 24345 + }, + { + "epoch": 3.972267536704731, + "grad_norm": 0.05963335558772087, + "learning_rate": 0.0009706733351539696, + "loss": 0.0783, + "num_input_tokens_seen": 52648400, + "step": 24350 + }, + { + "epoch": 3.9730831973898857, + "grad_norm": 0.08069650828838348, + "learning_rate": 0.0009706493113922318, + "loss": 0.0823, + "num_input_tokens_seen": 52659216, + "step": 24355 + }, + { + "epoch": 3.9738988580750405, + "grad_norm": 0.08950478583574295, + "learning_rate": 0.000970625278092189, + "loss": 0.1453, + "num_input_tokens_seen": 52669680, + "step": 24360 + }, + { + "epoch": 3.9747145187601958, + "grad_norm": 0.1317376345396042, + "learning_rate": 0.0009706012352543276, + "loss": 0.1271, + "num_input_tokens_seen": 52681008, + "step": 24365 + }, + { + "epoch": 3.9755301794453506, + "grad_norm": 0.014402917586266994, + "learning_rate": 0.0009705771828791353, + "loss": 0.1912, + "num_input_tokens_seen": 52691312, + "step": 24370 + }, + { + "epoch": 3.976345840130506, + "grad_norm": 0.09558943659067154, + "learning_rate": 0.0009705531209670993, + "loss": 0.1728, + "num_input_tokens_seen": 52701712, + "step": 24375 + }, + { + "epoch": 3.9771615008156607, + "grad_norm": 0.020290644839406013, + "learning_rate": 0.0009705290495187073, + "loss": 0.1391, + "num_input_tokens_seen": 52713648, + "step": 24380 + }, + { + "epoch": 3.9779771615008155, + "grad_norm": 0.03894273191690445, + "learning_rate": 0.0009705049685344474, + "loss": 0.1819, + "num_input_tokens_seen": 52724656, + "step": 24385 + }, + { + "epoch": 3.9787928221859707, + "grad_norm": 0.017061561346054077, + "learning_rate": 0.0009704808780148074, + "loss": 0.1858, + "num_input_tokens_seen": 52735824, + "step": 24390 + }, + { + "epoch": 3.9796084828711256, + "grad_norm": 0.08352766185998917, + "learning_rate": 0.0009704567779602754, + "loss": 0.1287, + "num_input_tokens_seen": 52747344, + "step": 24395 + }, + { + "epoch": 3.980424143556281, + "grad_norm": 0.08225654065608978, + "learning_rate": 0.0009704326683713402, + "loss": 0.0856, + "num_input_tokens_seen": 52758320, + "step": 24400 + }, + { + "epoch": 3.9812398042414356, + "grad_norm": 0.143769770860672, + "learning_rate": 0.00097040854924849, + "loss": 0.1, + "num_input_tokens_seen": 52768752, + "step": 24405 + }, + { + "epoch": 3.9820554649265905, + "grad_norm": 0.03472739830613136, + "learning_rate": 0.0009703844205922139, + "loss": 0.1142, + "num_input_tokens_seen": 52779376, + "step": 24410 + }, + { + "epoch": 3.9828711256117453, + "grad_norm": 0.07649330794811249, + "learning_rate": 0.0009703602824030007, + "loss": 0.1169, + "num_input_tokens_seen": 52790064, + "step": 24415 + }, + { + "epoch": 3.9836867862969005, + "grad_norm": 0.15564045310020447, + "learning_rate": 0.0009703361346813398, + "loss": 0.0573, + "num_input_tokens_seen": 52800080, + "step": 24420 + }, + { + "epoch": 3.9845024469820554, + "grad_norm": 0.006799460854381323, + "learning_rate": 0.0009703119774277205, + "loss": 0.1825, + "num_input_tokens_seen": 52811632, + "step": 24425 + }, + { + "epoch": 3.9853181076672106, + "grad_norm": 0.08229997754096985, + "learning_rate": 0.0009702878106426321, + "loss": 0.0794, + "num_input_tokens_seen": 52822192, + "step": 24430 + }, + { + "epoch": 3.9861337683523654, + "grad_norm": 0.14275634288787842, + "learning_rate": 0.0009702636343265649, + "loss": 0.085, + "num_input_tokens_seen": 52833168, + "step": 24435 + }, + { + "epoch": 3.9869494290375203, + "grad_norm": 0.08394118398427963, + "learning_rate": 0.0009702394484800084, + "loss": 0.1069, + "num_input_tokens_seen": 52843024, + "step": 24440 + }, + { + "epoch": 3.9877650897226755, + "grad_norm": 0.09326915442943573, + "learning_rate": 0.000970215253103453, + "loss": 0.1826, + "num_input_tokens_seen": 52852528, + "step": 24445 + }, + { + "epoch": 3.9885807504078303, + "grad_norm": 0.13007716834545135, + "learning_rate": 0.0009701910481973889, + "loss": 0.1718, + "num_input_tokens_seen": 52862288, + "step": 24450 + }, + { + "epoch": 3.9893964110929856, + "grad_norm": 0.10196779668331146, + "learning_rate": 0.0009701668337623069, + "loss": 0.1418, + "num_input_tokens_seen": 52873584, + "step": 24455 + }, + { + "epoch": 3.9902120717781404, + "grad_norm": 0.15357103943824768, + "learning_rate": 0.0009701426097986974, + "loss": 0.1067, + "num_input_tokens_seen": 52883856, + "step": 24460 + }, + { + "epoch": 3.9910277324632952, + "grad_norm": 0.11803940683603287, + "learning_rate": 0.0009701183763070516, + "loss": 0.1075, + "num_input_tokens_seen": 52895120, + "step": 24465 + }, + { + "epoch": 3.99184339314845, + "grad_norm": 0.1026521846652031, + "learning_rate": 0.0009700941332878605, + "loss": 0.134, + "num_input_tokens_seen": 52905648, + "step": 24470 + }, + { + "epoch": 3.9926590538336053, + "grad_norm": 0.04510059580206871, + "learning_rate": 0.0009700698807416153, + "loss": 0.082, + "num_input_tokens_seen": 52917168, + "step": 24475 + }, + { + "epoch": 3.99347471451876, + "grad_norm": 0.029394259676337242, + "learning_rate": 0.0009700456186688078, + "loss": 0.1809, + "num_input_tokens_seen": 52928208, + "step": 24480 + }, + { + "epoch": 3.9942903752039154, + "grad_norm": 0.13904544711112976, + "learning_rate": 0.0009700213470699295, + "loss": 0.1228, + "num_input_tokens_seen": 52939088, + "step": 24485 + }, + { + "epoch": 3.99510603588907, + "grad_norm": 0.06425957381725311, + "learning_rate": 0.0009699970659454723, + "loss": 0.0245, + "num_input_tokens_seen": 52949008, + "step": 24490 + }, + { + "epoch": 3.995921696574225, + "grad_norm": 0.009191847406327724, + "learning_rate": 0.0009699727752959284, + "loss": 0.1855, + "num_input_tokens_seen": 52959600, + "step": 24495 + }, + { + "epoch": 3.99673735725938, + "grad_norm": 0.20780709385871887, + "learning_rate": 0.00096994847512179, + "loss": 0.0849, + "num_input_tokens_seen": 52969488, + "step": 24500 + }, + { + "epoch": 3.997553017944535, + "grad_norm": 0.05502910912036896, + "learning_rate": 0.0009699241654235495, + "loss": 0.1294, + "num_input_tokens_seen": 52980848, + "step": 24505 + }, + { + "epoch": 3.99836867862969, + "grad_norm": 0.06313939392566681, + "learning_rate": 0.0009698998462016997, + "loss": 0.0386, + "num_input_tokens_seen": 52991600, + "step": 24510 + }, + { + "epoch": 3.999184339314845, + "grad_norm": 0.10375002026557922, + "learning_rate": 0.0009698755174567333, + "loss": 0.0745, + "num_input_tokens_seen": 53001680, + "step": 24515 + }, + { + "epoch": 4.0, + "grad_norm": 0.1650952696800232, + "learning_rate": 0.0009698511791891435, + "loss": 0.2173, + "num_input_tokens_seen": 53010912, + "step": 24520 + }, + { + "epoch": 4.0, + "eval_loss": 0.12605686485767365, + "eval_runtime": 103.1538, + "eval_samples_per_second": 26.417, + "eval_steps_per_second": 6.611, + "num_input_tokens_seen": 53010912, + "step": 24520 + }, + { + "epoch": 4.000815660685155, + "grad_norm": 0.02118796855211258, + "learning_rate": 0.0009698268313994236, + "loss": 0.044, + "num_input_tokens_seen": 53022112, + "step": 24525 + }, + { + "epoch": 4.00163132137031, + "grad_norm": 0.07317659258842468, + "learning_rate": 0.0009698024740880668, + "loss": 0.0935, + "num_input_tokens_seen": 53033056, + "step": 24530 + }, + { + "epoch": 4.002446982055465, + "grad_norm": 0.15030977129936218, + "learning_rate": 0.0009697781072555672, + "loss": 0.1966, + "num_input_tokens_seen": 53043904, + "step": 24535 + }, + { + "epoch": 4.00326264274062, + "grad_norm": 0.11619038134813309, + "learning_rate": 0.0009697537309024181, + "loss": 0.0746, + "num_input_tokens_seen": 53054080, + "step": 24540 + }, + { + "epoch": 4.004078303425775, + "grad_norm": 0.026329301297664642, + "learning_rate": 0.0009697293450291136, + "loss": 0.19, + "num_input_tokens_seen": 53064256, + "step": 24545 + }, + { + "epoch": 4.00489396411093, + "grad_norm": 0.01354842260479927, + "learning_rate": 0.0009697049496361481, + "loss": 0.125, + "num_input_tokens_seen": 53075200, + "step": 24550 + }, + { + "epoch": 4.005709624796085, + "grad_norm": 0.008432844653725624, + "learning_rate": 0.000969680544724016, + "loss": 0.0372, + "num_input_tokens_seen": 53086496, + "step": 24555 + }, + { + "epoch": 4.006525285481239, + "grad_norm": 0.13896256685256958, + "learning_rate": 0.0009696561302932117, + "loss": 0.1721, + "num_input_tokens_seen": 53097216, + "step": 24560 + }, + { + "epoch": 4.007340946166395, + "grad_norm": 0.012351407669484615, + "learning_rate": 0.0009696317063442303, + "loss": 0.1693, + "num_input_tokens_seen": 53108160, + "step": 24565 + }, + { + "epoch": 4.00815660685155, + "grad_norm": 0.04247596859931946, + "learning_rate": 0.0009696072728775664, + "loss": 0.0845, + "num_input_tokens_seen": 53119520, + "step": 24570 + }, + { + "epoch": 4.008972267536705, + "grad_norm": 0.03004402108490467, + "learning_rate": 0.0009695828298937155, + "loss": 0.1213, + "num_input_tokens_seen": 53130976, + "step": 24575 + }, + { + "epoch": 4.00978792822186, + "grad_norm": 0.023356273770332336, + "learning_rate": 0.0009695583773931728, + "loss": 0.0444, + "num_input_tokens_seen": 53141504, + "step": 24580 + }, + { + "epoch": 4.010603588907014, + "grad_norm": 0.06728272885084152, + "learning_rate": 0.000969533915376434, + "loss": 0.0355, + "num_input_tokens_seen": 53151360, + "step": 24585 + }, + { + "epoch": 4.011419249592169, + "grad_norm": 0.0781501904129982, + "learning_rate": 0.0009695094438439947, + "loss": 0.0826, + "num_input_tokens_seen": 53163008, + "step": 24590 + }, + { + "epoch": 4.012234910277325, + "grad_norm": 0.00982770137488842, + "learning_rate": 0.000969484962796351, + "loss": 0.1427, + "num_input_tokens_seen": 53175200, + "step": 24595 + }, + { + "epoch": 4.01305057096248, + "grad_norm": 0.009649750776588917, + "learning_rate": 0.0009694604722339987, + "loss": 0.1421, + "num_input_tokens_seen": 53185952, + "step": 24600 + }, + { + "epoch": 4.013866231647635, + "grad_norm": 0.038069020956754684, + "learning_rate": 0.0009694359721574345, + "loss": 0.2128, + "num_input_tokens_seen": 53196704, + "step": 24605 + }, + { + "epoch": 4.014681892332789, + "grad_norm": 0.0761687308549881, + "learning_rate": 0.0009694114625671548, + "loss": 0.1306, + "num_input_tokens_seen": 53206272, + "step": 24610 + }, + { + "epoch": 4.015497553017944, + "grad_norm": 0.02528243139386177, + "learning_rate": 0.0009693869434636564, + "loss": 0.1686, + "num_input_tokens_seen": 53217248, + "step": 24615 + }, + { + "epoch": 4.0163132137031, + "grad_norm": 0.1863255798816681, + "learning_rate": 0.000969362414847436, + "loss": 0.0935, + "num_input_tokens_seen": 53226176, + "step": 24620 + }, + { + "epoch": 4.017128874388255, + "grad_norm": 0.19547618925571442, + "learning_rate": 0.0009693378767189909, + "loss": 0.2296, + "num_input_tokens_seen": 53236096, + "step": 24625 + }, + { + "epoch": 4.0179445350734095, + "grad_norm": 0.05746988207101822, + "learning_rate": 0.0009693133290788184, + "loss": 0.1009, + "num_input_tokens_seen": 53246496, + "step": 24630 + }, + { + "epoch": 4.018760195758564, + "grad_norm": 0.04920806735754013, + "learning_rate": 0.0009692887719274159, + "loss": 0.1455, + "num_input_tokens_seen": 53257792, + "step": 24635 + }, + { + "epoch": 4.019575856443719, + "grad_norm": 0.04692168906331062, + "learning_rate": 0.0009692642052652811, + "loss": 0.1514, + "num_input_tokens_seen": 53269312, + "step": 24640 + }, + { + "epoch": 4.020391517128874, + "grad_norm": 0.03407781943678856, + "learning_rate": 0.0009692396290929118, + "loss": 0.0826, + "num_input_tokens_seen": 53278944, + "step": 24645 + }, + { + "epoch": 4.02120717781403, + "grad_norm": 0.019528646022081375, + "learning_rate": 0.0009692150434108061, + "loss": 0.1115, + "num_input_tokens_seen": 53290656, + "step": 24650 + }, + { + "epoch": 4.0220228384991845, + "grad_norm": 0.04901457577943802, + "learning_rate": 0.0009691904482194625, + "loss": 0.0774, + "num_input_tokens_seen": 53300736, + "step": 24655 + }, + { + "epoch": 4.022838499184339, + "grad_norm": 0.06730242073535919, + "learning_rate": 0.000969165843519379, + "loss": 0.1215, + "num_input_tokens_seen": 53312032, + "step": 24660 + }, + { + "epoch": 4.023654159869494, + "grad_norm": 0.26630842685699463, + "learning_rate": 0.0009691412293110546, + "loss": 0.2011, + "num_input_tokens_seen": 53323616, + "step": 24665 + }, + { + "epoch": 4.024469820554649, + "grad_norm": 0.0840611532330513, + "learning_rate": 0.0009691166055949881, + "loss": 0.0298, + "num_input_tokens_seen": 53334848, + "step": 24670 + }, + { + "epoch": 4.025285481239805, + "grad_norm": 0.020203417167067528, + "learning_rate": 0.0009690919723716785, + "loss": 0.0755, + "num_input_tokens_seen": 53346208, + "step": 24675 + }, + { + "epoch": 4.0261011419249595, + "grad_norm": 0.08350825309753418, + "learning_rate": 0.000969067329641625, + "loss": 0.1501, + "num_input_tokens_seen": 53355520, + "step": 24680 + }, + { + "epoch": 4.026916802610114, + "grad_norm": 0.05830421671271324, + "learning_rate": 0.000969042677405327, + "loss": 0.1252, + "num_input_tokens_seen": 53366144, + "step": 24685 + }, + { + "epoch": 4.027732463295269, + "grad_norm": 0.03297814726829529, + "learning_rate": 0.0009690180156632839, + "loss": 0.1003, + "num_input_tokens_seen": 53377568, + "step": 24690 + }, + { + "epoch": 4.028548123980424, + "grad_norm": 0.015535816550254822, + "learning_rate": 0.000968993344415996, + "loss": 0.0405, + "num_input_tokens_seen": 53388320, + "step": 24695 + }, + { + "epoch": 4.029363784665579, + "grad_norm": 0.05832860991358757, + "learning_rate": 0.0009689686636639629, + "loss": 0.1678, + "num_input_tokens_seen": 53398880, + "step": 24700 + }, + { + "epoch": 4.0301794453507345, + "grad_norm": 0.1182079017162323, + "learning_rate": 0.000968943973407685, + "loss": 0.0647, + "num_input_tokens_seen": 53409376, + "step": 24705 + }, + { + "epoch": 4.030995106035889, + "grad_norm": 0.018100082874298096, + "learning_rate": 0.0009689192736476624, + "loss": 0.1543, + "num_input_tokens_seen": 53420736, + "step": 24710 + }, + { + "epoch": 4.031810766721044, + "grad_norm": 0.1703413426876068, + "learning_rate": 0.000968894564384396, + "loss": 0.1288, + "num_input_tokens_seen": 53431264, + "step": 24715 + }, + { + "epoch": 4.032626427406199, + "grad_norm": 0.03496500477194786, + "learning_rate": 0.0009688698456183863, + "loss": 0.1077, + "num_input_tokens_seen": 53441504, + "step": 24720 + }, + { + "epoch": 4.033442088091354, + "grad_norm": 0.018825236707925797, + "learning_rate": 0.0009688451173501345, + "loss": 0.0318, + "num_input_tokens_seen": 53451616, + "step": 24725 + }, + { + "epoch": 4.034257748776509, + "grad_norm": 0.07746203988790512, + "learning_rate": 0.0009688203795801415, + "loss": 0.2134, + "num_input_tokens_seen": 53463456, + "step": 24730 + }, + { + "epoch": 4.035073409461664, + "grad_norm": 0.01029142364859581, + "learning_rate": 0.0009687956323089088, + "loss": 0.0487, + "num_input_tokens_seen": 53474624, + "step": 24735 + }, + { + "epoch": 4.035889070146819, + "grad_norm": 0.06122157722711563, + "learning_rate": 0.000968770875536938, + "loss": 0.4163, + "num_input_tokens_seen": 53486304, + "step": 24740 + }, + { + "epoch": 4.036704730831974, + "grad_norm": 0.2235114872455597, + "learning_rate": 0.0009687461092647308, + "loss": 0.2131, + "num_input_tokens_seen": 53497152, + "step": 24745 + }, + { + "epoch": 4.037520391517129, + "grad_norm": 0.09703657776117325, + "learning_rate": 0.0009687213334927888, + "loss": 0.2333, + "num_input_tokens_seen": 53508704, + "step": 24750 + }, + { + "epoch": 4.0383360522022835, + "grad_norm": 0.15249381959438324, + "learning_rate": 0.0009686965482216145, + "loss": 0.0816, + "num_input_tokens_seen": 53519776, + "step": 24755 + }, + { + "epoch": 4.039151712887439, + "grad_norm": 0.13171689212322235, + "learning_rate": 0.00096867175345171, + "loss": 0.1609, + "num_input_tokens_seen": 53529952, + "step": 24760 + }, + { + "epoch": 4.039967373572594, + "grad_norm": 0.0511307455599308, + "learning_rate": 0.0009686469491835779, + "loss": 0.157, + "num_input_tokens_seen": 53541440, + "step": 24765 + }, + { + "epoch": 4.040783034257749, + "grad_norm": 0.11280439794063568, + "learning_rate": 0.0009686221354177209, + "loss": 0.1204, + "num_input_tokens_seen": 53551008, + "step": 24770 + }, + { + "epoch": 4.041598694942904, + "grad_norm": 0.08131375908851624, + "learning_rate": 0.0009685973121546417, + "loss": 0.0589, + "num_input_tokens_seen": 53560832, + "step": 24775 + }, + { + "epoch": 4.0424143556280585, + "grad_norm": 0.07098285108804703, + "learning_rate": 0.0009685724793948436, + "loss": 0.1964, + "num_input_tokens_seen": 53571264, + "step": 24780 + }, + { + "epoch": 4.043230016313213, + "grad_norm": 0.04006614908576012, + "learning_rate": 0.0009685476371388298, + "loss": 0.0495, + "num_input_tokens_seen": 53582272, + "step": 24785 + }, + { + "epoch": 4.044045676998369, + "grad_norm": 0.07132290303707123, + "learning_rate": 0.0009685227853871037, + "loss": 0.0748, + "num_input_tokens_seen": 53592992, + "step": 24790 + }, + { + "epoch": 4.044861337683524, + "grad_norm": 0.02085905708372593, + "learning_rate": 0.000968497924140169, + "loss": 0.0274, + "num_input_tokens_seen": 53603712, + "step": 24795 + }, + { + "epoch": 4.045676998368679, + "grad_norm": 0.04866085574030876, + "learning_rate": 0.0009684730533985296, + "loss": 0.062, + "num_input_tokens_seen": 53613984, + "step": 24800 + }, + { + "epoch": 4.0464926590538335, + "grad_norm": 0.314547061920166, + "learning_rate": 0.0009684481731626895, + "loss": 0.2072, + "num_input_tokens_seen": 53625024, + "step": 24805 + }, + { + "epoch": 4.047308319738988, + "grad_norm": 0.016428275033831596, + "learning_rate": 0.0009684232834331528, + "loss": 0.1322, + "num_input_tokens_seen": 53636192, + "step": 24810 + }, + { + "epoch": 4.048123980424143, + "grad_norm": 0.23308736085891724, + "learning_rate": 0.000968398384210424, + "loss": 0.0738, + "num_input_tokens_seen": 53645984, + "step": 24815 + }, + { + "epoch": 4.048939641109299, + "grad_norm": 0.03797006979584694, + "learning_rate": 0.0009683734754950078, + "loss": 0.0557, + "num_input_tokens_seen": 53655968, + "step": 24820 + }, + { + "epoch": 4.049755301794454, + "grad_norm": 0.05711853504180908, + "learning_rate": 0.0009683485572874089, + "loss": 0.107, + "num_input_tokens_seen": 53667232, + "step": 24825 + }, + { + "epoch": 4.0505709624796085, + "grad_norm": 0.07612357288599014, + "learning_rate": 0.0009683236295881324, + "loss": 0.1029, + "num_input_tokens_seen": 53678336, + "step": 24830 + }, + { + "epoch": 4.051386623164763, + "grad_norm": 0.09866306930780411, + "learning_rate": 0.0009682986923976834, + "loss": 0.1011, + "num_input_tokens_seen": 53688896, + "step": 24835 + }, + { + "epoch": 4.052202283849918, + "grad_norm": 0.20437058806419373, + "learning_rate": 0.0009682737457165673, + "loss": 0.2586, + "num_input_tokens_seen": 53699008, + "step": 24840 + }, + { + "epoch": 4.053017944535074, + "grad_norm": 0.03260158374905586, + "learning_rate": 0.0009682487895452898, + "loss": 0.0862, + "num_input_tokens_seen": 53709888, + "step": 24845 + }, + { + "epoch": 4.053833605220229, + "grad_norm": 0.016189221292734146, + "learning_rate": 0.0009682238238843565, + "loss": 0.0317, + "num_input_tokens_seen": 53720896, + "step": 24850 + }, + { + "epoch": 4.054649265905383, + "grad_norm": 0.09101825207471848, + "learning_rate": 0.0009681988487342735, + "loss": 0.0554, + "num_input_tokens_seen": 53731360, + "step": 24855 + }, + { + "epoch": 4.055464926590538, + "grad_norm": 0.015849553048610687, + "learning_rate": 0.0009681738640955466, + "loss": 0.1559, + "num_input_tokens_seen": 53742848, + "step": 24860 + }, + { + "epoch": 4.056280587275693, + "grad_norm": 0.21268007159233093, + "learning_rate": 0.0009681488699686827, + "loss": 0.2324, + "num_input_tokens_seen": 53753856, + "step": 24865 + }, + { + "epoch": 4.057096247960848, + "grad_norm": 0.02260478027164936, + "learning_rate": 0.000968123866354188, + "loss": 0.051, + "num_input_tokens_seen": 53762304, + "step": 24870 + }, + { + "epoch": 4.057911908646004, + "grad_norm": 0.056082677096128464, + "learning_rate": 0.0009680988532525693, + "loss": 0.0717, + "num_input_tokens_seen": 53773664, + "step": 24875 + }, + { + "epoch": 4.058727569331158, + "grad_norm": 0.06552215665578842, + "learning_rate": 0.0009680738306643335, + "loss": 0.0703, + "num_input_tokens_seen": 53784576, + "step": 24880 + }, + { + "epoch": 4.059543230016313, + "grad_norm": 0.04463575780391693, + "learning_rate": 0.0009680487985899878, + "loss": 0.153, + "num_input_tokens_seen": 53796096, + "step": 24885 + }, + { + "epoch": 4.060358890701468, + "grad_norm": 0.012880692258477211, + "learning_rate": 0.0009680237570300392, + "loss": 0.0884, + "num_input_tokens_seen": 53807552, + "step": 24890 + }, + { + "epoch": 4.061174551386623, + "grad_norm": 0.0764424279332161, + "learning_rate": 0.0009679987059849956, + "loss": 0.0623, + "num_input_tokens_seen": 53818272, + "step": 24895 + }, + { + "epoch": 4.061990212071779, + "grad_norm": 0.23426131904125214, + "learning_rate": 0.0009679736454553645, + "loss": 0.095, + "num_input_tokens_seen": 53829696, + "step": 24900 + }, + { + "epoch": 4.062805872756933, + "grad_norm": 0.16571703553199768, + "learning_rate": 0.0009679485754416538, + "loss": 0.19, + "num_input_tokens_seen": 53841024, + "step": 24905 + }, + { + "epoch": 4.063621533442088, + "grad_norm": 0.04391804337501526, + "learning_rate": 0.0009679234959443717, + "loss": 0.0576, + "num_input_tokens_seen": 53850816, + "step": 24910 + }, + { + "epoch": 4.064437194127243, + "grad_norm": 0.09299265593290329, + "learning_rate": 0.0009678984069640262, + "loss": 0.1523, + "num_input_tokens_seen": 53861344, + "step": 24915 + }, + { + "epoch": 4.065252854812398, + "grad_norm": 0.13993260264396667, + "learning_rate": 0.000967873308501126, + "loss": 0.1648, + "num_input_tokens_seen": 53872384, + "step": 24920 + }, + { + "epoch": 4.066068515497553, + "grad_norm": 0.01173485815525055, + "learning_rate": 0.0009678482005561795, + "loss": 0.0924, + "num_input_tokens_seen": 53884160, + "step": 24925 + }, + { + "epoch": 4.066884176182708, + "grad_norm": 0.015828793868422508, + "learning_rate": 0.0009678230831296959, + "loss": 0.0988, + "num_input_tokens_seen": 53892320, + "step": 24930 + }, + { + "epoch": 4.067699836867863, + "grad_norm": 0.1745096892118454, + "learning_rate": 0.000967797956222184, + "loss": 0.0591, + "num_input_tokens_seen": 53901920, + "step": 24935 + }, + { + "epoch": 4.068515497553018, + "grad_norm": 0.06484830379486084, + "learning_rate": 0.000967772819834153, + "loss": 0.0235, + "num_input_tokens_seen": 53911968, + "step": 24940 + }, + { + "epoch": 4.069331158238173, + "grad_norm": 0.22511164844036102, + "learning_rate": 0.0009677476739661124, + "loss": 0.1797, + "num_input_tokens_seen": 53923520, + "step": 24945 + }, + { + "epoch": 4.070146818923328, + "grad_norm": 0.013632331043481827, + "learning_rate": 0.0009677225186185719, + "loss": 0.0258, + "num_input_tokens_seen": 53934400, + "step": 24950 + }, + { + "epoch": 4.0709624796084825, + "grad_norm": 0.23946814239025116, + "learning_rate": 0.0009676973537920411, + "loss": 0.2613, + "num_input_tokens_seen": 53945696, + "step": 24955 + }, + { + "epoch": 4.071778140293638, + "grad_norm": 0.20356135070323944, + "learning_rate": 0.0009676721794870302, + "loss": 0.1221, + "num_input_tokens_seen": 53955488, + "step": 24960 + }, + { + "epoch": 4.072593800978793, + "grad_norm": 0.2573193907737732, + "learning_rate": 0.0009676469957040492, + "loss": 0.0804, + "num_input_tokens_seen": 53965792, + "step": 24965 + }, + { + "epoch": 4.073409461663948, + "grad_norm": 0.04237401485443115, + "learning_rate": 0.0009676218024436087, + "loss": 0.1027, + "num_input_tokens_seen": 53976352, + "step": 24970 + }, + { + "epoch": 4.074225122349103, + "grad_norm": 0.03187128156423569, + "learning_rate": 0.0009675965997062192, + "loss": 0.0646, + "num_input_tokens_seen": 53987744, + "step": 24975 + }, + { + "epoch": 4.075040783034257, + "grad_norm": 0.14574794471263885, + "learning_rate": 0.0009675713874923912, + "loss": 0.1111, + "num_input_tokens_seen": 53999136, + "step": 24980 + }, + { + "epoch": 4.075856443719413, + "grad_norm": 0.005596504081040621, + "learning_rate": 0.0009675461658026361, + "loss": 0.0442, + "num_input_tokens_seen": 54010112, + "step": 24985 + }, + { + "epoch": 4.076672104404568, + "grad_norm": 0.04081597551703453, + "learning_rate": 0.0009675209346374647, + "loss": 0.0305, + "num_input_tokens_seen": 54020992, + "step": 24990 + }, + { + "epoch": 4.077487765089723, + "grad_norm": 0.16552519798278809, + "learning_rate": 0.0009674956939973885, + "loss": 0.078, + "num_input_tokens_seen": 54031488, + "step": 24995 + }, + { + "epoch": 4.078303425774878, + "grad_norm": 0.17626450955867767, + "learning_rate": 0.0009674704438829189, + "loss": 0.0988, + "num_input_tokens_seen": 54043104, + "step": 25000 + }, + { + "epoch": 4.079119086460032, + "grad_norm": 0.22518527507781982, + "learning_rate": 0.0009674451842945679, + "loss": 0.1335, + "num_input_tokens_seen": 54052096, + "step": 25005 + }, + { + "epoch": 4.079934747145187, + "grad_norm": 0.06224973499774933, + "learning_rate": 0.0009674199152328472, + "loss": 0.0489, + "num_input_tokens_seen": 54062400, + "step": 25010 + }, + { + "epoch": 4.080750407830343, + "grad_norm": 0.032965682446956635, + "learning_rate": 0.0009673946366982689, + "loss": 0.0575, + "num_input_tokens_seen": 54073120, + "step": 25015 + }, + { + "epoch": 4.081566068515498, + "grad_norm": 0.05587991327047348, + "learning_rate": 0.0009673693486913453, + "loss": 0.036, + "num_input_tokens_seen": 54083296, + "step": 25020 + }, + { + "epoch": 4.082381729200653, + "grad_norm": 0.052622903138399124, + "learning_rate": 0.000967344051212589, + "loss": 0.1853, + "num_input_tokens_seen": 54093728, + "step": 25025 + }, + { + "epoch": 4.083197389885807, + "grad_norm": 0.058794185519218445, + "learning_rate": 0.0009673187442625126, + "loss": 0.0643, + "num_input_tokens_seen": 54104512, + "step": 25030 + }, + { + "epoch": 4.084013050570962, + "grad_norm": 0.4008449912071228, + "learning_rate": 0.0009672934278416292, + "loss": 0.0552, + "num_input_tokens_seen": 54116256, + "step": 25035 + }, + { + "epoch": 4.084828711256117, + "grad_norm": 0.04696516692638397, + "learning_rate": 0.0009672681019504514, + "loss": 0.0188, + "num_input_tokens_seen": 54126496, + "step": 25040 + }, + { + "epoch": 4.085644371941273, + "grad_norm": 0.006844913586974144, + "learning_rate": 0.0009672427665894929, + "loss": 0.0958, + "num_input_tokens_seen": 54138656, + "step": 25045 + }, + { + "epoch": 4.0864600326264275, + "grad_norm": 0.01937456987798214, + "learning_rate": 0.0009672174217592671, + "loss": 0.0123, + "num_input_tokens_seen": 54150336, + "step": 25050 + }, + { + "epoch": 4.087275693311582, + "grad_norm": 0.06921794265508652, + "learning_rate": 0.0009671920674602874, + "loss": 0.0251, + "num_input_tokens_seen": 54161376, + "step": 25055 + }, + { + "epoch": 4.088091353996737, + "grad_norm": 0.3637893795967102, + "learning_rate": 0.0009671667036930678, + "loss": 0.049, + "num_input_tokens_seen": 54171104, + "step": 25060 + }, + { + "epoch": 4.088907014681892, + "grad_norm": 0.07828755676746368, + "learning_rate": 0.0009671413304581224, + "loss": 0.0705, + "num_input_tokens_seen": 54182592, + "step": 25065 + }, + { + "epoch": 4.089722675367048, + "grad_norm": 0.11320872604846954, + "learning_rate": 0.0009671159477559652, + "loss": 0.1043, + "num_input_tokens_seen": 54194048, + "step": 25070 + }, + { + "epoch": 4.0905383360522025, + "grad_norm": 0.06699980795383453, + "learning_rate": 0.0009670905555871108, + "loss": 0.1065, + "num_input_tokens_seen": 54204896, + "step": 25075 + }, + { + "epoch": 4.091353996737357, + "grad_norm": 0.10126443952322006, + "learning_rate": 0.0009670651539520737, + "loss": 0.0786, + "num_input_tokens_seen": 54216288, + "step": 25080 + }, + { + "epoch": 4.092169657422512, + "grad_norm": 0.09108009189367294, + "learning_rate": 0.0009670397428513688, + "loss": 0.0177, + "num_input_tokens_seen": 54226976, + "step": 25085 + }, + { + "epoch": 4.092985318107667, + "grad_norm": 0.008087173104286194, + "learning_rate": 0.000967014322285511, + "loss": 0.1152, + "num_input_tokens_seen": 54236928, + "step": 25090 + }, + { + "epoch": 4.093800978792822, + "grad_norm": 0.24143271148204803, + "learning_rate": 0.0009669888922550154, + "loss": 0.185, + "num_input_tokens_seen": 54247712, + "step": 25095 + }, + { + "epoch": 4.0946166394779775, + "grad_norm": 0.01384643279016018, + "learning_rate": 0.0009669634527603977, + "loss": 0.0329, + "num_input_tokens_seen": 54258144, + "step": 25100 + }, + { + "epoch": 4.095432300163132, + "grad_norm": 0.062448494136333466, + "learning_rate": 0.000966938003802173, + "loss": 0.1463, + "num_input_tokens_seen": 54268448, + "step": 25105 + }, + { + "epoch": 4.096247960848287, + "grad_norm": 0.2051960825920105, + "learning_rate": 0.0009669125453808573, + "loss": 0.1625, + "num_input_tokens_seen": 54280736, + "step": 25110 + }, + { + "epoch": 4.097063621533442, + "grad_norm": 0.2943427562713623, + "learning_rate": 0.0009668870774969668, + "loss": 0.1793, + "num_input_tokens_seen": 54291968, + "step": 25115 + }, + { + "epoch": 4.097879282218597, + "grad_norm": 0.036557041108608246, + "learning_rate": 0.0009668616001510173, + "loss": 0.1396, + "num_input_tokens_seen": 54303712, + "step": 25120 + }, + { + "epoch": 4.0986949429037525, + "grad_norm": 0.20468497276306152, + "learning_rate": 0.0009668361133435252, + "loss": 0.0988, + "num_input_tokens_seen": 54314880, + "step": 25125 + }, + { + "epoch": 4.099510603588907, + "grad_norm": 0.1012115404009819, + "learning_rate": 0.0009668106170750071, + "loss": 0.1233, + "num_input_tokens_seen": 54325376, + "step": 25130 + }, + { + "epoch": 4.100326264274062, + "grad_norm": 0.11558778584003448, + "learning_rate": 0.0009667851113459795, + "loss": 0.0793, + "num_input_tokens_seen": 54335968, + "step": 25135 + }, + { + "epoch": 4.101141924959217, + "grad_norm": 0.23821662366390228, + "learning_rate": 0.0009667595961569595, + "loss": 0.0979, + "num_input_tokens_seen": 54346400, + "step": 25140 + }, + { + "epoch": 4.101957585644372, + "grad_norm": 0.013337861746549606, + "learning_rate": 0.0009667340715084641, + "loss": 0.0484, + "num_input_tokens_seen": 54357280, + "step": 25145 + }, + { + "epoch": 4.102773246329527, + "grad_norm": 0.027568155899643898, + "learning_rate": 0.0009667085374010107, + "loss": 0.203, + "num_input_tokens_seen": 54368416, + "step": 25150 + }, + { + "epoch": 4.103588907014682, + "grad_norm": 0.13918833434581757, + "learning_rate": 0.0009666829938351169, + "loss": 0.1497, + "num_input_tokens_seen": 54378720, + "step": 25155 + }, + { + "epoch": 4.104404567699837, + "grad_norm": 0.005044138990342617, + "learning_rate": 0.0009666574408113, + "loss": 0.0733, + "num_input_tokens_seen": 54389280, + "step": 25160 + }, + { + "epoch": 4.105220228384992, + "grad_norm": 0.007398922927677631, + "learning_rate": 0.0009666318783300782, + "loss": 0.1414, + "num_input_tokens_seen": 54399520, + "step": 25165 + }, + { + "epoch": 4.106035889070147, + "grad_norm": 0.027200058102607727, + "learning_rate": 0.0009666063063919693, + "loss": 0.1109, + "num_input_tokens_seen": 54410016, + "step": 25170 + }, + { + "epoch": 4.1068515497553015, + "grad_norm": 0.0715019479393959, + "learning_rate": 0.0009665807249974917, + "loss": 0.0765, + "num_input_tokens_seen": 54420480, + "step": 25175 + }, + { + "epoch": 4.107667210440456, + "grad_norm": 0.2523505985736847, + "learning_rate": 0.0009665551341471639, + "loss": 0.078, + "num_input_tokens_seen": 54429856, + "step": 25180 + }, + { + "epoch": 4.108482871125612, + "grad_norm": 0.07486454397439957, + "learning_rate": 0.0009665295338415044, + "loss": 0.0845, + "num_input_tokens_seen": 54439392, + "step": 25185 + }, + { + "epoch": 4.109298531810767, + "grad_norm": 0.02025793120265007, + "learning_rate": 0.0009665039240810319, + "loss": 0.1363, + "num_input_tokens_seen": 54451168, + "step": 25190 + }, + { + "epoch": 4.110114192495922, + "grad_norm": 0.2634422183036804, + "learning_rate": 0.0009664783048662658, + "loss": 0.0735, + "num_input_tokens_seen": 54462304, + "step": 25195 + }, + { + "epoch": 4.1109298531810765, + "grad_norm": 0.17543160915374756, + "learning_rate": 0.0009664526761977249, + "loss": 0.0702, + "num_input_tokens_seen": 54473216, + "step": 25200 + }, + { + "epoch": 4.111745513866231, + "grad_norm": 0.04653778672218323, + "learning_rate": 0.0009664270380759289, + "loss": 0.0496, + "num_input_tokens_seen": 54484224, + "step": 25205 + }, + { + "epoch": 4.112561174551387, + "grad_norm": 0.09117867797613144, + "learning_rate": 0.0009664013905013971, + "loss": 0.0245, + "num_input_tokens_seen": 54495904, + "step": 25210 + }, + { + "epoch": 4.113376835236542, + "grad_norm": 0.24651305377483368, + "learning_rate": 0.0009663757334746497, + "loss": 0.2096, + "num_input_tokens_seen": 54506240, + "step": 25215 + }, + { + "epoch": 4.114192495921697, + "grad_norm": 0.017522353678941727, + "learning_rate": 0.0009663500669962063, + "loss": 0.016, + "num_input_tokens_seen": 54517632, + "step": 25220 + }, + { + "epoch": 4.1150081566068515, + "grad_norm": 0.021647047251462936, + "learning_rate": 0.0009663243910665872, + "loss": 0.056, + "num_input_tokens_seen": 54527040, + "step": 25225 + }, + { + "epoch": 4.115823817292006, + "grad_norm": 0.008239752613008022, + "learning_rate": 0.0009662987056863128, + "loss": 0.1381, + "num_input_tokens_seen": 54537728, + "step": 25230 + }, + { + "epoch": 4.116639477977161, + "grad_norm": 0.27086710929870605, + "learning_rate": 0.0009662730108559034, + "loss": 0.0667, + "num_input_tokens_seen": 54547968, + "step": 25235 + }, + { + "epoch": 4.117455138662317, + "grad_norm": 0.1596733182668686, + "learning_rate": 0.0009662473065758801, + "loss": 0.1615, + "num_input_tokens_seen": 54558176, + "step": 25240 + }, + { + "epoch": 4.118270799347472, + "grad_norm": 0.21179279685020447, + "learning_rate": 0.0009662215928467636, + "loss": 0.0515, + "num_input_tokens_seen": 54568928, + "step": 25245 + }, + { + "epoch": 4.1190864600326265, + "grad_norm": 0.018240327015519142, + "learning_rate": 0.000966195869669075, + "loss": 0.0339, + "num_input_tokens_seen": 54579232, + "step": 25250 + }, + { + "epoch": 4.119902120717781, + "grad_norm": 0.2262805551290512, + "learning_rate": 0.0009661701370433358, + "loss": 0.0826, + "num_input_tokens_seen": 54589600, + "step": 25255 + }, + { + "epoch": 4.120717781402936, + "grad_norm": 0.008236533030867577, + "learning_rate": 0.0009661443949700674, + "loss": 0.0907, + "num_input_tokens_seen": 54600832, + "step": 25260 + }, + { + "epoch": 4.121533442088092, + "grad_norm": 0.2041776180267334, + "learning_rate": 0.0009661186434497915, + "loss": 0.0405, + "num_input_tokens_seen": 54612416, + "step": 25265 + }, + { + "epoch": 4.122349102773247, + "grad_norm": 0.11392532289028168, + "learning_rate": 0.0009660928824830299, + "loss": 0.1556, + "num_input_tokens_seen": 54623328, + "step": 25270 + }, + { + "epoch": 4.123164763458401, + "grad_norm": 0.023981567472219467, + "learning_rate": 0.0009660671120703048, + "loss": 0.0831, + "num_input_tokens_seen": 54634752, + "step": 25275 + }, + { + "epoch": 4.123980424143556, + "grad_norm": 0.030910201370716095, + "learning_rate": 0.0009660413322121384, + "loss": 0.0507, + "num_input_tokens_seen": 54644288, + "step": 25280 + }, + { + "epoch": 4.124796084828711, + "grad_norm": 0.0698685273528099, + "learning_rate": 0.0009660155429090531, + "loss": 0.0225, + "num_input_tokens_seen": 54655232, + "step": 25285 + }, + { + "epoch": 4.125611745513866, + "grad_norm": 0.24555733799934387, + "learning_rate": 0.0009659897441615717, + "loss": 0.1784, + "num_input_tokens_seen": 54666336, + "step": 25290 + }, + { + "epoch": 4.126427406199022, + "grad_norm": 0.003053902881219983, + "learning_rate": 0.000965963935970217, + "loss": 0.0313, + "num_input_tokens_seen": 54676448, + "step": 25295 + }, + { + "epoch": 4.127243066884176, + "grad_norm": 0.4340955317020416, + "learning_rate": 0.0009659381183355121, + "loss": 0.2067, + "num_input_tokens_seen": 54688032, + "step": 25300 + }, + { + "epoch": 4.128058727569331, + "grad_norm": 0.008606837131083012, + "learning_rate": 0.0009659122912579801, + "loss": 0.1159, + "num_input_tokens_seen": 54698528, + "step": 25305 + }, + { + "epoch": 4.128874388254486, + "grad_norm": 0.18576228618621826, + "learning_rate": 0.0009658864547381445, + "loss": 0.0435, + "num_input_tokens_seen": 54709440, + "step": 25310 + }, + { + "epoch": 4.129690048939641, + "grad_norm": 0.03524043411016464, + "learning_rate": 0.0009658606087765288, + "loss": 0.0268, + "num_input_tokens_seen": 54719424, + "step": 25315 + }, + { + "epoch": 4.130505709624796, + "grad_norm": 0.03319951519370079, + "learning_rate": 0.0009658347533736569, + "loss": 0.0383, + "num_input_tokens_seen": 54729728, + "step": 25320 + }, + { + "epoch": 4.131321370309951, + "grad_norm": 0.2569403052330017, + "learning_rate": 0.0009658088885300528, + "loss": 0.1364, + "num_input_tokens_seen": 54740608, + "step": 25325 + }, + { + "epoch": 4.132137030995106, + "grad_norm": 0.2813706696033478, + "learning_rate": 0.0009657830142462406, + "loss": 0.4054, + "num_input_tokens_seen": 54749856, + "step": 25330 + }, + { + "epoch": 4.132952691680261, + "grad_norm": 0.016006600111722946, + "learning_rate": 0.0009657571305227449, + "loss": 0.0408, + "num_input_tokens_seen": 54760864, + "step": 25335 + }, + { + "epoch": 4.133768352365416, + "grad_norm": 0.28111621737480164, + "learning_rate": 0.0009657312373600899, + "loss": 0.1102, + "num_input_tokens_seen": 54772832, + "step": 25340 + }, + { + "epoch": 4.134584013050571, + "grad_norm": 0.05557915195822716, + "learning_rate": 0.0009657053347588005, + "loss": 0.114, + "num_input_tokens_seen": 54783616, + "step": 25345 + }, + { + "epoch": 4.135399673735726, + "grad_norm": 0.09768980741500854, + "learning_rate": 0.0009656794227194019, + "loss": 0.0851, + "num_input_tokens_seen": 54793888, + "step": 25350 + }, + { + "epoch": 4.136215334420881, + "grad_norm": 0.06314717233181, + "learning_rate": 0.0009656535012424189, + "loss": 0.1181, + "num_input_tokens_seen": 54805600, + "step": 25355 + }, + { + "epoch": 4.137030995106036, + "grad_norm": 0.1482965648174286, + "learning_rate": 0.000965627570328377, + "loss": 0.1069, + "num_input_tokens_seen": 54817120, + "step": 25360 + }, + { + "epoch": 4.137846655791191, + "grad_norm": 0.03432433307170868, + "learning_rate": 0.0009656016299778017, + "loss": 0.1857, + "num_input_tokens_seen": 54829344, + "step": 25365 + }, + { + "epoch": 4.138662316476346, + "grad_norm": 0.08786375820636749, + "learning_rate": 0.0009655756801912188, + "loss": 0.1095, + "num_input_tokens_seen": 54838976, + "step": 25370 + }, + { + "epoch": 4.1394779771615005, + "grad_norm": 0.20477358996868134, + "learning_rate": 0.000965549720969154, + "loss": 0.0533, + "num_input_tokens_seen": 54849408, + "step": 25375 + }, + { + "epoch": 4.140293637846656, + "grad_norm": 0.24416247010231018, + "learning_rate": 0.0009655237523121336, + "loss": 0.1572, + "num_input_tokens_seen": 54860032, + "step": 25380 + }, + { + "epoch": 4.141109298531811, + "grad_norm": 0.08682450652122498, + "learning_rate": 0.0009654977742206837, + "loss": 0.089, + "num_input_tokens_seen": 54871168, + "step": 25385 + }, + { + "epoch": 4.141924959216966, + "grad_norm": 0.15808038413524628, + "learning_rate": 0.000965471786695331, + "loss": 0.0574, + "num_input_tokens_seen": 54880800, + "step": 25390 + }, + { + "epoch": 4.142740619902121, + "grad_norm": 0.3128909766674042, + "learning_rate": 0.0009654457897366021, + "loss": 0.2258, + "num_input_tokens_seen": 54890464, + "step": 25395 + }, + { + "epoch": 4.143556280587275, + "grad_norm": 0.021523917093873024, + "learning_rate": 0.0009654197833450235, + "loss": 0.1372, + "num_input_tokens_seen": 54901344, + "step": 25400 + }, + { + "epoch": 4.14437194127243, + "grad_norm": 0.17342549562454224, + "learning_rate": 0.0009653937675211229, + "loss": 0.1227, + "num_input_tokens_seen": 54912800, + "step": 25405 + }, + { + "epoch": 4.145187601957586, + "grad_norm": 0.037385955452919006, + "learning_rate": 0.000965367742265427, + "loss": 0.0842, + "num_input_tokens_seen": 54923872, + "step": 25410 + }, + { + "epoch": 4.146003262642741, + "grad_norm": 0.017450451850891113, + "learning_rate": 0.0009653417075784635, + "loss": 0.0476, + "num_input_tokens_seen": 54934112, + "step": 25415 + }, + { + "epoch": 4.146818923327896, + "grad_norm": 0.010421866551041603, + "learning_rate": 0.0009653156634607601, + "loss": 0.0577, + "num_input_tokens_seen": 54944064, + "step": 25420 + }, + { + "epoch": 4.14763458401305, + "grad_norm": 0.022267503663897514, + "learning_rate": 0.0009652896099128443, + "loss": 0.0636, + "num_input_tokens_seen": 54953920, + "step": 25425 + }, + { + "epoch": 4.148450244698205, + "grad_norm": 0.024997280910611153, + "learning_rate": 0.0009652635469352443, + "loss": 0.1155, + "num_input_tokens_seen": 54963968, + "step": 25430 + }, + { + "epoch": 4.149265905383361, + "grad_norm": 0.050787921994924545, + "learning_rate": 0.0009652374745284884, + "loss": 0.0298, + "num_input_tokens_seen": 54975008, + "step": 25435 + }, + { + "epoch": 4.150081566068516, + "grad_norm": 0.00542342197149992, + "learning_rate": 0.0009652113926931048, + "loss": 0.0607, + "num_input_tokens_seen": 54986336, + "step": 25440 + }, + { + "epoch": 4.150897226753671, + "grad_norm": 0.006053715944290161, + "learning_rate": 0.0009651853014296223, + "loss": 0.0287, + "num_input_tokens_seen": 54996384, + "step": 25445 + }, + { + "epoch": 4.151712887438825, + "grad_norm": 0.09369000047445297, + "learning_rate": 0.0009651592007385694, + "loss": 0.1321, + "num_input_tokens_seen": 55006688, + "step": 25450 + }, + { + "epoch": 4.15252854812398, + "grad_norm": 0.0828031525015831, + "learning_rate": 0.0009651330906204752, + "loss": 0.1098, + "num_input_tokens_seen": 55018496, + "step": 25455 + }, + { + "epoch": 4.153344208809135, + "grad_norm": 0.015311665832996368, + "learning_rate": 0.0009651069710758689, + "loss": 0.0383, + "num_input_tokens_seen": 55028416, + "step": 25460 + }, + { + "epoch": 4.154159869494291, + "grad_norm": 0.022664785385131836, + "learning_rate": 0.0009650808421052798, + "loss": 0.1705, + "num_input_tokens_seen": 55039232, + "step": 25465 + }, + { + "epoch": 4.1549755301794455, + "grad_norm": 0.20808342099189758, + "learning_rate": 0.0009650547037092374, + "loss": 0.1587, + "num_input_tokens_seen": 55051168, + "step": 25470 + }, + { + "epoch": 4.1557911908646, + "grad_norm": 0.016271889209747314, + "learning_rate": 0.0009650285558882715, + "loss": 0.2009, + "num_input_tokens_seen": 55061216, + "step": 25475 + }, + { + "epoch": 4.156606851549755, + "grad_norm": 0.2235141098499298, + "learning_rate": 0.0009650023986429119, + "loss": 0.073, + "num_input_tokens_seen": 55071968, + "step": 25480 + }, + { + "epoch": 4.15742251223491, + "grad_norm": 0.11861073225736618, + "learning_rate": 0.000964976231973689, + "loss": 0.0904, + "num_input_tokens_seen": 55081824, + "step": 25485 + }, + { + "epoch": 4.158238172920065, + "grad_norm": 0.025952542200684547, + "learning_rate": 0.0009649500558811328, + "loss": 0.0308, + "num_input_tokens_seen": 55094208, + "step": 25490 + }, + { + "epoch": 4.1590538336052205, + "grad_norm": 0.03323453292250633, + "learning_rate": 0.0009649238703657739, + "loss": 0.2475, + "num_input_tokens_seen": 55104608, + "step": 25495 + }, + { + "epoch": 4.159869494290375, + "grad_norm": 0.18540321290493011, + "learning_rate": 0.0009648976754281429, + "loss": 0.0877, + "num_input_tokens_seen": 55115168, + "step": 25500 + }, + { + "epoch": 4.16068515497553, + "grad_norm": 0.025260310620069504, + "learning_rate": 0.0009648714710687708, + "loss": 0.328, + "num_input_tokens_seen": 55125920, + "step": 25505 + }, + { + "epoch": 4.161500815660685, + "grad_norm": 0.13722355663776398, + "learning_rate": 0.0009648452572881885, + "loss": 0.2166, + "num_input_tokens_seen": 55135936, + "step": 25510 + }, + { + "epoch": 4.16231647634584, + "grad_norm": 0.138593390583992, + "learning_rate": 0.0009648190340869274, + "loss": 0.0896, + "num_input_tokens_seen": 55147136, + "step": 25515 + }, + { + "epoch": 4.1631321370309955, + "grad_norm": 0.05859887972474098, + "learning_rate": 0.000964792801465519, + "loss": 0.1762, + "num_input_tokens_seen": 55158432, + "step": 25520 + }, + { + "epoch": 4.16394779771615, + "grad_norm": 0.12412280589342117, + "learning_rate": 0.0009647665594244947, + "loss": 0.1473, + "num_input_tokens_seen": 55169728, + "step": 25525 + }, + { + "epoch": 4.164763458401305, + "grad_norm": 0.1191917136311531, + "learning_rate": 0.0009647403079643866, + "loss": 0.0689, + "num_input_tokens_seen": 55180640, + "step": 25530 + }, + { + "epoch": 4.16557911908646, + "grad_norm": 0.06216058135032654, + "learning_rate": 0.0009647140470857267, + "loss": 0.0957, + "num_input_tokens_seen": 55191616, + "step": 25535 + }, + { + "epoch": 4.166394779771615, + "grad_norm": 0.14134946465492249, + "learning_rate": 0.0009646877767890469, + "loss": 0.1614, + "num_input_tokens_seen": 55203392, + "step": 25540 + }, + { + "epoch": 4.16721044045677, + "grad_norm": 0.06047775596380234, + "learning_rate": 0.00096466149707488, + "loss": 0.0884, + "num_input_tokens_seen": 55214912, + "step": 25545 + }, + { + "epoch": 4.168026101141925, + "grad_norm": 0.06157161295413971, + "learning_rate": 0.0009646352079437582, + "loss": 0.0517, + "num_input_tokens_seen": 55225248, + "step": 25550 + }, + { + "epoch": 4.16884176182708, + "grad_norm": 0.018876446411013603, + "learning_rate": 0.0009646089093962145, + "loss": 0.0912, + "num_input_tokens_seen": 55234688, + "step": 25555 + }, + { + "epoch": 4.169657422512235, + "grad_norm": 0.07278304547071457, + "learning_rate": 0.0009645826014327819, + "loss": 0.1574, + "num_input_tokens_seen": 55245760, + "step": 25560 + }, + { + "epoch": 4.17047308319739, + "grad_norm": 0.21307507157325745, + "learning_rate": 0.0009645562840539935, + "loss": 0.0803, + "num_input_tokens_seen": 55256416, + "step": 25565 + }, + { + "epoch": 4.171288743882545, + "grad_norm": 0.03809243068099022, + "learning_rate": 0.0009645299572603827, + "loss": 0.0958, + "num_input_tokens_seen": 55267392, + "step": 25570 + }, + { + "epoch": 4.1721044045677, + "grad_norm": 0.0994822233915329, + "learning_rate": 0.000964503621052483, + "loss": 0.2607, + "num_input_tokens_seen": 55278304, + "step": 25575 + }, + { + "epoch": 4.172920065252855, + "grad_norm": 0.3972223103046417, + "learning_rate": 0.0009644772754308281, + "loss": 0.2883, + "num_input_tokens_seen": 55288992, + "step": 25580 + }, + { + "epoch": 4.17373572593801, + "grad_norm": 0.08899694681167603, + "learning_rate": 0.0009644509203959522, + "loss": 0.0906, + "num_input_tokens_seen": 55299232, + "step": 25585 + }, + { + "epoch": 4.174551386623165, + "grad_norm": 0.11821216344833374, + "learning_rate": 0.0009644245559483891, + "loss": 0.0933, + "num_input_tokens_seen": 55309600, + "step": 25590 + }, + { + "epoch": 4.1753670473083195, + "grad_norm": 0.014808449894189835, + "learning_rate": 0.0009643981820886731, + "loss": 0.1414, + "num_input_tokens_seen": 55320032, + "step": 25595 + }, + { + "epoch": 4.176182707993474, + "grad_norm": 0.032600466161966324, + "learning_rate": 0.0009643717988173389, + "loss": 0.1385, + "num_input_tokens_seen": 55327904, + "step": 25600 + }, + { + "epoch": 4.17699836867863, + "grad_norm": 0.1723560392856598, + "learning_rate": 0.0009643454061349211, + "loss": 0.2089, + "num_input_tokens_seen": 55337824, + "step": 25605 + }, + { + "epoch": 4.177814029363785, + "grad_norm": 0.012177304364740849, + "learning_rate": 0.0009643190040419545, + "loss": 0.0925, + "num_input_tokens_seen": 55349536, + "step": 25610 + }, + { + "epoch": 4.17862969004894, + "grad_norm": 0.05429260432720184, + "learning_rate": 0.0009642925925389743, + "loss": 0.2556, + "num_input_tokens_seen": 55360672, + "step": 25615 + }, + { + "epoch": 4.1794453507340945, + "grad_norm": 0.05302607640624046, + "learning_rate": 0.0009642661716265156, + "loss": 0.0524, + "num_input_tokens_seen": 55370656, + "step": 25620 + }, + { + "epoch": 4.180261011419249, + "grad_norm": 0.07791854441165924, + "learning_rate": 0.0009642397413051142, + "loss": 0.1616, + "num_input_tokens_seen": 55380704, + "step": 25625 + }, + { + "epoch": 4.181076672104404, + "grad_norm": 0.03771020844578743, + "learning_rate": 0.0009642133015753054, + "loss": 0.121, + "num_input_tokens_seen": 55393024, + "step": 25630 + }, + { + "epoch": 4.18189233278956, + "grad_norm": 0.06244561821222305, + "learning_rate": 0.0009641868524376252, + "loss": 0.078, + "num_input_tokens_seen": 55404736, + "step": 25635 + }, + { + "epoch": 4.182707993474715, + "grad_norm": 0.15027554333209991, + "learning_rate": 0.0009641603938926093, + "loss": 0.067, + "num_input_tokens_seen": 55415072, + "step": 25640 + }, + { + "epoch": 4.1835236541598695, + "grad_norm": 0.03861184045672417, + "learning_rate": 0.0009641339259407946, + "loss": 0.0421, + "num_input_tokens_seen": 55425888, + "step": 25645 + }, + { + "epoch": 4.184339314845024, + "grad_norm": 0.015195484273135662, + "learning_rate": 0.0009641074485827168, + "loss": 0.0822, + "num_input_tokens_seen": 55436224, + "step": 25650 + }, + { + "epoch": 4.185154975530179, + "grad_norm": 0.08453787118196487, + "learning_rate": 0.0009640809618189129, + "loss": 0.16, + "num_input_tokens_seen": 55447392, + "step": 25655 + }, + { + "epoch": 4.185970636215335, + "grad_norm": 0.16250330209732056, + "learning_rate": 0.0009640544656499197, + "loss": 0.1534, + "num_input_tokens_seen": 55457472, + "step": 25660 + }, + { + "epoch": 4.18678629690049, + "grad_norm": 0.0841694101691246, + "learning_rate": 0.0009640279600762738, + "loss": 0.1424, + "num_input_tokens_seen": 55468448, + "step": 25665 + }, + { + "epoch": 4.1876019575856445, + "grad_norm": 0.020344849675893784, + "learning_rate": 0.0009640014450985129, + "loss": 0.0319, + "num_input_tokens_seen": 55478016, + "step": 25670 + }, + { + "epoch": 4.188417618270799, + "grad_norm": 0.062538743019104, + "learning_rate": 0.0009639749207171739, + "loss": 0.0769, + "num_input_tokens_seen": 55488672, + "step": 25675 + }, + { + "epoch": 4.189233278955954, + "grad_norm": 0.04574638605117798, + "learning_rate": 0.0009639483869327946, + "loss": 0.0249, + "num_input_tokens_seen": 55499232, + "step": 25680 + }, + { + "epoch": 4.190048939641109, + "grad_norm": 0.03264615312218666, + "learning_rate": 0.0009639218437459125, + "loss": 0.0221, + "num_input_tokens_seen": 55509120, + "step": 25685 + }, + { + "epoch": 4.190864600326265, + "grad_norm": 0.1373731940984726, + "learning_rate": 0.000963895291157066, + "loss": 0.2198, + "num_input_tokens_seen": 55520544, + "step": 25690 + }, + { + "epoch": 4.191680261011419, + "grad_norm": 0.008453291840851307, + "learning_rate": 0.0009638687291667927, + "loss": 0.0718, + "num_input_tokens_seen": 55531136, + "step": 25695 + }, + { + "epoch": 4.192495921696574, + "grad_norm": 0.17602114379405975, + "learning_rate": 0.0009638421577756313, + "loss": 0.0732, + "num_input_tokens_seen": 55541440, + "step": 25700 + }, + { + "epoch": 4.193311582381729, + "grad_norm": 0.25807738304138184, + "learning_rate": 0.0009638155769841201, + "loss": 0.1683, + "num_input_tokens_seen": 55552992, + "step": 25705 + }, + { + "epoch": 4.194127243066884, + "grad_norm": 0.10710060596466064, + "learning_rate": 0.0009637889867927978, + "loss": 0.1119, + "num_input_tokens_seen": 55564000, + "step": 25710 + }, + { + "epoch": 4.19494290375204, + "grad_norm": 0.013491766527295113, + "learning_rate": 0.0009637623872022034, + "loss": 0.0564, + "num_input_tokens_seen": 55574880, + "step": 25715 + }, + { + "epoch": 4.195758564437194, + "grad_norm": 0.1762889325618744, + "learning_rate": 0.0009637357782128758, + "loss": 0.1068, + "num_input_tokens_seen": 55585920, + "step": 25720 + }, + { + "epoch": 4.196574225122349, + "grad_norm": 0.014259211719036102, + "learning_rate": 0.0009637091598253544, + "loss": 0.1228, + "num_input_tokens_seen": 55595456, + "step": 25725 + }, + { + "epoch": 4.197389885807504, + "grad_norm": 0.030640989542007446, + "learning_rate": 0.0009636825320401787, + "loss": 0.1722, + "num_input_tokens_seen": 55606592, + "step": 25730 + }, + { + "epoch": 4.198205546492659, + "grad_norm": 0.03938770666718483, + "learning_rate": 0.0009636558948578882, + "loss": 0.0333, + "num_input_tokens_seen": 55617184, + "step": 25735 + }, + { + "epoch": 4.199021207177814, + "grad_norm": 0.09662085026502609, + "learning_rate": 0.0009636292482790229, + "loss": 0.0808, + "num_input_tokens_seen": 55627584, + "step": 25740 + }, + { + "epoch": 4.199836867862969, + "grad_norm": 0.09255876392126083, + "learning_rate": 0.0009636025923041227, + "loss": 0.1373, + "num_input_tokens_seen": 55638336, + "step": 25745 + }, + { + "epoch": 4.200652528548124, + "grad_norm": 0.21851196885108948, + "learning_rate": 0.0009635759269337276, + "loss": 0.1107, + "num_input_tokens_seen": 55648480, + "step": 25750 + }, + { + "epoch": 4.201468189233279, + "grad_norm": 0.22205887734889984, + "learning_rate": 0.0009635492521683785, + "loss": 0.0723, + "num_input_tokens_seen": 55658368, + "step": 25755 + }, + { + "epoch": 4.202283849918434, + "grad_norm": 0.00717106182128191, + "learning_rate": 0.0009635225680086157, + "loss": 0.1347, + "num_input_tokens_seen": 55669344, + "step": 25760 + }, + { + "epoch": 4.203099510603589, + "grad_norm": 0.06024621054530144, + "learning_rate": 0.00096349587445498, + "loss": 0.1066, + "num_input_tokens_seen": 55680832, + "step": 25765 + }, + { + "epoch": 4.2039151712887435, + "grad_norm": 0.00822582095861435, + "learning_rate": 0.0009634691715080124, + "loss": 0.0453, + "num_input_tokens_seen": 55692416, + "step": 25770 + }, + { + "epoch": 4.204730831973899, + "grad_norm": 0.033295344561338425, + "learning_rate": 0.0009634424591682542, + "loss": 0.2381, + "num_input_tokens_seen": 55702752, + "step": 25775 + }, + { + "epoch": 4.205546492659054, + "grad_norm": 0.00885600782930851, + "learning_rate": 0.0009634157374362466, + "loss": 0.1732, + "num_input_tokens_seen": 55715680, + "step": 25780 + }, + { + "epoch": 4.206362153344209, + "grad_norm": 0.037626128643751144, + "learning_rate": 0.0009633890063125313, + "loss": 0.1091, + "num_input_tokens_seen": 55727008, + "step": 25785 + }, + { + "epoch": 4.207177814029364, + "grad_norm": 0.01951495371758938, + "learning_rate": 0.0009633622657976498, + "loss": 0.0224, + "num_input_tokens_seen": 55738272, + "step": 25790 + }, + { + "epoch": 4.2079934747145185, + "grad_norm": 0.01373792253434658, + "learning_rate": 0.0009633355158921441, + "loss": 0.0835, + "num_input_tokens_seen": 55748416, + "step": 25795 + }, + { + "epoch": 4.208809135399674, + "grad_norm": 0.09148267656564713, + "learning_rate": 0.0009633087565965564, + "loss": 0.0556, + "num_input_tokens_seen": 55758336, + "step": 25800 + }, + { + "epoch": 4.209624796084829, + "grad_norm": 0.10780200362205505, + "learning_rate": 0.0009632819879114291, + "loss": 0.0759, + "num_input_tokens_seen": 55768032, + "step": 25805 + }, + { + "epoch": 4.210440456769984, + "grad_norm": 0.19272269308567047, + "learning_rate": 0.0009632552098373045, + "loss": 0.0823, + "num_input_tokens_seen": 55777728, + "step": 25810 + }, + { + "epoch": 4.211256117455139, + "grad_norm": 0.0974721908569336, + "learning_rate": 0.0009632284223747255, + "loss": 0.1946, + "num_input_tokens_seen": 55787808, + "step": 25815 + }, + { + "epoch": 4.212071778140293, + "grad_norm": 0.01590495929121971, + "learning_rate": 0.0009632016255242348, + "loss": 0.1211, + "num_input_tokens_seen": 55798528, + "step": 25820 + }, + { + "epoch": 4.212887438825448, + "grad_norm": 0.0683855265378952, + "learning_rate": 0.0009631748192863756, + "loss": 0.0979, + "num_input_tokens_seen": 55807552, + "step": 25825 + }, + { + "epoch": 4.213703099510604, + "grad_norm": 0.030019963160157204, + "learning_rate": 0.0009631480036616911, + "loss": 0.1347, + "num_input_tokens_seen": 55819296, + "step": 25830 + }, + { + "epoch": 4.214518760195759, + "grad_norm": 0.16835179924964905, + "learning_rate": 0.0009631211786507248, + "loss": 0.3311, + "num_input_tokens_seen": 55830048, + "step": 25835 + }, + { + "epoch": 4.215334420880914, + "grad_norm": 0.2539372146129608, + "learning_rate": 0.0009630943442540202, + "loss": 0.1926, + "num_input_tokens_seen": 55841408, + "step": 25840 + }, + { + "epoch": 4.216150081566068, + "grad_norm": 0.023150641471147537, + "learning_rate": 0.0009630675004721212, + "loss": 0.0445, + "num_input_tokens_seen": 55852160, + "step": 25845 + }, + { + "epoch": 4.216965742251223, + "grad_norm": 0.18378613889217377, + "learning_rate": 0.000963040647305572, + "loss": 0.0621, + "num_input_tokens_seen": 55861984, + "step": 25850 + }, + { + "epoch": 4.217781402936378, + "grad_norm": 0.0326051265001297, + "learning_rate": 0.0009630137847549166, + "loss": 0.0805, + "num_input_tokens_seen": 55872800, + "step": 25855 + }, + { + "epoch": 4.218597063621534, + "grad_norm": 0.030857285484671593, + "learning_rate": 0.0009629869128206997, + "loss": 0.0333, + "num_input_tokens_seen": 55884512, + "step": 25860 + }, + { + "epoch": 4.219412724306689, + "grad_norm": 0.07170385867357254, + "learning_rate": 0.0009629600315034652, + "loss": 0.0389, + "num_input_tokens_seen": 55897024, + "step": 25865 + }, + { + "epoch": 4.220228384991843, + "grad_norm": 0.020618334412574768, + "learning_rate": 0.0009629331408037588, + "loss": 0.1663, + "num_input_tokens_seen": 55907520, + "step": 25870 + }, + { + "epoch": 4.221044045676998, + "grad_norm": 0.10644153505563736, + "learning_rate": 0.0009629062407221248, + "loss": 0.0783, + "num_input_tokens_seen": 55917280, + "step": 25875 + }, + { + "epoch": 4.221859706362153, + "grad_norm": 0.04173273965716362, + "learning_rate": 0.0009628793312591086, + "loss": 0.0846, + "num_input_tokens_seen": 55928832, + "step": 25880 + }, + { + "epoch": 4.222675367047309, + "grad_norm": 0.06491757184267044, + "learning_rate": 0.0009628524124152555, + "loss": 0.1031, + "num_input_tokens_seen": 55939904, + "step": 25885 + }, + { + "epoch": 4.2234910277324635, + "grad_norm": 0.009601933881640434, + "learning_rate": 0.0009628254841911113, + "loss": 0.0698, + "num_input_tokens_seen": 55951968, + "step": 25890 + }, + { + "epoch": 4.224306688417618, + "grad_norm": 0.02336297743022442, + "learning_rate": 0.0009627985465872214, + "loss": 0.0163, + "num_input_tokens_seen": 55962240, + "step": 25895 + }, + { + "epoch": 4.225122349102773, + "grad_norm": 0.0032236254774034023, + "learning_rate": 0.0009627715996041319, + "loss": 0.06, + "num_input_tokens_seen": 55973216, + "step": 25900 + }, + { + "epoch": 4.225938009787928, + "grad_norm": 0.006209230981767178, + "learning_rate": 0.0009627446432423888, + "loss": 0.0677, + "num_input_tokens_seen": 55984544, + "step": 25905 + }, + { + "epoch": 4.226753670473083, + "grad_norm": 0.010295428335666656, + "learning_rate": 0.0009627176775025385, + "loss": 0.1451, + "num_input_tokens_seen": 55995616, + "step": 25910 + }, + { + "epoch": 4.2275693311582385, + "grad_norm": 0.007961004041135311, + "learning_rate": 0.0009626907023851275, + "loss": 0.0774, + "num_input_tokens_seen": 56006752, + "step": 25915 + }, + { + "epoch": 4.228384991843393, + "grad_norm": 0.16366617381572723, + "learning_rate": 0.0009626637178907024, + "loss": 0.1335, + "num_input_tokens_seen": 56017280, + "step": 25920 + }, + { + "epoch": 4.229200652528548, + "grad_norm": 0.024116847664117813, + "learning_rate": 0.0009626367240198101, + "loss": 0.1689, + "num_input_tokens_seen": 56027904, + "step": 25925 + }, + { + "epoch": 4.230016313213703, + "grad_norm": 0.11132140457630157, + "learning_rate": 0.0009626097207729978, + "loss": 0.1548, + "num_input_tokens_seen": 56038752, + "step": 25930 + }, + { + "epoch": 4.230831973898858, + "grad_norm": 0.016673635691404343, + "learning_rate": 0.0009625827081508125, + "loss": 0.0446, + "num_input_tokens_seen": 56048832, + "step": 25935 + }, + { + "epoch": 4.231647634584013, + "grad_norm": 0.06682579219341278, + "learning_rate": 0.000962555686153802, + "loss": 0.0731, + "num_input_tokens_seen": 56060800, + "step": 25940 + }, + { + "epoch": 4.232463295269168, + "grad_norm": 0.017317702993750572, + "learning_rate": 0.0009625286547825136, + "loss": 0.0203, + "num_input_tokens_seen": 56071808, + "step": 25945 + }, + { + "epoch": 4.233278955954323, + "grad_norm": 0.012221962213516235, + "learning_rate": 0.0009625016140374952, + "loss": 0.0339, + "num_input_tokens_seen": 56082336, + "step": 25950 + }, + { + "epoch": 4.234094616639478, + "grad_norm": 0.07057840377092361, + "learning_rate": 0.0009624745639192949, + "loss": 0.0544, + "num_input_tokens_seen": 56093120, + "step": 25955 + }, + { + "epoch": 4.234910277324633, + "grad_norm": 0.011993489228188992, + "learning_rate": 0.0009624475044284609, + "loss": 0.1421, + "num_input_tokens_seen": 56104320, + "step": 25960 + }, + { + "epoch": 4.235725938009788, + "grad_norm": 0.24318675696849823, + "learning_rate": 0.0009624204355655416, + "loss": 0.0602, + "num_input_tokens_seen": 56115136, + "step": 25965 + }, + { + "epoch": 4.236541598694943, + "grad_norm": 0.21818304061889648, + "learning_rate": 0.0009623933573310855, + "loss": 0.2945, + "num_input_tokens_seen": 56125696, + "step": 25970 + }, + { + "epoch": 4.237357259380098, + "grad_norm": 0.1993371695280075, + "learning_rate": 0.0009623662697256414, + "loss": 0.3022, + "num_input_tokens_seen": 56136832, + "step": 25975 + }, + { + "epoch": 4.238172920065253, + "grad_norm": 0.29056215286254883, + "learning_rate": 0.0009623391727497584, + "loss": 0.1117, + "num_input_tokens_seen": 56147296, + "step": 25980 + }, + { + "epoch": 4.238988580750408, + "grad_norm": 0.04809493198990822, + "learning_rate": 0.0009623120664039855, + "loss": 0.0876, + "num_input_tokens_seen": 56156896, + "step": 25985 + }, + { + "epoch": 4.239804241435563, + "grad_norm": 0.01413186639547348, + "learning_rate": 0.000962284950688872, + "loss": 0.1441, + "num_input_tokens_seen": 56166720, + "step": 25990 + }, + { + "epoch": 4.240619902120717, + "grad_norm": 0.1408049762248993, + "learning_rate": 0.0009622578256049675, + "loss": 0.1734, + "num_input_tokens_seen": 56177888, + "step": 25995 + }, + { + "epoch": 4.241435562805873, + "grad_norm": 0.09817873686552048, + "learning_rate": 0.0009622306911528219, + "loss": 0.0329, + "num_input_tokens_seen": 56188160, + "step": 26000 + }, + { + "epoch": 4.242251223491028, + "grad_norm": 0.02216365560889244, + "learning_rate": 0.0009622035473329848, + "loss": 0.1569, + "num_input_tokens_seen": 56198688, + "step": 26005 + }, + { + "epoch": 4.243066884176183, + "grad_norm": 0.08225827664136887, + "learning_rate": 0.0009621763941460067, + "loss": 0.0653, + "num_input_tokens_seen": 56210144, + "step": 26010 + }, + { + "epoch": 4.2438825448613375, + "grad_norm": 0.035414330661296844, + "learning_rate": 0.0009621492315924375, + "loss": 0.1642, + "num_input_tokens_seen": 56220928, + "step": 26015 + }, + { + "epoch": 4.244698205546492, + "grad_norm": 0.22662504017353058, + "learning_rate": 0.0009621220596728278, + "loss": 0.1058, + "num_input_tokens_seen": 56232032, + "step": 26020 + }, + { + "epoch": 4.245513866231648, + "grad_norm": 0.016972634941339493, + "learning_rate": 0.0009620948783877285, + "loss": 0.118, + "num_input_tokens_seen": 56242624, + "step": 26025 + }, + { + "epoch": 4.246329526916803, + "grad_norm": 0.07601569592952728, + "learning_rate": 0.0009620676877376902, + "loss": 0.0433, + "num_input_tokens_seen": 56253600, + "step": 26030 + }, + { + "epoch": 4.247145187601958, + "grad_norm": 0.05941057205200195, + "learning_rate": 0.000962040487723264, + "loss": 0.1097, + "num_input_tokens_seen": 56264992, + "step": 26035 + }, + { + "epoch": 4.2479608482871125, + "grad_norm": 0.09792166948318481, + "learning_rate": 0.0009620132783450011, + "loss": 0.0593, + "num_input_tokens_seen": 56275232, + "step": 26040 + }, + { + "epoch": 4.248776508972267, + "grad_norm": 0.05090722069144249, + "learning_rate": 0.0009619860596034531, + "loss": 0.1732, + "num_input_tokens_seen": 56287776, + "step": 26045 + }, + { + "epoch": 4.249592169657422, + "grad_norm": 0.021216444671154022, + "learning_rate": 0.0009619588314991716, + "loss": 0.0687, + "num_input_tokens_seen": 56298784, + "step": 26050 + }, + { + "epoch": 4.250407830342578, + "grad_norm": 0.15228860080242157, + "learning_rate": 0.0009619315940327082, + "loss": 0.1041, + "num_input_tokens_seen": 56309088, + "step": 26055 + }, + { + "epoch": 4.251223491027733, + "grad_norm": 0.02385716140270233, + "learning_rate": 0.0009619043472046151, + "loss": 0.1266, + "num_input_tokens_seen": 56320832, + "step": 26060 + }, + { + "epoch": 4.2520391517128875, + "grad_norm": 0.01335175335407257, + "learning_rate": 0.0009618770910154444, + "loss": 0.1649, + "num_input_tokens_seen": 56331360, + "step": 26065 + }, + { + "epoch": 4.252854812398042, + "grad_norm": 0.07662574201822281, + "learning_rate": 0.0009618498254657486, + "loss": 0.0968, + "num_input_tokens_seen": 56342944, + "step": 26070 + }, + { + "epoch": 4.253670473083197, + "grad_norm": 0.10685458034276962, + "learning_rate": 0.00096182255055608, + "loss": 0.0863, + "num_input_tokens_seen": 56354432, + "step": 26075 + }, + { + "epoch": 4.254486133768353, + "grad_norm": 0.2515588104724884, + "learning_rate": 0.0009617952662869918, + "loss": 0.0752, + "num_input_tokens_seen": 56365696, + "step": 26080 + }, + { + "epoch": 4.255301794453508, + "grad_norm": 0.016076816245913506, + "learning_rate": 0.0009617679726590366, + "loss": 0.1153, + "num_input_tokens_seen": 56377376, + "step": 26085 + }, + { + "epoch": 4.2561174551386625, + "grad_norm": 0.048764199018478394, + "learning_rate": 0.0009617406696727676, + "loss": 0.1156, + "num_input_tokens_seen": 56388064, + "step": 26090 + }, + { + "epoch": 4.256933115823817, + "grad_norm": 0.16464506089687347, + "learning_rate": 0.0009617133573287382, + "loss": 0.1028, + "num_input_tokens_seen": 56399488, + "step": 26095 + }, + { + "epoch": 4.257748776508972, + "grad_norm": 0.08378614485263824, + "learning_rate": 0.0009616860356275019, + "loss": 0.1784, + "num_input_tokens_seen": 56411104, + "step": 26100 + }, + { + "epoch": 4.258564437194127, + "grad_norm": 0.09539467096328735, + "learning_rate": 0.0009616587045696124, + "loss": 0.0475, + "num_input_tokens_seen": 56422496, + "step": 26105 + }, + { + "epoch": 4.259380097879283, + "grad_norm": 0.02436433918774128, + "learning_rate": 0.0009616313641556235, + "loss": 0.0608, + "num_input_tokens_seen": 56432512, + "step": 26110 + }, + { + "epoch": 4.260195758564437, + "grad_norm": 0.0581025667488575, + "learning_rate": 0.0009616040143860896, + "loss": 0.0504, + "num_input_tokens_seen": 56442816, + "step": 26115 + }, + { + "epoch": 4.261011419249592, + "grad_norm": 0.02602909319102764, + "learning_rate": 0.0009615766552615645, + "loss": 0.0794, + "num_input_tokens_seen": 56454080, + "step": 26120 + }, + { + "epoch": 4.261827079934747, + "grad_norm": 0.1946924775838852, + "learning_rate": 0.0009615492867826032, + "loss": 0.0812, + "num_input_tokens_seen": 56464352, + "step": 26125 + }, + { + "epoch": 4.262642740619902, + "grad_norm": 0.1757528930902481, + "learning_rate": 0.00096152190894976, + "loss": 0.1193, + "num_input_tokens_seen": 56476416, + "step": 26130 + }, + { + "epoch": 4.263458401305057, + "grad_norm": 0.21124711632728577, + "learning_rate": 0.0009614945217635897, + "loss": 0.0683, + "num_input_tokens_seen": 56487360, + "step": 26135 + }, + { + "epoch": 4.264274061990212, + "grad_norm": 0.17990908026695251, + "learning_rate": 0.0009614671252246476, + "loss": 0.1003, + "num_input_tokens_seen": 56499008, + "step": 26140 + }, + { + "epoch": 4.265089722675367, + "grad_norm": 0.1713842749595642, + "learning_rate": 0.0009614397193334887, + "loss": 0.0782, + "num_input_tokens_seen": 56511040, + "step": 26145 + }, + { + "epoch": 4.265905383360522, + "grad_norm": 0.27467039227485657, + "learning_rate": 0.0009614123040906686, + "loss": 0.1319, + "num_input_tokens_seen": 56522240, + "step": 26150 + }, + { + "epoch": 4.266721044045677, + "grad_norm": 0.00788316410034895, + "learning_rate": 0.0009613848794967428, + "loss": 0.1168, + "num_input_tokens_seen": 56532960, + "step": 26155 + }, + { + "epoch": 4.267536704730832, + "grad_norm": 0.18166938424110413, + "learning_rate": 0.0009613574455522671, + "loss": 0.2217, + "num_input_tokens_seen": 56543488, + "step": 26160 + }, + { + "epoch": 4.268352365415987, + "grad_norm": 0.16838808357715607, + "learning_rate": 0.0009613300022577974, + "loss": 0.0748, + "num_input_tokens_seen": 56553536, + "step": 26165 + }, + { + "epoch": 4.269168026101142, + "grad_norm": 0.028545338660478592, + "learning_rate": 0.00096130254961389, + "loss": 0.0584, + "num_input_tokens_seen": 56563648, + "step": 26170 + }, + { + "epoch": 4.269983686786297, + "grad_norm": 0.01613200642168522, + "learning_rate": 0.0009612750876211014, + "loss": 0.1161, + "num_input_tokens_seen": 56575136, + "step": 26175 + }, + { + "epoch": 4.270799347471452, + "grad_norm": 0.10914766043424606, + "learning_rate": 0.0009612476162799878, + "loss": 0.0396, + "num_input_tokens_seen": 56586720, + "step": 26180 + }, + { + "epoch": 4.271615008156607, + "grad_norm": 0.24682168662548065, + "learning_rate": 0.0009612201355911061, + "loss": 0.1341, + "num_input_tokens_seen": 56597984, + "step": 26185 + }, + { + "epoch": 4.2724306688417615, + "grad_norm": 0.004787517245858908, + "learning_rate": 0.0009611926455550135, + "loss": 0.0284, + "num_input_tokens_seen": 56608736, + "step": 26190 + }, + { + "epoch": 4.273246329526917, + "grad_norm": 0.024321412667632103, + "learning_rate": 0.0009611651461722666, + "loss": 0.1023, + "num_input_tokens_seen": 56620128, + "step": 26195 + }, + { + "epoch": 4.274061990212072, + "grad_norm": 0.014418189413845539, + "learning_rate": 0.0009611376374434231, + "loss": 0.1351, + "num_input_tokens_seen": 56631008, + "step": 26200 + }, + { + "epoch": 4.274877650897227, + "grad_norm": 0.021421290934085846, + "learning_rate": 0.0009611101193690403, + "loss": 0.1299, + "num_input_tokens_seen": 56642112, + "step": 26205 + }, + { + "epoch": 4.275693311582382, + "grad_norm": 0.2636556625366211, + "learning_rate": 0.0009610825919496761, + "loss": 0.0863, + "num_input_tokens_seen": 56652192, + "step": 26210 + }, + { + "epoch": 4.2765089722675365, + "grad_norm": 0.09782522916793823, + "learning_rate": 0.0009610550551858881, + "loss": 0.058, + "num_input_tokens_seen": 56663488, + "step": 26215 + }, + { + "epoch": 4.277324632952691, + "grad_norm": 0.019692903384566307, + "learning_rate": 0.0009610275090782347, + "loss": 0.0179, + "num_input_tokens_seen": 56673408, + "step": 26220 + }, + { + "epoch": 4.278140293637847, + "grad_norm": 0.19901689887046814, + "learning_rate": 0.0009609999536272738, + "loss": 0.1461, + "num_input_tokens_seen": 56683968, + "step": 26225 + }, + { + "epoch": 4.278955954323002, + "grad_norm": 0.18883618712425232, + "learning_rate": 0.0009609723888335641, + "loss": 0.2123, + "num_input_tokens_seen": 56694016, + "step": 26230 + }, + { + "epoch": 4.279771615008157, + "grad_norm": 0.06648790836334229, + "learning_rate": 0.0009609448146976642, + "loss": 0.1283, + "num_input_tokens_seen": 56704096, + "step": 26235 + }, + { + "epoch": 4.280587275693311, + "grad_norm": 0.07960224896669388, + "learning_rate": 0.0009609172312201328, + "loss": 0.2726, + "num_input_tokens_seen": 56712768, + "step": 26240 + }, + { + "epoch": 4.281402936378466, + "grad_norm": 0.04163384810090065, + "learning_rate": 0.000960889638401529, + "loss": 0.0597, + "num_input_tokens_seen": 56723776, + "step": 26245 + }, + { + "epoch": 4.282218597063622, + "grad_norm": 0.10396461188793182, + "learning_rate": 0.0009608620362424121, + "loss": 0.0566, + "num_input_tokens_seen": 56733696, + "step": 26250 + }, + { + "epoch": 4.283034257748777, + "grad_norm": 0.024622034281492233, + "learning_rate": 0.0009608344247433412, + "loss": 0.1117, + "num_input_tokens_seen": 56745216, + "step": 26255 + }, + { + "epoch": 4.283849918433932, + "grad_norm": 0.16428062319755554, + "learning_rate": 0.0009608068039048763, + "loss": 0.1339, + "num_input_tokens_seen": 56754656, + "step": 26260 + }, + { + "epoch": 4.284665579119086, + "grad_norm": 0.05322808027267456, + "learning_rate": 0.0009607791737275769, + "loss": 0.036, + "num_input_tokens_seen": 56765408, + "step": 26265 + }, + { + "epoch": 4.285481239804241, + "grad_norm": 0.10171199589967728, + "learning_rate": 0.0009607515342120028, + "loss": 0.0805, + "num_input_tokens_seen": 56776384, + "step": 26270 + }, + { + "epoch": 4.286296900489396, + "grad_norm": 0.053133487701416016, + "learning_rate": 0.0009607238853587144, + "loss": 0.0336, + "num_input_tokens_seen": 56788320, + "step": 26275 + }, + { + "epoch": 4.287112561174552, + "grad_norm": 0.09317631274461746, + "learning_rate": 0.0009606962271682722, + "loss": 0.0892, + "num_input_tokens_seen": 56799552, + "step": 26280 + }, + { + "epoch": 4.287928221859707, + "grad_norm": 0.09496995061635971, + "learning_rate": 0.0009606685596412364, + "loss": 0.1756, + "num_input_tokens_seen": 56810048, + "step": 26285 + }, + { + "epoch": 4.288743882544861, + "grad_norm": 0.17305903136730194, + "learning_rate": 0.0009606408827781679, + "loss": 0.0797, + "num_input_tokens_seen": 56820736, + "step": 26290 + }, + { + "epoch": 4.289559543230016, + "grad_norm": 0.12145748734474182, + "learning_rate": 0.0009606131965796274, + "loss": 0.0712, + "num_input_tokens_seen": 56831808, + "step": 26295 + }, + { + "epoch": 4.290375203915171, + "grad_norm": 0.015286357142031193, + "learning_rate": 0.0009605855010461761, + "loss": 0.1748, + "num_input_tokens_seen": 56842816, + "step": 26300 + }, + { + "epoch": 4.291190864600326, + "grad_norm": 0.1779191792011261, + "learning_rate": 0.0009605577961783756, + "loss": 0.0942, + "num_input_tokens_seen": 56853440, + "step": 26305 + }, + { + "epoch": 4.2920065252854815, + "grad_norm": 0.16392691433429718, + "learning_rate": 0.0009605300819767869, + "loss": 0.1803, + "num_input_tokens_seen": 56864192, + "step": 26310 + }, + { + "epoch": 4.292822185970636, + "grad_norm": 0.027073819190263748, + "learning_rate": 0.000960502358441972, + "loss": 0.0244, + "num_input_tokens_seen": 56872800, + "step": 26315 + }, + { + "epoch": 4.293637846655791, + "grad_norm": 0.09286334365606308, + "learning_rate": 0.0009604746255744925, + "loss": 0.0476, + "num_input_tokens_seen": 56882688, + "step": 26320 + }, + { + "epoch": 4.294453507340946, + "grad_norm": 0.1313088983297348, + "learning_rate": 0.0009604468833749105, + "loss": 0.1409, + "num_input_tokens_seen": 56893152, + "step": 26325 + }, + { + "epoch": 4.295269168026101, + "grad_norm": 0.20351779460906982, + "learning_rate": 0.0009604191318437885, + "loss": 0.1661, + "num_input_tokens_seen": 56904128, + "step": 26330 + }, + { + "epoch": 4.2960848287112565, + "grad_norm": 0.056847669184207916, + "learning_rate": 0.0009603913709816886, + "loss": 0.1283, + "num_input_tokens_seen": 56915136, + "step": 26335 + }, + { + "epoch": 4.296900489396411, + "grad_norm": 0.04496561363339424, + "learning_rate": 0.0009603636007891735, + "loss": 0.0262, + "num_input_tokens_seen": 56926208, + "step": 26340 + }, + { + "epoch": 4.297716150081566, + "grad_norm": 0.13187801837921143, + "learning_rate": 0.0009603358212668061, + "loss": 0.1805, + "num_input_tokens_seen": 56935520, + "step": 26345 + }, + { + "epoch": 4.298531810766721, + "grad_norm": 0.01551650557667017, + "learning_rate": 0.0009603080324151492, + "loss": 0.0794, + "num_input_tokens_seen": 56946560, + "step": 26350 + }, + { + "epoch": 4.299347471451876, + "grad_norm": 0.22867026925086975, + "learning_rate": 0.0009602802342347661, + "loss": 0.1025, + "num_input_tokens_seen": 56957120, + "step": 26355 + }, + { + "epoch": 4.300163132137031, + "grad_norm": 0.17811556160449982, + "learning_rate": 0.0009602524267262203, + "loss": 0.1104, + "num_input_tokens_seen": 56970368, + "step": 26360 + }, + { + "epoch": 4.300978792822186, + "grad_norm": 0.09705094248056412, + "learning_rate": 0.0009602246098900749, + "loss": 0.1768, + "num_input_tokens_seen": 56980672, + "step": 26365 + }, + { + "epoch": 4.301794453507341, + "grad_norm": 0.04232777655124664, + "learning_rate": 0.0009601967837268941, + "loss": 0.0868, + "num_input_tokens_seen": 56991232, + "step": 26370 + }, + { + "epoch": 4.302610114192496, + "grad_norm": 0.33170196413993835, + "learning_rate": 0.0009601689482372417, + "loss": 0.1073, + "num_input_tokens_seen": 57002112, + "step": 26375 + }, + { + "epoch": 4.303425774877651, + "grad_norm": 0.03127500042319298, + "learning_rate": 0.0009601411034216818, + "loss": 0.056, + "num_input_tokens_seen": 57011936, + "step": 26380 + }, + { + "epoch": 4.304241435562806, + "grad_norm": 0.19058465957641602, + "learning_rate": 0.0009601132492807787, + "loss": 0.0724, + "num_input_tokens_seen": 57022432, + "step": 26385 + }, + { + "epoch": 4.30505709624796, + "grad_norm": 0.016696345061063766, + "learning_rate": 0.000960085385815097, + "loss": 0.0464, + "num_input_tokens_seen": 57032896, + "step": 26390 + }, + { + "epoch": 4.305872756933116, + "grad_norm": 0.07727955281734467, + "learning_rate": 0.0009600575130252012, + "loss": 0.0479, + "num_input_tokens_seen": 57042880, + "step": 26395 + }, + { + "epoch": 4.306688417618271, + "grad_norm": 0.17325547337532043, + "learning_rate": 0.0009600296309116563, + "loss": 0.16, + "num_input_tokens_seen": 57053280, + "step": 26400 + }, + { + "epoch": 4.307504078303426, + "grad_norm": 0.2629067301750183, + "learning_rate": 0.0009600017394750274, + "loss": 0.1369, + "num_input_tokens_seen": 57063456, + "step": 26405 + }, + { + "epoch": 4.308319738988581, + "grad_norm": 0.1544959545135498, + "learning_rate": 0.0009599738387158794, + "loss": 0.09, + "num_input_tokens_seen": 57074496, + "step": 26410 + }, + { + "epoch": 4.309135399673735, + "grad_norm": 0.007932339794933796, + "learning_rate": 0.0009599459286347783, + "loss": 0.0117, + "num_input_tokens_seen": 57085696, + "step": 26415 + }, + { + "epoch": 4.309951060358891, + "grad_norm": 0.008668801747262478, + "learning_rate": 0.0009599180092322894, + "loss": 0.0259, + "num_input_tokens_seen": 57095648, + "step": 26420 + }, + { + "epoch": 4.310766721044046, + "grad_norm": 0.030321603640913963, + "learning_rate": 0.0009598900805089786, + "loss": 0.0511, + "num_input_tokens_seen": 57107168, + "step": 26425 + }, + { + "epoch": 4.311582381729201, + "grad_norm": 0.0036541877780109644, + "learning_rate": 0.0009598621424654119, + "loss": 0.0722, + "num_input_tokens_seen": 57118016, + "step": 26430 + }, + { + "epoch": 4.3123980424143555, + "grad_norm": 0.011160695925354958, + "learning_rate": 0.0009598341951021557, + "loss": 0.0955, + "num_input_tokens_seen": 57129632, + "step": 26435 + }, + { + "epoch": 4.31321370309951, + "grad_norm": 0.08019030094146729, + "learning_rate": 0.0009598062384197759, + "loss": 0.1186, + "num_input_tokens_seen": 57140576, + "step": 26440 + }, + { + "epoch": 4.314029363784665, + "grad_norm": 0.025762809440493584, + "learning_rate": 0.0009597782724188395, + "loss": 0.0847, + "num_input_tokens_seen": 57151712, + "step": 26445 + }, + { + "epoch": 4.314845024469821, + "grad_norm": 0.058879684656858444, + "learning_rate": 0.0009597502970999132, + "loss": 0.0694, + "num_input_tokens_seen": 57162272, + "step": 26450 + }, + { + "epoch": 4.315660685154976, + "grad_norm": 0.04242071136832237, + "learning_rate": 0.0009597223124635639, + "loss": 0.014, + "num_input_tokens_seen": 57173952, + "step": 26455 + }, + { + "epoch": 4.3164763458401305, + "grad_norm": 0.07131559401750565, + "learning_rate": 0.0009596943185103586, + "loss": 0.175, + "num_input_tokens_seen": 57184000, + "step": 26460 + }, + { + "epoch": 4.317292006525285, + "grad_norm": 0.15778687596321106, + "learning_rate": 0.0009596663152408648, + "loss": 0.1615, + "num_input_tokens_seen": 57195264, + "step": 26465 + }, + { + "epoch": 4.31810766721044, + "grad_norm": 0.02540893293917179, + "learning_rate": 0.0009596383026556501, + "loss": 0.063, + "num_input_tokens_seen": 57205056, + "step": 26470 + }, + { + "epoch": 4.318923327895595, + "grad_norm": 0.10403016209602356, + "learning_rate": 0.000959610280755282, + "loss": 0.2099, + "num_input_tokens_seen": 57215296, + "step": 26475 + }, + { + "epoch": 4.319738988580751, + "grad_norm": 0.18612545728683472, + "learning_rate": 0.0009595822495403286, + "loss": 0.183, + "num_input_tokens_seen": 57225632, + "step": 26480 + }, + { + "epoch": 4.3205546492659055, + "grad_norm": 0.022068077698349953, + "learning_rate": 0.0009595542090113579, + "loss": 0.0378, + "num_input_tokens_seen": 57237504, + "step": 26485 + }, + { + "epoch": 4.32137030995106, + "grad_norm": 0.12466895580291748, + "learning_rate": 0.0009595261591689381, + "loss": 0.0562, + "num_input_tokens_seen": 57249696, + "step": 26490 + }, + { + "epoch": 4.322185970636215, + "grad_norm": 0.006654700729995966, + "learning_rate": 0.0009594981000136377, + "loss": 0.0703, + "num_input_tokens_seen": 57259232, + "step": 26495 + }, + { + "epoch": 4.32300163132137, + "grad_norm": 0.028172895312309265, + "learning_rate": 0.0009594700315460254, + "loss": 0.0881, + "num_input_tokens_seen": 57269408, + "step": 26500 + }, + { + "epoch": 4.323817292006526, + "grad_norm": 0.013215692713856697, + "learning_rate": 0.0009594419537666701, + "loss": 0.0691, + "num_input_tokens_seen": 57280352, + "step": 26505 + }, + { + "epoch": 4.3246329526916805, + "grad_norm": 0.03416256979107857, + "learning_rate": 0.0009594138666761407, + "loss": 0.1872, + "num_input_tokens_seen": 57289248, + "step": 26510 + }, + { + "epoch": 4.325448613376835, + "grad_norm": 0.1410515457391739, + "learning_rate": 0.0009593857702750065, + "loss": 0.0417, + "num_input_tokens_seen": 57298720, + "step": 26515 + }, + { + "epoch": 4.32626427406199, + "grad_norm": 0.038631804287433624, + "learning_rate": 0.0009593576645638369, + "loss": 0.0654, + "num_input_tokens_seen": 57310016, + "step": 26520 + }, + { + "epoch": 4.327079934747145, + "grad_norm": 0.07176709920167923, + "learning_rate": 0.0009593295495432015, + "loss": 0.1958, + "num_input_tokens_seen": 57321216, + "step": 26525 + }, + { + "epoch": 4.327895595432301, + "grad_norm": 0.017286749556660652, + "learning_rate": 0.00095930142521367, + "loss": 0.0294, + "num_input_tokens_seen": 57330240, + "step": 26530 + }, + { + "epoch": 4.328711256117455, + "grad_norm": 0.005593335721641779, + "learning_rate": 0.0009592732915758127, + "loss": 0.1743, + "num_input_tokens_seen": 57341344, + "step": 26535 + }, + { + "epoch": 4.32952691680261, + "grad_norm": 0.2700396180152893, + "learning_rate": 0.0009592451486301991, + "loss": 0.0932, + "num_input_tokens_seen": 57352736, + "step": 26540 + }, + { + "epoch": 4.330342577487765, + "grad_norm": 0.0752822533249855, + "learning_rate": 0.0009592169963774004, + "loss": 0.055, + "num_input_tokens_seen": 57363200, + "step": 26545 + }, + { + "epoch": 4.33115823817292, + "grad_norm": 0.011770925484597683, + "learning_rate": 0.0009591888348179865, + "loss": 0.0351, + "num_input_tokens_seen": 57373952, + "step": 26550 + }, + { + "epoch": 4.331973898858075, + "grad_norm": 0.07157646864652634, + "learning_rate": 0.0009591606639525283, + "loss": 0.0491, + "num_input_tokens_seen": 57384000, + "step": 26555 + }, + { + "epoch": 4.33278955954323, + "grad_norm": 0.03235434740781784, + "learning_rate": 0.0009591324837815969, + "loss": 0.1104, + "num_input_tokens_seen": 57394432, + "step": 26560 + }, + { + "epoch": 4.333605220228385, + "grad_norm": 0.18330805003643036, + "learning_rate": 0.0009591042943057631, + "loss": 0.1976, + "num_input_tokens_seen": 57405088, + "step": 26565 + }, + { + "epoch": 4.33442088091354, + "grad_norm": 0.013964567333459854, + "learning_rate": 0.0009590760955255985, + "loss": 0.035, + "num_input_tokens_seen": 57416608, + "step": 26570 + }, + { + "epoch": 4.335236541598695, + "grad_norm": 0.0749783143401146, + "learning_rate": 0.0009590478874416744, + "loss": 0.1481, + "num_input_tokens_seen": 57427744, + "step": 26575 + }, + { + "epoch": 4.33605220228385, + "grad_norm": 0.19421903789043427, + "learning_rate": 0.0009590196700545626, + "loss": 0.2615, + "num_input_tokens_seen": 57438112, + "step": 26580 + }, + { + "epoch": 4.3368678629690045, + "grad_norm": 0.08299185335636139, + "learning_rate": 0.0009589914433648347, + "loss": 0.0568, + "num_input_tokens_seen": 57448896, + "step": 26585 + }, + { + "epoch": 4.33768352365416, + "grad_norm": 0.11819956451654434, + "learning_rate": 0.000958963207373063, + "loss": 0.055, + "num_input_tokens_seen": 57460160, + "step": 26590 + }, + { + "epoch": 4.338499184339315, + "grad_norm": 0.05758635699748993, + "learning_rate": 0.0009589349620798197, + "loss": 0.0679, + "num_input_tokens_seen": 57470656, + "step": 26595 + }, + { + "epoch": 4.33931484502447, + "grad_norm": 0.010048212483525276, + "learning_rate": 0.0009589067074856772, + "loss": 0.0374, + "num_input_tokens_seen": 57481504, + "step": 26600 + }, + { + "epoch": 4.340130505709625, + "grad_norm": 0.009367452003061771, + "learning_rate": 0.0009588784435912082, + "loss": 0.0595, + "num_input_tokens_seen": 57491520, + "step": 26605 + }, + { + "epoch": 4.3409461663947795, + "grad_norm": 0.21798332035541534, + "learning_rate": 0.0009588501703969852, + "loss": 0.1723, + "num_input_tokens_seen": 57501344, + "step": 26610 + }, + { + "epoch": 4.341761827079935, + "grad_norm": 0.013095865026116371, + "learning_rate": 0.0009588218879035815, + "loss": 0.0281, + "num_input_tokens_seen": 57513120, + "step": 26615 + }, + { + "epoch": 4.34257748776509, + "grad_norm": 0.04659920558333397, + "learning_rate": 0.0009587935961115701, + "loss": 0.0643, + "num_input_tokens_seen": 57523584, + "step": 26620 + }, + { + "epoch": 4.343393148450245, + "grad_norm": 0.4134010374546051, + "learning_rate": 0.0009587652950215247, + "loss": 0.2004, + "num_input_tokens_seen": 57534080, + "step": 26625 + }, + { + "epoch": 4.3442088091354, + "grad_norm": 0.022348370403051376, + "learning_rate": 0.0009587369846340184, + "loss": 0.1715, + "num_input_tokens_seen": 57545632, + "step": 26630 + }, + { + "epoch": 4.3450244698205545, + "grad_norm": 0.10105215013027191, + "learning_rate": 0.000958708664949625, + "loss": 0.0796, + "num_input_tokens_seen": 57556832, + "step": 26635 + }, + { + "epoch": 4.345840130505709, + "grad_norm": 0.023678358644247055, + "learning_rate": 0.0009586803359689189, + "loss": 0.128, + "num_input_tokens_seen": 57567552, + "step": 26640 + }, + { + "epoch": 4.346655791190865, + "grad_norm": 0.019319554790854454, + "learning_rate": 0.0009586519976924739, + "loss": 0.0737, + "num_input_tokens_seen": 57579424, + "step": 26645 + }, + { + "epoch": 4.34747145187602, + "grad_norm": 0.24461629986763, + "learning_rate": 0.0009586236501208642, + "loss": 0.0585, + "num_input_tokens_seen": 57590400, + "step": 26650 + }, + { + "epoch": 4.348287112561175, + "grad_norm": 0.0134728467091918, + "learning_rate": 0.0009585952932546644, + "loss": 0.0408, + "num_input_tokens_seen": 57599648, + "step": 26655 + }, + { + "epoch": 4.349102773246329, + "grad_norm": 0.08431003242731094, + "learning_rate": 0.0009585669270944493, + "loss": 0.0275, + "num_input_tokens_seen": 57611168, + "step": 26660 + }, + { + "epoch": 4.349918433931484, + "grad_norm": 0.19880840182304382, + "learning_rate": 0.0009585385516407936, + "loss": 0.1469, + "num_input_tokens_seen": 57622048, + "step": 26665 + }, + { + "epoch": 4.350734094616639, + "grad_norm": 0.01987231895327568, + "learning_rate": 0.0009585101668942726, + "loss": 0.0594, + "num_input_tokens_seen": 57632832, + "step": 26670 + }, + { + "epoch": 4.351549755301795, + "grad_norm": 0.025066286325454712, + "learning_rate": 0.0009584817728554613, + "loss": 0.1467, + "num_input_tokens_seen": 57644480, + "step": 26675 + }, + { + "epoch": 4.35236541598695, + "grad_norm": 0.04371904954314232, + "learning_rate": 0.0009584533695249353, + "loss": 0.1442, + "num_input_tokens_seen": 57654464, + "step": 26680 + }, + { + "epoch": 4.353181076672104, + "grad_norm": 0.008882477879524231, + "learning_rate": 0.0009584249569032701, + "loss": 0.1217, + "num_input_tokens_seen": 57664672, + "step": 26685 + }, + { + "epoch": 4.353996737357259, + "grad_norm": 0.02217107079923153, + "learning_rate": 0.0009583965349910417, + "loss": 0.1414, + "num_input_tokens_seen": 57675360, + "step": 26690 + }, + { + "epoch": 4.354812398042414, + "grad_norm": 0.05907650664448738, + "learning_rate": 0.0009583681037888259, + "loss": 0.1975, + "num_input_tokens_seen": 57686624, + "step": 26695 + }, + { + "epoch": 4.35562805872757, + "grad_norm": 0.07159577310085297, + "learning_rate": 0.0009583396632971991, + "loss": 0.0371, + "num_input_tokens_seen": 57698048, + "step": 26700 + }, + { + "epoch": 4.356443719412725, + "grad_norm": 0.12222054600715637, + "learning_rate": 0.0009583112135167376, + "loss": 0.3591, + "num_input_tokens_seen": 57709216, + "step": 26705 + }, + { + "epoch": 4.357259380097879, + "grad_norm": 0.016294648870825768, + "learning_rate": 0.0009582827544480177, + "loss": 0.1798, + "num_input_tokens_seen": 57719488, + "step": 26710 + }, + { + "epoch": 4.358075040783034, + "grad_norm": 0.02656223438680172, + "learning_rate": 0.0009582542860916166, + "loss": 0.0431, + "num_input_tokens_seen": 57731040, + "step": 26715 + }, + { + "epoch": 4.358890701468189, + "grad_norm": 0.21412135660648346, + "learning_rate": 0.000958225808448111, + "loss": 0.0866, + "num_input_tokens_seen": 57741728, + "step": 26720 + }, + { + "epoch": 4.359706362153344, + "grad_norm": 0.06581278145313263, + "learning_rate": 0.0009581973215180782, + "loss": 0.1054, + "num_input_tokens_seen": 57752128, + "step": 26725 + }, + { + "epoch": 4.3605220228384995, + "grad_norm": 0.06512191146612167, + "learning_rate": 0.0009581688253020952, + "loss": 0.142, + "num_input_tokens_seen": 57762944, + "step": 26730 + }, + { + "epoch": 4.361337683523654, + "grad_norm": 0.059873901307582855, + "learning_rate": 0.00095814031980074, + "loss": 0.0894, + "num_input_tokens_seen": 57775296, + "step": 26735 + }, + { + "epoch": 4.362153344208809, + "grad_norm": 0.2084333449602127, + "learning_rate": 0.0009581118050145898, + "loss": 0.3417, + "num_input_tokens_seen": 57786368, + "step": 26740 + }, + { + "epoch": 4.362969004893964, + "grad_norm": 0.19248606264591217, + "learning_rate": 0.0009580832809442228, + "loss": 0.1961, + "num_input_tokens_seen": 57797312, + "step": 26745 + }, + { + "epoch": 4.363784665579119, + "grad_norm": 0.03354114666581154, + "learning_rate": 0.000958054747590217, + "loss": 0.1853, + "num_input_tokens_seen": 57808896, + "step": 26750 + }, + { + "epoch": 4.364600326264274, + "grad_norm": 0.04537074267864227, + "learning_rate": 0.0009580262049531508, + "loss": 0.0527, + "num_input_tokens_seen": 57819744, + "step": 26755 + }, + { + "epoch": 4.365415986949429, + "grad_norm": 0.10104655474424362, + "learning_rate": 0.0009579976530336023, + "loss": 0.0981, + "num_input_tokens_seen": 57830944, + "step": 26760 + }, + { + "epoch": 4.366231647634584, + "grad_norm": 0.07620599120855331, + "learning_rate": 0.0009579690918321504, + "loss": 0.1167, + "num_input_tokens_seen": 57841920, + "step": 26765 + }, + { + "epoch": 4.367047308319739, + "grad_norm": 0.03413588926196098, + "learning_rate": 0.0009579405213493739, + "loss": 0.0761, + "num_input_tokens_seen": 57853056, + "step": 26770 + }, + { + "epoch": 4.367862969004894, + "grad_norm": 0.14624758064746857, + "learning_rate": 0.0009579119415858518, + "loss": 0.1401, + "num_input_tokens_seen": 57863552, + "step": 26775 + }, + { + "epoch": 4.368678629690049, + "grad_norm": 0.042857468128204346, + "learning_rate": 0.0009578833525421633, + "loss": 0.1191, + "num_input_tokens_seen": 57872992, + "step": 26780 + }, + { + "epoch": 4.369494290375204, + "grad_norm": 0.058174289762973785, + "learning_rate": 0.0009578547542188878, + "loss": 0.0965, + "num_input_tokens_seen": 57884224, + "step": 26785 + }, + { + "epoch": 4.370309951060359, + "grad_norm": 0.11993542313575745, + "learning_rate": 0.0009578261466166049, + "loss": 0.104, + "num_input_tokens_seen": 57895712, + "step": 26790 + }, + { + "epoch": 4.371125611745514, + "grad_norm": 0.03129759058356285, + "learning_rate": 0.0009577975297358943, + "loss": 0.0799, + "num_input_tokens_seen": 57907328, + "step": 26795 + }, + { + "epoch": 4.371941272430669, + "grad_norm": 0.08179029077291489, + "learning_rate": 0.0009577689035773359, + "loss": 0.1652, + "num_input_tokens_seen": 57918464, + "step": 26800 + }, + { + "epoch": 4.372756933115824, + "grad_norm": 0.08137882500886917, + "learning_rate": 0.0009577402681415102, + "loss": 0.0689, + "num_input_tokens_seen": 57929088, + "step": 26805 + }, + { + "epoch": 4.373572593800978, + "grad_norm": 0.04663487523794174, + "learning_rate": 0.0009577116234289971, + "loss": 0.2543, + "num_input_tokens_seen": 57941248, + "step": 26810 + }, + { + "epoch": 4.374388254486134, + "grad_norm": 0.04174930229783058, + "learning_rate": 0.0009576829694403772, + "loss": 0.0331, + "num_input_tokens_seen": 57951968, + "step": 26815 + }, + { + "epoch": 4.375203915171289, + "grad_norm": 0.21580137312412262, + "learning_rate": 0.0009576543061762315, + "loss": 0.1422, + "num_input_tokens_seen": 57961568, + "step": 26820 + }, + { + "epoch": 4.376019575856444, + "grad_norm": 0.23943717777729034, + "learning_rate": 0.0009576256336371407, + "loss": 0.137, + "num_input_tokens_seen": 57972832, + "step": 26825 + }, + { + "epoch": 4.376835236541599, + "grad_norm": 0.061017557978630066, + "learning_rate": 0.0009575969518236857, + "loss": 0.1647, + "num_input_tokens_seen": 57982816, + "step": 26830 + }, + { + "epoch": 4.377650897226753, + "grad_norm": 0.08061721920967102, + "learning_rate": 0.0009575682607364482, + "loss": 0.0811, + "num_input_tokens_seen": 57994784, + "step": 26835 + }, + { + "epoch": 4.378466557911908, + "grad_norm": 0.04571057856082916, + "learning_rate": 0.0009575395603760095, + "loss": 0.0377, + "num_input_tokens_seen": 58006208, + "step": 26840 + }, + { + "epoch": 4.379282218597064, + "grad_norm": 0.02869790978729725, + "learning_rate": 0.000957510850742951, + "loss": 0.0307, + "num_input_tokens_seen": 58016256, + "step": 26845 + }, + { + "epoch": 4.380097879282219, + "grad_norm": 0.10976076126098633, + "learning_rate": 0.0009574821318378547, + "loss": 0.134, + "num_input_tokens_seen": 58027680, + "step": 26850 + }, + { + "epoch": 4.3809135399673735, + "grad_norm": 0.06502792984247208, + "learning_rate": 0.0009574534036613028, + "loss": 0.1922, + "num_input_tokens_seen": 58038496, + "step": 26855 + }, + { + "epoch": 4.381729200652528, + "grad_norm": 0.2433139979839325, + "learning_rate": 0.0009574246662138772, + "loss": 0.1004, + "num_input_tokens_seen": 58047776, + "step": 26860 + }, + { + "epoch": 4.382544861337683, + "grad_norm": 0.124003104865551, + "learning_rate": 0.0009573959194961604, + "loss": 0.1118, + "num_input_tokens_seen": 58058112, + "step": 26865 + }, + { + "epoch": 4.383360522022839, + "grad_norm": 0.048630520701408386, + "learning_rate": 0.0009573671635087352, + "loss": 0.0627, + "num_input_tokens_seen": 58068608, + "step": 26870 + }, + { + "epoch": 4.384176182707994, + "grad_norm": 0.03981039673089981, + "learning_rate": 0.0009573383982521841, + "loss": 0.0459, + "num_input_tokens_seen": 58079680, + "step": 26875 + }, + { + "epoch": 4.3849918433931485, + "grad_norm": 0.04121852666139603, + "learning_rate": 0.0009573096237270903, + "loss": 0.0925, + "num_input_tokens_seen": 58090176, + "step": 26880 + }, + { + "epoch": 4.385807504078303, + "grad_norm": 0.2329862415790558, + "learning_rate": 0.0009572808399340368, + "loss": 0.0928, + "num_input_tokens_seen": 58101472, + "step": 26885 + }, + { + "epoch": 4.386623164763458, + "grad_norm": 0.08368705213069916, + "learning_rate": 0.000957252046873607, + "loss": 0.0682, + "num_input_tokens_seen": 58112576, + "step": 26890 + }, + { + "epoch": 4.387438825448613, + "grad_norm": 0.0444665402173996, + "learning_rate": 0.0009572232445463843, + "loss": 0.0524, + "num_input_tokens_seen": 58122976, + "step": 26895 + }, + { + "epoch": 4.388254486133769, + "grad_norm": 0.02356015332043171, + "learning_rate": 0.0009571944329529526, + "loss": 0.058, + "num_input_tokens_seen": 58134368, + "step": 26900 + }, + { + "epoch": 4.3890701468189235, + "grad_norm": 0.002911847084760666, + "learning_rate": 0.0009571656120938956, + "loss": 0.0363, + "num_input_tokens_seen": 58145312, + "step": 26905 + }, + { + "epoch": 4.389885807504078, + "grad_norm": 0.24054285883903503, + "learning_rate": 0.0009571367819697978, + "loss": 0.2026, + "num_input_tokens_seen": 58155680, + "step": 26910 + }, + { + "epoch": 4.390701468189233, + "grad_norm": 0.027164777740836143, + "learning_rate": 0.000957107942581243, + "loss": 0.0445, + "num_input_tokens_seen": 58164960, + "step": 26915 + }, + { + "epoch": 4.391517128874388, + "grad_norm": 0.08999812602996826, + "learning_rate": 0.0009570790939288159, + "loss": 0.0515, + "num_input_tokens_seen": 58174624, + "step": 26920 + }, + { + "epoch": 4.392332789559543, + "grad_norm": 0.4166439473628998, + "learning_rate": 0.0009570502360131011, + "loss": 0.1268, + "num_input_tokens_seen": 58185344, + "step": 26925 + }, + { + "epoch": 4.3931484502446985, + "grad_norm": 0.03616182506084442, + "learning_rate": 0.0009570213688346833, + "loss": 0.0233, + "num_input_tokens_seen": 58196416, + "step": 26930 + }, + { + "epoch": 4.393964110929853, + "grad_norm": 0.6690056920051575, + "learning_rate": 0.000956992492394148, + "loss": 0.0621, + "num_input_tokens_seen": 58206592, + "step": 26935 + }, + { + "epoch": 4.394779771615008, + "grad_norm": 0.14354459941387177, + "learning_rate": 0.00095696360669208, + "loss": 0.1493, + "num_input_tokens_seen": 58217568, + "step": 26940 + }, + { + "epoch": 4.395595432300163, + "grad_norm": 0.32492372393608093, + "learning_rate": 0.0009569347117290647, + "loss": 0.0811, + "num_input_tokens_seen": 58227552, + "step": 26945 + }, + { + "epoch": 4.396411092985318, + "grad_norm": 0.01738920249044895, + "learning_rate": 0.0009569058075056878, + "loss": 0.054, + "num_input_tokens_seen": 58239040, + "step": 26950 + }, + { + "epoch": 4.397226753670473, + "grad_norm": 0.011106864549219608, + "learning_rate": 0.0009568768940225352, + "loss": 0.1509, + "num_input_tokens_seen": 58250304, + "step": 26955 + }, + { + "epoch": 4.398042414355628, + "grad_norm": 0.3137091398239136, + "learning_rate": 0.0009568479712801926, + "loss": 0.1059, + "num_input_tokens_seen": 58262144, + "step": 26960 + }, + { + "epoch": 4.398858075040783, + "grad_norm": 0.022524043917655945, + "learning_rate": 0.0009568190392792464, + "loss": 0.0957, + "num_input_tokens_seen": 58272736, + "step": 26965 + }, + { + "epoch": 4.399673735725938, + "grad_norm": 0.03002866730093956, + "learning_rate": 0.000956790098020283, + "loss": 0.0216, + "num_input_tokens_seen": 58283040, + "step": 26970 + }, + { + "epoch": 4.400489396411093, + "grad_norm": 0.17999090254306793, + "learning_rate": 0.0009567611475038886, + "loss": 0.1068, + "num_input_tokens_seen": 58294432, + "step": 26975 + }, + { + "epoch": 4.401305057096248, + "grad_norm": 0.0031778549309819937, + "learning_rate": 0.0009567321877306501, + "loss": 0.0222, + "num_input_tokens_seen": 58303328, + "step": 26980 + }, + { + "epoch": 4.402120717781403, + "grad_norm": 0.0474659726023674, + "learning_rate": 0.0009567032187011546, + "loss": 0.0455, + "num_input_tokens_seen": 58314752, + "step": 26985 + }, + { + "epoch": 4.402936378466558, + "grad_norm": 0.06852079182863235, + "learning_rate": 0.0009566742404159887, + "loss": 0.1312, + "num_input_tokens_seen": 58325024, + "step": 26990 + }, + { + "epoch": 4.403752039151713, + "grad_norm": 0.0092054707929492, + "learning_rate": 0.0009566452528757402, + "loss": 0.1629, + "num_input_tokens_seen": 58334464, + "step": 26995 + }, + { + "epoch": 4.404567699836868, + "grad_norm": 0.003914583474397659, + "learning_rate": 0.0009566162560809963, + "loss": 0.0191, + "num_input_tokens_seen": 58345312, + "step": 27000 + }, + { + "epoch": 4.4053833605220225, + "grad_norm": 0.16951356828212738, + "learning_rate": 0.0009565872500323447, + "loss": 0.0657, + "num_input_tokens_seen": 58355424, + "step": 27005 + }, + { + "epoch": 4.406199021207178, + "grad_norm": 0.0581197664141655, + "learning_rate": 0.0009565582347303733, + "loss": 0.0428, + "num_input_tokens_seen": 58366304, + "step": 27010 + }, + { + "epoch": 4.407014681892333, + "grad_norm": 0.02101009152829647, + "learning_rate": 0.00095652921017567, + "loss": 0.061, + "num_input_tokens_seen": 58377376, + "step": 27015 + }, + { + "epoch": 4.407830342577488, + "grad_norm": 0.08173485100269318, + "learning_rate": 0.0009565001763688233, + "loss": 0.1788, + "num_input_tokens_seen": 58387424, + "step": 27020 + }, + { + "epoch": 4.408646003262643, + "grad_norm": 0.1883484423160553, + "learning_rate": 0.0009564711333104213, + "loss": 0.1001, + "num_input_tokens_seen": 58398880, + "step": 27025 + }, + { + "epoch": 4.4094616639477975, + "grad_norm": 0.10336413234472275, + "learning_rate": 0.0009564420810010526, + "loss": 0.1442, + "num_input_tokens_seen": 58410464, + "step": 27030 + }, + { + "epoch": 4.410277324632952, + "grad_norm": 0.03497812896966934, + "learning_rate": 0.0009564130194413061, + "loss": 0.0291, + "num_input_tokens_seen": 58422560, + "step": 27035 + }, + { + "epoch": 4.411092985318108, + "grad_norm": 0.008337237872183323, + "learning_rate": 0.0009563839486317709, + "loss": 0.1733, + "num_input_tokens_seen": 58432576, + "step": 27040 + }, + { + "epoch": 4.411908646003263, + "grad_norm": 0.12369904667139053, + "learning_rate": 0.000956354868573036, + "loss": 0.1701, + "num_input_tokens_seen": 58443488, + "step": 27045 + }, + { + "epoch": 4.412724306688418, + "grad_norm": 0.1232222467660904, + "learning_rate": 0.0009563257792656908, + "loss": 0.1001, + "num_input_tokens_seen": 58454560, + "step": 27050 + }, + { + "epoch": 4.4135399673735725, + "grad_norm": 0.23827281594276428, + "learning_rate": 0.0009562966807103246, + "loss": 0.1455, + "num_input_tokens_seen": 58465568, + "step": 27055 + }, + { + "epoch": 4.414355628058727, + "grad_norm": 0.06323964148759842, + "learning_rate": 0.0009562675729075274, + "loss": 0.037, + "num_input_tokens_seen": 58477056, + "step": 27060 + }, + { + "epoch": 4.415171288743883, + "grad_norm": 0.17134052515029907, + "learning_rate": 0.0009562384558578891, + "loss": 0.1215, + "num_input_tokens_seen": 58487904, + "step": 27065 + }, + { + "epoch": 4.415986949429038, + "grad_norm": 0.06725971400737762, + "learning_rate": 0.0009562093295619996, + "loss": 0.2767, + "num_input_tokens_seen": 58500384, + "step": 27070 + }, + { + "epoch": 4.416802610114193, + "grad_norm": 0.014682374894618988, + "learning_rate": 0.0009561801940204493, + "loss": 0.0501, + "num_input_tokens_seen": 58511104, + "step": 27075 + }, + { + "epoch": 4.417618270799347, + "grad_norm": 0.005113726481795311, + "learning_rate": 0.0009561510492338287, + "loss": 0.1159, + "num_input_tokens_seen": 58522528, + "step": 27080 + }, + { + "epoch": 4.418433931484502, + "grad_norm": 0.051373984664678574, + "learning_rate": 0.0009561218952027286, + "loss": 0.1983, + "num_input_tokens_seen": 58533184, + "step": 27085 + }, + { + "epoch": 4.419249592169657, + "grad_norm": 0.09865567088127136, + "learning_rate": 0.0009560927319277395, + "loss": 0.0901, + "num_input_tokens_seen": 58543328, + "step": 27090 + }, + { + "epoch": 4.420065252854813, + "grad_norm": 0.12814849615097046, + "learning_rate": 0.0009560635594094524, + "loss": 0.1032, + "num_input_tokens_seen": 58554784, + "step": 27095 + }, + { + "epoch": 4.420880913539968, + "grad_norm": 0.26408764719963074, + "learning_rate": 0.000956034377648459, + "loss": 0.1338, + "num_input_tokens_seen": 58565280, + "step": 27100 + }, + { + "epoch": 4.421696574225122, + "grad_norm": 0.08768882602453232, + "learning_rate": 0.0009560051866453503, + "loss": 0.1002, + "num_input_tokens_seen": 58575456, + "step": 27105 + }, + { + "epoch": 4.422512234910277, + "grad_norm": 0.020314859226346016, + "learning_rate": 0.000955975986400718, + "loss": 0.0671, + "num_input_tokens_seen": 58586176, + "step": 27110 + }, + { + "epoch": 4.423327895595432, + "grad_norm": 0.12767770886421204, + "learning_rate": 0.000955946776915154, + "loss": 0.1762, + "num_input_tokens_seen": 58596864, + "step": 27115 + }, + { + "epoch": 4.424143556280587, + "grad_norm": 0.009916570968925953, + "learning_rate": 0.00095591755818925, + "loss": 0.1992, + "num_input_tokens_seen": 58607680, + "step": 27120 + }, + { + "epoch": 4.424959216965743, + "grad_norm": 0.05374537780880928, + "learning_rate": 0.0009558883302235984, + "loss": 0.1024, + "num_input_tokens_seen": 58618400, + "step": 27125 + }, + { + "epoch": 4.425774877650897, + "grad_norm": 0.03968321159482002, + "learning_rate": 0.0009558590930187913, + "loss": 0.0513, + "num_input_tokens_seen": 58629152, + "step": 27130 + }, + { + "epoch": 4.426590538336052, + "grad_norm": 0.05687025561928749, + "learning_rate": 0.0009558298465754216, + "loss": 0.0983, + "num_input_tokens_seen": 58639744, + "step": 27135 + }, + { + "epoch": 4.427406199021207, + "grad_norm": 0.022792354226112366, + "learning_rate": 0.0009558005908940816, + "loss": 0.0336, + "num_input_tokens_seen": 58649536, + "step": 27140 + }, + { + "epoch": 4.428221859706362, + "grad_norm": 0.1321583092212677, + "learning_rate": 0.0009557713259753647, + "loss": 0.0354, + "num_input_tokens_seen": 58660384, + "step": 27145 + }, + { + "epoch": 4.4290375203915175, + "grad_norm": 0.022462155669927597, + "learning_rate": 0.0009557420518198634, + "loss": 0.1426, + "num_input_tokens_seen": 58671136, + "step": 27150 + }, + { + "epoch": 4.429853181076672, + "grad_norm": 0.2902876138687134, + "learning_rate": 0.0009557127684281714, + "loss": 0.1398, + "num_input_tokens_seen": 58683424, + "step": 27155 + }, + { + "epoch": 4.430668841761827, + "grad_norm": 0.02551012486219406, + "learning_rate": 0.000955683475800882, + "loss": 0.1193, + "num_input_tokens_seen": 58693600, + "step": 27160 + }, + { + "epoch": 4.431484502446982, + "grad_norm": 0.10253246873617172, + "learning_rate": 0.0009556541739385889, + "loss": 0.1524, + "num_input_tokens_seen": 58704064, + "step": 27165 + }, + { + "epoch": 4.432300163132137, + "grad_norm": 0.218844935297966, + "learning_rate": 0.000955624862841886, + "loss": 0.1446, + "num_input_tokens_seen": 58713632, + "step": 27170 + }, + { + "epoch": 4.433115823817292, + "grad_norm": 0.2918366491794586, + "learning_rate": 0.0009555955425113672, + "loss": 0.1021, + "num_input_tokens_seen": 58724352, + "step": 27175 + }, + { + "epoch": 4.433931484502447, + "grad_norm": 0.06699206680059433, + "learning_rate": 0.0009555662129476266, + "loss": 0.0543, + "num_input_tokens_seen": 58735392, + "step": 27180 + }, + { + "epoch": 4.434747145187602, + "grad_norm": 0.046586476266384125, + "learning_rate": 0.0009555368741512589, + "loss": 0.0569, + "num_input_tokens_seen": 58744928, + "step": 27185 + }, + { + "epoch": 4.435562805872757, + "grad_norm": 0.041165731847286224, + "learning_rate": 0.0009555075261228586, + "loss": 0.0791, + "num_input_tokens_seen": 58755520, + "step": 27190 + }, + { + "epoch": 4.436378466557912, + "grad_norm": 0.013963720761239529, + "learning_rate": 0.0009554781688630204, + "loss": 0.0346, + "num_input_tokens_seen": 58765984, + "step": 27195 + }, + { + "epoch": 4.437194127243067, + "grad_norm": 0.20940876007080078, + "learning_rate": 0.0009554488023723394, + "loss": 0.0938, + "num_input_tokens_seen": 58777760, + "step": 27200 + }, + { + "epoch": 4.438009787928221, + "grad_norm": 0.08399229496717453, + "learning_rate": 0.0009554194266514105, + "loss": 0.0483, + "num_input_tokens_seen": 58789152, + "step": 27205 + }, + { + "epoch": 4.438825448613377, + "grad_norm": 0.013586705550551414, + "learning_rate": 0.0009553900417008292, + "loss": 0.0297, + "num_input_tokens_seen": 58800096, + "step": 27210 + }, + { + "epoch": 4.439641109298532, + "grad_norm": 0.2480028122663498, + "learning_rate": 0.000955360647521191, + "loss": 0.0942, + "num_input_tokens_seen": 58809472, + "step": 27215 + }, + { + "epoch": 4.440456769983687, + "grad_norm": 0.019358092918992043, + "learning_rate": 0.0009553312441130916, + "loss": 0.1742, + "num_input_tokens_seen": 58820672, + "step": 27220 + }, + { + "epoch": 4.441272430668842, + "grad_norm": 0.01376861147582531, + "learning_rate": 0.0009553018314771269, + "loss": 0.0367, + "num_input_tokens_seen": 58831360, + "step": 27225 + }, + { + "epoch": 4.442088091353996, + "grad_norm": 0.2584722340106964, + "learning_rate": 0.0009552724096138931, + "loss": 0.0807, + "num_input_tokens_seen": 58841664, + "step": 27230 + }, + { + "epoch": 4.442903752039152, + "grad_norm": 0.0029593873769044876, + "learning_rate": 0.0009552429785239863, + "loss": 0.1452, + "num_input_tokens_seen": 58852736, + "step": 27235 + }, + { + "epoch": 4.443719412724307, + "grad_norm": 0.2969488501548767, + "learning_rate": 0.0009552135382080029, + "loss": 0.0455, + "num_input_tokens_seen": 58863520, + "step": 27240 + }, + { + "epoch": 4.444535073409462, + "grad_norm": 0.21504615247249603, + "learning_rate": 0.0009551840886665398, + "loss": 0.1008, + "num_input_tokens_seen": 58874080, + "step": 27245 + }, + { + "epoch": 4.445350734094617, + "grad_norm": 0.019169187173247337, + "learning_rate": 0.0009551546299001938, + "loss": 0.0939, + "num_input_tokens_seen": 58884864, + "step": 27250 + }, + { + "epoch": 4.446166394779771, + "grad_norm": 0.05577266216278076, + "learning_rate": 0.0009551251619095616, + "loss": 0.2479, + "num_input_tokens_seen": 58895904, + "step": 27255 + }, + { + "epoch": 4.446982055464926, + "grad_norm": 0.06436615437269211, + "learning_rate": 0.0009550956846952408, + "loss": 0.2503, + "num_input_tokens_seen": 58905984, + "step": 27260 + }, + { + "epoch": 4.447797716150082, + "grad_norm": 0.014609013684093952, + "learning_rate": 0.0009550661982578286, + "loss": 0.1163, + "num_input_tokens_seen": 58917536, + "step": 27265 + }, + { + "epoch": 4.448613376835237, + "grad_norm": 0.09037666767835617, + "learning_rate": 0.0009550367025979225, + "loss": 0.1394, + "num_input_tokens_seen": 58928224, + "step": 27270 + }, + { + "epoch": 4.4494290375203915, + "grad_norm": 0.020922953262925148, + "learning_rate": 0.0009550071977161203, + "loss": 0.0755, + "num_input_tokens_seen": 58939488, + "step": 27275 + }, + { + "epoch": 4.450244698205546, + "grad_norm": 0.053212665021419525, + "learning_rate": 0.0009549776836130202, + "loss": 0.1124, + "num_input_tokens_seen": 58951520, + "step": 27280 + }, + { + "epoch": 4.451060358890701, + "grad_norm": 0.04497351124882698, + "learning_rate": 0.0009549481602892201, + "loss": 0.0697, + "num_input_tokens_seen": 58962048, + "step": 27285 + }, + { + "epoch": 4.451876019575856, + "grad_norm": 0.2501903176307678, + "learning_rate": 0.0009549186277453184, + "loss": 0.1667, + "num_input_tokens_seen": 58973696, + "step": 27290 + }, + { + "epoch": 4.452691680261012, + "grad_norm": 0.7072194218635559, + "learning_rate": 0.0009548890859819138, + "loss": 0.0931, + "num_input_tokens_seen": 58983840, + "step": 27295 + }, + { + "epoch": 4.4535073409461665, + "grad_norm": 0.06406120955944061, + "learning_rate": 0.0009548595349996045, + "loss": 0.0534, + "num_input_tokens_seen": 58993952, + "step": 27300 + }, + { + "epoch": 4.454323001631321, + "grad_norm": 0.038360778242349625, + "learning_rate": 0.0009548299747989897, + "loss": 0.0509, + "num_input_tokens_seen": 59004576, + "step": 27305 + }, + { + "epoch": 4.455138662316476, + "grad_norm": 0.019249223172664642, + "learning_rate": 0.0009548004053806686, + "loss": 0.0643, + "num_input_tokens_seen": 59014464, + "step": 27310 + }, + { + "epoch": 4.455954323001631, + "grad_norm": 0.16556453704833984, + "learning_rate": 0.0009547708267452403, + "loss": 0.0886, + "num_input_tokens_seen": 59024352, + "step": 27315 + }, + { + "epoch": 4.456769983686787, + "grad_norm": 0.11377817392349243, + "learning_rate": 0.0009547412388933042, + "loss": 0.0644, + "num_input_tokens_seen": 59035488, + "step": 27320 + }, + { + "epoch": 4.4575856443719415, + "grad_norm": 0.019814448431134224, + "learning_rate": 0.0009547116418254601, + "loss": 0.0831, + "num_input_tokens_seen": 59047072, + "step": 27325 + }, + { + "epoch": 4.458401305057096, + "grad_norm": 0.11277088522911072, + "learning_rate": 0.0009546820355423077, + "loss": 0.1107, + "num_input_tokens_seen": 59058144, + "step": 27330 + }, + { + "epoch": 4.459216965742251, + "grad_norm": 0.005291223991662264, + "learning_rate": 0.0009546524200444471, + "loss": 0.1291, + "num_input_tokens_seen": 59068576, + "step": 27335 + }, + { + "epoch": 4.460032626427406, + "grad_norm": 0.023128168657422066, + "learning_rate": 0.0009546227953324784, + "loss": 0.1454, + "num_input_tokens_seen": 59079808, + "step": 27340 + }, + { + "epoch": 4.460848287112561, + "grad_norm": 0.3094649016857147, + "learning_rate": 0.000954593161407002, + "loss": 0.3053, + "num_input_tokens_seen": 59090432, + "step": 27345 + }, + { + "epoch": 4.4616639477977165, + "grad_norm": 0.07266154885292053, + "learning_rate": 0.0009545635182686185, + "loss": 0.1429, + "num_input_tokens_seen": 59100736, + "step": 27350 + }, + { + "epoch": 4.462479608482871, + "grad_norm": 0.20185202360153198, + "learning_rate": 0.0009545338659179286, + "loss": 0.0734, + "num_input_tokens_seen": 59111776, + "step": 27355 + }, + { + "epoch": 4.463295269168026, + "grad_norm": 0.05168718472123146, + "learning_rate": 0.0009545042043555334, + "loss": 0.0698, + "num_input_tokens_seen": 59123936, + "step": 27360 + }, + { + "epoch": 4.464110929853181, + "grad_norm": 0.017242785543203354, + "learning_rate": 0.000954474533582034, + "loss": 0.0958, + "num_input_tokens_seen": 59135200, + "step": 27365 + }, + { + "epoch": 4.464926590538336, + "grad_norm": 0.03838271647691727, + "learning_rate": 0.0009544448535980315, + "loss": 0.1033, + "num_input_tokens_seen": 59146592, + "step": 27370 + }, + { + "epoch": 4.465742251223491, + "grad_norm": 0.0059380610473454, + "learning_rate": 0.0009544151644041275, + "loss": 0.0748, + "num_input_tokens_seen": 59157472, + "step": 27375 + }, + { + "epoch": 4.466557911908646, + "grad_norm": 0.05592265725135803, + "learning_rate": 0.0009543854660009237, + "loss": 0.0528, + "num_input_tokens_seen": 59169216, + "step": 27380 + }, + { + "epoch": 4.467373572593801, + "grad_norm": 0.027988268062472343, + "learning_rate": 0.0009543557583890221, + "loss": 0.0278, + "num_input_tokens_seen": 59180800, + "step": 27385 + }, + { + "epoch": 4.468189233278956, + "grad_norm": 0.5290764570236206, + "learning_rate": 0.0009543260415690247, + "loss": 0.2604, + "num_input_tokens_seen": 59192736, + "step": 27390 + }, + { + "epoch": 4.469004893964111, + "grad_norm": 0.3840823173522949, + "learning_rate": 0.0009542963155415336, + "loss": 0.3098, + "num_input_tokens_seen": 59202208, + "step": 27395 + }, + { + "epoch": 4.4698205546492655, + "grad_norm": 0.09972764551639557, + "learning_rate": 0.0009542665803071515, + "loss": 0.117, + "num_input_tokens_seen": 59213312, + "step": 27400 + }, + { + "epoch": 4.470636215334421, + "grad_norm": 0.028418170288205147, + "learning_rate": 0.0009542368358664806, + "loss": 0.0422, + "num_input_tokens_seen": 59223936, + "step": 27405 + }, + { + "epoch": 4.471451876019576, + "grad_norm": 0.005448495969176292, + "learning_rate": 0.0009542070822201244, + "loss": 0.0976, + "num_input_tokens_seen": 59234176, + "step": 27410 + }, + { + "epoch": 4.472267536704731, + "grad_norm": 0.02386247180402279, + "learning_rate": 0.0009541773193686851, + "loss": 0.1321, + "num_input_tokens_seen": 59245952, + "step": 27415 + }, + { + "epoch": 4.473083197389886, + "grad_norm": 0.016697218641638756, + "learning_rate": 0.0009541475473127664, + "loss": 0.2068, + "num_input_tokens_seen": 59257216, + "step": 27420 + }, + { + "epoch": 4.4738988580750405, + "grad_norm": 0.02813301980495453, + "learning_rate": 0.0009541177660529715, + "loss": 0.1181, + "num_input_tokens_seen": 59267936, + "step": 27425 + }, + { + "epoch": 4.474714518760196, + "grad_norm": 0.1297505646944046, + "learning_rate": 0.0009540879755899041, + "loss": 0.1286, + "num_input_tokens_seen": 59279232, + "step": 27430 + }, + { + "epoch": 4.475530179445351, + "grad_norm": 0.12573681771755219, + "learning_rate": 0.0009540581759241676, + "loss": 0.1628, + "num_input_tokens_seen": 59288800, + "step": 27435 + }, + { + "epoch": 4.476345840130506, + "grad_norm": 0.2438529133796692, + "learning_rate": 0.0009540283670563663, + "loss": 0.2941, + "num_input_tokens_seen": 59299904, + "step": 27440 + }, + { + "epoch": 4.477161500815661, + "grad_norm": 0.08795158565044403, + "learning_rate": 0.0009539985489871041, + "loss": 0.0748, + "num_input_tokens_seen": 59310592, + "step": 27445 + }, + { + "epoch": 4.4779771615008155, + "grad_norm": 0.08853715658187866, + "learning_rate": 0.0009539687217169855, + "loss": 0.0883, + "num_input_tokens_seen": 59320768, + "step": 27450 + }, + { + "epoch": 4.47879282218597, + "grad_norm": 0.1433527171611786, + "learning_rate": 0.0009539388852466146, + "loss": 0.0713, + "num_input_tokens_seen": 59332992, + "step": 27455 + }, + { + "epoch": 4.479608482871126, + "grad_norm": 0.04177452251315117, + "learning_rate": 0.0009539090395765966, + "loss": 0.0501, + "num_input_tokens_seen": 59342944, + "step": 27460 + }, + { + "epoch": 4.480424143556281, + "grad_norm": 0.0390019528567791, + "learning_rate": 0.000953879184707536, + "loss": 0.1618, + "num_input_tokens_seen": 59354016, + "step": 27465 + }, + { + "epoch": 4.481239804241436, + "grad_norm": 0.06062839925289154, + "learning_rate": 0.0009538493206400378, + "loss": 0.0576, + "num_input_tokens_seen": 59364064, + "step": 27470 + }, + { + "epoch": 4.4820554649265905, + "grad_norm": 0.10372152924537659, + "learning_rate": 0.0009538194473747077, + "loss": 0.0856, + "num_input_tokens_seen": 59374336, + "step": 27475 + }, + { + "epoch": 4.482871125611745, + "grad_norm": 0.1782025396823883, + "learning_rate": 0.0009537895649121504, + "loss": 0.069, + "num_input_tokens_seen": 59385056, + "step": 27480 + }, + { + "epoch": 4.4836867862969, + "grad_norm": 0.044130004942417145, + "learning_rate": 0.0009537596732529721, + "loss": 0.0647, + "num_input_tokens_seen": 59393952, + "step": 27485 + }, + { + "epoch": 4.484502446982056, + "grad_norm": 0.20328426361083984, + "learning_rate": 0.0009537297723977784, + "loss": 0.1263, + "num_input_tokens_seen": 59405824, + "step": 27490 + }, + { + "epoch": 4.485318107667211, + "grad_norm": 0.17632223665714264, + "learning_rate": 0.0009536998623471752, + "loss": 0.1787, + "num_input_tokens_seen": 59415968, + "step": 27495 + }, + { + "epoch": 4.486133768352365, + "grad_norm": 0.2297552525997162, + "learning_rate": 0.0009536699431017688, + "loss": 0.1442, + "num_input_tokens_seen": 59426336, + "step": 27500 + }, + { + "epoch": 4.48694942903752, + "grad_norm": 0.0059665110893547535, + "learning_rate": 0.0009536400146621653, + "loss": 0.0369, + "num_input_tokens_seen": 59436864, + "step": 27505 + }, + { + "epoch": 4.487765089722675, + "grad_norm": 0.20988836884498596, + "learning_rate": 0.0009536100770289717, + "loss": 0.1199, + "num_input_tokens_seen": 59448128, + "step": 27510 + }, + { + "epoch": 4.488580750407831, + "grad_norm": 0.008611970581114292, + "learning_rate": 0.0009535801302027942, + "loss": 0.0509, + "num_input_tokens_seen": 59458432, + "step": 27515 + }, + { + "epoch": 4.489396411092986, + "grad_norm": 0.21014344692230225, + "learning_rate": 0.0009535501741842401, + "loss": 0.2112, + "num_input_tokens_seen": 59468160, + "step": 27520 + }, + { + "epoch": 4.49021207177814, + "grad_norm": 0.02096942998468876, + "learning_rate": 0.0009535202089739162, + "loss": 0.0287, + "num_input_tokens_seen": 59478144, + "step": 27525 + }, + { + "epoch": 4.491027732463295, + "grad_norm": 0.004998327232897282, + "learning_rate": 0.0009534902345724301, + "loss": 0.0732, + "num_input_tokens_seen": 59489792, + "step": 27530 + }, + { + "epoch": 4.49184339314845, + "grad_norm": 0.016319639980793, + "learning_rate": 0.000953460250980389, + "loss": 0.0483, + "num_input_tokens_seen": 59501376, + "step": 27535 + }, + { + "epoch": 4.492659053833605, + "grad_norm": 0.47428780794143677, + "learning_rate": 0.0009534302581984007, + "loss": 0.1153, + "num_input_tokens_seen": 59511936, + "step": 27540 + }, + { + "epoch": 4.493474714518761, + "grad_norm": 0.07189547270536423, + "learning_rate": 0.000953400256227073, + "loss": 0.0709, + "num_input_tokens_seen": 59522912, + "step": 27545 + }, + { + "epoch": 4.494290375203915, + "grad_norm": 0.008183644153177738, + "learning_rate": 0.0009533702450670138, + "loss": 0.0362, + "num_input_tokens_seen": 59533632, + "step": 27550 + }, + { + "epoch": 4.49510603588907, + "grad_norm": 0.013985889032483101, + "learning_rate": 0.0009533402247188317, + "loss": 0.1478, + "num_input_tokens_seen": 59545024, + "step": 27555 + }, + { + "epoch": 4.495921696574225, + "grad_norm": 0.011940013617277145, + "learning_rate": 0.0009533101951831347, + "loss": 0.0556, + "num_input_tokens_seen": 59555616, + "step": 27560 + }, + { + "epoch": 4.49673735725938, + "grad_norm": 0.006115328054875135, + "learning_rate": 0.0009532801564605315, + "loss": 0.0559, + "num_input_tokens_seen": 59567136, + "step": 27565 + }, + { + "epoch": 4.497553017944535, + "grad_norm": 0.16617755591869354, + "learning_rate": 0.000953250108551631, + "loss": 0.2774, + "num_input_tokens_seen": 59580000, + "step": 27570 + }, + { + "epoch": 4.49836867862969, + "grad_norm": 0.05506671965122223, + "learning_rate": 0.0009532200514570419, + "loss": 0.0339, + "num_input_tokens_seen": 59591008, + "step": 27575 + }, + { + "epoch": 4.499184339314845, + "grad_norm": 0.008953276090323925, + "learning_rate": 0.0009531899851773737, + "loss": 0.0657, + "num_input_tokens_seen": 59601920, + "step": 27580 + }, + { + "epoch": 4.5, + "grad_norm": 0.11161115020513535, + "learning_rate": 0.0009531599097132354, + "loss": 0.0558, + "num_input_tokens_seen": 59611968, + "step": 27585 + }, + { + "epoch": 4.500815660685155, + "grad_norm": 0.015438306145370007, + "learning_rate": 0.0009531298250652367, + "loss": 0.046, + "num_input_tokens_seen": 59623328, + "step": 27590 + }, + { + "epoch": 4.50163132137031, + "grad_norm": 0.018507203087210655, + "learning_rate": 0.0009530997312339873, + "loss": 0.1303, + "num_input_tokens_seen": 59634208, + "step": 27595 + }, + { + "epoch": 4.502446982055465, + "grad_norm": 0.24117828905582428, + "learning_rate": 0.000953069628220097, + "loss": 0.1026, + "num_input_tokens_seen": 59645472, + "step": 27600 + }, + { + "epoch": 4.50326264274062, + "grad_norm": 0.06984904408454895, + "learning_rate": 0.0009530395160241759, + "loss": 0.0374, + "num_input_tokens_seen": 59656960, + "step": 27605 + }, + { + "epoch": 4.504078303425775, + "grad_norm": 0.011626221239566803, + "learning_rate": 0.0009530093946468343, + "loss": 0.0235, + "num_input_tokens_seen": 59668224, + "step": 27610 + }, + { + "epoch": 4.50489396411093, + "grad_norm": 0.06135258078575134, + "learning_rate": 0.0009529792640886827, + "loss": 0.1165, + "num_input_tokens_seen": 59679168, + "step": 27615 + }, + { + "epoch": 4.505709624796085, + "grad_norm": 0.041614778339862823, + "learning_rate": 0.0009529491243503316, + "loss": 0.0838, + "num_input_tokens_seen": 59689376, + "step": 27620 + }, + { + "epoch": 4.506525285481239, + "grad_norm": 0.18257805705070496, + "learning_rate": 0.000952918975432392, + "loss": 0.2499, + "num_input_tokens_seen": 59700256, + "step": 27625 + }, + { + "epoch": 4.507340946166395, + "grad_norm": 0.059920214116573334, + "learning_rate": 0.0009528888173354746, + "loss": 0.093, + "num_input_tokens_seen": 59711648, + "step": 27630 + }, + { + "epoch": 4.50815660685155, + "grad_norm": 0.16995491087436676, + "learning_rate": 0.000952858650060191, + "loss": 0.2813, + "num_input_tokens_seen": 59721152, + "step": 27635 + }, + { + "epoch": 4.508972267536705, + "grad_norm": 0.0451311320066452, + "learning_rate": 0.0009528284736071522, + "loss": 0.096, + "num_input_tokens_seen": 59731264, + "step": 27640 + }, + { + "epoch": 4.50978792822186, + "grad_norm": 0.21006529033184052, + "learning_rate": 0.00095279828797697, + "loss": 0.2281, + "num_input_tokens_seen": 59741504, + "step": 27645 + }, + { + "epoch": 4.510603588907014, + "grad_norm": 0.016192087903618813, + "learning_rate": 0.000952768093170256, + "loss": 0.0549, + "num_input_tokens_seen": 59751904, + "step": 27650 + }, + { + "epoch": 4.511419249592169, + "grad_norm": 0.17167527973651886, + "learning_rate": 0.0009527378891876223, + "loss": 0.2543, + "num_input_tokens_seen": 59762976, + "step": 27655 + }, + { + "epoch": 4.512234910277325, + "grad_norm": 0.04856706038117409, + "learning_rate": 0.0009527076760296809, + "loss": 0.131, + "num_input_tokens_seen": 59773664, + "step": 27660 + }, + { + "epoch": 4.51305057096248, + "grad_norm": 0.03441392630338669, + "learning_rate": 0.0009526774536970442, + "loss": 0.1098, + "num_input_tokens_seen": 59783168, + "step": 27665 + }, + { + "epoch": 4.513866231647635, + "grad_norm": 0.04515177384018898, + "learning_rate": 0.0009526472221903247, + "loss": 0.088, + "num_input_tokens_seen": 59794688, + "step": 27670 + }, + { + "epoch": 4.514681892332789, + "grad_norm": 0.014667819254100323, + "learning_rate": 0.0009526169815101349, + "loss": 0.0503, + "num_input_tokens_seen": 59804832, + "step": 27675 + }, + { + "epoch": 4.515497553017944, + "grad_norm": 0.09431620687246323, + "learning_rate": 0.0009525867316570877, + "loss": 0.145, + "num_input_tokens_seen": 59814944, + "step": 27680 + }, + { + "epoch": 4.5163132137031, + "grad_norm": 0.023899326100945473, + "learning_rate": 0.0009525564726317963, + "loss": 0.0793, + "num_input_tokens_seen": 59826176, + "step": 27685 + }, + { + "epoch": 4.517128874388255, + "grad_norm": 0.04530876874923706, + "learning_rate": 0.000952526204434874, + "loss": 0.038, + "num_input_tokens_seen": 59838240, + "step": 27690 + }, + { + "epoch": 4.5179445350734095, + "grad_norm": 0.026498528197407722, + "learning_rate": 0.000952495927066934, + "loss": 0.0548, + "num_input_tokens_seen": 59848032, + "step": 27695 + }, + { + "epoch": 4.518760195758564, + "grad_norm": 0.027610991150140762, + "learning_rate": 0.00095246564052859, + "loss": 0.0211, + "num_input_tokens_seen": 59859328, + "step": 27700 + }, + { + "epoch": 4.519575856443719, + "grad_norm": 0.0709771141409874, + "learning_rate": 0.0009524353448204558, + "loss": 0.0723, + "num_input_tokens_seen": 59870272, + "step": 27705 + }, + { + "epoch": 4.520391517128875, + "grad_norm": 0.008169763721525669, + "learning_rate": 0.0009524050399431454, + "loss": 0.0665, + "num_input_tokens_seen": 59880416, + "step": 27710 + }, + { + "epoch": 4.52120717781403, + "grad_norm": 0.09817571938037872, + "learning_rate": 0.0009523747258972729, + "loss": 0.11, + "num_input_tokens_seen": 59891168, + "step": 27715 + }, + { + "epoch": 4.5220228384991845, + "grad_norm": 0.11686165630817413, + "learning_rate": 0.0009523444026834528, + "loss": 0.2663, + "num_input_tokens_seen": 59901600, + "step": 27720 + }, + { + "epoch": 4.522838499184339, + "grad_norm": 0.03325748071074486, + "learning_rate": 0.0009523140703022995, + "loss": 0.0841, + "num_input_tokens_seen": 59912832, + "step": 27725 + }, + { + "epoch": 4.523654159869494, + "grad_norm": 0.26222917437553406, + "learning_rate": 0.0009522837287544277, + "loss": 0.1265, + "num_input_tokens_seen": 59924288, + "step": 27730 + }, + { + "epoch": 4.524469820554649, + "grad_norm": 0.04080116003751755, + "learning_rate": 0.0009522533780404526, + "loss": 0.0865, + "num_input_tokens_seen": 59935072, + "step": 27735 + }, + { + "epoch": 4.525285481239804, + "grad_norm": 0.02146266959607601, + "learning_rate": 0.0009522230181609888, + "loss": 0.1188, + "num_input_tokens_seen": 59945856, + "step": 27740 + }, + { + "epoch": 4.5261011419249595, + "grad_norm": 0.008499844931066036, + "learning_rate": 0.000952192649116652, + "loss": 0.0873, + "num_input_tokens_seen": 59957152, + "step": 27745 + }, + { + "epoch": 4.526916802610114, + "grad_norm": 0.06477084755897522, + "learning_rate": 0.0009521622709080574, + "loss": 0.1054, + "num_input_tokens_seen": 59968000, + "step": 27750 + }, + { + "epoch": 4.527732463295269, + "grad_norm": 0.03540358319878578, + "learning_rate": 0.0009521318835358208, + "loss": 0.0768, + "num_input_tokens_seen": 59978496, + "step": 27755 + }, + { + "epoch": 4.528548123980424, + "grad_norm": 0.01290238369256258, + "learning_rate": 0.000952101487000558, + "loss": 0.1064, + "num_input_tokens_seen": 59989376, + "step": 27760 + }, + { + "epoch": 4.529363784665579, + "grad_norm": 0.007645154371857643, + "learning_rate": 0.0009520710813028852, + "loss": 0.0936, + "num_input_tokens_seen": 59999296, + "step": 27765 + }, + { + "epoch": 4.5301794453507345, + "grad_norm": 0.18740314245224, + "learning_rate": 0.0009520406664434183, + "loss": 0.2108, + "num_input_tokens_seen": 60008928, + "step": 27770 + }, + { + "epoch": 4.530995106035889, + "grad_norm": 0.1460702270269394, + "learning_rate": 0.0009520102424227739, + "loss": 0.1085, + "num_input_tokens_seen": 60020640, + "step": 27775 + }, + { + "epoch": 4.531810766721044, + "grad_norm": 0.26850706338882446, + "learning_rate": 0.0009519798092415683, + "loss": 0.128, + "num_input_tokens_seen": 60031904, + "step": 27780 + }, + { + "epoch": 4.532626427406199, + "grad_norm": 0.23345030844211578, + "learning_rate": 0.0009519493669004189, + "loss": 0.0953, + "num_input_tokens_seen": 60041440, + "step": 27785 + }, + { + "epoch": 4.533442088091354, + "grad_norm": 0.12963812053203583, + "learning_rate": 0.0009519189153999419, + "loss": 0.0624, + "num_input_tokens_seen": 60052256, + "step": 27790 + }, + { + "epoch": 4.5342577487765094, + "grad_norm": 0.10406608879566193, + "learning_rate": 0.0009518884547407549, + "loss": 0.0787, + "num_input_tokens_seen": 60062624, + "step": 27795 + }, + { + "epoch": 4.535073409461664, + "grad_norm": 0.16851423680782318, + "learning_rate": 0.0009518579849234752, + "loss": 0.2199, + "num_input_tokens_seen": 60073280, + "step": 27800 + }, + { + "epoch": 4.535889070146819, + "grad_norm": 0.01687563955783844, + "learning_rate": 0.00095182750594872, + "loss": 0.0738, + "num_input_tokens_seen": 60084288, + "step": 27805 + }, + { + "epoch": 4.536704730831974, + "grad_norm": 0.06263386458158493, + "learning_rate": 0.0009517970178171074, + "loss": 0.0758, + "num_input_tokens_seen": 60095200, + "step": 27810 + }, + { + "epoch": 4.537520391517129, + "grad_norm": 0.019296538084745407, + "learning_rate": 0.000951766520529255, + "loss": 0.1504, + "num_input_tokens_seen": 60104960, + "step": 27815 + }, + { + "epoch": 4.5383360522022835, + "grad_norm": 0.009546547196805477, + "learning_rate": 0.0009517360140857809, + "loss": 0.023, + "num_input_tokens_seen": 60116704, + "step": 27820 + }, + { + "epoch": 4.539151712887438, + "grad_norm": 0.09043900668621063, + "learning_rate": 0.0009517054984873035, + "loss": 0.1035, + "num_input_tokens_seen": 60128800, + "step": 27825 + }, + { + "epoch": 4.539967373572594, + "grad_norm": 0.0927947387099266, + "learning_rate": 0.0009516749737344412, + "loss": 0.2508, + "num_input_tokens_seen": 60140960, + "step": 27830 + }, + { + "epoch": 4.540783034257749, + "grad_norm": 0.035468216985464096, + "learning_rate": 0.0009516444398278125, + "loss": 0.1376, + "num_input_tokens_seen": 60151616, + "step": 27835 + }, + { + "epoch": 4.541598694942904, + "grad_norm": 0.01672246865928173, + "learning_rate": 0.0009516138967680363, + "loss": 0.1912, + "num_input_tokens_seen": 60163392, + "step": 27840 + }, + { + "epoch": 4.5424143556280585, + "grad_norm": 0.018259450793266296, + "learning_rate": 0.0009515833445557314, + "loss": 0.1607, + "num_input_tokens_seen": 60174848, + "step": 27845 + }, + { + "epoch": 4.543230016313213, + "grad_norm": 0.038537368178367615, + "learning_rate": 0.0009515527831915174, + "loss": 0.0199, + "num_input_tokens_seen": 60184384, + "step": 27850 + }, + { + "epoch": 4.544045676998369, + "grad_norm": 0.14104370772838593, + "learning_rate": 0.0009515222126760132, + "loss": 0.1638, + "num_input_tokens_seen": 60196064, + "step": 27855 + }, + { + "epoch": 4.544861337683524, + "grad_norm": 0.03136323764920235, + "learning_rate": 0.0009514916330098386, + "loss": 0.1443, + "num_input_tokens_seen": 60205760, + "step": 27860 + }, + { + "epoch": 4.545676998368679, + "grad_norm": 0.05053570494055748, + "learning_rate": 0.0009514610441936133, + "loss": 0.0778, + "num_input_tokens_seen": 60218208, + "step": 27865 + }, + { + "epoch": 4.5464926590538335, + "grad_norm": 0.04487035796046257, + "learning_rate": 0.0009514304462279574, + "loss": 0.0618, + "num_input_tokens_seen": 60230016, + "step": 27870 + }, + { + "epoch": 4.547308319738988, + "grad_norm": 0.00935713853687048, + "learning_rate": 0.0009513998391134906, + "loss": 0.1061, + "num_input_tokens_seen": 60238976, + "step": 27875 + }, + { + "epoch": 4.548123980424144, + "grad_norm": 0.09090052545070648, + "learning_rate": 0.0009513692228508336, + "loss": 0.1155, + "num_input_tokens_seen": 60248864, + "step": 27880 + }, + { + "epoch": 4.548939641109299, + "grad_norm": 0.0810205340385437, + "learning_rate": 0.0009513385974406066, + "loss": 0.1643, + "num_input_tokens_seen": 60259328, + "step": 27885 + }, + { + "epoch": 4.549755301794454, + "grad_norm": 0.2484816461801529, + "learning_rate": 0.0009513079628834305, + "loss": 0.172, + "num_input_tokens_seen": 60269152, + "step": 27890 + }, + { + "epoch": 4.5505709624796085, + "grad_norm": 0.007728502620011568, + "learning_rate": 0.0009512773191799258, + "loss": 0.0496, + "num_input_tokens_seen": 60279616, + "step": 27895 + }, + { + "epoch": 4.551386623164763, + "grad_norm": 0.010481576435267925, + "learning_rate": 0.0009512466663307138, + "loss": 0.1528, + "num_input_tokens_seen": 60291264, + "step": 27900 + }, + { + "epoch": 4.552202283849918, + "grad_norm": 0.10634084790945053, + "learning_rate": 0.0009512160043364157, + "loss": 0.1042, + "num_input_tokens_seen": 60302592, + "step": 27905 + }, + { + "epoch": 4.553017944535073, + "grad_norm": 0.03876103460788727, + "learning_rate": 0.0009511853331976527, + "loss": 0.0544, + "num_input_tokens_seen": 60312992, + "step": 27910 + }, + { + "epoch": 4.553833605220229, + "grad_norm": 0.026083072647452354, + "learning_rate": 0.0009511546529150467, + "loss": 0.1053, + "num_input_tokens_seen": 60324032, + "step": 27915 + }, + { + "epoch": 4.554649265905383, + "grad_norm": 0.05046556144952774, + "learning_rate": 0.0009511239634892195, + "loss": 0.0663, + "num_input_tokens_seen": 60334432, + "step": 27920 + }, + { + "epoch": 4.555464926590538, + "grad_norm": 0.006147427950054407, + "learning_rate": 0.0009510932649207926, + "loss": 0.0284, + "num_input_tokens_seen": 60344576, + "step": 27925 + }, + { + "epoch": 4.556280587275693, + "grad_norm": 0.0055056107230484486, + "learning_rate": 0.0009510625572103886, + "loss": 0.1075, + "num_input_tokens_seen": 60354752, + "step": 27930 + }, + { + "epoch": 4.557096247960848, + "grad_norm": 0.019017960876226425, + "learning_rate": 0.0009510318403586297, + "loss": 0.0389, + "num_input_tokens_seen": 60365728, + "step": 27935 + }, + { + "epoch": 4.557911908646004, + "grad_norm": 0.033328790217638016, + "learning_rate": 0.0009510011143661382, + "loss": 0.0932, + "num_input_tokens_seen": 60375936, + "step": 27940 + }, + { + "epoch": 4.558727569331158, + "grad_norm": 0.10524723678827286, + "learning_rate": 0.0009509703792335371, + "loss": 0.0775, + "num_input_tokens_seen": 60386400, + "step": 27945 + }, + { + "epoch": 4.559543230016313, + "grad_norm": 0.043684348464012146, + "learning_rate": 0.0009509396349614492, + "loss": 0.1881, + "num_input_tokens_seen": 60397952, + "step": 27950 + }, + { + "epoch": 4.560358890701468, + "grad_norm": 0.006683858577162027, + "learning_rate": 0.0009509088815504975, + "loss": 0.1703, + "num_input_tokens_seen": 60409600, + "step": 27955 + }, + { + "epoch": 4.561174551386623, + "grad_norm": 0.022528601810336113, + "learning_rate": 0.0009508781190013053, + "loss": 0.0743, + "num_input_tokens_seen": 60419136, + "step": 27960 + }, + { + "epoch": 4.561990212071779, + "grad_norm": 0.19460022449493408, + "learning_rate": 0.0009508473473144961, + "loss": 0.1594, + "num_input_tokens_seen": 60429504, + "step": 27965 + }, + { + "epoch": 4.562805872756933, + "grad_norm": 0.011187287978827953, + "learning_rate": 0.0009508165664906933, + "loss": 0.037, + "num_input_tokens_seen": 60439936, + "step": 27970 + }, + { + "epoch": 4.563621533442088, + "grad_norm": 0.042578887194395065, + "learning_rate": 0.000950785776530521, + "loss": 0.1405, + "num_input_tokens_seen": 60450336, + "step": 27975 + }, + { + "epoch": 4.564437194127243, + "grad_norm": 0.19036757946014404, + "learning_rate": 0.0009507549774346029, + "loss": 0.0967, + "num_input_tokens_seen": 60459296, + "step": 27980 + }, + { + "epoch": 4.565252854812398, + "grad_norm": 0.02649342082440853, + "learning_rate": 0.0009507241692035635, + "loss": 0.0385, + "num_input_tokens_seen": 60469632, + "step": 27985 + }, + { + "epoch": 4.566068515497553, + "grad_norm": 0.15176327526569366, + "learning_rate": 0.0009506933518380272, + "loss": 0.1407, + "num_input_tokens_seen": 60479488, + "step": 27990 + }, + { + "epoch": 4.566884176182708, + "grad_norm": 0.017907990142703056, + "learning_rate": 0.0009506625253386181, + "loss": 0.2209, + "num_input_tokens_seen": 60489824, + "step": 27995 + }, + { + "epoch": 4.567699836867863, + "grad_norm": 0.2888742983341217, + "learning_rate": 0.0009506316897059614, + "loss": 0.1817, + "num_input_tokens_seen": 60499776, + "step": 28000 + }, + { + "epoch": 4.568515497553018, + "grad_norm": 0.01035986002534628, + "learning_rate": 0.0009506008449406818, + "loss": 0.0258, + "num_input_tokens_seen": 60509952, + "step": 28005 + }, + { + "epoch": 4.569331158238173, + "grad_norm": 0.12834765017032623, + "learning_rate": 0.0009505699910434043, + "loss": 0.1524, + "num_input_tokens_seen": 60520704, + "step": 28010 + }, + { + "epoch": 4.570146818923328, + "grad_norm": 0.1346168965101242, + "learning_rate": 0.0009505391280147545, + "loss": 0.0391, + "num_input_tokens_seen": 60532640, + "step": 28015 + }, + { + "epoch": 4.5709624796084825, + "grad_norm": 0.15639248490333557, + "learning_rate": 0.0009505082558553577, + "loss": 0.169, + "num_input_tokens_seen": 60543360, + "step": 28020 + }, + { + "epoch": 4.571778140293638, + "grad_norm": 0.01339662540704012, + "learning_rate": 0.0009504773745658395, + "loss": 0.0576, + "num_input_tokens_seen": 60554080, + "step": 28025 + }, + { + "epoch": 4.572593800978793, + "grad_norm": 0.0473957359790802, + "learning_rate": 0.0009504464841468259, + "loss": 0.0252, + "num_input_tokens_seen": 60564576, + "step": 28030 + }, + { + "epoch": 4.573409461663948, + "grad_norm": 0.035671476274728775, + "learning_rate": 0.000950415584598943, + "loss": 0.1969, + "num_input_tokens_seen": 60574880, + "step": 28035 + }, + { + "epoch": 4.574225122349103, + "grad_norm": 0.166676864027977, + "learning_rate": 0.0009503846759228167, + "loss": 0.0505, + "num_input_tokens_seen": 60585216, + "step": 28040 + }, + { + "epoch": 4.575040783034257, + "grad_norm": 0.19653046131134033, + "learning_rate": 0.0009503537581190736, + "loss": 0.1203, + "num_input_tokens_seen": 60595840, + "step": 28045 + }, + { + "epoch": 4.575856443719413, + "grad_norm": 0.13469751179218292, + "learning_rate": 0.0009503228311883402, + "loss": 0.0871, + "num_input_tokens_seen": 60606720, + "step": 28050 + }, + { + "epoch": 4.576672104404568, + "grad_norm": 0.0519726388156414, + "learning_rate": 0.0009502918951312436, + "loss": 0.1482, + "num_input_tokens_seen": 60618368, + "step": 28055 + }, + { + "epoch": 4.577487765089723, + "grad_norm": 0.017064429819583893, + "learning_rate": 0.0009502609499484104, + "loss": 0.0481, + "num_input_tokens_seen": 60628352, + "step": 28060 + }, + { + "epoch": 4.578303425774878, + "grad_norm": 0.13285928964614868, + "learning_rate": 0.0009502299956404679, + "loss": 0.1352, + "num_input_tokens_seen": 60639168, + "step": 28065 + }, + { + "epoch": 4.579119086460032, + "grad_norm": 0.01909302920103073, + "learning_rate": 0.0009501990322080433, + "loss": 0.2349, + "num_input_tokens_seen": 60649760, + "step": 28070 + }, + { + "epoch": 4.579934747145187, + "grad_norm": 0.09038439393043518, + "learning_rate": 0.0009501680596517641, + "loss": 0.2203, + "num_input_tokens_seen": 60660000, + "step": 28075 + }, + { + "epoch": 4.580750407830343, + "grad_norm": 0.2496732622385025, + "learning_rate": 0.0009501370779722582, + "loss": 0.1181, + "num_input_tokens_seen": 60670752, + "step": 28080 + }, + { + "epoch": 4.581566068515498, + "grad_norm": 0.050215430557727814, + "learning_rate": 0.0009501060871701534, + "loss": 0.0693, + "num_input_tokens_seen": 60681408, + "step": 28085 + }, + { + "epoch": 4.582381729200653, + "grad_norm": 0.0064918166026473045, + "learning_rate": 0.0009500750872460778, + "loss": 0.1827, + "num_input_tokens_seen": 60691936, + "step": 28090 + }, + { + "epoch": 4.583197389885807, + "grad_norm": 0.01403888314962387, + "learning_rate": 0.0009500440782006594, + "loss": 0.1614, + "num_input_tokens_seen": 60702848, + "step": 28095 + }, + { + "epoch": 4.584013050570962, + "grad_norm": 0.21205657720565796, + "learning_rate": 0.000950013060034527, + "loss": 0.1736, + "num_input_tokens_seen": 60713920, + "step": 28100 + }, + { + "epoch": 4.584828711256117, + "grad_norm": 0.07595892995595932, + "learning_rate": 0.0009499820327483091, + "loss": 0.0665, + "num_input_tokens_seen": 60724992, + "step": 28105 + }, + { + "epoch": 4.585644371941273, + "grad_norm": 0.10927172750234604, + "learning_rate": 0.0009499509963426342, + "loss": 0.0705, + "num_input_tokens_seen": 60735520, + "step": 28110 + }, + { + "epoch": 4.5864600326264275, + "grad_norm": 0.20014338195323944, + "learning_rate": 0.0009499199508181318, + "loss": 0.1299, + "num_input_tokens_seen": 60747072, + "step": 28115 + }, + { + "epoch": 4.587275693311582, + "grad_norm": 0.12331785261631012, + "learning_rate": 0.0009498888961754308, + "loss": 0.0715, + "num_input_tokens_seen": 60756768, + "step": 28120 + }, + { + "epoch": 4.588091353996737, + "grad_norm": 0.29597270488739014, + "learning_rate": 0.0009498578324151606, + "loss": 0.1716, + "num_input_tokens_seen": 60767392, + "step": 28125 + }, + { + "epoch": 4.588907014681892, + "grad_norm": 0.01776668056845665, + "learning_rate": 0.0009498267595379506, + "loss": 0.2, + "num_input_tokens_seen": 60776416, + "step": 28130 + }, + { + "epoch": 4.589722675367048, + "grad_norm": 0.02030642330646515, + "learning_rate": 0.0009497956775444307, + "loss": 0.0556, + "num_input_tokens_seen": 60786688, + "step": 28135 + }, + { + "epoch": 4.5905383360522025, + "grad_norm": 0.14282387495040894, + "learning_rate": 0.0009497645864352309, + "loss": 0.0482, + "num_input_tokens_seen": 60797600, + "step": 28140 + }, + { + "epoch": 4.591353996737357, + "grad_norm": 0.002983206883072853, + "learning_rate": 0.0009497334862109812, + "loss": 0.0327, + "num_input_tokens_seen": 60808192, + "step": 28145 + }, + { + "epoch": 4.592169657422512, + "grad_norm": 0.2608118951320648, + "learning_rate": 0.0009497023768723119, + "loss": 0.1167, + "num_input_tokens_seen": 60818912, + "step": 28150 + }, + { + "epoch": 4.592985318107667, + "grad_norm": 0.12593863904476166, + "learning_rate": 0.0009496712584198532, + "loss": 0.0521, + "num_input_tokens_seen": 60829568, + "step": 28155 + }, + { + "epoch": 4.593800978792823, + "grad_norm": 0.050070811063051224, + "learning_rate": 0.0009496401308542363, + "loss": 0.0943, + "num_input_tokens_seen": 60841440, + "step": 28160 + }, + { + "epoch": 4.5946166394779775, + "grad_norm": 0.01761269010603428, + "learning_rate": 0.0009496089941760915, + "loss": 0.0159, + "num_input_tokens_seen": 60852672, + "step": 28165 + }, + { + "epoch": 4.595432300163132, + "grad_norm": 0.38470232486724854, + "learning_rate": 0.0009495778483860502, + "loss": 0.2002, + "num_input_tokens_seen": 60864384, + "step": 28170 + }, + { + "epoch": 4.596247960848287, + "grad_norm": 0.32193905115127563, + "learning_rate": 0.0009495466934847434, + "loss": 0.1645, + "num_input_tokens_seen": 60874656, + "step": 28175 + }, + { + "epoch": 4.597063621533442, + "grad_norm": 0.04758863151073456, + "learning_rate": 0.0009495155294728026, + "loss": 0.0498, + "num_input_tokens_seen": 60885920, + "step": 28180 + }, + { + "epoch": 4.597879282218597, + "grad_norm": 0.0736926794052124, + "learning_rate": 0.0009494843563508594, + "loss": 0.1112, + "num_input_tokens_seen": 60896992, + "step": 28185 + }, + { + "epoch": 4.598694942903752, + "grad_norm": 0.25502830743789673, + "learning_rate": 0.0009494531741195454, + "loss": 0.0813, + "num_input_tokens_seen": 60908768, + "step": 28190 + }, + { + "epoch": 4.599510603588907, + "grad_norm": 0.057509299367666245, + "learning_rate": 0.0009494219827794928, + "loss": 0.0414, + "num_input_tokens_seen": 60919680, + "step": 28195 + }, + { + "epoch": 4.600326264274062, + "grad_norm": 0.05833054706454277, + "learning_rate": 0.0009493907823313334, + "loss": 0.122, + "num_input_tokens_seen": 60930368, + "step": 28200 + }, + { + "epoch": 4.601141924959217, + "grad_norm": 0.010716291144490242, + "learning_rate": 0.0009493595727756998, + "loss": 0.0178, + "num_input_tokens_seen": 60940896, + "step": 28205 + }, + { + "epoch": 4.601957585644372, + "grad_norm": 0.12371497601270676, + "learning_rate": 0.0009493283541132245, + "loss": 0.1883, + "num_input_tokens_seen": 60951424, + "step": 28210 + }, + { + "epoch": 4.602773246329527, + "grad_norm": 0.16348916292190552, + "learning_rate": 0.0009492971263445401, + "loss": 0.1108, + "num_input_tokens_seen": 60961088, + "step": 28215 + }, + { + "epoch": 4.603588907014682, + "grad_norm": 0.06231715530157089, + "learning_rate": 0.0009492658894702792, + "loss": 0.1384, + "num_input_tokens_seen": 60971424, + "step": 28220 + }, + { + "epoch": 4.604404567699837, + "grad_norm": 0.016711369156837463, + "learning_rate": 0.0009492346434910753, + "loss": 0.1207, + "num_input_tokens_seen": 60981056, + "step": 28225 + }, + { + "epoch": 4.605220228384992, + "grad_norm": 0.053427740931510925, + "learning_rate": 0.0009492033884075615, + "loss": 0.0644, + "num_input_tokens_seen": 60992288, + "step": 28230 + }, + { + "epoch": 4.606035889070147, + "grad_norm": 0.057806648313999176, + "learning_rate": 0.000949172124220371, + "loss": 0.0199, + "num_input_tokens_seen": 61003008, + "step": 28235 + }, + { + "epoch": 4.6068515497553015, + "grad_norm": 0.03352320194244385, + "learning_rate": 0.0009491408509301378, + "loss": 0.195, + "num_input_tokens_seen": 61011904, + "step": 28240 + }, + { + "epoch": 4.607667210440457, + "grad_norm": 0.017068684101104736, + "learning_rate": 0.0009491095685374954, + "loss": 0.0537, + "num_input_tokens_seen": 61022976, + "step": 28245 + }, + { + "epoch": 4.608482871125612, + "grad_norm": 0.11282926797866821, + "learning_rate": 0.0009490782770430777, + "loss": 0.0291, + "num_input_tokens_seen": 61034976, + "step": 28250 + }, + { + "epoch": 4.609298531810767, + "grad_norm": 0.13886576890945435, + "learning_rate": 0.0009490469764475191, + "loss": 0.0539, + "num_input_tokens_seen": 61045696, + "step": 28255 + }, + { + "epoch": 4.610114192495922, + "grad_norm": 0.2587529420852661, + "learning_rate": 0.0009490156667514541, + "loss": 0.0959, + "num_input_tokens_seen": 61056192, + "step": 28260 + }, + { + "epoch": 4.6109298531810765, + "grad_norm": 0.013154418207705021, + "learning_rate": 0.0009489843479555167, + "loss": 0.0607, + "num_input_tokens_seen": 61065856, + "step": 28265 + }, + { + "epoch": 4.611745513866231, + "grad_norm": 0.004272149410098791, + "learning_rate": 0.000948953020060342, + "loss": 0.0547, + "num_input_tokens_seen": 61075424, + "step": 28270 + }, + { + "epoch": 4.612561174551386, + "grad_norm": 0.300809383392334, + "learning_rate": 0.0009489216830665649, + "loss": 0.0647, + "num_input_tokens_seen": 61087168, + "step": 28275 + }, + { + "epoch": 4.613376835236542, + "grad_norm": 0.005980401299893856, + "learning_rate": 0.0009488903369748203, + "loss": 0.0921, + "num_input_tokens_seen": 61098816, + "step": 28280 + }, + { + "epoch": 4.614192495921697, + "grad_norm": 0.12887540459632874, + "learning_rate": 0.0009488589817857435, + "loss": 0.1491, + "num_input_tokens_seen": 61108992, + "step": 28285 + }, + { + "epoch": 4.6150081566068515, + "grad_norm": 0.1607290506362915, + "learning_rate": 0.0009488276174999702, + "loss": 0.0865, + "num_input_tokens_seen": 61120032, + "step": 28290 + }, + { + "epoch": 4.615823817292006, + "grad_norm": 0.16185465455055237, + "learning_rate": 0.0009487962441181357, + "loss": 0.0644, + "num_input_tokens_seen": 61130496, + "step": 28295 + }, + { + "epoch": 4.616639477977161, + "grad_norm": 0.13092803955078125, + "learning_rate": 0.0009487648616408762, + "loss": 0.0513, + "num_input_tokens_seen": 61141344, + "step": 28300 + }, + { + "epoch": 4.617455138662317, + "grad_norm": 0.07150975614786148, + "learning_rate": 0.0009487334700688273, + "loss": 0.0256, + "num_input_tokens_seen": 61152512, + "step": 28305 + }, + { + "epoch": 4.618270799347472, + "grad_norm": 0.02035592496395111, + "learning_rate": 0.0009487020694026254, + "loss": 0.0387, + "num_input_tokens_seen": 61164416, + "step": 28310 + }, + { + "epoch": 4.6190864600326265, + "grad_norm": 0.0039632623083889484, + "learning_rate": 0.0009486706596429068, + "loss": 0.2028, + "num_input_tokens_seen": 61176160, + "step": 28315 + }, + { + "epoch": 4.619902120717781, + "grad_norm": 0.023158259689807892, + "learning_rate": 0.0009486392407903082, + "loss": 0.2388, + "num_input_tokens_seen": 61186560, + "step": 28320 + }, + { + "epoch": 4.620717781402936, + "grad_norm": 0.10569548606872559, + "learning_rate": 0.000948607812845466, + "loss": 0.0666, + "num_input_tokens_seen": 61197504, + "step": 28325 + }, + { + "epoch": 4.621533442088092, + "grad_norm": 0.020724743604660034, + "learning_rate": 0.0009485763758090176, + "loss": 0.0308, + "num_input_tokens_seen": 61209088, + "step": 28330 + }, + { + "epoch": 4.622349102773247, + "grad_norm": 0.21098117530345917, + "learning_rate": 0.0009485449296815999, + "loss": 0.1628, + "num_input_tokens_seen": 61220224, + "step": 28335 + }, + { + "epoch": 4.623164763458401, + "grad_norm": 0.03836962580680847, + "learning_rate": 0.00094851347446385, + "loss": 0.0523, + "num_input_tokens_seen": 61232480, + "step": 28340 + }, + { + "epoch": 4.623980424143556, + "grad_norm": 0.2680718004703522, + "learning_rate": 0.0009484820101564058, + "loss": 0.1557, + "num_input_tokens_seen": 61244032, + "step": 28345 + }, + { + "epoch": 4.624796084828711, + "grad_norm": 0.19660809636116028, + "learning_rate": 0.0009484505367599045, + "loss": 0.1351, + "num_input_tokens_seen": 61256096, + "step": 28350 + }, + { + "epoch": 4.625611745513866, + "grad_norm": 0.04200530797243118, + "learning_rate": 0.0009484190542749844, + "loss": 0.1997, + "num_input_tokens_seen": 61268576, + "step": 28355 + }, + { + "epoch": 4.626427406199021, + "grad_norm": 0.05554460734128952, + "learning_rate": 0.0009483875627022831, + "loss": 0.0761, + "num_input_tokens_seen": 61279584, + "step": 28360 + }, + { + "epoch": 4.627243066884176, + "grad_norm": 0.020894384011626244, + "learning_rate": 0.0009483560620424391, + "loss": 0.1132, + "num_input_tokens_seen": 61289824, + "step": 28365 + }, + { + "epoch": 4.628058727569331, + "grad_norm": 0.071620874106884, + "learning_rate": 0.0009483245522960909, + "loss": 0.1386, + "num_input_tokens_seen": 61299296, + "step": 28370 + }, + { + "epoch": 4.628874388254486, + "grad_norm": 0.03438745439052582, + "learning_rate": 0.0009482930334638766, + "loss": 0.0269, + "num_input_tokens_seen": 61309600, + "step": 28375 + }, + { + "epoch": 4.629690048939641, + "grad_norm": 0.05470510572195053, + "learning_rate": 0.0009482615055464354, + "loss": 0.1482, + "num_input_tokens_seen": 61320288, + "step": 28380 + }, + { + "epoch": 4.630505709624796, + "grad_norm": 0.09955763071775436, + "learning_rate": 0.0009482299685444062, + "loss": 0.117, + "num_input_tokens_seen": 61331328, + "step": 28385 + }, + { + "epoch": 4.631321370309951, + "grad_norm": 0.006198828108608723, + "learning_rate": 0.0009481984224584279, + "loss": 0.1589, + "num_input_tokens_seen": 61341248, + "step": 28390 + }, + { + "epoch": 4.632137030995106, + "grad_norm": 0.2537461221218109, + "learning_rate": 0.0009481668672891401, + "loss": 0.1139, + "num_input_tokens_seen": 61350880, + "step": 28395 + }, + { + "epoch": 4.632952691680261, + "grad_norm": 0.09156087040901184, + "learning_rate": 0.0009481353030371822, + "loss": 0.083, + "num_input_tokens_seen": 61361760, + "step": 28400 + }, + { + "epoch": 4.633768352365416, + "grad_norm": 0.011091694235801697, + "learning_rate": 0.0009481037297031939, + "loss": 0.1354, + "num_input_tokens_seen": 61372896, + "step": 28405 + }, + { + "epoch": 4.634584013050571, + "grad_norm": 0.23919807374477386, + "learning_rate": 0.0009480721472878151, + "loss": 0.2114, + "num_input_tokens_seen": 61382752, + "step": 28410 + }, + { + "epoch": 4.635399673735726, + "grad_norm": 0.012990964576601982, + "learning_rate": 0.0009480405557916858, + "loss": 0.0337, + "num_input_tokens_seen": 61393792, + "step": 28415 + }, + { + "epoch": 4.636215334420881, + "grad_norm": 0.057514190673828125, + "learning_rate": 0.0009480089552154461, + "loss": 0.1262, + "num_input_tokens_seen": 61403968, + "step": 28420 + }, + { + "epoch": 4.637030995106036, + "grad_norm": 0.25216689705848694, + "learning_rate": 0.0009479773455597367, + "loss": 0.1167, + "num_input_tokens_seen": 61415648, + "step": 28425 + }, + { + "epoch": 4.637846655791191, + "grad_norm": 0.12715749442577362, + "learning_rate": 0.0009479457268251981, + "loss": 0.1525, + "num_input_tokens_seen": 61425696, + "step": 28430 + }, + { + "epoch": 4.638662316476346, + "grad_norm": 0.006238611415028572, + "learning_rate": 0.0009479140990124711, + "loss": 0.065, + "num_input_tokens_seen": 61436928, + "step": 28435 + }, + { + "epoch": 4.6394779771615005, + "grad_norm": 0.14752565324306488, + "learning_rate": 0.0009478824621221967, + "loss": 0.2457, + "num_input_tokens_seen": 61447648, + "step": 28440 + }, + { + "epoch": 4.640293637846656, + "grad_norm": 0.03811931237578392, + "learning_rate": 0.0009478508161550159, + "loss": 0.1437, + "num_input_tokens_seen": 61459776, + "step": 28445 + }, + { + "epoch": 4.641109298531811, + "grad_norm": 0.03835804760456085, + "learning_rate": 0.0009478191611115702, + "loss": 0.081, + "num_input_tokens_seen": 61471040, + "step": 28450 + }, + { + "epoch": 4.641924959216966, + "grad_norm": 0.0996306762099266, + "learning_rate": 0.0009477874969925011, + "loss": 0.0583, + "num_input_tokens_seen": 61481888, + "step": 28455 + }, + { + "epoch": 4.642740619902121, + "grad_norm": 0.2590426206588745, + "learning_rate": 0.0009477558237984503, + "loss": 0.1078, + "num_input_tokens_seen": 61492928, + "step": 28460 + }, + { + "epoch": 4.643556280587275, + "grad_norm": 0.21474169194698334, + "learning_rate": 0.0009477241415300599, + "loss": 0.1292, + "num_input_tokens_seen": 61504320, + "step": 28465 + }, + { + "epoch": 4.64437194127243, + "grad_norm": 0.021407373249530792, + "learning_rate": 0.0009476924501879715, + "loss": 0.0183, + "num_input_tokens_seen": 61515424, + "step": 28470 + }, + { + "epoch": 4.645187601957586, + "grad_norm": 0.03953413665294647, + "learning_rate": 0.0009476607497728279, + "loss": 0.0356, + "num_input_tokens_seen": 61526176, + "step": 28475 + }, + { + "epoch": 4.646003262642741, + "grad_norm": 0.09875276684761047, + "learning_rate": 0.0009476290402852712, + "loss": 0.1282, + "num_input_tokens_seen": 61536288, + "step": 28480 + }, + { + "epoch": 4.646818923327896, + "grad_norm": 0.0387643501162529, + "learning_rate": 0.0009475973217259442, + "loss": 0.0396, + "num_input_tokens_seen": 61548544, + "step": 28485 + }, + { + "epoch": 4.64763458401305, + "grad_norm": 0.010005326010286808, + "learning_rate": 0.0009475655940954896, + "loss": 0.0581, + "num_input_tokens_seen": 61559776, + "step": 28490 + }, + { + "epoch": 4.648450244698205, + "grad_norm": 0.30142971873283386, + "learning_rate": 0.0009475338573945504, + "loss": 0.1899, + "num_input_tokens_seen": 61570784, + "step": 28495 + }, + { + "epoch": 4.649265905383361, + "grad_norm": 0.060554053634405136, + "learning_rate": 0.0009475021116237699, + "loss": 0.1408, + "num_input_tokens_seen": 61581856, + "step": 28500 + }, + { + "epoch": 4.650081566068516, + "grad_norm": 0.010634462349116802, + "learning_rate": 0.0009474703567837915, + "loss": 0.0715, + "num_input_tokens_seen": 61593376, + "step": 28505 + }, + { + "epoch": 4.650897226753671, + "grad_norm": 0.20411312580108643, + "learning_rate": 0.0009474385928752585, + "loss": 0.2613, + "num_input_tokens_seen": 61604256, + "step": 28510 + }, + { + "epoch": 4.651712887438825, + "grad_norm": 0.014688130468130112, + "learning_rate": 0.0009474068198988151, + "loss": 0.1626, + "num_input_tokens_seen": 61615008, + "step": 28515 + }, + { + "epoch": 4.65252854812398, + "grad_norm": 0.021405626088380814, + "learning_rate": 0.0009473750378551046, + "loss": 0.0247, + "num_input_tokens_seen": 61626848, + "step": 28520 + }, + { + "epoch": 4.653344208809135, + "grad_norm": 0.02764919400215149, + "learning_rate": 0.0009473432467447715, + "loss": 0.0585, + "num_input_tokens_seen": 61637952, + "step": 28525 + }, + { + "epoch": 4.654159869494291, + "grad_norm": 0.4175470769405365, + "learning_rate": 0.00094731144656846, + "loss": 0.2282, + "num_input_tokens_seen": 61648288, + "step": 28530 + }, + { + "epoch": 4.6549755301794455, + "grad_norm": 0.059521906077861786, + "learning_rate": 0.0009472796373268147, + "loss": 0.0559, + "num_input_tokens_seen": 61659936, + "step": 28535 + }, + { + "epoch": 4.6557911908646, + "grad_norm": 0.1181897521018982, + "learning_rate": 0.00094724781902048, + "loss": 0.0752, + "num_input_tokens_seen": 61670464, + "step": 28540 + }, + { + "epoch": 4.656606851549755, + "grad_norm": 0.06549588590860367, + "learning_rate": 0.0009472159916501011, + "loss": 0.0248, + "num_input_tokens_seen": 61681568, + "step": 28545 + }, + { + "epoch": 4.65742251223491, + "grad_norm": 0.008692199364304543, + "learning_rate": 0.0009471841552163225, + "loss": 0.0928, + "num_input_tokens_seen": 61691808, + "step": 28550 + }, + { + "epoch": 4.658238172920065, + "grad_norm": 0.1296404004096985, + "learning_rate": 0.0009471523097197898, + "loss": 0.1294, + "num_input_tokens_seen": 61702400, + "step": 28555 + }, + { + "epoch": 4.6590538336052205, + "grad_norm": 0.011472326703369617, + "learning_rate": 0.0009471204551611483, + "loss": 0.1363, + "num_input_tokens_seen": 61713824, + "step": 28560 + }, + { + "epoch": 4.659869494290375, + "grad_norm": 0.017907777801156044, + "learning_rate": 0.0009470885915410437, + "loss": 0.1489, + "num_input_tokens_seen": 61724096, + "step": 28565 + }, + { + "epoch": 4.66068515497553, + "grad_norm": 0.03549930080771446, + "learning_rate": 0.0009470567188601214, + "loss": 0.0473, + "num_input_tokens_seen": 61735136, + "step": 28570 + }, + { + "epoch": 4.661500815660685, + "grad_norm": 0.05104723572731018, + "learning_rate": 0.0009470248371190277, + "loss": 0.0625, + "num_input_tokens_seen": 61746368, + "step": 28575 + }, + { + "epoch": 4.66231647634584, + "grad_norm": 0.2593303620815277, + "learning_rate": 0.0009469929463184086, + "loss": 0.3085, + "num_input_tokens_seen": 61756128, + "step": 28580 + }, + { + "epoch": 4.6631321370309955, + "grad_norm": 0.03945760801434517, + "learning_rate": 0.0009469610464589104, + "loss": 0.0243, + "num_input_tokens_seen": 61766464, + "step": 28585 + }, + { + "epoch": 4.66394779771615, + "grad_norm": 0.017860667780041695, + "learning_rate": 0.0009469291375411795, + "loss": 0.1135, + "num_input_tokens_seen": 61777632, + "step": 28590 + }, + { + "epoch": 4.664763458401305, + "grad_norm": 0.020938122645020485, + "learning_rate": 0.0009468972195658626, + "loss": 0.105, + "num_input_tokens_seen": 61788992, + "step": 28595 + }, + { + "epoch": 4.66557911908646, + "grad_norm": 0.03804948925971985, + "learning_rate": 0.0009468652925336068, + "loss": 0.0745, + "num_input_tokens_seen": 61799840, + "step": 28600 + }, + { + "epoch": 4.666394779771615, + "grad_norm": 0.17097699642181396, + "learning_rate": 0.0009468333564450587, + "loss": 0.1053, + "num_input_tokens_seen": 61810240, + "step": 28605 + }, + { + "epoch": 4.6672104404567705, + "grad_norm": 0.059029195457696915, + "learning_rate": 0.000946801411300866, + "loss": 0.0978, + "num_input_tokens_seen": 61820512, + "step": 28610 + }, + { + "epoch": 4.668026101141925, + "grad_norm": 0.06193551421165466, + "learning_rate": 0.0009467694571016758, + "loss": 0.1145, + "num_input_tokens_seen": 61831328, + "step": 28615 + }, + { + "epoch": 4.66884176182708, + "grad_norm": 0.031025558710098267, + "learning_rate": 0.0009467374938481359, + "loss": 0.1367, + "num_input_tokens_seen": 61842976, + "step": 28620 + }, + { + "epoch": 4.669657422512235, + "grad_norm": 0.09817475825548172, + "learning_rate": 0.0009467055215408939, + "loss": 0.0633, + "num_input_tokens_seen": 61854912, + "step": 28625 + }, + { + "epoch": 4.67047308319739, + "grad_norm": 0.051113102585077286, + "learning_rate": 0.0009466735401805977, + "loss": 0.0644, + "num_input_tokens_seen": 61866208, + "step": 28630 + }, + { + "epoch": 4.671288743882545, + "grad_norm": 0.17623735964298248, + "learning_rate": 0.0009466415497678957, + "loss": 0.1229, + "num_input_tokens_seen": 61877088, + "step": 28635 + }, + { + "epoch": 4.672104404567699, + "grad_norm": 0.1480836421251297, + "learning_rate": 0.000946609550303436, + "loss": 0.2016, + "num_input_tokens_seen": 61887328, + "step": 28640 + }, + { + "epoch": 4.672920065252855, + "grad_norm": 0.12157644331455231, + "learning_rate": 0.0009465775417878673, + "loss": 0.1657, + "num_input_tokens_seen": 61898336, + "step": 28645 + }, + { + "epoch": 4.67373572593801, + "grad_norm": 0.0039813462644815445, + "learning_rate": 0.0009465455242218382, + "loss": 0.0227, + "num_input_tokens_seen": 61909728, + "step": 28650 + }, + { + "epoch": 4.674551386623165, + "grad_norm": 0.15658128261566162, + "learning_rate": 0.0009465134976059975, + "loss": 0.0684, + "num_input_tokens_seen": 61921312, + "step": 28655 + }, + { + "epoch": 4.6753670473083195, + "grad_norm": 0.19435112178325653, + "learning_rate": 0.0009464814619409942, + "loss": 0.0361, + "num_input_tokens_seen": 61931744, + "step": 28660 + }, + { + "epoch": 4.676182707993474, + "grad_norm": 0.1114906296133995, + "learning_rate": 0.0009464494172274778, + "loss": 0.1178, + "num_input_tokens_seen": 61943264, + "step": 28665 + }, + { + "epoch": 4.67699836867863, + "grad_norm": 0.3585332930088043, + "learning_rate": 0.0009464173634660978, + "loss": 0.1103, + "num_input_tokens_seen": 61954112, + "step": 28670 + }, + { + "epoch": 4.677814029363785, + "grad_norm": 0.030214810743927956, + "learning_rate": 0.0009463853006575032, + "loss": 0.0855, + "num_input_tokens_seen": 61964384, + "step": 28675 + }, + { + "epoch": 4.67862969004894, + "grad_norm": 0.13624903559684753, + "learning_rate": 0.0009463532288023444, + "loss": 0.0304, + "num_input_tokens_seen": 61976608, + "step": 28680 + }, + { + "epoch": 4.6794453507340945, + "grad_norm": 0.006839347537606955, + "learning_rate": 0.0009463211479012712, + "loss": 0.0412, + "num_input_tokens_seen": 61987616, + "step": 28685 + }, + { + "epoch": 4.680261011419249, + "grad_norm": 0.2887722849845886, + "learning_rate": 0.0009462890579549338, + "loss": 0.0814, + "num_input_tokens_seen": 61997984, + "step": 28690 + }, + { + "epoch": 4.681076672104405, + "grad_norm": 0.1711539328098297, + "learning_rate": 0.0009462569589639825, + "loss": 0.1158, + "num_input_tokens_seen": 62008384, + "step": 28695 + }, + { + "epoch": 4.68189233278956, + "grad_norm": 0.014359569177031517, + "learning_rate": 0.0009462248509290676, + "loss": 0.0336, + "num_input_tokens_seen": 62019040, + "step": 28700 + }, + { + "epoch": 4.682707993474715, + "grad_norm": 0.12640659511089325, + "learning_rate": 0.0009461927338508402, + "loss": 0.1402, + "num_input_tokens_seen": 62030240, + "step": 28705 + }, + { + "epoch": 4.6835236541598695, + "grad_norm": 0.029281822964549065, + "learning_rate": 0.0009461606077299509, + "loss": 0.1383, + "num_input_tokens_seen": 62040768, + "step": 28710 + }, + { + "epoch": 4.684339314845024, + "grad_norm": 0.019829019904136658, + "learning_rate": 0.000946128472567051, + "loss": 0.0498, + "num_input_tokens_seen": 62051168, + "step": 28715 + }, + { + "epoch": 4.685154975530179, + "grad_norm": 0.3932690918445587, + "learning_rate": 0.0009460963283627917, + "loss": 0.0642, + "num_input_tokens_seen": 62062336, + "step": 28720 + }, + { + "epoch": 4.685970636215334, + "grad_norm": 0.13580350577831268, + "learning_rate": 0.0009460641751178243, + "loss": 0.126, + "num_input_tokens_seen": 62071232, + "step": 28725 + }, + { + "epoch": 4.68678629690049, + "grad_norm": 0.0017230919329449534, + "learning_rate": 0.0009460320128328003, + "loss": 0.0224, + "num_input_tokens_seen": 62082560, + "step": 28730 + }, + { + "epoch": 4.6876019575856445, + "grad_norm": 0.2777078449726105, + "learning_rate": 0.0009459998415083721, + "loss": 0.1692, + "num_input_tokens_seen": 62094304, + "step": 28735 + }, + { + "epoch": 4.688417618270799, + "grad_norm": 0.08183332532644272, + "learning_rate": 0.000945967661145191, + "loss": 0.1268, + "num_input_tokens_seen": 62104160, + "step": 28740 + }, + { + "epoch": 4.689233278955954, + "grad_norm": 0.1658097356557846, + "learning_rate": 0.0009459354717439097, + "loss": 0.0926, + "num_input_tokens_seen": 62114208, + "step": 28745 + }, + { + "epoch": 4.690048939641109, + "grad_norm": 0.07683463394641876, + "learning_rate": 0.0009459032733051805, + "loss": 0.0867, + "num_input_tokens_seen": 62124320, + "step": 28750 + }, + { + "epoch": 4.690864600326265, + "grad_norm": 0.1957731395959854, + "learning_rate": 0.0009458710658296555, + "loss": 0.0979, + "num_input_tokens_seen": 62135904, + "step": 28755 + }, + { + "epoch": 4.691680261011419, + "grad_norm": 0.02776450477540493, + "learning_rate": 0.000945838849317988, + "loss": 0.0774, + "num_input_tokens_seen": 62145632, + "step": 28760 + }, + { + "epoch": 4.692495921696574, + "grad_norm": 0.10273412615060806, + "learning_rate": 0.0009458066237708302, + "loss": 0.1534, + "num_input_tokens_seen": 62157376, + "step": 28765 + }, + { + "epoch": 4.693311582381729, + "grad_norm": 0.01814623735845089, + "learning_rate": 0.0009457743891888359, + "loss": 0.0768, + "num_input_tokens_seen": 62167648, + "step": 28770 + }, + { + "epoch": 4.694127243066884, + "grad_norm": 0.2564343214035034, + "learning_rate": 0.0009457421455726582, + "loss": 0.1296, + "num_input_tokens_seen": 62178784, + "step": 28775 + }, + { + "epoch": 4.69494290375204, + "grad_norm": 0.07235551625490189, + "learning_rate": 0.0009457098929229503, + "loss": 0.1122, + "num_input_tokens_seen": 62189600, + "step": 28780 + }, + { + "epoch": 4.695758564437194, + "grad_norm": 0.18390199542045593, + "learning_rate": 0.0009456776312403661, + "loss": 0.1042, + "num_input_tokens_seen": 62200768, + "step": 28785 + }, + { + "epoch": 4.696574225122349, + "grad_norm": 0.039058052003383636, + "learning_rate": 0.0009456453605255592, + "loss": 0.0469, + "num_input_tokens_seen": 62211520, + "step": 28790 + }, + { + "epoch": 4.697389885807504, + "grad_norm": 0.2684352397918701, + "learning_rate": 0.0009456130807791839, + "loss": 0.3105, + "num_input_tokens_seen": 62222624, + "step": 28795 + }, + { + "epoch": 4.698205546492659, + "grad_norm": 0.025328971445560455, + "learning_rate": 0.000945580792001894, + "loss": 0.0929, + "num_input_tokens_seen": 62232000, + "step": 28800 + }, + { + "epoch": 4.699021207177814, + "grad_norm": 0.1835407316684723, + "learning_rate": 0.0009455484941943442, + "loss": 0.0953, + "num_input_tokens_seen": 62241952, + "step": 28805 + }, + { + "epoch": 4.699836867862969, + "grad_norm": 0.015583495609462261, + "learning_rate": 0.0009455161873571889, + "loss": 0.0627, + "num_input_tokens_seen": 62252864, + "step": 28810 + }, + { + "epoch": 4.700652528548124, + "grad_norm": 0.06854277104139328, + "learning_rate": 0.000945483871491083, + "loss": 0.0716, + "num_input_tokens_seen": 62264096, + "step": 28815 + }, + { + "epoch": 4.701468189233279, + "grad_norm": 0.07336314022541046, + "learning_rate": 0.0009454515465966812, + "loss": 0.1379, + "num_input_tokens_seen": 62274176, + "step": 28820 + }, + { + "epoch": 4.702283849918434, + "grad_norm": 0.019611230120062828, + "learning_rate": 0.0009454192126746388, + "loss": 0.039, + "num_input_tokens_seen": 62285472, + "step": 28825 + }, + { + "epoch": 4.703099510603589, + "grad_norm": 0.0149730509147048, + "learning_rate": 0.000945386869725611, + "loss": 0.0883, + "num_input_tokens_seen": 62296256, + "step": 28830 + }, + { + "epoch": 4.7039151712887435, + "grad_norm": 0.1251792311668396, + "learning_rate": 0.0009453545177502532, + "loss": 0.1685, + "num_input_tokens_seen": 62307456, + "step": 28835 + }, + { + "epoch": 4.704730831973899, + "grad_norm": 0.025677544996142387, + "learning_rate": 0.0009453221567492211, + "loss": 0.0219, + "num_input_tokens_seen": 62317696, + "step": 28840 + }, + { + "epoch": 4.705546492659054, + "grad_norm": 0.17929667234420776, + "learning_rate": 0.0009452897867231705, + "loss": 0.1908, + "num_input_tokens_seen": 62328672, + "step": 28845 + }, + { + "epoch": 4.706362153344209, + "grad_norm": 0.10408436506986618, + "learning_rate": 0.0009452574076727576, + "loss": 0.1479, + "num_input_tokens_seen": 62339648, + "step": 28850 + }, + { + "epoch": 4.707177814029364, + "grad_norm": 0.047870147973299026, + "learning_rate": 0.0009452250195986385, + "loss": 0.1515, + "num_input_tokens_seen": 62350592, + "step": 28855 + }, + { + "epoch": 4.7079934747145185, + "grad_norm": 0.07525742053985596, + "learning_rate": 0.0009451926225014695, + "loss": 0.0627, + "num_input_tokens_seen": 62362528, + "step": 28860 + }, + { + "epoch": 4.708809135399674, + "grad_norm": 0.015628913417458534, + "learning_rate": 0.0009451602163819073, + "loss": 0.1018, + "num_input_tokens_seen": 62373600, + "step": 28865 + }, + { + "epoch": 4.709624796084829, + "grad_norm": 0.05882667005062103, + "learning_rate": 0.0009451278012406086, + "loss": 0.0225, + "num_input_tokens_seen": 62385568, + "step": 28870 + }, + { + "epoch": 4.710440456769984, + "grad_norm": 0.008061857894062996, + "learning_rate": 0.0009450953770782304, + "loss": 0.0281, + "num_input_tokens_seen": 62395872, + "step": 28875 + }, + { + "epoch": 4.711256117455139, + "grad_norm": 0.07395133376121521, + "learning_rate": 0.0009450629438954296, + "loss": 0.0657, + "num_input_tokens_seen": 62407296, + "step": 28880 + }, + { + "epoch": 4.712071778140293, + "grad_norm": 0.0533161461353302, + "learning_rate": 0.0009450305016928636, + "loss": 0.0343, + "num_input_tokens_seen": 62418656, + "step": 28885 + }, + { + "epoch": 4.712887438825448, + "grad_norm": 0.01694626919925213, + "learning_rate": 0.00094499805047119, + "loss": 0.03, + "num_input_tokens_seen": 62429792, + "step": 28890 + }, + { + "epoch": 4.713703099510604, + "grad_norm": 0.0854320377111435, + "learning_rate": 0.0009449655902310665, + "loss": 0.1937, + "num_input_tokens_seen": 62439936, + "step": 28895 + }, + { + "epoch": 4.714518760195759, + "grad_norm": 0.08638182282447815, + "learning_rate": 0.0009449331209731507, + "loss": 0.1911, + "num_input_tokens_seen": 62450720, + "step": 28900 + }, + { + "epoch": 4.715334420880914, + "grad_norm": 0.18696919083595276, + "learning_rate": 0.0009449006426981007, + "loss": 0.1344, + "num_input_tokens_seen": 62461856, + "step": 28905 + }, + { + "epoch": 4.716150081566068, + "grad_norm": 0.02467340975999832, + "learning_rate": 0.0009448681554065749, + "loss": 0.1609, + "num_input_tokens_seen": 62472064, + "step": 28910 + }, + { + "epoch": 4.716965742251223, + "grad_norm": 0.13728247582912445, + "learning_rate": 0.0009448356590992316, + "loss": 0.0877, + "num_input_tokens_seen": 62482976, + "step": 28915 + }, + { + "epoch": 4.717781402936378, + "grad_norm": 0.03479057550430298, + "learning_rate": 0.0009448031537767292, + "loss": 0.1147, + "num_input_tokens_seen": 62492960, + "step": 28920 + }, + { + "epoch": 4.718597063621534, + "grad_norm": 0.008539623580873013, + "learning_rate": 0.0009447706394397266, + "loss": 0.0833, + "num_input_tokens_seen": 62503648, + "step": 28925 + }, + { + "epoch": 4.719412724306689, + "grad_norm": 0.2139635682106018, + "learning_rate": 0.0009447381160888831, + "loss": 0.1307, + "num_input_tokens_seen": 62513920, + "step": 28930 + }, + { + "epoch": 4.720228384991843, + "grad_norm": 0.21587203443050385, + "learning_rate": 0.0009447055837248572, + "loss": 0.1285, + "num_input_tokens_seen": 62524960, + "step": 28935 + }, + { + "epoch": 4.721044045676998, + "grad_norm": 0.1398542821407318, + "learning_rate": 0.0009446730423483085, + "loss": 0.0451, + "num_input_tokens_seen": 62535776, + "step": 28940 + }, + { + "epoch": 4.721859706362153, + "grad_norm": 0.010080617852509022, + "learning_rate": 0.0009446404919598965, + "loss": 0.0644, + "num_input_tokens_seen": 62548096, + "step": 28945 + }, + { + "epoch": 4.722675367047309, + "grad_norm": 0.1806686967611313, + "learning_rate": 0.000944607932560281, + "loss": 0.2239, + "num_input_tokens_seen": 62559168, + "step": 28950 + }, + { + "epoch": 4.7234910277324635, + "grad_norm": 0.10033036023378372, + "learning_rate": 0.0009445753641501215, + "loss": 0.0873, + "num_input_tokens_seen": 62570080, + "step": 28955 + }, + { + "epoch": 4.724306688417618, + "grad_norm": 0.015866931527853012, + "learning_rate": 0.0009445427867300785, + "loss": 0.1447, + "num_input_tokens_seen": 62580384, + "step": 28960 + }, + { + "epoch": 4.725122349102773, + "grad_norm": 0.01197098009288311, + "learning_rate": 0.0009445102003008119, + "loss": 0.1351, + "num_input_tokens_seen": 62591488, + "step": 28965 + }, + { + "epoch": 4.725938009787928, + "grad_norm": 0.15940040349960327, + "learning_rate": 0.0009444776048629822, + "loss": 0.1103, + "num_input_tokens_seen": 62603264, + "step": 28970 + }, + { + "epoch": 4.726753670473083, + "grad_norm": 0.038952793926000595, + "learning_rate": 0.0009444450004172498, + "loss": 0.0648, + "num_input_tokens_seen": 62614624, + "step": 28975 + }, + { + "epoch": 4.7275693311582385, + "grad_norm": 0.25991666316986084, + "learning_rate": 0.0009444123869642758, + "loss": 0.1644, + "num_input_tokens_seen": 62625856, + "step": 28980 + }, + { + "epoch": 4.728384991843393, + "grad_norm": 0.016241293400526047, + "learning_rate": 0.000944379764504721, + "loss": 0.0401, + "num_input_tokens_seen": 62635616, + "step": 28985 + }, + { + "epoch": 4.729200652528548, + "grad_norm": 0.038051947951316833, + "learning_rate": 0.0009443471330392466, + "loss": 0.1934, + "num_input_tokens_seen": 62645920, + "step": 28990 + }, + { + "epoch": 4.730016313213703, + "grad_norm": 0.05174148455262184, + "learning_rate": 0.0009443144925685137, + "loss": 0.0372, + "num_input_tokens_seen": 62656864, + "step": 28995 + }, + { + "epoch": 4.730831973898858, + "grad_norm": 0.10264194756746292, + "learning_rate": 0.0009442818430931841, + "loss": 0.1653, + "num_input_tokens_seen": 62666496, + "step": 29000 + }, + { + "epoch": 4.731647634584013, + "grad_norm": 0.20121224224567413, + "learning_rate": 0.0009442491846139192, + "loss": 0.1943, + "num_input_tokens_seen": 62677152, + "step": 29005 + }, + { + "epoch": 4.732463295269168, + "grad_norm": 0.04519479721784592, + "learning_rate": 0.0009442165171313811, + "loss": 0.0546, + "num_input_tokens_seen": 62687104, + "step": 29010 + }, + { + "epoch": 4.733278955954323, + "grad_norm": 0.02056044153869152, + "learning_rate": 0.0009441838406462318, + "loss": 0.0996, + "num_input_tokens_seen": 62697472, + "step": 29015 + }, + { + "epoch": 4.734094616639478, + "grad_norm": 0.0412328764796257, + "learning_rate": 0.0009441511551591333, + "loss": 0.1148, + "num_input_tokens_seen": 62708736, + "step": 29020 + }, + { + "epoch": 4.734910277324633, + "grad_norm": 0.007981553673744202, + "learning_rate": 0.0009441184606707484, + "loss": 0.0868, + "num_input_tokens_seen": 62720672, + "step": 29025 + }, + { + "epoch": 4.735725938009788, + "grad_norm": 0.017568625509738922, + "learning_rate": 0.0009440857571817394, + "loss": 0.1278, + "num_input_tokens_seen": 62732576, + "step": 29030 + }, + { + "epoch": 4.736541598694943, + "grad_norm": 0.16702494025230408, + "learning_rate": 0.000944053044692769, + "loss": 0.0729, + "num_input_tokens_seen": 62742880, + "step": 29035 + }, + { + "epoch": 4.737357259380098, + "grad_norm": 0.21429851651191711, + "learning_rate": 0.0009440203232045005, + "loss": 0.0599, + "num_input_tokens_seen": 62753184, + "step": 29040 + }, + { + "epoch": 4.738172920065253, + "grad_norm": 0.04229350760579109, + "learning_rate": 0.000943987592717597, + "loss": 0.1866, + "num_input_tokens_seen": 62763936, + "step": 29045 + }, + { + "epoch": 4.738988580750408, + "grad_norm": 0.21390216052532196, + "learning_rate": 0.0009439548532327216, + "loss": 0.1543, + "num_input_tokens_seen": 62775328, + "step": 29050 + }, + { + "epoch": 4.739804241435563, + "grad_norm": 0.033763255923986435, + "learning_rate": 0.0009439221047505377, + "loss": 0.0381, + "num_input_tokens_seen": 62786656, + "step": 29055 + }, + { + "epoch": 4.740619902120718, + "grad_norm": 0.4876824915409088, + "learning_rate": 0.0009438893472717094, + "loss": 0.1986, + "num_input_tokens_seen": 62797664, + "step": 29060 + }, + { + "epoch": 4.741435562805873, + "grad_norm": 0.05257996916770935, + "learning_rate": 0.0009438565807969005, + "loss": 0.1252, + "num_input_tokens_seen": 62808416, + "step": 29065 + }, + { + "epoch": 4.742251223491028, + "grad_norm": 0.13803677260875702, + "learning_rate": 0.0009438238053267746, + "loss": 0.1153, + "num_input_tokens_seen": 62818624, + "step": 29070 + }, + { + "epoch": 4.743066884176183, + "grad_norm": 0.03434896469116211, + "learning_rate": 0.0009437910208619964, + "loss": 0.1261, + "num_input_tokens_seen": 62828928, + "step": 29075 + }, + { + "epoch": 4.7438825448613375, + "grad_norm": 0.051103875041007996, + "learning_rate": 0.0009437582274032301, + "loss": 0.1211, + "num_input_tokens_seen": 62838784, + "step": 29080 + }, + { + "epoch": 4.744698205546492, + "grad_norm": 0.06288142502307892, + "learning_rate": 0.0009437254249511404, + "loss": 0.1152, + "num_input_tokens_seen": 62849888, + "step": 29085 + }, + { + "epoch": 4.745513866231647, + "grad_norm": 0.006992727518081665, + "learning_rate": 0.0009436926135063922, + "loss": 0.0351, + "num_input_tokens_seen": 62862848, + "step": 29090 + }, + { + "epoch": 4.746329526916803, + "grad_norm": 0.08556337654590607, + "learning_rate": 0.0009436597930696502, + "loss": 0.0419, + "num_input_tokens_seen": 62873440, + "step": 29095 + }, + { + "epoch": 4.747145187601958, + "grad_norm": 0.14834004640579224, + "learning_rate": 0.0009436269636415798, + "loss": 0.0841, + "num_input_tokens_seen": 62882528, + "step": 29100 + }, + { + "epoch": 4.7479608482871125, + "grad_norm": 0.033871617168188095, + "learning_rate": 0.000943594125222846, + "loss": 0.0712, + "num_input_tokens_seen": 62893824, + "step": 29105 + }, + { + "epoch": 4.748776508972267, + "grad_norm": 0.02020910568535328, + "learning_rate": 0.0009435612778141146, + "loss": 0.1716, + "num_input_tokens_seen": 62903840, + "step": 29110 + }, + { + "epoch": 4.749592169657422, + "grad_norm": 0.21955829858779907, + "learning_rate": 0.0009435284214160513, + "loss": 0.2021, + "num_input_tokens_seen": 62915040, + "step": 29115 + }, + { + "epoch": 4.750407830342578, + "grad_norm": 0.013592018745839596, + "learning_rate": 0.0009434955560293217, + "loss": 0.0469, + "num_input_tokens_seen": 62927136, + "step": 29120 + }, + { + "epoch": 4.751223491027733, + "grad_norm": 0.04946539178490639, + "learning_rate": 0.0009434626816545922, + "loss": 0.1829, + "num_input_tokens_seen": 62938144, + "step": 29125 + }, + { + "epoch": 4.7520391517128875, + "grad_norm": 0.4289376437664032, + "learning_rate": 0.0009434297982925288, + "loss": 0.3736, + "num_input_tokens_seen": 62949888, + "step": 29130 + }, + { + "epoch": 4.752854812398042, + "grad_norm": 0.014377152547240257, + "learning_rate": 0.000943396905943798, + "loss": 0.047, + "num_input_tokens_seen": 62960864, + "step": 29135 + }, + { + "epoch": 4.753670473083197, + "grad_norm": 0.07028697431087494, + "learning_rate": 0.0009433640046090664, + "loss": 0.1671, + "num_input_tokens_seen": 62970720, + "step": 29140 + }, + { + "epoch": 4.754486133768353, + "grad_norm": 0.14718550443649292, + "learning_rate": 0.0009433310942890009, + "loss": 0.1276, + "num_input_tokens_seen": 62980704, + "step": 29145 + }, + { + "epoch": 4.755301794453508, + "grad_norm": 0.05152687057852745, + "learning_rate": 0.0009432981749842683, + "loss": 0.0898, + "num_input_tokens_seen": 62991008, + "step": 29150 + }, + { + "epoch": 4.7561174551386625, + "grad_norm": 0.04468585178256035, + "learning_rate": 0.0009432652466955358, + "loss": 0.0548, + "num_input_tokens_seen": 63000864, + "step": 29155 + }, + { + "epoch": 4.756933115823817, + "grad_norm": 0.018594171851873398, + "learning_rate": 0.0009432323094234708, + "loss": 0.0329, + "num_input_tokens_seen": 63011776, + "step": 29160 + }, + { + "epoch": 4.757748776508972, + "grad_norm": 0.21704287827014923, + "learning_rate": 0.0009431993631687408, + "loss": 0.1588, + "num_input_tokens_seen": 63023296, + "step": 29165 + }, + { + "epoch": 4.758564437194127, + "grad_norm": 0.05709686875343323, + "learning_rate": 0.0009431664079320134, + "loss": 0.1016, + "num_input_tokens_seen": 63035168, + "step": 29170 + }, + { + "epoch": 4.759380097879282, + "grad_norm": 0.011319992132484913, + "learning_rate": 0.0009431334437139565, + "loss": 0.1046, + "num_input_tokens_seen": 63046272, + "step": 29175 + }, + { + "epoch": 4.760195758564437, + "grad_norm": 0.06291552633047104, + "learning_rate": 0.0009431004705152384, + "loss": 0.1456, + "num_input_tokens_seen": 63056096, + "step": 29180 + }, + { + "epoch": 4.761011419249592, + "grad_norm": 0.09090526401996613, + "learning_rate": 0.0009430674883365269, + "loss": 0.0367, + "num_input_tokens_seen": 63066432, + "step": 29185 + }, + { + "epoch": 4.761827079934747, + "grad_norm": 0.19455280900001526, + "learning_rate": 0.0009430344971784909, + "loss": 0.0772, + "num_input_tokens_seen": 63077856, + "step": 29190 + }, + { + "epoch": 4.762642740619902, + "grad_norm": 0.023099692538380623, + "learning_rate": 0.0009430014970417986, + "loss": 0.1152, + "num_input_tokens_seen": 63088160, + "step": 29195 + }, + { + "epoch": 4.763458401305057, + "grad_norm": 0.2389380782842636, + "learning_rate": 0.0009429684879271191, + "loss": 0.1268, + "num_input_tokens_seen": 63098240, + "step": 29200 + }, + { + "epoch": 4.764274061990212, + "grad_norm": 0.2107832282781601, + "learning_rate": 0.0009429354698351212, + "loss": 0.088, + "num_input_tokens_seen": 63109472, + "step": 29205 + }, + { + "epoch": 4.765089722675367, + "grad_norm": 0.004489370156079531, + "learning_rate": 0.0009429024427664741, + "loss": 0.0149, + "num_input_tokens_seen": 63121120, + "step": 29210 + }, + { + "epoch": 4.765905383360522, + "grad_norm": 0.0221620574593544, + "learning_rate": 0.0009428694067218473, + "loss": 0.0978, + "num_input_tokens_seen": 63131264, + "step": 29215 + }, + { + "epoch": 4.766721044045677, + "grad_norm": 0.07446268945932388, + "learning_rate": 0.0009428363617019099, + "loss": 0.0791, + "num_input_tokens_seen": 63143488, + "step": 29220 + }, + { + "epoch": 4.767536704730832, + "grad_norm": 0.1201087012887001, + "learning_rate": 0.0009428033077073319, + "loss": 0.2029, + "num_input_tokens_seen": 63154080, + "step": 29225 + }, + { + "epoch": 4.768352365415987, + "grad_norm": 0.05503406375646591, + "learning_rate": 0.0009427702447387833, + "loss": 0.1802, + "num_input_tokens_seen": 63165728, + "step": 29230 + }, + { + "epoch": 4.769168026101142, + "grad_norm": 0.0661977156996727, + "learning_rate": 0.0009427371727969338, + "loss": 0.168, + "num_input_tokens_seen": 63176224, + "step": 29235 + }, + { + "epoch": 4.769983686786297, + "grad_norm": 0.01881541684269905, + "learning_rate": 0.000942704091882454, + "loss": 0.0746, + "num_input_tokens_seen": 63185792, + "step": 29240 + }, + { + "epoch": 4.770799347471452, + "grad_norm": 0.05708598718047142, + "learning_rate": 0.0009426710019960141, + "loss": 0.0806, + "num_input_tokens_seen": 63196800, + "step": 29245 + }, + { + "epoch": 4.771615008156607, + "grad_norm": 0.0396232083439827, + "learning_rate": 0.0009426379031382848, + "loss": 0.0472, + "num_input_tokens_seen": 63207744, + "step": 29250 + }, + { + "epoch": 4.7724306688417615, + "grad_norm": 0.037397295236587524, + "learning_rate": 0.0009426047953099368, + "loss": 0.0347, + "num_input_tokens_seen": 63219712, + "step": 29255 + }, + { + "epoch": 4.773246329526917, + "grad_norm": 0.246902734041214, + "learning_rate": 0.0009425716785116412, + "loss": 0.3137, + "num_input_tokens_seen": 63229408, + "step": 29260 + }, + { + "epoch": 4.774061990212072, + "grad_norm": 0.0640609934926033, + "learning_rate": 0.0009425385527440691, + "loss": 0.0455, + "num_input_tokens_seen": 63239424, + "step": 29265 + }, + { + "epoch": 4.774877650897227, + "grad_norm": 0.012092667631804943, + "learning_rate": 0.0009425054180078917, + "loss": 0.1158, + "num_input_tokens_seen": 63251232, + "step": 29270 + }, + { + "epoch": 4.775693311582382, + "grad_norm": 0.03524189069867134, + "learning_rate": 0.0009424722743037808, + "loss": 0.0792, + "num_input_tokens_seen": 63263584, + "step": 29275 + }, + { + "epoch": 4.7765089722675365, + "grad_norm": 0.04715615138411522, + "learning_rate": 0.0009424391216324078, + "loss": 0.0403, + "num_input_tokens_seen": 63273376, + "step": 29280 + }, + { + "epoch": 4.777324632952691, + "grad_norm": 0.029380058869719505, + "learning_rate": 0.0009424059599944449, + "loss": 0.085, + "num_input_tokens_seen": 63282176, + "step": 29285 + }, + { + "epoch": 4.778140293637847, + "grad_norm": 0.21220123767852783, + "learning_rate": 0.0009423727893905638, + "loss": 0.0706, + "num_input_tokens_seen": 63293440, + "step": 29290 + }, + { + "epoch": 4.778955954323002, + "grad_norm": 0.2774854600429535, + "learning_rate": 0.0009423396098214372, + "loss": 0.0728, + "num_input_tokens_seen": 63303136, + "step": 29295 + }, + { + "epoch": 4.779771615008157, + "grad_norm": 0.3588896691799164, + "learning_rate": 0.0009423064212877371, + "loss": 0.311, + "num_input_tokens_seen": 63314944, + "step": 29300 + }, + { + "epoch": 4.780587275693311, + "grad_norm": 0.18132130801677704, + "learning_rate": 0.0009422732237901361, + "loss": 0.0476, + "num_input_tokens_seen": 63324736, + "step": 29305 + }, + { + "epoch": 4.781402936378466, + "grad_norm": 0.1575162261724472, + "learning_rate": 0.0009422400173293073, + "loss": 0.1672, + "num_input_tokens_seen": 63334880, + "step": 29310 + }, + { + "epoch": 4.782218597063622, + "grad_norm": 0.29085320234298706, + "learning_rate": 0.0009422068019059235, + "loss": 0.2645, + "num_input_tokens_seen": 63346336, + "step": 29315 + }, + { + "epoch": 4.783034257748777, + "grad_norm": 0.2051205039024353, + "learning_rate": 0.0009421735775206582, + "loss": 0.0909, + "num_input_tokens_seen": 63357088, + "step": 29320 + }, + { + "epoch": 4.783849918433932, + "grad_norm": 0.18533475697040558, + "learning_rate": 0.000942140344174184, + "loss": 0.0807, + "num_input_tokens_seen": 63368512, + "step": 29325 + }, + { + "epoch": 4.784665579119086, + "grad_norm": 0.05170245096087456, + "learning_rate": 0.0009421071018671749, + "loss": 0.0812, + "num_input_tokens_seen": 63380288, + "step": 29330 + }, + { + "epoch": 4.785481239804241, + "grad_norm": 0.032106757164001465, + "learning_rate": 0.0009420738506003047, + "loss": 0.0393, + "num_input_tokens_seen": 63391808, + "step": 29335 + }, + { + "epoch": 4.786296900489396, + "grad_norm": 0.009413869120180607, + "learning_rate": 0.0009420405903742471, + "loss": 0.0281, + "num_input_tokens_seen": 63402528, + "step": 29340 + }, + { + "epoch": 4.787112561174552, + "grad_norm": 0.14599008858203888, + "learning_rate": 0.000942007321189676, + "loss": 0.1658, + "num_input_tokens_seen": 63414368, + "step": 29345 + }, + { + "epoch": 4.787928221859707, + "grad_norm": 0.29077810049057007, + "learning_rate": 0.0009419740430472659, + "loss": 0.0954, + "num_input_tokens_seen": 63424896, + "step": 29350 + }, + { + "epoch": 4.788743882544861, + "grad_norm": 0.04795224219560623, + "learning_rate": 0.0009419407559476911, + "loss": 0.1213, + "num_input_tokens_seen": 63435200, + "step": 29355 + }, + { + "epoch": 4.789559543230016, + "grad_norm": 0.23997856676578522, + "learning_rate": 0.0009419074598916262, + "loss": 0.1989, + "num_input_tokens_seen": 63444960, + "step": 29360 + }, + { + "epoch": 4.790375203915171, + "grad_norm": 0.2853996753692627, + "learning_rate": 0.0009418741548797462, + "loss": 0.1052, + "num_input_tokens_seen": 63455520, + "step": 29365 + }, + { + "epoch": 4.791190864600326, + "grad_norm": 0.08539510518312454, + "learning_rate": 0.0009418408409127257, + "loss": 0.0527, + "num_input_tokens_seen": 63466784, + "step": 29370 + }, + { + "epoch": 4.7920065252854815, + "grad_norm": 0.1659606695175171, + "learning_rate": 0.0009418075179912402, + "loss": 0.0795, + "num_input_tokens_seen": 63477664, + "step": 29375 + }, + { + "epoch": 4.792822185970636, + "grad_norm": 0.10241387784481049, + "learning_rate": 0.0009417741861159648, + "loss": 0.09, + "num_input_tokens_seen": 63488864, + "step": 29380 + }, + { + "epoch": 4.793637846655791, + "grad_norm": 0.25465595722198486, + "learning_rate": 0.0009417408452875751, + "loss": 0.1875, + "num_input_tokens_seen": 63500000, + "step": 29385 + }, + { + "epoch": 4.794453507340946, + "grad_norm": 0.013809128664433956, + "learning_rate": 0.0009417074955067467, + "loss": 0.1046, + "num_input_tokens_seen": 63511424, + "step": 29390 + }, + { + "epoch": 4.795269168026101, + "grad_norm": 0.04915191978216171, + "learning_rate": 0.0009416741367741557, + "loss": 0.1079, + "num_input_tokens_seen": 63521216, + "step": 29395 + }, + { + "epoch": 4.7960848287112565, + "grad_norm": 0.26613113284111023, + "learning_rate": 0.0009416407690904778, + "loss": 0.1033, + "num_input_tokens_seen": 63531488, + "step": 29400 + }, + { + "epoch": 4.796900489396411, + "grad_norm": 0.012351097539067268, + "learning_rate": 0.0009416073924563897, + "loss": 0.079, + "num_input_tokens_seen": 63542784, + "step": 29405 + }, + { + "epoch": 4.797716150081566, + "grad_norm": 0.005100678652524948, + "learning_rate": 0.0009415740068725674, + "loss": 0.06, + "num_input_tokens_seen": 63554240, + "step": 29410 + }, + { + "epoch": 4.798531810766721, + "grad_norm": 0.2662050127983093, + "learning_rate": 0.0009415406123396878, + "loss": 0.1314, + "num_input_tokens_seen": 63566208, + "step": 29415 + }, + { + "epoch": 4.799347471451876, + "grad_norm": 0.2572949230670929, + "learning_rate": 0.0009415072088584275, + "loss": 0.1517, + "num_input_tokens_seen": 63576640, + "step": 29420 + }, + { + "epoch": 4.800163132137031, + "grad_norm": 0.5557500720024109, + "learning_rate": 0.0009414737964294635, + "loss": 0.1396, + "num_input_tokens_seen": 63587264, + "step": 29425 + }, + { + "epoch": 4.800978792822186, + "grad_norm": 0.05724980682134628, + "learning_rate": 0.0009414403750534731, + "loss": 0.0484, + "num_input_tokens_seen": 63597824, + "step": 29430 + }, + { + "epoch": 4.801794453507341, + "grad_norm": 0.007257545366883278, + "learning_rate": 0.0009414069447311333, + "loss": 0.0988, + "num_input_tokens_seen": 63608896, + "step": 29435 + }, + { + "epoch": 4.802610114192496, + "grad_norm": 0.12548641860485077, + "learning_rate": 0.0009413735054631218, + "loss": 0.1451, + "num_input_tokens_seen": 63619840, + "step": 29440 + }, + { + "epoch": 4.803425774877651, + "grad_norm": 0.022364402189850807, + "learning_rate": 0.0009413400572501164, + "loss": 0.0349, + "num_input_tokens_seen": 63630112, + "step": 29445 + }, + { + "epoch": 4.804241435562806, + "grad_norm": 0.13613943755626678, + "learning_rate": 0.0009413066000927948, + "loss": 0.037, + "num_input_tokens_seen": 63640384, + "step": 29450 + }, + { + "epoch": 4.80505709624796, + "grad_norm": 0.06001855060458183, + "learning_rate": 0.0009412731339918353, + "loss": 0.0666, + "num_input_tokens_seen": 63650752, + "step": 29455 + }, + { + "epoch": 4.805872756933116, + "grad_norm": 0.08017107099294662, + "learning_rate": 0.0009412396589479157, + "loss": 0.1239, + "num_input_tokens_seen": 63660544, + "step": 29460 + }, + { + "epoch": 4.806688417618271, + "grad_norm": 0.007623352110385895, + "learning_rate": 0.0009412061749617147, + "loss": 0.0372, + "num_input_tokens_seen": 63671328, + "step": 29465 + }, + { + "epoch": 4.807504078303426, + "grad_norm": 0.08126839250326157, + "learning_rate": 0.0009411726820339109, + "loss": 0.0415, + "num_input_tokens_seen": 63682304, + "step": 29470 + }, + { + "epoch": 4.808319738988581, + "grad_norm": 0.02001064084470272, + "learning_rate": 0.000941139180165183, + "loss": 0.0137, + "num_input_tokens_seen": 63692608, + "step": 29475 + }, + { + "epoch": 4.809135399673735, + "grad_norm": 0.010059705004096031, + "learning_rate": 0.0009411056693562101, + "loss": 0.0354, + "num_input_tokens_seen": 63704448, + "step": 29480 + }, + { + "epoch": 4.809951060358891, + "grad_norm": 0.10895252227783203, + "learning_rate": 0.000941072149607671, + "loss": 0.2122, + "num_input_tokens_seen": 63714848, + "step": 29485 + }, + { + "epoch": 4.810766721044046, + "grad_norm": 0.02574615553021431, + "learning_rate": 0.0009410386209202455, + "loss": 0.0484, + "num_input_tokens_seen": 63726336, + "step": 29490 + }, + { + "epoch": 4.811582381729201, + "grad_norm": 0.00544143607839942, + "learning_rate": 0.0009410050832946127, + "loss": 0.0905, + "num_input_tokens_seen": 63737792, + "step": 29495 + }, + { + "epoch": 4.8123980424143555, + "grad_norm": 0.03158778324723244, + "learning_rate": 0.0009409715367314527, + "loss": 0.0503, + "num_input_tokens_seen": 63748256, + "step": 29500 + }, + { + "epoch": 4.81321370309951, + "grad_norm": 0.1094653531908989, + "learning_rate": 0.0009409379812314447, + "loss": 0.0539, + "num_input_tokens_seen": 63758432, + "step": 29505 + }, + { + "epoch": 4.814029363784666, + "grad_norm": 0.0789109617471695, + "learning_rate": 0.0009409044167952694, + "loss": 0.1382, + "num_input_tokens_seen": 63767744, + "step": 29510 + }, + { + "epoch": 4.814845024469821, + "grad_norm": 0.3024810254573822, + "learning_rate": 0.0009408708434236066, + "loss": 0.1406, + "num_input_tokens_seen": 63779648, + "step": 29515 + }, + { + "epoch": 4.815660685154976, + "grad_norm": 0.028889697045087814, + "learning_rate": 0.000940837261117137, + "loss": 0.1202, + "num_input_tokens_seen": 63790368, + "step": 29520 + }, + { + "epoch": 4.8164763458401305, + "grad_norm": 0.009988417848944664, + "learning_rate": 0.000940803669876541, + "loss": 0.0549, + "num_input_tokens_seen": 63801536, + "step": 29525 + }, + { + "epoch": 4.817292006525285, + "grad_norm": 0.03842491656541824, + "learning_rate": 0.0009407700697024995, + "loss": 0.1412, + "num_input_tokens_seen": 63812224, + "step": 29530 + }, + { + "epoch": 4.81810766721044, + "grad_norm": 0.12616626918315887, + "learning_rate": 0.0009407364605956933, + "loss": 0.0376, + "num_input_tokens_seen": 63823520, + "step": 29535 + }, + { + "epoch": 4.818923327895595, + "grad_norm": 0.24169041216373444, + "learning_rate": 0.0009407028425568036, + "loss": 0.0622, + "num_input_tokens_seen": 63835808, + "step": 29540 + }, + { + "epoch": 4.819738988580751, + "grad_norm": 0.3266375958919525, + "learning_rate": 0.0009406692155865117, + "loss": 0.1315, + "num_input_tokens_seen": 63846528, + "step": 29545 + }, + { + "epoch": 4.8205546492659055, + "grad_norm": 0.0059782578609883785, + "learning_rate": 0.0009406355796854993, + "loss": 0.0899, + "num_input_tokens_seen": 63857760, + "step": 29550 + }, + { + "epoch": 4.82137030995106, + "grad_norm": 0.13781596720218658, + "learning_rate": 0.0009406019348544478, + "loss": 0.0945, + "num_input_tokens_seen": 63867840, + "step": 29555 + }, + { + "epoch": 4.822185970636215, + "grad_norm": 0.005153140053153038, + "learning_rate": 0.000940568281094039, + "loss": 0.1169, + "num_input_tokens_seen": 63878848, + "step": 29560 + }, + { + "epoch": 4.82300163132137, + "grad_norm": 0.012185310013592243, + "learning_rate": 0.0009405346184049552, + "loss": 0.0916, + "num_input_tokens_seen": 63890816, + "step": 29565 + }, + { + "epoch": 4.823817292006526, + "grad_norm": 0.2665148377418518, + "learning_rate": 0.0009405009467878787, + "loss": 0.1471, + "num_input_tokens_seen": 63901632, + "step": 29570 + }, + { + "epoch": 4.8246329526916805, + "grad_norm": 0.3611166775226593, + "learning_rate": 0.0009404672662434914, + "loss": 0.2292, + "num_input_tokens_seen": 63911520, + "step": 29575 + }, + { + "epoch": 4.825448613376835, + "grad_norm": 0.11029206961393356, + "learning_rate": 0.0009404335767724763, + "loss": 0.2225, + "num_input_tokens_seen": 63922176, + "step": 29580 + }, + { + "epoch": 4.82626427406199, + "grad_norm": 0.09035732597112656, + "learning_rate": 0.000940399878375516, + "loss": 0.2299, + "num_input_tokens_seen": 63932544, + "step": 29585 + }, + { + "epoch": 4.827079934747145, + "grad_norm": 0.055607203394174576, + "learning_rate": 0.0009403661710532936, + "loss": 0.103, + "num_input_tokens_seen": 63944032, + "step": 29590 + }, + { + "epoch": 4.827895595432301, + "grad_norm": 0.04743117094039917, + "learning_rate": 0.0009403324548064919, + "loss": 0.0749, + "num_input_tokens_seen": 63955968, + "step": 29595 + }, + { + "epoch": 4.828711256117455, + "grad_norm": 0.0531173013150692, + "learning_rate": 0.0009402987296357946, + "loss": 0.0851, + "num_input_tokens_seen": 63967136, + "step": 29600 + }, + { + "epoch": 4.82952691680261, + "grad_norm": 0.0809497982263565, + "learning_rate": 0.0009402649955418848, + "loss": 0.1069, + "num_input_tokens_seen": 63977728, + "step": 29605 + }, + { + "epoch": 4.830342577487765, + "grad_norm": 0.07989180833101273, + "learning_rate": 0.0009402312525254464, + "loss": 0.1295, + "num_input_tokens_seen": 63988256, + "step": 29610 + }, + { + "epoch": 4.83115823817292, + "grad_norm": 0.07448780536651611, + "learning_rate": 0.0009401975005871632, + "loss": 0.2039, + "num_input_tokens_seen": 63998400, + "step": 29615 + }, + { + "epoch": 4.831973898858075, + "grad_norm": 0.26234593987464905, + "learning_rate": 0.0009401637397277193, + "loss": 0.0886, + "num_input_tokens_seen": 64009312, + "step": 29620 + }, + { + "epoch": 4.8327895595432295, + "grad_norm": 0.1408432275056839, + "learning_rate": 0.0009401299699477988, + "loss": 0.1501, + "num_input_tokens_seen": 64020192, + "step": 29625 + }, + { + "epoch": 4.833605220228385, + "grad_norm": 0.20116648077964783, + "learning_rate": 0.0009400961912480861, + "loss": 0.1166, + "num_input_tokens_seen": 64032352, + "step": 29630 + }, + { + "epoch": 4.83442088091354, + "grad_norm": 0.044378455728292465, + "learning_rate": 0.0009400624036292657, + "loss": 0.072, + "num_input_tokens_seen": 64043360, + "step": 29635 + }, + { + "epoch": 4.835236541598695, + "grad_norm": 0.06201139837503433, + "learning_rate": 0.0009400286070920226, + "loss": 0.1189, + "num_input_tokens_seen": 64053952, + "step": 29640 + }, + { + "epoch": 4.83605220228385, + "grad_norm": 0.08605514466762543, + "learning_rate": 0.0009399948016370415, + "loss": 0.0357, + "num_input_tokens_seen": 64063424, + "step": 29645 + }, + { + "epoch": 4.8368678629690045, + "grad_norm": 0.07138156145811081, + "learning_rate": 0.0009399609872650075, + "loss": 0.0497, + "num_input_tokens_seen": 64073568, + "step": 29650 + }, + { + "epoch": 4.83768352365416, + "grad_norm": 0.07217609882354736, + "learning_rate": 0.000939927163976606, + "loss": 0.1953, + "num_input_tokens_seen": 64084704, + "step": 29655 + }, + { + "epoch": 4.838499184339315, + "grad_norm": 0.0773155689239502, + "learning_rate": 0.0009398933317725225, + "loss": 0.231, + "num_input_tokens_seen": 64095200, + "step": 29660 + }, + { + "epoch": 4.83931484502447, + "grad_norm": 0.10701734572649002, + "learning_rate": 0.0009398594906534424, + "loss": 0.1688, + "num_input_tokens_seen": 64105728, + "step": 29665 + }, + { + "epoch": 4.840130505709625, + "grad_norm": 0.24366922676563263, + "learning_rate": 0.0009398256406200518, + "loss": 0.1445, + "num_input_tokens_seen": 64116544, + "step": 29670 + }, + { + "epoch": 4.8409461663947795, + "grad_norm": 0.06484881788492203, + "learning_rate": 0.0009397917816730368, + "loss": 0.106, + "num_input_tokens_seen": 64128064, + "step": 29675 + }, + { + "epoch": 4.841761827079935, + "grad_norm": 0.14218920469284058, + "learning_rate": 0.0009397579138130832, + "loss": 0.1686, + "num_input_tokens_seen": 64139328, + "step": 29680 + }, + { + "epoch": 4.84257748776509, + "grad_norm": 0.1667938232421875, + "learning_rate": 0.0009397240370408777, + "loss": 0.1401, + "num_input_tokens_seen": 64148992, + "step": 29685 + }, + { + "epoch": 4.843393148450245, + "grad_norm": 0.06840559095144272, + "learning_rate": 0.0009396901513571068, + "loss": 0.2409, + "num_input_tokens_seen": 64159232, + "step": 29690 + }, + { + "epoch": 4.8442088091354, + "grad_norm": 0.04041779041290283, + "learning_rate": 0.0009396562567624572, + "loss": 0.0819, + "num_input_tokens_seen": 64169728, + "step": 29695 + }, + { + "epoch": 4.8450244698205545, + "grad_norm": 0.1621711403131485, + "learning_rate": 0.0009396223532576159, + "loss": 0.0646, + "num_input_tokens_seen": 64181760, + "step": 29700 + }, + { + "epoch": 4.845840130505709, + "grad_norm": 0.03825107589364052, + "learning_rate": 0.0009395884408432696, + "loss": 0.1504, + "num_input_tokens_seen": 64192480, + "step": 29705 + }, + { + "epoch": 4.846655791190865, + "grad_norm": 0.0052245259284973145, + "learning_rate": 0.0009395545195201062, + "loss": 0.1366, + "num_input_tokens_seen": 64203008, + "step": 29710 + }, + { + "epoch": 4.84747145187602, + "grad_norm": 0.031087854877114296, + "learning_rate": 0.0009395205892888126, + "loss": 0.0807, + "num_input_tokens_seen": 64213696, + "step": 29715 + }, + { + "epoch": 4.848287112561175, + "grad_norm": 0.02025986835360527, + "learning_rate": 0.0009394866501500769, + "loss": 0.0323, + "num_input_tokens_seen": 64222784, + "step": 29720 + }, + { + "epoch": 4.849102773246329, + "grad_norm": 0.017888156697154045, + "learning_rate": 0.0009394527021045866, + "loss": 0.0199, + "num_input_tokens_seen": 64233312, + "step": 29725 + }, + { + "epoch": 4.849918433931484, + "grad_norm": 0.030029356479644775, + "learning_rate": 0.0009394187451530298, + "loss": 0.0646, + "num_input_tokens_seen": 64245056, + "step": 29730 + }, + { + "epoch": 4.850734094616639, + "grad_norm": 0.16684281826019287, + "learning_rate": 0.0009393847792960948, + "loss": 0.1564, + "num_input_tokens_seen": 64256032, + "step": 29735 + }, + { + "epoch": 4.851549755301795, + "grad_norm": 0.028144538402557373, + "learning_rate": 0.0009393508045344697, + "loss": 0.097, + "num_input_tokens_seen": 64266848, + "step": 29740 + }, + { + "epoch": 4.85236541598695, + "grad_norm": 0.17019401490688324, + "learning_rate": 0.0009393168208688432, + "loss": 0.116, + "num_input_tokens_seen": 64276448, + "step": 29745 + }, + { + "epoch": 4.853181076672104, + "grad_norm": 0.01753625087440014, + "learning_rate": 0.0009392828282999042, + "loss": 0.1403, + "num_input_tokens_seen": 64288000, + "step": 29750 + }, + { + "epoch": 4.853996737357259, + "grad_norm": 0.05499972775578499, + "learning_rate": 0.0009392488268283412, + "loss": 0.0819, + "num_input_tokens_seen": 64299104, + "step": 29755 + }, + { + "epoch": 4.854812398042414, + "grad_norm": 0.02356194145977497, + "learning_rate": 0.0009392148164548436, + "loss": 0.0414, + "num_input_tokens_seen": 64310880, + "step": 29760 + }, + { + "epoch": 4.85562805872757, + "grad_norm": 0.23860061168670654, + "learning_rate": 0.0009391807971801005, + "loss": 0.0683, + "num_input_tokens_seen": 64321696, + "step": 29765 + }, + { + "epoch": 4.856443719412725, + "grad_norm": 0.02195931412279606, + "learning_rate": 0.0009391467690048014, + "loss": 0.067, + "num_input_tokens_seen": 64332544, + "step": 29770 + }, + { + "epoch": 4.857259380097879, + "grad_norm": 0.08725380897521973, + "learning_rate": 0.000939112731929636, + "loss": 0.1052, + "num_input_tokens_seen": 64344640, + "step": 29775 + }, + { + "epoch": 4.858075040783034, + "grad_norm": 0.17282482981681824, + "learning_rate": 0.000939078685955294, + "loss": 0.1707, + "num_input_tokens_seen": 64356608, + "step": 29780 + }, + { + "epoch": 4.858890701468189, + "grad_norm": 0.030611051246523857, + "learning_rate": 0.0009390446310824654, + "loss": 0.0402, + "num_input_tokens_seen": 64366592, + "step": 29785 + }, + { + "epoch": 4.859706362153344, + "grad_norm": 0.006273448932915926, + "learning_rate": 0.0009390105673118405, + "loss": 0.1194, + "num_input_tokens_seen": 64378560, + "step": 29790 + }, + { + "epoch": 4.8605220228384995, + "grad_norm": 0.2683796286582947, + "learning_rate": 0.0009389764946441094, + "loss": 0.1358, + "num_input_tokens_seen": 64389664, + "step": 29795 + }, + { + "epoch": 4.861337683523654, + "grad_norm": 0.07152576744556427, + "learning_rate": 0.0009389424130799628, + "loss": 0.0708, + "num_input_tokens_seen": 64400544, + "step": 29800 + }, + { + "epoch": 4.862153344208809, + "grad_norm": 0.08048900961875916, + "learning_rate": 0.0009389083226200914, + "loss": 0.0461, + "num_input_tokens_seen": 64411744, + "step": 29805 + }, + { + "epoch": 4.862969004893964, + "grad_norm": 0.07328837364912033, + "learning_rate": 0.0009388742232651859, + "loss": 0.0862, + "num_input_tokens_seen": 64421440, + "step": 29810 + }, + { + "epoch": 4.863784665579119, + "grad_norm": 0.038748499006032944, + "learning_rate": 0.0009388401150159377, + "loss": 0.0792, + "num_input_tokens_seen": 64432256, + "step": 29815 + }, + { + "epoch": 4.864600326264274, + "grad_norm": 0.02115444466471672, + "learning_rate": 0.0009388059978730377, + "loss": 0.0326, + "num_input_tokens_seen": 64442688, + "step": 29820 + }, + { + "epoch": 4.865415986949429, + "grad_norm": 0.022876601666212082, + "learning_rate": 0.0009387718718371776, + "loss": 0.0351, + "num_input_tokens_seen": 64453792, + "step": 29825 + }, + { + "epoch": 4.866231647634584, + "grad_norm": 0.016570046544075012, + "learning_rate": 0.0009387377369090489, + "loss": 0.0518, + "num_input_tokens_seen": 64465248, + "step": 29830 + }, + { + "epoch": 4.867047308319739, + "grad_norm": 0.024501780048012733, + "learning_rate": 0.0009387035930893433, + "loss": 0.024, + "num_input_tokens_seen": 64477792, + "step": 29835 + }, + { + "epoch": 4.867862969004894, + "grad_norm": 0.015719836577773094, + "learning_rate": 0.0009386694403787529, + "loss": 0.1272, + "num_input_tokens_seen": 64489824, + "step": 29840 + }, + { + "epoch": 4.868678629690049, + "grad_norm": 0.4363064467906952, + "learning_rate": 0.0009386352787779697, + "loss": 0.1239, + "num_input_tokens_seen": 64501248, + "step": 29845 + }, + { + "epoch": 4.869494290375204, + "grad_norm": 0.02297714538872242, + "learning_rate": 0.0009386011082876863, + "loss": 0.0636, + "num_input_tokens_seen": 64513088, + "step": 29850 + }, + { + "epoch": 4.870309951060359, + "grad_norm": 0.04365037381649017, + "learning_rate": 0.000938566928908595, + "loss": 0.127, + "num_input_tokens_seen": 64524096, + "step": 29855 + }, + { + "epoch": 4.871125611745514, + "grad_norm": 0.01078298594802618, + "learning_rate": 0.0009385327406413883, + "loss": 0.0282, + "num_input_tokens_seen": 64534944, + "step": 29860 + }, + { + "epoch": 4.871941272430669, + "grad_norm": 0.07390675693750381, + "learning_rate": 0.0009384985434867597, + "loss": 0.0329, + "num_input_tokens_seen": 64546464, + "step": 29865 + }, + { + "epoch": 4.872756933115824, + "grad_norm": 0.016646305099129677, + "learning_rate": 0.0009384643374454014, + "loss": 0.023, + "num_input_tokens_seen": 64558464, + "step": 29870 + }, + { + "epoch": 4.873572593800979, + "grad_norm": 0.03762689605355263, + "learning_rate": 0.0009384301225180074, + "loss": 0.0652, + "num_input_tokens_seen": 64568864, + "step": 29875 + }, + { + "epoch": 4.874388254486134, + "grad_norm": 0.0683576911687851, + "learning_rate": 0.0009383958987052706, + "loss": 0.1253, + "num_input_tokens_seen": 64580192, + "step": 29880 + }, + { + "epoch": 4.875203915171289, + "grad_norm": 0.0032370425760746002, + "learning_rate": 0.0009383616660078849, + "loss": 0.2725, + "num_input_tokens_seen": 64589632, + "step": 29885 + }, + { + "epoch": 4.876019575856444, + "grad_norm": 0.01514100655913353, + "learning_rate": 0.0009383274244265438, + "loss": 0.1108, + "num_input_tokens_seen": 64600192, + "step": 29890 + }, + { + "epoch": 4.876835236541599, + "grad_norm": 0.06531043350696564, + "learning_rate": 0.0009382931739619416, + "loss": 0.0504, + "num_input_tokens_seen": 64610656, + "step": 29895 + }, + { + "epoch": 4.877650897226753, + "grad_norm": 0.058074701577425, + "learning_rate": 0.000938258914614772, + "loss": 0.0522, + "num_input_tokens_seen": 64621728, + "step": 29900 + }, + { + "epoch": 4.878466557911908, + "grad_norm": 0.015192513354122639, + "learning_rate": 0.0009382246463857295, + "loss": 0.0948, + "num_input_tokens_seen": 64633024, + "step": 29905 + }, + { + "epoch": 4.879282218597064, + "grad_norm": 0.07241601496934891, + "learning_rate": 0.0009381903692755087, + "loss": 0.1767, + "num_input_tokens_seen": 64644480, + "step": 29910 + }, + { + "epoch": 4.880097879282219, + "grad_norm": 0.0064300913363695145, + "learning_rate": 0.0009381560832848043, + "loss": 0.0913, + "num_input_tokens_seen": 64655520, + "step": 29915 + }, + { + "epoch": 4.8809135399673735, + "grad_norm": 0.2770669162273407, + "learning_rate": 0.0009381217884143109, + "loss": 0.2039, + "num_input_tokens_seen": 64667392, + "step": 29920 + }, + { + "epoch": 4.881729200652528, + "grad_norm": 0.09652873128652573, + "learning_rate": 0.0009380874846647236, + "loss": 0.0567, + "num_input_tokens_seen": 64678528, + "step": 29925 + }, + { + "epoch": 4.882544861337683, + "grad_norm": 0.06691589951515198, + "learning_rate": 0.0009380531720367378, + "loss": 0.0366, + "num_input_tokens_seen": 64689248, + "step": 29930 + }, + { + "epoch": 4.883360522022839, + "grad_norm": 0.010861546732485294, + "learning_rate": 0.0009380188505310488, + "loss": 0.0329, + "num_input_tokens_seen": 64698688, + "step": 29935 + }, + { + "epoch": 4.884176182707994, + "grad_norm": 0.021841704845428467, + "learning_rate": 0.0009379845201483519, + "loss": 0.0614, + "num_input_tokens_seen": 64710368, + "step": 29940 + }, + { + "epoch": 4.8849918433931485, + "grad_norm": 0.3616441488265991, + "learning_rate": 0.0009379501808893433, + "loss": 0.1891, + "num_input_tokens_seen": 64720416, + "step": 29945 + }, + { + "epoch": 4.885807504078303, + "grad_norm": 0.1140311136841774, + "learning_rate": 0.0009379158327547186, + "loss": 0.2245, + "num_input_tokens_seen": 64731360, + "step": 29950 + }, + { + "epoch": 4.886623164763458, + "grad_norm": 0.19789689779281616, + "learning_rate": 0.000937881475745174, + "loss": 0.1402, + "num_input_tokens_seen": 64740928, + "step": 29955 + }, + { + "epoch": 4.887438825448614, + "grad_norm": 0.03600520268082619, + "learning_rate": 0.0009378471098614059, + "loss": 0.062, + "num_input_tokens_seen": 64750784, + "step": 29960 + }, + { + "epoch": 4.888254486133769, + "grad_norm": 0.23173433542251587, + "learning_rate": 0.0009378127351041106, + "loss": 0.173, + "num_input_tokens_seen": 64762272, + "step": 29965 + }, + { + "epoch": 4.8890701468189235, + "grad_norm": 0.1283300220966339, + "learning_rate": 0.0009377783514739848, + "loss": 0.115, + "num_input_tokens_seen": 64773984, + "step": 29970 + }, + { + "epoch": 4.889885807504078, + "grad_norm": 0.030921900644898415, + "learning_rate": 0.0009377439589717254, + "loss": 0.0907, + "num_input_tokens_seen": 64784704, + "step": 29975 + }, + { + "epoch": 4.890701468189233, + "grad_norm": 0.022784793749451637, + "learning_rate": 0.0009377095575980293, + "loss": 0.1639, + "num_input_tokens_seen": 64795072, + "step": 29980 + }, + { + "epoch": 4.891517128874388, + "grad_norm": 0.0242206659168005, + "learning_rate": 0.0009376751473535939, + "loss": 0.1102, + "num_input_tokens_seen": 64805888, + "step": 29985 + }, + { + "epoch": 4.892332789559543, + "grad_norm": 0.26550939679145813, + "learning_rate": 0.0009376407282391161, + "loss": 0.077, + "num_input_tokens_seen": 64817120, + "step": 29990 + }, + { + "epoch": 4.8931484502446985, + "grad_norm": 0.025271253660321236, + "learning_rate": 0.0009376063002552939, + "loss": 0.068, + "num_input_tokens_seen": 64829504, + "step": 29995 + }, + { + "epoch": 4.893964110929853, + "grad_norm": 0.02558698132634163, + "learning_rate": 0.0009375718634028249, + "loss": 0.0362, + "num_input_tokens_seen": 64839488, + "step": 30000 + }, + { + "epoch": 4.894779771615008, + "grad_norm": 0.4274783730506897, + "learning_rate": 0.0009375374176824071, + "loss": 0.222, + "num_input_tokens_seen": 64851168, + "step": 30005 + }, + { + "epoch": 4.895595432300163, + "grad_norm": 0.028903350234031677, + "learning_rate": 0.0009375029630947384, + "loss": 0.1904, + "num_input_tokens_seen": 64860736, + "step": 30010 + }, + { + "epoch": 4.896411092985318, + "grad_norm": 0.012105568312108517, + "learning_rate": 0.000937468499640517, + "loss": 0.0793, + "num_input_tokens_seen": 64871968, + "step": 30015 + }, + { + "epoch": 4.897226753670473, + "grad_norm": 0.2322903871536255, + "learning_rate": 0.0009374340273204416, + "loss": 0.2846, + "num_input_tokens_seen": 64882912, + "step": 30020 + }, + { + "epoch": 4.898042414355628, + "grad_norm": 0.06537723541259766, + "learning_rate": 0.0009373995461352107, + "loss": 0.0465, + "num_input_tokens_seen": 64894272, + "step": 30025 + }, + { + "epoch": 4.898858075040783, + "grad_norm": 0.18969091773033142, + "learning_rate": 0.0009373650560855232, + "loss": 0.1186, + "num_input_tokens_seen": 64904672, + "step": 30030 + }, + { + "epoch": 4.899673735725938, + "grad_norm": 0.2091660499572754, + "learning_rate": 0.0009373305571720779, + "loss": 0.1496, + "num_input_tokens_seen": 64915296, + "step": 30035 + }, + { + "epoch": 4.900489396411093, + "grad_norm": 0.030176764354109764, + "learning_rate": 0.0009372960493955741, + "loss": 0.1566, + "num_input_tokens_seen": 64927840, + "step": 30040 + }, + { + "epoch": 4.901305057096248, + "grad_norm": 0.041412852704524994, + "learning_rate": 0.0009372615327567111, + "loss": 0.0968, + "num_input_tokens_seen": 64938976, + "step": 30045 + }, + { + "epoch": 4.902120717781403, + "grad_norm": 0.026320433244109154, + "learning_rate": 0.0009372270072561885, + "loss": 0.0668, + "num_input_tokens_seen": 64949056, + "step": 30050 + }, + { + "epoch": 4.902936378466558, + "grad_norm": 0.023878064006567, + "learning_rate": 0.0009371924728947059, + "loss": 0.1959, + "num_input_tokens_seen": 64959680, + "step": 30055 + }, + { + "epoch": 4.903752039151713, + "grad_norm": 0.11372017860412598, + "learning_rate": 0.0009371579296729631, + "loss": 0.095, + "num_input_tokens_seen": 64971232, + "step": 30060 + }, + { + "epoch": 4.904567699836868, + "grad_norm": 0.04866662621498108, + "learning_rate": 0.0009371233775916604, + "loss": 0.1943, + "num_input_tokens_seen": 64983040, + "step": 30065 + }, + { + "epoch": 4.9053833605220225, + "grad_norm": 0.15420961380004883, + "learning_rate": 0.0009370888166514979, + "loss": 0.1675, + "num_input_tokens_seen": 64993792, + "step": 30070 + }, + { + "epoch": 4.906199021207177, + "grad_norm": 0.0190042182803154, + "learning_rate": 0.0009370542468531761, + "loss": 0.1009, + "num_input_tokens_seen": 65004608, + "step": 30075 + }, + { + "epoch": 4.907014681892333, + "grad_norm": 0.03797129914164543, + "learning_rate": 0.0009370196681973955, + "loss": 0.0749, + "num_input_tokens_seen": 65015360, + "step": 30080 + }, + { + "epoch": 4.907830342577488, + "grad_norm": 0.10052059590816498, + "learning_rate": 0.0009369850806848569, + "loss": 0.2176, + "num_input_tokens_seen": 65025216, + "step": 30085 + }, + { + "epoch": 4.908646003262643, + "grad_norm": 0.07716162502765656, + "learning_rate": 0.0009369504843162613, + "loss": 0.0644, + "num_input_tokens_seen": 65036160, + "step": 30090 + }, + { + "epoch": 4.9094616639477975, + "grad_norm": 0.19943881034851074, + "learning_rate": 0.0009369158790923098, + "loss": 0.0612, + "num_input_tokens_seen": 65046112, + "step": 30095 + }, + { + "epoch": 4.910277324632952, + "grad_norm": 0.03232225030660629, + "learning_rate": 0.0009368812650137038, + "loss": 0.0405, + "num_input_tokens_seen": 65056960, + "step": 30100 + }, + { + "epoch": 4.911092985318108, + "grad_norm": 0.014520195312798023, + "learning_rate": 0.0009368466420811446, + "loss": 0.1118, + "num_input_tokens_seen": 65067488, + "step": 30105 + }, + { + "epoch": 4.911908646003263, + "grad_norm": 0.12783744931221008, + "learning_rate": 0.0009368120102953341, + "loss": 0.138, + "num_input_tokens_seen": 65076768, + "step": 30110 + }, + { + "epoch": 4.912724306688418, + "grad_norm": 0.046797335147857666, + "learning_rate": 0.0009367773696569742, + "loss": 0.0359, + "num_input_tokens_seen": 65088416, + "step": 30115 + }, + { + "epoch": 4.9135399673735725, + "grad_norm": 0.008519193157553673, + "learning_rate": 0.0009367427201667667, + "loss": 0.0835, + "num_input_tokens_seen": 65100192, + "step": 30120 + }, + { + "epoch": 4.914355628058727, + "grad_norm": 0.14391852915287018, + "learning_rate": 0.000936708061825414, + "loss": 0.0602, + "num_input_tokens_seen": 65110336, + "step": 30125 + }, + { + "epoch": 4.915171288743883, + "grad_norm": 0.19014444947242737, + "learning_rate": 0.0009366733946336184, + "loss": 0.2144, + "num_input_tokens_seen": 65121408, + "step": 30130 + }, + { + "epoch": 4.915986949429038, + "grad_norm": 0.040380168706178665, + "learning_rate": 0.0009366387185920824, + "loss": 0.0334, + "num_input_tokens_seen": 65131584, + "step": 30135 + }, + { + "epoch": 4.916802610114193, + "grad_norm": 0.049314629286527634, + "learning_rate": 0.0009366040337015089, + "loss": 0.0762, + "num_input_tokens_seen": 65143072, + "step": 30140 + }, + { + "epoch": 4.917618270799347, + "grad_norm": 0.022450562566518784, + "learning_rate": 0.0009365693399626009, + "loss": 0.1344, + "num_input_tokens_seen": 65155008, + "step": 30145 + }, + { + "epoch": 4.918433931484502, + "grad_norm": 0.006674426142126322, + "learning_rate": 0.0009365346373760613, + "loss": 0.1624, + "num_input_tokens_seen": 65165760, + "step": 30150 + }, + { + "epoch": 4.919249592169657, + "grad_norm": 0.007688583806157112, + "learning_rate": 0.0009364999259425935, + "loss": 0.0116, + "num_input_tokens_seen": 65175808, + "step": 30155 + }, + { + "epoch": 4.920065252854813, + "grad_norm": 0.033700115978717804, + "learning_rate": 0.0009364652056629008, + "loss": 0.0651, + "num_input_tokens_seen": 65186176, + "step": 30160 + }, + { + "epoch": 4.920880913539968, + "grad_norm": 0.1259598284959793, + "learning_rate": 0.0009364304765376872, + "loss": 0.1328, + "num_input_tokens_seen": 65196096, + "step": 30165 + }, + { + "epoch": 4.921696574225122, + "grad_norm": 0.052267853170633316, + "learning_rate": 0.0009363957385676563, + "loss": 0.2088, + "num_input_tokens_seen": 65206432, + "step": 30170 + }, + { + "epoch": 4.922512234910277, + "grad_norm": 0.06683950871229172, + "learning_rate": 0.0009363609917535122, + "loss": 0.0611, + "num_input_tokens_seen": 65216800, + "step": 30175 + }, + { + "epoch": 4.923327895595432, + "grad_norm": 0.08807369321584702, + "learning_rate": 0.000936326236095959, + "loss": 0.0807, + "num_input_tokens_seen": 65225280, + "step": 30180 + }, + { + "epoch": 4.924143556280587, + "grad_norm": 0.1126125380396843, + "learning_rate": 0.0009362914715957011, + "loss": 0.166, + "num_input_tokens_seen": 65235776, + "step": 30185 + }, + { + "epoch": 4.924959216965743, + "grad_norm": 0.17967797815799713, + "learning_rate": 0.000936256698253443, + "loss": 0.1025, + "num_input_tokens_seen": 65246496, + "step": 30190 + }, + { + "epoch": 4.925774877650897, + "grad_norm": 0.12007225304841995, + "learning_rate": 0.0009362219160698895, + "loss": 0.0478, + "num_input_tokens_seen": 65256864, + "step": 30195 + }, + { + "epoch": 4.926590538336052, + "grad_norm": 0.011540076695382595, + "learning_rate": 0.0009361871250457457, + "loss": 0.151, + "num_input_tokens_seen": 65268320, + "step": 30200 + }, + { + "epoch": 4.927406199021207, + "grad_norm": 0.006911895237863064, + "learning_rate": 0.0009361523251817161, + "loss": 0.1814, + "num_input_tokens_seen": 65279232, + "step": 30205 + }, + { + "epoch": 4.928221859706362, + "grad_norm": 0.030219666659832, + "learning_rate": 0.0009361175164785065, + "loss": 0.1072, + "num_input_tokens_seen": 65289664, + "step": 30210 + }, + { + "epoch": 4.9290375203915175, + "grad_norm": 0.09913137555122375, + "learning_rate": 0.0009360826989368223, + "loss": 0.0381, + "num_input_tokens_seen": 65301088, + "step": 30215 + }, + { + "epoch": 4.929853181076672, + "grad_norm": 0.01770567148923874, + "learning_rate": 0.0009360478725573689, + "loss": 0.1241, + "num_input_tokens_seen": 65311328, + "step": 30220 + }, + { + "epoch": 4.930668841761827, + "grad_norm": 0.025420457124710083, + "learning_rate": 0.0009360130373408522, + "loss": 0.068, + "num_input_tokens_seen": 65321088, + "step": 30225 + }, + { + "epoch": 4.931484502446982, + "grad_norm": 0.284669429063797, + "learning_rate": 0.000935978193287978, + "loss": 0.1822, + "num_input_tokens_seen": 65332576, + "step": 30230 + }, + { + "epoch": 4.932300163132137, + "grad_norm": 0.1337418556213379, + "learning_rate": 0.0009359433403994529, + "loss": 0.2099, + "num_input_tokens_seen": 65343200, + "step": 30235 + }, + { + "epoch": 4.933115823817292, + "grad_norm": 0.015668069943785667, + "learning_rate": 0.0009359084786759828, + "loss": 0.0709, + "num_input_tokens_seen": 65355040, + "step": 30240 + }, + { + "epoch": 4.933931484502447, + "grad_norm": 0.037535425275564194, + "learning_rate": 0.0009358736081182746, + "loss": 0.1577, + "num_input_tokens_seen": 65366592, + "step": 30245 + }, + { + "epoch": 4.934747145187602, + "grad_norm": 0.19737955927848816, + "learning_rate": 0.0009358387287270346, + "loss": 0.154, + "num_input_tokens_seen": 65377568, + "step": 30250 + }, + { + "epoch": 4.935562805872757, + "grad_norm": 0.03458595648407936, + "learning_rate": 0.0009358038405029699, + "loss": 0.192, + "num_input_tokens_seen": 65388064, + "step": 30255 + }, + { + "epoch": 4.936378466557912, + "grad_norm": 0.1987319439649582, + "learning_rate": 0.0009357689434467875, + "loss": 0.141, + "num_input_tokens_seen": 65398816, + "step": 30260 + }, + { + "epoch": 4.937194127243067, + "grad_norm": 0.18942292034626007, + "learning_rate": 0.0009357340375591947, + "loss": 0.0928, + "num_input_tokens_seen": 65409184, + "step": 30265 + }, + { + "epoch": 4.938009787928221, + "grad_norm": 0.0764419436454773, + "learning_rate": 0.0009356991228408988, + "loss": 0.0767, + "num_input_tokens_seen": 65422208, + "step": 30270 + }, + { + "epoch": 4.938825448613377, + "grad_norm": 0.01329710427671671, + "learning_rate": 0.0009356641992926075, + "loss": 0.0533, + "num_input_tokens_seen": 65433984, + "step": 30275 + }, + { + "epoch": 4.939641109298532, + "grad_norm": 0.0733145996928215, + "learning_rate": 0.0009356292669150286, + "loss": 0.0747, + "num_input_tokens_seen": 65444864, + "step": 30280 + }, + { + "epoch": 4.940456769983687, + "grad_norm": 0.047081612050533295, + "learning_rate": 0.0009355943257088698, + "loss": 0.0963, + "num_input_tokens_seen": 65456960, + "step": 30285 + }, + { + "epoch": 4.941272430668842, + "grad_norm": 0.10892040282487869, + "learning_rate": 0.0009355593756748395, + "loss": 0.0535, + "num_input_tokens_seen": 65466816, + "step": 30290 + }, + { + "epoch": 4.942088091353996, + "grad_norm": 0.017515188083052635, + "learning_rate": 0.0009355244168136459, + "loss": 0.1046, + "num_input_tokens_seen": 65478240, + "step": 30295 + }, + { + "epoch": 4.942903752039152, + "grad_norm": 0.014847962185740471, + "learning_rate": 0.0009354894491259975, + "loss": 0.0855, + "num_input_tokens_seen": 65489408, + "step": 30300 + }, + { + "epoch": 4.943719412724307, + "grad_norm": 0.02151155099272728, + "learning_rate": 0.0009354544726126029, + "loss": 0.0291, + "num_input_tokens_seen": 65499904, + "step": 30305 + }, + { + "epoch": 4.944535073409462, + "grad_norm": 0.43711721897125244, + "learning_rate": 0.000935419487274171, + "loss": 0.0833, + "num_input_tokens_seen": 65510144, + "step": 30310 + }, + { + "epoch": 4.945350734094617, + "grad_norm": 0.2166256606578827, + "learning_rate": 0.0009353844931114108, + "loss": 0.1275, + "num_input_tokens_seen": 65520704, + "step": 30315 + }, + { + "epoch": 4.946166394779771, + "grad_norm": 0.10503847897052765, + "learning_rate": 0.0009353494901250316, + "loss": 0.1356, + "num_input_tokens_seen": 65531424, + "step": 30320 + }, + { + "epoch": 4.946982055464927, + "grad_norm": 0.3435342311859131, + "learning_rate": 0.0009353144783157428, + "loss": 0.1432, + "num_input_tokens_seen": 65542272, + "step": 30325 + }, + { + "epoch": 4.947797716150082, + "grad_norm": 0.0990108996629715, + "learning_rate": 0.0009352794576842536, + "loss": 0.0943, + "num_input_tokens_seen": 65553664, + "step": 30330 + }, + { + "epoch": 4.948613376835237, + "grad_norm": 0.680467963218689, + "learning_rate": 0.0009352444282312742, + "loss": 0.1843, + "num_input_tokens_seen": 65564992, + "step": 30335 + }, + { + "epoch": 4.9494290375203915, + "grad_norm": 0.2405286282300949, + "learning_rate": 0.0009352093899575143, + "loss": 0.1095, + "num_input_tokens_seen": 65576736, + "step": 30340 + }, + { + "epoch": 4.950244698205546, + "grad_norm": 0.049668990075588226, + "learning_rate": 0.0009351743428636838, + "loss": 0.0338, + "num_input_tokens_seen": 65587072, + "step": 30345 + }, + { + "epoch": 4.951060358890701, + "grad_norm": 0.026410933583974838, + "learning_rate": 0.0009351392869504934, + "loss": 0.0325, + "num_input_tokens_seen": 65598400, + "step": 30350 + }, + { + "epoch": 4.951876019575856, + "grad_norm": 0.01361384242773056, + "learning_rate": 0.0009351042222186533, + "loss": 0.1572, + "num_input_tokens_seen": 65609024, + "step": 30355 + }, + { + "epoch": 4.952691680261012, + "grad_norm": 0.15705722570419312, + "learning_rate": 0.0009350691486688743, + "loss": 0.2538, + "num_input_tokens_seen": 65618368, + "step": 30360 + }, + { + "epoch": 4.9535073409461665, + "grad_norm": 0.1279543787240982, + "learning_rate": 0.0009350340663018668, + "loss": 0.0771, + "num_input_tokens_seen": 65629216, + "step": 30365 + }, + { + "epoch": 4.954323001631321, + "grad_norm": 0.01910022459924221, + "learning_rate": 0.0009349989751183422, + "loss": 0.0606, + "num_input_tokens_seen": 65639904, + "step": 30370 + }, + { + "epoch": 4.955138662316476, + "grad_norm": 0.014316494576632977, + "learning_rate": 0.0009349638751190115, + "loss": 0.0762, + "num_input_tokens_seen": 65651936, + "step": 30375 + }, + { + "epoch": 4.955954323001631, + "grad_norm": 0.05893901363015175, + "learning_rate": 0.0009349287663045862, + "loss": 0.1634, + "num_input_tokens_seen": 65662848, + "step": 30380 + }, + { + "epoch": 4.956769983686787, + "grad_norm": 0.2651807367801666, + "learning_rate": 0.0009348936486757775, + "loss": 0.122, + "num_input_tokens_seen": 65674272, + "step": 30385 + }, + { + "epoch": 4.9575856443719415, + "grad_norm": 0.3959857225418091, + "learning_rate": 0.0009348585222332975, + "loss": 0.2737, + "num_input_tokens_seen": 65684288, + "step": 30390 + }, + { + "epoch": 4.958401305057096, + "grad_norm": 0.03873610496520996, + "learning_rate": 0.0009348233869778577, + "loss": 0.0281, + "num_input_tokens_seen": 65694784, + "step": 30395 + }, + { + "epoch": 4.959216965742251, + "grad_norm": 0.03320920094847679, + "learning_rate": 0.0009347882429101706, + "loss": 0.0251, + "num_input_tokens_seen": 65704768, + "step": 30400 + }, + { + "epoch": 4.960032626427406, + "grad_norm": 0.055013399571180344, + "learning_rate": 0.000934753090030948, + "loss": 0.3051, + "num_input_tokens_seen": 65715264, + "step": 30405 + }, + { + "epoch": 4.960848287112562, + "grad_norm": 0.01581265963613987, + "learning_rate": 0.0009347179283409027, + "loss": 0.084, + "num_input_tokens_seen": 65726624, + "step": 30410 + }, + { + "epoch": 4.9616639477977165, + "grad_norm": 0.15133565664291382, + "learning_rate": 0.0009346827578407468, + "loss": 0.1525, + "num_input_tokens_seen": 65737760, + "step": 30415 + }, + { + "epoch": 4.962479608482871, + "grad_norm": 0.05148269236087799, + "learning_rate": 0.0009346475785311936, + "loss": 0.1763, + "num_input_tokens_seen": 65749280, + "step": 30420 + }, + { + "epoch": 4.963295269168026, + "grad_norm": 0.09984282404184341, + "learning_rate": 0.0009346123904129558, + "loss": 0.0952, + "num_input_tokens_seen": 65759616, + "step": 30425 + }, + { + "epoch": 4.964110929853181, + "grad_norm": 0.03934243321418762, + "learning_rate": 0.0009345771934867464, + "loss": 0.0789, + "num_input_tokens_seen": 65771104, + "step": 30430 + }, + { + "epoch": 4.964926590538336, + "grad_norm": 0.035313550382852554, + "learning_rate": 0.000934541987753279, + "loss": 0.059, + "num_input_tokens_seen": 65782688, + "step": 30435 + }, + { + "epoch": 4.9657422512234906, + "grad_norm": 0.037588831037282944, + "learning_rate": 0.0009345067732132671, + "loss": 0.0505, + "num_input_tokens_seen": 65793408, + "step": 30440 + }, + { + "epoch": 4.966557911908646, + "grad_norm": 0.28265854716300964, + "learning_rate": 0.0009344715498674241, + "loss": 0.1367, + "num_input_tokens_seen": 65805024, + "step": 30445 + }, + { + "epoch": 4.967373572593801, + "grad_norm": 0.020411711186170578, + "learning_rate": 0.0009344363177164639, + "loss": 0.0968, + "num_input_tokens_seen": 65816736, + "step": 30450 + }, + { + "epoch": 4.968189233278956, + "grad_norm": 0.12036476284265518, + "learning_rate": 0.0009344010767611007, + "loss": 0.2162, + "num_input_tokens_seen": 65828512, + "step": 30455 + }, + { + "epoch": 4.969004893964111, + "grad_norm": 0.24372944235801697, + "learning_rate": 0.0009343658270020485, + "loss": 0.1547, + "num_input_tokens_seen": 65839200, + "step": 30460 + }, + { + "epoch": 4.9698205546492655, + "grad_norm": 0.04841368645429611, + "learning_rate": 0.000934330568440022, + "loss": 0.0459, + "num_input_tokens_seen": 65851104, + "step": 30465 + }, + { + "epoch": 4.970636215334421, + "grad_norm": 0.04966012388467789, + "learning_rate": 0.0009342953010757353, + "loss": 0.11, + "num_input_tokens_seen": 65861696, + "step": 30470 + }, + { + "epoch": 4.971451876019576, + "grad_norm": 0.19069804251194, + "learning_rate": 0.0009342600249099036, + "loss": 0.1176, + "num_input_tokens_seen": 65871264, + "step": 30475 + }, + { + "epoch": 4.972267536704731, + "grad_norm": 0.048126090317964554, + "learning_rate": 0.0009342247399432414, + "loss": 0.1139, + "num_input_tokens_seen": 65882816, + "step": 30480 + }, + { + "epoch": 4.973083197389886, + "grad_norm": 0.020289182662963867, + "learning_rate": 0.0009341894461764641, + "loss": 0.0724, + "num_input_tokens_seen": 65892576, + "step": 30485 + }, + { + "epoch": 4.9738988580750405, + "grad_norm": 0.03914694860577583, + "learning_rate": 0.0009341541436102868, + "loss": 0.1141, + "num_input_tokens_seen": 65903744, + "step": 30490 + }, + { + "epoch": 4.974714518760196, + "grad_norm": 0.03482364863157272, + "learning_rate": 0.0009341188322454251, + "loss": 0.1668, + "num_input_tokens_seen": 65913504, + "step": 30495 + }, + { + "epoch": 4.975530179445351, + "grad_norm": 0.01898271031677723, + "learning_rate": 0.0009340835120825946, + "loss": 0.0568, + "num_input_tokens_seen": 65925152, + "step": 30500 + }, + { + "epoch": 4.976345840130506, + "grad_norm": 0.10762995481491089, + "learning_rate": 0.0009340481831225109, + "loss": 0.106, + "num_input_tokens_seen": 65936288, + "step": 30505 + }, + { + "epoch": 4.977161500815661, + "grad_norm": 0.04676266387104988, + "learning_rate": 0.0009340128453658902, + "loss": 0.044, + "num_input_tokens_seen": 65947104, + "step": 30510 + }, + { + "epoch": 4.9779771615008155, + "grad_norm": 0.013631324283778667, + "learning_rate": 0.0009339774988134487, + "loss": 0.1765, + "num_input_tokens_seen": 65958464, + "step": 30515 + }, + { + "epoch": 4.97879282218597, + "grad_norm": 0.23239445686340332, + "learning_rate": 0.0009339421434659025, + "loss": 0.123, + "num_input_tokens_seen": 65970784, + "step": 30520 + }, + { + "epoch": 4.979608482871125, + "grad_norm": 0.11724288761615753, + "learning_rate": 0.0009339067793239682, + "loss": 0.1044, + "num_input_tokens_seen": 65981088, + "step": 30525 + }, + { + "epoch": 4.980424143556281, + "grad_norm": 0.08179371058940887, + "learning_rate": 0.0009338714063883627, + "loss": 0.0622, + "num_input_tokens_seen": 65992608, + "step": 30530 + }, + { + "epoch": 4.981239804241436, + "grad_norm": 0.06659191846847534, + "learning_rate": 0.0009338360246598028, + "loss": 0.0275, + "num_input_tokens_seen": 66004416, + "step": 30535 + }, + { + "epoch": 4.9820554649265905, + "grad_norm": 0.18371257185935974, + "learning_rate": 0.0009338006341390053, + "loss": 0.3027, + "num_input_tokens_seen": 66014464, + "step": 30540 + }, + { + "epoch": 4.982871125611745, + "grad_norm": 0.20001915097236633, + "learning_rate": 0.0009337652348266879, + "loss": 0.0386, + "num_input_tokens_seen": 66024416, + "step": 30545 + }, + { + "epoch": 4.9836867862969, + "grad_norm": 0.21984423696994781, + "learning_rate": 0.0009337298267235675, + "loss": 0.1813, + "num_input_tokens_seen": 66035776, + "step": 30550 + }, + { + "epoch": 4.984502446982056, + "grad_norm": 0.015670161694288254, + "learning_rate": 0.0009336944098303621, + "loss": 0.0501, + "num_input_tokens_seen": 66046464, + "step": 30555 + }, + { + "epoch": 4.985318107667211, + "grad_norm": 0.0296319667249918, + "learning_rate": 0.0009336589841477893, + "loss": 0.0785, + "num_input_tokens_seen": 66055680, + "step": 30560 + }, + { + "epoch": 4.986133768352365, + "grad_norm": 0.09039829671382904, + "learning_rate": 0.0009336235496765669, + "loss": 0.1786, + "num_input_tokens_seen": 66066944, + "step": 30565 + }, + { + "epoch": 4.98694942903752, + "grad_norm": 0.05969356372952461, + "learning_rate": 0.0009335881064174134, + "loss": 0.0984, + "num_input_tokens_seen": 66077152, + "step": 30570 + }, + { + "epoch": 4.987765089722675, + "grad_norm": 0.01346441637724638, + "learning_rate": 0.0009335526543710466, + "loss": 0.367, + "num_input_tokens_seen": 66087712, + "step": 30575 + }, + { + "epoch": 4.988580750407831, + "grad_norm": 0.08466268330812454, + "learning_rate": 0.0009335171935381854, + "loss": 0.11, + "num_input_tokens_seen": 66098336, + "step": 30580 + }, + { + "epoch": 4.989396411092986, + "grad_norm": 0.07348710298538208, + "learning_rate": 0.0009334817239195483, + "loss": 0.0621, + "num_input_tokens_seen": 66109184, + "step": 30585 + }, + { + "epoch": 4.99021207177814, + "grad_norm": 0.02474355883896351, + "learning_rate": 0.0009334462455158543, + "loss": 0.0252, + "num_input_tokens_seen": 66120160, + "step": 30590 + }, + { + "epoch": 4.991027732463295, + "grad_norm": 0.02579125203192234, + "learning_rate": 0.0009334107583278222, + "loss": 0.0987, + "num_input_tokens_seen": 66131552, + "step": 30595 + }, + { + "epoch": 4.99184339314845, + "grad_norm": 0.09439677000045776, + "learning_rate": 0.0009333752623561711, + "loss": 0.0677, + "num_input_tokens_seen": 66142624, + "step": 30600 + }, + { + "epoch": 4.992659053833605, + "grad_norm": 0.0035137098748236895, + "learning_rate": 0.0009333397576016207, + "loss": 0.0482, + "num_input_tokens_seen": 66153568, + "step": 30605 + }, + { + "epoch": 4.993474714518761, + "grad_norm": 0.07614894211292267, + "learning_rate": 0.0009333042440648903, + "loss": 0.0661, + "num_input_tokens_seen": 66163840, + "step": 30610 + }, + { + "epoch": 4.994290375203915, + "grad_norm": 0.1604684740304947, + "learning_rate": 0.0009332687217466997, + "loss": 0.216, + "num_input_tokens_seen": 66173568, + "step": 30615 + }, + { + "epoch": 4.99510603588907, + "grad_norm": 0.005750894080847502, + "learning_rate": 0.000933233190647769, + "loss": 0.1232, + "num_input_tokens_seen": 66183136, + "step": 30620 + }, + { + "epoch": 4.995921696574225, + "grad_norm": 0.01765310950577259, + "learning_rate": 0.0009331976507688178, + "loss": 0.0913, + "num_input_tokens_seen": 66194560, + "step": 30625 + }, + { + "epoch": 4.99673735725938, + "grad_norm": 0.17110048234462738, + "learning_rate": 0.0009331621021105668, + "loss": 0.1888, + "num_input_tokens_seen": 66205440, + "step": 30630 + }, + { + "epoch": 4.997553017944535, + "grad_norm": 0.07099224627017975, + "learning_rate": 0.0009331265446737364, + "loss": 0.195, + "num_input_tokens_seen": 66215616, + "step": 30635 + }, + { + "epoch": 4.99836867862969, + "grad_norm": 0.052238188683986664, + "learning_rate": 0.0009330909784590469, + "loss": 0.0538, + "num_input_tokens_seen": 66226464, + "step": 30640 + }, + { + "epoch": 4.999184339314845, + "grad_norm": 0.05132593587040901, + "learning_rate": 0.0009330554034672194, + "loss": 0.0958, + "num_input_tokens_seen": 66238144, + "step": 30645 + }, + { + "epoch": 5.0, + "grad_norm": 0.16655795276165009, + "learning_rate": 0.0009330198196989749, + "loss": 0.1424, + "num_input_tokens_seen": 66248576, + "step": 30650 + }, + { + "epoch": 5.0, + "eval_loss": 0.12099920213222504, + "eval_runtime": 103.3952, + "eval_samples_per_second": 26.355, + "eval_steps_per_second": 6.596, + "num_input_tokens_seen": 66248576, + "step": 30650 + }, + { + "epoch": 5.000815660685155, + "grad_norm": 0.19292932748794556, + "learning_rate": 0.0009329842271550342, + "loss": 0.2331, + "num_input_tokens_seen": 66259904, + "step": 30655 + }, + { + "epoch": 5.00163132137031, + "grad_norm": 0.039342980831861496, + "learning_rate": 0.0009329486258361191, + "loss": 0.0778, + "num_input_tokens_seen": 66271744, + "step": 30660 + }, + { + "epoch": 5.002446982055465, + "grad_norm": 0.11099471151828766, + "learning_rate": 0.0009329130157429507, + "loss": 0.0978, + "num_input_tokens_seen": 66283072, + "step": 30665 + }, + { + "epoch": 5.00326264274062, + "grad_norm": 0.01309216022491455, + "learning_rate": 0.000932877396876251, + "loss": 0.0518, + "num_input_tokens_seen": 66294784, + "step": 30670 + }, + { + "epoch": 5.004078303425775, + "grad_norm": 0.024124326184391975, + "learning_rate": 0.0009328417692367415, + "loss": 0.1393, + "num_input_tokens_seen": 66305728, + "step": 30675 + }, + { + "epoch": 5.00489396411093, + "grad_norm": 0.07616019248962402, + "learning_rate": 0.0009328061328251445, + "loss": 0.0466, + "num_input_tokens_seen": 66316000, + "step": 30680 + }, + { + "epoch": 5.005709624796085, + "grad_norm": 0.07263064384460449, + "learning_rate": 0.0009327704876421824, + "loss": 0.1198, + "num_input_tokens_seen": 66327488, + "step": 30685 + }, + { + "epoch": 5.006525285481239, + "grad_norm": 0.010890113189816475, + "learning_rate": 0.000932734833688577, + "loss": 0.0965, + "num_input_tokens_seen": 66339200, + "step": 30690 + }, + { + "epoch": 5.007340946166395, + "grad_norm": 0.034294452518224716, + "learning_rate": 0.0009326991709650514, + "loss": 0.0183, + "num_input_tokens_seen": 66350944, + "step": 30695 + }, + { + "epoch": 5.00815660685155, + "grad_norm": 0.1619931310415268, + "learning_rate": 0.0009326634994723282, + "loss": 0.0652, + "num_input_tokens_seen": 66362272, + "step": 30700 + }, + { + "epoch": 5.008972267536705, + "grad_norm": 0.04089265316724777, + "learning_rate": 0.0009326278192111304, + "loss": 0.0523, + "num_input_tokens_seen": 66371904, + "step": 30705 + }, + { + "epoch": 5.00978792822186, + "grad_norm": 0.037255510687828064, + "learning_rate": 0.0009325921301821809, + "loss": 0.0339, + "num_input_tokens_seen": 66383360, + "step": 30710 + }, + { + "epoch": 5.010603588907014, + "grad_norm": 0.023775247856974602, + "learning_rate": 0.000932556432386203, + "loss": 0.0252, + "num_input_tokens_seen": 66394112, + "step": 30715 + }, + { + "epoch": 5.011419249592169, + "grad_norm": 0.004952778108417988, + "learning_rate": 0.0009325207258239204, + "loss": 0.0867, + "num_input_tokens_seen": 66404992, + "step": 30720 + }, + { + "epoch": 5.012234910277325, + "grad_norm": 0.20834074914455414, + "learning_rate": 0.0009324850104960566, + "loss": 0.0936, + "num_input_tokens_seen": 66415680, + "step": 30725 + }, + { + "epoch": 5.01305057096248, + "grad_norm": 0.20381319522857666, + "learning_rate": 0.0009324492864033354, + "loss": 0.1504, + "num_input_tokens_seen": 66427072, + "step": 30730 + }, + { + "epoch": 5.013866231647635, + "grad_norm": 0.02317776158452034, + "learning_rate": 0.0009324135535464808, + "loss": 0.0571, + "num_input_tokens_seen": 66438720, + "step": 30735 + }, + { + "epoch": 5.014681892332789, + "grad_norm": 0.162574902176857, + "learning_rate": 0.000932377811926217, + "loss": 0.0935, + "num_input_tokens_seen": 66449216, + "step": 30740 + }, + { + "epoch": 5.015497553017944, + "grad_norm": 0.025656161829829216, + "learning_rate": 0.0009323420615432683, + "loss": 0.119, + "num_input_tokens_seen": 66460064, + "step": 30745 + }, + { + "epoch": 5.0163132137031, + "grad_norm": 0.34151774644851685, + "learning_rate": 0.0009323063023983593, + "loss": 0.1343, + "num_input_tokens_seen": 66470656, + "step": 30750 + }, + { + "epoch": 5.017128874388255, + "grad_norm": 0.004854666069149971, + "learning_rate": 0.0009322705344922146, + "loss": 0.0562, + "num_input_tokens_seen": 66480896, + "step": 30755 + }, + { + "epoch": 5.0179445350734095, + "grad_norm": 0.03299418091773987, + "learning_rate": 0.0009322347578255592, + "loss": 0.151, + "num_input_tokens_seen": 66491040, + "step": 30760 + }, + { + "epoch": 5.018760195758564, + "grad_norm": 0.07489554584026337, + "learning_rate": 0.0009321989723991181, + "loss": 0.1743, + "num_input_tokens_seen": 66500640, + "step": 30765 + }, + { + "epoch": 5.019575856443719, + "grad_norm": 0.10389326512813568, + "learning_rate": 0.0009321631782136166, + "loss": 0.1137, + "num_input_tokens_seen": 66509376, + "step": 30770 + }, + { + "epoch": 5.020391517128874, + "grad_norm": 0.01117030717432499, + "learning_rate": 0.0009321273752697798, + "loss": 0.1189, + "num_input_tokens_seen": 66519552, + "step": 30775 + }, + { + "epoch": 5.02120717781403, + "grad_norm": 0.007174614816904068, + "learning_rate": 0.0009320915635683338, + "loss": 0.0188, + "num_input_tokens_seen": 66530432, + "step": 30780 + }, + { + "epoch": 5.0220228384991845, + "grad_norm": 0.1178077757358551, + "learning_rate": 0.0009320557431100041, + "loss": 0.0483, + "num_input_tokens_seen": 66543296, + "step": 30785 + }, + { + "epoch": 5.022838499184339, + "grad_norm": 0.10831668972969055, + "learning_rate": 0.0009320199138955165, + "loss": 0.1303, + "num_input_tokens_seen": 66554304, + "step": 30790 + }, + { + "epoch": 5.023654159869494, + "grad_norm": 0.019279837608337402, + "learning_rate": 0.0009319840759255976, + "loss": 0.0603, + "num_input_tokens_seen": 66564800, + "step": 30795 + }, + { + "epoch": 5.024469820554649, + "grad_norm": 0.03408272936940193, + "learning_rate": 0.0009319482292009731, + "loss": 0.0475, + "num_input_tokens_seen": 66576800, + "step": 30800 + }, + { + "epoch": 5.025285481239805, + "grad_norm": 0.19642093777656555, + "learning_rate": 0.0009319123737223698, + "loss": 0.07, + "num_input_tokens_seen": 66587264, + "step": 30805 + }, + { + "epoch": 5.0261011419249595, + "grad_norm": 0.2684004008769989, + "learning_rate": 0.0009318765094905144, + "loss": 0.0828, + "num_input_tokens_seen": 66598848, + "step": 30810 + }, + { + "epoch": 5.026916802610114, + "grad_norm": 0.2714649736881256, + "learning_rate": 0.0009318406365061336, + "loss": 0.3286, + "num_input_tokens_seen": 66610560, + "step": 30815 + }, + { + "epoch": 5.027732463295269, + "grad_norm": 0.04261276498436928, + "learning_rate": 0.0009318047547699546, + "loss": 0.0427, + "num_input_tokens_seen": 66621248, + "step": 30820 + }, + { + "epoch": 5.028548123980424, + "grad_norm": 0.281934916973114, + "learning_rate": 0.0009317688642827044, + "loss": 0.1371, + "num_input_tokens_seen": 66632384, + "step": 30825 + }, + { + "epoch": 5.029363784665579, + "grad_norm": 0.045836810022592545, + "learning_rate": 0.0009317329650451103, + "loss": 0.0374, + "num_input_tokens_seen": 66643648, + "step": 30830 + }, + { + "epoch": 5.0301794453507345, + "grad_norm": 0.18936778604984283, + "learning_rate": 0.0009316970570579002, + "loss": 0.1372, + "num_input_tokens_seen": 66654528, + "step": 30835 + }, + { + "epoch": 5.030995106035889, + "grad_norm": 0.008387645706534386, + "learning_rate": 0.0009316611403218013, + "loss": 0.1091, + "num_input_tokens_seen": 66665536, + "step": 30840 + }, + { + "epoch": 5.031810766721044, + "grad_norm": 0.1754762977361679, + "learning_rate": 0.000931625214837542, + "loss": 0.1136, + "num_input_tokens_seen": 66676288, + "step": 30845 + }, + { + "epoch": 5.032626427406199, + "grad_norm": 0.22303487360477448, + "learning_rate": 0.0009315892806058501, + "loss": 0.1583, + "num_input_tokens_seen": 66687648, + "step": 30850 + }, + { + "epoch": 5.033442088091354, + "grad_norm": 0.11185749620199203, + "learning_rate": 0.0009315533376274541, + "loss": 0.1291, + "num_input_tokens_seen": 66698112, + "step": 30855 + }, + { + "epoch": 5.034257748776509, + "grad_norm": 0.014345620758831501, + "learning_rate": 0.0009315173859030821, + "loss": 0.0844, + "num_input_tokens_seen": 66709440, + "step": 30860 + }, + { + "epoch": 5.035073409461664, + "grad_norm": 0.02847551926970482, + "learning_rate": 0.0009314814254334627, + "loss": 0.1353, + "num_input_tokens_seen": 66720768, + "step": 30865 + }, + { + "epoch": 5.035889070146819, + "grad_norm": 0.09638968110084534, + "learning_rate": 0.000931445456219325, + "loss": 0.0769, + "num_input_tokens_seen": 66732160, + "step": 30870 + }, + { + "epoch": 5.036704730831974, + "grad_norm": 0.02057075873017311, + "learning_rate": 0.0009314094782613977, + "loss": 0.0517, + "num_input_tokens_seen": 66742368, + "step": 30875 + }, + { + "epoch": 5.037520391517129, + "grad_norm": 0.05229242146015167, + "learning_rate": 0.0009313734915604103, + "loss": 0.1478, + "num_input_tokens_seen": 66752832, + "step": 30880 + }, + { + "epoch": 5.0383360522022835, + "grad_norm": 0.015186270698904991, + "learning_rate": 0.0009313374961170917, + "loss": 0.062, + "num_input_tokens_seen": 66764032, + "step": 30885 + }, + { + "epoch": 5.039151712887439, + "grad_norm": 0.0803709402680397, + "learning_rate": 0.0009313014919321715, + "loss": 0.1218, + "num_input_tokens_seen": 66774944, + "step": 30890 + }, + { + "epoch": 5.039967373572594, + "grad_norm": 0.0704301968216896, + "learning_rate": 0.0009312654790063795, + "loss": 0.1541, + "num_input_tokens_seen": 66786240, + "step": 30895 + }, + { + "epoch": 5.040783034257749, + "grad_norm": 0.0323120579123497, + "learning_rate": 0.0009312294573404454, + "loss": 0.0395, + "num_input_tokens_seen": 66797376, + "step": 30900 + }, + { + "epoch": 5.041598694942904, + "grad_norm": 0.17087383568286896, + "learning_rate": 0.0009311934269350993, + "loss": 0.0578, + "num_input_tokens_seen": 66807680, + "step": 30905 + }, + { + "epoch": 5.0424143556280585, + "grad_norm": 0.25635138154029846, + "learning_rate": 0.0009311573877910716, + "loss": 0.2143, + "num_input_tokens_seen": 66818336, + "step": 30910 + }, + { + "epoch": 5.043230016313213, + "grad_norm": 0.03489004820585251, + "learning_rate": 0.0009311213399090921, + "loss": 0.2087, + "num_input_tokens_seen": 66830240, + "step": 30915 + }, + { + "epoch": 5.044045676998369, + "grad_norm": 0.020329639315605164, + "learning_rate": 0.000931085283289892, + "loss": 0.091, + "num_input_tokens_seen": 66840000, + "step": 30920 + }, + { + "epoch": 5.044861337683524, + "grad_norm": 0.05295855551958084, + "learning_rate": 0.0009310492179342016, + "loss": 0.0732, + "num_input_tokens_seen": 66849792, + "step": 30925 + }, + { + "epoch": 5.045676998368679, + "grad_norm": 0.04636767506599426, + "learning_rate": 0.0009310131438427521, + "loss": 0.0234, + "num_input_tokens_seen": 66860384, + "step": 30930 + }, + { + "epoch": 5.0464926590538335, + "grad_norm": 0.0942985787987709, + "learning_rate": 0.0009309770610162744, + "loss": 0.1044, + "num_input_tokens_seen": 66870880, + "step": 30935 + }, + { + "epoch": 5.047308319738988, + "grad_norm": 0.009630602784454823, + "learning_rate": 0.0009309409694555, + "loss": 0.0774, + "num_input_tokens_seen": 66882144, + "step": 30940 + }, + { + "epoch": 5.048123980424143, + "grad_norm": 0.17799124121665955, + "learning_rate": 0.0009309048691611599, + "loss": 0.1953, + "num_input_tokens_seen": 66893280, + "step": 30945 + }, + { + "epoch": 5.048939641109299, + "grad_norm": 0.01871904544532299, + "learning_rate": 0.0009308687601339861, + "loss": 0.0307, + "num_input_tokens_seen": 66904224, + "step": 30950 + }, + { + "epoch": 5.049755301794454, + "grad_norm": 0.07792586088180542, + "learning_rate": 0.0009308326423747103, + "loss": 0.025, + "num_input_tokens_seen": 66915936, + "step": 30955 + }, + { + "epoch": 5.0505709624796085, + "grad_norm": 0.048931483179330826, + "learning_rate": 0.0009307965158840644, + "loss": 0.0606, + "num_input_tokens_seen": 66926432, + "step": 30960 + }, + { + "epoch": 5.051386623164763, + "grad_norm": 0.011593530885875225, + "learning_rate": 0.0009307603806627807, + "loss": 0.0806, + "num_input_tokens_seen": 66937984, + "step": 30965 + }, + { + "epoch": 5.052202283849918, + "grad_norm": 0.13193394243717194, + "learning_rate": 0.0009307242367115914, + "loss": 0.0623, + "num_input_tokens_seen": 66949312, + "step": 30970 + }, + { + "epoch": 5.053017944535074, + "grad_norm": 0.16529838740825653, + "learning_rate": 0.000930688084031229, + "loss": 0.1508, + "num_input_tokens_seen": 66960032, + "step": 30975 + }, + { + "epoch": 5.053833605220229, + "grad_norm": 0.0980365201830864, + "learning_rate": 0.0009306519226224262, + "loss": 0.1488, + "num_input_tokens_seen": 66970208, + "step": 30980 + }, + { + "epoch": 5.054649265905383, + "grad_norm": 0.1277938187122345, + "learning_rate": 0.0009306157524859158, + "loss": 0.1445, + "num_input_tokens_seen": 66980000, + "step": 30985 + }, + { + "epoch": 5.055464926590538, + "grad_norm": 0.21884754300117493, + "learning_rate": 0.000930579573622431, + "loss": 0.2126, + "num_input_tokens_seen": 66990400, + "step": 30990 + }, + { + "epoch": 5.056280587275693, + "grad_norm": 0.04518071934580803, + "learning_rate": 0.0009305433860327049, + "loss": 0.0731, + "num_input_tokens_seen": 67001184, + "step": 30995 + }, + { + "epoch": 5.057096247960848, + "grad_norm": 0.01968988962471485, + "learning_rate": 0.0009305071897174708, + "loss": 0.0856, + "num_input_tokens_seen": 67012640, + "step": 31000 + }, + { + "epoch": 5.057911908646004, + "grad_norm": 0.06685356050729752, + "learning_rate": 0.0009304709846774625, + "loss": 0.0884, + "num_input_tokens_seen": 67023936, + "step": 31005 + }, + { + "epoch": 5.058727569331158, + "grad_norm": 0.01798919029533863, + "learning_rate": 0.0009304347709134136, + "loss": 0.0427, + "num_input_tokens_seen": 67034912, + "step": 31010 + }, + { + "epoch": 5.059543230016313, + "grad_norm": 0.009535958990454674, + "learning_rate": 0.000930398548426058, + "loss": 0.0316, + "num_input_tokens_seen": 67045184, + "step": 31015 + }, + { + "epoch": 5.060358890701468, + "grad_norm": 0.030188433825969696, + "learning_rate": 0.0009303623172161298, + "loss": 0.1074, + "num_input_tokens_seen": 67055936, + "step": 31020 + }, + { + "epoch": 5.061174551386623, + "grad_norm": 0.06585537642240524, + "learning_rate": 0.0009303260772843632, + "loss": 0.0587, + "num_input_tokens_seen": 67066528, + "step": 31025 + }, + { + "epoch": 5.061990212071779, + "grad_norm": 0.04481403902173042, + "learning_rate": 0.0009302898286314929, + "loss": 0.1304, + "num_input_tokens_seen": 67077504, + "step": 31030 + }, + { + "epoch": 5.062805872756933, + "grad_norm": 0.05069199204444885, + "learning_rate": 0.0009302535712582532, + "loss": 0.0279, + "num_input_tokens_seen": 67088480, + "step": 31035 + }, + { + "epoch": 5.063621533442088, + "grad_norm": 0.1780654489994049, + "learning_rate": 0.0009302173051653792, + "loss": 0.0536, + "num_input_tokens_seen": 67099936, + "step": 31040 + }, + { + "epoch": 5.064437194127243, + "grad_norm": 0.22669237852096558, + "learning_rate": 0.0009301810303536056, + "loss": 0.2969, + "num_input_tokens_seen": 67109760, + "step": 31045 + }, + { + "epoch": 5.065252854812398, + "grad_norm": 0.08419650793075562, + "learning_rate": 0.0009301447468236678, + "loss": 0.0358, + "num_input_tokens_seen": 67119840, + "step": 31050 + }, + { + "epoch": 5.066068515497553, + "grad_norm": 0.45807531476020813, + "learning_rate": 0.000930108454576301, + "loss": 0.1346, + "num_input_tokens_seen": 67130304, + "step": 31055 + }, + { + "epoch": 5.066884176182708, + "grad_norm": 0.06297741830348969, + "learning_rate": 0.0009300721536122408, + "loss": 0.0572, + "num_input_tokens_seen": 67140960, + "step": 31060 + }, + { + "epoch": 5.067699836867863, + "grad_norm": 0.2701318860054016, + "learning_rate": 0.0009300358439322228, + "loss": 0.0844, + "num_input_tokens_seen": 67152160, + "step": 31065 + }, + { + "epoch": 5.068515497553018, + "grad_norm": 0.011515563353896141, + "learning_rate": 0.0009299995255369828, + "loss": 0.1513, + "num_input_tokens_seen": 67163104, + "step": 31070 + }, + { + "epoch": 5.069331158238173, + "grad_norm": 0.0866737812757492, + "learning_rate": 0.000929963198427257, + "loss": 0.0299, + "num_input_tokens_seen": 67174208, + "step": 31075 + }, + { + "epoch": 5.070146818923328, + "grad_norm": 0.0019546225666999817, + "learning_rate": 0.0009299268626037815, + "loss": 0.1659, + "num_input_tokens_seen": 67186912, + "step": 31080 + }, + { + "epoch": 5.0709624796084825, + "grad_norm": 0.34600770473480225, + "learning_rate": 0.0009298905180672928, + "loss": 0.2018, + "num_input_tokens_seen": 67197952, + "step": 31085 + }, + { + "epoch": 5.071778140293638, + "grad_norm": 0.01651175133883953, + "learning_rate": 0.0009298541648185272, + "loss": 0.1235, + "num_input_tokens_seen": 67208832, + "step": 31090 + }, + { + "epoch": 5.072593800978793, + "grad_norm": 0.185794860124588, + "learning_rate": 0.0009298178028582218, + "loss": 0.0688, + "num_input_tokens_seen": 67219968, + "step": 31095 + }, + { + "epoch": 5.073409461663948, + "grad_norm": 0.26523634791374207, + "learning_rate": 0.0009297814321871133, + "loss": 0.1548, + "num_input_tokens_seen": 67231232, + "step": 31100 + }, + { + "epoch": 5.074225122349103, + "grad_norm": 0.32514598965644836, + "learning_rate": 0.0009297450528059389, + "loss": 0.1022, + "num_input_tokens_seen": 67242784, + "step": 31105 + }, + { + "epoch": 5.075040783034257, + "grad_norm": 0.12018303573131561, + "learning_rate": 0.0009297086647154358, + "loss": 0.0762, + "num_input_tokens_seen": 67253440, + "step": 31110 + }, + { + "epoch": 5.075856443719413, + "grad_norm": 0.040082309395074844, + "learning_rate": 0.0009296722679163417, + "loss": 0.0716, + "num_input_tokens_seen": 67263168, + "step": 31115 + }, + { + "epoch": 5.076672104404568, + "grad_norm": 0.019938020035624504, + "learning_rate": 0.0009296358624093937, + "loss": 0.0168, + "num_input_tokens_seen": 67273248, + "step": 31120 + }, + { + "epoch": 5.077487765089723, + "grad_norm": 0.19098550081253052, + "learning_rate": 0.00092959944819533, + "loss": 0.0936, + "num_input_tokens_seen": 67283712, + "step": 31125 + }, + { + "epoch": 5.078303425774878, + "grad_norm": 0.07549386471509933, + "learning_rate": 0.0009295630252748885, + "loss": 0.014, + "num_input_tokens_seen": 67293824, + "step": 31130 + }, + { + "epoch": 5.079119086460032, + "grad_norm": 0.25435495376586914, + "learning_rate": 0.0009295265936488076, + "loss": 0.1051, + "num_input_tokens_seen": 67303936, + "step": 31135 + }, + { + "epoch": 5.079934747145187, + "grad_norm": 0.054433248937129974, + "learning_rate": 0.0009294901533178251, + "loss": 0.0092, + "num_input_tokens_seen": 67315200, + "step": 31140 + }, + { + "epoch": 5.080750407830343, + "grad_norm": 0.14523616433143616, + "learning_rate": 0.0009294537042826798, + "loss": 0.061, + "num_input_tokens_seen": 67325952, + "step": 31145 + }, + { + "epoch": 5.081566068515498, + "grad_norm": 0.03600054606795311, + "learning_rate": 0.0009294172465441104, + "loss": 0.0518, + "num_input_tokens_seen": 67336640, + "step": 31150 + }, + { + "epoch": 5.082381729200653, + "grad_norm": 0.1872844696044922, + "learning_rate": 0.0009293807801028558, + "loss": 0.1058, + "num_input_tokens_seen": 67347776, + "step": 31155 + }, + { + "epoch": 5.083197389885807, + "grad_norm": 0.06703568249940872, + "learning_rate": 0.0009293443049596551, + "loss": 0.1516, + "num_input_tokens_seen": 67359264, + "step": 31160 + }, + { + "epoch": 5.084013050570962, + "grad_norm": 0.19908444583415985, + "learning_rate": 0.0009293078211152473, + "loss": 0.1284, + "num_input_tokens_seen": 67368960, + "step": 31165 + }, + { + "epoch": 5.084828711256117, + "grad_norm": 0.03559856116771698, + "learning_rate": 0.0009292713285703718, + "loss": 0.0476, + "num_input_tokens_seen": 67379520, + "step": 31170 + }, + { + "epoch": 5.085644371941273, + "grad_norm": 0.11854170262813568, + "learning_rate": 0.0009292348273257684, + "loss": 0.1697, + "num_input_tokens_seen": 67390272, + "step": 31175 + }, + { + "epoch": 5.0864600326264275, + "grad_norm": 0.11884764581918716, + "learning_rate": 0.0009291983173821765, + "loss": 0.1775, + "num_input_tokens_seen": 67401536, + "step": 31180 + }, + { + "epoch": 5.087275693311582, + "grad_norm": 0.056827448308467865, + "learning_rate": 0.0009291617987403364, + "loss": 0.0397, + "num_input_tokens_seen": 67411264, + "step": 31185 + }, + { + "epoch": 5.088091353996737, + "grad_norm": 0.0024486789479851723, + "learning_rate": 0.000929125271400988, + "loss": 0.0479, + "num_input_tokens_seen": 67422976, + "step": 31190 + }, + { + "epoch": 5.088907014681892, + "grad_norm": 0.14866988360881805, + "learning_rate": 0.0009290887353648716, + "loss": 0.0504, + "num_input_tokens_seen": 67434720, + "step": 31195 + }, + { + "epoch": 5.089722675367048, + "grad_norm": 0.02642189897596836, + "learning_rate": 0.0009290521906327276, + "loss": 0.0092, + "num_input_tokens_seen": 67446368, + "step": 31200 + }, + { + "epoch": 5.0905383360522025, + "grad_norm": 0.020417513325810432, + "learning_rate": 0.0009290156372052967, + "loss": 0.0516, + "num_input_tokens_seen": 67457664, + "step": 31205 + }, + { + "epoch": 5.091353996737357, + "grad_norm": 0.036934275180101395, + "learning_rate": 0.0009289790750833196, + "loss": 0.0764, + "num_input_tokens_seen": 67468416, + "step": 31210 + }, + { + "epoch": 5.092169657422512, + "grad_norm": 0.16267381608486176, + "learning_rate": 0.0009289425042675373, + "loss": 0.1494, + "num_input_tokens_seen": 67479328, + "step": 31215 + }, + { + "epoch": 5.092985318107667, + "grad_norm": 0.00929997954517603, + "learning_rate": 0.0009289059247586911, + "loss": 0.0177, + "num_input_tokens_seen": 67490144, + "step": 31220 + }, + { + "epoch": 5.093800978792822, + "grad_norm": 0.03489250689744949, + "learning_rate": 0.0009288693365575222, + "loss": 0.0243, + "num_input_tokens_seen": 67500384, + "step": 31225 + }, + { + "epoch": 5.0946166394779775, + "grad_norm": 0.1912592649459839, + "learning_rate": 0.0009288327396647722, + "loss": 0.0725, + "num_input_tokens_seen": 67511744, + "step": 31230 + }, + { + "epoch": 5.095432300163132, + "grad_norm": 0.40533021092414856, + "learning_rate": 0.0009287961340811826, + "loss": 0.2004, + "num_input_tokens_seen": 67522688, + "step": 31235 + }, + { + "epoch": 5.096247960848287, + "grad_norm": 0.2418777048587799, + "learning_rate": 0.0009287595198074955, + "loss": 0.2036, + "num_input_tokens_seen": 67533984, + "step": 31240 + }, + { + "epoch": 5.097063621533442, + "grad_norm": 0.17491726577281952, + "learning_rate": 0.0009287228968444527, + "loss": 0.2711, + "num_input_tokens_seen": 67545952, + "step": 31245 + }, + { + "epoch": 5.097879282218597, + "grad_norm": 0.00852019339799881, + "learning_rate": 0.0009286862651927966, + "loss": 0.0627, + "num_input_tokens_seen": 67556928, + "step": 31250 + }, + { + "epoch": 5.0986949429037525, + "grad_norm": 0.10353199392557144, + "learning_rate": 0.0009286496248532695, + "loss": 0.203, + "num_input_tokens_seen": 67567840, + "step": 31255 + }, + { + "epoch": 5.099510603588907, + "grad_norm": 0.006339102052152157, + "learning_rate": 0.000928612975826614, + "loss": 0.0513, + "num_input_tokens_seen": 67578432, + "step": 31260 + }, + { + "epoch": 5.100326264274062, + "grad_norm": 0.20576722919940948, + "learning_rate": 0.0009285763181135727, + "loss": 0.0726, + "num_input_tokens_seen": 67588992, + "step": 31265 + }, + { + "epoch": 5.101141924959217, + "grad_norm": 0.10356633365154266, + "learning_rate": 0.0009285396517148888, + "loss": 0.1517, + "num_input_tokens_seen": 67599200, + "step": 31270 + }, + { + "epoch": 5.101957585644372, + "grad_norm": 0.2491035759449005, + "learning_rate": 0.000928502976631305, + "loss": 0.1456, + "num_input_tokens_seen": 67610016, + "step": 31275 + }, + { + "epoch": 5.102773246329527, + "grad_norm": 0.0775388851761818, + "learning_rate": 0.0009284662928635649, + "loss": 0.073, + "num_input_tokens_seen": 67621568, + "step": 31280 + }, + { + "epoch": 5.103588907014682, + "grad_norm": 0.03371018171310425, + "learning_rate": 0.0009284296004124118, + "loss": 0.039, + "num_input_tokens_seen": 67631712, + "step": 31285 + }, + { + "epoch": 5.104404567699837, + "grad_norm": 0.011440278962254524, + "learning_rate": 0.0009283928992785894, + "loss": 0.0177, + "num_input_tokens_seen": 67643552, + "step": 31290 + }, + { + "epoch": 5.105220228384992, + "grad_norm": 0.004816057626157999, + "learning_rate": 0.0009283561894628414, + "loss": 0.055, + "num_input_tokens_seen": 67653920, + "step": 31295 + }, + { + "epoch": 5.106035889070147, + "grad_norm": 0.09972722083330154, + "learning_rate": 0.0009283194709659117, + "loss": 0.1086, + "num_input_tokens_seen": 67665760, + "step": 31300 + }, + { + "epoch": 5.1068515497553015, + "grad_norm": 0.011177991516888142, + "learning_rate": 0.0009282827437885449, + "loss": 0.0199, + "num_input_tokens_seen": 67676320, + "step": 31305 + }, + { + "epoch": 5.107667210440456, + "grad_norm": 0.010363086126744747, + "learning_rate": 0.0009282460079314848, + "loss": 0.0367, + "num_input_tokens_seen": 67687008, + "step": 31310 + }, + { + "epoch": 5.108482871125612, + "grad_norm": 0.07990599423646927, + "learning_rate": 0.0009282092633954759, + "loss": 0.1999, + "num_input_tokens_seen": 67698112, + "step": 31315 + }, + { + "epoch": 5.109298531810767, + "grad_norm": 0.026262901723384857, + "learning_rate": 0.0009281725101812632, + "loss": 0.086, + "num_input_tokens_seen": 67707552, + "step": 31320 + }, + { + "epoch": 5.110114192495922, + "grad_norm": 0.1390565186738968, + "learning_rate": 0.0009281357482895914, + "loss": 0.0503, + "num_input_tokens_seen": 67718016, + "step": 31325 + }, + { + "epoch": 5.1109298531810765, + "grad_norm": 0.18929249048233032, + "learning_rate": 0.0009280989777212055, + "loss": 0.0975, + "num_input_tokens_seen": 67729056, + "step": 31330 + }, + { + "epoch": 5.111745513866231, + "grad_norm": 0.015308565460145473, + "learning_rate": 0.0009280621984768507, + "loss": 0.1097, + "num_input_tokens_seen": 67741024, + "step": 31335 + }, + { + "epoch": 5.112561174551387, + "grad_norm": 0.04618688300251961, + "learning_rate": 0.0009280254105572725, + "loss": 0.1202, + "num_input_tokens_seen": 67751296, + "step": 31340 + }, + { + "epoch": 5.113376835236542, + "grad_norm": 0.05035729706287384, + "learning_rate": 0.0009279886139632163, + "loss": 0.1071, + "num_input_tokens_seen": 67761536, + "step": 31345 + }, + { + "epoch": 5.114192495921697, + "grad_norm": 0.0054813530296087265, + "learning_rate": 0.000927951808695428, + "loss": 0.0664, + "num_input_tokens_seen": 67771968, + "step": 31350 + }, + { + "epoch": 5.1150081566068515, + "grad_norm": 0.005418519489467144, + "learning_rate": 0.0009279149947546534, + "loss": 0.1719, + "num_input_tokens_seen": 67781376, + "step": 31355 + }, + { + "epoch": 5.115823817292006, + "grad_norm": 0.00409423653036356, + "learning_rate": 0.0009278781721416385, + "loss": 0.0805, + "num_input_tokens_seen": 67793472, + "step": 31360 + }, + { + "epoch": 5.116639477977161, + "grad_norm": 0.03038191795349121, + "learning_rate": 0.0009278413408571295, + "loss": 0.1029, + "num_input_tokens_seen": 67804256, + "step": 31365 + }, + { + "epoch": 5.117455138662317, + "grad_norm": 0.028354860842227936, + "learning_rate": 0.0009278045009018733, + "loss": 0.027, + "num_input_tokens_seen": 67814688, + "step": 31370 + }, + { + "epoch": 5.118270799347472, + "grad_norm": 0.007617499213665724, + "learning_rate": 0.000927767652276616, + "loss": 0.0202, + "num_input_tokens_seen": 67824672, + "step": 31375 + }, + { + "epoch": 5.1190864600326265, + "grad_norm": 0.005120754241943359, + "learning_rate": 0.0009277307949821045, + "loss": 0.1537, + "num_input_tokens_seen": 67834720, + "step": 31380 + }, + { + "epoch": 5.119902120717781, + "grad_norm": 0.021448107436299324, + "learning_rate": 0.000927693929019086, + "loss": 0.0241, + "num_input_tokens_seen": 67846624, + "step": 31385 + }, + { + "epoch": 5.120717781402936, + "grad_norm": 0.1250106692314148, + "learning_rate": 0.0009276570543883074, + "loss": 0.047, + "num_input_tokens_seen": 67855296, + "step": 31390 + }, + { + "epoch": 5.121533442088092, + "grad_norm": 0.12414438277482986, + "learning_rate": 0.000927620171090516, + "loss": 0.0678, + "num_input_tokens_seen": 67866336, + "step": 31395 + }, + { + "epoch": 5.122349102773247, + "grad_norm": 0.011611179448664188, + "learning_rate": 0.0009275832791264593, + "loss": 0.0664, + "num_input_tokens_seen": 67877664, + "step": 31400 + }, + { + "epoch": 5.123164763458401, + "grad_norm": 0.07134946435689926, + "learning_rate": 0.0009275463784968852, + "loss": 0.0711, + "num_input_tokens_seen": 67888576, + "step": 31405 + }, + { + "epoch": 5.123980424143556, + "grad_norm": 0.06697040051221848, + "learning_rate": 0.0009275094692025413, + "loss": 0.0834, + "num_input_tokens_seen": 67898880, + "step": 31410 + }, + { + "epoch": 5.124796084828711, + "grad_norm": 0.030282270163297653, + "learning_rate": 0.0009274725512441757, + "loss": 0.1136, + "num_input_tokens_seen": 67908480, + "step": 31415 + }, + { + "epoch": 5.125611745513866, + "grad_norm": 0.010222107172012329, + "learning_rate": 0.0009274356246225364, + "loss": 0.1331, + "num_input_tokens_seen": 67918688, + "step": 31420 + }, + { + "epoch": 5.126427406199022, + "grad_norm": 0.11597134917974472, + "learning_rate": 0.0009273986893383722, + "loss": 0.1195, + "num_input_tokens_seen": 67929504, + "step": 31425 + }, + { + "epoch": 5.127243066884176, + "grad_norm": 0.004997505806386471, + "learning_rate": 0.000927361745392431, + "loss": 0.0747, + "num_input_tokens_seen": 67940192, + "step": 31430 + }, + { + "epoch": 5.128058727569331, + "grad_norm": 0.10774416476488113, + "learning_rate": 0.0009273247927854622, + "loss": 0.0965, + "num_input_tokens_seen": 67951072, + "step": 31435 + }, + { + "epoch": 5.128874388254486, + "grad_norm": 0.1743687093257904, + "learning_rate": 0.0009272878315182141, + "loss": 0.0475, + "num_input_tokens_seen": 67961280, + "step": 31440 + }, + { + "epoch": 5.129690048939641, + "grad_norm": 0.23096045851707458, + "learning_rate": 0.0009272508615914363, + "loss": 0.123, + "num_input_tokens_seen": 67972224, + "step": 31445 + }, + { + "epoch": 5.130505709624796, + "grad_norm": 0.37662026286125183, + "learning_rate": 0.0009272138830058776, + "loss": 0.2525, + "num_input_tokens_seen": 67984256, + "step": 31450 + }, + { + "epoch": 5.131321370309951, + "grad_norm": 0.05308877304196358, + "learning_rate": 0.0009271768957622877, + "loss": 0.0669, + "num_input_tokens_seen": 67994144, + "step": 31455 + }, + { + "epoch": 5.132137030995106, + "grad_norm": 0.17500479519367218, + "learning_rate": 0.0009271398998614162, + "loss": 0.1912, + "num_input_tokens_seen": 68004896, + "step": 31460 + }, + { + "epoch": 5.132952691680261, + "grad_norm": 0.01674867980182171, + "learning_rate": 0.0009271028953040126, + "loss": 0.1249, + "num_input_tokens_seen": 68015456, + "step": 31465 + }, + { + "epoch": 5.133768352365416, + "grad_norm": 0.3295539319515228, + "learning_rate": 0.0009270658820908271, + "loss": 0.0809, + "num_input_tokens_seen": 68026880, + "step": 31470 + }, + { + "epoch": 5.134584013050571, + "grad_norm": 0.1620461344718933, + "learning_rate": 0.0009270288602226096, + "loss": 0.307, + "num_input_tokens_seen": 68036672, + "step": 31475 + }, + { + "epoch": 5.135399673735726, + "grad_norm": 0.0587170347571373, + "learning_rate": 0.0009269918297001106, + "loss": 0.059, + "num_input_tokens_seen": 68047040, + "step": 31480 + }, + { + "epoch": 5.136215334420881, + "grad_norm": 0.3170589804649353, + "learning_rate": 0.0009269547905240805, + "loss": 0.1165, + "num_input_tokens_seen": 68058112, + "step": 31485 + }, + { + "epoch": 5.137030995106036, + "grad_norm": 0.034953050315380096, + "learning_rate": 0.00092691774269527, + "loss": 0.0355, + "num_input_tokens_seen": 68068384, + "step": 31490 + }, + { + "epoch": 5.137846655791191, + "grad_norm": 0.053561653941869736, + "learning_rate": 0.0009268806862144298, + "loss": 0.0943, + "num_input_tokens_seen": 68078848, + "step": 31495 + }, + { + "epoch": 5.138662316476346, + "grad_norm": 0.011080753058195114, + "learning_rate": 0.0009268436210823109, + "loss": 0.0651, + "num_input_tokens_seen": 68090272, + "step": 31500 + }, + { + "epoch": 5.1394779771615005, + "grad_norm": 0.19692468643188477, + "learning_rate": 0.0009268065472996645, + "loss": 0.1488, + "num_input_tokens_seen": 68101984, + "step": 31505 + }, + { + "epoch": 5.140293637846656, + "grad_norm": 0.200147807598114, + "learning_rate": 0.0009267694648672423, + "loss": 0.0512, + "num_input_tokens_seen": 68113696, + "step": 31510 + }, + { + "epoch": 5.141109298531811, + "grad_norm": 0.12792713940143585, + "learning_rate": 0.0009267323737857952, + "loss": 0.0741, + "num_input_tokens_seen": 68125120, + "step": 31515 + }, + { + "epoch": 5.141924959216966, + "grad_norm": 0.036151349544525146, + "learning_rate": 0.0009266952740560752, + "loss": 0.0511, + "num_input_tokens_seen": 68136128, + "step": 31520 + }, + { + "epoch": 5.142740619902121, + "grad_norm": 0.006894730031490326, + "learning_rate": 0.0009266581656788342, + "loss": 0.071, + "num_input_tokens_seen": 68147808, + "step": 31525 + }, + { + "epoch": 5.143556280587275, + "grad_norm": 0.02515142224729061, + "learning_rate": 0.0009266210486548243, + "loss": 0.1119, + "num_input_tokens_seen": 68158208, + "step": 31530 + }, + { + "epoch": 5.14437194127243, + "grad_norm": 0.007471158169209957, + "learning_rate": 0.0009265839229847975, + "loss": 0.1079, + "num_input_tokens_seen": 68169088, + "step": 31535 + }, + { + "epoch": 5.145187601957586, + "grad_norm": 0.0272090844810009, + "learning_rate": 0.0009265467886695064, + "loss": 0.0137, + "num_input_tokens_seen": 68179200, + "step": 31540 + }, + { + "epoch": 5.146003262642741, + "grad_norm": 0.2392469048500061, + "learning_rate": 0.0009265096457097035, + "loss": 0.0796, + "num_input_tokens_seen": 68189120, + "step": 31545 + }, + { + "epoch": 5.146818923327896, + "grad_norm": 0.02147931605577469, + "learning_rate": 0.0009264724941061418, + "loss": 0.0274, + "num_input_tokens_seen": 68200672, + "step": 31550 + }, + { + "epoch": 5.14763458401305, + "grad_norm": 0.21360090374946594, + "learning_rate": 0.0009264353338595736, + "loss": 0.1693, + "num_input_tokens_seen": 68211040, + "step": 31555 + }, + { + "epoch": 5.148450244698205, + "grad_norm": 0.08548810333013535, + "learning_rate": 0.0009263981649707527, + "loss": 0.102, + "num_input_tokens_seen": 68221216, + "step": 31560 + }, + { + "epoch": 5.149265905383361, + "grad_norm": 0.01596478745341301, + "learning_rate": 0.0009263609874404319, + "loss": 0.1411, + "num_input_tokens_seen": 68231360, + "step": 31565 + }, + { + "epoch": 5.150081566068516, + "grad_norm": 0.051141407340765, + "learning_rate": 0.0009263238012693649, + "loss": 0.0586, + "num_input_tokens_seen": 68243456, + "step": 31570 + }, + { + "epoch": 5.150897226753671, + "grad_norm": 0.1186927780508995, + "learning_rate": 0.0009262866064583051, + "loss": 0.0407, + "num_input_tokens_seen": 68254560, + "step": 31575 + }, + { + "epoch": 5.151712887438825, + "grad_norm": 0.002311921678483486, + "learning_rate": 0.0009262494030080066, + "loss": 0.0422, + "num_input_tokens_seen": 68265792, + "step": 31580 + }, + { + "epoch": 5.15252854812398, + "grad_norm": 0.004859385080635548, + "learning_rate": 0.0009262121909192232, + "loss": 0.168, + "num_input_tokens_seen": 68276992, + "step": 31585 + }, + { + "epoch": 5.153344208809135, + "grad_norm": 0.09993654489517212, + "learning_rate": 0.0009261749701927089, + "loss": 0.2298, + "num_input_tokens_seen": 68286496, + "step": 31590 + }, + { + "epoch": 5.154159869494291, + "grad_norm": 0.03430628776550293, + "learning_rate": 0.0009261377408292183, + "loss": 0.1492, + "num_input_tokens_seen": 68297696, + "step": 31595 + }, + { + "epoch": 5.1549755301794455, + "grad_norm": 0.0053238943219184875, + "learning_rate": 0.0009261005028295058, + "loss": 0.0319, + "num_input_tokens_seen": 68309216, + "step": 31600 + }, + { + "epoch": 5.1557911908646, + "grad_norm": 0.2986612021923065, + "learning_rate": 0.000926063256194326, + "loss": 0.2046, + "num_input_tokens_seen": 68319584, + "step": 31605 + }, + { + "epoch": 5.156606851549755, + "grad_norm": 0.07867178320884705, + "learning_rate": 0.0009260260009244339, + "loss": 0.0633, + "num_input_tokens_seen": 68331104, + "step": 31610 + }, + { + "epoch": 5.15742251223491, + "grad_norm": 0.01586318016052246, + "learning_rate": 0.0009259887370205844, + "loss": 0.0906, + "num_input_tokens_seen": 68341568, + "step": 31615 + }, + { + "epoch": 5.158238172920065, + "grad_norm": 0.010150609537959099, + "learning_rate": 0.0009259514644835327, + "loss": 0.0247, + "num_input_tokens_seen": 68352480, + "step": 31620 + }, + { + "epoch": 5.1590538336052205, + "grad_norm": 0.13318832218647003, + "learning_rate": 0.0009259141833140343, + "loss": 0.1502, + "num_input_tokens_seen": 68362560, + "step": 31625 + }, + { + "epoch": 5.159869494290375, + "grad_norm": 0.22387240827083588, + "learning_rate": 0.0009258768935128445, + "loss": 0.1043, + "num_input_tokens_seen": 68372800, + "step": 31630 + }, + { + "epoch": 5.16068515497553, + "grad_norm": 0.30648529529571533, + "learning_rate": 0.0009258395950807194, + "loss": 0.1968, + "num_input_tokens_seen": 68383264, + "step": 31635 + }, + { + "epoch": 5.161500815660685, + "grad_norm": 0.004418856929987669, + "learning_rate": 0.0009258022880184145, + "loss": 0.0746, + "num_input_tokens_seen": 68394176, + "step": 31640 + }, + { + "epoch": 5.16231647634584, + "grad_norm": 0.2309313714504242, + "learning_rate": 0.0009257649723266863, + "loss": 0.1278, + "num_input_tokens_seen": 68406592, + "step": 31645 + }, + { + "epoch": 5.1631321370309955, + "grad_norm": 0.1833798736333847, + "learning_rate": 0.0009257276480062907, + "loss": 0.1194, + "num_input_tokens_seen": 68415936, + "step": 31650 + }, + { + "epoch": 5.16394779771615, + "grad_norm": 0.0035895612090826035, + "learning_rate": 0.0009256903150579842, + "loss": 0.176, + "num_input_tokens_seen": 68426880, + "step": 31655 + }, + { + "epoch": 5.164763458401305, + "grad_norm": 0.19863756000995636, + "learning_rate": 0.0009256529734825234, + "loss": 0.2212, + "num_input_tokens_seen": 68439936, + "step": 31660 + }, + { + "epoch": 5.16557911908646, + "grad_norm": 0.14670097827911377, + "learning_rate": 0.0009256156232806652, + "loss": 0.1125, + "num_input_tokens_seen": 68450784, + "step": 31665 + }, + { + "epoch": 5.166394779771615, + "grad_norm": 0.07167736440896988, + "learning_rate": 0.0009255782644531664, + "loss": 0.0307, + "num_input_tokens_seen": 68462592, + "step": 31670 + }, + { + "epoch": 5.16721044045677, + "grad_norm": 0.11502383649349213, + "learning_rate": 0.0009255408970007842, + "loss": 0.1548, + "num_input_tokens_seen": 68472928, + "step": 31675 + }, + { + "epoch": 5.168026101141925, + "grad_norm": 0.03686782345175743, + "learning_rate": 0.0009255035209242759, + "loss": 0.1334, + "num_input_tokens_seen": 68483360, + "step": 31680 + }, + { + "epoch": 5.16884176182708, + "grad_norm": 0.04977540299296379, + "learning_rate": 0.0009254661362243991, + "loss": 0.1431, + "num_input_tokens_seen": 68494336, + "step": 31685 + }, + { + "epoch": 5.169657422512235, + "grad_norm": 0.10123711824417114, + "learning_rate": 0.000925428742901911, + "loss": 0.0828, + "num_input_tokens_seen": 68506240, + "step": 31690 + }, + { + "epoch": 5.17047308319739, + "grad_norm": 0.01098128966987133, + "learning_rate": 0.0009253913409575698, + "loss": 0.06, + "num_input_tokens_seen": 68516736, + "step": 31695 + }, + { + "epoch": 5.171288743882545, + "grad_norm": 0.15740327537059784, + "learning_rate": 0.0009253539303921336, + "loss": 0.1266, + "num_input_tokens_seen": 68527680, + "step": 31700 + }, + { + "epoch": 5.1721044045677, + "grad_norm": 0.27190694212913513, + "learning_rate": 0.0009253165112063604, + "loss": 0.1337, + "num_input_tokens_seen": 68537056, + "step": 31705 + }, + { + "epoch": 5.172920065252855, + "grad_norm": 0.05805153027176857, + "learning_rate": 0.0009252790834010085, + "loss": 0.0541, + "num_input_tokens_seen": 68547296, + "step": 31710 + }, + { + "epoch": 5.17373572593801, + "grad_norm": 0.26238253712654114, + "learning_rate": 0.0009252416469768363, + "loss": 0.1146, + "num_input_tokens_seen": 68557344, + "step": 31715 + }, + { + "epoch": 5.174551386623165, + "grad_norm": 0.2675519585609436, + "learning_rate": 0.0009252042019346029, + "loss": 0.0867, + "num_input_tokens_seen": 68567968, + "step": 31720 + }, + { + "epoch": 5.1753670473083195, + "grad_norm": 0.30450505018234253, + "learning_rate": 0.0009251667482750669, + "loss": 0.1679, + "num_input_tokens_seen": 68578656, + "step": 31725 + }, + { + "epoch": 5.176182707993474, + "grad_norm": 0.0031907472293823957, + "learning_rate": 0.0009251292859989873, + "loss": 0.0446, + "num_input_tokens_seen": 68588608, + "step": 31730 + }, + { + "epoch": 5.17699836867863, + "grad_norm": 0.016649756580591202, + "learning_rate": 0.0009250918151071235, + "loss": 0.0295, + "num_input_tokens_seen": 68598624, + "step": 31735 + }, + { + "epoch": 5.177814029363785, + "grad_norm": 0.09811677038669586, + "learning_rate": 0.0009250543356002347, + "loss": 0.029, + "num_input_tokens_seen": 68609792, + "step": 31740 + }, + { + "epoch": 5.17862969004894, + "grad_norm": 0.033193688839673996, + "learning_rate": 0.0009250168474790806, + "loss": 0.0852, + "num_input_tokens_seen": 68620512, + "step": 31745 + }, + { + "epoch": 5.1794453507340945, + "grad_norm": 0.24452006816864014, + "learning_rate": 0.0009249793507444208, + "loss": 0.2061, + "num_input_tokens_seen": 68631392, + "step": 31750 + }, + { + "epoch": 5.180261011419249, + "grad_norm": 0.02791479602456093, + "learning_rate": 0.0009249418453970155, + "loss": 0.0707, + "num_input_tokens_seen": 68642944, + "step": 31755 + }, + { + "epoch": 5.181076672104404, + "grad_norm": 0.0830642580986023, + "learning_rate": 0.0009249043314376247, + "loss": 0.0382, + "num_input_tokens_seen": 68653984, + "step": 31760 + }, + { + "epoch": 5.18189233278956, + "grad_norm": 0.0186906885355711, + "learning_rate": 0.0009248668088670084, + "loss": 0.1107, + "num_input_tokens_seen": 68665344, + "step": 31765 + }, + { + "epoch": 5.182707993474715, + "grad_norm": 0.16254150867462158, + "learning_rate": 0.0009248292776859273, + "loss": 0.1132, + "num_input_tokens_seen": 68676128, + "step": 31770 + }, + { + "epoch": 5.1835236541598695, + "grad_norm": 0.04959748312830925, + "learning_rate": 0.0009247917378951419, + "loss": 0.0199, + "num_input_tokens_seen": 68686304, + "step": 31775 + }, + { + "epoch": 5.184339314845024, + "grad_norm": 0.2542068660259247, + "learning_rate": 0.0009247541894954132, + "loss": 0.0773, + "num_input_tokens_seen": 68697248, + "step": 31780 + }, + { + "epoch": 5.185154975530179, + "grad_norm": 0.07414700835943222, + "learning_rate": 0.0009247166324875018, + "loss": 0.0817, + "num_input_tokens_seen": 68707744, + "step": 31785 + }, + { + "epoch": 5.185970636215335, + "grad_norm": 0.21046984195709229, + "learning_rate": 0.0009246790668721692, + "loss": 0.1773, + "num_input_tokens_seen": 68718496, + "step": 31790 + }, + { + "epoch": 5.18678629690049, + "grad_norm": 0.11404412984848022, + "learning_rate": 0.0009246414926501766, + "loss": 0.2142, + "num_input_tokens_seen": 68729920, + "step": 31795 + }, + { + "epoch": 5.1876019575856445, + "grad_norm": 0.021141186356544495, + "learning_rate": 0.0009246039098222854, + "loss": 0.1502, + "num_input_tokens_seen": 68740320, + "step": 31800 + }, + { + "epoch": 5.188417618270799, + "grad_norm": 0.10404963791370392, + "learning_rate": 0.0009245663183892572, + "loss": 0.0983, + "num_input_tokens_seen": 68751648, + "step": 31805 + }, + { + "epoch": 5.189233278955954, + "grad_norm": 0.207743838429451, + "learning_rate": 0.0009245287183518541, + "loss": 0.0903, + "num_input_tokens_seen": 68762592, + "step": 31810 + }, + { + "epoch": 5.190048939641109, + "grad_norm": 0.010766721330583096, + "learning_rate": 0.0009244911097108379, + "loss": 0.2304, + "num_input_tokens_seen": 68772448, + "step": 31815 + }, + { + "epoch": 5.190864600326265, + "grad_norm": 0.05641620233654976, + "learning_rate": 0.000924453492466971, + "loss": 0.0593, + "num_input_tokens_seen": 68782944, + "step": 31820 + }, + { + "epoch": 5.191680261011419, + "grad_norm": 0.04736460745334625, + "learning_rate": 0.0009244158666210154, + "loss": 0.0461, + "num_input_tokens_seen": 68791456, + "step": 31825 + }, + { + "epoch": 5.192495921696574, + "grad_norm": 0.0764179453253746, + "learning_rate": 0.0009243782321737339, + "loss": 0.0716, + "num_input_tokens_seen": 68802368, + "step": 31830 + }, + { + "epoch": 5.193311582381729, + "grad_norm": 0.07003484666347504, + "learning_rate": 0.0009243405891258894, + "loss": 0.0991, + "num_input_tokens_seen": 68812768, + "step": 31835 + }, + { + "epoch": 5.194127243066884, + "grad_norm": 0.003841748461127281, + "learning_rate": 0.0009243029374782443, + "loss": 0.0276, + "num_input_tokens_seen": 68824416, + "step": 31840 + }, + { + "epoch": 5.19494290375204, + "grad_norm": 0.01023764256387949, + "learning_rate": 0.0009242652772315621, + "loss": 0.0097, + "num_input_tokens_seen": 68835776, + "step": 31845 + }, + { + "epoch": 5.195758564437194, + "grad_norm": 0.0831431970000267, + "learning_rate": 0.0009242276083866056, + "loss": 0.0938, + "num_input_tokens_seen": 68846912, + "step": 31850 + }, + { + "epoch": 5.196574225122349, + "grad_norm": 0.10236520320177078, + "learning_rate": 0.0009241899309441386, + "loss": 0.066, + "num_input_tokens_seen": 68857632, + "step": 31855 + }, + { + "epoch": 5.197389885807504, + "grad_norm": 0.013170513324439526, + "learning_rate": 0.0009241522449049245, + "loss": 0.1338, + "num_input_tokens_seen": 68868768, + "step": 31860 + }, + { + "epoch": 5.198205546492659, + "grad_norm": 0.08625346422195435, + "learning_rate": 0.000924114550269727, + "loss": 0.0162, + "num_input_tokens_seen": 68879936, + "step": 31865 + }, + { + "epoch": 5.199021207177814, + "grad_norm": 0.10950763523578644, + "learning_rate": 0.0009240768470393101, + "loss": 0.0711, + "num_input_tokens_seen": 68890528, + "step": 31870 + }, + { + "epoch": 5.199836867862969, + "grad_norm": 0.009500091895461082, + "learning_rate": 0.0009240391352144382, + "loss": 0.0679, + "num_input_tokens_seen": 68901632, + "step": 31875 + }, + { + "epoch": 5.200652528548124, + "grad_norm": 0.009571898728609085, + "learning_rate": 0.0009240014147958751, + "loss": 0.0322, + "num_input_tokens_seen": 68912000, + "step": 31880 + }, + { + "epoch": 5.201468189233279, + "grad_norm": 0.026301007717847824, + "learning_rate": 0.0009239636857843854, + "loss": 0.0299, + "num_input_tokens_seen": 68922144, + "step": 31885 + }, + { + "epoch": 5.202283849918434, + "grad_norm": 0.05546105280518532, + "learning_rate": 0.0009239259481807338, + "loss": 0.0472, + "num_input_tokens_seen": 68932928, + "step": 31890 + }, + { + "epoch": 5.203099510603589, + "grad_norm": 0.2150077074766159, + "learning_rate": 0.0009238882019856851, + "loss": 0.0741, + "num_input_tokens_seen": 68942560, + "step": 31895 + }, + { + "epoch": 5.2039151712887435, + "grad_norm": 0.007343418430536985, + "learning_rate": 0.0009238504472000042, + "loss": 0.1001, + "num_input_tokens_seen": 68953216, + "step": 31900 + }, + { + "epoch": 5.204730831973899, + "grad_norm": 0.058143239468336105, + "learning_rate": 0.0009238126838244562, + "loss": 0.0299, + "num_input_tokens_seen": 68964352, + "step": 31905 + }, + { + "epoch": 5.205546492659054, + "grad_norm": 0.14698320627212524, + "learning_rate": 0.0009237749118598067, + "loss": 0.1336, + "num_input_tokens_seen": 68975456, + "step": 31910 + }, + { + "epoch": 5.206362153344209, + "grad_norm": 0.2738533020019531, + "learning_rate": 0.000923737131306821, + "loss": 0.2138, + "num_input_tokens_seen": 68985856, + "step": 31915 + }, + { + "epoch": 5.207177814029364, + "grad_norm": 0.19384877383708954, + "learning_rate": 0.0009236993421662648, + "loss": 0.084, + "num_input_tokens_seen": 68997504, + "step": 31920 + }, + { + "epoch": 5.2079934747145185, + "grad_norm": 0.3091152608394623, + "learning_rate": 0.0009236615444389038, + "loss": 0.2369, + "num_input_tokens_seen": 69007008, + "step": 31925 + }, + { + "epoch": 5.208809135399674, + "grad_norm": 0.02713857591152191, + "learning_rate": 0.0009236237381255041, + "loss": 0.0268, + "num_input_tokens_seen": 69018304, + "step": 31930 + }, + { + "epoch": 5.209624796084829, + "grad_norm": 0.0630718544125557, + "learning_rate": 0.0009235859232268322, + "loss": 0.0813, + "num_input_tokens_seen": 69028160, + "step": 31935 + }, + { + "epoch": 5.210440456769984, + "grad_norm": 0.007515220437198877, + "learning_rate": 0.000923548099743654, + "loss": 0.0116, + "num_input_tokens_seen": 69038624, + "step": 31940 + }, + { + "epoch": 5.211256117455139, + "grad_norm": 0.22423508763313293, + "learning_rate": 0.0009235102676767364, + "loss": 0.2559, + "num_input_tokens_seen": 69049888, + "step": 31945 + }, + { + "epoch": 5.212071778140293, + "grad_norm": 0.006067072041332722, + "learning_rate": 0.0009234724270268459, + "loss": 0.0745, + "num_input_tokens_seen": 69061376, + "step": 31950 + }, + { + "epoch": 5.212887438825448, + "grad_norm": 0.024653153494000435, + "learning_rate": 0.0009234345777947493, + "loss": 0.0349, + "num_input_tokens_seen": 69072576, + "step": 31955 + }, + { + "epoch": 5.213703099510604, + "grad_norm": 0.03436309099197388, + "learning_rate": 0.0009233967199812141, + "loss": 0.1051, + "num_input_tokens_seen": 69082976, + "step": 31960 + }, + { + "epoch": 5.214518760195759, + "grad_norm": 0.32174986600875854, + "learning_rate": 0.000923358853587007, + "loss": 0.1442, + "num_input_tokens_seen": 69095072, + "step": 31965 + }, + { + "epoch": 5.215334420880914, + "grad_norm": 0.058664221316576004, + "learning_rate": 0.0009233209786128957, + "loss": 0.04, + "num_input_tokens_seen": 69107040, + "step": 31970 + }, + { + "epoch": 5.216150081566068, + "grad_norm": 0.19562427699565887, + "learning_rate": 0.0009232830950596479, + "loss": 0.3531, + "num_input_tokens_seen": 69118336, + "step": 31975 + }, + { + "epoch": 5.216965742251223, + "grad_norm": 0.03304930403828621, + "learning_rate": 0.0009232452029280312, + "loss": 0.1295, + "num_input_tokens_seen": 69130208, + "step": 31980 + }, + { + "epoch": 5.217781402936378, + "grad_norm": 0.09775389730930328, + "learning_rate": 0.0009232073022188135, + "loss": 0.03, + "num_input_tokens_seen": 69140512, + "step": 31985 + }, + { + "epoch": 5.218597063621534, + "grad_norm": 0.14110304415225983, + "learning_rate": 0.0009231693929327628, + "loss": 0.0802, + "num_input_tokens_seen": 69150816, + "step": 31990 + }, + { + "epoch": 5.219412724306689, + "grad_norm": 0.009008850902318954, + "learning_rate": 0.0009231314750706476, + "loss": 0.0448, + "num_input_tokens_seen": 69161472, + "step": 31995 + }, + { + "epoch": 5.220228384991843, + "grad_norm": 0.1816573441028595, + "learning_rate": 0.0009230935486332363, + "loss": 0.1657, + "num_input_tokens_seen": 69172064, + "step": 32000 + }, + { + "epoch": 5.221044045676998, + "grad_norm": 0.3012794554233551, + "learning_rate": 0.0009230556136212975, + "loss": 0.1398, + "num_input_tokens_seen": 69183264, + "step": 32005 + }, + { + "epoch": 5.221859706362153, + "grad_norm": 0.11017505824565887, + "learning_rate": 0.0009230176700356001, + "loss": 0.1744, + "num_input_tokens_seen": 69194528, + "step": 32010 + }, + { + "epoch": 5.222675367047309, + "grad_norm": 0.05585956946015358, + "learning_rate": 0.0009229797178769128, + "loss": 0.174, + "num_input_tokens_seen": 69205920, + "step": 32015 + }, + { + "epoch": 5.2234910277324635, + "grad_norm": 0.22336703538894653, + "learning_rate": 0.000922941757146005, + "loss": 0.0823, + "num_input_tokens_seen": 69217408, + "step": 32020 + }, + { + "epoch": 5.224306688417618, + "grad_norm": 0.010442069731652737, + "learning_rate": 0.000922903787843646, + "loss": 0.1295, + "num_input_tokens_seen": 69226560, + "step": 32025 + }, + { + "epoch": 5.225122349102773, + "grad_norm": 0.1273556351661682, + "learning_rate": 0.0009228658099706053, + "loss": 0.0722, + "num_input_tokens_seen": 69238112, + "step": 32030 + }, + { + "epoch": 5.225938009787928, + "grad_norm": 0.08883675187826157, + "learning_rate": 0.0009228278235276524, + "loss": 0.149, + "num_input_tokens_seen": 69249088, + "step": 32035 + }, + { + "epoch": 5.226753670473083, + "grad_norm": 0.016032544896006584, + "learning_rate": 0.0009227898285155574, + "loss": 0.0544, + "num_input_tokens_seen": 69258752, + "step": 32040 + }, + { + "epoch": 5.2275693311582385, + "grad_norm": 0.25991424918174744, + "learning_rate": 0.00092275182493509, + "loss": 0.1155, + "num_input_tokens_seen": 69270496, + "step": 32045 + }, + { + "epoch": 5.228384991843393, + "grad_norm": 0.16175585985183716, + "learning_rate": 0.0009227138127870208, + "loss": 0.1564, + "num_input_tokens_seen": 69280800, + "step": 32050 + }, + { + "epoch": 5.229200652528548, + "grad_norm": 0.03370179980993271, + "learning_rate": 0.0009226757920721196, + "loss": 0.181, + "num_input_tokens_seen": 69291776, + "step": 32055 + }, + { + "epoch": 5.230016313213703, + "grad_norm": 0.09633185714483261, + "learning_rate": 0.0009226377627911575, + "loss": 0.0787, + "num_input_tokens_seen": 69302144, + "step": 32060 + }, + { + "epoch": 5.230831973898858, + "grad_norm": 0.061300963163375854, + "learning_rate": 0.000922599724944905, + "loss": 0.0605, + "num_input_tokens_seen": 69313984, + "step": 32065 + }, + { + "epoch": 5.231647634584013, + "grad_norm": 0.11820586770772934, + "learning_rate": 0.0009225616785341329, + "loss": 0.0464, + "num_input_tokens_seen": 69324640, + "step": 32070 + }, + { + "epoch": 5.232463295269168, + "grad_norm": 0.07643051445484161, + "learning_rate": 0.0009225236235596123, + "loss": 0.0222, + "num_input_tokens_seen": 69335680, + "step": 32075 + }, + { + "epoch": 5.233278955954323, + "grad_norm": 0.016196228563785553, + "learning_rate": 0.0009224855600221145, + "loss": 0.0245, + "num_input_tokens_seen": 69347616, + "step": 32080 + }, + { + "epoch": 5.234094616639478, + "grad_norm": 0.1011374369263649, + "learning_rate": 0.0009224474879224109, + "loss": 0.1312, + "num_input_tokens_seen": 69358400, + "step": 32085 + }, + { + "epoch": 5.234910277324633, + "grad_norm": 0.029623612761497498, + "learning_rate": 0.000922409407261273, + "loss": 0.0477, + "num_input_tokens_seen": 69368224, + "step": 32090 + }, + { + "epoch": 5.235725938009788, + "grad_norm": 0.04129718616604805, + "learning_rate": 0.0009223713180394726, + "loss": 0.0925, + "num_input_tokens_seen": 69379200, + "step": 32095 + }, + { + "epoch": 5.236541598694943, + "grad_norm": 0.019016016274690628, + "learning_rate": 0.0009223332202577815, + "loss": 0.0822, + "num_input_tokens_seen": 69389600, + "step": 32100 + }, + { + "epoch": 5.237357259380098, + "grad_norm": 0.026308605447411537, + "learning_rate": 0.0009222951139169722, + "loss": 0.1353, + "num_input_tokens_seen": 69399424, + "step": 32105 + }, + { + "epoch": 5.238172920065253, + "grad_norm": 0.15939036011695862, + "learning_rate": 0.0009222569990178165, + "loss": 0.0608, + "num_input_tokens_seen": 69410336, + "step": 32110 + }, + { + "epoch": 5.238988580750408, + "grad_norm": 0.267518550157547, + "learning_rate": 0.0009222188755610871, + "loss": 0.1342, + "num_input_tokens_seen": 69421152, + "step": 32115 + }, + { + "epoch": 5.239804241435563, + "grad_norm": 0.009280465543270111, + "learning_rate": 0.0009221807435475564, + "loss": 0.1205, + "num_input_tokens_seen": 69431360, + "step": 32120 + }, + { + "epoch": 5.240619902120717, + "grad_norm": 0.01776033826172352, + "learning_rate": 0.0009221426029779975, + "loss": 0.0569, + "num_input_tokens_seen": 69442144, + "step": 32125 + }, + { + "epoch": 5.241435562805873, + "grad_norm": 0.025634892284870148, + "learning_rate": 0.0009221044538531833, + "loss": 0.0466, + "num_input_tokens_seen": 69452576, + "step": 32130 + }, + { + "epoch": 5.242251223491028, + "grad_norm": 0.04073479771614075, + "learning_rate": 0.0009220662961738868, + "loss": 0.1665, + "num_input_tokens_seen": 69463936, + "step": 32135 + }, + { + "epoch": 5.243066884176183, + "grad_norm": 0.03299302980303764, + "learning_rate": 0.0009220281299408815, + "loss": 0.0561, + "num_input_tokens_seen": 69475424, + "step": 32140 + }, + { + "epoch": 5.2438825448613375, + "grad_norm": 0.047456976026296616, + "learning_rate": 0.0009219899551549405, + "loss": 0.114, + "num_input_tokens_seen": 69485632, + "step": 32145 + }, + { + "epoch": 5.244698205546492, + "grad_norm": 0.029536686837673187, + "learning_rate": 0.0009219517718168379, + "loss": 0.179, + "num_input_tokens_seen": 69495264, + "step": 32150 + }, + { + "epoch": 5.245513866231648, + "grad_norm": 0.6375518441200256, + "learning_rate": 0.0009219135799273474, + "loss": 0.0721, + "num_input_tokens_seen": 69506112, + "step": 32155 + }, + { + "epoch": 5.246329526916803, + "grad_norm": 0.016054809093475342, + "learning_rate": 0.0009218753794872429, + "loss": 0.0486, + "num_input_tokens_seen": 69516768, + "step": 32160 + }, + { + "epoch": 5.247145187601958, + "grad_norm": 0.020715905353426933, + "learning_rate": 0.0009218371704972987, + "loss": 0.0741, + "num_input_tokens_seen": 69528096, + "step": 32165 + }, + { + "epoch": 5.2479608482871125, + "grad_norm": 0.06473297625780106, + "learning_rate": 0.0009217989529582889, + "loss": 0.0302, + "num_input_tokens_seen": 69540000, + "step": 32170 + }, + { + "epoch": 5.248776508972267, + "grad_norm": 0.06317108124494553, + "learning_rate": 0.0009217607268709884, + "loss": 0.0426, + "num_input_tokens_seen": 69551200, + "step": 32175 + }, + { + "epoch": 5.249592169657422, + "grad_norm": 0.012645971961319447, + "learning_rate": 0.0009217224922361718, + "loss": 0.0322, + "num_input_tokens_seen": 69561600, + "step": 32180 + }, + { + "epoch": 5.250407830342578, + "grad_norm": 0.04027498885989189, + "learning_rate": 0.0009216842490546138, + "loss": 0.0872, + "num_input_tokens_seen": 69572608, + "step": 32185 + }, + { + "epoch": 5.251223491027733, + "grad_norm": 0.02918020635843277, + "learning_rate": 0.0009216459973270895, + "loss": 0.1817, + "num_input_tokens_seen": 69584096, + "step": 32190 + }, + { + "epoch": 5.2520391517128875, + "grad_norm": 0.021406283602118492, + "learning_rate": 0.0009216077370543743, + "loss": 0.0901, + "num_input_tokens_seen": 69594240, + "step": 32195 + }, + { + "epoch": 5.252854812398042, + "grad_norm": 0.07893198728561401, + "learning_rate": 0.0009215694682372433, + "loss": 0.0284, + "num_input_tokens_seen": 69606208, + "step": 32200 + }, + { + "epoch": 5.253670473083197, + "grad_norm": 0.0020585639867931604, + "learning_rate": 0.0009215311908764724, + "loss": 0.0273, + "num_input_tokens_seen": 69617504, + "step": 32205 + }, + { + "epoch": 5.254486133768353, + "grad_norm": 0.21157675981521606, + "learning_rate": 0.000921492904972837, + "loss": 0.0974, + "num_input_tokens_seen": 69628000, + "step": 32210 + }, + { + "epoch": 5.255301794453508, + "grad_norm": 0.16144989430904388, + "learning_rate": 0.0009214546105271133, + "loss": 0.4135, + "num_input_tokens_seen": 69639296, + "step": 32215 + }, + { + "epoch": 5.2561174551386625, + "grad_norm": 0.16526293754577637, + "learning_rate": 0.0009214163075400772, + "loss": 0.1462, + "num_input_tokens_seen": 69650272, + "step": 32220 + }, + { + "epoch": 5.256933115823817, + "grad_norm": 0.06682567298412323, + "learning_rate": 0.000921377996012505, + "loss": 0.1254, + "num_input_tokens_seen": 69660544, + "step": 32225 + }, + { + "epoch": 5.257748776508972, + "grad_norm": 0.25775107741355896, + "learning_rate": 0.0009213396759451732, + "loss": 0.1977, + "num_input_tokens_seen": 69671168, + "step": 32230 + }, + { + "epoch": 5.258564437194127, + "grad_norm": 0.13597147166728973, + "learning_rate": 0.0009213013473388584, + "loss": 0.0848, + "num_input_tokens_seen": 69681376, + "step": 32235 + }, + { + "epoch": 5.259380097879283, + "grad_norm": 0.11380225419998169, + "learning_rate": 0.0009212630101943373, + "loss": 0.0665, + "num_input_tokens_seen": 69692192, + "step": 32240 + }, + { + "epoch": 5.260195758564437, + "grad_norm": 0.007670269813388586, + "learning_rate": 0.000921224664512387, + "loss": 0.0274, + "num_input_tokens_seen": 69702816, + "step": 32245 + }, + { + "epoch": 5.261011419249592, + "grad_norm": 0.015922527760267258, + "learning_rate": 0.0009211863102937843, + "loss": 0.0319, + "num_input_tokens_seen": 69712800, + "step": 32250 + }, + { + "epoch": 5.261827079934747, + "grad_norm": 0.0070436312817037106, + "learning_rate": 0.0009211479475393068, + "loss": 0.2056, + "num_input_tokens_seen": 69723808, + "step": 32255 + }, + { + "epoch": 5.262642740619902, + "grad_norm": 0.013316688127815723, + "learning_rate": 0.0009211095762497319, + "loss": 0.0193, + "num_input_tokens_seen": 69734944, + "step": 32260 + }, + { + "epoch": 5.263458401305057, + "grad_norm": 0.023608213290572166, + "learning_rate": 0.0009210711964258372, + "loss": 0.1439, + "num_input_tokens_seen": 69744672, + "step": 32265 + }, + { + "epoch": 5.264274061990212, + "grad_norm": 0.1670239418745041, + "learning_rate": 0.0009210328080684005, + "loss": 0.0817, + "num_input_tokens_seen": 69754944, + "step": 32270 + }, + { + "epoch": 5.265089722675367, + "grad_norm": 0.006368768867105246, + "learning_rate": 0.0009209944111782, + "loss": 0.0374, + "num_input_tokens_seen": 69765824, + "step": 32275 + }, + { + "epoch": 5.265905383360522, + "grad_norm": 0.011990712955594063, + "learning_rate": 0.0009209560057560134, + "loss": 0.1155, + "num_input_tokens_seen": 69777152, + "step": 32280 + }, + { + "epoch": 5.266721044045677, + "grad_norm": 0.07604020088911057, + "learning_rate": 0.0009209175918026195, + "loss": 0.0241, + "num_input_tokens_seen": 69787360, + "step": 32285 + }, + { + "epoch": 5.267536704730832, + "grad_norm": 0.2205602526664734, + "learning_rate": 0.0009208791693187967, + "loss": 0.1131, + "num_input_tokens_seen": 69798048, + "step": 32290 + }, + { + "epoch": 5.268352365415987, + "grad_norm": 0.04664922133088112, + "learning_rate": 0.0009208407383053235, + "loss": 0.0366, + "num_input_tokens_seen": 69809088, + "step": 32295 + }, + { + "epoch": 5.269168026101142, + "grad_norm": 0.007245397195219994, + "learning_rate": 0.000920802298762979, + "loss": 0.0704, + "num_input_tokens_seen": 69819296, + "step": 32300 + }, + { + "epoch": 5.269983686786297, + "grad_norm": 0.023717431351542473, + "learning_rate": 0.0009207638506925419, + "loss": 0.0771, + "num_input_tokens_seen": 69829472, + "step": 32305 + }, + { + "epoch": 5.270799347471452, + "grad_norm": 0.2488313913345337, + "learning_rate": 0.0009207253940947916, + "loss": 0.0537, + "num_input_tokens_seen": 69840320, + "step": 32310 + }, + { + "epoch": 5.271615008156607, + "grad_norm": 0.01920371875166893, + "learning_rate": 0.0009206869289705075, + "loss": 0.1372, + "num_input_tokens_seen": 69852192, + "step": 32315 + }, + { + "epoch": 5.2724306688417615, + "grad_norm": 0.03608312830328941, + "learning_rate": 0.0009206484553204693, + "loss": 0.0895, + "num_input_tokens_seen": 69864032, + "step": 32320 + }, + { + "epoch": 5.273246329526917, + "grad_norm": 0.03217107802629471, + "learning_rate": 0.0009206099731454562, + "loss": 0.0295, + "num_input_tokens_seen": 69874816, + "step": 32325 + }, + { + "epoch": 5.274061990212072, + "grad_norm": 0.03715856000781059, + "learning_rate": 0.0009205714824462487, + "loss": 0.0362, + "num_input_tokens_seen": 69885984, + "step": 32330 + }, + { + "epoch": 5.274877650897227, + "grad_norm": 0.00929944682866335, + "learning_rate": 0.0009205329832236265, + "loss": 0.0338, + "num_input_tokens_seen": 69896000, + "step": 32335 + }, + { + "epoch": 5.275693311582382, + "grad_norm": 0.03817165642976761, + "learning_rate": 0.0009204944754783698, + "loss": 0.1096, + "num_input_tokens_seen": 69907040, + "step": 32340 + }, + { + "epoch": 5.2765089722675365, + "grad_norm": 0.0905759334564209, + "learning_rate": 0.0009204559592112592, + "loss": 0.0265, + "num_input_tokens_seen": 69917280, + "step": 32345 + }, + { + "epoch": 5.277324632952691, + "grad_norm": 0.03727143257856369, + "learning_rate": 0.0009204174344230751, + "loss": 0.2925, + "num_input_tokens_seen": 69929344, + "step": 32350 + }, + { + "epoch": 5.278140293637847, + "grad_norm": 0.01050970796495676, + "learning_rate": 0.0009203789011145984, + "loss": 0.0518, + "num_input_tokens_seen": 69940352, + "step": 32355 + }, + { + "epoch": 5.278955954323002, + "grad_norm": 0.2786335051059723, + "learning_rate": 0.00092034035928661, + "loss": 0.1492, + "num_input_tokens_seen": 69950080, + "step": 32360 + }, + { + "epoch": 5.279771615008157, + "grad_norm": 0.2287655770778656, + "learning_rate": 0.000920301808939891, + "loss": 0.0845, + "num_input_tokens_seen": 69960064, + "step": 32365 + }, + { + "epoch": 5.280587275693311, + "grad_norm": 0.04754359647631645, + "learning_rate": 0.0009202632500752226, + "loss": 0.02, + "num_input_tokens_seen": 69971072, + "step": 32370 + }, + { + "epoch": 5.281402936378466, + "grad_norm": 0.13212303817272186, + "learning_rate": 0.0009202246826933864, + "loss": 0.0811, + "num_input_tokens_seen": 69981408, + "step": 32375 + }, + { + "epoch": 5.282218597063622, + "grad_norm": 0.2108330875635147, + "learning_rate": 0.0009201861067951638, + "loss": 0.0271, + "num_input_tokens_seen": 69991840, + "step": 32380 + }, + { + "epoch": 5.283034257748777, + "grad_norm": 0.02782035619020462, + "learning_rate": 0.0009201475223813368, + "loss": 0.189, + "num_input_tokens_seen": 70002528, + "step": 32385 + }, + { + "epoch": 5.283849918433932, + "grad_norm": 0.16605517268180847, + "learning_rate": 0.0009201089294526872, + "loss": 0.1117, + "num_input_tokens_seen": 70013280, + "step": 32390 + }, + { + "epoch": 5.284665579119086, + "grad_norm": 0.019904859364032745, + "learning_rate": 0.0009200703280099971, + "loss": 0.1801, + "num_input_tokens_seen": 70024864, + "step": 32395 + }, + { + "epoch": 5.285481239804241, + "grad_norm": 0.13329952955245972, + "learning_rate": 0.0009200317180540491, + "loss": 0.1104, + "num_input_tokens_seen": 70035648, + "step": 32400 + }, + { + "epoch": 5.286296900489396, + "grad_norm": 0.12227307260036469, + "learning_rate": 0.0009199930995856254, + "loss": 0.1115, + "num_input_tokens_seen": 70046592, + "step": 32405 + }, + { + "epoch": 5.287112561174552, + "grad_norm": 0.010475658811628819, + "learning_rate": 0.0009199544726055087, + "loss": 0.0213, + "num_input_tokens_seen": 70058016, + "step": 32410 + }, + { + "epoch": 5.287928221859707, + "grad_norm": 0.0636746883392334, + "learning_rate": 0.000919915837114482, + "loss": 0.0393, + "num_input_tokens_seen": 70069536, + "step": 32415 + }, + { + "epoch": 5.288743882544861, + "grad_norm": 0.022627348080277443, + "learning_rate": 0.0009198771931133281, + "loss": 0.1151, + "num_input_tokens_seen": 70079936, + "step": 32420 + }, + { + "epoch": 5.289559543230016, + "grad_norm": 0.34379759430885315, + "learning_rate": 0.0009198385406028302, + "loss": 0.0878, + "num_input_tokens_seen": 70090912, + "step": 32425 + }, + { + "epoch": 5.290375203915171, + "grad_norm": 0.019473228603601456, + "learning_rate": 0.0009197998795837716, + "loss": 0.0394, + "num_input_tokens_seen": 70102624, + "step": 32430 + }, + { + "epoch": 5.291190864600326, + "grad_norm": 0.20492912828922272, + "learning_rate": 0.0009197612100569359, + "loss": 0.1075, + "num_input_tokens_seen": 70112864, + "step": 32435 + }, + { + "epoch": 5.2920065252854815, + "grad_norm": 0.0357382632791996, + "learning_rate": 0.0009197225320231069, + "loss": 0.0834, + "num_input_tokens_seen": 70122304, + "step": 32440 + }, + { + "epoch": 5.292822185970636, + "grad_norm": 0.016824502497911453, + "learning_rate": 0.0009196838454830682, + "loss": 0.016, + "num_input_tokens_seen": 70133728, + "step": 32445 + }, + { + "epoch": 5.293637846655791, + "grad_norm": 0.07850372046232224, + "learning_rate": 0.000919645150437604, + "loss": 0.0894, + "num_input_tokens_seen": 70144384, + "step": 32450 + }, + { + "epoch": 5.294453507340946, + "grad_norm": 0.2341936081647873, + "learning_rate": 0.0009196064468874985, + "loss": 0.1501, + "num_input_tokens_seen": 70155040, + "step": 32455 + }, + { + "epoch": 5.295269168026101, + "grad_norm": 0.03325160965323448, + "learning_rate": 0.0009195677348335361, + "loss": 0.0169, + "num_input_tokens_seen": 70165312, + "step": 32460 + }, + { + "epoch": 5.2960848287112565, + "grad_norm": 0.0046011339873075485, + "learning_rate": 0.0009195290142765012, + "loss": 0.0465, + "num_input_tokens_seen": 70177824, + "step": 32465 + }, + { + "epoch": 5.296900489396411, + "grad_norm": 0.2414412796497345, + "learning_rate": 0.0009194902852171787, + "loss": 0.0773, + "num_input_tokens_seen": 70187904, + "step": 32470 + }, + { + "epoch": 5.297716150081566, + "grad_norm": 0.06987257301807404, + "learning_rate": 0.0009194515476563533, + "loss": 0.0763, + "num_input_tokens_seen": 70199296, + "step": 32475 + }, + { + "epoch": 5.298531810766721, + "grad_norm": 0.1548604816198349, + "learning_rate": 0.0009194128015948103, + "loss": 0.0339, + "num_input_tokens_seen": 70209472, + "step": 32480 + }, + { + "epoch": 5.299347471451876, + "grad_norm": 0.007507277186959982, + "learning_rate": 0.0009193740470333347, + "loss": 0.0868, + "num_input_tokens_seen": 70220832, + "step": 32485 + }, + { + "epoch": 5.300163132137031, + "grad_norm": 0.012318221852183342, + "learning_rate": 0.0009193352839727121, + "loss": 0.2219, + "num_input_tokens_seen": 70230912, + "step": 32490 + }, + { + "epoch": 5.300978792822186, + "grad_norm": 0.04195151478052139, + "learning_rate": 0.0009192965124137279, + "loss": 0.0795, + "num_input_tokens_seen": 70241440, + "step": 32495 + }, + { + "epoch": 5.301794453507341, + "grad_norm": 0.01736997626721859, + "learning_rate": 0.000919257732357168, + "loss": 0.0565, + "num_input_tokens_seen": 70252320, + "step": 32500 + }, + { + "epoch": 5.302610114192496, + "grad_norm": 0.032310500741004944, + "learning_rate": 0.0009192189438038183, + "loss": 0.1439, + "num_input_tokens_seen": 70263584, + "step": 32505 + }, + { + "epoch": 5.303425774877651, + "grad_norm": 0.03387526422739029, + "learning_rate": 0.0009191801467544649, + "loss": 0.0598, + "num_input_tokens_seen": 70275040, + "step": 32510 + }, + { + "epoch": 5.304241435562806, + "grad_norm": 0.17230959236621857, + "learning_rate": 0.0009191413412098942, + "loss": 0.1064, + "num_input_tokens_seen": 70287168, + "step": 32515 + }, + { + "epoch": 5.30505709624796, + "grad_norm": 0.04161704704165459, + "learning_rate": 0.0009191025271708923, + "loss": 0.2394, + "num_input_tokens_seen": 70298848, + "step": 32520 + }, + { + "epoch": 5.305872756933116, + "grad_norm": 0.038167256861925125, + "learning_rate": 0.0009190637046382461, + "loss": 0.0417, + "num_input_tokens_seen": 70309760, + "step": 32525 + }, + { + "epoch": 5.306688417618271, + "grad_norm": 0.16672658920288086, + "learning_rate": 0.0009190248736127422, + "loss": 0.0625, + "num_input_tokens_seen": 70320288, + "step": 32530 + }, + { + "epoch": 5.307504078303426, + "grad_norm": 0.042624689638614655, + "learning_rate": 0.0009189860340951679, + "loss": 0.0649, + "num_input_tokens_seen": 70332128, + "step": 32535 + }, + { + "epoch": 5.308319738988581, + "grad_norm": 0.012349123135209084, + "learning_rate": 0.0009189471860863099, + "loss": 0.0641, + "num_input_tokens_seen": 70342400, + "step": 32540 + }, + { + "epoch": 5.309135399673735, + "grad_norm": 0.2591664791107178, + "learning_rate": 0.0009189083295869558, + "loss": 0.1002, + "num_input_tokens_seen": 70352416, + "step": 32545 + }, + { + "epoch": 5.309951060358891, + "grad_norm": 0.022202063351869583, + "learning_rate": 0.0009188694645978928, + "loss": 0.1235, + "num_input_tokens_seen": 70363296, + "step": 32550 + }, + { + "epoch": 5.310766721044046, + "grad_norm": 0.1519620418548584, + "learning_rate": 0.0009188305911199088, + "loss": 0.1209, + "num_input_tokens_seen": 70375488, + "step": 32555 + }, + { + "epoch": 5.311582381729201, + "grad_norm": 0.025497501716017723, + "learning_rate": 0.0009187917091537918, + "loss": 0.0746, + "num_input_tokens_seen": 70385184, + "step": 32560 + }, + { + "epoch": 5.3123980424143555, + "grad_norm": 0.08437871932983398, + "learning_rate": 0.0009187528187003293, + "loss": 0.1442, + "num_input_tokens_seen": 70395296, + "step": 32565 + }, + { + "epoch": 5.31321370309951, + "grad_norm": 0.05105772614479065, + "learning_rate": 0.0009187139197603097, + "loss": 0.0387, + "num_input_tokens_seen": 70405920, + "step": 32570 + }, + { + "epoch": 5.314029363784665, + "grad_norm": 0.28494322299957275, + "learning_rate": 0.0009186750123345214, + "loss": 0.1898, + "num_input_tokens_seen": 70416960, + "step": 32575 + }, + { + "epoch": 5.314845024469821, + "grad_norm": 0.1514277160167694, + "learning_rate": 0.0009186360964237528, + "loss": 0.13, + "num_input_tokens_seen": 70428480, + "step": 32580 + }, + { + "epoch": 5.315660685154976, + "grad_norm": 0.030695544555783272, + "learning_rate": 0.0009185971720287926, + "loss": 0.0963, + "num_input_tokens_seen": 70439200, + "step": 32585 + }, + { + "epoch": 5.3164763458401305, + "grad_norm": 0.12666253745555878, + "learning_rate": 0.0009185582391504299, + "loss": 0.0913, + "num_input_tokens_seen": 70450656, + "step": 32590 + }, + { + "epoch": 5.317292006525285, + "grad_norm": 0.1707489788532257, + "learning_rate": 0.0009185192977894533, + "loss": 0.1483, + "num_input_tokens_seen": 70461504, + "step": 32595 + }, + { + "epoch": 5.31810766721044, + "grad_norm": 0.07268530875444412, + "learning_rate": 0.0009184803479466521, + "loss": 0.1427, + "num_input_tokens_seen": 70471840, + "step": 32600 + }, + { + "epoch": 5.318923327895595, + "grad_norm": 0.014792957343161106, + "learning_rate": 0.0009184413896228161, + "loss": 0.0997, + "num_input_tokens_seen": 70481536, + "step": 32605 + }, + { + "epoch": 5.319738988580751, + "grad_norm": 0.19271481037139893, + "learning_rate": 0.0009184024228187343, + "loss": 0.0948, + "num_input_tokens_seen": 70492064, + "step": 32610 + }, + { + "epoch": 5.3205546492659055, + "grad_norm": 0.2561456859111786, + "learning_rate": 0.0009183634475351967, + "loss": 0.107, + "num_input_tokens_seen": 70501184, + "step": 32615 + }, + { + "epoch": 5.32137030995106, + "grad_norm": 0.11083754897117615, + "learning_rate": 0.0009183244637729931, + "loss": 0.1501, + "num_input_tokens_seen": 70511808, + "step": 32620 + }, + { + "epoch": 5.322185970636215, + "grad_norm": 0.062356043606996536, + "learning_rate": 0.0009182854715329134, + "loss": 0.0937, + "num_input_tokens_seen": 70521792, + "step": 32625 + }, + { + "epoch": 5.32300163132137, + "grad_norm": 0.013759220018982887, + "learning_rate": 0.0009182464708157481, + "loss": 0.0377, + "num_input_tokens_seen": 70533120, + "step": 32630 + }, + { + "epoch": 5.323817292006526, + "grad_norm": 0.12132881581783295, + "learning_rate": 0.0009182074616222875, + "loss": 0.0632, + "num_input_tokens_seen": 70543360, + "step": 32635 + }, + { + "epoch": 5.3246329526916805, + "grad_norm": 0.08230478316545486, + "learning_rate": 0.0009181684439533223, + "loss": 0.1087, + "num_input_tokens_seen": 70554560, + "step": 32640 + }, + { + "epoch": 5.325448613376835, + "grad_norm": 0.1693173348903656, + "learning_rate": 0.0009181294178096427, + "loss": 0.2002, + "num_input_tokens_seen": 70566080, + "step": 32645 + }, + { + "epoch": 5.32626427406199, + "grad_norm": 0.21043474972248077, + "learning_rate": 0.0009180903831920404, + "loss": 0.1808, + "num_input_tokens_seen": 70576064, + "step": 32650 + }, + { + "epoch": 5.327079934747145, + "grad_norm": 0.0415470190346241, + "learning_rate": 0.0009180513401013059, + "loss": 0.1335, + "num_input_tokens_seen": 70586624, + "step": 32655 + }, + { + "epoch": 5.327895595432301, + "grad_norm": 0.019588204100728035, + "learning_rate": 0.0009180122885382307, + "loss": 0.1125, + "num_input_tokens_seen": 70598176, + "step": 32660 + }, + { + "epoch": 5.328711256117455, + "grad_norm": 0.24973243474960327, + "learning_rate": 0.0009179732285036062, + "loss": 0.2379, + "num_input_tokens_seen": 70608128, + "step": 32665 + }, + { + "epoch": 5.32952691680261, + "grad_norm": 0.006756752263754606, + "learning_rate": 0.0009179341599982239, + "loss": 0.1654, + "num_input_tokens_seen": 70620064, + "step": 32670 + }, + { + "epoch": 5.330342577487765, + "grad_norm": 0.14148695766925812, + "learning_rate": 0.0009178950830228759, + "loss": 0.1429, + "num_input_tokens_seen": 70630048, + "step": 32675 + }, + { + "epoch": 5.33115823817292, + "grad_norm": 0.02704835683107376, + "learning_rate": 0.0009178559975783536, + "loss": 0.0799, + "num_input_tokens_seen": 70639552, + "step": 32680 + }, + { + "epoch": 5.331973898858075, + "grad_norm": 0.24264878034591675, + "learning_rate": 0.0009178169036654496, + "loss": 0.0869, + "num_input_tokens_seen": 70651040, + "step": 32685 + }, + { + "epoch": 5.33278955954323, + "grad_norm": 0.10831085592508316, + "learning_rate": 0.0009177778012849561, + "loss": 0.0775, + "num_input_tokens_seen": 70660736, + "step": 32690 + }, + { + "epoch": 5.333605220228385, + "grad_norm": 0.014376015402376652, + "learning_rate": 0.0009177386904376652, + "loss": 0.0338, + "num_input_tokens_seen": 70671488, + "step": 32695 + }, + { + "epoch": 5.33442088091354, + "grad_norm": 0.029407214373350143, + "learning_rate": 0.0009176995711243699, + "loss": 0.0204, + "num_input_tokens_seen": 70680608, + "step": 32700 + }, + { + "epoch": 5.335236541598695, + "grad_norm": 0.3172883987426758, + "learning_rate": 0.0009176604433458631, + "loss": 0.1396, + "num_input_tokens_seen": 70691200, + "step": 32705 + }, + { + "epoch": 5.33605220228385, + "grad_norm": 0.028004512190818787, + "learning_rate": 0.0009176213071029373, + "loss": 0.0474, + "num_input_tokens_seen": 70702496, + "step": 32710 + }, + { + "epoch": 5.3368678629690045, + "grad_norm": 0.014881027862429619, + "learning_rate": 0.0009175821623963861, + "loss": 0.2455, + "num_input_tokens_seen": 70713856, + "step": 32715 + }, + { + "epoch": 5.33768352365416, + "grad_norm": 0.08989004045724869, + "learning_rate": 0.0009175430092270026, + "loss": 0.1038, + "num_input_tokens_seen": 70724320, + "step": 32720 + }, + { + "epoch": 5.338499184339315, + "grad_norm": 0.05116923525929451, + "learning_rate": 0.0009175038475955804, + "loss": 0.0783, + "num_input_tokens_seen": 70735232, + "step": 32725 + }, + { + "epoch": 5.33931484502447, + "grad_norm": 0.06526987254619598, + "learning_rate": 0.0009174646775029129, + "loss": 0.0791, + "num_input_tokens_seen": 70746528, + "step": 32730 + }, + { + "epoch": 5.340130505709625, + "grad_norm": 0.028074419125914574, + "learning_rate": 0.0009174254989497942, + "loss": 0.057, + "num_input_tokens_seen": 70756320, + "step": 32735 + }, + { + "epoch": 5.3409461663947795, + "grad_norm": 0.12809574604034424, + "learning_rate": 0.0009173863119370183, + "loss": 0.0726, + "num_input_tokens_seen": 70768064, + "step": 32740 + }, + { + "epoch": 5.341761827079935, + "grad_norm": 0.12281595915555954, + "learning_rate": 0.0009173471164653791, + "loss": 0.0536, + "num_input_tokens_seen": 70779488, + "step": 32745 + }, + { + "epoch": 5.34257748776509, + "grad_norm": 0.08573044091463089, + "learning_rate": 0.0009173079125356714, + "loss": 0.1067, + "num_input_tokens_seen": 70789728, + "step": 32750 + }, + { + "epoch": 5.343393148450245, + "grad_norm": 0.19512666761875153, + "learning_rate": 0.0009172687001486892, + "loss": 0.1032, + "num_input_tokens_seen": 70800992, + "step": 32755 + }, + { + "epoch": 5.3442088091354, + "grad_norm": 0.07091918587684631, + "learning_rate": 0.0009172294793052277, + "loss": 0.0405, + "num_input_tokens_seen": 70812032, + "step": 32760 + }, + { + "epoch": 5.3450244698205545, + "grad_norm": 0.312071293592453, + "learning_rate": 0.0009171902500060814, + "loss": 0.1494, + "num_input_tokens_seen": 70824032, + "step": 32765 + }, + { + "epoch": 5.345840130505709, + "grad_norm": 0.004325787536799908, + "learning_rate": 0.0009171510122520455, + "loss": 0.0629, + "num_input_tokens_seen": 70835648, + "step": 32770 + }, + { + "epoch": 5.346655791190865, + "grad_norm": 0.01614411175251007, + "learning_rate": 0.000917111766043915, + "loss": 0.0197, + "num_input_tokens_seen": 70846080, + "step": 32775 + }, + { + "epoch": 5.34747145187602, + "grad_norm": 0.1334189772605896, + "learning_rate": 0.0009170725113824855, + "loss": 0.2134, + "num_input_tokens_seen": 70856096, + "step": 32780 + }, + { + "epoch": 5.348287112561175, + "grad_norm": 0.07463495433330536, + "learning_rate": 0.0009170332482685524, + "loss": 0.0312, + "num_input_tokens_seen": 70867168, + "step": 32785 + }, + { + "epoch": 5.349102773246329, + "grad_norm": 0.11940980702638626, + "learning_rate": 0.0009169939767029116, + "loss": 0.0687, + "num_input_tokens_seen": 70878080, + "step": 32790 + }, + { + "epoch": 5.349918433931484, + "grad_norm": 0.020729253068566322, + "learning_rate": 0.0009169546966863588, + "loss": 0.1368, + "num_input_tokens_seen": 70889472, + "step": 32795 + }, + { + "epoch": 5.350734094616639, + "grad_norm": 0.14954179525375366, + "learning_rate": 0.0009169154082196901, + "loss": 0.0857, + "num_input_tokens_seen": 70900640, + "step": 32800 + }, + { + "epoch": 5.351549755301795, + "grad_norm": 0.32872921228408813, + "learning_rate": 0.0009168761113037019, + "loss": 0.128, + "num_input_tokens_seen": 70909856, + "step": 32805 + }, + { + "epoch": 5.35236541598695, + "grad_norm": 0.0425887331366539, + "learning_rate": 0.0009168368059391903, + "loss": 0.0225, + "num_input_tokens_seen": 70921248, + "step": 32810 + }, + { + "epoch": 5.353181076672104, + "grad_norm": 0.078069768846035, + "learning_rate": 0.0009167974921269519, + "loss": 0.1495, + "num_input_tokens_seen": 70930720, + "step": 32815 + }, + { + "epoch": 5.353996737357259, + "grad_norm": 0.14288416504859924, + "learning_rate": 0.0009167581698677838, + "loss": 0.0707, + "num_input_tokens_seen": 70940448, + "step": 32820 + }, + { + "epoch": 5.354812398042414, + "grad_norm": 0.022837474942207336, + "learning_rate": 0.0009167188391624827, + "loss": 0.0113, + "num_input_tokens_seen": 70952000, + "step": 32825 + }, + { + "epoch": 5.35562805872757, + "grad_norm": 0.008411848917603493, + "learning_rate": 0.0009166795000118456, + "loss": 0.0717, + "num_input_tokens_seen": 70961856, + "step": 32830 + }, + { + "epoch": 5.356443719412725, + "grad_norm": 0.0260999146848917, + "learning_rate": 0.0009166401524166699, + "loss": 0.0281, + "num_input_tokens_seen": 70972320, + "step": 32835 + }, + { + "epoch": 5.357259380097879, + "grad_norm": 0.0431085005402565, + "learning_rate": 0.000916600796377753, + "loss": 0.0942, + "num_input_tokens_seen": 70983648, + "step": 32840 + }, + { + "epoch": 5.358075040783034, + "grad_norm": 0.01479522418230772, + "learning_rate": 0.0009165614318958924, + "loss": 0.1821, + "num_input_tokens_seen": 70993952, + "step": 32845 + }, + { + "epoch": 5.358890701468189, + "grad_norm": 0.27183374762535095, + "learning_rate": 0.0009165220589718859, + "loss": 0.1811, + "num_input_tokens_seen": 71004160, + "step": 32850 + }, + { + "epoch": 5.359706362153344, + "grad_norm": 0.07483382523059845, + "learning_rate": 0.0009164826776065316, + "loss": 0.0864, + "num_input_tokens_seen": 71015264, + "step": 32855 + }, + { + "epoch": 5.3605220228384995, + "grad_norm": 0.008126812987029552, + "learning_rate": 0.0009164432878006274, + "loss": 0.1002, + "num_input_tokens_seen": 71026560, + "step": 32860 + }, + { + "epoch": 5.361337683523654, + "grad_norm": 0.2765503227710724, + "learning_rate": 0.0009164038895549716, + "loss": 0.2013, + "num_input_tokens_seen": 71038752, + "step": 32865 + }, + { + "epoch": 5.362153344208809, + "grad_norm": 0.191825270652771, + "learning_rate": 0.0009163644828703628, + "loss": 0.1433, + "num_input_tokens_seen": 71050112, + "step": 32870 + }, + { + "epoch": 5.362969004893964, + "grad_norm": 0.04115011915564537, + "learning_rate": 0.0009163250677475996, + "loss": 0.0803, + "num_input_tokens_seen": 71061600, + "step": 32875 + }, + { + "epoch": 5.363784665579119, + "grad_norm": 0.010353055782616138, + "learning_rate": 0.0009162856441874807, + "loss": 0.0795, + "num_input_tokens_seen": 71072960, + "step": 32880 + }, + { + "epoch": 5.364600326264274, + "grad_norm": 0.03754093125462532, + "learning_rate": 0.0009162462121908052, + "loss": 0.1689, + "num_input_tokens_seen": 71083872, + "step": 32885 + }, + { + "epoch": 5.365415986949429, + "grad_norm": 0.20056197047233582, + "learning_rate": 0.0009162067717583722, + "loss": 0.2398, + "num_input_tokens_seen": 71094368, + "step": 32890 + }, + { + "epoch": 5.366231647634584, + "grad_norm": 0.26877015829086304, + "learning_rate": 0.0009161673228909808, + "loss": 0.1409, + "num_input_tokens_seen": 71105472, + "step": 32895 + }, + { + "epoch": 5.367047308319739, + "grad_norm": 0.035021211951971054, + "learning_rate": 0.0009161278655894307, + "loss": 0.0337, + "num_input_tokens_seen": 71115840, + "step": 32900 + }, + { + "epoch": 5.367862969004894, + "grad_norm": 0.17578844726085663, + "learning_rate": 0.0009160883998545216, + "loss": 0.0935, + "num_input_tokens_seen": 71126240, + "step": 32905 + }, + { + "epoch": 5.368678629690049, + "grad_norm": 0.054084427654743195, + "learning_rate": 0.0009160489256870532, + "loss": 0.1648, + "num_input_tokens_seen": 71136640, + "step": 32910 + }, + { + "epoch": 5.369494290375204, + "grad_norm": 0.03351510688662529, + "learning_rate": 0.0009160094430878255, + "loss": 0.1695, + "num_input_tokens_seen": 71146592, + "step": 32915 + }, + { + "epoch": 5.370309951060359, + "grad_norm": 0.013419684953987598, + "learning_rate": 0.0009159699520576388, + "loss": 0.1165, + "num_input_tokens_seen": 71157664, + "step": 32920 + }, + { + "epoch": 5.371125611745514, + "grad_norm": 0.2606358528137207, + "learning_rate": 0.0009159304525972931, + "loss": 0.0713, + "num_input_tokens_seen": 71168352, + "step": 32925 + }, + { + "epoch": 5.371941272430669, + "grad_norm": 0.041385188698768616, + "learning_rate": 0.0009158909447075894, + "loss": 0.0725, + "num_input_tokens_seen": 71178432, + "step": 32930 + }, + { + "epoch": 5.372756933115824, + "grad_norm": 0.2104557901620865, + "learning_rate": 0.0009158514283893279, + "loss": 0.1242, + "num_input_tokens_seen": 71188256, + "step": 32935 + }, + { + "epoch": 5.373572593800978, + "grad_norm": 0.1753581315279007, + "learning_rate": 0.0009158119036433097, + "loss": 0.1321, + "num_input_tokens_seen": 71198720, + "step": 32940 + }, + { + "epoch": 5.374388254486134, + "grad_norm": 0.10579296201467514, + "learning_rate": 0.0009157723704703358, + "loss": 0.1819, + "num_input_tokens_seen": 71208928, + "step": 32945 + }, + { + "epoch": 5.375203915171289, + "grad_norm": 0.11044318228960037, + "learning_rate": 0.0009157328288712075, + "loss": 0.0461, + "num_input_tokens_seen": 71220640, + "step": 32950 + }, + { + "epoch": 5.376019575856444, + "grad_norm": 0.015562590211629868, + "learning_rate": 0.0009156932788467259, + "loss": 0.0734, + "num_input_tokens_seen": 71230976, + "step": 32955 + }, + { + "epoch": 5.376835236541599, + "grad_norm": 0.021977802738547325, + "learning_rate": 0.0009156537203976927, + "loss": 0.0687, + "num_input_tokens_seen": 71241696, + "step": 32960 + }, + { + "epoch": 5.377650897226753, + "grad_norm": 0.1440175622701645, + "learning_rate": 0.0009156141535249094, + "loss": 0.0302, + "num_input_tokens_seen": 71253792, + "step": 32965 + }, + { + "epoch": 5.378466557911908, + "grad_norm": 0.17520390450954437, + "learning_rate": 0.0009155745782291782, + "loss": 0.063, + "num_input_tokens_seen": 71263584, + "step": 32970 + }, + { + "epoch": 5.379282218597064, + "grad_norm": 0.011420628987252712, + "learning_rate": 0.000915534994511301, + "loss": 0.0151, + "num_input_tokens_seen": 71273760, + "step": 32975 + }, + { + "epoch": 5.380097879282219, + "grad_norm": 0.25457486510276794, + "learning_rate": 0.0009154954023720799, + "loss": 0.045, + "num_input_tokens_seen": 71284512, + "step": 32980 + }, + { + "epoch": 5.3809135399673735, + "grad_norm": 0.023655150085687637, + "learning_rate": 0.0009154558018123174, + "loss": 0.1596, + "num_input_tokens_seen": 71294560, + "step": 32985 + }, + { + "epoch": 5.381729200652528, + "grad_norm": 0.06550729274749756, + "learning_rate": 0.000915416192832816, + "loss": 0.1011, + "num_input_tokens_seen": 71304576, + "step": 32990 + }, + { + "epoch": 5.382544861337683, + "grad_norm": 0.009049389511346817, + "learning_rate": 0.0009153765754343786, + "loss": 0.0417, + "num_input_tokens_seen": 71315136, + "step": 32995 + }, + { + "epoch": 5.383360522022839, + "grad_norm": 0.09317310899496078, + "learning_rate": 0.0009153369496178078, + "loss": 0.1479, + "num_input_tokens_seen": 71324800, + "step": 33000 + }, + { + "epoch": 5.384176182707994, + "grad_norm": 0.11149311810731888, + "learning_rate": 0.0009152973153839068, + "loss": 0.1349, + "num_input_tokens_seen": 71336320, + "step": 33005 + }, + { + "epoch": 5.3849918433931485, + "grad_norm": 0.25844934582710266, + "learning_rate": 0.000915257672733479, + "loss": 0.1111, + "num_input_tokens_seen": 71348128, + "step": 33010 + }, + { + "epoch": 5.385807504078303, + "grad_norm": 0.008003010414540768, + "learning_rate": 0.0009152180216673276, + "loss": 0.0315, + "num_input_tokens_seen": 71358816, + "step": 33015 + }, + { + "epoch": 5.386623164763458, + "grad_norm": 0.12192642688751221, + "learning_rate": 0.0009151783621862564, + "loss": 0.0382, + "num_input_tokens_seen": 71369728, + "step": 33020 + }, + { + "epoch": 5.387438825448613, + "grad_norm": 0.010448573157191277, + "learning_rate": 0.0009151386942910688, + "loss": 0.0945, + "num_input_tokens_seen": 71381056, + "step": 33025 + }, + { + "epoch": 5.388254486133769, + "grad_norm": 0.0021205150987952948, + "learning_rate": 0.0009150990179825689, + "loss": 0.0383, + "num_input_tokens_seen": 71391456, + "step": 33030 + }, + { + "epoch": 5.3890701468189235, + "grad_norm": 0.019802767783403397, + "learning_rate": 0.000915059333261561, + "loss": 0.1716, + "num_input_tokens_seen": 71402208, + "step": 33035 + }, + { + "epoch": 5.389885807504078, + "grad_norm": 0.06028865650296211, + "learning_rate": 0.0009150196401288491, + "loss": 0.0465, + "num_input_tokens_seen": 71412864, + "step": 33040 + }, + { + "epoch": 5.390701468189233, + "grad_norm": 0.017764244228601456, + "learning_rate": 0.0009149799385852375, + "loss": 0.0465, + "num_input_tokens_seen": 71423840, + "step": 33045 + }, + { + "epoch": 5.391517128874388, + "grad_norm": 0.04317443072795868, + "learning_rate": 0.0009149402286315314, + "loss": 0.0318, + "num_input_tokens_seen": 71432480, + "step": 33050 + }, + { + "epoch": 5.392332789559543, + "grad_norm": 0.2608911693096161, + "learning_rate": 0.0009149005102685348, + "loss": 0.1132, + "num_input_tokens_seen": 71445152, + "step": 33055 + }, + { + "epoch": 5.3931484502446985, + "grad_norm": 0.03656046465039253, + "learning_rate": 0.0009148607834970532, + "loss": 0.2079, + "num_input_tokens_seen": 71455808, + "step": 33060 + }, + { + "epoch": 5.393964110929853, + "grad_norm": 0.16922008991241455, + "learning_rate": 0.0009148210483178916, + "loss": 0.0763, + "num_input_tokens_seen": 71467264, + "step": 33065 + }, + { + "epoch": 5.394779771615008, + "grad_norm": 0.015841931104660034, + "learning_rate": 0.000914781304731855, + "loss": 0.0865, + "num_input_tokens_seen": 71476832, + "step": 33070 + }, + { + "epoch": 5.395595432300163, + "grad_norm": 0.24486678838729858, + "learning_rate": 0.0009147415527397492, + "loss": 0.1865, + "num_input_tokens_seen": 71487840, + "step": 33075 + }, + { + "epoch": 5.396411092985318, + "grad_norm": 0.13854020833969116, + "learning_rate": 0.0009147017923423797, + "loss": 0.1851, + "num_input_tokens_seen": 71499776, + "step": 33080 + }, + { + "epoch": 5.397226753670473, + "grad_norm": 0.04607773944735527, + "learning_rate": 0.0009146620235405523, + "loss": 0.0428, + "num_input_tokens_seen": 71511232, + "step": 33085 + }, + { + "epoch": 5.398042414355628, + "grad_norm": 0.04011186957359314, + "learning_rate": 0.0009146222463350729, + "loss": 0.1327, + "num_input_tokens_seen": 71523040, + "step": 33090 + }, + { + "epoch": 5.398858075040783, + "grad_norm": 0.06927596777677536, + "learning_rate": 0.0009145824607267478, + "loss": 0.1235, + "num_input_tokens_seen": 71533216, + "step": 33095 + }, + { + "epoch": 5.399673735725938, + "grad_norm": 0.043770719319581985, + "learning_rate": 0.0009145426667163832, + "loss": 0.0413, + "num_input_tokens_seen": 71544256, + "step": 33100 + }, + { + "epoch": 5.400489396411093, + "grad_norm": 0.29412418603897095, + "learning_rate": 0.0009145028643047855, + "loss": 0.0871, + "num_input_tokens_seen": 71555104, + "step": 33105 + }, + { + "epoch": 5.401305057096248, + "grad_norm": 0.09121891856193542, + "learning_rate": 0.0009144630534927613, + "loss": 0.0808, + "num_input_tokens_seen": 71566176, + "step": 33110 + }, + { + "epoch": 5.402120717781403, + "grad_norm": 0.3319765329360962, + "learning_rate": 0.0009144232342811179, + "loss": 0.1327, + "num_input_tokens_seen": 71577440, + "step": 33115 + }, + { + "epoch": 5.402936378466558, + "grad_norm": 0.017797963693737984, + "learning_rate": 0.0009143834066706615, + "loss": 0.0248, + "num_input_tokens_seen": 71588160, + "step": 33120 + }, + { + "epoch": 5.403752039151713, + "grad_norm": 0.165283203125, + "learning_rate": 0.0009143435706621999, + "loss": 0.0478, + "num_input_tokens_seen": 71598848, + "step": 33125 + }, + { + "epoch": 5.404567699836868, + "grad_norm": 0.011903224512934685, + "learning_rate": 0.0009143037262565401, + "loss": 0.1196, + "num_input_tokens_seen": 71609536, + "step": 33130 + }, + { + "epoch": 5.4053833605220225, + "grad_norm": 0.08311284333467484, + "learning_rate": 0.00091426387345449, + "loss": 0.1009, + "num_input_tokens_seen": 71620576, + "step": 33135 + }, + { + "epoch": 5.406199021207178, + "grad_norm": 0.032639991492033005, + "learning_rate": 0.0009142240122568566, + "loss": 0.1154, + "num_input_tokens_seen": 71632800, + "step": 33140 + }, + { + "epoch": 5.407014681892333, + "grad_norm": 0.1708725243806839, + "learning_rate": 0.0009141841426644482, + "loss": 0.0963, + "num_input_tokens_seen": 71643392, + "step": 33145 + }, + { + "epoch": 5.407830342577488, + "grad_norm": 0.1919700801372528, + "learning_rate": 0.0009141442646780728, + "loss": 0.0418, + "num_input_tokens_seen": 71653824, + "step": 33150 + }, + { + "epoch": 5.408646003262643, + "grad_norm": 0.05919947475194931, + "learning_rate": 0.0009141043782985385, + "loss": 0.0129, + "num_input_tokens_seen": 71664288, + "step": 33155 + }, + { + "epoch": 5.4094616639477975, + "grad_norm": 0.0068783871829509735, + "learning_rate": 0.0009140644835266537, + "loss": 0.1369, + "num_input_tokens_seen": 71674848, + "step": 33160 + }, + { + "epoch": 5.410277324632952, + "grad_norm": 0.07434239238500595, + "learning_rate": 0.0009140245803632268, + "loss": 0.0418, + "num_input_tokens_seen": 71685280, + "step": 33165 + }, + { + "epoch": 5.411092985318108, + "grad_norm": 0.05117892101407051, + "learning_rate": 0.0009139846688090665, + "loss": 0.0888, + "num_input_tokens_seen": 71695584, + "step": 33170 + }, + { + "epoch": 5.411908646003263, + "grad_norm": 0.016075173392891884, + "learning_rate": 0.0009139447488649818, + "loss": 0.0488, + "num_input_tokens_seen": 71706016, + "step": 33175 + }, + { + "epoch": 5.412724306688418, + "grad_norm": 0.18677929043769836, + "learning_rate": 0.0009139048205317817, + "loss": 0.12, + "num_input_tokens_seen": 71718144, + "step": 33180 + }, + { + "epoch": 5.4135399673735725, + "grad_norm": 0.01021922379732132, + "learning_rate": 0.0009138648838102751, + "loss": 0.2681, + "num_input_tokens_seen": 71729600, + "step": 33185 + }, + { + "epoch": 5.414355628058727, + "grad_norm": 0.19370241463184357, + "learning_rate": 0.0009138249387012718, + "loss": 0.2159, + "num_input_tokens_seen": 71739680, + "step": 33190 + }, + { + "epoch": 5.415171288743883, + "grad_norm": 0.07409081608057022, + "learning_rate": 0.000913784985205581, + "loss": 0.0201, + "num_input_tokens_seen": 71750848, + "step": 33195 + }, + { + "epoch": 5.415986949429038, + "grad_norm": 0.027513748034834862, + "learning_rate": 0.0009137450233240127, + "loss": 0.075, + "num_input_tokens_seen": 71762080, + "step": 33200 + }, + { + "epoch": 5.416802610114193, + "grad_norm": 0.00557843130081892, + "learning_rate": 0.0009137050530573765, + "loss": 0.0684, + "num_input_tokens_seen": 71772320, + "step": 33205 + }, + { + "epoch": 5.417618270799347, + "grad_norm": 0.16831335425376892, + "learning_rate": 0.0009136650744064827, + "loss": 0.1361, + "num_input_tokens_seen": 71782912, + "step": 33210 + }, + { + "epoch": 5.418433931484502, + "grad_norm": 0.037396691739559174, + "learning_rate": 0.0009136250873721413, + "loss": 0.0382, + "num_input_tokens_seen": 71793696, + "step": 33215 + }, + { + "epoch": 5.419249592169657, + "grad_norm": 0.2280731201171875, + "learning_rate": 0.0009135850919551628, + "loss": 0.104, + "num_input_tokens_seen": 71803584, + "step": 33220 + }, + { + "epoch": 5.420065252854813, + "grad_norm": 0.035369303077459335, + "learning_rate": 0.0009135450881563578, + "loss": 0.0536, + "num_input_tokens_seen": 71815232, + "step": 33225 + }, + { + "epoch": 5.420880913539968, + "grad_norm": 0.2152285873889923, + "learning_rate": 0.0009135050759765369, + "loss": 0.0528, + "num_input_tokens_seen": 71824960, + "step": 33230 + }, + { + "epoch": 5.421696574225122, + "grad_norm": 0.03372219577431679, + "learning_rate": 0.0009134650554165111, + "loss": 0.0108, + "num_input_tokens_seen": 71836576, + "step": 33235 + }, + { + "epoch": 5.422512234910277, + "grad_norm": 0.03322592377662659, + "learning_rate": 0.0009134250264770914, + "loss": 0.0154, + "num_input_tokens_seen": 71847168, + "step": 33240 + }, + { + "epoch": 5.423327895595432, + "grad_norm": 0.1197444424033165, + "learning_rate": 0.0009133849891590891, + "loss": 0.0457, + "num_input_tokens_seen": 71857504, + "step": 33245 + }, + { + "epoch": 5.424143556280587, + "grad_norm": 0.1454666703939438, + "learning_rate": 0.0009133449434633157, + "loss": 0.1297, + "num_input_tokens_seen": 71868608, + "step": 33250 + }, + { + "epoch": 5.424959216965743, + "grad_norm": 0.15138794481754303, + "learning_rate": 0.0009133048893905824, + "loss": 0.2438, + "num_input_tokens_seen": 71879520, + "step": 33255 + }, + { + "epoch": 5.425774877650897, + "grad_norm": 0.06414239853620529, + "learning_rate": 0.0009132648269417014, + "loss": 0.1642, + "num_input_tokens_seen": 71890304, + "step": 33260 + }, + { + "epoch": 5.426590538336052, + "grad_norm": 0.004905904643237591, + "learning_rate": 0.0009132247561174843, + "loss": 0.1074, + "num_input_tokens_seen": 71900480, + "step": 33265 + }, + { + "epoch": 5.427406199021207, + "grad_norm": 0.029563914984464645, + "learning_rate": 0.0009131846769187434, + "loss": 0.0306, + "num_input_tokens_seen": 71911136, + "step": 33270 + }, + { + "epoch": 5.428221859706362, + "grad_norm": 0.028971252962946892, + "learning_rate": 0.0009131445893462908, + "loss": 0.0627, + "num_input_tokens_seen": 71923296, + "step": 33275 + }, + { + "epoch": 5.4290375203915175, + "grad_norm": 0.21459606289863586, + "learning_rate": 0.000913104493400939, + "loss": 0.072, + "num_input_tokens_seen": 71934464, + "step": 33280 + }, + { + "epoch": 5.429853181076672, + "grad_norm": 0.1923767775297165, + "learning_rate": 0.0009130643890835007, + "loss": 0.2351, + "num_input_tokens_seen": 71944672, + "step": 33285 + }, + { + "epoch": 5.430668841761827, + "grad_norm": 0.039720792323350906, + "learning_rate": 0.0009130242763947884, + "loss": 0.0836, + "num_input_tokens_seen": 71955872, + "step": 33290 + }, + { + "epoch": 5.431484502446982, + "grad_norm": 0.22541844844818115, + "learning_rate": 0.0009129841553356152, + "loss": 0.0747, + "num_input_tokens_seen": 71966400, + "step": 33295 + }, + { + "epoch": 5.432300163132137, + "grad_norm": 0.1534278839826584, + "learning_rate": 0.0009129440259067941, + "loss": 0.0594, + "num_input_tokens_seen": 71978240, + "step": 33300 + }, + { + "epoch": 5.433115823817292, + "grad_norm": 0.3853054940700531, + "learning_rate": 0.0009129038881091386, + "loss": 0.3502, + "num_input_tokens_seen": 71988640, + "step": 33305 + }, + { + "epoch": 5.433931484502447, + "grad_norm": 0.04415808245539665, + "learning_rate": 0.000912863741943462, + "loss": 0.0426, + "num_input_tokens_seen": 72000032, + "step": 33310 + }, + { + "epoch": 5.434747145187602, + "grad_norm": 0.14555767178535461, + "learning_rate": 0.000912823587410578, + "loss": 0.1448, + "num_input_tokens_seen": 72010304, + "step": 33315 + }, + { + "epoch": 5.435562805872757, + "grad_norm": 0.046213071793317795, + "learning_rate": 0.0009127834245113, + "loss": 0.0813, + "num_input_tokens_seen": 72021376, + "step": 33320 + }, + { + "epoch": 5.436378466557912, + "grad_norm": 0.21453042328357697, + "learning_rate": 0.0009127432532464424, + "loss": 0.0588, + "num_input_tokens_seen": 72031392, + "step": 33325 + }, + { + "epoch": 5.437194127243067, + "grad_norm": 0.22624626755714417, + "learning_rate": 0.0009127030736168192, + "loss": 0.1267, + "num_input_tokens_seen": 72041600, + "step": 33330 + }, + { + "epoch": 5.438009787928221, + "grad_norm": 0.19587194919586182, + "learning_rate": 0.0009126628856232446, + "loss": 0.094, + "num_input_tokens_seen": 72052288, + "step": 33335 + }, + { + "epoch": 5.438825448613377, + "grad_norm": 0.19771817326545715, + "learning_rate": 0.0009126226892665333, + "loss": 0.1786, + "num_input_tokens_seen": 72062656, + "step": 33340 + }, + { + "epoch": 5.439641109298532, + "grad_norm": 0.03184128552675247, + "learning_rate": 0.0009125824845474996, + "loss": 0.0166, + "num_input_tokens_seen": 72072416, + "step": 33345 + }, + { + "epoch": 5.440456769983687, + "grad_norm": 0.13524416089057922, + "learning_rate": 0.0009125422714669584, + "loss": 0.1857, + "num_input_tokens_seen": 72082624, + "step": 33350 + }, + { + "epoch": 5.441272430668842, + "grad_norm": 0.2313859611749649, + "learning_rate": 0.0009125020500257248, + "loss": 0.1425, + "num_input_tokens_seen": 72094144, + "step": 33355 + }, + { + "epoch": 5.442088091353996, + "grad_norm": 0.11446195840835571, + "learning_rate": 0.000912461820224614, + "loss": 0.0561, + "num_input_tokens_seen": 72105312, + "step": 33360 + }, + { + "epoch": 5.442903752039152, + "grad_norm": 0.019046427682042122, + "learning_rate": 0.000912421582064441, + "loss": 0.2843, + "num_input_tokens_seen": 72116096, + "step": 33365 + }, + { + "epoch": 5.443719412724307, + "grad_norm": 0.01346815936267376, + "learning_rate": 0.0009123813355460214, + "loss": 0.0992, + "num_input_tokens_seen": 72127392, + "step": 33370 + }, + { + "epoch": 5.444535073409462, + "grad_norm": 0.07554985582828522, + "learning_rate": 0.000912341080670171, + "loss": 0.0914, + "num_input_tokens_seen": 72137440, + "step": 33375 + }, + { + "epoch": 5.445350734094617, + "grad_norm": 0.14728760719299316, + "learning_rate": 0.0009123008174377054, + "loss": 0.0779, + "num_input_tokens_seen": 72150112, + "step": 33380 + }, + { + "epoch": 5.446166394779771, + "grad_norm": 0.057900868356227875, + "learning_rate": 0.0009122605458494409, + "loss": 0.1828, + "num_input_tokens_seen": 72159296, + "step": 33385 + }, + { + "epoch": 5.446982055464926, + "grad_norm": 0.11225569248199463, + "learning_rate": 0.0009122202659061934, + "loss": 0.0435, + "num_input_tokens_seen": 72170240, + "step": 33390 + }, + { + "epoch": 5.447797716150082, + "grad_norm": 0.036758918315172195, + "learning_rate": 0.0009121799776087791, + "loss": 0.0167, + "num_input_tokens_seen": 72180672, + "step": 33395 + }, + { + "epoch": 5.448613376835237, + "grad_norm": 0.007878992706537247, + "learning_rate": 0.0009121396809580147, + "loss": 0.0929, + "num_input_tokens_seen": 72191264, + "step": 33400 + }, + { + "epoch": 5.4494290375203915, + "grad_norm": 0.17817571759223938, + "learning_rate": 0.0009120993759547169, + "loss": 0.1207, + "num_input_tokens_seen": 72201472, + "step": 33405 + }, + { + "epoch": 5.450244698205546, + "grad_norm": 0.07467425614595413, + "learning_rate": 0.0009120590625997026, + "loss": 0.063, + "num_input_tokens_seen": 72213024, + "step": 33410 + }, + { + "epoch": 5.451060358890701, + "grad_norm": 0.08875560760498047, + "learning_rate": 0.0009120187408937884, + "loss": 0.0549, + "num_input_tokens_seen": 72224288, + "step": 33415 + }, + { + "epoch": 5.451876019575856, + "grad_norm": 0.15524475276470184, + "learning_rate": 0.0009119784108377918, + "loss": 0.0649, + "num_input_tokens_seen": 72235104, + "step": 33420 + }, + { + "epoch": 5.452691680261012, + "grad_norm": 0.010183833539485931, + "learning_rate": 0.0009119380724325302, + "loss": 0.2185, + "num_input_tokens_seen": 72245856, + "step": 33425 + }, + { + "epoch": 5.4535073409461665, + "grad_norm": 0.053868986666202545, + "learning_rate": 0.0009118977256788208, + "loss": 0.0478, + "num_input_tokens_seen": 72256032, + "step": 33430 + }, + { + "epoch": 5.454323001631321, + "grad_norm": 0.11249548941850662, + "learning_rate": 0.0009118573705774815, + "loss": 0.1081, + "num_input_tokens_seen": 72267104, + "step": 33435 + }, + { + "epoch": 5.455138662316476, + "grad_norm": 0.008952013216912746, + "learning_rate": 0.0009118170071293302, + "loss": 0.0212, + "num_input_tokens_seen": 72278912, + "step": 33440 + }, + { + "epoch": 5.455954323001631, + "grad_norm": 0.03076971136033535, + "learning_rate": 0.0009117766353351848, + "loss": 0.083, + "num_input_tokens_seen": 72289312, + "step": 33445 + }, + { + "epoch": 5.456769983686787, + "grad_norm": 0.023297972977161407, + "learning_rate": 0.0009117362551958635, + "loss": 0.0506, + "num_input_tokens_seen": 72300896, + "step": 33450 + }, + { + "epoch": 5.4575856443719415, + "grad_norm": 0.028049878776073456, + "learning_rate": 0.0009116958667121847, + "loss": 0.021, + "num_input_tokens_seen": 72312288, + "step": 33455 + }, + { + "epoch": 5.458401305057096, + "grad_norm": 0.12048943340778351, + "learning_rate": 0.0009116554698849668, + "loss": 0.1195, + "num_input_tokens_seen": 72322912, + "step": 33460 + }, + { + "epoch": 5.459216965742251, + "grad_norm": 0.019078262150287628, + "learning_rate": 0.0009116150647150286, + "loss": 0.0531, + "num_input_tokens_seen": 72333600, + "step": 33465 + }, + { + "epoch": 5.460032626427406, + "grad_norm": 0.05092627555131912, + "learning_rate": 0.0009115746512031891, + "loss": 0.1175, + "num_input_tokens_seen": 72344096, + "step": 33470 + }, + { + "epoch": 5.460848287112561, + "grad_norm": 0.10556714981794357, + "learning_rate": 0.0009115342293502669, + "loss": 0.1504, + "num_input_tokens_seen": 72354432, + "step": 33475 + }, + { + "epoch": 5.4616639477977165, + "grad_norm": 0.0820518508553505, + "learning_rate": 0.0009114937991570817, + "loss": 0.0532, + "num_input_tokens_seen": 72366400, + "step": 33480 + }, + { + "epoch": 5.462479608482871, + "grad_norm": 0.04745522886514664, + "learning_rate": 0.0009114533606244526, + "loss": 0.0263, + "num_input_tokens_seen": 72375808, + "step": 33485 + }, + { + "epoch": 5.463295269168026, + "grad_norm": 0.18660973012447357, + "learning_rate": 0.0009114129137531991, + "loss": 0.1344, + "num_input_tokens_seen": 72386560, + "step": 33490 + }, + { + "epoch": 5.464110929853181, + "grad_norm": 0.04909445717930794, + "learning_rate": 0.000911372458544141, + "loss": 0.2111, + "num_input_tokens_seen": 72396960, + "step": 33495 + }, + { + "epoch": 5.464926590538336, + "grad_norm": 0.03188806772232056, + "learning_rate": 0.0009113319949980983, + "loss": 0.0183, + "num_input_tokens_seen": 72407104, + "step": 33500 + }, + { + "epoch": 5.465742251223491, + "grad_norm": 0.020453909412026405, + "learning_rate": 0.0009112915231158907, + "loss": 0.0981, + "num_input_tokens_seen": 72418432, + "step": 33505 + }, + { + "epoch": 5.466557911908646, + "grad_norm": 0.2022310048341751, + "learning_rate": 0.0009112510428983387, + "loss": 0.0943, + "num_input_tokens_seen": 72429824, + "step": 33510 + }, + { + "epoch": 5.467373572593801, + "grad_norm": 0.07638852298259735, + "learning_rate": 0.0009112105543462628, + "loss": 0.0174, + "num_input_tokens_seen": 72440352, + "step": 33515 + }, + { + "epoch": 5.468189233278956, + "grad_norm": 0.13178791105747223, + "learning_rate": 0.0009111700574604831, + "loss": 0.0705, + "num_input_tokens_seen": 72452000, + "step": 33520 + }, + { + "epoch": 5.469004893964111, + "grad_norm": 0.14010080695152283, + "learning_rate": 0.0009111295522418207, + "loss": 0.0609, + "num_input_tokens_seen": 72462240, + "step": 33525 + }, + { + "epoch": 5.4698205546492655, + "grad_norm": 0.057958196848630905, + "learning_rate": 0.0009110890386910964, + "loss": 0.0756, + "num_input_tokens_seen": 72474656, + "step": 33530 + }, + { + "epoch": 5.470636215334421, + "grad_norm": 0.0577903650701046, + "learning_rate": 0.0009110485168091311, + "loss": 0.0432, + "num_input_tokens_seen": 72485568, + "step": 33535 + }, + { + "epoch": 5.471451876019576, + "grad_norm": 0.01845645345747471, + "learning_rate": 0.0009110079865967462, + "loss": 0.1357, + "num_input_tokens_seen": 72496192, + "step": 33540 + }, + { + "epoch": 5.472267536704731, + "grad_norm": 0.26865318417549133, + "learning_rate": 0.0009109674480547632, + "loss": 0.2558, + "num_input_tokens_seen": 72506176, + "step": 33545 + }, + { + "epoch": 5.473083197389886, + "grad_norm": 0.027263272553682327, + "learning_rate": 0.0009109269011840033, + "loss": 0.1013, + "num_input_tokens_seen": 72517216, + "step": 33550 + }, + { + "epoch": 5.4738988580750405, + "grad_norm": 0.13702431321144104, + "learning_rate": 0.0009108863459852886, + "loss": 0.1482, + "num_input_tokens_seen": 72528992, + "step": 33555 + }, + { + "epoch": 5.474714518760196, + "grad_norm": 0.033558014780282974, + "learning_rate": 0.0009108457824594407, + "loss": 0.3343, + "num_input_tokens_seen": 72539968, + "step": 33560 + }, + { + "epoch": 5.475530179445351, + "grad_norm": 0.15126581490039825, + "learning_rate": 0.0009108052106072819, + "loss": 0.0679, + "num_input_tokens_seen": 72551488, + "step": 33565 + }, + { + "epoch": 5.476345840130506, + "grad_norm": 0.03495920076966286, + "learning_rate": 0.0009107646304296344, + "loss": 0.1196, + "num_input_tokens_seen": 72561984, + "step": 33570 + }, + { + "epoch": 5.477161500815661, + "grad_norm": 0.034811608493328094, + "learning_rate": 0.0009107240419273206, + "loss": 0.019, + "num_input_tokens_seen": 72572704, + "step": 33575 + }, + { + "epoch": 5.4779771615008155, + "grad_norm": 0.3282313942909241, + "learning_rate": 0.000910683445101163, + "loss": 0.245, + "num_input_tokens_seen": 72584096, + "step": 33580 + }, + { + "epoch": 5.47879282218597, + "grad_norm": 0.008367033675312996, + "learning_rate": 0.0009106428399519844, + "loss": 0.1595, + "num_input_tokens_seen": 72594080, + "step": 33585 + }, + { + "epoch": 5.479608482871126, + "grad_norm": 0.14254389703273773, + "learning_rate": 0.0009106022264806078, + "loss": 0.1269, + "num_input_tokens_seen": 72605504, + "step": 33590 + }, + { + "epoch": 5.480424143556281, + "grad_norm": 0.03509574383497238, + "learning_rate": 0.000910561604687856, + "loss": 0.0335, + "num_input_tokens_seen": 72616736, + "step": 33595 + }, + { + "epoch": 5.481239804241436, + "grad_norm": 0.04512341693043709, + "learning_rate": 0.0009105209745745526, + "loss": 0.2051, + "num_input_tokens_seen": 72627136, + "step": 33600 + }, + { + "epoch": 5.4820554649265905, + "grad_norm": 0.06479254364967346, + "learning_rate": 0.0009104803361415208, + "loss": 0.11, + "num_input_tokens_seen": 72637408, + "step": 33605 + }, + { + "epoch": 5.482871125611745, + "grad_norm": 0.17471860349178314, + "learning_rate": 0.0009104396893895843, + "loss": 0.1633, + "num_input_tokens_seen": 72648000, + "step": 33610 + }, + { + "epoch": 5.4836867862969, + "grad_norm": 0.049819447100162506, + "learning_rate": 0.0009103990343195667, + "loss": 0.0398, + "num_input_tokens_seen": 72657888, + "step": 33615 + }, + { + "epoch": 5.484502446982056, + "grad_norm": 0.13450585305690765, + "learning_rate": 0.0009103583709322923, + "loss": 0.085, + "num_input_tokens_seen": 72667904, + "step": 33620 + }, + { + "epoch": 5.485318107667211, + "grad_norm": 0.058071158826351166, + "learning_rate": 0.0009103176992285847, + "loss": 0.0583, + "num_input_tokens_seen": 72679968, + "step": 33625 + }, + { + "epoch": 5.486133768352365, + "grad_norm": 0.09091849625110626, + "learning_rate": 0.0009102770192092684, + "loss": 0.0348, + "num_input_tokens_seen": 72691392, + "step": 33630 + }, + { + "epoch": 5.48694942903752, + "grad_norm": 0.08295218646526337, + "learning_rate": 0.000910236330875168, + "loss": 0.0648, + "num_input_tokens_seen": 72700672, + "step": 33635 + }, + { + "epoch": 5.487765089722675, + "grad_norm": 0.22870251536369324, + "learning_rate": 0.0009101956342271078, + "loss": 0.1369, + "num_input_tokens_seen": 72712128, + "step": 33640 + }, + { + "epoch": 5.488580750407831, + "grad_norm": 0.014779879711568356, + "learning_rate": 0.0009101549292659128, + "loss": 0.1379, + "num_input_tokens_seen": 72722016, + "step": 33645 + }, + { + "epoch": 5.489396411092986, + "grad_norm": 0.1314256340265274, + "learning_rate": 0.0009101142159924077, + "loss": 0.0734, + "num_input_tokens_seen": 72732704, + "step": 33650 + }, + { + "epoch": 5.49021207177814, + "grad_norm": 0.12707646191120148, + "learning_rate": 0.0009100734944074179, + "loss": 0.0638, + "num_input_tokens_seen": 72744256, + "step": 33655 + }, + { + "epoch": 5.491027732463295, + "grad_norm": 0.03855331614613533, + "learning_rate": 0.0009100327645117684, + "loss": 0.0669, + "num_input_tokens_seen": 72754336, + "step": 33660 + }, + { + "epoch": 5.49184339314845, + "grad_norm": 0.02805102989077568, + "learning_rate": 0.0009099920263062848, + "loss": 0.1077, + "num_input_tokens_seen": 72765536, + "step": 33665 + }, + { + "epoch": 5.492659053833605, + "grad_norm": 0.03538591042160988, + "learning_rate": 0.0009099512797917927, + "loss": 0.019, + "num_input_tokens_seen": 72776160, + "step": 33670 + }, + { + "epoch": 5.493474714518761, + "grad_norm": 0.22538243234157562, + "learning_rate": 0.0009099105249691179, + "loss": 0.0887, + "num_input_tokens_seen": 72786368, + "step": 33675 + }, + { + "epoch": 5.494290375203915, + "grad_norm": 0.01177581213414669, + "learning_rate": 0.0009098697618390862, + "loss": 0.0581, + "num_input_tokens_seen": 72797152, + "step": 33680 + }, + { + "epoch": 5.49510603588907, + "grad_norm": 0.2585497796535492, + "learning_rate": 0.0009098289904025239, + "loss": 0.1149, + "num_input_tokens_seen": 72807552, + "step": 33685 + }, + { + "epoch": 5.495921696574225, + "grad_norm": 0.07408113032579422, + "learning_rate": 0.0009097882106602571, + "loss": 0.1972, + "num_input_tokens_seen": 72817888, + "step": 33690 + }, + { + "epoch": 5.49673735725938, + "grad_norm": 0.02731460891664028, + "learning_rate": 0.0009097474226131124, + "loss": 0.0196, + "num_input_tokens_seen": 72829792, + "step": 33695 + }, + { + "epoch": 5.497553017944535, + "grad_norm": 0.29231026768684387, + "learning_rate": 0.0009097066262619165, + "loss": 0.084, + "num_input_tokens_seen": 72839360, + "step": 33700 + }, + { + "epoch": 5.49836867862969, + "grad_norm": 0.2278498113155365, + "learning_rate": 0.000909665821607496, + "loss": 0.2324, + "num_input_tokens_seen": 72849376, + "step": 33705 + }, + { + "epoch": 5.499184339314845, + "grad_norm": 0.18931224942207336, + "learning_rate": 0.0009096250086506779, + "loss": 0.2018, + "num_input_tokens_seen": 72859872, + "step": 33710 + }, + { + "epoch": 5.5, + "grad_norm": 0.14162462949752808, + "learning_rate": 0.0009095841873922894, + "loss": 0.0834, + "num_input_tokens_seen": 72871392, + "step": 33715 + }, + { + "epoch": 5.500815660685155, + "grad_norm": 0.036499932408332825, + "learning_rate": 0.0009095433578331576, + "loss": 0.1281, + "num_input_tokens_seen": 72882176, + "step": 33720 + }, + { + "epoch": 5.50163132137031, + "grad_norm": 0.2484671026468277, + "learning_rate": 0.0009095025199741103, + "loss": 0.0841, + "num_input_tokens_seen": 72893024, + "step": 33725 + }, + { + "epoch": 5.502446982055465, + "grad_norm": 0.1913287341594696, + "learning_rate": 0.0009094616738159748, + "loss": 0.0924, + "num_input_tokens_seen": 72904512, + "step": 33730 + }, + { + "epoch": 5.50326264274062, + "grad_norm": 0.049790579825639725, + "learning_rate": 0.000909420819359579, + "loss": 0.0527, + "num_input_tokens_seen": 72914848, + "step": 33735 + }, + { + "epoch": 5.504078303425775, + "grad_norm": 0.19436365365982056, + "learning_rate": 0.000909379956605751, + "loss": 0.0706, + "num_input_tokens_seen": 72924832, + "step": 33740 + }, + { + "epoch": 5.50489396411093, + "grad_norm": 0.010649348609149456, + "learning_rate": 0.000909339085555319, + "loss": 0.2315, + "num_input_tokens_seen": 72935936, + "step": 33745 + }, + { + "epoch": 5.505709624796085, + "grad_norm": 0.18552739918231964, + "learning_rate": 0.0009092982062091109, + "loss": 0.1633, + "num_input_tokens_seen": 72946400, + "step": 33750 + }, + { + "epoch": 5.506525285481239, + "grad_norm": 0.2872273921966553, + "learning_rate": 0.0009092573185679556, + "loss": 0.0368, + "num_input_tokens_seen": 72957536, + "step": 33755 + }, + { + "epoch": 5.507340946166395, + "grad_norm": 0.032905105501413345, + "learning_rate": 0.0009092164226326814, + "loss": 0.0495, + "num_input_tokens_seen": 72969216, + "step": 33760 + }, + { + "epoch": 5.50815660685155, + "grad_norm": 0.07028263807296753, + "learning_rate": 0.0009091755184041173, + "loss": 0.1416, + "num_input_tokens_seen": 72980096, + "step": 33765 + }, + { + "epoch": 5.508972267536705, + "grad_norm": 0.20369423925876617, + "learning_rate": 0.0009091346058830923, + "loss": 0.0918, + "num_input_tokens_seen": 72991808, + "step": 33770 + }, + { + "epoch": 5.50978792822186, + "grad_norm": 0.16369512677192688, + "learning_rate": 0.0009090936850704354, + "loss": 0.0923, + "num_input_tokens_seen": 73002304, + "step": 33775 + }, + { + "epoch": 5.510603588907014, + "grad_norm": 0.26133355498313904, + "learning_rate": 0.0009090527559669761, + "loss": 0.1211, + "num_input_tokens_seen": 73013120, + "step": 33780 + }, + { + "epoch": 5.511419249592169, + "grad_norm": 0.15874671936035156, + "learning_rate": 0.0009090118185735438, + "loss": 0.1379, + "num_input_tokens_seen": 73023392, + "step": 33785 + }, + { + "epoch": 5.512234910277325, + "grad_norm": 0.17313063144683838, + "learning_rate": 0.000908970872890968, + "loss": 0.0637, + "num_input_tokens_seen": 73034688, + "step": 33790 + }, + { + "epoch": 5.51305057096248, + "grad_norm": 0.018427910283207893, + "learning_rate": 0.0009089299189200789, + "loss": 0.1245, + "num_input_tokens_seen": 73045600, + "step": 33795 + }, + { + "epoch": 5.513866231647635, + "grad_norm": 0.004695965442806482, + "learning_rate": 0.000908888956661706, + "loss": 0.2346, + "num_input_tokens_seen": 73056224, + "step": 33800 + }, + { + "epoch": 5.514681892332789, + "grad_norm": 0.06857309490442276, + "learning_rate": 0.0009088479861166797, + "loss": 0.2559, + "num_input_tokens_seen": 73067552, + "step": 33805 + }, + { + "epoch": 5.515497553017944, + "grad_norm": 0.2367842048406601, + "learning_rate": 0.0009088070072858303, + "loss": 0.1591, + "num_input_tokens_seen": 73077408, + "step": 33810 + }, + { + "epoch": 5.5163132137031, + "grad_norm": 0.009032535366714, + "learning_rate": 0.0009087660201699884, + "loss": 0.1355, + "num_input_tokens_seen": 73087936, + "step": 33815 + }, + { + "epoch": 5.517128874388255, + "grad_norm": 0.20530152320861816, + "learning_rate": 0.0009087250247699846, + "loss": 0.0991, + "num_input_tokens_seen": 73098912, + "step": 33820 + }, + { + "epoch": 5.5179445350734095, + "grad_norm": 0.024618202820420265, + "learning_rate": 0.0009086840210866493, + "loss": 0.0662, + "num_input_tokens_seen": 73109920, + "step": 33825 + }, + { + "epoch": 5.518760195758564, + "grad_norm": 0.018427925184369087, + "learning_rate": 0.0009086430091208142, + "loss": 0.0556, + "num_input_tokens_seen": 73120704, + "step": 33830 + }, + { + "epoch": 5.519575856443719, + "grad_norm": 0.029816431924700737, + "learning_rate": 0.00090860198887331, + "loss": 0.0435, + "num_input_tokens_seen": 73132160, + "step": 33835 + }, + { + "epoch": 5.520391517128875, + "grad_norm": 0.18719585239887238, + "learning_rate": 0.0009085609603449683, + "loss": 0.0573, + "num_input_tokens_seen": 73141600, + "step": 33840 + }, + { + "epoch": 5.52120717781403, + "grad_norm": 0.026867792010307312, + "learning_rate": 0.0009085199235366201, + "loss": 0.0416, + "num_input_tokens_seen": 73152064, + "step": 33845 + }, + { + "epoch": 5.5220228384991845, + "grad_norm": 0.14632810652256012, + "learning_rate": 0.0009084788784490977, + "loss": 0.0572, + "num_input_tokens_seen": 73163200, + "step": 33850 + }, + { + "epoch": 5.522838499184339, + "grad_norm": 0.027970803901553154, + "learning_rate": 0.0009084378250832325, + "loss": 0.0811, + "num_input_tokens_seen": 73172704, + "step": 33855 + }, + { + "epoch": 5.523654159869494, + "grad_norm": 0.08090617507696152, + "learning_rate": 0.0009083967634398567, + "loss": 0.1068, + "num_input_tokens_seen": 73182368, + "step": 33860 + }, + { + "epoch": 5.524469820554649, + "grad_norm": 0.15196193754673004, + "learning_rate": 0.0009083556935198024, + "loss": 0.1154, + "num_input_tokens_seen": 73193504, + "step": 33865 + }, + { + "epoch": 5.525285481239804, + "grad_norm": 0.02766106277704239, + "learning_rate": 0.0009083146153239019, + "loss": 0.046, + "num_input_tokens_seen": 73203136, + "step": 33870 + }, + { + "epoch": 5.5261011419249595, + "grad_norm": 0.013581224717199802, + "learning_rate": 0.0009082735288529878, + "loss": 0.1066, + "num_input_tokens_seen": 73214496, + "step": 33875 + }, + { + "epoch": 5.526916802610114, + "grad_norm": 0.007133916020393372, + "learning_rate": 0.0009082324341078927, + "loss": 0.1363, + "num_input_tokens_seen": 73225248, + "step": 33880 + }, + { + "epoch": 5.527732463295269, + "grad_norm": 0.018216347321867943, + "learning_rate": 0.0009081913310894494, + "loss": 0.0638, + "num_input_tokens_seen": 73236864, + "step": 33885 + }, + { + "epoch": 5.528548123980424, + "grad_norm": 0.05228884145617485, + "learning_rate": 0.000908150219798491, + "loss": 0.0864, + "num_input_tokens_seen": 73246880, + "step": 33890 + }, + { + "epoch": 5.529363784665579, + "grad_norm": 0.09811168909072876, + "learning_rate": 0.0009081091002358506, + "loss": 0.0786, + "num_input_tokens_seen": 73256192, + "step": 33895 + }, + { + "epoch": 5.5301794453507345, + "grad_norm": 0.007979627698659897, + "learning_rate": 0.0009080679724023615, + "loss": 0.0675, + "num_input_tokens_seen": 73267072, + "step": 33900 + }, + { + "epoch": 5.530995106035889, + "grad_norm": 0.04914901778101921, + "learning_rate": 0.0009080268362988572, + "loss": 0.0795, + "num_input_tokens_seen": 73278208, + "step": 33905 + }, + { + "epoch": 5.531810766721044, + "grad_norm": 0.34442588686943054, + "learning_rate": 0.0009079856919261716, + "loss": 0.1373, + "num_input_tokens_seen": 73288800, + "step": 33910 + }, + { + "epoch": 5.532626427406199, + "grad_norm": 0.02547566592693329, + "learning_rate": 0.0009079445392851383, + "loss": 0.0475, + "num_input_tokens_seen": 73300064, + "step": 33915 + }, + { + "epoch": 5.533442088091354, + "grad_norm": 0.07199371606111526, + "learning_rate": 0.0009079033783765914, + "loss": 0.0693, + "num_input_tokens_seen": 73311264, + "step": 33920 + }, + { + "epoch": 5.5342577487765094, + "grad_norm": 0.10011252015829086, + "learning_rate": 0.0009078622092013651, + "loss": 0.1076, + "num_input_tokens_seen": 73322304, + "step": 33925 + }, + { + "epoch": 5.535073409461664, + "grad_norm": 0.008656290359795094, + "learning_rate": 0.0009078210317602938, + "loss": 0.0772, + "num_input_tokens_seen": 73333760, + "step": 33930 + }, + { + "epoch": 5.535889070146819, + "grad_norm": 0.11770905554294586, + "learning_rate": 0.0009077798460542119, + "loss": 0.2849, + "num_input_tokens_seen": 73343744, + "step": 33935 + }, + { + "epoch": 5.536704730831974, + "grad_norm": 0.03572523966431618, + "learning_rate": 0.0009077386520839541, + "loss": 0.1529, + "num_input_tokens_seen": 73353824, + "step": 33940 + }, + { + "epoch": 5.537520391517129, + "grad_norm": 0.05641023814678192, + "learning_rate": 0.0009076974498503552, + "loss": 0.1559, + "num_input_tokens_seen": 73363968, + "step": 33945 + }, + { + "epoch": 5.5383360522022835, + "grad_norm": 0.09805291891098022, + "learning_rate": 0.0009076562393542502, + "loss": 0.0444, + "num_input_tokens_seen": 73375680, + "step": 33950 + }, + { + "epoch": 5.539151712887438, + "grad_norm": 0.05173416808247566, + "learning_rate": 0.0009076150205964746, + "loss": 0.0864, + "num_input_tokens_seen": 73386912, + "step": 33955 + }, + { + "epoch": 5.539967373572594, + "grad_norm": 0.14410558342933655, + "learning_rate": 0.0009075737935778634, + "loss": 0.0858, + "num_input_tokens_seen": 73399648, + "step": 33960 + }, + { + "epoch": 5.540783034257749, + "grad_norm": 0.1506105214357376, + "learning_rate": 0.0009075325582992522, + "loss": 0.0496, + "num_input_tokens_seen": 73410944, + "step": 33965 + }, + { + "epoch": 5.541598694942904, + "grad_norm": 0.030392751097679138, + "learning_rate": 0.0009074913147614767, + "loss": 0.042, + "num_input_tokens_seen": 73422016, + "step": 33970 + }, + { + "epoch": 5.5424143556280585, + "grad_norm": 0.020440472289919853, + "learning_rate": 0.0009074500629653728, + "loss": 0.1176, + "num_input_tokens_seen": 73433120, + "step": 33975 + }, + { + "epoch": 5.543230016313213, + "grad_norm": 0.02089664526283741, + "learning_rate": 0.0009074088029117764, + "loss": 0.0972, + "num_input_tokens_seen": 73443456, + "step": 33980 + }, + { + "epoch": 5.544045676998369, + "grad_norm": 0.3009980022907257, + "learning_rate": 0.0009073675346015239, + "loss": 0.1754, + "num_input_tokens_seen": 73453632, + "step": 33985 + }, + { + "epoch": 5.544861337683524, + "grad_norm": 0.18772025406360626, + "learning_rate": 0.0009073262580354516, + "loss": 0.2225, + "num_input_tokens_seen": 73464000, + "step": 33990 + }, + { + "epoch": 5.545676998368679, + "grad_norm": 0.02644345909357071, + "learning_rate": 0.0009072849732143957, + "loss": 0.1702, + "num_input_tokens_seen": 73475328, + "step": 33995 + }, + { + "epoch": 5.5464926590538335, + "grad_norm": 0.013715567998588085, + "learning_rate": 0.0009072436801391932, + "loss": 0.1263, + "num_input_tokens_seen": 73486464, + "step": 34000 + }, + { + "epoch": 5.547308319738988, + "grad_norm": 0.15529221296310425, + "learning_rate": 0.0009072023788106811, + "loss": 0.1647, + "num_input_tokens_seen": 73497440, + "step": 34005 + }, + { + "epoch": 5.548123980424144, + "grad_norm": 0.2070222645998001, + "learning_rate": 0.0009071610692296961, + "loss": 0.1927, + "num_input_tokens_seen": 73508704, + "step": 34010 + }, + { + "epoch": 5.548939641109299, + "grad_norm": 0.18096806108951569, + "learning_rate": 0.0009071197513970755, + "loss": 0.0852, + "num_input_tokens_seen": 73519264, + "step": 34015 + }, + { + "epoch": 5.549755301794454, + "grad_norm": 0.0723312720656395, + "learning_rate": 0.0009070784253136565, + "loss": 0.0437, + "num_input_tokens_seen": 73529088, + "step": 34020 + }, + { + "epoch": 5.5505709624796085, + "grad_norm": 0.02088317647576332, + "learning_rate": 0.0009070370909802772, + "loss": 0.1904, + "num_input_tokens_seen": 73540800, + "step": 34025 + }, + { + "epoch": 5.551386623164763, + "grad_norm": 0.020408082753419876, + "learning_rate": 0.0009069957483977747, + "loss": 0.0953, + "num_input_tokens_seen": 73550656, + "step": 34030 + }, + { + "epoch": 5.552202283849918, + "grad_norm": 0.050628501921892166, + "learning_rate": 0.0009069543975669869, + "loss": 0.0705, + "num_input_tokens_seen": 73561824, + "step": 34035 + }, + { + "epoch": 5.553017944535073, + "grad_norm": 0.17295457422733307, + "learning_rate": 0.0009069130384887521, + "loss": 0.1914, + "num_input_tokens_seen": 73572544, + "step": 34040 + }, + { + "epoch": 5.553833605220229, + "grad_norm": 0.018812965601682663, + "learning_rate": 0.0009068716711639084, + "loss": 0.0508, + "num_input_tokens_seen": 73583136, + "step": 34045 + }, + { + "epoch": 5.554649265905383, + "grad_norm": 0.21679341793060303, + "learning_rate": 0.0009068302955932939, + "loss": 0.0733, + "num_input_tokens_seen": 73594528, + "step": 34050 + }, + { + "epoch": 5.555464926590538, + "grad_norm": 0.06878527998924255, + "learning_rate": 0.0009067889117777477, + "loss": 0.0795, + "num_input_tokens_seen": 73606272, + "step": 34055 + }, + { + "epoch": 5.556280587275693, + "grad_norm": 0.20378956198692322, + "learning_rate": 0.000906747519718108, + "loss": 0.1039, + "num_input_tokens_seen": 73616384, + "step": 34060 + }, + { + "epoch": 5.557096247960848, + "grad_norm": 0.2941659390926361, + "learning_rate": 0.0009067061194152138, + "loss": 0.0564, + "num_input_tokens_seen": 73626624, + "step": 34065 + }, + { + "epoch": 5.557911908646004, + "grad_norm": 0.022131171077489853, + "learning_rate": 0.0009066647108699041, + "loss": 0.0767, + "num_input_tokens_seen": 73637536, + "step": 34070 + }, + { + "epoch": 5.558727569331158, + "grad_norm": 0.21184739470481873, + "learning_rate": 0.0009066232940830182, + "loss": 0.1311, + "num_input_tokens_seen": 73647744, + "step": 34075 + }, + { + "epoch": 5.559543230016313, + "grad_norm": 0.020756877958774567, + "learning_rate": 0.0009065818690553955, + "loss": 0.3321, + "num_input_tokens_seen": 73657952, + "step": 34080 + }, + { + "epoch": 5.560358890701468, + "grad_norm": 0.0600406639277935, + "learning_rate": 0.0009065404357878752, + "loss": 0.1166, + "num_input_tokens_seen": 73669376, + "step": 34085 + }, + { + "epoch": 5.561174551386623, + "grad_norm": 0.020772617310285568, + "learning_rate": 0.0009064989942812974, + "loss": 0.106, + "num_input_tokens_seen": 73679616, + "step": 34090 + }, + { + "epoch": 5.561990212071779, + "grad_norm": 0.12317997217178345, + "learning_rate": 0.0009064575445365019, + "loss": 0.0546, + "num_input_tokens_seen": 73690816, + "step": 34095 + }, + { + "epoch": 5.562805872756933, + "grad_norm": 0.057949621230363846, + "learning_rate": 0.0009064160865543285, + "loss": 0.1079, + "num_input_tokens_seen": 73701792, + "step": 34100 + }, + { + "epoch": 5.563621533442088, + "grad_norm": 0.08427233248949051, + "learning_rate": 0.0009063746203356176, + "loss": 0.0214, + "num_input_tokens_seen": 73712288, + "step": 34105 + }, + { + "epoch": 5.564437194127243, + "grad_norm": 0.12674139440059662, + "learning_rate": 0.0009063331458812094, + "loss": 0.0537, + "num_input_tokens_seen": 73721472, + "step": 34110 + }, + { + "epoch": 5.565252854812398, + "grad_norm": 0.13015665113925934, + "learning_rate": 0.0009062916631919445, + "loss": 0.2981, + "num_input_tokens_seen": 73731744, + "step": 34115 + }, + { + "epoch": 5.566068515497553, + "grad_norm": 0.016723886132240295, + "learning_rate": 0.0009062501722686638, + "loss": 0.0735, + "num_input_tokens_seen": 73743200, + "step": 34120 + }, + { + "epoch": 5.566884176182708, + "grad_norm": 0.2247808575630188, + "learning_rate": 0.0009062086731122079, + "loss": 0.231, + "num_input_tokens_seen": 73754432, + "step": 34125 + }, + { + "epoch": 5.567699836867863, + "grad_norm": 0.05223708599805832, + "learning_rate": 0.0009061671657234179, + "loss": 0.2016, + "num_input_tokens_seen": 73766432, + "step": 34130 + }, + { + "epoch": 5.568515497553018, + "grad_norm": 0.07750531286001205, + "learning_rate": 0.000906125650103135, + "loss": 0.126, + "num_input_tokens_seen": 73776512, + "step": 34135 + }, + { + "epoch": 5.569331158238173, + "grad_norm": 0.07240372151136398, + "learning_rate": 0.0009060841262522006, + "loss": 0.1414, + "num_input_tokens_seen": 73786688, + "step": 34140 + }, + { + "epoch": 5.570146818923328, + "grad_norm": 0.07198721915483475, + "learning_rate": 0.0009060425941714563, + "loss": 0.0999, + "num_input_tokens_seen": 73798016, + "step": 34145 + }, + { + "epoch": 5.5709624796084825, + "grad_norm": 0.027029162272810936, + "learning_rate": 0.0009060010538617437, + "loss": 0.0319, + "num_input_tokens_seen": 73808992, + "step": 34150 + }, + { + "epoch": 5.571778140293638, + "grad_norm": 0.12103355675935745, + "learning_rate": 0.0009059595053239047, + "loss": 0.1308, + "num_input_tokens_seen": 73819520, + "step": 34155 + }, + { + "epoch": 5.572593800978793, + "grad_norm": 0.067531056702137, + "learning_rate": 0.0009059179485587813, + "loss": 0.1778, + "num_input_tokens_seen": 73831712, + "step": 34160 + }, + { + "epoch": 5.573409461663948, + "grad_norm": 0.07771392166614532, + "learning_rate": 0.0009058763835672157, + "loss": 0.1988, + "num_input_tokens_seen": 73842464, + "step": 34165 + }, + { + "epoch": 5.574225122349103, + "grad_norm": 0.04161020740866661, + "learning_rate": 0.0009058348103500504, + "loss": 0.0862, + "num_input_tokens_seen": 73853088, + "step": 34170 + }, + { + "epoch": 5.575040783034257, + "grad_norm": 0.02227962017059326, + "learning_rate": 0.0009057932289081278, + "loss": 0.0497, + "num_input_tokens_seen": 73864512, + "step": 34175 + }, + { + "epoch": 5.575856443719413, + "grad_norm": 0.02278684638440609, + "learning_rate": 0.0009057516392422906, + "loss": 0.0666, + "num_input_tokens_seen": 73875456, + "step": 34180 + }, + { + "epoch": 5.576672104404568, + "grad_norm": 0.0633942186832428, + "learning_rate": 0.0009057100413533817, + "loss": 0.1228, + "num_input_tokens_seen": 73886048, + "step": 34185 + }, + { + "epoch": 5.577487765089723, + "grad_norm": 0.012222129851579666, + "learning_rate": 0.0009056684352422441, + "loss": 0.0677, + "num_input_tokens_seen": 73898016, + "step": 34190 + }, + { + "epoch": 5.578303425774878, + "grad_norm": 0.024317584931850433, + "learning_rate": 0.0009056268209097211, + "loss": 0.1371, + "num_input_tokens_seen": 73909600, + "step": 34195 + }, + { + "epoch": 5.579119086460032, + "grad_norm": 0.02568567730486393, + "learning_rate": 0.000905585198356656, + "loss": 0.0815, + "num_input_tokens_seen": 73920128, + "step": 34200 + }, + { + "epoch": 5.579934747145187, + "grad_norm": 0.06447894871234894, + "learning_rate": 0.0009055435675838923, + "loss": 0.0389, + "num_input_tokens_seen": 73931456, + "step": 34205 + }, + { + "epoch": 5.580750407830343, + "grad_norm": 0.020777931436896324, + "learning_rate": 0.0009055019285922737, + "loss": 0.1695, + "num_input_tokens_seen": 73942464, + "step": 34210 + }, + { + "epoch": 5.581566068515498, + "grad_norm": 0.09878566116094589, + "learning_rate": 0.0009054602813826441, + "loss": 0.0635, + "num_input_tokens_seen": 73953120, + "step": 34215 + }, + { + "epoch": 5.582381729200653, + "grad_norm": 0.0664665699005127, + "learning_rate": 0.0009054186259558477, + "loss": 0.0914, + "num_input_tokens_seen": 73963104, + "step": 34220 + }, + { + "epoch": 5.583197389885807, + "grad_norm": 0.01421379018574953, + "learning_rate": 0.0009053769623127284, + "loss": 0.0335, + "num_input_tokens_seen": 73974240, + "step": 34225 + }, + { + "epoch": 5.584013050570962, + "grad_norm": 0.1316417157649994, + "learning_rate": 0.0009053352904541306, + "loss": 0.1655, + "num_input_tokens_seen": 73984864, + "step": 34230 + }, + { + "epoch": 5.584828711256117, + "grad_norm": 0.0478694885969162, + "learning_rate": 0.0009052936103808991, + "loss": 0.0957, + "num_input_tokens_seen": 73995936, + "step": 34235 + }, + { + "epoch": 5.585644371941273, + "grad_norm": 0.04798683524131775, + "learning_rate": 0.0009052519220938784, + "loss": 0.043, + "num_input_tokens_seen": 74007008, + "step": 34240 + }, + { + "epoch": 5.5864600326264275, + "grad_norm": 0.022165268659591675, + "learning_rate": 0.0009052102255939134, + "loss": 0.0668, + "num_input_tokens_seen": 74017664, + "step": 34245 + }, + { + "epoch": 5.587275693311582, + "grad_norm": 0.1620720475912094, + "learning_rate": 0.000905168520881849, + "loss": 0.0979, + "num_input_tokens_seen": 74028928, + "step": 34250 + }, + { + "epoch": 5.588091353996737, + "grad_norm": 0.18612481653690338, + "learning_rate": 0.0009051268079585306, + "loss": 0.2239, + "num_input_tokens_seen": 74041088, + "step": 34255 + }, + { + "epoch": 5.588907014681892, + "grad_norm": 0.015167465433478355, + "learning_rate": 0.0009050850868248037, + "loss": 0.3041, + "num_input_tokens_seen": 74052896, + "step": 34260 + }, + { + "epoch": 5.589722675367048, + "grad_norm": 0.05257358402013779, + "learning_rate": 0.0009050433574815134, + "loss": 0.0485, + "num_input_tokens_seen": 74063872, + "step": 34265 + }, + { + "epoch": 5.5905383360522025, + "grad_norm": 0.07323313504457474, + "learning_rate": 0.0009050016199295057, + "loss": 0.2594, + "num_input_tokens_seen": 74073856, + "step": 34270 + }, + { + "epoch": 5.591353996737357, + "grad_norm": 0.01376113761216402, + "learning_rate": 0.0009049598741696263, + "loss": 0.0437, + "num_input_tokens_seen": 74085472, + "step": 34275 + }, + { + "epoch": 5.592169657422512, + "grad_norm": 0.02173309214413166, + "learning_rate": 0.0009049181202027215, + "loss": 0.0443, + "num_input_tokens_seen": 74094720, + "step": 34280 + }, + { + "epoch": 5.592985318107667, + "grad_norm": 0.14551734924316406, + "learning_rate": 0.0009048763580296373, + "loss": 0.173, + "num_input_tokens_seen": 74105888, + "step": 34285 + }, + { + "epoch": 5.593800978792823, + "grad_norm": 0.18056020140647888, + "learning_rate": 0.00090483458765122, + "loss": 0.0877, + "num_input_tokens_seen": 74115968, + "step": 34290 + }, + { + "epoch": 5.5946166394779775, + "grad_norm": 0.21234531700611115, + "learning_rate": 0.0009047928090683162, + "loss": 0.1543, + "num_input_tokens_seen": 74126880, + "step": 34295 + }, + { + "epoch": 5.595432300163132, + "grad_norm": 0.020504070445895195, + "learning_rate": 0.0009047510222817725, + "loss": 0.1306, + "num_input_tokens_seen": 74137952, + "step": 34300 + }, + { + "epoch": 5.596247960848287, + "grad_norm": 0.2272954136133194, + "learning_rate": 0.0009047092272924361, + "loss": 0.1549, + "num_input_tokens_seen": 74148640, + "step": 34305 + }, + { + "epoch": 5.597063621533442, + "grad_norm": 0.04594704508781433, + "learning_rate": 0.0009046674241011537, + "loss": 0.0655, + "num_input_tokens_seen": 74159616, + "step": 34310 + }, + { + "epoch": 5.597879282218597, + "grad_norm": 0.18465717136859894, + "learning_rate": 0.0009046256127087727, + "loss": 0.1131, + "num_input_tokens_seen": 74170208, + "step": 34315 + }, + { + "epoch": 5.598694942903752, + "grad_norm": 0.1933651864528656, + "learning_rate": 0.0009045837931161402, + "loss": 0.2482, + "num_input_tokens_seen": 74180992, + "step": 34320 + }, + { + "epoch": 5.599510603588907, + "grad_norm": 0.01716572791337967, + "learning_rate": 0.0009045419653241038, + "loss": 0.0893, + "num_input_tokens_seen": 74191392, + "step": 34325 + }, + { + "epoch": 5.600326264274062, + "grad_norm": 0.103814996778965, + "learning_rate": 0.0009045001293335115, + "loss": 0.0989, + "num_input_tokens_seen": 74202496, + "step": 34330 + }, + { + "epoch": 5.601141924959217, + "grad_norm": 0.1775224357843399, + "learning_rate": 0.0009044582851452107, + "loss": 0.0965, + "num_input_tokens_seen": 74213632, + "step": 34335 + }, + { + "epoch": 5.601957585644372, + "grad_norm": 0.053888604044914246, + "learning_rate": 0.0009044164327600499, + "loss": 0.0887, + "num_input_tokens_seen": 74223392, + "step": 34340 + }, + { + "epoch": 5.602773246329527, + "grad_norm": 0.02769533544778824, + "learning_rate": 0.000904374572178877, + "loss": 0.0727, + "num_input_tokens_seen": 74234368, + "step": 34345 + }, + { + "epoch": 5.603588907014682, + "grad_norm": 0.07350389659404755, + "learning_rate": 0.0009043327034025404, + "loss": 0.0706, + "num_input_tokens_seen": 74245344, + "step": 34350 + }, + { + "epoch": 5.604404567699837, + "grad_norm": 0.015233350917696953, + "learning_rate": 0.0009042908264318885, + "loss": 0.0489, + "num_input_tokens_seen": 74254368, + "step": 34355 + }, + { + "epoch": 5.605220228384992, + "grad_norm": 0.02992558479309082, + "learning_rate": 0.0009042489412677702, + "loss": 0.1246, + "num_input_tokens_seen": 74265344, + "step": 34360 + }, + { + "epoch": 5.606035889070147, + "grad_norm": 0.061724767088890076, + "learning_rate": 0.0009042070479110343, + "loss": 0.1397, + "num_input_tokens_seen": 74275264, + "step": 34365 + }, + { + "epoch": 5.6068515497553015, + "grad_norm": 0.08111986517906189, + "learning_rate": 0.0009041651463625298, + "loss": 0.0385, + "num_input_tokens_seen": 74284608, + "step": 34370 + }, + { + "epoch": 5.607667210440457, + "grad_norm": 0.05526169762015343, + "learning_rate": 0.0009041232366231059, + "loss": 0.1293, + "num_input_tokens_seen": 74296128, + "step": 34375 + }, + { + "epoch": 5.608482871125612, + "grad_norm": 0.21980856359004974, + "learning_rate": 0.0009040813186936119, + "loss": 0.1306, + "num_input_tokens_seen": 74308000, + "step": 34380 + }, + { + "epoch": 5.609298531810767, + "grad_norm": 0.23230993747711182, + "learning_rate": 0.0009040393925748973, + "loss": 0.1128, + "num_input_tokens_seen": 74318240, + "step": 34385 + }, + { + "epoch": 5.610114192495922, + "grad_norm": 0.19267278909683228, + "learning_rate": 0.0009039974582678121, + "loss": 0.1068, + "num_input_tokens_seen": 74329184, + "step": 34390 + }, + { + "epoch": 5.6109298531810765, + "grad_norm": 0.029193982481956482, + "learning_rate": 0.0009039555157732056, + "loss": 0.1378, + "num_input_tokens_seen": 74340000, + "step": 34395 + }, + { + "epoch": 5.611745513866231, + "grad_norm": 0.15133799612522125, + "learning_rate": 0.0009039135650919283, + "loss": 0.0774, + "num_input_tokens_seen": 74350624, + "step": 34400 + }, + { + "epoch": 5.612561174551386, + "grad_norm": 0.161695659160614, + "learning_rate": 0.0009038716062248302, + "loss": 0.0662, + "num_input_tokens_seen": 74360608, + "step": 34405 + }, + { + "epoch": 5.613376835236542, + "grad_norm": 0.05793755128979683, + "learning_rate": 0.0009038296391727616, + "loss": 0.0207, + "num_input_tokens_seen": 74371104, + "step": 34410 + }, + { + "epoch": 5.614192495921697, + "grad_norm": 0.008591441437602043, + "learning_rate": 0.0009037876639365731, + "loss": 0.0934, + "num_input_tokens_seen": 74382400, + "step": 34415 + }, + { + "epoch": 5.6150081566068515, + "grad_norm": 0.14188680052757263, + "learning_rate": 0.0009037456805171154, + "loss": 0.1134, + "num_input_tokens_seen": 74394048, + "step": 34420 + }, + { + "epoch": 5.615823817292006, + "grad_norm": 0.05818561464548111, + "learning_rate": 0.0009037036889152391, + "loss": 0.0622, + "num_input_tokens_seen": 74406016, + "step": 34425 + }, + { + "epoch": 5.616639477977161, + "grad_norm": 0.1742202490568161, + "learning_rate": 0.0009036616891317956, + "loss": 0.0609, + "num_input_tokens_seen": 74417152, + "step": 34430 + }, + { + "epoch": 5.617455138662317, + "grad_norm": 0.13640397787094116, + "learning_rate": 0.0009036196811676358, + "loss": 0.1228, + "num_input_tokens_seen": 74428384, + "step": 34435 + }, + { + "epoch": 5.618270799347472, + "grad_norm": 0.047426458448171616, + "learning_rate": 0.0009035776650236112, + "loss": 0.0624, + "num_input_tokens_seen": 74438880, + "step": 34440 + }, + { + "epoch": 5.6190864600326265, + "grad_norm": 0.14811500906944275, + "learning_rate": 0.0009035356407005732, + "loss": 0.1488, + "num_input_tokens_seen": 74449504, + "step": 34445 + }, + { + "epoch": 5.619902120717781, + "grad_norm": 0.21113741397857666, + "learning_rate": 0.0009034936081993736, + "loss": 0.2132, + "num_input_tokens_seen": 74460032, + "step": 34450 + }, + { + "epoch": 5.620717781402936, + "grad_norm": 0.055134713649749756, + "learning_rate": 0.0009034515675208641, + "loss": 0.0786, + "num_input_tokens_seen": 74471072, + "step": 34455 + }, + { + "epoch": 5.621533442088092, + "grad_norm": 0.29549095034599304, + "learning_rate": 0.0009034095186658966, + "loss": 0.1138, + "num_input_tokens_seen": 74483360, + "step": 34460 + }, + { + "epoch": 5.622349102773247, + "grad_norm": 0.280021607875824, + "learning_rate": 0.0009033674616353236, + "loss": 0.0716, + "num_input_tokens_seen": 74492896, + "step": 34465 + }, + { + "epoch": 5.623164763458401, + "grad_norm": 0.26246124505996704, + "learning_rate": 0.0009033253964299972, + "loss": 0.0924, + "num_input_tokens_seen": 74503200, + "step": 34470 + }, + { + "epoch": 5.623980424143556, + "grad_norm": 0.050797343254089355, + "learning_rate": 0.0009032833230507702, + "loss": 0.2738, + "num_input_tokens_seen": 74514112, + "step": 34475 + }, + { + "epoch": 5.624796084828711, + "grad_norm": 0.0720033347606659, + "learning_rate": 0.000903241241498495, + "loss": 0.2803, + "num_input_tokens_seen": 74525856, + "step": 34480 + }, + { + "epoch": 5.625611745513866, + "grad_norm": 0.19244541227817535, + "learning_rate": 0.0009031991517740244, + "loss": 0.1479, + "num_input_tokens_seen": 74535232, + "step": 34485 + }, + { + "epoch": 5.626427406199021, + "grad_norm": 0.13854020833969116, + "learning_rate": 0.0009031570538782115, + "loss": 0.1443, + "num_input_tokens_seen": 74545216, + "step": 34490 + }, + { + "epoch": 5.627243066884176, + "grad_norm": 0.46975669264793396, + "learning_rate": 0.0009031149478119094, + "loss": 0.1608, + "num_input_tokens_seen": 74556192, + "step": 34495 + }, + { + "epoch": 5.628058727569331, + "grad_norm": 0.1952454298734665, + "learning_rate": 0.0009030728335759716, + "loss": 0.0868, + "num_input_tokens_seen": 74566240, + "step": 34500 + }, + { + "epoch": 5.628874388254486, + "grad_norm": 0.13439679145812988, + "learning_rate": 0.0009030307111712514, + "loss": 0.1536, + "num_input_tokens_seen": 74576768, + "step": 34505 + }, + { + "epoch": 5.629690048939641, + "grad_norm": 0.03982832282781601, + "learning_rate": 0.0009029885805986027, + "loss": 0.1538, + "num_input_tokens_seen": 74587872, + "step": 34510 + }, + { + "epoch": 5.630505709624796, + "grad_norm": 0.24063228070735931, + "learning_rate": 0.0009029464418588791, + "loss": 0.1306, + "num_input_tokens_seen": 74598752, + "step": 34515 + }, + { + "epoch": 5.631321370309951, + "grad_norm": 0.0440259650349617, + "learning_rate": 0.0009029042949529347, + "loss": 0.1057, + "num_input_tokens_seen": 74609856, + "step": 34520 + }, + { + "epoch": 5.632137030995106, + "grad_norm": 0.05447227880358696, + "learning_rate": 0.0009028621398816236, + "loss": 0.1447, + "num_input_tokens_seen": 74620960, + "step": 34525 + }, + { + "epoch": 5.632952691680261, + "grad_norm": 0.018078099936246872, + "learning_rate": 0.0009028199766458002, + "loss": 0.1177, + "num_input_tokens_seen": 74632192, + "step": 34530 + }, + { + "epoch": 5.633768352365416, + "grad_norm": 0.12170816212892532, + "learning_rate": 0.000902777805246319, + "loss": 0.1091, + "num_input_tokens_seen": 74643424, + "step": 34535 + }, + { + "epoch": 5.634584013050571, + "grad_norm": 0.151223286986351, + "learning_rate": 0.0009027356256840345, + "loss": 0.0751, + "num_input_tokens_seen": 74654080, + "step": 34540 + }, + { + "epoch": 5.635399673735726, + "grad_norm": 0.04554088041186333, + "learning_rate": 0.0009026934379598018, + "loss": 0.0829, + "num_input_tokens_seen": 74663840, + "step": 34545 + }, + { + "epoch": 5.636215334420881, + "grad_norm": 0.05020049586892128, + "learning_rate": 0.0009026512420744756, + "loss": 0.0606, + "num_input_tokens_seen": 74674688, + "step": 34550 + }, + { + "epoch": 5.637030995106036, + "grad_norm": 0.09834477305412292, + "learning_rate": 0.0009026090380289111, + "loss": 0.1564, + "num_input_tokens_seen": 74685344, + "step": 34555 + }, + { + "epoch": 5.637846655791191, + "grad_norm": 0.0076295617036521435, + "learning_rate": 0.0009025668258239638, + "loss": 0.1465, + "num_input_tokens_seen": 74697952, + "step": 34560 + }, + { + "epoch": 5.638662316476346, + "grad_norm": 0.012084455229341984, + "learning_rate": 0.0009025246054604892, + "loss": 0.0624, + "num_input_tokens_seen": 74708864, + "step": 34565 + }, + { + "epoch": 5.6394779771615005, + "grad_norm": 0.1278560906648636, + "learning_rate": 0.0009024823769393427, + "loss": 0.2907, + "num_input_tokens_seen": 74719072, + "step": 34570 + }, + { + "epoch": 5.640293637846656, + "grad_norm": 0.0414595901966095, + "learning_rate": 0.0009024401402613803, + "loss": 0.0604, + "num_input_tokens_seen": 74729408, + "step": 34575 + }, + { + "epoch": 5.641109298531811, + "grad_norm": 0.02301446720957756, + "learning_rate": 0.0009023978954274579, + "loss": 0.0541, + "num_input_tokens_seen": 74740064, + "step": 34580 + }, + { + "epoch": 5.641924959216966, + "grad_norm": 0.021538980305194855, + "learning_rate": 0.0009023556424384317, + "loss": 0.0544, + "num_input_tokens_seen": 74750080, + "step": 34585 + }, + { + "epoch": 5.642740619902121, + "grad_norm": 0.06400078535079956, + "learning_rate": 0.0009023133812951581, + "loss": 0.0592, + "num_input_tokens_seen": 74761536, + "step": 34590 + }, + { + "epoch": 5.643556280587275, + "grad_norm": 0.032709237188100815, + "learning_rate": 0.0009022711119984932, + "loss": 0.0412, + "num_input_tokens_seen": 74772128, + "step": 34595 + }, + { + "epoch": 5.64437194127243, + "grad_norm": 0.09720578044652939, + "learning_rate": 0.0009022288345492941, + "loss": 0.1003, + "num_input_tokens_seen": 74783040, + "step": 34600 + }, + { + "epoch": 5.645187601957586, + "grad_norm": 0.3814923167228699, + "learning_rate": 0.0009021865489484173, + "loss": 0.0675, + "num_input_tokens_seen": 74793984, + "step": 34605 + }, + { + "epoch": 5.646003262642741, + "grad_norm": 0.06573888659477234, + "learning_rate": 0.0009021442551967198, + "loss": 0.1281, + "num_input_tokens_seen": 74805088, + "step": 34610 + }, + { + "epoch": 5.646818923327896, + "grad_norm": 0.08702082186937332, + "learning_rate": 0.000902101953295059, + "loss": 0.1954, + "num_input_tokens_seen": 74815520, + "step": 34615 + }, + { + "epoch": 5.64763458401305, + "grad_norm": 0.03930259495973587, + "learning_rate": 0.0009020596432442918, + "loss": 0.049, + "num_input_tokens_seen": 74826368, + "step": 34620 + }, + { + "epoch": 5.648450244698205, + "grad_norm": 0.017254164442420006, + "learning_rate": 0.0009020173250452761, + "loss": 0.1167, + "num_input_tokens_seen": 74836416, + "step": 34625 + }, + { + "epoch": 5.649265905383361, + "grad_norm": 0.0806068629026413, + "learning_rate": 0.0009019749986988692, + "loss": 0.0189, + "num_input_tokens_seen": 74846784, + "step": 34630 + }, + { + "epoch": 5.650081566068516, + "grad_norm": 0.19931930303573608, + "learning_rate": 0.000901932664205929, + "loss": 0.106, + "num_input_tokens_seen": 74857216, + "step": 34635 + }, + { + "epoch": 5.650897226753671, + "grad_norm": 0.06503892689943314, + "learning_rate": 0.0009018903215673135, + "loss": 0.0381, + "num_input_tokens_seen": 74868384, + "step": 34640 + }, + { + "epoch": 5.651712887438825, + "grad_norm": 0.12235700339078903, + "learning_rate": 0.0009018479707838808, + "loss": 0.0673, + "num_input_tokens_seen": 74878592, + "step": 34645 + }, + { + "epoch": 5.65252854812398, + "grad_norm": 0.21434731781482697, + "learning_rate": 0.0009018056118564893, + "loss": 0.0589, + "num_input_tokens_seen": 74888992, + "step": 34650 + }, + { + "epoch": 5.653344208809135, + "grad_norm": 0.19774998724460602, + "learning_rate": 0.0009017632447859971, + "loss": 0.0817, + "num_input_tokens_seen": 74900640, + "step": 34655 + }, + { + "epoch": 5.654159869494291, + "grad_norm": 0.011533130891621113, + "learning_rate": 0.0009017208695732633, + "loss": 0.037, + "num_input_tokens_seen": 74912736, + "step": 34660 + }, + { + "epoch": 5.6549755301794455, + "grad_norm": 0.0663393959403038, + "learning_rate": 0.0009016784862191463, + "loss": 0.0922, + "num_input_tokens_seen": 74923616, + "step": 34665 + }, + { + "epoch": 5.6557911908646, + "grad_norm": 0.028127873316407204, + "learning_rate": 0.0009016360947245053, + "loss": 0.0766, + "num_input_tokens_seen": 74934336, + "step": 34670 + }, + { + "epoch": 5.656606851549755, + "grad_norm": 0.12323799729347229, + "learning_rate": 0.0009015936950901993, + "loss": 0.0224, + "num_input_tokens_seen": 74945952, + "step": 34675 + }, + { + "epoch": 5.65742251223491, + "grad_norm": 0.006789292208850384, + "learning_rate": 0.0009015512873170877, + "loss": 0.1083, + "num_input_tokens_seen": 74956512, + "step": 34680 + }, + { + "epoch": 5.658238172920065, + "grad_norm": 0.08701828867197037, + "learning_rate": 0.0009015088714060297, + "loss": 0.3225, + "num_input_tokens_seen": 74967104, + "step": 34685 + }, + { + "epoch": 5.6590538336052205, + "grad_norm": 0.21887104213237762, + "learning_rate": 0.0009014664473578851, + "loss": 0.1056, + "num_input_tokens_seen": 74977344, + "step": 34690 + }, + { + "epoch": 5.659869494290375, + "grad_norm": 0.3914003074169159, + "learning_rate": 0.0009014240151735138, + "loss": 0.1699, + "num_input_tokens_seen": 74988672, + "step": 34695 + }, + { + "epoch": 5.66068515497553, + "grad_norm": 0.015539568848907948, + "learning_rate": 0.0009013815748537755, + "loss": 0.1177, + "num_input_tokens_seen": 75000384, + "step": 34700 + }, + { + "epoch": 5.661500815660685, + "grad_norm": 0.06390603631734848, + "learning_rate": 0.0009013391263995303, + "loss": 0.1135, + "num_input_tokens_seen": 75010528, + "step": 34705 + }, + { + "epoch": 5.66231647634584, + "grad_norm": 0.011964190751314163, + "learning_rate": 0.0009012966698116387, + "loss": 0.023, + "num_input_tokens_seen": 75021920, + "step": 34710 + }, + { + "epoch": 5.6631321370309955, + "grad_norm": 0.03568481281399727, + "learning_rate": 0.0009012542050909609, + "loss": 0.0481, + "num_input_tokens_seen": 75031968, + "step": 34715 + }, + { + "epoch": 5.66394779771615, + "grad_norm": 0.17110410332679749, + "learning_rate": 0.0009012117322383577, + "loss": 0.1272, + "num_input_tokens_seen": 75043200, + "step": 34720 + }, + { + "epoch": 5.664763458401305, + "grad_norm": 0.04262327775359154, + "learning_rate": 0.0009011692512546897, + "loss": 0.058, + "num_input_tokens_seen": 75054016, + "step": 34725 + }, + { + "epoch": 5.66557911908646, + "grad_norm": 0.1769617199897766, + "learning_rate": 0.0009011267621408179, + "loss": 0.064, + "num_input_tokens_seen": 75065632, + "step": 34730 + }, + { + "epoch": 5.666394779771615, + "grad_norm": 0.27743223309516907, + "learning_rate": 0.0009010842648976034, + "loss": 0.0554, + "num_input_tokens_seen": 75076384, + "step": 34735 + }, + { + "epoch": 5.6672104404567705, + "grad_norm": 0.2698041796684265, + "learning_rate": 0.0009010417595259077, + "loss": 0.1715, + "num_input_tokens_seen": 75087104, + "step": 34740 + }, + { + "epoch": 5.668026101141925, + "grad_norm": 0.13711011409759521, + "learning_rate": 0.0009009992460265917, + "loss": 0.0883, + "num_input_tokens_seen": 75097184, + "step": 34745 + }, + { + "epoch": 5.66884176182708, + "grad_norm": 0.013362093828618526, + "learning_rate": 0.0009009567244005174, + "loss": 0.0612, + "num_input_tokens_seen": 75107136, + "step": 34750 + }, + { + "epoch": 5.669657422512235, + "grad_norm": 0.028613250702619553, + "learning_rate": 0.0009009141946485464, + "loss": 0.0546, + "num_input_tokens_seen": 75117024, + "step": 34755 + }, + { + "epoch": 5.67047308319739, + "grad_norm": 0.016025543212890625, + "learning_rate": 0.0009008716567715406, + "loss": 0.0294, + "num_input_tokens_seen": 75129632, + "step": 34760 + }, + { + "epoch": 5.671288743882545, + "grad_norm": 0.010202709585428238, + "learning_rate": 0.0009008291107703621, + "loss": 0.2316, + "num_input_tokens_seen": 75140704, + "step": 34765 + }, + { + "epoch": 5.672104404567699, + "grad_norm": 0.010567674413323402, + "learning_rate": 0.0009007865566458733, + "loss": 0.0205, + "num_input_tokens_seen": 75150880, + "step": 34770 + }, + { + "epoch": 5.672920065252855, + "grad_norm": 0.2892915606498718, + "learning_rate": 0.0009007439943989364, + "loss": 0.1478, + "num_input_tokens_seen": 75161728, + "step": 34775 + }, + { + "epoch": 5.67373572593801, + "grad_norm": 0.05027128756046295, + "learning_rate": 0.0009007014240304143, + "loss": 0.0588, + "num_input_tokens_seen": 75171936, + "step": 34780 + }, + { + "epoch": 5.674551386623165, + "grad_norm": 0.06722673773765564, + "learning_rate": 0.0009006588455411692, + "loss": 0.0681, + "num_input_tokens_seen": 75182720, + "step": 34785 + }, + { + "epoch": 5.6753670473083195, + "grad_norm": 0.2073163390159607, + "learning_rate": 0.0009006162589320645, + "loss": 0.1459, + "num_input_tokens_seen": 75193344, + "step": 34790 + }, + { + "epoch": 5.676182707993474, + "grad_norm": 0.10115987807512283, + "learning_rate": 0.000900573664203963, + "loss": 0.0308, + "num_input_tokens_seen": 75203104, + "step": 34795 + }, + { + "epoch": 5.67699836867863, + "grad_norm": 0.4650232195854187, + "learning_rate": 0.0009005310613577282, + "loss": 0.1244, + "num_input_tokens_seen": 75213632, + "step": 34800 + }, + { + "epoch": 5.677814029363785, + "grad_norm": 0.007083847187459469, + "learning_rate": 0.0009004884503942232, + "loss": 0.0641, + "num_input_tokens_seen": 75224192, + "step": 34805 + }, + { + "epoch": 5.67862969004894, + "grad_norm": 0.008455055765807629, + "learning_rate": 0.0009004458313143118, + "loss": 0.0426, + "num_input_tokens_seen": 75234880, + "step": 34810 + }, + { + "epoch": 5.6794453507340945, + "grad_norm": 0.10253097862005234, + "learning_rate": 0.0009004032041188575, + "loss": 0.1099, + "num_input_tokens_seen": 75244768, + "step": 34815 + }, + { + "epoch": 5.680261011419249, + "grad_norm": 0.25227639079093933, + "learning_rate": 0.0009003605688087244, + "loss": 0.2557, + "num_input_tokens_seen": 75255264, + "step": 34820 + }, + { + "epoch": 5.681076672104405, + "grad_norm": 0.040591347962617874, + "learning_rate": 0.0009003179253847764, + "loss": 0.2172, + "num_input_tokens_seen": 75265408, + "step": 34825 + }, + { + "epoch": 5.68189233278956, + "grad_norm": 0.22518138587474823, + "learning_rate": 0.0009002752738478779, + "loss": 0.1612, + "num_input_tokens_seen": 75277504, + "step": 34830 + }, + { + "epoch": 5.682707993474715, + "grad_norm": 0.06464552134275436, + "learning_rate": 0.000900232614198893, + "loss": 0.1623, + "num_input_tokens_seen": 75288864, + "step": 34835 + }, + { + "epoch": 5.6835236541598695, + "grad_norm": 0.02341165393590927, + "learning_rate": 0.0009001899464386867, + "loss": 0.1506, + "num_input_tokens_seen": 75298784, + "step": 34840 + }, + { + "epoch": 5.684339314845024, + "grad_norm": 0.0356857031583786, + "learning_rate": 0.0009001472705681233, + "loss": 0.0259, + "num_input_tokens_seen": 75309888, + "step": 34845 + }, + { + "epoch": 5.685154975530179, + "grad_norm": 0.023588141426444054, + "learning_rate": 0.0009001045865880679, + "loss": 0.0789, + "num_input_tokens_seen": 75321216, + "step": 34850 + }, + { + "epoch": 5.685970636215334, + "grad_norm": 0.005699558649212122, + "learning_rate": 0.0009000618944993854, + "loss": 0.073, + "num_input_tokens_seen": 75333280, + "step": 34855 + }, + { + "epoch": 5.68678629690049, + "grad_norm": 0.2904861569404602, + "learning_rate": 0.0009000191943029412, + "loss": 0.0885, + "num_input_tokens_seen": 75342464, + "step": 34860 + }, + { + "epoch": 5.6876019575856445, + "grad_norm": 0.07417767494916916, + "learning_rate": 0.0008999764859996005, + "loss": 0.0726, + "num_input_tokens_seen": 75354240, + "step": 34865 + }, + { + "epoch": 5.688417618270799, + "grad_norm": 0.06917222589254379, + "learning_rate": 0.000899933769590229, + "loss": 0.0889, + "num_input_tokens_seen": 75364992, + "step": 34870 + }, + { + "epoch": 5.689233278955954, + "grad_norm": 0.16416126489639282, + "learning_rate": 0.0008998910450756923, + "loss": 0.2155, + "num_input_tokens_seen": 75375392, + "step": 34875 + }, + { + "epoch": 5.690048939641109, + "grad_norm": 0.23445457220077515, + "learning_rate": 0.0008998483124568561, + "loss": 0.0474, + "num_input_tokens_seen": 75387328, + "step": 34880 + }, + { + "epoch": 5.690864600326265, + "grad_norm": 0.01251760683953762, + "learning_rate": 0.0008998055717345868, + "loss": 0.0442, + "num_input_tokens_seen": 75397984, + "step": 34885 + }, + { + "epoch": 5.691680261011419, + "grad_norm": 0.01877402886748314, + "learning_rate": 0.0008997628229097503, + "loss": 0.1067, + "num_input_tokens_seen": 75407360, + "step": 34890 + }, + { + "epoch": 5.692495921696574, + "grad_norm": 0.17165693640708923, + "learning_rate": 0.0008997200659832129, + "loss": 0.1041, + "num_input_tokens_seen": 75417408, + "step": 34895 + }, + { + "epoch": 5.693311582381729, + "grad_norm": 0.07236825674772263, + "learning_rate": 0.0008996773009558416, + "loss": 0.045, + "num_input_tokens_seen": 75427840, + "step": 34900 + }, + { + "epoch": 5.694127243066884, + "grad_norm": 0.34581971168518066, + "learning_rate": 0.0008996345278285027, + "loss": 0.1505, + "num_input_tokens_seen": 75439424, + "step": 34905 + }, + { + "epoch": 5.69494290375204, + "grad_norm": 0.22256053984165192, + "learning_rate": 0.000899591746602063, + "loss": 0.1045, + "num_input_tokens_seen": 75450272, + "step": 34910 + }, + { + "epoch": 5.695758564437194, + "grad_norm": 0.0090791629627347, + "learning_rate": 0.0008995489572773896, + "loss": 0.0798, + "num_input_tokens_seen": 75460544, + "step": 34915 + }, + { + "epoch": 5.696574225122349, + "grad_norm": 0.04982369393110275, + "learning_rate": 0.0008995061598553499, + "loss": 0.0309, + "num_input_tokens_seen": 75471488, + "step": 34920 + }, + { + "epoch": 5.697389885807504, + "grad_norm": 0.14165297150611877, + "learning_rate": 0.000899463354336811, + "loss": 0.1819, + "num_input_tokens_seen": 75482944, + "step": 34925 + }, + { + "epoch": 5.698205546492659, + "grad_norm": 0.019582441076636314, + "learning_rate": 0.0008994205407226403, + "loss": 0.0423, + "num_input_tokens_seen": 75493984, + "step": 34930 + }, + { + "epoch": 5.699021207177814, + "grad_norm": 0.00845408346503973, + "learning_rate": 0.0008993777190137058, + "loss": 0.1114, + "num_input_tokens_seen": 75504864, + "step": 34935 + }, + { + "epoch": 5.699836867862969, + "grad_norm": 0.1220518946647644, + "learning_rate": 0.0008993348892108753, + "loss": 0.125, + "num_input_tokens_seen": 75515936, + "step": 34940 + }, + { + "epoch": 5.700652528548124, + "grad_norm": 0.0652894675731659, + "learning_rate": 0.0008992920513150165, + "loss": 0.051, + "num_input_tokens_seen": 75527520, + "step": 34945 + }, + { + "epoch": 5.701468189233279, + "grad_norm": 0.02808886580169201, + "learning_rate": 0.0008992492053269976, + "loss": 0.0843, + "num_input_tokens_seen": 75539072, + "step": 34950 + }, + { + "epoch": 5.702283849918434, + "grad_norm": 0.17246325314044952, + "learning_rate": 0.0008992063512476873, + "loss": 0.1733, + "num_input_tokens_seen": 75549792, + "step": 34955 + }, + { + "epoch": 5.703099510603589, + "grad_norm": 0.18310581147670746, + "learning_rate": 0.0008991634890779538, + "loss": 0.0305, + "num_input_tokens_seen": 75560448, + "step": 34960 + }, + { + "epoch": 5.7039151712887435, + "grad_norm": 0.048973195254802704, + "learning_rate": 0.0008991206188186658, + "loss": 0.1631, + "num_input_tokens_seen": 75571776, + "step": 34965 + }, + { + "epoch": 5.704730831973899, + "grad_norm": 0.10040713101625443, + "learning_rate": 0.0008990777404706922, + "loss": 0.2469, + "num_input_tokens_seen": 75583328, + "step": 34970 + }, + { + "epoch": 5.705546492659054, + "grad_norm": 0.19471172988414764, + "learning_rate": 0.0008990348540349019, + "loss": 0.1636, + "num_input_tokens_seen": 75593984, + "step": 34975 + }, + { + "epoch": 5.706362153344209, + "grad_norm": 0.25231602787971497, + "learning_rate": 0.0008989919595121641, + "loss": 0.1036, + "num_input_tokens_seen": 75604256, + "step": 34980 + }, + { + "epoch": 5.707177814029364, + "grad_norm": 0.02728627808392048, + "learning_rate": 0.000898949056903348, + "loss": 0.0964, + "num_input_tokens_seen": 75614848, + "step": 34985 + }, + { + "epoch": 5.7079934747145185, + "grad_norm": 0.22083836793899536, + "learning_rate": 0.0008989061462093233, + "loss": 0.1313, + "num_input_tokens_seen": 75626208, + "step": 34990 + }, + { + "epoch": 5.708809135399674, + "grad_norm": 0.03731374442577362, + "learning_rate": 0.0008988632274309593, + "loss": 0.1389, + "num_input_tokens_seen": 75637024, + "step": 34995 + }, + { + "epoch": 5.709624796084829, + "grad_norm": 0.01850057952105999, + "learning_rate": 0.0008988203005691262, + "loss": 0.0446, + "num_input_tokens_seen": 75647456, + "step": 35000 + }, + { + "epoch": 5.710440456769984, + "grad_norm": 0.08934499323368073, + "learning_rate": 0.0008987773656246936, + "loss": 0.0551, + "num_input_tokens_seen": 75658368, + "step": 35005 + }, + { + "epoch": 5.711256117455139, + "grad_norm": 0.11565990746021271, + "learning_rate": 0.0008987344225985319, + "loss": 0.1668, + "num_input_tokens_seen": 75668640, + "step": 35010 + }, + { + "epoch": 5.712071778140293, + "grad_norm": 0.07857227325439453, + "learning_rate": 0.0008986914714915112, + "loss": 0.1319, + "num_input_tokens_seen": 75679136, + "step": 35015 + }, + { + "epoch": 5.712887438825448, + "grad_norm": 0.018412547186017036, + "learning_rate": 0.000898648512304502, + "loss": 0.0948, + "num_input_tokens_seen": 75689312, + "step": 35020 + }, + { + "epoch": 5.713703099510604, + "grad_norm": 0.15788637101650238, + "learning_rate": 0.0008986055450383752, + "loss": 0.1009, + "num_input_tokens_seen": 75699360, + "step": 35025 + }, + { + "epoch": 5.714518760195759, + "grad_norm": 0.059505756944417953, + "learning_rate": 0.0008985625696940013, + "loss": 0.0621, + "num_input_tokens_seen": 75710304, + "step": 35030 + }, + { + "epoch": 5.715334420880914, + "grad_norm": 0.03797432407736778, + "learning_rate": 0.0008985195862722513, + "loss": 0.1173, + "num_input_tokens_seen": 75721408, + "step": 35035 + }, + { + "epoch": 5.716150081566068, + "grad_norm": 0.08449093252420425, + "learning_rate": 0.0008984765947739964, + "loss": 0.154, + "num_input_tokens_seen": 75732832, + "step": 35040 + }, + { + "epoch": 5.716965742251223, + "grad_norm": 0.0638086199760437, + "learning_rate": 0.0008984335952001075, + "loss": 0.0495, + "num_input_tokens_seen": 75744256, + "step": 35045 + }, + { + "epoch": 5.717781402936378, + "grad_norm": 0.050827570259571075, + "learning_rate": 0.0008983905875514566, + "loss": 0.0169, + "num_input_tokens_seen": 75754016, + "step": 35050 + }, + { + "epoch": 5.718597063621534, + "grad_norm": 0.13446107506752014, + "learning_rate": 0.000898347571828915, + "loss": 0.0554, + "num_input_tokens_seen": 75766304, + "step": 35055 + }, + { + "epoch": 5.719412724306689, + "grad_norm": 0.03578329086303711, + "learning_rate": 0.0008983045480333545, + "loss": 0.1295, + "num_input_tokens_seen": 75776544, + "step": 35060 + }, + { + "epoch": 5.720228384991843, + "grad_norm": 0.11802957952022552, + "learning_rate": 0.0008982615161656471, + "loss": 0.2292, + "num_input_tokens_seen": 75787008, + "step": 35065 + }, + { + "epoch": 5.721044045676998, + "grad_norm": 0.011829815804958344, + "learning_rate": 0.0008982184762266648, + "loss": 0.0445, + "num_input_tokens_seen": 75796128, + "step": 35070 + }, + { + "epoch": 5.721859706362153, + "grad_norm": 0.09536035358905792, + "learning_rate": 0.00089817542821728, + "loss": 0.0644, + "num_input_tokens_seen": 75808640, + "step": 35075 + }, + { + "epoch": 5.722675367047309, + "grad_norm": 0.21114234626293182, + "learning_rate": 0.0008981323721383649, + "loss": 0.2755, + "num_input_tokens_seen": 75819040, + "step": 35080 + }, + { + "epoch": 5.7234910277324635, + "grad_norm": 0.1931707113981247, + "learning_rate": 0.0008980893079907922, + "loss": 0.0722, + "num_input_tokens_seen": 75830464, + "step": 35085 + }, + { + "epoch": 5.724306688417618, + "grad_norm": 0.010170339606702328, + "learning_rate": 0.0008980462357754347, + "loss": 0.0227, + "num_input_tokens_seen": 75841440, + "step": 35090 + }, + { + "epoch": 5.725122349102773, + "grad_norm": 0.11149877309799194, + "learning_rate": 0.0008980031554931654, + "loss": 0.1586, + "num_input_tokens_seen": 75853952, + "step": 35095 + }, + { + "epoch": 5.725938009787928, + "grad_norm": 0.22925962507724762, + "learning_rate": 0.0008979600671448571, + "loss": 0.1256, + "num_input_tokens_seen": 75863776, + "step": 35100 + }, + { + "epoch": 5.726753670473083, + "grad_norm": 0.15158496797084808, + "learning_rate": 0.0008979169707313831, + "loss": 0.0396, + "num_input_tokens_seen": 75876352, + "step": 35105 + }, + { + "epoch": 5.7275693311582385, + "grad_norm": 0.04836349934339523, + "learning_rate": 0.000897873866253617, + "loss": 0.0416, + "num_input_tokens_seen": 75888000, + "step": 35110 + }, + { + "epoch": 5.728384991843393, + "grad_norm": 0.011450660414993763, + "learning_rate": 0.0008978307537124324, + "loss": 0.0917, + "num_input_tokens_seen": 75898048, + "step": 35115 + }, + { + "epoch": 5.729200652528548, + "grad_norm": 0.2579789161682129, + "learning_rate": 0.0008977876331087027, + "loss": 0.3243, + "num_input_tokens_seen": 75909344, + "step": 35120 + }, + { + "epoch": 5.730016313213703, + "grad_norm": 0.1533122956752777, + "learning_rate": 0.0008977445044433021, + "loss": 0.108, + "num_input_tokens_seen": 75919360, + "step": 35125 + }, + { + "epoch": 5.730831973898858, + "grad_norm": 0.011914343573153019, + "learning_rate": 0.0008977013677171045, + "loss": 0.0261, + "num_input_tokens_seen": 75930624, + "step": 35130 + }, + { + "epoch": 5.731647634584013, + "grad_norm": 0.054159220308065414, + "learning_rate": 0.0008976582229309842, + "loss": 0.095, + "num_input_tokens_seen": 75941728, + "step": 35135 + }, + { + "epoch": 5.732463295269168, + "grad_norm": 0.22217957675457, + "learning_rate": 0.0008976150700858155, + "loss": 0.2075, + "num_input_tokens_seen": 75952960, + "step": 35140 + }, + { + "epoch": 5.733278955954323, + "grad_norm": 0.288897305727005, + "learning_rate": 0.000897571909182473, + "loss": 0.1507, + "num_input_tokens_seen": 75964320, + "step": 35145 + }, + { + "epoch": 5.734094616639478, + "grad_norm": 0.15639406442642212, + "learning_rate": 0.0008975287402218314, + "loss": 0.0971, + "num_input_tokens_seen": 75974848, + "step": 35150 + }, + { + "epoch": 5.734910277324633, + "grad_norm": 0.13085006177425385, + "learning_rate": 0.0008974855632047657, + "loss": 0.0652, + "num_input_tokens_seen": 75985024, + "step": 35155 + }, + { + "epoch": 5.735725938009788, + "grad_norm": 0.23708663880825043, + "learning_rate": 0.0008974423781321506, + "loss": 0.0831, + "num_input_tokens_seen": 75996544, + "step": 35160 + }, + { + "epoch": 5.736541598694943, + "grad_norm": 0.010799623094499111, + "learning_rate": 0.0008973991850048616, + "loss": 0.0199, + "num_input_tokens_seen": 76007616, + "step": 35165 + }, + { + "epoch": 5.737357259380098, + "grad_norm": 0.22216928005218506, + "learning_rate": 0.0008973559838237739, + "loss": 0.1143, + "num_input_tokens_seen": 76017824, + "step": 35170 + }, + { + "epoch": 5.738172920065253, + "grad_norm": 0.01401756051927805, + "learning_rate": 0.0008973127745897634, + "loss": 0.0905, + "num_input_tokens_seen": 76029024, + "step": 35175 + }, + { + "epoch": 5.738988580750408, + "grad_norm": 0.18570876121520996, + "learning_rate": 0.0008972695573037052, + "loss": 0.1219, + "num_input_tokens_seen": 76039968, + "step": 35180 + }, + { + "epoch": 5.739804241435563, + "grad_norm": 0.04568921774625778, + "learning_rate": 0.0008972263319664756, + "loss": 0.0498, + "num_input_tokens_seen": 76051360, + "step": 35185 + }, + { + "epoch": 5.740619902120718, + "grad_norm": 0.007866466417908669, + "learning_rate": 0.0008971830985789504, + "loss": 0.1595, + "num_input_tokens_seen": 76062016, + "step": 35190 + }, + { + "epoch": 5.741435562805873, + "grad_norm": 0.005754650104790926, + "learning_rate": 0.0008971398571420058, + "loss": 0.1148, + "num_input_tokens_seen": 76073696, + "step": 35195 + }, + { + "epoch": 5.742251223491028, + "grad_norm": 0.022885838523507118, + "learning_rate": 0.0008970966076565183, + "loss": 0.119, + "num_input_tokens_seen": 76083104, + "step": 35200 + }, + { + "epoch": 5.743066884176183, + "grad_norm": 0.11712540686130524, + "learning_rate": 0.0008970533501233642, + "loss": 0.0966, + "num_input_tokens_seen": 76094144, + "step": 35205 + }, + { + "epoch": 5.7438825448613375, + "grad_norm": 0.00609155697748065, + "learning_rate": 0.0008970100845434204, + "loss": 0.0308, + "num_input_tokens_seen": 76105408, + "step": 35210 + }, + { + "epoch": 5.744698205546492, + "grad_norm": 0.08641016483306885, + "learning_rate": 0.0008969668109175635, + "loss": 0.1631, + "num_input_tokens_seen": 76116896, + "step": 35215 + }, + { + "epoch": 5.745513866231647, + "grad_norm": 0.00937197171151638, + "learning_rate": 0.0008969235292466706, + "loss": 0.0648, + "num_input_tokens_seen": 76128736, + "step": 35220 + }, + { + "epoch": 5.746329526916803, + "grad_norm": 0.008261475712060928, + "learning_rate": 0.0008968802395316187, + "loss": 0.0116, + "num_input_tokens_seen": 76138240, + "step": 35225 + }, + { + "epoch": 5.747145187601958, + "grad_norm": 0.05619320645928383, + "learning_rate": 0.0008968369417732855, + "loss": 0.0609, + "num_input_tokens_seen": 76149312, + "step": 35230 + }, + { + "epoch": 5.7479608482871125, + "grad_norm": 0.08603104203939438, + "learning_rate": 0.0008967936359725482, + "loss": 0.1606, + "num_input_tokens_seen": 76161056, + "step": 35235 + }, + { + "epoch": 5.748776508972267, + "grad_norm": 0.014682306908071041, + "learning_rate": 0.0008967503221302844, + "loss": 0.0621, + "num_input_tokens_seen": 76172288, + "step": 35240 + }, + { + "epoch": 5.749592169657422, + "grad_norm": 0.008648062124848366, + "learning_rate": 0.0008967070002473721, + "loss": 0.1693, + "num_input_tokens_seen": 76182432, + "step": 35245 + }, + { + "epoch": 5.750407830342578, + "grad_norm": 0.06725231558084488, + "learning_rate": 0.0008966636703246891, + "loss": 0.0745, + "num_input_tokens_seen": 76193216, + "step": 35250 + }, + { + "epoch": 5.751223491027733, + "grad_norm": 0.051333747804164886, + "learning_rate": 0.0008966203323631137, + "loss": 0.1863, + "num_input_tokens_seen": 76203520, + "step": 35255 + }, + { + "epoch": 5.7520391517128875, + "grad_norm": 0.041993435472249985, + "learning_rate": 0.000896576986363524, + "loss": 0.081, + "num_input_tokens_seen": 76215584, + "step": 35260 + }, + { + "epoch": 5.752854812398042, + "grad_norm": 0.0039042108692228794, + "learning_rate": 0.0008965336323267986, + "loss": 0.0566, + "num_input_tokens_seen": 76225120, + "step": 35265 + }, + { + "epoch": 5.753670473083197, + "grad_norm": 0.10992413014173508, + "learning_rate": 0.0008964902702538163, + "loss": 0.1709, + "num_input_tokens_seen": 76235776, + "step": 35270 + }, + { + "epoch": 5.754486133768353, + "grad_norm": 0.02428418956696987, + "learning_rate": 0.0008964469001454554, + "loss": 0.0625, + "num_input_tokens_seen": 76246112, + "step": 35275 + }, + { + "epoch": 5.755301794453508, + "grad_norm": 0.26490381360054016, + "learning_rate": 0.0008964035220025953, + "loss": 0.1386, + "num_input_tokens_seen": 76255872, + "step": 35280 + }, + { + "epoch": 5.7561174551386625, + "grad_norm": 0.005199507810175419, + "learning_rate": 0.000896360135826115, + "loss": 0.0508, + "num_input_tokens_seen": 76265312, + "step": 35285 + }, + { + "epoch": 5.756933115823817, + "grad_norm": 0.006524212658405304, + "learning_rate": 0.0008963167416168936, + "loss": 0.031, + "num_input_tokens_seen": 76275104, + "step": 35290 + }, + { + "epoch": 5.757748776508972, + "grad_norm": 0.054473526775836945, + "learning_rate": 0.0008962733393758107, + "loss": 0.0554, + "num_input_tokens_seen": 76286304, + "step": 35295 + }, + { + "epoch": 5.758564437194127, + "grad_norm": 0.13662466406822205, + "learning_rate": 0.0008962299291037459, + "loss": 0.1246, + "num_input_tokens_seen": 76296704, + "step": 35300 + }, + { + "epoch": 5.759380097879282, + "grad_norm": 0.11444362252950668, + "learning_rate": 0.000896186510801579, + "loss": 0.0867, + "num_input_tokens_seen": 76307328, + "step": 35305 + }, + { + "epoch": 5.760195758564437, + "grad_norm": 0.010635402984917164, + "learning_rate": 0.0008961430844701899, + "loss": 0.0943, + "num_input_tokens_seen": 76318048, + "step": 35310 + }, + { + "epoch": 5.761011419249592, + "grad_norm": 0.03280719742178917, + "learning_rate": 0.0008960996501104583, + "loss": 0.1289, + "num_input_tokens_seen": 76328768, + "step": 35315 + }, + { + "epoch": 5.761827079934747, + "grad_norm": 0.01935116946697235, + "learning_rate": 0.0008960562077232652, + "loss": 0.045, + "num_input_tokens_seen": 76339712, + "step": 35320 + }, + { + "epoch": 5.762642740619902, + "grad_norm": 0.25301891565322876, + "learning_rate": 0.0008960127573094904, + "loss": 0.082, + "num_input_tokens_seen": 76351616, + "step": 35325 + }, + { + "epoch": 5.763458401305057, + "grad_norm": 0.08036404848098755, + "learning_rate": 0.0008959692988700148, + "loss": 0.0467, + "num_input_tokens_seen": 76361408, + "step": 35330 + }, + { + "epoch": 5.764274061990212, + "grad_norm": 0.03166870027780533, + "learning_rate": 0.000895925832405719, + "loss": 0.0762, + "num_input_tokens_seen": 76372544, + "step": 35335 + }, + { + "epoch": 5.765089722675367, + "grad_norm": 0.10184311866760254, + "learning_rate": 0.0008958823579174839, + "loss": 0.1098, + "num_input_tokens_seen": 76383552, + "step": 35340 + }, + { + "epoch": 5.765905383360522, + "grad_norm": 0.17572534084320068, + "learning_rate": 0.0008958388754061907, + "loss": 0.0737, + "num_input_tokens_seen": 76393568, + "step": 35345 + }, + { + "epoch": 5.766721044045677, + "grad_norm": 0.030263762921094894, + "learning_rate": 0.0008957953848727205, + "loss": 0.0165, + "num_input_tokens_seen": 76402976, + "step": 35350 + }, + { + "epoch": 5.767536704730832, + "grad_norm": 0.008022490888834, + "learning_rate": 0.0008957518863179545, + "loss": 0.1867, + "num_input_tokens_seen": 76414176, + "step": 35355 + }, + { + "epoch": 5.768352365415987, + "grad_norm": 0.003972323145717382, + "learning_rate": 0.0008957083797427747, + "loss": 0.0115, + "num_input_tokens_seen": 76425248, + "step": 35360 + }, + { + "epoch": 5.769168026101142, + "grad_norm": 0.3723903298377991, + "learning_rate": 0.0008956648651480627, + "loss": 0.1278, + "num_input_tokens_seen": 76435104, + "step": 35365 + }, + { + "epoch": 5.769983686786297, + "grad_norm": 0.29073670506477356, + "learning_rate": 0.0008956213425347001, + "loss": 0.1751, + "num_input_tokens_seen": 76446496, + "step": 35370 + }, + { + "epoch": 5.770799347471452, + "grad_norm": 0.4639027416706085, + "learning_rate": 0.0008955778119035692, + "loss": 0.2464, + "num_input_tokens_seen": 76456000, + "step": 35375 + }, + { + "epoch": 5.771615008156607, + "grad_norm": 0.01451733335852623, + "learning_rate": 0.000895534273255552, + "loss": 0.0313, + "num_input_tokens_seen": 76466880, + "step": 35380 + }, + { + "epoch": 5.7724306688417615, + "grad_norm": 0.018691841512918472, + "learning_rate": 0.0008954907265915311, + "loss": 0.0964, + "num_input_tokens_seen": 76478880, + "step": 35385 + }, + { + "epoch": 5.773246329526917, + "grad_norm": 0.20020082592964172, + "learning_rate": 0.0008954471719123889, + "loss": 0.2309, + "num_input_tokens_seen": 76490240, + "step": 35390 + }, + { + "epoch": 5.774061990212072, + "grad_norm": 0.16038750112056732, + "learning_rate": 0.0008954036092190079, + "loss": 0.078, + "num_input_tokens_seen": 76499168, + "step": 35395 + }, + { + "epoch": 5.774877650897227, + "grad_norm": 0.005479294341057539, + "learning_rate": 0.0008953600385122713, + "loss": 0.0832, + "num_input_tokens_seen": 76511392, + "step": 35400 + }, + { + "epoch": 5.775693311582382, + "grad_norm": 0.03720789775252342, + "learning_rate": 0.0008953164597930621, + "loss": 0.0649, + "num_input_tokens_seen": 76520896, + "step": 35405 + }, + { + "epoch": 5.7765089722675365, + "grad_norm": 0.1710098385810852, + "learning_rate": 0.0008952728730622632, + "loss": 0.126, + "num_input_tokens_seen": 76531648, + "step": 35410 + }, + { + "epoch": 5.777324632952691, + "grad_norm": 0.04318312928080559, + "learning_rate": 0.000895229278320758, + "loss": 0.0368, + "num_input_tokens_seen": 76542464, + "step": 35415 + }, + { + "epoch": 5.778140293637847, + "grad_norm": 0.043150052428245544, + "learning_rate": 0.0008951856755694303, + "loss": 0.0952, + "num_input_tokens_seen": 76553824, + "step": 35420 + }, + { + "epoch": 5.778955954323002, + "grad_norm": 0.20751482248306274, + "learning_rate": 0.0008951420648091635, + "loss": 0.1107, + "num_input_tokens_seen": 76565344, + "step": 35425 + }, + { + "epoch": 5.779771615008157, + "grad_norm": 0.04356337711215019, + "learning_rate": 0.0008950984460408414, + "loss": 0.0887, + "num_input_tokens_seen": 76577600, + "step": 35430 + }, + { + "epoch": 5.780587275693311, + "grad_norm": 0.0067030293866992, + "learning_rate": 0.0008950548192653481, + "loss": 0.2208, + "num_input_tokens_seen": 76588448, + "step": 35435 + }, + { + "epoch": 5.781402936378466, + "grad_norm": 0.029318923130631447, + "learning_rate": 0.0008950111844835678, + "loss": 0.0751, + "num_input_tokens_seen": 76600704, + "step": 35440 + }, + { + "epoch": 5.782218597063622, + "grad_norm": 0.004557712934911251, + "learning_rate": 0.0008949675416963847, + "loss": 0.1848, + "num_input_tokens_seen": 76611488, + "step": 35445 + }, + { + "epoch": 5.783034257748777, + "grad_norm": 0.19797858595848083, + "learning_rate": 0.0008949238909046833, + "loss": 0.0869, + "num_input_tokens_seen": 76623904, + "step": 35450 + }, + { + "epoch": 5.783849918433932, + "grad_norm": 0.12520618736743927, + "learning_rate": 0.0008948802321093484, + "loss": 0.1262, + "num_input_tokens_seen": 76635936, + "step": 35455 + }, + { + "epoch": 5.784665579119086, + "grad_norm": 0.0165663193911314, + "learning_rate": 0.0008948365653112645, + "loss": 0.0287, + "num_input_tokens_seen": 76646720, + "step": 35460 + }, + { + "epoch": 5.785481239804241, + "grad_norm": 0.20906522870063782, + "learning_rate": 0.0008947928905113166, + "loss": 0.1032, + "num_input_tokens_seen": 76656704, + "step": 35465 + }, + { + "epoch": 5.786296900489396, + "grad_norm": 0.04493867978453636, + "learning_rate": 0.00089474920771039, + "loss": 0.2559, + "num_input_tokens_seen": 76667360, + "step": 35470 + }, + { + "epoch": 5.787112561174552, + "grad_norm": 0.1427122801542282, + "learning_rate": 0.0008947055169093701, + "loss": 0.0507, + "num_input_tokens_seen": 76677440, + "step": 35475 + }, + { + "epoch": 5.787928221859707, + "grad_norm": 0.21092888712882996, + "learning_rate": 0.000894661818109142, + "loss": 0.155, + "num_input_tokens_seen": 76688320, + "step": 35480 + }, + { + "epoch": 5.788743882544861, + "grad_norm": 0.13803480565547943, + "learning_rate": 0.0008946181113105915, + "loss": 0.1265, + "num_input_tokens_seen": 76700608, + "step": 35485 + }, + { + "epoch": 5.789559543230016, + "grad_norm": 0.19523029029369354, + "learning_rate": 0.0008945743965146044, + "loss": 0.1149, + "num_input_tokens_seen": 76712064, + "step": 35490 + }, + { + "epoch": 5.790375203915171, + "grad_norm": 0.05511949956417084, + "learning_rate": 0.0008945306737220669, + "loss": 0.0938, + "num_input_tokens_seen": 76722784, + "step": 35495 + }, + { + "epoch": 5.791190864600326, + "grad_norm": 0.21019725501537323, + "learning_rate": 0.0008944869429338645, + "loss": 0.0816, + "num_input_tokens_seen": 76734752, + "step": 35500 + }, + { + "epoch": 5.7920065252854815, + "grad_norm": 0.020916135981678963, + "learning_rate": 0.0008944432041508838, + "loss": 0.0304, + "num_input_tokens_seen": 76745504, + "step": 35505 + }, + { + "epoch": 5.792822185970636, + "grad_norm": 0.21575558185577393, + "learning_rate": 0.0008943994573740111, + "loss": 0.1336, + "num_input_tokens_seen": 76756768, + "step": 35510 + }, + { + "epoch": 5.793637846655791, + "grad_norm": 0.08731380105018616, + "learning_rate": 0.0008943557026041331, + "loss": 0.1259, + "num_input_tokens_seen": 76766400, + "step": 35515 + }, + { + "epoch": 5.794453507340946, + "grad_norm": 0.19834499061107635, + "learning_rate": 0.0008943119398421367, + "loss": 0.077, + "num_input_tokens_seen": 76778208, + "step": 35520 + }, + { + "epoch": 5.795269168026101, + "grad_norm": 0.2235175371170044, + "learning_rate": 0.0008942681690889084, + "loss": 0.3166, + "num_input_tokens_seen": 76787712, + "step": 35525 + }, + { + "epoch": 5.7960848287112565, + "grad_norm": 0.10026352107524872, + "learning_rate": 0.0008942243903453356, + "loss": 0.1214, + "num_input_tokens_seen": 76798592, + "step": 35530 + }, + { + "epoch": 5.796900489396411, + "grad_norm": 0.17312408983707428, + "learning_rate": 0.0008941806036123054, + "loss": 0.0992, + "num_input_tokens_seen": 76810688, + "step": 35535 + }, + { + "epoch": 5.797716150081566, + "grad_norm": 0.07986405491828918, + "learning_rate": 0.0008941368088907052, + "loss": 0.0481, + "num_input_tokens_seen": 76822080, + "step": 35540 + }, + { + "epoch": 5.798531810766721, + "grad_norm": 0.03755347803235054, + "learning_rate": 0.0008940930061814226, + "loss": 0.095, + "num_input_tokens_seen": 76832224, + "step": 35545 + }, + { + "epoch": 5.799347471451876, + "grad_norm": 0.04478127136826515, + "learning_rate": 0.0008940491954853451, + "loss": 0.1006, + "num_input_tokens_seen": 76843584, + "step": 35550 + }, + { + "epoch": 5.800163132137031, + "grad_norm": 0.01865328848361969, + "learning_rate": 0.0008940053768033609, + "loss": 0.0711, + "num_input_tokens_seen": 76854688, + "step": 35555 + }, + { + "epoch": 5.800978792822186, + "grad_norm": 0.008900360204279423, + "learning_rate": 0.0008939615501363581, + "loss": 0.0727, + "num_input_tokens_seen": 76866560, + "step": 35560 + }, + { + "epoch": 5.801794453507341, + "grad_norm": 0.0329662561416626, + "learning_rate": 0.0008939177154852245, + "loss": 0.1171, + "num_input_tokens_seen": 76876608, + "step": 35565 + }, + { + "epoch": 5.802610114192496, + "grad_norm": 0.019964130595326424, + "learning_rate": 0.0008938738728508487, + "loss": 0.0625, + "num_input_tokens_seen": 76886656, + "step": 35570 + }, + { + "epoch": 5.803425774877651, + "grad_norm": 0.13185709714889526, + "learning_rate": 0.0008938300222341192, + "loss": 0.0812, + "num_input_tokens_seen": 76897696, + "step": 35575 + }, + { + "epoch": 5.804241435562806, + "grad_norm": 0.04914616420865059, + "learning_rate": 0.0008937861636359248, + "loss": 0.0342, + "num_input_tokens_seen": 76908128, + "step": 35580 + }, + { + "epoch": 5.80505709624796, + "grad_norm": 0.06280156224966049, + "learning_rate": 0.000893742297057154, + "loss": 0.039, + "num_input_tokens_seen": 76918912, + "step": 35585 + }, + { + "epoch": 5.805872756933116, + "grad_norm": 0.10165125131607056, + "learning_rate": 0.0008936984224986962, + "loss": 0.0566, + "num_input_tokens_seen": 76930240, + "step": 35590 + }, + { + "epoch": 5.806688417618271, + "grad_norm": 0.05567912384867668, + "learning_rate": 0.0008936545399614405, + "loss": 0.1717, + "num_input_tokens_seen": 76941856, + "step": 35595 + }, + { + "epoch": 5.807504078303426, + "grad_norm": 0.11308423429727554, + "learning_rate": 0.0008936106494462761, + "loss": 0.1369, + "num_input_tokens_seen": 76952608, + "step": 35600 + }, + { + "epoch": 5.808319738988581, + "grad_norm": 0.034917544573545456, + "learning_rate": 0.0008935667509540926, + "loss": 0.0723, + "num_input_tokens_seen": 76963904, + "step": 35605 + }, + { + "epoch": 5.809135399673735, + "grad_norm": 0.1020023375749588, + "learning_rate": 0.0008935228444857795, + "loss": 0.1328, + "num_input_tokens_seen": 76974848, + "step": 35610 + }, + { + "epoch": 5.809951060358891, + "grad_norm": 0.031795721501111984, + "learning_rate": 0.0008934789300422268, + "loss": 0.0606, + "num_input_tokens_seen": 76984640, + "step": 35615 + }, + { + "epoch": 5.810766721044046, + "grad_norm": 0.020438876003026962, + "learning_rate": 0.0008934350076243245, + "loss": 0.144, + "num_input_tokens_seen": 76994112, + "step": 35620 + }, + { + "epoch": 5.811582381729201, + "grad_norm": 0.06384740769863129, + "learning_rate": 0.0008933910772329625, + "loss": 0.0439, + "num_input_tokens_seen": 77005792, + "step": 35625 + }, + { + "epoch": 5.8123980424143555, + "grad_norm": 0.018098855391144753, + "learning_rate": 0.0008933471388690314, + "loss": 0.0366, + "num_input_tokens_seen": 77016288, + "step": 35630 + }, + { + "epoch": 5.81321370309951, + "grad_norm": 0.20901861786842346, + "learning_rate": 0.0008933031925334214, + "loss": 0.1645, + "num_input_tokens_seen": 77027456, + "step": 35635 + }, + { + "epoch": 5.814029363784666, + "grad_norm": 0.026231657713651657, + "learning_rate": 0.0008932592382270235, + "loss": 0.1708, + "num_input_tokens_seen": 77038816, + "step": 35640 + }, + { + "epoch": 5.814845024469821, + "grad_norm": 0.047216691076755524, + "learning_rate": 0.0008932152759507279, + "loss": 0.0307, + "num_input_tokens_seen": 77050112, + "step": 35645 + }, + { + "epoch": 5.815660685154976, + "grad_norm": 0.20011982321739197, + "learning_rate": 0.0008931713057054263, + "loss": 0.1104, + "num_input_tokens_seen": 77060768, + "step": 35650 + }, + { + "epoch": 5.8164763458401305, + "grad_norm": 0.2040461301803589, + "learning_rate": 0.0008931273274920091, + "loss": 0.0727, + "num_input_tokens_seen": 77071712, + "step": 35655 + }, + { + "epoch": 5.817292006525285, + "grad_norm": 0.02102004364132881, + "learning_rate": 0.0008930833413113682, + "loss": 0.1561, + "num_input_tokens_seen": 77081824, + "step": 35660 + }, + { + "epoch": 5.81810766721044, + "grad_norm": 0.03012845665216446, + "learning_rate": 0.0008930393471643945, + "loss": 0.0348, + "num_input_tokens_seen": 77093760, + "step": 35665 + }, + { + "epoch": 5.818923327895595, + "grad_norm": 0.04393388330936432, + "learning_rate": 0.0008929953450519799, + "loss": 0.0973, + "num_input_tokens_seen": 77105056, + "step": 35670 + }, + { + "epoch": 5.819738988580751, + "grad_norm": 0.0268500167876482, + "learning_rate": 0.000892951334975016, + "loss": 0.2219, + "num_input_tokens_seen": 77116768, + "step": 35675 + }, + { + "epoch": 5.8205546492659055, + "grad_norm": 0.039734356105327606, + "learning_rate": 0.0008929073169343948, + "loss": 0.0278, + "num_input_tokens_seen": 77126080, + "step": 35680 + }, + { + "epoch": 5.82137030995106, + "grad_norm": 0.00670345826074481, + "learning_rate": 0.0008928632909310084, + "loss": 0.0191, + "num_input_tokens_seen": 77138720, + "step": 35685 + }, + { + "epoch": 5.822185970636215, + "grad_norm": 0.020748404785990715, + "learning_rate": 0.000892819256965749, + "loss": 0.0561, + "num_input_tokens_seen": 77150304, + "step": 35690 + }, + { + "epoch": 5.82300163132137, + "grad_norm": 0.006461436860263348, + "learning_rate": 0.0008927752150395092, + "loss": 0.0242, + "num_input_tokens_seen": 77161408, + "step": 35695 + }, + { + "epoch": 5.823817292006526, + "grad_norm": 0.04613238573074341, + "learning_rate": 0.0008927311651531813, + "loss": 0.1324, + "num_input_tokens_seen": 77171104, + "step": 35700 + }, + { + "epoch": 5.8246329526916805, + "grad_norm": 0.03033752180635929, + "learning_rate": 0.0008926871073076581, + "loss": 0.0144, + "num_input_tokens_seen": 77183264, + "step": 35705 + }, + { + "epoch": 5.825448613376835, + "grad_norm": 0.0055288695730268955, + "learning_rate": 0.0008926430415038324, + "loss": 0.102, + "num_input_tokens_seen": 77192608, + "step": 35710 + }, + { + "epoch": 5.82626427406199, + "grad_norm": 0.3753073215484619, + "learning_rate": 0.0008925989677425976, + "loss": 0.1775, + "num_input_tokens_seen": 77202432, + "step": 35715 + }, + { + "epoch": 5.827079934747145, + "grad_norm": 0.01609216444194317, + "learning_rate": 0.0008925548860248464, + "loss": 0.0581, + "num_input_tokens_seen": 77213376, + "step": 35720 + }, + { + "epoch": 5.827895595432301, + "grad_norm": 0.12528999149799347, + "learning_rate": 0.0008925107963514727, + "loss": 0.0309, + "num_input_tokens_seen": 77224448, + "step": 35725 + }, + { + "epoch": 5.828711256117455, + "grad_norm": 0.05051087588071823, + "learning_rate": 0.0008924666987233697, + "loss": 0.1069, + "num_input_tokens_seen": 77235264, + "step": 35730 + }, + { + "epoch": 5.82952691680261, + "grad_norm": 0.03082728572189808, + "learning_rate": 0.0008924225931414312, + "loss": 0.0907, + "num_input_tokens_seen": 77245280, + "step": 35735 + }, + { + "epoch": 5.830342577487765, + "grad_norm": 0.024117425084114075, + "learning_rate": 0.000892378479606551, + "loss": 0.0191, + "num_input_tokens_seen": 77254976, + "step": 35740 + }, + { + "epoch": 5.83115823817292, + "grad_norm": 0.00413041515275836, + "learning_rate": 0.0008923343581196231, + "loss": 0.0402, + "num_input_tokens_seen": 77264992, + "step": 35745 + }, + { + "epoch": 5.831973898858075, + "grad_norm": 0.027068182826042175, + "learning_rate": 0.0008922902286815417, + "loss": 0.0574, + "num_input_tokens_seen": 77277600, + "step": 35750 + }, + { + "epoch": 5.8327895595432295, + "grad_norm": 0.003934292122721672, + "learning_rate": 0.0008922460912932013, + "loss": 0.0348, + "num_input_tokens_seen": 77288544, + "step": 35755 + }, + { + "epoch": 5.833605220228385, + "grad_norm": 0.010412490926682949, + "learning_rate": 0.0008922019459554961, + "loss": 0.1897, + "num_input_tokens_seen": 77299648, + "step": 35760 + }, + { + "epoch": 5.83442088091354, + "grad_norm": 0.15992146730422974, + "learning_rate": 0.000892157792669321, + "loss": 0.0811, + "num_input_tokens_seen": 77311008, + "step": 35765 + }, + { + "epoch": 5.835236541598695, + "grad_norm": 0.032339174300432205, + "learning_rate": 0.0008921136314355706, + "loss": 0.0093, + "num_input_tokens_seen": 77320928, + "step": 35770 + }, + { + "epoch": 5.83605220228385, + "grad_norm": 0.014086034148931503, + "learning_rate": 0.0008920694622551402, + "loss": 0.0587, + "num_input_tokens_seen": 77331328, + "step": 35775 + }, + { + "epoch": 5.8368678629690045, + "grad_norm": 0.3389575481414795, + "learning_rate": 0.0008920252851289248, + "loss": 0.236, + "num_input_tokens_seen": 77342272, + "step": 35780 + }, + { + "epoch": 5.83768352365416, + "grad_norm": 0.017857255414128304, + "learning_rate": 0.0008919811000578195, + "loss": 0.0768, + "num_input_tokens_seen": 77353952, + "step": 35785 + }, + { + "epoch": 5.838499184339315, + "grad_norm": 0.023409778252243996, + "learning_rate": 0.0008919369070427201, + "loss": 0.0528, + "num_input_tokens_seen": 77364992, + "step": 35790 + }, + { + "epoch": 5.83931484502447, + "grad_norm": 0.34260937571525574, + "learning_rate": 0.000891892706084522, + "loss": 0.0514, + "num_input_tokens_seen": 77375424, + "step": 35795 + }, + { + "epoch": 5.840130505709625, + "grad_norm": 0.15101541578769684, + "learning_rate": 0.0008918484971841211, + "loss": 0.0591, + "num_input_tokens_seen": 77384576, + "step": 35800 + }, + { + "epoch": 5.8409461663947795, + "grad_norm": 0.026871444657444954, + "learning_rate": 0.0008918042803424133, + "loss": 0.0332, + "num_input_tokens_seen": 77395584, + "step": 35805 + }, + { + "epoch": 5.841761827079935, + "grad_norm": 0.04889817163348198, + "learning_rate": 0.0008917600555602947, + "loss": 0.1316, + "num_input_tokens_seen": 77406944, + "step": 35810 + }, + { + "epoch": 5.84257748776509, + "grad_norm": 0.012206361629068851, + "learning_rate": 0.0008917158228386616, + "loss": 0.13, + "num_input_tokens_seen": 77418240, + "step": 35815 + }, + { + "epoch": 5.843393148450245, + "grad_norm": 0.005175419617444277, + "learning_rate": 0.0008916715821784105, + "loss": 0.018, + "num_input_tokens_seen": 77429920, + "step": 35820 + }, + { + "epoch": 5.8442088091354, + "grad_norm": 0.05672929808497429, + "learning_rate": 0.0008916273335804377, + "loss": 0.0394, + "num_input_tokens_seen": 77441504, + "step": 35825 + }, + { + "epoch": 5.8450244698205545, + "grad_norm": 0.0816783681511879, + "learning_rate": 0.0008915830770456403, + "loss": 0.038, + "num_input_tokens_seen": 77452320, + "step": 35830 + }, + { + "epoch": 5.845840130505709, + "grad_norm": 0.09134317189455032, + "learning_rate": 0.0008915388125749152, + "loss": 0.0466, + "num_input_tokens_seen": 77463168, + "step": 35835 + }, + { + "epoch": 5.846655791190865, + "grad_norm": 0.05557303503155708, + "learning_rate": 0.0008914945401691592, + "loss": 0.0353, + "num_input_tokens_seen": 77474464, + "step": 35840 + }, + { + "epoch": 5.84747145187602, + "grad_norm": 0.3143428564071655, + "learning_rate": 0.0008914502598292698, + "loss": 0.2314, + "num_input_tokens_seen": 77485792, + "step": 35845 + }, + { + "epoch": 5.848287112561175, + "grad_norm": 0.23155027627944946, + "learning_rate": 0.0008914059715561442, + "loss": 0.2272, + "num_input_tokens_seen": 77497568, + "step": 35850 + }, + { + "epoch": 5.849102773246329, + "grad_norm": 0.5001063346862793, + "learning_rate": 0.0008913616753506801, + "loss": 0.0709, + "num_input_tokens_seen": 77507296, + "step": 35855 + }, + { + "epoch": 5.849918433931484, + "grad_norm": 0.22688445448875427, + "learning_rate": 0.0008913173712137752, + "loss": 0.0571, + "num_input_tokens_seen": 77517856, + "step": 35860 + }, + { + "epoch": 5.850734094616639, + "grad_norm": 0.014077413827180862, + "learning_rate": 0.0008912730591463274, + "loss": 0.0847, + "num_input_tokens_seen": 77528320, + "step": 35865 + }, + { + "epoch": 5.851549755301795, + "grad_norm": 0.005804943386465311, + "learning_rate": 0.0008912287391492345, + "loss": 0.0677, + "num_input_tokens_seen": 77539392, + "step": 35870 + }, + { + "epoch": 5.85236541598695, + "grad_norm": 0.049013834446668625, + "learning_rate": 0.0008911844112233951, + "loss": 0.0577, + "num_input_tokens_seen": 77550176, + "step": 35875 + }, + { + "epoch": 5.853181076672104, + "grad_norm": 0.42374399304389954, + "learning_rate": 0.0008911400753697072, + "loss": 0.1369, + "num_input_tokens_seen": 77560864, + "step": 35880 + }, + { + "epoch": 5.853996737357259, + "grad_norm": 0.04476075619459152, + "learning_rate": 0.0008910957315890695, + "loss": 0.0288, + "num_input_tokens_seen": 77570752, + "step": 35885 + }, + { + "epoch": 5.854812398042414, + "grad_norm": 0.011624328792095184, + "learning_rate": 0.0008910513798823807, + "loss": 0.0218, + "num_input_tokens_seen": 77581600, + "step": 35890 + }, + { + "epoch": 5.85562805872757, + "grad_norm": 0.006588727701455355, + "learning_rate": 0.0008910070202505396, + "loss": 0.0593, + "num_input_tokens_seen": 77593536, + "step": 35895 + }, + { + "epoch": 5.856443719412725, + "grad_norm": 0.28029340505599976, + "learning_rate": 0.0008909626526944452, + "loss": 0.2462, + "num_input_tokens_seen": 77603328, + "step": 35900 + }, + { + "epoch": 5.857259380097879, + "grad_norm": 0.04250373691320419, + "learning_rate": 0.0008909182772149966, + "loss": 0.0479, + "num_input_tokens_seen": 77612800, + "step": 35905 + }, + { + "epoch": 5.858075040783034, + "grad_norm": 0.0644042119383812, + "learning_rate": 0.0008908738938130933, + "loss": 0.0841, + "num_input_tokens_seen": 77622304, + "step": 35910 + }, + { + "epoch": 5.858890701468189, + "grad_norm": 0.07935375720262527, + "learning_rate": 0.0008908295024896346, + "loss": 0.0657, + "num_input_tokens_seen": 77634336, + "step": 35915 + }, + { + "epoch": 5.859706362153344, + "grad_norm": 0.018218394368886948, + "learning_rate": 0.0008907851032455204, + "loss": 0.0737, + "num_input_tokens_seen": 77645504, + "step": 35920 + }, + { + "epoch": 5.8605220228384995, + "grad_norm": 0.137411966919899, + "learning_rate": 0.0008907406960816502, + "loss": 0.0551, + "num_input_tokens_seen": 77657472, + "step": 35925 + }, + { + "epoch": 5.861337683523654, + "grad_norm": 0.22196845710277557, + "learning_rate": 0.0008906962809989242, + "loss": 0.1278, + "num_input_tokens_seen": 77669344, + "step": 35930 + }, + { + "epoch": 5.862153344208809, + "grad_norm": 0.0576411709189415, + "learning_rate": 0.0008906518579982423, + "loss": 0.0599, + "num_input_tokens_seen": 77679648, + "step": 35935 + }, + { + "epoch": 5.862969004893964, + "grad_norm": 0.03521393612027168, + "learning_rate": 0.000890607427080505, + "loss": 0.0273, + "num_input_tokens_seen": 77689440, + "step": 35940 + }, + { + "epoch": 5.863784665579119, + "grad_norm": 0.034378282725811005, + "learning_rate": 0.0008905629882466126, + "loss": 0.0667, + "num_input_tokens_seen": 77700672, + "step": 35945 + }, + { + "epoch": 5.864600326264274, + "grad_norm": 0.004574810154736042, + "learning_rate": 0.0008905185414974659, + "loss": 0.0447, + "num_input_tokens_seen": 77710368, + "step": 35950 + }, + { + "epoch": 5.865415986949429, + "grad_norm": 0.07072892785072327, + "learning_rate": 0.0008904740868339655, + "loss": 0.1731, + "num_input_tokens_seen": 77721728, + "step": 35955 + }, + { + "epoch": 5.866231647634584, + "grad_norm": 0.044736456125974655, + "learning_rate": 0.0008904296242570123, + "loss": 0.052, + "num_input_tokens_seen": 77732448, + "step": 35960 + }, + { + "epoch": 5.867047308319739, + "grad_norm": 0.0900491327047348, + "learning_rate": 0.0008903851537675076, + "loss": 0.0902, + "num_input_tokens_seen": 77743520, + "step": 35965 + }, + { + "epoch": 5.867862969004894, + "grad_norm": 0.0018697967752814293, + "learning_rate": 0.0008903406753663524, + "loss": 0.1432, + "num_input_tokens_seen": 77754656, + "step": 35970 + }, + { + "epoch": 5.868678629690049, + "grad_norm": 0.07420942187309265, + "learning_rate": 0.0008902961890544483, + "loss": 0.1013, + "num_input_tokens_seen": 77764960, + "step": 35975 + }, + { + "epoch": 5.869494290375204, + "grad_norm": 0.014277939684689045, + "learning_rate": 0.0008902516948326967, + "loss": 0.2049, + "num_input_tokens_seen": 77776672, + "step": 35980 + }, + { + "epoch": 5.870309951060359, + "grad_norm": 0.1220325455069542, + "learning_rate": 0.0008902071927019996, + "loss": 0.1353, + "num_input_tokens_seen": 77788352, + "step": 35985 + }, + { + "epoch": 5.871125611745514, + "grad_norm": 0.017809653654694557, + "learning_rate": 0.0008901626826632586, + "loss": 0.04, + "num_input_tokens_seen": 77797504, + "step": 35990 + }, + { + "epoch": 5.871941272430669, + "grad_norm": 0.010331861674785614, + "learning_rate": 0.000890118164717376, + "loss": 0.0541, + "num_input_tokens_seen": 77807136, + "step": 35995 + }, + { + "epoch": 5.872756933115824, + "grad_norm": 0.33883270621299744, + "learning_rate": 0.0008900736388652537, + "loss": 0.205, + "num_input_tokens_seen": 77817728, + "step": 36000 + }, + { + "epoch": 5.873572593800979, + "grad_norm": 0.021140409633517265, + "learning_rate": 0.0008900291051077944, + "loss": 0.1508, + "num_input_tokens_seen": 77828032, + "step": 36005 + }, + { + "epoch": 5.874388254486134, + "grad_norm": 0.005671947728842497, + "learning_rate": 0.0008899845634459005, + "loss": 0.0407, + "num_input_tokens_seen": 77838272, + "step": 36010 + }, + { + "epoch": 5.875203915171289, + "grad_norm": 0.11678887158632278, + "learning_rate": 0.0008899400138804748, + "loss": 0.1261, + "num_input_tokens_seen": 77848640, + "step": 36015 + }, + { + "epoch": 5.876019575856444, + "grad_norm": 0.11311411112546921, + "learning_rate": 0.0008898954564124197, + "loss": 0.1584, + "num_input_tokens_seen": 77860256, + "step": 36020 + }, + { + "epoch": 5.876835236541599, + "grad_norm": 0.03331954777240753, + "learning_rate": 0.0008898508910426388, + "loss": 0.0781, + "num_input_tokens_seen": 77871040, + "step": 36025 + }, + { + "epoch": 5.877650897226753, + "grad_norm": 0.1294984668493271, + "learning_rate": 0.0008898063177720351, + "loss": 0.1216, + "num_input_tokens_seen": 77882400, + "step": 36030 + }, + { + "epoch": 5.878466557911908, + "grad_norm": 0.0061601377092301846, + "learning_rate": 0.0008897617366015118, + "loss": 0.0277, + "num_input_tokens_seen": 77893216, + "step": 36035 + }, + { + "epoch": 5.879282218597064, + "grad_norm": 0.03422814980149269, + "learning_rate": 0.0008897171475319723, + "loss": 0.069, + "num_input_tokens_seen": 77905088, + "step": 36040 + }, + { + "epoch": 5.880097879282219, + "grad_norm": 0.07074693590402603, + "learning_rate": 0.0008896725505643206, + "loss": 0.0383, + "num_input_tokens_seen": 77914624, + "step": 36045 + }, + { + "epoch": 5.8809135399673735, + "grad_norm": 0.2282271385192871, + "learning_rate": 0.0008896279456994603, + "loss": 0.0993, + "num_input_tokens_seen": 77924896, + "step": 36050 + }, + { + "epoch": 5.881729200652528, + "grad_norm": 0.04102031886577606, + "learning_rate": 0.0008895833329382954, + "loss": 0.0236, + "num_input_tokens_seen": 77934976, + "step": 36055 + }, + { + "epoch": 5.882544861337683, + "grad_norm": 0.0101171201094985, + "learning_rate": 0.00088953871228173, + "loss": 0.0472, + "num_input_tokens_seen": 77945824, + "step": 36060 + }, + { + "epoch": 5.883360522022839, + "grad_norm": 0.022622620686888695, + "learning_rate": 0.0008894940837306685, + "loss": 0.1508, + "num_input_tokens_seen": 77956704, + "step": 36065 + }, + { + "epoch": 5.884176182707994, + "grad_norm": 0.03825777769088745, + "learning_rate": 0.000889449447286015, + "loss": 0.241, + "num_input_tokens_seen": 77968288, + "step": 36070 + }, + { + "epoch": 5.8849918433931485, + "grad_norm": 0.027706053107976913, + "learning_rate": 0.0008894048029486748, + "loss": 0.0252, + "num_input_tokens_seen": 77979264, + "step": 36075 + }, + { + "epoch": 5.885807504078303, + "grad_norm": 0.06347054243087769, + "learning_rate": 0.0008893601507195521, + "loss": 0.1103, + "num_input_tokens_seen": 77988448, + "step": 36080 + }, + { + "epoch": 5.886623164763458, + "grad_norm": 0.08096565306186676, + "learning_rate": 0.000889315490599552, + "loss": 0.0732, + "num_input_tokens_seen": 77999040, + "step": 36085 + }, + { + "epoch": 5.887438825448614, + "grad_norm": 0.33597108721733093, + "learning_rate": 0.0008892708225895796, + "loss": 0.1519, + "num_input_tokens_seen": 78009408, + "step": 36090 + }, + { + "epoch": 5.888254486133769, + "grad_norm": 0.01979757472872734, + "learning_rate": 0.0008892261466905402, + "loss": 0.0937, + "num_input_tokens_seen": 78019776, + "step": 36095 + }, + { + "epoch": 5.8890701468189235, + "grad_norm": 0.19596102833747864, + "learning_rate": 0.000889181462903339, + "loss": 0.3053, + "num_input_tokens_seen": 78030144, + "step": 36100 + }, + { + "epoch": 5.889885807504078, + "grad_norm": 0.27742302417755127, + "learning_rate": 0.0008891367712288819, + "loss": 0.1255, + "num_input_tokens_seen": 78041152, + "step": 36105 + }, + { + "epoch": 5.890701468189233, + "grad_norm": 0.05575815960764885, + "learning_rate": 0.0008890920716680744, + "loss": 0.1552, + "num_input_tokens_seen": 78052384, + "step": 36110 + }, + { + "epoch": 5.891517128874388, + "grad_norm": 0.013160888105630875, + "learning_rate": 0.0008890473642218226, + "loss": 0.0322, + "num_input_tokens_seen": 78062400, + "step": 36115 + }, + { + "epoch": 5.892332789559543, + "grad_norm": 0.009605771861970425, + "learning_rate": 0.0008890026488910323, + "loss": 0.1056, + "num_input_tokens_seen": 78072480, + "step": 36120 + }, + { + "epoch": 5.8931484502446985, + "grad_norm": 0.0945558175444603, + "learning_rate": 0.0008889579256766098, + "loss": 0.0372, + "num_input_tokens_seen": 78083968, + "step": 36125 + }, + { + "epoch": 5.893964110929853, + "grad_norm": 0.14063893258571625, + "learning_rate": 0.0008889131945794618, + "loss": 0.0596, + "num_input_tokens_seen": 78094176, + "step": 36130 + }, + { + "epoch": 5.894779771615008, + "grad_norm": 0.02134103700518608, + "learning_rate": 0.0008888684556004942, + "loss": 0.0324, + "num_input_tokens_seen": 78103872, + "step": 36135 + }, + { + "epoch": 5.895595432300163, + "grad_norm": 0.13972730934619904, + "learning_rate": 0.0008888237087406141, + "loss": 0.0643, + "num_input_tokens_seen": 78114656, + "step": 36140 + }, + { + "epoch": 5.896411092985318, + "grad_norm": 0.06875422596931458, + "learning_rate": 0.0008887789540007285, + "loss": 0.1673, + "num_input_tokens_seen": 78125312, + "step": 36145 + }, + { + "epoch": 5.897226753670473, + "grad_norm": 0.02877797745168209, + "learning_rate": 0.000888734191381744, + "loss": 0.035, + "num_input_tokens_seen": 78135936, + "step": 36150 + }, + { + "epoch": 5.898042414355628, + "grad_norm": 0.06866522133350372, + "learning_rate": 0.000888689420884568, + "loss": 0.1522, + "num_input_tokens_seen": 78148384, + "step": 36155 + }, + { + "epoch": 5.898858075040783, + "grad_norm": 0.013201478868722916, + "learning_rate": 0.0008886446425101078, + "loss": 0.0963, + "num_input_tokens_seen": 78158368, + "step": 36160 + }, + { + "epoch": 5.899673735725938, + "grad_norm": 0.06638146191835403, + "learning_rate": 0.0008885998562592709, + "loss": 0.0219, + "num_input_tokens_seen": 78167680, + "step": 36165 + }, + { + "epoch": 5.900489396411093, + "grad_norm": 0.2783553898334503, + "learning_rate": 0.0008885550621329649, + "loss": 0.0815, + "num_input_tokens_seen": 78178048, + "step": 36170 + }, + { + "epoch": 5.901305057096248, + "grad_norm": 0.07050324976444244, + "learning_rate": 0.0008885102601320976, + "loss": 0.0394, + "num_input_tokens_seen": 78187360, + "step": 36175 + }, + { + "epoch": 5.902120717781403, + "grad_norm": 0.01027140486985445, + "learning_rate": 0.0008884654502575771, + "loss": 0.0568, + "num_input_tokens_seen": 78198912, + "step": 36180 + }, + { + "epoch": 5.902936378466558, + "grad_norm": 0.009896304458379745, + "learning_rate": 0.0008884206325103115, + "loss": 0.0781, + "num_input_tokens_seen": 78209952, + "step": 36185 + }, + { + "epoch": 5.903752039151713, + "grad_norm": 0.010709409601986408, + "learning_rate": 0.000888375806891209, + "loss": 0.1165, + "num_input_tokens_seen": 78219712, + "step": 36190 + }, + { + "epoch": 5.904567699836868, + "grad_norm": 0.04064284265041351, + "learning_rate": 0.0008883309734011779, + "loss": 0.0475, + "num_input_tokens_seen": 78231776, + "step": 36195 + }, + { + "epoch": 5.9053833605220225, + "grad_norm": 0.038544662296772, + "learning_rate": 0.0008882861320411273, + "loss": 0.0941, + "num_input_tokens_seen": 78243648, + "step": 36200 + }, + { + "epoch": 5.906199021207177, + "grad_norm": 0.05744968354701996, + "learning_rate": 0.0008882412828119655, + "loss": 0.0914, + "num_input_tokens_seen": 78253792, + "step": 36205 + }, + { + "epoch": 5.907014681892333, + "grad_norm": 0.012252528220415115, + "learning_rate": 0.0008881964257146015, + "loss": 0.0543, + "num_input_tokens_seen": 78263616, + "step": 36210 + }, + { + "epoch": 5.907830342577488, + "grad_norm": 0.18378940224647522, + "learning_rate": 0.0008881515607499446, + "loss": 0.1035, + "num_input_tokens_seen": 78275296, + "step": 36215 + }, + { + "epoch": 5.908646003262643, + "grad_norm": 0.16523931920528412, + "learning_rate": 0.000888106687918904, + "loss": 0.1301, + "num_input_tokens_seen": 78286336, + "step": 36220 + }, + { + "epoch": 5.9094616639477975, + "grad_norm": 0.05260119214653969, + "learning_rate": 0.000888061807222389, + "loss": 0.0246, + "num_input_tokens_seen": 78296640, + "step": 36225 + }, + { + "epoch": 5.910277324632952, + "grad_norm": 0.003584100864827633, + "learning_rate": 0.000888016918661309, + "loss": 0.0172, + "num_input_tokens_seen": 78306560, + "step": 36230 + }, + { + "epoch": 5.911092985318108, + "grad_norm": 0.2283678501844406, + "learning_rate": 0.0008879720222365739, + "loss": 0.1179, + "num_input_tokens_seen": 78317248, + "step": 36235 + }, + { + "epoch": 5.911908646003263, + "grad_norm": 0.08701247721910477, + "learning_rate": 0.0008879271179490938, + "loss": 0.133, + "num_input_tokens_seen": 78328992, + "step": 36240 + }, + { + "epoch": 5.912724306688418, + "grad_norm": 0.09132330864667892, + "learning_rate": 0.0008878822057997784, + "loss": 0.0695, + "num_input_tokens_seen": 78340384, + "step": 36245 + }, + { + "epoch": 5.9135399673735725, + "grad_norm": 0.2684599459171295, + "learning_rate": 0.000887837285789538, + "loss": 0.1982, + "num_input_tokens_seen": 78352096, + "step": 36250 + }, + { + "epoch": 5.914355628058727, + "grad_norm": 0.037875618785619736, + "learning_rate": 0.0008877923579192831, + "loss": 0.0138, + "num_input_tokens_seen": 78361728, + "step": 36255 + }, + { + "epoch": 5.915171288743883, + "grad_norm": 0.009383009746670723, + "learning_rate": 0.0008877474221899241, + "loss": 0.0733, + "num_input_tokens_seen": 78373440, + "step": 36260 + }, + { + "epoch": 5.915986949429038, + "grad_norm": 0.2724219858646393, + "learning_rate": 0.0008877024786023718, + "loss": 0.3018, + "num_input_tokens_seen": 78383744, + "step": 36265 + }, + { + "epoch": 5.916802610114193, + "grad_norm": 0.08910335600376129, + "learning_rate": 0.0008876575271575366, + "loss": 0.086, + "num_input_tokens_seen": 78394080, + "step": 36270 + }, + { + "epoch": 5.917618270799347, + "grad_norm": 0.08044688403606415, + "learning_rate": 0.0008876125678563301, + "loss": 0.1419, + "num_input_tokens_seen": 78404416, + "step": 36275 + }, + { + "epoch": 5.918433931484502, + "grad_norm": 0.1438187211751938, + "learning_rate": 0.0008875676006996631, + "loss": 0.1822, + "num_input_tokens_seen": 78414624, + "step": 36280 + }, + { + "epoch": 5.919249592169657, + "grad_norm": 0.5074575543403625, + "learning_rate": 0.0008875226256884471, + "loss": 0.1391, + "num_input_tokens_seen": 78424704, + "step": 36285 + }, + { + "epoch": 5.920065252854813, + "grad_norm": 0.14423705637454987, + "learning_rate": 0.0008874776428235933, + "loss": 0.1201, + "num_input_tokens_seen": 78436064, + "step": 36290 + }, + { + "epoch": 5.920880913539968, + "grad_norm": 0.21933916211128235, + "learning_rate": 0.0008874326521060138, + "loss": 0.0663, + "num_input_tokens_seen": 78447200, + "step": 36295 + }, + { + "epoch": 5.921696574225122, + "grad_norm": 0.02093925140798092, + "learning_rate": 0.0008873876535366199, + "loss": 0.0535, + "num_input_tokens_seen": 78459552, + "step": 36300 + }, + { + "epoch": 5.922512234910277, + "grad_norm": 0.06091681867837906, + "learning_rate": 0.0008873426471163238, + "loss": 0.0752, + "num_input_tokens_seen": 78470912, + "step": 36305 + }, + { + "epoch": 5.923327895595432, + "grad_norm": 0.038125790655612946, + "learning_rate": 0.0008872976328460376, + "loss": 0.144, + "num_input_tokens_seen": 78481728, + "step": 36310 + }, + { + "epoch": 5.924143556280587, + "grad_norm": 0.2037804126739502, + "learning_rate": 0.0008872526107266736, + "loss": 0.1306, + "num_input_tokens_seen": 78493248, + "step": 36315 + }, + { + "epoch": 5.924959216965743, + "grad_norm": 0.17241688072681427, + "learning_rate": 0.0008872075807591442, + "loss": 0.273, + "num_input_tokens_seen": 78504224, + "step": 36320 + }, + { + "epoch": 5.925774877650897, + "grad_norm": 0.04938659444451332, + "learning_rate": 0.0008871625429443617, + "loss": 0.1786, + "num_input_tokens_seen": 78514944, + "step": 36325 + }, + { + "epoch": 5.926590538336052, + "grad_norm": 0.13024407625198364, + "learning_rate": 0.0008871174972832394, + "loss": 0.0791, + "num_input_tokens_seen": 78525920, + "step": 36330 + }, + { + "epoch": 5.927406199021207, + "grad_norm": 0.02717706933617592, + "learning_rate": 0.0008870724437766898, + "loss": 0.0405, + "num_input_tokens_seen": 78537888, + "step": 36335 + }, + { + "epoch": 5.928221859706362, + "grad_norm": 0.009567965753376484, + "learning_rate": 0.0008870273824256261, + "loss": 0.0324, + "num_input_tokens_seen": 78547584, + "step": 36340 + }, + { + "epoch": 5.9290375203915175, + "grad_norm": 0.0030902696307748556, + "learning_rate": 0.0008869823132309616, + "loss": 0.0572, + "num_input_tokens_seen": 78557760, + "step": 36345 + }, + { + "epoch": 5.929853181076672, + "grad_norm": 0.049891430884599686, + "learning_rate": 0.0008869372361936096, + "loss": 0.0572, + "num_input_tokens_seen": 78568640, + "step": 36350 + }, + { + "epoch": 5.930668841761827, + "grad_norm": 0.01841222122311592, + "learning_rate": 0.0008868921513144835, + "loss": 0.0685, + "num_input_tokens_seen": 78580224, + "step": 36355 + }, + { + "epoch": 5.931484502446982, + "grad_norm": 0.03582854941487312, + "learning_rate": 0.0008868470585944972, + "loss": 0.0341, + "num_input_tokens_seen": 78590432, + "step": 36360 + }, + { + "epoch": 5.932300163132137, + "grad_norm": 0.15736792981624603, + "learning_rate": 0.0008868019580345645, + "loss": 0.0631, + "num_input_tokens_seen": 78601696, + "step": 36365 + }, + { + "epoch": 5.933115823817292, + "grad_norm": 0.07770948112010956, + "learning_rate": 0.0008867568496355996, + "loss": 0.0571, + "num_input_tokens_seen": 78613024, + "step": 36370 + }, + { + "epoch": 5.933931484502447, + "grad_norm": 0.010174530558288097, + "learning_rate": 0.0008867117333985164, + "loss": 0.18, + "num_input_tokens_seen": 78624064, + "step": 36375 + }, + { + "epoch": 5.934747145187602, + "grad_norm": 0.08652004599571228, + "learning_rate": 0.0008866666093242292, + "loss": 0.0473, + "num_input_tokens_seen": 78635104, + "step": 36380 + }, + { + "epoch": 5.935562805872757, + "grad_norm": 0.17718979716300964, + "learning_rate": 0.0008866214774136528, + "loss": 0.2155, + "num_input_tokens_seen": 78646272, + "step": 36385 + }, + { + "epoch": 5.936378466557912, + "grad_norm": 0.0171345341950655, + "learning_rate": 0.0008865763376677017, + "loss": 0.0465, + "num_input_tokens_seen": 78656640, + "step": 36390 + }, + { + "epoch": 5.937194127243067, + "grad_norm": 0.13406716287136078, + "learning_rate": 0.0008865311900872905, + "loss": 0.1356, + "num_input_tokens_seen": 78667200, + "step": 36395 + }, + { + "epoch": 5.938009787928221, + "grad_norm": 0.08030443638563156, + "learning_rate": 0.0008864860346733346, + "loss": 0.0393, + "num_input_tokens_seen": 78677184, + "step": 36400 + }, + { + "epoch": 5.938825448613377, + "grad_norm": 0.08462988585233688, + "learning_rate": 0.0008864408714267489, + "loss": 0.1079, + "num_input_tokens_seen": 78689440, + "step": 36405 + }, + { + "epoch": 5.939641109298532, + "grad_norm": 0.35317009687423706, + "learning_rate": 0.0008863957003484486, + "loss": 0.112, + "num_input_tokens_seen": 78700640, + "step": 36410 + }, + { + "epoch": 5.940456769983687, + "grad_norm": 0.03767653927206993, + "learning_rate": 0.0008863505214393494, + "loss": 0.1616, + "num_input_tokens_seen": 78710976, + "step": 36415 + }, + { + "epoch": 5.941272430668842, + "grad_norm": 0.0903841108083725, + "learning_rate": 0.0008863053347003667, + "loss": 0.1289, + "num_input_tokens_seen": 78722240, + "step": 36420 + }, + { + "epoch": 5.942088091353996, + "grad_norm": 0.014996036887168884, + "learning_rate": 0.0008862601401324162, + "loss": 0.1195, + "num_input_tokens_seen": 78732704, + "step": 36425 + }, + { + "epoch": 5.942903752039152, + "grad_norm": 0.06513303518295288, + "learning_rate": 0.0008862149377364142, + "loss": 0.1074, + "num_input_tokens_seen": 78742592, + "step": 36430 + }, + { + "epoch": 5.943719412724307, + "grad_norm": 0.13003186881542206, + "learning_rate": 0.0008861697275132763, + "loss": 0.1357, + "num_input_tokens_seen": 78752064, + "step": 36435 + }, + { + "epoch": 5.944535073409462, + "grad_norm": 0.19689883291721344, + "learning_rate": 0.0008861245094639193, + "loss": 0.1116, + "num_input_tokens_seen": 78761984, + "step": 36440 + }, + { + "epoch": 5.945350734094617, + "grad_norm": 0.00810596626251936, + "learning_rate": 0.000886079283589259, + "loss": 0.0769, + "num_input_tokens_seen": 78773504, + "step": 36445 + }, + { + "epoch": 5.946166394779771, + "grad_norm": 0.014026161283254623, + "learning_rate": 0.0008860340498902121, + "loss": 0.1225, + "num_input_tokens_seen": 78784352, + "step": 36450 + }, + { + "epoch": 5.946982055464927, + "grad_norm": 0.08399257808923721, + "learning_rate": 0.0008859888083676958, + "loss": 0.1057, + "num_input_tokens_seen": 78795968, + "step": 36455 + }, + { + "epoch": 5.947797716150082, + "grad_norm": 0.08424603939056396, + "learning_rate": 0.0008859435590226266, + "loss": 0.1024, + "num_input_tokens_seen": 78805984, + "step": 36460 + }, + { + "epoch": 5.948613376835237, + "grad_norm": 0.02504083514213562, + "learning_rate": 0.0008858983018559214, + "loss": 0.0457, + "num_input_tokens_seen": 78817184, + "step": 36465 + }, + { + "epoch": 5.9494290375203915, + "grad_norm": 0.19386163353919983, + "learning_rate": 0.0008858530368684977, + "loss": 0.2027, + "num_input_tokens_seen": 78828160, + "step": 36470 + }, + { + "epoch": 5.950244698205546, + "grad_norm": 0.13911594450473785, + "learning_rate": 0.0008858077640612727, + "loss": 0.1198, + "num_input_tokens_seen": 78838944, + "step": 36475 + }, + { + "epoch": 5.951060358890701, + "grad_norm": 0.020422089844942093, + "learning_rate": 0.0008857624834351639, + "loss": 0.0332, + "num_input_tokens_seen": 78848896, + "step": 36480 + }, + { + "epoch": 5.951876019575856, + "grad_norm": 0.026278553530573845, + "learning_rate": 0.000885717194991089, + "loss": 0.0181, + "num_input_tokens_seen": 78860608, + "step": 36485 + }, + { + "epoch": 5.952691680261012, + "grad_norm": 0.008417508564889431, + "learning_rate": 0.0008856718987299656, + "loss": 0.0644, + "num_input_tokens_seen": 78871200, + "step": 36490 + }, + { + "epoch": 5.9535073409461665, + "grad_norm": 0.045608025044202805, + "learning_rate": 0.0008856265946527122, + "loss": 0.0382, + "num_input_tokens_seen": 78882464, + "step": 36495 + }, + { + "epoch": 5.954323001631321, + "grad_norm": 0.020935669541358948, + "learning_rate": 0.0008855812827602465, + "loss": 0.0739, + "num_input_tokens_seen": 78892416, + "step": 36500 + }, + { + "epoch": 5.955138662316476, + "grad_norm": 0.036478910595178604, + "learning_rate": 0.0008855359630534871, + "loss": 0.0398, + "num_input_tokens_seen": 78904128, + "step": 36505 + }, + { + "epoch": 5.955954323001631, + "grad_norm": 0.025049181655049324, + "learning_rate": 0.0008854906355333522, + "loss": 0.0129, + "num_input_tokens_seen": 78913152, + "step": 36510 + }, + { + "epoch": 5.956769983686787, + "grad_norm": 0.025541089475154877, + "learning_rate": 0.0008854453002007607, + "loss": 0.0664, + "num_input_tokens_seen": 78924672, + "step": 36515 + }, + { + "epoch": 5.9575856443719415, + "grad_norm": 0.013693660497665405, + "learning_rate": 0.0008853999570566311, + "loss": 0.0741, + "num_input_tokens_seen": 78936224, + "step": 36520 + }, + { + "epoch": 5.958401305057096, + "grad_norm": 0.05598202720284462, + "learning_rate": 0.0008853546061018825, + "loss": 0.1703, + "num_input_tokens_seen": 78947200, + "step": 36525 + }, + { + "epoch": 5.959216965742251, + "grad_norm": 0.030897224321961403, + "learning_rate": 0.000885309247337434, + "loss": 0.0402, + "num_input_tokens_seen": 78959104, + "step": 36530 + }, + { + "epoch": 5.960032626427406, + "grad_norm": 0.006449823267757893, + "learning_rate": 0.0008852638807642048, + "loss": 0.0547, + "num_input_tokens_seen": 78970240, + "step": 36535 + }, + { + "epoch": 5.960848287112562, + "grad_norm": 0.01938220113515854, + "learning_rate": 0.0008852185063831142, + "loss": 0.0472, + "num_input_tokens_seen": 78979872, + "step": 36540 + }, + { + "epoch": 5.9616639477977165, + "grad_norm": 0.01463254727423191, + "learning_rate": 0.000885173124195082, + "loss": 0.0383, + "num_input_tokens_seen": 78992032, + "step": 36545 + }, + { + "epoch": 5.962479608482871, + "grad_norm": 0.015324999578297138, + "learning_rate": 0.0008851277342010278, + "loss": 0.0787, + "num_input_tokens_seen": 79002240, + "step": 36550 + }, + { + "epoch": 5.963295269168026, + "grad_norm": 0.006455022841691971, + "learning_rate": 0.0008850823364018715, + "loss": 0.0297, + "num_input_tokens_seen": 79013632, + "step": 36555 + }, + { + "epoch": 5.964110929853181, + "grad_norm": 0.2201565057039261, + "learning_rate": 0.0008850369307985328, + "loss": 0.0426, + "num_input_tokens_seen": 79023808, + "step": 36560 + }, + { + "epoch": 5.964926590538336, + "grad_norm": 0.05902295187115669, + "learning_rate": 0.0008849915173919327, + "loss": 0.0371, + "num_input_tokens_seen": 79034560, + "step": 36565 + }, + { + "epoch": 5.9657422512234906, + "grad_norm": 0.013868676498532295, + "learning_rate": 0.0008849460961829909, + "loss": 0.0969, + "num_input_tokens_seen": 79045824, + "step": 36570 + }, + { + "epoch": 5.966557911908646, + "grad_norm": 0.33222365379333496, + "learning_rate": 0.0008849006671726281, + "loss": 0.3332, + "num_input_tokens_seen": 79055648, + "step": 36575 + }, + { + "epoch": 5.967373572593801, + "grad_norm": 0.2062501609325409, + "learning_rate": 0.0008848552303617651, + "loss": 0.0749, + "num_input_tokens_seen": 79066752, + "step": 36580 + }, + { + "epoch": 5.968189233278956, + "grad_norm": 0.036664046347141266, + "learning_rate": 0.0008848097857513227, + "loss": 0.0406, + "num_input_tokens_seen": 79077984, + "step": 36585 + }, + { + "epoch": 5.969004893964111, + "grad_norm": 0.1617347002029419, + "learning_rate": 0.0008847643333422216, + "loss": 0.0892, + "num_input_tokens_seen": 79089888, + "step": 36590 + }, + { + "epoch": 5.9698205546492655, + "grad_norm": 0.048055000603199005, + "learning_rate": 0.0008847188731353833, + "loss": 0.2289, + "num_input_tokens_seen": 79101312, + "step": 36595 + }, + { + "epoch": 5.970636215334421, + "grad_norm": 0.017596082761883736, + "learning_rate": 0.0008846734051317289, + "loss": 0.0828, + "num_input_tokens_seen": 79111616, + "step": 36600 + }, + { + "epoch": 5.971451876019576, + "grad_norm": 0.005010406486690044, + "learning_rate": 0.0008846279293321801, + "loss": 0.1542, + "num_input_tokens_seen": 79121632, + "step": 36605 + }, + { + "epoch": 5.972267536704731, + "grad_norm": 0.2197558879852295, + "learning_rate": 0.0008845824457376583, + "loss": 0.1049, + "num_input_tokens_seen": 79132672, + "step": 36610 + }, + { + "epoch": 5.973083197389886, + "grad_norm": 0.12246564775705338, + "learning_rate": 0.0008845369543490853, + "loss": 0.1219, + "num_input_tokens_seen": 79142304, + "step": 36615 + }, + { + "epoch": 5.9738988580750405, + "grad_norm": 0.38152462244033813, + "learning_rate": 0.0008844914551673832, + "loss": 0.1163, + "num_input_tokens_seen": 79153088, + "step": 36620 + }, + { + "epoch": 5.974714518760196, + "grad_norm": 0.05257618799805641, + "learning_rate": 0.000884445948193474, + "loss": 0.1031, + "num_input_tokens_seen": 79164800, + "step": 36625 + }, + { + "epoch": 5.975530179445351, + "grad_norm": 0.0162787064909935, + "learning_rate": 0.0008844004334282801, + "loss": 0.0145, + "num_input_tokens_seen": 79175072, + "step": 36630 + }, + { + "epoch": 5.976345840130506, + "grad_norm": 0.2210160195827484, + "learning_rate": 0.0008843549108727234, + "loss": 0.1055, + "num_input_tokens_seen": 79186016, + "step": 36635 + }, + { + "epoch": 5.977161500815661, + "grad_norm": 0.11478012800216675, + "learning_rate": 0.0008843093805277271, + "loss": 0.1818, + "num_input_tokens_seen": 79196224, + "step": 36640 + }, + { + "epoch": 5.9779771615008155, + "grad_norm": 0.05836905911564827, + "learning_rate": 0.0008842638423942136, + "loss": 0.1286, + "num_input_tokens_seen": 79208320, + "step": 36645 + }, + { + "epoch": 5.97879282218597, + "grad_norm": 0.0437001995742321, + "learning_rate": 0.0008842182964731058, + "loss": 0.0551, + "num_input_tokens_seen": 79218528, + "step": 36650 + }, + { + "epoch": 5.979608482871125, + "grad_norm": 0.09292764961719513, + "learning_rate": 0.0008841727427653269, + "loss": 0.0908, + "num_input_tokens_seen": 79230144, + "step": 36655 + }, + { + "epoch": 5.980424143556281, + "grad_norm": 0.186200350522995, + "learning_rate": 0.0008841271812717999, + "loss": 0.1414, + "num_input_tokens_seen": 79240960, + "step": 36660 + }, + { + "epoch": 5.981239804241436, + "grad_norm": 0.30797278881073, + "learning_rate": 0.0008840816119934485, + "loss": 0.1893, + "num_input_tokens_seen": 79251616, + "step": 36665 + }, + { + "epoch": 5.9820554649265905, + "grad_norm": 0.010716347023844719, + "learning_rate": 0.0008840360349311958, + "loss": 0.0335, + "num_input_tokens_seen": 79262560, + "step": 36670 + }, + { + "epoch": 5.982871125611745, + "grad_norm": 0.014386476948857307, + "learning_rate": 0.0008839904500859656, + "loss": 0.0863, + "num_input_tokens_seen": 79274048, + "step": 36675 + }, + { + "epoch": 5.9836867862969, + "grad_norm": 0.03421509265899658, + "learning_rate": 0.0008839448574586821, + "loss": 0.1078, + "num_input_tokens_seen": 79284736, + "step": 36680 + }, + { + "epoch": 5.984502446982056, + "grad_norm": 0.00656637828797102, + "learning_rate": 0.0008838992570502687, + "loss": 0.0994, + "num_input_tokens_seen": 79295040, + "step": 36685 + }, + { + "epoch": 5.985318107667211, + "grad_norm": 0.08180084824562073, + "learning_rate": 0.0008838536488616499, + "loss": 0.2041, + "num_input_tokens_seen": 79305536, + "step": 36690 + }, + { + "epoch": 5.986133768352365, + "grad_norm": 0.13446083664894104, + "learning_rate": 0.0008838080328937501, + "loss": 0.0699, + "num_input_tokens_seen": 79316960, + "step": 36695 + }, + { + "epoch": 5.98694942903752, + "grad_norm": 0.09626548737287521, + "learning_rate": 0.0008837624091474935, + "loss": 0.0536, + "num_input_tokens_seen": 79327584, + "step": 36700 + }, + { + "epoch": 5.987765089722675, + "grad_norm": 0.07584668695926666, + "learning_rate": 0.0008837167776238049, + "loss": 0.0289, + "num_input_tokens_seen": 79338688, + "step": 36705 + }, + { + "epoch": 5.988580750407831, + "grad_norm": 0.0042076618410646915, + "learning_rate": 0.0008836711383236089, + "loss": 0.1248, + "num_input_tokens_seen": 79349920, + "step": 36710 + }, + { + "epoch": 5.989396411092986, + "grad_norm": 0.019717322662472725, + "learning_rate": 0.0008836254912478308, + "loss": 0.0433, + "num_input_tokens_seen": 79360192, + "step": 36715 + }, + { + "epoch": 5.99021207177814, + "grad_norm": 0.004804224707186222, + "learning_rate": 0.0008835798363973952, + "loss": 0.2098, + "num_input_tokens_seen": 79368992, + "step": 36720 + }, + { + "epoch": 5.991027732463295, + "grad_norm": 0.020206429064273834, + "learning_rate": 0.0008835341737732276, + "loss": 0.0592, + "num_input_tokens_seen": 79379616, + "step": 36725 + }, + { + "epoch": 5.99184339314845, + "grad_norm": 0.0091730747371912, + "learning_rate": 0.0008834885033762536, + "loss": 0.0136, + "num_input_tokens_seen": 79389216, + "step": 36730 + }, + { + "epoch": 5.992659053833605, + "grad_norm": 0.12126602977514267, + "learning_rate": 0.0008834428252073986, + "loss": 0.0618, + "num_input_tokens_seen": 79400064, + "step": 36735 + }, + { + "epoch": 5.993474714518761, + "grad_norm": 0.02198721095919609, + "learning_rate": 0.0008833971392675882, + "loss": 0.0269, + "num_input_tokens_seen": 79411808, + "step": 36740 + }, + { + "epoch": 5.994290375203915, + "grad_norm": 0.006473301909863949, + "learning_rate": 0.0008833514455577485, + "loss": 0.0968, + "num_input_tokens_seen": 79420928, + "step": 36745 + }, + { + "epoch": 5.99510603588907, + "grad_norm": 0.11051056534051895, + "learning_rate": 0.0008833057440788053, + "loss": 0.0488, + "num_input_tokens_seen": 79432768, + "step": 36750 + }, + { + "epoch": 5.995921696574225, + "grad_norm": 0.09950881451368332, + "learning_rate": 0.000883260034831685, + "loss": 0.0649, + "num_input_tokens_seen": 79442720, + "step": 36755 + }, + { + "epoch": 5.99673735725938, + "grad_norm": 0.08756718784570694, + "learning_rate": 0.000883214317817314, + "loss": 0.0541, + "num_input_tokens_seen": 79453344, + "step": 36760 + }, + { + "epoch": 5.997553017944535, + "grad_norm": 0.1397211253643036, + "learning_rate": 0.0008831685930366187, + "loss": 0.0768, + "num_input_tokens_seen": 79463424, + "step": 36765 + }, + { + "epoch": 5.99836867862969, + "grad_norm": 0.004025152884423733, + "learning_rate": 0.0008831228604905257, + "loss": 0.039, + "num_input_tokens_seen": 79475264, + "step": 36770 + }, + { + "epoch": 5.999184339314845, + "grad_norm": 0.0041638934053480625, + "learning_rate": 0.0008830771201799619, + "loss": 0.0379, + "num_input_tokens_seen": 79486464, + "step": 36775 + }, + { + "epoch": 6.0, + "grad_norm": 0.06496407091617584, + "learning_rate": 0.0008830313721058543, + "loss": 0.0721, + "num_input_tokens_seen": 79495984, + "step": 36780 + }, + { + "epoch": 6.0, + "eval_loss": 0.12523896992206573, + "eval_runtime": 103.3104, + "eval_samples_per_second": 26.377, + "eval_steps_per_second": 6.601, + "num_input_tokens_seen": 79495984, + "step": 36780 + }, + { + "epoch": 6.000815660685155, + "grad_norm": 0.19501720368862152, + "learning_rate": 0.00088298561626913, + "loss": 0.1103, + "num_input_tokens_seen": 79505904, + "step": 36785 + }, + { + "epoch": 6.00163132137031, + "grad_norm": 0.0032038709614425898, + "learning_rate": 0.0008829398526707164, + "loss": 0.0167, + "num_input_tokens_seen": 79516880, + "step": 36790 + }, + { + "epoch": 6.002446982055465, + "grad_norm": 0.0025172571185976267, + "learning_rate": 0.0008828940813115408, + "loss": 0.1738, + "num_input_tokens_seen": 79527472, + "step": 36795 + }, + { + "epoch": 6.00326264274062, + "grad_norm": 0.3164461553096771, + "learning_rate": 0.000882848302192531, + "loss": 0.1231, + "num_input_tokens_seen": 79538704, + "step": 36800 + }, + { + "epoch": 6.004078303425775, + "grad_norm": 0.07731668651103973, + "learning_rate": 0.0008828025153146147, + "loss": 0.1044, + "num_input_tokens_seen": 79550096, + "step": 36805 + }, + { + "epoch": 6.00489396411093, + "grad_norm": 0.019409824162721634, + "learning_rate": 0.0008827567206787197, + "loss": 0.1054, + "num_input_tokens_seen": 79561488, + "step": 36810 + }, + { + "epoch": 6.005709624796085, + "grad_norm": 0.038065653294324875, + "learning_rate": 0.0008827109182857742, + "loss": 0.135, + "num_input_tokens_seen": 79572048, + "step": 36815 + }, + { + "epoch": 6.006525285481239, + "grad_norm": 0.04077430069446564, + "learning_rate": 0.0008826651081367065, + "loss": 0.0517, + "num_input_tokens_seen": 79582160, + "step": 36820 + }, + { + "epoch": 6.007340946166395, + "grad_norm": 0.25474321842193604, + "learning_rate": 0.0008826192902324449, + "loss": 0.1317, + "num_input_tokens_seen": 79592304, + "step": 36825 + }, + { + "epoch": 6.00815660685155, + "grad_norm": 0.23740410804748535, + "learning_rate": 0.0008825734645739181, + "loss": 0.1233, + "num_input_tokens_seen": 79602384, + "step": 36830 + }, + { + "epoch": 6.008972267536705, + "grad_norm": 0.18980100750923157, + "learning_rate": 0.0008825276311620546, + "loss": 0.1412, + "num_input_tokens_seen": 79611440, + "step": 36835 + }, + { + "epoch": 6.00978792822186, + "grad_norm": 0.08281727135181427, + "learning_rate": 0.0008824817899977834, + "loss": 0.0658, + "num_input_tokens_seen": 79622800, + "step": 36840 + }, + { + "epoch": 6.010603588907014, + "grad_norm": 0.06491725146770477, + "learning_rate": 0.0008824359410820335, + "loss": 0.0888, + "num_input_tokens_seen": 79634608, + "step": 36845 + }, + { + "epoch": 6.011419249592169, + "grad_norm": 0.10696760565042496, + "learning_rate": 0.0008823900844157342, + "loss": 0.1227, + "num_input_tokens_seen": 79646512, + "step": 36850 + }, + { + "epoch": 6.012234910277325, + "grad_norm": 0.2877226769924164, + "learning_rate": 0.0008823442199998147, + "loss": 0.0735, + "num_input_tokens_seen": 79658160, + "step": 36855 + }, + { + "epoch": 6.01305057096248, + "grad_norm": 0.04781525209546089, + "learning_rate": 0.0008822983478352044, + "loss": 0.1068, + "num_input_tokens_seen": 79668304, + "step": 36860 + }, + { + "epoch": 6.013866231647635, + "grad_norm": 0.29813289642333984, + "learning_rate": 0.0008822524679228332, + "loss": 0.1074, + "num_input_tokens_seen": 79680048, + "step": 36865 + }, + { + "epoch": 6.014681892332789, + "grad_norm": 0.11078273504972458, + "learning_rate": 0.0008822065802636308, + "loss": 0.0679, + "num_input_tokens_seen": 79690224, + "step": 36870 + }, + { + "epoch": 6.015497553017944, + "grad_norm": 0.03565460816025734, + "learning_rate": 0.0008821606848585273, + "loss": 0.0859, + "num_input_tokens_seen": 79700688, + "step": 36875 + }, + { + "epoch": 6.0163132137031, + "grad_norm": 0.02995418943464756, + "learning_rate": 0.0008821147817084526, + "loss": 0.0601, + "num_input_tokens_seen": 79711120, + "step": 36880 + }, + { + "epoch": 6.017128874388255, + "grad_norm": 0.040033359080553055, + "learning_rate": 0.0008820688708143372, + "loss": 0.0825, + "num_input_tokens_seen": 79721360, + "step": 36885 + }, + { + "epoch": 6.0179445350734095, + "grad_norm": 0.19602453708648682, + "learning_rate": 0.0008820229521771112, + "loss": 0.0882, + "num_input_tokens_seen": 79732048, + "step": 36890 + }, + { + "epoch": 6.018760195758564, + "grad_norm": 0.03513888269662857, + "learning_rate": 0.0008819770257977058, + "loss": 0.2078, + "num_input_tokens_seen": 79742864, + "step": 36895 + }, + { + "epoch": 6.019575856443719, + "grad_norm": 0.009727993980050087, + "learning_rate": 0.0008819310916770511, + "loss": 0.0816, + "num_input_tokens_seen": 79754512, + "step": 36900 + }, + { + "epoch": 6.020391517128874, + "grad_norm": 0.04514655843377113, + "learning_rate": 0.0008818851498160785, + "loss": 0.0333, + "num_input_tokens_seen": 79766032, + "step": 36905 + }, + { + "epoch": 6.02120717781403, + "grad_norm": 0.15755507349967957, + "learning_rate": 0.0008818392002157188, + "loss": 0.0285, + "num_input_tokens_seen": 79775792, + "step": 36910 + }, + { + "epoch": 6.0220228384991845, + "grad_norm": 0.07789606600999832, + "learning_rate": 0.0008817932428769033, + "loss": 0.0898, + "num_input_tokens_seen": 79786576, + "step": 36915 + }, + { + "epoch": 6.022838499184339, + "grad_norm": 0.05118430405855179, + "learning_rate": 0.0008817472778005635, + "loss": 0.1647, + "num_input_tokens_seen": 79797456, + "step": 36920 + }, + { + "epoch": 6.023654159869494, + "grad_norm": 0.030694983899593353, + "learning_rate": 0.0008817013049876308, + "loss": 0.1446, + "num_input_tokens_seen": 79808944, + "step": 36925 + }, + { + "epoch": 6.024469820554649, + "grad_norm": 0.023417538031935692, + "learning_rate": 0.0008816553244390368, + "loss": 0.1296, + "num_input_tokens_seen": 79820592, + "step": 36930 + }, + { + "epoch": 6.025285481239805, + "grad_norm": 0.03646084666252136, + "learning_rate": 0.0008816093361557136, + "loss": 0.0359, + "num_input_tokens_seen": 79832592, + "step": 36935 + }, + { + "epoch": 6.0261011419249595, + "grad_norm": 0.08928578346967697, + "learning_rate": 0.0008815633401385932, + "loss": 0.0778, + "num_input_tokens_seen": 79843536, + "step": 36940 + }, + { + "epoch": 6.026916802610114, + "grad_norm": 0.01563401333987713, + "learning_rate": 0.0008815173363886075, + "loss": 0.1516, + "num_input_tokens_seen": 79854000, + "step": 36945 + }, + { + "epoch": 6.027732463295269, + "grad_norm": 0.3969258964061737, + "learning_rate": 0.000881471324906689, + "loss": 0.076, + "num_input_tokens_seen": 79864688, + "step": 36950 + }, + { + "epoch": 6.028548123980424, + "grad_norm": 0.029993494972586632, + "learning_rate": 0.0008814253056937702, + "loss": 0.0336, + "num_input_tokens_seen": 79876208, + "step": 36955 + }, + { + "epoch": 6.029363784665579, + "grad_norm": 0.10093618929386139, + "learning_rate": 0.0008813792787507837, + "loss": 0.065, + "num_input_tokens_seen": 79887568, + "step": 36960 + }, + { + "epoch": 6.0301794453507345, + "grad_norm": 0.012898314744234085, + "learning_rate": 0.0008813332440786623, + "loss": 0.0167, + "num_input_tokens_seen": 79897584, + "step": 36965 + }, + { + "epoch": 6.030995106035889, + "grad_norm": 0.0038545397110283375, + "learning_rate": 0.0008812872016783389, + "loss": 0.0186, + "num_input_tokens_seen": 79910096, + "step": 36970 + }, + { + "epoch": 6.031810766721044, + "grad_norm": 0.3187786340713501, + "learning_rate": 0.0008812411515507468, + "loss": 0.2138, + "num_input_tokens_seen": 79921200, + "step": 36975 + }, + { + "epoch": 6.032626427406199, + "grad_norm": 0.21439680457115173, + "learning_rate": 0.000881195093696819, + "loss": 0.0556, + "num_input_tokens_seen": 79932496, + "step": 36980 + }, + { + "epoch": 6.033442088091354, + "grad_norm": 0.30229452252388, + "learning_rate": 0.000881149028117489, + "loss": 0.282, + "num_input_tokens_seen": 79942992, + "step": 36985 + }, + { + "epoch": 6.034257748776509, + "grad_norm": 0.04235079139471054, + "learning_rate": 0.0008811029548136906, + "loss": 0.1579, + "num_input_tokens_seen": 79955504, + "step": 36990 + }, + { + "epoch": 6.035073409461664, + "grad_norm": 0.1639384925365448, + "learning_rate": 0.0008810568737863574, + "loss": 0.1658, + "num_input_tokens_seen": 79965520, + "step": 36995 + }, + { + "epoch": 6.035889070146819, + "grad_norm": 0.1602403074502945, + "learning_rate": 0.000881010785036423, + "loss": 0.0947, + "num_input_tokens_seen": 79977264, + "step": 37000 + }, + { + "epoch": 6.036704730831974, + "grad_norm": 0.3046966791152954, + "learning_rate": 0.0008809646885648218, + "loss": 0.1439, + "num_input_tokens_seen": 79986832, + "step": 37005 + }, + { + "epoch": 6.037520391517129, + "grad_norm": 0.07625728845596313, + "learning_rate": 0.000880918584372488, + "loss": 0.041, + "num_input_tokens_seen": 79998160, + "step": 37010 + }, + { + "epoch": 6.0383360522022835, + "grad_norm": 0.21763290464878082, + "learning_rate": 0.0008808724724603558, + "loss": 0.1222, + "num_input_tokens_seen": 80008720, + "step": 37015 + }, + { + "epoch": 6.039151712887439, + "grad_norm": 0.031195595860481262, + "learning_rate": 0.0008808263528293596, + "loss": 0.0334, + "num_input_tokens_seen": 80017872, + "step": 37020 + }, + { + "epoch": 6.039967373572594, + "grad_norm": 0.25529634952545166, + "learning_rate": 0.0008807802254804344, + "loss": 0.2067, + "num_input_tokens_seen": 80027664, + "step": 37025 + }, + { + "epoch": 6.040783034257749, + "grad_norm": 0.12576717138290405, + "learning_rate": 0.000880734090414515, + "loss": 0.166, + "num_input_tokens_seen": 80039152, + "step": 37030 + }, + { + "epoch": 6.041598694942904, + "grad_norm": 0.013903754763305187, + "learning_rate": 0.000880687947632536, + "loss": 0.0329, + "num_input_tokens_seen": 80049904, + "step": 37035 + }, + { + "epoch": 6.0424143556280585, + "grad_norm": 0.01922302134335041, + "learning_rate": 0.000880641797135433, + "loss": 0.0433, + "num_input_tokens_seen": 80060496, + "step": 37040 + }, + { + "epoch": 6.043230016313213, + "grad_norm": 0.2248125970363617, + "learning_rate": 0.000880595638924141, + "loss": 0.1103, + "num_input_tokens_seen": 80070992, + "step": 37045 + }, + { + "epoch": 6.044045676998369, + "grad_norm": 0.04535336047410965, + "learning_rate": 0.0008805494729995957, + "loss": 0.0237, + "num_input_tokens_seen": 80081424, + "step": 37050 + }, + { + "epoch": 6.044861337683524, + "grad_norm": 0.07262910157442093, + "learning_rate": 0.0008805032993627324, + "loss": 0.0378, + "num_input_tokens_seen": 80091824, + "step": 37055 + }, + { + "epoch": 6.045676998368679, + "grad_norm": 0.1967252790927887, + "learning_rate": 0.0008804571180144871, + "loss": 0.1032, + "num_input_tokens_seen": 80102960, + "step": 37060 + }, + { + "epoch": 6.0464926590538335, + "grad_norm": 0.035517822951078415, + "learning_rate": 0.0008804109289557956, + "loss": 0.079, + "num_input_tokens_seen": 80114096, + "step": 37065 + }, + { + "epoch": 6.047308319738988, + "grad_norm": 0.02115003764629364, + "learning_rate": 0.0008803647321875942, + "loss": 0.0263, + "num_input_tokens_seen": 80124976, + "step": 37070 + }, + { + "epoch": 6.048123980424143, + "grad_norm": 0.08668390661478043, + "learning_rate": 0.0008803185277108188, + "loss": 0.1535, + "num_input_tokens_seen": 80135376, + "step": 37075 + }, + { + "epoch": 6.048939641109299, + "grad_norm": 0.014792878180742264, + "learning_rate": 0.0008802723155264061, + "loss": 0.0525, + "num_input_tokens_seen": 80145328, + "step": 37080 + }, + { + "epoch": 6.049755301794454, + "grad_norm": 0.055387213826179504, + "learning_rate": 0.0008802260956352924, + "loss": 0.0516, + "num_input_tokens_seen": 80155856, + "step": 37085 + }, + { + "epoch": 6.0505709624796085, + "grad_norm": 0.020852621644735336, + "learning_rate": 0.0008801798680384145, + "loss": 0.0232, + "num_input_tokens_seen": 80165136, + "step": 37090 + }, + { + "epoch": 6.051386623164763, + "grad_norm": 0.03771822154521942, + "learning_rate": 0.0008801336327367096, + "loss": 0.0203, + "num_input_tokens_seen": 80175504, + "step": 37095 + }, + { + "epoch": 6.052202283849918, + "grad_norm": 0.05520971119403839, + "learning_rate": 0.0008800873897311141, + "loss": 0.1523, + "num_input_tokens_seen": 80187632, + "step": 37100 + }, + { + "epoch": 6.053017944535074, + "grad_norm": 0.025719482451677322, + "learning_rate": 0.0008800411390225655, + "loss": 0.0728, + "num_input_tokens_seen": 80198576, + "step": 37105 + }, + { + "epoch": 6.053833605220229, + "grad_norm": 0.01990801841020584, + "learning_rate": 0.000879994880612001, + "loss": 0.03, + "num_input_tokens_seen": 80209232, + "step": 37110 + }, + { + "epoch": 6.054649265905383, + "grad_norm": 0.00535226333886385, + "learning_rate": 0.0008799486145003583, + "loss": 0.0637, + "num_input_tokens_seen": 80220336, + "step": 37115 + }, + { + "epoch": 6.055464926590538, + "grad_norm": 0.3502698838710785, + "learning_rate": 0.0008799023406885751, + "loss": 0.0758, + "num_input_tokens_seen": 80230384, + "step": 37120 + }, + { + "epoch": 6.056280587275693, + "grad_norm": 0.006463268771767616, + "learning_rate": 0.0008798560591775889, + "loss": 0.0427, + "num_input_tokens_seen": 80241136, + "step": 37125 + }, + { + "epoch": 6.057096247960848, + "grad_norm": 0.35583990812301636, + "learning_rate": 0.0008798097699683376, + "loss": 0.1532, + "num_input_tokens_seen": 80252688, + "step": 37130 + }, + { + "epoch": 6.057911908646004, + "grad_norm": 0.0426289327442646, + "learning_rate": 0.0008797634730617598, + "loss": 0.1269, + "num_input_tokens_seen": 80263728, + "step": 37135 + }, + { + "epoch": 6.058727569331158, + "grad_norm": 0.20918045938014984, + "learning_rate": 0.0008797171684587933, + "loss": 0.2027, + "num_input_tokens_seen": 80273104, + "step": 37140 + }, + { + "epoch": 6.059543230016313, + "grad_norm": 0.21897371113300323, + "learning_rate": 0.0008796708561603766, + "loss": 0.1077, + "num_input_tokens_seen": 80282576, + "step": 37145 + }, + { + "epoch": 6.060358890701468, + "grad_norm": 0.042841531336307526, + "learning_rate": 0.0008796245361674484, + "loss": 0.0861, + "num_input_tokens_seen": 80293840, + "step": 37150 + }, + { + "epoch": 6.061174551386623, + "grad_norm": 0.06690218299627304, + "learning_rate": 0.0008795782084809473, + "loss": 0.0386, + "num_input_tokens_seen": 80304656, + "step": 37155 + }, + { + "epoch": 6.061990212071779, + "grad_norm": 0.1094597652554512, + "learning_rate": 0.0008795318731018124, + "loss": 0.0313, + "num_input_tokens_seen": 80315984, + "step": 37160 + }, + { + "epoch": 6.062805872756933, + "grad_norm": 0.1659562736749649, + "learning_rate": 0.0008794855300309827, + "loss": 0.0799, + "num_input_tokens_seen": 80326992, + "step": 37165 + }, + { + "epoch": 6.063621533442088, + "grad_norm": 0.14440147578716278, + "learning_rate": 0.0008794391792693973, + "loss": 0.1157, + "num_input_tokens_seen": 80337712, + "step": 37170 + }, + { + "epoch": 6.064437194127243, + "grad_norm": 0.01738247647881508, + "learning_rate": 0.0008793928208179955, + "loss": 0.1882, + "num_input_tokens_seen": 80348560, + "step": 37175 + }, + { + "epoch": 6.065252854812398, + "grad_norm": 0.25447797775268555, + "learning_rate": 0.000879346454677717, + "loss": 0.1146, + "num_input_tokens_seen": 80359664, + "step": 37180 + }, + { + "epoch": 6.066068515497553, + "grad_norm": 0.3118617832660675, + "learning_rate": 0.0008793000808495012, + "loss": 0.0427, + "num_input_tokens_seen": 80370064, + "step": 37185 + }, + { + "epoch": 6.066884176182708, + "grad_norm": 0.008864902891218662, + "learning_rate": 0.0008792536993342882, + "loss": 0.0349, + "num_input_tokens_seen": 80380880, + "step": 37190 + }, + { + "epoch": 6.067699836867863, + "grad_norm": 0.10314369946718216, + "learning_rate": 0.0008792073101330177, + "loss": 0.212, + "num_input_tokens_seen": 80392144, + "step": 37195 + }, + { + "epoch": 6.068515497553018, + "grad_norm": 0.04900093749165535, + "learning_rate": 0.00087916091324663, + "loss": 0.0646, + "num_input_tokens_seen": 80403024, + "step": 37200 + }, + { + "epoch": 6.069331158238173, + "grad_norm": 0.0865040123462677, + "learning_rate": 0.0008791145086760656, + "loss": 0.0694, + "num_input_tokens_seen": 80413488, + "step": 37205 + }, + { + "epoch": 6.070146818923328, + "grad_norm": 0.1456834226846695, + "learning_rate": 0.0008790680964222647, + "loss": 0.1613, + "num_input_tokens_seen": 80424176, + "step": 37210 + }, + { + "epoch": 6.0709624796084825, + "grad_norm": 0.1410757303237915, + "learning_rate": 0.000879021676486168, + "loss": 0.0648, + "num_input_tokens_seen": 80434256, + "step": 37215 + }, + { + "epoch": 6.071778140293638, + "grad_norm": 0.02068004384636879, + "learning_rate": 0.0008789752488687159, + "loss": 0.0562, + "num_input_tokens_seen": 80444176, + "step": 37220 + }, + { + "epoch": 6.072593800978793, + "grad_norm": 0.1710319072008133, + "learning_rate": 0.00087892881357085, + "loss": 0.0422, + "num_input_tokens_seen": 80455952, + "step": 37225 + }, + { + "epoch": 6.073409461663948, + "grad_norm": 0.08538807928562164, + "learning_rate": 0.0008788823705935107, + "loss": 0.0306, + "num_input_tokens_seen": 80467024, + "step": 37230 + }, + { + "epoch": 6.074225122349103, + "grad_norm": 0.1996658742427826, + "learning_rate": 0.0008788359199376396, + "loss": 0.1149, + "num_input_tokens_seen": 80477488, + "step": 37235 + }, + { + "epoch": 6.075040783034257, + "grad_norm": 0.09354493021965027, + "learning_rate": 0.0008787894616041781, + "loss": 0.0556, + "num_input_tokens_seen": 80488080, + "step": 37240 + }, + { + "epoch": 6.075856443719413, + "grad_norm": 0.1762208789587021, + "learning_rate": 0.0008787429955940675, + "loss": 0.1917, + "num_input_tokens_seen": 80499664, + "step": 37245 + }, + { + "epoch": 6.076672104404568, + "grad_norm": 0.018932653591036797, + "learning_rate": 0.0008786965219082497, + "loss": 0.0719, + "num_input_tokens_seen": 80510544, + "step": 37250 + }, + { + "epoch": 6.077487765089723, + "grad_norm": 0.018798017874360085, + "learning_rate": 0.0008786500405476664, + "loss": 0.0172, + "num_input_tokens_seen": 80520880, + "step": 37255 + }, + { + "epoch": 6.078303425774878, + "grad_norm": 0.2359062135219574, + "learning_rate": 0.0008786035515132598, + "loss": 0.1641, + "num_input_tokens_seen": 80532784, + "step": 37260 + }, + { + "epoch": 6.079119086460032, + "grad_norm": 0.14479638636112213, + "learning_rate": 0.0008785570548059718, + "loss": 0.053, + "num_input_tokens_seen": 80542832, + "step": 37265 + }, + { + "epoch": 6.079934747145187, + "grad_norm": 0.2287491112947464, + "learning_rate": 0.0008785105504267449, + "loss": 0.0875, + "num_input_tokens_seen": 80553744, + "step": 37270 + }, + { + "epoch": 6.080750407830343, + "grad_norm": 0.1595332771539688, + "learning_rate": 0.0008784640383765215, + "loss": 0.0695, + "num_input_tokens_seen": 80564432, + "step": 37275 + }, + { + "epoch": 6.081566068515498, + "grad_norm": 0.01782151870429516, + "learning_rate": 0.0008784175186562442, + "loss": 0.1004, + "num_input_tokens_seen": 80575856, + "step": 37280 + }, + { + "epoch": 6.082381729200653, + "grad_norm": 0.1804230660200119, + "learning_rate": 0.000878370991266856, + "loss": 0.0825, + "num_input_tokens_seen": 80586096, + "step": 37285 + }, + { + "epoch": 6.083197389885807, + "grad_norm": 0.027629168704152107, + "learning_rate": 0.0008783244562092996, + "loss": 0.0175, + "num_input_tokens_seen": 80597040, + "step": 37290 + }, + { + "epoch": 6.084013050570962, + "grad_norm": 0.03521690145134926, + "learning_rate": 0.0008782779134845181, + "loss": 0.14, + "num_input_tokens_seen": 80609168, + "step": 37295 + }, + { + "epoch": 6.084828711256117, + "grad_norm": 0.02052682265639305, + "learning_rate": 0.0008782313630934548, + "loss": 0.1297, + "num_input_tokens_seen": 80620400, + "step": 37300 + }, + { + "epoch": 6.085644371941273, + "grad_norm": 0.33327436447143555, + "learning_rate": 0.0008781848050370531, + "loss": 0.1538, + "num_input_tokens_seen": 80633104, + "step": 37305 + }, + { + "epoch": 6.0864600326264275, + "grad_norm": 0.012549174949526787, + "learning_rate": 0.0008781382393162566, + "loss": 0.0184, + "num_input_tokens_seen": 80643056, + "step": 37310 + }, + { + "epoch": 6.087275693311582, + "grad_norm": 0.0040662651881575584, + "learning_rate": 0.0008780916659320091, + "loss": 0.1361, + "num_input_tokens_seen": 80653424, + "step": 37315 + }, + { + "epoch": 6.088091353996737, + "grad_norm": 0.03261783719062805, + "learning_rate": 0.0008780450848852541, + "loss": 0.0318, + "num_input_tokens_seen": 80663376, + "step": 37320 + }, + { + "epoch": 6.088907014681892, + "grad_norm": 0.0901263877749443, + "learning_rate": 0.0008779984961769361, + "loss": 0.0454, + "num_input_tokens_seen": 80674192, + "step": 37325 + }, + { + "epoch": 6.089722675367048, + "grad_norm": 0.03137005493044853, + "learning_rate": 0.0008779518998079988, + "loss": 0.0617, + "num_input_tokens_seen": 80685840, + "step": 37330 + }, + { + "epoch": 6.0905383360522025, + "grad_norm": 0.18100182712078094, + "learning_rate": 0.000877905295779387, + "loss": 0.0995, + "num_input_tokens_seen": 80696880, + "step": 37335 + }, + { + "epoch": 6.091353996737357, + "grad_norm": 0.2891119718551636, + "learning_rate": 0.0008778586840920449, + "loss": 0.0969, + "num_input_tokens_seen": 80707952, + "step": 37340 + }, + { + "epoch": 6.092169657422512, + "grad_norm": 0.14484070241451263, + "learning_rate": 0.0008778120647469172, + "loss": 0.0299, + "num_input_tokens_seen": 80719504, + "step": 37345 + }, + { + "epoch": 6.092985318107667, + "grad_norm": 0.1225421279668808, + "learning_rate": 0.0008777654377449487, + "loss": 0.0167, + "num_input_tokens_seen": 80730448, + "step": 37350 + }, + { + "epoch": 6.093800978792822, + "grad_norm": 0.23504728078842163, + "learning_rate": 0.0008777188030870845, + "loss": 0.1477, + "num_input_tokens_seen": 80742480, + "step": 37355 + }, + { + "epoch": 6.0946166394779775, + "grad_norm": 0.13727766275405884, + "learning_rate": 0.0008776721607742695, + "loss": 0.199, + "num_input_tokens_seen": 80753488, + "step": 37360 + }, + { + "epoch": 6.095432300163132, + "grad_norm": 0.014772283844649792, + "learning_rate": 0.0008776255108074489, + "loss": 0.2143, + "num_input_tokens_seen": 80764560, + "step": 37365 + }, + { + "epoch": 6.096247960848287, + "grad_norm": 0.05206260085105896, + "learning_rate": 0.0008775788531875685, + "loss": 0.0618, + "num_input_tokens_seen": 80776496, + "step": 37370 + }, + { + "epoch": 6.097063621533442, + "grad_norm": 0.08829730749130249, + "learning_rate": 0.0008775321879155735, + "loss": 0.0907, + "num_input_tokens_seen": 80786576, + "step": 37375 + }, + { + "epoch": 6.097879282218597, + "grad_norm": 0.021104643121361732, + "learning_rate": 0.0008774855149924099, + "loss": 0.0409, + "num_input_tokens_seen": 80798256, + "step": 37380 + }, + { + "epoch": 6.0986949429037525, + "grad_norm": 0.10581175982952118, + "learning_rate": 0.0008774388344190234, + "loss": 0.0454, + "num_input_tokens_seen": 80807056, + "step": 37385 + }, + { + "epoch": 6.099510603588907, + "grad_norm": 0.02822595275938511, + "learning_rate": 0.0008773921461963601, + "loss": 0.1279, + "num_input_tokens_seen": 80818160, + "step": 37390 + }, + { + "epoch": 6.100326264274062, + "grad_norm": 0.019954686984419823, + "learning_rate": 0.0008773454503253662, + "loss": 0.0432, + "num_input_tokens_seen": 80828144, + "step": 37395 + }, + { + "epoch": 6.101141924959217, + "grad_norm": 0.07640758901834488, + "learning_rate": 0.0008772987468069881, + "loss": 0.0477, + "num_input_tokens_seen": 80839696, + "step": 37400 + }, + { + "epoch": 6.101957585644372, + "grad_norm": 0.01018131896853447, + "learning_rate": 0.0008772520356421723, + "loss": 0.0103, + "num_input_tokens_seen": 80852912, + "step": 37405 + }, + { + "epoch": 6.102773246329527, + "grad_norm": 0.10560321807861328, + "learning_rate": 0.0008772053168318653, + "loss": 0.0675, + "num_input_tokens_seen": 80864336, + "step": 37410 + }, + { + "epoch": 6.103588907014682, + "grad_norm": 0.005883463192731142, + "learning_rate": 0.000877158590377014, + "loss": 0.0162, + "num_input_tokens_seen": 80874576, + "step": 37415 + }, + { + "epoch": 6.104404567699837, + "grad_norm": 0.14068001508712769, + "learning_rate": 0.0008771118562785656, + "loss": 0.0542, + "num_input_tokens_seen": 80885904, + "step": 37420 + }, + { + "epoch": 6.105220228384992, + "grad_norm": 0.23875434696674347, + "learning_rate": 0.0008770651145374669, + "loss": 0.1259, + "num_input_tokens_seen": 80894992, + "step": 37425 + }, + { + "epoch": 6.106035889070147, + "grad_norm": 0.009279138408601284, + "learning_rate": 0.0008770183651546653, + "loss": 0.0263, + "num_input_tokens_seen": 80906448, + "step": 37430 + }, + { + "epoch": 6.1068515497553015, + "grad_norm": 0.24461911618709564, + "learning_rate": 0.0008769716081311083, + "loss": 0.0408, + "num_input_tokens_seen": 80916048, + "step": 37435 + }, + { + "epoch": 6.107667210440456, + "grad_norm": 0.008548499085009098, + "learning_rate": 0.0008769248434677434, + "loss": 0.0109, + "num_input_tokens_seen": 80926288, + "step": 37440 + }, + { + "epoch": 6.108482871125612, + "grad_norm": 0.2648637890815735, + "learning_rate": 0.0008768780711655185, + "loss": 0.1377, + "num_input_tokens_seen": 80937840, + "step": 37445 + }, + { + "epoch": 6.109298531810767, + "grad_norm": 0.3084779977798462, + "learning_rate": 0.0008768312912253811, + "loss": 0.0928, + "num_input_tokens_seen": 80947472, + "step": 37450 + }, + { + "epoch": 6.110114192495922, + "grad_norm": 0.034691330045461655, + "learning_rate": 0.0008767845036482798, + "loss": 0.0082, + "num_input_tokens_seen": 80956528, + "step": 37455 + }, + { + "epoch": 6.1109298531810765, + "grad_norm": 0.04501752927899361, + "learning_rate": 0.0008767377084351625, + "loss": 0.0208, + "num_input_tokens_seen": 80966288, + "step": 37460 + }, + { + "epoch": 6.111745513866231, + "grad_norm": 0.056020502001047134, + "learning_rate": 0.0008766909055869777, + "loss": 0.0741, + "num_input_tokens_seen": 80977456, + "step": 37465 + }, + { + "epoch": 6.112561174551387, + "grad_norm": 0.012287724763154984, + "learning_rate": 0.0008766440951046736, + "loss": 0.0528, + "num_input_tokens_seen": 80989168, + "step": 37470 + }, + { + "epoch": 6.113376835236542, + "grad_norm": 0.28466975688934326, + "learning_rate": 0.0008765972769891993, + "loss": 0.1104, + "num_input_tokens_seen": 80998256, + "step": 37475 + }, + { + "epoch": 6.114192495921697, + "grad_norm": 0.03604330122470856, + "learning_rate": 0.0008765504512415033, + "loss": 0.0634, + "num_input_tokens_seen": 81009840, + "step": 37480 + }, + { + "epoch": 6.1150081566068515, + "grad_norm": 0.09890235960483551, + "learning_rate": 0.0008765036178625347, + "loss": 0.1132, + "num_input_tokens_seen": 81020624, + "step": 37485 + }, + { + "epoch": 6.115823817292006, + "grad_norm": 0.21546748280525208, + "learning_rate": 0.0008764567768532427, + "loss": 0.1936, + "num_input_tokens_seen": 81032368, + "step": 37490 + }, + { + "epoch": 6.116639477977161, + "grad_norm": 0.007293385919183493, + "learning_rate": 0.0008764099282145767, + "loss": 0.0463, + "num_input_tokens_seen": 81043856, + "step": 37495 + }, + { + "epoch": 6.117455138662317, + "grad_norm": 0.2758633494377136, + "learning_rate": 0.0008763630719474857, + "loss": 0.1094, + "num_input_tokens_seen": 81054704, + "step": 37500 + }, + { + "epoch": 6.118270799347472, + "grad_norm": 0.36068642139434814, + "learning_rate": 0.0008763162080529199, + "loss": 0.0931, + "num_input_tokens_seen": 81066800, + "step": 37505 + }, + { + "epoch": 6.1190864600326265, + "grad_norm": 0.1663595587015152, + "learning_rate": 0.0008762693365318286, + "loss": 0.0633, + "num_input_tokens_seen": 81078192, + "step": 37510 + }, + { + "epoch": 6.119902120717781, + "grad_norm": 0.2499445080757141, + "learning_rate": 0.0008762224573851619, + "loss": 0.167, + "num_input_tokens_seen": 81089200, + "step": 37515 + }, + { + "epoch": 6.120717781402936, + "grad_norm": 0.014237415045499802, + "learning_rate": 0.0008761755706138698, + "loss": 0.0442, + "num_input_tokens_seen": 81101552, + "step": 37520 + }, + { + "epoch": 6.121533442088092, + "grad_norm": 0.04967695102095604, + "learning_rate": 0.0008761286762189027, + "loss": 0.0679, + "num_input_tokens_seen": 81112080, + "step": 37525 + }, + { + "epoch": 6.122349102773247, + "grad_norm": 0.04989294335246086, + "learning_rate": 0.0008760817742012106, + "loss": 0.0584, + "num_input_tokens_seen": 81121744, + "step": 37530 + }, + { + "epoch": 6.123164763458401, + "grad_norm": 0.00509643042460084, + "learning_rate": 0.0008760348645617444, + "loss": 0.1544, + "num_input_tokens_seen": 81132912, + "step": 37535 + }, + { + "epoch": 6.123980424143556, + "grad_norm": 0.006603681482374668, + "learning_rate": 0.0008759879473014545, + "loss": 0.0361, + "num_input_tokens_seen": 81142032, + "step": 37540 + }, + { + "epoch": 6.124796084828711, + "grad_norm": 0.16137801110744476, + "learning_rate": 0.000875941022421292, + "loss": 0.1599, + "num_input_tokens_seen": 81154160, + "step": 37545 + }, + { + "epoch": 6.125611745513866, + "grad_norm": 0.10797300934791565, + "learning_rate": 0.0008758940899222077, + "loss": 0.0749, + "num_input_tokens_seen": 81164272, + "step": 37550 + }, + { + "epoch": 6.126427406199022, + "grad_norm": 0.005488535389304161, + "learning_rate": 0.0008758471498051528, + "loss": 0.0849, + "num_input_tokens_seen": 81174544, + "step": 37555 + }, + { + "epoch": 6.127243066884176, + "grad_norm": 0.08023177087306976, + "learning_rate": 0.0008758002020710787, + "loss": 0.2513, + "num_input_tokens_seen": 81186032, + "step": 37560 + }, + { + "epoch": 6.128058727569331, + "grad_norm": 0.022178582847118378, + "learning_rate": 0.0008757532467209367, + "loss": 0.017, + "num_input_tokens_seen": 81195472, + "step": 37565 + }, + { + "epoch": 6.128874388254486, + "grad_norm": 0.01314950454980135, + "learning_rate": 0.0008757062837556784, + "loss": 0.0286, + "num_input_tokens_seen": 81206096, + "step": 37570 + }, + { + "epoch": 6.129690048939641, + "grad_norm": 0.0685277059674263, + "learning_rate": 0.0008756593131762557, + "loss": 0.1684, + "num_input_tokens_seen": 81216464, + "step": 37575 + }, + { + "epoch": 6.130505709624796, + "grad_norm": 0.03624110668897629, + "learning_rate": 0.0008756123349836206, + "loss": 0.0953, + "num_input_tokens_seen": 81227440, + "step": 37580 + }, + { + "epoch": 6.131321370309951, + "grad_norm": 0.12655434012413025, + "learning_rate": 0.0008755653491787249, + "loss": 0.0492, + "num_input_tokens_seen": 81239184, + "step": 37585 + }, + { + "epoch": 6.132137030995106, + "grad_norm": 0.3611098527908325, + "learning_rate": 0.000875518355762521, + "loss": 0.1936, + "num_input_tokens_seen": 81249008, + "step": 37590 + }, + { + "epoch": 6.132952691680261, + "grad_norm": 0.16590391099452972, + "learning_rate": 0.0008754713547359612, + "loss": 0.0758, + "num_input_tokens_seen": 81261008, + "step": 37595 + }, + { + "epoch": 6.133768352365416, + "grad_norm": 0.25919094681739807, + "learning_rate": 0.0008754243460999982, + "loss": 0.1533, + "num_input_tokens_seen": 81271024, + "step": 37600 + }, + { + "epoch": 6.134584013050571, + "grad_norm": 0.021038195118308067, + "learning_rate": 0.0008753773298555844, + "loss": 0.1182, + "num_input_tokens_seen": 81281424, + "step": 37605 + }, + { + "epoch": 6.135399673735726, + "grad_norm": 0.31902235746383667, + "learning_rate": 0.0008753303060036728, + "loss": 0.1694, + "num_input_tokens_seen": 81293296, + "step": 37610 + }, + { + "epoch": 6.136215334420881, + "grad_norm": 0.07178599387407303, + "learning_rate": 0.0008752832745452166, + "loss": 0.0486, + "num_input_tokens_seen": 81305680, + "step": 37615 + }, + { + "epoch": 6.137030995106036, + "grad_norm": 0.02646786905825138, + "learning_rate": 0.0008752362354811686, + "loss": 0.1189, + "num_input_tokens_seen": 81316976, + "step": 37620 + }, + { + "epoch": 6.137846655791191, + "grad_norm": 0.0307559035718441, + "learning_rate": 0.0008751891888124823, + "loss": 0.0843, + "num_input_tokens_seen": 81328016, + "step": 37625 + }, + { + "epoch": 6.138662316476346, + "grad_norm": 0.27889934182167053, + "learning_rate": 0.0008751421345401111, + "loss": 0.1975, + "num_input_tokens_seen": 81338736, + "step": 37630 + }, + { + "epoch": 6.1394779771615005, + "grad_norm": 0.2421681433916092, + "learning_rate": 0.0008750950726650089, + "loss": 0.0508, + "num_input_tokens_seen": 81350192, + "step": 37635 + }, + { + "epoch": 6.140293637846656, + "grad_norm": 0.025474179536104202, + "learning_rate": 0.0008750480031881289, + "loss": 0.0527, + "num_input_tokens_seen": 81360752, + "step": 37640 + }, + { + "epoch": 6.141109298531811, + "grad_norm": 0.2337075024843216, + "learning_rate": 0.0008750009261104255, + "loss": 0.0568, + "num_input_tokens_seen": 81372176, + "step": 37645 + }, + { + "epoch": 6.141924959216966, + "grad_norm": 0.01117783784866333, + "learning_rate": 0.0008749538414328525, + "loss": 0.1526, + "num_input_tokens_seen": 81383632, + "step": 37650 + }, + { + "epoch": 6.142740619902121, + "grad_norm": 0.14854608476161957, + "learning_rate": 0.0008749067491563643, + "loss": 0.0564, + "num_input_tokens_seen": 81394384, + "step": 37655 + }, + { + "epoch": 6.143556280587275, + "grad_norm": 0.10728046298027039, + "learning_rate": 0.0008748596492819152, + "loss": 0.0813, + "num_input_tokens_seen": 81404816, + "step": 37660 + }, + { + "epoch": 6.14437194127243, + "grad_norm": 0.2026142179965973, + "learning_rate": 0.0008748125418104598, + "loss": 0.0881, + "num_input_tokens_seen": 81415568, + "step": 37665 + }, + { + "epoch": 6.145187601957586, + "grad_norm": 0.02100459858775139, + "learning_rate": 0.0008747654267429526, + "loss": 0.0187, + "num_input_tokens_seen": 81427344, + "step": 37670 + }, + { + "epoch": 6.146003262642741, + "grad_norm": 0.03070612996816635, + "learning_rate": 0.0008747183040803488, + "loss": 0.2181, + "num_input_tokens_seen": 81437808, + "step": 37675 + }, + { + "epoch": 6.146818923327896, + "grad_norm": 0.09371879696846008, + "learning_rate": 0.000874671173823603, + "loss": 0.0673, + "num_input_tokens_seen": 81448912, + "step": 37680 + }, + { + "epoch": 6.14763458401305, + "grad_norm": 0.09240694344043732, + "learning_rate": 0.0008746240359736708, + "loss": 0.1602, + "num_input_tokens_seen": 81459312, + "step": 37685 + }, + { + "epoch": 6.148450244698205, + "grad_norm": 0.036088816821575165, + "learning_rate": 0.0008745768905315072, + "loss": 0.1586, + "num_input_tokens_seen": 81469328, + "step": 37690 + }, + { + "epoch": 6.149265905383361, + "grad_norm": 0.06369510293006897, + "learning_rate": 0.0008745297374980676, + "loss": 0.0624, + "num_input_tokens_seen": 81479216, + "step": 37695 + }, + { + "epoch": 6.150081566068516, + "grad_norm": 0.10222480446100235, + "learning_rate": 0.0008744825768743079, + "loss": 0.0239, + "num_input_tokens_seen": 81490032, + "step": 37700 + }, + { + "epoch": 6.150897226753671, + "grad_norm": 0.06268610805273056, + "learning_rate": 0.0008744354086611837, + "loss": 0.1849, + "num_input_tokens_seen": 81500976, + "step": 37705 + }, + { + "epoch": 6.151712887438825, + "grad_norm": 0.012655073776841164, + "learning_rate": 0.0008743882328596509, + "loss": 0.0349, + "num_input_tokens_seen": 81511632, + "step": 37710 + }, + { + "epoch": 6.15252854812398, + "grad_norm": 0.06229158863425255, + "learning_rate": 0.0008743410494706655, + "loss": 0.1122, + "num_input_tokens_seen": 81521616, + "step": 37715 + }, + { + "epoch": 6.153344208809135, + "grad_norm": 0.08903874456882477, + "learning_rate": 0.0008742938584951841, + "loss": 0.0678, + "num_input_tokens_seen": 81530864, + "step": 37720 + }, + { + "epoch": 6.154159869494291, + "grad_norm": 0.005974739324301481, + "learning_rate": 0.0008742466599341625, + "loss": 0.0475, + "num_input_tokens_seen": 81542896, + "step": 37725 + }, + { + "epoch": 6.1549755301794455, + "grad_norm": 0.014743143692612648, + "learning_rate": 0.0008741994537885578, + "loss": 0.0512, + "num_input_tokens_seen": 81553360, + "step": 37730 + }, + { + "epoch": 6.1557911908646, + "grad_norm": 0.006878357846289873, + "learning_rate": 0.0008741522400593265, + "loss": 0.0604, + "num_input_tokens_seen": 81564752, + "step": 37735 + }, + { + "epoch": 6.156606851549755, + "grad_norm": 0.030033187940716743, + "learning_rate": 0.0008741050187474253, + "loss": 0.0392, + "num_input_tokens_seen": 81575504, + "step": 37740 + }, + { + "epoch": 6.15742251223491, + "grad_norm": 0.2209750860929489, + "learning_rate": 0.0008740577898538114, + "loss": 0.1603, + "num_input_tokens_seen": 81586480, + "step": 37745 + }, + { + "epoch": 6.158238172920065, + "grad_norm": 0.008535216562449932, + "learning_rate": 0.0008740105533794417, + "loss": 0.0455, + "num_input_tokens_seen": 81597264, + "step": 37750 + }, + { + "epoch": 6.1590538336052205, + "grad_norm": 0.07704164832830429, + "learning_rate": 0.0008739633093252738, + "loss": 0.0388, + "num_input_tokens_seen": 81608592, + "step": 37755 + }, + { + "epoch": 6.159869494290375, + "grad_norm": 0.07119981199502945, + "learning_rate": 0.0008739160576922649, + "loss": 0.0264, + "num_input_tokens_seen": 81619600, + "step": 37760 + }, + { + "epoch": 6.16068515497553, + "grad_norm": 0.08949284255504608, + "learning_rate": 0.0008738687984813729, + "loss": 0.0402, + "num_input_tokens_seen": 81630160, + "step": 37765 + }, + { + "epoch": 6.161500815660685, + "grad_norm": 0.0037753605283796787, + "learning_rate": 0.0008738215316935554, + "loss": 0.0422, + "num_input_tokens_seen": 81641392, + "step": 37770 + }, + { + "epoch": 6.16231647634584, + "grad_norm": 0.0014340798370540142, + "learning_rate": 0.0008737742573297702, + "loss": 0.1182, + "num_input_tokens_seen": 81652944, + "step": 37775 + }, + { + "epoch": 6.1631321370309955, + "grad_norm": 0.024722402915358543, + "learning_rate": 0.0008737269753909757, + "loss": 0.1752, + "num_input_tokens_seen": 81664592, + "step": 37780 + }, + { + "epoch": 6.16394779771615, + "grad_norm": 0.019996825605630875, + "learning_rate": 0.0008736796858781297, + "loss": 0.0303, + "num_input_tokens_seen": 81675952, + "step": 37785 + }, + { + "epoch": 6.164763458401305, + "grad_norm": 0.25875627994537354, + "learning_rate": 0.0008736323887921911, + "loss": 0.0558, + "num_input_tokens_seen": 81685360, + "step": 37790 + }, + { + "epoch": 6.16557911908646, + "grad_norm": 0.15578259527683258, + "learning_rate": 0.0008735850841341179, + "loss": 0.0491, + "num_input_tokens_seen": 81696304, + "step": 37795 + }, + { + "epoch": 6.166394779771615, + "grad_norm": 0.26379722356796265, + "learning_rate": 0.0008735377719048692, + "loss": 0.1074, + "num_input_tokens_seen": 81706160, + "step": 37800 + }, + { + "epoch": 6.16721044045677, + "grad_norm": 0.17212273180484772, + "learning_rate": 0.0008734904521054037, + "loss": 0.0713, + "num_input_tokens_seen": 81715760, + "step": 37805 + }, + { + "epoch": 6.168026101141925, + "grad_norm": 0.042617104947566986, + "learning_rate": 0.0008734431247366803, + "loss": 0.0534, + "num_input_tokens_seen": 81726736, + "step": 37810 + }, + { + "epoch": 6.16884176182708, + "grad_norm": 0.3961753249168396, + "learning_rate": 0.0008733957897996583, + "loss": 0.1007, + "num_input_tokens_seen": 81736528, + "step": 37815 + }, + { + "epoch": 6.169657422512235, + "grad_norm": 0.04119712486863136, + "learning_rate": 0.0008733484472952969, + "loss": 0.0921, + "num_input_tokens_seen": 81747856, + "step": 37820 + }, + { + "epoch": 6.17047308319739, + "grad_norm": 0.20554359257221222, + "learning_rate": 0.0008733010972245554, + "loss": 0.112, + "num_input_tokens_seen": 81758928, + "step": 37825 + }, + { + "epoch": 6.171288743882545, + "grad_norm": 0.11228428035974503, + "learning_rate": 0.0008732537395883938, + "loss": 0.0574, + "num_input_tokens_seen": 81769392, + "step": 37830 + }, + { + "epoch": 6.1721044045677, + "grad_norm": 0.021784307435154915, + "learning_rate": 0.0008732063743877716, + "loss": 0.0364, + "num_input_tokens_seen": 81781872, + "step": 37835 + }, + { + "epoch": 6.172920065252855, + "grad_norm": 0.019864261150360107, + "learning_rate": 0.0008731590016236489, + "loss": 0.1228, + "num_input_tokens_seen": 81792976, + "step": 37840 + }, + { + "epoch": 6.17373572593801, + "grad_norm": 0.05301322415471077, + "learning_rate": 0.0008731116212969856, + "loss": 0.0468, + "num_input_tokens_seen": 81802928, + "step": 37845 + }, + { + "epoch": 6.174551386623165, + "grad_norm": 0.14838729798793793, + "learning_rate": 0.000873064233408742, + "loss": 0.1423, + "num_input_tokens_seen": 81812272, + "step": 37850 + }, + { + "epoch": 6.1753670473083195, + "grad_norm": 0.03396567702293396, + "learning_rate": 0.0008730168379598782, + "loss": 0.0764, + "num_input_tokens_seen": 81823568, + "step": 37855 + }, + { + "epoch": 6.176182707993474, + "grad_norm": 0.23131468892097473, + "learning_rate": 0.0008729694349513552, + "loss": 0.1222, + "num_input_tokens_seen": 81834000, + "step": 37860 + }, + { + "epoch": 6.17699836867863, + "grad_norm": 0.041217345744371414, + "learning_rate": 0.0008729220243841334, + "loss": 0.3346, + "num_input_tokens_seen": 81844016, + "step": 37865 + }, + { + "epoch": 6.177814029363785, + "grad_norm": 0.049347419291734695, + "learning_rate": 0.0008728746062591737, + "loss": 0.0311, + "num_input_tokens_seen": 81855600, + "step": 37870 + }, + { + "epoch": 6.17862969004894, + "grad_norm": 0.03369903936982155, + "learning_rate": 0.0008728271805774371, + "loss": 0.0435, + "num_input_tokens_seen": 81866640, + "step": 37875 + }, + { + "epoch": 6.1794453507340945, + "grad_norm": 0.18959836661815643, + "learning_rate": 0.0008727797473398846, + "loss": 0.1385, + "num_input_tokens_seen": 81876528, + "step": 37880 + }, + { + "epoch": 6.180261011419249, + "grad_norm": 0.020675910636782646, + "learning_rate": 0.0008727323065474778, + "loss": 0.0281, + "num_input_tokens_seen": 81888144, + "step": 37885 + }, + { + "epoch": 6.181076672104404, + "grad_norm": 0.05461437627673149, + "learning_rate": 0.000872684858201178, + "loss": 0.1013, + "num_input_tokens_seen": 81899344, + "step": 37890 + }, + { + "epoch": 6.18189233278956, + "grad_norm": 0.09581451117992401, + "learning_rate": 0.0008726374023019465, + "loss": 0.1075, + "num_input_tokens_seen": 81909328, + "step": 37895 + }, + { + "epoch": 6.182707993474715, + "grad_norm": 0.10295901447534561, + "learning_rate": 0.0008725899388507454, + "loss": 0.0428, + "num_input_tokens_seen": 81921232, + "step": 37900 + }, + { + "epoch": 6.1835236541598695, + "grad_norm": 0.005341388285160065, + "learning_rate": 0.0008725424678485366, + "loss": 0.1492, + "num_input_tokens_seen": 81932304, + "step": 37905 + }, + { + "epoch": 6.184339314845024, + "grad_norm": 0.02357945591211319, + "learning_rate": 0.0008724949892962821, + "loss": 0.0893, + "num_input_tokens_seen": 81943792, + "step": 37910 + }, + { + "epoch": 6.185154975530179, + "grad_norm": 0.012670408934354782, + "learning_rate": 0.0008724475031949441, + "loss": 0.0995, + "num_input_tokens_seen": 81954000, + "step": 37915 + }, + { + "epoch": 6.185970636215335, + "grad_norm": 0.2212541550397873, + "learning_rate": 0.0008724000095454849, + "loss": 0.0837, + "num_input_tokens_seen": 81964560, + "step": 37920 + }, + { + "epoch": 6.18678629690049, + "grad_norm": 0.07937619090080261, + "learning_rate": 0.0008723525083488671, + "loss": 0.0348, + "num_input_tokens_seen": 81975504, + "step": 37925 + }, + { + "epoch": 6.1876019575856445, + "grad_norm": 0.21293163299560547, + "learning_rate": 0.0008723049996060534, + "loss": 0.1686, + "num_input_tokens_seen": 81984816, + "step": 37930 + }, + { + "epoch": 6.188417618270799, + "grad_norm": 0.056424580514431, + "learning_rate": 0.0008722574833180065, + "loss": 0.017, + "num_input_tokens_seen": 81992880, + "step": 37935 + }, + { + "epoch": 6.189233278955954, + "grad_norm": 0.007523713167756796, + "learning_rate": 0.0008722099594856895, + "loss": 0.0239, + "num_input_tokens_seen": 82003216, + "step": 37940 + }, + { + "epoch": 6.190048939641109, + "grad_norm": 0.09879666566848755, + "learning_rate": 0.0008721624281100655, + "loss": 0.0795, + "num_input_tokens_seen": 82015120, + "step": 37945 + }, + { + "epoch": 6.190864600326265, + "grad_norm": 0.13292758166790009, + "learning_rate": 0.0008721148891920978, + "loss": 0.0786, + "num_input_tokens_seen": 82026512, + "step": 37950 + }, + { + "epoch": 6.191680261011419, + "grad_norm": 0.01907687447965145, + "learning_rate": 0.0008720673427327496, + "loss": 0.0676, + "num_input_tokens_seen": 82037360, + "step": 37955 + }, + { + "epoch": 6.192495921696574, + "grad_norm": 0.057684943079948425, + "learning_rate": 0.0008720197887329851, + "loss": 0.0512, + "num_input_tokens_seen": 82048048, + "step": 37960 + }, + { + "epoch": 6.193311582381729, + "grad_norm": 0.011981310322880745, + "learning_rate": 0.0008719722271937673, + "loss": 0.0326, + "num_input_tokens_seen": 82059216, + "step": 37965 + }, + { + "epoch": 6.194127243066884, + "grad_norm": 0.1914396733045578, + "learning_rate": 0.0008719246581160606, + "loss": 0.2076, + "num_input_tokens_seen": 82070160, + "step": 37970 + }, + { + "epoch": 6.19494290375204, + "grad_norm": 0.051620397716760635, + "learning_rate": 0.0008718770815008288, + "loss": 0.085, + "num_input_tokens_seen": 82081328, + "step": 37975 + }, + { + "epoch": 6.195758564437194, + "grad_norm": 0.1118040457367897, + "learning_rate": 0.0008718294973490362, + "loss": 0.0768, + "num_input_tokens_seen": 82091920, + "step": 37980 + }, + { + "epoch": 6.196574225122349, + "grad_norm": 0.13779281079769135, + "learning_rate": 0.0008717819056616472, + "loss": 0.0242, + "num_input_tokens_seen": 82103312, + "step": 37985 + }, + { + "epoch": 6.197389885807504, + "grad_norm": 0.10947174578905106, + "learning_rate": 0.0008717343064396262, + "loss": 0.1393, + "num_input_tokens_seen": 82114608, + "step": 37990 + }, + { + "epoch": 6.198205546492659, + "grad_norm": 0.1319754719734192, + "learning_rate": 0.0008716866996839378, + "loss": 0.0664, + "num_input_tokens_seen": 82125232, + "step": 37995 + }, + { + "epoch": 6.199021207177814, + "grad_norm": 0.02237529121339321, + "learning_rate": 0.0008716390853955472, + "loss": 0.0549, + "num_input_tokens_seen": 82136208, + "step": 38000 + }, + { + "epoch": 6.199836867862969, + "grad_norm": 0.05175924673676491, + "learning_rate": 0.0008715914635754187, + "loss": 0.0199, + "num_input_tokens_seen": 82147184, + "step": 38005 + }, + { + "epoch": 6.200652528548124, + "grad_norm": 0.33201223611831665, + "learning_rate": 0.0008715438342245181, + "loss": 0.1194, + "num_input_tokens_seen": 82158672, + "step": 38010 + }, + { + "epoch": 6.201468189233279, + "grad_norm": 0.19700491428375244, + "learning_rate": 0.0008714961973438103, + "loss": 0.206, + "num_input_tokens_seen": 82170448, + "step": 38015 + }, + { + "epoch": 6.202283849918434, + "grad_norm": 0.0030625720974057913, + "learning_rate": 0.0008714485529342606, + "loss": 0.1937, + "num_input_tokens_seen": 82180848, + "step": 38020 + }, + { + "epoch": 6.203099510603589, + "grad_norm": 0.06964084506034851, + "learning_rate": 0.0008714009009968349, + "loss": 0.1514, + "num_input_tokens_seen": 82191600, + "step": 38025 + }, + { + "epoch": 6.2039151712887435, + "grad_norm": 0.019868431612849236, + "learning_rate": 0.0008713532415324988, + "loss": 0.0404, + "num_input_tokens_seen": 82202384, + "step": 38030 + }, + { + "epoch": 6.204730831973899, + "grad_norm": 0.00986363273113966, + "learning_rate": 0.0008713055745422181, + "loss": 0.0654, + "num_input_tokens_seen": 82211184, + "step": 38035 + }, + { + "epoch": 6.205546492659054, + "grad_norm": 0.3911730647087097, + "learning_rate": 0.000871257900026959, + "loss": 0.2685, + "num_input_tokens_seen": 82220720, + "step": 38040 + }, + { + "epoch": 6.206362153344209, + "grad_norm": 0.10592617094516754, + "learning_rate": 0.0008712102179876876, + "loss": 0.0356, + "num_input_tokens_seen": 82232176, + "step": 38045 + }, + { + "epoch": 6.207177814029364, + "grad_norm": 0.1874314248561859, + "learning_rate": 0.0008711625284253701, + "loss": 0.1489, + "num_input_tokens_seen": 82241648, + "step": 38050 + }, + { + "epoch": 6.2079934747145185, + "grad_norm": 0.008363268338143826, + "learning_rate": 0.0008711148313409731, + "loss": 0.3101, + "num_input_tokens_seen": 82251952, + "step": 38055 + }, + { + "epoch": 6.208809135399674, + "grad_norm": 0.006032775621861219, + "learning_rate": 0.0008710671267354633, + "loss": 0.0385, + "num_input_tokens_seen": 82262800, + "step": 38060 + }, + { + "epoch": 6.209624796084829, + "grad_norm": 0.018502449616789818, + "learning_rate": 0.0008710194146098074, + "loss": 0.1468, + "num_input_tokens_seen": 82273552, + "step": 38065 + }, + { + "epoch": 6.210440456769984, + "grad_norm": 0.03034134767949581, + "learning_rate": 0.0008709716949649724, + "loss": 0.0399, + "num_input_tokens_seen": 82285552, + "step": 38070 + }, + { + "epoch": 6.211256117455139, + "grad_norm": 0.20970183610916138, + "learning_rate": 0.0008709239678019255, + "loss": 0.1452, + "num_input_tokens_seen": 82297520, + "step": 38075 + }, + { + "epoch": 6.212071778140293, + "grad_norm": 0.08290934562683105, + "learning_rate": 0.0008708762331216338, + "loss": 0.0404, + "num_input_tokens_seen": 82307856, + "step": 38080 + }, + { + "epoch": 6.212887438825448, + "grad_norm": 0.06803029775619507, + "learning_rate": 0.0008708284909250646, + "loss": 0.0822, + "num_input_tokens_seen": 82319344, + "step": 38085 + }, + { + "epoch": 6.213703099510604, + "grad_norm": 0.2577921152114868, + "learning_rate": 0.0008707807412131858, + "loss": 0.2342, + "num_input_tokens_seen": 82330672, + "step": 38090 + }, + { + "epoch": 6.214518760195759, + "grad_norm": 0.08383003622293472, + "learning_rate": 0.0008707329839869649, + "loss": 0.0632, + "num_input_tokens_seen": 82342064, + "step": 38095 + }, + { + "epoch": 6.215334420880914, + "grad_norm": 0.06594287604093552, + "learning_rate": 0.0008706852192473696, + "loss": 0.0725, + "num_input_tokens_seen": 82353232, + "step": 38100 + }, + { + "epoch": 6.216150081566068, + "grad_norm": 0.09301108121871948, + "learning_rate": 0.0008706374469953682, + "loss": 0.023, + "num_input_tokens_seen": 82364624, + "step": 38105 + }, + { + "epoch": 6.216965742251223, + "grad_norm": 0.047476354986429214, + "learning_rate": 0.0008705896672319286, + "loss": 0.0269, + "num_input_tokens_seen": 82376816, + "step": 38110 + }, + { + "epoch": 6.217781402936378, + "grad_norm": 0.09446101635694504, + "learning_rate": 0.0008705418799580196, + "loss": 0.0419, + "num_input_tokens_seen": 82386992, + "step": 38115 + }, + { + "epoch": 6.218597063621534, + "grad_norm": 0.00325257726944983, + "learning_rate": 0.000870494085174609, + "loss": 0.0846, + "num_input_tokens_seen": 82398032, + "step": 38120 + }, + { + "epoch": 6.219412724306689, + "grad_norm": 0.033085815608501434, + "learning_rate": 0.000870446282882666, + "loss": 0.0113, + "num_input_tokens_seen": 82409488, + "step": 38125 + }, + { + "epoch": 6.220228384991843, + "grad_norm": 0.005405626259744167, + "learning_rate": 0.0008703984730831589, + "loss": 0.0081, + "num_input_tokens_seen": 82420368, + "step": 38130 + }, + { + "epoch": 6.221044045676998, + "grad_norm": 0.010103190317749977, + "learning_rate": 0.0008703506557770571, + "loss": 0.009, + "num_input_tokens_seen": 82430800, + "step": 38135 + }, + { + "epoch": 6.221859706362153, + "grad_norm": 0.04629164934158325, + "learning_rate": 0.0008703028309653293, + "loss": 0.0677, + "num_input_tokens_seen": 82441680, + "step": 38140 + }, + { + "epoch": 6.222675367047309, + "grad_norm": 0.009996136650443077, + "learning_rate": 0.0008702549986489449, + "loss": 0.0619, + "num_input_tokens_seen": 82452432, + "step": 38145 + }, + { + "epoch": 6.2234910277324635, + "grad_norm": 0.1453513503074646, + "learning_rate": 0.0008702071588288731, + "loss": 0.1359, + "num_input_tokens_seen": 82463216, + "step": 38150 + }, + { + "epoch": 6.224306688417618, + "grad_norm": 0.2776382863521576, + "learning_rate": 0.0008701593115060837, + "loss": 0.1409, + "num_input_tokens_seen": 82475184, + "step": 38155 + }, + { + "epoch": 6.225122349102773, + "grad_norm": 0.08298216760158539, + "learning_rate": 0.0008701114566815464, + "loss": 0.0307, + "num_input_tokens_seen": 82484080, + "step": 38160 + }, + { + "epoch": 6.225938009787928, + "grad_norm": 0.18404394388198853, + "learning_rate": 0.0008700635943562308, + "loss": 0.0845, + "num_input_tokens_seen": 82493488, + "step": 38165 + }, + { + "epoch": 6.226753670473083, + "grad_norm": 0.23019194602966309, + "learning_rate": 0.0008700157245311071, + "loss": 0.0372, + "num_input_tokens_seen": 82504784, + "step": 38170 + }, + { + "epoch": 6.2275693311582385, + "grad_norm": 0.275285929441452, + "learning_rate": 0.0008699678472071453, + "loss": 0.1103, + "num_input_tokens_seen": 82516400, + "step": 38175 + }, + { + "epoch": 6.228384991843393, + "grad_norm": 0.04369976744055748, + "learning_rate": 0.0008699199623853156, + "loss": 0.0648, + "num_input_tokens_seen": 82526000, + "step": 38180 + }, + { + "epoch": 6.229200652528548, + "grad_norm": 0.010660940781235695, + "learning_rate": 0.0008698720700665888, + "loss": 0.1524, + "num_input_tokens_seen": 82535760, + "step": 38185 + }, + { + "epoch": 6.230016313213703, + "grad_norm": 0.10546161234378815, + "learning_rate": 0.0008698241702519351, + "loss": 0.0292, + "num_input_tokens_seen": 82547472, + "step": 38190 + }, + { + "epoch": 6.230831973898858, + "grad_norm": 0.02637307532131672, + "learning_rate": 0.0008697762629423254, + "loss": 0.0357, + "num_input_tokens_seen": 82559312, + "step": 38195 + }, + { + "epoch": 6.231647634584013, + "grad_norm": 0.016469111666083336, + "learning_rate": 0.0008697283481387308, + "loss": 0.0954, + "num_input_tokens_seen": 82570800, + "step": 38200 + }, + { + "epoch": 6.232463295269168, + "grad_norm": 0.005616335663944483, + "learning_rate": 0.000869680425842122, + "loss": 0.0197, + "num_input_tokens_seen": 82582416, + "step": 38205 + }, + { + "epoch": 6.233278955954323, + "grad_norm": 0.0673634260892868, + "learning_rate": 0.0008696324960534706, + "loss": 0.0132, + "num_input_tokens_seen": 82592400, + "step": 38210 + }, + { + "epoch": 6.234094616639478, + "grad_norm": 0.02261550910770893, + "learning_rate": 0.0008695845587737476, + "loss": 0.0817, + "num_input_tokens_seen": 82603216, + "step": 38215 + }, + { + "epoch": 6.234910277324633, + "grad_norm": 0.012282857671380043, + "learning_rate": 0.0008695366140039248, + "loss": 0.0945, + "num_input_tokens_seen": 82613648, + "step": 38220 + }, + { + "epoch": 6.235725938009788, + "grad_norm": 0.01848006621003151, + "learning_rate": 0.0008694886617449738, + "loss": 0.0324, + "num_input_tokens_seen": 82625488, + "step": 38225 + }, + { + "epoch": 6.236541598694943, + "grad_norm": 0.004085164982825518, + "learning_rate": 0.0008694407019978661, + "loss": 0.0539, + "num_input_tokens_seen": 82636880, + "step": 38230 + }, + { + "epoch": 6.237357259380098, + "grad_norm": 0.412178635597229, + "learning_rate": 0.0008693927347635741, + "loss": 0.1095, + "num_input_tokens_seen": 82648368, + "step": 38235 + }, + { + "epoch": 6.238172920065253, + "grad_norm": 0.063252754509449, + "learning_rate": 0.0008693447600430695, + "loss": 0.1211, + "num_input_tokens_seen": 82658736, + "step": 38240 + }, + { + "epoch": 6.238988580750408, + "grad_norm": 0.33977410197257996, + "learning_rate": 0.000869296777837325, + "loss": 0.2601, + "num_input_tokens_seen": 82667888, + "step": 38245 + }, + { + "epoch": 6.239804241435563, + "grad_norm": 0.007731478661298752, + "learning_rate": 0.0008692487881473128, + "loss": 0.0068, + "num_input_tokens_seen": 82678896, + "step": 38250 + }, + { + "epoch": 6.240619902120717, + "grad_norm": 0.014961308799684048, + "learning_rate": 0.0008692007909740054, + "loss": 0.1406, + "num_input_tokens_seen": 82689424, + "step": 38255 + }, + { + "epoch": 6.241435562805873, + "grad_norm": 0.030552340671420097, + "learning_rate": 0.0008691527863183755, + "loss": 0.0516, + "num_input_tokens_seen": 82700848, + "step": 38260 + }, + { + "epoch": 6.242251223491028, + "grad_norm": 0.00711380410939455, + "learning_rate": 0.0008691047741813963, + "loss": 0.188, + "num_input_tokens_seen": 82710288, + "step": 38265 + }, + { + "epoch": 6.243066884176183, + "grad_norm": 0.011850893497467041, + "learning_rate": 0.0008690567545640406, + "loss": 0.0169, + "num_input_tokens_seen": 82720240, + "step": 38270 + }, + { + "epoch": 6.2438825448613375, + "grad_norm": 0.09806658327579498, + "learning_rate": 0.0008690087274672814, + "loss": 0.0543, + "num_input_tokens_seen": 82731664, + "step": 38275 + }, + { + "epoch": 6.244698205546492, + "grad_norm": 0.11136013269424438, + "learning_rate": 0.0008689606928920923, + "loss": 0.0897, + "num_input_tokens_seen": 82742832, + "step": 38280 + }, + { + "epoch": 6.245513866231648, + "grad_norm": 0.3061741292476654, + "learning_rate": 0.0008689126508394467, + "loss": 0.1712, + "num_input_tokens_seen": 82752496, + "step": 38285 + }, + { + "epoch": 6.246329526916803, + "grad_norm": 0.04090377315878868, + "learning_rate": 0.0008688646013103183, + "loss": 0.0762, + "num_input_tokens_seen": 82764016, + "step": 38290 + }, + { + "epoch": 6.247145187601958, + "grad_norm": 0.02585625648498535, + "learning_rate": 0.0008688165443056808, + "loss": 0.0554, + "num_input_tokens_seen": 82773904, + "step": 38295 + }, + { + "epoch": 6.2479608482871125, + "grad_norm": 0.12480328232049942, + "learning_rate": 0.0008687684798265081, + "loss": 0.0233, + "num_input_tokens_seen": 82783376, + "step": 38300 + }, + { + "epoch": 6.248776508972267, + "grad_norm": 0.34098994731903076, + "learning_rate": 0.0008687204078737744, + "loss": 0.1455, + "num_input_tokens_seen": 82794256, + "step": 38305 + }, + { + "epoch": 6.249592169657422, + "grad_norm": 0.2570803165435791, + "learning_rate": 0.0008686723284484538, + "loss": 0.2018, + "num_input_tokens_seen": 82802288, + "step": 38310 + }, + { + "epoch": 6.250407830342578, + "grad_norm": 0.009916471317410469, + "learning_rate": 0.0008686242415515209, + "loss": 0.0138, + "num_input_tokens_seen": 82813584, + "step": 38315 + }, + { + "epoch": 6.251223491027733, + "grad_norm": 0.2923734188079834, + "learning_rate": 0.00086857614718395, + "loss": 0.2181, + "num_input_tokens_seen": 82823824, + "step": 38320 + }, + { + "epoch": 6.2520391517128875, + "grad_norm": 0.04271473363041878, + "learning_rate": 0.0008685280453467159, + "loss": 0.208, + "num_input_tokens_seen": 82834032, + "step": 38325 + }, + { + "epoch": 6.252854812398042, + "grad_norm": 0.04432328790426254, + "learning_rate": 0.0008684799360407935, + "loss": 0.0405, + "num_input_tokens_seen": 82845456, + "step": 38330 + }, + { + "epoch": 6.253670473083197, + "grad_norm": 0.10046076774597168, + "learning_rate": 0.0008684318192671576, + "loss": 0.1017, + "num_input_tokens_seen": 82856144, + "step": 38335 + }, + { + "epoch": 6.254486133768353, + "grad_norm": 0.2138485610485077, + "learning_rate": 0.0008683836950267838, + "loss": 0.1575, + "num_input_tokens_seen": 82867728, + "step": 38340 + }, + { + "epoch": 6.255301794453508, + "grad_norm": 0.022035308182239532, + "learning_rate": 0.0008683355633206469, + "loss": 0.1219, + "num_input_tokens_seen": 82878768, + "step": 38345 + }, + { + "epoch": 6.2561174551386625, + "grad_norm": 0.05334000289440155, + "learning_rate": 0.0008682874241497225, + "loss": 0.0254, + "num_input_tokens_seen": 82889872, + "step": 38350 + }, + { + "epoch": 6.256933115823817, + "grad_norm": 0.005959376692771912, + "learning_rate": 0.0008682392775149863, + "loss": 0.0455, + "num_input_tokens_seen": 82901616, + "step": 38355 + }, + { + "epoch": 6.257748776508972, + "grad_norm": 0.007209327537566423, + "learning_rate": 0.000868191123417414, + "loss": 0.03, + "num_input_tokens_seen": 82911792, + "step": 38360 + }, + { + "epoch": 6.258564437194127, + "grad_norm": 0.11563768237829208, + "learning_rate": 0.0008681429618579815, + "loss": 0.0332, + "num_input_tokens_seen": 82922256, + "step": 38365 + }, + { + "epoch": 6.259380097879283, + "grad_norm": 0.05262453109025955, + "learning_rate": 0.0008680947928376648, + "loss": 0.0458, + "num_input_tokens_seen": 82933072, + "step": 38370 + }, + { + "epoch": 6.260195758564437, + "grad_norm": 0.10289740562438965, + "learning_rate": 0.0008680466163574402, + "loss": 0.0744, + "num_input_tokens_seen": 82943984, + "step": 38375 + }, + { + "epoch": 6.261011419249592, + "grad_norm": 0.0066475640051066875, + "learning_rate": 0.000867998432418284, + "loss": 0.0083, + "num_input_tokens_seen": 82953072, + "step": 38380 + }, + { + "epoch": 6.261827079934747, + "grad_norm": 0.21525736153125763, + "learning_rate": 0.0008679502410211728, + "loss": 0.0413, + "num_input_tokens_seen": 82962576, + "step": 38385 + }, + { + "epoch": 6.262642740619902, + "grad_norm": 0.4553488492965698, + "learning_rate": 0.0008679020421670831, + "loss": 0.1761, + "num_input_tokens_seen": 82974192, + "step": 38390 + }, + { + "epoch": 6.263458401305057, + "grad_norm": 0.06272874027490616, + "learning_rate": 0.0008678538358569918, + "loss": 0.0753, + "num_input_tokens_seen": 82985104, + "step": 38395 + }, + { + "epoch": 6.264274061990212, + "grad_norm": 0.10946105420589447, + "learning_rate": 0.000867805622091876, + "loss": 0.0305, + "num_input_tokens_seen": 82995920, + "step": 38400 + }, + { + "epoch": 6.265089722675367, + "grad_norm": 0.30534982681274414, + "learning_rate": 0.0008677574008727126, + "loss": 0.1317, + "num_input_tokens_seen": 83007632, + "step": 38405 + }, + { + "epoch": 6.265905383360522, + "grad_norm": 0.22424903512001038, + "learning_rate": 0.0008677091722004788, + "loss": 0.205, + "num_input_tokens_seen": 83018768, + "step": 38410 + }, + { + "epoch": 6.266721044045677, + "grad_norm": 0.04151112586259842, + "learning_rate": 0.0008676609360761524, + "loss": 0.0292, + "num_input_tokens_seen": 83028976, + "step": 38415 + }, + { + "epoch": 6.267536704730832, + "grad_norm": 0.24260827898979187, + "learning_rate": 0.0008676126925007107, + "loss": 0.1155, + "num_input_tokens_seen": 83039760, + "step": 38420 + }, + { + "epoch": 6.268352365415987, + "grad_norm": 0.07921306043863297, + "learning_rate": 0.0008675644414751311, + "loss": 0.1389, + "num_input_tokens_seen": 83050512, + "step": 38425 + }, + { + "epoch": 6.269168026101142, + "grad_norm": 0.3923218846321106, + "learning_rate": 0.0008675161830003921, + "loss": 0.0957, + "num_input_tokens_seen": 83062384, + "step": 38430 + }, + { + "epoch": 6.269983686786297, + "grad_norm": 0.037347909063100815, + "learning_rate": 0.0008674679170774713, + "loss": 0.1005, + "num_input_tokens_seen": 83073680, + "step": 38435 + }, + { + "epoch": 6.270799347471452, + "grad_norm": 0.08266767114400864, + "learning_rate": 0.0008674196437073472, + "loss": 0.0624, + "num_input_tokens_seen": 83085552, + "step": 38440 + }, + { + "epoch": 6.271615008156607, + "grad_norm": 0.1442742645740509, + "learning_rate": 0.0008673713628909978, + "loss": 0.0605, + "num_input_tokens_seen": 83095792, + "step": 38445 + }, + { + "epoch": 6.2724306688417615, + "grad_norm": 0.3053017854690552, + "learning_rate": 0.0008673230746294016, + "loss": 0.1411, + "num_input_tokens_seen": 83107760, + "step": 38450 + }, + { + "epoch": 6.273246329526917, + "grad_norm": 0.1329246461391449, + "learning_rate": 0.0008672747789235373, + "loss": 0.0365, + "num_input_tokens_seen": 83118704, + "step": 38455 + }, + { + "epoch": 6.274061990212072, + "grad_norm": 0.1079804003238678, + "learning_rate": 0.0008672264757743838, + "loss": 0.0188, + "num_input_tokens_seen": 83129936, + "step": 38460 + }, + { + "epoch": 6.274877650897227, + "grad_norm": 0.09248155355453491, + "learning_rate": 0.0008671781651829198, + "loss": 0.0307, + "num_input_tokens_seen": 83139888, + "step": 38465 + }, + { + "epoch": 6.275693311582382, + "grad_norm": 0.13056586682796478, + "learning_rate": 0.0008671298471501246, + "loss": 0.0293, + "num_input_tokens_seen": 83151152, + "step": 38470 + }, + { + "epoch": 6.2765089722675365, + "grad_norm": 0.12060708552598953, + "learning_rate": 0.0008670815216769771, + "loss": 0.0555, + "num_input_tokens_seen": 83163344, + "step": 38475 + }, + { + "epoch": 6.277324632952691, + "grad_norm": 0.0019625083077698946, + "learning_rate": 0.0008670331887644571, + "loss": 0.0822, + "num_input_tokens_seen": 83175568, + "step": 38480 + }, + { + "epoch": 6.278140293637847, + "grad_norm": 0.0041697206906974316, + "learning_rate": 0.0008669848484135439, + "loss": 0.096, + "num_input_tokens_seen": 83186064, + "step": 38485 + }, + { + "epoch": 6.278955954323002, + "grad_norm": 0.003180544590577483, + "learning_rate": 0.0008669365006252172, + "loss": 0.0647, + "num_input_tokens_seen": 83195792, + "step": 38490 + }, + { + "epoch": 6.279771615008157, + "grad_norm": 0.039838630706071854, + "learning_rate": 0.0008668881454004567, + "loss": 0.0173, + "num_input_tokens_seen": 83206800, + "step": 38495 + }, + { + "epoch": 6.280587275693311, + "grad_norm": 0.20914457738399506, + "learning_rate": 0.0008668397827402425, + "loss": 0.188, + "num_input_tokens_seen": 83217168, + "step": 38500 + }, + { + "epoch": 6.281402936378466, + "grad_norm": 0.12957248091697693, + "learning_rate": 0.000866791412645555, + "loss": 0.2052, + "num_input_tokens_seen": 83228496, + "step": 38505 + }, + { + "epoch": 6.282218597063622, + "grad_norm": 0.198208287358284, + "learning_rate": 0.000866743035117374, + "loss": 0.0561, + "num_input_tokens_seen": 83238704, + "step": 38510 + }, + { + "epoch": 6.283034257748777, + "grad_norm": 0.012076416984200478, + "learning_rate": 0.0008666946501566801, + "loss": 0.0393, + "num_input_tokens_seen": 83248400, + "step": 38515 + }, + { + "epoch": 6.283849918433932, + "grad_norm": 0.03215594217181206, + "learning_rate": 0.000866646257764454, + "loss": 0.0604, + "num_input_tokens_seen": 83260048, + "step": 38520 + }, + { + "epoch": 6.284665579119086, + "grad_norm": 0.011157657951116562, + "learning_rate": 0.0008665978579416763, + "loss": 0.0095, + "num_input_tokens_seen": 83272208, + "step": 38525 + }, + { + "epoch": 6.285481239804241, + "grad_norm": 0.014190180227160454, + "learning_rate": 0.000866549450689328, + "loss": 0.0604, + "num_input_tokens_seen": 83281808, + "step": 38530 + }, + { + "epoch": 6.286296900489396, + "grad_norm": 0.37947651743888855, + "learning_rate": 0.0008665010360083902, + "loss": 0.1194, + "num_input_tokens_seen": 83291792, + "step": 38535 + }, + { + "epoch": 6.287112561174552, + "grad_norm": 0.008268352597951889, + "learning_rate": 0.0008664526138998438, + "loss": 0.1653, + "num_input_tokens_seen": 83301648, + "step": 38540 + }, + { + "epoch": 6.287928221859707, + "grad_norm": 0.2546350061893463, + "learning_rate": 0.0008664041843646704, + "loss": 0.0901, + "num_input_tokens_seen": 83312560, + "step": 38545 + }, + { + "epoch": 6.288743882544861, + "grad_norm": 0.1817859411239624, + "learning_rate": 0.0008663557474038512, + "loss": 0.0509, + "num_input_tokens_seen": 83323728, + "step": 38550 + }, + { + "epoch": 6.289559543230016, + "grad_norm": 0.019393671303987503, + "learning_rate": 0.0008663073030183683, + "loss": 0.102, + "num_input_tokens_seen": 83335600, + "step": 38555 + }, + { + "epoch": 6.290375203915171, + "grad_norm": 0.005289722234010696, + "learning_rate": 0.000866258851209203, + "loss": 0.0309, + "num_input_tokens_seen": 83345872, + "step": 38560 + }, + { + "epoch": 6.291190864600326, + "grad_norm": 0.08684537559747696, + "learning_rate": 0.0008662103919773375, + "loss": 0.178, + "num_input_tokens_seen": 83357232, + "step": 38565 + }, + { + "epoch": 6.2920065252854815, + "grad_norm": 0.009959384799003601, + "learning_rate": 0.0008661619253237538, + "loss": 0.0118, + "num_input_tokens_seen": 83368560, + "step": 38570 + }, + { + "epoch": 6.292822185970636, + "grad_norm": 0.007728797383606434, + "learning_rate": 0.0008661134512494343, + "loss": 0.1884, + "num_input_tokens_seen": 83378416, + "step": 38575 + }, + { + "epoch": 6.293637846655791, + "grad_norm": 0.04982810467481613, + "learning_rate": 0.0008660649697553612, + "loss": 0.2022, + "num_input_tokens_seen": 83389712, + "step": 38580 + }, + { + "epoch": 6.294453507340946, + "grad_norm": 0.0082283029332757, + "learning_rate": 0.000866016480842517, + "loss": 0.0575, + "num_input_tokens_seen": 83399920, + "step": 38585 + }, + { + "epoch": 6.295269168026101, + "grad_norm": 0.06599834561347961, + "learning_rate": 0.0008659679845118847, + "loss": 0.0333, + "num_input_tokens_seen": 83411504, + "step": 38590 + }, + { + "epoch": 6.2960848287112565, + "grad_norm": 0.021498100832104683, + "learning_rate": 0.0008659194807644468, + "loss": 0.1437, + "num_input_tokens_seen": 83421296, + "step": 38595 + }, + { + "epoch": 6.296900489396411, + "grad_norm": 0.016979070380330086, + "learning_rate": 0.0008658709696011864, + "loss": 0.0244, + "num_input_tokens_seen": 83432016, + "step": 38600 + }, + { + "epoch": 6.297716150081566, + "grad_norm": 0.01811056397855282, + "learning_rate": 0.0008658224510230867, + "loss": 0.0568, + "num_input_tokens_seen": 83442448, + "step": 38605 + }, + { + "epoch": 6.298531810766721, + "grad_norm": 0.07393760234117508, + "learning_rate": 0.0008657739250311309, + "loss": 0.0306, + "num_input_tokens_seen": 83453744, + "step": 38610 + }, + { + "epoch": 6.299347471451876, + "grad_norm": 0.01685251295566559, + "learning_rate": 0.0008657253916263026, + "loss": 0.0194, + "num_input_tokens_seen": 83464528, + "step": 38615 + }, + { + "epoch": 6.300163132137031, + "grad_norm": 0.009738907217979431, + "learning_rate": 0.0008656768508095852, + "loss": 0.0169, + "num_input_tokens_seen": 83474928, + "step": 38620 + }, + { + "epoch": 6.300978792822186, + "grad_norm": 0.048500582575798035, + "learning_rate": 0.0008656283025819626, + "loss": 0.1431, + "num_input_tokens_seen": 83485584, + "step": 38625 + }, + { + "epoch": 6.301794453507341, + "grad_norm": 0.01439825538545847, + "learning_rate": 0.0008655797469444186, + "loss": 0.0978, + "num_input_tokens_seen": 83496752, + "step": 38630 + }, + { + "epoch": 6.302610114192496, + "grad_norm": 0.2932996451854706, + "learning_rate": 0.0008655311838979371, + "loss": 0.1392, + "num_input_tokens_seen": 83506736, + "step": 38635 + }, + { + "epoch": 6.303425774877651, + "grad_norm": 0.08580674976110458, + "learning_rate": 0.0008654826134435028, + "loss": 0.0396, + "num_input_tokens_seen": 83518576, + "step": 38640 + }, + { + "epoch": 6.304241435562806, + "grad_norm": 0.021468764171004295, + "learning_rate": 0.0008654340355820993, + "loss": 0.1472, + "num_input_tokens_seen": 83528944, + "step": 38645 + }, + { + "epoch": 6.30505709624796, + "grad_norm": 0.027125662192702293, + "learning_rate": 0.0008653854503147117, + "loss": 0.154, + "num_input_tokens_seen": 83540560, + "step": 38650 + }, + { + "epoch": 6.305872756933116, + "grad_norm": 0.004373315721750259, + "learning_rate": 0.0008653368576423244, + "loss": 0.0116, + "num_input_tokens_seen": 83551984, + "step": 38655 + }, + { + "epoch": 6.306688417618271, + "grad_norm": 0.020719408988952637, + "learning_rate": 0.0008652882575659222, + "loss": 0.0158, + "num_input_tokens_seen": 83561712, + "step": 38660 + }, + { + "epoch": 6.307504078303426, + "grad_norm": 0.05750217288732529, + "learning_rate": 0.00086523965008649, + "loss": 0.0459, + "num_input_tokens_seen": 83571888, + "step": 38665 + }, + { + "epoch": 6.308319738988581, + "grad_norm": 0.018178651109337807, + "learning_rate": 0.0008651910352050129, + "loss": 0.0802, + "num_input_tokens_seen": 83582928, + "step": 38670 + }, + { + "epoch": 6.309135399673735, + "grad_norm": 0.04456610232591629, + "learning_rate": 0.0008651424129224764, + "loss": 0.055, + "num_input_tokens_seen": 83594672, + "step": 38675 + }, + { + "epoch": 6.309951060358891, + "grad_norm": 0.1513279229402542, + "learning_rate": 0.0008650937832398656, + "loss": 0.1175, + "num_input_tokens_seen": 83604400, + "step": 38680 + }, + { + "epoch": 6.310766721044046, + "grad_norm": 0.34819474816322327, + "learning_rate": 0.0008650451461581661, + "loss": 0.0755, + "num_input_tokens_seen": 83614320, + "step": 38685 + }, + { + "epoch": 6.311582381729201, + "grad_norm": 0.030669698491692543, + "learning_rate": 0.0008649965016783636, + "loss": 0.0355, + "num_input_tokens_seen": 83624784, + "step": 38690 + }, + { + "epoch": 6.3123980424143555, + "grad_norm": 0.030591286718845367, + "learning_rate": 0.0008649478498014441, + "loss": 0.1491, + "num_input_tokens_seen": 83635408, + "step": 38695 + }, + { + "epoch": 6.31321370309951, + "grad_norm": 0.001337647088803351, + "learning_rate": 0.0008648991905283931, + "loss": 0.0464, + "num_input_tokens_seen": 83647568, + "step": 38700 + }, + { + "epoch": 6.314029363784665, + "grad_norm": 0.030285224318504333, + "learning_rate": 0.0008648505238601974, + "loss": 0.0486, + "num_input_tokens_seen": 83658480, + "step": 38705 + }, + { + "epoch": 6.314845024469821, + "grad_norm": 0.015253371559083462, + "learning_rate": 0.0008648018497978429, + "loss": 0.086, + "num_input_tokens_seen": 83669552, + "step": 38710 + }, + { + "epoch": 6.315660685154976, + "grad_norm": 0.19649551808834076, + "learning_rate": 0.0008647531683423162, + "loss": 0.0902, + "num_input_tokens_seen": 83681392, + "step": 38715 + }, + { + "epoch": 6.3164763458401305, + "grad_norm": 0.03983687609434128, + "learning_rate": 0.0008647044794946038, + "loss": 0.2075, + "num_input_tokens_seen": 83691920, + "step": 38720 + }, + { + "epoch": 6.317292006525285, + "grad_norm": 0.014638083986938, + "learning_rate": 0.0008646557832556925, + "loss": 0.0456, + "num_input_tokens_seen": 83702576, + "step": 38725 + }, + { + "epoch": 6.31810766721044, + "grad_norm": 0.01107320748269558, + "learning_rate": 0.000864607079626569, + "loss": 0.0201, + "num_input_tokens_seen": 83715088, + "step": 38730 + }, + { + "epoch": 6.318923327895595, + "grad_norm": 0.011112612672150135, + "learning_rate": 0.0008645583686082206, + "loss": 0.1986, + "num_input_tokens_seen": 83725264, + "step": 38735 + }, + { + "epoch": 6.319738988580751, + "grad_norm": 0.028126679360866547, + "learning_rate": 0.0008645096502016346, + "loss": 0.1116, + "num_input_tokens_seen": 83736400, + "step": 38740 + }, + { + "epoch": 6.3205546492659055, + "grad_norm": 0.103696309030056, + "learning_rate": 0.0008644609244077978, + "loss": 0.0249, + "num_input_tokens_seen": 83748144, + "step": 38745 + }, + { + "epoch": 6.32137030995106, + "grad_norm": 0.22747750580310822, + "learning_rate": 0.0008644121912276981, + "loss": 0.1537, + "num_input_tokens_seen": 83758704, + "step": 38750 + }, + { + "epoch": 6.322185970636215, + "grad_norm": 0.04633212834596634, + "learning_rate": 0.000864363450662323, + "loss": 0.0403, + "num_input_tokens_seen": 83769488, + "step": 38755 + }, + { + "epoch": 6.32300163132137, + "grad_norm": 0.15548771619796753, + "learning_rate": 0.0008643147027126604, + "loss": 0.0712, + "num_input_tokens_seen": 83780080, + "step": 38760 + }, + { + "epoch": 6.323817292006526, + "grad_norm": 0.053673189133405685, + "learning_rate": 0.0008642659473796984, + "loss": 0.1654, + "num_input_tokens_seen": 83792144, + "step": 38765 + }, + { + "epoch": 6.3246329526916805, + "grad_norm": 0.23594816029071808, + "learning_rate": 0.0008642171846644245, + "loss": 0.0615, + "num_input_tokens_seen": 83803440, + "step": 38770 + }, + { + "epoch": 6.325448613376835, + "grad_norm": 0.11045833677053452, + "learning_rate": 0.0008641684145678275, + "loss": 0.1299, + "num_input_tokens_seen": 83812912, + "step": 38775 + }, + { + "epoch": 6.32626427406199, + "grad_norm": 0.017149219289422035, + "learning_rate": 0.0008641196370908956, + "loss": 0.0582, + "num_input_tokens_seen": 83822960, + "step": 38780 + }, + { + "epoch": 6.327079934747145, + "grad_norm": 0.40087974071502686, + "learning_rate": 0.0008640708522346173, + "loss": 0.1819, + "num_input_tokens_seen": 83832304, + "step": 38785 + }, + { + "epoch": 6.327895595432301, + "grad_norm": 0.016898376867175102, + "learning_rate": 0.0008640220599999813, + "loss": 0.0217, + "num_input_tokens_seen": 83841136, + "step": 38790 + }, + { + "epoch": 6.328711256117455, + "grad_norm": 0.051673293113708496, + "learning_rate": 0.0008639732603879766, + "loss": 0.0734, + "num_input_tokens_seen": 83851600, + "step": 38795 + }, + { + "epoch": 6.32952691680261, + "grad_norm": 0.031015006825327873, + "learning_rate": 0.0008639244533995919, + "loss": 0.1093, + "num_input_tokens_seen": 83862896, + "step": 38800 + }, + { + "epoch": 6.330342577487765, + "grad_norm": 0.20969587564468384, + "learning_rate": 0.0008638756390358164, + "loss": 0.1846, + "num_input_tokens_seen": 83873552, + "step": 38805 + }, + { + "epoch": 6.33115823817292, + "grad_norm": 0.032477930188179016, + "learning_rate": 0.0008638268172976398, + "loss": 0.027, + "num_input_tokens_seen": 83884720, + "step": 38810 + }, + { + "epoch": 6.331973898858075, + "grad_norm": 0.0873703733086586, + "learning_rate": 0.0008637779881860509, + "loss": 0.1324, + "num_input_tokens_seen": 83895088, + "step": 38815 + }, + { + "epoch": 6.33278955954323, + "grad_norm": 0.034240808337926865, + "learning_rate": 0.0008637291517020397, + "loss": 0.0311, + "num_input_tokens_seen": 83906640, + "step": 38820 + }, + { + "epoch": 6.333605220228385, + "grad_norm": 0.03294326364994049, + "learning_rate": 0.0008636803078465958, + "loss": 0.0705, + "num_input_tokens_seen": 83916976, + "step": 38825 + }, + { + "epoch": 6.33442088091354, + "grad_norm": 0.2754383683204651, + "learning_rate": 0.000863631456620709, + "loss": 0.2057, + "num_input_tokens_seen": 83928432, + "step": 38830 + }, + { + "epoch": 6.335236541598695, + "grad_norm": 0.01746748387813568, + "learning_rate": 0.0008635825980253696, + "loss": 0.0447, + "num_input_tokens_seen": 83939440, + "step": 38835 + }, + { + "epoch": 6.33605220228385, + "grad_norm": 0.059651441872119904, + "learning_rate": 0.0008635337320615675, + "loss": 0.1236, + "num_input_tokens_seen": 83950032, + "step": 38840 + }, + { + "epoch": 6.3368678629690045, + "grad_norm": 0.12536899745464325, + "learning_rate": 0.0008634848587302932, + "loss": 0.1126, + "num_input_tokens_seen": 83962000, + "step": 38845 + }, + { + "epoch": 6.33768352365416, + "grad_norm": 0.024651646614074707, + "learning_rate": 0.0008634359780325372, + "loss": 0.0204, + "num_input_tokens_seen": 83973360, + "step": 38850 + }, + { + "epoch": 6.338499184339315, + "grad_norm": 0.20898933708667755, + "learning_rate": 0.0008633870899692899, + "loss": 0.0862, + "num_input_tokens_seen": 83984464, + "step": 38855 + }, + { + "epoch": 6.33931484502447, + "grad_norm": 0.004932452458888292, + "learning_rate": 0.0008633381945415422, + "loss": 0.0445, + "num_input_tokens_seen": 83993808, + "step": 38860 + }, + { + "epoch": 6.340130505709625, + "grad_norm": 0.06426247209310532, + "learning_rate": 0.0008632892917502852, + "loss": 0.023, + "num_input_tokens_seen": 84004880, + "step": 38865 + }, + { + "epoch": 6.3409461663947795, + "grad_norm": 0.011022510938346386, + "learning_rate": 0.0008632403815965099, + "loss": 0.0385, + "num_input_tokens_seen": 84016240, + "step": 38870 + }, + { + "epoch": 6.341761827079935, + "grad_norm": 0.36207377910614014, + "learning_rate": 0.0008631914640812073, + "loss": 0.1861, + "num_input_tokens_seen": 84027920, + "step": 38875 + }, + { + "epoch": 6.34257748776509, + "grad_norm": 0.02720283530652523, + "learning_rate": 0.000863142539205369, + "loss": 0.14, + "num_input_tokens_seen": 84038000, + "step": 38880 + }, + { + "epoch": 6.343393148450245, + "grad_norm": 0.03938665613532066, + "learning_rate": 0.0008630936069699864, + "loss": 0.041, + "num_input_tokens_seen": 84049136, + "step": 38885 + }, + { + "epoch": 6.3442088091354, + "grad_norm": 0.008454914204776287, + "learning_rate": 0.0008630446673760513, + "loss": 0.0379, + "num_input_tokens_seen": 84059024, + "step": 38890 + }, + { + "epoch": 6.3450244698205545, + "grad_norm": 0.24689482152462006, + "learning_rate": 0.0008629957204245555, + "loss": 0.0959, + "num_input_tokens_seen": 84069616, + "step": 38895 + }, + { + "epoch": 6.345840130505709, + "grad_norm": 0.16050291061401367, + "learning_rate": 0.000862946766116491, + "loss": 0.1066, + "num_input_tokens_seen": 84079984, + "step": 38900 + }, + { + "epoch": 6.346655791190865, + "grad_norm": 0.008206437341868877, + "learning_rate": 0.0008628978044528496, + "loss": 0.0314, + "num_input_tokens_seen": 84090768, + "step": 38905 + }, + { + "epoch": 6.34747145187602, + "grad_norm": 0.04334701597690582, + "learning_rate": 0.000862848835434624, + "loss": 0.1137, + "num_input_tokens_seen": 84102352, + "step": 38910 + }, + { + "epoch": 6.348287112561175, + "grad_norm": 0.025094132870435715, + "learning_rate": 0.0008627998590628065, + "loss": 0.0465, + "num_input_tokens_seen": 84113328, + "step": 38915 + }, + { + "epoch": 6.349102773246329, + "grad_norm": 0.017059357836842537, + "learning_rate": 0.0008627508753383895, + "loss": 0.0132, + "num_input_tokens_seen": 84124592, + "step": 38920 + }, + { + "epoch": 6.349918433931484, + "grad_norm": 0.00839939247816801, + "learning_rate": 0.0008627018842623657, + "loss": 0.0675, + "num_input_tokens_seen": 84134544, + "step": 38925 + }, + { + "epoch": 6.350734094616639, + "grad_norm": 0.1650896817445755, + "learning_rate": 0.0008626528858357283, + "loss": 0.0926, + "num_input_tokens_seen": 84144944, + "step": 38930 + }, + { + "epoch": 6.351549755301795, + "grad_norm": 0.12865620851516724, + "learning_rate": 0.0008626038800594703, + "loss": 0.079, + "num_input_tokens_seen": 84155312, + "step": 38935 + }, + { + "epoch": 6.35236541598695, + "grad_norm": 0.030272645875811577, + "learning_rate": 0.0008625548669345842, + "loss": 0.0434, + "num_input_tokens_seen": 84166096, + "step": 38940 + }, + { + "epoch": 6.353181076672104, + "grad_norm": 0.018891897052526474, + "learning_rate": 0.0008625058464620641, + "loss": 0.1183, + "num_input_tokens_seen": 84177264, + "step": 38945 + }, + { + "epoch": 6.353996737357259, + "grad_norm": 0.14031964540481567, + "learning_rate": 0.0008624568186429031, + "loss": 0.056, + "num_input_tokens_seen": 84188176, + "step": 38950 + }, + { + "epoch": 6.354812398042414, + "grad_norm": 0.13088087737560272, + "learning_rate": 0.0008624077834780948, + "loss": 0.1551, + "num_input_tokens_seen": 84199792, + "step": 38955 + }, + { + "epoch": 6.35562805872757, + "grad_norm": 0.4480366110801697, + "learning_rate": 0.000862358740968633, + "loss": 0.2585, + "num_input_tokens_seen": 84210832, + "step": 38960 + }, + { + "epoch": 6.356443719412725, + "grad_norm": 0.07605596631765366, + "learning_rate": 0.0008623096911155117, + "loss": 0.0494, + "num_input_tokens_seen": 84220496, + "step": 38965 + }, + { + "epoch": 6.357259380097879, + "grad_norm": 0.0019360106671229005, + "learning_rate": 0.000862260633919725, + "loss": 0.1394, + "num_input_tokens_seen": 84231472, + "step": 38970 + }, + { + "epoch": 6.358075040783034, + "grad_norm": 0.011364354752004147, + "learning_rate": 0.0008622115693822668, + "loss": 0.2661, + "num_input_tokens_seen": 84242608, + "step": 38975 + }, + { + "epoch": 6.358890701468189, + "grad_norm": 0.053338050842285156, + "learning_rate": 0.0008621624975041316, + "loss": 0.1578, + "num_input_tokens_seen": 84254000, + "step": 38980 + }, + { + "epoch": 6.359706362153344, + "grad_norm": 0.017331453040242195, + "learning_rate": 0.0008621134182863142, + "loss": 0.0381, + "num_input_tokens_seen": 84263952, + "step": 38985 + }, + { + "epoch": 6.3605220228384995, + "grad_norm": 0.24667911231517792, + "learning_rate": 0.0008620643317298088, + "loss": 0.1638, + "num_input_tokens_seen": 84274832, + "step": 38990 + }, + { + "epoch": 6.361337683523654, + "grad_norm": 0.12548623979091644, + "learning_rate": 0.0008620152378356105, + "loss": 0.0421, + "num_input_tokens_seen": 84285552, + "step": 38995 + }, + { + "epoch": 6.362153344208809, + "grad_norm": 0.12679541110992432, + "learning_rate": 0.0008619661366047141, + "loss": 0.0568, + "num_input_tokens_seen": 84296368, + "step": 39000 + }, + { + "epoch": 6.362969004893964, + "grad_norm": 0.13178208470344543, + "learning_rate": 0.0008619170280381148, + "loss": 0.0628, + "num_input_tokens_seen": 84307792, + "step": 39005 + }, + { + "epoch": 6.363784665579119, + "grad_norm": 0.051386695355176926, + "learning_rate": 0.0008618679121368078, + "loss": 0.0598, + "num_input_tokens_seen": 84318000, + "step": 39010 + }, + { + "epoch": 6.364600326264274, + "grad_norm": 0.017814617604017258, + "learning_rate": 0.0008618187889017886, + "loss": 0.0297, + "num_input_tokens_seen": 84328368, + "step": 39015 + }, + { + "epoch": 6.365415986949429, + "grad_norm": 0.04465307667851448, + "learning_rate": 0.0008617696583340524, + "loss": 0.0384, + "num_input_tokens_seen": 84340528, + "step": 39020 + }, + { + "epoch": 6.366231647634584, + "grad_norm": 0.003912598360329866, + "learning_rate": 0.0008617205204345952, + "loss": 0.2345, + "num_input_tokens_seen": 84351344, + "step": 39025 + }, + { + "epoch": 6.367047308319739, + "grad_norm": 0.15951602160930634, + "learning_rate": 0.000861671375204413, + "loss": 0.1794, + "num_input_tokens_seen": 84361968, + "step": 39030 + }, + { + "epoch": 6.367862969004894, + "grad_norm": 0.011443092487752438, + "learning_rate": 0.0008616222226445014, + "loss": 0.0899, + "num_input_tokens_seen": 84372432, + "step": 39035 + }, + { + "epoch": 6.368678629690049, + "grad_norm": 0.21486307680606842, + "learning_rate": 0.0008615730627558566, + "loss": 0.0626, + "num_input_tokens_seen": 84382896, + "step": 39040 + }, + { + "epoch": 6.369494290375204, + "grad_norm": 0.10696162283420563, + "learning_rate": 0.0008615238955394753, + "loss": 0.1678, + "num_input_tokens_seen": 84392464, + "step": 39045 + }, + { + "epoch": 6.370309951060359, + "grad_norm": 0.21196526288986206, + "learning_rate": 0.0008614747209963534, + "loss": 0.1264, + "num_input_tokens_seen": 84403504, + "step": 39050 + }, + { + "epoch": 6.371125611745514, + "grad_norm": 0.012137340381741524, + "learning_rate": 0.0008614255391274877, + "loss": 0.1791, + "num_input_tokens_seen": 84413904, + "step": 39055 + }, + { + "epoch": 6.371941272430669, + "grad_norm": 0.04827677458524704, + "learning_rate": 0.0008613763499338751, + "loss": 0.157, + "num_input_tokens_seen": 84425392, + "step": 39060 + }, + { + "epoch": 6.372756933115824, + "grad_norm": 0.010995679534971714, + "learning_rate": 0.0008613271534165121, + "loss": 0.0305, + "num_input_tokens_seen": 84436624, + "step": 39065 + }, + { + "epoch": 6.373572593800978, + "grad_norm": 0.11564923822879791, + "learning_rate": 0.0008612779495763963, + "loss": 0.0988, + "num_input_tokens_seen": 84446352, + "step": 39070 + }, + { + "epoch": 6.374388254486134, + "grad_norm": 0.1190088763833046, + "learning_rate": 0.0008612287384145243, + "loss": 0.0846, + "num_input_tokens_seen": 84457648, + "step": 39075 + }, + { + "epoch": 6.375203915171289, + "grad_norm": 0.01807386428117752, + "learning_rate": 0.0008611795199318937, + "loss": 0.0507, + "num_input_tokens_seen": 84467792, + "step": 39080 + }, + { + "epoch": 6.376019575856444, + "grad_norm": 0.17465393245220184, + "learning_rate": 0.000861130294129502, + "loss": 0.0798, + "num_input_tokens_seen": 84478448, + "step": 39085 + }, + { + "epoch": 6.376835236541599, + "grad_norm": 0.067719466984272, + "learning_rate": 0.0008610810610083466, + "loss": 0.0597, + "num_input_tokens_seen": 84489200, + "step": 39090 + }, + { + "epoch": 6.377650897226753, + "grad_norm": 0.05747228488326073, + "learning_rate": 0.0008610318205694256, + "loss": 0.2849, + "num_input_tokens_seen": 84499536, + "step": 39095 + }, + { + "epoch": 6.378466557911908, + "grad_norm": 0.028629053384065628, + "learning_rate": 0.0008609825728137366, + "loss": 0.1891, + "num_input_tokens_seen": 84510960, + "step": 39100 + }, + { + "epoch": 6.379282218597064, + "grad_norm": 0.08141916245222092, + "learning_rate": 0.000860933317742278, + "loss": 0.0877, + "num_input_tokens_seen": 84522704, + "step": 39105 + }, + { + "epoch": 6.380097879282219, + "grad_norm": 0.06588222086429596, + "learning_rate": 0.0008608840553560478, + "loss": 0.056, + "num_input_tokens_seen": 84532688, + "step": 39110 + }, + { + "epoch": 6.3809135399673735, + "grad_norm": 0.20890022814273834, + "learning_rate": 0.0008608347856560443, + "loss": 0.0917, + "num_input_tokens_seen": 84543312, + "step": 39115 + }, + { + "epoch": 6.381729200652528, + "grad_norm": 0.0732296034693718, + "learning_rate": 0.0008607855086432663, + "loss": 0.0376, + "num_input_tokens_seen": 84555024, + "step": 39120 + }, + { + "epoch": 6.382544861337683, + "grad_norm": 0.04963723570108414, + "learning_rate": 0.0008607362243187121, + "loss": 0.0424, + "num_input_tokens_seen": 84565456, + "step": 39125 + }, + { + "epoch": 6.383360522022839, + "grad_norm": 0.03612956404685974, + "learning_rate": 0.0008606869326833809, + "loss": 0.1472, + "num_input_tokens_seen": 84576880, + "step": 39130 + }, + { + "epoch": 6.384176182707994, + "grad_norm": 0.012030171230435371, + "learning_rate": 0.0008606376337382711, + "loss": 0.0992, + "num_input_tokens_seen": 84588176, + "step": 39135 + }, + { + "epoch": 6.3849918433931485, + "grad_norm": 0.02331584133207798, + "learning_rate": 0.0008605883274843824, + "loss": 0.0305, + "num_input_tokens_seen": 84598448, + "step": 39140 + }, + { + "epoch": 6.385807504078303, + "grad_norm": 0.19398914277553558, + "learning_rate": 0.0008605390139227137, + "loss": 0.1123, + "num_input_tokens_seen": 84609904, + "step": 39145 + }, + { + "epoch": 6.386623164763458, + "grad_norm": 0.21747860312461853, + "learning_rate": 0.0008604896930542645, + "loss": 0.0518, + "num_input_tokens_seen": 84621008, + "step": 39150 + }, + { + "epoch": 6.387438825448613, + "grad_norm": 0.14797638356685638, + "learning_rate": 0.0008604403648800346, + "loss": 0.0327, + "num_input_tokens_seen": 84632048, + "step": 39155 + }, + { + "epoch": 6.388254486133769, + "grad_norm": 0.024937812238931656, + "learning_rate": 0.0008603910294010231, + "loss": 0.1336, + "num_input_tokens_seen": 84641488, + "step": 39160 + }, + { + "epoch": 6.3890701468189235, + "grad_norm": 0.027049263939261436, + "learning_rate": 0.0008603416866182305, + "loss": 0.0497, + "num_input_tokens_seen": 84651664, + "step": 39165 + }, + { + "epoch": 6.389885807504078, + "grad_norm": 0.20507164299488068, + "learning_rate": 0.0008602923365326563, + "loss": 0.0912, + "num_input_tokens_seen": 84662000, + "step": 39170 + }, + { + "epoch": 6.390701468189233, + "grad_norm": 0.016058241948485374, + "learning_rate": 0.000860242979145301, + "loss": 0.1774, + "num_input_tokens_seen": 84674000, + "step": 39175 + }, + { + "epoch": 6.391517128874388, + "grad_norm": 0.07409879565238953, + "learning_rate": 0.0008601936144571646, + "loss": 0.0802, + "num_input_tokens_seen": 84685200, + "step": 39180 + }, + { + "epoch": 6.392332789559543, + "grad_norm": 0.22600983083248138, + "learning_rate": 0.0008601442424692476, + "loss": 0.1863, + "num_input_tokens_seen": 84695984, + "step": 39185 + }, + { + "epoch": 6.3931484502446985, + "grad_norm": 0.015394588001072407, + "learning_rate": 0.0008600948631825508, + "loss": 0.0487, + "num_input_tokens_seen": 84705936, + "step": 39190 + }, + { + "epoch": 6.393964110929853, + "grad_norm": 0.22620944678783417, + "learning_rate": 0.0008600454765980747, + "loss": 0.0313, + "num_input_tokens_seen": 84716144, + "step": 39195 + }, + { + "epoch": 6.394779771615008, + "grad_norm": 0.01256847195327282, + "learning_rate": 0.0008599960827168204, + "loss": 0.0432, + "num_input_tokens_seen": 84726704, + "step": 39200 + }, + { + "epoch": 6.395595432300163, + "grad_norm": 0.01900371164083481, + "learning_rate": 0.0008599466815397886, + "loss": 0.1159, + "num_input_tokens_seen": 84736784, + "step": 39205 + }, + { + "epoch": 6.396411092985318, + "grad_norm": 0.32982227206230164, + "learning_rate": 0.0008598972730679809, + "loss": 0.2601, + "num_input_tokens_seen": 84746672, + "step": 39210 + }, + { + "epoch": 6.397226753670473, + "grad_norm": 0.06382697820663452, + "learning_rate": 0.0008598478573023982, + "loss": 0.0882, + "num_input_tokens_seen": 84756560, + "step": 39215 + }, + { + "epoch": 6.398042414355628, + "grad_norm": 0.009049140848219395, + "learning_rate": 0.0008597984342440421, + "loss": 0.0119, + "num_input_tokens_seen": 84768592, + "step": 39220 + }, + { + "epoch": 6.398858075040783, + "grad_norm": 0.06087180972099304, + "learning_rate": 0.0008597490038939145, + "loss": 0.0424, + "num_input_tokens_seen": 84778736, + "step": 39225 + }, + { + "epoch": 6.399673735725938, + "grad_norm": 0.021845834329724312, + "learning_rate": 0.0008596995662530169, + "loss": 0.0358, + "num_input_tokens_seen": 84789008, + "step": 39230 + }, + { + "epoch": 6.400489396411093, + "grad_norm": 0.005335748661309481, + "learning_rate": 0.0008596501213223514, + "loss": 0.1087, + "num_input_tokens_seen": 84800816, + "step": 39235 + }, + { + "epoch": 6.401305057096248, + "grad_norm": 0.054845456033945084, + "learning_rate": 0.0008596006691029196, + "loss": 0.0228, + "num_input_tokens_seen": 84811152, + "step": 39240 + }, + { + "epoch": 6.402120717781403, + "grad_norm": 0.03387339413166046, + "learning_rate": 0.0008595512095957244, + "loss": 0.0684, + "num_input_tokens_seen": 84822032, + "step": 39245 + }, + { + "epoch": 6.402936378466558, + "grad_norm": 0.10483232140541077, + "learning_rate": 0.0008595017428017677, + "loss": 0.0402, + "num_input_tokens_seen": 84832144, + "step": 39250 + }, + { + "epoch": 6.403752039151713, + "grad_norm": 0.015923313796520233, + "learning_rate": 0.000859452268722052, + "loss": 0.0227, + "num_input_tokens_seen": 84843440, + "step": 39255 + }, + { + "epoch": 6.404567699836868, + "grad_norm": 0.0959169939160347, + "learning_rate": 0.0008594027873575803, + "loss": 0.1808, + "num_input_tokens_seen": 84852688, + "step": 39260 + }, + { + "epoch": 6.4053833605220225, + "grad_norm": 0.2450348436832428, + "learning_rate": 0.0008593532987093551, + "loss": 0.1963, + "num_input_tokens_seen": 84862544, + "step": 39265 + }, + { + "epoch": 6.406199021207178, + "grad_norm": 0.015201598405838013, + "learning_rate": 0.0008593038027783793, + "loss": 0.068, + "num_input_tokens_seen": 84873744, + "step": 39270 + }, + { + "epoch": 6.407014681892333, + "grad_norm": 0.19144880771636963, + "learning_rate": 0.0008592542995656563, + "loss": 0.2351, + "num_input_tokens_seen": 84883856, + "step": 39275 + }, + { + "epoch": 6.407830342577488, + "grad_norm": 0.021460549905896187, + "learning_rate": 0.000859204789072189, + "loss": 0.044, + "num_input_tokens_seen": 84895536, + "step": 39280 + }, + { + "epoch": 6.408646003262643, + "grad_norm": 0.025125499814748764, + "learning_rate": 0.0008591552712989812, + "loss": 0.0558, + "num_input_tokens_seen": 84905840, + "step": 39285 + }, + { + "epoch": 6.4094616639477975, + "grad_norm": 0.007889053784310818, + "learning_rate": 0.0008591057462470359, + "loss": 0.1521, + "num_input_tokens_seen": 84917008, + "step": 39290 + }, + { + "epoch": 6.410277324632952, + "grad_norm": 0.07312604784965515, + "learning_rate": 0.0008590562139173573, + "loss": 0.0741, + "num_input_tokens_seen": 84929424, + "step": 39295 + }, + { + "epoch": 6.411092985318108, + "grad_norm": 0.1872720569372177, + "learning_rate": 0.000859006674310949, + "loss": 0.048, + "num_input_tokens_seen": 84940400, + "step": 39300 + }, + { + "epoch": 6.411908646003263, + "grad_norm": 0.024740254506468773, + "learning_rate": 0.000858957127428815, + "loss": 0.0569, + "num_input_tokens_seen": 84951888, + "step": 39305 + }, + { + "epoch": 6.412724306688418, + "grad_norm": 0.2020983099937439, + "learning_rate": 0.0008589075732719594, + "loss": 0.1, + "num_input_tokens_seen": 84961232, + "step": 39310 + }, + { + "epoch": 6.4135399673735725, + "grad_norm": 0.021523285657167435, + "learning_rate": 0.0008588580118413867, + "loss": 0.0291, + "num_input_tokens_seen": 84972240, + "step": 39315 + }, + { + "epoch": 6.414355628058727, + "grad_norm": 0.046101946383714676, + "learning_rate": 0.0008588084431381009, + "loss": 0.0816, + "num_input_tokens_seen": 84983408, + "step": 39320 + }, + { + "epoch": 6.415171288743883, + "grad_norm": 0.02815198339521885, + "learning_rate": 0.000858758867163107, + "loss": 0.0823, + "num_input_tokens_seen": 84994128, + "step": 39325 + }, + { + "epoch": 6.415986949429038, + "grad_norm": 0.13036702573299408, + "learning_rate": 0.0008587092839174096, + "loss": 0.0584, + "num_input_tokens_seen": 85005904, + "step": 39330 + }, + { + "epoch": 6.416802610114193, + "grad_norm": 0.004371070768684149, + "learning_rate": 0.0008586596934020132, + "loss": 0.0292, + "num_input_tokens_seen": 85016624, + "step": 39335 + }, + { + "epoch": 6.417618270799347, + "grad_norm": 0.015720317140221596, + "learning_rate": 0.0008586100956179234, + "loss": 0.1512, + "num_input_tokens_seen": 85027152, + "step": 39340 + }, + { + "epoch": 6.418433931484502, + "grad_norm": 0.21012809872627258, + "learning_rate": 0.000858560490566145, + "loss": 0.0698, + "num_input_tokens_seen": 85036784, + "step": 39345 + }, + { + "epoch": 6.419249592169657, + "grad_norm": 0.013738599605858326, + "learning_rate": 0.0008585108782476834, + "loss": 0.0278, + "num_input_tokens_seen": 85047440, + "step": 39350 + }, + { + "epoch": 6.420065252854813, + "grad_norm": 0.1178101897239685, + "learning_rate": 0.000858461258663544, + "loss": 0.0739, + "num_input_tokens_seen": 85057744, + "step": 39355 + }, + { + "epoch": 6.420880913539968, + "grad_norm": 0.007661431562155485, + "learning_rate": 0.0008584116318147324, + "loss": 0.0287, + "num_input_tokens_seen": 85069392, + "step": 39360 + }, + { + "epoch": 6.421696574225122, + "grad_norm": 0.06654492020606995, + "learning_rate": 0.0008583619977022546, + "loss": 0.0798, + "num_input_tokens_seen": 85080560, + "step": 39365 + }, + { + "epoch": 6.422512234910277, + "grad_norm": 0.043904032558202744, + "learning_rate": 0.000858312356327116, + "loss": 0.0626, + "num_input_tokens_seen": 85091344, + "step": 39370 + }, + { + "epoch": 6.423327895595432, + "grad_norm": 0.04809953272342682, + "learning_rate": 0.0008582627076903232, + "loss": 0.1128, + "num_input_tokens_seen": 85102256, + "step": 39375 + }, + { + "epoch": 6.424143556280587, + "grad_norm": 0.3738861680030823, + "learning_rate": 0.0008582130517928821, + "loss": 0.2677, + "num_input_tokens_seen": 85113168, + "step": 39380 + }, + { + "epoch": 6.424959216965743, + "grad_norm": 0.010011561214923859, + "learning_rate": 0.000858163388635799, + "loss": 0.0636, + "num_input_tokens_seen": 85123504, + "step": 39385 + }, + { + "epoch": 6.425774877650897, + "grad_norm": 0.028765080496668816, + "learning_rate": 0.0008581137182200806, + "loss": 0.0107, + "num_input_tokens_seen": 85133584, + "step": 39390 + }, + { + "epoch": 6.426590538336052, + "grad_norm": 0.2487831860780716, + "learning_rate": 0.0008580640405467333, + "loss": 0.0799, + "num_input_tokens_seen": 85143344, + "step": 39395 + }, + { + "epoch": 6.427406199021207, + "grad_norm": 0.1759326159954071, + "learning_rate": 0.0008580143556167638, + "loss": 0.0973, + "num_input_tokens_seen": 85154000, + "step": 39400 + }, + { + "epoch": 6.428221859706362, + "grad_norm": 0.02199672721326351, + "learning_rate": 0.0008579646634311795, + "loss": 0.0562, + "num_input_tokens_seen": 85164432, + "step": 39405 + }, + { + "epoch": 6.4290375203915175, + "grad_norm": 0.16176863014698029, + "learning_rate": 0.0008579149639909872, + "loss": 0.045, + "num_input_tokens_seen": 85175152, + "step": 39410 + }, + { + "epoch": 6.429853181076672, + "grad_norm": 0.05018671602010727, + "learning_rate": 0.0008578652572971939, + "loss": 0.0616, + "num_input_tokens_seen": 85187248, + "step": 39415 + }, + { + "epoch": 6.430668841761827, + "grad_norm": 0.005079567898064852, + "learning_rate": 0.0008578155433508073, + "loss": 0.2459, + "num_input_tokens_seen": 85199120, + "step": 39420 + }, + { + "epoch": 6.431484502446982, + "grad_norm": 0.016508281230926514, + "learning_rate": 0.0008577658221528349, + "loss": 0.0143, + "num_input_tokens_seen": 85209840, + "step": 39425 + }, + { + "epoch": 6.432300163132137, + "grad_norm": 0.001730692689307034, + "learning_rate": 0.000857716093704284, + "loss": 0.1407, + "num_input_tokens_seen": 85220592, + "step": 39430 + }, + { + "epoch": 6.433115823817292, + "grad_norm": 0.013995840214192867, + "learning_rate": 0.0008576663580061628, + "loss": 0.0532, + "num_input_tokens_seen": 85232656, + "step": 39435 + }, + { + "epoch": 6.433931484502447, + "grad_norm": 0.27595826983451843, + "learning_rate": 0.0008576166150594792, + "loss": 0.1849, + "num_input_tokens_seen": 85244368, + "step": 39440 + }, + { + "epoch": 6.434747145187602, + "grad_norm": 0.025783469900488853, + "learning_rate": 0.0008575668648652411, + "loss": 0.1106, + "num_input_tokens_seen": 85252976, + "step": 39445 + }, + { + "epoch": 6.435562805872757, + "grad_norm": 0.32373544573783875, + "learning_rate": 0.0008575171074244568, + "loss": 0.1423, + "num_input_tokens_seen": 85263984, + "step": 39450 + }, + { + "epoch": 6.436378466557912, + "grad_norm": 0.025260545313358307, + "learning_rate": 0.000857467342738135, + "loss": 0.0328, + "num_input_tokens_seen": 85274000, + "step": 39455 + }, + { + "epoch": 6.437194127243067, + "grad_norm": 0.020506108179688454, + "learning_rate": 0.000857417570807284, + "loss": 0.1123, + "num_input_tokens_seen": 85285584, + "step": 39460 + }, + { + "epoch": 6.438009787928221, + "grad_norm": 0.013910512439906597, + "learning_rate": 0.0008573677916329124, + "loss": 0.0458, + "num_input_tokens_seen": 85297072, + "step": 39465 + }, + { + "epoch": 6.438825448613377, + "grad_norm": 0.011981474235653877, + "learning_rate": 0.0008573180052160291, + "loss": 0.0254, + "num_input_tokens_seen": 85307248, + "step": 39470 + }, + { + "epoch": 6.439641109298532, + "grad_norm": 0.15529952943325043, + "learning_rate": 0.0008572682115576433, + "loss": 0.121, + "num_input_tokens_seen": 85318384, + "step": 39475 + }, + { + "epoch": 6.440456769983687, + "grad_norm": 0.010499502532184124, + "learning_rate": 0.0008572184106587638, + "loss": 0.09, + "num_input_tokens_seen": 85328208, + "step": 39480 + }, + { + "epoch": 6.441272430668842, + "grad_norm": 0.1622689962387085, + "learning_rate": 0.0008571686025204002, + "loss": 0.1659, + "num_input_tokens_seen": 85338960, + "step": 39485 + }, + { + "epoch": 6.442088091353996, + "grad_norm": 0.18635737895965576, + "learning_rate": 0.0008571187871435616, + "loss": 0.1749, + "num_input_tokens_seen": 85349904, + "step": 39490 + }, + { + "epoch": 6.442903752039152, + "grad_norm": 0.3250514566898346, + "learning_rate": 0.0008570689645292579, + "loss": 0.1153, + "num_input_tokens_seen": 85361008, + "step": 39495 + }, + { + "epoch": 6.443719412724307, + "grad_norm": 0.012182512320578098, + "learning_rate": 0.0008570191346784986, + "loss": 0.0405, + "num_input_tokens_seen": 85372336, + "step": 39500 + }, + { + "epoch": 6.444535073409462, + "grad_norm": 0.009086497128009796, + "learning_rate": 0.0008569692975922935, + "loss": 0.0852, + "num_input_tokens_seen": 85384112, + "step": 39505 + }, + { + "epoch": 6.445350734094617, + "grad_norm": 0.27018457651138306, + "learning_rate": 0.0008569194532716529, + "loss": 0.0806, + "num_input_tokens_seen": 85394192, + "step": 39510 + }, + { + "epoch": 6.446166394779771, + "grad_norm": 0.17779019474983215, + "learning_rate": 0.0008568696017175868, + "loss": 0.0703, + "num_input_tokens_seen": 85405744, + "step": 39515 + }, + { + "epoch": 6.446982055464926, + "grad_norm": 0.020411750301718712, + "learning_rate": 0.0008568197429311054, + "loss": 0.0819, + "num_input_tokens_seen": 85417968, + "step": 39520 + }, + { + "epoch": 6.447797716150082, + "grad_norm": 0.013875674456357956, + "learning_rate": 0.0008567698769132193, + "loss": 0.0579, + "num_input_tokens_seen": 85428112, + "step": 39525 + }, + { + "epoch": 6.448613376835237, + "grad_norm": 0.031358882784843445, + "learning_rate": 0.0008567200036649391, + "loss": 0.1543, + "num_input_tokens_seen": 85438800, + "step": 39530 + }, + { + "epoch": 6.4494290375203915, + "grad_norm": 0.21101713180541992, + "learning_rate": 0.0008566701231872753, + "loss": 0.1086, + "num_input_tokens_seen": 85450640, + "step": 39535 + }, + { + "epoch": 6.450244698205546, + "grad_norm": 0.375827431678772, + "learning_rate": 0.0008566202354812392, + "loss": 0.1573, + "num_input_tokens_seen": 85461712, + "step": 39540 + }, + { + "epoch": 6.451060358890701, + "grad_norm": 0.15580013394355774, + "learning_rate": 0.0008565703405478415, + "loss": 0.171, + "num_input_tokens_seen": 85472464, + "step": 39545 + }, + { + "epoch": 6.451876019575856, + "grad_norm": 0.019443074241280556, + "learning_rate": 0.0008565204383880937, + "loss": 0.0262, + "num_input_tokens_seen": 85483472, + "step": 39550 + }, + { + "epoch": 6.452691680261012, + "grad_norm": 0.01120977383106947, + "learning_rate": 0.0008564705290030068, + "loss": 0.0563, + "num_input_tokens_seen": 85492976, + "step": 39555 + }, + { + "epoch": 6.4535073409461665, + "grad_norm": 0.015544314868748188, + "learning_rate": 0.0008564206123935924, + "loss": 0.0935, + "num_input_tokens_seen": 85503216, + "step": 39560 + }, + { + "epoch": 6.454323001631321, + "grad_norm": 0.042205292731523514, + "learning_rate": 0.0008563706885608622, + "loss": 0.0113, + "num_input_tokens_seen": 85513616, + "step": 39565 + }, + { + "epoch": 6.455138662316476, + "grad_norm": 0.04667263105511665, + "learning_rate": 0.0008563207575058279, + "loss": 0.1049, + "num_input_tokens_seen": 85524624, + "step": 39570 + }, + { + "epoch": 6.455954323001631, + "grad_norm": 0.21057234704494476, + "learning_rate": 0.0008562708192295012, + "loss": 0.1132, + "num_input_tokens_seen": 85535760, + "step": 39575 + }, + { + "epoch": 6.456769983686787, + "grad_norm": 0.007653217297047377, + "learning_rate": 0.0008562208737328947, + "loss": 0.0387, + "num_input_tokens_seen": 85546576, + "step": 39580 + }, + { + "epoch": 6.4575856443719415, + "grad_norm": 0.14348191022872925, + "learning_rate": 0.0008561709210170201, + "loss": 0.0989, + "num_input_tokens_seen": 85556592, + "step": 39585 + }, + { + "epoch": 6.458401305057096, + "grad_norm": 0.1331503540277481, + "learning_rate": 0.00085612096108289, + "loss": 0.0442, + "num_input_tokens_seen": 85566640, + "step": 39590 + }, + { + "epoch": 6.459216965742251, + "grad_norm": 0.10012613236904144, + "learning_rate": 0.0008560709939315169, + "loss": 0.032, + "num_input_tokens_seen": 85577552, + "step": 39595 + }, + { + "epoch": 6.460032626427406, + "grad_norm": 0.011143765412271023, + "learning_rate": 0.0008560210195639133, + "loss": 0.1716, + "num_input_tokens_seen": 85588560, + "step": 39600 + }, + { + "epoch": 6.460848287112561, + "grad_norm": 0.08696023374795914, + "learning_rate": 0.0008559710379810922, + "loss": 0.0537, + "num_input_tokens_seen": 85599440, + "step": 39605 + }, + { + "epoch": 6.4616639477977165, + "grad_norm": 0.01324189268052578, + "learning_rate": 0.0008559210491840664, + "loss": 0.0355, + "num_input_tokens_seen": 85610192, + "step": 39610 + }, + { + "epoch": 6.462479608482871, + "grad_norm": 0.02257055602967739, + "learning_rate": 0.0008558710531738489, + "loss": 0.2433, + "num_input_tokens_seen": 85620112, + "step": 39615 + }, + { + "epoch": 6.463295269168026, + "grad_norm": 0.016799110919237137, + "learning_rate": 0.0008558210499514532, + "loss": 0.0371, + "num_input_tokens_seen": 85630832, + "step": 39620 + }, + { + "epoch": 6.464110929853181, + "grad_norm": 0.028752895072102547, + "learning_rate": 0.0008557710395178926, + "loss": 0.1295, + "num_input_tokens_seen": 85641840, + "step": 39625 + }, + { + "epoch": 6.464926590538336, + "grad_norm": 0.04469837620854378, + "learning_rate": 0.0008557210218741805, + "loss": 0.0474, + "num_input_tokens_seen": 85652112, + "step": 39630 + }, + { + "epoch": 6.465742251223491, + "grad_norm": 0.007485832553356886, + "learning_rate": 0.0008556709970213305, + "loss": 0.0592, + "num_input_tokens_seen": 85662352, + "step": 39635 + }, + { + "epoch": 6.466557911908646, + "grad_norm": 0.014441374689340591, + "learning_rate": 0.0008556209649603566, + "loss": 0.1465, + "num_input_tokens_seen": 85674576, + "step": 39640 + }, + { + "epoch": 6.467373572593801, + "grad_norm": 0.0882072001695633, + "learning_rate": 0.0008555709256922728, + "loss": 0.0681, + "num_input_tokens_seen": 85685008, + "step": 39645 + }, + { + "epoch": 6.468189233278956, + "grad_norm": 0.22026503086090088, + "learning_rate": 0.0008555208792180931, + "loss": 0.1859, + "num_input_tokens_seen": 85695600, + "step": 39650 + }, + { + "epoch": 6.469004893964111, + "grad_norm": 0.30590370297431946, + "learning_rate": 0.0008554708255388317, + "loss": 0.0633, + "num_input_tokens_seen": 85706896, + "step": 39655 + }, + { + "epoch": 6.4698205546492655, + "grad_norm": 0.010739093646407127, + "learning_rate": 0.0008554207646555032, + "loss": 0.0633, + "num_input_tokens_seen": 85718640, + "step": 39660 + }, + { + "epoch": 6.470636215334421, + "grad_norm": 0.034509677439928055, + "learning_rate": 0.0008553706965691218, + "loss": 0.0709, + "num_input_tokens_seen": 85728592, + "step": 39665 + }, + { + "epoch": 6.471451876019576, + "grad_norm": 0.1682896912097931, + "learning_rate": 0.0008553206212807026, + "loss": 0.1079, + "num_input_tokens_seen": 85739024, + "step": 39670 + }, + { + "epoch": 6.472267536704731, + "grad_norm": 0.16085843741893768, + "learning_rate": 0.0008552705387912602, + "loss": 0.1775, + "num_input_tokens_seen": 85749872, + "step": 39675 + }, + { + "epoch": 6.473083197389886, + "grad_norm": 0.18290142714977264, + "learning_rate": 0.0008552204491018096, + "loss": 0.0961, + "num_input_tokens_seen": 85760208, + "step": 39680 + }, + { + "epoch": 6.4738988580750405, + "grad_norm": 0.21753232181072235, + "learning_rate": 0.000855170352213366, + "loss": 0.2405, + "num_input_tokens_seen": 85770448, + "step": 39685 + }, + { + "epoch": 6.474714518760196, + "grad_norm": 0.037998493760824203, + "learning_rate": 0.0008551202481269446, + "loss": 0.1286, + "num_input_tokens_seen": 85782288, + "step": 39690 + }, + { + "epoch": 6.475530179445351, + "grad_norm": 0.32717499136924744, + "learning_rate": 0.000855070136843561, + "loss": 0.1742, + "num_input_tokens_seen": 85793264, + "step": 39695 + }, + { + "epoch": 6.476345840130506, + "grad_norm": 0.0370999239385128, + "learning_rate": 0.0008550200183642304, + "loss": 0.1615, + "num_input_tokens_seen": 85803952, + "step": 39700 + }, + { + "epoch": 6.477161500815661, + "grad_norm": 0.025982806459069252, + "learning_rate": 0.000854969892689969, + "loss": 0.026, + "num_input_tokens_seen": 85814576, + "step": 39705 + }, + { + "epoch": 6.4779771615008155, + "grad_norm": 0.4047028124332428, + "learning_rate": 0.0008549197598217923, + "loss": 0.1137, + "num_input_tokens_seen": 85824560, + "step": 39710 + }, + { + "epoch": 6.47879282218597, + "grad_norm": 0.01007707417011261, + "learning_rate": 0.0008548696197607165, + "loss": 0.055, + "num_input_tokens_seen": 85834896, + "step": 39715 + }, + { + "epoch": 6.479608482871126, + "grad_norm": 0.22528640925884247, + "learning_rate": 0.0008548194725077576, + "loss": 0.0809, + "num_input_tokens_seen": 85845840, + "step": 39720 + }, + { + "epoch": 6.480424143556281, + "grad_norm": 0.05834813788533211, + "learning_rate": 0.000854769318063932, + "loss": 0.0625, + "num_input_tokens_seen": 85856464, + "step": 39725 + }, + { + "epoch": 6.481239804241436, + "grad_norm": 0.03998285531997681, + "learning_rate": 0.0008547191564302561, + "loss": 0.0396, + "num_input_tokens_seen": 85868272, + "step": 39730 + }, + { + "epoch": 6.4820554649265905, + "grad_norm": 0.003781791077926755, + "learning_rate": 0.0008546689876077464, + "loss": 0.0394, + "num_input_tokens_seen": 85877936, + "step": 39735 + }, + { + "epoch": 6.482871125611745, + "grad_norm": 0.012051782570779324, + "learning_rate": 0.0008546188115974198, + "loss": 0.1194, + "num_input_tokens_seen": 85889296, + "step": 39740 + }, + { + "epoch": 6.4836867862969, + "grad_norm": 0.040543217211961746, + "learning_rate": 0.0008545686284002932, + "loss": 0.0667, + "num_input_tokens_seen": 85900496, + "step": 39745 + }, + { + "epoch": 6.484502446982056, + "grad_norm": 0.03690031170845032, + "learning_rate": 0.0008545184380173835, + "loss": 0.0351, + "num_input_tokens_seen": 85912976, + "step": 39750 + }, + { + "epoch": 6.485318107667211, + "grad_norm": 0.06059866026043892, + "learning_rate": 0.0008544682404497079, + "loss": 0.0299, + "num_input_tokens_seen": 85924112, + "step": 39755 + }, + { + "epoch": 6.486133768352365, + "grad_norm": 0.008430173620581627, + "learning_rate": 0.0008544180356982838, + "loss": 0.0773, + "num_input_tokens_seen": 85935152, + "step": 39760 + }, + { + "epoch": 6.48694942903752, + "grad_norm": 0.016898339614272118, + "learning_rate": 0.0008543678237641284, + "loss": 0.0268, + "num_input_tokens_seen": 85945616, + "step": 39765 + }, + { + "epoch": 6.487765089722675, + "grad_norm": 0.24281612038612366, + "learning_rate": 0.0008543176046482597, + "loss": 0.1337, + "num_input_tokens_seen": 85956304, + "step": 39770 + }, + { + "epoch": 6.488580750407831, + "grad_norm": 0.17431525886058807, + "learning_rate": 0.0008542673783516952, + "loss": 0.0515, + "num_input_tokens_seen": 85968752, + "step": 39775 + }, + { + "epoch": 6.489396411092986, + "grad_norm": 0.10620342195034027, + "learning_rate": 0.0008542171448754528, + "loss": 0.0484, + "num_input_tokens_seen": 85979600, + "step": 39780 + }, + { + "epoch": 6.49021207177814, + "grad_norm": 0.2232169657945633, + "learning_rate": 0.0008541669042205507, + "loss": 0.0962, + "num_input_tokens_seen": 85990416, + "step": 39785 + }, + { + "epoch": 6.491027732463295, + "grad_norm": 0.11217696219682693, + "learning_rate": 0.0008541166563880069, + "loss": 0.0293, + "num_input_tokens_seen": 86002064, + "step": 39790 + }, + { + "epoch": 6.49184339314845, + "grad_norm": 0.17561346292495728, + "learning_rate": 0.00085406640137884, + "loss": 0.2404, + "num_input_tokens_seen": 86012240, + "step": 39795 + }, + { + "epoch": 6.492659053833605, + "grad_norm": 0.3116014003753662, + "learning_rate": 0.0008540161391940681, + "loss": 0.1079, + "num_input_tokens_seen": 86022224, + "step": 39800 + }, + { + "epoch": 6.493474714518761, + "grad_norm": 0.24169039726257324, + "learning_rate": 0.0008539658698347102, + "loss": 0.1294, + "num_input_tokens_seen": 86032176, + "step": 39805 + }, + { + "epoch": 6.494290375203915, + "grad_norm": 0.026461800560355186, + "learning_rate": 0.0008539155933017848, + "loss": 0.0391, + "num_input_tokens_seen": 86043088, + "step": 39810 + }, + { + "epoch": 6.49510603588907, + "grad_norm": 0.012956095859408379, + "learning_rate": 0.0008538653095963109, + "loss": 0.0505, + "num_input_tokens_seen": 86053360, + "step": 39815 + }, + { + "epoch": 6.495921696574225, + "grad_norm": 0.05258989706635475, + "learning_rate": 0.0008538150187193076, + "loss": 0.1171, + "num_input_tokens_seen": 86063888, + "step": 39820 + }, + { + "epoch": 6.49673735725938, + "grad_norm": 0.01811002753674984, + "learning_rate": 0.0008537647206717942, + "loss": 0.0675, + "num_input_tokens_seen": 86074448, + "step": 39825 + }, + { + "epoch": 6.497553017944535, + "grad_norm": 0.09665459394454956, + "learning_rate": 0.00085371441545479, + "loss": 0.0486, + "num_input_tokens_seen": 86085392, + "step": 39830 + }, + { + "epoch": 6.49836867862969, + "grad_norm": 0.020809080451726913, + "learning_rate": 0.0008536641030693143, + "loss": 0.0123, + "num_input_tokens_seen": 86095120, + "step": 39835 + }, + { + "epoch": 6.499184339314845, + "grad_norm": 0.033111944794654846, + "learning_rate": 0.000853613783516387, + "loss": 0.074, + "num_input_tokens_seen": 86106000, + "step": 39840 + }, + { + "epoch": 6.5, + "grad_norm": 0.01744101569056511, + "learning_rate": 0.0008535634567970277, + "loss": 0.0766, + "num_input_tokens_seen": 86117456, + "step": 39845 + }, + { + "epoch": 6.500815660685155, + "grad_norm": 0.05731053650379181, + "learning_rate": 0.0008535131229122565, + "loss": 0.1427, + "num_input_tokens_seen": 86127984, + "step": 39850 + }, + { + "epoch": 6.50163132137031, + "grad_norm": 0.0589575469493866, + "learning_rate": 0.0008534627818630933, + "loss": 0.0319, + "num_input_tokens_seen": 86138928, + "step": 39855 + }, + { + "epoch": 6.502446982055465, + "grad_norm": 0.08219664543867111, + "learning_rate": 0.0008534124336505585, + "loss": 0.0133, + "num_input_tokens_seen": 86149424, + "step": 39860 + }, + { + "epoch": 6.50326264274062, + "grad_norm": 0.2619974911212921, + "learning_rate": 0.0008533620782756724, + "loss": 0.2123, + "num_input_tokens_seen": 86159376, + "step": 39865 + }, + { + "epoch": 6.504078303425775, + "grad_norm": 0.08538439869880676, + "learning_rate": 0.0008533117157394556, + "loss": 0.0734, + "num_input_tokens_seen": 86170320, + "step": 39870 + }, + { + "epoch": 6.50489396411093, + "grad_norm": 0.011978470720350742, + "learning_rate": 0.0008532613460429285, + "loss": 0.0706, + "num_input_tokens_seen": 86181136, + "step": 39875 + }, + { + "epoch": 6.505709624796085, + "grad_norm": 0.03728099912405014, + "learning_rate": 0.0008532109691871122, + "loss": 0.0537, + "num_input_tokens_seen": 86189808, + "step": 39880 + }, + { + "epoch": 6.506525285481239, + "grad_norm": 0.05977741256356239, + "learning_rate": 0.0008531605851730275, + "loss": 0.0707, + "num_input_tokens_seen": 86200752, + "step": 39885 + }, + { + "epoch": 6.507340946166395, + "grad_norm": 0.005612279754132032, + "learning_rate": 0.0008531101940016954, + "loss": 0.0123, + "num_input_tokens_seen": 86211088, + "step": 39890 + }, + { + "epoch": 6.50815660685155, + "grad_norm": 0.012287049554288387, + "learning_rate": 0.0008530597956741374, + "loss": 0.0329, + "num_input_tokens_seen": 86222576, + "step": 39895 + }, + { + "epoch": 6.508972267536705, + "grad_norm": 0.07354990392923355, + "learning_rate": 0.0008530093901913748, + "loss": 0.0792, + "num_input_tokens_seen": 86233008, + "step": 39900 + }, + { + "epoch": 6.50978792822186, + "grad_norm": 0.0029094265773892403, + "learning_rate": 0.000852958977554429, + "loss": 0.0263, + "num_input_tokens_seen": 86245296, + "step": 39905 + }, + { + "epoch": 6.510603588907014, + "grad_norm": 0.019716400653123856, + "learning_rate": 0.0008529085577643217, + "loss": 0.0412, + "num_input_tokens_seen": 86256016, + "step": 39910 + }, + { + "epoch": 6.511419249592169, + "grad_norm": 0.0031939074397087097, + "learning_rate": 0.0008528581308220748, + "loss": 0.0833, + "num_input_tokens_seen": 86267088, + "step": 39915 + }, + { + "epoch": 6.512234910277325, + "grad_norm": 0.22552239894866943, + "learning_rate": 0.0008528076967287103, + "loss": 0.0648, + "num_input_tokens_seen": 86278640, + "step": 39920 + }, + { + "epoch": 6.51305057096248, + "grad_norm": 0.22357866168022156, + "learning_rate": 0.0008527572554852502, + "loss": 0.2424, + "num_input_tokens_seen": 86289616, + "step": 39925 + }, + { + "epoch": 6.513866231647635, + "grad_norm": 0.004282618407160044, + "learning_rate": 0.0008527068070927169, + "loss": 0.1507, + "num_input_tokens_seen": 86300048, + "step": 39930 + }, + { + "epoch": 6.514681892332789, + "grad_norm": 0.16823726892471313, + "learning_rate": 0.0008526563515521327, + "loss": 0.2235, + "num_input_tokens_seen": 86312496, + "step": 39935 + }, + { + "epoch": 6.515497553017944, + "grad_norm": 0.22176630795001984, + "learning_rate": 0.0008526058888645202, + "loss": 0.0328, + "num_input_tokens_seen": 86322768, + "step": 39940 + }, + { + "epoch": 6.5163132137031, + "grad_norm": 0.204921692609787, + "learning_rate": 0.000852555419030902, + "loss": 0.0765, + "num_input_tokens_seen": 86333392, + "step": 39945 + }, + { + "epoch": 6.517128874388255, + "grad_norm": 0.06411628425121307, + "learning_rate": 0.000852504942052301, + "loss": 0.0981, + "num_input_tokens_seen": 86345456, + "step": 39950 + }, + { + "epoch": 6.5179445350734095, + "grad_norm": 0.0035396197345107794, + "learning_rate": 0.0008524544579297402, + "loss": 0.0942, + "num_input_tokens_seen": 86356464, + "step": 39955 + }, + { + "epoch": 6.518760195758564, + "grad_norm": 0.28199630975723267, + "learning_rate": 0.0008524039666642424, + "loss": 0.2473, + "num_input_tokens_seen": 86367120, + "step": 39960 + }, + { + "epoch": 6.519575856443719, + "grad_norm": 0.025316933169960976, + "learning_rate": 0.0008523534682568315, + "loss": 0.0578, + "num_input_tokens_seen": 86377712, + "step": 39965 + }, + { + "epoch": 6.520391517128875, + "grad_norm": 0.04194442555308342, + "learning_rate": 0.0008523029627085306, + "loss": 0.0543, + "num_input_tokens_seen": 86387760, + "step": 39970 + }, + { + "epoch": 6.52120717781403, + "grad_norm": 0.10103817284107208, + "learning_rate": 0.000852252450020363, + "loss": 0.1271, + "num_input_tokens_seen": 86398736, + "step": 39975 + }, + { + "epoch": 6.5220228384991845, + "grad_norm": 0.018434442579746246, + "learning_rate": 0.0008522019301933528, + "loss": 0.0569, + "num_input_tokens_seen": 86409648, + "step": 39980 + }, + { + "epoch": 6.522838499184339, + "grad_norm": 0.22668053209781647, + "learning_rate": 0.0008521514032285236, + "loss": 0.099, + "num_input_tokens_seen": 86420432, + "step": 39985 + }, + { + "epoch": 6.523654159869494, + "grad_norm": 0.07056698203086853, + "learning_rate": 0.0008521008691268994, + "loss": 0.0622, + "num_input_tokens_seen": 86431536, + "step": 39990 + }, + { + "epoch": 6.524469820554649, + "grad_norm": 0.17081086337566376, + "learning_rate": 0.0008520503278895045, + "loss": 0.0583, + "num_input_tokens_seen": 86442800, + "step": 39995 + }, + { + "epoch": 6.525285481239804, + "grad_norm": 0.07947465777397156, + "learning_rate": 0.0008519997795173632, + "loss": 0.041, + "num_input_tokens_seen": 86454224, + "step": 40000 + }, + { + "epoch": 6.5261011419249595, + "grad_norm": 0.027177680283784866, + "learning_rate": 0.0008519492240114996, + "loss": 0.1133, + "num_input_tokens_seen": 86464656, + "step": 40005 + }, + { + "epoch": 6.526916802610114, + "grad_norm": 0.0470040962100029, + "learning_rate": 0.0008518986613729387, + "loss": 0.0338, + "num_input_tokens_seen": 86476336, + "step": 40010 + }, + { + "epoch": 6.527732463295269, + "grad_norm": 0.2636547386646271, + "learning_rate": 0.0008518480916027049, + "loss": 0.0809, + "num_input_tokens_seen": 86486864, + "step": 40015 + }, + { + "epoch": 6.528548123980424, + "grad_norm": 0.047732576727867126, + "learning_rate": 0.0008517975147018233, + "loss": 0.0607, + "num_input_tokens_seen": 86497424, + "step": 40020 + }, + { + "epoch": 6.529363784665579, + "grad_norm": 0.025507677346467972, + "learning_rate": 0.0008517469306713187, + "loss": 0.0947, + "num_input_tokens_seen": 86508080, + "step": 40025 + }, + { + "epoch": 6.5301794453507345, + "grad_norm": 0.026355070993304253, + "learning_rate": 0.0008516963395122163, + "loss": 0.1351, + "num_input_tokens_seen": 86518608, + "step": 40030 + }, + { + "epoch": 6.530995106035889, + "grad_norm": 0.004582028370350599, + "learning_rate": 0.0008516457412255414, + "loss": 0.0083, + "num_input_tokens_seen": 86530160, + "step": 40035 + }, + { + "epoch": 6.531810766721044, + "grad_norm": 0.24956628680229187, + "learning_rate": 0.0008515951358123195, + "loss": 0.0364, + "num_input_tokens_seen": 86540592, + "step": 40040 + }, + { + "epoch": 6.532626427406199, + "grad_norm": 0.013599184341728687, + "learning_rate": 0.0008515445232735761, + "loss": 0.2517, + "num_input_tokens_seen": 86551152, + "step": 40045 + }, + { + "epoch": 6.533442088091354, + "grad_norm": 0.006095684599131346, + "learning_rate": 0.0008514939036103371, + "loss": 0.2383, + "num_input_tokens_seen": 86561456, + "step": 40050 + }, + { + "epoch": 6.5342577487765094, + "grad_norm": 0.031160254031419754, + "learning_rate": 0.0008514432768236282, + "loss": 0.02, + "num_input_tokens_seen": 86573648, + "step": 40055 + }, + { + "epoch": 6.535073409461664, + "grad_norm": 0.12235338240861893, + "learning_rate": 0.0008513926429144754, + "loss": 0.1384, + "num_input_tokens_seen": 86585552, + "step": 40060 + }, + { + "epoch": 6.535889070146819, + "grad_norm": 0.16848307847976685, + "learning_rate": 0.0008513420018839049, + "loss": 0.0781, + "num_input_tokens_seen": 86595632, + "step": 40065 + }, + { + "epoch": 6.536704730831974, + "grad_norm": 0.195616215467453, + "learning_rate": 0.0008512913537329431, + "loss": 0.0789, + "num_input_tokens_seen": 86606512, + "step": 40070 + }, + { + "epoch": 6.537520391517129, + "grad_norm": 0.011958951130509377, + "learning_rate": 0.0008512406984626162, + "loss": 0.0838, + "num_input_tokens_seen": 86616688, + "step": 40075 + }, + { + "epoch": 6.5383360522022835, + "grad_norm": 0.013706839643418789, + "learning_rate": 0.0008511900360739512, + "loss": 0.1612, + "num_input_tokens_seen": 86626736, + "step": 40080 + }, + { + "epoch": 6.539151712887438, + "grad_norm": 0.03659415990114212, + "learning_rate": 0.0008511393665679745, + "loss": 0.0696, + "num_input_tokens_seen": 86638576, + "step": 40085 + }, + { + "epoch": 6.539967373572594, + "grad_norm": 0.06943966448307037, + "learning_rate": 0.000851088689945713, + "loss": 0.0754, + "num_input_tokens_seen": 86649360, + "step": 40090 + }, + { + "epoch": 6.540783034257749, + "grad_norm": 0.02747497893869877, + "learning_rate": 0.0008510380062081939, + "loss": 0.1278, + "num_input_tokens_seen": 86659536, + "step": 40095 + }, + { + "epoch": 6.541598694942904, + "grad_norm": 0.07961980253458023, + "learning_rate": 0.0008509873153564443, + "loss": 0.0776, + "num_input_tokens_seen": 86670512, + "step": 40100 + }, + { + "epoch": 6.5424143556280585, + "grad_norm": 0.0976494625210762, + "learning_rate": 0.0008509366173914914, + "loss": 0.0581, + "num_input_tokens_seen": 86681264, + "step": 40105 + }, + { + "epoch": 6.543230016313213, + "grad_norm": 0.004687887150794268, + "learning_rate": 0.0008508859123143628, + "loss": 0.149, + "num_input_tokens_seen": 86692368, + "step": 40110 + }, + { + "epoch": 6.544045676998369, + "grad_norm": 0.014002878218889236, + "learning_rate": 0.0008508352001260861, + "loss": 0.093, + "num_input_tokens_seen": 86702608, + "step": 40115 + }, + { + "epoch": 6.544861337683524, + "grad_norm": 0.03292868286371231, + "learning_rate": 0.000850784480827689, + "loss": 0.0346, + "num_input_tokens_seen": 86713776, + "step": 40120 + }, + { + "epoch": 6.545676998368679, + "grad_norm": 0.05072006955742836, + "learning_rate": 0.0008507337544201994, + "loss": 0.1226, + "num_input_tokens_seen": 86724624, + "step": 40125 + }, + { + "epoch": 6.5464926590538335, + "grad_norm": 0.017592573538422585, + "learning_rate": 0.0008506830209046453, + "loss": 0.0755, + "num_input_tokens_seen": 86736048, + "step": 40130 + }, + { + "epoch": 6.547308319738988, + "grad_norm": 0.020308518782258034, + "learning_rate": 0.000850632280282055, + "loss": 0.0187, + "num_input_tokens_seen": 86747216, + "step": 40135 + }, + { + "epoch": 6.548123980424144, + "grad_norm": 0.09209541976451874, + "learning_rate": 0.0008505815325534565, + "loss": 0.0561, + "num_input_tokens_seen": 86757296, + "step": 40140 + }, + { + "epoch": 6.548939641109299, + "grad_norm": 0.06564757227897644, + "learning_rate": 0.0008505307777198788, + "loss": 0.0243, + "num_input_tokens_seen": 86769072, + "step": 40145 + }, + { + "epoch": 6.549755301794454, + "grad_norm": 0.052758727222681046, + "learning_rate": 0.0008504800157823501, + "loss": 0.0323, + "num_input_tokens_seen": 86780816, + "step": 40150 + }, + { + "epoch": 6.5505709624796085, + "grad_norm": 0.023643581196665764, + "learning_rate": 0.000850429246741899, + "loss": 0.0383, + "num_input_tokens_seen": 86791280, + "step": 40155 + }, + { + "epoch": 6.551386623164763, + "grad_norm": 0.2505358159542084, + "learning_rate": 0.0008503784705995549, + "loss": 0.0601, + "num_input_tokens_seen": 86800912, + "step": 40160 + }, + { + "epoch": 6.552202283849918, + "grad_norm": 0.007761516608297825, + "learning_rate": 0.0008503276873563465, + "loss": 0.0122, + "num_input_tokens_seen": 86812592, + "step": 40165 + }, + { + "epoch": 6.553017944535073, + "grad_norm": 0.029418349266052246, + "learning_rate": 0.0008502768970133032, + "loss": 0.17, + "num_input_tokens_seen": 86824048, + "step": 40170 + }, + { + "epoch": 6.553833605220229, + "grad_norm": 0.22073465585708618, + "learning_rate": 0.0008502260995714543, + "loss": 0.0459, + "num_input_tokens_seen": 86834096, + "step": 40175 + }, + { + "epoch": 6.554649265905383, + "grad_norm": 0.003069857368245721, + "learning_rate": 0.0008501752950318292, + "loss": 0.0489, + "num_input_tokens_seen": 86844752, + "step": 40180 + }, + { + "epoch": 6.555464926590538, + "grad_norm": 0.008434565737843513, + "learning_rate": 0.0008501244833954573, + "loss": 0.1582, + "num_input_tokens_seen": 86855056, + "step": 40185 + }, + { + "epoch": 6.556280587275693, + "grad_norm": 0.32387346029281616, + "learning_rate": 0.0008500736646633686, + "loss": 0.1796, + "num_input_tokens_seen": 86866128, + "step": 40190 + }, + { + "epoch": 6.557096247960848, + "grad_norm": 0.2440408319234848, + "learning_rate": 0.0008500228388365933, + "loss": 0.1675, + "num_input_tokens_seen": 86877328, + "step": 40195 + }, + { + "epoch": 6.557911908646004, + "grad_norm": 0.0660281777381897, + "learning_rate": 0.0008499720059161608, + "loss": 0.0356, + "num_input_tokens_seen": 86888240, + "step": 40200 + }, + { + "epoch": 6.558727569331158, + "grad_norm": 0.013883471488952637, + "learning_rate": 0.0008499211659031018, + "loss": 0.08, + "num_input_tokens_seen": 86899664, + "step": 40205 + }, + { + "epoch": 6.559543230016313, + "grad_norm": 0.16646042466163635, + "learning_rate": 0.0008498703187984465, + "loss": 0.1258, + "num_input_tokens_seen": 86910448, + "step": 40210 + }, + { + "epoch": 6.560358890701468, + "grad_norm": 0.01823389157652855, + "learning_rate": 0.0008498194646032253, + "loss": 0.0364, + "num_input_tokens_seen": 86921168, + "step": 40215 + }, + { + "epoch": 6.561174551386623, + "grad_norm": 0.010996226221323013, + "learning_rate": 0.0008497686033184687, + "loss": 0.0631, + "num_input_tokens_seen": 86931632, + "step": 40220 + }, + { + "epoch": 6.561990212071779, + "grad_norm": 0.021542511880397797, + "learning_rate": 0.0008497177349452077, + "loss": 0.1433, + "num_input_tokens_seen": 86942608, + "step": 40225 + }, + { + "epoch": 6.562805872756933, + "grad_norm": 0.04288684204220772, + "learning_rate": 0.0008496668594844733, + "loss": 0.0376, + "num_input_tokens_seen": 86954352, + "step": 40230 + }, + { + "epoch": 6.563621533442088, + "grad_norm": 0.1718064695596695, + "learning_rate": 0.0008496159769372964, + "loss": 0.0615, + "num_input_tokens_seen": 86964816, + "step": 40235 + }, + { + "epoch": 6.564437194127243, + "grad_norm": 0.15476419031620026, + "learning_rate": 0.0008495650873047081, + "loss": 0.0596, + "num_input_tokens_seen": 86975760, + "step": 40240 + }, + { + "epoch": 6.565252854812398, + "grad_norm": 0.29840338230133057, + "learning_rate": 0.0008495141905877398, + "loss": 0.0854, + "num_input_tokens_seen": 86986672, + "step": 40245 + }, + { + "epoch": 6.566068515497553, + "grad_norm": 0.318273663520813, + "learning_rate": 0.0008494632867874232, + "loss": 0.1865, + "num_input_tokens_seen": 86996560, + "step": 40250 + }, + { + "epoch": 6.566884176182708, + "grad_norm": 0.15846861898899078, + "learning_rate": 0.0008494123759047897, + "loss": 0.0864, + "num_input_tokens_seen": 87008080, + "step": 40255 + }, + { + "epoch": 6.567699836867863, + "grad_norm": 0.013275109231472015, + "learning_rate": 0.0008493614579408712, + "loss": 0.0272, + "num_input_tokens_seen": 87019920, + "step": 40260 + }, + { + "epoch": 6.568515497553018, + "grad_norm": 0.6214556097984314, + "learning_rate": 0.0008493105328966995, + "loss": 0.1214, + "num_input_tokens_seen": 87031056, + "step": 40265 + }, + { + "epoch": 6.569331158238173, + "grad_norm": 0.2053581178188324, + "learning_rate": 0.0008492596007733066, + "loss": 0.0857, + "num_input_tokens_seen": 87041872, + "step": 40270 + }, + { + "epoch": 6.570146818923328, + "grad_norm": 0.22130665183067322, + "learning_rate": 0.0008492086615717251, + "loss": 0.049, + "num_input_tokens_seen": 87051888, + "step": 40275 + }, + { + "epoch": 6.5709624796084825, + "grad_norm": 0.010671776719391346, + "learning_rate": 0.0008491577152929867, + "loss": 0.1461, + "num_input_tokens_seen": 87061744, + "step": 40280 + }, + { + "epoch": 6.571778140293638, + "grad_norm": 0.007222020998597145, + "learning_rate": 0.0008491067619381247, + "loss": 0.1972, + "num_input_tokens_seen": 87073008, + "step": 40285 + }, + { + "epoch": 6.572593800978793, + "grad_norm": 0.22576162219047546, + "learning_rate": 0.0008490558015081711, + "loss": 0.1103, + "num_input_tokens_seen": 87084336, + "step": 40290 + }, + { + "epoch": 6.573409461663948, + "grad_norm": 0.24591238796710968, + "learning_rate": 0.0008490048340041587, + "loss": 0.1464, + "num_input_tokens_seen": 87095088, + "step": 40295 + }, + { + "epoch": 6.574225122349103, + "grad_norm": 0.002666117623448372, + "learning_rate": 0.0008489538594271209, + "loss": 0.1626, + "num_input_tokens_seen": 87105744, + "step": 40300 + }, + { + "epoch": 6.575040783034257, + "grad_norm": 0.004681061487644911, + "learning_rate": 0.0008489028777780901, + "loss": 0.0815, + "num_input_tokens_seen": 87115984, + "step": 40305 + }, + { + "epoch": 6.575856443719413, + "grad_norm": 0.1317053884267807, + "learning_rate": 0.0008488518890581002, + "loss": 0.0587, + "num_input_tokens_seen": 87127344, + "step": 40310 + }, + { + "epoch": 6.576672104404568, + "grad_norm": 0.44375360012054443, + "learning_rate": 0.0008488008932681841, + "loss": 0.1411, + "num_input_tokens_seen": 87138512, + "step": 40315 + }, + { + "epoch": 6.577487765089723, + "grad_norm": 0.00884460099041462, + "learning_rate": 0.0008487498904093753, + "loss": 0.1103, + "num_input_tokens_seen": 87149776, + "step": 40320 + }, + { + "epoch": 6.578303425774878, + "grad_norm": 0.054936353117227554, + "learning_rate": 0.0008486988804827077, + "loss": 0.0426, + "num_input_tokens_seen": 87160848, + "step": 40325 + }, + { + "epoch": 6.579119086460032, + "grad_norm": 0.3017171621322632, + "learning_rate": 0.0008486478634892149, + "loss": 0.0796, + "num_input_tokens_seen": 87171984, + "step": 40330 + }, + { + "epoch": 6.579934747145187, + "grad_norm": 0.1470268815755844, + "learning_rate": 0.0008485968394299308, + "loss": 0.1392, + "num_input_tokens_seen": 87182736, + "step": 40335 + }, + { + "epoch": 6.580750407830343, + "grad_norm": 0.03465595841407776, + "learning_rate": 0.0008485458083058896, + "loss": 0.0727, + "num_input_tokens_seen": 87193872, + "step": 40340 + }, + { + "epoch": 6.581566068515498, + "grad_norm": 0.09754090011119843, + "learning_rate": 0.0008484947701181254, + "loss": 0.1589, + "num_input_tokens_seen": 87204912, + "step": 40345 + }, + { + "epoch": 6.582381729200653, + "grad_norm": 0.029379529878497124, + "learning_rate": 0.0008484437248676726, + "loss": 0.0285, + "num_input_tokens_seen": 87215120, + "step": 40350 + }, + { + "epoch": 6.583197389885807, + "grad_norm": 0.02292765863239765, + "learning_rate": 0.0008483926725555655, + "loss": 0.0411, + "num_input_tokens_seen": 87227024, + "step": 40355 + }, + { + "epoch": 6.584013050570962, + "grad_norm": 0.01272663101553917, + "learning_rate": 0.0008483416131828392, + "loss": 0.0231, + "num_input_tokens_seen": 87238352, + "step": 40360 + }, + { + "epoch": 6.584828711256117, + "grad_norm": 0.29345691204071045, + "learning_rate": 0.000848290546750528, + "loss": 0.1485, + "num_input_tokens_seen": 87248656, + "step": 40365 + }, + { + "epoch": 6.585644371941273, + "grad_norm": 0.017028363421559334, + "learning_rate": 0.0008482394732596672, + "loss": 0.0282, + "num_input_tokens_seen": 87259696, + "step": 40370 + }, + { + "epoch": 6.5864600326264275, + "grad_norm": 0.06555546075105667, + "learning_rate": 0.0008481883927112917, + "loss": 0.0583, + "num_input_tokens_seen": 87269776, + "step": 40375 + }, + { + "epoch": 6.587275693311582, + "grad_norm": 0.03757341951131821, + "learning_rate": 0.0008481373051064365, + "loss": 0.0747, + "num_input_tokens_seen": 87280112, + "step": 40380 + }, + { + "epoch": 6.588091353996737, + "grad_norm": 0.11471926420927048, + "learning_rate": 0.0008480862104461374, + "loss": 0.0875, + "num_input_tokens_seen": 87290256, + "step": 40385 + }, + { + "epoch": 6.588907014681892, + "grad_norm": 0.022442886605858803, + "learning_rate": 0.0008480351087314295, + "loss": 0.0722, + "num_input_tokens_seen": 87301488, + "step": 40390 + }, + { + "epoch": 6.589722675367048, + "grad_norm": 0.02271287702023983, + "learning_rate": 0.0008479839999633487, + "loss": 0.069, + "num_input_tokens_seen": 87311088, + "step": 40395 + }, + { + "epoch": 6.5905383360522025, + "grad_norm": 0.16364285349845886, + "learning_rate": 0.0008479328841429306, + "loss": 0.1194, + "num_input_tokens_seen": 87321776, + "step": 40400 + }, + { + "epoch": 6.591353996737357, + "grad_norm": 0.17481482028961182, + "learning_rate": 0.0008478817612712113, + "loss": 0.0324, + "num_input_tokens_seen": 87333328, + "step": 40405 + }, + { + "epoch": 6.592169657422512, + "grad_norm": 0.0992930606007576, + "learning_rate": 0.0008478306313492267, + "loss": 0.0716, + "num_input_tokens_seen": 87344112, + "step": 40410 + }, + { + "epoch": 6.592985318107667, + "grad_norm": 0.015793804079294205, + "learning_rate": 0.0008477794943780132, + "loss": 0.0133, + "num_input_tokens_seen": 87355888, + "step": 40415 + }, + { + "epoch": 6.593800978792823, + "grad_norm": 0.24398945271968842, + "learning_rate": 0.0008477283503586072, + "loss": 0.0351, + "num_input_tokens_seen": 87367088, + "step": 40420 + }, + { + "epoch": 6.5946166394779775, + "grad_norm": 0.0020906298886984587, + "learning_rate": 0.0008476771992920449, + "loss": 0.0301, + "num_input_tokens_seen": 87377904, + "step": 40425 + }, + { + "epoch": 6.595432300163132, + "grad_norm": 0.2970251142978668, + "learning_rate": 0.0008476260411793631, + "loss": 0.045, + "num_input_tokens_seen": 87387920, + "step": 40430 + }, + { + "epoch": 6.596247960848287, + "grad_norm": 0.07806608080863953, + "learning_rate": 0.0008475748760215984, + "loss": 0.1143, + "num_input_tokens_seen": 87397392, + "step": 40435 + }, + { + "epoch": 6.597063621533442, + "grad_norm": 0.08590800315141678, + "learning_rate": 0.0008475237038197882, + "loss": 0.4011, + "num_input_tokens_seen": 87407952, + "step": 40440 + }, + { + "epoch": 6.597879282218597, + "grad_norm": 0.014154416508972645, + "learning_rate": 0.0008474725245749691, + "loss": 0.0905, + "num_input_tokens_seen": 87418576, + "step": 40445 + }, + { + "epoch": 6.598694942903752, + "grad_norm": 0.2728196978569031, + "learning_rate": 0.0008474213382881786, + "loss": 0.0507, + "num_input_tokens_seen": 87429456, + "step": 40450 + }, + { + "epoch": 6.599510603588907, + "grad_norm": 0.04241441190242767, + "learning_rate": 0.0008473701449604539, + "loss": 0.0273, + "num_input_tokens_seen": 87439248, + "step": 40455 + }, + { + "epoch": 6.600326264274062, + "grad_norm": 0.15882225334644318, + "learning_rate": 0.0008473189445928325, + "loss": 0.2533, + "num_input_tokens_seen": 87449936, + "step": 40460 + }, + { + "epoch": 6.601141924959217, + "grad_norm": 0.36263370513916016, + "learning_rate": 0.0008472677371863521, + "loss": 0.2535, + "num_input_tokens_seen": 87460432, + "step": 40465 + }, + { + "epoch": 6.601957585644372, + "grad_norm": 0.09968896955251694, + "learning_rate": 0.0008472165227420505, + "loss": 0.251, + "num_input_tokens_seen": 87470288, + "step": 40470 + }, + { + "epoch": 6.602773246329527, + "grad_norm": 0.24596227705478668, + "learning_rate": 0.0008471653012609655, + "loss": 0.0642, + "num_input_tokens_seen": 87480208, + "step": 40475 + }, + { + "epoch": 6.603588907014682, + "grad_norm": 0.010824929922819138, + "learning_rate": 0.0008471140727441353, + "loss": 0.0965, + "num_input_tokens_seen": 87490768, + "step": 40480 + }, + { + "epoch": 6.604404567699837, + "grad_norm": 0.012988324277102947, + "learning_rate": 0.0008470628371925981, + "loss": 0.0368, + "num_input_tokens_seen": 87501040, + "step": 40485 + }, + { + "epoch": 6.605220228384992, + "grad_norm": 0.11417032778263092, + "learning_rate": 0.0008470115946073922, + "loss": 0.1614, + "num_input_tokens_seen": 87511824, + "step": 40490 + }, + { + "epoch": 6.606035889070147, + "grad_norm": 0.09179277718067169, + "learning_rate": 0.0008469603449895562, + "loss": 0.1003, + "num_input_tokens_seen": 87522928, + "step": 40495 + }, + { + "epoch": 6.6068515497553015, + "grad_norm": 0.03256848081946373, + "learning_rate": 0.0008469090883401286, + "loss": 0.0587, + "num_input_tokens_seen": 87533200, + "step": 40500 + }, + { + "epoch": 6.607667210440457, + "grad_norm": 0.09992846846580505, + "learning_rate": 0.0008468578246601482, + "loss": 0.1608, + "num_input_tokens_seen": 87541808, + "step": 40505 + }, + { + "epoch": 6.608482871125612, + "grad_norm": 0.19650889933109283, + "learning_rate": 0.000846806553950654, + "loss": 0.0926, + "num_input_tokens_seen": 87552656, + "step": 40510 + }, + { + "epoch": 6.609298531810767, + "grad_norm": 0.33135658502578735, + "learning_rate": 0.0008467552762126851, + "loss": 0.1255, + "num_input_tokens_seen": 87564048, + "step": 40515 + }, + { + "epoch": 6.610114192495922, + "grad_norm": 0.01710195280611515, + "learning_rate": 0.0008467039914472805, + "loss": 0.0421, + "num_input_tokens_seen": 87574832, + "step": 40520 + }, + { + "epoch": 6.6109298531810765, + "grad_norm": 0.024433651939034462, + "learning_rate": 0.0008466526996554797, + "loss": 0.078, + "num_input_tokens_seen": 87585040, + "step": 40525 + }, + { + "epoch": 6.611745513866231, + "grad_norm": 0.1268104910850525, + "learning_rate": 0.0008466014008383224, + "loss": 0.0534, + "num_input_tokens_seen": 87595312, + "step": 40530 + }, + { + "epoch": 6.612561174551386, + "grad_norm": 0.02771225944161415, + "learning_rate": 0.0008465500949968479, + "loss": 0.0801, + "num_input_tokens_seen": 87606224, + "step": 40535 + }, + { + "epoch": 6.613376835236542, + "grad_norm": 0.01759571023285389, + "learning_rate": 0.000846498782132096, + "loss": 0.0336, + "num_input_tokens_seen": 87617520, + "step": 40540 + }, + { + "epoch": 6.614192495921697, + "grad_norm": 0.019269373267889023, + "learning_rate": 0.0008464474622451067, + "loss": 0.0567, + "num_input_tokens_seen": 87629008, + "step": 40545 + }, + { + "epoch": 6.6150081566068515, + "grad_norm": 0.017509333789348602, + "learning_rate": 0.0008463961353369202, + "loss": 0.0433, + "num_input_tokens_seen": 87639824, + "step": 40550 + }, + { + "epoch": 6.615823817292006, + "grad_norm": 0.06460079550743103, + "learning_rate": 0.0008463448014085765, + "loss": 0.0763, + "num_input_tokens_seen": 87649040, + "step": 40555 + }, + { + "epoch": 6.616639477977161, + "grad_norm": 0.0920075848698616, + "learning_rate": 0.000846293460461116, + "loss": 0.185, + "num_input_tokens_seen": 87659536, + "step": 40560 + }, + { + "epoch": 6.617455138662317, + "grad_norm": 0.1951504349708557, + "learning_rate": 0.0008462421124955792, + "loss": 0.2377, + "num_input_tokens_seen": 87670256, + "step": 40565 + }, + { + "epoch": 6.618270799347472, + "grad_norm": 0.016024351119995117, + "learning_rate": 0.0008461907575130069, + "loss": 0.1435, + "num_input_tokens_seen": 87681008, + "step": 40570 + }, + { + "epoch": 6.6190864600326265, + "grad_norm": 0.104428231716156, + "learning_rate": 0.0008461393955144397, + "loss": 0.1053, + "num_input_tokens_seen": 87692400, + "step": 40575 + }, + { + "epoch": 6.619902120717781, + "grad_norm": 0.02700626105070114, + "learning_rate": 0.0008460880265009185, + "loss": 0.1724, + "num_input_tokens_seen": 87703888, + "step": 40580 + }, + { + "epoch": 6.620717781402936, + "grad_norm": 0.08827092498540878, + "learning_rate": 0.0008460366504734843, + "loss": 0.0314, + "num_input_tokens_seen": 87714096, + "step": 40585 + }, + { + "epoch": 6.621533442088092, + "grad_norm": 0.03388110548257828, + "learning_rate": 0.0008459852674331785, + "loss": 0.0655, + "num_input_tokens_seen": 87723760, + "step": 40590 + }, + { + "epoch": 6.622349102773247, + "grad_norm": 0.27784040570259094, + "learning_rate": 0.0008459338773810424, + "loss": 0.2896, + "num_input_tokens_seen": 87734160, + "step": 40595 + }, + { + "epoch": 6.623164763458401, + "grad_norm": 0.018514566123485565, + "learning_rate": 0.0008458824803181174, + "loss": 0.0615, + "num_input_tokens_seen": 87744496, + "step": 40600 + }, + { + "epoch": 6.623980424143556, + "grad_norm": 0.19091467559337616, + "learning_rate": 0.0008458310762454451, + "loss": 0.153, + "num_input_tokens_seen": 87755376, + "step": 40605 + }, + { + "epoch": 6.624796084828711, + "grad_norm": 0.01676258258521557, + "learning_rate": 0.0008457796651640672, + "loss": 0.0911, + "num_input_tokens_seen": 87765552, + "step": 40610 + }, + { + "epoch": 6.625611745513866, + "grad_norm": 0.14877432584762573, + "learning_rate": 0.0008457282470750259, + "loss": 0.0568, + "num_input_tokens_seen": 87777552, + "step": 40615 + }, + { + "epoch": 6.626427406199021, + "grad_norm": 0.0045336890034377575, + "learning_rate": 0.0008456768219793631, + "loss": 0.1728, + "num_input_tokens_seen": 87788176, + "step": 40620 + }, + { + "epoch": 6.627243066884176, + "grad_norm": 0.047274842858314514, + "learning_rate": 0.000845625389878121, + "loss": 0.0987, + "num_input_tokens_seen": 87800240, + "step": 40625 + }, + { + "epoch": 6.628058727569331, + "grad_norm": 0.26113563776016235, + "learning_rate": 0.0008455739507723418, + "loss": 0.0529, + "num_input_tokens_seen": 87811248, + "step": 40630 + }, + { + "epoch": 6.628874388254486, + "grad_norm": 0.046045806258916855, + "learning_rate": 0.0008455225046630681, + "loss": 0.1242, + "num_input_tokens_seen": 87823280, + "step": 40635 + }, + { + "epoch": 6.629690048939641, + "grad_norm": 0.007317631971091032, + "learning_rate": 0.0008454710515513426, + "loss": 0.0486, + "num_input_tokens_seen": 87833456, + "step": 40640 + }, + { + "epoch": 6.630505709624796, + "grad_norm": 0.033978912979364395, + "learning_rate": 0.0008454195914382079, + "loss": 0.1417, + "num_input_tokens_seen": 87844016, + "step": 40645 + }, + { + "epoch": 6.631321370309951, + "grad_norm": 0.01450091227889061, + "learning_rate": 0.0008453681243247071, + "loss": 0.0215, + "num_input_tokens_seen": 87855216, + "step": 40650 + }, + { + "epoch": 6.632137030995106, + "grad_norm": 0.10272349417209625, + "learning_rate": 0.000845316650211883, + "loss": 0.1389, + "num_input_tokens_seen": 87865392, + "step": 40655 + }, + { + "epoch": 6.632952691680261, + "grad_norm": 0.26933956146240234, + "learning_rate": 0.0008452651691007789, + "loss": 0.0634, + "num_input_tokens_seen": 87876496, + "step": 40660 + }, + { + "epoch": 6.633768352365416, + "grad_norm": 0.2286217361688614, + "learning_rate": 0.0008452136809924384, + "loss": 0.1909, + "num_input_tokens_seen": 87888112, + "step": 40665 + }, + { + "epoch": 6.634584013050571, + "grad_norm": 0.026156388223171234, + "learning_rate": 0.0008451621858879043, + "loss": 0.0643, + "num_input_tokens_seen": 87898160, + "step": 40670 + }, + { + "epoch": 6.635399673735726, + "grad_norm": 0.010627840645611286, + "learning_rate": 0.000845110683788221, + "loss": 0.2, + "num_input_tokens_seen": 87909200, + "step": 40675 + }, + { + "epoch": 6.636215334420881, + "grad_norm": 0.003927143756300211, + "learning_rate": 0.0008450591746944319, + "loss": 0.0798, + "num_input_tokens_seen": 87919856, + "step": 40680 + }, + { + "epoch": 6.637030995106036, + "grad_norm": 0.25393131375312805, + "learning_rate": 0.0008450076586075805, + "loss": 0.2177, + "num_input_tokens_seen": 87931088, + "step": 40685 + }, + { + "epoch": 6.637846655791191, + "grad_norm": 0.09210921078920364, + "learning_rate": 0.0008449561355287116, + "loss": 0.1009, + "num_input_tokens_seen": 87942096, + "step": 40690 + }, + { + "epoch": 6.638662316476346, + "grad_norm": 0.06515488773584366, + "learning_rate": 0.000844904605458869, + "loss": 0.0668, + "num_input_tokens_seen": 87952784, + "step": 40695 + }, + { + "epoch": 6.6394779771615005, + "grad_norm": 0.1152672991156578, + "learning_rate": 0.0008448530683990968, + "loss": 0.0984, + "num_input_tokens_seen": 87962096, + "step": 40700 + }, + { + "epoch": 6.640293637846656, + "grad_norm": 0.011901522055268288, + "learning_rate": 0.0008448015243504398, + "loss": 0.0648, + "num_input_tokens_seen": 87972144, + "step": 40705 + }, + { + "epoch": 6.641109298531811, + "grad_norm": 0.010754693299531937, + "learning_rate": 0.0008447499733139426, + "loss": 0.0753, + "num_input_tokens_seen": 87983920, + "step": 40710 + }, + { + "epoch": 6.641924959216966, + "grad_norm": 0.526595950126648, + "learning_rate": 0.0008446984152906496, + "loss": 0.1452, + "num_input_tokens_seen": 87993360, + "step": 40715 + }, + { + "epoch": 6.642740619902121, + "grad_norm": 0.0625261515378952, + "learning_rate": 0.0008446468502816061, + "loss": 0.2296, + "num_input_tokens_seen": 88002320, + "step": 40720 + }, + { + "epoch": 6.643556280587275, + "grad_norm": 0.04473881796002388, + "learning_rate": 0.000844595278287857, + "loss": 0.0903, + "num_input_tokens_seen": 88013200, + "step": 40725 + }, + { + "epoch": 6.64437194127243, + "grad_norm": 0.21388088166713715, + "learning_rate": 0.0008445436993104473, + "loss": 0.0913, + "num_input_tokens_seen": 88024368, + "step": 40730 + }, + { + "epoch": 6.645187601957586, + "grad_norm": 0.009727729484438896, + "learning_rate": 0.0008444921133504225, + "loss": 0.0832, + "num_input_tokens_seen": 88035984, + "step": 40735 + }, + { + "epoch": 6.646003262642741, + "grad_norm": 0.11908382922410965, + "learning_rate": 0.0008444405204088281, + "loss": 0.0735, + "num_input_tokens_seen": 88047344, + "step": 40740 + }, + { + "epoch": 6.646818923327896, + "grad_norm": 0.010134859941899776, + "learning_rate": 0.0008443889204867095, + "loss": 0.0272, + "num_input_tokens_seen": 88058608, + "step": 40745 + }, + { + "epoch": 6.64763458401305, + "grad_norm": 0.03795737773180008, + "learning_rate": 0.0008443373135851125, + "loss": 0.133, + "num_input_tokens_seen": 88069648, + "step": 40750 + }, + { + "epoch": 6.648450244698205, + "grad_norm": 0.02392597869038582, + "learning_rate": 0.0008442856997050832, + "loss": 0.0323, + "num_input_tokens_seen": 88080368, + "step": 40755 + }, + { + "epoch": 6.649265905383361, + "grad_norm": 0.0146209467202425, + "learning_rate": 0.0008442340788476672, + "loss": 0.0301, + "num_input_tokens_seen": 88090832, + "step": 40760 + }, + { + "epoch": 6.650081566068516, + "grad_norm": 0.042282216250896454, + "learning_rate": 0.0008441824510139111, + "loss": 0.1166, + "num_input_tokens_seen": 88101456, + "step": 40765 + }, + { + "epoch": 6.650897226753671, + "grad_norm": 0.1621234118938446, + "learning_rate": 0.0008441308162048609, + "loss": 0.0638, + "num_input_tokens_seen": 88112624, + "step": 40770 + }, + { + "epoch": 6.651712887438825, + "grad_norm": 0.01184393372386694, + "learning_rate": 0.0008440791744215632, + "loss": 0.03, + "num_input_tokens_seen": 88124080, + "step": 40775 + }, + { + "epoch": 6.65252854812398, + "grad_norm": 0.10488130897283554, + "learning_rate": 0.0008440275256650644, + "loss": 0.1571, + "num_input_tokens_seen": 88135088, + "step": 40780 + }, + { + "epoch": 6.653344208809135, + "grad_norm": 0.009288913570344448, + "learning_rate": 0.0008439758699364115, + "loss": 0.0498, + "num_input_tokens_seen": 88145296, + "step": 40785 + }, + { + "epoch": 6.654159869494291, + "grad_norm": 0.17210085690021515, + "learning_rate": 0.0008439242072366511, + "loss": 0.1509, + "num_input_tokens_seen": 88154864, + "step": 40790 + }, + { + "epoch": 6.6549755301794455, + "grad_norm": 0.03770684078335762, + "learning_rate": 0.0008438725375668305, + "loss": 0.2683, + "num_input_tokens_seen": 88164528, + "step": 40795 + }, + { + "epoch": 6.6557911908646, + "grad_norm": 0.030847519636154175, + "learning_rate": 0.0008438208609279967, + "loss": 0.0984, + "num_input_tokens_seen": 88177136, + "step": 40800 + }, + { + "epoch": 6.656606851549755, + "grad_norm": 0.02910693734884262, + "learning_rate": 0.0008437691773211969, + "loss": 0.0326, + "num_input_tokens_seen": 88187824, + "step": 40805 + }, + { + "epoch": 6.65742251223491, + "grad_norm": 0.013066194951534271, + "learning_rate": 0.0008437174867474786, + "loss": 0.1456, + "num_input_tokens_seen": 88198288, + "step": 40810 + }, + { + "epoch": 6.658238172920065, + "grad_norm": 0.005027337931096554, + "learning_rate": 0.0008436657892078895, + "loss": 0.0227, + "num_input_tokens_seen": 88208720, + "step": 40815 + }, + { + "epoch": 6.6590538336052205, + "grad_norm": 0.1727529913187027, + "learning_rate": 0.0008436140847034772, + "loss": 0.1289, + "num_input_tokens_seen": 88219344, + "step": 40820 + }, + { + "epoch": 6.659869494290375, + "grad_norm": 0.13390670716762543, + "learning_rate": 0.0008435623732352895, + "loss": 0.1687, + "num_input_tokens_seen": 88229392, + "step": 40825 + }, + { + "epoch": 6.66068515497553, + "grad_norm": 0.09145081788301468, + "learning_rate": 0.0008435106548043745, + "loss": 0.0726, + "num_input_tokens_seen": 88240624, + "step": 40830 + }, + { + "epoch": 6.661500815660685, + "grad_norm": 0.18923735618591309, + "learning_rate": 0.0008434589294117802, + "loss": 0.0453, + "num_input_tokens_seen": 88250640, + "step": 40835 + }, + { + "epoch": 6.66231647634584, + "grad_norm": 0.12857620418071747, + "learning_rate": 0.0008434071970585551, + "loss": 0.103, + "num_input_tokens_seen": 88262224, + "step": 40840 + }, + { + "epoch": 6.6631321370309955, + "grad_norm": 0.04811937361955643, + "learning_rate": 0.0008433554577457475, + "loss": 0.0248, + "num_input_tokens_seen": 88271856, + "step": 40845 + }, + { + "epoch": 6.66394779771615, + "grad_norm": 0.0026477025821805, + "learning_rate": 0.000843303711474406, + "loss": 0.155, + "num_input_tokens_seen": 88283344, + "step": 40850 + }, + { + "epoch": 6.664763458401305, + "grad_norm": 0.04366447031497955, + "learning_rate": 0.0008432519582455792, + "loss": 0.0734, + "num_input_tokens_seen": 88294064, + "step": 40855 + }, + { + "epoch": 6.66557911908646, + "grad_norm": 0.09165249764919281, + "learning_rate": 0.0008432001980603161, + "loss": 0.2132, + "num_input_tokens_seen": 88304656, + "step": 40860 + }, + { + "epoch": 6.666394779771615, + "grad_norm": 0.09211524575948715, + "learning_rate": 0.0008431484309196656, + "loss": 0.1458, + "num_input_tokens_seen": 88314192, + "step": 40865 + }, + { + "epoch": 6.6672104404567705, + "grad_norm": 0.019264977425336838, + "learning_rate": 0.0008430966568246768, + "loss": 0.0255, + "num_input_tokens_seen": 88324784, + "step": 40870 + }, + { + "epoch": 6.668026101141925, + "grad_norm": 0.03427863493561745, + "learning_rate": 0.0008430448757763989, + "loss": 0.0462, + "num_input_tokens_seen": 88335280, + "step": 40875 + }, + { + "epoch": 6.66884176182708, + "grad_norm": 0.34995800256729126, + "learning_rate": 0.0008429930877758814, + "loss": 0.1108, + "num_input_tokens_seen": 88346096, + "step": 40880 + }, + { + "epoch": 6.669657422512235, + "grad_norm": 0.012390978634357452, + "learning_rate": 0.000842941292824174, + "loss": 0.0087, + "num_input_tokens_seen": 88357968, + "step": 40885 + }, + { + "epoch": 6.67047308319739, + "grad_norm": 0.25816184282302856, + "learning_rate": 0.0008428894909223261, + "loss": 0.0435, + "num_input_tokens_seen": 88368816, + "step": 40890 + }, + { + "epoch": 6.671288743882545, + "grad_norm": 0.027464818209409714, + "learning_rate": 0.0008428376820713879, + "loss": 0.1062, + "num_input_tokens_seen": 88378256, + "step": 40895 + }, + { + "epoch": 6.672104404567699, + "grad_norm": 0.009099733084440231, + "learning_rate": 0.000842785866272409, + "loss": 0.0545, + "num_input_tokens_seen": 88387888, + "step": 40900 + }, + { + "epoch": 6.672920065252855, + "grad_norm": 0.5091248750686646, + "learning_rate": 0.0008427340435264397, + "loss": 0.0938, + "num_input_tokens_seen": 88397360, + "step": 40905 + }, + { + "epoch": 6.67373572593801, + "grad_norm": 0.021805984899401665, + "learning_rate": 0.0008426822138345302, + "loss": 0.2499, + "num_input_tokens_seen": 88408880, + "step": 40910 + }, + { + "epoch": 6.674551386623165, + "grad_norm": 0.055580854415893555, + "learning_rate": 0.0008426303771977311, + "loss": 0.0147, + "num_input_tokens_seen": 88420784, + "step": 40915 + }, + { + "epoch": 6.6753670473083195, + "grad_norm": 0.09892192482948303, + "learning_rate": 0.0008425785336170925, + "loss": 0.1976, + "num_input_tokens_seen": 88432432, + "step": 40920 + }, + { + "epoch": 6.676182707993474, + "grad_norm": 0.08244457095861435, + "learning_rate": 0.0008425266830936654, + "loss": 0.149, + "num_input_tokens_seen": 88442960, + "step": 40925 + }, + { + "epoch": 6.67699836867863, + "grad_norm": 0.007770029827952385, + "learning_rate": 0.0008424748256285005, + "loss": 0.1115, + "num_input_tokens_seen": 88453936, + "step": 40930 + }, + { + "epoch": 6.677814029363785, + "grad_norm": 0.0658392608165741, + "learning_rate": 0.0008424229612226488, + "loss": 0.0897, + "num_input_tokens_seen": 88464304, + "step": 40935 + }, + { + "epoch": 6.67862969004894, + "grad_norm": 0.17187225818634033, + "learning_rate": 0.0008423710898771614, + "loss": 0.0562, + "num_input_tokens_seen": 88475280, + "step": 40940 + }, + { + "epoch": 6.6794453507340945, + "grad_norm": 0.3012283146381378, + "learning_rate": 0.0008423192115930897, + "loss": 0.1038, + "num_input_tokens_seen": 88485424, + "step": 40945 + }, + { + "epoch": 6.680261011419249, + "grad_norm": 0.08645543456077576, + "learning_rate": 0.0008422673263714848, + "loss": 0.0516, + "num_input_tokens_seen": 88495504, + "step": 40950 + }, + { + "epoch": 6.681076672104405, + "grad_norm": 0.015715450048446655, + "learning_rate": 0.0008422154342133983, + "loss": 0.0286, + "num_input_tokens_seen": 88506960, + "step": 40955 + }, + { + "epoch": 6.68189233278956, + "grad_norm": 0.01696091704070568, + "learning_rate": 0.0008421635351198819, + "loss": 0.0101, + "num_input_tokens_seen": 88519088, + "step": 40960 + }, + { + "epoch": 6.682707993474715, + "grad_norm": 0.14385735988616943, + "learning_rate": 0.0008421116290919875, + "loss": 0.1959, + "num_input_tokens_seen": 88529584, + "step": 40965 + }, + { + "epoch": 6.6835236541598695, + "grad_norm": 0.040347855538129807, + "learning_rate": 0.0008420597161307668, + "loss": 0.0822, + "num_input_tokens_seen": 88540976, + "step": 40970 + }, + { + "epoch": 6.684339314845024, + "grad_norm": 0.0037138413172215223, + "learning_rate": 0.0008420077962372721, + "loss": 0.0141, + "num_input_tokens_seen": 88551952, + "step": 40975 + }, + { + "epoch": 6.685154975530179, + "grad_norm": 0.41363850235939026, + "learning_rate": 0.0008419558694125555, + "loss": 0.113, + "num_input_tokens_seen": 88563408, + "step": 40980 + }, + { + "epoch": 6.685970636215334, + "grad_norm": 0.009157951921224594, + "learning_rate": 0.0008419039356576695, + "loss": 0.2461, + "num_input_tokens_seen": 88574384, + "step": 40985 + }, + { + "epoch": 6.68678629690049, + "grad_norm": 0.19476893544197083, + "learning_rate": 0.0008418519949736664, + "loss": 0.0781, + "num_input_tokens_seen": 88585200, + "step": 40990 + }, + { + "epoch": 6.6876019575856445, + "grad_norm": 0.03723384067416191, + "learning_rate": 0.000841800047361599, + "loss": 0.0361, + "num_input_tokens_seen": 88596272, + "step": 40995 + }, + { + "epoch": 6.688417618270799, + "grad_norm": 0.02606227621436119, + "learning_rate": 0.00084174809282252, + "loss": 0.1431, + "num_input_tokens_seen": 88605904, + "step": 41000 + }, + { + "epoch": 6.689233278955954, + "grad_norm": 0.018963932991027832, + "learning_rate": 0.0008416961313574824, + "loss": 0.1336, + "num_input_tokens_seen": 88615888, + "step": 41005 + }, + { + "epoch": 6.690048939641109, + "grad_norm": 0.057104069739580154, + "learning_rate": 0.0008416441629675391, + "loss": 0.0611, + "num_input_tokens_seen": 88627792, + "step": 41010 + }, + { + "epoch": 6.690864600326265, + "grad_norm": 0.01525149866938591, + "learning_rate": 0.0008415921876537436, + "loss": 0.1299, + "num_input_tokens_seen": 88639632, + "step": 41015 + }, + { + "epoch": 6.691680261011419, + "grad_norm": 0.12123435735702515, + "learning_rate": 0.000841540205417149, + "loss": 0.0662, + "num_input_tokens_seen": 88650384, + "step": 41020 + }, + { + "epoch": 6.692495921696574, + "grad_norm": 0.056442294269800186, + "learning_rate": 0.0008414882162588089, + "loss": 0.0486, + "num_input_tokens_seen": 88660880, + "step": 41025 + }, + { + "epoch": 6.693311582381729, + "grad_norm": 0.0264727883040905, + "learning_rate": 0.0008414362201797768, + "loss": 0.0384, + "num_input_tokens_seen": 88671312, + "step": 41030 + }, + { + "epoch": 6.694127243066884, + "grad_norm": 0.13660025596618652, + "learning_rate": 0.0008413842171811066, + "loss": 0.0359, + "num_input_tokens_seen": 88681232, + "step": 41035 + }, + { + "epoch": 6.69494290375204, + "grad_norm": 0.03457217290997505, + "learning_rate": 0.0008413322072638523, + "loss": 0.022, + "num_input_tokens_seen": 88691472, + "step": 41040 + }, + { + "epoch": 6.695758564437194, + "grad_norm": 0.03694334253668785, + "learning_rate": 0.0008412801904290677, + "loss": 0.0197, + "num_input_tokens_seen": 88702544, + "step": 41045 + }, + { + "epoch": 6.696574225122349, + "grad_norm": 0.03526332601904869, + "learning_rate": 0.000841228166677807, + "loss": 0.1232, + "num_input_tokens_seen": 88713648, + "step": 41050 + }, + { + "epoch": 6.697389885807504, + "grad_norm": 0.009007184766232967, + "learning_rate": 0.0008411761360111248, + "loss": 0.0462, + "num_input_tokens_seen": 88724464, + "step": 41055 + }, + { + "epoch": 6.698205546492659, + "grad_norm": 0.10562512278556824, + "learning_rate": 0.0008411240984300752, + "loss": 0.0197, + "num_input_tokens_seen": 88735280, + "step": 41060 + }, + { + "epoch": 6.699021207177814, + "grad_norm": 0.0942317321896553, + "learning_rate": 0.0008410720539357132, + "loss": 0.1576, + "num_input_tokens_seen": 88746448, + "step": 41065 + }, + { + "epoch": 6.699836867862969, + "grad_norm": 0.02955956757068634, + "learning_rate": 0.0008410200025290933, + "loss": 0.0215, + "num_input_tokens_seen": 88757200, + "step": 41070 + }, + { + "epoch": 6.700652528548124, + "grad_norm": 0.02018778957426548, + "learning_rate": 0.0008409679442112703, + "loss": 0.1164, + "num_input_tokens_seen": 88768208, + "step": 41075 + }, + { + "epoch": 6.701468189233279, + "grad_norm": 0.019691454246640205, + "learning_rate": 0.0008409158789832994, + "loss": 0.0729, + "num_input_tokens_seen": 88779152, + "step": 41080 + }, + { + "epoch": 6.702283849918434, + "grad_norm": 0.0125938281416893, + "learning_rate": 0.0008408638068462357, + "loss": 0.1236, + "num_input_tokens_seen": 88790256, + "step": 41085 + }, + { + "epoch": 6.703099510603589, + "grad_norm": 0.2372901290655136, + "learning_rate": 0.0008408117278011347, + "loss": 0.1925, + "num_input_tokens_seen": 88800976, + "step": 41090 + }, + { + "epoch": 6.7039151712887435, + "grad_norm": 0.24161028861999512, + "learning_rate": 0.0008407596418490515, + "loss": 0.0759, + "num_input_tokens_seen": 88811312, + "step": 41095 + }, + { + "epoch": 6.704730831973899, + "grad_norm": 0.034654729068279266, + "learning_rate": 0.0008407075489910421, + "loss": 0.0624, + "num_input_tokens_seen": 88821968, + "step": 41100 + }, + { + "epoch": 6.705546492659054, + "grad_norm": 0.062173739075660706, + "learning_rate": 0.0008406554492281618, + "loss": 0.0306, + "num_input_tokens_seen": 88832784, + "step": 41105 + }, + { + "epoch": 6.706362153344209, + "grad_norm": 0.01632249914109707, + "learning_rate": 0.0008406033425614667, + "loss": 0.0783, + "num_input_tokens_seen": 88843152, + "step": 41110 + }, + { + "epoch": 6.707177814029364, + "grad_norm": 0.04637160152196884, + "learning_rate": 0.0008405512289920129, + "loss": 0.2796, + "num_input_tokens_seen": 88853264, + "step": 41115 + }, + { + "epoch": 6.7079934747145185, + "grad_norm": 0.05950487405061722, + "learning_rate": 0.0008404991085208562, + "loss": 0.0801, + "num_input_tokens_seen": 88863440, + "step": 41120 + }, + { + "epoch": 6.708809135399674, + "grad_norm": 0.04365135356783867, + "learning_rate": 0.0008404469811490534, + "loss": 0.0398, + "num_input_tokens_seen": 88873616, + "step": 41125 + }, + { + "epoch": 6.709624796084829, + "grad_norm": 0.19524741172790527, + "learning_rate": 0.0008403948468776604, + "loss": 0.0655, + "num_input_tokens_seen": 88884816, + "step": 41130 + }, + { + "epoch": 6.710440456769984, + "grad_norm": 0.2746712863445282, + "learning_rate": 0.0008403427057077342, + "loss": 0.1683, + "num_input_tokens_seen": 88896080, + "step": 41135 + }, + { + "epoch": 6.711256117455139, + "grad_norm": 0.3242705464363098, + "learning_rate": 0.0008402905576403312, + "loss": 0.1226, + "num_input_tokens_seen": 88907152, + "step": 41140 + }, + { + "epoch": 6.712071778140293, + "grad_norm": 0.022281266748905182, + "learning_rate": 0.0008402384026765084, + "loss": 0.0701, + "num_input_tokens_seen": 88918640, + "step": 41145 + }, + { + "epoch": 6.712887438825448, + "grad_norm": 0.0167181808501482, + "learning_rate": 0.0008401862408173226, + "loss": 0.0659, + "num_input_tokens_seen": 88928880, + "step": 41150 + }, + { + "epoch": 6.713703099510604, + "grad_norm": 0.039158862084150314, + "learning_rate": 0.0008401340720638313, + "loss": 0.1253, + "num_input_tokens_seen": 88941296, + "step": 41155 + }, + { + "epoch": 6.714518760195759, + "grad_norm": 0.2806652784347534, + "learning_rate": 0.0008400818964170913, + "loss": 0.1515, + "num_input_tokens_seen": 88951888, + "step": 41160 + }, + { + "epoch": 6.715334420880914, + "grad_norm": 0.02568766102194786, + "learning_rate": 0.0008400297138781605, + "loss": 0.0607, + "num_input_tokens_seen": 88962352, + "step": 41165 + }, + { + "epoch": 6.716150081566068, + "grad_norm": 0.07870490849018097, + "learning_rate": 0.0008399775244480961, + "loss": 0.0313, + "num_input_tokens_seen": 88974512, + "step": 41170 + }, + { + "epoch": 6.716965742251223, + "grad_norm": 0.018581105396151543, + "learning_rate": 0.0008399253281279557, + "loss": 0.0362, + "num_input_tokens_seen": 88985168, + "step": 41175 + }, + { + "epoch": 6.717781402936378, + "grad_norm": 0.24575214087963104, + "learning_rate": 0.0008398731249187975, + "loss": 0.2494, + "num_input_tokens_seen": 88996592, + "step": 41180 + }, + { + "epoch": 6.718597063621534, + "grad_norm": 0.023241423070430756, + "learning_rate": 0.0008398209148216793, + "loss": 0.02, + "num_input_tokens_seen": 89008912, + "step": 41185 + }, + { + "epoch": 6.719412724306689, + "grad_norm": 0.2728913128376007, + "learning_rate": 0.000839768697837659, + "loss": 0.0942, + "num_input_tokens_seen": 89020240, + "step": 41190 + }, + { + "epoch": 6.720228384991843, + "grad_norm": 0.004862621426582336, + "learning_rate": 0.0008397164739677951, + "loss": 0.1391, + "num_input_tokens_seen": 89031792, + "step": 41195 + }, + { + "epoch": 6.721044045676998, + "grad_norm": 0.2995845079421997, + "learning_rate": 0.0008396642432131459, + "loss": 0.1093, + "num_input_tokens_seen": 89042928, + "step": 41200 + }, + { + "epoch": 6.721859706362153, + "grad_norm": 0.01482780184596777, + "learning_rate": 0.0008396120055747698, + "loss": 0.1983, + "num_input_tokens_seen": 89052432, + "step": 41205 + }, + { + "epoch": 6.722675367047309, + "grad_norm": 0.1203823983669281, + "learning_rate": 0.0008395597610537257, + "loss": 0.0408, + "num_input_tokens_seen": 89063792, + "step": 41210 + }, + { + "epoch": 6.7234910277324635, + "grad_norm": 0.05856647342443466, + "learning_rate": 0.0008395075096510723, + "loss": 0.1183, + "num_input_tokens_seen": 89074384, + "step": 41215 + }, + { + "epoch": 6.724306688417618, + "grad_norm": 0.07476924359798431, + "learning_rate": 0.0008394552513678684, + "loss": 0.0963, + "num_input_tokens_seen": 89085488, + "step": 41220 + }, + { + "epoch": 6.725122349102773, + "grad_norm": 0.03770218417048454, + "learning_rate": 0.0008394029862051733, + "loss": 0.0795, + "num_input_tokens_seen": 89095728, + "step": 41225 + }, + { + "epoch": 6.725938009787928, + "grad_norm": 0.019872894510626793, + "learning_rate": 0.0008393507141640461, + "loss": 0.0649, + "num_input_tokens_seen": 89106096, + "step": 41230 + }, + { + "epoch": 6.726753670473083, + "grad_norm": 0.004693881142884493, + "learning_rate": 0.0008392984352455461, + "loss": 0.0563, + "num_input_tokens_seen": 89116048, + "step": 41235 + }, + { + "epoch": 6.7275693311582385, + "grad_norm": 0.07502961158752441, + "learning_rate": 0.0008392461494507331, + "loss": 0.0422, + "num_input_tokens_seen": 89127472, + "step": 41240 + }, + { + "epoch": 6.728384991843393, + "grad_norm": 0.0028481758199632168, + "learning_rate": 0.0008391938567806663, + "loss": 0.0258, + "num_input_tokens_seen": 89138288, + "step": 41245 + }, + { + "epoch": 6.729200652528548, + "grad_norm": 0.006228649523109198, + "learning_rate": 0.0008391415572364058, + "loss": 0.0626, + "num_input_tokens_seen": 89149456, + "step": 41250 + }, + { + "epoch": 6.730016313213703, + "grad_norm": 0.2872016429901123, + "learning_rate": 0.0008390892508190113, + "loss": 0.0579, + "num_input_tokens_seen": 89161008, + "step": 41255 + }, + { + "epoch": 6.730831973898858, + "grad_norm": 0.12412890046834946, + "learning_rate": 0.000839036937529543, + "loss": 0.0991, + "num_input_tokens_seen": 89171856, + "step": 41260 + }, + { + "epoch": 6.731647634584013, + "grad_norm": 0.02800234593451023, + "learning_rate": 0.0008389846173690611, + "loss": 0.0191, + "num_input_tokens_seen": 89183024, + "step": 41265 + }, + { + "epoch": 6.732463295269168, + "grad_norm": 0.012140207923948765, + "learning_rate": 0.0008389322903386261, + "loss": 0.0071, + "num_input_tokens_seen": 89192880, + "step": 41270 + }, + { + "epoch": 6.733278955954323, + "grad_norm": 0.02547260746359825, + "learning_rate": 0.0008388799564392979, + "loss": 0.0876, + "num_input_tokens_seen": 89203888, + "step": 41275 + }, + { + "epoch": 6.734094616639478, + "grad_norm": 0.267605185508728, + "learning_rate": 0.0008388276156721377, + "loss": 0.0582, + "num_input_tokens_seen": 89214192, + "step": 41280 + }, + { + "epoch": 6.734910277324633, + "grad_norm": 0.03963426128029823, + "learning_rate": 0.0008387752680382062, + "loss": 0.1477, + "num_input_tokens_seen": 89225424, + "step": 41285 + }, + { + "epoch": 6.735725938009788, + "grad_norm": 0.2313862442970276, + "learning_rate": 0.0008387229135385638, + "loss": 0.0645, + "num_input_tokens_seen": 89235600, + "step": 41290 + }, + { + "epoch": 6.736541598694943, + "grad_norm": 0.06140168383717537, + "learning_rate": 0.0008386705521742719, + "loss": 0.1396, + "num_input_tokens_seen": 89246544, + "step": 41295 + }, + { + "epoch": 6.737357259380098, + "grad_norm": 0.23113910853862762, + "learning_rate": 0.0008386181839463918, + "loss": 0.1171, + "num_input_tokens_seen": 89257040, + "step": 41300 + }, + { + "epoch": 6.738172920065253, + "grad_norm": 0.011116042733192444, + "learning_rate": 0.0008385658088559845, + "loss": 0.164, + "num_input_tokens_seen": 89267792, + "step": 41305 + }, + { + "epoch": 6.738988580750408, + "grad_norm": 0.12740331888198853, + "learning_rate": 0.0008385134269041116, + "loss": 0.0459, + "num_input_tokens_seen": 89278832, + "step": 41310 + }, + { + "epoch": 6.739804241435563, + "grad_norm": 0.0157408956438303, + "learning_rate": 0.0008384610380918347, + "loss": 0.0332, + "num_input_tokens_seen": 89290000, + "step": 41315 + }, + { + "epoch": 6.740619902120718, + "grad_norm": 0.3017098605632782, + "learning_rate": 0.0008384086424202156, + "loss": 0.0939, + "num_input_tokens_seen": 89300080, + "step": 41320 + }, + { + "epoch": 6.741435562805873, + "grad_norm": 0.01924244500696659, + "learning_rate": 0.0008383562398903157, + "loss": 0.0786, + "num_input_tokens_seen": 89310960, + "step": 41325 + }, + { + "epoch": 6.742251223491028, + "grad_norm": 0.06611377000808716, + "learning_rate": 0.0008383038305031976, + "loss": 0.1008, + "num_input_tokens_seen": 89320784, + "step": 41330 + }, + { + "epoch": 6.743066884176183, + "grad_norm": 0.26710790395736694, + "learning_rate": 0.0008382514142599234, + "loss": 0.1293, + "num_input_tokens_seen": 89330416, + "step": 41335 + }, + { + "epoch": 6.7438825448613375, + "grad_norm": 0.005136616062372923, + "learning_rate": 0.0008381989911615548, + "loss": 0.1704, + "num_input_tokens_seen": 89340976, + "step": 41340 + }, + { + "epoch": 6.744698205546492, + "grad_norm": 0.37061432003974915, + "learning_rate": 0.0008381465612091549, + "loss": 0.1181, + "num_input_tokens_seen": 89352240, + "step": 41345 + }, + { + "epoch": 6.745513866231647, + "grad_norm": 0.0033552530221641064, + "learning_rate": 0.0008380941244037858, + "loss": 0.0418, + "num_input_tokens_seen": 89362704, + "step": 41350 + }, + { + "epoch": 6.746329526916803, + "grad_norm": 0.003381171729415655, + "learning_rate": 0.0008380416807465106, + "loss": 0.0364, + "num_input_tokens_seen": 89373392, + "step": 41355 + }, + { + "epoch": 6.747145187601958, + "grad_norm": 0.0964924544095993, + "learning_rate": 0.0008379892302383916, + "loss": 0.021, + "num_input_tokens_seen": 89384368, + "step": 41360 + }, + { + "epoch": 6.7479608482871125, + "grad_norm": 0.021633053198456764, + "learning_rate": 0.0008379367728804923, + "loss": 0.0493, + "num_input_tokens_seen": 89394736, + "step": 41365 + }, + { + "epoch": 6.748776508972267, + "grad_norm": 0.10366320610046387, + "learning_rate": 0.0008378843086738755, + "loss": 0.0979, + "num_input_tokens_seen": 89405296, + "step": 41370 + }, + { + "epoch": 6.749592169657422, + "grad_norm": 0.0048525105230510235, + "learning_rate": 0.0008378318376196046, + "loss": 0.036, + "num_input_tokens_seen": 89416624, + "step": 41375 + }, + { + "epoch": 6.750407830342578, + "grad_norm": 0.3233185112476349, + "learning_rate": 0.0008377793597187428, + "loss": 0.1037, + "num_input_tokens_seen": 89426928, + "step": 41380 + }, + { + "epoch": 6.751223491027733, + "grad_norm": 0.01112865749746561, + "learning_rate": 0.000837726874972354, + "loss": 0.0211, + "num_input_tokens_seen": 89439024, + "step": 41385 + }, + { + "epoch": 6.7520391517128875, + "grad_norm": 0.28287333250045776, + "learning_rate": 0.0008376743833815015, + "loss": 0.1352, + "num_input_tokens_seen": 89447472, + "step": 41390 + }, + { + "epoch": 6.752854812398042, + "grad_norm": 0.3251228928565979, + "learning_rate": 0.0008376218849472493, + "loss": 0.201, + "num_input_tokens_seen": 89458192, + "step": 41395 + }, + { + "epoch": 6.753670473083197, + "grad_norm": 0.03416355699300766, + "learning_rate": 0.0008375693796706613, + "loss": 0.0452, + "num_input_tokens_seen": 89468912, + "step": 41400 + }, + { + "epoch": 6.754486133768353, + "grad_norm": 0.049930673092603683, + "learning_rate": 0.0008375168675528016, + "loss": 0.0615, + "num_input_tokens_seen": 89479856, + "step": 41405 + }, + { + "epoch": 6.755301794453508, + "grad_norm": 0.18245936930179596, + "learning_rate": 0.0008374643485947342, + "loss": 0.2387, + "num_input_tokens_seen": 89490736, + "step": 41410 + }, + { + "epoch": 6.7561174551386625, + "grad_norm": 0.09749633818864822, + "learning_rate": 0.0008374118227975238, + "loss": 0.0335, + "num_input_tokens_seen": 89501392, + "step": 41415 + }, + { + "epoch": 6.756933115823817, + "grad_norm": 0.022130804136395454, + "learning_rate": 0.0008373592901622349, + "loss": 0.0518, + "num_input_tokens_seen": 89512656, + "step": 41420 + }, + { + "epoch": 6.757748776508972, + "grad_norm": 0.02885841391980648, + "learning_rate": 0.0008373067506899319, + "loss": 0.0253, + "num_input_tokens_seen": 89523088, + "step": 41425 + }, + { + "epoch": 6.758564437194127, + "grad_norm": 0.09676162898540497, + "learning_rate": 0.0008372542043816797, + "loss": 0.1792, + "num_input_tokens_seen": 89534288, + "step": 41430 + }, + { + "epoch": 6.759380097879282, + "grad_norm": 0.00921018235385418, + "learning_rate": 0.0008372016512385432, + "loss": 0.0125, + "num_input_tokens_seen": 89545872, + "step": 41435 + }, + { + "epoch": 6.760195758564437, + "grad_norm": 0.06077948957681656, + "learning_rate": 0.0008371490912615875, + "loss": 0.0715, + "num_input_tokens_seen": 89556336, + "step": 41440 + }, + { + "epoch": 6.761011419249592, + "grad_norm": 0.031869012862443924, + "learning_rate": 0.0008370965244518778, + "loss": 0.0465, + "num_input_tokens_seen": 89566768, + "step": 41445 + }, + { + "epoch": 6.761827079934747, + "grad_norm": 0.02272254228591919, + "learning_rate": 0.0008370439508104794, + "loss": 0.0356, + "num_input_tokens_seen": 89577456, + "step": 41450 + }, + { + "epoch": 6.762642740619902, + "grad_norm": 0.09011733531951904, + "learning_rate": 0.0008369913703384576, + "loss": 0.1329, + "num_input_tokens_seen": 89589136, + "step": 41455 + }, + { + "epoch": 6.763458401305057, + "grad_norm": 0.22700747847557068, + "learning_rate": 0.0008369387830368785, + "loss": 0.1296, + "num_input_tokens_seen": 89599312, + "step": 41460 + }, + { + "epoch": 6.764274061990212, + "grad_norm": 0.24166239798069, + "learning_rate": 0.0008368861889068071, + "loss": 0.1623, + "num_input_tokens_seen": 89610960, + "step": 41465 + }, + { + "epoch": 6.765089722675367, + "grad_norm": 0.05804312974214554, + "learning_rate": 0.0008368335879493099, + "loss": 0.0187, + "num_input_tokens_seen": 89620784, + "step": 41470 + }, + { + "epoch": 6.765905383360522, + "grad_norm": 0.0035739641170948744, + "learning_rate": 0.0008367809801654529, + "loss": 0.1717, + "num_input_tokens_seen": 89632272, + "step": 41475 + }, + { + "epoch": 6.766721044045677, + "grad_norm": 0.23069559037685394, + "learning_rate": 0.0008367283655563018, + "loss": 0.1694, + "num_input_tokens_seen": 89643120, + "step": 41480 + }, + { + "epoch": 6.767536704730832, + "grad_norm": 0.15718932449817657, + "learning_rate": 0.0008366757441229235, + "loss": 0.0793, + "num_input_tokens_seen": 89653584, + "step": 41485 + }, + { + "epoch": 6.768352365415987, + "grad_norm": 0.08193394541740417, + "learning_rate": 0.000836623115866384, + "loss": 0.0785, + "num_input_tokens_seen": 89665264, + "step": 41490 + }, + { + "epoch": 6.769168026101142, + "grad_norm": 0.010544022545218468, + "learning_rate": 0.00083657048078775, + "loss": 0.0782, + "num_input_tokens_seen": 89675216, + "step": 41495 + }, + { + "epoch": 6.769983686786297, + "grad_norm": 0.16942720115184784, + "learning_rate": 0.0008365178388880883, + "loss": 0.2512, + "num_input_tokens_seen": 89686480, + "step": 41500 + }, + { + "epoch": 6.770799347471452, + "grad_norm": 0.024143319576978683, + "learning_rate": 0.0008364651901684657, + "loss": 0.117, + "num_input_tokens_seen": 89697232, + "step": 41505 + }, + { + "epoch": 6.771615008156607, + "grad_norm": 0.18397311866283417, + "learning_rate": 0.0008364125346299492, + "loss": 0.0661, + "num_input_tokens_seen": 89708080, + "step": 41510 + }, + { + "epoch": 6.7724306688417615, + "grad_norm": 0.02165348269045353, + "learning_rate": 0.0008363598722736057, + "loss": 0.1404, + "num_input_tokens_seen": 89718416, + "step": 41515 + }, + { + "epoch": 6.773246329526917, + "grad_norm": 0.07183000445365906, + "learning_rate": 0.0008363072031005028, + "loss": 0.0314, + "num_input_tokens_seen": 89728688, + "step": 41520 + }, + { + "epoch": 6.774061990212072, + "grad_norm": 0.0437711663544178, + "learning_rate": 0.0008362545271117079, + "loss": 0.1382, + "num_input_tokens_seen": 89740784, + "step": 41525 + }, + { + "epoch": 6.774877650897227, + "grad_norm": 0.1806856095790863, + "learning_rate": 0.0008362018443082884, + "loss": 0.0989, + "num_input_tokens_seen": 89751632, + "step": 41530 + }, + { + "epoch": 6.775693311582382, + "grad_norm": 0.08479801565408707, + "learning_rate": 0.000836149154691312, + "loss": 0.0721, + "num_input_tokens_seen": 89762224, + "step": 41535 + }, + { + "epoch": 6.7765089722675365, + "grad_norm": 0.013879367150366306, + "learning_rate": 0.0008360964582618465, + "loss": 0.1656, + "num_input_tokens_seen": 89772624, + "step": 41540 + }, + { + "epoch": 6.777324632952691, + "grad_norm": 0.014607289806008339, + "learning_rate": 0.0008360437550209599, + "loss": 0.0195, + "num_input_tokens_seen": 89782832, + "step": 41545 + }, + { + "epoch": 6.778140293637847, + "grad_norm": 0.3534904420375824, + "learning_rate": 0.0008359910449697203, + "loss": 0.1276, + "num_input_tokens_seen": 89793072, + "step": 41550 + }, + { + "epoch": 6.778955954323002, + "grad_norm": 0.01215518917888403, + "learning_rate": 0.0008359383281091961, + "loss": 0.055, + "num_input_tokens_seen": 89804080, + "step": 41555 + }, + { + "epoch": 6.779771615008157, + "grad_norm": 0.028247803449630737, + "learning_rate": 0.0008358856044404553, + "loss": 0.1613, + "num_input_tokens_seen": 89815888, + "step": 41560 + }, + { + "epoch": 6.780587275693311, + "grad_norm": 0.010149382054805756, + "learning_rate": 0.0008358328739645668, + "loss": 0.0612, + "num_input_tokens_seen": 89827344, + "step": 41565 + }, + { + "epoch": 6.781402936378466, + "grad_norm": 0.281253457069397, + "learning_rate": 0.000835780136682599, + "loss": 0.2119, + "num_input_tokens_seen": 89837328, + "step": 41570 + }, + { + "epoch": 6.782218597063622, + "grad_norm": 0.05083395168185234, + "learning_rate": 0.0008357273925956208, + "loss": 0.0712, + "num_input_tokens_seen": 89847056, + "step": 41575 + }, + { + "epoch": 6.783034257748777, + "grad_norm": 0.06286054849624634, + "learning_rate": 0.000835674641704701, + "loss": 0.0331, + "num_input_tokens_seen": 89857488, + "step": 41580 + }, + { + "epoch": 6.783849918433932, + "grad_norm": 0.01152315828949213, + "learning_rate": 0.0008356218840109089, + "loss": 0.0839, + "num_input_tokens_seen": 89867920, + "step": 41585 + }, + { + "epoch": 6.784665579119086, + "grad_norm": 0.025084182620048523, + "learning_rate": 0.0008355691195153134, + "loss": 0.0258, + "num_input_tokens_seen": 89878768, + "step": 41590 + }, + { + "epoch": 6.785481239804241, + "grad_norm": 0.046773433685302734, + "learning_rate": 0.000835516348218984, + "loss": 0.1176, + "num_input_tokens_seen": 89890768, + "step": 41595 + }, + { + "epoch": 6.786296900489396, + "grad_norm": 0.039644379168748856, + "learning_rate": 0.0008354635701229902, + "loss": 0.083, + "num_input_tokens_seen": 89901712, + "step": 41600 + }, + { + "epoch": 6.787112561174552, + "grad_norm": 0.14292879402637482, + "learning_rate": 0.0008354107852284016, + "loss": 0.0583, + "num_input_tokens_seen": 89913104, + "step": 41605 + }, + { + "epoch": 6.787928221859707, + "grad_norm": 0.2356540560722351, + "learning_rate": 0.0008353579935362881, + "loss": 0.1072, + "num_input_tokens_seen": 89923120, + "step": 41610 + }, + { + "epoch": 6.788743882544861, + "grad_norm": 0.2634557783603668, + "learning_rate": 0.0008353051950477192, + "loss": 0.0516, + "num_input_tokens_seen": 89934736, + "step": 41615 + }, + { + "epoch": 6.789559543230016, + "grad_norm": 0.10258938372135162, + "learning_rate": 0.0008352523897637652, + "loss": 0.0387, + "num_input_tokens_seen": 89945808, + "step": 41620 + }, + { + "epoch": 6.790375203915171, + "grad_norm": 0.014087699353694916, + "learning_rate": 0.0008351995776854962, + "loss": 0.1091, + "num_input_tokens_seen": 89956880, + "step": 41625 + }, + { + "epoch": 6.791190864600326, + "grad_norm": 0.01874430850148201, + "learning_rate": 0.0008351467588139827, + "loss": 0.0937, + "num_input_tokens_seen": 89967504, + "step": 41630 + }, + { + "epoch": 6.7920065252854815, + "grad_norm": 0.029873300343751907, + "learning_rate": 0.0008350939331502949, + "loss": 0.0384, + "num_input_tokens_seen": 89978448, + "step": 41635 + }, + { + "epoch": 6.792822185970636, + "grad_norm": 0.011422554962337017, + "learning_rate": 0.0008350411006955033, + "loss": 0.0058, + "num_input_tokens_seen": 89990160, + "step": 41640 + }, + { + "epoch": 6.793637846655791, + "grad_norm": 0.11676127463579178, + "learning_rate": 0.0008349882614506789, + "loss": 0.049, + "num_input_tokens_seen": 90000592, + "step": 41645 + }, + { + "epoch": 6.794453507340946, + "grad_norm": 0.1809616982936859, + "learning_rate": 0.0008349354154168924, + "loss": 0.1131, + "num_input_tokens_seen": 90011152, + "step": 41650 + }, + { + "epoch": 6.795269168026101, + "grad_norm": 0.19138872623443604, + "learning_rate": 0.0008348825625952148, + "loss": 0.0849, + "num_input_tokens_seen": 90020432, + "step": 41655 + }, + { + "epoch": 6.7960848287112565, + "grad_norm": 0.278010755777359, + "learning_rate": 0.0008348297029867172, + "loss": 0.135, + "num_input_tokens_seen": 90031120, + "step": 41660 + }, + { + "epoch": 6.796900489396411, + "grad_norm": 0.06747753918170929, + "learning_rate": 0.0008347768365924709, + "loss": 0.0888, + "num_input_tokens_seen": 90042160, + "step": 41665 + }, + { + "epoch": 6.797716150081566, + "grad_norm": 0.14137335121631622, + "learning_rate": 0.0008347239634135474, + "loss": 0.1913, + "num_input_tokens_seen": 90052976, + "step": 41670 + }, + { + "epoch": 6.798531810766721, + "grad_norm": 0.021791979670524597, + "learning_rate": 0.0008346710834510181, + "loss": 0.0335, + "num_input_tokens_seen": 90063728, + "step": 41675 + }, + { + "epoch": 6.799347471451876, + "grad_norm": 0.10690966993570328, + "learning_rate": 0.0008346181967059548, + "loss": 0.041, + "num_input_tokens_seen": 90076272, + "step": 41680 + }, + { + "epoch": 6.800163132137031, + "grad_norm": 0.22786171734333038, + "learning_rate": 0.0008345653031794292, + "loss": 0.079, + "num_input_tokens_seen": 90087056, + "step": 41685 + }, + { + "epoch": 6.800978792822186, + "grad_norm": 0.41516733169555664, + "learning_rate": 0.0008345124028725133, + "loss": 0.1288, + "num_input_tokens_seen": 90096944, + "step": 41690 + }, + { + "epoch": 6.801794453507341, + "grad_norm": 0.007719927933067083, + "learning_rate": 0.0008344594957862792, + "loss": 0.0907, + "num_input_tokens_seen": 90108752, + "step": 41695 + }, + { + "epoch": 6.802610114192496, + "grad_norm": 0.012921489775180817, + "learning_rate": 0.000834406581921799, + "loss": 0.1094, + "num_input_tokens_seen": 90119856, + "step": 41700 + }, + { + "epoch": 6.803425774877651, + "grad_norm": 0.12971089780330658, + "learning_rate": 0.0008343536612801454, + "loss": 0.0577, + "num_input_tokens_seen": 90130896, + "step": 41705 + }, + { + "epoch": 6.804241435562806, + "grad_norm": 0.010836289264261723, + "learning_rate": 0.0008343007338623906, + "loss": 0.1355, + "num_input_tokens_seen": 90139632, + "step": 41710 + }, + { + "epoch": 6.80505709624796, + "grad_norm": 0.006006123032420874, + "learning_rate": 0.0008342477996696074, + "loss": 0.1392, + "num_input_tokens_seen": 90150096, + "step": 41715 + }, + { + "epoch": 6.805872756933116, + "grad_norm": 0.15588663518428802, + "learning_rate": 0.0008341948587028684, + "loss": 0.0615, + "num_input_tokens_seen": 90161360, + "step": 41720 + }, + { + "epoch": 6.806688417618271, + "grad_norm": 0.18570052087306976, + "learning_rate": 0.0008341419109632466, + "loss": 0.14, + "num_input_tokens_seen": 90172144, + "step": 41725 + }, + { + "epoch": 6.807504078303426, + "grad_norm": 0.07356259226799011, + "learning_rate": 0.0008340889564518153, + "loss": 0.0617, + "num_input_tokens_seen": 90183536, + "step": 41730 + }, + { + "epoch": 6.808319738988581, + "grad_norm": 0.10233187675476074, + "learning_rate": 0.0008340359951696472, + "loss": 0.1075, + "num_input_tokens_seen": 90194224, + "step": 41735 + }, + { + "epoch": 6.809135399673735, + "grad_norm": 0.027137896046042442, + "learning_rate": 0.0008339830271178162, + "loss": 0.0263, + "num_input_tokens_seen": 90205200, + "step": 41740 + }, + { + "epoch": 6.809951060358891, + "grad_norm": 0.13853560388088226, + "learning_rate": 0.0008339300522973952, + "loss": 0.1033, + "num_input_tokens_seen": 90214224, + "step": 41745 + }, + { + "epoch": 6.810766721044046, + "grad_norm": 0.03210921958088875, + "learning_rate": 0.0008338770707094583, + "loss": 0.0439, + "num_input_tokens_seen": 90225552, + "step": 41750 + }, + { + "epoch": 6.811582381729201, + "grad_norm": 0.0702987015247345, + "learning_rate": 0.0008338240823550789, + "loss": 0.2171, + "num_input_tokens_seen": 90236048, + "step": 41755 + }, + { + "epoch": 6.8123980424143555, + "grad_norm": 0.35574427247047424, + "learning_rate": 0.000833771087235331, + "loss": 0.094, + "num_input_tokens_seen": 90247344, + "step": 41760 + }, + { + "epoch": 6.81321370309951, + "grad_norm": 0.2279064953327179, + "learning_rate": 0.0008337180853512885, + "loss": 0.0731, + "num_input_tokens_seen": 90258320, + "step": 41765 + }, + { + "epoch": 6.814029363784666, + "grad_norm": 0.025548186153173447, + "learning_rate": 0.0008336650767040258, + "loss": 0.0733, + "num_input_tokens_seen": 90268336, + "step": 41770 + }, + { + "epoch": 6.814845024469821, + "grad_norm": 0.20487023890018463, + "learning_rate": 0.000833612061294617, + "loss": 0.1404, + "num_input_tokens_seen": 90279120, + "step": 41775 + }, + { + "epoch": 6.815660685154976, + "grad_norm": 0.10038571059703827, + "learning_rate": 0.0008335590391241365, + "loss": 0.0751, + "num_input_tokens_seen": 90290064, + "step": 41780 + }, + { + "epoch": 6.8164763458401305, + "grad_norm": 0.41231435537338257, + "learning_rate": 0.000833506010193659, + "loss": 0.125, + "num_input_tokens_seen": 90299952, + "step": 41785 + }, + { + "epoch": 6.817292006525285, + "grad_norm": 0.20405316352844238, + "learning_rate": 0.000833452974504259, + "loss": 0.0788, + "num_input_tokens_seen": 90310640, + "step": 41790 + }, + { + "epoch": 6.81810766721044, + "grad_norm": 0.03132156655192375, + "learning_rate": 0.0008333999320570116, + "loss": 0.0719, + "num_input_tokens_seen": 90320656, + "step": 41795 + }, + { + "epoch": 6.818923327895595, + "grad_norm": 0.012432006187736988, + "learning_rate": 0.0008333468828529916, + "loss": 0.1274, + "num_input_tokens_seen": 90331824, + "step": 41800 + }, + { + "epoch": 6.819738988580751, + "grad_norm": 0.021729158237576485, + "learning_rate": 0.0008332938268932742, + "loss": 0.06, + "num_input_tokens_seen": 90344016, + "step": 41805 + }, + { + "epoch": 6.8205546492659055, + "grad_norm": 0.04156330227851868, + "learning_rate": 0.0008332407641789344, + "loss": 0.0443, + "num_input_tokens_seen": 90356048, + "step": 41810 + }, + { + "epoch": 6.82137030995106, + "grad_norm": 0.14998769760131836, + "learning_rate": 0.0008331876947110478, + "loss": 0.1661, + "num_input_tokens_seen": 90367792, + "step": 41815 + }, + { + "epoch": 6.822185970636215, + "grad_norm": 0.05917227268218994, + "learning_rate": 0.00083313461849069, + "loss": 0.0593, + "num_input_tokens_seen": 90377840, + "step": 41820 + }, + { + "epoch": 6.82300163132137, + "grad_norm": 0.012237275019288063, + "learning_rate": 0.0008330815355189365, + "loss": 0.0382, + "num_input_tokens_seen": 90387920, + "step": 41825 + }, + { + "epoch": 6.823817292006526, + "grad_norm": 0.19908970594406128, + "learning_rate": 0.0008330284457968631, + "loss": 0.0849, + "num_input_tokens_seen": 90399760, + "step": 41830 + }, + { + "epoch": 6.8246329526916805, + "grad_norm": 0.12390464544296265, + "learning_rate": 0.0008329753493255458, + "loss": 0.1322, + "num_input_tokens_seen": 90409840, + "step": 41835 + }, + { + "epoch": 6.825448613376835, + "grad_norm": 0.0513911172747612, + "learning_rate": 0.0008329222461060606, + "loss": 0.1119, + "num_input_tokens_seen": 90421136, + "step": 41840 + }, + { + "epoch": 6.82626427406199, + "grad_norm": 0.22825922071933746, + "learning_rate": 0.0008328691361394838, + "loss": 0.1985, + "num_input_tokens_seen": 90430640, + "step": 41845 + }, + { + "epoch": 6.827079934747145, + "grad_norm": 0.01640220545232296, + "learning_rate": 0.0008328160194268916, + "loss": 0.0438, + "num_input_tokens_seen": 90442064, + "step": 41850 + }, + { + "epoch": 6.827895595432301, + "grad_norm": 0.09551920741796494, + "learning_rate": 0.0008327628959693606, + "loss": 0.046, + "num_input_tokens_seen": 90452336, + "step": 41855 + }, + { + "epoch": 6.828711256117455, + "grad_norm": 0.25587597489356995, + "learning_rate": 0.0008327097657679674, + "loss": 0.1737, + "num_input_tokens_seen": 90463600, + "step": 41860 + }, + { + "epoch": 6.82952691680261, + "grad_norm": 0.04232428967952728, + "learning_rate": 0.0008326566288237887, + "loss": 0.0204, + "num_input_tokens_seen": 90474576, + "step": 41865 + }, + { + "epoch": 6.830342577487765, + "grad_norm": 0.03149278461933136, + "learning_rate": 0.0008326034851379014, + "loss": 0.1502, + "num_input_tokens_seen": 90484976, + "step": 41870 + }, + { + "epoch": 6.83115823817292, + "grad_norm": 0.28385284543037415, + "learning_rate": 0.0008325503347113826, + "loss": 0.1137, + "num_input_tokens_seen": 90494576, + "step": 41875 + }, + { + "epoch": 6.831973898858075, + "grad_norm": 0.2156359702348709, + "learning_rate": 0.0008324971775453094, + "loss": 0.2996, + "num_input_tokens_seen": 90504592, + "step": 41880 + }, + { + "epoch": 6.8327895595432295, + "grad_norm": 0.01356032956391573, + "learning_rate": 0.0008324440136407591, + "loss": 0.0573, + "num_input_tokens_seen": 90515120, + "step": 41885 + }, + { + "epoch": 6.833605220228385, + "grad_norm": 0.1261776089668274, + "learning_rate": 0.000832390842998809, + "loss": 0.0498, + "num_input_tokens_seen": 90526000, + "step": 41890 + }, + { + "epoch": 6.83442088091354, + "grad_norm": 0.1900995522737503, + "learning_rate": 0.0008323376656205369, + "loss": 0.1378, + "num_input_tokens_seen": 90537552, + "step": 41895 + }, + { + "epoch": 6.835236541598695, + "grad_norm": 0.31141209602355957, + "learning_rate": 0.0008322844815070204, + "loss": 0.1889, + "num_input_tokens_seen": 90548272, + "step": 41900 + }, + { + "epoch": 6.83605220228385, + "grad_norm": 0.2049756795167923, + "learning_rate": 0.0008322312906593373, + "loss": 0.1449, + "num_input_tokens_seen": 90558064, + "step": 41905 + }, + { + "epoch": 6.8368678629690045, + "grad_norm": 0.03546799719333649, + "learning_rate": 0.0008321780930785657, + "loss": 0.0302, + "num_input_tokens_seen": 90569712, + "step": 41910 + }, + { + "epoch": 6.83768352365416, + "grad_norm": 0.09230761975049973, + "learning_rate": 0.0008321248887657836, + "loss": 0.1765, + "num_input_tokens_seen": 90580656, + "step": 41915 + }, + { + "epoch": 6.838499184339315, + "grad_norm": 0.052474021911621094, + "learning_rate": 0.0008320716777220694, + "loss": 0.0439, + "num_input_tokens_seen": 90590512, + "step": 41920 + }, + { + "epoch": 6.83931484502447, + "grad_norm": 0.05721645429730415, + "learning_rate": 0.0008320184599485012, + "loss": 0.0613, + "num_input_tokens_seen": 90600784, + "step": 41925 + }, + { + "epoch": 6.840130505709625, + "grad_norm": 0.015794144943356514, + "learning_rate": 0.0008319652354461577, + "loss": 0.0657, + "num_input_tokens_seen": 90611984, + "step": 41930 + }, + { + "epoch": 6.8409461663947795, + "grad_norm": 0.024517951533198357, + "learning_rate": 0.0008319120042161179, + "loss": 0.0459, + "num_input_tokens_seen": 90623152, + "step": 41935 + }, + { + "epoch": 6.841761827079935, + "grad_norm": 0.24330684542655945, + "learning_rate": 0.00083185876625946, + "loss": 0.1472, + "num_input_tokens_seen": 90634736, + "step": 41940 + }, + { + "epoch": 6.84257748776509, + "grad_norm": 0.015622702427208424, + "learning_rate": 0.0008318055215772633, + "loss": 0.1043, + "num_input_tokens_seen": 90644976, + "step": 41945 + }, + { + "epoch": 6.843393148450245, + "grad_norm": 0.1785089373588562, + "learning_rate": 0.0008317522701706066, + "loss": 0.1045, + "num_input_tokens_seen": 90656656, + "step": 41950 + }, + { + "epoch": 6.8442088091354, + "grad_norm": 0.16877882182598114, + "learning_rate": 0.0008316990120405695, + "loss": 0.0717, + "num_input_tokens_seen": 90668592, + "step": 41955 + }, + { + "epoch": 6.8450244698205545, + "grad_norm": 0.009568187408149242, + "learning_rate": 0.0008316457471882311, + "loss": 0.0108, + "num_input_tokens_seen": 90678800, + "step": 41960 + }, + { + "epoch": 6.845840130505709, + "grad_norm": 0.012613932602107525, + "learning_rate": 0.0008315924756146708, + "loss": 0.0715, + "num_input_tokens_seen": 90689584, + "step": 41965 + }, + { + "epoch": 6.846655791190865, + "grad_norm": 0.20978093147277832, + "learning_rate": 0.0008315391973209685, + "loss": 0.2168, + "num_input_tokens_seen": 90699536, + "step": 41970 + }, + { + "epoch": 6.84747145187602, + "grad_norm": 0.04319089278578758, + "learning_rate": 0.0008314859123082037, + "loss": 0.0733, + "num_input_tokens_seen": 90710800, + "step": 41975 + }, + { + "epoch": 6.848287112561175, + "grad_norm": 0.17170238494873047, + "learning_rate": 0.0008314326205774563, + "loss": 0.0658, + "num_input_tokens_seen": 90720976, + "step": 41980 + }, + { + "epoch": 6.849102773246329, + "grad_norm": 0.1302386075258255, + "learning_rate": 0.0008313793221298065, + "loss": 0.1116, + "num_input_tokens_seen": 90730832, + "step": 41985 + }, + { + "epoch": 6.849918433931484, + "grad_norm": 0.01649455539882183, + "learning_rate": 0.0008313260169663343, + "loss": 0.0702, + "num_input_tokens_seen": 90742320, + "step": 41990 + }, + { + "epoch": 6.850734094616639, + "grad_norm": 0.046188920736312866, + "learning_rate": 0.00083127270508812, + "loss": 0.0986, + "num_input_tokens_seen": 90753680, + "step": 41995 + }, + { + "epoch": 6.851549755301795, + "grad_norm": 0.02995547652244568, + "learning_rate": 0.0008312193864962442, + "loss": 0.0453, + "num_input_tokens_seen": 90764272, + "step": 42000 + }, + { + "epoch": 6.85236541598695, + "grad_norm": 0.023149535059928894, + "learning_rate": 0.0008311660611917873, + "loss": 0.026, + "num_input_tokens_seen": 90774672, + "step": 42005 + }, + { + "epoch": 6.853181076672104, + "grad_norm": 0.1374523937702179, + "learning_rate": 0.00083111272917583, + "loss": 0.1136, + "num_input_tokens_seen": 90786416, + "step": 42010 + }, + { + "epoch": 6.853996737357259, + "grad_norm": 0.25843703746795654, + "learning_rate": 0.0008310593904494532, + "loss": 0.1107, + "num_input_tokens_seen": 90795472, + "step": 42015 + }, + { + "epoch": 6.854812398042414, + "grad_norm": 0.01731134206056595, + "learning_rate": 0.000831006045013738, + "loss": 0.0317, + "num_input_tokens_seen": 90807248, + "step": 42020 + }, + { + "epoch": 6.85562805872757, + "grad_norm": 0.10636473447084427, + "learning_rate": 0.0008309526928697653, + "loss": 0.0349, + "num_input_tokens_seen": 90818544, + "step": 42025 + }, + { + "epoch": 6.856443719412725, + "grad_norm": 0.01988835819065571, + "learning_rate": 0.0008308993340186164, + "loss": 0.1655, + "num_input_tokens_seen": 90829264, + "step": 42030 + }, + { + "epoch": 6.857259380097879, + "grad_norm": 0.03717343881726265, + "learning_rate": 0.0008308459684613727, + "loss": 0.0217, + "num_input_tokens_seen": 90840048, + "step": 42035 + }, + { + "epoch": 6.858075040783034, + "grad_norm": 0.16305583715438843, + "learning_rate": 0.0008307925961991158, + "loss": 0.1935, + "num_input_tokens_seen": 90850512, + "step": 42040 + }, + { + "epoch": 6.858890701468189, + "grad_norm": 0.14231441915035248, + "learning_rate": 0.0008307392172329273, + "loss": 0.1166, + "num_input_tokens_seen": 90862064, + "step": 42045 + }, + { + "epoch": 6.859706362153344, + "grad_norm": 0.012785021215677261, + "learning_rate": 0.000830685831563889, + "loss": 0.0339, + "num_input_tokens_seen": 90872592, + "step": 42050 + }, + { + "epoch": 6.8605220228384995, + "grad_norm": 0.07883328944444656, + "learning_rate": 0.0008306324391930827, + "loss": 0.0422, + "num_input_tokens_seen": 90883024, + "step": 42055 + }, + { + "epoch": 6.861337683523654, + "grad_norm": 0.04878608137369156, + "learning_rate": 0.0008305790401215906, + "loss": 0.0309, + "num_input_tokens_seen": 90893392, + "step": 42060 + }, + { + "epoch": 6.862153344208809, + "grad_norm": 0.13074623048305511, + "learning_rate": 0.000830525634350495, + "loss": 0.1722, + "num_input_tokens_seen": 90904400, + "step": 42065 + }, + { + "epoch": 6.862969004893964, + "grad_norm": 0.07179665565490723, + "learning_rate": 0.0008304722218808782, + "loss": 0.1888, + "num_input_tokens_seen": 90916560, + "step": 42070 + }, + { + "epoch": 6.863784665579119, + "grad_norm": 0.12156625837087631, + "learning_rate": 0.0008304188027138225, + "loss": 0.0426, + "num_input_tokens_seen": 90928368, + "step": 42075 + }, + { + "epoch": 6.864600326264274, + "grad_norm": 0.15231293439865112, + "learning_rate": 0.0008303653768504105, + "loss": 0.0825, + "num_input_tokens_seen": 90939600, + "step": 42080 + }, + { + "epoch": 6.865415986949429, + "grad_norm": 0.1693643033504486, + "learning_rate": 0.000830311944291725, + "loss": 0.084, + "num_input_tokens_seen": 90950256, + "step": 42085 + }, + { + "epoch": 6.866231647634584, + "grad_norm": 0.2293866127729416, + "learning_rate": 0.0008302585050388491, + "loss": 0.0804, + "num_input_tokens_seen": 90960656, + "step": 42090 + }, + { + "epoch": 6.867047308319739, + "grad_norm": 0.011171751655638218, + "learning_rate": 0.0008302050590928656, + "loss": 0.0496, + "num_input_tokens_seen": 90971376, + "step": 42095 + }, + { + "epoch": 6.867862969004894, + "grad_norm": 0.01649610511958599, + "learning_rate": 0.0008301516064548577, + "loss": 0.0216, + "num_input_tokens_seen": 90980144, + "step": 42100 + }, + { + "epoch": 6.868678629690049, + "grad_norm": 0.14276473224163055, + "learning_rate": 0.0008300981471259086, + "loss": 0.165, + "num_input_tokens_seen": 90990960, + "step": 42105 + }, + { + "epoch": 6.869494290375204, + "grad_norm": 0.017909109592437744, + "learning_rate": 0.0008300446811071018, + "loss": 0.0642, + "num_input_tokens_seen": 91002864, + "step": 42110 + }, + { + "epoch": 6.870309951060359, + "grad_norm": 0.013232512399554253, + "learning_rate": 0.0008299912083995208, + "loss": 0.0539, + "num_input_tokens_seen": 91013968, + "step": 42115 + }, + { + "epoch": 6.871125611745514, + "grad_norm": 0.01880715787410736, + "learning_rate": 0.0008299377290042493, + "loss": 0.0368, + "num_input_tokens_seen": 91025136, + "step": 42120 + }, + { + "epoch": 6.871941272430669, + "grad_norm": 0.14953921735286713, + "learning_rate": 0.0008298842429223714, + "loss": 0.0699, + "num_input_tokens_seen": 91035184, + "step": 42125 + }, + { + "epoch": 6.872756933115824, + "grad_norm": 0.22939430177211761, + "learning_rate": 0.0008298307501549706, + "loss": 0.069, + "num_input_tokens_seen": 91046544, + "step": 42130 + }, + { + "epoch": 6.873572593800979, + "grad_norm": 0.005941577255725861, + "learning_rate": 0.0008297772507031314, + "loss": 0.0412, + "num_input_tokens_seen": 91057488, + "step": 42135 + }, + { + "epoch": 6.874388254486134, + "grad_norm": 0.23242725431919098, + "learning_rate": 0.0008297237445679378, + "loss": 0.1376, + "num_input_tokens_seen": 91069040, + "step": 42140 + }, + { + "epoch": 6.875203915171289, + "grad_norm": 0.04065469652414322, + "learning_rate": 0.0008296702317504741, + "loss": 0.0106, + "num_input_tokens_seen": 91078352, + "step": 42145 + }, + { + "epoch": 6.876019575856444, + "grad_norm": 0.06341571360826492, + "learning_rate": 0.0008296167122518252, + "loss": 0.1237, + "num_input_tokens_seen": 91089008, + "step": 42150 + }, + { + "epoch": 6.876835236541599, + "grad_norm": 0.058492325246334076, + "learning_rate": 0.0008295631860730752, + "loss": 0.1141, + "num_input_tokens_seen": 91099536, + "step": 42155 + }, + { + "epoch": 6.877650897226753, + "grad_norm": 0.036050185561180115, + "learning_rate": 0.0008295096532153093, + "loss": 0.0745, + "num_input_tokens_seen": 91110160, + "step": 42160 + }, + { + "epoch": 6.878466557911908, + "grad_norm": 0.006050860974937677, + "learning_rate": 0.0008294561136796122, + "loss": 0.0198, + "num_input_tokens_seen": 91121136, + "step": 42165 + }, + { + "epoch": 6.879282218597064, + "grad_norm": 0.10723251849412918, + "learning_rate": 0.000829402567467069, + "loss": 0.0796, + "num_input_tokens_seen": 91131568, + "step": 42170 + }, + { + "epoch": 6.880097879282219, + "grad_norm": 0.040584757924079895, + "learning_rate": 0.000829349014578765, + "loss": 0.0434, + "num_input_tokens_seen": 91142192, + "step": 42175 + }, + { + "epoch": 6.8809135399673735, + "grad_norm": 0.012464815750718117, + "learning_rate": 0.0008292954550157853, + "loss": 0.0365, + "num_input_tokens_seen": 91153424, + "step": 42180 + }, + { + "epoch": 6.881729200652528, + "grad_norm": 0.29857343435287476, + "learning_rate": 0.0008292418887792155, + "loss": 0.1503, + "num_input_tokens_seen": 91163088, + "step": 42185 + }, + { + "epoch": 6.882544861337683, + "grad_norm": 0.013671220280230045, + "learning_rate": 0.0008291883158701413, + "loss": 0.1858, + "num_input_tokens_seen": 91174320, + "step": 42190 + }, + { + "epoch": 6.883360522022839, + "grad_norm": 0.08105769008398056, + "learning_rate": 0.000829134736289648, + "loss": 0.1649, + "num_input_tokens_seen": 91183792, + "step": 42195 + }, + { + "epoch": 6.884176182707994, + "grad_norm": 0.026280393823981285, + "learning_rate": 0.0008290811500388219, + "loss": 0.0126, + "num_input_tokens_seen": 91194608, + "step": 42200 + }, + { + "epoch": 6.8849918433931485, + "grad_norm": 0.056491460651159286, + "learning_rate": 0.0008290275571187488, + "loss": 0.0839, + "num_input_tokens_seen": 91204784, + "step": 42205 + }, + { + "epoch": 6.885807504078303, + "grad_norm": 0.13008785247802734, + "learning_rate": 0.0008289739575305148, + "loss": 0.0596, + "num_input_tokens_seen": 91215152, + "step": 42210 + }, + { + "epoch": 6.886623164763458, + "grad_norm": 0.25845256447792053, + "learning_rate": 0.0008289203512752063, + "loss": 0.1248, + "num_input_tokens_seen": 91225232, + "step": 42215 + }, + { + "epoch": 6.887438825448614, + "grad_norm": 0.12239914387464523, + "learning_rate": 0.0008288667383539097, + "loss": 0.0507, + "num_input_tokens_seen": 91236528, + "step": 42220 + }, + { + "epoch": 6.888254486133769, + "grad_norm": 0.10594190657138824, + "learning_rate": 0.0008288131187677112, + "loss": 0.0668, + "num_input_tokens_seen": 91247600, + "step": 42225 + }, + { + "epoch": 6.8890701468189235, + "grad_norm": 0.11915894597768784, + "learning_rate": 0.000828759492517698, + "loss": 0.1642, + "num_input_tokens_seen": 91257744, + "step": 42230 + }, + { + "epoch": 6.889885807504078, + "grad_norm": 0.03344777226448059, + "learning_rate": 0.0008287058596049563, + "loss": 0.1174, + "num_input_tokens_seen": 91268624, + "step": 42235 + }, + { + "epoch": 6.890701468189233, + "grad_norm": 0.006844738032668829, + "learning_rate": 0.0008286522200305738, + "loss": 0.0567, + "num_input_tokens_seen": 91279984, + "step": 42240 + }, + { + "epoch": 6.891517128874388, + "grad_norm": 0.20053116977214813, + "learning_rate": 0.0008285985737956367, + "loss": 0.0718, + "num_input_tokens_seen": 91290896, + "step": 42245 + }, + { + "epoch": 6.892332789559543, + "grad_norm": 0.026900721713900566, + "learning_rate": 0.0008285449209012328, + "loss": 0.0506, + "num_input_tokens_seen": 91302608, + "step": 42250 + }, + { + "epoch": 6.8931484502446985, + "grad_norm": 0.30847597122192383, + "learning_rate": 0.0008284912613484493, + "loss": 0.1317, + "num_input_tokens_seen": 91313104, + "step": 42255 + }, + { + "epoch": 6.893964110929853, + "grad_norm": 0.12379782646894455, + "learning_rate": 0.0008284375951383738, + "loss": 0.0368, + "num_input_tokens_seen": 91322416, + "step": 42260 + }, + { + "epoch": 6.894779771615008, + "grad_norm": 0.015151728875935078, + "learning_rate": 0.0008283839222720935, + "loss": 0.1379, + "num_input_tokens_seen": 91333328, + "step": 42265 + }, + { + "epoch": 6.895595432300163, + "grad_norm": 0.3223811089992523, + "learning_rate": 0.0008283302427506966, + "loss": 0.0628, + "num_input_tokens_seen": 91345264, + "step": 42270 + }, + { + "epoch": 6.896411092985318, + "grad_norm": 0.01046650018543005, + "learning_rate": 0.0008282765565752708, + "loss": 0.1245, + "num_input_tokens_seen": 91356528, + "step": 42275 + }, + { + "epoch": 6.897226753670473, + "grad_norm": 0.13600951433181763, + "learning_rate": 0.0008282228637469042, + "loss": 0.1784, + "num_input_tokens_seen": 91365904, + "step": 42280 + }, + { + "epoch": 6.898042414355628, + "grad_norm": 0.11428900063037872, + "learning_rate": 0.0008281691642666848, + "loss": 0.0718, + "num_input_tokens_seen": 91376528, + "step": 42285 + }, + { + "epoch": 6.898858075040783, + "grad_norm": 0.02441113069653511, + "learning_rate": 0.000828115458135701, + "loss": 0.1084, + "num_input_tokens_seen": 91387376, + "step": 42290 + }, + { + "epoch": 6.899673735725938, + "grad_norm": 0.05587480589747429, + "learning_rate": 0.0008280617453550412, + "loss": 0.0298, + "num_input_tokens_seen": 91398160, + "step": 42295 + }, + { + "epoch": 6.900489396411093, + "grad_norm": 0.1454855501651764, + "learning_rate": 0.0008280080259257939, + "loss": 0.2052, + "num_input_tokens_seen": 91409872, + "step": 42300 + }, + { + "epoch": 6.901305057096248, + "grad_norm": 0.046002261340618134, + "learning_rate": 0.0008279542998490479, + "loss": 0.0353, + "num_input_tokens_seen": 91420752, + "step": 42305 + }, + { + "epoch": 6.902120717781403, + "grad_norm": 0.010383290238678455, + "learning_rate": 0.000827900567125892, + "loss": 0.1391, + "num_input_tokens_seen": 91431632, + "step": 42310 + }, + { + "epoch": 6.902936378466558, + "grad_norm": 0.005426608491688967, + "learning_rate": 0.0008278468277574152, + "loss": 0.0978, + "num_input_tokens_seen": 91441488, + "step": 42315 + }, + { + "epoch": 6.903752039151713, + "grad_norm": 0.1538834422826767, + "learning_rate": 0.0008277930817447063, + "loss": 0.0675, + "num_input_tokens_seen": 91453040, + "step": 42320 + }, + { + "epoch": 6.904567699836868, + "grad_norm": 0.06816184520721436, + "learning_rate": 0.000827739329088855, + "loss": 0.0395, + "num_input_tokens_seen": 91463952, + "step": 42325 + }, + { + "epoch": 6.9053833605220225, + "grad_norm": 0.018162427470088005, + "learning_rate": 0.0008276855697909502, + "loss": 0.058, + "num_input_tokens_seen": 91475216, + "step": 42330 + }, + { + "epoch": 6.906199021207177, + "grad_norm": 0.26269209384918213, + "learning_rate": 0.0008276318038520818, + "loss": 0.0946, + "num_input_tokens_seen": 91486160, + "step": 42335 + }, + { + "epoch": 6.907014681892333, + "grad_norm": 0.041546259075403214, + "learning_rate": 0.0008275780312733392, + "loss": 0.1047, + "num_input_tokens_seen": 91496208, + "step": 42340 + }, + { + "epoch": 6.907830342577488, + "grad_norm": 0.037615060806274414, + "learning_rate": 0.0008275242520558124, + "loss": 0.0692, + "num_input_tokens_seen": 91505520, + "step": 42345 + }, + { + "epoch": 6.908646003262643, + "grad_norm": 0.03843872621655464, + "learning_rate": 0.000827470466200591, + "loss": 0.03, + "num_input_tokens_seen": 91515312, + "step": 42350 + }, + { + "epoch": 6.9094616639477975, + "grad_norm": 0.023156536743044853, + "learning_rate": 0.0008274166737087652, + "loss": 0.4206, + "num_input_tokens_seen": 91525808, + "step": 42355 + }, + { + "epoch": 6.910277324632952, + "grad_norm": 0.04622017592191696, + "learning_rate": 0.000827362874581425, + "loss": 0.0939, + "num_input_tokens_seen": 91536016, + "step": 42360 + }, + { + "epoch": 6.911092985318108, + "grad_norm": 0.08515045791864395, + "learning_rate": 0.000827309068819661, + "loss": 0.135, + "num_input_tokens_seen": 91546704, + "step": 42365 + }, + { + "epoch": 6.911908646003263, + "grad_norm": 0.034274887293577194, + "learning_rate": 0.0008272552564245635, + "loss": 0.068, + "num_input_tokens_seen": 91557552, + "step": 42370 + }, + { + "epoch": 6.912724306688418, + "grad_norm": 0.22742615640163422, + "learning_rate": 0.000827201437397223, + "loss": 0.0693, + "num_input_tokens_seen": 91568080, + "step": 42375 + }, + { + "epoch": 6.9135399673735725, + "grad_norm": 0.06458600610494614, + "learning_rate": 0.0008271476117387303, + "loss": 0.0532, + "num_input_tokens_seen": 91578384, + "step": 42380 + }, + { + "epoch": 6.914355628058727, + "grad_norm": 0.20055918395519257, + "learning_rate": 0.0008270937794501763, + "loss": 0.1223, + "num_input_tokens_seen": 91589936, + "step": 42385 + }, + { + "epoch": 6.915171288743883, + "grad_norm": 0.07637903094291687, + "learning_rate": 0.0008270399405326519, + "loss": 0.0543, + "num_input_tokens_seen": 91601392, + "step": 42390 + }, + { + "epoch": 6.915986949429038, + "grad_norm": 0.13277378678321838, + "learning_rate": 0.0008269860949872484, + "loss": 0.1121, + "num_input_tokens_seen": 91613808, + "step": 42395 + }, + { + "epoch": 6.916802610114193, + "grad_norm": 0.1605086475610733, + "learning_rate": 0.0008269322428150565, + "loss": 0.1092, + "num_input_tokens_seen": 91624400, + "step": 42400 + }, + { + "epoch": 6.917618270799347, + "grad_norm": 0.05308017507195473, + "learning_rate": 0.0008268783840171682, + "loss": 0.081, + "num_input_tokens_seen": 91635888, + "step": 42405 + }, + { + "epoch": 6.918433931484502, + "grad_norm": 0.027449732646346092, + "learning_rate": 0.0008268245185946748, + "loss": 0.0868, + "num_input_tokens_seen": 91645488, + "step": 42410 + }, + { + "epoch": 6.919249592169657, + "grad_norm": 0.03316226229071617, + "learning_rate": 0.0008267706465486677, + "loss": 0.0518, + "num_input_tokens_seen": 91655632, + "step": 42415 + }, + { + "epoch": 6.920065252854813, + "grad_norm": 0.10044834017753601, + "learning_rate": 0.000826716767880239, + "loss": 0.0434, + "num_input_tokens_seen": 91666608, + "step": 42420 + }, + { + "epoch": 6.920880913539968, + "grad_norm": 0.33315449953079224, + "learning_rate": 0.0008266628825904807, + "loss": 0.1557, + "num_input_tokens_seen": 91676656, + "step": 42425 + }, + { + "epoch": 6.921696574225122, + "grad_norm": 0.21813486516475677, + "learning_rate": 0.0008266089906804845, + "loss": 0.2951, + "num_input_tokens_seen": 91688240, + "step": 42430 + }, + { + "epoch": 6.922512234910277, + "grad_norm": 0.002060960978269577, + "learning_rate": 0.0008265550921513428, + "loss": 0.1761, + "num_input_tokens_seen": 91699632, + "step": 42435 + }, + { + "epoch": 6.923327895595432, + "grad_norm": 0.12456963211297989, + "learning_rate": 0.000826501187004148, + "loss": 0.0603, + "num_input_tokens_seen": 91709744, + "step": 42440 + }, + { + "epoch": 6.924143556280587, + "grad_norm": 0.003388361306861043, + "learning_rate": 0.0008264472752399923, + "loss": 0.105, + "num_input_tokens_seen": 91719472, + "step": 42445 + }, + { + "epoch": 6.924959216965743, + "grad_norm": 0.4648321568965912, + "learning_rate": 0.0008263933568599687, + "loss": 0.298, + "num_input_tokens_seen": 91729776, + "step": 42450 + }, + { + "epoch": 6.925774877650897, + "grad_norm": 0.19051282107830048, + "learning_rate": 0.0008263394318651693, + "loss": 0.1332, + "num_input_tokens_seen": 91740176, + "step": 42455 + }, + { + "epoch": 6.926590538336052, + "grad_norm": 0.07513487339019775, + "learning_rate": 0.0008262855002566876, + "loss": 0.0346, + "num_input_tokens_seen": 91750288, + "step": 42460 + }, + { + "epoch": 6.927406199021207, + "grad_norm": 0.0671396404504776, + "learning_rate": 0.0008262315620356163, + "loss": 0.1645, + "num_input_tokens_seen": 91762000, + "step": 42465 + }, + { + "epoch": 6.928221859706362, + "grad_norm": 0.04479534178972244, + "learning_rate": 0.0008261776172030484, + "loss": 0.049, + "num_input_tokens_seen": 91771760, + "step": 42470 + }, + { + "epoch": 6.9290375203915175, + "grad_norm": 0.007520393934100866, + "learning_rate": 0.0008261236657600773, + "loss": 0.075, + "num_input_tokens_seen": 91781936, + "step": 42475 + }, + { + "epoch": 6.929853181076672, + "grad_norm": 0.019585467875003815, + "learning_rate": 0.0008260697077077964, + "loss": 0.1323, + "num_input_tokens_seen": 91793264, + "step": 42480 + }, + { + "epoch": 6.930668841761827, + "grad_norm": 0.13702279329299927, + "learning_rate": 0.0008260157430472992, + "loss": 0.1284, + "num_input_tokens_seen": 91803824, + "step": 42485 + }, + { + "epoch": 6.931484502446982, + "grad_norm": 0.01728702522814274, + "learning_rate": 0.0008259617717796795, + "loss": 0.1255, + "num_input_tokens_seen": 91814384, + "step": 42490 + }, + { + "epoch": 6.932300163132137, + "grad_norm": 0.042796917259693146, + "learning_rate": 0.0008259077939060309, + "loss": 0.1423, + "num_input_tokens_seen": 91823920, + "step": 42495 + }, + { + "epoch": 6.933115823817292, + "grad_norm": 0.02835630439221859, + "learning_rate": 0.0008258538094274475, + "loss": 0.0458, + "num_input_tokens_seen": 91834640, + "step": 42500 + }, + { + "epoch": 6.933931484502447, + "grad_norm": 0.0981273278594017, + "learning_rate": 0.0008257998183450233, + "loss": 0.0381, + "num_input_tokens_seen": 91845968, + "step": 42505 + }, + { + "epoch": 6.934747145187602, + "grad_norm": 0.02087876945734024, + "learning_rate": 0.0008257458206598524, + "loss": 0.1194, + "num_input_tokens_seen": 91856464, + "step": 42510 + }, + { + "epoch": 6.935562805872757, + "grad_norm": 0.04924190044403076, + "learning_rate": 0.0008256918163730291, + "loss": 0.0882, + "num_input_tokens_seen": 91867984, + "step": 42515 + }, + { + "epoch": 6.936378466557912, + "grad_norm": 0.23019491136074066, + "learning_rate": 0.0008256378054856482, + "loss": 0.1184, + "num_input_tokens_seen": 91878576, + "step": 42520 + }, + { + "epoch": 6.937194127243067, + "grad_norm": 0.025902308523654938, + "learning_rate": 0.000825583787998804, + "loss": 0.1507, + "num_input_tokens_seen": 91888176, + "step": 42525 + }, + { + "epoch": 6.938009787928221, + "grad_norm": 0.02752247266471386, + "learning_rate": 0.0008255297639135912, + "loss": 0.176, + "num_input_tokens_seen": 91901104, + "step": 42530 + }, + { + "epoch": 6.938825448613377, + "grad_norm": 0.1430131047964096, + "learning_rate": 0.000825475733231105, + "loss": 0.05, + "num_input_tokens_seen": 91911920, + "step": 42535 + }, + { + "epoch": 6.939641109298532, + "grad_norm": 0.1026284396648407, + "learning_rate": 0.0008254216959524399, + "loss": 0.1036, + "num_input_tokens_seen": 91922512, + "step": 42540 + }, + { + "epoch": 6.940456769983687, + "grad_norm": 0.035261936485767365, + "learning_rate": 0.0008253676520786914, + "loss": 0.0472, + "num_input_tokens_seen": 91934224, + "step": 42545 + }, + { + "epoch": 6.941272430668842, + "grad_norm": 0.02183767594397068, + "learning_rate": 0.0008253136016109547, + "loss": 0.0493, + "num_input_tokens_seen": 91944592, + "step": 42550 + }, + { + "epoch": 6.942088091353996, + "grad_norm": 0.09058975428342819, + "learning_rate": 0.0008252595445503253, + "loss": 0.0245, + "num_input_tokens_seen": 91955024, + "step": 42555 + }, + { + "epoch": 6.942903752039152, + "grad_norm": 0.1628194898366928, + "learning_rate": 0.0008252054808978984, + "loss": 0.0704, + "num_input_tokens_seen": 91963696, + "step": 42560 + }, + { + "epoch": 6.943719412724307, + "grad_norm": 0.07157375663518906, + "learning_rate": 0.0008251514106547698, + "loss": 0.0774, + "num_input_tokens_seen": 91973936, + "step": 42565 + }, + { + "epoch": 6.944535073409462, + "grad_norm": 0.06799112260341644, + "learning_rate": 0.0008250973338220356, + "loss": 0.1247, + "num_input_tokens_seen": 91984080, + "step": 42570 + }, + { + "epoch": 6.945350734094617, + "grad_norm": 0.06843625754117966, + "learning_rate": 0.0008250432504007914, + "loss": 0.0725, + "num_input_tokens_seen": 91994800, + "step": 42575 + }, + { + "epoch": 6.946166394779771, + "grad_norm": 0.07280784845352173, + "learning_rate": 0.0008249891603921334, + "loss": 0.0485, + "num_input_tokens_seen": 92006160, + "step": 42580 + }, + { + "epoch": 6.946982055464927, + "grad_norm": 0.016119126230478287, + "learning_rate": 0.0008249350637971577, + "loss": 0.0951, + "num_input_tokens_seen": 92017520, + "step": 42585 + }, + { + "epoch": 6.947797716150082, + "grad_norm": 0.26134583353996277, + "learning_rate": 0.0008248809606169609, + "loss": 0.1383, + "num_input_tokens_seen": 92028400, + "step": 42590 + }, + { + "epoch": 6.948613376835237, + "grad_norm": 0.30048561096191406, + "learning_rate": 0.0008248268508526393, + "loss": 0.0728, + "num_input_tokens_seen": 92040368, + "step": 42595 + }, + { + "epoch": 6.9494290375203915, + "grad_norm": 0.0028530319686979055, + "learning_rate": 0.0008247727345052894, + "loss": 0.054, + "num_input_tokens_seen": 92049968, + "step": 42600 + }, + { + "epoch": 6.950244698205546, + "grad_norm": 0.08610218018293381, + "learning_rate": 0.000824718611576008, + "loss": 0.0521, + "num_input_tokens_seen": 92059536, + "step": 42605 + }, + { + "epoch": 6.951060358890701, + "grad_norm": 0.01567983254790306, + "learning_rate": 0.0008246644820658922, + "loss": 0.0264, + "num_input_tokens_seen": 92070352, + "step": 42610 + }, + { + "epoch": 6.951876019575856, + "grad_norm": 0.022035297006368637, + "learning_rate": 0.0008246103459760385, + "loss": 0.054, + "num_input_tokens_seen": 92081584, + "step": 42615 + }, + { + "epoch": 6.952691680261012, + "grad_norm": 0.2599090337753296, + "learning_rate": 0.0008245562033075446, + "loss": 0.1472, + "num_input_tokens_seen": 92092880, + "step": 42620 + }, + { + "epoch": 6.9535073409461665, + "grad_norm": 0.0031842731405049562, + "learning_rate": 0.0008245020540615074, + "loss": 0.0356, + "num_input_tokens_seen": 92104368, + "step": 42625 + }, + { + "epoch": 6.954323001631321, + "grad_norm": 0.11851377040147781, + "learning_rate": 0.0008244478982390245, + "loss": 0.141, + "num_input_tokens_seen": 92115152, + "step": 42630 + }, + { + "epoch": 6.955138662316476, + "grad_norm": 0.03890161216259003, + "learning_rate": 0.0008243937358411933, + "loss": 0.1503, + "num_input_tokens_seen": 92126352, + "step": 42635 + }, + { + "epoch": 6.955954323001631, + "grad_norm": 0.012479268014431, + "learning_rate": 0.0008243395668691113, + "loss": 0.0521, + "num_input_tokens_seen": 92137488, + "step": 42640 + }, + { + "epoch": 6.956769983686787, + "grad_norm": 0.21884118020534515, + "learning_rate": 0.0008242853913238769, + "loss": 0.1481, + "num_input_tokens_seen": 92148976, + "step": 42645 + }, + { + "epoch": 6.9575856443719415, + "grad_norm": 0.044424448162317276, + "learning_rate": 0.0008242312092065873, + "loss": 0.1143, + "num_input_tokens_seen": 92158896, + "step": 42650 + }, + { + "epoch": 6.958401305057096, + "grad_norm": 0.008710110560059547, + "learning_rate": 0.0008241770205183412, + "loss": 0.0932, + "num_input_tokens_seen": 92169552, + "step": 42655 + }, + { + "epoch": 6.959216965742251, + "grad_norm": 0.12920400500297546, + "learning_rate": 0.0008241228252602364, + "loss": 0.0838, + "num_input_tokens_seen": 92179792, + "step": 42660 + }, + { + "epoch": 6.960032626427406, + "grad_norm": 0.11128882318735123, + "learning_rate": 0.0008240686234333714, + "loss": 0.0433, + "num_input_tokens_seen": 92190416, + "step": 42665 + }, + { + "epoch": 6.960848287112562, + "grad_norm": 0.11121262609958649, + "learning_rate": 0.0008240144150388446, + "loss": 0.0997, + "num_input_tokens_seen": 92201456, + "step": 42670 + }, + { + "epoch": 6.9616639477977165, + "grad_norm": 0.019484156742691994, + "learning_rate": 0.0008239602000777548, + "loss": 0.1558, + "num_input_tokens_seen": 92213072, + "step": 42675 + }, + { + "epoch": 6.962479608482871, + "grad_norm": 0.01814747042953968, + "learning_rate": 0.0008239059785512005, + "loss": 0.0192, + "num_input_tokens_seen": 92223632, + "step": 42680 + }, + { + "epoch": 6.963295269168026, + "grad_norm": 0.33034294843673706, + "learning_rate": 0.0008238517504602805, + "loss": 0.0463, + "num_input_tokens_seen": 92234960, + "step": 42685 + }, + { + "epoch": 6.964110929853181, + "grad_norm": 0.026794755831360817, + "learning_rate": 0.0008237975158060939, + "loss": 0.0358, + "num_input_tokens_seen": 92245712, + "step": 42690 + }, + { + "epoch": 6.964926590538336, + "grad_norm": 0.22729530930519104, + "learning_rate": 0.0008237432745897402, + "loss": 0.0682, + "num_input_tokens_seen": 92256784, + "step": 42695 + }, + { + "epoch": 6.9657422512234906, + "grad_norm": 0.009565351530909538, + "learning_rate": 0.000823689026812318, + "loss": 0.0174, + "num_input_tokens_seen": 92265936, + "step": 42700 + }, + { + "epoch": 6.966557911908646, + "grad_norm": 0.05226115137338638, + "learning_rate": 0.0008236347724749274, + "loss": 0.2109, + "num_input_tokens_seen": 92276464, + "step": 42705 + }, + { + "epoch": 6.967373572593801, + "grad_norm": 0.2930293083190918, + "learning_rate": 0.0008235805115786672, + "loss": 0.1611, + "num_input_tokens_seen": 92287664, + "step": 42710 + }, + { + "epoch": 6.968189233278956, + "grad_norm": 0.1569661796092987, + "learning_rate": 0.0008235262441246376, + "loss": 0.2314, + "num_input_tokens_seen": 92296976, + "step": 42715 + }, + { + "epoch": 6.969004893964111, + "grad_norm": 0.2670922875404358, + "learning_rate": 0.0008234719701139384, + "loss": 0.0767, + "num_input_tokens_seen": 92307184, + "step": 42720 + }, + { + "epoch": 6.9698205546492655, + "grad_norm": 0.004752719309180975, + "learning_rate": 0.0008234176895476692, + "loss": 0.0497, + "num_input_tokens_seen": 92318128, + "step": 42725 + }, + { + "epoch": 6.970636215334421, + "grad_norm": 0.012683387845754623, + "learning_rate": 0.0008233634024269302, + "loss": 0.0872, + "num_input_tokens_seen": 92329744, + "step": 42730 + }, + { + "epoch": 6.971451876019576, + "grad_norm": 0.032268162816762924, + "learning_rate": 0.0008233091087528217, + "loss": 0.098, + "num_input_tokens_seen": 92340720, + "step": 42735 + }, + { + "epoch": 6.972267536704731, + "grad_norm": 0.19561608135700226, + "learning_rate": 0.000823254808526444, + "loss": 0.0382, + "num_input_tokens_seen": 92351248, + "step": 42740 + }, + { + "epoch": 6.973083197389886, + "grad_norm": 0.021456921473145485, + "learning_rate": 0.0008232005017488975, + "loss": 0.0165, + "num_input_tokens_seen": 92361680, + "step": 42745 + }, + { + "epoch": 6.9738988580750405, + "grad_norm": 0.11003857105970383, + "learning_rate": 0.0008231461884212828, + "loss": 0.0841, + "num_input_tokens_seen": 92373616, + "step": 42750 + }, + { + "epoch": 6.974714518760196, + "grad_norm": 0.07811323553323746, + "learning_rate": 0.0008230918685447006, + "loss": 0.0345, + "num_input_tokens_seen": 92384464, + "step": 42755 + }, + { + "epoch": 6.975530179445351, + "grad_norm": 0.02601797506213188, + "learning_rate": 0.000823037542120252, + "loss": 0.0467, + "num_input_tokens_seen": 92395984, + "step": 42760 + }, + { + "epoch": 6.976345840130506, + "grad_norm": 0.01498804334551096, + "learning_rate": 0.0008229832091490377, + "loss": 0.1297, + "num_input_tokens_seen": 92407120, + "step": 42765 + }, + { + "epoch": 6.977161500815661, + "grad_norm": 0.09473436325788498, + "learning_rate": 0.0008229288696321588, + "loss": 0.0315, + "num_input_tokens_seen": 92417296, + "step": 42770 + }, + { + "epoch": 6.9779771615008155, + "grad_norm": 0.05961019545793533, + "learning_rate": 0.0008228745235707169, + "loss": 0.0963, + "num_input_tokens_seen": 92428240, + "step": 42775 + }, + { + "epoch": 6.97879282218597, + "grad_norm": 0.012499609962105751, + "learning_rate": 0.000822820170965813, + "loss": 0.2952, + "num_input_tokens_seen": 92438864, + "step": 42780 + }, + { + "epoch": 6.979608482871125, + "grad_norm": 0.03288532793521881, + "learning_rate": 0.0008227658118185491, + "loss": 0.1119, + "num_input_tokens_seen": 92449424, + "step": 42785 + }, + { + "epoch": 6.980424143556281, + "grad_norm": 0.16200962662696838, + "learning_rate": 0.0008227114461300262, + "loss": 0.092, + "num_input_tokens_seen": 92460016, + "step": 42790 + }, + { + "epoch": 6.981239804241436, + "grad_norm": 0.03325765207409859, + "learning_rate": 0.0008226570739013466, + "loss": 0.0655, + "num_input_tokens_seen": 92472336, + "step": 42795 + }, + { + "epoch": 6.9820554649265905, + "grad_norm": 0.06443525105714798, + "learning_rate": 0.0008226026951336121, + "loss": 0.1035, + "num_input_tokens_seen": 92482576, + "step": 42800 + }, + { + "epoch": 6.982871125611745, + "grad_norm": 0.09831217676401138, + "learning_rate": 0.0008225483098279247, + "loss": 0.0297, + "num_input_tokens_seen": 92494160, + "step": 42805 + }, + { + "epoch": 6.9836867862969, + "grad_norm": 0.061349742114543915, + "learning_rate": 0.0008224939179853868, + "loss": 0.0775, + "num_input_tokens_seen": 92504432, + "step": 42810 + }, + { + "epoch": 6.984502446982056, + "grad_norm": 0.01365916058421135, + "learning_rate": 0.0008224395196071003, + "loss": 0.0592, + "num_input_tokens_seen": 92514576, + "step": 42815 + }, + { + "epoch": 6.985318107667211, + "grad_norm": 0.06067565083503723, + "learning_rate": 0.000822385114694168, + "loss": 0.0316, + "num_input_tokens_seen": 92525648, + "step": 42820 + }, + { + "epoch": 6.986133768352365, + "grad_norm": 0.0380895771086216, + "learning_rate": 0.0008223307032476923, + "loss": 0.0947, + "num_input_tokens_seen": 92535248, + "step": 42825 + }, + { + "epoch": 6.98694942903752, + "grad_norm": 0.008909545838832855, + "learning_rate": 0.0008222762852687762, + "loss": 0.0511, + "num_input_tokens_seen": 92546800, + "step": 42830 + }, + { + "epoch": 6.987765089722675, + "grad_norm": 0.04149286076426506, + "learning_rate": 0.0008222218607585221, + "loss": 0.056, + "num_input_tokens_seen": 92557584, + "step": 42835 + }, + { + "epoch": 6.988580750407831, + "grad_norm": 0.02532074600458145, + "learning_rate": 0.0008221674297180334, + "loss": 0.0636, + "num_input_tokens_seen": 92567184, + "step": 42840 + }, + { + "epoch": 6.989396411092986, + "grad_norm": 0.28405627608299255, + "learning_rate": 0.000822112992148413, + "loss": 0.0643, + "num_input_tokens_seen": 92576880, + "step": 42845 + }, + { + "epoch": 6.99021207177814, + "grad_norm": 0.08372216671705246, + "learning_rate": 0.000822058548050764, + "loss": 0.1165, + "num_input_tokens_seen": 92588368, + "step": 42850 + }, + { + "epoch": 6.991027732463295, + "grad_norm": 0.01633065938949585, + "learning_rate": 0.0008220040974261901, + "loss": 0.0704, + "num_input_tokens_seen": 92598992, + "step": 42855 + }, + { + "epoch": 6.99184339314845, + "grad_norm": 0.004168565850704908, + "learning_rate": 0.0008219496402757948, + "loss": 0.0553, + "num_input_tokens_seen": 92609904, + "step": 42860 + }, + { + "epoch": 6.992659053833605, + "grad_norm": 0.030385682359337807, + "learning_rate": 0.0008218951766006815, + "loss": 0.0593, + "num_input_tokens_seen": 92621360, + "step": 42865 + }, + { + "epoch": 6.993474714518761, + "grad_norm": 0.044751256704330444, + "learning_rate": 0.0008218407064019541, + "loss": 0.1026, + "num_input_tokens_seen": 92631312, + "step": 42870 + }, + { + "epoch": 6.994290375203915, + "grad_norm": 0.004565827082842588, + "learning_rate": 0.0008217862296807165, + "loss": 0.0222, + "num_input_tokens_seen": 92641584, + "step": 42875 + }, + { + "epoch": 6.99510603588907, + "grad_norm": 0.019839083775877953, + "learning_rate": 0.0008217317464380727, + "loss": 0.1736, + "num_input_tokens_seen": 92651824, + "step": 42880 + }, + { + "epoch": 6.995921696574225, + "grad_norm": 0.013745338656008244, + "learning_rate": 0.0008216772566751269, + "loss": 0.0456, + "num_input_tokens_seen": 92662384, + "step": 42885 + }, + { + "epoch": 6.99673735725938, + "grad_norm": 0.012423294596374035, + "learning_rate": 0.0008216227603929835, + "loss": 0.1445, + "num_input_tokens_seen": 92673008, + "step": 42890 + }, + { + "epoch": 6.997553017944535, + "grad_norm": 0.1849156767129898, + "learning_rate": 0.0008215682575927468, + "loss": 0.056, + "num_input_tokens_seen": 92683568, + "step": 42895 + }, + { + "epoch": 6.99836867862969, + "grad_norm": 0.08922215551137924, + "learning_rate": 0.0008215137482755215, + "loss": 0.0186, + "num_input_tokens_seen": 92693392, + "step": 42900 + }, + { + "epoch": 6.999184339314845, + "grad_norm": 0.04565683752298355, + "learning_rate": 0.0008214592324424122, + "loss": 0.0873, + "num_input_tokens_seen": 92704240, + "step": 42905 + }, + { + "epoch": 7.0, + "grad_norm": 0.013504397124052048, + "learning_rate": 0.0008214047100945236, + "loss": 0.1325, + "num_input_tokens_seen": 92713360, + "step": 42910 + }, + { + "epoch": 7.0, + "eval_loss": 0.13006359338760376, + "eval_runtime": 103.5374, + "eval_samples_per_second": 26.319, + "eval_steps_per_second": 6.587, + "num_input_tokens_seen": 92713360, + "step": 42910 + }, + { + "epoch": 7.000815660685155, + "grad_norm": 0.3289978504180908, + "learning_rate": 0.0008213501812329609, + "loss": 0.1642, + "num_input_tokens_seen": 92724208, + "step": 42915 + }, + { + "epoch": 7.00163132137031, + "grad_norm": 0.3054993748664856, + "learning_rate": 0.0008212956458588292, + "loss": 0.1031, + "num_input_tokens_seen": 92735600, + "step": 42920 + }, + { + "epoch": 7.002446982055465, + "grad_norm": 0.027686649933457375, + "learning_rate": 0.0008212411039732336, + "loss": 0.1013, + "num_input_tokens_seen": 92746544, + "step": 42925 + }, + { + "epoch": 7.00326264274062, + "grad_norm": 0.08517606556415558, + "learning_rate": 0.0008211865555772795, + "loss": 0.0472, + "num_input_tokens_seen": 92756912, + "step": 42930 + }, + { + "epoch": 7.004078303425775, + "grad_norm": 0.006800774950534105, + "learning_rate": 0.0008211320006720723, + "loss": 0.0581, + "num_input_tokens_seen": 92767504, + "step": 42935 + }, + { + "epoch": 7.00489396411093, + "grad_norm": 0.02842605859041214, + "learning_rate": 0.000821077439258718, + "loss": 0.0366, + "num_input_tokens_seen": 92778064, + "step": 42940 + }, + { + "epoch": 7.005709624796085, + "grad_norm": 0.005044872872531414, + "learning_rate": 0.0008210228713383218, + "loss": 0.0951, + "num_input_tokens_seen": 92788784, + "step": 42945 + }, + { + "epoch": 7.006525285481239, + "grad_norm": 0.06589864194393158, + "learning_rate": 0.00082096829691199, + "loss": 0.1049, + "num_input_tokens_seen": 92800144, + "step": 42950 + }, + { + "epoch": 7.007340946166395, + "grad_norm": 0.11859538406133652, + "learning_rate": 0.0008209137159808284, + "loss": 0.0534, + "num_input_tokens_seen": 92809744, + "step": 42955 + }, + { + "epoch": 7.00815660685155, + "grad_norm": 0.0624762699007988, + "learning_rate": 0.0008208591285459434, + "loss": 0.0615, + "num_input_tokens_seen": 92820240, + "step": 42960 + }, + { + "epoch": 7.008972267536705, + "grad_norm": 0.35461515188217163, + "learning_rate": 0.0008208045346084409, + "loss": 0.2508, + "num_input_tokens_seen": 92831984, + "step": 42965 + }, + { + "epoch": 7.00978792822186, + "grad_norm": 0.18568821251392365, + "learning_rate": 0.0008207499341694278, + "loss": 0.1936, + "num_input_tokens_seen": 92842928, + "step": 42970 + }, + { + "epoch": 7.010603588907014, + "grad_norm": 0.09701191633939743, + "learning_rate": 0.0008206953272300102, + "loss": 0.0443, + "num_input_tokens_seen": 92853776, + "step": 42975 + }, + { + "epoch": 7.011419249592169, + "grad_norm": 0.010077468119561672, + "learning_rate": 0.000820640713791295, + "loss": 0.0585, + "num_input_tokens_seen": 92863856, + "step": 42980 + }, + { + "epoch": 7.012234910277325, + "grad_norm": 0.43398287892341614, + "learning_rate": 0.000820586093854389, + "loss": 0.129, + "num_input_tokens_seen": 92874992, + "step": 42985 + }, + { + "epoch": 7.01305057096248, + "grad_norm": 0.028587957844138145, + "learning_rate": 0.0008205314674203989, + "loss": 0.0651, + "num_input_tokens_seen": 92886704, + "step": 42990 + }, + { + "epoch": 7.013866231647635, + "grad_norm": 0.03041784279048443, + "learning_rate": 0.0008204768344904323, + "loss": 0.0336, + "num_input_tokens_seen": 92897328, + "step": 42995 + }, + { + "epoch": 7.014681892332789, + "grad_norm": 0.008256292901933193, + "learning_rate": 0.0008204221950655959, + "loss": 0.0472, + "num_input_tokens_seen": 92909616, + "step": 43000 + }, + { + "epoch": 7.015497553017944, + "grad_norm": 0.028076890856027603, + "learning_rate": 0.0008203675491469973, + "loss": 0.123, + "num_input_tokens_seen": 92920976, + "step": 43005 + }, + { + "epoch": 7.0163132137031, + "grad_norm": 0.0032557565718889236, + "learning_rate": 0.0008203128967357438, + "loss": 0.0525, + "num_input_tokens_seen": 92933200, + "step": 43010 + }, + { + "epoch": 7.017128874388255, + "grad_norm": 0.006004045717418194, + "learning_rate": 0.0008202582378329433, + "loss": 0.0474, + "num_input_tokens_seen": 92944688, + "step": 43015 + }, + { + "epoch": 7.0179445350734095, + "grad_norm": 0.025199543684720993, + "learning_rate": 0.0008202035724397032, + "loss": 0.0692, + "num_input_tokens_seen": 92954704, + "step": 43020 + }, + { + "epoch": 7.018760195758564, + "grad_norm": 0.008000586181879044, + "learning_rate": 0.0008201489005571316, + "loss": 0.0496, + "num_input_tokens_seen": 92966096, + "step": 43025 + }, + { + "epoch": 7.019575856443719, + "grad_norm": 0.17397743463516235, + "learning_rate": 0.0008200942221863363, + "loss": 0.0542, + "num_input_tokens_seen": 92977360, + "step": 43030 + }, + { + "epoch": 7.020391517128874, + "grad_norm": 0.0518217608332634, + "learning_rate": 0.0008200395373284255, + "loss": 0.0628, + "num_input_tokens_seen": 92988400, + "step": 43035 + }, + { + "epoch": 7.02120717781403, + "grad_norm": 0.02842766046524048, + "learning_rate": 0.0008199848459845077, + "loss": 0.0218, + "num_input_tokens_seen": 92998480, + "step": 43040 + }, + { + "epoch": 7.0220228384991845, + "grad_norm": 0.1324327141046524, + "learning_rate": 0.0008199301481556907, + "loss": 0.0617, + "num_input_tokens_seen": 93009616, + "step": 43045 + }, + { + "epoch": 7.022838499184339, + "grad_norm": 0.07451920211315155, + "learning_rate": 0.0008198754438430836, + "loss": 0.0133, + "num_input_tokens_seen": 93019312, + "step": 43050 + }, + { + "epoch": 7.023654159869494, + "grad_norm": 0.12650872766971588, + "learning_rate": 0.000819820733047795, + "loss": 0.0332, + "num_input_tokens_seen": 93030160, + "step": 43055 + }, + { + "epoch": 7.024469820554649, + "grad_norm": 0.004953624680638313, + "learning_rate": 0.0008197660157709333, + "loss": 0.0893, + "num_input_tokens_seen": 93040112, + "step": 43060 + }, + { + "epoch": 7.025285481239805, + "grad_norm": 0.40456417202949524, + "learning_rate": 0.0008197112920136076, + "loss": 0.0968, + "num_input_tokens_seen": 93051760, + "step": 43065 + }, + { + "epoch": 7.0261011419249595, + "grad_norm": 0.2973378598690033, + "learning_rate": 0.000819656561776927, + "loss": 0.0515, + "num_input_tokens_seen": 93063024, + "step": 43070 + }, + { + "epoch": 7.026916802610114, + "grad_norm": 0.14938867092132568, + "learning_rate": 0.0008196018250620008, + "loss": 0.081, + "num_input_tokens_seen": 93073008, + "step": 43075 + }, + { + "epoch": 7.027732463295269, + "grad_norm": 0.22964538633823395, + "learning_rate": 0.0008195470818699381, + "loss": 0.1137, + "num_input_tokens_seen": 93083664, + "step": 43080 + }, + { + "epoch": 7.028548123980424, + "grad_norm": 0.28837212920188904, + "learning_rate": 0.0008194923322018484, + "loss": 0.1966, + "num_input_tokens_seen": 93092880, + "step": 43085 + }, + { + "epoch": 7.029363784665579, + "grad_norm": 0.2682326138019562, + "learning_rate": 0.0008194375760588413, + "loss": 0.1459, + "num_input_tokens_seen": 93103504, + "step": 43090 + }, + { + "epoch": 7.0301794453507345, + "grad_norm": 0.04698742553591728, + "learning_rate": 0.0008193828134420265, + "loss": 0.1205, + "num_input_tokens_seen": 93116016, + "step": 43095 + }, + { + "epoch": 7.030995106035889, + "grad_norm": 0.007682493422180414, + "learning_rate": 0.0008193280443525138, + "loss": 0.0109, + "num_input_tokens_seen": 93128048, + "step": 43100 + }, + { + "epoch": 7.031810766721044, + "grad_norm": 0.13157765567302704, + "learning_rate": 0.0008192732687914131, + "loss": 0.0206, + "num_input_tokens_seen": 93139440, + "step": 43105 + }, + { + "epoch": 7.032626427406199, + "grad_norm": 0.20668326318264008, + "learning_rate": 0.0008192184867598347, + "loss": 0.1582, + "num_input_tokens_seen": 93150352, + "step": 43110 + }, + { + "epoch": 7.033442088091354, + "grad_norm": 0.005746053997427225, + "learning_rate": 0.0008191636982588887, + "loss": 0.1208, + "num_input_tokens_seen": 93160432, + "step": 43115 + }, + { + "epoch": 7.034257748776509, + "grad_norm": 0.14109240472316742, + "learning_rate": 0.0008191089032896855, + "loss": 0.11, + "num_input_tokens_seen": 93169936, + "step": 43120 + }, + { + "epoch": 7.035073409461664, + "grad_norm": 0.03715949505567551, + "learning_rate": 0.0008190541018533353, + "loss": 0.0124, + "num_input_tokens_seen": 93181264, + "step": 43125 + }, + { + "epoch": 7.035889070146819, + "grad_norm": 0.027663471177220345, + "learning_rate": 0.0008189992939509491, + "loss": 0.0626, + "num_input_tokens_seen": 93192624, + "step": 43130 + }, + { + "epoch": 7.036704730831974, + "grad_norm": 0.14800049364566803, + "learning_rate": 0.0008189444795836377, + "loss": 0.0593, + "num_input_tokens_seen": 93202608, + "step": 43135 + }, + { + "epoch": 7.037520391517129, + "grad_norm": 0.024344148114323616, + "learning_rate": 0.0008188896587525118, + "loss": 0.0277, + "num_input_tokens_seen": 93214096, + "step": 43140 + }, + { + "epoch": 7.0383360522022835, + "grad_norm": 0.1449653059244156, + "learning_rate": 0.0008188348314586823, + "loss": 0.1141, + "num_input_tokens_seen": 93223856, + "step": 43145 + }, + { + "epoch": 7.039151712887439, + "grad_norm": 0.017205238342285156, + "learning_rate": 0.0008187799977032605, + "loss": 0.0296, + "num_input_tokens_seen": 93234576, + "step": 43150 + }, + { + "epoch": 7.039967373572594, + "grad_norm": 0.3828336298465729, + "learning_rate": 0.0008187251574873576, + "loss": 0.2549, + "num_input_tokens_seen": 93245968, + "step": 43155 + }, + { + "epoch": 7.040783034257749, + "grad_norm": 0.40320709347724915, + "learning_rate": 0.0008186703108120852, + "loss": 0.0536, + "num_input_tokens_seen": 93255728, + "step": 43160 + }, + { + "epoch": 7.041598694942904, + "grad_norm": 0.03069511242210865, + "learning_rate": 0.0008186154576785545, + "loss": 0.1759, + "num_input_tokens_seen": 93265328, + "step": 43165 + }, + { + "epoch": 7.0424143556280585, + "grad_norm": 0.0060121663846075535, + "learning_rate": 0.0008185605980878775, + "loss": 0.0751, + "num_input_tokens_seen": 93276048, + "step": 43170 + }, + { + "epoch": 7.043230016313213, + "grad_norm": 0.18002556264400482, + "learning_rate": 0.0008185057320411658, + "loss": 0.0342, + "num_input_tokens_seen": 93286736, + "step": 43175 + }, + { + "epoch": 7.044045676998369, + "grad_norm": 0.02318798191845417, + "learning_rate": 0.0008184508595395314, + "loss": 0.0313, + "num_input_tokens_seen": 93297040, + "step": 43180 + }, + { + "epoch": 7.044861337683524, + "grad_norm": 0.02688850648701191, + "learning_rate": 0.0008183959805840863, + "loss": 0.0638, + "num_input_tokens_seen": 93307472, + "step": 43185 + }, + { + "epoch": 7.045676998368679, + "grad_norm": 0.23760437965393066, + "learning_rate": 0.0008183410951759429, + "loss": 0.1342, + "num_input_tokens_seen": 93319280, + "step": 43190 + }, + { + "epoch": 7.0464926590538335, + "grad_norm": 0.016625650227069855, + "learning_rate": 0.0008182862033162131, + "loss": 0.1159, + "num_input_tokens_seen": 93330128, + "step": 43195 + }, + { + "epoch": 7.047308319738988, + "grad_norm": 0.004090285860002041, + "learning_rate": 0.0008182313050060098, + "loss": 0.0634, + "num_input_tokens_seen": 93341776, + "step": 43200 + }, + { + "epoch": 7.048123980424143, + "grad_norm": 0.14525869488716125, + "learning_rate": 0.0008181764002464454, + "loss": 0.1078, + "num_input_tokens_seen": 93353008, + "step": 43205 + }, + { + "epoch": 7.048939641109299, + "grad_norm": 0.1771831512451172, + "learning_rate": 0.0008181214890386326, + "loss": 0.0713, + "num_input_tokens_seen": 93363664, + "step": 43210 + }, + { + "epoch": 7.049755301794454, + "grad_norm": 0.11869829148054123, + "learning_rate": 0.0008180665713836842, + "loss": 0.0565, + "num_input_tokens_seen": 93374000, + "step": 43215 + }, + { + "epoch": 7.0505709624796085, + "grad_norm": 0.012297256849706173, + "learning_rate": 0.0008180116472827133, + "loss": 0.0368, + "num_input_tokens_seen": 93384368, + "step": 43220 + }, + { + "epoch": 7.051386623164763, + "grad_norm": 0.07448185235261917, + "learning_rate": 0.000817956716736833, + "loss": 0.0206, + "num_input_tokens_seen": 93396080, + "step": 43225 + }, + { + "epoch": 7.052202283849918, + "grad_norm": 0.019102323800325394, + "learning_rate": 0.0008179017797471562, + "loss": 0.0733, + "num_input_tokens_seen": 93407728, + "step": 43230 + }, + { + "epoch": 7.053017944535074, + "grad_norm": 0.04259370639920235, + "learning_rate": 0.0008178468363147968, + "loss": 0.0183, + "num_input_tokens_seen": 93419024, + "step": 43235 + }, + { + "epoch": 7.053833605220229, + "grad_norm": 0.0029630782082676888, + "learning_rate": 0.000817791886440868, + "loss": 0.0074, + "num_input_tokens_seen": 93429040, + "step": 43240 + }, + { + "epoch": 7.054649265905383, + "grad_norm": 0.01488267257809639, + "learning_rate": 0.0008177369301264834, + "loss": 0.0293, + "num_input_tokens_seen": 93439312, + "step": 43245 + }, + { + "epoch": 7.055464926590538, + "grad_norm": 0.032309968024492264, + "learning_rate": 0.0008176819673727569, + "loss": 0.1367, + "num_input_tokens_seen": 93451312, + "step": 43250 + }, + { + "epoch": 7.056280587275693, + "grad_norm": 0.18661397695541382, + "learning_rate": 0.0008176269981808023, + "loss": 0.0899, + "num_input_tokens_seen": 93462576, + "step": 43255 + }, + { + "epoch": 7.057096247960848, + "grad_norm": 0.31488245725631714, + "learning_rate": 0.0008175720225517337, + "loss": 0.1092, + "num_input_tokens_seen": 93472656, + "step": 43260 + }, + { + "epoch": 7.057911908646004, + "grad_norm": 0.2543412148952484, + "learning_rate": 0.0008175170404866652, + "loss": 0.0472, + "num_input_tokens_seen": 93483024, + "step": 43265 + }, + { + "epoch": 7.058727569331158, + "grad_norm": 0.00109146349132061, + "learning_rate": 0.0008174620519867109, + "loss": 0.074, + "num_input_tokens_seen": 93493712, + "step": 43270 + }, + { + "epoch": 7.059543230016313, + "grad_norm": 0.021653829142451286, + "learning_rate": 0.0008174070570529854, + "loss": 0.098, + "num_input_tokens_seen": 93505040, + "step": 43275 + }, + { + "epoch": 7.060358890701468, + "grad_norm": 0.2788853943347931, + "learning_rate": 0.0008173520556866035, + "loss": 0.1521, + "num_input_tokens_seen": 93516528, + "step": 43280 + }, + { + "epoch": 7.061174551386623, + "grad_norm": 0.01834874600172043, + "learning_rate": 0.0008172970478886794, + "loss": 0.0338, + "num_input_tokens_seen": 93528688, + "step": 43285 + }, + { + "epoch": 7.061990212071779, + "grad_norm": 0.020214732736349106, + "learning_rate": 0.0008172420336603281, + "loss": 0.0116, + "num_input_tokens_seen": 93540112, + "step": 43290 + }, + { + "epoch": 7.062805872756933, + "grad_norm": 0.21407492458820343, + "learning_rate": 0.0008171870130026646, + "loss": 0.2408, + "num_input_tokens_seen": 93550608, + "step": 43295 + }, + { + "epoch": 7.063621533442088, + "grad_norm": 0.04193660989403725, + "learning_rate": 0.000817131985916804, + "loss": 0.012, + "num_input_tokens_seen": 93561232, + "step": 43300 + }, + { + "epoch": 7.064437194127243, + "grad_norm": 0.1706007868051529, + "learning_rate": 0.0008170769524038613, + "loss": 0.0536, + "num_input_tokens_seen": 93572272, + "step": 43305 + }, + { + "epoch": 7.065252854812398, + "grad_norm": 0.007401083130389452, + "learning_rate": 0.0008170219124649518, + "loss": 0.1607, + "num_input_tokens_seen": 93583376, + "step": 43310 + }, + { + "epoch": 7.066068515497553, + "grad_norm": 0.014208367094397545, + "learning_rate": 0.0008169668661011912, + "loss": 0.0789, + "num_input_tokens_seen": 93592624, + "step": 43315 + }, + { + "epoch": 7.066884176182708, + "grad_norm": 0.11837077885866165, + "learning_rate": 0.0008169118133136951, + "loss": 0.0174, + "num_input_tokens_seen": 93602864, + "step": 43320 + }, + { + "epoch": 7.067699836867863, + "grad_norm": 0.021076519042253494, + "learning_rate": 0.0008168567541035788, + "loss": 0.24, + "num_input_tokens_seen": 93613072, + "step": 43325 + }, + { + "epoch": 7.068515497553018, + "grad_norm": 0.30501484870910645, + "learning_rate": 0.0008168016884719585, + "loss": 0.1192, + "num_input_tokens_seen": 93624016, + "step": 43330 + }, + { + "epoch": 7.069331158238173, + "grad_norm": 0.02154276892542839, + "learning_rate": 0.0008167466164199499, + "loss": 0.0635, + "num_input_tokens_seen": 93635216, + "step": 43335 + }, + { + "epoch": 7.070146818923328, + "grad_norm": 0.006805689074099064, + "learning_rate": 0.0008166915379486697, + "loss": 0.0354, + "num_input_tokens_seen": 93646320, + "step": 43340 + }, + { + "epoch": 7.0709624796084825, + "grad_norm": 0.05280419811606407, + "learning_rate": 0.0008166364530592334, + "loss": 0.0714, + "num_input_tokens_seen": 93655600, + "step": 43345 + }, + { + "epoch": 7.071778140293638, + "grad_norm": 0.011205635033547878, + "learning_rate": 0.0008165813617527579, + "loss": 0.2762, + "num_input_tokens_seen": 93666544, + "step": 43350 + }, + { + "epoch": 7.072593800978793, + "grad_norm": 0.23616188764572144, + "learning_rate": 0.0008165262640303595, + "loss": 0.1106, + "num_input_tokens_seen": 93677776, + "step": 43355 + }, + { + "epoch": 7.073409461663948, + "grad_norm": 0.02595806121826172, + "learning_rate": 0.0008164711598931546, + "loss": 0.0192, + "num_input_tokens_seen": 93688368, + "step": 43360 + }, + { + "epoch": 7.074225122349103, + "grad_norm": 0.23422791063785553, + "learning_rate": 0.0008164160493422604, + "loss": 0.0558, + "num_input_tokens_seen": 93698256, + "step": 43365 + }, + { + "epoch": 7.075040783034257, + "grad_norm": 0.025671793147921562, + "learning_rate": 0.0008163609323787934, + "loss": 0.0263, + "num_input_tokens_seen": 93708048, + "step": 43370 + }, + { + "epoch": 7.075856443719413, + "grad_norm": 0.03166002407670021, + "learning_rate": 0.0008163058090038709, + "loss": 0.0987, + "num_input_tokens_seen": 93717712, + "step": 43375 + }, + { + "epoch": 7.076672104404568, + "grad_norm": 0.016035977751016617, + "learning_rate": 0.0008162506792186099, + "loss": 0.0638, + "num_input_tokens_seen": 93728848, + "step": 43380 + }, + { + "epoch": 7.077487765089723, + "grad_norm": 0.05759859457612038, + "learning_rate": 0.0008161955430241276, + "loss": 0.0724, + "num_input_tokens_seen": 93740688, + "step": 43385 + }, + { + "epoch": 7.078303425774878, + "grad_norm": 0.057698123157024384, + "learning_rate": 0.0008161404004215415, + "loss": 0.1149, + "num_input_tokens_seen": 93751696, + "step": 43390 + }, + { + "epoch": 7.079119086460032, + "grad_norm": 0.0237799733877182, + "learning_rate": 0.0008160852514119692, + "loss": 0.0246, + "num_input_tokens_seen": 93762480, + "step": 43395 + }, + { + "epoch": 7.079934747145187, + "grad_norm": 0.16691306233406067, + "learning_rate": 0.0008160300959965284, + "loss": 0.0571, + "num_input_tokens_seen": 93772592, + "step": 43400 + }, + { + "epoch": 7.080750407830343, + "grad_norm": 0.43984997272491455, + "learning_rate": 0.0008159749341763367, + "loss": 0.113, + "num_input_tokens_seen": 93782640, + "step": 43405 + }, + { + "epoch": 7.081566068515498, + "grad_norm": 0.01615430787205696, + "learning_rate": 0.000815919765952512, + "loss": 0.0495, + "num_input_tokens_seen": 93793552, + "step": 43410 + }, + { + "epoch": 7.082381729200653, + "grad_norm": 0.011169064790010452, + "learning_rate": 0.0008158645913261726, + "loss": 0.2136, + "num_input_tokens_seen": 93803952, + "step": 43415 + }, + { + "epoch": 7.083197389885807, + "grad_norm": 0.018460217863321304, + "learning_rate": 0.0008158094102984366, + "loss": 0.025, + "num_input_tokens_seen": 93816048, + "step": 43420 + }, + { + "epoch": 7.084013050570962, + "grad_norm": 0.06511392444372177, + "learning_rate": 0.0008157542228704221, + "loss": 0.0282, + "num_input_tokens_seen": 93827088, + "step": 43425 + }, + { + "epoch": 7.084828711256117, + "grad_norm": 0.010363047942519188, + "learning_rate": 0.0008156990290432478, + "loss": 0.0142, + "num_input_tokens_seen": 93837968, + "step": 43430 + }, + { + "epoch": 7.085644371941273, + "grad_norm": 0.3792910575866699, + "learning_rate": 0.0008156438288180321, + "loss": 0.0871, + "num_input_tokens_seen": 93849808, + "step": 43435 + }, + { + "epoch": 7.0864600326264275, + "grad_norm": 0.00462770601734519, + "learning_rate": 0.0008155886221958939, + "loss": 0.1214, + "num_input_tokens_seen": 93860816, + "step": 43440 + }, + { + "epoch": 7.087275693311582, + "grad_norm": 0.004159488715231419, + "learning_rate": 0.0008155334091779518, + "loss": 0.077, + "num_input_tokens_seen": 93871024, + "step": 43445 + }, + { + "epoch": 7.088091353996737, + "grad_norm": 0.010639780201017857, + "learning_rate": 0.0008154781897653251, + "loss": 0.0076, + "num_input_tokens_seen": 93883280, + "step": 43450 + }, + { + "epoch": 7.088907014681892, + "grad_norm": 0.23465438187122345, + "learning_rate": 0.0008154229639591324, + "loss": 0.0977, + "num_input_tokens_seen": 93894928, + "step": 43455 + }, + { + "epoch": 7.089722675367048, + "grad_norm": 0.008510327897965908, + "learning_rate": 0.0008153677317604935, + "loss": 0.1191, + "num_input_tokens_seen": 93906320, + "step": 43460 + }, + { + "epoch": 7.0905383360522025, + "grad_norm": 0.02426774427294731, + "learning_rate": 0.0008153124931705271, + "loss": 0.0701, + "num_input_tokens_seen": 93917520, + "step": 43465 + }, + { + "epoch": 7.091353996737357, + "grad_norm": 0.08482329547405243, + "learning_rate": 0.0008152572481903533, + "loss": 0.101, + "num_input_tokens_seen": 93928560, + "step": 43470 + }, + { + "epoch": 7.092169657422512, + "grad_norm": 0.25256073474884033, + "learning_rate": 0.0008152019968210913, + "loss": 0.1663, + "num_input_tokens_seen": 93939536, + "step": 43475 + }, + { + "epoch": 7.092985318107667, + "grad_norm": 0.08296633511781693, + "learning_rate": 0.0008151467390638611, + "loss": 0.1023, + "num_input_tokens_seen": 93950032, + "step": 43480 + }, + { + "epoch": 7.093800978792822, + "grad_norm": 0.08412051200866699, + "learning_rate": 0.0008150914749197823, + "loss": 0.0835, + "num_input_tokens_seen": 93961744, + "step": 43485 + }, + { + "epoch": 7.0946166394779775, + "grad_norm": 0.004528961610049009, + "learning_rate": 0.0008150362043899751, + "loss": 0.0824, + "num_input_tokens_seen": 93972656, + "step": 43490 + }, + { + "epoch": 7.095432300163132, + "grad_norm": 0.05135902389883995, + "learning_rate": 0.0008149809274755595, + "loss": 0.0755, + "num_input_tokens_seen": 93983248, + "step": 43495 + }, + { + "epoch": 7.096247960848287, + "grad_norm": 0.09717827290296555, + "learning_rate": 0.0008149256441776559, + "loss": 0.0423, + "num_input_tokens_seen": 93993904, + "step": 43500 + }, + { + "epoch": 7.097063621533442, + "grad_norm": 0.054743360728025436, + "learning_rate": 0.0008148703544973846, + "loss": 0.2156, + "num_input_tokens_seen": 94004784, + "step": 43505 + }, + { + "epoch": 7.097879282218597, + "grad_norm": 0.05458596348762512, + "learning_rate": 0.000814815058435866, + "loss": 0.1561, + "num_input_tokens_seen": 94015472, + "step": 43510 + }, + { + "epoch": 7.0986949429037525, + "grad_norm": 0.022231120616197586, + "learning_rate": 0.0008147597559942211, + "loss": 0.0298, + "num_input_tokens_seen": 94026000, + "step": 43515 + }, + { + "epoch": 7.099510603588907, + "grad_norm": 0.2611452639102936, + "learning_rate": 0.0008147044471735703, + "loss": 0.1484, + "num_input_tokens_seen": 94037680, + "step": 43520 + }, + { + "epoch": 7.100326264274062, + "grad_norm": 0.12025828659534454, + "learning_rate": 0.0008146491319750346, + "loss": 0.1683, + "num_input_tokens_seen": 94048528, + "step": 43525 + }, + { + "epoch": 7.101141924959217, + "grad_norm": 0.09317224472761154, + "learning_rate": 0.0008145938103997352, + "loss": 0.0316, + "num_input_tokens_seen": 94059248, + "step": 43530 + }, + { + "epoch": 7.101957585644372, + "grad_norm": 0.09532984346151352, + "learning_rate": 0.0008145384824487931, + "loss": 0.12, + "num_input_tokens_seen": 94069680, + "step": 43535 + }, + { + "epoch": 7.102773246329527, + "grad_norm": 0.2917781472206116, + "learning_rate": 0.0008144831481233296, + "loss": 0.184, + "num_input_tokens_seen": 94079632, + "step": 43540 + }, + { + "epoch": 7.103588907014682, + "grad_norm": 0.18950186669826508, + "learning_rate": 0.0008144278074244662, + "loss": 0.0446, + "num_input_tokens_seen": 94090288, + "step": 43545 + }, + { + "epoch": 7.104404567699837, + "grad_norm": 0.00261941971257329, + "learning_rate": 0.0008143724603533243, + "loss": 0.0615, + "num_input_tokens_seen": 94101168, + "step": 43550 + }, + { + "epoch": 7.105220228384992, + "grad_norm": 0.04184950143098831, + "learning_rate": 0.0008143171069110258, + "loss": 0.0339, + "num_input_tokens_seen": 94111088, + "step": 43555 + }, + { + "epoch": 7.106035889070147, + "grad_norm": 0.009390073828399181, + "learning_rate": 0.0008142617470986924, + "loss": 0.1462, + "num_input_tokens_seen": 94121872, + "step": 43560 + }, + { + "epoch": 7.1068515497553015, + "grad_norm": 0.10224417597055435, + "learning_rate": 0.000814206380917446, + "loss": 0.0273, + "num_input_tokens_seen": 94133776, + "step": 43565 + }, + { + "epoch": 7.107667210440456, + "grad_norm": 0.004597066435962915, + "learning_rate": 0.0008141510083684087, + "loss": 0.0346, + "num_input_tokens_seen": 94143600, + "step": 43570 + }, + { + "epoch": 7.108482871125612, + "grad_norm": 0.042119644582271576, + "learning_rate": 0.0008140956294527026, + "loss": 0.0294, + "num_input_tokens_seen": 94154704, + "step": 43575 + }, + { + "epoch": 7.109298531810767, + "grad_norm": 0.2623211145401001, + "learning_rate": 0.00081404024417145, + "loss": 0.1258, + "num_input_tokens_seen": 94164592, + "step": 43580 + }, + { + "epoch": 7.110114192495922, + "grad_norm": 0.006643175147473812, + "learning_rate": 0.0008139848525257737, + "loss": 0.0998, + "num_input_tokens_seen": 94174960, + "step": 43585 + }, + { + "epoch": 7.1109298531810765, + "grad_norm": 0.14614737033843994, + "learning_rate": 0.000813929454516796, + "loss": 0.0271, + "num_input_tokens_seen": 94185328, + "step": 43590 + }, + { + "epoch": 7.111745513866231, + "grad_norm": 0.2504687011241913, + "learning_rate": 0.0008138740501456396, + "loss": 0.1811, + "num_input_tokens_seen": 94194896, + "step": 43595 + }, + { + "epoch": 7.112561174551387, + "grad_norm": 0.19602181017398834, + "learning_rate": 0.0008138186394134275, + "loss": 0.1734, + "num_input_tokens_seen": 94203920, + "step": 43600 + }, + { + "epoch": 7.113376835236542, + "grad_norm": 0.06594168394804001, + "learning_rate": 0.0008137632223212824, + "loss": 0.094, + "num_input_tokens_seen": 94215408, + "step": 43605 + }, + { + "epoch": 7.114192495921697, + "grad_norm": 0.3192061185836792, + "learning_rate": 0.0008137077988703276, + "loss": 0.3059, + "num_input_tokens_seen": 94225840, + "step": 43610 + }, + { + "epoch": 7.1150081566068515, + "grad_norm": 0.028179455548524857, + "learning_rate": 0.0008136523690616864, + "loss": 0.0297, + "num_input_tokens_seen": 94235120, + "step": 43615 + }, + { + "epoch": 7.115823817292006, + "grad_norm": 0.01822386495769024, + "learning_rate": 0.000813596932896482, + "loss": 0.1139, + "num_input_tokens_seen": 94247312, + "step": 43620 + }, + { + "epoch": 7.116639477977161, + "grad_norm": 0.06899786740541458, + "learning_rate": 0.000813541490375838, + "loss": 0.0928, + "num_input_tokens_seen": 94258128, + "step": 43625 + }, + { + "epoch": 7.117455138662317, + "grad_norm": 0.19595955312252045, + "learning_rate": 0.0008134860415008778, + "loss": 0.0488, + "num_input_tokens_seen": 94268048, + "step": 43630 + }, + { + "epoch": 7.118270799347472, + "grad_norm": 0.08394555747509003, + "learning_rate": 0.0008134305862727253, + "loss": 0.0358, + "num_input_tokens_seen": 94279184, + "step": 43635 + }, + { + "epoch": 7.1190864600326265, + "grad_norm": 0.3096727132797241, + "learning_rate": 0.0008133751246925046, + "loss": 0.0491, + "num_input_tokens_seen": 94289744, + "step": 43640 + }, + { + "epoch": 7.119902120717781, + "grad_norm": 0.21318064630031586, + "learning_rate": 0.0008133196567613391, + "loss": 0.0773, + "num_input_tokens_seen": 94300656, + "step": 43645 + }, + { + "epoch": 7.120717781402936, + "grad_norm": 0.2673552632331848, + "learning_rate": 0.0008132641824803534, + "loss": 0.1315, + "num_input_tokens_seen": 94310608, + "step": 43650 + }, + { + "epoch": 7.121533442088092, + "grad_norm": 0.0654841959476471, + "learning_rate": 0.0008132087018506716, + "loss": 0.0438, + "num_input_tokens_seen": 94321584, + "step": 43655 + }, + { + "epoch": 7.122349102773247, + "grad_norm": 0.010458219796419144, + "learning_rate": 0.0008131532148734182, + "loss": 0.0208, + "num_input_tokens_seen": 94331056, + "step": 43660 + }, + { + "epoch": 7.123164763458401, + "grad_norm": 0.03026103600859642, + "learning_rate": 0.0008130977215497177, + "loss": 0.0341, + "num_input_tokens_seen": 94341840, + "step": 43665 + }, + { + "epoch": 7.123980424143556, + "grad_norm": 0.27282023429870605, + "learning_rate": 0.0008130422218806945, + "loss": 0.1214, + "num_input_tokens_seen": 94353232, + "step": 43670 + }, + { + "epoch": 7.124796084828711, + "grad_norm": 0.006859856657683849, + "learning_rate": 0.0008129867158674737, + "loss": 0.0763, + "num_input_tokens_seen": 94364368, + "step": 43675 + }, + { + "epoch": 7.125611745513866, + "grad_norm": 0.23025669157505035, + "learning_rate": 0.00081293120351118, + "loss": 0.1177, + "num_input_tokens_seen": 94375216, + "step": 43680 + }, + { + "epoch": 7.126427406199022, + "grad_norm": 0.10104181617498398, + "learning_rate": 0.0008128756848129386, + "loss": 0.0252, + "num_input_tokens_seen": 94384144, + "step": 43685 + }, + { + "epoch": 7.127243066884176, + "grad_norm": 0.005190078169107437, + "learning_rate": 0.0008128201597738744, + "loss": 0.0214, + "num_input_tokens_seen": 94394288, + "step": 43690 + }, + { + "epoch": 7.128058727569331, + "grad_norm": 0.020754588767886162, + "learning_rate": 0.0008127646283951129, + "loss": 0.1413, + "num_input_tokens_seen": 94405808, + "step": 43695 + }, + { + "epoch": 7.128874388254486, + "grad_norm": 0.2567387521266937, + "learning_rate": 0.0008127090906777793, + "loss": 0.0685, + "num_input_tokens_seen": 94417680, + "step": 43700 + }, + { + "epoch": 7.129690048939641, + "grad_norm": 0.23241105675697327, + "learning_rate": 0.0008126535466229993, + "loss": 0.0605, + "num_input_tokens_seen": 94428912, + "step": 43705 + }, + { + "epoch": 7.130505709624796, + "grad_norm": 0.031064637005329132, + "learning_rate": 0.0008125979962318987, + "loss": 0.1147, + "num_input_tokens_seen": 94439344, + "step": 43710 + }, + { + "epoch": 7.131321370309951, + "grad_norm": 0.11722356826066971, + "learning_rate": 0.000812542439505603, + "loss": 0.0777, + "num_input_tokens_seen": 94450992, + "step": 43715 + }, + { + "epoch": 7.132137030995106, + "grad_norm": 0.04877146705985069, + "learning_rate": 0.0008124868764452384, + "loss": 0.0381, + "num_input_tokens_seen": 94462064, + "step": 43720 + }, + { + "epoch": 7.132952691680261, + "grad_norm": 0.262928307056427, + "learning_rate": 0.0008124313070519307, + "loss": 0.0774, + "num_input_tokens_seen": 94472880, + "step": 43725 + }, + { + "epoch": 7.133768352365416, + "grad_norm": 0.20131583511829376, + "learning_rate": 0.0008123757313268064, + "loss": 0.0411, + "num_input_tokens_seen": 94483600, + "step": 43730 + }, + { + "epoch": 7.134584013050571, + "grad_norm": 0.03627254441380501, + "learning_rate": 0.0008123201492709915, + "loss": 0.2043, + "num_input_tokens_seen": 94494160, + "step": 43735 + }, + { + "epoch": 7.135399673735726, + "grad_norm": 0.0354943573474884, + "learning_rate": 0.0008122645608856125, + "loss": 0.0844, + "num_input_tokens_seen": 94505104, + "step": 43740 + }, + { + "epoch": 7.136215334420881, + "grad_norm": 0.2703050374984741, + "learning_rate": 0.0008122089661717961, + "loss": 0.1285, + "num_input_tokens_seen": 94513680, + "step": 43745 + }, + { + "epoch": 7.137030995106036, + "grad_norm": 0.00605939282104373, + "learning_rate": 0.000812153365130669, + "loss": 0.0212, + "num_input_tokens_seen": 94523792, + "step": 43750 + }, + { + "epoch": 7.137846655791191, + "grad_norm": 0.33780086040496826, + "learning_rate": 0.0008120977577633578, + "loss": 0.1763, + "num_input_tokens_seen": 94534896, + "step": 43755 + }, + { + "epoch": 7.138662316476346, + "grad_norm": 0.29291245341300964, + "learning_rate": 0.0008120421440709897, + "loss": 0.1751, + "num_input_tokens_seen": 94544592, + "step": 43760 + }, + { + "epoch": 7.1394779771615005, + "grad_norm": 0.024935364723205566, + "learning_rate": 0.0008119865240546918, + "loss": 0.0174, + "num_input_tokens_seen": 94555056, + "step": 43765 + }, + { + "epoch": 7.140293637846656, + "grad_norm": 0.06435410678386688, + "learning_rate": 0.000811930897715591, + "loss": 0.04, + "num_input_tokens_seen": 94564272, + "step": 43770 + }, + { + "epoch": 7.141109298531811, + "grad_norm": 0.01669169031083584, + "learning_rate": 0.0008118752650548151, + "loss": 0.0208, + "num_input_tokens_seen": 94574832, + "step": 43775 + }, + { + "epoch": 7.141924959216966, + "grad_norm": 0.17030321061611176, + "learning_rate": 0.0008118196260734911, + "loss": 0.1151, + "num_input_tokens_seen": 94585616, + "step": 43780 + }, + { + "epoch": 7.142740619902121, + "grad_norm": 0.015497151762247086, + "learning_rate": 0.000811763980772747, + "loss": 0.1636, + "num_input_tokens_seen": 94596432, + "step": 43785 + }, + { + "epoch": 7.143556280587275, + "grad_norm": 0.17114491760730743, + "learning_rate": 0.0008117083291537102, + "loss": 0.2757, + "num_input_tokens_seen": 94607600, + "step": 43790 + }, + { + "epoch": 7.14437194127243, + "grad_norm": 0.12084033340215683, + "learning_rate": 0.0008116526712175087, + "loss": 0.0772, + "num_input_tokens_seen": 94618288, + "step": 43795 + }, + { + "epoch": 7.145187601957586, + "grad_norm": 0.04583222046494484, + "learning_rate": 0.0008115970069652705, + "loss": 0.0244, + "num_input_tokens_seen": 94628656, + "step": 43800 + }, + { + "epoch": 7.146003262642741, + "grad_norm": 0.029846591874957085, + "learning_rate": 0.0008115413363981237, + "loss": 0.018, + "num_input_tokens_seen": 94639696, + "step": 43805 + }, + { + "epoch": 7.146818923327896, + "grad_norm": 0.05696532502770424, + "learning_rate": 0.0008114856595171963, + "loss": 0.0417, + "num_input_tokens_seen": 94651216, + "step": 43810 + }, + { + "epoch": 7.14763458401305, + "grad_norm": 0.015203659422695637, + "learning_rate": 0.000811429976323617, + "loss": 0.0945, + "num_input_tokens_seen": 94661328, + "step": 43815 + }, + { + "epoch": 7.148450244698205, + "grad_norm": 0.020137041807174683, + "learning_rate": 0.0008113742868185142, + "loss": 0.0442, + "num_input_tokens_seen": 94672464, + "step": 43820 + }, + { + "epoch": 7.149265905383361, + "grad_norm": 0.017115939408540726, + "learning_rate": 0.0008113185910030163, + "loss": 0.0405, + "num_input_tokens_seen": 94683888, + "step": 43825 + }, + { + "epoch": 7.150081566068516, + "grad_norm": 0.2196558266878128, + "learning_rate": 0.0008112628888782523, + "loss": 0.179, + "num_input_tokens_seen": 94695504, + "step": 43830 + }, + { + "epoch": 7.150897226753671, + "grad_norm": 0.011017675511538982, + "learning_rate": 0.0008112071804453511, + "loss": 0.0671, + "num_input_tokens_seen": 94706768, + "step": 43835 + }, + { + "epoch": 7.151712887438825, + "grad_norm": 0.2845572233200073, + "learning_rate": 0.0008111514657054415, + "loss": 0.199, + "num_input_tokens_seen": 94718288, + "step": 43840 + }, + { + "epoch": 7.15252854812398, + "grad_norm": 0.009154774248600006, + "learning_rate": 0.0008110957446596527, + "loss": 0.1263, + "num_input_tokens_seen": 94728752, + "step": 43845 + }, + { + "epoch": 7.153344208809135, + "grad_norm": 0.015152211301028728, + "learning_rate": 0.0008110400173091142, + "loss": 0.0405, + "num_input_tokens_seen": 94740464, + "step": 43850 + }, + { + "epoch": 7.154159869494291, + "grad_norm": 0.3790530562400818, + "learning_rate": 0.0008109842836549549, + "loss": 0.1817, + "num_input_tokens_seen": 94751152, + "step": 43855 + }, + { + "epoch": 7.1549755301794455, + "grad_norm": 0.13960027694702148, + "learning_rate": 0.0008109285436983047, + "loss": 0.0275, + "num_input_tokens_seen": 94761584, + "step": 43860 + }, + { + "epoch": 7.1557911908646, + "grad_norm": 0.22438912093639374, + "learning_rate": 0.000810872797440293, + "loss": 0.1195, + "num_input_tokens_seen": 94772816, + "step": 43865 + }, + { + "epoch": 7.156606851549755, + "grad_norm": 0.007492088247090578, + "learning_rate": 0.0008108170448820498, + "loss": 0.0219, + "num_input_tokens_seen": 94784112, + "step": 43870 + }, + { + "epoch": 7.15742251223491, + "grad_norm": 0.1964537799358368, + "learning_rate": 0.0008107612860247049, + "loss": 0.0462, + "num_input_tokens_seen": 94796240, + "step": 43875 + }, + { + "epoch": 7.158238172920065, + "grad_norm": 0.013916724361479282, + "learning_rate": 0.0008107055208693882, + "loss": 0.1627, + "num_input_tokens_seen": 94807280, + "step": 43880 + }, + { + "epoch": 7.1590538336052205, + "grad_norm": 0.04685168340802193, + "learning_rate": 0.00081064974941723, + "loss": 0.0447, + "num_input_tokens_seen": 94816720, + "step": 43885 + }, + { + "epoch": 7.159869494290375, + "grad_norm": 0.03049752674996853, + "learning_rate": 0.0008105939716693606, + "loss": 0.0282, + "num_input_tokens_seen": 94827984, + "step": 43890 + }, + { + "epoch": 7.16068515497553, + "grad_norm": 0.006316207814961672, + "learning_rate": 0.0008105381876269104, + "loss": 0.099, + "num_input_tokens_seen": 94839632, + "step": 43895 + }, + { + "epoch": 7.161500815660685, + "grad_norm": 0.0397644080221653, + "learning_rate": 0.0008104823972910098, + "loss": 0.0905, + "num_input_tokens_seen": 94849168, + "step": 43900 + }, + { + "epoch": 7.16231647634584, + "grad_norm": 0.025401227176189423, + "learning_rate": 0.0008104266006627895, + "loss": 0.0366, + "num_input_tokens_seen": 94859664, + "step": 43905 + }, + { + "epoch": 7.1631321370309955, + "grad_norm": 0.3657297194004059, + "learning_rate": 0.0008103707977433804, + "loss": 0.0846, + "num_input_tokens_seen": 94870736, + "step": 43910 + }, + { + "epoch": 7.16394779771615, + "grad_norm": 0.10411134362220764, + "learning_rate": 0.0008103149885339134, + "loss": 0.0258, + "num_input_tokens_seen": 94881456, + "step": 43915 + }, + { + "epoch": 7.164763458401305, + "grad_norm": 0.014874082058668137, + "learning_rate": 0.0008102591730355193, + "loss": 0.0352, + "num_input_tokens_seen": 94892976, + "step": 43920 + }, + { + "epoch": 7.16557911908646, + "grad_norm": 0.03545144945383072, + "learning_rate": 0.0008102033512493297, + "loss": 0.0273, + "num_input_tokens_seen": 94904464, + "step": 43925 + }, + { + "epoch": 7.166394779771615, + "grad_norm": 0.21843662858009338, + "learning_rate": 0.0008101475231764756, + "loss": 0.0497, + "num_input_tokens_seen": 94917008, + "step": 43930 + }, + { + "epoch": 7.16721044045677, + "grad_norm": 0.13957096636295319, + "learning_rate": 0.0008100916888180884, + "loss": 0.061, + "num_input_tokens_seen": 94928560, + "step": 43935 + }, + { + "epoch": 7.168026101141925, + "grad_norm": 0.26728448271751404, + "learning_rate": 0.0008100358481752998, + "loss": 0.0351, + "num_input_tokens_seen": 94939536, + "step": 43940 + }, + { + "epoch": 7.16884176182708, + "grad_norm": 0.006182427518069744, + "learning_rate": 0.0008099800012492415, + "loss": 0.0131, + "num_input_tokens_seen": 94950640, + "step": 43945 + }, + { + "epoch": 7.169657422512235, + "grad_norm": 0.004266462288796902, + "learning_rate": 0.0008099241480410451, + "loss": 0.1088, + "num_input_tokens_seen": 94961968, + "step": 43950 + }, + { + "epoch": 7.17047308319739, + "grad_norm": 0.17784513533115387, + "learning_rate": 0.0008098682885518427, + "loss": 0.1978, + "num_input_tokens_seen": 94972272, + "step": 43955 + }, + { + "epoch": 7.171288743882545, + "grad_norm": 0.008746275678277016, + "learning_rate": 0.0008098124227827663, + "loss": 0.0554, + "num_input_tokens_seen": 94983088, + "step": 43960 + }, + { + "epoch": 7.1721044045677, + "grad_norm": 0.009621957316994667, + "learning_rate": 0.0008097565507349482, + "loss": 0.166, + "num_input_tokens_seen": 94992560, + "step": 43965 + }, + { + "epoch": 7.172920065252855, + "grad_norm": 0.34376588463783264, + "learning_rate": 0.0008097006724095208, + "loss": 0.1315, + "num_input_tokens_seen": 95003568, + "step": 43970 + }, + { + "epoch": 7.17373572593801, + "grad_norm": 0.11799240112304688, + "learning_rate": 0.0008096447878076161, + "loss": 0.1875, + "num_input_tokens_seen": 95014768, + "step": 43975 + }, + { + "epoch": 7.174551386623165, + "grad_norm": 0.2644333839416504, + "learning_rate": 0.0008095888969303672, + "loss": 0.2692, + "num_input_tokens_seen": 95025296, + "step": 43980 + }, + { + "epoch": 7.1753670473083195, + "grad_norm": 0.052062440663576126, + "learning_rate": 0.0008095329997789063, + "loss": 0.0439, + "num_input_tokens_seen": 95036464, + "step": 43985 + }, + { + "epoch": 7.176182707993474, + "grad_norm": 0.012716952711343765, + "learning_rate": 0.0008094770963543667, + "loss": 0.0288, + "num_input_tokens_seen": 95047344, + "step": 43990 + }, + { + "epoch": 7.17699836867863, + "grad_norm": 0.03887278586626053, + "learning_rate": 0.0008094211866578812, + "loss": 0.0392, + "num_input_tokens_seen": 95058320, + "step": 43995 + }, + { + "epoch": 7.177814029363785, + "grad_norm": 0.14700531959533691, + "learning_rate": 0.0008093652706905827, + "loss": 0.0444, + "num_input_tokens_seen": 95069488, + "step": 44000 + }, + { + "epoch": 7.17862969004894, + "grad_norm": 0.018018363043665886, + "learning_rate": 0.0008093093484536045, + "loss": 0.0374, + "num_input_tokens_seen": 95079792, + "step": 44005 + }, + { + "epoch": 7.1794453507340945, + "grad_norm": 0.03422345593571663, + "learning_rate": 0.0008092534199480801, + "loss": 0.0164, + "num_input_tokens_seen": 95090832, + "step": 44010 + }, + { + "epoch": 7.180261011419249, + "grad_norm": 0.10804907977581024, + "learning_rate": 0.0008091974851751427, + "loss": 0.0491, + "num_input_tokens_seen": 95102736, + "step": 44015 + }, + { + "epoch": 7.181076672104404, + "grad_norm": 0.20800091326236725, + "learning_rate": 0.0008091415441359261, + "loss": 0.1226, + "num_input_tokens_seen": 95112496, + "step": 44020 + }, + { + "epoch": 7.18189233278956, + "grad_norm": 0.0028511809650808573, + "learning_rate": 0.000809085596831564, + "loss": 0.0309, + "num_input_tokens_seen": 95122896, + "step": 44025 + }, + { + "epoch": 7.182707993474715, + "grad_norm": 0.25854846835136414, + "learning_rate": 0.0008090296432631901, + "loss": 0.0932, + "num_input_tokens_seen": 95133808, + "step": 44030 + }, + { + "epoch": 7.1835236541598695, + "grad_norm": 0.017109766602516174, + "learning_rate": 0.0008089736834319384, + "loss": 0.0283, + "num_input_tokens_seen": 95143952, + "step": 44035 + }, + { + "epoch": 7.184339314845024, + "grad_norm": 0.07196343690156937, + "learning_rate": 0.0008089177173389431, + "loss": 0.0225, + "num_input_tokens_seen": 95154096, + "step": 44040 + }, + { + "epoch": 7.185154975530179, + "grad_norm": 0.2786267101764679, + "learning_rate": 0.0008088617449853382, + "loss": 0.0926, + "num_input_tokens_seen": 95164816, + "step": 44045 + }, + { + "epoch": 7.185970636215335, + "grad_norm": 0.0031654785852879286, + "learning_rate": 0.0008088057663722583, + "loss": 0.0361, + "num_input_tokens_seen": 95174608, + "step": 44050 + }, + { + "epoch": 7.18678629690049, + "grad_norm": 0.006744810845702887, + "learning_rate": 0.000808749781500838, + "loss": 0.0053, + "num_input_tokens_seen": 95185552, + "step": 44055 + }, + { + "epoch": 7.1876019575856445, + "grad_norm": 0.23967309296131134, + "learning_rate": 0.0008086937903722114, + "loss": 0.0432, + "num_input_tokens_seen": 95196816, + "step": 44060 + }, + { + "epoch": 7.188417618270799, + "grad_norm": 0.060102108865976334, + "learning_rate": 0.0008086377929875137, + "loss": 0.1255, + "num_input_tokens_seen": 95207856, + "step": 44065 + }, + { + "epoch": 7.189233278955954, + "grad_norm": 0.012995628640055656, + "learning_rate": 0.0008085817893478797, + "loss": 0.0434, + "num_input_tokens_seen": 95217968, + "step": 44070 + }, + { + "epoch": 7.190048939641109, + "grad_norm": 0.3781433403491974, + "learning_rate": 0.0008085257794544441, + "loss": 0.0734, + "num_input_tokens_seen": 95229488, + "step": 44075 + }, + { + "epoch": 7.190864600326265, + "grad_norm": 0.47650331258773804, + "learning_rate": 0.0008084697633083422, + "loss": 0.2665, + "num_input_tokens_seen": 95239312, + "step": 44080 + }, + { + "epoch": 7.191680261011419, + "grad_norm": 0.05169449746608734, + "learning_rate": 0.0008084137409107093, + "loss": 0.0848, + "num_input_tokens_seen": 95248784, + "step": 44085 + }, + { + "epoch": 7.192495921696574, + "grad_norm": 0.22721447050571442, + "learning_rate": 0.0008083577122626806, + "loss": 0.1741, + "num_input_tokens_seen": 95260176, + "step": 44090 + }, + { + "epoch": 7.193311582381729, + "grad_norm": 0.004407059401273727, + "learning_rate": 0.0008083016773653917, + "loss": 0.0358, + "num_input_tokens_seen": 95271248, + "step": 44095 + }, + { + "epoch": 7.194127243066884, + "grad_norm": 0.004881150089204311, + "learning_rate": 0.0008082456362199783, + "loss": 0.0105, + "num_input_tokens_seen": 95281104, + "step": 44100 + }, + { + "epoch": 7.19494290375204, + "grad_norm": 0.1279035061597824, + "learning_rate": 0.000808189588827576, + "loss": 0.0558, + "num_input_tokens_seen": 95291344, + "step": 44105 + }, + { + "epoch": 7.195758564437194, + "grad_norm": 0.07432336360216141, + "learning_rate": 0.0008081335351893206, + "loss": 0.0268, + "num_input_tokens_seen": 95301904, + "step": 44110 + }, + { + "epoch": 7.196574225122349, + "grad_norm": 0.02257407084107399, + "learning_rate": 0.0008080774753063485, + "loss": 0.0824, + "num_input_tokens_seen": 95312144, + "step": 44115 + }, + { + "epoch": 7.197389885807504, + "grad_norm": 0.22624976933002472, + "learning_rate": 0.0008080214091797953, + "loss": 0.1775, + "num_input_tokens_seen": 95321360, + "step": 44120 + }, + { + "epoch": 7.198205546492659, + "grad_norm": 0.005118417553603649, + "learning_rate": 0.0008079653368107975, + "loss": 0.0346, + "num_input_tokens_seen": 95331888, + "step": 44125 + }, + { + "epoch": 7.199021207177814, + "grad_norm": 0.006295321509242058, + "learning_rate": 0.0008079092582004915, + "loss": 0.1056, + "num_input_tokens_seen": 95343152, + "step": 44130 + }, + { + "epoch": 7.199836867862969, + "grad_norm": 0.02929351106286049, + "learning_rate": 0.0008078531733500137, + "loss": 0.0619, + "num_input_tokens_seen": 95353936, + "step": 44135 + }, + { + "epoch": 7.200652528548124, + "grad_norm": 0.16848796606063843, + "learning_rate": 0.000807797082260501, + "loss": 0.0424, + "num_input_tokens_seen": 95364240, + "step": 44140 + }, + { + "epoch": 7.201468189233279, + "grad_norm": 0.015281510539352894, + "learning_rate": 0.0008077409849330898, + "loss": 0.0405, + "num_input_tokens_seen": 95374448, + "step": 44145 + }, + { + "epoch": 7.202283849918434, + "grad_norm": 0.3676411807537079, + "learning_rate": 0.0008076848813689171, + "loss": 0.155, + "num_input_tokens_seen": 95384528, + "step": 44150 + }, + { + "epoch": 7.203099510603589, + "grad_norm": 0.035579223185777664, + "learning_rate": 0.0008076287715691201, + "loss": 0.0363, + "num_input_tokens_seen": 95395728, + "step": 44155 + }, + { + "epoch": 7.2039151712887435, + "grad_norm": 0.31866884231567383, + "learning_rate": 0.0008075726555348357, + "loss": 0.1915, + "num_input_tokens_seen": 95407120, + "step": 44160 + }, + { + "epoch": 7.204730831973899, + "grad_norm": 0.2983294725418091, + "learning_rate": 0.0008075165332672013, + "loss": 0.0841, + "num_input_tokens_seen": 95417008, + "step": 44165 + }, + { + "epoch": 7.205546492659054, + "grad_norm": 0.2822973430156708, + "learning_rate": 0.0008074604047673542, + "loss": 0.0586, + "num_input_tokens_seen": 95427632, + "step": 44170 + }, + { + "epoch": 7.206362153344209, + "grad_norm": 0.3057224154472351, + "learning_rate": 0.000807404270036432, + "loss": 0.0935, + "num_input_tokens_seen": 95437424, + "step": 44175 + }, + { + "epoch": 7.207177814029364, + "grad_norm": 0.009443351998925209, + "learning_rate": 0.0008073481290755723, + "loss": 0.1251, + "num_input_tokens_seen": 95449424, + "step": 44180 + }, + { + "epoch": 7.2079934747145185, + "grad_norm": 0.13818103075027466, + "learning_rate": 0.0008072919818859128, + "loss": 0.0718, + "num_input_tokens_seen": 95460624, + "step": 44185 + }, + { + "epoch": 7.208809135399674, + "grad_norm": 0.022867241874337196, + "learning_rate": 0.0008072358284685915, + "loss": 0.1683, + "num_input_tokens_seen": 95470672, + "step": 44190 + }, + { + "epoch": 7.209624796084829, + "grad_norm": 0.3809795379638672, + "learning_rate": 0.0008071796688247463, + "loss": 0.0934, + "num_input_tokens_seen": 95481936, + "step": 44195 + }, + { + "epoch": 7.210440456769984, + "grad_norm": 0.05303087458014488, + "learning_rate": 0.0008071235029555155, + "loss": 0.0491, + "num_input_tokens_seen": 95492368, + "step": 44200 + }, + { + "epoch": 7.211256117455139, + "grad_norm": 0.30895814299583435, + "learning_rate": 0.0008070673308620373, + "loss": 0.1043, + "num_input_tokens_seen": 95503280, + "step": 44205 + }, + { + "epoch": 7.212071778140293, + "grad_norm": 0.05848781764507294, + "learning_rate": 0.0008070111525454501, + "loss": 0.0508, + "num_input_tokens_seen": 95514544, + "step": 44210 + }, + { + "epoch": 7.212887438825448, + "grad_norm": 0.016617875546216965, + "learning_rate": 0.0008069549680068923, + "loss": 0.037, + "num_input_tokens_seen": 95525200, + "step": 44215 + }, + { + "epoch": 7.213703099510604, + "grad_norm": 0.010751327499747276, + "learning_rate": 0.0008068987772475029, + "loss": 0.1975, + "num_input_tokens_seen": 95536144, + "step": 44220 + }, + { + "epoch": 7.214518760195759, + "grad_norm": 0.3213596045970917, + "learning_rate": 0.0008068425802684204, + "loss": 0.1174, + "num_input_tokens_seen": 95547664, + "step": 44225 + }, + { + "epoch": 7.215334420880914, + "grad_norm": 0.012682809494435787, + "learning_rate": 0.0008067863770707838, + "loss": 0.0114, + "num_input_tokens_seen": 95559024, + "step": 44230 + }, + { + "epoch": 7.216150081566068, + "grad_norm": 0.02761760540306568, + "learning_rate": 0.0008067301676557319, + "loss": 0.0405, + "num_input_tokens_seen": 95569680, + "step": 44235 + }, + { + "epoch": 7.216965742251223, + "grad_norm": 0.22118832170963287, + "learning_rate": 0.0008066739520244042, + "loss": 0.1241, + "num_input_tokens_seen": 95580944, + "step": 44240 + }, + { + "epoch": 7.217781402936378, + "grad_norm": 0.44345036149024963, + "learning_rate": 0.0008066177301779396, + "loss": 0.116, + "num_input_tokens_seen": 95590928, + "step": 44245 + }, + { + "epoch": 7.218597063621534, + "grad_norm": 0.07468734681606293, + "learning_rate": 0.0008065615021174779, + "loss": 0.0334, + "num_input_tokens_seen": 95602128, + "step": 44250 + }, + { + "epoch": 7.219412724306689, + "grad_norm": 0.2577841281890869, + "learning_rate": 0.0008065052678441584, + "loss": 0.0726, + "num_input_tokens_seen": 95613264, + "step": 44255 + }, + { + "epoch": 7.220228384991843, + "grad_norm": 0.04845619201660156, + "learning_rate": 0.0008064490273591209, + "loss": 0.0159, + "num_input_tokens_seen": 95623600, + "step": 44260 + }, + { + "epoch": 7.221044045676998, + "grad_norm": 0.3292473256587982, + "learning_rate": 0.000806392780663505, + "loss": 0.0465, + "num_input_tokens_seen": 95635696, + "step": 44265 + }, + { + "epoch": 7.221859706362153, + "grad_norm": 0.11419453471899033, + "learning_rate": 0.0008063365277584508, + "loss": 0.0303, + "num_input_tokens_seen": 95647088, + "step": 44270 + }, + { + "epoch": 7.222675367047309, + "grad_norm": 0.01460292749106884, + "learning_rate": 0.0008062802686450982, + "loss": 0.0346, + "num_input_tokens_seen": 95657936, + "step": 44275 + }, + { + "epoch": 7.2234910277324635, + "grad_norm": 0.03610467538237572, + "learning_rate": 0.0008062240033245875, + "loss": 0.0664, + "num_input_tokens_seen": 95668400, + "step": 44280 + }, + { + "epoch": 7.224306688417618, + "grad_norm": 0.01440652459859848, + "learning_rate": 0.0008061677317980587, + "loss": 0.1672, + "num_input_tokens_seen": 95678256, + "step": 44285 + }, + { + "epoch": 7.225122349102773, + "grad_norm": 0.002639458980411291, + "learning_rate": 0.0008061114540666525, + "loss": 0.0697, + "num_input_tokens_seen": 95688112, + "step": 44290 + }, + { + "epoch": 7.225938009787928, + "grad_norm": 0.0032658553682267666, + "learning_rate": 0.0008060551701315093, + "loss": 0.0458, + "num_input_tokens_seen": 95699088, + "step": 44295 + }, + { + "epoch": 7.226753670473083, + "grad_norm": 0.09494752436876297, + "learning_rate": 0.00080599887999377, + "loss": 0.081, + "num_input_tokens_seen": 95709392, + "step": 44300 + }, + { + "epoch": 7.2275693311582385, + "grad_norm": 0.0071384357288479805, + "learning_rate": 0.0008059425836545751, + "loss": 0.0594, + "num_input_tokens_seen": 95719952, + "step": 44305 + }, + { + "epoch": 7.228384991843393, + "grad_norm": 0.010850159451365471, + "learning_rate": 0.0008058862811150657, + "loss": 0.0724, + "num_input_tokens_seen": 95730032, + "step": 44310 + }, + { + "epoch": 7.229200652528548, + "grad_norm": 0.1265016794204712, + "learning_rate": 0.0008058299723763826, + "loss": 0.0379, + "num_input_tokens_seen": 95740752, + "step": 44315 + }, + { + "epoch": 7.230016313213703, + "grad_norm": 0.2655118703842163, + "learning_rate": 0.0008057736574396673, + "loss": 0.048, + "num_input_tokens_seen": 95751888, + "step": 44320 + }, + { + "epoch": 7.230831973898858, + "grad_norm": 0.025311551988124847, + "learning_rate": 0.000805717336306061, + "loss": 0.1106, + "num_input_tokens_seen": 95762512, + "step": 44325 + }, + { + "epoch": 7.231647634584013, + "grad_norm": 0.0299469456076622, + "learning_rate": 0.000805661008976705, + "loss": 0.261, + "num_input_tokens_seen": 95773424, + "step": 44330 + }, + { + "epoch": 7.232463295269168, + "grad_norm": 0.09730672836303711, + "learning_rate": 0.0008056046754527406, + "loss": 0.0398, + "num_input_tokens_seen": 95783888, + "step": 44335 + }, + { + "epoch": 7.233278955954323, + "grad_norm": 0.2526463568210602, + "learning_rate": 0.00080554833573531, + "loss": 0.0983, + "num_input_tokens_seen": 95794992, + "step": 44340 + }, + { + "epoch": 7.234094616639478, + "grad_norm": 0.03998822346329689, + "learning_rate": 0.0008054919898255548, + "loss": 0.0873, + "num_input_tokens_seen": 95806224, + "step": 44345 + }, + { + "epoch": 7.234910277324633, + "grad_norm": 0.252896785736084, + "learning_rate": 0.0008054356377246168, + "loss": 0.0704, + "num_input_tokens_seen": 95817168, + "step": 44350 + }, + { + "epoch": 7.235725938009788, + "grad_norm": 0.01774667389690876, + "learning_rate": 0.0008053792794336381, + "loss": 0.1547, + "num_input_tokens_seen": 95827792, + "step": 44355 + }, + { + "epoch": 7.236541598694943, + "grad_norm": 0.015287657268345356, + "learning_rate": 0.0008053229149537611, + "loss": 0.0861, + "num_input_tokens_seen": 95837488, + "step": 44360 + }, + { + "epoch": 7.237357259380098, + "grad_norm": 0.05477646738290787, + "learning_rate": 0.0008052665442861278, + "loss": 0.0597, + "num_input_tokens_seen": 95849744, + "step": 44365 + }, + { + "epoch": 7.238172920065253, + "grad_norm": 0.22003793716430664, + "learning_rate": 0.0008052101674318805, + "loss": 0.056, + "num_input_tokens_seen": 95861328, + "step": 44370 + }, + { + "epoch": 7.238988580750408, + "grad_norm": 0.017607053741812706, + "learning_rate": 0.0008051537843921623, + "loss": 0.153, + "num_input_tokens_seen": 95872752, + "step": 44375 + }, + { + "epoch": 7.239804241435563, + "grad_norm": 0.015592445619404316, + "learning_rate": 0.0008050973951681153, + "loss": 0.0481, + "num_input_tokens_seen": 95883984, + "step": 44380 + }, + { + "epoch": 7.240619902120717, + "grad_norm": 0.10997194796800613, + "learning_rate": 0.0008050409997608827, + "loss": 0.2259, + "num_input_tokens_seen": 95895056, + "step": 44385 + }, + { + "epoch": 7.241435562805873, + "grad_norm": 0.025174317881464958, + "learning_rate": 0.0008049845981716072, + "loss": 0.0652, + "num_input_tokens_seen": 95905104, + "step": 44390 + }, + { + "epoch": 7.242251223491028, + "grad_norm": 0.2000531107187271, + "learning_rate": 0.0008049281904014318, + "loss": 0.0521, + "num_input_tokens_seen": 95915216, + "step": 44395 + }, + { + "epoch": 7.243066884176183, + "grad_norm": 0.12257369607686996, + "learning_rate": 0.0008048717764514999, + "loss": 0.033, + "num_input_tokens_seen": 95924560, + "step": 44400 + }, + { + "epoch": 7.2438825448613375, + "grad_norm": 0.27642714977264404, + "learning_rate": 0.0008048153563229548, + "loss": 0.083, + "num_input_tokens_seen": 95935024, + "step": 44405 + }, + { + "epoch": 7.244698205546492, + "grad_norm": 0.113319993019104, + "learning_rate": 0.0008047589300169398, + "loss": 0.0861, + "num_input_tokens_seen": 95944464, + "step": 44410 + }, + { + "epoch": 7.245513866231648, + "grad_norm": 0.007153376936912537, + "learning_rate": 0.0008047024975345983, + "loss": 0.1051, + "num_input_tokens_seen": 95954704, + "step": 44415 + }, + { + "epoch": 7.246329526916803, + "grad_norm": 0.01275695487856865, + "learning_rate": 0.0008046460588770743, + "loss": 0.0642, + "num_input_tokens_seen": 95965456, + "step": 44420 + }, + { + "epoch": 7.247145187601958, + "grad_norm": 0.14486052095890045, + "learning_rate": 0.0008045896140455114, + "loss": 0.1648, + "num_input_tokens_seen": 95976752, + "step": 44425 + }, + { + "epoch": 7.2479608482871125, + "grad_norm": 0.24699799716472626, + "learning_rate": 0.0008045331630410535, + "loss": 0.065, + "num_input_tokens_seen": 95988400, + "step": 44430 + }, + { + "epoch": 7.248776508972267, + "grad_norm": 0.015178795903921127, + "learning_rate": 0.0008044767058648448, + "loss": 0.0229, + "num_input_tokens_seen": 95999280, + "step": 44435 + }, + { + "epoch": 7.249592169657422, + "grad_norm": 0.04336842894554138, + "learning_rate": 0.0008044202425180293, + "loss": 0.0238, + "num_input_tokens_seen": 96008144, + "step": 44440 + }, + { + "epoch": 7.250407830342578, + "grad_norm": 0.11812159419059753, + "learning_rate": 0.0008043637730017515, + "loss": 0.2407, + "num_input_tokens_seen": 96018768, + "step": 44445 + }, + { + "epoch": 7.251223491027733, + "grad_norm": 0.25787487626075745, + "learning_rate": 0.0008043072973171557, + "loss": 0.0677, + "num_input_tokens_seen": 96028688, + "step": 44450 + }, + { + "epoch": 7.2520391517128875, + "grad_norm": 0.04251531511545181, + "learning_rate": 0.0008042508154653865, + "loss": 0.068, + "num_input_tokens_seen": 96040368, + "step": 44455 + }, + { + "epoch": 7.252854812398042, + "grad_norm": 0.019232304766774178, + "learning_rate": 0.0008041943274475886, + "loss": 0.0173, + "num_input_tokens_seen": 96051120, + "step": 44460 + }, + { + "epoch": 7.253670473083197, + "grad_norm": 0.3413325548171997, + "learning_rate": 0.0008041378332649067, + "loss": 0.1559, + "num_input_tokens_seen": 96062224, + "step": 44465 + }, + { + "epoch": 7.254486133768353, + "grad_norm": 0.006722092628479004, + "learning_rate": 0.0008040813329184857, + "loss": 0.102, + "num_input_tokens_seen": 96073072, + "step": 44470 + }, + { + "epoch": 7.255301794453508, + "grad_norm": 0.053375717252492905, + "learning_rate": 0.000804024826409471, + "loss": 0.1075, + "num_input_tokens_seen": 96082992, + "step": 44475 + }, + { + "epoch": 7.2561174551386625, + "grad_norm": 0.14441102743148804, + "learning_rate": 0.0008039683137390073, + "loss": 0.1015, + "num_input_tokens_seen": 96093136, + "step": 44480 + }, + { + "epoch": 7.256933115823817, + "grad_norm": 0.22683408856391907, + "learning_rate": 0.0008039117949082401, + "loss": 0.0542, + "num_input_tokens_seen": 96103376, + "step": 44485 + }, + { + "epoch": 7.257748776508972, + "grad_norm": 0.2603529989719391, + "learning_rate": 0.0008038552699183148, + "loss": 0.049, + "num_input_tokens_seen": 96114672, + "step": 44490 + }, + { + "epoch": 7.258564437194127, + "grad_norm": 0.03789392486214638, + "learning_rate": 0.0008037987387703771, + "loss": 0.1357, + "num_input_tokens_seen": 96125136, + "step": 44495 + }, + { + "epoch": 7.259380097879283, + "grad_norm": 0.008358441293239594, + "learning_rate": 0.0008037422014655725, + "loss": 0.1643, + "num_input_tokens_seen": 96135856, + "step": 44500 + }, + { + "epoch": 7.260195758564437, + "grad_norm": 0.3523517847061157, + "learning_rate": 0.0008036856580050469, + "loss": 0.2079, + "num_input_tokens_seen": 96145712, + "step": 44505 + }, + { + "epoch": 7.261011419249592, + "grad_norm": 0.07002881169319153, + "learning_rate": 0.000803629108389946, + "loss": 0.1167, + "num_input_tokens_seen": 96156304, + "step": 44510 + }, + { + "epoch": 7.261827079934747, + "grad_norm": 0.05014079064130783, + "learning_rate": 0.0008035725526214164, + "loss": 0.0708, + "num_input_tokens_seen": 96166864, + "step": 44515 + }, + { + "epoch": 7.262642740619902, + "grad_norm": 0.2250353842973709, + "learning_rate": 0.0008035159907006037, + "loss": 0.0655, + "num_input_tokens_seen": 96177552, + "step": 44520 + }, + { + "epoch": 7.263458401305057, + "grad_norm": 0.2552667260169983, + "learning_rate": 0.0008034594226286545, + "loss": 0.0755, + "num_input_tokens_seen": 96188592, + "step": 44525 + }, + { + "epoch": 7.264274061990212, + "grad_norm": 0.22745175659656525, + "learning_rate": 0.0008034028484067149, + "loss": 0.1418, + "num_input_tokens_seen": 96199376, + "step": 44530 + }, + { + "epoch": 7.265089722675367, + "grad_norm": 0.1307612508535385, + "learning_rate": 0.0008033462680359319, + "loss": 0.0757, + "num_input_tokens_seen": 96210800, + "step": 44535 + }, + { + "epoch": 7.265905383360522, + "grad_norm": 0.015104546211659908, + "learning_rate": 0.000803289681517452, + "loss": 0.018, + "num_input_tokens_seen": 96221488, + "step": 44540 + }, + { + "epoch": 7.266721044045677, + "grad_norm": 0.2224571257829666, + "learning_rate": 0.0008032330888524217, + "loss": 0.1082, + "num_input_tokens_seen": 96231024, + "step": 44545 + }, + { + "epoch": 7.267536704730832, + "grad_norm": 0.05627712979912758, + "learning_rate": 0.0008031764900419885, + "loss": 0.0475, + "num_input_tokens_seen": 96241328, + "step": 44550 + }, + { + "epoch": 7.268352365415987, + "grad_norm": 0.12514592707157135, + "learning_rate": 0.000803119885087299, + "loss": 0.0325, + "num_input_tokens_seen": 96253072, + "step": 44555 + }, + { + "epoch": 7.269168026101142, + "grad_norm": 0.052119843661785126, + "learning_rate": 0.0008030632739895004, + "loss": 0.0786, + "num_input_tokens_seen": 96262928, + "step": 44560 + }, + { + "epoch": 7.269983686786297, + "grad_norm": 0.030936799943447113, + "learning_rate": 0.0008030066567497401, + "loss": 0.0753, + "num_input_tokens_seen": 96274640, + "step": 44565 + }, + { + "epoch": 7.270799347471452, + "grad_norm": 0.00539214164018631, + "learning_rate": 0.0008029500333691656, + "loss": 0.0261, + "num_input_tokens_seen": 96285104, + "step": 44570 + }, + { + "epoch": 7.271615008156607, + "grad_norm": 0.012339390814304352, + "learning_rate": 0.0008028934038489243, + "loss": 0.0689, + "num_input_tokens_seen": 96296720, + "step": 44575 + }, + { + "epoch": 7.2724306688417615, + "grad_norm": 0.08805263042449951, + "learning_rate": 0.000802836768190164, + "loss": 0.0618, + "num_input_tokens_seen": 96307280, + "step": 44580 + }, + { + "epoch": 7.273246329526917, + "grad_norm": 0.03065626323223114, + "learning_rate": 0.0008027801263940322, + "loss": 0.0505, + "num_input_tokens_seen": 96317424, + "step": 44585 + }, + { + "epoch": 7.274061990212072, + "grad_norm": 0.2728951871395111, + "learning_rate": 0.0008027234784616773, + "loss": 0.1392, + "num_input_tokens_seen": 96328528, + "step": 44590 + }, + { + "epoch": 7.274877650897227, + "grad_norm": 0.198155015707016, + "learning_rate": 0.0008026668243942469, + "loss": 0.0426, + "num_input_tokens_seen": 96338864, + "step": 44595 + }, + { + "epoch": 7.275693311582382, + "grad_norm": 0.1933411955833435, + "learning_rate": 0.0008026101641928895, + "loss": 0.0569, + "num_input_tokens_seen": 96349936, + "step": 44600 + }, + { + "epoch": 7.2765089722675365, + "grad_norm": 0.04269864410161972, + "learning_rate": 0.000802553497858753, + "loss": 0.0611, + "num_input_tokens_seen": 96360944, + "step": 44605 + }, + { + "epoch": 7.277324632952691, + "grad_norm": 0.02413656748831272, + "learning_rate": 0.0008024968253929861, + "loss": 0.0228, + "num_input_tokens_seen": 96372016, + "step": 44610 + }, + { + "epoch": 7.278140293637847, + "grad_norm": 0.0752306878566742, + "learning_rate": 0.0008024401467967375, + "loss": 0.0947, + "num_input_tokens_seen": 96382064, + "step": 44615 + }, + { + "epoch": 7.278955954323002, + "grad_norm": 0.20823276042938232, + "learning_rate": 0.0008023834620711555, + "loss": 0.1745, + "num_input_tokens_seen": 96392592, + "step": 44620 + }, + { + "epoch": 7.279771615008157, + "grad_norm": 0.0009308802546001971, + "learning_rate": 0.000802326771217389, + "loss": 0.0337, + "num_input_tokens_seen": 96403760, + "step": 44625 + }, + { + "epoch": 7.280587275693311, + "grad_norm": 0.06344801187515259, + "learning_rate": 0.0008022700742365871, + "loss": 0.0351, + "num_input_tokens_seen": 96415600, + "step": 44630 + }, + { + "epoch": 7.281402936378466, + "grad_norm": 0.02278982475399971, + "learning_rate": 0.0008022133711298987, + "loss": 0.0133, + "num_input_tokens_seen": 96424592, + "step": 44635 + }, + { + "epoch": 7.282218597063622, + "grad_norm": 0.2168564349412918, + "learning_rate": 0.0008021566618984728, + "loss": 0.0382, + "num_input_tokens_seen": 96435376, + "step": 44640 + }, + { + "epoch": 7.283034257748777, + "grad_norm": 0.014572087675333023, + "learning_rate": 0.0008020999465434589, + "loss": 0.0164, + "num_input_tokens_seen": 96445680, + "step": 44645 + }, + { + "epoch": 7.283849918433932, + "grad_norm": 0.003936320077627897, + "learning_rate": 0.0008020432250660063, + "loss": 0.1148, + "num_input_tokens_seen": 96455440, + "step": 44650 + }, + { + "epoch": 7.284665579119086, + "grad_norm": 0.422513872385025, + "learning_rate": 0.0008019864974672646, + "loss": 0.077, + "num_input_tokens_seen": 96465328, + "step": 44655 + }, + { + "epoch": 7.285481239804241, + "grad_norm": 0.006895182654261589, + "learning_rate": 0.0008019297637483836, + "loss": 0.1485, + "num_input_tokens_seen": 96476304, + "step": 44660 + }, + { + "epoch": 7.286296900489396, + "grad_norm": 0.021543532609939575, + "learning_rate": 0.0008018730239105127, + "loss": 0.031, + "num_input_tokens_seen": 96486928, + "step": 44665 + }, + { + "epoch": 7.287112561174552, + "grad_norm": 0.08389617502689362, + "learning_rate": 0.000801816277954802, + "loss": 0.0707, + "num_input_tokens_seen": 96496592, + "step": 44670 + }, + { + "epoch": 7.287928221859707, + "grad_norm": 0.0032530969474464655, + "learning_rate": 0.0008017595258824016, + "loss": 0.0888, + "num_input_tokens_seen": 96507216, + "step": 44675 + }, + { + "epoch": 7.288743882544861, + "grad_norm": 0.052641693502664566, + "learning_rate": 0.0008017027676944617, + "loss": 0.1027, + "num_input_tokens_seen": 96518160, + "step": 44680 + }, + { + "epoch": 7.289559543230016, + "grad_norm": 0.02181733213365078, + "learning_rate": 0.0008016460033921323, + "loss": 0.1183, + "num_input_tokens_seen": 96531568, + "step": 44685 + }, + { + "epoch": 7.290375203915171, + "grad_norm": 0.14702098071575165, + "learning_rate": 0.0008015892329765642, + "loss": 0.199, + "num_input_tokens_seen": 96541840, + "step": 44690 + }, + { + "epoch": 7.291190864600326, + "grad_norm": 0.016916362568736076, + "learning_rate": 0.0008015324564489075, + "loss": 0.1729, + "num_input_tokens_seen": 96553072, + "step": 44695 + }, + { + "epoch": 7.2920065252854815, + "grad_norm": 0.02293366566300392, + "learning_rate": 0.0008014756738103132, + "loss": 0.0379, + "num_input_tokens_seen": 96563568, + "step": 44700 + }, + { + "epoch": 7.292822185970636, + "grad_norm": 0.031679704785346985, + "learning_rate": 0.0008014188850619318, + "loss": 0.0292, + "num_input_tokens_seen": 96573392, + "step": 44705 + }, + { + "epoch": 7.293637846655791, + "grad_norm": 0.06691129505634308, + "learning_rate": 0.0008013620902049143, + "loss": 0.0507, + "num_input_tokens_seen": 96584464, + "step": 44710 + }, + { + "epoch": 7.294453507340946, + "grad_norm": 0.632112443447113, + "learning_rate": 0.0008013052892404118, + "loss": 0.1124, + "num_input_tokens_seen": 96595824, + "step": 44715 + }, + { + "epoch": 7.295269168026101, + "grad_norm": 0.01686936616897583, + "learning_rate": 0.0008012484821695754, + "loss": 0.1156, + "num_input_tokens_seen": 96607312, + "step": 44720 + }, + { + "epoch": 7.2960848287112565, + "grad_norm": 0.008439785800874233, + "learning_rate": 0.0008011916689935563, + "loss": 0.0843, + "num_input_tokens_seen": 96617680, + "step": 44725 + }, + { + "epoch": 7.296900489396411, + "grad_norm": 0.020563535392284393, + "learning_rate": 0.000801134849713506, + "loss": 0.0913, + "num_input_tokens_seen": 96627440, + "step": 44730 + }, + { + "epoch": 7.297716150081566, + "grad_norm": 0.012274558655917645, + "learning_rate": 0.0008010780243305758, + "loss": 0.054, + "num_input_tokens_seen": 96638064, + "step": 44735 + }, + { + "epoch": 7.298531810766721, + "grad_norm": 0.03836611658334732, + "learning_rate": 0.0008010211928459177, + "loss": 0.0708, + "num_input_tokens_seen": 96649840, + "step": 44740 + }, + { + "epoch": 7.299347471451876, + "grad_norm": 0.02701750211417675, + "learning_rate": 0.0008009643552606831, + "loss": 0.008, + "num_input_tokens_seen": 96660784, + "step": 44745 + }, + { + "epoch": 7.300163132137031, + "grad_norm": 0.06201131269335747, + "learning_rate": 0.0008009075115760243, + "loss": 0.0744, + "num_input_tokens_seen": 96672144, + "step": 44750 + }, + { + "epoch": 7.300978792822186, + "grad_norm": 0.004208603873848915, + "learning_rate": 0.0008008506617930926, + "loss": 0.1152, + "num_input_tokens_seen": 96684784, + "step": 44755 + }, + { + "epoch": 7.301794453507341, + "grad_norm": 0.022887179628014565, + "learning_rate": 0.000800793805913041, + "loss": 0.0218, + "num_input_tokens_seen": 96694576, + "step": 44760 + }, + { + "epoch": 7.302610114192496, + "grad_norm": 0.014200643636286259, + "learning_rate": 0.0008007369439370211, + "loss": 0.0331, + "num_input_tokens_seen": 96705072, + "step": 44765 + }, + { + "epoch": 7.303425774877651, + "grad_norm": 0.0050778696313500404, + "learning_rate": 0.0008006800758661856, + "loss": 0.0181, + "num_input_tokens_seen": 96716368, + "step": 44770 + }, + { + "epoch": 7.304241435562806, + "grad_norm": 0.09507616609334946, + "learning_rate": 0.000800623201701687, + "loss": 0.0259, + "num_input_tokens_seen": 96726576, + "step": 44775 + }, + { + "epoch": 7.30505709624796, + "grad_norm": 0.009630247950553894, + "learning_rate": 0.0008005663214446777, + "loss": 0.0309, + "num_input_tokens_seen": 96735952, + "step": 44780 + }, + { + "epoch": 7.305872756933116, + "grad_norm": 0.0041391802951693535, + "learning_rate": 0.0008005094350963107, + "loss": 0.0134, + "num_input_tokens_seen": 96746064, + "step": 44785 + }, + { + "epoch": 7.306688417618271, + "grad_norm": 0.09530620276927948, + "learning_rate": 0.0008004525426577387, + "loss": 0.1831, + "num_input_tokens_seen": 96755504, + "step": 44790 + }, + { + "epoch": 7.307504078303426, + "grad_norm": 0.010489806532859802, + "learning_rate": 0.0008003956441301149, + "loss": 0.0492, + "num_input_tokens_seen": 96766800, + "step": 44795 + }, + { + "epoch": 7.308319738988581, + "grad_norm": 0.05959814041852951, + "learning_rate": 0.0008003387395145922, + "loss": 0.0286, + "num_input_tokens_seen": 96776944, + "step": 44800 + }, + { + "epoch": 7.309135399673735, + "grad_norm": 0.017102934420108795, + "learning_rate": 0.0008002818288123239, + "loss": 0.0684, + "num_input_tokens_seen": 96788528, + "step": 44805 + }, + { + "epoch": 7.309951060358891, + "grad_norm": 0.06432080268859863, + "learning_rate": 0.0008002249120244635, + "loss": 0.0208, + "num_input_tokens_seen": 96799632, + "step": 44810 + }, + { + "epoch": 7.310766721044046, + "grad_norm": 0.011474508792161942, + "learning_rate": 0.0008001679891521642, + "loss": 0.1318, + "num_input_tokens_seen": 96810288, + "step": 44815 + }, + { + "epoch": 7.311582381729201, + "grad_norm": 0.013832269236445427, + "learning_rate": 0.00080011106019658, + "loss": 0.0292, + "num_input_tokens_seen": 96821360, + "step": 44820 + }, + { + "epoch": 7.3123980424143555, + "grad_norm": 0.3839665949344635, + "learning_rate": 0.0008000541251588644, + "loss": 0.1622, + "num_input_tokens_seen": 96832944, + "step": 44825 + }, + { + "epoch": 7.31321370309951, + "grad_norm": 0.1246161013841629, + "learning_rate": 0.0007999971840401714, + "loss": 0.0526, + "num_input_tokens_seen": 96843824, + "step": 44830 + }, + { + "epoch": 7.314029363784665, + "grad_norm": 0.0059314328245818615, + "learning_rate": 0.0007999402368416548, + "loss": 0.0425, + "num_input_tokens_seen": 96854416, + "step": 44835 + }, + { + "epoch": 7.314845024469821, + "grad_norm": 0.31664329767227173, + "learning_rate": 0.0007998832835644687, + "loss": 0.0656, + "num_input_tokens_seen": 96864976, + "step": 44840 + }, + { + "epoch": 7.315660685154976, + "grad_norm": 0.28005892038345337, + "learning_rate": 0.0007998263242097675, + "loss": 0.1451, + "num_input_tokens_seen": 96876400, + "step": 44845 + }, + { + "epoch": 7.3164763458401305, + "grad_norm": 0.04676670581102371, + "learning_rate": 0.0007997693587787056, + "loss": 0.1371, + "num_input_tokens_seen": 96887152, + "step": 44850 + }, + { + "epoch": 7.317292006525285, + "grad_norm": 0.01653682440519333, + "learning_rate": 0.0007997123872724373, + "loss": 0.0475, + "num_input_tokens_seen": 96896976, + "step": 44855 + }, + { + "epoch": 7.31810766721044, + "grad_norm": 0.011989987455308437, + "learning_rate": 0.0007996554096921172, + "loss": 0.0299, + "num_input_tokens_seen": 96908048, + "step": 44860 + }, + { + "epoch": 7.318923327895595, + "grad_norm": 0.030787119641900063, + "learning_rate": 0.0007995984260389001, + "loss": 0.0846, + "num_input_tokens_seen": 96918960, + "step": 44865 + }, + { + "epoch": 7.319738988580751, + "grad_norm": 0.16728070378303528, + "learning_rate": 0.0007995414363139408, + "loss": 0.1015, + "num_input_tokens_seen": 96929520, + "step": 44870 + }, + { + "epoch": 7.3205546492659055, + "grad_norm": 0.009303702041506767, + "learning_rate": 0.0007994844405183944, + "loss": 0.0311, + "num_input_tokens_seen": 96940560, + "step": 44875 + }, + { + "epoch": 7.32137030995106, + "grad_norm": 0.10620930045843124, + "learning_rate": 0.0007994274386534158, + "loss": 0.0973, + "num_input_tokens_seen": 96952496, + "step": 44880 + }, + { + "epoch": 7.322185970636215, + "grad_norm": 0.251691609621048, + "learning_rate": 0.0007993704307201604, + "loss": 0.0335, + "num_input_tokens_seen": 96963120, + "step": 44885 + }, + { + "epoch": 7.32300163132137, + "grad_norm": 0.12604647874832153, + "learning_rate": 0.0007993134167197833, + "loss": 0.1191, + "num_input_tokens_seen": 96973168, + "step": 44890 + }, + { + "epoch": 7.323817292006526, + "grad_norm": 0.040935318917036057, + "learning_rate": 0.0007992563966534403, + "loss": 0.1098, + "num_input_tokens_seen": 96984240, + "step": 44895 + }, + { + "epoch": 7.3246329526916805, + "grad_norm": 0.020672090351581573, + "learning_rate": 0.0007991993705222867, + "loss": 0.0637, + "num_input_tokens_seen": 96995504, + "step": 44900 + }, + { + "epoch": 7.325448613376835, + "grad_norm": 0.3290887176990509, + "learning_rate": 0.0007991423383274782, + "loss": 0.1589, + "num_input_tokens_seen": 97007312, + "step": 44905 + }, + { + "epoch": 7.32626427406199, + "grad_norm": 0.05711958557367325, + "learning_rate": 0.0007990853000701708, + "loss": 0.067, + "num_input_tokens_seen": 97019408, + "step": 44910 + }, + { + "epoch": 7.327079934747145, + "grad_norm": 0.05303024500608444, + "learning_rate": 0.0007990282557515204, + "loss": 0.1594, + "num_input_tokens_seen": 97031344, + "step": 44915 + }, + { + "epoch": 7.327895595432301, + "grad_norm": 0.005337063688784838, + "learning_rate": 0.0007989712053726829, + "loss": 0.0444, + "num_input_tokens_seen": 97041648, + "step": 44920 + }, + { + "epoch": 7.328711256117455, + "grad_norm": 0.032504886388778687, + "learning_rate": 0.0007989141489348149, + "loss": 0.0186, + "num_input_tokens_seen": 97052432, + "step": 44925 + }, + { + "epoch": 7.32952691680261, + "grad_norm": 0.07863004505634308, + "learning_rate": 0.0007988570864390723, + "loss": 0.1965, + "num_input_tokens_seen": 97062928, + "step": 44930 + }, + { + "epoch": 7.330342577487765, + "grad_norm": 0.07831018418073654, + "learning_rate": 0.0007988000178866117, + "loss": 0.1547, + "num_input_tokens_seen": 97073968, + "step": 44935 + }, + { + "epoch": 7.33115823817292, + "grad_norm": 0.17209608852863312, + "learning_rate": 0.0007987429432785897, + "loss": 0.0334, + "num_input_tokens_seen": 97085520, + "step": 44940 + }, + { + "epoch": 7.331973898858075, + "grad_norm": 0.037289805710315704, + "learning_rate": 0.000798685862616163, + "loss": 0.0173, + "num_input_tokens_seen": 97096080, + "step": 44945 + }, + { + "epoch": 7.33278955954323, + "grad_norm": 0.05421232804656029, + "learning_rate": 0.0007986287759004884, + "loss": 0.0347, + "num_input_tokens_seen": 97107152, + "step": 44950 + }, + { + "epoch": 7.333605220228385, + "grad_norm": 0.01405141968280077, + "learning_rate": 0.000798571683132723, + "loss": 0.0802, + "num_input_tokens_seen": 97117296, + "step": 44955 + }, + { + "epoch": 7.33442088091354, + "grad_norm": 0.015522617846727371, + "learning_rate": 0.0007985145843140233, + "loss": 0.0175, + "num_input_tokens_seen": 97127440, + "step": 44960 + }, + { + "epoch": 7.335236541598695, + "grad_norm": 0.09611742943525314, + "learning_rate": 0.0007984574794455472, + "loss": 0.0393, + "num_input_tokens_seen": 97138288, + "step": 44965 + }, + { + "epoch": 7.33605220228385, + "grad_norm": 0.2853439450263977, + "learning_rate": 0.0007984003685284516, + "loss": 0.0372, + "num_input_tokens_seen": 97150864, + "step": 44970 + }, + { + "epoch": 7.3368678629690045, + "grad_norm": 0.14586283266544342, + "learning_rate": 0.0007983432515638937, + "loss": 0.023, + "num_input_tokens_seen": 97161776, + "step": 44975 + }, + { + "epoch": 7.33768352365416, + "grad_norm": 0.03794190287590027, + "learning_rate": 0.0007982861285530317, + "loss": 0.0311, + "num_input_tokens_seen": 97175184, + "step": 44980 + }, + { + "epoch": 7.338499184339315, + "grad_norm": 0.07810980826616287, + "learning_rate": 0.0007982289994970227, + "loss": 0.0555, + "num_input_tokens_seen": 97185488, + "step": 44985 + }, + { + "epoch": 7.33931484502447, + "grad_norm": 0.07301853597164154, + "learning_rate": 0.0007981718643970246, + "loss": 0.1488, + "num_input_tokens_seen": 97196368, + "step": 44990 + }, + { + "epoch": 7.340130505709625, + "grad_norm": 0.00231625372543931, + "learning_rate": 0.0007981147232541956, + "loss": 0.0321, + "num_input_tokens_seen": 97205872, + "step": 44995 + }, + { + "epoch": 7.3409461663947795, + "grad_norm": 0.007339373230934143, + "learning_rate": 0.0007980575760696935, + "loss": 0.0172, + "num_input_tokens_seen": 97217072, + "step": 45000 + }, + { + "epoch": 7.341761827079935, + "grad_norm": 0.002107437001541257, + "learning_rate": 0.0007980004228446765, + "loss": 0.0124, + "num_input_tokens_seen": 97227216, + "step": 45005 + }, + { + "epoch": 7.34257748776509, + "grad_norm": 0.4507990777492523, + "learning_rate": 0.0007979432635803029, + "loss": 0.2407, + "num_input_tokens_seen": 97238064, + "step": 45010 + }, + { + "epoch": 7.343393148450245, + "grad_norm": 0.054382532835006714, + "learning_rate": 0.000797886098277731, + "loss": 0.0231, + "num_input_tokens_seen": 97249328, + "step": 45015 + }, + { + "epoch": 7.3442088091354, + "grad_norm": 0.04891600459814072, + "learning_rate": 0.0007978289269381196, + "loss": 0.0437, + "num_input_tokens_seen": 97260144, + "step": 45020 + }, + { + "epoch": 7.3450244698205545, + "grad_norm": 0.4334297180175781, + "learning_rate": 0.0007977717495626271, + "loss": 0.1371, + "num_input_tokens_seen": 97271024, + "step": 45025 + }, + { + "epoch": 7.345840130505709, + "grad_norm": 0.024992918595671654, + "learning_rate": 0.0007977145661524123, + "loss": 0.1881, + "num_input_tokens_seen": 97281776, + "step": 45030 + }, + { + "epoch": 7.346655791190865, + "grad_norm": 0.06149057671427727, + "learning_rate": 0.000797657376708634, + "loss": 0.1039, + "num_input_tokens_seen": 97292368, + "step": 45035 + }, + { + "epoch": 7.34747145187602, + "grad_norm": 0.3460540175437927, + "learning_rate": 0.0007976001812324516, + "loss": 0.0483, + "num_input_tokens_seen": 97303088, + "step": 45040 + }, + { + "epoch": 7.348287112561175, + "grad_norm": 0.05683022737503052, + "learning_rate": 0.0007975429797250239, + "loss": 0.0936, + "num_input_tokens_seen": 97313776, + "step": 45045 + }, + { + "epoch": 7.349102773246329, + "grad_norm": 0.1997668594121933, + "learning_rate": 0.0007974857721875102, + "loss": 0.0713, + "num_input_tokens_seen": 97323952, + "step": 45050 + }, + { + "epoch": 7.349918433931484, + "grad_norm": 0.06268063187599182, + "learning_rate": 0.0007974285586210701, + "loss": 0.0096, + "num_input_tokens_seen": 97334448, + "step": 45055 + }, + { + "epoch": 7.350734094616639, + "grad_norm": 0.010480429045855999, + "learning_rate": 0.0007973713390268629, + "loss": 0.0375, + "num_input_tokens_seen": 97345360, + "step": 45060 + }, + { + "epoch": 7.351549755301795, + "grad_norm": 0.006281863432377577, + "learning_rate": 0.0007973141134060483, + "loss": 0.0453, + "num_input_tokens_seen": 97356080, + "step": 45065 + }, + { + "epoch": 7.35236541598695, + "grad_norm": 0.10553544759750366, + "learning_rate": 0.0007972568817597857, + "loss": 0.0659, + "num_input_tokens_seen": 97367248, + "step": 45070 + }, + { + "epoch": 7.353181076672104, + "grad_norm": 0.1673373132944107, + "learning_rate": 0.0007971996440892356, + "loss": 0.0627, + "num_input_tokens_seen": 97378000, + "step": 45075 + }, + { + "epoch": 7.353996737357259, + "grad_norm": 0.13530519604682922, + "learning_rate": 0.0007971424003955577, + "loss": 0.0529, + "num_input_tokens_seen": 97388432, + "step": 45080 + }, + { + "epoch": 7.354812398042414, + "grad_norm": 0.02766413986682892, + "learning_rate": 0.0007970851506799119, + "loss": 0.0407, + "num_input_tokens_seen": 97397520, + "step": 45085 + }, + { + "epoch": 7.35562805872757, + "grad_norm": 0.0584256574511528, + "learning_rate": 0.0007970278949434588, + "loss": 0.0141, + "num_input_tokens_seen": 97407952, + "step": 45090 + }, + { + "epoch": 7.356443719412725, + "grad_norm": 0.28726014494895935, + "learning_rate": 0.0007969706331873586, + "loss": 0.2492, + "num_input_tokens_seen": 97419824, + "step": 45095 + }, + { + "epoch": 7.357259380097879, + "grad_norm": 0.2784290313720703, + "learning_rate": 0.0007969133654127718, + "loss": 0.1095, + "num_input_tokens_seen": 97431056, + "step": 45100 + }, + { + "epoch": 7.358075040783034, + "grad_norm": 0.07640790939331055, + "learning_rate": 0.0007968560916208589, + "loss": 0.016, + "num_input_tokens_seen": 97440976, + "step": 45105 + }, + { + "epoch": 7.358890701468189, + "grad_norm": 0.06213083863258362, + "learning_rate": 0.0007967988118127808, + "loss": 0.1767, + "num_input_tokens_seen": 97452720, + "step": 45110 + }, + { + "epoch": 7.359706362153344, + "grad_norm": 0.0668218657374382, + "learning_rate": 0.0007967415259896982, + "loss": 0.0154, + "num_input_tokens_seen": 97464240, + "step": 45115 + }, + { + "epoch": 7.3605220228384995, + "grad_norm": 0.038856931030750275, + "learning_rate": 0.0007966842341527722, + "loss": 0.021, + "num_input_tokens_seen": 97474032, + "step": 45120 + }, + { + "epoch": 7.361337683523654, + "grad_norm": 0.03655475750565529, + "learning_rate": 0.0007966269363031637, + "loss": 0.0358, + "num_input_tokens_seen": 97483952, + "step": 45125 + }, + { + "epoch": 7.362153344208809, + "grad_norm": 0.005070575047284365, + "learning_rate": 0.0007965696324420342, + "loss": 0.045, + "num_input_tokens_seen": 97495728, + "step": 45130 + }, + { + "epoch": 7.362969004893964, + "grad_norm": 0.007292062509804964, + "learning_rate": 0.0007965123225705447, + "loss": 0.1607, + "num_input_tokens_seen": 97507216, + "step": 45135 + }, + { + "epoch": 7.363784665579119, + "grad_norm": 0.028081052005290985, + "learning_rate": 0.000796455006689857, + "loss": 0.0573, + "num_input_tokens_seen": 97516880, + "step": 45140 + }, + { + "epoch": 7.364600326264274, + "grad_norm": 0.19714754819869995, + "learning_rate": 0.0007963976848011324, + "loss": 0.0737, + "num_input_tokens_seen": 97526928, + "step": 45145 + }, + { + "epoch": 7.365415986949429, + "grad_norm": 0.006840975489467382, + "learning_rate": 0.0007963403569055328, + "loss": 0.0077, + "num_input_tokens_seen": 97538192, + "step": 45150 + }, + { + "epoch": 7.366231647634584, + "grad_norm": 0.016417210921645164, + "learning_rate": 0.0007962830230042197, + "loss": 0.009, + "num_input_tokens_seen": 97547024, + "step": 45155 + }, + { + "epoch": 7.367047308319739, + "grad_norm": 0.15059006214141846, + "learning_rate": 0.0007962256830983556, + "loss": 0.0252, + "num_input_tokens_seen": 97557840, + "step": 45160 + }, + { + "epoch": 7.367862969004894, + "grad_norm": 0.01066130492836237, + "learning_rate": 0.0007961683371891019, + "loss": 0.0116, + "num_input_tokens_seen": 97569616, + "step": 45165 + }, + { + "epoch": 7.368678629690049, + "grad_norm": 0.017685212194919586, + "learning_rate": 0.0007961109852776214, + "loss": 0.1703, + "num_input_tokens_seen": 97580944, + "step": 45170 + }, + { + "epoch": 7.369494290375204, + "grad_norm": 0.046484678983688354, + "learning_rate": 0.0007960536273650761, + "loss": 0.0465, + "num_input_tokens_seen": 97590512, + "step": 45175 + }, + { + "epoch": 7.370309951060359, + "grad_norm": 0.07047548145055771, + "learning_rate": 0.0007959962634526285, + "loss": 0.0777, + "num_input_tokens_seen": 97600944, + "step": 45180 + }, + { + "epoch": 7.371125611745514, + "grad_norm": 0.06450872123241425, + "learning_rate": 0.0007959388935414411, + "loss": 0.0801, + "num_input_tokens_seen": 97611952, + "step": 45185 + }, + { + "epoch": 7.371941272430669, + "grad_norm": 0.24986594915390015, + "learning_rate": 0.0007958815176326764, + "loss": 0.0463, + "num_input_tokens_seen": 97624208, + "step": 45190 + }, + { + "epoch": 7.372756933115824, + "grad_norm": 0.19626054167747498, + "learning_rate": 0.0007958241357274976, + "loss": 0.1218, + "num_input_tokens_seen": 97635152, + "step": 45195 + }, + { + "epoch": 7.373572593800978, + "grad_norm": 0.01864909753203392, + "learning_rate": 0.0007957667478270674, + "loss": 0.0121, + "num_input_tokens_seen": 97646064, + "step": 45200 + }, + { + "epoch": 7.374388254486134, + "grad_norm": 0.025611115619540215, + "learning_rate": 0.0007957093539325489, + "loss": 0.0788, + "num_input_tokens_seen": 97657232, + "step": 45205 + }, + { + "epoch": 7.375203915171289, + "grad_norm": 0.17264457046985626, + "learning_rate": 0.000795651954045105, + "loss": 0.0596, + "num_input_tokens_seen": 97668176, + "step": 45210 + }, + { + "epoch": 7.376019575856444, + "grad_norm": 0.021459899842739105, + "learning_rate": 0.0007955945481658992, + "loss": 0.069, + "num_input_tokens_seen": 97678480, + "step": 45215 + }, + { + "epoch": 7.376835236541599, + "grad_norm": 0.008294719271361828, + "learning_rate": 0.0007955371362960951, + "loss": 0.0507, + "num_input_tokens_seen": 97688944, + "step": 45220 + }, + { + "epoch": 7.377650897226753, + "grad_norm": 0.11790649592876434, + "learning_rate": 0.000795479718436856, + "loss": 0.1997, + "num_input_tokens_seen": 97700592, + "step": 45225 + }, + { + "epoch": 7.378466557911908, + "grad_norm": 0.0033008514437824488, + "learning_rate": 0.0007954222945893455, + "loss": 0.0278, + "num_input_tokens_seen": 97710224, + "step": 45230 + }, + { + "epoch": 7.379282218597064, + "grad_norm": 0.1821359246969223, + "learning_rate": 0.0007953648647547274, + "loss": 0.1602, + "num_input_tokens_seen": 97720976, + "step": 45235 + }, + { + "epoch": 7.380097879282219, + "grad_norm": 0.03195223957300186, + "learning_rate": 0.0007953074289341655, + "loss": 0.1326, + "num_input_tokens_seen": 97732752, + "step": 45240 + }, + { + "epoch": 7.3809135399673735, + "grad_norm": 0.008191731758415699, + "learning_rate": 0.0007952499871288241, + "loss": 0.0657, + "num_input_tokens_seen": 97741648, + "step": 45245 + }, + { + "epoch": 7.381729200652528, + "grad_norm": 0.28001055121421814, + "learning_rate": 0.0007951925393398672, + "loss": 0.1205, + "num_input_tokens_seen": 97752816, + "step": 45250 + }, + { + "epoch": 7.382544861337683, + "grad_norm": 0.13458704948425293, + "learning_rate": 0.0007951350855684588, + "loss": 0.0565, + "num_input_tokens_seen": 97762864, + "step": 45255 + }, + { + "epoch": 7.383360522022839, + "grad_norm": 0.14959220588207245, + "learning_rate": 0.0007950776258157637, + "loss": 0.0523, + "num_input_tokens_seen": 97773424, + "step": 45260 + }, + { + "epoch": 7.384176182707994, + "grad_norm": 0.20786850154399872, + "learning_rate": 0.000795020160082946, + "loss": 0.1556, + "num_input_tokens_seen": 97783088, + "step": 45265 + }, + { + "epoch": 7.3849918433931485, + "grad_norm": 0.012430977076292038, + "learning_rate": 0.0007949626883711707, + "loss": 0.0616, + "num_input_tokens_seen": 97794032, + "step": 45270 + }, + { + "epoch": 7.385807504078303, + "grad_norm": 0.01813976839184761, + "learning_rate": 0.0007949052106816022, + "loss": 0.0188, + "num_input_tokens_seen": 97804336, + "step": 45275 + }, + { + "epoch": 7.386623164763458, + "grad_norm": 0.2485961616039276, + "learning_rate": 0.0007948477270154056, + "loss": 0.1332, + "num_input_tokens_seen": 97815312, + "step": 45280 + }, + { + "epoch": 7.387438825448613, + "grad_norm": 0.021446917206048965, + "learning_rate": 0.0007947902373737456, + "loss": 0.024, + "num_input_tokens_seen": 97825872, + "step": 45285 + }, + { + "epoch": 7.388254486133769, + "grad_norm": 0.002523197792470455, + "learning_rate": 0.0007947327417577875, + "loss": 0.0537, + "num_input_tokens_seen": 97836656, + "step": 45290 + }, + { + "epoch": 7.3890701468189235, + "grad_norm": 0.006098889745771885, + "learning_rate": 0.0007946752401686966, + "loss": 0.0246, + "num_input_tokens_seen": 97847824, + "step": 45295 + }, + { + "epoch": 7.389885807504078, + "grad_norm": 0.04989955946803093, + "learning_rate": 0.000794617732607638, + "loss": 0.0457, + "num_input_tokens_seen": 97857904, + "step": 45300 + }, + { + "epoch": 7.390701468189233, + "grad_norm": 0.24254950881004333, + "learning_rate": 0.0007945602190757775, + "loss": 0.0901, + "num_input_tokens_seen": 97867824, + "step": 45305 + }, + { + "epoch": 7.391517128874388, + "grad_norm": 0.007170901633799076, + "learning_rate": 0.0007945026995742803, + "loss": 0.0133, + "num_input_tokens_seen": 97876976, + "step": 45310 + }, + { + "epoch": 7.392332789559543, + "grad_norm": 0.033077552914619446, + "learning_rate": 0.0007944451741043124, + "loss": 0.0667, + "num_input_tokens_seen": 97887568, + "step": 45315 + }, + { + "epoch": 7.3931484502446985, + "grad_norm": 0.03981386125087738, + "learning_rate": 0.0007943876426670395, + "loss": 0.0257, + "num_input_tokens_seen": 97898576, + "step": 45320 + }, + { + "epoch": 7.393964110929853, + "grad_norm": 0.027024945244193077, + "learning_rate": 0.0007943301052636276, + "loss": 0.0196, + "num_input_tokens_seen": 97909072, + "step": 45325 + }, + { + "epoch": 7.394779771615008, + "grad_norm": 0.02054162509739399, + "learning_rate": 0.0007942725618952426, + "loss": 0.0272, + "num_input_tokens_seen": 97920016, + "step": 45330 + }, + { + "epoch": 7.395595432300163, + "grad_norm": 0.0020551327615976334, + "learning_rate": 0.000794215012563051, + "loss": 0.0239, + "num_input_tokens_seen": 97931888, + "step": 45335 + }, + { + "epoch": 7.396411092985318, + "grad_norm": 0.1443719118833542, + "learning_rate": 0.0007941574572682187, + "loss": 0.0298, + "num_input_tokens_seen": 97942896, + "step": 45340 + }, + { + "epoch": 7.397226753670473, + "grad_norm": 0.0606955960392952, + "learning_rate": 0.0007940998960119126, + "loss": 0.0194, + "num_input_tokens_seen": 97955152, + "step": 45345 + }, + { + "epoch": 7.398042414355628, + "grad_norm": 0.016795361414551735, + "learning_rate": 0.0007940423287952989, + "loss": 0.0377, + "num_input_tokens_seen": 97965776, + "step": 45350 + }, + { + "epoch": 7.398858075040783, + "grad_norm": 0.004982766695320606, + "learning_rate": 0.0007939847556195443, + "loss": 0.014, + "num_input_tokens_seen": 97976240, + "step": 45355 + }, + { + "epoch": 7.399673735725938, + "grad_norm": 0.004065237939357758, + "learning_rate": 0.0007939271764858158, + "loss": 0.1471, + "num_input_tokens_seen": 97986800, + "step": 45360 + }, + { + "epoch": 7.400489396411093, + "grad_norm": 0.3481471836566925, + "learning_rate": 0.0007938695913952802, + "loss": 0.1904, + "num_input_tokens_seen": 97997840, + "step": 45365 + }, + { + "epoch": 7.401305057096248, + "grad_norm": 0.006440913304686546, + "learning_rate": 0.0007938120003491045, + "loss": 0.0164, + "num_input_tokens_seen": 98008688, + "step": 45370 + }, + { + "epoch": 7.402120717781403, + "grad_norm": 0.0016812963876873255, + "learning_rate": 0.0007937544033484558, + "loss": 0.0401, + "num_input_tokens_seen": 98019920, + "step": 45375 + }, + { + "epoch": 7.402936378466558, + "grad_norm": 0.004809595178812742, + "learning_rate": 0.0007936968003945015, + "loss": 0.0346, + "num_input_tokens_seen": 98030576, + "step": 45380 + }, + { + "epoch": 7.403752039151713, + "grad_norm": 0.057477615773677826, + "learning_rate": 0.0007936391914884092, + "loss": 0.1052, + "num_input_tokens_seen": 98041104, + "step": 45385 + }, + { + "epoch": 7.404567699836868, + "grad_norm": 0.009014656767249107, + "learning_rate": 0.0007935815766313459, + "loss": 0.0873, + "num_input_tokens_seen": 98052048, + "step": 45390 + }, + { + "epoch": 7.4053833605220225, + "grad_norm": 0.1354934573173523, + "learning_rate": 0.0007935239558244795, + "loss": 0.0482, + "num_input_tokens_seen": 98062384, + "step": 45395 + }, + { + "epoch": 7.406199021207178, + "grad_norm": 0.17773400247097015, + "learning_rate": 0.000793466329068978, + "loss": 0.1396, + "num_input_tokens_seen": 98073232, + "step": 45400 + }, + { + "epoch": 7.407014681892333, + "grad_norm": 0.07813189923763275, + "learning_rate": 0.000793408696366009, + "loss": 0.0567, + "num_input_tokens_seen": 98085232, + "step": 45405 + }, + { + "epoch": 7.407830342577488, + "grad_norm": 0.035286080092191696, + "learning_rate": 0.0007933510577167404, + "loss": 0.0474, + "num_input_tokens_seen": 98095824, + "step": 45410 + }, + { + "epoch": 7.408646003262643, + "grad_norm": 0.1701454520225525, + "learning_rate": 0.0007932934131223406, + "loss": 0.3416, + "num_input_tokens_seen": 98106640, + "step": 45415 + }, + { + "epoch": 7.4094616639477975, + "grad_norm": 0.011776736006140709, + "learning_rate": 0.0007932357625839776, + "loss": 0.0757, + "num_input_tokens_seen": 98118384, + "step": 45420 + }, + { + "epoch": 7.410277324632952, + "grad_norm": 0.2138156294822693, + "learning_rate": 0.0007931781061028201, + "loss": 0.1299, + "num_input_tokens_seen": 98129168, + "step": 45425 + }, + { + "epoch": 7.411092985318108, + "grad_norm": 0.01285554189234972, + "learning_rate": 0.0007931204436800361, + "loss": 0.0365, + "num_input_tokens_seen": 98139408, + "step": 45430 + }, + { + "epoch": 7.411908646003263, + "grad_norm": 0.2702222764492035, + "learning_rate": 0.0007930627753167945, + "loss": 0.1871, + "num_input_tokens_seen": 98149744, + "step": 45435 + }, + { + "epoch": 7.412724306688418, + "grad_norm": 0.1068132221698761, + "learning_rate": 0.0007930051010142641, + "loss": 0.0408, + "num_input_tokens_seen": 98160528, + "step": 45440 + }, + { + "epoch": 7.4135399673735725, + "grad_norm": 0.13657432794570923, + "learning_rate": 0.0007929474207736136, + "loss": 0.0438, + "num_input_tokens_seen": 98170768, + "step": 45445 + }, + { + "epoch": 7.414355628058727, + "grad_norm": 0.0018624071963131428, + "learning_rate": 0.000792889734596012, + "loss": 0.0343, + "num_input_tokens_seen": 98180816, + "step": 45450 + }, + { + "epoch": 7.415171288743883, + "grad_norm": 0.00952328834682703, + "learning_rate": 0.0007928320424826284, + "loss": 0.1863, + "num_input_tokens_seen": 98191376, + "step": 45455 + }, + { + "epoch": 7.415986949429038, + "grad_norm": 0.0030028768815100193, + "learning_rate": 0.0007927743444346317, + "loss": 0.0213, + "num_input_tokens_seen": 98202320, + "step": 45460 + }, + { + "epoch": 7.416802610114193, + "grad_norm": 0.555898904800415, + "learning_rate": 0.0007927166404531916, + "loss": 0.0916, + "num_input_tokens_seen": 98212912, + "step": 45465 + }, + { + "epoch": 7.417618270799347, + "grad_norm": 0.010686096735298634, + "learning_rate": 0.0007926589305394776, + "loss": 0.1291, + "num_input_tokens_seen": 98224688, + "step": 45470 + }, + { + "epoch": 7.418433931484502, + "grad_norm": 0.1383063644170761, + "learning_rate": 0.0007926012146946591, + "loss": 0.0652, + "num_input_tokens_seen": 98235216, + "step": 45475 + }, + { + "epoch": 7.419249592169657, + "grad_norm": 0.14411990344524384, + "learning_rate": 0.0007925434929199058, + "loss": 0.0662, + "num_input_tokens_seen": 98245104, + "step": 45480 + }, + { + "epoch": 7.420065252854813, + "grad_norm": 0.15971876680850983, + "learning_rate": 0.0007924857652163873, + "loss": 0.1169, + "num_input_tokens_seen": 98256912, + "step": 45485 + }, + { + "epoch": 7.420880913539968, + "grad_norm": 0.009572381153702736, + "learning_rate": 0.0007924280315852739, + "loss": 0.0096, + "num_input_tokens_seen": 98266992, + "step": 45490 + }, + { + "epoch": 7.421696574225122, + "grad_norm": 0.01877368800342083, + "learning_rate": 0.0007923702920277355, + "loss": 0.0309, + "num_input_tokens_seen": 98277488, + "step": 45495 + }, + { + "epoch": 7.422512234910277, + "grad_norm": 0.5380867123603821, + "learning_rate": 0.0007923125465449421, + "loss": 0.1168, + "num_input_tokens_seen": 98287408, + "step": 45500 + }, + { + "epoch": 7.423327895595432, + "grad_norm": 0.017852267250418663, + "learning_rate": 0.0007922547951380643, + "loss": 0.0567, + "num_input_tokens_seen": 98299568, + "step": 45505 + }, + { + "epoch": 7.424143556280587, + "grad_norm": 0.0074815042316913605, + "learning_rate": 0.0007921970378082722, + "loss": 0.0244, + "num_input_tokens_seen": 98310800, + "step": 45510 + }, + { + "epoch": 7.424959216965743, + "grad_norm": 0.4314858317375183, + "learning_rate": 0.0007921392745567364, + "loss": 0.0492, + "num_input_tokens_seen": 98321104, + "step": 45515 + }, + { + "epoch": 7.425774877650897, + "grad_norm": 0.012821875512599945, + "learning_rate": 0.0007920815053846277, + "loss": 0.1269, + "num_input_tokens_seen": 98331312, + "step": 45520 + }, + { + "epoch": 7.426590538336052, + "grad_norm": 0.10156050324440002, + "learning_rate": 0.0007920237302931167, + "loss": 0.049, + "num_input_tokens_seen": 98341584, + "step": 45525 + }, + { + "epoch": 7.427406199021207, + "grad_norm": 0.08236122131347656, + "learning_rate": 0.0007919659492833744, + "loss": 0.0163, + "num_input_tokens_seen": 98351024, + "step": 45530 + }, + { + "epoch": 7.428221859706362, + "grad_norm": 0.549354076385498, + "learning_rate": 0.0007919081623565717, + "loss": 0.0509, + "num_input_tokens_seen": 98362224, + "step": 45535 + }, + { + "epoch": 7.4290375203915175, + "grad_norm": 0.21111354231834412, + "learning_rate": 0.0007918503695138799, + "loss": 0.0355, + "num_input_tokens_seen": 98373616, + "step": 45540 + }, + { + "epoch": 7.429853181076672, + "grad_norm": 0.015019206330180168, + "learning_rate": 0.0007917925707564699, + "loss": 0.0085, + "num_input_tokens_seen": 98384464, + "step": 45545 + }, + { + "epoch": 7.430668841761827, + "grad_norm": 0.2213941514492035, + "learning_rate": 0.0007917347660855134, + "loss": 0.0996, + "num_input_tokens_seen": 98394544, + "step": 45550 + }, + { + "epoch": 7.431484502446982, + "grad_norm": 0.2764107286930084, + "learning_rate": 0.0007916769555021819, + "loss": 0.258, + "num_input_tokens_seen": 98405904, + "step": 45555 + }, + { + "epoch": 7.432300163132137, + "grad_norm": 0.005178771913051605, + "learning_rate": 0.0007916191390076468, + "loss": 0.0095, + "num_input_tokens_seen": 98417296, + "step": 45560 + }, + { + "epoch": 7.433115823817292, + "grad_norm": 0.006140346638858318, + "learning_rate": 0.0007915613166030799, + "loss": 0.1488, + "num_input_tokens_seen": 98428496, + "step": 45565 + }, + { + "epoch": 7.433931484502447, + "grad_norm": 0.05736982077360153, + "learning_rate": 0.0007915034882896528, + "loss": 0.0199, + "num_input_tokens_seen": 98438480, + "step": 45570 + }, + { + "epoch": 7.434747145187602, + "grad_norm": 0.018076226115226746, + "learning_rate": 0.0007914456540685379, + "loss": 0.1387, + "num_input_tokens_seen": 98448784, + "step": 45575 + }, + { + "epoch": 7.435562805872757, + "grad_norm": 0.5139619708061218, + "learning_rate": 0.0007913878139409072, + "loss": 0.1692, + "num_input_tokens_seen": 98459952, + "step": 45580 + }, + { + "epoch": 7.436378466557912, + "grad_norm": 0.09341350197792053, + "learning_rate": 0.0007913299679079326, + "loss": 0.1271, + "num_input_tokens_seen": 98470256, + "step": 45585 + }, + { + "epoch": 7.437194127243067, + "grad_norm": 0.025008317083120346, + "learning_rate": 0.000791272115970787, + "loss": 0.1429, + "num_input_tokens_seen": 98481552, + "step": 45590 + }, + { + "epoch": 7.438009787928221, + "grad_norm": 0.031664974987506866, + "learning_rate": 0.0007912142581306421, + "loss": 0.0941, + "num_input_tokens_seen": 98492208, + "step": 45595 + }, + { + "epoch": 7.438825448613377, + "grad_norm": 0.28572022914886475, + "learning_rate": 0.0007911563943886709, + "loss": 0.1129, + "num_input_tokens_seen": 98502672, + "step": 45600 + }, + { + "epoch": 7.439641109298532, + "grad_norm": 0.051957886666059494, + "learning_rate": 0.000791098524746046, + "loss": 0.074, + "num_input_tokens_seen": 98514000, + "step": 45605 + }, + { + "epoch": 7.440456769983687, + "grad_norm": 0.10540501028299332, + "learning_rate": 0.0007910406492039404, + "loss": 0.0328, + "num_input_tokens_seen": 98525712, + "step": 45610 + }, + { + "epoch": 7.441272430668842, + "grad_norm": 0.00548113789409399, + "learning_rate": 0.0007909827677635267, + "loss": 0.1057, + "num_input_tokens_seen": 98535984, + "step": 45615 + }, + { + "epoch": 7.442088091353996, + "grad_norm": 0.07310794293880463, + "learning_rate": 0.000790924880425978, + "loss": 0.049, + "num_input_tokens_seen": 98548016, + "step": 45620 + }, + { + "epoch": 7.442903752039152, + "grad_norm": 0.05371404439210892, + "learning_rate": 0.0007908669871924676, + "loss": 0.102, + "num_input_tokens_seen": 98558736, + "step": 45625 + }, + { + "epoch": 7.443719412724307, + "grad_norm": 0.033556073904037476, + "learning_rate": 0.0007908090880641688, + "loss": 0.148, + "num_input_tokens_seen": 98570192, + "step": 45630 + }, + { + "epoch": 7.444535073409462, + "grad_norm": 0.011591989547014236, + "learning_rate": 0.0007907511830422547, + "loss": 0.1166, + "num_input_tokens_seen": 98582320, + "step": 45635 + }, + { + "epoch": 7.445350734094617, + "grad_norm": 0.30247440934181213, + "learning_rate": 0.0007906932721278992, + "loss": 0.181, + "num_input_tokens_seen": 98592656, + "step": 45640 + }, + { + "epoch": 7.446166394779771, + "grad_norm": 0.004149232525378466, + "learning_rate": 0.0007906353553222757, + "loss": 0.0424, + "num_input_tokens_seen": 98601936, + "step": 45645 + }, + { + "epoch": 7.446982055464926, + "grad_norm": 0.2527449131011963, + "learning_rate": 0.000790577432626558, + "loss": 0.1473, + "num_input_tokens_seen": 98613424, + "step": 45650 + }, + { + "epoch": 7.447797716150082, + "grad_norm": 0.012989042326807976, + "learning_rate": 0.0007905195040419202, + "loss": 0.0551, + "num_input_tokens_seen": 98623056, + "step": 45655 + }, + { + "epoch": 7.448613376835237, + "grad_norm": 0.014558068476617336, + "learning_rate": 0.0007904615695695359, + "loss": 0.0796, + "num_input_tokens_seen": 98633904, + "step": 45660 + }, + { + "epoch": 7.4494290375203915, + "grad_norm": 0.037499744445085526, + "learning_rate": 0.0007904036292105794, + "loss": 0.0371, + "num_input_tokens_seen": 98644560, + "step": 45665 + }, + { + "epoch": 7.450244698205546, + "grad_norm": 0.06032086908817291, + "learning_rate": 0.000790345682966225, + "loss": 0.0921, + "num_input_tokens_seen": 98655216, + "step": 45670 + }, + { + "epoch": 7.451060358890701, + "grad_norm": 0.035724010318517685, + "learning_rate": 0.000790287730837647, + "loss": 0.0217, + "num_input_tokens_seen": 98666288, + "step": 45675 + }, + { + "epoch": 7.451876019575856, + "grad_norm": 0.19834664463996887, + "learning_rate": 0.0007902297728260199, + "loss": 0.1434, + "num_input_tokens_seen": 98677584, + "step": 45680 + }, + { + "epoch": 7.452691680261012, + "grad_norm": 0.11141058057546616, + "learning_rate": 0.0007901718089325183, + "loss": 0.0282, + "num_input_tokens_seen": 98688016, + "step": 45685 + }, + { + "epoch": 7.4535073409461665, + "grad_norm": 0.04874371364712715, + "learning_rate": 0.0007901138391583169, + "loss": 0.0702, + "num_input_tokens_seen": 98698960, + "step": 45690 + }, + { + "epoch": 7.454323001631321, + "grad_norm": 0.09582873433828354, + "learning_rate": 0.0007900558635045904, + "loss": 0.1012, + "num_input_tokens_seen": 98709456, + "step": 45695 + }, + { + "epoch": 7.455138662316476, + "grad_norm": 0.10680019855499268, + "learning_rate": 0.000789997881972514, + "loss": 0.13, + "num_input_tokens_seen": 98720752, + "step": 45700 + }, + { + "epoch": 7.455954323001631, + "grad_norm": 0.007165440358221531, + "learning_rate": 0.0007899398945632626, + "loss": 0.0142, + "num_input_tokens_seen": 98731856, + "step": 45705 + }, + { + "epoch": 7.456769983686787, + "grad_norm": 0.03316681459546089, + "learning_rate": 0.0007898819012780114, + "loss": 0.245, + "num_input_tokens_seen": 98742160, + "step": 45710 + }, + { + "epoch": 7.4575856443719415, + "grad_norm": 0.22914770245552063, + "learning_rate": 0.0007898239021179356, + "loss": 0.1253, + "num_input_tokens_seen": 98751568, + "step": 45715 + }, + { + "epoch": 7.458401305057096, + "grad_norm": 0.251247763633728, + "learning_rate": 0.000789765897084211, + "loss": 0.1455, + "num_input_tokens_seen": 98763664, + "step": 45720 + }, + { + "epoch": 7.459216965742251, + "grad_norm": 0.023140182718634605, + "learning_rate": 0.0007897078861780127, + "loss": 0.0735, + "num_input_tokens_seen": 98774640, + "step": 45725 + }, + { + "epoch": 7.460032626427406, + "grad_norm": 0.15593160688877106, + "learning_rate": 0.0007896498694005168, + "loss": 0.0445, + "num_input_tokens_seen": 98784656, + "step": 45730 + }, + { + "epoch": 7.460848287112561, + "grad_norm": 0.060400452464818954, + "learning_rate": 0.0007895918467528987, + "loss": 0.087, + "num_input_tokens_seen": 98795792, + "step": 45735 + }, + { + "epoch": 7.4616639477977165, + "grad_norm": 0.008460725657641888, + "learning_rate": 0.0007895338182363343, + "loss": 0.0236, + "num_input_tokens_seen": 98806416, + "step": 45740 + }, + { + "epoch": 7.462479608482871, + "grad_norm": 0.11589276045560837, + "learning_rate": 0.0007894757838519999, + "loss": 0.0469, + "num_input_tokens_seen": 98817616, + "step": 45745 + }, + { + "epoch": 7.463295269168026, + "grad_norm": 0.20811273157596588, + "learning_rate": 0.0007894177436010716, + "loss": 0.134, + "num_input_tokens_seen": 98828304, + "step": 45750 + }, + { + "epoch": 7.464110929853181, + "grad_norm": 0.02944091148674488, + "learning_rate": 0.0007893596974847255, + "loss": 0.0251, + "num_input_tokens_seen": 98837648, + "step": 45755 + }, + { + "epoch": 7.464926590538336, + "grad_norm": 0.2792534828186035, + "learning_rate": 0.000789301645504138, + "loss": 0.0759, + "num_input_tokens_seen": 98849168, + "step": 45760 + }, + { + "epoch": 7.465742251223491, + "grad_norm": 0.011199427768588066, + "learning_rate": 0.0007892435876604857, + "loss": 0.1193, + "num_input_tokens_seen": 98859984, + "step": 45765 + }, + { + "epoch": 7.466557911908646, + "grad_norm": 0.17388412356376648, + "learning_rate": 0.0007891855239549453, + "loss": 0.0971, + "num_input_tokens_seen": 98870992, + "step": 45770 + }, + { + "epoch": 7.467373572593801, + "grad_norm": 0.004538760520517826, + "learning_rate": 0.0007891274543886933, + "loss": 0.053, + "num_input_tokens_seen": 98882160, + "step": 45775 + }, + { + "epoch": 7.468189233278956, + "grad_norm": 0.22801204025745392, + "learning_rate": 0.0007890693789629064, + "loss": 0.0405, + "num_input_tokens_seen": 98892880, + "step": 45780 + }, + { + "epoch": 7.469004893964111, + "grad_norm": 0.049073074012994766, + "learning_rate": 0.0007890112976787621, + "loss": 0.1171, + "num_input_tokens_seen": 98903984, + "step": 45785 + }, + { + "epoch": 7.4698205546492655, + "grad_norm": 0.023845715448260307, + "learning_rate": 0.0007889532105374373, + "loss": 0.0861, + "num_input_tokens_seen": 98914992, + "step": 45790 + }, + { + "epoch": 7.470636215334421, + "grad_norm": 0.0964706763625145, + "learning_rate": 0.0007888951175401089, + "loss": 0.0648, + "num_input_tokens_seen": 98924752, + "step": 45795 + }, + { + "epoch": 7.471451876019576, + "grad_norm": 0.040027473121881485, + "learning_rate": 0.0007888370186879545, + "loss": 0.0357, + "num_input_tokens_seen": 98935536, + "step": 45800 + }, + { + "epoch": 7.472267536704731, + "grad_norm": 0.019557340070605278, + "learning_rate": 0.0007887789139821516, + "loss": 0.0342, + "num_input_tokens_seen": 98946672, + "step": 45805 + }, + { + "epoch": 7.473083197389886, + "grad_norm": 0.06337641179561615, + "learning_rate": 0.0007887208034238777, + "loss": 0.0753, + "num_input_tokens_seen": 98956880, + "step": 45810 + }, + { + "epoch": 7.4738988580750405, + "grad_norm": 0.019905684515833855, + "learning_rate": 0.0007886626870143103, + "loss": 0.0212, + "num_input_tokens_seen": 98966192, + "step": 45815 + }, + { + "epoch": 7.474714518760196, + "grad_norm": 0.18070393800735474, + "learning_rate": 0.0007886045647546274, + "loss": 0.2547, + "num_input_tokens_seen": 98976528, + "step": 45820 + }, + { + "epoch": 7.475530179445351, + "grad_norm": 0.16771891713142395, + "learning_rate": 0.0007885464366460069, + "loss": 0.0834, + "num_input_tokens_seen": 98987152, + "step": 45825 + }, + { + "epoch": 7.476345840130506, + "grad_norm": 0.04264623671770096, + "learning_rate": 0.0007884883026896268, + "loss": 0.0263, + "num_input_tokens_seen": 98998992, + "step": 45830 + }, + { + "epoch": 7.477161500815661, + "grad_norm": 0.006334866862744093, + "learning_rate": 0.0007884301628866652, + "loss": 0.0243, + "num_input_tokens_seen": 99009328, + "step": 45835 + }, + { + "epoch": 7.4779771615008155, + "grad_norm": 0.005012247245758772, + "learning_rate": 0.0007883720172383007, + "loss": 0.0347, + "num_input_tokens_seen": 99019728, + "step": 45840 + }, + { + "epoch": 7.47879282218597, + "grad_norm": 0.18069060146808624, + "learning_rate": 0.0007883138657457111, + "loss": 0.2113, + "num_input_tokens_seen": 99031440, + "step": 45845 + }, + { + "epoch": 7.479608482871126, + "grad_norm": 0.18505054712295532, + "learning_rate": 0.0007882557084100755, + "loss": 0.1176, + "num_input_tokens_seen": 99042192, + "step": 45850 + }, + { + "epoch": 7.480424143556281, + "grad_norm": 0.26251259446144104, + "learning_rate": 0.0007881975452325722, + "loss": 0.0476, + "num_input_tokens_seen": 99053552, + "step": 45855 + }, + { + "epoch": 7.481239804241436, + "grad_norm": 0.04688497632741928, + "learning_rate": 0.00078813937621438, + "loss": 0.1765, + "num_input_tokens_seen": 99064208, + "step": 45860 + }, + { + "epoch": 7.4820554649265905, + "grad_norm": 0.20419980585575104, + "learning_rate": 0.000788081201356678, + "loss": 0.1365, + "num_input_tokens_seen": 99073680, + "step": 45865 + }, + { + "epoch": 7.482871125611745, + "grad_norm": 0.026826782152056694, + "learning_rate": 0.0007880230206606449, + "loss": 0.0253, + "num_input_tokens_seen": 99085360, + "step": 45870 + }, + { + "epoch": 7.4836867862969, + "grad_norm": 0.09688873589038849, + "learning_rate": 0.0007879648341274599, + "loss": 0.065, + "num_input_tokens_seen": 99096112, + "step": 45875 + }, + { + "epoch": 7.484502446982056, + "grad_norm": 0.13799415528774261, + "learning_rate": 0.0007879066417583021, + "loss": 0.0537, + "num_input_tokens_seen": 99106832, + "step": 45880 + }, + { + "epoch": 7.485318107667211, + "grad_norm": 0.05497647821903229, + "learning_rate": 0.0007878484435543511, + "loss": 0.0885, + "num_input_tokens_seen": 99118832, + "step": 45885 + }, + { + "epoch": 7.486133768352365, + "grad_norm": 0.3244342505931854, + "learning_rate": 0.0007877902395167862, + "loss": 0.226, + "num_input_tokens_seen": 99129776, + "step": 45890 + }, + { + "epoch": 7.48694942903752, + "grad_norm": 0.03948109596967697, + "learning_rate": 0.000787732029646787, + "loss": 0.1059, + "num_input_tokens_seen": 99141424, + "step": 45895 + }, + { + "epoch": 7.487765089722675, + "grad_norm": 0.02498193085193634, + "learning_rate": 0.0007876738139455332, + "loss": 0.019, + "num_input_tokens_seen": 99151888, + "step": 45900 + }, + { + "epoch": 7.488580750407831, + "grad_norm": 0.1953345686197281, + "learning_rate": 0.0007876155924142046, + "loss": 0.1031, + "num_input_tokens_seen": 99162128, + "step": 45905 + }, + { + "epoch": 7.489396411092986, + "grad_norm": 0.0279481690376997, + "learning_rate": 0.0007875573650539811, + "loss": 0.222, + "num_input_tokens_seen": 99172592, + "step": 45910 + }, + { + "epoch": 7.49021207177814, + "grad_norm": 0.03121795877814293, + "learning_rate": 0.0007874991318660429, + "loss": 0.0298, + "num_input_tokens_seen": 99183856, + "step": 45915 + }, + { + "epoch": 7.491027732463295, + "grad_norm": 0.016351960599422455, + "learning_rate": 0.0007874408928515702, + "loss": 0.0499, + "num_input_tokens_seen": 99195216, + "step": 45920 + }, + { + "epoch": 7.49184339314845, + "grad_norm": 0.006542586255818605, + "learning_rate": 0.000787382648011743, + "loss": 0.0124, + "num_input_tokens_seen": 99203984, + "step": 45925 + }, + { + "epoch": 7.492659053833605, + "grad_norm": 0.16917765140533447, + "learning_rate": 0.0007873243973477419, + "loss": 0.0249, + "num_input_tokens_seen": 99214448, + "step": 45930 + }, + { + "epoch": 7.493474714518761, + "grad_norm": 0.05392063409090042, + "learning_rate": 0.0007872661408607473, + "loss": 0.1237, + "num_input_tokens_seen": 99224976, + "step": 45935 + }, + { + "epoch": 7.494290375203915, + "grad_norm": 0.10604605078697205, + "learning_rate": 0.0007872078785519401, + "loss": 0.0346, + "num_input_tokens_seen": 99236944, + "step": 45940 + }, + { + "epoch": 7.49510603588907, + "grad_norm": 0.13532358407974243, + "learning_rate": 0.0007871496104225007, + "loss": 0.0768, + "num_input_tokens_seen": 99247984, + "step": 45945 + }, + { + "epoch": 7.495921696574225, + "grad_norm": 0.05853382125496864, + "learning_rate": 0.0007870913364736103, + "loss": 0.1162, + "num_input_tokens_seen": 99259344, + "step": 45950 + }, + { + "epoch": 7.49673735725938, + "grad_norm": 0.03759206831455231, + "learning_rate": 0.0007870330567064499, + "loss": 0.0555, + "num_input_tokens_seen": 99270576, + "step": 45955 + }, + { + "epoch": 7.497553017944535, + "grad_norm": 0.12168195843696594, + "learning_rate": 0.0007869747711222001, + "loss": 0.0328, + "num_input_tokens_seen": 99280848, + "step": 45960 + }, + { + "epoch": 7.49836867862969, + "grad_norm": 0.22778406739234924, + "learning_rate": 0.0007869164797220429, + "loss": 0.2144, + "num_input_tokens_seen": 99291536, + "step": 45965 + }, + { + "epoch": 7.499184339314845, + "grad_norm": 0.17280970513820648, + "learning_rate": 0.000786858182507159, + "loss": 0.0656, + "num_input_tokens_seen": 99302640, + "step": 45970 + }, + { + "epoch": 7.5, + "grad_norm": 0.03377150744199753, + "learning_rate": 0.0007867998794787303, + "loss": 0.0341, + "num_input_tokens_seen": 99314160, + "step": 45975 + }, + { + "epoch": 7.500815660685155, + "grad_norm": 0.06262348592281342, + "learning_rate": 0.0007867415706379381, + "loss": 0.0689, + "num_input_tokens_seen": 99325456, + "step": 45980 + }, + { + "epoch": 7.50163132137031, + "grad_norm": 0.005712342448532581, + "learning_rate": 0.0007866832559859642, + "loss": 0.1304, + "num_input_tokens_seen": 99336976, + "step": 45985 + }, + { + "epoch": 7.502446982055465, + "grad_norm": 0.3855852782726288, + "learning_rate": 0.0007866249355239905, + "loss": 0.1711, + "num_input_tokens_seen": 99347536, + "step": 45990 + }, + { + "epoch": 7.50326264274062, + "grad_norm": 0.005695751868188381, + "learning_rate": 0.0007865666092531989, + "loss": 0.0842, + "num_input_tokens_seen": 99359504, + "step": 45995 + }, + { + "epoch": 7.504078303425775, + "grad_norm": 0.0534052737057209, + "learning_rate": 0.0007865082771747713, + "loss": 0.0217, + "num_input_tokens_seen": 99370032, + "step": 46000 + }, + { + "epoch": 7.50489396411093, + "grad_norm": 0.038309045135974884, + "learning_rate": 0.00078644993928989, + "loss": 0.0391, + "num_input_tokens_seen": 99380816, + "step": 46005 + }, + { + "epoch": 7.505709624796085, + "grad_norm": 0.010843995958566666, + "learning_rate": 0.0007863915955997374, + "loss": 0.0524, + "num_input_tokens_seen": 99392592, + "step": 46010 + }, + { + "epoch": 7.506525285481239, + "grad_norm": 0.017481878399848938, + "learning_rate": 0.0007863332461054957, + "loss": 0.0117, + "num_input_tokens_seen": 99403760, + "step": 46015 + }, + { + "epoch": 7.507340946166395, + "grad_norm": 0.03930390253663063, + "learning_rate": 0.0007862748908083477, + "loss": 0.0803, + "num_input_tokens_seen": 99414768, + "step": 46020 + }, + { + "epoch": 7.50815660685155, + "grad_norm": 0.07113105058670044, + "learning_rate": 0.0007862165297094758, + "loss": 0.0497, + "num_input_tokens_seen": 99425040, + "step": 46025 + }, + { + "epoch": 7.508972267536705, + "grad_norm": 0.015223911963403225, + "learning_rate": 0.0007861581628100628, + "loss": 0.0549, + "num_input_tokens_seen": 99436016, + "step": 46030 + }, + { + "epoch": 7.50978792822186, + "grad_norm": 0.03532985597848892, + "learning_rate": 0.0007860997901112917, + "loss": 0.091, + "num_input_tokens_seen": 99448624, + "step": 46035 + }, + { + "epoch": 7.510603588907014, + "grad_norm": 0.013420658186078072, + "learning_rate": 0.0007860414116143453, + "loss": 0.0429, + "num_input_tokens_seen": 99458608, + "step": 46040 + }, + { + "epoch": 7.511419249592169, + "grad_norm": 0.06856658309698105, + "learning_rate": 0.0007859830273204069, + "loss": 0.1099, + "num_input_tokens_seen": 99469040, + "step": 46045 + }, + { + "epoch": 7.512234910277325, + "grad_norm": 0.05944625288248062, + "learning_rate": 0.0007859246372306595, + "loss": 0.0799, + "num_input_tokens_seen": 99479888, + "step": 46050 + }, + { + "epoch": 7.51305057096248, + "grad_norm": 0.3955201208591461, + "learning_rate": 0.0007858662413462867, + "loss": 0.1141, + "num_input_tokens_seen": 99489520, + "step": 46055 + }, + { + "epoch": 7.513866231647635, + "grad_norm": 0.2453528493642807, + "learning_rate": 0.000785807839668472, + "loss": 0.1028, + "num_input_tokens_seen": 99501136, + "step": 46060 + }, + { + "epoch": 7.514681892332789, + "grad_norm": 0.3181774616241455, + "learning_rate": 0.0007857494321983987, + "loss": 0.0933, + "num_input_tokens_seen": 99513040, + "step": 46065 + }, + { + "epoch": 7.515497553017944, + "grad_norm": 0.05358253791928291, + "learning_rate": 0.0007856910189372506, + "loss": 0.0786, + "num_input_tokens_seen": 99523760, + "step": 46070 + }, + { + "epoch": 7.5163132137031, + "grad_norm": 0.005161238834261894, + "learning_rate": 0.0007856325998862118, + "loss": 0.051, + "num_input_tokens_seen": 99533456, + "step": 46075 + }, + { + "epoch": 7.517128874388255, + "grad_norm": 0.02480381354689598, + "learning_rate": 0.0007855741750464658, + "loss": 0.1233, + "num_input_tokens_seen": 99544944, + "step": 46080 + }, + { + "epoch": 7.5179445350734095, + "grad_norm": 0.02038867212831974, + "learning_rate": 0.0007855157444191969, + "loss": 0.0735, + "num_input_tokens_seen": 99555152, + "step": 46085 + }, + { + "epoch": 7.518760195758564, + "grad_norm": 0.006652286276221275, + "learning_rate": 0.0007854573080055894, + "loss": 0.0325, + "num_input_tokens_seen": 99566256, + "step": 46090 + }, + { + "epoch": 7.519575856443719, + "grad_norm": 0.31422188878059387, + "learning_rate": 0.0007853988658068274, + "loss": 0.0718, + "num_input_tokens_seen": 99575952, + "step": 46095 + }, + { + "epoch": 7.520391517128875, + "grad_norm": 0.26823487877845764, + "learning_rate": 0.000785340417824095, + "loss": 0.1506, + "num_input_tokens_seen": 99588688, + "step": 46100 + }, + { + "epoch": 7.52120717781403, + "grad_norm": 0.04891170188784599, + "learning_rate": 0.0007852819640585773, + "loss": 0.0632, + "num_input_tokens_seen": 99599280, + "step": 46105 + }, + { + "epoch": 7.5220228384991845, + "grad_norm": 0.11184188723564148, + "learning_rate": 0.0007852235045114588, + "loss": 0.0441, + "num_input_tokens_seen": 99609776, + "step": 46110 + }, + { + "epoch": 7.522838499184339, + "grad_norm": 0.07456564158201218, + "learning_rate": 0.000785165039183924, + "loss": 0.2539, + "num_input_tokens_seen": 99620208, + "step": 46115 + }, + { + "epoch": 7.523654159869494, + "grad_norm": 0.2406882345676422, + "learning_rate": 0.0007851065680771581, + "loss": 0.1051, + "num_input_tokens_seen": 99631088, + "step": 46120 + }, + { + "epoch": 7.524469820554649, + "grad_norm": 0.058970607817173004, + "learning_rate": 0.0007850480911923457, + "loss": 0.1523, + "num_input_tokens_seen": 99643664, + "step": 46125 + }, + { + "epoch": 7.525285481239804, + "grad_norm": 0.3528447151184082, + "learning_rate": 0.0007849896085306723, + "loss": 0.0559, + "num_input_tokens_seen": 99652304, + "step": 46130 + }, + { + "epoch": 7.5261011419249595, + "grad_norm": 0.08873096853494644, + "learning_rate": 0.0007849311200933228, + "loss": 0.1321, + "num_input_tokens_seen": 99662416, + "step": 46135 + }, + { + "epoch": 7.526916802610114, + "grad_norm": 0.041165851056575775, + "learning_rate": 0.0007848726258814826, + "loss": 0.045, + "num_input_tokens_seen": 99672400, + "step": 46140 + }, + { + "epoch": 7.527732463295269, + "grad_norm": 0.005180297419428825, + "learning_rate": 0.0007848141258963375, + "loss": 0.0275, + "num_input_tokens_seen": 99683216, + "step": 46145 + }, + { + "epoch": 7.528548123980424, + "grad_norm": 0.18040695786476135, + "learning_rate": 0.0007847556201390727, + "loss": 0.0728, + "num_input_tokens_seen": 99695056, + "step": 46150 + }, + { + "epoch": 7.529363784665579, + "grad_norm": 0.09594681859016418, + "learning_rate": 0.0007846971086108741, + "loss": 0.0779, + "num_input_tokens_seen": 99704816, + "step": 46155 + }, + { + "epoch": 7.5301794453507345, + "grad_norm": 0.0594286173582077, + "learning_rate": 0.0007846385913129273, + "loss": 0.0263, + "num_input_tokens_seen": 99714704, + "step": 46160 + }, + { + "epoch": 7.530995106035889, + "grad_norm": 0.32255420088768005, + "learning_rate": 0.0007845800682464185, + "loss": 0.0777, + "num_input_tokens_seen": 99726000, + "step": 46165 + }, + { + "epoch": 7.531810766721044, + "grad_norm": 0.3754183351993561, + "learning_rate": 0.0007845215394125336, + "loss": 0.1829, + "num_input_tokens_seen": 99736688, + "step": 46170 + }, + { + "epoch": 7.532626427406199, + "grad_norm": 0.25886958837509155, + "learning_rate": 0.0007844630048124586, + "loss": 0.1646, + "num_input_tokens_seen": 99748400, + "step": 46175 + }, + { + "epoch": 7.533442088091354, + "grad_norm": 0.38430964946746826, + "learning_rate": 0.00078440446444738, + "loss": 0.1293, + "num_input_tokens_seen": 99759344, + "step": 46180 + }, + { + "epoch": 7.5342577487765094, + "grad_norm": 0.02618015743792057, + "learning_rate": 0.0007843459183184843, + "loss": 0.0407, + "num_input_tokens_seen": 99769616, + "step": 46185 + }, + { + "epoch": 7.535073409461664, + "grad_norm": 0.23281855881214142, + "learning_rate": 0.0007842873664269576, + "loss": 0.0932, + "num_input_tokens_seen": 99780656, + "step": 46190 + }, + { + "epoch": 7.535889070146819, + "grad_norm": 0.265331894159317, + "learning_rate": 0.0007842288087739868, + "loss": 0.0768, + "num_input_tokens_seen": 99790544, + "step": 46195 + }, + { + "epoch": 7.536704730831974, + "grad_norm": 0.2676144242286682, + "learning_rate": 0.0007841702453607589, + "loss": 0.1303, + "num_input_tokens_seen": 99801616, + "step": 46200 + }, + { + "epoch": 7.537520391517129, + "grad_norm": 0.2378363013267517, + "learning_rate": 0.0007841116761884601, + "loss": 0.1382, + "num_input_tokens_seen": 99812848, + "step": 46205 + }, + { + "epoch": 7.5383360522022835, + "grad_norm": 0.02555975876748562, + "learning_rate": 0.000784053101258278, + "loss": 0.0295, + "num_input_tokens_seen": 99823152, + "step": 46210 + }, + { + "epoch": 7.539151712887438, + "grad_norm": 0.01895485259592533, + "learning_rate": 0.0007839945205713995, + "loss": 0.0686, + "num_input_tokens_seen": 99834160, + "step": 46215 + }, + { + "epoch": 7.539967373572594, + "grad_norm": 0.021273165941238403, + "learning_rate": 0.0007839359341290116, + "loss": 0.0391, + "num_input_tokens_seen": 99844784, + "step": 46220 + }, + { + "epoch": 7.540783034257749, + "grad_norm": 0.1254192292690277, + "learning_rate": 0.0007838773419323019, + "loss": 0.0602, + "num_input_tokens_seen": 99855536, + "step": 46225 + }, + { + "epoch": 7.541598694942904, + "grad_norm": 0.17011789977550507, + "learning_rate": 0.0007838187439824577, + "loss": 0.0475, + "num_input_tokens_seen": 99866800, + "step": 46230 + }, + { + "epoch": 7.5424143556280585, + "grad_norm": 0.272366464138031, + "learning_rate": 0.0007837601402806666, + "loss": 0.257, + "num_input_tokens_seen": 99877520, + "step": 46235 + }, + { + "epoch": 7.543230016313213, + "grad_norm": 0.4665481448173523, + "learning_rate": 0.0007837015308281163, + "loss": 0.1388, + "num_input_tokens_seen": 99887632, + "step": 46240 + }, + { + "epoch": 7.544045676998369, + "grad_norm": 0.007717117201536894, + "learning_rate": 0.0007836429156259946, + "loss": 0.0402, + "num_input_tokens_seen": 99899664, + "step": 46245 + }, + { + "epoch": 7.544861337683524, + "grad_norm": 0.3290517330169678, + "learning_rate": 0.0007835842946754893, + "loss": 0.0653, + "num_input_tokens_seen": 99910928, + "step": 46250 + }, + { + "epoch": 7.545676998368679, + "grad_norm": 0.18104241788387299, + "learning_rate": 0.0007835256679777887, + "loss": 0.0232, + "num_input_tokens_seen": 99922128, + "step": 46255 + }, + { + "epoch": 7.5464926590538335, + "grad_norm": 0.2232947051525116, + "learning_rate": 0.0007834670355340805, + "loss": 0.0346, + "num_input_tokens_seen": 99932112, + "step": 46260 + }, + { + "epoch": 7.547308319738988, + "grad_norm": 0.04595812410116196, + "learning_rate": 0.0007834083973455535, + "loss": 0.0139, + "num_input_tokens_seen": 99942096, + "step": 46265 + }, + { + "epoch": 7.548123980424144, + "grad_norm": 0.04887615144252777, + "learning_rate": 0.0007833497534133955, + "loss": 0.0205, + "num_input_tokens_seen": 99953712, + "step": 46270 + }, + { + "epoch": 7.548939641109299, + "grad_norm": 0.03450680151581764, + "learning_rate": 0.0007832911037387955, + "loss": 0.1711, + "num_input_tokens_seen": 99965904, + "step": 46275 + }, + { + "epoch": 7.549755301794454, + "grad_norm": 0.1929694563150406, + "learning_rate": 0.000783232448322942, + "loss": 0.044, + "num_input_tokens_seen": 99977424, + "step": 46280 + }, + { + "epoch": 7.5505709624796085, + "grad_norm": 0.12964440882205963, + "learning_rate": 0.0007831737871670235, + "loss": 0.0528, + "num_input_tokens_seen": 99989072, + "step": 46285 + }, + { + "epoch": 7.551386623164763, + "grad_norm": 0.009152422659099102, + "learning_rate": 0.0007831151202722288, + "loss": 0.0278, + "num_input_tokens_seen": 99998992, + "step": 46290 + }, + { + "epoch": 7.552202283849918, + "grad_norm": 0.01057891920208931, + "learning_rate": 0.0007830564476397473, + "loss": 0.2981, + "num_input_tokens_seen": 100008976, + "step": 46295 + }, + { + "epoch": 7.553017944535073, + "grad_norm": 0.09518894553184509, + "learning_rate": 0.0007829977692707676, + "loss": 0.0199, + "num_input_tokens_seen": 100018672, + "step": 46300 + }, + { + "epoch": 7.553833605220229, + "grad_norm": 0.24947939813137054, + "learning_rate": 0.0007829390851664793, + "loss": 0.2036, + "num_input_tokens_seen": 100029904, + "step": 46305 + }, + { + "epoch": 7.554649265905383, + "grad_norm": 0.14291471242904663, + "learning_rate": 0.0007828803953280713, + "loss": 0.0521, + "num_input_tokens_seen": 100040304, + "step": 46310 + }, + { + "epoch": 7.555464926590538, + "grad_norm": 0.04056019335985184, + "learning_rate": 0.0007828216997567333, + "loss": 0.0403, + "num_input_tokens_seen": 100050544, + "step": 46315 + }, + { + "epoch": 7.556280587275693, + "grad_norm": 0.027434013783931732, + "learning_rate": 0.0007827629984536548, + "loss": 0.0211, + "num_input_tokens_seen": 100060496, + "step": 46320 + }, + { + "epoch": 7.557096247960848, + "grad_norm": 0.014924759976565838, + "learning_rate": 0.0007827042914200254, + "loss": 0.057, + "num_input_tokens_seen": 100071568, + "step": 46325 + }, + { + "epoch": 7.557911908646004, + "grad_norm": 0.24778462946414948, + "learning_rate": 0.000782645578657035, + "loss": 0.2794, + "num_input_tokens_seen": 100081840, + "step": 46330 + }, + { + "epoch": 7.558727569331158, + "grad_norm": 0.1591310203075409, + "learning_rate": 0.0007825868601658733, + "loss": 0.0877, + "num_input_tokens_seen": 100091472, + "step": 46335 + }, + { + "epoch": 7.559543230016313, + "grad_norm": 0.13435420393943787, + "learning_rate": 0.0007825281359477303, + "loss": 0.0849, + "num_input_tokens_seen": 100102448, + "step": 46340 + }, + { + "epoch": 7.560358890701468, + "grad_norm": 0.029477981850504875, + "learning_rate": 0.0007824694060037964, + "loss": 0.0293, + "num_input_tokens_seen": 100113360, + "step": 46345 + }, + { + "epoch": 7.561174551386623, + "grad_norm": 0.10839947313070297, + "learning_rate": 0.0007824106703352616, + "loss": 0.0268, + "num_input_tokens_seen": 100124912, + "step": 46350 + }, + { + "epoch": 7.561990212071779, + "grad_norm": 0.2539820075035095, + "learning_rate": 0.0007823519289433162, + "loss": 0.0822, + "num_input_tokens_seen": 100134512, + "step": 46355 + }, + { + "epoch": 7.562805872756933, + "grad_norm": 0.29448628425598145, + "learning_rate": 0.0007822931818291508, + "loss": 0.0657, + "num_input_tokens_seen": 100146160, + "step": 46360 + }, + { + "epoch": 7.563621533442088, + "grad_norm": 0.007594697643071413, + "learning_rate": 0.0007822344289939561, + "loss": 0.0502, + "num_input_tokens_seen": 100157264, + "step": 46365 + }, + { + "epoch": 7.564437194127243, + "grad_norm": 0.045168206095695496, + "learning_rate": 0.0007821756704389224, + "loss": 0.1212, + "num_input_tokens_seen": 100168400, + "step": 46370 + }, + { + "epoch": 7.565252854812398, + "grad_norm": 0.4243335723876953, + "learning_rate": 0.000782116906165241, + "loss": 0.1991, + "num_input_tokens_seen": 100179632, + "step": 46375 + }, + { + "epoch": 7.566068515497553, + "grad_norm": 0.08852332830429077, + "learning_rate": 0.0007820581361741025, + "loss": 0.0195, + "num_input_tokens_seen": 100191600, + "step": 46380 + }, + { + "epoch": 7.566884176182708, + "grad_norm": 0.07176997512578964, + "learning_rate": 0.0007819993604666982, + "loss": 0.2055, + "num_input_tokens_seen": 100202736, + "step": 46385 + }, + { + "epoch": 7.567699836867863, + "grad_norm": 0.09689157456159592, + "learning_rate": 0.0007819405790442189, + "loss": 0.0999, + "num_input_tokens_seen": 100213104, + "step": 46390 + }, + { + "epoch": 7.568515497553018, + "grad_norm": 0.002046206733211875, + "learning_rate": 0.0007818817919078562, + "loss": 0.1328, + "num_input_tokens_seen": 100223408, + "step": 46395 + }, + { + "epoch": 7.569331158238173, + "grad_norm": 0.005216080229729414, + "learning_rate": 0.0007818229990588013, + "loss": 0.2068, + "num_input_tokens_seen": 100233872, + "step": 46400 + }, + { + "epoch": 7.570146818923328, + "grad_norm": 0.0509600006043911, + "learning_rate": 0.000781764200498246, + "loss": 0.1134, + "num_input_tokens_seen": 100244528, + "step": 46405 + }, + { + "epoch": 7.5709624796084825, + "grad_norm": 0.061129264533519745, + "learning_rate": 0.0007817053962273817, + "loss": 0.0379, + "num_input_tokens_seen": 100255344, + "step": 46410 + }, + { + "epoch": 7.571778140293638, + "grad_norm": 0.08482226729393005, + "learning_rate": 0.0007816465862474, + "loss": 0.1293, + "num_input_tokens_seen": 100266448, + "step": 46415 + }, + { + "epoch": 7.572593800978793, + "grad_norm": 0.007290207780897617, + "learning_rate": 0.000781587770559493, + "loss": 0.0334, + "num_input_tokens_seen": 100277584, + "step": 46420 + }, + { + "epoch": 7.573409461663948, + "grad_norm": 0.00973653793334961, + "learning_rate": 0.0007815289491648527, + "loss": 0.039, + "num_input_tokens_seen": 100290160, + "step": 46425 + }, + { + "epoch": 7.574225122349103, + "grad_norm": 0.0023570421617478132, + "learning_rate": 0.000781470122064671, + "loss": 0.0381, + "num_input_tokens_seen": 100300784, + "step": 46430 + }, + { + "epoch": 7.575040783034257, + "grad_norm": 0.001977626234292984, + "learning_rate": 0.0007814112892601403, + "loss": 0.1092, + "num_input_tokens_seen": 100311344, + "step": 46435 + }, + { + "epoch": 7.575856443719413, + "grad_norm": 0.0060659232549369335, + "learning_rate": 0.0007813524507524527, + "loss": 0.0216, + "num_input_tokens_seen": 100322608, + "step": 46440 + }, + { + "epoch": 7.576672104404568, + "grad_norm": 0.060652635991573334, + "learning_rate": 0.0007812936065428009, + "loss": 0.0623, + "num_input_tokens_seen": 100332912, + "step": 46445 + }, + { + "epoch": 7.577487765089723, + "grad_norm": 0.0259055495262146, + "learning_rate": 0.0007812347566323774, + "loss": 0.1047, + "num_input_tokens_seen": 100344144, + "step": 46450 + }, + { + "epoch": 7.578303425774878, + "grad_norm": 0.02046637050807476, + "learning_rate": 0.0007811759010223747, + "loss": 0.0468, + "num_input_tokens_seen": 100355856, + "step": 46455 + }, + { + "epoch": 7.579119086460032, + "grad_norm": 0.18777872622013092, + "learning_rate": 0.0007811170397139855, + "loss": 0.056, + "num_input_tokens_seen": 100366960, + "step": 46460 + }, + { + "epoch": 7.579934747145187, + "grad_norm": 0.10581226646900177, + "learning_rate": 0.000781058172708403, + "loss": 0.0651, + "num_input_tokens_seen": 100377584, + "step": 46465 + }, + { + "epoch": 7.580750407830343, + "grad_norm": 0.22157377004623413, + "learning_rate": 0.00078099930000682, + "loss": 0.0466, + "num_input_tokens_seen": 100389424, + "step": 46470 + }, + { + "epoch": 7.581566068515498, + "grad_norm": 0.39631515741348267, + "learning_rate": 0.0007809404216104299, + "loss": 0.1734, + "num_input_tokens_seen": 100400688, + "step": 46475 + }, + { + "epoch": 7.582381729200653, + "grad_norm": 0.09392768889665604, + "learning_rate": 0.0007808815375204257, + "loss": 0.1044, + "num_input_tokens_seen": 100410448, + "step": 46480 + }, + { + "epoch": 7.583197389885807, + "grad_norm": 0.044808097183704376, + "learning_rate": 0.0007808226477380007, + "loss": 0.0971, + "num_input_tokens_seen": 100421232, + "step": 46485 + }, + { + "epoch": 7.584013050570962, + "grad_norm": 0.009121015667915344, + "learning_rate": 0.0007807637522643484, + "loss": 0.0537, + "num_input_tokens_seen": 100431312, + "step": 46490 + }, + { + "epoch": 7.584828711256117, + "grad_norm": 0.08114711195230484, + "learning_rate": 0.0007807048511006628, + "loss": 0.1994, + "num_input_tokens_seen": 100441936, + "step": 46495 + }, + { + "epoch": 7.585644371941273, + "grad_norm": 0.20501329004764557, + "learning_rate": 0.0007806459442481372, + "loss": 0.1497, + "num_input_tokens_seen": 100453488, + "step": 46500 + }, + { + "epoch": 7.5864600326264275, + "grad_norm": 0.2496049851179123, + "learning_rate": 0.0007805870317079654, + "loss": 0.1331, + "num_input_tokens_seen": 100466224, + "step": 46505 + }, + { + "epoch": 7.587275693311582, + "grad_norm": 0.04062649607658386, + "learning_rate": 0.0007805281134813416, + "loss": 0.0313, + "num_input_tokens_seen": 100476560, + "step": 46510 + }, + { + "epoch": 7.588091353996737, + "grad_norm": 0.266244500875473, + "learning_rate": 0.0007804691895694595, + "loss": 0.0772, + "num_input_tokens_seen": 100487184, + "step": 46515 + }, + { + "epoch": 7.588907014681892, + "grad_norm": 0.010720369406044483, + "learning_rate": 0.0007804102599735137, + "loss": 0.0226, + "num_input_tokens_seen": 100498608, + "step": 46520 + }, + { + "epoch": 7.589722675367048, + "grad_norm": 0.009115898050367832, + "learning_rate": 0.0007803513246946981, + "loss": 0.0456, + "num_input_tokens_seen": 100509104, + "step": 46525 + }, + { + "epoch": 7.5905383360522025, + "grad_norm": 0.019223831593990326, + "learning_rate": 0.0007802923837342072, + "loss": 0.1405, + "num_input_tokens_seen": 100520368, + "step": 46530 + }, + { + "epoch": 7.591353996737357, + "grad_norm": 0.04066868871450424, + "learning_rate": 0.0007802334370932357, + "loss": 0.0156, + "num_input_tokens_seen": 100530928, + "step": 46535 + }, + { + "epoch": 7.592169657422512, + "grad_norm": 0.10982602834701538, + "learning_rate": 0.0007801744847729781, + "loss": 0.0714, + "num_input_tokens_seen": 100542960, + "step": 46540 + }, + { + "epoch": 7.592985318107667, + "grad_norm": 0.006750395521521568, + "learning_rate": 0.0007801155267746291, + "loss": 0.0703, + "num_input_tokens_seen": 100553232, + "step": 46545 + }, + { + "epoch": 7.593800978792823, + "grad_norm": 0.008056842721998692, + "learning_rate": 0.0007800565630993834, + "loss": 0.0737, + "num_input_tokens_seen": 100565552, + "step": 46550 + }, + { + "epoch": 7.5946166394779775, + "grad_norm": 0.07503590732812881, + "learning_rate": 0.0007799975937484365, + "loss": 0.0222, + "num_input_tokens_seen": 100576592, + "step": 46555 + }, + { + "epoch": 7.595432300163132, + "grad_norm": 0.011532962322235107, + "learning_rate": 0.000779938618722983, + "loss": 0.0278, + "num_input_tokens_seen": 100586512, + "step": 46560 + }, + { + "epoch": 7.596247960848287, + "grad_norm": 0.3396737575531006, + "learning_rate": 0.0007798796380242183, + "loss": 0.2314, + "num_input_tokens_seen": 100597584, + "step": 46565 + }, + { + "epoch": 7.597063621533442, + "grad_norm": 0.06101497262716293, + "learning_rate": 0.0007798206516533377, + "loss": 0.0452, + "num_input_tokens_seen": 100609168, + "step": 46570 + }, + { + "epoch": 7.597879282218597, + "grad_norm": 0.2853509187698364, + "learning_rate": 0.0007797616596115365, + "loss": 0.0593, + "num_input_tokens_seen": 100620368, + "step": 46575 + }, + { + "epoch": 7.598694942903752, + "grad_norm": 0.08300846070051193, + "learning_rate": 0.0007797026619000105, + "loss": 0.0298, + "num_input_tokens_seen": 100630160, + "step": 46580 + }, + { + "epoch": 7.599510603588907, + "grad_norm": 0.12231241166591644, + "learning_rate": 0.0007796436585199553, + "loss": 0.0236, + "num_input_tokens_seen": 100641872, + "step": 46585 + }, + { + "epoch": 7.600326264274062, + "grad_norm": 0.46338924765586853, + "learning_rate": 0.0007795846494725665, + "loss": 0.2078, + "num_input_tokens_seen": 100653168, + "step": 46590 + }, + { + "epoch": 7.601141924959217, + "grad_norm": 0.11617031693458557, + "learning_rate": 0.00077952563475904, + "loss": 0.056, + "num_input_tokens_seen": 100664144, + "step": 46595 + }, + { + "epoch": 7.601957585644372, + "grad_norm": 0.1972285658121109, + "learning_rate": 0.000779466614380572, + "loss": 0.1623, + "num_input_tokens_seen": 100674640, + "step": 46600 + }, + { + "epoch": 7.602773246329527, + "grad_norm": 0.00785736832767725, + "learning_rate": 0.0007794075883383586, + "loss": 0.0509, + "num_input_tokens_seen": 100685040, + "step": 46605 + }, + { + "epoch": 7.603588907014682, + "grad_norm": 0.09408126026391983, + "learning_rate": 0.0007793485566335958, + "loss": 0.1419, + "num_input_tokens_seen": 100695408, + "step": 46610 + }, + { + "epoch": 7.604404567699837, + "grad_norm": 0.08680996298789978, + "learning_rate": 0.0007792895192674802, + "loss": 0.0433, + "num_input_tokens_seen": 100705840, + "step": 46615 + }, + { + "epoch": 7.605220228384992, + "grad_norm": 0.37143993377685547, + "learning_rate": 0.0007792304762412084, + "loss": 0.1415, + "num_input_tokens_seen": 100716944, + "step": 46620 + }, + { + "epoch": 7.606035889070147, + "grad_norm": 0.0823369175195694, + "learning_rate": 0.0007791714275559765, + "loss": 0.032, + "num_input_tokens_seen": 100729072, + "step": 46625 + }, + { + "epoch": 7.6068515497553015, + "grad_norm": 0.01576918736100197, + "learning_rate": 0.0007791123732129815, + "loss": 0.1349, + "num_input_tokens_seen": 100739952, + "step": 46630 + }, + { + "epoch": 7.607667210440457, + "grad_norm": 1.0202510356903076, + "learning_rate": 0.0007790533132134201, + "loss": 0.0627, + "num_input_tokens_seen": 100751344, + "step": 46635 + }, + { + "epoch": 7.608482871125612, + "grad_norm": 0.010519228875637054, + "learning_rate": 0.0007789942475584894, + "loss": 0.0128, + "num_input_tokens_seen": 100761712, + "step": 46640 + }, + { + "epoch": 7.609298531810767, + "grad_norm": 0.07782994210720062, + "learning_rate": 0.0007789351762493865, + "loss": 0.086, + "num_input_tokens_seen": 100773040, + "step": 46645 + }, + { + "epoch": 7.610114192495922, + "grad_norm": 0.09352076053619385, + "learning_rate": 0.0007788760992873083, + "loss": 0.0475, + "num_input_tokens_seen": 100782448, + "step": 46650 + }, + { + "epoch": 7.6109298531810765, + "grad_norm": 0.16804049909114838, + "learning_rate": 0.000778817016673452, + "loss": 0.0503, + "num_input_tokens_seen": 100792720, + "step": 46655 + }, + { + "epoch": 7.611745513866231, + "grad_norm": 0.05742299184203148, + "learning_rate": 0.0007787579284090154, + "loss": 0.1065, + "num_input_tokens_seen": 100804400, + "step": 46660 + }, + { + "epoch": 7.612561174551386, + "grad_norm": 0.002605182584375143, + "learning_rate": 0.0007786988344951956, + "loss": 0.0223, + "num_input_tokens_seen": 100814704, + "step": 46665 + }, + { + "epoch": 7.613376835236542, + "grad_norm": 0.2340633124113083, + "learning_rate": 0.0007786397349331904, + "loss": 0.0414, + "num_input_tokens_seen": 100823632, + "step": 46670 + }, + { + "epoch": 7.614192495921697, + "grad_norm": 0.011929473839700222, + "learning_rate": 0.0007785806297241976, + "loss": 0.0075, + "num_input_tokens_seen": 100834544, + "step": 46675 + }, + { + "epoch": 7.6150081566068515, + "grad_norm": 0.1309245228767395, + "learning_rate": 0.0007785215188694148, + "loss": 0.0354, + "num_input_tokens_seen": 100846864, + "step": 46680 + }, + { + "epoch": 7.615823817292006, + "grad_norm": 0.0732983872294426, + "learning_rate": 0.0007784624023700402, + "loss": 0.0269, + "num_input_tokens_seen": 100858480, + "step": 46685 + }, + { + "epoch": 7.616639477977161, + "grad_norm": 0.2625514566898346, + "learning_rate": 0.0007784032802272716, + "loss": 0.2307, + "num_input_tokens_seen": 100869584, + "step": 46690 + }, + { + "epoch": 7.617455138662317, + "grad_norm": 0.1511547863483429, + "learning_rate": 0.0007783441524423074, + "loss": 0.1718, + "num_input_tokens_seen": 100879248, + "step": 46695 + }, + { + "epoch": 7.618270799347472, + "grad_norm": 0.05024776607751846, + "learning_rate": 0.0007782850190163459, + "loss": 0.0707, + "num_input_tokens_seen": 100890288, + "step": 46700 + }, + { + "epoch": 7.6190864600326265, + "grad_norm": 0.2799839377403259, + "learning_rate": 0.0007782258799505855, + "loss": 0.0376, + "num_input_tokens_seen": 100901616, + "step": 46705 + }, + { + "epoch": 7.619902120717781, + "grad_norm": 0.08367808163166046, + "learning_rate": 0.0007781667352462245, + "loss": 0.2048, + "num_input_tokens_seen": 100912496, + "step": 46710 + }, + { + "epoch": 7.620717781402936, + "grad_norm": 0.07492048293352127, + "learning_rate": 0.0007781075849044619, + "loss": 0.0655, + "num_input_tokens_seen": 100924656, + "step": 46715 + }, + { + "epoch": 7.621533442088092, + "grad_norm": 0.36795780062675476, + "learning_rate": 0.0007780484289264961, + "loss": 0.0932, + "num_input_tokens_seen": 100936176, + "step": 46720 + }, + { + "epoch": 7.622349102773247, + "grad_norm": 0.14871415495872498, + "learning_rate": 0.0007779892673135264, + "loss": 0.1147, + "num_input_tokens_seen": 100947312, + "step": 46725 + }, + { + "epoch": 7.623164763458401, + "grad_norm": 0.03786802291870117, + "learning_rate": 0.0007779301000667516, + "loss": 0.0893, + "num_input_tokens_seen": 100957232, + "step": 46730 + }, + { + "epoch": 7.623980424143556, + "grad_norm": 0.20446109771728516, + "learning_rate": 0.0007778709271873706, + "loss": 0.1756, + "num_input_tokens_seen": 100968592, + "step": 46735 + }, + { + "epoch": 7.624796084828711, + "grad_norm": 0.030702682211995125, + "learning_rate": 0.0007778117486765825, + "loss": 0.1157, + "num_input_tokens_seen": 100979024, + "step": 46740 + }, + { + "epoch": 7.625611745513866, + "grad_norm": 0.02638734132051468, + "learning_rate": 0.0007777525645355872, + "loss": 0.0571, + "num_input_tokens_seen": 100989712, + "step": 46745 + }, + { + "epoch": 7.626427406199021, + "grad_norm": 0.004499876406043768, + "learning_rate": 0.0007776933747655838, + "loss": 0.1112, + "num_input_tokens_seen": 101000784, + "step": 46750 + }, + { + "epoch": 7.627243066884176, + "grad_norm": 0.056049101054668427, + "learning_rate": 0.0007776341793677719, + "loss": 0.0751, + "num_input_tokens_seen": 101011600, + "step": 46755 + }, + { + "epoch": 7.628058727569331, + "grad_norm": 0.14655882120132446, + "learning_rate": 0.000777574978343351, + "loss": 0.0671, + "num_input_tokens_seen": 101022160, + "step": 46760 + }, + { + "epoch": 7.628874388254486, + "grad_norm": 0.04394836723804474, + "learning_rate": 0.000777515771693521, + "loss": 0.0459, + "num_input_tokens_seen": 101034192, + "step": 46765 + }, + { + "epoch": 7.629690048939641, + "grad_norm": 0.01639639027416706, + "learning_rate": 0.0007774565594194821, + "loss": 0.0374, + "num_input_tokens_seen": 101044592, + "step": 46770 + }, + { + "epoch": 7.630505709624796, + "grad_norm": 0.14619885385036469, + "learning_rate": 0.0007773973415224339, + "loss": 0.0513, + "num_input_tokens_seen": 101054640, + "step": 46775 + }, + { + "epoch": 7.631321370309951, + "grad_norm": 0.017432374879717827, + "learning_rate": 0.0007773381180035766, + "loss": 0.0684, + "num_input_tokens_seen": 101066160, + "step": 46780 + }, + { + "epoch": 7.632137030995106, + "grad_norm": 0.15743707120418549, + "learning_rate": 0.0007772788888641107, + "loss": 0.1699, + "num_input_tokens_seen": 101078000, + "step": 46785 + }, + { + "epoch": 7.632952691680261, + "grad_norm": 0.1914764940738678, + "learning_rate": 0.0007772196541052361, + "loss": 0.1399, + "num_input_tokens_seen": 101088272, + "step": 46790 + }, + { + "epoch": 7.633768352365416, + "grad_norm": 0.05888279527425766, + "learning_rate": 0.0007771604137281538, + "loss": 0.08, + "num_input_tokens_seen": 101098384, + "step": 46795 + }, + { + "epoch": 7.634584013050571, + "grad_norm": 0.005790786352008581, + "learning_rate": 0.0007771011677340639, + "loss": 0.0977, + "num_input_tokens_seen": 101110064, + "step": 46800 + }, + { + "epoch": 7.635399673735726, + "grad_norm": 0.13105180859565735, + "learning_rate": 0.0007770419161241675, + "loss": 0.0936, + "num_input_tokens_seen": 101121200, + "step": 46805 + }, + { + "epoch": 7.636215334420881, + "grad_norm": 0.06090496852993965, + "learning_rate": 0.0007769826588996651, + "loss": 0.0136, + "num_input_tokens_seen": 101132208, + "step": 46810 + }, + { + "epoch": 7.637030995106036, + "grad_norm": 0.023098204284906387, + "learning_rate": 0.0007769233960617576, + "loss": 0.0248, + "num_input_tokens_seen": 101141744, + "step": 46815 + }, + { + "epoch": 7.637846655791191, + "grad_norm": 0.03087800368666649, + "learning_rate": 0.0007768641276116465, + "loss": 0.0233, + "num_input_tokens_seen": 101151728, + "step": 46820 + }, + { + "epoch": 7.638662316476346, + "grad_norm": 0.31353387236595154, + "learning_rate": 0.0007768048535505324, + "loss": 0.116, + "num_input_tokens_seen": 101162160, + "step": 46825 + }, + { + "epoch": 7.6394779771615005, + "grad_norm": 0.27113598585128784, + "learning_rate": 0.0007767455738796169, + "loss": 0.0587, + "num_input_tokens_seen": 101172816, + "step": 46830 + }, + { + "epoch": 7.640293637846656, + "grad_norm": 0.14102505147457123, + "learning_rate": 0.0007766862886001011, + "loss": 0.0897, + "num_input_tokens_seen": 101183344, + "step": 46835 + }, + { + "epoch": 7.641109298531811, + "grad_norm": 0.028963766992092133, + "learning_rate": 0.0007766269977131868, + "loss": 0.0281, + "num_input_tokens_seen": 101193680, + "step": 46840 + }, + { + "epoch": 7.641924959216966, + "grad_norm": 0.007863939739763737, + "learning_rate": 0.0007765677012200753, + "loss": 0.0303, + "num_input_tokens_seen": 101205808, + "step": 46845 + }, + { + "epoch": 7.642740619902121, + "grad_norm": 0.11236032843589783, + "learning_rate": 0.0007765083991219688, + "loss": 0.0624, + "num_input_tokens_seen": 101216400, + "step": 46850 + }, + { + "epoch": 7.643556280587275, + "grad_norm": 0.20675627887248993, + "learning_rate": 0.0007764490914200686, + "loss": 0.0503, + "num_input_tokens_seen": 101226480, + "step": 46855 + }, + { + "epoch": 7.64437194127243, + "grad_norm": 0.03227461874485016, + "learning_rate": 0.0007763897781155769, + "loss": 0.0144, + "num_input_tokens_seen": 101235952, + "step": 46860 + }, + { + "epoch": 7.645187601957586, + "grad_norm": 0.07756864279508591, + "learning_rate": 0.0007763304592096956, + "loss": 0.0173, + "num_input_tokens_seen": 101247280, + "step": 46865 + }, + { + "epoch": 7.646003262642741, + "grad_norm": 0.23163361847400665, + "learning_rate": 0.0007762711347036273, + "loss": 0.1045, + "num_input_tokens_seen": 101258224, + "step": 46870 + }, + { + "epoch": 7.646818923327896, + "grad_norm": 0.01285717636346817, + "learning_rate": 0.0007762118045985738, + "loss": 0.0289, + "num_input_tokens_seen": 101268688, + "step": 46875 + }, + { + "epoch": 7.64763458401305, + "grad_norm": 0.019179528579115868, + "learning_rate": 0.0007761524688957377, + "loss": 0.0728, + "num_input_tokens_seen": 101279568, + "step": 46880 + }, + { + "epoch": 7.648450244698205, + "grad_norm": 0.03536270931363106, + "learning_rate": 0.0007760931275963215, + "loss": 0.0187, + "num_input_tokens_seen": 101291568, + "step": 46885 + }, + { + "epoch": 7.649265905383361, + "grad_norm": 0.021652111783623695, + "learning_rate": 0.0007760337807015276, + "loss": 0.0341, + "num_input_tokens_seen": 101302320, + "step": 46890 + }, + { + "epoch": 7.650081566068516, + "grad_norm": 0.007089455612003803, + "learning_rate": 0.0007759744282125593, + "loss": 0.0512, + "num_input_tokens_seen": 101313328, + "step": 46895 + }, + { + "epoch": 7.650897226753671, + "grad_norm": 0.06724183261394501, + "learning_rate": 0.000775915070130619, + "loss": 0.1505, + "num_input_tokens_seen": 101323888, + "step": 46900 + }, + { + "epoch": 7.651712887438825, + "grad_norm": 0.02761230431497097, + "learning_rate": 0.0007758557064569096, + "loss": 0.0983, + "num_input_tokens_seen": 101334416, + "step": 46905 + }, + { + "epoch": 7.65252854812398, + "grad_norm": 0.049123216420412064, + "learning_rate": 0.0007757963371926346, + "loss": 0.0224, + "num_input_tokens_seen": 101345552, + "step": 46910 + }, + { + "epoch": 7.653344208809135, + "grad_norm": 0.03470964357256889, + "learning_rate": 0.000775736962338997, + "loss": 0.1807, + "num_input_tokens_seen": 101357168, + "step": 46915 + }, + { + "epoch": 7.654159869494291, + "grad_norm": 0.25555697083473206, + "learning_rate": 0.0007756775818971998, + "loss": 0.0535, + "num_input_tokens_seen": 101368336, + "step": 46920 + }, + { + "epoch": 7.6549755301794455, + "grad_norm": 0.2521311640739441, + "learning_rate": 0.0007756181958684467, + "loss": 0.0587, + "num_input_tokens_seen": 101379088, + "step": 46925 + }, + { + "epoch": 7.6557911908646, + "grad_norm": 0.03777456283569336, + "learning_rate": 0.0007755588042539414, + "loss": 0.0218, + "num_input_tokens_seen": 101389360, + "step": 46930 + }, + { + "epoch": 7.656606851549755, + "grad_norm": 0.25904545187950134, + "learning_rate": 0.0007754994070548873, + "loss": 0.0492, + "num_input_tokens_seen": 101401264, + "step": 46935 + }, + { + "epoch": 7.65742251223491, + "grad_norm": 0.008663099259138107, + "learning_rate": 0.0007754400042724881, + "loss": 0.0318, + "num_input_tokens_seen": 101411248, + "step": 46940 + }, + { + "epoch": 7.658238172920065, + "grad_norm": 0.15806740522384644, + "learning_rate": 0.0007753805959079481, + "loss": 0.0542, + "num_input_tokens_seen": 101420752, + "step": 46945 + }, + { + "epoch": 7.6590538336052205, + "grad_norm": 0.008633045479655266, + "learning_rate": 0.0007753211819624706, + "loss": 0.1152, + "num_input_tokens_seen": 101432272, + "step": 46950 + }, + { + "epoch": 7.659869494290375, + "grad_norm": 0.47052812576293945, + "learning_rate": 0.0007752617624372602, + "loss": 0.047, + "num_input_tokens_seen": 101443824, + "step": 46955 + }, + { + "epoch": 7.66068515497553, + "grad_norm": 0.22263245284557343, + "learning_rate": 0.000775202337333521, + "loss": 0.0303, + "num_input_tokens_seen": 101454512, + "step": 46960 + }, + { + "epoch": 7.661500815660685, + "grad_norm": 0.0009532614494673908, + "learning_rate": 0.0007751429066524575, + "loss": 0.1237, + "num_input_tokens_seen": 101465968, + "step": 46965 + }, + { + "epoch": 7.66231647634584, + "grad_norm": 0.0020556438248604536, + "learning_rate": 0.0007750834703952738, + "loss": 0.0419, + "num_input_tokens_seen": 101476720, + "step": 46970 + }, + { + "epoch": 7.6631321370309955, + "grad_norm": 0.0666525810956955, + "learning_rate": 0.0007750240285631745, + "loss": 0.0806, + "num_input_tokens_seen": 101487600, + "step": 46975 + }, + { + "epoch": 7.66394779771615, + "grad_norm": 0.09244846552610397, + "learning_rate": 0.0007749645811573646, + "loss": 0.065, + "num_input_tokens_seen": 101498832, + "step": 46980 + }, + { + "epoch": 7.664763458401305, + "grad_norm": 0.2173173427581787, + "learning_rate": 0.0007749051281790484, + "loss": 0.0477, + "num_input_tokens_seen": 101510000, + "step": 46985 + }, + { + "epoch": 7.66557911908646, + "grad_norm": 0.006644314154982567, + "learning_rate": 0.0007748456696294312, + "loss": 0.0871, + "num_input_tokens_seen": 101520528, + "step": 46990 + }, + { + "epoch": 7.666394779771615, + "grad_norm": 0.006989603862166405, + "learning_rate": 0.0007747862055097179, + "loss": 0.0196, + "num_input_tokens_seen": 101531504, + "step": 46995 + }, + { + "epoch": 7.6672104404567705, + "grad_norm": 0.36260828375816345, + "learning_rate": 0.0007747267358211135, + "loss": 0.1973, + "num_input_tokens_seen": 101542640, + "step": 47000 + }, + { + "epoch": 7.668026101141925, + "grad_norm": 0.11980457603931427, + "learning_rate": 0.0007746672605648231, + "loss": 0.0752, + "num_input_tokens_seen": 101553040, + "step": 47005 + }, + { + "epoch": 7.66884176182708, + "grad_norm": 0.2689124643802643, + "learning_rate": 0.0007746077797420524, + "loss": 0.1872, + "num_input_tokens_seen": 101564272, + "step": 47010 + }, + { + "epoch": 7.669657422512235, + "grad_norm": 0.04494722560048103, + "learning_rate": 0.0007745482933540067, + "loss": 0.0134, + "num_input_tokens_seen": 101574896, + "step": 47015 + }, + { + "epoch": 7.67047308319739, + "grad_norm": 0.005968465004116297, + "learning_rate": 0.0007744888014018914, + "loss": 0.0078, + "num_input_tokens_seen": 101586032, + "step": 47020 + }, + { + "epoch": 7.671288743882545, + "grad_norm": 0.004824151284992695, + "learning_rate": 0.0007744293038869125, + "loss": 0.018, + "num_input_tokens_seen": 101596784, + "step": 47025 + }, + { + "epoch": 7.672104404567699, + "grad_norm": 0.01978217624127865, + "learning_rate": 0.0007743698008102755, + "loss": 0.0438, + "num_input_tokens_seen": 101607632, + "step": 47030 + }, + { + "epoch": 7.672920065252855, + "grad_norm": 0.11269936710596085, + "learning_rate": 0.0007743102921731864, + "loss": 0.0749, + "num_input_tokens_seen": 101618064, + "step": 47035 + }, + { + "epoch": 7.67373572593801, + "grad_norm": 0.4015054702758789, + "learning_rate": 0.0007742507779768513, + "loss": 0.1683, + "num_input_tokens_seen": 101629712, + "step": 47040 + }, + { + "epoch": 7.674551386623165, + "grad_norm": 0.014630908146500587, + "learning_rate": 0.0007741912582224764, + "loss": 0.0424, + "num_input_tokens_seen": 101639920, + "step": 47045 + }, + { + "epoch": 7.6753670473083195, + "grad_norm": 0.004438962321728468, + "learning_rate": 0.0007741317329112675, + "loss": 0.0132, + "num_input_tokens_seen": 101650448, + "step": 47050 + }, + { + "epoch": 7.676182707993474, + "grad_norm": 0.06496120244264603, + "learning_rate": 0.0007740722020444315, + "loss": 0.0644, + "num_input_tokens_seen": 101661136, + "step": 47055 + }, + { + "epoch": 7.67699836867863, + "grad_norm": 0.25175753235816956, + "learning_rate": 0.0007740126656231746, + "loss": 0.1169, + "num_input_tokens_seen": 101671600, + "step": 47060 + }, + { + "epoch": 7.677814029363785, + "grad_norm": 0.23091796040534973, + "learning_rate": 0.0007739531236487034, + "loss": 0.1631, + "num_input_tokens_seen": 101681200, + "step": 47065 + }, + { + "epoch": 7.67862969004894, + "grad_norm": 0.012666295282542706, + "learning_rate": 0.0007738935761222247, + "loss": 0.114, + "num_input_tokens_seen": 101691824, + "step": 47070 + }, + { + "epoch": 7.6794453507340945, + "grad_norm": 0.08096782118082047, + "learning_rate": 0.0007738340230449451, + "loss": 0.1061, + "num_input_tokens_seen": 101701552, + "step": 47075 + }, + { + "epoch": 7.680261011419249, + "grad_norm": 0.21125862002372742, + "learning_rate": 0.0007737744644180718, + "loss": 0.0638, + "num_input_tokens_seen": 101713136, + "step": 47080 + }, + { + "epoch": 7.681076672104405, + "grad_norm": 0.11040032655000687, + "learning_rate": 0.0007737149002428114, + "loss": 0.0173, + "num_input_tokens_seen": 101723792, + "step": 47085 + }, + { + "epoch": 7.68189233278956, + "grad_norm": 0.0034124937374144793, + "learning_rate": 0.0007736553305203715, + "loss": 0.079, + "num_input_tokens_seen": 101733136, + "step": 47090 + }, + { + "epoch": 7.682707993474715, + "grad_norm": 0.01145437452942133, + "learning_rate": 0.0007735957552519592, + "loss": 0.081, + "num_input_tokens_seen": 101744496, + "step": 47095 + }, + { + "epoch": 7.6835236541598695, + "grad_norm": 0.02029622718691826, + "learning_rate": 0.0007735361744387818, + "loss": 0.0309, + "num_input_tokens_seen": 101755088, + "step": 47100 + }, + { + "epoch": 7.684339314845024, + "grad_norm": 0.029110131785273552, + "learning_rate": 0.0007734765880820468, + "loss": 0.0534, + "num_input_tokens_seen": 101764144, + "step": 47105 + }, + { + "epoch": 7.685154975530179, + "grad_norm": 0.0040335459634661674, + "learning_rate": 0.0007734169961829618, + "loss": 0.0099, + "num_input_tokens_seen": 101774960, + "step": 47110 + }, + { + "epoch": 7.685970636215334, + "grad_norm": 0.1551961600780487, + "learning_rate": 0.0007733573987427346, + "loss": 0.0268, + "num_input_tokens_seen": 101785488, + "step": 47115 + }, + { + "epoch": 7.68678629690049, + "grad_norm": 0.0011410359293222427, + "learning_rate": 0.0007732977957625729, + "loss": 0.0282, + "num_input_tokens_seen": 101795792, + "step": 47120 + }, + { + "epoch": 7.6876019575856445, + "grad_norm": 0.009819770231842995, + "learning_rate": 0.0007732381872436846, + "loss": 0.1608, + "num_input_tokens_seen": 101805168, + "step": 47125 + }, + { + "epoch": 7.688417618270799, + "grad_norm": 0.36888980865478516, + "learning_rate": 0.0007731785731872778, + "loss": 0.1409, + "num_input_tokens_seen": 101817104, + "step": 47130 + }, + { + "epoch": 7.689233278955954, + "grad_norm": 0.009072404354810715, + "learning_rate": 0.0007731189535945609, + "loss": 0.084, + "num_input_tokens_seen": 101826768, + "step": 47135 + }, + { + "epoch": 7.690048939641109, + "grad_norm": 0.14182324707508087, + "learning_rate": 0.0007730593284667416, + "loss": 0.1707, + "num_input_tokens_seen": 101835696, + "step": 47140 + }, + { + "epoch": 7.690864600326265, + "grad_norm": 0.2949478328227997, + "learning_rate": 0.0007729996978050287, + "loss": 0.0506, + "num_input_tokens_seen": 101847280, + "step": 47145 + }, + { + "epoch": 7.691680261011419, + "grad_norm": 0.010478787124156952, + "learning_rate": 0.0007729400616106308, + "loss": 0.0218, + "num_input_tokens_seen": 101859056, + "step": 47150 + }, + { + "epoch": 7.692495921696574, + "grad_norm": 0.1508476883172989, + "learning_rate": 0.0007728804198847561, + "loss": 0.2044, + "num_input_tokens_seen": 101871184, + "step": 47155 + }, + { + "epoch": 7.693311582381729, + "grad_norm": 0.12415754795074463, + "learning_rate": 0.0007728207726286136, + "loss": 0.0443, + "num_input_tokens_seen": 101881616, + "step": 47160 + }, + { + "epoch": 7.694127243066884, + "grad_norm": 0.01569686271250248, + "learning_rate": 0.000772761119843412, + "loss": 0.0265, + "num_input_tokens_seen": 101892688, + "step": 47165 + }, + { + "epoch": 7.69494290375204, + "grad_norm": 0.07221835851669312, + "learning_rate": 0.0007727014615303602, + "loss": 0.0413, + "num_input_tokens_seen": 101903696, + "step": 47170 + }, + { + "epoch": 7.695758564437194, + "grad_norm": 0.03987114503979683, + "learning_rate": 0.0007726417976906674, + "loss": 0.0323, + "num_input_tokens_seen": 101913808, + "step": 47175 + }, + { + "epoch": 7.696574225122349, + "grad_norm": 0.0109365563839674, + "learning_rate": 0.0007725821283255427, + "loss": 0.0375, + "num_input_tokens_seen": 101924304, + "step": 47180 + }, + { + "epoch": 7.697389885807504, + "grad_norm": 0.020825443789362907, + "learning_rate": 0.0007725224534361955, + "loss": 0.0574, + "num_input_tokens_seen": 101934320, + "step": 47185 + }, + { + "epoch": 7.698205546492659, + "grad_norm": 0.386552095413208, + "learning_rate": 0.000772462773023835, + "loss": 0.0995, + "num_input_tokens_seen": 101944912, + "step": 47190 + }, + { + "epoch": 7.699021207177814, + "grad_norm": 0.055130232125520706, + "learning_rate": 0.0007724030870896707, + "loss": 0.0217, + "num_input_tokens_seen": 101954992, + "step": 47195 + }, + { + "epoch": 7.699836867862969, + "grad_norm": 0.38049188256263733, + "learning_rate": 0.0007723433956349123, + "loss": 0.0683, + "num_input_tokens_seen": 101965552, + "step": 47200 + }, + { + "epoch": 7.700652528548124, + "grad_norm": 0.025845926254987717, + "learning_rate": 0.0007722836986607696, + "loss": 0.0644, + "num_input_tokens_seen": 101976304, + "step": 47205 + }, + { + "epoch": 7.701468189233279, + "grad_norm": 0.009440034627914429, + "learning_rate": 0.000772223996168452, + "loss": 0.0158, + "num_input_tokens_seen": 101987568, + "step": 47210 + }, + { + "epoch": 7.702283849918434, + "grad_norm": 0.2705138027667999, + "learning_rate": 0.0007721642881591701, + "loss": 0.0852, + "num_input_tokens_seen": 101998992, + "step": 47215 + }, + { + "epoch": 7.703099510603589, + "grad_norm": 0.01270539965480566, + "learning_rate": 0.0007721045746341335, + "loss": 0.0462, + "num_input_tokens_seen": 102009744, + "step": 47220 + }, + { + "epoch": 7.7039151712887435, + "grad_norm": 0.009581586346030235, + "learning_rate": 0.0007720448555945527, + "loss": 0.0652, + "num_input_tokens_seen": 102020528, + "step": 47225 + }, + { + "epoch": 7.704730831973899, + "grad_norm": 0.00414057495072484, + "learning_rate": 0.0007719851310416376, + "loss": 0.0147, + "num_input_tokens_seen": 102031056, + "step": 47230 + }, + { + "epoch": 7.705546492659054, + "grad_norm": 0.14105384051799774, + "learning_rate": 0.0007719254009765988, + "loss": 0.0172, + "num_input_tokens_seen": 102042480, + "step": 47235 + }, + { + "epoch": 7.706362153344209, + "grad_norm": 0.09803734719753265, + "learning_rate": 0.0007718656654006469, + "loss": 0.0487, + "num_input_tokens_seen": 102053904, + "step": 47240 + }, + { + "epoch": 7.707177814029364, + "grad_norm": 0.0025712084025144577, + "learning_rate": 0.0007718059243149921, + "loss": 0.0106, + "num_input_tokens_seen": 102064464, + "step": 47245 + }, + { + "epoch": 7.7079934747145185, + "grad_norm": 0.017207970842719078, + "learning_rate": 0.0007717461777208458, + "loss": 0.0131, + "num_input_tokens_seen": 102074832, + "step": 47250 + }, + { + "epoch": 7.708809135399674, + "grad_norm": 0.006477975752204657, + "learning_rate": 0.0007716864256194182, + "loss": 0.16, + "num_input_tokens_seen": 102087088, + "step": 47255 + }, + { + "epoch": 7.709624796084829, + "grad_norm": 0.0029095339123159647, + "learning_rate": 0.0007716266680119207, + "loss": 0.0223, + "num_input_tokens_seen": 102098576, + "step": 47260 + }, + { + "epoch": 7.710440456769984, + "grad_norm": 0.042995352298021317, + "learning_rate": 0.0007715669048995641, + "loss": 0.1378, + "num_input_tokens_seen": 102109360, + "step": 47265 + }, + { + "epoch": 7.711256117455139, + "grad_norm": 0.013274877332150936, + "learning_rate": 0.0007715071362835597, + "loss": 0.0187, + "num_input_tokens_seen": 102119344, + "step": 47270 + }, + { + "epoch": 7.712071778140293, + "grad_norm": 0.28431835770606995, + "learning_rate": 0.0007714473621651188, + "loss": 0.0345, + "num_input_tokens_seen": 102130192, + "step": 47275 + }, + { + "epoch": 7.712887438825448, + "grad_norm": 0.06582538783550262, + "learning_rate": 0.0007713875825454526, + "loss": 0.1525, + "num_input_tokens_seen": 102140496, + "step": 47280 + }, + { + "epoch": 7.713703099510604, + "grad_norm": 0.02666584588587284, + "learning_rate": 0.0007713277974257729, + "loss": 0.0184, + "num_input_tokens_seen": 102150448, + "step": 47285 + }, + { + "epoch": 7.714518760195759, + "grad_norm": 0.045763175934553146, + "learning_rate": 0.0007712680068072911, + "loss": 0.0853, + "num_input_tokens_seen": 102160848, + "step": 47290 + }, + { + "epoch": 7.715334420880914, + "grad_norm": 0.005454830825328827, + "learning_rate": 0.000771208210691219, + "loss": 0.0161, + "num_input_tokens_seen": 102171152, + "step": 47295 + }, + { + "epoch": 7.716150081566068, + "grad_norm": 0.0028063564095646143, + "learning_rate": 0.0007711484090787686, + "loss": 0.0423, + "num_input_tokens_seen": 102181840, + "step": 47300 + }, + { + "epoch": 7.716965742251223, + "grad_norm": 0.026000995188951492, + "learning_rate": 0.0007710886019711516, + "loss": 0.0331, + "num_input_tokens_seen": 102193328, + "step": 47305 + }, + { + "epoch": 7.717781402936378, + "grad_norm": 0.21542233228683472, + "learning_rate": 0.0007710287893695803, + "loss": 0.0964, + "num_input_tokens_seen": 102203632, + "step": 47310 + }, + { + "epoch": 7.718597063621534, + "grad_norm": 0.004267824813723564, + "learning_rate": 0.0007709689712752666, + "loss": 0.0594, + "num_input_tokens_seen": 102214864, + "step": 47315 + }, + { + "epoch": 7.719412724306689, + "grad_norm": 0.05040392652153969, + "learning_rate": 0.000770909147689423, + "loss": 0.049, + "num_input_tokens_seen": 102225392, + "step": 47320 + }, + { + "epoch": 7.720228384991843, + "grad_norm": 0.003110036253929138, + "learning_rate": 0.000770849318613262, + "loss": 0.0652, + "num_input_tokens_seen": 102235408, + "step": 47325 + }, + { + "epoch": 7.721044045676998, + "grad_norm": 0.007758749648928642, + "learning_rate": 0.0007707894840479957, + "loss": 0.0663, + "num_input_tokens_seen": 102246672, + "step": 47330 + }, + { + "epoch": 7.721859706362153, + "grad_norm": 0.12128084897994995, + "learning_rate": 0.0007707296439948372, + "loss": 0.0693, + "num_input_tokens_seen": 102256752, + "step": 47335 + }, + { + "epoch": 7.722675367047309, + "grad_norm": 0.08663403242826462, + "learning_rate": 0.0007706697984549988, + "loss": 0.0679, + "num_input_tokens_seen": 102267280, + "step": 47340 + }, + { + "epoch": 7.7234910277324635, + "grad_norm": 0.021859407424926758, + "learning_rate": 0.0007706099474296938, + "loss": 0.0234, + "num_input_tokens_seen": 102278928, + "step": 47345 + }, + { + "epoch": 7.724306688417618, + "grad_norm": 0.013470686972141266, + "learning_rate": 0.0007705500909201349, + "loss": 0.0075, + "num_input_tokens_seen": 102290608, + "step": 47350 + }, + { + "epoch": 7.725122349102773, + "grad_norm": 0.31886106729507446, + "learning_rate": 0.0007704902289275351, + "loss": 0.1433, + "num_input_tokens_seen": 102301456, + "step": 47355 + }, + { + "epoch": 7.725938009787928, + "grad_norm": 0.016273748129606247, + "learning_rate": 0.0007704303614531076, + "loss": 0.1529, + "num_input_tokens_seen": 102311760, + "step": 47360 + }, + { + "epoch": 7.726753670473083, + "grad_norm": 0.0390392541885376, + "learning_rate": 0.0007703704884980659, + "loss": 0.01, + "num_input_tokens_seen": 102322928, + "step": 47365 + }, + { + "epoch": 7.7275693311582385, + "grad_norm": 0.022559884935617447, + "learning_rate": 0.0007703106100636233, + "loss": 0.1171, + "num_input_tokens_seen": 102333488, + "step": 47370 + }, + { + "epoch": 7.728384991843393, + "grad_norm": 0.1979581117630005, + "learning_rate": 0.0007702507261509932, + "loss": 0.0402, + "num_input_tokens_seen": 102344240, + "step": 47375 + }, + { + "epoch": 7.729200652528548, + "grad_norm": 0.03317411243915558, + "learning_rate": 0.000770190836761389, + "loss": 0.0135, + "num_input_tokens_seen": 102354544, + "step": 47380 + }, + { + "epoch": 7.730016313213703, + "grad_norm": 0.13358907401561737, + "learning_rate": 0.0007701309418960252, + "loss": 0.0188, + "num_input_tokens_seen": 102365456, + "step": 47385 + }, + { + "epoch": 7.730831973898858, + "grad_norm": 0.010089668445289135, + "learning_rate": 0.000770071041556115, + "loss": 0.0294, + "num_input_tokens_seen": 102376560, + "step": 47390 + }, + { + "epoch": 7.731647634584013, + "grad_norm": 0.22938187420368195, + "learning_rate": 0.0007700111357428724, + "loss": 0.2544, + "num_input_tokens_seen": 102387184, + "step": 47395 + }, + { + "epoch": 7.732463295269168, + "grad_norm": 0.005095354747027159, + "learning_rate": 0.0007699512244575118, + "loss": 0.0726, + "num_input_tokens_seen": 102398128, + "step": 47400 + }, + { + "epoch": 7.733278955954323, + "grad_norm": 0.3448812961578369, + "learning_rate": 0.0007698913077012471, + "loss": 0.1301, + "num_input_tokens_seen": 102409840, + "step": 47405 + }, + { + "epoch": 7.734094616639478, + "grad_norm": 0.013782687485218048, + "learning_rate": 0.0007698313854752925, + "loss": 0.1804, + "num_input_tokens_seen": 102418544, + "step": 47410 + }, + { + "epoch": 7.734910277324633, + "grad_norm": 0.036323726177215576, + "learning_rate": 0.0007697714577808627, + "loss": 0.0443, + "num_input_tokens_seen": 102429872, + "step": 47415 + }, + { + "epoch": 7.735725938009788, + "grad_norm": 0.35927650332450867, + "learning_rate": 0.0007697115246191723, + "loss": 0.0777, + "num_input_tokens_seen": 102441424, + "step": 47420 + }, + { + "epoch": 7.736541598694943, + "grad_norm": 0.12467711418867111, + "learning_rate": 0.0007696515859914355, + "loss": 0.1231, + "num_input_tokens_seen": 102451568, + "step": 47425 + }, + { + "epoch": 7.737357259380098, + "grad_norm": 0.13213302195072174, + "learning_rate": 0.0007695916418988672, + "loss": 0.0862, + "num_input_tokens_seen": 102463312, + "step": 47430 + }, + { + "epoch": 7.738172920065253, + "grad_norm": 0.060174569487571716, + "learning_rate": 0.0007695316923426823, + "loss": 0.1374, + "num_input_tokens_seen": 102473584, + "step": 47435 + }, + { + "epoch": 7.738988580750408, + "grad_norm": 0.01953984424471855, + "learning_rate": 0.0007694717373240957, + "loss": 0.0782, + "num_input_tokens_seen": 102484592, + "step": 47440 + }, + { + "epoch": 7.739804241435563, + "grad_norm": 0.02944285422563553, + "learning_rate": 0.0007694117768443225, + "loss": 0.0167, + "num_input_tokens_seen": 102494960, + "step": 47445 + }, + { + "epoch": 7.740619902120718, + "grad_norm": 0.06595656275749207, + "learning_rate": 0.0007693518109045779, + "loss": 0.0576, + "num_input_tokens_seen": 102505872, + "step": 47450 + }, + { + "epoch": 7.741435562805873, + "grad_norm": 0.0057144612073898315, + "learning_rate": 0.0007692918395060772, + "loss": 0.0318, + "num_input_tokens_seen": 102518320, + "step": 47455 + }, + { + "epoch": 7.742251223491028, + "grad_norm": 0.22104227542877197, + "learning_rate": 0.0007692318626500357, + "loss": 0.0608, + "num_input_tokens_seen": 102529488, + "step": 47460 + }, + { + "epoch": 7.743066884176183, + "grad_norm": 0.09873582422733307, + "learning_rate": 0.000769171880337669, + "loss": 0.0279, + "num_input_tokens_seen": 102539920, + "step": 47465 + }, + { + "epoch": 7.7438825448613375, + "grad_norm": 0.011355056427419186, + "learning_rate": 0.0007691118925701927, + "loss": 0.1026, + "num_input_tokens_seen": 102550832, + "step": 47470 + }, + { + "epoch": 7.744698205546492, + "grad_norm": 0.0603439062833786, + "learning_rate": 0.0007690518993488225, + "loss": 0.1269, + "num_input_tokens_seen": 102560784, + "step": 47475 + }, + { + "epoch": 7.745513866231647, + "grad_norm": 0.004944812506437302, + "learning_rate": 0.0007689919006747741, + "loss": 0.0378, + "num_input_tokens_seen": 102571920, + "step": 47480 + }, + { + "epoch": 7.746329526916803, + "grad_norm": 0.04303191974759102, + "learning_rate": 0.0007689318965492637, + "loss": 0.0342, + "num_input_tokens_seen": 102582288, + "step": 47485 + }, + { + "epoch": 7.747145187601958, + "grad_norm": 0.1496291607618332, + "learning_rate": 0.0007688718869735072, + "loss": 0.2258, + "num_input_tokens_seen": 102592400, + "step": 47490 + }, + { + "epoch": 7.7479608482871125, + "grad_norm": 0.13335032761096954, + "learning_rate": 0.0007688118719487209, + "loss": 0.0704, + "num_input_tokens_seen": 102604112, + "step": 47495 + }, + { + "epoch": 7.748776508972267, + "grad_norm": 0.02721407637000084, + "learning_rate": 0.000768751851476121, + "loss": 0.0234, + "num_input_tokens_seen": 102615760, + "step": 47500 + }, + { + "epoch": 7.749592169657422, + "grad_norm": 0.009211353026330471, + "learning_rate": 0.0007686918255569238, + "loss": 0.0356, + "num_input_tokens_seen": 102626640, + "step": 47505 + }, + { + "epoch": 7.750407830342578, + "grad_norm": 0.00823135394603014, + "learning_rate": 0.000768631794192346, + "loss": 0.0338, + "num_input_tokens_seen": 102635280, + "step": 47510 + }, + { + "epoch": 7.751223491027733, + "grad_norm": 0.21994829177856445, + "learning_rate": 0.0007685717573836041, + "loss": 0.2531, + "num_input_tokens_seen": 102644592, + "step": 47515 + }, + { + "epoch": 7.7520391517128875, + "grad_norm": 0.1314292550086975, + "learning_rate": 0.0007685117151319148, + "loss": 0.0189, + "num_input_tokens_seen": 102655792, + "step": 47520 + }, + { + "epoch": 7.752854812398042, + "grad_norm": 0.07114052772521973, + "learning_rate": 0.000768451667438495, + "loss": 0.0133, + "num_input_tokens_seen": 102664912, + "step": 47525 + }, + { + "epoch": 7.753670473083197, + "grad_norm": 0.26361361145973206, + "learning_rate": 0.0007683916143045615, + "loss": 0.1825, + "num_input_tokens_seen": 102675952, + "step": 47530 + }, + { + "epoch": 7.754486133768353, + "grad_norm": 0.0912046879529953, + "learning_rate": 0.0007683315557313315, + "loss": 0.0991, + "num_input_tokens_seen": 102686352, + "step": 47535 + }, + { + "epoch": 7.755301794453508, + "grad_norm": 0.13648521900177002, + "learning_rate": 0.0007682714917200222, + "loss": 0.272, + "num_input_tokens_seen": 102697136, + "step": 47540 + }, + { + "epoch": 7.7561174551386625, + "grad_norm": 0.11068026721477509, + "learning_rate": 0.0007682114222718507, + "loss": 0.1335, + "num_input_tokens_seen": 102708496, + "step": 47545 + }, + { + "epoch": 7.756933115823817, + "grad_norm": 0.07234393805265427, + "learning_rate": 0.0007681513473880345, + "loss": 0.047, + "num_input_tokens_seen": 102718736, + "step": 47550 + }, + { + "epoch": 7.757748776508972, + "grad_norm": 0.16802458465099335, + "learning_rate": 0.000768091267069791, + "loss": 0.0532, + "num_input_tokens_seen": 102729904, + "step": 47555 + }, + { + "epoch": 7.758564437194127, + "grad_norm": 0.03538018837571144, + "learning_rate": 0.000768031181318338, + "loss": 0.0343, + "num_input_tokens_seen": 102739664, + "step": 47560 + }, + { + "epoch": 7.759380097879282, + "grad_norm": 0.08238279074430466, + "learning_rate": 0.000767971090134893, + "loss": 0.0939, + "num_input_tokens_seen": 102750480, + "step": 47565 + }, + { + "epoch": 7.760195758564437, + "grad_norm": 0.23739773035049438, + "learning_rate": 0.0007679109935206741, + "loss": 0.3616, + "num_input_tokens_seen": 102762256, + "step": 47570 + }, + { + "epoch": 7.761011419249592, + "grad_norm": 0.002562036272138357, + "learning_rate": 0.0007678508914768989, + "loss": 0.0531, + "num_input_tokens_seen": 102771856, + "step": 47575 + }, + { + "epoch": 7.761827079934747, + "grad_norm": 0.5261669158935547, + "learning_rate": 0.0007677907840047855, + "loss": 0.0518, + "num_input_tokens_seen": 102781744, + "step": 47580 + }, + { + "epoch": 7.762642740619902, + "grad_norm": 0.013511805795133114, + "learning_rate": 0.0007677306711055523, + "loss": 0.1552, + "num_input_tokens_seen": 102792272, + "step": 47585 + }, + { + "epoch": 7.763458401305057, + "grad_norm": 0.050765104591846466, + "learning_rate": 0.0007676705527804173, + "loss": 0.0508, + "num_input_tokens_seen": 102802608, + "step": 47590 + }, + { + "epoch": 7.764274061990212, + "grad_norm": 0.010733344592154026, + "learning_rate": 0.000767610429030599, + "loss": 0.0236, + "num_input_tokens_seen": 102812784, + "step": 47595 + }, + { + "epoch": 7.765089722675367, + "grad_norm": 0.16465511918067932, + "learning_rate": 0.0007675502998573159, + "loss": 0.0481, + "num_input_tokens_seen": 102823792, + "step": 47600 + }, + { + "epoch": 7.765905383360522, + "grad_norm": 0.005473458673804998, + "learning_rate": 0.0007674901652617865, + "loss": 0.0576, + "num_input_tokens_seen": 102833712, + "step": 47605 + }, + { + "epoch": 7.766721044045677, + "grad_norm": 0.2899492681026459, + "learning_rate": 0.0007674300252452297, + "loss": 0.0938, + "num_input_tokens_seen": 102845232, + "step": 47610 + }, + { + "epoch": 7.767536704730832, + "grad_norm": 0.012640978209674358, + "learning_rate": 0.000767369879808864, + "loss": 0.0254, + "num_input_tokens_seen": 102856944, + "step": 47615 + }, + { + "epoch": 7.768352365415987, + "grad_norm": 0.0032530981115996838, + "learning_rate": 0.0007673097289539086, + "loss": 0.0265, + "num_input_tokens_seen": 102867856, + "step": 47620 + }, + { + "epoch": 7.769168026101142, + "grad_norm": 0.050850965082645416, + "learning_rate": 0.0007672495726815825, + "loss": 0.0641, + "num_input_tokens_seen": 102878064, + "step": 47625 + }, + { + "epoch": 7.769983686786297, + "grad_norm": 0.004038092214614153, + "learning_rate": 0.0007671894109931048, + "loss": 0.0452, + "num_input_tokens_seen": 102890320, + "step": 47630 + }, + { + "epoch": 7.770799347471452, + "grad_norm": 0.007436644751578569, + "learning_rate": 0.0007671292438896946, + "loss": 0.0517, + "num_input_tokens_seen": 102901296, + "step": 47635 + }, + { + "epoch": 7.771615008156607, + "grad_norm": 0.12992477416992188, + "learning_rate": 0.0007670690713725715, + "loss": 0.0603, + "num_input_tokens_seen": 102911856, + "step": 47640 + }, + { + "epoch": 7.7724306688417615, + "grad_norm": 0.08481542021036148, + "learning_rate": 0.0007670088934429548, + "loss": 0.0352, + "num_input_tokens_seen": 102921488, + "step": 47645 + }, + { + "epoch": 7.773246329526917, + "grad_norm": 0.012516772374510765, + "learning_rate": 0.0007669487101020642, + "loss": 0.1126, + "num_input_tokens_seen": 102931984, + "step": 47650 + }, + { + "epoch": 7.774061990212072, + "grad_norm": 0.08736187219619751, + "learning_rate": 0.0007668885213511193, + "loss": 0.0573, + "num_input_tokens_seen": 102943568, + "step": 47655 + }, + { + "epoch": 7.774877650897227, + "grad_norm": 0.012583942152559757, + "learning_rate": 0.0007668283271913399, + "loss": 0.1129, + "num_input_tokens_seen": 102954288, + "step": 47660 + }, + { + "epoch": 7.775693311582382, + "grad_norm": 0.09388376772403717, + "learning_rate": 0.000766768127623946, + "loss": 0.0823, + "num_input_tokens_seen": 102964304, + "step": 47665 + }, + { + "epoch": 7.7765089722675365, + "grad_norm": 0.0643705278635025, + "learning_rate": 0.0007667079226501576, + "loss": 0.0343, + "num_input_tokens_seen": 102975248, + "step": 47670 + }, + { + "epoch": 7.777324632952691, + "grad_norm": 0.008023286238312721, + "learning_rate": 0.0007666477122711948, + "loss": 0.0354, + "num_input_tokens_seen": 102986576, + "step": 47675 + }, + { + "epoch": 7.778140293637847, + "grad_norm": 0.30081382393836975, + "learning_rate": 0.000766587496488278, + "loss": 0.1358, + "num_input_tokens_seen": 102997584, + "step": 47680 + }, + { + "epoch": 7.778955954323002, + "grad_norm": 0.03647547587752342, + "learning_rate": 0.0007665272753026271, + "loss": 0.0171, + "num_input_tokens_seen": 103008368, + "step": 47685 + }, + { + "epoch": 7.779771615008157, + "grad_norm": 0.004456724505871534, + "learning_rate": 0.000766467048715463, + "loss": 0.0149, + "num_input_tokens_seen": 103019568, + "step": 47690 + }, + { + "epoch": 7.780587275693311, + "grad_norm": 0.055073726922273636, + "learning_rate": 0.000766406816728006, + "loss": 0.0918, + "num_input_tokens_seen": 103030256, + "step": 47695 + }, + { + "epoch": 7.781402936378466, + "grad_norm": 0.3227660357952118, + "learning_rate": 0.000766346579341477, + "loss": 0.0326, + "num_input_tokens_seen": 103041168, + "step": 47700 + }, + { + "epoch": 7.782218597063622, + "grad_norm": 0.06537395715713501, + "learning_rate": 0.0007662863365570967, + "loss": 0.1407, + "num_input_tokens_seen": 103052464, + "step": 47705 + }, + { + "epoch": 7.783034257748777, + "grad_norm": 0.3882252871990204, + "learning_rate": 0.000766226088376086, + "loss": 0.0276, + "num_input_tokens_seen": 103063376, + "step": 47710 + }, + { + "epoch": 7.783849918433932, + "grad_norm": 0.04566549137234688, + "learning_rate": 0.0007661658347996659, + "loss": 0.0638, + "num_input_tokens_seen": 103074448, + "step": 47715 + }, + { + "epoch": 7.784665579119086, + "grad_norm": 0.039481550455093384, + "learning_rate": 0.0007661055758290574, + "loss": 0.204, + "num_input_tokens_seen": 103084912, + "step": 47720 + }, + { + "epoch": 7.785481239804241, + "grad_norm": 0.24234062433242798, + "learning_rate": 0.0007660453114654819, + "loss": 0.1593, + "num_input_tokens_seen": 103095408, + "step": 47725 + }, + { + "epoch": 7.786296900489396, + "grad_norm": 0.24106614291667938, + "learning_rate": 0.0007659850417101606, + "loss": 0.0674, + "num_input_tokens_seen": 103105776, + "step": 47730 + }, + { + "epoch": 7.787112561174552, + "grad_norm": 0.013635087758302689, + "learning_rate": 0.0007659247665643151, + "loss": 0.1237, + "num_input_tokens_seen": 103117392, + "step": 47735 + }, + { + "epoch": 7.787928221859707, + "grad_norm": 0.05942991003394127, + "learning_rate": 0.0007658644860291668, + "loss": 0.0434, + "num_input_tokens_seen": 103127344, + "step": 47740 + }, + { + "epoch": 7.788743882544861, + "grad_norm": 0.008099487982690334, + "learning_rate": 0.0007658042001059373, + "loss": 0.1783, + "num_input_tokens_seen": 103136848, + "step": 47745 + }, + { + "epoch": 7.789559543230016, + "grad_norm": 0.01683001220226288, + "learning_rate": 0.0007657439087958486, + "loss": 0.0663, + "num_input_tokens_seen": 103148208, + "step": 47750 + }, + { + "epoch": 7.790375203915171, + "grad_norm": 0.05494864284992218, + "learning_rate": 0.0007656836121001225, + "loss": 0.0163, + "num_input_tokens_seen": 103158832, + "step": 47755 + }, + { + "epoch": 7.791190864600326, + "grad_norm": 0.1892729550600052, + "learning_rate": 0.0007656233100199809, + "loss": 0.1048, + "num_input_tokens_seen": 103168688, + "step": 47760 + }, + { + "epoch": 7.7920065252854815, + "grad_norm": 0.05981948971748352, + "learning_rate": 0.000765563002556646, + "loss": 0.0575, + "num_input_tokens_seen": 103178736, + "step": 47765 + }, + { + "epoch": 7.792822185970636, + "grad_norm": 0.216399148106575, + "learning_rate": 0.00076550268971134, + "loss": 0.1477, + "num_input_tokens_seen": 103188528, + "step": 47770 + }, + { + "epoch": 7.793637846655791, + "grad_norm": 0.09204624593257904, + "learning_rate": 0.0007654423714852852, + "loss": 0.0542, + "num_input_tokens_seen": 103198736, + "step": 47775 + }, + { + "epoch": 7.794453507340946, + "grad_norm": 0.11351175606250763, + "learning_rate": 0.0007653820478797038, + "loss": 0.0609, + "num_input_tokens_seen": 103209296, + "step": 47780 + }, + { + "epoch": 7.795269168026101, + "grad_norm": 0.2176835536956787, + "learning_rate": 0.0007653217188958188, + "loss": 0.188, + "num_input_tokens_seen": 103219632, + "step": 47785 + }, + { + "epoch": 7.7960848287112565, + "grad_norm": 0.025288639590144157, + "learning_rate": 0.0007652613845348524, + "loss": 0.0448, + "num_input_tokens_seen": 103230960, + "step": 47790 + }, + { + "epoch": 7.796900489396411, + "grad_norm": 0.03485998511314392, + "learning_rate": 0.0007652010447980276, + "loss": 0.047, + "num_input_tokens_seen": 103240912, + "step": 47795 + }, + { + "epoch": 7.797716150081566, + "grad_norm": 0.004971094895154238, + "learning_rate": 0.0007651406996865672, + "loss": 0.0318, + "num_input_tokens_seen": 103252752, + "step": 47800 + }, + { + "epoch": 7.798531810766721, + "grad_norm": 0.2158200442790985, + "learning_rate": 0.000765080349201694, + "loss": 0.0414, + "num_input_tokens_seen": 103262800, + "step": 47805 + }, + { + "epoch": 7.799347471451876, + "grad_norm": 0.0057363430969417095, + "learning_rate": 0.0007650199933446314, + "loss": 0.1655, + "num_input_tokens_seen": 103273680, + "step": 47810 + }, + { + "epoch": 7.800163132137031, + "grad_norm": 0.13042238354682922, + "learning_rate": 0.0007649596321166025, + "loss": 0.0229, + "num_input_tokens_seen": 103285200, + "step": 47815 + }, + { + "epoch": 7.800978792822186, + "grad_norm": 0.010898235253989697, + "learning_rate": 0.0007648992655188305, + "loss": 0.0878, + "num_input_tokens_seen": 103295568, + "step": 47820 + }, + { + "epoch": 7.801794453507341, + "grad_norm": 0.08140433579683304, + "learning_rate": 0.0007648388935525388, + "loss": 0.0804, + "num_input_tokens_seen": 103305648, + "step": 47825 + }, + { + "epoch": 7.802610114192496, + "grad_norm": 0.036025699228048325, + "learning_rate": 0.0007647785162189509, + "loss": 0.0935, + "num_input_tokens_seen": 103317264, + "step": 47830 + }, + { + "epoch": 7.803425774877651, + "grad_norm": 0.026536036282777786, + "learning_rate": 0.0007647181335192905, + "loss": 0.0664, + "num_input_tokens_seen": 103328720, + "step": 47835 + }, + { + "epoch": 7.804241435562806, + "grad_norm": 0.1489490419626236, + "learning_rate": 0.0007646577454547814, + "loss": 0.038, + "num_input_tokens_seen": 103340464, + "step": 47840 + }, + { + "epoch": 7.80505709624796, + "grad_norm": 0.016447249799966812, + "learning_rate": 0.0007645973520266472, + "loss": 0.0407, + "num_input_tokens_seen": 103350992, + "step": 47845 + }, + { + "epoch": 7.805872756933116, + "grad_norm": 0.23225589096546173, + "learning_rate": 0.000764536953236112, + "loss": 0.1121, + "num_input_tokens_seen": 103362736, + "step": 47850 + }, + { + "epoch": 7.806688417618271, + "grad_norm": 0.024745440110564232, + "learning_rate": 0.0007644765490844, + "loss": 0.0941, + "num_input_tokens_seen": 103373136, + "step": 47855 + }, + { + "epoch": 7.807504078303426, + "grad_norm": 0.18811935186386108, + "learning_rate": 0.0007644161395727352, + "loss": 0.0494, + "num_input_tokens_seen": 103383504, + "step": 47860 + }, + { + "epoch": 7.808319738988581, + "grad_norm": 0.2695559561252594, + "learning_rate": 0.0007643557247023418, + "loss": 0.0746, + "num_input_tokens_seen": 103393808, + "step": 47865 + }, + { + "epoch": 7.809135399673735, + "grad_norm": 0.03419940173625946, + "learning_rate": 0.0007642953044744443, + "loss": 0.0669, + "num_input_tokens_seen": 103405936, + "step": 47870 + }, + { + "epoch": 7.809951060358891, + "grad_norm": 0.018129676580429077, + "learning_rate": 0.0007642348788902672, + "loss": 0.0814, + "num_input_tokens_seen": 103416464, + "step": 47875 + }, + { + "epoch": 7.810766721044046, + "grad_norm": 0.2084517925977707, + "learning_rate": 0.000764174447951035, + "loss": 0.0363, + "num_input_tokens_seen": 103426768, + "step": 47880 + }, + { + "epoch": 7.811582381729201, + "grad_norm": 0.08284687250852585, + "learning_rate": 0.0007641140116579725, + "loss": 0.0782, + "num_input_tokens_seen": 103436656, + "step": 47885 + }, + { + "epoch": 7.8123980424143555, + "grad_norm": 0.07356946915388107, + "learning_rate": 0.0007640535700123047, + "loss": 0.0255, + "num_input_tokens_seen": 103447504, + "step": 47890 + }, + { + "epoch": 7.81321370309951, + "grad_norm": 0.018549971282482147, + "learning_rate": 0.000763993123015256, + "loss": 0.0239, + "num_input_tokens_seen": 103458800, + "step": 47895 + }, + { + "epoch": 7.814029363784666, + "grad_norm": 0.001164857647381723, + "learning_rate": 0.0007639326706680521, + "loss": 0.0284, + "num_input_tokens_seen": 103468944, + "step": 47900 + }, + { + "epoch": 7.814845024469821, + "grad_norm": 0.0029101655818521976, + "learning_rate": 0.0007638722129719175, + "loss": 0.0222, + "num_input_tokens_seen": 103478928, + "step": 47905 + }, + { + "epoch": 7.815660685154976, + "grad_norm": 0.005133399274200201, + "learning_rate": 0.0007638117499280778, + "loss": 0.014, + "num_input_tokens_seen": 103491024, + "step": 47910 + }, + { + "epoch": 7.8164763458401305, + "grad_norm": 0.10789318382740021, + "learning_rate": 0.0007637512815377585, + "loss": 0.0456, + "num_input_tokens_seen": 103503600, + "step": 47915 + }, + { + "epoch": 7.817292006525285, + "grad_norm": 0.417122483253479, + "learning_rate": 0.0007636908078021848, + "loss": 0.0817, + "num_input_tokens_seen": 103513136, + "step": 47920 + }, + { + "epoch": 7.81810766721044, + "grad_norm": 0.1550966501235962, + "learning_rate": 0.0007636303287225823, + "loss": 0.1408, + "num_input_tokens_seen": 103523696, + "step": 47925 + }, + { + "epoch": 7.818923327895595, + "grad_norm": 0.24719803035259247, + "learning_rate": 0.0007635698443001768, + "loss": 0.1584, + "num_input_tokens_seen": 103535088, + "step": 47930 + }, + { + "epoch": 7.819738988580751, + "grad_norm": 0.1656491756439209, + "learning_rate": 0.0007635093545361942, + "loss": 0.0848, + "num_input_tokens_seen": 103544976, + "step": 47935 + }, + { + "epoch": 7.8205546492659055, + "grad_norm": 0.011646476574242115, + "learning_rate": 0.00076344885943186, + "loss": 0.158, + "num_input_tokens_seen": 103555824, + "step": 47940 + }, + { + "epoch": 7.82137030995106, + "grad_norm": 0.2617485821247101, + "learning_rate": 0.0007633883589884007, + "loss": 0.1808, + "num_input_tokens_seen": 103566544, + "step": 47945 + }, + { + "epoch": 7.822185970636215, + "grad_norm": 0.0567353293299675, + "learning_rate": 0.000763327853207042, + "loss": 0.0203, + "num_input_tokens_seen": 103577680, + "step": 47950 + }, + { + "epoch": 7.82300163132137, + "grad_norm": 0.07070982456207275, + "learning_rate": 0.0007632673420890104, + "loss": 0.0291, + "num_input_tokens_seen": 103587280, + "step": 47955 + }, + { + "epoch": 7.823817292006526, + "grad_norm": 0.08610428869724274, + "learning_rate": 0.000763206825635532, + "loss": 0.0578, + "num_input_tokens_seen": 103598896, + "step": 47960 + }, + { + "epoch": 7.8246329526916805, + "grad_norm": 0.28003424406051636, + "learning_rate": 0.0007631463038478334, + "loss": 0.1182, + "num_input_tokens_seen": 103608976, + "step": 47965 + }, + { + "epoch": 7.825448613376835, + "grad_norm": 0.015281864441931248, + "learning_rate": 0.0007630857767271413, + "loss": 0.0134, + "num_input_tokens_seen": 103620080, + "step": 47970 + }, + { + "epoch": 7.82626427406199, + "grad_norm": 0.08863025903701782, + "learning_rate": 0.000763025244274682, + "loss": 0.0249, + "num_input_tokens_seen": 103631312, + "step": 47975 + }, + { + "epoch": 7.827079934747145, + "grad_norm": 0.5021570324897766, + "learning_rate": 0.0007629647064916825, + "loss": 0.0948, + "num_input_tokens_seen": 103641360, + "step": 47980 + }, + { + "epoch": 7.827895595432301, + "grad_norm": 0.07482955604791641, + "learning_rate": 0.0007629041633793696, + "loss": 0.0867, + "num_input_tokens_seen": 103652944, + "step": 47985 + }, + { + "epoch": 7.828711256117455, + "grad_norm": 0.1137724220752716, + "learning_rate": 0.0007628436149389703, + "loss": 0.1346, + "num_input_tokens_seen": 103662544, + "step": 47990 + }, + { + "epoch": 7.82952691680261, + "grad_norm": 0.07171276956796646, + "learning_rate": 0.000762783061171712, + "loss": 0.0905, + "num_input_tokens_seen": 103672496, + "step": 47995 + }, + { + "epoch": 7.830342577487765, + "grad_norm": 0.01498460490256548, + "learning_rate": 0.0007627225020788213, + "loss": 0.0511, + "num_input_tokens_seen": 103683312, + "step": 48000 + }, + { + "epoch": 7.83115823817292, + "grad_norm": 0.08065731823444366, + "learning_rate": 0.0007626619376615258, + "loss": 0.0343, + "num_input_tokens_seen": 103694320, + "step": 48005 + }, + { + "epoch": 7.831973898858075, + "grad_norm": 0.22067171335220337, + "learning_rate": 0.000762601367921053, + "loss": 0.0691, + "num_input_tokens_seen": 103705712, + "step": 48010 + }, + { + "epoch": 7.8327895595432295, + "grad_norm": 0.2833503484725952, + "learning_rate": 0.0007625407928586303, + "loss": 0.0494, + "num_input_tokens_seen": 103717424, + "step": 48015 + }, + { + "epoch": 7.833605220228385, + "grad_norm": 0.053285736590623856, + "learning_rate": 0.0007624802124754855, + "loss": 0.0152, + "num_input_tokens_seen": 103729008, + "step": 48020 + }, + { + "epoch": 7.83442088091354, + "grad_norm": 0.3245934844017029, + "learning_rate": 0.000762419626772846, + "loss": 0.0347, + "num_input_tokens_seen": 103740464, + "step": 48025 + }, + { + "epoch": 7.835236541598695, + "grad_norm": 0.5312497019767761, + "learning_rate": 0.0007623590357519401, + "loss": 0.1356, + "num_input_tokens_seen": 103751088, + "step": 48030 + }, + { + "epoch": 7.83605220228385, + "grad_norm": 0.0032611230853945017, + "learning_rate": 0.0007622984394139953, + "loss": 0.1581, + "num_input_tokens_seen": 103760432, + "step": 48035 + }, + { + "epoch": 7.8368678629690045, + "grad_norm": 0.0028083904180675745, + "learning_rate": 0.00076223783776024, + "loss": 0.015, + "num_input_tokens_seen": 103771120, + "step": 48040 + }, + { + "epoch": 7.83768352365416, + "grad_norm": 0.09268619865179062, + "learning_rate": 0.0007621772307919022, + "loss": 0.0176, + "num_input_tokens_seen": 103780144, + "step": 48045 + }, + { + "epoch": 7.838499184339315, + "grad_norm": 0.012299345806241035, + "learning_rate": 0.0007621166185102104, + "loss": 0.0105, + "num_input_tokens_seen": 103790928, + "step": 48050 + }, + { + "epoch": 7.83931484502447, + "grad_norm": 0.32112395763397217, + "learning_rate": 0.0007620560009163926, + "loss": 0.1263, + "num_input_tokens_seen": 103802064, + "step": 48055 + }, + { + "epoch": 7.840130505709625, + "grad_norm": 0.01307889074087143, + "learning_rate": 0.0007619953780116775, + "loss": 0.1268, + "num_input_tokens_seen": 103813872, + "step": 48060 + }, + { + "epoch": 7.8409461663947795, + "grad_norm": 0.045404642820358276, + "learning_rate": 0.0007619347497972937, + "loss": 0.0942, + "num_input_tokens_seen": 103825232, + "step": 48065 + }, + { + "epoch": 7.841761827079935, + "grad_norm": 0.2753269672393799, + "learning_rate": 0.00076187411627447, + "loss": 0.1738, + "num_input_tokens_seen": 103834992, + "step": 48070 + }, + { + "epoch": 7.84257748776509, + "grad_norm": 0.008529874496161938, + "learning_rate": 0.0007618134774444351, + "loss": 0.0346, + "num_input_tokens_seen": 103846384, + "step": 48075 + }, + { + "epoch": 7.843393148450245, + "grad_norm": 0.22419241070747375, + "learning_rate": 0.0007617528333084178, + "loss": 0.0849, + "num_input_tokens_seen": 103855888, + "step": 48080 + }, + { + "epoch": 7.8442088091354, + "grad_norm": 0.15129628777503967, + "learning_rate": 0.0007616921838676475, + "loss": 0.102, + "num_input_tokens_seen": 103867568, + "step": 48085 + }, + { + "epoch": 7.8450244698205545, + "grad_norm": 0.23662333190441132, + "learning_rate": 0.0007616315291233531, + "loss": 0.0673, + "num_input_tokens_seen": 103878960, + "step": 48090 + }, + { + "epoch": 7.845840130505709, + "grad_norm": 0.0883672758936882, + "learning_rate": 0.0007615708690767637, + "loss": 0.0409, + "num_input_tokens_seen": 103888432, + "step": 48095 + }, + { + "epoch": 7.846655791190865, + "grad_norm": 0.057228852063417435, + "learning_rate": 0.0007615102037291089, + "loss": 0.1298, + "num_input_tokens_seen": 103899632, + "step": 48100 + }, + { + "epoch": 7.84747145187602, + "grad_norm": 0.013155256398022175, + "learning_rate": 0.000761449533081618, + "loss": 0.0247, + "num_input_tokens_seen": 103910128, + "step": 48105 + }, + { + "epoch": 7.848287112561175, + "grad_norm": 0.02147751860320568, + "learning_rate": 0.0007613888571355208, + "loss": 0.1243, + "num_input_tokens_seen": 103922160, + "step": 48110 + }, + { + "epoch": 7.849102773246329, + "grad_norm": 0.003036431735381484, + "learning_rate": 0.0007613281758920467, + "loss": 0.0094, + "num_input_tokens_seen": 103933232, + "step": 48115 + }, + { + "epoch": 7.849918433931484, + "grad_norm": 0.03485213965177536, + "learning_rate": 0.0007612674893524256, + "loss": 0.0377, + "num_input_tokens_seen": 103946160, + "step": 48120 + }, + { + "epoch": 7.850734094616639, + "grad_norm": 0.04104197025299072, + "learning_rate": 0.0007612067975178874, + "loss": 0.0486, + "num_input_tokens_seen": 103958000, + "step": 48125 + }, + { + "epoch": 7.851549755301795, + "grad_norm": 0.2663140892982483, + "learning_rate": 0.0007611461003896621, + "loss": 0.1522, + "num_input_tokens_seen": 103968912, + "step": 48130 + }, + { + "epoch": 7.85236541598695, + "grad_norm": 0.8700830340385437, + "learning_rate": 0.0007610853979689797, + "loss": 0.096, + "num_input_tokens_seen": 103978960, + "step": 48135 + }, + { + "epoch": 7.853181076672104, + "grad_norm": 0.28547972440719604, + "learning_rate": 0.0007610246902570706, + "loss": 0.1672, + "num_input_tokens_seen": 103989008, + "step": 48140 + }, + { + "epoch": 7.853996737357259, + "grad_norm": 0.20221035182476044, + "learning_rate": 0.000760963977255165, + "loss": 0.0699, + "num_input_tokens_seen": 104001712, + "step": 48145 + }, + { + "epoch": 7.854812398042414, + "grad_norm": 0.01691204123198986, + "learning_rate": 0.0007609032589644934, + "loss": 0.0162, + "num_input_tokens_seen": 104013648, + "step": 48150 + }, + { + "epoch": 7.85562805872757, + "grad_norm": 0.22434313595294952, + "learning_rate": 0.0007608425353862863, + "loss": 0.1619, + "num_input_tokens_seen": 104024496, + "step": 48155 + }, + { + "epoch": 7.856443719412725, + "grad_norm": 0.520187497138977, + "learning_rate": 0.000760781806521774, + "loss": 0.1211, + "num_input_tokens_seen": 104034256, + "step": 48160 + }, + { + "epoch": 7.857259380097879, + "grad_norm": 0.2626437842845917, + "learning_rate": 0.0007607210723721879, + "loss": 0.1844, + "num_input_tokens_seen": 104045456, + "step": 48165 + }, + { + "epoch": 7.858075040783034, + "grad_norm": 0.2683877646923065, + "learning_rate": 0.0007606603329387585, + "loss": 0.1541, + "num_input_tokens_seen": 104056368, + "step": 48170 + }, + { + "epoch": 7.858890701468189, + "grad_norm": 0.05445249378681183, + "learning_rate": 0.0007605995882227166, + "loss": 0.0701, + "num_input_tokens_seen": 104067280, + "step": 48175 + }, + { + "epoch": 7.859706362153344, + "grad_norm": 0.18587210774421692, + "learning_rate": 0.0007605388382252936, + "loss": 0.1222, + "num_input_tokens_seen": 104079120, + "step": 48180 + }, + { + "epoch": 7.8605220228384995, + "grad_norm": 0.1920372098684311, + "learning_rate": 0.0007604780829477205, + "loss": 0.1243, + "num_input_tokens_seen": 104090288, + "step": 48185 + }, + { + "epoch": 7.861337683523654, + "grad_norm": 0.007299771066755056, + "learning_rate": 0.0007604173223912285, + "loss": 0.0234, + "num_input_tokens_seen": 104099952, + "step": 48190 + }, + { + "epoch": 7.862153344208809, + "grad_norm": 0.0069532874040305614, + "learning_rate": 0.0007603565565570493, + "loss": 0.0189, + "num_input_tokens_seen": 104111344, + "step": 48195 + }, + { + "epoch": 7.862969004893964, + "grad_norm": 0.023784659802913666, + "learning_rate": 0.0007602957854464141, + "loss": 0.0314, + "num_input_tokens_seen": 104123056, + "step": 48200 + }, + { + "epoch": 7.863784665579119, + "grad_norm": 0.18003705143928528, + "learning_rate": 0.0007602350090605546, + "loss": 0.1692, + "num_input_tokens_seen": 104132208, + "step": 48205 + }, + { + "epoch": 7.864600326264274, + "grad_norm": 0.03283815085887909, + "learning_rate": 0.0007601742274007023, + "loss": 0.0305, + "num_input_tokens_seen": 104143376, + "step": 48210 + }, + { + "epoch": 7.865415986949429, + "grad_norm": 0.04933464154601097, + "learning_rate": 0.0007601134404680894, + "loss": 0.0597, + "num_input_tokens_seen": 104153584, + "step": 48215 + }, + { + "epoch": 7.866231647634584, + "grad_norm": 0.0056173368357121944, + "learning_rate": 0.0007600526482639477, + "loss": 0.0334, + "num_input_tokens_seen": 104164880, + "step": 48220 + }, + { + "epoch": 7.867047308319739, + "grad_norm": 0.5657948851585388, + "learning_rate": 0.0007599918507895092, + "loss": 0.0872, + "num_input_tokens_seen": 104175920, + "step": 48225 + }, + { + "epoch": 7.867862969004894, + "grad_norm": 0.06797628104686737, + "learning_rate": 0.000759931048046006, + "loss": 0.0218, + "num_input_tokens_seen": 104186640, + "step": 48230 + }, + { + "epoch": 7.868678629690049, + "grad_norm": 0.15060186386108398, + "learning_rate": 0.0007598702400346703, + "loss": 0.0289, + "num_input_tokens_seen": 104197872, + "step": 48235 + }, + { + "epoch": 7.869494290375204, + "grad_norm": 0.16141721606254578, + "learning_rate": 0.0007598094267567345, + "loss": 0.1611, + "num_input_tokens_seen": 104208816, + "step": 48240 + }, + { + "epoch": 7.870309951060359, + "grad_norm": 0.14451834559440613, + "learning_rate": 0.0007597486082134311, + "loss": 0.0256, + "num_input_tokens_seen": 104219952, + "step": 48245 + }, + { + "epoch": 7.871125611745514, + "grad_norm": 0.010201388970017433, + "learning_rate": 0.0007596877844059926, + "loss": 0.11, + "num_input_tokens_seen": 104230640, + "step": 48250 + }, + { + "epoch": 7.871941272430669, + "grad_norm": 0.22415506839752197, + "learning_rate": 0.0007596269553356518, + "loss": 0.1094, + "num_input_tokens_seen": 104242000, + "step": 48255 + }, + { + "epoch": 7.872756933115824, + "grad_norm": 0.02670017071068287, + "learning_rate": 0.0007595661210036414, + "loss": 0.0597, + "num_input_tokens_seen": 104252144, + "step": 48260 + }, + { + "epoch": 7.873572593800979, + "grad_norm": 0.20463339984416962, + "learning_rate": 0.0007595052814111942, + "loss": 0.031, + "num_input_tokens_seen": 104263984, + "step": 48265 + }, + { + "epoch": 7.874388254486134, + "grad_norm": 0.13383440673351288, + "learning_rate": 0.0007594444365595435, + "loss": 0.0718, + "num_input_tokens_seen": 104274320, + "step": 48270 + }, + { + "epoch": 7.875203915171289, + "grad_norm": 0.6233965754508972, + "learning_rate": 0.0007593835864499219, + "loss": 0.1259, + "num_input_tokens_seen": 104285776, + "step": 48275 + }, + { + "epoch": 7.876019575856444, + "grad_norm": 0.0012839973205700517, + "learning_rate": 0.0007593227310835629, + "loss": 0.0229, + "num_input_tokens_seen": 104297104, + "step": 48280 + }, + { + "epoch": 7.876835236541599, + "grad_norm": 0.5068719983100891, + "learning_rate": 0.0007592618704616998, + "loss": 0.1091, + "num_input_tokens_seen": 104307600, + "step": 48285 + }, + { + "epoch": 7.877650897226753, + "grad_norm": 0.1758025586605072, + "learning_rate": 0.0007592010045855662, + "loss": 0.14, + "num_input_tokens_seen": 104319664, + "step": 48290 + }, + { + "epoch": 7.878466557911908, + "grad_norm": 0.02141629531979561, + "learning_rate": 0.0007591401334563952, + "loss": 0.1144, + "num_input_tokens_seen": 104331184, + "step": 48295 + }, + { + "epoch": 7.879282218597064, + "grad_norm": 0.14108148217201233, + "learning_rate": 0.0007590792570754207, + "loss": 0.0349, + "num_input_tokens_seen": 104342352, + "step": 48300 + }, + { + "epoch": 7.880097879282219, + "grad_norm": 0.027795322239398956, + "learning_rate": 0.0007590183754438764, + "loss": 0.078, + "num_input_tokens_seen": 104354704, + "step": 48305 + }, + { + "epoch": 7.8809135399673735, + "grad_norm": 0.16794262826442719, + "learning_rate": 0.0007589574885629961, + "loss": 0.093, + "num_input_tokens_seen": 104365872, + "step": 48310 + }, + { + "epoch": 7.881729200652528, + "grad_norm": 0.30435100197792053, + "learning_rate": 0.0007588965964340137, + "loss": 0.0865, + "num_input_tokens_seen": 104377008, + "step": 48315 + }, + { + "epoch": 7.882544861337683, + "grad_norm": 0.007787167094647884, + "learning_rate": 0.0007588356990581635, + "loss": 0.0255, + "num_input_tokens_seen": 104388016, + "step": 48320 + }, + { + "epoch": 7.883360522022839, + "grad_norm": 0.19416892528533936, + "learning_rate": 0.0007587747964366796, + "loss": 0.0961, + "num_input_tokens_seen": 104397968, + "step": 48325 + }, + { + "epoch": 7.884176182707994, + "grad_norm": 0.012199878692626953, + "learning_rate": 0.0007587138885707959, + "loss": 0.0248, + "num_input_tokens_seen": 104408944, + "step": 48330 + }, + { + "epoch": 7.8849918433931485, + "grad_norm": 0.6253767013549805, + "learning_rate": 0.000758652975461747, + "loss": 0.168, + "num_input_tokens_seen": 104419440, + "step": 48335 + }, + { + "epoch": 7.885807504078303, + "grad_norm": 0.003849747823551297, + "learning_rate": 0.0007585920571107677, + "loss": 0.0135, + "num_input_tokens_seen": 104428560, + "step": 48340 + }, + { + "epoch": 7.886623164763458, + "grad_norm": 0.08811540901660919, + "learning_rate": 0.0007585311335190923, + "loss": 0.1504, + "num_input_tokens_seen": 104438256, + "step": 48345 + }, + { + "epoch": 7.887438825448614, + "grad_norm": 0.020213250070810318, + "learning_rate": 0.0007584702046879554, + "loss": 0.0259, + "num_input_tokens_seen": 104448624, + "step": 48350 + }, + { + "epoch": 7.888254486133769, + "grad_norm": 0.41347524523735046, + "learning_rate": 0.0007584092706185919, + "loss": 0.228, + "num_input_tokens_seen": 104460528, + "step": 48355 + }, + { + "epoch": 7.8890701468189235, + "grad_norm": 0.0025836778804659843, + "learning_rate": 0.0007583483313122368, + "loss": 0.0134, + "num_input_tokens_seen": 104472016, + "step": 48360 + }, + { + "epoch": 7.889885807504078, + "grad_norm": 0.012997115030884743, + "learning_rate": 0.000758287386770125, + "loss": 0.0448, + "num_input_tokens_seen": 104482288, + "step": 48365 + }, + { + "epoch": 7.890701468189233, + "grad_norm": 0.041280992329120636, + "learning_rate": 0.0007582264369934915, + "loss": 0.0502, + "num_input_tokens_seen": 104492304, + "step": 48370 + }, + { + "epoch": 7.891517128874388, + "grad_norm": 0.0031451070681214333, + "learning_rate": 0.0007581654819835717, + "loss": 0.1548, + "num_input_tokens_seen": 104503472, + "step": 48375 + }, + { + "epoch": 7.892332789559543, + "grad_norm": 0.017510058358311653, + "learning_rate": 0.0007581045217416011, + "loss": 0.1972, + "num_input_tokens_seen": 104513872, + "step": 48380 + }, + { + "epoch": 7.8931484502446985, + "grad_norm": 0.00507473386824131, + "learning_rate": 0.0007580435562688148, + "loss": 0.1909, + "num_input_tokens_seen": 104524336, + "step": 48385 + }, + { + "epoch": 7.893964110929853, + "grad_norm": 0.0073803444392979145, + "learning_rate": 0.0007579825855664486, + "loss": 0.1623, + "num_input_tokens_seen": 104535600, + "step": 48390 + }, + { + "epoch": 7.894779771615008, + "grad_norm": 0.06939529627561569, + "learning_rate": 0.0007579216096357378, + "loss": 0.0451, + "num_input_tokens_seen": 104544688, + "step": 48395 + }, + { + "epoch": 7.895595432300163, + "grad_norm": 0.2730625569820404, + "learning_rate": 0.0007578606284779185, + "loss": 0.2875, + "num_input_tokens_seen": 104555952, + "step": 48400 + }, + { + "epoch": 7.896411092985318, + "grad_norm": 0.054563023149967194, + "learning_rate": 0.0007577996420942266, + "loss": 0.0795, + "num_input_tokens_seen": 104566928, + "step": 48405 + }, + { + "epoch": 7.897226753670473, + "grad_norm": 0.2631271183490753, + "learning_rate": 0.0007577386504858978, + "loss": 0.116, + "num_input_tokens_seen": 104578544, + "step": 48410 + }, + { + "epoch": 7.898042414355628, + "grad_norm": 0.014852375723421574, + "learning_rate": 0.0007576776536541682, + "loss": 0.0832, + "num_input_tokens_seen": 104588496, + "step": 48415 + }, + { + "epoch": 7.898858075040783, + "grad_norm": 0.06987016648054123, + "learning_rate": 0.0007576166516002741, + "loss": 0.0754, + "num_input_tokens_seen": 104599760, + "step": 48420 + }, + { + "epoch": 7.899673735725938, + "grad_norm": 0.03241053223609924, + "learning_rate": 0.0007575556443254518, + "loss": 0.0856, + "num_input_tokens_seen": 104610192, + "step": 48425 + }, + { + "epoch": 7.900489396411093, + "grad_norm": 0.8125465512275696, + "learning_rate": 0.0007574946318309376, + "loss": 0.1169, + "num_input_tokens_seen": 104621072, + "step": 48430 + }, + { + "epoch": 7.901305057096248, + "grad_norm": 0.015340335667133331, + "learning_rate": 0.000757433614117968, + "loss": 0.0825, + "num_input_tokens_seen": 104632080, + "step": 48435 + }, + { + "epoch": 7.902120717781403, + "grad_norm": 0.04143735393881798, + "learning_rate": 0.0007573725911877797, + "loss": 0.1525, + "num_input_tokens_seen": 104643792, + "step": 48440 + }, + { + "epoch": 7.902936378466558, + "grad_norm": 0.01662031188607216, + "learning_rate": 0.0007573115630416092, + "loss": 0.0695, + "num_input_tokens_seen": 104654960, + "step": 48445 + }, + { + "epoch": 7.903752039151713, + "grad_norm": 0.23152291774749756, + "learning_rate": 0.0007572505296806935, + "loss": 0.0432, + "num_input_tokens_seen": 104666320, + "step": 48450 + }, + { + "epoch": 7.904567699836868, + "grad_norm": 0.20455844700336456, + "learning_rate": 0.0007571894911062696, + "loss": 0.0884, + "num_input_tokens_seen": 104675600, + "step": 48455 + }, + { + "epoch": 7.9053833605220225, + "grad_norm": 0.2332301288843155, + "learning_rate": 0.0007571284473195743, + "loss": 0.1064, + "num_input_tokens_seen": 104686928, + "step": 48460 + }, + { + "epoch": 7.906199021207177, + "grad_norm": 0.15824279189109802, + "learning_rate": 0.0007570673983218448, + "loss": 0.053, + "num_input_tokens_seen": 104698512, + "step": 48465 + }, + { + "epoch": 7.907014681892333, + "grad_norm": 0.031252142041921616, + "learning_rate": 0.0007570063441143185, + "loss": 0.0574, + "num_input_tokens_seen": 104709552, + "step": 48470 + }, + { + "epoch": 7.907830342577488, + "grad_norm": 0.01947942189872265, + "learning_rate": 0.0007569452846982325, + "loss": 0.0595, + "num_input_tokens_seen": 104719184, + "step": 48475 + }, + { + "epoch": 7.908646003262643, + "grad_norm": 0.296113520860672, + "learning_rate": 0.0007568842200748243, + "loss": 0.0816, + "num_input_tokens_seen": 104730160, + "step": 48480 + }, + { + "epoch": 7.9094616639477975, + "grad_norm": 0.636048436164856, + "learning_rate": 0.0007568231502453317, + "loss": 0.1006, + "num_input_tokens_seen": 104740816, + "step": 48485 + }, + { + "epoch": 7.910277324632952, + "grad_norm": 0.07422137260437012, + "learning_rate": 0.000756762075210992, + "loss": 0.144, + "num_input_tokens_seen": 104750672, + "step": 48490 + }, + { + "epoch": 7.911092985318108, + "grad_norm": 0.016525914892554283, + "learning_rate": 0.0007567009949730431, + "loss": 0.0165, + "num_input_tokens_seen": 104761584, + "step": 48495 + }, + { + "epoch": 7.911908646003263, + "grad_norm": 0.011553775519132614, + "learning_rate": 0.000756639909532723, + "loss": 0.0264, + "num_input_tokens_seen": 104771376, + "step": 48500 + }, + { + "epoch": 7.912724306688418, + "grad_norm": 0.047472815960645676, + "learning_rate": 0.0007565788188912694, + "loss": 0.0956, + "num_input_tokens_seen": 104783440, + "step": 48505 + }, + { + "epoch": 7.9135399673735725, + "grad_norm": 0.006028347183018923, + "learning_rate": 0.0007565177230499206, + "loss": 0.0177, + "num_input_tokens_seen": 104794256, + "step": 48510 + }, + { + "epoch": 7.914355628058727, + "grad_norm": 0.009613900445401669, + "learning_rate": 0.0007564566220099147, + "loss": 0.0571, + "num_input_tokens_seen": 104804944, + "step": 48515 + }, + { + "epoch": 7.915171288743883, + "grad_norm": 0.35099858045578003, + "learning_rate": 0.00075639551577249, + "loss": 0.0724, + "num_input_tokens_seen": 104816656, + "step": 48520 + }, + { + "epoch": 7.915986949429038, + "grad_norm": 0.02365029975771904, + "learning_rate": 0.0007563344043388851, + "loss": 0.0796, + "num_input_tokens_seen": 104828016, + "step": 48525 + }, + { + "epoch": 7.916802610114193, + "grad_norm": 0.013142816722393036, + "learning_rate": 0.0007562732877103382, + "loss": 0.1559, + "num_input_tokens_seen": 104839696, + "step": 48530 + }, + { + "epoch": 7.917618270799347, + "grad_norm": 0.00743667408823967, + "learning_rate": 0.000756212165888088, + "loss": 0.05, + "num_input_tokens_seen": 104850576, + "step": 48535 + }, + { + "epoch": 7.918433931484502, + "grad_norm": 0.012738938443362713, + "learning_rate": 0.0007561510388733732, + "loss": 0.1381, + "num_input_tokens_seen": 104861008, + "step": 48540 + }, + { + "epoch": 7.919249592169657, + "grad_norm": 0.027310442179441452, + "learning_rate": 0.0007560899066674327, + "loss": 0.1044, + "num_input_tokens_seen": 104872144, + "step": 48545 + }, + { + "epoch": 7.920065252854813, + "grad_norm": 0.2208627462387085, + "learning_rate": 0.0007560287692715053, + "loss": 0.1653, + "num_input_tokens_seen": 104882608, + "step": 48550 + }, + { + "epoch": 7.920880913539968, + "grad_norm": 0.11449895054101944, + "learning_rate": 0.0007559676266868302, + "loss": 0.066, + "num_input_tokens_seen": 104892688, + "step": 48555 + }, + { + "epoch": 7.921696574225122, + "grad_norm": 0.006951616611331701, + "learning_rate": 0.0007559064789146464, + "loss": 0.0307, + "num_input_tokens_seen": 104902640, + "step": 48560 + }, + { + "epoch": 7.922512234910277, + "grad_norm": 0.36576905846595764, + "learning_rate": 0.000755845325956193, + "loss": 0.2786, + "num_input_tokens_seen": 104913168, + "step": 48565 + }, + { + "epoch": 7.923327895595432, + "grad_norm": 0.19045992195606232, + "learning_rate": 0.0007557841678127097, + "loss": 0.2578, + "num_input_tokens_seen": 104922896, + "step": 48570 + }, + { + "epoch": 7.924143556280587, + "grad_norm": 0.009913114830851555, + "learning_rate": 0.0007557230044854357, + "loss": 0.0764, + "num_input_tokens_seen": 104934672, + "step": 48575 + }, + { + "epoch": 7.924959216965743, + "grad_norm": 0.17396552860736847, + "learning_rate": 0.0007556618359756107, + "loss": 0.0408, + "num_input_tokens_seen": 104947120, + "step": 48580 + }, + { + "epoch": 7.925774877650897, + "grad_norm": 0.07783018052577972, + "learning_rate": 0.0007556006622844742, + "loss": 0.102, + "num_input_tokens_seen": 104957616, + "step": 48585 + }, + { + "epoch": 7.926590538336052, + "grad_norm": 0.04610704258084297, + "learning_rate": 0.000755539483413266, + "loss": 0.0192, + "num_input_tokens_seen": 104968112, + "step": 48590 + }, + { + "epoch": 7.927406199021207, + "grad_norm": 0.18525907397270203, + "learning_rate": 0.0007554782993632259, + "loss": 0.0888, + "num_input_tokens_seen": 104978672, + "step": 48595 + }, + { + "epoch": 7.928221859706362, + "grad_norm": 0.21388539671897888, + "learning_rate": 0.0007554171101355941, + "loss": 0.1034, + "num_input_tokens_seen": 104989360, + "step": 48600 + }, + { + "epoch": 7.9290375203915175, + "grad_norm": 0.026249831542372704, + "learning_rate": 0.0007553559157316105, + "loss": 0.1253, + "num_input_tokens_seen": 105000080, + "step": 48605 + }, + { + "epoch": 7.929853181076672, + "grad_norm": 0.013661502860486507, + "learning_rate": 0.0007552947161525153, + "loss": 0.0453, + "num_input_tokens_seen": 105010768, + "step": 48610 + }, + { + "epoch": 7.930668841761827, + "grad_norm": 0.7273470163345337, + "learning_rate": 0.0007552335113995489, + "loss": 0.0613, + "num_input_tokens_seen": 105022128, + "step": 48615 + }, + { + "epoch": 7.931484502446982, + "grad_norm": 0.1308993399143219, + "learning_rate": 0.0007551723014739515, + "loss": 0.0295, + "num_input_tokens_seen": 105033168, + "step": 48620 + }, + { + "epoch": 7.932300163132137, + "grad_norm": 0.0197969488799572, + "learning_rate": 0.0007551110863769638, + "loss": 0.0514, + "num_input_tokens_seen": 105043728, + "step": 48625 + }, + { + "epoch": 7.933115823817292, + "grad_norm": 0.1168442964553833, + "learning_rate": 0.0007550498661098263, + "loss": 0.0743, + "num_input_tokens_seen": 105053008, + "step": 48630 + }, + { + "epoch": 7.933931484502447, + "grad_norm": 0.063286192715168, + "learning_rate": 0.0007549886406737796, + "loss": 0.0991, + "num_input_tokens_seen": 105063312, + "step": 48635 + }, + { + "epoch": 7.934747145187602, + "grad_norm": 0.01165692787617445, + "learning_rate": 0.0007549274100700647, + "loss": 0.2042, + "num_input_tokens_seen": 105073712, + "step": 48640 + }, + { + "epoch": 7.935562805872757, + "grad_norm": 0.2161632776260376, + "learning_rate": 0.0007548661742999225, + "loss": 0.1167, + "num_input_tokens_seen": 105084560, + "step": 48645 + }, + { + "epoch": 7.936378466557912, + "grad_norm": 0.03843540698289871, + "learning_rate": 0.0007548049333645939, + "loss": 0.0233, + "num_input_tokens_seen": 105095984, + "step": 48650 + }, + { + "epoch": 7.937194127243067, + "grad_norm": 0.010381447151303291, + "learning_rate": 0.00075474368726532, + "loss": 0.0206, + "num_input_tokens_seen": 105107024, + "step": 48655 + }, + { + "epoch": 7.938009787928221, + "grad_norm": 0.16924302279949188, + "learning_rate": 0.0007546824360033421, + "loss": 0.1278, + "num_input_tokens_seen": 105118448, + "step": 48660 + }, + { + "epoch": 7.938825448613377, + "grad_norm": 0.15478937327861786, + "learning_rate": 0.0007546211795799016, + "loss": 0.1476, + "num_input_tokens_seen": 105128592, + "step": 48665 + }, + { + "epoch": 7.939641109298532, + "grad_norm": 0.015761559829115868, + "learning_rate": 0.0007545599179962399, + "loss": 0.0412, + "num_input_tokens_seen": 105139888, + "step": 48670 + }, + { + "epoch": 7.940456769983687, + "grad_norm": 0.06137595698237419, + "learning_rate": 0.0007544986512535985, + "loss": 0.2636, + "num_input_tokens_seen": 105150192, + "step": 48675 + }, + { + "epoch": 7.941272430668842, + "grad_norm": 0.15942196547985077, + "learning_rate": 0.0007544373793532191, + "loss": 0.187, + "num_input_tokens_seen": 105160912, + "step": 48680 + }, + { + "epoch": 7.942088091353996, + "grad_norm": 0.036675989627838135, + "learning_rate": 0.0007543761022963436, + "loss": 0.0901, + "num_input_tokens_seen": 105171792, + "step": 48685 + }, + { + "epoch": 7.942903752039152, + "grad_norm": 0.020440716296434402, + "learning_rate": 0.0007543148200842134, + "loss": 0.0202, + "num_input_tokens_seen": 105183408, + "step": 48690 + }, + { + "epoch": 7.943719412724307, + "grad_norm": 0.09116707742214203, + "learning_rate": 0.0007542535327180708, + "loss": 0.0878, + "num_input_tokens_seen": 105194736, + "step": 48695 + }, + { + "epoch": 7.944535073409462, + "grad_norm": 0.0569356270134449, + "learning_rate": 0.0007541922401991579, + "loss": 0.1086, + "num_input_tokens_seen": 105205008, + "step": 48700 + }, + { + "epoch": 7.945350734094617, + "grad_norm": 0.008786008693277836, + "learning_rate": 0.0007541309425287168, + "loss": 0.0645, + "num_input_tokens_seen": 105216592, + "step": 48705 + }, + { + "epoch": 7.946166394779771, + "grad_norm": 0.1885114312171936, + "learning_rate": 0.0007540696397079898, + "loss": 0.1041, + "num_input_tokens_seen": 105227504, + "step": 48710 + }, + { + "epoch": 7.946982055464927, + "grad_norm": 0.00608594436198473, + "learning_rate": 0.0007540083317382192, + "loss": 0.0876, + "num_input_tokens_seen": 105237904, + "step": 48715 + }, + { + "epoch": 7.947797716150082, + "grad_norm": 0.08031502366065979, + "learning_rate": 0.0007539470186206474, + "loss": 0.0259, + "num_input_tokens_seen": 105247504, + "step": 48720 + }, + { + "epoch": 7.948613376835237, + "grad_norm": 0.026645051315426826, + "learning_rate": 0.0007538857003565174, + "loss": 0.0697, + "num_input_tokens_seen": 105256400, + "step": 48725 + }, + { + "epoch": 7.9494290375203915, + "grad_norm": 0.015550825744867325, + "learning_rate": 0.0007538243769470714, + "loss": 0.0345, + "num_input_tokens_seen": 105267440, + "step": 48730 + }, + { + "epoch": 7.950244698205546, + "grad_norm": 0.03166070580482483, + "learning_rate": 0.0007537630483935524, + "loss": 0.0417, + "num_input_tokens_seen": 105278928, + "step": 48735 + }, + { + "epoch": 7.951060358890701, + "grad_norm": 0.2161301076412201, + "learning_rate": 0.0007537017146972033, + "loss": 0.1683, + "num_input_tokens_seen": 105290320, + "step": 48740 + }, + { + "epoch": 7.951876019575856, + "grad_norm": 0.04890443757176399, + "learning_rate": 0.0007536403758592672, + "loss": 0.0214, + "num_input_tokens_seen": 105302320, + "step": 48745 + }, + { + "epoch": 7.952691680261012, + "grad_norm": 0.0032995252404361963, + "learning_rate": 0.000753579031880987, + "loss": 0.0257, + "num_input_tokens_seen": 105312336, + "step": 48750 + }, + { + "epoch": 7.9535073409461665, + "grad_norm": 0.04470792040228844, + "learning_rate": 0.0007535176827636061, + "loss": 0.0997, + "num_input_tokens_seen": 105323216, + "step": 48755 + }, + { + "epoch": 7.954323001631321, + "grad_norm": 0.03811972588300705, + "learning_rate": 0.0007534563285083678, + "loss": 0.0638, + "num_input_tokens_seen": 105333680, + "step": 48760 + }, + { + "epoch": 7.955138662316476, + "grad_norm": 0.004735193680971861, + "learning_rate": 0.0007533949691165152, + "loss": 0.1082, + "num_input_tokens_seen": 105344624, + "step": 48765 + }, + { + "epoch": 7.955954323001631, + "grad_norm": 0.02903125435113907, + "learning_rate": 0.0007533336045892925, + "loss": 0.1382, + "num_input_tokens_seen": 105355568, + "step": 48770 + }, + { + "epoch": 7.956769983686787, + "grad_norm": 0.06740237772464752, + "learning_rate": 0.0007532722349279426, + "loss": 0.1862, + "num_input_tokens_seen": 105365328, + "step": 48775 + }, + { + "epoch": 7.9575856443719415, + "grad_norm": 0.3388029932975769, + "learning_rate": 0.0007532108601337097, + "loss": 0.1106, + "num_input_tokens_seen": 105376144, + "step": 48780 + }, + { + "epoch": 7.958401305057096, + "grad_norm": 0.09516182541847229, + "learning_rate": 0.0007531494802078376, + "loss": 0.093, + "num_input_tokens_seen": 105386928, + "step": 48785 + }, + { + "epoch": 7.959216965742251, + "grad_norm": 0.023860549554228783, + "learning_rate": 0.00075308809515157, + "loss": 0.062, + "num_input_tokens_seen": 105398352, + "step": 48790 + }, + { + "epoch": 7.960032626427406, + "grad_norm": 0.016818996518850327, + "learning_rate": 0.0007530267049661511, + "loss": 0.0407, + "num_input_tokens_seen": 105408528, + "step": 48795 + }, + { + "epoch": 7.960848287112562, + "grad_norm": 0.01870405301451683, + "learning_rate": 0.000752965309652825, + "loss": 0.0232, + "num_input_tokens_seen": 105418224, + "step": 48800 + }, + { + "epoch": 7.9616639477977165, + "grad_norm": 0.02412063628435135, + "learning_rate": 0.0007529039092128361, + "loss": 0.04, + "num_input_tokens_seen": 105429072, + "step": 48805 + }, + { + "epoch": 7.962479608482871, + "grad_norm": 0.24175409972667694, + "learning_rate": 0.0007528425036474287, + "loss": 0.1169, + "num_input_tokens_seen": 105440240, + "step": 48810 + }, + { + "epoch": 7.963295269168026, + "grad_norm": 0.006044411100447178, + "learning_rate": 0.000752781092957847, + "loss": 0.0621, + "num_input_tokens_seen": 105450352, + "step": 48815 + }, + { + "epoch": 7.964110929853181, + "grad_norm": 0.010947907343506813, + "learning_rate": 0.000752719677145336, + "loss": 0.167, + "num_input_tokens_seen": 105461072, + "step": 48820 + }, + { + "epoch": 7.964926590538336, + "grad_norm": 0.04575859382748604, + "learning_rate": 0.0007526582562111399, + "loss": 0.0721, + "num_input_tokens_seen": 105471440, + "step": 48825 + }, + { + "epoch": 7.9657422512234906, + "grad_norm": 0.005048078019171953, + "learning_rate": 0.0007525968301565038, + "loss": 0.0306, + "num_input_tokens_seen": 105480816, + "step": 48830 + }, + { + "epoch": 7.966557911908646, + "grad_norm": 0.31248384714126587, + "learning_rate": 0.0007525353989826726, + "loss": 0.2006, + "num_input_tokens_seen": 105490800, + "step": 48835 + }, + { + "epoch": 7.967373572593801, + "grad_norm": 0.012551373802125454, + "learning_rate": 0.000752473962690891, + "loss": 0.0381, + "num_input_tokens_seen": 105501104, + "step": 48840 + }, + { + "epoch": 7.968189233278956, + "grad_norm": 0.11429378390312195, + "learning_rate": 0.0007524125212824044, + "loss": 0.0702, + "num_input_tokens_seen": 105510128, + "step": 48845 + }, + { + "epoch": 7.969004893964111, + "grad_norm": 0.17006027698516846, + "learning_rate": 0.0007523510747584578, + "loss": 0.1937, + "num_input_tokens_seen": 105520976, + "step": 48850 + }, + { + "epoch": 7.9698205546492655, + "grad_norm": 0.07592868059873581, + "learning_rate": 0.0007522896231202967, + "loss": 0.0343, + "num_input_tokens_seen": 105532400, + "step": 48855 + }, + { + "epoch": 7.970636215334421, + "grad_norm": 0.061747901141643524, + "learning_rate": 0.0007522281663691661, + "loss": 0.0891, + "num_input_tokens_seen": 105543184, + "step": 48860 + }, + { + "epoch": 7.971451876019576, + "grad_norm": 0.061240021139383316, + "learning_rate": 0.0007521667045063119, + "loss": 0.0517, + "num_input_tokens_seen": 105553680, + "step": 48865 + }, + { + "epoch": 7.972267536704731, + "grad_norm": 0.15075771510601044, + "learning_rate": 0.0007521052375329793, + "loss": 0.0457, + "num_input_tokens_seen": 105563472, + "step": 48870 + }, + { + "epoch": 7.973083197389886, + "grad_norm": 0.2989932596683502, + "learning_rate": 0.0007520437654504144, + "loss": 0.1824, + "num_input_tokens_seen": 105574224, + "step": 48875 + }, + { + "epoch": 7.9738988580750405, + "grad_norm": 0.05400446057319641, + "learning_rate": 0.0007519822882598629, + "loss": 0.0226, + "num_input_tokens_seen": 105585712, + "step": 48880 + }, + { + "epoch": 7.974714518760196, + "grad_norm": 0.08582861721515656, + "learning_rate": 0.0007519208059625707, + "loss": 0.1254, + "num_input_tokens_seen": 105596304, + "step": 48885 + }, + { + "epoch": 7.975530179445351, + "grad_norm": 0.1772875189781189, + "learning_rate": 0.0007518593185597837, + "loss": 0.1296, + "num_input_tokens_seen": 105607600, + "step": 48890 + }, + { + "epoch": 7.976345840130506, + "grad_norm": 0.039883363991975784, + "learning_rate": 0.000751797826052748, + "loss": 0.046, + "num_input_tokens_seen": 105618192, + "step": 48895 + }, + { + "epoch": 7.977161500815661, + "grad_norm": 0.03105618990957737, + "learning_rate": 0.0007517363284427101, + "loss": 0.0825, + "num_input_tokens_seen": 105629392, + "step": 48900 + }, + { + "epoch": 7.9779771615008155, + "grad_norm": 0.16668418049812317, + "learning_rate": 0.0007516748257309162, + "loss": 0.0844, + "num_input_tokens_seen": 105639056, + "step": 48905 + }, + { + "epoch": 7.97879282218597, + "grad_norm": 0.039331886917352676, + "learning_rate": 0.0007516133179186125, + "loss": 0.129, + "num_input_tokens_seen": 105651024, + "step": 48910 + }, + { + "epoch": 7.979608482871125, + "grad_norm": 0.34375035762786865, + "learning_rate": 0.0007515518050070458, + "loss": 0.1145, + "num_input_tokens_seen": 105662128, + "step": 48915 + }, + { + "epoch": 7.980424143556281, + "grad_norm": 0.014578046277165413, + "learning_rate": 0.0007514902869974627, + "loss": 0.0093, + "num_input_tokens_seen": 105673552, + "step": 48920 + }, + { + "epoch": 7.981239804241436, + "grad_norm": 0.005937238223850727, + "learning_rate": 0.0007514287638911099, + "loss": 0.0427, + "num_input_tokens_seen": 105684464, + "step": 48925 + }, + { + "epoch": 7.9820554649265905, + "grad_norm": 0.024509524926543236, + "learning_rate": 0.0007513672356892342, + "loss": 0.0338, + "num_input_tokens_seen": 105696272, + "step": 48930 + }, + { + "epoch": 7.982871125611745, + "grad_norm": 0.174819678068161, + "learning_rate": 0.0007513057023930825, + "loss": 0.1251, + "num_input_tokens_seen": 105708208, + "step": 48935 + }, + { + "epoch": 7.9836867862969, + "grad_norm": 0.18815754354000092, + "learning_rate": 0.000751244164003902, + "loss": 0.1564, + "num_input_tokens_seen": 105718928, + "step": 48940 + }, + { + "epoch": 7.984502446982056, + "grad_norm": 0.17951254546642303, + "learning_rate": 0.00075118262052294, + "loss": 0.1253, + "num_input_tokens_seen": 105729296, + "step": 48945 + }, + { + "epoch": 7.985318107667211, + "grad_norm": 0.012976273894309998, + "learning_rate": 0.0007511210719514432, + "loss": 0.0199, + "num_input_tokens_seen": 105739664, + "step": 48950 + }, + { + "epoch": 7.986133768352365, + "grad_norm": 0.009064053185284138, + "learning_rate": 0.0007510595182906595, + "loss": 0.025, + "num_input_tokens_seen": 105749872, + "step": 48955 + }, + { + "epoch": 7.98694942903752, + "grad_norm": 0.16349948942661285, + "learning_rate": 0.0007509979595418362, + "loss": 0.0693, + "num_input_tokens_seen": 105761008, + "step": 48960 + }, + { + "epoch": 7.987765089722675, + "grad_norm": 0.149709090590477, + "learning_rate": 0.0007509363957062207, + "loss": 0.076, + "num_input_tokens_seen": 105771824, + "step": 48965 + }, + { + "epoch": 7.988580750407831, + "grad_norm": 0.3649735152721405, + "learning_rate": 0.0007508748267850609, + "loss": 0.4246, + "num_input_tokens_seen": 105783056, + "step": 48970 + }, + { + "epoch": 7.989396411092986, + "grad_norm": 0.031553879380226135, + "learning_rate": 0.0007508132527796043, + "loss": 0.1724, + "num_input_tokens_seen": 105794192, + "step": 48975 + }, + { + "epoch": 7.99021207177814, + "grad_norm": 0.10740819573402405, + "learning_rate": 0.0007507516736910992, + "loss": 0.2098, + "num_input_tokens_seen": 105804528, + "step": 48980 + }, + { + "epoch": 7.991027732463295, + "grad_norm": 0.011156396940350533, + "learning_rate": 0.0007506900895207932, + "loss": 0.0543, + "num_input_tokens_seen": 105815824, + "step": 48985 + }, + { + "epoch": 7.99184339314845, + "grad_norm": 0.008487922139465809, + "learning_rate": 0.0007506285002699346, + "loss": 0.009, + "num_input_tokens_seen": 105827440, + "step": 48990 + }, + { + "epoch": 7.992659053833605, + "grad_norm": 0.12116503715515137, + "learning_rate": 0.0007505669059397715, + "loss": 0.0424, + "num_input_tokens_seen": 105838288, + "step": 48995 + }, + { + "epoch": 7.993474714518761, + "grad_norm": 0.015823280438780785, + "learning_rate": 0.0007505053065315521, + "loss": 0.1182, + "num_input_tokens_seen": 105849584, + "step": 49000 + }, + { + "epoch": 7.994290375203915, + "grad_norm": 0.0266315545886755, + "learning_rate": 0.0007504437020465248, + "loss": 0.0419, + "num_input_tokens_seen": 105861232, + "step": 49005 + }, + { + "epoch": 7.99510603588907, + "grad_norm": 0.005523244384676218, + "learning_rate": 0.0007503820924859382, + "loss": 0.0389, + "num_input_tokens_seen": 105871600, + "step": 49010 + }, + { + "epoch": 7.995921696574225, + "grad_norm": 0.43169787526130676, + "learning_rate": 0.000750320477851041, + "loss": 0.1195, + "num_input_tokens_seen": 105882992, + "step": 49015 + }, + { + "epoch": 7.99673735725938, + "grad_norm": 0.015888521447777748, + "learning_rate": 0.0007502588581430817, + "loss": 0.0675, + "num_input_tokens_seen": 105893360, + "step": 49020 + }, + { + "epoch": 7.997553017944535, + "grad_norm": 0.12559035420417786, + "learning_rate": 0.0007501972333633091, + "loss": 0.0527, + "num_input_tokens_seen": 105903728, + "step": 49025 + }, + { + "epoch": 7.99836867862969, + "grad_norm": 0.19304737448692322, + "learning_rate": 0.0007501356035129723, + "loss": 0.1411, + "num_input_tokens_seen": 105915600, + "step": 49030 + }, + { + "epoch": 7.999184339314845, + "grad_norm": 0.105661541223526, + "learning_rate": 0.0007500739685933201, + "loss": 0.0734, + "num_input_tokens_seen": 105925808, + "step": 49035 + }, + { + "epoch": 8.0, + "grad_norm": 0.05994661524891853, + "learning_rate": 0.0007500123286056018, + "loss": 0.0358, + "num_input_tokens_seen": 105934480, + "step": 49040 + }, + { + "epoch": 8.0, + "eval_loss": 0.12728162109851837, + "eval_runtime": 103.6319, + "eval_samples_per_second": 26.295, + "eval_steps_per_second": 6.581, + "num_input_tokens_seen": 105934480, + "step": 49040 + }, + { + "epoch": 8.000815660685156, + "grad_norm": 0.014739147387444973, + "learning_rate": 0.0007499506835510663, + "loss": 0.032, + "num_input_tokens_seen": 105946192, + "step": 49045 + }, + { + "epoch": 8.00163132137031, + "grad_norm": 0.07827870547771454, + "learning_rate": 0.0007498890334309633, + "loss": 0.0517, + "num_input_tokens_seen": 105956304, + "step": 49050 + }, + { + "epoch": 8.002446982055465, + "grad_norm": 0.0049188388511538506, + "learning_rate": 0.000749827378246542, + "loss": 0.0262, + "num_input_tokens_seen": 105968240, + "step": 49055 + }, + { + "epoch": 8.00326264274062, + "grad_norm": 0.1784546822309494, + "learning_rate": 0.0007497657179990518, + "loss": 0.0955, + "num_input_tokens_seen": 105980080, + "step": 49060 + }, + { + "epoch": 8.004078303425775, + "grad_norm": 0.037165481597185135, + "learning_rate": 0.0007497040526897426, + "loss": 0.1378, + "num_input_tokens_seen": 105990224, + "step": 49065 + }, + { + "epoch": 8.00489396411093, + "grad_norm": 0.3894607424736023, + "learning_rate": 0.0007496423823198639, + "loss": 0.0841, + "num_input_tokens_seen": 106000944, + "step": 49070 + }, + { + "epoch": 8.005709624796085, + "grad_norm": 0.008682585321366787, + "learning_rate": 0.0007495807068906657, + "loss": 0.0254, + "num_input_tokens_seen": 106012368, + "step": 49075 + }, + { + "epoch": 8.00652528548124, + "grad_norm": 0.033180102705955505, + "learning_rate": 0.0007495190264033978, + "loss": 0.1062, + "num_input_tokens_seen": 106023696, + "step": 49080 + }, + { + "epoch": 8.007340946166394, + "grad_norm": 0.00399895990267396, + "learning_rate": 0.0007494573408593103, + "loss": 0.1288, + "num_input_tokens_seen": 106033840, + "step": 49085 + }, + { + "epoch": 8.00815660685155, + "grad_norm": 0.07352360337972641, + "learning_rate": 0.0007493956502596533, + "loss": 0.034, + "num_input_tokens_seen": 106044144, + "step": 49090 + }, + { + "epoch": 8.008972267536704, + "grad_norm": 0.01003354787826538, + "learning_rate": 0.0007493339546056772, + "loss": 0.0388, + "num_input_tokens_seen": 106054800, + "step": 49095 + }, + { + "epoch": 8.00978792822186, + "grad_norm": 0.03879820555448532, + "learning_rate": 0.0007492722538986321, + "loss": 0.0408, + "num_input_tokens_seen": 106065392, + "step": 49100 + }, + { + "epoch": 8.010603588907015, + "grad_norm": 0.053220439702272415, + "learning_rate": 0.0007492105481397686, + "loss": 0.0953, + "num_input_tokens_seen": 106076528, + "step": 49105 + }, + { + "epoch": 8.01141924959217, + "grad_norm": 0.059041813015937805, + "learning_rate": 0.0007491488373303373, + "loss": 0.0277, + "num_input_tokens_seen": 106087664, + "step": 49110 + }, + { + "epoch": 8.012234910277325, + "grad_norm": 0.011349274776875973, + "learning_rate": 0.0007490871214715885, + "loss": 0.0122, + "num_input_tokens_seen": 106099184, + "step": 49115 + }, + { + "epoch": 8.013050570962479, + "grad_norm": 0.057813651859760284, + "learning_rate": 0.0007490254005647735, + "loss": 0.0269, + "num_input_tokens_seen": 106109776, + "step": 49120 + }, + { + "epoch": 8.013866231647635, + "grad_norm": 0.017140861600637436, + "learning_rate": 0.0007489636746111426, + "loss": 0.0241, + "num_input_tokens_seen": 106120464, + "step": 49125 + }, + { + "epoch": 8.01468189233279, + "grad_norm": 0.015432114712893963, + "learning_rate": 0.0007489019436119471, + "loss": 0.0379, + "num_input_tokens_seen": 106131568, + "step": 49130 + }, + { + "epoch": 8.015497553017944, + "grad_norm": 0.0168614462018013, + "learning_rate": 0.0007488402075684379, + "loss": 0.017, + "num_input_tokens_seen": 106142416, + "step": 49135 + }, + { + "epoch": 8.0163132137031, + "grad_norm": 0.004921023268252611, + "learning_rate": 0.0007487784664818662, + "loss": 0.0197, + "num_input_tokens_seen": 106152592, + "step": 49140 + }, + { + "epoch": 8.017128874388254, + "grad_norm": 0.25885462760925293, + "learning_rate": 0.0007487167203534834, + "loss": 0.1524, + "num_input_tokens_seen": 106162992, + "step": 49145 + }, + { + "epoch": 8.01794453507341, + "grad_norm": 0.09149748086929321, + "learning_rate": 0.0007486549691845405, + "loss": 0.0246, + "num_input_tokens_seen": 106174736, + "step": 49150 + }, + { + "epoch": 8.018760195758565, + "grad_norm": 0.004045677836984396, + "learning_rate": 0.0007485932129762895, + "loss": 0.1091, + "num_input_tokens_seen": 106185936, + "step": 49155 + }, + { + "epoch": 8.01957585644372, + "grad_norm": 0.013637539930641651, + "learning_rate": 0.0007485314517299815, + "loss": 0.0353, + "num_input_tokens_seen": 106196688, + "step": 49160 + }, + { + "epoch": 8.020391517128875, + "grad_norm": 0.02192877233028412, + "learning_rate": 0.0007484696854468684, + "loss": 0.0796, + "num_input_tokens_seen": 106206768, + "step": 49165 + }, + { + "epoch": 8.021207177814029, + "grad_norm": 0.0217854306101799, + "learning_rate": 0.0007484079141282018, + "loss": 0.0482, + "num_input_tokens_seen": 106216208, + "step": 49170 + }, + { + "epoch": 8.022022838499185, + "grad_norm": 0.1591954380273819, + "learning_rate": 0.0007483461377752339, + "loss": 0.0885, + "num_input_tokens_seen": 106226704, + "step": 49175 + }, + { + "epoch": 8.022838499184338, + "grad_norm": 0.2401871681213379, + "learning_rate": 0.0007482843563892164, + "loss": 0.1582, + "num_input_tokens_seen": 106238640, + "step": 49180 + }, + { + "epoch": 8.023654159869494, + "grad_norm": 0.12886598706245422, + "learning_rate": 0.0007482225699714014, + "loss": 0.0273, + "num_input_tokens_seen": 106249200, + "step": 49185 + }, + { + "epoch": 8.02446982055465, + "grad_norm": 0.0026838481426239014, + "learning_rate": 0.0007481607785230411, + "loss": 0.0509, + "num_input_tokens_seen": 106259920, + "step": 49190 + }, + { + "epoch": 8.025285481239804, + "grad_norm": 0.07842859625816345, + "learning_rate": 0.0007480989820453878, + "loss": 0.1143, + "num_input_tokens_seen": 106271088, + "step": 49195 + }, + { + "epoch": 8.02610114192496, + "grad_norm": 0.1295957714319229, + "learning_rate": 0.0007480371805396941, + "loss": 0.1251, + "num_input_tokens_seen": 106281488, + "step": 49200 + }, + { + "epoch": 8.026916802610113, + "grad_norm": 0.005239915568381548, + "learning_rate": 0.0007479753740072121, + "loss": 0.0062, + "num_input_tokens_seen": 106291920, + "step": 49205 + }, + { + "epoch": 8.02773246329527, + "grad_norm": 0.02723226509988308, + "learning_rate": 0.0007479135624491946, + "loss": 0.0365, + "num_input_tokens_seen": 106302896, + "step": 49210 + }, + { + "epoch": 8.028548123980425, + "grad_norm": 0.19069083034992218, + "learning_rate": 0.0007478517458668943, + "loss": 0.0748, + "num_input_tokens_seen": 106313968, + "step": 49215 + }, + { + "epoch": 8.029363784665579, + "grad_norm": 0.07725605368614197, + "learning_rate": 0.0007477899242615639, + "loss": 0.0185, + "num_input_tokens_seen": 106325328, + "step": 49220 + }, + { + "epoch": 8.030179445350734, + "grad_norm": 0.34236836433410645, + "learning_rate": 0.0007477280976344563, + "loss": 0.0838, + "num_input_tokens_seen": 106336720, + "step": 49225 + }, + { + "epoch": 8.030995106035888, + "grad_norm": 0.01022479310631752, + "learning_rate": 0.0007476662659868246, + "loss": 0.1021, + "num_input_tokens_seen": 106347888, + "step": 49230 + }, + { + "epoch": 8.031810766721044, + "grad_norm": 0.0006199624622240663, + "learning_rate": 0.0007476044293199218, + "loss": 0.009, + "num_input_tokens_seen": 106359440, + "step": 49235 + }, + { + "epoch": 8.0326264274062, + "grad_norm": 0.025253375992178917, + "learning_rate": 0.0007475425876350011, + "loss": 0.0687, + "num_input_tokens_seen": 106371152, + "step": 49240 + }, + { + "epoch": 8.033442088091354, + "grad_norm": 0.00817878171801567, + "learning_rate": 0.000747480740933316, + "loss": 0.0207, + "num_input_tokens_seen": 106382640, + "step": 49245 + }, + { + "epoch": 8.03425774877651, + "grad_norm": 0.029934866353869438, + "learning_rate": 0.0007474188892161196, + "loss": 0.0692, + "num_input_tokens_seen": 106393712, + "step": 49250 + }, + { + "epoch": 8.035073409461663, + "grad_norm": 0.3806656301021576, + "learning_rate": 0.0007473570324846656, + "loss": 0.0402, + "num_input_tokens_seen": 106403664, + "step": 49255 + }, + { + "epoch": 8.035889070146819, + "grad_norm": 0.039969995617866516, + "learning_rate": 0.0007472951707402074, + "loss": 0.0082, + "num_input_tokens_seen": 106415664, + "step": 49260 + }, + { + "epoch": 8.036704730831975, + "grad_norm": 0.1323152780532837, + "learning_rate": 0.0007472333039839989, + "loss": 0.0612, + "num_input_tokens_seen": 106426672, + "step": 49265 + }, + { + "epoch": 8.037520391517129, + "grad_norm": 0.035253897309303284, + "learning_rate": 0.000747171432217294, + "loss": 0.0171, + "num_input_tokens_seen": 106437296, + "step": 49270 + }, + { + "epoch": 8.038336052202284, + "grad_norm": 0.08336462080478668, + "learning_rate": 0.0007471095554413463, + "loss": 0.0331, + "num_input_tokens_seen": 106447024, + "step": 49275 + }, + { + "epoch": 8.039151712887438, + "grad_norm": 0.03029344044625759, + "learning_rate": 0.0007470476736574102, + "loss": 0.0719, + "num_input_tokens_seen": 106456976, + "step": 49280 + }, + { + "epoch": 8.039967373572594, + "grad_norm": 0.3142164647579193, + "learning_rate": 0.0007469857868667393, + "loss": 0.2818, + "num_input_tokens_seen": 106467888, + "step": 49285 + }, + { + "epoch": 8.040783034257748, + "grad_norm": 0.02353101409971714, + "learning_rate": 0.0007469238950705883, + "loss": 0.0155, + "num_input_tokens_seen": 106479024, + "step": 49290 + }, + { + "epoch": 8.041598694942904, + "grad_norm": 0.003246279200538993, + "learning_rate": 0.0007468619982702112, + "loss": 0.005, + "num_input_tokens_seen": 106488912, + "step": 49295 + }, + { + "epoch": 8.04241435562806, + "grad_norm": 0.33486485481262207, + "learning_rate": 0.0007468000964668625, + "loss": 0.0731, + "num_input_tokens_seen": 106500336, + "step": 49300 + }, + { + "epoch": 8.043230016313213, + "grad_norm": 0.044109445065259933, + "learning_rate": 0.0007467381896617968, + "loss": 0.1144, + "num_input_tokens_seen": 106511280, + "step": 49305 + }, + { + "epoch": 8.044045676998369, + "grad_norm": 0.2685130536556244, + "learning_rate": 0.0007466762778562687, + "loss": 0.1703, + "num_input_tokens_seen": 106521776, + "step": 49310 + }, + { + "epoch": 8.044861337683523, + "grad_norm": 0.07512059807777405, + "learning_rate": 0.000746614361051533, + "loss": 0.1613, + "num_input_tokens_seen": 106532976, + "step": 49315 + }, + { + "epoch": 8.045676998368679, + "grad_norm": 0.1503468155860901, + "learning_rate": 0.0007465524392488443, + "loss": 0.1, + "num_input_tokens_seen": 106543792, + "step": 49320 + }, + { + "epoch": 8.046492659053834, + "grad_norm": 0.14477421343326569, + "learning_rate": 0.0007464905124494578, + "loss": 0.23, + "num_input_tokens_seen": 106555632, + "step": 49325 + }, + { + "epoch": 8.047308319738988, + "grad_norm": 0.20425103604793549, + "learning_rate": 0.0007464285806546283, + "loss": 0.07, + "num_input_tokens_seen": 106566608, + "step": 49330 + }, + { + "epoch": 8.048123980424144, + "grad_norm": 0.007286466658115387, + "learning_rate": 0.0007463666438656109, + "loss": 0.0461, + "num_input_tokens_seen": 106576976, + "step": 49335 + }, + { + "epoch": 8.048939641109298, + "grad_norm": 0.007894366048276424, + "learning_rate": 0.000746304702083661, + "loss": 0.0204, + "num_input_tokens_seen": 106586896, + "step": 49340 + }, + { + "epoch": 8.049755301794454, + "grad_norm": 0.044826071709394455, + "learning_rate": 0.0007462427553100339, + "loss": 0.0876, + "num_input_tokens_seen": 106595760, + "step": 49345 + }, + { + "epoch": 8.05057096247961, + "grad_norm": 0.09864915162324905, + "learning_rate": 0.0007461808035459848, + "loss": 0.0103, + "num_input_tokens_seen": 106606480, + "step": 49350 + }, + { + "epoch": 8.051386623164763, + "grad_norm": 0.08167116343975067, + "learning_rate": 0.0007461188467927695, + "loss": 0.023, + "num_input_tokens_seen": 106616720, + "step": 49355 + }, + { + "epoch": 8.052202283849919, + "grad_norm": 0.0640343576669693, + "learning_rate": 0.0007460568850516436, + "loss": 0.0279, + "num_input_tokens_seen": 106626512, + "step": 49360 + }, + { + "epoch": 8.053017944535073, + "grad_norm": 0.39939889311790466, + "learning_rate": 0.0007459949183238627, + "loss": 0.0461, + "num_input_tokens_seen": 106637296, + "step": 49365 + }, + { + "epoch": 8.053833605220229, + "grad_norm": 0.036775026470422745, + "learning_rate": 0.0007459329466106829, + "loss": 0.0943, + "num_input_tokens_seen": 106647888, + "step": 49370 + }, + { + "epoch": 8.054649265905383, + "grad_norm": 0.20606455206871033, + "learning_rate": 0.0007458709699133597, + "loss": 0.1257, + "num_input_tokens_seen": 106659408, + "step": 49375 + }, + { + "epoch": 8.055464926590538, + "grad_norm": 0.007144047878682613, + "learning_rate": 0.0007458089882331495, + "loss": 0.079, + "num_input_tokens_seen": 106670576, + "step": 49380 + }, + { + "epoch": 8.056280587275694, + "grad_norm": 0.4637611210346222, + "learning_rate": 0.0007457470015713085, + "loss": 0.0459, + "num_input_tokens_seen": 106681040, + "step": 49385 + }, + { + "epoch": 8.057096247960848, + "grad_norm": 0.00918448157608509, + "learning_rate": 0.0007456850099290927, + "loss": 0.0184, + "num_input_tokens_seen": 106691376, + "step": 49390 + }, + { + "epoch": 8.057911908646004, + "grad_norm": 0.09710443019866943, + "learning_rate": 0.0007456230133077583, + "loss": 0.012, + "num_input_tokens_seen": 106703312, + "step": 49395 + }, + { + "epoch": 8.058727569331158, + "grad_norm": 0.004849865101277828, + "learning_rate": 0.0007455610117085618, + "loss": 0.0331, + "num_input_tokens_seen": 106713328, + "step": 49400 + }, + { + "epoch": 8.059543230016313, + "grad_norm": 0.22771984338760376, + "learning_rate": 0.0007454990051327602, + "loss": 0.0268, + "num_input_tokens_seen": 106723984, + "step": 49405 + }, + { + "epoch": 8.060358890701469, + "grad_norm": 0.018330955877900124, + "learning_rate": 0.0007454369935816098, + "loss": 0.0197, + "num_input_tokens_seen": 106735216, + "step": 49410 + }, + { + "epoch": 8.061174551386623, + "grad_norm": 0.02104993537068367, + "learning_rate": 0.0007453749770563673, + "loss": 0.1355, + "num_input_tokens_seen": 106746160, + "step": 49415 + }, + { + "epoch": 8.061990212071779, + "grad_norm": 0.0019923443906009197, + "learning_rate": 0.0007453129555582896, + "loss": 0.0202, + "num_input_tokens_seen": 106756912, + "step": 49420 + }, + { + "epoch": 8.062805872756933, + "grad_norm": 0.019411850720643997, + "learning_rate": 0.0007452509290886336, + "loss": 0.0215, + "num_input_tokens_seen": 106768560, + "step": 49425 + }, + { + "epoch": 8.063621533442088, + "grad_norm": 0.09938501566648483, + "learning_rate": 0.0007451888976486565, + "loss": 0.0326, + "num_input_tokens_seen": 106780656, + "step": 49430 + }, + { + "epoch": 8.064437194127244, + "grad_norm": 0.30560827255249023, + "learning_rate": 0.0007451268612396154, + "loss": 0.0346, + "num_input_tokens_seen": 106789968, + "step": 49435 + }, + { + "epoch": 8.065252854812398, + "grad_norm": 0.002016686834394932, + "learning_rate": 0.0007450648198627673, + "loss": 0.0084, + "num_input_tokens_seen": 106800144, + "step": 49440 + }, + { + "epoch": 8.066068515497554, + "grad_norm": 0.04060761258006096, + "learning_rate": 0.0007450027735193699, + "loss": 0.0354, + "num_input_tokens_seen": 106811248, + "step": 49445 + }, + { + "epoch": 8.066884176182707, + "grad_norm": 0.004460466559976339, + "learning_rate": 0.0007449407222106804, + "loss": 0.0678, + "num_input_tokens_seen": 106821136, + "step": 49450 + }, + { + "epoch": 8.067699836867863, + "grad_norm": 0.006339784245938063, + "learning_rate": 0.0007448786659379565, + "loss": 0.1027, + "num_input_tokens_seen": 106832368, + "step": 49455 + }, + { + "epoch": 8.068515497553017, + "grad_norm": 0.012937436811625957, + "learning_rate": 0.0007448166047024556, + "loss": 0.1329, + "num_input_tokens_seen": 106842096, + "step": 49460 + }, + { + "epoch": 8.069331158238173, + "grad_norm": 0.0679636299610138, + "learning_rate": 0.0007447545385054358, + "loss": 0.0163, + "num_input_tokens_seen": 106853200, + "step": 49465 + }, + { + "epoch": 8.070146818923329, + "grad_norm": 0.0034860328305512667, + "learning_rate": 0.0007446924673481548, + "loss": 0.0071, + "num_input_tokens_seen": 106865104, + "step": 49470 + }, + { + "epoch": 8.070962479608482, + "grad_norm": 0.011176558211445808, + "learning_rate": 0.0007446303912318705, + "loss": 0.0941, + "num_input_tokens_seen": 106876560, + "step": 49475 + }, + { + "epoch": 8.071778140293638, + "grad_norm": 0.38480356335639954, + "learning_rate": 0.000744568310157841, + "loss": 0.0225, + "num_input_tokens_seen": 106888176, + "step": 49480 + }, + { + "epoch": 8.072593800978792, + "grad_norm": 0.00534833362326026, + "learning_rate": 0.0007445062241273244, + "loss": 0.1502, + "num_input_tokens_seen": 106899696, + "step": 49485 + }, + { + "epoch": 8.073409461663948, + "grad_norm": 0.05332694947719574, + "learning_rate": 0.000744444133141579, + "loss": 0.0184, + "num_input_tokens_seen": 106910480, + "step": 49490 + }, + { + "epoch": 8.074225122349104, + "grad_norm": 0.007500792853534222, + "learning_rate": 0.0007443820372018631, + "loss": 0.007, + "num_input_tokens_seen": 106921072, + "step": 49495 + }, + { + "epoch": 8.075040783034257, + "grad_norm": 0.03552490100264549, + "learning_rate": 0.0007443199363094353, + "loss": 0.0176, + "num_input_tokens_seen": 106933392, + "step": 49500 + }, + { + "epoch": 8.075856443719413, + "grad_norm": 0.3780343234539032, + "learning_rate": 0.0007442578304655541, + "loss": 0.0829, + "num_input_tokens_seen": 106942928, + "step": 49505 + }, + { + "epoch": 8.076672104404567, + "grad_norm": 0.041732337325811386, + "learning_rate": 0.0007441957196714778, + "loss": 0.0128, + "num_input_tokens_seen": 106952912, + "step": 49510 + }, + { + "epoch": 8.077487765089723, + "grad_norm": 0.0023167598992586136, + "learning_rate": 0.0007441336039284656, + "loss": 0.0128, + "num_input_tokens_seen": 106962704, + "step": 49515 + }, + { + "epoch": 8.078303425774878, + "grad_norm": 0.2755075693130493, + "learning_rate": 0.0007440714832377764, + "loss": 0.1491, + "num_input_tokens_seen": 106973008, + "step": 49520 + }, + { + "epoch": 8.079119086460032, + "grad_norm": 0.05139836296439171, + "learning_rate": 0.0007440093576006688, + "loss": 0.0638, + "num_input_tokens_seen": 106983632, + "step": 49525 + }, + { + "epoch": 8.079934747145188, + "grad_norm": 0.3967004120349884, + "learning_rate": 0.000743947227018402, + "loss": 0.2202, + "num_input_tokens_seen": 106995152, + "step": 49530 + }, + { + "epoch": 8.080750407830342, + "grad_norm": 0.1344999372959137, + "learning_rate": 0.0007438850914922352, + "loss": 0.0648, + "num_input_tokens_seen": 107005936, + "step": 49535 + }, + { + "epoch": 8.081566068515498, + "grad_norm": 0.05749267339706421, + "learning_rate": 0.0007438229510234278, + "loss": 0.015, + "num_input_tokens_seen": 107014960, + "step": 49540 + }, + { + "epoch": 8.082381729200652, + "grad_norm": 0.001101077999919653, + "learning_rate": 0.0007437608056132388, + "loss": 0.025, + "num_input_tokens_seen": 107026448, + "step": 49545 + }, + { + "epoch": 8.083197389885807, + "grad_norm": 0.003104017348960042, + "learning_rate": 0.0007436986552629279, + "loss": 0.0218, + "num_input_tokens_seen": 107038768, + "step": 49550 + }, + { + "epoch": 8.084013050570963, + "grad_norm": 0.012065582908689976, + "learning_rate": 0.0007436364999737546, + "loss": 0.0271, + "num_input_tokens_seen": 107049936, + "step": 49555 + }, + { + "epoch": 8.084828711256117, + "grad_norm": 0.1463005393743515, + "learning_rate": 0.0007435743397469785, + "loss": 0.0241, + "num_input_tokens_seen": 107061808, + "step": 49560 + }, + { + "epoch": 8.085644371941273, + "grad_norm": 0.024028457701206207, + "learning_rate": 0.0007435121745838595, + "loss": 0.0256, + "num_input_tokens_seen": 107072464, + "step": 49565 + }, + { + "epoch": 8.086460032626427, + "grad_norm": 0.24131129682064056, + "learning_rate": 0.0007434500044856574, + "loss": 0.1333, + "num_input_tokens_seen": 107082704, + "step": 49570 + }, + { + "epoch": 8.087275693311582, + "grad_norm": 0.2152007669210434, + "learning_rate": 0.000743387829453632, + "loss": 0.0369, + "num_input_tokens_seen": 107093680, + "step": 49575 + }, + { + "epoch": 8.088091353996738, + "grad_norm": 0.2218417227268219, + "learning_rate": 0.0007433256494890435, + "loss": 0.0972, + "num_input_tokens_seen": 107103824, + "step": 49580 + }, + { + "epoch": 8.088907014681892, + "grad_norm": 0.01789860427379608, + "learning_rate": 0.000743263464593152, + "loss": 0.062, + "num_input_tokens_seen": 107114512, + "step": 49585 + }, + { + "epoch": 8.089722675367048, + "grad_norm": 0.4298427104949951, + "learning_rate": 0.0007432012747672179, + "loss": 0.0929, + "num_input_tokens_seen": 107125904, + "step": 49590 + }, + { + "epoch": 8.090538336052202, + "grad_norm": 0.0065206796862185, + "learning_rate": 0.0007431390800125013, + "loss": 0.0142, + "num_input_tokens_seen": 107137136, + "step": 49595 + }, + { + "epoch": 8.091353996737357, + "grad_norm": 0.020476138219237328, + "learning_rate": 0.0007430768803302629, + "loss": 0.0653, + "num_input_tokens_seen": 107148912, + "step": 49600 + }, + { + "epoch": 8.092169657422513, + "grad_norm": 0.017311519011855125, + "learning_rate": 0.0007430146757217631, + "loss": 0.2111, + "num_input_tokens_seen": 107159792, + "step": 49605 + }, + { + "epoch": 8.092985318107667, + "grad_norm": 0.022033190354704857, + "learning_rate": 0.0007429524661882626, + "loss": 0.1223, + "num_input_tokens_seen": 107170640, + "step": 49610 + }, + { + "epoch": 8.093800978792823, + "grad_norm": 0.05615110695362091, + "learning_rate": 0.0007428902517310222, + "loss": 0.0604, + "num_input_tokens_seen": 107180816, + "step": 49615 + }, + { + "epoch": 8.094616639477977, + "grad_norm": 0.03126392140984535, + "learning_rate": 0.0007428280323513028, + "loss": 0.049, + "num_input_tokens_seen": 107192016, + "step": 49620 + }, + { + "epoch": 8.095432300163132, + "grad_norm": 0.622178316116333, + "learning_rate": 0.0007427658080503652, + "loss": 0.2196, + "num_input_tokens_seen": 107202160, + "step": 49625 + }, + { + "epoch": 8.096247960848286, + "grad_norm": 0.1008746325969696, + "learning_rate": 0.0007427035788294704, + "loss": 0.0663, + "num_input_tokens_seen": 107213776, + "step": 49630 + }, + { + "epoch": 8.097063621533442, + "grad_norm": 0.04118318483233452, + "learning_rate": 0.0007426413446898799, + "loss": 0.0315, + "num_input_tokens_seen": 107224048, + "step": 49635 + }, + { + "epoch": 8.097879282218598, + "grad_norm": 0.3178521990776062, + "learning_rate": 0.0007425791056328546, + "loss": 0.1177, + "num_input_tokens_seen": 107234800, + "step": 49640 + }, + { + "epoch": 8.098694942903752, + "grad_norm": 0.24075457453727722, + "learning_rate": 0.0007425168616596561, + "loss": 0.0396, + "num_input_tokens_seen": 107245872, + "step": 49645 + }, + { + "epoch": 8.099510603588907, + "grad_norm": 0.22038891911506653, + "learning_rate": 0.0007424546127715456, + "loss": 0.057, + "num_input_tokens_seen": 107255760, + "step": 49650 + }, + { + "epoch": 8.100326264274061, + "grad_norm": 0.13066186010837555, + "learning_rate": 0.0007423923589697849, + "loss": 0.0265, + "num_input_tokens_seen": 107266992, + "step": 49655 + }, + { + "epoch": 8.101141924959217, + "grad_norm": 0.009751847945153713, + "learning_rate": 0.0007423301002556355, + "loss": 0.0186, + "num_input_tokens_seen": 107275536, + "step": 49660 + }, + { + "epoch": 8.101957585644373, + "grad_norm": 0.11319594085216522, + "learning_rate": 0.0007422678366303592, + "loss": 0.0525, + "num_input_tokens_seen": 107286960, + "step": 49665 + }, + { + "epoch": 8.102773246329527, + "grad_norm": 0.051054831594228745, + "learning_rate": 0.000742205568095218, + "loss": 0.0205, + "num_input_tokens_seen": 107297360, + "step": 49670 + }, + { + "epoch": 8.103588907014682, + "grad_norm": 0.059771161526441574, + "learning_rate": 0.0007421432946514736, + "loss": 0.0394, + "num_input_tokens_seen": 107308272, + "step": 49675 + }, + { + "epoch": 8.104404567699836, + "grad_norm": 0.042254697531461716, + "learning_rate": 0.0007420810163003881, + "loss": 0.0656, + "num_input_tokens_seen": 107318640, + "step": 49680 + }, + { + "epoch": 8.105220228384992, + "grad_norm": 0.004530859179794788, + "learning_rate": 0.0007420187330432238, + "loss": 0.1321, + "num_input_tokens_seen": 107328656, + "step": 49685 + }, + { + "epoch": 8.106035889070148, + "grad_norm": 0.48277902603149414, + "learning_rate": 0.0007419564448812428, + "loss": 0.1329, + "num_input_tokens_seen": 107338832, + "step": 49690 + }, + { + "epoch": 8.106851549755302, + "grad_norm": 0.053494442254304886, + "learning_rate": 0.0007418941518157075, + "loss": 0.0152, + "num_input_tokens_seen": 107349456, + "step": 49695 + }, + { + "epoch": 8.107667210440457, + "grad_norm": 0.3123549520969391, + "learning_rate": 0.0007418318538478803, + "loss": 0.1035, + "num_input_tokens_seen": 107361360, + "step": 49700 + }, + { + "epoch": 8.108482871125611, + "grad_norm": 0.3388768136501312, + "learning_rate": 0.0007417695509790239, + "loss": 0.11, + "num_input_tokens_seen": 107372496, + "step": 49705 + }, + { + "epoch": 8.109298531810767, + "grad_norm": 0.07602295279502869, + "learning_rate": 0.0007417072432104007, + "loss": 0.1473, + "num_input_tokens_seen": 107383376, + "step": 49710 + }, + { + "epoch": 8.11011419249592, + "grad_norm": 0.08169770240783691, + "learning_rate": 0.0007416449305432738, + "loss": 0.0188, + "num_input_tokens_seen": 107394448, + "step": 49715 + }, + { + "epoch": 8.110929853181077, + "grad_norm": 0.02449674718081951, + "learning_rate": 0.0007415826129789057, + "loss": 0.0245, + "num_input_tokens_seen": 107404944, + "step": 49720 + }, + { + "epoch": 8.111745513866232, + "grad_norm": 0.16801486909389496, + "learning_rate": 0.0007415202905185594, + "loss": 0.1158, + "num_input_tokens_seen": 107416080, + "step": 49725 + }, + { + "epoch": 8.112561174551386, + "grad_norm": 0.03335743770003319, + "learning_rate": 0.0007414579631634981, + "loss": 0.0197, + "num_input_tokens_seen": 107426224, + "step": 49730 + }, + { + "epoch": 8.113376835236542, + "grad_norm": 0.053683776408433914, + "learning_rate": 0.0007413956309149848, + "loss": 0.1636, + "num_input_tokens_seen": 107437328, + "step": 49735 + }, + { + "epoch": 8.114192495921696, + "grad_norm": 0.15747734904289246, + "learning_rate": 0.000741333293774283, + "loss": 0.0375, + "num_input_tokens_seen": 107446736, + "step": 49740 + }, + { + "epoch": 8.115008156606851, + "grad_norm": 0.039649732410907745, + "learning_rate": 0.0007412709517426556, + "loss": 0.011, + "num_input_tokens_seen": 107456784, + "step": 49745 + }, + { + "epoch": 8.115823817292007, + "grad_norm": 0.015954799950122833, + "learning_rate": 0.0007412086048213665, + "loss": 0.0142, + "num_input_tokens_seen": 107466640, + "step": 49750 + }, + { + "epoch": 8.116639477977161, + "grad_norm": 0.005059359595179558, + "learning_rate": 0.000741146253011679, + "loss": 0.0644, + "num_input_tokens_seen": 107477168, + "step": 49755 + }, + { + "epoch": 8.117455138662317, + "grad_norm": 0.010598313063383102, + "learning_rate": 0.0007410838963148568, + "loss": 0.0103, + "num_input_tokens_seen": 107487920, + "step": 49760 + }, + { + "epoch": 8.11827079934747, + "grad_norm": 0.006887549534440041, + "learning_rate": 0.0007410215347321634, + "loss": 0.0252, + "num_input_tokens_seen": 107499632, + "step": 49765 + }, + { + "epoch": 8.119086460032626, + "grad_norm": 0.026491384953260422, + "learning_rate": 0.000740959168264863, + "loss": 0.0803, + "num_input_tokens_seen": 107510192, + "step": 49770 + }, + { + "epoch": 8.119902120717782, + "grad_norm": 0.02549799717962742, + "learning_rate": 0.0007408967969142193, + "loss": 0.0923, + "num_input_tokens_seen": 107521040, + "step": 49775 + }, + { + "epoch": 8.120717781402936, + "grad_norm": 0.6434139013290405, + "learning_rate": 0.0007408344206814965, + "loss": 0.0789, + "num_input_tokens_seen": 107532816, + "step": 49780 + }, + { + "epoch": 8.121533442088092, + "grad_norm": 0.4618445038795471, + "learning_rate": 0.0007407720395679585, + "loss": 0.1924, + "num_input_tokens_seen": 107543984, + "step": 49785 + }, + { + "epoch": 8.122349102773246, + "grad_norm": 0.024171195924282074, + "learning_rate": 0.0007407096535748698, + "loss": 0.0338, + "num_input_tokens_seen": 107555824, + "step": 49790 + }, + { + "epoch": 8.123164763458401, + "grad_norm": 0.052864234894514084, + "learning_rate": 0.0007406472627034946, + "loss": 0.1377, + "num_input_tokens_seen": 107566800, + "step": 49795 + }, + { + "epoch": 8.123980424143557, + "grad_norm": 0.0018474479438737035, + "learning_rate": 0.0007405848669550973, + "loss": 0.0285, + "num_input_tokens_seen": 107577456, + "step": 49800 + }, + { + "epoch": 8.124796084828711, + "grad_norm": 0.17964208126068115, + "learning_rate": 0.0007405224663309425, + "loss": 0.2171, + "num_input_tokens_seen": 107589328, + "step": 49805 + }, + { + "epoch": 8.125611745513867, + "grad_norm": 0.1299470216035843, + "learning_rate": 0.0007404600608322948, + "loss": 0.146, + "num_input_tokens_seen": 107600784, + "step": 49810 + }, + { + "epoch": 8.12642740619902, + "grad_norm": 0.005026193335652351, + "learning_rate": 0.0007403976504604189, + "loss": 0.0586, + "num_input_tokens_seen": 107612592, + "step": 49815 + }, + { + "epoch": 8.127243066884176, + "grad_norm": 0.06259380280971527, + "learning_rate": 0.0007403352352165797, + "loss": 0.0488, + "num_input_tokens_seen": 107623184, + "step": 49820 + }, + { + "epoch": 8.12805872756933, + "grad_norm": 0.016340158879756927, + "learning_rate": 0.0007402728151020419, + "loss": 0.0232, + "num_input_tokens_seen": 107633456, + "step": 49825 + }, + { + "epoch": 8.128874388254486, + "grad_norm": 0.19695636630058289, + "learning_rate": 0.0007402103901180708, + "loss": 0.0811, + "num_input_tokens_seen": 107644688, + "step": 49830 + }, + { + "epoch": 8.129690048939642, + "grad_norm": 0.01583622582256794, + "learning_rate": 0.0007401479602659315, + "loss": 0.1204, + "num_input_tokens_seen": 107654960, + "step": 49835 + }, + { + "epoch": 8.130505709624796, + "grad_norm": 0.13566166162490845, + "learning_rate": 0.000740085525546889, + "loss": 0.0135, + "num_input_tokens_seen": 107665104, + "step": 49840 + }, + { + "epoch": 8.131321370309951, + "grad_norm": 0.0025712582282721996, + "learning_rate": 0.0007400230859622088, + "loss": 0.024, + "num_input_tokens_seen": 107675920, + "step": 49845 + }, + { + "epoch": 8.132137030995105, + "grad_norm": 0.07446157187223434, + "learning_rate": 0.0007399606415131563, + "loss": 0.0806, + "num_input_tokens_seen": 107686544, + "step": 49850 + }, + { + "epoch": 8.132952691680261, + "grad_norm": 0.08578246086835861, + "learning_rate": 0.0007398981922009971, + "loss": 0.0433, + "num_input_tokens_seen": 107698320, + "step": 49855 + }, + { + "epoch": 8.133768352365417, + "grad_norm": 0.06800012290477753, + "learning_rate": 0.0007398357380269966, + "loss": 0.0156, + "num_input_tokens_seen": 107709360, + "step": 49860 + }, + { + "epoch": 8.13458401305057, + "grad_norm": 0.007951917126774788, + "learning_rate": 0.0007397732789924205, + "loss": 0.0075, + "num_input_tokens_seen": 107720560, + "step": 49865 + }, + { + "epoch": 8.135399673735726, + "grad_norm": 0.010952308773994446, + "learning_rate": 0.0007397108150985349, + "loss": 0.0187, + "num_input_tokens_seen": 107730864, + "step": 49870 + }, + { + "epoch": 8.13621533442088, + "grad_norm": 0.1421835869550705, + "learning_rate": 0.0007396483463466055, + "loss": 0.0472, + "num_input_tokens_seen": 107740720, + "step": 49875 + }, + { + "epoch": 8.137030995106036, + "grad_norm": 0.00989621039479971, + "learning_rate": 0.0007395858727378982, + "loss": 0.0458, + "num_input_tokens_seen": 107750096, + "step": 49880 + }, + { + "epoch": 8.137846655791192, + "grad_norm": 0.0596698634326458, + "learning_rate": 0.0007395233942736794, + "loss": 0.1653, + "num_input_tokens_seen": 107761584, + "step": 49885 + }, + { + "epoch": 8.138662316476346, + "grad_norm": 0.028497813269495964, + "learning_rate": 0.0007394609109552152, + "loss": 0.0863, + "num_input_tokens_seen": 107772464, + "step": 49890 + }, + { + "epoch": 8.139477977161501, + "grad_norm": 0.08234116435050964, + "learning_rate": 0.0007393984227837718, + "loss": 0.1797, + "num_input_tokens_seen": 107783248, + "step": 49895 + }, + { + "epoch": 8.140293637846655, + "grad_norm": 0.02983970381319523, + "learning_rate": 0.0007393359297606155, + "loss": 0.0247, + "num_input_tokens_seen": 107793584, + "step": 49900 + }, + { + "epoch": 8.141109298531811, + "grad_norm": 0.23107391595840454, + "learning_rate": 0.0007392734318870133, + "loss": 0.0643, + "num_input_tokens_seen": 107803440, + "step": 49905 + }, + { + "epoch": 8.141924959216965, + "grad_norm": 0.06202094629406929, + "learning_rate": 0.0007392109291642311, + "loss": 0.0638, + "num_input_tokens_seen": 107813424, + "step": 49910 + }, + { + "epoch": 8.14274061990212, + "grad_norm": 0.31579720973968506, + "learning_rate": 0.0007391484215935363, + "loss": 0.1542, + "num_input_tokens_seen": 107823152, + "step": 49915 + }, + { + "epoch": 8.143556280587276, + "grad_norm": 0.14060421288013458, + "learning_rate": 0.000739085909176195, + "loss": 0.1019, + "num_input_tokens_seen": 107834736, + "step": 49920 + }, + { + "epoch": 8.14437194127243, + "grad_norm": 0.2537612020969391, + "learning_rate": 0.0007390233919134747, + "loss": 0.0287, + "num_input_tokens_seen": 107847120, + "step": 49925 + }, + { + "epoch": 8.145187601957586, + "grad_norm": 0.1196332573890686, + "learning_rate": 0.0007389608698066422, + "loss": 0.0142, + "num_input_tokens_seen": 107858448, + "step": 49930 + }, + { + "epoch": 8.14600326264274, + "grad_norm": 0.0030300780199468136, + "learning_rate": 0.0007388983428569643, + "loss": 0.0214, + "num_input_tokens_seen": 107870288, + "step": 49935 + }, + { + "epoch": 8.146818923327896, + "grad_norm": 0.005464842543005943, + "learning_rate": 0.0007388358110657085, + "loss": 0.0359, + "num_input_tokens_seen": 107881232, + "step": 49940 + }, + { + "epoch": 8.147634584013051, + "grad_norm": 0.12564074993133545, + "learning_rate": 0.000738773274434142, + "loss": 0.0288, + "num_input_tokens_seen": 107891024, + "step": 49945 + }, + { + "epoch": 8.148450244698205, + "grad_norm": 0.021209627389907837, + "learning_rate": 0.0007387107329635322, + "loss": 0.0732, + "num_input_tokens_seen": 107902224, + "step": 49950 + }, + { + "epoch": 8.149265905383361, + "grad_norm": 0.0424620546400547, + "learning_rate": 0.0007386481866551466, + "loss": 0.0218, + "num_input_tokens_seen": 107913776, + "step": 49955 + }, + { + "epoch": 8.150081566068515, + "grad_norm": 0.13068750500679016, + "learning_rate": 0.0007385856355102528, + "loss": 0.1071, + "num_input_tokens_seen": 107925488, + "step": 49960 + }, + { + "epoch": 8.15089722675367, + "grad_norm": 0.04212689772248268, + "learning_rate": 0.0007385230795301183, + "loss": 0.1603, + "num_input_tokens_seen": 107937040, + "step": 49965 + }, + { + "epoch": 8.151712887438826, + "grad_norm": 0.41780608892440796, + "learning_rate": 0.000738460518716011, + "loss": 0.0536, + "num_input_tokens_seen": 107947824, + "step": 49970 + }, + { + "epoch": 8.15252854812398, + "grad_norm": 0.2264157235622406, + "learning_rate": 0.0007383979530691989, + "loss": 0.1038, + "num_input_tokens_seen": 107957904, + "step": 49975 + }, + { + "epoch": 8.153344208809136, + "grad_norm": 0.10788124054670334, + "learning_rate": 0.0007383353825909498, + "loss": 0.1653, + "num_input_tokens_seen": 107968336, + "step": 49980 + }, + { + "epoch": 8.15415986949429, + "grad_norm": 0.16582083702087402, + "learning_rate": 0.0007382728072825318, + "loss": 0.1608, + "num_input_tokens_seen": 107978768, + "step": 49985 + }, + { + "epoch": 8.154975530179446, + "grad_norm": 0.23172712326049805, + "learning_rate": 0.0007382102271452132, + "loss": 0.0412, + "num_input_tokens_seen": 107989168, + "step": 49990 + }, + { + "epoch": 8.1557911908646, + "grad_norm": 0.11851934343576431, + "learning_rate": 0.0007381476421802621, + "loss": 0.0618, + "num_input_tokens_seen": 107999664, + "step": 49995 + }, + { + "epoch": 8.156606851549755, + "grad_norm": 0.022425547242164612, + "learning_rate": 0.0007380850523889469, + "loss": 0.0268, + "num_input_tokens_seen": 108008560, + "step": 50000 + }, + { + "epoch": 8.15742251223491, + "grad_norm": 0.17704157531261444, + "learning_rate": 0.0007380224577725361, + "loss": 0.0822, + "num_input_tokens_seen": 108019504, + "step": 50005 + }, + { + "epoch": 8.158238172920065, + "grad_norm": 0.22146500647068024, + "learning_rate": 0.0007379598583322982, + "loss": 0.2091, + "num_input_tokens_seen": 108030224, + "step": 50010 + }, + { + "epoch": 8.15905383360522, + "grad_norm": 0.0075067877769470215, + "learning_rate": 0.0007378972540695019, + "loss": 0.0308, + "num_input_tokens_seen": 108041936, + "step": 50015 + }, + { + "epoch": 8.159869494290374, + "grad_norm": 0.007990765385329723, + "learning_rate": 0.0007378346449854159, + "loss": 0.0663, + "num_input_tokens_seen": 108052688, + "step": 50020 + }, + { + "epoch": 8.16068515497553, + "grad_norm": 0.003241181606426835, + "learning_rate": 0.0007377720310813092, + "loss": 0.1593, + "num_input_tokens_seen": 108063440, + "step": 50025 + }, + { + "epoch": 8.161500815660686, + "grad_norm": 0.17617450654506683, + "learning_rate": 0.0007377094123584507, + "loss": 0.0786, + "num_input_tokens_seen": 108074448, + "step": 50030 + }, + { + "epoch": 8.16231647634584, + "grad_norm": 0.04974460229277611, + "learning_rate": 0.0007376467888181094, + "loss": 0.0424, + "num_input_tokens_seen": 108085840, + "step": 50035 + }, + { + "epoch": 8.163132137030995, + "grad_norm": 0.028729038313031197, + "learning_rate": 0.0007375841604615542, + "loss": 0.0164, + "num_input_tokens_seen": 108096816, + "step": 50040 + }, + { + "epoch": 8.16394779771615, + "grad_norm": 0.22961454093456268, + "learning_rate": 0.0007375215272900548, + "loss": 0.1319, + "num_input_tokens_seen": 108106448, + "step": 50045 + }, + { + "epoch": 8.164763458401305, + "grad_norm": 0.010329188778996468, + "learning_rate": 0.0007374588893048803, + "loss": 0.0371, + "num_input_tokens_seen": 108116816, + "step": 50050 + }, + { + "epoch": 8.16557911908646, + "grad_norm": 0.010221997275948524, + "learning_rate": 0.0007373962465073002, + "loss": 0.0539, + "num_input_tokens_seen": 108127440, + "step": 50055 + }, + { + "epoch": 8.166394779771615, + "grad_norm": 0.23573219776153564, + "learning_rate": 0.0007373335988985839, + "loss": 0.0611, + "num_input_tokens_seen": 108138128, + "step": 50060 + }, + { + "epoch": 8.16721044045677, + "grad_norm": 0.007467462215572596, + "learning_rate": 0.0007372709464800013, + "loss": 0.0263, + "num_input_tokens_seen": 108148912, + "step": 50065 + }, + { + "epoch": 8.168026101141924, + "grad_norm": 0.23514242470264435, + "learning_rate": 0.0007372082892528218, + "loss": 0.0509, + "num_input_tokens_seen": 108160240, + "step": 50070 + }, + { + "epoch": 8.16884176182708, + "grad_norm": 0.24603112041950226, + "learning_rate": 0.0007371456272183156, + "loss": 0.1425, + "num_input_tokens_seen": 108171280, + "step": 50075 + }, + { + "epoch": 8.169657422512234, + "grad_norm": 0.013973649591207504, + "learning_rate": 0.0007370829603777523, + "loss": 0.0193, + "num_input_tokens_seen": 108183088, + "step": 50080 + }, + { + "epoch": 8.17047308319739, + "grad_norm": 0.05473875626921654, + "learning_rate": 0.000737020288732402, + "loss": 0.1722, + "num_input_tokens_seen": 108194320, + "step": 50085 + }, + { + "epoch": 8.171288743882545, + "grad_norm": 0.075847327709198, + "learning_rate": 0.0007369576122835349, + "loss": 0.0737, + "num_input_tokens_seen": 108205296, + "step": 50090 + }, + { + "epoch": 8.1721044045677, + "grad_norm": 0.5191269516944885, + "learning_rate": 0.0007368949310324211, + "loss": 0.2405, + "num_input_tokens_seen": 108215696, + "step": 50095 + }, + { + "epoch": 8.172920065252855, + "grad_norm": 0.15466056764125824, + "learning_rate": 0.0007368322449803311, + "loss": 0.1787, + "num_input_tokens_seen": 108225456, + "step": 50100 + }, + { + "epoch": 8.173735725938009, + "grad_norm": 0.26574286818504333, + "learning_rate": 0.0007367695541285353, + "loss": 0.0349, + "num_input_tokens_seen": 108236560, + "step": 50105 + }, + { + "epoch": 8.174551386623165, + "grad_norm": 0.04187563434243202, + "learning_rate": 0.0007367068584783041, + "loss": 0.0278, + "num_input_tokens_seen": 108247568, + "step": 50110 + }, + { + "epoch": 8.17536704730832, + "grad_norm": 0.1134595200419426, + "learning_rate": 0.000736644158030908, + "loss": 0.0225, + "num_input_tokens_seen": 108258192, + "step": 50115 + }, + { + "epoch": 8.176182707993474, + "grad_norm": 0.004802480805665255, + "learning_rate": 0.0007365814527876179, + "loss": 0.0665, + "num_input_tokens_seen": 108268944, + "step": 50120 + }, + { + "epoch": 8.17699836867863, + "grad_norm": 0.3093656599521637, + "learning_rate": 0.0007365187427497045, + "loss": 0.1294, + "num_input_tokens_seen": 108279344, + "step": 50125 + }, + { + "epoch": 8.177814029363784, + "grad_norm": 0.020289601758122444, + "learning_rate": 0.0007364560279184387, + "loss": 0.0414, + "num_input_tokens_seen": 108290768, + "step": 50130 + }, + { + "epoch": 8.17862969004894, + "grad_norm": 0.019983666017651558, + "learning_rate": 0.0007363933082950917, + "loss": 0.1073, + "num_input_tokens_seen": 108302192, + "step": 50135 + }, + { + "epoch": 8.179445350734095, + "grad_norm": 0.009791013784706593, + "learning_rate": 0.0007363305838809344, + "loss": 0.0139, + "num_input_tokens_seen": 108312720, + "step": 50140 + }, + { + "epoch": 8.18026101141925, + "grad_norm": 0.007855056785047054, + "learning_rate": 0.0007362678546772379, + "loss": 0.2655, + "num_input_tokens_seen": 108322480, + "step": 50145 + }, + { + "epoch": 8.181076672104405, + "grad_norm": 0.023270519450306892, + "learning_rate": 0.0007362051206852736, + "loss": 0.0336, + "num_input_tokens_seen": 108332976, + "step": 50150 + }, + { + "epoch": 8.181892332789559, + "grad_norm": 0.15803292393684387, + "learning_rate": 0.0007361423819063128, + "loss": 0.0329, + "num_input_tokens_seen": 108344112, + "step": 50155 + }, + { + "epoch": 8.182707993474715, + "grad_norm": 0.2568044364452362, + "learning_rate": 0.0007360796383416273, + "loss": 0.1826, + "num_input_tokens_seen": 108354960, + "step": 50160 + }, + { + "epoch": 8.18352365415987, + "grad_norm": 0.011066235601902008, + "learning_rate": 0.0007360168899924883, + "loss": 0.055, + "num_input_tokens_seen": 108365008, + "step": 50165 + }, + { + "epoch": 8.184339314845024, + "grad_norm": 0.08335398137569427, + "learning_rate": 0.0007359541368601675, + "loss": 0.1744, + "num_input_tokens_seen": 108374544, + "step": 50170 + }, + { + "epoch": 8.18515497553018, + "grad_norm": 0.23367607593536377, + "learning_rate": 0.0007358913789459369, + "loss": 0.161, + "num_input_tokens_seen": 108384304, + "step": 50175 + }, + { + "epoch": 8.185970636215334, + "grad_norm": 0.04029659181833267, + "learning_rate": 0.0007358286162510683, + "loss": 0.1058, + "num_input_tokens_seen": 108394992, + "step": 50180 + }, + { + "epoch": 8.18678629690049, + "grad_norm": 0.04990570619702339, + "learning_rate": 0.0007357658487768337, + "loss": 0.0677, + "num_input_tokens_seen": 108407568, + "step": 50185 + }, + { + "epoch": 8.187601957585644, + "grad_norm": 0.009469253942370415, + "learning_rate": 0.0007357030765245049, + "loss": 0.022, + "num_input_tokens_seen": 108417904, + "step": 50190 + }, + { + "epoch": 8.1884176182708, + "grad_norm": 0.026664957404136658, + "learning_rate": 0.0007356402994953544, + "loss": 0.0822, + "num_input_tokens_seen": 108428400, + "step": 50195 + }, + { + "epoch": 8.189233278955955, + "grad_norm": 0.015516448765993118, + "learning_rate": 0.0007355775176906543, + "loss": 0.0596, + "num_input_tokens_seen": 108438864, + "step": 50200 + }, + { + "epoch": 8.190048939641109, + "grad_norm": 0.014006822369992733, + "learning_rate": 0.0007355147311116768, + "loss": 0.1504, + "num_input_tokens_seen": 108448720, + "step": 50205 + }, + { + "epoch": 8.190864600326265, + "grad_norm": 0.1970442533493042, + "learning_rate": 0.0007354519397596946, + "loss": 0.049, + "num_input_tokens_seen": 108459344, + "step": 50210 + }, + { + "epoch": 8.191680261011419, + "grad_norm": 0.015166080556809902, + "learning_rate": 0.0007353891436359801, + "loss": 0.0223, + "num_input_tokens_seen": 108470000, + "step": 50215 + }, + { + "epoch": 8.192495921696574, + "grad_norm": 0.0828956738114357, + "learning_rate": 0.000735326342741806, + "loss": 0.0787, + "num_input_tokens_seen": 108479856, + "step": 50220 + }, + { + "epoch": 8.19331158238173, + "grad_norm": 0.1197994202375412, + "learning_rate": 0.0007352635370784451, + "loss": 0.0738, + "num_input_tokens_seen": 108490576, + "step": 50225 + }, + { + "epoch": 8.194127243066884, + "grad_norm": 0.047717440873384476, + "learning_rate": 0.00073520072664717, + "loss": 0.0749, + "num_input_tokens_seen": 108502960, + "step": 50230 + }, + { + "epoch": 8.19494290375204, + "grad_norm": 0.08582471311092377, + "learning_rate": 0.000735137911449254, + "loss": 0.0627, + "num_input_tokens_seen": 108512944, + "step": 50235 + }, + { + "epoch": 8.195758564437194, + "grad_norm": 0.1752943992614746, + "learning_rate": 0.0007350750914859698, + "loss": 0.0526, + "num_input_tokens_seen": 108523664, + "step": 50240 + }, + { + "epoch": 8.19657422512235, + "grad_norm": 0.008756861090660095, + "learning_rate": 0.0007350122667585908, + "loss": 0.0229, + "num_input_tokens_seen": 108534704, + "step": 50245 + }, + { + "epoch": 8.197389885807505, + "grad_norm": 0.01668260246515274, + "learning_rate": 0.0007349494372683899, + "loss": 0.0281, + "num_input_tokens_seen": 108545936, + "step": 50250 + }, + { + "epoch": 8.198205546492659, + "grad_norm": 0.7298435568809509, + "learning_rate": 0.0007348866030166407, + "loss": 0.0642, + "num_input_tokens_seen": 108555536, + "step": 50255 + }, + { + "epoch": 8.199021207177815, + "grad_norm": 0.015028917230665684, + "learning_rate": 0.0007348237640046165, + "loss": 0.0155, + "num_input_tokens_seen": 108566896, + "step": 50260 + }, + { + "epoch": 8.199836867862969, + "grad_norm": 0.013336896896362305, + "learning_rate": 0.0007347609202335907, + "loss": 0.0383, + "num_input_tokens_seen": 108577520, + "step": 50265 + }, + { + "epoch": 8.200652528548124, + "grad_norm": 0.0023462544195353985, + "learning_rate": 0.0007346980717048373, + "loss": 0.0093, + "num_input_tokens_seen": 108589104, + "step": 50270 + }, + { + "epoch": 8.201468189233278, + "grad_norm": 0.02456527017056942, + "learning_rate": 0.0007346352184196296, + "loss": 0.0188, + "num_input_tokens_seen": 108599440, + "step": 50275 + }, + { + "epoch": 8.202283849918434, + "grad_norm": 0.23470385372638702, + "learning_rate": 0.0007345723603792415, + "loss": 0.0888, + "num_input_tokens_seen": 108610608, + "step": 50280 + }, + { + "epoch": 8.20309951060359, + "grad_norm": 0.03273777663707733, + "learning_rate": 0.000734509497584947, + "loss": 0.0784, + "num_input_tokens_seen": 108621680, + "step": 50285 + }, + { + "epoch": 8.203915171288743, + "grad_norm": 0.20122350752353668, + "learning_rate": 0.0007344466300380201, + "loss": 0.0249, + "num_input_tokens_seen": 108633712, + "step": 50290 + }, + { + "epoch": 8.2047308319739, + "grad_norm": 0.03938678279519081, + "learning_rate": 0.0007343837577397347, + "loss": 0.0923, + "num_input_tokens_seen": 108643888, + "step": 50295 + }, + { + "epoch": 8.205546492659053, + "grad_norm": 0.023820001631975174, + "learning_rate": 0.0007343208806913651, + "loss": 0.0581, + "num_input_tokens_seen": 108654160, + "step": 50300 + }, + { + "epoch": 8.206362153344209, + "grad_norm": 0.0029546052683144808, + "learning_rate": 0.0007342579988941858, + "loss": 0.1322, + "num_input_tokens_seen": 108665072, + "step": 50305 + }, + { + "epoch": 8.207177814029365, + "grad_norm": 0.09167854487895966, + "learning_rate": 0.0007341951123494708, + "loss": 0.0173, + "num_input_tokens_seen": 108676432, + "step": 50310 + }, + { + "epoch": 8.207993474714518, + "grad_norm": 0.13153241574764252, + "learning_rate": 0.0007341322210584947, + "loss": 0.0306, + "num_input_tokens_seen": 108687088, + "step": 50315 + }, + { + "epoch": 8.208809135399674, + "grad_norm": 0.27671709656715393, + "learning_rate": 0.0007340693250225322, + "loss": 0.0658, + "num_input_tokens_seen": 108698736, + "step": 50320 + }, + { + "epoch": 8.209624796084828, + "grad_norm": 0.0030362617690116167, + "learning_rate": 0.0007340064242428579, + "loss": 0.0678, + "num_input_tokens_seen": 108709616, + "step": 50325 + }, + { + "epoch": 8.210440456769984, + "grad_norm": 0.02015618234872818, + "learning_rate": 0.0007339435187207466, + "loss": 0.006, + "num_input_tokens_seen": 108720688, + "step": 50330 + }, + { + "epoch": 8.21125611745514, + "grad_norm": 0.017543811351060867, + "learning_rate": 0.0007338806084574731, + "loss": 0.0171, + "num_input_tokens_seen": 108730928, + "step": 50335 + }, + { + "epoch": 8.212071778140293, + "grad_norm": 0.003998770844191313, + "learning_rate": 0.0007338176934543124, + "loss": 0.0067, + "num_input_tokens_seen": 108742832, + "step": 50340 + }, + { + "epoch": 8.21288743882545, + "grad_norm": 0.0968787744641304, + "learning_rate": 0.0007337547737125394, + "loss": 0.1262, + "num_input_tokens_seen": 108753616, + "step": 50345 + }, + { + "epoch": 8.213703099510603, + "grad_norm": 0.3040590286254883, + "learning_rate": 0.0007336918492334294, + "loss": 0.1554, + "num_input_tokens_seen": 108764592, + "step": 50350 + }, + { + "epoch": 8.214518760195759, + "grad_norm": 0.09109804034233093, + "learning_rate": 0.0007336289200182576, + "loss": 0.0133, + "num_input_tokens_seen": 108776688, + "step": 50355 + }, + { + "epoch": 8.215334420880913, + "grad_norm": 0.04389248788356781, + "learning_rate": 0.0007335659860682994, + "loss": 0.0288, + "num_input_tokens_seen": 108787792, + "step": 50360 + }, + { + "epoch": 8.216150081566068, + "grad_norm": 0.12057659029960632, + "learning_rate": 0.0007335030473848302, + "loss": 0.0637, + "num_input_tokens_seen": 108798032, + "step": 50365 + }, + { + "epoch": 8.216965742251224, + "grad_norm": 0.02155953273177147, + "learning_rate": 0.0007334401039691255, + "loss": 0.0127, + "num_input_tokens_seen": 108808976, + "step": 50370 + }, + { + "epoch": 8.217781402936378, + "grad_norm": 0.011279478669166565, + "learning_rate": 0.000733377155822461, + "loss": 0.0086, + "num_input_tokens_seen": 108818928, + "step": 50375 + }, + { + "epoch": 8.218597063621534, + "grad_norm": 0.042144011706113815, + "learning_rate": 0.0007333142029461124, + "loss": 0.2277, + "num_input_tokens_seen": 108829744, + "step": 50380 + }, + { + "epoch": 8.219412724306688, + "grad_norm": 0.3779882788658142, + "learning_rate": 0.0007332512453413555, + "loss": 0.0623, + "num_input_tokens_seen": 108841008, + "step": 50385 + }, + { + "epoch": 8.220228384991843, + "grad_norm": 0.029292693361639977, + "learning_rate": 0.0007331882830094661, + "loss": 0.1683, + "num_input_tokens_seen": 108852208, + "step": 50390 + }, + { + "epoch": 8.221044045676999, + "grad_norm": 0.058450907468795776, + "learning_rate": 0.0007331253159517204, + "loss": 0.0171, + "num_input_tokens_seen": 108863440, + "step": 50395 + }, + { + "epoch": 8.221859706362153, + "grad_norm": 0.016852879896759987, + "learning_rate": 0.0007330623441693944, + "loss": 0.0701, + "num_input_tokens_seen": 108874256, + "step": 50400 + }, + { + "epoch": 8.222675367047309, + "grad_norm": 0.030780978500843048, + "learning_rate": 0.0007329993676637643, + "loss": 0.0464, + "num_input_tokens_seen": 108885808, + "step": 50405 + }, + { + "epoch": 8.223491027732463, + "grad_norm": 0.1959686279296875, + "learning_rate": 0.0007329363864361065, + "loss": 0.1331, + "num_input_tokens_seen": 108896880, + "step": 50410 + }, + { + "epoch": 8.224306688417618, + "grad_norm": 0.3598119616508484, + "learning_rate": 0.0007328734004876974, + "loss": 0.0714, + "num_input_tokens_seen": 108908336, + "step": 50415 + }, + { + "epoch": 8.225122349102774, + "grad_norm": 0.004757583606988192, + "learning_rate": 0.0007328104098198131, + "loss": 0.0879, + "num_input_tokens_seen": 108919312, + "step": 50420 + }, + { + "epoch": 8.225938009787928, + "grad_norm": 0.40510547161102295, + "learning_rate": 0.000732747414433731, + "loss": 0.0601, + "num_input_tokens_seen": 108930992, + "step": 50425 + }, + { + "epoch": 8.226753670473084, + "grad_norm": 0.3972322344779968, + "learning_rate": 0.000732684414330727, + "loss": 0.062, + "num_input_tokens_seen": 108942864, + "step": 50430 + }, + { + "epoch": 8.227569331158238, + "grad_norm": 0.008180802688002586, + "learning_rate": 0.0007326214095120781, + "loss": 0.0334, + "num_input_tokens_seen": 108952880, + "step": 50435 + }, + { + "epoch": 8.228384991843393, + "grad_norm": 0.007211578544229269, + "learning_rate": 0.0007325583999790613, + "loss": 0.0264, + "num_input_tokens_seen": 108964560, + "step": 50440 + }, + { + "epoch": 8.229200652528547, + "grad_norm": 0.017740648239850998, + "learning_rate": 0.0007324953857329535, + "loss": 0.0261, + "num_input_tokens_seen": 108976336, + "step": 50445 + }, + { + "epoch": 8.230016313213703, + "grad_norm": 0.009635083377361298, + "learning_rate": 0.0007324323667750319, + "loss": 0.0208, + "num_input_tokens_seen": 108987216, + "step": 50450 + }, + { + "epoch": 8.230831973898859, + "grad_norm": 0.007957677356898785, + "learning_rate": 0.0007323693431065734, + "loss": 0.1397, + "num_input_tokens_seen": 108997328, + "step": 50455 + }, + { + "epoch": 8.231647634584013, + "grad_norm": 0.013457262888550758, + "learning_rate": 0.0007323063147288553, + "loss": 0.1046, + "num_input_tokens_seen": 109007088, + "step": 50460 + }, + { + "epoch": 8.232463295269168, + "grad_norm": 0.22362826764583588, + "learning_rate": 0.0007322432816431551, + "loss": 0.0372, + "num_input_tokens_seen": 109017488, + "step": 50465 + }, + { + "epoch": 8.233278955954322, + "grad_norm": 0.17243242263793945, + "learning_rate": 0.0007321802438507502, + "loss": 0.0236, + "num_input_tokens_seen": 109028240, + "step": 50470 + }, + { + "epoch": 8.234094616639478, + "grad_norm": 0.003037064801901579, + "learning_rate": 0.0007321172013529182, + "loss": 0.129, + "num_input_tokens_seen": 109038224, + "step": 50475 + }, + { + "epoch": 8.234910277324634, + "grad_norm": 0.007365551311522722, + "learning_rate": 0.0007320541541509366, + "loss": 0.0422, + "num_input_tokens_seen": 109048592, + "step": 50480 + }, + { + "epoch": 8.235725938009788, + "grad_norm": 0.020375186577439308, + "learning_rate": 0.0007319911022460831, + "loss": 0.0671, + "num_input_tokens_seen": 109058512, + "step": 50485 + }, + { + "epoch": 8.236541598694943, + "grad_norm": 0.22685709595680237, + "learning_rate": 0.0007319280456396357, + "loss": 0.0269, + "num_input_tokens_seen": 109069392, + "step": 50490 + }, + { + "epoch": 8.237357259380097, + "grad_norm": 0.27163517475128174, + "learning_rate": 0.0007318649843328722, + "loss": 0.0468, + "num_input_tokens_seen": 109080176, + "step": 50495 + }, + { + "epoch": 8.238172920065253, + "grad_norm": 0.008194385096430779, + "learning_rate": 0.0007318019183270707, + "loss": 0.0809, + "num_input_tokens_seen": 109091216, + "step": 50500 + }, + { + "epoch": 8.238988580750409, + "grad_norm": 0.03463249281048775, + "learning_rate": 0.0007317388476235091, + "loss": 0.0119, + "num_input_tokens_seen": 109101872, + "step": 50505 + }, + { + "epoch": 8.239804241435563, + "grad_norm": 0.02676951140165329, + "learning_rate": 0.0007316757722234659, + "loss": 0.0978, + "num_input_tokens_seen": 109111600, + "step": 50510 + }, + { + "epoch": 8.240619902120718, + "grad_norm": 0.012483866885304451, + "learning_rate": 0.0007316126921282193, + "loss": 0.0099, + "num_input_tokens_seen": 109121872, + "step": 50515 + }, + { + "epoch": 8.241435562805872, + "grad_norm": 0.2171943187713623, + "learning_rate": 0.0007315496073390477, + "loss": 0.029, + "num_input_tokens_seen": 109133136, + "step": 50520 + }, + { + "epoch": 8.242251223491028, + "grad_norm": 0.0367184579372406, + "learning_rate": 0.0007314865178572295, + "loss": 0.2034, + "num_input_tokens_seen": 109143216, + "step": 50525 + }, + { + "epoch": 8.243066884176184, + "grad_norm": 0.04937918111681938, + "learning_rate": 0.0007314234236840434, + "loss": 0.0461, + "num_input_tokens_seen": 109153104, + "step": 50530 + }, + { + "epoch": 8.243882544861338, + "grad_norm": 0.0748000219464302, + "learning_rate": 0.000731360324820768, + "loss": 0.035, + "num_input_tokens_seen": 109163568, + "step": 50535 + }, + { + "epoch": 8.244698205546493, + "grad_norm": 0.008928696624934673, + "learning_rate": 0.000731297221268682, + "loss": 0.0819, + "num_input_tokens_seen": 109174096, + "step": 50540 + }, + { + "epoch": 8.245513866231647, + "grad_norm": 0.003620315110310912, + "learning_rate": 0.0007312341130290645, + "loss": 0.1859, + "num_input_tokens_seen": 109185328, + "step": 50545 + }, + { + "epoch": 8.246329526916803, + "grad_norm": 0.03834008425474167, + "learning_rate": 0.0007311710001031943, + "loss": 0.0182, + "num_input_tokens_seen": 109195568, + "step": 50550 + }, + { + "epoch": 8.247145187601957, + "grad_norm": 0.2103102058172226, + "learning_rate": 0.0007311078824923506, + "loss": 0.0414, + "num_input_tokens_seen": 109205712, + "step": 50555 + }, + { + "epoch": 8.247960848287113, + "grad_norm": 0.0034484812058508396, + "learning_rate": 0.0007310447601978125, + "loss": 0.0053, + "num_input_tokens_seen": 109217200, + "step": 50560 + }, + { + "epoch": 8.248776508972268, + "grad_norm": 0.05513901263475418, + "learning_rate": 0.0007309816332208592, + "loss": 0.0062, + "num_input_tokens_seen": 109227312, + "step": 50565 + }, + { + "epoch": 8.249592169657422, + "grad_norm": 0.3575925827026367, + "learning_rate": 0.00073091850156277, + "loss": 0.0751, + "num_input_tokens_seen": 109239664, + "step": 50570 + }, + { + "epoch": 8.250407830342578, + "grad_norm": 0.02116283029317856, + "learning_rate": 0.0007308553652248244, + "loss": 0.1445, + "num_input_tokens_seen": 109250320, + "step": 50575 + }, + { + "epoch": 8.251223491027732, + "grad_norm": 0.006838952656835318, + "learning_rate": 0.0007307922242083022, + "loss": 0.0543, + "num_input_tokens_seen": 109260048, + "step": 50580 + }, + { + "epoch": 8.252039151712887, + "grad_norm": 0.04055549204349518, + "learning_rate": 0.0007307290785144826, + "loss": 0.0177, + "num_input_tokens_seen": 109270896, + "step": 50585 + }, + { + "epoch": 8.252854812398043, + "grad_norm": 0.17573504149913788, + "learning_rate": 0.0007306659281446456, + "loss": 0.0878, + "num_input_tokens_seen": 109279440, + "step": 50590 + }, + { + "epoch": 8.253670473083197, + "grad_norm": 0.03469372168183327, + "learning_rate": 0.000730602773100071, + "loss": 0.0988, + "num_input_tokens_seen": 109291824, + "step": 50595 + }, + { + "epoch": 8.254486133768353, + "grad_norm": 0.021459020674228668, + "learning_rate": 0.0007305396133820385, + "loss": 0.0152, + "num_input_tokens_seen": 109302256, + "step": 50600 + }, + { + "epoch": 8.255301794453507, + "grad_norm": 0.029705122113227844, + "learning_rate": 0.0007304764489918284, + "loss": 0.1134, + "num_input_tokens_seen": 109312624, + "step": 50605 + }, + { + "epoch": 8.256117455138662, + "grad_norm": 0.050352420657873154, + "learning_rate": 0.0007304132799307206, + "loss": 0.0955, + "num_input_tokens_seen": 109324304, + "step": 50610 + }, + { + "epoch": 8.256933115823816, + "grad_norm": 0.017651639878749847, + "learning_rate": 0.0007303501061999956, + "loss": 0.125, + "num_input_tokens_seen": 109336208, + "step": 50615 + }, + { + "epoch": 8.257748776508972, + "grad_norm": 0.008997799828648567, + "learning_rate": 0.0007302869278009332, + "loss": 0.0646, + "num_input_tokens_seen": 109346672, + "step": 50620 + }, + { + "epoch": 8.258564437194128, + "grad_norm": 0.25991567969322205, + "learning_rate": 0.0007302237447348141, + "loss": 0.1863, + "num_input_tokens_seen": 109356656, + "step": 50625 + }, + { + "epoch": 8.259380097879282, + "grad_norm": 0.008971529081463814, + "learning_rate": 0.0007301605570029189, + "loss": 0.1918, + "num_input_tokens_seen": 109365168, + "step": 50630 + }, + { + "epoch": 8.260195758564437, + "grad_norm": 0.014313746243715286, + "learning_rate": 0.000730097364606528, + "loss": 0.0394, + "num_input_tokens_seen": 109375376, + "step": 50635 + }, + { + "epoch": 8.261011419249591, + "grad_norm": 0.2412402629852295, + "learning_rate": 0.000730034167546922, + "loss": 0.1086, + "num_input_tokens_seen": 109386576, + "step": 50640 + }, + { + "epoch": 8.261827079934747, + "grad_norm": 0.008197726681828499, + "learning_rate": 0.0007299709658253819, + "loss": 0.0813, + "num_input_tokens_seen": 109398352, + "step": 50645 + }, + { + "epoch": 8.262642740619903, + "grad_norm": 0.20047369599342346, + "learning_rate": 0.0007299077594431885, + "loss": 0.0372, + "num_input_tokens_seen": 109407056, + "step": 50650 + }, + { + "epoch": 8.263458401305057, + "grad_norm": 0.2139945924282074, + "learning_rate": 0.0007298445484016225, + "loss": 0.0519, + "num_input_tokens_seen": 109416912, + "step": 50655 + }, + { + "epoch": 8.264274061990212, + "grad_norm": 0.05942140519618988, + "learning_rate": 0.0007297813327019652, + "loss": 0.0264, + "num_input_tokens_seen": 109428144, + "step": 50660 + }, + { + "epoch": 8.265089722675366, + "grad_norm": 0.2181319147348404, + "learning_rate": 0.0007297181123454977, + "loss": 0.0128, + "num_input_tokens_seen": 109436976, + "step": 50665 + }, + { + "epoch": 8.265905383360522, + "grad_norm": 0.10995151102542877, + "learning_rate": 0.0007296548873335013, + "loss": 0.1215, + "num_input_tokens_seen": 109448336, + "step": 50670 + }, + { + "epoch": 8.266721044045678, + "grad_norm": 0.3578730523586273, + "learning_rate": 0.0007295916576672572, + "loss": 0.0989, + "num_input_tokens_seen": 109458992, + "step": 50675 + }, + { + "epoch": 8.267536704730832, + "grad_norm": 0.22147579491138458, + "learning_rate": 0.0007295284233480468, + "loss": 0.1213, + "num_input_tokens_seen": 109469392, + "step": 50680 + }, + { + "epoch": 8.268352365415987, + "grad_norm": 0.028804771602153778, + "learning_rate": 0.0007294651843771519, + "loss": 0.113, + "num_input_tokens_seen": 109480112, + "step": 50685 + }, + { + "epoch": 8.269168026101141, + "grad_norm": 0.09923944622278214, + "learning_rate": 0.0007294019407558538, + "loss": 0.0278, + "num_input_tokens_seen": 109491344, + "step": 50690 + }, + { + "epoch": 8.269983686786297, + "grad_norm": 0.006381940096616745, + "learning_rate": 0.0007293386924854346, + "loss": 0.0117, + "num_input_tokens_seen": 109502640, + "step": 50695 + }, + { + "epoch": 8.270799347471453, + "grad_norm": 0.03557540103793144, + "learning_rate": 0.0007292754395671757, + "loss": 0.1572, + "num_input_tokens_seen": 109512464, + "step": 50700 + }, + { + "epoch": 8.271615008156607, + "grad_norm": 0.13613596558570862, + "learning_rate": 0.0007292121820023592, + "loss": 0.0724, + "num_input_tokens_seen": 109524816, + "step": 50705 + }, + { + "epoch": 8.272430668841762, + "grad_norm": 0.48386117815971375, + "learning_rate": 0.000729148919792267, + "loss": 0.0783, + "num_input_tokens_seen": 109535568, + "step": 50710 + }, + { + "epoch": 8.273246329526916, + "grad_norm": 0.005705375224351883, + "learning_rate": 0.000729085652938181, + "loss": 0.0368, + "num_input_tokens_seen": 109546704, + "step": 50715 + }, + { + "epoch": 8.274061990212072, + "grad_norm": 0.023817330598831177, + "learning_rate": 0.0007290223814413841, + "loss": 0.0169, + "num_input_tokens_seen": 109557648, + "step": 50720 + }, + { + "epoch": 8.274877650897226, + "grad_norm": 0.06411412358283997, + "learning_rate": 0.0007289591053031578, + "loss": 0.0384, + "num_input_tokens_seen": 109569136, + "step": 50725 + }, + { + "epoch": 8.275693311582382, + "grad_norm": 0.11758533865213394, + "learning_rate": 0.000728895824524785, + "loss": 0.0779, + "num_input_tokens_seen": 109580144, + "step": 50730 + }, + { + "epoch": 8.276508972267537, + "grad_norm": 0.16610552370548248, + "learning_rate": 0.0007288325391075478, + "loss": 0.0837, + "num_input_tokens_seen": 109590928, + "step": 50735 + }, + { + "epoch": 8.277324632952691, + "grad_norm": 0.021982286125421524, + "learning_rate": 0.000728769249052729, + "loss": 0.1219, + "num_input_tokens_seen": 109601584, + "step": 50740 + }, + { + "epoch": 8.278140293637847, + "grad_norm": 0.1610334813594818, + "learning_rate": 0.000728705954361611, + "loss": 0.0725, + "num_input_tokens_seen": 109611728, + "step": 50745 + }, + { + "epoch": 8.278955954323001, + "grad_norm": 0.003985857591032982, + "learning_rate": 0.0007286426550354768, + "loss": 0.1684, + "num_input_tokens_seen": 109622896, + "step": 50750 + }, + { + "epoch": 8.279771615008157, + "grad_norm": 0.0630965307354927, + "learning_rate": 0.000728579351075609, + "loss": 0.0341, + "num_input_tokens_seen": 109633296, + "step": 50755 + }, + { + "epoch": 8.280587275693312, + "grad_norm": 0.13424259424209595, + "learning_rate": 0.0007285160424832909, + "loss": 0.0536, + "num_input_tokens_seen": 109644336, + "step": 50760 + }, + { + "epoch": 8.281402936378466, + "grad_norm": 0.014872642233967781, + "learning_rate": 0.0007284527292598051, + "loss": 0.0983, + "num_input_tokens_seen": 109653776, + "step": 50765 + }, + { + "epoch": 8.282218597063622, + "grad_norm": 0.009358805604279041, + "learning_rate": 0.0007283894114064351, + "loss": 0.1608, + "num_input_tokens_seen": 109665328, + "step": 50770 + }, + { + "epoch": 8.283034257748776, + "grad_norm": 0.12680892646312714, + "learning_rate": 0.0007283260889244639, + "loss": 0.145, + "num_input_tokens_seen": 109676176, + "step": 50775 + }, + { + "epoch": 8.283849918433932, + "grad_norm": 0.13732105493545532, + "learning_rate": 0.0007282627618151747, + "loss": 0.0679, + "num_input_tokens_seen": 109687120, + "step": 50780 + }, + { + "epoch": 8.284665579119087, + "grad_norm": 0.18857133388519287, + "learning_rate": 0.0007281994300798511, + "loss": 0.1352, + "num_input_tokens_seen": 109697552, + "step": 50785 + }, + { + "epoch": 8.285481239804241, + "grad_norm": 0.16515469551086426, + "learning_rate": 0.0007281360937197767, + "loss": 0.0405, + "num_input_tokens_seen": 109708368, + "step": 50790 + }, + { + "epoch": 8.286296900489397, + "grad_norm": 0.008306358940899372, + "learning_rate": 0.0007280727527362349, + "loss": 0.0822, + "num_input_tokens_seen": 109719536, + "step": 50795 + }, + { + "epoch": 8.28711256117455, + "grad_norm": 0.01918146014213562, + "learning_rate": 0.0007280094071305095, + "loss": 0.0259, + "num_input_tokens_seen": 109730032, + "step": 50800 + }, + { + "epoch": 8.287928221859707, + "grad_norm": 0.09741424024105072, + "learning_rate": 0.0007279460569038841, + "loss": 0.0503, + "num_input_tokens_seen": 109738992, + "step": 50805 + }, + { + "epoch": 8.28874388254486, + "grad_norm": 0.016544492915272713, + "learning_rate": 0.0007278827020576427, + "loss": 0.0188, + "num_input_tokens_seen": 109749776, + "step": 50810 + }, + { + "epoch": 8.289559543230016, + "grad_norm": 0.014726830646395683, + "learning_rate": 0.0007278193425930692, + "loss": 0.0578, + "num_input_tokens_seen": 109761264, + "step": 50815 + }, + { + "epoch": 8.290375203915172, + "grad_norm": 0.0035396378953009844, + "learning_rate": 0.0007277559785114478, + "loss": 0.0045, + "num_input_tokens_seen": 109772304, + "step": 50820 + }, + { + "epoch": 8.291190864600326, + "grad_norm": 0.06589116901159286, + "learning_rate": 0.0007276926098140626, + "loss": 0.0646, + "num_input_tokens_seen": 109783600, + "step": 50825 + }, + { + "epoch": 8.292006525285482, + "grad_norm": 0.010147838853299618, + "learning_rate": 0.0007276292365021979, + "loss": 0.0145, + "num_input_tokens_seen": 109794096, + "step": 50830 + }, + { + "epoch": 8.292822185970635, + "grad_norm": 0.002056955127045512, + "learning_rate": 0.0007275658585771378, + "loss": 0.021, + "num_input_tokens_seen": 109804400, + "step": 50835 + }, + { + "epoch": 8.293637846655791, + "grad_norm": 0.005161386914551258, + "learning_rate": 0.0007275024760401668, + "loss": 0.013, + "num_input_tokens_seen": 109814704, + "step": 50840 + }, + { + "epoch": 8.294453507340947, + "grad_norm": 0.19297684729099274, + "learning_rate": 0.0007274390888925697, + "loss": 0.1108, + "num_input_tokens_seen": 109825264, + "step": 50845 + }, + { + "epoch": 8.2952691680261, + "grad_norm": 0.3012802004814148, + "learning_rate": 0.0007273756971356308, + "loss": 0.1523, + "num_input_tokens_seen": 109837072, + "step": 50850 + }, + { + "epoch": 8.296084828711257, + "grad_norm": 0.17966091632843018, + "learning_rate": 0.000727312300770635, + "loss": 0.04, + "num_input_tokens_seen": 109847920, + "step": 50855 + }, + { + "epoch": 8.29690048939641, + "grad_norm": 0.019909987226128578, + "learning_rate": 0.0007272488997988671, + "loss": 0.0852, + "num_input_tokens_seen": 109858768, + "step": 50860 + }, + { + "epoch": 8.297716150081566, + "grad_norm": 0.02569238841533661, + "learning_rate": 0.000727185494221612, + "loss": 0.1047, + "num_input_tokens_seen": 109869072, + "step": 50865 + }, + { + "epoch": 8.298531810766722, + "grad_norm": 0.002372046699747443, + "learning_rate": 0.0007271220840401546, + "loss": 0.0585, + "num_input_tokens_seen": 109881232, + "step": 50870 + }, + { + "epoch": 8.299347471451876, + "grad_norm": 0.005322215147316456, + "learning_rate": 0.0007270586692557799, + "loss": 0.0141, + "num_input_tokens_seen": 109892368, + "step": 50875 + }, + { + "epoch": 8.300163132137031, + "grad_norm": 0.007450290489941835, + "learning_rate": 0.0007269952498697733, + "loss": 0.0219, + "num_input_tokens_seen": 109903888, + "step": 50880 + }, + { + "epoch": 8.300978792822185, + "grad_norm": 0.09495791792869568, + "learning_rate": 0.0007269318258834202, + "loss": 0.0182, + "num_input_tokens_seen": 109915120, + "step": 50885 + }, + { + "epoch": 8.301794453507341, + "grad_norm": 0.008407175540924072, + "learning_rate": 0.0007268683972980056, + "loss": 0.0262, + "num_input_tokens_seen": 109925840, + "step": 50890 + }, + { + "epoch": 8.302610114192497, + "grad_norm": 0.024781066924333572, + "learning_rate": 0.0007268049641148152, + "loss": 0.1302, + "num_input_tokens_seen": 109936304, + "step": 50895 + }, + { + "epoch": 8.30342577487765, + "grad_norm": 0.01070436555892229, + "learning_rate": 0.0007267415263351343, + "loss": 0.0203, + "num_input_tokens_seen": 109946448, + "step": 50900 + }, + { + "epoch": 8.304241435562806, + "grad_norm": 0.19071489572525024, + "learning_rate": 0.0007266780839602488, + "loss": 0.1507, + "num_input_tokens_seen": 109956432, + "step": 50905 + }, + { + "epoch": 8.30505709624796, + "grad_norm": 0.3626735806465149, + "learning_rate": 0.0007266146369914445, + "loss": 0.1265, + "num_input_tokens_seen": 109967248, + "step": 50910 + }, + { + "epoch": 8.305872756933116, + "grad_norm": 0.38729625940322876, + "learning_rate": 0.0007265511854300069, + "loss": 0.0512, + "num_input_tokens_seen": 109979792, + "step": 50915 + }, + { + "epoch": 8.30668841761827, + "grad_norm": 0.3133726716041565, + "learning_rate": 0.0007264877292772223, + "loss": 0.1247, + "num_input_tokens_seen": 109990480, + "step": 50920 + }, + { + "epoch": 8.307504078303426, + "grad_norm": 0.24243846535682678, + "learning_rate": 0.0007264242685343765, + "loss": 0.1239, + "num_input_tokens_seen": 109999760, + "step": 50925 + }, + { + "epoch": 8.308319738988581, + "grad_norm": 0.19307786226272583, + "learning_rate": 0.0007263608032027557, + "loss": 0.0408, + "num_input_tokens_seen": 110010192, + "step": 50930 + }, + { + "epoch": 8.309135399673735, + "grad_norm": 0.017839960753917694, + "learning_rate": 0.000726297333283646, + "loss": 0.0239, + "num_input_tokens_seen": 110022192, + "step": 50935 + }, + { + "epoch": 8.309951060358891, + "grad_norm": 0.00453083124011755, + "learning_rate": 0.0007262338587783338, + "loss": 0.017, + "num_input_tokens_seen": 110032944, + "step": 50940 + }, + { + "epoch": 8.310766721044045, + "grad_norm": 0.1239013820886612, + "learning_rate": 0.0007261703796881054, + "loss": 0.0101, + "num_input_tokens_seen": 110043408, + "step": 50945 + }, + { + "epoch": 8.3115823817292, + "grad_norm": 0.18398962914943695, + "learning_rate": 0.0007261068960142474, + "loss": 0.035, + "num_input_tokens_seen": 110054448, + "step": 50950 + }, + { + "epoch": 8.312398042414356, + "grad_norm": 0.07020730525255203, + "learning_rate": 0.0007260434077580463, + "loss": 0.018, + "num_input_tokens_seen": 110065072, + "step": 50955 + }, + { + "epoch": 8.31321370309951, + "grad_norm": 0.02656623162329197, + "learning_rate": 0.0007259799149207887, + "loss": 0.0148, + "num_input_tokens_seen": 110075664, + "step": 50960 + }, + { + "epoch": 8.314029363784666, + "grad_norm": 0.0020199622958898544, + "learning_rate": 0.0007259164175037616, + "loss": 0.0139, + "num_input_tokens_seen": 110086672, + "step": 50965 + }, + { + "epoch": 8.31484502446982, + "grad_norm": 0.12150160223245621, + "learning_rate": 0.0007258529155082516, + "loss": 0.0214, + "num_input_tokens_seen": 110097840, + "step": 50970 + }, + { + "epoch": 8.315660685154976, + "grad_norm": 0.031684860587120056, + "learning_rate": 0.0007257894089355458, + "loss": 0.2559, + "num_input_tokens_seen": 110109552, + "step": 50975 + }, + { + "epoch": 8.31647634584013, + "grad_norm": 0.23399962484836578, + "learning_rate": 0.0007257258977869313, + "loss": 0.0723, + "num_input_tokens_seen": 110121072, + "step": 50980 + }, + { + "epoch": 8.317292006525285, + "grad_norm": 0.14578203856945038, + "learning_rate": 0.000725662382063695, + "loss": 0.0523, + "num_input_tokens_seen": 110132432, + "step": 50985 + }, + { + "epoch": 8.318107667210441, + "grad_norm": 0.10246943682432175, + "learning_rate": 0.0007255988617671241, + "loss": 0.0761, + "num_input_tokens_seen": 110144464, + "step": 50990 + }, + { + "epoch": 8.318923327895595, + "grad_norm": 0.0060472646728158, + "learning_rate": 0.0007255353368985063, + "loss": 0.0798, + "num_input_tokens_seen": 110155280, + "step": 50995 + }, + { + "epoch": 8.31973898858075, + "grad_norm": 0.22746527194976807, + "learning_rate": 0.0007254718074591285, + "loss": 0.027, + "num_input_tokens_seen": 110165360, + "step": 51000 + }, + { + "epoch": 8.320554649265905, + "grad_norm": 0.3178712725639343, + "learning_rate": 0.0007254082734502788, + "loss": 0.116, + "num_input_tokens_seen": 110176336, + "step": 51005 + }, + { + "epoch": 8.32137030995106, + "grad_norm": 0.3559790849685669, + "learning_rate": 0.0007253447348732443, + "loss": 0.0362, + "num_input_tokens_seen": 110188624, + "step": 51010 + }, + { + "epoch": 8.322185970636216, + "grad_norm": 0.22412791848182678, + "learning_rate": 0.000725281191729313, + "loss": 0.0574, + "num_input_tokens_seen": 110198768, + "step": 51015 + }, + { + "epoch": 8.32300163132137, + "grad_norm": 0.01915472373366356, + "learning_rate": 0.0007252176440197726, + "loss": 0.013, + "num_input_tokens_seen": 110209456, + "step": 51020 + }, + { + "epoch": 8.323817292006526, + "grad_norm": 0.018906638026237488, + "learning_rate": 0.0007251540917459109, + "loss": 0.0337, + "num_input_tokens_seen": 110220656, + "step": 51025 + }, + { + "epoch": 8.32463295269168, + "grad_norm": 0.09125878661870956, + "learning_rate": 0.0007250905349090158, + "loss": 0.0241, + "num_input_tokens_seen": 110231504, + "step": 51030 + }, + { + "epoch": 8.325448613376835, + "grad_norm": 0.28258514404296875, + "learning_rate": 0.0007250269735103754, + "loss": 0.0414, + "num_input_tokens_seen": 110242704, + "step": 51035 + }, + { + "epoch": 8.326264274061991, + "grad_norm": 0.026500288397073746, + "learning_rate": 0.0007249634075512781, + "loss": 0.0064, + "num_input_tokens_seen": 110253104, + "step": 51040 + }, + { + "epoch": 8.327079934747145, + "grad_norm": 0.002784762065857649, + "learning_rate": 0.0007248998370330119, + "loss": 0.1243, + "num_input_tokens_seen": 110263408, + "step": 51045 + }, + { + "epoch": 8.3278955954323, + "grad_norm": 0.02501026540994644, + "learning_rate": 0.0007248362619568651, + "loss": 0.0126, + "num_input_tokens_seen": 110275536, + "step": 51050 + }, + { + "epoch": 8.328711256117455, + "grad_norm": 0.005268088076263666, + "learning_rate": 0.0007247726823241264, + "loss": 0.1713, + "num_input_tokens_seen": 110285616, + "step": 51055 + }, + { + "epoch": 8.32952691680261, + "grad_norm": 0.004592955578118563, + "learning_rate": 0.0007247090981360841, + "loss": 0.0813, + "num_input_tokens_seen": 110297680, + "step": 51060 + }, + { + "epoch": 8.330342577487766, + "grad_norm": 0.2294696718454361, + "learning_rate": 0.0007246455093940268, + "loss": 0.1092, + "num_input_tokens_seen": 110307120, + "step": 51065 + }, + { + "epoch": 8.33115823817292, + "grad_norm": 0.01248850580304861, + "learning_rate": 0.0007245819160992434, + "loss": 0.0297, + "num_input_tokens_seen": 110317424, + "step": 51070 + }, + { + "epoch": 8.331973898858076, + "grad_norm": 0.15484943985939026, + "learning_rate": 0.0007245183182530224, + "loss": 0.083, + "num_input_tokens_seen": 110328240, + "step": 51075 + }, + { + "epoch": 8.33278955954323, + "grad_norm": 0.01719985157251358, + "learning_rate": 0.0007244547158566531, + "loss": 0.0061, + "num_input_tokens_seen": 110338448, + "step": 51080 + }, + { + "epoch": 8.333605220228385, + "grad_norm": 0.09124539792537689, + "learning_rate": 0.0007243911089114239, + "loss": 0.032, + "num_input_tokens_seen": 110348496, + "step": 51085 + }, + { + "epoch": 8.33442088091354, + "grad_norm": 0.05580779165029526, + "learning_rate": 0.0007243274974186245, + "loss": 0.0204, + "num_input_tokens_seen": 110360400, + "step": 51090 + }, + { + "epoch": 8.335236541598695, + "grad_norm": 0.09825216233730316, + "learning_rate": 0.0007242638813795437, + "loss": 0.0344, + "num_input_tokens_seen": 110371664, + "step": 51095 + }, + { + "epoch": 8.33605220228385, + "grad_norm": 0.009273175150156021, + "learning_rate": 0.0007242002607954708, + "loss": 0.0771, + "num_input_tokens_seen": 110383024, + "step": 51100 + }, + { + "epoch": 8.336867862969005, + "grad_norm": 0.11133985221385956, + "learning_rate": 0.000724136635667695, + "loss": 0.0664, + "num_input_tokens_seen": 110393808, + "step": 51105 + }, + { + "epoch": 8.33768352365416, + "grad_norm": 0.5067927241325378, + "learning_rate": 0.0007240730059975063, + "loss": 0.123, + "num_input_tokens_seen": 110404912, + "step": 51110 + }, + { + "epoch": 8.338499184339314, + "grad_norm": 0.008333737030625343, + "learning_rate": 0.0007240093717861937, + "loss": 0.0251, + "num_input_tokens_seen": 110415920, + "step": 51115 + }, + { + "epoch": 8.33931484502447, + "grad_norm": 0.007308666128665209, + "learning_rate": 0.000723945733035047, + "loss": 0.0115, + "num_input_tokens_seen": 110426416, + "step": 51120 + }, + { + "epoch": 8.340130505709626, + "grad_norm": 0.05521797761321068, + "learning_rate": 0.0007238820897453559, + "loss": 0.2082, + "num_input_tokens_seen": 110438480, + "step": 51125 + }, + { + "epoch": 8.34094616639478, + "grad_norm": 0.02290504239499569, + "learning_rate": 0.0007238184419184104, + "loss": 0.0267, + "num_input_tokens_seen": 110449168, + "step": 51130 + }, + { + "epoch": 8.341761827079935, + "grad_norm": 0.018945492804050446, + "learning_rate": 0.0007237547895555001, + "loss": 0.0522, + "num_input_tokens_seen": 110459472, + "step": 51135 + }, + { + "epoch": 8.34257748776509, + "grad_norm": 0.04555573686957359, + "learning_rate": 0.0007236911326579152, + "loss": 0.0231, + "num_input_tokens_seen": 110469584, + "step": 51140 + }, + { + "epoch": 8.343393148450245, + "grad_norm": 0.012672092765569687, + "learning_rate": 0.0007236274712269457, + "loss": 0.0262, + "num_input_tokens_seen": 110479600, + "step": 51145 + }, + { + "epoch": 8.3442088091354, + "grad_norm": 0.01710429973900318, + "learning_rate": 0.0007235638052638819, + "loss": 0.0671, + "num_input_tokens_seen": 110491600, + "step": 51150 + }, + { + "epoch": 8.345024469820554, + "grad_norm": 0.0054277884773910046, + "learning_rate": 0.0007235001347700139, + "loss": 0.0549, + "num_input_tokens_seen": 110501872, + "step": 51155 + }, + { + "epoch": 8.34584013050571, + "grad_norm": 0.018640436232089996, + "learning_rate": 0.0007234364597466321, + "loss": 0.0379, + "num_input_tokens_seen": 110513328, + "step": 51160 + }, + { + "epoch": 8.346655791190864, + "grad_norm": 0.05840952321887016, + "learning_rate": 0.000723372780195027, + "loss": 0.0388, + "num_input_tokens_seen": 110524496, + "step": 51165 + }, + { + "epoch": 8.34747145187602, + "grad_norm": 0.004298006650060415, + "learning_rate": 0.0007233090961164892, + "loss": 0.0765, + "num_input_tokens_seen": 110535120, + "step": 51170 + }, + { + "epoch": 8.348287112561174, + "grad_norm": 0.12360477447509766, + "learning_rate": 0.000723245407512309, + "loss": 0.0333, + "num_input_tokens_seen": 110545712, + "step": 51175 + }, + { + "epoch": 8.34910277324633, + "grad_norm": 0.19511815905570984, + "learning_rate": 0.0007231817143837778, + "loss": 0.051, + "num_input_tokens_seen": 110555952, + "step": 51180 + }, + { + "epoch": 8.349918433931485, + "grad_norm": 0.002610130002722144, + "learning_rate": 0.0007231180167321858, + "loss": 0.0388, + "num_input_tokens_seen": 110566320, + "step": 51185 + }, + { + "epoch": 8.350734094616639, + "grad_norm": 0.17444966733455658, + "learning_rate": 0.0007230543145588242, + "loss": 0.1123, + "num_input_tokens_seen": 110577744, + "step": 51190 + }, + { + "epoch": 8.351549755301795, + "grad_norm": 0.22752845287322998, + "learning_rate": 0.000722990607864984, + "loss": 0.1407, + "num_input_tokens_seen": 110589392, + "step": 51195 + }, + { + "epoch": 8.352365415986949, + "grad_norm": 0.19195815920829773, + "learning_rate": 0.0007229268966519562, + "loss": 0.0575, + "num_input_tokens_seen": 110600880, + "step": 51200 + }, + { + "epoch": 8.353181076672104, + "grad_norm": 0.08895209431648254, + "learning_rate": 0.0007228631809210321, + "loss": 0.0465, + "num_input_tokens_seen": 110611120, + "step": 51205 + }, + { + "epoch": 8.35399673735726, + "grad_norm": 0.048343852162361145, + "learning_rate": 0.0007227994606735029, + "loss": 0.091, + "num_input_tokens_seen": 110621584, + "step": 51210 + }, + { + "epoch": 8.354812398042414, + "grad_norm": 0.30052274465560913, + "learning_rate": 0.0007227357359106598, + "loss": 0.1321, + "num_input_tokens_seen": 110632656, + "step": 51215 + }, + { + "epoch": 8.35562805872757, + "grad_norm": 0.00870759878307581, + "learning_rate": 0.0007226720066337946, + "loss": 0.0447, + "num_input_tokens_seen": 110644208, + "step": 51220 + }, + { + "epoch": 8.356443719412724, + "grad_norm": 0.051032643765211105, + "learning_rate": 0.0007226082728441989, + "loss": 0.0856, + "num_input_tokens_seen": 110654832, + "step": 51225 + }, + { + "epoch": 8.35725938009788, + "grad_norm": 0.015562736429274082, + "learning_rate": 0.0007225445345431638, + "loss": 0.0081, + "num_input_tokens_seen": 110665648, + "step": 51230 + }, + { + "epoch": 8.358075040783035, + "grad_norm": 0.008317803032696247, + "learning_rate": 0.0007224807917319817, + "loss": 0.0205, + "num_input_tokens_seen": 110675280, + "step": 51235 + }, + { + "epoch": 8.358890701468189, + "grad_norm": 0.01899358443915844, + "learning_rate": 0.000722417044411944, + "loss": 0.0458, + "num_input_tokens_seen": 110685296, + "step": 51240 + }, + { + "epoch": 8.359706362153345, + "grad_norm": 0.009460066445171833, + "learning_rate": 0.0007223532925843427, + "loss": 0.0061, + "num_input_tokens_seen": 110695408, + "step": 51245 + }, + { + "epoch": 8.360522022838499, + "grad_norm": 0.02281215600669384, + "learning_rate": 0.0007222895362504698, + "loss": 0.0356, + "num_input_tokens_seen": 110707312, + "step": 51250 + }, + { + "epoch": 8.361337683523654, + "grad_norm": 0.1802525818347931, + "learning_rate": 0.0007222257754116176, + "loss": 0.1316, + "num_input_tokens_seen": 110718288, + "step": 51255 + }, + { + "epoch": 8.362153344208808, + "grad_norm": 0.011667085811495781, + "learning_rate": 0.000722162010069078, + "loss": 0.0183, + "num_input_tokens_seen": 110728880, + "step": 51260 + }, + { + "epoch": 8.362969004893964, + "grad_norm": 0.005664953961968422, + "learning_rate": 0.0007220982402241436, + "loss": 0.1559, + "num_input_tokens_seen": 110741104, + "step": 51265 + }, + { + "epoch": 8.36378466557912, + "grad_norm": 0.005667020566761494, + "learning_rate": 0.0007220344658781065, + "loss": 0.0779, + "num_input_tokens_seen": 110751888, + "step": 51270 + }, + { + "epoch": 8.364600326264274, + "grad_norm": 0.025366060435771942, + "learning_rate": 0.0007219706870322594, + "loss": 0.0326, + "num_input_tokens_seen": 110762640, + "step": 51275 + }, + { + "epoch": 8.36541598694943, + "grad_norm": 0.19890545308589935, + "learning_rate": 0.0007219069036878945, + "loss": 0.161, + "num_input_tokens_seen": 110772912, + "step": 51280 + }, + { + "epoch": 8.366231647634583, + "grad_norm": 0.07803814113140106, + "learning_rate": 0.0007218431158463048, + "loss": 0.0557, + "num_input_tokens_seen": 110784592, + "step": 51285 + }, + { + "epoch": 8.367047308319739, + "grad_norm": 0.19067999720573425, + "learning_rate": 0.000721779323508783, + "loss": 0.0339, + "num_input_tokens_seen": 110795216, + "step": 51290 + }, + { + "epoch": 8.367862969004895, + "grad_norm": 0.43747133016586304, + "learning_rate": 0.0007217155266766217, + "loss": 0.166, + "num_input_tokens_seen": 110805968, + "step": 51295 + }, + { + "epoch": 8.368678629690049, + "grad_norm": 0.020700732246041298, + "learning_rate": 0.0007216517253511143, + "loss": 0.0165, + "num_input_tokens_seen": 110816112, + "step": 51300 + }, + { + "epoch": 8.369494290375204, + "grad_norm": 0.2714422941207886, + "learning_rate": 0.0007215879195335531, + "loss": 0.0797, + "num_input_tokens_seen": 110826384, + "step": 51305 + }, + { + "epoch": 8.370309951060358, + "grad_norm": 0.02906269021332264, + "learning_rate": 0.0007215241092252319, + "loss": 0.0776, + "num_input_tokens_seen": 110835728, + "step": 51310 + }, + { + "epoch": 8.371125611745514, + "grad_norm": 0.006266695912927389, + "learning_rate": 0.0007214602944274435, + "loss": 0.0088, + "num_input_tokens_seen": 110846768, + "step": 51315 + }, + { + "epoch": 8.37194127243067, + "grad_norm": 0.019487502053380013, + "learning_rate": 0.0007213964751414812, + "loss": 0.0152, + "num_input_tokens_seen": 110856720, + "step": 51320 + }, + { + "epoch": 8.372756933115824, + "grad_norm": 0.005166689399629831, + "learning_rate": 0.0007213326513686386, + "loss": 0.0128, + "num_input_tokens_seen": 110867536, + "step": 51325 + }, + { + "epoch": 8.37357259380098, + "grad_norm": 0.0032772270496934652, + "learning_rate": 0.0007212688231102091, + "loss": 0.0934, + "num_input_tokens_seen": 110878320, + "step": 51330 + }, + { + "epoch": 8.374388254486133, + "grad_norm": 0.022973116487264633, + "learning_rate": 0.000721204990367486, + "loss": 0.1023, + "num_input_tokens_seen": 110888816, + "step": 51335 + }, + { + "epoch": 8.375203915171289, + "grad_norm": 0.2152455896139145, + "learning_rate": 0.0007211411531417633, + "loss": 0.223, + "num_input_tokens_seen": 110899792, + "step": 51340 + }, + { + "epoch": 8.376019575856443, + "grad_norm": 0.2188149094581604, + "learning_rate": 0.0007210773114343345, + "loss": 0.1133, + "num_input_tokens_seen": 110909584, + "step": 51345 + }, + { + "epoch": 8.376835236541599, + "grad_norm": 0.020212259143590927, + "learning_rate": 0.0007210134652464935, + "loss": 0.0428, + "num_input_tokens_seen": 110920752, + "step": 51350 + }, + { + "epoch": 8.377650897226754, + "grad_norm": 0.004299887455999851, + "learning_rate": 0.0007209496145795343, + "loss": 0.0371, + "num_input_tokens_seen": 110931248, + "step": 51355 + }, + { + "epoch": 8.378466557911908, + "grad_norm": 0.011292368173599243, + "learning_rate": 0.000720885759434751, + "loss": 0.0168, + "num_input_tokens_seen": 110940432, + "step": 51360 + }, + { + "epoch": 8.379282218597064, + "grad_norm": 0.017861951142549515, + "learning_rate": 0.0007208218998134375, + "loss": 0.0674, + "num_input_tokens_seen": 110950832, + "step": 51365 + }, + { + "epoch": 8.380097879282218, + "grad_norm": 0.02082081325352192, + "learning_rate": 0.000720758035716888, + "loss": 0.0143, + "num_input_tokens_seen": 110960624, + "step": 51370 + }, + { + "epoch": 8.380913539967374, + "grad_norm": 0.13963234424591064, + "learning_rate": 0.0007206941671463969, + "loss": 0.1249, + "num_input_tokens_seen": 110969840, + "step": 51375 + }, + { + "epoch": 8.38172920065253, + "grad_norm": 0.005206751171499491, + "learning_rate": 0.0007206302941032586, + "loss": 0.1266, + "num_input_tokens_seen": 110981680, + "step": 51380 + }, + { + "epoch": 8.382544861337683, + "grad_norm": 0.04112826660275459, + "learning_rate": 0.0007205664165887673, + "loss": 0.0323, + "num_input_tokens_seen": 110992112, + "step": 51385 + }, + { + "epoch": 8.383360522022839, + "grad_norm": 0.05072109028697014, + "learning_rate": 0.000720502534604218, + "loss": 0.0369, + "num_input_tokens_seen": 111001616, + "step": 51390 + }, + { + "epoch": 8.384176182707993, + "grad_norm": 0.026560001075267792, + "learning_rate": 0.0007204386481509049, + "loss": 0.0208, + "num_input_tokens_seen": 111011568, + "step": 51395 + }, + { + "epoch": 8.384991843393149, + "grad_norm": 0.32577085494995117, + "learning_rate": 0.0007203747572301231, + "loss": 0.1754, + "num_input_tokens_seen": 111022128, + "step": 51400 + }, + { + "epoch": 8.385807504078304, + "grad_norm": 0.09001737087965012, + "learning_rate": 0.0007203108618431672, + "loss": 0.0991, + "num_input_tokens_seen": 111032496, + "step": 51405 + }, + { + "epoch": 8.386623164763458, + "grad_norm": 0.04394825920462608, + "learning_rate": 0.0007202469619913322, + "loss": 0.0641, + "num_input_tokens_seen": 111043216, + "step": 51410 + }, + { + "epoch": 8.387438825448614, + "grad_norm": 0.16462813317775726, + "learning_rate": 0.0007201830576759132, + "loss": 0.0273, + "num_input_tokens_seen": 111054032, + "step": 51415 + }, + { + "epoch": 8.388254486133768, + "grad_norm": 0.34273892641067505, + "learning_rate": 0.0007201191488982051, + "loss": 0.1403, + "num_input_tokens_seen": 111065072, + "step": 51420 + }, + { + "epoch": 8.389070146818923, + "grad_norm": 0.018835965543985367, + "learning_rate": 0.0007200552356595031, + "loss": 0.1901, + "num_input_tokens_seen": 111076208, + "step": 51425 + }, + { + "epoch": 8.38988580750408, + "grad_norm": 0.004108819179236889, + "learning_rate": 0.0007199913179611029, + "loss": 0.0192, + "num_input_tokens_seen": 111087728, + "step": 51430 + }, + { + "epoch": 8.390701468189233, + "grad_norm": 0.03467337414622307, + "learning_rate": 0.0007199273958042994, + "loss": 0.1125, + "num_input_tokens_seen": 111098928, + "step": 51435 + }, + { + "epoch": 8.391517128874389, + "grad_norm": 0.04100308567285538, + "learning_rate": 0.0007198634691903882, + "loss": 0.1222, + "num_input_tokens_seen": 111110320, + "step": 51440 + }, + { + "epoch": 8.392332789559543, + "grad_norm": 0.27813297510147095, + "learning_rate": 0.0007197995381206649, + "loss": 0.1024, + "num_input_tokens_seen": 111120112, + "step": 51445 + }, + { + "epoch": 8.393148450244698, + "grad_norm": 0.2542005777359009, + "learning_rate": 0.0007197356025964252, + "loss": 0.1429, + "num_input_tokens_seen": 111131568, + "step": 51450 + }, + { + "epoch": 8.393964110929852, + "grad_norm": 0.016471102833747864, + "learning_rate": 0.0007196716626189646, + "loss": 0.0473, + "num_input_tokens_seen": 111142992, + "step": 51455 + }, + { + "epoch": 8.394779771615008, + "grad_norm": 0.06641436368227005, + "learning_rate": 0.0007196077181895792, + "loss": 0.1069, + "num_input_tokens_seen": 111154512, + "step": 51460 + }, + { + "epoch": 8.395595432300164, + "grad_norm": 0.1935798078775406, + "learning_rate": 0.0007195437693095647, + "loss": 0.1171, + "num_input_tokens_seen": 111164304, + "step": 51465 + }, + { + "epoch": 8.396411092985318, + "grad_norm": 0.019165636971592903, + "learning_rate": 0.0007194798159802174, + "loss": 0.0137, + "num_input_tokens_seen": 111174832, + "step": 51470 + }, + { + "epoch": 8.397226753670473, + "grad_norm": 0.2816579043865204, + "learning_rate": 0.0007194158582028332, + "loss": 0.2435, + "num_input_tokens_seen": 111184496, + "step": 51475 + }, + { + "epoch": 8.398042414355627, + "grad_norm": 0.010307567194104195, + "learning_rate": 0.0007193518959787081, + "loss": 0.1182, + "num_input_tokens_seen": 111194672, + "step": 51480 + }, + { + "epoch": 8.398858075040783, + "grad_norm": 0.43253281712532043, + "learning_rate": 0.0007192879293091386, + "loss": 0.0763, + "num_input_tokens_seen": 111205328, + "step": 51485 + }, + { + "epoch": 8.399673735725939, + "grad_norm": 0.026115600019693375, + "learning_rate": 0.000719223958195421, + "loss": 0.029, + "num_input_tokens_seen": 111216944, + "step": 51490 + }, + { + "epoch": 8.400489396411093, + "grad_norm": 0.004687016364187002, + "learning_rate": 0.0007191599826388518, + "loss": 0.1128, + "num_input_tokens_seen": 111228400, + "step": 51495 + }, + { + "epoch": 8.401305057096248, + "grad_norm": 0.003850112436339259, + "learning_rate": 0.0007190960026407276, + "loss": 0.0425, + "num_input_tokens_seen": 111239216, + "step": 51500 + }, + { + "epoch": 8.402120717781402, + "grad_norm": 0.02196848951280117, + "learning_rate": 0.0007190320182023449, + "loss": 0.0334, + "num_input_tokens_seen": 111249904, + "step": 51505 + }, + { + "epoch": 8.402936378466558, + "grad_norm": 0.10487034171819687, + "learning_rate": 0.0007189680293250005, + "loss": 0.0302, + "num_input_tokens_seen": 111261808, + "step": 51510 + }, + { + "epoch": 8.403752039151712, + "grad_norm": 0.03270275518298149, + "learning_rate": 0.0007189040360099913, + "loss": 0.0364, + "num_input_tokens_seen": 111273136, + "step": 51515 + }, + { + "epoch": 8.404567699836868, + "grad_norm": 0.04307975620031357, + "learning_rate": 0.000718840038258614, + "loss": 0.03, + "num_input_tokens_seen": 111285488, + "step": 51520 + }, + { + "epoch": 8.405383360522023, + "grad_norm": 0.05347808450460434, + "learning_rate": 0.0007187760360721658, + "loss": 0.0384, + "num_input_tokens_seen": 111297296, + "step": 51525 + }, + { + "epoch": 8.406199021207177, + "grad_norm": 0.046753790229558945, + "learning_rate": 0.0007187120294519434, + "loss": 0.0718, + "num_input_tokens_seen": 111308784, + "step": 51530 + }, + { + "epoch": 8.407014681892333, + "grad_norm": 0.016464874148368835, + "learning_rate": 0.0007186480183992446, + "loss": 0.0653, + "num_input_tokens_seen": 111318928, + "step": 51535 + }, + { + "epoch": 8.407830342577487, + "grad_norm": 0.022152036428451538, + "learning_rate": 0.0007185840029153663, + "loss": 0.1133, + "num_input_tokens_seen": 111330320, + "step": 51540 + }, + { + "epoch": 8.408646003262643, + "grad_norm": 0.06510874629020691, + "learning_rate": 0.0007185199830016058, + "loss": 0.0687, + "num_input_tokens_seen": 111341712, + "step": 51545 + }, + { + "epoch": 8.409461663947798, + "grad_norm": 0.39695748686790466, + "learning_rate": 0.0007184559586592606, + "loss": 0.078, + "num_input_tokens_seen": 111352688, + "step": 51550 + }, + { + "epoch": 8.410277324632952, + "grad_norm": 0.06608985364437103, + "learning_rate": 0.0007183919298896283, + "loss": 0.0637, + "num_input_tokens_seen": 111364144, + "step": 51555 + }, + { + "epoch": 8.411092985318108, + "grad_norm": 0.23177236318588257, + "learning_rate": 0.0007183278966940065, + "loss": 0.0279, + "num_input_tokens_seen": 111374512, + "step": 51560 + }, + { + "epoch": 8.411908646003262, + "grad_norm": 0.014381797052919865, + "learning_rate": 0.000718263859073693, + "loss": 0.0146, + "num_input_tokens_seen": 111384592, + "step": 51565 + }, + { + "epoch": 8.412724306688418, + "grad_norm": 0.3006967306137085, + "learning_rate": 0.0007181998170299854, + "loss": 0.1615, + "num_input_tokens_seen": 111396112, + "step": 51570 + }, + { + "epoch": 8.413539967373573, + "grad_norm": 0.029067158699035645, + "learning_rate": 0.0007181357705641818, + "loss": 0.0813, + "num_input_tokens_seen": 111407920, + "step": 51575 + }, + { + "epoch": 8.414355628058727, + "grad_norm": 0.4092482924461365, + "learning_rate": 0.0007180717196775799, + "loss": 0.157, + "num_input_tokens_seen": 111418320, + "step": 51580 + }, + { + "epoch": 8.415171288743883, + "grad_norm": 0.02459162473678589, + "learning_rate": 0.0007180076643714781, + "loss": 0.1154, + "num_input_tokens_seen": 111429200, + "step": 51585 + }, + { + "epoch": 8.415986949429037, + "grad_norm": 0.03159189224243164, + "learning_rate": 0.0007179436046471743, + "loss": 0.0434, + "num_input_tokens_seen": 111440336, + "step": 51590 + }, + { + "epoch": 8.416802610114193, + "grad_norm": 0.2111511081457138, + "learning_rate": 0.0007178795405059671, + "loss": 0.0707, + "num_input_tokens_seen": 111451920, + "step": 51595 + }, + { + "epoch": 8.417618270799348, + "grad_norm": 0.2789364457130432, + "learning_rate": 0.0007178154719491545, + "loss": 0.0808, + "num_input_tokens_seen": 111463344, + "step": 51600 + }, + { + "epoch": 8.418433931484502, + "grad_norm": 0.06086054444313049, + "learning_rate": 0.0007177513989780349, + "loss": 0.1239, + "num_input_tokens_seen": 111474256, + "step": 51605 + }, + { + "epoch": 8.419249592169658, + "grad_norm": 0.019520027562975883, + "learning_rate": 0.0007176873215939072, + "loss": 0.0895, + "num_input_tokens_seen": 111486544, + "step": 51610 + }, + { + "epoch": 8.420065252854812, + "grad_norm": 0.14331720769405365, + "learning_rate": 0.0007176232397980696, + "loss": 0.1271, + "num_input_tokens_seen": 111496752, + "step": 51615 + }, + { + "epoch": 8.420880913539968, + "grad_norm": 0.0736565813422203, + "learning_rate": 0.000717559153591821, + "loss": 0.0223, + "num_input_tokens_seen": 111507696, + "step": 51620 + }, + { + "epoch": 8.421696574225122, + "grad_norm": 0.028831366449594498, + "learning_rate": 0.0007174950629764602, + "loss": 0.0343, + "num_input_tokens_seen": 111517040, + "step": 51625 + }, + { + "epoch": 8.422512234910277, + "grad_norm": 0.020894574001431465, + "learning_rate": 0.0007174309679532859, + "loss": 0.0226, + "num_input_tokens_seen": 111527824, + "step": 51630 + }, + { + "epoch": 8.423327895595433, + "grad_norm": 0.3167615830898285, + "learning_rate": 0.0007173668685235973, + "loss": 0.0735, + "num_input_tokens_seen": 111538576, + "step": 51635 + }, + { + "epoch": 8.424143556280587, + "grad_norm": 0.23885099589824677, + "learning_rate": 0.0007173027646886934, + "loss": 0.0408, + "num_input_tokens_seen": 111548976, + "step": 51640 + }, + { + "epoch": 8.424959216965743, + "grad_norm": 0.008945178240537643, + "learning_rate": 0.0007172386564498733, + "loss": 0.1267, + "num_input_tokens_seen": 111560560, + "step": 51645 + }, + { + "epoch": 8.425774877650896, + "grad_norm": 0.03208020329475403, + "learning_rate": 0.0007171745438084362, + "loss": 0.1069, + "num_input_tokens_seen": 111571600, + "step": 51650 + }, + { + "epoch": 8.426590538336052, + "grad_norm": 0.008227204903960228, + "learning_rate": 0.0007171104267656814, + "loss": 0.0583, + "num_input_tokens_seen": 111583920, + "step": 51655 + }, + { + "epoch": 8.427406199021208, + "grad_norm": 0.08085020631551743, + "learning_rate": 0.0007170463053229085, + "loss": 0.0314, + "num_input_tokens_seen": 111593264, + "step": 51660 + }, + { + "epoch": 8.428221859706362, + "grad_norm": 0.11244131624698639, + "learning_rate": 0.0007169821794814168, + "loss": 0.056, + "num_input_tokens_seen": 111602416, + "step": 51665 + }, + { + "epoch": 8.429037520391518, + "grad_norm": 0.02487485483288765, + "learning_rate": 0.000716918049242506, + "loss": 0.0144, + "num_input_tokens_seen": 111614448, + "step": 51670 + }, + { + "epoch": 8.429853181076671, + "grad_norm": 0.01839791052043438, + "learning_rate": 0.0007168539146074757, + "loss": 0.015, + "num_input_tokens_seen": 111625360, + "step": 51675 + }, + { + "epoch": 8.430668841761827, + "grad_norm": 0.20842097699642181, + "learning_rate": 0.0007167897755776258, + "loss": 0.0672, + "num_input_tokens_seen": 111636720, + "step": 51680 + }, + { + "epoch": 8.431484502446983, + "grad_norm": 0.07622958719730377, + "learning_rate": 0.0007167256321542561, + "loss": 0.1013, + "num_input_tokens_seen": 111647312, + "step": 51685 + }, + { + "epoch": 8.432300163132137, + "grad_norm": 0.07612695544958115, + "learning_rate": 0.0007166614843386666, + "loss": 0.0439, + "num_input_tokens_seen": 111657968, + "step": 51690 + }, + { + "epoch": 8.433115823817293, + "grad_norm": 0.20850707590579987, + "learning_rate": 0.0007165973321321571, + "loss": 0.0798, + "num_input_tokens_seen": 111669040, + "step": 51695 + }, + { + "epoch": 8.433931484502446, + "grad_norm": 0.13440640270709991, + "learning_rate": 0.0007165331755360281, + "loss": 0.0137, + "num_input_tokens_seen": 111680560, + "step": 51700 + }, + { + "epoch": 8.434747145187602, + "grad_norm": 0.004296013154089451, + "learning_rate": 0.0007164690145515793, + "loss": 0.0736, + "num_input_tokens_seen": 111691408, + "step": 51705 + }, + { + "epoch": 8.435562805872756, + "grad_norm": 0.026044311001896858, + "learning_rate": 0.0007164048491801116, + "loss": 0.0153, + "num_input_tokens_seen": 111701776, + "step": 51710 + }, + { + "epoch": 8.436378466557912, + "grad_norm": 0.218144953250885, + "learning_rate": 0.0007163406794229249, + "loss": 0.1054, + "num_input_tokens_seen": 111712880, + "step": 51715 + }, + { + "epoch": 8.437194127243067, + "grad_norm": 0.06459010392427444, + "learning_rate": 0.0007162765052813199, + "loss": 0.0305, + "num_input_tokens_seen": 111725520, + "step": 51720 + }, + { + "epoch": 8.438009787928221, + "grad_norm": 0.009473503567278385, + "learning_rate": 0.0007162123267565972, + "loss": 0.0265, + "num_input_tokens_seen": 111736240, + "step": 51725 + }, + { + "epoch": 8.438825448613377, + "grad_norm": 0.1754245012998581, + "learning_rate": 0.0007161481438500574, + "loss": 0.0571, + "num_input_tokens_seen": 111747632, + "step": 51730 + }, + { + "epoch": 8.439641109298531, + "grad_norm": 0.007052075117826462, + "learning_rate": 0.0007160839565630014, + "loss": 0.0112, + "num_input_tokens_seen": 111758224, + "step": 51735 + }, + { + "epoch": 8.440456769983687, + "grad_norm": 0.06561672687530518, + "learning_rate": 0.0007160197648967298, + "loss": 0.0301, + "num_input_tokens_seen": 111770096, + "step": 51740 + }, + { + "epoch": 8.441272430668842, + "grad_norm": 0.4638075530529022, + "learning_rate": 0.0007159555688525434, + "loss": 0.0888, + "num_input_tokens_seen": 111780560, + "step": 51745 + }, + { + "epoch": 8.442088091353996, + "grad_norm": 0.25027647614479065, + "learning_rate": 0.0007158913684317437, + "loss": 0.118, + "num_input_tokens_seen": 111791408, + "step": 51750 + }, + { + "epoch": 8.442903752039152, + "grad_norm": 0.010236898437142372, + "learning_rate": 0.0007158271636356315, + "loss": 0.0693, + "num_input_tokens_seen": 111802096, + "step": 51755 + }, + { + "epoch": 8.443719412724306, + "grad_norm": 0.021024169400334358, + "learning_rate": 0.000715762954465508, + "loss": 0.089, + "num_input_tokens_seen": 111813136, + "step": 51760 + }, + { + "epoch": 8.444535073409462, + "grad_norm": 0.0077271731570363045, + "learning_rate": 0.0007156987409226745, + "loss": 0.0293, + "num_input_tokens_seen": 111824624, + "step": 51765 + }, + { + "epoch": 8.445350734094617, + "grad_norm": 0.010105198249220848, + "learning_rate": 0.0007156345230084325, + "loss": 0.0382, + "num_input_tokens_seen": 111834736, + "step": 51770 + }, + { + "epoch": 8.446166394779771, + "grad_norm": 0.012942255474627018, + "learning_rate": 0.0007155703007240832, + "loss": 0.0033, + "num_input_tokens_seen": 111846192, + "step": 51775 + }, + { + "epoch": 8.446982055464927, + "grad_norm": 0.0030060415156185627, + "learning_rate": 0.0007155060740709284, + "loss": 0.0844, + "num_input_tokens_seen": 111858096, + "step": 51780 + }, + { + "epoch": 8.447797716150081, + "grad_norm": 0.04821230471134186, + "learning_rate": 0.0007154418430502696, + "loss": 0.0281, + "num_input_tokens_seen": 111868720, + "step": 51785 + }, + { + "epoch": 8.448613376835237, + "grad_norm": 0.005300053860992193, + "learning_rate": 0.0007153776076634084, + "loss": 0.0975, + "num_input_tokens_seen": 111879888, + "step": 51790 + }, + { + "epoch": 8.449429037520392, + "grad_norm": 0.05414601042866707, + "learning_rate": 0.0007153133679116469, + "loss": 0.0067, + "num_input_tokens_seen": 111890672, + "step": 51795 + }, + { + "epoch": 8.450244698205546, + "grad_norm": 0.09117277711629868, + "learning_rate": 0.0007152491237962867, + "loss": 0.018, + "num_input_tokens_seen": 111901936, + "step": 51800 + }, + { + "epoch": 8.451060358890702, + "grad_norm": 0.23480220139026642, + "learning_rate": 0.0007151848753186301, + "loss": 0.0746, + "num_input_tokens_seen": 111912656, + "step": 51805 + }, + { + "epoch": 8.451876019575856, + "grad_norm": 0.2549675405025482, + "learning_rate": 0.000715120622479979, + "loss": 0.0947, + "num_input_tokens_seen": 111922448, + "step": 51810 + }, + { + "epoch": 8.452691680261012, + "grad_norm": 0.004847115837037563, + "learning_rate": 0.0007150563652816355, + "loss": 0.1074, + "num_input_tokens_seen": 111933200, + "step": 51815 + }, + { + "epoch": 8.453507340946166, + "grad_norm": 0.2996913194656372, + "learning_rate": 0.0007149921037249021, + "loss": 0.2818, + "num_input_tokens_seen": 111944048, + "step": 51820 + }, + { + "epoch": 8.454323001631321, + "grad_norm": 0.08265216648578644, + "learning_rate": 0.0007149278378110808, + "loss": 0.0484, + "num_input_tokens_seen": 111955184, + "step": 51825 + }, + { + "epoch": 8.455138662316477, + "grad_norm": 0.033596016466617584, + "learning_rate": 0.0007148635675414743, + "loss": 0.0412, + "num_input_tokens_seen": 111967632, + "step": 51830 + }, + { + "epoch": 8.455954323001631, + "grad_norm": 0.014473401010036469, + "learning_rate": 0.000714799292917385, + "loss": 0.0857, + "num_input_tokens_seen": 111978160, + "step": 51835 + }, + { + "epoch": 8.456769983686787, + "grad_norm": 0.10292612016201019, + "learning_rate": 0.0007147350139401156, + "loss": 0.024, + "num_input_tokens_seen": 111988688, + "step": 51840 + }, + { + "epoch": 8.45758564437194, + "grad_norm": 0.011349059641361237, + "learning_rate": 0.0007146707306109687, + "loss": 0.0143, + "num_input_tokens_seen": 111999024, + "step": 51845 + }, + { + "epoch": 8.458401305057096, + "grad_norm": 0.07083631306886673, + "learning_rate": 0.000714606442931247, + "loss": 0.039, + "num_input_tokens_seen": 112010544, + "step": 51850 + }, + { + "epoch": 8.459216965742252, + "grad_norm": 0.0234207920730114, + "learning_rate": 0.0007145421509022536, + "loss": 0.0425, + "num_input_tokens_seen": 112021456, + "step": 51855 + }, + { + "epoch": 8.460032626427406, + "grad_norm": 0.10967404395341873, + "learning_rate": 0.0007144778545252914, + "loss": 0.1617, + "num_input_tokens_seen": 112032528, + "step": 51860 + }, + { + "epoch": 8.460848287112562, + "grad_norm": 0.04319612681865692, + "learning_rate": 0.0007144135538016633, + "loss": 0.0911, + "num_input_tokens_seen": 112043376, + "step": 51865 + }, + { + "epoch": 8.461663947797716, + "grad_norm": 0.02937161736190319, + "learning_rate": 0.0007143492487326726, + "loss": 0.0352, + "num_input_tokens_seen": 112054928, + "step": 51870 + }, + { + "epoch": 8.462479608482871, + "grad_norm": 0.04233347997069359, + "learning_rate": 0.0007142849393196223, + "loss": 0.1164, + "num_input_tokens_seen": 112066352, + "step": 51875 + }, + { + "epoch": 8.463295269168025, + "grad_norm": 0.003394125262275338, + "learning_rate": 0.000714220625563816, + "loss": 0.0085, + "num_input_tokens_seen": 112077584, + "step": 51880 + }, + { + "epoch": 8.464110929853181, + "grad_norm": 0.20848369598388672, + "learning_rate": 0.0007141563074665571, + "loss": 0.0893, + "num_input_tokens_seen": 112088560, + "step": 51885 + }, + { + "epoch": 8.464926590538337, + "grad_norm": 0.2927113175392151, + "learning_rate": 0.0007140919850291488, + "loss": 0.0801, + "num_input_tokens_seen": 112099696, + "step": 51890 + }, + { + "epoch": 8.46574225122349, + "grad_norm": 0.18456770479679108, + "learning_rate": 0.0007140276582528947, + "loss": 0.0724, + "num_input_tokens_seen": 112111728, + "step": 51895 + }, + { + "epoch": 8.466557911908646, + "grad_norm": 0.01184168178588152, + "learning_rate": 0.0007139633271390988, + "loss": 0.0117, + "num_input_tokens_seen": 112121904, + "step": 51900 + }, + { + "epoch": 8.4673735725938, + "grad_norm": 0.01245115976780653, + "learning_rate": 0.0007138989916890644, + "loss": 0.0217, + "num_input_tokens_seen": 112133168, + "step": 51905 + }, + { + "epoch": 8.468189233278956, + "grad_norm": 0.003608755301684141, + "learning_rate": 0.0007138346519040959, + "loss": 0.0067, + "num_input_tokens_seen": 112144176, + "step": 51910 + }, + { + "epoch": 8.469004893964112, + "grad_norm": 0.017230842262506485, + "learning_rate": 0.0007137703077854967, + "loss": 0.0537, + "num_input_tokens_seen": 112155664, + "step": 51915 + }, + { + "epoch": 8.469820554649266, + "grad_norm": 0.00256637716665864, + "learning_rate": 0.0007137059593345711, + "loss": 0.0366, + "num_input_tokens_seen": 112167184, + "step": 51920 + }, + { + "epoch": 8.470636215334421, + "grad_norm": 0.00391194224357605, + "learning_rate": 0.0007136416065526231, + "loss": 0.0715, + "num_input_tokens_seen": 112177936, + "step": 51925 + }, + { + "epoch": 8.471451876019575, + "grad_norm": 0.07863806933164597, + "learning_rate": 0.0007135772494409569, + "loss": 0.1055, + "num_input_tokens_seen": 112189520, + "step": 51930 + }, + { + "epoch": 8.47226753670473, + "grad_norm": 0.004654384218156338, + "learning_rate": 0.0007135128880008768, + "loss": 0.0256, + "num_input_tokens_seen": 112199984, + "step": 51935 + }, + { + "epoch": 8.473083197389887, + "grad_norm": 0.0199937354773283, + "learning_rate": 0.0007134485222336873, + "loss": 0.0288, + "num_input_tokens_seen": 112210704, + "step": 51940 + }, + { + "epoch": 8.47389885807504, + "grad_norm": 0.302824467420578, + "learning_rate": 0.0007133841521406925, + "loss": 0.0393, + "num_input_tokens_seen": 112222224, + "step": 51945 + }, + { + "epoch": 8.474714518760196, + "grad_norm": 0.017630685120821, + "learning_rate": 0.0007133197777231973, + "loss": 0.014, + "num_input_tokens_seen": 112233456, + "step": 51950 + }, + { + "epoch": 8.47553017944535, + "grad_norm": 0.11950056254863739, + "learning_rate": 0.0007132553989825061, + "loss": 0.0268, + "num_input_tokens_seen": 112244720, + "step": 51955 + }, + { + "epoch": 8.476345840130506, + "grad_norm": 0.0019255392253398895, + "learning_rate": 0.0007131910159199238, + "loss": 0.0532, + "num_input_tokens_seen": 112253680, + "step": 51960 + }, + { + "epoch": 8.477161500815662, + "grad_norm": 0.002990216948091984, + "learning_rate": 0.000713126628536755, + "loss": 0.0182, + "num_input_tokens_seen": 112264144, + "step": 51965 + }, + { + "epoch": 8.477977161500815, + "grad_norm": 0.009614250622689724, + "learning_rate": 0.0007130622368343048, + "loss": 0.0283, + "num_input_tokens_seen": 112275088, + "step": 51970 + }, + { + "epoch": 8.478792822185971, + "grad_norm": 0.2912578284740448, + "learning_rate": 0.000712997840813878, + "loss": 0.1088, + "num_input_tokens_seen": 112284944, + "step": 51975 + }, + { + "epoch": 8.479608482871125, + "grad_norm": 0.021116318181157112, + "learning_rate": 0.0007129334404767797, + "loss": 0.0256, + "num_input_tokens_seen": 112296336, + "step": 51980 + }, + { + "epoch": 8.48042414355628, + "grad_norm": 0.30992650985717773, + "learning_rate": 0.0007128690358243153, + "loss": 0.127, + "num_input_tokens_seen": 112307248, + "step": 51985 + }, + { + "epoch": 8.481239804241435, + "grad_norm": 0.2787032425403595, + "learning_rate": 0.0007128046268577898, + "loss": 0.0337, + "num_input_tokens_seen": 112318672, + "step": 51990 + }, + { + "epoch": 8.48205546492659, + "grad_norm": 0.016766250133514404, + "learning_rate": 0.0007127402135785086, + "loss": 0.022, + "num_input_tokens_seen": 112329648, + "step": 51995 + }, + { + "epoch": 8.482871125611746, + "grad_norm": 0.006805556360632181, + "learning_rate": 0.000712675795987777, + "loss": 0.1846, + "num_input_tokens_seen": 112340816, + "step": 52000 + }, + { + "epoch": 8.4836867862969, + "grad_norm": 0.3869755268096924, + "learning_rate": 0.0007126113740869006, + "loss": 0.0957, + "num_input_tokens_seen": 112351344, + "step": 52005 + }, + { + "epoch": 8.484502446982056, + "grad_norm": 0.026312250643968582, + "learning_rate": 0.000712546947877185, + "loss": 0.0812, + "num_input_tokens_seen": 112361456, + "step": 52010 + }, + { + "epoch": 8.48531810766721, + "grad_norm": 0.01061819028109312, + "learning_rate": 0.0007124825173599359, + "loss": 0.0437, + "num_input_tokens_seen": 112374352, + "step": 52015 + }, + { + "epoch": 8.486133768352365, + "grad_norm": 0.004045186098664999, + "learning_rate": 0.000712418082536459, + "loss": 0.0841, + "num_input_tokens_seen": 112385520, + "step": 52020 + }, + { + "epoch": 8.486949429037521, + "grad_norm": 0.004907793365418911, + "learning_rate": 0.0007123536434080602, + "loss": 0.156, + "num_input_tokens_seen": 112395312, + "step": 52025 + }, + { + "epoch": 8.487765089722675, + "grad_norm": 0.04804065078496933, + "learning_rate": 0.0007122891999760454, + "loss": 0.0264, + "num_input_tokens_seen": 112406192, + "step": 52030 + }, + { + "epoch": 8.48858075040783, + "grad_norm": 0.15240783989429474, + "learning_rate": 0.0007122247522417206, + "loss": 0.0554, + "num_input_tokens_seen": 112417840, + "step": 52035 + }, + { + "epoch": 8.489396411092985, + "grad_norm": 0.14023399353027344, + "learning_rate": 0.0007121603002063921, + "loss": 0.0316, + "num_input_tokens_seen": 112427760, + "step": 52040 + }, + { + "epoch": 8.49021207177814, + "grad_norm": 0.3805373013019562, + "learning_rate": 0.000712095843871366, + "loss": 0.119, + "num_input_tokens_seen": 112438768, + "step": 52045 + }, + { + "epoch": 8.491027732463296, + "grad_norm": 0.065114825963974, + "learning_rate": 0.0007120313832379483, + "loss": 0.1752, + "num_input_tokens_seen": 112449840, + "step": 52050 + }, + { + "epoch": 8.49184339314845, + "grad_norm": 0.42016083002090454, + "learning_rate": 0.000711966918307446, + "loss": 0.1429, + "num_input_tokens_seen": 112460272, + "step": 52055 + }, + { + "epoch": 8.492659053833606, + "grad_norm": 0.023709211498498917, + "learning_rate": 0.000711902449081165, + "loss": 0.0529, + "num_input_tokens_seen": 112471056, + "step": 52060 + }, + { + "epoch": 8.49347471451876, + "grad_norm": 0.0041375719010829926, + "learning_rate": 0.000711837975560412, + "loss": 0.0569, + "num_input_tokens_seen": 112481584, + "step": 52065 + }, + { + "epoch": 8.494290375203915, + "grad_norm": 0.18444637954235077, + "learning_rate": 0.0007117734977464937, + "loss": 0.0652, + "num_input_tokens_seen": 112493040, + "step": 52070 + }, + { + "epoch": 8.49510603588907, + "grad_norm": 0.20939184725284576, + "learning_rate": 0.0007117090156407168, + "loss": 0.0256, + "num_input_tokens_seen": 112502960, + "step": 52075 + }, + { + "epoch": 8.495921696574225, + "grad_norm": 0.0033792341127991676, + "learning_rate": 0.0007116445292443883, + "loss": 0.1428, + "num_input_tokens_seen": 112513968, + "step": 52080 + }, + { + "epoch": 8.49673735725938, + "grad_norm": 0.010220236144959927, + "learning_rate": 0.0007115800385588148, + "loss": 0.1098, + "num_input_tokens_seen": 112525040, + "step": 52085 + }, + { + "epoch": 8.497553017944535, + "grad_norm": 0.2910071015357971, + "learning_rate": 0.0007115155435853034, + "loss": 0.1471, + "num_input_tokens_seen": 112536944, + "step": 52090 + }, + { + "epoch": 8.49836867862969, + "grad_norm": 0.009880652651190758, + "learning_rate": 0.0007114510443251613, + "loss": 0.0204, + "num_input_tokens_seen": 112547888, + "step": 52095 + }, + { + "epoch": 8.499184339314844, + "grad_norm": 0.039387013763189316, + "learning_rate": 0.0007113865407796955, + "loss": 0.2248, + "num_input_tokens_seen": 112557264, + "step": 52100 + }, + { + "epoch": 8.5, + "grad_norm": 0.007032784633338451, + "learning_rate": 0.0007113220329502131, + "loss": 0.0199, + "num_input_tokens_seen": 112568496, + "step": 52105 + }, + { + "epoch": 8.500815660685156, + "grad_norm": 0.0679621696472168, + "learning_rate": 0.0007112575208380219, + "loss": 0.0882, + "num_input_tokens_seen": 112578832, + "step": 52110 + }, + { + "epoch": 8.50163132137031, + "grad_norm": 0.01735229603946209, + "learning_rate": 0.0007111930044444288, + "loss": 0.0058, + "num_input_tokens_seen": 112590224, + "step": 52115 + }, + { + "epoch": 8.502446982055465, + "grad_norm": 0.014939922839403152, + "learning_rate": 0.0007111284837707416, + "loss": 0.0157, + "num_input_tokens_seen": 112601808, + "step": 52120 + }, + { + "epoch": 8.50326264274062, + "grad_norm": 0.012377532199025154, + "learning_rate": 0.0007110639588182679, + "loss": 0.0581, + "num_input_tokens_seen": 112611856, + "step": 52125 + }, + { + "epoch": 8.504078303425775, + "grad_norm": 0.7738260626792908, + "learning_rate": 0.0007109994295883154, + "loss": 0.135, + "num_input_tokens_seen": 112621648, + "step": 52130 + }, + { + "epoch": 8.50489396411093, + "grad_norm": 0.114794060587883, + "learning_rate": 0.0007109348960821916, + "loss": 0.0378, + "num_input_tokens_seen": 112631472, + "step": 52135 + }, + { + "epoch": 8.505709624796085, + "grad_norm": 0.12063523381948471, + "learning_rate": 0.0007108703583012047, + "loss": 0.0188, + "num_input_tokens_seen": 112641104, + "step": 52140 + }, + { + "epoch": 8.50652528548124, + "grad_norm": 0.035221684724092484, + "learning_rate": 0.0007108058162466624, + "loss": 0.0134, + "num_input_tokens_seen": 112651984, + "step": 52145 + }, + { + "epoch": 8.507340946166394, + "grad_norm": 0.3090044856071472, + "learning_rate": 0.0007107412699198729, + "loss": 0.1596, + "num_input_tokens_seen": 112662288, + "step": 52150 + }, + { + "epoch": 8.50815660685155, + "grad_norm": 0.21309930086135864, + "learning_rate": 0.0007106767193221442, + "loss": 0.0886, + "num_input_tokens_seen": 112674160, + "step": 52155 + }, + { + "epoch": 8.508972267536706, + "grad_norm": 0.028279351070523262, + "learning_rate": 0.0007106121644547844, + "loss": 0.0308, + "num_input_tokens_seen": 112685200, + "step": 52160 + }, + { + "epoch": 8.50978792822186, + "grad_norm": 0.962195634841919, + "learning_rate": 0.000710547605319102, + "loss": 0.0735, + "num_input_tokens_seen": 112695600, + "step": 52165 + }, + { + "epoch": 8.510603588907015, + "grad_norm": 0.06181253492832184, + "learning_rate": 0.0007104830419164052, + "loss": 0.1228, + "num_input_tokens_seen": 112708304, + "step": 52170 + }, + { + "epoch": 8.51141924959217, + "grad_norm": 0.13785938918590546, + "learning_rate": 0.0007104184742480025, + "loss": 0.0694, + "num_input_tokens_seen": 112718672, + "step": 52175 + }, + { + "epoch": 8.512234910277325, + "grad_norm": 0.001226426218636334, + "learning_rate": 0.0007103539023152025, + "loss": 0.0092, + "num_input_tokens_seen": 112728304, + "step": 52180 + }, + { + "epoch": 8.513050570962479, + "grad_norm": 0.007094322703778744, + "learning_rate": 0.0007102893261193141, + "loss": 0.0052, + "num_input_tokens_seen": 112738288, + "step": 52185 + }, + { + "epoch": 8.513866231647635, + "grad_norm": 0.08791056275367737, + "learning_rate": 0.0007102247456616456, + "loss": 0.0206, + "num_input_tokens_seen": 112749008, + "step": 52190 + }, + { + "epoch": 8.51468189233279, + "grad_norm": 0.1945263296365738, + "learning_rate": 0.0007101601609435057, + "loss": 0.0508, + "num_input_tokens_seen": 112759472, + "step": 52195 + }, + { + "epoch": 8.515497553017944, + "grad_norm": 0.17970463633537292, + "learning_rate": 0.0007100955719662038, + "loss": 0.0529, + "num_input_tokens_seen": 112770928, + "step": 52200 + }, + { + "epoch": 8.5163132137031, + "grad_norm": 0.13630010187625885, + "learning_rate": 0.0007100309787310485, + "loss": 0.0463, + "num_input_tokens_seen": 112780784, + "step": 52205 + }, + { + "epoch": 8.517128874388254, + "grad_norm": 0.008806428872048855, + "learning_rate": 0.0007099663812393489, + "loss": 0.0579, + "num_input_tokens_seen": 112792528, + "step": 52210 + }, + { + "epoch": 8.51794453507341, + "grad_norm": 0.5296904444694519, + "learning_rate": 0.0007099017794924144, + "loss": 0.0636, + "num_input_tokens_seen": 112804496, + "step": 52215 + }, + { + "epoch": 8.518760195758565, + "grad_norm": 0.014076504856348038, + "learning_rate": 0.000709837173491554, + "loss": 0.023, + "num_input_tokens_seen": 112815472, + "step": 52220 + }, + { + "epoch": 8.51957585644372, + "grad_norm": 0.010113160125911236, + "learning_rate": 0.0007097725632380771, + "loss": 0.0869, + "num_input_tokens_seen": 112826288, + "step": 52225 + }, + { + "epoch": 8.520391517128875, + "grad_norm": 0.05956968665122986, + "learning_rate": 0.0007097079487332931, + "loss": 0.0974, + "num_input_tokens_seen": 112837072, + "step": 52230 + }, + { + "epoch": 8.521207177814029, + "grad_norm": 0.013660747557878494, + "learning_rate": 0.0007096433299785113, + "loss": 0.023, + "num_input_tokens_seen": 112846736, + "step": 52235 + }, + { + "epoch": 8.522022838499185, + "grad_norm": 0.03401073068380356, + "learning_rate": 0.0007095787069750416, + "loss": 0.0341, + "num_input_tokens_seen": 112858864, + "step": 52240 + }, + { + "epoch": 8.522838499184338, + "grad_norm": 0.00827324390411377, + "learning_rate": 0.0007095140797241936, + "loss": 0.041, + "num_input_tokens_seen": 112870032, + "step": 52245 + }, + { + "epoch": 8.523654159869494, + "grad_norm": 0.005122459959238768, + "learning_rate": 0.0007094494482272768, + "loss": 0.0207, + "num_input_tokens_seen": 112880016, + "step": 52250 + }, + { + "epoch": 8.52446982055465, + "grad_norm": 0.22027291357517242, + "learning_rate": 0.0007093848124856014, + "loss": 0.0762, + "num_input_tokens_seen": 112890576, + "step": 52255 + }, + { + "epoch": 8.525285481239804, + "grad_norm": 0.08939344435930252, + "learning_rate": 0.000709320172500477, + "loss": 0.0264, + "num_input_tokens_seen": 112899920, + "step": 52260 + }, + { + "epoch": 8.52610114192496, + "grad_norm": 0.13552896678447723, + "learning_rate": 0.0007092555282732139, + "loss": 0.0398, + "num_input_tokens_seen": 112911696, + "step": 52265 + }, + { + "epoch": 8.526916802610113, + "grad_norm": 0.034195221960544586, + "learning_rate": 0.000709190879805122, + "loss": 0.0092, + "num_input_tokens_seen": 112922608, + "step": 52270 + }, + { + "epoch": 8.52773246329527, + "grad_norm": 0.003045213408768177, + "learning_rate": 0.0007091262270975116, + "loss": 0.0573, + "num_input_tokens_seen": 112931952, + "step": 52275 + }, + { + "epoch": 8.528548123980425, + "grad_norm": 0.009350604377686977, + "learning_rate": 0.0007090615701516929, + "loss": 0.0563, + "num_input_tokens_seen": 112942608, + "step": 52280 + }, + { + "epoch": 8.529363784665579, + "grad_norm": 0.00833084899932146, + "learning_rate": 0.0007089969089689761, + "loss": 0.0798, + "num_input_tokens_seen": 112953616, + "step": 52285 + }, + { + "epoch": 8.530179445350734, + "grad_norm": 0.04479145631194115, + "learning_rate": 0.0007089322435506719, + "loss": 0.0367, + "num_input_tokens_seen": 112964624, + "step": 52290 + }, + { + "epoch": 8.530995106035888, + "grad_norm": 0.34924301505088806, + "learning_rate": 0.0007088675738980909, + "loss": 0.142, + "num_input_tokens_seen": 112973936, + "step": 52295 + }, + { + "epoch": 8.531810766721044, + "grad_norm": 0.0031820894218981266, + "learning_rate": 0.0007088029000125435, + "loss": 0.0221, + "num_input_tokens_seen": 112983408, + "step": 52300 + }, + { + "epoch": 8.5326264274062, + "grad_norm": 0.011126089841127396, + "learning_rate": 0.0007087382218953403, + "loss": 0.069, + "num_input_tokens_seen": 112994384, + "step": 52305 + }, + { + "epoch": 8.533442088091354, + "grad_norm": 0.7672825455665588, + "learning_rate": 0.0007086735395477923, + "loss": 0.1052, + "num_input_tokens_seen": 113003632, + "step": 52310 + }, + { + "epoch": 8.53425774877651, + "grad_norm": 0.019653482362627983, + "learning_rate": 0.0007086088529712103, + "loss": 0.0994, + "num_input_tokens_seen": 113013584, + "step": 52315 + }, + { + "epoch": 8.535073409461663, + "grad_norm": 0.068137988448143, + "learning_rate": 0.0007085441621669053, + "loss": 0.0696, + "num_input_tokens_seen": 113024432, + "step": 52320 + }, + { + "epoch": 8.535889070146819, + "grad_norm": 0.4487306475639343, + "learning_rate": 0.0007084794671361883, + "loss": 0.0535, + "num_input_tokens_seen": 113034640, + "step": 52325 + }, + { + "epoch": 8.536704730831975, + "grad_norm": 0.006371349096298218, + "learning_rate": 0.0007084147678803703, + "loss": 0.0067, + "num_input_tokens_seen": 113044816, + "step": 52330 + }, + { + "epoch": 8.537520391517129, + "grad_norm": 0.2185864895582199, + "learning_rate": 0.0007083500644007628, + "loss": 0.1425, + "num_input_tokens_seen": 113055152, + "step": 52335 + }, + { + "epoch": 8.538336052202284, + "grad_norm": 0.06938138604164124, + "learning_rate": 0.0007082853566986769, + "loss": 0.0359, + "num_input_tokens_seen": 113066608, + "step": 52340 + }, + { + "epoch": 8.539151712887438, + "grad_norm": 0.3545314371585846, + "learning_rate": 0.0007082206447754239, + "loss": 0.1569, + "num_input_tokens_seen": 113076944, + "step": 52345 + }, + { + "epoch": 8.539967373572594, + "grad_norm": 0.033213697373867035, + "learning_rate": 0.0007081559286323155, + "loss": 0.03, + "num_input_tokens_seen": 113088240, + "step": 52350 + }, + { + "epoch": 8.540783034257748, + "grad_norm": 0.0024236650206148624, + "learning_rate": 0.0007080912082706631, + "loss": 0.0878, + "num_input_tokens_seen": 113100688, + "step": 52355 + }, + { + "epoch": 8.541598694942904, + "grad_norm": 0.01018419861793518, + "learning_rate": 0.0007080264836917783, + "loss": 0.0711, + "num_input_tokens_seen": 113110352, + "step": 52360 + }, + { + "epoch": 8.54241435562806, + "grad_norm": 0.002987699583172798, + "learning_rate": 0.000707961754896973, + "loss": 0.2497, + "num_input_tokens_seen": 113120048, + "step": 52365 + }, + { + "epoch": 8.543230016313213, + "grad_norm": 0.011956961825489998, + "learning_rate": 0.0007078970218875589, + "loss": 0.0799, + "num_input_tokens_seen": 113130960, + "step": 52370 + }, + { + "epoch": 8.544045676998369, + "grad_norm": 0.06541749089956284, + "learning_rate": 0.0007078322846648479, + "loss": 0.0704, + "num_input_tokens_seen": 113142192, + "step": 52375 + }, + { + "epoch": 8.544861337683523, + "grad_norm": 0.27652621269226074, + "learning_rate": 0.0007077675432301521, + "loss": 0.1562, + "num_input_tokens_seen": 113154256, + "step": 52380 + }, + { + "epoch": 8.545676998368679, + "grad_norm": 0.27194124460220337, + "learning_rate": 0.0007077027975847833, + "loss": 0.2258, + "num_input_tokens_seen": 113166640, + "step": 52385 + }, + { + "epoch": 8.546492659053834, + "grad_norm": 0.18163374066352844, + "learning_rate": 0.0007076380477300539, + "loss": 0.0444, + "num_input_tokens_seen": 113177744, + "step": 52390 + }, + { + "epoch": 8.547308319738988, + "grad_norm": 0.26939302682876587, + "learning_rate": 0.0007075732936672761, + "loss": 0.0754, + "num_input_tokens_seen": 113188688, + "step": 52395 + }, + { + "epoch": 8.548123980424144, + "grad_norm": 0.007374065462499857, + "learning_rate": 0.0007075085353977622, + "loss": 0.0201, + "num_input_tokens_seen": 113198704, + "step": 52400 + }, + { + "epoch": 8.548939641109298, + "grad_norm": 0.19838181138038635, + "learning_rate": 0.0007074437729228245, + "loss": 0.0926, + "num_input_tokens_seen": 113209296, + "step": 52405 + }, + { + "epoch": 8.549755301794454, + "grad_norm": 0.021237578243017197, + "learning_rate": 0.0007073790062437755, + "loss": 0.0677, + "num_input_tokens_seen": 113218896, + "step": 52410 + }, + { + "epoch": 8.550570962479608, + "grad_norm": 0.0337057039141655, + "learning_rate": 0.000707314235361928, + "loss": 0.058, + "num_input_tokens_seen": 113229520, + "step": 52415 + }, + { + "epoch": 8.551386623164763, + "grad_norm": 0.07322728633880615, + "learning_rate": 0.0007072494602785945, + "loss": 0.0298, + "num_input_tokens_seen": 113240048, + "step": 52420 + }, + { + "epoch": 8.552202283849919, + "grad_norm": 0.42627814412117004, + "learning_rate": 0.0007071846809950878, + "loss": 0.144, + "num_input_tokens_seen": 113251984, + "step": 52425 + }, + { + "epoch": 8.553017944535073, + "grad_norm": 0.02343291975557804, + "learning_rate": 0.0007071198975127206, + "loss": 0.0746, + "num_input_tokens_seen": 113262512, + "step": 52430 + }, + { + "epoch": 8.553833605220229, + "grad_norm": 0.008681093342602253, + "learning_rate": 0.000707055109832806, + "loss": 0.0294, + "num_input_tokens_seen": 113274768, + "step": 52435 + }, + { + "epoch": 8.554649265905383, + "grad_norm": 0.03344331681728363, + "learning_rate": 0.0007069903179566569, + "loss": 0.0243, + "num_input_tokens_seen": 113286512, + "step": 52440 + }, + { + "epoch": 8.555464926590538, + "grad_norm": 0.015263247303664684, + "learning_rate": 0.0007069255218855865, + "loss": 0.0903, + "num_input_tokens_seen": 113296944, + "step": 52445 + }, + { + "epoch": 8.556280587275694, + "grad_norm": 0.008519859984517097, + "learning_rate": 0.0007068607216209078, + "loss": 0.0887, + "num_input_tokens_seen": 113307760, + "step": 52450 + }, + { + "epoch": 8.557096247960848, + "grad_norm": 0.1599939614534378, + "learning_rate": 0.0007067959171639342, + "loss": 0.1706, + "num_input_tokens_seen": 113317776, + "step": 52455 + }, + { + "epoch": 8.557911908646004, + "grad_norm": 0.027809320017695427, + "learning_rate": 0.000706731108515979, + "loss": 0.0842, + "num_input_tokens_seen": 113328912, + "step": 52460 + }, + { + "epoch": 8.558727569331158, + "grad_norm": 0.10448624938726425, + "learning_rate": 0.0007066662956783556, + "loss": 0.035, + "num_input_tokens_seen": 113339056, + "step": 52465 + }, + { + "epoch": 8.559543230016313, + "grad_norm": 0.010518464259803295, + "learning_rate": 0.0007066014786523776, + "loss": 0.0219, + "num_input_tokens_seen": 113349488, + "step": 52470 + }, + { + "epoch": 8.560358890701469, + "grad_norm": 0.23149004578590393, + "learning_rate": 0.0007065366574393585, + "loss": 0.0528, + "num_input_tokens_seen": 113361616, + "step": 52475 + }, + { + "epoch": 8.561174551386623, + "grad_norm": 0.009577560238540173, + "learning_rate": 0.000706471832040612, + "loss": 0.0427, + "num_input_tokens_seen": 113371728, + "step": 52480 + }, + { + "epoch": 8.561990212071779, + "grad_norm": 0.03113245777785778, + "learning_rate": 0.000706407002457452, + "loss": 0.0484, + "num_input_tokens_seen": 113383280, + "step": 52485 + }, + { + "epoch": 8.562805872756933, + "grad_norm": 0.18912045657634735, + "learning_rate": 0.0007063421686911921, + "loss": 0.1145, + "num_input_tokens_seen": 113394160, + "step": 52490 + }, + { + "epoch": 8.563621533442088, + "grad_norm": 0.2634306848049164, + "learning_rate": 0.0007062773307431465, + "loss": 0.1875, + "num_input_tokens_seen": 113405552, + "step": 52495 + }, + { + "epoch": 8.564437194127244, + "grad_norm": 0.33455103635787964, + "learning_rate": 0.000706212488614629, + "loss": 0.1255, + "num_input_tokens_seen": 113415920, + "step": 52500 + }, + { + "epoch": 8.565252854812398, + "grad_norm": 0.00712059810757637, + "learning_rate": 0.0007061476423069539, + "loss": 0.0049, + "num_input_tokens_seen": 113427088, + "step": 52505 + }, + { + "epoch": 8.566068515497554, + "grad_norm": 0.1418961137533188, + "learning_rate": 0.0007060827918214353, + "loss": 0.0772, + "num_input_tokens_seen": 113438032, + "step": 52510 + }, + { + "epoch": 8.566884176182707, + "grad_norm": 0.0014664334012195468, + "learning_rate": 0.0007060179371593876, + "loss": 0.0993, + "num_input_tokens_seen": 113449424, + "step": 52515 + }, + { + "epoch": 8.567699836867863, + "grad_norm": 0.25699958205223083, + "learning_rate": 0.0007059530783221249, + "loss": 0.0971, + "num_input_tokens_seen": 113461232, + "step": 52520 + }, + { + "epoch": 8.568515497553017, + "grad_norm": 0.006246160715818405, + "learning_rate": 0.0007058882153109618, + "loss": 0.0439, + "num_input_tokens_seen": 113472432, + "step": 52525 + }, + { + "epoch": 8.569331158238173, + "grad_norm": 0.021962016820907593, + "learning_rate": 0.000705823348127213, + "loss": 0.0765, + "num_input_tokens_seen": 113482640, + "step": 52530 + }, + { + "epoch": 8.570146818923329, + "grad_norm": 0.004425059538334608, + "learning_rate": 0.0007057584767721927, + "loss": 0.0733, + "num_input_tokens_seen": 113492464, + "step": 52535 + }, + { + "epoch": 8.570962479608482, + "grad_norm": 0.003864873433485627, + "learning_rate": 0.000705693601247216, + "loss": 0.1789, + "num_input_tokens_seen": 113503152, + "step": 52540 + }, + { + "epoch": 8.571778140293638, + "grad_norm": 0.006510365754365921, + "learning_rate": 0.0007056287215535976, + "loss": 0.0384, + "num_input_tokens_seen": 113514416, + "step": 52545 + }, + { + "epoch": 8.572593800978792, + "grad_norm": 0.189361110329628, + "learning_rate": 0.0007055638376926522, + "loss": 0.0428, + "num_input_tokens_seen": 113525808, + "step": 52550 + }, + { + "epoch": 8.573409461663948, + "grad_norm": 0.03475072607398033, + "learning_rate": 0.0007054989496656949, + "loss": 0.0801, + "num_input_tokens_seen": 113536880, + "step": 52555 + }, + { + "epoch": 8.574225122349104, + "grad_norm": 0.0047148847952485085, + "learning_rate": 0.0007054340574740405, + "loss": 0.021, + "num_input_tokens_seen": 113549200, + "step": 52560 + }, + { + "epoch": 8.575040783034257, + "grad_norm": 0.2840512990951538, + "learning_rate": 0.0007053691611190045, + "loss": 0.1158, + "num_input_tokens_seen": 113559824, + "step": 52565 + }, + { + "epoch": 8.575856443719413, + "grad_norm": 0.14973792433738708, + "learning_rate": 0.0007053042606019017, + "loss": 0.1616, + "num_input_tokens_seen": 113571056, + "step": 52570 + }, + { + "epoch": 8.576672104404567, + "grad_norm": 0.002990563167259097, + "learning_rate": 0.0007052393559240479, + "loss": 0.0572, + "num_input_tokens_seen": 113581840, + "step": 52575 + }, + { + "epoch": 8.577487765089723, + "grad_norm": 0.004023312591016293, + "learning_rate": 0.0007051744470867581, + "loss": 0.1229, + "num_input_tokens_seen": 113593392, + "step": 52580 + }, + { + "epoch": 8.578303425774878, + "grad_norm": 0.23519225418567657, + "learning_rate": 0.0007051095340913478, + "loss": 0.2013, + "num_input_tokens_seen": 113604624, + "step": 52585 + }, + { + "epoch": 8.579119086460032, + "grad_norm": 0.17504338920116425, + "learning_rate": 0.0007050446169391326, + "loss": 0.1504, + "num_input_tokens_seen": 113615920, + "step": 52590 + }, + { + "epoch": 8.579934747145188, + "grad_norm": 0.035556066781282425, + "learning_rate": 0.0007049796956314281, + "loss": 0.046, + "num_input_tokens_seen": 113625840, + "step": 52595 + }, + { + "epoch": 8.580750407830342, + "grad_norm": 0.014154000207781792, + "learning_rate": 0.00070491477016955, + "loss": 0.0249, + "num_input_tokens_seen": 113636784, + "step": 52600 + }, + { + "epoch": 8.581566068515498, + "grad_norm": 0.03693210706114769, + "learning_rate": 0.0007048498405548142, + "loss": 0.0116, + "num_input_tokens_seen": 113647920, + "step": 52605 + }, + { + "epoch": 8.582381729200652, + "grad_norm": 0.03940049931406975, + "learning_rate": 0.0007047849067885366, + "loss": 0.0161, + "num_input_tokens_seen": 113658288, + "step": 52610 + }, + { + "epoch": 8.583197389885807, + "grad_norm": 0.012887493707239628, + "learning_rate": 0.000704719968872033, + "loss": 0.0988, + "num_input_tokens_seen": 113668368, + "step": 52615 + }, + { + "epoch": 8.584013050570963, + "grad_norm": 0.05204826593399048, + "learning_rate": 0.0007046550268066194, + "loss": 0.0693, + "num_input_tokens_seen": 113678288, + "step": 52620 + }, + { + "epoch": 8.584828711256117, + "grad_norm": 0.22619600594043732, + "learning_rate": 0.0007045900805936122, + "loss": 0.1762, + "num_input_tokens_seen": 113689392, + "step": 52625 + }, + { + "epoch": 8.585644371941273, + "grad_norm": 0.160128653049469, + "learning_rate": 0.0007045251302343276, + "loss": 0.1142, + "num_input_tokens_seen": 113700016, + "step": 52630 + }, + { + "epoch": 8.586460032626427, + "grad_norm": 0.01484632957726717, + "learning_rate": 0.0007044601757300815, + "loss": 0.0264, + "num_input_tokens_seen": 113712048, + "step": 52635 + }, + { + "epoch": 8.587275693311582, + "grad_norm": 0.0061071184463799, + "learning_rate": 0.0007043952170821907, + "loss": 0.0273, + "num_input_tokens_seen": 113722512, + "step": 52640 + }, + { + "epoch": 8.588091353996738, + "grad_norm": 0.023335954174399376, + "learning_rate": 0.0007043302542919715, + "loss": 0.0497, + "num_input_tokens_seen": 113732784, + "step": 52645 + }, + { + "epoch": 8.588907014681892, + "grad_norm": 0.21892686188220978, + "learning_rate": 0.0007042652873607405, + "loss": 0.1033, + "num_input_tokens_seen": 113742960, + "step": 52650 + }, + { + "epoch": 8.589722675367048, + "grad_norm": 0.008176966570317745, + "learning_rate": 0.0007042003162898143, + "loss": 0.0097, + "num_input_tokens_seen": 113753712, + "step": 52655 + }, + { + "epoch": 8.590538336052202, + "grad_norm": 0.028112633153796196, + "learning_rate": 0.0007041353410805097, + "loss": 0.0703, + "num_input_tokens_seen": 113765136, + "step": 52660 + }, + { + "epoch": 8.591353996737357, + "grad_norm": 0.013970048166811466, + "learning_rate": 0.0007040703617341434, + "loss": 0.0227, + "num_input_tokens_seen": 113775056, + "step": 52665 + }, + { + "epoch": 8.592169657422513, + "grad_norm": 0.0035523215774446726, + "learning_rate": 0.0007040053782520324, + "loss": 0.0439, + "num_input_tokens_seen": 113786512, + "step": 52670 + }, + { + "epoch": 8.592985318107667, + "grad_norm": 0.24162547290325165, + "learning_rate": 0.0007039403906354936, + "loss": 0.1064, + "num_input_tokens_seen": 113796880, + "step": 52675 + }, + { + "epoch": 8.593800978792823, + "grad_norm": 0.09258195012807846, + "learning_rate": 0.0007038753988858439, + "loss": 0.0934, + "num_input_tokens_seen": 113809360, + "step": 52680 + }, + { + "epoch": 8.594616639477977, + "grad_norm": 0.1602240353822708, + "learning_rate": 0.0007038104030044008, + "loss": 0.0791, + "num_input_tokens_seen": 113820592, + "step": 52685 + }, + { + "epoch": 8.595432300163132, + "grad_norm": 0.0038400774355977774, + "learning_rate": 0.0007037454029924814, + "loss": 0.0588, + "num_input_tokens_seen": 113832176, + "step": 52690 + }, + { + "epoch": 8.596247960848288, + "grad_norm": 0.02949357032775879, + "learning_rate": 0.0007036803988514028, + "loss": 0.046, + "num_input_tokens_seen": 113844496, + "step": 52695 + }, + { + "epoch": 8.597063621533442, + "grad_norm": 0.41612306237220764, + "learning_rate": 0.0007036153905824825, + "loss": 0.0184, + "num_input_tokens_seen": 113854544, + "step": 52700 + }, + { + "epoch": 8.597879282218598, + "grad_norm": 0.14093731343746185, + "learning_rate": 0.0007035503781870379, + "loss": 0.0312, + "num_input_tokens_seen": 113865008, + "step": 52705 + }, + { + "epoch": 8.598694942903752, + "grad_norm": 0.27731600403785706, + "learning_rate": 0.0007034853616663868, + "loss": 0.1039, + "num_input_tokens_seen": 113875952, + "step": 52710 + }, + { + "epoch": 8.599510603588907, + "grad_norm": 0.014228510670363903, + "learning_rate": 0.0007034203410218467, + "loss": 0.1459, + "num_input_tokens_seen": 113885552, + "step": 52715 + }, + { + "epoch": 8.600326264274061, + "grad_norm": 0.07596822828054428, + "learning_rate": 0.0007033553162547355, + "loss": 0.0467, + "num_input_tokens_seen": 113897104, + "step": 52720 + }, + { + "epoch": 8.601141924959217, + "grad_norm": 0.048214759677648544, + "learning_rate": 0.0007032902873663707, + "loss": 0.0941, + "num_input_tokens_seen": 113908944, + "step": 52725 + }, + { + "epoch": 8.601957585644373, + "grad_norm": 0.14651963114738464, + "learning_rate": 0.0007032252543580702, + "loss": 0.0291, + "num_input_tokens_seen": 113920624, + "step": 52730 + }, + { + "epoch": 8.602773246329527, + "grad_norm": 0.012341699562966824, + "learning_rate": 0.0007031602172311523, + "loss": 0.0289, + "num_input_tokens_seen": 113932432, + "step": 52735 + }, + { + "epoch": 8.603588907014682, + "grad_norm": 0.054182104766368866, + "learning_rate": 0.0007030951759869347, + "loss": 0.1926, + "num_input_tokens_seen": 113943632, + "step": 52740 + }, + { + "epoch": 8.604404567699836, + "grad_norm": 0.08766784518957138, + "learning_rate": 0.0007030301306267358, + "loss": 0.1292, + "num_input_tokens_seen": 113954928, + "step": 52745 + }, + { + "epoch": 8.605220228384992, + "grad_norm": 0.018275413662195206, + "learning_rate": 0.0007029650811518737, + "loss": 0.1261, + "num_input_tokens_seen": 113965296, + "step": 52750 + }, + { + "epoch": 8.606035889070148, + "grad_norm": 0.03040480799973011, + "learning_rate": 0.0007029000275636669, + "loss": 0.0281, + "num_input_tokens_seen": 113976176, + "step": 52755 + }, + { + "epoch": 8.606851549755302, + "grad_norm": 0.11693539470434189, + "learning_rate": 0.0007028349698634335, + "loss": 0.0874, + "num_input_tokens_seen": 113986288, + "step": 52760 + }, + { + "epoch": 8.607667210440457, + "grad_norm": 0.19277828931808472, + "learning_rate": 0.0007027699080524923, + "loss": 0.129, + "num_input_tokens_seen": 113997360, + "step": 52765 + }, + { + "epoch": 8.608482871125611, + "grad_norm": 0.049259208142757416, + "learning_rate": 0.0007027048421321616, + "loss": 0.0237, + "num_input_tokens_seen": 114009360, + "step": 52770 + }, + { + "epoch": 8.609298531810767, + "grad_norm": 0.16560588777065277, + "learning_rate": 0.0007026397721037601, + "loss": 0.0378, + "num_input_tokens_seen": 114021104, + "step": 52775 + }, + { + "epoch": 8.61011419249592, + "grad_norm": 0.29161280393600464, + "learning_rate": 0.0007025746979686065, + "loss": 0.1414, + "num_input_tokens_seen": 114032176, + "step": 52780 + }, + { + "epoch": 8.610929853181077, + "grad_norm": 0.006784006953239441, + "learning_rate": 0.0007025096197280196, + "loss": 0.0363, + "num_input_tokens_seen": 114042352, + "step": 52785 + }, + { + "epoch": 8.611745513866232, + "grad_norm": 0.3216829299926758, + "learning_rate": 0.0007024445373833185, + "loss": 0.1175, + "num_input_tokens_seen": 114052112, + "step": 52790 + }, + { + "epoch": 8.612561174551386, + "grad_norm": 0.03260602802038193, + "learning_rate": 0.000702379450935822, + "loss": 0.0421, + "num_input_tokens_seen": 114062928, + "step": 52795 + }, + { + "epoch": 8.613376835236542, + "grad_norm": 0.1370018720626831, + "learning_rate": 0.0007023143603868492, + "loss": 0.0691, + "num_input_tokens_seen": 114073392, + "step": 52800 + }, + { + "epoch": 8.614192495921696, + "grad_norm": 0.17849183082580566, + "learning_rate": 0.0007022492657377192, + "loss": 0.0693, + "num_input_tokens_seen": 114084464, + "step": 52805 + }, + { + "epoch": 8.615008156606851, + "grad_norm": 0.019710781052708626, + "learning_rate": 0.0007021841669897511, + "loss": 0.0684, + "num_input_tokens_seen": 114096240, + "step": 52810 + }, + { + "epoch": 8.615823817292007, + "grad_norm": 0.03188520297408104, + "learning_rate": 0.0007021190641442645, + "loss": 0.0797, + "num_input_tokens_seen": 114105168, + "step": 52815 + }, + { + "epoch": 8.616639477977161, + "grad_norm": 0.0067170062102377415, + "learning_rate": 0.0007020539572025788, + "loss": 0.0086, + "num_input_tokens_seen": 114115024, + "step": 52820 + }, + { + "epoch": 8.617455138662317, + "grad_norm": 0.006065691821277142, + "learning_rate": 0.0007019888461660132, + "loss": 0.0646, + "num_input_tokens_seen": 114126416, + "step": 52825 + }, + { + "epoch": 8.61827079934747, + "grad_norm": 0.256100058555603, + "learning_rate": 0.0007019237310358874, + "loss": 0.1274, + "num_input_tokens_seen": 114137008, + "step": 52830 + }, + { + "epoch": 8.619086460032626, + "grad_norm": 0.1438518464565277, + "learning_rate": 0.000701858611813521, + "loss": 0.155, + "num_input_tokens_seen": 114148144, + "step": 52835 + }, + { + "epoch": 8.619902120717782, + "grad_norm": 0.025290878489613533, + "learning_rate": 0.0007017934885002339, + "loss": 0.1007, + "num_input_tokens_seen": 114157872, + "step": 52840 + }, + { + "epoch": 8.620717781402936, + "grad_norm": 0.013580790720880032, + "learning_rate": 0.0007017283610973456, + "loss": 0.0635, + "num_input_tokens_seen": 114169168, + "step": 52845 + }, + { + "epoch": 8.621533442088092, + "grad_norm": 0.08735395967960358, + "learning_rate": 0.0007016632296061762, + "loss": 0.0327, + "num_input_tokens_seen": 114179984, + "step": 52850 + }, + { + "epoch": 8.622349102773246, + "grad_norm": 0.13977889716625214, + "learning_rate": 0.0007015980940280458, + "loss": 0.163, + "num_input_tokens_seen": 114191120, + "step": 52855 + }, + { + "epoch": 8.623164763458401, + "grad_norm": 0.06077880784869194, + "learning_rate": 0.0007015329543642741, + "loss": 0.0578, + "num_input_tokens_seen": 114201424, + "step": 52860 + }, + { + "epoch": 8.623980424143557, + "grad_norm": 0.005568662192672491, + "learning_rate": 0.0007014678106161814, + "loss": 0.0258, + "num_input_tokens_seen": 114212976, + "step": 52865 + }, + { + "epoch": 8.624796084828711, + "grad_norm": 0.013049778528511524, + "learning_rate": 0.000701402662785088, + "loss": 0.0454, + "num_input_tokens_seen": 114224432, + "step": 52870 + }, + { + "epoch": 8.625611745513867, + "grad_norm": 0.01467926986515522, + "learning_rate": 0.0007013375108723141, + "loss": 0.0322, + "num_input_tokens_seen": 114234256, + "step": 52875 + }, + { + "epoch": 8.62642740619902, + "grad_norm": 0.23037031292915344, + "learning_rate": 0.0007012723548791802, + "loss": 0.039, + "num_input_tokens_seen": 114244912, + "step": 52880 + }, + { + "epoch": 8.627243066884176, + "grad_norm": 0.03196464106440544, + "learning_rate": 0.0007012071948070065, + "loss": 0.0694, + "num_input_tokens_seen": 114257008, + "step": 52885 + }, + { + "epoch": 8.62805872756933, + "grad_norm": 0.2203933745622635, + "learning_rate": 0.0007011420306571139, + "loss": 0.1499, + "num_input_tokens_seen": 114266800, + "step": 52890 + }, + { + "epoch": 8.628874388254486, + "grad_norm": 0.35956448316574097, + "learning_rate": 0.0007010768624308228, + "loss": 0.0696, + "num_input_tokens_seen": 114278320, + "step": 52895 + }, + { + "epoch": 8.629690048939642, + "grad_norm": 0.027677489444613457, + "learning_rate": 0.0007010116901294541, + "loss": 0.0302, + "num_input_tokens_seen": 114288624, + "step": 52900 + }, + { + "epoch": 8.630505709624796, + "grad_norm": 0.09585878998041153, + "learning_rate": 0.0007009465137543285, + "loss": 0.0271, + "num_input_tokens_seen": 114299184, + "step": 52905 + }, + { + "epoch": 8.631321370309951, + "grad_norm": 0.012258858419954777, + "learning_rate": 0.0007008813333067668, + "loss": 0.0776, + "num_input_tokens_seen": 114309744, + "step": 52910 + }, + { + "epoch": 8.632137030995105, + "grad_norm": 0.030114926397800446, + "learning_rate": 0.00070081614878809, + "loss": 0.1126, + "num_input_tokens_seen": 114320496, + "step": 52915 + }, + { + "epoch": 8.632952691680261, + "grad_norm": 0.06548038870096207, + "learning_rate": 0.0007007509601996193, + "loss": 0.0716, + "num_input_tokens_seen": 114331728, + "step": 52920 + }, + { + "epoch": 8.633768352365417, + "grad_norm": 0.06908722221851349, + "learning_rate": 0.0007006857675426757, + "loss": 0.0594, + "num_input_tokens_seen": 114341776, + "step": 52925 + }, + { + "epoch": 8.63458401305057, + "grad_norm": 0.22732791304588318, + "learning_rate": 0.0007006205708185804, + "loss": 0.1991, + "num_input_tokens_seen": 114353264, + "step": 52930 + }, + { + "epoch": 8.635399673735726, + "grad_norm": 0.007982817478477955, + "learning_rate": 0.0007005553700286549, + "loss": 0.2081, + "num_input_tokens_seen": 114364784, + "step": 52935 + }, + { + "epoch": 8.63621533442088, + "grad_norm": 0.011525883339345455, + "learning_rate": 0.0007004901651742201, + "loss": 0.0113, + "num_input_tokens_seen": 114375920, + "step": 52940 + }, + { + "epoch": 8.637030995106036, + "grad_norm": 0.11902187764644623, + "learning_rate": 0.000700424956256598, + "loss": 0.0312, + "num_input_tokens_seen": 114386448, + "step": 52945 + }, + { + "epoch": 8.63784665579119, + "grad_norm": 0.057894494384527206, + "learning_rate": 0.0007003597432771098, + "loss": 0.0793, + "num_input_tokens_seen": 114397296, + "step": 52950 + }, + { + "epoch": 8.638662316476346, + "grad_norm": 0.029431601986289024, + "learning_rate": 0.0007002945262370773, + "loss": 0.1027, + "num_input_tokens_seen": 114408336, + "step": 52955 + }, + { + "epoch": 8.639477977161501, + "grad_norm": 0.02109280601143837, + "learning_rate": 0.0007002293051378221, + "loss": 0.0341, + "num_input_tokens_seen": 114419024, + "step": 52960 + }, + { + "epoch": 8.640293637846655, + "grad_norm": 0.24475204944610596, + "learning_rate": 0.0007001640799806662, + "loss": 0.1841, + "num_input_tokens_seen": 114430704, + "step": 52965 + }, + { + "epoch": 8.641109298531811, + "grad_norm": 0.12317442893981934, + "learning_rate": 0.000700098850766931, + "loss": 0.026, + "num_input_tokens_seen": 114441424, + "step": 52970 + }, + { + "epoch": 8.641924959216965, + "grad_norm": 0.08731578290462494, + "learning_rate": 0.0007000336174979389, + "loss": 0.2159, + "num_input_tokens_seen": 114452720, + "step": 52975 + }, + { + "epoch": 8.64274061990212, + "grad_norm": 0.1497471034526825, + "learning_rate": 0.0006999683801750116, + "loss": 0.034, + "num_input_tokens_seen": 114463920, + "step": 52980 + }, + { + "epoch": 8.643556280587276, + "grad_norm": 0.06633875519037247, + "learning_rate": 0.0006999031387994717, + "loss": 0.106, + "num_input_tokens_seen": 114474768, + "step": 52985 + }, + { + "epoch": 8.64437194127243, + "grad_norm": 0.1102503314614296, + "learning_rate": 0.0006998378933726408, + "loss": 0.0459, + "num_input_tokens_seen": 114483984, + "step": 52990 + }, + { + "epoch": 8.645187601957586, + "grad_norm": 0.056301407516002655, + "learning_rate": 0.0006997726438958417, + "loss": 0.1107, + "num_input_tokens_seen": 114494032, + "step": 52995 + }, + { + "epoch": 8.64600326264274, + "grad_norm": 0.2921803593635559, + "learning_rate": 0.0006997073903703964, + "loss": 0.0404, + "num_input_tokens_seen": 114504720, + "step": 53000 + }, + { + "epoch": 8.646818923327896, + "grad_norm": 0.05801844969391823, + "learning_rate": 0.0006996421327976276, + "loss": 0.1848, + "num_input_tokens_seen": 114515536, + "step": 53005 + }, + { + "epoch": 8.647634584013051, + "grad_norm": 0.05805162340402603, + "learning_rate": 0.0006995768711788577, + "loss": 0.0231, + "num_input_tokens_seen": 114527312, + "step": 53010 + }, + { + "epoch": 8.648450244698205, + "grad_norm": 0.38928350806236267, + "learning_rate": 0.0006995116055154093, + "loss": 0.0959, + "num_input_tokens_seen": 114539728, + "step": 53015 + }, + { + "epoch": 8.649265905383361, + "grad_norm": 0.018125947564840317, + "learning_rate": 0.000699446335808605, + "loss": 0.0172, + "num_input_tokens_seen": 114551184, + "step": 53020 + }, + { + "epoch": 8.650081566068515, + "grad_norm": 0.02418561838567257, + "learning_rate": 0.0006993810620597677, + "loss": 0.0251, + "num_input_tokens_seen": 114561936, + "step": 53025 + }, + { + "epoch": 8.65089722675367, + "grad_norm": 0.007253970485180616, + "learning_rate": 0.0006993157842702203, + "loss": 0.2023, + "num_input_tokens_seen": 114571728, + "step": 53030 + }, + { + "epoch": 8.651712887438826, + "grad_norm": 0.2686326801776886, + "learning_rate": 0.0006992505024412858, + "loss": 0.0473, + "num_input_tokens_seen": 114581264, + "step": 53035 + }, + { + "epoch": 8.65252854812398, + "grad_norm": 0.5314196944236755, + "learning_rate": 0.000699185216574287, + "loss": 0.1753, + "num_input_tokens_seen": 114591664, + "step": 53040 + }, + { + "epoch": 8.653344208809136, + "grad_norm": 0.06350405514240265, + "learning_rate": 0.0006991199266705472, + "loss": 0.0147, + "num_input_tokens_seen": 114602864, + "step": 53045 + }, + { + "epoch": 8.65415986949429, + "grad_norm": 0.014382350258529186, + "learning_rate": 0.0006990546327313894, + "loss": 0.0233, + "num_input_tokens_seen": 114614000, + "step": 53050 + }, + { + "epoch": 8.654975530179446, + "grad_norm": 0.011114361695945263, + "learning_rate": 0.0006989893347581368, + "loss": 0.0323, + "num_input_tokens_seen": 114625360, + "step": 53055 + }, + { + "epoch": 8.655791190864601, + "grad_norm": 0.01948225125670433, + "learning_rate": 0.000698924032752113, + "loss": 0.1014, + "num_input_tokens_seen": 114635536, + "step": 53060 + }, + { + "epoch": 8.656606851549755, + "grad_norm": 0.006442820653319359, + "learning_rate": 0.0006988587267146414, + "loss": 0.0258, + "num_input_tokens_seen": 114647088, + "step": 53065 + }, + { + "epoch": 8.65742251223491, + "grad_norm": 0.5618607401847839, + "learning_rate": 0.0006987934166470454, + "loss": 0.1312, + "num_input_tokens_seen": 114658864, + "step": 53070 + }, + { + "epoch": 8.658238172920065, + "grad_norm": 0.05817626416683197, + "learning_rate": 0.0006987281025506487, + "loss": 0.0412, + "num_input_tokens_seen": 114670576, + "step": 53075 + }, + { + "epoch": 8.65905383360522, + "grad_norm": 0.13771042227745056, + "learning_rate": 0.0006986627844267748, + "loss": 0.0924, + "num_input_tokens_seen": 114682416, + "step": 53080 + }, + { + "epoch": 8.659869494290374, + "grad_norm": 0.0055312663316726685, + "learning_rate": 0.0006985974622767475, + "loss": 0.0119, + "num_input_tokens_seen": 114692496, + "step": 53085 + }, + { + "epoch": 8.66068515497553, + "grad_norm": 0.04500410705804825, + "learning_rate": 0.0006985321361018908, + "loss": 0.1105, + "num_input_tokens_seen": 114701872, + "step": 53090 + }, + { + "epoch": 8.661500815660686, + "grad_norm": 0.0751497745513916, + "learning_rate": 0.0006984668059035284, + "loss": 0.0143, + "num_input_tokens_seen": 114713392, + "step": 53095 + }, + { + "epoch": 8.66231647634584, + "grad_norm": 0.012522445991635323, + "learning_rate": 0.0006984014716829845, + "loss": 0.0253, + "num_input_tokens_seen": 114723056, + "step": 53100 + }, + { + "epoch": 8.663132137030995, + "grad_norm": 0.07139261066913605, + "learning_rate": 0.0006983361334415831, + "loss": 0.03, + "num_input_tokens_seen": 114735056, + "step": 53105 + }, + { + "epoch": 8.66394779771615, + "grad_norm": 0.08267118781805038, + "learning_rate": 0.0006982707911806483, + "loss": 0.1202, + "num_input_tokens_seen": 114746352, + "step": 53110 + }, + { + "epoch": 8.664763458401305, + "grad_norm": 0.007913434877991676, + "learning_rate": 0.0006982054449015044, + "loss": 0.0371, + "num_input_tokens_seen": 114757840, + "step": 53115 + }, + { + "epoch": 8.66557911908646, + "grad_norm": 0.01678098551928997, + "learning_rate": 0.0006981400946054758, + "loss": 0.016, + "num_input_tokens_seen": 114768848, + "step": 53120 + }, + { + "epoch": 8.666394779771615, + "grad_norm": 0.2976462244987488, + "learning_rate": 0.0006980747402938868, + "loss": 0.0631, + "num_input_tokens_seen": 114779952, + "step": 53125 + }, + { + "epoch": 8.66721044045677, + "grad_norm": 0.09854038804769516, + "learning_rate": 0.0006980093819680616, + "loss": 0.0107, + "num_input_tokens_seen": 114788880, + "step": 53130 + }, + { + "epoch": 8.668026101141924, + "grad_norm": 0.006026748567819595, + "learning_rate": 0.0006979440196293254, + "loss": 0.0977, + "num_input_tokens_seen": 114800528, + "step": 53135 + }, + { + "epoch": 8.66884176182708, + "grad_norm": 0.017739087343215942, + "learning_rate": 0.0006978786532790025, + "loss": 0.04, + "num_input_tokens_seen": 114811088, + "step": 53140 + }, + { + "epoch": 8.669657422512234, + "grad_norm": 0.031023986637592316, + "learning_rate": 0.0006978132829184176, + "loss": 0.1588, + "num_input_tokens_seen": 114822256, + "step": 53145 + }, + { + "epoch": 8.67047308319739, + "grad_norm": 0.002542394446209073, + "learning_rate": 0.0006977479085488956, + "loss": 0.1743, + "num_input_tokens_seen": 114834352, + "step": 53150 + }, + { + "epoch": 8.671288743882545, + "grad_norm": 0.13473205268383026, + "learning_rate": 0.0006976825301717615, + "loss": 0.0847, + "num_input_tokens_seen": 114844272, + "step": 53155 + }, + { + "epoch": 8.6721044045677, + "grad_norm": 0.09745965898036957, + "learning_rate": 0.0006976171477883399, + "loss": 0.0513, + "num_input_tokens_seen": 114855216, + "step": 53160 + }, + { + "epoch": 8.672920065252855, + "grad_norm": 0.11454571783542633, + "learning_rate": 0.0006975517613999562, + "loss": 0.0267, + "num_input_tokens_seen": 114866992, + "step": 53165 + }, + { + "epoch": 8.673735725938009, + "grad_norm": 0.011554248631000519, + "learning_rate": 0.0006974863710079355, + "loss": 0.1094, + "num_input_tokens_seen": 114878032, + "step": 53170 + }, + { + "epoch": 8.674551386623165, + "grad_norm": 0.04244585707783699, + "learning_rate": 0.0006974209766136031, + "loss": 0.0644, + "num_input_tokens_seen": 114889104, + "step": 53175 + }, + { + "epoch": 8.67536704730832, + "grad_norm": 0.033876482397317886, + "learning_rate": 0.0006973555782182839, + "loss": 0.0396, + "num_input_tokens_seen": 114899792, + "step": 53180 + }, + { + "epoch": 8.676182707993474, + "grad_norm": 0.01323134358972311, + "learning_rate": 0.0006972901758233037, + "loss": 0.0818, + "num_input_tokens_seen": 114911184, + "step": 53185 + }, + { + "epoch": 8.67699836867863, + "grad_norm": 0.004896457307040691, + "learning_rate": 0.0006972247694299877, + "loss": 0.0284, + "num_input_tokens_seen": 114921936, + "step": 53190 + }, + { + "epoch": 8.677814029363784, + "grad_norm": 0.09365899115800858, + "learning_rate": 0.0006971593590396616, + "loss": 0.0165, + "num_input_tokens_seen": 114932752, + "step": 53195 + }, + { + "epoch": 8.67862969004894, + "grad_norm": 0.315121591091156, + "learning_rate": 0.000697093944653651, + "loss": 0.0879, + "num_input_tokens_seen": 114943632, + "step": 53200 + }, + { + "epoch": 8.679445350734095, + "grad_norm": 0.008210898377001286, + "learning_rate": 0.0006970285262732815, + "loss": 0.1004, + "num_input_tokens_seen": 114955088, + "step": 53205 + }, + { + "epoch": 8.68026101141925, + "grad_norm": 0.07787376642227173, + "learning_rate": 0.000696963103899879, + "loss": 0.0299, + "num_input_tokens_seen": 114966416, + "step": 53210 + }, + { + "epoch": 8.681076672104405, + "grad_norm": 0.04467257484793663, + "learning_rate": 0.0006968976775347694, + "loss": 0.1604, + "num_input_tokens_seen": 114978032, + "step": 53215 + }, + { + "epoch": 8.681892332789559, + "grad_norm": 0.07246481627225876, + "learning_rate": 0.0006968322471792785, + "loss": 0.0268, + "num_input_tokens_seen": 114990416, + "step": 53220 + }, + { + "epoch": 8.682707993474715, + "grad_norm": 0.04033525288105011, + "learning_rate": 0.0006967668128347324, + "loss": 0.0187, + "num_input_tokens_seen": 115000336, + "step": 53225 + }, + { + "epoch": 8.68352365415987, + "grad_norm": 0.0070259906351566315, + "learning_rate": 0.0006967013745024573, + "loss": 0.0676, + "num_input_tokens_seen": 115010192, + "step": 53230 + }, + { + "epoch": 8.684339314845024, + "grad_norm": 0.019295837730169296, + "learning_rate": 0.0006966359321837792, + "loss": 0.1062, + "num_input_tokens_seen": 115021072, + "step": 53235 + }, + { + "epoch": 8.68515497553018, + "grad_norm": 0.24232225120067596, + "learning_rate": 0.0006965704858800246, + "loss": 0.0964, + "num_input_tokens_seen": 115031760, + "step": 53240 + }, + { + "epoch": 8.685970636215334, + "grad_norm": 0.015037334524095058, + "learning_rate": 0.0006965050355925197, + "loss": 0.1433, + "num_input_tokens_seen": 115042480, + "step": 53245 + }, + { + "epoch": 8.68678629690049, + "grad_norm": 0.02515346184372902, + "learning_rate": 0.000696439581322591, + "loss": 0.0383, + "num_input_tokens_seen": 115054384, + "step": 53250 + }, + { + "epoch": 8.687601957585644, + "grad_norm": 0.1145104393362999, + "learning_rate": 0.000696374123071565, + "loss": 0.0218, + "num_input_tokens_seen": 115064048, + "step": 53255 + }, + { + "epoch": 8.6884176182708, + "grad_norm": 0.03403741493821144, + "learning_rate": 0.0006963086608407683, + "loss": 0.0287, + "num_input_tokens_seen": 115075088, + "step": 53260 + }, + { + "epoch": 8.689233278955955, + "grad_norm": 0.01130970474332571, + "learning_rate": 0.0006962431946315274, + "loss": 0.079, + "num_input_tokens_seen": 115085008, + "step": 53265 + }, + { + "epoch": 8.690048939641109, + "grad_norm": 0.09057468175888062, + "learning_rate": 0.0006961777244451694, + "loss": 0.1347, + "num_input_tokens_seen": 115096688, + "step": 53270 + }, + { + "epoch": 8.690864600326265, + "grad_norm": 0.05597497150301933, + "learning_rate": 0.0006961122502830208, + "loss": 0.0818, + "num_input_tokens_seen": 115108080, + "step": 53275 + }, + { + "epoch": 8.691680261011419, + "grad_norm": 0.24031208455562592, + "learning_rate": 0.0006960467721464086, + "loss": 0.0754, + "num_input_tokens_seen": 115118864, + "step": 53280 + }, + { + "epoch": 8.692495921696574, + "grad_norm": 0.22548261284828186, + "learning_rate": 0.00069598129003666, + "loss": 0.052, + "num_input_tokens_seen": 115128944, + "step": 53285 + }, + { + "epoch": 8.69331158238173, + "grad_norm": 0.1766250878572464, + "learning_rate": 0.0006959158039551019, + "loss": 0.0616, + "num_input_tokens_seen": 115139600, + "step": 53290 + }, + { + "epoch": 8.694127243066884, + "grad_norm": 0.007115835323929787, + "learning_rate": 0.0006958503139030616, + "loss": 0.0235, + "num_input_tokens_seen": 115149552, + "step": 53295 + }, + { + "epoch": 8.69494290375204, + "grad_norm": 0.05052676051855087, + "learning_rate": 0.0006957848198818661, + "loss": 0.0436, + "num_input_tokens_seen": 115160624, + "step": 53300 + }, + { + "epoch": 8.695758564437194, + "grad_norm": 0.0866311639547348, + "learning_rate": 0.0006957193218928429, + "loss": 0.1183, + "num_input_tokens_seen": 115172304, + "step": 53305 + }, + { + "epoch": 8.69657422512235, + "grad_norm": 0.2571766972541809, + "learning_rate": 0.0006956538199373194, + "loss": 0.0922, + "num_input_tokens_seen": 115183824, + "step": 53310 + }, + { + "epoch": 8.697389885807503, + "grad_norm": 0.02091350592672825, + "learning_rate": 0.000695588314016623, + "loss": 0.0278, + "num_input_tokens_seen": 115194832, + "step": 53315 + }, + { + "epoch": 8.698205546492659, + "grad_norm": 0.06575371325016022, + "learning_rate": 0.0006955228041320811, + "loss": 0.0135, + "num_input_tokens_seen": 115204528, + "step": 53320 + }, + { + "epoch": 8.699021207177815, + "grad_norm": 0.03311387449502945, + "learning_rate": 0.0006954572902850218, + "loss": 0.0177, + "num_input_tokens_seen": 115213840, + "step": 53325 + }, + { + "epoch": 8.699836867862969, + "grad_norm": 0.01601443998515606, + "learning_rate": 0.0006953917724767724, + "loss": 0.0281, + "num_input_tokens_seen": 115223600, + "step": 53330 + }, + { + "epoch": 8.700652528548124, + "grad_norm": 0.15040957927703857, + "learning_rate": 0.0006953262507086611, + "loss": 0.0961, + "num_input_tokens_seen": 115233168, + "step": 53335 + }, + { + "epoch": 8.701468189233278, + "grad_norm": 0.03878360241651535, + "learning_rate": 0.0006952607249820153, + "loss": 0.0101, + "num_input_tokens_seen": 115244592, + "step": 53340 + }, + { + "epoch": 8.702283849918434, + "grad_norm": 0.2561344504356384, + "learning_rate": 0.0006951951952981631, + "loss": 0.2158, + "num_input_tokens_seen": 115254192, + "step": 53345 + }, + { + "epoch": 8.70309951060359, + "grad_norm": 0.05164014548063278, + "learning_rate": 0.0006951296616584329, + "loss": 0.0431, + "num_input_tokens_seen": 115266384, + "step": 53350 + }, + { + "epoch": 8.703915171288743, + "grad_norm": 0.021563317626714706, + "learning_rate": 0.0006950641240641524, + "loss": 0.0155, + "num_input_tokens_seen": 115277776, + "step": 53355 + }, + { + "epoch": 8.7047308319739, + "grad_norm": 0.8428294658660889, + "learning_rate": 0.0006949985825166501, + "loss": 0.0516, + "num_input_tokens_seen": 115289168, + "step": 53360 + }, + { + "epoch": 8.705546492659053, + "grad_norm": 0.010905325412750244, + "learning_rate": 0.0006949330370172541, + "loss": 0.1883, + "num_input_tokens_seen": 115299216, + "step": 53365 + }, + { + "epoch": 8.706362153344209, + "grad_norm": 0.00984366238117218, + "learning_rate": 0.0006948674875672927, + "loss": 0.0573, + "num_input_tokens_seen": 115309968, + "step": 53370 + }, + { + "epoch": 8.707177814029365, + "grad_norm": 0.01519181951880455, + "learning_rate": 0.0006948019341680945, + "loss": 0.0492, + "num_input_tokens_seen": 115321264, + "step": 53375 + }, + { + "epoch": 8.707993474714518, + "grad_norm": 0.015593461692333221, + "learning_rate": 0.0006947363768209882, + "loss": 0.1021, + "num_input_tokens_seen": 115331472, + "step": 53380 + }, + { + "epoch": 8.708809135399674, + "grad_norm": 0.15758588910102844, + "learning_rate": 0.000694670815527302, + "loss": 0.0611, + "num_input_tokens_seen": 115342480, + "step": 53385 + }, + { + "epoch": 8.709624796084828, + "grad_norm": 0.006150087807327509, + "learning_rate": 0.0006946052502883648, + "loss": 0.0425, + "num_input_tokens_seen": 115351184, + "step": 53390 + }, + { + "epoch": 8.710440456769984, + "grad_norm": 0.12416129559278488, + "learning_rate": 0.0006945396811055053, + "loss": 0.0198, + "num_input_tokens_seen": 115363152, + "step": 53395 + }, + { + "epoch": 8.71125611745514, + "grad_norm": 0.05409393459558487, + "learning_rate": 0.0006944741079800525, + "loss": 0.0436, + "num_input_tokens_seen": 115373648, + "step": 53400 + }, + { + "epoch": 8.712071778140293, + "grad_norm": 0.39778071641921997, + "learning_rate": 0.000694408530913335, + "loss": 0.1524, + "num_input_tokens_seen": 115384560, + "step": 53405 + }, + { + "epoch": 8.71288743882545, + "grad_norm": 0.0031305132433772087, + "learning_rate": 0.0006943429499066821, + "loss": 0.0084, + "num_input_tokens_seen": 115396272, + "step": 53410 + }, + { + "epoch": 8.713703099510603, + "grad_norm": 0.008841861970722675, + "learning_rate": 0.0006942773649614228, + "loss": 0.0066, + "num_input_tokens_seen": 115407600, + "step": 53415 + }, + { + "epoch": 8.714518760195759, + "grad_norm": 0.337829053401947, + "learning_rate": 0.0006942117760788862, + "loss": 0.1059, + "num_input_tokens_seen": 115418288, + "step": 53420 + }, + { + "epoch": 8.715334420880914, + "grad_norm": 0.22501158714294434, + "learning_rate": 0.0006941461832604017, + "loss": 0.1335, + "num_input_tokens_seen": 115428592, + "step": 53425 + }, + { + "epoch": 8.716150081566068, + "grad_norm": 0.2836399972438812, + "learning_rate": 0.0006940805865072984, + "loss": 0.3035, + "num_input_tokens_seen": 115439824, + "step": 53430 + }, + { + "epoch": 8.716965742251224, + "grad_norm": 0.30257099866867065, + "learning_rate": 0.0006940149858209058, + "loss": 0.1856, + "num_input_tokens_seen": 115450960, + "step": 53435 + }, + { + "epoch": 8.717781402936378, + "grad_norm": 0.18512238562107086, + "learning_rate": 0.0006939493812025534, + "loss": 0.0894, + "num_input_tokens_seen": 115461584, + "step": 53440 + }, + { + "epoch": 8.718597063621534, + "grad_norm": 0.05251099169254303, + "learning_rate": 0.0006938837726535707, + "loss": 0.0768, + "num_input_tokens_seen": 115470992, + "step": 53445 + }, + { + "epoch": 8.719412724306688, + "grad_norm": 0.28675875067710876, + "learning_rate": 0.0006938181601752873, + "loss": 0.0591, + "num_input_tokens_seen": 115482736, + "step": 53450 + }, + { + "epoch": 8.720228384991843, + "grad_norm": 0.03428531438112259, + "learning_rate": 0.0006937525437690332, + "loss": 0.0197, + "num_input_tokens_seen": 115493680, + "step": 53455 + }, + { + "epoch": 8.721044045676999, + "grad_norm": 0.007960710674524307, + "learning_rate": 0.0006936869234361379, + "loss": 0.064, + "num_input_tokens_seen": 115503568, + "step": 53460 + }, + { + "epoch": 8.721859706362153, + "grad_norm": 0.025951100513339043, + "learning_rate": 0.0006936212991779314, + "loss": 0.0374, + "num_input_tokens_seen": 115514512, + "step": 53465 + }, + { + "epoch": 8.722675367047309, + "grad_norm": 0.31575798988342285, + "learning_rate": 0.0006935556709957437, + "loss": 0.0587, + "num_input_tokens_seen": 115525840, + "step": 53470 + }, + { + "epoch": 8.723491027732463, + "grad_norm": 0.0034322626888751984, + "learning_rate": 0.0006934900388909048, + "loss": 0.0936, + "num_input_tokens_seen": 115535856, + "step": 53475 + }, + { + "epoch": 8.724306688417618, + "grad_norm": 0.011164214462041855, + "learning_rate": 0.0006934244028647447, + "loss": 0.0251, + "num_input_tokens_seen": 115547376, + "step": 53480 + }, + { + "epoch": 8.725122349102774, + "grad_norm": 0.27567970752716064, + "learning_rate": 0.0006933587629185938, + "loss": 0.1887, + "num_input_tokens_seen": 115557200, + "step": 53485 + }, + { + "epoch": 8.725938009787928, + "grad_norm": 0.23501868546009064, + "learning_rate": 0.0006932931190537822, + "loss": 0.1783, + "num_input_tokens_seen": 115568048, + "step": 53490 + }, + { + "epoch": 8.726753670473084, + "grad_norm": 0.017488988116383553, + "learning_rate": 0.0006932274712716405, + "loss": 0.0237, + "num_input_tokens_seen": 115578736, + "step": 53495 + }, + { + "epoch": 8.727569331158238, + "grad_norm": 0.025671212002635002, + "learning_rate": 0.0006931618195734988, + "loss": 0.0542, + "num_input_tokens_seen": 115588656, + "step": 53500 + }, + { + "epoch": 8.728384991843393, + "grad_norm": 0.30542680621147156, + "learning_rate": 0.0006930961639606878, + "loss": 0.0637, + "num_input_tokens_seen": 115599280, + "step": 53505 + }, + { + "epoch": 8.729200652528547, + "grad_norm": 0.04574590176343918, + "learning_rate": 0.0006930305044345381, + "loss": 0.0612, + "num_input_tokens_seen": 115610544, + "step": 53510 + }, + { + "epoch": 8.730016313213703, + "grad_norm": 0.016665812581777573, + "learning_rate": 0.0006929648409963802, + "loss": 0.0437, + "num_input_tokens_seen": 115621104, + "step": 53515 + }, + { + "epoch": 8.730831973898859, + "grad_norm": 0.06146889925003052, + "learning_rate": 0.0006928991736475452, + "loss": 0.0719, + "num_input_tokens_seen": 115633104, + "step": 53520 + }, + { + "epoch": 8.731647634584013, + "grad_norm": 0.6150043606758118, + "learning_rate": 0.0006928335023893637, + "loss": 0.0762, + "num_input_tokens_seen": 115644048, + "step": 53525 + }, + { + "epoch": 8.732463295269168, + "grad_norm": 0.028153996914625168, + "learning_rate": 0.0006927678272231667, + "loss": 0.0251, + "num_input_tokens_seen": 115656304, + "step": 53530 + }, + { + "epoch": 8.733278955954322, + "grad_norm": 0.07091391086578369, + "learning_rate": 0.0006927021481502851, + "loss": 0.0403, + "num_input_tokens_seen": 115667120, + "step": 53535 + }, + { + "epoch": 8.734094616639478, + "grad_norm": 0.049456529319286346, + "learning_rate": 0.0006926364651720499, + "loss": 0.0772, + "num_input_tokens_seen": 115676880, + "step": 53540 + }, + { + "epoch": 8.734910277324634, + "grad_norm": 0.32859930396080017, + "learning_rate": 0.0006925707782897925, + "loss": 0.2128, + "num_input_tokens_seen": 115687344, + "step": 53545 + }, + { + "epoch": 8.735725938009788, + "grad_norm": 0.02055169828236103, + "learning_rate": 0.000692505087504844, + "loss": 0.0476, + "num_input_tokens_seen": 115697936, + "step": 53550 + }, + { + "epoch": 8.736541598694943, + "grad_norm": 0.016683807596564293, + "learning_rate": 0.0006924393928185354, + "loss": 0.0219, + "num_input_tokens_seen": 115707056, + "step": 53555 + }, + { + "epoch": 8.737357259380097, + "grad_norm": 0.07922355085611343, + "learning_rate": 0.0006923736942321987, + "loss": 0.0324, + "num_input_tokens_seen": 115718384, + "step": 53560 + }, + { + "epoch": 8.738172920065253, + "grad_norm": 0.04225243628025055, + "learning_rate": 0.0006923079917471648, + "loss": 0.0664, + "num_input_tokens_seen": 115729168, + "step": 53565 + }, + { + "epoch": 8.738988580750409, + "grad_norm": 0.047049473971128464, + "learning_rate": 0.0006922422853647656, + "loss": 0.1753, + "num_input_tokens_seen": 115739344, + "step": 53570 + }, + { + "epoch": 8.739804241435563, + "grad_norm": 0.061708804219961166, + "learning_rate": 0.0006921765750863327, + "loss": 0.0559, + "num_input_tokens_seen": 115749360, + "step": 53575 + }, + { + "epoch": 8.740619902120718, + "grad_norm": 0.3257828652858734, + "learning_rate": 0.0006921108609131976, + "loss": 0.0648, + "num_input_tokens_seen": 115759664, + "step": 53580 + }, + { + "epoch": 8.741435562805872, + "grad_norm": 0.10550856590270996, + "learning_rate": 0.0006920451428466923, + "loss": 0.0411, + "num_input_tokens_seen": 115770192, + "step": 53585 + }, + { + "epoch": 8.742251223491028, + "grad_norm": 0.037641484290361404, + "learning_rate": 0.0006919794208881486, + "loss": 0.0312, + "num_input_tokens_seen": 115782000, + "step": 53590 + }, + { + "epoch": 8.743066884176184, + "grad_norm": 0.07037726789712906, + "learning_rate": 0.0006919136950388982, + "loss": 0.033, + "num_input_tokens_seen": 115792560, + "step": 53595 + }, + { + "epoch": 8.743882544861338, + "grad_norm": 0.09038639813661575, + "learning_rate": 0.0006918479653002734, + "loss": 0.0259, + "num_input_tokens_seen": 115803248, + "step": 53600 + }, + { + "epoch": 8.744698205546493, + "grad_norm": 0.26753684878349304, + "learning_rate": 0.0006917822316736062, + "loss": 0.2313, + "num_input_tokens_seen": 115813776, + "step": 53605 + }, + { + "epoch": 8.745513866231647, + "grad_norm": 0.004256491083651781, + "learning_rate": 0.0006917164941602289, + "loss": 0.215, + "num_input_tokens_seen": 115824144, + "step": 53610 + }, + { + "epoch": 8.746329526916803, + "grad_norm": 0.49793606996536255, + "learning_rate": 0.0006916507527614735, + "loss": 0.1397, + "num_input_tokens_seen": 115834896, + "step": 53615 + }, + { + "epoch": 8.747145187601957, + "grad_norm": 0.03849693387746811, + "learning_rate": 0.0006915850074786725, + "loss": 0.0755, + "num_input_tokens_seen": 115844976, + "step": 53620 + }, + { + "epoch": 8.747960848287113, + "grad_norm": 0.1075640395283699, + "learning_rate": 0.0006915192583131582, + "loss": 0.0565, + "num_input_tokens_seen": 115856752, + "step": 53625 + }, + { + "epoch": 8.748776508972268, + "grad_norm": 0.014227380976080894, + "learning_rate": 0.0006914535052662633, + "loss": 0.032, + "num_input_tokens_seen": 115866064, + "step": 53630 + }, + { + "epoch": 8.749592169657422, + "grad_norm": 0.07204482704401016, + "learning_rate": 0.0006913877483393202, + "loss": 0.1358, + "num_input_tokens_seen": 115875856, + "step": 53635 + }, + { + "epoch": 8.750407830342578, + "grad_norm": 0.03224179521203041, + "learning_rate": 0.0006913219875336616, + "loss": 0.1238, + "num_input_tokens_seen": 115886608, + "step": 53640 + }, + { + "epoch": 8.751223491027732, + "grad_norm": 0.02999437227845192, + "learning_rate": 0.0006912562228506201, + "loss": 0.0118, + "num_input_tokens_seen": 115898672, + "step": 53645 + }, + { + "epoch": 8.752039151712887, + "grad_norm": 0.012919296510517597, + "learning_rate": 0.0006911904542915288, + "loss": 0.1489, + "num_input_tokens_seen": 115910448, + "step": 53650 + }, + { + "epoch": 8.752854812398043, + "grad_norm": 0.3587695360183716, + "learning_rate": 0.0006911246818577201, + "loss": 0.0957, + "num_input_tokens_seen": 115920944, + "step": 53655 + }, + { + "epoch": 8.753670473083197, + "grad_norm": 0.17896397411823273, + "learning_rate": 0.0006910589055505275, + "loss": 0.0861, + "num_input_tokens_seen": 115930480, + "step": 53660 + }, + { + "epoch": 8.754486133768353, + "grad_norm": 0.09359142929315567, + "learning_rate": 0.0006909931253712838, + "loss": 0.0512, + "num_input_tokens_seen": 115940144, + "step": 53665 + }, + { + "epoch": 8.755301794453507, + "grad_norm": 0.3921225965023041, + "learning_rate": 0.0006909273413213222, + "loss": 0.1149, + "num_input_tokens_seen": 115951216, + "step": 53670 + }, + { + "epoch": 8.756117455138662, + "grad_norm": 0.1849663108587265, + "learning_rate": 0.0006908615534019757, + "loss": 0.0598, + "num_input_tokens_seen": 115961680, + "step": 53675 + }, + { + "epoch": 8.756933115823816, + "grad_norm": 0.07306811958551407, + "learning_rate": 0.0006907957616145777, + "loss": 0.0202, + "num_input_tokens_seen": 115972720, + "step": 53680 + }, + { + "epoch": 8.757748776508972, + "grad_norm": 0.13053929805755615, + "learning_rate": 0.0006907299659604613, + "loss": 0.0811, + "num_input_tokens_seen": 115983632, + "step": 53685 + }, + { + "epoch": 8.758564437194128, + "grad_norm": 0.265766441822052, + "learning_rate": 0.0006906641664409605, + "loss": 0.1375, + "num_input_tokens_seen": 115993552, + "step": 53690 + }, + { + "epoch": 8.759380097879282, + "grad_norm": 0.018430359661579132, + "learning_rate": 0.0006905983630574084, + "loss": 0.0442, + "num_input_tokens_seen": 116004752, + "step": 53695 + }, + { + "epoch": 8.760195758564437, + "grad_norm": 0.03611930087208748, + "learning_rate": 0.0006905325558111389, + "loss": 0.0762, + "num_input_tokens_seen": 116015056, + "step": 53700 + }, + { + "epoch": 8.761011419249591, + "grad_norm": 0.023959677666425705, + "learning_rate": 0.0006904667447034851, + "loss": 0.0373, + "num_input_tokens_seen": 116026512, + "step": 53705 + }, + { + "epoch": 8.761827079934747, + "grad_norm": 0.005680852569639683, + "learning_rate": 0.0006904009297357814, + "loss": 0.0173, + "num_input_tokens_seen": 116036144, + "step": 53710 + }, + { + "epoch": 8.762642740619903, + "grad_norm": 0.03747767210006714, + "learning_rate": 0.000690335110909361, + "loss": 0.0393, + "num_input_tokens_seen": 116047056, + "step": 53715 + }, + { + "epoch": 8.763458401305057, + "grad_norm": 0.01141283754259348, + "learning_rate": 0.0006902692882255583, + "loss": 0.013, + "num_input_tokens_seen": 116057360, + "step": 53720 + }, + { + "epoch": 8.764274061990212, + "grad_norm": 0.039775773882865906, + "learning_rate": 0.0006902034616857073, + "loss": 0.0694, + "num_input_tokens_seen": 116066992, + "step": 53725 + }, + { + "epoch": 8.765089722675366, + "grad_norm": 0.12590359151363373, + "learning_rate": 0.0006901376312911416, + "loss": 0.0809, + "num_input_tokens_seen": 116079344, + "step": 53730 + }, + { + "epoch": 8.765905383360522, + "grad_norm": 0.011169610545039177, + "learning_rate": 0.0006900717970431956, + "loss": 0.0129, + "num_input_tokens_seen": 116089648, + "step": 53735 + }, + { + "epoch": 8.766721044045678, + "grad_norm": 0.31761443614959717, + "learning_rate": 0.0006900059589432036, + "loss": 0.1355, + "num_input_tokens_seen": 116100656, + "step": 53740 + }, + { + "epoch": 8.767536704730832, + "grad_norm": 0.004789343569427729, + "learning_rate": 0.0006899401169924997, + "loss": 0.0708, + "num_input_tokens_seen": 116111184, + "step": 53745 + }, + { + "epoch": 8.768352365415987, + "grad_norm": 0.18085156381130219, + "learning_rate": 0.0006898742711924185, + "loss": 0.0906, + "num_input_tokens_seen": 116121648, + "step": 53750 + }, + { + "epoch": 8.769168026101141, + "grad_norm": 0.19203798472881317, + "learning_rate": 0.0006898084215442942, + "loss": 0.3073, + "num_input_tokens_seen": 116132144, + "step": 53755 + }, + { + "epoch": 8.769983686786297, + "grad_norm": 0.005764484871178865, + "learning_rate": 0.0006897425680494616, + "loss": 0.0732, + "num_input_tokens_seen": 116144176, + "step": 53760 + }, + { + "epoch": 8.770799347471453, + "grad_norm": 0.005528958048671484, + "learning_rate": 0.000689676710709255, + "loss": 0.125, + "num_input_tokens_seen": 116154704, + "step": 53765 + }, + { + "epoch": 8.771615008156607, + "grad_norm": 0.018419597297906876, + "learning_rate": 0.0006896108495250092, + "loss": 0.0443, + "num_input_tokens_seen": 116165904, + "step": 53770 + }, + { + "epoch": 8.772430668841762, + "grad_norm": 0.18632349371910095, + "learning_rate": 0.0006895449844980592, + "loss": 0.0617, + "num_input_tokens_seen": 116177040, + "step": 53775 + }, + { + "epoch": 8.773246329526916, + "grad_norm": 0.012830687686800957, + "learning_rate": 0.0006894791156297394, + "loss": 0.072, + "num_input_tokens_seen": 116187248, + "step": 53780 + }, + { + "epoch": 8.774061990212072, + "grad_norm": 0.002509468700736761, + "learning_rate": 0.0006894132429213851, + "loss": 0.0209, + "num_input_tokens_seen": 116198640, + "step": 53785 + }, + { + "epoch": 8.774877650897226, + "grad_norm": 0.037770144641399384, + "learning_rate": 0.0006893473663743311, + "loss": 0.0382, + "num_input_tokens_seen": 116208944, + "step": 53790 + }, + { + "epoch": 8.775693311582382, + "grad_norm": 0.1476047933101654, + "learning_rate": 0.0006892814859899126, + "loss": 0.0277, + "num_input_tokens_seen": 116220784, + "step": 53795 + }, + { + "epoch": 8.776508972267537, + "grad_norm": 0.004433739464730024, + "learning_rate": 0.0006892156017694646, + "loss": 0.0761, + "num_input_tokens_seen": 116231600, + "step": 53800 + }, + { + "epoch": 8.777324632952691, + "grad_norm": 0.027068404480814934, + "learning_rate": 0.0006891497137143224, + "loss": 0.1049, + "num_input_tokens_seen": 116243280, + "step": 53805 + }, + { + "epoch": 8.778140293637847, + "grad_norm": 0.008436436764895916, + "learning_rate": 0.0006890838218258213, + "loss": 0.0582, + "num_input_tokens_seen": 116253584, + "step": 53810 + }, + { + "epoch": 8.778955954323001, + "grad_norm": 0.017870329320430756, + "learning_rate": 0.0006890179261052967, + "loss": 0.0358, + "num_input_tokens_seen": 116263824, + "step": 53815 + }, + { + "epoch": 8.779771615008157, + "grad_norm": 0.023240847513079643, + "learning_rate": 0.000688952026554084, + "loss": 0.0669, + "num_input_tokens_seen": 116274224, + "step": 53820 + }, + { + "epoch": 8.780587275693312, + "grad_norm": 0.018025638535618782, + "learning_rate": 0.0006888861231735186, + "loss": 0.0153, + "num_input_tokens_seen": 116284944, + "step": 53825 + }, + { + "epoch": 8.781402936378466, + "grad_norm": 0.017300186678767204, + "learning_rate": 0.0006888202159649366, + "loss": 0.0943, + "num_input_tokens_seen": 116295696, + "step": 53830 + }, + { + "epoch": 8.782218597063622, + "grad_norm": 0.09028012305498123, + "learning_rate": 0.0006887543049296733, + "loss": 0.0588, + "num_input_tokens_seen": 116306512, + "step": 53835 + }, + { + "epoch": 8.783034257748776, + "grad_norm": 0.06694075465202332, + "learning_rate": 0.0006886883900690645, + "loss": 0.1342, + "num_input_tokens_seen": 116316944, + "step": 53840 + }, + { + "epoch": 8.783849918433932, + "grad_norm": 0.02075386978685856, + "learning_rate": 0.0006886224713844461, + "loss": 0.0448, + "num_input_tokens_seen": 116327664, + "step": 53845 + }, + { + "epoch": 8.784665579119086, + "grad_norm": 0.2526707947254181, + "learning_rate": 0.0006885565488771541, + "loss": 0.0893, + "num_input_tokens_seen": 116339184, + "step": 53850 + }, + { + "epoch": 8.785481239804241, + "grad_norm": 0.01189572736620903, + "learning_rate": 0.0006884906225485245, + "loss": 0.0183, + "num_input_tokens_seen": 116349104, + "step": 53855 + }, + { + "epoch": 8.786296900489397, + "grad_norm": 0.13445112109184265, + "learning_rate": 0.0006884246923998932, + "loss": 0.1051, + "num_input_tokens_seen": 116359856, + "step": 53860 + }, + { + "epoch": 8.78711256117455, + "grad_norm": 0.21758858859539032, + "learning_rate": 0.0006883587584325965, + "loss": 0.0645, + "num_input_tokens_seen": 116370672, + "step": 53865 + }, + { + "epoch": 8.787928221859707, + "grad_norm": 0.007030788343399763, + "learning_rate": 0.0006882928206479707, + "loss": 0.042, + "num_input_tokens_seen": 116381648, + "step": 53870 + }, + { + "epoch": 8.78874388254486, + "grad_norm": 0.1571875512599945, + "learning_rate": 0.0006882268790473517, + "loss": 0.1503, + "num_input_tokens_seen": 116392912, + "step": 53875 + }, + { + "epoch": 8.789559543230016, + "grad_norm": 0.05002790316939354, + "learning_rate": 0.0006881609336320764, + "loss": 0.1246, + "num_input_tokens_seen": 116402448, + "step": 53880 + }, + { + "epoch": 8.790375203915172, + "grad_norm": 0.25340625643730164, + "learning_rate": 0.0006880949844034811, + "loss": 0.0462, + "num_input_tokens_seen": 116412496, + "step": 53885 + }, + { + "epoch": 8.791190864600326, + "grad_norm": 0.12071508914232254, + "learning_rate": 0.0006880290313629026, + "loss": 0.0331, + "num_input_tokens_seen": 116422736, + "step": 53890 + }, + { + "epoch": 8.792006525285482, + "grad_norm": 0.3033401370048523, + "learning_rate": 0.0006879630745116769, + "loss": 0.0823, + "num_input_tokens_seen": 116432464, + "step": 53895 + }, + { + "epoch": 8.792822185970635, + "grad_norm": 0.2533930838108063, + "learning_rate": 0.0006878971138511412, + "loss": 0.0436, + "num_input_tokens_seen": 116442224, + "step": 53900 + }, + { + "epoch": 8.793637846655791, + "grad_norm": 0.11267364770174026, + "learning_rate": 0.000687831149382632, + "loss": 0.1089, + "num_input_tokens_seen": 116454352, + "step": 53905 + }, + { + "epoch": 8.794453507340947, + "grad_norm": 0.021168632432818413, + "learning_rate": 0.0006877651811074863, + "loss": 0.064, + "num_input_tokens_seen": 116465264, + "step": 53910 + }, + { + "epoch": 8.7952691680261, + "grad_norm": 0.05451773852109909, + "learning_rate": 0.0006876992090270411, + "loss": 0.0697, + "num_input_tokens_seen": 116475760, + "step": 53915 + }, + { + "epoch": 8.796084828711257, + "grad_norm": 0.060790155082941055, + "learning_rate": 0.0006876332331426332, + "loss": 0.1321, + "num_input_tokens_seen": 116485744, + "step": 53920 + }, + { + "epoch": 8.79690048939641, + "grad_norm": 0.06983616203069687, + "learning_rate": 0.0006875672534556, + "loss": 0.0615, + "num_input_tokens_seen": 116497488, + "step": 53925 + }, + { + "epoch": 8.797716150081566, + "grad_norm": 0.20969927310943604, + "learning_rate": 0.0006875012699672783, + "loss": 0.1099, + "num_input_tokens_seen": 116507728, + "step": 53930 + }, + { + "epoch": 8.798531810766722, + "grad_norm": 0.0036327510606497526, + "learning_rate": 0.0006874352826790055, + "loss": 0.1238, + "num_input_tokens_seen": 116517552, + "step": 53935 + }, + { + "epoch": 8.799347471451876, + "grad_norm": 0.050834622234106064, + "learning_rate": 0.000687369291592119, + "loss": 0.0128, + "num_input_tokens_seen": 116528144, + "step": 53940 + }, + { + "epoch": 8.800163132137031, + "grad_norm": 0.22240811586380005, + "learning_rate": 0.0006873032967079561, + "loss": 0.1287, + "num_input_tokens_seen": 116537968, + "step": 53945 + }, + { + "epoch": 8.800978792822185, + "grad_norm": 0.10322391241788864, + "learning_rate": 0.0006872372980278543, + "loss": 0.0811, + "num_input_tokens_seen": 116548208, + "step": 53950 + }, + { + "epoch": 8.801794453507341, + "grad_norm": 0.027097368612885475, + "learning_rate": 0.0006871712955531511, + "loss": 0.0971, + "num_input_tokens_seen": 116558480, + "step": 53955 + }, + { + "epoch": 8.802610114192497, + "grad_norm": 0.23474834859371185, + "learning_rate": 0.0006871052892851842, + "loss": 0.2162, + "num_input_tokens_seen": 116569776, + "step": 53960 + }, + { + "epoch": 8.80342577487765, + "grad_norm": 0.02830558642745018, + "learning_rate": 0.0006870392792252911, + "loss": 0.0259, + "num_input_tokens_seen": 116580144, + "step": 53965 + }, + { + "epoch": 8.804241435562806, + "grad_norm": 0.010453589260578156, + "learning_rate": 0.0006869732653748096, + "loss": 0.0772, + "num_input_tokens_seen": 116590064, + "step": 53970 + }, + { + "epoch": 8.80505709624796, + "grad_norm": 0.17776353657245636, + "learning_rate": 0.000686907247735078, + "loss": 0.0678, + "num_input_tokens_seen": 116600080, + "step": 53975 + }, + { + "epoch": 8.805872756933116, + "grad_norm": 0.006603384390473366, + "learning_rate": 0.0006868412263074337, + "loss": 0.0302, + "num_input_tokens_seen": 116610160, + "step": 53980 + }, + { + "epoch": 8.80668841761827, + "grad_norm": 0.020314160734415054, + "learning_rate": 0.0006867752010932151, + "loss": 0.0417, + "num_input_tokens_seen": 116620688, + "step": 53985 + }, + { + "epoch": 8.807504078303426, + "grad_norm": 0.13206049799919128, + "learning_rate": 0.00068670917209376, + "loss": 0.1469, + "num_input_tokens_seen": 116630064, + "step": 53990 + }, + { + "epoch": 8.808319738988581, + "grad_norm": 0.08112114667892456, + "learning_rate": 0.0006866431393104067, + "loss": 0.0818, + "num_input_tokens_seen": 116641520, + "step": 53995 + }, + { + "epoch": 8.809135399673735, + "grad_norm": 0.3721643388271332, + "learning_rate": 0.0006865771027444933, + "loss": 0.2034, + "num_input_tokens_seen": 116653456, + "step": 54000 + }, + { + "epoch": 8.809951060358891, + "grad_norm": 0.18222694098949432, + "learning_rate": 0.0006865110623973585, + "loss": 0.1046, + "num_input_tokens_seen": 116663408, + "step": 54005 + }, + { + "epoch": 8.810766721044045, + "grad_norm": 0.5798073410987854, + "learning_rate": 0.0006864450182703403, + "loss": 0.0734, + "num_input_tokens_seen": 116674256, + "step": 54010 + }, + { + "epoch": 8.8115823817292, + "grad_norm": 0.06881486624479294, + "learning_rate": 0.0006863789703647771, + "loss": 0.0776, + "num_input_tokens_seen": 116685552, + "step": 54015 + }, + { + "epoch": 8.812398042414356, + "grad_norm": 0.022122984752058983, + "learning_rate": 0.0006863129186820079, + "loss": 0.1005, + "num_input_tokens_seen": 116695824, + "step": 54020 + }, + { + "epoch": 8.81321370309951, + "grad_norm": 0.03289042413234711, + "learning_rate": 0.0006862468632233709, + "loss": 0.0524, + "num_input_tokens_seen": 116706128, + "step": 54025 + }, + { + "epoch": 8.814029363784666, + "grad_norm": 0.07637394964694977, + "learning_rate": 0.000686180803990205, + "loss": 0.1307, + "num_input_tokens_seen": 116715728, + "step": 54030 + }, + { + "epoch": 8.81484502446982, + "grad_norm": 0.016515476629137993, + "learning_rate": 0.0006861147409838489, + "loss": 0.0353, + "num_input_tokens_seen": 116726992, + "step": 54035 + }, + { + "epoch": 8.815660685154976, + "grad_norm": 0.3054102063179016, + "learning_rate": 0.0006860486742056415, + "loss": 0.1445, + "num_input_tokens_seen": 116737968, + "step": 54040 + }, + { + "epoch": 8.81647634584013, + "grad_norm": 0.007263507228344679, + "learning_rate": 0.0006859826036569216, + "loss": 0.0922, + "num_input_tokens_seen": 116748240, + "step": 54045 + }, + { + "epoch": 8.817292006525285, + "grad_norm": 0.1208798810839653, + "learning_rate": 0.0006859165293390284, + "loss": 0.0555, + "num_input_tokens_seen": 116759504, + "step": 54050 + }, + { + "epoch": 8.818107667210441, + "grad_norm": 0.22497281432151794, + "learning_rate": 0.0006858504512533008, + "loss": 0.0817, + "num_input_tokens_seen": 116770768, + "step": 54055 + }, + { + "epoch": 8.818923327895595, + "grad_norm": 0.1389174610376358, + "learning_rate": 0.000685784369401078, + "loss": 0.0579, + "num_input_tokens_seen": 116782352, + "step": 54060 + }, + { + "epoch": 8.81973898858075, + "grad_norm": 0.3582153916358948, + "learning_rate": 0.0006857182837836994, + "loss": 0.1186, + "num_input_tokens_seen": 116793712, + "step": 54065 + }, + { + "epoch": 8.820554649265905, + "grad_norm": 0.004126532934606075, + "learning_rate": 0.0006856521944025041, + "loss": 0.0268, + "num_input_tokens_seen": 116804816, + "step": 54070 + }, + { + "epoch": 8.82137030995106, + "grad_norm": 0.0762988030910492, + "learning_rate": 0.0006855861012588316, + "loss": 0.0666, + "num_input_tokens_seen": 116815824, + "step": 54075 + }, + { + "epoch": 8.822185970636216, + "grad_norm": 0.12311746925115585, + "learning_rate": 0.0006855200043540213, + "loss": 0.0263, + "num_input_tokens_seen": 116826960, + "step": 54080 + }, + { + "epoch": 8.82300163132137, + "grad_norm": 0.004039400722831488, + "learning_rate": 0.0006854539036894128, + "loss": 0.0109, + "num_input_tokens_seen": 116837680, + "step": 54085 + }, + { + "epoch": 8.823817292006526, + "grad_norm": 0.060211095958948135, + "learning_rate": 0.0006853877992663456, + "loss": 0.1247, + "num_input_tokens_seen": 116847120, + "step": 54090 + }, + { + "epoch": 8.82463295269168, + "grad_norm": 0.01676938310265541, + "learning_rate": 0.0006853216910861595, + "loss": 0.0374, + "num_input_tokens_seen": 116857456, + "step": 54095 + }, + { + "epoch": 8.825448613376835, + "grad_norm": 0.04738698527216911, + "learning_rate": 0.0006852555791501942, + "loss": 0.0222, + "num_input_tokens_seen": 116869264, + "step": 54100 + }, + { + "epoch": 8.826264274061991, + "grad_norm": 0.26809951663017273, + "learning_rate": 0.0006851894634597898, + "loss": 0.1946, + "num_input_tokens_seen": 116879504, + "step": 54105 + }, + { + "epoch": 8.827079934747145, + "grad_norm": 0.06636186689138412, + "learning_rate": 0.0006851233440162858, + "loss": 0.1837, + "num_input_tokens_seen": 116889808, + "step": 54110 + }, + { + "epoch": 8.8278955954323, + "grad_norm": 0.048026785254478455, + "learning_rate": 0.0006850572208210223, + "loss": 0.0446, + "num_input_tokens_seen": 116900816, + "step": 54115 + }, + { + "epoch": 8.828711256117455, + "grad_norm": 0.010122005827724934, + "learning_rate": 0.0006849910938753396, + "loss": 0.2698, + "num_input_tokens_seen": 116912208, + "step": 54120 + }, + { + "epoch": 8.82952691680261, + "grad_norm": 0.31122568249702454, + "learning_rate": 0.0006849249631805777, + "loss": 0.0497, + "num_input_tokens_seen": 116922864, + "step": 54125 + }, + { + "epoch": 8.830342577487766, + "grad_norm": 0.12743504345417023, + "learning_rate": 0.0006848588287380769, + "loss": 0.0294, + "num_input_tokens_seen": 116933552, + "step": 54130 + }, + { + "epoch": 8.83115823817292, + "grad_norm": 0.01123537216335535, + "learning_rate": 0.0006847926905491771, + "loss": 0.1134, + "num_input_tokens_seen": 116944208, + "step": 54135 + }, + { + "epoch": 8.831973898858076, + "grad_norm": 0.01716950722038746, + "learning_rate": 0.0006847265486152192, + "loss": 0.0422, + "num_input_tokens_seen": 116955248, + "step": 54140 + }, + { + "epoch": 8.83278955954323, + "grad_norm": 0.07278816401958466, + "learning_rate": 0.0006846604029375435, + "loss": 0.0258, + "num_input_tokens_seen": 116966768, + "step": 54145 + }, + { + "epoch": 8.833605220228385, + "grad_norm": 0.005184966139495373, + "learning_rate": 0.0006845942535174905, + "loss": 0.0346, + "num_input_tokens_seen": 116977392, + "step": 54150 + }, + { + "epoch": 8.83442088091354, + "grad_norm": 0.30484655499458313, + "learning_rate": 0.0006845281003564007, + "loss": 0.1343, + "num_input_tokens_seen": 116988528, + "step": 54155 + }, + { + "epoch": 8.835236541598695, + "grad_norm": 0.14823204278945923, + "learning_rate": 0.0006844619434556149, + "loss": 0.0152, + "num_input_tokens_seen": 116999280, + "step": 54160 + }, + { + "epoch": 8.83605220228385, + "grad_norm": 0.024366330355405807, + "learning_rate": 0.0006843957828164737, + "loss": 0.0304, + "num_input_tokens_seen": 117008368, + "step": 54165 + }, + { + "epoch": 8.836867862969005, + "grad_norm": 0.161958709359169, + "learning_rate": 0.0006843296184403182, + "loss": 0.0811, + "num_input_tokens_seen": 117019472, + "step": 54170 + }, + { + "epoch": 8.83768352365416, + "grad_norm": 0.031379345804452896, + "learning_rate": 0.0006842634503284891, + "loss": 0.0155, + "num_input_tokens_seen": 117029136, + "step": 54175 + }, + { + "epoch": 8.838499184339314, + "grad_norm": 0.030218927189707756, + "learning_rate": 0.0006841972784823274, + "loss": 0.0115, + "num_input_tokens_seen": 117039888, + "step": 54180 + }, + { + "epoch": 8.83931484502447, + "grad_norm": 0.0450090654194355, + "learning_rate": 0.0006841311029031742, + "loss": 0.2128, + "num_input_tokens_seen": 117050224, + "step": 54185 + }, + { + "epoch": 8.840130505709626, + "grad_norm": 0.20938782393932343, + "learning_rate": 0.0006840649235923706, + "loss": 0.0908, + "num_input_tokens_seen": 117061840, + "step": 54190 + }, + { + "epoch": 8.84094616639478, + "grad_norm": 0.0133676053956151, + "learning_rate": 0.0006839987405512577, + "loss": 0.153, + "num_input_tokens_seen": 117073040, + "step": 54195 + }, + { + "epoch": 8.841761827079935, + "grad_norm": 0.22873623669147491, + "learning_rate": 0.000683932553781177, + "loss": 0.157, + "num_input_tokens_seen": 117084240, + "step": 54200 + }, + { + "epoch": 8.84257748776509, + "grad_norm": 0.02595687285065651, + "learning_rate": 0.0006838663632834697, + "loss": 0.0781, + "num_input_tokens_seen": 117095568, + "step": 54205 + }, + { + "epoch": 8.843393148450245, + "grad_norm": 0.20884649455547333, + "learning_rate": 0.0006838001690594775, + "loss": 0.1167, + "num_input_tokens_seen": 117106288, + "step": 54210 + }, + { + "epoch": 8.844208809135399, + "grad_norm": 0.03644806891679764, + "learning_rate": 0.0006837339711105414, + "loss": 0.0528, + "num_input_tokens_seen": 117116912, + "step": 54215 + }, + { + "epoch": 8.845024469820554, + "grad_norm": 0.17615261673927307, + "learning_rate": 0.0006836677694380035, + "loss": 0.2627, + "num_input_tokens_seen": 117128048, + "step": 54220 + }, + { + "epoch": 8.84584013050571, + "grad_norm": 0.27721107006073, + "learning_rate": 0.0006836015640432054, + "loss": 0.0625, + "num_input_tokens_seen": 117139376, + "step": 54225 + }, + { + "epoch": 8.846655791190864, + "grad_norm": 0.003872593864798546, + "learning_rate": 0.0006835353549274885, + "loss": 0.0266, + "num_input_tokens_seen": 117151472, + "step": 54230 + }, + { + "epoch": 8.84747145187602, + "grad_norm": 0.02096593752503395, + "learning_rate": 0.0006834691420921948, + "loss": 0.0404, + "num_input_tokens_seen": 117160848, + "step": 54235 + }, + { + "epoch": 8.848287112561174, + "grad_norm": 0.0623176284134388, + "learning_rate": 0.0006834029255386663, + "loss": 0.0434, + "num_input_tokens_seen": 117171664, + "step": 54240 + }, + { + "epoch": 8.84910277324633, + "grad_norm": 0.01989927887916565, + "learning_rate": 0.0006833367052682446, + "loss": 0.0354, + "num_input_tokens_seen": 117181488, + "step": 54245 + }, + { + "epoch": 8.849918433931485, + "grad_norm": 0.23223379254341125, + "learning_rate": 0.0006832704812822722, + "loss": 0.1932, + "num_input_tokens_seen": 117191984, + "step": 54250 + }, + { + "epoch": 8.850734094616639, + "grad_norm": 0.11960668861865997, + "learning_rate": 0.0006832042535820911, + "loss": 0.046, + "num_input_tokens_seen": 117201776, + "step": 54255 + }, + { + "epoch": 8.851549755301795, + "grad_norm": 0.1948905736207962, + "learning_rate": 0.0006831380221690431, + "loss": 0.093, + "num_input_tokens_seen": 117212944, + "step": 54260 + }, + { + "epoch": 8.852365415986949, + "grad_norm": 0.16176442801952362, + "learning_rate": 0.0006830717870444709, + "loss": 0.0523, + "num_input_tokens_seen": 117223312, + "step": 54265 + }, + { + "epoch": 8.853181076672104, + "grad_norm": 0.09057550132274628, + "learning_rate": 0.0006830055482097168, + "loss": 0.0323, + "num_input_tokens_seen": 117233744, + "step": 54270 + }, + { + "epoch": 8.85399673735726, + "grad_norm": 0.15768930315971375, + "learning_rate": 0.000682939305666123, + "loss": 0.2155, + "num_input_tokens_seen": 117245328, + "step": 54275 + }, + { + "epoch": 8.854812398042414, + "grad_norm": 0.30745288729667664, + "learning_rate": 0.000682873059415032, + "loss": 0.1165, + "num_input_tokens_seen": 117256176, + "step": 54280 + }, + { + "epoch": 8.85562805872757, + "grad_norm": 0.04249805584549904, + "learning_rate": 0.0006828068094577864, + "loss": 0.0557, + "num_input_tokens_seen": 117267664, + "step": 54285 + }, + { + "epoch": 8.856443719412724, + "grad_norm": 0.30030643939971924, + "learning_rate": 0.0006827405557957291, + "loss": 0.0919, + "num_input_tokens_seen": 117277744, + "step": 54290 + }, + { + "epoch": 8.85725938009788, + "grad_norm": 0.11320184171199799, + "learning_rate": 0.0006826742984302026, + "loss": 0.1424, + "num_input_tokens_seen": 117289168, + "step": 54295 + }, + { + "epoch": 8.858075040783035, + "grad_norm": 0.07032129168510437, + "learning_rate": 0.0006826080373625496, + "loss": 0.0498, + "num_input_tokens_seen": 117298896, + "step": 54300 + }, + { + "epoch": 8.858890701468189, + "grad_norm": 0.35693272948265076, + "learning_rate": 0.0006825417725941132, + "loss": 0.1395, + "num_input_tokens_seen": 117310000, + "step": 54305 + }, + { + "epoch": 8.859706362153345, + "grad_norm": 0.2755097448825836, + "learning_rate": 0.0006824755041262361, + "loss": 0.0562, + "num_input_tokens_seen": 117319088, + "step": 54310 + }, + { + "epoch": 8.860522022838499, + "grad_norm": 0.024103153496980667, + "learning_rate": 0.0006824092319602614, + "loss": 0.1487, + "num_input_tokens_seen": 117329552, + "step": 54315 + }, + { + "epoch": 8.861337683523654, + "grad_norm": 0.04700141400098801, + "learning_rate": 0.0006823429560975323, + "loss": 0.066, + "num_input_tokens_seen": 117340336, + "step": 54320 + }, + { + "epoch": 8.86215334420881, + "grad_norm": 0.06457889080047607, + "learning_rate": 0.0006822766765393919, + "loss": 0.0271, + "num_input_tokens_seen": 117352272, + "step": 54325 + }, + { + "epoch": 8.862969004893964, + "grad_norm": 0.10260617733001709, + "learning_rate": 0.0006822103932871832, + "loss": 0.0536, + "num_input_tokens_seen": 117362192, + "step": 54330 + }, + { + "epoch": 8.86378466557912, + "grad_norm": 0.05182819440960884, + "learning_rate": 0.00068214410634225, + "loss": 0.037, + "num_input_tokens_seen": 117373328, + "step": 54335 + }, + { + "epoch": 8.864600326264274, + "grad_norm": 0.4678332209587097, + "learning_rate": 0.0006820778157059353, + "loss": 0.2525, + "num_input_tokens_seen": 117383824, + "step": 54340 + }, + { + "epoch": 8.86541598694943, + "grad_norm": 0.006274137180298567, + "learning_rate": 0.0006820115213795827, + "loss": 0.0557, + "num_input_tokens_seen": 117395856, + "step": 54345 + }, + { + "epoch": 8.866231647634583, + "grad_norm": 0.025420423597097397, + "learning_rate": 0.0006819452233645357, + "loss": 0.0221, + "num_input_tokens_seen": 117406768, + "step": 54350 + }, + { + "epoch": 8.867047308319739, + "grad_norm": 0.025930307805538177, + "learning_rate": 0.0006818789216621379, + "loss": 0.0554, + "num_input_tokens_seen": 117418224, + "step": 54355 + }, + { + "epoch": 8.867862969004895, + "grad_norm": 0.17869673669338226, + "learning_rate": 0.0006818126162737332, + "loss": 0.0779, + "num_input_tokens_seen": 117429776, + "step": 54360 + }, + { + "epoch": 8.868678629690049, + "grad_norm": 0.003503937041386962, + "learning_rate": 0.000681746307200665, + "loss": 0.0338, + "num_input_tokens_seen": 117441296, + "step": 54365 + }, + { + "epoch": 8.869494290375204, + "grad_norm": 0.18694375455379486, + "learning_rate": 0.0006816799944442774, + "loss": 0.1056, + "num_input_tokens_seen": 117452880, + "step": 54370 + }, + { + "epoch": 8.870309951060358, + "grad_norm": 0.08532639592885971, + "learning_rate": 0.0006816136780059142, + "loss": 0.145, + "num_input_tokens_seen": 117464272, + "step": 54375 + }, + { + "epoch": 8.871125611745514, + "grad_norm": 0.09267734736204147, + "learning_rate": 0.0006815473578869194, + "loss": 0.0598, + "num_input_tokens_seen": 117476272, + "step": 54380 + }, + { + "epoch": 8.87194127243067, + "grad_norm": 0.03818744048476219, + "learning_rate": 0.0006814810340886372, + "loss": 0.0146, + "num_input_tokens_seen": 117487504, + "step": 54385 + }, + { + "epoch": 8.872756933115824, + "grad_norm": 0.01406745333224535, + "learning_rate": 0.0006814147066124116, + "loss": 0.0289, + "num_input_tokens_seen": 117497808, + "step": 54390 + }, + { + "epoch": 8.87357259380098, + "grad_norm": 0.03349088132381439, + "learning_rate": 0.0006813483754595867, + "loss": 0.0173, + "num_input_tokens_seen": 117510480, + "step": 54395 + }, + { + "epoch": 8.874388254486133, + "grad_norm": 0.17432676255702972, + "learning_rate": 0.000681282040631507, + "loss": 0.0528, + "num_input_tokens_seen": 117521872, + "step": 54400 + }, + { + "epoch": 8.875203915171289, + "grad_norm": 0.07740618288516998, + "learning_rate": 0.0006812157021295167, + "loss": 0.2157, + "num_input_tokens_seen": 117531760, + "step": 54405 + }, + { + "epoch": 8.876019575856443, + "grad_norm": 0.20691031217575073, + "learning_rate": 0.0006811493599549603, + "loss": 0.0289, + "num_input_tokens_seen": 117541872, + "step": 54410 + }, + { + "epoch": 8.876835236541599, + "grad_norm": 0.12131853401660919, + "learning_rate": 0.0006810830141091825, + "loss": 0.1375, + "num_input_tokens_seen": 117552336, + "step": 54415 + }, + { + "epoch": 8.877650897226754, + "grad_norm": 0.36007246375083923, + "learning_rate": 0.0006810166645935276, + "loss": 0.1905, + "num_input_tokens_seen": 117562704, + "step": 54420 + }, + { + "epoch": 8.878466557911908, + "grad_norm": 0.028180794790387154, + "learning_rate": 0.0006809503114093403, + "loss": 0.0518, + "num_input_tokens_seen": 117574032, + "step": 54425 + }, + { + "epoch": 8.879282218597064, + "grad_norm": 0.009340302087366581, + "learning_rate": 0.0006808839545579655, + "loss": 0.0174, + "num_input_tokens_seen": 117583984, + "step": 54430 + }, + { + "epoch": 8.880097879282218, + "grad_norm": 0.04891321435570717, + "learning_rate": 0.0006808175940407477, + "loss": 0.0297, + "num_input_tokens_seen": 117593200, + "step": 54435 + }, + { + "epoch": 8.880913539967374, + "grad_norm": 0.3209296762943268, + "learning_rate": 0.0006807512298590321, + "loss": 0.0407, + "num_input_tokens_seen": 117604688, + "step": 54440 + }, + { + "epoch": 8.88172920065253, + "grad_norm": 0.3639463186264038, + "learning_rate": 0.0006806848620141636, + "loss": 0.0542, + "num_input_tokens_seen": 117614960, + "step": 54445 + }, + { + "epoch": 8.882544861337683, + "grad_norm": 0.02138776332139969, + "learning_rate": 0.0006806184905074871, + "loss": 0.0273, + "num_input_tokens_seen": 117625616, + "step": 54450 + }, + { + "epoch": 8.883360522022839, + "grad_norm": 0.9454212188720703, + "learning_rate": 0.0006805521153403476, + "loss": 0.1509, + "num_input_tokens_seen": 117635216, + "step": 54455 + }, + { + "epoch": 8.884176182707993, + "grad_norm": 0.02996288612484932, + "learning_rate": 0.0006804857365140906, + "loss": 0.0542, + "num_input_tokens_seen": 117645008, + "step": 54460 + }, + { + "epoch": 8.884991843393149, + "grad_norm": 0.016237854957580566, + "learning_rate": 0.0006804193540300612, + "loss": 0.1137, + "num_input_tokens_seen": 117654992, + "step": 54465 + }, + { + "epoch": 8.885807504078304, + "grad_norm": 0.13553477823734283, + "learning_rate": 0.0006803529678896047, + "loss": 0.0965, + "num_input_tokens_seen": 117665520, + "step": 54470 + }, + { + "epoch": 8.886623164763458, + "grad_norm": 0.21454428136348724, + "learning_rate": 0.0006802865780940663, + "loss": 0.1178, + "num_input_tokens_seen": 117676240, + "step": 54475 + }, + { + "epoch": 8.887438825448614, + "grad_norm": 0.005688629578799009, + "learning_rate": 0.000680220184644792, + "loss": 0.0257, + "num_input_tokens_seen": 117686288, + "step": 54480 + }, + { + "epoch": 8.888254486133768, + "grad_norm": 0.004805149510502815, + "learning_rate": 0.0006801537875431269, + "loss": 0.0261, + "num_input_tokens_seen": 117696528, + "step": 54485 + }, + { + "epoch": 8.889070146818923, + "grad_norm": 0.003945660311728716, + "learning_rate": 0.0006800873867904167, + "loss": 0.0208, + "num_input_tokens_seen": 117707856, + "step": 54490 + }, + { + "epoch": 8.88988580750408, + "grad_norm": 0.011920423246920109, + "learning_rate": 0.0006800209823880072, + "loss": 0.0189, + "num_input_tokens_seen": 117719024, + "step": 54495 + }, + { + "epoch": 8.890701468189233, + "grad_norm": 0.0690336748957634, + "learning_rate": 0.0006799545743372442, + "loss": 0.1799, + "num_input_tokens_seen": 117728528, + "step": 54500 + }, + { + "epoch": 8.891517128874389, + "grad_norm": 0.021867262199521065, + "learning_rate": 0.0006798881626394734, + "loss": 0.0189, + "num_input_tokens_seen": 117738704, + "step": 54505 + }, + { + "epoch": 8.892332789559543, + "grad_norm": 0.005222649779170752, + "learning_rate": 0.0006798217472960407, + "loss": 0.0125, + "num_input_tokens_seen": 117750448, + "step": 54510 + }, + { + "epoch": 8.893148450244698, + "grad_norm": 0.26809030771255493, + "learning_rate": 0.0006797553283082922, + "loss": 0.0942, + "num_input_tokens_seen": 117760848, + "step": 54515 + }, + { + "epoch": 8.893964110929852, + "grad_norm": 0.004433480557054281, + "learning_rate": 0.000679688905677574, + "loss": 0.0352, + "num_input_tokens_seen": 117770960, + "step": 54520 + }, + { + "epoch": 8.894779771615008, + "grad_norm": 0.006452389992773533, + "learning_rate": 0.0006796224794052322, + "loss": 0.0854, + "num_input_tokens_seen": 117781744, + "step": 54525 + }, + { + "epoch": 8.895595432300164, + "grad_norm": 0.13232830166816711, + "learning_rate": 0.0006795560494926129, + "loss": 0.1114, + "num_input_tokens_seen": 117792368, + "step": 54530 + }, + { + "epoch": 8.896411092985318, + "grad_norm": 0.0037049497477710247, + "learning_rate": 0.0006794896159410625, + "loss": 0.0158, + "num_input_tokens_seen": 117802800, + "step": 54535 + }, + { + "epoch": 8.897226753670473, + "grad_norm": 0.005620979238301516, + "learning_rate": 0.0006794231787519274, + "loss": 0.124, + "num_input_tokens_seen": 117812624, + "step": 54540 + }, + { + "epoch": 8.898042414355627, + "grad_norm": 0.004999668337404728, + "learning_rate": 0.000679356737926554, + "loss": 0.0204, + "num_input_tokens_seen": 117823984, + "step": 54545 + }, + { + "epoch": 8.898858075040783, + "grad_norm": 0.044155821204185486, + "learning_rate": 0.0006792902934662885, + "loss": 0.048, + "num_input_tokens_seen": 117834448, + "step": 54550 + }, + { + "epoch": 8.899673735725939, + "grad_norm": 0.031155651435256004, + "learning_rate": 0.000679223845372478, + "loss": 0.0429, + "num_input_tokens_seen": 117845104, + "step": 54555 + }, + { + "epoch": 8.900489396411093, + "grad_norm": 0.1095210388302803, + "learning_rate": 0.0006791573936464689, + "loss": 0.3007, + "num_input_tokens_seen": 117854992, + "step": 54560 + }, + { + "epoch": 8.901305057096248, + "grad_norm": 0.1805056929588318, + "learning_rate": 0.0006790909382896079, + "loss": 0.0367, + "num_input_tokens_seen": 117864592, + "step": 54565 + }, + { + "epoch": 8.902120717781402, + "grad_norm": 0.012045920826494694, + "learning_rate": 0.0006790244793032418, + "loss": 0.0524, + "num_input_tokens_seen": 117875472, + "step": 54570 + }, + { + "epoch": 8.902936378466558, + "grad_norm": 0.3896695375442505, + "learning_rate": 0.0006789580166887176, + "loss": 0.1069, + "num_input_tokens_seen": 117886416, + "step": 54575 + }, + { + "epoch": 8.903752039151712, + "grad_norm": 0.13602808117866516, + "learning_rate": 0.0006788915504473822, + "loss": 0.0464, + "num_input_tokens_seen": 117897872, + "step": 54580 + }, + { + "epoch": 8.904567699836868, + "grad_norm": 0.13074855506420135, + "learning_rate": 0.0006788250805805824, + "loss": 0.0688, + "num_input_tokens_seen": 117908112, + "step": 54585 + }, + { + "epoch": 8.905383360522023, + "grad_norm": 0.02051844261586666, + "learning_rate": 0.0006787586070896657, + "loss": 0.0161, + "num_input_tokens_seen": 117919248, + "step": 54590 + }, + { + "epoch": 8.906199021207177, + "grad_norm": 0.020327605307102203, + "learning_rate": 0.0006786921299759789, + "loss": 0.0123, + "num_input_tokens_seen": 117928624, + "step": 54595 + }, + { + "epoch": 8.907014681892333, + "grad_norm": 0.06539393216371536, + "learning_rate": 0.0006786256492408694, + "loss": 0.0319, + "num_input_tokens_seen": 117940624, + "step": 54600 + }, + { + "epoch": 8.907830342577487, + "grad_norm": 0.0312392208725214, + "learning_rate": 0.0006785591648856846, + "loss": 0.0969, + "num_input_tokens_seen": 117952304, + "step": 54605 + }, + { + "epoch": 8.908646003262643, + "grad_norm": 0.010261873714625835, + "learning_rate": 0.0006784926769117717, + "loss": 0.0455, + "num_input_tokens_seen": 117964048, + "step": 54610 + }, + { + "epoch": 8.909461663947798, + "grad_norm": 0.04237981513142586, + "learning_rate": 0.0006784261853204783, + "loss": 0.0133, + "num_input_tokens_seen": 117975472, + "step": 54615 + }, + { + "epoch": 8.910277324632952, + "grad_norm": 0.43529877066612244, + "learning_rate": 0.0006783596901131521, + "loss": 0.2285, + "num_input_tokens_seen": 117986192, + "step": 54620 + }, + { + "epoch": 8.911092985318108, + "grad_norm": 0.8742546439170837, + "learning_rate": 0.0006782931912911402, + "loss": 0.1212, + "num_input_tokens_seen": 117997200, + "step": 54625 + }, + { + "epoch": 8.911908646003262, + "grad_norm": 0.07040494680404663, + "learning_rate": 0.0006782266888557909, + "loss": 0.0251, + "num_input_tokens_seen": 118007952, + "step": 54630 + }, + { + "epoch": 8.912724306688418, + "grad_norm": 0.02308201789855957, + "learning_rate": 0.0006781601828084513, + "loss": 0.0147, + "num_input_tokens_seen": 118019824, + "step": 54635 + }, + { + "epoch": 8.913539967373573, + "grad_norm": 0.05983942374587059, + "learning_rate": 0.0006780936731504699, + "loss": 0.1136, + "num_input_tokens_seen": 118030160, + "step": 54640 + }, + { + "epoch": 8.914355628058727, + "grad_norm": 0.02567743882536888, + "learning_rate": 0.0006780271598831942, + "loss": 0.0497, + "num_input_tokens_seen": 118040784, + "step": 54645 + }, + { + "epoch": 8.915171288743883, + "grad_norm": 0.09421033412218094, + "learning_rate": 0.0006779606430079723, + "loss": 0.0594, + "num_input_tokens_seen": 118050928, + "step": 54650 + }, + { + "epoch": 8.915986949429037, + "grad_norm": 0.1930844634771347, + "learning_rate": 0.0006778941225261522, + "loss": 0.0538, + "num_input_tokens_seen": 118062096, + "step": 54655 + }, + { + "epoch": 8.916802610114193, + "grad_norm": 0.006250257138162851, + "learning_rate": 0.0006778275984390819, + "loss": 0.0855, + "num_input_tokens_seen": 118073968, + "step": 54660 + }, + { + "epoch": 8.917618270799348, + "grad_norm": 0.032026421278715134, + "learning_rate": 0.0006777610707481099, + "loss": 0.0834, + "num_input_tokens_seen": 118084912, + "step": 54665 + }, + { + "epoch": 8.918433931484502, + "grad_norm": 0.29081985354423523, + "learning_rate": 0.0006776945394545841, + "loss": 0.1938, + "num_input_tokens_seen": 118095056, + "step": 54670 + }, + { + "epoch": 8.919249592169658, + "grad_norm": 0.2211325615644455, + "learning_rate": 0.0006776280045598533, + "loss": 0.0852, + "num_input_tokens_seen": 118106352, + "step": 54675 + }, + { + "epoch": 8.920065252854812, + "grad_norm": 0.21044909954071045, + "learning_rate": 0.0006775614660652655, + "loss": 0.0505, + "num_input_tokens_seen": 118118576, + "step": 54680 + }, + { + "epoch": 8.920880913539968, + "grad_norm": 0.26746198534965515, + "learning_rate": 0.0006774949239721692, + "loss": 0.1183, + "num_input_tokens_seen": 118130320, + "step": 54685 + }, + { + "epoch": 8.921696574225122, + "grad_norm": 0.005159418564289808, + "learning_rate": 0.0006774283782819133, + "loss": 0.0218, + "num_input_tokens_seen": 118141232, + "step": 54690 + }, + { + "epoch": 8.922512234910277, + "grad_norm": 0.00886352825909853, + "learning_rate": 0.0006773618289958462, + "loss": 0.0372, + "num_input_tokens_seen": 118152304, + "step": 54695 + }, + { + "epoch": 8.923327895595433, + "grad_norm": 0.04698998108506203, + "learning_rate": 0.0006772952761153167, + "loss": 0.0688, + "num_input_tokens_seen": 118162960, + "step": 54700 + }, + { + "epoch": 8.924143556280587, + "grad_norm": 0.026449838653206825, + "learning_rate": 0.0006772287196416733, + "loss": 0.1556, + "num_input_tokens_seen": 118173840, + "step": 54705 + }, + { + "epoch": 8.924959216965743, + "grad_norm": 0.09392181783914566, + "learning_rate": 0.0006771621595762652, + "loss": 0.1458, + "num_input_tokens_seen": 118184272, + "step": 54710 + }, + { + "epoch": 8.925774877650896, + "grad_norm": 0.011575781740248203, + "learning_rate": 0.0006770955959204412, + "loss": 0.1433, + "num_input_tokens_seen": 118195408, + "step": 54715 + }, + { + "epoch": 8.926590538336052, + "grad_norm": 0.0434119813144207, + "learning_rate": 0.0006770290286755503, + "loss": 0.0752, + "num_input_tokens_seen": 118204784, + "step": 54720 + }, + { + "epoch": 8.927406199021208, + "grad_norm": 0.1033087745308876, + "learning_rate": 0.0006769624578429414, + "loss": 0.0674, + "num_input_tokens_seen": 118214448, + "step": 54725 + }, + { + "epoch": 8.928221859706362, + "grad_norm": 0.01915070414543152, + "learning_rate": 0.0006768958834239639, + "loss": 0.0969, + "num_input_tokens_seen": 118223952, + "step": 54730 + }, + { + "epoch": 8.929037520391518, + "grad_norm": 0.05163106694817543, + "learning_rate": 0.0006768293054199669, + "loss": 0.0264, + "num_input_tokens_seen": 118235856, + "step": 54735 + }, + { + "epoch": 8.929853181076671, + "grad_norm": 0.0274738110601902, + "learning_rate": 0.0006767627238322998, + "loss": 0.0579, + "num_input_tokens_seen": 118247536, + "step": 54740 + }, + { + "epoch": 8.930668841761827, + "grad_norm": 0.03914555907249451, + "learning_rate": 0.0006766961386623118, + "loss": 0.0684, + "num_input_tokens_seen": 118258512, + "step": 54745 + }, + { + "epoch": 8.931484502446983, + "grad_norm": 0.009585415944457054, + "learning_rate": 0.0006766295499113524, + "loss": 0.0216, + "num_input_tokens_seen": 118268144, + "step": 54750 + }, + { + "epoch": 8.932300163132137, + "grad_norm": 0.05026097968220711, + "learning_rate": 0.000676562957580771, + "loss": 0.0148, + "num_input_tokens_seen": 118279216, + "step": 54755 + }, + { + "epoch": 8.933115823817293, + "grad_norm": 0.08081215620040894, + "learning_rate": 0.0006764963616719174, + "loss": 0.0796, + "num_input_tokens_seen": 118291312, + "step": 54760 + }, + { + "epoch": 8.933931484502446, + "grad_norm": 0.08607519418001175, + "learning_rate": 0.000676429762186141, + "loss": 0.016, + "num_input_tokens_seen": 118302832, + "step": 54765 + }, + { + "epoch": 8.934747145187602, + "grad_norm": 0.5207202434539795, + "learning_rate": 0.0006763631591247917, + "loss": 0.1118, + "num_input_tokens_seen": 118314224, + "step": 54770 + }, + { + "epoch": 8.935562805872756, + "grad_norm": 0.007003180216997862, + "learning_rate": 0.0006762965524892194, + "loss": 0.0346, + "num_input_tokens_seen": 118325552, + "step": 54775 + }, + { + "epoch": 8.936378466557912, + "grad_norm": 0.030011018738150597, + "learning_rate": 0.0006762299422807737, + "loss": 0.0151, + "num_input_tokens_seen": 118335376, + "step": 54780 + }, + { + "epoch": 8.937194127243067, + "grad_norm": 0.006405920721590519, + "learning_rate": 0.0006761633285008046, + "loss": 0.0123, + "num_input_tokens_seen": 118346640, + "step": 54785 + }, + { + "epoch": 8.938009787928221, + "grad_norm": 0.49705770611763, + "learning_rate": 0.0006760967111506623, + "loss": 0.0813, + "num_input_tokens_seen": 118356464, + "step": 54790 + }, + { + "epoch": 8.938825448613377, + "grad_norm": 0.3860141932964325, + "learning_rate": 0.0006760300902316967, + "loss": 0.0731, + "num_input_tokens_seen": 118365744, + "step": 54795 + }, + { + "epoch": 8.939641109298531, + "grad_norm": 0.16810378432273865, + "learning_rate": 0.000675963465745258, + "loss": 0.0829, + "num_input_tokens_seen": 118376240, + "step": 54800 + }, + { + "epoch": 8.940456769983687, + "grad_norm": 0.009920637123286724, + "learning_rate": 0.0006758968376926965, + "loss": 0.0898, + "num_input_tokens_seen": 118387440, + "step": 54805 + }, + { + "epoch": 8.941272430668842, + "grad_norm": 0.024205774068832397, + "learning_rate": 0.0006758302060753624, + "loss": 0.0147, + "num_input_tokens_seen": 118398576, + "step": 54810 + }, + { + "epoch": 8.942088091353996, + "grad_norm": 0.006671852432191372, + "learning_rate": 0.000675763570894606, + "loss": 0.0175, + "num_input_tokens_seen": 118410384, + "step": 54815 + }, + { + "epoch": 8.942903752039152, + "grad_norm": 0.015929104760289192, + "learning_rate": 0.0006756969321517781, + "loss": 0.0418, + "num_input_tokens_seen": 118420720, + "step": 54820 + }, + { + "epoch": 8.943719412724306, + "grad_norm": 0.023308448493480682, + "learning_rate": 0.0006756302898482288, + "loss": 0.014, + "num_input_tokens_seen": 118432688, + "step": 54825 + }, + { + "epoch": 8.944535073409462, + "grad_norm": 0.016953030601143837, + "learning_rate": 0.0006755636439853089, + "loss": 0.0235, + "num_input_tokens_seen": 118444144, + "step": 54830 + }, + { + "epoch": 8.945350734094617, + "grad_norm": 0.1932719647884369, + "learning_rate": 0.0006754969945643689, + "loss": 0.1507, + "num_input_tokens_seen": 118455184, + "step": 54835 + }, + { + "epoch": 8.946166394779771, + "grad_norm": 0.028529340401291847, + "learning_rate": 0.0006754303415867599, + "loss": 0.0774, + "num_input_tokens_seen": 118465936, + "step": 54840 + }, + { + "epoch": 8.946982055464927, + "grad_norm": 0.041322022676467896, + "learning_rate": 0.0006753636850538325, + "loss": 0.0844, + "num_input_tokens_seen": 118477392, + "step": 54845 + }, + { + "epoch": 8.947797716150081, + "grad_norm": 0.04866543412208557, + "learning_rate": 0.0006752970249669374, + "loss": 0.0183, + "num_input_tokens_seen": 118488816, + "step": 54850 + }, + { + "epoch": 8.948613376835237, + "grad_norm": 0.028561508283019066, + "learning_rate": 0.0006752303613274257, + "loss": 0.2118, + "num_input_tokens_seen": 118500848, + "step": 54855 + }, + { + "epoch": 8.949429037520392, + "grad_norm": 0.5350882411003113, + "learning_rate": 0.0006751636941366486, + "loss": 0.0707, + "num_input_tokens_seen": 118511376, + "step": 54860 + }, + { + "epoch": 8.950244698205546, + "grad_norm": 0.08346492052078247, + "learning_rate": 0.000675097023395957, + "loss": 0.0234, + "num_input_tokens_seen": 118521776, + "step": 54865 + }, + { + "epoch": 8.951060358890702, + "grad_norm": 0.10249694436788559, + "learning_rate": 0.0006750303491067021, + "loss": 0.0456, + "num_input_tokens_seen": 118532400, + "step": 54870 + }, + { + "epoch": 8.951876019575856, + "grad_norm": 0.07672520726919174, + "learning_rate": 0.0006749636712702349, + "loss": 0.1437, + "num_input_tokens_seen": 118541424, + "step": 54875 + }, + { + "epoch": 8.952691680261012, + "grad_norm": 0.3835560381412506, + "learning_rate": 0.0006748969898879071, + "loss": 0.0844, + "num_input_tokens_seen": 118551856, + "step": 54880 + }, + { + "epoch": 8.953507340946166, + "grad_norm": 0.003387624863535166, + "learning_rate": 0.00067483030496107, + "loss": 0.0761, + "num_input_tokens_seen": 118562192, + "step": 54885 + }, + { + "epoch": 8.954323001631321, + "grad_norm": 0.007898389361798763, + "learning_rate": 0.000674763616491075, + "loss": 0.1076, + "num_input_tokens_seen": 118572624, + "step": 54890 + }, + { + "epoch": 8.955138662316477, + "grad_norm": 0.019596073776483536, + "learning_rate": 0.0006746969244792734, + "loss": 0.0177, + "num_input_tokens_seen": 118584048, + "step": 54895 + }, + { + "epoch": 8.955954323001631, + "grad_norm": 0.005080987699329853, + "learning_rate": 0.0006746302289270172, + "loss": 0.0103, + "num_input_tokens_seen": 118595696, + "step": 54900 + }, + { + "epoch": 8.956769983686787, + "grad_norm": 0.012511802837252617, + "learning_rate": 0.0006745635298356579, + "loss": 0.079, + "num_input_tokens_seen": 118605392, + "step": 54905 + }, + { + "epoch": 8.95758564437194, + "grad_norm": 0.11990555375814438, + "learning_rate": 0.0006744968272065469, + "loss": 0.0612, + "num_input_tokens_seen": 118615024, + "step": 54910 + }, + { + "epoch": 8.958401305057096, + "grad_norm": 0.019289560616016388, + "learning_rate": 0.0006744301210410366, + "loss": 0.0455, + "num_input_tokens_seen": 118625968, + "step": 54915 + }, + { + "epoch": 8.959216965742252, + "grad_norm": 0.0528937429189682, + "learning_rate": 0.0006743634113404786, + "loss": 0.0236, + "num_input_tokens_seen": 118635824, + "step": 54920 + }, + { + "epoch": 8.960032626427406, + "grad_norm": 0.04663698002696037, + "learning_rate": 0.0006742966981062249, + "loss": 0.0265, + "num_input_tokens_seen": 118644624, + "step": 54925 + }, + { + "epoch": 8.960848287112562, + "grad_norm": 0.0013387626968324184, + "learning_rate": 0.0006742299813396274, + "loss": 0.0941, + "num_input_tokens_seen": 118653168, + "step": 54930 + }, + { + "epoch": 8.961663947797716, + "grad_norm": 0.02348313294351101, + "learning_rate": 0.0006741632610420384, + "loss": 0.0951, + "num_input_tokens_seen": 118663856, + "step": 54935 + }, + { + "epoch": 8.962479608482871, + "grad_norm": 0.19825030863285065, + "learning_rate": 0.0006740965372148098, + "loss": 0.0429, + "num_input_tokens_seen": 118674480, + "step": 54940 + }, + { + "epoch": 8.963295269168025, + "grad_norm": 0.36576709151268005, + "learning_rate": 0.0006740298098592941, + "loss": 0.0643, + "num_input_tokens_seen": 118684848, + "step": 54945 + }, + { + "epoch": 8.964110929853181, + "grad_norm": 0.25725460052490234, + "learning_rate": 0.0006739630789768436, + "loss": 0.081, + "num_input_tokens_seen": 118696400, + "step": 54950 + }, + { + "epoch": 8.964926590538337, + "grad_norm": 0.06459327787160873, + "learning_rate": 0.0006738963445688107, + "loss": 0.0887, + "num_input_tokens_seen": 118707120, + "step": 54955 + }, + { + "epoch": 8.96574225122349, + "grad_norm": 0.043718330562114716, + "learning_rate": 0.0006738296066365476, + "loss": 0.0155, + "num_input_tokens_seen": 118717200, + "step": 54960 + }, + { + "epoch": 8.966557911908646, + "grad_norm": 0.09502162039279938, + "learning_rate": 0.000673762865181407, + "loss": 0.2014, + "num_input_tokens_seen": 118728336, + "step": 54965 + }, + { + "epoch": 8.9673735725938, + "grad_norm": 0.094657763838768, + "learning_rate": 0.0006736961202047417, + "loss": 0.0306, + "num_input_tokens_seen": 118738928, + "step": 54970 + }, + { + "epoch": 8.968189233278956, + "grad_norm": 0.017213786020874977, + "learning_rate": 0.0006736293717079041, + "loss": 0.0587, + "num_input_tokens_seen": 118750096, + "step": 54975 + }, + { + "epoch": 8.969004893964112, + "grad_norm": 0.019299479201436043, + "learning_rate": 0.0006735626196922469, + "loss": 0.0334, + "num_input_tokens_seen": 118760176, + "step": 54980 + }, + { + "epoch": 8.969820554649266, + "grad_norm": 0.21472111344337463, + "learning_rate": 0.0006734958641591231, + "loss": 0.1958, + "num_input_tokens_seen": 118770928, + "step": 54985 + }, + { + "epoch": 8.970636215334421, + "grad_norm": 0.022455843165516853, + "learning_rate": 0.0006734291051098856, + "loss": 0.0815, + "num_input_tokens_seen": 118782576, + "step": 54990 + }, + { + "epoch": 8.971451876019575, + "grad_norm": 0.003710412187501788, + "learning_rate": 0.0006733623425458871, + "loss": 0.0148, + "num_input_tokens_seen": 118794128, + "step": 54995 + }, + { + "epoch": 8.97226753670473, + "grad_norm": 0.005927415564656258, + "learning_rate": 0.000673295576468481, + "loss": 0.0632, + "num_input_tokens_seen": 118804048, + "step": 55000 + }, + { + "epoch": 8.973083197389887, + "grad_norm": 0.19900697469711304, + "learning_rate": 0.00067322880687902, + "loss": 0.0425, + "num_input_tokens_seen": 118814800, + "step": 55005 + }, + { + "epoch": 8.97389885807504, + "grad_norm": 0.2374364286661148, + "learning_rate": 0.0006731620337788576, + "loss": 0.0805, + "num_input_tokens_seen": 118825616, + "step": 55010 + }, + { + "epoch": 8.974714518760196, + "grad_norm": 0.05746473744511604, + "learning_rate": 0.0006730952571693469, + "loss": 0.0391, + "num_input_tokens_seen": 118836208, + "step": 55015 + }, + { + "epoch": 8.97553017944535, + "grad_norm": 0.020573321729898453, + "learning_rate": 0.0006730284770518412, + "loss": 0.0822, + "num_input_tokens_seen": 118847344, + "step": 55020 + }, + { + "epoch": 8.976345840130506, + "grad_norm": 0.009821916930377483, + "learning_rate": 0.0006729616934276939, + "loss": 0.0074, + "num_input_tokens_seen": 118857840, + "step": 55025 + }, + { + "epoch": 8.977161500815662, + "grad_norm": 0.005163253750652075, + "learning_rate": 0.0006728949062982585, + "loss": 0.1417, + "num_input_tokens_seen": 118867984, + "step": 55030 + }, + { + "epoch": 8.977977161500815, + "grad_norm": 0.0437258705496788, + "learning_rate": 0.0006728281156648885, + "loss": 0.0142, + "num_input_tokens_seen": 118879696, + "step": 55035 + }, + { + "epoch": 8.978792822185971, + "grad_norm": 0.02063642628490925, + "learning_rate": 0.0006727613215289374, + "loss": 0.0074, + "num_input_tokens_seen": 118891088, + "step": 55040 + }, + { + "epoch": 8.979608482871125, + "grad_norm": 0.00389106129296124, + "learning_rate": 0.0006726945238917589, + "loss": 0.0202, + "num_input_tokens_seen": 118902672, + "step": 55045 + }, + { + "epoch": 8.98042414355628, + "grad_norm": 0.3619442880153656, + "learning_rate": 0.000672627722754707, + "loss": 0.2283, + "num_input_tokens_seen": 118913200, + "step": 55050 + }, + { + "epoch": 8.981239804241435, + "grad_norm": 0.014173166826367378, + "learning_rate": 0.0006725609181191352, + "loss": 0.0756, + "num_input_tokens_seen": 118924816, + "step": 55055 + }, + { + "epoch": 8.98205546492659, + "grad_norm": 0.017785949632525444, + "learning_rate": 0.0006724941099863975, + "loss": 0.0982, + "num_input_tokens_seen": 118934896, + "step": 55060 + }, + { + "epoch": 8.982871125611746, + "grad_norm": 0.01947389915585518, + "learning_rate": 0.0006724272983578478, + "loss": 0.0191, + "num_input_tokens_seen": 118943728, + "step": 55065 + }, + { + "epoch": 8.9836867862969, + "grad_norm": 0.08733158558607101, + "learning_rate": 0.0006723604832348403, + "loss": 0.1048, + "num_input_tokens_seen": 118955024, + "step": 55070 + }, + { + "epoch": 8.984502446982056, + "grad_norm": 0.038850247859954834, + "learning_rate": 0.0006722936646187288, + "loss": 0.0178, + "num_input_tokens_seen": 118966320, + "step": 55075 + }, + { + "epoch": 8.98531810766721, + "grad_norm": 0.018704602494835854, + "learning_rate": 0.0006722268425108675, + "loss": 0.1408, + "num_input_tokens_seen": 118976656, + "step": 55080 + }, + { + "epoch": 8.986133768352365, + "grad_norm": 0.15029045939445496, + "learning_rate": 0.000672160016912611, + "loss": 0.0266, + "num_input_tokens_seen": 118987472, + "step": 55085 + }, + { + "epoch": 8.986949429037521, + "grad_norm": 0.4871756434440613, + "learning_rate": 0.0006720931878253133, + "loss": 0.0172, + "num_input_tokens_seen": 118997904, + "step": 55090 + }, + { + "epoch": 8.987765089722675, + "grad_norm": 0.00840507447719574, + "learning_rate": 0.0006720263552503288, + "loss": 0.0407, + "num_input_tokens_seen": 119008080, + "step": 55095 + }, + { + "epoch": 8.98858075040783, + "grad_norm": 0.2138519436120987, + "learning_rate": 0.000671959519189012, + "loss": 0.0609, + "num_input_tokens_seen": 119018064, + "step": 55100 + }, + { + "epoch": 8.989396411092985, + "grad_norm": 0.04366368055343628, + "learning_rate": 0.0006718926796427174, + "loss": 0.1852, + "num_input_tokens_seen": 119027472, + "step": 55105 + }, + { + "epoch": 8.99021207177814, + "grad_norm": 0.26440298557281494, + "learning_rate": 0.0006718258366127995, + "loss": 0.1308, + "num_input_tokens_seen": 119038160, + "step": 55110 + }, + { + "epoch": 8.991027732463294, + "grad_norm": 0.026875967159867287, + "learning_rate": 0.0006717589901006131, + "loss": 0.0477, + "num_input_tokens_seen": 119048464, + "step": 55115 + }, + { + "epoch": 8.99184339314845, + "grad_norm": 0.2834641933441162, + "learning_rate": 0.0006716921401075129, + "loss": 0.2825, + "num_input_tokens_seen": 119059056, + "step": 55120 + }, + { + "epoch": 8.992659053833606, + "grad_norm": 0.2926873564720154, + "learning_rate": 0.0006716252866348537, + "loss": 0.0781, + "num_input_tokens_seen": 119069648, + "step": 55125 + }, + { + "epoch": 8.99347471451876, + "grad_norm": 0.2594529986381531, + "learning_rate": 0.0006715584296839903, + "loss": 0.1189, + "num_input_tokens_seen": 119079920, + "step": 55130 + }, + { + "epoch": 8.994290375203915, + "grad_norm": 0.018395302817225456, + "learning_rate": 0.0006714915692562777, + "loss": 0.0279, + "num_input_tokens_seen": 119090896, + "step": 55135 + }, + { + "epoch": 8.99510603588907, + "grad_norm": 0.043946582823991776, + "learning_rate": 0.0006714247053530709, + "loss": 0.0472, + "num_input_tokens_seen": 119101360, + "step": 55140 + }, + { + "epoch": 8.995921696574225, + "grad_norm": 0.04027822986245155, + "learning_rate": 0.0006713578379757251, + "loss": 0.068, + "num_input_tokens_seen": 119112336, + "step": 55145 + }, + { + "epoch": 8.99673735725938, + "grad_norm": 0.14224760234355927, + "learning_rate": 0.0006712909671255952, + "loss": 0.0758, + "num_input_tokens_seen": 119121712, + "step": 55150 + }, + { + "epoch": 8.997553017944535, + "grad_norm": 0.085087850689888, + "learning_rate": 0.0006712240928040363, + "loss": 0.0845, + "num_input_tokens_seen": 119133424, + "step": 55155 + }, + { + "epoch": 8.99836867862969, + "grad_norm": 0.0085781030356884, + "learning_rate": 0.0006711572150124043, + "loss": 0.0217, + "num_input_tokens_seen": 119145360, + "step": 55160 + }, + { + "epoch": 8.999184339314844, + "grad_norm": 0.0073289647698402405, + "learning_rate": 0.0006710903337520539, + "loss": 0.0478, + "num_input_tokens_seen": 119155568, + "step": 55165 + }, + { + "epoch": 9.0, + "grad_norm": 0.013840760104358196, + "learning_rate": 0.0006710234490243412, + "loss": 0.2284, + "num_input_tokens_seen": 119164864, + "step": 55170 + }, + { + "epoch": 9.0, + "eval_loss": 0.1312951296567917, + "eval_runtime": 103.641, + "eval_samples_per_second": 26.293, + "eval_steps_per_second": 6.58, + "num_input_tokens_seen": 119164864, + "step": 55170 + }, + { + "epoch": 9.000815660685156, + "grad_norm": 0.012224181555211544, + "learning_rate": 0.0006709565608306212, + "loss": 0.0546, + "num_input_tokens_seen": 119176256, + "step": 55175 + }, + { + "epoch": 9.00163132137031, + "grad_norm": 0.18823717534542084, + "learning_rate": 0.0006708896691722495, + "loss": 0.0402, + "num_input_tokens_seen": 119187200, + "step": 55180 + }, + { + "epoch": 9.002446982055465, + "grad_norm": 0.006239529699087143, + "learning_rate": 0.0006708227740505822, + "loss": 0.0164, + "num_input_tokens_seen": 119199072, + "step": 55185 + }, + { + "epoch": 9.00326264274062, + "grad_norm": 0.0993809700012207, + "learning_rate": 0.0006707558754669744, + "loss": 0.0127, + "num_input_tokens_seen": 119210016, + "step": 55190 + }, + { + "epoch": 9.004078303425775, + "grad_norm": 0.16244710981845856, + "learning_rate": 0.0006706889734227823, + "loss": 0.0424, + "num_input_tokens_seen": 119221376, + "step": 55195 + }, + { + "epoch": 9.00489396411093, + "grad_norm": 0.14778240025043488, + "learning_rate": 0.0006706220679193614, + "loss": 0.0585, + "num_input_tokens_seen": 119232672, + "step": 55200 + }, + { + "epoch": 9.005709624796085, + "grad_norm": 0.04779522493481636, + "learning_rate": 0.000670555158958068, + "loss": 0.2511, + "num_input_tokens_seen": 119243072, + "step": 55205 + }, + { + "epoch": 9.00652528548124, + "grad_norm": 0.05503462255001068, + "learning_rate": 0.0006704882465402579, + "loss": 0.0213, + "num_input_tokens_seen": 119254016, + "step": 55210 + }, + { + "epoch": 9.007340946166394, + "grad_norm": 0.02401323802769184, + "learning_rate": 0.0006704213306672873, + "loss": 0.0482, + "num_input_tokens_seen": 119264448, + "step": 55215 + }, + { + "epoch": 9.00815660685155, + "grad_norm": 0.11283843219280243, + "learning_rate": 0.0006703544113405122, + "loss": 0.0795, + "num_input_tokens_seen": 119275424, + "step": 55220 + }, + { + "epoch": 9.008972267536704, + "grad_norm": 0.09734462946653366, + "learning_rate": 0.0006702874885612887, + "loss": 0.071, + "num_input_tokens_seen": 119287392, + "step": 55225 + }, + { + "epoch": 9.00978792822186, + "grad_norm": 0.24100083112716675, + "learning_rate": 0.0006702205623309734, + "loss": 0.102, + "num_input_tokens_seen": 119297472, + "step": 55230 + }, + { + "epoch": 9.010603588907015, + "grad_norm": 0.011355135589838028, + "learning_rate": 0.0006701536326509224, + "loss": 0.0167, + "num_input_tokens_seen": 119308416, + "step": 55235 + }, + { + "epoch": 9.01141924959217, + "grad_norm": 0.004279524087905884, + "learning_rate": 0.0006700866995224921, + "loss": 0.1309, + "num_input_tokens_seen": 119318592, + "step": 55240 + }, + { + "epoch": 9.012234910277325, + "grad_norm": 0.1352081000804901, + "learning_rate": 0.0006700197629470393, + "loss": 0.0251, + "num_input_tokens_seen": 119329696, + "step": 55245 + }, + { + "epoch": 9.013050570962479, + "grad_norm": 0.013182039372622967, + "learning_rate": 0.00066995282292592, + "loss": 0.0778, + "num_input_tokens_seen": 119339936, + "step": 55250 + }, + { + "epoch": 9.013866231647635, + "grad_norm": 0.019128017127513885, + "learning_rate": 0.0006698858794604914, + "loss": 0.0305, + "num_input_tokens_seen": 119350752, + "step": 55255 + }, + { + "epoch": 9.01468189233279, + "grad_norm": 0.10801997780799866, + "learning_rate": 0.0006698189325521097, + "loss": 0.1493, + "num_input_tokens_seen": 119362048, + "step": 55260 + }, + { + "epoch": 9.015497553017944, + "grad_norm": 0.020492171868681908, + "learning_rate": 0.000669751982202132, + "loss": 0.016, + "num_input_tokens_seen": 119371488, + "step": 55265 + }, + { + "epoch": 9.0163132137031, + "grad_norm": 0.00517929857596755, + "learning_rate": 0.0006696850284119151, + "loss": 0.0194, + "num_input_tokens_seen": 119382336, + "step": 55270 + }, + { + "epoch": 9.017128874388254, + "grad_norm": 0.10492771118879318, + "learning_rate": 0.0006696180711828159, + "loss": 0.1248, + "num_input_tokens_seen": 119392384, + "step": 55275 + }, + { + "epoch": 9.01794453507341, + "grad_norm": 0.01910402998328209, + "learning_rate": 0.0006695511105161913, + "loss": 0.0182, + "num_input_tokens_seen": 119404640, + "step": 55280 + }, + { + "epoch": 9.018760195758565, + "grad_norm": 0.01113252155482769, + "learning_rate": 0.0006694841464133981, + "loss": 0.0633, + "num_input_tokens_seen": 119414816, + "step": 55285 + }, + { + "epoch": 9.01957585644372, + "grad_norm": 0.0077075595036149025, + "learning_rate": 0.0006694171788757939, + "loss": 0.0078, + "num_input_tokens_seen": 119425824, + "step": 55290 + }, + { + "epoch": 9.020391517128875, + "grad_norm": 0.29589682817459106, + "learning_rate": 0.0006693502079047356, + "loss": 0.0388, + "num_input_tokens_seen": 119437344, + "step": 55295 + }, + { + "epoch": 9.021207177814029, + "grad_norm": 0.08724862337112427, + "learning_rate": 0.0006692832335015806, + "loss": 0.2134, + "num_input_tokens_seen": 119448896, + "step": 55300 + }, + { + "epoch": 9.022022838499185, + "grad_norm": 0.045984670519828796, + "learning_rate": 0.000669216255667686, + "loss": 0.0944, + "num_input_tokens_seen": 119459904, + "step": 55305 + }, + { + "epoch": 9.022838499184338, + "grad_norm": 0.0032309559173882008, + "learning_rate": 0.0006691492744044093, + "loss": 0.0247, + "num_input_tokens_seen": 119470624, + "step": 55310 + }, + { + "epoch": 9.023654159869494, + "grad_norm": 0.6814609169960022, + "learning_rate": 0.000669082289713108, + "loss": 0.0539, + "num_input_tokens_seen": 119481408, + "step": 55315 + }, + { + "epoch": 9.02446982055465, + "grad_norm": 0.009352785535156727, + "learning_rate": 0.0006690153015951397, + "loss": 0.03, + "num_input_tokens_seen": 119493376, + "step": 55320 + }, + { + "epoch": 9.025285481239804, + "grad_norm": 0.06866900622844696, + "learning_rate": 0.0006689483100518617, + "loss": 0.0134, + "num_input_tokens_seen": 119505056, + "step": 55325 + }, + { + "epoch": 9.02610114192496, + "grad_norm": 0.06737705320119858, + "learning_rate": 0.000668881315084632, + "loss": 0.0599, + "num_input_tokens_seen": 119516096, + "step": 55330 + }, + { + "epoch": 9.026916802610113, + "grad_norm": 0.14733898639678955, + "learning_rate": 0.0006688143166948082, + "loss": 0.0697, + "num_input_tokens_seen": 119526944, + "step": 55335 + }, + { + "epoch": 9.02773246329527, + "grad_norm": 0.031115690246224403, + "learning_rate": 0.0006687473148837482, + "loss": 0.0326, + "num_input_tokens_seen": 119537280, + "step": 55340 + }, + { + "epoch": 9.028548123980425, + "grad_norm": 0.0266889575868845, + "learning_rate": 0.0006686803096528096, + "loss": 0.0681, + "num_input_tokens_seen": 119547744, + "step": 55345 + }, + { + "epoch": 9.029363784665579, + "grad_norm": 0.0063073355704545975, + "learning_rate": 0.0006686133010033507, + "loss": 0.1827, + "num_input_tokens_seen": 119558784, + "step": 55350 + }, + { + "epoch": 9.030179445350734, + "grad_norm": 0.15066297352313995, + "learning_rate": 0.0006685462889367293, + "loss": 0.0953, + "num_input_tokens_seen": 119569056, + "step": 55355 + }, + { + "epoch": 9.030995106035888, + "grad_norm": 0.6305835843086243, + "learning_rate": 0.0006684792734543036, + "loss": 0.0557, + "num_input_tokens_seen": 119579808, + "step": 55360 + }, + { + "epoch": 9.031810766721044, + "grad_norm": 0.05144287645816803, + "learning_rate": 0.0006684122545574315, + "loss": 0.0321, + "num_input_tokens_seen": 119590816, + "step": 55365 + }, + { + "epoch": 9.0326264274062, + "grad_norm": 0.07655679434537888, + "learning_rate": 0.0006683452322474715, + "loss": 0.1697, + "num_input_tokens_seen": 119600640, + "step": 55370 + }, + { + "epoch": 9.033442088091354, + "grad_norm": 0.08831831812858582, + "learning_rate": 0.0006682782065257818, + "loss": 0.007, + "num_input_tokens_seen": 119611328, + "step": 55375 + }, + { + "epoch": 9.03425774877651, + "grad_norm": 0.04714163392782211, + "learning_rate": 0.000668211177393721, + "loss": 0.0365, + "num_input_tokens_seen": 119622368, + "step": 55380 + }, + { + "epoch": 9.035073409461663, + "grad_norm": 0.17070084810256958, + "learning_rate": 0.0006681441448526471, + "loss": 0.0405, + "num_input_tokens_seen": 119632960, + "step": 55385 + }, + { + "epoch": 9.035889070146819, + "grad_norm": 0.1857834756374359, + "learning_rate": 0.0006680771089039188, + "loss": 0.0273, + "num_input_tokens_seen": 119644096, + "step": 55390 + }, + { + "epoch": 9.036704730831975, + "grad_norm": 0.14468571543693542, + "learning_rate": 0.0006680100695488946, + "loss": 0.088, + "num_input_tokens_seen": 119655456, + "step": 55395 + }, + { + "epoch": 9.037520391517129, + "grad_norm": 0.0060724965296685696, + "learning_rate": 0.0006679430267889332, + "loss": 0.0111, + "num_input_tokens_seen": 119666240, + "step": 55400 + }, + { + "epoch": 9.038336052202284, + "grad_norm": 0.08735064417123795, + "learning_rate": 0.0006678759806253933, + "loss": 0.1389, + "num_input_tokens_seen": 119678272, + "step": 55405 + }, + { + "epoch": 9.039151712887438, + "grad_norm": 0.08246026933193207, + "learning_rate": 0.0006678089310596339, + "loss": 0.0397, + "num_input_tokens_seen": 119688224, + "step": 55410 + }, + { + "epoch": 9.039967373572594, + "grad_norm": 0.011172840371727943, + "learning_rate": 0.0006677418780930136, + "loss": 0.0094, + "num_input_tokens_seen": 119698752, + "step": 55415 + }, + { + "epoch": 9.040783034257748, + "grad_norm": 0.007427356671541929, + "learning_rate": 0.0006676748217268912, + "loss": 0.0619, + "num_input_tokens_seen": 119709600, + "step": 55420 + }, + { + "epoch": 9.041598694942904, + "grad_norm": 0.22098654508590698, + "learning_rate": 0.0006676077619626259, + "loss": 0.0482, + "num_input_tokens_seen": 119719552, + "step": 55425 + }, + { + "epoch": 9.04241435562806, + "grad_norm": 0.717595636844635, + "learning_rate": 0.0006675406988015766, + "loss": 0.0553, + "num_input_tokens_seen": 119730112, + "step": 55430 + }, + { + "epoch": 9.043230016313213, + "grad_norm": 0.06307411193847656, + "learning_rate": 0.0006674736322451027, + "loss": 0.0242, + "num_input_tokens_seen": 119741120, + "step": 55435 + }, + { + "epoch": 9.044045676998369, + "grad_norm": 0.021658165380358696, + "learning_rate": 0.000667406562294563, + "loss": 0.0132, + "num_input_tokens_seen": 119752480, + "step": 55440 + }, + { + "epoch": 9.044861337683523, + "grad_norm": 0.03899122402071953, + "learning_rate": 0.0006673394889513169, + "loss": 0.0207, + "num_input_tokens_seen": 119763552, + "step": 55445 + }, + { + "epoch": 9.045676998368679, + "grad_norm": 0.1413194090127945, + "learning_rate": 0.000667272412216724, + "loss": 0.0306, + "num_input_tokens_seen": 119774944, + "step": 55450 + }, + { + "epoch": 9.046492659053834, + "grad_norm": 0.017150040715932846, + "learning_rate": 0.0006672053320921433, + "loss": 0.1126, + "num_input_tokens_seen": 119786112, + "step": 55455 + }, + { + "epoch": 9.047308319738988, + "grad_norm": 0.016458654776215553, + "learning_rate": 0.0006671382485789344, + "loss": 0.0098, + "num_input_tokens_seen": 119797920, + "step": 55460 + }, + { + "epoch": 9.048123980424144, + "grad_norm": 0.026537369936704636, + "learning_rate": 0.0006670711616784571, + "loss": 0.0144, + "num_input_tokens_seen": 119808768, + "step": 55465 + }, + { + "epoch": 9.048939641109298, + "grad_norm": 0.020560389384627342, + "learning_rate": 0.0006670040713920704, + "loss": 0.0329, + "num_input_tokens_seen": 119818720, + "step": 55470 + }, + { + "epoch": 9.049755301794454, + "grad_norm": 0.04738950356841087, + "learning_rate": 0.0006669369777211344, + "loss": 0.1724, + "num_input_tokens_seen": 119830336, + "step": 55475 + }, + { + "epoch": 9.05057096247961, + "grad_norm": 0.003236339660361409, + "learning_rate": 0.000666869880667009, + "loss": 0.0072, + "num_input_tokens_seen": 119840416, + "step": 55480 + }, + { + "epoch": 9.051386623164763, + "grad_norm": 0.005051568150520325, + "learning_rate": 0.0006668027802310537, + "loss": 0.0348, + "num_input_tokens_seen": 119851104, + "step": 55485 + }, + { + "epoch": 9.052202283849919, + "grad_norm": 0.0035680499859154224, + "learning_rate": 0.0006667356764146284, + "loss": 0.0067, + "num_input_tokens_seen": 119862144, + "step": 55490 + }, + { + "epoch": 9.053017944535073, + "grad_norm": 0.05445701628923416, + "learning_rate": 0.0006666685692190931, + "loss": 0.0102, + "num_input_tokens_seen": 119873952, + "step": 55495 + }, + { + "epoch": 9.053833605220229, + "grad_norm": 0.44816941022872925, + "learning_rate": 0.0006666014586458079, + "loss": 0.1066, + "num_input_tokens_seen": 119884288, + "step": 55500 + }, + { + "epoch": 9.054649265905383, + "grad_norm": 0.08904964476823807, + "learning_rate": 0.0006665343446961327, + "loss": 0.011, + "num_input_tokens_seen": 119893696, + "step": 55505 + }, + { + "epoch": 9.055464926590538, + "grad_norm": 0.0033945401664823294, + "learning_rate": 0.0006664672273714278, + "loss": 0.0059, + "num_input_tokens_seen": 119905760, + "step": 55510 + }, + { + "epoch": 9.056280587275694, + "grad_norm": 0.00631315354257822, + "learning_rate": 0.0006664001066730532, + "loss": 0.004, + "num_input_tokens_seen": 119916992, + "step": 55515 + }, + { + "epoch": 9.057096247960848, + "grad_norm": 0.01878887228667736, + "learning_rate": 0.0006663329826023696, + "loss": 0.0412, + "num_input_tokens_seen": 119928192, + "step": 55520 + }, + { + "epoch": 9.057911908646004, + "grad_norm": 0.010319680906832218, + "learning_rate": 0.000666265855160737, + "loss": 0.088, + "num_input_tokens_seen": 119939360, + "step": 55525 + }, + { + "epoch": 9.058727569331158, + "grad_norm": 0.12095552682876587, + "learning_rate": 0.0006661987243495159, + "loss": 0.0356, + "num_input_tokens_seen": 119950432, + "step": 55530 + }, + { + "epoch": 9.059543230016313, + "grad_norm": 0.029308486729860306, + "learning_rate": 0.0006661315901700668, + "loss": 0.0118, + "num_input_tokens_seen": 119960768, + "step": 55535 + }, + { + "epoch": 9.060358890701469, + "grad_norm": 0.03496725484728813, + "learning_rate": 0.0006660644526237502, + "loss": 0.0211, + "num_input_tokens_seen": 119972032, + "step": 55540 + }, + { + "epoch": 9.061174551386623, + "grad_norm": 0.020432081073522568, + "learning_rate": 0.0006659973117119269, + "loss": 0.0563, + "num_input_tokens_seen": 119983072, + "step": 55545 + }, + { + "epoch": 9.061990212071779, + "grad_norm": 0.005481477826833725, + "learning_rate": 0.0006659301674359575, + "loss": 0.0254, + "num_input_tokens_seen": 119993472, + "step": 55550 + }, + { + "epoch": 9.062805872756933, + "grad_norm": 0.04623018950223923, + "learning_rate": 0.0006658630197972027, + "loss": 0.0283, + "num_input_tokens_seen": 120002656, + "step": 55555 + }, + { + "epoch": 9.063621533442088, + "grad_norm": 0.06435194611549377, + "learning_rate": 0.0006657958687970233, + "loss": 0.1708, + "num_input_tokens_seen": 120012224, + "step": 55560 + }, + { + "epoch": 9.064437194127244, + "grad_norm": 0.021413441747426987, + "learning_rate": 0.0006657287144367805, + "loss": 0.0544, + "num_input_tokens_seen": 120024128, + "step": 55565 + }, + { + "epoch": 9.065252854812398, + "grad_norm": 0.020738402381539345, + "learning_rate": 0.000665661556717835, + "loss": 0.007, + "num_input_tokens_seen": 120034112, + "step": 55570 + }, + { + "epoch": 9.066068515497554, + "grad_norm": 0.0270835030823946, + "learning_rate": 0.0006655943956415479, + "loss": 0.011, + "num_input_tokens_seen": 120045216, + "step": 55575 + }, + { + "epoch": 9.066884176182707, + "grad_norm": 0.013338779099285603, + "learning_rate": 0.0006655272312092802, + "loss": 0.0455, + "num_input_tokens_seen": 120056256, + "step": 55580 + }, + { + "epoch": 9.067699836867863, + "grad_norm": 0.23012396693229675, + "learning_rate": 0.0006654600634223933, + "loss": 0.1355, + "num_input_tokens_seen": 120067008, + "step": 55585 + }, + { + "epoch": 9.068515497553017, + "grad_norm": 0.1102285236120224, + "learning_rate": 0.0006653928922822482, + "loss": 0.0208, + "num_input_tokens_seen": 120078752, + "step": 55590 + }, + { + "epoch": 9.069331158238173, + "grad_norm": 0.09096402674913406, + "learning_rate": 0.0006653257177902063, + "loss": 0.0282, + "num_input_tokens_seen": 120090528, + "step": 55595 + }, + { + "epoch": 9.070146818923329, + "grad_norm": 0.002108287997543812, + "learning_rate": 0.0006652585399476292, + "loss": 0.0119, + "num_input_tokens_seen": 120101376, + "step": 55600 + }, + { + "epoch": 9.070962479608482, + "grad_norm": 0.03092172183096409, + "learning_rate": 0.000665191358755878, + "loss": 0.0087, + "num_input_tokens_seen": 120111648, + "step": 55605 + }, + { + "epoch": 9.071778140293638, + "grad_norm": 0.29341599345207214, + "learning_rate": 0.0006651241742163143, + "loss": 0.05, + "num_input_tokens_seen": 120121760, + "step": 55610 + }, + { + "epoch": 9.072593800978792, + "grad_norm": 0.003775701392441988, + "learning_rate": 0.0006650569863302999, + "loss": 0.0041, + "num_input_tokens_seen": 120133184, + "step": 55615 + }, + { + "epoch": 9.073409461663948, + "grad_norm": 0.27860262989997864, + "learning_rate": 0.0006649897950991962, + "loss": 0.0834, + "num_input_tokens_seen": 120143360, + "step": 55620 + }, + { + "epoch": 9.074225122349104, + "grad_norm": 0.0017340558115392923, + "learning_rate": 0.000664922600524365, + "loss": 0.1483, + "num_input_tokens_seen": 120153728, + "step": 55625 + }, + { + "epoch": 9.075040783034257, + "grad_norm": 0.00711076008155942, + "learning_rate": 0.000664855402607168, + "loss": 0.0305, + "num_input_tokens_seen": 120164160, + "step": 55630 + }, + { + "epoch": 9.075856443719413, + "grad_norm": 0.009782101027667522, + "learning_rate": 0.0006647882013489674, + "loss": 0.0758, + "num_input_tokens_seen": 120173472, + "step": 55635 + }, + { + "epoch": 9.076672104404567, + "grad_norm": 0.0022454196587204933, + "learning_rate": 0.0006647209967511245, + "loss": 0.0883, + "num_input_tokens_seen": 120184576, + "step": 55640 + }, + { + "epoch": 9.077487765089723, + "grad_norm": 0.009879418648779392, + "learning_rate": 0.0006646537888150019, + "loss": 0.0323, + "num_input_tokens_seen": 120196064, + "step": 55645 + }, + { + "epoch": 9.078303425774878, + "grad_norm": 0.249136283993721, + "learning_rate": 0.0006645865775419613, + "loss": 0.1152, + "num_input_tokens_seen": 120208096, + "step": 55650 + }, + { + "epoch": 9.079119086460032, + "grad_norm": 0.225576713681221, + "learning_rate": 0.0006645193629333649, + "loss": 0.0238, + "num_input_tokens_seen": 120218624, + "step": 55655 + }, + { + "epoch": 9.079934747145188, + "grad_norm": 0.012649464420974255, + "learning_rate": 0.0006644521449905749, + "loss": 0.1596, + "num_input_tokens_seen": 120230304, + "step": 55660 + }, + { + "epoch": 9.080750407830342, + "grad_norm": 0.002151126740500331, + "learning_rate": 0.0006643849237149536, + "loss": 0.023, + "num_input_tokens_seen": 120241248, + "step": 55665 + }, + { + "epoch": 9.081566068515498, + "grad_norm": 0.004632554017007351, + "learning_rate": 0.0006643176991078632, + "loss": 0.0139, + "num_input_tokens_seen": 120251584, + "step": 55670 + }, + { + "epoch": 9.082381729200652, + "grad_norm": 0.016930658370256424, + "learning_rate": 0.0006642504711706663, + "loss": 0.0195, + "num_input_tokens_seen": 120262336, + "step": 55675 + }, + { + "epoch": 9.083197389885807, + "grad_norm": 0.010655293241143227, + "learning_rate": 0.000664183239904725, + "loss": 0.0365, + "num_input_tokens_seen": 120274432, + "step": 55680 + }, + { + "epoch": 9.084013050570963, + "grad_norm": 0.04127860441803932, + "learning_rate": 0.0006641160053114021, + "loss": 0.0365, + "num_input_tokens_seen": 120283392, + "step": 55685 + }, + { + "epoch": 9.084828711256117, + "grad_norm": 0.05015069618821144, + "learning_rate": 0.0006640487673920605, + "loss": 0.0164, + "num_input_tokens_seen": 120293792, + "step": 55690 + }, + { + "epoch": 9.085644371941273, + "grad_norm": 0.022326963022351265, + "learning_rate": 0.0006639815261480622, + "loss": 0.0747, + "num_input_tokens_seen": 120305312, + "step": 55695 + }, + { + "epoch": 9.086460032626427, + "grad_norm": 0.02016095258295536, + "learning_rate": 0.0006639142815807704, + "loss": 0.0068, + "num_input_tokens_seen": 120315264, + "step": 55700 + }, + { + "epoch": 9.087275693311582, + "grad_norm": 0.042222410440444946, + "learning_rate": 0.0006638470336915477, + "loss": 0.1165, + "num_input_tokens_seen": 120326752, + "step": 55705 + }, + { + "epoch": 9.088091353996738, + "grad_norm": 0.1003473624587059, + "learning_rate": 0.0006637797824817569, + "loss": 0.0373, + "num_input_tokens_seen": 120337984, + "step": 55710 + }, + { + "epoch": 9.088907014681892, + "grad_norm": 0.15232303738594055, + "learning_rate": 0.000663712527952761, + "loss": 0.0373, + "num_input_tokens_seen": 120347872, + "step": 55715 + }, + { + "epoch": 9.089722675367048, + "grad_norm": 0.004314785357564688, + "learning_rate": 0.0006636452701059232, + "loss": 0.0063, + "num_input_tokens_seen": 120358656, + "step": 55720 + }, + { + "epoch": 9.090538336052202, + "grad_norm": 0.20391501486301422, + "learning_rate": 0.0006635780089426065, + "loss": 0.0809, + "num_input_tokens_seen": 120369696, + "step": 55725 + }, + { + "epoch": 9.091353996737357, + "grad_norm": 0.019305676221847534, + "learning_rate": 0.0006635107444641737, + "loss": 0.0213, + "num_input_tokens_seen": 120381440, + "step": 55730 + }, + { + "epoch": 9.092169657422513, + "grad_norm": 0.24267306923866272, + "learning_rate": 0.0006634434766719883, + "loss": 0.0856, + "num_input_tokens_seen": 120393344, + "step": 55735 + }, + { + "epoch": 9.092985318107667, + "grad_norm": 0.05459204688668251, + "learning_rate": 0.0006633762055674136, + "loss": 0.181, + "num_input_tokens_seen": 120403456, + "step": 55740 + }, + { + "epoch": 9.093800978792823, + "grad_norm": 0.1435762494802475, + "learning_rate": 0.0006633089311518128, + "loss": 0.1155, + "num_input_tokens_seen": 120414240, + "step": 55745 + }, + { + "epoch": 9.094616639477977, + "grad_norm": 0.2441323846578598, + "learning_rate": 0.0006632416534265493, + "loss": 0.05, + "num_input_tokens_seen": 120425696, + "step": 55750 + }, + { + "epoch": 9.095432300163132, + "grad_norm": 0.0228540301322937, + "learning_rate": 0.0006631743723929867, + "loss": 0.0207, + "num_input_tokens_seen": 120435424, + "step": 55755 + }, + { + "epoch": 9.096247960848286, + "grad_norm": 0.020277827978134155, + "learning_rate": 0.0006631070880524883, + "loss": 0.0162, + "num_input_tokens_seen": 120445184, + "step": 55760 + }, + { + "epoch": 9.097063621533442, + "grad_norm": 0.19620034098625183, + "learning_rate": 0.0006630398004064179, + "loss": 0.0263, + "num_input_tokens_seen": 120455648, + "step": 55765 + }, + { + "epoch": 9.097879282218598, + "grad_norm": 0.013875322416424751, + "learning_rate": 0.0006629725094561392, + "loss": 0.04, + "num_input_tokens_seen": 120467456, + "step": 55770 + }, + { + "epoch": 9.098694942903752, + "grad_norm": 0.011110931634902954, + "learning_rate": 0.0006629052152030158, + "loss": 0.0151, + "num_input_tokens_seen": 120477568, + "step": 55775 + }, + { + "epoch": 9.099510603588907, + "grad_norm": 0.015466023236513138, + "learning_rate": 0.0006628379176484115, + "loss": 0.0942, + "num_input_tokens_seen": 120487296, + "step": 55780 + }, + { + "epoch": 9.100326264274061, + "grad_norm": 0.5825760364532471, + "learning_rate": 0.0006627706167936903, + "loss": 0.2598, + "num_input_tokens_seen": 120498784, + "step": 55785 + }, + { + "epoch": 9.101141924959217, + "grad_norm": 0.03466307371854782, + "learning_rate": 0.0006627033126402159, + "loss": 0.1472, + "num_input_tokens_seen": 120509504, + "step": 55790 + }, + { + "epoch": 9.101957585644373, + "grad_norm": 0.003099187510088086, + "learning_rate": 0.0006626360051893526, + "loss": 0.0501, + "num_input_tokens_seen": 120520320, + "step": 55795 + }, + { + "epoch": 9.102773246329527, + "grad_norm": 0.07016048580408096, + "learning_rate": 0.0006625686944424642, + "loss": 0.0455, + "num_input_tokens_seen": 120530912, + "step": 55800 + }, + { + "epoch": 9.103588907014682, + "grad_norm": 0.008897113613784313, + "learning_rate": 0.0006625013804009152, + "loss": 0.0188, + "num_input_tokens_seen": 120541760, + "step": 55805 + }, + { + "epoch": 9.104404567699836, + "grad_norm": 0.022372784093022346, + "learning_rate": 0.0006624340630660695, + "loss": 0.0859, + "num_input_tokens_seen": 120552640, + "step": 55810 + }, + { + "epoch": 9.105220228384992, + "grad_norm": 0.027728265151381493, + "learning_rate": 0.0006623667424392914, + "loss": 0.0288, + "num_input_tokens_seen": 120564256, + "step": 55815 + }, + { + "epoch": 9.106035889070148, + "grad_norm": 0.31177589297294617, + "learning_rate": 0.0006622994185219453, + "loss": 0.0578, + "num_input_tokens_seen": 120574912, + "step": 55820 + }, + { + "epoch": 9.106851549755302, + "grad_norm": 0.04539692774415016, + "learning_rate": 0.0006622320913153957, + "loss": 0.0162, + "num_input_tokens_seen": 120585120, + "step": 55825 + }, + { + "epoch": 9.107667210440457, + "grad_norm": 0.004602258093655109, + "learning_rate": 0.0006621647608210068, + "loss": 0.1413, + "num_input_tokens_seen": 120595392, + "step": 55830 + }, + { + "epoch": 9.108482871125611, + "grad_norm": 0.0793989822268486, + "learning_rate": 0.0006620974270401434, + "loss": 0.0847, + "num_input_tokens_seen": 120607488, + "step": 55835 + }, + { + "epoch": 9.109298531810767, + "grad_norm": 0.01831854321062565, + "learning_rate": 0.00066203008997417, + "loss": 0.0166, + "num_input_tokens_seen": 120619136, + "step": 55840 + }, + { + "epoch": 9.11011419249592, + "grad_norm": 0.2046804577112198, + "learning_rate": 0.0006619627496244513, + "loss": 0.1595, + "num_input_tokens_seen": 120631808, + "step": 55845 + }, + { + "epoch": 9.110929853181077, + "grad_norm": 0.6116275787353516, + "learning_rate": 0.0006618954059923517, + "loss": 0.0899, + "num_input_tokens_seen": 120642752, + "step": 55850 + }, + { + "epoch": 9.111745513866232, + "grad_norm": 0.0629836916923523, + "learning_rate": 0.0006618280590792367, + "loss": 0.0101, + "num_input_tokens_seen": 120653312, + "step": 55855 + }, + { + "epoch": 9.112561174551386, + "grad_norm": 0.05145289748907089, + "learning_rate": 0.0006617607088864706, + "loss": 0.0339, + "num_input_tokens_seen": 120664384, + "step": 55860 + }, + { + "epoch": 9.113376835236542, + "grad_norm": 0.009486453607678413, + "learning_rate": 0.0006616933554154186, + "loss": 0.0151, + "num_input_tokens_seen": 120675520, + "step": 55865 + }, + { + "epoch": 9.114192495921696, + "grad_norm": 0.07350229471921921, + "learning_rate": 0.0006616259986674456, + "loss": 0.0276, + "num_input_tokens_seen": 120685856, + "step": 55870 + }, + { + "epoch": 9.115008156606851, + "grad_norm": 0.11855614185333252, + "learning_rate": 0.0006615586386439169, + "loss": 0.1287, + "num_input_tokens_seen": 120696864, + "step": 55875 + }, + { + "epoch": 9.115823817292007, + "grad_norm": 0.46437859535217285, + "learning_rate": 0.0006614912753461973, + "loss": 0.1984, + "num_input_tokens_seen": 120708352, + "step": 55880 + }, + { + "epoch": 9.116639477977161, + "grad_norm": 0.007540303748100996, + "learning_rate": 0.0006614239087756519, + "loss": 0.0658, + "num_input_tokens_seen": 120720128, + "step": 55885 + }, + { + "epoch": 9.117455138662317, + "grad_norm": 0.017881179228425026, + "learning_rate": 0.0006613565389336465, + "loss": 0.0154, + "num_input_tokens_seen": 120730048, + "step": 55890 + }, + { + "epoch": 9.11827079934747, + "grad_norm": 0.29263004660606384, + "learning_rate": 0.0006612891658215461, + "loss": 0.0828, + "num_input_tokens_seen": 120740576, + "step": 55895 + }, + { + "epoch": 9.119086460032626, + "grad_norm": 0.01154404878616333, + "learning_rate": 0.000661221789440716, + "loss": 0.0091, + "num_input_tokens_seen": 120752544, + "step": 55900 + }, + { + "epoch": 9.119902120717782, + "grad_norm": 0.01328980177640915, + "learning_rate": 0.0006611544097925219, + "loss": 0.0233, + "num_input_tokens_seen": 120763456, + "step": 55905 + }, + { + "epoch": 9.120717781402936, + "grad_norm": 0.05561533570289612, + "learning_rate": 0.0006610870268783292, + "loss": 0.0545, + "num_input_tokens_seen": 120774400, + "step": 55910 + }, + { + "epoch": 9.121533442088092, + "grad_norm": 0.11454028636217117, + "learning_rate": 0.0006610196406995038, + "loss": 0.0478, + "num_input_tokens_seen": 120785920, + "step": 55915 + }, + { + "epoch": 9.122349102773246, + "grad_norm": 0.0025789556093513966, + "learning_rate": 0.0006609522512574107, + "loss": 0.0049, + "num_input_tokens_seen": 120796960, + "step": 55920 + }, + { + "epoch": 9.123164763458401, + "grad_norm": 0.048425693064928055, + "learning_rate": 0.0006608848585534164, + "loss": 0.0134, + "num_input_tokens_seen": 120807840, + "step": 55925 + }, + { + "epoch": 9.123980424143557, + "grad_norm": 0.031781259924173355, + "learning_rate": 0.0006608174625888862, + "loss": 0.0213, + "num_input_tokens_seen": 120818944, + "step": 55930 + }, + { + "epoch": 9.124796084828711, + "grad_norm": 0.004460108932107687, + "learning_rate": 0.000660750063365186, + "loss": 0.0087, + "num_input_tokens_seen": 120830112, + "step": 55935 + }, + { + "epoch": 9.125611745513867, + "grad_norm": 0.029495006427168846, + "learning_rate": 0.000660682660883682, + "loss": 0.0379, + "num_input_tokens_seen": 120841248, + "step": 55940 + }, + { + "epoch": 9.12642740619902, + "grad_norm": 0.010408415459096432, + "learning_rate": 0.0006606152551457401, + "loss": 0.0092, + "num_input_tokens_seen": 120851840, + "step": 55945 + }, + { + "epoch": 9.127243066884176, + "grad_norm": 0.004695413634181023, + "learning_rate": 0.0006605478461527262, + "loss": 0.0373, + "num_input_tokens_seen": 120863520, + "step": 55950 + }, + { + "epoch": 9.12805872756933, + "grad_norm": 0.03372404724359512, + "learning_rate": 0.0006604804339060065, + "loss": 0.0752, + "num_input_tokens_seen": 120874176, + "step": 55955 + }, + { + "epoch": 9.128874388254486, + "grad_norm": 0.12020575255155563, + "learning_rate": 0.0006604130184069472, + "loss": 0.0095, + "num_input_tokens_seen": 120884736, + "step": 55960 + }, + { + "epoch": 9.129690048939642, + "grad_norm": 0.033447980880737305, + "learning_rate": 0.0006603455996569146, + "loss": 0.0392, + "num_input_tokens_seen": 120895296, + "step": 55965 + }, + { + "epoch": 9.130505709624796, + "grad_norm": 0.015460162423551083, + "learning_rate": 0.0006602781776572752, + "loss": 0.0314, + "num_input_tokens_seen": 120905472, + "step": 55970 + }, + { + "epoch": 9.131321370309951, + "grad_norm": 0.023283349350094795, + "learning_rate": 0.000660210752409395, + "loss": 0.053, + "num_input_tokens_seen": 120916576, + "step": 55975 + }, + { + "epoch": 9.132137030995105, + "grad_norm": 0.007297954987734556, + "learning_rate": 0.0006601433239146407, + "loss": 0.0587, + "num_input_tokens_seen": 120927232, + "step": 55980 + }, + { + "epoch": 9.132952691680261, + "grad_norm": 0.004983537830412388, + "learning_rate": 0.0006600758921743788, + "loss": 0.0129, + "num_input_tokens_seen": 120937280, + "step": 55985 + }, + { + "epoch": 9.133768352365417, + "grad_norm": 0.6522960662841797, + "learning_rate": 0.0006600084571899758, + "loss": 0.0495, + "num_input_tokens_seen": 120948992, + "step": 55990 + }, + { + "epoch": 9.13458401305057, + "grad_norm": 0.0034484846983104944, + "learning_rate": 0.0006599410189627985, + "loss": 0.0183, + "num_input_tokens_seen": 120958496, + "step": 55995 + }, + { + "epoch": 9.135399673735726, + "grad_norm": 0.10356710851192474, + "learning_rate": 0.0006598735774942135, + "loss": 0.0603, + "num_input_tokens_seen": 120971168, + "step": 56000 + }, + { + "epoch": 9.13621533442088, + "grad_norm": 0.02262182906270027, + "learning_rate": 0.0006598061327855876, + "loss": 0.0101, + "num_input_tokens_seen": 120982624, + "step": 56005 + }, + { + "epoch": 9.137030995106036, + "grad_norm": 0.018539773300290108, + "learning_rate": 0.0006597386848382878, + "loss": 0.0677, + "num_input_tokens_seen": 120994176, + "step": 56010 + }, + { + "epoch": 9.137846655791192, + "grad_norm": 0.010305029340088367, + "learning_rate": 0.000659671233653681, + "loss": 0.0253, + "num_input_tokens_seen": 121003968, + "step": 56015 + }, + { + "epoch": 9.138662316476346, + "grad_norm": 0.013459831476211548, + "learning_rate": 0.0006596037792331338, + "loss": 0.0167, + "num_input_tokens_seen": 121014112, + "step": 56020 + }, + { + "epoch": 9.139477977161501, + "grad_norm": 0.002566079143434763, + "learning_rate": 0.0006595363215780137, + "loss": 0.0126, + "num_input_tokens_seen": 121024800, + "step": 56025 + }, + { + "epoch": 9.140293637846655, + "grad_norm": 0.0190932247787714, + "learning_rate": 0.0006594688606896877, + "loss": 0.0064, + "num_input_tokens_seen": 121035552, + "step": 56030 + }, + { + "epoch": 9.141109298531811, + "grad_norm": 0.008819978684186935, + "learning_rate": 0.0006594013965695229, + "loss": 0.1523, + "num_input_tokens_seen": 121046208, + "step": 56035 + }, + { + "epoch": 9.141924959216965, + "grad_norm": 0.25872063636779785, + "learning_rate": 0.0006593339292188865, + "loss": 0.0396, + "num_input_tokens_seen": 121057312, + "step": 56040 + }, + { + "epoch": 9.14274061990212, + "grad_norm": 0.05374641716480255, + "learning_rate": 0.0006592664586391461, + "loss": 0.031, + "num_input_tokens_seen": 121068960, + "step": 56045 + }, + { + "epoch": 9.143556280587276, + "grad_norm": 0.03318631649017334, + "learning_rate": 0.0006591989848316687, + "loss": 0.0055, + "num_input_tokens_seen": 121079840, + "step": 56050 + }, + { + "epoch": 9.14437194127243, + "grad_norm": 0.002823259448632598, + "learning_rate": 0.0006591315077978221, + "loss": 0.0721, + "num_input_tokens_seen": 121089760, + "step": 56055 + }, + { + "epoch": 9.145187601957586, + "grad_norm": 0.16577103734016418, + "learning_rate": 0.0006590640275389734, + "loss": 0.0111, + "num_input_tokens_seen": 121100096, + "step": 56060 + }, + { + "epoch": 9.14600326264274, + "grad_norm": 0.00289472215808928, + "learning_rate": 0.0006589965440564905, + "loss": 0.0649, + "num_input_tokens_seen": 121110368, + "step": 56065 + }, + { + "epoch": 9.146818923327896, + "grad_norm": 0.263703316450119, + "learning_rate": 0.000658929057351741, + "loss": 0.0508, + "num_input_tokens_seen": 121119392, + "step": 56070 + }, + { + "epoch": 9.147634584013051, + "grad_norm": 0.29043954610824585, + "learning_rate": 0.0006588615674260925, + "loss": 0.0644, + "num_input_tokens_seen": 121129216, + "step": 56075 + }, + { + "epoch": 9.148450244698205, + "grad_norm": 0.013290850445628166, + "learning_rate": 0.0006587940742809127, + "loss": 0.0177, + "num_input_tokens_seen": 121139072, + "step": 56080 + }, + { + "epoch": 9.149265905383361, + "grad_norm": 0.47222426533699036, + "learning_rate": 0.0006587265779175696, + "loss": 0.0709, + "num_input_tokens_seen": 121149856, + "step": 56085 + }, + { + "epoch": 9.150081566068515, + "grad_norm": 0.004278893116861582, + "learning_rate": 0.0006586590783374311, + "loss": 0.005, + "num_input_tokens_seen": 121160608, + "step": 56090 + }, + { + "epoch": 9.15089722675367, + "grad_norm": 0.3925560414791107, + "learning_rate": 0.000658591575541865, + "loss": 0.0331, + "num_input_tokens_seen": 121170944, + "step": 56095 + }, + { + "epoch": 9.151712887438826, + "grad_norm": 0.011311687529087067, + "learning_rate": 0.0006585240695322395, + "loss": 0.0404, + "num_input_tokens_seen": 121181216, + "step": 56100 + }, + { + "epoch": 9.15252854812398, + "grad_norm": 0.005556880030781031, + "learning_rate": 0.0006584565603099227, + "loss": 0.0364, + "num_input_tokens_seen": 121192160, + "step": 56105 + }, + { + "epoch": 9.153344208809136, + "grad_norm": 0.004971515852957964, + "learning_rate": 0.0006583890478762824, + "loss": 0.0628, + "num_input_tokens_seen": 121203776, + "step": 56110 + }, + { + "epoch": 9.15415986949429, + "grad_norm": 0.007574469782412052, + "learning_rate": 0.0006583215322326874, + "loss": 0.014, + "num_input_tokens_seen": 121215040, + "step": 56115 + }, + { + "epoch": 9.154975530179446, + "grad_norm": 0.3267376720905304, + "learning_rate": 0.0006582540133805056, + "loss": 0.027, + "num_input_tokens_seen": 121226496, + "step": 56120 + }, + { + "epoch": 9.1557911908646, + "grad_norm": 0.020117826759815216, + "learning_rate": 0.0006581864913211055, + "loss": 0.0179, + "num_input_tokens_seen": 121237696, + "step": 56125 + }, + { + "epoch": 9.156606851549755, + "grad_norm": 0.005197125021368265, + "learning_rate": 0.0006581189660558554, + "loss": 0.0285, + "num_input_tokens_seen": 121248416, + "step": 56130 + }, + { + "epoch": 9.15742251223491, + "grad_norm": 0.020636849105358124, + "learning_rate": 0.000658051437586124, + "loss": 0.028, + "num_input_tokens_seen": 121259936, + "step": 56135 + }, + { + "epoch": 9.158238172920065, + "grad_norm": 0.01939970813691616, + "learning_rate": 0.0006579839059132796, + "loss": 0.0126, + "num_input_tokens_seen": 121271584, + "step": 56140 + }, + { + "epoch": 9.15905383360522, + "grad_norm": 0.004743052180856466, + "learning_rate": 0.000657916371038691, + "loss": 0.0102, + "num_input_tokens_seen": 121283072, + "step": 56145 + }, + { + "epoch": 9.159869494290374, + "grad_norm": 1.1395015716552734, + "learning_rate": 0.0006578488329637268, + "loss": 0.1382, + "num_input_tokens_seen": 121295264, + "step": 56150 + }, + { + "epoch": 9.16068515497553, + "grad_norm": 0.0044103991240262985, + "learning_rate": 0.0006577812916897558, + "loss": 0.0073, + "num_input_tokens_seen": 121306944, + "step": 56155 + }, + { + "epoch": 9.161500815660686, + "grad_norm": 0.006278656888753176, + "learning_rate": 0.0006577137472181466, + "loss": 0.2067, + "num_input_tokens_seen": 121316768, + "step": 56160 + }, + { + "epoch": 9.16231647634584, + "grad_norm": 0.09036379307508469, + "learning_rate": 0.0006576461995502682, + "loss": 0.0281, + "num_input_tokens_seen": 121327456, + "step": 56165 + }, + { + "epoch": 9.163132137030995, + "grad_norm": 0.5136862397193909, + "learning_rate": 0.0006575786486874897, + "loss": 0.0629, + "num_input_tokens_seen": 121339008, + "step": 56170 + }, + { + "epoch": 9.16394779771615, + "grad_norm": 0.09326247125864029, + "learning_rate": 0.0006575110946311801, + "loss": 0.0446, + "num_input_tokens_seen": 121350112, + "step": 56175 + }, + { + "epoch": 9.164763458401305, + "grad_norm": 0.09577307105064392, + "learning_rate": 0.0006574435373827083, + "loss": 0.2164, + "num_input_tokens_seen": 121361664, + "step": 56180 + }, + { + "epoch": 9.16557911908646, + "grad_norm": 0.4209381639957428, + "learning_rate": 0.0006573759769434433, + "loss": 0.1213, + "num_input_tokens_seen": 121372416, + "step": 56185 + }, + { + "epoch": 9.166394779771615, + "grad_norm": 0.009589829482138157, + "learning_rate": 0.0006573084133147547, + "loss": 0.0675, + "num_input_tokens_seen": 121381600, + "step": 56190 + }, + { + "epoch": 9.16721044045677, + "grad_norm": 0.014700385741889477, + "learning_rate": 0.0006572408464980115, + "loss": 0.051, + "num_input_tokens_seen": 121392000, + "step": 56195 + }, + { + "epoch": 9.168026101141924, + "grad_norm": 0.1325574666261673, + "learning_rate": 0.000657173276494583, + "loss": 0.0061, + "num_input_tokens_seen": 121402144, + "step": 56200 + }, + { + "epoch": 9.16884176182708, + "grad_norm": 0.5163242816925049, + "learning_rate": 0.0006571057033058386, + "loss": 0.0892, + "num_input_tokens_seen": 121413312, + "step": 56205 + }, + { + "epoch": 9.169657422512234, + "grad_norm": 0.7403216361999512, + "learning_rate": 0.000657038126933148, + "loss": 0.0592, + "num_input_tokens_seen": 121424384, + "step": 56210 + }, + { + "epoch": 9.17047308319739, + "grad_norm": 0.003989236429333687, + "learning_rate": 0.0006569705473778804, + "loss": 0.094, + "num_input_tokens_seen": 121434144, + "step": 56215 + }, + { + "epoch": 9.171288743882545, + "grad_norm": 0.01190384291112423, + "learning_rate": 0.0006569029646414055, + "loss": 0.0207, + "num_input_tokens_seen": 121445472, + "step": 56220 + }, + { + "epoch": 9.1721044045677, + "grad_norm": 0.3108609914779663, + "learning_rate": 0.0006568353787250931, + "loss": 0.069, + "num_input_tokens_seen": 121456704, + "step": 56225 + }, + { + "epoch": 9.172920065252855, + "grad_norm": 0.002502848394215107, + "learning_rate": 0.0006567677896303127, + "loss": 0.0191, + "num_input_tokens_seen": 121467264, + "step": 56230 + }, + { + "epoch": 9.173735725938009, + "grad_norm": 0.024630989879369736, + "learning_rate": 0.0006567001973584343, + "loss": 0.025, + "num_input_tokens_seen": 121477536, + "step": 56235 + }, + { + "epoch": 9.174551386623165, + "grad_norm": 0.2676560580730438, + "learning_rate": 0.0006566326019108275, + "loss": 0.1971, + "num_input_tokens_seen": 121489056, + "step": 56240 + }, + { + "epoch": 9.17536704730832, + "grad_norm": 0.030201373621821404, + "learning_rate": 0.0006565650032888624, + "loss": 0.0254, + "num_input_tokens_seen": 121500352, + "step": 56245 + }, + { + "epoch": 9.176182707993474, + "grad_norm": 0.04962538927793503, + "learning_rate": 0.0006564974014939088, + "loss": 0.0213, + "num_input_tokens_seen": 121511584, + "step": 56250 + }, + { + "epoch": 9.17699836867863, + "grad_norm": 0.3099713623523712, + "learning_rate": 0.0006564297965273369, + "loss": 0.1156, + "num_input_tokens_seen": 121521536, + "step": 56255 + }, + { + "epoch": 9.177814029363784, + "grad_norm": 0.42166343331336975, + "learning_rate": 0.0006563621883905167, + "loss": 0.0634, + "num_input_tokens_seen": 121531456, + "step": 56260 + }, + { + "epoch": 9.17862969004894, + "grad_norm": 0.012773294001817703, + "learning_rate": 0.0006562945770848183, + "loss": 0.1584, + "num_input_tokens_seen": 121542336, + "step": 56265 + }, + { + "epoch": 9.179445350734095, + "grad_norm": 0.03006591834127903, + "learning_rate": 0.0006562269626116122, + "loss": 0.0446, + "num_input_tokens_seen": 121553408, + "step": 56270 + }, + { + "epoch": 9.18026101141925, + "grad_norm": 0.044128891080617905, + "learning_rate": 0.0006561593449722683, + "loss": 0.0175, + "num_input_tokens_seen": 121563136, + "step": 56275 + }, + { + "epoch": 9.181076672104405, + "grad_norm": 0.04296493902802467, + "learning_rate": 0.0006560917241681573, + "loss": 0.0417, + "num_input_tokens_seen": 121573696, + "step": 56280 + }, + { + "epoch": 9.181892332789559, + "grad_norm": 0.47510603070259094, + "learning_rate": 0.0006560241002006495, + "loss": 0.1153, + "num_input_tokens_seen": 121584416, + "step": 56285 + }, + { + "epoch": 9.182707993474715, + "grad_norm": 0.017805377021431923, + "learning_rate": 0.0006559564730711153, + "loss": 0.0205, + "num_input_tokens_seen": 121595552, + "step": 56290 + }, + { + "epoch": 9.18352365415987, + "grad_norm": 0.027597038075327873, + "learning_rate": 0.0006558888427809255, + "loss": 0.0935, + "num_input_tokens_seen": 121606240, + "step": 56295 + }, + { + "epoch": 9.184339314845024, + "grad_norm": 0.0038924135733395815, + "learning_rate": 0.0006558212093314504, + "loss": 0.2137, + "num_input_tokens_seen": 121616352, + "step": 56300 + }, + { + "epoch": 9.18515497553018, + "grad_norm": 0.15157592296600342, + "learning_rate": 0.0006557535727240609, + "loss": 0.0268, + "num_input_tokens_seen": 121627776, + "step": 56305 + }, + { + "epoch": 9.185970636215334, + "grad_norm": 0.018170544877648354, + "learning_rate": 0.0006556859329601275, + "loss": 0.0576, + "num_input_tokens_seen": 121640672, + "step": 56310 + }, + { + "epoch": 9.18678629690049, + "grad_norm": 0.005017032381147146, + "learning_rate": 0.0006556182900410213, + "loss": 0.0415, + "num_input_tokens_seen": 121650976, + "step": 56315 + }, + { + "epoch": 9.187601957585644, + "grad_norm": 0.004428547341376543, + "learning_rate": 0.0006555506439681131, + "loss": 0.0155, + "num_input_tokens_seen": 121661856, + "step": 56320 + }, + { + "epoch": 9.1884176182708, + "grad_norm": 0.004647783003747463, + "learning_rate": 0.0006554829947427736, + "loss": 0.0147, + "num_input_tokens_seen": 121673024, + "step": 56325 + }, + { + "epoch": 9.189233278955955, + "grad_norm": 0.0270043034106493, + "learning_rate": 0.0006554153423663741, + "loss": 0.0127, + "num_input_tokens_seen": 121684000, + "step": 56330 + }, + { + "epoch": 9.190048939641109, + "grad_norm": 0.16975529491901398, + "learning_rate": 0.0006553476868402854, + "loss": 0.096, + "num_input_tokens_seen": 121693728, + "step": 56335 + }, + { + "epoch": 9.190864600326265, + "grad_norm": 0.00935075618326664, + "learning_rate": 0.0006552800281658789, + "loss": 0.0386, + "num_input_tokens_seen": 121703904, + "step": 56340 + }, + { + "epoch": 9.191680261011419, + "grad_norm": 0.02370108850300312, + "learning_rate": 0.0006552123663445255, + "loss": 0.0064, + "num_input_tokens_seen": 121714208, + "step": 56345 + }, + { + "epoch": 9.192495921696574, + "grad_norm": 0.012804976664483547, + "learning_rate": 0.0006551447013775967, + "loss": 0.0598, + "num_input_tokens_seen": 121723872, + "step": 56350 + }, + { + "epoch": 9.19331158238173, + "grad_norm": 0.03224405273795128, + "learning_rate": 0.0006550770332664637, + "loss": 0.0208, + "num_input_tokens_seen": 121735648, + "step": 56355 + }, + { + "epoch": 9.194127243066884, + "grad_norm": 0.12478592246770859, + "learning_rate": 0.0006550093620124979, + "loss": 0.0258, + "num_input_tokens_seen": 121747136, + "step": 56360 + }, + { + "epoch": 9.19494290375204, + "grad_norm": 0.0019217646913602948, + "learning_rate": 0.0006549416876170707, + "loss": 0.0085, + "num_input_tokens_seen": 121756544, + "step": 56365 + }, + { + "epoch": 9.195758564437194, + "grad_norm": 0.0025901400949805975, + "learning_rate": 0.0006548740100815537, + "loss": 0.0084, + "num_input_tokens_seen": 121767072, + "step": 56370 + }, + { + "epoch": 9.19657422512235, + "grad_norm": 0.22183768451213837, + "learning_rate": 0.0006548063294073183, + "loss": 0.0344, + "num_input_tokens_seen": 121777920, + "step": 56375 + }, + { + "epoch": 9.197389885807505, + "grad_norm": 0.017956186085939407, + "learning_rate": 0.0006547386455957364, + "loss": 0.1983, + "num_input_tokens_seen": 121788224, + "step": 56380 + }, + { + "epoch": 9.198205546492659, + "grad_norm": 0.00347818317823112, + "learning_rate": 0.0006546709586481794, + "loss": 0.0875, + "num_input_tokens_seen": 121799328, + "step": 56385 + }, + { + "epoch": 9.199021207177815, + "grad_norm": 0.05604798346757889, + "learning_rate": 0.0006546032685660193, + "loss": 0.0579, + "num_input_tokens_seen": 121810688, + "step": 56390 + }, + { + "epoch": 9.199836867862969, + "grad_norm": 0.07729680836200714, + "learning_rate": 0.000654535575350628, + "loss": 0.161, + "num_input_tokens_seen": 121821632, + "step": 56395 + }, + { + "epoch": 9.200652528548124, + "grad_norm": 0.3953785002231598, + "learning_rate": 0.0006544678790033769, + "loss": 0.0892, + "num_input_tokens_seen": 121832992, + "step": 56400 + }, + { + "epoch": 9.201468189233278, + "grad_norm": 0.013939537107944489, + "learning_rate": 0.0006544001795256385, + "loss": 0.0812, + "num_input_tokens_seen": 121842272, + "step": 56405 + }, + { + "epoch": 9.202283849918434, + "grad_norm": 0.025160158053040504, + "learning_rate": 0.0006543324769187844, + "loss": 0.0137, + "num_input_tokens_seen": 121852928, + "step": 56410 + }, + { + "epoch": 9.20309951060359, + "grad_norm": 0.014282099902629852, + "learning_rate": 0.0006542647711841869, + "loss": 0.0093, + "num_input_tokens_seen": 121864352, + "step": 56415 + }, + { + "epoch": 9.203915171288743, + "grad_norm": 0.1687372922897339, + "learning_rate": 0.0006541970623232183, + "loss": 0.1539, + "num_input_tokens_seen": 121875072, + "step": 56420 + }, + { + "epoch": 9.2047308319739, + "grad_norm": 0.22237369418144226, + "learning_rate": 0.0006541293503372506, + "loss": 0.0284, + "num_input_tokens_seen": 121886464, + "step": 56425 + }, + { + "epoch": 9.205546492659053, + "grad_norm": 0.025881322100758553, + "learning_rate": 0.0006540616352276558, + "loss": 0.0186, + "num_input_tokens_seen": 121897952, + "step": 56430 + }, + { + "epoch": 9.206362153344209, + "grad_norm": 0.27931275963783264, + "learning_rate": 0.0006539939169958067, + "loss": 0.0438, + "num_input_tokens_seen": 121907360, + "step": 56435 + }, + { + "epoch": 9.207177814029365, + "grad_norm": 0.005891446024179459, + "learning_rate": 0.0006539261956430755, + "loss": 0.0202, + "num_input_tokens_seen": 121918656, + "step": 56440 + }, + { + "epoch": 9.207993474714518, + "grad_norm": 0.031058935448527336, + "learning_rate": 0.0006538584711708348, + "loss": 0.1126, + "num_input_tokens_seen": 121929824, + "step": 56445 + }, + { + "epoch": 9.208809135399674, + "grad_norm": 0.06801172345876694, + "learning_rate": 0.0006537907435804569, + "loss": 0.1866, + "num_input_tokens_seen": 121939680, + "step": 56450 + }, + { + "epoch": 9.209624796084828, + "grad_norm": 0.2498117983341217, + "learning_rate": 0.0006537230128733144, + "loss": 0.0488, + "num_input_tokens_seen": 121950400, + "step": 56455 + }, + { + "epoch": 9.210440456769984, + "grad_norm": 0.41223159432411194, + "learning_rate": 0.0006536552790507802, + "loss": 0.0561, + "num_input_tokens_seen": 121960256, + "step": 56460 + }, + { + "epoch": 9.21125611745514, + "grad_norm": 0.1966393142938614, + "learning_rate": 0.0006535875421142267, + "loss": 0.0122, + "num_input_tokens_seen": 121970272, + "step": 56465 + }, + { + "epoch": 9.212071778140293, + "grad_norm": 0.01200894545763731, + "learning_rate": 0.0006535198020650269, + "loss": 0.0291, + "num_input_tokens_seen": 121979008, + "step": 56470 + }, + { + "epoch": 9.21288743882545, + "grad_norm": 0.014986738562583923, + "learning_rate": 0.0006534520589045537, + "loss": 0.029, + "num_input_tokens_seen": 121989504, + "step": 56475 + }, + { + "epoch": 9.213703099510603, + "grad_norm": 0.014505613595247269, + "learning_rate": 0.0006533843126341795, + "loss": 0.0117, + "num_input_tokens_seen": 122000320, + "step": 56480 + }, + { + "epoch": 9.214518760195759, + "grad_norm": 0.03650680184364319, + "learning_rate": 0.0006533165632552777, + "loss": 0.0721, + "num_input_tokens_seen": 122010592, + "step": 56485 + }, + { + "epoch": 9.215334420880913, + "grad_norm": 0.022520599886775017, + "learning_rate": 0.0006532488107692214, + "loss": 0.01, + "num_input_tokens_seen": 122020704, + "step": 56490 + }, + { + "epoch": 9.216150081566068, + "grad_norm": 0.0102704968303442, + "learning_rate": 0.0006531810551773836, + "loss": 0.0339, + "num_input_tokens_seen": 122031776, + "step": 56495 + }, + { + "epoch": 9.216965742251224, + "grad_norm": 0.6189824938774109, + "learning_rate": 0.0006531132964811374, + "loss": 0.1069, + "num_input_tokens_seen": 122042944, + "step": 56500 + }, + { + "epoch": 9.217781402936378, + "grad_norm": 0.002442688215523958, + "learning_rate": 0.0006530455346818559, + "loss": 0.0429, + "num_input_tokens_seen": 122053824, + "step": 56505 + }, + { + "epoch": 9.218597063621534, + "grad_norm": 0.007630040869116783, + "learning_rate": 0.0006529777697809125, + "loss": 0.0152, + "num_input_tokens_seen": 122064864, + "step": 56510 + }, + { + "epoch": 9.219412724306688, + "grad_norm": 0.009317480958998203, + "learning_rate": 0.0006529100017796805, + "loss": 0.0344, + "num_input_tokens_seen": 122075968, + "step": 56515 + }, + { + "epoch": 9.220228384991843, + "grad_norm": 0.0036200080066919327, + "learning_rate": 0.0006528422306795334, + "loss": 0.0101, + "num_input_tokens_seen": 122086912, + "step": 56520 + }, + { + "epoch": 9.221044045676999, + "grad_norm": 0.1551111936569214, + "learning_rate": 0.0006527744564818446, + "loss": 0.0387, + "num_input_tokens_seen": 122098368, + "step": 56525 + }, + { + "epoch": 9.221859706362153, + "grad_norm": 0.02000151202082634, + "learning_rate": 0.0006527066791879875, + "loss": 0.159, + "num_input_tokens_seen": 122108384, + "step": 56530 + }, + { + "epoch": 9.222675367047309, + "grad_norm": 0.5325279235839844, + "learning_rate": 0.000652638898799336, + "loss": 0.0634, + "num_input_tokens_seen": 122119808, + "step": 56535 + }, + { + "epoch": 9.223491027732463, + "grad_norm": 0.03554704412817955, + "learning_rate": 0.0006525711153172635, + "loss": 0.0072, + "num_input_tokens_seen": 122131424, + "step": 56540 + }, + { + "epoch": 9.224306688417618, + "grad_norm": 0.043445706367492676, + "learning_rate": 0.0006525033287431436, + "loss": 0.2337, + "num_input_tokens_seen": 122141536, + "step": 56545 + }, + { + "epoch": 9.225122349102774, + "grad_norm": 0.20792369544506073, + "learning_rate": 0.0006524355390783506, + "loss": 0.0909, + "num_input_tokens_seen": 122151840, + "step": 56550 + }, + { + "epoch": 9.225938009787928, + "grad_norm": 0.0052780830301344395, + "learning_rate": 0.0006523677463242579, + "loss": 0.011, + "num_input_tokens_seen": 122162624, + "step": 56555 + }, + { + "epoch": 9.226753670473084, + "grad_norm": 0.15979614853858948, + "learning_rate": 0.0006522999504822395, + "loss": 0.0126, + "num_input_tokens_seen": 122173664, + "step": 56560 + }, + { + "epoch": 9.227569331158238, + "grad_norm": 0.20983552932739258, + "learning_rate": 0.0006522321515536694, + "loss": 0.2322, + "num_input_tokens_seen": 122184480, + "step": 56565 + }, + { + "epoch": 9.228384991843393, + "grad_norm": 0.007776240352541208, + "learning_rate": 0.0006521643495399217, + "loss": 0.0231, + "num_input_tokens_seen": 122196032, + "step": 56570 + }, + { + "epoch": 9.229200652528547, + "grad_norm": 0.30364108085632324, + "learning_rate": 0.0006520965444423704, + "loss": 0.2579, + "num_input_tokens_seen": 122207392, + "step": 56575 + }, + { + "epoch": 9.230016313213703, + "grad_norm": 1.1685848236083984, + "learning_rate": 0.0006520287362623896, + "loss": 0.083, + "num_input_tokens_seen": 122218784, + "step": 56580 + }, + { + "epoch": 9.230831973898859, + "grad_norm": 0.006376779638230801, + "learning_rate": 0.0006519609250013538, + "loss": 0.0232, + "num_input_tokens_seen": 122230496, + "step": 56585 + }, + { + "epoch": 9.231647634584013, + "grad_norm": 0.018893474712967873, + "learning_rate": 0.000651893110660637, + "loss": 0.0196, + "num_input_tokens_seen": 122240512, + "step": 56590 + }, + { + "epoch": 9.232463295269168, + "grad_norm": 0.0246660728007555, + "learning_rate": 0.0006518252932416135, + "loss": 0.0541, + "num_input_tokens_seen": 122251200, + "step": 56595 + }, + { + "epoch": 9.233278955954322, + "grad_norm": 0.018446508795022964, + "learning_rate": 0.0006517574727456579, + "loss": 0.0429, + "num_input_tokens_seen": 122261184, + "step": 56600 + }, + { + "epoch": 9.234094616639478, + "grad_norm": 0.017274249345064163, + "learning_rate": 0.0006516896491741446, + "loss": 0.0276, + "num_input_tokens_seen": 122270048, + "step": 56605 + }, + { + "epoch": 9.234910277324634, + "grad_norm": 0.03069995529949665, + "learning_rate": 0.000651621822528448, + "loss": 0.0279, + "num_input_tokens_seen": 122280928, + "step": 56610 + }, + { + "epoch": 9.235725938009788, + "grad_norm": 0.5565629005432129, + "learning_rate": 0.000651553992809943, + "loss": 0.0835, + "num_input_tokens_seen": 122291488, + "step": 56615 + }, + { + "epoch": 9.236541598694943, + "grad_norm": 0.03480347990989685, + "learning_rate": 0.0006514861600200039, + "loss": 0.0487, + "num_input_tokens_seen": 122301408, + "step": 56620 + }, + { + "epoch": 9.237357259380097, + "grad_norm": 0.013822119683027267, + "learning_rate": 0.0006514183241600057, + "loss": 0.1043, + "num_input_tokens_seen": 122312416, + "step": 56625 + }, + { + "epoch": 9.238172920065253, + "grad_norm": 0.42332392930984497, + "learning_rate": 0.000651350485231323, + "loss": 0.0659, + "num_input_tokens_seen": 122324224, + "step": 56630 + }, + { + "epoch": 9.238988580750409, + "grad_norm": 0.02906874567270279, + "learning_rate": 0.0006512826432353308, + "loss": 0.0768, + "num_input_tokens_seen": 122335840, + "step": 56635 + }, + { + "epoch": 9.239804241435563, + "grad_norm": 0.01386654656380415, + "learning_rate": 0.000651214798173404, + "loss": 0.0109, + "num_input_tokens_seen": 122346208, + "step": 56640 + }, + { + "epoch": 9.240619902120718, + "grad_norm": 0.0913848802447319, + "learning_rate": 0.0006511469500469173, + "loss": 0.0962, + "num_input_tokens_seen": 122355808, + "step": 56645 + }, + { + "epoch": 9.241435562805872, + "grad_norm": 0.4897230267524719, + "learning_rate": 0.0006510790988572459, + "loss": 0.0846, + "num_input_tokens_seen": 122366656, + "step": 56650 + }, + { + "epoch": 9.242251223491028, + "grad_norm": 0.006247374229133129, + "learning_rate": 0.0006510112446057651, + "loss": 0.038, + "num_input_tokens_seen": 122375872, + "step": 56655 + }, + { + "epoch": 9.243066884176184, + "grad_norm": 0.00599702401086688, + "learning_rate": 0.0006509433872938497, + "loss": 0.0065, + "num_input_tokens_seen": 122387392, + "step": 56660 + }, + { + "epoch": 9.243882544861338, + "grad_norm": 0.024100353941321373, + "learning_rate": 0.0006508755269228752, + "loss": 0.1065, + "num_input_tokens_seen": 122396960, + "step": 56665 + }, + { + "epoch": 9.244698205546493, + "grad_norm": 0.3259066343307495, + "learning_rate": 0.0006508076634942167, + "loss": 0.0854, + "num_input_tokens_seen": 122408352, + "step": 56670 + }, + { + "epoch": 9.245513866231647, + "grad_norm": 0.0508267804980278, + "learning_rate": 0.0006507397970092496, + "loss": 0.0238, + "num_input_tokens_seen": 122420672, + "step": 56675 + }, + { + "epoch": 9.246329526916803, + "grad_norm": 0.020270129665732384, + "learning_rate": 0.0006506719274693492, + "loss": 0.0313, + "num_input_tokens_seen": 122431680, + "step": 56680 + }, + { + "epoch": 9.247145187601957, + "grad_norm": 0.016229957342147827, + "learning_rate": 0.0006506040548758911, + "loss": 0.147, + "num_input_tokens_seen": 122441984, + "step": 56685 + }, + { + "epoch": 9.247960848287113, + "grad_norm": 0.004499287344515324, + "learning_rate": 0.0006505361792302509, + "loss": 0.0328, + "num_input_tokens_seen": 122451616, + "step": 56690 + }, + { + "epoch": 9.248776508972268, + "grad_norm": 0.006004804745316505, + "learning_rate": 0.0006504683005338039, + "loss": 0.0074, + "num_input_tokens_seen": 122462848, + "step": 56695 + }, + { + "epoch": 9.249592169657422, + "grad_norm": 0.12613023817539215, + "learning_rate": 0.0006504004187879259, + "loss": 0.0231, + "num_input_tokens_seen": 122472960, + "step": 56700 + }, + { + "epoch": 9.250407830342578, + "grad_norm": 0.00950097106397152, + "learning_rate": 0.0006503325339939927, + "loss": 0.0194, + "num_input_tokens_seen": 122484064, + "step": 56705 + }, + { + "epoch": 9.251223491027732, + "grad_norm": 0.33747807145118713, + "learning_rate": 0.0006502646461533798, + "loss": 0.1316, + "num_input_tokens_seen": 122494816, + "step": 56710 + }, + { + "epoch": 9.252039151712887, + "grad_norm": 0.07069870829582214, + "learning_rate": 0.0006501967552674635, + "loss": 0.0612, + "num_input_tokens_seen": 122505024, + "step": 56715 + }, + { + "epoch": 9.252854812398043, + "grad_norm": 0.22603590786457062, + "learning_rate": 0.0006501288613376193, + "loss": 0.1817, + "num_input_tokens_seen": 122515680, + "step": 56720 + }, + { + "epoch": 9.253670473083197, + "grad_norm": 0.21570582687854767, + "learning_rate": 0.0006500609643652234, + "loss": 0.2376, + "num_input_tokens_seen": 122526368, + "step": 56725 + }, + { + "epoch": 9.254486133768353, + "grad_norm": 0.09161341190338135, + "learning_rate": 0.0006499930643516514, + "loss": 0.2312, + "num_input_tokens_seen": 122536448, + "step": 56730 + }, + { + "epoch": 9.255301794453507, + "grad_norm": 0.017243504524230957, + "learning_rate": 0.0006499251612982798, + "loss": 0.0246, + "num_input_tokens_seen": 122546464, + "step": 56735 + }, + { + "epoch": 9.256117455138662, + "grad_norm": 0.22162210941314697, + "learning_rate": 0.0006498572552064847, + "loss": 0.0464, + "num_input_tokens_seen": 122557152, + "step": 56740 + }, + { + "epoch": 9.256933115823816, + "grad_norm": 0.008149663917720318, + "learning_rate": 0.0006497893460776421, + "loss": 0.0349, + "num_input_tokens_seen": 122566848, + "step": 56745 + }, + { + "epoch": 9.257748776508972, + "grad_norm": 0.07649563997983932, + "learning_rate": 0.0006497214339131284, + "loss": 0.0273, + "num_input_tokens_seen": 122577568, + "step": 56750 + }, + { + "epoch": 9.258564437194128, + "grad_norm": 0.004069112706929445, + "learning_rate": 0.00064965351871432, + "loss": 0.0477, + "num_input_tokens_seen": 122587040, + "step": 56755 + }, + { + "epoch": 9.259380097879282, + "grad_norm": 0.0017357214819639921, + "learning_rate": 0.0006495856004825931, + "loss": 0.0669, + "num_input_tokens_seen": 122597856, + "step": 56760 + }, + { + "epoch": 9.260195758564437, + "grad_norm": 0.032659392803907394, + "learning_rate": 0.0006495176792193243, + "loss": 0.0631, + "num_input_tokens_seen": 122608032, + "step": 56765 + }, + { + "epoch": 9.261011419249591, + "grad_norm": 0.01769702136516571, + "learning_rate": 0.00064944975492589, + "loss": 0.0282, + "num_input_tokens_seen": 122620032, + "step": 56770 + }, + { + "epoch": 9.261827079934747, + "grad_norm": 0.23506838083267212, + "learning_rate": 0.0006493818276036669, + "loss": 0.0572, + "num_input_tokens_seen": 122629696, + "step": 56775 + }, + { + "epoch": 9.262642740619903, + "grad_norm": 0.06442558020353317, + "learning_rate": 0.0006493138972540316, + "loss": 0.0164, + "num_input_tokens_seen": 122639776, + "step": 56780 + }, + { + "epoch": 9.263458401305057, + "grad_norm": 0.06317390501499176, + "learning_rate": 0.0006492459638783606, + "loss": 0.0266, + "num_input_tokens_seen": 122650240, + "step": 56785 + }, + { + "epoch": 9.264274061990212, + "grad_norm": 0.29172617197036743, + "learning_rate": 0.0006491780274780308, + "loss": 0.0292, + "num_input_tokens_seen": 122661056, + "step": 56790 + }, + { + "epoch": 9.265089722675366, + "grad_norm": 0.008959591388702393, + "learning_rate": 0.0006491100880544191, + "loss": 0.0448, + "num_input_tokens_seen": 122671936, + "step": 56795 + }, + { + "epoch": 9.265905383360522, + "grad_norm": 0.2516701817512512, + "learning_rate": 0.0006490421456089023, + "loss": 0.0195, + "num_input_tokens_seen": 122683808, + "step": 56800 + }, + { + "epoch": 9.266721044045678, + "grad_norm": 0.03728965297341347, + "learning_rate": 0.0006489742001428573, + "loss": 0.0137, + "num_input_tokens_seen": 122695520, + "step": 56805 + }, + { + "epoch": 9.267536704730832, + "grad_norm": 0.006289742887020111, + "learning_rate": 0.0006489062516576613, + "loss": 0.008, + "num_input_tokens_seen": 122706624, + "step": 56810 + }, + { + "epoch": 9.268352365415987, + "grad_norm": 0.03731833025813103, + "learning_rate": 0.0006488383001546911, + "loss": 0.0216, + "num_input_tokens_seen": 122718016, + "step": 56815 + }, + { + "epoch": 9.269168026101141, + "grad_norm": 0.004130576737225056, + "learning_rate": 0.000648770345635324, + "loss": 0.0111, + "num_input_tokens_seen": 122727872, + "step": 56820 + }, + { + "epoch": 9.269983686786297, + "grad_norm": 0.0015647370601072907, + "learning_rate": 0.000648702388100937, + "loss": 0.1372, + "num_input_tokens_seen": 122737984, + "step": 56825 + }, + { + "epoch": 9.270799347471453, + "grad_norm": 0.006820981856435537, + "learning_rate": 0.0006486344275529076, + "loss": 0.1507, + "num_input_tokens_seen": 122749088, + "step": 56830 + }, + { + "epoch": 9.271615008156607, + "grad_norm": 0.0026828646659851074, + "learning_rate": 0.0006485664639926128, + "loss": 0.0131, + "num_input_tokens_seen": 122760128, + "step": 56835 + }, + { + "epoch": 9.272430668841762, + "grad_norm": 0.2718150019645691, + "learning_rate": 0.0006484984974214303, + "loss": 0.0515, + "num_input_tokens_seen": 122770592, + "step": 56840 + }, + { + "epoch": 9.273246329526916, + "grad_norm": 0.030774256214499474, + "learning_rate": 0.0006484305278407373, + "loss": 0.0379, + "num_input_tokens_seen": 122781536, + "step": 56845 + }, + { + "epoch": 9.274061990212072, + "grad_norm": 0.13015435636043549, + "learning_rate": 0.0006483625552519114, + "loss": 0.045, + "num_input_tokens_seen": 122791680, + "step": 56850 + }, + { + "epoch": 9.274877650897226, + "grad_norm": 0.03489632159471512, + "learning_rate": 0.00064829457965633, + "loss": 0.0624, + "num_input_tokens_seen": 122801728, + "step": 56855 + }, + { + "epoch": 9.275693311582382, + "grad_norm": 0.5021911263465881, + "learning_rate": 0.0006482266010553707, + "loss": 0.0752, + "num_input_tokens_seen": 122812896, + "step": 56860 + }, + { + "epoch": 9.276508972267537, + "grad_norm": 0.012664350681006908, + "learning_rate": 0.0006481586194504117, + "loss": 0.0092, + "num_input_tokens_seen": 122823488, + "step": 56865 + }, + { + "epoch": 9.277324632952691, + "grad_norm": 0.07986239343881607, + "learning_rate": 0.00064809063484283, + "loss": 0.0187, + "num_input_tokens_seen": 122834752, + "step": 56870 + }, + { + "epoch": 9.278140293637847, + "grad_norm": 0.0034950373228639364, + "learning_rate": 0.0006480226472340039, + "loss": 0.1122, + "num_input_tokens_seen": 122846080, + "step": 56875 + }, + { + "epoch": 9.278955954323001, + "grad_norm": 0.044152747839689255, + "learning_rate": 0.0006479546566253109, + "loss": 0.0141, + "num_input_tokens_seen": 122856896, + "step": 56880 + }, + { + "epoch": 9.279771615008157, + "grad_norm": 0.10844366997480392, + "learning_rate": 0.0006478866630181293, + "loss": 0.0171, + "num_input_tokens_seen": 122867456, + "step": 56885 + }, + { + "epoch": 9.280587275693312, + "grad_norm": 0.03289483115077019, + "learning_rate": 0.0006478186664138366, + "loss": 0.0796, + "num_input_tokens_seen": 122878944, + "step": 56890 + }, + { + "epoch": 9.281402936378466, + "grad_norm": 0.020551707595586777, + "learning_rate": 0.0006477506668138113, + "loss": 0.0608, + "num_input_tokens_seen": 122889408, + "step": 56895 + }, + { + "epoch": 9.282218597063622, + "grad_norm": 0.022187283262610435, + "learning_rate": 0.0006476826642194313, + "loss": 0.0563, + "num_input_tokens_seen": 122899584, + "step": 56900 + }, + { + "epoch": 9.283034257748776, + "grad_norm": 0.006548890843987465, + "learning_rate": 0.0006476146586320747, + "loss": 0.0859, + "num_input_tokens_seen": 122910976, + "step": 56905 + }, + { + "epoch": 9.283849918433932, + "grad_norm": 0.17593686282634735, + "learning_rate": 0.0006475466500531198, + "loss": 0.0161, + "num_input_tokens_seen": 122922336, + "step": 56910 + }, + { + "epoch": 9.284665579119087, + "grad_norm": 0.007709556259214878, + "learning_rate": 0.0006474786384839448, + "loss": 0.0368, + "num_input_tokens_seen": 122934208, + "step": 56915 + }, + { + "epoch": 9.285481239804241, + "grad_norm": 0.03171587735414505, + "learning_rate": 0.0006474106239259282, + "loss": 0.054, + "num_input_tokens_seen": 122945216, + "step": 56920 + }, + { + "epoch": 9.286296900489397, + "grad_norm": 0.01960030384361744, + "learning_rate": 0.0006473426063804483, + "loss": 0.007, + "num_input_tokens_seen": 122955904, + "step": 56925 + }, + { + "epoch": 9.28711256117455, + "grad_norm": 0.0011323639191687107, + "learning_rate": 0.0006472745858488835, + "loss": 0.0624, + "num_input_tokens_seen": 122967040, + "step": 56930 + }, + { + "epoch": 9.287928221859707, + "grad_norm": 0.012678350321948528, + "learning_rate": 0.0006472065623326123, + "loss": 0.008, + "num_input_tokens_seen": 122978080, + "step": 56935 + }, + { + "epoch": 9.28874388254486, + "grad_norm": 0.0027061044238507748, + "learning_rate": 0.0006471385358330135, + "loss": 0.1473, + "num_input_tokens_seen": 122988928, + "step": 56940 + }, + { + "epoch": 9.289559543230016, + "grad_norm": 0.004366982262581587, + "learning_rate": 0.0006470705063514656, + "loss": 0.0111, + "num_input_tokens_seen": 123000192, + "step": 56945 + }, + { + "epoch": 9.290375203915172, + "grad_norm": 0.0069068376906216145, + "learning_rate": 0.0006470024738893473, + "loss": 0.0357, + "num_input_tokens_seen": 123011264, + "step": 56950 + }, + { + "epoch": 9.291190864600326, + "grad_norm": 0.03688400238752365, + "learning_rate": 0.0006469344384480374, + "loss": 0.0318, + "num_input_tokens_seen": 123022336, + "step": 56955 + }, + { + "epoch": 9.292006525285482, + "grad_norm": 0.035516850650310516, + "learning_rate": 0.0006468664000289147, + "loss": 0.0397, + "num_input_tokens_seen": 123034304, + "step": 56960 + }, + { + "epoch": 9.292822185970635, + "grad_norm": 0.01059263851493597, + "learning_rate": 0.000646798358633358, + "loss": 0.1205, + "num_input_tokens_seen": 123044224, + "step": 56965 + }, + { + "epoch": 9.293637846655791, + "grad_norm": 0.1201028898358345, + "learning_rate": 0.0006467303142627465, + "loss": 0.104, + "num_input_tokens_seen": 123054624, + "step": 56970 + }, + { + "epoch": 9.294453507340947, + "grad_norm": 0.3978259861469269, + "learning_rate": 0.0006466622669184589, + "loss": 0.0853, + "num_input_tokens_seen": 123066368, + "step": 56975 + }, + { + "epoch": 9.2952691680261, + "grad_norm": 0.007530895993113518, + "learning_rate": 0.0006465942166018745, + "loss": 0.1685, + "num_input_tokens_seen": 123076224, + "step": 56980 + }, + { + "epoch": 9.296084828711257, + "grad_norm": 0.0026840101927518845, + "learning_rate": 0.0006465261633143722, + "loss": 0.0423, + "num_input_tokens_seen": 123088256, + "step": 56985 + }, + { + "epoch": 9.29690048939641, + "grad_norm": 0.03933865576982498, + "learning_rate": 0.0006464581070573315, + "loss": 0.0176, + "num_input_tokens_seen": 123098400, + "step": 56990 + }, + { + "epoch": 9.297716150081566, + "grad_norm": 0.18984612822532654, + "learning_rate": 0.0006463900478321314, + "loss": 0.0416, + "num_input_tokens_seen": 123108928, + "step": 56995 + }, + { + "epoch": 9.298531810766722, + "grad_norm": 0.047432951629161835, + "learning_rate": 0.0006463219856401513, + "loss": 0.0538, + "num_input_tokens_seen": 123120160, + "step": 57000 + }, + { + "epoch": 9.299347471451876, + "grad_norm": 0.005672098137438297, + "learning_rate": 0.0006462539204827705, + "loss": 0.0088, + "num_input_tokens_seen": 123129440, + "step": 57005 + }, + { + "epoch": 9.300163132137031, + "grad_norm": 0.011451846919953823, + "learning_rate": 0.0006461858523613684, + "loss": 0.0286, + "num_input_tokens_seen": 123139648, + "step": 57010 + }, + { + "epoch": 9.300978792822185, + "grad_norm": 0.05032016709446907, + "learning_rate": 0.0006461177812773246, + "loss": 0.1426, + "num_input_tokens_seen": 123150304, + "step": 57015 + }, + { + "epoch": 9.301794453507341, + "grad_norm": 0.009700275957584381, + "learning_rate": 0.0006460497072320186, + "loss": 0.0465, + "num_input_tokens_seen": 123160768, + "step": 57020 + }, + { + "epoch": 9.302610114192497, + "grad_norm": 0.03388618305325508, + "learning_rate": 0.00064598163022683, + "loss": 0.0154, + "num_input_tokens_seen": 123170336, + "step": 57025 + }, + { + "epoch": 9.30342577487765, + "grad_norm": 0.0011625054758042097, + "learning_rate": 0.0006459135502631386, + "loss": 0.0869, + "num_input_tokens_seen": 123181216, + "step": 57030 + }, + { + "epoch": 9.304241435562806, + "grad_norm": 0.02638522908091545, + "learning_rate": 0.0006458454673423238, + "loss": 0.0566, + "num_input_tokens_seen": 123192384, + "step": 57035 + }, + { + "epoch": 9.30505709624796, + "grad_norm": 0.21478807926177979, + "learning_rate": 0.0006457773814657657, + "loss": 0.1093, + "num_input_tokens_seen": 123203424, + "step": 57040 + }, + { + "epoch": 9.305872756933116, + "grad_norm": 0.04131392389535904, + "learning_rate": 0.000645709292634844, + "loss": 0.0413, + "num_input_tokens_seen": 123213152, + "step": 57045 + }, + { + "epoch": 9.30668841761827, + "grad_norm": 0.04098490625619888, + "learning_rate": 0.0006456412008509387, + "loss": 0.0425, + "num_input_tokens_seen": 123223616, + "step": 57050 + }, + { + "epoch": 9.307504078303426, + "grad_norm": 0.24334684014320374, + "learning_rate": 0.0006455731061154297, + "loss": 0.0385, + "num_input_tokens_seen": 123233632, + "step": 57055 + }, + { + "epoch": 9.308319738988581, + "grad_norm": 0.03460830822587013, + "learning_rate": 0.0006455050084296969, + "loss": 0.0976, + "num_input_tokens_seen": 123244608, + "step": 57060 + }, + { + "epoch": 9.309135399673735, + "grad_norm": 0.06306884437799454, + "learning_rate": 0.0006454369077951206, + "loss": 0.0363, + "num_input_tokens_seen": 123253952, + "step": 57065 + }, + { + "epoch": 9.309951060358891, + "grad_norm": 0.0017151982756331563, + "learning_rate": 0.0006453688042130808, + "loss": 0.0099, + "num_input_tokens_seen": 123263968, + "step": 57070 + }, + { + "epoch": 9.310766721044045, + "grad_norm": 0.013023875653743744, + "learning_rate": 0.0006453006976849578, + "loss": 0.012, + "num_input_tokens_seen": 123273312, + "step": 57075 + }, + { + "epoch": 9.3115823817292, + "grad_norm": 0.5808905363082886, + "learning_rate": 0.0006452325882121319, + "loss": 0.1844, + "num_input_tokens_seen": 123284864, + "step": 57080 + }, + { + "epoch": 9.312398042414356, + "grad_norm": 0.020586438477039337, + "learning_rate": 0.0006451644757959834, + "loss": 0.0135, + "num_input_tokens_seen": 123295840, + "step": 57085 + }, + { + "epoch": 9.31321370309951, + "grad_norm": 0.007403132040053606, + "learning_rate": 0.0006450963604378926, + "loss": 0.0573, + "num_input_tokens_seen": 123304992, + "step": 57090 + }, + { + "epoch": 9.314029363784666, + "grad_norm": 0.05940420553088188, + "learning_rate": 0.0006450282421392399, + "loss": 0.0852, + "num_input_tokens_seen": 123314720, + "step": 57095 + }, + { + "epoch": 9.31484502446982, + "grad_norm": 0.15579479932785034, + "learning_rate": 0.0006449601209014059, + "loss": 0.0413, + "num_input_tokens_seen": 123323168, + "step": 57100 + }, + { + "epoch": 9.315660685154976, + "grad_norm": 0.29078125953674316, + "learning_rate": 0.0006448919967257711, + "loss": 0.0435, + "num_input_tokens_seen": 123333952, + "step": 57105 + }, + { + "epoch": 9.31647634584013, + "grad_norm": 0.008018561638891697, + "learning_rate": 0.0006448238696137163, + "loss": 0.0134, + "num_input_tokens_seen": 123344928, + "step": 57110 + }, + { + "epoch": 9.317292006525285, + "grad_norm": 0.0108210863545537, + "learning_rate": 0.0006447557395666221, + "loss": 0.0491, + "num_input_tokens_seen": 123354720, + "step": 57115 + }, + { + "epoch": 9.318107667210441, + "grad_norm": 0.21279096603393555, + "learning_rate": 0.0006446876065858691, + "loss": 0.1351, + "num_input_tokens_seen": 123366272, + "step": 57120 + }, + { + "epoch": 9.318923327895595, + "grad_norm": 0.2715965211391449, + "learning_rate": 0.0006446194706728383, + "loss": 0.0278, + "num_input_tokens_seen": 123377472, + "step": 57125 + }, + { + "epoch": 9.31973898858075, + "grad_norm": 0.0017662273021414876, + "learning_rate": 0.0006445513318289104, + "loss": 0.048, + "num_input_tokens_seen": 123389280, + "step": 57130 + }, + { + "epoch": 9.320554649265905, + "grad_norm": 0.01154270675033331, + "learning_rate": 0.0006444831900554664, + "loss": 0.0076, + "num_input_tokens_seen": 123400352, + "step": 57135 + }, + { + "epoch": 9.32137030995106, + "grad_norm": 0.004029486328363419, + "learning_rate": 0.0006444150453538873, + "loss": 0.036, + "num_input_tokens_seen": 123411616, + "step": 57140 + }, + { + "epoch": 9.322185970636216, + "grad_norm": 0.01075716968625784, + "learning_rate": 0.000644346897725554, + "loss": 0.0088, + "num_input_tokens_seen": 123422304, + "step": 57145 + }, + { + "epoch": 9.32300163132137, + "grad_norm": 0.003960131201893091, + "learning_rate": 0.0006442787471718479, + "loss": 0.0067, + "num_input_tokens_seen": 123433024, + "step": 57150 + }, + { + "epoch": 9.323817292006526, + "grad_norm": 0.005982367787510157, + "learning_rate": 0.0006442105936941498, + "loss": 0.0425, + "num_input_tokens_seen": 123444416, + "step": 57155 + }, + { + "epoch": 9.32463295269168, + "grad_norm": 0.038486577570438385, + "learning_rate": 0.000644142437293841, + "loss": 0.1482, + "num_input_tokens_seen": 123454880, + "step": 57160 + }, + { + "epoch": 9.325448613376835, + "grad_norm": 0.11881979554891586, + "learning_rate": 0.000644074277972303, + "loss": 0.0463, + "num_input_tokens_seen": 123465472, + "step": 57165 + }, + { + "epoch": 9.326264274061991, + "grad_norm": 0.010927199386060238, + "learning_rate": 0.000644006115730917, + "loss": 0.0554, + "num_input_tokens_seen": 123477120, + "step": 57170 + }, + { + "epoch": 9.327079934747145, + "grad_norm": 0.01006519515067339, + "learning_rate": 0.000643937950571064, + "loss": 0.0619, + "num_input_tokens_seen": 123487296, + "step": 57175 + }, + { + "epoch": 9.3278955954323, + "grad_norm": 0.008437362499535084, + "learning_rate": 0.0006438697824941263, + "loss": 0.0606, + "num_input_tokens_seen": 123497440, + "step": 57180 + }, + { + "epoch": 9.328711256117455, + "grad_norm": 0.011827422305941582, + "learning_rate": 0.0006438016115014848, + "loss": 0.009, + "num_input_tokens_seen": 123508384, + "step": 57185 + }, + { + "epoch": 9.32952691680261, + "grad_norm": 0.0015898183919489384, + "learning_rate": 0.0006437334375945212, + "loss": 0.0186, + "num_input_tokens_seen": 123519296, + "step": 57190 + }, + { + "epoch": 9.330342577487766, + "grad_norm": 0.019101936370134354, + "learning_rate": 0.0006436652607746171, + "loss": 0.0432, + "num_input_tokens_seen": 123532256, + "step": 57195 + }, + { + "epoch": 9.33115823817292, + "grad_norm": 0.0033209563698619604, + "learning_rate": 0.0006435970810431544, + "loss": 0.0032, + "num_input_tokens_seen": 123542112, + "step": 57200 + }, + { + "epoch": 9.331973898858076, + "grad_norm": 0.029729114845395088, + "learning_rate": 0.0006435288984015146, + "loss": 0.0626, + "num_input_tokens_seen": 123552352, + "step": 57205 + }, + { + "epoch": 9.33278955954323, + "grad_norm": 0.36488205194473267, + "learning_rate": 0.0006434607128510796, + "loss": 0.1586, + "num_input_tokens_seen": 123562880, + "step": 57210 + }, + { + "epoch": 9.333605220228385, + "grad_norm": 0.3097657561302185, + "learning_rate": 0.0006433925243932312, + "loss": 0.1643, + "num_input_tokens_seen": 123573536, + "step": 57215 + }, + { + "epoch": 9.33442088091354, + "grad_norm": 0.008041603490710258, + "learning_rate": 0.0006433243330293514, + "loss": 0.0461, + "num_input_tokens_seen": 123584416, + "step": 57220 + }, + { + "epoch": 9.335236541598695, + "grad_norm": 0.003848917316645384, + "learning_rate": 0.0006432561387608222, + "loss": 0.0302, + "num_input_tokens_seen": 123596352, + "step": 57225 + }, + { + "epoch": 9.33605220228385, + "grad_norm": 0.004652928560972214, + "learning_rate": 0.0006431879415890256, + "loss": 0.0794, + "num_input_tokens_seen": 123607008, + "step": 57230 + }, + { + "epoch": 9.336867862969005, + "grad_norm": 0.04283025488257408, + "learning_rate": 0.0006431197415153437, + "loss": 0.1123, + "num_input_tokens_seen": 123617632, + "step": 57235 + }, + { + "epoch": 9.33768352365416, + "grad_norm": 0.01733693666756153, + "learning_rate": 0.0006430515385411588, + "loss": 0.0486, + "num_input_tokens_seen": 123628192, + "step": 57240 + }, + { + "epoch": 9.338499184339314, + "grad_norm": 0.2398867905139923, + "learning_rate": 0.0006429833326678529, + "loss": 0.0316, + "num_input_tokens_seen": 123638304, + "step": 57245 + }, + { + "epoch": 9.33931484502447, + "grad_norm": 0.331961452960968, + "learning_rate": 0.0006429151238968083, + "loss": 0.0822, + "num_input_tokens_seen": 123649792, + "step": 57250 + }, + { + "epoch": 9.340130505709626, + "grad_norm": 0.1437341868877411, + "learning_rate": 0.0006428469122294075, + "loss": 0.0107, + "num_input_tokens_seen": 123661024, + "step": 57255 + }, + { + "epoch": 9.34094616639478, + "grad_norm": 0.09274806827306747, + "learning_rate": 0.0006427786976670328, + "loss": 0.0151, + "num_input_tokens_seen": 123672672, + "step": 57260 + }, + { + "epoch": 9.341761827079935, + "grad_norm": 0.0013064906233921647, + "learning_rate": 0.0006427104802110667, + "loss": 0.0185, + "num_input_tokens_seen": 123683296, + "step": 57265 + }, + { + "epoch": 9.34257748776509, + "grad_norm": 0.4128345549106598, + "learning_rate": 0.0006426422598628916, + "loss": 0.094, + "num_input_tokens_seen": 123694848, + "step": 57270 + }, + { + "epoch": 9.343393148450245, + "grad_norm": 0.6230624318122864, + "learning_rate": 0.0006425740366238903, + "loss": 0.1, + "num_input_tokens_seen": 123705760, + "step": 57275 + }, + { + "epoch": 9.3442088091354, + "grad_norm": 0.01017848402261734, + "learning_rate": 0.0006425058104954451, + "loss": 0.018, + "num_input_tokens_seen": 123716000, + "step": 57280 + }, + { + "epoch": 9.345024469820554, + "grad_norm": 0.18681271374225616, + "learning_rate": 0.0006424375814789388, + "loss": 0.1853, + "num_input_tokens_seen": 123727616, + "step": 57285 + }, + { + "epoch": 9.34584013050571, + "grad_norm": 0.419785737991333, + "learning_rate": 0.0006423693495757545, + "loss": 0.1258, + "num_input_tokens_seen": 123738400, + "step": 57290 + }, + { + "epoch": 9.346655791190864, + "grad_norm": 0.005531011614948511, + "learning_rate": 0.0006423011147872745, + "loss": 0.0515, + "num_input_tokens_seen": 123747616, + "step": 57295 + }, + { + "epoch": 9.34747145187602, + "grad_norm": 0.04268835484981537, + "learning_rate": 0.000642232877114882, + "loss": 0.0256, + "num_input_tokens_seen": 123759424, + "step": 57300 + }, + { + "epoch": 9.348287112561174, + "grad_norm": 0.012237378396093845, + "learning_rate": 0.0006421646365599597, + "loss": 0.0214, + "num_input_tokens_seen": 123771072, + "step": 57305 + }, + { + "epoch": 9.34910277324633, + "grad_norm": 0.05506362393498421, + "learning_rate": 0.0006420963931238907, + "loss": 0.0551, + "num_input_tokens_seen": 123782816, + "step": 57310 + }, + { + "epoch": 9.349918433931485, + "grad_norm": 0.0642736479640007, + "learning_rate": 0.0006420281468080582, + "loss": 0.0235, + "num_input_tokens_seen": 123792416, + "step": 57315 + }, + { + "epoch": 9.350734094616639, + "grad_norm": 0.2579999566078186, + "learning_rate": 0.0006419598976138451, + "loss": 0.0431, + "num_input_tokens_seen": 123803680, + "step": 57320 + }, + { + "epoch": 9.351549755301795, + "grad_norm": 0.009822947904467583, + "learning_rate": 0.0006418916455426344, + "loss": 0.0166, + "num_input_tokens_seen": 123814912, + "step": 57325 + }, + { + "epoch": 9.352365415986949, + "grad_norm": 0.0744229406118393, + "learning_rate": 0.0006418233905958097, + "loss": 0.0148, + "num_input_tokens_seen": 123825856, + "step": 57330 + }, + { + "epoch": 9.353181076672104, + "grad_norm": 0.012941932305693626, + "learning_rate": 0.000641755132774754, + "loss": 0.1114, + "num_input_tokens_seen": 123836032, + "step": 57335 + }, + { + "epoch": 9.35399673735726, + "grad_norm": 0.027866492047905922, + "learning_rate": 0.0006416868720808507, + "loss": 0.181, + "num_input_tokens_seen": 123846080, + "step": 57340 + }, + { + "epoch": 9.354812398042414, + "grad_norm": 0.007846455089747906, + "learning_rate": 0.0006416186085154833, + "loss": 0.1681, + "num_input_tokens_seen": 123856928, + "step": 57345 + }, + { + "epoch": 9.35562805872757, + "grad_norm": 0.029931560158729553, + "learning_rate": 0.0006415503420800349, + "loss": 0.1149, + "num_input_tokens_seen": 123867072, + "step": 57350 + }, + { + "epoch": 9.356443719412724, + "grad_norm": 0.036976058036088943, + "learning_rate": 0.0006414820727758894, + "loss": 0.1055, + "num_input_tokens_seen": 123877280, + "step": 57355 + }, + { + "epoch": 9.35725938009788, + "grad_norm": 0.08427825570106506, + "learning_rate": 0.0006414138006044303, + "loss": 0.0165, + "num_input_tokens_seen": 123887040, + "step": 57360 + }, + { + "epoch": 9.358075040783035, + "grad_norm": 0.009288780391216278, + "learning_rate": 0.0006413455255670409, + "loss": 0.0601, + "num_input_tokens_seen": 123897632, + "step": 57365 + }, + { + "epoch": 9.358890701468189, + "grad_norm": 0.007328356616199017, + "learning_rate": 0.0006412772476651053, + "loss": 0.0468, + "num_input_tokens_seen": 123909024, + "step": 57370 + }, + { + "epoch": 9.359706362153345, + "grad_norm": 0.012570452876389027, + "learning_rate": 0.0006412089669000071, + "loss": 0.0284, + "num_input_tokens_seen": 123920864, + "step": 57375 + }, + { + "epoch": 9.360522022838499, + "grad_norm": 0.01703028939664364, + "learning_rate": 0.0006411406832731299, + "loss": 0.0177, + "num_input_tokens_seen": 123932224, + "step": 57380 + }, + { + "epoch": 9.361337683523654, + "grad_norm": 0.01287777628749609, + "learning_rate": 0.0006410723967858577, + "loss": 0.0599, + "num_input_tokens_seen": 123944064, + "step": 57385 + }, + { + "epoch": 9.362153344208808, + "grad_norm": 0.10792047530412674, + "learning_rate": 0.0006410041074395744, + "loss": 0.0751, + "num_input_tokens_seen": 123955168, + "step": 57390 + }, + { + "epoch": 9.362969004893964, + "grad_norm": 0.2537194788455963, + "learning_rate": 0.0006409358152356642, + "loss": 0.1646, + "num_input_tokens_seen": 123965920, + "step": 57395 + }, + { + "epoch": 9.36378466557912, + "grad_norm": 0.04030577838420868, + "learning_rate": 0.0006408675201755107, + "loss": 0.0461, + "num_input_tokens_seen": 123978048, + "step": 57400 + }, + { + "epoch": 9.364600326264274, + "grad_norm": 0.7500656247138977, + "learning_rate": 0.0006407992222604983, + "loss": 0.0785, + "num_input_tokens_seen": 123989216, + "step": 57405 + }, + { + "epoch": 9.36541598694943, + "grad_norm": 0.0026482176035642624, + "learning_rate": 0.000640730921492011, + "loss": 0.0115, + "num_input_tokens_seen": 124001408, + "step": 57410 + }, + { + "epoch": 9.366231647634583, + "grad_norm": 0.09167847037315369, + "learning_rate": 0.000640662617871433, + "loss": 0.1328, + "num_input_tokens_seen": 124011904, + "step": 57415 + }, + { + "epoch": 9.367047308319739, + "grad_norm": 0.023537907749414444, + "learning_rate": 0.0006405943114001486, + "loss": 0.0702, + "num_input_tokens_seen": 124023040, + "step": 57420 + }, + { + "epoch": 9.367862969004895, + "grad_norm": 0.722507655620575, + "learning_rate": 0.0006405260020795421, + "loss": 0.0655, + "num_input_tokens_seen": 124033728, + "step": 57425 + }, + { + "epoch": 9.368678629690049, + "grad_norm": 0.07008686661720276, + "learning_rate": 0.0006404576899109981, + "loss": 0.0191, + "num_input_tokens_seen": 124043840, + "step": 57430 + }, + { + "epoch": 9.369494290375204, + "grad_norm": 0.392330527305603, + "learning_rate": 0.0006403893748959007, + "loss": 0.0943, + "num_input_tokens_seen": 124055040, + "step": 57435 + }, + { + "epoch": 9.370309951060358, + "grad_norm": 0.008758448995649815, + "learning_rate": 0.0006403210570356346, + "loss": 0.1311, + "num_input_tokens_seen": 124065056, + "step": 57440 + }, + { + "epoch": 9.371125611745514, + "grad_norm": 0.0045740483328700066, + "learning_rate": 0.0006402527363315843, + "loss": 0.0355, + "num_input_tokens_seen": 124076224, + "step": 57445 + }, + { + "epoch": 9.37194127243067, + "grad_norm": 0.002997696865350008, + "learning_rate": 0.0006401844127851342, + "loss": 0.0072, + "num_input_tokens_seen": 124087424, + "step": 57450 + }, + { + "epoch": 9.372756933115824, + "grad_norm": 0.001006297068670392, + "learning_rate": 0.0006401160863976691, + "loss": 0.0045, + "num_input_tokens_seen": 124098144, + "step": 57455 + }, + { + "epoch": 9.37357259380098, + "grad_norm": 0.30021172761917114, + "learning_rate": 0.000640047757170574, + "loss": 0.0892, + "num_input_tokens_seen": 124109184, + "step": 57460 + }, + { + "epoch": 9.374388254486133, + "grad_norm": 0.21571239829063416, + "learning_rate": 0.0006399794251052333, + "loss": 0.0363, + "num_input_tokens_seen": 124120032, + "step": 57465 + }, + { + "epoch": 9.375203915171289, + "grad_norm": 0.21111957728862762, + "learning_rate": 0.000639911090203032, + "loss": 0.028, + "num_input_tokens_seen": 124130752, + "step": 57470 + }, + { + "epoch": 9.376019575856443, + "grad_norm": 0.4067305028438568, + "learning_rate": 0.000639842752465355, + "loss": 0.0688, + "num_input_tokens_seen": 124140896, + "step": 57475 + }, + { + "epoch": 9.376835236541599, + "grad_norm": 0.1772114485502243, + "learning_rate": 0.0006397744118935871, + "loss": 0.0654, + "num_input_tokens_seen": 124152000, + "step": 57480 + }, + { + "epoch": 9.377650897226754, + "grad_norm": 0.057778965681791306, + "learning_rate": 0.0006397060684891136, + "loss": 0.0127, + "num_input_tokens_seen": 124163296, + "step": 57485 + }, + { + "epoch": 9.378466557911908, + "grad_norm": 0.009378801099956036, + "learning_rate": 0.0006396377222533192, + "loss": 0.0094, + "num_input_tokens_seen": 124173824, + "step": 57490 + }, + { + "epoch": 9.379282218597064, + "grad_norm": 0.004530887119472027, + "learning_rate": 0.0006395693731875892, + "loss": 0.0342, + "num_input_tokens_seen": 124184384, + "step": 57495 + }, + { + "epoch": 9.380097879282218, + "grad_norm": 0.0177314355969429, + "learning_rate": 0.000639501021293309, + "loss": 0.0721, + "num_input_tokens_seen": 124196352, + "step": 57500 + }, + { + "epoch": 9.380913539967374, + "grad_norm": 0.10000865161418915, + "learning_rate": 0.0006394326665718635, + "loss": 0.0947, + "num_input_tokens_seen": 124206720, + "step": 57505 + }, + { + "epoch": 9.38172920065253, + "grad_norm": 0.026358895003795624, + "learning_rate": 0.0006393643090246381, + "loss": 0.1997, + "num_input_tokens_seen": 124217856, + "step": 57510 + }, + { + "epoch": 9.382544861337683, + "grad_norm": 0.01959538832306862, + "learning_rate": 0.0006392959486530183, + "loss": 0.0319, + "num_input_tokens_seen": 124227616, + "step": 57515 + }, + { + "epoch": 9.383360522022839, + "grad_norm": 0.1922665238380432, + "learning_rate": 0.0006392275854583894, + "loss": 0.0153, + "num_input_tokens_seen": 124236800, + "step": 57520 + }, + { + "epoch": 9.384176182707993, + "grad_norm": 0.0030915914103388786, + "learning_rate": 0.0006391592194421367, + "loss": 0.0529, + "num_input_tokens_seen": 124247456, + "step": 57525 + }, + { + "epoch": 9.384991843393149, + "grad_norm": 0.02654971182346344, + "learning_rate": 0.0006390908506056461, + "loss": 0.0673, + "num_input_tokens_seen": 124258752, + "step": 57530 + }, + { + "epoch": 9.385807504078304, + "grad_norm": 0.008271587081253529, + "learning_rate": 0.0006390224789503028, + "loss": 0.0834, + "num_input_tokens_seen": 124269856, + "step": 57535 + }, + { + "epoch": 9.386623164763458, + "grad_norm": 0.31094446778297424, + "learning_rate": 0.0006389541044774927, + "loss": 0.0556, + "num_input_tokens_seen": 124280640, + "step": 57540 + }, + { + "epoch": 9.387438825448614, + "grad_norm": 0.20158900320529938, + "learning_rate": 0.0006388857271886013, + "loss": 0.0321, + "num_input_tokens_seen": 124290976, + "step": 57545 + }, + { + "epoch": 9.388254486133768, + "grad_norm": 0.26112285256385803, + "learning_rate": 0.0006388173470850144, + "loss": 0.0209, + "num_input_tokens_seen": 124302752, + "step": 57550 + }, + { + "epoch": 9.389070146818923, + "grad_norm": 0.003522562561556697, + "learning_rate": 0.0006387489641681181, + "loss": 0.0037, + "num_input_tokens_seen": 124314208, + "step": 57555 + }, + { + "epoch": 9.38988580750408, + "grad_norm": 0.08223313838243484, + "learning_rate": 0.0006386805784392978, + "loss": 0.1029, + "num_input_tokens_seen": 124324992, + "step": 57560 + }, + { + "epoch": 9.390701468189233, + "grad_norm": 0.09716961532831192, + "learning_rate": 0.0006386121898999397, + "loss": 0.0255, + "num_input_tokens_seen": 124335264, + "step": 57565 + }, + { + "epoch": 9.391517128874389, + "grad_norm": 0.1217547208070755, + "learning_rate": 0.0006385437985514297, + "loss": 0.1149, + "num_input_tokens_seen": 124345952, + "step": 57570 + }, + { + "epoch": 9.392332789559543, + "grad_norm": 0.005859457887709141, + "learning_rate": 0.000638475404395154, + "loss": 0.0781, + "num_input_tokens_seen": 124357152, + "step": 57575 + }, + { + "epoch": 9.393148450244698, + "grad_norm": 0.02815798856317997, + "learning_rate": 0.0006384070074324984, + "loss": 0.0335, + "num_input_tokens_seen": 124368096, + "step": 57580 + }, + { + "epoch": 9.393964110929852, + "grad_norm": 0.3322744369506836, + "learning_rate": 0.0006383386076648494, + "loss": 0.1213, + "num_input_tokens_seen": 124378272, + "step": 57585 + }, + { + "epoch": 9.394779771615008, + "grad_norm": 0.09620869159698486, + "learning_rate": 0.0006382702050935929, + "loss": 0.0363, + "num_input_tokens_seen": 124389856, + "step": 57590 + }, + { + "epoch": 9.395595432300164, + "grad_norm": 0.011701155453920364, + "learning_rate": 0.0006382017997201152, + "loss": 0.0438, + "num_input_tokens_seen": 124402240, + "step": 57595 + }, + { + "epoch": 9.396411092985318, + "grad_norm": 0.007712031714618206, + "learning_rate": 0.000638133391545803, + "loss": 0.0098, + "num_input_tokens_seen": 124413344, + "step": 57600 + }, + { + "epoch": 9.397226753670473, + "grad_norm": 0.1123395785689354, + "learning_rate": 0.000638064980572042, + "loss": 0.0149, + "num_input_tokens_seen": 124425600, + "step": 57605 + }, + { + "epoch": 9.398042414355627, + "grad_norm": 0.017627792432904243, + "learning_rate": 0.0006379965668002192, + "loss": 0.1074, + "num_input_tokens_seen": 124435200, + "step": 57610 + }, + { + "epoch": 9.398858075040783, + "grad_norm": 0.0041339038871228695, + "learning_rate": 0.0006379281502317209, + "loss": 0.0121, + "num_input_tokens_seen": 124445888, + "step": 57615 + }, + { + "epoch": 9.399673735725939, + "grad_norm": 0.07806552201509476, + "learning_rate": 0.0006378597308679338, + "loss": 0.0267, + "num_input_tokens_seen": 124457696, + "step": 57620 + }, + { + "epoch": 9.400489396411093, + "grad_norm": 0.016718747094273567, + "learning_rate": 0.0006377913087102443, + "loss": 0.0852, + "num_input_tokens_seen": 124469312, + "step": 57625 + }, + { + "epoch": 9.401305057096248, + "grad_norm": 0.016923511400818825, + "learning_rate": 0.0006377228837600391, + "loss": 0.1711, + "num_input_tokens_seen": 124479584, + "step": 57630 + }, + { + "epoch": 9.402120717781402, + "grad_norm": 0.002410069340839982, + "learning_rate": 0.0006376544560187049, + "loss": 0.0281, + "num_input_tokens_seen": 124490432, + "step": 57635 + }, + { + "epoch": 9.402936378466558, + "grad_norm": 0.00935316551476717, + "learning_rate": 0.0006375860254876286, + "loss": 0.0536, + "num_input_tokens_seen": 124501312, + "step": 57640 + }, + { + "epoch": 9.403752039151712, + "grad_norm": 0.05050384998321533, + "learning_rate": 0.0006375175921681968, + "loss": 0.0437, + "num_input_tokens_seen": 124512256, + "step": 57645 + }, + { + "epoch": 9.404567699836868, + "grad_norm": 0.002771280240267515, + "learning_rate": 0.0006374491560617967, + "loss": 0.2557, + "num_input_tokens_seen": 124523328, + "step": 57650 + }, + { + "epoch": 9.405383360522023, + "grad_norm": 0.01832154579460621, + "learning_rate": 0.0006373807171698151, + "loss": 0.1944, + "num_input_tokens_seen": 124532832, + "step": 57655 + }, + { + "epoch": 9.406199021207177, + "grad_norm": 0.22622382640838623, + "learning_rate": 0.0006373122754936389, + "loss": 0.1608, + "num_input_tokens_seen": 124544288, + "step": 57660 + }, + { + "epoch": 9.407014681892333, + "grad_norm": 0.1542602777481079, + "learning_rate": 0.0006372438310346553, + "loss": 0.0357, + "num_input_tokens_seen": 124553952, + "step": 57665 + }, + { + "epoch": 9.407830342577487, + "grad_norm": 0.1983586847782135, + "learning_rate": 0.0006371753837942513, + "loss": 0.0626, + "num_input_tokens_seen": 124565344, + "step": 57670 + }, + { + "epoch": 9.408646003262643, + "grad_norm": 0.370592325925827, + "learning_rate": 0.0006371069337738142, + "loss": 0.0444, + "num_input_tokens_seen": 124576032, + "step": 57675 + }, + { + "epoch": 9.409461663947798, + "grad_norm": 0.004081313032656908, + "learning_rate": 0.000637038480974731, + "loss": 0.0331, + "num_input_tokens_seen": 124586976, + "step": 57680 + }, + { + "epoch": 9.410277324632952, + "grad_norm": 0.019369378685951233, + "learning_rate": 0.0006369700253983893, + "loss": 0.1193, + "num_input_tokens_seen": 124598368, + "step": 57685 + }, + { + "epoch": 9.411092985318108, + "grad_norm": 0.027157841250300407, + "learning_rate": 0.0006369015670461762, + "loss": 0.0228, + "num_input_tokens_seen": 124608832, + "step": 57690 + }, + { + "epoch": 9.411908646003262, + "grad_norm": 0.0030664519872516394, + "learning_rate": 0.0006368331059194792, + "loss": 0.0632, + "num_input_tokens_seen": 124620032, + "step": 57695 + }, + { + "epoch": 9.412724306688418, + "grad_norm": 0.47488272190093994, + "learning_rate": 0.0006367646420196857, + "loss": 0.1051, + "num_input_tokens_seen": 124630784, + "step": 57700 + }, + { + "epoch": 9.413539967373573, + "grad_norm": 0.007694408297538757, + "learning_rate": 0.0006366961753481832, + "loss": 0.0419, + "num_input_tokens_seen": 124641312, + "step": 57705 + }, + { + "epoch": 9.414355628058727, + "grad_norm": 0.014576110988855362, + "learning_rate": 0.0006366277059063594, + "loss": 0.0442, + "num_input_tokens_seen": 124652192, + "step": 57710 + }, + { + "epoch": 9.415171288743883, + "grad_norm": 0.35242435336112976, + "learning_rate": 0.0006365592336956017, + "loss": 0.1291, + "num_input_tokens_seen": 124663168, + "step": 57715 + }, + { + "epoch": 9.415986949429037, + "grad_norm": 0.023442458361387253, + "learning_rate": 0.0006364907587172978, + "loss": 0.0275, + "num_input_tokens_seen": 124672832, + "step": 57720 + }, + { + "epoch": 9.416802610114193, + "grad_norm": 0.08906247466802597, + "learning_rate": 0.0006364222809728358, + "loss": 0.0782, + "num_input_tokens_seen": 124684064, + "step": 57725 + }, + { + "epoch": 9.417618270799348, + "grad_norm": 0.21500514447689056, + "learning_rate": 0.0006363538004636032, + "loss": 0.0316, + "num_input_tokens_seen": 124695040, + "step": 57730 + }, + { + "epoch": 9.418433931484502, + "grad_norm": 0.008945376612246037, + "learning_rate": 0.0006362853171909876, + "loss": 0.1661, + "num_input_tokens_seen": 124705792, + "step": 57735 + }, + { + "epoch": 9.419249592169658, + "grad_norm": 0.09412268549203873, + "learning_rate": 0.0006362168311563773, + "loss": 0.0496, + "num_input_tokens_seen": 124716704, + "step": 57740 + }, + { + "epoch": 9.420065252854812, + "grad_norm": 0.05867978557944298, + "learning_rate": 0.00063614834236116, + "loss": 0.0418, + "num_input_tokens_seen": 124727936, + "step": 57745 + }, + { + "epoch": 9.420880913539968, + "grad_norm": 0.32948651909828186, + "learning_rate": 0.000636079850806724, + "loss": 0.0881, + "num_input_tokens_seen": 124738112, + "step": 57750 + }, + { + "epoch": 9.421696574225122, + "grad_norm": 0.007615845184773207, + "learning_rate": 0.0006360113564944571, + "loss": 0.1154, + "num_input_tokens_seen": 124748192, + "step": 57755 + }, + { + "epoch": 9.422512234910277, + "grad_norm": 0.02363566681742668, + "learning_rate": 0.0006359428594257476, + "loss": 0.0052, + "num_input_tokens_seen": 124758592, + "step": 57760 + }, + { + "epoch": 9.423327895595433, + "grad_norm": 0.33245760202407837, + "learning_rate": 0.0006358743596019836, + "loss": 0.1601, + "num_input_tokens_seen": 124768128, + "step": 57765 + }, + { + "epoch": 9.424143556280587, + "grad_norm": 0.2626391053199768, + "learning_rate": 0.0006358058570245532, + "loss": 0.1659, + "num_input_tokens_seen": 124777984, + "step": 57770 + }, + { + "epoch": 9.424959216965743, + "grad_norm": 0.004645919892936945, + "learning_rate": 0.0006357373516948451, + "loss": 0.0493, + "num_input_tokens_seen": 124788960, + "step": 57775 + }, + { + "epoch": 9.425774877650896, + "grad_norm": 0.045954253524541855, + "learning_rate": 0.0006356688436142471, + "loss": 0.0464, + "num_input_tokens_seen": 124799968, + "step": 57780 + }, + { + "epoch": 9.426590538336052, + "grad_norm": 0.0628044605255127, + "learning_rate": 0.000635600332784148, + "loss": 0.0164, + "num_input_tokens_seen": 124809888, + "step": 57785 + }, + { + "epoch": 9.427406199021208, + "grad_norm": 0.003180962521582842, + "learning_rate": 0.0006355318192059361, + "loss": 0.0241, + "num_input_tokens_seen": 124820544, + "step": 57790 + }, + { + "epoch": 9.428221859706362, + "grad_norm": 0.005581281613558531, + "learning_rate": 0.0006354633028809999, + "loss": 0.0133, + "num_input_tokens_seen": 124830752, + "step": 57795 + }, + { + "epoch": 9.429037520391518, + "grad_norm": 0.014229382388293743, + "learning_rate": 0.000635394783810728, + "loss": 0.0255, + "num_input_tokens_seen": 124842016, + "step": 57800 + }, + { + "epoch": 9.429853181076671, + "grad_norm": 0.01770959608256817, + "learning_rate": 0.0006353262619965091, + "loss": 0.0568, + "num_input_tokens_seen": 124854208, + "step": 57805 + }, + { + "epoch": 9.430668841761827, + "grad_norm": 0.003146283095702529, + "learning_rate": 0.000635257737439732, + "loss": 0.0476, + "num_input_tokens_seen": 124865600, + "step": 57810 + }, + { + "epoch": 9.431484502446983, + "grad_norm": 0.010914292186498642, + "learning_rate": 0.0006351892101417849, + "loss": 0.0275, + "num_input_tokens_seen": 124876736, + "step": 57815 + }, + { + "epoch": 9.432300163132137, + "grad_norm": 0.03408697247505188, + "learning_rate": 0.0006351206801040571, + "loss": 0.112, + "num_input_tokens_seen": 124887456, + "step": 57820 + }, + { + "epoch": 9.433115823817293, + "grad_norm": 0.01934865489602089, + "learning_rate": 0.0006350521473279374, + "loss": 0.1592, + "num_input_tokens_seen": 124899072, + "step": 57825 + }, + { + "epoch": 9.433931484502446, + "grad_norm": 0.12198976427316666, + "learning_rate": 0.0006349836118148146, + "loss": 0.0581, + "num_input_tokens_seen": 124910048, + "step": 57830 + }, + { + "epoch": 9.434747145187602, + "grad_norm": 0.0035420190542936325, + "learning_rate": 0.0006349150735660776, + "loss": 0.0177, + "num_input_tokens_seen": 124920736, + "step": 57835 + }, + { + "epoch": 9.435562805872756, + "grad_norm": 0.006238948553800583, + "learning_rate": 0.0006348465325831155, + "loss": 0.012, + "num_input_tokens_seen": 124931872, + "step": 57840 + }, + { + "epoch": 9.436378466557912, + "grad_norm": 0.15444837510585785, + "learning_rate": 0.0006347779888673175, + "loss": 0.0754, + "num_input_tokens_seen": 124941728, + "step": 57845 + }, + { + "epoch": 9.437194127243067, + "grad_norm": 0.022020984441041946, + "learning_rate": 0.0006347094424200724, + "loss": 0.0198, + "num_input_tokens_seen": 124953120, + "step": 57850 + }, + { + "epoch": 9.438009787928221, + "grad_norm": 0.017220299690961838, + "learning_rate": 0.0006346408932427696, + "loss": 0.0385, + "num_input_tokens_seen": 124963968, + "step": 57855 + }, + { + "epoch": 9.438825448613377, + "grad_norm": 0.13935768604278564, + "learning_rate": 0.0006345723413367983, + "loss": 0.0712, + "num_input_tokens_seen": 124975136, + "step": 57860 + }, + { + "epoch": 9.439641109298531, + "grad_norm": 0.225833460688591, + "learning_rate": 0.0006345037867035478, + "loss": 0.0764, + "num_input_tokens_seen": 124986240, + "step": 57865 + }, + { + "epoch": 9.440456769983687, + "grad_norm": 0.005525338929146528, + "learning_rate": 0.0006344352293444073, + "loss": 0.0289, + "num_input_tokens_seen": 124996960, + "step": 57870 + }, + { + "epoch": 9.441272430668842, + "grad_norm": 0.04588594287633896, + "learning_rate": 0.0006343666692607665, + "loss": 0.1242, + "num_input_tokens_seen": 125007584, + "step": 57875 + }, + { + "epoch": 9.442088091353996, + "grad_norm": 0.2472897619009018, + "learning_rate": 0.0006342981064540145, + "loss": 0.0699, + "num_input_tokens_seen": 125018880, + "step": 57880 + }, + { + "epoch": 9.442903752039152, + "grad_norm": 0.02394697442650795, + "learning_rate": 0.0006342295409255412, + "loss": 0.1556, + "num_input_tokens_seen": 125029312, + "step": 57885 + }, + { + "epoch": 9.443719412724306, + "grad_norm": 0.006148052867501974, + "learning_rate": 0.000634160972676736, + "loss": 0.0163, + "num_input_tokens_seen": 125040384, + "step": 57890 + }, + { + "epoch": 9.444535073409462, + "grad_norm": 0.08018513023853302, + "learning_rate": 0.0006340924017089884, + "loss": 0.0276, + "num_input_tokens_seen": 125051872, + "step": 57895 + }, + { + "epoch": 9.445350734094617, + "grad_norm": 0.369053453207016, + "learning_rate": 0.0006340238280236882, + "loss": 0.1174, + "num_input_tokens_seen": 125063264, + "step": 57900 + }, + { + "epoch": 9.446166394779771, + "grad_norm": 0.04056015610694885, + "learning_rate": 0.0006339552516222251, + "loss": 0.0163, + "num_input_tokens_seen": 125073696, + "step": 57905 + }, + { + "epoch": 9.446982055464927, + "grad_norm": 0.010821258649230003, + "learning_rate": 0.0006338866725059889, + "loss": 0.1023, + "num_input_tokens_seen": 125084896, + "step": 57910 + }, + { + "epoch": 9.447797716150081, + "grad_norm": 0.003268955973908305, + "learning_rate": 0.0006338180906763693, + "loss": 0.0413, + "num_input_tokens_seen": 125096224, + "step": 57915 + }, + { + "epoch": 9.448613376835237, + "grad_norm": 0.055417194962501526, + "learning_rate": 0.0006337495061347565, + "loss": 0.018, + "num_input_tokens_seen": 125106624, + "step": 57920 + }, + { + "epoch": 9.449429037520392, + "grad_norm": 0.22811178863048553, + "learning_rate": 0.0006336809188825401, + "loss": 0.0576, + "num_input_tokens_seen": 125116288, + "step": 57925 + }, + { + "epoch": 9.450244698205546, + "grad_norm": 0.025044074282050133, + "learning_rate": 0.0006336123289211104, + "loss": 0.0189, + "num_input_tokens_seen": 125125056, + "step": 57930 + }, + { + "epoch": 9.451060358890702, + "grad_norm": 0.04807139188051224, + "learning_rate": 0.0006335437362518574, + "loss": 0.1698, + "num_input_tokens_seen": 125136192, + "step": 57935 + }, + { + "epoch": 9.451876019575856, + "grad_norm": 0.01002733688801527, + "learning_rate": 0.0006334751408761712, + "loss": 0.0116, + "num_input_tokens_seen": 125147264, + "step": 57940 + }, + { + "epoch": 9.452691680261012, + "grad_norm": 0.02477033995091915, + "learning_rate": 0.0006334065427954418, + "loss": 0.1792, + "num_input_tokens_seen": 125158816, + "step": 57945 + }, + { + "epoch": 9.453507340946166, + "grad_norm": 0.20859457552433014, + "learning_rate": 0.0006333379420110597, + "loss": 0.0511, + "num_input_tokens_seen": 125168864, + "step": 57950 + }, + { + "epoch": 9.454323001631321, + "grad_norm": 0.12974859774112701, + "learning_rate": 0.000633269338524415, + "loss": 0.0842, + "num_input_tokens_seen": 125179968, + "step": 57955 + }, + { + "epoch": 9.455138662316477, + "grad_norm": 0.171275794506073, + "learning_rate": 0.0006332007323368983, + "loss": 0.1068, + "num_input_tokens_seen": 125191424, + "step": 57960 + }, + { + "epoch": 9.455954323001631, + "grad_norm": 0.2113136202096939, + "learning_rate": 0.0006331321234498995, + "loss": 0.0314, + "num_input_tokens_seen": 125200480, + "step": 57965 + }, + { + "epoch": 9.456769983686787, + "grad_norm": 0.008856832049787045, + "learning_rate": 0.0006330635118648093, + "loss": 0.0104, + "num_input_tokens_seen": 125212768, + "step": 57970 + }, + { + "epoch": 9.45758564437194, + "grad_norm": 0.22293557226657867, + "learning_rate": 0.0006329948975830184, + "loss": 0.0636, + "num_input_tokens_seen": 125223712, + "step": 57975 + }, + { + "epoch": 9.458401305057096, + "grad_norm": 0.20843131840229034, + "learning_rate": 0.0006329262806059173, + "loss": 0.054, + "num_input_tokens_seen": 125236224, + "step": 57980 + }, + { + "epoch": 9.459216965742252, + "grad_norm": 0.04087536782026291, + "learning_rate": 0.0006328576609348962, + "loss": 0.0079, + "num_input_tokens_seen": 125247296, + "step": 57985 + }, + { + "epoch": 9.460032626427406, + "grad_norm": 0.002280786167830229, + "learning_rate": 0.0006327890385713462, + "loss": 0.0996, + "num_input_tokens_seen": 125257632, + "step": 57990 + }, + { + "epoch": 9.460848287112562, + "grad_norm": 0.3266606330871582, + "learning_rate": 0.000632720413516658, + "loss": 0.0486, + "num_input_tokens_seen": 125268416, + "step": 57995 + }, + { + "epoch": 9.461663947797716, + "grad_norm": 0.02909723110496998, + "learning_rate": 0.000632651785772222, + "loss": 0.0268, + "num_input_tokens_seen": 125280064, + "step": 58000 + }, + { + "epoch": 9.462479608482871, + "grad_norm": 0.002902657026425004, + "learning_rate": 0.0006325831553394294, + "loss": 0.005, + "num_input_tokens_seen": 125291040, + "step": 58005 + }, + { + "epoch": 9.463295269168025, + "grad_norm": 0.011866576969623566, + "learning_rate": 0.000632514522219671, + "loss": 0.0618, + "num_input_tokens_seen": 125302400, + "step": 58010 + }, + { + "epoch": 9.464110929853181, + "grad_norm": 0.0071860142052173615, + "learning_rate": 0.0006324458864143377, + "loss": 0.0143, + "num_input_tokens_seen": 125314112, + "step": 58015 + }, + { + "epoch": 9.464926590538337, + "grad_norm": 0.005821306258440018, + "learning_rate": 0.0006323772479248204, + "loss": 0.059, + "num_input_tokens_seen": 125325088, + "step": 58020 + }, + { + "epoch": 9.46574225122349, + "grad_norm": 0.30362415313720703, + "learning_rate": 0.0006323086067525103, + "loss": 0.1722, + "num_input_tokens_seen": 125336256, + "step": 58025 + }, + { + "epoch": 9.466557911908646, + "grad_norm": 0.00525606470182538, + "learning_rate": 0.0006322399628987984, + "loss": 0.0639, + "num_input_tokens_seen": 125346848, + "step": 58030 + }, + { + "epoch": 9.4673735725938, + "grad_norm": 0.23222877085208893, + "learning_rate": 0.000632171316365076, + "loss": 0.052, + "num_input_tokens_seen": 125356544, + "step": 58035 + }, + { + "epoch": 9.468189233278956, + "grad_norm": 0.025633899495005608, + "learning_rate": 0.000632102667152734, + "loss": 0.014, + "num_input_tokens_seen": 125367200, + "step": 58040 + }, + { + "epoch": 9.469004893964112, + "grad_norm": 0.014735725708305836, + "learning_rate": 0.000632034015263164, + "loss": 0.012, + "num_input_tokens_seen": 125378688, + "step": 58045 + }, + { + "epoch": 9.469820554649266, + "grad_norm": 0.01710418239235878, + "learning_rate": 0.0006319653606977571, + "loss": 0.0051, + "num_input_tokens_seen": 125389664, + "step": 58050 + }, + { + "epoch": 9.470636215334421, + "grad_norm": 0.2504916191101074, + "learning_rate": 0.0006318967034579048, + "loss": 0.0984, + "num_input_tokens_seen": 125400096, + "step": 58055 + }, + { + "epoch": 9.471451876019575, + "grad_norm": 0.0372481495141983, + "learning_rate": 0.0006318280435449985, + "loss": 0.0286, + "num_input_tokens_seen": 125410656, + "step": 58060 + }, + { + "epoch": 9.47226753670473, + "grad_norm": 0.28393882513046265, + "learning_rate": 0.0006317593809604298, + "loss": 0.1305, + "num_input_tokens_seen": 125421152, + "step": 58065 + }, + { + "epoch": 9.473083197389887, + "grad_norm": 0.029981283470988274, + "learning_rate": 0.00063169071570559, + "loss": 0.0188, + "num_input_tokens_seen": 125432224, + "step": 58070 + }, + { + "epoch": 9.47389885807504, + "grad_norm": 0.17756913602352142, + "learning_rate": 0.0006316220477818707, + "loss": 0.0405, + "num_input_tokens_seen": 125442464, + "step": 58075 + }, + { + "epoch": 9.474714518760196, + "grad_norm": 0.2382059395313263, + "learning_rate": 0.0006315533771906638, + "loss": 0.0382, + "num_input_tokens_seen": 125454080, + "step": 58080 + }, + { + "epoch": 9.47553017944535, + "grad_norm": 0.010461671277880669, + "learning_rate": 0.0006314847039333607, + "loss": 0.0242, + "num_input_tokens_seen": 125464224, + "step": 58085 + }, + { + "epoch": 9.476345840130506, + "grad_norm": 0.42559152841567993, + "learning_rate": 0.0006314160280113532, + "loss": 0.1037, + "num_input_tokens_seen": 125475104, + "step": 58090 + }, + { + "epoch": 9.477161500815662, + "grad_norm": 0.37911534309387207, + "learning_rate": 0.0006313473494260333, + "loss": 0.0369, + "num_input_tokens_seen": 125484832, + "step": 58095 + }, + { + "epoch": 9.477977161500815, + "grad_norm": 0.00439072959125042, + "learning_rate": 0.0006312786681787928, + "loss": 0.0161, + "num_input_tokens_seen": 125496800, + "step": 58100 + }, + { + "epoch": 9.478792822185971, + "grad_norm": 0.25645124912261963, + "learning_rate": 0.0006312099842710234, + "loss": 0.0362, + "num_input_tokens_seen": 125507328, + "step": 58105 + }, + { + "epoch": 9.479608482871125, + "grad_norm": 0.006362576503306627, + "learning_rate": 0.0006311412977041172, + "loss": 0.1536, + "num_input_tokens_seen": 125517344, + "step": 58110 + }, + { + "epoch": 9.48042414355628, + "grad_norm": 0.6152973175048828, + "learning_rate": 0.0006310726084794663, + "loss": 0.0942, + "num_input_tokens_seen": 125528352, + "step": 58115 + }, + { + "epoch": 9.481239804241435, + "grad_norm": 0.10060965269804001, + "learning_rate": 0.0006310039165984628, + "loss": 0.0544, + "num_input_tokens_seen": 125539328, + "step": 58120 + }, + { + "epoch": 9.48205546492659, + "grad_norm": 0.3432973623275757, + "learning_rate": 0.0006309352220624986, + "loss": 0.0893, + "num_input_tokens_seen": 125550080, + "step": 58125 + }, + { + "epoch": 9.482871125611746, + "grad_norm": 0.22590389847755432, + "learning_rate": 0.0006308665248729662, + "loss": 0.1392, + "num_input_tokens_seen": 125559648, + "step": 58130 + }, + { + "epoch": 9.4836867862969, + "grad_norm": 0.04668281227350235, + "learning_rate": 0.0006307978250312574, + "loss": 0.1853, + "num_input_tokens_seen": 125570432, + "step": 58135 + }, + { + "epoch": 9.484502446982056, + "grad_norm": 0.3730715811252594, + "learning_rate": 0.0006307291225387648, + "loss": 0.2305, + "num_input_tokens_seen": 125580096, + "step": 58140 + }, + { + "epoch": 9.48531810766721, + "grad_norm": 0.1770205795764923, + "learning_rate": 0.0006306604173968808, + "loss": 0.068, + "num_input_tokens_seen": 125591008, + "step": 58145 + }, + { + "epoch": 9.486133768352365, + "grad_norm": 0.20119500160217285, + "learning_rate": 0.0006305917096069977, + "loss": 0.019, + "num_input_tokens_seen": 125601248, + "step": 58150 + }, + { + "epoch": 9.486949429037521, + "grad_norm": 0.32379111647605896, + "learning_rate": 0.000630522999170508, + "loss": 0.0727, + "num_input_tokens_seen": 125610944, + "step": 58155 + }, + { + "epoch": 9.487765089722675, + "grad_norm": 0.0593860000371933, + "learning_rate": 0.0006304542860888039, + "loss": 0.036, + "num_input_tokens_seen": 125622336, + "step": 58160 + }, + { + "epoch": 9.48858075040783, + "grad_norm": 0.022446129471063614, + "learning_rate": 0.0006303855703632783, + "loss": 0.0938, + "num_input_tokens_seen": 125633600, + "step": 58165 + }, + { + "epoch": 9.489396411092985, + "grad_norm": 0.002450139494612813, + "learning_rate": 0.0006303168519953238, + "loss": 0.1368, + "num_input_tokens_seen": 125645472, + "step": 58170 + }, + { + "epoch": 9.49021207177814, + "grad_norm": 0.021858789026737213, + "learning_rate": 0.0006302481309863329, + "loss": 0.0204, + "num_input_tokens_seen": 125656416, + "step": 58175 + }, + { + "epoch": 9.491027732463296, + "grad_norm": 0.01752588339149952, + "learning_rate": 0.0006301794073376985, + "loss": 0.1022, + "num_input_tokens_seen": 125667680, + "step": 58180 + }, + { + "epoch": 9.49184339314845, + "grad_norm": 0.019962133839726448, + "learning_rate": 0.0006301106810508131, + "loss": 0.0627, + "num_input_tokens_seen": 125678336, + "step": 58185 + }, + { + "epoch": 9.492659053833606, + "grad_norm": 0.06252746284008026, + "learning_rate": 0.0006300419521270697, + "loss": 0.0113, + "num_input_tokens_seen": 125688768, + "step": 58190 + }, + { + "epoch": 9.49347471451876, + "grad_norm": 0.22887328267097473, + "learning_rate": 0.0006299732205678613, + "loss": 0.0288, + "num_input_tokens_seen": 125699936, + "step": 58195 + }, + { + "epoch": 9.494290375203915, + "grad_norm": 0.0024231132119894028, + "learning_rate": 0.0006299044863745806, + "loss": 0.0118, + "num_input_tokens_seen": 125711136, + "step": 58200 + }, + { + "epoch": 9.49510603588907, + "grad_norm": 0.003173418343067169, + "learning_rate": 0.0006298357495486208, + "loss": 0.0094, + "num_input_tokens_seen": 125722848, + "step": 58205 + }, + { + "epoch": 9.495921696574225, + "grad_norm": 0.003356012748554349, + "learning_rate": 0.0006297670100913748, + "loss": 0.2534, + "num_input_tokens_seen": 125734464, + "step": 58210 + }, + { + "epoch": 9.49673735725938, + "grad_norm": 0.06293818354606628, + "learning_rate": 0.0006296982680042357, + "loss": 0.0294, + "num_input_tokens_seen": 125745632, + "step": 58215 + }, + { + "epoch": 9.497553017944535, + "grad_norm": 0.015413629822432995, + "learning_rate": 0.0006296295232885966, + "loss": 0.0083, + "num_input_tokens_seen": 125757856, + "step": 58220 + }, + { + "epoch": 9.49836867862969, + "grad_norm": 0.011318295262753963, + "learning_rate": 0.0006295607759458508, + "loss": 0.1357, + "num_input_tokens_seen": 125769248, + "step": 58225 + }, + { + "epoch": 9.499184339314844, + "grad_norm": 0.14822854101657867, + "learning_rate": 0.0006294920259773915, + "loss": 0.016, + "num_input_tokens_seen": 125780480, + "step": 58230 + }, + { + "epoch": 9.5, + "grad_norm": 0.37363535165786743, + "learning_rate": 0.0006294232733846121, + "loss": 0.1286, + "num_input_tokens_seen": 125791040, + "step": 58235 + }, + { + "epoch": 9.500815660685156, + "grad_norm": 0.07917729765176773, + "learning_rate": 0.0006293545181689057, + "loss": 0.1329, + "num_input_tokens_seen": 125801696, + "step": 58240 + }, + { + "epoch": 9.50163132137031, + "grad_norm": 0.06792290508747101, + "learning_rate": 0.000629285760331666, + "loss": 0.0105, + "num_input_tokens_seen": 125810816, + "step": 58245 + }, + { + "epoch": 9.502446982055465, + "grad_norm": 0.007692431099712849, + "learning_rate": 0.0006292169998742865, + "loss": 0.0984, + "num_input_tokens_seen": 125822272, + "step": 58250 + }, + { + "epoch": 9.50326264274062, + "grad_norm": 0.008680978789925575, + "learning_rate": 0.0006291482367981605, + "loss": 0.0546, + "num_input_tokens_seen": 125831808, + "step": 58255 + }, + { + "epoch": 9.504078303425775, + "grad_norm": 0.013723728246986866, + "learning_rate": 0.0006290794711046816, + "loss": 0.0097, + "num_input_tokens_seen": 125843360, + "step": 58260 + }, + { + "epoch": 9.50489396411093, + "grad_norm": 0.06591471284627914, + "learning_rate": 0.0006290107027952434, + "loss": 0.0117, + "num_input_tokens_seen": 125853408, + "step": 58265 + }, + { + "epoch": 9.505709624796085, + "grad_norm": 0.23483747243881226, + "learning_rate": 0.0006289419318712397, + "loss": 0.1553, + "num_input_tokens_seen": 125865152, + "step": 58270 + }, + { + "epoch": 9.50652528548124, + "grad_norm": 0.0060275401920080185, + "learning_rate": 0.0006288731583340642, + "loss": 0.0189, + "num_input_tokens_seen": 125875968, + "step": 58275 + }, + { + "epoch": 9.507340946166394, + "grad_norm": 0.012370044365525246, + "learning_rate": 0.0006288043821851107, + "loss": 0.0119, + "num_input_tokens_seen": 125887136, + "step": 58280 + }, + { + "epoch": 9.50815660685155, + "grad_norm": 0.005109517835080624, + "learning_rate": 0.000628735603425773, + "loss": 0.0329, + "num_input_tokens_seen": 125897792, + "step": 58285 + }, + { + "epoch": 9.508972267536706, + "grad_norm": 0.005160059779882431, + "learning_rate": 0.0006286668220574448, + "loss": 0.1185, + "num_input_tokens_seen": 125907360, + "step": 58290 + }, + { + "epoch": 9.50978792822186, + "grad_norm": 0.014347260817885399, + "learning_rate": 0.0006285980380815204, + "loss": 0.0251, + "num_input_tokens_seen": 125918240, + "step": 58295 + }, + { + "epoch": 9.510603588907015, + "grad_norm": 0.2521634101867676, + "learning_rate": 0.0006285292514993936, + "loss": 0.1995, + "num_input_tokens_seen": 125928928, + "step": 58300 + }, + { + "epoch": 9.51141924959217, + "grad_norm": 0.037277307361364365, + "learning_rate": 0.0006284604623124585, + "loss": 0.1115, + "num_input_tokens_seen": 125940096, + "step": 58305 + }, + { + "epoch": 9.512234910277325, + "grad_norm": 0.047494374215602875, + "learning_rate": 0.0006283916705221091, + "loss": 0.0453, + "num_input_tokens_seen": 125950880, + "step": 58310 + }, + { + "epoch": 9.513050570962479, + "grad_norm": 0.011362025514245033, + "learning_rate": 0.0006283228761297396, + "loss": 0.014, + "num_input_tokens_seen": 125962144, + "step": 58315 + }, + { + "epoch": 9.513866231647635, + "grad_norm": 0.004715372808277607, + "learning_rate": 0.0006282540791367442, + "loss": 0.0192, + "num_input_tokens_seen": 125973152, + "step": 58320 + }, + { + "epoch": 9.51468189233279, + "grad_norm": 0.05988180637359619, + "learning_rate": 0.0006281852795445173, + "loss": 0.1561, + "num_input_tokens_seen": 125984128, + "step": 58325 + }, + { + "epoch": 9.515497553017944, + "grad_norm": 0.22990791499614716, + "learning_rate": 0.000628116477354453, + "loss": 0.0564, + "num_input_tokens_seen": 125994304, + "step": 58330 + }, + { + "epoch": 9.5163132137031, + "grad_norm": 0.01864621788263321, + "learning_rate": 0.0006280476725679457, + "loss": 0.0084, + "num_input_tokens_seen": 126005472, + "step": 58335 + }, + { + "epoch": 9.517128874388254, + "grad_norm": 0.040201228111982346, + "learning_rate": 0.00062797886518639, + "loss": 0.0595, + "num_input_tokens_seen": 126015872, + "step": 58340 + }, + { + "epoch": 9.51794453507341, + "grad_norm": 0.13366523385047913, + "learning_rate": 0.0006279100552111803, + "loss": 0.0392, + "num_input_tokens_seen": 126026528, + "step": 58345 + }, + { + "epoch": 9.518760195758565, + "grad_norm": 0.19599227607250214, + "learning_rate": 0.0006278412426437109, + "loss": 0.0216, + "num_input_tokens_seen": 126037952, + "step": 58350 + }, + { + "epoch": 9.51957585644372, + "grad_norm": 0.0015695245238021016, + "learning_rate": 0.0006277724274853767, + "loss": 0.1863, + "num_input_tokens_seen": 126049376, + "step": 58355 + }, + { + "epoch": 9.520391517128875, + "grad_norm": 1.7492622137069702, + "learning_rate": 0.0006277036097375719, + "loss": 0.1198, + "num_input_tokens_seen": 126060160, + "step": 58360 + }, + { + "epoch": 9.521207177814029, + "grad_norm": 0.008433963172137737, + "learning_rate": 0.0006276347894016917, + "loss": 0.0316, + "num_input_tokens_seen": 126071744, + "step": 58365 + }, + { + "epoch": 9.522022838499185, + "grad_norm": 0.007258435245603323, + "learning_rate": 0.0006275659664791304, + "loss": 0.0119, + "num_input_tokens_seen": 126083744, + "step": 58370 + }, + { + "epoch": 9.522838499184338, + "grad_norm": 0.012848546728491783, + "learning_rate": 0.0006274971409712831, + "loss": 0.0252, + "num_input_tokens_seen": 126094016, + "step": 58375 + }, + { + "epoch": 9.523654159869494, + "grad_norm": 0.39458566904067993, + "learning_rate": 0.0006274283128795445, + "loss": 0.2455, + "num_input_tokens_seen": 126103808, + "step": 58380 + }, + { + "epoch": 9.52446982055465, + "grad_norm": 0.03209485858678818, + "learning_rate": 0.0006273594822053095, + "loss": 0.1695, + "num_input_tokens_seen": 126113120, + "step": 58385 + }, + { + "epoch": 9.525285481239804, + "grad_norm": 0.011099644005298615, + "learning_rate": 0.000627290648949973, + "loss": 0.0235, + "num_input_tokens_seen": 126123360, + "step": 58390 + }, + { + "epoch": 9.52610114192496, + "grad_norm": 0.010915805585682392, + "learning_rate": 0.00062722181311493, + "loss": 0.0657, + "num_input_tokens_seen": 126134240, + "step": 58395 + }, + { + "epoch": 9.526916802610113, + "grad_norm": 0.0570676252245903, + "learning_rate": 0.0006271529747015755, + "loss": 0.0222, + "num_input_tokens_seen": 126145952, + "step": 58400 + }, + { + "epoch": 9.52773246329527, + "grad_norm": 0.19531919062137604, + "learning_rate": 0.0006270841337113047, + "loss": 0.2215, + "num_input_tokens_seen": 126157024, + "step": 58405 + }, + { + "epoch": 9.528548123980425, + "grad_norm": 0.10229338705539703, + "learning_rate": 0.0006270152901455128, + "loss": 0.0303, + "num_input_tokens_seen": 126167648, + "step": 58410 + }, + { + "epoch": 9.529363784665579, + "grad_norm": 0.009482331573963165, + "learning_rate": 0.0006269464440055948, + "loss": 0.1183, + "num_input_tokens_seen": 126179424, + "step": 58415 + }, + { + "epoch": 9.530179445350734, + "grad_norm": 0.055833905935287476, + "learning_rate": 0.0006268775952929462, + "loss": 0.0924, + "num_input_tokens_seen": 126190048, + "step": 58420 + }, + { + "epoch": 9.530995106035888, + "grad_norm": 0.0862567201256752, + "learning_rate": 0.000626808744008962, + "loss": 0.0198, + "num_input_tokens_seen": 126199840, + "step": 58425 + }, + { + "epoch": 9.531810766721044, + "grad_norm": 0.01423166785389185, + "learning_rate": 0.0006267398901550379, + "loss": 0.0231, + "num_input_tokens_seen": 126211904, + "step": 58430 + }, + { + "epoch": 9.5326264274062, + "grad_norm": 0.22748292982578278, + "learning_rate": 0.000626671033732569, + "loss": 0.0462, + "num_input_tokens_seen": 126224160, + "step": 58435 + }, + { + "epoch": 9.533442088091354, + "grad_norm": 0.03178076446056366, + "learning_rate": 0.0006266021747429511, + "loss": 0.0361, + "num_input_tokens_seen": 126234432, + "step": 58440 + }, + { + "epoch": 9.53425774877651, + "grad_norm": 0.00904083251953125, + "learning_rate": 0.0006265333131875794, + "loss": 0.0628, + "num_input_tokens_seen": 126246176, + "step": 58445 + }, + { + "epoch": 9.535073409461663, + "grad_norm": 0.0102092195302248, + "learning_rate": 0.0006264644490678496, + "loss": 0.0638, + "num_input_tokens_seen": 126256992, + "step": 58450 + }, + { + "epoch": 9.535889070146819, + "grad_norm": 0.2707527279853821, + "learning_rate": 0.0006263955823851571, + "loss": 0.2005, + "num_input_tokens_seen": 126267968, + "step": 58455 + }, + { + "epoch": 9.536704730831975, + "grad_norm": 0.1802579015493393, + "learning_rate": 0.0006263267131408981, + "loss": 0.0418, + "num_input_tokens_seen": 126278976, + "step": 58460 + }, + { + "epoch": 9.537520391517129, + "grad_norm": 0.007812032010406256, + "learning_rate": 0.0006262578413364679, + "loss": 0.0185, + "num_input_tokens_seen": 126290464, + "step": 58465 + }, + { + "epoch": 9.538336052202284, + "grad_norm": 0.03092452511191368, + "learning_rate": 0.0006261889669732624, + "loss": 0.0897, + "num_input_tokens_seen": 126301888, + "step": 58470 + }, + { + "epoch": 9.539151712887438, + "grad_norm": 0.04860683158040047, + "learning_rate": 0.0006261200900526773, + "loss": 0.0176, + "num_input_tokens_seen": 126312736, + "step": 58475 + }, + { + "epoch": 9.539967373572594, + "grad_norm": 0.02373124659061432, + "learning_rate": 0.0006260512105761086, + "loss": 0.0823, + "num_input_tokens_seen": 126323840, + "step": 58480 + }, + { + "epoch": 9.540783034257748, + "grad_norm": 0.2580413520336151, + "learning_rate": 0.0006259823285449523, + "loss": 0.0238, + "num_input_tokens_seen": 126335360, + "step": 58485 + }, + { + "epoch": 9.541598694942904, + "grad_norm": 0.008553222753107548, + "learning_rate": 0.0006259134439606043, + "loss": 0.0223, + "num_input_tokens_seen": 126345056, + "step": 58490 + }, + { + "epoch": 9.54241435562806, + "grad_norm": 0.3394491970539093, + "learning_rate": 0.0006258445568244605, + "loss": 0.1001, + "num_input_tokens_seen": 126355168, + "step": 58495 + }, + { + "epoch": 9.543230016313213, + "grad_norm": 0.007404628675431013, + "learning_rate": 0.0006257756671379172, + "loss": 0.0125, + "num_input_tokens_seen": 126365600, + "step": 58500 + }, + { + "epoch": 9.544045676998369, + "grad_norm": 0.01686800643801689, + "learning_rate": 0.0006257067749023704, + "loss": 0.0698, + "num_input_tokens_seen": 126375008, + "step": 58505 + }, + { + "epoch": 9.544861337683523, + "grad_norm": 0.31130531430244446, + "learning_rate": 0.0006256378801192163, + "loss": 0.046, + "num_input_tokens_seen": 126386048, + "step": 58510 + }, + { + "epoch": 9.545676998368679, + "grad_norm": 0.009447862394154072, + "learning_rate": 0.0006255689827898512, + "loss": 0.1425, + "num_input_tokens_seen": 126396896, + "step": 58515 + }, + { + "epoch": 9.546492659053834, + "grad_norm": 0.07970761507749557, + "learning_rate": 0.0006255000829156714, + "loss": 0.029, + "num_input_tokens_seen": 126408480, + "step": 58520 + }, + { + "epoch": 9.547308319738988, + "grad_norm": 0.19980250298976898, + "learning_rate": 0.0006254311804980733, + "loss": 0.0472, + "num_input_tokens_seen": 126419552, + "step": 58525 + }, + { + "epoch": 9.548123980424144, + "grad_norm": 0.03398562967777252, + "learning_rate": 0.0006253622755384531, + "loss": 0.0324, + "num_input_tokens_seen": 126430240, + "step": 58530 + }, + { + "epoch": 9.548939641109298, + "grad_norm": 0.18277190625667572, + "learning_rate": 0.0006252933680382074, + "loss": 0.0262, + "num_input_tokens_seen": 126440736, + "step": 58535 + }, + { + "epoch": 9.549755301794454, + "grad_norm": 0.020722072571516037, + "learning_rate": 0.0006252244579987327, + "loss": 0.0369, + "num_input_tokens_seen": 126450944, + "step": 58540 + }, + { + "epoch": 9.550570962479608, + "grad_norm": 0.13694825768470764, + "learning_rate": 0.0006251555454214254, + "loss": 0.0874, + "num_input_tokens_seen": 126462112, + "step": 58545 + }, + { + "epoch": 9.551386623164763, + "grad_norm": 0.018573446199297905, + "learning_rate": 0.0006250866303076822, + "loss": 0.191, + "num_input_tokens_seen": 126472384, + "step": 58550 + }, + { + "epoch": 9.552202283849919, + "grad_norm": 0.052989520132541656, + "learning_rate": 0.0006250177126588998, + "loss": 0.0962, + "num_input_tokens_seen": 126483328, + "step": 58555 + }, + { + "epoch": 9.553017944535073, + "grad_norm": 0.3053835332393646, + "learning_rate": 0.0006249487924764747, + "loss": 0.0618, + "num_input_tokens_seen": 126494912, + "step": 58560 + }, + { + "epoch": 9.553833605220229, + "grad_norm": 0.032893870025873184, + "learning_rate": 0.000624879869761804, + "loss": 0.0106, + "num_input_tokens_seen": 126506944, + "step": 58565 + }, + { + "epoch": 9.554649265905383, + "grad_norm": 0.030350077897310257, + "learning_rate": 0.0006248109445162843, + "loss": 0.0809, + "num_input_tokens_seen": 126516160, + "step": 58570 + }, + { + "epoch": 9.555464926590538, + "grad_norm": 0.25913652777671814, + "learning_rate": 0.0006247420167413124, + "loss": 0.3567, + "num_input_tokens_seen": 126526144, + "step": 58575 + }, + { + "epoch": 9.556280587275694, + "grad_norm": 0.016176484525203705, + "learning_rate": 0.0006246730864382853, + "loss": 0.0134, + "num_input_tokens_seen": 126536608, + "step": 58580 + }, + { + "epoch": 9.557096247960848, + "grad_norm": 0.012450575828552246, + "learning_rate": 0.0006246041536086, + "loss": 0.1263, + "num_input_tokens_seen": 126547776, + "step": 58585 + }, + { + "epoch": 9.557911908646004, + "grad_norm": 0.24023890495300293, + "learning_rate": 0.0006245352182536535, + "loss": 0.0604, + "num_input_tokens_seen": 126558944, + "step": 58590 + }, + { + "epoch": 9.558727569331158, + "grad_norm": 0.03461911529302597, + "learning_rate": 0.0006244662803748427, + "loss": 0.1038, + "num_input_tokens_seen": 126569472, + "step": 58595 + }, + { + "epoch": 9.559543230016313, + "grad_norm": 0.06925342977046967, + "learning_rate": 0.0006243973399735649, + "loss": 0.0175, + "num_input_tokens_seen": 126580160, + "step": 58600 + }, + { + "epoch": 9.560358890701469, + "grad_norm": 0.024349384009838104, + "learning_rate": 0.0006243283970512172, + "loss": 0.1112, + "num_input_tokens_seen": 126590912, + "step": 58605 + }, + { + "epoch": 9.561174551386623, + "grad_norm": 0.01365981251001358, + "learning_rate": 0.0006242594516091967, + "loss": 0.0242, + "num_input_tokens_seen": 126601056, + "step": 58610 + }, + { + "epoch": 9.561990212071779, + "grad_norm": 0.02429444156587124, + "learning_rate": 0.000624190503648901, + "loss": 0.0273, + "num_input_tokens_seen": 126611936, + "step": 58615 + }, + { + "epoch": 9.562805872756933, + "grad_norm": 0.14289309084415436, + "learning_rate": 0.000624121553171727, + "loss": 0.1001, + "num_input_tokens_seen": 126622272, + "step": 58620 + }, + { + "epoch": 9.563621533442088, + "grad_norm": 0.20121271908283234, + "learning_rate": 0.0006240526001790723, + "loss": 0.0847, + "num_input_tokens_seen": 126634080, + "step": 58625 + }, + { + "epoch": 9.564437194127244, + "grad_norm": 0.026418212801218033, + "learning_rate": 0.0006239836446723343, + "loss": 0.1158, + "num_input_tokens_seen": 126644608, + "step": 58630 + }, + { + "epoch": 9.565252854812398, + "grad_norm": 0.2299506664276123, + "learning_rate": 0.0006239146866529105, + "loss": 0.0914, + "num_input_tokens_seen": 126654976, + "step": 58635 + }, + { + "epoch": 9.566068515497554, + "grad_norm": 0.03591744229197502, + "learning_rate": 0.0006238457261221983, + "loss": 0.0996, + "num_input_tokens_seen": 126666272, + "step": 58640 + }, + { + "epoch": 9.566884176182707, + "grad_norm": 0.3727372884750366, + "learning_rate": 0.0006237767630815955, + "loss": 0.0848, + "num_input_tokens_seen": 126677824, + "step": 58645 + }, + { + "epoch": 9.567699836867863, + "grad_norm": 0.0906633585691452, + "learning_rate": 0.0006237077975324994, + "loss": 0.0559, + "num_input_tokens_seen": 126688832, + "step": 58650 + }, + { + "epoch": 9.568515497553017, + "grad_norm": 0.007482402957975864, + "learning_rate": 0.0006236388294763079, + "loss": 0.0113, + "num_input_tokens_seen": 126700000, + "step": 58655 + }, + { + "epoch": 9.569331158238173, + "grad_norm": 0.0202884990721941, + "learning_rate": 0.0006235698589144188, + "loss": 0.0524, + "num_input_tokens_seen": 126711136, + "step": 58660 + }, + { + "epoch": 9.570146818923329, + "grad_norm": 0.19632995128631592, + "learning_rate": 0.0006235008858482295, + "loss": 0.069, + "num_input_tokens_seen": 126722944, + "step": 58665 + }, + { + "epoch": 9.570962479608482, + "grad_norm": 0.02798754721879959, + "learning_rate": 0.0006234319102791382, + "loss": 0.0135, + "num_input_tokens_seen": 126734240, + "step": 58670 + }, + { + "epoch": 9.571778140293638, + "grad_norm": 0.06160164624452591, + "learning_rate": 0.0006233629322085427, + "loss": 0.1022, + "num_input_tokens_seen": 126744000, + "step": 58675 + }, + { + "epoch": 9.572593800978792, + "grad_norm": 0.18928822875022888, + "learning_rate": 0.0006232939516378408, + "loss": 0.0703, + "num_input_tokens_seen": 126754592, + "step": 58680 + }, + { + "epoch": 9.573409461663948, + "grad_norm": 0.0757313072681427, + "learning_rate": 0.0006232249685684306, + "loss": 0.0523, + "num_input_tokens_seen": 126764544, + "step": 58685 + }, + { + "epoch": 9.574225122349104, + "grad_norm": 0.16569207608699799, + "learning_rate": 0.0006231559830017102, + "loss": 0.042, + "num_input_tokens_seen": 126775456, + "step": 58690 + }, + { + "epoch": 9.575040783034257, + "grad_norm": 0.2607077956199646, + "learning_rate": 0.0006230869949390774, + "loss": 0.0709, + "num_input_tokens_seen": 126787488, + "step": 58695 + }, + { + "epoch": 9.575856443719413, + "grad_norm": 0.26447293162345886, + "learning_rate": 0.0006230180043819306, + "loss": 0.163, + "num_input_tokens_seen": 126798944, + "step": 58700 + }, + { + "epoch": 9.576672104404567, + "grad_norm": 0.019896386191248894, + "learning_rate": 0.0006229490113316678, + "loss": 0.0264, + "num_input_tokens_seen": 126809696, + "step": 58705 + }, + { + "epoch": 9.577487765089723, + "grad_norm": 0.012389651499688625, + "learning_rate": 0.0006228800157896874, + "loss": 0.0454, + "num_input_tokens_seen": 126820288, + "step": 58710 + }, + { + "epoch": 9.578303425774878, + "grad_norm": 0.020707737654447556, + "learning_rate": 0.0006228110177573876, + "loss": 0.0159, + "num_input_tokens_seen": 126830400, + "step": 58715 + }, + { + "epoch": 9.579119086460032, + "grad_norm": 0.026347359642386436, + "learning_rate": 0.0006227420172361667, + "loss": 0.0181, + "num_input_tokens_seen": 126841440, + "step": 58720 + }, + { + "epoch": 9.579934747145188, + "grad_norm": 0.3248952031135559, + "learning_rate": 0.0006226730142274232, + "loss": 0.1265, + "num_input_tokens_seen": 126852256, + "step": 58725 + }, + { + "epoch": 9.580750407830342, + "grad_norm": 0.013630959205329418, + "learning_rate": 0.0006226040087325553, + "loss": 0.025, + "num_input_tokens_seen": 126863136, + "step": 58730 + }, + { + "epoch": 9.581566068515498, + "grad_norm": 0.4216710329055786, + "learning_rate": 0.0006225350007529616, + "loss": 0.0738, + "num_input_tokens_seen": 126874112, + "step": 58735 + }, + { + "epoch": 9.582381729200652, + "grad_norm": 0.04262214154005051, + "learning_rate": 0.0006224659902900408, + "loss": 0.215, + "num_input_tokens_seen": 126885024, + "step": 58740 + }, + { + "epoch": 9.583197389885807, + "grad_norm": 0.23827561736106873, + "learning_rate": 0.0006223969773451913, + "loss": 0.0261, + "num_input_tokens_seen": 126896384, + "step": 58745 + }, + { + "epoch": 9.584013050570963, + "grad_norm": 0.1313164383172989, + "learning_rate": 0.0006223279619198118, + "loss": 0.033, + "num_input_tokens_seen": 126906912, + "step": 58750 + }, + { + "epoch": 9.584828711256117, + "grad_norm": 0.46747714281082153, + "learning_rate": 0.000622258944015301, + "loss": 0.1526, + "num_input_tokens_seen": 126917568, + "step": 58755 + }, + { + "epoch": 9.585644371941273, + "grad_norm": 0.031990889459848404, + "learning_rate": 0.0006221899236330575, + "loss": 0.0248, + "num_input_tokens_seen": 126928384, + "step": 58760 + }, + { + "epoch": 9.586460032626427, + "grad_norm": 0.35653167963027954, + "learning_rate": 0.0006221209007744803, + "loss": 0.096, + "num_input_tokens_seen": 126939264, + "step": 58765 + }, + { + "epoch": 9.587275693311582, + "grad_norm": 0.26260942220687866, + "learning_rate": 0.0006220518754409681, + "loss": 0.1236, + "num_input_tokens_seen": 126949792, + "step": 58770 + }, + { + "epoch": 9.588091353996738, + "grad_norm": 0.010245069861412048, + "learning_rate": 0.0006219828476339195, + "loss": 0.0452, + "num_input_tokens_seen": 126961536, + "step": 58775 + }, + { + "epoch": 9.588907014681892, + "grad_norm": 0.019048362970352173, + "learning_rate": 0.0006219138173547341, + "loss": 0.0958, + "num_input_tokens_seen": 126971648, + "step": 58780 + }, + { + "epoch": 9.589722675367048, + "grad_norm": 0.03487079590559006, + "learning_rate": 0.0006218447846048106, + "loss": 0.0147, + "num_input_tokens_seen": 126982272, + "step": 58785 + }, + { + "epoch": 9.590538336052202, + "grad_norm": 0.004236708395183086, + "learning_rate": 0.0006217757493855477, + "loss": 0.0175, + "num_input_tokens_seen": 126993760, + "step": 58790 + }, + { + "epoch": 9.591353996737357, + "grad_norm": 0.00880725122988224, + "learning_rate": 0.0006217067116983449, + "loss": 0.0221, + "num_input_tokens_seen": 127004448, + "step": 58795 + }, + { + "epoch": 9.592169657422513, + "grad_norm": 0.003701163223013282, + "learning_rate": 0.0006216376715446011, + "loss": 0.0328, + "num_input_tokens_seen": 127014016, + "step": 58800 + }, + { + "epoch": 9.592985318107667, + "grad_norm": 0.03611031547188759, + "learning_rate": 0.0006215686289257156, + "loss": 0.0361, + "num_input_tokens_seen": 127024576, + "step": 58805 + }, + { + "epoch": 9.593800978792823, + "grad_norm": 0.009452610276639462, + "learning_rate": 0.0006214995838430878, + "loss": 0.0154, + "num_input_tokens_seen": 127035936, + "step": 58810 + }, + { + "epoch": 9.594616639477977, + "grad_norm": 0.1101599782705307, + "learning_rate": 0.0006214305362981167, + "loss": 0.0382, + "num_input_tokens_seen": 127046144, + "step": 58815 + }, + { + "epoch": 9.595432300163132, + "grad_norm": 0.006794488988816738, + "learning_rate": 0.0006213614862922015, + "loss": 0.0077, + "num_input_tokens_seen": 127056544, + "step": 58820 + }, + { + "epoch": 9.596247960848288, + "grad_norm": 0.07736363261938095, + "learning_rate": 0.0006212924338267421, + "loss": 0.0186, + "num_input_tokens_seen": 127067040, + "step": 58825 + }, + { + "epoch": 9.597063621533442, + "grad_norm": 0.06632602959871292, + "learning_rate": 0.0006212233789031376, + "loss": 0.0108, + "num_input_tokens_seen": 127076832, + "step": 58830 + }, + { + "epoch": 9.597879282218598, + "grad_norm": 0.4707207977771759, + "learning_rate": 0.0006211543215227874, + "loss": 0.1821, + "num_input_tokens_seen": 127088544, + "step": 58835 + }, + { + "epoch": 9.598694942903752, + "grad_norm": 0.03901342302560806, + "learning_rate": 0.0006210852616870913, + "loss": 0.0088, + "num_input_tokens_seen": 127099040, + "step": 58840 + }, + { + "epoch": 9.599510603588907, + "grad_norm": 0.019769612699747086, + "learning_rate": 0.0006210161993974488, + "loss": 0.1643, + "num_input_tokens_seen": 127109152, + "step": 58845 + }, + { + "epoch": 9.600326264274061, + "grad_norm": 0.005478884559124708, + "learning_rate": 0.0006209471346552594, + "loss": 0.0079, + "num_input_tokens_seen": 127121472, + "step": 58850 + }, + { + "epoch": 9.601141924959217, + "grad_norm": 0.1525253802537918, + "learning_rate": 0.000620878067461923, + "loss": 0.015, + "num_input_tokens_seen": 127133504, + "step": 58855 + }, + { + "epoch": 9.601957585644373, + "grad_norm": 0.18319138884544373, + "learning_rate": 0.0006208089978188392, + "loss": 0.0226, + "num_input_tokens_seen": 127143904, + "step": 58860 + }, + { + "epoch": 9.602773246329527, + "grad_norm": 0.014515785500407219, + "learning_rate": 0.0006207399257274077, + "loss": 0.0462, + "num_input_tokens_seen": 127151936, + "step": 58865 + }, + { + "epoch": 9.603588907014682, + "grad_norm": 0.006759721785783768, + "learning_rate": 0.0006206708511890286, + "loss": 0.0826, + "num_input_tokens_seen": 127162912, + "step": 58870 + }, + { + "epoch": 9.604404567699836, + "grad_norm": 0.021830957382917404, + "learning_rate": 0.0006206017742051014, + "loss": 0.0114, + "num_input_tokens_seen": 127174432, + "step": 58875 + }, + { + "epoch": 9.605220228384992, + "grad_norm": 0.5838247537612915, + "learning_rate": 0.0006205326947770263, + "loss": 0.0761, + "num_input_tokens_seen": 127184768, + "step": 58880 + }, + { + "epoch": 9.606035889070148, + "grad_norm": 0.0159307811409235, + "learning_rate": 0.0006204636129062034, + "loss": 0.0072, + "num_input_tokens_seen": 127195296, + "step": 58885 + }, + { + "epoch": 9.606851549755302, + "grad_norm": 0.012928396463394165, + "learning_rate": 0.0006203945285940325, + "loss": 0.0046, + "num_input_tokens_seen": 127205280, + "step": 58890 + }, + { + "epoch": 9.607667210440457, + "grad_norm": 0.008373846299946308, + "learning_rate": 0.0006203254418419137, + "loss": 0.0993, + "num_input_tokens_seen": 127214624, + "step": 58895 + }, + { + "epoch": 9.608482871125611, + "grad_norm": 0.10752927511930466, + "learning_rate": 0.0006202563526512471, + "loss": 0.0775, + "num_input_tokens_seen": 127225984, + "step": 58900 + }, + { + "epoch": 9.609298531810767, + "grad_norm": 0.012208987027406693, + "learning_rate": 0.0006201872610234331, + "loss": 0.043, + "num_input_tokens_seen": 127237472, + "step": 58905 + }, + { + "epoch": 9.61011419249592, + "grad_norm": 0.4600610136985779, + "learning_rate": 0.0006201181669598717, + "loss": 0.0201, + "num_input_tokens_seen": 127247328, + "step": 58910 + }, + { + "epoch": 9.610929853181077, + "grad_norm": 0.0066894530318677425, + "learning_rate": 0.0006200490704619633, + "loss": 0.0109, + "num_input_tokens_seen": 127259424, + "step": 58915 + }, + { + "epoch": 9.611745513866232, + "grad_norm": 0.005828152410686016, + "learning_rate": 0.0006199799715311083, + "loss": 0.0071, + "num_input_tokens_seen": 127270976, + "step": 58920 + }, + { + "epoch": 9.612561174551386, + "grad_norm": 0.27778568863868713, + "learning_rate": 0.0006199108701687068, + "loss": 0.0803, + "num_input_tokens_seen": 127282112, + "step": 58925 + }, + { + "epoch": 9.613376835236542, + "grad_norm": 0.007226116955280304, + "learning_rate": 0.0006198417663761596, + "loss": 0.0037, + "num_input_tokens_seen": 127293504, + "step": 58930 + }, + { + "epoch": 9.614192495921696, + "grad_norm": 0.0546656958758831, + "learning_rate": 0.0006197726601548667, + "loss": 0.0204, + "num_input_tokens_seen": 127305408, + "step": 58935 + }, + { + "epoch": 9.615008156606851, + "grad_norm": 0.323722779750824, + "learning_rate": 0.0006197035515062291, + "loss": 0.0675, + "num_input_tokens_seen": 127317344, + "step": 58940 + }, + { + "epoch": 9.615823817292007, + "grad_norm": 0.006941157393157482, + "learning_rate": 0.0006196344404316472, + "loss": 0.007, + "num_input_tokens_seen": 127328352, + "step": 58945 + }, + { + "epoch": 9.616639477977161, + "grad_norm": 0.3557772636413574, + "learning_rate": 0.0006195653269325214, + "loss": 0.0374, + "num_input_tokens_seen": 127340096, + "step": 58950 + }, + { + "epoch": 9.617455138662317, + "grad_norm": 0.012230448424816132, + "learning_rate": 0.0006194962110102528, + "loss": 0.1167, + "num_input_tokens_seen": 127351200, + "step": 58955 + }, + { + "epoch": 9.61827079934747, + "grad_norm": 0.0034625427797436714, + "learning_rate": 0.0006194270926662416, + "loss": 0.0182, + "num_input_tokens_seen": 127362400, + "step": 58960 + }, + { + "epoch": 9.619086460032626, + "grad_norm": 0.0702400952577591, + "learning_rate": 0.000619357971901889, + "loss": 0.0783, + "num_input_tokens_seen": 127371904, + "step": 58965 + }, + { + "epoch": 9.619902120717782, + "grad_norm": 0.18260350823402405, + "learning_rate": 0.0006192888487185958, + "loss": 0.0812, + "num_input_tokens_seen": 127381984, + "step": 58970 + }, + { + "epoch": 9.620717781402936, + "grad_norm": 0.10353845357894897, + "learning_rate": 0.0006192197231177627, + "loss": 0.0111, + "num_input_tokens_seen": 127391584, + "step": 58975 + }, + { + "epoch": 9.621533442088092, + "grad_norm": 0.01583891175687313, + "learning_rate": 0.0006191505951007906, + "loss": 0.1023, + "num_input_tokens_seen": 127401728, + "step": 58980 + }, + { + "epoch": 9.622349102773246, + "grad_norm": 0.18626467883586884, + "learning_rate": 0.0006190814646690805, + "loss": 0.0142, + "num_input_tokens_seen": 127413184, + "step": 58985 + }, + { + "epoch": 9.623164763458401, + "grad_norm": 0.027829086408019066, + "learning_rate": 0.0006190123318240335, + "loss": 0.024, + "num_input_tokens_seen": 127423264, + "step": 58990 + }, + { + "epoch": 9.623980424143557, + "grad_norm": 0.0037336426321417093, + "learning_rate": 0.0006189431965670507, + "loss": 0.0074, + "num_input_tokens_seen": 127435488, + "step": 58995 + }, + { + "epoch": 9.624796084828711, + "grad_norm": 0.003280236152932048, + "learning_rate": 0.0006188740588995331, + "loss": 0.046, + "num_input_tokens_seen": 127445472, + "step": 59000 + }, + { + "epoch": 9.625611745513867, + "grad_norm": 0.036388151347637177, + "learning_rate": 0.000618804918822882, + "loss": 0.062, + "num_input_tokens_seen": 127453984, + "step": 59005 + }, + { + "epoch": 9.62642740619902, + "grad_norm": 0.1263275146484375, + "learning_rate": 0.0006187357763384982, + "loss": 0.0422, + "num_input_tokens_seen": 127464960, + "step": 59010 + }, + { + "epoch": 9.627243066884176, + "grad_norm": 0.006821201648563147, + "learning_rate": 0.0006186666314477835, + "loss": 0.0234, + "num_input_tokens_seen": 127475040, + "step": 59015 + }, + { + "epoch": 9.62805872756933, + "grad_norm": 0.04489858075976372, + "learning_rate": 0.0006185974841521389, + "loss": 0.1792, + "num_input_tokens_seen": 127486496, + "step": 59020 + }, + { + "epoch": 9.628874388254486, + "grad_norm": 0.005536007694900036, + "learning_rate": 0.0006185283344529659, + "loss": 0.0293, + "num_input_tokens_seen": 127496416, + "step": 59025 + }, + { + "epoch": 9.629690048939642, + "grad_norm": 0.0015038796700537205, + "learning_rate": 0.0006184591823516658, + "loss": 0.0551, + "num_input_tokens_seen": 127508000, + "step": 59030 + }, + { + "epoch": 9.630505709624796, + "grad_norm": 0.6168532371520996, + "learning_rate": 0.00061839002784964, + "loss": 0.0912, + "num_input_tokens_seen": 127518016, + "step": 59035 + }, + { + "epoch": 9.631321370309951, + "grad_norm": 0.021128684282302856, + "learning_rate": 0.0006183208709482903, + "loss": 0.0933, + "num_input_tokens_seen": 127528576, + "step": 59040 + }, + { + "epoch": 9.632137030995105, + "grad_norm": 0.01242469996213913, + "learning_rate": 0.0006182517116490179, + "loss": 0.0197, + "num_input_tokens_seen": 127538336, + "step": 59045 + }, + { + "epoch": 9.632952691680261, + "grad_norm": 0.4242917001247406, + "learning_rate": 0.0006181825499532247, + "loss": 0.0546, + "num_input_tokens_seen": 127548672, + "step": 59050 + }, + { + "epoch": 9.633768352365417, + "grad_norm": 0.06874293833971024, + "learning_rate": 0.000618113385862312, + "loss": 0.0584, + "num_input_tokens_seen": 127559584, + "step": 59055 + }, + { + "epoch": 9.63458401305057, + "grad_norm": 0.026289397850632668, + "learning_rate": 0.0006180442193776818, + "loss": 0.1536, + "num_input_tokens_seen": 127570656, + "step": 59060 + }, + { + "epoch": 9.635399673735726, + "grad_norm": 0.0054007284343242645, + "learning_rate": 0.0006179750505007357, + "loss": 0.0494, + "num_input_tokens_seen": 127583008, + "step": 59065 + }, + { + "epoch": 9.63621533442088, + "grad_norm": 0.39873865246772766, + "learning_rate": 0.0006179058792328756, + "loss": 0.2067, + "num_input_tokens_seen": 127594464, + "step": 59070 + }, + { + "epoch": 9.637030995106036, + "grad_norm": 0.007571790833026171, + "learning_rate": 0.0006178367055755032, + "loss": 0.0249, + "num_input_tokens_seen": 127605568, + "step": 59075 + }, + { + "epoch": 9.63784665579119, + "grad_norm": 0.20183655619621277, + "learning_rate": 0.0006177675295300206, + "loss": 0.0159, + "num_input_tokens_seen": 127615328, + "step": 59080 + }, + { + "epoch": 9.638662316476346, + "grad_norm": 0.133195161819458, + "learning_rate": 0.0006176983510978296, + "loss": 0.0184, + "num_input_tokens_seen": 127625984, + "step": 59085 + }, + { + "epoch": 9.639477977161501, + "grad_norm": 0.08000269532203674, + "learning_rate": 0.000617629170280332, + "loss": 0.0184, + "num_input_tokens_seen": 127637056, + "step": 59090 + }, + { + "epoch": 9.640293637846655, + "grad_norm": 0.2687270939350128, + "learning_rate": 0.0006175599870789301, + "loss": 0.0766, + "num_input_tokens_seen": 127647648, + "step": 59095 + }, + { + "epoch": 9.641109298531811, + "grad_norm": 0.012075605802237988, + "learning_rate": 0.000617490801495026, + "loss": 0.0204, + "num_input_tokens_seen": 127657728, + "step": 59100 + }, + { + "epoch": 9.641924959216965, + "grad_norm": 0.33024096488952637, + "learning_rate": 0.0006174216135300219, + "loss": 0.1815, + "num_input_tokens_seen": 127669568, + "step": 59105 + }, + { + "epoch": 9.64274061990212, + "grad_norm": 0.03295661136507988, + "learning_rate": 0.0006173524231853197, + "loss": 0.027, + "num_input_tokens_seen": 127681056, + "step": 59110 + }, + { + "epoch": 9.643556280587276, + "grad_norm": 0.006422259379178286, + "learning_rate": 0.0006172832304623217, + "loss": 0.0367, + "num_input_tokens_seen": 127692000, + "step": 59115 + }, + { + "epoch": 9.64437194127243, + "grad_norm": 0.6102940440177917, + "learning_rate": 0.0006172140353624304, + "loss": 0.2075, + "num_input_tokens_seen": 127704352, + "step": 59120 + }, + { + "epoch": 9.645187601957586, + "grad_norm": 0.01776854135096073, + "learning_rate": 0.0006171448378870479, + "loss": 0.0831, + "num_input_tokens_seen": 127715520, + "step": 59125 + }, + { + "epoch": 9.64600326264274, + "grad_norm": 0.08590822666883469, + "learning_rate": 0.0006170756380375766, + "loss": 0.1248, + "num_input_tokens_seen": 127726656, + "step": 59130 + }, + { + "epoch": 9.646818923327896, + "grad_norm": 0.009952404536306858, + "learning_rate": 0.000617006435815419, + "loss": 0.0376, + "num_input_tokens_seen": 127738784, + "step": 59135 + }, + { + "epoch": 9.647634584013051, + "grad_norm": 0.014745515771210194, + "learning_rate": 0.0006169372312219777, + "loss": 0.0935, + "num_input_tokens_seen": 127748992, + "step": 59140 + }, + { + "epoch": 9.648450244698205, + "grad_norm": 0.06781429797410965, + "learning_rate": 0.0006168680242586549, + "loss": 0.2448, + "num_input_tokens_seen": 127760992, + "step": 59145 + }, + { + "epoch": 9.649265905383361, + "grad_norm": 0.08127600699663162, + "learning_rate": 0.0006167988149268533, + "loss": 0.0143, + "num_input_tokens_seen": 127772000, + "step": 59150 + }, + { + "epoch": 9.650081566068515, + "grad_norm": 0.11132414638996124, + "learning_rate": 0.0006167296032279757, + "loss": 0.0186, + "num_input_tokens_seen": 127781984, + "step": 59155 + }, + { + "epoch": 9.65089722675367, + "grad_norm": 0.11625031381845474, + "learning_rate": 0.0006166603891634245, + "loss": 0.0966, + "num_input_tokens_seen": 127793440, + "step": 59160 + }, + { + "epoch": 9.651712887438826, + "grad_norm": 0.01904509961605072, + "learning_rate": 0.0006165911727346025, + "loss": 0.0056, + "num_input_tokens_seen": 127803232, + "step": 59165 + }, + { + "epoch": 9.65252854812398, + "grad_norm": 0.017921043559908867, + "learning_rate": 0.0006165219539429126, + "loss": 0.0542, + "num_input_tokens_seen": 127813696, + "step": 59170 + }, + { + "epoch": 9.653344208809136, + "grad_norm": 0.30042189359664917, + "learning_rate": 0.0006164527327897574, + "loss": 0.0398, + "num_input_tokens_seen": 127824096, + "step": 59175 + }, + { + "epoch": 9.65415986949429, + "grad_norm": 0.18783047795295715, + "learning_rate": 0.0006163835092765399, + "loss": 0.1257, + "num_input_tokens_seen": 127833856, + "step": 59180 + }, + { + "epoch": 9.654975530179446, + "grad_norm": 0.033547911792993546, + "learning_rate": 0.0006163142834046629, + "loss": 0.0244, + "num_input_tokens_seen": 127845248, + "step": 59185 + }, + { + "epoch": 9.655791190864601, + "grad_norm": 0.2385920286178589, + "learning_rate": 0.0006162450551755295, + "loss": 0.0306, + "num_input_tokens_seen": 127855936, + "step": 59190 + }, + { + "epoch": 9.656606851549755, + "grad_norm": 0.018533451482653618, + "learning_rate": 0.0006161758245905423, + "loss": 0.0113, + "num_input_tokens_seen": 127866688, + "step": 59195 + }, + { + "epoch": 9.65742251223491, + "grad_norm": 0.0036566208582371473, + "learning_rate": 0.0006161065916511047, + "loss": 0.0683, + "num_input_tokens_seen": 127877568, + "step": 59200 + }, + { + "epoch": 9.658238172920065, + "grad_norm": 0.08195324242115021, + "learning_rate": 0.0006160373563586199, + "loss": 0.0547, + "num_input_tokens_seen": 127887808, + "step": 59205 + }, + { + "epoch": 9.65905383360522, + "grad_norm": 0.018927576020359993, + "learning_rate": 0.0006159681187144909, + "loss": 0.0134, + "num_input_tokens_seen": 127898816, + "step": 59210 + }, + { + "epoch": 9.659869494290374, + "grad_norm": 0.01739303395152092, + "learning_rate": 0.0006158988787201208, + "loss": 0.0732, + "num_input_tokens_seen": 127909856, + "step": 59215 + }, + { + "epoch": 9.66068515497553, + "grad_norm": 0.006371657829731703, + "learning_rate": 0.0006158296363769128, + "loss": 0.1002, + "num_input_tokens_seen": 127920000, + "step": 59220 + }, + { + "epoch": 9.661500815660686, + "grad_norm": 0.20840153098106384, + "learning_rate": 0.0006157603916862703, + "loss": 0.1262, + "num_input_tokens_seen": 127931040, + "step": 59225 + }, + { + "epoch": 9.66231647634584, + "grad_norm": 0.0057899076491594315, + "learning_rate": 0.0006156911446495967, + "loss": 0.038, + "num_input_tokens_seen": 127941888, + "step": 59230 + }, + { + "epoch": 9.663132137030995, + "grad_norm": 0.02108006179332733, + "learning_rate": 0.0006156218952682953, + "loss": 0.014, + "num_input_tokens_seen": 127951328, + "step": 59235 + }, + { + "epoch": 9.66394779771615, + "grad_norm": 0.07076103985309601, + "learning_rate": 0.0006155526435437694, + "loss": 0.1026, + "num_input_tokens_seen": 127962816, + "step": 59240 + }, + { + "epoch": 9.664763458401305, + "grad_norm": 0.0183942299336195, + "learning_rate": 0.0006154833894774226, + "loss": 0.0736, + "num_input_tokens_seen": 127974016, + "step": 59245 + }, + { + "epoch": 9.66557911908646, + "grad_norm": 0.038768794387578964, + "learning_rate": 0.0006154141330706586, + "loss": 0.0123, + "num_input_tokens_seen": 127983808, + "step": 59250 + }, + { + "epoch": 9.666394779771615, + "grad_norm": 0.18464896082878113, + "learning_rate": 0.0006153448743248805, + "loss": 0.0517, + "num_input_tokens_seen": 127994912, + "step": 59255 + }, + { + "epoch": 9.66721044045677, + "grad_norm": 0.010973125696182251, + "learning_rate": 0.0006152756132414924, + "loss": 0.1439, + "num_input_tokens_seen": 128003488, + "step": 59260 + }, + { + "epoch": 9.668026101141924, + "grad_norm": 0.09602286666631699, + "learning_rate": 0.0006152063498218977, + "loss": 0.091, + "num_input_tokens_seen": 128014272, + "step": 59265 + }, + { + "epoch": 9.66884176182708, + "grad_norm": 0.25912317633628845, + "learning_rate": 0.0006151370840675001, + "loss": 0.0373, + "num_input_tokens_seen": 128023808, + "step": 59270 + }, + { + "epoch": 9.669657422512234, + "grad_norm": 0.046629659831523895, + "learning_rate": 0.0006150678159797034, + "loss": 0.04, + "num_input_tokens_seen": 128034624, + "step": 59275 + }, + { + "epoch": 9.67047308319739, + "grad_norm": 0.052462734282016754, + "learning_rate": 0.0006149985455599115, + "loss": 0.1757, + "num_input_tokens_seen": 128045440, + "step": 59280 + }, + { + "epoch": 9.671288743882545, + "grad_norm": 0.06739270687103271, + "learning_rate": 0.0006149292728095283, + "loss": 0.1342, + "num_input_tokens_seen": 128054624, + "step": 59285 + }, + { + "epoch": 9.6721044045677, + "grad_norm": 0.02121824584901333, + "learning_rate": 0.0006148599977299575, + "loss": 0.0641, + "num_input_tokens_seen": 128065984, + "step": 59290 + }, + { + "epoch": 9.672920065252855, + "grad_norm": 0.0886927992105484, + "learning_rate": 0.0006147907203226031, + "loss": 0.0409, + "num_input_tokens_seen": 128076768, + "step": 59295 + }, + { + "epoch": 9.673735725938009, + "grad_norm": 0.013269363902509212, + "learning_rate": 0.0006147214405888692, + "loss": 0.0179, + "num_input_tokens_seen": 128087840, + "step": 59300 + }, + { + "epoch": 9.674551386623165, + "grad_norm": 0.06566104292869568, + "learning_rate": 0.0006146521585301596, + "loss": 0.1646, + "num_input_tokens_seen": 128099168, + "step": 59305 + }, + { + "epoch": 9.67536704730832, + "grad_norm": 0.013543189503252506, + "learning_rate": 0.0006145828741478788, + "loss": 0.0231, + "num_input_tokens_seen": 128109696, + "step": 59310 + }, + { + "epoch": 9.676182707993474, + "grad_norm": 0.010334925726056099, + "learning_rate": 0.0006145135874434305, + "loss": 0.0411, + "num_input_tokens_seen": 128120768, + "step": 59315 + }, + { + "epoch": 9.67699836867863, + "grad_norm": 0.003125895978882909, + "learning_rate": 0.0006144442984182193, + "loss": 0.0101, + "num_input_tokens_seen": 128132768, + "step": 59320 + }, + { + "epoch": 9.677814029363784, + "grad_norm": 0.025807317346334457, + "learning_rate": 0.0006143750070736491, + "loss": 0.0181, + "num_input_tokens_seen": 128144768, + "step": 59325 + }, + { + "epoch": 9.67862969004894, + "grad_norm": 0.016329145058989525, + "learning_rate": 0.0006143057134111243, + "loss": 0.058, + "num_input_tokens_seen": 128155616, + "step": 59330 + }, + { + "epoch": 9.679445350734095, + "grad_norm": 0.21162718534469604, + "learning_rate": 0.0006142364174320492, + "loss": 0.1422, + "num_input_tokens_seen": 128166240, + "step": 59335 + }, + { + "epoch": 9.68026101141925, + "grad_norm": 0.03283459693193436, + "learning_rate": 0.0006141671191378281, + "loss": 0.0244, + "num_input_tokens_seen": 128177408, + "step": 59340 + }, + { + "epoch": 9.681076672104405, + "grad_norm": 0.05063774436712265, + "learning_rate": 0.0006140978185298656, + "loss": 0.0938, + "num_input_tokens_seen": 128188672, + "step": 59345 + }, + { + "epoch": 9.681892332789559, + "grad_norm": 0.0327952615916729, + "learning_rate": 0.0006140285156095661, + "loss": 0.0925, + "num_input_tokens_seen": 128199520, + "step": 59350 + }, + { + "epoch": 9.682707993474715, + "grad_norm": 0.3350673019886017, + "learning_rate": 0.0006139592103783339, + "loss": 0.0744, + "num_input_tokens_seen": 128211232, + "step": 59355 + }, + { + "epoch": 9.68352365415987, + "grad_norm": 0.008121310733258724, + "learning_rate": 0.000613889902837574, + "loss": 0.0152, + "num_input_tokens_seen": 128220672, + "step": 59360 + }, + { + "epoch": 9.684339314845024, + "grad_norm": 0.2720755338668823, + "learning_rate": 0.0006138205929886905, + "loss": 0.1245, + "num_input_tokens_seen": 128231680, + "step": 59365 + }, + { + "epoch": 9.68515497553018, + "grad_norm": 0.02992718666791916, + "learning_rate": 0.0006137512808330884, + "loss": 0.2193, + "num_input_tokens_seen": 128242016, + "step": 59370 + }, + { + "epoch": 9.685970636215334, + "grad_norm": 0.023438721895217896, + "learning_rate": 0.0006136819663721722, + "loss": 0.0225, + "num_input_tokens_seen": 128252736, + "step": 59375 + }, + { + "epoch": 9.68678629690049, + "grad_norm": 0.07434836030006409, + "learning_rate": 0.0006136126496073469, + "loss": 0.0126, + "num_input_tokens_seen": 128262016, + "step": 59380 + }, + { + "epoch": 9.687601957585644, + "grad_norm": 0.004536564461886883, + "learning_rate": 0.0006135433305400169, + "loss": 0.0208, + "num_input_tokens_seen": 128273440, + "step": 59385 + }, + { + "epoch": 9.6884176182708, + "grad_norm": 0.12715773284435272, + "learning_rate": 0.0006134740091715875, + "loss": 0.0186, + "num_input_tokens_seen": 128284096, + "step": 59390 + }, + { + "epoch": 9.689233278955955, + "grad_norm": 0.1364831030368805, + "learning_rate": 0.0006134046855034631, + "loss": 0.0775, + "num_input_tokens_seen": 128295264, + "step": 59395 + }, + { + "epoch": 9.690048939641109, + "grad_norm": 0.0033251584973186255, + "learning_rate": 0.0006133353595370491, + "loss": 0.0707, + "num_input_tokens_seen": 128305856, + "step": 59400 + }, + { + "epoch": 9.690864600326265, + "grad_norm": 0.1248481348156929, + "learning_rate": 0.0006132660312737502, + "loss": 0.1275, + "num_input_tokens_seen": 128316928, + "step": 59405 + }, + { + "epoch": 9.691680261011419, + "grad_norm": 0.0053417375311255455, + "learning_rate": 0.0006131967007149716, + "loss": 0.0222, + "num_input_tokens_seen": 128327168, + "step": 59410 + }, + { + "epoch": 9.692495921696574, + "grad_norm": 0.03325970470905304, + "learning_rate": 0.000613127367862118, + "loss": 0.0222, + "num_input_tokens_seen": 128338720, + "step": 59415 + }, + { + "epoch": 9.69331158238173, + "grad_norm": 0.1174701601266861, + "learning_rate": 0.0006130580327165949, + "loss": 0.0693, + "num_input_tokens_seen": 128349216, + "step": 59420 + }, + { + "epoch": 9.694127243066884, + "grad_norm": 0.002682819264009595, + "learning_rate": 0.0006129886952798074, + "loss": 0.0506, + "num_input_tokens_seen": 128359552, + "step": 59425 + }, + { + "epoch": 9.69494290375204, + "grad_norm": 0.00796019472181797, + "learning_rate": 0.0006129193555531606, + "loss": 0.0339, + "num_input_tokens_seen": 128370976, + "step": 59430 + }, + { + "epoch": 9.695758564437194, + "grad_norm": 0.21734458208084106, + "learning_rate": 0.0006128500135380598, + "loss": 0.029, + "num_input_tokens_seen": 128381472, + "step": 59435 + }, + { + "epoch": 9.69657422512235, + "grad_norm": 0.2784644365310669, + "learning_rate": 0.0006127806692359103, + "loss": 0.0328, + "num_input_tokens_seen": 128393088, + "step": 59440 + }, + { + "epoch": 9.697389885807503, + "grad_norm": 0.010838964022696018, + "learning_rate": 0.0006127113226481175, + "loss": 0.044, + "num_input_tokens_seen": 128403520, + "step": 59445 + }, + { + "epoch": 9.698205546492659, + "grad_norm": 0.0185268372297287, + "learning_rate": 0.0006126419737760868, + "loss": 0.0095, + "num_input_tokens_seen": 128415040, + "step": 59450 + }, + { + "epoch": 9.699021207177815, + "grad_norm": 0.29471975564956665, + "learning_rate": 0.0006125726226212236, + "loss": 0.2644, + "num_input_tokens_seen": 128425376, + "step": 59455 + }, + { + "epoch": 9.699836867862969, + "grad_norm": 0.2613866627216339, + "learning_rate": 0.0006125032691849333, + "loss": 0.1377, + "num_input_tokens_seen": 128435520, + "step": 59460 + }, + { + "epoch": 9.700652528548124, + "grad_norm": 0.01687040366232395, + "learning_rate": 0.0006124339134686216, + "loss": 0.0096, + "num_input_tokens_seen": 128446368, + "step": 59465 + }, + { + "epoch": 9.701468189233278, + "grad_norm": 0.07034385949373245, + "learning_rate": 0.0006123645554736941, + "loss": 0.0466, + "num_input_tokens_seen": 128456736, + "step": 59470 + }, + { + "epoch": 9.702283849918434, + "grad_norm": 0.13933435082435608, + "learning_rate": 0.0006122951952015562, + "loss": 0.0233, + "num_input_tokens_seen": 128467680, + "step": 59475 + }, + { + "epoch": 9.70309951060359, + "grad_norm": 0.008489076048135757, + "learning_rate": 0.0006122258326536138, + "loss": 0.037, + "num_input_tokens_seen": 128478720, + "step": 59480 + }, + { + "epoch": 9.703915171288743, + "grad_norm": 0.2870648205280304, + "learning_rate": 0.0006121564678312724, + "loss": 0.0918, + "num_input_tokens_seen": 128489344, + "step": 59485 + }, + { + "epoch": 9.7047308319739, + "grad_norm": 0.0050661033019423485, + "learning_rate": 0.0006120871007359381, + "loss": 0.0205, + "num_input_tokens_seen": 128498816, + "step": 59490 + }, + { + "epoch": 9.705546492659053, + "grad_norm": 0.004017225466668606, + "learning_rate": 0.0006120177313690164, + "loss": 0.023, + "num_input_tokens_seen": 128509504, + "step": 59495 + }, + { + "epoch": 9.706362153344209, + "grad_norm": 0.1601206660270691, + "learning_rate": 0.0006119483597319132, + "loss": 0.0454, + "num_input_tokens_seen": 128521856, + "step": 59500 + }, + { + "epoch": 9.707177814029365, + "grad_norm": 0.008326910436153412, + "learning_rate": 0.0006118789858260347, + "loss": 0.1383, + "num_input_tokens_seen": 128532672, + "step": 59505 + }, + { + "epoch": 9.707993474714518, + "grad_norm": 0.06540261209011078, + "learning_rate": 0.0006118096096527863, + "loss": 0.0317, + "num_input_tokens_seen": 128543392, + "step": 59510 + }, + { + "epoch": 9.708809135399674, + "grad_norm": 0.034490231424570084, + "learning_rate": 0.0006117402312135746, + "loss": 0.0547, + "num_input_tokens_seen": 128554464, + "step": 59515 + }, + { + "epoch": 9.709624796084828, + "grad_norm": 0.01300547644495964, + "learning_rate": 0.0006116708505098051, + "loss": 0.1313, + "num_input_tokens_seen": 128565472, + "step": 59520 + }, + { + "epoch": 9.710440456769984, + "grad_norm": 0.009446562267839909, + "learning_rate": 0.0006116014675428842, + "loss": 0.0744, + "num_input_tokens_seen": 128576832, + "step": 59525 + }, + { + "epoch": 9.71125611745514, + "grad_norm": 0.007366952486336231, + "learning_rate": 0.0006115320823142182, + "loss": 0.1082, + "num_input_tokens_seen": 128587136, + "step": 59530 + }, + { + "epoch": 9.712071778140293, + "grad_norm": 0.011812661774456501, + "learning_rate": 0.000611462694825213, + "loss": 0.1366, + "num_input_tokens_seen": 128598176, + "step": 59535 + }, + { + "epoch": 9.71288743882545, + "grad_norm": 0.08814537525177002, + "learning_rate": 0.0006113933050772749, + "loss": 0.0737, + "num_input_tokens_seen": 128608736, + "step": 59540 + }, + { + "epoch": 9.713703099510603, + "grad_norm": 0.0336206778883934, + "learning_rate": 0.00061132391307181, + "loss": 0.0218, + "num_input_tokens_seen": 128619104, + "step": 59545 + }, + { + "epoch": 9.714518760195759, + "grad_norm": 0.06537744402885437, + "learning_rate": 0.0006112545188102249, + "loss": 0.0687, + "num_input_tokens_seen": 128629088, + "step": 59550 + }, + { + "epoch": 9.715334420880914, + "grad_norm": 0.03816675767302513, + "learning_rate": 0.0006111851222939257, + "loss": 0.1662, + "num_input_tokens_seen": 128640224, + "step": 59555 + }, + { + "epoch": 9.716150081566068, + "grad_norm": 0.09322861582040787, + "learning_rate": 0.0006111157235243192, + "loss": 0.1321, + "num_input_tokens_seen": 128651040, + "step": 59560 + }, + { + "epoch": 9.716965742251224, + "grad_norm": 0.0767819806933403, + "learning_rate": 0.0006110463225028114, + "loss": 0.137, + "num_input_tokens_seen": 128660928, + "step": 59565 + }, + { + "epoch": 9.717781402936378, + "grad_norm": 0.04609353095293045, + "learning_rate": 0.0006109769192308091, + "loss": 0.1022, + "num_input_tokens_seen": 128670144, + "step": 59570 + }, + { + "epoch": 9.718597063621534, + "grad_norm": 0.014793830923736095, + "learning_rate": 0.0006109075137097188, + "loss": 0.0489, + "num_input_tokens_seen": 128680352, + "step": 59575 + }, + { + "epoch": 9.719412724306688, + "grad_norm": 0.07300538569688797, + "learning_rate": 0.0006108381059409469, + "loss": 0.0268, + "num_input_tokens_seen": 128690240, + "step": 59580 + }, + { + "epoch": 9.720228384991843, + "grad_norm": 0.0640680119395256, + "learning_rate": 0.0006107686959259003, + "loss": 0.0205, + "num_input_tokens_seen": 128700032, + "step": 59585 + }, + { + "epoch": 9.721044045676999, + "grad_norm": 0.01870567351579666, + "learning_rate": 0.0006106992836659853, + "loss": 0.0472, + "num_input_tokens_seen": 128710016, + "step": 59590 + }, + { + "epoch": 9.721859706362153, + "grad_norm": 0.06907132267951965, + "learning_rate": 0.0006106298691626091, + "loss": 0.0095, + "num_input_tokens_seen": 128719040, + "step": 59595 + }, + { + "epoch": 9.722675367047309, + "grad_norm": 0.29290202260017395, + "learning_rate": 0.0006105604524171782, + "loss": 0.0434, + "num_input_tokens_seen": 128729824, + "step": 59600 + }, + { + "epoch": 9.723491027732463, + "grad_norm": 0.029978347942233086, + "learning_rate": 0.0006104910334310996, + "loss": 0.031, + "num_input_tokens_seen": 128740960, + "step": 59605 + }, + { + "epoch": 9.724306688417618, + "grad_norm": 0.008729356341063976, + "learning_rate": 0.0006104216122057799, + "loss": 0.0101, + "num_input_tokens_seen": 128752224, + "step": 59610 + }, + { + "epoch": 9.725122349102774, + "grad_norm": 0.057942282408475876, + "learning_rate": 0.0006103521887426262, + "loss": 0.0909, + "num_input_tokens_seen": 128762304, + "step": 59615 + }, + { + "epoch": 9.725938009787928, + "grad_norm": 0.005804257933050394, + "learning_rate": 0.0006102827630430454, + "loss": 0.055, + "num_input_tokens_seen": 128772448, + "step": 59620 + }, + { + "epoch": 9.726753670473084, + "grad_norm": 0.27918195724487305, + "learning_rate": 0.0006102133351084443, + "loss": 0.0455, + "num_input_tokens_seen": 128784320, + "step": 59625 + }, + { + "epoch": 9.727569331158238, + "grad_norm": 0.39616507291793823, + "learning_rate": 0.0006101439049402304, + "loss": 0.1558, + "num_input_tokens_seen": 128794784, + "step": 59630 + }, + { + "epoch": 9.728384991843393, + "grad_norm": 0.0009002613369375467, + "learning_rate": 0.0006100744725398105, + "loss": 0.0125, + "num_input_tokens_seen": 128806688, + "step": 59635 + }, + { + "epoch": 9.729200652528547, + "grad_norm": 0.1437094509601593, + "learning_rate": 0.0006100050379085918, + "loss": 0.0356, + "num_input_tokens_seen": 128816768, + "step": 59640 + }, + { + "epoch": 9.730016313213703, + "grad_norm": 0.03397361934185028, + "learning_rate": 0.0006099356010479814, + "loss": 0.104, + "num_input_tokens_seen": 128827104, + "step": 59645 + }, + { + "epoch": 9.730831973898859, + "grad_norm": 0.06053118407726288, + "learning_rate": 0.0006098661619593866, + "loss": 0.0096, + "num_input_tokens_seen": 128838592, + "step": 59650 + }, + { + "epoch": 9.731647634584013, + "grad_norm": 0.0009688441641628742, + "learning_rate": 0.0006097967206442147, + "loss": 0.0316, + "num_input_tokens_seen": 128847936, + "step": 59655 + }, + { + "epoch": 9.732463295269168, + "grad_norm": 0.010555686429142952, + "learning_rate": 0.0006097272771038728, + "loss": 0.008, + "num_input_tokens_seen": 128858432, + "step": 59660 + }, + { + "epoch": 9.733278955954322, + "grad_norm": 0.02458445355296135, + "learning_rate": 0.0006096578313397687, + "loss": 0.0158, + "num_input_tokens_seen": 128869888, + "step": 59665 + }, + { + "epoch": 9.734094616639478, + "grad_norm": 0.19622117280960083, + "learning_rate": 0.0006095883833533094, + "loss": 0.109, + "num_input_tokens_seen": 128879648, + "step": 59670 + }, + { + "epoch": 9.734910277324634, + "grad_norm": 0.007972361519932747, + "learning_rate": 0.0006095189331459024, + "loss": 0.0179, + "num_input_tokens_seen": 128891072, + "step": 59675 + }, + { + "epoch": 9.735725938009788, + "grad_norm": 0.003296657232567668, + "learning_rate": 0.0006094494807189555, + "loss": 0.0588, + "num_input_tokens_seen": 128903008, + "step": 59680 + }, + { + "epoch": 9.736541598694943, + "grad_norm": 0.08102521300315857, + "learning_rate": 0.0006093800260738758, + "loss": 0.0472, + "num_input_tokens_seen": 128913632, + "step": 59685 + }, + { + "epoch": 9.737357259380097, + "grad_norm": 0.014509606175124645, + "learning_rate": 0.0006093105692120712, + "loss": 0.1708, + "num_input_tokens_seen": 128924064, + "step": 59690 + }, + { + "epoch": 9.738172920065253, + "grad_norm": 0.0067308759316802025, + "learning_rate": 0.0006092411101349492, + "loss": 0.0577, + "num_input_tokens_seen": 128935104, + "step": 59695 + }, + { + "epoch": 9.738988580750409, + "grad_norm": 0.01863669790327549, + "learning_rate": 0.0006091716488439177, + "loss": 0.0272, + "num_input_tokens_seen": 128945952, + "step": 59700 + }, + { + "epoch": 9.739804241435563, + "grad_norm": 0.3315466344356537, + "learning_rate": 0.0006091021853403841, + "loss": 0.0688, + "num_input_tokens_seen": 128955808, + "step": 59705 + }, + { + "epoch": 9.740619902120718, + "grad_norm": 0.0017027267022058368, + "learning_rate": 0.0006090327196257562, + "loss": 0.0629, + "num_input_tokens_seen": 128966304, + "step": 59710 + }, + { + "epoch": 9.741435562805872, + "grad_norm": 0.08503128588199615, + "learning_rate": 0.000608963251701442, + "loss": 0.0178, + "num_input_tokens_seen": 128976768, + "step": 59715 + }, + { + "epoch": 9.742251223491028, + "grad_norm": 0.017233064398169518, + "learning_rate": 0.0006088937815688495, + "loss": 0.0258, + "num_input_tokens_seen": 128988416, + "step": 59720 + }, + { + "epoch": 9.743066884176184, + "grad_norm": 0.4578186571598053, + "learning_rate": 0.0006088243092293861, + "loss": 0.1782, + "num_input_tokens_seen": 129000032, + "step": 59725 + }, + { + "epoch": 9.743882544861338, + "grad_norm": 0.0711468756198883, + "learning_rate": 0.0006087548346844601, + "loss": 0.0207, + "num_input_tokens_seen": 129010464, + "step": 59730 + }, + { + "epoch": 9.744698205546493, + "grad_norm": 0.0229730773717165, + "learning_rate": 0.0006086853579354793, + "loss": 0.0431, + "num_input_tokens_seen": 129021920, + "step": 59735 + }, + { + "epoch": 9.745513866231647, + "grad_norm": 0.008088946342468262, + "learning_rate": 0.0006086158789838519, + "loss": 0.0827, + "num_input_tokens_seen": 129032480, + "step": 59740 + }, + { + "epoch": 9.746329526916803, + "grad_norm": 0.018943075090646744, + "learning_rate": 0.0006085463978309861, + "loss": 0.045, + "num_input_tokens_seen": 129043712, + "step": 59745 + }, + { + "epoch": 9.747145187601957, + "grad_norm": 0.02913905493915081, + "learning_rate": 0.0006084769144782897, + "loss": 0.0292, + "num_input_tokens_seen": 129054560, + "step": 59750 + }, + { + "epoch": 9.747960848287113, + "grad_norm": 0.003888669889420271, + "learning_rate": 0.0006084074289271711, + "loss": 0.0071, + "num_input_tokens_seen": 129065504, + "step": 59755 + }, + { + "epoch": 9.748776508972268, + "grad_norm": 0.08220919221639633, + "learning_rate": 0.0006083379411790383, + "loss": 0.0388, + "num_input_tokens_seen": 129075936, + "step": 59760 + }, + { + "epoch": 9.749592169657422, + "grad_norm": 0.13562801480293274, + "learning_rate": 0.0006082684512352997, + "loss": 0.08, + "num_input_tokens_seen": 129087168, + "step": 59765 + }, + { + "epoch": 9.750407830342578, + "grad_norm": 0.0025733276270329952, + "learning_rate": 0.0006081989590973637, + "loss": 0.0074, + "num_input_tokens_seen": 129096832, + "step": 59770 + }, + { + "epoch": 9.751223491027732, + "grad_norm": 0.019972285255789757, + "learning_rate": 0.0006081294647666385, + "loss": 0.3124, + "num_input_tokens_seen": 129107616, + "step": 59775 + }, + { + "epoch": 9.752039151712887, + "grad_norm": 0.36964648962020874, + "learning_rate": 0.0006080599682445325, + "loss": 0.0386, + "num_input_tokens_seen": 129119296, + "step": 59780 + }, + { + "epoch": 9.752854812398043, + "grad_norm": 0.11169246584177017, + "learning_rate": 0.000607990469532454, + "loss": 0.0647, + "num_input_tokens_seen": 129129792, + "step": 59785 + }, + { + "epoch": 9.753670473083197, + "grad_norm": 0.010639806278049946, + "learning_rate": 0.0006079209686318119, + "loss": 0.0163, + "num_input_tokens_seen": 129140320, + "step": 59790 + }, + { + "epoch": 9.754486133768353, + "grad_norm": 0.05021905153989792, + "learning_rate": 0.0006078514655440144, + "loss": 0.0626, + "num_input_tokens_seen": 129152288, + "step": 59795 + }, + { + "epoch": 9.755301794453507, + "grad_norm": 0.042118266224861145, + "learning_rate": 0.0006077819602704702, + "loss": 0.035, + "num_input_tokens_seen": 129162528, + "step": 59800 + }, + { + "epoch": 9.756117455138662, + "grad_norm": 0.8312237858772278, + "learning_rate": 0.0006077124528125877, + "loss": 0.0373, + "num_input_tokens_seen": 129172576, + "step": 59805 + }, + { + "epoch": 9.756933115823816, + "grad_norm": 0.0063532376661896706, + "learning_rate": 0.0006076429431717757, + "loss": 0.0285, + "num_input_tokens_seen": 129183744, + "step": 59810 + }, + { + "epoch": 9.757748776508972, + "grad_norm": 0.01858111470937729, + "learning_rate": 0.000607573431349443, + "loss": 0.1904, + "num_input_tokens_seen": 129194048, + "step": 59815 + }, + { + "epoch": 9.758564437194128, + "grad_norm": 0.23809611797332764, + "learning_rate": 0.0006075039173469982, + "loss": 0.1462, + "num_input_tokens_seen": 129204416, + "step": 59820 + }, + { + "epoch": 9.759380097879282, + "grad_norm": 0.006448432803153992, + "learning_rate": 0.0006074344011658501, + "loss": 0.0679, + "num_input_tokens_seen": 129214880, + "step": 59825 + }, + { + "epoch": 9.760195758564437, + "grad_norm": 0.0030931164510548115, + "learning_rate": 0.0006073648828074077, + "loss": 0.1305, + "num_input_tokens_seen": 129226176, + "step": 59830 + }, + { + "epoch": 9.761011419249591, + "grad_norm": 0.03220542520284653, + "learning_rate": 0.0006072953622730796, + "loss": 0.0369, + "num_input_tokens_seen": 129236896, + "step": 59835 + }, + { + "epoch": 9.761827079934747, + "grad_norm": 0.06129692122340202, + "learning_rate": 0.0006072258395642748, + "loss": 0.0273, + "num_input_tokens_seen": 129249088, + "step": 59840 + }, + { + "epoch": 9.762642740619903, + "grad_norm": 0.1948278546333313, + "learning_rate": 0.0006071563146824024, + "loss": 0.0644, + "num_input_tokens_seen": 129260096, + "step": 59845 + }, + { + "epoch": 9.763458401305057, + "grad_norm": 0.20120300352573395, + "learning_rate": 0.0006070867876288715, + "loss": 0.0493, + "num_input_tokens_seen": 129271744, + "step": 59850 + }, + { + "epoch": 9.764274061990212, + "grad_norm": 0.010271182283759117, + "learning_rate": 0.0006070172584050908, + "loss": 0.0582, + "num_input_tokens_seen": 129282688, + "step": 59855 + }, + { + "epoch": 9.765089722675366, + "grad_norm": 0.004881757777184248, + "learning_rate": 0.0006069477270124697, + "loss": 0.0399, + "num_input_tokens_seen": 129292576, + "step": 59860 + }, + { + "epoch": 9.765905383360522, + "grad_norm": 0.009391111321747303, + "learning_rate": 0.0006068781934524172, + "loss": 0.011, + "num_input_tokens_seen": 129302784, + "step": 59865 + }, + { + "epoch": 9.766721044045678, + "grad_norm": 0.014628876000642776, + "learning_rate": 0.0006068086577263426, + "loss": 0.1397, + "num_input_tokens_seen": 129314304, + "step": 59870 + }, + { + "epoch": 9.767536704730832, + "grad_norm": 0.29984965920448303, + "learning_rate": 0.0006067391198356551, + "loss": 0.0568, + "num_input_tokens_seen": 129325856, + "step": 59875 + }, + { + "epoch": 9.768352365415987, + "grad_norm": 0.06933252513408661, + "learning_rate": 0.0006066695797817638, + "loss": 0.0323, + "num_input_tokens_seen": 129337504, + "step": 59880 + }, + { + "epoch": 9.769168026101141, + "grad_norm": 0.3127744495868683, + "learning_rate": 0.0006066000375660782, + "loss": 0.1433, + "num_input_tokens_seen": 129350208, + "step": 59885 + }, + { + "epoch": 9.769983686786297, + "grad_norm": 0.02711273729801178, + "learning_rate": 0.0006065304931900076, + "loss": 0.0513, + "num_input_tokens_seen": 129362048, + "step": 59890 + }, + { + "epoch": 9.770799347471453, + "grad_norm": 0.18783365190029144, + "learning_rate": 0.0006064609466549614, + "loss": 0.0665, + "num_input_tokens_seen": 129372768, + "step": 59895 + }, + { + "epoch": 9.771615008156607, + "grad_norm": 0.928313672542572, + "learning_rate": 0.0006063913979623491, + "loss": 0.0947, + "num_input_tokens_seen": 129383584, + "step": 59900 + }, + { + "epoch": 9.772430668841762, + "grad_norm": 0.41291940212249756, + "learning_rate": 0.0006063218471135801, + "loss": 0.1186, + "num_input_tokens_seen": 129395136, + "step": 59905 + }, + { + "epoch": 9.773246329526916, + "grad_norm": 0.016780929639935493, + "learning_rate": 0.0006062522941100639, + "loss": 0.0408, + "num_input_tokens_seen": 129406720, + "step": 59910 + }, + { + "epoch": 9.774061990212072, + "grad_norm": 0.20784629881381989, + "learning_rate": 0.0006061827389532103, + "loss": 0.1078, + "num_input_tokens_seen": 129417888, + "step": 59915 + }, + { + "epoch": 9.774877650897226, + "grad_norm": 0.544096052646637, + "learning_rate": 0.0006061131816444287, + "loss": 0.071, + "num_input_tokens_seen": 129429088, + "step": 59920 + }, + { + "epoch": 9.775693311582382, + "grad_norm": 0.05459734797477722, + "learning_rate": 0.000606043622185129, + "loss": 0.0259, + "num_input_tokens_seen": 129439008, + "step": 59925 + }, + { + "epoch": 9.776508972267537, + "grad_norm": 0.05036766454577446, + "learning_rate": 0.0006059740605767207, + "loss": 0.1057, + "num_input_tokens_seen": 129450528, + "step": 59930 + }, + { + "epoch": 9.777324632952691, + "grad_norm": 0.010146408341825008, + "learning_rate": 0.0006059044968206136, + "loss": 0.0881, + "num_input_tokens_seen": 129460864, + "step": 59935 + }, + { + "epoch": 9.778140293637847, + "grad_norm": 0.00299668638035655, + "learning_rate": 0.0006058349309182176, + "loss": 0.0243, + "num_input_tokens_seen": 129471264, + "step": 59940 + }, + { + "epoch": 9.778955954323001, + "grad_norm": 0.00625614495947957, + "learning_rate": 0.0006057653628709424, + "loss": 0.1429, + "num_input_tokens_seen": 129482656, + "step": 59945 + }, + { + "epoch": 9.779771615008157, + "grad_norm": 0.15388129651546478, + "learning_rate": 0.0006056957926801979, + "loss": 0.0511, + "num_input_tokens_seen": 129493888, + "step": 59950 + }, + { + "epoch": 9.780587275693312, + "grad_norm": 0.35571151971817017, + "learning_rate": 0.0006056262203473941, + "loss": 0.2147, + "num_input_tokens_seen": 129505216, + "step": 59955 + }, + { + "epoch": 9.781402936378466, + "grad_norm": 0.028294721618294716, + "learning_rate": 0.000605556645873941, + "loss": 0.0188, + "num_input_tokens_seen": 129517216, + "step": 59960 + }, + { + "epoch": 9.782218597063622, + "grad_norm": 0.20071174204349518, + "learning_rate": 0.0006054870692612487, + "loss": 0.1073, + "num_input_tokens_seen": 129528896, + "step": 59965 + }, + { + "epoch": 9.783034257748776, + "grad_norm": 0.03279627487063408, + "learning_rate": 0.0006054174905107269, + "loss": 0.0299, + "num_input_tokens_seen": 129541088, + "step": 59970 + }, + { + "epoch": 9.783849918433932, + "grad_norm": 0.04848659038543701, + "learning_rate": 0.0006053479096237859, + "loss": 0.0347, + "num_input_tokens_seen": 129552320, + "step": 59975 + }, + { + "epoch": 9.784665579119086, + "grad_norm": 0.262071430683136, + "learning_rate": 0.000605278326601836, + "loss": 0.1985, + "num_input_tokens_seen": 129563104, + "step": 59980 + }, + { + "epoch": 9.785481239804241, + "grad_norm": 0.08661609143018723, + "learning_rate": 0.0006052087414462873, + "loss": 0.0616, + "num_input_tokens_seen": 129575168, + "step": 59985 + }, + { + "epoch": 9.786296900489397, + "grad_norm": 0.4128914177417755, + "learning_rate": 0.00060513915415855, + "loss": 0.0687, + "num_input_tokens_seen": 129586592, + "step": 59990 + }, + { + "epoch": 9.78711256117455, + "grad_norm": 0.101948581635952, + "learning_rate": 0.0006050695647400342, + "loss": 0.0472, + "num_input_tokens_seen": 129596416, + "step": 59995 + }, + { + "epoch": 9.787928221859707, + "grad_norm": 0.008494734764099121, + "learning_rate": 0.0006049999731921504, + "loss": 0.0759, + "num_input_tokens_seen": 129608256, + "step": 60000 + }, + { + "epoch": 9.78874388254486, + "grad_norm": 0.1565508395433426, + "learning_rate": 0.0006049303795163091, + "loss": 0.0226, + "num_input_tokens_seen": 129620160, + "step": 60005 + }, + { + "epoch": 9.789559543230016, + "grad_norm": 0.0611579567193985, + "learning_rate": 0.0006048607837139204, + "loss": 0.0151, + "num_input_tokens_seen": 129631264, + "step": 60010 + }, + { + "epoch": 9.790375203915172, + "grad_norm": 0.013933761976659298, + "learning_rate": 0.0006047911857863949, + "loss": 0.0899, + "num_input_tokens_seen": 129642048, + "step": 60015 + }, + { + "epoch": 9.791190864600326, + "grad_norm": 0.07861107587814331, + "learning_rate": 0.0006047215857351431, + "loss": 0.021, + "num_input_tokens_seen": 129652928, + "step": 60020 + }, + { + "epoch": 9.792006525285482, + "grad_norm": 0.005836360156536102, + "learning_rate": 0.0006046519835615756, + "loss": 0.036, + "num_input_tokens_seen": 129663008, + "step": 60025 + }, + { + "epoch": 9.792822185970635, + "grad_norm": 0.025735294446349144, + "learning_rate": 0.0006045823792671029, + "loss": 0.1123, + "num_input_tokens_seen": 129674688, + "step": 60030 + }, + { + "epoch": 9.793637846655791, + "grad_norm": 0.07211139798164368, + "learning_rate": 0.0006045127728531354, + "loss": 0.0211, + "num_input_tokens_seen": 129685312, + "step": 60035 + }, + { + "epoch": 9.794453507340947, + "grad_norm": 0.01239249762147665, + "learning_rate": 0.0006044431643210842, + "loss": 0.046, + "num_input_tokens_seen": 129697120, + "step": 60040 + }, + { + "epoch": 9.7952691680261, + "grad_norm": 0.15044008195400238, + "learning_rate": 0.0006043735536723595, + "loss": 0.0516, + "num_input_tokens_seen": 129708512, + "step": 60045 + }, + { + "epoch": 9.796084828711257, + "grad_norm": 0.1912791132926941, + "learning_rate": 0.0006043039409083726, + "loss": 0.0484, + "num_input_tokens_seen": 129719232, + "step": 60050 + }, + { + "epoch": 9.79690048939641, + "grad_norm": 0.013628056272864342, + "learning_rate": 0.0006042343260305339, + "loss": 0.0108, + "num_input_tokens_seen": 129729216, + "step": 60055 + }, + { + "epoch": 9.797716150081566, + "grad_norm": 0.024676060304045677, + "learning_rate": 0.0006041647090402544, + "loss": 0.0833, + "num_input_tokens_seen": 129740384, + "step": 60060 + }, + { + "epoch": 9.798531810766722, + "grad_norm": 0.018948128446936607, + "learning_rate": 0.0006040950899389449, + "loss": 0.0182, + "num_input_tokens_seen": 129750080, + "step": 60065 + }, + { + "epoch": 9.799347471451876, + "grad_norm": 0.06415722519159317, + "learning_rate": 0.0006040254687280163, + "loss": 0.037, + "num_input_tokens_seen": 129761216, + "step": 60070 + }, + { + "epoch": 9.800163132137031, + "grad_norm": 1.0381970405578613, + "learning_rate": 0.0006039558454088796, + "loss": 0.0945, + "num_input_tokens_seen": 129771872, + "step": 60075 + }, + { + "epoch": 9.800978792822185, + "grad_norm": 0.29000842571258545, + "learning_rate": 0.0006038862199829459, + "loss": 0.2056, + "num_input_tokens_seen": 129782240, + "step": 60080 + }, + { + "epoch": 9.801794453507341, + "grad_norm": 0.10297328978776932, + "learning_rate": 0.0006038165924516262, + "loss": 0.0503, + "num_input_tokens_seen": 129793664, + "step": 60085 + }, + { + "epoch": 9.802610114192497, + "grad_norm": 0.05766294151544571, + "learning_rate": 0.0006037469628163315, + "loss": 0.0227, + "num_input_tokens_seen": 129803904, + "step": 60090 + }, + { + "epoch": 9.80342577487765, + "grad_norm": 0.12936429679393768, + "learning_rate": 0.000603677331078473, + "loss": 0.0057, + "num_input_tokens_seen": 129814400, + "step": 60095 + }, + { + "epoch": 9.804241435562806, + "grad_norm": 0.004301194101572037, + "learning_rate": 0.0006036076972394618, + "loss": 0.0709, + "num_input_tokens_seen": 129825856, + "step": 60100 + }, + { + "epoch": 9.80505709624796, + "grad_norm": 0.008284702897071838, + "learning_rate": 0.0006035380613007093, + "loss": 0.1158, + "num_input_tokens_seen": 129836192, + "step": 60105 + }, + { + "epoch": 9.805872756933116, + "grad_norm": 0.03883660212159157, + "learning_rate": 0.0006034684232636266, + "loss": 0.0156, + "num_input_tokens_seen": 129847424, + "step": 60110 + }, + { + "epoch": 9.80668841761827, + "grad_norm": 0.0031957624014467, + "learning_rate": 0.0006033987831296251, + "loss": 0.0549, + "num_input_tokens_seen": 129858944, + "step": 60115 + }, + { + "epoch": 9.807504078303426, + "grad_norm": 0.0031466346699744463, + "learning_rate": 0.0006033291409001159, + "loss": 0.0405, + "num_input_tokens_seen": 129869056, + "step": 60120 + }, + { + "epoch": 9.808319738988581, + "grad_norm": 0.011443628929555416, + "learning_rate": 0.0006032594965765107, + "loss": 0.027, + "num_input_tokens_seen": 129878976, + "step": 60125 + }, + { + "epoch": 9.809135399673735, + "grad_norm": 0.006320877466350794, + "learning_rate": 0.0006031898501602207, + "loss": 0.1365, + "num_input_tokens_seen": 129889984, + "step": 60130 + }, + { + "epoch": 9.809951060358891, + "grad_norm": 0.25152286887168884, + "learning_rate": 0.0006031202016526576, + "loss": 0.0353, + "num_input_tokens_seen": 129900832, + "step": 60135 + }, + { + "epoch": 9.810766721044045, + "grad_norm": 0.09581360220909119, + "learning_rate": 0.0006030505510552329, + "loss": 0.1428, + "num_input_tokens_seen": 129911872, + "step": 60140 + }, + { + "epoch": 9.8115823817292, + "grad_norm": 0.02002377063035965, + "learning_rate": 0.0006029808983693579, + "loss": 0.0112, + "num_input_tokens_seen": 129919808, + "step": 60145 + }, + { + "epoch": 9.812398042414356, + "grad_norm": 0.03476516902446747, + "learning_rate": 0.0006029112435964444, + "loss": 0.016, + "num_input_tokens_seen": 129930848, + "step": 60150 + }, + { + "epoch": 9.81321370309951, + "grad_norm": 0.005062747281044722, + "learning_rate": 0.0006028415867379039, + "loss": 0.1012, + "num_input_tokens_seen": 129940576, + "step": 60155 + }, + { + "epoch": 9.814029363784666, + "grad_norm": 0.003687590127810836, + "learning_rate": 0.0006027719277951482, + "loss": 0.042, + "num_input_tokens_seen": 129952448, + "step": 60160 + }, + { + "epoch": 9.81484502446982, + "grad_norm": 0.006466528866440058, + "learning_rate": 0.000602702266769589, + "loss": 0.1049, + "num_input_tokens_seen": 129962752, + "step": 60165 + }, + { + "epoch": 9.815660685154976, + "grad_norm": 0.004529135767370462, + "learning_rate": 0.0006026326036626382, + "loss": 0.0156, + "num_input_tokens_seen": 129973184, + "step": 60170 + }, + { + "epoch": 9.81647634584013, + "grad_norm": 0.007038953714072704, + "learning_rate": 0.0006025629384757075, + "loss": 0.0083, + "num_input_tokens_seen": 129984608, + "step": 60175 + }, + { + "epoch": 9.817292006525285, + "grad_norm": 0.0019257472595199943, + "learning_rate": 0.0006024932712102085, + "loss": 0.0659, + "num_input_tokens_seen": 129995616, + "step": 60180 + }, + { + "epoch": 9.818107667210441, + "grad_norm": 0.004110215697437525, + "learning_rate": 0.0006024236018675537, + "loss": 0.0442, + "num_input_tokens_seen": 130007360, + "step": 60185 + }, + { + "epoch": 9.818923327895595, + "grad_norm": 0.15984342992305756, + "learning_rate": 0.0006023539304491544, + "loss": 0.0569, + "num_input_tokens_seen": 130018592, + "step": 60190 + }, + { + "epoch": 9.81973898858075, + "grad_norm": 0.11752087622880936, + "learning_rate": 0.000602284256956423, + "loss": 0.0347, + "num_input_tokens_seen": 130026720, + "step": 60195 + }, + { + "epoch": 9.820554649265905, + "grad_norm": 0.007008385378867388, + "learning_rate": 0.0006022145813907713, + "loss": 0.0883, + "num_input_tokens_seen": 130037728, + "step": 60200 + }, + { + "epoch": 9.82137030995106, + "grad_norm": 0.04539079964160919, + "learning_rate": 0.0006021449037536114, + "loss": 0.0214, + "num_input_tokens_seen": 130048928, + "step": 60205 + }, + { + "epoch": 9.822185970636216, + "grad_norm": 0.024069270119071007, + "learning_rate": 0.0006020752240463555, + "loss": 0.0293, + "num_input_tokens_seen": 130059968, + "step": 60210 + }, + { + "epoch": 9.82300163132137, + "grad_norm": 0.06953584402799606, + "learning_rate": 0.0006020055422704156, + "loss": 0.0119, + "num_input_tokens_seen": 130071712, + "step": 60215 + }, + { + "epoch": 9.823817292006526, + "grad_norm": 0.012529253028333187, + "learning_rate": 0.0006019358584272042, + "loss": 0.0079, + "num_input_tokens_seen": 130083360, + "step": 60220 + }, + { + "epoch": 9.82463295269168, + "grad_norm": 0.38807380199432373, + "learning_rate": 0.0006018661725181332, + "loss": 0.1423, + "num_input_tokens_seen": 130094368, + "step": 60225 + }, + { + "epoch": 9.825448613376835, + "grad_norm": 0.08964411914348602, + "learning_rate": 0.0006017964845446149, + "loss": 0.0273, + "num_input_tokens_seen": 130105184, + "step": 60230 + }, + { + "epoch": 9.826264274061991, + "grad_norm": 0.06751274317502975, + "learning_rate": 0.0006017267945080618, + "loss": 0.0312, + "num_input_tokens_seen": 130115584, + "step": 60235 + }, + { + "epoch": 9.827079934747145, + "grad_norm": 0.007398010231554508, + "learning_rate": 0.000601657102409886, + "loss": 0.0225, + "num_input_tokens_seen": 130126432, + "step": 60240 + }, + { + "epoch": 9.8278955954323, + "grad_norm": 0.4739130735397339, + "learning_rate": 0.0006015874082515003, + "loss": 0.1254, + "num_input_tokens_seen": 130137344, + "step": 60245 + }, + { + "epoch": 9.828711256117455, + "grad_norm": 0.06454982608556747, + "learning_rate": 0.0006015177120343168, + "loss": 0.0456, + "num_input_tokens_seen": 130148064, + "step": 60250 + }, + { + "epoch": 9.82952691680261, + "grad_norm": 0.035548772662878036, + "learning_rate": 0.000601448013759748, + "loss": 0.069, + "num_input_tokens_seen": 130159744, + "step": 60255 + }, + { + "epoch": 9.830342577487766, + "grad_norm": 0.07255587726831436, + "learning_rate": 0.0006013783134292067, + "loss": 0.0417, + "num_input_tokens_seen": 130171072, + "step": 60260 + }, + { + "epoch": 9.83115823817292, + "grad_norm": 0.0018127447692677379, + "learning_rate": 0.0006013086110441049, + "loss": 0.0485, + "num_input_tokens_seen": 130182400, + "step": 60265 + }, + { + "epoch": 9.831973898858076, + "grad_norm": 0.034299176186323166, + "learning_rate": 0.0006012389066058559, + "loss": 0.0827, + "num_input_tokens_seen": 130193568, + "step": 60270 + }, + { + "epoch": 9.83278955954323, + "grad_norm": 0.08214818686246872, + "learning_rate": 0.0006011692001158719, + "loss": 0.0305, + "num_input_tokens_seen": 130203680, + "step": 60275 + }, + { + "epoch": 9.833605220228385, + "grad_norm": 0.0023437021300196648, + "learning_rate": 0.0006010994915755659, + "loss": 0.006, + "num_input_tokens_seen": 130213632, + "step": 60280 + }, + { + "epoch": 9.83442088091354, + "grad_norm": 0.007850533351302147, + "learning_rate": 0.0006010297809863503, + "loss": 0.0241, + "num_input_tokens_seen": 130224128, + "step": 60285 + }, + { + "epoch": 9.835236541598695, + "grad_norm": 0.040578775107860565, + "learning_rate": 0.000600960068349638, + "loss": 0.1383, + "num_input_tokens_seen": 130235648, + "step": 60290 + }, + { + "epoch": 9.83605220228385, + "grad_norm": 0.08046291023492813, + "learning_rate": 0.000600890353666842, + "loss": 0.0305, + "num_input_tokens_seen": 130245792, + "step": 60295 + }, + { + "epoch": 9.836867862969005, + "grad_norm": 0.002211250364780426, + "learning_rate": 0.0006008206369393748, + "loss": 0.1303, + "num_input_tokens_seen": 130255744, + "step": 60300 + }, + { + "epoch": 9.83768352365416, + "grad_norm": 0.1887092888355255, + "learning_rate": 0.0006007509181686496, + "loss": 0.15, + "num_input_tokens_seen": 130266080, + "step": 60305 + }, + { + "epoch": 9.838499184339314, + "grad_norm": 0.5672429800033569, + "learning_rate": 0.0006006811973560792, + "loss": 0.1197, + "num_input_tokens_seen": 130277344, + "step": 60310 + }, + { + "epoch": 9.83931484502447, + "grad_norm": 0.398388534784317, + "learning_rate": 0.0006006114745030766, + "loss": 0.0883, + "num_input_tokens_seen": 130286656, + "step": 60315 + }, + { + "epoch": 9.840130505709626, + "grad_norm": 0.003656855085864663, + "learning_rate": 0.0006005417496110549, + "loss": 0.0308, + "num_input_tokens_seen": 130297344, + "step": 60320 + }, + { + "epoch": 9.84094616639478, + "grad_norm": 0.02871028333902359, + "learning_rate": 0.0006004720226814271, + "loss": 0.0243, + "num_input_tokens_seen": 130307200, + "step": 60325 + }, + { + "epoch": 9.841761827079935, + "grad_norm": 0.3796071410179138, + "learning_rate": 0.0006004022937156062, + "loss": 0.1448, + "num_input_tokens_seen": 130318144, + "step": 60330 + }, + { + "epoch": 9.84257748776509, + "grad_norm": 0.018963851034641266, + "learning_rate": 0.0006003325627150054, + "loss": 0.0183, + "num_input_tokens_seen": 130327904, + "step": 60335 + }, + { + "epoch": 9.843393148450245, + "grad_norm": 0.046277206391096115, + "learning_rate": 0.0006002628296810381, + "loss": 0.0121, + "num_input_tokens_seen": 130338176, + "step": 60340 + }, + { + "epoch": 9.844208809135399, + "grad_norm": 0.0019071658607572317, + "learning_rate": 0.0006001930946151172, + "loss": 0.0116, + "num_input_tokens_seen": 130349600, + "step": 60345 + }, + { + "epoch": 9.845024469820554, + "grad_norm": 0.010552327148616314, + "learning_rate": 0.0006001233575186563, + "loss": 0.1436, + "num_input_tokens_seen": 130360352, + "step": 60350 + }, + { + "epoch": 9.84584013050571, + "grad_norm": 0.023848099634051323, + "learning_rate": 0.0006000536183930684, + "loss": 0.0938, + "num_input_tokens_seen": 130370944, + "step": 60355 + }, + { + "epoch": 9.846655791190864, + "grad_norm": 0.08024164289236069, + "learning_rate": 0.000599983877239767, + "loss": 0.0648, + "num_input_tokens_seen": 130382336, + "step": 60360 + }, + { + "epoch": 9.84747145187602, + "grad_norm": 0.018898021429777145, + "learning_rate": 0.0005999141340601657, + "loss": 0.01, + "num_input_tokens_seen": 130392480, + "step": 60365 + }, + { + "epoch": 9.848287112561174, + "grad_norm": 0.03573465347290039, + "learning_rate": 0.0005998443888556776, + "loss": 0.0339, + "num_input_tokens_seen": 130403360, + "step": 60370 + }, + { + "epoch": 9.84910277324633, + "grad_norm": 0.09511330723762512, + "learning_rate": 0.0005997746416277162, + "loss": 0.0313, + "num_input_tokens_seen": 130414432, + "step": 60375 + }, + { + "epoch": 9.849918433931485, + "grad_norm": 0.00512855825945735, + "learning_rate": 0.0005997048923776953, + "loss": 0.0605, + "num_input_tokens_seen": 130424992, + "step": 60380 + }, + { + "epoch": 9.850734094616639, + "grad_norm": 0.26799216866493225, + "learning_rate": 0.000599635141107028, + "loss": 0.0399, + "num_input_tokens_seen": 130433920, + "step": 60385 + }, + { + "epoch": 9.851549755301795, + "grad_norm": 0.007397581823170185, + "learning_rate": 0.0005995653878171283, + "loss": 0.0136, + "num_input_tokens_seen": 130444544, + "step": 60390 + }, + { + "epoch": 9.852365415986949, + "grad_norm": 0.6185172200202942, + "learning_rate": 0.0005994956325094099, + "loss": 0.0705, + "num_input_tokens_seen": 130455360, + "step": 60395 + }, + { + "epoch": 9.853181076672104, + "grad_norm": 0.03895857185125351, + "learning_rate": 0.000599425875185286, + "loss": 0.2108, + "num_input_tokens_seen": 130465824, + "step": 60400 + }, + { + "epoch": 9.85399673735726, + "grad_norm": 0.0034981996286660433, + "learning_rate": 0.0005993561158461708, + "loss": 0.0399, + "num_input_tokens_seen": 130477376, + "step": 60405 + }, + { + "epoch": 9.854812398042414, + "grad_norm": 0.003301647724583745, + "learning_rate": 0.0005992863544934777, + "loss": 0.1199, + "num_input_tokens_seen": 130487328, + "step": 60410 + }, + { + "epoch": 9.85562805872757, + "grad_norm": 0.02783522754907608, + "learning_rate": 0.000599216591128621, + "loss": 0.031, + "num_input_tokens_seen": 130497248, + "step": 60415 + }, + { + "epoch": 9.856443719412724, + "grad_norm": 0.004357766360044479, + "learning_rate": 0.000599146825753014, + "loss": 0.1704, + "num_input_tokens_seen": 130507520, + "step": 60420 + }, + { + "epoch": 9.85725938009788, + "grad_norm": 0.004188814666122198, + "learning_rate": 0.0005990770583680707, + "loss": 0.0326, + "num_input_tokens_seen": 130519488, + "step": 60425 + }, + { + "epoch": 9.858075040783035, + "grad_norm": 0.10563724488019943, + "learning_rate": 0.0005990072889752052, + "loss": 0.0153, + "num_input_tokens_seen": 130530304, + "step": 60430 + }, + { + "epoch": 9.858890701468189, + "grad_norm": 0.052509855479002, + "learning_rate": 0.0005989375175758315, + "loss": 0.0174, + "num_input_tokens_seen": 130540544, + "step": 60435 + }, + { + "epoch": 9.859706362153345, + "grad_norm": 0.013773845508694649, + "learning_rate": 0.0005988677441713633, + "loss": 0.0949, + "num_input_tokens_seen": 130550336, + "step": 60440 + }, + { + "epoch": 9.860522022838499, + "grad_norm": 0.20639842748641968, + "learning_rate": 0.000598797968763215, + "loss": 0.0226, + "num_input_tokens_seen": 130560896, + "step": 60445 + }, + { + "epoch": 9.861337683523654, + "grad_norm": 0.009240848943591118, + "learning_rate": 0.0005987281913528006, + "loss": 0.0283, + "num_input_tokens_seen": 130572064, + "step": 60450 + }, + { + "epoch": 9.86215334420881, + "grad_norm": 0.025932233780622482, + "learning_rate": 0.0005986584119415339, + "loss": 0.1094, + "num_input_tokens_seen": 130583328, + "step": 60455 + }, + { + "epoch": 9.862969004893964, + "grad_norm": 0.009651134721934795, + "learning_rate": 0.0005985886305308295, + "loss": 0.0137, + "num_input_tokens_seen": 130591840, + "step": 60460 + }, + { + "epoch": 9.86378466557912, + "grad_norm": 0.030490349978208542, + "learning_rate": 0.0005985188471221014, + "loss": 0.0965, + "num_input_tokens_seen": 130603040, + "step": 60465 + }, + { + "epoch": 9.864600326264274, + "grad_norm": 0.05847940593957901, + "learning_rate": 0.0005984490617167639, + "loss": 0.0361, + "num_input_tokens_seen": 130614208, + "step": 60470 + }, + { + "epoch": 9.86541598694943, + "grad_norm": 0.13259384036064148, + "learning_rate": 0.0005983792743162313, + "loss": 0.033, + "num_input_tokens_seen": 130624064, + "step": 60475 + }, + { + "epoch": 9.866231647634583, + "grad_norm": 0.007011461537331343, + "learning_rate": 0.0005983094849219177, + "loss": 0.0917, + "num_input_tokens_seen": 130633984, + "step": 60480 + }, + { + "epoch": 9.867047308319739, + "grad_norm": 0.007801242638379335, + "learning_rate": 0.0005982396935352379, + "loss": 0.0606, + "num_input_tokens_seen": 130644896, + "step": 60485 + }, + { + "epoch": 9.867862969004895, + "grad_norm": 0.04473032057285309, + "learning_rate": 0.000598169900157606, + "loss": 0.1836, + "num_input_tokens_seen": 130654752, + "step": 60490 + }, + { + "epoch": 9.868678629690049, + "grad_norm": 0.29122939705848694, + "learning_rate": 0.0005981001047904365, + "loss": 0.0549, + "num_input_tokens_seen": 130665568, + "step": 60495 + }, + { + "epoch": 9.869494290375204, + "grad_norm": 0.007103382144123316, + "learning_rate": 0.000598030307435144, + "loss": 0.0387, + "num_input_tokens_seen": 130675904, + "step": 60500 + }, + { + "epoch": 9.870309951060358, + "grad_norm": 0.04765298217535019, + "learning_rate": 0.000597960508093143, + "loss": 0.0485, + "num_input_tokens_seen": 130686464, + "step": 60505 + }, + { + "epoch": 9.871125611745514, + "grad_norm": 0.009073898196220398, + "learning_rate": 0.0005978907067658479, + "loss": 0.0195, + "num_input_tokens_seen": 130697568, + "step": 60510 + }, + { + "epoch": 9.87194127243067, + "grad_norm": 0.01489250548183918, + "learning_rate": 0.0005978209034546736, + "loss": 0.0561, + "num_input_tokens_seen": 130708480, + "step": 60515 + }, + { + "epoch": 9.872756933115824, + "grad_norm": 0.3078218400478363, + "learning_rate": 0.0005977510981610344, + "loss": 0.1301, + "num_input_tokens_seen": 130719136, + "step": 60520 + }, + { + "epoch": 9.87357259380098, + "grad_norm": 0.0026563978753983974, + "learning_rate": 0.0005976812908863454, + "loss": 0.0459, + "num_input_tokens_seen": 130730368, + "step": 60525 + }, + { + "epoch": 9.874388254486133, + "grad_norm": 0.02801201492547989, + "learning_rate": 0.0005976114816320208, + "loss": 0.0951, + "num_input_tokens_seen": 130740544, + "step": 60530 + }, + { + "epoch": 9.875203915171289, + "grad_norm": 0.03558781370520592, + "learning_rate": 0.000597541670399476, + "loss": 0.0783, + "num_input_tokens_seen": 130752448, + "step": 60535 + }, + { + "epoch": 9.876019575856443, + "grad_norm": 0.15019097924232483, + "learning_rate": 0.0005974718571901254, + "loss": 0.0271, + "num_input_tokens_seen": 130763520, + "step": 60540 + }, + { + "epoch": 9.876835236541599, + "grad_norm": 0.005682834889739752, + "learning_rate": 0.0005974020420053841, + "loss": 0.0042, + "num_input_tokens_seen": 130774560, + "step": 60545 + }, + { + "epoch": 9.877650897226754, + "grad_norm": 0.40169715881347656, + "learning_rate": 0.0005973322248466666, + "loss": 0.1045, + "num_input_tokens_seen": 130784000, + "step": 60550 + }, + { + "epoch": 9.878466557911908, + "grad_norm": 0.014299996197223663, + "learning_rate": 0.0005972624057153882, + "loss": 0.0572, + "num_input_tokens_seen": 130793600, + "step": 60555 + }, + { + "epoch": 9.879282218597064, + "grad_norm": 0.006953614763915539, + "learning_rate": 0.0005971925846129639, + "loss": 0.0291, + "num_input_tokens_seen": 130805728, + "step": 60560 + }, + { + "epoch": 9.880097879282218, + "grad_norm": 0.007064024917781353, + "learning_rate": 0.0005971227615408084, + "loss": 0.015, + "num_input_tokens_seen": 130815936, + "step": 60565 + }, + { + "epoch": 9.880913539967374, + "grad_norm": 0.15870408713817596, + "learning_rate": 0.0005970529365003371, + "loss": 0.1804, + "num_input_tokens_seen": 130827456, + "step": 60570 + }, + { + "epoch": 9.88172920065253, + "grad_norm": 0.0761968344449997, + "learning_rate": 0.0005969831094929648, + "loss": 0.0119, + "num_input_tokens_seen": 130837664, + "step": 60575 + }, + { + "epoch": 9.882544861337683, + "grad_norm": 0.0183928981423378, + "learning_rate": 0.0005969132805201067, + "loss": 0.0249, + "num_input_tokens_seen": 130848928, + "step": 60580 + }, + { + "epoch": 9.883360522022839, + "grad_norm": 0.3906119465827942, + "learning_rate": 0.0005968434495831781, + "loss": 0.029, + "num_input_tokens_seen": 130860640, + "step": 60585 + }, + { + "epoch": 9.884176182707993, + "grad_norm": 0.004583487752825022, + "learning_rate": 0.000596773616683594, + "loss": 0.0393, + "num_input_tokens_seen": 130872768, + "step": 60590 + }, + { + "epoch": 9.884991843393149, + "grad_norm": 0.020378025248646736, + "learning_rate": 0.0005967037818227701, + "loss": 0.0306, + "num_input_tokens_seen": 130883360, + "step": 60595 + }, + { + "epoch": 9.885807504078304, + "grad_norm": 0.018993549048900604, + "learning_rate": 0.0005966339450021212, + "loss": 0.016, + "num_input_tokens_seen": 130893984, + "step": 60600 + }, + { + "epoch": 9.886623164763458, + "grad_norm": 0.0059882765635848045, + "learning_rate": 0.0005965641062230627, + "loss": 0.0176, + "num_input_tokens_seen": 130905152, + "step": 60605 + }, + { + "epoch": 9.887438825448614, + "grad_norm": 0.024164140224456787, + "learning_rate": 0.0005964942654870103, + "loss": 0.0178, + "num_input_tokens_seen": 130914368, + "step": 60610 + }, + { + "epoch": 9.888254486133768, + "grad_norm": 0.1327124685049057, + "learning_rate": 0.0005964244227953791, + "loss": 0.0153, + "num_input_tokens_seen": 130925088, + "step": 60615 + }, + { + "epoch": 9.889070146818923, + "grad_norm": 0.027247849851846695, + "learning_rate": 0.0005963545781495847, + "loss": 0.1065, + "num_input_tokens_seen": 130933344, + "step": 60620 + }, + { + "epoch": 9.88988580750408, + "grad_norm": 0.1004333570599556, + "learning_rate": 0.0005962847315510426, + "loss": 0.0409, + "num_input_tokens_seen": 130944448, + "step": 60625 + }, + { + "epoch": 9.890701468189233, + "grad_norm": 0.01161018293350935, + "learning_rate": 0.0005962148830011681, + "loss": 0.0105, + "num_input_tokens_seen": 130955040, + "step": 60630 + }, + { + "epoch": 9.891517128874389, + "grad_norm": 0.4277346134185791, + "learning_rate": 0.0005961450325013771, + "loss": 0.0289, + "num_input_tokens_seen": 130966400, + "step": 60635 + }, + { + "epoch": 9.892332789559543, + "grad_norm": 0.1819687783718109, + "learning_rate": 0.0005960751800530849, + "loss": 0.0179, + "num_input_tokens_seen": 130978176, + "step": 60640 + }, + { + "epoch": 9.893148450244698, + "grad_norm": 0.027873460203409195, + "learning_rate": 0.0005960053256577073, + "loss": 0.0477, + "num_input_tokens_seen": 130988352, + "step": 60645 + }, + { + "epoch": 9.893964110929852, + "grad_norm": 0.0183260440826416, + "learning_rate": 0.0005959354693166601, + "loss": 0.0455, + "num_input_tokens_seen": 130998560, + "step": 60650 + }, + { + "epoch": 9.894779771615008, + "grad_norm": 0.2565465271472931, + "learning_rate": 0.0005958656110313589, + "loss": 0.189, + "num_input_tokens_seen": 131008352, + "step": 60655 + }, + { + "epoch": 9.895595432300164, + "grad_norm": 0.08927177637815475, + "learning_rate": 0.0005957957508032194, + "loss": 0.0242, + "num_input_tokens_seen": 131019232, + "step": 60660 + }, + { + "epoch": 9.896411092985318, + "grad_norm": 0.0050674197264015675, + "learning_rate": 0.0005957258886336575, + "loss": 0.0053, + "num_input_tokens_seen": 131030336, + "step": 60665 + }, + { + "epoch": 9.897226753670473, + "grad_norm": 0.06827183067798615, + "learning_rate": 0.0005956560245240891, + "loss": 0.1387, + "num_input_tokens_seen": 131040416, + "step": 60670 + }, + { + "epoch": 9.898042414355627, + "grad_norm": 0.04122605174779892, + "learning_rate": 0.0005955861584759298, + "loss": 0.0608, + "num_input_tokens_seen": 131050624, + "step": 60675 + }, + { + "epoch": 9.898858075040783, + "grad_norm": 0.7276883125305176, + "learning_rate": 0.0005955162904905959, + "loss": 0.0778, + "num_input_tokens_seen": 131061600, + "step": 60680 + }, + { + "epoch": 9.899673735725939, + "grad_norm": 0.1265845149755478, + "learning_rate": 0.0005954464205695033, + "loss": 0.0996, + "num_input_tokens_seen": 131070848, + "step": 60685 + }, + { + "epoch": 9.900489396411093, + "grad_norm": 0.01593145728111267, + "learning_rate": 0.0005953765487140678, + "loss": 0.0745, + "num_input_tokens_seen": 131081888, + "step": 60690 + }, + { + "epoch": 9.901305057096248, + "grad_norm": 0.002230089157819748, + "learning_rate": 0.0005953066749257055, + "loss": 0.0098, + "num_input_tokens_seen": 131092896, + "step": 60695 + }, + { + "epoch": 9.902120717781402, + "grad_norm": 0.010627939365804195, + "learning_rate": 0.0005952367992058326, + "loss": 0.0212, + "num_input_tokens_seen": 131103936, + "step": 60700 + }, + { + "epoch": 9.902936378466558, + "grad_norm": 0.08064654469490051, + "learning_rate": 0.0005951669215558651, + "loss": 0.0163, + "num_input_tokens_seen": 131115424, + "step": 60705 + }, + { + "epoch": 9.903752039151712, + "grad_norm": 0.006806948687881231, + "learning_rate": 0.0005950970419772192, + "loss": 0.089, + "num_input_tokens_seen": 131126752, + "step": 60710 + }, + { + "epoch": 9.904567699836868, + "grad_norm": 0.1305762231349945, + "learning_rate": 0.0005950271604713111, + "loss": 0.1594, + "num_input_tokens_seen": 131138688, + "step": 60715 + }, + { + "epoch": 9.905383360522023, + "grad_norm": 0.056780025362968445, + "learning_rate": 0.000594957277039557, + "loss": 0.0415, + "num_input_tokens_seen": 131149440, + "step": 60720 + }, + { + "epoch": 9.906199021207177, + "grad_norm": 0.0119969192892313, + "learning_rate": 0.0005948873916833733, + "loss": 0.0214, + "num_input_tokens_seen": 131160768, + "step": 60725 + }, + { + "epoch": 9.907014681892333, + "grad_norm": 0.0028076330199837685, + "learning_rate": 0.0005948175044041764, + "loss": 0.0179, + "num_input_tokens_seen": 131172096, + "step": 60730 + }, + { + "epoch": 9.907830342577487, + "grad_norm": 0.4447990357875824, + "learning_rate": 0.0005947476152033822, + "loss": 0.0369, + "num_input_tokens_seen": 131182016, + "step": 60735 + }, + { + "epoch": 9.908646003262643, + "grad_norm": 0.6924632787704468, + "learning_rate": 0.0005946777240824076, + "loss": 0.1668, + "num_input_tokens_seen": 131193536, + "step": 60740 + }, + { + "epoch": 9.909461663947798, + "grad_norm": 0.015939053148031235, + "learning_rate": 0.0005946078310426687, + "loss": 0.1175, + "num_input_tokens_seen": 131204448, + "step": 60745 + }, + { + "epoch": 9.910277324632952, + "grad_norm": 0.0262970682233572, + "learning_rate": 0.000594537936085582, + "loss": 0.0461, + "num_input_tokens_seen": 131216384, + "step": 60750 + }, + { + "epoch": 9.911092985318108, + "grad_norm": 0.05035187304019928, + "learning_rate": 0.0005944680392125643, + "loss": 0.0649, + "num_input_tokens_seen": 131228064, + "step": 60755 + }, + { + "epoch": 9.911908646003262, + "grad_norm": 0.487958163022995, + "learning_rate": 0.0005943981404250318, + "loss": 0.0762, + "num_input_tokens_seen": 131239264, + "step": 60760 + }, + { + "epoch": 9.912724306688418, + "grad_norm": 0.003221668768674135, + "learning_rate": 0.0005943282397244013, + "loss": 0.0295, + "num_input_tokens_seen": 131251040, + "step": 60765 + }, + { + "epoch": 9.913539967373573, + "grad_norm": 0.16377022862434387, + "learning_rate": 0.0005942583371120893, + "loss": 0.0312, + "num_input_tokens_seen": 131261568, + "step": 60770 + }, + { + "epoch": 9.914355628058727, + "grad_norm": 0.024691110476851463, + "learning_rate": 0.0005941884325895127, + "loss": 0.0818, + "num_input_tokens_seen": 131272768, + "step": 60775 + }, + { + "epoch": 9.915171288743883, + "grad_norm": 0.026219695806503296, + "learning_rate": 0.0005941185261580878, + "loss": 0.0304, + "num_input_tokens_seen": 131282240, + "step": 60780 + }, + { + "epoch": 9.915986949429037, + "grad_norm": 0.003964452538639307, + "learning_rate": 0.0005940486178192317, + "loss": 0.1644, + "num_input_tokens_seen": 131292608, + "step": 60785 + }, + { + "epoch": 9.916802610114193, + "grad_norm": 0.03120097890496254, + "learning_rate": 0.000593978707574361, + "loss": 0.011, + "num_input_tokens_seen": 131303040, + "step": 60790 + }, + { + "epoch": 9.917618270799348, + "grad_norm": 0.05793861672282219, + "learning_rate": 0.0005939087954248926, + "loss": 0.0545, + "num_input_tokens_seen": 131314208, + "step": 60795 + }, + { + "epoch": 9.918433931484502, + "grad_norm": 0.005540280602872372, + "learning_rate": 0.0005938388813722432, + "loss": 0.0708, + "num_input_tokens_seen": 131324832, + "step": 60800 + }, + { + "epoch": 9.919249592169658, + "grad_norm": 0.431779146194458, + "learning_rate": 0.0005937689654178298, + "loss": 0.0786, + "num_input_tokens_seen": 131335904, + "step": 60805 + }, + { + "epoch": 9.920065252854812, + "grad_norm": 0.2070673108100891, + "learning_rate": 0.0005936990475630696, + "loss": 0.0929, + "num_input_tokens_seen": 131346368, + "step": 60810 + }, + { + "epoch": 9.920880913539968, + "grad_norm": 0.06501749902963638, + "learning_rate": 0.0005936291278093793, + "loss": 0.1148, + "num_input_tokens_seen": 131357216, + "step": 60815 + }, + { + "epoch": 9.921696574225122, + "grad_norm": 0.0018129857489839196, + "learning_rate": 0.0005935592061581758, + "loss": 0.0093, + "num_input_tokens_seen": 131367200, + "step": 60820 + }, + { + "epoch": 9.922512234910277, + "grad_norm": 0.028841711580753326, + "learning_rate": 0.0005934892826108764, + "loss": 0.0269, + "num_input_tokens_seen": 131379264, + "step": 60825 + }, + { + "epoch": 9.923327895595433, + "grad_norm": 0.01920981891453266, + "learning_rate": 0.0005934193571688981, + "loss": 0.0094, + "num_input_tokens_seen": 131389888, + "step": 60830 + }, + { + "epoch": 9.924143556280587, + "grad_norm": 0.013067704625427723, + "learning_rate": 0.0005933494298336579, + "loss": 0.0357, + "num_input_tokens_seen": 131401088, + "step": 60835 + }, + { + "epoch": 9.924959216965743, + "grad_norm": 0.0020066085271537304, + "learning_rate": 0.0005932795006065732, + "loss": 0.0285, + "num_input_tokens_seen": 131412320, + "step": 60840 + }, + { + "epoch": 9.925774877650896, + "grad_norm": 0.04752558097243309, + "learning_rate": 0.000593209569489061, + "loss": 0.02, + "num_input_tokens_seen": 131422880, + "step": 60845 + }, + { + "epoch": 9.926590538336052, + "grad_norm": 0.045283108949661255, + "learning_rate": 0.0005931396364825387, + "loss": 0.0159, + "num_input_tokens_seen": 131434272, + "step": 60850 + }, + { + "epoch": 9.927406199021208, + "grad_norm": 0.021290739998221397, + "learning_rate": 0.0005930697015884234, + "loss": 0.0388, + "num_input_tokens_seen": 131444736, + "step": 60855 + }, + { + "epoch": 9.928221859706362, + "grad_norm": 0.011512097902595997, + "learning_rate": 0.0005929997648081327, + "loss": 0.1208, + "num_input_tokens_seen": 131455104, + "step": 60860 + }, + { + "epoch": 9.929037520391518, + "grad_norm": 0.03910503908991814, + "learning_rate": 0.0005929298261430837, + "loss": 0.0365, + "num_input_tokens_seen": 131466272, + "step": 60865 + }, + { + "epoch": 9.929853181076671, + "grad_norm": 0.21563324332237244, + "learning_rate": 0.0005928598855946939, + "loss": 0.0183, + "num_input_tokens_seen": 131477856, + "step": 60870 + }, + { + "epoch": 9.930668841761827, + "grad_norm": 0.12936514616012573, + "learning_rate": 0.0005927899431643807, + "loss": 0.0206, + "num_input_tokens_seen": 131488288, + "step": 60875 + }, + { + "epoch": 9.931484502446983, + "grad_norm": 0.04893166944384575, + "learning_rate": 0.0005927199988535616, + "loss": 0.0832, + "num_input_tokens_seen": 131498848, + "step": 60880 + }, + { + "epoch": 9.932300163132137, + "grad_norm": 0.006314895115792751, + "learning_rate": 0.0005926500526636542, + "loss": 0.0453, + "num_input_tokens_seen": 131509440, + "step": 60885 + }, + { + "epoch": 9.933115823817293, + "grad_norm": 0.018990257754921913, + "learning_rate": 0.0005925801045960757, + "loss": 0.0191, + "num_input_tokens_seen": 131519840, + "step": 60890 + }, + { + "epoch": 9.933931484502446, + "grad_norm": 0.17692513763904572, + "learning_rate": 0.0005925101546522441, + "loss": 0.072, + "num_input_tokens_seen": 131530400, + "step": 60895 + }, + { + "epoch": 9.934747145187602, + "grad_norm": 0.008013364858925343, + "learning_rate": 0.0005924402028335769, + "loss": 0.0433, + "num_input_tokens_seen": 131540000, + "step": 60900 + }, + { + "epoch": 9.935562805872756, + "grad_norm": 0.1491565853357315, + "learning_rate": 0.0005923702491414916, + "loss": 0.0657, + "num_input_tokens_seen": 131552320, + "step": 60905 + }, + { + "epoch": 9.936378466557912, + "grad_norm": 0.11109715700149536, + "learning_rate": 0.000592300293577406, + "loss": 0.0222, + "num_input_tokens_seen": 131563840, + "step": 60910 + }, + { + "epoch": 9.937194127243067, + "grad_norm": 0.08512434363365173, + "learning_rate": 0.0005922303361427379, + "loss": 0.1019, + "num_input_tokens_seen": 131575488, + "step": 60915 + }, + { + "epoch": 9.938009787928221, + "grad_norm": 0.058436449617147446, + "learning_rate": 0.0005921603768389051, + "loss": 0.0415, + "num_input_tokens_seen": 131586080, + "step": 60920 + }, + { + "epoch": 9.938825448613377, + "grad_norm": 0.02144043892621994, + "learning_rate": 0.0005920904156673254, + "loss": 0.0409, + "num_input_tokens_seen": 131596096, + "step": 60925 + }, + { + "epoch": 9.939641109298531, + "grad_norm": 0.04910271614789963, + "learning_rate": 0.0005920204526294165, + "loss": 0.0446, + "num_input_tokens_seen": 131606304, + "step": 60930 + }, + { + "epoch": 9.940456769983687, + "grad_norm": 0.09139198809862137, + "learning_rate": 0.0005919504877265965, + "loss": 0.0119, + "num_input_tokens_seen": 131617312, + "step": 60935 + }, + { + "epoch": 9.941272430668842, + "grad_norm": 0.020040472969412804, + "learning_rate": 0.000591880520960283, + "loss": 0.1875, + "num_input_tokens_seen": 131627840, + "step": 60940 + }, + { + "epoch": 9.942088091353996, + "grad_norm": 0.07437156140804291, + "learning_rate": 0.0005918105523318944, + "loss": 0.0518, + "num_input_tokens_seen": 131639168, + "step": 60945 + }, + { + "epoch": 9.942903752039152, + "grad_norm": 0.009552833624184132, + "learning_rate": 0.0005917405818428484, + "loss": 0.1345, + "num_input_tokens_seen": 131649728, + "step": 60950 + }, + { + "epoch": 9.943719412724306, + "grad_norm": 0.12387213110923767, + "learning_rate": 0.0005916706094945631, + "loss": 0.2201, + "num_input_tokens_seen": 131659616, + "step": 60955 + }, + { + "epoch": 9.944535073409462, + "grad_norm": 0.010083312168717384, + "learning_rate": 0.0005916006352884567, + "loss": 0.0324, + "num_input_tokens_seen": 131669952, + "step": 60960 + }, + { + "epoch": 9.945350734094617, + "grad_norm": 0.19099733233451843, + "learning_rate": 0.0005915306592259471, + "loss": 0.0257, + "num_input_tokens_seen": 131679872, + "step": 60965 + }, + { + "epoch": 9.946166394779771, + "grad_norm": 0.08524459600448608, + "learning_rate": 0.0005914606813084526, + "loss": 0.0378, + "num_input_tokens_seen": 131690496, + "step": 60970 + }, + { + "epoch": 9.946982055464927, + "grad_norm": 0.3759321868419647, + "learning_rate": 0.0005913907015373915, + "loss": 0.0468, + "num_input_tokens_seen": 131701248, + "step": 60975 + }, + { + "epoch": 9.947797716150081, + "grad_norm": 0.00874224305152893, + "learning_rate": 0.0005913207199141818, + "loss": 0.0293, + "num_input_tokens_seen": 131712640, + "step": 60980 + }, + { + "epoch": 9.948613376835237, + "grad_norm": 0.49674367904663086, + "learning_rate": 0.0005912507364402419, + "loss": 0.0602, + "num_input_tokens_seen": 131723680, + "step": 60985 + }, + { + "epoch": 9.949429037520392, + "grad_norm": 0.010632969439029694, + "learning_rate": 0.0005911807511169899, + "loss": 0.1221, + "num_input_tokens_seen": 131734336, + "step": 60990 + }, + { + "epoch": 9.950244698205546, + "grad_norm": 0.24199523031711578, + "learning_rate": 0.0005911107639458444, + "loss": 0.0444, + "num_input_tokens_seen": 131744864, + "step": 60995 + }, + { + "epoch": 9.951060358890702, + "grad_norm": 0.0021362698171287775, + "learning_rate": 0.0005910407749282237, + "loss": 0.0504, + "num_input_tokens_seen": 131756128, + "step": 61000 + }, + { + "epoch": 9.951876019575856, + "grad_norm": 0.01978922076523304, + "learning_rate": 0.0005909707840655462, + "loss": 0.0828, + "num_input_tokens_seen": 131766528, + "step": 61005 + }, + { + "epoch": 9.952691680261012, + "grad_norm": 0.5361757278442383, + "learning_rate": 0.0005909007913592304, + "loss": 0.0965, + "num_input_tokens_seen": 131776096, + "step": 61010 + }, + { + "epoch": 9.953507340946166, + "grad_norm": 0.02593553252518177, + "learning_rate": 0.0005908307968106948, + "loss": 0.1404, + "num_input_tokens_seen": 131787328, + "step": 61015 + }, + { + "epoch": 9.954323001631321, + "grad_norm": 0.3522297441959381, + "learning_rate": 0.0005907608004213577, + "loss": 0.1902, + "num_input_tokens_seen": 131797856, + "step": 61020 + }, + { + "epoch": 9.955138662316477, + "grad_norm": 0.29401418566703796, + "learning_rate": 0.0005906908021926379, + "loss": 0.0445, + "num_input_tokens_seen": 131808224, + "step": 61025 + }, + { + "epoch": 9.955954323001631, + "grad_norm": 0.022986039519309998, + "learning_rate": 0.000590620802125954, + "loss": 0.0731, + "num_input_tokens_seen": 131818976, + "step": 61030 + }, + { + "epoch": 9.956769983686787, + "grad_norm": 0.43107253313064575, + "learning_rate": 0.0005905508002227247, + "loss": 0.0791, + "num_input_tokens_seen": 131829184, + "step": 61035 + }, + { + "epoch": 9.95758564437194, + "grad_norm": 0.009766715578734875, + "learning_rate": 0.0005904807964843684, + "loss": 0.0151, + "num_input_tokens_seen": 131840608, + "step": 61040 + }, + { + "epoch": 9.958401305057096, + "grad_norm": 0.27253955602645874, + "learning_rate": 0.0005904107909123039, + "loss": 0.1741, + "num_input_tokens_seen": 131850336, + "step": 61045 + }, + { + "epoch": 9.959216965742252, + "grad_norm": 0.35473403334617615, + "learning_rate": 0.0005903407835079502, + "loss": 0.0686, + "num_input_tokens_seen": 131860800, + "step": 61050 + }, + { + "epoch": 9.960032626427406, + "grad_norm": 0.0052872272208333015, + "learning_rate": 0.000590270774272726, + "loss": 0.0473, + "num_input_tokens_seen": 131871552, + "step": 61055 + }, + { + "epoch": 9.960848287112562, + "grad_norm": 0.04657332971692085, + "learning_rate": 0.0005902007632080499, + "loss": 0.0308, + "num_input_tokens_seen": 131881760, + "step": 61060 + }, + { + "epoch": 9.961663947797716, + "grad_norm": 0.22349655628204346, + "learning_rate": 0.0005901307503153408, + "loss": 0.0449, + "num_input_tokens_seen": 131892416, + "step": 61065 + }, + { + "epoch": 9.962479608482871, + "grad_norm": 0.0609329491853714, + "learning_rate": 0.0005900607355960178, + "loss": 0.0128, + "num_input_tokens_seen": 131903744, + "step": 61070 + }, + { + "epoch": 9.963295269168025, + "grad_norm": 0.22006216645240784, + "learning_rate": 0.0005899907190514999, + "loss": 0.1003, + "num_input_tokens_seen": 131914112, + "step": 61075 + }, + { + "epoch": 9.964110929853181, + "grad_norm": 0.04818188399076462, + "learning_rate": 0.0005899207006832056, + "loss": 0.0405, + "num_input_tokens_seen": 131924896, + "step": 61080 + }, + { + "epoch": 9.964926590538337, + "grad_norm": 0.2611391842365265, + "learning_rate": 0.0005898506804925545, + "loss": 0.1429, + "num_input_tokens_seen": 131936192, + "step": 61085 + }, + { + "epoch": 9.96574225122349, + "grad_norm": 0.2683767080307007, + "learning_rate": 0.0005897806584809653, + "loss": 0.0817, + "num_input_tokens_seen": 131947168, + "step": 61090 + }, + { + "epoch": 9.966557911908646, + "grad_norm": 0.005635687615722418, + "learning_rate": 0.0005897106346498571, + "loss": 0.0096, + "num_input_tokens_seen": 131958080, + "step": 61095 + }, + { + "epoch": 9.9673735725938, + "grad_norm": 0.005916200112551451, + "learning_rate": 0.0005896406090006491, + "loss": 0.0107, + "num_input_tokens_seen": 131969920, + "step": 61100 + }, + { + "epoch": 9.968189233278956, + "grad_norm": 0.021341202780604362, + "learning_rate": 0.0005895705815347605, + "loss": 0.0181, + "num_input_tokens_seen": 131979680, + "step": 61105 + }, + { + "epoch": 9.969004893964112, + "grad_norm": 0.19587284326553345, + "learning_rate": 0.0005895005522536104, + "loss": 0.0203, + "num_input_tokens_seen": 131991776, + "step": 61110 + }, + { + "epoch": 9.969820554649266, + "grad_norm": 0.0793571025133133, + "learning_rate": 0.000589430521158618, + "loss": 0.0262, + "num_input_tokens_seen": 132002656, + "step": 61115 + }, + { + "epoch": 9.970636215334421, + "grad_norm": 0.10754761099815369, + "learning_rate": 0.0005893604882512027, + "loss": 0.0962, + "num_input_tokens_seen": 132012928, + "step": 61120 + }, + { + "epoch": 9.971451876019575, + "grad_norm": 0.4493594765663147, + "learning_rate": 0.0005892904535327837, + "loss": 0.0467, + "num_input_tokens_seen": 132023520, + "step": 61125 + }, + { + "epoch": 9.97226753670473, + "grad_norm": 0.2553941011428833, + "learning_rate": 0.0005892204170047804, + "loss": 0.0181, + "num_input_tokens_seen": 132033216, + "step": 61130 + }, + { + "epoch": 9.973083197389887, + "grad_norm": 0.33169591426849365, + "learning_rate": 0.0005891503786686123, + "loss": 0.1709, + "num_input_tokens_seen": 132044736, + "step": 61135 + }, + { + "epoch": 9.97389885807504, + "grad_norm": 0.005872929468750954, + "learning_rate": 0.0005890803385256985, + "loss": 0.0358, + "num_input_tokens_seen": 132055232, + "step": 61140 + }, + { + "epoch": 9.974714518760196, + "grad_norm": 0.0037564358208328485, + "learning_rate": 0.0005890102965774587, + "loss": 0.0128, + "num_input_tokens_seen": 132066176, + "step": 61145 + }, + { + "epoch": 9.97553017944535, + "grad_norm": 0.04645087197422981, + "learning_rate": 0.0005889402528253124, + "loss": 0.0163, + "num_input_tokens_seen": 132076320, + "step": 61150 + }, + { + "epoch": 9.976345840130506, + "grad_norm": 0.002770586172118783, + "learning_rate": 0.0005888702072706788, + "loss": 0.0154, + "num_input_tokens_seen": 132086816, + "step": 61155 + }, + { + "epoch": 9.977161500815662, + "grad_norm": 0.286471962928772, + "learning_rate": 0.0005888001599149781, + "loss": 0.0986, + "num_input_tokens_seen": 132097056, + "step": 61160 + }, + { + "epoch": 9.977977161500815, + "grad_norm": 0.32440730929374695, + "learning_rate": 0.0005887301107596292, + "loss": 0.0263, + "num_input_tokens_seen": 132107424, + "step": 61165 + }, + { + "epoch": 9.978792822185971, + "grad_norm": 0.07186294347047806, + "learning_rate": 0.0005886600598060522, + "loss": 0.0543, + "num_input_tokens_seen": 132117376, + "step": 61170 + }, + { + "epoch": 9.979608482871125, + "grad_norm": 0.08263949304819107, + "learning_rate": 0.0005885900070556665, + "loss": 0.0171, + "num_input_tokens_seen": 132127776, + "step": 61175 + }, + { + "epoch": 9.98042414355628, + "grad_norm": 0.002160472795367241, + "learning_rate": 0.0005885199525098919, + "loss": 0.0293, + "num_input_tokens_seen": 132138720, + "step": 61180 + }, + { + "epoch": 9.981239804241435, + "grad_norm": 0.005913755390793085, + "learning_rate": 0.0005884498961701483, + "loss": 0.0074, + "num_input_tokens_seen": 132149632, + "step": 61185 + }, + { + "epoch": 9.98205546492659, + "grad_norm": 0.0032838478218764067, + "learning_rate": 0.0005883798380378554, + "loss": 0.1135, + "num_input_tokens_seen": 132161536, + "step": 61190 + }, + { + "epoch": 9.982871125611746, + "grad_norm": 0.21511363983154297, + "learning_rate": 0.0005883097781144329, + "loss": 0.234, + "num_input_tokens_seen": 132172992, + "step": 61195 + }, + { + "epoch": 9.9836867862969, + "grad_norm": 0.0015851340722292662, + "learning_rate": 0.0005882397164013005, + "loss": 0.0314, + "num_input_tokens_seen": 132183808, + "step": 61200 + }, + { + "epoch": 9.984502446982056, + "grad_norm": 0.02231002226471901, + "learning_rate": 0.0005881696528998785, + "loss": 0.0092, + "num_input_tokens_seen": 132194048, + "step": 61205 + }, + { + "epoch": 9.98531810766721, + "grad_norm": 0.014449645765125751, + "learning_rate": 0.0005880995876115868, + "loss": 0.0419, + "num_input_tokens_seen": 132205120, + "step": 61210 + }, + { + "epoch": 9.986133768352365, + "grad_norm": 0.01147771067917347, + "learning_rate": 0.0005880295205378449, + "loss": 0.0656, + "num_input_tokens_seen": 132214880, + "step": 61215 + }, + { + "epoch": 9.986949429037521, + "grad_norm": 0.061700109392404556, + "learning_rate": 0.0005879594516800732, + "loss": 0.0892, + "num_input_tokens_seen": 132225408, + "step": 61220 + }, + { + "epoch": 9.987765089722675, + "grad_norm": 0.010661949403584003, + "learning_rate": 0.0005878893810396916, + "loss": 0.1024, + "num_input_tokens_seen": 132236288, + "step": 61225 + }, + { + "epoch": 9.98858075040783, + "grad_norm": 0.08864899724721909, + "learning_rate": 0.0005878193086181203, + "loss": 0.0417, + "num_input_tokens_seen": 132247680, + "step": 61230 + }, + { + "epoch": 9.989396411092985, + "grad_norm": 0.3488578200340271, + "learning_rate": 0.0005877492344167792, + "loss": 0.1304, + "num_input_tokens_seen": 132257632, + "step": 61235 + }, + { + "epoch": 9.99021207177814, + "grad_norm": 0.0029131618794053793, + "learning_rate": 0.0005876791584370886, + "loss": 0.0816, + "num_input_tokens_seen": 132268288, + "step": 61240 + }, + { + "epoch": 9.991027732463294, + "grad_norm": 0.03511757031083107, + "learning_rate": 0.0005876090806804686, + "loss": 0.0618, + "num_input_tokens_seen": 132278432, + "step": 61245 + }, + { + "epoch": 9.99184339314845, + "grad_norm": 0.28010329604148865, + "learning_rate": 0.0005875390011483394, + "loss": 0.0442, + "num_input_tokens_seen": 132288512, + "step": 61250 + }, + { + "epoch": 9.992659053833606, + "grad_norm": 0.0038962685503065586, + "learning_rate": 0.0005874689198421214, + "loss": 0.0094, + "num_input_tokens_seen": 132300096, + "step": 61255 + }, + { + "epoch": 9.99347471451876, + "grad_norm": 0.13746079802513123, + "learning_rate": 0.0005873988367632347, + "loss": 0.1822, + "num_input_tokens_seen": 132309824, + "step": 61260 + }, + { + "epoch": 9.994290375203915, + "grad_norm": 0.16750729084014893, + "learning_rate": 0.0005873287519130997, + "loss": 0.1562, + "num_input_tokens_seen": 132320512, + "step": 61265 + }, + { + "epoch": 9.99510603588907, + "grad_norm": 0.09587028622627258, + "learning_rate": 0.0005872586652931368, + "loss": 0.0437, + "num_input_tokens_seen": 132331072, + "step": 61270 + }, + { + "epoch": 9.995921696574225, + "grad_norm": 0.22070851922035217, + "learning_rate": 0.0005871885769047664, + "loss": 0.0955, + "num_input_tokens_seen": 132340960, + "step": 61275 + }, + { + "epoch": 9.99673735725938, + "grad_norm": 0.021899957209825516, + "learning_rate": 0.0005871184867494088, + "loss": 0.0462, + "num_input_tokens_seen": 132351648, + "step": 61280 + }, + { + "epoch": 9.997553017944535, + "grad_norm": 0.18221528828144073, + "learning_rate": 0.0005870483948284845, + "loss": 0.0415, + "num_input_tokens_seen": 132363040, + "step": 61285 + }, + { + "epoch": 9.99836867862969, + "grad_norm": 0.2870374619960785, + "learning_rate": 0.0005869783011434141, + "loss": 0.0339, + "num_input_tokens_seen": 132372544, + "step": 61290 + }, + { + "epoch": 9.999184339314844, + "grad_norm": 0.04292251169681549, + "learning_rate": 0.0005869082056956181, + "loss": 0.1169, + "num_input_tokens_seen": 132383936, + "step": 61295 + }, + { + "epoch": 10.0, + "grad_norm": 0.024575114250183105, + "learning_rate": 0.000586838108486517, + "loss": 0.0154, + "num_input_tokens_seen": 132392640, + "step": 61300 + }, + { + "epoch": 10.0, + "eval_loss": 0.14106982946395874, + "eval_runtime": 103.848, + "eval_samples_per_second": 26.24, + "eval_steps_per_second": 6.567, + "num_input_tokens_seen": 132392640, + "step": 61300 + }, + { + "epoch": 10.000815660685156, + "grad_norm": 0.09867502748966217, + "learning_rate": 0.0005867680095175315, + "loss": 0.0202, + "num_input_tokens_seen": 132403680, + "step": 61305 + }, + { + "epoch": 10.00163132137031, + "grad_norm": 0.09049554169178009, + "learning_rate": 0.0005866979087900822, + "loss": 0.1042, + "num_input_tokens_seen": 132415040, + "step": 61310 + }, + { + "epoch": 10.002446982055465, + "grad_norm": 0.025646690279245377, + "learning_rate": 0.0005866278063055898, + "loss": 0.0278, + "num_input_tokens_seen": 132426496, + "step": 61315 + }, + { + "epoch": 10.00326264274062, + "grad_norm": 0.043658383190631866, + "learning_rate": 0.0005865577020654751, + "loss": 0.211, + "num_input_tokens_seen": 132436640, + "step": 61320 + }, + { + "epoch": 10.004078303425775, + "grad_norm": 0.10407885164022446, + "learning_rate": 0.0005864875960711588, + "loss": 0.0403, + "num_input_tokens_seen": 132447840, + "step": 61325 + }, + { + "epoch": 10.00489396411093, + "grad_norm": 0.010089286603033543, + "learning_rate": 0.0005864174883240614, + "loss": 0.0598, + "num_input_tokens_seen": 132458336, + "step": 61330 + }, + { + "epoch": 10.005709624796085, + "grad_norm": 0.1301630288362503, + "learning_rate": 0.0005863473788256042, + "loss": 0.1366, + "num_input_tokens_seen": 132469856, + "step": 61335 + }, + { + "epoch": 10.00652528548124, + "grad_norm": 0.051968734711408615, + "learning_rate": 0.0005862772675772076, + "loss": 0.0374, + "num_input_tokens_seen": 132479104, + "step": 61340 + }, + { + "epoch": 10.007340946166394, + "grad_norm": 0.017379827797412872, + "learning_rate": 0.000586207154580293, + "loss": 0.01, + "num_input_tokens_seen": 132490240, + "step": 61345 + }, + { + "epoch": 10.00815660685155, + "grad_norm": 0.002726492937654257, + "learning_rate": 0.0005861370398362809, + "loss": 0.0225, + "num_input_tokens_seen": 132500128, + "step": 61350 + }, + { + "epoch": 10.008972267536704, + "grad_norm": 0.035465896129608154, + "learning_rate": 0.0005860669233465925, + "loss": 0.1204, + "num_input_tokens_seen": 132511584, + "step": 61355 + }, + { + "epoch": 10.00978792822186, + "grad_norm": 0.18338146805763245, + "learning_rate": 0.0005859968051126486, + "loss": 0.0472, + "num_input_tokens_seen": 132521824, + "step": 61360 + }, + { + "epoch": 10.010603588907015, + "grad_norm": 0.02428065799176693, + "learning_rate": 0.0005859266851358704, + "loss": 0.0335, + "num_input_tokens_seen": 132533952, + "step": 61365 + }, + { + "epoch": 10.01141924959217, + "grad_norm": 0.0904858410358429, + "learning_rate": 0.0005858565634176789, + "loss": 0.0157, + "num_input_tokens_seen": 132544960, + "step": 61370 + }, + { + "epoch": 10.012234910277325, + "grad_norm": 0.019217217341065407, + "learning_rate": 0.0005857864399594953, + "loss": 0.0082, + "num_input_tokens_seen": 132556096, + "step": 61375 + }, + { + "epoch": 10.013050570962479, + "grad_norm": 0.20956331491470337, + "learning_rate": 0.0005857163147627406, + "loss": 0.055, + "num_input_tokens_seen": 132567264, + "step": 61380 + }, + { + "epoch": 10.013866231647635, + "grad_norm": 0.0058461870066821575, + "learning_rate": 0.000585646187828836, + "loss": 0.0078, + "num_input_tokens_seen": 132577376, + "step": 61385 + }, + { + "epoch": 10.01468189233279, + "grad_norm": 0.020629743114113808, + "learning_rate": 0.000585576059159203, + "loss": 0.0471, + "num_input_tokens_seen": 132587968, + "step": 61390 + }, + { + "epoch": 10.015497553017944, + "grad_norm": 0.023331712931394577, + "learning_rate": 0.0005855059287552623, + "loss": 0.0168, + "num_input_tokens_seen": 132600064, + "step": 61395 + }, + { + "epoch": 10.0163132137031, + "grad_norm": 0.0030110483057796955, + "learning_rate": 0.0005854357966184356, + "loss": 0.1784, + "num_input_tokens_seen": 132611008, + "step": 61400 + }, + { + "epoch": 10.017128874388254, + "grad_norm": 0.2905384600162506, + "learning_rate": 0.0005853656627501442, + "loss": 0.1317, + "num_input_tokens_seen": 132622592, + "step": 61405 + }, + { + "epoch": 10.01794453507341, + "grad_norm": 0.2557068467140198, + "learning_rate": 0.0005852955271518092, + "loss": 0.0358, + "num_input_tokens_seen": 132633120, + "step": 61410 + }, + { + "epoch": 10.018760195758565, + "grad_norm": 0.035086777061223984, + "learning_rate": 0.0005852253898248522, + "loss": 0.0359, + "num_input_tokens_seen": 132643712, + "step": 61415 + }, + { + "epoch": 10.01957585644372, + "grad_norm": 0.08694098889827728, + "learning_rate": 0.0005851552507706945, + "loss": 0.0205, + "num_input_tokens_seen": 132654144, + "step": 61420 + }, + { + "epoch": 10.020391517128875, + "grad_norm": 0.011595198884606361, + "learning_rate": 0.0005850851099907577, + "loss": 0.0072, + "num_input_tokens_seen": 132664096, + "step": 61425 + }, + { + "epoch": 10.021207177814029, + "grad_norm": 0.015262764878571033, + "learning_rate": 0.0005850149674864631, + "loss": 0.0379, + "num_input_tokens_seen": 132674528, + "step": 61430 + }, + { + "epoch": 10.022022838499185, + "grad_norm": 0.003421793458983302, + "learning_rate": 0.0005849448232592324, + "loss": 0.0696, + "num_input_tokens_seen": 132684896, + "step": 61435 + }, + { + "epoch": 10.022838499184338, + "grad_norm": 0.1386803835630417, + "learning_rate": 0.0005848746773104871, + "loss": 0.1301, + "num_input_tokens_seen": 132695904, + "step": 61440 + }, + { + "epoch": 10.023654159869494, + "grad_norm": 0.0038962659891694784, + "learning_rate": 0.0005848045296416488, + "loss": 0.0296, + "num_input_tokens_seen": 132706112, + "step": 61445 + }, + { + "epoch": 10.02446982055465, + "grad_norm": 0.10918844491243362, + "learning_rate": 0.0005847343802541391, + "loss": 0.0186, + "num_input_tokens_seen": 132716160, + "step": 61450 + }, + { + "epoch": 10.025285481239804, + "grad_norm": 0.18185824155807495, + "learning_rate": 0.0005846642291493796, + "loss": 0.0326, + "num_input_tokens_seen": 132728672, + "step": 61455 + }, + { + "epoch": 10.02610114192496, + "grad_norm": 0.036952123045921326, + "learning_rate": 0.0005845940763287923, + "loss": 0.0283, + "num_input_tokens_seen": 132739776, + "step": 61460 + }, + { + "epoch": 10.026916802610113, + "grad_norm": 0.16382429003715515, + "learning_rate": 0.0005845239217937986, + "loss": 0.0539, + "num_input_tokens_seen": 132751264, + "step": 61465 + }, + { + "epoch": 10.02773246329527, + "grad_norm": 0.04407104477286339, + "learning_rate": 0.0005844537655458203, + "loss": 0.0261, + "num_input_tokens_seen": 132762208, + "step": 61470 + }, + { + "epoch": 10.028548123980425, + "grad_norm": 0.0067809708416461945, + "learning_rate": 0.0005843836075862794, + "loss": 0.0284, + "num_input_tokens_seen": 132771040, + "step": 61475 + }, + { + "epoch": 10.029363784665579, + "grad_norm": 0.017581727355718613, + "learning_rate": 0.0005843134479165977, + "loss": 0.0743, + "num_input_tokens_seen": 132782528, + "step": 61480 + }, + { + "epoch": 10.030179445350734, + "grad_norm": 0.10218022763729095, + "learning_rate": 0.0005842432865381971, + "loss": 0.0171, + "num_input_tokens_seen": 132792704, + "step": 61485 + }, + { + "epoch": 10.030995106035888, + "grad_norm": 0.33561691641807556, + "learning_rate": 0.0005841731234524993, + "loss": 0.1242, + "num_input_tokens_seen": 132803904, + "step": 61490 + }, + { + "epoch": 10.031810766721044, + "grad_norm": 0.36867034435272217, + "learning_rate": 0.0005841029586609263, + "loss": 0.0274, + "num_input_tokens_seen": 132813664, + "step": 61495 + }, + { + "epoch": 10.0326264274062, + "grad_norm": 0.003252339782193303, + "learning_rate": 0.0005840327921649003, + "loss": 0.0351, + "num_input_tokens_seen": 132824640, + "step": 61500 + }, + { + "epoch": 10.033442088091354, + "grad_norm": 0.0059877620078623295, + "learning_rate": 0.0005839626239658431, + "loss": 0.0097, + "num_input_tokens_seen": 132836128, + "step": 61505 + }, + { + "epoch": 10.03425774877651, + "grad_norm": 0.002705489983782172, + "learning_rate": 0.0005838924540651769, + "loss": 0.0107, + "num_input_tokens_seen": 132847392, + "step": 61510 + }, + { + "epoch": 10.035073409461663, + "grad_norm": 0.005862162448465824, + "learning_rate": 0.0005838222824643235, + "loss": 0.065, + "num_input_tokens_seen": 132857152, + "step": 61515 + }, + { + "epoch": 10.035889070146819, + "grad_norm": 0.13416972756385803, + "learning_rate": 0.0005837521091647054, + "loss": 0.0183, + "num_input_tokens_seen": 132866560, + "step": 61520 + }, + { + "epoch": 10.036704730831975, + "grad_norm": 0.024556193500757217, + "learning_rate": 0.0005836819341677444, + "loss": 0.0198, + "num_input_tokens_seen": 132877952, + "step": 61525 + }, + { + "epoch": 10.037520391517129, + "grad_norm": 0.025332549586892128, + "learning_rate": 0.0005836117574748629, + "loss": 0.0488, + "num_input_tokens_seen": 132887744, + "step": 61530 + }, + { + "epoch": 10.038336052202284, + "grad_norm": 0.024207156151533127, + "learning_rate": 0.0005835415790874832, + "loss": 0.0257, + "num_input_tokens_seen": 132897696, + "step": 61535 + }, + { + "epoch": 10.039151712887438, + "grad_norm": 0.029303235933184624, + "learning_rate": 0.0005834713990070273, + "loss": 0.0162, + "num_input_tokens_seen": 132908704, + "step": 61540 + }, + { + "epoch": 10.039967373572594, + "grad_norm": 0.008033953607082367, + "learning_rate": 0.0005834012172349174, + "loss": 0.022, + "num_input_tokens_seen": 132919168, + "step": 61545 + }, + { + "epoch": 10.040783034257748, + "grad_norm": 0.5109202861785889, + "learning_rate": 0.0005833310337725764, + "loss": 0.1255, + "num_input_tokens_seen": 132931136, + "step": 61550 + }, + { + "epoch": 10.041598694942904, + "grad_norm": 0.008069335483014584, + "learning_rate": 0.0005832608486214261, + "loss": 0.0371, + "num_input_tokens_seen": 132942432, + "step": 61555 + }, + { + "epoch": 10.04241435562806, + "grad_norm": 0.007886008359491825, + "learning_rate": 0.0005831906617828892, + "loss": 0.0128, + "num_input_tokens_seen": 132952480, + "step": 61560 + }, + { + "epoch": 10.043230016313213, + "grad_norm": 0.005050900857895613, + "learning_rate": 0.0005831204732583879, + "loss": 0.0104, + "num_input_tokens_seen": 132963584, + "step": 61565 + }, + { + "epoch": 10.044045676998369, + "grad_norm": 0.007038873620331287, + "learning_rate": 0.0005830502830493447, + "loss": 0.0113, + "num_input_tokens_seen": 132973888, + "step": 61570 + }, + { + "epoch": 10.044861337683523, + "grad_norm": 0.00195617089048028, + "learning_rate": 0.0005829800911571824, + "loss": 0.0114, + "num_input_tokens_seen": 132984224, + "step": 61575 + }, + { + "epoch": 10.045676998368679, + "grad_norm": 0.020171010866761208, + "learning_rate": 0.000582909897583323, + "loss": 0.0065, + "num_input_tokens_seen": 132995392, + "step": 61580 + }, + { + "epoch": 10.046492659053834, + "grad_norm": 0.005067338235676289, + "learning_rate": 0.0005828397023291895, + "loss": 0.0929, + "num_input_tokens_seen": 133006432, + "step": 61585 + }, + { + "epoch": 10.047308319738988, + "grad_norm": 0.048682741820812225, + "learning_rate": 0.0005827695053962043, + "loss": 0.0268, + "num_input_tokens_seen": 133017280, + "step": 61590 + }, + { + "epoch": 10.048123980424144, + "grad_norm": 0.07136274874210358, + "learning_rate": 0.0005826993067857901, + "loss": 0.1019, + "num_input_tokens_seen": 133027936, + "step": 61595 + }, + { + "epoch": 10.048939641109298, + "grad_norm": 0.004224831238389015, + "learning_rate": 0.0005826291064993695, + "loss": 0.0079, + "num_input_tokens_seen": 133039200, + "step": 61600 + }, + { + "epoch": 10.049755301794454, + "grad_norm": 0.32198604941368103, + "learning_rate": 0.0005825589045383654, + "loss": 0.1041, + "num_input_tokens_seen": 133050528, + "step": 61605 + }, + { + "epoch": 10.05057096247961, + "grad_norm": 0.054331421852111816, + "learning_rate": 0.0005824887009042002, + "loss": 0.0091, + "num_input_tokens_seen": 133061248, + "step": 61610 + }, + { + "epoch": 10.051386623164763, + "grad_norm": 0.10078129172325134, + "learning_rate": 0.0005824184955982967, + "loss": 0.0125, + "num_input_tokens_seen": 133072544, + "step": 61615 + }, + { + "epoch": 10.052202283849919, + "grad_norm": 0.0027622587513178587, + "learning_rate": 0.000582348288622078, + "loss": 0.0034, + "num_input_tokens_seen": 133083616, + "step": 61620 + }, + { + "epoch": 10.053017944535073, + "grad_norm": 0.03783497214317322, + "learning_rate": 0.0005822780799769667, + "loss": 0.006, + "num_input_tokens_seen": 133094912, + "step": 61625 + }, + { + "epoch": 10.053833605220229, + "grad_norm": 0.06255345791578293, + "learning_rate": 0.0005822078696643859, + "loss": 0.0445, + "num_input_tokens_seen": 133106048, + "step": 61630 + }, + { + "epoch": 10.054649265905383, + "grad_norm": 0.02925264462828636, + "learning_rate": 0.0005821376576857582, + "loss": 0.1186, + "num_input_tokens_seen": 133116640, + "step": 61635 + }, + { + "epoch": 10.055464926590538, + "grad_norm": 0.014886287972331047, + "learning_rate": 0.0005820674440425067, + "loss": 0.0199, + "num_input_tokens_seen": 133127104, + "step": 61640 + }, + { + "epoch": 10.056280587275694, + "grad_norm": 0.08674079179763794, + "learning_rate": 0.0005819972287360543, + "loss": 0.0493, + "num_input_tokens_seen": 133138336, + "step": 61645 + }, + { + "epoch": 10.057096247960848, + "grad_norm": 0.03542179614305496, + "learning_rate": 0.0005819270117678239, + "loss": 0.0037, + "num_input_tokens_seen": 133147904, + "step": 61650 + }, + { + "epoch": 10.057911908646004, + "grad_norm": 0.3200596868991852, + "learning_rate": 0.0005818567931392389, + "loss": 0.0617, + "num_input_tokens_seen": 133158528, + "step": 61655 + }, + { + "epoch": 10.058727569331158, + "grad_norm": 0.0011431258171796799, + "learning_rate": 0.000581786572851722, + "loss": 0.0048, + "num_input_tokens_seen": 133169472, + "step": 61660 + }, + { + "epoch": 10.059543230016313, + "grad_norm": 0.011449893936514854, + "learning_rate": 0.0005817163509066966, + "loss": 0.009, + "num_input_tokens_seen": 133178880, + "step": 61665 + }, + { + "epoch": 10.060358890701469, + "grad_norm": 0.11256249994039536, + "learning_rate": 0.0005816461273055857, + "loss": 0.0151, + "num_input_tokens_seen": 133190464, + "step": 61670 + }, + { + "epoch": 10.061174551386623, + "grad_norm": 0.11716876924037933, + "learning_rate": 0.0005815759020498122, + "loss": 0.0041, + "num_input_tokens_seen": 133201920, + "step": 61675 + }, + { + "epoch": 10.061990212071779, + "grad_norm": 0.04273887351155281, + "learning_rate": 0.0005815056751407999, + "loss": 0.0298, + "num_input_tokens_seen": 133211648, + "step": 61680 + }, + { + "epoch": 10.062805872756933, + "grad_norm": 0.006083001848310232, + "learning_rate": 0.0005814354465799715, + "loss": 0.1231, + "num_input_tokens_seen": 133223072, + "step": 61685 + }, + { + "epoch": 10.063621533442088, + "grad_norm": 0.0192166268825531, + "learning_rate": 0.0005813652163687504, + "loss": 0.0106, + "num_input_tokens_seen": 133234784, + "step": 61690 + }, + { + "epoch": 10.064437194127244, + "grad_norm": 0.00653346860781312, + "learning_rate": 0.0005812949845085601, + "loss": 0.2327, + "num_input_tokens_seen": 133245504, + "step": 61695 + }, + { + "epoch": 10.065252854812398, + "grad_norm": 0.012311974540352821, + "learning_rate": 0.0005812247510008238, + "loss": 0.1, + "num_input_tokens_seen": 133255136, + "step": 61700 + }, + { + "epoch": 10.066068515497554, + "grad_norm": 0.1300489455461502, + "learning_rate": 0.0005811545158469649, + "loss": 0.0179, + "num_input_tokens_seen": 133267680, + "step": 61705 + }, + { + "epoch": 10.066884176182707, + "grad_norm": 0.6340866684913635, + "learning_rate": 0.0005810842790484066, + "loss": 0.2703, + "num_input_tokens_seen": 133279168, + "step": 61710 + }, + { + "epoch": 10.067699836867863, + "grad_norm": 0.026448015123605728, + "learning_rate": 0.0005810140406065727, + "loss": 0.0207, + "num_input_tokens_seen": 133289632, + "step": 61715 + }, + { + "epoch": 10.068515497553017, + "grad_norm": 0.08431733399629593, + "learning_rate": 0.0005809438005228866, + "loss": 0.0162, + "num_input_tokens_seen": 133300704, + "step": 61720 + }, + { + "epoch": 10.069331158238173, + "grad_norm": 0.008198346011340618, + "learning_rate": 0.0005808735587987714, + "loss": 0.0219, + "num_input_tokens_seen": 133311936, + "step": 61725 + }, + { + "epoch": 10.070146818923329, + "grad_norm": 0.06876257807016373, + "learning_rate": 0.0005808033154356511, + "loss": 0.0664, + "num_input_tokens_seen": 133322688, + "step": 61730 + }, + { + "epoch": 10.070962479608482, + "grad_norm": 0.03216838836669922, + "learning_rate": 0.0005807330704349492, + "loss": 0.069, + "num_input_tokens_seen": 133334304, + "step": 61735 + }, + { + "epoch": 10.071778140293638, + "grad_norm": 0.009478418156504631, + "learning_rate": 0.0005806628237980891, + "loss": 0.0064, + "num_input_tokens_seen": 133345568, + "step": 61740 + }, + { + "epoch": 10.072593800978792, + "grad_norm": 0.02647106908261776, + "learning_rate": 0.0005805925755264945, + "loss": 0.0087, + "num_input_tokens_seen": 133357472, + "step": 61745 + }, + { + "epoch": 10.073409461663948, + "grad_norm": 0.005002718418836594, + "learning_rate": 0.0005805223256215891, + "loss": 0.0052, + "num_input_tokens_seen": 133367712, + "step": 61750 + }, + { + "epoch": 10.074225122349104, + "grad_norm": 0.014104902744293213, + "learning_rate": 0.0005804520740847966, + "loss": 0.0377, + "num_input_tokens_seen": 133379840, + "step": 61755 + }, + { + "epoch": 10.075040783034257, + "grad_norm": 0.2684130072593689, + "learning_rate": 0.0005803818209175409, + "loss": 0.1045, + "num_input_tokens_seen": 133390688, + "step": 61760 + }, + { + "epoch": 10.075856443719413, + "grad_norm": 0.010500526987016201, + "learning_rate": 0.0005803115661212456, + "loss": 0.0658, + "num_input_tokens_seen": 133401632, + "step": 61765 + }, + { + "epoch": 10.076672104404567, + "grad_norm": 0.008303754031658173, + "learning_rate": 0.0005802413096973345, + "loss": 0.0029, + "num_input_tokens_seen": 133411808, + "step": 61770 + }, + { + "epoch": 10.077487765089723, + "grad_norm": 0.03415251150727272, + "learning_rate": 0.0005801710516472315, + "loss": 0.0039, + "num_input_tokens_seen": 133423616, + "step": 61775 + }, + { + "epoch": 10.078303425774878, + "grad_norm": 0.0074006495997309685, + "learning_rate": 0.0005801007919723605, + "loss": 0.0943, + "num_input_tokens_seen": 133435424, + "step": 61780 + }, + { + "epoch": 10.079119086460032, + "grad_norm": 0.002399343764409423, + "learning_rate": 0.000580030530674145, + "loss": 0.0127, + "num_input_tokens_seen": 133445472, + "step": 61785 + }, + { + "epoch": 10.079934747145188, + "grad_norm": 0.28245264291763306, + "learning_rate": 0.0005799602677540095, + "loss": 0.0462, + "num_input_tokens_seen": 133459008, + "step": 61790 + }, + { + "epoch": 10.080750407830342, + "grad_norm": 0.023097572848200798, + "learning_rate": 0.0005798900032133778, + "loss": 0.0592, + "num_input_tokens_seen": 133469440, + "step": 61795 + }, + { + "epoch": 10.081566068515498, + "grad_norm": 0.09736547619104385, + "learning_rate": 0.0005798197370536737, + "loss": 0.0638, + "num_input_tokens_seen": 133480832, + "step": 61800 + }, + { + "epoch": 10.082381729200652, + "grad_norm": 0.1749315857887268, + "learning_rate": 0.0005797494692763215, + "loss": 0.0504, + "num_input_tokens_seen": 133491552, + "step": 61805 + }, + { + "epoch": 10.083197389885807, + "grad_norm": 0.026390263810753822, + "learning_rate": 0.0005796791998827451, + "loss": 0.1452, + "num_input_tokens_seen": 133502720, + "step": 61810 + }, + { + "epoch": 10.084013050570963, + "grad_norm": 0.0233746450394392, + "learning_rate": 0.0005796089288743687, + "loss": 0.0221, + "num_input_tokens_seen": 133511936, + "step": 61815 + }, + { + "epoch": 10.084828711256117, + "grad_norm": 0.029851028695702553, + "learning_rate": 0.0005795386562526163, + "loss": 0.0236, + "num_input_tokens_seen": 133522432, + "step": 61820 + }, + { + "epoch": 10.085644371941273, + "grad_norm": 0.007268915418535471, + "learning_rate": 0.000579468382018912, + "loss": 0.0073, + "num_input_tokens_seen": 133533184, + "step": 61825 + }, + { + "epoch": 10.086460032626427, + "grad_norm": 0.033539608120918274, + "learning_rate": 0.0005793981061746802, + "loss": 0.0168, + "num_input_tokens_seen": 133544352, + "step": 61830 + }, + { + "epoch": 10.087275693311582, + "grad_norm": 0.006324341986328363, + "learning_rate": 0.0005793278287213453, + "loss": 0.0047, + "num_input_tokens_seen": 133554944, + "step": 61835 + }, + { + "epoch": 10.088091353996738, + "grad_norm": 0.28266897797584534, + "learning_rate": 0.000579257549660331, + "loss": 0.0245, + "num_input_tokens_seen": 133566048, + "step": 61840 + }, + { + "epoch": 10.088907014681892, + "grad_norm": 0.01207935530692339, + "learning_rate": 0.0005791872689930621, + "loss": 0.0218, + "num_input_tokens_seen": 133577760, + "step": 61845 + }, + { + "epoch": 10.089722675367048, + "grad_norm": 0.019263241440057755, + "learning_rate": 0.0005791169867209626, + "loss": 0.0166, + "num_input_tokens_seen": 133587424, + "step": 61850 + }, + { + "epoch": 10.090538336052202, + "grad_norm": 0.001974908635020256, + "learning_rate": 0.0005790467028454571, + "loss": 0.0168, + "num_input_tokens_seen": 133597024, + "step": 61855 + }, + { + "epoch": 10.091353996737357, + "grad_norm": 0.13980016112327576, + "learning_rate": 0.0005789764173679698, + "loss": 0.0141, + "num_input_tokens_seen": 133608704, + "step": 61860 + }, + { + "epoch": 10.092169657422513, + "grad_norm": 0.01586383581161499, + "learning_rate": 0.0005789061302899252, + "loss": 0.0035, + "num_input_tokens_seen": 133617504, + "step": 61865 + }, + { + "epoch": 10.092985318107667, + "grad_norm": 0.007646430283784866, + "learning_rate": 0.0005788358416127478, + "loss": 0.1635, + "num_input_tokens_seen": 133628448, + "step": 61870 + }, + { + "epoch": 10.093800978792823, + "grad_norm": 0.02338665910065174, + "learning_rate": 0.0005787655513378622, + "loss": 0.0679, + "num_input_tokens_seen": 133639328, + "step": 61875 + }, + { + "epoch": 10.094616639477977, + "grad_norm": 0.04463246837258339, + "learning_rate": 0.0005786952594666925, + "loss": 0.008, + "num_input_tokens_seen": 133649568, + "step": 61880 + }, + { + "epoch": 10.095432300163132, + "grad_norm": 0.17297907173633575, + "learning_rate": 0.0005786249660006638, + "loss": 0.0381, + "num_input_tokens_seen": 133660384, + "step": 61885 + }, + { + "epoch": 10.096247960848286, + "grad_norm": 0.0007581968093290925, + "learning_rate": 0.0005785546709412004, + "loss": 0.0061, + "num_input_tokens_seen": 133671232, + "step": 61890 + }, + { + "epoch": 10.097063621533442, + "grad_norm": 0.007879073731601238, + "learning_rate": 0.0005784843742897268, + "loss": 0.0031, + "num_input_tokens_seen": 133682080, + "step": 61895 + }, + { + "epoch": 10.097879282218598, + "grad_norm": 0.380930095911026, + "learning_rate": 0.0005784140760476679, + "loss": 0.1592, + "num_input_tokens_seen": 133692896, + "step": 61900 + }, + { + "epoch": 10.098694942903752, + "grad_norm": 0.017733553424477577, + "learning_rate": 0.0005783437762164483, + "loss": 0.0068, + "num_input_tokens_seen": 133702944, + "step": 61905 + }, + { + "epoch": 10.099510603588907, + "grad_norm": 0.015353784896433353, + "learning_rate": 0.0005782734747974926, + "loss": 0.1314, + "num_input_tokens_seen": 133714208, + "step": 61910 + }, + { + "epoch": 10.100326264274061, + "grad_norm": 0.004601314663887024, + "learning_rate": 0.0005782031717922256, + "loss": 0.0085, + "num_input_tokens_seen": 133725248, + "step": 61915 + }, + { + "epoch": 10.101141924959217, + "grad_norm": 0.005082305055111647, + "learning_rate": 0.0005781328672020723, + "loss": 0.0492, + "num_input_tokens_seen": 133736224, + "step": 61920 + }, + { + "epoch": 10.101957585644373, + "grad_norm": 0.07641912996768951, + "learning_rate": 0.0005780625610284572, + "loss": 0.0136, + "num_input_tokens_seen": 133747040, + "step": 61925 + }, + { + "epoch": 10.102773246329527, + "grad_norm": 0.1645813286304474, + "learning_rate": 0.000577992253272805, + "loss": 0.1875, + "num_input_tokens_seen": 133756896, + "step": 61930 + }, + { + "epoch": 10.103588907014682, + "grad_norm": 0.3278946280479431, + "learning_rate": 0.0005779219439365411, + "loss": 0.0501, + "num_input_tokens_seen": 133766720, + "step": 61935 + }, + { + "epoch": 10.104404567699836, + "grad_norm": 0.01809718646109104, + "learning_rate": 0.0005778516330210902, + "loss": 0.026, + "num_input_tokens_seen": 133778240, + "step": 61940 + }, + { + "epoch": 10.105220228384992, + "grad_norm": 0.006549215409904718, + "learning_rate": 0.0005777813205278772, + "loss": 0.0067, + "num_input_tokens_seen": 133788352, + "step": 61945 + }, + { + "epoch": 10.106035889070148, + "grad_norm": 0.002958184340968728, + "learning_rate": 0.0005777110064583271, + "loss": 0.0551, + "num_input_tokens_seen": 133798304, + "step": 61950 + }, + { + "epoch": 10.106851549755302, + "grad_norm": 0.003981069661676884, + "learning_rate": 0.0005776406908138648, + "loss": 0.0053, + "num_input_tokens_seen": 133809728, + "step": 61955 + }, + { + "epoch": 10.107667210440457, + "grad_norm": 0.04357610270380974, + "learning_rate": 0.0005775703735959155, + "loss": 0.0126, + "num_input_tokens_seen": 133820608, + "step": 61960 + }, + { + "epoch": 10.108482871125611, + "grad_norm": 0.3853638470172882, + "learning_rate": 0.000577500054805904, + "loss": 0.0787, + "num_input_tokens_seen": 133831904, + "step": 61965 + }, + { + "epoch": 10.109298531810767, + "grad_norm": 0.005354811903089285, + "learning_rate": 0.0005774297344452556, + "loss": 0.0685, + "num_input_tokens_seen": 133843392, + "step": 61970 + }, + { + "epoch": 10.11011419249592, + "grad_norm": 0.05633273720741272, + "learning_rate": 0.0005773594125153955, + "loss": 0.0138, + "num_input_tokens_seen": 133855008, + "step": 61975 + }, + { + "epoch": 10.110929853181077, + "grad_norm": 0.015436145476996899, + "learning_rate": 0.0005772890890177487, + "loss": 0.0687, + "num_input_tokens_seen": 133866304, + "step": 61980 + }, + { + "epoch": 10.111745513866232, + "grad_norm": 0.25812065601348877, + "learning_rate": 0.0005772187639537405, + "loss": 0.1369, + "num_input_tokens_seen": 133877952, + "step": 61985 + }, + { + "epoch": 10.112561174551386, + "grad_norm": 0.03582283854484558, + "learning_rate": 0.000577148437324796, + "loss": 0.0172, + "num_input_tokens_seen": 133888800, + "step": 61990 + }, + { + "epoch": 10.113376835236542, + "grad_norm": 0.004719121847301722, + "learning_rate": 0.0005770781091323407, + "loss": 0.0038, + "num_input_tokens_seen": 133900448, + "step": 61995 + }, + { + "epoch": 10.114192495921696, + "grad_norm": 0.0029625471215695143, + "learning_rate": 0.0005770077793777996, + "loss": 0.0131, + "num_input_tokens_seen": 133910304, + "step": 62000 + }, + { + "epoch": 10.115008156606851, + "grad_norm": 0.006111313123255968, + "learning_rate": 0.0005769374480625983, + "loss": 0.1681, + "num_input_tokens_seen": 133920640, + "step": 62005 + }, + { + "epoch": 10.115823817292007, + "grad_norm": 0.018650345504283905, + "learning_rate": 0.000576867115188162, + "loss": 0.0207, + "num_input_tokens_seen": 133932800, + "step": 62010 + }, + { + "epoch": 10.116639477977161, + "grad_norm": 0.002345480490475893, + "learning_rate": 0.000576796780755916, + "loss": 0.1642, + "num_input_tokens_seen": 133944608, + "step": 62015 + }, + { + "epoch": 10.117455138662317, + "grad_norm": 0.0071258461102843285, + "learning_rate": 0.0005767264447672859, + "loss": 0.0406, + "num_input_tokens_seen": 133956352, + "step": 62020 + }, + { + "epoch": 10.11827079934747, + "grad_norm": 0.020247265696525574, + "learning_rate": 0.000576656107223697, + "loss": 0.0582, + "num_input_tokens_seen": 133967200, + "step": 62025 + }, + { + "epoch": 10.119086460032626, + "grad_norm": 0.006692581344395876, + "learning_rate": 0.0005765857681265749, + "loss": 0.0067, + "num_input_tokens_seen": 133977216, + "step": 62030 + }, + { + "epoch": 10.119902120717782, + "grad_norm": 0.04609968885779381, + "learning_rate": 0.000576515427477345, + "loss": 0.0105, + "num_input_tokens_seen": 133988256, + "step": 62035 + }, + { + "epoch": 10.120717781402936, + "grad_norm": 0.016456104815006256, + "learning_rate": 0.0005764450852774329, + "loss": 0.0139, + "num_input_tokens_seen": 134000160, + "step": 62040 + }, + { + "epoch": 10.121533442088092, + "grad_norm": 0.01033777091652155, + "learning_rate": 0.0005763747415282642, + "loss": 0.0045, + "num_input_tokens_seen": 134010368, + "step": 62045 + }, + { + "epoch": 10.122349102773246, + "grad_norm": 0.2594705820083618, + "learning_rate": 0.0005763043962312644, + "loss": 0.0173, + "num_input_tokens_seen": 134020576, + "step": 62050 + }, + { + "epoch": 10.123164763458401, + "grad_norm": 0.0942777767777443, + "learning_rate": 0.0005762340493878593, + "loss": 0.095, + "num_input_tokens_seen": 134030656, + "step": 62055 + }, + { + "epoch": 10.123980424143557, + "grad_norm": 0.013030118308961391, + "learning_rate": 0.0005761637009994745, + "loss": 0.0844, + "num_input_tokens_seen": 134042176, + "step": 62060 + }, + { + "epoch": 10.124796084828711, + "grad_norm": 0.001847845152951777, + "learning_rate": 0.0005760933510675356, + "loss": 0.0658, + "num_input_tokens_seen": 134052736, + "step": 62065 + }, + { + "epoch": 10.125611745513867, + "grad_norm": 0.019251855090260506, + "learning_rate": 0.0005760229995934684, + "loss": 0.0617, + "num_input_tokens_seen": 134064032, + "step": 62070 + }, + { + "epoch": 10.12642740619902, + "grad_norm": 0.07578039169311523, + "learning_rate": 0.0005759526465786986, + "loss": 0.0156, + "num_input_tokens_seen": 134075936, + "step": 62075 + }, + { + "epoch": 10.127243066884176, + "grad_norm": 0.027117077261209488, + "learning_rate": 0.0005758822920246523, + "loss": 0.0148, + "num_input_tokens_seen": 134086560, + "step": 62080 + }, + { + "epoch": 10.12805872756933, + "grad_norm": 0.0019536991603672504, + "learning_rate": 0.000575811935932755, + "loss": 0.0139, + "num_input_tokens_seen": 134097728, + "step": 62085 + }, + { + "epoch": 10.128874388254486, + "grad_norm": 0.00918999221175909, + "learning_rate": 0.0005757415783044325, + "loss": 0.027, + "num_input_tokens_seen": 134107712, + "step": 62090 + }, + { + "epoch": 10.129690048939642, + "grad_norm": 0.029966186732053757, + "learning_rate": 0.0005756712191411109, + "loss": 0.0047, + "num_input_tokens_seen": 134119456, + "step": 62095 + }, + { + "epoch": 10.130505709624796, + "grad_norm": 0.3509294390678406, + "learning_rate": 0.0005756008584442161, + "loss": 0.0582, + "num_input_tokens_seen": 134129696, + "step": 62100 + }, + { + "epoch": 10.131321370309951, + "grad_norm": 0.08585608005523682, + "learning_rate": 0.0005755304962151739, + "loss": 0.0674, + "num_input_tokens_seen": 134140096, + "step": 62105 + }, + { + "epoch": 10.132137030995105, + "grad_norm": 0.0772661566734314, + "learning_rate": 0.0005754601324554104, + "loss": 0.0148, + "num_input_tokens_seen": 134151232, + "step": 62110 + }, + { + "epoch": 10.132952691680261, + "grad_norm": 0.002000660402700305, + "learning_rate": 0.0005753897671663518, + "loss": 0.0131, + "num_input_tokens_seen": 134161920, + "step": 62115 + }, + { + "epoch": 10.133768352365417, + "grad_norm": 0.06940358877182007, + "learning_rate": 0.0005753194003494237, + "loss": 0.025, + "num_input_tokens_seen": 134172448, + "step": 62120 + }, + { + "epoch": 10.13458401305057, + "grad_norm": 0.031187528744339943, + "learning_rate": 0.0005752490320060524, + "loss": 0.023, + "num_input_tokens_seen": 134182944, + "step": 62125 + }, + { + "epoch": 10.135399673735726, + "grad_norm": 0.00791481975466013, + "learning_rate": 0.0005751786621376641, + "loss": 0.1079, + "num_input_tokens_seen": 134194176, + "step": 62130 + }, + { + "epoch": 10.13621533442088, + "grad_norm": 0.03876109793782234, + "learning_rate": 0.0005751082907456849, + "loss": 0.0192, + "num_input_tokens_seen": 134205344, + "step": 62135 + }, + { + "epoch": 10.137030995106036, + "grad_norm": 0.01723620668053627, + "learning_rate": 0.0005750379178315408, + "loss": 0.0652, + "num_input_tokens_seen": 134216032, + "step": 62140 + }, + { + "epoch": 10.137846655791192, + "grad_norm": 0.059835128486156464, + "learning_rate": 0.0005749675433966581, + "loss": 0.0057, + "num_input_tokens_seen": 134227840, + "step": 62145 + }, + { + "epoch": 10.138662316476346, + "grad_norm": 0.0036580360028892756, + "learning_rate": 0.0005748971674424631, + "loss": 0.122, + "num_input_tokens_seen": 134237888, + "step": 62150 + }, + { + "epoch": 10.139477977161501, + "grad_norm": 0.09279145300388336, + "learning_rate": 0.0005748267899703819, + "loss": 0.0136, + "num_input_tokens_seen": 134249568, + "step": 62155 + }, + { + "epoch": 10.140293637846655, + "grad_norm": 0.00488590681925416, + "learning_rate": 0.000574756410981841, + "loss": 0.0038, + "num_input_tokens_seen": 134260736, + "step": 62160 + }, + { + "epoch": 10.141109298531811, + "grad_norm": 0.03295081481337547, + "learning_rate": 0.0005746860304782665, + "loss": 0.0761, + "num_input_tokens_seen": 134270784, + "step": 62165 + }, + { + "epoch": 10.141924959216965, + "grad_norm": 0.048429399728775024, + "learning_rate": 0.0005746156484610849, + "loss": 0.0327, + "num_input_tokens_seen": 134280928, + "step": 62170 + }, + { + "epoch": 10.14274061990212, + "grad_norm": 0.004736216738820076, + "learning_rate": 0.0005745452649317225, + "loss": 0.0293, + "num_input_tokens_seen": 134292352, + "step": 62175 + }, + { + "epoch": 10.143556280587276, + "grad_norm": 0.021222488954663277, + "learning_rate": 0.0005744748798916057, + "loss": 0.0179, + "num_input_tokens_seen": 134302880, + "step": 62180 + }, + { + "epoch": 10.14437194127243, + "grad_norm": 0.003893442451953888, + "learning_rate": 0.0005744044933421609, + "loss": 0.0136, + "num_input_tokens_seen": 134314240, + "step": 62185 + }, + { + "epoch": 10.145187601957586, + "grad_norm": 0.004684086889028549, + "learning_rate": 0.0005743341052848147, + "loss": 0.0761, + "num_input_tokens_seen": 134326144, + "step": 62190 + }, + { + "epoch": 10.14600326264274, + "grad_norm": 0.0026035832706838846, + "learning_rate": 0.0005742637157209936, + "loss": 0.1189, + "num_input_tokens_seen": 134335808, + "step": 62195 + }, + { + "epoch": 10.146818923327896, + "grad_norm": 0.0014494028873741627, + "learning_rate": 0.0005741933246521243, + "loss": 0.0027, + "num_input_tokens_seen": 134347072, + "step": 62200 + }, + { + "epoch": 10.147634584013051, + "grad_norm": 0.032787173986434937, + "learning_rate": 0.0005741229320796329, + "loss": 0.0069, + "num_input_tokens_seen": 134358080, + "step": 62205 + }, + { + "epoch": 10.148450244698205, + "grad_norm": 0.019167525693774223, + "learning_rate": 0.0005740525380049464, + "loss": 0.0263, + "num_input_tokens_seen": 134369504, + "step": 62210 + }, + { + "epoch": 10.149265905383361, + "grad_norm": 0.031887758523225784, + "learning_rate": 0.0005739821424294911, + "loss": 0.0095, + "num_input_tokens_seen": 134379008, + "step": 62215 + }, + { + "epoch": 10.150081566068515, + "grad_norm": 0.018726017326116562, + "learning_rate": 0.000573911745354694, + "loss": 0.0169, + "num_input_tokens_seen": 134390624, + "step": 62220 + }, + { + "epoch": 10.15089722675367, + "grad_norm": 0.15560075640678406, + "learning_rate": 0.0005738413467819816, + "loss": 0.0187, + "num_input_tokens_seen": 134402304, + "step": 62225 + }, + { + "epoch": 10.151712887438826, + "grad_norm": 0.026487508788704872, + "learning_rate": 0.0005737709467127805, + "loss": 0.1375, + "num_input_tokens_seen": 134412320, + "step": 62230 + }, + { + "epoch": 10.15252854812398, + "grad_norm": 0.023451926186680794, + "learning_rate": 0.0005737005451485177, + "loss": 0.1566, + "num_input_tokens_seen": 134423072, + "step": 62235 + }, + { + "epoch": 10.153344208809136, + "grad_norm": 0.04784998297691345, + "learning_rate": 0.0005736301420906196, + "loss": 0.0129, + "num_input_tokens_seen": 134434880, + "step": 62240 + }, + { + "epoch": 10.15415986949429, + "grad_norm": 0.25198274850845337, + "learning_rate": 0.0005735597375405135, + "loss": 0.0616, + "num_input_tokens_seen": 134446688, + "step": 62245 + }, + { + "epoch": 10.154975530179446, + "grad_norm": 0.10130038857460022, + "learning_rate": 0.000573489331499626, + "loss": 0.2152, + "num_input_tokens_seen": 134457632, + "step": 62250 + }, + { + "epoch": 10.1557911908646, + "grad_norm": 0.011092791333794594, + "learning_rate": 0.000573418923969384, + "loss": 0.0103, + "num_input_tokens_seen": 134467456, + "step": 62255 + }, + { + "epoch": 10.156606851549755, + "grad_norm": 0.03461114689707756, + "learning_rate": 0.0005733485149512143, + "loss": 0.1422, + "num_input_tokens_seen": 134478880, + "step": 62260 + }, + { + "epoch": 10.15742251223491, + "grad_norm": 0.47994375228881836, + "learning_rate": 0.000573278104446544, + "loss": 0.1253, + "num_input_tokens_seen": 134490912, + "step": 62265 + }, + { + "epoch": 10.158238172920065, + "grad_norm": 0.013990727253258228, + "learning_rate": 0.0005732076924567999, + "loss": 0.0087, + "num_input_tokens_seen": 134499872, + "step": 62270 + }, + { + "epoch": 10.15905383360522, + "grad_norm": 0.007365802302956581, + "learning_rate": 0.0005731372789834089, + "loss": 0.0073, + "num_input_tokens_seen": 134510944, + "step": 62275 + }, + { + "epoch": 10.159869494290374, + "grad_norm": 0.010104432702064514, + "learning_rate": 0.0005730668640277983, + "loss": 0.1117, + "num_input_tokens_seen": 134522464, + "step": 62280 + }, + { + "epoch": 10.16068515497553, + "grad_norm": 0.026980755850672722, + "learning_rate": 0.0005729964475913949, + "loss": 0.157, + "num_input_tokens_seen": 134532992, + "step": 62285 + }, + { + "epoch": 10.161500815660686, + "grad_norm": 0.003204572247341275, + "learning_rate": 0.0005729260296756259, + "loss": 0.1405, + "num_input_tokens_seen": 134542688, + "step": 62290 + }, + { + "epoch": 10.16231647634584, + "grad_norm": 0.13098019361495972, + "learning_rate": 0.0005728556102819185, + "loss": 0.0433, + "num_input_tokens_seen": 134553888, + "step": 62295 + }, + { + "epoch": 10.163132137030995, + "grad_norm": 0.0058778622187674046, + "learning_rate": 0.0005727851894116997, + "loss": 0.0117, + "num_input_tokens_seen": 134565088, + "step": 62300 + }, + { + "epoch": 10.16394779771615, + "grad_norm": 0.017019178718328476, + "learning_rate": 0.0005727147670663967, + "loss": 0.0125, + "num_input_tokens_seen": 134574080, + "step": 62305 + }, + { + "epoch": 10.164763458401305, + "grad_norm": 0.007292016409337521, + "learning_rate": 0.0005726443432474366, + "loss": 0.0079, + "num_input_tokens_seen": 134585536, + "step": 62310 + }, + { + "epoch": 10.16557911908646, + "grad_norm": 0.002903412329033017, + "learning_rate": 0.0005725739179562469, + "loss": 0.1014, + "num_input_tokens_seen": 134596512, + "step": 62315 + }, + { + "epoch": 10.166394779771615, + "grad_norm": 0.06075170263648033, + "learning_rate": 0.0005725034911942546, + "loss": 0.0459, + "num_input_tokens_seen": 134608192, + "step": 62320 + }, + { + "epoch": 10.16721044045677, + "grad_norm": 0.15462207794189453, + "learning_rate": 0.0005724330629628871, + "loss": 0.0748, + "num_input_tokens_seen": 134618688, + "step": 62325 + }, + { + "epoch": 10.168026101141924, + "grad_norm": 0.015479236841201782, + "learning_rate": 0.0005723626332635717, + "loss": 0.0093, + "num_input_tokens_seen": 134628128, + "step": 62330 + }, + { + "epoch": 10.16884176182708, + "grad_norm": 0.010428624227643013, + "learning_rate": 0.0005722922020977356, + "loss": 0.0492, + "num_input_tokens_seen": 134639200, + "step": 62335 + }, + { + "epoch": 10.169657422512234, + "grad_norm": 0.00993596762418747, + "learning_rate": 0.0005722217694668065, + "loss": 0.0162, + "num_input_tokens_seen": 134648192, + "step": 62340 + }, + { + "epoch": 10.17047308319739, + "grad_norm": 0.2954118549823761, + "learning_rate": 0.0005721513353722116, + "loss": 0.1768, + "num_input_tokens_seen": 134658784, + "step": 62345 + }, + { + "epoch": 10.171288743882545, + "grad_norm": 0.34019842743873596, + "learning_rate": 0.0005720808998153782, + "loss": 0.031, + "num_input_tokens_seen": 134670496, + "step": 62350 + }, + { + "epoch": 10.1721044045677, + "grad_norm": 0.055857911705970764, + "learning_rate": 0.000572010462797734, + "loss": 0.0216, + "num_input_tokens_seen": 134682272, + "step": 62355 + }, + { + "epoch": 10.172920065252855, + "grad_norm": 0.0796642005443573, + "learning_rate": 0.0005719400243207065, + "loss": 0.0347, + "num_input_tokens_seen": 134695008, + "step": 62360 + }, + { + "epoch": 10.173735725938009, + "grad_norm": 0.051125604659318924, + "learning_rate": 0.0005718695843857231, + "loss": 0.0959, + "num_input_tokens_seen": 134706144, + "step": 62365 + }, + { + "epoch": 10.174551386623165, + "grad_norm": 0.6277033686637878, + "learning_rate": 0.0005717991429942114, + "loss": 0.0287, + "num_input_tokens_seen": 134716896, + "step": 62370 + }, + { + "epoch": 10.17536704730832, + "grad_norm": 0.1190236285328865, + "learning_rate": 0.000571728700147599, + "loss": 0.1365, + "num_input_tokens_seen": 134726368, + "step": 62375 + }, + { + "epoch": 10.176182707993474, + "grad_norm": 0.5035422444343567, + "learning_rate": 0.0005716582558473136, + "loss": 0.054, + "num_input_tokens_seen": 134737568, + "step": 62380 + }, + { + "epoch": 10.17699836867863, + "grad_norm": 0.10199093073606491, + "learning_rate": 0.0005715878100947824, + "loss": 0.0832, + "num_input_tokens_seen": 134747168, + "step": 62385 + }, + { + "epoch": 10.177814029363784, + "grad_norm": 0.09676895290613174, + "learning_rate": 0.0005715173628914336, + "loss": 0.0742, + "num_input_tokens_seen": 134758464, + "step": 62390 + }, + { + "epoch": 10.17862969004894, + "grad_norm": 0.040533117949962616, + "learning_rate": 0.0005714469142386948, + "loss": 0.0183, + "num_input_tokens_seen": 134769760, + "step": 62395 + }, + { + "epoch": 10.179445350734095, + "grad_norm": 0.05296805128455162, + "learning_rate": 0.0005713764641379936, + "loss": 0.0145, + "num_input_tokens_seen": 134780896, + "step": 62400 + }, + { + "epoch": 10.18026101141925, + "grad_norm": 0.03004343807697296, + "learning_rate": 0.0005713060125907578, + "loss": 0.0247, + "num_input_tokens_seen": 134790624, + "step": 62405 + }, + { + "epoch": 10.181076672104405, + "grad_norm": 0.009422010742127895, + "learning_rate": 0.0005712355595984151, + "loss": 0.1624, + "num_input_tokens_seen": 134801952, + "step": 62410 + }, + { + "epoch": 10.181892332789559, + "grad_norm": 0.009660206735134125, + "learning_rate": 0.0005711651051623935, + "loss": 0.0074, + "num_input_tokens_seen": 134813472, + "step": 62415 + }, + { + "epoch": 10.182707993474715, + "grad_norm": 0.07207388430833817, + "learning_rate": 0.0005710946492841208, + "loss": 0.1007, + "num_input_tokens_seen": 134824512, + "step": 62420 + }, + { + "epoch": 10.18352365415987, + "grad_norm": 0.05100369080901146, + "learning_rate": 0.0005710241919650248, + "loss": 0.0882, + "num_input_tokens_seen": 134834816, + "step": 62425 + }, + { + "epoch": 10.184339314845024, + "grad_norm": 0.3938085436820984, + "learning_rate": 0.0005709537332065335, + "loss": 0.0666, + "num_input_tokens_seen": 134845280, + "step": 62430 + }, + { + "epoch": 10.18515497553018, + "grad_norm": 0.007621200289577246, + "learning_rate": 0.0005708832730100747, + "loss": 0.0403, + "num_input_tokens_seen": 134855744, + "step": 62435 + }, + { + "epoch": 10.185970636215334, + "grad_norm": 0.12467098236083984, + "learning_rate": 0.0005708128113770765, + "loss": 0.0412, + "num_input_tokens_seen": 134865568, + "step": 62440 + }, + { + "epoch": 10.18678629690049, + "grad_norm": 0.007261715363711119, + "learning_rate": 0.0005707423483089669, + "loss": 0.0355, + "num_input_tokens_seen": 134877344, + "step": 62445 + }, + { + "epoch": 10.187601957585644, + "grad_norm": 0.11845030635595322, + "learning_rate": 0.0005706718838071738, + "loss": 0.1115, + "num_input_tokens_seen": 134888896, + "step": 62450 + }, + { + "epoch": 10.1884176182708, + "grad_norm": 0.002159345429390669, + "learning_rate": 0.0005706014178731253, + "loss": 0.0134, + "num_input_tokens_seen": 134900000, + "step": 62455 + }, + { + "epoch": 10.189233278955955, + "grad_norm": 0.0020519730169326067, + "learning_rate": 0.0005705309505082496, + "loss": 0.0123, + "num_input_tokens_seen": 134909568, + "step": 62460 + }, + { + "epoch": 10.190048939641109, + "grad_norm": 0.1217883750796318, + "learning_rate": 0.0005704604817139747, + "loss": 0.0287, + "num_input_tokens_seen": 134920704, + "step": 62465 + }, + { + "epoch": 10.190864600326265, + "grad_norm": 0.011312576942145824, + "learning_rate": 0.0005703900114917286, + "loss": 0.0129, + "num_input_tokens_seen": 134932736, + "step": 62470 + }, + { + "epoch": 10.191680261011419, + "grad_norm": 0.004563915077596903, + "learning_rate": 0.0005703195398429397, + "loss": 0.0948, + "num_input_tokens_seen": 134943616, + "step": 62475 + }, + { + "epoch": 10.192495921696574, + "grad_norm": 0.10664792358875275, + "learning_rate": 0.0005702490667690363, + "loss": 0.0306, + "num_input_tokens_seen": 134954560, + "step": 62480 + }, + { + "epoch": 10.19331158238173, + "grad_norm": 0.212308868765831, + "learning_rate": 0.0005701785922714461, + "loss": 0.1525, + "num_input_tokens_seen": 134964192, + "step": 62485 + }, + { + "epoch": 10.194127243066884, + "grad_norm": 0.04527709260582924, + "learning_rate": 0.000570108116351598, + "loss": 0.036, + "num_input_tokens_seen": 134974976, + "step": 62490 + }, + { + "epoch": 10.19494290375204, + "grad_norm": 0.009464547038078308, + "learning_rate": 0.0005700376390109198, + "loss": 0.0156, + "num_input_tokens_seen": 134985696, + "step": 62495 + }, + { + "epoch": 10.195758564437194, + "grad_norm": 0.0898800790309906, + "learning_rate": 0.00056996716025084, + "loss": 0.0191, + "num_input_tokens_seen": 134996736, + "step": 62500 + }, + { + "epoch": 10.19657422512235, + "grad_norm": 0.05027703940868378, + "learning_rate": 0.000569896680072787, + "loss": 0.0106, + "num_input_tokens_seen": 135005856, + "step": 62505 + }, + { + "epoch": 10.197389885807505, + "grad_norm": 0.06976497173309326, + "learning_rate": 0.0005698261984781891, + "loss": 0.0189, + "num_input_tokens_seen": 135018400, + "step": 62510 + }, + { + "epoch": 10.198205546492659, + "grad_norm": 0.0337468683719635, + "learning_rate": 0.0005697557154684749, + "loss": 0.0147, + "num_input_tokens_seen": 135028448, + "step": 62515 + }, + { + "epoch": 10.199021207177815, + "grad_norm": 0.18263459205627441, + "learning_rate": 0.0005696852310450723, + "loss": 0.0178, + "num_input_tokens_seen": 135038080, + "step": 62520 + }, + { + "epoch": 10.199836867862969, + "grad_norm": 0.45159977674484253, + "learning_rate": 0.0005696147452094102, + "loss": 0.0384, + "num_input_tokens_seen": 135047392, + "step": 62525 + }, + { + "epoch": 10.200652528548124, + "grad_norm": 0.010472620837390423, + "learning_rate": 0.000569544257962917, + "loss": 0.0159, + "num_input_tokens_seen": 135057856, + "step": 62530 + }, + { + "epoch": 10.201468189233278, + "grad_norm": 0.06547081470489502, + "learning_rate": 0.0005694737693070213, + "loss": 0.0129, + "num_input_tokens_seen": 135069632, + "step": 62535 + }, + { + "epoch": 10.202283849918434, + "grad_norm": 0.005394492298364639, + "learning_rate": 0.0005694032792431515, + "loss": 0.0796, + "num_input_tokens_seen": 135080448, + "step": 62540 + }, + { + "epoch": 10.20309951060359, + "grad_norm": 0.00928778387606144, + "learning_rate": 0.0005693327877727361, + "loss": 0.1307, + "num_input_tokens_seen": 135091904, + "step": 62545 + }, + { + "epoch": 10.203915171288743, + "grad_norm": 0.003517286153510213, + "learning_rate": 0.0005692622948972039, + "loss": 0.1866, + "num_input_tokens_seen": 135102176, + "step": 62550 + }, + { + "epoch": 10.2047308319739, + "grad_norm": 0.008070231415331364, + "learning_rate": 0.0005691918006179833, + "loss": 0.0158, + "num_input_tokens_seen": 135113568, + "step": 62555 + }, + { + "epoch": 10.205546492659053, + "grad_norm": 0.021824125200510025, + "learning_rate": 0.0005691213049365031, + "loss": 0.0186, + "num_input_tokens_seen": 135124896, + "step": 62560 + }, + { + "epoch": 10.206362153344209, + "grad_norm": 0.12865550816059113, + "learning_rate": 0.000569050807854192, + "loss": 0.0138, + "num_input_tokens_seen": 135136768, + "step": 62565 + }, + { + "epoch": 10.207177814029365, + "grad_norm": 0.02046484500169754, + "learning_rate": 0.0005689803093724788, + "loss": 0.0095, + "num_input_tokens_seen": 135147488, + "step": 62570 + }, + { + "epoch": 10.207993474714518, + "grad_norm": 0.15102674067020416, + "learning_rate": 0.0005689098094927921, + "loss": 0.0391, + "num_input_tokens_seen": 135158240, + "step": 62575 + }, + { + "epoch": 10.208809135399674, + "grad_norm": 0.005512281786650419, + "learning_rate": 0.0005688393082165605, + "loss": 0.0349, + "num_input_tokens_seen": 135169984, + "step": 62580 + }, + { + "epoch": 10.209624796084828, + "grad_norm": 0.004724299535155296, + "learning_rate": 0.0005687688055452132, + "loss": 0.0119, + "num_input_tokens_seen": 135180832, + "step": 62585 + }, + { + "epoch": 10.210440456769984, + "grad_norm": 0.01652977615594864, + "learning_rate": 0.0005686983014801787, + "loss": 0.0129, + "num_input_tokens_seen": 135191584, + "step": 62590 + }, + { + "epoch": 10.21125611745514, + "grad_norm": 0.0027591027319431305, + "learning_rate": 0.000568627796022886, + "loss": 0.0149, + "num_input_tokens_seen": 135202048, + "step": 62595 + }, + { + "epoch": 10.212071778140293, + "grad_norm": 0.11414124071598053, + "learning_rate": 0.0005685572891747639, + "loss": 0.0483, + "num_input_tokens_seen": 135212384, + "step": 62600 + }, + { + "epoch": 10.21288743882545, + "grad_norm": 0.018032826483249664, + "learning_rate": 0.0005684867809372415, + "loss": 0.0211, + "num_input_tokens_seen": 135223456, + "step": 62605 + }, + { + "epoch": 10.213703099510603, + "grad_norm": 0.2531827688217163, + "learning_rate": 0.0005684162713117473, + "loss": 0.1153, + "num_input_tokens_seen": 135235488, + "step": 62610 + }, + { + "epoch": 10.214518760195759, + "grad_norm": 0.5033522248268127, + "learning_rate": 0.0005683457602997108, + "loss": 0.0361, + "num_input_tokens_seen": 135245376, + "step": 62615 + }, + { + "epoch": 10.215334420880913, + "grad_norm": 0.06980666518211365, + "learning_rate": 0.0005682752479025608, + "loss": 0.0058, + "num_input_tokens_seen": 135255968, + "step": 62620 + }, + { + "epoch": 10.216150081566068, + "grad_norm": 0.0026550409384071827, + "learning_rate": 0.0005682047341217262, + "loss": 0.0324, + "num_input_tokens_seen": 135267392, + "step": 62625 + }, + { + "epoch": 10.216965742251224, + "grad_norm": 0.23229344189167023, + "learning_rate": 0.0005681342189586362, + "loss": 0.0303, + "num_input_tokens_seen": 135277568, + "step": 62630 + }, + { + "epoch": 10.217781402936378, + "grad_norm": 0.19436459243297577, + "learning_rate": 0.0005680637024147199, + "loss": 0.0422, + "num_input_tokens_seen": 135286944, + "step": 62635 + }, + { + "epoch": 10.218597063621534, + "grad_norm": 0.008587691932916641, + "learning_rate": 0.0005679931844914061, + "loss": 0.0498, + "num_input_tokens_seen": 135296960, + "step": 62640 + }, + { + "epoch": 10.219412724306688, + "grad_norm": 0.5537342429161072, + "learning_rate": 0.0005679226651901243, + "loss": 0.1044, + "num_input_tokens_seen": 135307520, + "step": 62645 + }, + { + "epoch": 10.220228384991843, + "grad_norm": 0.17528803646564484, + "learning_rate": 0.0005678521445123036, + "loss": 0.0279, + "num_input_tokens_seen": 135315936, + "step": 62650 + }, + { + "epoch": 10.221044045676999, + "grad_norm": 0.0012471231166273355, + "learning_rate": 0.0005677816224593731, + "loss": 0.029, + "num_input_tokens_seen": 135326656, + "step": 62655 + }, + { + "epoch": 10.221859706362153, + "grad_norm": 0.7135857343673706, + "learning_rate": 0.0005677110990327618, + "loss": 0.1871, + "num_input_tokens_seen": 135337184, + "step": 62660 + }, + { + "epoch": 10.222675367047309, + "grad_norm": 0.4663824141025543, + "learning_rate": 0.0005676405742338995, + "loss": 0.0679, + "num_input_tokens_seen": 135348096, + "step": 62665 + }, + { + "epoch": 10.223491027732463, + "grad_norm": 0.010357555001974106, + "learning_rate": 0.0005675700480642149, + "loss": 0.0056, + "num_input_tokens_seen": 135358688, + "step": 62670 + }, + { + "epoch": 10.224306688417618, + "grad_norm": 0.002061615465208888, + "learning_rate": 0.0005674995205251376, + "loss": 0.025, + "num_input_tokens_seen": 135367840, + "step": 62675 + }, + { + "epoch": 10.225122349102774, + "grad_norm": 0.011426280252635479, + "learning_rate": 0.000567428991618097, + "loss": 0.0245, + "num_input_tokens_seen": 135378336, + "step": 62680 + }, + { + "epoch": 10.225938009787928, + "grad_norm": 0.005902221892029047, + "learning_rate": 0.0005673584613445223, + "loss": 0.0179, + "num_input_tokens_seen": 135389312, + "step": 62685 + }, + { + "epoch": 10.226753670473084, + "grad_norm": 0.015893463045358658, + "learning_rate": 0.000567287929705843, + "loss": 0.0067, + "num_input_tokens_seen": 135398496, + "step": 62690 + }, + { + "epoch": 10.227569331158238, + "grad_norm": 0.3865486979484558, + "learning_rate": 0.0005672173967034883, + "loss": 0.2104, + "num_input_tokens_seen": 135409024, + "step": 62695 + }, + { + "epoch": 10.228384991843393, + "grad_norm": 0.0027782840188592672, + "learning_rate": 0.0005671468623388878, + "loss": 0.0176, + "num_input_tokens_seen": 135419168, + "step": 62700 + }, + { + "epoch": 10.229200652528547, + "grad_norm": 0.010938719846308231, + "learning_rate": 0.000567076326613471, + "loss": 0.0374, + "num_input_tokens_seen": 135430368, + "step": 62705 + }, + { + "epoch": 10.230016313213703, + "grad_norm": 0.027225926518440247, + "learning_rate": 0.0005670057895286674, + "loss": 0.0756, + "num_input_tokens_seen": 135441408, + "step": 62710 + }, + { + "epoch": 10.230831973898859, + "grad_norm": 0.09897179901599884, + "learning_rate": 0.0005669352510859063, + "loss": 0.0988, + "num_input_tokens_seen": 135452160, + "step": 62715 + }, + { + "epoch": 10.231647634584013, + "grad_norm": 0.021300874650478363, + "learning_rate": 0.0005668647112866175, + "loss": 0.005, + "num_input_tokens_seen": 135463296, + "step": 62720 + }, + { + "epoch": 10.232463295269168, + "grad_norm": 0.00661829486489296, + "learning_rate": 0.0005667941701322305, + "loss": 0.0571, + "num_input_tokens_seen": 135473984, + "step": 62725 + }, + { + "epoch": 10.233278955954322, + "grad_norm": 0.007071008440107107, + "learning_rate": 0.000566723627624175, + "loss": 0.0259, + "num_input_tokens_seen": 135485632, + "step": 62730 + }, + { + "epoch": 10.234094616639478, + "grad_norm": 0.04191211238503456, + "learning_rate": 0.0005666530837638805, + "loss": 0.0137, + "num_input_tokens_seen": 135495968, + "step": 62735 + }, + { + "epoch": 10.234910277324634, + "grad_norm": 0.0013803663896396756, + "learning_rate": 0.0005665825385527766, + "loss": 0.0056, + "num_input_tokens_seen": 135506304, + "step": 62740 + }, + { + "epoch": 10.235725938009788, + "grad_norm": 0.02503710426390171, + "learning_rate": 0.0005665119919922932, + "loss": 0.0062, + "num_input_tokens_seen": 135516640, + "step": 62745 + }, + { + "epoch": 10.236541598694943, + "grad_norm": 0.0530230738222599, + "learning_rate": 0.0005664414440838598, + "loss": 0.1141, + "num_input_tokens_seen": 135527840, + "step": 62750 + }, + { + "epoch": 10.237357259380097, + "grad_norm": 0.23326678574085236, + "learning_rate": 0.0005663708948289065, + "loss": 0.0281, + "num_input_tokens_seen": 135537888, + "step": 62755 + }, + { + "epoch": 10.238172920065253, + "grad_norm": 0.019284890964627266, + "learning_rate": 0.0005663003442288626, + "loss": 0.0147, + "num_input_tokens_seen": 135548736, + "step": 62760 + }, + { + "epoch": 10.238988580750409, + "grad_norm": 0.010017280466854572, + "learning_rate": 0.0005662297922851583, + "loss": 0.0994, + "num_input_tokens_seen": 135557376, + "step": 62765 + }, + { + "epoch": 10.239804241435563, + "grad_norm": 0.022153059020638466, + "learning_rate": 0.0005661592389992231, + "loss": 0.0235, + "num_input_tokens_seen": 135567328, + "step": 62770 + }, + { + "epoch": 10.240619902120718, + "grad_norm": 0.0728738009929657, + "learning_rate": 0.0005660886843724869, + "loss": 0.0451, + "num_input_tokens_seen": 135578624, + "step": 62775 + }, + { + "epoch": 10.241435562805872, + "grad_norm": 0.01032840833067894, + "learning_rate": 0.0005660181284063798, + "loss": 0.0168, + "num_input_tokens_seen": 135588800, + "step": 62780 + }, + { + "epoch": 10.242251223491028, + "grad_norm": 0.009232861921191216, + "learning_rate": 0.0005659475711023317, + "loss": 0.0637, + "num_input_tokens_seen": 135599360, + "step": 62785 + }, + { + "epoch": 10.243066884176184, + "grad_norm": 0.021632635965943336, + "learning_rate": 0.0005658770124617722, + "loss": 0.164, + "num_input_tokens_seen": 135608960, + "step": 62790 + }, + { + "epoch": 10.243882544861338, + "grad_norm": 0.005081352312117815, + "learning_rate": 0.0005658064524861315, + "loss": 0.0099, + "num_input_tokens_seen": 135619744, + "step": 62795 + }, + { + "epoch": 10.244698205546493, + "grad_norm": 0.13025851547718048, + "learning_rate": 0.0005657358911768395, + "loss": 0.0129, + "num_input_tokens_seen": 135630016, + "step": 62800 + }, + { + "epoch": 10.245513866231647, + "grad_norm": 0.006488516461104155, + "learning_rate": 0.0005656653285353265, + "loss": 0.0647, + "num_input_tokens_seen": 135640480, + "step": 62805 + }, + { + "epoch": 10.246329526916803, + "grad_norm": 0.01548718847334385, + "learning_rate": 0.0005655947645630222, + "loss": 0.0346, + "num_input_tokens_seen": 135651328, + "step": 62810 + }, + { + "epoch": 10.247145187601957, + "grad_norm": 0.031630516052246094, + "learning_rate": 0.0005655241992613566, + "loss": 0.0084, + "num_input_tokens_seen": 135662976, + "step": 62815 + }, + { + "epoch": 10.247960848287113, + "grad_norm": 0.17202773690223694, + "learning_rate": 0.0005654536326317602, + "loss": 0.0239, + "num_input_tokens_seen": 135673920, + "step": 62820 + }, + { + "epoch": 10.248776508972268, + "grad_norm": 0.016138330101966858, + "learning_rate": 0.0005653830646756629, + "loss": 0.0047, + "num_input_tokens_seen": 135684672, + "step": 62825 + }, + { + "epoch": 10.249592169657422, + "grad_norm": 0.00821363739669323, + "learning_rate": 0.0005653124953944947, + "loss": 0.0151, + "num_input_tokens_seen": 135695232, + "step": 62830 + }, + { + "epoch": 10.250407830342578, + "grad_norm": 0.0317390076816082, + "learning_rate": 0.0005652419247896861, + "loss": 0.0118, + "num_input_tokens_seen": 135706432, + "step": 62835 + }, + { + "epoch": 10.251223491027732, + "grad_norm": 0.0038922594394534826, + "learning_rate": 0.000565171352862667, + "loss": 0.0434, + "num_input_tokens_seen": 135717536, + "step": 62840 + }, + { + "epoch": 10.252039151712887, + "grad_norm": 0.03252030164003372, + "learning_rate": 0.0005651007796148678, + "loss": 0.0082, + "num_input_tokens_seen": 135729856, + "step": 62845 + }, + { + "epoch": 10.252854812398043, + "grad_norm": 0.025612108409404755, + "learning_rate": 0.0005650302050477187, + "loss": 0.0097, + "num_input_tokens_seen": 135740000, + "step": 62850 + }, + { + "epoch": 10.253670473083197, + "grad_norm": 0.2749195992946625, + "learning_rate": 0.0005649596291626501, + "loss": 0.0855, + "num_input_tokens_seen": 135749888, + "step": 62855 + }, + { + "epoch": 10.254486133768353, + "grad_norm": 0.20973838865756989, + "learning_rate": 0.0005648890519610921, + "loss": 0.0515, + "num_input_tokens_seen": 135760992, + "step": 62860 + }, + { + "epoch": 10.255301794453507, + "grad_norm": 0.3340967893600464, + "learning_rate": 0.0005648184734444753, + "loss": 0.122, + "num_input_tokens_seen": 135769792, + "step": 62865 + }, + { + "epoch": 10.256117455138662, + "grad_norm": 0.01708339713513851, + "learning_rate": 0.0005647478936142296, + "loss": 0.0249, + "num_input_tokens_seen": 135781568, + "step": 62870 + }, + { + "epoch": 10.256933115823816, + "grad_norm": 0.01038318034261465, + "learning_rate": 0.0005646773124717858, + "loss": 0.0541, + "num_input_tokens_seen": 135792256, + "step": 62875 + }, + { + "epoch": 10.257748776508972, + "grad_norm": 0.004717283882200718, + "learning_rate": 0.0005646067300185744, + "loss": 0.0263, + "num_input_tokens_seen": 135803424, + "step": 62880 + }, + { + "epoch": 10.258564437194128, + "grad_norm": 0.03085217997431755, + "learning_rate": 0.0005645361462560256, + "loss": 0.0809, + "num_input_tokens_seen": 135814048, + "step": 62885 + }, + { + "epoch": 10.259380097879282, + "grad_norm": 0.006718597374856472, + "learning_rate": 0.0005644655611855698, + "loss": 0.0866, + "num_input_tokens_seen": 135825536, + "step": 62890 + }, + { + "epoch": 10.260195758564437, + "grad_norm": 0.004600659478455782, + "learning_rate": 0.0005643949748086377, + "loss": 0.0735, + "num_input_tokens_seen": 135837248, + "step": 62895 + }, + { + "epoch": 10.261011419249591, + "grad_norm": 0.008998743258416653, + "learning_rate": 0.0005643243871266598, + "loss": 0.0995, + "num_input_tokens_seen": 135847936, + "step": 62900 + }, + { + "epoch": 10.261827079934747, + "grad_norm": 0.013277465477585793, + "learning_rate": 0.0005642537981410665, + "loss": 0.1557, + "num_input_tokens_seen": 135859680, + "step": 62905 + }, + { + "epoch": 10.262642740619903, + "grad_norm": 0.0034990364219993353, + "learning_rate": 0.0005641832078532886, + "loss": 0.0127, + "num_input_tokens_seen": 135870880, + "step": 62910 + }, + { + "epoch": 10.263458401305057, + "grad_norm": 0.2049998939037323, + "learning_rate": 0.0005641126162647564, + "loss": 0.0452, + "num_input_tokens_seen": 135881568, + "step": 62915 + }, + { + "epoch": 10.264274061990212, + "grad_norm": 0.1066952794790268, + "learning_rate": 0.0005640420233769008, + "loss": 0.0869, + "num_input_tokens_seen": 135892800, + "step": 62920 + }, + { + "epoch": 10.265089722675366, + "grad_norm": 0.03311437368392944, + "learning_rate": 0.0005639714291911524, + "loss": 0.0305, + "num_input_tokens_seen": 135903712, + "step": 62925 + }, + { + "epoch": 10.265905383360522, + "grad_norm": 0.022299086675047874, + "learning_rate": 0.0005639008337089416, + "loss": 0.0416, + "num_input_tokens_seen": 135914240, + "step": 62930 + }, + { + "epoch": 10.266721044045678, + "grad_norm": 0.018949246034026146, + "learning_rate": 0.0005638302369316995, + "loss": 0.0721, + "num_input_tokens_seen": 135925760, + "step": 62935 + }, + { + "epoch": 10.267536704730832, + "grad_norm": 0.019130868837237358, + "learning_rate": 0.0005637596388608567, + "loss": 0.0275, + "num_input_tokens_seen": 135936096, + "step": 62940 + }, + { + "epoch": 10.268352365415987, + "grad_norm": 0.36915987730026245, + "learning_rate": 0.0005636890394978439, + "loss": 0.1173, + "num_input_tokens_seen": 135946144, + "step": 62945 + }, + { + "epoch": 10.269168026101141, + "grad_norm": 0.15957792103290558, + "learning_rate": 0.0005636184388440919, + "loss": 0.0772, + "num_input_tokens_seen": 135957152, + "step": 62950 + }, + { + "epoch": 10.269983686786297, + "grad_norm": 0.010827888734638691, + "learning_rate": 0.0005635478369010316, + "loss": 0.0212, + "num_input_tokens_seen": 135967328, + "step": 62955 + }, + { + "epoch": 10.270799347471453, + "grad_norm": 0.02170558087527752, + "learning_rate": 0.0005634772336700937, + "loss": 0.0751, + "num_input_tokens_seen": 135977472, + "step": 62960 + }, + { + "epoch": 10.271615008156607, + "grad_norm": 0.012610095553100109, + "learning_rate": 0.0005634066291527092, + "loss": 0.0074, + "num_input_tokens_seen": 135988032, + "step": 62965 + }, + { + "epoch": 10.272430668841762, + "grad_norm": 0.24561123549938202, + "learning_rate": 0.000563336023350309, + "loss": 0.0205, + "num_input_tokens_seen": 135998432, + "step": 62970 + }, + { + "epoch": 10.273246329526916, + "grad_norm": 0.007883837446570396, + "learning_rate": 0.0005632654162643239, + "loss": 0.0414, + "num_input_tokens_seen": 136009536, + "step": 62975 + }, + { + "epoch": 10.274061990212072, + "grad_norm": 0.010281133465468884, + "learning_rate": 0.0005631948078961847, + "loss": 0.0081, + "num_input_tokens_seen": 136020480, + "step": 62980 + }, + { + "epoch": 10.274877650897226, + "grad_norm": 0.019367830827832222, + "learning_rate": 0.0005631241982473227, + "loss": 0.0205, + "num_input_tokens_seen": 136031136, + "step": 62985 + }, + { + "epoch": 10.275693311582382, + "grad_norm": 0.20234939455986023, + "learning_rate": 0.0005630535873191687, + "loss": 0.0314, + "num_input_tokens_seen": 136043232, + "step": 62990 + }, + { + "epoch": 10.276508972267537, + "grad_norm": 0.2768293619155884, + "learning_rate": 0.0005629829751131538, + "loss": 0.0386, + "num_input_tokens_seen": 136053888, + "step": 62995 + }, + { + "epoch": 10.277324632952691, + "grad_norm": 0.3425898253917694, + "learning_rate": 0.0005629123616307089, + "loss": 0.168, + "num_input_tokens_seen": 136064160, + "step": 63000 + }, + { + "epoch": 10.278140293637847, + "grad_norm": 0.03600388392806053, + "learning_rate": 0.0005628417468732653, + "loss": 0.1116, + "num_input_tokens_seen": 136075584, + "step": 63005 + }, + { + "epoch": 10.278955954323001, + "grad_norm": 0.18759030103683472, + "learning_rate": 0.0005627711308422539, + "loss": 0.0283, + "num_input_tokens_seen": 136085312, + "step": 63010 + }, + { + "epoch": 10.279771615008157, + "grad_norm": 0.09952437877655029, + "learning_rate": 0.000562700513539106, + "loss": 0.1074, + "num_input_tokens_seen": 136096576, + "step": 63015 + }, + { + "epoch": 10.280587275693312, + "grad_norm": 0.07147414237260818, + "learning_rate": 0.0005626298949652524, + "loss": 0.0206, + "num_input_tokens_seen": 136107168, + "step": 63020 + }, + { + "epoch": 10.281402936378466, + "grad_norm": 0.19256940484046936, + "learning_rate": 0.0005625592751221248, + "loss": 0.1364, + "num_input_tokens_seen": 136118592, + "step": 63025 + }, + { + "epoch": 10.282218597063622, + "grad_norm": 0.0532815121114254, + "learning_rate": 0.000562488654011154, + "loss": 0.0341, + "num_input_tokens_seen": 136128960, + "step": 63030 + }, + { + "epoch": 10.283034257748776, + "grad_norm": 0.3896867632865906, + "learning_rate": 0.0005624180316337715, + "loss": 0.0315, + "num_input_tokens_seen": 136139584, + "step": 63035 + }, + { + "epoch": 10.283849918433932, + "grad_norm": 0.07998025417327881, + "learning_rate": 0.0005623474079914082, + "loss": 0.0164, + "num_input_tokens_seen": 136150560, + "step": 63040 + }, + { + "epoch": 10.284665579119087, + "grad_norm": 0.007289467379450798, + "learning_rate": 0.0005622767830854957, + "loss": 0.0069, + "num_input_tokens_seen": 136161216, + "step": 63045 + }, + { + "epoch": 10.285481239804241, + "grad_norm": 0.018334923312067986, + "learning_rate": 0.0005622061569174651, + "loss": 0.0083, + "num_input_tokens_seen": 136172192, + "step": 63050 + }, + { + "epoch": 10.286296900489397, + "grad_norm": 0.36530670523643494, + "learning_rate": 0.0005621355294887479, + "loss": 0.0248, + "num_input_tokens_seen": 136182240, + "step": 63055 + }, + { + "epoch": 10.28711256117455, + "grad_norm": 0.0075960480608046055, + "learning_rate": 0.0005620649008007755, + "loss": 0.0148, + "num_input_tokens_seen": 136193312, + "step": 63060 + }, + { + "epoch": 10.287928221859707, + "grad_norm": 0.0044697243720293045, + "learning_rate": 0.0005619942708549789, + "loss": 0.0277, + "num_input_tokens_seen": 136205280, + "step": 63065 + }, + { + "epoch": 10.28874388254486, + "grad_norm": 0.007282007485628128, + "learning_rate": 0.0005619236396527899, + "loss": 0.0035, + "num_input_tokens_seen": 136216768, + "step": 63070 + }, + { + "epoch": 10.289559543230016, + "grad_norm": 0.0313313864171505, + "learning_rate": 0.0005618530071956397, + "loss": 0.0065, + "num_input_tokens_seen": 136228608, + "step": 63075 + }, + { + "epoch": 10.290375203915172, + "grad_norm": 0.6887944340705872, + "learning_rate": 0.00056178237348496, + "loss": 0.0458, + "num_input_tokens_seen": 136239328, + "step": 63080 + }, + { + "epoch": 10.291190864600326, + "grad_norm": 0.056814152747392654, + "learning_rate": 0.0005617117385221819, + "loss": 0.0233, + "num_input_tokens_seen": 136249632, + "step": 63085 + }, + { + "epoch": 10.292006525285482, + "grad_norm": 0.00974601786583662, + "learning_rate": 0.0005616411023087373, + "loss": 0.0129, + "num_input_tokens_seen": 136260352, + "step": 63090 + }, + { + "epoch": 10.292822185970635, + "grad_norm": 0.260468453168869, + "learning_rate": 0.0005615704648460575, + "loss": 0.0706, + "num_input_tokens_seen": 136270752, + "step": 63095 + }, + { + "epoch": 10.293637846655791, + "grad_norm": 0.5153801441192627, + "learning_rate": 0.0005614998261355741, + "loss": 0.1921, + "num_input_tokens_seen": 136281664, + "step": 63100 + }, + { + "epoch": 10.294453507340947, + "grad_norm": 0.0031059994362294674, + "learning_rate": 0.0005614291861787188, + "loss": 0.0344, + "num_input_tokens_seen": 136292544, + "step": 63105 + }, + { + "epoch": 10.2952691680261, + "grad_norm": 0.2620164155960083, + "learning_rate": 0.0005613585449769232, + "loss": 0.1277, + "num_input_tokens_seen": 136303744, + "step": 63110 + }, + { + "epoch": 10.296084828711257, + "grad_norm": 0.07174117118120193, + "learning_rate": 0.0005612879025316186, + "loss": 0.0239, + "num_input_tokens_seen": 136314240, + "step": 63115 + }, + { + "epoch": 10.29690048939641, + "grad_norm": 0.018266484141349792, + "learning_rate": 0.000561217258844237, + "loss": 0.0191, + "num_input_tokens_seen": 136325824, + "step": 63120 + }, + { + "epoch": 10.297716150081566, + "grad_norm": 0.010049085132777691, + "learning_rate": 0.0005611466139162101, + "loss": 0.0193, + "num_input_tokens_seen": 136336864, + "step": 63125 + }, + { + "epoch": 10.298531810766722, + "grad_norm": 0.005402869079262018, + "learning_rate": 0.0005610759677489694, + "loss": 0.0324, + "num_input_tokens_seen": 136347456, + "step": 63130 + }, + { + "epoch": 10.299347471451876, + "grad_norm": 0.008500440046191216, + "learning_rate": 0.0005610053203439467, + "loss": 0.2237, + "num_input_tokens_seen": 136357408, + "step": 63135 + }, + { + "epoch": 10.300163132137031, + "grad_norm": 0.0620102696120739, + "learning_rate": 0.0005609346717025737, + "loss": 0.0534, + "num_input_tokens_seen": 136368640, + "step": 63140 + }, + { + "epoch": 10.300978792822185, + "grad_norm": 0.006548778153955936, + "learning_rate": 0.0005608640218262825, + "loss": 0.0972, + "num_input_tokens_seen": 136378144, + "step": 63145 + }, + { + "epoch": 10.301794453507341, + "grad_norm": 0.005673635751008987, + "learning_rate": 0.0005607933707165046, + "loss": 0.0815, + "num_input_tokens_seen": 136388672, + "step": 63150 + }, + { + "epoch": 10.302610114192497, + "grad_norm": 0.29767847061157227, + "learning_rate": 0.000560722718374672, + "loss": 0.1562, + "num_input_tokens_seen": 136398976, + "step": 63155 + }, + { + "epoch": 10.30342577487765, + "grad_norm": 0.00508272647857666, + "learning_rate": 0.0005606520648022164, + "loss": 0.0116, + "num_input_tokens_seen": 136410624, + "step": 63160 + }, + { + "epoch": 10.304241435562806, + "grad_norm": 0.01566510647535324, + "learning_rate": 0.0005605814100005696, + "loss": 0.0087, + "num_input_tokens_seen": 136420864, + "step": 63165 + }, + { + "epoch": 10.30505709624796, + "grad_norm": 0.020303290337324142, + "learning_rate": 0.0005605107539711639, + "loss": 0.0049, + "num_input_tokens_seen": 136430752, + "step": 63170 + }, + { + "epoch": 10.305872756933116, + "grad_norm": 0.1528480499982834, + "learning_rate": 0.000560440096715431, + "loss": 0.0131, + "num_input_tokens_seen": 136440736, + "step": 63175 + }, + { + "epoch": 10.30668841761827, + "grad_norm": 0.17993883788585663, + "learning_rate": 0.0005603694382348027, + "loss": 0.0398, + "num_input_tokens_seen": 136450240, + "step": 63180 + }, + { + "epoch": 10.307504078303426, + "grad_norm": 0.024285180494189262, + "learning_rate": 0.0005602987785307112, + "loss": 0.0801, + "num_input_tokens_seen": 136461376, + "step": 63185 + }, + { + "epoch": 10.308319738988581, + "grad_norm": 0.00804990902543068, + "learning_rate": 0.0005602281176045885, + "loss": 0.0407, + "num_input_tokens_seen": 136473504, + "step": 63190 + }, + { + "epoch": 10.309135399673735, + "grad_norm": 0.008934085257351398, + "learning_rate": 0.0005601574554578666, + "loss": 0.069, + "num_input_tokens_seen": 136483520, + "step": 63195 + }, + { + "epoch": 10.309951060358891, + "grad_norm": 0.004447769373655319, + "learning_rate": 0.0005600867920919775, + "loss": 0.0074, + "num_input_tokens_seen": 136493824, + "step": 63200 + }, + { + "epoch": 10.310766721044045, + "grad_norm": 0.007732720114290714, + "learning_rate": 0.0005600161275083535, + "loss": 0.0509, + "num_input_tokens_seen": 136503936, + "step": 63205 + }, + { + "epoch": 10.3115823817292, + "grad_norm": 0.0947844460606575, + "learning_rate": 0.0005599454617084264, + "loss": 0.0136, + "num_input_tokens_seen": 136515488, + "step": 63210 + }, + { + "epoch": 10.312398042414356, + "grad_norm": 0.0417468324303627, + "learning_rate": 0.0005598747946936285, + "loss": 0.0505, + "num_input_tokens_seen": 136526464, + "step": 63215 + }, + { + "epoch": 10.31321370309951, + "grad_norm": 0.0020598669070750475, + "learning_rate": 0.0005598041264653919, + "loss": 0.0385, + "num_input_tokens_seen": 136537248, + "step": 63220 + }, + { + "epoch": 10.314029363784666, + "grad_norm": 0.01746521145105362, + "learning_rate": 0.0005597334570251489, + "loss": 0.0156, + "num_input_tokens_seen": 136548320, + "step": 63225 + }, + { + "epoch": 10.31484502446982, + "grad_norm": 0.0852559357881546, + "learning_rate": 0.0005596627863743316, + "loss": 0.0217, + "num_input_tokens_seen": 136559360, + "step": 63230 + }, + { + "epoch": 10.315660685154976, + "grad_norm": 0.01237891148775816, + "learning_rate": 0.0005595921145143722, + "loss": 0.0091, + "num_input_tokens_seen": 136569760, + "step": 63235 + }, + { + "epoch": 10.31647634584013, + "grad_norm": 0.3985021710395813, + "learning_rate": 0.0005595214414467029, + "loss": 0.1213, + "num_input_tokens_seen": 136579904, + "step": 63240 + }, + { + "epoch": 10.317292006525285, + "grad_norm": 0.04400103539228439, + "learning_rate": 0.0005594507671727563, + "loss": 0.1041, + "num_input_tokens_seen": 136592064, + "step": 63245 + }, + { + "epoch": 10.318107667210441, + "grad_norm": 0.04320823401212692, + "learning_rate": 0.0005593800916939642, + "loss": 0.0836, + "num_input_tokens_seen": 136603424, + "step": 63250 + }, + { + "epoch": 10.318923327895595, + "grad_norm": 0.27597784996032715, + "learning_rate": 0.0005593094150117595, + "loss": 0.0342, + "num_input_tokens_seen": 136615232, + "step": 63255 + }, + { + "epoch": 10.31973898858075, + "grad_norm": 0.04080792888998985, + "learning_rate": 0.0005592387371275741, + "loss": 0.0607, + "num_input_tokens_seen": 136626240, + "step": 63260 + }, + { + "epoch": 10.320554649265905, + "grad_norm": 0.05028015002608299, + "learning_rate": 0.0005591680580428406, + "loss": 0.042, + "num_input_tokens_seen": 136637408, + "step": 63265 + }, + { + "epoch": 10.32137030995106, + "grad_norm": 0.04935172200202942, + "learning_rate": 0.0005590973777589912, + "loss": 0.0162, + "num_input_tokens_seen": 136647648, + "step": 63270 + }, + { + "epoch": 10.322185970636216, + "grad_norm": 0.004445977509021759, + "learning_rate": 0.0005590266962774588, + "loss": 0.0064, + "num_input_tokens_seen": 136658944, + "step": 63275 + }, + { + "epoch": 10.32300163132137, + "grad_norm": 0.11220485717058182, + "learning_rate": 0.0005589560135996752, + "loss": 0.1099, + "num_input_tokens_seen": 136669792, + "step": 63280 + }, + { + "epoch": 10.323817292006526, + "grad_norm": 0.28762274980545044, + "learning_rate": 0.0005588853297270734, + "loss": 0.054, + "num_input_tokens_seen": 136680608, + "step": 63285 + }, + { + "epoch": 10.32463295269168, + "grad_norm": 0.006721612997353077, + "learning_rate": 0.0005588146446610855, + "loss": 0.039, + "num_input_tokens_seen": 136691296, + "step": 63290 + }, + { + "epoch": 10.325448613376835, + "grad_norm": 0.007691043894737959, + "learning_rate": 0.0005587439584031444, + "loss": 0.0152, + "num_input_tokens_seen": 136701376, + "step": 63295 + }, + { + "epoch": 10.326264274061991, + "grad_norm": 0.00728636747226119, + "learning_rate": 0.0005586732709546824, + "loss": 0.006, + "num_input_tokens_seen": 136712832, + "step": 63300 + }, + { + "epoch": 10.327079934747145, + "grad_norm": 0.0071546598337590694, + "learning_rate": 0.0005586025823171321, + "loss": 0.2496, + "num_input_tokens_seen": 136722368, + "step": 63305 + }, + { + "epoch": 10.3278955954323, + "grad_norm": 0.009742275811731815, + "learning_rate": 0.0005585318924919262, + "loss": 0.0236, + "num_input_tokens_seen": 136733472, + "step": 63310 + }, + { + "epoch": 10.328711256117455, + "grad_norm": 0.008231345564126968, + "learning_rate": 0.0005584612014804972, + "loss": 0.0179, + "num_input_tokens_seen": 136744576, + "step": 63315 + }, + { + "epoch": 10.32952691680261, + "grad_norm": 0.016715819016098976, + "learning_rate": 0.0005583905092842777, + "loss": 0.0645, + "num_input_tokens_seen": 136756480, + "step": 63320 + }, + { + "epoch": 10.330342577487766, + "grad_norm": 0.15237416326999664, + "learning_rate": 0.0005583198159047005, + "loss": 0.0334, + "num_input_tokens_seen": 136766720, + "step": 63325 + }, + { + "epoch": 10.33115823817292, + "grad_norm": 0.015316602773964405, + "learning_rate": 0.0005582491213431983, + "loss": 0.0147, + "num_input_tokens_seen": 136777280, + "step": 63330 + }, + { + "epoch": 10.331973898858076, + "grad_norm": 0.009626131504774094, + "learning_rate": 0.0005581784256012037, + "loss": 0.0615, + "num_input_tokens_seen": 136788768, + "step": 63335 + }, + { + "epoch": 10.33278955954323, + "grad_norm": 0.025972846895456314, + "learning_rate": 0.0005581077286801495, + "loss": 0.1756, + "num_input_tokens_seen": 136799584, + "step": 63340 + }, + { + "epoch": 10.333605220228385, + "grad_norm": 0.02969386987388134, + "learning_rate": 0.0005580370305814686, + "loss": 0.1976, + "num_input_tokens_seen": 136811488, + "step": 63345 + }, + { + "epoch": 10.33442088091354, + "grad_norm": 0.18062162399291992, + "learning_rate": 0.0005579663313065935, + "loss": 0.0207, + "num_input_tokens_seen": 136822784, + "step": 63350 + }, + { + "epoch": 10.335236541598695, + "grad_norm": 0.15014511346817017, + "learning_rate": 0.0005578956308569572, + "loss": 0.0212, + "num_input_tokens_seen": 136833888, + "step": 63355 + }, + { + "epoch": 10.33605220228385, + "grad_norm": 0.011584420688450336, + "learning_rate": 0.0005578249292339924, + "loss": 0.0704, + "num_input_tokens_seen": 136844416, + "step": 63360 + }, + { + "epoch": 10.336867862969005, + "grad_norm": 0.004050334449857473, + "learning_rate": 0.0005577542264391322, + "loss": 0.0629, + "num_input_tokens_seen": 136854336, + "step": 63365 + }, + { + "epoch": 10.33768352365416, + "grad_norm": 0.035820942372083664, + "learning_rate": 0.0005576835224738092, + "loss": 0.0538, + "num_input_tokens_seen": 136865440, + "step": 63370 + }, + { + "epoch": 10.338499184339314, + "grad_norm": 0.0067452918738126755, + "learning_rate": 0.0005576128173394567, + "loss": 0.0962, + "num_input_tokens_seen": 136877248, + "step": 63375 + }, + { + "epoch": 10.33931484502447, + "grad_norm": 0.19582054018974304, + "learning_rate": 0.0005575421110375072, + "loss": 0.0485, + "num_input_tokens_seen": 136887360, + "step": 63380 + }, + { + "epoch": 10.340130505709626, + "grad_norm": 0.005524530075490475, + "learning_rate": 0.0005574714035693938, + "loss": 0.1338, + "num_input_tokens_seen": 136899008, + "step": 63385 + }, + { + "epoch": 10.34094616639478, + "grad_norm": 0.1523488163948059, + "learning_rate": 0.0005574006949365496, + "loss": 0.1644, + "num_input_tokens_seen": 136908832, + "step": 63390 + }, + { + "epoch": 10.341761827079935, + "grad_norm": 0.33566051721572876, + "learning_rate": 0.0005573299851404074, + "loss": 0.0468, + "num_input_tokens_seen": 136919168, + "step": 63395 + }, + { + "epoch": 10.34257748776509, + "grad_norm": 0.0225025936961174, + "learning_rate": 0.0005572592741824003, + "loss": 0.0136, + "num_input_tokens_seen": 136929312, + "step": 63400 + }, + { + "epoch": 10.343393148450245, + "grad_norm": 0.009087719023227692, + "learning_rate": 0.0005571885620639614, + "loss": 0.089, + "num_input_tokens_seen": 136940576, + "step": 63405 + }, + { + "epoch": 10.3442088091354, + "grad_norm": 0.02980167046189308, + "learning_rate": 0.0005571178487865238, + "loss": 0.1111, + "num_input_tokens_seen": 136951296, + "step": 63410 + }, + { + "epoch": 10.345024469820554, + "grad_norm": 0.012627690099179745, + "learning_rate": 0.0005570471343515205, + "loss": 0.0463, + "num_input_tokens_seen": 136962208, + "step": 63415 + }, + { + "epoch": 10.34584013050571, + "grad_norm": 0.5630673170089722, + "learning_rate": 0.0005569764187603846, + "loss": 0.1932, + "num_input_tokens_seen": 136973184, + "step": 63420 + }, + { + "epoch": 10.346655791190864, + "grad_norm": 0.16642948985099792, + "learning_rate": 0.0005569057020145494, + "loss": 0.0289, + "num_input_tokens_seen": 136983936, + "step": 63425 + }, + { + "epoch": 10.34747145187602, + "grad_norm": 0.04962924122810364, + "learning_rate": 0.0005568349841154479, + "loss": 0.01, + "num_input_tokens_seen": 136994848, + "step": 63430 + }, + { + "epoch": 10.348287112561174, + "grad_norm": 0.28208860754966736, + "learning_rate": 0.0005567642650645134, + "loss": 0.0792, + "num_input_tokens_seen": 137005760, + "step": 63435 + }, + { + "epoch": 10.34910277324633, + "grad_norm": 0.754994809627533, + "learning_rate": 0.000556693544863179, + "loss": 0.1711, + "num_input_tokens_seen": 137016192, + "step": 63440 + }, + { + "epoch": 10.349918433931485, + "grad_norm": 0.013265586458146572, + "learning_rate": 0.000556622823512878, + "loss": 0.0094, + "num_input_tokens_seen": 137027296, + "step": 63445 + }, + { + "epoch": 10.350734094616639, + "grad_norm": 0.028474433347582817, + "learning_rate": 0.0005565521010150436, + "loss": 0.021, + "num_input_tokens_seen": 137038592, + "step": 63450 + }, + { + "epoch": 10.351549755301795, + "grad_norm": 0.0459626168012619, + "learning_rate": 0.0005564813773711092, + "loss": 0.1247, + "num_input_tokens_seen": 137048768, + "step": 63455 + }, + { + "epoch": 10.352365415986949, + "grad_norm": 0.007897719740867615, + "learning_rate": 0.0005564106525825079, + "loss": 0.0831, + "num_input_tokens_seen": 137059520, + "step": 63460 + }, + { + "epoch": 10.353181076672104, + "grad_norm": 0.01814207434654236, + "learning_rate": 0.0005563399266506734, + "loss": 0.0318, + "num_input_tokens_seen": 137070912, + "step": 63465 + }, + { + "epoch": 10.35399673735726, + "grad_norm": 0.06560403853654861, + "learning_rate": 0.0005562691995770386, + "loss": 0.0657, + "num_input_tokens_seen": 137081472, + "step": 63470 + }, + { + "epoch": 10.354812398042414, + "grad_norm": 0.016114749014377594, + "learning_rate": 0.0005561984713630373, + "loss": 0.0155, + "num_input_tokens_seen": 137091520, + "step": 63475 + }, + { + "epoch": 10.35562805872757, + "grad_norm": 0.05790586769580841, + "learning_rate": 0.0005561277420101026, + "loss": 0.0392, + "num_input_tokens_seen": 137103680, + "step": 63480 + }, + { + "epoch": 10.356443719412724, + "grad_norm": 0.17016306519508362, + "learning_rate": 0.0005560570115196679, + "loss": 0.053, + "num_input_tokens_seen": 137115488, + "step": 63485 + }, + { + "epoch": 10.35725938009788, + "grad_norm": 0.009363112039864063, + "learning_rate": 0.0005559862798931668, + "loss": 0.0265, + "num_input_tokens_seen": 137126944, + "step": 63490 + }, + { + "epoch": 10.358075040783035, + "grad_norm": 0.2349333018064499, + "learning_rate": 0.0005559155471320326, + "loss": 0.041, + "num_input_tokens_seen": 137138112, + "step": 63495 + }, + { + "epoch": 10.358890701468189, + "grad_norm": 0.17752444744110107, + "learning_rate": 0.0005558448132376991, + "loss": 0.0234, + "num_input_tokens_seen": 137149216, + "step": 63500 + }, + { + "epoch": 10.359706362153345, + "grad_norm": 0.03303788974881172, + "learning_rate": 0.0005557740782115995, + "loss": 0.0323, + "num_input_tokens_seen": 137160640, + "step": 63505 + }, + { + "epoch": 10.360522022838499, + "grad_norm": 0.3540240228176117, + "learning_rate": 0.0005557033420551676, + "loss": 0.1315, + "num_input_tokens_seen": 137171808, + "step": 63510 + }, + { + "epoch": 10.361337683523654, + "grad_norm": 0.004490839783102274, + "learning_rate": 0.0005556326047698367, + "loss": 0.0513, + "num_input_tokens_seen": 137182912, + "step": 63515 + }, + { + "epoch": 10.362153344208808, + "grad_norm": 0.11256757378578186, + "learning_rate": 0.0005555618663570405, + "loss": 0.0545, + "num_input_tokens_seen": 137193792, + "step": 63520 + }, + { + "epoch": 10.362969004893964, + "grad_norm": 0.011594374664127827, + "learning_rate": 0.0005554911268182126, + "loss": 0.0333, + "num_input_tokens_seen": 137204160, + "step": 63525 + }, + { + "epoch": 10.36378466557912, + "grad_norm": 0.004073978401720524, + "learning_rate": 0.0005554203861547866, + "loss": 0.0157, + "num_input_tokens_seen": 137214720, + "step": 63530 + }, + { + "epoch": 10.364600326264274, + "grad_norm": 0.16724653542041779, + "learning_rate": 0.0005553496443681961, + "loss": 0.0675, + "num_input_tokens_seen": 137224096, + "step": 63535 + }, + { + "epoch": 10.36541598694943, + "grad_norm": 0.04027742147445679, + "learning_rate": 0.000555278901459875, + "loss": 0.1207, + "num_input_tokens_seen": 137235296, + "step": 63540 + }, + { + "epoch": 10.366231647634583, + "grad_norm": 0.006478854920715094, + "learning_rate": 0.0005552081574312568, + "loss": 0.0184, + "num_input_tokens_seen": 137247200, + "step": 63545 + }, + { + "epoch": 10.367047308319739, + "grad_norm": 0.01771964132785797, + "learning_rate": 0.0005551374122837752, + "loss": 0.0222, + "num_input_tokens_seen": 137257952, + "step": 63550 + }, + { + "epoch": 10.367862969004895, + "grad_norm": 0.01356664951890707, + "learning_rate": 0.000555066666018864, + "loss": 0.0282, + "num_input_tokens_seen": 137269088, + "step": 63555 + }, + { + "epoch": 10.368678629690049, + "grad_norm": 0.3984331786632538, + "learning_rate": 0.0005549959186379569, + "loss": 0.1308, + "num_input_tokens_seen": 137279520, + "step": 63560 + }, + { + "epoch": 10.369494290375204, + "grad_norm": 0.007946250960230827, + "learning_rate": 0.0005549251701424878, + "loss": 0.0346, + "num_input_tokens_seen": 137291008, + "step": 63565 + }, + { + "epoch": 10.370309951060358, + "grad_norm": 0.022877560928463936, + "learning_rate": 0.0005548544205338905, + "loss": 0.0125, + "num_input_tokens_seen": 137300832, + "step": 63570 + }, + { + "epoch": 10.371125611745514, + "grad_norm": 0.021704500541090965, + "learning_rate": 0.0005547836698135987, + "loss": 0.0746, + "num_input_tokens_seen": 137310208, + "step": 63575 + }, + { + "epoch": 10.37194127243067, + "grad_norm": 0.17134937644004822, + "learning_rate": 0.0005547129179830463, + "loss": 0.0237, + "num_input_tokens_seen": 137321504, + "step": 63580 + }, + { + "epoch": 10.372756933115824, + "grad_norm": 0.0015580940525978804, + "learning_rate": 0.0005546421650436674, + "loss": 0.0157, + "num_input_tokens_seen": 137331648, + "step": 63585 + }, + { + "epoch": 10.37357259380098, + "grad_norm": 0.006557346321642399, + "learning_rate": 0.0005545714109968956, + "loss": 0.0066, + "num_input_tokens_seen": 137342464, + "step": 63590 + }, + { + "epoch": 10.374388254486133, + "grad_norm": 0.01257567573338747, + "learning_rate": 0.0005545006558441649, + "loss": 0.0655, + "num_input_tokens_seen": 137353504, + "step": 63595 + }, + { + "epoch": 10.375203915171289, + "grad_norm": 0.04626630246639252, + "learning_rate": 0.0005544298995869093, + "loss": 0.0389, + "num_input_tokens_seen": 137365504, + "step": 63600 + }, + { + "epoch": 10.376019575856443, + "grad_norm": 0.00850379467010498, + "learning_rate": 0.0005543591422265627, + "loss": 0.0076, + "num_input_tokens_seen": 137376160, + "step": 63605 + }, + { + "epoch": 10.376835236541599, + "grad_norm": 0.033676404505968094, + "learning_rate": 0.0005542883837645592, + "loss": 0.0659, + "num_input_tokens_seen": 137387200, + "step": 63610 + }, + { + "epoch": 10.377650897226754, + "grad_norm": 0.08765646070241928, + "learning_rate": 0.0005542176242023326, + "loss": 0.0249, + "num_input_tokens_seen": 137399552, + "step": 63615 + }, + { + "epoch": 10.378466557911908, + "grad_norm": 0.10746024549007416, + "learning_rate": 0.0005541468635413172, + "loss": 0.0237, + "num_input_tokens_seen": 137411904, + "step": 63620 + }, + { + "epoch": 10.379282218597064, + "grad_norm": 0.007602016907185316, + "learning_rate": 0.0005540761017829468, + "loss": 0.0067, + "num_input_tokens_seen": 137421856, + "step": 63625 + }, + { + "epoch": 10.380097879282218, + "grad_norm": 0.00871317833662033, + "learning_rate": 0.0005540053389286556, + "loss": 0.0111, + "num_input_tokens_seen": 137432896, + "step": 63630 + }, + { + "epoch": 10.380913539967374, + "grad_norm": 0.011335165239870548, + "learning_rate": 0.0005539345749798778, + "loss": 0.0507, + "num_input_tokens_seen": 137444544, + "step": 63635 + }, + { + "epoch": 10.38172920065253, + "grad_norm": 0.03143840283155441, + "learning_rate": 0.0005538638099380473, + "loss": 0.0591, + "num_input_tokens_seen": 137454272, + "step": 63640 + }, + { + "epoch": 10.382544861337683, + "grad_norm": 0.2992579936981201, + "learning_rate": 0.0005537930438045984, + "loss": 0.0203, + "num_input_tokens_seen": 137464416, + "step": 63645 + }, + { + "epoch": 10.383360522022839, + "grad_norm": 0.0029851419385522604, + "learning_rate": 0.0005537222765809653, + "loss": 0.0984, + "num_input_tokens_seen": 137473472, + "step": 63650 + }, + { + "epoch": 10.384176182707993, + "grad_norm": 0.3246956169605255, + "learning_rate": 0.000553651508268582, + "loss": 0.0578, + "num_input_tokens_seen": 137484800, + "step": 63655 + }, + { + "epoch": 10.384991843393149, + "grad_norm": 0.006309543736279011, + "learning_rate": 0.000553580738868883, + "loss": 0.0208, + "num_input_tokens_seen": 137494592, + "step": 63660 + }, + { + "epoch": 10.385807504078304, + "grad_norm": 0.42174792289733887, + "learning_rate": 0.0005535099683833021, + "loss": 0.2332, + "num_input_tokens_seen": 137504352, + "step": 63665 + }, + { + "epoch": 10.386623164763458, + "grad_norm": 0.04054646193981171, + "learning_rate": 0.0005534391968132741, + "loss": 0.0298, + "num_input_tokens_seen": 137515680, + "step": 63670 + }, + { + "epoch": 10.387438825448614, + "grad_norm": 0.08717795461416245, + "learning_rate": 0.0005533684241602327, + "loss": 0.1018, + "num_input_tokens_seen": 137526592, + "step": 63675 + }, + { + "epoch": 10.388254486133768, + "grad_norm": 0.04790165647864342, + "learning_rate": 0.0005532976504256127, + "loss": 0.1052, + "num_input_tokens_seen": 137536864, + "step": 63680 + }, + { + "epoch": 10.389070146818923, + "grad_norm": 0.546216607093811, + "learning_rate": 0.000553226875610848, + "loss": 0.0735, + "num_input_tokens_seen": 137547232, + "step": 63685 + }, + { + "epoch": 10.38988580750408, + "grad_norm": 0.00997911486774683, + "learning_rate": 0.0005531560997173733, + "loss": 0.0209, + "num_input_tokens_seen": 137558016, + "step": 63690 + }, + { + "epoch": 10.390701468189233, + "grad_norm": 0.0133741470053792, + "learning_rate": 0.0005530853227466229, + "loss": 0.0082, + "num_input_tokens_seen": 137568896, + "step": 63695 + }, + { + "epoch": 10.391517128874389, + "grad_norm": 0.0015816029626876116, + "learning_rate": 0.0005530145447000308, + "loss": 0.0075, + "num_input_tokens_seen": 137578528, + "step": 63700 + }, + { + "epoch": 10.392332789559543, + "grad_norm": 0.09255795925855637, + "learning_rate": 0.0005529437655790319, + "loss": 0.1526, + "num_input_tokens_seen": 137589568, + "step": 63705 + }, + { + "epoch": 10.393148450244698, + "grad_norm": 0.09927723556756973, + "learning_rate": 0.0005528729853850604, + "loss": 0.0198, + "num_input_tokens_seen": 137600096, + "step": 63710 + }, + { + "epoch": 10.393964110929852, + "grad_norm": 0.1837208867073059, + "learning_rate": 0.0005528022041195507, + "loss": 0.0387, + "num_input_tokens_seen": 137611616, + "step": 63715 + }, + { + "epoch": 10.394779771615008, + "grad_norm": 0.0033106612972915173, + "learning_rate": 0.0005527314217839375, + "loss": 0.0115, + "num_input_tokens_seen": 137622304, + "step": 63720 + }, + { + "epoch": 10.395595432300164, + "grad_norm": 0.010321944952011108, + "learning_rate": 0.0005526606383796551, + "loss": 0.0358, + "num_input_tokens_seen": 137633344, + "step": 63725 + }, + { + "epoch": 10.396411092985318, + "grad_norm": 0.10109658539295197, + "learning_rate": 0.000552589853908138, + "loss": 0.1396, + "num_input_tokens_seen": 137643264, + "step": 63730 + }, + { + "epoch": 10.397226753670473, + "grad_norm": 0.0041798497550189495, + "learning_rate": 0.0005525190683708207, + "loss": 0.004, + "num_input_tokens_seen": 137654464, + "step": 63735 + }, + { + "epoch": 10.398042414355627, + "grad_norm": 0.48795971274375916, + "learning_rate": 0.0005524482817691381, + "loss": 0.0697, + "num_input_tokens_seen": 137664224, + "step": 63740 + }, + { + "epoch": 10.398858075040783, + "grad_norm": 0.014303839765489101, + "learning_rate": 0.0005523774941045244, + "loss": 0.0325, + "num_input_tokens_seen": 137675872, + "step": 63745 + }, + { + "epoch": 10.399673735725939, + "grad_norm": 0.006388854701071978, + "learning_rate": 0.0005523067053784143, + "loss": 0.013, + "num_input_tokens_seen": 137687328, + "step": 63750 + }, + { + "epoch": 10.400489396411093, + "grad_norm": 0.02298627234995365, + "learning_rate": 0.0005522359155922425, + "loss": 0.0322, + "num_input_tokens_seen": 137699072, + "step": 63755 + }, + { + "epoch": 10.401305057096248, + "grad_norm": 0.003545205108821392, + "learning_rate": 0.0005521651247474436, + "loss": 0.0531, + "num_input_tokens_seen": 137710720, + "step": 63760 + }, + { + "epoch": 10.402120717781402, + "grad_norm": 0.10142495483160019, + "learning_rate": 0.0005520943328454523, + "loss": 0.2223, + "num_input_tokens_seen": 137720640, + "step": 63765 + }, + { + "epoch": 10.402936378466558, + "grad_norm": 0.007538018748164177, + "learning_rate": 0.0005520235398877032, + "loss": 0.0385, + "num_input_tokens_seen": 137732000, + "step": 63770 + }, + { + "epoch": 10.403752039151712, + "grad_norm": 0.018357079476118088, + "learning_rate": 0.0005519527458756312, + "loss": 0.1069, + "num_input_tokens_seen": 137741760, + "step": 63775 + }, + { + "epoch": 10.404567699836868, + "grad_norm": 0.0528842993080616, + "learning_rate": 0.0005518819508106706, + "loss": 0.0097, + "num_input_tokens_seen": 137753408, + "step": 63780 + }, + { + "epoch": 10.405383360522023, + "grad_norm": 0.00990887638181448, + "learning_rate": 0.0005518111546942567, + "loss": 0.1179, + "num_input_tokens_seen": 137764608, + "step": 63785 + }, + { + "epoch": 10.406199021207177, + "grad_norm": 0.023492760956287384, + "learning_rate": 0.000551740357527824, + "loss": 0.0076, + "num_input_tokens_seen": 137775712, + "step": 63790 + }, + { + "epoch": 10.407014681892333, + "grad_norm": 0.11711253225803375, + "learning_rate": 0.0005516695593128073, + "loss": 0.0947, + "num_input_tokens_seen": 137786304, + "step": 63795 + }, + { + "epoch": 10.407830342577487, + "grad_norm": 0.00303363474085927, + "learning_rate": 0.0005515987600506414, + "loss": 0.0079, + "num_input_tokens_seen": 137797376, + "step": 63800 + }, + { + "epoch": 10.408646003262643, + "grad_norm": 0.008306864649057388, + "learning_rate": 0.0005515279597427612, + "loss": 0.0036, + "num_input_tokens_seen": 137808928, + "step": 63805 + }, + { + "epoch": 10.409461663947798, + "grad_norm": 0.029468778520822525, + "learning_rate": 0.0005514571583906014, + "loss": 0.0067, + "num_input_tokens_seen": 137819872, + "step": 63810 + }, + { + "epoch": 10.410277324632952, + "grad_norm": 0.3117620348930359, + "learning_rate": 0.0005513863559955971, + "loss": 0.1481, + "num_input_tokens_seen": 137830528, + "step": 63815 + }, + { + "epoch": 10.411092985318108, + "grad_norm": 0.11586478352546692, + "learning_rate": 0.0005513155525591831, + "loss": 0.016, + "num_input_tokens_seen": 137841920, + "step": 63820 + }, + { + "epoch": 10.411908646003262, + "grad_norm": 0.1153964027762413, + "learning_rate": 0.0005512447480827945, + "loss": 0.1048, + "num_input_tokens_seen": 137853280, + "step": 63825 + }, + { + "epoch": 10.412724306688418, + "grad_norm": 0.00367523985914886, + "learning_rate": 0.0005511739425678658, + "loss": 0.1209, + "num_input_tokens_seen": 137861984, + "step": 63830 + }, + { + "epoch": 10.413539967373573, + "grad_norm": 0.02312796749174595, + "learning_rate": 0.0005511031360158324, + "loss": 0.1354, + "num_input_tokens_seen": 137872192, + "step": 63835 + }, + { + "epoch": 10.414355628058727, + "grad_norm": 0.005195770412683487, + "learning_rate": 0.0005510323284281291, + "loss": 0.1633, + "num_input_tokens_seen": 137884160, + "step": 63840 + }, + { + "epoch": 10.415171288743883, + "grad_norm": 0.01457708328962326, + "learning_rate": 0.0005509615198061909, + "loss": 0.0152, + "num_input_tokens_seen": 137894912, + "step": 63845 + }, + { + "epoch": 10.415986949429037, + "grad_norm": 0.4979305863380432, + "learning_rate": 0.0005508907101514529, + "loss": 0.0577, + "num_input_tokens_seen": 137903648, + "step": 63850 + }, + { + "epoch": 10.416802610114193, + "grad_norm": 0.028883550316095352, + "learning_rate": 0.0005508198994653501, + "loss": 0.0111, + "num_input_tokens_seen": 137915680, + "step": 63855 + }, + { + "epoch": 10.417618270799348, + "grad_norm": 0.015348607674241066, + "learning_rate": 0.0005507490877493176, + "loss": 0.0235, + "num_input_tokens_seen": 137926912, + "step": 63860 + }, + { + "epoch": 10.418433931484502, + "grad_norm": 0.09224829822778702, + "learning_rate": 0.0005506782750047903, + "loss": 0.0134, + "num_input_tokens_seen": 137936192, + "step": 63865 + }, + { + "epoch": 10.419249592169658, + "grad_norm": 0.011172234080731869, + "learning_rate": 0.0005506074612332035, + "loss": 0.0402, + "num_input_tokens_seen": 137945856, + "step": 63870 + }, + { + "epoch": 10.420065252854812, + "grad_norm": 0.010413877665996552, + "learning_rate": 0.0005505366464359924, + "loss": 0.0652, + "num_input_tokens_seen": 137956864, + "step": 63875 + }, + { + "epoch": 10.420880913539968, + "grad_norm": 0.030237272381782532, + "learning_rate": 0.000550465830614592, + "loss": 0.0112, + "num_input_tokens_seen": 137969088, + "step": 63880 + }, + { + "epoch": 10.421696574225122, + "grad_norm": 0.04134310036897659, + "learning_rate": 0.0005503950137704374, + "loss": 0.1144, + "num_input_tokens_seen": 137979072, + "step": 63885 + }, + { + "epoch": 10.422512234910277, + "grad_norm": 0.6153962016105652, + "learning_rate": 0.0005503241959049641, + "loss": 0.2252, + "num_input_tokens_seen": 137990240, + "step": 63890 + }, + { + "epoch": 10.423327895595433, + "grad_norm": 0.30122601985931396, + "learning_rate": 0.000550253377019607, + "loss": 0.0448, + "num_input_tokens_seen": 138000832, + "step": 63895 + }, + { + "epoch": 10.424143556280587, + "grad_norm": 0.00784077774733305, + "learning_rate": 0.0005501825571158016, + "loss": 0.0227, + "num_input_tokens_seen": 138012576, + "step": 63900 + }, + { + "epoch": 10.424959216965743, + "grad_norm": 0.0033007084857672453, + "learning_rate": 0.000550111736194983, + "loss": 0.0132, + "num_input_tokens_seen": 138024000, + "step": 63905 + }, + { + "epoch": 10.425774877650896, + "grad_norm": 0.043511830270290375, + "learning_rate": 0.0005500409142585864, + "loss": 0.0384, + "num_input_tokens_seen": 138034464, + "step": 63910 + }, + { + "epoch": 10.426590538336052, + "grad_norm": 0.02446580119431019, + "learning_rate": 0.0005499700913080472, + "loss": 0.0242, + "num_input_tokens_seen": 138044448, + "step": 63915 + }, + { + "epoch": 10.427406199021208, + "grad_norm": 0.0117116067558527, + "learning_rate": 0.0005498992673448008, + "loss": 0.0354, + "num_input_tokens_seen": 138055424, + "step": 63920 + }, + { + "epoch": 10.428221859706362, + "grad_norm": 0.16764822602272034, + "learning_rate": 0.0005498284423702824, + "loss": 0.0158, + "num_input_tokens_seen": 138066560, + "step": 63925 + }, + { + "epoch": 10.429037520391518, + "grad_norm": 0.04869011417031288, + "learning_rate": 0.0005497576163859273, + "loss": 0.0856, + "num_input_tokens_seen": 138076992, + "step": 63930 + }, + { + "epoch": 10.429853181076671, + "grad_norm": 0.32588332891464233, + "learning_rate": 0.0005496867893931711, + "loss": 0.0419, + "num_input_tokens_seen": 138087296, + "step": 63935 + }, + { + "epoch": 10.430668841761827, + "grad_norm": 0.012518075294792652, + "learning_rate": 0.0005496159613934492, + "loss": 0.1043, + "num_input_tokens_seen": 138097184, + "step": 63940 + }, + { + "epoch": 10.431484502446983, + "grad_norm": 0.11210685223340988, + "learning_rate": 0.0005495451323881967, + "loss": 0.0525, + "num_input_tokens_seen": 138109216, + "step": 63945 + }, + { + "epoch": 10.432300163132137, + "grad_norm": 0.07045716047286987, + "learning_rate": 0.0005494743023788493, + "loss": 0.0087, + "num_input_tokens_seen": 138120032, + "step": 63950 + }, + { + "epoch": 10.433115823817293, + "grad_norm": 0.006280902773141861, + "learning_rate": 0.0005494034713668423, + "loss": 0.0157, + "num_input_tokens_seen": 138129696, + "step": 63955 + }, + { + "epoch": 10.433931484502446, + "grad_norm": 0.022228620946407318, + "learning_rate": 0.0005493326393536113, + "loss": 0.005, + "num_input_tokens_seen": 138141984, + "step": 63960 + }, + { + "epoch": 10.434747145187602, + "grad_norm": 0.41118839383125305, + "learning_rate": 0.000549261806340592, + "loss": 0.0481, + "num_input_tokens_seen": 138152512, + "step": 63965 + }, + { + "epoch": 10.435562805872756, + "grad_norm": 0.0460282601416111, + "learning_rate": 0.0005491909723292196, + "loss": 0.0952, + "num_input_tokens_seen": 138164160, + "step": 63970 + }, + { + "epoch": 10.436378466557912, + "grad_norm": 0.007916356436908245, + "learning_rate": 0.0005491201373209295, + "loss": 0.007, + "num_input_tokens_seen": 138175200, + "step": 63975 + }, + { + "epoch": 10.437194127243067, + "grad_norm": 0.006717904936522245, + "learning_rate": 0.0005490493013171578, + "loss": 0.0115, + "num_input_tokens_seen": 138185568, + "step": 63980 + }, + { + "epoch": 10.438009787928221, + "grad_norm": 0.03154471516609192, + "learning_rate": 0.0005489784643193397, + "loss": 0.0226, + "num_input_tokens_seen": 138195904, + "step": 63985 + }, + { + "epoch": 10.438825448613377, + "grad_norm": 0.021439258009195328, + "learning_rate": 0.0005489076263289109, + "loss": 0.0061, + "num_input_tokens_seen": 138206112, + "step": 63990 + }, + { + "epoch": 10.439641109298531, + "grad_norm": 0.013829846866428852, + "learning_rate": 0.000548836787347307, + "loss": 0.0246, + "num_input_tokens_seen": 138216320, + "step": 63995 + }, + { + "epoch": 10.440456769983687, + "grad_norm": 0.03632638603448868, + "learning_rate": 0.0005487659473759635, + "loss": 0.1465, + "num_input_tokens_seen": 138226656, + "step": 64000 + }, + { + "epoch": 10.441272430668842, + "grad_norm": 0.007183171808719635, + "learning_rate": 0.0005486951064163164, + "loss": 0.1501, + "num_input_tokens_seen": 138237664, + "step": 64005 + }, + { + "epoch": 10.442088091353996, + "grad_norm": 0.00477360375225544, + "learning_rate": 0.0005486242644698011, + "loss": 0.0094, + "num_input_tokens_seen": 138249568, + "step": 64010 + }, + { + "epoch": 10.442903752039152, + "grad_norm": 0.013287726789712906, + "learning_rate": 0.0005485534215378535, + "loss": 0.0705, + "num_input_tokens_seen": 138260128, + "step": 64015 + }, + { + "epoch": 10.443719412724306, + "grad_norm": 0.013567056506872177, + "learning_rate": 0.0005484825776219092, + "loss": 0.059, + "num_input_tokens_seen": 138270464, + "step": 64020 + }, + { + "epoch": 10.444535073409462, + "grad_norm": 0.02812664769589901, + "learning_rate": 0.0005484117327234038, + "loss": 0.0055, + "num_input_tokens_seen": 138280672, + "step": 64025 + }, + { + "epoch": 10.445350734094617, + "grad_norm": 0.0012703530956059694, + "learning_rate": 0.0005483408868437734, + "loss": 0.0095, + "num_input_tokens_seen": 138290944, + "step": 64030 + }, + { + "epoch": 10.446166394779771, + "grad_norm": 0.9293310046195984, + "learning_rate": 0.0005482700399844536, + "loss": 0.0507, + "num_input_tokens_seen": 138301728, + "step": 64035 + }, + { + "epoch": 10.446982055464927, + "grad_norm": 0.056482378393411636, + "learning_rate": 0.0005481991921468801, + "loss": 0.0048, + "num_input_tokens_seen": 138313248, + "step": 64040 + }, + { + "epoch": 10.447797716150081, + "grad_norm": 0.2966460585594177, + "learning_rate": 0.0005481283433324888, + "loss": 0.0907, + "num_input_tokens_seen": 138324032, + "step": 64045 + }, + { + "epoch": 10.448613376835237, + "grad_norm": 0.23112505674362183, + "learning_rate": 0.0005480574935427157, + "loss": 0.0209, + "num_input_tokens_seen": 138335424, + "step": 64050 + }, + { + "epoch": 10.449429037520392, + "grad_norm": 0.3356492221355438, + "learning_rate": 0.0005479866427789965, + "loss": 0.0301, + "num_input_tokens_seen": 138346400, + "step": 64055 + }, + { + "epoch": 10.450244698205546, + "grad_norm": 0.003506778972223401, + "learning_rate": 0.0005479157910427672, + "loss": 0.0095, + "num_input_tokens_seen": 138357600, + "step": 64060 + }, + { + "epoch": 10.451060358890702, + "grad_norm": 0.2725575268268585, + "learning_rate": 0.0005478449383354634, + "loss": 0.114, + "num_input_tokens_seen": 138368160, + "step": 64065 + }, + { + "epoch": 10.451876019575856, + "grad_norm": 0.003526828018948436, + "learning_rate": 0.0005477740846585213, + "loss": 0.0247, + "num_input_tokens_seen": 138379008, + "step": 64070 + }, + { + "epoch": 10.452691680261012, + "grad_norm": 0.06649752706289291, + "learning_rate": 0.0005477032300133768, + "loss": 0.0175, + "num_input_tokens_seen": 138389568, + "step": 64075 + }, + { + "epoch": 10.453507340946166, + "grad_norm": 0.25740283727645874, + "learning_rate": 0.0005476323744014658, + "loss": 0.0609, + "num_input_tokens_seen": 138400160, + "step": 64080 + }, + { + "epoch": 10.454323001631321, + "grad_norm": 0.03396669030189514, + "learning_rate": 0.0005475615178242244, + "loss": 0.0875, + "num_input_tokens_seen": 138410688, + "step": 64085 + }, + { + "epoch": 10.455138662316477, + "grad_norm": 0.0125178387388587, + "learning_rate": 0.0005474906602830884, + "loss": 0.039, + "num_input_tokens_seen": 138420320, + "step": 64090 + }, + { + "epoch": 10.455954323001631, + "grad_norm": 0.16476449370384216, + "learning_rate": 0.0005474198017794939, + "loss": 0.1703, + "num_input_tokens_seen": 138431680, + "step": 64095 + }, + { + "epoch": 10.456769983686787, + "grad_norm": 0.004711966495960951, + "learning_rate": 0.000547348942314877, + "loss": 0.1373, + "num_input_tokens_seen": 138442016, + "step": 64100 + }, + { + "epoch": 10.45758564437194, + "grad_norm": 0.02316650189459324, + "learning_rate": 0.0005472780818906736, + "loss": 0.0743, + "num_input_tokens_seen": 138452096, + "step": 64105 + }, + { + "epoch": 10.458401305057096, + "grad_norm": 0.0701381042599678, + "learning_rate": 0.00054720722050832, + "loss": 0.072, + "num_input_tokens_seen": 138460288, + "step": 64110 + }, + { + "epoch": 10.459216965742252, + "grad_norm": 0.22336918115615845, + "learning_rate": 0.0005471363581692523, + "loss": 0.0152, + "num_input_tokens_seen": 138471904, + "step": 64115 + }, + { + "epoch": 10.460032626427406, + "grad_norm": 0.003313810098916292, + "learning_rate": 0.0005470654948749065, + "loss": 0.0656, + "num_input_tokens_seen": 138483072, + "step": 64120 + }, + { + "epoch": 10.460848287112562, + "grad_norm": 0.041474759578704834, + "learning_rate": 0.0005469946306267185, + "loss": 0.0052, + "num_input_tokens_seen": 138494496, + "step": 64125 + }, + { + "epoch": 10.461663947797716, + "grad_norm": 0.003572209272533655, + "learning_rate": 0.0005469237654261249, + "loss": 0.0202, + "num_input_tokens_seen": 138505856, + "step": 64130 + }, + { + "epoch": 10.462479608482871, + "grad_norm": 0.1137644425034523, + "learning_rate": 0.0005468528992745615, + "loss": 0.021, + "num_input_tokens_seen": 138517440, + "step": 64135 + }, + { + "epoch": 10.463295269168025, + "grad_norm": 0.002851669443771243, + "learning_rate": 0.0005467820321734647, + "loss": 0.0179, + "num_input_tokens_seen": 138528320, + "step": 64140 + }, + { + "epoch": 10.464110929853181, + "grad_norm": 0.01338632870465517, + "learning_rate": 0.0005467111641242709, + "loss": 0.0815, + "num_input_tokens_seen": 138537888, + "step": 64145 + }, + { + "epoch": 10.464926590538337, + "grad_norm": 0.2335215061903, + "learning_rate": 0.000546640295128416, + "loss": 0.0825, + "num_input_tokens_seen": 138548672, + "step": 64150 + }, + { + "epoch": 10.46574225122349, + "grad_norm": 0.017317278310656548, + "learning_rate": 0.0005465694251873362, + "loss": 0.0199, + "num_input_tokens_seen": 138560032, + "step": 64155 + }, + { + "epoch": 10.466557911908646, + "grad_norm": 0.03315971791744232, + "learning_rate": 0.000546498554302468, + "loss": 0.0263, + "num_input_tokens_seen": 138571872, + "step": 64160 + }, + { + "epoch": 10.4673735725938, + "grad_norm": 0.4206830561161041, + "learning_rate": 0.0005464276824752477, + "loss": 0.1329, + "num_input_tokens_seen": 138583328, + "step": 64165 + }, + { + "epoch": 10.468189233278956, + "grad_norm": 0.007223771885037422, + "learning_rate": 0.0005463568097071115, + "loss": 0.0393, + "num_input_tokens_seen": 138593664, + "step": 64170 + }, + { + "epoch": 10.469004893964112, + "grad_norm": 0.01341505441814661, + "learning_rate": 0.0005462859359994957, + "loss": 0.0588, + "num_input_tokens_seen": 138605408, + "step": 64175 + }, + { + "epoch": 10.469820554649266, + "grad_norm": 0.005490301642566919, + "learning_rate": 0.0005462150613538366, + "loss": 0.0055, + "num_input_tokens_seen": 138615936, + "step": 64180 + }, + { + "epoch": 10.470636215334421, + "grad_norm": 0.10406633466482162, + "learning_rate": 0.0005461441857715708, + "loss": 0.0171, + "num_input_tokens_seen": 138626528, + "step": 64185 + }, + { + "epoch": 10.471451876019575, + "grad_norm": 0.004958090838044882, + "learning_rate": 0.0005460733092541345, + "loss": 0.0126, + "num_input_tokens_seen": 138636288, + "step": 64190 + }, + { + "epoch": 10.47226753670473, + "grad_norm": 0.3021959364414215, + "learning_rate": 0.000546002431802964, + "loss": 0.0472, + "num_input_tokens_seen": 138647904, + "step": 64195 + }, + { + "epoch": 10.473083197389887, + "grad_norm": 0.002978770760819316, + "learning_rate": 0.0005459315534194959, + "loss": 0.0803, + "num_input_tokens_seen": 138657792, + "step": 64200 + }, + { + "epoch": 10.47389885807504, + "grad_norm": 0.007034600712358952, + "learning_rate": 0.0005458606741051667, + "loss": 0.013, + "num_input_tokens_seen": 138668672, + "step": 64205 + }, + { + "epoch": 10.474714518760196, + "grad_norm": 0.14886121451854706, + "learning_rate": 0.0005457897938614127, + "loss": 0.0536, + "num_input_tokens_seen": 138679584, + "step": 64210 + }, + { + "epoch": 10.47553017944535, + "grad_norm": 0.11041484028100967, + "learning_rate": 0.0005457189126896704, + "loss": 0.0166, + "num_input_tokens_seen": 138691712, + "step": 64215 + }, + { + "epoch": 10.476345840130506, + "grad_norm": 0.004423327744007111, + "learning_rate": 0.0005456480305913765, + "loss": 0.0999, + "num_input_tokens_seen": 138703648, + "step": 64220 + }, + { + "epoch": 10.477161500815662, + "grad_norm": 0.018787806853652, + "learning_rate": 0.0005455771475679673, + "loss": 0.0406, + "num_input_tokens_seen": 138713920, + "step": 64225 + }, + { + "epoch": 10.477977161500815, + "grad_norm": 0.023285958915948868, + "learning_rate": 0.0005455062636208793, + "loss": 0.005, + "num_input_tokens_seen": 138724896, + "step": 64230 + }, + { + "epoch": 10.478792822185971, + "grad_norm": 0.002442354802042246, + "learning_rate": 0.0005454353787515493, + "loss": 0.0312, + "num_input_tokens_seen": 138735584, + "step": 64235 + }, + { + "epoch": 10.479608482871125, + "grad_norm": 0.011334599927067757, + "learning_rate": 0.0005453644929614136, + "loss": 0.0171, + "num_input_tokens_seen": 138745664, + "step": 64240 + }, + { + "epoch": 10.48042414355628, + "grad_norm": 0.017016872763633728, + "learning_rate": 0.0005452936062519088, + "loss": 0.1233, + "num_input_tokens_seen": 138756576, + "step": 64245 + }, + { + "epoch": 10.481239804241435, + "grad_norm": 0.06044808775186539, + "learning_rate": 0.0005452227186244717, + "loss": 0.0104, + "num_input_tokens_seen": 138766816, + "step": 64250 + }, + { + "epoch": 10.48205546492659, + "grad_norm": 0.0027733854949474335, + "learning_rate": 0.0005451518300805389, + "loss": 0.0224, + "num_input_tokens_seen": 138777376, + "step": 64255 + }, + { + "epoch": 10.482871125611746, + "grad_norm": 0.001782201579771936, + "learning_rate": 0.0005450809406215469, + "loss": 0.0042, + "num_input_tokens_seen": 138787648, + "step": 64260 + }, + { + "epoch": 10.4836867862969, + "grad_norm": 0.012305756099522114, + "learning_rate": 0.0005450100502489324, + "loss": 0.049, + "num_input_tokens_seen": 138798752, + "step": 64265 + }, + { + "epoch": 10.484502446982056, + "grad_norm": 0.008276435546576977, + "learning_rate": 0.0005449391589641321, + "loss": 0.0068, + "num_input_tokens_seen": 138810016, + "step": 64270 + }, + { + "epoch": 10.48531810766721, + "grad_norm": 0.07297209650278091, + "learning_rate": 0.0005448682667685829, + "loss": 0.0123, + "num_input_tokens_seen": 138821248, + "step": 64275 + }, + { + "epoch": 10.486133768352365, + "grad_norm": 0.047419674694538116, + "learning_rate": 0.0005447973736637214, + "loss": 0.0081, + "num_input_tokens_seen": 138832320, + "step": 64280 + }, + { + "epoch": 10.486949429037521, + "grad_norm": 0.0257173590362072, + "learning_rate": 0.0005447264796509841, + "loss": 0.0423, + "num_input_tokens_seen": 138842464, + "step": 64285 + }, + { + "epoch": 10.487765089722675, + "grad_norm": 0.004003862384706736, + "learning_rate": 0.0005446555847318081, + "loss": 0.0081, + "num_input_tokens_seen": 138852960, + "step": 64290 + }, + { + "epoch": 10.48858075040783, + "grad_norm": 0.10330451279878616, + "learning_rate": 0.00054458468890763, + "loss": 0.0104, + "num_input_tokens_seen": 138864384, + "step": 64295 + }, + { + "epoch": 10.489396411092985, + "grad_norm": 0.056239236146211624, + "learning_rate": 0.0005445137921798866, + "loss": 0.0616, + "num_input_tokens_seen": 138876416, + "step": 64300 + }, + { + "epoch": 10.49021207177814, + "grad_norm": 0.0013666888698935509, + "learning_rate": 0.0005444428945500147, + "loss": 0.0684, + "num_input_tokens_seen": 138886592, + "step": 64305 + }, + { + "epoch": 10.491027732463296, + "grad_norm": 0.1663975864648819, + "learning_rate": 0.0005443719960194513, + "loss": 0.0411, + "num_input_tokens_seen": 138896864, + "step": 64310 + }, + { + "epoch": 10.49184339314845, + "grad_norm": 0.008711322210729122, + "learning_rate": 0.0005443010965896327, + "loss": 0.0068, + "num_input_tokens_seen": 138907680, + "step": 64315 + }, + { + "epoch": 10.492659053833606, + "grad_norm": 0.01615816168487072, + "learning_rate": 0.0005442301962619965, + "loss": 0.0203, + "num_input_tokens_seen": 138917472, + "step": 64320 + }, + { + "epoch": 10.49347471451876, + "grad_norm": 0.028369436040520668, + "learning_rate": 0.0005441592950379792, + "loss": 0.0259, + "num_input_tokens_seen": 138927392, + "step": 64325 + }, + { + "epoch": 10.494290375203915, + "grad_norm": 0.09108876436948776, + "learning_rate": 0.0005440883929190179, + "loss": 0.0313, + "num_input_tokens_seen": 138937792, + "step": 64330 + }, + { + "epoch": 10.49510603588907, + "grad_norm": 0.2788495421409607, + "learning_rate": 0.0005440174899065493, + "loss": 0.0666, + "num_input_tokens_seen": 138949120, + "step": 64335 + }, + { + "epoch": 10.495921696574225, + "grad_norm": 0.003757023485377431, + "learning_rate": 0.0005439465860020104, + "loss": 0.0199, + "num_input_tokens_seen": 138960832, + "step": 64340 + }, + { + "epoch": 10.49673735725938, + "grad_norm": 0.00556677533313632, + "learning_rate": 0.0005438756812068382, + "loss": 0.0084, + "num_input_tokens_seen": 138971648, + "step": 64345 + }, + { + "epoch": 10.497553017944535, + "grad_norm": 0.07826634496450424, + "learning_rate": 0.0005438047755224696, + "loss": 0.0203, + "num_input_tokens_seen": 138982432, + "step": 64350 + }, + { + "epoch": 10.49836867862969, + "grad_norm": 0.017965713515877724, + "learning_rate": 0.0005437338689503417, + "loss": 0.0064, + "num_input_tokens_seen": 138992864, + "step": 64355 + }, + { + "epoch": 10.499184339314844, + "grad_norm": 0.00357259763404727, + "learning_rate": 0.0005436629614918915, + "loss": 0.0203, + "num_input_tokens_seen": 139004768, + "step": 64360 + }, + { + "epoch": 10.5, + "grad_norm": 0.0022475633304566145, + "learning_rate": 0.0005435920531485559, + "loss": 0.0118, + "num_input_tokens_seen": 139015680, + "step": 64365 + }, + { + "epoch": 10.500815660685156, + "grad_norm": 0.18087992072105408, + "learning_rate": 0.0005435211439217722, + "loss": 0.0253, + "num_input_tokens_seen": 139025376, + "step": 64370 + }, + { + "epoch": 10.50163132137031, + "grad_norm": 0.01190586294978857, + "learning_rate": 0.0005434502338129773, + "loss": 0.0041, + "num_input_tokens_seen": 139036288, + "step": 64375 + }, + { + "epoch": 10.502446982055465, + "grad_norm": 1.1199398040771484, + "learning_rate": 0.0005433793228236081, + "loss": 0.1947, + "num_input_tokens_seen": 139046752, + "step": 64380 + }, + { + "epoch": 10.50326264274062, + "grad_norm": 0.07040636241436005, + "learning_rate": 0.000543308410955102, + "loss": 0.0204, + "num_input_tokens_seen": 139057184, + "step": 64385 + }, + { + "epoch": 10.504078303425775, + "grad_norm": 0.05645016208291054, + "learning_rate": 0.0005432374982088961, + "loss": 0.021, + "num_input_tokens_seen": 139067648, + "step": 64390 + }, + { + "epoch": 10.50489396411093, + "grad_norm": 0.006974777206778526, + "learning_rate": 0.0005431665845864274, + "loss": 0.0138, + "num_input_tokens_seen": 139078528, + "step": 64395 + }, + { + "epoch": 10.505709624796085, + "grad_norm": 0.0022757535334676504, + "learning_rate": 0.0005430956700891331, + "loss": 0.0865, + "num_input_tokens_seen": 139090624, + "step": 64400 + }, + { + "epoch": 10.50652528548124, + "grad_norm": 0.4144793152809143, + "learning_rate": 0.0005430247547184504, + "loss": 0.066, + "num_input_tokens_seen": 139101760, + "step": 64405 + }, + { + "epoch": 10.507340946166394, + "grad_norm": 0.011025538668036461, + "learning_rate": 0.0005429538384758162, + "loss": 0.0381, + "num_input_tokens_seen": 139112672, + "step": 64410 + }, + { + "epoch": 10.50815660685155, + "grad_norm": 0.07199998944997787, + "learning_rate": 0.0005428829213626683, + "loss": 0.0099, + "num_input_tokens_seen": 139121440, + "step": 64415 + }, + { + "epoch": 10.508972267536706, + "grad_norm": 0.0066911992616951466, + "learning_rate": 0.0005428120033804433, + "loss": 0.0852, + "num_input_tokens_seen": 139132512, + "step": 64420 + }, + { + "epoch": 10.50978792822186, + "grad_norm": 0.007305980194360018, + "learning_rate": 0.0005427410845305791, + "loss": 0.0751, + "num_input_tokens_seen": 139143392, + "step": 64425 + }, + { + "epoch": 10.510603588907015, + "grad_norm": 0.0012512394459918141, + "learning_rate": 0.0005426701648145124, + "loss": 0.0556, + "num_input_tokens_seen": 139154880, + "step": 64430 + }, + { + "epoch": 10.51141924959217, + "grad_norm": 0.23672683537006378, + "learning_rate": 0.0005425992442336805, + "loss": 0.1436, + "num_input_tokens_seen": 139164480, + "step": 64435 + }, + { + "epoch": 10.512234910277325, + "grad_norm": 0.008508339524269104, + "learning_rate": 0.0005425283227895212, + "loss": 0.1699, + "num_input_tokens_seen": 139173856, + "step": 64440 + }, + { + "epoch": 10.513050570962479, + "grad_norm": 0.1878798007965088, + "learning_rate": 0.0005424574004834712, + "loss": 0.0442, + "num_input_tokens_seen": 139186016, + "step": 64445 + }, + { + "epoch": 10.513866231647635, + "grad_norm": 0.03218397870659828, + "learning_rate": 0.0005423864773169683, + "loss": 0.1763, + "num_input_tokens_seen": 139197088, + "step": 64450 + }, + { + "epoch": 10.51468189233279, + "grad_norm": 0.015488283708691597, + "learning_rate": 0.0005423155532914497, + "loss": 0.0437, + "num_input_tokens_seen": 139207264, + "step": 64455 + }, + { + "epoch": 10.515497553017944, + "grad_norm": 0.011527454480528831, + "learning_rate": 0.0005422446284083527, + "loss": 0.0341, + "num_input_tokens_seen": 139219040, + "step": 64460 + }, + { + "epoch": 10.5163132137031, + "grad_norm": 0.006106112617999315, + "learning_rate": 0.0005421737026691147, + "loss": 0.0074, + "num_input_tokens_seen": 139230784, + "step": 64465 + }, + { + "epoch": 10.517128874388254, + "grad_norm": 0.00418028375133872, + "learning_rate": 0.0005421027760751731, + "loss": 0.0401, + "num_input_tokens_seen": 139240512, + "step": 64470 + }, + { + "epoch": 10.51794453507341, + "grad_norm": 0.04402640089392662, + "learning_rate": 0.0005420318486279653, + "loss": 0.114, + "num_input_tokens_seen": 139250304, + "step": 64475 + }, + { + "epoch": 10.518760195758565, + "grad_norm": 0.42352381348609924, + "learning_rate": 0.0005419609203289288, + "loss": 0.1085, + "num_input_tokens_seen": 139261280, + "step": 64480 + }, + { + "epoch": 10.51957585644372, + "grad_norm": 0.25291287899017334, + "learning_rate": 0.0005418899911795011, + "loss": 0.0252, + "num_input_tokens_seen": 139272640, + "step": 64485 + }, + { + "epoch": 10.520391517128875, + "grad_norm": 0.0042486912570893764, + "learning_rate": 0.0005418190611811194, + "loss": 0.0061, + "num_input_tokens_seen": 139283168, + "step": 64490 + }, + { + "epoch": 10.521207177814029, + "grad_norm": 0.020534677430987358, + "learning_rate": 0.0005417481303352216, + "loss": 0.2991, + "num_input_tokens_seen": 139293920, + "step": 64495 + }, + { + "epoch": 10.522022838499185, + "grad_norm": 0.018889861181378365, + "learning_rate": 0.0005416771986432448, + "loss": 0.0539, + "num_input_tokens_seen": 139304416, + "step": 64500 + }, + { + "epoch": 10.522838499184338, + "grad_norm": 0.29440486431121826, + "learning_rate": 0.0005416062661066268, + "loss": 0.0387, + "num_input_tokens_seen": 139316224, + "step": 64505 + }, + { + "epoch": 10.523654159869494, + "grad_norm": 0.04716013744473457, + "learning_rate": 0.000541535332726805, + "loss": 0.0242, + "num_input_tokens_seen": 139327072, + "step": 64510 + }, + { + "epoch": 10.52446982055465, + "grad_norm": 0.007669588550925255, + "learning_rate": 0.000541464398505217, + "loss": 0.0097, + "num_input_tokens_seen": 139337504, + "step": 64515 + }, + { + "epoch": 10.525285481239804, + "grad_norm": 0.2716355621814728, + "learning_rate": 0.0005413934634433003, + "loss": 0.1008, + "num_input_tokens_seen": 139347616, + "step": 64520 + }, + { + "epoch": 10.52610114192496, + "grad_norm": 0.18859538435935974, + "learning_rate": 0.0005413225275424926, + "loss": 0.0371, + "num_input_tokens_seen": 139359456, + "step": 64525 + }, + { + "epoch": 10.526916802610113, + "grad_norm": 0.30474743247032166, + "learning_rate": 0.0005412515908042314, + "loss": 0.0143, + "num_input_tokens_seen": 139370752, + "step": 64530 + }, + { + "epoch": 10.52773246329527, + "grad_norm": 0.030280984938144684, + "learning_rate": 0.0005411806532299544, + "loss": 0.0078, + "num_input_tokens_seen": 139379584, + "step": 64535 + }, + { + "epoch": 10.528548123980425, + "grad_norm": 0.2833738923072815, + "learning_rate": 0.0005411097148210992, + "loss": 0.0516, + "num_input_tokens_seen": 139389152, + "step": 64540 + }, + { + "epoch": 10.529363784665579, + "grad_norm": 0.22054560482501984, + "learning_rate": 0.0005410387755791036, + "loss": 0.0403, + "num_input_tokens_seen": 139400416, + "step": 64545 + }, + { + "epoch": 10.530179445350734, + "grad_norm": 0.5191470980644226, + "learning_rate": 0.0005409678355054051, + "loss": 0.0801, + "num_input_tokens_seen": 139411456, + "step": 64550 + }, + { + "epoch": 10.530995106035888, + "grad_norm": 0.004127180203795433, + "learning_rate": 0.0005408968946014416, + "loss": 0.0043, + "num_input_tokens_seen": 139422880, + "step": 64555 + }, + { + "epoch": 10.531810766721044, + "grad_norm": 0.0998779833316803, + "learning_rate": 0.0005408259528686503, + "loss": 0.2009, + "num_input_tokens_seen": 139433632, + "step": 64560 + }, + { + "epoch": 10.5326264274062, + "grad_norm": 0.03429444134235382, + "learning_rate": 0.0005407550103084695, + "loss": 0.0099, + "num_input_tokens_seen": 139443520, + "step": 64565 + }, + { + "epoch": 10.533442088091354, + "grad_norm": 0.0019338505808264017, + "learning_rate": 0.0005406840669223367, + "loss": 0.012, + "num_input_tokens_seen": 139455456, + "step": 64570 + }, + { + "epoch": 10.53425774877651, + "grad_norm": 0.05488257855176926, + "learning_rate": 0.0005406131227116896, + "loss": 0.0458, + "num_input_tokens_seen": 139465600, + "step": 64575 + }, + { + "epoch": 10.535073409461663, + "grad_norm": 0.11232556402683258, + "learning_rate": 0.000540542177677966, + "loss": 0.0465, + "num_input_tokens_seen": 139475744, + "step": 64580 + }, + { + "epoch": 10.535889070146819, + "grad_norm": 0.011859245598316193, + "learning_rate": 0.0005404712318226038, + "loss": 0.0283, + "num_input_tokens_seen": 139485280, + "step": 64585 + }, + { + "epoch": 10.536704730831975, + "grad_norm": 0.022117752581834793, + "learning_rate": 0.0005404002851470409, + "loss": 0.0266, + "num_input_tokens_seen": 139495872, + "step": 64590 + }, + { + "epoch": 10.537520391517129, + "grad_norm": 0.01723141223192215, + "learning_rate": 0.0005403293376527148, + "loss": 0.0125, + "num_input_tokens_seen": 139506112, + "step": 64595 + }, + { + "epoch": 10.538336052202284, + "grad_norm": 0.12899601459503174, + "learning_rate": 0.0005402583893410636, + "loss": 0.0421, + "num_input_tokens_seen": 139516672, + "step": 64600 + }, + { + "epoch": 10.539151712887438, + "grad_norm": 0.1691344976425171, + "learning_rate": 0.0005401874402135249, + "loss": 0.0381, + "num_input_tokens_seen": 139527072, + "step": 64605 + }, + { + "epoch": 10.539967373572594, + "grad_norm": 0.17849282920360565, + "learning_rate": 0.000540116490271537, + "loss": 0.0242, + "num_input_tokens_seen": 139538656, + "step": 64610 + }, + { + "epoch": 10.540783034257748, + "grad_norm": 0.010269011370837688, + "learning_rate": 0.0005400455395165373, + "loss": 0.1574, + "num_input_tokens_seen": 139549568, + "step": 64615 + }, + { + "epoch": 10.541598694942904, + "grad_norm": 0.39542660117149353, + "learning_rate": 0.0005399745879499641, + "loss": 0.0575, + "num_input_tokens_seen": 139560768, + "step": 64620 + }, + { + "epoch": 10.54241435562806, + "grad_norm": 0.011781658045947552, + "learning_rate": 0.0005399036355732552, + "loss": 0.0123, + "num_input_tokens_seen": 139571232, + "step": 64625 + }, + { + "epoch": 10.543230016313213, + "grad_norm": 0.018769023939967155, + "learning_rate": 0.0005398326823878482, + "loss": 0.0835, + "num_input_tokens_seen": 139583200, + "step": 64630 + }, + { + "epoch": 10.544045676998369, + "grad_norm": 0.04893181473016739, + "learning_rate": 0.0005397617283951816, + "loss": 0.0129, + "num_input_tokens_seen": 139592032, + "step": 64635 + }, + { + "epoch": 10.544861337683523, + "grad_norm": 0.04125187546014786, + "learning_rate": 0.000539690773596693, + "loss": 0.026, + "num_input_tokens_seen": 139603712, + "step": 64640 + }, + { + "epoch": 10.545676998368679, + "grad_norm": 0.09487692266702652, + "learning_rate": 0.0005396198179938208, + "loss": 0.0631, + "num_input_tokens_seen": 139614400, + "step": 64645 + }, + { + "epoch": 10.546492659053834, + "grad_norm": 0.4042510986328125, + "learning_rate": 0.0005395488615880024, + "loss": 0.0617, + "num_input_tokens_seen": 139625984, + "step": 64650 + }, + { + "epoch": 10.547308319738988, + "grad_norm": 0.011217739433050156, + "learning_rate": 0.0005394779043806764, + "loss": 0.0264, + "num_input_tokens_seen": 139635936, + "step": 64655 + }, + { + "epoch": 10.548123980424144, + "grad_norm": 0.0025864015333354473, + "learning_rate": 0.0005394069463732805, + "loss": 0.0236, + "num_input_tokens_seen": 139646048, + "step": 64660 + }, + { + "epoch": 10.548939641109298, + "grad_norm": 0.04871753975749016, + "learning_rate": 0.0005393359875672527, + "loss": 0.0056, + "num_input_tokens_seen": 139656512, + "step": 64665 + }, + { + "epoch": 10.549755301794454, + "grad_norm": 0.009117928333580494, + "learning_rate": 0.0005392650279640314, + "loss": 0.0618, + "num_input_tokens_seen": 139667552, + "step": 64670 + }, + { + "epoch": 10.550570962479608, + "grad_norm": 0.00621196161955595, + "learning_rate": 0.0005391940675650545, + "loss": 0.0182, + "num_input_tokens_seen": 139679200, + "step": 64675 + }, + { + "epoch": 10.551386623164763, + "grad_norm": 0.14022231101989746, + "learning_rate": 0.00053912310637176, + "loss": 0.017, + "num_input_tokens_seen": 139691104, + "step": 64680 + }, + { + "epoch": 10.552202283849919, + "grad_norm": 0.005030880682170391, + "learning_rate": 0.0005390521443855861, + "loss": 0.0078, + "num_input_tokens_seen": 139702176, + "step": 64685 + }, + { + "epoch": 10.553017944535073, + "grad_norm": 0.0799722671508789, + "learning_rate": 0.0005389811816079711, + "loss": 0.0161, + "num_input_tokens_seen": 139712736, + "step": 64690 + }, + { + "epoch": 10.553833605220229, + "grad_norm": 0.009314349852502346, + "learning_rate": 0.0005389102180403529, + "loss": 0.0155, + "num_input_tokens_seen": 139723072, + "step": 64695 + }, + { + "epoch": 10.554649265905383, + "grad_norm": 0.02171134017407894, + "learning_rate": 0.0005388392536841697, + "loss": 0.0125, + "num_input_tokens_seen": 139733376, + "step": 64700 + }, + { + "epoch": 10.555464926590538, + "grad_norm": 0.006424791179597378, + "learning_rate": 0.00053876828854086, + "loss": 0.0155, + "num_input_tokens_seen": 139745024, + "step": 64705 + }, + { + "epoch": 10.556280587275694, + "grad_norm": 0.002661141101270914, + "learning_rate": 0.0005386973226118615, + "loss": 0.0097, + "num_input_tokens_seen": 139755168, + "step": 64710 + }, + { + "epoch": 10.557096247960848, + "grad_norm": 0.010994684882462025, + "learning_rate": 0.0005386263558986127, + "loss": 0.0817, + "num_input_tokens_seen": 139766144, + "step": 64715 + }, + { + "epoch": 10.557911908646004, + "grad_norm": 0.001512798946350813, + "learning_rate": 0.0005385553884025519, + "loss": 0.005, + "num_input_tokens_seen": 139777248, + "step": 64720 + }, + { + "epoch": 10.558727569331158, + "grad_norm": 0.022147612646222115, + "learning_rate": 0.000538484420125117, + "loss": 0.0091, + "num_input_tokens_seen": 139788000, + "step": 64725 + }, + { + "epoch": 10.559543230016313, + "grad_norm": 0.003416349645704031, + "learning_rate": 0.0005384134510677468, + "loss": 0.0865, + "num_input_tokens_seen": 139799328, + "step": 64730 + }, + { + "epoch": 10.560358890701469, + "grad_norm": 0.31981605291366577, + "learning_rate": 0.0005383424812318791, + "loss": 0.1284, + "num_input_tokens_seen": 139810144, + "step": 64735 + }, + { + "epoch": 10.561174551386623, + "grad_norm": 0.03253169730305672, + "learning_rate": 0.0005382715106189525, + "loss": 0.0047, + "num_input_tokens_seen": 139820480, + "step": 64740 + }, + { + "epoch": 10.561990212071779, + "grad_norm": 0.17456290125846863, + "learning_rate": 0.0005382005392304051, + "loss": 0.1414, + "num_input_tokens_seen": 139831040, + "step": 64745 + }, + { + "epoch": 10.562805872756933, + "grad_norm": 0.06870071589946747, + "learning_rate": 0.0005381295670676752, + "loss": 0.0198, + "num_input_tokens_seen": 139840800, + "step": 64750 + }, + { + "epoch": 10.563621533442088, + "grad_norm": 0.015359500423073769, + "learning_rate": 0.0005380585941322014, + "loss": 0.0808, + "num_input_tokens_seen": 139851904, + "step": 64755 + }, + { + "epoch": 10.564437194127244, + "grad_norm": 0.4003918170928955, + "learning_rate": 0.000537987620425422, + "loss": 0.0464, + "num_input_tokens_seen": 139863200, + "step": 64760 + }, + { + "epoch": 10.565252854812398, + "grad_norm": 0.3785932660102844, + "learning_rate": 0.0005379166459487752, + "loss": 0.0747, + "num_input_tokens_seen": 139873312, + "step": 64765 + }, + { + "epoch": 10.566068515497554, + "grad_norm": 0.0021906422916799784, + "learning_rate": 0.0005378456707036995, + "loss": 0.016, + "num_input_tokens_seen": 139885440, + "step": 64770 + }, + { + "epoch": 10.566884176182707, + "grad_norm": 0.007869427092373371, + "learning_rate": 0.0005377746946916332, + "loss": 0.0628, + "num_input_tokens_seen": 139894336, + "step": 64775 + }, + { + "epoch": 10.567699836867863, + "grad_norm": 0.0180523581802845, + "learning_rate": 0.0005377037179140149, + "loss": 0.0089, + "num_input_tokens_seen": 139906016, + "step": 64780 + }, + { + "epoch": 10.568515497553017, + "grad_norm": 0.18199679255485535, + "learning_rate": 0.0005376327403722828, + "loss": 0.0407, + "num_input_tokens_seen": 139917792, + "step": 64785 + }, + { + "epoch": 10.569331158238173, + "grad_norm": 0.01165593322366476, + "learning_rate": 0.0005375617620678756, + "loss": 0.0179, + "num_input_tokens_seen": 139928608, + "step": 64790 + }, + { + "epoch": 10.570146818923329, + "grad_norm": 0.013312139548361301, + "learning_rate": 0.0005374907830022316, + "loss": 0.0053, + "num_input_tokens_seen": 139940352, + "step": 64795 + }, + { + "epoch": 10.570962479608482, + "grad_norm": 0.020462390035390854, + "learning_rate": 0.0005374198031767892, + "loss": 0.1616, + "num_input_tokens_seen": 139950656, + "step": 64800 + }, + { + "epoch": 10.571778140293638, + "grad_norm": 0.09696569293737411, + "learning_rate": 0.0005373488225929871, + "loss": 0.0076, + "num_input_tokens_seen": 139960864, + "step": 64805 + }, + { + "epoch": 10.572593800978792, + "grad_norm": 0.03129251301288605, + "learning_rate": 0.0005372778412522638, + "loss": 0.006, + "num_input_tokens_seen": 139971968, + "step": 64810 + }, + { + "epoch": 10.573409461663948, + "grad_norm": 0.1927904337644577, + "learning_rate": 0.0005372068591560577, + "loss": 0.0291, + "num_input_tokens_seen": 139982816, + "step": 64815 + }, + { + "epoch": 10.574225122349104, + "grad_norm": 0.012073406018316746, + "learning_rate": 0.0005371358763058074, + "loss": 0.1344, + "num_input_tokens_seen": 139991904, + "step": 64820 + }, + { + "epoch": 10.575040783034257, + "grad_norm": 0.0031769592314958572, + "learning_rate": 0.0005370648927029515, + "loss": 0.1062, + "num_input_tokens_seen": 140002784, + "step": 64825 + }, + { + "epoch": 10.575856443719413, + "grad_norm": 0.0018186360830441117, + "learning_rate": 0.0005369939083489283, + "loss": 0.0101, + "num_input_tokens_seen": 140014272, + "step": 64830 + }, + { + "epoch": 10.576672104404567, + "grad_norm": 0.30345040559768677, + "learning_rate": 0.0005369229232451769, + "loss": 0.0294, + "num_input_tokens_seen": 140025376, + "step": 64835 + }, + { + "epoch": 10.577487765089723, + "grad_norm": 0.00335653405636549, + "learning_rate": 0.0005368519373931355, + "loss": 0.0725, + "num_input_tokens_seen": 140036832, + "step": 64840 + }, + { + "epoch": 10.578303425774878, + "grad_norm": 0.006152989808470011, + "learning_rate": 0.0005367809507942429, + "loss": 0.0029, + "num_input_tokens_seen": 140046464, + "step": 64845 + }, + { + "epoch": 10.579119086460032, + "grad_norm": 0.44906291365623474, + "learning_rate": 0.0005367099634499375, + "loss": 0.1718, + "num_input_tokens_seen": 140056992, + "step": 64850 + }, + { + "epoch": 10.579934747145188, + "grad_norm": 0.3523012101650238, + "learning_rate": 0.0005366389753616583, + "loss": 0.2722, + "num_input_tokens_seen": 140068320, + "step": 64855 + }, + { + "epoch": 10.580750407830342, + "grad_norm": 0.014367824420332909, + "learning_rate": 0.0005365679865308437, + "loss": 0.0082, + "num_input_tokens_seen": 140079328, + "step": 64860 + }, + { + "epoch": 10.581566068515498, + "grad_norm": 0.011161359958350658, + "learning_rate": 0.0005364969969589325, + "loss": 0.0365, + "num_input_tokens_seen": 140089056, + "step": 64865 + }, + { + "epoch": 10.582381729200652, + "grad_norm": 0.0025692936033010483, + "learning_rate": 0.0005364260066473634, + "loss": 0.0547, + "num_input_tokens_seen": 140100160, + "step": 64870 + }, + { + "epoch": 10.583197389885807, + "grad_norm": 0.3549707233905792, + "learning_rate": 0.000536355015597575, + "loss": 0.0397, + "num_input_tokens_seen": 140112128, + "step": 64875 + }, + { + "epoch": 10.584013050570963, + "grad_norm": 0.01046574767678976, + "learning_rate": 0.0005362840238110061, + "loss": 0.0416, + "num_input_tokens_seen": 140123456, + "step": 64880 + }, + { + "epoch": 10.584828711256117, + "grad_norm": 0.02689875289797783, + "learning_rate": 0.0005362130312890955, + "loss": 0.0106, + "num_input_tokens_seen": 140132320, + "step": 64885 + }, + { + "epoch": 10.585644371941273, + "grad_norm": 0.2877082824707031, + "learning_rate": 0.0005361420380332818, + "loss": 0.2631, + "num_input_tokens_seen": 140142880, + "step": 64890 + }, + { + "epoch": 10.586460032626427, + "grad_norm": 0.015195044688880444, + "learning_rate": 0.0005360710440450037, + "loss": 0.0125, + "num_input_tokens_seen": 140152896, + "step": 64895 + }, + { + "epoch": 10.587275693311582, + "grad_norm": 0.162540003657341, + "learning_rate": 0.0005360000493257003, + "loss": 0.0314, + "num_input_tokens_seen": 140162656, + "step": 64900 + }, + { + "epoch": 10.588091353996738, + "grad_norm": 0.00751123484224081, + "learning_rate": 0.0005359290538768102, + "loss": 0.0158, + "num_input_tokens_seen": 140173568, + "step": 64905 + }, + { + "epoch": 10.588907014681892, + "grad_norm": 0.18104144930839539, + "learning_rate": 0.0005358580576997723, + "loss": 0.0381, + "num_input_tokens_seen": 140184192, + "step": 64910 + }, + { + "epoch": 10.589722675367048, + "grad_norm": 0.089718297123909, + "learning_rate": 0.0005357870607960255, + "loss": 0.0439, + "num_input_tokens_seen": 140195648, + "step": 64915 + }, + { + "epoch": 10.590538336052202, + "grad_norm": 0.02439286932349205, + "learning_rate": 0.0005357160631670083, + "loss": 0.0382, + "num_input_tokens_seen": 140206784, + "step": 64920 + }, + { + "epoch": 10.591353996737357, + "grad_norm": 0.3429160416126251, + "learning_rate": 0.0005356450648141599, + "loss": 0.1792, + "num_input_tokens_seen": 140218048, + "step": 64925 + }, + { + "epoch": 10.592169657422513, + "grad_norm": 0.03793445602059364, + "learning_rate": 0.0005355740657389189, + "loss": 0.017, + "num_input_tokens_seen": 140228256, + "step": 64930 + }, + { + "epoch": 10.592985318107667, + "grad_norm": 0.012508481740951538, + "learning_rate": 0.0005355030659427245, + "loss": 0.0076, + "num_input_tokens_seen": 140238688, + "step": 64935 + }, + { + "epoch": 10.593800978792823, + "grad_norm": 0.02273573912680149, + "learning_rate": 0.0005354320654270153, + "loss": 0.0649, + "num_input_tokens_seen": 140249600, + "step": 64940 + }, + { + "epoch": 10.594616639477977, + "grad_norm": 0.326787531375885, + "learning_rate": 0.0005353610641932304, + "loss": 0.2481, + "num_input_tokens_seen": 140260576, + "step": 64945 + }, + { + "epoch": 10.595432300163132, + "grad_norm": 0.04980117827653885, + "learning_rate": 0.0005352900622428086, + "loss": 0.0898, + "num_input_tokens_seen": 140271776, + "step": 64950 + }, + { + "epoch": 10.596247960848288, + "grad_norm": 0.01142844371497631, + "learning_rate": 0.0005352190595771889, + "loss": 0.0473, + "num_input_tokens_seen": 140282528, + "step": 64955 + }, + { + "epoch": 10.597063621533442, + "grad_norm": 0.0047523933462798595, + "learning_rate": 0.0005351480561978103, + "loss": 0.0194, + "num_input_tokens_seen": 140293568, + "step": 64960 + }, + { + "epoch": 10.597879282218598, + "grad_norm": 0.1998644769191742, + "learning_rate": 0.0005350770521061118, + "loss": 0.0443, + "num_input_tokens_seen": 140305632, + "step": 64965 + }, + { + "epoch": 10.598694942903752, + "grad_norm": 0.09209080785512924, + "learning_rate": 0.0005350060473035324, + "loss": 0.0134, + "num_input_tokens_seen": 140316832, + "step": 64970 + }, + { + "epoch": 10.599510603588907, + "grad_norm": 0.012417695485055447, + "learning_rate": 0.000534935041791511, + "loss": 0.0066, + "num_input_tokens_seen": 140328736, + "step": 64975 + }, + { + "epoch": 10.600326264274061, + "grad_norm": 0.061468616127967834, + "learning_rate": 0.0005348640355714866, + "loss": 0.0179, + "num_input_tokens_seen": 140339456, + "step": 64980 + }, + { + "epoch": 10.601141924959217, + "grad_norm": 0.009134767577052116, + "learning_rate": 0.0005347930286448984, + "loss": 0.0054, + "num_input_tokens_seen": 140350720, + "step": 64985 + }, + { + "epoch": 10.601957585644373, + "grad_norm": 0.0015841845888644457, + "learning_rate": 0.0005347220210131853, + "loss": 0.1847, + "num_input_tokens_seen": 140363072, + "step": 64990 + }, + { + "epoch": 10.602773246329527, + "grad_norm": 0.29716333746910095, + "learning_rate": 0.0005346510126777864, + "loss": 0.0169, + "num_input_tokens_seen": 140374240, + "step": 64995 + }, + { + "epoch": 10.603588907014682, + "grad_norm": 0.5121808648109436, + "learning_rate": 0.0005345800036401407, + "loss": 0.0605, + "num_input_tokens_seen": 140385184, + "step": 65000 + }, + { + "epoch": 10.604404567699836, + "grad_norm": 0.17837762832641602, + "learning_rate": 0.0005345089939016874, + "loss": 0.1591, + "num_input_tokens_seen": 140396000, + "step": 65005 + }, + { + "epoch": 10.605220228384992, + "grad_norm": 0.04627210274338722, + "learning_rate": 0.0005344379834638656, + "loss": 0.0646, + "num_input_tokens_seen": 140406848, + "step": 65010 + }, + { + "epoch": 10.606035889070148, + "grad_norm": 0.039753254503011703, + "learning_rate": 0.0005343669723281144, + "loss": 0.0128, + "num_input_tokens_seen": 140418208, + "step": 65015 + }, + { + "epoch": 10.606851549755302, + "grad_norm": 0.26612791419029236, + "learning_rate": 0.0005342959604958728, + "loss": 0.0285, + "num_input_tokens_seen": 140428864, + "step": 65020 + }, + { + "epoch": 10.607667210440457, + "grad_norm": 0.13962987065315247, + "learning_rate": 0.0005342249479685801, + "loss": 0.0204, + "num_input_tokens_seen": 140440288, + "step": 65025 + }, + { + "epoch": 10.608482871125611, + "grad_norm": 0.004963894374668598, + "learning_rate": 0.0005341539347476754, + "loss": 0.0056, + "num_input_tokens_seen": 140451648, + "step": 65030 + }, + { + "epoch": 10.609298531810767, + "grad_norm": 0.24064743518829346, + "learning_rate": 0.0005340829208345979, + "loss": 0.2026, + "num_input_tokens_seen": 140462336, + "step": 65035 + }, + { + "epoch": 10.61011419249592, + "grad_norm": 0.16874991357326508, + "learning_rate": 0.0005340119062307866, + "loss": 0.0166, + "num_input_tokens_seen": 140473056, + "step": 65040 + }, + { + "epoch": 10.610929853181077, + "grad_norm": 0.03800236061215401, + "learning_rate": 0.0005339408909376812, + "loss": 0.0689, + "num_input_tokens_seen": 140482656, + "step": 65045 + }, + { + "epoch": 10.611745513866232, + "grad_norm": 0.11263815313577652, + "learning_rate": 0.0005338698749567203, + "loss": 0.0895, + "num_input_tokens_seen": 140494272, + "step": 65050 + }, + { + "epoch": 10.612561174551386, + "grad_norm": 0.012205363251268864, + "learning_rate": 0.0005337988582893436, + "loss": 0.0099, + "num_input_tokens_seen": 140504640, + "step": 65055 + }, + { + "epoch": 10.613376835236542, + "grad_norm": 0.0024235863238573074, + "learning_rate": 0.0005337278409369901, + "loss": 0.0182, + "num_input_tokens_seen": 140515680, + "step": 65060 + }, + { + "epoch": 10.614192495921696, + "grad_norm": 0.01602749712765217, + "learning_rate": 0.0005336568229010991, + "loss": 0.0831, + "num_input_tokens_seen": 140526752, + "step": 65065 + }, + { + "epoch": 10.615008156606851, + "grad_norm": 0.020563099533319473, + "learning_rate": 0.0005335858041831099, + "loss": 0.033, + "num_input_tokens_seen": 140536384, + "step": 65070 + }, + { + "epoch": 10.615823817292007, + "grad_norm": 0.003615399356931448, + "learning_rate": 0.0005335147847844618, + "loss": 0.076, + "num_input_tokens_seen": 140545440, + "step": 65075 + }, + { + "epoch": 10.616639477977161, + "grad_norm": 0.0032959782984107733, + "learning_rate": 0.000533443764706594, + "loss": 0.0034, + "num_input_tokens_seen": 140555936, + "step": 65080 + }, + { + "epoch": 10.617455138662317, + "grad_norm": 0.013945156708359718, + "learning_rate": 0.0005333727439509459, + "loss": 0.0347, + "num_input_tokens_seen": 140566944, + "step": 65085 + }, + { + "epoch": 10.61827079934747, + "grad_norm": 0.01374234538525343, + "learning_rate": 0.0005333017225189569, + "loss": 0.0238, + "num_input_tokens_seen": 140577568, + "step": 65090 + }, + { + "epoch": 10.619086460032626, + "grad_norm": 0.007713802624493837, + "learning_rate": 0.0005332307004120662, + "loss": 0.1414, + "num_input_tokens_seen": 140586528, + "step": 65095 + }, + { + "epoch": 10.619902120717782, + "grad_norm": 0.0017633598763495684, + "learning_rate": 0.0005331596776317133, + "loss": 0.0061, + "num_input_tokens_seen": 140597952, + "step": 65100 + }, + { + "epoch": 10.620717781402936, + "grad_norm": 0.013678031042218208, + "learning_rate": 0.0005330886541793372, + "loss": 0.0765, + "num_input_tokens_seen": 140608384, + "step": 65105 + }, + { + "epoch": 10.621533442088092, + "grad_norm": 0.013079334050416946, + "learning_rate": 0.0005330176300563778, + "loss": 0.0771, + "num_input_tokens_seen": 140620000, + "step": 65110 + }, + { + "epoch": 10.622349102773246, + "grad_norm": 0.0030080180149525404, + "learning_rate": 0.0005329466052642741, + "loss": 0.0067, + "num_input_tokens_seen": 140630496, + "step": 65115 + }, + { + "epoch": 10.623164763458401, + "grad_norm": 0.24842797219753265, + "learning_rate": 0.0005328755798044658, + "loss": 0.0944, + "num_input_tokens_seen": 140641984, + "step": 65120 + }, + { + "epoch": 10.623980424143557, + "grad_norm": 0.22249571979045868, + "learning_rate": 0.000532804553678392, + "loss": 0.0287, + "num_input_tokens_seen": 140652928, + "step": 65125 + }, + { + "epoch": 10.624796084828711, + "grad_norm": 0.004154822789132595, + "learning_rate": 0.0005327335268874924, + "loss": 0.0064, + "num_input_tokens_seen": 140664448, + "step": 65130 + }, + { + "epoch": 10.625611745513867, + "grad_norm": 0.0061147562228143215, + "learning_rate": 0.0005326624994332063, + "loss": 0.0985, + "num_input_tokens_seen": 140674112, + "step": 65135 + }, + { + "epoch": 10.62642740619902, + "grad_norm": 0.2392461895942688, + "learning_rate": 0.0005325914713169733, + "loss": 0.0699, + "num_input_tokens_seen": 140684352, + "step": 65140 + }, + { + "epoch": 10.627243066884176, + "grad_norm": 0.015056677162647247, + "learning_rate": 0.0005325204425402327, + "loss": 0.146, + "num_input_tokens_seen": 140696256, + "step": 65145 + }, + { + "epoch": 10.62805872756933, + "grad_norm": 0.35335031151771545, + "learning_rate": 0.0005324494131044241, + "loss": 0.0534, + "num_input_tokens_seen": 140707040, + "step": 65150 + }, + { + "epoch": 10.628874388254486, + "grad_norm": 0.015306448563933372, + "learning_rate": 0.000532378383010987, + "loss": 0.0072, + "num_input_tokens_seen": 140718080, + "step": 65155 + }, + { + "epoch": 10.629690048939642, + "grad_norm": 0.00962145160883665, + "learning_rate": 0.0005323073522613608, + "loss": 0.052, + "num_input_tokens_seen": 140728128, + "step": 65160 + }, + { + "epoch": 10.630505709624796, + "grad_norm": 0.018188240006566048, + "learning_rate": 0.0005322363208569851, + "loss": 0.0331, + "num_input_tokens_seen": 140738816, + "step": 65165 + }, + { + "epoch": 10.631321370309951, + "grad_norm": 0.00877345446497202, + "learning_rate": 0.0005321652887992996, + "loss": 0.0446, + "num_input_tokens_seen": 140749216, + "step": 65170 + }, + { + "epoch": 10.632137030995105, + "grad_norm": 0.5343616008758545, + "learning_rate": 0.0005320942560897436, + "loss": 0.0549, + "num_input_tokens_seen": 140760160, + "step": 65175 + }, + { + "epoch": 10.632952691680261, + "grad_norm": 0.0021271174773573875, + "learning_rate": 0.0005320232227297569, + "loss": 0.1599, + "num_input_tokens_seen": 140771200, + "step": 65180 + }, + { + "epoch": 10.633768352365417, + "grad_norm": 0.02698766253888607, + "learning_rate": 0.0005319521887207789, + "loss": 0.0241, + "num_input_tokens_seen": 140782560, + "step": 65185 + }, + { + "epoch": 10.63458401305057, + "grad_norm": 0.005686949472874403, + "learning_rate": 0.0005318811540642493, + "loss": 0.0326, + "num_input_tokens_seen": 140794656, + "step": 65190 + }, + { + "epoch": 10.635399673735726, + "grad_norm": 0.663912832736969, + "learning_rate": 0.0005318101187616077, + "loss": 0.2698, + "num_input_tokens_seen": 140805088, + "step": 65195 + }, + { + "epoch": 10.63621533442088, + "grad_norm": 0.3262537717819214, + "learning_rate": 0.0005317390828142937, + "loss": 0.0567, + "num_input_tokens_seen": 140814976, + "step": 65200 + }, + { + "epoch": 10.637030995106036, + "grad_norm": 0.006468473467975855, + "learning_rate": 0.0005316680462237468, + "loss": 0.0426, + "num_input_tokens_seen": 140826208, + "step": 65205 + }, + { + "epoch": 10.63784665579119, + "grad_norm": 0.006259999703615904, + "learning_rate": 0.0005315970089914068, + "loss": 0.0134, + "num_input_tokens_seen": 140836160, + "step": 65210 + }, + { + "epoch": 10.638662316476346, + "grad_norm": 0.05188210308551788, + "learning_rate": 0.0005315259711187134, + "loss": 0.0205, + "num_input_tokens_seen": 140847520, + "step": 65215 + }, + { + "epoch": 10.639477977161501, + "grad_norm": 0.017969651147723198, + "learning_rate": 0.0005314549326071061, + "loss": 0.0343, + "num_input_tokens_seen": 140858688, + "step": 65220 + }, + { + "epoch": 10.640293637846655, + "grad_norm": 0.1467316597700119, + "learning_rate": 0.0005313838934580248, + "loss": 0.0695, + "num_input_tokens_seen": 140869024, + "step": 65225 + }, + { + "epoch": 10.641109298531811, + "grad_norm": 0.006599868647754192, + "learning_rate": 0.0005313128536729091, + "loss": 0.0143, + "num_input_tokens_seen": 140880288, + "step": 65230 + }, + { + "epoch": 10.641924959216965, + "grad_norm": 0.33210447430610657, + "learning_rate": 0.0005312418132531985, + "loss": 0.0654, + "num_input_tokens_seen": 140890496, + "step": 65235 + }, + { + "epoch": 10.64274061990212, + "grad_norm": 0.0033072608057409525, + "learning_rate": 0.0005311707722003332, + "loss": 0.0421, + "num_input_tokens_seen": 140901728, + "step": 65240 + }, + { + "epoch": 10.643556280587276, + "grad_norm": 0.03693348169326782, + "learning_rate": 0.0005310997305157524, + "loss": 0.1053, + "num_input_tokens_seen": 140912544, + "step": 65245 + }, + { + "epoch": 10.64437194127243, + "grad_norm": 0.08483865857124329, + "learning_rate": 0.0005310286882008962, + "loss": 0.0405, + "num_input_tokens_seen": 140922944, + "step": 65250 + }, + { + "epoch": 10.645187601957586, + "grad_norm": 0.14889277517795563, + "learning_rate": 0.0005309576452572043, + "loss": 0.0287, + "num_input_tokens_seen": 140933216, + "step": 65255 + }, + { + "epoch": 10.64600326264274, + "grad_norm": 0.006382533814758062, + "learning_rate": 0.0005308866016861166, + "loss": 0.0158, + "num_input_tokens_seen": 140944160, + "step": 65260 + }, + { + "epoch": 10.646818923327896, + "grad_norm": 0.010671558789908886, + "learning_rate": 0.0005308155574890725, + "loss": 0.0585, + "num_input_tokens_seen": 140956480, + "step": 65265 + }, + { + "epoch": 10.647634584013051, + "grad_norm": 0.031797025352716446, + "learning_rate": 0.000530744512667512, + "loss": 0.1394, + "num_input_tokens_seen": 140967552, + "step": 65270 + }, + { + "epoch": 10.648450244698205, + "grad_norm": 0.004792911000549793, + "learning_rate": 0.0005306734672228751, + "loss": 0.0817, + "num_input_tokens_seen": 140978560, + "step": 65275 + }, + { + "epoch": 10.649265905383361, + "grad_norm": 0.7232376933097839, + "learning_rate": 0.0005306024211566014, + "loss": 0.0937, + "num_input_tokens_seen": 140989728, + "step": 65280 + }, + { + "epoch": 10.650081566068515, + "grad_norm": 0.11897439509630203, + "learning_rate": 0.0005305313744701309, + "loss": 0.0567, + "num_input_tokens_seen": 141000928, + "step": 65285 + }, + { + "epoch": 10.65089722675367, + "grad_norm": 0.11357161402702332, + "learning_rate": 0.0005304603271649033, + "loss": 0.0245, + "num_input_tokens_seen": 141011936, + "step": 65290 + }, + { + "epoch": 10.651712887438826, + "grad_norm": 0.005883332807570696, + "learning_rate": 0.0005303892792423585, + "loss": 0.0075, + "num_input_tokens_seen": 141023456, + "step": 65295 + }, + { + "epoch": 10.65252854812398, + "grad_norm": 0.2534210979938507, + "learning_rate": 0.0005303182307039364, + "loss": 0.0201, + "num_input_tokens_seen": 141034464, + "step": 65300 + }, + { + "epoch": 10.653344208809136, + "grad_norm": 0.004826645366847515, + "learning_rate": 0.0005302471815510771, + "loss": 0.0268, + "num_input_tokens_seen": 141045248, + "step": 65305 + }, + { + "epoch": 10.65415986949429, + "grad_norm": 0.010334780439734459, + "learning_rate": 0.00053017613178522, + "loss": 0.0064, + "num_input_tokens_seen": 141056736, + "step": 65310 + }, + { + "epoch": 10.654975530179446, + "grad_norm": 0.1845068484544754, + "learning_rate": 0.0005301050814078055, + "loss": 0.0627, + "num_input_tokens_seen": 141066784, + "step": 65315 + }, + { + "epoch": 10.655791190864601, + "grad_norm": 0.39638128876686096, + "learning_rate": 0.0005300340304202734, + "loss": 0.0741, + "num_input_tokens_seen": 141077344, + "step": 65320 + }, + { + "epoch": 10.656606851549755, + "grad_norm": 0.04189550504088402, + "learning_rate": 0.0005299629788240634, + "loss": 0.0603, + "num_input_tokens_seen": 141088320, + "step": 65325 + }, + { + "epoch": 10.65742251223491, + "grad_norm": 0.004338693805038929, + "learning_rate": 0.0005298919266206157, + "loss": 0.0533, + "num_input_tokens_seen": 141099776, + "step": 65330 + }, + { + "epoch": 10.658238172920065, + "grad_norm": 0.490159809589386, + "learning_rate": 0.0005298208738113701, + "loss": 0.088, + "num_input_tokens_seen": 141110592, + "step": 65335 + }, + { + "epoch": 10.65905383360522, + "grad_norm": 0.020773818716406822, + "learning_rate": 0.0005297498203977668, + "loss": 0.058, + "num_input_tokens_seen": 141120928, + "step": 65340 + }, + { + "epoch": 10.659869494290374, + "grad_norm": 0.5976955890655518, + "learning_rate": 0.0005296787663812456, + "loss": 0.0867, + "num_input_tokens_seen": 141132128, + "step": 65345 + }, + { + "epoch": 10.66068515497553, + "grad_norm": 0.004236851818859577, + "learning_rate": 0.0005296077117632464, + "loss": 0.0134, + "num_input_tokens_seen": 141142944, + "step": 65350 + }, + { + "epoch": 10.661500815660686, + "grad_norm": 0.0024764598347246647, + "learning_rate": 0.0005295366565452094, + "loss": 0.0215, + "num_input_tokens_seen": 141153984, + "step": 65355 + }, + { + "epoch": 10.66231647634584, + "grad_norm": 0.28249356150627136, + "learning_rate": 0.0005294656007285748, + "loss": 0.107, + "num_input_tokens_seen": 141163968, + "step": 65360 + }, + { + "epoch": 10.663132137030995, + "grad_norm": 0.017333753407001495, + "learning_rate": 0.0005293945443147821, + "loss": 0.0242, + "num_input_tokens_seen": 141175040, + "step": 65365 + }, + { + "epoch": 10.66394779771615, + "grad_norm": 0.2655371427536011, + "learning_rate": 0.000529323487305272, + "loss": 0.1751, + "num_input_tokens_seen": 141184928, + "step": 65370 + }, + { + "epoch": 10.664763458401305, + "grad_norm": 0.08187251538038254, + "learning_rate": 0.0005292524297014842, + "loss": 0.0071, + "num_input_tokens_seen": 141196768, + "step": 65375 + }, + { + "epoch": 10.66557911908646, + "grad_norm": 0.016217637807130814, + "learning_rate": 0.0005291813715048584, + "loss": 0.057, + "num_input_tokens_seen": 141207712, + "step": 65380 + }, + { + "epoch": 10.666394779771615, + "grad_norm": 0.016979215666651726, + "learning_rate": 0.0005291103127168355, + "loss": 0.0145, + "num_input_tokens_seen": 141218080, + "step": 65385 + }, + { + "epoch": 10.66721044045677, + "grad_norm": 0.006149845663458109, + "learning_rate": 0.000529039253338855, + "loss": 0.0718, + "num_input_tokens_seen": 141229024, + "step": 65390 + }, + { + "epoch": 10.668026101141924, + "grad_norm": 0.1777970790863037, + "learning_rate": 0.0005289681933723573, + "loss": 0.0487, + "num_input_tokens_seen": 141239072, + "step": 65395 + }, + { + "epoch": 10.66884176182708, + "grad_norm": 0.38320648670196533, + "learning_rate": 0.0005288971328187824, + "loss": 0.1186, + "num_input_tokens_seen": 141250400, + "step": 65400 + }, + { + "epoch": 10.669657422512234, + "grad_norm": 0.16135111451148987, + "learning_rate": 0.0005288260716795704, + "loss": 0.016, + "num_input_tokens_seen": 141260192, + "step": 65405 + }, + { + "epoch": 10.67047308319739, + "grad_norm": 0.00716315396130085, + "learning_rate": 0.0005287550099561614, + "loss": 0.0078, + "num_input_tokens_seen": 141270720, + "step": 65410 + }, + { + "epoch": 10.671288743882545, + "grad_norm": 0.2340540587902069, + "learning_rate": 0.0005286839476499959, + "loss": 0.0308, + "num_input_tokens_seen": 141282432, + "step": 65415 + }, + { + "epoch": 10.6721044045677, + "grad_norm": 0.003738554660230875, + "learning_rate": 0.0005286128847625136, + "loss": 0.0139, + "num_input_tokens_seen": 141294240, + "step": 65420 + }, + { + "epoch": 10.672920065252855, + "grad_norm": 0.2376752495765686, + "learning_rate": 0.0005285418212951549, + "loss": 0.0881, + "num_input_tokens_seen": 141305152, + "step": 65425 + }, + { + "epoch": 10.673735725938009, + "grad_norm": 0.0017827788833528757, + "learning_rate": 0.0005284707572493601, + "loss": 0.0143, + "num_input_tokens_seen": 141314944, + "step": 65430 + }, + { + "epoch": 10.674551386623165, + "grad_norm": 0.025138380005955696, + "learning_rate": 0.0005283996926265692, + "loss": 0.1245, + "num_input_tokens_seen": 141326368, + "step": 65435 + }, + { + "epoch": 10.67536704730832, + "grad_norm": 0.009806608781218529, + "learning_rate": 0.0005283286274282226, + "loss": 0.0123, + "num_input_tokens_seen": 141337760, + "step": 65440 + }, + { + "epoch": 10.676182707993474, + "grad_norm": 0.05824309587478638, + "learning_rate": 0.0005282575616557603, + "loss": 0.0242, + "num_input_tokens_seen": 141348768, + "step": 65445 + }, + { + "epoch": 10.67699836867863, + "grad_norm": 0.06231406703591347, + "learning_rate": 0.0005281864953106226, + "loss": 0.1298, + "num_input_tokens_seen": 141360544, + "step": 65450 + }, + { + "epoch": 10.677814029363784, + "grad_norm": 0.012798627838492393, + "learning_rate": 0.0005281154283942501, + "loss": 0.0735, + "num_input_tokens_seen": 141370944, + "step": 65455 + }, + { + "epoch": 10.67862969004894, + "grad_norm": 0.02479397878050804, + "learning_rate": 0.0005280443609080826, + "loss": 0.0367, + "num_input_tokens_seen": 141379584, + "step": 65460 + }, + { + "epoch": 10.679445350734095, + "grad_norm": 0.009278696030378342, + "learning_rate": 0.0005279732928535606, + "loss": 0.0162, + "num_input_tokens_seen": 141390880, + "step": 65465 + }, + { + "epoch": 10.68026101141925, + "grad_norm": 0.011147456243634224, + "learning_rate": 0.0005279022242321242, + "loss": 0.0035, + "num_input_tokens_seen": 141401312, + "step": 65470 + }, + { + "epoch": 10.681076672104405, + "grad_norm": 0.003964760806411505, + "learning_rate": 0.000527831155045214, + "loss": 0.0609, + "num_input_tokens_seen": 141411328, + "step": 65475 + }, + { + "epoch": 10.681892332789559, + "grad_norm": 0.016584768891334534, + "learning_rate": 0.00052776008529427, + "loss": 0.0091, + "num_input_tokens_seen": 141422240, + "step": 65480 + }, + { + "epoch": 10.682707993474715, + "grad_norm": 0.04356918856501579, + "learning_rate": 0.0005276890149807326, + "loss": 0.0253, + "num_input_tokens_seen": 141433152, + "step": 65485 + }, + { + "epoch": 10.68352365415987, + "grad_norm": 0.010738197714090347, + "learning_rate": 0.0005276179441060423, + "loss": 0.0407, + "num_input_tokens_seen": 141444992, + "step": 65490 + }, + { + "epoch": 10.684339314845024, + "grad_norm": 0.010608477517962456, + "learning_rate": 0.0005275468726716393, + "loss": 0.0188, + "num_input_tokens_seen": 141456832, + "step": 65495 + }, + { + "epoch": 10.68515497553018, + "grad_norm": 0.6359379887580872, + "learning_rate": 0.000527475800678964, + "loss": 0.0501, + "num_input_tokens_seen": 141469280, + "step": 65500 + }, + { + "epoch": 10.685970636215334, + "grad_norm": 0.0019201745744794607, + "learning_rate": 0.0005274047281294569, + "loss": 0.0088, + "num_input_tokens_seen": 141479520, + "step": 65505 + }, + { + "epoch": 10.68678629690049, + "grad_norm": 0.1839599311351776, + "learning_rate": 0.000527333655024558, + "loss": 0.0197, + "num_input_tokens_seen": 141490272, + "step": 65510 + }, + { + "epoch": 10.687601957585644, + "grad_norm": 0.10226281732320786, + "learning_rate": 0.0005272625813657079, + "loss": 0.0121, + "num_input_tokens_seen": 141501280, + "step": 65515 + }, + { + "epoch": 10.6884176182708, + "grad_norm": 0.001877550152130425, + "learning_rate": 0.000527191507154347, + "loss": 0.0256, + "num_input_tokens_seen": 141512512, + "step": 65520 + }, + { + "epoch": 10.689233278955955, + "grad_norm": 0.2550641894340515, + "learning_rate": 0.0005271204323919158, + "loss": 0.0788, + "num_input_tokens_seen": 141522912, + "step": 65525 + }, + { + "epoch": 10.690048939641109, + "grad_norm": 0.09536845982074738, + "learning_rate": 0.0005270493570798546, + "loss": 0.0136, + "num_input_tokens_seen": 141533952, + "step": 65530 + }, + { + "epoch": 10.690864600326265, + "grad_norm": 0.141608327627182, + "learning_rate": 0.000526978281219604, + "loss": 0.0671, + "num_input_tokens_seen": 141544768, + "step": 65535 + }, + { + "epoch": 10.691680261011419, + "grad_norm": 0.050538040697574615, + "learning_rate": 0.0005269072048126041, + "loss": 0.0737, + "num_input_tokens_seen": 141556416, + "step": 65540 + }, + { + "epoch": 10.692495921696574, + "grad_norm": 0.03494250029325485, + "learning_rate": 0.0005268361278602957, + "loss": 0.0673, + "num_input_tokens_seen": 141568032, + "step": 65545 + }, + { + "epoch": 10.69331158238173, + "grad_norm": 0.014967870898544788, + "learning_rate": 0.0005267650503641191, + "loss": 0.0549, + "num_input_tokens_seen": 141580224, + "step": 65550 + }, + { + "epoch": 10.694127243066884, + "grad_norm": 0.010322188027203083, + "learning_rate": 0.0005266939723255148, + "loss": 0.1173, + "num_input_tokens_seen": 141591456, + "step": 65555 + }, + { + "epoch": 10.69494290375204, + "grad_norm": 0.021448874846100807, + "learning_rate": 0.0005266228937459233, + "loss": 0.0126, + "num_input_tokens_seen": 141602368, + "step": 65560 + }, + { + "epoch": 10.695758564437194, + "grad_norm": 0.00392883038148284, + "learning_rate": 0.0005265518146267851, + "loss": 0.0375, + "num_input_tokens_seen": 141613696, + "step": 65565 + }, + { + "epoch": 10.69657422512235, + "grad_norm": 0.6401793360710144, + "learning_rate": 0.0005264807349695406, + "loss": 0.0524, + "num_input_tokens_seen": 141624768, + "step": 65570 + }, + { + "epoch": 10.697389885807503, + "grad_norm": 0.516755223274231, + "learning_rate": 0.0005264096547756305, + "loss": 0.0913, + "num_input_tokens_seen": 141635040, + "step": 65575 + }, + { + "epoch": 10.698205546492659, + "grad_norm": 0.03410768136382103, + "learning_rate": 0.0005263385740464951, + "loss": 0.0249, + "num_input_tokens_seen": 141644992, + "step": 65580 + }, + { + "epoch": 10.699021207177815, + "grad_norm": 0.13844673335552216, + "learning_rate": 0.0005262674927835752, + "loss": 0.1242, + "num_input_tokens_seen": 141655424, + "step": 65585 + }, + { + "epoch": 10.699836867862969, + "grad_norm": 0.0018074375111609697, + "learning_rate": 0.0005261964109883111, + "loss": 0.0093, + "num_input_tokens_seen": 141664224, + "step": 65590 + }, + { + "epoch": 10.700652528548124, + "grad_norm": 0.19906216859817505, + "learning_rate": 0.0005261253286621437, + "loss": 0.1212, + "num_input_tokens_seen": 141676512, + "step": 65595 + }, + { + "epoch": 10.701468189233278, + "grad_norm": 0.13830533623695374, + "learning_rate": 0.0005260542458065132, + "loss": 0.0527, + "num_input_tokens_seen": 141687040, + "step": 65600 + }, + { + "epoch": 10.702283849918434, + "grad_norm": 0.5063537359237671, + "learning_rate": 0.0005259831624228605, + "loss": 0.0797, + "num_input_tokens_seen": 141698208, + "step": 65605 + }, + { + "epoch": 10.70309951060359, + "grad_norm": 0.06567066162824631, + "learning_rate": 0.000525912078512626, + "loss": 0.0198, + "num_input_tokens_seen": 141710944, + "step": 65610 + }, + { + "epoch": 10.703915171288743, + "grad_norm": 0.19044040143489838, + "learning_rate": 0.0005258409940772504, + "loss": 0.0304, + "num_input_tokens_seen": 141722272, + "step": 65615 + }, + { + "epoch": 10.7047308319739, + "grad_norm": 0.06141744181513786, + "learning_rate": 0.0005257699091181742, + "loss": 0.178, + "num_input_tokens_seen": 141733664, + "step": 65620 + }, + { + "epoch": 10.705546492659053, + "grad_norm": 0.010067290626466274, + "learning_rate": 0.0005256988236368382, + "loss": 0.0183, + "num_input_tokens_seen": 141744160, + "step": 65625 + }, + { + "epoch": 10.706362153344209, + "grad_norm": 0.014722629450261593, + "learning_rate": 0.0005256277376346829, + "loss": 0.0323, + "num_input_tokens_seen": 141754400, + "step": 65630 + }, + { + "epoch": 10.707177814029365, + "grad_norm": 0.012095391750335693, + "learning_rate": 0.0005255566511131489, + "loss": 0.0283, + "num_input_tokens_seen": 141765504, + "step": 65635 + }, + { + "epoch": 10.707993474714518, + "grad_norm": 0.1298554241657257, + "learning_rate": 0.000525485564073677, + "loss": 0.0264, + "num_input_tokens_seen": 141777664, + "step": 65640 + }, + { + "epoch": 10.708809135399674, + "grad_norm": 0.005714466795325279, + "learning_rate": 0.0005254144765177078, + "loss": 0.0067, + "num_input_tokens_seen": 141788800, + "step": 65645 + }, + { + "epoch": 10.709624796084828, + "grad_norm": 0.0032331603579223156, + "learning_rate": 0.0005253433884466821, + "loss": 0.1435, + "num_input_tokens_seen": 141799936, + "step": 65650 + }, + { + "epoch": 10.710440456769984, + "grad_norm": 0.028762778267264366, + "learning_rate": 0.0005252722998620403, + "loss": 0.0974, + "num_input_tokens_seen": 141809952, + "step": 65655 + }, + { + "epoch": 10.71125611745514, + "grad_norm": 0.012786184437572956, + "learning_rate": 0.0005252012107652234, + "loss": 0.1218, + "num_input_tokens_seen": 141821056, + "step": 65660 + }, + { + "epoch": 10.712071778140293, + "grad_norm": 0.005127850454300642, + "learning_rate": 0.0005251301211576718, + "loss": 0.0301, + "num_input_tokens_seen": 141832416, + "step": 65665 + }, + { + "epoch": 10.71288743882545, + "grad_norm": 0.016213873401284218, + "learning_rate": 0.0005250590310408266, + "loss": 0.0137, + "num_input_tokens_seen": 141842528, + "step": 65670 + }, + { + "epoch": 10.713703099510603, + "grad_norm": 0.07902921736240387, + "learning_rate": 0.0005249879404161284, + "loss": 0.1831, + "num_input_tokens_seen": 141853824, + "step": 65675 + }, + { + "epoch": 10.714518760195759, + "grad_norm": 0.016699183732271194, + "learning_rate": 0.0005249168492850178, + "loss": 0.0312, + "num_input_tokens_seen": 141864960, + "step": 65680 + }, + { + "epoch": 10.715334420880914, + "grad_norm": 0.0019363955361768603, + "learning_rate": 0.0005248457576489356, + "loss": 0.0186, + "num_input_tokens_seen": 141874784, + "step": 65685 + }, + { + "epoch": 10.716150081566068, + "grad_norm": 0.09606332331895828, + "learning_rate": 0.0005247746655093228, + "loss": 0.1051, + "num_input_tokens_seen": 141884640, + "step": 65690 + }, + { + "epoch": 10.716965742251224, + "grad_norm": 0.015151958912611008, + "learning_rate": 0.0005247035728676196, + "loss": 0.0694, + "num_input_tokens_seen": 141895136, + "step": 65695 + }, + { + "epoch": 10.717781402936378, + "grad_norm": 0.004619085229933262, + "learning_rate": 0.0005246324797252674, + "loss": 0.0162, + "num_input_tokens_seen": 141905440, + "step": 65700 + }, + { + "epoch": 10.718597063621534, + "grad_norm": 0.2725040316581726, + "learning_rate": 0.0005245613860837068, + "loss": 0.0238, + "num_input_tokens_seen": 141916160, + "step": 65705 + }, + { + "epoch": 10.719412724306688, + "grad_norm": 0.00907763559371233, + "learning_rate": 0.0005244902919443785, + "loss": 0.0594, + "num_input_tokens_seen": 141927360, + "step": 65710 + }, + { + "epoch": 10.720228384991843, + "grad_norm": 0.5919573903083801, + "learning_rate": 0.0005244191973087233, + "loss": 0.1188, + "num_input_tokens_seen": 141937248, + "step": 65715 + }, + { + "epoch": 10.721044045676999, + "grad_norm": 0.3372504413127899, + "learning_rate": 0.0005243481021781821, + "loss": 0.0679, + "num_input_tokens_seen": 141948992, + "step": 65720 + }, + { + "epoch": 10.721859706362153, + "grad_norm": 0.2855179011821747, + "learning_rate": 0.0005242770065541958, + "loss": 0.041, + "num_input_tokens_seen": 141958880, + "step": 65725 + }, + { + "epoch": 10.722675367047309, + "grad_norm": 0.12741783261299133, + "learning_rate": 0.0005242059104382052, + "loss": 0.0217, + "num_input_tokens_seen": 141969152, + "step": 65730 + }, + { + "epoch": 10.723491027732463, + "grad_norm": 0.00721243629232049, + "learning_rate": 0.000524134813831651, + "loss": 0.0292, + "num_input_tokens_seen": 141980416, + "step": 65735 + }, + { + "epoch": 10.724306688417618, + "grad_norm": 0.04829755425453186, + "learning_rate": 0.0005240637167359743, + "loss": 0.0642, + "num_input_tokens_seen": 141991072, + "step": 65740 + }, + { + "epoch": 10.725122349102774, + "grad_norm": 0.0076208519749343395, + "learning_rate": 0.0005239926191526157, + "loss": 0.0349, + "num_input_tokens_seen": 142003168, + "step": 65745 + }, + { + "epoch": 10.725938009787928, + "grad_norm": 0.17454379796981812, + "learning_rate": 0.0005239215210830164, + "loss": 0.0365, + "num_input_tokens_seen": 142014304, + "step": 65750 + }, + { + "epoch": 10.726753670473084, + "grad_norm": 0.01813514530658722, + "learning_rate": 0.000523850422528617, + "loss": 0.0595, + "num_input_tokens_seen": 142024480, + "step": 65755 + }, + { + "epoch": 10.727569331158238, + "grad_norm": 0.37926414608955383, + "learning_rate": 0.0005237793234908586, + "loss": 0.031, + "num_input_tokens_seen": 142035552, + "step": 65760 + }, + { + "epoch": 10.728384991843393, + "grad_norm": 0.0012836528476327658, + "learning_rate": 0.000523708223971182, + "loss": 0.0101, + "num_input_tokens_seen": 142047456, + "step": 65765 + }, + { + "epoch": 10.729200652528547, + "grad_norm": 0.004975530784577131, + "learning_rate": 0.0005236371239710283, + "loss": 0.0055, + "num_input_tokens_seen": 142058976, + "step": 65770 + }, + { + "epoch": 10.730016313213703, + "grad_norm": 0.006139600649476051, + "learning_rate": 0.0005235660234918381, + "loss": 0.0386, + "num_input_tokens_seen": 142069920, + "step": 65775 + }, + { + "epoch": 10.730831973898859, + "grad_norm": 0.02576206438243389, + "learning_rate": 0.0005234949225350526, + "loss": 0.0445, + "num_input_tokens_seen": 142081152, + "step": 65780 + }, + { + "epoch": 10.731647634584013, + "grad_norm": 0.005682915449142456, + "learning_rate": 0.0005234238211021127, + "loss": 0.0903, + "num_input_tokens_seen": 142091232, + "step": 65785 + }, + { + "epoch": 10.732463295269168, + "grad_norm": 0.015125678852200508, + "learning_rate": 0.0005233527191944593, + "loss": 0.0652, + "num_input_tokens_seen": 142100736, + "step": 65790 + }, + { + "epoch": 10.733278955954322, + "grad_norm": 0.011067863553762436, + "learning_rate": 0.0005232816168135336, + "loss": 0.0116, + "num_input_tokens_seen": 142111232, + "step": 65795 + }, + { + "epoch": 10.734094616639478, + "grad_norm": 0.29029378294944763, + "learning_rate": 0.0005232105139607763, + "loss": 0.2617, + "num_input_tokens_seen": 142122688, + "step": 65800 + }, + { + "epoch": 10.734910277324634, + "grad_norm": 0.016473524272441864, + "learning_rate": 0.0005231394106376283, + "loss": 0.0213, + "num_input_tokens_seen": 142133248, + "step": 65805 + }, + { + "epoch": 10.735725938009788, + "grad_norm": 1.6077208518981934, + "learning_rate": 0.000523068306845531, + "loss": 0.1057, + "num_input_tokens_seen": 142143936, + "step": 65810 + }, + { + "epoch": 10.736541598694943, + "grad_norm": 0.05260138213634491, + "learning_rate": 0.0005229972025859252, + "loss": 0.016, + "num_input_tokens_seen": 142153952, + "step": 65815 + }, + { + "epoch": 10.737357259380097, + "grad_norm": 0.013729028403759003, + "learning_rate": 0.0005229260978602519, + "loss": 0.0038, + "num_input_tokens_seen": 142163712, + "step": 65820 + }, + { + "epoch": 10.738172920065253, + "grad_norm": 0.37588047981262207, + "learning_rate": 0.0005228549926699521, + "loss": 0.1415, + "num_input_tokens_seen": 142173568, + "step": 65825 + }, + { + "epoch": 10.738988580750409, + "grad_norm": 0.010452077724039555, + "learning_rate": 0.0005227838870164669, + "loss": 0.3089, + "num_input_tokens_seen": 142183040, + "step": 65830 + }, + { + "epoch": 10.739804241435563, + "grad_norm": 0.00847290363162756, + "learning_rate": 0.0005227127809012372, + "loss": 0.0174, + "num_input_tokens_seen": 142194752, + "step": 65835 + }, + { + "epoch": 10.740619902120718, + "grad_norm": 0.004005743190646172, + "learning_rate": 0.0005226416743257043, + "loss": 0.104, + "num_input_tokens_seen": 142206144, + "step": 65840 + }, + { + "epoch": 10.741435562805872, + "grad_norm": 0.010764031670987606, + "learning_rate": 0.0005225705672913092, + "loss": 0.0163, + "num_input_tokens_seen": 142215872, + "step": 65845 + }, + { + "epoch": 10.742251223491028, + "grad_norm": 0.0068029677495360374, + "learning_rate": 0.0005224994597994929, + "loss": 0.0413, + "num_input_tokens_seen": 142225184, + "step": 65850 + }, + { + "epoch": 10.743066884176184, + "grad_norm": 0.027975937351584435, + "learning_rate": 0.0005224283518516965, + "loss": 0.0766, + "num_input_tokens_seen": 142235328, + "step": 65855 + }, + { + "epoch": 10.743882544861338, + "grad_norm": 0.003287712810561061, + "learning_rate": 0.000522357243449361, + "loss": 0.0151, + "num_input_tokens_seen": 142246688, + "step": 65860 + }, + { + "epoch": 10.744698205546493, + "grad_norm": 0.014056977815926075, + "learning_rate": 0.0005222861345939278, + "loss": 0.0311, + "num_input_tokens_seen": 142258592, + "step": 65865 + }, + { + "epoch": 10.745513866231647, + "grad_norm": 0.24229301512241364, + "learning_rate": 0.0005222150252868375, + "loss": 0.0317, + "num_input_tokens_seen": 142270016, + "step": 65870 + }, + { + "epoch": 10.746329526916803, + "grad_norm": 0.005472760181874037, + "learning_rate": 0.0005221439155295318, + "loss": 0.0125, + "num_input_tokens_seen": 142280224, + "step": 65875 + }, + { + "epoch": 10.747145187601957, + "grad_norm": 0.20398905873298645, + "learning_rate": 0.0005220728053234514, + "loss": 0.0342, + "num_input_tokens_seen": 142291648, + "step": 65880 + }, + { + "epoch": 10.747960848287113, + "grad_norm": 0.1125456914305687, + "learning_rate": 0.0005220016946700378, + "loss": 0.0427, + "num_input_tokens_seen": 142303392, + "step": 65885 + }, + { + "epoch": 10.748776508972268, + "grad_norm": 0.004370346665382385, + "learning_rate": 0.0005219305835707318, + "loss": 0.1624, + "num_input_tokens_seen": 142313504, + "step": 65890 + }, + { + "epoch": 10.749592169657422, + "grad_norm": 0.019533276557922363, + "learning_rate": 0.0005218594720269748, + "loss": 0.1332, + "num_input_tokens_seen": 142323904, + "step": 65895 + }, + { + "epoch": 10.750407830342578, + "grad_norm": 0.04836461693048477, + "learning_rate": 0.0005217883600402076, + "loss": 0.011, + "num_input_tokens_seen": 142334880, + "step": 65900 + }, + { + "epoch": 10.751223491027732, + "grad_norm": 0.05884992331266403, + "learning_rate": 0.0005217172476118719, + "loss": 0.0278, + "num_input_tokens_seen": 142344960, + "step": 65905 + }, + { + "epoch": 10.752039151712887, + "grad_norm": 0.05737880989909172, + "learning_rate": 0.0005216461347434084, + "loss": 0.0813, + "num_input_tokens_seen": 142355808, + "step": 65910 + }, + { + "epoch": 10.752854812398043, + "grad_norm": 0.003403907176107168, + "learning_rate": 0.0005215750214362588, + "loss": 0.0151, + "num_input_tokens_seen": 142368032, + "step": 65915 + }, + { + "epoch": 10.753670473083197, + "grad_norm": 0.05276336520910263, + "learning_rate": 0.0005215039076918638, + "loss": 0.0087, + "num_input_tokens_seen": 142378784, + "step": 65920 + }, + { + "epoch": 10.754486133768353, + "grad_norm": 0.21015602350234985, + "learning_rate": 0.0005214327935116651, + "loss": 0.0215, + "num_input_tokens_seen": 142389600, + "step": 65925 + }, + { + "epoch": 10.755301794453507, + "grad_norm": 0.19275906682014465, + "learning_rate": 0.0005213616788971034, + "loss": 0.0239, + "num_input_tokens_seen": 142400704, + "step": 65930 + }, + { + "epoch": 10.756117455138662, + "grad_norm": 0.04935337230563164, + "learning_rate": 0.0005212905638496203, + "loss": 0.1233, + "num_input_tokens_seen": 142411904, + "step": 65935 + }, + { + "epoch": 10.756933115823816, + "grad_norm": 0.03261714428663254, + "learning_rate": 0.0005212194483706569, + "loss": 0.0411, + "num_input_tokens_seen": 142423872, + "step": 65940 + }, + { + "epoch": 10.757748776508972, + "grad_norm": 0.004834748338907957, + "learning_rate": 0.0005211483324616544, + "loss": 0.0058, + "num_input_tokens_seen": 142435648, + "step": 65945 + }, + { + "epoch": 10.758564437194128, + "grad_norm": 0.013836579397320747, + "learning_rate": 0.0005210772161240541, + "loss": 0.0076, + "num_input_tokens_seen": 142447936, + "step": 65950 + }, + { + "epoch": 10.759380097879282, + "grad_norm": 0.057208843529224396, + "learning_rate": 0.0005210060993592973, + "loss": 0.0416, + "num_input_tokens_seen": 142459136, + "step": 65955 + }, + { + "epoch": 10.760195758564437, + "grad_norm": 0.003955775871872902, + "learning_rate": 0.0005209349821688254, + "loss": 0.0474, + "num_input_tokens_seen": 142468928, + "step": 65960 + }, + { + "epoch": 10.761011419249591, + "grad_norm": 0.00470461742952466, + "learning_rate": 0.0005208638645540795, + "loss": 0.0353, + "num_input_tokens_seen": 142478656, + "step": 65965 + }, + { + "epoch": 10.761827079934747, + "grad_norm": 0.005068403668701649, + "learning_rate": 0.0005207927465165007, + "loss": 0.0613, + "num_input_tokens_seen": 142489184, + "step": 65970 + }, + { + "epoch": 10.762642740619903, + "grad_norm": 0.24675904214382172, + "learning_rate": 0.0005207216280575306, + "loss": 0.0938, + "num_input_tokens_seen": 142499584, + "step": 65975 + }, + { + "epoch": 10.763458401305057, + "grad_norm": 0.12622298300266266, + "learning_rate": 0.0005206505091786103, + "loss": 0.0198, + "num_input_tokens_seen": 142509440, + "step": 65980 + }, + { + "epoch": 10.764274061990212, + "grad_norm": 0.001572229783050716, + "learning_rate": 0.0005205793898811814, + "loss": 0.0059, + "num_input_tokens_seen": 142520512, + "step": 65985 + }, + { + "epoch": 10.765089722675366, + "grad_norm": 0.04246864467859268, + "learning_rate": 0.0005205082701666851, + "loss": 0.0388, + "num_input_tokens_seen": 142531872, + "step": 65990 + }, + { + "epoch": 10.765905383360522, + "grad_norm": 0.013875995762646198, + "learning_rate": 0.0005204371500365627, + "loss": 0.0517, + "num_input_tokens_seen": 142542784, + "step": 65995 + }, + { + "epoch": 10.766721044045678, + "grad_norm": 0.10224605351686478, + "learning_rate": 0.0005203660294922554, + "loss": 0.1327, + "num_input_tokens_seen": 142553408, + "step": 66000 + }, + { + "epoch": 10.767536704730832, + "grad_norm": 0.10035200417041779, + "learning_rate": 0.0005202949085352048, + "loss": 0.0202, + "num_input_tokens_seen": 142564352, + "step": 66005 + }, + { + "epoch": 10.768352365415987, + "grad_norm": 0.13176901638507843, + "learning_rate": 0.000520223787166852, + "loss": 0.1211, + "num_input_tokens_seen": 142576704, + "step": 66010 + }, + { + "epoch": 10.769168026101141, + "grad_norm": 0.004604107700288296, + "learning_rate": 0.0005201526653886385, + "loss": 0.0651, + "num_input_tokens_seen": 142587392, + "step": 66015 + }, + { + "epoch": 10.769983686786297, + "grad_norm": 0.0311175137758255, + "learning_rate": 0.0005200815432020058, + "loss": 0.1254, + "num_input_tokens_seen": 142597696, + "step": 66020 + }, + { + "epoch": 10.770799347471453, + "grad_norm": 0.027582794427871704, + "learning_rate": 0.0005200104206083951, + "loss": 0.0548, + "num_input_tokens_seen": 142609152, + "step": 66025 + }, + { + "epoch": 10.771615008156607, + "grad_norm": 0.011016631498932838, + "learning_rate": 0.0005199392976092479, + "loss": 0.0127, + "num_input_tokens_seen": 142620608, + "step": 66030 + }, + { + "epoch": 10.772430668841762, + "grad_norm": 0.011967485770583153, + "learning_rate": 0.0005198681742060055, + "loss": 0.0142, + "num_input_tokens_seen": 142630944, + "step": 66035 + }, + { + "epoch": 10.773246329526916, + "grad_norm": 0.013100696727633476, + "learning_rate": 0.0005197970504001091, + "loss": 0.0048, + "num_input_tokens_seen": 142642112, + "step": 66040 + }, + { + "epoch": 10.774061990212072, + "grad_norm": 0.0620444230735302, + "learning_rate": 0.0005197259261930007, + "loss": 0.0705, + "num_input_tokens_seen": 142652064, + "step": 66045 + }, + { + "epoch": 10.774877650897226, + "grad_norm": 0.5616212487220764, + "learning_rate": 0.0005196548015861212, + "loss": 0.0456, + "num_input_tokens_seen": 142663328, + "step": 66050 + }, + { + "epoch": 10.775693311582382, + "grad_norm": 0.0031712185591459274, + "learning_rate": 0.0005195836765809123, + "loss": 0.0436, + "num_input_tokens_seen": 142674304, + "step": 66055 + }, + { + "epoch": 10.776508972267537, + "grad_norm": 0.0015294282929971814, + "learning_rate": 0.0005195125511788153, + "loss": 0.1117, + "num_input_tokens_seen": 142684736, + "step": 66060 + }, + { + "epoch": 10.777324632952691, + "grad_norm": 0.004349547438323498, + "learning_rate": 0.0005194414253812718, + "loss": 0.0388, + "num_input_tokens_seen": 142694880, + "step": 66065 + }, + { + "epoch": 10.778140293637847, + "grad_norm": 0.29317227005958557, + "learning_rate": 0.000519370299189723, + "loss": 0.2097, + "num_input_tokens_seen": 142705280, + "step": 66070 + }, + { + "epoch": 10.778955954323001, + "grad_norm": 0.3925904333591461, + "learning_rate": 0.0005192991726056107, + "loss": 0.1295, + "num_input_tokens_seen": 142716928, + "step": 66075 + }, + { + "epoch": 10.779771615008157, + "grad_norm": 0.020632924512028694, + "learning_rate": 0.0005192280456303759, + "loss": 0.0299, + "num_input_tokens_seen": 142727072, + "step": 66080 + }, + { + "epoch": 10.780587275693312, + "grad_norm": 0.001718319021165371, + "learning_rate": 0.0005191569182654606, + "loss": 0.1368, + "num_input_tokens_seen": 142738560, + "step": 66085 + }, + { + "epoch": 10.781402936378466, + "grad_norm": 0.016666101291775703, + "learning_rate": 0.000519085790512306, + "loss": 0.0515, + "num_input_tokens_seen": 142749696, + "step": 66090 + }, + { + "epoch": 10.782218597063622, + "grad_norm": 0.005761300679296255, + "learning_rate": 0.0005190146623723536, + "loss": 0.0083, + "num_input_tokens_seen": 142761504, + "step": 66095 + }, + { + "epoch": 10.783034257748776, + "grad_norm": 0.020155632868409157, + "learning_rate": 0.000518943533847045, + "loss": 0.1114, + "num_input_tokens_seen": 142772416, + "step": 66100 + }, + { + "epoch": 10.783849918433932, + "grad_norm": 0.006656975019723177, + "learning_rate": 0.0005188724049378216, + "loss": 0.013, + "num_input_tokens_seen": 142783392, + "step": 66105 + }, + { + "epoch": 10.784665579119086, + "grad_norm": 0.005192107055336237, + "learning_rate": 0.0005188012756461251, + "loss": 0.0154, + "num_input_tokens_seen": 142793952, + "step": 66110 + }, + { + "epoch": 10.785481239804241, + "grad_norm": 0.024940945208072662, + "learning_rate": 0.0005187301459733967, + "loss": 0.0309, + "num_input_tokens_seen": 142805856, + "step": 66115 + }, + { + "epoch": 10.786296900489397, + "grad_norm": 0.00655796192586422, + "learning_rate": 0.0005186590159210783, + "loss": 0.0289, + "num_input_tokens_seen": 142818112, + "step": 66120 + }, + { + "epoch": 10.78711256117455, + "grad_norm": 0.24119259417057037, + "learning_rate": 0.0005185878854906111, + "loss": 0.0967, + "num_input_tokens_seen": 142829632, + "step": 66125 + }, + { + "epoch": 10.787928221859707, + "grad_norm": 0.13890810310840607, + "learning_rate": 0.0005185167546834368, + "loss": 0.0153, + "num_input_tokens_seen": 142840192, + "step": 66130 + }, + { + "epoch": 10.78874388254486, + "grad_norm": 0.03538502752780914, + "learning_rate": 0.0005184456235009972, + "loss": 0.0156, + "num_input_tokens_seen": 142851264, + "step": 66135 + }, + { + "epoch": 10.789559543230016, + "grad_norm": 0.02117316424846649, + "learning_rate": 0.0005183744919447335, + "loss": 0.0379, + "num_input_tokens_seen": 142862496, + "step": 66140 + }, + { + "epoch": 10.790375203915172, + "grad_norm": 0.006447410210967064, + "learning_rate": 0.0005183033600160875, + "loss": 0.0769, + "num_input_tokens_seen": 142872896, + "step": 66145 + }, + { + "epoch": 10.791190864600326, + "grad_norm": 0.025039151310920715, + "learning_rate": 0.0005182322277165005, + "loss": 0.022, + "num_input_tokens_seen": 142883936, + "step": 66150 + }, + { + "epoch": 10.792006525285482, + "grad_norm": 0.0019283173605799675, + "learning_rate": 0.0005181610950474143, + "loss": 0.0595, + "num_input_tokens_seen": 142894816, + "step": 66155 + }, + { + "epoch": 10.792822185970635, + "grad_norm": 0.5425769090652466, + "learning_rate": 0.0005180899620102707, + "loss": 0.0537, + "num_input_tokens_seen": 142906752, + "step": 66160 + }, + { + "epoch": 10.793637846655791, + "grad_norm": 0.007401785347610712, + "learning_rate": 0.000518018828606511, + "loss": 0.0908, + "num_input_tokens_seen": 142918464, + "step": 66165 + }, + { + "epoch": 10.794453507340947, + "grad_norm": 0.14382609724998474, + "learning_rate": 0.0005179476948375767, + "loss": 0.0308, + "num_input_tokens_seen": 142930304, + "step": 66170 + }, + { + "epoch": 10.7952691680261, + "grad_norm": 0.026092106476426125, + "learning_rate": 0.0005178765607049098, + "loss": 0.1134, + "num_input_tokens_seen": 142940320, + "step": 66175 + }, + { + "epoch": 10.796084828711257, + "grad_norm": 0.49475693702697754, + "learning_rate": 0.0005178054262099516, + "loss": 0.038, + "num_input_tokens_seen": 142950848, + "step": 66180 + }, + { + "epoch": 10.79690048939641, + "grad_norm": 0.3677700161933899, + "learning_rate": 0.000517734291354144, + "loss": 0.044, + "num_input_tokens_seen": 142962080, + "step": 66185 + }, + { + "epoch": 10.797716150081566, + "grad_norm": 0.15535861253738403, + "learning_rate": 0.0005176631561389283, + "loss": 0.0314, + "num_input_tokens_seen": 142971648, + "step": 66190 + }, + { + "epoch": 10.798531810766722, + "grad_norm": 0.015271559357643127, + "learning_rate": 0.0005175920205657465, + "loss": 0.0056, + "num_input_tokens_seen": 142982912, + "step": 66195 + }, + { + "epoch": 10.799347471451876, + "grad_norm": 0.4133912920951843, + "learning_rate": 0.0005175208846360399, + "loss": 0.0579, + "num_input_tokens_seen": 142992672, + "step": 66200 + }, + { + "epoch": 10.800163132137031, + "grad_norm": 0.0460764579474926, + "learning_rate": 0.0005174497483512506, + "loss": 0.0649, + "num_input_tokens_seen": 143003104, + "step": 66205 + }, + { + "epoch": 10.800978792822185, + "grad_norm": 0.004594626370817423, + "learning_rate": 0.0005173786117128198, + "loss": 0.0756, + "num_input_tokens_seen": 143013216, + "step": 66210 + }, + { + "epoch": 10.801794453507341, + "grad_norm": 0.043709345161914825, + "learning_rate": 0.0005173074747221895, + "loss": 0.028, + "num_input_tokens_seen": 143024192, + "step": 66215 + }, + { + "epoch": 10.802610114192497, + "grad_norm": 0.3211905062198639, + "learning_rate": 0.0005172363373808013, + "loss": 0.0268, + "num_input_tokens_seen": 143035136, + "step": 66220 + }, + { + "epoch": 10.80342577487765, + "grad_norm": 0.010561930947005749, + "learning_rate": 0.0005171651996900967, + "loss": 0.0181, + "num_input_tokens_seen": 143045504, + "step": 66225 + }, + { + "epoch": 10.804241435562806, + "grad_norm": 0.011289691552519798, + "learning_rate": 0.0005170940616515175, + "loss": 0.0166, + "num_input_tokens_seen": 143057024, + "step": 66230 + }, + { + "epoch": 10.80505709624796, + "grad_norm": 0.010316290892660618, + "learning_rate": 0.0005170229232665056, + "loss": 0.0544, + "num_input_tokens_seen": 143068608, + "step": 66235 + }, + { + "epoch": 10.805872756933116, + "grad_norm": 0.12447977811098099, + "learning_rate": 0.0005169517845365025, + "loss": 0.114, + "num_input_tokens_seen": 143078976, + "step": 66240 + }, + { + "epoch": 10.80668841761827, + "grad_norm": 0.05771623179316521, + "learning_rate": 0.0005168806454629501, + "loss": 0.0135, + "num_input_tokens_seen": 143089504, + "step": 66245 + }, + { + "epoch": 10.807504078303426, + "grad_norm": 0.6013209223747253, + "learning_rate": 0.0005168095060472899, + "loss": 0.0947, + "num_input_tokens_seen": 143099392, + "step": 66250 + }, + { + "epoch": 10.808319738988581, + "grad_norm": 0.017578154802322388, + "learning_rate": 0.0005167383662909638, + "loss": 0.0805, + "num_input_tokens_seen": 143109856, + "step": 66255 + }, + { + "epoch": 10.809135399673735, + "grad_norm": 0.012048912234604359, + "learning_rate": 0.0005166672261954134, + "loss": 0.0691, + "num_input_tokens_seen": 143119776, + "step": 66260 + }, + { + "epoch": 10.809951060358891, + "grad_norm": 0.0018194675212725997, + "learning_rate": 0.0005165960857620806, + "loss": 0.0093, + "num_input_tokens_seen": 143130816, + "step": 66265 + }, + { + "epoch": 10.810766721044045, + "grad_norm": 0.13204284012317657, + "learning_rate": 0.000516524944992407, + "loss": 0.0189, + "num_input_tokens_seen": 143141408, + "step": 66270 + }, + { + "epoch": 10.8115823817292, + "grad_norm": 0.02083822339773178, + "learning_rate": 0.0005164538038878345, + "loss": 0.0296, + "num_input_tokens_seen": 143153920, + "step": 66275 + }, + { + "epoch": 10.812398042414356, + "grad_norm": 0.010216380469501019, + "learning_rate": 0.0005163826624498047, + "loss": 0.1024, + "num_input_tokens_seen": 143164256, + "step": 66280 + }, + { + "epoch": 10.81321370309951, + "grad_norm": 0.2752525806427002, + "learning_rate": 0.0005163115206797596, + "loss": 0.1481, + "num_input_tokens_seen": 143175072, + "step": 66285 + }, + { + "epoch": 10.814029363784666, + "grad_norm": 0.14177221059799194, + "learning_rate": 0.0005162403785791408, + "loss": 0.0325, + "num_input_tokens_seen": 143185408, + "step": 66290 + }, + { + "epoch": 10.81484502446982, + "grad_norm": 0.0835612341761589, + "learning_rate": 0.0005161692361493899, + "loss": 0.0445, + "num_input_tokens_seen": 143196000, + "step": 66295 + }, + { + "epoch": 10.815660685154976, + "grad_norm": 0.005078664980828762, + "learning_rate": 0.0005160980933919491, + "loss": 0.0074, + "num_input_tokens_seen": 143206336, + "step": 66300 + }, + { + "epoch": 10.81647634584013, + "grad_norm": 0.01076800748705864, + "learning_rate": 0.00051602695030826, + "loss": 0.037, + "num_input_tokens_seen": 143217952, + "step": 66305 + }, + { + "epoch": 10.817292006525285, + "grad_norm": 0.35145077109336853, + "learning_rate": 0.0005159558068997644, + "loss": 0.116, + "num_input_tokens_seen": 143229504, + "step": 66310 + }, + { + "epoch": 10.818107667210441, + "grad_norm": 0.012480063363909721, + "learning_rate": 0.0005158846631679041, + "loss": 0.0731, + "num_input_tokens_seen": 143239136, + "step": 66315 + }, + { + "epoch": 10.818923327895595, + "grad_norm": 0.0028558603953570127, + "learning_rate": 0.0005158135191141211, + "loss": 0.062, + "num_input_tokens_seen": 143248512, + "step": 66320 + }, + { + "epoch": 10.81973898858075, + "grad_norm": 0.0026985383592545986, + "learning_rate": 0.000515742374739857, + "loss": 0.0593, + "num_input_tokens_seen": 143259776, + "step": 66325 + }, + { + "epoch": 10.820554649265905, + "grad_norm": 0.01275864988565445, + "learning_rate": 0.0005156712300465537, + "loss": 0.039, + "num_input_tokens_seen": 143269440, + "step": 66330 + }, + { + "epoch": 10.82137030995106, + "grad_norm": 0.02906421571969986, + "learning_rate": 0.000515600085035653, + "loss": 0.0167, + "num_input_tokens_seen": 143280384, + "step": 66335 + }, + { + "epoch": 10.822185970636216, + "grad_norm": 0.020170027390122414, + "learning_rate": 0.0005155289397085968, + "loss": 0.0326, + "num_input_tokens_seen": 143291040, + "step": 66340 + }, + { + "epoch": 10.82300163132137, + "grad_norm": 0.03715604171156883, + "learning_rate": 0.0005154577940668269, + "loss": 0.0715, + "num_input_tokens_seen": 143303296, + "step": 66345 + }, + { + "epoch": 10.823817292006526, + "grad_norm": 0.014832494780421257, + "learning_rate": 0.0005153866481117852, + "loss": 0.0322, + "num_input_tokens_seen": 143315072, + "step": 66350 + }, + { + "epoch": 10.82463295269168, + "grad_norm": 0.004788695368915796, + "learning_rate": 0.0005153155018449137, + "loss": 0.0079, + "num_input_tokens_seen": 143326688, + "step": 66355 + }, + { + "epoch": 10.825448613376835, + "grad_norm": 0.021626006811857224, + "learning_rate": 0.000515244355267654, + "loss": 0.0536, + "num_input_tokens_seen": 143336800, + "step": 66360 + }, + { + "epoch": 10.826264274061991, + "grad_norm": 0.09210459142923355, + "learning_rate": 0.0005151732083814481, + "loss": 0.0128, + "num_input_tokens_seen": 143347232, + "step": 66365 + }, + { + "epoch": 10.827079934747145, + "grad_norm": 0.007327007595449686, + "learning_rate": 0.000515102061187738, + "loss": 0.0546, + "num_input_tokens_seen": 143359104, + "step": 66370 + }, + { + "epoch": 10.8278955954323, + "grad_norm": 0.07148618251085281, + "learning_rate": 0.0005150309136879654, + "loss": 0.0249, + "num_input_tokens_seen": 143370304, + "step": 66375 + }, + { + "epoch": 10.828711256117455, + "grad_norm": 0.04153743386268616, + "learning_rate": 0.0005149597658835722, + "loss": 0.0253, + "num_input_tokens_seen": 143381152, + "step": 66380 + }, + { + "epoch": 10.82952691680261, + "grad_norm": 0.023836899548768997, + "learning_rate": 0.0005148886177760005, + "loss": 0.0174, + "num_input_tokens_seen": 143392672, + "step": 66385 + }, + { + "epoch": 10.830342577487766, + "grad_norm": 0.06939146667718887, + "learning_rate": 0.000514817469366692, + "loss": 0.0085, + "num_input_tokens_seen": 143403264, + "step": 66390 + }, + { + "epoch": 10.83115823817292, + "grad_norm": 0.0013916671741753817, + "learning_rate": 0.0005147463206570886, + "loss": 0.0123, + "num_input_tokens_seen": 143414944, + "step": 66395 + }, + { + "epoch": 10.831973898858076, + "grad_norm": 0.009549994952976704, + "learning_rate": 0.0005146751716486324, + "loss": 0.0229, + "num_input_tokens_seen": 143425216, + "step": 66400 + }, + { + "epoch": 10.83278955954323, + "grad_norm": 0.054410431534051895, + "learning_rate": 0.0005146040223427652, + "loss": 0.0585, + "num_input_tokens_seen": 143437120, + "step": 66405 + }, + { + "epoch": 10.833605220228385, + "grad_norm": 0.00718072010204196, + "learning_rate": 0.0005145328727409291, + "loss": 0.0341, + "num_input_tokens_seen": 143448128, + "step": 66410 + }, + { + "epoch": 10.83442088091354, + "grad_norm": 0.186012402176857, + "learning_rate": 0.0005144617228445657, + "loss": 0.019, + "num_input_tokens_seen": 143458976, + "step": 66415 + }, + { + "epoch": 10.835236541598695, + "grad_norm": 0.005144505761563778, + "learning_rate": 0.0005143905726551172, + "loss": 0.1658, + "num_input_tokens_seen": 143469376, + "step": 66420 + }, + { + "epoch": 10.83605220228385, + "grad_norm": 0.01144189853221178, + "learning_rate": 0.0005143194221740255, + "loss": 0.0033, + "num_input_tokens_seen": 143479872, + "step": 66425 + }, + { + "epoch": 10.836867862969005, + "grad_norm": 0.003008177038282156, + "learning_rate": 0.0005142482714027326, + "loss": 0.0458, + "num_input_tokens_seen": 143490976, + "step": 66430 + }, + { + "epoch": 10.83768352365416, + "grad_norm": 0.020077014341950417, + "learning_rate": 0.0005141771203426803, + "loss": 0.0113, + "num_input_tokens_seen": 143502496, + "step": 66435 + }, + { + "epoch": 10.838499184339314, + "grad_norm": 0.26483267545700073, + "learning_rate": 0.0005141059689953107, + "loss": 0.0711, + "num_input_tokens_seen": 143513184, + "step": 66440 + }, + { + "epoch": 10.83931484502447, + "grad_norm": 0.22102683782577515, + "learning_rate": 0.0005140348173620657, + "loss": 0.0152, + "num_input_tokens_seen": 143523232, + "step": 66445 + }, + { + "epoch": 10.840130505709626, + "grad_norm": 0.007105762138962746, + "learning_rate": 0.0005139636654443874, + "loss": 0.032, + "num_input_tokens_seen": 143534208, + "step": 66450 + }, + { + "epoch": 10.84094616639478, + "grad_norm": 0.03749940171837807, + "learning_rate": 0.0005138925132437178, + "loss": 0.0057, + "num_input_tokens_seen": 143545376, + "step": 66455 + }, + { + "epoch": 10.841761827079935, + "grad_norm": 0.00626655388623476, + "learning_rate": 0.0005138213607614985, + "loss": 0.0121, + "num_input_tokens_seen": 143555968, + "step": 66460 + }, + { + "epoch": 10.84257748776509, + "grad_norm": 0.004296998027712107, + "learning_rate": 0.000513750207999172, + "loss": 0.1055, + "num_input_tokens_seen": 143566560, + "step": 66465 + }, + { + "epoch": 10.843393148450245, + "grad_norm": 0.004289441742002964, + "learning_rate": 0.0005136790549581801, + "loss": 0.064, + "num_input_tokens_seen": 143577056, + "step": 66470 + }, + { + "epoch": 10.844208809135399, + "grad_norm": 0.005175084341317415, + "learning_rate": 0.0005136079016399647, + "loss": 0.0078, + "num_input_tokens_seen": 143588672, + "step": 66475 + }, + { + "epoch": 10.845024469820554, + "grad_norm": 0.28688469529151917, + "learning_rate": 0.000513536748045968, + "loss": 0.0346, + "num_input_tokens_seen": 143600320, + "step": 66480 + }, + { + "epoch": 10.84584013050571, + "grad_norm": 0.12599433958530426, + "learning_rate": 0.000513465594177632, + "loss": 0.1222, + "num_input_tokens_seen": 143612608, + "step": 66485 + }, + { + "epoch": 10.846655791190864, + "grad_norm": 0.31342703104019165, + "learning_rate": 0.0005133944400363986, + "loss": 0.2291, + "num_input_tokens_seen": 143623200, + "step": 66490 + }, + { + "epoch": 10.84747145187602, + "grad_norm": 0.10795027762651443, + "learning_rate": 0.0005133232856237098, + "loss": 0.014, + "num_input_tokens_seen": 143634560, + "step": 66495 + }, + { + "epoch": 10.848287112561174, + "grad_norm": 0.3790777027606964, + "learning_rate": 0.0005132521309410078, + "loss": 0.0711, + "num_input_tokens_seen": 143644288, + "step": 66500 + }, + { + "epoch": 10.84910277324633, + "grad_norm": 0.0046887993812561035, + "learning_rate": 0.0005131809759897345, + "loss": 0.005, + "num_input_tokens_seen": 143654848, + "step": 66505 + }, + { + "epoch": 10.849918433931485, + "grad_norm": 0.008742697536945343, + "learning_rate": 0.000513109820771332, + "loss": 0.0074, + "num_input_tokens_seen": 143664768, + "step": 66510 + }, + { + "epoch": 10.850734094616639, + "grad_norm": 0.021899035200476646, + "learning_rate": 0.0005130386652872423, + "loss": 0.0286, + "num_input_tokens_seen": 143675680, + "step": 66515 + }, + { + "epoch": 10.851549755301795, + "grad_norm": 0.33755260705947876, + "learning_rate": 0.0005129675095389076, + "loss": 0.0831, + "num_input_tokens_seen": 143685280, + "step": 66520 + }, + { + "epoch": 10.852365415986949, + "grad_norm": 0.002667512744665146, + "learning_rate": 0.0005128963535277699, + "loss": 0.0268, + "num_input_tokens_seen": 143695648, + "step": 66525 + }, + { + "epoch": 10.853181076672104, + "grad_norm": 0.026985513046383858, + "learning_rate": 0.0005128251972552711, + "loss": 0.0065, + "num_input_tokens_seen": 143706208, + "step": 66530 + }, + { + "epoch": 10.85399673735726, + "grad_norm": 0.3457069396972656, + "learning_rate": 0.0005127540407228535, + "loss": 0.2036, + "num_input_tokens_seen": 143717408, + "step": 66535 + }, + { + "epoch": 10.854812398042414, + "grad_norm": 0.003863809397444129, + "learning_rate": 0.0005126828839319591, + "loss": 0.0185, + "num_input_tokens_seen": 143729632, + "step": 66540 + }, + { + "epoch": 10.85562805872757, + "grad_norm": 0.008951705880463123, + "learning_rate": 0.0005126117268840299, + "loss": 0.029, + "num_input_tokens_seen": 143739968, + "step": 66545 + }, + { + "epoch": 10.856443719412724, + "grad_norm": 0.0031280622351914644, + "learning_rate": 0.000512540569580508, + "loss": 0.0555, + "num_input_tokens_seen": 143751712, + "step": 66550 + }, + { + "epoch": 10.85725938009788, + "grad_norm": 0.1753411591053009, + "learning_rate": 0.0005124694120228357, + "loss": 0.0195, + "num_input_tokens_seen": 143762720, + "step": 66555 + }, + { + "epoch": 10.858075040783035, + "grad_norm": 0.5248215198516846, + "learning_rate": 0.0005123982542124549, + "loss": 0.2099, + "num_input_tokens_seen": 143774304, + "step": 66560 + }, + { + "epoch": 10.858890701468189, + "grad_norm": 0.012592996470630169, + "learning_rate": 0.0005123270961508077, + "loss": 0.0103, + "num_input_tokens_seen": 143783712, + "step": 66565 + }, + { + "epoch": 10.859706362153345, + "grad_norm": 0.029887670651078224, + "learning_rate": 0.0005122559378393363, + "loss": 0.1224, + "num_input_tokens_seen": 143794400, + "step": 66570 + }, + { + "epoch": 10.860522022838499, + "grad_norm": 0.0571708083152771, + "learning_rate": 0.0005121847792794828, + "loss": 0.0685, + "num_input_tokens_seen": 143805728, + "step": 66575 + }, + { + "epoch": 10.861337683523654, + "grad_norm": 0.006564087700098753, + "learning_rate": 0.0005121136204726893, + "loss": 0.0293, + "num_input_tokens_seen": 143815424, + "step": 66580 + }, + { + "epoch": 10.86215334420881, + "grad_norm": 0.005001547280699015, + "learning_rate": 0.0005120424614203978, + "loss": 0.0233, + "num_input_tokens_seen": 143826560, + "step": 66585 + }, + { + "epoch": 10.862969004893964, + "grad_norm": 0.04530732333660126, + "learning_rate": 0.0005119713021240507, + "loss": 0.0681, + "num_input_tokens_seen": 143838016, + "step": 66590 + }, + { + "epoch": 10.86378466557912, + "grad_norm": 0.002425889251753688, + "learning_rate": 0.0005119001425850899, + "loss": 0.1426, + "num_input_tokens_seen": 143850240, + "step": 66595 + }, + { + "epoch": 10.864600326264274, + "grad_norm": 0.0060838027857244015, + "learning_rate": 0.0005118289828049575, + "loss": 0.0393, + "num_input_tokens_seen": 143860960, + "step": 66600 + }, + { + "epoch": 10.86541598694943, + "grad_norm": 0.07112309336662292, + "learning_rate": 0.0005117578227850958, + "loss": 0.1031, + "num_input_tokens_seen": 143871872, + "step": 66605 + }, + { + "epoch": 10.866231647634583, + "grad_norm": 0.026289258152246475, + "learning_rate": 0.000511686662526947, + "loss": 0.0053, + "num_input_tokens_seen": 143880800, + "step": 66610 + }, + { + "epoch": 10.867047308319739, + "grad_norm": 0.011879836209118366, + "learning_rate": 0.0005116155020319531, + "loss": 0.0279, + "num_input_tokens_seen": 143892128, + "step": 66615 + }, + { + "epoch": 10.867862969004895, + "grad_norm": 0.002536615589633584, + "learning_rate": 0.0005115443413015563, + "loss": 0.0288, + "num_input_tokens_seen": 143902304, + "step": 66620 + }, + { + "epoch": 10.868678629690049, + "grad_norm": 0.2889273762702942, + "learning_rate": 0.0005114731803371988, + "loss": 0.0921, + "num_input_tokens_seen": 143914688, + "step": 66625 + }, + { + "epoch": 10.869494290375204, + "grad_norm": 0.003246544860303402, + "learning_rate": 0.0005114020191403228, + "loss": 0.0073, + "num_input_tokens_seen": 143925344, + "step": 66630 + }, + { + "epoch": 10.870309951060358, + "grad_norm": 0.00995062105357647, + "learning_rate": 0.0005113308577123705, + "loss": 0.0165, + "num_input_tokens_seen": 143935776, + "step": 66635 + }, + { + "epoch": 10.871125611745514, + "grad_norm": 0.26391562819480896, + "learning_rate": 0.0005112596960547838, + "loss": 0.0205, + "num_input_tokens_seen": 143947296, + "step": 66640 + }, + { + "epoch": 10.87194127243067, + "grad_norm": 0.01041611097753048, + "learning_rate": 0.0005111885341690051, + "loss": 0.0861, + "num_input_tokens_seen": 143958784, + "step": 66645 + }, + { + "epoch": 10.872756933115824, + "grad_norm": 0.06782546639442444, + "learning_rate": 0.0005111173720564767, + "loss": 0.0348, + "num_input_tokens_seen": 143969824, + "step": 66650 + }, + { + "epoch": 10.87357259380098, + "grad_norm": 0.03075503371655941, + "learning_rate": 0.0005110462097186405, + "loss": 0.038, + "num_input_tokens_seen": 143981440, + "step": 66655 + }, + { + "epoch": 10.874388254486133, + "grad_norm": 0.0384233333170414, + "learning_rate": 0.0005109750471569388, + "loss": 0.2224, + "num_input_tokens_seen": 143991840, + "step": 66660 + }, + { + "epoch": 10.875203915171289, + "grad_norm": 0.38814109563827515, + "learning_rate": 0.000510903884372814, + "loss": 0.1535, + "num_input_tokens_seen": 144003200, + "step": 66665 + }, + { + "epoch": 10.876019575856443, + "grad_norm": 0.1948164850473404, + "learning_rate": 0.0005108327213677081, + "loss": 0.0234, + "num_input_tokens_seen": 144014656, + "step": 66670 + }, + { + "epoch": 10.876835236541599, + "grad_norm": 0.05382636934518814, + "learning_rate": 0.0005107615581430633, + "loss": 0.0065, + "num_input_tokens_seen": 144026400, + "step": 66675 + }, + { + "epoch": 10.877650897226754, + "grad_norm": 0.20784275233745575, + "learning_rate": 0.0005106903947003221, + "loss": 0.023, + "num_input_tokens_seen": 144036480, + "step": 66680 + }, + { + "epoch": 10.878466557911908, + "grad_norm": 0.4996677339076996, + "learning_rate": 0.0005106192310409263, + "loss": 0.0446, + "num_input_tokens_seen": 144046624, + "step": 66685 + }, + { + "epoch": 10.879282218597064, + "grad_norm": 0.002807484706863761, + "learning_rate": 0.0005105480671663183, + "loss": 0.0368, + "num_input_tokens_seen": 144056960, + "step": 66690 + }, + { + "epoch": 10.880097879282218, + "grad_norm": 0.06957192718982697, + "learning_rate": 0.0005104769030779404, + "loss": 0.0435, + "num_input_tokens_seen": 144066592, + "step": 66695 + }, + { + "epoch": 10.880913539967374, + "grad_norm": 0.009007420390844345, + "learning_rate": 0.0005104057387772347, + "loss": 0.02, + "num_input_tokens_seen": 144077504, + "step": 66700 + }, + { + "epoch": 10.88172920065253, + "grad_norm": 0.16739769279956818, + "learning_rate": 0.0005103345742656437, + "loss": 0.0271, + "num_input_tokens_seen": 144086112, + "step": 66705 + }, + { + "epoch": 10.882544861337683, + "grad_norm": 0.004342829342931509, + "learning_rate": 0.0005102634095446092, + "loss": 0.0516, + "num_input_tokens_seen": 144097216, + "step": 66710 + }, + { + "epoch": 10.883360522022839, + "grad_norm": 0.054401274770498276, + "learning_rate": 0.0005101922446155738, + "loss": 0.0135, + "num_input_tokens_seen": 144108000, + "step": 66715 + }, + { + "epoch": 10.884176182707993, + "grad_norm": 0.0049821496941149235, + "learning_rate": 0.0005101210794799797, + "loss": 0.0069, + "num_input_tokens_seen": 144119232, + "step": 66720 + }, + { + "epoch": 10.884991843393149, + "grad_norm": 0.0032072472386062145, + "learning_rate": 0.0005100499141392689, + "loss": 0.0321, + "num_input_tokens_seen": 144129696, + "step": 66725 + }, + { + "epoch": 10.885807504078304, + "grad_norm": 0.001098168664611876, + "learning_rate": 0.0005099787485948839, + "loss": 0.0478, + "num_input_tokens_seen": 144140576, + "step": 66730 + }, + { + "epoch": 10.886623164763458, + "grad_norm": 0.045703381299972534, + "learning_rate": 0.000509907582848267, + "loss": 0.0163, + "num_input_tokens_seen": 144151456, + "step": 66735 + }, + { + "epoch": 10.887438825448614, + "grad_norm": 0.0048604668118059635, + "learning_rate": 0.0005098364169008604, + "loss": 0.0195, + "num_input_tokens_seen": 144161984, + "step": 66740 + }, + { + "epoch": 10.888254486133768, + "grad_norm": 0.37948596477508545, + "learning_rate": 0.0005097652507541062, + "loss": 0.099, + "num_input_tokens_seen": 144173152, + "step": 66745 + }, + { + "epoch": 10.889070146818923, + "grad_norm": 0.012583531439304352, + "learning_rate": 0.0005096940844094467, + "loss": 0.0098, + "num_input_tokens_seen": 144184736, + "step": 66750 + }, + { + "epoch": 10.88988580750408, + "grad_norm": 0.02361808530986309, + "learning_rate": 0.0005096229178683244, + "loss": 0.0141, + "num_input_tokens_seen": 144196064, + "step": 66755 + }, + { + "epoch": 10.890701468189233, + "grad_norm": 0.15489862859249115, + "learning_rate": 0.0005095517511321815, + "loss": 0.02, + "num_input_tokens_seen": 144207200, + "step": 66760 + }, + { + "epoch": 10.891517128874389, + "grad_norm": 0.01060162577778101, + "learning_rate": 0.0005094805842024603, + "loss": 0.0102, + "num_input_tokens_seen": 144217664, + "step": 66765 + }, + { + "epoch": 10.892332789559543, + "grad_norm": 0.0021783667616546154, + "learning_rate": 0.000509409417080603, + "loss": 0.1081, + "num_input_tokens_seen": 144226784, + "step": 66770 + }, + { + "epoch": 10.893148450244698, + "grad_norm": 0.10572908818721771, + "learning_rate": 0.0005093382497680516, + "loss": 0.1198, + "num_input_tokens_seen": 144237024, + "step": 66775 + }, + { + "epoch": 10.893964110929852, + "grad_norm": 0.35556432604789734, + "learning_rate": 0.000509267082266249, + "loss": 0.0452, + "num_input_tokens_seen": 144247488, + "step": 66780 + }, + { + "epoch": 10.894779771615008, + "grad_norm": 0.020045407116413116, + "learning_rate": 0.0005091959145766373, + "loss": 0.0503, + "num_input_tokens_seen": 144258048, + "step": 66785 + }, + { + "epoch": 10.895595432300164, + "grad_norm": 0.10034667700529099, + "learning_rate": 0.0005091247467006588, + "loss": 0.0272, + "num_input_tokens_seen": 144268832, + "step": 66790 + }, + { + "epoch": 10.896411092985318, + "grad_norm": 0.005177509505301714, + "learning_rate": 0.0005090535786397556, + "loss": 0.1209, + "num_input_tokens_seen": 144279744, + "step": 66795 + }, + { + "epoch": 10.897226753670473, + "grad_norm": 0.0051989988423883915, + "learning_rate": 0.0005089824103953701, + "loss": 0.0217, + "num_input_tokens_seen": 144290976, + "step": 66800 + }, + { + "epoch": 10.898042414355627, + "grad_norm": 0.16461747884750366, + "learning_rate": 0.0005089112419689447, + "loss": 0.0298, + "num_input_tokens_seen": 144301312, + "step": 66805 + }, + { + "epoch": 10.898858075040783, + "grad_norm": 0.04938230663537979, + "learning_rate": 0.0005088400733619217, + "loss": 0.0197, + "num_input_tokens_seen": 144311904, + "step": 66810 + }, + { + "epoch": 10.899673735725939, + "grad_norm": 0.004325589165091515, + "learning_rate": 0.0005087689045757433, + "loss": 0.2009, + "num_input_tokens_seen": 144323392, + "step": 66815 + }, + { + "epoch": 10.900489396411093, + "grad_norm": 0.006989457178860903, + "learning_rate": 0.000508697735611852, + "loss": 0.0353, + "num_input_tokens_seen": 144335232, + "step": 66820 + }, + { + "epoch": 10.901305057096248, + "grad_norm": 0.021615099161863327, + "learning_rate": 0.0005086265664716901, + "loss": 0.0172, + "num_input_tokens_seen": 144345408, + "step": 66825 + }, + { + "epoch": 10.902120717781402, + "grad_norm": 0.005931622814387083, + "learning_rate": 0.0005085553971566998, + "loss": 0.0237, + "num_input_tokens_seen": 144356000, + "step": 66830 + }, + { + "epoch": 10.902936378466558, + "grad_norm": 0.6692498326301575, + "learning_rate": 0.0005084842276683236, + "loss": 0.0503, + "num_input_tokens_seen": 144366048, + "step": 66835 + }, + { + "epoch": 10.903752039151712, + "grad_norm": 0.003552852664142847, + "learning_rate": 0.0005084130580080038, + "loss": 0.0162, + "num_input_tokens_seen": 144377312, + "step": 66840 + }, + { + "epoch": 10.904567699836868, + "grad_norm": 0.004602952394634485, + "learning_rate": 0.0005083418881771826, + "loss": 0.0141, + "num_input_tokens_seen": 144387872, + "step": 66845 + }, + { + "epoch": 10.905383360522023, + "grad_norm": 0.1763039380311966, + "learning_rate": 0.0005082707181773025, + "loss": 0.0307, + "num_input_tokens_seen": 144399232, + "step": 66850 + }, + { + "epoch": 10.906199021207177, + "grad_norm": 0.11560584604740143, + "learning_rate": 0.0005081995480098057, + "loss": 0.02, + "num_input_tokens_seen": 144410144, + "step": 66855 + }, + { + "epoch": 10.907014681892333, + "grad_norm": 0.14307036995887756, + "learning_rate": 0.0005081283776761348, + "loss": 0.0186, + "num_input_tokens_seen": 144420672, + "step": 66860 + }, + { + "epoch": 10.907830342577487, + "grad_norm": 0.45195597410202026, + "learning_rate": 0.0005080572071777319, + "loss": 0.1616, + "num_input_tokens_seen": 144431936, + "step": 66865 + }, + { + "epoch": 10.908646003262643, + "grad_norm": 0.003012361004948616, + "learning_rate": 0.0005079860365160395, + "loss": 0.0048, + "num_input_tokens_seen": 144442816, + "step": 66870 + }, + { + "epoch": 10.909461663947798, + "grad_norm": 0.11098600924015045, + "learning_rate": 0.0005079148656924999, + "loss": 0.0662, + "num_input_tokens_seen": 144453408, + "step": 66875 + }, + { + "epoch": 10.910277324632952, + "grad_norm": 0.20769816637039185, + "learning_rate": 0.0005078436947085557, + "loss": 0.0218, + "num_input_tokens_seen": 144463968, + "step": 66880 + }, + { + "epoch": 10.911092985318108, + "grad_norm": 0.007285351864993572, + "learning_rate": 0.0005077725235656488, + "loss": 0.0462, + "num_input_tokens_seen": 144474176, + "step": 66885 + }, + { + "epoch": 10.911908646003262, + "grad_norm": 0.005105094984173775, + "learning_rate": 0.000507701352265222, + "loss": 0.0245, + "num_input_tokens_seen": 144485600, + "step": 66890 + }, + { + "epoch": 10.912724306688418, + "grad_norm": 0.05201911926269531, + "learning_rate": 0.0005076301808087176, + "loss": 0.0075, + "num_input_tokens_seen": 144496864, + "step": 66895 + }, + { + "epoch": 10.913539967373573, + "grad_norm": 0.0026850395370274782, + "learning_rate": 0.0005075590091975779, + "loss": 0.1397, + "num_input_tokens_seen": 144506944, + "step": 66900 + }, + { + "epoch": 10.914355628058727, + "grad_norm": 0.017833666875958443, + "learning_rate": 0.0005074878374332452, + "loss": 0.0205, + "num_input_tokens_seen": 144518144, + "step": 66905 + }, + { + "epoch": 10.915171288743883, + "grad_norm": 0.01121000200510025, + "learning_rate": 0.000507416665517162, + "loss": 0.0068, + "num_input_tokens_seen": 144527328, + "step": 66910 + }, + { + "epoch": 10.915986949429037, + "grad_norm": 0.006536161061376333, + "learning_rate": 0.0005073454934507708, + "loss": 0.006, + "num_input_tokens_seen": 144538560, + "step": 66915 + }, + { + "epoch": 10.916802610114193, + "grad_norm": 0.3094877600669861, + "learning_rate": 0.0005072743212355135, + "loss": 0.1158, + "num_input_tokens_seen": 144550560, + "step": 66920 + }, + { + "epoch": 10.917618270799348, + "grad_norm": 0.07924286276102066, + "learning_rate": 0.0005072031488728331, + "loss": 0.0204, + "num_input_tokens_seen": 144561472, + "step": 66925 + }, + { + "epoch": 10.918433931484502, + "grad_norm": 0.47475647926330566, + "learning_rate": 0.0005071319763641718, + "loss": 0.0448, + "num_input_tokens_seen": 144572864, + "step": 66930 + }, + { + "epoch": 10.919249592169658, + "grad_norm": 0.03268728777766228, + "learning_rate": 0.0005070608037109718, + "loss": 0.0069, + "num_input_tokens_seen": 144582912, + "step": 66935 + }, + { + "epoch": 10.920065252854812, + "grad_norm": 0.010898868553340435, + "learning_rate": 0.0005069896309146758, + "loss": 0.0214, + "num_input_tokens_seen": 144593664, + "step": 66940 + }, + { + "epoch": 10.920880913539968, + "grad_norm": 0.024903923273086548, + "learning_rate": 0.000506918457976726, + "loss": 0.1886, + "num_input_tokens_seen": 144604192, + "step": 66945 + }, + { + "epoch": 10.921696574225122, + "grad_norm": 2.0595877170562744, + "learning_rate": 0.0005068472848985647, + "loss": 0.0321, + "num_input_tokens_seen": 144614336, + "step": 66950 + }, + { + "epoch": 10.922512234910277, + "grad_norm": 0.003864932106807828, + "learning_rate": 0.0005067761116816348, + "loss": 0.0096, + "num_input_tokens_seen": 144625824, + "step": 66955 + }, + { + "epoch": 10.923327895595433, + "grad_norm": 0.02564224787056446, + "learning_rate": 0.0005067049383273783, + "loss": 0.0166, + "num_input_tokens_seen": 144636064, + "step": 66960 + }, + { + "epoch": 10.924143556280587, + "grad_norm": 0.0030069448985159397, + "learning_rate": 0.0005066337648372376, + "loss": 0.0394, + "num_input_tokens_seen": 144646176, + "step": 66965 + }, + { + "epoch": 10.924959216965743, + "grad_norm": 0.0064015681855380535, + "learning_rate": 0.0005065625912126553, + "loss": 0.0149, + "num_input_tokens_seen": 144655616, + "step": 66970 + }, + { + "epoch": 10.925774877650896, + "grad_norm": 0.016096873208880424, + "learning_rate": 0.0005064914174550737, + "loss": 0.0167, + "num_input_tokens_seen": 144667072, + "step": 66975 + }, + { + "epoch": 10.926590538336052, + "grad_norm": 0.018032198771834373, + "learning_rate": 0.0005064202435659354, + "loss": 0.05, + "num_input_tokens_seen": 144676800, + "step": 66980 + }, + { + "epoch": 10.927406199021208, + "grad_norm": 0.13074368238449097, + "learning_rate": 0.0005063490695466827, + "loss": 0.0129, + "num_input_tokens_seen": 144687616, + "step": 66985 + }, + { + "epoch": 10.928221859706362, + "grad_norm": 0.006677868310362101, + "learning_rate": 0.000506277895398758, + "loss": 0.017, + "num_input_tokens_seen": 144698464, + "step": 66990 + }, + { + "epoch": 10.929037520391518, + "grad_norm": 0.07817433029413223, + "learning_rate": 0.0005062067211236039, + "loss": 0.0938, + "num_input_tokens_seen": 144709216, + "step": 66995 + }, + { + "epoch": 10.929853181076671, + "grad_norm": 0.008835572749376297, + "learning_rate": 0.0005061355467226626, + "loss": 0.0782, + "num_input_tokens_seen": 144720224, + "step": 67000 + }, + { + "epoch": 10.930668841761827, + "grad_norm": 0.002849903656169772, + "learning_rate": 0.0005060643721973766, + "loss": 0.0499, + "num_input_tokens_seen": 144730592, + "step": 67005 + }, + { + "epoch": 10.931484502446983, + "grad_norm": 0.0012694394681602716, + "learning_rate": 0.0005059931975491886, + "loss": 0.1155, + "num_input_tokens_seen": 144740960, + "step": 67010 + }, + { + "epoch": 10.932300163132137, + "grad_norm": 0.00297158001922071, + "learning_rate": 0.0005059220227795409, + "loss": 0.0047, + "num_input_tokens_seen": 144751520, + "step": 67015 + }, + { + "epoch": 10.933115823817293, + "grad_norm": 0.8607537150382996, + "learning_rate": 0.0005058508478898757, + "loss": 0.0684, + "num_input_tokens_seen": 144761856, + "step": 67020 + }, + { + "epoch": 10.933931484502446, + "grad_norm": 0.20840585231781006, + "learning_rate": 0.0005057796728816358, + "loss": 0.016, + "num_input_tokens_seen": 144772672, + "step": 67025 + }, + { + "epoch": 10.934747145187602, + "grad_norm": 0.006408384069800377, + "learning_rate": 0.0005057084977562633, + "loss": 0.0143, + "num_input_tokens_seen": 144784224, + "step": 67030 + }, + { + "epoch": 10.935562805872756, + "grad_norm": 0.28648972511291504, + "learning_rate": 0.0005056373225152009, + "loss": 0.0144, + "num_input_tokens_seen": 144793216, + "step": 67035 + }, + { + "epoch": 10.936378466557912, + "grad_norm": 0.018111038953065872, + "learning_rate": 0.0005055661471598911, + "loss": 0.0031, + "num_input_tokens_seen": 144803680, + "step": 67040 + }, + { + "epoch": 10.937194127243067, + "grad_norm": 0.02693861722946167, + "learning_rate": 0.0005054949716917763, + "loss": 0.0052, + "num_input_tokens_seen": 144814816, + "step": 67045 + }, + { + "epoch": 10.938009787928221, + "grad_norm": 0.27363863587379456, + "learning_rate": 0.0005054237961122989, + "loss": 0.0874, + "num_input_tokens_seen": 144826176, + "step": 67050 + }, + { + "epoch": 10.938825448613377, + "grad_norm": 0.015029046684503555, + "learning_rate": 0.0005053526204229012, + "loss": 0.0694, + "num_input_tokens_seen": 144836160, + "step": 67055 + }, + { + "epoch": 10.939641109298531, + "grad_norm": 0.005386275239288807, + "learning_rate": 0.000505281444625026, + "loss": 0.0082, + "num_input_tokens_seen": 144845088, + "step": 67060 + }, + { + "epoch": 10.940456769983687, + "grad_norm": 0.015765998512506485, + "learning_rate": 0.0005052102687201156, + "loss": 0.0738, + "num_input_tokens_seen": 144856288, + "step": 67065 + }, + { + "epoch": 10.941272430668842, + "grad_norm": 0.046367112547159195, + "learning_rate": 0.0005051390927096125, + "loss": 0.0486, + "num_input_tokens_seen": 144866208, + "step": 67070 + }, + { + "epoch": 10.942088091353996, + "grad_norm": 0.014453914016485214, + "learning_rate": 0.0005050679165949592, + "loss": 0.0542, + "num_input_tokens_seen": 144877344, + "step": 67075 + }, + { + "epoch": 10.942903752039152, + "grad_norm": 0.02145060896873474, + "learning_rate": 0.0005049967403775982, + "loss": 0.0326, + "num_input_tokens_seen": 144888160, + "step": 67080 + }, + { + "epoch": 10.943719412724306, + "grad_norm": 0.0056599765084683895, + "learning_rate": 0.0005049255640589718, + "loss": 0.0566, + "num_input_tokens_seen": 144900224, + "step": 67085 + }, + { + "epoch": 10.944535073409462, + "grad_norm": 0.4373588562011719, + "learning_rate": 0.0005048543876405225, + "loss": 0.0846, + "num_input_tokens_seen": 144909792, + "step": 67090 + }, + { + "epoch": 10.945350734094617, + "grad_norm": 0.007855315692722797, + "learning_rate": 0.000504783211123693, + "loss": 0.1838, + "num_input_tokens_seen": 144920992, + "step": 67095 + }, + { + "epoch": 10.946166394779771, + "grad_norm": 0.41129010915756226, + "learning_rate": 0.0005047120345099258, + "loss": 0.1903, + "num_input_tokens_seen": 144930080, + "step": 67100 + }, + { + "epoch": 10.946982055464927, + "grad_norm": 0.00836831796914339, + "learning_rate": 0.0005046408578006631, + "loss": 0.0198, + "num_input_tokens_seen": 144940320, + "step": 67105 + }, + { + "epoch": 10.947797716150081, + "grad_norm": 0.04342430830001831, + "learning_rate": 0.0005045696809973474, + "loss": 0.0137, + "num_input_tokens_seen": 144951392, + "step": 67110 + }, + { + "epoch": 10.948613376835237, + "grad_norm": 0.0020754148717969656, + "learning_rate": 0.0005044985041014217, + "loss": 0.0124, + "num_input_tokens_seen": 144960928, + "step": 67115 + }, + { + "epoch": 10.949429037520392, + "grad_norm": 0.007274713832885027, + "learning_rate": 0.0005044273271143277, + "loss": 0.0152, + "num_input_tokens_seen": 144973024, + "step": 67120 + }, + { + "epoch": 10.950244698205546, + "grad_norm": 0.4020930826663971, + "learning_rate": 0.0005043561500375085, + "loss": 0.0408, + "num_input_tokens_seen": 144983776, + "step": 67125 + }, + { + "epoch": 10.951060358890702, + "grad_norm": 0.42312315106391907, + "learning_rate": 0.0005042849728724064, + "loss": 0.0615, + "num_input_tokens_seen": 144993888, + "step": 67130 + }, + { + "epoch": 10.951876019575856, + "grad_norm": 0.008536286652088165, + "learning_rate": 0.0005042137956204639, + "loss": 0.0313, + "num_input_tokens_seen": 145004352, + "step": 67135 + }, + { + "epoch": 10.952691680261012, + "grad_norm": 0.004205097444355488, + "learning_rate": 0.0005041426182831233, + "loss": 0.0166, + "num_input_tokens_seen": 145014592, + "step": 67140 + }, + { + "epoch": 10.953507340946166, + "grad_norm": 0.001511257141828537, + "learning_rate": 0.0005040714408618275, + "loss": 0.0205, + "num_input_tokens_seen": 145025344, + "step": 67145 + }, + { + "epoch": 10.954323001631321, + "grad_norm": 0.3182823061943054, + "learning_rate": 0.0005040002633580188, + "loss": 0.0477, + "num_input_tokens_seen": 145037088, + "step": 67150 + }, + { + "epoch": 10.955138662316477, + "grad_norm": 0.027804870158433914, + "learning_rate": 0.0005039290857731395, + "loss": 0.006, + "num_input_tokens_seen": 145048544, + "step": 67155 + }, + { + "epoch": 10.955954323001631, + "grad_norm": 0.3454773724079132, + "learning_rate": 0.0005038579081086324, + "loss": 0.0143, + "num_input_tokens_seen": 145059584, + "step": 67160 + }, + { + "epoch": 10.956769983686787, + "grad_norm": 0.01337494421750307, + "learning_rate": 0.0005037867303659399, + "loss": 0.0096, + "num_input_tokens_seen": 145070880, + "step": 67165 + }, + { + "epoch": 10.95758564437194, + "grad_norm": 0.01701531931757927, + "learning_rate": 0.0005037155525465046, + "loss": 0.0746, + "num_input_tokens_seen": 145082208, + "step": 67170 + }, + { + "epoch": 10.958401305057096, + "grad_norm": 0.015620725229382515, + "learning_rate": 0.0005036443746517688, + "loss": 0.0104, + "num_input_tokens_seen": 145093024, + "step": 67175 + }, + { + "epoch": 10.959216965742252, + "grad_norm": 0.017922695726156235, + "learning_rate": 0.0005035731966831752, + "loss": 0.0036, + "num_input_tokens_seen": 145104064, + "step": 67180 + }, + { + "epoch": 10.960032626427406, + "grad_norm": 0.17914153635501862, + "learning_rate": 0.0005035020186421661, + "loss": 0.0944, + "num_input_tokens_seen": 145116064, + "step": 67185 + }, + { + "epoch": 10.960848287112562, + "grad_norm": 0.0031310205813497305, + "learning_rate": 0.0005034308405301842, + "loss": 0.0992, + "num_input_tokens_seen": 145126720, + "step": 67190 + }, + { + "epoch": 10.961663947797716, + "grad_norm": 0.27558255195617676, + "learning_rate": 0.0005033596623486719, + "loss": 0.0291, + "num_input_tokens_seen": 145137344, + "step": 67195 + }, + { + "epoch": 10.962479608482871, + "grad_norm": 0.02502385526895523, + "learning_rate": 0.0005032884840990719, + "loss": 0.0127, + "num_input_tokens_seen": 145147744, + "step": 67200 + }, + { + "epoch": 10.963295269168025, + "grad_norm": 0.15393461287021637, + "learning_rate": 0.0005032173057828265, + "loss": 0.0902, + "num_input_tokens_seen": 145158528, + "step": 67205 + }, + { + "epoch": 10.964110929853181, + "grad_norm": 0.006907598581165075, + "learning_rate": 0.0005031461274013784, + "loss": 0.0077, + "num_input_tokens_seen": 145168864, + "step": 67210 + }, + { + "epoch": 10.964926590538337, + "grad_norm": 0.1618753969669342, + "learning_rate": 0.0005030749489561701, + "loss": 0.0743, + "num_input_tokens_seen": 145180064, + "step": 67215 + }, + { + "epoch": 10.96574225122349, + "grad_norm": 0.4483173191547394, + "learning_rate": 0.000503003770448644, + "loss": 0.1097, + "num_input_tokens_seen": 145191040, + "step": 67220 + }, + { + "epoch": 10.966557911908646, + "grad_norm": 0.01324823871254921, + "learning_rate": 0.0005029325918802426, + "loss": 0.0271, + "num_input_tokens_seen": 145201376, + "step": 67225 + }, + { + "epoch": 10.9673735725938, + "grad_norm": 0.0016422256594523787, + "learning_rate": 0.0005028614132524085, + "loss": 0.0231, + "num_input_tokens_seen": 145210880, + "step": 67230 + }, + { + "epoch": 10.968189233278956, + "grad_norm": 0.008640460669994354, + "learning_rate": 0.0005027902345665843, + "loss": 0.014, + "num_input_tokens_seen": 145221536, + "step": 67235 + }, + { + "epoch": 10.969004893964112, + "grad_norm": 0.010546923615038395, + "learning_rate": 0.0005027190558242124, + "loss": 0.0045, + "num_input_tokens_seen": 145231424, + "step": 67240 + }, + { + "epoch": 10.969820554649266, + "grad_norm": 0.004773357417434454, + "learning_rate": 0.0005026478770267355, + "loss": 0.0331, + "num_input_tokens_seen": 145241792, + "step": 67245 + }, + { + "epoch": 10.970636215334421, + "grad_norm": 0.021990058943629265, + "learning_rate": 0.0005025766981755959, + "loss": 0.0278, + "num_input_tokens_seen": 145252928, + "step": 67250 + }, + { + "epoch": 10.971451876019575, + "grad_norm": 0.34234485030174255, + "learning_rate": 0.0005025055192722363, + "loss": 0.0443, + "num_input_tokens_seen": 145262656, + "step": 67255 + }, + { + "epoch": 10.97226753670473, + "grad_norm": 0.02041991800069809, + "learning_rate": 0.0005024343403180992, + "loss": 0.0067, + "num_input_tokens_seen": 145273824, + "step": 67260 + }, + { + "epoch": 10.973083197389887, + "grad_norm": 0.0017821387154981494, + "learning_rate": 0.0005023631613146272, + "loss": 0.0246, + "num_input_tokens_seen": 145284928, + "step": 67265 + }, + { + "epoch": 10.97389885807504, + "grad_norm": 0.0038782746996730566, + "learning_rate": 0.0005022919822632625, + "loss": 0.0664, + "num_input_tokens_seen": 145295552, + "step": 67270 + }, + { + "epoch": 10.974714518760196, + "grad_norm": 0.004829897079616785, + "learning_rate": 0.0005022208031654479, + "loss": 0.006, + "num_input_tokens_seen": 145306336, + "step": 67275 + }, + { + "epoch": 10.97553017944535, + "grad_norm": 0.21816429495811462, + "learning_rate": 0.0005021496240226261, + "loss": 0.0352, + "num_input_tokens_seen": 145317152, + "step": 67280 + }, + { + "epoch": 10.976345840130506, + "grad_norm": 0.8397039175033569, + "learning_rate": 0.0005020784448362393, + "loss": 0.0911, + "num_input_tokens_seen": 145327776, + "step": 67285 + }, + { + "epoch": 10.977161500815662, + "grad_norm": 0.018613159656524658, + "learning_rate": 0.0005020072656077302, + "loss": 0.1081, + "num_input_tokens_seen": 145337984, + "step": 67290 + }, + { + "epoch": 10.977977161500815, + "grad_norm": 0.01907249540090561, + "learning_rate": 0.0005019360863385413, + "loss": 0.0057, + "num_input_tokens_seen": 145348832, + "step": 67295 + }, + { + "epoch": 10.978792822185971, + "grad_norm": 0.0018857029499486089, + "learning_rate": 0.0005018649070301152, + "loss": 0.023, + "num_input_tokens_seen": 145360000, + "step": 67300 + }, + { + "epoch": 10.979608482871125, + "grad_norm": 0.13006794452667236, + "learning_rate": 0.0005017937276838943, + "loss": 0.0915, + "num_input_tokens_seen": 145371008, + "step": 67305 + }, + { + "epoch": 10.98042414355628, + "grad_norm": 0.02020275965332985, + "learning_rate": 0.0005017225483013212, + "loss": 0.1884, + "num_input_tokens_seen": 145382656, + "step": 67310 + }, + { + "epoch": 10.981239804241435, + "grad_norm": 0.01665751077234745, + "learning_rate": 0.0005016513688838387, + "loss": 0.0106, + "num_input_tokens_seen": 145394880, + "step": 67315 + }, + { + "epoch": 10.98205546492659, + "grad_norm": 0.02400999516248703, + "learning_rate": 0.0005015801894328889, + "loss": 0.0801, + "num_input_tokens_seen": 145406464, + "step": 67320 + }, + { + "epoch": 10.982871125611746, + "grad_norm": 0.023982934653759003, + "learning_rate": 0.0005015090099499147, + "loss": 0.015, + "num_input_tokens_seen": 145416960, + "step": 67325 + }, + { + "epoch": 10.9836867862969, + "grad_norm": 0.00537499226629734, + "learning_rate": 0.0005014378304363584, + "loss": 0.029, + "num_input_tokens_seen": 145427232, + "step": 67330 + }, + { + "epoch": 10.984502446982056, + "grad_norm": 0.002083304338157177, + "learning_rate": 0.0005013666508936627, + "loss": 0.0136, + "num_input_tokens_seen": 145438560, + "step": 67335 + }, + { + "epoch": 10.98531810766721, + "grad_norm": 0.0014621549053117633, + "learning_rate": 0.0005012954713232701, + "loss": 0.0048, + "num_input_tokens_seen": 145449888, + "step": 67340 + }, + { + "epoch": 10.986133768352365, + "grad_norm": 0.4816358685493469, + "learning_rate": 0.0005012242917266232, + "loss": 0.09, + "num_input_tokens_seen": 145461184, + "step": 67345 + }, + { + "epoch": 10.986949429037521, + "grad_norm": 0.0055021862499415874, + "learning_rate": 0.0005011531121051643, + "loss": 0.0575, + "num_input_tokens_seen": 145470944, + "step": 67350 + }, + { + "epoch": 10.987765089722675, + "grad_norm": 0.0026292535476386547, + "learning_rate": 0.0005010819324603363, + "loss": 0.0114, + "num_input_tokens_seen": 145481824, + "step": 67355 + }, + { + "epoch": 10.98858075040783, + "grad_norm": 0.1441703736782074, + "learning_rate": 0.0005010107527935815, + "loss": 0.0207, + "num_input_tokens_seen": 145492576, + "step": 67360 + }, + { + "epoch": 10.989396411092985, + "grad_norm": 0.005147392395883799, + "learning_rate": 0.0005009395731063424, + "loss": 0.0153, + "num_input_tokens_seen": 145503936, + "step": 67365 + }, + { + "epoch": 10.99021207177814, + "grad_norm": 0.004951298236846924, + "learning_rate": 0.0005008683934000618, + "loss": 0.0056, + "num_input_tokens_seen": 145514208, + "step": 67370 + }, + { + "epoch": 10.991027732463294, + "grad_norm": 0.05322100967168808, + "learning_rate": 0.000500797213676182, + "loss": 0.0455, + "num_input_tokens_seen": 145524064, + "step": 67375 + }, + { + "epoch": 10.99184339314845, + "grad_norm": 0.04236412048339844, + "learning_rate": 0.0005007260339361456, + "loss": 0.017, + "num_input_tokens_seen": 145534464, + "step": 67380 + }, + { + "epoch": 10.992659053833606, + "grad_norm": 0.03592411428689957, + "learning_rate": 0.0005006548541813953, + "loss": 0.2842, + "num_input_tokens_seen": 145544672, + "step": 67385 + }, + { + "epoch": 10.99347471451876, + "grad_norm": 0.3007633090019226, + "learning_rate": 0.0005005836744133736, + "loss": 0.1761, + "num_input_tokens_seen": 145555296, + "step": 67390 + }, + { + "epoch": 10.994290375203915, + "grad_norm": 0.017643166705965996, + "learning_rate": 0.0005005124946335229, + "loss": 0.0344, + "num_input_tokens_seen": 145565984, + "step": 67395 + }, + { + "epoch": 10.99510603588907, + "grad_norm": 0.005647346377372742, + "learning_rate": 0.0005004413148432859, + "loss": 0.0183, + "num_input_tokens_seen": 145577472, + "step": 67400 + }, + { + "epoch": 10.995921696574225, + "grad_norm": 0.04438630864024162, + "learning_rate": 0.000500370135044105, + "loss": 0.0147, + "num_input_tokens_seen": 145588096, + "step": 67405 + }, + { + "epoch": 10.99673735725938, + "grad_norm": 0.04898401349782944, + "learning_rate": 0.000500298955237423, + "loss": 0.0896, + "num_input_tokens_seen": 145598752, + "step": 67410 + }, + { + "epoch": 10.997553017944535, + "grad_norm": 0.03524085506796837, + "learning_rate": 0.0005002277754246822, + "loss": 0.0186, + "num_input_tokens_seen": 145608992, + "step": 67415 + }, + { + "epoch": 10.99836867862969, + "grad_norm": 0.0137935196980834, + "learning_rate": 0.0005001565956073252, + "loss": 0.0551, + "num_input_tokens_seen": 145620512, + "step": 67420 + }, + { + "epoch": 10.999184339314844, + "grad_norm": 0.19982655346393585, + "learning_rate": 0.0005000854157867947, + "loss": 0.0408, + "num_input_tokens_seen": 145631296, + "step": 67425 + }, + { + "epoch": 11.0, + "grad_norm": 0.014280433766543865, + "learning_rate": 0.0005000142359645331, + "loss": 0.2095, + "num_input_tokens_seen": 145641920, + "step": 67430 + }, + { + "epoch": 11.0, + "eval_loss": 0.16289934515953064, + "eval_runtime": 103.8025, + "eval_samples_per_second": 26.252, + "eval_steps_per_second": 6.57, + "num_input_tokens_seen": 145641920, + "step": 67430 + }, + { + "epoch": 11.000815660685156, + "grad_norm": 0.002235093619674444, + "learning_rate": 0.0004999430561419831, + "loss": 0.0902, + "num_input_tokens_seen": 145653984, + "step": 67435 + }, + { + "epoch": 11.00163132137031, + "grad_norm": 0.165186807513237, + "learning_rate": 0.000499871876320587, + "loss": 0.0201, + "num_input_tokens_seen": 145664832, + "step": 67440 + }, + { + "epoch": 11.002446982055465, + "grad_norm": 0.03600761294364929, + "learning_rate": 0.0004998006965017876, + "loss": 0.1174, + "num_input_tokens_seen": 145674848, + "step": 67445 + }, + { + "epoch": 11.00326264274062, + "grad_norm": 0.004147016908973455, + "learning_rate": 0.0004997295166870271, + "loss": 0.0087, + "num_input_tokens_seen": 145685760, + "step": 67450 + }, + { + "epoch": 11.004078303425775, + "grad_norm": 0.05979970097541809, + "learning_rate": 0.0004996583368777484, + "loss": 0.0296, + "num_input_tokens_seen": 145697088, + "step": 67455 + }, + { + "epoch": 11.00489396411093, + "grad_norm": 0.01066543161869049, + "learning_rate": 0.000499587157075394, + "loss": 0.0168, + "num_input_tokens_seen": 145708704, + "step": 67460 + }, + { + "epoch": 11.005709624796085, + "grad_norm": 0.01953071542084217, + "learning_rate": 0.0004995159772814063, + "loss": 0.1, + "num_input_tokens_seen": 145720384, + "step": 67465 + }, + { + "epoch": 11.00652528548124, + "grad_norm": 0.03782833367586136, + "learning_rate": 0.0004994447974972281, + "loss": 0.0067, + "num_input_tokens_seen": 145732640, + "step": 67470 + }, + { + "epoch": 11.007340946166394, + "grad_norm": 0.11350507289171219, + "learning_rate": 0.0004993736177243016, + "loss": 0.0151, + "num_input_tokens_seen": 145744448, + "step": 67475 + }, + { + "epoch": 11.00815660685155, + "grad_norm": 0.23200421035289764, + "learning_rate": 0.0004993024379640697, + "loss": 0.0287, + "num_input_tokens_seen": 145755584, + "step": 67480 + }, + { + "epoch": 11.008972267536704, + "grad_norm": 0.003650764236226678, + "learning_rate": 0.0004992312582179746, + "loss": 0.0155, + "num_input_tokens_seen": 145766720, + "step": 67485 + }, + { + "epoch": 11.00978792822186, + "grad_norm": 0.02220398746430874, + "learning_rate": 0.0004991600784874593, + "loss": 0.0628, + "num_input_tokens_seen": 145778368, + "step": 67490 + }, + { + "epoch": 11.010603588907015, + "grad_norm": 0.131536066532135, + "learning_rate": 0.0004990888987739657, + "loss": 0.0104, + "num_input_tokens_seen": 145790144, + "step": 67495 + }, + { + "epoch": 11.01141924959217, + "grad_norm": 0.04819709062576294, + "learning_rate": 0.0004990177190789371, + "loss": 0.1136, + "num_input_tokens_seen": 145800352, + "step": 67500 + }, + { + "epoch": 11.012234910277325, + "grad_norm": 0.054440777748823166, + "learning_rate": 0.0004989465394038153, + "loss": 0.0212, + "num_input_tokens_seen": 145811776, + "step": 67505 + }, + { + "epoch": 11.013050570962479, + "grad_norm": 0.008946564979851246, + "learning_rate": 0.0004988753597500435, + "loss": 0.0105, + "num_input_tokens_seen": 145821376, + "step": 67510 + }, + { + "epoch": 11.013866231647635, + "grad_norm": 0.33636924624443054, + "learning_rate": 0.0004988041801190638, + "loss": 0.1091, + "num_input_tokens_seen": 145831328, + "step": 67515 + }, + { + "epoch": 11.01468189233279, + "grad_norm": 0.04350544139742851, + "learning_rate": 0.000498733000512319, + "loss": 0.0268, + "num_input_tokens_seen": 145842208, + "step": 67520 + }, + { + "epoch": 11.015497553017944, + "grad_norm": 0.025713231414556503, + "learning_rate": 0.0004986618209312515, + "loss": 0.1589, + "num_input_tokens_seen": 145851776, + "step": 67525 + }, + { + "epoch": 11.0163132137031, + "grad_norm": 0.0037758410908281803, + "learning_rate": 0.000498590641377304, + "loss": 0.0148, + "num_input_tokens_seen": 145862432, + "step": 67530 + }, + { + "epoch": 11.017128874388254, + "grad_norm": 0.1802942007780075, + "learning_rate": 0.0004985194618519188, + "loss": 0.0294, + "num_input_tokens_seen": 145873056, + "step": 67535 + }, + { + "epoch": 11.01794453507341, + "grad_norm": 0.10657081007957458, + "learning_rate": 0.0004984482823565386, + "loss": 0.0239, + "num_input_tokens_seen": 145884384, + "step": 67540 + }, + { + "epoch": 11.018760195758565, + "grad_norm": 0.02635457180440426, + "learning_rate": 0.0004983771028926059, + "loss": 0.0267, + "num_input_tokens_seen": 145895136, + "step": 67545 + }, + { + "epoch": 11.01957585644372, + "grad_norm": 0.0033126375637948513, + "learning_rate": 0.0004983059234615635, + "loss": 0.0207, + "num_input_tokens_seen": 145905984, + "step": 67550 + }, + { + "epoch": 11.020391517128875, + "grad_norm": 0.005782026797533035, + "learning_rate": 0.0004982347440648534, + "loss": 0.033, + "num_input_tokens_seen": 145916832, + "step": 67555 + }, + { + "epoch": 11.021207177814029, + "grad_norm": 0.016137108206748962, + "learning_rate": 0.0004981635647039186, + "loss": 0.0217, + "num_input_tokens_seen": 145927360, + "step": 67560 + }, + { + "epoch": 11.022022838499185, + "grad_norm": 0.3549785017967224, + "learning_rate": 0.0004980923853802015, + "loss": 0.0284, + "num_input_tokens_seen": 145936704, + "step": 67565 + }, + { + "epoch": 11.022838499184338, + "grad_norm": 0.23480737209320068, + "learning_rate": 0.0004980212060951447, + "loss": 0.1713, + "num_input_tokens_seen": 145948096, + "step": 67570 + }, + { + "epoch": 11.023654159869494, + "grad_norm": 0.4012732207775116, + "learning_rate": 0.0004979500268501905, + "loss": 0.0394, + "num_input_tokens_seen": 145958112, + "step": 67575 + }, + { + "epoch": 11.02446982055465, + "grad_norm": 0.00960595440119505, + "learning_rate": 0.0004978788476467816, + "loss": 0.0082, + "num_input_tokens_seen": 145969536, + "step": 67580 + }, + { + "epoch": 11.025285481239804, + "grad_norm": 0.34898385405540466, + "learning_rate": 0.0004978076684863607, + "loss": 0.1035, + "num_input_tokens_seen": 145981056, + "step": 67585 + }, + { + "epoch": 11.02610114192496, + "grad_norm": 0.34688490629196167, + "learning_rate": 0.0004977364893703701, + "loss": 0.0577, + "num_input_tokens_seen": 145991808, + "step": 67590 + }, + { + "epoch": 11.026916802610113, + "grad_norm": 0.1314949095249176, + "learning_rate": 0.0004976653103002526, + "loss": 0.0696, + "num_input_tokens_seen": 146003424, + "step": 67595 + }, + { + "epoch": 11.02773246329527, + "grad_norm": 0.024752607569098473, + "learning_rate": 0.0004975941312774502, + "loss": 0.1167, + "num_input_tokens_seen": 146014304, + "step": 67600 + }, + { + "epoch": 11.028548123980425, + "grad_norm": 0.012540026567876339, + "learning_rate": 0.0004975229523034061, + "loss": 0.0097, + "num_input_tokens_seen": 146025728, + "step": 67605 + }, + { + "epoch": 11.029363784665579, + "grad_norm": 0.01599235273897648, + "learning_rate": 0.0004974517733795623, + "loss": 0.0398, + "num_input_tokens_seen": 146037536, + "step": 67610 + }, + { + "epoch": 11.030179445350734, + "grad_norm": 0.018973039463162422, + "learning_rate": 0.0004973805945073617, + "loss": 0.0209, + "num_input_tokens_seen": 146048544, + "step": 67615 + }, + { + "epoch": 11.030995106035888, + "grad_norm": 0.016196317970752716, + "learning_rate": 0.0004973094156882466, + "loss": 0.009, + "num_input_tokens_seen": 146058784, + "step": 67620 + }, + { + "epoch": 11.031810766721044, + "grad_norm": 0.015408056788146496, + "learning_rate": 0.0004972382369236596, + "loss": 0.0159, + "num_input_tokens_seen": 146068416, + "step": 67625 + }, + { + "epoch": 11.0326264274062, + "grad_norm": 0.20243647694587708, + "learning_rate": 0.0004971670582150431, + "loss": 0.0211, + "num_input_tokens_seen": 146077664, + "step": 67630 + }, + { + "epoch": 11.033442088091354, + "grad_norm": 1.3160874843597412, + "learning_rate": 0.0004970958795638401, + "loss": 0.0713, + "num_input_tokens_seen": 146088320, + "step": 67635 + }, + { + "epoch": 11.03425774877651, + "grad_norm": 0.006498668342828751, + "learning_rate": 0.0004970247009714924, + "loss": 0.0051, + "num_input_tokens_seen": 146099424, + "step": 67640 + }, + { + "epoch": 11.035073409461663, + "grad_norm": 0.00784413330256939, + "learning_rate": 0.0004969535224394432, + "loss": 0.0024, + "num_input_tokens_seen": 146110112, + "step": 67645 + }, + { + "epoch": 11.035889070146819, + "grad_norm": 0.12007040530443192, + "learning_rate": 0.0004968823439691346, + "loss": 0.0142, + "num_input_tokens_seen": 146120224, + "step": 67650 + }, + { + "epoch": 11.036704730831975, + "grad_norm": 0.07254232466220856, + "learning_rate": 0.0004968111655620093, + "loss": 0.0263, + "num_input_tokens_seen": 146131328, + "step": 67655 + }, + { + "epoch": 11.037520391517129, + "grad_norm": 0.027044525370001793, + "learning_rate": 0.0004967399872195096, + "loss": 0.0067, + "num_input_tokens_seen": 146142144, + "step": 67660 + }, + { + "epoch": 11.038336052202284, + "grad_norm": 0.013522460125386715, + "learning_rate": 0.0004966688089430785, + "loss": 0.0919, + "num_input_tokens_seen": 146153696, + "step": 67665 + }, + { + "epoch": 11.039151712887438, + "grad_norm": 0.004441166762262583, + "learning_rate": 0.000496597630734158, + "loss": 0.0076, + "num_input_tokens_seen": 146164608, + "step": 67670 + }, + { + "epoch": 11.039967373572594, + "grad_norm": 0.04167890548706055, + "learning_rate": 0.0004965264525941908, + "loss": 0.0221, + "num_input_tokens_seen": 146175616, + "step": 67675 + }, + { + "epoch": 11.040783034257748, + "grad_norm": 0.3512006103992462, + "learning_rate": 0.0004964552745246196, + "loss": 0.0122, + "num_input_tokens_seen": 146187104, + "step": 67680 + }, + { + "epoch": 11.041598694942904, + "grad_norm": 0.007863182574510574, + "learning_rate": 0.0004963840965268866, + "loss": 0.0075, + "num_input_tokens_seen": 146196448, + "step": 67685 + }, + { + "epoch": 11.04241435562806, + "grad_norm": 0.01281198114156723, + "learning_rate": 0.0004963129186024346, + "loss": 0.0089, + "num_input_tokens_seen": 146206720, + "step": 67690 + }, + { + "epoch": 11.043230016313213, + "grad_norm": 0.42489489912986755, + "learning_rate": 0.0004962417407527059, + "loss": 0.1801, + "num_input_tokens_seen": 146215488, + "step": 67695 + }, + { + "epoch": 11.044045676998369, + "grad_norm": 0.02685629017651081, + "learning_rate": 0.0004961705629791431, + "loss": 0.0676, + "num_input_tokens_seen": 146226400, + "step": 67700 + }, + { + "epoch": 11.044861337683523, + "grad_norm": 0.6352334022521973, + "learning_rate": 0.0004960993852831888, + "loss": 0.0828, + "num_input_tokens_seen": 146237632, + "step": 67705 + }, + { + "epoch": 11.045676998368679, + "grad_norm": 0.003587668761610985, + "learning_rate": 0.0004960282076662853, + "loss": 0.0196, + "num_input_tokens_seen": 146248128, + "step": 67710 + }, + { + "epoch": 11.046492659053834, + "grad_norm": 0.006983236409723759, + "learning_rate": 0.0004959570301298752, + "loss": 0.0135, + "num_input_tokens_seen": 146258528, + "step": 67715 + }, + { + "epoch": 11.047308319738988, + "grad_norm": 0.01367577537894249, + "learning_rate": 0.0004958858526754012, + "loss": 0.0054, + "num_input_tokens_seen": 146268704, + "step": 67720 + }, + { + "epoch": 11.048123980424144, + "grad_norm": 0.012268884107470512, + "learning_rate": 0.0004958146753043053, + "loss": 0.0081, + "num_input_tokens_seen": 146279136, + "step": 67725 + }, + { + "epoch": 11.048939641109298, + "grad_norm": 0.045778777450323105, + "learning_rate": 0.0004957434980180307, + "loss": 0.0213, + "num_input_tokens_seen": 146288256, + "step": 67730 + }, + { + "epoch": 11.049755301794454, + "grad_norm": 0.07384860515594482, + "learning_rate": 0.0004956723208180191, + "loss": 0.0076, + "num_input_tokens_seen": 146298272, + "step": 67735 + }, + { + "epoch": 11.05057096247961, + "grad_norm": 0.12969577312469482, + "learning_rate": 0.0004956011437057138, + "loss": 0.0134, + "num_input_tokens_seen": 146309600, + "step": 67740 + }, + { + "epoch": 11.051386623164763, + "grad_norm": 0.013329303823411465, + "learning_rate": 0.0004955299666825566, + "loss": 0.0342, + "num_input_tokens_seen": 146320320, + "step": 67745 + }, + { + "epoch": 11.052202283849919, + "grad_norm": 0.02987365610897541, + "learning_rate": 0.0004954587897499905, + "loss": 0.0054, + "num_input_tokens_seen": 146331296, + "step": 67750 + }, + { + "epoch": 11.053017944535073, + "grad_norm": 0.003352835774421692, + "learning_rate": 0.0004953876129094576, + "loss": 0.0039, + "num_input_tokens_seen": 146342016, + "step": 67755 + }, + { + "epoch": 11.053833605220229, + "grad_norm": 0.008115014061331749, + "learning_rate": 0.0004953164361624008, + "loss": 0.0199, + "num_input_tokens_seen": 146353184, + "step": 67760 + }, + { + "epoch": 11.054649265905383, + "grad_norm": 0.009867136366665363, + "learning_rate": 0.0004952452595102621, + "loss": 0.0175, + "num_input_tokens_seen": 146364160, + "step": 67765 + }, + { + "epoch": 11.055464926590538, + "grad_norm": 0.003682214766740799, + "learning_rate": 0.0004951740829544846, + "loss": 0.019, + "num_input_tokens_seen": 146373760, + "step": 67770 + }, + { + "epoch": 11.056280587275694, + "grad_norm": 0.05358272045850754, + "learning_rate": 0.00049510290649651, + "loss": 0.007, + "num_input_tokens_seen": 146384928, + "step": 67775 + }, + { + "epoch": 11.057096247960848, + "grad_norm": 0.23653176426887512, + "learning_rate": 0.0004950317301377813, + "loss": 0.0563, + "num_input_tokens_seen": 146396192, + "step": 67780 + }, + { + "epoch": 11.057911908646004, + "grad_norm": 0.3197796642780304, + "learning_rate": 0.0004949605538797412, + "loss": 0.1492, + "num_input_tokens_seen": 146407680, + "step": 67785 + }, + { + "epoch": 11.058727569331158, + "grad_norm": 0.31645524501800537, + "learning_rate": 0.0004948893777238316, + "loss": 0.0944, + "num_input_tokens_seen": 146417632, + "step": 67790 + }, + { + "epoch": 11.059543230016313, + "grad_norm": 0.04331495612859726, + "learning_rate": 0.0004948182016714954, + "loss": 0.157, + "num_input_tokens_seen": 146428736, + "step": 67795 + }, + { + "epoch": 11.060358890701469, + "grad_norm": 0.007367865182459354, + "learning_rate": 0.0004947470257241748, + "loss": 0.0024, + "num_input_tokens_seen": 146439456, + "step": 67800 + }, + { + "epoch": 11.061174551386623, + "grad_norm": 0.05738005042076111, + "learning_rate": 0.0004946758498833125, + "loss": 0.0079, + "num_input_tokens_seen": 146450176, + "step": 67805 + }, + { + "epoch": 11.061990212071779, + "grad_norm": 0.4973194897174835, + "learning_rate": 0.0004946046741503507, + "loss": 0.0327, + "num_input_tokens_seen": 146461376, + "step": 67810 + }, + { + "epoch": 11.062805872756933, + "grad_norm": 0.009904002770781517, + "learning_rate": 0.0004945334985267323, + "loss": 0.0211, + "num_input_tokens_seen": 146473504, + "step": 67815 + }, + { + "epoch": 11.063621533442088, + "grad_norm": 0.7115747928619385, + "learning_rate": 0.0004944623230138991, + "loss": 0.0453, + "num_input_tokens_seen": 146484448, + "step": 67820 + }, + { + "epoch": 11.064437194127244, + "grad_norm": 0.006754287518560886, + "learning_rate": 0.0004943911476132943, + "loss": 0.0082, + "num_input_tokens_seen": 146495200, + "step": 67825 + }, + { + "epoch": 11.065252854812398, + "grad_norm": 0.07527286559343338, + "learning_rate": 0.0004943199723263597, + "loss": 0.0103, + "num_input_tokens_seen": 146505824, + "step": 67830 + }, + { + "epoch": 11.066068515497554, + "grad_norm": 0.022538485005497932, + "learning_rate": 0.0004942487971545383, + "loss": 0.0142, + "num_input_tokens_seen": 146515168, + "step": 67835 + }, + { + "epoch": 11.066884176182707, + "grad_norm": 0.23096723854541779, + "learning_rate": 0.0004941776220992722, + "loss": 0.2208, + "num_input_tokens_seen": 146525056, + "step": 67840 + }, + { + "epoch": 11.067699836867863, + "grad_norm": 0.017740461975336075, + "learning_rate": 0.0004941064471620041, + "loss": 0.1202, + "num_input_tokens_seen": 146535136, + "step": 67845 + }, + { + "epoch": 11.068515497553017, + "grad_norm": 0.17247579991817474, + "learning_rate": 0.0004940352723441763, + "loss": 0.0237, + "num_input_tokens_seen": 146545920, + "step": 67850 + }, + { + "epoch": 11.069331158238173, + "grad_norm": 0.04620293900370598, + "learning_rate": 0.0004939640976472311, + "loss": 0.0333, + "num_input_tokens_seen": 146557120, + "step": 67855 + }, + { + "epoch": 11.070146818923329, + "grad_norm": 0.015531661920249462, + "learning_rate": 0.0004938929230726111, + "loss": 0.0095, + "num_input_tokens_seen": 146568576, + "step": 67860 + }, + { + "epoch": 11.070962479608482, + "grad_norm": 0.22680914402008057, + "learning_rate": 0.0004938217486217591, + "loss": 0.0081, + "num_input_tokens_seen": 146579424, + "step": 67865 + }, + { + "epoch": 11.071778140293638, + "grad_norm": 0.0019467025995254517, + "learning_rate": 0.0004937505742961169, + "loss": 0.0032, + "num_input_tokens_seen": 146590560, + "step": 67870 + }, + { + "epoch": 11.072593800978792, + "grad_norm": 0.0013207707088440657, + "learning_rate": 0.0004936794000971274, + "loss": 0.008, + "num_input_tokens_seen": 146600768, + "step": 67875 + }, + { + "epoch": 11.073409461663948, + "grad_norm": 0.0049352445639669895, + "learning_rate": 0.0004936082260262328, + "loss": 0.0946, + "num_input_tokens_seen": 146610880, + "step": 67880 + }, + { + "epoch": 11.074225122349104, + "grad_norm": 0.14016573131084442, + "learning_rate": 0.0004935370520848755, + "loss": 0.0111, + "num_input_tokens_seen": 146621792, + "step": 67885 + }, + { + "epoch": 11.075040783034257, + "grad_norm": 0.01653108559548855, + "learning_rate": 0.0004934658782744983, + "loss": 0.0302, + "num_input_tokens_seen": 146632448, + "step": 67890 + }, + { + "epoch": 11.075856443719413, + "grad_norm": 0.002239247551187873, + "learning_rate": 0.0004933947045965431, + "loss": 0.0057, + "num_input_tokens_seen": 146643456, + "step": 67895 + }, + { + "epoch": 11.076672104404567, + "grad_norm": 0.013882400467991829, + "learning_rate": 0.0004933235310524528, + "loss": 0.0566, + "num_input_tokens_seen": 146654176, + "step": 67900 + }, + { + "epoch": 11.077487765089723, + "grad_norm": 0.32492849230766296, + "learning_rate": 0.0004932523576436695, + "loss": 0.0132, + "num_input_tokens_seen": 146665984, + "step": 67905 + }, + { + "epoch": 11.078303425774878, + "grad_norm": 0.006665470078587532, + "learning_rate": 0.0004931811843716358, + "loss": 0.116, + "num_input_tokens_seen": 146677440, + "step": 67910 + }, + { + "epoch": 11.079119086460032, + "grad_norm": 0.010101187974214554, + "learning_rate": 0.000493110011237794, + "loss": 0.0033, + "num_input_tokens_seen": 146688416, + "step": 67915 + }, + { + "epoch": 11.079934747145188, + "grad_norm": 0.00362041755579412, + "learning_rate": 0.0004930388382435866, + "loss": 0.009, + "num_input_tokens_seen": 146699360, + "step": 67920 + }, + { + "epoch": 11.080750407830342, + "grad_norm": 0.015067537315189838, + "learning_rate": 0.0004929676653904558, + "loss": 0.009, + "num_input_tokens_seen": 146709056, + "step": 67925 + }, + { + "epoch": 11.081566068515498, + "grad_norm": 0.010858445428311825, + "learning_rate": 0.0004928964926798445, + "loss": 0.0066, + "num_input_tokens_seen": 146718752, + "step": 67930 + }, + { + "epoch": 11.082381729200652, + "grad_norm": 0.011564402841031551, + "learning_rate": 0.0004928253201131945, + "loss": 0.0332, + "num_input_tokens_seen": 146729920, + "step": 67935 + }, + { + "epoch": 11.083197389885807, + "grad_norm": 0.0970597043633461, + "learning_rate": 0.0004927541476919487, + "loss": 0.0534, + "num_input_tokens_seen": 146740064, + "step": 67940 + }, + { + "epoch": 11.084013050570963, + "grad_norm": 0.35658419132232666, + "learning_rate": 0.0004926829754175492, + "loss": 0.0337, + "num_input_tokens_seen": 146750560, + "step": 67945 + }, + { + "epoch": 11.084828711256117, + "grad_norm": 0.3965001702308655, + "learning_rate": 0.0004926118032914385, + "loss": 0.1152, + "num_input_tokens_seen": 146760960, + "step": 67950 + }, + { + "epoch": 11.085644371941273, + "grad_norm": 0.02664480172097683, + "learning_rate": 0.0004925406313150589, + "loss": 0.1071, + "num_input_tokens_seen": 146772864, + "step": 67955 + }, + { + "epoch": 11.086460032626427, + "grad_norm": 0.10795983672142029, + "learning_rate": 0.000492469459489853, + "loss": 0.0145, + "num_input_tokens_seen": 146783968, + "step": 67960 + }, + { + "epoch": 11.087275693311582, + "grad_norm": 0.7373493909835815, + "learning_rate": 0.0004923982878172629, + "loss": 0.0582, + "num_input_tokens_seen": 146795808, + "step": 67965 + }, + { + "epoch": 11.088091353996738, + "grad_norm": 0.13917656242847443, + "learning_rate": 0.0004923271162987314, + "loss": 0.0078, + "num_input_tokens_seen": 146807104, + "step": 67970 + }, + { + "epoch": 11.088907014681892, + "grad_norm": 0.0017775179585441947, + "learning_rate": 0.0004922559449357003, + "loss": 0.0355, + "num_input_tokens_seen": 146817216, + "step": 67975 + }, + { + "epoch": 11.089722675367048, + "grad_norm": 0.003343122312799096, + "learning_rate": 0.0004921847737296125, + "loss": 0.0041, + "num_input_tokens_seen": 146827776, + "step": 67980 + }, + { + "epoch": 11.090538336052202, + "grad_norm": 0.3145506978034973, + "learning_rate": 0.0004921136026819101, + "loss": 0.0563, + "num_input_tokens_seen": 146839712, + "step": 67985 + }, + { + "epoch": 11.091353996737357, + "grad_norm": 0.01094027329236269, + "learning_rate": 0.0004920424317940355, + "loss": 0.0105, + "num_input_tokens_seen": 146850592, + "step": 67990 + }, + { + "epoch": 11.092169657422513, + "grad_norm": 0.007395964581519365, + "learning_rate": 0.0004919712610674312, + "loss": 0.0065, + "num_input_tokens_seen": 146862528, + "step": 67995 + }, + { + "epoch": 11.092985318107667, + "grad_norm": 0.038671478629112244, + "learning_rate": 0.0004919000905035394, + "loss": 0.0621, + "num_input_tokens_seen": 146871488, + "step": 68000 + }, + { + "epoch": 11.093800978792823, + "grad_norm": 0.0066762445494532585, + "learning_rate": 0.0004918289201038026, + "loss": 0.0707, + "num_input_tokens_seen": 146882496, + "step": 68005 + }, + { + "epoch": 11.094616639477977, + "grad_norm": 0.02582985907793045, + "learning_rate": 0.0004917577498696631, + "loss": 0.0111, + "num_input_tokens_seen": 146893280, + "step": 68010 + }, + { + "epoch": 11.095432300163132, + "grad_norm": 0.6989096403121948, + "learning_rate": 0.0004916865798025634, + "loss": 0.0422, + "num_input_tokens_seen": 146903008, + "step": 68015 + }, + { + "epoch": 11.096247960848286, + "grad_norm": 0.14155010879039764, + "learning_rate": 0.0004916154099039455, + "loss": 0.1259, + "num_input_tokens_seen": 146913472, + "step": 68020 + }, + { + "epoch": 11.097063621533442, + "grad_norm": 0.01753934472799301, + "learning_rate": 0.000491544240175252, + "loss": 0.0104, + "num_input_tokens_seen": 146925440, + "step": 68025 + }, + { + "epoch": 11.097879282218598, + "grad_norm": 0.01112120971083641, + "learning_rate": 0.0004914730706179251, + "loss": 0.0393, + "num_input_tokens_seen": 146935360, + "step": 68030 + }, + { + "epoch": 11.098694942903752, + "grad_norm": 0.07331804931163788, + "learning_rate": 0.0004914019012334075, + "loss": 0.0092, + "num_input_tokens_seen": 146945600, + "step": 68035 + }, + { + "epoch": 11.099510603588907, + "grad_norm": 0.07903977483510971, + "learning_rate": 0.000491330732023141, + "loss": 0.1622, + "num_input_tokens_seen": 146956928, + "step": 68040 + }, + { + "epoch": 11.100326264274061, + "grad_norm": 0.0027705691754817963, + "learning_rate": 0.0004912595629885685, + "loss": 0.0208, + "num_input_tokens_seen": 146967680, + "step": 68045 + }, + { + "epoch": 11.101141924959217, + "grad_norm": 0.0015439860289916396, + "learning_rate": 0.0004911883941311319, + "loss": 0.1458, + "num_input_tokens_seen": 146978656, + "step": 68050 + }, + { + "epoch": 11.101957585644373, + "grad_norm": 0.0027622964698821306, + "learning_rate": 0.0004911172254522737, + "loss": 0.0245, + "num_input_tokens_seen": 146989408, + "step": 68055 + }, + { + "epoch": 11.102773246329527, + "grad_norm": 0.013207031413912773, + "learning_rate": 0.0004910460569534361, + "loss": 0.0085, + "num_input_tokens_seen": 147000288, + "step": 68060 + }, + { + "epoch": 11.103588907014682, + "grad_norm": 0.010163257829844952, + "learning_rate": 0.0004909748886360617, + "loss": 0.0079, + "num_input_tokens_seen": 147011008, + "step": 68065 + }, + { + "epoch": 11.104404567699836, + "grad_norm": 0.005293056834489107, + "learning_rate": 0.0004909037205015924, + "loss": 0.0085, + "num_input_tokens_seen": 147022272, + "step": 68070 + }, + { + "epoch": 11.105220228384992, + "grad_norm": 0.03544028103351593, + "learning_rate": 0.000490832552551471, + "loss": 0.0416, + "num_input_tokens_seen": 147032864, + "step": 68075 + }, + { + "epoch": 11.106035889070148, + "grad_norm": 0.01018522959202528, + "learning_rate": 0.0004907613847871393, + "loss": 0.071, + "num_input_tokens_seen": 147042784, + "step": 68080 + }, + { + "epoch": 11.106851549755302, + "grad_norm": 0.18979839980602264, + "learning_rate": 0.00049069021721004, + "loss": 0.0272, + "num_input_tokens_seen": 147052928, + "step": 68085 + }, + { + "epoch": 11.107667210440457, + "grad_norm": 0.002638269681483507, + "learning_rate": 0.0004906190498216151, + "loss": 0.0137, + "num_input_tokens_seen": 147063808, + "step": 68090 + }, + { + "epoch": 11.108482871125611, + "grad_norm": 0.0040941014885902405, + "learning_rate": 0.0004905478826233072, + "loss": 0.0102, + "num_input_tokens_seen": 147074240, + "step": 68095 + }, + { + "epoch": 11.109298531810767, + "grad_norm": 0.016239026561379433, + "learning_rate": 0.0004904767156165585, + "loss": 0.0101, + "num_input_tokens_seen": 147084736, + "step": 68100 + }, + { + "epoch": 11.11011419249592, + "grad_norm": 0.251310259103775, + "learning_rate": 0.000490405548802811, + "loss": 0.1487, + "num_input_tokens_seen": 147097216, + "step": 68105 + }, + { + "epoch": 11.110929853181077, + "grad_norm": 0.01817292720079422, + "learning_rate": 0.0004903343821835075, + "loss": 0.0043, + "num_input_tokens_seen": 147107264, + "step": 68110 + }, + { + "epoch": 11.111745513866232, + "grad_norm": 0.0064503224566578865, + "learning_rate": 0.0004902632157600898, + "loss": 0.0141, + "num_input_tokens_seen": 147117440, + "step": 68115 + }, + { + "epoch": 11.112561174551386, + "grad_norm": 0.003892709966748953, + "learning_rate": 0.0004901920495340007, + "loss": 0.026, + "num_input_tokens_seen": 147128960, + "step": 68120 + }, + { + "epoch": 11.113376835236542, + "grad_norm": 0.013186760246753693, + "learning_rate": 0.0004901208835066818, + "loss": 0.0471, + "num_input_tokens_seen": 147139584, + "step": 68125 + }, + { + "epoch": 11.114192495921696, + "grad_norm": 0.0026877010241150856, + "learning_rate": 0.0004900497176795759, + "loss": 0.0016, + "num_input_tokens_seen": 147150432, + "step": 68130 + }, + { + "epoch": 11.115008156606851, + "grad_norm": 0.8579369187355042, + "learning_rate": 0.000489978552054125, + "loss": 0.1093, + "num_input_tokens_seen": 147160928, + "step": 68135 + }, + { + "epoch": 11.115823817292007, + "grad_norm": 0.0050546894781291485, + "learning_rate": 0.0004899073866317717, + "loss": 0.11, + "num_input_tokens_seen": 147171520, + "step": 68140 + }, + { + "epoch": 11.116639477977161, + "grad_norm": 0.11194411665201187, + "learning_rate": 0.0004898362214139577, + "loss": 0.0052, + "num_input_tokens_seen": 147182848, + "step": 68145 + }, + { + "epoch": 11.117455138662317, + "grad_norm": 0.031917814165353775, + "learning_rate": 0.0004897650564021257, + "loss": 0.013, + "num_input_tokens_seen": 147193248, + "step": 68150 + }, + { + "epoch": 11.11827079934747, + "grad_norm": 0.009770890697836876, + "learning_rate": 0.0004896938915977178, + "loss": 0.1756, + "num_input_tokens_seen": 147203744, + "step": 68155 + }, + { + "epoch": 11.119086460032626, + "grad_norm": 0.0032027806155383587, + "learning_rate": 0.0004896227270021763, + "loss": 0.0026, + "num_input_tokens_seen": 147215072, + "step": 68160 + }, + { + "epoch": 11.119902120717782, + "grad_norm": 0.042442020028829575, + "learning_rate": 0.0004895515626169433, + "loss": 0.0074, + "num_input_tokens_seen": 147225696, + "step": 68165 + }, + { + "epoch": 11.120717781402936, + "grad_norm": 0.06179466471076012, + "learning_rate": 0.0004894803984434613, + "loss": 0.0186, + "num_input_tokens_seen": 147236992, + "step": 68170 + }, + { + "epoch": 11.121533442088092, + "grad_norm": 0.02509087324142456, + "learning_rate": 0.0004894092344831722, + "loss": 0.023, + "num_input_tokens_seen": 147248416, + "step": 68175 + }, + { + "epoch": 11.122349102773246, + "grad_norm": 0.02915806882083416, + "learning_rate": 0.0004893380707375186, + "loss": 0.0308, + "num_input_tokens_seen": 147257984, + "step": 68180 + }, + { + "epoch": 11.123164763458401, + "grad_norm": 0.43293923139572144, + "learning_rate": 0.0004892669072079423, + "loss": 0.0323, + "num_input_tokens_seen": 147268480, + "step": 68185 + }, + { + "epoch": 11.123980424143557, + "grad_norm": 0.08693939447402954, + "learning_rate": 0.000489195743895886, + "loss": 0.0516, + "num_input_tokens_seen": 147279776, + "step": 68190 + }, + { + "epoch": 11.124796084828711, + "grad_norm": 0.0011673959670588374, + "learning_rate": 0.0004891245808027913, + "loss": 0.012, + "num_input_tokens_seen": 147290336, + "step": 68195 + }, + { + "epoch": 11.125611745513867, + "grad_norm": 0.3298425078392029, + "learning_rate": 0.0004890534179301009, + "loss": 0.0769, + "num_input_tokens_seen": 147300864, + "step": 68200 + }, + { + "epoch": 11.12642740619902, + "grad_norm": 0.3644849956035614, + "learning_rate": 0.0004889822552792572, + "loss": 0.0312, + "num_input_tokens_seen": 147310912, + "step": 68205 + }, + { + "epoch": 11.127243066884176, + "grad_norm": 0.004404593259096146, + "learning_rate": 0.0004889110928517016, + "loss": 0.0218, + "num_input_tokens_seen": 147321344, + "step": 68210 + }, + { + "epoch": 11.12805872756933, + "grad_norm": 0.21322403848171234, + "learning_rate": 0.0004888399306488771, + "loss": 0.0253, + "num_input_tokens_seen": 147332448, + "step": 68215 + }, + { + "epoch": 11.128874388254486, + "grad_norm": 0.002350582042708993, + "learning_rate": 0.0004887687686722254, + "loss": 0.003, + "num_input_tokens_seen": 147344192, + "step": 68220 + }, + { + "epoch": 11.129690048939642, + "grad_norm": 0.010090678930282593, + "learning_rate": 0.000488697606923189, + "loss": 0.1477, + "num_input_tokens_seen": 147355104, + "step": 68225 + }, + { + "epoch": 11.130505709624796, + "grad_norm": 0.0023009213618934155, + "learning_rate": 0.0004886264454032097, + "loss": 0.0098, + "num_input_tokens_seen": 147364608, + "step": 68230 + }, + { + "epoch": 11.131321370309951, + "grad_norm": 0.18410499393939972, + "learning_rate": 0.0004885552841137302, + "loss": 0.0118, + "num_input_tokens_seen": 147375296, + "step": 68235 + }, + { + "epoch": 11.132137030995105, + "grad_norm": 0.002545412862673402, + "learning_rate": 0.0004884841230561922, + "loss": 0.0066, + "num_input_tokens_seen": 147387296, + "step": 68240 + }, + { + "epoch": 11.132952691680261, + "grad_norm": 0.014841769821941853, + "learning_rate": 0.0004884129622320381, + "loss": 0.0104, + "num_input_tokens_seen": 147398144, + "step": 68245 + }, + { + "epoch": 11.133768352365417, + "grad_norm": 0.0036457956302911043, + "learning_rate": 0.0004883418016427099, + "loss": 0.0114, + "num_input_tokens_seen": 147409504, + "step": 68250 + }, + { + "epoch": 11.13458401305057, + "grad_norm": 0.0025173728354275227, + "learning_rate": 0.00048827064128965014, + "loss": 0.005, + "num_input_tokens_seen": 147420512, + "step": 68255 + }, + { + "epoch": 11.135399673735726, + "grad_norm": 0.007116433698683977, + "learning_rate": 0.00048819948117430047, + "loss": 0.0038, + "num_input_tokens_seen": 147432672, + "step": 68260 + }, + { + "epoch": 11.13621533442088, + "grad_norm": 0.0915973037481308, + "learning_rate": 0.00048812832129810347, + "loss": 0.1583, + "num_input_tokens_seen": 147444512, + "step": 68265 + }, + { + "epoch": 11.137030995106036, + "grad_norm": 0.008468952029943466, + "learning_rate": 0.0004880571616625009, + "loss": 0.0097, + "num_input_tokens_seen": 147455296, + "step": 68270 + }, + { + "epoch": 11.137846655791192, + "grad_norm": 0.004829864017665386, + "learning_rate": 0.00048798600226893535, + "loss": 0.0415, + "num_input_tokens_seen": 147465120, + "step": 68275 + }, + { + "epoch": 11.138662316476346, + "grad_norm": 0.005911378655582666, + "learning_rate": 0.00048791484311884844, + "loss": 0.0123, + "num_input_tokens_seen": 147476512, + "step": 68280 + }, + { + "epoch": 11.139477977161501, + "grad_norm": 0.6165083646774292, + "learning_rate": 0.0004878436842136828, + "loss": 0.2525, + "num_input_tokens_seen": 147487520, + "step": 68285 + }, + { + "epoch": 11.140293637846655, + "grad_norm": 0.02946476638317108, + "learning_rate": 0.0004877725255548801, + "loss": 0.1519, + "num_input_tokens_seen": 147499584, + "step": 68290 + }, + { + "epoch": 11.141109298531811, + "grad_norm": 0.010245737619698048, + "learning_rate": 0.0004877013671438828, + "loss": 0.0123, + "num_input_tokens_seen": 147510176, + "step": 68295 + }, + { + "epoch": 11.141924959216965, + "grad_norm": 0.030240066349506378, + "learning_rate": 0.0004876302089821329, + "loss": 0.009, + "num_input_tokens_seen": 147520352, + "step": 68300 + }, + { + "epoch": 11.14274061990212, + "grad_norm": 0.004862932022660971, + "learning_rate": 0.0004875590510710724, + "loss": 0.003, + "num_input_tokens_seen": 147530560, + "step": 68305 + }, + { + "epoch": 11.143556280587276, + "grad_norm": 0.2248290628194809, + "learning_rate": 0.00048748789341214373, + "loss": 0.0184, + "num_input_tokens_seen": 147542016, + "step": 68310 + }, + { + "epoch": 11.14437194127243, + "grad_norm": 0.08497530966997147, + "learning_rate": 0.00048741673600678857, + "loss": 0.0922, + "num_input_tokens_seen": 147552896, + "step": 68315 + }, + { + "epoch": 11.145187601957586, + "grad_norm": 0.02630682848393917, + "learning_rate": 0.00048734557885644924, + "loss": 0.0608, + "num_input_tokens_seen": 147563008, + "step": 68320 + }, + { + "epoch": 11.14600326264274, + "grad_norm": 0.0025665624998509884, + "learning_rate": 0.00048727442196256786, + "loss": 0.056, + "num_input_tokens_seen": 147573568, + "step": 68325 + }, + { + "epoch": 11.146818923327896, + "grad_norm": 0.008093073032796383, + "learning_rate": 0.0004872032653265865, + "loss": 0.1188, + "num_input_tokens_seen": 147584448, + "step": 68330 + }, + { + "epoch": 11.147634584013051, + "grad_norm": 0.42789462208747864, + "learning_rate": 0.0004871321089499472, + "loss": 0.0741, + "num_input_tokens_seen": 147595680, + "step": 68335 + }, + { + "epoch": 11.148450244698205, + "grad_norm": 0.010536248795688152, + "learning_rate": 0.00048706095283409194, + "loss": 0.006, + "num_input_tokens_seen": 147606688, + "step": 68340 + }, + { + "epoch": 11.149265905383361, + "grad_norm": 0.002236375818029046, + "learning_rate": 0.00048698979698046286, + "loss": 0.0296, + "num_input_tokens_seen": 147617728, + "step": 68345 + }, + { + "epoch": 11.150081566068515, + "grad_norm": 0.252946674823761, + "learning_rate": 0.0004869186413905023, + "loss": 0.058, + "num_input_tokens_seen": 147627136, + "step": 68350 + }, + { + "epoch": 11.15089722675367, + "grad_norm": 0.012744572013616562, + "learning_rate": 0.00048684748606565175, + "loss": 0.0104, + "num_input_tokens_seen": 147638368, + "step": 68355 + }, + { + "epoch": 11.151712887438826, + "grad_norm": 0.17559273540973663, + "learning_rate": 0.00048677633100735387, + "loss": 0.0119, + "num_input_tokens_seen": 147649536, + "step": 68360 + }, + { + "epoch": 11.15252854812398, + "grad_norm": 0.04494628682732582, + "learning_rate": 0.00048670517621705016, + "loss": 0.0073, + "num_input_tokens_seen": 147660736, + "step": 68365 + }, + { + "epoch": 11.153344208809136, + "grad_norm": 0.12925973534584045, + "learning_rate": 0.0004866340216961832, + "loss": 0.0964, + "num_input_tokens_seen": 147672704, + "step": 68370 + }, + { + "epoch": 11.15415986949429, + "grad_norm": 0.03734464943408966, + "learning_rate": 0.00048656286744619447, + "loss": 0.0714, + "num_input_tokens_seen": 147684128, + "step": 68375 + }, + { + "epoch": 11.154975530179446, + "grad_norm": 0.026860255748033524, + "learning_rate": 0.0004864917134685265, + "loss": 0.192, + "num_input_tokens_seen": 147694368, + "step": 68380 + }, + { + "epoch": 11.1557911908646, + "grad_norm": 0.049138400703668594, + "learning_rate": 0.0004864205597646209, + "loss": 0.0124, + "num_input_tokens_seen": 147704352, + "step": 68385 + }, + { + "epoch": 11.156606851549755, + "grad_norm": 0.012718032114207745, + "learning_rate": 0.00048634940633592006, + "loss": 0.0095, + "num_input_tokens_seen": 147716064, + "step": 68390 + }, + { + "epoch": 11.15742251223491, + "grad_norm": 0.022052332758903503, + "learning_rate": 0.00048627825318386567, + "loss": 0.1129, + "num_input_tokens_seen": 147726720, + "step": 68395 + }, + { + "epoch": 11.158238172920065, + "grad_norm": 0.007494083605706692, + "learning_rate": 0.00048620710030990004, + "loss": 0.0105, + "num_input_tokens_seen": 147737632, + "step": 68400 + }, + { + "epoch": 11.15905383360522, + "grad_norm": 0.0026938801165670156, + "learning_rate": 0.0004861359477154648, + "loss": 0.0068, + "num_input_tokens_seen": 147748160, + "step": 68405 + }, + { + "epoch": 11.159869494290374, + "grad_norm": 0.010102441534399986, + "learning_rate": 0.00048606479540200243, + "loss": 0.2594, + "num_input_tokens_seen": 147758560, + "step": 68410 + }, + { + "epoch": 11.16068515497553, + "grad_norm": 0.009311516769230366, + "learning_rate": 0.00048599364337095443, + "loss": 0.0699, + "num_input_tokens_seen": 147769568, + "step": 68415 + }, + { + "epoch": 11.161500815660686, + "grad_norm": 0.015968909487128258, + "learning_rate": 0.000485922491623763, + "loss": 0.0083, + "num_input_tokens_seen": 147780480, + "step": 68420 + }, + { + "epoch": 11.16231647634584, + "grad_norm": 0.003891808446496725, + "learning_rate": 0.0004858513401618704, + "loss": 0.009, + "num_input_tokens_seen": 147792096, + "step": 68425 + }, + { + "epoch": 11.163132137030995, + "grad_norm": 0.014763821847736835, + "learning_rate": 0.00048578018898671804, + "loss": 0.0104, + "num_input_tokens_seen": 147802144, + "step": 68430 + }, + { + "epoch": 11.16394779771615, + "grad_norm": 0.1828288435935974, + "learning_rate": 0.0004857090380997484, + "loss": 0.0814, + "num_input_tokens_seen": 147812480, + "step": 68435 + }, + { + "epoch": 11.164763458401305, + "grad_norm": 0.23500409722328186, + "learning_rate": 0.00048563788750240314, + "loss": 0.1066, + "num_input_tokens_seen": 147823328, + "step": 68440 + }, + { + "epoch": 11.16557911908646, + "grad_norm": 0.29729729890823364, + "learning_rate": 0.00048556673719612445, + "loss": 0.1152, + "num_input_tokens_seen": 147832992, + "step": 68445 + }, + { + "epoch": 11.166394779771615, + "grad_norm": 0.04158762842416763, + "learning_rate": 0.00048549558718235386, + "loss": 0.0312, + "num_input_tokens_seen": 147844160, + "step": 68450 + }, + { + "epoch": 11.16721044045677, + "grad_norm": 0.023969994857907295, + "learning_rate": 0.0004854244374625339, + "loss": 0.016, + "num_input_tokens_seen": 147855328, + "step": 68455 + }, + { + "epoch": 11.168026101141924, + "grad_norm": 0.33585116267204285, + "learning_rate": 0.00048535328803810595, + "loss": 0.1219, + "num_input_tokens_seen": 147866240, + "step": 68460 + }, + { + "epoch": 11.16884176182708, + "grad_norm": 0.02872396446764469, + "learning_rate": 0.0004852821389105123, + "loss": 0.073, + "num_input_tokens_seen": 147877824, + "step": 68465 + }, + { + "epoch": 11.169657422512234, + "grad_norm": 0.20101076364517212, + "learning_rate": 0.00048521099008119484, + "loss": 0.0217, + "num_input_tokens_seen": 147887200, + "step": 68470 + }, + { + "epoch": 11.17047308319739, + "grad_norm": 0.23845504224300385, + "learning_rate": 0.0004851398415515954, + "loss": 0.0247, + "num_input_tokens_seen": 147896224, + "step": 68475 + }, + { + "epoch": 11.171288743882545, + "grad_norm": 0.02124555967748165, + "learning_rate": 0.0004850686933231559, + "loss": 0.0392, + "num_input_tokens_seen": 147906368, + "step": 68480 + }, + { + "epoch": 11.1721044045677, + "grad_norm": 0.008393766358494759, + "learning_rate": 0.00048499754539731827, + "loss": 0.0166, + "num_input_tokens_seen": 147917952, + "step": 68485 + }, + { + "epoch": 11.172920065252855, + "grad_norm": 0.02445857785642147, + "learning_rate": 0.0004849263977755243, + "loss": 0.0246, + "num_input_tokens_seen": 147928064, + "step": 68490 + }, + { + "epoch": 11.173735725938009, + "grad_norm": 0.07245718687772751, + "learning_rate": 0.00048485525045921627, + "loss": 0.0295, + "num_input_tokens_seen": 147938336, + "step": 68495 + }, + { + "epoch": 11.174551386623165, + "grad_norm": 0.008152371272444725, + "learning_rate": 0.00048478410344983554, + "loss": 0.0155, + "num_input_tokens_seen": 147948480, + "step": 68500 + }, + { + "epoch": 11.17536704730832, + "grad_norm": 0.18014536798000336, + "learning_rate": 0.00048471295674882447, + "loss": 0.2172, + "num_input_tokens_seen": 147958560, + "step": 68505 + }, + { + "epoch": 11.176182707993474, + "grad_norm": 0.0194853488355875, + "learning_rate": 0.0004846418103576245, + "loss": 0.0456, + "num_input_tokens_seen": 147969056, + "step": 68510 + }, + { + "epoch": 11.17699836867863, + "grad_norm": 0.278815358877182, + "learning_rate": 0.000484570664277678, + "loss": 0.1935, + "num_input_tokens_seen": 147980064, + "step": 68515 + }, + { + "epoch": 11.177814029363784, + "grad_norm": 0.015023061074316502, + "learning_rate": 0.00048449951851042627, + "loss": 0.0262, + "num_input_tokens_seen": 147990752, + "step": 68520 + }, + { + "epoch": 11.17862969004894, + "grad_norm": 0.12025143951177597, + "learning_rate": 0.0004844283730573115, + "loss": 0.0308, + "num_input_tokens_seen": 148002176, + "step": 68525 + }, + { + "epoch": 11.179445350734095, + "grad_norm": 0.008281445130705833, + "learning_rate": 0.0004843572279197757, + "loss": 0.0199, + "num_input_tokens_seen": 148011872, + "step": 68530 + }, + { + "epoch": 11.18026101141925, + "grad_norm": 0.019231772050261497, + "learning_rate": 0.0004842860830992604, + "loss": 0.0178, + "num_input_tokens_seen": 148022592, + "step": 68535 + }, + { + "epoch": 11.181076672104405, + "grad_norm": 0.0041782851330935955, + "learning_rate": 0.00048421493859720767, + "loss": 0.0598, + "num_input_tokens_seen": 148032832, + "step": 68540 + }, + { + "epoch": 11.181892332789559, + "grad_norm": 0.09217726439237595, + "learning_rate": 0.000484143794415059, + "loss": 0.0425, + "num_input_tokens_seen": 148042816, + "step": 68545 + }, + { + "epoch": 11.182707993474715, + "grad_norm": 0.31993457674980164, + "learning_rate": 0.00048407265055425673, + "loss": 0.1465, + "num_input_tokens_seen": 148053056, + "step": 68550 + }, + { + "epoch": 11.18352365415987, + "grad_norm": 0.02010430581867695, + "learning_rate": 0.00048400150701624216, + "loss": 0.0309, + "num_input_tokens_seen": 148062912, + "step": 68555 + }, + { + "epoch": 11.184339314845024, + "grad_norm": 0.020727120339870453, + "learning_rate": 0.0004839303638024576, + "loss": 0.009, + "num_input_tokens_seen": 148074176, + "step": 68560 + }, + { + "epoch": 11.18515497553018, + "grad_norm": 0.016470473259687424, + "learning_rate": 0.0004838592209143444, + "loss": 0.0048, + "num_input_tokens_seen": 148085728, + "step": 68565 + }, + { + "epoch": 11.185970636215334, + "grad_norm": 0.007106783799827099, + "learning_rate": 0.0004837880783533447, + "loss": 0.0293, + "num_input_tokens_seen": 148095744, + "step": 68570 + }, + { + "epoch": 11.18678629690049, + "grad_norm": 0.04211915656924248, + "learning_rate": 0.00048371693612089996, + "loss": 0.0085, + "num_input_tokens_seen": 148106592, + "step": 68575 + }, + { + "epoch": 11.187601957585644, + "grad_norm": 0.026171937584877014, + "learning_rate": 0.00048364579421845245, + "loss": 0.0794, + "num_input_tokens_seen": 148118272, + "step": 68580 + }, + { + "epoch": 11.1884176182708, + "grad_norm": 0.028126433491706848, + "learning_rate": 0.0004835746526474434, + "loss": 0.0699, + "num_input_tokens_seen": 148129856, + "step": 68585 + }, + { + "epoch": 11.189233278955955, + "grad_norm": 0.49054139852523804, + "learning_rate": 0.00048350351140931505, + "loss": 0.0506, + "num_input_tokens_seen": 148140544, + "step": 68590 + }, + { + "epoch": 11.190048939641109, + "grad_norm": 0.4415249526500702, + "learning_rate": 0.00048343237050550876, + "loss": 0.0405, + "num_input_tokens_seen": 148149600, + "step": 68595 + }, + { + "epoch": 11.190864600326265, + "grad_norm": 0.013207647018134594, + "learning_rate": 0.0004833612299374667, + "loss": 0.0279, + "num_input_tokens_seen": 148161376, + "step": 68600 + }, + { + "epoch": 11.191680261011419, + "grad_norm": 0.006056750193238258, + "learning_rate": 0.0004832900897066303, + "loss": 0.0179, + "num_input_tokens_seen": 148171968, + "step": 68605 + }, + { + "epoch": 11.192495921696574, + "grad_norm": 0.42517709732055664, + "learning_rate": 0.0004832189498144415, + "loss": 0.0969, + "num_input_tokens_seen": 148183168, + "step": 68610 + }, + { + "epoch": 11.19331158238173, + "grad_norm": 0.029658634215593338, + "learning_rate": 0.0004831478102623419, + "loss": 0.0057, + "num_input_tokens_seen": 148194592, + "step": 68615 + }, + { + "epoch": 11.194127243066884, + "grad_norm": 0.004472116474062204, + "learning_rate": 0.0004830766710517733, + "loss": 0.0063, + "num_input_tokens_seen": 148206880, + "step": 68620 + }, + { + "epoch": 11.19494290375204, + "grad_norm": 0.004931016359478235, + "learning_rate": 0.00048300553218417753, + "loss": 0.0322, + "num_input_tokens_seen": 148217504, + "step": 68625 + }, + { + "epoch": 11.195758564437194, + "grad_norm": 1.0115134716033936, + "learning_rate": 0.0004829343936609961, + "loss": 0.0433, + "num_input_tokens_seen": 148228160, + "step": 68630 + }, + { + "epoch": 11.19657422512235, + "grad_norm": 0.13028618693351746, + "learning_rate": 0.00048286325548367083, + "loss": 0.0401, + "num_input_tokens_seen": 148238816, + "step": 68635 + }, + { + "epoch": 11.197389885807505, + "grad_norm": 0.42092257738113403, + "learning_rate": 0.0004827921176536435, + "loss": 0.0875, + "num_input_tokens_seen": 148249152, + "step": 68640 + }, + { + "epoch": 11.198205546492659, + "grad_norm": 0.009416461922228336, + "learning_rate": 0.00048272098017235573, + "loss": 0.0141, + "num_input_tokens_seen": 148261024, + "step": 68645 + }, + { + "epoch": 11.199021207177815, + "grad_norm": 0.02090616337954998, + "learning_rate": 0.0004826498430412492, + "loss": 0.017, + "num_input_tokens_seen": 148271712, + "step": 68650 + }, + { + "epoch": 11.199836867862969, + "grad_norm": 0.0011445780983194709, + "learning_rate": 0.00048257870626176565, + "loss": 0.0089, + "num_input_tokens_seen": 148283584, + "step": 68655 + }, + { + "epoch": 11.200652528548124, + "grad_norm": 0.0027027344331145287, + "learning_rate": 0.00048250756983534657, + "loss": 0.021, + "num_input_tokens_seen": 148293408, + "step": 68660 + }, + { + "epoch": 11.201468189233278, + "grad_norm": 0.006044385023415089, + "learning_rate": 0.000482436433763434, + "loss": 0.0261, + "num_input_tokens_seen": 148303776, + "step": 68665 + }, + { + "epoch": 11.202283849918434, + "grad_norm": 0.004509706981480122, + "learning_rate": 0.00048236529804746915, + "loss": 0.0046, + "num_input_tokens_seen": 148315008, + "step": 68670 + }, + { + "epoch": 11.20309951060359, + "grad_norm": 0.18956010043621063, + "learning_rate": 0.0004822941626888941, + "loss": 0.0489, + "num_input_tokens_seen": 148325344, + "step": 68675 + }, + { + "epoch": 11.203915171288743, + "grad_norm": 0.012917671352624893, + "learning_rate": 0.0004822230276891502, + "loss": 0.0532, + "num_input_tokens_seen": 148337120, + "step": 68680 + }, + { + "epoch": 11.2047308319739, + "grad_norm": 0.04024987295269966, + "learning_rate": 0.00048215189304967934, + "loss": 0.0764, + "num_input_tokens_seen": 148348576, + "step": 68685 + }, + { + "epoch": 11.205546492659053, + "grad_norm": 0.737769365310669, + "learning_rate": 0.00048208075877192275, + "loss": 0.1234, + "num_input_tokens_seen": 148359552, + "step": 68690 + }, + { + "epoch": 11.206362153344209, + "grad_norm": 0.44505423307418823, + "learning_rate": 0.0004820096248573226, + "loss": 0.1929, + "num_input_tokens_seen": 148369568, + "step": 68695 + }, + { + "epoch": 11.207177814029365, + "grad_norm": 0.008643914945423603, + "learning_rate": 0.00048193849130732, + "loss": 0.0077, + "num_input_tokens_seen": 148381184, + "step": 68700 + }, + { + "epoch": 11.207993474714518, + "grad_norm": 0.060006801038980484, + "learning_rate": 0.00048186735812335695, + "loss": 0.0786, + "num_input_tokens_seen": 148390880, + "step": 68705 + }, + { + "epoch": 11.208809135399674, + "grad_norm": 0.0032924246042966843, + "learning_rate": 0.0004817962253068747, + "loss": 0.0805, + "num_input_tokens_seen": 148402080, + "step": 68710 + }, + { + "epoch": 11.209624796084828, + "grad_norm": 0.12051805853843689, + "learning_rate": 0.0004817250928593153, + "loss": 0.0332, + "num_input_tokens_seen": 148413440, + "step": 68715 + }, + { + "epoch": 11.210440456769984, + "grad_norm": 0.020529478788375854, + "learning_rate": 0.0004816539607821198, + "loss": 0.0171, + "num_input_tokens_seen": 148424640, + "step": 68720 + }, + { + "epoch": 11.21125611745514, + "grad_norm": 0.010123873129487038, + "learning_rate": 0.0004815828290767303, + "loss": 0.0242, + "num_input_tokens_seen": 148434240, + "step": 68725 + }, + { + "epoch": 11.212071778140293, + "grad_norm": 0.04959236830472946, + "learning_rate": 0.00048151169774458797, + "loss": 0.0102, + "num_input_tokens_seen": 148443488, + "step": 68730 + }, + { + "epoch": 11.21288743882545, + "grad_norm": 0.467986524105072, + "learning_rate": 0.00048144056678713445, + "loss": 0.1084, + "num_input_tokens_seen": 148453120, + "step": 68735 + }, + { + "epoch": 11.213703099510603, + "grad_norm": 0.7139317393302917, + "learning_rate": 0.00048136943620581164, + "loss": 0.0229, + "num_input_tokens_seen": 148463712, + "step": 68740 + }, + { + "epoch": 11.214518760195759, + "grad_norm": 0.10186992585659027, + "learning_rate": 0.00048129830600206067, + "loss": 0.0082, + "num_input_tokens_seen": 148474336, + "step": 68745 + }, + { + "epoch": 11.215334420880913, + "grad_norm": 0.004726898390799761, + "learning_rate": 0.0004812271761773234, + "loss": 0.0443, + "num_input_tokens_seen": 148485184, + "step": 68750 + }, + { + "epoch": 11.216150081566068, + "grad_norm": 0.016972597688436508, + "learning_rate": 0.00048115604673304105, + "loss": 0.035, + "num_input_tokens_seen": 148495712, + "step": 68755 + }, + { + "epoch": 11.216965742251224, + "grad_norm": 0.2422623485326767, + "learning_rate": 0.0004810849176706555, + "loss": 0.0959, + "num_input_tokens_seen": 148507200, + "step": 68760 + }, + { + "epoch": 11.217781402936378, + "grad_norm": 0.3776349127292633, + "learning_rate": 0.00048101378899160786, + "loss": 0.1002, + "num_input_tokens_seen": 148517216, + "step": 68765 + }, + { + "epoch": 11.218597063621534, + "grad_norm": 0.016399575397372246, + "learning_rate": 0.0004809426606973401, + "loss": 0.0085, + "num_input_tokens_seen": 148527520, + "step": 68770 + }, + { + "epoch": 11.219412724306688, + "grad_norm": 0.003865182166919112, + "learning_rate": 0.00048087153278929327, + "loss": 0.0114, + "num_input_tokens_seen": 148537728, + "step": 68775 + }, + { + "epoch": 11.220228384991843, + "grad_norm": 0.1378757506608963, + "learning_rate": 0.0004808004052689093, + "loss": 0.0282, + "num_input_tokens_seen": 148548736, + "step": 68780 + }, + { + "epoch": 11.221044045676999, + "grad_norm": 0.004386617336422205, + "learning_rate": 0.0004807292781376294, + "loss": 0.0657, + "num_input_tokens_seen": 148559776, + "step": 68785 + }, + { + "epoch": 11.221859706362153, + "grad_norm": 0.014153995551168919, + "learning_rate": 0.0004806581513968951, + "loss": 0.0088, + "num_input_tokens_seen": 148571200, + "step": 68790 + }, + { + "epoch": 11.222675367047309, + "grad_norm": 0.004978655371814966, + "learning_rate": 0.00048058702504814795, + "loss": 0.0824, + "num_input_tokens_seen": 148582528, + "step": 68795 + }, + { + "epoch": 11.223491027732463, + "grad_norm": 0.2707526683807373, + "learning_rate": 0.0004805158990928293, + "loss": 0.0324, + "num_input_tokens_seen": 148593216, + "step": 68800 + }, + { + "epoch": 11.224306688417618, + "grad_norm": 0.02647767774760723, + "learning_rate": 0.0004804447735323806, + "loss": 0.0042, + "num_input_tokens_seen": 148605248, + "step": 68805 + }, + { + "epoch": 11.225122349102774, + "grad_norm": 0.008877074345946312, + "learning_rate": 0.0004803736483682436, + "loss": 0.1179, + "num_input_tokens_seen": 148614976, + "step": 68810 + }, + { + "epoch": 11.225938009787928, + "grad_norm": 0.005164226982742548, + "learning_rate": 0.0004803025236018593, + "loss": 0.0199, + "num_input_tokens_seen": 148625952, + "step": 68815 + }, + { + "epoch": 11.226753670473084, + "grad_norm": 0.04235182702541351, + "learning_rate": 0.00048023139923466954, + "loss": 0.1503, + "num_input_tokens_seen": 148636800, + "step": 68820 + }, + { + "epoch": 11.227569331158238, + "grad_norm": 0.3473689556121826, + "learning_rate": 0.00048016027526811536, + "loss": 0.0951, + "num_input_tokens_seen": 148648000, + "step": 68825 + }, + { + "epoch": 11.228384991843393, + "grad_norm": 0.5196056365966797, + "learning_rate": 0.00048008915170363853, + "loss": 0.0642, + "num_input_tokens_seen": 148659168, + "step": 68830 + }, + { + "epoch": 11.229200652528547, + "grad_norm": 0.04722573608160019, + "learning_rate": 0.0004800180285426802, + "loss": 0.0235, + "num_input_tokens_seen": 148670144, + "step": 68835 + }, + { + "epoch": 11.230016313213703, + "grad_norm": 0.006056048907339573, + "learning_rate": 0.00047994690578668175, + "loss": 0.012, + "num_input_tokens_seen": 148679808, + "step": 68840 + }, + { + "epoch": 11.230831973898859, + "grad_norm": 0.028124723583459854, + "learning_rate": 0.000479875783437085, + "loss": 0.0065, + "num_input_tokens_seen": 148690720, + "step": 68845 + }, + { + "epoch": 11.231647634584013, + "grad_norm": 0.03521675989031792, + "learning_rate": 0.00047980466149533075, + "loss": 0.0056, + "num_input_tokens_seen": 148703104, + "step": 68850 + }, + { + "epoch": 11.232463295269168, + "grad_norm": 0.020668139681220055, + "learning_rate": 0.0004797335399628609, + "loss": 0.0128, + "num_input_tokens_seen": 148714144, + "step": 68855 + }, + { + "epoch": 11.233278955954322, + "grad_norm": 0.3109743893146515, + "learning_rate": 0.0004796624188411163, + "loss": 0.0206, + "num_input_tokens_seen": 148725952, + "step": 68860 + }, + { + "epoch": 11.234094616639478, + "grad_norm": 0.002848732518032193, + "learning_rate": 0.00047959129813153885, + "loss": 0.0111, + "num_input_tokens_seen": 148736384, + "step": 68865 + }, + { + "epoch": 11.234910277324634, + "grad_norm": 0.11283021420240402, + "learning_rate": 0.00047952017783556945, + "loss": 0.0113, + "num_input_tokens_seen": 148746720, + "step": 68870 + }, + { + "epoch": 11.235725938009788, + "grad_norm": 0.06801166385412216, + "learning_rate": 0.00047944905795464977, + "loss": 0.0148, + "num_input_tokens_seen": 148757888, + "step": 68875 + }, + { + "epoch": 11.236541598694943, + "grad_norm": 0.34667694568634033, + "learning_rate": 0.0004793779384902208, + "loss": 0.1412, + "num_input_tokens_seen": 148768544, + "step": 68880 + }, + { + "epoch": 11.237357259380097, + "grad_norm": 0.4156077802181244, + "learning_rate": 0.00047930681944372434, + "loss": 0.0829, + "num_input_tokens_seen": 148780896, + "step": 68885 + }, + { + "epoch": 11.238172920065253, + "grad_norm": 0.0029288295190781355, + "learning_rate": 0.00047923570081660115, + "loss": 0.0029, + "num_input_tokens_seen": 148792992, + "step": 68890 + }, + { + "epoch": 11.238988580750409, + "grad_norm": 0.1302565187215805, + "learning_rate": 0.0004791645826102931, + "loss": 0.0099, + "num_input_tokens_seen": 148802560, + "step": 68895 + }, + { + "epoch": 11.239804241435563, + "grad_norm": 0.0034214449115097523, + "learning_rate": 0.000479093464826241, + "loss": 0.0178, + "num_input_tokens_seen": 148813344, + "step": 68900 + }, + { + "epoch": 11.240619902120718, + "grad_norm": 0.015604183077812195, + "learning_rate": 0.00047902234746588653, + "loss": 0.0411, + "num_input_tokens_seen": 148824064, + "step": 68905 + }, + { + "epoch": 11.241435562805872, + "grad_norm": 0.04579491913318634, + "learning_rate": 0.0004789512305306706, + "loss": 0.0316, + "num_input_tokens_seen": 148835232, + "step": 68910 + }, + { + "epoch": 11.242251223491028, + "grad_norm": 0.0059250290505588055, + "learning_rate": 0.0004788801140220349, + "loss": 0.0095, + "num_input_tokens_seen": 148845504, + "step": 68915 + }, + { + "epoch": 11.243066884176184, + "grad_norm": 0.013930793851613998, + "learning_rate": 0.00047880899794142026, + "loss": 0.1052, + "num_input_tokens_seen": 148857440, + "step": 68920 + }, + { + "epoch": 11.243882544861338, + "grad_norm": 0.003167049726471305, + "learning_rate": 0.00047873788229026826, + "loss": 0.0235, + "num_input_tokens_seen": 148868960, + "step": 68925 + }, + { + "epoch": 11.244698205546493, + "grad_norm": 0.6725971698760986, + "learning_rate": 0.0004786667670700201, + "loss": 0.0462, + "num_input_tokens_seen": 148879168, + "step": 68930 + }, + { + "epoch": 11.245513866231647, + "grad_norm": 0.552683413028717, + "learning_rate": 0.00047859565228211695, + "loss": 0.1268, + "num_input_tokens_seen": 148890016, + "step": 68935 + }, + { + "epoch": 11.246329526916803, + "grad_norm": 0.0028921207413077354, + "learning_rate": 0.00047852453792799997, + "loss": 0.006, + "num_input_tokens_seen": 148901344, + "step": 68940 + }, + { + "epoch": 11.247145187601957, + "grad_norm": 0.008932768367230892, + "learning_rate": 0.0004784534240091105, + "loss": 0.0332, + "num_input_tokens_seen": 148912480, + "step": 68945 + }, + { + "epoch": 11.247960848287113, + "grad_norm": 0.005634156055748463, + "learning_rate": 0.00047838231052688975, + "loss": 0.0064, + "num_input_tokens_seen": 148922592, + "step": 68950 + }, + { + "epoch": 11.248776508972268, + "grad_norm": 0.005630916450172663, + "learning_rate": 0.0004783111974827789, + "loss": 0.0109, + "num_input_tokens_seen": 148933824, + "step": 68955 + }, + { + "epoch": 11.249592169657422, + "grad_norm": 0.09499726444482803, + "learning_rate": 0.0004782400848782192, + "loss": 0.0164, + "num_input_tokens_seen": 148945152, + "step": 68960 + }, + { + "epoch": 11.250407830342578, + "grad_norm": 0.16316631436347961, + "learning_rate": 0.0004781689727146517, + "loss": 0.0227, + "num_input_tokens_seen": 148957504, + "step": 68965 + }, + { + "epoch": 11.251223491027732, + "grad_norm": 0.001331451814621687, + "learning_rate": 0.0004780978609935178, + "loss": 0.0178, + "num_input_tokens_seen": 148969312, + "step": 68970 + }, + { + "epoch": 11.252039151712887, + "grad_norm": 0.042243119329214096, + "learning_rate": 0.00047802674971625825, + "loss": 0.0096, + "num_input_tokens_seen": 148978656, + "step": 68975 + }, + { + "epoch": 11.252854812398043, + "grad_norm": 0.3264990746974945, + "learning_rate": 0.0004779556388843148, + "loss": 0.0268, + "num_input_tokens_seen": 148989408, + "step": 68980 + }, + { + "epoch": 11.253670473083197, + "grad_norm": 0.40584519505500793, + "learning_rate": 0.0004778845284991281, + "loss": 0.0428, + "num_input_tokens_seen": 148999360, + "step": 68985 + }, + { + "epoch": 11.254486133768353, + "grad_norm": 0.0077764419838786125, + "learning_rate": 0.00047781341856213965, + "loss": 0.0158, + "num_input_tokens_seen": 149010048, + "step": 68990 + }, + { + "epoch": 11.255301794453507, + "grad_norm": 0.0264874417334795, + "learning_rate": 0.00047774230907479025, + "loss": 0.0027, + "num_input_tokens_seen": 149019552, + "step": 68995 + }, + { + "epoch": 11.256117455138662, + "grad_norm": 0.0019950123969465494, + "learning_rate": 0.0004776712000385214, + "loss": 0.0248, + "num_input_tokens_seen": 149029216, + "step": 69000 + }, + { + "epoch": 11.256933115823816, + "grad_norm": 0.1232723593711853, + "learning_rate": 0.0004776000914547738, + "loss": 0.0121, + "num_input_tokens_seen": 149041088, + "step": 69005 + }, + { + "epoch": 11.257748776508972, + "grad_norm": 0.002051304094493389, + "learning_rate": 0.00047752898332498894, + "loss": 0.0062, + "num_input_tokens_seen": 149052224, + "step": 69010 + }, + { + "epoch": 11.258564437194128, + "grad_norm": 0.0008948579197749496, + "learning_rate": 0.00047745787565060756, + "loss": 0.0121, + "num_input_tokens_seen": 149063264, + "step": 69015 + }, + { + "epoch": 11.259380097879282, + "grad_norm": 0.011743937619030476, + "learning_rate": 0.0004773867684330711, + "loss": 0.0215, + "num_input_tokens_seen": 149074240, + "step": 69020 + }, + { + "epoch": 11.260195758564437, + "grad_norm": 0.7066226601600647, + "learning_rate": 0.0004773156616738203, + "loss": 0.0792, + "num_input_tokens_seen": 149085984, + "step": 69025 + }, + { + "epoch": 11.261011419249591, + "grad_norm": 0.012210289016366005, + "learning_rate": 0.00047724455537429656, + "loss": 0.1053, + "num_input_tokens_seen": 149096736, + "step": 69030 + }, + { + "epoch": 11.261827079934747, + "grad_norm": 0.001831859932281077, + "learning_rate": 0.00047717344953594054, + "loss": 0.0876, + "num_input_tokens_seen": 149107712, + "step": 69035 + }, + { + "epoch": 11.262642740619903, + "grad_norm": 0.01406815368682146, + "learning_rate": 0.0004771023441601938, + "loss": 0.0034, + "num_input_tokens_seen": 149118240, + "step": 69040 + }, + { + "epoch": 11.263458401305057, + "grad_norm": 0.06279000639915466, + "learning_rate": 0.0004770312392484968, + "loss": 0.0116, + "num_input_tokens_seen": 149128896, + "step": 69045 + }, + { + "epoch": 11.264274061990212, + "grad_norm": 0.007155647035688162, + "learning_rate": 0.000476960134802291, + "loss": 0.0221, + "num_input_tokens_seen": 149140960, + "step": 69050 + }, + { + "epoch": 11.265089722675366, + "grad_norm": 0.0038323281332850456, + "learning_rate": 0.00047688903082301746, + "loss": 0.0078, + "num_input_tokens_seen": 149152704, + "step": 69055 + }, + { + "epoch": 11.265905383360522, + "grad_norm": 0.03719216585159302, + "learning_rate": 0.00047681792731211684, + "loss": 0.0182, + "num_input_tokens_seen": 149164192, + "step": 69060 + }, + { + "epoch": 11.266721044045678, + "grad_norm": 0.015115122310817242, + "learning_rate": 0.00047674682427103045, + "loss": 0.01, + "num_input_tokens_seen": 149175008, + "step": 69065 + }, + { + "epoch": 11.267536704730832, + "grad_norm": 0.050118640065193176, + "learning_rate": 0.00047667572170119905, + "loss": 0.0488, + "num_input_tokens_seen": 149185440, + "step": 69070 + }, + { + "epoch": 11.268352365415987, + "grad_norm": 0.003916706424206495, + "learning_rate": 0.00047660461960406385, + "loss": 0.0159, + "num_input_tokens_seen": 149197184, + "step": 69075 + }, + { + "epoch": 11.269168026101141, + "grad_norm": 0.0028812792152166367, + "learning_rate": 0.0004765335179810656, + "loss": 0.0035, + "num_input_tokens_seen": 149207104, + "step": 69080 + }, + { + "epoch": 11.269983686786297, + "grad_norm": 0.0012590938713401556, + "learning_rate": 0.00047646241683364554, + "loss": 0.0129, + "num_input_tokens_seen": 149218464, + "step": 69085 + }, + { + "epoch": 11.270799347471453, + "grad_norm": 0.01192085538059473, + "learning_rate": 0.0004763913161632443, + "loss": 0.0174, + "num_input_tokens_seen": 149227328, + "step": 69090 + }, + { + "epoch": 11.271615008156607, + "grad_norm": 0.031869806349277496, + "learning_rate": 0.00047632021597130304, + "loss": 0.0346, + "num_input_tokens_seen": 149238752, + "step": 69095 + }, + { + "epoch": 11.272430668841762, + "grad_norm": 0.3605614900588989, + "learning_rate": 0.0004762491162592627, + "loss": 0.0333, + "num_input_tokens_seen": 149249024, + "step": 69100 + }, + { + "epoch": 11.273246329526916, + "grad_norm": 0.03341824561357498, + "learning_rate": 0.00047617801702856406, + "loss": 0.0162, + "num_input_tokens_seen": 149259968, + "step": 69105 + }, + { + "epoch": 11.274061990212072, + "grad_norm": 0.015178644098341465, + "learning_rate": 0.00047610691828064815, + "loss": 0.0139, + "num_input_tokens_seen": 149269632, + "step": 69110 + }, + { + "epoch": 11.274877650897226, + "grad_norm": 0.00792383961379528, + "learning_rate": 0.0004760358200169559, + "loss": 0.0043, + "num_input_tokens_seen": 149280480, + "step": 69115 + }, + { + "epoch": 11.275693311582382, + "grad_norm": 0.0019493248546496034, + "learning_rate": 0.000475964722238928, + "loss": 0.002, + "num_input_tokens_seen": 149291616, + "step": 69120 + }, + { + "epoch": 11.276508972267537, + "grad_norm": 0.0008896426879800856, + "learning_rate": 0.00047589362494800574, + "loss": 0.0016, + "num_input_tokens_seen": 149302368, + "step": 69125 + }, + { + "epoch": 11.277324632952691, + "grad_norm": 0.01973794586956501, + "learning_rate": 0.00047582252814562954, + "loss": 0.0232, + "num_input_tokens_seen": 149311648, + "step": 69130 + }, + { + "epoch": 11.278140293637847, + "grad_norm": 0.0590951032936573, + "learning_rate": 0.0004757514318332407, + "loss": 0.0864, + "num_input_tokens_seen": 149322912, + "step": 69135 + }, + { + "epoch": 11.278955954323001, + "grad_norm": 0.03127824887633324, + "learning_rate": 0.0004756803360122796, + "loss": 0.0046, + "num_input_tokens_seen": 149335168, + "step": 69140 + }, + { + "epoch": 11.279771615008157, + "grad_norm": 0.06734327971935272, + "learning_rate": 0.00047560924068418763, + "loss": 0.0141, + "num_input_tokens_seen": 149346368, + "step": 69145 + }, + { + "epoch": 11.280587275693312, + "grad_norm": 0.03294230252504349, + "learning_rate": 0.00047553814585040506, + "loss": 0.0062, + "num_input_tokens_seen": 149356544, + "step": 69150 + }, + { + "epoch": 11.281402936378466, + "grad_norm": 0.010984640568494797, + "learning_rate": 0.00047546705151237323, + "loss": 0.007, + "num_input_tokens_seen": 149366432, + "step": 69155 + }, + { + "epoch": 11.282218597063622, + "grad_norm": 0.008774088695645332, + "learning_rate": 0.00047539595767153255, + "loss": 0.0199, + "num_input_tokens_seen": 149377216, + "step": 69160 + }, + { + "epoch": 11.283034257748776, + "grad_norm": 0.021760782226920128, + "learning_rate": 0.00047532486432932394, + "loss": 0.0883, + "num_input_tokens_seen": 149388256, + "step": 69165 + }, + { + "epoch": 11.283849918433932, + "grad_norm": 0.0007815745775587857, + "learning_rate": 0.00047525377148718845, + "loss": 0.014, + "num_input_tokens_seen": 149400224, + "step": 69170 + }, + { + "epoch": 11.284665579119087, + "grad_norm": 0.10448987782001495, + "learning_rate": 0.00047518267914656656, + "loss": 0.039, + "num_input_tokens_seen": 149411808, + "step": 69175 + }, + { + "epoch": 11.285481239804241, + "grad_norm": 0.04002084583044052, + "learning_rate": 0.0004751115873088992, + "loss": 0.1977, + "num_input_tokens_seen": 149422272, + "step": 69180 + }, + { + "epoch": 11.286296900489397, + "grad_norm": 0.4018106162548065, + "learning_rate": 0.0004750404959756271, + "loss": 0.0552, + "num_input_tokens_seen": 149433664, + "step": 69185 + }, + { + "epoch": 11.28711256117455, + "grad_norm": 0.1768367737531662, + "learning_rate": 0.0004749694051481911, + "loss": 0.0444, + "num_input_tokens_seen": 149443936, + "step": 69190 + }, + { + "epoch": 11.287928221859707, + "grad_norm": 0.532922089099884, + "learning_rate": 0.00047489831482803167, + "loss": 0.0503, + "num_input_tokens_seen": 149453824, + "step": 69195 + }, + { + "epoch": 11.28874388254486, + "grad_norm": 0.013889400288462639, + "learning_rate": 0.00047482722501658993, + "loss": 0.0031, + "num_input_tokens_seen": 149464992, + "step": 69200 + }, + { + "epoch": 11.289559543230016, + "grad_norm": 0.020058369264006615, + "learning_rate": 0.00047475613571530624, + "loss": 0.0058, + "num_input_tokens_seen": 149476672, + "step": 69205 + }, + { + "epoch": 11.290375203915172, + "grad_norm": 0.0016307708574458957, + "learning_rate": 0.0004746850469256216, + "loss": 0.1704, + "num_input_tokens_seen": 149487392, + "step": 69210 + }, + { + "epoch": 11.291190864600326, + "grad_norm": 0.014769136905670166, + "learning_rate": 0.0004746139586489765, + "loss": 0.0223, + "num_input_tokens_seen": 149498208, + "step": 69215 + }, + { + "epoch": 11.292006525285482, + "grad_norm": 0.0010451297275722027, + "learning_rate": 0.00047454287088681194, + "loss": 0.0099, + "num_input_tokens_seen": 149509280, + "step": 69220 + }, + { + "epoch": 11.292822185970635, + "grad_norm": 0.005814549047499895, + "learning_rate": 0.0004744717836405681, + "loss": 0.0855, + "num_input_tokens_seen": 149519808, + "step": 69225 + }, + { + "epoch": 11.293637846655791, + "grad_norm": 0.041108760982751846, + "learning_rate": 0.00047440069691168617, + "loss": 0.0058, + "num_input_tokens_seen": 149530208, + "step": 69230 + }, + { + "epoch": 11.294453507340947, + "grad_norm": 0.517116367816925, + "learning_rate": 0.0004743296107016065, + "loss": 0.0336, + "num_input_tokens_seen": 149541728, + "step": 69235 + }, + { + "epoch": 11.2952691680261, + "grad_norm": 0.3088570833206177, + "learning_rate": 0.0004742585250117698, + "loss": 0.0157, + "num_input_tokens_seen": 149552544, + "step": 69240 + }, + { + "epoch": 11.296084828711257, + "grad_norm": 1.3200207948684692, + "learning_rate": 0.00047418743984361676, + "loss": 0.0721, + "num_input_tokens_seen": 149563456, + "step": 69245 + }, + { + "epoch": 11.29690048939641, + "grad_norm": 0.4354603886604309, + "learning_rate": 0.0004741163551985881, + "loss": 0.0331, + "num_input_tokens_seen": 149574336, + "step": 69250 + }, + { + "epoch": 11.297716150081566, + "grad_norm": 0.0031291439663618803, + "learning_rate": 0.00047404527107812423, + "loss": 0.002, + "num_input_tokens_seen": 149584512, + "step": 69255 + }, + { + "epoch": 11.298531810766722, + "grad_norm": 0.43325313925743103, + "learning_rate": 0.00047397418748366596, + "loss": 0.0693, + "num_input_tokens_seen": 149594176, + "step": 69260 + }, + { + "epoch": 11.299347471451876, + "grad_norm": 0.08179045468568802, + "learning_rate": 0.0004739031044166536, + "loss": 0.0592, + "num_input_tokens_seen": 149605216, + "step": 69265 + }, + { + "epoch": 11.300163132137031, + "grad_norm": 0.00407014973461628, + "learning_rate": 0.0004738320218785281, + "loss": 0.0163, + "num_input_tokens_seen": 149616480, + "step": 69270 + }, + { + "epoch": 11.300978792822185, + "grad_norm": 0.013852819800376892, + "learning_rate": 0.00047376093987072985, + "loss": 0.0048, + "num_input_tokens_seen": 149627392, + "step": 69275 + }, + { + "epoch": 11.301794453507341, + "grad_norm": 0.002191155683249235, + "learning_rate": 0.00047368985839469946, + "loss": 0.0029, + "num_input_tokens_seen": 149638144, + "step": 69280 + }, + { + "epoch": 11.302610114192497, + "grad_norm": 0.059072766453027725, + "learning_rate": 0.00047361877745187743, + "loss": 0.0103, + "num_input_tokens_seen": 149648640, + "step": 69285 + }, + { + "epoch": 11.30342577487765, + "grad_norm": 0.4255982041358948, + "learning_rate": 0.0004735476970437043, + "loss": 0.1163, + "num_input_tokens_seen": 149659392, + "step": 69290 + }, + { + "epoch": 11.304241435562806, + "grad_norm": 0.06577350199222565, + "learning_rate": 0.0004734766171716208, + "loss": 0.02, + "num_input_tokens_seen": 149669184, + "step": 69295 + }, + { + "epoch": 11.30505709624796, + "grad_norm": 0.47753238677978516, + "learning_rate": 0.0004734055378370671, + "loss": 0.0425, + "num_input_tokens_seen": 149681504, + "step": 69300 + }, + { + "epoch": 11.305872756933116, + "grad_norm": 0.012164420448243618, + "learning_rate": 0.00047333445904148414, + "loss": 0.0191, + "num_input_tokens_seen": 149692192, + "step": 69305 + }, + { + "epoch": 11.30668841761827, + "grad_norm": 0.0932174026966095, + "learning_rate": 0.0004732633807863119, + "loss": 0.0097, + "num_input_tokens_seen": 149702688, + "step": 69310 + }, + { + "epoch": 11.307504078303426, + "grad_norm": 0.007502132561057806, + "learning_rate": 0.0004731923030729915, + "loss": 0.0056, + "num_input_tokens_seen": 149714016, + "step": 69315 + }, + { + "epoch": 11.308319738988581, + "grad_norm": 0.002116349758580327, + "learning_rate": 0.0004731212259029628, + "loss": 0.006, + "num_input_tokens_seen": 149723456, + "step": 69320 + }, + { + "epoch": 11.309135399673735, + "grad_norm": 0.0006365369190461934, + "learning_rate": 0.0004730501492776668, + "loss": 0.004, + "num_input_tokens_seen": 149734816, + "step": 69325 + }, + { + "epoch": 11.309951060358891, + "grad_norm": 0.12664510309696198, + "learning_rate": 0.00047297907319854347, + "loss": 0.0369, + "num_input_tokens_seen": 149745440, + "step": 69330 + }, + { + "epoch": 11.310766721044045, + "grad_norm": 0.13877706229686737, + "learning_rate": 0.0004729079976670338, + "loss": 0.0101, + "num_input_tokens_seen": 149753664, + "step": 69335 + }, + { + "epoch": 11.3115823817292, + "grad_norm": 0.31556734442710876, + "learning_rate": 0.00047283692268457764, + "loss": 0.0417, + "num_input_tokens_seen": 149765280, + "step": 69340 + }, + { + "epoch": 11.312398042414356, + "grad_norm": 0.07661747932434082, + "learning_rate": 0.0004727658482526159, + "loss": 0.0281, + "num_input_tokens_seen": 149774944, + "step": 69345 + }, + { + "epoch": 11.31321370309951, + "grad_norm": 0.0017009270377457142, + "learning_rate": 0.00047269477437258863, + "loss": 0.0066, + "num_input_tokens_seen": 149785664, + "step": 69350 + }, + { + "epoch": 11.314029363784666, + "grad_norm": 0.03626738116145134, + "learning_rate": 0.0004726237010459366, + "loss": 0.0107, + "num_input_tokens_seen": 149796864, + "step": 69355 + }, + { + "epoch": 11.31484502446982, + "grad_norm": 0.0531466118991375, + "learning_rate": 0.00047255262827409974, + "loss": 0.1056, + "num_input_tokens_seen": 149807136, + "step": 69360 + }, + { + "epoch": 11.315660685154976, + "grad_norm": 0.3658745288848877, + "learning_rate": 0.00047248155605851896, + "loss": 0.0328, + "num_input_tokens_seen": 149817888, + "step": 69365 + }, + { + "epoch": 11.31647634584013, + "grad_norm": 0.5249987840652466, + "learning_rate": 0.0004724104844006341, + "loss": 0.0633, + "num_input_tokens_seen": 149828448, + "step": 69370 + }, + { + "epoch": 11.317292006525285, + "grad_norm": 0.0011662240140140057, + "learning_rate": 0.0004723394133018858, + "loss": 0.0368, + "num_input_tokens_seen": 149839968, + "step": 69375 + }, + { + "epoch": 11.318107667210441, + "grad_norm": 0.006454044952988625, + "learning_rate": 0.00047226834276371457, + "loss": 0.0474, + "num_input_tokens_seen": 149850688, + "step": 69380 + }, + { + "epoch": 11.318923327895595, + "grad_norm": 0.04705966264009476, + "learning_rate": 0.00047219727278756033, + "loss": 0.0116, + "num_input_tokens_seen": 149861152, + "step": 69385 + }, + { + "epoch": 11.31973898858075, + "grad_norm": 0.3665064871311188, + "learning_rate": 0.0004721262033748639, + "loss": 0.1215, + "num_input_tokens_seen": 149870592, + "step": 69390 + }, + { + "epoch": 11.320554649265905, + "grad_norm": 0.0022680433467030525, + "learning_rate": 0.00047205513452706503, + "loss": 0.0041, + "num_input_tokens_seen": 149880736, + "step": 69395 + }, + { + "epoch": 11.32137030995106, + "grad_norm": 0.4094318747520447, + "learning_rate": 0.0004719840662456046, + "loss": 0.0873, + "num_input_tokens_seen": 149890912, + "step": 69400 + }, + { + "epoch": 11.322185970636216, + "grad_norm": 0.9223697781562805, + "learning_rate": 0.0004719129985319223, + "loss": 0.0964, + "num_input_tokens_seen": 149901024, + "step": 69405 + }, + { + "epoch": 11.32300163132137, + "grad_norm": 0.004950478672981262, + "learning_rate": 0.0004718419313874589, + "loss": 0.0312, + "num_input_tokens_seen": 149912352, + "step": 69410 + }, + { + "epoch": 11.323817292006526, + "grad_norm": 0.04885503649711609, + "learning_rate": 0.00047177086481365444, + "loss": 0.0061, + "num_input_tokens_seen": 149923072, + "step": 69415 + }, + { + "epoch": 11.32463295269168, + "grad_norm": 0.021326279267668724, + "learning_rate": 0.00047169979881194927, + "loss": 0.0059, + "num_input_tokens_seen": 149933248, + "step": 69420 + }, + { + "epoch": 11.325448613376835, + "grad_norm": 0.18451477587223053, + "learning_rate": 0.00047162873338378353, + "loss": 0.0136, + "num_input_tokens_seen": 149943392, + "step": 69425 + }, + { + "epoch": 11.326264274061991, + "grad_norm": 0.3547409176826477, + "learning_rate": 0.0004715576685305975, + "loss": 0.0394, + "num_input_tokens_seen": 149954240, + "step": 69430 + }, + { + "epoch": 11.327079934747145, + "grad_norm": 0.02587219700217247, + "learning_rate": 0.0004714866042538313, + "loss": 0.0062, + "num_input_tokens_seen": 149964352, + "step": 69435 + }, + { + "epoch": 11.3278955954323, + "grad_norm": 0.11461975425481796, + "learning_rate": 0.00047141554055492546, + "loss": 0.0079, + "num_input_tokens_seen": 149975680, + "step": 69440 + }, + { + "epoch": 11.328711256117455, + "grad_norm": 0.18348151445388794, + "learning_rate": 0.0004713444774353197, + "loss": 0.0975, + "num_input_tokens_seen": 149987136, + "step": 69445 + }, + { + "epoch": 11.32952691680261, + "grad_norm": 0.039701469242572784, + "learning_rate": 0.0004712734148964547, + "loss": 0.0123, + "num_input_tokens_seen": 149998624, + "step": 69450 + }, + { + "epoch": 11.330342577487766, + "grad_norm": 0.00447084940969944, + "learning_rate": 0.00047120235293977023, + "loss": 0.0045, + "num_input_tokens_seen": 150009280, + "step": 69455 + }, + { + "epoch": 11.33115823817292, + "grad_norm": 0.14754629135131836, + "learning_rate": 0.00047113129156670677, + "loss": 0.0187, + "num_input_tokens_seen": 150019360, + "step": 69460 + }, + { + "epoch": 11.331973898858076, + "grad_norm": 0.0007346358615905046, + "learning_rate": 0.00047106023077870407, + "loss": 0.0088, + "num_input_tokens_seen": 150030336, + "step": 69465 + }, + { + "epoch": 11.33278955954323, + "grad_norm": 0.09433241188526154, + "learning_rate": 0.00047098917057720275, + "loss": 0.0178, + "num_input_tokens_seen": 150040288, + "step": 69470 + }, + { + "epoch": 11.333605220228385, + "grad_norm": 0.149391308426857, + "learning_rate": 0.00047091811096364243, + "loss": 0.0079, + "num_input_tokens_seen": 150051296, + "step": 69475 + }, + { + "epoch": 11.33442088091354, + "grad_norm": 0.0016867019003257155, + "learning_rate": 0.00047084705193946357, + "loss": 0.004, + "num_input_tokens_seen": 150062656, + "step": 69480 + }, + { + "epoch": 11.335236541598695, + "grad_norm": 0.006028663367033005, + "learning_rate": 0.0004707759935061063, + "loss": 0.0098, + "num_input_tokens_seen": 150073792, + "step": 69485 + }, + { + "epoch": 11.33605220228385, + "grad_norm": 0.015563595108687878, + "learning_rate": 0.0004707049356650105, + "loss": 0.0032, + "num_input_tokens_seen": 150085088, + "step": 69490 + }, + { + "epoch": 11.336867862969005, + "grad_norm": 0.9245015382766724, + "learning_rate": 0.0004706338784176165, + "loss": 0.0306, + "num_input_tokens_seen": 150096640, + "step": 69495 + }, + { + "epoch": 11.33768352365416, + "grad_norm": 0.1057695597410202, + "learning_rate": 0.000470562821765364, + "loss": 0.0087, + "num_input_tokens_seen": 150106656, + "step": 69500 + }, + { + "epoch": 11.338499184339314, + "grad_norm": 0.7162270545959473, + "learning_rate": 0.0004704917657096934, + "loss": 0.0747, + "num_input_tokens_seen": 150117056, + "step": 69505 + }, + { + "epoch": 11.33931484502447, + "grad_norm": 0.08723993599414825, + "learning_rate": 0.00047042071025204445, + "loss": 0.0063, + "num_input_tokens_seen": 150128832, + "step": 69510 + }, + { + "epoch": 11.340130505709626, + "grad_norm": 0.09651493281126022, + "learning_rate": 0.0004703496553938576, + "loss": 0.0111, + "num_input_tokens_seen": 150139168, + "step": 69515 + }, + { + "epoch": 11.34094616639478, + "grad_norm": 0.023214256390929222, + "learning_rate": 0.00047027860113657235, + "loss": 0.1542, + "num_input_tokens_seen": 150149344, + "step": 69520 + }, + { + "epoch": 11.341761827079935, + "grad_norm": 0.01802109181880951, + "learning_rate": 0.00047020754748162914, + "loss": 0.0138, + "num_input_tokens_seen": 150160032, + "step": 69525 + }, + { + "epoch": 11.34257748776509, + "grad_norm": 0.467803031206131, + "learning_rate": 0.0004701364944304675, + "loss": 0.0278, + "num_input_tokens_seen": 150171264, + "step": 69530 + }, + { + "epoch": 11.343393148450245, + "grad_norm": 0.018711155280470848, + "learning_rate": 0.000470065441984528, + "loss": 0.0039, + "num_input_tokens_seen": 150181696, + "step": 69535 + }, + { + "epoch": 11.3442088091354, + "grad_norm": 0.0073052081279456615, + "learning_rate": 0.00046999439014525004, + "loss": 0.0077, + "num_input_tokens_seen": 150192352, + "step": 69540 + }, + { + "epoch": 11.345024469820554, + "grad_norm": 0.0033433528151363134, + "learning_rate": 0.00046992333891407396, + "loss": 0.0414, + "num_input_tokens_seen": 150203392, + "step": 69545 + }, + { + "epoch": 11.34584013050571, + "grad_norm": 0.13693156838417053, + "learning_rate": 0.00046985228829243955, + "loss": 0.0771, + "num_input_tokens_seen": 150214144, + "step": 69550 + }, + { + "epoch": 11.346655791190864, + "grad_norm": 0.15868552029132843, + "learning_rate": 0.0004697812382817868, + "loss": 0.0192, + "num_input_tokens_seen": 150225664, + "step": 69555 + }, + { + "epoch": 11.34747145187602, + "grad_norm": 0.22203922271728516, + "learning_rate": 0.0004697101888835555, + "loss": 0.0222, + "num_input_tokens_seen": 150237728, + "step": 69560 + }, + { + "epoch": 11.348287112561174, + "grad_norm": 0.22300738096237183, + "learning_rate": 0.0004696391400991857, + "loss": 0.0206, + "num_input_tokens_seen": 150249056, + "step": 69565 + }, + { + "epoch": 11.34910277324633, + "grad_norm": 2.7132036685943604, + "learning_rate": 0.0004695680919301173, + "loss": 0.0948, + "num_input_tokens_seen": 150259872, + "step": 69570 + }, + { + "epoch": 11.349918433931485, + "grad_norm": 0.6747056245803833, + "learning_rate": 0.00046949704437779005, + "loss": 0.261, + "num_input_tokens_seen": 150269312, + "step": 69575 + }, + { + "epoch": 11.350734094616639, + "grad_norm": 0.04295853152871132, + "learning_rate": 0.0004694259974436438, + "loss": 0.0082, + "num_input_tokens_seen": 150280064, + "step": 69580 + }, + { + "epoch": 11.351549755301795, + "grad_norm": 0.08870097249746323, + "learning_rate": 0.00046935495112911856, + "loss": 0.0273, + "num_input_tokens_seen": 150290784, + "step": 69585 + }, + { + "epoch": 11.352365415986949, + "grad_norm": 0.002641193335875869, + "learning_rate": 0.0004692839054356542, + "loss": 0.0031, + "num_input_tokens_seen": 150301248, + "step": 69590 + }, + { + "epoch": 11.353181076672104, + "grad_norm": 0.015244108624756336, + "learning_rate": 0.0004692128603646904, + "loss": 0.0053, + "num_input_tokens_seen": 150312160, + "step": 69595 + }, + { + "epoch": 11.35399673735726, + "grad_norm": 0.00676583731546998, + "learning_rate": 0.0004691418159176671, + "loss": 0.0158, + "num_input_tokens_seen": 150322336, + "step": 69600 + }, + { + "epoch": 11.354812398042414, + "grad_norm": 0.11018446087837219, + "learning_rate": 0.00046907077209602387, + "loss": 0.0472, + "num_input_tokens_seen": 150332800, + "step": 69605 + }, + { + "epoch": 11.35562805872757, + "grad_norm": 0.013826750218868256, + "learning_rate": 0.0004689997289012009, + "loss": 0.0206, + "num_input_tokens_seen": 150343488, + "step": 69610 + }, + { + "epoch": 11.356443719412724, + "grad_norm": 0.014815778471529484, + "learning_rate": 0.0004689286863346376, + "loss": 0.0666, + "num_input_tokens_seen": 150355200, + "step": 69615 + }, + { + "epoch": 11.35725938009788, + "grad_norm": 0.06837964802980423, + "learning_rate": 0.00046885764439777406, + "loss": 0.2133, + "num_input_tokens_seen": 150365632, + "step": 69620 + }, + { + "epoch": 11.358075040783035, + "grad_norm": 0.08865787088871002, + "learning_rate": 0.0004687866030920496, + "loss": 0.0067, + "num_input_tokens_seen": 150376512, + "step": 69625 + }, + { + "epoch": 11.358890701468189, + "grad_norm": 0.005196568556129932, + "learning_rate": 0.00046871556241890455, + "loss": 0.1446, + "num_input_tokens_seen": 150385184, + "step": 69630 + }, + { + "epoch": 11.359706362153345, + "grad_norm": 0.046573054045438766, + "learning_rate": 0.000468644522379778, + "loss": 0.0273, + "num_input_tokens_seen": 150396384, + "step": 69635 + }, + { + "epoch": 11.360522022838499, + "grad_norm": 0.35547757148742676, + "learning_rate": 0.00046857348297611024, + "loss": 0.2188, + "num_input_tokens_seen": 150407712, + "step": 69640 + }, + { + "epoch": 11.361337683523654, + "grad_norm": 0.014049242250621319, + "learning_rate": 0.0004685024442093405, + "loss": 0.0876, + "num_input_tokens_seen": 150418016, + "step": 69645 + }, + { + "epoch": 11.362153344208808, + "grad_norm": 0.03289031237363815, + "learning_rate": 0.00046843140608090897, + "loss": 0.017, + "num_input_tokens_seen": 150427840, + "step": 69650 + }, + { + "epoch": 11.362969004893964, + "grad_norm": 0.33571213483810425, + "learning_rate": 0.0004683603685922547, + "loss": 0.0994, + "num_input_tokens_seen": 150439040, + "step": 69655 + }, + { + "epoch": 11.36378466557912, + "grad_norm": 0.01751234009861946, + "learning_rate": 0.00046828933174481797, + "loss": 0.1699, + "num_input_tokens_seen": 150450080, + "step": 69660 + }, + { + "epoch": 11.364600326264274, + "grad_norm": 0.03534317389130592, + "learning_rate": 0.000468218295540038, + "loss": 0.0356, + "num_input_tokens_seen": 150461728, + "step": 69665 + }, + { + "epoch": 11.36541598694943, + "grad_norm": 0.037167083472013474, + "learning_rate": 0.0004681472599793547, + "loss": 0.0182, + "num_input_tokens_seen": 150472288, + "step": 69670 + }, + { + "epoch": 11.366231647634583, + "grad_norm": 0.045636508613824844, + "learning_rate": 0.00046807622506420745, + "loss": 0.0124, + "num_input_tokens_seen": 150482048, + "step": 69675 + }, + { + "epoch": 11.367047308319739, + "grad_norm": 0.020483041182160378, + "learning_rate": 0.00046800519079603616, + "loss": 0.004, + "num_input_tokens_seen": 150492928, + "step": 69680 + }, + { + "epoch": 11.367862969004895, + "grad_norm": 0.06380794942378998, + "learning_rate": 0.00046793415717628006, + "loss": 0.0081, + "num_input_tokens_seen": 150504448, + "step": 69685 + }, + { + "epoch": 11.368678629690049, + "grad_norm": 0.006169379223138094, + "learning_rate": 0.000467863124206379, + "loss": 0.0785, + "num_input_tokens_seen": 150514880, + "step": 69690 + }, + { + "epoch": 11.369494290375204, + "grad_norm": 0.004469654988497496, + "learning_rate": 0.0004677920918877726, + "loss": 0.1212, + "num_input_tokens_seen": 150525984, + "step": 69695 + }, + { + "epoch": 11.370309951060358, + "grad_norm": 0.009194393642246723, + "learning_rate": 0.0004677210602219002, + "loss": 0.0221, + "num_input_tokens_seen": 150536352, + "step": 69700 + }, + { + "epoch": 11.371125611745514, + "grad_norm": 0.014019605703651905, + "learning_rate": 0.00046765002921020165, + "loss": 0.0168, + "num_input_tokens_seen": 150547040, + "step": 69705 + }, + { + "epoch": 11.37194127243067, + "grad_norm": 0.008166109211742878, + "learning_rate": 0.0004675789988541161, + "loss": 0.0146, + "num_input_tokens_seen": 150557920, + "step": 69710 + }, + { + "epoch": 11.372756933115824, + "grad_norm": 0.029136566445231438, + "learning_rate": 0.0004675079691550833, + "loss": 0.0126, + "num_input_tokens_seen": 150568832, + "step": 69715 + }, + { + "epoch": 11.37357259380098, + "grad_norm": 0.0021976104471832514, + "learning_rate": 0.0004674369401145428, + "loss": 0.0731, + "num_input_tokens_seen": 150579232, + "step": 69720 + }, + { + "epoch": 11.374388254486133, + "grad_norm": 0.0005827890709042549, + "learning_rate": 0.000467365911733934, + "loss": 0.0138, + "num_input_tokens_seen": 150590656, + "step": 69725 + }, + { + "epoch": 11.375203915171289, + "grad_norm": 0.017050622031092644, + "learning_rate": 0.0004672948840146964, + "loss": 0.0246, + "num_input_tokens_seen": 150600992, + "step": 69730 + }, + { + "epoch": 11.376019575856443, + "grad_norm": 0.07799620926380157, + "learning_rate": 0.0004672238569582695, + "loss": 0.0096, + "num_input_tokens_seen": 150612512, + "step": 69735 + }, + { + "epoch": 11.376835236541599, + "grad_norm": 0.008146431297063828, + "learning_rate": 0.00046715283056609255, + "loss": 0.0073, + "num_input_tokens_seen": 150622976, + "step": 69740 + }, + { + "epoch": 11.377650897226754, + "grad_norm": 0.01783410832285881, + "learning_rate": 0.0004670818048396054, + "loss": 0.0123, + "num_input_tokens_seen": 150633792, + "step": 69745 + }, + { + "epoch": 11.378466557911908, + "grad_norm": 0.22305737435817719, + "learning_rate": 0.00046701077978024695, + "loss": 0.0305, + "num_input_tokens_seen": 150644864, + "step": 69750 + }, + { + "epoch": 11.379282218597064, + "grad_norm": 0.012764952145516872, + "learning_rate": 0.0004669397553894572, + "loss": 0.0759, + "num_input_tokens_seen": 150654912, + "step": 69755 + }, + { + "epoch": 11.380097879282218, + "grad_norm": 0.07222677022218704, + "learning_rate": 0.00046686873166867503, + "loss": 0.0307, + "num_input_tokens_seen": 150665632, + "step": 69760 + }, + { + "epoch": 11.380913539967374, + "grad_norm": 0.007756201084703207, + "learning_rate": 0.00046679770861934026, + "loss": 0.0233, + "num_input_tokens_seen": 150676576, + "step": 69765 + }, + { + "epoch": 11.38172920065253, + "grad_norm": 0.001642027753405273, + "learning_rate": 0.00046672668624289177, + "loss": 0.0241, + "num_input_tokens_seen": 150687936, + "step": 69770 + }, + { + "epoch": 11.382544861337683, + "grad_norm": 0.005967188626527786, + "learning_rate": 0.0004666556645407695, + "loss": 0.0062, + "num_input_tokens_seen": 150699808, + "step": 69775 + }, + { + "epoch": 11.383360522022839, + "grad_norm": 0.24380135536193848, + "learning_rate": 0.00046658464351441214, + "loss": 0.0123, + "num_input_tokens_seen": 150710304, + "step": 69780 + }, + { + "epoch": 11.384176182707993, + "grad_norm": 0.04066390171647072, + "learning_rate": 0.0004665136231652597, + "loss": 0.0122, + "num_input_tokens_seen": 150721696, + "step": 69785 + }, + { + "epoch": 11.384991843393149, + "grad_norm": 0.004151922184973955, + "learning_rate": 0.0004664426034947509, + "loss": 0.0107, + "num_input_tokens_seen": 150732832, + "step": 69790 + }, + { + "epoch": 11.385807504078304, + "grad_norm": 0.3790360987186432, + "learning_rate": 0.00046637158450432557, + "loss": 0.1486, + "num_input_tokens_seen": 150743168, + "step": 69795 + }, + { + "epoch": 11.386623164763458, + "grad_norm": 0.4907276928424835, + "learning_rate": 0.0004663005661954225, + "loss": 0.0318, + "num_input_tokens_seen": 150753984, + "step": 69800 + }, + { + "epoch": 11.387438825448614, + "grad_norm": 0.004496270790696144, + "learning_rate": 0.0004662295485694812, + "loss": 0.0028, + "num_input_tokens_seen": 150763840, + "step": 69805 + }, + { + "epoch": 11.388254486133768, + "grad_norm": 0.013045874424278736, + "learning_rate": 0.00046615853162794115, + "loss": 0.0179, + "num_input_tokens_seen": 150774016, + "step": 69810 + }, + { + "epoch": 11.389070146818923, + "grad_norm": 0.12476672232151031, + "learning_rate": 0.00046608751537224115, + "loss": 0.0283, + "num_input_tokens_seen": 150785024, + "step": 69815 + }, + { + "epoch": 11.38988580750408, + "grad_norm": 0.000895587436389178, + "learning_rate": 0.0004660164998038209, + "loss": 0.1608, + "num_input_tokens_seen": 150795264, + "step": 69820 + }, + { + "epoch": 11.390701468189233, + "grad_norm": 0.003991547040641308, + "learning_rate": 0.0004659454849241192, + "loss": 0.0028, + "num_input_tokens_seen": 150805440, + "step": 69825 + }, + { + "epoch": 11.391517128874389, + "grad_norm": 0.001991401193663478, + "learning_rate": 0.0004658744707345757, + "loss": 0.0102, + "num_input_tokens_seen": 150816064, + "step": 69830 + }, + { + "epoch": 11.392332789559543, + "grad_norm": 0.025201058015227318, + "learning_rate": 0.000465803457236629, + "loss": 0.0161, + "num_input_tokens_seen": 150828000, + "step": 69835 + }, + { + "epoch": 11.393148450244698, + "grad_norm": 0.002868784824386239, + "learning_rate": 0.00046573244443171897, + "loss": 0.022, + "num_input_tokens_seen": 150838912, + "step": 69840 + }, + { + "epoch": 11.393964110929852, + "grad_norm": 0.0028491260018199682, + "learning_rate": 0.00046566143232128416, + "loss": 0.027, + "num_input_tokens_seen": 150850976, + "step": 69845 + }, + { + "epoch": 11.394779771615008, + "grad_norm": 0.003991770092397928, + "learning_rate": 0.0004655904209067642, + "loss": 0.006, + "num_input_tokens_seen": 150861216, + "step": 69850 + }, + { + "epoch": 11.395595432300164, + "grad_norm": 0.08987481147050858, + "learning_rate": 0.0004655194101895978, + "loss": 0.0087, + "num_input_tokens_seen": 150871808, + "step": 69855 + }, + { + "epoch": 11.396411092985318, + "grad_norm": 0.12651516497135162, + "learning_rate": 0.00046544840017122437, + "loss": 0.1577, + "num_input_tokens_seen": 150882432, + "step": 69860 + }, + { + "epoch": 11.397226753670473, + "grad_norm": 0.38407793641090393, + "learning_rate": 0.000465377390853083, + "loss": 0.0731, + "num_input_tokens_seen": 150894016, + "step": 69865 + }, + { + "epoch": 11.398042414355627, + "grad_norm": 0.5415304899215698, + "learning_rate": 0.0004653063822366127, + "loss": 0.0491, + "num_input_tokens_seen": 150904384, + "step": 69870 + }, + { + "epoch": 11.398858075040783, + "grad_norm": 0.001830374007113278, + "learning_rate": 0.00046523537432325256, + "loss": 0.0782, + "num_input_tokens_seen": 150915328, + "step": 69875 + }, + { + "epoch": 11.399673735725939, + "grad_norm": 0.1316104382276535, + "learning_rate": 0.00046516436711444166, + "loss": 0.0082, + "num_input_tokens_seen": 150927264, + "step": 69880 + }, + { + "epoch": 11.400489396411093, + "grad_norm": 0.005751550663262606, + "learning_rate": 0.000465093360611619, + "loss": 0.0139, + "num_input_tokens_seen": 150938752, + "step": 69885 + }, + { + "epoch": 11.401305057096248, + "grad_norm": 0.019678879529237747, + "learning_rate": 0.00046502235481622387, + "loss": 0.005, + "num_input_tokens_seen": 150949312, + "step": 69890 + }, + { + "epoch": 11.402120717781402, + "grad_norm": 0.03236980736255646, + "learning_rate": 0.00046495134972969476, + "loss": 0.0759, + "num_input_tokens_seen": 150959488, + "step": 69895 + }, + { + "epoch": 11.402936378466558, + "grad_norm": 0.11323463916778564, + "learning_rate": 0.00046488034535347133, + "loss": 0.0246, + "num_input_tokens_seen": 150971264, + "step": 69900 + }, + { + "epoch": 11.403752039151712, + "grad_norm": 0.011170793324708939, + "learning_rate": 0.00046480934168899204, + "loss": 0.007, + "num_input_tokens_seen": 150982688, + "step": 69905 + }, + { + "epoch": 11.404567699836868, + "grad_norm": 0.22744742035865784, + "learning_rate": 0.0004647383387376961, + "loss": 0.0193, + "num_input_tokens_seen": 150993504, + "step": 69910 + }, + { + "epoch": 11.405383360522023, + "grad_norm": 0.005999124143272638, + "learning_rate": 0.0004646673365010226, + "loss": 0.0046, + "num_input_tokens_seen": 151004256, + "step": 69915 + }, + { + "epoch": 11.406199021207177, + "grad_norm": 0.007814616896212101, + "learning_rate": 0.0004645963349804102, + "loss": 0.0679, + "num_input_tokens_seen": 151014560, + "step": 69920 + }, + { + "epoch": 11.407014681892333, + "grad_norm": 0.002219694433733821, + "learning_rate": 0.0004645253341772982, + "loss": 0.15, + "num_input_tokens_seen": 151026752, + "step": 69925 + }, + { + "epoch": 11.407830342577487, + "grad_norm": 0.0035255623515695333, + "learning_rate": 0.00046445433409312507, + "loss": 0.0028, + "num_input_tokens_seen": 151037792, + "step": 69930 + }, + { + "epoch": 11.408646003262643, + "grad_norm": 0.13510288298130035, + "learning_rate": 0.00046438333472933015, + "loss": 0.0644, + "num_input_tokens_seen": 151048832, + "step": 69935 + }, + { + "epoch": 11.409461663947798, + "grad_norm": 0.04429556801915169, + "learning_rate": 0.0004643123360873519, + "loss": 0.043, + "num_input_tokens_seen": 151059840, + "step": 69940 + }, + { + "epoch": 11.410277324632952, + "grad_norm": 0.26778528094291687, + "learning_rate": 0.00046424133816862966, + "loss": 0.0608, + "num_input_tokens_seen": 151071328, + "step": 69945 + }, + { + "epoch": 11.411092985318108, + "grad_norm": 0.031590599566698074, + "learning_rate": 0.00046417034097460193, + "loss": 0.0184, + "num_input_tokens_seen": 151081920, + "step": 69950 + }, + { + "epoch": 11.411908646003262, + "grad_norm": 0.41061243414878845, + "learning_rate": 0.0004640993445067078, + "loss": 0.1952, + "num_input_tokens_seen": 151093440, + "step": 69955 + }, + { + "epoch": 11.412724306688418, + "grad_norm": 0.011785534210503101, + "learning_rate": 0.00046402834876638584, + "loss": 0.0072, + "num_input_tokens_seen": 151104256, + "step": 69960 + }, + { + "epoch": 11.413539967373573, + "grad_norm": 0.0027609181124716997, + "learning_rate": 0.00046395735375507523, + "loss": 0.0053, + "num_input_tokens_seen": 151113856, + "step": 69965 + }, + { + "epoch": 11.414355628058727, + "grad_norm": 0.004242262803018093, + "learning_rate": 0.0004638863594742144, + "loss": 0.0083, + "num_input_tokens_seen": 151124928, + "step": 69970 + }, + { + "epoch": 11.415171288743883, + "grad_norm": 0.0068120453506708145, + "learning_rate": 0.00046381536592524244, + "loss": 0.0188, + "num_input_tokens_seen": 151134304, + "step": 69975 + }, + { + "epoch": 11.415986949429037, + "grad_norm": 0.43580371141433716, + "learning_rate": 0.00046374437310959783, + "loss": 0.089, + "num_input_tokens_seen": 151145600, + "step": 69980 + }, + { + "epoch": 11.416802610114193, + "grad_norm": 0.01116594672203064, + "learning_rate": 0.0004636733810287197, + "loss": 0.0091, + "num_input_tokens_seen": 151155872, + "step": 69985 + }, + { + "epoch": 11.417618270799348, + "grad_norm": 0.00956810638308525, + "learning_rate": 0.00046360238968404634, + "loss": 0.0033, + "num_input_tokens_seen": 151164960, + "step": 69990 + }, + { + "epoch": 11.418433931484502, + "grad_norm": 0.00572627317160368, + "learning_rate": 0.000463531399077017, + "loss": 0.0798, + "num_input_tokens_seen": 151176480, + "step": 69995 + }, + { + "epoch": 11.419249592169658, + "grad_norm": 0.9830451011657715, + "learning_rate": 0.00046346040920906985, + "loss": 0.0623, + "num_input_tokens_seen": 151187136, + "step": 70000 + }, + { + "epoch": 11.420065252854812, + "grad_norm": 0.03204105421900749, + "learning_rate": 0.000463389420081644, + "loss": 0.015, + "num_input_tokens_seen": 151197856, + "step": 70005 + }, + { + "epoch": 11.420880913539968, + "grad_norm": 0.20063550770282745, + "learning_rate": 0.000463318431696178, + "loss": 0.0212, + "num_input_tokens_seen": 151209280, + "step": 70010 + }, + { + "epoch": 11.421696574225122, + "grad_norm": 0.0036226080264896154, + "learning_rate": 0.00046324744405411034, + "loss": 0.0399, + "num_input_tokens_seen": 151219392, + "step": 70015 + }, + { + "epoch": 11.422512234910277, + "grad_norm": 0.005751903634518385, + "learning_rate": 0.00046317645715688015, + "loss": 0.2715, + "num_input_tokens_seen": 151230784, + "step": 70020 + }, + { + "epoch": 11.423327895595433, + "grad_norm": 0.23385001718997955, + "learning_rate": 0.00046310547100592557, + "loss": 0.1079, + "num_input_tokens_seen": 151241824, + "step": 70025 + }, + { + "epoch": 11.424143556280587, + "grad_norm": 0.0022833424154669046, + "learning_rate": 0.0004630344856026855, + "loss": 0.0047, + "num_input_tokens_seen": 151253280, + "step": 70030 + }, + { + "epoch": 11.424959216965743, + "grad_norm": 0.009258701466023922, + "learning_rate": 0.0004629635009485984, + "loss": 0.0887, + "num_input_tokens_seen": 151264864, + "step": 70035 + }, + { + "epoch": 11.425774877650896, + "grad_norm": 0.06533806771039963, + "learning_rate": 0.000462892517045103, + "loss": 0.0182, + "num_input_tokens_seen": 151275936, + "step": 70040 + }, + { + "epoch": 11.426590538336052, + "grad_norm": 0.039362918585538864, + "learning_rate": 0.0004628215338936378, + "loss": 0.0376, + "num_input_tokens_seen": 151286368, + "step": 70045 + }, + { + "epoch": 11.427406199021208, + "grad_norm": 0.20595617592334747, + "learning_rate": 0.0004627505514956414, + "loss": 0.0135, + "num_input_tokens_seen": 151296064, + "step": 70050 + }, + { + "epoch": 11.428221859706362, + "grad_norm": 0.05006346479058266, + "learning_rate": 0.0004626795698525522, + "loss": 0.0201, + "num_input_tokens_seen": 151306848, + "step": 70055 + }, + { + "epoch": 11.429037520391518, + "grad_norm": 0.18353790044784546, + "learning_rate": 0.00046260858896580916, + "loss": 0.1223, + "num_input_tokens_seen": 151317088, + "step": 70060 + }, + { + "epoch": 11.429853181076671, + "grad_norm": 0.1014518141746521, + "learning_rate": 0.0004625376088368502, + "loss": 0.0353, + "num_input_tokens_seen": 151328576, + "step": 70065 + }, + { + "epoch": 11.430668841761827, + "grad_norm": 0.003118648659437895, + "learning_rate": 0.0004624666294671143, + "loss": 0.006, + "num_input_tokens_seen": 151339424, + "step": 70070 + }, + { + "epoch": 11.431484502446983, + "grad_norm": 0.07444004714488983, + "learning_rate": 0.00046239565085803966, + "loss": 0.0096, + "num_input_tokens_seen": 151349696, + "step": 70075 + }, + { + "epoch": 11.432300163132137, + "grad_norm": 0.0086215203627944, + "learning_rate": 0.000462324673011065, + "loss": 0.0029, + "num_input_tokens_seen": 151361728, + "step": 70080 + }, + { + "epoch": 11.433115823817293, + "grad_norm": 0.005710784811526537, + "learning_rate": 0.00046225369592762844, + "loss": 0.0456, + "num_input_tokens_seen": 151372928, + "step": 70085 + }, + { + "epoch": 11.433931484502446, + "grad_norm": 0.003733087796717882, + "learning_rate": 0.00046218271960916886, + "loss": 0.016, + "num_input_tokens_seen": 151383072, + "step": 70090 + }, + { + "epoch": 11.434747145187602, + "grad_norm": 0.3411053419113159, + "learning_rate": 0.0004621117440571242, + "loss": 0.1267, + "num_input_tokens_seen": 151393952, + "step": 70095 + }, + { + "epoch": 11.435562805872756, + "grad_norm": 0.008186204358935356, + "learning_rate": 0.0004620407692729333, + "loss": 0.0087, + "num_input_tokens_seen": 151405376, + "step": 70100 + }, + { + "epoch": 11.436378466557912, + "grad_norm": 0.0025740989949554205, + "learning_rate": 0.0004619697952580342, + "loss": 0.0465, + "num_input_tokens_seen": 151415360, + "step": 70105 + }, + { + "epoch": 11.437194127243067, + "grad_norm": 0.011523899622261524, + "learning_rate": 0.00046189882201386564, + "loss": 0.0113, + "num_input_tokens_seen": 151424960, + "step": 70110 + }, + { + "epoch": 11.438009787928221, + "grad_norm": 0.05342297628521919, + "learning_rate": 0.0004618278495418655, + "loss": 0.0082, + "num_input_tokens_seen": 151436000, + "step": 70115 + }, + { + "epoch": 11.438825448613377, + "grad_norm": 0.001507753157056868, + "learning_rate": 0.0004617568778434725, + "loss": 0.004, + "num_input_tokens_seen": 151446944, + "step": 70120 + }, + { + "epoch": 11.439641109298531, + "grad_norm": 0.0007432901184074581, + "learning_rate": 0.0004616859069201251, + "loss": 0.0029, + "num_input_tokens_seen": 151457888, + "step": 70125 + }, + { + "epoch": 11.440456769983687, + "grad_norm": 0.0018831411143764853, + "learning_rate": 0.0004616149367732612, + "loss": 0.0091, + "num_input_tokens_seen": 151467904, + "step": 70130 + }, + { + "epoch": 11.441272430668842, + "grad_norm": 0.0827159658074379, + "learning_rate": 0.0004615439674043195, + "loss": 0.1162, + "num_input_tokens_seen": 151479744, + "step": 70135 + }, + { + "epoch": 11.442088091353996, + "grad_norm": 0.09129805862903595, + "learning_rate": 0.00046147299881473783, + "loss": 0.0245, + "num_input_tokens_seen": 151489760, + "step": 70140 + }, + { + "epoch": 11.442903752039152, + "grad_norm": 0.06670738756656647, + "learning_rate": 0.0004614020310059549, + "loss": 0.0095, + "num_input_tokens_seen": 151500640, + "step": 70145 + }, + { + "epoch": 11.443719412724306, + "grad_norm": 0.036870285868644714, + "learning_rate": 0.0004613310639794086, + "loss": 0.0096, + "num_input_tokens_seen": 151511200, + "step": 70150 + }, + { + "epoch": 11.444535073409462, + "grad_norm": 0.042107485234737396, + "learning_rate": 0.0004612600977365376, + "loss": 0.0109, + "num_input_tokens_seen": 151521632, + "step": 70155 + }, + { + "epoch": 11.445350734094617, + "grad_norm": 0.02666345052421093, + "learning_rate": 0.0004611891322787796, + "loss": 0.0095, + "num_input_tokens_seen": 151531904, + "step": 70160 + }, + { + "epoch": 11.446166394779771, + "grad_norm": 0.022787848487496376, + "learning_rate": 0.0004611181676075734, + "loss": 0.0058, + "num_input_tokens_seen": 151542816, + "step": 70165 + }, + { + "epoch": 11.446982055464927, + "grad_norm": 0.010411875322461128, + "learning_rate": 0.00046104720372435647, + "loss": 0.0078, + "num_input_tokens_seen": 151553312, + "step": 70170 + }, + { + "epoch": 11.447797716150081, + "grad_norm": 0.014083120971918106, + "learning_rate": 0.0004609762406305676, + "loss": 0.006, + "num_input_tokens_seen": 151562112, + "step": 70175 + }, + { + "epoch": 11.448613376835237, + "grad_norm": 0.008514699526131153, + "learning_rate": 0.0004609052783276447, + "loss": 0.0074, + "num_input_tokens_seen": 151571168, + "step": 70180 + }, + { + "epoch": 11.449429037520392, + "grad_norm": 0.07610338926315308, + "learning_rate": 0.0004608343168170259, + "loss": 0.0048, + "num_input_tokens_seen": 151580352, + "step": 70185 + }, + { + "epoch": 11.450244698205546, + "grad_norm": 0.01380261592566967, + "learning_rate": 0.0004607633561001493, + "loss": 0.0047, + "num_input_tokens_seen": 151591360, + "step": 70190 + }, + { + "epoch": 11.451060358890702, + "grad_norm": 0.0032468524295836687, + "learning_rate": 0.0004606923961784532, + "loss": 0.0021, + "num_input_tokens_seen": 151602368, + "step": 70195 + }, + { + "epoch": 11.451876019575856, + "grad_norm": 0.000461250776425004, + "learning_rate": 0.00046062143705337535, + "loss": 0.0027, + "num_input_tokens_seen": 151613504, + "step": 70200 + }, + { + "epoch": 11.452691680261012, + "grad_norm": 0.009105556644499302, + "learning_rate": 0.00046055047872635424, + "loss": 0.0024, + "num_input_tokens_seen": 151624160, + "step": 70205 + }, + { + "epoch": 11.453507340946166, + "grad_norm": 0.005286885425448418, + "learning_rate": 0.0004604795211988275, + "loss": 0.1602, + "num_input_tokens_seen": 151632672, + "step": 70210 + }, + { + "epoch": 11.454323001631321, + "grad_norm": 0.0044653876684606075, + "learning_rate": 0.00046040856447223375, + "loss": 0.0645, + "num_input_tokens_seen": 151644672, + "step": 70215 + }, + { + "epoch": 11.455138662316477, + "grad_norm": 0.0038570996839553118, + "learning_rate": 0.00046033760854801033, + "loss": 0.0166, + "num_input_tokens_seen": 151655168, + "step": 70220 + }, + { + "epoch": 11.455954323001631, + "grad_norm": 0.4873116612434387, + "learning_rate": 0.0004602666534275956, + "loss": 0.0768, + "num_input_tokens_seen": 151666080, + "step": 70225 + }, + { + "epoch": 11.456769983686787, + "grad_norm": 0.011731144040822983, + "learning_rate": 0.0004601956991124278, + "loss": 0.0027, + "num_input_tokens_seen": 151676864, + "step": 70230 + }, + { + "epoch": 11.45758564437194, + "grad_norm": 0.1812438666820526, + "learning_rate": 0.00046012474560394443, + "loss": 0.0688, + "num_input_tokens_seen": 151688768, + "step": 70235 + }, + { + "epoch": 11.458401305057096, + "grad_norm": 0.11268593370914459, + "learning_rate": 0.00046005379290358386, + "loss": 0.0731, + "num_input_tokens_seen": 151699200, + "step": 70240 + }, + { + "epoch": 11.459216965742252, + "grad_norm": 0.004744227509945631, + "learning_rate": 0.00045998284101278367, + "loss": 0.0363, + "num_input_tokens_seen": 151711168, + "step": 70245 + }, + { + "epoch": 11.460032626427406, + "grad_norm": 0.016108961775898933, + "learning_rate": 0.0004599118899329821, + "loss": 0.0137, + "num_input_tokens_seen": 151722208, + "step": 70250 + }, + { + "epoch": 11.460848287112562, + "grad_norm": 0.015514836646616459, + "learning_rate": 0.0004598409396656168, + "loss": 0.0761, + "num_input_tokens_seen": 151732256, + "step": 70255 + }, + { + "epoch": 11.461663947797716, + "grad_norm": 0.002084217732772231, + "learning_rate": 0.000459769990212126, + "loss": 0.0355, + "num_input_tokens_seen": 151742240, + "step": 70260 + }, + { + "epoch": 11.462479608482871, + "grad_norm": 0.009067544713616371, + "learning_rate": 0.0004596990415739472, + "loss": 0.0296, + "num_input_tokens_seen": 151752320, + "step": 70265 + }, + { + "epoch": 11.463295269168025, + "grad_norm": 0.42302384972572327, + "learning_rate": 0.0004596280937525186, + "loss": 0.0366, + "num_input_tokens_seen": 151763072, + "step": 70270 + }, + { + "epoch": 11.464110929853181, + "grad_norm": 0.0011174281826242805, + "learning_rate": 0.00045955714674927775, + "loss": 0.0465, + "num_input_tokens_seen": 151773088, + "step": 70275 + }, + { + "epoch": 11.464926590538337, + "grad_norm": 0.05450016260147095, + "learning_rate": 0.0004594862005656628, + "loss": 0.1505, + "num_input_tokens_seen": 151784992, + "step": 70280 + }, + { + "epoch": 11.46574225122349, + "grad_norm": 0.004575311206281185, + "learning_rate": 0.00045941525520311116, + "loss": 0.0564, + "num_input_tokens_seen": 151796288, + "step": 70285 + }, + { + "epoch": 11.466557911908646, + "grad_norm": 0.018959159031510353, + "learning_rate": 0.0004593443106630611, + "loss": 0.0244, + "num_input_tokens_seen": 151807904, + "step": 70290 + }, + { + "epoch": 11.4673735725938, + "grad_norm": 0.0019465818768367171, + "learning_rate": 0.00045927336694695, + "loss": 0.1312, + "num_input_tokens_seen": 151818592, + "step": 70295 + }, + { + "epoch": 11.468189233278956, + "grad_norm": 0.0028126207180321217, + "learning_rate": 0.00045920242405621595, + "loss": 0.0328, + "num_input_tokens_seen": 151827616, + "step": 70300 + }, + { + "epoch": 11.469004893964112, + "grad_norm": 0.00244735274463892, + "learning_rate": 0.0004591314819922963, + "loss": 0.0073, + "num_input_tokens_seen": 151838272, + "step": 70305 + }, + { + "epoch": 11.469820554649266, + "grad_norm": 0.010344883427023888, + "learning_rate": 0.0004590605407566292, + "loss": 0.0356, + "num_input_tokens_seen": 151848896, + "step": 70310 + }, + { + "epoch": 11.470636215334421, + "grad_norm": 0.07402225583791733, + "learning_rate": 0.00045898960035065204, + "loss": 0.0283, + "num_input_tokens_seen": 151861248, + "step": 70315 + }, + { + "epoch": 11.471451876019575, + "grad_norm": 0.5629965662956238, + "learning_rate": 0.00045891866077580267, + "loss": 0.0419, + "num_input_tokens_seen": 151872512, + "step": 70320 + }, + { + "epoch": 11.47226753670473, + "grad_norm": 0.027263466268777847, + "learning_rate": 0.0004588477220335188, + "loss": 0.0407, + "num_input_tokens_seen": 151883744, + "step": 70325 + }, + { + "epoch": 11.473083197389887, + "grad_norm": 0.4663159251213074, + "learning_rate": 0.000458776784125238, + "loss": 0.0163, + "num_input_tokens_seen": 151894592, + "step": 70330 + }, + { + "epoch": 11.47389885807504, + "grad_norm": 0.011494866572320461, + "learning_rate": 0.0004587058470523981, + "loss": 0.1729, + "num_input_tokens_seen": 151904224, + "step": 70335 + }, + { + "epoch": 11.474714518760196, + "grad_norm": 0.002531670266762376, + "learning_rate": 0.00045863491081643646, + "loss": 0.0069, + "num_input_tokens_seen": 151915232, + "step": 70340 + }, + { + "epoch": 11.47553017944535, + "grad_norm": 0.4217558801174164, + "learning_rate": 0.00045856397541879087, + "loss": 0.0437, + "num_input_tokens_seen": 151926784, + "step": 70345 + }, + { + "epoch": 11.476345840130506, + "grad_norm": 0.03666359931230545, + "learning_rate": 0.0004584930408608989, + "loss": 0.0105, + "num_input_tokens_seen": 151937184, + "step": 70350 + }, + { + "epoch": 11.477161500815662, + "grad_norm": 0.08825061470270157, + "learning_rate": 0.0004584221071441981, + "loss": 0.0575, + "num_input_tokens_seen": 151947808, + "step": 70355 + }, + { + "epoch": 11.477977161500815, + "grad_norm": 0.0489339604973793, + "learning_rate": 0.000458351174270126, + "loss": 0.0048, + "num_input_tokens_seen": 151958656, + "step": 70360 + }, + { + "epoch": 11.478792822185971, + "grad_norm": 0.08036676794290543, + "learning_rate": 0.00045828024224012025, + "loss": 0.1161, + "num_input_tokens_seen": 151969888, + "step": 70365 + }, + { + "epoch": 11.479608482871125, + "grad_norm": 0.009987418539822102, + "learning_rate": 0.00045820931105561817, + "loss": 0.0087, + "num_input_tokens_seen": 151980352, + "step": 70370 + }, + { + "epoch": 11.48042414355628, + "grad_norm": 1.6685467958450317, + "learning_rate": 0.0004581383807180577, + "loss": 0.0545, + "num_input_tokens_seen": 151991616, + "step": 70375 + }, + { + "epoch": 11.481239804241435, + "grad_norm": 0.09259206056594849, + "learning_rate": 0.0004580674512288758, + "loss": 0.1426, + "num_input_tokens_seen": 152002400, + "step": 70380 + }, + { + "epoch": 11.48205546492659, + "grad_norm": 0.001271451241336763, + "learning_rate": 0.0004579965225895104, + "loss": 0.0104, + "num_input_tokens_seen": 152013952, + "step": 70385 + }, + { + "epoch": 11.482871125611746, + "grad_norm": 0.0852578654885292, + "learning_rate": 0.00045792559480139854, + "loss": 0.0087, + "num_input_tokens_seen": 152025248, + "step": 70390 + }, + { + "epoch": 11.4836867862969, + "grad_norm": 0.03162388131022453, + "learning_rate": 0.0004578546678659781, + "loss": 0.115, + "num_input_tokens_seen": 152037536, + "step": 70395 + }, + { + "epoch": 11.484502446982056, + "grad_norm": 0.04780033975839615, + "learning_rate": 0.00045778374178468605, + "loss": 0.0536, + "num_input_tokens_seen": 152048480, + "step": 70400 + }, + { + "epoch": 11.48531810766721, + "grad_norm": 0.0038224325980991125, + "learning_rate": 0.0004577128165589603, + "loss": 0.0496, + "num_input_tokens_seen": 152059808, + "step": 70405 + }, + { + "epoch": 11.486133768352365, + "grad_norm": 0.031959887593984604, + "learning_rate": 0.0004576418921902377, + "loss": 0.0168, + "num_input_tokens_seen": 152070304, + "step": 70410 + }, + { + "epoch": 11.486949429037521, + "grad_norm": 0.006775837391614914, + "learning_rate": 0.0004575709686799561, + "loss": 0.0065, + "num_input_tokens_seen": 152081472, + "step": 70415 + }, + { + "epoch": 11.487765089722675, + "grad_norm": 0.237082839012146, + "learning_rate": 0.00045750004602955246, + "loss": 0.0241, + "num_input_tokens_seen": 152092512, + "step": 70420 + }, + { + "epoch": 11.48858075040783, + "grad_norm": 0.0024227348621934652, + "learning_rate": 0.0004574291242404645, + "loss": 0.0125, + "num_input_tokens_seen": 152103872, + "step": 70425 + }, + { + "epoch": 11.489396411092985, + "grad_norm": 0.4212666451931, + "learning_rate": 0.00045735820331412914, + "loss": 0.0137, + "num_input_tokens_seen": 152115648, + "step": 70430 + }, + { + "epoch": 11.49021207177814, + "grad_norm": 0.01448234636336565, + "learning_rate": 0.0004572872832519839, + "loss": 0.0084, + "num_input_tokens_seen": 152126368, + "step": 70435 + }, + { + "epoch": 11.491027732463296, + "grad_norm": 0.011855104938149452, + "learning_rate": 0.0004572163640554662, + "loss": 0.0058, + "num_input_tokens_seen": 152137760, + "step": 70440 + }, + { + "epoch": 11.49184339314845, + "grad_norm": 0.18611255288124084, + "learning_rate": 0.00045714544572601296, + "loss": 0.047, + "num_input_tokens_seen": 152149088, + "step": 70445 + }, + { + "epoch": 11.492659053833606, + "grad_norm": 0.0021328406874090433, + "learning_rate": 0.0004570745282650619, + "loss": 0.1562, + "num_input_tokens_seen": 152159904, + "step": 70450 + }, + { + "epoch": 11.49347471451876, + "grad_norm": 0.00876991543918848, + "learning_rate": 0.00045700361167404967, + "loss": 0.0285, + "num_input_tokens_seen": 152171712, + "step": 70455 + }, + { + "epoch": 11.494290375203915, + "grad_norm": 0.00328267109580338, + "learning_rate": 0.0004569326959544141, + "loss": 0.0684, + "num_input_tokens_seen": 152182912, + "step": 70460 + }, + { + "epoch": 11.49510603588907, + "grad_norm": 0.003475640434771776, + "learning_rate": 0.00045686178110759183, + "loss": 0.2062, + "num_input_tokens_seen": 152193760, + "step": 70465 + }, + { + "epoch": 11.495921696574225, + "grad_norm": 0.45626431703567505, + "learning_rate": 0.0004567908671350206, + "loss": 0.1644, + "num_input_tokens_seen": 152204288, + "step": 70470 + }, + { + "epoch": 11.49673735725938, + "grad_norm": 0.003484656335785985, + "learning_rate": 0.00045671995403813686, + "loss": 0.0197, + "num_input_tokens_seen": 152215328, + "step": 70475 + }, + { + "epoch": 11.497553017944535, + "grad_norm": 0.016352150589227676, + "learning_rate": 0.0004566490418183785, + "loss": 0.0053, + "num_input_tokens_seen": 152225408, + "step": 70480 + }, + { + "epoch": 11.49836867862969, + "grad_norm": 0.622386634349823, + "learning_rate": 0.00045657813047718203, + "loss": 0.0426, + "num_input_tokens_seen": 152235904, + "step": 70485 + }, + { + "epoch": 11.499184339314844, + "grad_norm": 0.12910671532154083, + "learning_rate": 0.000456507220015985, + "loss": 0.0196, + "num_input_tokens_seen": 152247424, + "step": 70490 + }, + { + "epoch": 11.5, + "grad_norm": 0.5777674317359924, + "learning_rate": 0.00045643631043622426, + "loss": 0.1849, + "num_input_tokens_seen": 152258624, + "step": 70495 + }, + { + "epoch": 11.500815660685156, + "grad_norm": 0.09449607133865356, + "learning_rate": 0.00045636540173933697, + "loss": 0.0105, + "num_input_tokens_seen": 152269760, + "step": 70500 + }, + { + "epoch": 11.50163132137031, + "grad_norm": 0.2962408661842346, + "learning_rate": 0.0004562944939267602, + "loss": 0.168, + "num_input_tokens_seen": 152280672, + "step": 70505 + }, + { + "epoch": 11.502446982055465, + "grad_norm": 0.4580059051513672, + "learning_rate": 0.00045622358699993093, + "loss": 0.0817, + "num_input_tokens_seen": 152291008, + "step": 70510 + }, + { + "epoch": 11.50326264274062, + "grad_norm": 0.07859183102846146, + "learning_rate": 0.00045615268096028613, + "loss": 0.0984, + "num_input_tokens_seen": 152301920, + "step": 70515 + }, + { + "epoch": 11.504078303425775, + "grad_norm": 0.019914044067263603, + "learning_rate": 0.0004560817758092631, + "loss": 0.028, + "num_input_tokens_seen": 152312256, + "step": 70520 + }, + { + "epoch": 11.50489396411093, + "grad_norm": 0.1658468097448349, + "learning_rate": 0.00045601087154829834, + "loss": 0.0312, + "num_input_tokens_seen": 152323104, + "step": 70525 + }, + { + "epoch": 11.505709624796085, + "grad_norm": 0.003438874613493681, + "learning_rate": 0.00045593996817882925, + "loss": 0.0346, + "num_input_tokens_seen": 152334432, + "step": 70530 + }, + { + "epoch": 11.50652528548124, + "grad_norm": 0.006378691643476486, + "learning_rate": 0.0004558690657022925, + "loss": 0.0845, + "num_input_tokens_seen": 152344224, + "step": 70535 + }, + { + "epoch": 11.507340946166394, + "grad_norm": 0.005127818323671818, + "learning_rate": 0.0004557981641201252, + "loss": 0.0707, + "num_input_tokens_seen": 152353824, + "step": 70540 + }, + { + "epoch": 11.50815660685155, + "grad_norm": 0.010716418735682964, + "learning_rate": 0.000455727263433764, + "loss": 0.0139, + "num_input_tokens_seen": 152364320, + "step": 70545 + }, + { + "epoch": 11.508972267536706, + "grad_norm": 0.01387725118547678, + "learning_rate": 0.000455656363644646, + "loss": 0.0109, + "num_input_tokens_seen": 152375808, + "step": 70550 + }, + { + "epoch": 11.50978792822186, + "grad_norm": 0.030756894499063492, + "learning_rate": 0.0004555854647542083, + "loss": 0.0246, + "num_input_tokens_seen": 152386912, + "step": 70555 + }, + { + "epoch": 11.510603588907015, + "grad_norm": 0.04727554693818092, + "learning_rate": 0.00045551456676388725, + "loss": 0.0825, + "num_input_tokens_seen": 152397984, + "step": 70560 + }, + { + "epoch": 11.51141924959217, + "grad_norm": 0.011782309971749783, + "learning_rate": 0.00045544366967512014, + "loss": 0.0468, + "num_input_tokens_seen": 152408704, + "step": 70565 + }, + { + "epoch": 11.512234910277325, + "grad_norm": 0.03985341265797615, + "learning_rate": 0.0004553727734893434, + "loss": 0.0423, + "num_input_tokens_seen": 152417984, + "step": 70570 + }, + { + "epoch": 11.513050570962479, + "grad_norm": 0.02030654065310955, + "learning_rate": 0.0004553018782079942, + "loss": 0.0136, + "num_input_tokens_seen": 152428640, + "step": 70575 + }, + { + "epoch": 11.513866231647635, + "grad_norm": 0.0016316096298396587, + "learning_rate": 0.00045523098383250894, + "loss": 0.1642, + "num_input_tokens_seen": 152439328, + "step": 70580 + }, + { + "epoch": 11.51468189233279, + "grad_norm": 0.6504743099212646, + "learning_rate": 0.0004551600903643248, + "loss": 0.1711, + "num_input_tokens_seen": 152450048, + "step": 70585 + }, + { + "epoch": 11.515497553017944, + "grad_norm": 0.034713149070739746, + "learning_rate": 0.00045508919780487805, + "loss": 0.025, + "num_input_tokens_seen": 152460320, + "step": 70590 + }, + { + "epoch": 11.5163132137031, + "grad_norm": 0.004274432547390461, + "learning_rate": 0.000455018306155606, + "loss": 0.0197, + "num_input_tokens_seen": 152471264, + "step": 70595 + }, + { + "epoch": 11.517128874388254, + "grad_norm": 0.004105722531676292, + "learning_rate": 0.0004549474154179447, + "loss": 0.0159, + "num_input_tokens_seen": 152482016, + "step": 70600 + }, + { + "epoch": 11.51794453507341, + "grad_norm": 0.10600485652685165, + "learning_rate": 0.0004548765255933315, + "loss": 0.0833, + "num_input_tokens_seen": 152493472, + "step": 70605 + }, + { + "epoch": 11.518760195758565, + "grad_norm": 0.011000092141330242, + "learning_rate": 0.00045480563668320244, + "loss": 0.014, + "num_input_tokens_seen": 152504704, + "step": 70610 + }, + { + "epoch": 11.51957585644372, + "grad_norm": 0.17460846900939941, + "learning_rate": 0.0004547347486889948, + "loss": 0.0521, + "num_input_tokens_seen": 152515872, + "step": 70615 + }, + { + "epoch": 11.520391517128875, + "grad_norm": 0.06345030665397644, + "learning_rate": 0.00045466386161214465, + "loss": 0.0101, + "num_input_tokens_seen": 152528192, + "step": 70620 + }, + { + "epoch": 11.521207177814029, + "grad_norm": 0.002006505150347948, + "learning_rate": 0.00045459297545408906, + "loss": 0.0123, + "num_input_tokens_seen": 152539264, + "step": 70625 + }, + { + "epoch": 11.522022838499185, + "grad_norm": 0.3534085750579834, + "learning_rate": 0.0004545220902162642, + "loss": 0.1996, + "num_input_tokens_seen": 152550560, + "step": 70630 + }, + { + "epoch": 11.522838499184338, + "grad_norm": 0.03179255872964859, + "learning_rate": 0.000454451205900107, + "loss": 0.0135, + "num_input_tokens_seen": 152561088, + "step": 70635 + }, + { + "epoch": 11.523654159869494, + "grad_norm": 0.006981425452977419, + "learning_rate": 0.00045438032250705394, + "loss": 0.0704, + "num_input_tokens_seen": 152572512, + "step": 70640 + }, + { + "epoch": 11.52446982055465, + "grad_norm": 0.024807730689644814, + "learning_rate": 0.00045430944003854143, + "loss": 0.0117, + "num_input_tokens_seen": 152583520, + "step": 70645 + }, + { + "epoch": 11.525285481239804, + "grad_norm": 0.008164377883076668, + "learning_rate": 0.00045423855849600615, + "loss": 0.0116, + "num_input_tokens_seen": 152594880, + "step": 70650 + }, + { + "epoch": 11.52610114192496, + "grad_norm": 0.4699248969554901, + "learning_rate": 0.00045416767788088435, + "loss": 0.0348, + "num_input_tokens_seen": 152604416, + "step": 70655 + }, + { + "epoch": 11.526916802610113, + "grad_norm": 0.04431702569127083, + "learning_rate": 0.00045409679819461286, + "loss": 0.0079, + "num_input_tokens_seen": 152615264, + "step": 70660 + }, + { + "epoch": 11.52773246329527, + "grad_norm": 0.039329253137111664, + "learning_rate": 0.000454025919438628, + "loss": 0.0114, + "num_input_tokens_seen": 152627072, + "step": 70665 + }, + { + "epoch": 11.528548123980425, + "grad_norm": 0.015786344185471535, + "learning_rate": 0.00045395504161436617, + "loss": 0.0249, + "num_input_tokens_seen": 152636704, + "step": 70670 + }, + { + "epoch": 11.529363784665579, + "grad_norm": 0.0025255740620195866, + "learning_rate": 0.0004538841647232639, + "loss": 0.0054, + "num_input_tokens_seen": 152647104, + "step": 70675 + }, + { + "epoch": 11.530179445350734, + "grad_norm": 0.011380949057638645, + "learning_rate": 0.0004538132887667574, + "loss": 0.0212, + "num_input_tokens_seen": 152657280, + "step": 70680 + }, + { + "epoch": 11.530995106035888, + "grad_norm": 0.018168801441788673, + "learning_rate": 0.0004537424137462832, + "loss": 0.0158, + "num_input_tokens_seen": 152667072, + "step": 70685 + }, + { + "epoch": 11.531810766721044, + "grad_norm": 0.552568793296814, + "learning_rate": 0.0004536715396632779, + "loss": 0.1466, + "num_input_tokens_seen": 152678336, + "step": 70690 + }, + { + "epoch": 11.5326264274062, + "grad_norm": 1.1314250230789185, + "learning_rate": 0.00045360066651917733, + "loss": 0.1199, + "num_input_tokens_seen": 152688128, + "step": 70695 + }, + { + "epoch": 11.533442088091354, + "grad_norm": 0.04501219838857651, + "learning_rate": 0.00045352979431541833, + "loss": 0.0052, + "num_input_tokens_seen": 152699872, + "step": 70700 + }, + { + "epoch": 11.53425774877651, + "grad_norm": 0.010922752320766449, + "learning_rate": 0.0004534589230534368, + "loss": 0.0111, + "num_input_tokens_seen": 152710880, + "step": 70705 + }, + { + "epoch": 11.535073409461663, + "grad_norm": 1.1566555500030518, + "learning_rate": 0.00045338805273466954, + "loss": 0.1121, + "num_input_tokens_seen": 152722272, + "step": 70710 + }, + { + "epoch": 11.535889070146819, + "grad_norm": 0.08548852056264877, + "learning_rate": 0.00045331718336055223, + "loss": 0.0773, + "num_input_tokens_seen": 152733568, + "step": 70715 + }, + { + "epoch": 11.536704730831975, + "grad_norm": 0.0486895889043808, + "learning_rate": 0.0004532463149325216, + "loss": 0.1141, + "num_input_tokens_seen": 152743488, + "step": 70720 + }, + { + "epoch": 11.537520391517129, + "grad_norm": 0.0107080964371562, + "learning_rate": 0.00045317544745201354, + "loss": 0.1, + "num_input_tokens_seen": 152754400, + "step": 70725 + }, + { + "epoch": 11.538336052202284, + "grad_norm": 0.06043444946408272, + "learning_rate": 0.00045310458092046464, + "loss": 0.035, + "num_input_tokens_seen": 152763648, + "step": 70730 + }, + { + "epoch": 11.539151712887438, + "grad_norm": 0.009252172894775867, + "learning_rate": 0.0004530337153393107, + "loss": 0.0772, + "num_input_tokens_seen": 152774784, + "step": 70735 + }, + { + "epoch": 11.539967373572594, + "grad_norm": 0.008908047340810299, + "learning_rate": 0.00045296285070998835, + "loss": 0.0091, + "num_input_tokens_seen": 152785024, + "step": 70740 + }, + { + "epoch": 11.540783034257748, + "grad_norm": 0.0023308051750063896, + "learning_rate": 0.0004528919870339332, + "loss": 0.019, + "num_input_tokens_seen": 152795872, + "step": 70745 + }, + { + "epoch": 11.541598694942904, + "grad_norm": 0.019828980788588524, + "learning_rate": 0.00045282112431258194, + "loss": 0.023, + "num_input_tokens_seen": 152804640, + "step": 70750 + }, + { + "epoch": 11.54241435562806, + "grad_norm": 0.047302018851041794, + "learning_rate": 0.00045275026254737027, + "loss": 0.0124, + "num_input_tokens_seen": 152815648, + "step": 70755 + }, + { + "epoch": 11.543230016313213, + "grad_norm": 0.005254935007542372, + "learning_rate": 0.0004526794017397344, + "loss": 0.0191, + "num_input_tokens_seen": 152828256, + "step": 70760 + }, + { + "epoch": 11.544045676998369, + "grad_norm": 0.012222270481288433, + "learning_rate": 0.0004526085418911108, + "loss": 0.0181, + "num_input_tokens_seen": 152838336, + "step": 70765 + }, + { + "epoch": 11.544861337683523, + "grad_norm": 0.0508912019431591, + "learning_rate": 0.0004525376830029349, + "loss": 0.1886, + "num_input_tokens_seen": 152847616, + "step": 70770 + }, + { + "epoch": 11.545676998368679, + "grad_norm": 0.0036810701712965965, + "learning_rate": 0.00045246682507664335, + "loss": 0.0209, + "num_input_tokens_seen": 152857984, + "step": 70775 + }, + { + "epoch": 11.546492659053834, + "grad_norm": 0.009106824174523354, + "learning_rate": 0.0004523959681136716, + "loss": 0.0314, + "num_input_tokens_seen": 152869280, + "step": 70780 + }, + { + "epoch": 11.547308319738988, + "grad_norm": 0.018032781779766083, + "learning_rate": 0.00045232511211545625, + "loss": 0.007, + "num_input_tokens_seen": 152879936, + "step": 70785 + }, + { + "epoch": 11.548123980424144, + "grad_norm": 0.17428778111934662, + "learning_rate": 0.0004522542570834327, + "loss": 0.0836, + "num_input_tokens_seen": 152891296, + "step": 70790 + }, + { + "epoch": 11.548939641109298, + "grad_norm": 0.004335889592766762, + "learning_rate": 0.0004521834030190375, + "loss": 0.0112, + "num_input_tokens_seen": 152903776, + "step": 70795 + }, + { + "epoch": 11.549755301794454, + "grad_norm": 0.020084548741579056, + "learning_rate": 0.000452112549923706, + "loss": 0.0251, + "num_input_tokens_seen": 152915232, + "step": 70800 + }, + { + "epoch": 11.550570962479608, + "grad_norm": 0.07404981553554535, + "learning_rate": 0.00045204169779887454, + "loss": 0.0116, + "num_input_tokens_seen": 152925632, + "step": 70805 + }, + { + "epoch": 11.551386623164763, + "grad_norm": 0.0036789614241570234, + "learning_rate": 0.0004519708466459789, + "loss": 0.0058, + "num_input_tokens_seen": 152936672, + "step": 70810 + }, + { + "epoch": 11.552202283849919, + "grad_norm": 0.004675018601119518, + "learning_rate": 0.0004518999964664551, + "loss": 0.0746, + "num_input_tokens_seen": 152947648, + "step": 70815 + }, + { + "epoch": 11.553017944535073, + "grad_norm": 0.031063973903656006, + "learning_rate": 0.0004518291472617387, + "loss": 0.0214, + "num_input_tokens_seen": 152959008, + "step": 70820 + }, + { + "epoch": 11.553833605220229, + "grad_norm": 0.006103998050093651, + "learning_rate": 0.00045175829903326594, + "loss": 0.0059, + "num_input_tokens_seen": 152969504, + "step": 70825 + }, + { + "epoch": 11.554649265905383, + "grad_norm": 0.001215186552144587, + "learning_rate": 0.0004516874517824722, + "loss": 0.0038, + "num_input_tokens_seen": 152979264, + "step": 70830 + }, + { + "epoch": 11.555464926590538, + "grad_norm": 0.0012902735034003854, + "learning_rate": 0.0004516166055107938, + "loss": 0.0212, + "num_input_tokens_seen": 152990496, + "step": 70835 + }, + { + "epoch": 11.556280587275694, + "grad_norm": 0.0009681761148385704, + "learning_rate": 0.00045154576021966605, + "loss": 0.0044, + "num_input_tokens_seen": 153001696, + "step": 70840 + }, + { + "epoch": 11.557096247960848, + "grad_norm": 0.5672785043716431, + "learning_rate": 0.00045147491591052515, + "loss": 0.1108, + "num_input_tokens_seen": 153011616, + "step": 70845 + }, + { + "epoch": 11.557911908646004, + "grad_norm": 0.183214470744133, + "learning_rate": 0.0004514040725848064, + "loss": 0.0455, + "num_input_tokens_seen": 153022848, + "step": 70850 + }, + { + "epoch": 11.558727569331158, + "grad_norm": 0.012468835338950157, + "learning_rate": 0.0004513332302439461, + "loss": 0.0065, + "num_input_tokens_seen": 153033280, + "step": 70855 + }, + { + "epoch": 11.559543230016313, + "grad_norm": 0.09847768396139145, + "learning_rate": 0.00045126238888937927, + "loss": 0.0173, + "num_input_tokens_seen": 153043584, + "step": 70860 + }, + { + "epoch": 11.560358890701469, + "grad_norm": 0.004849771969020367, + "learning_rate": 0.00045119154852254204, + "loss": 0.0074, + "num_input_tokens_seen": 153054080, + "step": 70865 + }, + { + "epoch": 11.561174551386623, + "grad_norm": 0.02971162274479866, + "learning_rate": 0.0004511207091448701, + "loss": 0.0201, + "num_input_tokens_seen": 153064128, + "step": 70870 + }, + { + "epoch": 11.561990212071779, + "grad_norm": 0.031622979789972305, + "learning_rate": 0.0004510498707577989, + "loss": 0.0153, + "num_input_tokens_seen": 153075328, + "step": 70875 + }, + { + "epoch": 11.562805872756933, + "grad_norm": 0.07961271703243256, + "learning_rate": 0.0004509790333627644, + "loss": 0.0087, + "num_input_tokens_seen": 153085600, + "step": 70880 + }, + { + "epoch": 11.563621533442088, + "grad_norm": 0.002815672429278493, + "learning_rate": 0.00045090819696120166, + "loss": 0.0111, + "num_input_tokens_seen": 153095744, + "step": 70885 + }, + { + "epoch": 11.564437194127244, + "grad_norm": 0.03652092441916466, + "learning_rate": 0.0004508373615545469, + "loss": 0.0065, + "num_input_tokens_seen": 153106624, + "step": 70890 + }, + { + "epoch": 11.565252854812398, + "grad_norm": 0.007009325083345175, + "learning_rate": 0.00045076652714423507, + "loss": 0.0029, + "num_input_tokens_seen": 153116800, + "step": 70895 + }, + { + "epoch": 11.566068515497554, + "grad_norm": 0.007585311774164438, + "learning_rate": 0.00045069569373170227, + "loss": 0.0016, + "num_input_tokens_seen": 153127008, + "step": 70900 + }, + { + "epoch": 11.566884176182707, + "grad_norm": 0.002985976403579116, + "learning_rate": 0.0004506248613183836, + "loss": 0.0264, + "num_input_tokens_seen": 153137376, + "step": 70905 + }, + { + "epoch": 11.567699836867863, + "grad_norm": 0.016019705682992935, + "learning_rate": 0.00045055402990571493, + "loss": 0.0053, + "num_input_tokens_seen": 153147264, + "step": 70910 + }, + { + "epoch": 11.568515497553017, + "grad_norm": 0.00407218374311924, + "learning_rate": 0.00045048319949513136, + "loss": 0.0052, + "num_input_tokens_seen": 153158816, + "step": 70915 + }, + { + "epoch": 11.569331158238173, + "grad_norm": 0.0016611559549346566, + "learning_rate": 0.0004504123700880688, + "loss": 0.0079, + "num_input_tokens_seen": 153169728, + "step": 70920 + }, + { + "epoch": 11.570146818923329, + "grad_norm": 0.02866864949464798, + "learning_rate": 0.00045034154168596224, + "loss": 0.006, + "num_input_tokens_seen": 153180704, + "step": 70925 + }, + { + "epoch": 11.570962479608482, + "grad_norm": 0.3860038220882416, + "learning_rate": 0.00045027071429024757, + "loss": 0.0218, + "num_input_tokens_seen": 153192320, + "step": 70930 + }, + { + "epoch": 11.571778140293638, + "grad_norm": 0.0035648008342832327, + "learning_rate": 0.00045019988790235974, + "loss": 0.0056, + "num_input_tokens_seen": 153203296, + "step": 70935 + }, + { + "epoch": 11.572593800978792, + "grad_norm": 0.5095841288566589, + "learning_rate": 0.0004501290625237345, + "loss": 0.0219, + "num_input_tokens_seen": 153213952, + "step": 70940 + }, + { + "epoch": 11.573409461663948, + "grad_norm": 0.004286561626940966, + "learning_rate": 0.00045005823815580696, + "loss": 0.0044, + "num_input_tokens_seen": 153225472, + "step": 70945 + }, + { + "epoch": 11.574225122349104, + "grad_norm": 0.002824546070769429, + "learning_rate": 0.00044998741480001264, + "loss": 0.0111, + "num_input_tokens_seen": 153235168, + "step": 70950 + }, + { + "epoch": 11.575040783034257, + "grad_norm": 0.00039686966920271516, + "learning_rate": 0.00044991659245778684, + "loss": 0.0033, + "num_input_tokens_seen": 153246112, + "step": 70955 + }, + { + "epoch": 11.575856443719413, + "grad_norm": 0.009134351275861263, + "learning_rate": 0.00044984577113056477, + "loss": 0.018, + "num_input_tokens_seen": 153257952, + "step": 70960 + }, + { + "epoch": 11.576672104404567, + "grad_norm": 0.016628161072731018, + "learning_rate": 0.0004497749508197818, + "loss": 0.0098, + "num_input_tokens_seen": 153268608, + "step": 70965 + }, + { + "epoch": 11.577487765089723, + "grad_norm": 0.03593922778964043, + "learning_rate": 0.00044970413152687304, + "loss": 0.1567, + "num_input_tokens_seen": 153279168, + "step": 70970 + }, + { + "epoch": 11.578303425774878, + "grad_norm": 0.37217482924461365, + "learning_rate": 0.000449633313253274, + "loss": 0.0878, + "num_input_tokens_seen": 153289824, + "step": 70975 + }, + { + "epoch": 11.579119086460032, + "grad_norm": 0.762610912322998, + "learning_rate": 0.00044956249600041975, + "loss": 0.0469, + "num_input_tokens_seen": 153300928, + "step": 70980 + }, + { + "epoch": 11.579934747145188, + "grad_norm": 0.025967687368392944, + "learning_rate": 0.00044949167976974553, + "loss": 0.0235, + "num_input_tokens_seen": 153311072, + "step": 70985 + }, + { + "epoch": 11.580750407830342, + "grad_norm": 0.10705938190221786, + "learning_rate": 0.00044942086456268643, + "loss": 0.0283, + "num_input_tokens_seen": 153322400, + "step": 70990 + }, + { + "epoch": 11.581566068515498, + "grad_norm": 0.48525816202163696, + "learning_rate": 0.0004493500503806777, + "loss": 0.0323, + "num_input_tokens_seen": 153333984, + "step": 70995 + }, + { + "epoch": 11.582381729200652, + "grad_norm": 0.008772018365561962, + "learning_rate": 0.0004492792372251544, + "loss": 0.0771, + "num_input_tokens_seen": 153344672, + "step": 71000 + }, + { + "epoch": 11.583197389885807, + "grad_norm": 0.0028414896223694086, + "learning_rate": 0.00044920842509755187, + "loss": 0.0161, + "num_input_tokens_seen": 153354528, + "step": 71005 + }, + { + "epoch": 11.584013050570963, + "grad_norm": 0.5699886083602905, + "learning_rate": 0.0004491376139993048, + "loss": 0.0729, + "num_input_tokens_seen": 153365536, + "step": 71010 + }, + { + "epoch": 11.584828711256117, + "grad_norm": 0.0065368469804525375, + "learning_rate": 0.0004490668039318488, + "loss": 0.0076, + "num_input_tokens_seen": 153376704, + "step": 71015 + }, + { + "epoch": 11.585644371941273, + "grad_norm": 0.004508780315518379, + "learning_rate": 0.00044899599489661837, + "loss": 0.0053, + "num_input_tokens_seen": 153387968, + "step": 71020 + }, + { + "epoch": 11.586460032626427, + "grad_norm": 0.018700627610087395, + "learning_rate": 0.000448925186895049, + "loss": 0.1483, + "num_input_tokens_seen": 153398688, + "step": 71025 + }, + { + "epoch": 11.587275693311582, + "grad_norm": 0.003485024208202958, + "learning_rate": 0.0004488543799285753, + "loss": 0.0029, + "num_input_tokens_seen": 153409344, + "step": 71030 + }, + { + "epoch": 11.588091353996738, + "grad_norm": 0.00151357043068856, + "learning_rate": 0.00044878357399863266, + "loss": 0.0494, + "num_input_tokens_seen": 153419648, + "step": 71035 + }, + { + "epoch": 11.588907014681892, + "grad_norm": 0.013715144246816635, + "learning_rate": 0.0004487127691066558, + "loss": 0.0606, + "num_input_tokens_seen": 153429472, + "step": 71040 + }, + { + "epoch": 11.589722675367048, + "grad_norm": 0.005180860869586468, + "learning_rate": 0.0004486419652540798, + "loss": 0.198, + "num_input_tokens_seen": 153440288, + "step": 71045 + }, + { + "epoch": 11.590538336052202, + "grad_norm": 0.0013805264607071877, + "learning_rate": 0.0004485711624423393, + "loss": 0.0538, + "num_input_tokens_seen": 153450336, + "step": 71050 + }, + { + "epoch": 11.591353996737357, + "grad_norm": 0.0028821558225899935, + "learning_rate": 0.0004485003606728698, + "loss": 0.035, + "num_input_tokens_seen": 153460864, + "step": 71055 + }, + { + "epoch": 11.592169657422513, + "grad_norm": 0.003991037607192993, + "learning_rate": 0.0004484295599471054, + "loss": 0.0029, + "num_input_tokens_seen": 153472576, + "step": 71060 + }, + { + "epoch": 11.592985318107667, + "grad_norm": 0.05960950627923012, + "learning_rate": 0.00044835876026648176, + "loss": 0.0081, + "num_input_tokens_seen": 153483584, + "step": 71065 + }, + { + "epoch": 11.593800978792823, + "grad_norm": 0.0034852263052016497, + "learning_rate": 0.00044828796163243315, + "loss": 0.0762, + "num_input_tokens_seen": 153495168, + "step": 71070 + }, + { + "epoch": 11.594616639477977, + "grad_norm": 0.47948548197746277, + "learning_rate": 0.0004482171640463945, + "loss": 0.028, + "num_input_tokens_seen": 153505568, + "step": 71075 + }, + { + "epoch": 11.595432300163132, + "grad_norm": 0.024540584534406662, + "learning_rate": 0.000448146367509801, + "loss": 0.0166, + "num_input_tokens_seen": 153515488, + "step": 71080 + }, + { + "epoch": 11.596247960848288, + "grad_norm": 0.20424458384513855, + "learning_rate": 0.0004480755720240869, + "loss": 0.0543, + "num_input_tokens_seen": 153524192, + "step": 71085 + }, + { + "epoch": 11.597063621533442, + "grad_norm": 0.24791987240314484, + "learning_rate": 0.0004480047775906874, + "loss": 0.0096, + "num_input_tokens_seen": 153535776, + "step": 71090 + }, + { + "epoch": 11.597879282218598, + "grad_norm": 0.023952824994921684, + "learning_rate": 0.0004479339842110368, + "loss": 0.0278, + "num_input_tokens_seen": 153546432, + "step": 71095 + }, + { + "epoch": 11.598694942903752, + "grad_norm": 0.0008945376030169427, + "learning_rate": 0.0004478631918865704, + "loss": 0.0057, + "num_input_tokens_seen": 153557920, + "step": 71100 + }, + { + "epoch": 11.599510603588907, + "grad_norm": 0.0072722178883850574, + "learning_rate": 0.00044779240061872225, + "loss": 0.071, + "num_input_tokens_seen": 153569216, + "step": 71105 + }, + { + "epoch": 11.600326264274061, + "grad_norm": 0.31958064436912537, + "learning_rate": 0.00044772161040892755, + "loss": 0.0305, + "num_input_tokens_seen": 153579360, + "step": 71110 + }, + { + "epoch": 11.601141924959217, + "grad_norm": 0.07264435291290283, + "learning_rate": 0.00044765082125862053, + "loss": 0.0394, + "num_input_tokens_seen": 153588768, + "step": 71115 + }, + { + "epoch": 11.601957585644373, + "grad_norm": 0.005538736004382372, + "learning_rate": 0.0004475800331692361, + "loss": 0.144, + "num_input_tokens_seen": 153600224, + "step": 71120 + }, + { + "epoch": 11.602773246329527, + "grad_norm": 0.21943026781082153, + "learning_rate": 0.0004475092461422089, + "loss": 0.0477, + "num_input_tokens_seen": 153611584, + "step": 71125 + }, + { + "epoch": 11.603588907014682, + "grad_norm": 0.06436196714639664, + "learning_rate": 0.0004474384601789733, + "loss": 0.0255, + "num_input_tokens_seen": 153622336, + "step": 71130 + }, + { + "epoch": 11.604404567699836, + "grad_norm": 0.0007576481439173222, + "learning_rate": 0.00044736767528096407, + "loss": 0.0411, + "num_input_tokens_seen": 153633376, + "step": 71135 + }, + { + "epoch": 11.605220228384992, + "grad_norm": 0.007207597605884075, + "learning_rate": 0.0004472968914496156, + "loss": 0.0439, + "num_input_tokens_seen": 153644544, + "step": 71140 + }, + { + "epoch": 11.606035889070148, + "grad_norm": 0.033221352845430374, + "learning_rate": 0.00044722610868636243, + "loss": 0.0196, + "num_input_tokens_seen": 153655872, + "step": 71145 + }, + { + "epoch": 11.606851549755302, + "grad_norm": 0.008055893704295158, + "learning_rate": 0.00044715532699263926, + "loss": 0.0873, + "num_input_tokens_seen": 153667040, + "step": 71150 + }, + { + "epoch": 11.607667210440457, + "grad_norm": 0.07398483902215958, + "learning_rate": 0.00044708454636988026, + "loss": 0.026, + "num_input_tokens_seen": 153678496, + "step": 71155 + }, + { + "epoch": 11.608482871125611, + "grad_norm": 0.008414418436586857, + "learning_rate": 0.00044701376681952033, + "loss": 0.0084, + "num_input_tokens_seen": 153690240, + "step": 71160 + }, + { + "epoch": 11.609298531810767, + "grad_norm": 0.04958515241742134, + "learning_rate": 0.00044694298834299336, + "loss": 0.0081, + "num_input_tokens_seen": 153700992, + "step": 71165 + }, + { + "epoch": 11.61011419249592, + "grad_norm": 0.3301873505115509, + "learning_rate": 0.00044687221094173425, + "loss": 0.0086, + "num_input_tokens_seen": 153712224, + "step": 71170 + }, + { + "epoch": 11.610929853181077, + "grad_norm": 0.21836425364017487, + "learning_rate": 0.0004468014346171769, + "loss": 0.0336, + "num_input_tokens_seen": 153723008, + "step": 71175 + }, + { + "epoch": 11.611745513866232, + "grad_norm": 0.1655757576227188, + "learning_rate": 0.0004467306593707563, + "loss": 0.0125, + "num_input_tokens_seen": 153733248, + "step": 71180 + }, + { + "epoch": 11.612561174551386, + "grad_norm": 0.01050649955868721, + "learning_rate": 0.00044665988520390624, + "loss": 0.0294, + "num_input_tokens_seen": 153744416, + "step": 71185 + }, + { + "epoch": 11.613376835236542, + "grad_norm": 0.016805484890937805, + "learning_rate": 0.0004465891121180612, + "loss": 0.0067, + "num_input_tokens_seen": 153754528, + "step": 71190 + }, + { + "epoch": 11.614192495921696, + "grad_norm": 0.0019383433973416686, + "learning_rate": 0.0004465183401146558, + "loss": 0.0027, + "num_input_tokens_seen": 153765216, + "step": 71195 + }, + { + "epoch": 11.615008156606851, + "grad_norm": 0.4953695833683014, + "learning_rate": 0.00044644756919512386, + "loss": 0.0204, + "num_input_tokens_seen": 153776352, + "step": 71200 + }, + { + "epoch": 11.615823817292007, + "grad_norm": 0.08260602504014969, + "learning_rate": 0.00044637679936090013, + "loss": 0.0265, + "num_input_tokens_seen": 153787040, + "step": 71205 + }, + { + "epoch": 11.616639477977161, + "grad_norm": 0.008234014734625816, + "learning_rate": 0.00044630603061341837, + "loss": 0.0062, + "num_input_tokens_seen": 153797664, + "step": 71210 + }, + { + "epoch": 11.617455138662317, + "grad_norm": 0.040098607540130615, + "learning_rate": 0.00044623526295411314, + "loss": 0.0418, + "num_input_tokens_seen": 153808352, + "step": 71215 + }, + { + "epoch": 11.61827079934747, + "grad_norm": 0.007715737447142601, + "learning_rate": 0.00044616449638441836, + "loss": 0.0025, + "num_input_tokens_seen": 153818400, + "step": 71220 + }, + { + "epoch": 11.619086460032626, + "grad_norm": 0.002162148244678974, + "learning_rate": 0.0004460937309057686, + "loss": 0.0161, + "num_input_tokens_seen": 153829376, + "step": 71225 + }, + { + "epoch": 11.619902120717782, + "grad_norm": 0.030692892149090767, + "learning_rate": 0.0004460229665195975, + "loss": 0.0489, + "num_input_tokens_seen": 153840672, + "step": 71230 + }, + { + "epoch": 11.620717781402936, + "grad_norm": 0.0370308980345726, + "learning_rate": 0.0004459522032273397, + "loss": 0.0121, + "num_input_tokens_seen": 153850976, + "step": 71235 + }, + { + "epoch": 11.621533442088092, + "grad_norm": 0.00597534142434597, + "learning_rate": 0.00044588144103042883, + "loss": 0.037, + "num_input_tokens_seen": 153862112, + "step": 71240 + }, + { + "epoch": 11.622349102773246, + "grad_norm": 0.0010464468505233526, + "learning_rate": 0.00044581067993029944, + "loss": 0.0013, + "num_input_tokens_seen": 153872928, + "step": 71245 + }, + { + "epoch": 11.623164763458401, + "grad_norm": 0.07114594429731369, + "learning_rate": 0.0004457399199283852, + "loss": 0.0026, + "num_input_tokens_seen": 153884320, + "step": 71250 + }, + { + "epoch": 11.623980424143557, + "grad_norm": 0.01107731182128191, + "learning_rate": 0.00044566916102612043, + "loss": 0.0933, + "num_input_tokens_seen": 153894880, + "step": 71255 + }, + { + "epoch": 11.624796084828711, + "grad_norm": 0.0015193721046671271, + "learning_rate": 0.0004455984032249389, + "loss": 0.0929, + "num_input_tokens_seen": 153905984, + "step": 71260 + }, + { + "epoch": 11.625611745513867, + "grad_norm": 0.2539882957935333, + "learning_rate": 0.0004455276465262748, + "loss": 0.015, + "num_input_tokens_seen": 153914624, + "step": 71265 + }, + { + "epoch": 11.62642740619902, + "grad_norm": 0.008223704993724823, + "learning_rate": 0.0004454568909315621, + "loss": 0.2522, + "num_input_tokens_seen": 153924640, + "step": 71270 + }, + { + "epoch": 11.627243066884176, + "grad_norm": 0.0044373562559485435, + "learning_rate": 0.0004453861364422347, + "loss": 0.0143, + "num_input_tokens_seen": 153935392, + "step": 71275 + }, + { + "epoch": 11.62805872756933, + "grad_norm": 0.016537398099899292, + "learning_rate": 0.00044531538305972646, + "loss": 0.0056, + "num_input_tokens_seen": 153945824, + "step": 71280 + }, + { + "epoch": 11.628874388254486, + "grad_norm": 0.012589816004037857, + "learning_rate": 0.0004452446307854714, + "loss": 0.1544, + "num_input_tokens_seen": 153958336, + "step": 71285 + }, + { + "epoch": 11.629690048939642, + "grad_norm": 0.0054365224204957485, + "learning_rate": 0.00044517387962090323, + "loss": 0.0861, + "num_input_tokens_seen": 153969728, + "step": 71290 + }, + { + "epoch": 11.630505709624796, + "grad_norm": 0.004397984594106674, + "learning_rate": 0.00044510312956745607, + "loss": 0.0168, + "num_input_tokens_seen": 153980416, + "step": 71295 + }, + { + "epoch": 11.631321370309951, + "grad_norm": 0.1747157722711563, + "learning_rate": 0.00044503238062656357, + "loss": 0.0271, + "num_input_tokens_seen": 153990432, + "step": 71300 + }, + { + "epoch": 11.632137030995105, + "grad_norm": 0.026617346331477165, + "learning_rate": 0.0004449616327996597, + "loss": 0.0429, + "num_input_tokens_seen": 154001024, + "step": 71305 + }, + { + "epoch": 11.632952691680261, + "grad_norm": 0.00191340537276119, + "learning_rate": 0.0004448908860881781, + "loss": 0.0846, + "num_input_tokens_seen": 154011136, + "step": 71310 + }, + { + "epoch": 11.633768352365417, + "grad_norm": 0.21351198852062225, + "learning_rate": 0.0004448201404935525, + "loss": 0.0238, + "num_input_tokens_seen": 154021760, + "step": 71315 + }, + { + "epoch": 11.63458401305057, + "grad_norm": 0.0022384291514754295, + "learning_rate": 0.00044474939601721705, + "loss": 0.0032, + "num_input_tokens_seen": 154033312, + "step": 71320 + }, + { + "epoch": 11.635399673735726, + "grad_norm": 0.009010802023112774, + "learning_rate": 0.00044467865266060487, + "loss": 0.012, + "num_input_tokens_seen": 154044864, + "step": 71325 + }, + { + "epoch": 11.63621533442088, + "grad_norm": 0.14343449473381042, + "learning_rate": 0.0004446079104251503, + "loss": 0.1224, + "num_input_tokens_seen": 154055712, + "step": 71330 + }, + { + "epoch": 11.637030995106036, + "grad_norm": 0.03126922994852066, + "learning_rate": 0.0004445371693122863, + "loss": 0.2138, + "num_input_tokens_seen": 154067200, + "step": 71335 + }, + { + "epoch": 11.63784665579119, + "grad_norm": 0.004906293470412493, + "learning_rate": 0.00044446642932344726, + "loss": 0.1311, + "num_input_tokens_seen": 154078176, + "step": 71340 + }, + { + "epoch": 11.638662316476346, + "grad_norm": 0.08446597307920456, + "learning_rate": 0.0004443956904600663, + "loss": 0.0163, + "num_input_tokens_seen": 154088832, + "step": 71345 + }, + { + "epoch": 11.639477977161501, + "grad_norm": 0.05098491162061691, + "learning_rate": 0.00044432495272357734, + "loss": 0.0066, + "num_input_tokens_seen": 154099808, + "step": 71350 + }, + { + "epoch": 11.640293637846655, + "grad_norm": 0.006743690464645624, + "learning_rate": 0.00044425421611541364, + "loss": 0.0077, + "num_input_tokens_seen": 154110528, + "step": 71355 + }, + { + "epoch": 11.641109298531811, + "grad_norm": 0.004304811824113131, + "learning_rate": 0.0004441834806370092, + "loss": 0.092, + "num_input_tokens_seen": 154120800, + "step": 71360 + }, + { + "epoch": 11.641924959216965, + "grad_norm": 0.5354902148246765, + "learning_rate": 0.00044411274628979714, + "loss": 0.0588, + "num_input_tokens_seen": 154132320, + "step": 71365 + }, + { + "epoch": 11.64274061990212, + "grad_norm": 0.033445850014686584, + "learning_rate": 0.00044404201307521134, + "loss": 0.0083, + "num_input_tokens_seen": 154143392, + "step": 71370 + }, + { + "epoch": 11.643556280587276, + "grad_norm": 0.011206441558897495, + "learning_rate": 0.00044397128099468497, + "loss": 0.0602, + "num_input_tokens_seen": 154156224, + "step": 71375 + }, + { + "epoch": 11.64437194127243, + "grad_norm": 0.03548984229564667, + "learning_rate": 0.0004439005500496519, + "loss": 0.1289, + "num_input_tokens_seen": 154166528, + "step": 71380 + }, + { + "epoch": 11.645187601957586, + "grad_norm": 0.16109098494052887, + "learning_rate": 0.00044382982024154506, + "loss": 0.1376, + "num_input_tokens_seen": 154177376, + "step": 71385 + }, + { + "epoch": 11.64600326264274, + "grad_norm": 0.09362545609474182, + "learning_rate": 0.0004437590915717984, + "loss": 0.0276, + "num_input_tokens_seen": 154188800, + "step": 71390 + }, + { + "epoch": 11.646818923327896, + "grad_norm": 0.008209917694330215, + "learning_rate": 0.0004436883640418449, + "loss": 0.0742, + "num_input_tokens_seen": 154200192, + "step": 71395 + }, + { + "epoch": 11.647634584013051, + "grad_norm": 0.04360055923461914, + "learning_rate": 0.0004436176376531181, + "loss": 0.1731, + "num_input_tokens_seen": 154212032, + "step": 71400 + }, + { + "epoch": 11.648450244698205, + "grad_norm": 0.006665311753749847, + "learning_rate": 0.00044354691240705167, + "loss": 0.0237, + "num_input_tokens_seen": 154222880, + "step": 71405 + }, + { + "epoch": 11.649265905383361, + "grad_norm": 0.1875232309103012, + "learning_rate": 0.00044347618830507845, + "loss": 0.0155, + "num_input_tokens_seen": 154233568, + "step": 71410 + }, + { + "epoch": 11.650081566068515, + "grad_norm": 0.003251552814617753, + "learning_rate": 0.00044340546534863226, + "loss": 0.0194, + "num_input_tokens_seen": 154244128, + "step": 71415 + }, + { + "epoch": 11.65089722675367, + "grad_norm": 0.005995164625346661, + "learning_rate": 0.00044333474353914576, + "loss": 0.0882, + "num_input_tokens_seen": 154254240, + "step": 71420 + }, + { + "epoch": 11.651712887438826, + "grad_norm": 0.005441619548946619, + "learning_rate": 0.0004432640228780529, + "loss": 0.0085, + "num_input_tokens_seen": 154265568, + "step": 71425 + }, + { + "epoch": 11.65252854812398, + "grad_norm": 0.13086704909801483, + "learning_rate": 0.0004431933033667863, + "loss": 0.0782, + "num_input_tokens_seen": 154275488, + "step": 71430 + }, + { + "epoch": 11.653344208809136, + "grad_norm": 0.04926230385899544, + "learning_rate": 0.0004431225850067796, + "loss": 0.0233, + "num_input_tokens_seen": 154287136, + "step": 71435 + }, + { + "epoch": 11.65415986949429, + "grad_norm": 0.050195761024951935, + "learning_rate": 0.0004430518677994659, + "loss": 0.0262, + "num_input_tokens_seen": 154298240, + "step": 71440 + }, + { + "epoch": 11.654975530179446, + "grad_norm": 0.6612548828125, + "learning_rate": 0.0004429811517462783, + "loss": 0.0612, + "num_input_tokens_seen": 154308672, + "step": 71445 + }, + { + "epoch": 11.655791190864601, + "grad_norm": 0.015694979578256607, + "learning_rate": 0.00044291043684865, + "loss": 0.0226, + "num_input_tokens_seen": 154318592, + "step": 71450 + }, + { + "epoch": 11.656606851549755, + "grad_norm": 0.027238953858613968, + "learning_rate": 0.0004428397231080141, + "loss": 0.0052, + "num_input_tokens_seen": 154329280, + "step": 71455 + }, + { + "epoch": 11.65742251223491, + "grad_norm": 0.00588644715026021, + "learning_rate": 0.0004427690105258037, + "loss": 0.0532, + "num_input_tokens_seen": 154339168, + "step": 71460 + }, + { + "epoch": 11.658238172920065, + "grad_norm": 0.007147925905883312, + "learning_rate": 0.00044269829910345207, + "loss": 0.0336, + "num_input_tokens_seen": 154349120, + "step": 71465 + }, + { + "epoch": 11.65905383360522, + "grad_norm": 0.009599827229976654, + "learning_rate": 0.00044262758884239185, + "loss": 0.0092, + "num_input_tokens_seen": 154359424, + "step": 71470 + }, + { + "epoch": 11.659869494290374, + "grad_norm": 0.008452493697404861, + "learning_rate": 0.00044255687974405656, + "loss": 0.0363, + "num_input_tokens_seen": 154369856, + "step": 71475 + }, + { + "epoch": 11.66068515497553, + "grad_norm": 0.26467153429985046, + "learning_rate": 0.0004424861718098788, + "loss": 0.0345, + "num_input_tokens_seen": 154381024, + "step": 71480 + }, + { + "epoch": 11.661500815660686, + "grad_norm": 0.011430994607508183, + "learning_rate": 0.00044241546504129186, + "loss": 0.0115, + "num_input_tokens_seen": 154392960, + "step": 71485 + }, + { + "epoch": 11.66231647634584, + "grad_norm": 0.048355769366025925, + "learning_rate": 0.0004423447594397284, + "loss": 0.0076, + "num_input_tokens_seen": 154404480, + "step": 71490 + }, + { + "epoch": 11.663132137030995, + "grad_norm": 0.41922426223754883, + "learning_rate": 0.00044227405500662175, + "loss": 0.0497, + "num_input_tokens_seen": 154415168, + "step": 71495 + }, + { + "epoch": 11.66394779771615, + "grad_norm": 0.009930071420967579, + "learning_rate": 0.00044220335174340443, + "loss": 0.0143, + "num_input_tokens_seen": 154425920, + "step": 71500 + }, + { + "epoch": 11.664763458401305, + "grad_norm": 0.03307803347706795, + "learning_rate": 0.00044213264965150943, + "loss": 0.0133, + "num_input_tokens_seen": 154436864, + "step": 71505 + }, + { + "epoch": 11.66557911908646, + "grad_norm": 0.00203361245803535, + "learning_rate": 0.00044206194873237, + "loss": 0.0773, + "num_input_tokens_seen": 154448416, + "step": 71510 + }, + { + "epoch": 11.666394779771615, + "grad_norm": 0.06398969888687134, + "learning_rate": 0.00044199124898741844, + "loss": 0.0295, + "num_input_tokens_seen": 154459872, + "step": 71515 + }, + { + "epoch": 11.66721044045677, + "grad_norm": 0.00550407450646162, + "learning_rate": 0.000441920550418088, + "loss": 0.0044, + "num_input_tokens_seen": 154470304, + "step": 71520 + }, + { + "epoch": 11.668026101141924, + "grad_norm": 0.023381365463137627, + "learning_rate": 0.00044184985302581103, + "loss": 0.0054, + "num_input_tokens_seen": 154481152, + "step": 71525 + }, + { + "epoch": 11.66884176182708, + "grad_norm": 0.058864492923021317, + "learning_rate": 0.00044177915681202083, + "loss": 0.0988, + "num_input_tokens_seen": 154491104, + "step": 71530 + }, + { + "epoch": 11.669657422512234, + "grad_norm": 0.005633897613734007, + "learning_rate": 0.00044170846177814965, + "loss": 0.0408, + "num_input_tokens_seen": 154503136, + "step": 71535 + }, + { + "epoch": 11.67047308319739, + "grad_norm": 0.03195603936910629, + "learning_rate": 0.0004416377679256307, + "loss": 0.0082, + "num_input_tokens_seen": 154513376, + "step": 71540 + }, + { + "epoch": 11.671288743882545, + "grad_norm": 0.006400907877832651, + "learning_rate": 0.0004415670752558961, + "loss": 0.0015, + "num_input_tokens_seen": 154524384, + "step": 71545 + }, + { + "epoch": 11.6721044045677, + "grad_norm": 0.010677113197743893, + "learning_rate": 0.0004414963837703791, + "loss": 0.0069, + "num_input_tokens_seen": 154535136, + "step": 71550 + }, + { + "epoch": 11.672920065252855, + "grad_norm": 0.007044284604489803, + "learning_rate": 0.0004414256934705119, + "loss": 0.1687, + "num_input_tokens_seen": 154545952, + "step": 71555 + }, + { + "epoch": 11.673735725938009, + "grad_norm": 0.042897067964076996, + "learning_rate": 0.00044135500435772755, + "loss": 0.0048, + "num_input_tokens_seen": 154556288, + "step": 71560 + }, + { + "epoch": 11.674551386623165, + "grad_norm": 0.05293981730937958, + "learning_rate": 0.0004412843164334582, + "loss": 0.0919, + "num_input_tokens_seen": 154567360, + "step": 71565 + }, + { + "epoch": 11.67536704730832, + "grad_norm": 0.2870676815509796, + "learning_rate": 0.00044121362969913683, + "loss": 0.0427, + "num_input_tokens_seen": 154577984, + "step": 71570 + }, + { + "epoch": 11.676182707993474, + "grad_norm": 0.034628961235284805, + "learning_rate": 0.00044114294415619577, + "loss": 0.0061, + "num_input_tokens_seen": 154589568, + "step": 71575 + }, + { + "epoch": 11.67699836867863, + "grad_norm": 0.00338340294547379, + "learning_rate": 0.00044107225980606765, + "loss": 0.0041, + "num_input_tokens_seen": 154601664, + "step": 71580 + }, + { + "epoch": 11.677814029363784, + "grad_norm": 0.49561360478401184, + "learning_rate": 0.0004410015766501849, + "loss": 0.1177, + "num_input_tokens_seen": 154612576, + "step": 71585 + }, + { + "epoch": 11.67862969004894, + "grad_norm": 0.06097765639424324, + "learning_rate": 0.00044093089468998006, + "loss": 0.0431, + "num_input_tokens_seen": 154624416, + "step": 71590 + }, + { + "epoch": 11.679445350734095, + "grad_norm": 0.00320567493326962, + "learning_rate": 0.0004408602139268856, + "loss": 0.0774, + "num_input_tokens_seen": 154634944, + "step": 71595 + }, + { + "epoch": 11.68026101141925, + "grad_norm": 0.014213986694812775, + "learning_rate": 0.00044078953436233387, + "loss": 0.0144, + "num_input_tokens_seen": 154646400, + "step": 71600 + }, + { + "epoch": 11.681076672104405, + "grad_norm": 0.03145314380526543, + "learning_rate": 0.0004407188559977573, + "loss": 0.007, + "num_input_tokens_seen": 154658112, + "step": 71605 + }, + { + "epoch": 11.681892332789559, + "grad_norm": 0.06955621391534805, + "learning_rate": 0.00044064817883458833, + "loss": 0.0498, + "num_input_tokens_seen": 154669632, + "step": 71610 + }, + { + "epoch": 11.682707993474715, + "grad_norm": 0.21142300963401794, + "learning_rate": 0.0004405775028742594, + "loss": 0.1781, + "num_input_tokens_seen": 154680992, + "step": 71615 + }, + { + "epoch": 11.68352365415987, + "grad_norm": 0.3759807050228119, + "learning_rate": 0.00044050682811820277, + "loss": 0.167, + "num_input_tokens_seen": 154691296, + "step": 71620 + }, + { + "epoch": 11.684339314845024, + "grad_norm": 0.004578839987516403, + "learning_rate": 0.00044043615456785065, + "loss": 0.0103, + "num_input_tokens_seen": 154702496, + "step": 71625 + }, + { + "epoch": 11.68515497553018, + "grad_norm": 0.004665186163038015, + "learning_rate": 0.00044036548222463535, + "loss": 0.0067, + "num_input_tokens_seen": 154713056, + "step": 71630 + }, + { + "epoch": 11.685970636215334, + "grad_norm": 0.007182937115430832, + "learning_rate": 0.0004402948110899894, + "loss": 0.0233, + "num_input_tokens_seen": 154724160, + "step": 71635 + }, + { + "epoch": 11.68678629690049, + "grad_norm": 0.4717956781387329, + "learning_rate": 0.0004402241411653447, + "loss": 0.0361, + "num_input_tokens_seen": 154735104, + "step": 71640 + }, + { + "epoch": 11.687601957585644, + "grad_norm": 0.14137259125709534, + "learning_rate": 0.00044015347245213377, + "loss": 0.0359, + "num_input_tokens_seen": 154747104, + "step": 71645 + }, + { + "epoch": 11.6884176182708, + "grad_norm": 0.0016079474007710814, + "learning_rate": 0.00044008280495178844, + "loss": 0.0189, + "num_input_tokens_seen": 154757184, + "step": 71650 + }, + { + "epoch": 11.689233278955955, + "grad_norm": 0.7597583532333374, + "learning_rate": 0.0004400121386657413, + "loss": 0.0709, + "num_input_tokens_seen": 154768896, + "step": 71655 + }, + { + "epoch": 11.690048939641109, + "grad_norm": 0.09265612810850143, + "learning_rate": 0.000439941473595424, + "loss": 0.0077, + "num_input_tokens_seen": 154779808, + "step": 71660 + }, + { + "epoch": 11.690864600326265, + "grad_norm": 0.009280568920075893, + "learning_rate": 0.00043987080974226925, + "loss": 0.0353, + "num_input_tokens_seen": 154789952, + "step": 71665 + }, + { + "epoch": 11.691680261011419, + "grad_norm": 0.001288570580072701, + "learning_rate": 0.00043980014710770857, + "loss": 0.0185, + "num_input_tokens_seen": 154799936, + "step": 71670 + }, + { + "epoch": 11.692495921696574, + "grad_norm": 0.07057865709066391, + "learning_rate": 0.00043972948569317446, + "loss": 0.077, + "num_input_tokens_seen": 154810720, + "step": 71675 + }, + { + "epoch": 11.69331158238173, + "grad_norm": 0.00797590147703886, + "learning_rate": 0.00043965882550009856, + "loss": 0.0107, + "num_input_tokens_seen": 154821152, + "step": 71680 + }, + { + "epoch": 11.694127243066884, + "grad_norm": 0.005033727269619703, + "learning_rate": 0.0004395881665299134, + "loss": 0.0182, + "num_input_tokens_seen": 154830944, + "step": 71685 + }, + { + "epoch": 11.69494290375204, + "grad_norm": 0.26927709579467773, + "learning_rate": 0.0004395175087840503, + "loss": 0.1215, + "num_input_tokens_seen": 154841920, + "step": 71690 + }, + { + "epoch": 11.695758564437194, + "grad_norm": 0.02626313455402851, + "learning_rate": 0.000439446852263942, + "loss": 0.0262, + "num_input_tokens_seen": 154854432, + "step": 71695 + }, + { + "epoch": 11.69657422512235, + "grad_norm": 0.05705942586064339, + "learning_rate": 0.00043937619697101974, + "loss": 0.0067, + "num_input_tokens_seen": 154864544, + "step": 71700 + }, + { + "epoch": 11.697389885807503, + "grad_norm": 0.010143991559743881, + "learning_rate": 0.00043930554290671597, + "loss": 0.0051, + "num_input_tokens_seen": 154874880, + "step": 71705 + }, + { + "epoch": 11.698205546492659, + "grad_norm": 0.0047828564420342445, + "learning_rate": 0.0004392348900724622, + "loss": 0.0073, + "num_input_tokens_seen": 154886240, + "step": 71710 + }, + { + "epoch": 11.699021207177815, + "grad_norm": 0.03272762522101402, + "learning_rate": 0.00043916423846969047, + "loss": 0.0039, + "num_input_tokens_seen": 154896192, + "step": 71715 + }, + { + "epoch": 11.699836867862969, + "grad_norm": 0.33838027715682983, + "learning_rate": 0.0004390935880998329, + "loss": 0.1627, + "num_input_tokens_seen": 154906560, + "step": 71720 + }, + { + "epoch": 11.700652528548124, + "grad_norm": 0.03905438259243965, + "learning_rate": 0.00043902293896432064, + "loss": 0.0452, + "num_input_tokens_seen": 154917568, + "step": 71725 + }, + { + "epoch": 11.701468189233278, + "grad_norm": 0.5773392915725708, + "learning_rate": 0.0004389522910645862, + "loss": 0.0153, + "num_input_tokens_seen": 154927232, + "step": 71730 + }, + { + "epoch": 11.702283849918434, + "grad_norm": 0.009112930856645107, + "learning_rate": 0.00043888164440206086, + "loss": 0.0061, + "num_input_tokens_seen": 154937376, + "step": 71735 + }, + { + "epoch": 11.70309951060359, + "grad_norm": 0.07352989912033081, + "learning_rate": 0.0004388109989781766, + "loss": 0.0039, + "num_input_tokens_seen": 154947744, + "step": 71740 + }, + { + "epoch": 11.703915171288743, + "grad_norm": 0.007768069859594107, + "learning_rate": 0.000438740354794365, + "loss": 0.0372, + "num_input_tokens_seen": 154959104, + "step": 71745 + }, + { + "epoch": 11.7047308319739, + "grad_norm": 0.004225387237966061, + "learning_rate": 0.0004386697118520579, + "loss": 0.0024, + "num_input_tokens_seen": 154971264, + "step": 71750 + }, + { + "epoch": 11.705546492659053, + "grad_norm": 0.2465088665485382, + "learning_rate": 0.00043859907015268685, + "loss": 0.1459, + "num_input_tokens_seen": 154983008, + "step": 71755 + }, + { + "epoch": 11.706362153344209, + "grad_norm": 0.00690916832536459, + "learning_rate": 0.00043852842969768356, + "loss": 0.0083, + "num_input_tokens_seen": 154993600, + "step": 71760 + }, + { + "epoch": 11.707177814029365, + "grad_norm": 0.011656506918370724, + "learning_rate": 0.0004384577904884795, + "loss": 0.024, + "num_input_tokens_seen": 155003776, + "step": 71765 + }, + { + "epoch": 11.707993474714518, + "grad_norm": 0.1968628615140915, + "learning_rate": 0.0004383871525265066, + "loss": 0.1653, + "num_input_tokens_seen": 155013056, + "step": 71770 + }, + { + "epoch": 11.708809135399674, + "grad_norm": 0.028053130954504013, + "learning_rate": 0.00043831651581319604, + "loss": 0.023, + "num_input_tokens_seen": 155023072, + "step": 71775 + }, + { + "epoch": 11.709624796084828, + "grad_norm": 0.07982442528009415, + "learning_rate": 0.00043824588034997974, + "loss": 0.0625, + "num_input_tokens_seen": 155033472, + "step": 71780 + }, + { + "epoch": 11.710440456769984, + "grad_norm": 0.11559919267892838, + "learning_rate": 0.0004381752461382888, + "loss": 0.0227, + "num_input_tokens_seen": 155044960, + "step": 71785 + }, + { + "epoch": 11.71125611745514, + "grad_norm": 0.0043896157294511795, + "learning_rate": 0.0004381046131795551, + "loss": 0.0182, + "num_input_tokens_seen": 155056704, + "step": 71790 + }, + { + "epoch": 11.712071778140293, + "grad_norm": 0.07724365592002869, + "learning_rate": 0.0004380339814752098, + "loss": 0.0099, + "num_input_tokens_seen": 155067584, + "step": 71795 + }, + { + "epoch": 11.71288743882545, + "grad_norm": 0.013904483988881111, + "learning_rate": 0.0004379633510266846, + "loss": 0.0106, + "num_input_tokens_seen": 155078880, + "step": 71800 + }, + { + "epoch": 11.713703099510603, + "grad_norm": 0.0046301172114908695, + "learning_rate": 0.0004378927218354106, + "loss": 0.0298, + "num_input_tokens_seen": 155089408, + "step": 71805 + }, + { + "epoch": 11.714518760195759, + "grad_norm": 0.005695376545190811, + "learning_rate": 0.00043782209390281964, + "loss": 0.0251, + "num_input_tokens_seen": 155098816, + "step": 71810 + }, + { + "epoch": 11.715334420880914, + "grad_norm": 0.005193586926907301, + "learning_rate": 0.00043775146723034253, + "loss": 0.015, + "num_input_tokens_seen": 155110048, + "step": 71815 + }, + { + "epoch": 11.716150081566068, + "grad_norm": 0.05635349825024605, + "learning_rate": 0.00043768084181941097, + "loss": 0.0186, + "num_input_tokens_seen": 155120256, + "step": 71820 + }, + { + "epoch": 11.716965742251224, + "grad_norm": 0.03338460251688957, + "learning_rate": 0.00043761021767145644, + "loss": 0.033, + "num_input_tokens_seen": 155130976, + "step": 71825 + }, + { + "epoch": 11.717781402936378, + "grad_norm": 0.0054107471369206905, + "learning_rate": 0.0004375395947879097, + "loss": 0.0038, + "num_input_tokens_seen": 155141376, + "step": 71830 + }, + { + "epoch": 11.718597063621534, + "grad_norm": 0.006177667994052172, + "learning_rate": 0.0004374689731702026, + "loss": 0.0159, + "num_input_tokens_seen": 155151616, + "step": 71835 + }, + { + "epoch": 11.719412724306688, + "grad_norm": 0.5736343860626221, + "learning_rate": 0.0004373983528197659, + "loss": 0.1354, + "num_input_tokens_seen": 155163232, + "step": 71840 + }, + { + "epoch": 11.720228384991843, + "grad_norm": 0.006295854225754738, + "learning_rate": 0.0004373277337380311, + "loss": 0.011, + "num_input_tokens_seen": 155173824, + "step": 71845 + }, + { + "epoch": 11.721044045676999, + "grad_norm": 0.0067205713130533695, + "learning_rate": 0.00043725711592642913, + "loss": 0.0057, + "num_input_tokens_seen": 155183552, + "step": 71850 + }, + { + "epoch": 11.721859706362153, + "grad_norm": 0.003925715573132038, + "learning_rate": 0.0004371864993863915, + "loss": 0.1543, + "num_input_tokens_seen": 155194048, + "step": 71855 + }, + { + "epoch": 11.722675367047309, + "grad_norm": 0.05358777940273285, + "learning_rate": 0.00043711588411934893, + "loss": 0.0682, + "num_input_tokens_seen": 155204800, + "step": 71860 + }, + { + "epoch": 11.723491027732463, + "grad_norm": 0.15012463927268982, + "learning_rate": 0.00043704527012673294, + "loss": 0.0292, + "num_input_tokens_seen": 155215904, + "step": 71865 + }, + { + "epoch": 11.724306688417618, + "grad_norm": 0.009517773985862732, + "learning_rate": 0.00043697465740997424, + "loss": 0.0189, + "num_input_tokens_seen": 155226464, + "step": 71870 + }, + { + "epoch": 11.725122349102774, + "grad_norm": 0.2332668900489807, + "learning_rate": 0.00043690404597050426, + "loss": 0.0632, + "num_input_tokens_seen": 155237152, + "step": 71875 + }, + { + "epoch": 11.725938009787928, + "grad_norm": 0.2832512855529785, + "learning_rate": 0.0004368334358097536, + "loss": 0.0268, + "num_input_tokens_seen": 155248608, + "step": 71880 + }, + { + "epoch": 11.726753670473084, + "grad_norm": 0.001509108697064221, + "learning_rate": 0.00043676282692915367, + "loss": 0.1853, + "num_input_tokens_seen": 155259744, + "step": 71885 + }, + { + "epoch": 11.727569331158238, + "grad_norm": 0.0022017841693013906, + "learning_rate": 0.0004366922193301352, + "loss": 0.0226, + "num_input_tokens_seen": 155272000, + "step": 71890 + }, + { + "epoch": 11.728384991843393, + "grad_norm": 0.0046439943835139275, + "learning_rate": 0.00043662161301412925, + "loss": 0.055, + "num_input_tokens_seen": 155282304, + "step": 71895 + }, + { + "epoch": 11.729200652528547, + "grad_norm": 0.006962585728615522, + "learning_rate": 0.0004365510079825667, + "loss": 0.0529, + "num_input_tokens_seen": 155291840, + "step": 71900 + }, + { + "epoch": 11.730016313213703, + "grad_norm": 0.011773492209613323, + "learning_rate": 0.00043648040423687845, + "loss": 0.006, + "num_input_tokens_seen": 155302976, + "step": 71905 + }, + { + "epoch": 11.730831973898859, + "grad_norm": 0.40067851543426514, + "learning_rate": 0.00043640980177849534, + "loss": 0.0753, + "num_input_tokens_seen": 155313728, + "step": 71910 + }, + { + "epoch": 11.731647634584013, + "grad_norm": 0.00812000036239624, + "learning_rate": 0.00043633920060884843, + "loss": 0.0072, + "num_input_tokens_seen": 155326080, + "step": 71915 + }, + { + "epoch": 11.732463295269168, + "grad_norm": 0.006097911857068539, + "learning_rate": 0.0004362686007293681, + "loss": 0.0566, + "num_input_tokens_seen": 155337248, + "step": 71920 + }, + { + "epoch": 11.733278955954322, + "grad_norm": 0.007870636880397797, + "learning_rate": 0.0004361980021414858, + "loss": 0.0629, + "num_input_tokens_seen": 155348224, + "step": 71925 + }, + { + "epoch": 11.734094616639478, + "grad_norm": 0.015115322545170784, + "learning_rate": 0.00043612740484663155, + "loss": 0.0099, + "num_input_tokens_seen": 155359744, + "step": 71930 + }, + { + "epoch": 11.734910277324634, + "grad_norm": 0.007088929880410433, + "learning_rate": 0.00043605680884623656, + "loss": 0.0338, + "num_input_tokens_seen": 155370912, + "step": 71935 + }, + { + "epoch": 11.735725938009788, + "grad_norm": 0.024514637887477875, + "learning_rate": 0.00043598621414173166, + "loss": 0.0059, + "num_input_tokens_seen": 155382336, + "step": 71940 + }, + { + "epoch": 11.736541598694943, + "grad_norm": 0.016428545117378235, + "learning_rate": 0.0004359156207345471, + "loss": 0.0081, + "num_input_tokens_seen": 155393152, + "step": 71945 + }, + { + "epoch": 11.737357259380097, + "grad_norm": 0.6242492198944092, + "learning_rate": 0.00043584502862611404, + "loss": 0.0421, + "num_input_tokens_seen": 155403744, + "step": 71950 + }, + { + "epoch": 11.738172920065253, + "grad_norm": 0.07482123374938965, + "learning_rate": 0.00043577443781786263, + "loss": 0.0078, + "num_input_tokens_seen": 155415360, + "step": 71955 + }, + { + "epoch": 11.738988580750409, + "grad_norm": 0.06953166425228119, + "learning_rate": 0.0004357038483112239, + "loss": 0.0079, + "num_input_tokens_seen": 155425632, + "step": 71960 + }, + { + "epoch": 11.739804241435563, + "grad_norm": 0.0036088728811591864, + "learning_rate": 0.00043563326010762803, + "loss": 0.0609, + "num_input_tokens_seen": 155437952, + "step": 71965 + }, + { + "epoch": 11.740619902120718, + "grad_norm": 0.08081181347370148, + "learning_rate": 0.00043556267320850605, + "loss": 0.0097, + "num_input_tokens_seen": 155447552, + "step": 71970 + }, + { + "epoch": 11.741435562805872, + "grad_norm": 0.019740041345357895, + "learning_rate": 0.000435492087615288, + "loss": 0.0245, + "num_input_tokens_seen": 155457216, + "step": 71975 + }, + { + "epoch": 11.742251223491028, + "grad_norm": 0.021085111424326897, + "learning_rate": 0.00043542150332940487, + "loss": 0.1118, + "num_input_tokens_seen": 155468064, + "step": 71980 + }, + { + "epoch": 11.743066884176184, + "grad_norm": 0.13764089345932007, + "learning_rate": 0.00043535092035228666, + "loss": 0.1031, + "num_input_tokens_seen": 155477280, + "step": 71985 + }, + { + "epoch": 11.743882544861338, + "grad_norm": 0.032629575580358505, + "learning_rate": 0.00043528033868536433, + "loss": 0.0121, + "num_input_tokens_seen": 155488672, + "step": 71990 + }, + { + "epoch": 11.744698205546493, + "grad_norm": 0.007054131478071213, + "learning_rate": 0.0004352097583300678, + "loss": 0.007, + "num_input_tokens_seen": 155499744, + "step": 71995 + }, + { + "epoch": 11.745513866231647, + "grad_norm": 0.3839552700519562, + "learning_rate": 0.0004351391792878279, + "loss": 0.1698, + "num_input_tokens_seen": 155510112, + "step": 72000 + }, + { + "epoch": 11.746329526916803, + "grad_norm": 0.03238167613744736, + "learning_rate": 0.00043506860156007453, + "loss": 0.0255, + "num_input_tokens_seen": 155520864, + "step": 72005 + }, + { + "epoch": 11.747145187601957, + "grad_norm": 0.4877621829509735, + "learning_rate": 0.00043499802514823866, + "loss": 0.0288, + "num_input_tokens_seen": 155532384, + "step": 72010 + }, + { + "epoch": 11.747960848287113, + "grad_norm": 0.13125985860824585, + "learning_rate": 0.00043492745005375, + "loss": 0.059, + "num_input_tokens_seen": 155544576, + "step": 72015 + }, + { + "epoch": 11.748776508972268, + "grad_norm": 0.3048825263977051, + "learning_rate": 0.00043485687627803935, + "loss": 0.1243, + "num_input_tokens_seen": 155554432, + "step": 72020 + }, + { + "epoch": 11.749592169657422, + "grad_norm": 0.0021826811134815216, + "learning_rate": 0.00043478630382253646, + "loss": 0.0617, + "num_input_tokens_seen": 155565024, + "step": 72025 + }, + { + "epoch": 11.750407830342578, + "grad_norm": 0.020689282566308975, + "learning_rate": 0.00043471573268867206, + "loss": 0.1112, + "num_input_tokens_seen": 155576736, + "step": 72030 + }, + { + "epoch": 11.751223491027732, + "grad_norm": 0.29919424653053284, + "learning_rate": 0.00043464516287787617, + "loss": 0.0361, + "num_input_tokens_seen": 155587968, + "step": 72035 + }, + { + "epoch": 11.752039151712887, + "grad_norm": 0.019435329362750053, + "learning_rate": 0.0004345745943915788, + "loss": 0.0073, + "num_input_tokens_seen": 155598208, + "step": 72040 + }, + { + "epoch": 11.752854812398043, + "grad_norm": 0.08339189738035202, + "learning_rate": 0.0004345040272312104, + "loss": 0.011, + "num_input_tokens_seen": 155610400, + "step": 72045 + }, + { + "epoch": 11.753670473083197, + "grad_norm": 0.34416478872299194, + "learning_rate": 0.00043443346139820086, + "loss": 0.0958, + "num_input_tokens_seen": 155620800, + "step": 72050 + }, + { + "epoch": 11.754486133768353, + "grad_norm": 0.017133589833974838, + "learning_rate": 0.0004343628968939805, + "loss": 0.0529, + "num_input_tokens_seen": 155632480, + "step": 72055 + }, + { + "epoch": 11.755301794453507, + "grad_norm": 0.17699752748012543, + "learning_rate": 0.0004342923337199793, + "loss": 0.0876, + "num_input_tokens_seen": 155643328, + "step": 72060 + }, + { + "epoch": 11.756117455138662, + "grad_norm": 0.004552872385829687, + "learning_rate": 0.0004342217718776273, + "loss": 0.0117, + "num_input_tokens_seen": 155654048, + "step": 72065 + }, + { + "epoch": 11.756933115823816, + "grad_norm": 0.33629310131073, + "learning_rate": 0.00043415121136835454, + "loss": 0.0809, + "num_input_tokens_seen": 155663904, + "step": 72070 + }, + { + "epoch": 11.757748776508972, + "grad_norm": 0.09349898993968964, + "learning_rate": 0.00043408065219359106, + "loss": 0.1015, + "num_input_tokens_seen": 155674816, + "step": 72075 + }, + { + "epoch": 11.758564437194128, + "grad_norm": 0.005316116847097874, + "learning_rate": 0.00043401009435476665, + "loss": 0.0045, + "num_input_tokens_seen": 155685216, + "step": 72080 + }, + { + "epoch": 11.759380097879282, + "grad_norm": 0.008520056493580341, + "learning_rate": 0.0004339395378533116, + "loss": 0.0169, + "num_input_tokens_seen": 155695840, + "step": 72085 + }, + { + "epoch": 11.760195758564437, + "grad_norm": 0.5987900495529175, + "learning_rate": 0.00043386898269065537, + "loss": 0.0355, + "num_input_tokens_seen": 155706656, + "step": 72090 + }, + { + "epoch": 11.761011419249591, + "grad_norm": 0.4160362780094147, + "learning_rate": 0.00043379842886822836, + "loss": 0.1533, + "num_input_tokens_seen": 155716928, + "step": 72095 + }, + { + "epoch": 11.761827079934747, + "grad_norm": 0.2085852324962616, + "learning_rate": 0.0004337278763874599, + "loss": 0.0114, + "num_input_tokens_seen": 155727104, + "step": 72100 + }, + { + "epoch": 11.762642740619903, + "grad_norm": 0.0025748233310878277, + "learning_rate": 0.0004336573252497804, + "loss": 0.0105, + "num_input_tokens_seen": 155738080, + "step": 72105 + }, + { + "epoch": 11.763458401305057, + "grad_norm": 0.002957735676318407, + "learning_rate": 0.00043358677545661913, + "loss": 0.0045, + "num_input_tokens_seen": 155749824, + "step": 72110 + }, + { + "epoch": 11.764274061990212, + "grad_norm": 0.07855616509914398, + "learning_rate": 0.0004335162270094063, + "loss": 0.0181, + "num_input_tokens_seen": 155760256, + "step": 72115 + }, + { + "epoch": 11.765089722675366, + "grad_norm": 0.010629786178469658, + "learning_rate": 0.0004334456799095712, + "loss": 0.0302, + "num_input_tokens_seen": 155771360, + "step": 72120 + }, + { + "epoch": 11.765905383360522, + "grad_norm": 0.39609336853027344, + "learning_rate": 0.00043337513415854414, + "loss": 0.0256, + "num_input_tokens_seen": 155783392, + "step": 72125 + }, + { + "epoch": 11.766721044045678, + "grad_norm": 0.3709852397441864, + "learning_rate": 0.0004333045897577542, + "loss": 0.222, + "num_input_tokens_seen": 155793728, + "step": 72130 + }, + { + "epoch": 11.767536704730832, + "grad_norm": 0.001966248033568263, + "learning_rate": 0.00043323404670863165, + "loss": 0.004, + "num_input_tokens_seen": 155804576, + "step": 72135 + }, + { + "epoch": 11.768352365415987, + "grad_norm": 0.018825042992830276, + "learning_rate": 0.0004331635050126056, + "loss": 0.0058, + "num_input_tokens_seen": 155815040, + "step": 72140 + }, + { + "epoch": 11.769168026101141, + "grad_norm": 0.01670904830098152, + "learning_rate": 0.0004330929646711059, + "loss": 0.1545, + "num_input_tokens_seen": 155825664, + "step": 72145 + }, + { + "epoch": 11.769983686786297, + "grad_norm": 0.04210560396313667, + "learning_rate": 0.0004330224256855624, + "loss": 0.0745, + "num_input_tokens_seen": 155836736, + "step": 72150 + }, + { + "epoch": 11.770799347471453, + "grad_norm": 0.36961254477500916, + "learning_rate": 0.00043295188805740414, + "loss": 0.1821, + "num_input_tokens_seen": 155847872, + "step": 72155 + }, + { + "epoch": 11.771615008156607, + "grad_norm": 0.014704558998346329, + "learning_rate": 0.0004328813517880612, + "loss": 0.0853, + "num_input_tokens_seen": 155859904, + "step": 72160 + }, + { + "epoch": 11.772430668841762, + "grad_norm": 0.04387712478637695, + "learning_rate": 0.00043281081687896253, + "loss": 0.015, + "num_input_tokens_seen": 155870848, + "step": 72165 + }, + { + "epoch": 11.773246329526916, + "grad_norm": 0.041084855794906616, + "learning_rate": 0.0004327402833315381, + "loss": 0.0096, + "num_input_tokens_seen": 155882336, + "step": 72170 + }, + { + "epoch": 11.774061990212072, + "grad_norm": 0.04652201011776924, + "learning_rate": 0.000432669751147217, + "loss": 0.0086, + "num_input_tokens_seen": 155894464, + "step": 72175 + }, + { + "epoch": 11.774877650897226, + "grad_norm": 0.006085763685405254, + "learning_rate": 0.000432599220327429, + "loss": 0.0269, + "num_input_tokens_seen": 155905120, + "step": 72180 + }, + { + "epoch": 11.775693311582382, + "grad_norm": 0.40596145391464233, + "learning_rate": 0.0004325286908736031, + "loss": 0.0509, + "num_input_tokens_seen": 155915680, + "step": 72185 + }, + { + "epoch": 11.776508972267537, + "grad_norm": 0.011549671180546284, + "learning_rate": 0.0004324581627871691, + "loss": 0.0183, + "num_input_tokens_seen": 155925856, + "step": 72190 + }, + { + "epoch": 11.777324632952691, + "grad_norm": 0.39453864097595215, + "learning_rate": 0.00043238763606955586, + "loss": 0.0818, + "num_input_tokens_seen": 155936736, + "step": 72195 + }, + { + "epoch": 11.778140293637847, + "grad_norm": 0.02689771167933941, + "learning_rate": 0.00043231711072219307, + "loss": 0.0361, + "num_input_tokens_seen": 155947072, + "step": 72200 + }, + { + "epoch": 11.778955954323001, + "grad_norm": 0.012872003018856049, + "learning_rate": 0.0004322465867465099, + "loss": 0.0329, + "num_input_tokens_seen": 155958656, + "step": 72205 + }, + { + "epoch": 11.779771615008157, + "grad_norm": 0.891151487827301, + "learning_rate": 0.0004321760641439356, + "loss": 0.0593, + "num_input_tokens_seen": 155968256, + "step": 72210 + }, + { + "epoch": 11.780587275693312, + "grad_norm": 0.026634545996785164, + "learning_rate": 0.00043210554291589937, + "loss": 0.0254, + "num_input_tokens_seen": 155978272, + "step": 72215 + }, + { + "epoch": 11.781402936378466, + "grad_norm": 0.36695098876953125, + "learning_rate": 0.00043203502306383046, + "loss": 0.0309, + "num_input_tokens_seen": 155989024, + "step": 72220 + }, + { + "epoch": 11.782218597063622, + "grad_norm": 0.008795715868473053, + "learning_rate": 0.0004319645045891579, + "loss": 0.0144, + "num_input_tokens_seen": 156000800, + "step": 72225 + }, + { + "epoch": 11.783034257748776, + "grad_norm": 0.018142348155379295, + "learning_rate": 0.0004318939874933113, + "loss": 0.0285, + "num_input_tokens_seen": 156011904, + "step": 72230 + }, + { + "epoch": 11.783849918433932, + "grad_norm": 0.004247451666742563, + "learning_rate": 0.00043182347177771907, + "loss": 0.1222, + "num_input_tokens_seen": 156024160, + "step": 72235 + }, + { + "epoch": 11.784665579119086, + "grad_norm": 0.0237015001475811, + "learning_rate": 0.000431752957443811, + "loss": 0.0067, + "num_input_tokens_seen": 156035488, + "step": 72240 + }, + { + "epoch": 11.785481239804241, + "grad_norm": 0.06558331102132797, + "learning_rate": 0.00043168244449301555, + "loss": 0.0623, + "num_input_tokens_seen": 156045888, + "step": 72245 + }, + { + "epoch": 11.786296900489397, + "grad_norm": 0.5224149227142334, + "learning_rate": 0.00043161193292676203, + "loss": 0.0465, + "num_input_tokens_seen": 156057056, + "step": 72250 + }, + { + "epoch": 11.78711256117455, + "grad_norm": 0.022556733340024948, + "learning_rate": 0.00043154142274647966, + "loss": 0.0282, + "num_input_tokens_seen": 156068032, + "step": 72255 + }, + { + "epoch": 11.787928221859707, + "grad_norm": 0.009831923991441727, + "learning_rate": 0.000431470913953597, + "loss": 0.0086, + "num_input_tokens_seen": 156078016, + "step": 72260 + }, + { + "epoch": 11.78874388254486, + "grad_norm": 0.273215115070343, + "learning_rate": 0.00043140040654954346, + "loss": 0.0367, + "num_input_tokens_seen": 156089472, + "step": 72265 + }, + { + "epoch": 11.789559543230016, + "grad_norm": 0.007073753513395786, + "learning_rate": 0.00043132990053574747, + "loss": 0.0049, + "num_input_tokens_seen": 156101504, + "step": 72270 + }, + { + "epoch": 11.790375203915172, + "grad_norm": 0.004455335903912783, + "learning_rate": 0.0004312593959136383, + "loss": 0.0196, + "num_input_tokens_seen": 156112192, + "step": 72275 + }, + { + "epoch": 11.791190864600326, + "grad_norm": 0.004594567697495222, + "learning_rate": 0.0004311888926846445, + "loss": 0.0364, + "num_input_tokens_seen": 156121984, + "step": 72280 + }, + { + "epoch": 11.792006525285482, + "grad_norm": 0.056093163788318634, + "learning_rate": 0.00043111839085019534, + "loss": 0.0064, + "num_input_tokens_seen": 156132160, + "step": 72285 + }, + { + "epoch": 11.792822185970635, + "grad_norm": 0.41622522473335266, + "learning_rate": 0.0004310478904117191, + "loss": 0.0237, + "num_input_tokens_seen": 156143200, + "step": 72290 + }, + { + "epoch": 11.793637846655791, + "grad_norm": 0.0016232366906479, + "learning_rate": 0.0004309773913706451, + "loss": 0.0161, + "num_input_tokens_seen": 156154496, + "step": 72295 + }, + { + "epoch": 11.794453507340947, + "grad_norm": 0.00741207879036665, + "learning_rate": 0.00043090689372840156, + "loss": 0.0072, + "num_input_tokens_seen": 156165824, + "step": 72300 + }, + { + "epoch": 11.7952691680261, + "grad_norm": 0.01622309908270836, + "learning_rate": 0.0004308363974864178, + "loss": 0.0057, + "num_input_tokens_seen": 156175520, + "step": 72305 + }, + { + "epoch": 11.796084828711257, + "grad_norm": 0.6132098436355591, + "learning_rate": 0.0004307659026461218, + "loss": 0.1409, + "num_input_tokens_seen": 156186144, + "step": 72310 + }, + { + "epoch": 11.79690048939641, + "grad_norm": 0.004296452272683382, + "learning_rate": 0.00043069540920894297, + "loss": 0.01, + "num_input_tokens_seen": 156197728, + "step": 72315 + }, + { + "epoch": 11.797716150081566, + "grad_norm": 0.26713958382606506, + "learning_rate": 0.0004306249171763093, + "loss": 0.0242, + "num_input_tokens_seen": 156207936, + "step": 72320 + }, + { + "epoch": 11.798531810766722, + "grad_norm": 0.002047403249889612, + "learning_rate": 0.0004305544265496499, + "loss": 0.0265, + "num_input_tokens_seen": 156219648, + "step": 72325 + }, + { + "epoch": 11.799347471451876, + "grad_norm": 0.00636103842407465, + "learning_rate": 0.000430483937330393, + "loss": 0.0037, + "num_input_tokens_seen": 156230080, + "step": 72330 + }, + { + "epoch": 11.800163132137031, + "grad_norm": 0.24282555282115936, + "learning_rate": 0.0004304134495199674, + "loss": 0.0891, + "num_input_tokens_seen": 156240448, + "step": 72335 + }, + { + "epoch": 11.800978792822185, + "grad_norm": 0.608010470867157, + "learning_rate": 0.0004303429631198014, + "loss": 0.1334, + "num_input_tokens_seen": 156251552, + "step": 72340 + }, + { + "epoch": 11.801794453507341, + "grad_norm": 0.01011840533465147, + "learning_rate": 0.0004302724781313237, + "loss": 0.0059, + "num_input_tokens_seen": 156262240, + "step": 72345 + }, + { + "epoch": 11.802610114192497, + "grad_norm": 0.03190528601408005, + "learning_rate": 0.0004302019945559627, + "loss": 0.1422, + "num_input_tokens_seen": 156273792, + "step": 72350 + }, + { + "epoch": 11.80342577487765, + "grad_norm": 0.3652302026748657, + "learning_rate": 0.0004301315123951467, + "loss": 0.0394, + "num_input_tokens_seen": 156284352, + "step": 72355 + }, + { + "epoch": 11.804241435562806, + "grad_norm": 0.15005654096603394, + "learning_rate": 0.0004300610316503045, + "loss": 0.0115, + "num_input_tokens_seen": 156294720, + "step": 72360 + }, + { + "epoch": 11.80505709624796, + "grad_norm": 0.00928251352161169, + "learning_rate": 0.00042999055232286387, + "loss": 0.1893, + "num_input_tokens_seen": 156304256, + "step": 72365 + }, + { + "epoch": 11.805872756933116, + "grad_norm": 0.08327314257621765, + "learning_rate": 0.00042992007441425376, + "loss": 0.0146, + "num_input_tokens_seen": 156314560, + "step": 72370 + }, + { + "epoch": 11.80668841761827, + "grad_norm": 1.7067999839782715, + "learning_rate": 0.00042984959792590215, + "loss": 0.0457, + "num_input_tokens_seen": 156325696, + "step": 72375 + }, + { + "epoch": 11.807504078303426, + "grad_norm": 0.010561229661107063, + "learning_rate": 0.00042977912285923747, + "loss": 0.0905, + "num_input_tokens_seen": 156335584, + "step": 72380 + }, + { + "epoch": 11.808319738988581, + "grad_norm": 0.007480216212570667, + "learning_rate": 0.000429708649215688, + "loss": 0.0226, + "num_input_tokens_seen": 156345024, + "step": 72385 + }, + { + "epoch": 11.809135399673735, + "grad_norm": 0.9692756533622742, + "learning_rate": 0.00042963817699668183, + "loss": 0.0896, + "num_input_tokens_seen": 156354976, + "step": 72390 + }, + { + "epoch": 11.809951060358891, + "grad_norm": 0.14579908549785614, + "learning_rate": 0.0004295677062036472, + "loss": 0.0503, + "num_input_tokens_seen": 156366496, + "step": 72395 + }, + { + "epoch": 11.810766721044045, + "grad_norm": 0.015927450731396675, + "learning_rate": 0.00042949723683801256, + "loss": 0.0572, + "num_input_tokens_seen": 156377344, + "step": 72400 + }, + { + "epoch": 11.8115823817292, + "grad_norm": 0.0816449522972107, + "learning_rate": 0.0004294267689012057, + "loss": 0.1673, + "num_input_tokens_seen": 156388704, + "step": 72405 + }, + { + "epoch": 11.812398042414356, + "grad_norm": 0.26386797428131104, + "learning_rate": 0.000429356302394655, + "loss": 0.0143, + "num_input_tokens_seen": 156399872, + "step": 72410 + }, + { + "epoch": 11.81321370309951, + "grad_norm": 0.004181982949376106, + "learning_rate": 0.00042928583731978833, + "loss": 0.0192, + "num_input_tokens_seen": 156412800, + "step": 72415 + }, + { + "epoch": 11.814029363784666, + "grad_norm": 0.4576594829559326, + "learning_rate": 0.00042921537367803403, + "loss": 0.0298, + "num_input_tokens_seen": 156421824, + "step": 72420 + }, + { + "epoch": 11.81484502446982, + "grad_norm": 0.004218620248138905, + "learning_rate": 0.0004291449114708198, + "loss": 0.1628, + "num_input_tokens_seen": 156432896, + "step": 72425 + }, + { + "epoch": 11.815660685154976, + "grad_norm": 0.02189936861395836, + "learning_rate": 0.000429074450699574, + "loss": 0.0228, + "num_input_tokens_seen": 156443936, + "step": 72430 + }, + { + "epoch": 11.81647634584013, + "grad_norm": 0.005173725076019764, + "learning_rate": 0.0004290039913657243, + "loss": 0.0237, + "num_input_tokens_seen": 156455840, + "step": 72435 + }, + { + "epoch": 11.817292006525285, + "grad_norm": 0.4792866110801697, + "learning_rate": 0.00042893353347069887, + "loss": 0.0449, + "num_input_tokens_seen": 156466400, + "step": 72440 + }, + { + "epoch": 11.818107667210441, + "grad_norm": 0.0040856278501451015, + "learning_rate": 0.0004288630770159254, + "loss": 0.0086, + "num_input_tokens_seen": 156476096, + "step": 72445 + }, + { + "epoch": 11.818923327895595, + "grad_norm": 0.1025933250784874, + "learning_rate": 0.00042879262200283216, + "loss": 0.0369, + "num_input_tokens_seen": 156488096, + "step": 72450 + }, + { + "epoch": 11.81973898858075, + "grad_norm": 0.11138315498828888, + "learning_rate": 0.0004287221684328465, + "loss": 0.0209, + "num_input_tokens_seen": 156499008, + "step": 72455 + }, + { + "epoch": 11.820554649265905, + "grad_norm": 0.0017173081869259477, + "learning_rate": 0.00042865171630739654, + "loss": 0.0046, + "num_input_tokens_seen": 156508992, + "step": 72460 + }, + { + "epoch": 11.82137030995106, + "grad_norm": 0.001704095397144556, + "learning_rate": 0.0004285812656279102, + "loss": 0.0048, + "num_input_tokens_seen": 156517664, + "step": 72465 + }, + { + "epoch": 11.822185970636216, + "grad_norm": 0.010462358593940735, + "learning_rate": 0.000428510816395815, + "loss": 0.0669, + "num_input_tokens_seen": 156528608, + "step": 72470 + }, + { + "epoch": 11.82300163132137, + "grad_norm": 0.04697816073894501, + "learning_rate": 0.00042844036861253897, + "loss": 0.0765, + "num_input_tokens_seen": 156539776, + "step": 72475 + }, + { + "epoch": 11.823817292006526, + "grad_norm": 0.003039855509996414, + "learning_rate": 0.00042836992227950944, + "loss": 0.0023, + "num_input_tokens_seen": 156551488, + "step": 72480 + }, + { + "epoch": 11.82463295269168, + "grad_norm": 0.28987327218055725, + "learning_rate": 0.0004282994773981546, + "loss": 0.0569, + "num_input_tokens_seen": 156561952, + "step": 72485 + }, + { + "epoch": 11.825448613376835, + "grad_norm": 0.06460442394018173, + "learning_rate": 0.00042822903396990146, + "loss": 0.0267, + "num_input_tokens_seen": 156572448, + "step": 72490 + }, + { + "epoch": 11.826264274061991, + "grad_norm": 0.006775304209440947, + "learning_rate": 0.0004281585919961783, + "loss": 0.032, + "num_input_tokens_seen": 156581696, + "step": 72495 + }, + { + "epoch": 11.827079934747145, + "grad_norm": 0.03162192925810814, + "learning_rate": 0.00042808815147841214, + "loss": 0.0071, + "num_input_tokens_seen": 156592224, + "step": 72500 + }, + { + "epoch": 11.8278955954323, + "grad_norm": 0.1275867521762848, + "learning_rate": 0.0004280177124180311, + "loss": 0.1319, + "num_input_tokens_seen": 156602432, + "step": 72505 + }, + { + "epoch": 11.828711256117455, + "grad_norm": 0.0015156982699409127, + "learning_rate": 0.0004279472748164621, + "loss": 0.0439, + "num_input_tokens_seen": 156613856, + "step": 72510 + }, + { + "epoch": 11.82952691680261, + "grad_norm": 0.005000817123800516, + "learning_rate": 0.0004278768386751332, + "loss": 0.182, + "num_input_tokens_seen": 156624256, + "step": 72515 + }, + { + "epoch": 11.830342577487766, + "grad_norm": 0.003169822273775935, + "learning_rate": 0.0004278064039954716, + "loss": 0.0317, + "num_input_tokens_seen": 156635648, + "step": 72520 + }, + { + "epoch": 11.83115823817292, + "grad_norm": 0.41648584604263306, + "learning_rate": 0.00042773597077890485, + "loss": 0.1084, + "num_input_tokens_seen": 156645984, + "step": 72525 + }, + { + "epoch": 11.831973898858076, + "grad_norm": 0.14442989230155945, + "learning_rate": 0.0004276655390268603, + "loss": 0.0124, + "num_input_tokens_seen": 156657056, + "step": 72530 + }, + { + "epoch": 11.83278955954323, + "grad_norm": 0.009963775984942913, + "learning_rate": 0.0004275951087407653, + "loss": 0.1158, + "num_input_tokens_seen": 156668928, + "step": 72535 + }, + { + "epoch": 11.833605220228385, + "grad_norm": 0.005215761251747608, + "learning_rate": 0.0004275246799220473, + "loss": 0.0212, + "num_input_tokens_seen": 156679808, + "step": 72540 + }, + { + "epoch": 11.83442088091354, + "grad_norm": 0.01756799779832363, + "learning_rate": 0.0004274542525721338, + "loss": 0.0152, + "num_input_tokens_seen": 156690400, + "step": 72545 + }, + { + "epoch": 11.835236541598695, + "grad_norm": 0.010215381160378456, + "learning_rate": 0.00042738382669245157, + "loss": 0.0113, + "num_input_tokens_seen": 156701024, + "step": 72550 + }, + { + "epoch": 11.83605220228385, + "grad_norm": 0.33329635858535767, + "learning_rate": 0.0004273134022844285, + "loss": 0.0295, + "num_input_tokens_seen": 156712576, + "step": 72555 + }, + { + "epoch": 11.836867862969005, + "grad_norm": 0.004677685908973217, + "learning_rate": 0.00042724297934949136, + "loss": 0.0091, + "num_input_tokens_seen": 156723552, + "step": 72560 + }, + { + "epoch": 11.83768352365416, + "grad_norm": 0.014706281013786793, + "learning_rate": 0.0004271725578890675, + "loss": 0.0045, + "num_input_tokens_seen": 156735328, + "step": 72565 + }, + { + "epoch": 11.838499184339314, + "grad_norm": 0.052408941090106964, + "learning_rate": 0.00042710213790458435, + "loss": 0.0571, + "num_input_tokens_seen": 156746304, + "step": 72570 + }, + { + "epoch": 11.83931484502447, + "grad_norm": 0.005835378542542458, + "learning_rate": 0.00042703171939746865, + "loss": 0.0169, + "num_input_tokens_seen": 156758208, + "step": 72575 + }, + { + "epoch": 11.840130505709626, + "grad_norm": 0.018282631412148476, + "learning_rate": 0.00042696130236914796, + "loss": 0.1527, + "num_input_tokens_seen": 156767488, + "step": 72580 + }, + { + "epoch": 11.84094616639478, + "grad_norm": 0.25564491748809814, + "learning_rate": 0.00042689088682104886, + "loss": 0.1054, + "num_input_tokens_seen": 156778592, + "step": 72585 + }, + { + "epoch": 11.841761827079935, + "grad_norm": 0.0028037067968398333, + "learning_rate": 0.00042682047275459893, + "loss": 0.0059, + "num_input_tokens_seen": 156789888, + "step": 72590 + }, + { + "epoch": 11.84257748776509, + "grad_norm": 0.15276266634464264, + "learning_rate": 0.00042675006017122477, + "loss": 0.045, + "num_input_tokens_seen": 156799872, + "step": 72595 + }, + { + "epoch": 11.843393148450245, + "grad_norm": 0.003195535857230425, + "learning_rate": 0.0004266796490723538, + "loss": 0.008, + "num_input_tokens_seen": 156809632, + "step": 72600 + }, + { + "epoch": 11.844208809135399, + "grad_norm": 0.009690443985164165, + "learning_rate": 0.0004266092394594124, + "loss": 0.0188, + "num_input_tokens_seen": 156820064, + "step": 72605 + }, + { + "epoch": 11.845024469820554, + "grad_norm": 0.01867981068789959, + "learning_rate": 0.00042653883133382824, + "loss": 0.0901, + "num_input_tokens_seen": 156831424, + "step": 72610 + }, + { + "epoch": 11.84584013050571, + "grad_norm": 0.0158432237803936, + "learning_rate": 0.00042646842469702754, + "loss": 0.0193, + "num_input_tokens_seen": 156841664, + "step": 72615 + }, + { + "epoch": 11.846655791190864, + "grad_norm": 0.01342178788036108, + "learning_rate": 0.0004263980195504378, + "loss": 0.01, + "num_input_tokens_seen": 156853888, + "step": 72620 + }, + { + "epoch": 11.84747145187602, + "grad_norm": 0.025491604581475258, + "learning_rate": 0.0004263276158954853, + "loss": 0.012, + "num_input_tokens_seen": 156865184, + "step": 72625 + }, + { + "epoch": 11.848287112561174, + "grad_norm": 0.00676423916593194, + "learning_rate": 0.0004262572137335973, + "loss": 0.013, + "num_input_tokens_seen": 156876736, + "step": 72630 + }, + { + "epoch": 11.84910277324633, + "grad_norm": 0.021774085238575935, + "learning_rate": 0.00042618681306620025, + "loss": 0.0122, + "num_input_tokens_seen": 156886848, + "step": 72635 + }, + { + "epoch": 11.849918433931485, + "grad_norm": 0.005161866080015898, + "learning_rate": 0.00042611641389472127, + "loss": 0.0046, + "num_input_tokens_seen": 156898592, + "step": 72640 + }, + { + "epoch": 11.850734094616639, + "grad_norm": 0.0016783748287707567, + "learning_rate": 0.0004260460162205867, + "loss": 0.0036, + "num_input_tokens_seen": 156909888, + "step": 72645 + }, + { + "epoch": 11.851549755301795, + "grad_norm": 0.0034606284461915493, + "learning_rate": 0.0004259756200452236, + "loss": 0.0063, + "num_input_tokens_seen": 156920320, + "step": 72650 + }, + { + "epoch": 11.852365415986949, + "grad_norm": 0.006742651574313641, + "learning_rate": 0.00042590522537005825, + "loss": 0.0132, + "num_input_tokens_seen": 156930368, + "step": 72655 + }, + { + "epoch": 11.853181076672104, + "grad_norm": 0.3364547789096832, + "learning_rate": 0.00042583483219651763, + "loss": 0.0294, + "num_input_tokens_seen": 156940608, + "step": 72660 + }, + { + "epoch": 11.85399673735726, + "grad_norm": 0.015008768998086452, + "learning_rate": 0.0004257644405260282, + "loss": 0.0062, + "num_input_tokens_seen": 156951168, + "step": 72665 + }, + { + "epoch": 11.854812398042414, + "grad_norm": 0.010510805994272232, + "learning_rate": 0.0004256940503600166, + "loss": 0.0103, + "num_input_tokens_seen": 156961920, + "step": 72670 + }, + { + "epoch": 11.85562805872757, + "grad_norm": 0.015424901619553566, + "learning_rate": 0.00042562366169990936, + "loss": 0.0085, + "num_input_tokens_seen": 156973888, + "step": 72675 + }, + { + "epoch": 11.856443719412724, + "grad_norm": 0.1747453808784485, + "learning_rate": 0.00042555327454713276, + "loss": 0.0947, + "num_input_tokens_seen": 156984448, + "step": 72680 + }, + { + "epoch": 11.85725938009788, + "grad_norm": 0.006023809779435396, + "learning_rate": 0.0004254828889031137, + "loss": 0.0107, + "num_input_tokens_seen": 156996864, + "step": 72685 + }, + { + "epoch": 11.858075040783035, + "grad_norm": 0.007003793492913246, + "learning_rate": 0.0004254125047692784, + "loss": 0.0661, + "num_input_tokens_seen": 157007424, + "step": 72690 + }, + { + "epoch": 11.858890701468189, + "grad_norm": 0.008758111856877804, + "learning_rate": 0.00042534212214705326, + "loss": 0.0322, + "num_input_tokens_seen": 157018624, + "step": 72695 + }, + { + "epoch": 11.859706362153345, + "grad_norm": 0.0082249129191041, + "learning_rate": 0.0004252717410378648, + "loss": 0.05, + "num_input_tokens_seen": 157028768, + "step": 72700 + }, + { + "epoch": 11.860522022838499, + "grad_norm": 0.00721960561349988, + "learning_rate": 0.00042520136144313925, + "loss": 0.0335, + "num_input_tokens_seen": 157041248, + "step": 72705 + }, + { + "epoch": 11.861337683523654, + "grad_norm": 0.010221786797046661, + "learning_rate": 0.0004251309833643029, + "loss": 0.0218, + "num_input_tokens_seen": 157051616, + "step": 72710 + }, + { + "epoch": 11.86215334420881, + "grad_norm": 0.015197236090898514, + "learning_rate": 0.00042506060680278234, + "loss": 0.0374, + "num_input_tokens_seen": 157062336, + "step": 72715 + }, + { + "epoch": 11.862969004893964, + "grad_norm": 0.45400407910346985, + "learning_rate": 0.00042499023176000353, + "loss": 0.0753, + "num_input_tokens_seen": 157072256, + "step": 72720 + }, + { + "epoch": 11.86378466557912, + "grad_norm": 0.12778514623641968, + "learning_rate": 0.000424919858237393, + "loss": 0.0286, + "num_input_tokens_seen": 157082848, + "step": 72725 + }, + { + "epoch": 11.864600326264274, + "grad_norm": 0.491372287273407, + "learning_rate": 0.00042484948623637656, + "loss": 0.0654, + "num_input_tokens_seen": 157094080, + "step": 72730 + }, + { + "epoch": 11.86541598694943, + "grad_norm": 0.003955055959522724, + "learning_rate": 0.0004247791157583808, + "loss": 0.0077, + "num_input_tokens_seen": 157104416, + "step": 72735 + }, + { + "epoch": 11.866231647634583, + "grad_norm": 0.006338398437947035, + "learning_rate": 0.0004247087468048315, + "loss": 0.0162, + "num_input_tokens_seen": 157114080, + "step": 72740 + }, + { + "epoch": 11.867047308319739, + "grad_norm": 0.004220007918775082, + "learning_rate": 0.00042463837937715515, + "loss": 0.0978, + "num_input_tokens_seen": 157124192, + "step": 72745 + }, + { + "epoch": 11.867862969004895, + "grad_norm": 0.0883263647556305, + "learning_rate": 0.0004245680134767775, + "loss": 0.0224, + "num_input_tokens_seen": 157135360, + "step": 72750 + }, + { + "epoch": 11.868678629690049, + "grad_norm": 0.0914829820394516, + "learning_rate": 0.0004244976491051249, + "loss": 0.0169, + "num_input_tokens_seen": 157146208, + "step": 72755 + }, + { + "epoch": 11.869494290375204, + "grad_norm": 0.04714475944638252, + "learning_rate": 0.00042442728626362306, + "loss": 0.08, + "num_input_tokens_seen": 157156928, + "step": 72760 + }, + { + "epoch": 11.870309951060358, + "grad_norm": 0.005293759051710367, + "learning_rate": 0.00042435692495369824, + "loss": 0.0727, + "num_input_tokens_seen": 157167936, + "step": 72765 + }, + { + "epoch": 11.871125611745514, + "grad_norm": 0.014321415685117245, + "learning_rate": 0.0004242865651767762, + "loss": 0.1189, + "num_input_tokens_seen": 157178272, + "step": 72770 + }, + { + "epoch": 11.87194127243067, + "grad_norm": 0.0015413248911499977, + "learning_rate": 0.0004242162069342831, + "loss": 0.0057, + "num_input_tokens_seen": 157189408, + "step": 72775 + }, + { + "epoch": 11.872756933115824, + "grad_norm": 0.012533880770206451, + "learning_rate": 0.0004241458502276446, + "loss": 0.0094, + "num_input_tokens_seen": 157200704, + "step": 72780 + }, + { + "epoch": 11.87357259380098, + "grad_norm": 0.16592223942279816, + "learning_rate": 0.00042407549505828657, + "loss": 0.0206, + "num_input_tokens_seen": 157213280, + "step": 72785 + }, + { + "epoch": 11.874388254486133, + "grad_norm": 0.11769827455282211, + "learning_rate": 0.0004240051414276352, + "loss": 0.0478, + "num_input_tokens_seen": 157223680, + "step": 72790 + }, + { + "epoch": 11.875203915171289, + "grad_norm": 0.017359424382448196, + "learning_rate": 0.00042393478933711585, + "loss": 0.0186, + "num_input_tokens_seen": 157233664, + "step": 72795 + }, + { + "epoch": 11.876019575856443, + "grad_norm": 0.05703236162662506, + "learning_rate": 0.0004238644387881546, + "loss": 0.0218, + "num_input_tokens_seen": 157244192, + "step": 72800 + }, + { + "epoch": 11.876835236541599, + "grad_norm": 0.030190356075763702, + "learning_rate": 0.000423794089782177, + "loss": 0.0064, + "num_input_tokens_seen": 157254080, + "step": 72805 + }, + { + "epoch": 11.877650897226754, + "grad_norm": 0.0018631864804774523, + "learning_rate": 0.000423723742320609, + "loss": 0.0051, + "num_input_tokens_seen": 157265824, + "step": 72810 + }, + { + "epoch": 11.878466557911908, + "grad_norm": 0.17252780497074127, + "learning_rate": 0.00042365339640487596, + "loss": 0.0133, + "num_input_tokens_seen": 157277280, + "step": 72815 + }, + { + "epoch": 11.879282218597064, + "grad_norm": 0.08619865775108337, + "learning_rate": 0.0004235830520364038, + "loss": 0.0658, + "num_input_tokens_seen": 157287584, + "step": 72820 + }, + { + "epoch": 11.880097879282218, + "grad_norm": 0.007091619074344635, + "learning_rate": 0.0004235127092166179, + "loss": 0.0053, + "num_input_tokens_seen": 157298816, + "step": 72825 + }, + { + "epoch": 11.880913539967374, + "grad_norm": 0.0057108355686068535, + "learning_rate": 0.0004234423679469441, + "loss": 0.0201, + "num_input_tokens_seen": 157310304, + "step": 72830 + }, + { + "epoch": 11.88172920065253, + "grad_norm": 0.009273702278733253, + "learning_rate": 0.0004233720282288078, + "loss": 0.0125, + "num_input_tokens_seen": 157319840, + "step": 72835 + }, + { + "epoch": 11.882544861337683, + "grad_norm": 0.7806374430656433, + "learning_rate": 0.00042330169006363455, + "loss": 0.094, + "num_input_tokens_seen": 157329984, + "step": 72840 + }, + { + "epoch": 11.883360522022839, + "grad_norm": 0.005861148703843355, + "learning_rate": 0.0004232313534528499, + "loss": 0.0059, + "num_input_tokens_seen": 157340768, + "step": 72845 + }, + { + "epoch": 11.884176182707993, + "grad_norm": 0.009260977618396282, + "learning_rate": 0.00042316101839787916, + "loss": 0.0863, + "num_input_tokens_seen": 157351488, + "step": 72850 + }, + { + "epoch": 11.884991843393149, + "grad_norm": 0.07549002766609192, + "learning_rate": 0.00042309068490014787, + "loss": 0.065, + "num_input_tokens_seen": 157363392, + "step": 72855 + }, + { + "epoch": 11.885807504078304, + "grad_norm": 0.5480133295059204, + "learning_rate": 0.00042302035296108156, + "loss": 0.0349, + "num_input_tokens_seen": 157374496, + "step": 72860 + }, + { + "epoch": 11.886623164763458, + "grad_norm": 0.0028016124852001667, + "learning_rate": 0.00042295002258210525, + "loss": 0.0144, + "num_input_tokens_seen": 157385856, + "step": 72865 + }, + { + "epoch": 11.887438825448614, + "grad_norm": 0.011609912849962711, + "learning_rate": 0.00042287969376464466, + "loss": 0.0094, + "num_input_tokens_seen": 157396512, + "step": 72870 + }, + { + "epoch": 11.888254486133768, + "grad_norm": 0.003198280232027173, + "learning_rate": 0.0004228093665101247, + "loss": 0.0068, + "num_input_tokens_seen": 157407328, + "step": 72875 + }, + { + "epoch": 11.889070146818923, + "grad_norm": 0.005472021643072367, + "learning_rate": 0.00042273904081997115, + "loss": 0.025, + "num_input_tokens_seen": 157417376, + "step": 72880 + }, + { + "epoch": 11.88988580750408, + "grad_norm": 0.7649688720703125, + "learning_rate": 0.0004226687166956087, + "loss": 0.0381, + "num_input_tokens_seen": 157430624, + "step": 72885 + }, + { + "epoch": 11.890701468189233, + "grad_norm": 0.004005796741694212, + "learning_rate": 0.00042259839413846275, + "loss": 0.1093, + "num_input_tokens_seen": 157442336, + "step": 72890 + }, + { + "epoch": 11.891517128874389, + "grad_norm": 0.034606240689754486, + "learning_rate": 0.0004225280731499588, + "loss": 0.0074, + "num_input_tokens_seen": 157452064, + "step": 72895 + }, + { + "epoch": 11.892332789559543, + "grad_norm": 0.028569230809807777, + "learning_rate": 0.00042245775373152153, + "loss": 0.0157, + "num_input_tokens_seen": 157463264, + "step": 72900 + }, + { + "epoch": 11.893148450244698, + "grad_norm": 0.018665973097085953, + "learning_rate": 0.0004223874358845764, + "loss": 0.0219, + "num_input_tokens_seen": 157475296, + "step": 72905 + }, + { + "epoch": 11.893964110929852, + "grad_norm": 0.004474216606467962, + "learning_rate": 0.0004223171196105482, + "loss": 0.0412, + "num_input_tokens_seen": 157486176, + "step": 72910 + }, + { + "epoch": 11.894779771615008, + "grad_norm": 0.005233396776020527, + "learning_rate": 0.0004222468049108623, + "loss": 0.0067, + "num_input_tokens_seen": 157497952, + "step": 72915 + }, + { + "epoch": 11.895595432300164, + "grad_norm": 0.006671491544693708, + "learning_rate": 0.00042217649178694327, + "loss": 0.0047, + "num_input_tokens_seen": 157509248, + "step": 72920 + }, + { + "epoch": 11.896411092985318, + "grad_norm": 0.007623288314789534, + "learning_rate": 0.00042210618024021663, + "loss": 0.0022, + "num_input_tokens_seen": 157519136, + "step": 72925 + }, + { + "epoch": 11.897226753670473, + "grad_norm": 0.03528051823377609, + "learning_rate": 0.00042203587027210684, + "loss": 0.0392, + "num_input_tokens_seen": 157529824, + "step": 72930 + }, + { + "epoch": 11.898042414355627, + "grad_norm": 0.5511897802352905, + "learning_rate": 0.00042196556188403924, + "loss": 0.1273, + "num_input_tokens_seen": 157540704, + "step": 72935 + }, + { + "epoch": 11.898858075040783, + "grad_norm": 0.015145723707973957, + "learning_rate": 0.0004218952550774383, + "loss": 0.0138, + "num_input_tokens_seen": 157552640, + "step": 72940 + }, + { + "epoch": 11.899673735725939, + "grad_norm": 0.3677273690700531, + "learning_rate": 0.00042182494985372937, + "loss": 0.0796, + "num_input_tokens_seen": 157562816, + "step": 72945 + }, + { + "epoch": 11.900489396411093, + "grad_norm": 0.03927216678857803, + "learning_rate": 0.0004217546462143368, + "loss": 0.0473, + "num_input_tokens_seen": 157574496, + "step": 72950 + }, + { + "epoch": 11.901305057096248, + "grad_norm": 0.0031949521508067846, + "learning_rate": 0.0004216843441606857, + "loss": 0.0693, + "num_input_tokens_seen": 157584832, + "step": 72955 + }, + { + "epoch": 11.902120717781402, + "grad_norm": 0.005437622778117657, + "learning_rate": 0.0004216140436942006, + "loss": 0.0939, + "num_input_tokens_seen": 157595520, + "step": 72960 + }, + { + "epoch": 11.902936378466558, + "grad_norm": 0.007426468189805746, + "learning_rate": 0.0004215437448163065, + "loss": 0.0326, + "num_input_tokens_seen": 157607520, + "step": 72965 + }, + { + "epoch": 11.903752039151712, + "grad_norm": 0.008727246895432472, + "learning_rate": 0.00042147344752842774, + "loss": 0.0091, + "num_input_tokens_seen": 157619520, + "step": 72970 + }, + { + "epoch": 11.904567699836868, + "grad_norm": 0.7979373335838318, + "learning_rate": 0.0004214031518319893, + "loss": 0.1674, + "num_input_tokens_seen": 157630400, + "step": 72975 + }, + { + "epoch": 11.905383360522023, + "grad_norm": 0.012567078694701195, + "learning_rate": 0.0004213328577284157, + "loss": 0.1404, + "num_input_tokens_seen": 157642080, + "step": 72980 + }, + { + "epoch": 11.906199021207177, + "grad_norm": 0.07097362726926804, + "learning_rate": 0.0004212625652191315, + "loss": 0.0049, + "num_input_tokens_seen": 157653504, + "step": 72985 + }, + { + "epoch": 11.907014681892333, + "grad_norm": 0.13373368978500366, + "learning_rate": 0.00042119227430556137, + "loss": 0.0298, + "num_input_tokens_seen": 157665280, + "step": 72990 + }, + { + "epoch": 11.907830342577487, + "grad_norm": 0.012603395618498325, + "learning_rate": 0.0004211219849891296, + "loss": 0.0817, + "num_input_tokens_seen": 157675136, + "step": 72995 + }, + { + "epoch": 11.908646003262643, + "grad_norm": 0.0021373082417994738, + "learning_rate": 0.00042105169727126094, + "loss": 0.1518, + "num_input_tokens_seen": 157684544, + "step": 73000 + }, + { + "epoch": 11.909461663947798, + "grad_norm": 0.016298236325383186, + "learning_rate": 0.00042098141115337986, + "loss": 0.0065, + "num_input_tokens_seen": 157694720, + "step": 73005 + }, + { + "epoch": 11.910277324632952, + "grad_norm": 0.015862375497817993, + "learning_rate": 0.0004209111266369107, + "loss": 0.0309, + "num_input_tokens_seen": 157705568, + "step": 73010 + }, + { + "epoch": 11.911092985318108, + "grad_norm": 0.007350914645940065, + "learning_rate": 0.0004208408437232779, + "loss": 0.0091, + "num_input_tokens_seen": 157717248, + "step": 73015 + }, + { + "epoch": 11.911908646003262, + "grad_norm": 0.5517430305480957, + "learning_rate": 0.00042077056241390586, + "loss": 0.0344, + "num_input_tokens_seen": 157728096, + "step": 73020 + }, + { + "epoch": 11.912724306688418, + "grad_norm": 0.044113751500844955, + "learning_rate": 0.00042070028271021877, + "loss": 0.1243, + "num_input_tokens_seen": 157738720, + "step": 73025 + }, + { + "epoch": 11.913539967373573, + "grad_norm": 0.0020097021479159594, + "learning_rate": 0.0004206300046136412, + "loss": 0.0363, + "num_input_tokens_seen": 157748768, + "step": 73030 + }, + { + "epoch": 11.914355628058727, + "grad_norm": 0.013776198029518127, + "learning_rate": 0.00042055972812559707, + "loss": 0.0945, + "num_input_tokens_seen": 157759392, + "step": 73035 + }, + { + "epoch": 11.915171288743883, + "grad_norm": 0.0191465113312006, + "learning_rate": 0.0004204894532475111, + "loss": 0.0793, + "num_input_tokens_seen": 157770976, + "step": 73040 + }, + { + "epoch": 11.915986949429037, + "grad_norm": 0.002885065972805023, + "learning_rate": 0.00042041917998080695, + "loss": 0.021, + "num_input_tokens_seen": 157782080, + "step": 73045 + }, + { + "epoch": 11.916802610114193, + "grad_norm": 0.006058351136744022, + "learning_rate": 0.0004203489083269093, + "loss": 0.0599, + "num_input_tokens_seen": 157793088, + "step": 73050 + }, + { + "epoch": 11.917618270799348, + "grad_norm": 0.016492463648319244, + "learning_rate": 0.0004202786382872419, + "loss": 0.1625, + "num_input_tokens_seen": 157805056, + "step": 73055 + }, + { + "epoch": 11.918433931484502, + "grad_norm": 0.03171005845069885, + "learning_rate": 0.00042020836986322917, + "loss": 0.0166, + "num_input_tokens_seen": 157815072, + "step": 73060 + }, + { + "epoch": 11.919249592169658, + "grad_norm": 0.0331353098154068, + "learning_rate": 0.0004201381030562949, + "loss": 0.0128, + "num_input_tokens_seen": 157824480, + "step": 73065 + }, + { + "epoch": 11.920065252854812, + "grad_norm": 0.12538665533065796, + "learning_rate": 0.00042006783786786346, + "loss": 0.0305, + "num_input_tokens_seen": 157836928, + "step": 73070 + }, + { + "epoch": 11.920880913539968, + "grad_norm": 0.005170703399926424, + "learning_rate": 0.0004199975742993585, + "loss": 0.0047, + "num_input_tokens_seen": 157848704, + "step": 73075 + }, + { + "epoch": 11.921696574225122, + "grad_norm": 0.04474220797419548, + "learning_rate": 0.0004199273123522044, + "loss": 0.0444, + "num_input_tokens_seen": 157860992, + "step": 73080 + }, + { + "epoch": 11.922512234910277, + "grad_norm": 0.006513546220958233, + "learning_rate": 0.00041985705202782464, + "loss": 0.0557, + "num_input_tokens_seen": 157872128, + "step": 73085 + }, + { + "epoch": 11.923327895595433, + "grad_norm": 0.042542729526758194, + "learning_rate": 0.00041978679332764366, + "loss": 0.0479, + "num_input_tokens_seen": 157883776, + "step": 73090 + }, + { + "epoch": 11.924143556280587, + "grad_norm": 0.007474198471754789, + "learning_rate": 0.0004197165362530848, + "loss": 0.063, + "num_input_tokens_seen": 157893984, + "step": 73095 + }, + { + "epoch": 11.924959216965743, + "grad_norm": 0.019985618069767952, + "learning_rate": 0.00041964628080557224, + "loss": 0.0069, + "num_input_tokens_seen": 157904960, + "step": 73100 + }, + { + "epoch": 11.925774877650896, + "grad_norm": 0.002067849040031433, + "learning_rate": 0.0004195760269865299, + "loss": 0.016, + "num_input_tokens_seen": 157914816, + "step": 73105 + }, + { + "epoch": 11.926590538336052, + "grad_norm": 0.39414486289024353, + "learning_rate": 0.0004195057747973812, + "loss": 0.2092, + "num_input_tokens_seen": 157926080, + "step": 73110 + }, + { + "epoch": 11.927406199021208, + "grad_norm": 0.0032160452101379633, + "learning_rate": 0.0004194355242395503, + "loss": 0.0482, + "num_input_tokens_seen": 157936992, + "step": 73115 + }, + { + "epoch": 11.928221859706362, + "grad_norm": 0.4254874289035797, + "learning_rate": 0.00041936527531446046, + "loss": 0.1429, + "num_input_tokens_seen": 157948864, + "step": 73120 + }, + { + "epoch": 11.929037520391518, + "grad_norm": 0.14057868719100952, + "learning_rate": 0.0004192950280235359, + "loss": 0.0168, + "num_input_tokens_seen": 157958912, + "step": 73125 + }, + { + "epoch": 11.929853181076671, + "grad_norm": 0.008988683111965656, + "learning_rate": 0.0004192247823681997, + "loss": 0.0432, + "num_input_tokens_seen": 157968704, + "step": 73130 + }, + { + "epoch": 11.930668841761827, + "grad_norm": 0.018005967140197754, + "learning_rate": 0.00041915453834987594, + "loss": 0.0148, + "num_input_tokens_seen": 157978944, + "step": 73135 + }, + { + "epoch": 11.931484502446983, + "grad_norm": 0.26343950629234314, + "learning_rate": 0.0004190842959699879, + "loss": 0.0194, + "num_input_tokens_seen": 157989504, + "step": 73140 + }, + { + "epoch": 11.932300163132137, + "grad_norm": 0.10334479063749313, + "learning_rate": 0.0004190140552299593, + "loss": 0.0142, + "num_input_tokens_seen": 157999904, + "step": 73145 + }, + { + "epoch": 11.933115823817293, + "grad_norm": 0.18696285784244537, + "learning_rate": 0.0004189438161312136, + "loss": 0.0162, + "num_input_tokens_seen": 158011168, + "step": 73150 + }, + { + "epoch": 11.933931484502446, + "grad_norm": 0.003918380010873079, + "learning_rate": 0.00041887357867517435, + "loss": 0.0469, + "num_input_tokens_seen": 158022176, + "step": 73155 + }, + { + "epoch": 11.934747145187602, + "grad_norm": 0.35251501202583313, + "learning_rate": 0.0004188033428632649, + "loss": 0.0338, + "num_input_tokens_seen": 158033408, + "step": 73160 + }, + { + "epoch": 11.935562805872756, + "grad_norm": 0.0022698971442878246, + "learning_rate": 0.00041873310869690875, + "loss": 0.0141, + "num_input_tokens_seen": 158045056, + "step": 73165 + }, + { + "epoch": 11.936378466557912, + "grad_norm": 0.0054527875036001205, + "learning_rate": 0.00041866287617752906, + "loss": 0.0126, + "num_input_tokens_seen": 158056352, + "step": 73170 + }, + { + "epoch": 11.937194127243067, + "grad_norm": 0.002886369824409485, + "learning_rate": 0.0004185926453065496, + "loss": 0.0076, + "num_input_tokens_seen": 158067488, + "step": 73175 + }, + { + "epoch": 11.938009787928221, + "grad_norm": 0.002251496771350503, + "learning_rate": 0.0004185224160853933, + "loss": 0.1491, + "num_input_tokens_seen": 158079488, + "step": 73180 + }, + { + "epoch": 11.938825448613377, + "grad_norm": 0.06846843659877777, + "learning_rate": 0.00041845218851548375, + "loss": 0.0121, + "num_input_tokens_seen": 158091232, + "step": 73185 + }, + { + "epoch": 11.939641109298531, + "grad_norm": 0.0593634694814682, + "learning_rate": 0.0004183819625982439, + "loss": 0.045, + "num_input_tokens_seen": 158102112, + "step": 73190 + }, + { + "epoch": 11.940456769983687, + "grad_norm": 0.007378603331744671, + "learning_rate": 0.0004183117383350973, + "loss": 0.0152, + "num_input_tokens_seen": 158114112, + "step": 73195 + }, + { + "epoch": 11.941272430668842, + "grad_norm": 0.005221458151936531, + "learning_rate": 0.0004182415157274668, + "loss": 0.0204, + "num_input_tokens_seen": 158124672, + "step": 73200 + }, + { + "epoch": 11.942088091353996, + "grad_norm": 0.04406864568591118, + "learning_rate": 0.00041817129477677564, + "loss": 0.0173, + "num_input_tokens_seen": 158134816, + "step": 73205 + }, + { + "epoch": 11.942903752039152, + "grad_norm": 0.004383188672363758, + "learning_rate": 0.0004181010754844472, + "loss": 0.0635, + "num_input_tokens_seen": 158145376, + "step": 73210 + }, + { + "epoch": 11.943719412724306, + "grad_norm": 0.00374322896823287, + "learning_rate": 0.00041803085785190416, + "loss": 0.0327, + "num_input_tokens_seen": 158156064, + "step": 73215 + }, + { + "epoch": 11.944535073409462, + "grad_norm": 0.010824406519532204, + "learning_rate": 0.00041796064188057, + "loss": 0.0089, + "num_input_tokens_seen": 158166176, + "step": 73220 + }, + { + "epoch": 11.945350734094617, + "grad_norm": 0.01288297027349472, + "learning_rate": 0.00041789042757186726, + "loss": 0.0273, + "num_input_tokens_seen": 158177184, + "step": 73225 + }, + { + "epoch": 11.946166394779771, + "grad_norm": 0.055671948939561844, + "learning_rate": 0.00041782021492721937, + "loss": 0.1799, + "num_input_tokens_seen": 158188416, + "step": 73230 + }, + { + "epoch": 11.946982055464927, + "grad_norm": 0.02129376120865345, + "learning_rate": 0.00041775000394804896, + "loss": 0.0095, + "num_input_tokens_seen": 158200256, + "step": 73235 + }, + { + "epoch": 11.947797716150081, + "grad_norm": 0.054272472858428955, + "learning_rate": 0.0004176797946357792, + "loss": 0.0122, + "num_input_tokens_seen": 158211776, + "step": 73240 + }, + { + "epoch": 11.948613376835237, + "grad_norm": 0.02800886332988739, + "learning_rate": 0.00041760958699183263, + "loss": 0.1102, + "num_input_tokens_seen": 158222944, + "step": 73245 + }, + { + "epoch": 11.949429037520392, + "grad_norm": 0.11227209866046906, + "learning_rate": 0.0004175393810176325, + "loss": 0.0936, + "num_input_tokens_seen": 158233600, + "step": 73250 + }, + { + "epoch": 11.950244698205546, + "grad_norm": 0.013811938464641571, + "learning_rate": 0.00041746917671460124, + "loss": 0.0141, + "num_input_tokens_seen": 158243360, + "step": 73255 + }, + { + "epoch": 11.951060358890702, + "grad_norm": 0.10094699263572693, + "learning_rate": 0.000417398974084162, + "loss": 0.0198, + "num_input_tokens_seen": 158253600, + "step": 73260 + }, + { + "epoch": 11.951876019575856, + "grad_norm": 0.05100074037909508, + "learning_rate": 0.0004173287731277371, + "loss": 0.1161, + "num_input_tokens_seen": 158265984, + "step": 73265 + }, + { + "epoch": 11.952691680261012, + "grad_norm": 0.009566979482769966, + "learning_rate": 0.00041725857384674974, + "loss": 0.0074, + "num_input_tokens_seen": 158277536, + "step": 73270 + }, + { + "epoch": 11.953507340946166, + "grad_norm": 0.004473550245165825, + "learning_rate": 0.0004171883762426221, + "loss": 0.0099, + "num_input_tokens_seen": 158288576, + "step": 73275 + }, + { + "epoch": 11.954323001631321, + "grad_norm": 0.004220154602080584, + "learning_rate": 0.00041711818031677737, + "loss": 0.0092, + "num_input_tokens_seen": 158300128, + "step": 73280 + }, + { + "epoch": 11.955138662316477, + "grad_norm": 0.022628581151366234, + "learning_rate": 0.00041704798607063756, + "loss": 0.019, + "num_input_tokens_seen": 158310400, + "step": 73285 + }, + { + "epoch": 11.955954323001631, + "grad_norm": 0.0035848692059516907, + "learning_rate": 0.0004169777935056257, + "loss": 0.0092, + "num_input_tokens_seen": 158320448, + "step": 73290 + }, + { + "epoch": 11.956769983686787, + "grad_norm": 0.13575603067874908, + "learning_rate": 0.00041690760262316415, + "loss": 0.0417, + "num_input_tokens_seen": 158330176, + "step": 73295 + }, + { + "epoch": 11.95758564437194, + "grad_norm": 0.03912244364619255, + "learning_rate": 0.0004168374134246754, + "loss": 0.0483, + "num_input_tokens_seen": 158340352, + "step": 73300 + }, + { + "epoch": 11.958401305057096, + "grad_norm": 0.006890024524182081, + "learning_rate": 0.000416767225911582, + "loss": 0.0818, + "num_input_tokens_seen": 158351904, + "step": 73305 + }, + { + "epoch": 11.959216965742252, + "grad_norm": 0.0030039497651159763, + "learning_rate": 0.0004166970400853064, + "loss": 0.0169, + "num_input_tokens_seen": 158363360, + "step": 73310 + }, + { + "epoch": 11.960032626427406, + "grad_norm": 0.006594918668270111, + "learning_rate": 0.00041662685594727076, + "loss": 0.0462, + "num_input_tokens_seen": 158374080, + "step": 73315 + }, + { + "epoch": 11.960848287112562, + "grad_norm": 0.058372244238853455, + "learning_rate": 0.0004165566734988979, + "loss": 0.0248, + "num_input_tokens_seen": 158385792, + "step": 73320 + }, + { + "epoch": 11.961663947797716, + "grad_norm": 0.06784452497959137, + "learning_rate": 0.00041648649274160976, + "loss": 0.0104, + "num_input_tokens_seen": 158395680, + "step": 73325 + }, + { + "epoch": 11.962479608482871, + "grad_norm": 0.010779723525047302, + "learning_rate": 0.0004164163136768289, + "loss": 0.0506, + "num_input_tokens_seen": 158407552, + "step": 73330 + }, + { + "epoch": 11.963295269168025, + "grad_norm": 0.21537868678569794, + "learning_rate": 0.0004163461363059774, + "loss": 0.0242, + "num_input_tokens_seen": 158418304, + "step": 73335 + }, + { + "epoch": 11.964110929853181, + "grad_norm": 0.004940703511238098, + "learning_rate": 0.00041627596063047753, + "loss": 0.0049, + "num_input_tokens_seen": 158429248, + "step": 73340 + }, + { + "epoch": 11.964926590538337, + "grad_norm": 0.139415442943573, + "learning_rate": 0.00041620578665175166, + "loss": 0.0116, + "num_input_tokens_seen": 158439840, + "step": 73345 + }, + { + "epoch": 11.96574225122349, + "grad_norm": 0.47116124629974365, + "learning_rate": 0.00041613561437122163, + "loss": 0.037, + "num_input_tokens_seen": 158449312, + "step": 73350 + }, + { + "epoch": 11.966557911908646, + "grad_norm": 0.001149240881204605, + "learning_rate": 0.0004160654437903101, + "loss": 0.0128, + "num_input_tokens_seen": 158460800, + "step": 73355 + }, + { + "epoch": 11.9673735725938, + "grad_norm": 0.018407588824629784, + "learning_rate": 0.0004159952749104385, + "loss": 0.0127, + "num_input_tokens_seen": 158471648, + "step": 73360 + }, + { + "epoch": 11.968189233278956, + "grad_norm": 0.004824475850909948, + "learning_rate": 0.00041592510773302946, + "loss": 0.01, + "num_input_tokens_seen": 158482944, + "step": 73365 + }, + { + "epoch": 11.969004893964112, + "grad_norm": 0.4063175320625305, + "learning_rate": 0.0004158549422595045, + "loss": 0.0741, + "num_input_tokens_seen": 158493856, + "step": 73370 + }, + { + "epoch": 11.969820554649266, + "grad_norm": 0.007951868698000908, + "learning_rate": 0.0004157847784912861, + "loss": 0.0037, + "num_input_tokens_seen": 158503680, + "step": 73375 + }, + { + "epoch": 11.970636215334421, + "grad_norm": 0.0126974331215024, + "learning_rate": 0.0004157146164297959, + "loss": 0.0307, + "num_input_tokens_seen": 158514336, + "step": 73380 + }, + { + "epoch": 11.971451876019575, + "grad_norm": 0.02527419850230217, + "learning_rate": 0.00041564445607645607, + "loss": 0.0166, + "num_input_tokens_seen": 158525088, + "step": 73385 + }, + { + "epoch": 11.97226753670473, + "grad_norm": 0.413861483335495, + "learning_rate": 0.0004155742974326881, + "loss": 0.1583, + "num_input_tokens_seen": 158536832, + "step": 73390 + }, + { + "epoch": 11.973083197389887, + "grad_norm": 0.009883550927042961, + "learning_rate": 0.00041550414049991435, + "loss": 0.0024, + "num_input_tokens_seen": 158547296, + "step": 73395 + }, + { + "epoch": 11.97389885807504, + "grad_norm": 0.04352164641022682, + "learning_rate": 0.0004154339852795562, + "loss": 0.0394, + "num_input_tokens_seen": 158558464, + "step": 73400 + }, + { + "epoch": 11.974714518760196, + "grad_norm": 0.002612270647659898, + "learning_rate": 0.0004153638317730358, + "loss": 0.0099, + "num_input_tokens_seen": 158569888, + "step": 73405 + }, + { + "epoch": 11.97553017944535, + "grad_norm": 0.011124800890684128, + "learning_rate": 0.00041529367998177446, + "loss": 0.0074, + "num_input_tokens_seen": 158580992, + "step": 73410 + }, + { + "epoch": 11.976345840130506, + "grad_norm": 0.01622111164033413, + "learning_rate": 0.00041522352990719434, + "loss": 0.0269, + "num_input_tokens_seen": 158593056, + "step": 73415 + }, + { + "epoch": 11.977161500815662, + "grad_norm": 0.002231568330898881, + "learning_rate": 0.0004151533815507168, + "loss": 0.0884, + "num_input_tokens_seen": 158604128, + "step": 73420 + }, + { + "epoch": 11.977977161500815, + "grad_norm": 0.1311320662498474, + "learning_rate": 0.00041508323491376364, + "loss": 0.0209, + "num_input_tokens_seen": 158615424, + "step": 73425 + }, + { + "epoch": 11.978792822185971, + "grad_norm": 0.002909077098593116, + "learning_rate": 0.00041501308999775664, + "loss": 0.0178, + "num_input_tokens_seen": 158627456, + "step": 73430 + }, + { + "epoch": 11.979608482871125, + "grad_norm": 0.30766910314559937, + "learning_rate": 0.00041494294680411695, + "loss": 0.0986, + "num_input_tokens_seen": 158638016, + "step": 73435 + }, + { + "epoch": 11.98042414355628, + "grad_norm": 0.37345361709594727, + "learning_rate": 0.0004148728053342665, + "loss": 0.0145, + "num_input_tokens_seen": 158649184, + "step": 73440 + }, + { + "epoch": 11.981239804241435, + "grad_norm": 0.018977565690875053, + "learning_rate": 0.0004148026655896265, + "loss": 0.0061, + "num_input_tokens_seen": 158661152, + "step": 73445 + }, + { + "epoch": 11.98205546492659, + "grad_norm": 0.04246421530842781, + "learning_rate": 0.0004147325275716188, + "loss": 0.0122, + "num_input_tokens_seen": 158670976, + "step": 73450 + }, + { + "epoch": 11.982871125611746, + "grad_norm": 0.0053014010190963745, + "learning_rate": 0.00041466239128166435, + "loss": 0.0141, + "num_input_tokens_seen": 158682080, + "step": 73455 + }, + { + "epoch": 11.9836867862969, + "grad_norm": 0.019669989123940468, + "learning_rate": 0.00041459225672118487, + "loss": 0.0445, + "num_input_tokens_seen": 158691232, + "step": 73460 + }, + { + "epoch": 11.984502446982056, + "grad_norm": 0.4572742283344269, + "learning_rate": 0.0004145221238916017, + "loss": 0.1544, + "num_input_tokens_seen": 158702688, + "step": 73465 + }, + { + "epoch": 11.98531810766721, + "grad_norm": 0.42395129799842834, + "learning_rate": 0.0004144519927943361, + "loss": 0.1659, + "num_input_tokens_seen": 158713408, + "step": 73470 + }, + { + "epoch": 11.986133768352365, + "grad_norm": 0.004313977435231209, + "learning_rate": 0.0004143818634308094, + "loss": 0.0113, + "num_input_tokens_seen": 158722624, + "step": 73475 + }, + { + "epoch": 11.986949429037521, + "grad_norm": 0.01042697299271822, + "learning_rate": 0.00041431173580244284, + "loss": 0.0055, + "num_input_tokens_seen": 158732928, + "step": 73480 + }, + { + "epoch": 11.987765089722675, + "grad_norm": 0.3611927628517151, + "learning_rate": 0.0004142416099106576, + "loss": 0.053, + "num_input_tokens_seen": 158743360, + "step": 73485 + }, + { + "epoch": 11.98858075040783, + "grad_norm": 0.03010513260960579, + "learning_rate": 0.0004141714857568751, + "loss": 0.0072, + "num_input_tokens_seen": 158753728, + "step": 73490 + }, + { + "epoch": 11.989396411092985, + "grad_norm": 0.002898262580856681, + "learning_rate": 0.0004141013633425161, + "loss": 0.0093, + "num_input_tokens_seen": 158764768, + "step": 73495 + }, + { + "epoch": 11.99021207177814, + "grad_norm": 0.0036916485987603664, + "learning_rate": 0.0004140312426690022, + "loss": 0.0285, + "num_input_tokens_seen": 158775072, + "step": 73500 + }, + { + "epoch": 11.991027732463294, + "grad_norm": 0.06801813840866089, + "learning_rate": 0.000413961123737754, + "loss": 0.0264, + "num_input_tokens_seen": 158786528, + "step": 73505 + }, + { + "epoch": 11.99184339314845, + "grad_norm": 0.40527936816215515, + "learning_rate": 0.00041389100655019295, + "loss": 0.0218, + "num_input_tokens_seen": 158797664, + "step": 73510 + }, + { + "epoch": 11.992659053833606, + "grad_norm": 0.005984405521303415, + "learning_rate": 0.00041382089110773975, + "loss": 0.0065, + "num_input_tokens_seen": 158808992, + "step": 73515 + }, + { + "epoch": 11.99347471451876, + "grad_norm": 0.3398209810256958, + "learning_rate": 0.00041375077741181564, + "loss": 0.0239, + "num_input_tokens_seen": 158819520, + "step": 73520 + }, + { + "epoch": 11.994290375203915, + "grad_norm": 0.03455796837806702, + "learning_rate": 0.0004136806654638413, + "loss": 0.0152, + "num_input_tokens_seen": 158829600, + "step": 73525 + }, + { + "epoch": 11.99510603588907, + "grad_norm": 0.423898309469223, + "learning_rate": 0.0004136105552652377, + "loss": 0.1036, + "num_input_tokens_seen": 158841376, + "step": 73530 + }, + { + "epoch": 11.995921696574225, + "grad_norm": 0.08282370865345001, + "learning_rate": 0.0004135404468174261, + "loss": 0.0658, + "num_input_tokens_seen": 158851264, + "step": 73535 + }, + { + "epoch": 11.99673735725938, + "grad_norm": 0.01659543439745903, + "learning_rate": 0.0004134703401218268, + "loss": 0.0133, + "num_input_tokens_seen": 158861536, + "step": 73540 + }, + { + "epoch": 11.997553017944535, + "grad_norm": 0.0014833472669124603, + "learning_rate": 0.00041340023517986096, + "loss": 0.024, + "num_input_tokens_seen": 158872288, + "step": 73545 + }, + { + "epoch": 11.99836867862969, + "grad_norm": 0.014465752989053726, + "learning_rate": 0.00041333013199294907, + "loss": 0.0194, + "num_input_tokens_seen": 158881920, + "step": 73550 + }, + { + "epoch": 11.999184339314844, + "grad_norm": 0.032355792820453644, + "learning_rate": 0.0004132600305625122, + "loss": 0.0053, + "num_input_tokens_seen": 158892960, + "step": 73555 + }, + { + "epoch": 12.0, + "grad_norm": 0.007642882410436869, + "learning_rate": 0.0004131899308899706, + "loss": 0.0057, + "num_input_tokens_seen": 158902432, + "step": 73560 + }, + { + "epoch": 12.0, + "eval_loss": 0.1854449361562729, + "eval_runtime": 103.8442, + "eval_samples_per_second": 26.241, + "eval_steps_per_second": 6.568, + "num_input_tokens_seen": 158902432, + "step": 73560 + }, + { + "epoch": 12.000815660685156, + "grad_norm": 0.5162748694419861, + "learning_rate": 0.00041311983297674545, + "loss": 0.0175, + "num_input_tokens_seen": 158913760, + "step": 73565 + }, + { + "epoch": 12.00163132137031, + "grad_norm": 0.05222615599632263, + "learning_rate": 0.00041304973682425685, + "loss": 0.0077, + "num_input_tokens_seen": 158924672, + "step": 73570 + }, + { + "epoch": 12.002446982055465, + "grad_norm": 0.021363843232393265, + "learning_rate": 0.00041297964243392583, + "loss": 0.0061, + "num_input_tokens_seen": 158935840, + "step": 73575 + }, + { + "epoch": 12.00326264274062, + "grad_norm": 0.002701932331547141, + "learning_rate": 0.0004129095498071726, + "loss": 0.0069, + "num_input_tokens_seen": 158946688, + "step": 73580 + }, + { + "epoch": 12.004078303425775, + "grad_norm": 0.013134065084159374, + "learning_rate": 0.000412839458945418, + "loss": 0.0071, + "num_input_tokens_seen": 158956704, + "step": 73585 + }, + { + "epoch": 12.00489396411093, + "grad_norm": 0.018720904365181923, + "learning_rate": 0.0004127693698500821, + "loss": 0.0057, + "num_input_tokens_seen": 158967648, + "step": 73590 + }, + { + "epoch": 12.005709624796085, + "grad_norm": 0.014408317394554615, + "learning_rate": 0.0004126992825225858, + "loss": 0.0891, + "num_input_tokens_seen": 158978912, + "step": 73595 + }, + { + "epoch": 12.00652528548124, + "grad_norm": 0.46795928478240967, + "learning_rate": 0.00041262919696434915, + "loss": 0.1526, + "num_input_tokens_seen": 158990272, + "step": 73600 + }, + { + "epoch": 12.007340946166394, + "grad_norm": 0.0026603129226714373, + "learning_rate": 0.0004125591131767927, + "loss": 0.053, + "num_input_tokens_seen": 159001440, + "step": 73605 + }, + { + "epoch": 12.00815660685155, + "grad_norm": 0.0020938925445079803, + "learning_rate": 0.00041248903116133674, + "loss": 0.003, + "num_input_tokens_seen": 159012608, + "step": 73610 + }, + { + "epoch": 12.008972267536704, + "grad_norm": 0.0015925763873383403, + "learning_rate": 0.0004124189509194016, + "loss": 0.0042, + "num_input_tokens_seen": 159024256, + "step": 73615 + }, + { + "epoch": 12.00978792822186, + "grad_norm": 0.03945109248161316, + "learning_rate": 0.00041234887245240756, + "loss": 0.0057, + "num_input_tokens_seen": 159035424, + "step": 73620 + }, + { + "epoch": 12.010603588907015, + "grad_norm": 0.06973441690206528, + "learning_rate": 0.00041227879576177475, + "loss": 0.0043, + "num_input_tokens_seen": 159044448, + "step": 73625 + }, + { + "epoch": 12.01141924959217, + "grad_norm": 0.05123627558350563, + "learning_rate": 0.00041220872084892337, + "loss": 0.0322, + "num_input_tokens_seen": 159054912, + "step": 73630 + }, + { + "epoch": 12.012234910277325, + "grad_norm": 0.014868333004415035, + "learning_rate": 0.00041213864771527366, + "loss": 0.021, + "num_input_tokens_seen": 159066912, + "step": 73635 + }, + { + "epoch": 12.013050570962479, + "grad_norm": 0.29003819823265076, + "learning_rate": 0.0004120685763622458, + "loss": 0.0768, + "num_input_tokens_seen": 159076704, + "step": 73640 + }, + { + "epoch": 12.013866231647635, + "grad_norm": 0.002655792748555541, + "learning_rate": 0.00041199850679125974, + "loss": 0.1417, + "num_input_tokens_seen": 159087200, + "step": 73645 + }, + { + "epoch": 12.01468189233279, + "grad_norm": 0.019231455400586128, + "learning_rate": 0.0004119284390037356, + "loss": 0.0102, + "num_input_tokens_seen": 159099200, + "step": 73650 + }, + { + "epoch": 12.015497553017944, + "grad_norm": 0.0031092215795069933, + "learning_rate": 0.00041185837300109326, + "loss": 0.1035, + "num_input_tokens_seen": 159108800, + "step": 73655 + }, + { + "epoch": 12.0163132137031, + "grad_norm": 0.08691810816526413, + "learning_rate": 0.00041178830878475304, + "loss": 0.0125, + "num_input_tokens_seen": 159120032, + "step": 73660 + }, + { + "epoch": 12.017128874388254, + "grad_norm": 0.05167734995484352, + "learning_rate": 0.00041171824635613443, + "loss": 0.0057, + "num_input_tokens_seen": 159131584, + "step": 73665 + }, + { + "epoch": 12.01794453507341, + "grad_norm": 0.03976357355713844, + "learning_rate": 0.00041164818571665774, + "loss": 0.1218, + "num_input_tokens_seen": 159142752, + "step": 73670 + }, + { + "epoch": 12.018760195758565, + "grad_norm": 0.0090693524107337, + "learning_rate": 0.00041157812686774245, + "loss": 0.0507, + "num_input_tokens_seen": 159152384, + "step": 73675 + }, + { + "epoch": 12.01957585644372, + "grad_norm": 0.004124946426600218, + "learning_rate": 0.0004115080698108088, + "loss": 0.009, + "num_input_tokens_seen": 159163360, + "step": 73680 + }, + { + "epoch": 12.020391517128875, + "grad_norm": 0.003406350966542959, + "learning_rate": 0.0004114380145472761, + "loss": 0.0528, + "num_input_tokens_seen": 159174176, + "step": 73685 + }, + { + "epoch": 12.021207177814029, + "grad_norm": 0.01586691476404667, + "learning_rate": 0.00041136796107856465, + "loss": 0.015, + "num_input_tokens_seen": 159185088, + "step": 73690 + }, + { + "epoch": 12.022022838499185, + "grad_norm": 0.005119045730680227, + "learning_rate": 0.00041129790940609375, + "loss": 0.0294, + "num_input_tokens_seen": 159196448, + "step": 73695 + }, + { + "epoch": 12.022838499184338, + "grad_norm": 0.0177877489477396, + "learning_rate": 0.0004112278595312834, + "loss": 0.0978, + "num_input_tokens_seen": 159207872, + "step": 73700 + }, + { + "epoch": 12.023654159869494, + "grad_norm": 0.003837467636913061, + "learning_rate": 0.00041115781145555286, + "loss": 0.0158, + "num_input_tokens_seen": 159218304, + "step": 73705 + }, + { + "epoch": 12.02446982055465, + "grad_norm": 0.006132758688181639, + "learning_rate": 0.0004110877651803222, + "loss": 0.0286, + "num_input_tokens_seen": 159229152, + "step": 73710 + }, + { + "epoch": 12.025285481239804, + "grad_norm": 0.0033713181037455797, + "learning_rate": 0.0004110177207070106, + "loss": 0.0034, + "num_input_tokens_seen": 159239808, + "step": 73715 + }, + { + "epoch": 12.02610114192496, + "grad_norm": 0.018628062680363655, + "learning_rate": 0.0004109476780370379, + "loss": 0.0183, + "num_input_tokens_seen": 159250400, + "step": 73720 + }, + { + "epoch": 12.026916802610113, + "grad_norm": 0.018260814249515533, + "learning_rate": 0.00041087763717182336, + "loss": 0.0364, + "num_input_tokens_seen": 159261024, + "step": 73725 + }, + { + "epoch": 12.02773246329527, + "grad_norm": 0.009373247623443604, + "learning_rate": 0.00041080759811278674, + "loss": 0.0082, + "num_input_tokens_seen": 159271264, + "step": 73730 + }, + { + "epoch": 12.028548123980425, + "grad_norm": 0.025325864553451538, + "learning_rate": 0.00041073756086134705, + "loss": 0.0073, + "num_input_tokens_seen": 159282400, + "step": 73735 + }, + { + "epoch": 12.029363784665579, + "grad_norm": 0.004139396827667952, + "learning_rate": 0.00041066752541892395, + "loss": 0.0093, + "num_input_tokens_seen": 159293248, + "step": 73740 + }, + { + "epoch": 12.030179445350734, + "grad_norm": 0.331638365983963, + "learning_rate": 0.000410597491786937, + "loss": 0.0759, + "num_input_tokens_seen": 159303040, + "step": 73745 + }, + { + "epoch": 12.030995106035888, + "grad_norm": 0.02152330055832863, + "learning_rate": 0.0004105274599668051, + "loss": 0.0321, + "num_input_tokens_seen": 159314432, + "step": 73750 + }, + { + "epoch": 12.031810766721044, + "grad_norm": 0.002201348775997758, + "learning_rate": 0.00041045742995994783, + "loss": 0.0067, + "num_input_tokens_seen": 159325696, + "step": 73755 + }, + { + "epoch": 12.0326264274062, + "grad_norm": 0.003991606179624796, + "learning_rate": 0.0004103874017677842, + "loss": 0.0143, + "num_input_tokens_seen": 159337664, + "step": 73760 + }, + { + "epoch": 12.033442088091354, + "grad_norm": 0.05896512418985367, + "learning_rate": 0.0004103173753917337, + "loss": 0.0149, + "num_input_tokens_seen": 159349088, + "step": 73765 + }, + { + "epoch": 12.03425774877651, + "grad_norm": 0.0014206412015482783, + "learning_rate": 0.0004102473508332153, + "loss": 0.0039, + "num_input_tokens_seen": 159361056, + "step": 73770 + }, + { + "epoch": 12.035073409461663, + "grad_norm": 0.024218376725912094, + "learning_rate": 0.00041017732809364824, + "loss": 0.0099, + "num_input_tokens_seen": 159371104, + "step": 73775 + }, + { + "epoch": 12.035889070146819, + "grad_norm": 0.6135803461074829, + "learning_rate": 0.00041010730717445156, + "loss": 0.0184, + "num_input_tokens_seen": 159380960, + "step": 73780 + }, + { + "epoch": 12.036704730831975, + "grad_norm": 0.014351412653923035, + "learning_rate": 0.00041003728807704435, + "loss": 0.0046, + "num_input_tokens_seen": 159393280, + "step": 73785 + }, + { + "epoch": 12.037520391517129, + "grad_norm": 0.0027120737358927727, + "learning_rate": 0.00040996727080284555, + "loss": 0.0208, + "num_input_tokens_seen": 159404960, + "step": 73790 + }, + { + "epoch": 12.038336052202284, + "grad_norm": 0.020590249449014664, + "learning_rate": 0.0004098972553532743, + "loss": 0.0149, + "num_input_tokens_seen": 159416544, + "step": 73795 + }, + { + "epoch": 12.039151712887438, + "grad_norm": 0.006616574712097645, + "learning_rate": 0.00040982724172974926, + "loss": 0.0131, + "num_input_tokens_seen": 159427072, + "step": 73800 + }, + { + "epoch": 12.039967373572594, + "grad_norm": 0.00522937485948205, + "learning_rate": 0.0004097572299336899, + "loss": 0.0061, + "num_input_tokens_seen": 159438144, + "step": 73805 + }, + { + "epoch": 12.040783034257748, + "grad_norm": 0.005134327802807093, + "learning_rate": 0.00040968721996651445, + "loss": 0.0015, + "num_input_tokens_seen": 159447392, + "step": 73810 + }, + { + "epoch": 12.041598694942904, + "grad_norm": 0.016191143542528152, + "learning_rate": 0.00040961721182964235, + "loss": 0.0247, + "num_input_tokens_seen": 159458144, + "step": 73815 + }, + { + "epoch": 12.04241435562806, + "grad_norm": 0.007574434857815504, + "learning_rate": 0.00040954720552449186, + "loss": 0.0024, + "num_input_tokens_seen": 159469472, + "step": 73820 + }, + { + "epoch": 12.043230016313213, + "grad_norm": 0.010849296115338802, + "learning_rate": 0.0004094772010524822, + "loss": 0.0039, + "num_input_tokens_seen": 159480096, + "step": 73825 + }, + { + "epoch": 12.044045676998369, + "grad_norm": 0.10277484357357025, + "learning_rate": 0.0004094071984150317, + "loss": 0.1408, + "num_input_tokens_seen": 159492000, + "step": 73830 + }, + { + "epoch": 12.044861337683523, + "grad_norm": 0.0031220330856740475, + "learning_rate": 0.0004093371976135595, + "loss": 0.0018, + "num_input_tokens_seen": 159503392, + "step": 73835 + }, + { + "epoch": 12.045676998368679, + "grad_norm": 0.03206094354391098, + "learning_rate": 0.0004092671986494837, + "loss": 0.1296, + "num_input_tokens_seen": 159514944, + "step": 73840 + }, + { + "epoch": 12.046492659053834, + "grad_norm": 0.043139755725860596, + "learning_rate": 0.00040919720152422323, + "loss": 0.0204, + "num_input_tokens_seen": 159526368, + "step": 73845 + }, + { + "epoch": 12.047308319738988, + "grad_norm": 0.0034898552112281322, + "learning_rate": 0.00040912720623919696, + "loss": 0.0064, + "num_input_tokens_seen": 159538336, + "step": 73850 + }, + { + "epoch": 12.048123980424144, + "grad_norm": 0.001161689287982881, + "learning_rate": 0.00040905721279582284, + "loss": 0.0106, + "num_input_tokens_seen": 159548096, + "step": 73855 + }, + { + "epoch": 12.048939641109298, + "grad_norm": 0.0020751608535647392, + "learning_rate": 0.00040898722119551994, + "loss": 0.0026, + "num_input_tokens_seen": 159558592, + "step": 73860 + }, + { + "epoch": 12.049755301794454, + "grad_norm": 0.002460476942360401, + "learning_rate": 0.0004089172314397063, + "loss": 0.0038, + "num_input_tokens_seen": 159569088, + "step": 73865 + }, + { + "epoch": 12.05057096247961, + "grad_norm": 0.012570970691740513, + "learning_rate": 0.00040884724352980065, + "loss": 0.0024, + "num_input_tokens_seen": 159579232, + "step": 73870 + }, + { + "epoch": 12.051386623164763, + "grad_norm": 0.023806337267160416, + "learning_rate": 0.00040877725746722097, + "loss": 0.0333, + "num_input_tokens_seen": 159590688, + "step": 73875 + }, + { + "epoch": 12.052202283849919, + "grad_norm": 0.004042148124426603, + "learning_rate": 0.0004087072732533862, + "loss": 0.1251, + "num_input_tokens_seen": 159602016, + "step": 73880 + }, + { + "epoch": 12.053017944535073, + "grad_norm": 0.001157692400738597, + "learning_rate": 0.0004086372908897141, + "loss": 0.0095, + "num_input_tokens_seen": 159611392, + "step": 73885 + }, + { + "epoch": 12.053833605220229, + "grad_norm": 0.03635965660214424, + "learning_rate": 0.0004085673103776234, + "loss": 0.0257, + "num_input_tokens_seen": 159623008, + "step": 73890 + }, + { + "epoch": 12.054649265905383, + "grad_norm": 0.0029063147958368063, + "learning_rate": 0.000408497331718532, + "loss": 0.0145, + "num_input_tokens_seen": 159633120, + "step": 73895 + }, + { + "epoch": 12.055464926590538, + "grad_norm": 0.027852777391672134, + "learning_rate": 0.0004084273549138584, + "loss": 0.0359, + "num_input_tokens_seen": 159642944, + "step": 73900 + }, + { + "epoch": 12.056280587275694, + "grad_norm": 0.6098665595054626, + "learning_rate": 0.0004083573799650204, + "loss": 0.1428, + "num_input_tokens_seen": 159652064, + "step": 73905 + }, + { + "epoch": 12.057096247960848, + "grad_norm": 0.002536676125600934, + "learning_rate": 0.00040828740687343654, + "loss": 0.002, + "num_input_tokens_seen": 159661824, + "step": 73910 + }, + { + "epoch": 12.057911908646004, + "grad_norm": 0.1208946481347084, + "learning_rate": 0.0004082174356405247, + "loss": 0.0692, + "num_input_tokens_seen": 159673024, + "step": 73915 + }, + { + "epoch": 12.058727569331158, + "grad_norm": 0.002335605677217245, + "learning_rate": 0.00040814746626770287, + "loss": 0.0039, + "num_input_tokens_seen": 159684736, + "step": 73920 + }, + { + "epoch": 12.059543230016313, + "grad_norm": 0.175007626414299, + "learning_rate": 0.0004080774987563893, + "loss": 0.0077, + "num_input_tokens_seen": 159696128, + "step": 73925 + }, + { + "epoch": 12.060358890701469, + "grad_norm": 0.019628094509243965, + "learning_rate": 0.0004080075331080017, + "loss": 0.0152, + "num_input_tokens_seen": 159707712, + "step": 73930 + }, + { + "epoch": 12.061174551386623, + "grad_norm": 0.7227426767349243, + "learning_rate": 0.0004079375693239581, + "loss": 0.2223, + "num_input_tokens_seen": 159717440, + "step": 73935 + }, + { + "epoch": 12.061990212071779, + "grad_norm": 0.2504555583000183, + "learning_rate": 0.0004078676074056766, + "loss": 0.021, + "num_input_tokens_seen": 159729312, + "step": 73940 + }, + { + "epoch": 12.062805872756933, + "grad_norm": 0.15913152694702148, + "learning_rate": 0.0004077976473545748, + "loss": 0.0104, + "num_input_tokens_seen": 159739680, + "step": 73945 + }, + { + "epoch": 12.063621533442088, + "grad_norm": 0.004883287940174341, + "learning_rate": 0.0004077276891720707, + "loss": 0.0138, + "num_input_tokens_seen": 159750784, + "step": 73950 + }, + { + "epoch": 12.064437194127244, + "grad_norm": 0.004812562372535467, + "learning_rate": 0.000407657732859582, + "loss": 0.0031, + "num_input_tokens_seen": 159760544, + "step": 73955 + }, + { + "epoch": 12.065252854812398, + "grad_norm": 0.2379300892353058, + "learning_rate": 0.00040758777841852647, + "loss": 0.1435, + "num_input_tokens_seen": 159772416, + "step": 73960 + }, + { + "epoch": 12.066068515497554, + "grad_norm": 0.220754474401474, + "learning_rate": 0.000407517825850322, + "loss": 0.0081, + "num_input_tokens_seen": 159782560, + "step": 73965 + }, + { + "epoch": 12.066884176182707, + "grad_norm": 0.004285324830561876, + "learning_rate": 0.00040744787515638585, + "loss": 0.009, + "num_input_tokens_seen": 159792480, + "step": 73970 + }, + { + "epoch": 12.067699836867863, + "grad_norm": 0.0040510534308850765, + "learning_rate": 0.00040737792633813624, + "loss": 0.0043, + "num_input_tokens_seen": 159803456, + "step": 73975 + }, + { + "epoch": 12.068515497553017, + "grad_norm": 0.0067472876980900764, + "learning_rate": 0.00040730797939699014, + "loss": 0.0909, + "num_input_tokens_seen": 159813984, + "step": 73980 + }, + { + "epoch": 12.069331158238173, + "grad_norm": 0.006962975487112999, + "learning_rate": 0.00040723803433436573, + "loss": 0.0056, + "num_input_tokens_seen": 159824608, + "step": 73985 + }, + { + "epoch": 12.070146818923329, + "grad_norm": 0.10496911406517029, + "learning_rate": 0.00040716809115167997, + "loss": 0.0192, + "num_input_tokens_seen": 159834720, + "step": 73990 + }, + { + "epoch": 12.070962479608482, + "grad_norm": 0.022543715313076973, + "learning_rate": 0.0004070981498503508, + "loss": 0.0316, + "num_input_tokens_seen": 159846080, + "step": 73995 + }, + { + "epoch": 12.071778140293638, + "grad_norm": 0.0111811188980937, + "learning_rate": 0.0004070282104317953, + "loss": 0.0073, + "num_input_tokens_seen": 159857792, + "step": 74000 + }, + { + "epoch": 12.072593800978792, + "grad_norm": 0.023856064304709435, + "learning_rate": 0.0004069582728974313, + "loss": 0.0073, + "num_input_tokens_seen": 159869056, + "step": 74005 + }, + { + "epoch": 12.073409461663948, + "grad_norm": 0.0023362748324871063, + "learning_rate": 0.00040688833724867565, + "loss": 0.0029, + "num_input_tokens_seen": 159880192, + "step": 74010 + }, + { + "epoch": 12.074225122349104, + "grad_norm": 0.0023756767623126507, + "learning_rate": 0.0004068184034869462, + "loss": 0.0058, + "num_input_tokens_seen": 159892288, + "step": 74015 + }, + { + "epoch": 12.075040783034257, + "grad_norm": 0.008905721828341484, + "learning_rate": 0.0004067484716136598, + "loss": 0.0026, + "num_input_tokens_seen": 159903200, + "step": 74020 + }, + { + "epoch": 12.075856443719413, + "grad_norm": 0.0030674946028739214, + "learning_rate": 0.00040667854163023415, + "loss": 0.0576, + "num_input_tokens_seen": 159913952, + "step": 74025 + }, + { + "epoch": 12.076672104404567, + "grad_norm": 0.0404362678527832, + "learning_rate": 0.000406608613538086, + "loss": 0.0075, + "num_input_tokens_seen": 159925248, + "step": 74030 + }, + { + "epoch": 12.077487765089723, + "grad_norm": 0.013325365260243416, + "learning_rate": 0.000406538687338633, + "loss": 0.07, + "num_input_tokens_seen": 159936096, + "step": 74035 + }, + { + "epoch": 12.078303425774878, + "grad_norm": 0.026394739747047424, + "learning_rate": 0.0004064687630332919, + "loss": 0.005, + "num_input_tokens_seen": 159946592, + "step": 74040 + }, + { + "epoch": 12.079119086460032, + "grad_norm": 0.16963490843772888, + "learning_rate": 0.0004063988406234801, + "loss": 0.0147, + "num_input_tokens_seen": 159957248, + "step": 74045 + }, + { + "epoch": 12.079934747145188, + "grad_norm": 0.1959126889705658, + "learning_rate": 0.0004063289201106144, + "loss": 0.0144, + "num_input_tokens_seen": 159968384, + "step": 74050 + }, + { + "epoch": 12.080750407830342, + "grad_norm": 0.006187156308442354, + "learning_rate": 0.000406259001496112, + "loss": 0.015, + "num_input_tokens_seen": 159977536, + "step": 74055 + }, + { + "epoch": 12.081566068515498, + "grad_norm": 0.008363723754882812, + "learning_rate": 0.00040618908478138986, + "loss": 0.0087, + "num_input_tokens_seen": 159987904, + "step": 74060 + }, + { + "epoch": 12.082381729200652, + "grad_norm": 0.025374621152877808, + "learning_rate": 0.0004061191699678649, + "loss": 0.0057, + "num_input_tokens_seen": 159998080, + "step": 74065 + }, + { + "epoch": 12.083197389885807, + "grad_norm": 0.001546714105643332, + "learning_rate": 0.0004060492570569542, + "loss": 0.0067, + "num_input_tokens_seen": 160009984, + "step": 74070 + }, + { + "epoch": 12.084013050570963, + "grad_norm": 0.014130688272416592, + "learning_rate": 0.0004059793460500742, + "loss": 0.0185, + "num_input_tokens_seen": 160021728, + "step": 74075 + }, + { + "epoch": 12.084828711256117, + "grad_norm": 0.3345913589000702, + "learning_rate": 0.0004059094369486423, + "loss": 0.0118, + "num_input_tokens_seen": 160031264, + "step": 74080 + }, + { + "epoch": 12.085644371941273, + "grad_norm": 0.0023609360214322805, + "learning_rate": 0.00040583952975407493, + "loss": 0.0036, + "num_input_tokens_seen": 160042560, + "step": 74085 + }, + { + "epoch": 12.086460032626427, + "grad_norm": 0.0316849909722805, + "learning_rate": 0.000405769624467789, + "loss": 0.0125, + "num_input_tokens_seen": 160053856, + "step": 74090 + }, + { + "epoch": 12.087275693311582, + "grad_norm": 0.0007231601630337536, + "learning_rate": 0.0004056997210912011, + "loss": 0.0156, + "num_input_tokens_seen": 160064800, + "step": 74095 + }, + { + "epoch": 12.088091353996738, + "grad_norm": 0.014824706129729748, + "learning_rate": 0.00040562981962572803, + "loss": 0.1165, + "num_input_tokens_seen": 160076224, + "step": 74100 + }, + { + "epoch": 12.088907014681892, + "grad_norm": 0.0029631692450493574, + "learning_rate": 0.00040555992007278624, + "loss": 0.0088, + "num_input_tokens_seen": 160087328, + "step": 74105 + }, + { + "epoch": 12.089722675367048, + "grad_norm": 0.01084907166659832, + "learning_rate": 0.00040549002243379267, + "loss": 0.0584, + "num_input_tokens_seen": 160097184, + "step": 74110 + }, + { + "epoch": 12.090538336052202, + "grad_norm": 0.017258066684007645, + "learning_rate": 0.00040542012671016355, + "loss": 0.0036, + "num_input_tokens_seen": 160107392, + "step": 74115 + }, + { + "epoch": 12.091353996737357, + "grad_norm": 0.022070029750466347, + "learning_rate": 0.00040535023290331573, + "loss": 0.0028, + "num_input_tokens_seen": 160118176, + "step": 74120 + }, + { + "epoch": 12.092169657422513, + "grad_norm": 0.013220726512372494, + "learning_rate": 0.0004052803410146653, + "loss": 0.0159, + "num_input_tokens_seen": 160129792, + "step": 74125 + }, + { + "epoch": 12.092985318107667, + "grad_norm": 0.018424084410071373, + "learning_rate": 0.0004052104510456291, + "loss": 0.0102, + "num_input_tokens_seen": 160139712, + "step": 74130 + }, + { + "epoch": 12.093800978792823, + "grad_norm": 0.0037279201205819845, + "learning_rate": 0.00040514056299762314, + "loss": 0.1487, + "num_input_tokens_seen": 160150880, + "step": 74135 + }, + { + "epoch": 12.094616639477977, + "grad_norm": 0.00917022954672575, + "learning_rate": 0.0004050706768720642, + "loss": 0.1348, + "num_input_tokens_seen": 160162368, + "step": 74140 + }, + { + "epoch": 12.095432300163132, + "grad_norm": 0.07259730249643326, + "learning_rate": 0.00040500079267036834, + "loss": 0.0033, + "num_input_tokens_seen": 160172864, + "step": 74145 + }, + { + "epoch": 12.096247960848286, + "grad_norm": 0.004743486177176237, + "learning_rate": 0.000404930910393952, + "loss": 0.0112, + "num_input_tokens_seen": 160183040, + "step": 74150 + }, + { + "epoch": 12.097063621533442, + "grad_norm": 0.0031580179929733276, + "learning_rate": 0.0004048610300442313, + "loss": 0.0051, + "num_input_tokens_seen": 160192416, + "step": 74155 + }, + { + "epoch": 12.097879282218598, + "grad_norm": 0.4201613664627075, + "learning_rate": 0.0004047911516226226, + "loss": 0.0184, + "num_input_tokens_seen": 160202496, + "step": 74160 + }, + { + "epoch": 12.098694942903752, + "grad_norm": 0.010104143060743809, + "learning_rate": 0.0004047212751305418, + "loss": 0.0021, + "num_input_tokens_seen": 160211904, + "step": 74165 + }, + { + "epoch": 12.099510603588907, + "grad_norm": 0.021120961755514145, + "learning_rate": 0.00040465140056940524, + "loss": 0.004, + "num_input_tokens_seen": 160221632, + "step": 74170 + }, + { + "epoch": 12.100326264274061, + "grad_norm": 0.04819793999195099, + "learning_rate": 0.00040458152794062925, + "loss": 0.0081, + "num_input_tokens_seen": 160232864, + "step": 74175 + }, + { + "epoch": 12.101141924959217, + "grad_norm": 0.01660446636378765, + "learning_rate": 0.00040451165724562937, + "loss": 0.0046, + "num_input_tokens_seen": 160244512, + "step": 74180 + }, + { + "epoch": 12.101957585644373, + "grad_norm": 0.0020989153999835253, + "learning_rate": 0.0004044417884858221, + "loss": 0.3075, + "num_input_tokens_seen": 160255872, + "step": 74185 + }, + { + "epoch": 12.102773246329527, + "grad_norm": 0.0209684856235981, + "learning_rate": 0.0004043719216626231, + "loss": 0.0249, + "num_input_tokens_seen": 160266432, + "step": 74190 + }, + { + "epoch": 12.103588907014682, + "grad_norm": 1.1977218389511108, + "learning_rate": 0.00040430205677744857, + "loss": 0.0415, + "num_input_tokens_seen": 160276928, + "step": 74195 + }, + { + "epoch": 12.104404567699836, + "grad_norm": 0.004599343985319138, + "learning_rate": 0.00040423219383171405, + "loss": 0.0048, + "num_input_tokens_seen": 160287072, + "step": 74200 + }, + { + "epoch": 12.105220228384992, + "grad_norm": 0.01282755471765995, + "learning_rate": 0.0004041623328268358, + "loss": 0.0224, + "num_input_tokens_seen": 160298240, + "step": 74205 + }, + { + "epoch": 12.106035889070148, + "grad_norm": 0.06137290969491005, + "learning_rate": 0.0004040924737642293, + "loss": 0.0072, + "num_input_tokens_seen": 160310016, + "step": 74210 + }, + { + "epoch": 12.106851549755302, + "grad_norm": 0.004412582144141197, + "learning_rate": 0.0004040226166453107, + "loss": 0.0158, + "num_input_tokens_seen": 160320928, + "step": 74215 + }, + { + "epoch": 12.107667210440457, + "grad_norm": 0.022301241755485535, + "learning_rate": 0.00040395276147149524, + "loss": 0.0096, + "num_input_tokens_seen": 160331360, + "step": 74220 + }, + { + "epoch": 12.108482871125611, + "grad_norm": 0.002443633507937193, + "learning_rate": 0.000403882908244199, + "loss": 0.0052, + "num_input_tokens_seen": 160342496, + "step": 74225 + }, + { + "epoch": 12.109298531810767, + "grad_norm": 0.028277039527893066, + "learning_rate": 0.00040381305696483773, + "loss": 0.0746, + "num_input_tokens_seen": 160352960, + "step": 74230 + }, + { + "epoch": 12.11011419249592, + "grad_norm": 0.006705199368298054, + "learning_rate": 0.00040374320763482673, + "loss": 0.0211, + "num_input_tokens_seen": 160363936, + "step": 74235 + }, + { + "epoch": 12.110929853181077, + "grad_norm": 0.24397537112236023, + "learning_rate": 0.0004036733602555818, + "loss": 0.0758, + "num_input_tokens_seen": 160375520, + "step": 74240 + }, + { + "epoch": 12.111745513866232, + "grad_norm": 0.0006988913519307971, + "learning_rate": 0.0004036035148285184, + "loss": 0.0076, + "num_input_tokens_seen": 160387072, + "step": 74245 + }, + { + "epoch": 12.112561174551386, + "grad_norm": 0.0005958918482065201, + "learning_rate": 0.00040353367135505193, + "loss": 0.0036, + "num_input_tokens_seen": 160397824, + "step": 74250 + }, + { + "epoch": 12.113376835236542, + "grad_norm": 0.012600153684616089, + "learning_rate": 0.00040346382983659826, + "loss": 0.0377, + "num_input_tokens_seen": 160409056, + "step": 74255 + }, + { + "epoch": 12.114192495921696, + "grad_norm": 0.03363886475563049, + "learning_rate": 0.0004033939902745723, + "loss": 0.0207, + "num_input_tokens_seen": 160417984, + "step": 74260 + }, + { + "epoch": 12.115008156606851, + "grad_norm": 0.027498042210936546, + "learning_rate": 0.0004033241526703899, + "loss": 0.0037, + "num_input_tokens_seen": 160428864, + "step": 74265 + }, + { + "epoch": 12.115823817292007, + "grad_norm": 0.013976640067994595, + "learning_rate": 0.00040325431702546596, + "loss": 0.0187, + "num_input_tokens_seen": 160439296, + "step": 74270 + }, + { + "epoch": 12.116639477977161, + "grad_norm": 0.09229818731546402, + "learning_rate": 0.000403184483341216, + "loss": 0.0075, + "num_input_tokens_seen": 160450336, + "step": 74275 + }, + { + "epoch": 12.117455138662317, + "grad_norm": 0.004774386063218117, + "learning_rate": 0.0004031146516190556, + "loss": 0.0143, + "num_input_tokens_seen": 160461152, + "step": 74280 + }, + { + "epoch": 12.11827079934747, + "grad_norm": 0.0031991363503038883, + "learning_rate": 0.00040304482186039937, + "loss": 0.0296, + "num_input_tokens_seen": 160472384, + "step": 74285 + }, + { + "epoch": 12.119086460032626, + "grad_norm": 0.08653301745653152, + "learning_rate": 0.0004029749940666631, + "loss": 0.008, + "num_input_tokens_seen": 160482080, + "step": 74290 + }, + { + "epoch": 12.119902120717782, + "grad_norm": 0.035763002932071686, + "learning_rate": 0.00040290516823926145, + "loss": 0.0205, + "num_input_tokens_seen": 160492864, + "step": 74295 + }, + { + "epoch": 12.120717781402936, + "grad_norm": 0.1830175220966339, + "learning_rate": 0.0004028353443796099, + "loss": 0.0184, + "num_input_tokens_seen": 160504064, + "step": 74300 + }, + { + "epoch": 12.121533442088092, + "grad_norm": 0.0060828630812466145, + "learning_rate": 0.00040276552248912317, + "loss": 0.0035, + "num_input_tokens_seen": 160513888, + "step": 74305 + }, + { + "epoch": 12.122349102773246, + "grad_norm": 0.013971041887998581, + "learning_rate": 0.00040269570256921673, + "loss": 0.0127, + "num_input_tokens_seen": 160524512, + "step": 74310 + }, + { + "epoch": 12.123164763458401, + "grad_norm": 0.5153065323829651, + "learning_rate": 0.00040262588462130507, + "loss": 0.1341, + "num_input_tokens_seen": 160535712, + "step": 74315 + }, + { + "epoch": 12.123980424143557, + "grad_norm": 0.01689624786376953, + "learning_rate": 0.0004025560686468036, + "loss": 0.0043, + "num_input_tokens_seen": 160546432, + "step": 74320 + }, + { + "epoch": 12.124796084828711, + "grad_norm": 0.058722566813230515, + "learning_rate": 0.0004024862546471268, + "loss": 0.0054, + "num_input_tokens_seen": 160556704, + "step": 74325 + }, + { + "epoch": 12.125611745513867, + "grad_norm": 0.006426146719604731, + "learning_rate": 0.00040241644262368993, + "loss": 0.0029, + "num_input_tokens_seen": 160568064, + "step": 74330 + }, + { + "epoch": 12.12642740619902, + "grad_norm": 0.007449703756719828, + "learning_rate": 0.00040234663257790747, + "loss": 0.0075, + "num_input_tokens_seen": 160578496, + "step": 74335 + }, + { + "epoch": 12.127243066884176, + "grad_norm": 0.03456910327076912, + "learning_rate": 0.00040227682451119464, + "loss": 0.1314, + "num_input_tokens_seen": 160588352, + "step": 74340 + }, + { + "epoch": 12.12805872756933, + "grad_norm": 0.006443498190492392, + "learning_rate": 0.0004022070184249657, + "loss": 0.0055, + "num_input_tokens_seen": 160599232, + "step": 74345 + }, + { + "epoch": 12.128874388254486, + "grad_norm": 0.003807036206126213, + "learning_rate": 0.0004021372143206358, + "loss": 0.0759, + "num_input_tokens_seen": 160610400, + "step": 74350 + }, + { + "epoch": 12.129690048939642, + "grad_norm": 0.3225690722465515, + "learning_rate": 0.0004020674121996191, + "loss": 0.0325, + "num_input_tokens_seen": 160622368, + "step": 74355 + }, + { + "epoch": 12.130505709624796, + "grad_norm": 0.06799621880054474, + "learning_rate": 0.0004019976120633308, + "loss": 0.139, + "num_input_tokens_seen": 160633344, + "step": 74360 + }, + { + "epoch": 12.131321370309951, + "grad_norm": 0.009520080871880054, + "learning_rate": 0.000401927813913185, + "loss": 0.0027, + "num_input_tokens_seen": 160643552, + "step": 74365 + }, + { + "epoch": 12.132137030995105, + "grad_norm": 0.016409458592534065, + "learning_rate": 0.0004018580177505966, + "loss": 0.0079, + "num_input_tokens_seen": 160653280, + "step": 74370 + }, + { + "epoch": 12.132952691680261, + "grad_norm": 0.0015376622322946787, + "learning_rate": 0.00040178822357698, + "loss": 0.0117, + "num_input_tokens_seen": 160664224, + "step": 74375 + }, + { + "epoch": 12.133768352365417, + "grad_norm": 0.0044131772592663765, + "learning_rate": 0.0004017184313937494, + "loss": 0.0278, + "num_input_tokens_seen": 160676128, + "step": 74380 + }, + { + "epoch": 12.13458401305057, + "grad_norm": 0.0254372451454401, + "learning_rate": 0.0004016486412023198, + "loss": 0.0107, + "num_input_tokens_seen": 160686400, + "step": 74385 + }, + { + "epoch": 12.135399673735726, + "grad_norm": 0.02156521938741207, + "learning_rate": 0.000401578853004105, + "loss": 0.0034, + "num_input_tokens_seen": 160696832, + "step": 74390 + }, + { + "epoch": 12.13621533442088, + "grad_norm": 0.008907387033104897, + "learning_rate": 0.00040150906680051974, + "loss": 0.0031, + "num_input_tokens_seen": 160707712, + "step": 74395 + }, + { + "epoch": 12.137030995106036, + "grad_norm": 0.12198542058467865, + "learning_rate": 0.00040143928259297817, + "loss": 0.0279, + "num_input_tokens_seen": 160719104, + "step": 74400 + }, + { + "epoch": 12.137846655791192, + "grad_norm": 0.0023205087054520845, + "learning_rate": 0.00040136950038289457, + "loss": 0.0047, + "num_input_tokens_seen": 160729920, + "step": 74405 + }, + { + "epoch": 12.138662316476346, + "grad_norm": 0.008131838403642178, + "learning_rate": 0.0004012997201716831, + "loss": 0.0156, + "num_input_tokens_seen": 160740384, + "step": 74410 + }, + { + "epoch": 12.139477977161501, + "grad_norm": 0.07084905356168747, + "learning_rate": 0.0004012299419607581, + "loss": 0.0119, + "num_input_tokens_seen": 160751360, + "step": 74415 + }, + { + "epoch": 12.140293637846655, + "grad_norm": 0.008088597096502781, + "learning_rate": 0.00040116016575153344, + "loss": 0.0051, + "num_input_tokens_seen": 160763488, + "step": 74420 + }, + { + "epoch": 12.141109298531811, + "grad_norm": 0.012776483781635761, + "learning_rate": 0.0004010903915454237, + "loss": 0.0038, + "num_input_tokens_seen": 160775072, + "step": 74425 + }, + { + "epoch": 12.141924959216965, + "grad_norm": 0.33567920327186584, + "learning_rate": 0.0004010206193438424, + "loss": 0.1305, + "num_input_tokens_seen": 160785984, + "step": 74430 + }, + { + "epoch": 12.14274061990212, + "grad_norm": 0.005716219078749418, + "learning_rate": 0.0004009508491482041, + "loss": 0.0154, + "num_input_tokens_seen": 160796576, + "step": 74435 + }, + { + "epoch": 12.143556280587276, + "grad_norm": 0.005408111959695816, + "learning_rate": 0.00040088108095992216, + "loss": 0.0671, + "num_input_tokens_seen": 160806752, + "step": 74440 + }, + { + "epoch": 12.14437194127243, + "grad_norm": 0.019407780840992928, + "learning_rate": 0.00040081131478041115, + "loss": 0.062, + "num_input_tokens_seen": 160817408, + "step": 74445 + }, + { + "epoch": 12.145187601957586, + "grad_norm": 0.004047623835504055, + "learning_rate": 0.00040074155061108443, + "loss": 0.0075, + "num_input_tokens_seen": 160828160, + "step": 74450 + }, + { + "epoch": 12.14600326264274, + "grad_norm": 0.005578754004091024, + "learning_rate": 0.00040067178845335633, + "loss": 0.0032, + "num_input_tokens_seen": 160839072, + "step": 74455 + }, + { + "epoch": 12.146818923327896, + "grad_norm": 0.005533120129257441, + "learning_rate": 0.0004006020283086402, + "loss": 0.0069, + "num_input_tokens_seen": 160851200, + "step": 74460 + }, + { + "epoch": 12.147634584013051, + "grad_norm": 1.1328115463256836, + "learning_rate": 0.00040053227017835033, + "loss": 0.0612, + "num_input_tokens_seen": 160862976, + "step": 74465 + }, + { + "epoch": 12.148450244698205, + "grad_norm": 0.1987270712852478, + "learning_rate": 0.00040046251406389993, + "loss": 0.1336, + "num_input_tokens_seen": 160874752, + "step": 74470 + }, + { + "epoch": 12.149265905383361, + "grad_norm": 0.03130248934030533, + "learning_rate": 0.0004003927599667032, + "loss": 0.0089, + "num_input_tokens_seen": 160885312, + "step": 74475 + }, + { + "epoch": 12.150081566068515, + "grad_norm": 0.02075079269707203, + "learning_rate": 0.0004003230078881733, + "loss": 0.018, + "num_input_tokens_seen": 160896704, + "step": 74480 + }, + { + "epoch": 12.15089722675367, + "grad_norm": 0.004763288889080286, + "learning_rate": 0.0004002532578297241, + "loss": 0.0017, + "num_input_tokens_seen": 160907680, + "step": 74485 + }, + { + "epoch": 12.151712887438826, + "grad_norm": 0.012047209776937962, + "learning_rate": 0.0004001835097927694, + "loss": 0.0045, + "num_input_tokens_seen": 160916992, + "step": 74490 + }, + { + "epoch": 12.15252854812398, + "grad_norm": 0.024075627326965332, + "learning_rate": 0.00040011376377872235, + "loss": 0.0081, + "num_input_tokens_seen": 160926816, + "step": 74495 + }, + { + "epoch": 12.153344208809136, + "grad_norm": 0.005183520261198282, + "learning_rate": 0.0004000440197889967, + "loss": 0.1024, + "num_input_tokens_seen": 160938112, + "step": 74500 + }, + { + "epoch": 12.15415986949429, + "grad_norm": 0.031936485320329666, + "learning_rate": 0.0003999742778250056, + "loss": 0.0055, + "num_input_tokens_seen": 160948704, + "step": 74505 + }, + { + "epoch": 12.154975530179446, + "grad_norm": 0.6701197624206543, + "learning_rate": 0.0003999045378881629, + "loss": 0.082, + "num_input_tokens_seen": 160959840, + "step": 74510 + }, + { + "epoch": 12.1557911908646, + "grad_norm": 0.01326004695147276, + "learning_rate": 0.0003998347999798815, + "loss": 0.0083, + "num_input_tokens_seen": 160971488, + "step": 74515 + }, + { + "epoch": 12.156606851549755, + "grad_norm": 0.0024851495400071144, + "learning_rate": 0.00039976506410157513, + "loss": 0.0033, + "num_input_tokens_seen": 160982208, + "step": 74520 + }, + { + "epoch": 12.15742251223491, + "grad_norm": 0.0026938130613416433, + "learning_rate": 0.0003996953302546567, + "loss": 0.0171, + "num_input_tokens_seen": 160994304, + "step": 74525 + }, + { + "epoch": 12.158238172920065, + "grad_norm": 0.03997796028852463, + "learning_rate": 0.0003996255984405399, + "loss": 0.0037, + "num_input_tokens_seen": 161004096, + "step": 74530 + }, + { + "epoch": 12.15905383360522, + "grad_norm": 0.002501038834452629, + "learning_rate": 0.00039955586866063735, + "loss": 0.0196, + "num_input_tokens_seen": 161013920, + "step": 74535 + }, + { + "epoch": 12.159869494290374, + "grad_norm": 0.0058213709853589535, + "learning_rate": 0.0003994861409163628, + "loss": 0.0027, + "num_input_tokens_seen": 161025728, + "step": 74540 + }, + { + "epoch": 12.16068515497553, + "grad_norm": 0.04390028864145279, + "learning_rate": 0.000399416415209129, + "loss": 0.1984, + "num_input_tokens_seen": 161037120, + "step": 74545 + }, + { + "epoch": 12.161500815660686, + "grad_norm": 0.6059911847114563, + "learning_rate": 0.0003993466915403492, + "loss": 0.0559, + "num_input_tokens_seen": 161048128, + "step": 74550 + }, + { + "epoch": 12.16231647634584, + "grad_norm": 0.002712165005505085, + "learning_rate": 0.0003992769699114364, + "loss": 0.122, + "num_input_tokens_seen": 161059424, + "step": 74555 + }, + { + "epoch": 12.163132137030995, + "grad_norm": 0.11503525823354721, + "learning_rate": 0.0003992072503238035, + "loss": 0.0098, + "num_input_tokens_seen": 161070848, + "step": 74560 + }, + { + "epoch": 12.16394779771615, + "grad_norm": 0.005386181641370058, + "learning_rate": 0.0003991375327788635, + "loss": 0.0185, + "num_input_tokens_seen": 161083104, + "step": 74565 + }, + { + "epoch": 12.164763458401305, + "grad_norm": 0.01078125275671482, + "learning_rate": 0.00039906781727802956, + "loss": 0.1355, + "num_input_tokens_seen": 161094560, + "step": 74570 + }, + { + "epoch": 12.16557911908646, + "grad_norm": 0.012528739869594574, + "learning_rate": 0.0003989981038227141, + "loss": 0.0135, + "num_input_tokens_seen": 161104256, + "step": 74575 + }, + { + "epoch": 12.166394779771615, + "grad_norm": 0.0010541232768446207, + "learning_rate": 0.0003989283924143304, + "loss": 0.0044, + "num_input_tokens_seen": 161114176, + "step": 74580 + }, + { + "epoch": 12.16721044045677, + "grad_norm": 0.017634112387895584, + "learning_rate": 0.0003988586830542909, + "loss": 0.0076, + "num_input_tokens_seen": 161124992, + "step": 74585 + }, + { + "epoch": 12.168026101141924, + "grad_norm": 0.006476237438619137, + "learning_rate": 0.00039878897574400845, + "loss": 0.0054, + "num_input_tokens_seen": 161135488, + "step": 74590 + }, + { + "epoch": 12.16884176182708, + "grad_norm": 0.021481206640601158, + "learning_rate": 0.00039871927048489605, + "loss": 0.0049, + "num_input_tokens_seen": 161147648, + "step": 74595 + }, + { + "epoch": 12.169657422512234, + "grad_norm": 0.006172158755362034, + "learning_rate": 0.0003986495672783659, + "loss": 0.0068, + "num_input_tokens_seen": 161157120, + "step": 74600 + }, + { + "epoch": 12.17047308319739, + "grad_norm": 0.005506650544703007, + "learning_rate": 0.000398579866125831, + "loss": 0.061, + "num_input_tokens_seen": 161169152, + "step": 74605 + }, + { + "epoch": 12.171288743882545, + "grad_norm": 0.06159405782818794, + "learning_rate": 0.00039851016702870356, + "loss": 0.1283, + "num_input_tokens_seen": 161179808, + "step": 74610 + }, + { + "epoch": 12.1721044045677, + "grad_norm": 0.30520564317703247, + "learning_rate": 0.0003984404699883966, + "loss": 0.0298, + "num_input_tokens_seen": 161190784, + "step": 74615 + }, + { + "epoch": 12.172920065252855, + "grad_norm": 0.015198386274278164, + "learning_rate": 0.00039837077500632213, + "loss": 0.0062, + "num_input_tokens_seen": 161202176, + "step": 74620 + }, + { + "epoch": 12.173735725938009, + "grad_norm": 0.015425390563905239, + "learning_rate": 0.00039830108208389306, + "loss": 0.0026, + "num_input_tokens_seen": 161213248, + "step": 74625 + }, + { + "epoch": 12.174551386623165, + "grad_norm": 0.00755777582526207, + "learning_rate": 0.00039823139122252126, + "loss": 0.0124, + "num_input_tokens_seen": 161222752, + "step": 74630 + }, + { + "epoch": 12.17536704730832, + "grad_norm": 0.004761831369251013, + "learning_rate": 0.0003981617024236197, + "loss": 0.0026, + "num_input_tokens_seen": 161233824, + "step": 74635 + }, + { + "epoch": 12.176182707993474, + "grad_norm": 0.0022641567047685385, + "learning_rate": 0.0003980920156886003, + "loss": 0.0071, + "num_input_tokens_seen": 161243840, + "step": 74640 + }, + { + "epoch": 12.17699836867863, + "grad_norm": 0.02729635499417782, + "learning_rate": 0.0003980223310188756, + "loss": 0.0047, + "num_input_tokens_seen": 161254560, + "step": 74645 + }, + { + "epoch": 12.177814029363784, + "grad_norm": 0.06542062014341354, + "learning_rate": 0.00039795264841585755, + "loss": 0.0211, + "num_input_tokens_seen": 161264960, + "step": 74650 + }, + { + "epoch": 12.17862969004894, + "grad_norm": 0.027670329436659813, + "learning_rate": 0.00039788296788095866, + "loss": 0.0032, + "num_input_tokens_seen": 161276128, + "step": 74655 + }, + { + "epoch": 12.179445350734095, + "grad_norm": 0.008770488202571869, + "learning_rate": 0.00039781328941559084, + "loss": 0.0494, + "num_input_tokens_seen": 161288288, + "step": 74660 + }, + { + "epoch": 12.18026101141925, + "grad_norm": 0.04313148930668831, + "learning_rate": 0.0003977436130211666, + "loss": 0.0081, + "num_input_tokens_seen": 161299232, + "step": 74665 + }, + { + "epoch": 12.181076672104405, + "grad_norm": 0.001219844096340239, + "learning_rate": 0.0003976739386990975, + "loss": 0.0134, + "num_input_tokens_seen": 161309600, + "step": 74670 + }, + { + "epoch": 12.181892332789559, + "grad_norm": 0.013389154337346554, + "learning_rate": 0.0003976042664507961, + "loss": 0.0415, + "num_input_tokens_seen": 161320256, + "step": 74675 + }, + { + "epoch": 12.182707993474715, + "grad_norm": 1.1981724500656128, + "learning_rate": 0.0003975345962776738, + "loss": 0.0513, + "num_input_tokens_seen": 161331840, + "step": 74680 + }, + { + "epoch": 12.18352365415987, + "grad_norm": 0.006036388222128153, + "learning_rate": 0.0003974649281811431, + "loss": 0.0065, + "num_input_tokens_seen": 161342656, + "step": 74685 + }, + { + "epoch": 12.184339314845024, + "grad_norm": 0.006011773832142353, + "learning_rate": 0.00039739526216261566, + "loss": 0.005, + "num_input_tokens_seen": 161352768, + "step": 74690 + }, + { + "epoch": 12.18515497553018, + "grad_norm": 0.0059346966445446014, + "learning_rate": 0.00039732559822350336, + "loss": 0.1203, + "num_input_tokens_seen": 161364000, + "step": 74695 + }, + { + "epoch": 12.185970636215334, + "grad_norm": 0.3481042981147766, + "learning_rate": 0.00039725593636521817, + "loss": 0.0506, + "num_input_tokens_seen": 161374816, + "step": 74700 + }, + { + "epoch": 12.18678629690049, + "grad_norm": 0.07168328016996384, + "learning_rate": 0.0003971862765891716, + "loss": 0.0671, + "num_input_tokens_seen": 161385088, + "step": 74705 + }, + { + "epoch": 12.187601957585644, + "grad_norm": 0.11459054052829742, + "learning_rate": 0.00039711661889677577, + "loss": 0.0086, + "num_input_tokens_seen": 161395136, + "step": 74710 + }, + { + "epoch": 12.1884176182708, + "grad_norm": 0.026510460302233696, + "learning_rate": 0.00039704696328944205, + "loss": 0.0036, + "num_input_tokens_seen": 161405248, + "step": 74715 + }, + { + "epoch": 12.189233278955955, + "grad_norm": 0.006999279838055372, + "learning_rate": 0.0003969773097685823, + "loss": 0.0073, + "num_input_tokens_seen": 161414304, + "step": 74720 + }, + { + "epoch": 12.190048939641109, + "grad_norm": 0.008538886904716492, + "learning_rate": 0.000396907658335608, + "loss": 0.0039, + "num_input_tokens_seen": 161425184, + "step": 74725 + }, + { + "epoch": 12.190864600326265, + "grad_norm": 0.024406736716628075, + "learning_rate": 0.0003968380089919308, + "loss": 0.0967, + "num_input_tokens_seen": 161436352, + "step": 74730 + }, + { + "epoch": 12.191680261011419, + "grad_norm": 0.007404988165944815, + "learning_rate": 0.0003967683617389621, + "loss": 0.0044, + "num_input_tokens_seen": 161447424, + "step": 74735 + }, + { + "epoch": 12.192495921696574, + "grad_norm": 0.024634407833218575, + "learning_rate": 0.0003966987165781138, + "loss": 0.007, + "num_input_tokens_seen": 161458496, + "step": 74740 + }, + { + "epoch": 12.19331158238173, + "grad_norm": 0.10026438534259796, + "learning_rate": 0.00039662907351079675, + "loss": 0.1007, + "num_input_tokens_seen": 161469152, + "step": 74745 + }, + { + "epoch": 12.194127243066884, + "grad_norm": 0.016706952825188637, + "learning_rate": 0.00039655943253842293, + "loss": 0.0027, + "num_input_tokens_seen": 161479616, + "step": 74750 + }, + { + "epoch": 12.19494290375204, + "grad_norm": 0.0008600183646194637, + "learning_rate": 0.00039648979366240325, + "loss": 0.003, + "num_input_tokens_seen": 161491136, + "step": 74755 + }, + { + "epoch": 12.195758564437194, + "grad_norm": 0.025418315082788467, + "learning_rate": 0.00039642015688414936, + "loss": 0.0029, + "num_input_tokens_seen": 161502144, + "step": 74760 + }, + { + "epoch": 12.19657422512235, + "grad_norm": 0.002887872513383627, + "learning_rate": 0.00039635052220507216, + "loss": 0.0019, + "num_input_tokens_seen": 161513248, + "step": 74765 + }, + { + "epoch": 12.197389885807505, + "grad_norm": 0.017393076792359352, + "learning_rate": 0.0003962808896265834, + "loss": 0.0523, + "num_input_tokens_seen": 161523424, + "step": 74770 + }, + { + "epoch": 12.198205546492659, + "grad_norm": 0.25441282987594604, + "learning_rate": 0.0003962112591500937, + "loss": 0.0195, + "num_input_tokens_seen": 161535200, + "step": 74775 + }, + { + "epoch": 12.199021207177815, + "grad_norm": 0.18550816178321838, + "learning_rate": 0.00039614163077701474, + "loss": 0.1819, + "num_input_tokens_seen": 161543808, + "step": 74780 + }, + { + "epoch": 12.199836867862969, + "grad_norm": 0.3364203870296478, + "learning_rate": 0.00039607200450875716, + "loss": 0.0632, + "num_input_tokens_seen": 161554656, + "step": 74785 + }, + { + "epoch": 12.200652528548124, + "grad_norm": 0.003161477390676737, + "learning_rate": 0.0003960023803467325, + "loss": 0.0021, + "num_input_tokens_seen": 161565920, + "step": 74790 + }, + { + "epoch": 12.201468189233278, + "grad_norm": 0.007758776657283306, + "learning_rate": 0.0003959327582923513, + "loss": 0.0041, + "num_input_tokens_seen": 161577824, + "step": 74795 + }, + { + "epoch": 12.202283849918434, + "grad_norm": 0.0124747259542346, + "learning_rate": 0.000395863138347025, + "loss": 0.0131, + "num_input_tokens_seen": 161588352, + "step": 74800 + }, + { + "epoch": 12.20309951060359, + "grad_norm": 0.019156094640493393, + "learning_rate": 0.0003957935205121641, + "loss": 0.0227, + "num_input_tokens_seen": 161599680, + "step": 74805 + }, + { + "epoch": 12.203915171288743, + "grad_norm": 0.032875653356313705, + "learning_rate": 0.00039572390478917973, + "loss": 0.0123, + "num_input_tokens_seen": 161610016, + "step": 74810 + }, + { + "epoch": 12.2047308319739, + "grad_norm": 0.0038414266891777515, + "learning_rate": 0.00039565429117948287, + "loss": 0.0075, + "num_input_tokens_seen": 161621184, + "step": 74815 + }, + { + "epoch": 12.205546492659053, + "grad_norm": 0.0043778130784630775, + "learning_rate": 0.000395584679684484, + "loss": 0.1721, + "num_input_tokens_seen": 161631872, + "step": 74820 + }, + { + "epoch": 12.206362153344209, + "grad_norm": 0.006347167305648327, + "learning_rate": 0.00039551507030559423, + "loss": 0.003, + "num_input_tokens_seen": 161642304, + "step": 74825 + }, + { + "epoch": 12.207177814029365, + "grad_norm": 0.40752550959587097, + "learning_rate": 0.0003954454630442239, + "loss": 0.1171, + "num_input_tokens_seen": 161652672, + "step": 74830 + }, + { + "epoch": 12.207993474714518, + "grad_norm": 0.0724559798836708, + "learning_rate": 0.0003953758579017842, + "loss": 0.0265, + "num_input_tokens_seen": 161664000, + "step": 74835 + }, + { + "epoch": 12.208809135399674, + "grad_norm": 0.08504586666822433, + "learning_rate": 0.00039530625487968507, + "loss": 0.0195, + "num_input_tokens_seen": 161675296, + "step": 74840 + }, + { + "epoch": 12.209624796084828, + "grad_norm": 0.01224282942712307, + "learning_rate": 0.00039523665397933784, + "loss": 0.0058, + "num_input_tokens_seen": 161686880, + "step": 74845 + }, + { + "epoch": 12.210440456769984, + "grad_norm": 1.089669942855835, + "learning_rate": 0.0003951670552021525, + "loss": 0.1151, + "num_input_tokens_seen": 161697408, + "step": 74850 + }, + { + "epoch": 12.21125611745514, + "grad_norm": 0.0018654355080798268, + "learning_rate": 0.0003950974585495399, + "loss": 0.012, + "num_input_tokens_seen": 161707616, + "step": 74855 + }, + { + "epoch": 12.212071778140293, + "grad_norm": 0.053873226046562195, + "learning_rate": 0.0003950278640229103, + "loss": 0.0099, + "num_input_tokens_seen": 161718912, + "step": 74860 + }, + { + "epoch": 12.21288743882545, + "grad_norm": 0.0093051353469491, + "learning_rate": 0.0003949582716236743, + "loss": 0.0042, + "num_input_tokens_seen": 161729280, + "step": 74865 + }, + { + "epoch": 12.213703099510603, + "grad_norm": 0.02878117561340332, + "learning_rate": 0.0003948886813532421, + "loss": 0.1499, + "num_input_tokens_seen": 161739456, + "step": 74870 + }, + { + "epoch": 12.214518760195759, + "grad_norm": 0.004532721359282732, + "learning_rate": 0.00039481909321302413, + "loss": 0.1098, + "num_input_tokens_seen": 161750464, + "step": 74875 + }, + { + "epoch": 12.215334420880913, + "grad_norm": 0.005164369475096464, + "learning_rate": 0.0003947495072044306, + "loss": 0.0166, + "num_input_tokens_seen": 161761760, + "step": 74880 + }, + { + "epoch": 12.216150081566068, + "grad_norm": 0.00576377147808671, + "learning_rate": 0.00039467992332887196, + "loss": 0.0097, + "num_input_tokens_seen": 161773344, + "step": 74885 + }, + { + "epoch": 12.216965742251224, + "grad_norm": 0.024265503510832787, + "learning_rate": 0.0003946103415877582, + "loss": 0.0151, + "num_input_tokens_seen": 161785440, + "step": 74890 + }, + { + "epoch": 12.217781402936378, + "grad_norm": 0.008004284463822842, + "learning_rate": 0.00039454076198249964, + "loss": 0.0237, + "num_input_tokens_seen": 161795808, + "step": 74895 + }, + { + "epoch": 12.218597063621534, + "grad_norm": 0.0055056121200323105, + "learning_rate": 0.00039447118451450613, + "loss": 0.0036, + "num_input_tokens_seen": 161806432, + "step": 74900 + }, + { + "epoch": 12.219412724306688, + "grad_norm": 0.624906599521637, + "learning_rate": 0.00039440160918518825, + "loss": 0.1011, + "num_input_tokens_seen": 161816736, + "step": 74905 + }, + { + "epoch": 12.220228384991843, + "grad_norm": 0.05831537023186684, + "learning_rate": 0.00039433203599595546, + "loss": 0.0059, + "num_input_tokens_seen": 161826848, + "step": 74910 + }, + { + "epoch": 12.221044045676999, + "grad_norm": 0.007869354449212551, + "learning_rate": 0.00039426246494821793, + "loss": 0.0118, + "num_input_tokens_seen": 161837120, + "step": 74915 + }, + { + "epoch": 12.221859706362153, + "grad_norm": 0.05399933084845543, + "learning_rate": 0.000394192896043386, + "loss": 0.0087, + "num_input_tokens_seen": 161848064, + "step": 74920 + }, + { + "epoch": 12.222675367047309, + "grad_norm": 0.01727372780442238, + "learning_rate": 0.000394123329282869, + "loss": 0.0141, + "num_input_tokens_seen": 161858432, + "step": 74925 + }, + { + "epoch": 12.223491027732463, + "grad_norm": 0.00785834901034832, + "learning_rate": 0.0003940537646680773, + "loss": 0.0043, + "num_input_tokens_seen": 161869792, + "step": 74930 + }, + { + "epoch": 12.224306688417618, + "grad_norm": 0.06565750390291214, + "learning_rate": 0.0003939842022004202, + "loss": 0.0154, + "num_input_tokens_seen": 161880384, + "step": 74935 + }, + { + "epoch": 12.225122349102774, + "grad_norm": 0.005559089593589306, + "learning_rate": 0.00039391464188130796, + "loss": 0.0386, + "num_input_tokens_seen": 161889984, + "step": 74940 + }, + { + "epoch": 12.225938009787928, + "grad_norm": 0.041153181344270706, + "learning_rate": 0.0003938450837121499, + "loss": 0.0063, + "num_input_tokens_seen": 161900128, + "step": 74945 + }, + { + "epoch": 12.226753670473084, + "grad_norm": 0.0016776022966951132, + "learning_rate": 0.00039377552769435606, + "loss": 0.0016, + "num_input_tokens_seen": 161910944, + "step": 74950 + }, + { + "epoch": 12.227569331158238, + "grad_norm": 0.00216303626075387, + "learning_rate": 0.0003937059738293357, + "loss": 0.1426, + "num_input_tokens_seen": 161921600, + "step": 74955 + }, + { + "epoch": 12.228384991843393, + "grad_norm": 0.003117464715614915, + "learning_rate": 0.0003936364221184988, + "loss": 0.0097, + "num_input_tokens_seen": 161932768, + "step": 74960 + }, + { + "epoch": 12.229200652528547, + "grad_norm": 0.44894319772720337, + "learning_rate": 0.00039356687256325465, + "loss": 0.0747, + "num_input_tokens_seen": 161944480, + "step": 74965 + }, + { + "epoch": 12.230016313213703, + "grad_norm": 0.8600792288780212, + "learning_rate": 0.0003934973251650129, + "loss": 0.0236, + "num_input_tokens_seen": 161954048, + "step": 74970 + }, + { + "epoch": 12.230831973898859, + "grad_norm": 0.05913609266281128, + "learning_rate": 0.0003934277799251829, + "loss": 0.0307, + "num_input_tokens_seen": 161965792, + "step": 74975 + }, + { + "epoch": 12.231647634584013, + "grad_norm": 0.010049772448837757, + "learning_rate": 0.00039335823684517423, + "loss": 0.0067, + "num_input_tokens_seen": 161977312, + "step": 74980 + }, + { + "epoch": 12.232463295269168, + "grad_norm": 0.0010888243559747934, + "learning_rate": 0.00039328869592639604, + "loss": 0.005, + "num_input_tokens_seen": 161987680, + "step": 74985 + }, + { + "epoch": 12.233278955954322, + "grad_norm": 0.0009124244097620249, + "learning_rate": 0.00039321915717025797, + "loss": 0.0057, + "num_input_tokens_seen": 161998784, + "step": 74990 + }, + { + "epoch": 12.234094616639478, + "grad_norm": 0.01991415210068226, + "learning_rate": 0.00039314962057816896, + "loss": 0.1298, + "num_input_tokens_seen": 162009696, + "step": 74995 + }, + { + "epoch": 12.234910277324634, + "grad_norm": 0.016737831756472588, + "learning_rate": 0.0003930800861515385, + "loss": 0.0037, + "num_input_tokens_seen": 162020640, + "step": 75000 + }, + { + "epoch": 12.235725938009788, + "grad_norm": 0.02599795162677765, + "learning_rate": 0.00039301055389177577, + "loss": 0.0141, + "num_input_tokens_seen": 162031776, + "step": 75005 + }, + { + "epoch": 12.236541598694943, + "grad_norm": 0.0481642484664917, + "learning_rate": 0.00039294102380028987, + "loss": 0.0057, + "num_input_tokens_seen": 162043456, + "step": 75010 + }, + { + "epoch": 12.237357259380097, + "grad_norm": 0.004535711370408535, + "learning_rate": 0.0003928714958784899, + "loss": 0.0181, + "num_input_tokens_seen": 162055264, + "step": 75015 + }, + { + "epoch": 12.238172920065253, + "grad_norm": 0.016865408048033714, + "learning_rate": 0.00039280197012778493, + "loss": 0.0197, + "num_input_tokens_seen": 162065920, + "step": 75020 + }, + { + "epoch": 12.238988580750409, + "grad_norm": 0.020147256553173065, + "learning_rate": 0.0003927324465495841, + "loss": 0.0058, + "num_input_tokens_seen": 162076032, + "step": 75025 + }, + { + "epoch": 12.239804241435563, + "grad_norm": 0.004990022629499435, + "learning_rate": 0.0003926629251452963, + "loss": 0.0078, + "num_input_tokens_seen": 162087552, + "step": 75030 + }, + { + "epoch": 12.240619902120718, + "grad_norm": 0.00859206635504961, + "learning_rate": 0.0003925934059163306, + "loss": 0.0023, + "num_input_tokens_seen": 162097760, + "step": 75035 + }, + { + "epoch": 12.241435562805872, + "grad_norm": 0.0490594208240509, + "learning_rate": 0.0003925238888640957, + "loss": 0.0184, + "num_input_tokens_seen": 162107936, + "step": 75040 + }, + { + "epoch": 12.242251223491028, + "grad_norm": 0.006913432851433754, + "learning_rate": 0.0003924543739900005, + "loss": 0.033, + "num_input_tokens_seen": 162119264, + "step": 75045 + }, + { + "epoch": 12.243066884176184, + "grad_norm": 0.3514501750469208, + "learning_rate": 0.00039238486129545376, + "loss": 0.1685, + "num_input_tokens_seen": 162130464, + "step": 75050 + }, + { + "epoch": 12.243882544861338, + "grad_norm": 0.018597450107336044, + "learning_rate": 0.0003923153507818645, + "loss": 0.0288, + "num_input_tokens_seen": 162140928, + "step": 75055 + }, + { + "epoch": 12.244698205546493, + "grad_norm": 0.02613472379744053, + "learning_rate": 0.00039224584245064114, + "loss": 0.0078, + "num_input_tokens_seen": 162151936, + "step": 75060 + }, + { + "epoch": 12.245513866231647, + "grad_norm": 0.02216893993318081, + "learning_rate": 0.00039217633630319264, + "loss": 0.0027, + "num_input_tokens_seen": 162163744, + "step": 75065 + }, + { + "epoch": 12.246329526916803, + "grad_norm": 0.35483214259147644, + "learning_rate": 0.00039210683234092733, + "loss": 0.0128, + "num_input_tokens_seen": 162174880, + "step": 75070 + }, + { + "epoch": 12.247145187601957, + "grad_norm": 0.013999617658555508, + "learning_rate": 0.000392037330565254, + "loss": 0.0023, + "num_input_tokens_seen": 162185984, + "step": 75075 + }, + { + "epoch": 12.247960848287113, + "grad_norm": 0.09692630916833878, + "learning_rate": 0.000391967830977581, + "loss": 0.0063, + "num_input_tokens_seen": 162196000, + "step": 75080 + }, + { + "epoch": 12.248776508972268, + "grad_norm": 0.0007761928136460483, + "learning_rate": 0.0003918983335793173, + "loss": 0.1045, + "num_input_tokens_seen": 162205312, + "step": 75085 + }, + { + "epoch": 12.249592169657422, + "grad_norm": 0.013476379215717316, + "learning_rate": 0.00039182883837187056, + "loss": 0.0123, + "num_input_tokens_seen": 162213888, + "step": 75090 + }, + { + "epoch": 12.250407830342578, + "grad_norm": 0.017461730167269707, + "learning_rate": 0.00039175934535665, + "loss": 0.0037, + "num_input_tokens_seen": 162224960, + "step": 75095 + }, + { + "epoch": 12.251223491027732, + "grad_norm": 0.1801888644695282, + "learning_rate": 0.00039168985453506334, + "loss": 0.0109, + "num_input_tokens_seen": 162234592, + "step": 75100 + }, + { + "epoch": 12.252039151712887, + "grad_norm": 0.006768012419342995, + "learning_rate": 0.0003916203659085194, + "loss": 0.0058, + "num_input_tokens_seen": 162245440, + "step": 75105 + }, + { + "epoch": 12.252854812398043, + "grad_norm": 0.010229643434286118, + "learning_rate": 0.00039155087947842607, + "loss": 0.0678, + "num_input_tokens_seen": 162256320, + "step": 75110 + }, + { + "epoch": 12.253670473083197, + "grad_norm": 0.003439029911532998, + "learning_rate": 0.00039148139524619184, + "loss": 0.0025, + "num_input_tokens_seen": 162267264, + "step": 75115 + }, + { + "epoch": 12.254486133768353, + "grad_norm": 0.041233912110328674, + "learning_rate": 0.00039141191321322464, + "loss": 0.0076, + "num_input_tokens_seen": 162278112, + "step": 75120 + }, + { + "epoch": 12.255301794453507, + "grad_norm": 0.00618229852989316, + "learning_rate": 0.00039134243338093285, + "loss": 0.0033, + "num_input_tokens_seen": 162288480, + "step": 75125 + }, + { + "epoch": 12.256117455138662, + "grad_norm": 0.003569080028682947, + "learning_rate": 0.0003912729557507246, + "loss": 0.0086, + "num_input_tokens_seen": 162299680, + "step": 75130 + }, + { + "epoch": 12.256933115823816, + "grad_norm": 0.019696485251188278, + "learning_rate": 0.0003912034803240077, + "loss": 0.0173, + "num_input_tokens_seen": 162309984, + "step": 75135 + }, + { + "epoch": 12.257748776508972, + "grad_norm": 0.19719895720481873, + "learning_rate": 0.0003911340071021905, + "loss": 0.0075, + "num_input_tokens_seen": 162321152, + "step": 75140 + }, + { + "epoch": 12.258564437194128, + "grad_norm": 0.07820143550634384, + "learning_rate": 0.00039106453608668047, + "loss": 0.0567, + "num_input_tokens_seen": 162332448, + "step": 75145 + }, + { + "epoch": 12.259380097879282, + "grad_norm": 0.013048024848103523, + "learning_rate": 0.0003909950672788861, + "loss": 0.0053, + "num_input_tokens_seen": 162343552, + "step": 75150 + }, + { + "epoch": 12.260195758564437, + "grad_norm": 0.02106913924217224, + "learning_rate": 0.0003909256006802147, + "loss": 0.0076, + "num_input_tokens_seen": 162353280, + "step": 75155 + }, + { + "epoch": 12.261011419249591, + "grad_norm": 0.00745503231883049, + "learning_rate": 0.0003908561362920746, + "loss": 0.0496, + "num_input_tokens_seen": 162364224, + "step": 75160 + }, + { + "epoch": 12.261827079934747, + "grad_norm": 0.0053404951468110085, + "learning_rate": 0.00039078667411587316, + "loss": 0.0024, + "num_input_tokens_seen": 162375520, + "step": 75165 + }, + { + "epoch": 12.262642740619903, + "grad_norm": 0.003612641477957368, + "learning_rate": 0.0003907172141530184, + "loss": 0.0019, + "num_input_tokens_seen": 162386016, + "step": 75170 + }, + { + "epoch": 12.263458401305057, + "grad_norm": 0.004556257743388414, + "learning_rate": 0.00039064775640491796, + "loss": 0.0014, + "num_input_tokens_seen": 162396576, + "step": 75175 + }, + { + "epoch": 12.264274061990212, + "grad_norm": 0.4065341651439667, + "learning_rate": 0.00039057830087297946, + "loss": 0.0141, + "num_input_tokens_seen": 162406912, + "step": 75180 + }, + { + "epoch": 12.265089722675366, + "grad_norm": 0.004324205219745636, + "learning_rate": 0.0003905088475586105, + "loss": 0.0497, + "num_input_tokens_seen": 162418144, + "step": 75185 + }, + { + "epoch": 12.265905383360522, + "grad_norm": 0.002117524156346917, + "learning_rate": 0.0003904393964632186, + "loss": 0.003, + "num_input_tokens_seen": 162428640, + "step": 75190 + }, + { + "epoch": 12.266721044045678, + "grad_norm": 0.007448033429682255, + "learning_rate": 0.00039036994758821124, + "loss": 0.1817, + "num_input_tokens_seen": 162440064, + "step": 75195 + }, + { + "epoch": 12.267536704730832, + "grad_norm": 0.10965090245008469, + "learning_rate": 0.00039030050093499623, + "loss": 0.0442, + "num_input_tokens_seen": 162451744, + "step": 75200 + }, + { + "epoch": 12.268352365415987, + "grad_norm": 0.48558029532432556, + "learning_rate": 0.0003902310565049805, + "loss": 0.0091, + "num_input_tokens_seen": 162462368, + "step": 75205 + }, + { + "epoch": 12.269168026101141, + "grad_norm": 0.013588961213827133, + "learning_rate": 0.0003901616142995718, + "loss": 0.125, + "num_input_tokens_seen": 162472128, + "step": 75210 + }, + { + "epoch": 12.269983686786297, + "grad_norm": 1.1377973556518555, + "learning_rate": 0.0003900921743201772, + "loss": 0.0954, + "num_input_tokens_seen": 162482848, + "step": 75215 + }, + { + "epoch": 12.270799347471453, + "grad_norm": 0.023800566792488098, + "learning_rate": 0.00039002273656820423, + "loss": 0.0457, + "num_input_tokens_seen": 162493408, + "step": 75220 + }, + { + "epoch": 12.271615008156607, + "grad_norm": 0.0248698852956295, + "learning_rate": 0.0003899533010450599, + "loss": 0.0097, + "num_input_tokens_seen": 162505184, + "step": 75225 + }, + { + "epoch": 12.272430668841762, + "grad_norm": 0.06870092451572418, + "learning_rate": 0.0003898838677521515, + "loss": 0.0128, + "num_input_tokens_seen": 162515552, + "step": 75230 + }, + { + "epoch": 12.273246329526916, + "grad_norm": 0.07277870923280716, + "learning_rate": 0.00038981443669088646, + "loss": 0.1021, + "num_input_tokens_seen": 162526176, + "step": 75235 + }, + { + "epoch": 12.274061990212072, + "grad_norm": 0.01803704723715782, + "learning_rate": 0.0003897450078626714, + "loss": 0.0147, + "num_input_tokens_seen": 162537600, + "step": 75240 + }, + { + "epoch": 12.274877650897226, + "grad_norm": 0.0018408960895612836, + "learning_rate": 0.0003896755812689138, + "loss": 0.0197, + "num_input_tokens_seen": 162548096, + "step": 75245 + }, + { + "epoch": 12.275693311582382, + "grad_norm": 0.0035825977101922035, + "learning_rate": 0.0003896061569110203, + "loss": 0.0085, + "num_input_tokens_seen": 162559872, + "step": 75250 + }, + { + "epoch": 12.276508972267537, + "grad_norm": 0.032333966344594955, + "learning_rate": 0.0003895367347903983, + "loss": 0.0207, + "num_input_tokens_seen": 162570816, + "step": 75255 + }, + { + "epoch": 12.277324632952691, + "grad_norm": 0.0008797519840300083, + "learning_rate": 0.0003894673149084543, + "loss": 0.0035, + "num_input_tokens_seen": 162581536, + "step": 75260 + }, + { + "epoch": 12.278140293637847, + "grad_norm": 0.0028373999521136284, + "learning_rate": 0.0003893978972665956, + "loss": 0.0742, + "num_input_tokens_seen": 162591712, + "step": 75265 + }, + { + "epoch": 12.278955954323001, + "grad_norm": 0.0014782834332436323, + "learning_rate": 0.0003893284818662286, + "loss": 0.0037, + "num_input_tokens_seen": 162602624, + "step": 75270 + }, + { + "epoch": 12.279771615008157, + "grad_norm": 0.008053838275372982, + "learning_rate": 0.0003892590687087605, + "loss": 0.0069, + "num_input_tokens_seen": 162614048, + "step": 75275 + }, + { + "epoch": 12.280587275693312, + "grad_norm": 0.0029203668236732483, + "learning_rate": 0.0003891896577955977, + "loss": 0.0346, + "num_input_tokens_seen": 162624160, + "step": 75280 + }, + { + "epoch": 12.281402936378466, + "grad_norm": 0.021967828273773193, + "learning_rate": 0.0003891202491281472, + "loss": 0.06, + "num_input_tokens_seen": 162636000, + "step": 75285 + }, + { + "epoch": 12.282218597063622, + "grad_norm": 0.006306948605924845, + "learning_rate": 0.0003890508427078153, + "loss": 0.0015, + "num_input_tokens_seen": 162646400, + "step": 75290 + }, + { + "epoch": 12.283034257748776, + "grad_norm": 0.07245063781738281, + "learning_rate": 0.0003889814385360091, + "loss": 0.0118, + "num_input_tokens_seen": 162657344, + "step": 75295 + }, + { + "epoch": 12.283849918433932, + "grad_norm": 0.0017536969389766455, + "learning_rate": 0.0003889120366141347, + "loss": 0.1543, + "num_input_tokens_seen": 162668544, + "step": 75300 + }, + { + "epoch": 12.284665579119087, + "grad_norm": 0.25096216797828674, + "learning_rate": 0.0003888426369435989, + "loss": 0.0068, + "num_input_tokens_seen": 162679296, + "step": 75305 + }, + { + "epoch": 12.285481239804241, + "grad_norm": 0.002455186564475298, + "learning_rate": 0.0003887732395258079, + "loss": 0.0049, + "num_input_tokens_seen": 162689728, + "step": 75310 + }, + { + "epoch": 12.286296900489397, + "grad_norm": 0.004991905763745308, + "learning_rate": 0.0003887038443621684, + "loss": 0.008, + "num_input_tokens_seen": 162700576, + "step": 75315 + }, + { + "epoch": 12.28711256117455, + "grad_norm": 0.013448765501379967, + "learning_rate": 0.0003886344514540868, + "loss": 0.0036, + "num_input_tokens_seen": 162711808, + "step": 75320 + }, + { + "epoch": 12.287928221859707, + "grad_norm": 0.03673629090189934, + "learning_rate": 0.0003885650608029692, + "loss": 0.0065, + "num_input_tokens_seen": 162722208, + "step": 75325 + }, + { + "epoch": 12.28874388254486, + "grad_norm": 0.2677193582057953, + "learning_rate": 0.00038849567241022205, + "loss": 0.0226, + "num_input_tokens_seen": 162732416, + "step": 75330 + }, + { + "epoch": 12.289559543230016, + "grad_norm": 0.006491140462458134, + "learning_rate": 0.0003884262862772514, + "loss": 0.0028, + "num_input_tokens_seen": 162744576, + "step": 75335 + }, + { + "epoch": 12.290375203915172, + "grad_norm": 0.3217860162258148, + "learning_rate": 0.0003883569024054638, + "loss": 0.1769, + "num_input_tokens_seen": 162753600, + "step": 75340 + }, + { + "epoch": 12.291190864600326, + "grad_norm": 0.019507482647895813, + "learning_rate": 0.0003882875207962651, + "loss": 0.0056, + "num_input_tokens_seen": 162764256, + "step": 75345 + }, + { + "epoch": 12.292006525285482, + "grad_norm": 0.0013944999082013965, + "learning_rate": 0.0003882181414510616, + "loss": 0.0074, + "num_input_tokens_seen": 162775840, + "step": 75350 + }, + { + "epoch": 12.292822185970635, + "grad_norm": 0.03465007618069649, + "learning_rate": 0.00038814876437125916, + "loss": 0.0037, + "num_input_tokens_seen": 162786656, + "step": 75355 + }, + { + "epoch": 12.293637846655791, + "grad_norm": 0.04906386137008667, + "learning_rate": 0.000388079389558264, + "loss": 0.0085, + "num_input_tokens_seen": 162799584, + "step": 75360 + }, + { + "epoch": 12.294453507340947, + "grad_norm": 0.028646433725953102, + "learning_rate": 0.0003880100170134818, + "loss": 0.0081, + "num_input_tokens_seen": 162809984, + "step": 75365 + }, + { + "epoch": 12.2952691680261, + "grad_norm": 0.028914660215377808, + "learning_rate": 0.00038794064673831896, + "loss": 0.0114, + "num_input_tokens_seen": 162820256, + "step": 75370 + }, + { + "epoch": 12.296084828711257, + "grad_norm": 0.016150979325175285, + "learning_rate": 0.0003878712787341809, + "loss": 0.0354, + "num_input_tokens_seen": 162830880, + "step": 75375 + }, + { + "epoch": 12.29690048939641, + "grad_norm": 0.0038916615303605795, + "learning_rate": 0.0003878019130024737, + "loss": 0.0096, + "num_input_tokens_seen": 162840896, + "step": 75380 + }, + { + "epoch": 12.297716150081566, + "grad_norm": 0.011551225557923317, + "learning_rate": 0.000387732549544603, + "loss": 0.1076, + "num_input_tokens_seen": 162852000, + "step": 75385 + }, + { + "epoch": 12.298531810766722, + "grad_norm": 0.017128009349107742, + "learning_rate": 0.0003876631883619747, + "loss": 0.0223, + "num_input_tokens_seen": 162862368, + "step": 75390 + }, + { + "epoch": 12.299347471451876, + "grad_norm": 0.01165260374546051, + "learning_rate": 0.0003875938294559942, + "loss": 0.003, + "num_input_tokens_seen": 162873504, + "step": 75395 + }, + { + "epoch": 12.300163132137031, + "grad_norm": 0.01163018774241209, + "learning_rate": 0.0003875244728280676, + "loss": 0.2225, + "num_input_tokens_seen": 162884992, + "step": 75400 + }, + { + "epoch": 12.300978792822185, + "grad_norm": 0.019337153062224388, + "learning_rate": 0.00038745511847960003, + "loss": 0.1481, + "num_input_tokens_seen": 162895488, + "step": 75405 + }, + { + "epoch": 12.301794453507341, + "grad_norm": 0.0666937604546547, + "learning_rate": 0.0003873857664119974, + "loss": 0.0548, + "num_input_tokens_seen": 162905888, + "step": 75410 + }, + { + "epoch": 12.302610114192497, + "grad_norm": 0.06270725280046463, + "learning_rate": 0.00038731641662666493, + "loss": 0.0347, + "num_input_tokens_seen": 162917344, + "step": 75415 + }, + { + "epoch": 12.30342577487765, + "grad_norm": 0.000730838452000171, + "learning_rate": 0.00038724706912500847, + "loss": 0.0063, + "num_input_tokens_seen": 162927616, + "step": 75420 + }, + { + "epoch": 12.304241435562806, + "grad_norm": 0.052667297422885895, + "learning_rate": 0.0003871777239084329, + "loss": 0.0157, + "num_input_tokens_seen": 162937856, + "step": 75425 + }, + { + "epoch": 12.30505709624796, + "grad_norm": 0.02632969245314598, + "learning_rate": 0.00038710838097834414, + "loss": 0.0766, + "num_input_tokens_seen": 162949632, + "step": 75430 + }, + { + "epoch": 12.305872756933116, + "grad_norm": 0.005952873267233372, + "learning_rate": 0.000387039040336147, + "loss": 0.0048, + "num_input_tokens_seen": 162959552, + "step": 75435 + }, + { + "epoch": 12.30668841761827, + "grad_norm": 0.022846754640340805, + "learning_rate": 0.0003869697019832473, + "loss": 0.1431, + "num_input_tokens_seen": 162970496, + "step": 75440 + }, + { + "epoch": 12.307504078303426, + "grad_norm": 0.22744394838809967, + "learning_rate": 0.0003869003659210497, + "loss": 0.1829, + "num_input_tokens_seen": 162980160, + "step": 75445 + }, + { + "epoch": 12.308319738988581, + "grad_norm": 0.05033477395772934, + "learning_rate": 0.00038683103215095965, + "loss": 0.1123, + "num_input_tokens_seen": 162990496, + "step": 75450 + }, + { + "epoch": 12.309135399673735, + "grad_norm": 0.007061969488859177, + "learning_rate": 0.00038676170067438256, + "loss": 0.011, + "num_input_tokens_seen": 163000832, + "step": 75455 + }, + { + "epoch": 12.309951060358891, + "grad_norm": 0.022670293226838112, + "learning_rate": 0.00038669237149272303, + "loss": 0.0088, + "num_input_tokens_seen": 163012416, + "step": 75460 + }, + { + "epoch": 12.310766721044045, + "grad_norm": 0.0011929698521271348, + "learning_rate": 0.0003866230446073865, + "loss": 0.01, + "num_input_tokens_seen": 163022912, + "step": 75465 + }, + { + "epoch": 12.3115823817292, + "grad_norm": 0.006316805724054575, + "learning_rate": 0.0003865537200197776, + "loss": 0.0059, + "num_input_tokens_seen": 163033280, + "step": 75470 + }, + { + "epoch": 12.312398042414356, + "grad_norm": 0.45057278871536255, + "learning_rate": 0.0003864843977313017, + "loss": 0.0742, + "num_input_tokens_seen": 163043808, + "step": 75475 + }, + { + "epoch": 12.31321370309951, + "grad_norm": 0.010832220315933228, + "learning_rate": 0.0003864150777433634, + "loss": 0.0587, + "num_input_tokens_seen": 163054752, + "step": 75480 + }, + { + "epoch": 12.314029363784666, + "grad_norm": 0.10409935563802719, + "learning_rate": 0.0003863457600573676, + "loss": 0.1449, + "num_input_tokens_seen": 163065152, + "step": 75485 + }, + { + "epoch": 12.31484502446982, + "grad_norm": 0.014816675335168839, + "learning_rate": 0.00038627644467471915, + "loss": 0.0093, + "num_input_tokens_seen": 163076992, + "step": 75490 + }, + { + "epoch": 12.315660685154976, + "grad_norm": 0.003097902750596404, + "learning_rate": 0.00038620713159682286, + "loss": 0.0294, + "num_input_tokens_seen": 163088704, + "step": 75495 + }, + { + "epoch": 12.31647634584013, + "grad_norm": 0.032903462648391724, + "learning_rate": 0.0003861378208250834, + "loss": 0.0136, + "num_input_tokens_seen": 163099168, + "step": 75500 + }, + { + "epoch": 12.317292006525285, + "grad_norm": 0.01986708678305149, + "learning_rate": 0.00038606851236090543, + "loss": 0.0251, + "num_input_tokens_seen": 163108800, + "step": 75505 + }, + { + "epoch": 12.318107667210441, + "grad_norm": 0.029241114854812622, + "learning_rate": 0.00038599920620569357, + "loss": 0.0785, + "num_input_tokens_seen": 163119456, + "step": 75510 + }, + { + "epoch": 12.318923327895595, + "grad_norm": 0.0034615658223628998, + "learning_rate": 0.00038592990236085257, + "loss": 0.0089, + "num_input_tokens_seen": 163130656, + "step": 75515 + }, + { + "epoch": 12.31973898858075, + "grad_norm": 0.020488232374191284, + "learning_rate": 0.0003858606008277866, + "loss": 0.041, + "num_input_tokens_seen": 163141056, + "step": 75520 + }, + { + "epoch": 12.320554649265905, + "grad_norm": 0.04791320860385895, + "learning_rate": 0.0003857913016079005, + "loss": 0.1059, + "num_input_tokens_seen": 163151328, + "step": 75525 + }, + { + "epoch": 12.32137030995106, + "grad_norm": 0.014877380803227425, + "learning_rate": 0.0003857220047025984, + "loss": 0.0387, + "num_input_tokens_seen": 163161952, + "step": 75530 + }, + { + "epoch": 12.322185970636216, + "grad_norm": 0.016949398443102837, + "learning_rate": 0.00038565271011328507, + "loss": 0.0205, + "num_input_tokens_seen": 163173184, + "step": 75535 + }, + { + "epoch": 12.32300163132137, + "grad_norm": 0.022877110168337822, + "learning_rate": 0.00038558341784136437, + "loss": 0.1064, + "num_input_tokens_seen": 163185472, + "step": 75540 + }, + { + "epoch": 12.323817292006526, + "grad_norm": 0.3788914680480957, + "learning_rate": 0.00038551412788824106, + "loss": 0.0798, + "num_input_tokens_seen": 163195936, + "step": 75545 + }, + { + "epoch": 12.32463295269168, + "grad_norm": 0.005041462369263172, + "learning_rate": 0.0003854448402553191, + "loss": 0.0708, + "num_input_tokens_seen": 163205152, + "step": 75550 + }, + { + "epoch": 12.325448613376835, + "grad_norm": 0.007434530183672905, + "learning_rate": 0.0003853755549440026, + "loss": 0.0199, + "num_input_tokens_seen": 163214848, + "step": 75555 + }, + { + "epoch": 12.326264274061991, + "grad_norm": 0.19846367835998535, + "learning_rate": 0.0003853062719556962, + "loss": 0.0238, + "num_input_tokens_seen": 163225888, + "step": 75560 + }, + { + "epoch": 12.327079934747145, + "grad_norm": 0.1723729521036148, + "learning_rate": 0.0003852369912918035, + "loss": 0.0675, + "num_input_tokens_seen": 163237664, + "step": 75565 + }, + { + "epoch": 12.3278955954323, + "grad_norm": 0.014083434827625751, + "learning_rate": 0.00038516771295372894, + "loss": 0.0091, + "num_input_tokens_seen": 163248896, + "step": 75570 + }, + { + "epoch": 12.328711256117455, + "grad_norm": 0.011963681317865849, + "learning_rate": 0.00038509843694287615, + "loss": 0.0126, + "num_input_tokens_seen": 163258784, + "step": 75575 + }, + { + "epoch": 12.32952691680261, + "grad_norm": 0.042714718729257584, + "learning_rate": 0.0003850291632606495, + "loss": 0.0186, + "num_input_tokens_seen": 163269536, + "step": 75580 + }, + { + "epoch": 12.330342577487766, + "grad_norm": 0.008637349121272564, + "learning_rate": 0.00038495989190845246, + "loss": 0.0394, + "num_input_tokens_seen": 163280736, + "step": 75585 + }, + { + "epoch": 12.33115823817292, + "grad_norm": 0.0011679278686642647, + "learning_rate": 0.00038489062288768944, + "loss": 0.003, + "num_input_tokens_seen": 163291040, + "step": 75590 + }, + { + "epoch": 12.331973898858076, + "grad_norm": 0.018200945109128952, + "learning_rate": 0.00038482135619976373, + "loss": 0.0067, + "num_input_tokens_seen": 163302464, + "step": 75595 + }, + { + "epoch": 12.33278955954323, + "grad_norm": 0.010142846964299679, + "learning_rate": 0.0003847520918460795, + "loss": 0.0091, + "num_input_tokens_seen": 163312832, + "step": 75600 + }, + { + "epoch": 12.333605220228385, + "grad_norm": 0.01162709854543209, + "learning_rate": 0.00038468282982804023, + "loss": 0.0056, + "num_input_tokens_seen": 163323552, + "step": 75605 + }, + { + "epoch": 12.33442088091354, + "grad_norm": 0.004280074033886194, + "learning_rate": 0.00038461357014704986, + "loss": 0.0029, + "num_input_tokens_seen": 163333824, + "step": 75610 + }, + { + "epoch": 12.335236541598695, + "grad_norm": 0.0024590755347162485, + "learning_rate": 0.00038454431280451163, + "loss": 0.009, + "num_input_tokens_seen": 163345152, + "step": 75615 + }, + { + "epoch": 12.33605220228385, + "grad_norm": 0.019506709650158882, + "learning_rate": 0.00038447505780182963, + "loss": 0.0101, + "num_input_tokens_seen": 163356288, + "step": 75620 + }, + { + "epoch": 12.336867862969005, + "grad_norm": 0.03595130145549774, + "learning_rate": 0.0003844058051404069, + "loss": 0.0354, + "num_input_tokens_seen": 163367488, + "step": 75625 + }, + { + "epoch": 12.33768352365416, + "grad_norm": 0.011195999570190907, + "learning_rate": 0.00038433655482164727, + "loss": 0.0184, + "num_input_tokens_seen": 163379328, + "step": 75630 + }, + { + "epoch": 12.338499184339314, + "grad_norm": 0.20951534807682037, + "learning_rate": 0.0003842673068469541, + "loss": 0.1006, + "num_input_tokens_seen": 163390656, + "step": 75635 + }, + { + "epoch": 12.33931484502447, + "grad_norm": 0.008074449375271797, + "learning_rate": 0.0003841980612177308, + "loss": 0.004, + "num_input_tokens_seen": 163401568, + "step": 75640 + }, + { + "epoch": 12.340130505709626, + "grad_norm": 0.007740038447082043, + "learning_rate": 0.00038412881793538063, + "loss": 0.0179, + "num_input_tokens_seen": 163413088, + "step": 75645 + }, + { + "epoch": 12.34094616639478, + "grad_norm": 0.022545767948031425, + "learning_rate": 0.000384059577001307, + "loss": 0.0115, + "num_input_tokens_seen": 163425024, + "step": 75650 + }, + { + "epoch": 12.341761827079935, + "grad_norm": 0.012054262682795525, + "learning_rate": 0.000383990338416913, + "loss": 0.0167, + "num_input_tokens_seen": 163436736, + "step": 75655 + }, + { + "epoch": 12.34257748776509, + "grad_norm": 0.02720271609723568, + "learning_rate": 0.00038392110218360203, + "loss": 0.025, + "num_input_tokens_seen": 163446464, + "step": 75660 + }, + { + "epoch": 12.343393148450245, + "grad_norm": 0.0017249691300094128, + "learning_rate": 0.0003838518683027772, + "loss": 0.0115, + "num_input_tokens_seen": 163457312, + "step": 75665 + }, + { + "epoch": 12.3442088091354, + "grad_norm": 0.008015700615942478, + "learning_rate": 0.0003837826367758417, + "loss": 0.0092, + "num_input_tokens_seen": 163467296, + "step": 75670 + }, + { + "epoch": 12.345024469820554, + "grad_norm": 0.004186101723462343, + "learning_rate": 0.0003837134076041984, + "loss": 0.0031, + "num_input_tokens_seen": 163477216, + "step": 75675 + }, + { + "epoch": 12.34584013050571, + "grad_norm": 0.0532694011926651, + "learning_rate": 0.00038364418078925037, + "loss": 0.0089, + "num_input_tokens_seen": 163487008, + "step": 75680 + }, + { + "epoch": 12.346655791190864, + "grad_norm": 0.004246499389410019, + "learning_rate": 0.0003835749563324008, + "loss": 0.0027, + "num_input_tokens_seen": 163498624, + "step": 75685 + }, + { + "epoch": 12.34747145187602, + "grad_norm": 0.005101019516587257, + "learning_rate": 0.0003835057342350522, + "loss": 0.0017, + "num_input_tokens_seen": 163508832, + "step": 75690 + }, + { + "epoch": 12.348287112561174, + "grad_norm": 0.003990354016423225, + "learning_rate": 0.0003834365144986079, + "loss": 0.1461, + "num_input_tokens_seen": 163520032, + "step": 75695 + }, + { + "epoch": 12.34910277324633, + "grad_norm": 0.015845347195863724, + "learning_rate": 0.00038336729712447034, + "loss": 0.0331, + "num_input_tokens_seen": 163530368, + "step": 75700 + }, + { + "epoch": 12.349918433931485, + "grad_norm": 0.001217706361785531, + "learning_rate": 0.0003832980821140426, + "loss": 0.0098, + "num_input_tokens_seen": 163541280, + "step": 75705 + }, + { + "epoch": 12.350734094616639, + "grad_norm": 0.006318389903753996, + "learning_rate": 0.00038322886946872716, + "loss": 0.0794, + "num_input_tokens_seen": 163551584, + "step": 75710 + }, + { + "epoch": 12.351549755301795, + "grad_norm": 0.13416573405265808, + "learning_rate": 0.000383159659189927, + "loss": 0.1529, + "num_input_tokens_seen": 163562592, + "step": 75715 + }, + { + "epoch": 12.352365415986949, + "grad_norm": 0.23575806617736816, + "learning_rate": 0.0003830904512790443, + "loss": 0.1086, + "num_input_tokens_seen": 163573728, + "step": 75720 + }, + { + "epoch": 12.353181076672104, + "grad_norm": 0.026259060949087143, + "learning_rate": 0.0003830212457374821, + "loss": 0.0059, + "num_input_tokens_seen": 163585216, + "step": 75725 + }, + { + "epoch": 12.35399673735726, + "grad_norm": 0.019766176119446754, + "learning_rate": 0.00038295204256664264, + "loss": 0.0041, + "num_input_tokens_seen": 163596288, + "step": 75730 + }, + { + "epoch": 12.354812398042414, + "grad_norm": 0.020990602672100067, + "learning_rate": 0.00038288284176792866, + "loss": 0.0167, + "num_input_tokens_seen": 163608000, + "step": 75735 + }, + { + "epoch": 12.35562805872757, + "grad_norm": 0.28732380270957947, + "learning_rate": 0.0003828136433427423, + "loss": 0.1368, + "num_input_tokens_seen": 163618976, + "step": 75740 + }, + { + "epoch": 12.356443719412724, + "grad_norm": 0.011000544764101505, + "learning_rate": 0.00038274444729248633, + "loss": 0.01, + "num_input_tokens_seen": 163628320, + "step": 75745 + }, + { + "epoch": 12.35725938009788, + "grad_norm": 0.002288726856932044, + "learning_rate": 0.00038267525361856264, + "loss": 0.0062, + "num_input_tokens_seen": 163638336, + "step": 75750 + }, + { + "epoch": 12.358075040783035, + "grad_norm": 0.03208902105689049, + "learning_rate": 0.000382606062322374, + "loss": 0.0044, + "num_input_tokens_seen": 163649376, + "step": 75755 + }, + { + "epoch": 12.358890701468189, + "grad_norm": 0.019938675686717033, + "learning_rate": 0.00038253687340532224, + "loss": 0.0064, + "num_input_tokens_seen": 163660416, + "step": 75760 + }, + { + "epoch": 12.359706362153345, + "grad_norm": 0.030354809015989304, + "learning_rate": 0.0003824676868688097, + "loss": 0.0428, + "num_input_tokens_seen": 163670720, + "step": 75765 + }, + { + "epoch": 12.360522022838499, + "grad_norm": 0.020712848752737045, + "learning_rate": 0.0003823985027142389, + "loss": 0.0043, + "num_input_tokens_seen": 163681728, + "step": 75770 + }, + { + "epoch": 12.361337683523654, + "grad_norm": 0.012714402750134468, + "learning_rate": 0.0003823293209430113, + "loss": 0.0044, + "num_input_tokens_seen": 163691584, + "step": 75775 + }, + { + "epoch": 12.362153344208808, + "grad_norm": 0.003048856742680073, + "learning_rate": 0.00038226014155652956, + "loss": 0.0054, + "num_input_tokens_seen": 163702688, + "step": 75780 + }, + { + "epoch": 12.362969004893964, + "grad_norm": 0.049370184540748596, + "learning_rate": 0.0003821909645561952, + "loss": 0.0042, + "num_input_tokens_seen": 163713472, + "step": 75785 + }, + { + "epoch": 12.36378466557912, + "grad_norm": 0.043685123324394226, + "learning_rate": 0.0003821217899434106, + "loss": 0.0054, + "num_input_tokens_seen": 163725760, + "step": 75790 + }, + { + "epoch": 12.364600326264274, + "grad_norm": 0.021820900961756706, + "learning_rate": 0.0003820526177195772, + "loss": 0.0047, + "num_input_tokens_seen": 163736928, + "step": 75795 + }, + { + "epoch": 12.36541598694943, + "grad_norm": 0.10135416686534882, + "learning_rate": 0.00038198344788609737, + "loss": 0.0081, + "num_input_tokens_seen": 163747392, + "step": 75800 + }, + { + "epoch": 12.366231647634583, + "grad_norm": 0.02166566252708435, + "learning_rate": 0.0003819142804443726, + "loss": 0.0131, + "num_input_tokens_seen": 163758144, + "step": 75805 + }, + { + "epoch": 12.367047308319739, + "grad_norm": 0.011187167838215828, + "learning_rate": 0.0003818451153958047, + "loss": 0.0423, + "num_input_tokens_seen": 163770528, + "step": 75810 + }, + { + "epoch": 12.367862969004895, + "grad_norm": 0.00776352034881711, + "learning_rate": 0.0003817759527417955, + "loss": 0.0035, + "num_input_tokens_seen": 163782112, + "step": 75815 + }, + { + "epoch": 12.368678629690049, + "grad_norm": 0.00991200003772974, + "learning_rate": 0.00038170679248374653, + "loss": 0.0044, + "num_input_tokens_seen": 163792736, + "step": 75820 + }, + { + "epoch": 12.369494290375204, + "grad_norm": 0.2220042645931244, + "learning_rate": 0.00038163763462305944, + "loss": 0.0094, + "num_input_tokens_seen": 163804320, + "step": 75825 + }, + { + "epoch": 12.370309951060358, + "grad_norm": 0.052491020411252975, + "learning_rate": 0.000381568479161136, + "loss": 0.0061, + "num_input_tokens_seen": 163814272, + "step": 75830 + }, + { + "epoch": 12.371125611745514, + "grad_norm": 0.18457885086536407, + "learning_rate": 0.00038149932609937736, + "loss": 0.0268, + "num_input_tokens_seen": 163825664, + "step": 75835 + }, + { + "epoch": 12.37194127243067, + "grad_norm": 0.007164886686950922, + "learning_rate": 0.00038143017543918546, + "loss": 0.0203, + "num_input_tokens_seen": 163835680, + "step": 75840 + }, + { + "epoch": 12.372756933115824, + "grad_norm": 0.01174076460301876, + "learning_rate": 0.0003813610271819612, + "loss": 0.0053, + "num_input_tokens_seen": 163845536, + "step": 75845 + }, + { + "epoch": 12.37357259380098, + "grad_norm": 0.002305036410689354, + "learning_rate": 0.00038129188132910645, + "loss": 0.1235, + "num_input_tokens_seen": 163855968, + "step": 75850 + }, + { + "epoch": 12.374388254486133, + "grad_norm": 0.36928537487983704, + "learning_rate": 0.00038122273788202216, + "loss": 0.0149, + "num_input_tokens_seen": 163866496, + "step": 75855 + }, + { + "epoch": 12.375203915171289, + "grad_norm": 0.00295365322381258, + "learning_rate": 0.00038115359684210993, + "loss": 0.0068, + "num_input_tokens_seen": 163877184, + "step": 75860 + }, + { + "epoch": 12.376019575856443, + "grad_norm": 0.4139918386936188, + "learning_rate": 0.00038108445821077066, + "loss": 0.0076, + "num_input_tokens_seen": 163887200, + "step": 75865 + }, + { + "epoch": 12.376835236541599, + "grad_norm": 0.033299028873443604, + "learning_rate": 0.00038101532198940563, + "loss": 0.0251, + "num_input_tokens_seen": 163898208, + "step": 75870 + }, + { + "epoch": 12.377650897226754, + "grad_norm": 0.024843864142894745, + "learning_rate": 0.0003809461881794163, + "loss": 0.0106, + "num_input_tokens_seen": 163908512, + "step": 75875 + }, + { + "epoch": 12.378466557911908, + "grad_norm": 1.026667594909668, + "learning_rate": 0.0003808770567822033, + "loss": 0.0927, + "num_input_tokens_seen": 163918976, + "step": 75880 + }, + { + "epoch": 12.379282218597064, + "grad_norm": 0.01017333846539259, + "learning_rate": 0.000380807927799168, + "loss": 0.2334, + "num_input_tokens_seen": 163928544, + "step": 75885 + }, + { + "epoch": 12.380097879282218, + "grad_norm": 0.009438030421733856, + "learning_rate": 0.0003807388012317111, + "loss": 0.1196, + "num_input_tokens_seen": 163939072, + "step": 75890 + }, + { + "epoch": 12.380913539967374, + "grad_norm": 0.03808213025331497, + "learning_rate": 0.0003806696770812339, + "loss": 0.0146, + "num_input_tokens_seen": 163949216, + "step": 75895 + }, + { + "epoch": 12.38172920065253, + "grad_norm": 0.01764621213078499, + "learning_rate": 0.00038060055534913683, + "loss": 0.0278, + "num_input_tokens_seen": 163959744, + "step": 75900 + }, + { + "epoch": 12.382544861337683, + "grad_norm": 0.0389665849506855, + "learning_rate": 0.0003805314360368212, + "loss": 0.2302, + "num_input_tokens_seen": 163970144, + "step": 75905 + }, + { + "epoch": 12.383360522022839, + "grad_norm": 0.09590235352516174, + "learning_rate": 0.0003804623191456874, + "loss": 0.0173, + "num_input_tokens_seen": 163980832, + "step": 75910 + }, + { + "epoch": 12.384176182707993, + "grad_norm": 0.028626440092921257, + "learning_rate": 0.00038039320467713654, + "loss": 0.0128, + "num_input_tokens_seen": 163992032, + "step": 75915 + }, + { + "epoch": 12.384991843393149, + "grad_norm": 0.055960919708013535, + "learning_rate": 0.0003803240926325689, + "loss": 0.0064, + "num_input_tokens_seen": 164002368, + "step": 75920 + }, + { + "epoch": 12.385807504078304, + "grad_norm": 0.5099681615829468, + "learning_rate": 0.00038025498301338554, + "loss": 0.0107, + "num_input_tokens_seen": 164013472, + "step": 75925 + }, + { + "epoch": 12.386623164763458, + "grad_norm": 0.004505421034991741, + "learning_rate": 0.00038018587582098665, + "loss": 0.0025, + "num_input_tokens_seen": 164024224, + "step": 75930 + }, + { + "epoch": 12.387438825448614, + "grad_norm": 0.035571370273828506, + "learning_rate": 0.0003801167710567731, + "loss": 0.0085, + "num_input_tokens_seen": 164034880, + "step": 75935 + }, + { + "epoch": 12.388254486133768, + "grad_norm": 0.003777115372940898, + "learning_rate": 0.00038004766872214526, + "loss": 0.0035, + "num_input_tokens_seen": 164045632, + "step": 75940 + }, + { + "epoch": 12.389070146818923, + "grad_norm": 0.011398572474718094, + "learning_rate": 0.0003799785688185036, + "loss": 0.0615, + "num_input_tokens_seen": 164055648, + "step": 75945 + }, + { + "epoch": 12.38988580750408, + "grad_norm": 0.005105683580040932, + "learning_rate": 0.00037990947134724845, + "loss": 0.151, + "num_input_tokens_seen": 164066560, + "step": 75950 + }, + { + "epoch": 12.390701468189233, + "grad_norm": 0.07168906182050705, + "learning_rate": 0.00037984037630978026, + "loss": 0.0169, + "num_input_tokens_seen": 164076992, + "step": 75955 + }, + { + "epoch": 12.391517128874389, + "grad_norm": 0.20706669986248016, + "learning_rate": 0.00037977128370749916, + "loss": 0.0277, + "num_input_tokens_seen": 164086912, + "step": 75960 + }, + { + "epoch": 12.392332789559543, + "grad_norm": 0.38154295086860657, + "learning_rate": 0.00037970219354180573, + "loss": 0.0972, + "num_input_tokens_seen": 164097504, + "step": 75965 + }, + { + "epoch": 12.393148450244698, + "grad_norm": 0.04678434878587723, + "learning_rate": 0.0003796331058140997, + "loss": 0.0166, + "num_input_tokens_seen": 164107968, + "step": 75970 + }, + { + "epoch": 12.393964110929852, + "grad_norm": 0.0053862021304667, + "learning_rate": 0.00037956402052578164, + "loss": 0.0181, + "num_input_tokens_seen": 164118080, + "step": 75975 + }, + { + "epoch": 12.394779771615008, + "grad_norm": 0.0014402979286387563, + "learning_rate": 0.0003794949376782515, + "loss": 0.11, + "num_input_tokens_seen": 164129152, + "step": 75980 + }, + { + "epoch": 12.395595432300164, + "grad_norm": 0.003660842776298523, + "learning_rate": 0.00037942585727290926, + "loss": 0.0028, + "num_input_tokens_seen": 164139488, + "step": 75985 + }, + { + "epoch": 12.396411092985318, + "grad_norm": 0.00224318471737206, + "learning_rate": 0.000379356779311155, + "loss": 0.005, + "num_input_tokens_seen": 164151104, + "step": 75990 + }, + { + "epoch": 12.397226753670473, + "grad_norm": 0.08662576228380203, + "learning_rate": 0.0003792877037943886, + "loss": 0.0567, + "num_input_tokens_seen": 164161696, + "step": 75995 + }, + { + "epoch": 12.398042414355627, + "grad_norm": 0.026156943291425705, + "learning_rate": 0.0003792186307240102, + "loss": 0.0047, + "num_input_tokens_seen": 164172320, + "step": 76000 + }, + { + "epoch": 12.398858075040783, + "grad_norm": 0.4226818382740021, + "learning_rate": 0.0003791495601014192, + "loss": 0.0428, + "num_input_tokens_seen": 164183104, + "step": 76005 + }, + { + "epoch": 12.399673735725939, + "grad_norm": 0.014250795356929302, + "learning_rate": 0.00037908049192801596, + "loss": 0.0906, + "num_input_tokens_seen": 164195008, + "step": 76010 + }, + { + "epoch": 12.400489396411093, + "grad_norm": 0.3014688193798065, + "learning_rate": 0.00037901142620519967, + "loss": 0.0257, + "num_input_tokens_seen": 164206208, + "step": 76015 + }, + { + "epoch": 12.401305057096248, + "grad_norm": 0.008320405147969723, + "learning_rate": 0.00037894236293437055, + "loss": 0.0072, + "num_input_tokens_seen": 164216832, + "step": 76020 + }, + { + "epoch": 12.402120717781402, + "grad_norm": 0.06271061301231384, + "learning_rate": 0.00037887330211692783, + "loss": 0.0122, + "num_input_tokens_seen": 164227936, + "step": 76025 + }, + { + "epoch": 12.402936378466558, + "grad_norm": 0.1204233169555664, + "learning_rate": 0.00037880424375427154, + "loss": 0.0122, + "num_input_tokens_seen": 164238688, + "step": 76030 + }, + { + "epoch": 12.403752039151712, + "grad_norm": 0.008676442317664623, + "learning_rate": 0.00037873518784780074, + "loss": 0.0555, + "num_input_tokens_seen": 164249408, + "step": 76035 + }, + { + "epoch": 12.404567699836868, + "grad_norm": 0.019311709329485893, + "learning_rate": 0.0003786661343989154, + "loss": 0.0161, + "num_input_tokens_seen": 164260736, + "step": 76040 + }, + { + "epoch": 12.405383360522023, + "grad_norm": 0.0016197053482756019, + "learning_rate": 0.00037859708340901455, + "loss": 0.075, + "num_input_tokens_seen": 164270144, + "step": 76045 + }, + { + "epoch": 12.406199021207177, + "grad_norm": 0.0075719174928963184, + "learning_rate": 0.00037852803487949804, + "loss": 0.0167, + "num_input_tokens_seen": 164279936, + "step": 76050 + }, + { + "epoch": 12.407014681892333, + "grad_norm": 0.016589025035500526, + "learning_rate": 0.0003784589888117648, + "loss": 0.0467, + "num_input_tokens_seen": 164292704, + "step": 76055 + }, + { + "epoch": 12.407830342577487, + "grad_norm": 0.4371892213821411, + "learning_rate": 0.0003783899452072146, + "loss": 0.009, + "num_input_tokens_seen": 164303904, + "step": 76060 + }, + { + "epoch": 12.408646003262643, + "grad_norm": 0.0021243118681013584, + "learning_rate": 0.00037832090406724617, + "loss": 0.0039, + "num_input_tokens_seen": 164316000, + "step": 76065 + }, + { + "epoch": 12.409461663947798, + "grad_norm": 0.0714460015296936, + "learning_rate": 0.0003782518653932592, + "loss": 0.0351, + "num_input_tokens_seen": 164326848, + "step": 76070 + }, + { + "epoch": 12.410277324632952, + "grad_norm": 0.2750934660434723, + "learning_rate": 0.00037818282918665236, + "loss": 0.0179, + "num_input_tokens_seen": 164337792, + "step": 76075 + }, + { + "epoch": 12.411092985318108, + "grad_norm": 0.01962362602353096, + "learning_rate": 0.0003781137954488251, + "loss": 0.0042, + "num_input_tokens_seen": 164349856, + "step": 76080 + }, + { + "epoch": 12.411908646003262, + "grad_norm": 0.03165091201663017, + "learning_rate": 0.0003780447641811766, + "loss": 0.0208, + "num_input_tokens_seen": 164360672, + "step": 76085 + }, + { + "epoch": 12.412724306688418, + "grad_norm": 0.005478391423821449, + "learning_rate": 0.0003779757353851054, + "loss": 0.0128, + "num_input_tokens_seen": 164371392, + "step": 76090 + }, + { + "epoch": 12.413539967373573, + "grad_norm": 0.0063373674638569355, + "learning_rate": 0.000377906709062011, + "loss": 0.0862, + "num_input_tokens_seen": 164381664, + "step": 76095 + }, + { + "epoch": 12.414355628058727, + "grad_norm": 0.027820097282528877, + "learning_rate": 0.00037783768521329177, + "loss": 0.0071, + "num_input_tokens_seen": 164393184, + "step": 76100 + }, + { + "epoch": 12.415171288743883, + "grad_norm": 0.002753217238932848, + "learning_rate": 0.0003777686638403469, + "loss": 0.0465, + "num_input_tokens_seen": 164402048, + "step": 76105 + }, + { + "epoch": 12.415986949429037, + "grad_norm": 1.1631052494049072, + "learning_rate": 0.0003776996449445752, + "loss": 0.0841, + "num_input_tokens_seen": 164411904, + "step": 76110 + }, + { + "epoch": 12.416802610114193, + "grad_norm": 0.0034380650613456964, + "learning_rate": 0.0003776306285273753, + "loss": 0.0017, + "num_input_tokens_seen": 164421920, + "step": 76115 + }, + { + "epoch": 12.417618270799348, + "grad_norm": 0.04066552594304085, + "learning_rate": 0.0003775616145901459, + "loss": 0.0109, + "num_input_tokens_seen": 164432576, + "step": 76120 + }, + { + "epoch": 12.418433931484502, + "grad_norm": 0.0008301659254357219, + "learning_rate": 0.0003774926031342858, + "loss": 0.0066, + "num_input_tokens_seen": 164443936, + "step": 76125 + }, + { + "epoch": 12.419249592169658, + "grad_norm": 0.002140910131856799, + "learning_rate": 0.0003774235941611934, + "loss": 0.017, + "num_input_tokens_seen": 164456416, + "step": 76130 + }, + { + "epoch": 12.420065252854812, + "grad_norm": 0.05225667729973793, + "learning_rate": 0.0003773545876722675, + "loss": 0.0427, + "num_input_tokens_seen": 164467776, + "step": 76135 + }, + { + "epoch": 12.420880913539968, + "grad_norm": 0.004565094597637653, + "learning_rate": 0.00037728558366890633, + "loss": 0.0737, + "num_input_tokens_seen": 164478912, + "step": 76140 + }, + { + "epoch": 12.421696574225122, + "grad_norm": 0.013736764900386333, + "learning_rate": 0.00037721658215250864, + "loss": 0.0062, + "num_input_tokens_seen": 164489280, + "step": 76145 + }, + { + "epoch": 12.422512234910277, + "grad_norm": 0.0084912134334445, + "learning_rate": 0.00037714758312447247, + "loss": 0.0194, + "num_input_tokens_seen": 164498144, + "step": 76150 + }, + { + "epoch": 12.423327895595433, + "grad_norm": 0.0022617534268647432, + "learning_rate": 0.0003770785865861966, + "loss": 0.089, + "num_input_tokens_seen": 164510048, + "step": 76155 + }, + { + "epoch": 12.424143556280587, + "grad_norm": 0.02164643630385399, + "learning_rate": 0.0003770095925390789, + "loss": 0.0114, + "num_input_tokens_seen": 164521472, + "step": 76160 + }, + { + "epoch": 12.424959216965743, + "grad_norm": 0.00475015165284276, + "learning_rate": 0.000376940600984518, + "loss": 0.0049, + "num_input_tokens_seen": 164532320, + "step": 76165 + }, + { + "epoch": 12.425774877650896, + "grad_norm": 0.007536349352449179, + "learning_rate": 0.0003768716119239118, + "loss": 0.0071, + "num_input_tokens_seen": 164543296, + "step": 76170 + }, + { + "epoch": 12.426590538336052, + "grad_norm": 0.030248427763581276, + "learning_rate": 0.0003768026253586587, + "loss": 0.005, + "num_input_tokens_seen": 164554720, + "step": 76175 + }, + { + "epoch": 12.427406199021208, + "grad_norm": 0.07713694125413895, + "learning_rate": 0.00037673364129015653, + "loss": 0.0364, + "num_input_tokens_seen": 164566432, + "step": 76180 + }, + { + "epoch": 12.428221859706362, + "grad_norm": 0.11699513345956802, + "learning_rate": 0.0003766646597198037, + "loss": 0.0136, + "num_input_tokens_seen": 164576672, + "step": 76185 + }, + { + "epoch": 12.429037520391518, + "grad_norm": 0.02320980280637741, + "learning_rate": 0.0003765956806489978, + "loss": 0.0027, + "num_input_tokens_seen": 164588544, + "step": 76190 + }, + { + "epoch": 12.429853181076671, + "grad_norm": 0.010927310213446617, + "learning_rate": 0.00037652670407913697, + "loss": 0.019, + "num_input_tokens_seen": 164599136, + "step": 76195 + }, + { + "epoch": 12.430668841761827, + "grad_norm": 0.011162595823407173, + "learning_rate": 0.00037645773001161937, + "loss": 0.0913, + "num_input_tokens_seen": 164608672, + "step": 76200 + }, + { + "epoch": 12.431484502446983, + "grad_norm": 0.008266448974609375, + "learning_rate": 0.0003763887584478423, + "loss": 0.0099, + "num_input_tokens_seen": 164619168, + "step": 76205 + }, + { + "epoch": 12.432300163132137, + "grad_norm": 0.002109188586473465, + "learning_rate": 0.00037631978938920414, + "loss": 0.0707, + "num_input_tokens_seen": 164630016, + "step": 76210 + }, + { + "epoch": 12.433115823817293, + "grad_norm": 0.007610084023326635, + "learning_rate": 0.0003762508228371021, + "loss": 0.0051, + "num_input_tokens_seen": 164641152, + "step": 76215 + }, + { + "epoch": 12.433931484502446, + "grad_norm": 0.002785197226330638, + "learning_rate": 0.0003761818587929344, + "loss": 0.0041, + "num_input_tokens_seen": 164652672, + "step": 76220 + }, + { + "epoch": 12.434747145187602, + "grad_norm": 0.1258106827735901, + "learning_rate": 0.0003761128972580981, + "loss": 0.0801, + "num_input_tokens_seen": 164662848, + "step": 76225 + }, + { + "epoch": 12.435562805872756, + "grad_norm": 0.40725257992744446, + "learning_rate": 0.00037604393823399137, + "loss": 0.0898, + "num_input_tokens_seen": 164674208, + "step": 76230 + }, + { + "epoch": 12.436378466557912, + "grad_norm": 0.017194107174873352, + "learning_rate": 0.00037597498172201125, + "loss": 0.0443, + "num_input_tokens_seen": 164686272, + "step": 76235 + }, + { + "epoch": 12.437194127243067, + "grad_norm": 0.00854388065636158, + "learning_rate": 0.0003759060277235556, + "loss": 0.0677, + "num_input_tokens_seen": 164697536, + "step": 76240 + }, + { + "epoch": 12.438009787928221, + "grad_norm": 0.001409175805747509, + "learning_rate": 0.00037583707624002163, + "loss": 0.0644, + "num_input_tokens_seen": 164708512, + "step": 76245 + }, + { + "epoch": 12.438825448613377, + "grad_norm": 0.020490366965532303, + "learning_rate": 0.00037576812727280683, + "loss": 0.0046, + "num_input_tokens_seen": 164718688, + "step": 76250 + }, + { + "epoch": 12.439641109298531, + "grad_norm": 0.017789531499147415, + "learning_rate": 0.0003756991808233086, + "loss": 0.0046, + "num_input_tokens_seen": 164729344, + "step": 76255 + }, + { + "epoch": 12.440456769983687, + "grad_norm": 0.0018462835578247905, + "learning_rate": 0.0003756302368929241, + "loss": 0.1103, + "num_input_tokens_seen": 164740288, + "step": 76260 + }, + { + "epoch": 12.441272430668842, + "grad_norm": 0.007640472613275051, + "learning_rate": 0.00037556129548305074, + "loss": 0.0041, + "num_input_tokens_seen": 164751520, + "step": 76265 + }, + { + "epoch": 12.442088091353996, + "grad_norm": 0.002510238206014037, + "learning_rate": 0.0003754923565950855, + "loss": 0.0034, + "num_input_tokens_seen": 164762336, + "step": 76270 + }, + { + "epoch": 12.442903752039152, + "grad_norm": 0.0017224326729774475, + "learning_rate": 0.0003754234202304255, + "loss": 0.013, + "num_input_tokens_seen": 164773888, + "step": 76275 + }, + { + "epoch": 12.443719412724306, + "grad_norm": 0.0037370871286839247, + "learning_rate": 0.00037535448639046816, + "loss": 0.0064, + "num_input_tokens_seen": 164785408, + "step": 76280 + }, + { + "epoch": 12.444535073409462, + "grad_norm": 0.001963126938790083, + "learning_rate": 0.00037528555507661, + "loss": 0.0034, + "num_input_tokens_seen": 164797376, + "step": 76285 + }, + { + "epoch": 12.445350734094617, + "grad_norm": 0.0748407244682312, + "learning_rate": 0.00037521662629024855, + "loss": 0.0147, + "num_input_tokens_seen": 164808640, + "step": 76290 + }, + { + "epoch": 12.446166394779771, + "grad_norm": 0.0008259370806626976, + "learning_rate": 0.00037514770003278027, + "loss": 0.0489, + "num_input_tokens_seen": 164820672, + "step": 76295 + }, + { + "epoch": 12.446982055464927, + "grad_norm": 0.008207214064896107, + "learning_rate": 0.00037507877630560215, + "loss": 0.0158, + "num_input_tokens_seen": 164830912, + "step": 76300 + }, + { + "epoch": 12.447797716150081, + "grad_norm": 0.13782595098018646, + "learning_rate": 0.00037500985511011145, + "loss": 0.0231, + "num_input_tokens_seen": 164841216, + "step": 76305 + }, + { + "epoch": 12.448613376835237, + "grad_norm": 0.06296905130147934, + "learning_rate": 0.00037494093644770425, + "loss": 0.0214, + "num_input_tokens_seen": 164852160, + "step": 76310 + }, + { + "epoch": 12.449429037520392, + "grad_norm": 0.5310968160629272, + "learning_rate": 0.000374872020319778, + "loss": 0.0279, + "num_input_tokens_seen": 164862848, + "step": 76315 + }, + { + "epoch": 12.450244698205546, + "grad_norm": 0.009087975136935711, + "learning_rate": 0.0003748031067277286, + "loss": 0.034, + "num_input_tokens_seen": 164872896, + "step": 76320 + }, + { + "epoch": 12.451060358890702, + "grad_norm": 0.040168534964323044, + "learning_rate": 0.00037473419567295337, + "loss": 0.0878, + "num_input_tokens_seen": 164882816, + "step": 76325 + }, + { + "epoch": 12.451876019575856, + "grad_norm": 0.03108314424753189, + "learning_rate": 0.0003746652871568483, + "loss": 0.0094, + "num_input_tokens_seen": 164892800, + "step": 76330 + }, + { + "epoch": 12.452691680261012, + "grad_norm": 0.01985093019902706, + "learning_rate": 0.0003745963811808105, + "loss": 0.02, + "num_input_tokens_seen": 164903648, + "step": 76335 + }, + { + "epoch": 12.453507340946166, + "grad_norm": 0.019592365249991417, + "learning_rate": 0.00037452747774623584, + "loss": 0.0049, + "num_input_tokens_seen": 164914048, + "step": 76340 + }, + { + "epoch": 12.454323001631321, + "grad_norm": 0.015187375247478485, + "learning_rate": 0.0003744585768545212, + "loss": 0.1063, + "num_input_tokens_seen": 164924928, + "step": 76345 + }, + { + "epoch": 12.455138662316477, + "grad_norm": 0.009520821273326874, + "learning_rate": 0.00037438967850706264, + "loss": 0.0033, + "num_input_tokens_seen": 164935680, + "step": 76350 + }, + { + "epoch": 12.455954323001631, + "grad_norm": 0.12158270925283432, + "learning_rate": 0.0003743207827052567, + "loss": 0.0308, + "num_input_tokens_seen": 164946944, + "step": 76355 + }, + { + "epoch": 12.456769983686787, + "grad_norm": 0.003837285563349724, + "learning_rate": 0.0003742518894504994, + "loss": 0.0466, + "num_input_tokens_seen": 164958336, + "step": 76360 + }, + { + "epoch": 12.45758564437194, + "grad_norm": 0.06163405254483223, + "learning_rate": 0.00037418299874418726, + "loss": 0.0038, + "num_input_tokens_seen": 164971104, + "step": 76365 + }, + { + "epoch": 12.458401305057096, + "grad_norm": 0.003957700449973345, + "learning_rate": 0.00037411411058771606, + "loss": 0.0231, + "num_input_tokens_seen": 164982208, + "step": 76370 + }, + { + "epoch": 12.459216965742252, + "grad_norm": 0.019950520247220993, + "learning_rate": 0.00037404522498248234, + "loss": 0.0125, + "num_input_tokens_seen": 164994144, + "step": 76375 + }, + { + "epoch": 12.460032626427406, + "grad_norm": 0.014288338832557201, + "learning_rate": 0.0003739763419298817, + "loss": 0.0124, + "num_input_tokens_seen": 165004864, + "step": 76380 + }, + { + "epoch": 12.460848287112562, + "grad_norm": 0.012740354984998703, + "learning_rate": 0.0003739074614313105, + "loss": 0.0069, + "num_input_tokens_seen": 165016544, + "step": 76385 + }, + { + "epoch": 12.461663947797716, + "grad_norm": 0.0955539122223854, + "learning_rate": 0.00037383858348816445, + "loss": 0.0247, + "num_input_tokens_seen": 165027232, + "step": 76390 + }, + { + "epoch": 12.462479608482871, + "grad_norm": 0.5261601209640503, + "learning_rate": 0.0003737697081018396, + "loss": 0.2179, + "num_input_tokens_seen": 165038272, + "step": 76395 + }, + { + "epoch": 12.463295269168025, + "grad_norm": 0.3856097459793091, + "learning_rate": 0.0003737008352737318, + "loss": 0.0523, + "num_input_tokens_seen": 165047968, + "step": 76400 + }, + { + "epoch": 12.464110929853181, + "grad_norm": 0.0045360904186964035, + "learning_rate": 0.0003736319650052366, + "loss": 0.0119, + "num_input_tokens_seen": 165059872, + "step": 76405 + }, + { + "epoch": 12.464926590538337, + "grad_norm": 0.004846095573157072, + "learning_rate": 0.0003735630972977502, + "loss": 0.0064, + "num_input_tokens_seen": 165070912, + "step": 76410 + }, + { + "epoch": 12.46574225122349, + "grad_norm": 0.0615304559469223, + "learning_rate": 0.00037349423215266784, + "loss": 0.0149, + "num_input_tokens_seen": 165082208, + "step": 76415 + }, + { + "epoch": 12.466557911908646, + "grad_norm": 0.41746899485588074, + "learning_rate": 0.0003734253695713854, + "loss": 0.0213, + "num_input_tokens_seen": 165092096, + "step": 76420 + }, + { + "epoch": 12.4673735725938, + "grad_norm": 0.03316102921962738, + "learning_rate": 0.0003733565095552985, + "loss": 0.0053, + "num_input_tokens_seen": 165101472, + "step": 76425 + }, + { + "epoch": 12.468189233278956, + "grad_norm": 0.12689688801765442, + "learning_rate": 0.0003732876521058025, + "loss": 0.0144, + "num_input_tokens_seen": 165111168, + "step": 76430 + }, + { + "epoch": 12.469004893964112, + "grad_norm": 0.012960254214704037, + "learning_rate": 0.000373218797224293, + "loss": 0.0599, + "num_input_tokens_seen": 165121792, + "step": 76435 + }, + { + "epoch": 12.469820554649266, + "grad_norm": 0.008622650988399982, + "learning_rate": 0.00037314994491216547, + "loss": 0.0103, + "num_input_tokens_seen": 165132160, + "step": 76440 + }, + { + "epoch": 12.470636215334421, + "grad_norm": 0.0005445160204544663, + "learning_rate": 0.00037308109517081506, + "loss": 0.01, + "num_input_tokens_seen": 165143872, + "step": 76445 + }, + { + "epoch": 12.471451876019575, + "grad_norm": 0.0248736385256052, + "learning_rate": 0.0003730122480016375, + "loss": 0.0042, + "num_input_tokens_seen": 165154624, + "step": 76450 + }, + { + "epoch": 12.47226753670473, + "grad_norm": 0.5134662985801697, + "learning_rate": 0.00037294340340602764, + "loss": 0.1234, + "num_input_tokens_seen": 165164704, + "step": 76455 + }, + { + "epoch": 12.473083197389887, + "grad_norm": 0.005300603806972504, + "learning_rate": 0.0003728745613853811, + "loss": 0.0756, + "num_input_tokens_seen": 165176192, + "step": 76460 + }, + { + "epoch": 12.47389885807504, + "grad_norm": 0.09585113823413849, + "learning_rate": 0.00037280572194109255, + "loss": 0.0272, + "num_input_tokens_seen": 165186784, + "step": 76465 + }, + { + "epoch": 12.474714518760196, + "grad_norm": 0.020219076424837112, + "learning_rate": 0.00037273688507455773, + "loss": 0.0254, + "num_input_tokens_seen": 165198496, + "step": 76470 + }, + { + "epoch": 12.47553017944535, + "grad_norm": 0.11191736906766891, + "learning_rate": 0.00037266805078717106, + "loss": 0.0172, + "num_input_tokens_seen": 165209216, + "step": 76475 + }, + { + "epoch": 12.476345840130506, + "grad_norm": 0.009172594174742699, + "learning_rate": 0.00037259921908032814, + "loss": 0.0428, + "num_input_tokens_seen": 165220736, + "step": 76480 + }, + { + "epoch": 12.477161500815662, + "grad_norm": 0.003471218980848789, + "learning_rate": 0.0003725303899554234, + "loss": 0.0092, + "num_input_tokens_seen": 165231104, + "step": 76485 + }, + { + "epoch": 12.477977161500815, + "grad_norm": 0.3565479815006256, + "learning_rate": 0.00037246156341385234, + "loss": 0.0655, + "num_input_tokens_seen": 165241536, + "step": 76490 + }, + { + "epoch": 12.478792822185971, + "grad_norm": 0.003832635236904025, + "learning_rate": 0.0003723927394570092, + "loss": 0.1338, + "num_input_tokens_seen": 165253024, + "step": 76495 + }, + { + "epoch": 12.479608482871125, + "grad_norm": 0.02578004077076912, + "learning_rate": 0.0003723239180862893, + "loss": 0.1179, + "num_input_tokens_seen": 165263616, + "step": 76500 + }, + { + "epoch": 12.48042414355628, + "grad_norm": 0.010445257648825645, + "learning_rate": 0.00037225509930308696, + "loss": 0.0054, + "num_input_tokens_seen": 165275168, + "step": 76505 + }, + { + "epoch": 12.481239804241435, + "grad_norm": 0.02623414248228073, + "learning_rate": 0.0003721862831087971, + "loss": 0.0221, + "num_input_tokens_seen": 165286208, + "step": 76510 + }, + { + "epoch": 12.48205546492659, + "grad_norm": 0.003427008166909218, + "learning_rate": 0.0003721174695048145, + "loss": 0.0111, + "num_input_tokens_seen": 165296960, + "step": 76515 + }, + { + "epoch": 12.482871125611746, + "grad_norm": 0.036691464483737946, + "learning_rate": 0.0003720486584925335, + "loss": 0.0072, + "num_input_tokens_seen": 165308608, + "step": 76520 + }, + { + "epoch": 12.4836867862969, + "grad_norm": 0.02001815289258957, + "learning_rate": 0.0003719798500733489, + "loss": 0.0421, + "num_input_tokens_seen": 165318816, + "step": 76525 + }, + { + "epoch": 12.484502446982056, + "grad_norm": 0.5764277577400208, + "learning_rate": 0.00037191104424865487, + "loss": 0.1406, + "num_input_tokens_seen": 165330144, + "step": 76530 + }, + { + "epoch": 12.48531810766721, + "grad_norm": 0.11190090328454971, + "learning_rate": 0.0003718422410198462, + "loss": 0.0179, + "num_input_tokens_seen": 165340800, + "step": 76535 + }, + { + "epoch": 12.486133768352365, + "grad_norm": 0.013130726292729378, + "learning_rate": 0.0003717734403883169, + "loss": 0.0164, + "num_input_tokens_seen": 165351776, + "step": 76540 + }, + { + "epoch": 12.486949429037521, + "grad_norm": 1.2218176126480103, + "learning_rate": 0.0003717046423554617, + "loss": 0.0147, + "num_input_tokens_seen": 165363008, + "step": 76545 + }, + { + "epoch": 12.487765089722675, + "grad_norm": 0.017160970717668533, + "learning_rate": 0.0003716358469226745, + "loss": 0.0056, + "num_input_tokens_seen": 165373888, + "step": 76550 + }, + { + "epoch": 12.48858075040783, + "grad_norm": 0.06486137211322784, + "learning_rate": 0.0003715670540913499, + "loss": 0.01, + "num_input_tokens_seen": 165384928, + "step": 76555 + }, + { + "epoch": 12.489396411092985, + "grad_norm": 0.2934146523475647, + "learning_rate": 0.0003714982638628817, + "loss": 0.1709, + "num_input_tokens_seen": 165395616, + "step": 76560 + }, + { + "epoch": 12.49021207177814, + "grad_norm": 0.0009802387794479728, + "learning_rate": 0.00037142947623866417, + "loss": 0.0393, + "num_input_tokens_seen": 165405568, + "step": 76565 + }, + { + "epoch": 12.491027732463296, + "grad_norm": 0.28663870692253113, + "learning_rate": 0.0003713606912200915, + "loss": 0.084, + "num_input_tokens_seen": 165417216, + "step": 76570 + }, + { + "epoch": 12.49184339314845, + "grad_norm": 0.0640694722533226, + "learning_rate": 0.00037129190880855764, + "loss": 0.0058, + "num_input_tokens_seen": 165428608, + "step": 76575 + }, + { + "epoch": 12.492659053833606, + "grad_norm": 0.0026230604853481054, + "learning_rate": 0.00037122312900545644, + "loss": 0.0175, + "num_input_tokens_seen": 165440576, + "step": 76580 + }, + { + "epoch": 12.49347471451876, + "grad_norm": 0.0017055338248610497, + "learning_rate": 0.000371154351812182, + "loss": 0.0408, + "num_input_tokens_seen": 165450976, + "step": 76585 + }, + { + "epoch": 12.494290375203915, + "grad_norm": 0.14844626188278198, + "learning_rate": 0.0003710855772301279, + "loss": 0.0093, + "num_input_tokens_seen": 165463104, + "step": 76590 + }, + { + "epoch": 12.49510603588907, + "grad_norm": 0.0018314715707674623, + "learning_rate": 0.00037101680526068837, + "loss": 0.0151, + "num_input_tokens_seen": 165473376, + "step": 76595 + }, + { + "epoch": 12.495921696574225, + "grad_norm": 0.008383872918784618, + "learning_rate": 0.0003709480359052566, + "loss": 0.0945, + "num_input_tokens_seen": 165483840, + "step": 76600 + }, + { + "epoch": 12.49673735725938, + "grad_norm": 0.00485193869099021, + "learning_rate": 0.0003708792691652269, + "loss": 0.0051, + "num_input_tokens_seen": 165494944, + "step": 76605 + }, + { + "epoch": 12.497553017944535, + "grad_norm": 0.3109101355075836, + "learning_rate": 0.00037081050504199245, + "loss": 0.1019, + "num_input_tokens_seen": 165505792, + "step": 76610 + }, + { + "epoch": 12.49836867862969, + "grad_norm": 0.02356518618762493, + "learning_rate": 0.0003707417435369469, + "loss": 0.0127, + "num_input_tokens_seen": 165515808, + "step": 76615 + }, + { + "epoch": 12.499184339314844, + "grad_norm": 0.00426015630364418, + "learning_rate": 0.00037067298465148416, + "loss": 0.0076, + "num_input_tokens_seen": 165525920, + "step": 76620 + }, + { + "epoch": 12.5, + "grad_norm": 0.005503225605934858, + "learning_rate": 0.00037060422838699716, + "loss": 0.018, + "num_input_tokens_seen": 165536352, + "step": 76625 + }, + { + "epoch": 12.500815660685156, + "grad_norm": 0.010103323496878147, + "learning_rate": 0.0003705354747448799, + "loss": 0.0278, + "num_input_tokens_seen": 165546240, + "step": 76630 + }, + { + "epoch": 12.50163132137031, + "grad_norm": 0.02620244398713112, + "learning_rate": 0.00037046672372652523, + "loss": 0.0066, + "num_input_tokens_seen": 165556096, + "step": 76635 + }, + { + "epoch": 12.502446982055465, + "grad_norm": 0.02562333457171917, + "learning_rate": 0.00037039797533332697, + "loss": 0.0064, + "num_input_tokens_seen": 165567552, + "step": 76640 + }, + { + "epoch": 12.50326264274062, + "grad_norm": 0.010979540646076202, + "learning_rate": 0.000370329229566678, + "loss": 0.0037, + "num_input_tokens_seen": 165579008, + "step": 76645 + }, + { + "epoch": 12.504078303425775, + "grad_norm": 0.08632088452577591, + "learning_rate": 0.0003702604864279718, + "loss": 0.1055, + "num_input_tokens_seen": 165589216, + "step": 76650 + }, + { + "epoch": 12.50489396411093, + "grad_norm": 0.006022381596267223, + "learning_rate": 0.00037019174591860127, + "loss": 0.0139, + "num_input_tokens_seen": 165601056, + "step": 76655 + }, + { + "epoch": 12.505709624796085, + "grad_norm": 0.0006824670126661658, + "learning_rate": 0.0003701230080399599, + "loss": 0.0111, + "num_input_tokens_seen": 165610336, + "step": 76660 + }, + { + "epoch": 12.50652528548124, + "grad_norm": 0.13738100230693817, + "learning_rate": 0.00037005427279344027, + "loss": 0.0101, + "num_input_tokens_seen": 165620800, + "step": 76665 + }, + { + "epoch": 12.507340946166394, + "grad_norm": 0.0023835247848182917, + "learning_rate": 0.0003699855401804359, + "loss": 0.0068, + "num_input_tokens_seen": 165631488, + "step": 76670 + }, + { + "epoch": 12.50815660685155, + "grad_norm": 0.013245565816760063, + "learning_rate": 0.0003699168102023393, + "loss": 0.0028, + "num_input_tokens_seen": 165641760, + "step": 76675 + }, + { + "epoch": 12.508972267536706, + "grad_norm": 0.00538500864058733, + "learning_rate": 0.0003698480828605437, + "loss": 0.0095, + "num_input_tokens_seen": 165652256, + "step": 76680 + }, + { + "epoch": 12.50978792822186, + "grad_norm": 0.02130158618092537, + "learning_rate": 0.0003697793581564417, + "loss": 0.0321, + "num_input_tokens_seen": 165663712, + "step": 76685 + }, + { + "epoch": 12.510603588907015, + "grad_norm": 0.010575438849627972, + "learning_rate": 0.00036971063609142637, + "loss": 0.0047, + "num_input_tokens_seen": 165674432, + "step": 76690 + }, + { + "epoch": 12.51141924959217, + "grad_norm": 0.007054158952087164, + "learning_rate": 0.00036964191666689005, + "loss": 0.0054, + "num_input_tokens_seen": 165685280, + "step": 76695 + }, + { + "epoch": 12.512234910277325, + "grad_norm": 0.008702908642590046, + "learning_rate": 0.00036957319988422586, + "loss": 0.1665, + "num_input_tokens_seen": 165694368, + "step": 76700 + }, + { + "epoch": 12.513050570962479, + "grad_norm": 0.008559376932680607, + "learning_rate": 0.0003695044857448261, + "loss": 0.0048, + "num_input_tokens_seen": 165704768, + "step": 76705 + }, + { + "epoch": 12.513866231647635, + "grad_norm": 0.006825583055615425, + "learning_rate": 0.0003694357742500835, + "loss": 0.0079, + "num_input_tokens_seen": 165715296, + "step": 76710 + }, + { + "epoch": 12.51468189233279, + "grad_norm": 0.41185325384140015, + "learning_rate": 0.00036936706540139063, + "loss": 0.1374, + "num_input_tokens_seen": 165725376, + "step": 76715 + }, + { + "epoch": 12.515497553017944, + "grad_norm": 0.02468119002878666, + "learning_rate": 0.0003692983592001398, + "loss": 0.0171, + "num_input_tokens_seen": 165735648, + "step": 76720 + }, + { + "epoch": 12.5163132137031, + "grad_norm": 0.10309385508298874, + "learning_rate": 0.0003692296556477237, + "loss": 0.0533, + "num_input_tokens_seen": 165746464, + "step": 76725 + }, + { + "epoch": 12.517128874388254, + "grad_norm": 0.5680201053619385, + "learning_rate": 0.0003691609547455343, + "loss": 0.0943, + "num_input_tokens_seen": 165757216, + "step": 76730 + }, + { + "epoch": 12.51794453507341, + "grad_norm": 0.007687503471970558, + "learning_rate": 0.0003690922564949643, + "loss": 0.0071, + "num_input_tokens_seen": 165769440, + "step": 76735 + }, + { + "epoch": 12.518760195758565, + "grad_norm": 0.765103816986084, + "learning_rate": 0.0003690235608974057, + "loss": 0.1081, + "num_input_tokens_seen": 165781568, + "step": 76740 + }, + { + "epoch": 12.51957585644372, + "grad_norm": 0.17138712108135223, + "learning_rate": 0.0003689548679542508, + "loss": 0.0124, + "num_input_tokens_seen": 165790880, + "step": 76745 + }, + { + "epoch": 12.520391517128875, + "grad_norm": 0.007275717798620462, + "learning_rate": 0.0003688861776668918, + "loss": 0.0085, + "num_input_tokens_seen": 165802336, + "step": 76750 + }, + { + "epoch": 12.521207177814029, + "grad_norm": 0.004311291500926018, + "learning_rate": 0.0003688174900367207, + "loss": 0.0608, + "num_input_tokens_seen": 165812512, + "step": 76755 + }, + { + "epoch": 12.522022838499185, + "grad_norm": 0.02970127761363983, + "learning_rate": 0.00036874880506512954, + "loss": 0.0069, + "num_input_tokens_seen": 165823136, + "step": 76760 + }, + { + "epoch": 12.522838499184338, + "grad_norm": 0.016825757920742035, + "learning_rate": 0.0003686801227535105, + "loss": 0.0056, + "num_input_tokens_seen": 165833568, + "step": 76765 + }, + { + "epoch": 12.523654159869494, + "grad_norm": 0.0012513466645032167, + "learning_rate": 0.00036861144310325523, + "loss": 0.0039, + "num_input_tokens_seen": 165843136, + "step": 76770 + }, + { + "epoch": 12.52446982055465, + "grad_norm": 0.47699278593063354, + "learning_rate": 0.0003685427661157559, + "loss": 0.0367, + "num_input_tokens_seen": 165853920, + "step": 76775 + }, + { + "epoch": 12.525285481239804, + "grad_norm": 0.0014302321942523122, + "learning_rate": 0.00036847409179240396, + "loss": 0.0086, + "num_input_tokens_seen": 165865216, + "step": 76780 + }, + { + "epoch": 12.52610114192496, + "grad_norm": 0.09856893867254257, + "learning_rate": 0.00036840542013459154, + "loss": 0.0106, + "num_input_tokens_seen": 165875712, + "step": 76785 + }, + { + "epoch": 12.526916802610113, + "grad_norm": 0.03922778740525246, + "learning_rate": 0.00036833675114371014, + "loss": 0.0032, + "num_input_tokens_seen": 165887072, + "step": 76790 + }, + { + "epoch": 12.52773246329527, + "grad_norm": 0.74526447057724, + "learning_rate": 0.00036826808482115167, + "loss": 0.0451, + "num_input_tokens_seen": 165897792, + "step": 76795 + }, + { + "epoch": 12.528548123980425, + "grad_norm": 0.020173760131001472, + "learning_rate": 0.00036819942116830736, + "loss": 0.1448, + "num_input_tokens_seen": 165908160, + "step": 76800 + }, + { + "epoch": 12.529363784665579, + "grad_norm": 0.6142863631248474, + "learning_rate": 0.0003681307601865692, + "loss": 0.2382, + "num_input_tokens_seen": 165919008, + "step": 76805 + }, + { + "epoch": 12.530179445350734, + "grad_norm": 0.02296280302107334, + "learning_rate": 0.00036806210187732824, + "loss": 0.0068, + "num_input_tokens_seen": 165930880, + "step": 76810 + }, + { + "epoch": 12.530995106035888, + "grad_norm": 0.01190591137856245, + "learning_rate": 0.00036799344624197637, + "loss": 0.0398, + "num_input_tokens_seen": 165942432, + "step": 76815 + }, + { + "epoch": 12.531810766721044, + "grad_norm": 0.004319501109421253, + "learning_rate": 0.00036792479328190457, + "loss": 0.0023, + "num_input_tokens_seen": 165954304, + "step": 76820 + }, + { + "epoch": 12.5326264274062, + "grad_norm": 0.05026920139789581, + "learning_rate": 0.0003678561429985044, + "loss": 0.0073, + "num_input_tokens_seen": 165965024, + "step": 76825 + }, + { + "epoch": 12.533442088091354, + "grad_norm": 0.0011591935763135552, + "learning_rate": 0.00036778749539316736, + "loss": 0.0626, + "num_input_tokens_seen": 165976096, + "step": 76830 + }, + { + "epoch": 12.53425774877651, + "grad_norm": 0.006536852102726698, + "learning_rate": 0.00036771885046728417, + "loss": 0.0171, + "num_input_tokens_seen": 165987232, + "step": 76835 + }, + { + "epoch": 12.535073409461663, + "grad_norm": 0.04312235489487648, + "learning_rate": 0.00036765020822224654, + "loss": 0.0191, + "num_input_tokens_seen": 165996800, + "step": 76840 + }, + { + "epoch": 12.535889070146819, + "grad_norm": 0.01564362645149231, + "learning_rate": 0.0003675815686594451, + "loss": 0.0077, + "num_input_tokens_seen": 166007424, + "step": 76845 + }, + { + "epoch": 12.536704730831975, + "grad_norm": 0.02771487832069397, + "learning_rate": 0.00036751293178027144, + "loss": 0.0185, + "num_input_tokens_seen": 166018144, + "step": 76850 + }, + { + "epoch": 12.537520391517129, + "grad_norm": 0.015305979177355766, + "learning_rate": 0.000367444297586116, + "loss": 0.003, + "num_input_tokens_seen": 166028992, + "step": 76855 + }, + { + "epoch": 12.538336052202284, + "grad_norm": 0.4608621299266815, + "learning_rate": 0.0003673756660783703, + "loss": 0.1386, + "num_input_tokens_seen": 166039968, + "step": 76860 + }, + { + "epoch": 12.539151712887438, + "grad_norm": 0.016641966998577118, + "learning_rate": 0.00036730703725842474, + "loss": 0.0034, + "num_input_tokens_seen": 166051552, + "step": 76865 + }, + { + "epoch": 12.539967373572594, + "grad_norm": 0.004178878851234913, + "learning_rate": 0.0003672384111276705, + "loss": 0.0035, + "num_input_tokens_seen": 166062144, + "step": 76870 + }, + { + "epoch": 12.540783034257748, + "grad_norm": 0.004638664424419403, + "learning_rate": 0.0003671697876874982, + "loss": 0.1137, + "num_input_tokens_seen": 166073632, + "step": 76875 + }, + { + "epoch": 12.541598694942904, + "grad_norm": 0.03444049507379532, + "learning_rate": 0.00036710116693929875, + "loss": 0.0083, + "num_input_tokens_seen": 166083552, + "step": 76880 + }, + { + "epoch": 12.54241435562806, + "grad_norm": 0.03458646312355995, + "learning_rate": 0.0003670325488844627, + "loss": 0.0075, + "num_input_tokens_seen": 166093376, + "step": 76885 + }, + { + "epoch": 12.543230016313213, + "grad_norm": 0.008939512073993683, + "learning_rate": 0.00036696393352438083, + "loss": 0.0846, + "num_input_tokens_seen": 166104800, + "step": 76890 + }, + { + "epoch": 12.544045676998369, + "grad_norm": 0.030445056036114693, + "learning_rate": 0.0003668953208604435, + "loss": 0.0431, + "num_input_tokens_seen": 166115968, + "step": 76895 + }, + { + "epoch": 12.544861337683523, + "grad_norm": 0.015219416469335556, + "learning_rate": 0.0003668267108940414, + "loss": 0.0087, + "num_input_tokens_seen": 166125152, + "step": 76900 + }, + { + "epoch": 12.545676998368679, + "grad_norm": 0.05321419611573219, + "learning_rate": 0.00036675810362656486, + "loss": 0.0073, + "num_input_tokens_seen": 166136192, + "step": 76905 + }, + { + "epoch": 12.546492659053834, + "grad_norm": 0.008124981075525284, + "learning_rate": 0.00036668949905940455, + "loss": 0.0045, + "num_input_tokens_seen": 166144512, + "step": 76910 + }, + { + "epoch": 12.547308319738988, + "grad_norm": 0.03787637874484062, + "learning_rate": 0.0003666208971939505, + "loss": 0.1269, + "num_input_tokens_seen": 166155168, + "step": 76915 + }, + { + "epoch": 12.548123980424144, + "grad_norm": 0.09556692838668823, + "learning_rate": 0.0003665522980315933, + "loss": 0.0059, + "num_input_tokens_seen": 166164864, + "step": 76920 + }, + { + "epoch": 12.548939641109298, + "grad_norm": 0.008689331822097301, + "learning_rate": 0.0003664837015737229, + "loss": 0.0278, + "num_input_tokens_seen": 166174272, + "step": 76925 + }, + { + "epoch": 12.549755301794454, + "grad_norm": 0.01637548767030239, + "learning_rate": 0.00036641510782172993, + "loss": 0.0044, + "num_input_tokens_seen": 166185536, + "step": 76930 + }, + { + "epoch": 12.550570962479608, + "grad_norm": 0.011396531015634537, + "learning_rate": 0.0003663465167770039, + "loss": 0.1109, + "num_input_tokens_seen": 166194944, + "step": 76935 + }, + { + "epoch": 12.551386623164763, + "grad_norm": 0.005017245654016733, + "learning_rate": 0.00036627792844093544, + "loss": 0.0065, + "num_input_tokens_seen": 166207168, + "step": 76940 + }, + { + "epoch": 12.552202283849919, + "grad_norm": 0.0026462902314960957, + "learning_rate": 0.0003662093428149145, + "loss": 0.0174, + "num_input_tokens_seen": 166217792, + "step": 76945 + }, + { + "epoch": 12.553017944535073, + "grad_norm": 0.048384591937065125, + "learning_rate": 0.0003661407599003308, + "loss": 0.0765, + "num_input_tokens_seen": 166227616, + "step": 76950 + }, + { + "epoch": 12.553833605220229, + "grad_norm": 0.011623039841651917, + "learning_rate": 0.0003660721796985746, + "loss": 0.0046, + "num_input_tokens_seen": 166237504, + "step": 76955 + }, + { + "epoch": 12.554649265905383, + "grad_norm": 0.0399308018386364, + "learning_rate": 0.0003660036022110353, + "loss": 0.0044, + "num_input_tokens_seen": 166248480, + "step": 76960 + }, + { + "epoch": 12.555464926590538, + "grad_norm": 0.09246581792831421, + "learning_rate": 0.00036593502743910336, + "loss": 0.011, + "num_input_tokens_seen": 166259904, + "step": 76965 + }, + { + "epoch": 12.556280587275694, + "grad_norm": 0.005269172601401806, + "learning_rate": 0.00036586645538416783, + "loss": 0.0056, + "num_input_tokens_seen": 166271872, + "step": 76970 + }, + { + "epoch": 12.557096247960848, + "grad_norm": 0.015488158911466599, + "learning_rate": 0.00036579788604761896, + "loss": 0.0139, + "num_input_tokens_seen": 166283136, + "step": 76975 + }, + { + "epoch": 12.557911908646004, + "grad_norm": 0.6963528394699097, + "learning_rate": 0.000365729319430846, + "loss": 0.033, + "num_input_tokens_seen": 166293632, + "step": 76980 + }, + { + "epoch": 12.558727569331158, + "grad_norm": 0.051269035786390305, + "learning_rate": 0.00036566075553523894, + "loss": 0.0782, + "num_input_tokens_seen": 166304992, + "step": 76985 + }, + { + "epoch": 12.559543230016313, + "grad_norm": 0.01623843051493168, + "learning_rate": 0.0003655921943621868, + "loss": 0.0256, + "num_input_tokens_seen": 166314880, + "step": 76990 + }, + { + "epoch": 12.560358890701469, + "grad_norm": 0.003099554916843772, + "learning_rate": 0.0003655236359130796, + "loss": 0.1376, + "num_input_tokens_seen": 166325536, + "step": 76995 + }, + { + "epoch": 12.561174551386623, + "grad_norm": 0.27658185362815857, + "learning_rate": 0.0003654550801893063, + "loss": 0.015, + "num_input_tokens_seen": 166336096, + "step": 77000 + }, + { + "epoch": 12.561990212071779, + "grad_norm": 0.012075589969754219, + "learning_rate": 0.00036538652719225674, + "loss": 0.0092, + "num_input_tokens_seen": 166345984, + "step": 77005 + }, + { + "epoch": 12.562805872756933, + "grad_norm": 0.016627484932541847, + "learning_rate": 0.0003653179769233197, + "loss": 0.0353, + "num_input_tokens_seen": 166357152, + "step": 77010 + }, + { + "epoch": 12.563621533442088, + "grad_norm": 0.009096194058656693, + "learning_rate": 0.00036524942938388495, + "loss": 0.0075, + "num_input_tokens_seen": 166368736, + "step": 77015 + }, + { + "epoch": 12.564437194127244, + "grad_norm": 0.0031699403189122677, + "learning_rate": 0.00036518088457534125, + "loss": 0.0148, + "num_input_tokens_seen": 166379264, + "step": 77020 + }, + { + "epoch": 12.565252854812398, + "grad_norm": 0.020030856132507324, + "learning_rate": 0.0003651123424990781, + "loss": 0.0806, + "num_input_tokens_seen": 166391072, + "step": 77025 + }, + { + "epoch": 12.566068515497554, + "grad_norm": 0.022214405238628387, + "learning_rate": 0.00036504380315648447, + "loss": 0.0081, + "num_input_tokens_seen": 166402304, + "step": 77030 + }, + { + "epoch": 12.566884176182707, + "grad_norm": 0.07444703578948975, + "learning_rate": 0.0003649752665489492, + "loss": 0.0088, + "num_input_tokens_seen": 166412736, + "step": 77035 + }, + { + "epoch": 12.567699836867863, + "grad_norm": 0.013522377237677574, + "learning_rate": 0.00036490673267786154, + "loss": 0.007, + "num_input_tokens_seen": 166423936, + "step": 77040 + }, + { + "epoch": 12.568515497553017, + "grad_norm": 0.00476167444139719, + "learning_rate": 0.0003648382015446103, + "loss": 0.0456, + "num_input_tokens_seen": 166433472, + "step": 77045 + }, + { + "epoch": 12.569331158238173, + "grad_norm": 0.009685852564871311, + "learning_rate": 0.0003647696731505844, + "loss": 0.0379, + "num_input_tokens_seen": 166443328, + "step": 77050 + }, + { + "epoch": 12.570146818923329, + "grad_norm": 0.009335564449429512, + "learning_rate": 0.00036470114749717267, + "loss": 0.0074, + "num_input_tokens_seen": 166453792, + "step": 77055 + }, + { + "epoch": 12.570962479608482, + "grad_norm": 0.022270025685429573, + "learning_rate": 0.00036463262458576374, + "loss": 0.014, + "num_input_tokens_seen": 166464896, + "step": 77060 + }, + { + "epoch": 12.571778140293638, + "grad_norm": 0.24031361937522888, + "learning_rate": 0.0003645641044177465, + "loss": 0.0119, + "num_input_tokens_seen": 166475904, + "step": 77065 + }, + { + "epoch": 12.572593800978792, + "grad_norm": 0.0068599446676671505, + "learning_rate": 0.00036449558699450937, + "loss": 0.0112, + "num_input_tokens_seen": 166486880, + "step": 77070 + }, + { + "epoch": 12.573409461663948, + "grad_norm": 0.5030407309532166, + "learning_rate": 0.0003644270723174411, + "loss": 0.1388, + "num_input_tokens_seen": 166498944, + "step": 77075 + }, + { + "epoch": 12.574225122349104, + "grad_norm": 0.5042504668235779, + "learning_rate": 0.0003643585603879303, + "loss": 0.137, + "num_input_tokens_seen": 166510112, + "step": 77080 + }, + { + "epoch": 12.575040783034257, + "grad_norm": 0.020460493862628937, + "learning_rate": 0.0003642900512073652, + "loss": 0.0093, + "num_input_tokens_seen": 166520736, + "step": 77085 + }, + { + "epoch": 12.575856443719413, + "grad_norm": 0.12157510221004486, + "learning_rate": 0.00036422154477713456, + "loss": 0.0101, + "num_input_tokens_seen": 166532192, + "step": 77090 + }, + { + "epoch": 12.576672104404567, + "grad_norm": 0.009250016883015633, + "learning_rate": 0.00036415304109862633, + "loss": 0.0459, + "num_input_tokens_seen": 166542688, + "step": 77095 + }, + { + "epoch": 12.577487765089723, + "grad_norm": 0.004898452199995518, + "learning_rate": 0.0003640845401732293, + "loss": 0.016, + "num_input_tokens_seen": 166553824, + "step": 77100 + }, + { + "epoch": 12.578303425774878, + "grad_norm": 0.08203618228435516, + "learning_rate": 0.0003640160420023313, + "loss": 0.0757, + "num_input_tokens_seen": 166564512, + "step": 77105 + }, + { + "epoch": 12.579119086460032, + "grad_norm": 0.015118278563022614, + "learning_rate": 0.00036394754658732086, + "loss": 0.0056, + "num_input_tokens_seen": 166575616, + "step": 77110 + }, + { + "epoch": 12.579934747145188, + "grad_norm": 0.010293328203260899, + "learning_rate": 0.00036387905392958574, + "loss": 0.0167, + "num_input_tokens_seen": 166586848, + "step": 77115 + }, + { + "epoch": 12.580750407830342, + "grad_norm": 0.03571222350001335, + "learning_rate": 0.0003638105640305146, + "loss": 0.0101, + "num_input_tokens_seen": 166597568, + "step": 77120 + }, + { + "epoch": 12.581566068515498, + "grad_norm": 0.011700263246893883, + "learning_rate": 0.00036374207689149487, + "loss": 0.0066, + "num_input_tokens_seen": 166608448, + "step": 77125 + }, + { + "epoch": 12.582381729200652, + "grad_norm": 0.1275133490562439, + "learning_rate": 0.00036367359251391506, + "loss": 0.0099, + "num_input_tokens_seen": 166618912, + "step": 77130 + }, + { + "epoch": 12.583197389885807, + "grad_norm": 0.10448265820741653, + "learning_rate": 0.0003636051108991626, + "loss": 0.0071, + "num_input_tokens_seen": 166628960, + "step": 77135 + }, + { + "epoch": 12.584013050570963, + "grad_norm": 0.5431002378463745, + "learning_rate": 0.0003635366320486258, + "loss": 0.0921, + "num_input_tokens_seen": 166638496, + "step": 77140 + }, + { + "epoch": 12.584828711256117, + "grad_norm": 0.0023790623527020216, + "learning_rate": 0.0003634681559636921, + "loss": 0.0262, + "num_input_tokens_seen": 166650464, + "step": 77145 + }, + { + "epoch": 12.585644371941273, + "grad_norm": 0.04713069275021553, + "learning_rate": 0.0003633996826457494, + "loss": 0.0044, + "num_input_tokens_seen": 166660384, + "step": 77150 + }, + { + "epoch": 12.586460032626427, + "grad_norm": 0.14566659927368164, + "learning_rate": 0.0003633312120961856, + "loss": 0.0214, + "num_input_tokens_seen": 166670848, + "step": 77155 + }, + { + "epoch": 12.587275693311582, + "grad_norm": 0.005130293779075146, + "learning_rate": 0.000363262744316388, + "loss": 0.014, + "num_input_tokens_seen": 166681216, + "step": 77160 + }, + { + "epoch": 12.588091353996738, + "grad_norm": 0.006175138521939516, + "learning_rate": 0.00036319427930774453, + "loss": 0.042, + "num_input_tokens_seen": 166691680, + "step": 77165 + }, + { + "epoch": 12.588907014681892, + "grad_norm": 0.004177759867161512, + "learning_rate": 0.0003631258170716423, + "loss": 0.0052, + "num_input_tokens_seen": 166701024, + "step": 77170 + }, + { + "epoch": 12.589722675367048, + "grad_norm": 0.0710231363773346, + "learning_rate": 0.0003630573576094693, + "loss": 0.0449, + "num_input_tokens_seen": 166711296, + "step": 77175 + }, + { + "epoch": 12.590538336052202, + "grad_norm": 0.1315511018037796, + "learning_rate": 0.0003629889009226124, + "loss": 0.0168, + "num_input_tokens_seen": 166722368, + "step": 77180 + }, + { + "epoch": 12.591353996737357, + "grad_norm": 0.6337395310401917, + "learning_rate": 0.0003629204470124595, + "loss": 0.0962, + "num_input_tokens_seen": 166733696, + "step": 77185 + }, + { + "epoch": 12.592169657422513, + "grad_norm": 0.0053985025733709335, + "learning_rate": 0.00036285199588039743, + "loss": 0.0026, + "num_input_tokens_seen": 166743456, + "step": 77190 + }, + { + "epoch": 12.592985318107667, + "grad_norm": 0.011733738705515862, + "learning_rate": 0.0003627835475278137, + "loss": 0.0389, + "num_input_tokens_seen": 166754528, + "step": 77195 + }, + { + "epoch": 12.593800978792823, + "grad_norm": 0.438748836517334, + "learning_rate": 0.0003627151019560955, + "loss": 0.0514, + "num_input_tokens_seen": 166766912, + "step": 77200 + }, + { + "epoch": 12.594616639477977, + "grad_norm": 0.4337851405143738, + "learning_rate": 0.00036264665916662986, + "loss": 0.0634, + "num_input_tokens_seen": 166778464, + "step": 77205 + }, + { + "epoch": 12.595432300163132, + "grad_norm": 0.15691335499286652, + "learning_rate": 0.000362578219160804, + "loss": 0.0409, + "num_input_tokens_seen": 166789760, + "step": 77210 + }, + { + "epoch": 12.596247960848288, + "grad_norm": 0.0065701864659786224, + "learning_rate": 0.0003625097819400048, + "loss": 0.0162, + "num_input_tokens_seen": 166799648, + "step": 77215 + }, + { + "epoch": 12.597063621533442, + "grad_norm": 0.010911048389971256, + "learning_rate": 0.0003624413475056192, + "loss": 0.0109, + "num_input_tokens_seen": 166810304, + "step": 77220 + }, + { + "epoch": 12.597879282218598, + "grad_norm": 0.011135376058518887, + "learning_rate": 0.00036237291585903436, + "loss": 0.0046, + "num_input_tokens_seen": 166820608, + "step": 77225 + }, + { + "epoch": 12.598694942903752, + "grad_norm": 0.019748615100979805, + "learning_rate": 0.0003623044870016368, + "loss": 0.0054, + "num_input_tokens_seen": 166831616, + "step": 77230 + }, + { + "epoch": 12.599510603588907, + "grad_norm": 0.009777350351214409, + "learning_rate": 0.0003622360609348138, + "loss": 0.0934, + "num_input_tokens_seen": 166842240, + "step": 77235 + }, + { + "epoch": 12.600326264274061, + "grad_norm": 0.0011331519344821572, + "learning_rate": 0.0003621676376599514, + "loss": 0.0041, + "num_input_tokens_seen": 166853984, + "step": 77240 + }, + { + "epoch": 12.601141924959217, + "grad_norm": 0.005403982475399971, + "learning_rate": 0.00036209921717843697, + "loss": 0.0959, + "num_input_tokens_seen": 166863712, + "step": 77245 + }, + { + "epoch": 12.601957585644373, + "grad_norm": 0.11000215262174606, + "learning_rate": 0.00036203079949165664, + "loss": 0.0111, + "num_input_tokens_seen": 166874592, + "step": 77250 + }, + { + "epoch": 12.602773246329527, + "grad_norm": 0.009174146689474583, + "learning_rate": 0.00036196238460099717, + "loss": 0.005, + "num_input_tokens_seen": 166883136, + "step": 77255 + }, + { + "epoch": 12.603588907014682, + "grad_norm": 0.02310585230588913, + "learning_rate": 0.0003618939725078453, + "loss": 0.0731, + "num_input_tokens_seen": 166893600, + "step": 77260 + }, + { + "epoch": 12.604404567699836, + "grad_norm": 0.005659396760165691, + "learning_rate": 0.0003618255632135871, + "loss": 0.0124, + "num_input_tokens_seen": 166906016, + "step": 77265 + }, + { + "epoch": 12.605220228384992, + "grad_norm": 0.011715354397892952, + "learning_rate": 0.00036175715671960934, + "loss": 0.0704, + "num_input_tokens_seen": 166917216, + "step": 77270 + }, + { + "epoch": 12.606035889070148, + "grad_norm": 0.003530156798660755, + "learning_rate": 0.000361688753027298, + "loss": 0.0026, + "num_input_tokens_seen": 166927584, + "step": 77275 + }, + { + "epoch": 12.606851549755302, + "grad_norm": 0.010420121252536774, + "learning_rate": 0.0003616203521380397, + "loss": 0.005, + "num_input_tokens_seen": 166937824, + "step": 77280 + }, + { + "epoch": 12.607667210440457, + "grad_norm": 0.004818596411496401, + "learning_rate": 0.00036155195405322026, + "loss": 0.0489, + "num_input_tokens_seen": 166948480, + "step": 77285 + }, + { + "epoch": 12.608482871125611, + "grad_norm": 0.010222864337265491, + "learning_rate": 0.0003614835587742264, + "loss": 0.0506, + "num_input_tokens_seen": 166958848, + "step": 77290 + }, + { + "epoch": 12.609298531810767, + "grad_norm": 0.5348839163780212, + "learning_rate": 0.0003614151663024436, + "loss": 0.0773, + "num_input_tokens_seen": 166969344, + "step": 77295 + }, + { + "epoch": 12.61011419249592, + "grad_norm": 0.04770367220044136, + "learning_rate": 0.0003613467766392586, + "loss": 0.0079, + "num_input_tokens_seen": 166980832, + "step": 77300 + }, + { + "epoch": 12.610929853181077, + "grad_norm": 0.009626589715480804, + "learning_rate": 0.00036127838978605687, + "loss": 0.0305, + "num_input_tokens_seen": 166989920, + "step": 77305 + }, + { + "epoch": 12.611745513866232, + "grad_norm": 0.021754087880253792, + "learning_rate": 0.0003612100057442247, + "loss": 0.0289, + "num_input_tokens_seen": 167000320, + "step": 77310 + }, + { + "epoch": 12.612561174551386, + "grad_norm": 0.009066320955753326, + "learning_rate": 0.00036114162451514765, + "loss": 0.0025, + "num_input_tokens_seen": 167011584, + "step": 77315 + }, + { + "epoch": 12.613376835236542, + "grad_norm": 0.0031363347079604864, + "learning_rate": 0.000361073246100212, + "loss": 0.0947, + "num_input_tokens_seen": 167022784, + "step": 77320 + }, + { + "epoch": 12.614192495921696, + "grad_norm": 0.10713813453912735, + "learning_rate": 0.0003610048705008029, + "loss": 0.0119, + "num_input_tokens_seen": 167033952, + "step": 77325 + }, + { + "epoch": 12.615008156606851, + "grad_norm": 0.03944196179509163, + "learning_rate": 0.00036093649771830674, + "loss": 0.0402, + "num_input_tokens_seen": 167044768, + "step": 77330 + }, + { + "epoch": 12.615823817292007, + "grad_norm": 0.001964397495612502, + "learning_rate": 0.0003608681277541086, + "loss": 0.0072, + "num_input_tokens_seen": 167055776, + "step": 77335 + }, + { + "epoch": 12.616639477977161, + "grad_norm": 0.11983584612607956, + "learning_rate": 0.00036079976060959454, + "loss": 0.0115, + "num_input_tokens_seen": 167067328, + "step": 77340 + }, + { + "epoch": 12.617455138662317, + "grad_norm": 0.0034255923237651587, + "learning_rate": 0.0003607313962861499, + "loss": 0.0671, + "num_input_tokens_seen": 167078432, + "step": 77345 + }, + { + "epoch": 12.61827079934747, + "grad_norm": 0.007270899601280689, + "learning_rate": 0.00036066303478516016, + "loss": 0.0279, + "num_input_tokens_seen": 167089472, + "step": 77350 + }, + { + "epoch": 12.619086460032626, + "grad_norm": 0.002617582445964217, + "learning_rate": 0.0003605946761080108, + "loss": 0.0071, + "num_input_tokens_seen": 167100768, + "step": 77355 + }, + { + "epoch": 12.619902120717782, + "grad_norm": 0.4317784905433655, + "learning_rate": 0.000360526320256087, + "loss": 0.0725, + "num_input_tokens_seen": 167112416, + "step": 77360 + }, + { + "epoch": 12.620717781402936, + "grad_norm": 0.04355085641145706, + "learning_rate": 0.0003604579672307744, + "loss": 0.0157, + "num_input_tokens_seen": 167123296, + "step": 77365 + }, + { + "epoch": 12.621533442088092, + "grad_norm": 0.05431177094578743, + "learning_rate": 0.00036038961703345815, + "loss": 0.0818, + "num_input_tokens_seen": 167133184, + "step": 77370 + }, + { + "epoch": 12.622349102773246, + "grad_norm": 0.010911340825259686, + "learning_rate": 0.00036032126966552335, + "loss": 0.1086, + "num_input_tokens_seen": 167143744, + "step": 77375 + }, + { + "epoch": 12.623164763458401, + "grad_norm": 0.04832053557038307, + "learning_rate": 0.0003602529251283553, + "loss": 0.0061, + "num_input_tokens_seen": 167154688, + "step": 77380 + }, + { + "epoch": 12.623980424143557, + "grad_norm": 0.005749577656388283, + "learning_rate": 0.000360184583423339, + "loss": 0.0085, + "num_input_tokens_seen": 167166368, + "step": 77385 + }, + { + "epoch": 12.624796084828711, + "grad_norm": 0.3927803337574005, + "learning_rate": 0.0003601162445518593, + "loss": 0.0297, + "num_input_tokens_seen": 167178080, + "step": 77390 + }, + { + "epoch": 12.625611745513867, + "grad_norm": 0.0049598063342273235, + "learning_rate": 0.0003600479085153017, + "loss": 0.0896, + "num_input_tokens_seen": 167189216, + "step": 77395 + }, + { + "epoch": 12.62642740619902, + "grad_norm": 0.042065005749464035, + "learning_rate": 0.00035997957531505045, + "loss": 0.0098, + "num_input_tokens_seen": 167200320, + "step": 77400 + }, + { + "epoch": 12.627243066884176, + "grad_norm": 0.001665329560637474, + "learning_rate": 0.00035991124495249094, + "loss": 0.0057, + "num_input_tokens_seen": 167210912, + "step": 77405 + }, + { + "epoch": 12.62805872756933, + "grad_norm": 0.0032141683623194695, + "learning_rate": 0.0003598429174290076, + "loss": 0.0129, + "num_input_tokens_seen": 167223296, + "step": 77410 + }, + { + "epoch": 12.628874388254486, + "grad_norm": 0.02148437313735485, + "learning_rate": 0.0003597745927459856, + "loss": 0.0345, + "num_input_tokens_seen": 167235232, + "step": 77415 + }, + { + "epoch": 12.629690048939642, + "grad_norm": 0.08603756129741669, + "learning_rate": 0.00035970627090480906, + "loss": 0.0382, + "num_input_tokens_seen": 167244928, + "step": 77420 + }, + { + "epoch": 12.630505709624796, + "grad_norm": 0.03180314227938652, + "learning_rate": 0.0003596379519068632, + "loss": 0.0403, + "num_input_tokens_seen": 167254464, + "step": 77425 + }, + { + "epoch": 12.631321370309951, + "grad_norm": 0.3329426944255829, + "learning_rate": 0.000359569635753532, + "loss": 0.0766, + "num_input_tokens_seen": 167265984, + "step": 77430 + }, + { + "epoch": 12.632137030995105, + "grad_norm": 0.0028075268492102623, + "learning_rate": 0.00035950132244620057, + "loss": 0.003, + "num_input_tokens_seen": 167275968, + "step": 77435 + }, + { + "epoch": 12.632952691680261, + "grad_norm": 0.011716500855982304, + "learning_rate": 0.0003594330119862529, + "loss": 0.0101, + "num_input_tokens_seen": 167286016, + "step": 77440 + }, + { + "epoch": 12.633768352365417, + "grad_norm": 0.40379977226257324, + "learning_rate": 0.00035936470437507366, + "loss": 0.0451, + "num_input_tokens_seen": 167296448, + "step": 77445 + }, + { + "epoch": 12.63458401305057, + "grad_norm": 0.0061897290870547295, + "learning_rate": 0.000359296399614047, + "loss": 0.0198, + "num_input_tokens_seen": 167306240, + "step": 77450 + }, + { + "epoch": 12.635399673735726, + "grad_norm": 0.0006506768404506147, + "learning_rate": 0.00035922809770455745, + "loss": 0.016, + "num_input_tokens_seen": 167317088, + "step": 77455 + }, + { + "epoch": 12.63621533442088, + "grad_norm": 0.5576114058494568, + "learning_rate": 0.00035915979864798884, + "loss": 0.0658, + "num_input_tokens_seen": 167328160, + "step": 77460 + }, + { + "epoch": 12.637030995106036, + "grad_norm": 0.44090718030929565, + "learning_rate": 0.0003590915024457256, + "loss": 0.1749, + "num_input_tokens_seen": 167338496, + "step": 77465 + }, + { + "epoch": 12.63784665579119, + "grad_norm": 0.0012273893225938082, + "learning_rate": 0.0003590232090991521, + "loss": 0.0547, + "num_input_tokens_seen": 167348544, + "step": 77470 + }, + { + "epoch": 12.638662316476346, + "grad_norm": 0.04741557314991951, + "learning_rate": 0.0003589549186096518, + "loss": 0.0062, + "num_input_tokens_seen": 167357568, + "step": 77475 + }, + { + "epoch": 12.639477977161501, + "grad_norm": 0.025142524391412735, + "learning_rate": 0.0003588866309786093, + "loss": 0.0201, + "num_input_tokens_seen": 167367520, + "step": 77480 + }, + { + "epoch": 12.640293637846655, + "grad_norm": 1.77473783493042, + "learning_rate": 0.00035881834620740796, + "loss": 0.1292, + "num_input_tokens_seen": 167377952, + "step": 77485 + }, + { + "epoch": 12.641109298531811, + "grad_norm": 0.009403154253959656, + "learning_rate": 0.0003587500642974322, + "loss": 0.0039, + "num_input_tokens_seen": 167389024, + "step": 77490 + }, + { + "epoch": 12.641924959216965, + "grad_norm": 0.04819444566965103, + "learning_rate": 0.0003586817852500653, + "loss": 0.0079, + "num_input_tokens_seen": 167399744, + "step": 77495 + }, + { + "epoch": 12.64274061990212, + "grad_norm": 0.006756368558853865, + "learning_rate": 0.00035861350906669156, + "loss": 0.003, + "num_input_tokens_seen": 167410976, + "step": 77500 + }, + { + "epoch": 12.643556280587276, + "grad_norm": 0.04680448770523071, + "learning_rate": 0.00035854523574869416, + "loss": 0.02, + "num_input_tokens_seen": 167423072, + "step": 77505 + }, + { + "epoch": 12.64437194127243, + "grad_norm": 0.0036935280077159405, + "learning_rate": 0.00035847696529745714, + "loss": 0.0657, + "num_input_tokens_seen": 167433056, + "step": 77510 + }, + { + "epoch": 12.645187601957586, + "grad_norm": 0.04204836115241051, + "learning_rate": 0.000358408697714364, + "loss": 0.0172, + "num_input_tokens_seen": 167444832, + "step": 77515 + }, + { + "epoch": 12.64600326264274, + "grad_norm": 0.003103738185018301, + "learning_rate": 0.0003583404330007981, + "loss": 0.0087, + "num_input_tokens_seen": 167456032, + "step": 77520 + }, + { + "epoch": 12.646818923327896, + "grad_norm": 0.31443697214126587, + "learning_rate": 0.00035827217115814313, + "loss": 0.059, + "num_input_tokens_seen": 167465856, + "step": 77525 + }, + { + "epoch": 12.647634584013051, + "grad_norm": 0.6064665913581848, + "learning_rate": 0.0003582039121877824, + "loss": 0.046, + "num_input_tokens_seen": 167477600, + "step": 77530 + }, + { + "epoch": 12.648450244698205, + "grad_norm": 0.007075733970850706, + "learning_rate": 0.0003581356560910992, + "loss": 0.0107, + "num_input_tokens_seen": 167488640, + "step": 77535 + }, + { + "epoch": 12.649265905383361, + "grad_norm": 0.5475460886955261, + "learning_rate": 0.00035806740286947704, + "loss": 0.039, + "num_input_tokens_seen": 167498816, + "step": 77540 + }, + { + "epoch": 12.650081566068515, + "grad_norm": 0.003381013870239258, + "learning_rate": 0.0003579991525242988, + "loss": 0.0022, + "num_input_tokens_seen": 167510080, + "step": 77545 + }, + { + "epoch": 12.65089722675367, + "grad_norm": 0.050899162888526917, + "learning_rate": 0.0003579309050569481, + "loss": 0.0374, + "num_input_tokens_seen": 167520576, + "step": 77550 + }, + { + "epoch": 12.651712887438826, + "grad_norm": 0.02303638495504856, + "learning_rate": 0.00035786266046880765, + "loss": 0.0506, + "num_input_tokens_seen": 167532160, + "step": 77555 + }, + { + "epoch": 12.65252854812398, + "grad_norm": 0.006254466250538826, + "learning_rate": 0.0003577944187612609, + "loss": 0.1495, + "num_input_tokens_seen": 167542816, + "step": 77560 + }, + { + "epoch": 12.653344208809136, + "grad_norm": 0.00909637100994587, + "learning_rate": 0.0003577261799356905, + "loss": 0.0073, + "num_input_tokens_seen": 167553184, + "step": 77565 + }, + { + "epoch": 12.65415986949429, + "grad_norm": 0.002966930391266942, + "learning_rate": 0.0003576579439934796, + "loss": 0.0031, + "num_input_tokens_seen": 167564672, + "step": 77570 + }, + { + "epoch": 12.654975530179446, + "grad_norm": 0.0037646342534571886, + "learning_rate": 0.000357589710936011, + "loss": 0.1096, + "num_input_tokens_seen": 167576128, + "step": 77575 + }, + { + "epoch": 12.655791190864601, + "grad_norm": 0.001021684962324798, + "learning_rate": 0.0003575214807646675, + "loss": 0.2004, + "num_input_tokens_seen": 167586752, + "step": 77580 + }, + { + "epoch": 12.656606851549755, + "grad_norm": 0.008952321484684944, + "learning_rate": 0.0003574532534808321, + "loss": 0.0051, + "num_input_tokens_seen": 167598048, + "step": 77585 + }, + { + "epoch": 12.65742251223491, + "grad_norm": 0.04184262827038765, + "learning_rate": 0.00035738502908588723, + "loss": 0.0467, + "num_input_tokens_seen": 167608832, + "step": 77590 + }, + { + "epoch": 12.658238172920065, + "grad_norm": 0.02576330676674843, + "learning_rate": 0.0003573168075812158, + "loss": 0.0055, + "num_input_tokens_seen": 167618880, + "step": 77595 + }, + { + "epoch": 12.65905383360522, + "grad_norm": 0.04942226782441139, + "learning_rate": 0.0003572485889682001, + "loss": 0.0174, + "num_input_tokens_seen": 167628928, + "step": 77600 + }, + { + "epoch": 12.659869494290374, + "grad_norm": 0.2924436330795288, + "learning_rate": 0.00035718037324822304, + "loss": 0.027, + "num_input_tokens_seen": 167639296, + "step": 77605 + }, + { + "epoch": 12.66068515497553, + "grad_norm": 0.01681629940867424, + "learning_rate": 0.0003571121604226667, + "loss": 0.0147, + "num_input_tokens_seen": 167648320, + "step": 77610 + }, + { + "epoch": 12.661500815660686, + "grad_norm": 0.024903155863285065, + "learning_rate": 0.0003570439504929139, + "loss": 0.0567, + "num_input_tokens_seen": 167658464, + "step": 77615 + }, + { + "epoch": 12.66231647634584, + "grad_norm": 0.027922337874770164, + "learning_rate": 0.00035697574346034655, + "loss": 0.0291, + "num_input_tokens_seen": 167669696, + "step": 77620 + }, + { + "epoch": 12.663132137030995, + "grad_norm": 0.010534364730119705, + "learning_rate": 0.0003569075393263475, + "loss": 0.0069, + "num_input_tokens_seen": 167680960, + "step": 77625 + }, + { + "epoch": 12.66394779771615, + "grad_norm": 0.0009661827934905887, + "learning_rate": 0.0003568393380922984, + "loss": 0.0019, + "num_input_tokens_seen": 167691520, + "step": 77630 + }, + { + "epoch": 12.664763458401305, + "grad_norm": 0.008585717529058456, + "learning_rate": 0.0003567711397595819, + "loss": 0.0564, + "num_input_tokens_seen": 167703456, + "step": 77635 + }, + { + "epoch": 12.66557911908646, + "grad_norm": 0.0037884414196014404, + "learning_rate": 0.00035670294432957984, + "loss": 0.0206, + "num_input_tokens_seen": 167713664, + "step": 77640 + }, + { + "epoch": 12.666394779771615, + "grad_norm": 0.05994252860546112, + "learning_rate": 0.00035663475180367453, + "loss": 0.0627, + "num_input_tokens_seen": 167723456, + "step": 77645 + }, + { + "epoch": 12.66721044045677, + "grad_norm": 0.9670343399047852, + "learning_rate": 0.00035656656218324765, + "loss": 0.0979, + "num_input_tokens_seen": 167734144, + "step": 77650 + }, + { + "epoch": 12.668026101141924, + "grad_norm": 0.0011733782012015581, + "learning_rate": 0.0003564983754696815, + "loss": 0.0062, + "num_input_tokens_seen": 167743584, + "step": 77655 + }, + { + "epoch": 12.66884176182708, + "grad_norm": 0.0013517803745344281, + "learning_rate": 0.00035643019166435775, + "loss": 0.0083, + "num_input_tokens_seen": 167753440, + "step": 77660 + }, + { + "epoch": 12.669657422512234, + "grad_norm": 0.03769034519791603, + "learning_rate": 0.00035636201076865836, + "loss": 0.1392, + "num_input_tokens_seen": 167765216, + "step": 77665 + }, + { + "epoch": 12.67047308319739, + "grad_norm": 0.0010428469395264983, + "learning_rate": 0.000356293832783965, + "loss": 0.0037, + "num_input_tokens_seen": 167776256, + "step": 77670 + }, + { + "epoch": 12.671288743882545, + "grad_norm": 0.193328395485878, + "learning_rate": 0.0003562256577116595, + "loss": 0.0556, + "num_input_tokens_seen": 167786432, + "step": 77675 + }, + { + "epoch": 12.6721044045677, + "grad_norm": 0.0028232985641807318, + "learning_rate": 0.0003561574855531232, + "loss": 0.0014, + "num_input_tokens_seen": 167796256, + "step": 77680 + }, + { + "epoch": 12.672920065252855, + "grad_norm": 0.12443973869085312, + "learning_rate": 0.00035608931630973814, + "loss": 0.0076, + "num_input_tokens_seen": 167806752, + "step": 77685 + }, + { + "epoch": 12.673735725938009, + "grad_norm": 0.07046890258789062, + "learning_rate": 0.0003560211499828856, + "loss": 0.0945, + "num_input_tokens_seen": 167818336, + "step": 77690 + }, + { + "epoch": 12.674551386623165, + "grad_norm": 0.011465424671769142, + "learning_rate": 0.00035595298657394714, + "loss": 0.0076, + "num_input_tokens_seen": 167829312, + "step": 77695 + }, + { + "epoch": 12.67536704730832, + "grad_norm": 0.010278213769197464, + "learning_rate": 0.0003558848260843041, + "loss": 0.0059, + "num_input_tokens_seen": 167839744, + "step": 77700 + }, + { + "epoch": 12.676182707993474, + "grad_norm": 0.007338542491197586, + "learning_rate": 0.00035581666851533777, + "loss": 0.0155, + "num_input_tokens_seen": 167849696, + "step": 77705 + }, + { + "epoch": 12.67699836867863, + "grad_norm": 0.003327986691147089, + "learning_rate": 0.0003557485138684299, + "loss": 0.0147, + "num_input_tokens_seen": 167860800, + "step": 77710 + }, + { + "epoch": 12.677814029363784, + "grad_norm": 0.025251364335417747, + "learning_rate": 0.00035568036214496103, + "loss": 0.0088, + "num_input_tokens_seen": 167870560, + "step": 77715 + }, + { + "epoch": 12.67862969004894, + "grad_norm": 0.34143656492233276, + "learning_rate": 0.000355612213346313, + "loss": 0.1471, + "num_input_tokens_seen": 167881600, + "step": 77720 + }, + { + "epoch": 12.679445350734095, + "grad_norm": 0.03525270149111748, + "learning_rate": 0.00035554406747386635, + "loss": 0.0164, + "num_input_tokens_seen": 167892064, + "step": 77725 + }, + { + "epoch": 12.68026101141925, + "grad_norm": 0.05492442101240158, + "learning_rate": 0.0003554759245290027, + "loss": 0.0066, + "num_input_tokens_seen": 167902848, + "step": 77730 + }, + { + "epoch": 12.681076672104405, + "grad_norm": 0.3540668785572052, + "learning_rate": 0.0003554077845131025, + "loss": 0.0098, + "num_input_tokens_seen": 167914272, + "step": 77735 + }, + { + "epoch": 12.681892332789559, + "grad_norm": 0.044319842010736465, + "learning_rate": 0.0003553396474275473, + "loss": 0.0057, + "num_input_tokens_seen": 167925696, + "step": 77740 + }, + { + "epoch": 12.682707993474715, + "grad_norm": 0.052253205329179764, + "learning_rate": 0.00035527151327371736, + "loss": 0.0575, + "num_input_tokens_seen": 167936160, + "step": 77745 + }, + { + "epoch": 12.68352365415987, + "grad_norm": 0.005088796839118004, + "learning_rate": 0.00035520338205299407, + "loss": 0.0105, + "num_input_tokens_seen": 167946112, + "step": 77750 + }, + { + "epoch": 12.684339314845024, + "grad_norm": 0.024718090891838074, + "learning_rate": 0.0003551352537667577, + "loss": 0.0045, + "num_input_tokens_seen": 167955328, + "step": 77755 + }, + { + "epoch": 12.68515497553018, + "grad_norm": 0.03583148866891861, + "learning_rate": 0.0003550671284163894, + "loss": 0.0063, + "num_input_tokens_seen": 167966688, + "step": 77760 + }, + { + "epoch": 12.685970636215334, + "grad_norm": 0.02463572286069393, + "learning_rate": 0.00035499900600326933, + "loss": 0.0075, + "num_input_tokens_seen": 167977984, + "step": 77765 + }, + { + "epoch": 12.68678629690049, + "grad_norm": 0.006799501832574606, + "learning_rate": 0.00035493088652877866, + "loss": 0.0064, + "num_input_tokens_seen": 167987680, + "step": 77770 + }, + { + "epoch": 12.687601957585644, + "grad_norm": 0.022389927878975868, + "learning_rate": 0.00035486276999429733, + "loss": 0.0193, + "num_input_tokens_seen": 167997984, + "step": 77775 + }, + { + "epoch": 12.6884176182708, + "grad_norm": 0.0026521605905145407, + "learning_rate": 0.00035479465640120636, + "loss": 0.0131, + "num_input_tokens_seen": 168007648, + "step": 77780 + }, + { + "epoch": 12.689233278955955, + "grad_norm": 0.008616507053375244, + "learning_rate": 0.0003547265457508856, + "loss": 0.0114, + "num_input_tokens_seen": 168019168, + "step": 77785 + }, + { + "epoch": 12.690048939641109, + "grad_norm": 0.003639386035501957, + "learning_rate": 0.0003546584380447157, + "loss": 0.0025, + "num_input_tokens_seen": 168030816, + "step": 77790 + }, + { + "epoch": 12.690864600326265, + "grad_norm": 0.051106810569763184, + "learning_rate": 0.0003545903332840772, + "loss": 0.0131, + "num_input_tokens_seen": 168041952, + "step": 77795 + }, + { + "epoch": 12.691680261011419, + "grad_norm": 0.06359019875526428, + "learning_rate": 0.0003545222314703498, + "loss": 0.0036, + "num_input_tokens_seen": 168052544, + "step": 77800 + }, + { + "epoch": 12.692495921696574, + "grad_norm": 0.016655128449201584, + "learning_rate": 0.0003544541326049141, + "loss": 0.1497, + "num_input_tokens_seen": 168063648, + "step": 77805 + }, + { + "epoch": 12.69331158238173, + "grad_norm": 0.17468668520450592, + "learning_rate": 0.0003543860366891499, + "loss": 0.0094, + "num_input_tokens_seen": 168075520, + "step": 77810 + }, + { + "epoch": 12.694127243066884, + "grad_norm": 0.0019930857233703136, + "learning_rate": 0.0003543179437244376, + "loss": 0.0045, + "num_input_tokens_seen": 168085888, + "step": 77815 + }, + { + "epoch": 12.69494290375204, + "grad_norm": 0.007549921050667763, + "learning_rate": 0.0003542498537121567, + "loss": 0.0125, + "num_input_tokens_seen": 168096064, + "step": 77820 + }, + { + "epoch": 12.695758564437194, + "grad_norm": 0.17789390683174133, + "learning_rate": 0.0003541817666536876, + "loss": 0.0103, + "num_input_tokens_seen": 168106624, + "step": 77825 + }, + { + "epoch": 12.69657422512235, + "grad_norm": 0.029698913916945457, + "learning_rate": 0.00035411368255040994, + "loss": 0.1498, + "num_input_tokens_seen": 168120064, + "step": 77830 + }, + { + "epoch": 12.697389885807503, + "grad_norm": 0.19213663041591644, + "learning_rate": 0.0003540456014037036, + "loss": 0.0545, + "num_input_tokens_seen": 168130624, + "step": 77835 + }, + { + "epoch": 12.698205546492659, + "grad_norm": 0.0012613199651241302, + "learning_rate": 0.00035397752321494826, + "loss": 0.0064, + "num_input_tokens_seen": 168141344, + "step": 77840 + }, + { + "epoch": 12.699021207177815, + "grad_norm": 0.0015342944534495473, + "learning_rate": 0.0003539094479855237, + "loss": 0.0181, + "num_input_tokens_seen": 168152192, + "step": 77845 + }, + { + "epoch": 12.699836867862969, + "grad_norm": 0.4590141475200653, + "learning_rate": 0.00035384137571680936, + "loss": 0.2501, + "num_input_tokens_seen": 168162496, + "step": 77850 + }, + { + "epoch": 12.700652528548124, + "grad_norm": 0.009294010698795319, + "learning_rate": 0.0003537733064101852, + "loss": 0.0372, + "num_input_tokens_seen": 168173984, + "step": 77855 + }, + { + "epoch": 12.701468189233278, + "grad_norm": 0.008999227546155453, + "learning_rate": 0.0003537052400670303, + "loss": 0.1625, + "num_input_tokens_seen": 168183328, + "step": 77860 + }, + { + "epoch": 12.702283849918434, + "grad_norm": 0.28203514218330383, + "learning_rate": 0.00035363717668872443, + "loss": 0.0765, + "num_input_tokens_seen": 168195680, + "step": 77865 + }, + { + "epoch": 12.70309951060359, + "grad_norm": 0.0068134767934679985, + "learning_rate": 0.00035356911627664665, + "loss": 0.0082, + "num_input_tokens_seen": 168206304, + "step": 77870 + }, + { + "epoch": 12.703915171288743, + "grad_norm": 0.01844414509832859, + "learning_rate": 0.00035350105883217675, + "loss": 0.0139, + "num_input_tokens_seen": 168217248, + "step": 77875 + }, + { + "epoch": 12.7047308319739, + "grad_norm": 0.0019640587270259857, + "learning_rate": 0.00035343300435669356, + "loss": 0.0108, + "num_input_tokens_seen": 168227744, + "step": 77880 + }, + { + "epoch": 12.705546492659053, + "grad_norm": 0.3847804069519043, + "learning_rate": 0.0003533649528515766, + "loss": 0.0297, + "num_input_tokens_seen": 168238304, + "step": 77885 + }, + { + "epoch": 12.706362153344209, + "grad_norm": 0.005796543322503567, + "learning_rate": 0.0003532969043182047, + "loss": 0.0055, + "num_input_tokens_seen": 168250688, + "step": 77890 + }, + { + "epoch": 12.707177814029365, + "grad_norm": 0.014010935090482235, + "learning_rate": 0.0003532288587579572, + "loss": 0.0135, + "num_input_tokens_seen": 168260960, + "step": 77895 + }, + { + "epoch": 12.707993474714518, + "grad_norm": 0.008485809899866581, + "learning_rate": 0.0003531608161722132, + "loss": 0.0027, + "num_input_tokens_seen": 168270656, + "step": 77900 + }, + { + "epoch": 12.708809135399674, + "grad_norm": 0.05919577181339264, + "learning_rate": 0.00035309277656235137, + "loss": 0.0103, + "num_input_tokens_seen": 168281632, + "step": 77905 + }, + { + "epoch": 12.709624796084828, + "grad_norm": 0.03997613489627838, + "learning_rate": 0.000353024739929751, + "loss": 0.0545, + "num_input_tokens_seen": 168293216, + "step": 77910 + }, + { + "epoch": 12.710440456769984, + "grad_norm": 0.007604612503200769, + "learning_rate": 0.0003529567062757905, + "loss": 0.0073, + "num_input_tokens_seen": 168304032, + "step": 77915 + }, + { + "epoch": 12.71125611745514, + "grad_norm": 0.010337037965655327, + "learning_rate": 0.0003528886756018491, + "loss": 0.0211, + "num_input_tokens_seen": 168314624, + "step": 77920 + }, + { + "epoch": 12.712071778140293, + "grad_norm": 0.0031499990727752447, + "learning_rate": 0.0003528206479093051, + "loss": 0.02, + "num_input_tokens_seen": 168324320, + "step": 77925 + }, + { + "epoch": 12.71288743882545, + "grad_norm": 0.014163109473884106, + "learning_rate": 0.0003527526231995376, + "loss": 0.0178, + "num_input_tokens_seen": 168336128, + "step": 77930 + }, + { + "epoch": 12.713703099510603, + "grad_norm": 0.020187662914395332, + "learning_rate": 0.0003526846014739248, + "loss": 0.0122, + "num_input_tokens_seen": 168344992, + "step": 77935 + }, + { + "epoch": 12.714518760195759, + "grad_norm": 0.04705316200852394, + "learning_rate": 0.00035261658273384554, + "loss": 0.0071, + "num_input_tokens_seen": 168356128, + "step": 77940 + }, + { + "epoch": 12.715334420880914, + "grad_norm": 0.0028371878433972597, + "learning_rate": 0.00035254856698067806, + "loss": 0.0061, + "num_input_tokens_seen": 168366784, + "step": 77945 + }, + { + "epoch": 12.716150081566068, + "grad_norm": 0.003474497003480792, + "learning_rate": 0.00035248055421580114, + "loss": 0.0051, + "num_input_tokens_seen": 168379552, + "step": 77950 + }, + { + "epoch": 12.716965742251224, + "grad_norm": 0.004116491414606571, + "learning_rate": 0.0003524125444405928, + "loss": 0.1206, + "num_input_tokens_seen": 168390592, + "step": 77955 + }, + { + "epoch": 12.717781402936378, + "grad_norm": 0.0031350203789770603, + "learning_rate": 0.00035234453765643146, + "loss": 0.0111, + "num_input_tokens_seen": 168401600, + "step": 77960 + }, + { + "epoch": 12.718597063621534, + "grad_norm": 0.41057583689689636, + "learning_rate": 0.0003522765338646954, + "loss": 0.1253, + "num_input_tokens_seen": 168412448, + "step": 77965 + }, + { + "epoch": 12.719412724306688, + "grad_norm": 0.42899951338768005, + "learning_rate": 0.00035220853306676284, + "loss": 0.0739, + "num_input_tokens_seen": 168423328, + "step": 77970 + }, + { + "epoch": 12.720228384991843, + "grad_norm": 0.7427116632461548, + "learning_rate": 0.0003521405352640118, + "loss": 0.0192, + "num_input_tokens_seen": 168434272, + "step": 77975 + }, + { + "epoch": 12.721044045676999, + "grad_norm": 0.059706129133701324, + "learning_rate": 0.00035207254045782036, + "loss": 0.0624, + "num_input_tokens_seen": 168445664, + "step": 77980 + }, + { + "epoch": 12.721859706362153, + "grad_norm": 0.0043745641596615314, + "learning_rate": 0.00035200454864956653, + "loss": 0.0217, + "num_input_tokens_seen": 168456928, + "step": 77985 + }, + { + "epoch": 12.722675367047309, + "grad_norm": 0.055647846311330795, + "learning_rate": 0.00035193655984062835, + "loss": 0.1082, + "num_input_tokens_seen": 168467904, + "step": 77990 + }, + { + "epoch": 12.723491027732463, + "grad_norm": 0.023597707971930504, + "learning_rate": 0.0003518685740323835, + "loss": 0.0056, + "num_input_tokens_seen": 168479104, + "step": 77995 + }, + { + "epoch": 12.724306688417618, + "grad_norm": 1.1642639636993408, + "learning_rate": 0.00035180059122621, + "loss": 0.0656, + "num_input_tokens_seen": 168490016, + "step": 78000 + }, + { + "epoch": 12.725122349102774, + "grad_norm": 0.025501245632767677, + "learning_rate": 0.0003517326114234855, + "loss": 0.0024, + "num_input_tokens_seen": 168501088, + "step": 78005 + }, + { + "epoch": 12.725938009787928, + "grad_norm": 0.3803076446056366, + "learning_rate": 0.0003516646346255877, + "loss": 0.1273, + "num_input_tokens_seen": 168511392, + "step": 78010 + }, + { + "epoch": 12.726753670473084, + "grad_norm": 0.00820028968155384, + "learning_rate": 0.00035159666083389436, + "loss": 0.0344, + "num_input_tokens_seen": 168523328, + "step": 78015 + }, + { + "epoch": 12.727569331158238, + "grad_norm": 0.0002793922321870923, + "learning_rate": 0.00035152869004978276, + "loss": 0.0373, + "num_input_tokens_seen": 168535040, + "step": 78020 + }, + { + "epoch": 12.728384991843393, + "grad_norm": 0.0012598390458151698, + "learning_rate": 0.0003514607222746309, + "loss": 0.0196, + "num_input_tokens_seen": 168545760, + "step": 78025 + }, + { + "epoch": 12.729200652528547, + "grad_norm": 0.5319569110870361, + "learning_rate": 0.0003513927575098156, + "loss": 0.0419, + "num_input_tokens_seen": 168556032, + "step": 78030 + }, + { + "epoch": 12.730016313213703, + "grad_norm": 0.01204077061265707, + "learning_rate": 0.0003513247957567149, + "loss": 0.0406, + "num_input_tokens_seen": 168567552, + "step": 78035 + }, + { + "epoch": 12.730831973898859, + "grad_norm": 0.021115312352776527, + "learning_rate": 0.0003512568370167055, + "loss": 0.0037, + "num_input_tokens_seen": 168579904, + "step": 78040 + }, + { + "epoch": 12.731647634584013, + "grad_norm": 0.012967637740075588, + "learning_rate": 0.0003511888812911653, + "loss": 0.008, + "num_input_tokens_seen": 168590560, + "step": 78045 + }, + { + "epoch": 12.732463295269168, + "grad_norm": 0.003758589504286647, + "learning_rate": 0.00035112092858147106, + "loss": 0.012, + "num_input_tokens_seen": 168601504, + "step": 78050 + }, + { + "epoch": 12.733278955954322, + "grad_norm": 0.09647537022829056, + "learning_rate": 0.0003510529788890001, + "loss": 0.0249, + "num_input_tokens_seen": 168612672, + "step": 78055 + }, + { + "epoch": 12.734094616639478, + "grad_norm": 0.020022863522171974, + "learning_rate": 0.0003509850322151294, + "loss": 0.0072, + "num_input_tokens_seen": 168623968, + "step": 78060 + }, + { + "epoch": 12.734910277324634, + "grad_norm": 0.0031233022455126047, + "learning_rate": 0.0003509170885612362, + "loss": 0.0143, + "num_input_tokens_seen": 168635712, + "step": 78065 + }, + { + "epoch": 12.735725938009788, + "grad_norm": 0.0440434105694294, + "learning_rate": 0.00035084914792869715, + "loss": 0.0723, + "num_input_tokens_seen": 168646176, + "step": 78070 + }, + { + "epoch": 12.736541598694943, + "grad_norm": 0.004339613951742649, + "learning_rate": 0.0003507812103188895, + "loss": 0.0337, + "num_input_tokens_seen": 168657408, + "step": 78075 + }, + { + "epoch": 12.737357259380097, + "grad_norm": 0.4139990508556366, + "learning_rate": 0.0003507132757331898, + "loss": 0.0269, + "num_input_tokens_seen": 168668672, + "step": 78080 + }, + { + "epoch": 12.738172920065253, + "grad_norm": 0.21979013085365295, + "learning_rate": 0.00035064534417297513, + "loss": 0.0137, + "num_input_tokens_seen": 168679648, + "step": 78085 + }, + { + "epoch": 12.738988580750409, + "grad_norm": 0.03787407651543617, + "learning_rate": 0.00035057741563962176, + "loss": 0.0078, + "num_input_tokens_seen": 168692480, + "step": 78090 + }, + { + "epoch": 12.739804241435563, + "grad_norm": 0.01006343774497509, + "learning_rate": 0.00035050949013450686, + "loss": 0.0379, + "num_input_tokens_seen": 168702048, + "step": 78095 + }, + { + "epoch": 12.740619902120718, + "grad_norm": 0.006184085737913847, + "learning_rate": 0.0003504415676590066, + "loss": 0.0029, + "num_input_tokens_seen": 168713216, + "step": 78100 + }, + { + "epoch": 12.741435562805872, + "grad_norm": 1.6347105503082275, + "learning_rate": 0.00035037364821449766, + "loss": 0.2852, + "num_input_tokens_seen": 168723712, + "step": 78105 + }, + { + "epoch": 12.742251223491028, + "grad_norm": 0.00397314690053463, + "learning_rate": 0.0003503057318023568, + "loss": 0.0059, + "num_input_tokens_seen": 168734752, + "step": 78110 + }, + { + "epoch": 12.743066884176184, + "grad_norm": 0.002033184515312314, + "learning_rate": 0.00035023781842395994, + "loss": 0.0047, + "num_input_tokens_seen": 168744480, + "step": 78115 + }, + { + "epoch": 12.743882544861338, + "grad_norm": 0.02932196483016014, + "learning_rate": 0.0003501699080806839, + "loss": 0.0091, + "num_input_tokens_seen": 168754624, + "step": 78120 + }, + { + "epoch": 12.744698205546493, + "grad_norm": 0.5784959197044373, + "learning_rate": 0.0003501020007739045, + "loss": 0.1011, + "num_input_tokens_seen": 168765920, + "step": 78125 + }, + { + "epoch": 12.745513866231647, + "grad_norm": 0.3741350769996643, + "learning_rate": 0.0003500340965049984, + "loss": 0.1566, + "num_input_tokens_seen": 168776000, + "step": 78130 + }, + { + "epoch": 12.746329526916803, + "grad_norm": 0.002900507999584079, + "learning_rate": 0.00034996619527534153, + "loss": 0.0061, + "num_input_tokens_seen": 168787200, + "step": 78135 + }, + { + "epoch": 12.747145187601957, + "grad_norm": 0.3536345064640045, + "learning_rate": 0.00034989829708631005, + "loss": 0.0596, + "num_input_tokens_seen": 168798464, + "step": 78140 + }, + { + "epoch": 12.747960848287113, + "grad_norm": 0.008484461344778538, + "learning_rate": 0.00034983040193927996, + "loss": 0.0407, + "num_input_tokens_seen": 168809536, + "step": 78145 + }, + { + "epoch": 12.748776508972268, + "grad_norm": 0.006402932107448578, + "learning_rate": 0.0003497625098356273, + "loss": 0.0148, + "num_input_tokens_seen": 168820512, + "step": 78150 + }, + { + "epoch": 12.749592169657422, + "grad_norm": 0.11143842339515686, + "learning_rate": 0.00034969462077672793, + "loss": 0.0108, + "num_input_tokens_seen": 168830752, + "step": 78155 + }, + { + "epoch": 12.750407830342578, + "grad_norm": 0.00579224806278944, + "learning_rate": 0.0003496267347639579, + "loss": 0.0696, + "num_input_tokens_seen": 168841920, + "step": 78160 + }, + { + "epoch": 12.751223491027732, + "grad_norm": 0.002410769695416093, + "learning_rate": 0.00034955885179869265, + "loss": 0.0027, + "num_input_tokens_seen": 168852896, + "step": 78165 + }, + { + "epoch": 12.752039151712887, + "grad_norm": 0.6489723920822144, + "learning_rate": 0.0003494909718823083, + "loss": 0.0587, + "num_input_tokens_seen": 168864736, + "step": 78170 + }, + { + "epoch": 12.752854812398043, + "grad_norm": 0.0014829429564997554, + "learning_rate": 0.00034942309501618016, + "loss": 0.0093, + "num_input_tokens_seen": 168876256, + "step": 78175 + }, + { + "epoch": 12.753670473083197, + "grad_norm": 0.0063831862062215805, + "learning_rate": 0.00034935522120168417, + "loss": 0.0058, + "num_input_tokens_seen": 168887456, + "step": 78180 + }, + { + "epoch": 12.754486133768353, + "grad_norm": 0.587399423122406, + "learning_rate": 0.0003492873504401956, + "loss": 0.0277, + "num_input_tokens_seen": 168899552, + "step": 78185 + }, + { + "epoch": 12.755301794453507, + "grad_norm": 0.08072449266910553, + "learning_rate": 0.0003492194827330902, + "loss": 0.0194, + "num_input_tokens_seen": 168910368, + "step": 78190 + }, + { + "epoch": 12.756117455138662, + "grad_norm": 0.0015662991208955646, + "learning_rate": 0.00034915161808174314, + "loss": 0.0179, + "num_input_tokens_seen": 168921728, + "step": 78195 + }, + { + "epoch": 12.756933115823816, + "grad_norm": 0.08228830248117447, + "learning_rate": 0.0003490837564875301, + "loss": 0.0053, + "num_input_tokens_seen": 168933568, + "step": 78200 + }, + { + "epoch": 12.757748776508972, + "grad_norm": 0.002885582856833935, + "learning_rate": 0.0003490158979518259, + "loss": 0.0176, + "num_input_tokens_seen": 168944384, + "step": 78205 + }, + { + "epoch": 12.758564437194128, + "grad_norm": 0.5468868017196655, + "learning_rate": 0.00034894804247600613, + "loss": 0.0556, + "num_input_tokens_seen": 168955584, + "step": 78210 + }, + { + "epoch": 12.759380097879282, + "grad_norm": 0.004578125663101673, + "learning_rate": 0.0003488801900614461, + "loss": 0.0158, + "num_input_tokens_seen": 168966016, + "step": 78215 + }, + { + "epoch": 12.760195758564437, + "grad_norm": 0.17387300729751587, + "learning_rate": 0.0003488123407095205, + "loss": 0.0282, + "num_input_tokens_seen": 168976832, + "step": 78220 + }, + { + "epoch": 12.761011419249591, + "grad_norm": 0.03517686203122139, + "learning_rate": 0.00034874449442160485, + "loss": 0.0159, + "num_input_tokens_seen": 168987936, + "step": 78225 + }, + { + "epoch": 12.761827079934747, + "grad_norm": 0.006768247578293085, + "learning_rate": 0.00034867665119907363, + "loss": 0.0183, + "num_input_tokens_seen": 168999584, + "step": 78230 + }, + { + "epoch": 12.762642740619903, + "grad_norm": 0.09498634934425354, + "learning_rate": 0.0003486088110433023, + "loss": 0.1167, + "num_input_tokens_seen": 169010272, + "step": 78235 + }, + { + "epoch": 12.763458401305057, + "grad_norm": 0.01517215184867382, + "learning_rate": 0.0003485409739556653, + "loss": 0.0036, + "num_input_tokens_seen": 169020768, + "step": 78240 + }, + { + "epoch": 12.764274061990212, + "grad_norm": 0.016436690464615822, + "learning_rate": 0.0003484731399375377, + "loss": 0.1903, + "num_input_tokens_seen": 169029472, + "step": 78245 + }, + { + "epoch": 12.765089722675366, + "grad_norm": 0.009139187633991241, + "learning_rate": 0.00034840530899029405, + "loss": 0.1342, + "num_input_tokens_seen": 169039520, + "step": 78250 + }, + { + "epoch": 12.765905383360522, + "grad_norm": 0.0008956545498222113, + "learning_rate": 0.00034833748111530926, + "loss": 0.0154, + "num_input_tokens_seen": 169049440, + "step": 78255 + }, + { + "epoch": 12.766721044045678, + "grad_norm": 0.02780218981206417, + "learning_rate": 0.00034826965631395767, + "loss": 0.0034, + "num_input_tokens_seen": 169061312, + "step": 78260 + }, + { + "epoch": 12.767536704730832, + "grad_norm": 0.005282025318592787, + "learning_rate": 0.0003482018345876141, + "loss": 0.0081, + "num_input_tokens_seen": 169071776, + "step": 78265 + }, + { + "epoch": 12.768352365415987, + "grad_norm": 0.01068632211536169, + "learning_rate": 0.0003481340159376528, + "loss": 0.0079, + "num_input_tokens_seen": 169082528, + "step": 78270 + }, + { + "epoch": 12.769168026101141, + "grad_norm": 0.002080516656860709, + "learning_rate": 0.0003480662003654483, + "loss": 0.0355, + "num_input_tokens_seen": 169093440, + "step": 78275 + }, + { + "epoch": 12.769983686786297, + "grad_norm": 0.021333087235689163, + "learning_rate": 0.00034799838787237514, + "loss": 0.0067, + "num_input_tokens_seen": 169103840, + "step": 78280 + }, + { + "epoch": 12.770799347471453, + "grad_norm": 0.02037815749645233, + "learning_rate": 0.00034793057845980744, + "loss": 0.009, + "num_input_tokens_seen": 169113760, + "step": 78285 + }, + { + "epoch": 12.771615008156607, + "grad_norm": 0.006305212154984474, + "learning_rate": 0.00034786277212911943, + "loss": 0.0026, + "num_input_tokens_seen": 169123552, + "step": 78290 + }, + { + "epoch": 12.772430668841762, + "grad_norm": 0.006527638528496027, + "learning_rate": 0.0003477949688816854, + "loss": 0.007, + "num_input_tokens_seen": 169133664, + "step": 78295 + }, + { + "epoch": 12.773246329526916, + "grad_norm": 0.008840296417474747, + "learning_rate": 0.00034772716871887924, + "loss": 0.0116, + "num_input_tokens_seen": 169145440, + "step": 78300 + }, + { + "epoch": 12.774061990212072, + "grad_norm": 0.48614218831062317, + "learning_rate": 0.0003476593716420754, + "loss": 0.025, + "num_input_tokens_seen": 169156480, + "step": 78305 + }, + { + "epoch": 12.774877650897226, + "grad_norm": 0.029282858595252037, + "learning_rate": 0.00034759157765264746, + "loss": 0.0047, + "num_input_tokens_seen": 169167744, + "step": 78310 + }, + { + "epoch": 12.775693311582382, + "grad_norm": 0.27233338356018066, + "learning_rate": 0.00034752378675196975, + "loss": 0.0169, + "num_input_tokens_seen": 169178464, + "step": 78315 + }, + { + "epoch": 12.776508972267537, + "grad_norm": 0.005301279481500387, + "learning_rate": 0.0003474559989414158, + "loss": 0.0074, + "num_input_tokens_seen": 169189248, + "step": 78320 + }, + { + "epoch": 12.777324632952691, + "grad_norm": 0.04829799011349678, + "learning_rate": 0.00034738821422235943, + "loss": 0.0076, + "num_input_tokens_seen": 169201024, + "step": 78325 + }, + { + "epoch": 12.778140293637847, + "grad_norm": 0.030659900978207588, + "learning_rate": 0.00034732043259617473, + "loss": 0.0049, + "num_input_tokens_seen": 169212480, + "step": 78330 + }, + { + "epoch": 12.778955954323001, + "grad_norm": 0.006221574265509844, + "learning_rate": 0.000347252654064235, + "loss": 0.0549, + "num_input_tokens_seen": 169224128, + "step": 78335 + }, + { + "epoch": 12.779771615008157, + "grad_norm": 0.01066858321428299, + "learning_rate": 0.00034718487862791413, + "loss": 0.0024, + "num_input_tokens_seen": 169234592, + "step": 78340 + }, + { + "epoch": 12.780587275693312, + "grad_norm": 0.556633472442627, + "learning_rate": 0.0003471171062885854, + "loss": 0.0951, + "num_input_tokens_seen": 169245728, + "step": 78345 + }, + { + "epoch": 12.781402936378466, + "grad_norm": 0.031409382820129395, + "learning_rate": 0.00034704933704762266, + "loss": 0.0668, + "num_input_tokens_seen": 169256160, + "step": 78350 + }, + { + "epoch": 12.782218597063622, + "grad_norm": 0.007934520952403545, + "learning_rate": 0.00034698157090639893, + "loss": 0.0025, + "num_input_tokens_seen": 169267232, + "step": 78355 + }, + { + "epoch": 12.783034257748776, + "grad_norm": 0.18476316332817078, + "learning_rate": 0.000346913807866288, + "loss": 0.0094, + "num_input_tokens_seen": 169277120, + "step": 78360 + }, + { + "epoch": 12.783849918433932, + "grad_norm": 0.020248549059033394, + "learning_rate": 0.00034684604792866277, + "loss": 0.0063, + "num_input_tokens_seen": 169288096, + "step": 78365 + }, + { + "epoch": 12.784665579119086, + "grad_norm": 0.01171860285103321, + "learning_rate": 0.00034677829109489684, + "loss": 0.0058, + "num_input_tokens_seen": 169299744, + "step": 78370 + }, + { + "epoch": 12.785481239804241, + "grad_norm": 0.002188315847888589, + "learning_rate": 0.00034671053736636307, + "loss": 0.0073, + "num_input_tokens_seen": 169310528, + "step": 78375 + }, + { + "epoch": 12.786296900489397, + "grad_norm": 0.05189083144068718, + "learning_rate": 0.0003466427867444348, + "loss": 0.0166, + "num_input_tokens_seen": 169321440, + "step": 78380 + }, + { + "epoch": 12.78711256117455, + "grad_norm": 0.004644290544092655, + "learning_rate": 0.00034657503923048497, + "loss": 0.1892, + "num_input_tokens_seen": 169332640, + "step": 78385 + }, + { + "epoch": 12.787928221859707, + "grad_norm": 0.02340223640203476, + "learning_rate": 0.00034650729482588665, + "loss": 0.0228, + "num_input_tokens_seen": 169344576, + "step": 78390 + }, + { + "epoch": 12.78874388254486, + "grad_norm": 0.12741638720035553, + "learning_rate": 0.0003464395535320126, + "loss": 0.0293, + "num_input_tokens_seen": 169356288, + "step": 78395 + }, + { + "epoch": 12.789559543230016, + "grad_norm": 0.4657769203186035, + "learning_rate": 0.000346371815350236, + "loss": 0.0757, + "num_input_tokens_seen": 169367520, + "step": 78400 + }, + { + "epoch": 12.790375203915172, + "grad_norm": 0.011172788217663765, + "learning_rate": 0.0003463040802819292, + "loss": 0.0059, + "num_input_tokens_seen": 169378208, + "step": 78405 + }, + { + "epoch": 12.791190864600326, + "grad_norm": 0.0089213652536273, + "learning_rate": 0.0003462363483284654, + "loss": 0.0036, + "num_input_tokens_seen": 169387968, + "step": 78410 + }, + { + "epoch": 12.792006525285482, + "grad_norm": 0.019134066998958588, + "learning_rate": 0.0003461686194912169, + "loss": 0.0101, + "num_input_tokens_seen": 169397952, + "step": 78415 + }, + { + "epoch": 12.792822185970635, + "grad_norm": 0.0643213540315628, + "learning_rate": 0.00034610089377155656, + "loss": 0.141, + "num_input_tokens_seen": 169410176, + "step": 78420 + }, + { + "epoch": 12.793637846655791, + "grad_norm": 0.004243266768753529, + "learning_rate": 0.0003460331711708569, + "loss": 0.004, + "num_input_tokens_seen": 169421408, + "step": 78425 + }, + { + "epoch": 12.794453507340947, + "grad_norm": 0.008267040364444256, + "learning_rate": 0.00034596545169049013, + "loss": 0.0035, + "num_input_tokens_seen": 169432032, + "step": 78430 + }, + { + "epoch": 12.7952691680261, + "grad_norm": 0.00525688799098134, + "learning_rate": 0.00034589773533182924, + "loss": 0.0098, + "num_input_tokens_seen": 169441888, + "step": 78435 + }, + { + "epoch": 12.796084828711257, + "grad_norm": 0.04394717514514923, + "learning_rate": 0.00034583002209624594, + "loss": 0.1423, + "num_input_tokens_seen": 169452064, + "step": 78440 + }, + { + "epoch": 12.79690048939641, + "grad_norm": 0.02494252845644951, + "learning_rate": 0.0003457623119851129, + "loss": 0.0046, + "num_input_tokens_seen": 169464576, + "step": 78445 + }, + { + "epoch": 12.797716150081566, + "grad_norm": 0.14180971682071686, + "learning_rate": 0.00034569460499980233, + "loss": 0.0109, + "num_input_tokens_seen": 169474464, + "step": 78450 + }, + { + "epoch": 12.798531810766722, + "grad_norm": 0.008769070729613304, + "learning_rate": 0.00034562690114168626, + "loss": 0.0253, + "num_input_tokens_seen": 169484992, + "step": 78455 + }, + { + "epoch": 12.799347471451876, + "grad_norm": 0.0028862846083939075, + "learning_rate": 0.000345559200412137, + "loss": 0.0063, + "num_input_tokens_seen": 169495328, + "step": 78460 + }, + { + "epoch": 12.800163132137031, + "grad_norm": 0.006227980833500624, + "learning_rate": 0.00034549150281252633, + "loss": 0.0137, + "num_input_tokens_seen": 169507008, + "step": 78465 + }, + { + "epoch": 12.800978792822185, + "grad_norm": 0.505739688873291, + "learning_rate": 0.00034542380834422633, + "loss": 0.0376, + "num_input_tokens_seen": 169518144, + "step": 78470 + }, + { + "epoch": 12.801794453507341, + "grad_norm": 0.2012212574481964, + "learning_rate": 0.00034535611700860913, + "loss": 0.105, + "num_input_tokens_seen": 169529536, + "step": 78475 + }, + { + "epoch": 12.802610114192497, + "grad_norm": 0.015639374032616615, + "learning_rate": 0.00034528842880704626, + "loss": 0.0097, + "num_input_tokens_seen": 169540384, + "step": 78480 + }, + { + "epoch": 12.80342577487765, + "grad_norm": 0.034782614558935165, + "learning_rate": 0.0003452207437409097, + "loss": 0.0168, + "num_input_tokens_seen": 169550816, + "step": 78485 + }, + { + "epoch": 12.804241435562806, + "grad_norm": 0.010439387522637844, + "learning_rate": 0.00034515306181157106, + "loss": 0.06, + "num_input_tokens_seen": 169562560, + "step": 78490 + }, + { + "epoch": 12.80505709624796, + "grad_norm": 0.0014605855103582144, + "learning_rate": 0.00034508538302040225, + "loss": 0.0254, + "num_input_tokens_seen": 169573312, + "step": 78495 + }, + { + "epoch": 12.805872756933116, + "grad_norm": 0.2424784004688263, + "learning_rate": 0.00034501770736877443, + "loss": 0.0935, + "num_input_tokens_seen": 169584032, + "step": 78500 + }, + { + "epoch": 12.80668841761827, + "grad_norm": 0.00401564035564661, + "learning_rate": 0.0003449500348580596, + "loss": 0.0177, + "num_input_tokens_seen": 169594368, + "step": 78505 + }, + { + "epoch": 12.807504078303426, + "grad_norm": 0.009832642041146755, + "learning_rate": 0.0003448823654896288, + "loss": 0.0599, + "num_input_tokens_seen": 169605248, + "step": 78510 + }, + { + "epoch": 12.808319738988581, + "grad_norm": 0.29828646779060364, + "learning_rate": 0.00034481469926485385, + "loss": 0.0193, + "num_input_tokens_seen": 169615904, + "step": 78515 + }, + { + "epoch": 12.809135399673735, + "grad_norm": 0.027520187199115753, + "learning_rate": 0.00034474703618510565, + "loss": 0.0323, + "num_input_tokens_seen": 169626592, + "step": 78520 + }, + { + "epoch": 12.809951060358891, + "grad_norm": 0.08540055900812149, + "learning_rate": 0.00034467937625175596, + "loss": 0.1196, + "num_input_tokens_seen": 169637504, + "step": 78525 + }, + { + "epoch": 12.810766721044045, + "grad_norm": 0.0028646751306951046, + "learning_rate": 0.00034461171946617553, + "loss": 0.0752, + "num_input_tokens_seen": 169647232, + "step": 78530 + }, + { + "epoch": 12.8115823817292, + "grad_norm": 0.0070752971805632114, + "learning_rate": 0.0003445440658297357, + "loss": 0.0048, + "num_input_tokens_seen": 169657824, + "step": 78535 + }, + { + "epoch": 12.812398042414356, + "grad_norm": 0.007390094920992851, + "learning_rate": 0.0003444764153438079, + "loss": 0.0079, + "num_input_tokens_seen": 169667776, + "step": 78540 + }, + { + "epoch": 12.81321370309951, + "grad_norm": 0.1049143373966217, + "learning_rate": 0.0003444087680097625, + "loss": 0.0481, + "num_input_tokens_seen": 169678784, + "step": 78545 + }, + { + "epoch": 12.814029363784666, + "grad_norm": 0.008307898417115211, + "learning_rate": 0.00034434112382897107, + "loss": 0.036, + "num_input_tokens_seen": 169688960, + "step": 78550 + }, + { + "epoch": 12.81484502446982, + "grad_norm": 0.0053911637514829636, + "learning_rate": 0.000344273482802804, + "loss": 0.0137, + "num_input_tokens_seen": 169698208, + "step": 78555 + }, + { + "epoch": 12.815660685154976, + "grad_norm": 0.012019234709441662, + "learning_rate": 0.00034420584493263264, + "loss": 0.0108, + "num_input_tokens_seen": 169709056, + "step": 78560 + }, + { + "epoch": 12.81647634584013, + "grad_norm": 0.5703594088554382, + "learning_rate": 0.0003441382102198272, + "loss": 0.0911, + "num_input_tokens_seen": 169719936, + "step": 78565 + }, + { + "epoch": 12.817292006525285, + "grad_norm": 0.01869955286383629, + "learning_rate": 0.0003440705786657588, + "loss": 0.0259, + "num_input_tokens_seen": 169730816, + "step": 78570 + }, + { + "epoch": 12.818107667210441, + "grad_norm": 0.16946589946746826, + "learning_rate": 0.00034400295027179776, + "loss": 0.0073, + "num_input_tokens_seen": 169741696, + "step": 78575 + }, + { + "epoch": 12.818923327895595, + "grad_norm": 0.5616081953048706, + "learning_rate": 0.00034393532503931514, + "loss": 0.0469, + "num_input_tokens_seen": 169752512, + "step": 78580 + }, + { + "epoch": 12.81973898858075, + "grad_norm": 0.0031734046060591936, + "learning_rate": 0.0003438677029696808, + "loss": 0.01, + "num_input_tokens_seen": 169760992, + "step": 78585 + }, + { + "epoch": 12.820554649265905, + "grad_norm": 0.06362012028694153, + "learning_rate": 0.0003438000840642657, + "loss": 0.0378, + "num_input_tokens_seen": 169771424, + "step": 78590 + }, + { + "epoch": 12.82137030995106, + "grad_norm": 0.02829126827418804, + "learning_rate": 0.00034373246832444007, + "loss": 0.0224, + "num_input_tokens_seen": 169781344, + "step": 78595 + }, + { + "epoch": 12.822185970636216, + "grad_norm": 2.228576183319092, + "learning_rate": 0.00034366485575157413, + "loss": 0.1215, + "num_input_tokens_seen": 169791488, + "step": 78600 + }, + { + "epoch": 12.82300163132137, + "grad_norm": 0.0022986563853919506, + "learning_rate": 0.00034359724634703827, + "loss": 0.0039, + "num_input_tokens_seen": 169803136, + "step": 78605 + }, + { + "epoch": 12.823817292006526, + "grad_norm": 0.007648915518075228, + "learning_rate": 0.0003435296401122027, + "loss": 0.0078, + "num_input_tokens_seen": 169813664, + "step": 78610 + }, + { + "epoch": 12.82463295269168, + "grad_norm": 0.015873296186327934, + "learning_rate": 0.0003434620370484372, + "loss": 0.0045, + "num_input_tokens_seen": 169824288, + "step": 78615 + }, + { + "epoch": 12.825448613376835, + "grad_norm": 0.001540105091407895, + "learning_rate": 0.0003433944371571124, + "loss": 0.0136, + "num_input_tokens_seen": 169833728, + "step": 78620 + }, + { + "epoch": 12.826264274061991, + "grad_norm": 1.0538294315338135, + "learning_rate": 0.00034332684043959777, + "loss": 0.2605, + "num_input_tokens_seen": 169843936, + "step": 78625 + }, + { + "epoch": 12.827079934747145, + "grad_norm": 0.002735902788117528, + "learning_rate": 0.00034325924689726376, + "loss": 0.012, + "num_input_tokens_seen": 169854688, + "step": 78630 + }, + { + "epoch": 12.8278955954323, + "grad_norm": 0.2805021107196808, + "learning_rate": 0.00034319165653147964, + "loss": 0.0268, + "num_input_tokens_seen": 169863680, + "step": 78635 + }, + { + "epoch": 12.828711256117455, + "grad_norm": 0.023348089307546616, + "learning_rate": 0.00034312406934361553, + "loss": 0.0082, + "num_input_tokens_seen": 169874176, + "step": 78640 + }, + { + "epoch": 12.82952691680261, + "grad_norm": 0.03763202577829361, + "learning_rate": 0.0003430564853350414, + "loss": 0.0118, + "num_input_tokens_seen": 169885632, + "step": 78645 + }, + { + "epoch": 12.830342577487766, + "grad_norm": 0.005634233821183443, + "learning_rate": 0.0003429889045071265, + "loss": 0.0463, + "num_input_tokens_seen": 169895552, + "step": 78650 + }, + { + "epoch": 12.83115823817292, + "grad_norm": 0.023782063275575638, + "learning_rate": 0.0003429213268612408, + "loss": 0.1291, + "num_input_tokens_seen": 169905728, + "step": 78655 + }, + { + "epoch": 12.831973898858076, + "grad_norm": 0.05306378751993179, + "learning_rate": 0.0003428537523987535, + "loss": 0.0052, + "num_input_tokens_seen": 169914304, + "step": 78660 + }, + { + "epoch": 12.83278955954323, + "grad_norm": 0.0029371960554271936, + "learning_rate": 0.0003427861811210345, + "loss": 0.0051, + "num_input_tokens_seen": 169924800, + "step": 78665 + }, + { + "epoch": 12.833605220228385, + "grad_norm": 0.006226977799087763, + "learning_rate": 0.0003427186130294527, + "loss": 0.0046, + "num_input_tokens_seen": 169935712, + "step": 78670 + }, + { + "epoch": 12.83442088091354, + "grad_norm": 0.31288060545921326, + "learning_rate": 0.00034265104812537805, + "loss": 0.0823, + "num_input_tokens_seen": 169947296, + "step": 78675 + }, + { + "epoch": 12.835236541598695, + "grad_norm": 0.0038242738228291273, + "learning_rate": 0.0003425834864101792, + "loss": 0.0058, + "num_input_tokens_seen": 169957504, + "step": 78680 + }, + { + "epoch": 12.83605220228385, + "grad_norm": 0.031265001744031906, + "learning_rate": 0.000342515927885226, + "loss": 0.0828, + "num_input_tokens_seen": 169968896, + "step": 78685 + }, + { + "epoch": 12.836867862969005, + "grad_norm": 0.014412354677915573, + "learning_rate": 0.000342448372551887, + "loss": 0.0143, + "num_input_tokens_seen": 169979680, + "step": 78690 + }, + { + "epoch": 12.83768352365416, + "grad_norm": 0.010973574593663216, + "learning_rate": 0.0003423808204115318, + "loss": 0.0045, + "num_input_tokens_seen": 169989952, + "step": 78695 + }, + { + "epoch": 12.838499184339314, + "grad_norm": 0.015177428722381592, + "learning_rate": 0.00034231327146552916, + "loss": 0.0109, + "num_input_tokens_seen": 170000896, + "step": 78700 + }, + { + "epoch": 12.83931484502447, + "grad_norm": 0.02052193135023117, + "learning_rate": 0.00034224572571524823, + "loss": 0.0538, + "num_input_tokens_seen": 170011616, + "step": 78705 + }, + { + "epoch": 12.840130505709626, + "grad_norm": 0.04011628404259682, + "learning_rate": 0.00034217818316205757, + "loss": 0.009, + "num_input_tokens_seen": 170023296, + "step": 78710 + }, + { + "epoch": 12.84094616639478, + "grad_norm": 0.003024019068107009, + "learning_rate": 0.0003421106438073265, + "loss": 0.0082, + "num_input_tokens_seen": 170032384, + "step": 78715 + }, + { + "epoch": 12.841761827079935, + "grad_norm": 0.10613281279802322, + "learning_rate": 0.0003420431076524233, + "loss": 0.0085, + "num_input_tokens_seen": 170044352, + "step": 78720 + }, + { + "epoch": 12.84257748776509, + "grad_norm": 0.0019029824761673808, + "learning_rate": 0.0003419755746987171, + "loss": 0.0025, + "num_input_tokens_seen": 170055136, + "step": 78725 + }, + { + "epoch": 12.843393148450245, + "grad_norm": 0.013800938613712788, + "learning_rate": 0.0003419080449475761, + "loss": 0.0047, + "num_input_tokens_seen": 170064672, + "step": 78730 + }, + { + "epoch": 12.844208809135399, + "grad_norm": 0.006535480264574289, + "learning_rate": 0.0003418405184003693, + "loss": 0.0067, + "num_input_tokens_seen": 170075488, + "step": 78735 + }, + { + "epoch": 12.845024469820554, + "grad_norm": 0.35378557443618774, + "learning_rate": 0.000341772995058465, + "loss": 0.0358, + "num_input_tokens_seen": 170084736, + "step": 78740 + }, + { + "epoch": 12.84584013050571, + "grad_norm": 0.004953265190124512, + "learning_rate": 0.0003417054749232316, + "loss": 0.1255, + "num_input_tokens_seen": 170094784, + "step": 78745 + }, + { + "epoch": 12.846655791190864, + "grad_norm": 0.006483436096459627, + "learning_rate": 0.0003416379579960377, + "loss": 0.0064, + "num_input_tokens_seen": 170104640, + "step": 78750 + }, + { + "epoch": 12.84747145187602, + "grad_norm": 0.01915556751191616, + "learning_rate": 0.00034157044427825137, + "loss": 0.0042, + "num_input_tokens_seen": 170115744, + "step": 78755 + }, + { + "epoch": 12.848287112561174, + "grad_norm": 0.00910657923668623, + "learning_rate": 0.000341502933771241, + "loss": 0.0356, + "num_input_tokens_seen": 170127008, + "step": 78760 + }, + { + "epoch": 12.84910277324633, + "grad_norm": 0.01944858767092228, + "learning_rate": 0.00034143542647637474, + "loss": 0.0171, + "num_input_tokens_seen": 170137248, + "step": 78765 + }, + { + "epoch": 12.849918433931485, + "grad_norm": 0.002999127609655261, + "learning_rate": 0.00034136792239502074, + "loss": 0.0088, + "num_input_tokens_seen": 170147904, + "step": 78770 + }, + { + "epoch": 12.850734094616639, + "grad_norm": 0.7215339541435242, + "learning_rate": 0.000341300421528547, + "loss": 0.1081, + "num_input_tokens_seen": 170158912, + "step": 78775 + }, + { + "epoch": 12.851549755301795, + "grad_norm": 0.004867668263614178, + "learning_rate": 0.0003412329238783216, + "loss": 0.0065, + "num_input_tokens_seen": 170169760, + "step": 78780 + }, + { + "epoch": 12.852365415986949, + "grad_norm": 0.009730189107358456, + "learning_rate": 0.00034116542944571227, + "loss": 0.0279, + "num_input_tokens_seen": 170180448, + "step": 78785 + }, + { + "epoch": 12.853181076672104, + "grad_norm": 0.0032304124906659126, + "learning_rate": 0.00034109793823208724, + "loss": 0.0051, + "num_input_tokens_seen": 170191136, + "step": 78790 + }, + { + "epoch": 12.85399673735726, + "grad_norm": 0.010908522643148899, + "learning_rate": 0.0003410304502388139, + "loss": 0.0062, + "num_input_tokens_seen": 170201376, + "step": 78795 + }, + { + "epoch": 12.854812398042414, + "grad_norm": 0.0023245131596922874, + "learning_rate": 0.0003409629654672602, + "loss": 0.0141, + "num_input_tokens_seen": 170212928, + "step": 78800 + }, + { + "epoch": 12.85562805872757, + "grad_norm": 0.02462649531662464, + "learning_rate": 0.0003408954839187938, + "loss": 0.0148, + "num_input_tokens_seen": 170224032, + "step": 78805 + }, + { + "epoch": 12.856443719412724, + "grad_norm": 0.004026814829558134, + "learning_rate": 0.0003408280055947823, + "loss": 0.0066, + "num_input_tokens_seen": 170234784, + "step": 78810 + }, + { + "epoch": 12.85725938009788, + "grad_norm": 0.001957811415195465, + "learning_rate": 0.00034076053049659295, + "loss": 0.0067, + "num_input_tokens_seen": 170247328, + "step": 78815 + }, + { + "epoch": 12.858075040783035, + "grad_norm": 0.00296620256267488, + "learning_rate": 0.00034069305862559373, + "loss": 0.0027, + "num_input_tokens_seen": 170258720, + "step": 78820 + }, + { + "epoch": 12.858890701468189, + "grad_norm": 0.03529384359717369, + "learning_rate": 0.00034062558998315163, + "loss": 0.063, + "num_input_tokens_seen": 170269376, + "step": 78825 + }, + { + "epoch": 12.859706362153345, + "grad_norm": 0.09343760460615158, + "learning_rate": 0.0003405581245706342, + "loss": 0.0072, + "num_input_tokens_seen": 170280672, + "step": 78830 + }, + { + "epoch": 12.860522022838499, + "grad_norm": 0.011827799491584301, + "learning_rate": 0.0003404906623894085, + "loss": 0.0095, + "num_input_tokens_seen": 170291488, + "step": 78835 + }, + { + "epoch": 12.861337683523654, + "grad_norm": 0.0021250757854431868, + "learning_rate": 0.0003404232034408421, + "loss": 0.0062, + "num_input_tokens_seen": 170303264, + "step": 78840 + }, + { + "epoch": 12.86215334420881, + "grad_norm": 0.5916451215744019, + "learning_rate": 0.00034035574772630175, + "loss": 0.0772, + "num_input_tokens_seen": 170313280, + "step": 78845 + }, + { + "epoch": 12.862969004893964, + "grad_norm": 0.019389253109693527, + "learning_rate": 0.00034028829524715464, + "loss": 0.015, + "num_input_tokens_seen": 170324544, + "step": 78850 + }, + { + "epoch": 12.86378466557912, + "grad_norm": 0.07211606204509735, + "learning_rate": 0.000340220846004768, + "loss": 0.0096, + "num_input_tokens_seen": 170336672, + "step": 78855 + }, + { + "epoch": 12.864600326264274, + "grad_norm": 0.0109171811491251, + "learning_rate": 0.00034015340000050846, + "loss": 0.0024, + "num_input_tokens_seen": 170347680, + "step": 78860 + }, + { + "epoch": 12.86541598694943, + "grad_norm": 0.0226032342761755, + "learning_rate": 0.00034008595723574326, + "loss": 0.0035, + "num_input_tokens_seen": 170359136, + "step": 78865 + }, + { + "epoch": 12.866231647634583, + "grad_norm": 0.016140323132276535, + "learning_rate": 0.00034001851771183877, + "loss": 0.0688, + "num_input_tokens_seen": 170369440, + "step": 78870 + }, + { + "epoch": 12.867047308319739, + "grad_norm": 0.07779782265424728, + "learning_rate": 0.00033995108143016216, + "loss": 0.0144, + "num_input_tokens_seen": 170380864, + "step": 78875 + }, + { + "epoch": 12.867862969004895, + "grad_norm": 0.06042582169175148, + "learning_rate": 0.0003398836483920798, + "loss": 0.0052, + "num_input_tokens_seen": 170392704, + "step": 78880 + }, + { + "epoch": 12.868678629690049, + "grad_norm": 0.020869217813014984, + "learning_rate": 0.0003398162185989586, + "loss": 0.0058, + "num_input_tokens_seen": 170403552, + "step": 78885 + }, + { + "epoch": 12.869494290375204, + "grad_norm": 0.010013763792812824, + "learning_rate": 0.0003397487920521647, + "loss": 0.013, + "num_input_tokens_seen": 170414336, + "step": 78890 + }, + { + "epoch": 12.870309951060358, + "grad_norm": 0.0005221384926699102, + "learning_rate": 0.00033968136875306496, + "loss": 0.0119, + "num_input_tokens_seen": 170426400, + "step": 78895 + }, + { + "epoch": 12.871125611745514, + "grad_norm": 0.0225661713629961, + "learning_rate": 0.0003396139487030256, + "loss": 0.1702, + "num_input_tokens_seen": 170437472, + "step": 78900 + }, + { + "epoch": 12.87194127243067, + "grad_norm": 0.00248327711597085, + "learning_rate": 0.00033954653190341306, + "loss": 0.1597, + "num_input_tokens_seen": 170448416, + "step": 78905 + }, + { + "epoch": 12.872756933115824, + "grad_norm": 0.019737066701054573, + "learning_rate": 0.0003394791183555936, + "loss": 0.0068, + "num_input_tokens_seen": 170460256, + "step": 78910 + }, + { + "epoch": 12.87357259380098, + "grad_norm": 0.10288142412900925, + "learning_rate": 0.0003394117080609335, + "loss": 0.0048, + "num_input_tokens_seen": 170471776, + "step": 78915 + }, + { + "epoch": 12.874388254486133, + "grad_norm": 0.0019228693563491106, + "learning_rate": 0.0003393443010207988, + "loss": 0.0038, + "num_input_tokens_seen": 170483328, + "step": 78920 + }, + { + "epoch": 12.875203915171289, + "grad_norm": 0.010113107040524483, + "learning_rate": 0.0003392768972365556, + "loss": 0.0585, + "num_input_tokens_seen": 170494432, + "step": 78925 + }, + { + "epoch": 12.876019575856443, + "grad_norm": 0.037867337465286255, + "learning_rate": 0.00033920949670956994, + "loss": 0.0172, + "num_input_tokens_seen": 170504864, + "step": 78930 + }, + { + "epoch": 12.876835236541599, + "grad_norm": 0.015708623453974724, + "learning_rate": 0.000339142099441208, + "loss": 0.0058, + "num_input_tokens_seen": 170515136, + "step": 78935 + }, + { + "epoch": 12.877650897226754, + "grad_norm": 0.0006929939845576882, + "learning_rate": 0.0003390747054328353, + "loss": 0.0064, + "num_input_tokens_seen": 170525824, + "step": 78940 + }, + { + "epoch": 12.878466557911908, + "grad_norm": 0.3186556398868561, + "learning_rate": 0.00033900731468581804, + "loss": 0.2179, + "num_input_tokens_seen": 170536928, + "step": 78945 + }, + { + "epoch": 12.879282218597064, + "grad_norm": 0.0015595832373946905, + "learning_rate": 0.0003389399272015215, + "loss": 0.023, + "num_input_tokens_seen": 170547520, + "step": 78950 + }, + { + "epoch": 12.880097879282218, + "grad_norm": 0.007515274453908205, + "learning_rate": 0.0003388725429813117, + "loss": 0.002, + "num_input_tokens_seen": 170558464, + "step": 78955 + }, + { + "epoch": 12.880913539967374, + "grad_norm": 0.030450142920017242, + "learning_rate": 0.0003388051620265544, + "loss": 0.0093, + "num_input_tokens_seen": 170568768, + "step": 78960 + }, + { + "epoch": 12.88172920065253, + "grad_norm": 0.32018008828163147, + "learning_rate": 0.0003387377843386148, + "loss": 0.1654, + "num_input_tokens_seen": 170579648, + "step": 78965 + }, + { + "epoch": 12.882544861337683, + "grad_norm": 0.018319450318813324, + "learning_rate": 0.00033867040991885885, + "loss": 0.0129, + "num_input_tokens_seen": 170589440, + "step": 78970 + }, + { + "epoch": 12.883360522022839, + "grad_norm": 0.00849565677344799, + "learning_rate": 0.0003386030387686514, + "loss": 0.0034, + "num_input_tokens_seen": 170600384, + "step": 78975 + }, + { + "epoch": 12.884176182707993, + "grad_norm": 0.0026667932979762554, + "learning_rate": 0.0003385356708893584, + "loss": 0.0022, + "num_input_tokens_seen": 170611584, + "step": 78980 + }, + { + "epoch": 12.884991843393149, + "grad_norm": 0.0011457474902272224, + "learning_rate": 0.0003384683062823446, + "loss": 0.0027, + "num_input_tokens_seen": 170623328, + "step": 78985 + }, + { + "epoch": 12.885807504078304, + "grad_norm": 0.0027485296595841646, + "learning_rate": 0.00033840094494897566, + "loss": 0.0037, + "num_input_tokens_seen": 170633760, + "step": 78990 + }, + { + "epoch": 12.886623164763458, + "grad_norm": 0.13075962662696838, + "learning_rate": 0.0003383335868906164, + "loss": 0.0112, + "num_input_tokens_seen": 170644704, + "step": 78995 + }, + { + "epoch": 12.887438825448614, + "grad_norm": 0.004995839670300484, + "learning_rate": 0.0003382662321086324, + "loss": 0.0041, + "num_input_tokens_seen": 170655040, + "step": 79000 + }, + { + "epoch": 12.888254486133768, + "grad_norm": 0.10305944830179214, + "learning_rate": 0.0003381988806043881, + "loss": 0.0128, + "num_input_tokens_seen": 170664352, + "step": 79005 + }, + { + "epoch": 12.889070146818923, + "grad_norm": 0.0039926618337631226, + "learning_rate": 0.0003381315323792489, + "loss": 0.1244, + "num_input_tokens_seen": 170675552, + "step": 79010 + }, + { + "epoch": 12.88988580750408, + "grad_norm": 0.1336335837841034, + "learning_rate": 0.00033806418743457937, + "loss": 0.0076, + "num_input_tokens_seen": 170686336, + "step": 79015 + }, + { + "epoch": 12.890701468189233, + "grad_norm": 0.0027748846914619207, + "learning_rate": 0.0003379968457717447, + "loss": 0.0058, + "num_input_tokens_seen": 170697408, + "step": 79020 + }, + { + "epoch": 12.891517128874389, + "grad_norm": 0.01572308875620365, + "learning_rate": 0.00033792950739210934, + "loss": 0.0087, + "num_input_tokens_seen": 170707872, + "step": 79025 + }, + { + "epoch": 12.892332789559543, + "grad_norm": 0.006033416371792555, + "learning_rate": 0.0003378621722970382, + "loss": 0.0043, + "num_input_tokens_seen": 170717792, + "step": 79030 + }, + { + "epoch": 12.893148450244698, + "grad_norm": 0.0038843636866658926, + "learning_rate": 0.00033779484048789574, + "loss": 0.1461, + "num_input_tokens_seen": 170728672, + "step": 79035 + }, + { + "epoch": 12.893964110929852, + "grad_norm": 0.008453777059912682, + "learning_rate": 0.0003377275119660467, + "loss": 0.0211, + "num_input_tokens_seen": 170739840, + "step": 79040 + }, + { + "epoch": 12.894779771615008, + "grad_norm": 0.0564873032271862, + "learning_rate": 0.00033766018673285535, + "loss": 0.0061, + "num_input_tokens_seen": 170750720, + "step": 79045 + }, + { + "epoch": 12.895595432300164, + "grad_norm": 0.010843920521438122, + "learning_rate": 0.0003375928647896863, + "loss": 0.0262, + "num_input_tokens_seen": 170761024, + "step": 79050 + }, + { + "epoch": 12.896411092985318, + "grad_norm": 0.00172845006454736, + "learning_rate": 0.000337525546137904, + "loss": 0.0081, + "num_input_tokens_seen": 170771872, + "step": 79055 + }, + { + "epoch": 12.897226753670473, + "grad_norm": 0.40799251198768616, + "learning_rate": 0.0003374582307788725, + "loss": 0.1212, + "num_input_tokens_seen": 170783264, + "step": 79060 + }, + { + "epoch": 12.898042414355627, + "grad_norm": 0.005817765835672617, + "learning_rate": 0.0003373909187139562, + "loss": 0.0031, + "num_input_tokens_seen": 170794752, + "step": 79065 + }, + { + "epoch": 12.898858075040783, + "grad_norm": 0.0027624014765024185, + "learning_rate": 0.0003373236099445191, + "loss": 0.0182, + "num_input_tokens_seen": 170805696, + "step": 79070 + }, + { + "epoch": 12.899673735725939, + "grad_norm": 0.010239914059638977, + "learning_rate": 0.00033725630447192556, + "loss": 0.0041, + "num_input_tokens_seen": 170816480, + "step": 79075 + }, + { + "epoch": 12.900489396411093, + "grad_norm": 0.36265629529953003, + "learning_rate": 0.0003371890022975394, + "loss": 0.1057, + "num_input_tokens_seen": 170826528, + "step": 79080 + }, + { + "epoch": 12.901305057096248, + "grad_norm": 0.01303062029182911, + "learning_rate": 0.0003371217034227247, + "loss": 0.0304, + "num_input_tokens_seen": 170838048, + "step": 79085 + }, + { + "epoch": 12.902120717781402, + "grad_norm": 0.053241170942783356, + "learning_rate": 0.0003370544078488453, + "loss": 0.0172, + "num_input_tokens_seen": 170849472, + "step": 79090 + }, + { + "epoch": 12.902936378466558, + "grad_norm": 0.010403000749647617, + "learning_rate": 0.000336987115577265, + "loss": 0.0309, + "num_input_tokens_seen": 170861472, + "step": 79095 + }, + { + "epoch": 12.903752039151712, + "grad_norm": 0.0021436321549117565, + "learning_rate": 0.0003369198266093475, + "loss": 0.0087, + "num_input_tokens_seen": 170873632, + "step": 79100 + }, + { + "epoch": 12.904567699836868, + "grad_norm": 0.007782533764839172, + "learning_rate": 0.00033685254094645685, + "loss": 0.1225, + "num_input_tokens_seen": 170883840, + "step": 79105 + }, + { + "epoch": 12.905383360522023, + "grad_norm": 0.0011054445058107376, + "learning_rate": 0.0003367852585899562, + "loss": 0.0048, + "num_input_tokens_seen": 170892480, + "step": 79110 + }, + { + "epoch": 12.906199021207177, + "grad_norm": 0.041748058050870895, + "learning_rate": 0.00033671797954120953, + "loss": 0.0063, + "num_input_tokens_seen": 170903040, + "step": 79115 + }, + { + "epoch": 12.907014681892333, + "grad_norm": 0.7993329763412476, + "learning_rate": 0.0003366507038015799, + "loss": 0.0212, + "num_input_tokens_seen": 170913440, + "step": 79120 + }, + { + "epoch": 12.907830342577487, + "grad_norm": 0.07743581384420395, + "learning_rate": 0.0003365834313724312, + "loss": 0.022, + "num_input_tokens_seen": 170923712, + "step": 79125 + }, + { + "epoch": 12.908646003262643, + "grad_norm": 0.0008200727752409875, + "learning_rate": 0.00033651616225512636, + "loss": 0.0165, + "num_input_tokens_seen": 170934720, + "step": 79130 + }, + { + "epoch": 12.909461663947798, + "grad_norm": 0.010752071626484394, + "learning_rate": 0.0003364488964510292, + "loss": 0.0223, + "num_input_tokens_seen": 170945568, + "step": 79135 + }, + { + "epoch": 12.910277324632952, + "grad_norm": 0.5752093195915222, + "learning_rate": 0.00033638163396150234, + "loss": 0.1113, + "num_input_tokens_seen": 170955616, + "step": 79140 + }, + { + "epoch": 12.911092985318108, + "grad_norm": 0.0010481280041858554, + "learning_rate": 0.0003363143747879094, + "loss": 0.1485, + "num_input_tokens_seen": 170967200, + "step": 79145 + }, + { + "epoch": 12.911908646003262, + "grad_norm": 1.013627529144287, + "learning_rate": 0.00033624711893161317, + "loss": 0.1638, + "num_input_tokens_seen": 170977952, + "step": 79150 + }, + { + "epoch": 12.912724306688418, + "grad_norm": 0.012935176491737366, + "learning_rate": 0.000336179866393977, + "loss": 0.0035, + "num_input_tokens_seen": 170989856, + "step": 79155 + }, + { + "epoch": 12.913539967373573, + "grad_norm": 0.008194522932171822, + "learning_rate": 0.0003361126171763634, + "loss": 0.0104, + "num_input_tokens_seen": 171000160, + "step": 79160 + }, + { + "epoch": 12.914355628058727, + "grad_norm": 0.0054146721959114075, + "learning_rate": 0.0003360453712801358, + "loss": 0.0031, + "num_input_tokens_seen": 171011488, + "step": 79165 + }, + { + "epoch": 12.915171288743883, + "grad_norm": 0.0015244726091623306, + "learning_rate": 0.00033597812870665657, + "loss": 0.0111, + "num_input_tokens_seen": 171022688, + "step": 79170 + }, + { + "epoch": 12.915986949429037, + "grad_norm": 0.032646216452121735, + "learning_rate": 0.00033591088945728856, + "loss": 0.0043, + "num_input_tokens_seen": 171033728, + "step": 79175 + }, + { + "epoch": 12.916802610114193, + "grad_norm": 0.003069676924496889, + "learning_rate": 0.0003358436535333947, + "loss": 0.004, + "num_input_tokens_seen": 171045152, + "step": 79180 + }, + { + "epoch": 12.917618270799348, + "grad_norm": 0.011279561556875706, + "learning_rate": 0.0003357764209363373, + "loss": 0.0035, + "num_input_tokens_seen": 171055872, + "step": 79185 + }, + { + "epoch": 12.918433931484502, + "grad_norm": 0.008914372883737087, + "learning_rate": 0.00033570919166747926, + "loss": 0.0038, + "num_input_tokens_seen": 171066624, + "step": 79190 + }, + { + "epoch": 12.919249592169658, + "grad_norm": 0.17829741537570953, + "learning_rate": 0.0003356419657281827, + "loss": 0.0136, + "num_input_tokens_seen": 171077632, + "step": 79195 + }, + { + "epoch": 12.920065252854812, + "grad_norm": 0.02999373897910118, + "learning_rate": 0.0003355747431198104, + "loss": 0.0856, + "num_input_tokens_seen": 171088640, + "step": 79200 + }, + { + "epoch": 12.920880913539968, + "grad_norm": 0.6304906010627747, + "learning_rate": 0.0003355075238437243, + "loss": 0.0576, + "num_input_tokens_seen": 171099200, + "step": 79205 + }, + { + "epoch": 12.921696574225122, + "grad_norm": 0.7816330790519714, + "learning_rate": 0.0003354403079012871, + "loss": 0.098, + "num_input_tokens_seen": 171109728, + "step": 79210 + }, + { + "epoch": 12.922512234910277, + "grad_norm": 0.4523005187511444, + "learning_rate": 0.0003353730952938606, + "loss": 0.0258, + "num_input_tokens_seen": 171120320, + "step": 79215 + }, + { + "epoch": 12.923327895595433, + "grad_norm": 0.0062120272777974606, + "learning_rate": 0.0003353058860228073, + "loss": 0.0028, + "num_input_tokens_seen": 171131328, + "step": 79220 + }, + { + "epoch": 12.924143556280587, + "grad_norm": 0.23003165423870087, + "learning_rate": 0.0003352386800894891, + "loss": 0.0433, + "num_input_tokens_seen": 171141984, + "step": 79225 + }, + { + "epoch": 12.924959216965743, + "grad_norm": 0.036548204720020294, + "learning_rate": 0.0003351714774952681, + "loss": 0.0078, + "num_input_tokens_seen": 171153152, + "step": 79230 + }, + { + "epoch": 12.925774877650896, + "grad_norm": 0.1406838595867157, + "learning_rate": 0.00033510427824150625, + "loss": 0.0133, + "num_input_tokens_seen": 171164288, + "step": 79235 + }, + { + "epoch": 12.926590538336052, + "grad_norm": 0.013899928890168667, + "learning_rate": 0.0003350370823295653, + "loss": 0.0089, + "num_input_tokens_seen": 171174944, + "step": 79240 + }, + { + "epoch": 12.927406199021208, + "grad_norm": 0.011088810861110687, + "learning_rate": 0.0003349698897608071, + "loss": 0.0179, + "num_input_tokens_seen": 171184864, + "step": 79245 + }, + { + "epoch": 12.928221859706362, + "grad_norm": 0.010716955177485943, + "learning_rate": 0.00033490270053659367, + "loss": 0.0034, + "num_input_tokens_seen": 171195840, + "step": 79250 + }, + { + "epoch": 12.929037520391518, + "grad_norm": 0.133016899228096, + "learning_rate": 0.0003348355146582862, + "loss": 0.0078, + "num_input_tokens_seen": 171207104, + "step": 79255 + }, + { + "epoch": 12.929853181076671, + "grad_norm": 0.013618758879601955, + "learning_rate": 0.00033476833212724676, + "loss": 0.0029, + "num_input_tokens_seen": 171217792, + "step": 79260 + }, + { + "epoch": 12.930668841761827, + "grad_norm": 0.005291712004691362, + "learning_rate": 0.0003347011529448365, + "loss": 0.0031, + "num_input_tokens_seen": 171229408, + "step": 79265 + }, + { + "epoch": 12.931484502446983, + "grad_norm": 0.0027662403881549835, + "learning_rate": 0.00033463397711241727, + "loss": 0.0076, + "num_input_tokens_seen": 171238784, + "step": 79270 + }, + { + "epoch": 12.932300163132137, + "grad_norm": 0.004883588757365942, + "learning_rate": 0.00033456680463135006, + "loss": 0.0758, + "num_input_tokens_seen": 171249344, + "step": 79275 + }, + { + "epoch": 12.933115823817293, + "grad_norm": 0.0021018683910369873, + "learning_rate": 0.00033449963550299646, + "loss": 0.0072, + "num_input_tokens_seen": 171259392, + "step": 79280 + }, + { + "epoch": 12.933931484502446, + "grad_norm": 0.12410213053226471, + "learning_rate": 0.00033443246972871785, + "loss": 0.0069, + "num_input_tokens_seen": 171269888, + "step": 79285 + }, + { + "epoch": 12.934747145187602, + "grad_norm": 0.04729204624891281, + "learning_rate": 0.000334365307309875, + "loss": 0.0045, + "num_input_tokens_seen": 171281696, + "step": 79290 + }, + { + "epoch": 12.935562805872756, + "grad_norm": 0.13005883991718292, + "learning_rate": 0.00033429814824782967, + "loss": 0.0113, + "num_input_tokens_seen": 171291232, + "step": 79295 + }, + { + "epoch": 12.936378466557912, + "grad_norm": 0.0018234510207548738, + "learning_rate": 0.0003342309925439423, + "loss": 0.1334, + "num_input_tokens_seen": 171300864, + "step": 79300 + }, + { + "epoch": 12.937194127243067, + "grad_norm": 0.007303939666599035, + "learning_rate": 0.0003341638401995744, + "loss": 0.0206, + "num_input_tokens_seen": 171311264, + "step": 79305 + }, + { + "epoch": 12.938009787928221, + "grad_norm": 0.23786698281764984, + "learning_rate": 0.0003340966912160864, + "loss": 0.02, + "num_input_tokens_seen": 171322080, + "step": 79310 + }, + { + "epoch": 12.938825448613377, + "grad_norm": 0.07854799926280975, + "learning_rate": 0.00033402954559483966, + "loss": 0.0721, + "num_input_tokens_seen": 171333760, + "step": 79315 + }, + { + "epoch": 12.939641109298531, + "grad_norm": 0.0014028697041794658, + "learning_rate": 0.0003339624033371945, + "loss": 0.1161, + "num_input_tokens_seen": 171345088, + "step": 79320 + }, + { + "epoch": 12.940456769983687, + "grad_norm": 0.40298980474472046, + "learning_rate": 0.00033389526444451215, + "loss": 0.0158, + "num_input_tokens_seen": 171355360, + "step": 79325 + }, + { + "epoch": 12.941272430668842, + "grad_norm": 0.00499499449506402, + "learning_rate": 0.00033382812891815267, + "loss": 0.0167, + "num_input_tokens_seen": 171365856, + "step": 79330 + }, + { + "epoch": 12.942088091353996, + "grad_norm": 0.0031798086129128933, + "learning_rate": 0.00033376099675947726, + "loss": 0.0016, + "num_input_tokens_seen": 171375904, + "step": 79335 + }, + { + "epoch": 12.942903752039152, + "grad_norm": 0.2039460837841034, + "learning_rate": 0.0003336938679698459, + "loss": 0.0109, + "num_input_tokens_seen": 171386144, + "step": 79340 + }, + { + "epoch": 12.943719412724306, + "grad_norm": 0.6421544551849365, + "learning_rate": 0.0003336267425506194, + "loss": 0.0613, + "num_input_tokens_seen": 171397248, + "step": 79345 + }, + { + "epoch": 12.944535073409462, + "grad_norm": 0.017462583258748055, + "learning_rate": 0.0003335596205031579, + "loss": 0.018, + "num_input_tokens_seen": 171407680, + "step": 79350 + }, + { + "epoch": 12.945350734094617, + "grad_norm": 0.1923362910747528, + "learning_rate": 0.00033349250182882205, + "loss": 0.0152, + "num_input_tokens_seen": 171418368, + "step": 79355 + }, + { + "epoch": 12.946166394779771, + "grad_norm": 0.8219704627990723, + "learning_rate": 0.0003334253865289717, + "loss": 0.0385, + "num_input_tokens_seen": 171428352, + "step": 79360 + }, + { + "epoch": 12.946982055464927, + "grad_norm": 0.002316342433914542, + "learning_rate": 0.00033335827460496725, + "loss": 0.0296, + "num_input_tokens_seen": 171439424, + "step": 79365 + }, + { + "epoch": 12.947797716150081, + "grad_norm": 0.00106413708999753, + "learning_rate": 0.0003332911660581688, + "loss": 0.0838, + "num_input_tokens_seen": 171449632, + "step": 79370 + }, + { + "epoch": 12.948613376835237, + "grad_norm": 0.5249331593513489, + "learning_rate": 0.0003332240608899363, + "loss": 0.0539, + "num_input_tokens_seen": 171458592, + "step": 79375 + }, + { + "epoch": 12.949429037520392, + "grad_norm": 0.003496425226330757, + "learning_rate": 0.0003331569591016298, + "loss": 0.0227, + "num_input_tokens_seen": 171469120, + "step": 79380 + }, + { + "epoch": 12.950244698205546, + "grad_norm": 0.11744219064712524, + "learning_rate": 0.0003330898606946091, + "loss": 0.0088, + "num_input_tokens_seen": 171480032, + "step": 79385 + }, + { + "epoch": 12.951060358890702, + "grad_norm": 0.02061287686228752, + "learning_rate": 0.0003330227656702342, + "loss": 0.0075, + "num_input_tokens_seen": 171491552, + "step": 79390 + }, + { + "epoch": 12.951876019575856, + "grad_norm": 0.001456076861359179, + "learning_rate": 0.00033295567402986476, + "loss": 0.0062, + "num_input_tokens_seen": 171502976, + "step": 79395 + }, + { + "epoch": 12.952691680261012, + "grad_norm": 0.8078159689903259, + "learning_rate": 0.0003328885857748605, + "loss": 0.1798, + "num_input_tokens_seen": 171513152, + "step": 79400 + }, + { + "epoch": 12.953507340946166, + "grad_norm": 0.008021415211260319, + "learning_rate": 0.00033282150090658115, + "loss": 0.1147, + "num_input_tokens_seen": 171524416, + "step": 79405 + }, + { + "epoch": 12.954323001631321, + "grad_norm": 0.007471402175724506, + "learning_rate": 0.0003327544194263861, + "loss": 0.0029, + "num_input_tokens_seen": 171535328, + "step": 79410 + }, + { + "epoch": 12.955138662316477, + "grad_norm": 0.017298957332968712, + "learning_rate": 0.0003326873413356347, + "loss": 0.005, + "num_input_tokens_seen": 171547008, + "step": 79415 + }, + { + "epoch": 12.955954323001631, + "grad_norm": 0.8735957145690918, + "learning_rate": 0.0003326202666356869, + "loss": 0.044, + "num_input_tokens_seen": 171559104, + "step": 79420 + }, + { + "epoch": 12.956769983686787, + "grad_norm": 0.4984961748123169, + "learning_rate": 0.0003325531953279015, + "loss": 0.0768, + "num_input_tokens_seen": 171569248, + "step": 79425 + }, + { + "epoch": 12.95758564437194, + "grad_norm": 0.00716983899474144, + "learning_rate": 0.0003324861274136382, + "loss": 0.0034, + "num_input_tokens_seen": 171579776, + "step": 79430 + }, + { + "epoch": 12.958401305057096, + "grad_norm": 0.5368052124977112, + "learning_rate": 0.0003324190628942558, + "loss": 0.0639, + "num_input_tokens_seen": 171590432, + "step": 79435 + }, + { + "epoch": 12.959216965742252, + "grad_norm": 0.01023983582854271, + "learning_rate": 0.000332352001771114, + "loss": 0.0163, + "num_input_tokens_seen": 171601760, + "step": 79440 + }, + { + "epoch": 12.960032626427406, + "grad_norm": 0.03688769415020943, + "learning_rate": 0.0003322849440455713, + "loss": 0.0031, + "num_input_tokens_seen": 171613376, + "step": 79445 + }, + { + "epoch": 12.960848287112562, + "grad_norm": 0.009764597751200199, + "learning_rate": 0.0003322178897189871, + "loss": 0.0192, + "num_input_tokens_seen": 171625056, + "step": 79450 + }, + { + "epoch": 12.961663947797716, + "grad_norm": 0.006400657817721367, + "learning_rate": 0.00033215083879272015, + "loss": 0.0113, + "num_input_tokens_seen": 171635712, + "step": 79455 + }, + { + "epoch": 12.962479608482871, + "grad_norm": 0.0073861065320670605, + "learning_rate": 0.00033208379126812947, + "loss": 0.0061, + "num_input_tokens_seen": 171646432, + "step": 79460 + }, + { + "epoch": 12.963295269168025, + "grad_norm": 0.0030887688044458628, + "learning_rate": 0.0003320167471465736, + "loss": 0.0068, + "num_input_tokens_seen": 171657472, + "step": 79465 + }, + { + "epoch": 12.964110929853181, + "grad_norm": 0.35004812479019165, + "learning_rate": 0.0003319497064294117, + "loss": 0.0852, + "num_input_tokens_seen": 171668864, + "step": 79470 + }, + { + "epoch": 12.964926590538337, + "grad_norm": 0.012314059771597385, + "learning_rate": 0.0003318826691180019, + "loss": 0.0392, + "num_input_tokens_seen": 171680832, + "step": 79475 + }, + { + "epoch": 12.96574225122349, + "grad_norm": 0.004713066387921572, + "learning_rate": 0.00033181563521370337, + "loss": 0.0612, + "num_input_tokens_seen": 171692640, + "step": 79480 + }, + { + "epoch": 12.966557911908646, + "grad_norm": 0.01903359591960907, + "learning_rate": 0.0003317486047178742, + "loss": 0.0807, + "num_input_tokens_seen": 171702752, + "step": 79485 + }, + { + "epoch": 12.9673735725938, + "grad_norm": 0.026393504813313484, + "learning_rate": 0.00033168157763187285, + "loss": 0.0161, + "num_input_tokens_seen": 171713120, + "step": 79490 + }, + { + "epoch": 12.968189233278956, + "grad_norm": 0.0038926454726606607, + "learning_rate": 0.0003316145539570581, + "loss": 0.0089, + "num_input_tokens_seen": 171725216, + "step": 79495 + }, + { + "epoch": 12.969004893964112, + "grad_norm": 0.0027679800987243652, + "learning_rate": 0.00033154753369478787, + "loss": 0.0049, + "num_input_tokens_seen": 171735072, + "step": 79500 + }, + { + "epoch": 12.969820554649266, + "grad_norm": 0.003413753118366003, + "learning_rate": 0.00033148051684642074, + "loss": 0.0104, + "num_input_tokens_seen": 171745536, + "step": 79505 + }, + { + "epoch": 12.970636215334421, + "grad_norm": 0.0008030119352042675, + "learning_rate": 0.00033141350341331447, + "loss": 0.0053, + "num_input_tokens_seen": 171756768, + "step": 79510 + }, + { + "epoch": 12.971451876019575, + "grad_norm": 0.09928394109010696, + "learning_rate": 0.00033134649339682773, + "loss": 0.0125, + "num_input_tokens_seen": 171767520, + "step": 79515 + }, + { + "epoch": 12.97226753670473, + "grad_norm": 0.4272739589214325, + "learning_rate": 0.000331279486798318, + "loss": 0.0468, + "num_input_tokens_seen": 171777920, + "step": 79520 + }, + { + "epoch": 12.973083197389887, + "grad_norm": 0.25293394923210144, + "learning_rate": 0.0003312124836191437, + "loss": 0.0519, + "num_input_tokens_seen": 171787648, + "step": 79525 + }, + { + "epoch": 12.97389885807504, + "grad_norm": 0.006494682747870684, + "learning_rate": 0.00033114548386066234, + "loss": 0.0326, + "num_input_tokens_seen": 171798176, + "step": 79530 + }, + { + "epoch": 12.974714518760196, + "grad_norm": 0.0020080001559108496, + "learning_rate": 0.00033107848752423203, + "loss": 0.1135, + "num_input_tokens_seen": 171808608, + "step": 79535 + }, + { + "epoch": 12.97553017944535, + "grad_norm": 0.010064424015581608, + "learning_rate": 0.0003310114946112105, + "loss": 0.0463, + "num_input_tokens_seen": 171820256, + "step": 79540 + }, + { + "epoch": 12.976345840130506, + "grad_norm": 0.03084593638777733, + "learning_rate": 0.00033094450512295535, + "loss": 0.0052, + "num_input_tokens_seen": 171830432, + "step": 79545 + }, + { + "epoch": 12.977161500815662, + "grad_norm": 0.0262883510440588, + "learning_rate": 0.00033087751906082436, + "loss": 0.0138, + "num_input_tokens_seen": 171841216, + "step": 79550 + }, + { + "epoch": 12.977977161500815, + "grad_norm": 0.005067694932222366, + "learning_rate": 0.000330810536426175, + "loss": 0.0054, + "num_input_tokens_seen": 171851616, + "step": 79555 + }, + { + "epoch": 12.978792822185971, + "grad_norm": 0.039184898138046265, + "learning_rate": 0.0003307435572203645, + "loss": 0.0037, + "num_input_tokens_seen": 171862912, + "step": 79560 + }, + { + "epoch": 12.979608482871125, + "grad_norm": 0.008203844539821148, + "learning_rate": 0.00033067658144475087, + "loss": 0.0184, + "num_input_tokens_seen": 171873504, + "step": 79565 + }, + { + "epoch": 12.98042414355628, + "grad_norm": 0.008222805336117744, + "learning_rate": 0.0003306096091006909, + "loss": 0.013, + "num_input_tokens_seen": 171883040, + "step": 79570 + }, + { + "epoch": 12.981239804241435, + "grad_norm": 0.009524693712592125, + "learning_rate": 0.0003305426401895423, + "loss": 0.0042, + "num_input_tokens_seen": 171893984, + "step": 79575 + }, + { + "epoch": 12.98205546492659, + "grad_norm": 0.17951497435569763, + "learning_rate": 0.0003304756747126618, + "loss": 0.0818, + "num_input_tokens_seen": 171905024, + "step": 79580 + }, + { + "epoch": 12.982871125611746, + "grad_norm": 0.08185935020446777, + "learning_rate": 0.00033040871267140705, + "loss": 0.0199, + "num_input_tokens_seen": 171915776, + "step": 79585 + }, + { + "epoch": 12.9836867862969, + "grad_norm": 0.01454191654920578, + "learning_rate": 0.00033034175406713464, + "loss": 0.0299, + "num_input_tokens_seen": 171926496, + "step": 79590 + }, + { + "epoch": 12.984502446982056, + "grad_norm": 0.001568776206113398, + "learning_rate": 0.0003302747989012019, + "loss": 0.0074, + "num_input_tokens_seen": 171936896, + "step": 79595 + }, + { + "epoch": 12.98531810766721, + "grad_norm": 0.1458302140235901, + "learning_rate": 0.00033020784717496576, + "loss": 0.0087, + "num_input_tokens_seen": 171947744, + "step": 79600 + }, + { + "epoch": 12.986133768352365, + "grad_norm": 0.0026035963091999292, + "learning_rate": 0.0003301408988897829, + "loss": 0.0048, + "num_input_tokens_seen": 171957216, + "step": 79605 + }, + { + "epoch": 12.986949429037521, + "grad_norm": 0.1463485211133957, + "learning_rate": 0.00033007395404701035, + "loss": 0.0555, + "num_input_tokens_seen": 171968640, + "step": 79610 + }, + { + "epoch": 12.987765089722675, + "grad_norm": 0.0034880635794252157, + "learning_rate": 0.0003300070126480045, + "loss": 0.0025, + "num_input_tokens_seen": 171979904, + "step": 79615 + }, + { + "epoch": 12.98858075040783, + "grad_norm": 0.0010988858994096518, + "learning_rate": 0.00032994007469412234, + "loss": 0.0043, + "num_input_tokens_seen": 171990944, + "step": 79620 + }, + { + "epoch": 12.989396411092985, + "grad_norm": 0.012164515443146229, + "learning_rate": 0.0003298731401867202, + "loss": 0.1356, + "num_input_tokens_seen": 172002432, + "step": 79625 + }, + { + "epoch": 12.99021207177814, + "grad_norm": 0.05736779049038887, + "learning_rate": 0.0003298062091271548, + "loss": 0.01, + "num_input_tokens_seen": 172013504, + "step": 79630 + }, + { + "epoch": 12.991027732463294, + "grad_norm": 0.018071817234158516, + "learning_rate": 0.00032973928151678233, + "loss": 0.0131, + "num_input_tokens_seen": 172025376, + "step": 79635 + }, + { + "epoch": 12.99184339314845, + "grad_norm": 0.0019351065857335925, + "learning_rate": 0.00032967235735695955, + "loss": 0.0056, + "num_input_tokens_seen": 172036320, + "step": 79640 + }, + { + "epoch": 12.992659053833606, + "grad_norm": 0.005769283045083284, + "learning_rate": 0.00032960543664904224, + "loss": 0.0284, + "num_input_tokens_seen": 172047584, + "step": 79645 + }, + { + "epoch": 12.99347471451876, + "grad_norm": 0.0038038466591387987, + "learning_rate": 0.0003295385193943872, + "loss": 0.0029, + "num_input_tokens_seen": 172059072, + "step": 79650 + }, + { + "epoch": 12.994290375203915, + "grad_norm": 0.004908800590783358, + "learning_rate": 0.00032947160559435, + "loss": 0.0057, + "num_input_tokens_seen": 172069920, + "step": 79655 + }, + { + "epoch": 12.99510603588907, + "grad_norm": 0.0024959484580904245, + "learning_rate": 0.00032940469525028735, + "loss": 0.0516, + "num_input_tokens_seen": 172079840, + "step": 79660 + }, + { + "epoch": 12.995921696574225, + "grad_norm": 0.07145286351442337, + "learning_rate": 0.0003293377883635547, + "loss": 0.0155, + "num_input_tokens_seen": 172092320, + "step": 79665 + }, + { + "epoch": 12.99673735725938, + "grad_norm": 1.0221548080444336, + "learning_rate": 0.0003292708849355085, + "loss": 0.0672, + "num_input_tokens_seen": 172102336, + "step": 79670 + }, + { + "epoch": 12.997553017944535, + "grad_norm": 0.007001963909715414, + "learning_rate": 0.0003292039849675042, + "loss": 0.0359, + "num_input_tokens_seen": 172113120, + "step": 79675 + }, + { + "epoch": 12.99836867862969, + "grad_norm": 0.009530258364975452, + "learning_rate": 0.0003291370884608979, + "loss": 0.0039, + "num_input_tokens_seen": 172123136, + "step": 79680 + }, + { + "epoch": 12.999184339314844, + "grad_norm": 0.004639983177185059, + "learning_rate": 0.00032907019541704533, + "loss": 0.0163, + "num_input_tokens_seen": 172133984, + "step": 79685 + }, + { + "epoch": 13.0, + "grad_norm": 0.06852234899997711, + "learning_rate": 0.00032900330583730196, + "loss": 0.0084, + "num_input_tokens_seen": 172144032, + "step": 79690 + }, + { + "epoch": 13.0, + "eval_loss": 0.19773797690868378, + "eval_runtime": 104.1935, + "eval_samples_per_second": 26.153, + "eval_steps_per_second": 6.546, + "num_input_tokens_seen": 172144032, + "step": 79690 + }, + { + "epoch": 13.000815660685156, + "grad_norm": 0.009670889936387539, + "learning_rate": 0.0003289364197230236, + "loss": 0.0033, + "num_input_tokens_seen": 172156000, + "step": 79695 + }, + { + "epoch": 13.00163132137031, + "grad_norm": 0.004153914283961058, + "learning_rate": 0.0003288695370755657, + "loss": 0.0047, + "num_input_tokens_seen": 172165952, + "step": 79700 + }, + { + "epoch": 13.002446982055465, + "grad_norm": 0.016005946323275566, + "learning_rate": 0.0003288026578962836, + "loss": 0.0043, + "num_input_tokens_seen": 172177600, + "step": 79705 + }, + { + "epoch": 13.00326264274062, + "grad_norm": 0.009066256694495678, + "learning_rate": 0.0003287357821865329, + "loss": 0.0522, + "num_input_tokens_seen": 172187872, + "step": 79710 + }, + { + "epoch": 13.004078303425775, + "grad_norm": 0.011497425846755505, + "learning_rate": 0.0003286689099476689, + "loss": 0.1064, + "num_input_tokens_seen": 172199136, + "step": 79715 + }, + { + "epoch": 13.00489396411093, + "grad_norm": 0.02010742947459221, + "learning_rate": 0.00032860204118104674, + "loss": 0.0035, + "num_input_tokens_seen": 172210720, + "step": 79720 + }, + { + "epoch": 13.005709624796085, + "grad_norm": 0.004298577085137367, + "learning_rate": 0.00032853517588802173, + "loss": 0.004, + "num_input_tokens_seen": 172221984, + "step": 79725 + }, + { + "epoch": 13.00652528548124, + "grad_norm": 0.00979915913194418, + "learning_rate": 0.0003284683140699487, + "loss": 0.0051, + "num_input_tokens_seen": 172231520, + "step": 79730 + }, + { + "epoch": 13.007340946166394, + "grad_norm": 0.009911553002893925, + "learning_rate": 0.00032840145572818314, + "loss": 0.0046, + "num_input_tokens_seen": 172242560, + "step": 79735 + }, + { + "epoch": 13.00815660685155, + "grad_norm": 0.00278778956271708, + "learning_rate": 0.0003283346008640795, + "loss": 0.0044, + "num_input_tokens_seen": 172252864, + "step": 79740 + }, + { + "epoch": 13.008972267536704, + "grad_norm": 0.00484444759786129, + "learning_rate": 0.0003282677494789933, + "loss": 0.0034, + "num_input_tokens_seen": 172262496, + "step": 79745 + }, + { + "epoch": 13.00978792822186, + "grad_norm": 0.004565055947750807, + "learning_rate": 0.0003282009015742787, + "loss": 0.0043, + "num_input_tokens_seen": 172273376, + "step": 79750 + }, + { + "epoch": 13.010603588907015, + "grad_norm": 0.010577378794550896, + "learning_rate": 0.00032813405715129097, + "loss": 0.003, + "num_input_tokens_seen": 172284256, + "step": 79755 + }, + { + "epoch": 13.01141924959217, + "grad_norm": 0.3631739616394043, + "learning_rate": 0.00032806721621138444, + "loss": 0.0748, + "num_input_tokens_seen": 172295232, + "step": 79760 + }, + { + "epoch": 13.012234910277325, + "grad_norm": 0.038808513432741165, + "learning_rate": 0.00032800037875591406, + "loss": 0.0509, + "num_input_tokens_seen": 172306272, + "step": 79765 + }, + { + "epoch": 13.013050570962479, + "grad_norm": 0.016878092661499977, + "learning_rate": 0.000327933544786234, + "loss": 0.0085, + "num_input_tokens_seen": 172316800, + "step": 79770 + }, + { + "epoch": 13.013866231647635, + "grad_norm": 0.08253694325685501, + "learning_rate": 0.00032786671430369915, + "loss": 0.0082, + "num_input_tokens_seen": 172328064, + "step": 79775 + }, + { + "epoch": 13.01468189233279, + "grad_norm": 0.004199854098260403, + "learning_rate": 0.0003277998873096635, + "loss": 0.1454, + "num_input_tokens_seen": 172338016, + "step": 79780 + }, + { + "epoch": 13.015497553017944, + "grad_norm": 0.3190031051635742, + "learning_rate": 0.00032773306380548176, + "loss": 0.1243, + "num_input_tokens_seen": 172349440, + "step": 79785 + }, + { + "epoch": 13.0163132137031, + "grad_norm": 0.021624628454446793, + "learning_rate": 0.0003276662437925079, + "loss": 0.0026, + "num_input_tokens_seen": 172359776, + "step": 79790 + }, + { + "epoch": 13.017128874388254, + "grad_norm": 0.021654745563864708, + "learning_rate": 0.0003275994272720963, + "loss": 0.0032, + "num_input_tokens_seen": 172370560, + "step": 79795 + }, + { + "epoch": 13.01794453507341, + "grad_norm": 0.06402740627527237, + "learning_rate": 0.0003275326142456009, + "loss": 0.0157, + "num_input_tokens_seen": 172380992, + "step": 79800 + }, + { + "epoch": 13.018760195758565, + "grad_norm": 0.004721821751445532, + "learning_rate": 0.00032746580471437606, + "loss": 0.0225, + "num_input_tokens_seen": 172392928, + "step": 79805 + }, + { + "epoch": 13.01957585644372, + "grad_norm": 0.0027865879237651825, + "learning_rate": 0.0003273989986797753, + "loss": 0.0153, + "num_input_tokens_seen": 172401888, + "step": 79810 + }, + { + "epoch": 13.020391517128875, + "grad_norm": 0.007285828702151775, + "learning_rate": 0.00032733219614315283, + "loss": 0.0126, + "num_input_tokens_seen": 172412352, + "step": 79815 + }, + { + "epoch": 13.021207177814029, + "grad_norm": 0.008828174322843552, + "learning_rate": 0.00032726539710586266, + "loss": 0.0189, + "num_input_tokens_seen": 172422720, + "step": 79820 + }, + { + "epoch": 13.022022838499185, + "grad_norm": 0.004288358148187399, + "learning_rate": 0.0003271986015692582, + "loss": 0.0029, + "num_input_tokens_seen": 172431584, + "step": 79825 + }, + { + "epoch": 13.022838499184338, + "grad_norm": 0.05239563062787056, + "learning_rate": 0.0003271318095346934, + "loss": 0.1232, + "num_input_tokens_seen": 172444032, + "step": 79830 + }, + { + "epoch": 13.023654159869494, + "grad_norm": 0.0009064357727766037, + "learning_rate": 0.00032706502100352165, + "loss": 0.0035, + "num_input_tokens_seen": 172454976, + "step": 79835 + }, + { + "epoch": 13.02446982055465, + "grad_norm": 0.03271019458770752, + "learning_rate": 0.00032699823597709675, + "loss": 0.0663, + "num_input_tokens_seen": 172465600, + "step": 79840 + }, + { + "epoch": 13.025285481239804, + "grad_norm": 0.020236380398273468, + "learning_rate": 0.00032693145445677194, + "loss": 0.0086, + "num_input_tokens_seen": 172476864, + "step": 79845 + }, + { + "epoch": 13.02610114192496, + "grad_norm": 0.020625924691557884, + "learning_rate": 0.00032686467644390085, + "loss": 0.0027, + "num_input_tokens_seen": 172487328, + "step": 79850 + }, + { + "epoch": 13.026916802610113, + "grad_norm": 0.0036424091085791588, + "learning_rate": 0.00032679790193983666, + "loss": 0.0074, + "num_input_tokens_seen": 172496320, + "step": 79855 + }, + { + "epoch": 13.02773246329527, + "grad_norm": 0.025376515462994576, + "learning_rate": 0.0003267311309459328, + "loss": 0.0243, + "num_input_tokens_seen": 172505696, + "step": 79860 + }, + { + "epoch": 13.028548123980425, + "grad_norm": 0.0028710965998470783, + "learning_rate": 0.00032666436346354236, + "loss": 0.144, + "num_input_tokens_seen": 172516896, + "step": 79865 + }, + { + "epoch": 13.029363784665579, + "grad_norm": 0.005142997018992901, + "learning_rate": 0.0003265975994940185, + "loss": 0.0068, + "num_input_tokens_seen": 172527360, + "step": 79870 + }, + { + "epoch": 13.030179445350734, + "grad_norm": 0.00245002587325871, + "learning_rate": 0.00032653083903871406, + "loss": 0.0029, + "num_input_tokens_seen": 172537728, + "step": 79875 + }, + { + "epoch": 13.030995106035888, + "grad_norm": 0.00920133013278246, + "learning_rate": 0.0003264640820989825, + "loss": 0.003, + "num_input_tokens_seen": 172549344, + "step": 79880 + }, + { + "epoch": 13.031810766721044, + "grad_norm": 0.28441938757896423, + "learning_rate": 0.0003263973286761762, + "loss": 0.0296, + "num_input_tokens_seen": 172558880, + "step": 79885 + }, + { + "epoch": 13.0326264274062, + "grad_norm": 0.00399240804836154, + "learning_rate": 0.0003263305787716486, + "loss": 0.1267, + "num_input_tokens_seen": 172570752, + "step": 79890 + }, + { + "epoch": 13.033442088091354, + "grad_norm": 0.024981455877423286, + "learning_rate": 0.00032626383238675184, + "loss": 0.04, + "num_input_tokens_seen": 172581856, + "step": 79895 + }, + { + "epoch": 13.03425774877651, + "grad_norm": 0.0030545040499418974, + "learning_rate": 0.0003261970895228391, + "loss": 0.0176, + "num_input_tokens_seen": 172592256, + "step": 79900 + }, + { + "epoch": 13.035073409461663, + "grad_norm": 0.03148532286286354, + "learning_rate": 0.00032613035018126267, + "loss": 0.0069, + "num_input_tokens_seen": 172603104, + "step": 79905 + }, + { + "epoch": 13.035889070146819, + "grad_norm": 0.07724172621965408, + "learning_rate": 0.0003260636143633755, + "loss": 0.0066, + "num_input_tokens_seen": 172613312, + "step": 79910 + }, + { + "epoch": 13.036704730831975, + "grad_norm": 0.003198714228346944, + "learning_rate": 0.0003259968820705296, + "loss": 0.0024, + "num_input_tokens_seen": 172622912, + "step": 79915 + }, + { + "epoch": 13.037520391517129, + "grad_norm": 0.004381487611681223, + "learning_rate": 0.0003259301533040776, + "loss": 0.0075, + "num_input_tokens_seen": 172634112, + "step": 79920 + }, + { + "epoch": 13.038336052202284, + "grad_norm": 0.4412562847137451, + "learning_rate": 0.00032586342806537207, + "loss": 0.0686, + "num_input_tokens_seen": 172644800, + "step": 79925 + }, + { + "epoch": 13.039151712887438, + "grad_norm": 0.002459079958498478, + "learning_rate": 0.0003257967063557649, + "loss": 0.0045, + "num_input_tokens_seen": 172655072, + "step": 79930 + }, + { + "epoch": 13.039967373572594, + "grad_norm": 0.026812493801116943, + "learning_rate": 0.0003257299881766087, + "loss": 0.0036, + "num_input_tokens_seen": 172666912, + "step": 79935 + }, + { + "epoch": 13.040783034257748, + "grad_norm": 0.0030856519006192684, + "learning_rate": 0.0003256632735292551, + "loss": 0.0047, + "num_input_tokens_seen": 172678368, + "step": 79940 + }, + { + "epoch": 13.041598694942904, + "grad_norm": 0.0029760266188532114, + "learning_rate": 0.00032559656241505663, + "loss": 0.0033, + "num_input_tokens_seen": 172688224, + "step": 79945 + }, + { + "epoch": 13.04241435562806, + "grad_norm": 0.005138032604008913, + "learning_rate": 0.0003255298548353649, + "loss": 0.0982, + "num_input_tokens_seen": 172698784, + "step": 79950 + }, + { + "epoch": 13.043230016313213, + "grad_norm": 0.004156198818236589, + "learning_rate": 0.0003254631507915322, + "loss": 0.0042, + "num_input_tokens_seen": 172708480, + "step": 79955 + }, + { + "epoch": 13.044045676998369, + "grad_norm": 0.001793356379494071, + "learning_rate": 0.00032539645028490993, + "loss": 0.0283, + "num_input_tokens_seen": 172718080, + "step": 79960 + }, + { + "epoch": 13.044861337683523, + "grad_norm": 0.0031504526268690825, + "learning_rate": 0.0003253297533168503, + "loss": 0.0066, + "num_input_tokens_seen": 172728864, + "step": 79965 + }, + { + "epoch": 13.045676998368679, + "grad_norm": 0.003528774017468095, + "learning_rate": 0.0003252630598887046, + "loss": 0.0865, + "num_input_tokens_seen": 172740320, + "step": 79970 + }, + { + "epoch": 13.046492659053834, + "grad_norm": 0.16039009392261505, + "learning_rate": 0.00032519637000182495, + "loss": 0.0091, + "num_input_tokens_seen": 172752064, + "step": 79975 + }, + { + "epoch": 13.047308319738988, + "grad_norm": 0.007757482118904591, + "learning_rate": 0.0003251296836575623, + "loss": 0.0066, + "num_input_tokens_seen": 172763072, + "step": 79980 + }, + { + "epoch": 13.048123980424144, + "grad_norm": 0.013547541573643684, + "learning_rate": 0.00032506300085726874, + "loss": 0.0147, + "num_input_tokens_seen": 172773632, + "step": 79985 + }, + { + "epoch": 13.048939641109298, + "grad_norm": 0.0315958634018898, + "learning_rate": 0.0003249963216022951, + "loss": 0.0067, + "num_input_tokens_seen": 172784128, + "step": 79990 + }, + { + "epoch": 13.049755301794454, + "grad_norm": 0.01640213653445244, + "learning_rate": 0.0003249296458939932, + "loss": 0.0266, + "num_input_tokens_seen": 172794368, + "step": 79995 + }, + { + "epoch": 13.05057096247961, + "grad_norm": 0.012024256400763988, + "learning_rate": 0.0003248629737337141, + "loss": 0.0068, + "num_input_tokens_seen": 172805600, + "step": 80000 + }, + { + "epoch": 13.051386623164763, + "grad_norm": 0.004929613322019577, + "learning_rate": 0.000324796305122809, + "loss": 0.013, + "num_input_tokens_seen": 172815296, + "step": 80005 + }, + { + "epoch": 13.052202283849919, + "grad_norm": 0.028229599818587303, + "learning_rate": 0.000324729640062629, + "loss": 0.0078, + "num_input_tokens_seen": 172825280, + "step": 80010 + }, + { + "epoch": 13.053017944535073, + "grad_norm": 0.002132690977305174, + "learning_rate": 0.0003246629785545252, + "loss": 0.0037, + "num_input_tokens_seen": 172836576, + "step": 80015 + }, + { + "epoch": 13.053833605220229, + "grad_norm": 0.00830269604921341, + "learning_rate": 0.0003245963205998485, + "loss": 0.0079, + "num_input_tokens_seen": 172847296, + "step": 80020 + }, + { + "epoch": 13.054649265905383, + "grad_norm": 0.009801110252737999, + "learning_rate": 0.00032452966619994997, + "loss": 0.0026, + "num_input_tokens_seen": 172857440, + "step": 80025 + }, + { + "epoch": 13.055464926590538, + "grad_norm": 0.03717103973031044, + "learning_rate": 0.00032446301535618034, + "loss": 0.0036, + "num_input_tokens_seen": 172869344, + "step": 80030 + }, + { + "epoch": 13.056280587275694, + "grad_norm": 0.01708907075226307, + "learning_rate": 0.0003243963680698904, + "loss": 0.0031, + "num_input_tokens_seen": 172878528, + "step": 80035 + }, + { + "epoch": 13.057096247960848, + "grad_norm": 0.034726761281490326, + "learning_rate": 0.0003243297243424308, + "loss": 0.0395, + "num_input_tokens_seen": 172888992, + "step": 80040 + }, + { + "epoch": 13.057911908646004, + "grad_norm": 0.0035679759457707405, + "learning_rate": 0.0003242630841751522, + "loss": 0.0021, + "num_input_tokens_seen": 172899776, + "step": 80045 + }, + { + "epoch": 13.058727569331158, + "grad_norm": 0.044592756778001785, + "learning_rate": 0.00032419644756940527, + "loss": 0.0035, + "num_input_tokens_seen": 172910368, + "step": 80050 + }, + { + "epoch": 13.059543230016313, + "grad_norm": 0.0022370279766619205, + "learning_rate": 0.0003241298145265401, + "loss": 0.0049, + "num_input_tokens_seen": 172921664, + "step": 80055 + }, + { + "epoch": 13.060358890701469, + "grad_norm": 0.002283082576468587, + "learning_rate": 0.00032406318504790753, + "loss": 0.0028, + "num_input_tokens_seen": 172933472, + "step": 80060 + }, + { + "epoch": 13.061174551386623, + "grad_norm": 0.3229667842388153, + "learning_rate": 0.0003239965591348576, + "loss": 0.0932, + "num_input_tokens_seen": 172943872, + "step": 80065 + }, + { + "epoch": 13.061990212071779, + "grad_norm": 0.01312661450356245, + "learning_rate": 0.00032392993678874085, + "loss": 0.0307, + "num_input_tokens_seen": 172954912, + "step": 80070 + }, + { + "epoch": 13.062805872756933, + "grad_norm": 0.0027966343332082033, + "learning_rate": 0.0003238633180109071, + "loss": 0.0726, + "num_input_tokens_seen": 172964736, + "step": 80075 + }, + { + "epoch": 13.063621533442088, + "grad_norm": 0.00816719327121973, + "learning_rate": 0.00032379670280270677, + "loss": 0.0045, + "num_input_tokens_seen": 172974848, + "step": 80080 + }, + { + "epoch": 13.064437194127244, + "grad_norm": 0.0017934865318238735, + "learning_rate": 0.0003237300911654897, + "loss": 0.0526, + "num_input_tokens_seen": 172984448, + "step": 80085 + }, + { + "epoch": 13.065252854812398, + "grad_norm": 0.33631932735443115, + "learning_rate": 0.0003236634831006061, + "loss": 0.0124, + "num_input_tokens_seen": 172995776, + "step": 80090 + }, + { + "epoch": 13.066068515497554, + "grad_norm": 0.002418245654553175, + "learning_rate": 0.0003235968786094055, + "loss": 0.0054, + "num_input_tokens_seen": 173006016, + "step": 80095 + }, + { + "epoch": 13.066884176182707, + "grad_norm": 0.5294049978256226, + "learning_rate": 0.0003235302776932382, + "loss": 0.0995, + "num_input_tokens_seen": 173016960, + "step": 80100 + }, + { + "epoch": 13.067699836867863, + "grad_norm": 0.38888972997665405, + "learning_rate": 0.00032346368035345344, + "loss": 0.2369, + "num_input_tokens_seen": 173027648, + "step": 80105 + }, + { + "epoch": 13.068515497553017, + "grad_norm": 0.6325135231018066, + "learning_rate": 0.0003233970865914013, + "loss": 0.216, + "num_input_tokens_seen": 173039424, + "step": 80110 + }, + { + "epoch": 13.069331158238173, + "grad_norm": 0.0035932499449700117, + "learning_rate": 0.0003233304964084311, + "loss": 0.0453, + "num_input_tokens_seen": 173050048, + "step": 80115 + }, + { + "epoch": 13.070146818923329, + "grad_norm": 0.02140737511217594, + "learning_rate": 0.0003232639098058927, + "loss": 0.0095, + "num_input_tokens_seen": 173059904, + "step": 80120 + }, + { + "epoch": 13.070962479608482, + "grad_norm": 0.07032874971628189, + "learning_rate": 0.00032319732678513514, + "loss": 0.007, + "num_input_tokens_seen": 173070880, + "step": 80125 + }, + { + "epoch": 13.071778140293638, + "grad_norm": 0.0047665243037045, + "learning_rate": 0.00032313074734750813, + "loss": 0.0199, + "num_input_tokens_seen": 173081280, + "step": 80130 + }, + { + "epoch": 13.072593800978792, + "grad_norm": 0.0695071592926979, + "learning_rate": 0.000323064171494361, + "loss": 0.0165, + "num_input_tokens_seen": 173092960, + "step": 80135 + }, + { + "epoch": 13.073409461663948, + "grad_norm": 0.08060871809720993, + "learning_rate": 0.00032299759922704277, + "loss": 0.0156, + "num_input_tokens_seen": 173105216, + "step": 80140 + }, + { + "epoch": 13.074225122349104, + "grad_norm": 0.008816416375339031, + "learning_rate": 0.0003229310305469029, + "loss": 0.0043, + "num_input_tokens_seen": 173116352, + "step": 80145 + }, + { + "epoch": 13.075040783034257, + "grad_norm": 0.026547571644186974, + "learning_rate": 0.00032286446545529016, + "loss": 0.0114, + "num_input_tokens_seen": 173127008, + "step": 80150 + }, + { + "epoch": 13.075856443719413, + "grad_norm": 1.0129142999649048, + "learning_rate": 0.0003227979039535538, + "loss": 0.0597, + "num_input_tokens_seen": 173137856, + "step": 80155 + }, + { + "epoch": 13.076672104404567, + "grad_norm": 0.07595764100551605, + "learning_rate": 0.0003227313460430427, + "loss": 0.0088, + "num_input_tokens_seen": 173149952, + "step": 80160 + }, + { + "epoch": 13.077487765089723, + "grad_norm": 0.029978493228554726, + "learning_rate": 0.0003226647917251058, + "loss": 0.0054, + "num_input_tokens_seen": 173160512, + "step": 80165 + }, + { + "epoch": 13.078303425774878, + "grad_norm": 0.01935637556016445, + "learning_rate": 0.0003225982410010918, + "loss": 0.0049, + "num_input_tokens_seen": 173171872, + "step": 80170 + }, + { + "epoch": 13.079119086460032, + "grad_norm": 0.06953626126050949, + "learning_rate": 0.00032253169387234953, + "loss": 0.0098, + "num_input_tokens_seen": 173183552, + "step": 80175 + }, + { + "epoch": 13.079934747145188, + "grad_norm": 1.5633295774459839, + "learning_rate": 0.0003224651503402276, + "loss": 0.06, + "num_input_tokens_seen": 173194144, + "step": 80180 + }, + { + "epoch": 13.080750407830342, + "grad_norm": 0.01591937430202961, + "learning_rate": 0.00032239861040607464, + "loss": 0.0083, + "num_input_tokens_seen": 173205024, + "step": 80185 + }, + { + "epoch": 13.081566068515498, + "grad_norm": 0.00395574327558279, + "learning_rate": 0.0003223320740712391, + "loss": 0.0015, + "num_input_tokens_seen": 173214176, + "step": 80190 + }, + { + "epoch": 13.082381729200652, + "grad_norm": 0.5557219386100769, + "learning_rate": 0.0003222655413370696, + "loss": 0.061, + "num_input_tokens_seen": 173224832, + "step": 80195 + }, + { + "epoch": 13.083197389885807, + "grad_norm": 0.05771186947822571, + "learning_rate": 0.00032219901220491417, + "loss": 0.0263, + "num_input_tokens_seen": 173234528, + "step": 80200 + }, + { + "epoch": 13.084013050570963, + "grad_norm": 0.005979506764560938, + "learning_rate": 0.0003221324866761215, + "loss": 0.0042, + "num_input_tokens_seen": 173245248, + "step": 80205 + }, + { + "epoch": 13.084828711256117, + "grad_norm": 0.029441453516483307, + "learning_rate": 0.0003220659647520395, + "loss": 0.0248, + "num_input_tokens_seen": 173255776, + "step": 80210 + }, + { + "epoch": 13.085644371941273, + "grad_norm": 0.7436314225196838, + "learning_rate": 0.00032199944643401655, + "loss": 0.0732, + "num_input_tokens_seen": 173266272, + "step": 80215 + }, + { + "epoch": 13.086460032626427, + "grad_norm": 0.006093572359532118, + "learning_rate": 0.00032193293172340056, + "loss": 0.0016, + "num_input_tokens_seen": 173276064, + "step": 80220 + }, + { + "epoch": 13.087275693311582, + "grad_norm": 0.07572884112596512, + "learning_rate": 0.0003218664206215397, + "loss": 0.0072, + "num_input_tokens_seen": 173286880, + "step": 80225 + }, + { + "epoch": 13.088091353996738, + "grad_norm": 0.0063800751231610775, + "learning_rate": 0.00032179991312978164, + "loss": 0.0129, + "num_input_tokens_seen": 173297728, + "step": 80230 + }, + { + "epoch": 13.088907014681892, + "grad_norm": 0.014107972383499146, + "learning_rate": 0.00032173340924947436, + "loss": 0.0048, + "num_input_tokens_seen": 173308512, + "step": 80235 + }, + { + "epoch": 13.089722675367048, + "grad_norm": 0.03823497146368027, + "learning_rate": 0.00032166690898196594, + "loss": 0.0065, + "num_input_tokens_seen": 173320128, + "step": 80240 + }, + { + "epoch": 13.090538336052202, + "grad_norm": 0.021396825090050697, + "learning_rate": 0.0003216004123286036, + "loss": 0.0048, + "num_input_tokens_seen": 173331168, + "step": 80245 + }, + { + "epoch": 13.091353996737357, + "grad_norm": 0.02987091988325119, + "learning_rate": 0.0003215339192907355, + "loss": 0.0152, + "num_input_tokens_seen": 173341536, + "step": 80250 + }, + { + "epoch": 13.092169657422513, + "grad_norm": 0.3899349570274353, + "learning_rate": 0.00032146742986970865, + "loss": 0.0587, + "num_input_tokens_seen": 173352000, + "step": 80255 + }, + { + "epoch": 13.092985318107667, + "grad_norm": 0.06607574969530106, + "learning_rate": 0.000321400944066871, + "loss": 0.1516, + "num_input_tokens_seen": 173362368, + "step": 80260 + }, + { + "epoch": 13.093800978792823, + "grad_norm": 0.021596498787403107, + "learning_rate": 0.00032133446188356964, + "loss": 0.0084, + "num_input_tokens_seen": 173374016, + "step": 80265 + }, + { + "epoch": 13.094616639477977, + "grad_norm": 0.0013468860415741801, + "learning_rate": 0.00032126798332115223, + "loss": 0.0017, + "num_input_tokens_seen": 173384160, + "step": 80270 + }, + { + "epoch": 13.095432300163132, + "grad_norm": 0.09484121203422546, + "learning_rate": 0.00032120150838096576, + "loss": 0.0071, + "num_input_tokens_seen": 173395296, + "step": 80275 + }, + { + "epoch": 13.096247960848286, + "grad_norm": 0.0008138703415170312, + "learning_rate": 0.00032113503706435767, + "loss": 0.0039, + "num_input_tokens_seen": 173406304, + "step": 80280 + }, + { + "epoch": 13.097063621533442, + "grad_norm": 0.11847636848688126, + "learning_rate": 0.00032106856937267475, + "loss": 0.0186, + "num_input_tokens_seen": 173416160, + "step": 80285 + }, + { + "epoch": 13.097879282218598, + "grad_norm": 0.029739893972873688, + "learning_rate": 0.00032100210530726446, + "loss": 0.1203, + "num_input_tokens_seen": 173427328, + "step": 80290 + }, + { + "epoch": 13.098694942903752, + "grad_norm": 0.0190031286329031, + "learning_rate": 0.00032093564486947347, + "loss": 0.0093, + "num_input_tokens_seen": 173438784, + "step": 80295 + }, + { + "epoch": 13.099510603588907, + "grad_norm": 0.012608149088919163, + "learning_rate": 0.0003208691880606488, + "loss": 0.0026, + "num_input_tokens_seen": 173450464, + "step": 80300 + }, + { + "epoch": 13.100326264274061, + "grad_norm": 0.5554617643356323, + "learning_rate": 0.0003208027348821373, + "loss": 0.0652, + "num_input_tokens_seen": 173461664, + "step": 80305 + }, + { + "epoch": 13.101141924959217, + "grad_norm": 0.026764320209622383, + "learning_rate": 0.00032073628533528574, + "loss": 0.0069, + "num_input_tokens_seen": 173472704, + "step": 80310 + }, + { + "epoch": 13.101957585644373, + "grad_norm": 0.014278876595199108, + "learning_rate": 0.0003206698394214407, + "loss": 0.0062, + "num_input_tokens_seen": 173483616, + "step": 80315 + }, + { + "epoch": 13.102773246329527, + "grad_norm": 0.009230894036591053, + "learning_rate": 0.00032060339714194897, + "loss": 0.0322, + "num_input_tokens_seen": 173495104, + "step": 80320 + }, + { + "epoch": 13.103588907014682, + "grad_norm": 0.010614125989377499, + "learning_rate": 0.0003205369584981568, + "loss": 0.0987, + "num_input_tokens_seen": 173505376, + "step": 80325 + }, + { + "epoch": 13.104404567699836, + "grad_norm": 0.008995750918984413, + "learning_rate": 0.000320470523491411, + "loss": 0.027, + "num_input_tokens_seen": 173517024, + "step": 80330 + }, + { + "epoch": 13.105220228384992, + "grad_norm": 0.38214293122291565, + "learning_rate": 0.00032040409212305765, + "loss": 0.0829, + "num_input_tokens_seen": 173528256, + "step": 80335 + }, + { + "epoch": 13.106035889070148, + "grad_norm": 0.00634557381272316, + "learning_rate": 0.0003203376643944433, + "loss": 0.0079, + "num_input_tokens_seen": 173538240, + "step": 80340 + }, + { + "epoch": 13.106851549755302, + "grad_norm": 0.009608942084014416, + "learning_rate": 0.0003202712403069141, + "loss": 0.0064, + "num_input_tokens_seen": 173548160, + "step": 80345 + }, + { + "epoch": 13.107667210440457, + "grad_norm": 0.010670358315110207, + "learning_rate": 0.00032020481986181606, + "loss": 0.0033, + "num_input_tokens_seen": 173559776, + "step": 80350 + }, + { + "epoch": 13.108482871125611, + "grad_norm": 0.019212661311030388, + "learning_rate": 0.0003201384030604957, + "loss": 0.0661, + "num_input_tokens_seen": 173570368, + "step": 80355 + }, + { + "epoch": 13.109298531810767, + "grad_norm": 0.3519008159637451, + "learning_rate": 0.0003200719899042985, + "loss": 0.0225, + "num_input_tokens_seen": 173581792, + "step": 80360 + }, + { + "epoch": 13.11011419249592, + "grad_norm": 0.09790132939815521, + "learning_rate": 0.00032000558039457094, + "loss": 0.011, + "num_input_tokens_seen": 173592928, + "step": 80365 + }, + { + "epoch": 13.110929853181077, + "grad_norm": 0.008577114902436733, + "learning_rate": 0.0003199391745326585, + "loss": 0.0028, + "num_input_tokens_seen": 173604032, + "step": 80370 + }, + { + "epoch": 13.111745513866232, + "grad_norm": 0.006519824266433716, + "learning_rate": 0.0003198727723199072, + "loss": 0.0074, + "num_input_tokens_seen": 173614848, + "step": 80375 + }, + { + "epoch": 13.112561174551386, + "grad_norm": 0.001890502288006246, + "learning_rate": 0.0003198063737576625, + "loss": 0.0039, + "num_input_tokens_seen": 173625472, + "step": 80380 + }, + { + "epoch": 13.113376835236542, + "grad_norm": 0.2293672114610672, + "learning_rate": 0.0003197399788472705, + "loss": 0.0216, + "num_input_tokens_seen": 173637376, + "step": 80385 + }, + { + "epoch": 13.114192495921696, + "grad_norm": 0.005673011764883995, + "learning_rate": 0.0003196735875900762, + "loss": 0.0033, + "num_input_tokens_seen": 173648800, + "step": 80390 + }, + { + "epoch": 13.115008156606851, + "grad_norm": 0.004373501054942608, + "learning_rate": 0.00031960719998742567, + "loss": 0.0113, + "num_input_tokens_seen": 173659200, + "step": 80395 + }, + { + "epoch": 13.115823817292007, + "grad_norm": 0.024961533024907112, + "learning_rate": 0.0003195408160406638, + "loss": 0.0129, + "num_input_tokens_seen": 173671168, + "step": 80400 + }, + { + "epoch": 13.116639477977161, + "grad_norm": 0.0037182255182415247, + "learning_rate": 0.00031947443575113655, + "loss": 0.011, + "num_input_tokens_seen": 173681280, + "step": 80405 + }, + { + "epoch": 13.117455138662317, + "grad_norm": 0.023325273767113686, + "learning_rate": 0.00031940805912018854, + "loss": 0.1272, + "num_input_tokens_seen": 173693024, + "step": 80410 + }, + { + "epoch": 13.11827079934747, + "grad_norm": 0.005407822318375111, + "learning_rate": 0.0003193416861491656, + "loss": 0.0109, + "num_input_tokens_seen": 173702944, + "step": 80415 + }, + { + "epoch": 13.119086460032626, + "grad_norm": 0.013056616298854351, + "learning_rate": 0.00031927531683941234, + "loss": 0.0054, + "num_input_tokens_seen": 173714592, + "step": 80420 + }, + { + "epoch": 13.119902120717782, + "grad_norm": 0.016717007383704185, + "learning_rate": 0.0003192089511922742, + "loss": 0.0647, + "num_input_tokens_seen": 173726592, + "step": 80425 + }, + { + "epoch": 13.120717781402936, + "grad_norm": 0.2105436474084854, + "learning_rate": 0.0003191425892090959, + "loss": 0.0068, + "num_input_tokens_seen": 173736896, + "step": 80430 + }, + { + "epoch": 13.121533442088092, + "grad_norm": 0.03821544349193573, + "learning_rate": 0.0003190762308912226, + "loss": 0.0942, + "num_input_tokens_seen": 173748352, + "step": 80435 + }, + { + "epoch": 13.122349102773246, + "grad_norm": 0.019991006702184677, + "learning_rate": 0.0003190098762399989, + "loss": 0.0113, + "num_input_tokens_seen": 173759616, + "step": 80440 + }, + { + "epoch": 13.123164763458401, + "grad_norm": 0.007260370999574661, + "learning_rate": 0.0003189435252567697, + "loss": 0.0209, + "num_input_tokens_seen": 173769632, + "step": 80445 + }, + { + "epoch": 13.123980424143557, + "grad_norm": 0.006024550646543503, + "learning_rate": 0.00031887717794287963, + "loss": 0.0035, + "num_input_tokens_seen": 173780992, + "step": 80450 + }, + { + "epoch": 13.124796084828711, + "grad_norm": 0.001860006363131106, + "learning_rate": 0.0003188108342996732, + "loss": 0.0171, + "num_input_tokens_seen": 173792128, + "step": 80455 + }, + { + "epoch": 13.125611745513867, + "grad_norm": 0.008678854443132877, + "learning_rate": 0.0003187444943284953, + "loss": 0.0036, + "num_input_tokens_seen": 173802656, + "step": 80460 + }, + { + "epoch": 13.12642740619902, + "grad_norm": 0.006701524835079908, + "learning_rate": 0.00031867815803068996, + "loss": 0.0072, + "num_input_tokens_seen": 173813792, + "step": 80465 + }, + { + "epoch": 13.127243066884176, + "grad_norm": 0.0022508781403303146, + "learning_rate": 0.0003186118254076018, + "loss": 0.0035, + "num_input_tokens_seen": 173824800, + "step": 80470 + }, + { + "epoch": 13.12805872756933, + "grad_norm": 0.02744651958346367, + "learning_rate": 0.00031854549646057517, + "loss": 0.0129, + "num_input_tokens_seen": 173835168, + "step": 80475 + }, + { + "epoch": 13.128874388254486, + "grad_norm": 0.544084370136261, + "learning_rate": 0.00031847917119095425, + "loss": 0.1083, + "num_input_tokens_seen": 173847168, + "step": 80480 + }, + { + "epoch": 13.129690048939642, + "grad_norm": 0.05963525548577309, + "learning_rate": 0.0003184128496000832, + "loss": 0.0823, + "num_input_tokens_seen": 173857120, + "step": 80485 + }, + { + "epoch": 13.130505709624796, + "grad_norm": 0.005323616787791252, + "learning_rate": 0.00031834653168930614, + "loss": 0.0125, + "num_input_tokens_seen": 173868064, + "step": 80490 + }, + { + "epoch": 13.131321370309951, + "grad_norm": 0.11876530200242996, + "learning_rate": 0.0003182802174599669, + "loss": 0.0094, + "num_input_tokens_seen": 173878560, + "step": 80495 + }, + { + "epoch": 13.132137030995105, + "grad_norm": 0.011408278718590736, + "learning_rate": 0.00031821390691340985, + "loss": 0.0021, + "num_input_tokens_seen": 173890176, + "step": 80500 + }, + { + "epoch": 13.132952691680261, + "grad_norm": 0.007186530157923698, + "learning_rate": 0.0003181476000509783, + "loss": 0.0035, + "num_input_tokens_seen": 173900928, + "step": 80505 + }, + { + "epoch": 13.133768352365417, + "grad_norm": 0.06655506044626236, + "learning_rate": 0.00031808129687401664, + "loss": 0.0123, + "num_input_tokens_seen": 173912128, + "step": 80510 + }, + { + "epoch": 13.13458401305057, + "grad_norm": 0.0020698008593171835, + "learning_rate": 0.00031801499738386797, + "loss": 0.0049, + "num_input_tokens_seen": 173923200, + "step": 80515 + }, + { + "epoch": 13.135399673735726, + "grad_norm": 0.00747555959969759, + "learning_rate": 0.0003179487015818765, + "loss": 0.0039, + "num_input_tokens_seen": 173933120, + "step": 80520 + }, + { + "epoch": 13.13621533442088, + "grad_norm": 0.12066885083913803, + "learning_rate": 0.00031788240946938534, + "loss": 0.0349, + "num_input_tokens_seen": 173944160, + "step": 80525 + }, + { + "epoch": 13.137030995106036, + "grad_norm": 0.047779396176338196, + "learning_rate": 0.00031781612104773836, + "loss": 0.0247, + "num_input_tokens_seen": 173955040, + "step": 80530 + }, + { + "epoch": 13.137846655791192, + "grad_norm": 0.021179448813199997, + "learning_rate": 0.00031774983631827866, + "loss": 0.004, + "num_input_tokens_seen": 173966272, + "step": 80535 + }, + { + "epoch": 13.138662316476346, + "grad_norm": 0.002796266693621874, + "learning_rate": 0.00031768355528234986, + "loss": 0.0051, + "num_input_tokens_seen": 173976352, + "step": 80540 + }, + { + "epoch": 13.139477977161501, + "grad_norm": 0.0028553269803524017, + "learning_rate": 0.0003176172779412949, + "loss": 0.0065, + "num_input_tokens_seen": 173987008, + "step": 80545 + }, + { + "epoch": 13.140293637846655, + "grad_norm": 0.07305045425891876, + "learning_rate": 0.00031755100429645746, + "loss": 0.0135, + "num_input_tokens_seen": 173997856, + "step": 80550 + }, + { + "epoch": 13.141109298531811, + "grad_norm": 0.15813440084457397, + "learning_rate": 0.00031748473434918014, + "loss": 0.0102, + "num_input_tokens_seen": 174009120, + "step": 80555 + }, + { + "epoch": 13.141924959216965, + "grad_norm": 0.4499052166938782, + "learning_rate": 0.0003174184681008061, + "loss": 0.0158, + "num_input_tokens_seen": 174019776, + "step": 80560 + }, + { + "epoch": 13.14274061990212, + "grad_norm": 0.006134378258138895, + "learning_rate": 0.00031735220555267874, + "loss": 0.0038, + "num_input_tokens_seen": 174030816, + "step": 80565 + }, + { + "epoch": 13.143556280587276, + "grad_norm": 0.013704081997275352, + "learning_rate": 0.0003172859467061404, + "loss": 0.0618, + "num_input_tokens_seen": 174041472, + "step": 80570 + }, + { + "epoch": 13.14437194127243, + "grad_norm": 0.0008135527605190873, + "learning_rate": 0.0003172196915625344, + "loss": 0.002, + "num_input_tokens_seen": 174054368, + "step": 80575 + }, + { + "epoch": 13.145187601957586, + "grad_norm": 0.024581970646977425, + "learning_rate": 0.0003171534401232029, + "loss": 0.0936, + "num_input_tokens_seen": 174065504, + "step": 80580 + }, + { + "epoch": 13.14600326264274, + "grad_norm": 0.022131305187940598, + "learning_rate": 0.0003170871923894892, + "loss": 0.0075, + "num_input_tokens_seen": 174075744, + "step": 80585 + }, + { + "epoch": 13.146818923327896, + "grad_norm": 0.028195692226290703, + "learning_rate": 0.0003170209483627353, + "loss": 0.0041, + "num_input_tokens_seen": 174085920, + "step": 80590 + }, + { + "epoch": 13.147634584013051, + "grad_norm": 0.08796575665473938, + "learning_rate": 0.00031695470804428427, + "loss": 0.0071, + "num_input_tokens_seen": 174095968, + "step": 80595 + }, + { + "epoch": 13.148450244698205, + "grad_norm": 0.005800274666398764, + "learning_rate": 0.0003168884714354781, + "loss": 0.0551, + "num_input_tokens_seen": 174106496, + "step": 80600 + }, + { + "epoch": 13.149265905383361, + "grad_norm": 0.6200029850006104, + "learning_rate": 0.0003168222385376596, + "loss": 0.0266, + "num_input_tokens_seen": 174116256, + "step": 80605 + }, + { + "epoch": 13.150081566068515, + "grad_norm": 0.02263057976961136, + "learning_rate": 0.0003167560093521705, + "loss": 0.1138, + "num_input_tokens_seen": 174127648, + "step": 80610 + }, + { + "epoch": 13.15089722675367, + "grad_norm": 0.002096653450280428, + "learning_rate": 0.00031668978388035347, + "loss": 0.0062, + "num_input_tokens_seen": 174137856, + "step": 80615 + }, + { + "epoch": 13.151712887438826, + "grad_norm": 0.00246584415435791, + "learning_rate": 0.0003166235621235505, + "loss": 0.0066, + "num_input_tokens_seen": 174148160, + "step": 80620 + }, + { + "epoch": 13.15252854812398, + "grad_norm": 0.15061096847057343, + "learning_rate": 0.00031655734408310367, + "loss": 0.026, + "num_input_tokens_seen": 174158144, + "step": 80625 + }, + { + "epoch": 13.153344208809136, + "grad_norm": 0.03953423351049423, + "learning_rate": 0.000316491129760355, + "loss": 0.0116, + "num_input_tokens_seen": 174168096, + "step": 80630 + }, + { + "epoch": 13.15415986949429, + "grad_norm": 0.002793251071125269, + "learning_rate": 0.0003164249191566464, + "loss": 0.0034, + "num_input_tokens_seen": 174178752, + "step": 80635 + }, + { + "epoch": 13.154975530179446, + "grad_norm": 0.00592702254652977, + "learning_rate": 0.00031635871227331957, + "loss": 0.0725, + "num_input_tokens_seen": 174190720, + "step": 80640 + }, + { + "epoch": 13.1557911908646, + "grad_norm": 0.04241395369172096, + "learning_rate": 0.00031629250911171657, + "loss": 0.004, + "num_input_tokens_seen": 174202080, + "step": 80645 + }, + { + "epoch": 13.156606851549755, + "grad_norm": 0.014272456057369709, + "learning_rate": 0.0003162263096731788, + "loss": 0.0305, + "num_input_tokens_seen": 174213728, + "step": 80650 + }, + { + "epoch": 13.15742251223491, + "grad_norm": 0.013360538519918919, + "learning_rate": 0.0003161601139590482, + "loss": 0.0037, + "num_input_tokens_seen": 174224320, + "step": 80655 + }, + { + "epoch": 13.158238172920065, + "grad_norm": 0.4321325123310089, + "learning_rate": 0.0003160939219706658, + "loss": 0.1981, + "num_input_tokens_seen": 174235808, + "step": 80660 + }, + { + "epoch": 13.15905383360522, + "grad_norm": 0.004693038295954466, + "learning_rate": 0.00031602773370937345, + "loss": 0.002, + "num_input_tokens_seen": 174246848, + "step": 80665 + }, + { + "epoch": 13.159869494290374, + "grad_norm": 0.005606858525425196, + "learning_rate": 0.00031596154917651266, + "loss": 0.0071, + "num_input_tokens_seen": 174258720, + "step": 80670 + }, + { + "epoch": 13.16068515497553, + "grad_norm": 0.04412202537059784, + "learning_rate": 0.0003158953683734244, + "loss": 0.0239, + "num_input_tokens_seen": 174268736, + "step": 80675 + }, + { + "epoch": 13.161500815660686, + "grad_norm": 0.04025116190314293, + "learning_rate": 0.00031582919130145016, + "loss": 0.0068, + "num_input_tokens_seen": 174279136, + "step": 80680 + }, + { + "epoch": 13.16231647634584, + "grad_norm": 0.01368759199976921, + "learning_rate": 0.0003157630179619308, + "loss": 0.0037, + "num_input_tokens_seen": 174291168, + "step": 80685 + }, + { + "epoch": 13.163132137030995, + "grad_norm": 0.03054218553006649, + "learning_rate": 0.00031569684835620784, + "loss": 0.0106, + "num_input_tokens_seen": 174301440, + "step": 80690 + }, + { + "epoch": 13.16394779771615, + "grad_norm": 0.04629860818386078, + "learning_rate": 0.00031563068248562185, + "loss": 0.011, + "num_input_tokens_seen": 174312640, + "step": 80695 + }, + { + "epoch": 13.164763458401305, + "grad_norm": 0.019359026104211807, + "learning_rate": 0.00031556452035151416, + "loss": 0.0132, + "num_input_tokens_seen": 174322880, + "step": 80700 + }, + { + "epoch": 13.16557911908646, + "grad_norm": 0.029904767870903015, + "learning_rate": 0.00031549836195522517, + "loss": 0.0043, + "num_input_tokens_seen": 174333600, + "step": 80705 + }, + { + "epoch": 13.166394779771615, + "grad_norm": 0.0014057289808988571, + "learning_rate": 0.00031543220729809626, + "loss": 0.0035, + "num_input_tokens_seen": 174344256, + "step": 80710 + }, + { + "epoch": 13.16721044045677, + "grad_norm": 0.002946326043456793, + "learning_rate": 0.00031536605638146756, + "loss": 0.0092, + "num_input_tokens_seen": 174353888, + "step": 80715 + }, + { + "epoch": 13.168026101141924, + "grad_norm": 0.0019200635142624378, + "learning_rate": 0.0003152999092066801, + "loss": 0.0028, + "num_input_tokens_seen": 174365312, + "step": 80720 + }, + { + "epoch": 13.16884176182708, + "grad_norm": 0.003745671361684799, + "learning_rate": 0.0003152337657750741, + "loss": 0.0026, + "num_input_tokens_seen": 174375168, + "step": 80725 + }, + { + "epoch": 13.169657422512234, + "grad_norm": 0.0020364460069686174, + "learning_rate": 0.00031516762608799047, + "loss": 0.0073, + "num_input_tokens_seen": 174387264, + "step": 80730 + }, + { + "epoch": 13.17047308319739, + "grad_norm": 0.052907831966876984, + "learning_rate": 0.0003151014901467691, + "loss": 0.0101, + "num_input_tokens_seen": 174397152, + "step": 80735 + }, + { + "epoch": 13.171288743882545, + "grad_norm": 0.017299756407737732, + "learning_rate": 0.00031503535795275096, + "loss": 0.0206, + "num_input_tokens_seen": 174407136, + "step": 80740 + }, + { + "epoch": 13.1721044045677, + "grad_norm": 0.058468643575906754, + "learning_rate": 0.00031496922950727556, + "loss": 0.0256, + "num_input_tokens_seen": 174416896, + "step": 80745 + }, + { + "epoch": 13.172920065252855, + "grad_norm": 0.016621742397546768, + "learning_rate": 0.00031490310481168375, + "loss": 0.0082, + "num_input_tokens_seen": 174426976, + "step": 80750 + }, + { + "epoch": 13.173735725938009, + "grad_norm": 0.0437234528362751, + "learning_rate": 0.0003148369838673151, + "loss": 0.0062, + "num_input_tokens_seen": 174436544, + "step": 80755 + }, + { + "epoch": 13.174551386623165, + "grad_norm": 0.006686724256724119, + "learning_rate": 0.00031477086667551003, + "loss": 0.014, + "num_input_tokens_seen": 174447040, + "step": 80760 + }, + { + "epoch": 13.17536704730832, + "grad_norm": 0.007379279471933842, + "learning_rate": 0.00031470475323760826, + "loss": 0.0077, + "num_input_tokens_seen": 174456640, + "step": 80765 + }, + { + "epoch": 13.176182707993474, + "grad_norm": 0.004275395534932613, + "learning_rate": 0.0003146386435549496, + "loss": 0.0015, + "num_input_tokens_seen": 174467328, + "step": 80770 + }, + { + "epoch": 13.17699836867863, + "grad_norm": 0.014936030842363834, + "learning_rate": 0.0003145725376288742, + "loss": 0.0272, + "num_input_tokens_seen": 174478176, + "step": 80775 + }, + { + "epoch": 13.177814029363784, + "grad_norm": 0.0015776983927935362, + "learning_rate": 0.00031450643546072145, + "loss": 0.0035, + "num_input_tokens_seen": 174490432, + "step": 80780 + }, + { + "epoch": 13.17862969004894, + "grad_norm": 0.3135847747325897, + "learning_rate": 0.0003144403370518311, + "loss": 0.0104, + "num_input_tokens_seen": 174501120, + "step": 80785 + }, + { + "epoch": 13.179445350734095, + "grad_norm": 0.012622885406017303, + "learning_rate": 0.00031437424240354274, + "loss": 0.1161, + "num_input_tokens_seen": 174511232, + "step": 80790 + }, + { + "epoch": 13.18026101141925, + "grad_norm": 0.10842663049697876, + "learning_rate": 0.00031430815151719583, + "loss": 0.124, + "num_input_tokens_seen": 174521664, + "step": 80795 + }, + { + "epoch": 13.181076672104405, + "grad_norm": 0.15451642870903015, + "learning_rate": 0.00031424206439412984, + "loss": 0.0168, + "num_input_tokens_seen": 174531424, + "step": 80800 + }, + { + "epoch": 13.181892332789559, + "grad_norm": 0.04291224852204323, + "learning_rate": 0.00031417598103568404, + "loss": 0.025, + "num_input_tokens_seen": 174542016, + "step": 80805 + }, + { + "epoch": 13.182707993474715, + "grad_norm": 0.2300167977809906, + "learning_rate": 0.00031410990144319756, + "loss": 0.0131, + "num_input_tokens_seen": 174553088, + "step": 80810 + }, + { + "epoch": 13.18352365415987, + "grad_norm": 0.001681014895439148, + "learning_rate": 0.00031404382561801006, + "loss": 0.0275, + "num_input_tokens_seen": 174563840, + "step": 80815 + }, + { + "epoch": 13.184339314845024, + "grad_norm": 0.007472775410860777, + "learning_rate": 0.00031397775356146004, + "loss": 0.0659, + "num_input_tokens_seen": 174575648, + "step": 80820 + }, + { + "epoch": 13.18515497553018, + "grad_norm": 0.0014233127003535628, + "learning_rate": 0.000313911685274887, + "loss": 0.0134, + "num_input_tokens_seen": 174587392, + "step": 80825 + }, + { + "epoch": 13.185970636215334, + "grad_norm": 0.003380796406418085, + "learning_rate": 0.0003138456207596296, + "loss": 0.0472, + "num_input_tokens_seen": 174598912, + "step": 80830 + }, + { + "epoch": 13.18678629690049, + "grad_norm": 0.004419084172695875, + "learning_rate": 0.0003137795600170271, + "loss": 0.0046, + "num_input_tokens_seen": 174610112, + "step": 80835 + }, + { + "epoch": 13.187601957585644, + "grad_norm": 0.0024323707912117243, + "learning_rate": 0.0003137135030484177, + "loss": 0.0029, + "num_input_tokens_seen": 174620928, + "step": 80840 + }, + { + "epoch": 13.1884176182708, + "grad_norm": 0.0012152445269748569, + "learning_rate": 0.00031364744985514084, + "loss": 0.0028, + "num_input_tokens_seen": 174631712, + "step": 80845 + }, + { + "epoch": 13.189233278955955, + "grad_norm": 0.02515444904565811, + "learning_rate": 0.00031358140043853455, + "loss": 0.0042, + "num_input_tokens_seen": 174642176, + "step": 80850 + }, + { + "epoch": 13.190048939641109, + "grad_norm": 0.0368582084774971, + "learning_rate": 0.00031351535479993785, + "loss": 0.0051, + "num_input_tokens_seen": 174653152, + "step": 80855 + }, + { + "epoch": 13.190864600326265, + "grad_norm": 0.022137103602290154, + "learning_rate": 0.0003134493129406889, + "loss": 0.0068, + "num_input_tokens_seen": 174664160, + "step": 80860 + }, + { + "epoch": 13.191680261011419, + "grad_norm": 0.003924847114831209, + "learning_rate": 0.00031338327486212647, + "loss": 0.0013, + "num_input_tokens_seen": 174674432, + "step": 80865 + }, + { + "epoch": 13.192495921696574, + "grad_norm": 0.002630516653880477, + "learning_rate": 0.00031331724056558847, + "loss": 0.0098, + "num_input_tokens_seen": 174685088, + "step": 80870 + }, + { + "epoch": 13.19331158238173, + "grad_norm": 0.011913495138287544, + "learning_rate": 0.0003132512100524134, + "loss": 0.0735, + "num_input_tokens_seen": 174696256, + "step": 80875 + }, + { + "epoch": 13.194127243066884, + "grad_norm": 0.0055811344645917416, + "learning_rate": 0.00031318518332393975, + "loss": 0.011, + "num_input_tokens_seen": 174707328, + "step": 80880 + }, + { + "epoch": 13.19494290375204, + "grad_norm": 0.003324878169223666, + "learning_rate": 0.0003131191603815051, + "loss": 0.0064, + "num_input_tokens_seen": 174717248, + "step": 80885 + }, + { + "epoch": 13.195758564437194, + "grad_norm": 0.010468652471899986, + "learning_rate": 0.000313053141226448, + "loss": 0.0043, + "num_input_tokens_seen": 174728160, + "step": 80890 + }, + { + "epoch": 13.19657422512235, + "grad_norm": 0.004916087724268436, + "learning_rate": 0.0003129871258601059, + "loss": 0.0087, + "num_input_tokens_seen": 174738688, + "step": 80895 + }, + { + "epoch": 13.197389885807505, + "grad_norm": 0.05585349351167679, + "learning_rate": 0.0003129211142838171, + "loss": 0.0146, + "num_input_tokens_seen": 174750016, + "step": 80900 + }, + { + "epoch": 13.198205546492659, + "grad_norm": 0.0034299406688660383, + "learning_rate": 0.0003128551064989191, + "loss": 0.0039, + "num_input_tokens_seen": 174759456, + "step": 80905 + }, + { + "epoch": 13.199021207177815, + "grad_norm": 0.33458200097084045, + "learning_rate": 0.00031278910250674994, + "loss": 0.0168, + "num_input_tokens_seen": 174769440, + "step": 80910 + }, + { + "epoch": 13.199836867862969, + "grad_norm": 0.009169838391244411, + "learning_rate": 0.00031272310230864695, + "loss": 0.0061, + "num_input_tokens_seen": 174779488, + "step": 80915 + }, + { + "epoch": 13.200652528548124, + "grad_norm": 0.01070548314601183, + "learning_rate": 0.0003126571059059481, + "loss": 0.0055, + "num_input_tokens_seen": 174790048, + "step": 80920 + }, + { + "epoch": 13.201468189233278, + "grad_norm": 0.013349352404475212, + "learning_rate": 0.00031259111329999035, + "loss": 0.003, + "num_input_tokens_seen": 174801056, + "step": 80925 + }, + { + "epoch": 13.202283849918434, + "grad_norm": 0.0030847955495119095, + "learning_rate": 0.00031252512449211163, + "loss": 0.0017, + "num_input_tokens_seen": 174811872, + "step": 80930 + }, + { + "epoch": 13.20309951060359, + "grad_norm": 0.2001817226409912, + "learning_rate": 0.0003124591394836491, + "loss": 0.0055, + "num_input_tokens_seen": 174822464, + "step": 80935 + }, + { + "epoch": 13.203915171288743, + "grad_norm": 0.009598773904144764, + "learning_rate": 0.00031239315827593994, + "loss": 0.0042, + "num_input_tokens_seen": 174833984, + "step": 80940 + }, + { + "epoch": 13.2047308319739, + "grad_norm": 0.0008926258306019008, + "learning_rate": 0.0003123271808703215, + "loss": 0.008, + "num_input_tokens_seen": 174844608, + "step": 80945 + }, + { + "epoch": 13.205546492659053, + "grad_norm": 0.00110113644041121, + "learning_rate": 0.0003122612072681308, + "loss": 0.0029, + "num_input_tokens_seen": 174856000, + "step": 80950 + }, + { + "epoch": 13.206362153344209, + "grad_norm": 0.044156067073345184, + "learning_rate": 0.00031219523747070475, + "loss": 0.0154, + "num_input_tokens_seen": 174867200, + "step": 80955 + }, + { + "epoch": 13.207177814029365, + "grad_norm": 0.24586060643196106, + "learning_rate": 0.00031212927147938066, + "loss": 0.0109, + "num_input_tokens_seen": 174877312, + "step": 80960 + }, + { + "epoch": 13.207993474714518, + "grad_norm": 0.008321182802319527, + "learning_rate": 0.0003120633092954951, + "loss": 0.002, + "num_input_tokens_seen": 174888960, + "step": 80965 + }, + { + "epoch": 13.208809135399674, + "grad_norm": 0.0031696436926722527, + "learning_rate": 0.0003119973509203851, + "loss": 0.0027, + "num_input_tokens_seen": 174900448, + "step": 80970 + }, + { + "epoch": 13.209624796084828, + "grad_norm": 0.0024241674691438675, + "learning_rate": 0.00031193139635538714, + "loss": 0.0031, + "num_input_tokens_seen": 174910208, + "step": 80975 + }, + { + "epoch": 13.210440456769984, + "grad_norm": 0.0023144527804106474, + "learning_rate": 0.00031186544560183796, + "loss": 0.0045, + "num_input_tokens_seen": 174921664, + "step": 80980 + }, + { + "epoch": 13.21125611745514, + "grad_norm": 0.0020555031951516867, + "learning_rate": 0.00031179949866107443, + "loss": 0.0658, + "num_input_tokens_seen": 174932416, + "step": 80985 + }, + { + "epoch": 13.212071778140293, + "grad_norm": 0.005608719773590565, + "learning_rate": 0.0003117335555344326, + "loss": 0.0849, + "num_input_tokens_seen": 174942048, + "step": 80990 + }, + { + "epoch": 13.21288743882545, + "grad_norm": 0.07525213807821274, + "learning_rate": 0.00031166761622324936, + "loss": 0.0628, + "num_input_tokens_seen": 174952960, + "step": 80995 + }, + { + "epoch": 13.213703099510603, + "grad_norm": 0.005778506398200989, + "learning_rate": 0.00031160168072886054, + "loss": 0.0014, + "num_input_tokens_seen": 174963392, + "step": 81000 + }, + { + "epoch": 13.214518760195759, + "grad_norm": 0.22658313810825348, + "learning_rate": 0.00031153574905260287, + "loss": 0.0072, + "num_input_tokens_seen": 174974624, + "step": 81005 + }, + { + "epoch": 13.215334420880913, + "grad_norm": 0.001897740876302123, + "learning_rate": 0.000311469821195812, + "loss": 0.0024, + "num_input_tokens_seen": 174985024, + "step": 81010 + }, + { + "epoch": 13.216150081566068, + "grad_norm": 0.03674355894327164, + "learning_rate": 0.00031140389715982476, + "loss": 0.1143, + "num_input_tokens_seen": 174995104, + "step": 81015 + }, + { + "epoch": 13.216965742251224, + "grad_norm": 0.47961094975471497, + "learning_rate": 0.00031133797694597655, + "loss": 0.127, + "num_input_tokens_seen": 175006464, + "step": 81020 + }, + { + "epoch": 13.217781402936378, + "grad_norm": 0.012490352615714073, + "learning_rate": 0.0003112720605556037, + "loss": 0.0024, + "num_input_tokens_seen": 175017280, + "step": 81025 + }, + { + "epoch": 13.218597063621534, + "grad_norm": 0.00828443095088005, + "learning_rate": 0.00031120614799004184, + "loss": 0.0141, + "num_input_tokens_seen": 175028800, + "step": 81030 + }, + { + "epoch": 13.219412724306688, + "grad_norm": 0.0018339533125981688, + "learning_rate": 0.0003111402392506271, + "loss": 0.0065, + "num_input_tokens_seen": 175036800, + "step": 81035 + }, + { + "epoch": 13.220228384991843, + "grad_norm": 0.5048210024833679, + "learning_rate": 0.0003110743343386947, + "loss": 0.048, + "num_input_tokens_seen": 175047840, + "step": 81040 + }, + { + "epoch": 13.221044045676999, + "grad_norm": 0.007345435209572315, + "learning_rate": 0.0003110084332555808, + "loss": 0.0202, + "num_input_tokens_seen": 175059552, + "step": 81045 + }, + { + "epoch": 13.221859706362153, + "grad_norm": 0.4212218225002289, + "learning_rate": 0.00031094253600262063, + "loss": 0.1887, + "num_input_tokens_seen": 175070400, + "step": 81050 + }, + { + "epoch": 13.222675367047309, + "grad_norm": 0.0020032948814332485, + "learning_rate": 0.00031087664258115, + "loss": 0.0059, + "num_input_tokens_seen": 175080992, + "step": 81055 + }, + { + "epoch": 13.223491027732463, + "grad_norm": 0.017578184604644775, + "learning_rate": 0.0003108107529925038, + "loss": 0.1716, + "num_input_tokens_seen": 175090976, + "step": 81060 + }, + { + "epoch": 13.224306688417618, + "grad_norm": 0.005086452234536409, + "learning_rate": 0.0003107448672380181, + "loss": 0.0103, + "num_input_tokens_seen": 175101408, + "step": 81065 + }, + { + "epoch": 13.225122349102774, + "grad_norm": 0.003535451367497444, + "learning_rate": 0.0003106789853190274, + "loss": 0.007, + "num_input_tokens_seen": 175110912, + "step": 81070 + }, + { + "epoch": 13.225938009787928, + "grad_norm": 0.08481805771589279, + "learning_rate": 0.0003106131072368674, + "loss": 0.0081, + "num_input_tokens_seen": 175122112, + "step": 81075 + }, + { + "epoch": 13.226753670473084, + "grad_norm": 0.03594028204679489, + "learning_rate": 0.00031054723299287303, + "loss": 0.0036, + "num_input_tokens_seen": 175131808, + "step": 81080 + }, + { + "epoch": 13.227569331158238, + "grad_norm": 0.0058160750195384026, + "learning_rate": 0.00031048136258837923, + "loss": 0.0075, + "num_input_tokens_seen": 175142752, + "step": 81085 + }, + { + "epoch": 13.228384991843393, + "grad_norm": 0.3698887228965759, + "learning_rate": 0.0003104154960247211, + "loss": 0.0261, + "num_input_tokens_seen": 175152928, + "step": 81090 + }, + { + "epoch": 13.229200652528547, + "grad_norm": 0.24206924438476562, + "learning_rate": 0.0003103496333032334, + "loss": 0.0204, + "num_input_tokens_seen": 175164352, + "step": 81095 + }, + { + "epoch": 13.230016313213703, + "grad_norm": 0.001961946953088045, + "learning_rate": 0.00031028377442525104, + "loss": 0.0115, + "num_input_tokens_seen": 175175840, + "step": 81100 + }, + { + "epoch": 13.230831973898859, + "grad_norm": 0.025502916425466537, + "learning_rate": 0.0003102179193921086, + "loss": 0.0073, + "num_input_tokens_seen": 175186848, + "step": 81105 + }, + { + "epoch": 13.231647634584013, + "grad_norm": 0.05788380652666092, + "learning_rate": 0.00031015206820514087, + "loss": 0.0067, + "num_input_tokens_seen": 175197888, + "step": 81110 + }, + { + "epoch": 13.232463295269168, + "grad_norm": 0.010292734019458294, + "learning_rate": 0.0003100862208656823, + "loss": 0.0454, + "num_input_tokens_seen": 175208000, + "step": 81115 + }, + { + "epoch": 13.233278955954322, + "grad_norm": 0.006660535931587219, + "learning_rate": 0.0003100203773750674, + "loss": 0.0299, + "num_input_tokens_seen": 175219008, + "step": 81120 + }, + { + "epoch": 13.234094616639478, + "grad_norm": 0.003208763664588332, + "learning_rate": 0.00030995453773463035, + "loss": 0.0183, + "num_input_tokens_seen": 175229728, + "step": 81125 + }, + { + "epoch": 13.234910277324634, + "grad_norm": 0.013174341060221195, + "learning_rate": 0.00030988870194570596, + "loss": 0.0169, + "num_input_tokens_seen": 175241536, + "step": 81130 + }, + { + "epoch": 13.235725938009788, + "grad_norm": 0.00958284828811884, + "learning_rate": 0.00030982287000962805, + "loss": 0.0043, + "num_input_tokens_seen": 175252992, + "step": 81135 + }, + { + "epoch": 13.236541598694943, + "grad_norm": 0.0013074814341962337, + "learning_rate": 0.000309757041927731, + "loss": 0.0028, + "num_input_tokens_seen": 175263424, + "step": 81140 + }, + { + "epoch": 13.237357259380097, + "grad_norm": 0.02158118039369583, + "learning_rate": 0.00030969121770134877, + "loss": 0.01, + "num_input_tokens_seen": 175274304, + "step": 81145 + }, + { + "epoch": 13.238172920065253, + "grad_norm": 0.011400923132896423, + "learning_rate": 0.0003096253973318156, + "loss": 0.0184, + "num_input_tokens_seen": 175284864, + "step": 81150 + }, + { + "epoch": 13.238988580750409, + "grad_norm": 0.00538475438952446, + "learning_rate": 0.000309559580820465, + "loss": 0.0028, + "num_input_tokens_seen": 175295488, + "step": 81155 + }, + { + "epoch": 13.239804241435563, + "grad_norm": 0.013732331804931164, + "learning_rate": 0.0003094937681686314, + "loss": 0.0118, + "num_input_tokens_seen": 175305216, + "step": 81160 + }, + { + "epoch": 13.240619902120718, + "grad_norm": 0.05636392906308174, + "learning_rate": 0.00030942795937764794, + "loss": 0.0198, + "num_input_tokens_seen": 175316832, + "step": 81165 + }, + { + "epoch": 13.241435562805872, + "grad_norm": 0.0052260602824389935, + "learning_rate": 0.00030936215444884893, + "loss": 0.0071, + "num_input_tokens_seen": 175328480, + "step": 81170 + }, + { + "epoch": 13.242251223491028, + "grad_norm": 0.005099338013678789, + "learning_rate": 0.00030929635338356745, + "loss": 0.0098, + "num_input_tokens_seen": 175340736, + "step": 81175 + }, + { + "epoch": 13.243066884176184, + "grad_norm": 0.000985561404377222, + "learning_rate": 0.0003092305561831375, + "loss": 0.0034, + "num_input_tokens_seen": 175351072, + "step": 81180 + }, + { + "epoch": 13.243882544861338, + "grad_norm": 0.0050900704227387905, + "learning_rate": 0.0003091647628488922, + "loss": 0.002, + "num_input_tokens_seen": 175361984, + "step": 81185 + }, + { + "epoch": 13.244698205546493, + "grad_norm": 0.06734821945428848, + "learning_rate": 0.0003090989733821652, + "loss": 0.0092, + "num_input_tokens_seen": 175373120, + "step": 81190 + }, + { + "epoch": 13.245513866231647, + "grad_norm": 0.002881730208173394, + "learning_rate": 0.0003090331877842895, + "loss": 0.0019, + "num_input_tokens_seen": 175385248, + "step": 81195 + }, + { + "epoch": 13.246329526916803, + "grad_norm": 0.005004175938665867, + "learning_rate": 0.00030896740605659845, + "loss": 0.0079, + "num_input_tokens_seen": 175395776, + "step": 81200 + }, + { + "epoch": 13.247145187601957, + "grad_norm": 0.000822130125015974, + "learning_rate": 0.00030890162820042553, + "loss": 0.0016, + "num_input_tokens_seen": 175405920, + "step": 81205 + }, + { + "epoch": 13.247960848287113, + "grad_norm": 0.33728986978530884, + "learning_rate": 0.00030883585421710334, + "loss": 0.0201, + "num_input_tokens_seen": 175415936, + "step": 81210 + }, + { + "epoch": 13.248776508972268, + "grad_norm": 0.0008757903706282377, + "learning_rate": 0.00030877008410796526, + "loss": 0.0006, + "num_input_tokens_seen": 175425856, + "step": 81215 + }, + { + "epoch": 13.249592169657422, + "grad_norm": 0.00804536696523428, + "learning_rate": 0.00030870431787434385, + "loss": 0.0031, + "num_input_tokens_seen": 175436800, + "step": 81220 + }, + { + "epoch": 13.250407830342578, + "grad_norm": 0.3633720278739929, + "learning_rate": 0.00030863855551757223, + "loss": 0.1165, + "num_input_tokens_seen": 175447072, + "step": 81225 + }, + { + "epoch": 13.251223491027732, + "grad_norm": 0.15613357722759247, + "learning_rate": 0.0003085727970389829, + "loss": 0.0103, + "num_input_tokens_seen": 175457472, + "step": 81230 + }, + { + "epoch": 13.252039151712887, + "grad_norm": 0.0012791818007826805, + "learning_rate": 0.0003085070424399089, + "loss": 0.0071, + "num_input_tokens_seen": 175468672, + "step": 81235 + }, + { + "epoch": 13.252854812398043, + "grad_norm": 0.02506769821047783, + "learning_rate": 0.00030844129172168236, + "loss": 0.0057, + "num_input_tokens_seen": 175479008, + "step": 81240 + }, + { + "epoch": 13.253670473083197, + "grad_norm": 0.002241175388917327, + "learning_rate": 0.0003083755448856361, + "loss": 0.0022, + "num_input_tokens_seen": 175490144, + "step": 81245 + }, + { + "epoch": 13.254486133768353, + "grad_norm": 0.006746441125869751, + "learning_rate": 0.00030830980193310265, + "loss": 0.0983, + "num_input_tokens_seen": 175501376, + "step": 81250 + }, + { + "epoch": 13.255301794453507, + "grad_norm": 0.0012192511931061745, + "learning_rate": 0.00030824406286541415, + "loss": 0.152, + "num_input_tokens_seen": 175511808, + "step": 81255 + }, + { + "epoch": 13.256117455138662, + "grad_norm": 0.016503628343343735, + "learning_rate": 0.00030817832768390306, + "loss": 0.0109, + "num_input_tokens_seen": 175523584, + "step": 81260 + }, + { + "epoch": 13.256933115823816, + "grad_norm": 0.009415880776941776, + "learning_rate": 0.0003081125963899014, + "loss": 0.0033, + "num_input_tokens_seen": 175533856, + "step": 81265 + }, + { + "epoch": 13.257748776508972, + "grad_norm": 0.039036672562360764, + "learning_rate": 0.0003080468689847414, + "loss": 0.0214, + "num_input_tokens_seen": 175543840, + "step": 81270 + }, + { + "epoch": 13.258564437194128, + "grad_norm": 0.006212171167135239, + "learning_rate": 0.00030798114546975525, + "loss": 0.045, + "num_input_tokens_seen": 175555968, + "step": 81275 + }, + { + "epoch": 13.259380097879282, + "grad_norm": 0.06656394898891449, + "learning_rate": 0.00030791542584627455, + "loss": 0.0046, + "num_input_tokens_seen": 175567424, + "step": 81280 + }, + { + "epoch": 13.260195758564437, + "grad_norm": 0.0009761211695149541, + "learning_rate": 0.0003078497101156317, + "loss": 0.0053, + "num_input_tokens_seen": 175578016, + "step": 81285 + }, + { + "epoch": 13.261011419249591, + "grad_norm": 0.000771388178691268, + "learning_rate": 0.00030778399827915796, + "loss": 0.003, + "num_input_tokens_seen": 175588576, + "step": 81290 + }, + { + "epoch": 13.261827079934747, + "grad_norm": 0.000772759725805372, + "learning_rate": 0.0003077182903381856, + "loss": 0.0016, + "num_input_tokens_seen": 175598848, + "step": 81295 + }, + { + "epoch": 13.262642740619903, + "grad_norm": 0.01377933844923973, + "learning_rate": 0.0003076525862940458, + "loss": 0.1336, + "num_input_tokens_seen": 175609504, + "step": 81300 + }, + { + "epoch": 13.263458401305057, + "grad_norm": 0.6922663450241089, + "learning_rate": 0.00030758688614807033, + "loss": 0.0214, + "num_input_tokens_seen": 175621184, + "step": 81305 + }, + { + "epoch": 13.264274061990212, + "grad_norm": 0.0026870991569012403, + "learning_rate": 0.0003075211899015909, + "loss": 0.0131, + "num_input_tokens_seen": 175630816, + "step": 81310 + }, + { + "epoch": 13.265089722675366, + "grad_norm": 0.002322110114619136, + "learning_rate": 0.0003074554975559386, + "loss": 0.005, + "num_input_tokens_seen": 175640064, + "step": 81315 + }, + { + "epoch": 13.265905383360522, + "grad_norm": 0.009497624821960926, + "learning_rate": 0.000307389809112445, + "loss": 0.002, + "num_input_tokens_seen": 175650720, + "step": 81320 + }, + { + "epoch": 13.266721044045678, + "grad_norm": 0.015067550353705883, + "learning_rate": 0.0003073241245724411, + "loss": 0.1275, + "num_input_tokens_seen": 175661920, + "step": 81325 + }, + { + "epoch": 13.267536704730832, + "grad_norm": 0.006270645186305046, + "learning_rate": 0.00030725844393725846, + "loss": 0.0038, + "num_input_tokens_seen": 175672224, + "step": 81330 + }, + { + "epoch": 13.268352365415987, + "grad_norm": 0.016796309500932693, + "learning_rate": 0.00030719276720822774, + "loss": 0.0499, + "num_input_tokens_seen": 175681792, + "step": 81335 + }, + { + "epoch": 13.269168026101141, + "grad_norm": 1.0089938640594482, + "learning_rate": 0.0003071270943866804, + "loss": 0.0447, + "num_input_tokens_seen": 175692736, + "step": 81340 + }, + { + "epoch": 13.269983686786297, + "grad_norm": 0.17437076568603516, + "learning_rate": 0.000307061425473947, + "loss": 0.0069, + "num_input_tokens_seen": 175705088, + "step": 81345 + }, + { + "epoch": 13.270799347471453, + "grad_norm": 0.36235466599464417, + "learning_rate": 0.00030699576047135875, + "loss": 0.0071, + "num_input_tokens_seen": 175715392, + "step": 81350 + }, + { + "epoch": 13.271615008156607, + "grad_norm": 0.0024113464169204235, + "learning_rate": 0.0003069300993802461, + "loss": 0.002, + "num_input_tokens_seen": 175726464, + "step": 81355 + }, + { + "epoch": 13.272430668841762, + "grad_norm": 0.020136456936597824, + "learning_rate": 0.00030686444220194, + "loss": 0.0067, + "num_input_tokens_seen": 175737120, + "step": 81360 + }, + { + "epoch": 13.273246329526916, + "grad_norm": 0.3934916853904724, + "learning_rate": 0.00030679878893777085, + "loss": 0.1054, + "num_input_tokens_seen": 175747328, + "step": 81365 + }, + { + "epoch": 13.274061990212072, + "grad_norm": 0.0014037713408470154, + "learning_rate": 0.0003067331395890696, + "loss": 0.0101, + "num_input_tokens_seen": 175758784, + "step": 81370 + }, + { + "epoch": 13.274877650897226, + "grad_norm": 0.001718403771519661, + "learning_rate": 0.0003066674941571661, + "loss": 0.0027, + "num_input_tokens_seen": 175770656, + "step": 81375 + }, + { + "epoch": 13.275693311582382, + "grad_norm": 0.0006151496199890971, + "learning_rate": 0.0003066018526433914, + "loss": 0.0019, + "num_input_tokens_seen": 175780992, + "step": 81380 + }, + { + "epoch": 13.276508972267537, + "grad_norm": 0.0005740922060795128, + "learning_rate": 0.00030653621504907533, + "loss": 0.0177, + "num_input_tokens_seen": 175791680, + "step": 81385 + }, + { + "epoch": 13.277324632952691, + "grad_norm": 0.04168350249528885, + "learning_rate": 0.0003064705813755483, + "loss": 0.0109, + "num_input_tokens_seen": 175801664, + "step": 81390 + }, + { + "epoch": 13.278140293637847, + "grad_norm": 0.028292395174503326, + "learning_rate": 0.0003064049516241405, + "loss": 0.0043, + "num_input_tokens_seen": 175811264, + "step": 81395 + }, + { + "epoch": 13.278955954323001, + "grad_norm": 0.012449781410396099, + "learning_rate": 0.00030633932579618195, + "loss": 0.0051, + "num_input_tokens_seen": 175823616, + "step": 81400 + }, + { + "epoch": 13.279771615008157, + "grad_norm": 0.010674958117306232, + "learning_rate": 0.00030627370389300256, + "loss": 0.0072, + "num_input_tokens_seen": 175834464, + "step": 81405 + }, + { + "epoch": 13.280587275693312, + "grad_norm": 0.05215312913060188, + "learning_rate": 0.0003062080859159323, + "loss": 0.0028, + "num_input_tokens_seen": 175845728, + "step": 81410 + }, + { + "epoch": 13.281402936378466, + "grad_norm": 0.0022178348153829575, + "learning_rate": 0.0003061424718663011, + "loss": 0.0164, + "num_input_tokens_seen": 175856576, + "step": 81415 + }, + { + "epoch": 13.282218597063622, + "grad_norm": 0.02007582038640976, + "learning_rate": 0.00030607686174543864, + "loss": 0.0042, + "num_input_tokens_seen": 175867424, + "step": 81420 + }, + { + "epoch": 13.283034257748776, + "grad_norm": 0.34574416279792786, + "learning_rate": 0.00030601125555467456, + "loss": 0.0755, + "num_input_tokens_seen": 175878496, + "step": 81425 + }, + { + "epoch": 13.283849918433932, + "grad_norm": 0.003219359088689089, + "learning_rate": 0.0003059456532953385, + "loss": 0.0046, + "num_input_tokens_seen": 175889152, + "step": 81430 + }, + { + "epoch": 13.284665579119087, + "grad_norm": 0.029977506026625633, + "learning_rate": 0.00030588005496876, + "loss": 0.0027, + "num_input_tokens_seen": 175899328, + "step": 81435 + }, + { + "epoch": 13.285481239804241, + "grad_norm": 0.0027198202442377806, + "learning_rate": 0.00030581446057626827, + "loss": 0.0013, + "num_input_tokens_seen": 175908480, + "step": 81440 + }, + { + "epoch": 13.286296900489397, + "grad_norm": 0.028357025235891342, + "learning_rate": 0.00030574887011919306, + "loss": 0.0755, + "num_input_tokens_seen": 175918752, + "step": 81445 + }, + { + "epoch": 13.28711256117455, + "grad_norm": 0.0038904561661183834, + "learning_rate": 0.0003056832835988632, + "loss": 0.0129, + "num_input_tokens_seen": 175929344, + "step": 81450 + }, + { + "epoch": 13.287928221859707, + "grad_norm": 0.004254198633134365, + "learning_rate": 0.00030561770101660837, + "loss": 0.0765, + "num_input_tokens_seen": 175937792, + "step": 81455 + }, + { + "epoch": 13.28874388254486, + "grad_norm": 0.010023529641330242, + "learning_rate": 0.0003055521223737572, + "loss": 0.0023, + "num_input_tokens_seen": 175948960, + "step": 81460 + }, + { + "epoch": 13.289559543230016, + "grad_norm": 0.007340825628489256, + "learning_rate": 0.0003054865476716391, + "loss": 0.0018, + "num_input_tokens_seen": 175960512, + "step": 81465 + }, + { + "epoch": 13.290375203915172, + "grad_norm": 0.7565301656723022, + "learning_rate": 0.0003054209769115827, + "loss": 0.0896, + "num_input_tokens_seen": 175971904, + "step": 81470 + }, + { + "epoch": 13.291190864600326, + "grad_norm": 0.0019012526609003544, + "learning_rate": 0.0003053554100949173, + "loss": 0.0042, + "num_input_tokens_seen": 175983136, + "step": 81475 + }, + { + "epoch": 13.292006525285482, + "grad_norm": 0.007661410607397556, + "learning_rate": 0.0003052898472229711, + "loss": 0.0024, + "num_input_tokens_seen": 175994272, + "step": 81480 + }, + { + "epoch": 13.292822185970635, + "grad_norm": 0.20898938179016113, + "learning_rate": 0.0003052242882970735, + "loss": 0.0078, + "num_input_tokens_seen": 176005056, + "step": 81485 + }, + { + "epoch": 13.293637846655791, + "grad_norm": 0.00497079873457551, + "learning_rate": 0.0003051587333185525, + "loss": 0.0102, + "num_input_tokens_seen": 176015104, + "step": 81490 + }, + { + "epoch": 13.294453507340947, + "grad_norm": 0.011369774118065834, + "learning_rate": 0.00030509318228873715, + "loss": 0.0291, + "num_input_tokens_seen": 176026080, + "step": 81495 + }, + { + "epoch": 13.2952691680261, + "grad_norm": 0.003535378258675337, + "learning_rate": 0.00030502763520895556, + "loss": 0.0019, + "num_input_tokens_seen": 176036096, + "step": 81500 + }, + { + "epoch": 13.296084828711257, + "grad_norm": 0.030523095279932022, + "learning_rate": 0.00030496209208053643, + "loss": 0.004, + "num_input_tokens_seen": 176048128, + "step": 81505 + }, + { + "epoch": 13.29690048939641, + "grad_norm": 0.0029729788657277822, + "learning_rate": 0.0003048965529048078, + "loss": 0.0946, + "num_input_tokens_seen": 176058848, + "step": 81510 + }, + { + "epoch": 13.297716150081566, + "grad_norm": 0.004624223802238703, + "learning_rate": 0.00030483101768309797, + "loss": 0.0067, + "num_input_tokens_seen": 176069888, + "step": 81515 + }, + { + "epoch": 13.298531810766722, + "grad_norm": 0.05178157985210419, + "learning_rate": 0.00030476548641673537, + "loss": 0.0042, + "num_input_tokens_seen": 176081216, + "step": 81520 + }, + { + "epoch": 13.299347471451876, + "grad_norm": 0.06961380690336227, + "learning_rate": 0.0003046999591070476, + "loss": 0.0042, + "num_input_tokens_seen": 176092288, + "step": 81525 + }, + { + "epoch": 13.300163132137031, + "grad_norm": 0.09609808772802353, + "learning_rate": 0.0003046344357553632, + "loss": 0.0458, + "num_input_tokens_seen": 176103456, + "step": 81530 + }, + { + "epoch": 13.300978792822185, + "grad_norm": 0.6587584018707275, + "learning_rate": 0.0003045689163630095, + "loss": 0.1092, + "num_input_tokens_seen": 176114560, + "step": 81535 + }, + { + "epoch": 13.301794453507341, + "grad_norm": 0.043220143765211105, + "learning_rate": 0.000304503400931315, + "loss": 0.0121, + "num_input_tokens_seen": 176126112, + "step": 81540 + }, + { + "epoch": 13.302610114192497, + "grad_norm": 0.0008794625173322856, + "learning_rate": 0.00030443788946160676, + "loss": 0.0022, + "num_input_tokens_seen": 176137632, + "step": 81545 + }, + { + "epoch": 13.30342577487765, + "grad_norm": 0.001853870926424861, + "learning_rate": 0.000304372381955213, + "loss": 0.0051, + "num_input_tokens_seen": 176148512, + "step": 81550 + }, + { + "epoch": 13.304241435562806, + "grad_norm": 0.0031397638376802206, + "learning_rate": 0.00030430687841346096, + "loss": 0.0021, + "num_input_tokens_seen": 176159680, + "step": 81555 + }, + { + "epoch": 13.30505709624796, + "grad_norm": 0.013141549192368984, + "learning_rate": 0.00030424137883767826, + "loss": 0.0033, + "num_input_tokens_seen": 176169568, + "step": 81560 + }, + { + "epoch": 13.305872756933116, + "grad_norm": 0.0030862074345350266, + "learning_rate": 0.00030417588322919243, + "loss": 0.0033, + "num_input_tokens_seen": 176181120, + "step": 81565 + }, + { + "epoch": 13.30668841761827, + "grad_norm": 0.10375366359949112, + "learning_rate": 0.00030411039158933075, + "loss": 0.0037, + "num_input_tokens_seen": 176191264, + "step": 81570 + }, + { + "epoch": 13.307504078303426, + "grad_norm": 0.0012237022165209055, + "learning_rate": 0.0003040449039194205, + "loss": 0.0048, + "num_input_tokens_seen": 176202464, + "step": 81575 + }, + { + "epoch": 13.308319738988581, + "grad_norm": 0.008354654535651207, + "learning_rate": 0.00030397942022078884, + "loss": 0.0634, + "num_input_tokens_seen": 176213920, + "step": 81580 + }, + { + "epoch": 13.309135399673735, + "grad_norm": 0.022409193217754364, + "learning_rate": 0.00030391394049476275, + "loss": 0.002, + "num_input_tokens_seen": 176224672, + "step": 81585 + }, + { + "epoch": 13.309951060358891, + "grad_norm": 0.0038927465211600065, + "learning_rate": 0.00030384846474266965, + "loss": 0.0267, + "num_input_tokens_seen": 176234176, + "step": 81590 + }, + { + "epoch": 13.310766721044045, + "grad_norm": 0.004279454704374075, + "learning_rate": 0.0003037829929658361, + "loss": 0.0043, + "num_input_tokens_seen": 176245792, + "step": 81595 + }, + { + "epoch": 13.3115823817292, + "grad_norm": 0.006501410156488419, + "learning_rate": 0.0003037175251655892, + "loss": 0.0762, + "num_input_tokens_seen": 176255392, + "step": 81600 + }, + { + "epoch": 13.312398042414356, + "grad_norm": 0.07710537314414978, + "learning_rate": 0.0003036520613432555, + "loss": 0.0291, + "num_input_tokens_seen": 176266368, + "step": 81605 + }, + { + "epoch": 13.31321370309951, + "grad_norm": 0.01355685107409954, + "learning_rate": 0.0003035866015001621, + "loss": 0.1107, + "num_input_tokens_seen": 176278080, + "step": 81610 + }, + { + "epoch": 13.314029363784666, + "grad_norm": 0.00818372517824173, + "learning_rate": 0.00030352114563763515, + "loss": 0.003, + "num_input_tokens_seen": 176289792, + "step": 81615 + }, + { + "epoch": 13.31484502446982, + "grad_norm": 0.39141932129859924, + "learning_rate": 0.00030345569375700145, + "loss": 0.1082, + "num_input_tokens_seen": 176299840, + "step": 81620 + }, + { + "epoch": 13.315660685154976, + "grad_norm": 0.4454955458641052, + "learning_rate": 0.0003033902458595877, + "loss": 0.0579, + "num_input_tokens_seen": 176310688, + "step": 81625 + }, + { + "epoch": 13.31647634584013, + "grad_norm": 0.0065238154493272305, + "learning_rate": 0.00030332480194671975, + "loss": 0.0022, + "num_input_tokens_seen": 176321312, + "step": 81630 + }, + { + "epoch": 13.317292006525285, + "grad_norm": 0.0018881976138800383, + "learning_rate": 0.0003032593620197245, + "loss": 0.0037, + "num_input_tokens_seen": 176333344, + "step": 81635 + }, + { + "epoch": 13.318107667210441, + "grad_norm": 0.0015710759907960892, + "learning_rate": 0.0003031939260799276, + "loss": 0.0308, + "num_input_tokens_seen": 176343552, + "step": 81640 + }, + { + "epoch": 13.318923327895595, + "grad_norm": 0.8399984240531921, + "learning_rate": 0.00030312849412865564, + "loss": 0.0926, + "num_input_tokens_seen": 176354912, + "step": 81645 + }, + { + "epoch": 13.31973898858075, + "grad_norm": 0.366643488407135, + "learning_rate": 0.00030306306616723424, + "loss": 0.0228, + "num_input_tokens_seen": 176366048, + "step": 81650 + }, + { + "epoch": 13.320554649265905, + "grad_norm": 0.031812455505132675, + "learning_rate": 0.00030299764219698987, + "loss": 0.0104, + "num_input_tokens_seen": 176377120, + "step": 81655 + }, + { + "epoch": 13.32137030995106, + "grad_norm": 0.08288736641407013, + "learning_rate": 0.00030293222221924805, + "loss": 0.0131, + "num_input_tokens_seen": 176387968, + "step": 81660 + }, + { + "epoch": 13.322185970636216, + "grad_norm": 0.003492532530799508, + "learning_rate": 0.0003028668062353349, + "loss": 0.0057, + "num_input_tokens_seen": 176397472, + "step": 81665 + }, + { + "epoch": 13.32300163132137, + "grad_norm": 0.004395823460072279, + "learning_rate": 0.0003028013942465758, + "loss": 0.0439, + "num_input_tokens_seen": 176408224, + "step": 81670 + }, + { + "epoch": 13.323817292006526, + "grad_norm": 0.002135923132300377, + "learning_rate": 0.00030273598625429687, + "loss": 0.0013, + "num_input_tokens_seen": 176419552, + "step": 81675 + }, + { + "epoch": 13.32463295269168, + "grad_norm": 0.002409202978014946, + "learning_rate": 0.00030267058225982315, + "loss": 0.0033, + "num_input_tokens_seen": 176430272, + "step": 81680 + }, + { + "epoch": 13.325448613376835, + "grad_norm": 0.0041789524257183075, + "learning_rate": 0.00030260518226448064, + "loss": 0.0046, + "num_input_tokens_seen": 176440064, + "step": 81685 + }, + { + "epoch": 13.326264274061991, + "grad_norm": 0.00487999664619565, + "learning_rate": 0.00030253978626959435, + "loss": 0.0181, + "num_input_tokens_seen": 176450400, + "step": 81690 + }, + { + "epoch": 13.327079934747145, + "grad_norm": 0.00734216021373868, + "learning_rate": 0.00030247439427649, + "loss": 0.0113, + "num_input_tokens_seen": 176459968, + "step": 81695 + }, + { + "epoch": 13.3278955954323, + "grad_norm": 0.0037031807005405426, + "learning_rate": 0.0003024090062864924, + "loss": 0.0018, + "num_input_tokens_seen": 176470080, + "step": 81700 + }, + { + "epoch": 13.328711256117455, + "grad_norm": 0.023402415215969086, + "learning_rate": 0.00030234362230092705, + "loss": 0.0032, + "num_input_tokens_seen": 176481120, + "step": 81705 + }, + { + "epoch": 13.32952691680261, + "grad_norm": 0.012723736464977264, + "learning_rate": 0.0003022782423211189, + "loss": 0.0144, + "num_input_tokens_seen": 176492864, + "step": 81710 + }, + { + "epoch": 13.330342577487766, + "grad_norm": 0.01561590563505888, + "learning_rate": 0.0003022128663483931, + "loss": 0.009, + "num_input_tokens_seen": 176503232, + "step": 81715 + }, + { + "epoch": 13.33115823817292, + "grad_norm": 0.009133810177445412, + "learning_rate": 0.0003021474943840743, + "loss": 0.0068, + "num_input_tokens_seen": 176513024, + "step": 81720 + }, + { + "epoch": 13.331973898858076, + "grad_norm": 0.0011698536109179258, + "learning_rate": 0.00030208212642948755, + "loss": 0.0057, + "num_input_tokens_seen": 176524288, + "step": 81725 + }, + { + "epoch": 13.33278955954323, + "grad_norm": 0.3569418489933014, + "learning_rate": 0.0003020167624859577, + "loss": 0.018, + "num_input_tokens_seen": 176535744, + "step": 81730 + }, + { + "epoch": 13.333605220228385, + "grad_norm": 0.13893947005271912, + "learning_rate": 0.00030195140255480927, + "loss": 0.008, + "num_input_tokens_seen": 176546848, + "step": 81735 + }, + { + "epoch": 13.33442088091354, + "grad_norm": 0.004899358842521906, + "learning_rate": 0.0003018860466373669, + "loss": 0.0664, + "num_input_tokens_seen": 176558336, + "step": 81740 + }, + { + "epoch": 13.335236541598695, + "grad_norm": 0.006934888660907745, + "learning_rate": 0.0003018206947349551, + "loss": 0.0036, + "num_input_tokens_seen": 176570304, + "step": 81745 + }, + { + "epoch": 13.33605220228385, + "grad_norm": 0.6603171825408936, + "learning_rate": 0.00030175534684889836, + "loss": 0.0089, + "num_input_tokens_seen": 176580224, + "step": 81750 + }, + { + "epoch": 13.336867862969005, + "grad_norm": 0.07441363483667374, + "learning_rate": 0.00030169000298052096, + "loss": 0.0081, + "num_input_tokens_seen": 176590784, + "step": 81755 + }, + { + "epoch": 13.33768352365416, + "grad_norm": 0.021589141339063644, + "learning_rate": 0.00030162466313114734, + "loss": 0.0076, + "num_input_tokens_seen": 176601248, + "step": 81760 + }, + { + "epoch": 13.338499184339314, + "grad_norm": 0.007798145059496164, + "learning_rate": 0.00030155932730210145, + "loss": 0.0069, + "num_input_tokens_seen": 176612704, + "step": 81765 + }, + { + "epoch": 13.33931484502447, + "grad_norm": 0.018131252378225327, + "learning_rate": 0.00030149399549470767, + "loss": 0.0031, + "num_input_tokens_seen": 176623424, + "step": 81770 + }, + { + "epoch": 13.340130505709626, + "grad_norm": 0.014698930084705353, + "learning_rate": 0.00030142866771028974, + "loss": 0.0048, + "num_input_tokens_seen": 176634048, + "step": 81775 + }, + { + "epoch": 13.34094616639478, + "grad_norm": 0.0022598044015467167, + "learning_rate": 0.00030136334395017197, + "loss": 0.1079, + "num_input_tokens_seen": 176644704, + "step": 81780 + }, + { + "epoch": 13.341761827079935, + "grad_norm": 0.0038392143324017525, + "learning_rate": 0.0003012980242156778, + "loss": 0.0019, + "num_input_tokens_seen": 176654848, + "step": 81785 + }, + { + "epoch": 13.34257748776509, + "grad_norm": 0.04017645865678787, + "learning_rate": 0.00030123270850813147, + "loss": 0.0367, + "num_input_tokens_seen": 176666048, + "step": 81790 + }, + { + "epoch": 13.343393148450245, + "grad_norm": 0.00553273456171155, + "learning_rate": 0.0003011673968288562, + "loss": 0.0073, + "num_input_tokens_seen": 176677472, + "step": 81795 + }, + { + "epoch": 13.3442088091354, + "grad_norm": 0.012419447302818298, + "learning_rate": 0.00030110208917917607, + "loss": 0.0023, + "num_input_tokens_seen": 176688864, + "step": 81800 + }, + { + "epoch": 13.345024469820554, + "grad_norm": 0.03526076301932335, + "learning_rate": 0.00030103678556041427, + "loss": 0.0053, + "num_input_tokens_seen": 176700864, + "step": 81805 + }, + { + "epoch": 13.34584013050571, + "grad_norm": 0.0011652401881292462, + "learning_rate": 0.00030097148597389456, + "loss": 0.0067, + "num_input_tokens_seen": 176711936, + "step": 81810 + }, + { + "epoch": 13.346655791190864, + "grad_norm": 0.0031598478090018034, + "learning_rate": 0.00030090619042094, + "loss": 0.0289, + "num_input_tokens_seen": 176722464, + "step": 81815 + }, + { + "epoch": 13.34747145187602, + "grad_norm": 0.002571272198110819, + "learning_rate": 0.0003008408989028743, + "loss": 0.0023, + "num_input_tokens_seen": 176733440, + "step": 81820 + }, + { + "epoch": 13.348287112561174, + "grad_norm": 0.012336530722677708, + "learning_rate": 0.00030077561142102024, + "loss": 0.0098, + "num_input_tokens_seen": 176743104, + "step": 81825 + }, + { + "epoch": 13.34910277324633, + "grad_norm": 0.007683815434575081, + "learning_rate": 0.0003007103279767013, + "loss": 0.009, + "num_input_tokens_seen": 176753792, + "step": 81830 + }, + { + "epoch": 13.349918433931485, + "grad_norm": 0.06616160273551941, + "learning_rate": 0.0003006450485712402, + "loss": 0.0036, + "num_input_tokens_seen": 176764832, + "step": 81835 + }, + { + "epoch": 13.350734094616639, + "grad_norm": 0.0150661151856184, + "learning_rate": 0.00030057977320596007, + "loss": 0.007, + "num_input_tokens_seen": 176776128, + "step": 81840 + }, + { + "epoch": 13.351549755301795, + "grad_norm": 0.0010423744097352028, + "learning_rate": 0.00030051450188218397, + "loss": 0.0071, + "num_input_tokens_seen": 176787072, + "step": 81845 + }, + { + "epoch": 13.352365415986949, + "grad_norm": 0.10020679235458374, + "learning_rate": 0.0003004492346012345, + "loss": 0.019, + "num_input_tokens_seen": 176798816, + "step": 81850 + }, + { + "epoch": 13.353181076672104, + "grad_norm": 0.0933566614985466, + "learning_rate": 0.0003003839713644345, + "loss": 0.0031, + "num_input_tokens_seen": 176809984, + "step": 81855 + }, + { + "epoch": 13.35399673735726, + "grad_norm": 0.023334039375185966, + "learning_rate": 0.0003003187121731064, + "loss": 0.0031, + "num_input_tokens_seen": 176820288, + "step": 81860 + }, + { + "epoch": 13.354812398042414, + "grad_norm": 0.0006777640082873404, + "learning_rate": 0.0003002534570285731, + "loss": 0.0032, + "num_input_tokens_seen": 176830080, + "step": 81865 + }, + { + "epoch": 13.35562805872757, + "grad_norm": 0.0137909771874547, + "learning_rate": 0.00030018820593215675, + "loss": 0.0277, + "num_input_tokens_seen": 176841088, + "step": 81870 + }, + { + "epoch": 13.356443719412724, + "grad_norm": 0.0035228354390710592, + "learning_rate": 0.0003001229588851799, + "loss": 0.0044, + "num_input_tokens_seen": 176852576, + "step": 81875 + }, + { + "epoch": 13.35725938009788, + "grad_norm": 0.010440012440085411, + "learning_rate": 0.0003000577158889649, + "loss": 0.0767, + "num_input_tokens_seen": 176863584, + "step": 81880 + }, + { + "epoch": 13.358075040783035, + "grad_norm": 0.001993312034755945, + "learning_rate": 0.00029999247694483395, + "loss": 0.0017, + "num_input_tokens_seen": 176875584, + "step": 81885 + }, + { + "epoch": 13.358890701468189, + "grad_norm": 0.04758576303720474, + "learning_rate": 0.00029992724205410914, + "loss": 0.004, + "num_input_tokens_seen": 176886752, + "step": 81890 + }, + { + "epoch": 13.359706362153345, + "grad_norm": 0.2109840214252472, + "learning_rate": 0.0002998620112181126, + "loss": 0.0065, + "num_input_tokens_seen": 176896672, + "step": 81895 + }, + { + "epoch": 13.360522022838499, + "grad_norm": 0.043509677052497864, + "learning_rate": 0.0002997967844381662, + "loss": 0.0056, + "num_input_tokens_seen": 176907840, + "step": 81900 + }, + { + "epoch": 13.361337683523654, + "grad_norm": 0.016660314053297043, + "learning_rate": 0.00029973156171559214, + "loss": 0.0402, + "num_input_tokens_seen": 176918368, + "step": 81905 + }, + { + "epoch": 13.362153344208808, + "grad_norm": 0.002962973900139332, + "learning_rate": 0.0002996663430517118, + "loss": 0.0023, + "num_input_tokens_seen": 176929600, + "step": 81910 + }, + { + "epoch": 13.362969004893964, + "grad_norm": 0.0031862088944762945, + "learning_rate": 0.0002996011284478474, + "loss": 0.0023, + "num_input_tokens_seen": 176939360, + "step": 81915 + }, + { + "epoch": 13.36378466557912, + "grad_norm": 0.004130939487367868, + "learning_rate": 0.00029953591790532014, + "loss": 0.0028, + "num_input_tokens_seen": 176950272, + "step": 81920 + }, + { + "epoch": 13.364600326264274, + "grad_norm": 0.0006332904449664056, + "learning_rate": 0.000299470711425452, + "loss": 0.0013, + "num_input_tokens_seen": 176959648, + "step": 81925 + }, + { + "epoch": 13.36541598694943, + "grad_norm": 0.013381035067141056, + "learning_rate": 0.0002994055090095641, + "loss": 0.0059, + "num_input_tokens_seen": 176969600, + "step": 81930 + }, + { + "epoch": 13.366231647634583, + "grad_norm": 0.002639003796502948, + "learning_rate": 0.00029934031065897824, + "loss": 0.0027, + "num_input_tokens_seen": 176980608, + "step": 81935 + }, + { + "epoch": 13.367047308319739, + "grad_norm": 0.004228521604090929, + "learning_rate": 0.00029927511637501536, + "loss": 0.0027, + "num_input_tokens_seen": 176990656, + "step": 81940 + }, + { + "epoch": 13.367862969004895, + "grad_norm": 0.4222617447376251, + "learning_rate": 0.0002992099261589968, + "loss": 0.0184, + "num_input_tokens_seen": 177001120, + "step": 81945 + }, + { + "epoch": 13.368678629690049, + "grad_norm": 0.0035730917006731033, + "learning_rate": 0.00029914474001224413, + "loss": 0.0054, + "num_input_tokens_seen": 177012448, + "step": 81950 + }, + { + "epoch": 13.369494290375204, + "grad_norm": 0.10068176686763763, + "learning_rate": 0.0002990795579360778, + "loss": 0.1867, + "num_input_tokens_seen": 177023168, + "step": 81955 + }, + { + "epoch": 13.370309951060358, + "grad_norm": 0.008547582663595676, + "learning_rate": 0.00029901437993181936, + "loss": 0.0062, + "num_input_tokens_seen": 177035136, + "step": 81960 + }, + { + "epoch": 13.371125611745514, + "grad_norm": 0.4465034306049347, + "learning_rate": 0.0002989492060007893, + "loss": 0.0537, + "num_input_tokens_seen": 177046208, + "step": 81965 + }, + { + "epoch": 13.37194127243067, + "grad_norm": 0.0010136293713003397, + "learning_rate": 0.0002988840361443088, + "loss": 0.0012, + "num_input_tokens_seen": 177056256, + "step": 81970 + }, + { + "epoch": 13.372756933115824, + "grad_norm": 0.001000964897684753, + "learning_rate": 0.0002988188703636983, + "loss": 0.0265, + "num_input_tokens_seen": 177067040, + "step": 81975 + }, + { + "epoch": 13.37357259380098, + "grad_norm": 0.9909574389457703, + "learning_rate": 0.0002987537086602787, + "loss": 0.0442, + "num_input_tokens_seen": 177077120, + "step": 81980 + }, + { + "epoch": 13.374388254486133, + "grad_norm": 0.0032265952322632074, + "learning_rate": 0.0002986885510353703, + "loss": 0.1605, + "num_input_tokens_seen": 177087744, + "step": 81985 + }, + { + "epoch": 13.375203915171289, + "grad_norm": 0.013295507058501244, + "learning_rate": 0.00029862339749029413, + "loss": 0.0092, + "num_input_tokens_seen": 177100096, + "step": 81990 + }, + { + "epoch": 13.376019575856443, + "grad_norm": 0.00572703592479229, + "learning_rate": 0.0002985582480263699, + "loss": 0.0115, + "num_input_tokens_seen": 177110816, + "step": 81995 + }, + { + "epoch": 13.376835236541599, + "grad_norm": 0.013873127289116383, + "learning_rate": 0.00029849310264491865, + "loss": 0.0026, + "num_input_tokens_seen": 177121792, + "step": 82000 + }, + { + "epoch": 13.377650897226754, + "grad_norm": 1.48568856716156, + "learning_rate": 0.00029842796134726, + "loss": 0.0551, + "num_input_tokens_seen": 177131936, + "step": 82005 + }, + { + "epoch": 13.378466557911908, + "grad_norm": 0.036852333694696426, + "learning_rate": 0.0002983628241347147, + "loss": 0.0021, + "num_input_tokens_seen": 177142848, + "step": 82010 + }, + { + "epoch": 13.379282218597064, + "grad_norm": 0.10424373298883438, + "learning_rate": 0.0002982976910086024, + "loss": 0.017, + "num_input_tokens_seen": 177153312, + "step": 82015 + }, + { + "epoch": 13.380097879282218, + "grad_norm": 0.08175593614578247, + "learning_rate": 0.0002982325619702433, + "loss": 0.0063, + "num_input_tokens_seen": 177164224, + "step": 82020 + }, + { + "epoch": 13.380913539967374, + "grad_norm": 0.006303591188043356, + "learning_rate": 0.0002981674370209573, + "loss": 0.0893, + "num_input_tokens_seen": 177174944, + "step": 82025 + }, + { + "epoch": 13.38172920065253, + "grad_norm": 0.0339217446744442, + "learning_rate": 0.00029810231616206426, + "loss": 0.0165, + "num_input_tokens_seen": 177185184, + "step": 82030 + }, + { + "epoch": 13.382544861337683, + "grad_norm": 0.002143233548849821, + "learning_rate": 0.00029803719939488387, + "loss": 0.0025, + "num_input_tokens_seen": 177197536, + "step": 82035 + }, + { + "epoch": 13.383360522022839, + "grad_norm": 0.0740416944026947, + "learning_rate": 0.0002979720867207358, + "loss": 0.0063, + "num_input_tokens_seen": 177209184, + "step": 82040 + }, + { + "epoch": 13.384176182707993, + "grad_norm": 0.005400381051003933, + "learning_rate": 0.0002979069781409397, + "loss": 0.0072, + "num_input_tokens_seen": 177220416, + "step": 82045 + }, + { + "epoch": 13.384991843393149, + "grad_norm": 0.012265348806977272, + "learning_rate": 0.00029784187365681516, + "loss": 0.0061, + "num_input_tokens_seen": 177232032, + "step": 82050 + }, + { + "epoch": 13.385807504078304, + "grad_norm": 0.002419403288513422, + "learning_rate": 0.00029777677326968144, + "loss": 0.0047, + "num_input_tokens_seen": 177243328, + "step": 82055 + }, + { + "epoch": 13.386623164763458, + "grad_norm": 0.0014468590961769223, + "learning_rate": 0.0002977116769808579, + "loss": 0.0027, + "num_input_tokens_seen": 177254368, + "step": 82060 + }, + { + "epoch": 13.387438825448614, + "grad_norm": 0.0005729938857257366, + "learning_rate": 0.000297646584791664, + "loss": 0.0059, + "num_input_tokens_seen": 177264960, + "step": 82065 + }, + { + "epoch": 13.388254486133768, + "grad_norm": 0.023177186027169228, + "learning_rate": 0.0002975814967034185, + "loss": 0.0064, + "num_input_tokens_seen": 177275936, + "step": 82070 + }, + { + "epoch": 13.389070146818923, + "grad_norm": 0.02205835096538067, + "learning_rate": 0.000297516412717441, + "loss": 0.0033, + "num_input_tokens_seen": 177286144, + "step": 82075 + }, + { + "epoch": 13.38988580750408, + "grad_norm": 0.004381283186376095, + "learning_rate": 0.0002974513328350501, + "loss": 0.0034, + "num_input_tokens_seen": 177297408, + "step": 82080 + }, + { + "epoch": 13.390701468189233, + "grad_norm": 0.004533675499260426, + "learning_rate": 0.00029738625705756514, + "loss": 0.0061, + "num_input_tokens_seen": 177309568, + "step": 82085 + }, + { + "epoch": 13.391517128874389, + "grad_norm": 0.002455994486808777, + "learning_rate": 0.0002973211853863044, + "loss": 0.0036, + "num_input_tokens_seen": 177320352, + "step": 82090 + }, + { + "epoch": 13.392332789559543, + "grad_norm": 0.0036011829506605864, + "learning_rate": 0.0002972561178225872, + "loss": 0.0024, + "num_input_tokens_seen": 177330048, + "step": 82095 + }, + { + "epoch": 13.393148450244698, + "grad_norm": 0.0032857232727110386, + "learning_rate": 0.00029719105436773187, + "loss": 0.0017, + "num_input_tokens_seen": 177340896, + "step": 82100 + }, + { + "epoch": 13.393964110929852, + "grad_norm": 0.002111183013767004, + "learning_rate": 0.00029712599502305714, + "loss": 0.0053, + "num_input_tokens_seen": 177352480, + "step": 82105 + }, + { + "epoch": 13.394779771615008, + "grad_norm": 0.0007205134606920183, + "learning_rate": 0.0002970609397898814, + "loss": 0.0022, + "num_input_tokens_seen": 177361856, + "step": 82110 + }, + { + "epoch": 13.395595432300164, + "grad_norm": 0.0033076724503189325, + "learning_rate": 0.0002969958886695233, + "loss": 0.1464, + "num_input_tokens_seen": 177372768, + "step": 82115 + }, + { + "epoch": 13.396411092985318, + "grad_norm": 0.0014311681734398007, + "learning_rate": 0.00029693084166330084, + "loss": 0.009, + "num_input_tokens_seen": 177383360, + "step": 82120 + }, + { + "epoch": 13.397226753670473, + "grad_norm": 0.017201535403728485, + "learning_rate": 0.00029686579877253276, + "loss": 0.0025, + "num_input_tokens_seen": 177393888, + "step": 82125 + }, + { + "epoch": 13.398042414355627, + "grad_norm": 0.00801269244402647, + "learning_rate": 0.0002968007599985367, + "loss": 0.0014, + "num_input_tokens_seen": 177404992, + "step": 82130 + }, + { + "epoch": 13.398858075040783, + "grad_norm": 0.018369020894169807, + "learning_rate": 0.0002967357253426313, + "loss": 0.0023, + "num_input_tokens_seen": 177415136, + "step": 82135 + }, + { + "epoch": 13.399673735725939, + "grad_norm": 0.008904990740120411, + "learning_rate": 0.000296670694806134, + "loss": 0.0965, + "num_input_tokens_seen": 177424256, + "step": 82140 + }, + { + "epoch": 13.400489396411093, + "grad_norm": 0.003861672943457961, + "learning_rate": 0.00029660566839036315, + "loss": 0.0014, + "num_input_tokens_seen": 177435456, + "step": 82145 + }, + { + "epoch": 13.401305057096248, + "grad_norm": 0.447512149810791, + "learning_rate": 0.0002965406460966364, + "loss": 0.0375, + "num_input_tokens_seen": 177445536, + "step": 82150 + }, + { + "epoch": 13.402120717781402, + "grad_norm": 0.00251060351729393, + "learning_rate": 0.00029647562792627145, + "loss": 0.0217, + "num_input_tokens_seen": 177457184, + "step": 82155 + }, + { + "epoch": 13.402936378466558, + "grad_norm": 0.016238614916801453, + "learning_rate": 0.0002964106138805864, + "loss": 0.0031, + "num_input_tokens_seen": 177467488, + "step": 82160 + }, + { + "epoch": 13.403752039151712, + "grad_norm": 0.010289501398801804, + "learning_rate": 0.00029634560396089827, + "loss": 0.0092, + "num_input_tokens_seen": 177477856, + "step": 82165 + }, + { + "epoch": 13.404567699836868, + "grad_norm": 0.0029544297140091658, + "learning_rate": 0.00029628059816852497, + "loss": 0.0494, + "num_input_tokens_seen": 177488480, + "step": 82170 + }, + { + "epoch": 13.405383360522023, + "grad_norm": 0.012710874900221825, + "learning_rate": 0.0002962155965047837, + "loss": 0.029, + "num_input_tokens_seen": 177498656, + "step": 82175 + }, + { + "epoch": 13.406199021207177, + "grad_norm": 0.0044509959407150745, + "learning_rate": 0.00029615059897099196, + "loss": 0.0113, + "num_input_tokens_seen": 177510016, + "step": 82180 + }, + { + "epoch": 13.407014681892333, + "grad_norm": 0.019407030194997787, + "learning_rate": 0.0002960856055684668, + "loss": 0.0017, + "num_input_tokens_seen": 177521216, + "step": 82185 + }, + { + "epoch": 13.407830342577487, + "grad_norm": 0.004138377495110035, + "learning_rate": 0.0002960206162985256, + "loss": 0.0646, + "num_input_tokens_seen": 177530784, + "step": 82190 + }, + { + "epoch": 13.408646003262643, + "grad_norm": 0.0222416240721941, + "learning_rate": 0.0002959556311624855, + "loss": 0.0104, + "num_input_tokens_seen": 177540800, + "step": 82195 + }, + { + "epoch": 13.409461663947798, + "grad_norm": 0.0034956561867147684, + "learning_rate": 0.0002958906501616632, + "loss": 0.0054, + "num_input_tokens_seen": 177551328, + "step": 82200 + }, + { + "epoch": 13.410277324632952, + "grad_norm": 0.0069087352603673935, + "learning_rate": 0.0002958256732973759, + "loss": 0.0024, + "num_input_tokens_seen": 177563872, + "step": 82205 + }, + { + "epoch": 13.411092985318108, + "grad_norm": 0.0064132362604141235, + "learning_rate": 0.00029576070057094034, + "loss": 0.0173, + "num_input_tokens_seen": 177575808, + "step": 82210 + }, + { + "epoch": 13.411908646003262, + "grad_norm": 0.4843251407146454, + "learning_rate": 0.00029569573198367317, + "loss": 0.05, + "num_input_tokens_seen": 177586304, + "step": 82215 + }, + { + "epoch": 13.412724306688418, + "grad_norm": 0.007687193341553211, + "learning_rate": 0.00029563076753689137, + "loss": 0.0161, + "num_input_tokens_seen": 177597760, + "step": 82220 + }, + { + "epoch": 13.413539967373573, + "grad_norm": 0.018417716026306152, + "learning_rate": 0.00029556580723191116, + "loss": 0.0154, + "num_input_tokens_seen": 177608992, + "step": 82225 + }, + { + "epoch": 13.414355628058727, + "grad_norm": 0.03263521566987038, + "learning_rate": 0.00029550085107004937, + "loss": 0.0021, + "num_input_tokens_seen": 177620096, + "step": 82230 + }, + { + "epoch": 13.415171288743883, + "grad_norm": 0.006521139293909073, + "learning_rate": 0.0002954358990526221, + "loss": 0.0106, + "num_input_tokens_seen": 177630848, + "step": 82235 + }, + { + "epoch": 13.415986949429037, + "grad_norm": 0.010773967020213604, + "learning_rate": 0.000295370951180946, + "loss": 0.1575, + "num_input_tokens_seen": 177641600, + "step": 82240 + }, + { + "epoch": 13.416802610114193, + "grad_norm": 0.032416198402643204, + "learning_rate": 0.00029530600745633693, + "loss": 0.0056, + "num_input_tokens_seen": 177652992, + "step": 82245 + }, + { + "epoch": 13.417618270799348, + "grad_norm": 0.0022988219279795885, + "learning_rate": 0.0002952410678801116, + "loss": 0.0608, + "num_input_tokens_seen": 177664384, + "step": 82250 + }, + { + "epoch": 13.418433931484502, + "grad_norm": 0.002156211994588375, + "learning_rate": 0.0002951761324535855, + "loss": 0.0025, + "num_input_tokens_seen": 177674784, + "step": 82255 + }, + { + "epoch": 13.419249592169658, + "grad_norm": 0.002031585667282343, + "learning_rate": 0.00029511120117807493, + "loss": 0.0055, + "num_input_tokens_seen": 177684768, + "step": 82260 + }, + { + "epoch": 13.420065252854812, + "grad_norm": 0.002410220680758357, + "learning_rate": 0.00029504627405489605, + "loss": 0.0031, + "num_input_tokens_seen": 177695456, + "step": 82265 + }, + { + "epoch": 13.420880913539968, + "grad_norm": 0.11466678231954575, + "learning_rate": 0.0002949813510853641, + "loss": 0.0986, + "num_input_tokens_seen": 177706912, + "step": 82270 + }, + { + "epoch": 13.421696574225122, + "grad_norm": 0.0004403699131216854, + "learning_rate": 0.00029491643227079543, + "loss": 0.0071, + "num_input_tokens_seen": 177718240, + "step": 82275 + }, + { + "epoch": 13.422512234910277, + "grad_norm": 0.019405366852879524, + "learning_rate": 0.00029485151761250527, + "loss": 0.1391, + "num_input_tokens_seen": 177728384, + "step": 82280 + }, + { + "epoch": 13.423327895595433, + "grad_norm": 0.0068669854663312435, + "learning_rate": 0.0002947866071118095, + "loss": 0.0063, + "num_input_tokens_seen": 177739328, + "step": 82285 + }, + { + "epoch": 13.424143556280587, + "grad_norm": 0.4046556353569031, + "learning_rate": 0.00029472170077002324, + "loss": 0.129, + "num_input_tokens_seen": 177749120, + "step": 82290 + }, + { + "epoch": 13.424959216965743, + "grad_norm": 0.0009423498995602131, + "learning_rate": 0.0002946567985884624, + "loss": 0.0036, + "num_input_tokens_seen": 177759808, + "step": 82295 + }, + { + "epoch": 13.425774877650896, + "grad_norm": 0.004357687663286924, + "learning_rate": 0.0002945919005684418, + "loss": 0.0039, + "num_input_tokens_seen": 177768512, + "step": 82300 + }, + { + "epoch": 13.426590538336052, + "grad_norm": 0.0007036282331682742, + "learning_rate": 0.0002945270067112771, + "loss": 0.0063, + "num_input_tokens_seen": 177779104, + "step": 82305 + }, + { + "epoch": 13.427406199021208, + "grad_norm": 0.004518335685133934, + "learning_rate": 0.0002944621170182831, + "loss": 0.0024, + "num_input_tokens_seen": 177789024, + "step": 82310 + }, + { + "epoch": 13.428221859706362, + "grad_norm": 0.002193318447098136, + "learning_rate": 0.00029439723149077523, + "loss": 0.0025, + "num_input_tokens_seen": 177799392, + "step": 82315 + }, + { + "epoch": 13.429037520391518, + "grad_norm": 0.050887856632471085, + "learning_rate": 0.0002943323501300681, + "loss": 0.0037, + "num_input_tokens_seen": 177809088, + "step": 82320 + }, + { + "epoch": 13.429853181076671, + "grad_norm": 0.05680084228515625, + "learning_rate": 0.00029426747293747685, + "loss": 0.0064, + "num_input_tokens_seen": 177820608, + "step": 82325 + }, + { + "epoch": 13.430668841761827, + "grad_norm": 0.00340280425734818, + "learning_rate": 0.00029420259991431633, + "loss": 0.006, + "num_input_tokens_seen": 177832352, + "step": 82330 + }, + { + "epoch": 13.431484502446983, + "grad_norm": 0.018648672848939896, + "learning_rate": 0.0002941377310619011, + "loss": 0.0082, + "num_input_tokens_seen": 177843616, + "step": 82335 + }, + { + "epoch": 13.432300163132137, + "grad_norm": 1.2718156576156616, + "learning_rate": 0.00029407286638154597, + "loss": 0.0984, + "num_input_tokens_seen": 177854144, + "step": 82340 + }, + { + "epoch": 13.433115823817293, + "grad_norm": 0.009632064029574394, + "learning_rate": 0.00029400800587456544, + "loss": 0.0028, + "num_input_tokens_seen": 177865152, + "step": 82345 + }, + { + "epoch": 13.433931484502446, + "grad_norm": 0.0709516853094101, + "learning_rate": 0.00029394314954227387, + "loss": 0.0043, + "num_input_tokens_seen": 177876416, + "step": 82350 + }, + { + "epoch": 13.434747145187602, + "grad_norm": 0.0647798404097557, + "learning_rate": 0.000293878297385986, + "loss": 0.0214, + "num_input_tokens_seen": 177887424, + "step": 82355 + }, + { + "epoch": 13.435562805872756, + "grad_norm": 0.007885068655014038, + "learning_rate": 0.0002938134494070157, + "loss": 0.0053, + "num_input_tokens_seen": 177899392, + "step": 82360 + }, + { + "epoch": 13.436378466557912, + "grad_norm": 0.016093425452709198, + "learning_rate": 0.00029374860560667747, + "loss": 0.0134, + "num_input_tokens_seen": 177910048, + "step": 82365 + }, + { + "epoch": 13.437194127243067, + "grad_norm": 0.01708192005753517, + "learning_rate": 0.00029368376598628545, + "loss": 0.0023, + "num_input_tokens_seen": 177920224, + "step": 82370 + }, + { + "epoch": 13.438009787928221, + "grad_norm": 0.0012648747069761157, + "learning_rate": 0.00029361893054715365, + "loss": 0.0066, + "num_input_tokens_seen": 177930880, + "step": 82375 + }, + { + "epoch": 13.438825448613377, + "grad_norm": 0.007827023044228554, + "learning_rate": 0.000293554099290596, + "loss": 0.0212, + "num_input_tokens_seen": 177942016, + "step": 82380 + }, + { + "epoch": 13.439641109298531, + "grad_norm": 0.009230383671820164, + "learning_rate": 0.0002934892722179264, + "loss": 0.0015, + "num_input_tokens_seen": 177950560, + "step": 82385 + }, + { + "epoch": 13.440456769983687, + "grad_norm": 0.0039056178648024797, + "learning_rate": 0.0002934244493304588, + "loss": 0.1685, + "num_input_tokens_seen": 177961088, + "step": 82390 + }, + { + "epoch": 13.441272430668842, + "grad_norm": 0.0029746955260634422, + "learning_rate": 0.0002933596306295066, + "loss": 0.0086, + "num_input_tokens_seen": 177970656, + "step": 82395 + }, + { + "epoch": 13.442088091353996, + "grad_norm": 0.009408979676663876, + "learning_rate": 0.0002932948161163839, + "loss": 0.1229, + "num_input_tokens_seen": 177981792, + "step": 82400 + }, + { + "epoch": 13.442903752039152, + "grad_norm": 0.06069672852754593, + "learning_rate": 0.0002932300057924037, + "loss": 0.0038, + "num_input_tokens_seen": 177992480, + "step": 82405 + }, + { + "epoch": 13.443719412724306, + "grad_norm": 0.0016424978384748101, + "learning_rate": 0.0002931651996588799, + "loss": 0.0783, + "num_input_tokens_seen": 178002944, + "step": 82410 + }, + { + "epoch": 13.444535073409462, + "grad_norm": 0.12447664141654968, + "learning_rate": 0.0002931003977171256, + "loss": 0.009, + "num_input_tokens_seen": 178012704, + "step": 82415 + }, + { + "epoch": 13.445350734094617, + "grad_norm": 0.0165871512144804, + "learning_rate": 0.00029303559996845434, + "loss": 0.0542, + "num_input_tokens_seen": 178024160, + "step": 82420 + }, + { + "epoch": 13.446166394779771, + "grad_norm": 0.003005496459081769, + "learning_rate": 0.00029297080641417907, + "loss": 0.034, + "num_input_tokens_seen": 178035552, + "step": 82425 + }, + { + "epoch": 13.446982055464927, + "grad_norm": 0.0012617846950888634, + "learning_rate": 0.0002929060170556132, + "loss": 0.1092, + "num_input_tokens_seen": 178046848, + "step": 82430 + }, + { + "epoch": 13.447797716150081, + "grad_norm": 0.6911592483520508, + "learning_rate": 0.00029284123189406944, + "loss": 0.1113, + "num_input_tokens_seen": 178058432, + "step": 82435 + }, + { + "epoch": 13.448613376835237, + "grad_norm": 0.03883085772395134, + "learning_rate": 0.00029277645093086114, + "loss": 0.0076, + "num_input_tokens_seen": 178070496, + "step": 82440 + }, + { + "epoch": 13.449429037520392, + "grad_norm": 0.0312496330589056, + "learning_rate": 0.00029271167416730073, + "loss": 0.0099, + "num_input_tokens_seen": 178080960, + "step": 82445 + }, + { + "epoch": 13.450244698205546, + "grad_norm": 0.008447905071079731, + "learning_rate": 0.0002926469016047013, + "loss": 0.0788, + "num_input_tokens_seen": 178092000, + "step": 82450 + }, + { + "epoch": 13.451060358890702, + "grad_norm": 0.031177420169115067, + "learning_rate": 0.00029258213324437533, + "loss": 0.0185, + "num_input_tokens_seen": 178102336, + "step": 82455 + }, + { + "epoch": 13.451876019575856, + "grad_norm": 0.10189617425203323, + "learning_rate": 0.00029251736908763584, + "loss": 0.0469, + "num_input_tokens_seen": 178113248, + "step": 82460 + }, + { + "epoch": 13.452691680261012, + "grad_norm": 0.6185612678527832, + "learning_rate": 0.00029245260913579477, + "loss": 0.0228, + "num_input_tokens_seen": 178124192, + "step": 82465 + }, + { + "epoch": 13.453507340946166, + "grad_norm": 0.013284931890666485, + "learning_rate": 0.00029238785339016487, + "loss": 0.0224, + "num_input_tokens_seen": 178135424, + "step": 82470 + }, + { + "epoch": 13.454323001631321, + "grad_norm": 0.19014760851860046, + "learning_rate": 0.0002923231018520588, + "loss": 0.0096, + "num_input_tokens_seen": 178145952, + "step": 82475 + }, + { + "epoch": 13.455138662316477, + "grad_norm": 0.006610923446714878, + "learning_rate": 0.0002922583545227882, + "loss": 0.0039, + "num_input_tokens_seen": 178156768, + "step": 82480 + }, + { + "epoch": 13.455954323001631, + "grad_norm": 0.14724524319171906, + "learning_rate": 0.00029219361140366587, + "loss": 0.0108, + "num_input_tokens_seen": 178167616, + "step": 82485 + }, + { + "epoch": 13.456769983686787, + "grad_norm": 0.026569068431854248, + "learning_rate": 0.0002921288724960034, + "loss": 0.0036, + "num_input_tokens_seen": 178179008, + "step": 82490 + }, + { + "epoch": 13.45758564437194, + "grad_norm": 0.009068429470062256, + "learning_rate": 0.00029206413780111305, + "loss": 0.0427, + "num_input_tokens_seen": 178189312, + "step": 82495 + }, + { + "epoch": 13.458401305057096, + "grad_norm": 0.002162341959774494, + "learning_rate": 0.00029199940732030686, + "loss": 0.0084, + "num_input_tokens_seen": 178198688, + "step": 82500 + }, + { + "epoch": 13.459216965742252, + "grad_norm": 0.008699199184775352, + "learning_rate": 0.0002919346810548965, + "loss": 0.052, + "num_input_tokens_seen": 178208768, + "step": 82505 + }, + { + "epoch": 13.460032626427406, + "grad_norm": 1.6339211463928223, + "learning_rate": 0.00029186995900619373, + "loss": 0.0332, + "num_input_tokens_seen": 178219776, + "step": 82510 + }, + { + "epoch": 13.460848287112562, + "grad_norm": 0.012227796018123627, + "learning_rate": 0.00029180524117551035, + "loss": 0.0082, + "num_input_tokens_seen": 178230560, + "step": 82515 + }, + { + "epoch": 13.461663947797716, + "grad_norm": 0.000886613386683166, + "learning_rate": 0.0002917405275641578, + "loss": 0.0126, + "num_input_tokens_seen": 178240704, + "step": 82520 + }, + { + "epoch": 13.462479608482871, + "grad_norm": 0.04731287062168121, + "learning_rate": 0.00029167581817344775, + "loss": 0.0819, + "num_input_tokens_seen": 178252096, + "step": 82525 + }, + { + "epoch": 13.463295269168025, + "grad_norm": 0.0025267750024795532, + "learning_rate": 0.00029161111300469143, + "loss": 0.0022, + "num_input_tokens_seen": 178262400, + "step": 82530 + }, + { + "epoch": 13.464110929853181, + "grad_norm": 0.04110927879810333, + "learning_rate": 0.0002915464120592003, + "loss": 0.0065, + "num_input_tokens_seen": 178273568, + "step": 82535 + }, + { + "epoch": 13.464926590538337, + "grad_norm": 0.007446099538356066, + "learning_rate": 0.0002914817153382856, + "loss": 0.0027, + "num_input_tokens_seen": 178284576, + "step": 82540 + }, + { + "epoch": 13.46574225122349, + "grad_norm": 0.005990834906697273, + "learning_rate": 0.00029141702284325846, + "loss": 0.0069, + "num_input_tokens_seen": 178295456, + "step": 82545 + }, + { + "epoch": 13.466557911908646, + "grad_norm": 0.017972951754927635, + "learning_rate": 0.0002913523345754299, + "loss": 0.0037, + "num_input_tokens_seen": 178306848, + "step": 82550 + }, + { + "epoch": 13.4673735725938, + "grad_norm": 0.0014292324194684625, + "learning_rate": 0.0002912876505361111, + "loss": 0.0045, + "num_input_tokens_seen": 178318688, + "step": 82555 + }, + { + "epoch": 13.468189233278956, + "grad_norm": 0.010235908441245556, + "learning_rate": 0.00029122297072661264, + "loss": 0.0165, + "num_input_tokens_seen": 178328288, + "step": 82560 + }, + { + "epoch": 13.469004893964112, + "grad_norm": 0.019145233556628227, + "learning_rate": 0.00029115829514824565, + "loss": 0.018, + "num_input_tokens_seen": 178337600, + "step": 82565 + }, + { + "epoch": 13.469820554649266, + "grad_norm": 0.0088666882365942, + "learning_rate": 0.00029109362380232075, + "loss": 0.0045, + "num_input_tokens_seen": 178347552, + "step": 82570 + }, + { + "epoch": 13.470636215334421, + "grad_norm": 0.036711398512125015, + "learning_rate": 0.0002910289566901485, + "loss": 0.0779, + "num_input_tokens_seen": 178359520, + "step": 82575 + }, + { + "epoch": 13.471451876019575, + "grad_norm": 0.004469173029065132, + "learning_rate": 0.0002909642938130394, + "loss": 0.0034, + "num_input_tokens_seen": 178371104, + "step": 82580 + }, + { + "epoch": 13.47226753670473, + "grad_norm": 0.005990062840282917, + "learning_rate": 0.0002908996351723043, + "loss": 0.0013, + "num_input_tokens_seen": 178381984, + "step": 82585 + }, + { + "epoch": 13.473083197389887, + "grad_norm": 0.0013577784411609173, + "learning_rate": 0.0002908349807692533, + "loss": 0.008, + "num_input_tokens_seen": 178392032, + "step": 82590 + }, + { + "epoch": 13.47389885807504, + "grad_norm": 0.29914239048957825, + "learning_rate": 0.00029077033060519674, + "loss": 0.0557, + "num_input_tokens_seen": 178402272, + "step": 82595 + }, + { + "epoch": 13.474714518760196, + "grad_norm": 0.006352984346449375, + "learning_rate": 0.0002907056846814449, + "loss": 0.0029, + "num_input_tokens_seen": 178412992, + "step": 82600 + }, + { + "epoch": 13.47553017944535, + "grad_norm": 0.003830127650871873, + "learning_rate": 0.00029064104299930785, + "loss": 0.0168, + "num_input_tokens_seen": 178423904, + "step": 82605 + }, + { + "epoch": 13.476345840130506, + "grad_norm": 0.0034729652106761932, + "learning_rate": 0.00029057640556009567, + "loss": 0.0399, + "num_input_tokens_seen": 178433376, + "step": 82610 + }, + { + "epoch": 13.477161500815662, + "grad_norm": 0.0034876209683716297, + "learning_rate": 0.0002905117723651183, + "loss": 0.094, + "num_input_tokens_seen": 178444480, + "step": 82615 + }, + { + "epoch": 13.477977161500815, + "grad_norm": 0.06251949816942215, + "learning_rate": 0.0002904471434156856, + "loss": 0.0109, + "num_input_tokens_seen": 178455328, + "step": 82620 + }, + { + "epoch": 13.478792822185971, + "grad_norm": 0.006139388307929039, + "learning_rate": 0.0002903825187131074, + "loss": 0.0031, + "num_input_tokens_seen": 178465888, + "step": 82625 + }, + { + "epoch": 13.479608482871125, + "grad_norm": 0.01711699180305004, + "learning_rate": 0.00029031789825869334, + "loss": 0.0057, + "num_input_tokens_seen": 178476288, + "step": 82630 + }, + { + "epoch": 13.48042414355628, + "grad_norm": 0.009003594517707825, + "learning_rate": 0.0002902532820537531, + "loss": 0.0763, + "num_input_tokens_seen": 178486688, + "step": 82635 + }, + { + "epoch": 13.481239804241435, + "grad_norm": 0.0052217403426766396, + "learning_rate": 0.00029018867009959623, + "loss": 0.0043, + "num_input_tokens_seen": 178496928, + "step": 82640 + }, + { + "epoch": 13.48205546492659, + "grad_norm": 0.15717969834804535, + "learning_rate": 0.0002901240623975321, + "loss": 0.0928, + "num_input_tokens_seen": 178507296, + "step": 82645 + }, + { + "epoch": 13.482871125611746, + "grad_norm": 0.13155721127986908, + "learning_rate": 0.00029005945894887, + "loss": 0.119, + "num_input_tokens_seen": 178518592, + "step": 82650 + }, + { + "epoch": 13.4836867862969, + "grad_norm": 0.026101280003786087, + "learning_rate": 0.0002899948597549194, + "loss": 0.0057, + "num_input_tokens_seen": 178528896, + "step": 82655 + }, + { + "epoch": 13.484502446982056, + "grad_norm": 0.35371142625808716, + "learning_rate": 0.00028993026481698934, + "loss": 0.1464, + "num_input_tokens_seen": 178539712, + "step": 82660 + }, + { + "epoch": 13.48531810766721, + "grad_norm": 0.16254015266895294, + "learning_rate": 0.00028986567413638895, + "loss": 0.0077, + "num_input_tokens_seen": 178549728, + "step": 82665 + }, + { + "epoch": 13.486133768352365, + "grad_norm": 0.010674665682017803, + "learning_rate": 0.00028980108771442726, + "loss": 0.0255, + "num_input_tokens_seen": 178561248, + "step": 82670 + }, + { + "epoch": 13.486949429037521, + "grad_norm": 0.0005801775259897113, + "learning_rate": 0.00028973650555241316, + "loss": 0.0074, + "num_input_tokens_seen": 178572256, + "step": 82675 + }, + { + "epoch": 13.487765089722675, + "grad_norm": 0.04390391334891319, + "learning_rate": 0.0002896719276516555, + "loss": 0.0084, + "num_input_tokens_seen": 178583584, + "step": 82680 + }, + { + "epoch": 13.48858075040783, + "grad_norm": 0.718368649482727, + "learning_rate": 0.0002896073540134631, + "loss": 0.0385, + "num_input_tokens_seen": 178594720, + "step": 82685 + }, + { + "epoch": 13.489396411092985, + "grad_norm": 0.02619067020714283, + "learning_rate": 0.00028954278463914435, + "loss": 0.0058, + "num_input_tokens_seen": 178604480, + "step": 82690 + }, + { + "epoch": 13.49021207177814, + "grad_norm": 0.005772117991000414, + "learning_rate": 0.00028947821953000845, + "loss": 0.0516, + "num_input_tokens_seen": 178614272, + "step": 82695 + }, + { + "epoch": 13.491027732463296, + "grad_norm": 0.0048257578164339066, + "learning_rate": 0.00028941365868736315, + "loss": 0.0043, + "num_input_tokens_seen": 178624608, + "step": 82700 + }, + { + "epoch": 13.49184339314845, + "grad_norm": 0.1376570165157318, + "learning_rate": 0.00028934910211251755, + "loss": 0.0073, + "num_input_tokens_seen": 178635552, + "step": 82705 + }, + { + "epoch": 13.492659053833606, + "grad_norm": 0.0016758694546297193, + "learning_rate": 0.0002892845498067792, + "loss": 0.1607, + "num_input_tokens_seen": 178647008, + "step": 82710 + }, + { + "epoch": 13.49347471451876, + "grad_norm": 0.00668095238506794, + "learning_rate": 0.0002892200017714572, + "loss": 0.0149, + "num_input_tokens_seen": 178659040, + "step": 82715 + }, + { + "epoch": 13.494290375203915, + "grad_norm": 0.056710727512836456, + "learning_rate": 0.00028915545800785883, + "loss": 0.044, + "num_input_tokens_seen": 178669600, + "step": 82720 + }, + { + "epoch": 13.49510603588907, + "grad_norm": 0.02826865203678608, + "learning_rate": 0.0002890909185172928, + "loss": 0.0159, + "num_input_tokens_seen": 178678560, + "step": 82725 + }, + { + "epoch": 13.495921696574225, + "grad_norm": 1.417874813079834, + "learning_rate": 0.00028902638330106684, + "loss": 0.0397, + "num_input_tokens_seen": 178689504, + "step": 82730 + }, + { + "epoch": 13.49673735725938, + "grad_norm": 0.0027501648291945457, + "learning_rate": 0.0002889618523604889, + "loss": 0.1004, + "num_input_tokens_seen": 178700608, + "step": 82735 + }, + { + "epoch": 13.497553017944535, + "grad_norm": 0.003090892219915986, + "learning_rate": 0.0002888973256968667, + "loss": 0.0373, + "num_input_tokens_seen": 178711744, + "step": 82740 + }, + { + "epoch": 13.49836867862969, + "grad_norm": 0.009183863177895546, + "learning_rate": 0.000288832803311508, + "loss": 0.0142, + "num_input_tokens_seen": 178722016, + "step": 82745 + }, + { + "epoch": 13.499184339314844, + "grad_norm": 0.0019286853494122624, + "learning_rate": 0.00028876828520572043, + "loss": 0.0031, + "num_input_tokens_seen": 178733056, + "step": 82750 + }, + { + "epoch": 13.5, + "grad_norm": 0.013043577782809734, + "learning_rate": 0.0002887037713808116, + "loss": 0.0038, + "num_input_tokens_seen": 178744384, + "step": 82755 + }, + { + "epoch": 13.500815660685156, + "grad_norm": 0.005044702906161547, + "learning_rate": 0.0002886392618380888, + "loss": 0.0636, + "num_input_tokens_seen": 178755104, + "step": 82760 + }, + { + "epoch": 13.50163132137031, + "grad_norm": 0.08455964177846909, + "learning_rate": 0.00028857475657885956, + "loss": 0.0137, + "num_input_tokens_seen": 178766208, + "step": 82765 + }, + { + "epoch": 13.502446982055465, + "grad_norm": 0.1594812124967575, + "learning_rate": 0.00028851025560443103, + "loss": 0.0424, + "num_input_tokens_seen": 178776672, + "step": 82770 + }, + { + "epoch": 13.50326264274062, + "grad_norm": 0.0038309351075440645, + "learning_rate": 0.0002884457589161105, + "loss": 0.0946, + "num_input_tokens_seen": 178787648, + "step": 82775 + }, + { + "epoch": 13.504078303425775, + "grad_norm": 0.004739274736493826, + "learning_rate": 0.000288381266515205, + "loss": 0.004, + "num_input_tokens_seen": 178799328, + "step": 82780 + }, + { + "epoch": 13.50489396411093, + "grad_norm": 0.010969329625368118, + "learning_rate": 0.0002883167784030216, + "loss": 0.0991, + "num_input_tokens_seen": 178809984, + "step": 82785 + }, + { + "epoch": 13.505709624796085, + "grad_norm": 0.20839625597000122, + "learning_rate": 0.00028825229458086726, + "loss": 0.0167, + "num_input_tokens_seen": 178819392, + "step": 82790 + }, + { + "epoch": 13.50652528548124, + "grad_norm": 0.0025618516374379396, + "learning_rate": 0.0002881878150500486, + "loss": 0.1612, + "num_input_tokens_seen": 178830624, + "step": 82795 + }, + { + "epoch": 13.507340946166394, + "grad_norm": 0.116312175989151, + "learning_rate": 0.00028812333981187297, + "loss": 0.1245, + "num_input_tokens_seen": 178840512, + "step": 82800 + }, + { + "epoch": 13.50815660685155, + "grad_norm": 0.020539624616503716, + "learning_rate": 0.00028805886886764623, + "loss": 0.0041, + "num_input_tokens_seen": 178851168, + "step": 82805 + }, + { + "epoch": 13.508972267536706, + "grad_norm": 0.052292123436927795, + "learning_rate": 0.00028799440221867576, + "loss": 0.007, + "num_input_tokens_seen": 178862112, + "step": 82810 + }, + { + "epoch": 13.50978792822186, + "grad_norm": 0.04409019276499748, + "learning_rate": 0.00028792993986626725, + "loss": 0.0053, + "num_input_tokens_seen": 178873888, + "step": 82815 + }, + { + "epoch": 13.510603588907015, + "grad_norm": 0.10002454370260239, + "learning_rate": 0.000287865481811728, + "loss": 0.0152, + "num_input_tokens_seen": 178884448, + "step": 82820 + }, + { + "epoch": 13.51141924959217, + "grad_norm": 0.004765619989484549, + "learning_rate": 0.00028780102805636346, + "loss": 0.0023, + "num_input_tokens_seen": 178895648, + "step": 82825 + }, + { + "epoch": 13.512234910277325, + "grad_norm": 0.011683987453579903, + "learning_rate": 0.0002877365786014806, + "loss": 0.0039, + "num_input_tokens_seen": 178906304, + "step": 82830 + }, + { + "epoch": 13.513050570962479, + "grad_norm": 0.0020222472958266735, + "learning_rate": 0.00028767213344838493, + "loss": 0.0322, + "num_input_tokens_seen": 178917440, + "step": 82835 + }, + { + "epoch": 13.513866231647635, + "grad_norm": 0.0032288068905472755, + "learning_rate": 0.00028760769259838327, + "loss": 0.1212, + "num_input_tokens_seen": 178929216, + "step": 82840 + }, + { + "epoch": 13.51468189233279, + "grad_norm": 0.008228391408920288, + "learning_rate": 0.00028754325605278067, + "loss": 0.1432, + "num_input_tokens_seen": 178940512, + "step": 82845 + }, + { + "epoch": 13.515497553017944, + "grad_norm": 0.0787166953086853, + "learning_rate": 0.00028747882381288393, + "loss": 0.016, + "num_input_tokens_seen": 178951040, + "step": 82850 + }, + { + "epoch": 13.5163132137031, + "grad_norm": 0.007508369162678719, + "learning_rate": 0.00028741439587999805, + "loss": 0.0067, + "num_input_tokens_seen": 178962784, + "step": 82855 + }, + { + "epoch": 13.517128874388254, + "grad_norm": 0.0014908368466421962, + "learning_rate": 0.00028734997225542954, + "loss": 0.0084, + "num_input_tokens_seen": 178973696, + "step": 82860 + }, + { + "epoch": 13.51794453507341, + "grad_norm": 0.519140362739563, + "learning_rate": 0.0002872855529404832, + "loss": 0.0338, + "num_input_tokens_seen": 178984960, + "step": 82865 + }, + { + "epoch": 13.518760195758565, + "grad_norm": 0.015057248063385487, + "learning_rate": 0.0002872211379364651, + "loss": 0.0158, + "num_input_tokens_seen": 178995904, + "step": 82870 + }, + { + "epoch": 13.51957585644372, + "grad_norm": 0.19566139578819275, + "learning_rate": 0.00028715672724468065, + "loss": 0.0091, + "num_input_tokens_seen": 179006784, + "step": 82875 + }, + { + "epoch": 13.520391517128875, + "grad_norm": 0.012074259109795094, + "learning_rate": 0.0002870923208664351, + "loss": 0.0082, + "num_input_tokens_seen": 179017472, + "step": 82880 + }, + { + "epoch": 13.521207177814029, + "grad_norm": 0.01812121272087097, + "learning_rate": 0.0002870279188030338, + "loss": 0.0047, + "num_input_tokens_seen": 179028960, + "step": 82885 + }, + { + "epoch": 13.522022838499185, + "grad_norm": 0.08663219213485718, + "learning_rate": 0.00028696352105578185, + "loss": 0.0073, + "num_input_tokens_seen": 179040448, + "step": 82890 + }, + { + "epoch": 13.522838499184338, + "grad_norm": 0.008986803703010082, + "learning_rate": 0.0002868991276259844, + "loss": 0.0028, + "num_input_tokens_seen": 179052064, + "step": 82895 + }, + { + "epoch": 13.523654159869494, + "grad_norm": 0.001685730996541679, + "learning_rate": 0.0002868347385149465, + "loss": 0.0104, + "num_input_tokens_seen": 179062464, + "step": 82900 + }, + { + "epoch": 13.52446982055465, + "grad_norm": 0.033600907772779465, + "learning_rate": 0.000286770353723973, + "loss": 0.0366, + "num_input_tokens_seen": 179073152, + "step": 82905 + }, + { + "epoch": 13.525285481239804, + "grad_norm": 0.014643524773418903, + "learning_rate": 0.00028670597325436886, + "loss": 0.0072, + "num_input_tokens_seen": 179083328, + "step": 82910 + }, + { + "epoch": 13.52610114192496, + "grad_norm": 0.010406344197690487, + "learning_rate": 0.0002866415971074387, + "loss": 0.0086, + "num_input_tokens_seen": 179093920, + "step": 82915 + }, + { + "epoch": 13.526916802610113, + "grad_norm": 0.006890468765050173, + "learning_rate": 0.000286577225284487, + "loss": 0.0045, + "num_input_tokens_seen": 179105344, + "step": 82920 + }, + { + "epoch": 13.52773246329527, + "grad_norm": 0.0029026255942881107, + "learning_rate": 0.00028651285778681906, + "loss": 0.0105, + "num_input_tokens_seen": 179116704, + "step": 82925 + }, + { + "epoch": 13.528548123980425, + "grad_norm": 0.003112967126071453, + "learning_rate": 0.00028644849461573847, + "loss": 0.0316, + "num_input_tokens_seen": 179128032, + "step": 82930 + }, + { + "epoch": 13.529363784665579, + "grad_norm": 0.026661496609449387, + "learning_rate": 0.0002863841357725504, + "loss": 0.0105, + "num_input_tokens_seen": 179139104, + "step": 82935 + }, + { + "epoch": 13.530179445350734, + "grad_norm": 0.011198869906365871, + "learning_rate": 0.00028631978125855844, + "loss": 0.0066, + "num_input_tokens_seen": 179149984, + "step": 82940 + }, + { + "epoch": 13.530995106035888, + "grad_norm": 0.29773199558258057, + "learning_rate": 0.0002862554310750676, + "loss": 0.0216, + "num_input_tokens_seen": 179161184, + "step": 82945 + }, + { + "epoch": 13.531810766721044, + "grad_norm": 0.010510992258787155, + "learning_rate": 0.0002861910852233812, + "loss": 0.0101, + "num_input_tokens_seen": 179173024, + "step": 82950 + }, + { + "epoch": 13.5326264274062, + "grad_norm": 0.002821574453264475, + "learning_rate": 0.00028612674370480406, + "loss": 0.0054, + "num_input_tokens_seen": 179183520, + "step": 82955 + }, + { + "epoch": 13.533442088091354, + "grad_norm": 0.005407446064054966, + "learning_rate": 0.0002860624065206394, + "loss": 0.0724, + "num_input_tokens_seen": 179194240, + "step": 82960 + }, + { + "epoch": 13.53425774877651, + "grad_norm": 0.0026232078671455383, + "learning_rate": 0.0002859980736721918, + "loss": 0.0075, + "num_input_tokens_seen": 179203424, + "step": 82965 + }, + { + "epoch": 13.535073409461663, + "grad_norm": 0.06447270512580872, + "learning_rate": 0.0002859337451607644, + "loss": 0.0889, + "num_input_tokens_seen": 179214880, + "step": 82970 + }, + { + "epoch": 13.535889070146819, + "grad_norm": 0.005597327370196581, + "learning_rate": 0.0002858694209876616, + "loss": 0.0119, + "num_input_tokens_seen": 179226080, + "step": 82975 + }, + { + "epoch": 13.536704730831975, + "grad_norm": 0.04041731357574463, + "learning_rate": 0.00028580510115418624, + "loss": 0.1063, + "num_input_tokens_seen": 179237856, + "step": 82980 + }, + { + "epoch": 13.537520391517129, + "grad_norm": 0.0014987689210101962, + "learning_rate": 0.0002857407856616426, + "loss": 0.0037, + "num_input_tokens_seen": 179248928, + "step": 82985 + }, + { + "epoch": 13.538336052202284, + "grad_norm": 0.018002718687057495, + "learning_rate": 0.0002856764745113334, + "loss": 0.0121, + "num_input_tokens_seen": 179260128, + "step": 82990 + }, + { + "epoch": 13.539151712887438, + "grad_norm": 0.3203493356704712, + "learning_rate": 0.00028561216770456267, + "loss": 0.0239, + "num_input_tokens_seen": 179270912, + "step": 82995 + }, + { + "epoch": 13.539967373572594, + "grad_norm": 0.01611095853149891, + "learning_rate": 0.000285547865242633, + "loss": 0.0031, + "num_input_tokens_seen": 179280864, + "step": 83000 + }, + { + "epoch": 13.540783034257748, + "grad_norm": 0.012098937295377254, + "learning_rate": 0.000285483567126848, + "loss": 0.0111, + "num_input_tokens_seen": 179292288, + "step": 83005 + }, + { + "epoch": 13.541598694942904, + "grad_norm": 0.009693530388176441, + "learning_rate": 0.0002854192733585107, + "loss": 0.0047, + "num_input_tokens_seen": 179302432, + "step": 83010 + }, + { + "epoch": 13.54241435562806, + "grad_norm": 0.0010672948556020856, + "learning_rate": 0.000285354983938924, + "loss": 0.0031, + "num_input_tokens_seen": 179313440, + "step": 83015 + }, + { + "epoch": 13.543230016313213, + "grad_norm": 0.05182463303208351, + "learning_rate": 0.0002852906988693909, + "loss": 0.0099, + "num_input_tokens_seen": 179324960, + "step": 83020 + }, + { + "epoch": 13.544045676998369, + "grad_norm": 0.10404568165540695, + "learning_rate": 0.0002852264181512142, + "loss": 0.0073, + "num_input_tokens_seen": 179335424, + "step": 83025 + }, + { + "epoch": 13.544861337683523, + "grad_norm": 0.013150133192539215, + "learning_rate": 0.00028516214178569656, + "loss": 0.0091, + "num_input_tokens_seen": 179347392, + "step": 83030 + }, + { + "epoch": 13.545676998368679, + "grad_norm": 0.05724117159843445, + "learning_rate": 0.0002850978697741406, + "loss": 0.0117, + "num_input_tokens_seen": 179357920, + "step": 83035 + }, + { + "epoch": 13.546492659053834, + "grad_norm": 0.004341833759099245, + "learning_rate": 0.000285033602117849, + "loss": 0.0033, + "num_input_tokens_seen": 179368736, + "step": 83040 + }, + { + "epoch": 13.547308319738988, + "grad_norm": 0.03877821937203407, + "learning_rate": 0.0002849693388181241, + "loss": 0.0161, + "num_input_tokens_seen": 179380320, + "step": 83045 + }, + { + "epoch": 13.548123980424144, + "grad_norm": 0.015632281079888344, + "learning_rate": 0.00028490507987626837, + "loss": 0.0901, + "num_input_tokens_seen": 179389792, + "step": 83050 + }, + { + "epoch": 13.548939641109298, + "grad_norm": 0.0029565368313342333, + "learning_rate": 0.00028484082529358403, + "loss": 0.0129, + "num_input_tokens_seen": 179401504, + "step": 83055 + }, + { + "epoch": 13.549755301794454, + "grad_norm": 0.6263442039489746, + "learning_rate": 0.0002847765750713733, + "loss": 0.1232, + "num_input_tokens_seen": 179412736, + "step": 83060 + }, + { + "epoch": 13.550570962479608, + "grad_norm": 0.009329462423920631, + "learning_rate": 0.0002847123292109382, + "loss": 0.0026, + "num_input_tokens_seen": 179424288, + "step": 83065 + }, + { + "epoch": 13.551386623164763, + "grad_norm": 0.004615492187440395, + "learning_rate": 0.0002846480877135812, + "loss": 0.0025, + "num_input_tokens_seen": 179435904, + "step": 83070 + }, + { + "epoch": 13.552202283849919, + "grad_norm": 0.08717557042837143, + "learning_rate": 0.00028458385058060355, + "loss": 0.0095, + "num_input_tokens_seen": 179446688, + "step": 83075 + }, + { + "epoch": 13.553017944535073, + "grad_norm": 0.05257457494735718, + "learning_rate": 0.0002845196178133078, + "loss": 0.104, + "num_input_tokens_seen": 179456992, + "step": 83080 + }, + { + "epoch": 13.553833605220229, + "grad_norm": 0.0037878549192100763, + "learning_rate": 0.00028445538941299493, + "loss": 0.0027, + "num_input_tokens_seen": 179468000, + "step": 83085 + }, + { + "epoch": 13.554649265905383, + "grad_norm": 0.015577950514853, + "learning_rate": 0.00028439116538096743, + "loss": 0.0133, + "num_input_tokens_seen": 179478880, + "step": 83090 + }, + { + "epoch": 13.555464926590538, + "grad_norm": 0.011540939100086689, + "learning_rate": 0.0002843269457185261, + "loss": 0.1051, + "num_input_tokens_seen": 179490208, + "step": 83095 + }, + { + "epoch": 13.556280587275694, + "grad_norm": 0.005807126872241497, + "learning_rate": 0.00028426273042697327, + "loss": 0.0278, + "num_input_tokens_seen": 179501056, + "step": 83100 + }, + { + "epoch": 13.557096247960848, + "grad_norm": 0.3871236741542816, + "learning_rate": 0.0002841985195076094, + "loss": 0.1398, + "num_input_tokens_seen": 179511872, + "step": 83105 + }, + { + "epoch": 13.557911908646004, + "grad_norm": 0.002544855000451207, + "learning_rate": 0.0002841343129617365, + "loss": 0.0028, + "num_input_tokens_seen": 179522784, + "step": 83110 + }, + { + "epoch": 13.558727569331158, + "grad_norm": 0.006097372155636549, + "learning_rate": 0.0002840701107906557, + "loss": 0.0069, + "num_input_tokens_seen": 179534528, + "step": 83115 + }, + { + "epoch": 13.559543230016313, + "grad_norm": 0.22003108263015747, + "learning_rate": 0.00028400591299566793, + "loss": 0.0302, + "num_input_tokens_seen": 179545312, + "step": 83120 + }, + { + "epoch": 13.560358890701469, + "grad_norm": 0.006511087529361248, + "learning_rate": 0.00028394171957807433, + "loss": 0.0388, + "num_input_tokens_seen": 179554720, + "step": 83125 + }, + { + "epoch": 13.561174551386623, + "grad_norm": 0.013055982068181038, + "learning_rate": 0.000283877530539176, + "loss": 0.1136, + "num_input_tokens_seen": 179566304, + "step": 83130 + }, + { + "epoch": 13.561990212071779, + "grad_norm": 1.0412561893463135, + "learning_rate": 0.00028381334588027353, + "loss": 0.0204, + "num_input_tokens_seen": 179575968, + "step": 83135 + }, + { + "epoch": 13.562805872756933, + "grad_norm": 0.006631617899984121, + "learning_rate": 0.00028374916560266794, + "loss": 0.0116, + "num_input_tokens_seen": 179586688, + "step": 83140 + }, + { + "epoch": 13.563621533442088, + "grad_norm": 0.004046297632157803, + "learning_rate": 0.0002836849897076598, + "loss": 0.0019, + "num_input_tokens_seen": 179597280, + "step": 83145 + }, + { + "epoch": 13.564437194127244, + "grad_norm": 0.054977841675281525, + "learning_rate": 0.00028362081819654984, + "loss": 0.0079, + "num_input_tokens_seen": 179607904, + "step": 83150 + }, + { + "epoch": 13.565252854812398, + "grad_norm": 0.00645839050412178, + "learning_rate": 0.00028355665107063845, + "loss": 0.0023, + "num_input_tokens_seen": 179620256, + "step": 83155 + }, + { + "epoch": 13.566068515497554, + "grad_norm": 0.0034914060961455107, + "learning_rate": 0.00028349248833122603, + "loss": 0.0818, + "num_input_tokens_seen": 179630720, + "step": 83160 + }, + { + "epoch": 13.566884176182707, + "grad_norm": 0.03189859911799431, + "learning_rate": 0.0002834283299796131, + "loss": 0.0569, + "num_input_tokens_seen": 179640352, + "step": 83165 + }, + { + "epoch": 13.567699836867863, + "grad_norm": 0.010987777262926102, + "learning_rate": 0.00028336417601709975, + "loss": 0.0198, + "num_input_tokens_seen": 179652128, + "step": 83170 + }, + { + "epoch": 13.568515497553017, + "grad_norm": 0.009719179011881351, + "learning_rate": 0.0002833000264449862, + "loss": 0.0786, + "num_input_tokens_seen": 179663584, + "step": 83175 + }, + { + "epoch": 13.569331158238173, + "grad_norm": 0.008665206842124462, + "learning_rate": 0.00028323588126457255, + "loss": 0.0055, + "num_input_tokens_seen": 179674912, + "step": 83180 + }, + { + "epoch": 13.570146818923329, + "grad_norm": 0.002040495164692402, + "learning_rate": 0.00028317174047715873, + "loss": 0.093, + "num_input_tokens_seen": 179685664, + "step": 83185 + }, + { + "epoch": 13.570962479608482, + "grad_norm": 0.010339323431253433, + "learning_rate": 0.0002831076040840446, + "loss": 0.0843, + "num_input_tokens_seen": 179696608, + "step": 83190 + }, + { + "epoch": 13.571778140293638, + "grad_norm": 0.03784068301320076, + "learning_rate": 0.0002830434720865301, + "loss": 0.0898, + "num_input_tokens_seen": 179706560, + "step": 83195 + }, + { + "epoch": 13.572593800978792, + "grad_norm": 0.011570471338927746, + "learning_rate": 0.0002829793444859148, + "loss": 0.0027, + "num_input_tokens_seen": 179718336, + "step": 83200 + }, + { + "epoch": 13.573409461663948, + "grad_norm": 0.0015834379009902477, + "learning_rate": 0.0002829152212834984, + "loss": 0.0662, + "num_input_tokens_seen": 179728896, + "step": 83205 + }, + { + "epoch": 13.574225122349104, + "grad_norm": 0.025010643526911736, + "learning_rate": 0.0002828511024805803, + "loss": 0.0212, + "num_input_tokens_seen": 179739424, + "step": 83210 + }, + { + "epoch": 13.575040783034257, + "grad_norm": 0.03622874245047569, + "learning_rate": 0.0002827869880784605, + "loss": 0.0386, + "num_input_tokens_seen": 179749024, + "step": 83215 + }, + { + "epoch": 13.575856443719413, + "grad_norm": 0.2895975112915039, + "learning_rate": 0.00028272287807843744, + "loss": 0.0153, + "num_input_tokens_seen": 179759712, + "step": 83220 + }, + { + "epoch": 13.576672104404567, + "grad_norm": 0.03724903613328934, + "learning_rate": 0.00028265877248181113, + "loss": 0.1597, + "num_input_tokens_seen": 179769344, + "step": 83225 + }, + { + "epoch": 13.577487765089723, + "grad_norm": 0.005164381116628647, + "learning_rate": 0.0002825946712898806, + "loss": 0.0158, + "num_input_tokens_seen": 179781216, + "step": 83230 + }, + { + "epoch": 13.578303425774878, + "grad_norm": 0.08302219212055206, + "learning_rate": 0.0002825305745039447, + "loss": 0.1403, + "num_input_tokens_seen": 179792448, + "step": 83235 + }, + { + "epoch": 13.579119086460032, + "grad_norm": 0.05873296037316322, + "learning_rate": 0.00028246648212530267, + "loss": 0.0066, + "num_input_tokens_seen": 179802528, + "step": 83240 + }, + { + "epoch": 13.579934747145188, + "grad_norm": 0.04700169339776039, + "learning_rate": 0.00028240239415525337, + "loss": 0.0096, + "num_input_tokens_seen": 179813216, + "step": 83245 + }, + { + "epoch": 13.580750407830342, + "grad_norm": 0.05630794167518616, + "learning_rate": 0.0002823383105950955, + "loss": 0.0401, + "num_input_tokens_seen": 179824864, + "step": 83250 + }, + { + "epoch": 13.581566068515498, + "grad_norm": 0.035866476595401764, + "learning_rate": 0.00028227423144612794, + "loss": 0.0995, + "num_input_tokens_seen": 179836416, + "step": 83255 + }, + { + "epoch": 13.582381729200652, + "grad_norm": 0.021164124831557274, + "learning_rate": 0.00028221015670964935, + "loss": 0.0109, + "num_input_tokens_seen": 179848064, + "step": 83260 + }, + { + "epoch": 13.583197389885807, + "grad_norm": 0.40941694378852844, + "learning_rate": 0.0002821460863869582, + "loss": 0.0318, + "num_input_tokens_seen": 179859968, + "step": 83265 + }, + { + "epoch": 13.584013050570963, + "grad_norm": 0.0200693067163229, + "learning_rate": 0.0002820820204793529, + "loss": 0.0065, + "num_input_tokens_seen": 179870816, + "step": 83270 + }, + { + "epoch": 13.584828711256117, + "grad_norm": 0.6771963238716125, + "learning_rate": 0.0002820179589881319, + "loss": 0.0799, + "num_input_tokens_seen": 179881952, + "step": 83275 + }, + { + "epoch": 13.585644371941273, + "grad_norm": 0.01566133089363575, + "learning_rate": 0.00028195390191459356, + "loss": 0.0265, + "num_input_tokens_seen": 179892896, + "step": 83280 + }, + { + "epoch": 13.586460032626427, + "grad_norm": 0.00984056293964386, + "learning_rate": 0.000281889849260036, + "loss": 0.0339, + "num_input_tokens_seen": 179903584, + "step": 83285 + }, + { + "epoch": 13.587275693311582, + "grad_norm": 0.014284429140388966, + "learning_rate": 0.00028182580102575726, + "loss": 0.0325, + "num_input_tokens_seen": 179914528, + "step": 83290 + }, + { + "epoch": 13.588091353996738, + "grad_norm": 0.010695664212107658, + "learning_rate": 0.00028176175721305555, + "loss": 0.0376, + "num_input_tokens_seen": 179925184, + "step": 83295 + }, + { + "epoch": 13.588907014681892, + "grad_norm": 0.04023078456521034, + "learning_rate": 0.0002816977178232286, + "loss": 0.0056, + "num_input_tokens_seen": 179934880, + "step": 83300 + }, + { + "epoch": 13.589722675367048, + "grad_norm": 0.11148897558450699, + "learning_rate": 0.0002816336828575744, + "loss": 0.0134, + "num_input_tokens_seen": 179944768, + "step": 83305 + }, + { + "epoch": 13.590538336052202, + "grad_norm": 0.02717600390315056, + "learning_rate": 0.0002815696523173906, + "loss": 0.0038, + "num_input_tokens_seen": 179954240, + "step": 83310 + }, + { + "epoch": 13.591353996737357, + "grad_norm": 0.05596160888671875, + "learning_rate": 0.0002815056262039749, + "loss": 0.0077, + "num_input_tokens_seen": 179965568, + "step": 83315 + }, + { + "epoch": 13.592169657422513, + "grad_norm": 0.008291252888739109, + "learning_rate": 0.0002814416045186249, + "loss": 0.0573, + "num_input_tokens_seen": 179976032, + "step": 83320 + }, + { + "epoch": 13.592985318107667, + "grad_norm": 1.1229381561279297, + "learning_rate": 0.00028137758726263796, + "loss": 0.0458, + "num_input_tokens_seen": 179987232, + "step": 83325 + }, + { + "epoch": 13.593800978792823, + "grad_norm": 0.07029259949922562, + "learning_rate": 0.0002813135744373114, + "loss": 0.0212, + "num_input_tokens_seen": 179998176, + "step": 83330 + }, + { + "epoch": 13.594616639477977, + "grad_norm": 0.0030100038275122643, + "learning_rate": 0.000281249566043943, + "loss": 0.018, + "num_input_tokens_seen": 180008768, + "step": 83335 + }, + { + "epoch": 13.595432300163132, + "grad_norm": 0.005541645456105471, + "learning_rate": 0.0002811855620838294, + "loss": 0.0071, + "num_input_tokens_seen": 180018208, + "step": 83340 + }, + { + "epoch": 13.596247960848288, + "grad_norm": 0.008036543615162373, + "learning_rate": 0.00028112156255826826, + "loss": 0.0709, + "num_input_tokens_seen": 180029792, + "step": 83345 + }, + { + "epoch": 13.597063621533442, + "grad_norm": 0.0052529601380229, + "learning_rate": 0.000281057567468556, + "loss": 0.009, + "num_input_tokens_seen": 180041184, + "step": 83350 + }, + { + "epoch": 13.597879282218598, + "grad_norm": 0.1867624670267105, + "learning_rate": 0.00028099357681599004, + "loss": 0.011, + "num_input_tokens_seen": 180052032, + "step": 83355 + }, + { + "epoch": 13.598694942903752, + "grad_norm": 0.04028286039829254, + "learning_rate": 0.0002809295906018671, + "loss": 0.0057, + "num_input_tokens_seen": 180061664, + "step": 83360 + }, + { + "epoch": 13.599510603588907, + "grad_norm": 0.056735120713710785, + "learning_rate": 0.00028086560882748386, + "loss": 0.1783, + "num_input_tokens_seen": 180072960, + "step": 83365 + }, + { + "epoch": 13.600326264274061, + "grad_norm": 0.0013902663486078382, + "learning_rate": 0.00028080163149413705, + "loss": 0.0133, + "num_input_tokens_seen": 180084576, + "step": 83370 + }, + { + "epoch": 13.601141924959217, + "grad_norm": 0.0006005800678394735, + "learning_rate": 0.0002807376586031233, + "loss": 0.0062, + "num_input_tokens_seen": 180095008, + "step": 83375 + }, + { + "epoch": 13.601957585644373, + "grad_norm": 0.002407664433121681, + "learning_rate": 0.0002806736901557391, + "loss": 0.0853, + "num_input_tokens_seen": 180106784, + "step": 83380 + }, + { + "epoch": 13.602773246329527, + "grad_norm": 0.04227178543806076, + "learning_rate": 0.00028060972615328065, + "loss": 0.013, + "num_input_tokens_seen": 180117152, + "step": 83385 + }, + { + "epoch": 13.603588907014682, + "grad_norm": 0.00501489220187068, + "learning_rate": 0.00028054576659704457, + "loss": 0.0408, + "num_input_tokens_seen": 180128576, + "step": 83390 + }, + { + "epoch": 13.604404567699836, + "grad_norm": 0.00669768825173378, + "learning_rate": 0.00028048181148832685, + "loss": 0.0052, + "num_input_tokens_seen": 180139264, + "step": 83395 + }, + { + "epoch": 13.605220228384992, + "grad_norm": 0.02733282558619976, + "learning_rate": 0.00028041786082842366, + "loss": 0.0031, + "num_input_tokens_seen": 180148736, + "step": 83400 + }, + { + "epoch": 13.606035889070148, + "grad_norm": 0.0024951754603534937, + "learning_rate": 0.0002803539146186311, + "loss": 0.0068, + "num_input_tokens_seen": 180158944, + "step": 83405 + }, + { + "epoch": 13.606851549755302, + "grad_norm": 0.0043714833445847034, + "learning_rate": 0.0002802899728602452, + "loss": 0.0069, + "num_input_tokens_seen": 180169664, + "step": 83410 + }, + { + "epoch": 13.607667210440457, + "grad_norm": 0.0753655731678009, + "learning_rate": 0.00028022603555456164, + "loss": 0.0284, + "num_input_tokens_seen": 180180064, + "step": 83415 + }, + { + "epoch": 13.608482871125611, + "grad_norm": 0.013464689254760742, + "learning_rate": 0.00028016210270287635, + "loss": 0.0039, + "num_input_tokens_seen": 180190912, + "step": 83420 + }, + { + "epoch": 13.609298531810767, + "grad_norm": 0.06028769165277481, + "learning_rate": 0.00028009817430648483, + "loss": 0.0051, + "num_input_tokens_seen": 180202880, + "step": 83425 + }, + { + "epoch": 13.61011419249592, + "grad_norm": 0.014267779886722565, + "learning_rate": 0.00028003425036668287, + "loss": 0.0056, + "num_input_tokens_seen": 180213728, + "step": 83430 + }, + { + "epoch": 13.610929853181077, + "grad_norm": 0.0189402736723423, + "learning_rate": 0.00027997033088476554, + "loss": 0.0193, + "num_input_tokens_seen": 180223936, + "step": 83435 + }, + { + "epoch": 13.611745513866232, + "grad_norm": 0.18437455594539642, + "learning_rate": 0.000279906415862029, + "loss": 0.0112, + "num_input_tokens_seen": 180234624, + "step": 83440 + }, + { + "epoch": 13.612561174551386, + "grad_norm": 0.2045287787914276, + "learning_rate": 0.00027984250529976783, + "loss": 0.0131, + "num_input_tokens_seen": 180245088, + "step": 83445 + }, + { + "epoch": 13.613376835236542, + "grad_norm": 0.06810999661684036, + "learning_rate": 0.000279778599199278, + "loss": 0.0113, + "num_input_tokens_seen": 180257344, + "step": 83450 + }, + { + "epoch": 13.614192495921696, + "grad_norm": 0.01237307209521532, + "learning_rate": 0.0002797146975618538, + "loss": 0.0069, + "num_input_tokens_seen": 180268672, + "step": 83455 + }, + { + "epoch": 13.615008156606851, + "grad_norm": 0.0055781882256269455, + "learning_rate": 0.0002796508003887911, + "loss": 0.0053, + "num_input_tokens_seen": 180280640, + "step": 83460 + }, + { + "epoch": 13.615823817292007, + "grad_norm": 0.08466996997594833, + "learning_rate": 0.00027958690768138406, + "loss": 0.0102, + "num_input_tokens_seen": 180290656, + "step": 83465 + }, + { + "epoch": 13.616639477977161, + "grad_norm": 0.2827630639076233, + "learning_rate": 0.0002795230194409283, + "loss": 0.0136, + "num_input_tokens_seen": 180300672, + "step": 83470 + }, + { + "epoch": 13.617455138662317, + "grad_norm": 0.037059321999549866, + "learning_rate": 0.00027945913566871793, + "loss": 0.0044, + "num_input_tokens_seen": 180311008, + "step": 83475 + }, + { + "epoch": 13.61827079934747, + "grad_norm": 0.38653436303138733, + "learning_rate": 0.0002793952563660483, + "loss": 0.0219, + "num_input_tokens_seen": 180322048, + "step": 83480 + }, + { + "epoch": 13.619086460032626, + "grad_norm": 0.250503808259964, + "learning_rate": 0.0002793313815342133, + "loss": 0.0205, + "num_input_tokens_seen": 180332960, + "step": 83485 + }, + { + "epoch": 13.619902120717782, + "grad_norm": 0.4236673414707184, + "learning_rate": 0.0002792675111745081, + "loss": 0.0141, + "num_input_tokens_seen": 180343040, + "step": 83490 + }, + { + "epoch": 13.620717781402936, + "grad_norm": 0.009462445043027401, + "learning_rate": 0.0002792036452882265, + "loss": 0.1319, + "num_input_tokens_seen": 180353792, + "step": 83495 + }, + { + "epoch": 13.621533442088092, + "grad_norm": 0.005843911319971085, + "learning_rate": 0.00027913978387666326, + "loss": 0.0058, + "num_input_tokens_seen": 180363264, + "step": 83500 + }, + { + "epoch": 13.622349102773246, + "grad_norm": 0.06309421360492706, + "learning_rate": 0.0002790759269411125, + "loss": 0.0062, + "num_input_tokens_seen": 180374272, + "step": 83505 + }, + { + "epoch": 13.623164763458401, + "grad_norm": 0.005838216748088598, + "learning_rate": 0.00027901207448286836, + "loss": 0.112, + "num_input_tokens_seen": 180385952, + "step": 83510 + }, + { + "epoch": 13.623980424143557, + "grad_norm": 0.15519148111343384, + "learning_rate": 0.0002789482265032249, + "loss": 0.0082, + "num_input_tokens_seen": 180396192, + "step": 83515 + }, + { + "epoch": 13.624796084828711, + "grad_norm": 0.024984611198306084, + "learning_rate": 0.00027888438300347607, + "loss": 0.1304, + "num_input_tokens_seen": 180406976, + "step": 83520 + }, + { + "epoch": 13.625611745513867, + "grad_norm": 0.3052389323711395, + "learning_rate": 0.00027882054398491564, + "loss": 0.0616, + "num_input_tokens_seen": 180418752, + "step": 83525 + }, + { + "epoch": 13.62642740619902, + "grad_norm": 0.007396694738417864, + "learning_rate": 0.0002787567094488375, + "loss": 0.0045, + "num_input_tokens_seen": 180430048, + "step": 83530 + }, + { + "epoch": 13.627243066884176, + "grad_norm": 0.019618911668658257, + "learning_rate": 0.00027869287939653534, + "loss": 0.0045, + "num_input_tokens_seen": 180441792, + "step": 83535 + }, + { + "epoch": 13.62805872756933, + "grad_norm": 0.012303023599088192, + "learning_rate": 0.0002786290538293027, + "loss": 0.0082, + "num_input_tokens_seen": 180453152, + "step": 83540 + }, + { + "epoch": 13.628874388254486, + "grad_norm": 0.09575760364532471, + "learning_rate": 0.00027856523274843314, + "loss": 0.0049, + "num_input_tokens_seen": 180463296, + "step": 83545 + }, + { + "epoch": 13.629690048939642, + "grad_norm": 0.014896899461746216, + "learning_rate": 0.00027850141615521983, + "loss": 0.1152, + "num_input_tokens_seen": 180473856, + "step": 83550 + }, + { + "epoch": 13.630505709624796, + "grad_norm": 0.14120285212993622, + "learning_rate": 0.0002784376040509567, + "loss": 0.0099, + "num_input_tokens_seen": 180483264, + "step": 83555 + }, + { + "epoch": 13.631321370309951, + "grad_norm": 0.01524326205253601, + "learning_rate": 0.00027837379643693615, + "loss": 0.0083, + "num_input_tokens_seen": 180493376, + "step": 83560 + }, + { + "epoch": 13.632137030995105, + "grad_norm": 0.161854088306427, + "learning_rate": 0.0002783099933144523, + "loss": 0.0526, + "num_input_tokens_seen": 180504064, + "step": 83565 + }, + { + "epoch": 13.632952691680261, + "grad_norm": 0.0047560338862240314, + "learning_rate": 0.00027824619468479715, + "loss": 0.0112, + "num_input_tokens_seen": 180515392, + "step": 83570 + }, + { + "epoch": 13.633768352365417, + "grad_norm": 0.027127450332045555, + "learning_rate": 0.00027818240054926463, + "loss": 0.007, + "num_input_tokens_seen": 180525216, + "step": 83575 + }, + { + "epoch": 13.63458401305057, + "grad_norm": 0.0030362384859472513, + "learning_rate": 0.0002781186109091467, + "loss": 0.0076, + "num_input_tokens_seen": 180537024, + "step": 83580 + }, + { + "epoch": 13.635399673735726, + "grad_norm": 0.00605916790664196, + "learning_rate": 0.0002780548257657371, + "loss": 0.0026, + "num_input_tokens_seen": 180547744, + "step": 83585 + }, + { + "epoch": 13.63621533442088, + "grad_norm": 0.06712619960308075, + "learning_rate": 0.00027799104512032756, + "loss": 0.0096, + "num_input_tokens_seen": 180557472, + "step": 83590 + }, + { + "epoch": 13.637030995106036, + "grad_norm": 0.0055035678669810295, + "learning_rate": 0.0002779272689742115, + "loss": 0.0017, + "num_input_tokens_seen": 180568320, + "step": 83595 + }, + { + "epoch": 13.63784665579119, + "grad_norm": 0.009830011986196041, + "learning_rate": 0.0002778634973286807, + "loss": 0.0036, + "num_input_tokens_seen": 180579040, + "step": 83600 + }, + { + "epoch": 13.638662316476346, + "grad_norm": 0.059241216629743576, + "learning_rate": 0.00027779973018502834, + "loss": 0.0062, + "num_input_tokens_seen": 180590016, + "step": 83605 + }, + { + "epoch": 13.639477977161501, + "grad_norm": 0.002657790668308735, + "learning_rate": 0.0002777359675445459, + "loss": 0.0021, + "num_input_tokens_seen": 180600960, + "step": 83610 + }, + { + "epoch": 13.640293637846655, + "grad_norm": 0.11882360279560089, + "learning_rate": 0.00027767220940852646, + "loss": 0.0083, + "num_input_tokens_seen": 180610848, + "step": 83615 + }, + { + "epoch": 13.641109298531811, + "grad_norm": 0.004164085257798433, + "learning_rate": 0.0002776084557782613, + "loss": 0.0014, + "num_input_tokens_seen": 180622048, + "step": 83620 + }, + { + "epoch": 13.641924959216965, + "grad_norm": 0.007447056006640196, + "learning_rate": 0.00027754470665504336, + "loss": 0.0055, + "num_input_tokens_seen": 180634016, + "step": 83625 + }, + { + "epoch": 13.64274061990212, + "grad_norm": 0.007610693573951721, + "learning_rate": 0.0002774809620401637, + "loss": 0.002, + "num_input_tokens_seen": 180644576, + "step": 83630 + }, + { + "epoch": 13.643556280587276, + "grad_norm": 0.017725123092532158, + "learning_rate": 0.000277417221934915, + "loss": 0.0033, + "num_input_tokens_seen": 180656544, + "step": 83635 + }, + { + "epoch": 13.64437194127243, + "grad_norm": 0.002556213643401861, + "learning_rate": 0.00027735348634058834, + "loss": 0.0045, + "num_input_tokens_seen": 180668480, + "step": 83640 + }, + { + "epoch": 13.645187601957586, + "grad_norm": 2.327058792114258, + "learning_rate": 0.0002772897552584759, + "loss": 0.1113, + "num_input_tokens_seen": 180679424, + "step": 83645 + }, + { + "epoch": 13.64600326264274, + "grad_norm": 0.0008912270423024893, + "learning_rate": 0.000277226028689869, + "loss": 0.0042, + "num_input_tokens_seen": 180690208, + "step": 83650 + }, + { + "epoch": 13.646818923327896, + "grad_norm": 0.03987288847565651, + "learning_rate": 0.00027716230663605933, + "loss": 0.013, + "num_input_tokens_seen": 180700704, + "step": 83655 + }, + { + "epoch": 13.647634584013051, + "grad_norm": 0.09051557630300522, + "learning_rate": 0.00027709858909833823, + "loss": 0.0127, + "num_input_tokens_seen": 180711488, + "step": 83660 + }, + { + "epoch": 13.648450244698205, + "grad_norm": 0.2718043923377991, + "learning_rate": 0.000277034876077997, + "loss": 0.0173, + "num_input_tokens_seen": 180722400, + "step": 83665 + }, + { + "epoch": 13.649265905383361, + "grad_norm": 0.012056714855134487, + "learning_rate": 0.00027697116757632677, + "loss": 0.0162, + "num_input_tokens_seen": 180733472, + "step": 83670 + }, + { + "epoch": 13.650081566068515, + "grad_norm": 0.0026014503091573715, + "learning_rate": 0.0002769074635946188, + "loss": 0.0022, + "num_input_tokens_seen": 180744416, + "step": 83675 + }, + { + "epoch": 13.65089722675367, + "grad_norm": 0.006171499844640493, + "learning_rate": 0.0002768437641341641, + "loss": 0.0018, + "num_input_tokens_seen": 180756480, + "step": 83680 + }, + { + "epoch": 13.651712887438826, + "grad_norm": 0.018329549580812454, + "learning_rate": 0.00027678006919625367, + "loss": 0.0038, + "num_input_tokens_seen": 180766912, + "step": 83685 + }, + { + "epoch": 13.65252854812398, + "grad_norm": 0.09856487810611725, + "learning_rate": 0.00027671637878217824, + "loss": 0.0099, + "num_input_tokens_seen": 180777888, + "step": 83690 + }, + { + "epoch": 13.653344208809136, + "grad_norm": 0.0025012048427015543, + "learning_rate": 0.0002766526928932285, + "loss": 0.0018, + "num_input_tokens_seen": 180789408, + "step": 83695 + }, + { + "epoch": 13.65415986949429, + "grad_norm": 0.007546972017735243, + "learning_rate": 0.0002765890115306956, + "loss": 0.0046, + "num_input_tokens_seen": 180799712, + "step": 83700 + }, + { + "epoch": 13.654975530179446, + "grad_norm": 0.023198723793029785, + "learning_rate": 0.0002765253346958695, + "loss": 0.0029, + "num_input_tokens_seen": 180811104, + "step": 83705 + }, + { + "epoch": 13.655791190864601, + "grad_norm": 0.0014068408636376262, + "learning_rate": 0.00027646166239004134, + "loss": 0.0065, + "num_input_tokens_seen": 180823296, + "step": 83710 + }, + { + "epoch": 13.656606851549755, + "grad_norm": 0.005566820967942476, + "learning_rate": 0.0002763979946145008, + "loss": 0.0051, + "num_input_tokens_seen": 180834336, + "step": 83715 + }, + { + "epoch": 13.65742251223491, + "grad_norm": 0.0010641274275258183, + "learning_rate": 0.00027633433137053885, + "loss": 0.0171, + "num_input_tokens_seen": 180844192, + "step": 83720 + }, + { + "epoch": 13.658238172920065, + "grad_norm": 0.006609211675822735, + "learning_rate": 0.00027627067265944514, + "loss": 0.0055, + "num_input_tokens_seen": 180855616, + "step": 83725 + }, + { + "epoch": 13.65905383360522, + "grad_norm": 0.06236208230257034, + "learning_rate": 0.0002762070184825104, + "loss": 0.0218, + "num_input_tokens_seen": 180867552, + "step": 83730 + }, + { + "epoch": 13.659869494290374, + "grad_norm": 0.0006166854873299599, + "learning_rate": 0.00027614336884102393, + "loss": 0.0025, + "num_input_tokens_seen": 180878208, + "step": 83735 + }, + { + "epoch": 13.66068515497553, + "grad_norm": 0.7124009728431702, + "learning_rate": 0.0002760797237362765, + "loss": 0.0983, + "num_input_tokens_seen": 180889664, + "step": 83740 + }, + { + "epoch": 13.661500815660686, + "grad_norm": 0.029279787093400955, + "learning_rate": 0.00027601608316955715, + "loss": 0.0246, + "num_input_tokens_seen": 180901280, + "step": 83745 + }, + { + "epoch": 13.66231647634584, + "grad_norm": 0.0014781494392082095, + "learning_rate": 0.0002759524471421562, + "loss": 0.0012, + "num_input_tokens_seen": 180911712, + "step": 83750 + }, + { + "epoch": 13.663132137030995, + "grad_norm": 0.007471561431884766, + "learning_rate": 0.00027588881565536303, + "loss": 0.0023, + "num_input_tokens_seen": 180922368, + "step": 83755 + }, + { + "epoch": 13.66394779771615, + "grad_norm": 0.0038479752838611603, + "learning_rate": 0.00027582518871046744, + "loss": 0.0024, + "num_input_tokens_seen": 180933504, + "step": 83760 + }, + { + "epoch": 13.664763458401305, + "grad_norm": 0.03010265901684761, + "learning_rate": 0.00027576156630875875, + "loss": 0.0081, + "num_input_tokens_seen": 180943296, + "step": 83765 + }, + { + "epoch": 13.66557911908646, + "grad_norm": 0.0018516803393140435, + "learning_rate": 0.0002756979484515264, + "loss": 0.0027, + "num_input_tokens_seen": 180954944, + "step": 83770 + }, + { + "epoch": 13.666394779771615, + "grad_norm": 0.00570902694016695, + "learning_rate": 0.00027563433514005966, + "loss": 0.0198, + "num_input_tokens_seen": 180965504, + "step": 83775 + }, + { + "epoch": 13.66721044045677, + "grad_norm": 0.5824912190437317, + "learning_rate": 0.0002755707263756477, + "loss": 0.1139, + "num_input_tokens_seen": 180977504, + "step": 83780 + }, + { + "epoch": 13.668026101141924, + "grad_norm": 0.01962290145456791, + "learning_rate": 0.0002755071221595798, + "loss": 0.0032, + "num_input_tokens_seen": 180988160, + "step": 83785 + }, + { + "epoch": 13.66884176182708, + "grad_norm": 0.004911630880087614, + "learning_rate": 0.0002754435224931447, + "loss": 0.0109, + "num_input_tokens_seen": 180998368, + "step": 83790 + }, + { + "epoch": 13.669657422512234, + "grad_norm": 0.00739239202812314, + "learning_rate": 0.00027537992737763163, + "loss": 0.0415, + "num_input_tokens_seen": 181009024, + "step": 83795 + }, + { + "epoch": 13.67047308319739, + "grad_norm": 0.04392838850617409, + "learning_rate": 0.00027531633681432925, + "loss": 0.0236, + "num_input_tokens_seen": 181020640, + "step": 83800 + }, + { + "epoch": 13.671288743882545, + "grad_norm": 0.004304789938032627, + "learning_rate": 0.0002752527508045263, + "loss": 0.1498, + "num_input_tokens_seen": 181030368, + "step": 83805 + }, + { + "epoch": 13.6721044045677, + "grad_norm": 0.0005669619422405958, + "learning_rate": 0.0002751891693495115, + "loss": 0.0048, + "num_input_tokens_seen": 181040896, + "step": 83810 + }, + { + "epoch": 13.672920065252855, + "grad_norm": 0.011217975057661533, + "learning_rate": 0.00027512559245057333, + "loss": 0.064, + "num_input_tokens_seen": 181052256, + "step": 83815 + }, + { + "epoch": 13.673735725938009, + "grad_norm": 0.0023022620007395744, + "learning_rate": 0.00027506202010900037, + "loss": 0.0026, + "num_input_tokens_seen": 181062912, + "step": 83820 + }, + { + "epoch": 13.674551386623165, + "grad_norm": 0.0035705927293747663, + "learning_rate": 0.00027499845232608087, + "loss": 0.0021, + "num_input_tokens_seen": 181073632, + "step": 83825 + }, + { + "epoch": 13.67536704730832, + "grad_norm": 0.01892230100929737, + "learning_rate": 0.00027493488910310316, + "loss": 0.0078, + "num_input_tokens_seen": 181084576, + "step": 83830 + }, + { + "epoch": 13.676182707993474, + "grad_norm": 0.008273656480014324, + "learning_rate": 0.0002748713304413555, + "loss": 0.0102, + "num_input_tokens_seen": 181095360, + "step": 83835 + }, + { + "epoch": 13.67699836867863, + "grad_norm": 0.009142505936324596, + "learning_rate": 0.0002748077763421257, + "loss": 0.1122, + "num_input_tokens_seen": 181107712, + "step": 83840 + }, + { + "epoch": 13.677814029363784, + "grad_norm": 0.06141829863190651, + "learning_rate": 0.0002747442268067024, + "loss": 0.0052, + "num_input_tokens_seen": 181119008, + "step": 83845 + }, + { + "epoch": 13.67862969004894, + "grad_norm": 0.0016691337805241346, + "learning_rate": 0.00027468068183637265, + "loss": 0.0104, + "num_input_tokens_seen": 181130720, + "step": 83850 + }, + { + "epoch": 13.679445350734095, + "grad_norm": 0.0046843248419463634, + "learning_rate": 0.0002746171414324249, + "loss": 0.0013, + "num_input_tokens_seen": 181142976, + "step": 83855 + }, + { + "epoch": 13.68026101141925, + "grad_norm": 0.44291582703590393, + "learning_rate": 0.00027455360559614677, + "loss": 0.027, + "num_input_tokens_seen": 181153920, + "step": 83860 + }, + { + "epoch": 13.681076672104405, + "grad_norm": 0.00250981654971838, + "learning_rate": 0.00027449007432882576, + "loss": 0.0043, + "num_input_tokens_seen": 181163712, + "step": 83865 + }, + { + "epoch": 13.681892332789559, + "grad_norm": 0.011784011498093605, + "learning_rate": 0.00027442654763174955, + "loss": 0.0846, + "num_input_tokens_seen": 181173568, + "step": 83870 + }, + { + "epoch": 13.682707993474715, + "grad_norm": 0.008331255055963993, + "learning_rate": 0.00027436302550620545, + "loss": 0.0074, + "num_input_tokens_seen": 181182752, + "step": 83875 + }, + { + "epoch": 13.68352365415987, + "grad_norm": 0.008108728565275669, + "learning_rate": 0.0002742995079534809, + "loss": 0.0083, + "num_input_tokens_seen": 181193216, + "step": 83880 + }, + { + "epoch": 13.684339314845024, + "grad_norm": 0.006428302265703678, + "learning_rate": 0.0002742359949748632, + "loss": 0.0036, + "num_input_tokens_seen": 181204512, + "step": 83885 + }, + { + "epoch": 13.68515497553018, + "grad_norm": 0.005887574050575495, + "learning_rate": 0.0002741724865716394, + "loss": 0.0097, + "num_input_tokens_seen": 181215520, + "step": 83890 + }, + { + "epoch": 13.685970636215334, + "grad_norm": 0.021212387830018997, + "learning_rate": 0.0002741089827450966, + "loss": 0.0233, + "num_input_tokens_seen": 181225984, + "step": 83895 + }, + { + "epoch": 13.68678629690049, + "grad_norm": 0.19733788073062897, + "learning_rate": 0.0002740454834965219, + "loss": 0.0804, + "num_input_tokens_seen": 181237952, + "step": 83900 + }, + { + "epoch": 13.687601957585644, + "grad_norm": 0.058703526854515076, + "learning_rate": 0.0002739819888272021, + "loss": 0.0144, + "num_input_tokens_seen": 181247840, + "step": 83905 + }, + { + "epoch": 13.6884176182708, + "grad_norm": 0.0021352344192564487, + "learning_rate": 0.000273918498738424, + "loss": 0.1252, + "num_input_tokens_seen": 181258048, + "step": 83910 + }, + { + "epoch": 13.689233278955955, + "grad_norm": 0.001603231648914516, + "learning_rate": 0.00027385501323147433, + "loss": 0.0109, + "num_input_tokens_seen": 181268288, + "step": 83915 + }, + { + "epoch": 13.690048939641109, + "grad_norm": 0.0029747539665549994, + "learning_rate": 0.00027379153230763976, + "loss": 0.0035, + "num_input_tokens_seen": 181278304, + "step": 83920 + }, + { + "epoch": 13.690864600326265, + "grad_norm": 0.0037416473496705294, + "learning_rate": 0.00027372805596820673, + "loss": 0.0032, + "num_input_tokens_seen": 181289312, + "step": 83925 + }, + { + "epoch": 13.691680261011419, + "grad_norm": 0.0018563204212114215, + "learning_rate": 0.0002736645842144616, + "loss": 0.004, + "num_input_tokens_seen": 181300928, + "step": 83930 + }, + { + "epoch": 13.692495921696574, + "grad_norm": 0.042291343212127686, + "learning_rate": 0.00027360111704769093, + "loss": 0.0041, + "num_input_tokens_seen": 181311840, + "step": 83935 + }, + { + "epoch": 13.69331158238173, + "grad_norm": 0.0007176153594627976, + "learning_rate": 0.00027353765446918075, + "loss": 0.0386, + "num_input_tokens_seen": 181321696, + "step": 83940 + }, + { + "epoch": 13.694127243066884, + "grad_norm": 0.0034198344219475985, + "learning_rate": 0.0002734741964802173, + "loss": 0.0029, + "num_input_tokens_seen": 181333088, + "step": 83945 + }, + { + "epoch": 13.69494290375204, + "grad_norm": 0.0027348636649549007, + "learning_rate": 0.00027341074308208667, + "loss": 0.0026, + "num_input_tokens_seen": 181344736, + "step": 83950 + }, + { + "epoch": 13.695758564437194, + "grad_norm": 0.0024968599900603294, + "learning_rate": 0.00027334729427607476, + "loss": 0.1133, + "num_input_tokens_seen": 181355136, + "step": 83955 + }, + { + "epoch": 13.69657422512235, + "grad_norm": 0.003038478083908558, + "learning_rate": 0.00027328385006346746, + "loss": 0.0012, + "num_input_tokens_seen": 181365824, + "step": 83960 + }, + { + "epoch": 13.697389885807503, + "grad_norm": 0.1116863489151001, + "learning_rate": 0.00027322041044555045, + "loss": 0.0469, + "num_input_tokens_seen": 181376576, + "step": 83965 + }, + { + "epoch": 13.698205546492659, + "grad_norm": 0.016957435756921768, + "learning_rate": 0.00027315697542360944, + "loss": 0.0081, + "num_input_tokens_seen": 181387744, + "step": 83970 + }, + { + "epoch": 13.699021207177815, + "grad_norm": 0.08671362698078156, + "learning_rate": 0.00027309354499893045, + "loss": 0.006, + "num_input_tokens_seen": 181398912, + "step": 83975 + }, + { + "epoch": 13.699836867862969, + "grad_norm": 0.0035883912350982428, + "learning_rate": 0.00027303011917279826, + "loss": 0.0014, + "num_input_tokens_seen": 181410112, + "step": 83980 + }, + { + "epoch": 13.700652528548124, + "grad_norm": 0.21623927354812622, + "learning_rate": 0.00027296669794649875, + "loss": 0.011, + "num_input_tokens_seen": 181421600, + "step": 83985 + }, + { + "epoch": 13.701468189233278, + "grad_norm": 0.001589043764397502, + "learning_rate": 0.0002729032813213172, + "loss": 0.0017, + "num_input_tokens_seen": 181432192, + "step": 83990 + }, + { + "epoch": 13.702283849918434, + "grad_norm": 0.0007788580842316151, + "learning_rate": 0.00027283986929853873, + "loss": 0.0078, + "num_input_tokens_seen": 181443200, + "step": 83995 + }, + { + "epoch": 13.70309951060359, + "grad_norm": 0.005699070170521736, + "learning_rate": 0.0002727764618794485, + "loss": 0.0345, + "num_input_tokens_seen": 181453728, + "step": 84000 + }, + { + "epoch": 13.703915171288743, + "grad_norm": 0.00218218844383955, + "learning_rate": 0.00027271305906533146, + "loss": 0.0417, + "num_input_tokens_seen": 181464192, + "step": 84005 + }, + { + "epoch": 13.7047308319739, + "grad_norm": 0.026671582832932472, + "learning_rate": 0.00027264966085747267, + "loss": 0.0071, + "num_input_tokens_seen": 181475040, + "step": 84010 + }, + { + "epoch": 13.705546492659053, + "grad_norm": 0.010932182893157005, + "learning_rate": 0.00027258626725715684, + "loss": 0.0182, + "num_input_tokens_seen": 181485952, + "step": 84015 + }, + { + "epoch": 13.706362153344209, + "grad_norm": 0.0035217327531427145, + "learning_rate": 0.0002725228782656689, + "loss": 0.0092, + "num_input_tokens_seen": 181497504, + "step": 84020 + }, + { + "epoch": 13.707177814029365, + "grad_norm": 0.0011677873553708196, + "learning_rate": 0.00027245949388429334, + "loss": 0.0076, + "num_input_tokens_seen": 181508896, + "step": 84025 + }, + { + "epoch": 13.707993474714518, + "grad_norm": 0.007658544462174177, + "learning_rate": 0.0002723961141143148, + "loss": 0.0066, + "num_input_tokens_seen": 181519904, + "step": 84030 + }, + { + "epoch": 13.708809135399674, + "grad_norm": 0.009968415834009647, + "learning_rate": 0.0002723327389570177, + "loss": 0.0185, + "num_input_tokens_seen": 181530688, + "step": 84035 + }, + { + "epoch": 13.709624796084828, + "grad_norm": 0.05588683858513832, + "learning_rate": 0.00027226936841368655, + "loss": 0.0064, + "num_input_tokens_seen": 181541792, + "step": 84040 + }, + { + "epoch": 13.710440456769984, + "grad_norm": 0.0008577611879445612, + "learning_rate": 0.00027220600248560557, + "loss": 0.0713, + "num_input_tokens_seen": 181552864, + "step": 84045 + }, + { + "epoch": 13.71125611745514, + "grad_norm": 0.4839954376220703, + "learning_rate": 0.00027214264117405884, + "loss": 0.0166, + "num_input_tokens_seen": 181563264, + "step": 84050 + }, + { + "epoch": 13.712071778140293, + "grad_norm": 0.006796311587095261, + "learning_rate": 0.0002720792844803306, + "loss": 0.0013, + "num_input_tokens_seen": 181574848, + "step": 84055 + }, + { + "epoch": 13.71288743882545, + "grad_norm": 0.0028383873868733644, + "learning_rate": 0.00027201593240570475, + "loss": 0.0028, + "num_input_tokens_seen": 181584864, + "step": 84060 + }, + { + "epoch": 13.713703099510603, + "grad_norm": 0.011330782435834408, + "learning_rate": 0.00027195258495146525, + "loss": 0.0236, + "num_input_tokens_seen": 181596320, + "step": 84065 + }, + { + "epoch": 13.714518760195759, + "grad_norm": 0.004811130929738283, + "learning_rate": 0.00027188924211889593, + "loss": 0.002, + "num_input_tokens_seen": 181605632, + "step": 84070 + }, + { + "epoch": 13.715334420880914, + "grad_norm": 0.0025700810365378857, + "learning_rate": 0.0002718259039092803, + "loss": 0.0044, + "num_input_tokens_seen": 181617888, + "step": 84075 + }, + { + "epoch": 13.716150081566068, + "grad_norm": 0.022676723077893257, + "learning_rate": 0.0002717625703239026, + "loss": 0.0028, + "num_input_tokens_seen": 181629088, + "step": 84080 + }, + { + "epoch": 13.716965742251224, + "grad_norm": 0.05975797772407532, + "learning_rate": 0.00027169924136404553, + "loss": 0.0042, + "num_input_tokens_seen": 181640064, + "step": 84085 + }, + { + "epoch": 13.717781402936378, + "grad_norm": 0.0030763172544538975, + "learning_rate": 0.00027163591703099335, + "loss": 0.0517, + "num_input_tokens_seen": 181650432, + "step": 84090 + }, + { + "epoch": 13.718597063621534, + "grad_norm": 0.007155897095799446, + "learning_rate": 0.0002715725973260286, + "loss": 0.0326, + "num_input_tokens_seen": 181660512, + "step": 84095 + }, + { + "epoch": 13.719412724306688, + "grad_norm": 0.12784910202026367, + "learning_rate": 0.00027150928225043545, + "loss": 0.0048, + "num_input_tokens_seen": 181672000, + "step": 84100 + }, + { + "epoch": 13.720228384991843, + "grad_norm": 0.006302386522293091, + "learning_rate": 0.00027144597180549603, + "loss": 0.0012, + "num_input_tokens_seen": 181682688, + "step": 84105 + }, + { + "epoch": 13.721044045676999, + "grad_norm": 0.004140893928706646, + "learning_rate": 0.0002713826659924944, + "loss": 0.0049, + "num_input_tokens_seen": 181693056, + "step": 84110 + }, + { + "epoch": 13.721859706362153, + "grad_norm": 0.009677722118794918, + "learning_rate": 0.00027131936481271265, + "loss": 0.0483, + "num_input_tokens_seen": 181703072, + "step": 84115 + }, + { + "epoch": 13.722675367047309, + "grad_norm": 0.0024610969703644514, + "learning_rate": 0.00027125606826743445, + "loss": 0.0525, + "num_input_tokens_seen": 181714112, + "step": 84120 + }, + { + "epoch": 13.723491027732463, + "grad_norm": 0.01368603203445673, + "learning_rate": 0.0002711927763579418, + "loss": 0.0082, + "num_input_tokens_seen": 181725088, + "step": 84125 + }, + { + "epoch": 13.724306688417618, + "grad_norm": 0.00212348741479218, + "learning_rate": 0.00027112948908551807, + "loss": 0.0019, + "num_input_tokens_seen": 181734368, + "step": 84130 + }, + { + "epoch": 13.725122349102774, + "grad_norm": 0.0046083335764706135, + "learning_rate": 0.00027106620645144555, + "loss": 0.0172, + "num_input_tokens_seen": 181744576, + "step": 84135 + }, + { + "epoch": 13.725938009787928, + "grad_norm": 0.004685471300035715, + "learning_rate": 0.00027100292845700676, + "loss": 0.0795, + "num_input_tokens_seen": 181754496, + "step": 84140 + }, + { + "epoch": 13.726753670473084, + "grad_norm": 0.002028076443821192, + "learning_rate": 0.0002709396551034842, + "loss": 0.0427, + "num_input_tokens_seen": 181763840, + "step": 84145 + }, + { + "epoch": 13.727569331158238, + "grad_norm": 0.0031546044629067183, + "learning_rate": 0.00027087638639215994, + "loss": 0.005, + "num_input_tokens_seen": 181774656, + "step": 84150 + }, + { + "epoch": 13.728384991843393, + "grad_norm": 0.0017633294919505715, + "learning_rate": 0.00027081312232431654, + "loss": 0.0322, + "num_input_tokens_seen": 181785696, + "step": 84155 + }, + { + "epoch": 13.729200652528547, + "grad_norm": 0.03633604198694229, + "learning_rate": 0.00027074986290123596, + "loss": 0.0109, + "num_input_tokens_seen": 181796288, + "step": 84160 + }, + { + "epoch": 13.730016313213703, + "grad_norm": 0.013025002554059029, + "learning_rate": 0.0002706866081242001, + "loss": 0.028, + "num_input_tokens_seen": 181805376, + "step": 84165 + }, + { + "epoch": 13.730831973898859, + "grad_norm": 0.003769845236092806, + "learning_rate": 0.0002706233579944911, + "loss": 0.0287, + "num_input_tokens_seen": 181814752, + "step": 84170 + }, + { + "epoch": 13.731647634584013, + "grad_norm": 0.006845514755696058, + "learning_rate": 0.00027056011251339073, + "loss": 0.0957, + "num_input_tokens_seen": 181826496, + "step": 84175 + }, + { + "epoch": 13.732463295269168, + "grad_norm": 0.009542626328766346, + "learning_rate": 0.0002704968716821806, + "loss": 0.0032, + "num_input_tokens_seen": 181836096, + "step": 84180 + }, + { + "epoch": 13.733278955954322, + "grad_norm": 0.0029800382908433676, + "learning_rate": 0.00027043363550214287, + "loss": 0.0934, + "num_input_tokens_seen": 181846976, + "step": 84185 + }, + { + "epoch": 13.734094616639478, + "grad_norm": 0.06575489789247513, + "learning_rate": 0.00027037040397455837, + "loss": 0.0069, + "num_input_tokens_seen": 181856160, + "step": 84190 + }, + { + "epoch": 13.734910277324634, + "grad_norm": 0.002500128000974655, + "learning_rate": 0.0002703071771007093, + "loss": 0.0008, + "num_input_tokens_seen": 181866496, + "step": 84195 + }, + { + "epoch": 13.735725938009788, + "grad_norm": 1.1656486988067627, + "learning_rate": 0.0002702439548818763, + "loss": 0.0549, + "num_input_tokens_seen": 181876736, + "step": 84200 + }, + { + "epoch": 13.736541598694943, + "grad_norm": 0.0023803820367902517, + "learning_rate": 0.0002701807373193414, + "loss": 0.1148, + "num_input_tokens_seen": 181887616, + "step": 84205 + }, + { + "epoch": 13.737357259380097, + "grad_norm": 0.008336534723639488, + "learning_rate": 0.000270117524414385, + "loss": 0.0364, + "num_input_tokens_seen": 181897536, + "step": 84210 + }, + { + "epoch": 13.738172920065253, + "grad_norm": 0.003979240078479052, + "learning_rate": 0.000270054316168289, + "loss": 0.0057, + "num_input_tokens_seen": 181909504, + "step": 84215 + }, + { + "epoch": 13.738988580750409, + "grad_norm": 0.003386629745364189, + "learning_rate": 0.0002699911125823336, + "loss": 0.004, + "num_input_tokens_seen": 181921088, + "step": 84220 + }, + { + "epoch": 13.739804241435563, + "grad_norm": 0.0035156127996742725, + "learning_rate": 0.0002699279136578005, + "loss": 0.1362, + "num_input_tokens_seen": 181931712, + "step": 84225 + }, + { + "epoch": 13.740619902120718, + "grad_norm": 0.00738931680098176, + "learning_rate": 0.0002698647193959697, + "loss": 0.0135, + "num_input_tokens_seen": 181942816, + "step": 84230 + }, + { + "epoch": 13.741435562805872, + "grad_norm": 0.0014010763261467218, + "learning_rate": 0.00026980152979812265, + "loss": 0.0037, + "num_input_tokens_seen": 181954016, + "step": 84235 + }, + { + "epoch": 13.742251223491028, + "grad_norm": 0.007098844274878502, + "learning_rate": 0.0002697383448655393, + "loss": 0.0134, + "num_input_tokens_seen": 181964448, + "step": 84240 + }, + { + "epoch": 13.743066884176184, + "grad_norm": 0.0010831760009750724, + "learning_rate": 0.00026967516459950084, + "loss": 0.0161, + "num_input_tokens_seen": 181974880, + "step": 84245 + }, + { + "epoch": 13.743882544861338, + "grad_norm": 0.46945667266845703, + "learning_rate": 0.000269611989001287, + "loss": 0.0636, + "num_input_tokens_seen": 181986080, + "step": 84250 + }, + { + "epoch": 13.744698205546493, + "grad_norm": 0.006263429298996925, + "learning_rate": 0.0002695488180721789, + "loss": 0.0072, + "num_input_tokens_seen": 181995360, + "step": 84255 + }, + { + "epoch": 13.745513866231647, + "grad_norm": 0.025617733597755432, + "learning_rate": 0.0002694856518134559, + "loss": 0.0214, + "num_input_tokens_seen": 182007648, + "step": 84260 + }, + { + "epoch": 13.746329526916803, + "grad_norm": 0.048633527010679245, + "learning_rate": 0.000269422490226399, + "loss": 0.0076, + "num_input_tokens_seen": 182018080, + "step": 84265 + }, + { + "epoch": 13.747145187601957, + "grad_norm": 0.017586344853043556, + "learning_rate": 0.00026935933331228743, + "loss": 0.1629, + "num_input_tokens_seen": 182030368, + "step": 84270 + }, + { + "epoch": 13.747960848287113, + "grad_norm": 0.563724935054779, + "learning_rate": 0.00026929618107240173, + "loss": 0.0444, + "num_input_tokens_seen": 182041504, + "step": 84275 + }, + { + "epoch": 13.748776508972268, + "grad_norm": 0.002480928786098957, + "learning_rate": 0.0002692330335080216, + "loss": 0.0222, + "num_input_tokens_seen": 182050208, + "step": 84280 + }, + { + "epoch": 13.749592169657422, + "grad_norm": 0.6787986159324646, + "learning_rate": 0.00026916989062042684, + "loss": 0.0167, + "num_input_tokens_seen": 182060704, + "step": 84285 + }, + { + "epoch": 13.750407830342578, + "grad_norm": 0.010774930939078331, + "learning_rate": 0.0002691067524108971, + "loss": 0.0108, + "num_input_tokens_seen": 182072544, + "step": 84290 + }, + { + "epoch": 13.751223491027732, + "grad_norm": 0.022650204598903656, + "learning_rate": 0.00026904361888071193, + "loss": 0.0087, + "num_input_tokens_seen": 182082656, + "step": 84295 + }, + { + "epoch": 13.752039151712887, + "grad_norm": 0.0077858190052211285, + "learning_rate": 0.0002689804900311508, + "loss": 0.0397, + "num_input_tokens_seen": 182093088, + "step": 84300 + }, + { + "epoch": 13.752854812398043, + "grad_norm": 0.011099128052592278, + "learning_rate": 0.000268917365863493, + "loss": 0.0061, + "num_input_tokens_seen": 182103648, + "step": 84305 + }, + { + "epoch": 13.753670473083197, + "grad_norm": 0.001572756445966661, + "learning_rate": 0.000268854246379018, + "loss": 0.0149, + "num_input_tokens_seen": 182114976, + "step": 84310 + }, + { + "epoch": 13.754486133768353, + "grad_norm": 0.012012061662971973, + "learning_rate": 0.00026879113157900496, + "loss": 0.0025, + "num_input_tokens_seen": 182125056, + "step": 84315 + }, + { + "epoch": 13.755301794453507, + "grad_norm": 0.004653714597225189, + "learning_rate": 0.00026872802146473296, + "loss": 0.0057, + "num_input_tokens_seen": 182135104, + "step": 84320 + }, + { + "epoch": 13.756117455138662, + "grad_norm": 2.9742019176483154, + "learning_rate": 0.0002686649160374808, + "loss": 0.0497, + "num_input_tokens_seen": 182146144, + "step": 84325 + }, + { + "epoch": 13.756933115823816, + "grad_norm": 0.0020722979679703712, + "learning_rate": 0.0002686018152985279, + "loss": 0.0117, + "num_input_tokens_seen": 182158464, + "step": 84330 + }, + { + "epoch": 13.757748776508972, + "grad_norm": 0.0032649089116603136, + "learning_rate": 0.0002685387192491524, + "loss": 0.0133, + "num_input_tokens_seen": 182168640, + "step": 84335 + }, + { + "epoch": 13.758564437194128, + "grad_norm": 0.014669415540993214, + "learning_rate": 0.0002684756278906338, + "loss": 0.0286, + "num_input_tokens_seen": 182179712, + "step": 84340 + }, + { + "epoch": 13.759380097879282, + "grad_norm": 0.08696259558200836, + "learning_rate": 0.0002684125412242499, + "loss": 0.0501, + "num_input_tokens_seen": 182191296, + "step": 84345 + }, + { + "epoch": 13.760195758564437, + "grad_norm": 0.19893871247768402, + "learning_rate": 0.00026834945925128005, + "loss": 0.1038, + "num_input_tokens_seen": 182201696, + "step": 84350 + }, + { + "epoch": 13.761011419249591, + "grad_norm": 0.004175386857241392, + "learning_rate": 0.00026828638197300185, + "loss": 0.005, + "num_input_tokens_seen": 182211328, + "step": 84355 + }, + { + "epoch": 13.761827079934747, + "grad_norm": 0.01928592659533024, + "learning_rate": 0.0002682233093906945, + "loss": 0.0027, + "num_input_tokens_seen": 182222336, + "step": 84360 + }, + { + "epoch": 13.762642740619903, + "grad_norm": 0.2888076603412628, + "learning_rate": 0.00026816024150563546, + "loss": 0.0144, + "num_input_tokens_seen": 182233728, + "step": 84365 + }, + { + "epoch": 13.763458401305057, + "grad_norm": 0.46259981393814087, + "learning_rate": 0.00026809717831910353, + "loss": 0.0295, + "num_input_tokens_seen": 182243296, + "step": 84370 + }, + { + "epoch": 13.764274061990212, + "grad_norm": 0.012987246736884117, + "learning_rate": 0.0002680341198323761, + "loss": 0.1032, + "num_input_tokens_seen": 182254240, + "step": 84375 + }, + { + "epoch": 13.765089722675366, + "grad_norm": 0.009005763567984104, + "learning_rate": 0.0002679710660467319, + "loss": 0.0064, + "num_input_tokens_seen": 182265632, + "step": 84380 + }, + { + "epoch": 13.765905383360522, + "grad_norm": 0.0051773120649158955, + "learning_rate": 0.00026790801696344814, + "loss": 0.0109, + "num_input_tokens_seen": 182276960, + "step": 84385 + }, + { + "epoch": 13.766721044045678, + "grad_norm": 0.01954108476638794, + "learning_rate": 0.00026784497258380293, + "loss": 0.045, + "num_input_tokens_seen": 182288800, + "step": 84390 + }, + { + "epoch": 13.767536704730832, + "grad_norm": 0.02403266169130802, + "learning_rate": 0.0002677819329090738, + "loss": 0.0131, + "num_input_tokens_seen": 182299136, + "step": 84395 + }, + { + "epoch": 13.768352365415987, + "grad_norm": 0.005993073806166649, + "learning_rate": 0.00026771889794053845, + "loss": 0.0096, + "num_input_tokens_seen": 182309728, + "step": 84400 + }, + { + "epoch": 13.769168026101141, + "grad_norm": 0.004992147441953421, + "learning_rate": 0.00026765586767947433, + "loss": 0.0061, + "num_input_tokens_seen": 182320352, + "step": 84405 + }, + { + "epoch": 13.769983686786297, + "grad_norm": 0.009609325788915157, + "learning_rate": 0.00026759284212715873, + "loss": 0.0066, + "num_input_tokens_seen": 182330880, + "step": 84410 + }, + { + "epoch": 13.770799347471453, + "grad_norm": 0.004481468815356493, + "learning_rate": 0.000267529821284869, + "loss": 0.0013, + "num_input_tokens_seen": 182341664, + "step": 84415 + }, + { + "epoch": 13.771615008156607, + "grad_norm": 0.00853494182229042, + "learning_rate": 0.0002674668051538824, + "loss": 0.0531, + "num_input_tokens_seen": 182353024, + "step": 84420 + }, + { + "epoch": 13.772430668841762, + "grad_norm": 0.05223782733082771, + "learning_rate": 0.0002674037937354761, + "loss": 0.0315, + "num_input_tokens_seen": 182363840, + "step": 84425 + }, + { + "epoch": 13.773246329526916, + "grad_norm": 0.009661542251706123, + "learning_rate": 0.00026734078703092684, + "loss": 0.0793, + "num_input_tokens_seen": 182374368, + "step": 84430 + }, + { + "epoch": 13.774061990212072, + "grad_norm": 0.004293730482459068, + "learning_rate": 0.0002672777850415117, + "loss": 0.0047, + "num_input_tokens_seen": 182385024, + "step": 84435 + }, + { + "epoch": 13.774877650897226, + "grad_norm": 0.018209824338555336, + "learning_rate": 0.0002672147877685075, + "loss": 0.1032, + "num_input_tokens_seen": 182394656, + "step": 84440 + }, + { + "epoch": 13.775693311582382, + "grad_norm": 0.001877213828265667, + "learning_rate": 0.00026715179521319095, + "loss": 0.0017, + "num_input_tokens_seen": 182405536, + "step": 84445 + }, + { + "epoch": 13.776508972267537, + "grad_norm": 0.0022872108966112137, + "learning_rate": 0.00026708880737683863, + "loss": 0.1174, + "num_input_tokens_seen": 182416640, + "step": 84450 + }, + { + "epoch": 13.777324632952691, + "grad_norm": 0.07921797782182693, + "learning_rate": 0.00026702582426072705, + "loss": 0.008, + "num_input_tokens_seen": 182427712, + "step": 84455 + }, + { + "epoch": 13.778140293637847, + "grad_norm": 0.39221569895744324, + "learning_rate": 0.0002669628458661326, + "loss": 0.1612, + "num_input_tokens_seen": 182438432, + "step": 84460 + }, + { + "epoch": 13.778955954323001, + "grad_norm": 0.02193089947104454, + "learning_rate": 0.000266899872194332, + "loss": 0.1029, + "num_input_tokens_seen": 182448608, + "step": 84465 + }, + { + "epoch": 13.779771615008157, + "grad_norm": 0.015678465366363525, + "learning_rate": 0.0002668369032466009, + "loss": 0.0052, + "num_input_tokens_seen": 182459168, + "step": 84470 + }, + { + "epoch": 13.780587275693312, + "grad_norm": 0.18530204892158508, + "learning_rate": 0.0002667739390242161, + "loss": 0.0063, + "num_input_tokens_seen": 182469664, + "step": 84475 + }, + { + "epoch": 13.781402936378466, + "grad_norm": 0.373296320438385, + "learning_rate": 0.00026671097952845284, + "loss": 0.0379, + "num_input_tokens_seen": 182479136, + "step": 84480 + }, + { + "epoch": 13.782218597063622, + "grad_norm": 0.0032780959736555815, + "learning_rate": 0.00026664802476058803, + "loss": 0.0055, + "num_input_tokens_seen": 182489632, + "step": 84485 + }, + { + "epoch": 13.783034257748776, + "grad_norm": 0.027845237404108047, + "learning_rate": 0.00026658507472189654, + "loss": 0.0056, + "num_input_tokens_seen": 182501024, + "step": 84490 + }, + { + "epoch": 13.783849918433932, + "grad_norm": 0.013598952442407608, + "learning_rate": 0.0002665221294136548, + "loss": 0.0163, + "num_input_tokens_seen": 182511936, + "step": 84495 + }, + { + "epoch": 13.784665579119086, + "grad_norm": 0.009678972885012627, + "learning_rate": 0.0002664591888371384, + "loss": 0.0924, + "num_input_tokens_seen": 182521248, + "step": 84500 + }, + { + "epoch": 13.785481239804241, + "grad_norm": 0.003536543343216181, + "learning_rate": 0.00026639625299362276, + "loss": 0.0078, + "num_input_tokens_seen": 182532256, + "step": 84505 + }, + { + "epoch": 13.786296900489397, + "grad_norm": 0.005610863212496042, + "learning_rate": 0.00026633332188438335, + "loss": 0.0093, + "num_input_tokens_seen": 182543392, + "step": 84510 + }, + { + "epoch": 13.78711256117455, + "grad_norm": 0.004347702953964472, + "learning_rate": 0.00026627039551069563, + "loss": 0.0015, + "num_input_tokens_seen": 182553536, + "step": 84515 + }, + { + "epoch": 13.787928221859707, + "grad_norm": 0.013319053687155247, + "learning_rate": 0.00026620747387383494, + "loss": 0.0597, + "num_input_tokens_seen": 182564448, + "step": 84520 + }, + { + "epoch": 13.78874388254486, + "grad_norm": 0.033942725509405136, + "learning_rate": 0.0002661445569750762, + "loss": 0.0065, + "num_input_tokens_seen": 182575424, + "step": 84525 + }, + { + "epoch": 13.789559543230016, + "grad_norm": 0.07782161235809326, + "learning_rate": 0.00026608164481569486, + "loss": 0.0079, + "num_input_tokens_seen": 182586528, + "step": 84530 + }, + { + "epoch": 13.790375203915172, + "grad_norm": 0.1057814285159111, + "learning_rate": 0.0002660187373969656, + "loss": 0.0152, + "num_input_tokens_seen": 182596224, + "step": 84535 + }, + { + "epoch": 13.791190864600326, + "grad_norm": 0.014159079641103745, + "learning_rate": 0.00026595583472016355, + "loss": 0.0036, + "num_input_tokens_seen": 182607424, + "step": 84540 + }, + { + "epoch": 13.792006525285482, + "grad_norm": 0.0020694267004728317, + "learning_rate": 0.00026589293678656336, + "loss": 0.0023, + "num_input_tokens_seen": 182618368, + "step": 84545 + }, + { + "epoch": 13.792822185970635, + "grad_norm": 0.017610616981983185, + "learning_rate": 0.0002658300435974398, + "loss": 0.0715, + "num_input_tokens_seen": 182628832, + "step": 84550 + }, + { + "epoch": 13.793637846655791, + "grad_norm": 0.011342491954565048, + "learning_rate": 0.00026576715515406747, + "loss": 0.0203, + "num_input_tokens_seen": 182639552, + "step": 84555 + }, + { + "epoch": 13.794453507340947, + "grad_norm": 0.0068123419769108295, + "learning_rate": 0.0002657042714577209, + "loss": 0.0311, + "num_input_tokens_seen": 182650560, + "step": 84560 + }, + { + "epoch": 13.7952691680261, + "grad_norm": 0.0036243554204702377, + "learning_rate": 0.0002656413925096745, + "loss": 0.004, + "num_input_tokens_seen": 182660992, + "step": 84565 + }, + { + "epoch": 13.796084828711257, + "grad_norm": 0.0034546160604804754, + "learning_rate": 0.00026557851831120254, + "loss": 0.0406, + "num_input_tokens_seen": 182671648, + "step": 84570 + }, + { + "epoch": 13.79690048939641, + "grad_norm": 0.03969002887606621, + "learning_rate": 0.00026551564886357937, + "loss": 0.1201, + "num_input_tokens_seen": 182683264, + "step": 84575 + }, + { + "epoch": 13.797716150081566, + "grad_norm": 0.0044247061014175415, + "learning_rate": 0.00026545278416807895, + "loss": 0.002, + "num_input_tokens_seen": 182694976, + "step": 84580 + }, + { + "epoch": 13.798531810766722, + "grad_norm": 0.04211275652050972, + "learning_rate": 0.00026538992422597547, + "loss": 0.0143, + "num_input_tokens_seen": 182706752, + "step": 84585 + }, + { + "epoch": 13.799347471451876, + "grad_norm": 0.0638275295495987, + "learning_rate": 0.0002653270690385428, + "loss": 0.0599, + "num_input_tokens_seen": 182718848, + "step": 84590 + }, + { + "epoch": 13.800163132137031, + "grad_norm": 0.006323820445686579, + "learning_rate": 0.00026526421860705474, + "loss": 0.0086, + "num_input_tokens_seen": 182728928, + "step": 84595 + }, + { + "epoch": 13.800978792822185, + "grad_norm": 0.007789155002683401, + "learning_rate": 0.0002652013729327849, + "loss": 0.0041, + "num_input_tokens_seen": 182740832, + "step": 84600 + }, + { + "epoch": 13.801794453507341, + "grad_norm": 0.022198403254151344, + "learning_rate": 0.00026513853201700727, + "loss": 0.1351, + "num_input_tokens_seen": 182752352, + "step": 84605 + }, + { + "epoch": 13.802610114192497, + "grad_norm": 0.007201942149549723, + "learning_rate": 0.00026507569586099527, + "loss": 0.0069, + "num_input_tokens_seen": 182763488, + "step": 84610 + }, + { + "epoch": 13.80342577487765, + "grad_norm": 0.009613179601728916, + "learning_rate": 0.0002650128644660223, + "loss": 0.0054, + "num_input_tokens_seen": 182774048, + "step": 84615 + }, + { + "epoch": 13.804241435562806, + "grad_norm": 0.055438682436943054, + "learning_rate": 0.0002649500378333617, + "loss": 0.0239, + "num_input_tokens_seen": 182784480, + "step": 84620 + }, + { + "epoch": 13.80505709624796, + "grad_norm": 0.00259758229367435, + "learning_rate": 0.0002648872159642868, + "loss": 0.0156, + "num_input_tokens_seen": 182794912, + "step": 84625 + }, + { + "epoch": 13.805872756933116, + "grad_norm": 0.01642073690891266, + "learning_rate": 0.00026482439886007077, + "loss": 0.0083, + "num_input_tokens_seen": 182805792, + "step": 84630 + }, + { + "epoch": 13.80668841761827, + "grad_norm": 0.5491905212402344, + "learning_rate": 0.00026476158652198655, + "loss": 0.0656, + "num_input_tokens_seen": 182818080, + "step": 84635 + }, + { + "epoch": 13.807504078303426, + "grad_norm": 0.00983067974448204, + "learning_rate": 0.00026469877895130727, + "loss": 0.211, + "num_input_tokens_seen": 182828608, + "step": 84640 + }, + { + "epoch": 13.808319738988581, + "grad_norm": 0.1057048887014389, + "learning_rate": 0.00026463597614930575, + "loss": 0.0174, + "num_input_tokens_seen": 182839488, + "step": 84645 + }, + { + "epoch": 13.809135399673735, + "grad_norm": 0.01889374665915966, + "learning_rate": 0.00026457317811725466, + "loss": 0.0118, + "num_input_tokens_seen": 182849856, + "step": 84650 + }, + { + "epoch": 13.809951060358891, + "grad_norm": 0.012557010166347027, + "learning_rate": 0.00026451038485642687, + "loss": 0.0055, + "num_input_tokens_seen": 182857760, + "step": 84655 + }, + { + "epoch": 13.810766721044045, + "grad_norm": 0.0034552598372101784, + "learning_rate": 0.0002644475963680948, + "loss": 0.0596, + "num_input_tokens_seen": 182868000, + "step": 84660 + }, + { + "epoch": 13.8115823817292, + "grad_norm": 0.1016710177063942, + "learning_rate": 0.0002643848126535311, + "loss": 0.0273, + "num_input_tokens_seen": 182880032, + "step": 84665 + }, + { + "epoch": 13.812398042414356, + "grad_norm": 0.0039877621456980705, + "learning_rate": 0.000264322033714008, + "loss": 0.0034, + "num_input_tokens_seen": 182891136, + "step": 84670 + }, + { + "epoch": 13.81321370309951, + "grad_norm": 0.009610519744455814, + "learning_rate": 0.0002642592595507979, + "loss": 0.0055, + "num_input_tokens_seen": 182902944, + "step": 84675 + }, + { + "epoch": 13.814029363784666, + "grad_norm": 0.047046490013599396, + "learning_rate": 0.0002641964901651729, + "loss": 0.0061, + "num_input_tokens_seen": 182913440, + "step": 84680 + }, + { + "epoch": 13.81484502446982, + "grad_norm": 0.002050283830612898, + "learning_rate": 0.0002641337255584052, + "loss": 0.0189, + "num_input_tokens_seen": 182923872, + "step": 84685 + }, + { + "epoch": 13.815660685154976, + "grad_norm": 0.016913149505853653, + "learning_rate": 0.0002640709657317668, + "loss": 0.0074, + "num_input_tokens_seen": 182934432, + "step": 84690 + }, + { + "epoch": 13.81647634584013, + "grad_norm": 0.007640815805643797, + "learning_rate": 0.0002640082106865295, + "loss": 0.0087, + "num_input_tokens_seen": 182946464, + "step": 84695 + }, + { + "epoch": 13.817292006525285, + "grad_norm": 0.008956543169915676, + "learning_rate": 0.00026394546042396525, + "loss": 0.0204, + "num_input_tokens_seen": 182957728, + "step": 84700 + }, + { + "epoch": 13.818107667210441, + "grad_norm": 1.4677728414535522, + "learning_rate": 0.0002638827149453457, + "loss": 0.1167, + "num_input_tokens_seen": 182966976, + "step": 84705 + }, + { + "epoch": 13.818923327895595, + "grad_norm": 0.018476588651537895, + "learning_rate": 0.0002638199742519425, + "loss": 0.0567, + "num_input_tokens_seen": 182977920, + "step": 84710 + }, + { + "epoch": 13.81973898858075, + "grad_norm": 0.03129766136407852, + "learning_rate": 0.00026375723834502686, + "loss": 0.008, + "num_input_tokens_seen": 182989504, + "step": 84715 + }, + { + "epoch": 13.820554649265905, + "grad_norm": 0.061310023069381714, + "learning_rate": 0.0002636945072258709, + "loss": 0.1149, + "num_input_tokens_seen": 183000992, + "step": 84720 + }, + { + "epoch": 13.82137030995106, + "grad_norm": 0.04653109982609749, + "learning_rate": 0.00026363178089574516, + "loss": 0.0465, + "num_input_tokens_seen": 183011776, + "step": 84725 + }, + { + "epoch": 13.822185970636216, + "grad_norm": 0.0033070375211536884, + "learning_rate": 0.0002635690593559216, + "loss": 0.0212, + "num_input_tokens_seen": 183023072, + "step": 84730 + }, + { + "epoch": 13.82300163132137, + "grad_norm": 0.0212293341755867, + "learning_rate": 0.0002635063426076706, + "loss": 0.0363, + "num_input_tokens_seen": 183034144, + "step": 84735 + }, + { + "epoch": 13.823817292006526, + "grad_norm": 0.0049853515811264515, + "learning_rate": 0.000263443630652264, + "loss": 0.0179, + "num_input_tokens_seen": 183044320, + "step": 84740 + }, + { + "epoch": 13.82463295269168, + "grad_norm": 0.002307276474311948, + "learning_rate": 0.00026338092349097186, + "loss": 0.0022, + "num_input_tokens_seen": 183054368, + "step": 84745 + }, + { + "epoch": 13.825448613376835, + "grad_norm": 0.004544033668935299, + "learning_rate": 0.00026331822112506576, + "loss": 0.0037, + "num_input_tokens_seen": 183064800, + "step": 84750 + }, + { + "epoch": 13.826264274061991, + "grad_norm": 0.002783995820209384, + "learning_rate": 0.0002632555235558161, + "loss": 0.005, + "num_input_tokens_seen": 183075168, + "step": 84755 + }, + { + "epoch": 13.827079934747145, + "grad_norm": 0.04571967199444771, + "learning_rate": 0.00026319283078449365, + "loss": 0.0074, + "num_input_tokens_seen": 183087296, + "step": 84760 + }, + { + "epoch": 13.8278955954323, + "grad_norm": 0.6793679594993591, + "learning_rate": 0.0002631301428123688, + "loss": 0.1606, + "num_input_tokens_seen": 183097184, + "step": 84765 + }, + { + "epoch": 13.828711256117455, + "grad_norm": 0.04691294580698013, + "learning_rate": 0.00026306745964071223, + "loss": 0.0063, + "num_input_tokens_seen": 183107712, + "step": 84770 + }, + { + "epoch": 13.82952691680261, + "grad_norm": 0.08239693194627762, + "learning_rate": 0.00026300478127079405, + "loss": 0.0297, + "num_input_tokens_seen": 183118080, + "step": 84775 + }, + { + "epoch": 13.830342577487766, + "grad_norm": 0.0026116548106074333, + "learning_rate": 0.0002629421077038846, + "loss": 0.0378, + "num_input_tokens_seen": 183129216, + "step": 84780 + }, + { + "epoch": 13.83115823817292, + "grad_norm": 0.03541423752903938, + "learning_rate": 0.00026287943894125415, + "loss": 0.0059, + "num_input_tokens_seen": 183140736, + "step": 84785 + }, + { + "epoch": 13.831973898858076, + "grad_norm": 0.00940261036157608, + "learning_rate": 0.0002628167749841727, + "loss": 0.005, + "num_input_tokens_seen": 183151296, + "step": 84790 + }, + { + "epoch": 13.83278955954323, + "grad_norm": 0.006862754467874765, + "learning_rate": 0.0002627541158339101, + "loss": 0.0065, + "num_input_tokens_seen": 183163488, + "step": 84795 + }, + { + "epoch": 13.833605220228385, + "grad_norm": 0.015600350685417652, + "learning_rate": 0.0002626914614917364, + "loss": 0.0099, + "num_input_tokens_seen": 183174272, + "step": 84800 + }, + { + "epoch": 13.83442088091354, + "grad_norm": 0.008726726286113262, + "learning_rate": 0.0002626288119589212, + "loss": 0.0078, + "num_input_tokens_seen": 183184384, + "step": 84805 + }, + { + "epoch": 13.835236541598695, + "grad_norm": 0.009681164287030697, + "learning_rate": 0.0002625661672367343, + "loss": 0.0034, + "num_input_tokens_seen": 183195552, + "step": 84810 + }, + { + "epoch": 13.83605220228385, + "grad_norm": 0.012375738471746445, + "learning_rate": 0.00026250352732644524, + "loss": 0.0035, + "num_input_tokens_seen": 183207328, + "step": 84815 + }, + { + "epoch": 13.836867862969005, + "grad_norm": 1.4686344861984253, + "learning_rate": 0.0002624408922293232, + "loss": 0.0164, + "num_input_tokens_seen": 183218048, + "step": 84820 + }, + { + "epoch": 13.83768352365416, + "grad_norm": 0.004356234800070524, + "learning_rate": 0.0002623782619466383, + "loss": 0.0714, + "num_input_tokens_seen": 183228928, + "step": 84825 + }, + { + "epoch": 13.838499184339314, + "grad_norm": 0.0027383537963032722, + "learning_rate": 0.00026231563647965896, + "loss": 0.0056, + "num_input_tokens_seen": 183239776, + "step": 84830 + }, + { + "epoch": 13.83931484502447, + "grad_norm": 0.9869494438171387, + "learning_rate": 0.00026225301582965524, + "loss": 0.0909, + "num_input_tokens_seen": 183250144, + "step": 84835 + }, + { + "epoch": 13.840130505709626, + "grad_norm": 0.003971732687205076, + "learning_rate": 0.0002621903999978953, + "loss": 0.022, + "num_input_tokens_seen": 183261344, + "step": 84840 + }, + { + "epoch": 13.84094616639478, + "grad_norm": 0.004560328088700771, + "learning_rate": 0.0002621277889856489, + "loss": 0.0167, + "num_input_tokens_seen": 183273120, + "step": 84845 + }, + { + "epoch": 13.841761827079935, + "grad_norm": 0.0031801187433302402, + "learning_rate": 0.0002620651827941843, + "loss": 0.0026, + "num_input_tokens_seen": 183284384, + "step": 84850 + }, + { + "epoch": 13.84257748776509, + "grad_norm": 0.218495711684227, + "learning_rate": 0.00026200258142477107, + "loss": 0.0102, + "num_input_tokens_seen": 183294720, + "step": 84855 + }, + { + "epoch": 13.843393148450245, + "grad_norm": 0.0040638744831085205, + "learning_rate": 0.00026193998487867697, + "loss": 0.0164, + "num_input_tokens_seen": 183305792, + "step": 84860 + }, + { + "epoch": 13.844208809135399, + "grad_norm": 0.6690198183059692, + "learning_rate": 0.0002618773931571715, + "loss": 0.1063, + "num_input_tokens_seen": 183316448, + "step": 84865 + }, + { + "epoch": 13.845024469820554, + "grad_norm": 0.010879079811275005, + "learning_rate": 0.00026181480626152236, + "loss": 0.0068, + "num_input_tokens_seen": 183326880, + "step": 84870 + }, + { + "epoch": 13.84584013050571, + "grad_norm": 0.30008864402770996, + "learning_rate": 0.0002617522241929987, + "loss": 0.0098, + "num_input_tokens_seen": 183337856, + "step": 84875 + }, + { + "epoch": 13.846655791190864, + "grad_norm": 0.00918416865170002, + "learning_rate": 0.0002616896469528681, + "loss": 0.0109, + "num_input_tokens_seen": 183348256, + "step": 84880 + }, + { + "epoch": 13.84747145187602, + "grad_norm": 0.0027300086803734303, + "learning_rate": 0.00026162707454239944, + "loss": 0.0145, + "num_input_tokens_seen": 183360160, + "step": 84885 + }, + { + "epoch": 13.848287112561174, + "grad_norm": 0.047441281378269196, + "learning_rate": 0.00026156450696286014, + "loss": 0.0451, + "num_input_tokens_seen": 183371648, + "step": 84890 + }, + { + "epoch": 13.84910277324633, + "grad_norm": 0.002008978510275483, + "learning_rate": 0.0002615019442155189, + "loss": 0.0036, + "num_input_tokens_seen": 183384032, + "step": 84895 + }, + { + "epoch": 13.849918433931485, + "grad_norm": 0.002454567002132535, + "learning_rate": 0.00026143938630164316, + "loss": 0.0367, + "num_input_tokens_seen": 183395072, + "step": 84900 + }, + { + "epoch": 13.850734094616639, + "grad_norm": 0.013349570333957672, + "learning_rate": 0.00026137683322250094, + "loss": 0.0039, + "num_input_tokens_seen": 183405824, + "step": 84905 + }, + { + "epoch": 13.851549755301795, + "grad_norm": 0.06632737070322037, + "learning_rate": 0.00026131428497935995, + "loss": 0.003, + "num_input_tokens_seen": 183416704, + "step": 84910 + }, + { + "epoch": 13.852365415986949, + "grad_norm": 0.4345989525318146, + "learning_rate": 0.0002612517415734877, + "loss": 0.0761, + "num_input_tokens_seen": 183427616, + "step": 84915 + }, + { + "epoch": 13.853181076672104, + "grad_norm": 0.15262597799301147, + "learning_rate": 0.00026118920300615187, + "loss": 0.0094, + "num_input_tokens_seen": 183436960, + "step": 84920 + }, + { + "epoch": 13.85399673735726, + "grad_norm": 0.00835461262613535, + "learning_rate": 0.0002611266692786197, + "loss": 0.0061, + "num_input_tokens_seen": 183447296, + "step": 84925 + }, + { + "epoch": 13.854812398042414, + "grad_norm": 0.10195163637399673, + "learning_rate": 0.00026106414039215865, + "loss": 0.0077, + "num_input_tokens_seen": 183458048, + "step": 84930 + }, + { + "epoch": 13.85562805872757, + "grad_norm": 0.030093245208263397, + "learning_rate": 0.00026100161634803594, + "loss": 0.0028, + "num_input_tokens_seen": 183469376, + "step": 84935 + }, + { + "epoch": 13.856443719412724, + "grad_norm": 0.007934209890663624, + "learning_rate": 0.0002609390971475186, + "loss": 0.0029, + "num_input_tokens_seen": 183480064, + "step": 84940 + }, + { + "epoch": 13.85725938009788, + "grad_norm": 0.1264921873807907, + "learning_rate": 0.00026087658279187357, + "loss": 0.0071, + "num_input_tokens_seen": 183490880, + "step": 84945 + }, + { + "epoch": 13.858075040783035, + "grad_norm": 1.145666241645813, + "learning_rate": 0.0002608140732823684, + "loss": 0.0163, + "num_input_tokens_seen": 183502048, + "step": 84950 + }, + { + "epoch": 13.858890701468189, + "grad_norm": 0.013281790539622307, + "learning_rate": 0.00026075156862026896, + "loss": 0.0027, + "num_input_tokens_seen": 183512960, + "step": 84955 + }, + { + "epoch": 13.859706362153345, + "grad_norm": 0.003783997381106019, + "learning_rate": 0.00026068906880684297, + "loss": 0.0039, + "num_input_tokens_seen": 183523840, + "step": 84960 + }, + { + "epoch": 13.860522022838499, + "grad_norm": 0.20053230226039886, + "learning_rate": 0.0002606265738433561, + "loss": 0.0741, + "num_input_tokens_seen": 183534176, + "step": 84965 + }, + { + "epoch": 13.861337683523654, + "grad_norm": 0.02767985127866268, + "learning_rate": 0.0002605640837310758, + "loss": 0.0041, + "num_input_tokens_seen": 183544320, + "step": 84970 + }, + { + "epoch": 13.86215334420881, + "grad_norm": 0.0010478844633325934, + "learning_rate": 0.0002605015984712678, + "loss": 0.0343, + "num_input_tokens_seen": 183553600, + "step": 84975 + }, + { + "epoch": 13.862969004893964, + "grad_norm": 0.002999087329953909, + "learning_rate": 0.000260439118065199, + "loss": 0.0269, + "num_input_tokens_seen": 183563744, + "step": 84980 + }, + { + "epoch": 13.86378466557912, + "grad_norm": 0.015121141448616982, + "learning_rate": 0.000260376642514135, + "loss": 0.009, + "num_input_tokens_seen": 183574592, + "step": 84985 + }, + { + "epoch": 13.864600326264274, + "grad_norm": 0.014611300081014633, + "learning_rate": 0.00026031417181934276, + "loss": 0.0026, + "num_input_tokens_seen": 183585344, + "step": 84990 + }, + { + "epoch": 13.86541598694943, + "grad_norm": 0.0058423797599971294, + "learning_rate": 0.0002602517059820875, + "loss": 0.0028, + "num_input_tokens_seen": 183596544, + "step": 84995 + }, + { + "epoch": 13.866231647634583, + "grad_norm": 0.04339960962533951, + "learning_rate": 0.0002601892450036359, + "loss": 0.0038, + "num_input_tokens_seen": 183606976, + "step": 85000 + }, + { + "epoch": 13.867047308319739, + "grad_norm": 0.003346919547766447, + "learning_rate": 0.0002601267888852531, + "loss": 0.0084, + "num_input_tokens_seen": 183618720, + "step": 85005 + }, + { + "epoch": 13.867862969004895, + "grad_norm": 0.005301931872963905, + "learning_rate": 0.0002600643376282056, + "loss": 0.0903, + "num_input_tokens_seen": 183629760, + "step": 85010 + }, + { + "epoch": 13.868678629690049, + "grad_norm": 0.005934323649853468, + "learning_rate": 0.0002600018912337584, + "loss": 0.156, + "num_input_tokens_seen": 183639584, + "step": 85015 + }, + { + "epoch": 13.869494290375204, + "grad_norm": 0.006170249078422785, + "learning_rate": 0.00025993944970317763, + "loss": 0.065, + "num_input_tokens_seen": 183651040, + "step": 85020 + }, + { + "epoch": 13.870309951060358, + "grad_norm": 0.005769214127212763, + "learning_rate": 0.00025987701303772806, + "loss": 0.0222, + "num_input_tokens_seen": 183663200, + "step": 85025 + }, + { + "epoch": 13.871125611745514, + "grad_norm": 0.17763355374336243, + "learning_rate": 0.00025981458123867566, + "loss": 0.0116, + "num_input_tokens_seen": 183672864, + "step": 85030 + }, + { + "epoch": 13.87194127243067, + "grad_norm": 0.016868766397237778, + "learning_rate": 0.0002597521543072854, + "loss": 0.0018, + "num_input_tokens_seen": 183683296, + "step": 85035 + }, + { + "epoch": 13.872756933115824, + "grad_norm": 0.0015195843297988176, + "learning_rate": 0.00025968973224482257, + "loss": 0.0126, + "num_input_tokens_seen": 183694176, + "step": 85040 + }, + { + "epoch": 13.87357259380098, + "grad_norm": 0.07358266413211823, + "learning_rate": 0.00025962731505255215, + "loss": 0.0057, + "num_input_tokens_seen": 183706304, + "step": 85045 + }, + { + "epoch": 13.874388254486133, + "grad_norm": 0.0072519490495324135, + "learning_rate": 0.0002595649027317392, + "loss": 0.0041, + "num_input_tokens_seen": 183717312, + "step": 85050 + }, + { + "epoch": 13.875203915171289, + "grad_norm": 0.013897925615310669, + "learning_rate": 0.0002595024952836484, + "loss": 0.2139, + "num_input_tokens_seen": 183728928, + "step": 85055 + }, + { + "epoch": 13.876019575856443, + "grad_norm": 0.0067464206367731094, + "learning_rate": 0.00025944009270954463, + "loss": 0.007, + "num_input_tokens_seen": 183740896, + "step": 85060 + }, + { + "epoch": 13.876835236541599, + "grad_norm": 0.009500235319137573, + "learning_rate": 0.00025937769501069264, + "loss": 0.003, + "num_input_tokens_seen": 183749856, + "step": 85065 + }, + { + "epoch": 13.877650897226754, + "grad_norm": 0.6065912246704102, + "learning_rate": 0.00025931530218835684, + "loss": 0.1251, + "num_input_tokens_seen": 183760256, + "step": 85070 + }, + { + "epoch": 13.878466557911908, + "grad_norm": 0.013110343366861343, + "learning_rate": 0.00025925291424380183, + "loss": 0.0033, + "num_input_tokens_seen": 183771616, + "step": 85075 + }, + { + "epoch": 13.879282218597064, + "grad_norm": 0.10349424928426743, + "learning_rate": 0.00025919053117829185, + "loss": 0.0053, + "num_input_tokens_seen": 183781984, + "step": 85080 + }, + { + "epoch": 13.880097879282218, + "grad_norm": 0.10486262291669846, + "learning_rate": 0.0002591281529930913, + "loss": 0.0494, + "num_input_tokens_seen": 183793440, + "step": 85085 + }, + { + "epoch": 13.880913539967374, + "grad_norm": 0.13725866377353668, + "learning_rate": 0.0002590657796894641, + "loss": 0.0872, + "num_input_tokens_seen": 183803072, + "step": 85090 + }, + { + "epoch": 13.88172920065253, + "grad_norm": 0.003872843226417899, + "learning_rate": 0.0002590034112686749, + "loss": 0.0015, + "num_input_tokens_seen": 183814176, + "step": 85095 + }, + { + "epoch": 13.882544861337683, + "grad_norm": 0.005437849089503288, + "learning_rate": 0.0002589410477319869, + "loss": 0.0028, + "num_input_tokens_seen": 183823712, + "step": 85100 + }, + { + "epoch": 13.883360522022839, + "grad_norm": 0.0025222674012184143, + "learning_rate": 0.0002588786890806647, + "loss": 0.0515, + "num_input_tokens_seen": 183834048, + "step": 85105 + }, + { + "epoch": 13.884176182707993, + "grad_norm": 0.006505226716399193, + "learning_rate": 0.0002588163353159715, + "loss": 0.0485, + "num_input_tokens_seen": 183844864, + "step": 85110 + }, + { + "epoch": 13.884991843393149, + "grad_norm": 0.007280663587152958, + "learning_rate": 0.00025875398643917147, + "loss": 0.0031, + "num_input_tokens_seen": 183856672, + "step": 85115 + }, + { + "epoch": 13.885807504078304, + "grad_norm": 0.26501476764678955, + "learning_rate": 0.00025869164245152765, + "loss": 0.0507, + "num_input_tokens_seen": 183867840, + "step": 85120 + }, + { + "epoch": 13.886623164763458, + "grad_norm": 0.003495575860142708, + "learning_rate": 0.00025862930335430426, + "loss": 0.0043, + "num_input_tokens_seen": 183878688, + "step": 85125 + }, + { + "epoch": 13.887438825448614, + "grad_norm": 0.8573818206787109, + "learning_rate": 0.0002585669691487637, + "loss": 0.0293, + "num_input_tokens_seen": 183889920, + "step": 85130 + }, + { + "epoch": 13.888254486133768, + "grad_norm": 0.09581318497657776, + "learning_rate": 0.00025850463983617005, + "loss": 0.0196, + "num_input_tokens_seen": 183901888, + "step": 85135 + }, + { + "epoch": 13.889070146818923, + "grad_norm": 0.010820590890944004, + "learning_rate": 0.0002584423154177863, + "loss": 0.0068, + "num_input_tokens_seen": 183912672, + "step": 85140 + }, + { + "epoch": 13.88988580750408, + "grad_norm": 0.002558765932917595, + "learning_rate": 0.0002583799958948754, + "loss": 0.0057, + "num_input_tokens_seen": 183922752, + "step": 85145 + }, + { + "epoch": 13.890701468189233, + "grad_norm": 0.007388087455183268, + "learning_rate": 0.00025831768126870035, + "loss": 0.0068, + "num_input_tokens_seen": 183934272, + "step": 85150 + }, + { + "epoch": 13.891517128874389, + "grad_norm": 0.013861283659934998, + "learning_rate": 0.00025825537154052414, + "loss": 0.0042, + "num_input_tokens_seen": 183944320, + "step": 85155 + }, + { + "epoch": 13.892332789559543, + "grad_norm": 0.38176852464675903, + "learning_rate": 0.00025819306671160953, + "loss": 0.1258, + "num_input_tokens_seen": 183954688, + "step": 85160 + }, + { + "epoch": 13.893148450244698, + "grad_norm": 0.06643305718898773, + "learning_rate": 0.00025813076678321914, + "loss": 0.0046, + "num_input_tokens_seen": 183965312, + "step": 85165 + }, + { + "epoch": 13.893964110929852, + "grad_norm": 0.012475523166358471, + "learning_rate": 0.0002580684717566156, + "loss": 0.0137, + "num_input_tokens_seen": 183977056, + "step": 85170 + }, + { + "epoch": 13.894779771615008, + "grad_norm": 0.015131724998354912, + "learning_rate": 0.0002580061816330614, + "loss": 0.096, + "num_input_tokens_seen": 183988224, + "step": 85175 + }, + { + "epoch": 13.895595432300164, + "grad_norm": 0.28440892696380615, + "learning_rate": 0.00025794389641381894, + "loss": 0.0517, + "num_input_tokens_seen": 183999136, + "step": 85180 + }, + { + "epoch": 13.896411092985318, + "grad_norm": 0.01567523181438446, + "learning_rate": 0.0002578816161001505, + "loss": 0.0035, + "num_input_tokens_seen": 184008928, + "step": 85185 + }, + { + "epoch": 13.897226753670473, + "grad_norm": 0.00586994644254446, + "learning_rate": 0.0002578193406933182, + "loss": 0.0591, + "num_input_tokens_seen": 184019520, + "step": 85190 + }, + { + "epoch": 13.898042414355627, + "grad_norm": 0.002432781970128417, + "learning_rate": 0.00025775707019458415, + "loss": 0.0084, + "num_input_tokens_seen": 184030592, + "step": 85195 + }, + { + "epoch": 13.898858075040783, + "grad_norm": 0.005925891920924187, + "learning_rate": 0.0002576948046052105, + "loss": 0.185, + "num_input_tokens_seen": 184041216, + "step": 85200 + }, + { + "epoch": 13.899673735725939, + "grad_norm": 0.039722055196762085, + "learning_rate": 0.000257632543926459, + "loss": 0.0128, + "num_input_tokens_seen": 184051712, + "step": 85205 + }, + { + "epoch": 13.900489396411093, + "grad_norm": 0.01062532514333725, + "learning_rate": 0.0002575702881595914, + "loss": 0.0024, + "num_input_tokens_seen": 184063424, + "step": 85210 + }, + { + "epoch": 13.901305057096248, + "grad_norm": 0.02566523477435112, + "learning_rate": 0.0002575080373058695, + "loss": 0.0604, + "num_input_tokens_seen": 184074912, + "step": 85215 + }, + { + "epoch": 13.902120717781402, + "grad_norm": 0.4470478296279907, + "learning_rate": 0.0002574457913665548, + "loss": 0.0136, + "num_input_tokens_seen": 184085696, + "step": 85220 + }, + { + "epoch": 13.902936378466558, + "grad_norm": 0.049299828708171844, + "learning_rate": 0.00025738355034290886, + "loss": 0.0032, + "num_input_tokens_seen": 184095552, + "step": 85225 + }, + { + "epoch": 13.903752039151712, + "grad_norm": 0.06083039939403534, + "learning_rate": 0.00025732131423619303, + "loss": 0.0059, + "num_input_tokens_seen": 184107008, + "step": 85230 + }, + { + "epoch": 13.904567699836868, + "grad_norm": 0.028353175148367882, + "learning_rate": 0.0002572590830476685, + "loss": 0.0139, + "num_input_tokens_seen": 184117120, + "step": 85235 + }, + { + "epoch": 13.905383360522023, + "grad_norm": 0.37317419052124023, + "learning_rate": 0.0002571968567785967, + "loss": 0.139, + "num_input_tokens_seen": 184129408, + "step": 85240 + }, + { + "epoch": 13.906199021207177, + "grad_norm": 0.009255973622202873, + "learning_rate": 0.0002571346354302387, + "loss": 0.0156, + "num_input_tokens_seen": 184140384, + "step": 85245 + }, + { + "epoch": 13.907014681892333, + "grad_norm": 0.005792307201772928, + "learning_rate": 0.0002570724190038554, + "loss": 0.004, + "num_input_tokens_seen": 184150720, + "step": 85250 + }, + { + "epoch": 13.907830342577487, + "grad_norm": 0.001610096194781363, + "learning_rate": 0.00025701020750070765, + "loss": 0.0021, + "num_input_tokens_seen": 184161344, + "step": 85255 + }, + { + "epoch": 13.908646003262643, + "grad_norm": 0.008702469989657402, + "learning_rate": 0.0002569480009220563, + "loss": 0.026, + "num_input_tokens_seen": 184171936, + "step": 85260 + }, + { + "epoch": 13.909461663947798, + "grad_norm": 0.009642422199249268, + "learning_rate": 0.00025688579926916213, + "loss": 0.0096, + "num_input_tokens_seen": 184183200, + "step": 85265 + }, + { + "epoch": 13.910277324632952, + "grad_norm": 0.009435923770070076, + "learning_rate": 0.0002568236025432855, + "loss": 0.0263, + "num_input_tokens_seen": 184194880, + "step": 85270 + }, + { + "epoch": 13.911092985318108, + "grad_norm": 0.5569436550140381, + "learning_rate": 0.00025676141074568713, + "loss": 0.0775, + "num_input_tokens_seen": 184205248, + "step": 85275 + }, + { + "epoch": 13.911908646003262, + "grad_norm": 0.023024236783385277, + "learning_rate": 0.00025669922387762747, + "loss": 0.0051, + "num_input_tokens_seen": 184214976, + "step": 85280 + }, + { + "epoch": 13.912724306688418, + "grad_norm": 0.4745536744594574, + "learning_rate": 0.00025663704194036653, + "loss": 0.1285, + "num_input_tokens_seen": 184226240, + "step": 85285 + }, + { + "epoch": 13.913539967373573, + "grad_norm": 0.04086502268910408, + "learning_rate": 0.0002565748649351647, + "loss": 0.0046, + "num_input_tokens_seen": 184237248, + "step": 85290 + }, + { + "epoch": 13.914355628058727, + "grad_norm": 0.005516092758625746, + "learning_rate": 0.0002565126928632821, + "loss": 0.0065, + "num_input_tokens_seen": 184248640, + "step": 85295 + }, + { + "epoch": 13.915171288743883, + "grad_norm": 0.02265411801636219, + "learning_rate": 0.00025645052572597856, + "loss": 0.0034, + "num_input_tokens_seen": 184259328, + "step": 85300 + }, + { + "epoch": 13.915986949429037, + "grad_norm": 0.026116758584976196, + "learning_rate": 0.0002563883635245141, + "loss": 0.017, + "num_input_tokens_seen": 184269984, + "step": 85305 + }, + { + "epoch": 13.916802610114193, + "grad_norm": 0.0021625554654747248, + "learning_rate": 0.0002563262062601486, + "loss": 0.0723, + "num_input_tokens_seen": 184279712, + "step": 85310 + }, + { + "epoch": 13.917618270799348, + "grad_norm": 0.02202964387834072, + "learning_rate": 0.0002562640539341415, + "loss": 0.005, + "num_input_tokens_seen": 184289760, + "step": 85315 + }, + { + "epoch": 13.918433931484502, + "grad_norm": 0.10059604048728943, + "learning_rate": 0.0002562019065477527, + "loss": 0.0061, + "num_input_tokens_seen": 184300512, + "step": 85320 + }, + { + "epoch": 13.919249592169658, + "grad_norm": 0.08960135281085968, + "learning_rate": 0.00025613976410224145, + "loss": 0.0109, + "num_input_tokens_seen": 184312224, + "step": 85325 + }, + { + "epoch": 13.920065252854812, + "grad_norm": 0.07900179177522659, + "learning_rate": 0.00025607762659886726, + "loss": 0.0702, + "num_input_tokens_seen": 184323360, + "step": 85330 + }, + { + "epoch": 13.920880913539968, + "grad_norm": 0.024132825434207916, + "learning_rate": 0.00025601549403888934, + "loss": 0.0092, + "num_input_tokens_seen": 184333152, + "step": 85335 + }, + { + "epoch": 13.921696574225122, + "grad_norm": 0.02644437924027443, + "learning_rate": 0.00025595336642356706, + "loss": 0.0067, + "num_input_tokens_seen": 184343488, + "step": 85340 + }, + { + "epoch": 13.922512234910277, + "grad_norm": 0.01221081055700779, + "learning_rate": 0.0002558912437541594, + "loss": 0.0037, + "num_input_tokens_seen": 184354080, + "step": 85345 + }, + { + "epoch": 13.923327895595433, + "grad_norm": 0.20858865976333618, + "learning_rate": 0.0002558291260319253, + "loss": 0.0064, + "num_input_tokens_seen": 184364352, + "step": 85350 + }, + { + "epoch": 13.924143556280587, + "grad_norm": 0.006945399101823568, + "learning_rate": 0.0002557670132581235, + "loss": 0.0041, + "num_input_tokens_seen": 184376672, + "step": 85355 + }, + { + "epoch": 13.924959216965743, + "grad_norm": 0.470683753490448, + "learning_rate": 0.00025570490543401345, + "loss": 0.0933, + "num_input_tokens_seen": 184387104, + "step": 85360 + }, + { + "epoch": 13.925774877650896, + "grad_norm": 0.4291492700576782, + "learning_rate": 0.00025564280256085305, + "loss": 0.0118, + "num_input_tokens_seen": 184396736, + "step": 85365 + }, + { + "epoch": 13.926590538336052, + "grad_norm": 0.04624152556061745, + "learning_rate": 0.0002555807046399016, + "loss": 0.0194, + "num_input_tokens_seen": 184406208, + "step": 85370 + }, + { + "epoch": 13.927406199021208, + "grad_norm": 0.021825360134243965, + "learning_rate": 0.00025551861167241675, + "loss": 0.0034, + "num_input_tokens_seen": 184417280, + "step": 85375 + }, + { + "epoch": 13.928221859706362, + "grad_norm": 0.33354225754737854, + "learning_rate": 0.00025545652365965767, + "loss": 0.0929, + "num_input_tokens_seen": 184428768, + "step": 85380 + }, + { + "epoch": 13.929037520391518, + "grad_norm": 0.02854643389582634, + "learning_rate": 0.00025539444060288235, + "loss": 0.0055, + "num_input_tokens_seen": 184439904, + "step": 85385 + }, + { + "epoch": 13.929853181076671, + "grad_norm": 0.0056158872321248055, + "learning_rate": 0.000255332362503349, + "loss": 0.0103, + "num_input_tokens_seen": 184450496, + "step": 85390 + }, + { + "epoch": 13.930668841761827, + "grad_norm": 0.027426814660429955, + "learning_rate": 0.00025527028936231567, + "loss": 0.0137, + "num_input_tokens_seen": 184461152, + "step": 85395 + }, + { + "epoch": 13.931484502446983, + "grad_norm": 0.03571975603699684, + "learning_rate": 0.0002552082211810405, + "loss": 0.0196, + "num_input_tokens_seen": 184472544, + "step": 85400 + }, + { + "epoch": 13.932300163132137, + "grad_norm": 0.0027300782967358828, + "learning_rate": 0.0002551461579607811, + "loss": 0.0026, + "num_input_tokens_seen": 184482240, + "step": 85405 + }, + { + "epoch": 13.933115823817293, + "grad_norm": 0.08104506134986877, + "learning_rate": 0.00025508409970279554, + "loss": 0.0108, + "num_input_tokens_seen": 184492448, + "step": 85410 + }, + { + "epoch": 13.933931484502446, + "grad_norm": 0.0030523252207785845, + "learning_rate": 0.00025502204640834135, + "loss": 0.0523, + "num_input_tokens_seen": 184502816, + "step": 85415 + }, + { + "epoch": 13.934747145187602, + "grad_norm": 0.04271979257464409, + "learning_rate": 0.0002549599980786762, + "loss": 0.055, + "num_input_tokens_seen": 184514080, + "step": 85420 + }, + { + "epoch": 13.935562805872756, + "grad_norm": 0.0023610251955688, + "learning_rate": 0.0002548979547150576, + "loss": 0.0028, + "num_input_tokens_seen": 184525312, + "step": 85425 + }, + { + "epoch": 13.936378466557912, + "grad_norm": 0.00905569177120924, + "learning_rate": 0.0002548359163187428, + "loss": 0.0094, + "num_input_tokens_seen": 184536224, + "step": 85430 + }, + { + "epoch": 13.937194127243067, + "grad_norm": 0.017019646242260933, + "learning_rate": 0.0002547738828909891, + "loss": 0.0426, + "num_input_tokens_seen": 184547776, + "step": 85435 + }, + { + "epoch": 13.938009787928221, + "grad_norm": 0.004658792167901993, + "learning_rate": 0.0002547118544330539, + "loss": 0.1296, + "num_input_tokens_seen": 184558656, + "step": 85440 + }, + { + "epoch": 13.938825448613377, + "grad_norm": 0.001412428799085319, + "learning_rate": 0.0002546498309461941, + "loss": 0.0019, + "num_input_tokens_seen": 184568640, + "step": 85445 + }, + { + "epoch": 13.939641109298531, + "grad_norm": 3.1154069900512695, + "learning_rate": 0.00025458781243166667, + "loss": 0.0838, + "num_input_tokens_seen": 184580736, + "step": 85450 + }, + { + "epoch": 13.940456769983687, + "grad_norm": 0.03391305357217789, + "learning_rate": 0.0002545257988907286, + "loss": 0.0034, + "num_input_tokens_seen": 184593184, + "step": 85455 + }, + { + "epoch": 13.941272430668842, + "grad_norm": 0.00259930407628417, + "learning_rate": 0.0002544637903246364, + "loss": 0.0108, + "num_input_tokens_seen": 184603392, + "step": 85460 + }, + { + "epoch": 13.942088091353996, + "grad_norm": 0.0017219664296135306, + "learning_rate": 0.0002544017867346474, + "loss": 0.0688, + "num_input_tokens_seen": 184614528, + "step": 85465 + }, + { + "epoch": 13.942903752039152, + "grad_norm": 0.017398755997419357, + "learning_rate": 0.0002543397881220173, + "loss": 0.0081, + "num_input_tokens_seen": 184625888, + "step": 85470 + }, + { + "epoch": 13.943719412724306, + "grad_norm": 0.05955229327082634, + "learning_rate": 0.00025427779448800345, + "loss": 0.0093, + "num_input_tokens_seen": 184636000, + "step": 85475 + }, + { + "epoch": 13.944535073409462, + "grad_norm": 0.012460576370358467, + "learning_rate": 0.0002542158058338615, + "loss": 0.0027, + "num_input_tokens_seen": 184647808, + "step": 85480 + }, + { + "epoch": 13.945350734094617, + "grad_norm": 0.0023759007453918457, + "learning_rate": 0.00025415382216084837, + "loss": 0.0084, + "num_input_tokens_seen": 184660000, + "step": 85485 + }, + { + "epoch": 13.946166394779771, + "grad_norm": 0.16356733441352844, + "learning_rate": 0.0002540918434702195, + "loss": 0.0066, + "num_input_tokens_seen": 184670240, + "step": 85490 + }, + { + "epoch": 13.946982055464927, + "grad_norm": 0.00025423121405765414, + "learning_rate": 0.0002540298697632318, + "loss": 0.0015, + "num_input_tokens_seen": 184681056, + "step": 85495 + }, + { + "epoch": 13.947797716150081, + "grad_norm": 0.11059121787548065, + "learning_rate": 0.0002539679010411404, + "loss": 0.0095, + "num_input_tokens_seen": 184691264, + "step": 85500 + }, + { + "epoch": 13.948613376835237, + "grad_norm": 0.020271632820367813, + "learning_rate": 0.00025390593730520206, + "loss": 0.0056, + "num_input_tokens_seen": 184701824, + "step": 85505 + }, + { + "epoch": 13.949429037520392, + "grad_norm": 0.4572683572769165, + "learning_rate": 0.00025384397855667164, + "loss": 0.0541, + "num_input_tokens_seen": 184712640, + "step": 85510 + }, + { + "epoch": 13.950244698205546, + "grad_norm": 0.005310698878020048, + "learning_rate": 0.0002537820247968057, + "loss": 0.0183, + "num_input_tokens_seen": 184723552, + "step": 85515 + }, + { + "epoch": 13.951060358890702, + "grad_norm": 0.005224080290645361, + "learning_rate": 0.00025372007602685894, + "loss": 0.0073, + "num_input_tokens_seen": 184735008, + "step": 85520 + }, + { + "epoch": 13.951876019575856, + "grad_norm": 0.007012546062469482, + "learning_rate": 0.00025365813224808746, + "loss": 0.0024, + "num_input_tokens_seen": 184745856, + "step": 85525 + }, + { + "epoch": 13.952691680261012, + "grad_norm": 0.0013683406868949533, + "learning_rate": 0.00025359619346174644, + "loss": 0.0022, + "num_input_tokens_seen": 184756000, + "step": 85530 + }, + { + "epoch": 13.953507340946166, + "grad_norm": 0.027201242744922638, + "learning_rate": 0.0002535342596690912, + "loss": 0.0067, + "num_input_tokens_seen": 184766752, + "step": 85535 + }, + { + "epoch": 13.954323001631321, + "grad_norm": 0.6355860829353333, + "learning_rate": 0.0002534723308713768, + "loss": 0.0863, + "num_input_tokens_seen": 184778592, + "step": 85540 + }, + { + "epoch": 13.955138662316477, + "grad_norm": 0.002448199549689889, + "learning_rate": 0.0002534104070698584, + "loss": 0.0034, + "num_input_tokens_seen": 184789376, + "step": 85545 + }, + { + "epoch": 13.955954323001631, + "grad_norm": 0.0003716732608154416, + "learning_rate": 0.00025334848826579095, + "loss": 0.008, + "num_input_tokens_seen": 184801152, + "step": 85550 + }, + { + "epoch": 13.956769983686787, + "grad_norm": 0.3382447361946106, + "learning_rate": 0.0002532865744604292, + "loss": 0.0286, + "num_input_tokens_seen": 184812256, + "step": 85555 + }, + { + "epoch": 13.95758564437194, + "grad_norm": 0.003654716769233346, + "learning_rate": 0.000253224665655028, + "loss": 0.0095, + "num_input_tokens_seen": 184823680, + "step": 85560 + }, + { + "epoch": 13.958401305057096, + "grad_norm": 0.12044371664524078, + "learning_rate": 0.0002531627618508421, + "loss": 0.0155, + "num_input_tokens_seen": 184835072, + "step": 85565 + }, + { + "epoch": 13.959216965742252, + "grad_norm": 0.6087353825569153, + "learning_rate": 0.00025310086304912584, + "loss": 0.0122, + "num_input_tokens_seen": 184844864, + "step": 85570 + }, + { + "epoch": 13.960032626427406, + "grad_norm": 0.018098818138241768, + "learning_rate": 0.0002530389692511337, + "loss": 0.0032, + "num_input_tokens_seen": 184855776, + "step": 85575 + }, + { + "epoch": 13.960848287112562, + "grad_norm": 0.014368905685842037, + "learning_rate": 0.0002529770804581205, + "loss": 0.1265, + "num_input_tokens_seen": 184866272, + "step": 85580 + }, + { + "epoch": 13.961663947797716, + "grad_norm": 0.012142996303737164, + "learning_rate": 0.0002529151966713398, + "loss": 0.0033, + "num_input_tokens_seen": 184877312, + "step": 85585 + }, + { + "epoch": 13.962479608482871, + "grad_norm": 0.002565637230873108, + "learning_rate": 0.00025285331789204633, + "loss": 0.0027, + "num_input_tokens_seen": 184888192, + "step": 85590 + }, + { + "epoch": 13.963295269168025, + "grad_norm": 0.0035322627518326044, + "learning_rate": 0.0002527914441214937, + "loss": 0.0052, + "num_input_tokens_seen": 184896448, + "step": 85595 + }, + { + "epoch": 13.964110929853181, + "grad_norm": 0.3586525022983551, + "learning_rate": 0.00025272957536093634, + "loss": 0.0117, + "num_input_tokens_seen": 184908032, + "step": 85600 + }, + { + "epoch": 13.964926590538337, + "grad_norm": 0.0012097652070224285, + "learning_rate": 0.00025266771161162736, + "loss": 0.0018, + "num_input_tokens_seen": 184918176, + "step": 85605 + }, + { + "epoch": 13.96574225122349, + "grad_norm": 0.005765705835074186, + "learning_rate": 0.00025260585287482153, + "loss": 0.0018, + "num_input_tokens_seen": 184927104, + "step": 85610 + }, + { + "epoch": 13.966557911908646, + "grad_norm": 0.0335857979953289, + "learning_rate": 0.0002525439991517714, + "loss": 0.004, + "num_input_tokens_seen": 184938240, + "step": 85615 + }, + { + "epoch": 13.9673735725938, + "grad_norm": 0.004372979048639536, + "learning_rate": 0.0002524821504437316, + "loss": 0.0012, + "num_input_tokens_seen": 184949088, + "step": 85620 + }, + { + "epoch": 13.968189233278956, + "grad_norm": 0.0033910067286342382, + "learning_rate": 0.0002524203067519545, + "loss": 0.0017, + "num_input_tokens_seen": 184960096, + "step": 85625 + }, + { + "epoch": 13.969004893964112, + "grad_norm": 0.021418794989585876, + "learning_rate": 0.00025235846807769433, + "loss": 0.0166, + "num_input_tokens_seen": 184969504, + "step": 85630 + }, + { + "epoch": 13.969820554649266, + "grad_norm": 0.004271005280315876, + "learning_rate": 0.0002522966344222036, + "loss": 0.1278, + "num_input_tokens_seen": 184980320, + "step": 85635 + }, + { + "epoch": 13.970636215334421, + "grad_norm": 0.026607416570186615, + "learning_rate": 0.00025223480578673627, + "loss": 0.0158, + "num_input_tokens_seen": 184991616, + "step": 85640 + }, + { + "epoch": 13.971451876019575, + "grad_norm": 0.005434891674667597, + "learning_rate": 0.00025217298217254446, + "loss": 0.012, + "num_input_tokens_seen": 185000352, + "step": 85645 + }, + { + "epoch": 13.97226753670473, + "grad_norm": 0.00019001559121534228, + "learning_rate": 0.0002521111635808819, + "loss": 0.0244, + "num_input_tokens_seen": 185011328, + "step": 85650 + }, + { + "epoch": 13.973083197389887, + "grad_norm": 0.0019092840375378728, + "learning_rate": 0.0002520493500130008, + "loss": 0.0061, + "num_input_tokens_seen": 185021408, + "step": 85655 + }, + { + "epoch": 13.97389885807504, + "grad_norm": 0.0820712149143219, + "learning_rate": 0.0002519875414701545, + "loss": 0.0187, + "num_input_tokens_seen": 185031392, + "step": 85660 + }, + { + "epoch": 13.974714518760196, + "grad_norm": 0.01357912179082632, + "learning_rate": 0.0002519257379535949, + "loss": 0.0098, + "num_input_tokens_seen": 185042176, + "step": 85665 + }, + { + "epoch": 13.97553017944535, + "grad_norm": 0.0026590253692120314, + "learning_rate": 0.00025186393946457516, + "loss": 0.1236, + "num_input_tokens_seen": 185053248, + "step": 85670 + }, + { + "epoch": 13.976345840130506, + "grad_norm": 0.25764936208724976, + "learning_rate": 0.0002518021460043474, + "loss": 0.0109, + "num_input_tokens_seen": 185063872, + "step": 85675 + }, + { + "epoch": 13.977161500815662, + "grad_norm": 0.0026979451067745686, + "learning_rate": 0.0002517403575741641, + "loss": 0.0074, + "num_input_tokens_seen": 185076192, + "step": 85680 + }, + { + "epoch": 13.977977161500815, + "grad_norm": 0.27346065640449524, + "learning_rate": 0.0002516785741752773, + "loss": 0.0194, + "num_input_tokens_seen": 185086656, + "step": 85685 + }, + { + "epoch": 13.978792822185971, + "grad_norm": 0.0007937068003229797, + "learning_rate": 0.0002516167958089393, + "loss": 0.0018, + "num_input_tokens_seen": 185097728, + "step": 85690 + }, + { + "epoch": 13.979608482871125, + "grad_norm": 0.01872055046260357, + "learning_rate": 0.00025155502247640196, + "loss": 0.0173, + "num_input_tokens_seen": 185109120, + "step": 85695 + }, + { + "epoch": 13.98042414355628, + "grad_norm": 0.04877206310629845, + "learning_rate": 0.0002514932541789173, + "loss": 0.0046, + "num_input_tokens_seen": 185117792, + "step": 85700 + }, + { + "epoch": 13.981239804241435, + "grad_norm": 0.005501462146639824, + "learning_rate": 0.0002514314909177371, + "loss": 0.002, + "num_input_tokens_seen": 185128928, + "step": 85705 + }, + { + "epoch": 13.98205546492659, + "grad_norm": 0.01914447546005249, + "learning_rate": 0.00025136973269411305, + "loss": 0.0059, + "num_input_tokens_seen": 185138400, + "step": 85710 + }, + { + "epoch": 13.982871125611746, + "grad_norm": 0.0917380303144455, + "learning_rate": 0.0002513079795092968, + "loss": 0.0073, + "num_input_tokens_seen": 185149760, + "step": 85715 + }, + { + "epoch": 13.9836867862969, + "grad_norm": 0.0017335203010588884, + "learning_rate": 0.0002512462313645396, + "loss": 0.0126, + "num_input_tokens_seen": 185160864, + "step": 85720 + }, + { + "epoch": 13.984502446982056, + "grad_norm": 0.023793328553438187, + "learning_rate": 0.0002511844882610935, + "loss": 0.0031, + "num_input_tokens_seen": 185169888, + "step": 85725 + }, + { + "epoch": 13.98531810766721, + "grad_norm": 0.20731094479560852, + "learning_rate": 0.00025112275020020903, + "loss": 0.0397, + "num_input_tokens_seen": 185181632, + "step": 85730 + }, + { + "epoch": 13.986133768352365, + "grad_norm": 0.09553972631692886, + "learning_rate": 0.0002510610171831381, + "loss": 0.0038, + "num_input_tokens_seen": 185192576, + "step": 85735 + }, + { + "epoch": 13.986949429037521, + "grad_norm": 0.008362910710275173, + "learning_rate": 0.00025099928921113113, + "loss": 0.0123, + "num_input_tokens_seen": 185203968, + "step": 85740 + }, + { + "epoch": 13.987765089722675, + "grad_norm": 0.1448216736316681, + "learning_rate": 0.0002509375662854397, + "loss": 0.0107, + "num_input_tokens_seen": 185215808, + "step": 85745 + }, + { + "epoch": 13.98858075040783, + "grad_norm": 0.07602065801620483, + "learning_rate": 0.0002508758484073142, + "loss": 0.033, + "num_input_tokens_seen": 185226720, + "step": 85750 + }, + { + "epoch": 13.989396411092985, + "grad_norm": 0.0024841674603521824, + "learning_rate": 0.00025081413557800604, + "loss": 0.0387, + "num_input_tokens_seen": 185236704, + "step": 85755 + }, + { + "epoch": 13.99021207177814, + "grad_norm": 0.10913265496492386, + "learning_rate": 0.0002507524277987651, + "loss": 0.0089, + "num_input_tokens_seen": 185248256, + "step": 85760 + }, + { + "epoch": 13.991027732463294, + "grad_norm": 0.005257087759673595, + "learning_rate": 0.0002506907250708428, + "loss": 0.0047, + "num_input_tokens_seen": 185258976, + "step": 85765 + }, + { + "epoch": 13.99184339314845, + "grad_norm": 0.003243145067244768, + "learning_rate": 0.0002506290273954888, + "loss": 0.0076, + "num_input_tokens_seen": 185270048, + "step": 85770 + }, + { + "epoch": 13.992659053833606, + "grad_norm": 0.033470362424850464, + "learning_rate": 0.00025056733477395415, + "loss": 0.0084, + "num_input_tokens_seen": 185281856, + "step": 85775 + }, + { + "epoch": 13.99347471451876, + "grad_norm": 0.015511090867221355, + "learning_rate": 0.0002505056472074889, + "loss": 0.0025, + "num_input_tokens_seen": 185293152, + "step": 85780 + }, + { + "epoch": 13.994290375203915, + "grad_norm": 0.0026244802866131067, + "learning_rate": 0.0002504439646973432, + "loss": 0.0656, + "num_input_tokens_seen": 185305184, + "step": 85785 + }, + { + "epoch": 13.99510603588907, + "grad_norm": 0.005293934140354395, + "learning_rate": 0.00025038228724476715, + "loss": 0.0078, + "num_input_tokens_seen": 185316480, + "step": 85790 + }, + { + "epoch": 13.995921696574225, + "grad_norm": 0.3593876361846924, + "learning_rate": 0.00025032061485101066, + "loss": 0.0218, + "num_input_tokens_seen": 185325888, + "step": 85795 + }, + { + "epoch": 13.99673735725938, + "grad_norm": 0.0052078114822506905, + "learning_rate": 0.0002502589475173237, + "loss": 0.0028, + "num_input_tokens_seen": 185335808, + "step": 85800 + }, + { + "epoch": 13.997553017944535, + "grad_norm": 0.011453598737716675, + "learning_rate": 0.000250197285244956, + "loss": 0.0037, + "num_input_tokens_seen": 185347040, + "step": 85805 + }, + { + "epoch": 13.99836867862969, + "grad_norm": 0.00860854797065258, + "learning_rate": 0.0002501356280351572, + "loss": 0.0203, + "num_input_tokens_seen": 185357888, + "step": 85810 + }, + { + "epoch": 13.999184339314844, + "grad_norm": 0.0036592965479940176, + "learning_rate": 0.00025007397588917683, + "loss": 0.0063, + "num_input_tokens_seen": 185368896, + "step": 85815 + }, + { + "epoch": 14.0, + "grad_norm": 0.8848853707313538, + "learning_rate": 0.0002500123288082644, + "loss": 0.2741, + "num_input_tokens_seen": 185378480, + "step": 85820 + }, + { + "epoch": 14.0, + "eval_loss": 0.2316729873418808, + "eval_runtime": 104.2082, + "eval_samples_per_second": 26.15, + "eval_steps_per_second": 6.545, + "num_input_tokens_seen": 185378480, + "step": 85820 + }, + { + "epoch": 14.000815660685156, + "grad_norm": 0.0017862764652818441, + "learning_rate": 0.00024995068679366933, + "loss": 0.001, + "num_input_tokens_seen": 185389712, + "step": 85825 + }, + { + "epoch": 14.00163132137031, + "grad_norm": 0.00797954760491848, + "learning_rate": 0.00024988904984664075, + "loss": 0.0041, + "num_input_tokens_seen": 185400880, + "step": 85830 + }, + { + "epoch": 14.002446982055465, + "grad_norm": 0.09861485660076141, + "learning_rate": 0.00024982741796842787, + "loss": 0.0046, + "num_input_tokens_seen": 185411248, + "step": 85835 + }, + { + "epoch": 14.00326264274062, + "grad_norm": 0.013391591608524323, + "learning_rate": 0.00024976579116027975, + "loss": 0.1774, + "num_input_tokens_seen": 185422608, + "step": 85840 + }, + { + "epoch": 14.004078303425775, + "grad_norm": 0.0022156049963086843, + "learning_rate": 0.00024970416942344533, + "loss": 0.0014, + "num_input_tokens_seen": 185433040, + "step": 85845 + }, + { + "epoch": 14.00489396411093, + "grad_norm": 0.009373231790959835, + "learning_rate": 0.00024964255275917335, + "loss": 0.002, + "num_input_tokens_seen": 185444656, + "step": 85850 + }, + { + "epoch": 14.005709624796085, + "grad_norm": 0.08181966841220856, + "learning_rate": 0.00024958094116871274, + "loss": 0.0051, + "num_input_tokens_seen": 185456368, + "step": 85855 + }, + { + "epoch": 14.00652528548124, + "grad_norm": 0.003635540371760726, + "learning_rate": 0.000249519334653312, + "loss": 0.003, + "num_input_tokens_seen": 185465872, + "step": 85860 + }, + { + "epoch": 14.007340946166394, + "grad_norm": 0.0031431580428034067, + "learning_rate": 0.0002494577332142195, + "loss": 0.0015, + "num_input_tokens_seen": 185477520, + "step": 85865 + }, + { + "epoch": 14.00815660685155, + "grad_norm": 0.04018649458885193, + "learning_rate": 0.0002493961368526843, + "loss": 0.0112, + "num_input_tokens_seen": 185488688, + "step": 85870 + }, + { + "epoch": 14.008972267536704, + "grad_norm": 0.017326852306723595, + "learning_rate": 0.0002493345455699538, + "loss": 0.0032, + "num_input_tokens_seen": 185499632, + "step": 85875 + }, + { + "epoch": 14.00978792822186, + "grad_norm": 0.12720097601413727, + "learning_rate": 0.000249272959367277, + "loss": 0.0134, + "num_input_tokens_seen": 185510928, + "step": 85880 + }, + { + "epoch": 14.010603588907015, + "grad_norm": 0.006081325467675924, + "learning_rate": 0.0002492113782459017, + "loss": 0.0023, + "num_input_tokens_seen": 185521712, + "step": 85885 + }, + { + "epoch": 14.01141924959217, + "grad_norm": 0.075434111058712, + "learning_rate": 0.00024914980220707605, + "loss": 0.0024, + "num_input_tokens_seen": 185532272, + "step": 85890 + }, + { + "epoch": 14.012234910277325, + "grad_norm": 0.01512030977755785, + "learning_rate": 0.00024908823125204785, + "loss": 0.0018, + "num_input_tokens_seen": 185543280, + "step": 85895 + }, + { + "epoch": 14.013050570962479, + "grad_norm": 0.08885496854782104, + "learning_rate": 0.00024902666538206494, + "loss": 0.0064, + "num_input_tokens_seen": 185553392, + "step": 85900 + }, + { + "epoch": 14.013866231647635, + "grad_norm": 0.022448772564530373, + "learning_rate": 0.000248965104598375, + "loss": 0.0022, + "num_input_tokens_seen": 185564464, + "step": 85905 + }, + { + "epoch": 14.01468189233279, + "grad_norm": 0.09179883450269699, + "learning_rate": 0.0002489035489022257, + "loss": 0.0146, + "num_input_tokens_seen": 185575664, + "step": 85910 + }, + { + "epoch": 14.015497553017944, + "grad_norm": 0.00924753863364458, + "learning_rate": 0.0002488419982948646, + "loss": 0.0129, + "num_input_tokens_seen": 185586224, + "step": 85915 + }, + { + "epoch": 14.0163132137031, + "grad_norm": 0.007630279287695885, + "learning_rate": 0.0002487804527775389, + "loss": 0.0089, + "num_input_tokens_seen": 185596112, + "step": 85920 + }, + { + "epoch": 14.017128874388254, + "grad_norm": 0.006223857868462801, + "learning_rate": 0.0002487189123514961, + "loss": 0.0053, + "num_input_tokens_seen": 185606544, + "step": 85925 + }, + { + "epoch": 14.01794453507341, + "grad_norm": 0.019415950402617455, + "learning_rate": 0.0002486573770179833, + "loss": 0.0046, + "num_input_tokens_seen": 185615824, + "step": 85930 + }, + { + "epoch": 14.018760195758565, + "grad_norm": 0.024973466992378235, + "learning_rate": 0.00024859584677824757, + "loss": 0.0171, + "num_input_tokens_seen": 185627312, + "step": 85935 + }, + { + "epoch": 14.01957585644372, + "grad_norm": 0.020961524918675423, + "learning_rate": 0.00024853432163353596, + "loss": 0.0049, + "num_input_tokens_seen": 185638064, + "step": 85940 + }, + { + "epoch": 14.020391517128875, + "grad_norm": 0.06421557813882828, + "learning_rate": 0.00024847280158509535, + "loss": 0.0089, + "num_input_tokens_seen": 185649200, + "step": 85945 + }, + { + "epoch": 14.021207177814029, + "grad_norm": 0.032138630747795105, + "learning_rate": 0.00024841128663417243, + "loss": 0.1026, + "num_input_tokens_seen": 185658928, + "step": 85950 + }, + { + "epoch": 14.022022838499185, + "grad_norm": 0.035478681325912476, + "learning_rate": 0.000248349776782014, + "loss": 0.0041, + "num_input_tokens_seen": 185670544, + "step": 85955 + }, + { + "epoch": 14.022838499184338, + "grad_norm": 0.016271864995360374, + "learning_rate": 0.0002482882720298666, + "loss": 0.0032, + "num_input_tokens_seen": 185681808, + "step": 85960 + }, + { + "epoch": 14.023654159869494, + "grad_norm": 0.03173820301890373, + "learning_rate": 0.0002482267723789767, + "loss": 0.0087, + "num_input_tokens_seen": 185692720, + "step": 85965 + }, + { + "epoch": 14.02446982055465, + "grad_norm": 0.004889285191893578, + "learning_rate": 0.0002481652778305906, + "loss": 0.0028, + "num_input_tokens_seen": 185702992, + "step": 85970 + }, + { + "epoch": 14.025285481239804, + "grad_norm": 0.0015836816746741533, + "learning_rate": 0.00024810378838595467, + "loss": 0.0017, + "num_input_tokens_seen": 185714416, + "step": 85975 + }, + { + "epoch": 14.02610114192496, + "grad_norm": 0.30528175830841064, + "learning_rate": 0.00024804230404631495, + "loss": 0.1344, + "num_input_tokens_seen": 185724624, + "step": 85980 + }, + { + "epoch": 14.026916802610113, + "grad_norm": 0.011321947909891605, + "learning_rate": 0.0002479808248129174, + "loss": 0.0047, + "num_input_tokens_seen": 185735248, + "step": 85985 + }, + { + "epoch": 14.02773246329527, + "grad_norm": 0.007731277495622635, + "learning_rate": 0.00024791935068700855, + "loss": 0.0023, + "num_input_tokens_seen": 185745168, + "step": 85990 + }, + { + "epoch": 14.028548123980425, + "grad_norm": 0.0016969919670373201, + "learning_rate": 0.0002478578816698335, + "loss": 0.0021, + "num_input_tokens_seen": 185757200, + "step": 85995 + }, + { + "epoch": 14.029363784665579, + "grad_norm": 0.06662982702255249, + "learning_rate": 0.00024779641776263866, + "loss": 0.0056, + "num_input_tokens_seen": 185767664, + "step": 86000 + }, + { + "epoch": 14.030179445350734, + "grad_norm": 0.013063081540167332, + "learning_rate": 0.00024773495896666904, + "loss": 0.0066, + "num_input_tokens_seen": 185777040, + "step": 86005 + }, + { + "epoch": 14.030995106035888, + "grad_norm": 0.0027070636861026287, + "learning_rate": 0.0002476735052831706, + "loss": 0.0099, + "num_input_tokens_seen": 185788432, + "step": 86010 + }, + { + "epoch": 14.031810766721044, + "grad_norm": 0.05823206901550293, + "learning_rate": 0.0002476120567133888, + "loss": 0.0028, + "num_input_tokens_seen": 185799088, + "step": 86015 + }, + { + "epoch": 14.0326264274062, + "grad_norm": 0.022384580224752426, + "learning_rate": 0.0002475506132585687, + "loss": 0.0028, + "num_input_tokens_seen": 185808944, + "step": 86020 + }, + { + "epoch": 14.033442088091354, + "grad_norm": 0.022329630330204964, + "learning_rate": 0.0002474891749199558, + "loss": 0.0029, + "num_input_tokens_seen": 185819376, + "step": 86025 + }, + { + "epoch": 14.03425774877651, + "grad_norm": 0.0012048856588080525, + "learning_rate": 0.000247427741698795, + "loss": 0.0026, + "num_input_tokens_seen": 185829904, + "step": 86030 + }, + { + "epoch": 14.035073409461663, + "grad_norm": 0.0027390222530812025, + "learning_rate": 0.00024736631359633147, + "loss": 0.0061, + "num_input_tokens_seen": 185840528, + "step": 86035 + }, + { + "epoch": 14.035889070146819, + "grad_norm": 0.01180250570178032, + "learning_rate": 0.00024730489061381013, + "loss": 0.002, + "num_input_tokens_seen": 185851984, + "step": 86040 + }, + { + "epoch": 14.036704730831975, + "grad_norm": 0.008089886978268623, + "learning_rate": 0.00024724347275247564, + "loss": 0.0603, + "num_input_tokens_seen": 185862896, + "step": 86045 + }, + { + "epoch": 14.037520391517129, + "grad_norm": 0.0009739563683979213, + "learning_rate": 0.0002471820600135729, + "loss": 0.0009, + "num_input_tokens_seen": 185873904, + "step": 86050 + }, + { + "epoch": 14.038336052202284, + "grad_norm": 0.002611349569633603, + "learning_rate": 0.0002471206523983465, + "loss": 0.0064, + "num_input_tokens_seen": 185885232, + "step": 86055 + }, + { + "epoch": 14.039151712887438, + "grad_norm": 0.0013060198398306966, + "learning_rate": 0.00024705924990804076, + "loss": 0.0075, + "num_input_tokens_seen": 185895920, + "step": 86060 + }, + { + "epoch": 14.039967373572594, + "grad_norm": 0.0006362605490721762, + "learning_rate": 0.0002469978525439002, + "loss": 0.002, + "num_input_tokens_seen": 185906928, + "step": 86065 + }, + { + "epoch": 14.040783034257748, + "grad_norm": 0.0027871110942214727, + "learning_rate": 0.00024693646030716923, + "loss": 0.0035, + "num_input_tokens_seen": 185917840, + "step": 86070 + }, + { + "epoch": 14.041598694942904, + "grad_norm": 0.044228702783584595, + "learning_rate": 0.0002468750731990918, + "loss": 0.0053, + "num_input_tokens_seen": 185928304, + "step": 86075 + }, + { + "epoch": 14.04241435562806, + "grad_norm": 0.016400212422013283, + "learning_rate": 0.0002468136912209122, + "loss": 0.1018, + "num_input_tokens_seen": 185938992, + "step": 86080 + }, + { + "epoch": 14.043230016313213, + "grad_norm": 0.03972737863659859, + "learning_rate": 0.0002467523143738743, + "loss": 0.0096, + "num_input_tokens_seen": 185949584, + "step": 86085 + }, + { + "epoch": 14.044045676998369, + "grad_norm": 0.0019362044986337423, + "learning_rate": 0.00024669094265922204, + "loss": 0.003, + "num_input_tokens_seen": 185961296, + "step": 86090 + }, + { + "epoch": 14.044861337683523, + "grad_norm": 0.2900139391422272, + "learning_rate": 0.00024662957607819914, + "loss": 0.0069, + "num_input_tokens_seen": 185972400, + "step": 86095 + }, + { + "epoch": 14.045676998368679, + "grad_norm": 0.0015170919941738248, + "learning_rate": 0.00024656821463204913, + "loss": 0.0109, + "num_input_tokens_seen": 185982736, + "step": 86100 + }, + { + "epoch": 14.046492659053834, + "grad_norm": 0.04802202805876732, + "learning_rate": 0.0002465068583220161, + "loss": 0.0033, + "num_input_tokens_seen": 185993712, + "step": 86105 + }, + { + "epoch": 14.047308319738988, + "grad_norm": 0.0018274127505719662, + "learning_rate": 0.0002464455071493429, + "loss": 0.0122, + "num_input_tokens_seen": 186004368, + "step": 86110 + }, + { + "epoch": 14.048123980424144, + "grad_norm": 0.0069192443042993546, + "learning_rate": 0.00024638416111527346, + "loss": 0.0012, + "num_input_tokens_seen": 186014736, + "step": 86115 + }, + { + "epoch": 14.048939641109298, + "grad_norm": 0.0008330877753905952, + "learning_rate": 0.0002463228202210503, + "loss": 0.0139, + "num_input_tokens_seen": 186025392, + "step": 86120 + }, + { + "epoch": 14.049755301794454, + "grad_norm": 0.002327044727280736, + "learning_rate": 0.00024626148446791745, + "loss": 0.0051, + "num_input_tokens_seen": 186037008, + "step": 86125 + }, + { + "epoch": 14.05057096247961, + "grad_norm": 0.003909484948962927, + "learning_rate": 0.00024620015385711706, + "loss": 0.0033, + "num_input_tokens_seen": 186048048, + "step": 86130 + }, + { + "epoch": 14.051386623164763, + "grad_norm": 0.006379165221005678, + "learning_rate": 0.000246138828389893, + "loss": 0.0064, + "num_input_tokens_seen": 186058448, + "step": 86135 + }, + { + "epoch": 14.052202283849919, + "grad_norm": 0.003839249489828944, + "learning_rate": 0.0002460775080674872, + "loss": 0.0026, + "num_input_tokens_seen": 186068400, + "step": 86140 + }, + { + "epoch": 14.053017944535073, + "grad_norm": 0.07193329930305481, + "learning_rate": 0.0002460161928911432, + "loss": 0.0051, + "num_input_tokens_seen": 186080752, + "step": 86145 + }, + { + "epoch": 14.053833605220229, + "grad_norm": 0.011074123904109001, + "learning_rate": 0.0002459548828621028, + "loss": 0.0067, + "num_input_tokens_seen": 186092336, + "step": 86150 + }, + { + "epoch": 14.054649265905383, + "grad_norm": 0.002707752399146557, + "learning_rate": 0.00024589357798160925, + "loss": 0.0011, + "num_input_tokens_seen": 186103632, + "step": 86155 + }, + { + "epoch": 14.055464926590538, + "grad_norm": 0.0034250058233737946, + "learning_rate": 0.0002458322782509047, + "loss": 0.0016, + "num_input_tokens_seen": 186114256, + "step": 86160 + }, + { + "epoch": 14.056280587275694, + "grad_norm": 0.010328114964067936, + "learning_rate": 0.00024577098367123146, + "loss": 0.0024, + "num_input_tokens_seen": 186124816, + "step": 86165 + }, + { + "epoch": 14.057096247960848, + "grad_norm": 0.0008839413640089333, + "learning_rate": 0.00024570969424383174, + "loss": 0.0036, + "num_input_tokens_seen": 186136624, + "step": 86170 + }, + { + "epoch": 14.057911908646004, + "grad_norm": 0.00860717985779047, + "learning_rate": 0.00024564840996994764, + "loss": 0.0072, + "num_input_tokens_seen": 186146864, + "step": 86175 + }, + { + "epoch": 14.058727569331158, + "grad_norm": 0.0009140484617091715, + "learning_rate": 0.0002455871308508212, + "loss": 0.0044, + "num_input_tokens_seen": 186157904, + "step": 86180 + }, + { + "epoch": 14.059543230016313, + "grad_norm": 0.8887136578559875, + "learning_rate": 0.0002455258568876943, + "loss": 0.1385, + "num_input_tokens_seen": 186168464, + "step": 86185 + }, + { + "epoch": 14.060358890701469, + "grad_norm": 0.0013078611809760332, + "learning_rate": 0.0002454645880818087, + "loss": 0.002, + "num_input_tokens_seen": 186179568, + "step": 86190 + }, + { + "epoch": 14.061174551386623, + "grad_norm": 0.7345595955848694, + "learning_rate": 0.00024540332443440615, + "loss": 0.0126, + "num_input_tokens_seen": 186191152, + "step": 86195 + }, + { + "epoch": 14.061990212071779, + "grad_norm": 0.027177168056368828, + "learning_rate": 0.0002453420659467282, + "loss": 0.0056, + "num_input_tokens_seen": 186201776, + "step": 86200 + }, + { + "epoch": 14.062805872756933, + "grad_norm": 0.009244061075150967, + "learning_rate": 0.00024528081262001615, + "loss": 0.0008, + "num_input_tokens_seen": 186213392, + "step": 86205 + }, + { + "epoch": 14.063621533442088, + "grad_norm": 0.0036014586221426725, + "learning_rate": 0.000245219564455512, + "loss": 0.0021, + "num_input_tokens_seen": 186222448, + "step": 86210 + }, + { + "epoch": 14.064437194127244, + "grad_norm": 0.006802697200328112, + "learning_rate": 0.00024515832145445614, + "loss": 0.0043, + "num_input_tokens_seen": 186232624, + "step": 86215 + }, + { + "epoch": 14.065252854812398, + "grad_norm": 0.004956814460456371, + "learning_rate": 0.0002450970836180906, + "loss": 0.006, + "num_input_tokens_seen": 186242896, + "step": 86220 + }, + { + "epoch": 14.066068515497554, + "grad_norm": 0.051006123423576355, + "learning_rate": 0.0002450358509476556, + "loss": 0.0054, + "num_input_tokens_seen": 186253616, + "step": 86225 + }, + { + "epoch": 14.066884176182707, + "grad_norm": 0.08910132199525833, + "learning_rate": 0.00024497462344439297, + "loss": 0.0032, + "num_input_tokens_seen": 186264080, + "step": 86230 + }, + { + "epoch": 14.067699836867863, + "grad_norm": 0.002252694685012102, + "learning_rate": 0.0002449134011095427, + "loss": 0.0855, + "num_input_tokens_seen": 186275760, + "step": 86235 + }, + { + "epoch": 14.068515497553017, + "grad_norm": 0.0041665323078632355, + "learning_rate": 0.0002448521839443464, + "loss": 0.1032, + "num_input_tokens_seen": 186286864, + "step": 86240 + }, + { + "epoch": 14.069331158238173, + "grad_norm": 0.00127582682762295, + "learning_rate": 0.00024479097195004377, + "loss": 0.032, + "num_input_tokens_seen": 186296560, + "step": 86245 + }, + { + "epoch": 14.070146818923329, + "grad_norm": 0.0045285290107131, + "learning_rate": 0.0002447297651278763, + "loss": 0.0039, + "num_input_tokens_seen": 186307600, + "step": 86250 + }, + { + "epoch": 14.070962479608482, + "grad_norm": 0.0880713164806366, + "learning_rate": 0.0002446685634790836, + "loss": 0.0075, + "num_input_tokens_seen": 186318704, + "step": 86255 + }, + { + "epoch": 14.071778140293638, + "grad_norm": 0.0022388026118278503, + "learning_rate": 0.00024460736700490676, + "loss": 0.0101, + "num_input_tokens_seen": 186328016, + "step": 86260 + }, + { + "epoch": 14.072593800978792, + "grad_norm": 0.005308662075549364, + "learning_rate": 0.00024454617570658524, + "loss": 0.0015, + "num_input_tokens_seen": 186340048, + "step": 86265 + }, + { + "epoch": 14.073409461663948, + "grad_norm": 0.010273891501128674, + "learning_rate": 0.00024448498958535984, + "loss": 0.004, + "num_input_tokens_seen": 186351408, + "step": 86270 + }, + { + "epoch": 14.074225122349104, + "grad_norm": 0.0025942821521312, + "learning_rate": 0.00024442380864247, + "loss": 0.0075, + "num_input_tokens_seen": 186361904, + "step": 86275 + }, + { + "epoch": 14.075040783034257, + "grad_norm": 0.0011069714091718197, + "learning_rate": 0.00024436263287915623, + "loss": 0.0047, + "num_input_tokens_seen": 186372848, + "step": 86280 + }, + { + "epoch": 14.075856443719413, + "grad_norm": 0.001931919134221971, + "learning_rate": 0.00024430146229665754, + "loss": 0.0014, + "num_input_tokens_seen": 186384240, + "step": 86285 + }, + { + "epoch": 14.076672104404567, + "grad_norm": 0.018992312252521515, + "learning_rate": 0.0002442402968962146, + "loss": 0.0064, + "num_input_tokens_seen": 186394608, + "step": 86290 + }, + { + "epoch": 14.077487765089723, + "grad_norm": 0.017261963337659836, + "learning_rate": 0.00024417913667906604, + "loss": 0.0025, + "num_input_tokens_seen": 186404560, + "step": 86295 + }, + { + "epoch": 14.078303425774878, + "grad_norm": 0.007981205359101295, + "learning_rate": 0.00024411798164645205, + "loss": 0.0014, + "num_input_tokens_seen": 186415248, + "step": 86300 + }, + { + "epoch": 14.079119086460032, + "grad_norm": 0.4204510450363159, + "learning_rate": 0.00024405683179961176, + "loss": 0.0144, + "num_input_tokens_seen": 186426544, + "step": 86305 + }, + { + "epoch": 14.079934747145188, + "grad_norm": 0.012655510567128658, + "learning_rate": 0.00024399568713978444, + "loss": 0.0017, + "num_input_tokens_seen": 186437040, + "step": 86310 + }, + { + "epoch": 14.080750407830342, + "grad_norm": 0.00166032905690372, + "learning_rate": 0.00024393454766820927, + "loss": 0.0016, + "num_input_tokens_seen": 186448336, + "step": 86315 + }, + { + "epoch": 14.081566068515498, + "grad_norm": 0.11170172691345215, + "learning_rate": 0.00024387341338612535, + "loss": 0.0052, + "num_input_tokens_seen": 186459152, + "step": 86320 + }, + { + "epoch": 14.082381729200652, + "grad_norm": 0.0015688682906329632, + "learning_rate": 0.00024381228429477166, + "loss": 0.002, + "num_input_tokens_seen": 186471504, + "step": 86325 + }, + { + "epoch": 14.083197389885807, + "grad_norm": 0.003982523921877146, + "learning_rate": 0.00024375116039538697, + "loss": 0.1025, + "num_input_tokens_seen": 186481360, + "step": 86330 + }, + { + "epoch": 14.084013050570963, + "grad_norm": 0.0194878950715065, + "learning_rate": 0.0002436900416892101, + "loss": 0.0028, + "num_input_tokens_seen": 186493328, + "step": 86335 + }, + { + "epoch": 14.084828711256117, + "grad_norm": 0.33860138058662415, + "learning_rate": 0.00024362892817747972, + "loss": 0.0053, + "num_input_tokens_seen": 186504688, + "step": 86340 + }, + { + "epoch": 14.085644371941273, + "grad_norm": 0.0014220515731722116, + "learning_rate": 0.00024356781986143434, + "loss": 0.0162, + "num_input_tokens_seen": 186516144, + "step": 86345 + }, + { + "epoch": 14.086460032626427, + "grad_norm": 0.002930557122454047, + "learning_rate": 0.00024350671674231217, + "loss": 0.1392, + "num_input_tokens_seen": 186526288, + "step": 86350 + }, + { + "epoch": 14.087275693311582, + "grad_norm": 0.01115860790014267, + "learning_rate": 0.0002434456188213522, + "loss": 0.0016, + "num_input_tokens_seen": 186536432, + "step": 86355 + }, + { + "epoch": 14.088091353996738, + "grad_norm": 0.004121196456253529, + "learning_rate": 0.00024338452609979177, + "loss": 0.0095, + "num_input_tokens_seen": 186546064, + "step": 86360 + }, + { + "epoch": 14.088907014681892, + "grad_norm": 0.005770614370703697, + "learning_rate": 0.0002433234385788699, + "loss": 0.0037, + "num_input_tokens_seen": 186557872, + "step": 86365 + }, + { + "epoch": 14.089722675367048, + "grad_norm": 0.004038193728774786, + "learning_rate": 0.00024326235625982378, + "loss": 0.0027, + "num_input_tokens_seen": 186569136, + "step": 86370 + }, + { + "epoch": 14.090538336052202, + "grad_norm": 0.03139469772577286, + "learning_rate": 0.00024320127914389213, + "loss": 0.0035, + "num_input_tokens_seen": 186581232, + "step": 86375 + }, + { + "epoch": 14.091353996737357, + "grad_norm": 0.16970932483673096, + "learning_rate": 0.00024314020723231183, + "loss": 0.0117, + "num_input_tokens_seen": 186592144, + "step": 86380 + }, + { + "epoch": 14.092169657422513, + "grad_norm": 0.04780832678079605, + "learning_rate": 0.00024307914052632159, + "loss": 0.0031, + "num_input_tokens_seen": 186603504, + "step": 86385 + }, + { + "epoch": 14.092985318107667, + "grad_norm": 0.002771410159766674, + "learning_rate": 0.000243018079027158, + "loss": 0.0022, + "num_input_tokens_seen": 186615024, + "step": 86390 + }, + { + "epoch": 14.093800978792823, + "grad_norm": 0.029407048597931862, + "learning_rate": 0.0002429570227360595, + "loss": 0.0024, + "num_input_tokens_seen": 186624976, + "step": 86395 + }, + { + "epoch": 14.094616639477977, + "grad_norm": 0.0027717319317162037, + "learning_rate": 0.00024289597165426264, + "loss": 0.0026, + "num_input_tokens_seen": 186636752, + "step": 86400 + }, + { + "epoch": 14.095432300163132, + "grad_norm": 0.016943685710430145, + "learning_rate": 0.00024283492578300542, + "loss": 0.0805, + "num_input_tokens_seen": 186647632, + "step": 86405 + }, + { + "epoch": 14.096247960848286, + "grad_norm": 0.002924926346167922, + "learning_rate": 0.00024277388512352428, + "loss": 0.0033, + "num_input_tokens_seen": 186658896, + "step": 86410 + }, + { + "epoch": 14.097063621533442, + "grad_norm": 0.014270029030740261, + "learning_rate": 0.00024271284967705687, + "loss": 0.016, + "num_input_tokens_seen": 186671056, + "step": 86415 + }, + { + "epoch": 14.097879282218598, + "grad_norm": 0.0008768712286837399, + "learning_rate": 0.00024265181944483995, + "loss": 0.0006, + "num_input_tokens_seen": 186681872, + "step": 86420 + }, + { + "epoch": 14.098694942903752, + "grad_norm": 0.004642259329557419, + "learning_rate": 0.0002425907944281104, + "loss": 0.0129, + "num_input_tokens_seen": 186692880, + "step": 86425 + }, + { + "epoch": 14.099510603588907, + "grad_norm": 0.017092658206820488, + "learning_rate": 0.00024252977462810494, + "loss": 0.0037, + "num_input_tokens_seen": 186702704, + "step": 86430 + }, + { + "epoch": 14.100326264274061, + "grad_norm": 0.0033077350817620754, + "learning_rate": 0.0002424687600460602, + "loss": 0.0097, + "num_input_tokens_seen": 186712944, + "step": 86435 + }, + { + "epoch": 14.101141924959217, + "grad_norm": 0.0017457004869356751, + "learning_rate": 0.00024240775068321273, + "loss": 0.0038, + "num_input_tokens_seen": 186723408, + "step": 86440 + }, + { + "epoch": 14.101957585644373, + "grad_norm": 0.0013185646384954453, + "learning_rate": 0.00024234674654079901, + "loss": 0.0053, + "num_input_tokens_seen": 186735312, + "step": 86445 + }, + { + "epoch": 14.102773246329527, + "grad_norm": 0.0159031692892313, + "learning_rate": 0.00024228574762005534, + "loss": 0.0064, + "num_input_tokens_seen": 186746544, + "step": 86450 + }, + { + "epoch": 14.103588907014682, + "grad_norm": 0.03777594491839409, + "learning_rate": 0.00024222475392221787, + "loss": 0.0022, + "num_input_tokens_seen": 186757712, + "step": 86455 + }, + { + "epoch": 14.104404567699836, + "grad_norm": 0.015740415081381798, + "learning_rate": 0.0002421637654485228, + "loss": 0.002, + "num_input_tokens_seen": 186768144, + "step": 86460 + }, + { + "epoch": 14.105220228384992, + "grad_norm": 2.156970977783203, + "learning_rate": 0.00024210278220020614, + "loss": 0.0342, + "num_input_tokens_seen": 186778096, + "step": 86465 + }, + { + "epoch": 14.106035889070148, + "grad_norm": 0.005683009047061205, + "learning_rate": 0.00024204180417850373, + "loss": 0.0037, + "num_input_tokens_seen": 186789008, + "step": 86470 + }, + { + "epoch": 14.106851549755302, + "grad_norm": 0.07511241734027863, + "learning_rate": 0.00024198083138465143, + "loss": 0.005, + "num_input_tokens_seen": 186799280, + "step": 86475 + }, + { + "epoch": 14.107667210440457, + "grad_norm": 0.05983182042837143, + "learning_rate": 0.0002419198638198849, + "loss": 0.0105, + "num_input_tokens_seen": 186809808, + "step": 86480 + }, + { + "epoch": 14.108482871125611, + "grad_norm": 0.011176004074513912, + "learning_rate": 0.0002418589014854397, + "loss": 0.0046, + "num_input_tokens_seen": 186821104, + "step": 86485 + }, + { + "epoch": 14.109298531810767, + "grad_norm": 0.024094609543681145, + "learning_rate": 0.00024179794438255133, + "loss": 0.0024, + "num_input_tokens_seen": 186830864, + "step": 86490 + }, + { + "epoch": 14.11011419249592, + "grad_norm": 0.020474612712860107, + "learning_rate": 0.000241736992512455, + "loss": 0.0016, + "num_input_tokens_seen": 186841936, + "step": 86495 + }, + { + "epoch": 14.110929853181077, + "grad_norm": 0.0037670359015464783, + "learning_rate": 0.00024167604587638653, + "loss": 0.0012, + "num_input_tokens_seen": 186852944, + "step": 86500 + }, + { + "epoch": 14.111745513866232, + "grad_norm": 0.020464560016989708, + "learning_rate": 0.00024161510447558032, + "loss": 0.0025, + "num_input_tokens_seen": 186863152, + "step": 86505 + }, + { + "epoch": 14.112561174551386, + "grad_norm": 0.030841641128063202, + "learning_rate": 0.0002415541683112722, + "loss": 0.0022, + "num_input_tokens_seen": 186873904, + "step": 86510 + }, + { + "epoch": 14.113376835236542, + "grad_norm": 0.0019353614188730717, + "learning_rate": 0.0002414932373846963, + "loss": 0.0033, + "num_input_tokens_seen": 186884592, + "step": 86515 + }, + { + "epoch": 14.114192495921696, + "grad_norm": 0.0009746397845447063, + "learning_rate": 0.00024143231169708806, + "loss": 0.0008, + "num_input_tokens_seen": 186895536, + "step": 86520 + }, + { + "epoch": 14.115008156606851, + "grad_norm": 0.01233255211263895, + "learning_rate": 0.0002413713912496821, + "loss": 0.0145, + "num_input_tokens_seen": 186906448, + "step": 86525 + }, + { + "epoch": 14.115823817292007, + "grad_norm": 0.005642162170261145, + "learning_rate": 0.00024131047604371292, + "loss": 0.0019, + "num_input_tokens_seen": 186917328, + "step": 86530 + }, + { + "epoch": 14.116639477977161, + "grad_norm": 0.012053254060447216, + "learning_rate": 0.0002412495660804152, + "loss": 0.0545, + "num_input_tokens_seen": 186928336, + "step": 86535 + }, + { + "epoch": 14.117455138662317, + "grad_norm": 0.01833995431661606, + "learning_rate": 0.0002411886613610232, + "loss": 0.0031, + "num_input_tokens_seen": 186938896, + "step": 86540 + }, + { + "epoch": 14.11827079934747, + "grad_norm": 0.007797705475240946, + "learning_rate": 0.00024112776188677133, + "loss": 0.0014, + "num_input_tokens_seen": 186948688, + "step": 86545 + }, + { + "epoch": 14.119086460032626, + "grad_norm": 0.01950424164533615, + "learning_rate": 0.0002410668676588938, + "loss": 0.0015, + "num_input_tokens_seen": 186958928, + "step": 86550 + }, + { + "epoch": 14.119902120717782, + "grad_norm": 0.6473231315612793, + "learning_rate": 0.0002410059786786246, + "loss": 0.0235, + "num_input_tokens_seen": 186969744, + "step": 86555 + }, + { + "epoch": 14.120717781402936, + "grad_norm": 0.07638765871524811, + "learning_rate": 0.00024094509494719784, + "loss": 0.0198, + "num_input_tokens_seen": 186980528, + "step": 86560 + }, + { + "epoch": 14.121533442088092, + "grad_norm": 0.03419485315680504, + "learning_rate": 0.0002408842164658474, + "loss": 0.0053, + "num_input_tokens_seen": 186991280, + "step": 86565 + }, + { + "epoch": 14.122349102773246, + "grad_norm": 0.0056074392050504684, + "learning_rate": 0.00024082334323580695, + "loss": 0.0012, + "num_input_tokens_seen": 187002448, + "step": 86570 + }, + { + "epoch": 14.123164763458401, + "grad_norm": 0.043900150805711746, + "learning_rate": 0.0002407624752583103, + "loss": 0.0037, + "num_input_tokens_seen": 187013552, + "step": 86575 + }, + { + "epoch": 14.123980424143557, + "grad_norm": 0.006590542383491993, + "learning_rate": 0.00024070161253459093, + "loss": 0.0052, + "num_input_tokens_seen": 187023984, + "step": 86580 + }, + { + "epoch": 14.124796084828711, + "grad_norm": 0.0019354376709088683, + "learning_rate": 0.00024064075506588235, + "loss": 0.0567, + "num_input_tokens_seen": 187036496, + "step": 86585 + }, + { + "epoch": 14.125611745513867, + "grad_norm": 0.013171792961657047, + "learning_rate": 0.00024057990285341786, + "loss": 0.0026, + "num_input_tokens_seen": 187047056, + "step": 86590 + }, + { + "epoch": 14.12642740619902, + "grad_norm": 0.06522294878959656, + "learning_rate": 0.00024051905589843076, + "loss": 0.0573, + "num_input_tokens_seen": 187057712, + "step": 86595 + }, + { + "epoch": 14.127243066884176, + "grad_norm": 0.0005969098419882357, + "learning_rate": 0.00024045821420215412, + "loss": 0.0051, + "num_input_tokens_seen": 187069296, + "step": 86600 + }, + { + "epoch": 14.12805872756933, + "grad_norm": 0.6809018850326538, + "learning_rate": 0.0002403973777658211, + "loss": 0.0124, + "num_input_tokens_seen": 187080368, + "step": 86605 + }, + { + "epoch": 14.128874388254486, + "grad_norm": 0.0005238762823864818, + "learning_rate": 0.0002403365465906645, + "loss": 0.003, + "num_input_tokens_seen": 187091728, + "step": 86610 + }, + { + "epoch": 14.129690048939642, + "grad_norm": 0.0008300320478156209, + "learning_rate": 0.0002402757206779172, + "loss": 0.0009, + "num_input_tokens_seen": 187101584, + "step": 86615 + }, + { + "epoch": 14.130505709624796, + "grad_norm": 0.006851747632026672, + "learning_rate": 0.00024021490002881186, + "loss": 0.0029, + "num_input_tokens_seen": 187112112, + "step": 86620 + }, + { + "epoch": 14.131321370309951, + "grad_norm": 0.004974581766873598, + "learning_rate": 0.000240154084644581, + "loss": 0.0284, + "num_input_tokens_seen": 187123440, + "step": 86625 + }, + { + "epoch": 14.132137030995105, + "grad_norm": 0.5447921752929688, + "learning_rate": 0.0002400932745264574, + "loss": 0.0427, + "num_input_tokens_seen": 187134416, + "step": 86630 + }, + { + "epoch": 14.132952691680261, + "grad_norm": 0.02428017184138298, + "learning_rate": 0.00024003246967567332, + "loss": 0.0018, + "num_input_tokens_seen": 187146128, + "step": 86635 + }, + { + "epoch": 14.133768352365417, + "grad_norm": 0.001342527917586267, + "learning_rate": 0.00023997167009346104, + "loss": 0.0012, + "num_input_tokens_seen": 187156624, + "step": 86640 + }, + { + "epoch": 14.13458401305057, + "grad_norm": 0.03645501285791397, + "learning_rate": 0.00023991087578105274, + "loss": 0.0031, + "num_input_tokens_seen": 187167600, + "step": 86645 + }, + { + "epoch": 14.135399673735726, + "grad_norm": 0.00650134077295661, + "learning_rate": 0.00023985008673968052, + "loss": 0.0191, + "num_input_tokens_seen": 187179248, + "step": 86650 + }, + { + "epoch": 14.13621533442088, + "grad_norm": 0.0668526217341423, + "learning_rate": 0.00023978930297057627, + "loss": 0.0066, + "num_input_tokens_seen": 187189872, + "step": 86655 + }, + { + "epoch": 14.137030995106036, + "grad_norm": 0.05055573210120201, + "learning_rate": 0.0002397285244749719, + "loss": 0.0036, + "num_input_tokens_seen": 187201616, + "step": 86660 + }, + { + "epoch": 14.137846655791192, + "grad_norm": 0.008112654089927673, + "learning_rate": 0.00023966775125409918, + "loss": 0.0029, + "num_input_tokens_seen": 187211856, + "step": 86665 + }, + { + "epoch": 14.138662316476346, + "grad_norm": 0.0011952114291489124, + "learning_rate": 0.00023960698330918972, + "loss": 0.002, + "num_input_tokens_seen": 187222864, + "step": 86670 + }, + { + "epoch": 14.139477977161501, + "grad_norm": 0.0007925685495138168, + "learning_rate": 0.00023954622064147507, + "loss": 0.0017, + "num_input_tokens_seen": 187232912, + "step": 86675 + }, + { + "epoch": 14.140293637846655, + "grad_norm": 0.05681190267205238, + "learning_rate": 0.00023948546325218667, + "loss": 0.0032, + "num_input_tokens_seen": 187243984, + "step": 86680 + }, + { + "epoch": 14.141109298531811, + "grad_norm": 0.036385323852300644, + "learning_rate": 0.00023942471114255588, + "loss": 0.0228, + "num_input_tokens_seen": 187255280, + "step": 86685 + }, + { + "epoch": 14.141924959216965, + "grad_norm": 0.0023627562914043665, + "learning_rate": 0.00023936396431381386, + "loss": 0.0032, + "num_input_tokens_seen": 187265488, + "step": 86690 + }, + { + "epoch": 14.14274061990212, + "grad_norm": 0.1351795792579651, + "learning_rate": 0.00023930322276719175, + "loss": 0.0041, + "num_input_tokens_seen": 187277872, + "step": 86695 + }, + { + "epoch": 14.143556280587276, + "grad_norm": 0.011718549765646458, + "learning_rate": 0.0002392424865039205, + "loss": 0.0008, + "num_input_tokens_seen": 187288144, + "step": 86700 + }, + { + "epoch": 14.14437194127243, + "grad_norm": 0.001978749642148614, + "learning_rate": 0.0002391817555252311, + "loss": 0.0029, + "num_input_tokens_seen": 187297488, + "step": 86705 + }, + { + "epoch": 14.145187601957586, + "grad_norm": 0.03943789377808571, + "learning_rate": 0.0002391210298323543, + "loss": 0.012, + "num_input_tokens_seen": 187309904, + "step": 86710 + }, + { + "epoch": 14.14600326264274, + "grad_norm": 0.0004417496966198087, + "learning_rate": 0.00023906030942652073, + "loss": 0.0018, + "num_input_tokens_seen": 187321136, + "step": 86715 + }, + { + "epoch": 14.146818923327896, + "grad_norm": 0.010585743933916092, + "learning_rate": 0.00023899959430896106, + "loss": 0.2156, + "num_input_tokens_seen": 187331440, + "step": 86720 + }, + { + "epoch": 14.147634584013051, + "grad_norm": 0.006574005354195833, + "learning_rate": 0.00023893888448090573, + "loss": 0.0024, + "num_input_tokens_seen": 187342352, + "step": 86725 + }, + { + "epoch": 14.148450244698205, + "grad_norm": 0.004500469658523798, + "learning_rate": 0.00023887817994358484, + "loss": 0.0024, + "num_input_tokens_seen": 187352496, + "step": 86730 + }, + { + "epoch": 14.149265905383361, + "grad_norm": 0.00272945174947381, + "learning_rate": 0.0002388174806982293, + "loss": 0.0015, + "num_input_tokens_seen": 187362480, + "step": 86735 + }, + { + "epoch": 14.150081566068515, + "grad_norm": 0.004126972518861294, + "learning_rate": 0.00023875678674606848, + "loss": 0.0034, + "num_input_tokens_seen": 187374064, + "step": 86740 + }, + { + "epoch": 14.15089722675367, + "grad_norm": 0.030439136549830437, + "learning_rate": 0.00023869609808833316, + "loss": 0.0111, + "num_input_tokens_seen": 187384176, + "step": 86745 + }, + { + "epoch": 14.151712887438826, + "grad_norm": 0.009578816592693329, + "learning_rate": 0.0002386354147262525, + "loss": 0.0014, + "num_input_tokens_seen": 187395568, + "step": 86750 + }, + { + "epoch": 14.15252854812398, + "grad_norm": 0.004324750974774361, + "learning_rate": 0.0002385747366610571, + "loss": 0.0011, + "num_input_tokens_seen": 187407408, + "step": 86755 + }, + { + "epoch": 14.153344208809136, + "grad_norm": 0.0028149730060249567, + "learning_rate": 0.00023851406389397594, + "loss": 0.003, + "num_input_tokens_seen": 187418032, + "step": 86760 + }, + { + "epoch": 14.15415986949429, + "grad_norm": 0.003918309696018696, + "learning_rate": 0.00023845339642623937, + "loss": 0.0042, + "num_input_tokens_seen": 187428464, + "step": 86765 + }, + { + "epoch": 14.154975530179446, + "grad_norm": 0.004220111761242151, + "learning_rate": 0.00023839273425907615, + "loss": 0.0045, + "num_input_tokens_seen": 187438256, + "step": 86770 + }, + { + "epoch": 14.1557911908646, + "grad_norm": 0.0007073664455674589, + "learning_rate": 0.0002383320773937162, + "loss": 0.0956, + "num_input_tokens_seen": 187448496, + "step": 86775 + }, + { + "epoch": 14.156606851549755, + "grad_norm": 0.0012666697148233652, + "learning_rate": 0.00023827142583138873, + "loss": 0.0027, + "num_input_tokens_seen": 187459792, + "step": 86780 + }, + { + "epoch": 14.15742251223491, + "grad_norm": 0.0030706448014825583, + "learning_rate": 0.00023821077957332276, + "loss": 0.002, + "num_input_tokens_seen": 187471984, + "step": 86785 + }, + { + "epoch": 14.158238172920065, + "grad_norm": 0.0018182602943852544, + "learning_rate": 0.00023815013862074746, + "loss": 0.004, + "num_input_tokens_seen": 187482960, + "step": 86790 + }, + { + "epoch": 14.15905383360522, + "grad_norm": 0.006480704993009567, + "learning_rate": 0.0002380895029748918, + "loss": 0.0023, + "num_input_tokens_seen": 187493680, + "step": 86795 + }, + { + "epoch": 14.159869494290374, + "grad_norm": 0.005142877344042063, + "learning_rate": 0.00023802887263698464, + "loss": 0.0995, + "num_input_tokens_seen": 187505488, + "step": 86800 + }, + { + "epoch": 14.16068515497553, + "grad_norm": 0.025212204083800316, + "learning_rate": 0.00023796824760825464, + "loss": 0.0031, + "num_input_tokens_seen": 187516336, + "step": 86805 + }, + { + "epoch": 14.161500815660686, + "grad_norm": 0.004730370827019215, + "learning_rate": 0.0002379076278899306, + "loss": 0.0015, + "num_input_tokens_seen": 187526576, + "step": 86810 + }, + { + "epoch": 14.16231647634584, + "grad_norm": 0.008115909993648529, + "learning_rate": 0.0002378470134832409, + "loss": 0.0023, + "num_input_tokens_seen": 187537936, + "step": 86815 + }, + { + "epoch": 14.163132137030995, + "grad_norm": 0.019025664776563644, + "learning_rate": 0.00023778640438941408, + "loss": 0.0023, + "num_input_tokens_seen": 187547792, + "step": 86820 + }, + { + "epoch": 14.16394779771615, + "grad_norm": 0.04850027337670326, + "learning_rate": 0.00023772580060967834, + "loss": 0.0032, + "num_input_tokens_seen": 187559024, + "step": 86825 + }, + { + "epoch": 14.164763458401305, + "grad_norm": 0.0032715355046093464, + "learning_rate": 0.00023766520214526206, + "loss": 0.005, + "num_input_tokens_seen": 187570288, + "step": 86830 + }, + { + "epoch": 14.16557911908646, + "grad_norm": 0.03740094229578972, + "learning_rate": 0.00023760460899739322, + "loss": 0.0042, + "num_input_tokens_seen": 187580624, + "step": 86835 + }, + { + "epoch": 14.166394779771615, + "grad_norm": 0.0004254610976204276, + "learning_rate": 0.00023754402116729983, + "loss": 0.0035, + "num_input_tokens_seen": 187591248, + "step": 86840 + }, + { + "epoch": 14.16721044045677, + "grad_norm": 0.003979322500526905, + "learning_rate": 0.00023748343865620964, + "loss": 0.0847, + "num_input_tokens_seen": 187601136, + "step": 86845 + }, + { + "epoch": 14.168026101141924, + "grad_norm": 0.0024420591071248055, + "learning_rate": 0.00023742286146535098, + "loss": 0.0011, + "num_input_tokens_seen": 187611472, + "step": 86850 + }, + { + "epoch": 14.16884176182708, + "grad_norm": 0.027474381029605865, + "learning_rate": 0.00023736228959595073, + "loss": 0.0055, + "num_input_tokens_seen": 187621488, + "step": 86855 + }, + { + "epoch": 14.169657422512234, + "grad_norm": 0.0005245811189524829, + "learning_rate": 0.00023730172304923725, + "loss": 0.0667, + "num_input_tokens_seen": 187631696, + "step": 86860 + }, + { + "epoch": 14.17047308319739, + "grad_norm": 0.002572552999481559, + "learning_rate": 0.00023724116182643725, + "loss": 0.009, + "num_input_tokens_seen": 187642768, + "step": 86865 + }, + { + "epoch": 14.171288743882545, + "grad_norm": 0.5119752287864685, + "learning_rate": 0.00023718060592877878, + "loss": 0.046, + "num_input_tokens_seen": 187653488, + "step": 86870 + }, + { + "epoch": 14.1721044045677, + "grad_norm": 0.0018368182936683297, + "learning_rate": 0.00023712005535748838, + "loss": 0.0024, + "num_input_tokens_seen": 187663632, + "step": 86875 + }, + { + "epoch": 14.172920065252855, + "grad_norm": 0.0015098400181159377, + "learning_rate": 0.0002370595101137939, + "loss": 0.0063, + "num_input_tokens_seen": 187672912, + "step": 86880 + }, + { + "epoch": 14.173735725938009, + "grad_norm": 0.014908955432474613, + "learning_rate": 0.00023699897019892165, + "loss": 0.001, + "num_input_tokens_seen": 187683536, + "step": 86885 + }, + { + "epoch": 14.174551386623165, + "grad_norm": 0.008537651039659977, + "learning_rate": 0.00023693843561409928, + "loss": 0.0108, + "num_input_tokens_seen": 187695280, + "step": 86890 + }, + { + "epoch": 14.17536704730832, + "grad_norm": 0.0010102560045197606, + "learning_rate": 0.0002368779063605529, + "loss": 0.001, + "num_input_tokens_seen": 187705744, + "step": 86895 + }, + { + "epoch": 14.176182707993474, + "grad_norm": 0.17667905986309052, + "learning_rate": 0.00023681738243950984, + "loss": 0.0188, + "num_input_tokens_seen": 187716976, + "step": 86900 + }, + { + "epoch": 14.17699836867863, + "grad_norm": 0.5679713487625122, + "learning_rate": 0.00023675686385219607, + "loss": 0.0884, + "num_input_tokens_seen": 187728208, + "step": 86905 + }, + { + "epoch": 14.177814029363784, + "grad_norm": 0.004683853592723608, + "learning_rate": 0.0002366963505998388, + "loss": 0.0113, + "num_input_tokens_seen": 187738608, + "step": 86910 + }, + { + "epoch": 14.17862969004894, + "grad_norm": 0.02120167389512062, + "learning_rate": 0.00023663584268366356, + "loss": 0.0052, + "num_input_tokens_seen": 187750032, + "step": 86915 + }, + { + "epoch": 14.179445350734095, + "grad_norm": 0.010204663500189781, + "learning_rate": 0.00023657534010489733, + "loss": 0.0141, + "num_input_tokens_seen": 187760272, + "step": 86920 + }, + { + "epoch": 14.18026101141925, + "grad_norm": 0.0016371725359931588, + "learning_rate": 0.000236514842864766, + "loss": 0.0654, + "num_input_tokens_seen": 187770832, + "step": 86925 + }, + { + "epoch": 14.181076672104405, + "grad_norm": 0.0017514110077172518, + "learning_rate": 0.00023645435096449557, + "loss": 0.0035, + "num_input_tokens_seen": 187782192, + "step": 86930 + }, + { + "epoch": 14.181892332789559, + "grad_norm": 0.0801762267947197, + "learning_rate": 0.00023639386440531208, + "loss": 0.0106, + "num_input_tokens_seen": 187793456, + "step": 86935 + }, + { + "epoch": 14.182707993474715, + "grad_norm": 0.5289106369018555, + "learning_rate": 0.00023633338318844137, + "loss": 0.1003, + "num_input_tokens_seen": 187804976, + "step": 86940 + }, + { + "epoch": 14.18352365415987, + "grad_norm": 0.006982157472521067, + "learning_rate": 0.00023627290731510908, + "loss": 0.0869, + "num_input_tokens_seen": 187815120, + "step": 86945 + }, + { + "epoch": 14.184339314845024, + "grad_norm": 0.010652474127709866, + "learning_rate": 0.00023621243678654099, + "loss": 0.0167, + "num_input_tokens_seen": 187825776, + "step": 86950 + }, + { + "epoch": 14.18515497553018, + "grad_norm": 0.16883718967437744, + "learning_rate": 0.0002361519716039624, + "loss": 0.0064, + "num_input_tokens_seen": 187835920, + "step": 86955 + }, + { + "epoch": 14.185970636215334, + "grad_norm": 0.024424118921160698, + "learning_rate": 0.00023609151176859884, + "loss": 0.0013, + "num_input_tokens_seen": 187846832, + "step": 86960 + }, + { + "epoch": 14.18678629690049, + "grad_norm": 0.09532174468040466, + "learning_rate": 0.00023603105728167562, + "loss": 0.0077, + "num_input_tokens_seen": 187856624, + "step": 86965 + }, + { + "epoch": 14.187601957585644, + "grad_norm": 0.026332538574934006, + "learning_rate": 0.00023597060814441767, + "loss": 0.1008, + "num_input_tokens_seen": 187867120, + "step": 86970 + }, + { + "epoch": 14.1884176182708, + "grad_norm": 0.0035820265766233206, + "learning_rate": 0.00023591016435805067, + "loss": 0.0036, + "num_input_tokens_seen": 187877040, + "step": 86975 + }, + { + "epoch": 14.189233278955955, + "grad_norm": 0.0029624279122799635, + "learning_rate": 0.00023584972592379888, + "loss": 0.0022, + "num_input_tokens_seen": 187888304, + "step": 86980 + }, + { + "epoch": 14.190048939641109, + "grad_norm": 0.03424012288451195, + "learning_rate": 0.0002357892928428878, + "loss": 0.0189, + "num_input_tokens_seen": 187897680, + "step": 86985 + }, + { + "epoch": 14.190864600326265, + "grad_norm": 0.0046348837204277515, + "learning_rate": 0.00023572886511654157, + "loss": 0.002, + "num_input_tokens_seen": 187908784, + "step": 86990 + }, + { + "epoch": 14.191680261011419, + "grad_norm": 0.0020335959270596504, + "learning_rate": 0.00023566844274598548, + "loss": 0.0019, + "num_input_tokens_seen": 187919920, + "step": 86995 + }, + { + "epoch": 14.192495921696574, + "grad_norm": 0.06452670693397522, + "learning_rate": 0.00023560802573244333, + "loss": 0.0028, + "num_input_tokens_seen": 187931408, + "step": 87000 + }, + { + "epoch": 14.19331158238173, + "grad_norm": 0.006201412994414568, + "learning_rate": 0.00023554761407714036, + "loss": 0.0519, + "num_input_tokens_seen": 187940816, + "step": 87005 + }, + { + "epoch": 14.194127243066884, + "grad_norm": 0.027852863073349, + "learning_rate": 0.00023548720778130005, + "loss": 0.0053, + "num_input_tokens_seen": 187951664, + "step": 87010 + }, + { + "epoch": 14.19494290375204, + "grad_norm": 0.09883508831262589, + "learning_rate": 0.0002354268068461475, + "loss": 0.018, + "num_input_tokens_seen": 187962768, + "step": 87015 + }, + { + "epoch": 14.195758564437194, + "grad_norm": 0.008603896014392376, + "learning_rate": 0.00023536641127290588, + "loss": 0.0037, + "num_input_tokens_seen": 187972624, + "step": 87020 + }, + { + "epoch": 14.19657422512235, + "grad_norm": 0.0015382606070488691, + "learning_rate": 0.00023530602106280004, + "loss": 0.2456, + "num_input_tokens_seen": 187984592, + "step": 87025 + }, + { + "epoch": 14.197389885807505, + "grad_norm": 0.21547020971775055, + "learning_rate": 0.00023524563621705308, + "loss": 0.0141, + "num_input_tokens_seen": 187996688, + "step": 87030 + }, + { + "epoch": 14.198205546492659, + "grad_norm": 0.018739258870482445, + "learning_rate": 0.00023518525673688957, + "loss": 0.003, + "num_input_tokens_seen": 188007600, + "step": 87035 + }, + { + "epoch": 14.199021207177815, + "grad_norm": 0.3604316711425781, + "learning_rate": 0.0002351248826235324, + "loss": 0.0733, + "num_input_tokens_seen": 188018288, + "step": 87040 + }, + { + "epoch": 14.199836867862969, + "grad_norm": 0.0037503130733966827, + "learning_rate": 0.00023506451387820588, + "loss": 0.0786, + "num_input_tokens_seen": 188029456, + "step": 87045 + }, + { + "epoch": 14.200652528548124, + "grad_norm": 0.015447917394340038, + "learning_rate": 0.0002350041505021327, + "loss": 0.0033, + "num_input_tokens_seen": 188042864, + "step": 87050 + }, + { + "epoch": 14.201468189233278, + "grad_norm": 0.04216151684522629, + "learning_rate": 0.00023494379249653675, + "loss": 0.0032, + "num_input_tokens_seen": 188053904, + "step": 87055 + }, + { + "epoch": 14.202283849918434, + "grad_norm": 0.13910488784313202, + "learning_rate": 0.0002348834398626411, + "loss": 0.0084, + "num_input_tokens_seen": 188063504, + "step": 87060 + }, + { + "epoch": 14.20309951060359, + "grad_norm": 0.0654771700501442, + "learning_rate": 0.0002348230926016689, + "loss": 0.0045, + "num_input_tokens_seen": 188073712, + "step": 87065 + }, + { + "epoch": 14.203915171288743, + "grad_norm": 0.011523857712745667, + "learning_rate": 0.00023476275071484309, + "loss": 0.0073, + "num_input_tokens_seen": 188083888, + "step": 87070 + }, + { + "epoch": 14.2047308319739, + "grad_norm": 0.003080186201259494, + "learning_rate": 0.0002347024142033866, + "loss": 0.0123, + "num_input_tokens_seen": 188094640, + "step": 87075 + }, + { + "epoch": 14.205546492659053, + "grad_norm": 0.5070033669471741, + "learning_rate": 0.0002346420830685223, + "loss": 0.1119, + "num_input_tokens_seen": 188106256, + "step": 87080 + }, + { + "epoch": 14.206362153344209, + "grad_norm": 0.008827326819300652, + "learning_rate": 0.0002345817573114728, + "loss": 0.0108, + "num_input_tokens_seen": 188115536, + "step": 87085 + }, + { + "epoch": 14.207177814029365, + "grad_norm": 0.003627562429755926, + "learning_rate": 0.00023452143693346067, + "loss": 0.0109, + "num_input_tokens_seen": 188126800, + "step": 87090 + }, + { + "epoch": 14.207993474714518, + "grad_norm": 0.007856626994907856, + "learning_rate": 0.0002344611219357084, + "loss": 0.05, + "num_input_tokens_seen": 188137680, + "step": 87095 + }, + { + "epoch": 14.208809135399674, + "grad_norm": 0.10289426147937775, + "learning_rate": 0.0002344008123194384, + "loss": 0.0075, + "num_input_tokens_seen": 188149200, + "step": 87100 + }, + { + "epoch": 14.209624796084828, + "grad_norm": 0.00227095908485353, + "learning_rate": 0.0002343405080858728, + "loss": 0.0009, + "num_input_tokens_seen": 188160080, + "step": 87105 + }, + { + "epoch": 14.210440456769984, + "grad_norm": 0.11845573782920837, + "learning_rate": 0.00023428020923623382, + "loss": 0.0087, + "num_input_tokens_seen": 188171856, + "step": 87110 + }, + { + "epoch": 14.21125611745514, + "grad_norm": 0.0016365720657631755, + "learning_rate": 0.0002342199157717434, + "loss": 0.0095, + "num_input_tokens_seen": 188181936, + "step": 87115 + }, + { + "epoch": 14.212071778140293, + "grad_norm": 0.0020826237741857767, + "learning_rate": 0.00023415962769362386, + "loss": 0.0029, + "num_input_tokens_seen": 188193040, + "step": 87120 + }, + { + "epoch": 14.21288743882545, + "grad_norm": 0.023198019713163376, + "learning_rate": 0.00023409934500309633, + "loss": 0.0093, + "num_input_tokens_seen": 188202864, + "step": 87125 + }, + { + "epoch": 14.213703099510603, + "grad_norm": 0.010550117120146751, + "learning_rate": 0.00023403906770138328, + "loss": 0.0084, + "num_input_tokens_seen": 188214032, + "step": 87130 + }, + { + "epoch": 14.214518760195759, + "grad_norm": 0.005151396617293358, + "learning_rate": 0.00023397879578970554, + "loss": 0.008, + "num_input_tokens_seen": 188224784, + "step": 87135 + }, + { + "epoch": 14.215334420880913, + "grad_norm": 0.010745275765657425, + "learning_rate": 0.00023391852926928536, + "loss": 0.0078, + "num_input_tokens_seen": 188235152, + "step": 87140 + }, + { + "epoch": 14.216150081566068, + "grad_norm": 0.004303690977394581, + "learning_rate": 0.0002338582681413433, + "loss": 0.0568, + "num_input_tokens_seen": 188246288, + "step": 87145 + }, + { + "epoch": 14.216965742251224, + "grad_norm": 0.008361046202480793, + "learning_rate": 0.0002337980124071015, + "loss": 0.0046, + "num_input_tokens_seen": 188258160, + "step": 87150 + }, + { + "epoch": 14.217781402936378, + "grad_norm": 0.0012672094162553549, + "learning_rate": 0.0002337377620677803, + "loss": 0.0008, + "num_input_tokens_seen": 188268656, + "step": 87155 + }, + { + "epoch": 14.218597063621534, + "grad_norm": 0.004940166603773832, + "learning_rate": 0.00023367751712460134, + "loss": 0.0013, + "num_input_tokens_seen": 188279664, + "step": 87160 + }, + { + "epoch": 14.219412724306688, + "grad_norm": 0.016097739338874817, + "learning_rate": 0.00023361727757878527, + "loss": 0.0072, + "num_input_tokens_seen": 188290512, + "step": 87165 + }, + { + "epoch": 14.220228384991843, + "grad_norm": 0.009004230611026287, + "learning_rate": 0.00023355704343155305, + "loss": 0.0047, + "num_input_tokens_seen": 188300848, + "step": 87170 + }, + { + "epoch": 14.221044045676999, + "grad_norm": 0.014903482049703598, + "learning_rate": 0.00023349681468412537, + "loss": 0.0047, + "num_input_tokens_seen": 188310832, + "step": 87175 + }, + { + "epoch": 14.221859706362153, + "grad_norm": 0.030258629471063614, + "learning_rate": 0.00023343659133772277, + "loss": 0.0055, + "num_input_tokens_seen": 188322704, + "step": 87180 + }, + { + "epoch": 14.222675367047309, + "grad_norm": 0.40277299284935, + "learning_rate": 0.0002333763733935659, + "loss": 0.0047, + "num_input_tokens_seen": 188332656, + "step": 87185 + }, + { + "epoch": 14.223491027732463, + "grad_norm": 0.001041868468746543, + "learning_rate": 0.00023331616085287492, + "loss": 0.002, + "num_input_tokens_seen": 188343408, + "step": 87190 + }, + { + "epoch": 14.224306688417618, + "grad_norm": 0.03579118847846985, + "learning_rate": 0.00023325595371687037, + "loss": 0.0271, + "num_input_tokens_seen": 188354736, + "step": 87195 + }, + { + "epoch": 14.225122349102774, + "grad_norm": 0.006182889919728041, + "learning_rate": 0.00023319575198677223, + "loss": 0.0046, + "num_input_tokens_seen": 188364944, + "step": 87200 + }, + { + "epoch": 14.225938009787928, + "grad_norm": 0.0006182396900840104, + "learning_rate": 0.00023313555566380068, + "loss": 0.0076, + "num_input_tokens_seen": 188374928, + "step": 87205 + }, + { + "epoch": 14.226753670473084, + "grad_norm": 0.006085644010454416, + "learning_rate": 0.00023307536474917567, + "loss": 0.0022, + "num_input_tokens_seen": 188384816, + "step": 87210 + }, + { + "epoch": 14.227569331158238, + "grad_norm": 0.052729446440935135, + "learning_rate": 0.00023301517924411696, + "loss": 0.0035, + "num_input_tokens_seen": 188394128, + "step": 87215 + }, + { + "epoch": 14.228384991843393, + "grad_norm": 0.003131187055259943, + "learning_rate": 0.00023295499914984436, + "loss": 0.002, + "num_input_tokens_seen": 188405552, + "step": 87220 + }, + { + "epoch": 14.229200652528547, + "grad_norm": 0.004549449775367975, + "learning_rate": 0.00023289482446757747, + "loss": 0.0029, + "num_input_tokens_seen": 188416368, + "step": 87225 + }, + { + "epoch": 14.230016313213703, + "grad_norm": 0.48102879524230957, + "learning_rate": 0.0002328346551985358, + "loss": 0.1339, + "num_input_tokens_seen": 188426512, + "step": 87230 + }, + { + "epoch": 14.230831973898859, + "grad_norm": 0.002627470064908266, + "learning_rate": 0.00023277449134393875, + "loss": 0.0024, + "num_input_tokens_seen": 188437968, + "step": 87235 + }, + { + "epoch": 14.231647634584013, + "grad_norm": 0.021344967186450958, + "learning_rate": 0.00023271433290500567, + "loss": 0.0099, + "num_input_tokens_seen": 188448304, + "step": 87240 + }, + { + "epoch": 14.232463295269168, + "grad_norm": 0.003346246900036931, + "learning_rate": 0.00023265417988295567, + "loss": 0.0014, + "num_input_tokens_seen": 188458320, + "step": 87245 + }, + { + "epoch": 14.233278955954322, + "grad_norm": 1.6928178071975708, + "learning_rate": 0.0002325940322790079, + "loss": 0.0198, + "num_input_tokens_seen": 188469040, + "step": 87250 + }, + { + "epoch": 14.234094616639478, + "grad_norm": 0.0035675603430718184, + "learning_rate": 0.0002325338900943813, + "loss": 0.0016, + "num_input_tokens_seen": 188480880, + "step": 87255 + }, + { + "epoch": 14.234910277324634, + "grad_norm": 0.004161432385444641, + "learning_rate": 0.00023247375333029452, + "loss": 0.0012, + "num_input_tokens_seen": 188491824, + "step": 87260 + }, + { + "epoch": 14.235725938009788, + "grad_norm": 0.04490378871560097, + "learning_rate": 0.00023241362198796666, + "loss": 0.0091, + "num_input_tokens_seen": 188502896, + "step": 87265 + }, + { + "epoch": 14.236541598694943, + "grad_norm": 0.4051935374736786, + "learning_rate": 0.00023235349606861628, + "loss": 0.0071, + "num_input_tokens_seen": 188514800, + "step": 87270 + }, + { + "epoch": 14.237357259380097, + "grad_norm": 0.005929006729274988, + "learning_rate": 0.00023229337557346174, + "loss": 0.0267, + "num_input_tokens_seen": 188524048, + "step": 87275 + }, + { + "epoch": 14.238172920065253, + "grad_norm": 0.0033224388025701046, + "learning_rate": 0.00023223326050372163, + "loss": 0.0026, + "num_input_tokens_seen": 188535600, + "step": 87280 + }, + { + "epoch": 14.238988580750409, + "grad_norm": 0.024302540346980095, + "learning_rate": 0.0002321731508606142, + "loss": 0.0253, + "num_input_tokens_seen": 188546288, + "step": 87285 + }, + { + "epoch": 14.239804241435563, + "grad_norm": 0.004064117558300495, + "learning_rate": 0.0002321130466453576, + "loss": 0.1404, + "num_input_tokens_seen": 188557424, + "step": 87290 + }, + { + "epoch": 14.240619902120718, + "grad_norm": 0.05272166058421135, + "learning_rate": 0.0002320529478591699, + "loss": 0.0123, + "num_input_tokens_seen": 188566928, + "step": 87295 + }, + { + "epoch": 14.241435562805872, + "grad_norm": 0.006723629776388407, + "learning_rate": 0.00023199285450326918, + "loss": 0.0062, + "num_input_tokens_seen": 188576528, + "step": 87300 + }, + { + "epoch": 14.242251223491028, + "grad_norm": 0.0016809606458991766, + "learning_rate": 0.00023193276657887326, + "loss": 0.004, + "num_input_tokens_seen": 188586992, + "step": 87305 + }, + { + "epoch": 14.243066884176184, + "grad_norm": 0.014636986888945103, + "learning_rate": 0.00023187268408719986, + "loss": 0.0013, + "num_input_tokens_seen": 188597776, + "step": 87310 + }, + { + "epoch": 14.243882544861338, + "grad_norm": 0.5428915023803711, + "learning_rate": 0.00023181260702946673, + "loss": 0.0146, + "num_input_tokens_seen": 188608784, + "step": 87315 + }, + { + "epoch": 14.244698205546493, + "grad_norm": 0.003895719302818179, + "learning_rate": 0.00023175253540689124, + "loss": 0.0031, + "num_input_tokens_seen": 188620656, + "step": 87320 + }, + { + "epoch": 14.245513866231647, + "grad_norm": 0.0009167763637378812, + "learning_rate": 0.00023169246922069098, + "loss": 0.0056, + "num_input_tokens_seen": 188631920, + "step": 87325 + }, + { + "epoch": 14.246329526916803, + "grad_norm": 0.035440631210803986, + "learning_rate": 0.00023163240847208318, + "loss": 0.0026, + "num_input_tokens_seen": 188641744, + "step": 87330 + }, + { + "epoch": 14.247145187601957, + "grad_norm": 0.6476132273674011, + "learning_rate": 0.0002315723531622851, + "loss": 0.1426, + "num_input_tokens_seen": 188653168, + "step": 87335 + }, + { + "epoch": 14.247960848287113, + "grad_norm": 0.10940902680158615, + "learning_rate": 0.00023151230329251376, + "loss": 0.0045, + "num_input_tokens_seen": 188664144, + "step": 87340 + }, + { + "epoch": 14.248776508972268, + "grad_norm": 0.002636961406096816, + "learning_rate": 0.00023145225886398617, + "loss": 0.033, + "num_input_tokens_seen": 188674736, + "step": 87345 + }, + { + "epoch": 14.249592169657422, + "grad_norm": 0.00327065447345376, + "learning_rate": 0.0002313922198779193, + "loss": 0.0024, + "num_input_tokens_seen": 188686960, + "step": 87350 + }, + { + "epoch": 14.250407830342578, + "grad_norm": 0.007365102879703045, + "learning_rate": 0.00023133218633552982, + "loss": 0.0091, + "num_input_tokens_seen": 188698576, + "step": 87355 + }, + { + "epoch": 14.251223491027732, + "grad_norm": 0.03469528630375862, + "learning_rate": 0.00023127215823803444, + "loss": 0.0061, + "num_input_tokens_seen": 188709808, + "step": 87360 + }, + { + "epoch": 14.252039151712887, + "grad_norm": 0.30641695857048035, + "learning_rate": 0.00023121213558664966, + "loss": 0.0111, + "num_input_tokens_seen": 188721712, + "step": 87365 + }, + { + "epoch": 14.252854812398043, + "grad_norm": 0.0382346473634243, + "learning_rate": 0.00023115211838259175, + "loss": 0.0027, + "num_input_tokens_seen": 188731920, + "step": 87370 + }, + { + "epoch": 14.253670473083197, + "grad_norm": 0.0007187232258729637, + "learning_rate": 0.00023109210662707757, + "loss": 0.0062, + "num_input_tokens_seen": 188742448, + "step": 87375 + }, + { + "epoch": 14.254486133768353, + "grad_norm": 0.004653709474951029, + "learning_rate": 0.00023103210032132267, + "loss": 0.0013, + "num_input_tokens_seen": 188752976, + "step": 87380 + }, + { + "epoch": 14.255301794453507, + "grad_norm": 0.0012320553651079535, + "learning_rate": 0.0002309720994665438, + "loss": 0.0124, + "num_input_tokens_seen": 188763664, + "step": 87385 + }, + { + "epoch": 14.256117455138662, + "grad_norm": 0.0008974694646894932, + "learning_rate": 0.00023091210406395624, + "loss": 0.0017, + "num_input_tokens_seen": 188773936, + "step": 87390 + }, + { + "epoch": 14.256933115823816, + "grad_norm": 0.008518456481397152, + "learning_rate": 0.00023085211411477663, + "loss": 0.001, + "num_input_tokens_seen": 188784176, + "step": 87395 + }, + { + "epoch": 14.257748776508972, + "grad_norm": 0.019535856321454048, + "learning_rate": 0.00023079212962022, + "loss": 0.0054, + "num_input_tokens_seen": 188794736, + "step": 87400 + }, + { + "epoch": 14.258564437194128, + "grad_norm": 0.015992237254977226, + "learning_rate": 0.00023073215058150255, + "loss": 0.0012, + "num_input_tokens_seen": 188805200, + "step": 87405 + }, + { + "epoch": 14.259380097879282, + "grad_norm": 0.0015473555540665984, + "learning_rate": 0.00023067217699983966, + "loss": 0.0373, + "num_input_tokens_seen": 188816240, + "step": 87410 + }, + { + "epoch": 14.260195758564437, + "grad_norm": 0.005111176986247301, + "learning_rate": 0.00023061220887644679, + "loss": 0.0025, + "num_input_tokens_seen": 188824848, + "step": 87415 + }, + { + "epoch": 14.261011419249591, + "grad_norm": 0.00272758980281651, + "learning_rate": 0.00023055224621253923, + "loss": 0.1147, + "num_input_tokens_seen": 188835888, + "step": 87420 + }, + { + "epoch": 14.261827079934747, + "grad_norm": 0.0052657704800367355, + "learning_rate": 0.00023049228900933223, + "loss": 0.0034, + "num_input_tokens_seen": 188846608, + "step": 87425 + }, + { + "epoch": 14.262642740619903, + "grad_norm": 0.005922100506722927, + "learning_rate": 0.00023043233726804087, + "loss": 0.0017, + "num_input_tokens_seen": 188858096, + "step": 87430 + }, + { + "epoch": 14.263458401305057, + "grad_norm": 0.09431520849466324, + "learning_rate": 0.00023037239098988016, + "loss": 0.016, + "num_input_tokens_seen": 188868880, + "step": 87435 + }, + { + "epoch": 14.264274061990212, + "grad_norm": 0.011786018498241901, + "learning_rate": 0.00023031245017606506, + "loss": 0.0122, + "num_input_tokens_seen": 188880144, + "step": 87440 + }, + { + "epoch": 14.265089722675366, + "grad_norm": 0.049001362174749374, + "learning_rate": 0.00023025251482781023, + "loss": 0.0072, + "num_input_tokens_seen": 188890896, + "step": 87445 + }, + { + "epoch": 14.265905383360522, + "grad_norm": 0.012988328002393246, + "learning_rate": 0.00023019258494633038, + "loss": 0.0042, + "num_input_tokens_seen": 188901680, + "step": 87450 + }, + { + "epoch": 14.266721044045678, + "grad_norm": 0.05504751577973366, + "learning_rate": 0.0002301326605328401, + "loss": 0.0049, + "num_input_tokens_seen": 188912400, + "step": 87455 + }, + { + "epoch": 14.267536704730832, + "grad_norm": 0.0019441379699856043, + "learning_rate": 0.00023007274158855378, + "loss": 0.0049, + "num_input_tokens_seen": 188924048, + "step": 87460 + }, + { + "epoch": 14.268352365415987, + "grad_norm": 0.01831858977675438, + "learning_rate": 0.00023001282811468577, + "loss": 0.0176, + "num_input_tokens_seen": 188935536, + "step": 87465 + }, + { + "epoch": 14.269168026101141, + "grad_norm": 0.3804619610309601, + "learning_rate": 0.00022995292011245033, + "loss": 0.0058, + "num_input_tokens_seen": 188947216, + "step": 87470 + }, + { + "epoch": 14.269983686786297, + "grad_norm": 0.0805104523897171, + "learning_rate": 0.00022989301758306153, + "loss": 0.0056, + "num_input_tokens_seen": 188958288, + "step": 87475 + }, + { + "epoch": 14.270799347471453, + "grad_norm": 0.00453876843675971, + "learning_rate": 0.00022983312052773336, + "loss": 0.003, + "num_input_tokens_seen": 188967920, + "step": 87480 + }, + { + "epoch": 14.271615008156607, + "grad_norm": 0.001278862589970231, + "learning_rate": 0.0002297732289476796, + "loss": 0.0046, + "num_input_tokens_seen": 188977712, + "step": 87485 + }, + { + "epoch": 14.272430668841762, + "grad_norm": 0.0043226066045463085, + "learning_rate": 0.0002297133428441145, + "loss": 0.0026, + "num_input_tokens_seen": 188987728, + "step": 87490 + }, + { + "epoch": 14.273246329526916, + "grad_norm": 0.005642724223434925, + "learning_rate": 0.000229653462218251, + "loss": 0.0047, + "num_input_tokens_seen": 188999088, + "step": 87495 + }, + { + "epoch": 14.274061990212072, + "grad_norm": 0.01294635608792305, + "learning_rate": 0.00022959358707130346, + "loss": 0.0259, + "num_input_tokens_seen": 189010288, + "step": 87500 + }, + { + "epoch": 14.274877650897226, + "grad_norm": 0.000444377918029204, + "learning_rate": 0.00022953371740448453, + "loss": 0.0061, + "num_input_tokens_seen": 189021744, + "step": 87505 + }, + { + "epoch": 14.275693311582382, + "grad_norm": 0.0009572534472681582, + "learning_rate": 0.00022947385321900825, + "loss": 0.0073, + "num_input_tokens_seen": 189031568, + "step": 87510 + }, + { + "epoch": 14.276508972267537, + "grad_norm": 0.0006329436437226832, + "learning_rate": 0.00022941399451608725, + "loss": 0.0006, + "num_input_tokens_seen": 189042032, + "step": 87515 + }, + { + "epoch": 14.277324632952691, + "grad_norm": 0.0018760806415230036, + "learning_rate": 0.00022935414129693523, + "loss": 0.0083, + "num_input_tokens_seen": 189051856, + "step": 87520 + }, + { + "epoch": 14.278140293637847, + "grad_norm": 0.017724091187119484, + "learning_rate": 0.0002292942935627645, + "loss": 0.0038, + "num_input_tokens_seen": 189062096, + "step": 87525 + }, + { + "epoch": 14.278955954323001, + "grad_norm": 0.000525585375726223, + "learning_rate": 0.00022923445131478866, + "loss": 0.1287, + "num_input_tokens_seen": 189072656, + "step": 87530 + }, + { + "epoch": 14.279771615008157, + "grad_norm": 0.1081756129860878, + "learning_rate": 0.00022917461455421984, + "loss": 0.0079, + "num_input_tokens_seen": 189081904, + "step": 87535 + }, + { + "epoch": 14.280587275693312, + "grad_norm": 0.043451886624097824, + "learning_rate": 0.00022911478328227136, + "loss": 0.1889, + "num_input_tokens_seen": 189092176, + "step": 87540 + }, + { + "epoch": 14.281402936378466, + "grad_norm": 0.005053219385445118, + "learning_rate": 0.00022905495750015508, + "loss": 0.0105, + "num_input_tokens_seen": 189102096, + "step": 87545 + }, + { + "epoch": 14.282218597063622, + "grad_norm": 0.1774892508983612, + "learning_rate": 0.000228995137209084, + "loss": 0.0058, + "num_input_tokens_seen": 189113808, + "step": 87550 + }, + { + "epoch": 14.283034257748776, + "grad_norm": 0.0033074861858040094, + "learning_rate": 0.00022893532241027026, + "loss": 0.0021, + "num_input_tokens_seen": 189123312, + "step": 87555 + }, + { + "epoch": 14.283849918433932, + "grad_norm": 0.0016520784702152014, + "learning_rate": 0.00022887551310492605, + "loss": 0.0038, + "num_input_tokens_seen": 189135184, + "step": 87560 + }, + { + "epoch": 14.284665579119087, + "grad_norm": 0.008472919464111328, + "learning_rate": 0.00022881570929426354, + "loss": 0.0043, + "num_input_tokens_seen": 189145936, + "step": 87565 + }, + { + "epoch": 14.285481239804241, + "grad_norm": 0.8978723287582397, + "learning_rate": 0.00022875591097949472, + "loss": 0.118, + "num_input_tokens_seen": 189156304, + "step": 87570 + }, + { + "epoch": 14.286296900489397, + "grad_norm": 0.02543191984295845, + "learning_rate": 0.00022869611816183144, + "loss": 0.0061, + "num_input_tokens_seen": 189166448, + "step": 87575 + }, + { + "epoch": 14.28711256117455, + "grad_norm": 0.0005691770347766578, + "learning_rate": 0.00022863633084248549, + "loss": 0.0024, + "num_input_tokens_seen": 189177456, + "step": 87580 + }, + { + "epoch": 14.287928221859707, + "grad_norm": 0.008937807753682137, + "learning_rate": 0.00022857654902266856, + "loss": 0.004, + "num_input_tokens_seen": 189187248, + "step": 87585 + }, + { + "epoch": 14.28874388254486, + "grad_norm": 0.0018283298704773188, + "learning_rate": 0.00022851677270359217, + "loss": 0.0235, + "num_input_tokens_seen": 189197776, + "step": 87590 + }, + { + "epoch": 14.289559543230016, + "grad_norm": 0.01428473275154829, + "learning_rate": 0.0002284570018864678, + "loss": 0.0025, + "num_input_tokens_seen": 189208624, + "step": 87595 + }, + { + "epoch": 14.290375203915172, + "grad_norm": 0.0034628030844032764, + "learning_rate": 0.0002283972365725066, + "loss": 0.0021, + "num_input_tokens_seen": 189219568, + "step": 87600 + }, + { + "epoch": 14.291190864600326, + "grad_norm": 0.012105366215109825, + "learning_rate": 0.00022833747676292027, + "loss": 0.0037, + "num_input_tokens_seen": 189231120, + "step": 87605 + }, + { + "epoch": 14.292006525285482, + "grad_norm": 0.3718734085559845, + "learning_rate": 0.00022827772245891925, + "loss": 0.0146, + "num_input_tokens_seen": 189241904, + "step": 87610 + }, + { + "epoch": 14.292822185970635, + "grad_norm": 0.08666027337312698, + "learning_rate": 0.00022821797366171531, + "loss": 0.0051, + "num_input_tokens_seen": 189252336, + "step": 87615 + }, + { + "epoch": 14.293637846655791, + "grad_norm": 0.01287077460438013, + "learning_rate": 0.00022815823037251849, + "loss": 0.0039, + "num_input_tokens_seen": 189263184, + "step": 87620 + }, + { + "epoch": 14.294453507340947, + "grad_norm": 0.0899038165807724, + "learning_rate": 0.00022809849259254034, + "loss": 0.0173, + "num_input_tokens_seen": 189274384, + "step": 87625 + }, + { + "epoch": 14.2952691680261, + "grad_norm": 0.03881601616740227, + "learning_rate": 0.00022803876032299086, + "loss": 0.0043, + "num_input_tokens_seen": 189285648, + "step": 87630 + }, + { + "epoch": 14.296084828711257, + "grad_norm": 0.04519505426287651, + "learning_rate": 0.00022797903356508125, + "loss": 0.0041, + "num_input_tokens_seen": 189295920, + "step": 87635 + }, + { + "epoch": 14.29690048939641, + "grad_norm": 0.003327068639919162, + "learning_rate": 0.00022791931232002123, + "loss": 0.0015, + "num_input_tokens_seen": 189307280, + "step": 87640 + }, + { + "epoch": 14.297716150081566, + "grad_norm": 0.04209542274475098, + "learning_rate": 0.00022785959658902188, + "loss": 0.0064, + "num_input_tokens_seen": 189319568, + "step": 87645 + }, + { + "epoch": 14.298531810766722, + "grad_norm": 0.0016900094924494624, + "learning_rate": 0.00022779988637329263, + "loss": 0.0014, + "num_input_tokens_seen": 189329616, + "step": 87650 + }, + { + "epoch": 14.299347471451876, + "grad_norm": 0.0013236630475148559, + "learning_rate": 0.00022774018167404442, + "loss": 0.0012, + "num_input_tokens_seen": 189340496, + "step": 87655 + }, + { + "epoch": 14.300163132137031, + "grad_norm": 0.022487998008728027, + "learning_rate": 0.00022768048249248646, + "loss": 0.0041, + "num_input_tokens_seen": 189350416, + "step": 87660 + }, + { + "epoch": 14.300978792822185, + "grad_norm": 0.0019900943152606487, + "learning_rate": 0.00022762078882982928, + "loss": 0.001, + "num_input_tokens_seen": 189361840, + "step": 87665 + }, + { + "epoch": 14.301794453507341, + "grad_norm": 0.007686574477702379, + "learning_rate": 0.00022756110068728204, + "loss": 0.0024, + "num_input_tokens_seen": 189372880, + "step": 87670 + }, + { + "epoch": 14.302610114192497, + "grad_norm": 0.00679362565279007, + "learning_rate": 0.00022750141806605507, + "loss": 0.0012, + "num_input_tokens_seen": 189384944, + "step": 87675 + }, + { + "epoch": 14.30342577487765, + "grad_norm": 0.028075747191905975, + "learning_rate": 0.00022744174096735715, + "loss": 0.0032, + "num_input_tokens_seen": 189395824, + "step": 87680 + }, + { + "epoch": 14.304241435562806, + "grad_norm": 1.1675291061401367, + "learning_rate": 0.00022738206939239852, + "loss": 0.019, + "num_input_tokens_seen": 189406352, + "step": 87685 + }, + { + "epoch": 14.30505709624796, + "grad_norm": 0.002868425566703081, + "learning_rate": 0.0002273224033423877, + "loss": 0.002, + "num_input_tokens_seen": 189417584, + "step": 87690 + }, + { + "epoch": 14.305872756933116, + "grad_norm": 0.21164196729660034, + "learning_rate": 0.0002272627428185345, + "loss": 0.0145, + "num_input_tokens_seen": 189428944, + "step": 87695 + }, + { + "epoch": 14.30668841761827, + "grad_norm": 0.042567916214466095, + "learning_rate": 0.0002272030878220478, + "loss": 0.0016, + "num_input_tokens_seen": 189439632, + "step": 87700 + }, + { + "epoch": 14.307504078303426, + "grad_norm": 0.058382321149110794, + "learning_rate": 0.0002271434383541366, + "loss": 0.0057, + "num_input_tokens_seen": 189450672, + "step": 87705 + }, + { + "epoch": 14.308319738988581, + "grad_norm": 0.01143584307283163, + "learning_rate": 0.00022708379441600975, + "loss": 0.0105, + "num_input_tokens_seen": 189461200, + "step": 87710 + }, + { + "epoch": 14.309135399673735, + "grad_norm": 0.0020317668095231056, + "learning_rate": 0.000227024156008876, + "loss": 0.0009, + "num_input_tokens_seen": 189470480, + "step": 87715 + }, + { + "epoch": 14.309951060358891, + "grad_norm": 0.006676700431853533, + "learning_rate": 0.00022696452313394406, + "loss": 0.0037, + "num_input_tokens_seen": 189481520, + "step": 87720 + }, + { + "epoch": 14.310766721044045, + "grad_norm": 0.0036642735358327627, + "learning_rate": 0.0002269048957924224, + "loss": 0.004, + "num_input_tokens_seen": 189491280, + "step": 87725 + }, + { + "epoch": 14.3115823817292, + "grad_norm": 0.0023649828508496284, + "learning_rate": 0.0002268452739855195, + "loss": 0.0078, + "num_input_tokens_seen": 189501648, + "step": 87730 + }, + { + "epoch": 14.312398042414356, + "grad_norm": 0.09851660579442978, + "learning_rate": 0.00022678565771444364, + "loss": 0.0031, + "num_input_tokens_seen": 189513616, + "step": 87735 + }, + { + "epoch": 14.31321370309951, + "grad_norm": 0.011403873562812805, + "learning_rate": 0.00022672604698040306, + "loss": 0.0149, + "num_input_tokens_seen": 189523984, + "step": 87740 + }, + { + "epoch": 14.314029363784666, + "grad_norm": 0.7276797294616699, + "learning_rate": 0.00022666644178460555, + "loss": 0.018, + "num_input_tokens_seen": 189535088, + "step": 87745 + }, + { + "epoch": 14.31484502446982, + "grad_norm": 0.0041549173183739185, + "learning_rate": 0.00022660684212825978, + "loss": 0.0015, + "num_input_tokens_seen": 189546800, + "step": 87750 + }, + { + "epoch": 14.315660685154976, + "grad_norm": 0.003960830625146627, + "learning_rate": 0.00022654724801257276, + "loss": 0.1172, + "num_input_tokens_seen": 189558192, + "step": 87755 + }, + { + "epoch": 14.31647634584013, + "grad_norm": 0.0012428623158484697, + "learning_rate": 0.00022648765943875305, + "loss": 0.002, + "num_input_tokens_seen": 189568720, + "step": 87760 + }, + { + "epoch": 14.317292006525285, + "grad_norm": 0.09902480244636536, + "learning_rate": 0.00022642807640800756, + "loss": 0.0084, + "num_input_tokens_seen": 189579216, + "step": 87765 + }, + { + "epoch": 14.318107667210441, + "grad_norm": 0.005643976386636496, + "learning_rate": 0.0002263684989215445, + "loss": 0.015, + "num_input_tokens_seen": 189589648, + "step": 87770 + }, + { + "epoch": 14.318923327895595, + "grad_norm": 0.015322118066251278, + "learning_rate": 0.00022630892698057055, + "loss": 0.0019, + "num_input_tokens_seen": 189600592, + "step": 87775 + }, + { + "epoch": 14.31973898858075, + "grad_norm": 0.29420289397239685, + "learning_rate": 0.00022624936058629374, + "loss": 0.0063, + "num_input_tokens_seen": 189612176, + "step": 87780 + }, + { + "epoch": 14.320554649265905, + "grad_norm": 0.004782841540873051, + "learning_rate": 0.00022618979973992054, + "loss": 0.0041, + "num_input_tokens_seen": 189623600, + "step": 87785 + }, + { + "epoch": 14.32137030995106, + "grad_norm": 0.00045195547863841057, + "learning_rate": 0.00022613024444265883, + "loss": 0.0041, + "num_input_tokens_seen": 189633360, + "step": 87790 + }, + { + "epoch": 14.322185970636216, + "grad_norm": 0.005995205603539944, + "learning_rate": 0.00022607069469571473, + "loss": 0.001, + "num_input_tokens_seen": 189642288, + "step": 87795 + }, + { + "epoch": 14.32300163132137, + "grad_norm": 0.009188726544380188, + "learning_rate": 0.00022601115050029574, + "loss": 0.0234, + "num_input_tokens_seen": 189653040, + "step": 87800 + }, + { + "epoch": 14.323817292006526, + "grad_norm": 0.0009074592380784452, + "learning_rate": 0.0002259516118576083, + "loss": 0.0017, + "num_input_tokens_seen": 189663440, + "step": 87805 + }, + { + "epoch": 14.32463295269168, + "grad_norm": 0.0021508075296878815, + "learning_rate": 0.00022589207876885914, + "loss": 0.0214, + "num_input_tokens_seen": 189673392, + "step": 87810 + }, + { + "epoch": 14.325448613376835, + "grad_norm": 0.033135004341602325, + "learning_rate": 0.00022583255123525476, + "loss": 0.1537, + "num_input_tokens_seen": 189684144, + "step": 87815 + }, + { + "epoch": 14.326264274061991, + "grad_norm": 0.003989869728684425, + "learning_rate": 0.00022577302925800153, + "loss": 0.0013, + "num_input_tokens_seen": 189695440, + "step": 87820 + }, + { + "epoch": 14.327079934747145, + "grad_norm": 0.0038525178097188473, + "learning_rate": 0.0002257135128383057, + "loss": 0.0013, + "num_input_tokens_seen": 189705936, + "step": 87825 + }, + { + "epoch": 14.3278955954323, + "grad_norm": 0.009815668687224388, + "learning_rate": 0.00022565400197737352, + "loss": 0.0371, + "num_input_tokens_seen": 189716848, + "step": 87830 + }, + { + "epoch": 14.328711256117455, + "grad_norm": 0.03508485481142998, + "learning_rate": 0.000225594496676411, + "loss": 0.0054, + "num_input_tokens_seen": 189727088, + "step": 87835 + }, + { + "epoch": 14.32952691680261, + "grad_norm": 0.0028070693369954824, + "learning_rate": 0.0002255349969366241, + "loss": 0.0021, + "num_input_tokens_seen": 189738608, + "step": 87840 + }, + { + "epoch": 14.330342577487766, + "grad_norm": 0.03543877974152565, + "learning_rate": 0.0002254755027592187, + "loss": 0.0081, + "num_input_tokens_seen": 189749712, + "step": 87845 + }, + { + "epoch": 14.33115823817292, + "grad_norm": 0.1265224665403366, + "learning_rate": 0.00022541601414540052, + "loss": 0.0086, + "num_input_tokens_seen": 189760624, + "step": 87850 + }, + { + "epoch": 14.331973898858076, + "grad_norm": 0.002698419615626335, + "learning_rate": 0.00022535653109637512, + "loss": 0.0019, + "num_input_tokens_seen": 189770224, + "step": 87855 + }, + { + "epoch": 14.33278955954323, + "grad_norm": 0.0011117426911368966, + "learning_rate": 0.000225297053613348, + "loss": 0.0014, + "num_input_tokens_seen": 189780752, + "step": 87860 + }, + { + "epoch": 14.333605220228385, + "grad_norm": 0.0030760967638343573, + "learning_rate": 0.0002252375816975246, + "loss": 0.0986, + "num_input_tokens_seen": 189792240, + "step": 87865 + }, + { + "epoch": 14.33442088091354, + "grad_norm": 0.3648514747619629, + "learning_rate": 0.0002251781153501102, + "loss": 0.0044, + "num_input_tokens_seen": 189803120, + "step": 87870 + }, + { + "epoch": 14.335236541598695, + "grad_norm": 0.006983945611864328, + "learning_rate": 0.0002251186545723099, + "loss": 0.0031, + "num_input_tokens_seen": 189814416, + "step": 87875 + }, + { + "epoch": 14.33605220228385, + "grad_norm": 0.01047761645168066, + "learning_rate": 0.00022505919936532877, + "loss": 0.0088, + "num_input_tokens_seen": 189824016, + "step": 87880 + }, + { + "epoch": 14.336867862969005, + "grad_norm": 0.016600316390395164, + "learning_rate": 0.00022499974973037173, + "loss": 0.0142, + "num_input_tokens_seen": 189834416, + "step": 87885 + }, + { + "epoch": 14.33768352365416, + "grad_norm": 0.0008033128106035292, + "learning_rate": 0.0002249403056686435, + "loss": 0.0016, + "num_input_tokens_seen": 189844496, + "step": 87890 + }, + { + "epoch": 14.338499184339314, + "grad_norm": 0.48811638355255127, + "learning_rate": 0.0002248808671813492, + "loss": 0.1388, + "num_input_tokens_seen": 189855792, + "step": 87895 + }, + { + "epoch": 14.33931484502447, + "grad_norm": 0.008555108681321144, + "learning_rate": 0.00022482143426969282, + "loss": 0.0044, + "num_input_tokens_seen": 189867376, + "step": 87900 + }, + { + "epoch": 14.340130505709626, + "grad_norm": 0.001388808828778565, + "learning_rate": 0.00022476200693487936, + "loss": 0.0041, + "num_input_tokens_seen": 189878768, + "step": 87905 + }, + { + "epoch": 14.34094616639478, + "grad_norm": 0.0068275779485702515, + "learning_rate": 0.000224702585178113, + "loss": 0.0032, + "num_input_tokens_seen": 189889744, + "step": 87910 + }, + { + "epoch": 14.341761827079935, + "grad_norm": 0.005051123443990946, + "learning_rate": 0.00022464316900059795, + "loss": 0.0016, + "num_input_tokens_seen": 189900752, + "step": 87915 + }, + { + "epoch": 14.34257748776509, + "grad_norm": 0.00225257920101285, + "learning_rate": 0.0002245837584035384, + "loss": 0.0038, + "num_input_tokens_seen": 189910000, + "step": 87920 + }, + { + "epoch": 14.343393148450245, + "grad_norm": 0.0017936478834599257, + "learning_rate": 0.00022452435338813842, + "loss": 0.1515, + "num_input_tokens_seen": 189920240, + "step": 87925 + }, + { + "epoch": 14.3442088091354, + "grad_norm": 0.0054099103435873985, + "learning_rate": 0.00022446495395560186, + "loss": 0.0035, + "num_input_tokens_seen": 189930704, + "step": 87930 + }, + { + "epoch": 14.345024469820554, + "grad_norm": 0.003530829679220915, + "learning_rate": 0.00022440556010713253, + "loss": 0.0072, + "num_input_tokens_seen": 189941456, + "step": 87935 + }, + { + "epoch": 14.34584013050571, + "grad_norm": 0.008934364654123783, + "learning_rate": 0.00022434617184393418, + "loss": 0.0069, + "num_input_tokens_seen": 189952688, + "step": 87940 + }, + { + "epoch": 14.346655791190864, + "grad_norm": 0.002388365101069212, + "learning_rate": 0.00022428678916721029, + "loss": 0.0331, + "num_input_tokens_seen": 189963984, + "step": 87945 + }, + { + "epoch": 14.34747145187602, + "grad_norm": 0.026174485683441162, + "learning_rate": 0.00022422741207816444, + "loss": 0.002, + "num_input_tokens_seen": 189975024, + "step": 87950 + }, + { + "epoch": 14.348287112561174, + "grad_norm": 0.00028221847605891526, + "learning_rate": 0.00022416804057799988, + "loss": 0.0013, + "num_input_tokens_seen": 189986544, + "step": 87955 + }, + { + "epoch": 14.34910277324633, + "grad_norm": 0.07184788584709167, + "learning_rate": 0.00022410867466791996, + "loss": 0.0021, + "num_input_tokens_seen": 189997840, + "step": 87960 + }, + { + "epoch": 14.349918433931485, + "grad_norm": 0.0035779913887381554, + "learning_rate": 0.00022404931434912768, + "loss": 0.0014, + "num_input_tokens_seen": 190009360, + "step": 87965 + }, + { + "epoch": 14.350734094616639, + "grad_norm": 0.02154787816107273, + "learning_rate": 0.00022398995962282615, + "loss": 0.0014, + "num_input_tokens_seen": 190021328, + "step": 87970 + }, + { + "epoch": 14.351549755301795, + "grad_norm": 0.05506477877497673, + "learning_rate": 0.00022393061049021823, + "loss": 0.0028, + "num_input_tokens_seen": 190030256, + "step": 87975 + }, + { + "epoch": 14.352365415986949, + "grad_norm": 0.0024936876725405455, + "learning_rate": 0.0002238712669525067, + "loss": 0.0014, + "num_input_tokens_seen": 190040400, + "step": 87980 + }, + { + "epoch": 14.353181076672104, + "grad_norm": 0.05352950841188431, + "learning_rate": 0.0002238119290108942, + "loss": 0.0029, + "num_input_tokens_seen": 190051888, + "step": 87985 + }, + { + "epoch": 14.35399673735726, + "grad_norm": 0.0007369217346422374, + "learning_rate": 0.00022375259666658338, + "loss": 0.0014, + "num_input_tokens_seen": 190062576, + "step": 87990 + }, + { + "epoch": 14.354812398042414, + "grad_norm": 0.0017139033880084753, + "learning_rate": 0.0002236932699207766, + "loss": 0.0022, + "num_input_tokens_seen": 190073776, + "step": 87995 + }, + { + "epoch": 14.35562805872757, + "grad_norm": 0.4416674077510834, + "learning_rate": 0.00022363394877467625, + "loss": 0.0075, + "num_input_tokens_seen": 190085168, + "step": 88000 + }, + { + "epoch": 14.356443719412724, + "grad_norm": 0.004349089693278074, + "learning_rate": 0.0002235746332294845, + "loss": 0.0022, + "num_input_tokens_seen": 190096080, + "step": 88005 + }, + { + "epoch": 14.35725938009788, + "grad_norm": 0.002140692202374339, + "learning_rate": 0.00022351532328640335, + "loss": 0.0021, + "num_input_tokens_seen": 190107056, + "step": 88010 + }, + { + "epoch": 14.358075040783035, + "grad_norm": 0.0060596526600420475, + "learning_rate": 0.0002234560189466352, + "loss": 0.0165, + "num_input_tokens_seen": 190117968, + "step": 88015 + }, + { + "epoch": 14.358890701468189, + "grad_norm": 0.004749086685478687, + "learning_rate": 0.00022339672021138136, + "loss": 0.0015, + "num_input_tokens_seen": 190128112, + "step": 88020 + }, + { + "epoch": 14.359706362153345, + "grad_norm": 0.005048302933573723, + "learning_rate": 0.00022333742708184417, + "loss": 0.0026, + "num_input_tokens_seen": 190139120, + "step": 88025 + }, + { + "epoch": 14.360522022838499, + "grad_norm": 0.04125377535820007, + "learning_rate": 0.0002232781395592247, + "loss": 0.0066, + "num_input_tokens_seen": 190150256, + "step": 88030 + }, + { + "epoch": 14.361337683523654, + "grad_norm": 0.001568611478433013, + "learning_rate": 0.00022321885764472495, + "loss": 0.0016, + "num_input_tokens_seen": 190161904, + "step": 88035 + }, + { + "epoch": 14.362153344208808, + "grad_norm": 0.014550375752151012, + "learning_rate": 0.00022315958133954612, + "loss": 0.0022, + "num_input_tokens_seen": 190173104, + "step": 88040 + }, + { + "epoch": 14.362969004893964, + "grad_norm": 0.005815689451992512, + "learning_rate": 0.00022310031064488962, + "loss": 0.0011, + "num_input_tokens_seen": 190185200, + "step": 88045 + }, + { + "epoch": 14.36378466557912, + "grad_norm": 0.0417536124587059, + "learning_rate": 0.0002230410455619566, + "loss": 0.0017, + "num_input_tokens_seen": 190195664, + "step": 88050 + }, + { + "epoch": 14.364600326264274, + "grad_norm": 0.0010946786496788263, + "learning_rate": 0.00022298178609194807, + "loss": 0.0033, + "num_input_tokens_seen": 190207376, + "step": 88055 + }, + { + "epoch": 14.36541598694943, + "grad_norm": 0.008044305257499218, + "learning_rate": 0.00022292253223606513, + "loss": 0.0018, + "num_input_tokens_seen": 190217264, + "step": 88060 + }, + { + "epoch": 14.366231647634583, + "grad_norm": 0.0005770522402599454, + "learning_rate": 0.0002228632839955086, + "loss": 0.0031, + "num_input_tokens_seen": 190227600, + "step": 88065 + }, + { + "epoch": 14.367047308319739, + "grad_norm": 0.030013680458068848, + "learning_rate": 0.00022280404137147914, + "loss": 0.0032, + "num_input_tokens_seen": 190237584, + "step": 88070 + }, + { + "epoch": 14.367862969004895, + "grad_norm": 0.002448596293106675, + "learning_rate": 0.00022274480436517742, + "loss": 0.0007, + "num_input_tokens_seen": 190249040, + "step": 88075 + }, + { + "epoch": 14.368678629690049, + "grad_norm": 0.010528423823416233, + "learning_rate": 0.00022268557297780396, + "loss": 0.0011, + "num_input_tokens_seen": 190260240, + "step": 88080 + }, + { + "epoch": 14.369494290375204, + "grad_norm": 0.0026562747079879045, + "learning_rate": 0.00022262634721055918, + "loss": 0.123, + "num_input_tokens_seen": 190271856, + "step": 88085 + }, + { + "epoch": 14.370309951060358, + "grad_norm": 0.044105686247348785, + "learning_rate": 0.00022256712706464338, + "loss": 0.0215, + "num_input_tokens_seen": 190282832, + "step": 88090 + }, + { + "epoch": 14.371125611745514, + "grad_norm": 0.03261735662817955, + "learning_rate": 0.0002225079125412567, + "loss": 0.0019, + "num_input_tokens_seen": 190293296, + "step": 88095 + }, + { + "epoch": 14.37194127243067, + "grad_norm": 0.004019154701381922, + "learning_rate": 0.00022244870364159912, + "loss": 0.1147, + "num_input_tokens_seen": 190304752, + "step": 88100 + }, + { + "epoch": 14.372756933115824, + "grad_norm": 0.00436245184391737, + "learning_rate": 0.00022238950036687071, + "loss": 0.0023, + "num_input_tokens_seen": 190315280, + "step": 88105 + }, + { + "epoch": 14.37357259380098, + "grad_norm": 0.002692397916689515, + "learning_rate": 0.00022233030271827126, + "loss": 0.0022, + "num_input_tokens_seen": 190324880, + "step": 88110 + }, + { + "epoch": 14.374388254486133, + "grad_norm": 0.17537789046764374, + "learning_rate": 0.0002222711106970003, + "loss": 0.1043, + "num_input_tokens_seen": 190335792, + "step": 88115 + }, + { + "epoch": 14.375203915171289, + "grad_norm": 0.02732802927494049, + "learning_rate": 0.0002222119243042579, + "loss": 0.0048, + "num_input_tokens_seen": 190345936, + "step": 88120 + }, + { + "epoch": 14.376019575856443, + "grad_norm": 0.0324014350771904, + "learning_rate": 0.00022215274354124294, + "loss": 0.0205, + "num_input_tokens_seen": 190357488, + "step": 88125 + }, + { + "epoch": 14.376835236541599, + "grad_norm": 0.0018137918086722493, + "learning_rate": 0.00022209356840915552, + "loss": 0.0031, + "num_input_tokens_seen": 190368688, + "step": 88130 + }, + { + "epoch": 14.377650897226754, + "grad_norm": 0.02366684377193451, + "learning_rate": 0.00022203439890919403, + "loss": 0.0018, + "num_input_tokens_seen": 190379504, + "step": 88135 + }, + { + "epoch": 14.378466557911908, + "grad_norm": 0.0013595132622867823, + "learning_rate": 0.00022197523504255846, + "loss": 0.0012, + "num_input_tokens_seen": 190390256, + "step": 88140 + }, + { + "epoch": 14.379282218597064, + "grad_norm": 0.0014586466131731868, + "learning_rate": 0.00022191607681044712, + "loss": 0.0029, + "num_input_tokens_seen": 190400016, + "step": 88145 + }, + { + "epoch": 14.380097879282218, + "grad_norm": 0.5295431017875671, + "learning_rate": 0.00022185692421405962, + "loss": 0.0821, + "num_input_tokens_seen": 190410544, + "step": 88150 + }, + { + "epoch": 14.380913539967374, + "grad_norm": 0.024179702624678612, + "learning_rate": 0.000221797777254594, + "loss": 0.0041, + "num_input_tokens_seen": 190422032, + "step": 88155 + }, + { + "epoch": 14.38172920065253, + "grad_norm": 0.05698935687541962, + "learning_rate": 0.00022173863593324971, + "loss": 0.0029, + "num_input_tokens_seen": 190433392, + "step": 88160 + }, + { + "epoch": 14.382544861337683, + "grad_norm": 0.002039377810433507, + "learning_rate": 0.00022167950025122463, + "loss": 0.0046, + "num_input_tokens_seen": 190443760, + "step": 88165 + }, + { + "epoch": 14.383360522022839, + "grad_norm": 0.09846755862236023, + "learning_rate": 0.00022162037020971793, + "loss": 0.0165, + "num_input_tokens_seen": 190454288, + "step": 88170 + }, + { + "epoch": 14.384176182707993, + "grad_norm": 0.002468029735609889, + "learning_rate": 0.00022156124580992716, + "loss": 0.0023, + "num_input_tokens_seen": 190465168, + "step": 88175 + }, + { + "epoch": 14.384991843393149, + "grad_norm": 0.0019818132277578115, + "learning_rate": 0.00022150212705305118, + "loss": 0.0026, + "num_input_tokens_seen": 190476720, + "step": 88180 + }, + { + "epoch": 14.385807504078304, + "grad_norm": 0.004764073994010687, + "learning_rate": 0.00022144301394028793, + "loss": 0.0399, + "num_input_tokens_seen": 190488048, + "step": 88185 + }, + { + "epoch": 14.386623164763458, + "grad_norm": 0.005842797923833132, + "learning_rate": 0.0002213839064728353, + "loss": 0.001, + "num_input_tokens_seen": 190497488, + "step": 88190 + }, + { + "epoch": 14.387438825448614, + "grad_norm": 0.06038953736424446, + "learning_rate": 0.0002213248046518913, + "loss": 0.0592, + "num_input_tokens_seen": 190506640, + "step": 88195 + }, + { + "epoch": 14.388254486133768, + "grad_norm": 0.0026502537075430155, + "learning_rate": 0.00022126570847865368, + "loss": 0.002, + "num_input_tokens_seen": 190516656, + "step": 88200 + }, + { + "epoch": 14.389070146818923, + "grad_norm": 0.04889017343521118, + "learning_rate": 0.00022120661795432, + "loss": 0.0083, + "num_input_tokens_seen": 190528464, + "step": 88205 + }, + { + "epoch": 14.38988580750408, + "grad_norm": 0.030095215886831284, + "learning_rate": 0.00022114753308008795, + "loss": 0.0019, + "num_input_tokens_seen": 190539536, + "step": 88210 + }, + { + "epoch": 14.390701468189233, + "grad_norm": 0.6848757863044739, + "learning_rate": 0.00022108845385715488, + "loss": 0.0973, + "num_input_tokens_seen": 190551376, + "step": 88215 + }, + { + "epoch": 14.391517128874389, + "grad_norm": 0.3863895833492279, + "learning_rate": 0.00022102938028671816, + "loss": 0.0152, + "num_input_tokens_seen": 190562320, + "step": 88220 + }, + { + "epoch": 14.392332789559543, + "grad_norm": 0.0029419090133160353, + "learning_rate": 0.00022097031236997488, + "loss": 0.0058, + "num_input_tokens_seen": 190572432, + "step": 88225 + }, + { + "epoch": 14.393148450244698, + "grad_norm": 0.001146205817349255, + "learning_rate": 0.00022091125010812202, + "loss": 0.001, + "num_input_tokens_seen": 190583184, + "step": 88230 + }, + { + "epoch": 14.393964110929852, + "grad_norm": 0.0018329378217458725, + "learning_rate": 0.00022085219350235707, + "loss": 0.0012, + "num_input_tokens_seen": 190593712, + "step": 88235 + }, + { + "epoch": 14.394779771615008, + "grad_norm": 0.0034815967082977295, + "learning_rate": 0.00022079314255387623, + "loss": 0.0024, + "num_input_tokens_seen": 190604464, + "step": 88240 + }, + { + "epoch": 14.395595432300164, + "grad_norm": 0.048917580395936966, + "learning_rate": 0.00022073409726387688, + "loss": 0.0023, + "num_input_tokens_seen": 190615952, + "step": 88245 + }, + { + "epoch": 14.396411092985318, + "grad_norm": 0.00788689311593771, + "learning_rate": 0.000220675057633555, + "loss": 0.002, + "num_input_tokens_seen": 190627216, + "step": 88250 + }, + { + "epoch": 14.397226753670473, + "grad_norm": 0.003668892662972212, + "learning_rate": 0.00022061602366410776, + "loss": 0.0803, + "num_input_tokens_seen": 190637968, + "step": 88255 + }, + { + "epoch": 14.398042414355627, + "grad_norm": 0.0010389218805357814, + "learning_rate": 0.0002205569953567309, + "loss": 0.0042, + "num_input_tokens_seen": 190649168, + "step": 88260 + }, + { + "epoch": 14.398858075040783, + "grad_norm": 0.2328641712665558, + "learning_rate": 0.00022049797271262133, + "loss": 0.0175, + "num_input_tokens_seen": 190660240, + "step": 88265 + }, + { + "epoch": 14.399673735725939, + "grad_norm": 0.14842480421066284, + "learning_rate": 0.00022043895573297463, + "loss": 0.0109, + "num_input_tokens_seen": 190670640, + "step": 88270 + }, + { + "epoch": 14.400489396411093, + "grad_norm": 0.02398735098540783, + "learning_rate": 0.0002203799444189874, + "loss": 0.0017, + "num_input_tokens_seen": 190679984, + "step": 88275 + }, + { + "epoch": 14.401305057096248, + "grad_norm": 0.0224875770509243, + "learning_rate": 0.00022032093877185504, + "loss": 0.0029, + "num_input_tokens_seen": 190690608, + "step": 88280 + }, + { + "epoch": 14.402120717781402, + "grad_norm": 0.020505795255303383, + "learning_rate": 0.000220261938792774, + "loss": 0.0039, + "num_input_tokens_seen": 190701840, + "step": 88285 + }, + { + "epoch": 14.402936378466558, + "grad_norm": 0.0015917181735858321, + "learning_rate": 0.00022020294448293925, + "loss": 0.0042, + "num_input_tokens_seen": 190714096, + "step": 88290 + }, + { + "epoch": 14.403752039151712, + "grad_norm": 0.003339561866596341, + "learning_rate": 0.00022014395584354717, + "loss": 0.0016, + "num_input_tokens_seen": 190725296, + "step": 88295 + }, + { + "epoch": 14.404567699836868, + "grad_norm": 0.007421552669256926, + "learning_rate": 0.0002200849728757925, + "loss": 0.0087, + "num_input_tokens_seen": 190736016, + "step": 88300 + }, + { + "epoch": 14.405383360522023, + "grad_norm": 0.0025911256670951843, + "learning_rate": 0.00022002599558087126, + "loss": 0.0018, + "num_input_tokens_seen": 190748304, + "step": 88305 + }, + { + "epoch": 14.406199021207177, + "grad_norm": 0.007793547119945288, + "learning_rate": 0.00021996702395997807, + "loss": 0.002, + "num_input_tokens_seen": 190759824, + "step": 88310 + }, + { + "epoch": 14.407014681892333, + "grad_norm": 0.035313066095113754, + "learning_rate": 0.00021990805801430874, + "loss": 0.0045, + "num_input_tokens_seen": 190771056, + "step": 88315 + }, + { + "epoch": 14.407830342577487, + "grad_norm": 0.0019238482927903533, + "learning_rate": 0.00021984909774505756, + "loss": 0.0063, + "num_input_tokens_seen": 190782608, + "step": 88320 + }, + { + "epoch": 14.408646003262643, + "grad_norm": 0.019743841141462326, + "learning_rate": 0.00021979014315342, + "loss": 0.0033, + "num_input_tokens_seen": 190794352, + "step": 88325 + }, + { + "epoch": 14.409461663947798, + "grad_norm": 0.0016499466728419065, + "learning_rate": 0.00021973119424059068, + "loss": 0.0689, + "num_input_tokens_seen": 190805360, + "step": 88330 + }, + { + "epoch": 14.410277324632952, + "grad_norm": 0.008232859894633293, + "learning_rate": 0.00021967225100776424, + "loss": 0.0011, + "num_input_tokens_seen": 190815408, + "step": 88335 + }, + { + "epoch": 14.411092985318108, + "grad_norm": 0.3779599070549011, + "learning_rate": 0.00021961331345613522, + "loss": 0.0097, + "num_input_tokens_seen": 190826576, + "step": 88340 + }, + { + "epoch": 14.411908646003262, + "grad_norm": 0.0002817259228322655, + "learning_rate": 0.00021955438158689818, + "loss": 0.0024, + "num_input_tokens_seen": 190836688, + "step": 88345 + }, + { + "epoch": 14.412724306688418, + "grad_norm": 0.5337850451469421, + "learning_rate": 0.00021949545540124734, + "loss": 0.0211, + "num_input_tokens_seen": 190846800, + "step": 88350 + }, + { + "epoch": 14.413539967373573, + "grad_norm": 0.08282860368490219, + "learning_rate": 0.0002194365349003769, + "loss": 0.1119, + "num_input_tokens_seen": 190856240, + "step": 88355 + }, + { + "epoch": 14.414355628058727, + "grad_norm": 0.006659589242190123, + "learning_rate": 0.00021937762008548102, + "loss": 0.0019, + "num_input_tokens_seen": 190867024, + "step": 88360 + }, + { + "epoch": 14.415171288743883, + "grad_norm": 0.005563710350543261, + "learning_rate": 0.00021931871095775364, + "loss": 0.0027, + "num_input_tokens_seen": 190877200, + "step": 88365 + }, + { + "epoch": 14.415986949429037, + "grad_norm": 0.00834878534078598, + "learning_rate": 0.0002192598075183887, + "loss": 0.0017, + "num_input_tokens_seen": 190887536, + "step": 88370 + }, + { + "epoch": 14.416802610114193, + "grad_norm": 0.37227877974510193, + "learning_rate": 0.00021920090976857971, + "loss": 0.0896, + "num_input_tokens_seen": 190897872, + "step": 88375 + }, + { + "epoch": 14.417618270799348, + "grad_norm": 0.00478452630341053, + "learning_rate": 0.00021914201770952086, + "loss": 0.0009, + "num_input_tokens_seen": 190909072, + "step": 88380 + }, + { + "epoch": 14.418433931484502, + "grad_norm": 0.6292222738265991, + "learning_rate": 0.00021908313134240493, + "loss": 0.0597, + "num_input_tokens_seen": 190920080, + "step": 88385 + }, + { + "epoch": 14.419249592169658, + "grad_norm": 0.0013072388246655464, + "learning_rate": 0.00021902425066842608, + "loss": 0.0168, + "num_input_tokens_seen": 190931600, + "step": 88390 + }, + { + "epoch": 14.420065252854812, + "grad_norm": 0.0008924621506594121, + "learning_rate": 0.00021896537568877688, + "loss": 0.0548, + "num_input_tokens_seen": 190943312, + "step": 88395 + }, + { + "epoch": 14.420880913539968, + "grad_norm": 0.004021234344691038, + "learning_rate": 0.00021890650640465125, + "loss": 0.0025, + "num_input_tokens_seen": 190954064, + "step": 88400 + }, + { + "epoch": 14.421696574225122, + "grad_norm": 0.004071609117090702, + "learning_rate": 0.00021884764281724145, + "loss": 0.0019, + "num_input_tokens_seen": 190964848, + "step": 88405 + }, + { + "epoch": 14.422512234910277, + "grad_norm": 0.00245073065161705, + "learning_rate": 0.00021878878492774125, + "loss": 0.0021, + "num_input_tokens_seen": 190975696, + "step": 88410 + }, + { + "epoch": 14.423327895595433, + "grad_norm": 0.006399341858923435, + "learning_rate": 0.00021872993273734266, + "loss": 0.0071, + "num_input_tokens_seen": 190986448, + "step": 88415 + }, + { + "epoch": 14.424143556280587, + "grad_norm": 0.0016032133717089891, + "learning_rate": 0.0002186710862472392, + "loss": 0.0032, + "num_input_tokens_seen": 190996912, + "step": 88420 + }, + { + "epoch": 14.424959216965743, + "grad_norm": 0.0015737857902422547, + "learning_rate": 0.00021861224545862264, + "loss": 0.006, + "num_input_tokens_seen": 191007440, + "step": 88425 + }, + { + "epoch": 14.425774877650896, + "grad_norm": 0.08014027774333954, + "learning_rate": 0.0002185534103726863, + "loss": 0.0076, + "num_input_tokens_seen": 191019344, + "step": 88430 + }, + { + "epoch": 14.426590538336052, + "grad_norm": 0.48051971197128296, + "learning_rate": 0.00021849458099062175, + "loss": 0.105, + "num_input_tokens_seen": 191030000, + "step": 88435 + }, + { + "epoch": 14.427406199021208, + "grad_norm": 0.003187313210219145, + "learning_rate": 0.00021843575731362187, + "loss": 0.1245, + "num_input_tokens_seen": 191040656, + "step": 88440 + }, + { + "epoch": 14.428221859706362, + "grad_norm": 0.6462088823318481, + "learning_rate": 0.0002183769393428785, + "loss": 0.0935, + "num_input_tokens_seen": 191052240, + "step": 88445 + }, + { + "epoch": 14.429037520391518, + "grad_norm": 0.04802269861102104, + "learning_rate": 0.00021831812707958376, + "loss": 0.0054, + "num_input_tokens_seen": 191063408, + "step": 88450 + }, + { + "epoch": 14.429853181076671, + "grad_norm": 0.03946414217352867, + "learning_rate": 0.00021825932052492946, + "loss": 0.1331, + "num_input_tokens_seen": 191074352, + "step": 88455 + }, + { + "epoch": 14.430668841761827, + "grad_norm": 0.01283244974911213, + "learning_rate": 0.0002182005196801075, + "loss": 0.171, + "num_input_tokens_seen": 191084496, + "step": 88460 + }, + { + "epoch": 14.431484502446983, + "grad_norm": 0.11222032457590103, + "learning_rate": 0.0002181417245463095, + "loss": 0.0201, + "num_input_tokens_seen": 191095696, + "step": 88465 + }, + { + "epoch": 14.432300163132137, + "grad_norm": 0.013230902142822742, + "learning_rate": 0.00021808293512472698, + "loss": 0.0154, + "num_input_tokens_seen": 191106480, + "step": 88470 + }, + { + "epoch": 14.433115823817293, + "grad_norm": 0.004127667285501957, + "learning_rate": 0.0002180241514165514, + "loss": 0.0075, + "num_input_tokens_seen": 191118416, + "step": 88475 + }, + { + "epoch": 14.433931484502446, + "grad_norm": 0.08697032928466797, + "learning_rate": 0.00021796537342297413, + "loss": 0.0057, + "num_input_tokens_seen": 191129232, + "step": 88480 + }, + { + "epoch": 14.434747145187602, + "grad_norm": 0.08197087794542313, + "learning_rate": 0.00021790660114518633, + "loss": 0.0069, + "num_input_tokens_seen": 191139184, + "step": 88485 + }, + { + "epoch": 14.435562805872756, + "grad_norm": 0.37481895089149475, + "learning_rate": 0.0002178478345843792, + "loss": 0.011, + "num_input_tokens_seen": 191149456, + "step": 88490 + }, + { + "epoch": 14.436378466557912, + "grad_norm": 0.011596872471272945, + "learning_rate": 0.00021778907374174356, + "loss": 0.0041, + "num_input_tokens_seen": 191160656, + "step": 88495 + }, + { + "epoch": 14.437194127243067, + "grad_norm": 0.010168236680328846, + "learning_rate": 0.00021773031861847036, + "loss": 0.0052, + "num_input_tokens_seen": 191172336, + "step": 88500 + }, + { + "epoch": 14.438009787928221, + "grad_norm": 0.12285585701465607, + "learning_rate": 0.0002176715692157503, + "loss": 0.0087, + "num_input_tokens_seen": 191184176, + "step": 88505 + }, + { + "epoch": 14.438825448613377, + "grad_norm": 0.049924153834581375, + "learning_rate": 0.00021761282553477412, + "loss": 0.0053, + "num_input_tokens_seen": 191194256, + "step": 88510 + }, + { + "epoch": 14.439641109298531, + "grad_norm": 0.5651575326919556, + "learning_rate": 0.00021755408757673228, + "loss": 0.0149, + "num_input_tokens_seen": 191205584, + "step": 88515 + }, + { + "epoch": 14.440456769983687, + "grad_norm": 0.0005408431752584875, + "learning_rate": 0.00021749535534281488, + "loss": 0.0041, + "num_input_tokens_seen": 191217104, + "step": 88520 + }, + { + "epoch": 14.441272430668842, + "grad_norm": 0.007131251972168684, + "learning_rate": 0.00021743662883421294, + "loss": 0.0016, + "num_input_tokens_seen": 191228176, + "step": 88525 + }, + { + "epoch": 14.442088091353996, + "grad_norm": 0.0007430678233504295, + "learning_rate": 0.00021737790805211578, + "loss": 0.0025, + "num_input_tokens_seen": 191237872, + "step": 88530 + }, + { + "epoch": 14.442903752039152, + "grad_norm": 0.0031173175666481256, + "learning_rate": 0.00021731919299771424, + "loss": 0.007, + "num_input_tokens_seen": 191249040, + "step": 88535 + }, + { + "epoch": 14.443719412724306, + "grad_norm": 0.0021781879477202892, + "learning_rate": 0.00021726048367219747, + "loss": 0.0029, + "num_input_tokens_seen": 191260848, + "step": 88540 + }, + { + "epoch": 14.444535073409462, + "grad_norm": 0.0342792384326458, + "learning_rate": 0.00021720178007675583, + "loss": 0.0022, + "num_input_tokens_seen": 191272304, + "step": 88545 + }, + { + "epoch": 14.445350734094617, + "grad_norm": 0.14234837889671326, + "learning_rate": 0.00021714308221257889, + "loss": 0.0083, + "num_input_tokens_seen": 191281904, + "step": 88550 + }, + { + "epoch": 14.446166394779771, + "grad_norm": 0.03180849552154541, + "learning_rate": 0.00021708439008085624, + "loss": 0.0165, + "num_input_tokens_seen": 191292304, + "step": 88555 + }, + { + "epoch": 14.446982055464927, + "grad_norm": 0.020385757088661194, + "learning_rate": 0.0002170257036827773, + "loss": 0.0034, + "num_input_tokens_seen": 191303696, + "step": 88560 + }, + { + "epoch": 14.447797716150081, + "grad_norm": 0.0029727816581726074, + "learning_rate": 0.00021696702301953147, + "loss": 0.0619, + "num_input_tokens_seen": 191313648, + "step": 88565 + }, + { + "epoch": 14.448613376835237, + "grad_norm": 0.007808151189237833, + "learning_rate": 0.00021690834809230797, + "loss": 0.0053, + "num_input_tokens_seen": 191323856, + "step": 88570 + }, + { + "epoch": 14.449429037520392, + "grad_norm": 0.005912197753787041, + "learning_rate": 0.00021684967890229595, + "loss": 0.0058, + "num_input_tokens_seen": 191334512, + "step": 88575 + }, + { + "epoch": 14.450244698205546, + "grad_norm": 0.5385004878044128, + "learning_rate": 0.00021679101545068436, + "loss": 0.2575, + "num_input_tokens_seen": 191345616, + "step": 88580 + }, + { + "epoch": 14.451060358890702, + "grad_norm": 0.004685994237661362, + "learning_rate": 0.00021673235773866212, + "loss": 0.0026, + "num_input_tokens_seen": 191356592, + "step": 88585 + }, + { + "epoch": 14.451876019575856, + "grad_norm": 0.0021670262794941664, + "learning_rate": 0.00021667370576741802, + "loss": 0.0019, + "num_input_tokens_seen": 191367792, + "step": 88590 + }, + { + "epoch": 14.452691680261012, + "grad_norm": 0.02692868933081627, + "learning_rate": 0.00021661505953814064, + "loss": 0.0081, + "num_input_tokens_seen": 191378928, + "step": 88595 + }, + { + "epoch": 14.453507340946166, + "grad_norm": 0.005074275657534599, + "learning_rate": 0.0002165564190520186, + "loss": 0.0835, + "num_input_tokens_seen": 191390224, + "step": 88600 + }, + { + "epoch": 14.454323001631321, + "grad_norm": 0.024734511971473694, + "learning_rate": 0.00021649778431024035, + "loss": 0.0061, + "num_input_tokens_seen": 191401232, + "step": 88605 + }, + { + "epoch": 14.455138662316477, + "grad_norm": 0.0013288380578160286, + "learning_rate": 0.0002164391553139941, + "loss": 0.0907, + "num_input_tokens_seen": 191411248, + "step": 88610 + }, + { + "epoch": 14.455954323001631, + "grad_norm": 0.002668878063559532, + "learning_rate": 0.00021638053206446813, + "loss": 0.0023, + "num_input_tokens_seen": 191421840, + "step": 88615 + }, + { + "epoch": 14.456769983686787, + "grad_norm": 0.01918703131377697, + "learning_rate": 0.00021632191456285045, + "loss": 0.0232, + "num_input_tokens_seen": 191432688, + "step": 88620 + }, + { + "epoch": 14.45758564437194, + "grad_norm": 0.28013843297958374, + "learning_rate": 0.00021626330281032902, + "loss": 0.027, + "num_input_tokens_seen": 191443312, + "step": 88625 + }, + { + "epoch": 14.458401305057096, + "grad_norm": 0.026643967255949974, + "learning_rate": 0.00021620469680809173, + "loss": 0.0042, + "num_input_tokens_seen": 191453104, + "step": 88630 + }, + { + "epoch": 14.459216965742252, + "grad_norm": 0.0020155615638941526, + "learning_rate": 0.0002161460965573263, + "loss": 0.0111, + "num_input_tokens_seen": 191464144, + "step": 88635 + }, + { + "epoch": 14.460032626427406, + "grad_norm": 0.012268748134374619, + "learning_rate": 0.0002160875020592203, + "loss": 0.0107, + "num_input_tokens_seen": 191475088, + "step": 88640 + }, + { + "epoch": 14.460848287112562, + "grad_norm": 0.002203174866735935, + "learning_rate": 0.00021602891331496123, + "loss": 0.0217, + "num_input_tokens_seen": 191485840, + "step": 88645 + }, + { + "epoch": 14.461663947797716, + "grad_norm": 0.0011056186631321907, + "learning_rate": 0.0002159703303257363, + "loss": 0.0051, + "num_input_tokens_seen": 191496720, + "step": 88650 + }, + { + "epoch": 14.462479608482871, + "grad_norm": 0.05333087965846062, + "learning_rate": 0.00021591175309273314, + "loss": 0.0722, + "num_input_tokens_seen": 191507728, + "step": 88655 + }, + { + "epoch": 14.463295269168025, + "grad_norm": 0.013012475334107876, + "learning_rate": 0.00021585318161713868, + "loss": 0.0041, + "num_input_tokens_seen": 191518096, + "step": 88660 + }, + { + "epoch": 14.464110929853181, + "grad_norm": 0.013713795691728592, + "learning_rate": 0.00021579461590013994, + "loss": 0.0039, + "num_input_tokens_seen": 191528560, + "step": 88665 + }, + { + "epoch": 14.464926590538337, + "grad_norm": 0.0031545383390039206, + "learning_rate": 0.0002157360559429239, + "loss": 0.0022, + "num_input_tokens_seen": 191539024, + "step": 88670 + }, + { + "epoch": 14.46574225122349, + "grad_norm": 0.03223045915365219, + "learning_rate": 0.00021567750174667722, + "loss": 0.1562, + "num_input_tokens_seen": 191550672, + "step": 88675 + }, + { + "epoch": 14.466557911908646, + "grad_norm": 0.000889264396391809, + "learning_rate": 0.00021561895331258674, + "loss": 0.0017, + "num_input_tokens_seen": 191561456, + "step": 88680 + }, + { + "epoch": 14.4673735725938, + "grad_norm": 0.004097413271665573, + "learning_rate": 0.0002155604106418389, + "loss": 0.0053, + "num_input_tokens_seen": 191573104, + "step": 88685 + }, + { + "epoch": 14.468189233278956, + "grad_norm": 0.023272814229130745, + "learning_rate": 0.00021550187373562015, + "loss": 0.0043, + "num_input_tokens_seen": 191582832, + "step": 88690 + }, + { + "epoch": 14.469004893964112, + "grad_norm": 0.16478866338729858, + "learning_rate": 0.00021544334259511688, + "loss": 0.0101, + "num_input_tokens_seen": 191592784, + "step": 88695 + }, + { + "epoch": 14.469820554649266, + "grad_norm": 0.33433669805526733, + "learning_rate": 0.0002153848172215152, + "loss": 0.0139, + "num_input_tokens_seen": 191603344, + "step": 88700 + }, + { + "epoch": 14.470636215334421, + "grad_norm": 0.0031498237513005733, + "learning_rate": 0.00021532629761600132, + "loss": 0.1815, + "num_input_tokens_seen": 191613744, + "step": 88705 + }, + { + "epoch": 14.471451876019575, + "grad_norm": 0.07845421880483627, + "learning_rate": 0.00021526778377976114, + "loss": 0.0034, + "num_input_tokens_seen": 191624784, + "step": 88710 + }, + { + "epoch": 14.47226753670473, + "grad_norm": 0.4229518473148346, + "learning_rate": 0.00021520927571398052, + "loss": 0.1305, + "num_input_tokens_seen": 191635440, + "step": 88715 + }, + { + "epoch": 14.473083197389887, + "grad_norm": 0.012796735391020775, + "learning_rate": 0.00021515077341984523, + "loss": 0.0038, + "num_input_tokens_seen": 191644880, + "step": 88720 + }, + { + "epoch": 14.47389885807504, + "grad_norm": 0.07664711028337479, + "learning_rate": 0.00021509227689854083, + "loss": 0.0047, + "num_input_tokens_seen": 191655120, + "step": 88725 + }, + { + "epoch": 14.474714518760196, + "grad_norm": 0.013845582492649555, + "learning_rate": 0.0002150337861512529, + "loss": 0.0076, + "num_input_tokens_seen": 191666032, + "step": 88730 + }, + { + "epoch": 14.47553017944535, + "grad_norm": 0.019843915477395058, + "learning_rate": 0.0002149753011791668, + "loss": 0.0052, + "num_input_tokens_seen": 191675760, + "step": 88735 + }, + { + "epoch": 14.476345840130506, + "grad_norm": 0.033653657883405685, + "learning_rate": 0.00021491682198346778, + "loss": 0.2274, + "num_input_tokens_seen": 191685584, + "step": 88740 + }, + { + "epoch": 14.477161500815662, + "grad_norm": 0.004855956416577101, + "learning_rate": 0.00021485834856534104, + "loss": 0.0764, + "num_input_tokens_seen": 191695632, + "step": 88745 + }, + { + "epoch": 14.477977161500815, + "grad_norm": 0.026927856728434563, + "learning_rate": 0.00021479988092597157, + "loss": 0.0287, + "num_input_tokens_seen": 191706416, + "step": 88750 + }, + { + "epoch": 14.478792822185971, + "grad_norm": 0.0082674166187644, + "learning_rate": 0.00021474141906654414, + "loss": 0.0029, + "num_input_tokens_seen": 191717936, + "step": 88755 + }, + { + "epoch": 14.479608482871125, + "grad_norm": 0.013341606594622135, + "learning_rate": 0.00021468296298824413, + "loss": 0.0058, + "num_input_tokens_seen": 191728976, + "step": 88760 + }, + { + "epoch": 14.48042414355628, + "grad_norm": 0.008929502218961716, + "learning_rate": 0.00021462451269225547, + "loss": 0.0033, + "num_input_tokens_seen": 191740272, + "step": 88765 + }, + { + "epoch": 14.481239804241435, + "grad_norm": 0.007938308641314507, + "learning_rate": 0.00021456606817976337, + "loss": 0.003, + "num_input_tokens_seen": 191751888, + "step": 88770 + }, + { + "epoch": 14.48205546492659, + "grad_norm": 0.015212338417768478, + "learning_rate": 0.00021450762945195167, + "loss": 0.0076, + "num_input_tokens_seen": 191763728, + "step": 88775 + }, + { + "epoch": 14.482871125611746, + "grad_norm": 1.0990099906921387, + "learning_rate": 0.00021444919651000544, + "loss": 0.0907, + "num_input_tokens_seen": 191774640, + "step": 88780 + }, + { + "epoch": 14.4836867862969, + "grad_norm": 0.03344845771789551, + "learning_rate": 0.0002143907693551081, + "loss": 0.0256, + "num_input_tokens_seen": 191784656, + "step": 88785 + }, + { + "epoch": 14.484502446982056, + "grad_norm": 0.019984211772680283, + "learning_rate": 0.00021433234798844448, + "loss": 0.0044, + "num_input_tokens_seen": 191796848, + "step": 88790 + }, + { + "epoch": 14.48531810766721, + "grad_norm": 0.02110087312757969, + "learning_rate": 0.00021427393241119785, + "loss": 0.0102, + "num_input_tokens_seen": 191806032, + "step": 88795 + }, + { + "epoch": 14.486133768352365, + "grad_norm": 0.0715024396777153, + "learning_rate": 0.00021421552262455268, + "loss": 0.0311, + "num_input_tokens_seen": 191817328, + "step": 88800 + }, + { + "epoch": 14.486949429037521, + "grad_norm": 0.008351379074156284, + "learning_rate": 0.00021415711862969244, + "loss": 0.0112, + "num_input_tokens_seen": 191827888, + "step": 88805 + }, + { + "epoch": 14.487765089722675, + "grad_norm": 0.01090281642973423, + "learning_rate": 0.00021409872042780083, + "loss": 0.0766, + "num_input_tokens_seen": 191839280, + "step": 88810 + }, + { + "epoch": 14.48858075040783, + "grad_norm": 0.01043806690722704, + "learning_rate": 0.00021404032802006134, + "loss": 0.0061, + "num_input_tokens_seen": 191848784, + "step": 88815 + }, + { + "epoch": 14.489396411092985, + "grad_norm": 0.0038927318528294563, + "learning_rate": 0.00021398194140765736, + "loss": 0.0051, + "num_input_tokens_seen": 191858800, + "step": 88820 + }, + { + "epoch": 14.49021207177814, + "grad_norm": 0.005791252013295889, + "learning_rate": 0.0002139235605917722, + "loss": 0.0062, + "num_input_tokens_seen": 191870608, + "step": 88825 + }, + { + "epoch": 14.491027732463296, + "grad_norm": 0.003040252486243844, + "learning_rate": 0.00021386518557358898, + "loss": 0.0671, + "num_input_tokens_seen": 191881296, + "step": 88830 + }, + { + "epoch": 14.49184339314845, + "grad_norm": 0.029066117480397224, + "learning_rate": 0.00021380681635429079, + "loss": 0.0905, + "num_input_tokens_seen": 191892400, + "step": 88835 + }, + { + "epoch": 14.492659053833606, + "grad_norm": 0.028188761323690414, + "learning_rate": 0.00021374845293506046, + "loss": 0.0201, + "num_input_tokens_seen": 191901680, + "step": 88840 + }, + { + "epoch": 14.49347471451876, + "grad_norm": 0.011652766726911068, + "learning_rate": 0.00021369009531708094, + "loss": 0.0104, + "num_input_tokens_seen": 191911824, + "step": 88845 + }, + { + "epoch": 14.494290375203915, + "grad_norm": 0.006433951668441296, + "learning_rate": 0.0002136317435015348, + "loss": 0.0065, + "num_input_tokens_seen": 191922992, + "step": 88850 + }, + { + "epoch": 14.49510603588907, + "grad_norm": 0.02798541635274887, + "learning_rate": 0.0002135733974896047, + "loss": 0.0054, + "num_input_tokens_seen": 191932496, + "step": 88855 + }, + { + "epoch": 14.495921696574225, + "grad_norm": 0.007322490680962801, + "learning_rate": 0.00021351505728247282, + "loss": 0.0075, + "num_input_tokens_seen": 191942672, + "step": 88860 + }, + { + "epoch": 14.49673735725938, + "grad_norm": 0.015419857576489449, + "learning_rate": 0.00021345672288132218, + "loss": 0.0126, + "num_input_tokens_seen": 191953936, + "step": 88865 + }, + { + "epoch": 14.497553017944535, + "grad_norm": 0.05873335525393486, + "learning_rate": 0.00021339839428733415, + "loss": 0.0052, + "num_input_tokens_seen": 191963504, + "step": 88870 + }, + { + "epoch": 14.49836867862969, + "grad_norm": 0.08444296568632126, + "learning_rate": 0.0002133400715016916, + "loss": 0.0254, + "num_input_tokens_seen": 191973776, + "step": 88875 + }, + { + "epoch": 14.499184339314844, + "grad_norm": 0.01983826793730259, + "learning_rate": 0.0002132817545255758, + "loss": 0.0081, + "num_input_tokens_seen": 191982800, + "step": 88880 + }, + { + "epoch": 14.5, + "grad_norm": 0.020987823605537415, + "learning_rate": 0.0002132234433601693, + "loss": 0.0055, + "num_input_tokens_seen": 191994608, + "step": 88885 + }, + { + "epoch": 14.500815660685156, + "grad_norm": 0.05628190189599991, + "learning_rate": 0.00021316513800665322, + "loss": 0.0155, + "num_input_tokens_seen": 192004112, + "step": 88890 + }, + { + "epoch": 14.50163132137031, + "grad_norm": 0.013542087748646736, + "learning_rate": 0.0002131068384662098, + "loss": 0.0026, + "num_input_tokens_seen": 192016720, + "step": 88895 + }, + { + "epoch": 14.502446982055465, + "grad_norm": 0.01771514117717743, + "learning_rate": 0.00021304854474001993, + "loss": 0.003, + "num_input_tokens_seen": 192027760, + "step": 88900 + }, + { + "epoch": 14.50326264274062, + "grad_norm": 0.021325405687093735, + "learning_rate": 0.00021299025682926565, + "loss": 0.0015, + "num_input_tokens_seen": 192038992, + "step": 88905 + }, + { + "epoch": 14.504078303425775, + "grad_norm": 0.002273843390867114, + "learning_rate": 0.0002129319747351276, + "loss": 0.0025, + "num_input_tokens_seen": 192050032, + "step": 88910 + }, + { + "epoch": 14.50489396411093, + "grad_norm": 0.20562784373760223, + "learning_rate": 0.00021287369845878756, + "loss": 0.0154, + "num_input_tokens_seen": 192060400, + "step": 88915 + }, + { + "epoch": 14.505709624796085, + "grad_norm": 0.002092252252623439, + "learning_rate": 0.00021281542800142595, + "loss": 0.0157, + "num_input_tokens_seen": 192070448, + "step": 88920 + }, + { + "epoch": 14.50652528548124, + "grad_norm": 0.025624606758356094, + "learning_rate": 0.00021275716336422435, + "loss": 0.0052, + "num_input_tokens_seen": 192081712, + "step": 88925 + }, + { + "epoch": 14.507340946166394, + "grad_norm": 0.03953010216355324, + "learning_rate": 0.00021269890454836288, + "loss": 0.0084, + "num_input_tokens_seen": 192092752, + "step": 88930 + }, + { + "epoch": 14.50815660685155, + "grad_norm": 0.010210723616182804, + "learning_rate": 0.00021264065155502293, + "loss": 0.0042, + "num_input_tokens_seen": 192104112, + "step": 88935 + }, + { + "epoch": 14.508972267536706, + "grad_norm": 0.00433401670306921, + "learning_rate": 0.00021258240438538434, + "loss": 0.0158, + "num_input_tokens_seen": 192114448, + "step": 88940 + }, + { + "epoch": 14.50978792822186, + "grad_norm": 0.006997460499405861, + "learning_rate": 0.0002125241630406281, + "loss": 0.005, + "num_input_tokens_seen": 192124176, + "step": 88945 + }, + { + "epoch": 14.510603588907015, + "grad_norm": 0.004508296027779579, + "learning_rate": 0.00021246592752193445, + "loss": 0.1081, + "num_input_tokens_seen": 192134960, + "step": 88950 + }, + { + "epoch": 14.51141924959217, + "grad_norm": 0.0038452772423624992, + "learning_rate": 0.00021240769783048352, + "loss": 0.0026, + "num_input_tokens_seen": 192145648, + "step": 88955 + }, + { + "epoch": 14.512234910277325, + "grad_norm": 0.009463680908083916, + "learning_rate": 0.00021234947396745542, + "loss": 0.0458, + "num_input_tokens_seen": 192156560, + "step": 88960 + }, + { + "epoch": 14.513050570962479, + "grad_norm": 0.005169307813048363, + "learning_rate": 0.00021229125593403016, + "loss": 0.0136, + "num_input_tokens_seen": 192167184, + "step": 88965 + }, + { + "epoch": 14.513866231647635, + "grad_norm": 0.0034116103779524565, + "learning_rate": 0.00021223304373138753, + "loss": 0.0045, + "num_input_tokens_seen": 192178608, + "step": 88970 + }, + { + "epoch": 14.51468189233279, + "grad_norm": 0.014676249586045742, + "learning_rate": 0.00021217483736070736, + "loss": 0.0055, + "num_input_tokens_seen": 192189328, + "step": 88975 + }, + { + "epoch": 14.515497553017944, + "grad_norm": 0.039021145552396774, + "learning_rate": 0.00021211663682316922, + "loss": 0.025, + "num_input_tokens_seen": 192200880, + "step": 88980 + }, + { + "epoch": 14.5163132137031, + "grad_norm": 0.026364067569375038, + "learning_rate": 0.00021205844211995268, + "loss": 0.0033, + "num_input_tokens_seen": 192211504, + "step": 88985 + }, + { + "epoch": 14.517128874388254, + "grad_norm": 0.051095642149448395, + "learning_rate": 0.0002120002532522371, + "loss": 0.0083, + "num_input_tokens_seen": 192223216, + "step": 88990 + }, + { + "epoch": 14.51794453507341, + "grad_norm": 0.009368719533085823, + "learning_rate": 0.00021194207022120153, + "loss": 0.0031, + "num_input_tokens_seen": 192234640, + "step": 88995 + }, + { + "epoch": 14.518760195758565, + "grad_norm": 0.005646569188684225, + "learning_rate": 0.0002118838930280257, + "loss": 0.0032, + "num_input_tokens_seen": 192244752, + "step": 89000 + }, + { + "epoch": 14.51957585644372, + "grad_norm": 0.0006784353754483163, + "learning_rate": 0.00021182572167388792, + "loss": 0.0012, + "num_input_tokens_seen": 192254448, + "step": 89005 + }, + { + "epoch": 14.520391517128875, + "grad_norm": 0.013773053884506226, + "learning_rate": 0.00021176755615996785, + "loss": 0.0016, + "num_input_tokens_seen": 192265584, + "step": 89010 + }, + { + "epoch": 14.521207177814029, + "grad_norm": 0.014610863290727139, + "learning_rate": 0.00021170939648744346, + "loss": 0.0882, + "num_input_tokens_seen": 192277200, + "step": 89015 + }, + { + "epoch": 14.522022838499185, + "grad_norm": 0.06779501587152481, + "learning_rate": 0.00021165124265749431, + "loss": 0.0883, + "num_input_tokens_seen": 192288848, + "step": 89020 + }, + { + "epoch": 14.522838499184338, + "grad_norm": 0.0008561646100133657, + "learning_rate": 0.00021159309467129816, + "loss": 0.0024, + "num_input_tokens_seen": 192299056, + "step": 89025 + }, + { + "epoch": 14.523654159869494, + "grad_norm": 0.08283943682909012, + "learning_rate": 0.0002115349525300342, + "loss": 0.006, + "num_input_tokens_seen": 192309232, + "step": 89030 + }, + { + "epoch": 14.52446982055465, + "grad_norm": 0.037029191851615906, + "learning_rate": 0.00021147681623487997, + "loss": 0.0038, + "num_input_tokens_seen": 192320112, + "step": 89035 + }, + { + "epoch": 14.525285481239804, + "grad_norm": 0.01584443263709545, + "learning_rate": 0.0002114186857870144, + "loss": 0.0016, + "num_input_tokens_seen": 192330096, + "step": 89040 + }, + { + "epoch": 14.52610114192496, + "grad_norm": 0.10031913965940475, + "learning_rate": 0.00021136056118761494, + "loss": 0.0497, + "num_input_tokens_seen": 192341744, + "step": 89045 + }, + { + "epoch": 14.526916802610113, + "grad_norm": 0.04384492337703705, + "learning_rate": 0.00021130244243786024, + "loss": 0.0035, + "num_input_tokens_seen": 192353200, + "step": 89050 + }, + { + "epoch": 14.52773246329527, + "grad_norm": 0.006799424532800913, + "learning_rate": 0.00021124432953892742, + "loss": 0.0105, + "num_input_tokens_seen": 192364496, + "step": 89055 + }, + { + "epoch": 14.528548123980425, + "grad_norm": 0.004936617333441973, + "learning_rate": 0.00021118622249199494, + "loss": 0.0723, + "num_input_tokens_seen": 192373680, + "step": 89060 + }, + { + "epoch": 14.529363784665579, + "grad_norm": 0.003013910725712776, + "learning_rate": 0.00021112812129823967, + "loss": 0.0061, + "num_input_tokens_seen": 192384496, + "step": 89065 + }, + { + "epoch": 14.530179445350734, + "grad_norm": 0.5813406109809875, + "learning_rate": 0.00021107002595883978, + "loss": 0.0227, + "num_input_tokens_seen": 192394768, + "step": 89070 + }, + { + "epoch": 14.530995106035888, + "grad_norm": 0.00857462827116251, + "learning_rate": 0.00021101193647497208, + "loss": 0.0353, + "num_input_tokens_seen": 192406736, + "step": 89075 + }, + { + "epoch": 14.531810766721044, + "grad_norm": 0.013798504136502743, + "learning_rate": 0.00021095385284781426, + "loss": 0.0096, + "num_input_tokens_seen": 192418192, + "step": 89080 + }, + { + "epoch": 14.5326264274062, + "grad_norm": 0.005595772061496973, + "learning_rate": 0.00021089577507854324, + "loss": 0.0981, + "num_input_tokens_seen": 192429424, + "step": 89085 + }, + { + "epoch": 14.533442088091354, + "grad_norm": 0.035856693983078, + "learning_rate": 0.00021083770316833618, + "loss": 0.0431, + "num_input_tokens_seen": 192439952, + "step": 89090 + }, + { + "epoch": 14.53425774877651, + "grad_norm": 0.0023227352648973465, + "learning_rate": 0.00021077963711836983, + "loss": 0.0029, + "num_input_tokens_seen": 192450096, + "step": 89095 + }, + { + "epoch": 14.535073409461663, + "grad_norm": 0.3182942569255829, + "learning_rate": 0.00021072157692982103, + "loss": 0.0081, + "num_input_tokens_seen": 192461456, + "step": 89100 + }, + { + "epoch": 14.535889070146819, + "grad_norm": 0.00510216411203146, + "learning_rate": 0.00021066352260386644, + "loss": 0.0035, + "num_input_tokens_seen": 192471824, + "step": 89105 + }, + { + "epoch": 14.536704730831975, + "grad_norm": 0.07574431598186493, + "learning_rate": 0.0002106054741416827, + "loss": 0.0036, + "num_input_tokens_seen": 192482320, + "step": 89110 + }, + { + "epoch": 14.537520391517129, + "grad_norm": 0.004888062831014395, + "learning_rate": 0.00021054743154444607, + "loss": 0.0048, + "num_input_tokens_seen": 192492816, + "step": 89115 + }, + { + "epoch": 14.538336052202284, + "grad_norm": 0.016683122143149376, + "learning_rate": 0.00021048939481333297, + "loss": 0.0033, + "num_input_tokens_seen": 192504016, + "step": 89120 + }, + { + "epoch": 14.539151712887438, + "grad_norm": 0.006947814021259546, + "learning_rate": 0.00021043136394951955, + "loss": 0.125, + "num_input_tokens_seen": 192514416, + "step": 89125 + }, + { + "epoch": 14.539967373572594, + "grad_norm": 0.01835622824728489, + "learning_rate": 0.00021037333895418186, + "loss": 0.0437, + "num_input_tokens_seen": 192524336, + "step": 89130 + }, + { + "epoch": 14.540783034257748, + "grad_norm": 0.0712086632847786, + "learning_rate": 0.0002103153198284959, + "loss": 0.0035, + "num_input_tokens_seen": 192533200, + "step": 89135 + }, + { + "epoch": 14.541598694942904, + "grad_norm": 0.002161943819373846, + "learning_rate": 0.0002102573065736373, + "loss": 0.0038, + "num_input_tokens_seen": 192545104, + "step": 89140 + }, + { + "epoch": 14.54241435562806, + "grad_norm": 0.0018386748852208257, + "learning_rate": 0.00021019929919078228, + "loss": 0.009, + "num_input_tokens_seen": 192555856, + "step": 89145 + }, + { + "epoch": 14.543230016313213, + "grad_norm": 0.8079872727394104, + "learning_rate": 0.00021014129768110574, + "loss": 0.098, + "num_input_tokens_seen": 192566192, + "step": 89150 + }, + { + "epoch": 14.544045676998369, + "grad_norm": 0.020592303946614265, + "learning_rate": 0.0002100833020457839, + "loss": 0.0136, + "num_input_tokens_seen": 192577680, + "step": 89155 + }, + { + "epoch": 14.544861337683523, + "grad_norm": 0.5749585628509521, + "learning_rate": 0.00021002531228599136, + "loss": 0.0973, + "num_input_tokens_seen": 192588400, + "step": 89160 + }, + { + "epoch": 14.545676998368679, + "grad_norm": 0.017069535329937935, + "learning_rate": 0.00020996732840290405, + "loss": 0.0057, + "num_input_tokens_seen": 192599376, + "step": 89165 + }, + { + "epoch": 14.546492659053834, + "grad_norm": 0.009924108162522316, + "learning_rate": 0.0002099093503976965, + "loss": 0.0044, + "num_input_tokens_seen": 192610960, + "step": 89170 + }, + { + "epoch": 14.547308319738988, + "grad_norm": 0.24204865097999573, + "learning_rate": 0.0002098513782715442, + "loss": 0.0177, + "num_input_tokens_seen": 192621680, + "step": 89175 + }, + { + "epoch": 14.548123980424144, + "grad_norm": 0.05922012776136398, + "learning_rate": 0.00020979341202562152, + "loss": 0.0034, + "num_input_tokens_seen": 192633328, + "step": 89180 + }, + { + "epoch": 14.548939641109298, + "grad_norm": 0.05222422257065773, + "learning_rate": 0.00020973545166110368, + "loss": 0.0051, + "num_input_tokens_seen": 192644816, + "step": 89185 + }, + { + "epoch": 14.549755301794454, + "grad_norm": 0.015433356165885925, + "learning_rate": 0.00020967749717916513, + "loss": 0.006, + "num_input_tokens_seen": 192655760, + "step": 89190 + }, + { + "epoch": 14.550570962479608, + "grad_norm": 0.10926583409309387, + "learning_rate": 0.00020961954858098037, + "loss": 0.009, + "num_input_tokens_seen": 192666096, + "step": 89195 + }, + { + "epoch": 14.551386623164763, + "grad_norm": 0.0032105380669236183, + "learning_rate": 0.0002095616058677239, + "loss": 0.0081, + "num_input_tokens_seen": 192675440, + "step": 89200 + }, + { + "epoch": 14.552202283849919, + "grad_norm": 0.14845839142799377, + "learning_rate": 0.00020950366904056984, + "loss": 0.0055, + "num_input_tokens_seen": 192686416, + "step": 89205 + }, + { + "epoch": 14.553017944535073, + "grad_norm": 0.008143202401697636, + "learning_rate": 0.00020944573810069252, + "loss": 0.005, + "num_input_tokens_seen": 192697936, + "step": 89210 + }, + { + "epoch": 14.553833605220229, + "grad_norm": 0.0077461740002036095, + "learning_rate": 0.00020938781304926586, + "loss": 0.0032, + "num_input_tokens_seen": 192709008, + "step": 89215 + }, + { + "epoch": 14.554649265905383, + "grad_norm": 0.0006443687016144395, + "learning_rate": 0.00020932989388746387, + "loss": 0.0048, + "num_input_tokens_seen": 192717552, + "step": 89220 + }, + { + "epoch": 14.555464926590538, + "grad_norm": 0.0020855157636106014, + "learning_rate": 0.0002092719806164603, + "loss": 0.0122, + "num_input_tokens_seen": 192727888, + "step": 89225 + }, + { + "epoch": 14.556280587275694, + "grad_norm": 0.009736152365803719, + "learning_rate": 0.00020921407323742892, + "loss": 0.0016, + "num_input_tokens_seen": 192739984, + "step": 89230 + }, + { + "epoch": 14.557096247960848, + "grad_norm": 0.006911523174494505, + "learning_rate": 0.00020915617175154316, + "loss": 0.0034, + "num_input_tokens_seen": 192751312, + "step": 89235 + }, + { + "epoch": 14.557911908646004, + "grad_norm": 0.009532546624541283, + "learning_rate": 0.00020909827615997657, + "loss": 0.0135, + "num_input_tokens_seen": 192763248, + "step": 89240 + }, + { + "epoch": 14.558727569331158, + "grad_norm": 0.020412901416420937, + "learning_rate": 0.00020904038646390246, + "loss": 0.0175, + "num_input_tokens_seen": 192774672, + "step": 89245 + }, + { + "epoch": 14.559543230016313, + "grad_norm": 0.050955090671777725, + "learning_rate": 0.00020898250266449399, + "loss": 0.0121, + "num_input_tokens_seen": 192785552, + "step": 89250 + }, + { + "epoch": 14.560358890701469, + "grad_norm": 0.004365491680800915, + "learning_rate": 0.0002089246247629243, + "loss": 0.0032, + "num_input_tokens_seen": 192795952, + "step": 89255 + }, + { + "epoch": 14.561174551386623, + "grad_norm": 0.009529031813144684, + "learning_rate": 0.00020886675276036637, + "loss": 0.0023, + "num_input_tokens_seen": 192806544, + "step": 89260 + }, + { + "epoch": 14.561990212071779, + "grad_norm": 0.01040890347212553, + "learning_rate": 0.00020880888665799304, + "loss": 0.0029, + "num_input_tokens_seen": 192817520, + "step": 89265 + }, + { + "epoch": 14.562805872756933, + "grad_norm": 0.00223185820505023, + "learning_rate": 0.00020875102645697696, + "loss": 0.0021, + "num_input_tokens_seen": 192827920, + "step": 89270 + }, + { + "epoch": 14.563621533442088, + "grad_norm": 0.0598750114440918, + "learning_rate": 0.0002086931721584908, + "loss": 0.0057, + "num_input_tokens_seen": 192838736, + "step": 89275 + }, + { + "epoch": 14.564437194127244, + "grad_norm": 0.10287459939718246, + "learning_rate": 0.00020863532376370715, + "loss": 0.0063, + "num_input_tokens_seen": 192849776, + "step": 89280 + }, + { + "epoch": 14.565252854812398, + "grad_norm": 0.007824460975825787, + "learning_rate": 0.000208577481273798, + "loss": 0.0166, + "num_input_tokens_seen": 192858800, + "step": 89285 + }, + { + "epoch": 14.566068515497554, + "grad_norm": 0.026574891060590744, + "learning_rate": 0.00020851964468993612, + "loss": 0.1111, + "num_input_tokens_seen": 192870064, + "step": 89290 + }, + { + "epoch": 14.566884176182707, + "grad_norm": 0.0022794893011450768, + "learning_rate": 0.00020846181401329338, + "loss": 0.0026, + "num_input_tokens_seen": 192881872, + "step": 89295 + }, + { + "epoch": 14.567699836867863, + "grad_norm": 0.0017811759607866406, + "learning_rate": 0.00020840398924504188, + "loss": 0.0127, + "num_input_tokens_seen": 192892464, + "step": 89300 + }, + { + "epoch": 14.568515497553017, + "grad_norm": 0.043051186949014664, + "learning_rate": 0.0002083461703863534, + "loss": 0.0053, + "num_input_tokens_seen": 192903792, + "step": 89305 + }, + { + "epoch": 14.569331158238173, + "grad_norm": 0.009227285161614418, + "learning_rate": 0.0002082883574383998, + "loss": 0.0021, + "num_input_tokens_seen": 192914704, + "step": 89310 + }, + { + "epoch": 14.570146818923329, + "grad_norm": 0.12363433092832565, + "learning_rate": 0.00020823055040235266, + "loss": 0.0049, + "num_input_tokens_seen": 192925776, + "step": 89315 + }, + { + "epoch": 14.570962479608482, + "grad_norm": 0.039080094546079636, + "learning_rate": 0.0002081727492793836, + "loss": 0.0095, + "num_input_tokens_seen": 192936656, + "step": 89320 + }, + { + "epoch": 14.571778140293638, + "grad_norm": 0.0033353553153574467, + "learning_rate": 0.00020811495407066394, + "loss": 0.0063, + "num_input_tokens_seen": 192948048, + "step": 89325 + }, + { + "epoch": 14.572593800978792, + "grad_norm": 0.056662097573280334, + "learning_rate": 0.00020805716477736508, + "loss": 0.0373, + "num_input_tokens_seen": 192957840, + "step": 89330 + }, + { + "epoch": 14.573409461663948, + "grad_norm": 0.0010778683936223388, + "learning_rate": 0.00020799938140065804, + "loss": 0.0079, + "num_input_tokens_seen": 192968528, + "step": 89335 + }, + { + "epoch": 14.574225122349104, + "grad_norm": 0.0030743195675313473, + "learning_rate": 0.00020794160394171403, + "loss": 0.0025, + "num_input_tokens_seen": 192979088, + "step": 89340 + }, + { + "epoch": 14.575040783034257, + "grad_norm": 0.007394440937787294, + "learning_rate": 0.00020788383240170395, + "loss": 0.0124, + "num_input_tokens_seen": 192990640, + "step": 89345 + }, + { + "epoch": 14.575856443719413, + "grad_norm": 0.001514918520115316, + "learning_rate": 0.0002078260667817985, + "loss": 0.0017, + "num_input_tokens_seen": 193001840, + "step": 89350 + }, + { + "epoch": 14.576672104404567, + "grad_norm": 0.002615422708913684, + "learning_rate": 0.0002077683070831685, + "loss": 0.0697, + "num_input_tokens_seen": 193013232, + "step": 89355 + }, + { + "epoch": 14.577487765089723, + "grad_norm": 0.027480650693178177, + "learning_rate": 0.00020771055330698446, + "loss": 0.0028, + "num_input_tokens_seen": 193025040, + "step": 89360 + }, + { + "epoch": 14.578303425774878, + "grad_norm": 0.43630602955818176, + "learning_rate": 0.0002076528054544169, + "loss": 0.0143, + "num_input_tokens_seen": 193036464, + "step": 89365 + }, + { + "epoch": 14.579119086460032, + "grad_norm": 0.009855729527771473, + "learning_rate": 0.00020759506352663605, + "loss": 0.0073, + "num_input_tokens_seen": 193046896, + "step": 89370 + }, + { + "epoch": 14.579934747145188, + "grad_norm": 0.46799859404563904, + "learning_rate": 0.0002075373275248122, + "loss": 0.1245, + "num_input_tokens_seen": 193058192, + "step": 89375 + }, + { + "epoch": 14.580750407830342, + "grad_norm": 0.010498947463929653, + "learning_rate": 0.00020747959745011542, + "loss": 0.0054, + "num_input_tokens_seen": 193069808, + "step": 89380 + }, + { + "epoch": 14.581566068515498, + "grad_norm": 0.07971848547458649, + "learning_rate": 0.0002074218733037157, + "loss": 0.0047, + "num_input_tokens_seen": 193081392, + "step": 89385 + }, + { + "epoch": 14.582381729200652, + "grad_norm": 0.0011771656572818756, + "learning_rate": 0.00020736415508678285, + "loss": 0.0299, + "num_input_tokens_seen": 193092144, + "step": 89390 + }, + { + "epoch": 14.583197389885807, + "grad_norm": 0.07137224078178406, + "learning_rate": 0.0002073064428004865, + "loss": 0.0033, + "num_input_tokens_seen": 193103312, + "step": 89395 + }, + { + "epoch": 14.584013050570963, + "grad_norm": 0.004249855410307646, + "learning_rate": 0.00020724873644599668, + "loss": 0.0054, + "num_input_tokens_seen": 193115152, + "step": 89400 + }, + { + "epoch": 14.584828711256117, + "grad_norm": 0.08202840387821198, + "learning_rate": 0.0002071910360244823, + "loss": 0.0753, + "num_input_tokens_seen": 193125776, + "step": 89405 + }, + { + "epoch": 14.585644371941273, + "grad_norm": 0.017344074323773384, + "learning_rate": 0.0002071333415371134, + "loss": 0.0098, + "num_input_tokens_seen": 193136208, + "step": 89410 + }, + { + "epoch": 14.586460032626427, + "grad_norm": 0.013641850091516972, + "learning_rate": 0.00020707565298505842, + "loss": 0.0018, + "num_input_tokens_seen": 193148208, + "step": 89415 + }, + { + "epoch": 14.587275693311582, + "grad_norm": 0.002559355227276683, + "learning_rate": 0.00020701797036948739, + "loss": 0.0024, + "num_input_tokens_seen": 193159088, + "step": 89420 + }, + { + "epoch": 14.588091353996738, + "grad_norm": 0.003523677121847868, + "learning_rate": 0.00020696029369156844, + "loss": 0.0019, + "num_input_tokens_seen": 193169648, + "step": 89425 + }, + { + "epoch": 14.588907014681892, + "grad_norm": 0.0047660889104008675, + "learning_rate": 0.0002069026229524711, + "loss": 0.0038, + "num_input_tokens_seen": 193180624, + "step": 89430 + }, + { + "epoch": 14.589722675367048, + "grad_norm": 0.0020290075335651636, + "learning_rate": 0.00020684495815336392, + "loss": 0.0009, + "num_input_tokens_seen": 193191408, + "step": 89435 + }, + { + "epoch": 14.590538336052202, + "grad_norm": 0.0014430329902097583, + "learning_rate": 0.00020678729929541552, + "loss": 0.0032, + "num_input_tokens_seen": 193201008, + "step": 89440 + }, + { + "epoch": 14.591353996737357, + "grad_norm": 0.0040334672667086124, + "learning_rate": 0.00020672964637979453, + "loss": 0.0055, + "num_input_tokens_seen": 193211120, + "step": 89445 + }, + { + "epoch": 14.592169657422513, + "grad_norm": 0.0017633815295994282, + "learning_rate": 0.00020667199940766924, + "loss": 0.0084, + "num_input_tokens_seen": 193221936, + "step": 89450 + }, + { + "epoch": 14.592985318107667, + "grad_norm": 0.7666230201721191, + "learning_rate": 0.00020661435838020798, + "loss": 0.0843, + "num_input_tokens_seen": 193233616, + "step": 89455 + }, + { + "epoch": 14.593800978792823, + "grad_norm": 0.005272711627185345, + "learning_rate": 0.000206556723298579, + "loss": 0.0025, + "num_input_tokens_seen": 193242992, + "step": 89460 + }, + { + "epoch": 14.594616639477977, + "grad_norm": 0.0023867280688136816, + "learning_rate": 0.00020649909416395025, + "loss": 0.0038, + "num_input_tokens_seen": 193252912, + "step": 89465 + }, + { + "epoch": 14.595432300163132, + "grad_norm": 0.016159815713763237, + "learning_rate": 0.00020644147097748967, + "loss": 0.0012, + "num_input_tokens_seen": 193264304, + "step": 89470 + }, + { + "epoch": 14.596247960848288, + "grad_norm": 0.0027893621008843184, + "learning_rate": 0.0002063838537403651, + "loss": 0.0108, + "num_input_tokens_seen": 193275632, + "step": 89475 + }, + { + "epoch": 14.597063621533442, + "grad_norm": 0.00795994233340025, + "learning_rate": 0.00020632624245374426, + "loss": 0.0085, + "num_input_tokens_seen": 193285680, + "step": 89480 + }, + { + "epoch": 14.597879282218598, + "grad_norm": 0.014036700129508972, + "learning_rate": 0.0002062686371187946, + "loss": 0.0968, + "num_input_tokens_seen": 193295792, + "step": 89485 + }, + { + "epoch": 14.598694942903752, + "grad_norm": 0.0036570588126778603, + "learning_rate": 0.00020621103773668366, + "loss": 0.0009, + "num_input_tokens_seen": 193306320, + "step": 89490 + }, + { + "epoch": 14.599510603588907, + "grad_norm": 0.004171954933553934, + "learning_rate": 0.00020615344430857874, + "loss": 0.1716, + "num_input_tokens_seen": 193317616, + "step": 89495 + }, + { + "epoch": 14.600326264274061, + "grad_norm": 0.005262458231300116, + "learning_rate": 0.00020609585683564687, + "loss": 0.0025, + "num_input_tokens_seen": 193328784, + "step": 89500 + }, + { + "epoch": 14.601141924959217, + "grad_norm": 0.0018460382707417011, + "learning_rate": 0.00020603827531905566, + "loss": 0.0877, + "num_input_tokens_seen": 193339408, + "step": 89505 + }, + { + "epoch": 14.601957585644373, + "grad_norm": 0.0008219721494242549, + "learning_rate": 0.00020598069975997135, + "loss": 0.0909, + "num_input_tokens_seen": 193349744, + "step": 89510 + }, + { + "epoch": 14.602773246329527, + "grad_norm": 0.008539113216102123, + "learning_rate": 0.0002059231301595615, + "loss": 0.0013, + "num_input_tokens_seen": 193360528, + "step": 89515 + }, + { + "epoch": 14.603588907014682, + "grad_norm": 0.008514752611517906, + "learning_rate": 0.00020586556651899213, + "loss": 0.0024, + "num_input_tokens_seen": 193371792, + "step": 89520 + }, + { + "epoch": 14.604404567699836, + "grad_norm": 0.138560950756073, + "learning_rate": 0.00020580800883943058, + "loss": 0.0106, + "num_input_tokens_seen": 193381392, + "step": 89525 + }, + { + "epoch": 14.605220228384992, + "grad_norm": 0.015043669380247593, + "learning_rate": 0.00020575045712204254, + "loss": 0.0061, + "num_input_tokens_seen": 193392272, + "step": 89530 + }, + { + "epoch": 14.606035889070148, + "grad_norm": 0.1300588846206665, + "learning_rate": 0.00020569291136799512, + "loss": 0.0147, + "num_input_tokens_seen": 193402800, + "step": 89535 + }, + { + "epoch": 14.606851549755302, + "grad_norm": 0.02232443168759346, + "learning_rate": 0.00020563537157845392, + "loss": 0.0043, + "num_input_tokens_seen": 193413008, + "step": 89540 + }, + { + "epoch": 14.607667210440457, + "grad_norm": 0.33156269788742065, + "learning_rate": 0.0002055778377545856, + "loss": 0.012, + "num_input_tokens_seen": 193424272, + "step": 89545 + }, + { + "epoch": 14.608482871125611, + "grad_norm": 0.4333055317401886, + "learning_rate": 0.0002055203098975556, + "loss": 0.1049, + "num_input_tokens_seen": 193434928, + "step": 89550 + }, + { + "epoch": 14.609298531810767, + "grad_norm": 0.00710340915247798, + "learning_rate": 0.00020546278800853048, + "loss": 0.0012, + "num_input_tokens_seen": 193447184, + "step": 89555 + }, + { + "epoch": 14.61011419249592, + "grad_norm": 1.8786109685897827, + "learning_rate": 0.00020540527208867522, + "loss": 0.2019, + "num_input_tokens_seen": 193458448, + "step": 89560 + }, + { + "epoch": 14.610929853181077, + "grad_norm": 0.00856061466038227, + "learning_rate": 0.00020534776213915619, + "loss": 0.0035, + "num_input_tokens_seen": 193469680, + "step": 89565 + }, + { + "epoch": 14.611745513866232, + "grad_norm": 2.7863962650299072, + "learning_rate": 0.00020529025816113817, + "loss": 0.3, + "num_input_tokens_seen": 193480848, + "step": 89570 + }, + { + "epoch": 14.612561174551386, + "grad_norm": 0.006839493755251169, + "learning_rate": 0.00020523276015578713, + "loss": 0.0012, + "num_input_tokens_seen": 193491280, + "step": 89575 + }, + { + "epoch": 14.613376835236542, + "grad_norm": 0.0007321059238165617, + "learning_rate": 0.0002051752681242682, + "loss": 0.0072, + "num_input_tokens_seen": 193502384, + "step": 89580 + }, + { + "epoch": 14.614192495921696, + "grad_norm": 0.005245604086667299, + "learning_rate": 0.0002051177820677464, + "loss": 0.0025, + "num_input_tokens_seen": 193510896, + "step": 89585 + }, + { + "epoch": 14.615008156606851, + "grad_norm": 0.0040390766225755215, + "learning_rate": 0.00020506030198738683, + "loss": 0.0075, + "num_input_tokens_seen": 193521712, + "step": 89590 + }, + { + "epoch": 14.615823817292007, + "grad_norm": 0.017503926530480385, + "learning_rate": 0.00020500282788435441, + "loss": 0.0035, + "num_input_tokens_seen": 193533008, + "step": 89595 + }, + { + "epoch": 14.616639477977161, + "grad_norm": 0.03211967274546623, + "learning_rate": 0.00020494535975981398, + "loss": 0.0097, + "num_input_tokens_seen": 193543472, + "step": 89600 + }, + { + "epoch": 14.617455138662317, + "grad_norm": 0.397350013256073, + "learning_rate": 0.0002048878976149301, + "loss": 0.0192, + "num_input_tokens_seen": 193553392, + "step": 89605 + }, + { + "epoch": 14.61827079934747, + "grad_norm": 0.008442388847470284, + "learning_rate": 0.00020483044145086732, + "loss": 0.0032, + "num_input_tokens_seen": 193564912, + "step": 89610 + }, + { + "epoch": 14.619086460032626, + "grad_norm": 0.0035397973842918873, + "learning_rate": 0.00020477299126879013, + "loss": 0.0635, + "num_input_tokens_seen": 193574512, + "step": 89615 + }, + { + "epoch": 14.619902120717782, + "grad_norm": 0.004368765279650688, + "learning_rate": 0.00020471554706986273, + "loss": 0.0045, + "num_input_tokens_seen": 193585968, + "step": 89620 + }, + { + "epoch": 14.620717781402936, + "grad_norm": 0.002169569954276085, + "learning_rate": 0.00020465810885524928, + "loss": 0.0135, + "num_input_tokens_seen": 193596304, + "step": 89625 + }, + { + "epoch": 14.621533442088092, + "grad_norm": 0.015089782886207104, + "learning_rate": 0.0002046006766261142, + "loss": 0.0033, + "num_input_tokens_seen": 193606672, + "step": 89630 + }, + { + "epoch": 14.622349102773246, + "grad_norm": 0.012145834043622017, + "learning_rate": 0.00020454325038362083, + "loss": 0.0077, + "num_input_tokens_seen": 193616912, + "step": 89635 + }, + { + "epoch": 14.623164763458401, + "grad_norm": 0.004161675926297903, + "learning_rate": 0.00020448583012893363, + "loss": 0.0056, + "num_input_tokens_seen": 193627856, + "step": 89640 + }, + { + "epoch": 14.623980424143557, + "grad_norm": 0.0038490283768624067, + "learning_rate": 0.00020442841586321565, + "loss": 0.0627, + "num_input_tokens_seen": 193638896, + "step": 89645 + }, + { + "epoch": 14.624796084828711, + "grad_norm": 0.005434063263237476, + "learning_rate": 0.0002043710075876311, + "loss": 0.0054, + "num_input_tokens_seen": 193648816, + "step": 89650 + }, + { + "epoch": 14.625611745513867, + "grad_norm": 0.02254394255578518, + "learning_rate": 0.00020431360530334282, + "loss": 0.003, + "num_input_tokens_seen": 193660144, + "step": 89655 + }, + { + "epoch": 14.62642740619902, + "grad_norm": 0.05815372243523598, + "learning_rate": 0.0002042562090115147, + "loss": 0.0077, + "num_input_tokens_seen": 193671024, + "step": 89660 + }, + { + "epoch": 14.627243066884176, + "grad_norm": 0.11653152108192444, + "learning_rate": 0.0002041988187133094, + "loss": 0.0498, + "num_input_tokens_seen": 193682960, + "step": 89665 + }, + { + "epoch": 14.62805872756933, + "grad_norm": 0.0057633547112345695, + "learning_rate": 0.00020414143440989062, + "loss": 0.126, + "num_input_tokens_seen": 193693136, + "step": 89670 + }, + { + "epoch": 14.628874388254486, + "grad_norm": 0.053961724042892456, + "learning_rate": 0.00020408405610242063, + "loss": 0.0065, + "num_input_tokens_seen": 193704592, + "step": 89675 + }, + { + "epoch": 14.629690048939642, + "grad_norm": 0.009925310499966145, + "learning_rate": 0.000204026683792063, + "loss": 0.0026, + "num_input_tokens_seen": 193715088, + "step": 89680 + }, + { + "epoch": 14.630505709624796, + "grad_norm": 0.006178427021950483, + "learning_rate": 0.00020396931747997978, + "loss": 0.003, + "num_input_tokens_seen": 193726160, + "step": 89685 + }, + { + "epoch": 14.631321370309951, + "grad_norm": 0.12800955772399902, + "learning_rate": 0.0002039119571673342, + "loss": 0.0242, + "num_input_tokens_seen": 193736848, + "step": 89690 + }, + { + "epoch": 14.632137030995105, + "grad_norm": 0.25726088881492615, + "learning_rate": 0.00020385460285528807, + "loss": 0.0543, + "num_input_tokens_seen": 193748080, + "step": 89695 + }, + { + "epoch": 14.632952691680261, + "grad_norm": 0.0024385061115026474, + "learning_rate": 0.0002037972545450044, + "loss": 0.0065, + "num_input_tokens_seen": 193759440, + "step": 89700 + }, + { + "epoch": 14.633768352365417, + "grad_norm": 0.008196687325835228, + "learning_rate": 0.0002037399122376449, + "loss": 0.0058, + "num_input_tokens_seen": 193770768, + "step": 89705 + }, + { + "epoch": 14.63458401305057, + "grad_norm": 0.024108847603201866, + "learning_rate": 0.0002036825759343721, + "loss": 0.003, + "num_input_tokens_seen": 193781616, + "step": 89710 + }, + { + "epoch": 14.635399673735726, + "grad_norm": 0.03441760316491127, + "learning_rate": 0.0002036252456363476, + "loss": 0.022, + "num_input_tokens_seen": 193792656, + "step": 89715 + }, + { + "epoch": 14.63621533442088, + "grad_norm": 0.010302347131073475, + "learning_rate": 0.00020356792134473356, + "loss": 0.0027, + "num_input_tokens_seen": 193802384, + "step": 89720 + }, + { + "epoch": 14.637030995106036, + "grad_norm": 0.009490042924880981, + "learning_rate": 0.0002035106030606917, + "loss": 0.0124, + "num_input_tokens_seen": 193812400, + "step": 89725 + }, + { + "epoch": 14.63784665579119, + "grad_norm": 0.017873436212539673, + "learning_rate": 0.00020345329078538354, + "loss": 0.0055, + "num_input_tokens_seen": 193823664, + "step": 89730 + }, + { + "epoch": 14.638662316476346, + "grad_norm": 0.03383546322584152, + "learning_rate": 0.00020339598451997066, + "loss": 0.0084, + "num_input_tokens_seen": 193835696, + "step": 89735 + }, + { + "epoch": 14.639477977161501, + "grad_norm": 0.006421535741537809, + "learning_rate": 0.00020333868426561448, + "loss": 0.002, + "num_input_tokens_seen": 193847376, + "step": 89740 + }, + { + "epoch": 14.640293637846655, + "grad_norm": 0.0031049128156155348, + "learning_rate": 0.00020328139002347612, + "loss": 0.0032, + "num_input_tokens_seen": 193858480, + "step": 89745 + }, + { + "epoch": 14.641109298531811, + "grad_norm": 0.037416599690914154, + "learning_rate": 0.00020322410179471684, + "loss": 0.0045, + "num_input_tokens_seen": 193871024, + "step": 89750 + }, + { + "epoch": 14.641924959216965, + "grad_norm": 0.007306250277906656, + "learning_rate": 0.00020316681958049758, + "loss": 0.0061, + "num_input_tokens_seen": 193881104, + "step": 89755 + }, + { + "epoch": 14.64274061990212, + "grad_norm": 0.007262531202286482, + "learning_rate": 0.00020310954338197934, + "loss": 0.002, + "num_input_tokens_seen": 193892784, + "step": 89760 + }, + { + "epoch": 14.643556280587276, + "grad_norm": 0.0022280393168330193, + "learning_rate": 0.00020305227320032283, + "loss": 0.0036, + "num_input_tokens_seen": 193904528, + "step": 89765 + }, + { + "epoch": 14.64437194127243, + "grad_norm": 0.0012667253613471985, + "learning_rate": 0.00020299500903668856, + "loss": 0.0255, + "num_input_tokens_seen": 193914640, + "step": 89770 + }, + { + "epoch": 14.645187601957586, + "grad_norm": 0.0008011406753212214, + "learning_rate": 0.00020293775089223748, + "loss": 0.0325, + "num_input_tokens_seen": 193926416, + "step": 89775 + }, + { + "epoch": 14.64600326264274, + "grad_norm": 0.0034976284950971603, + "learning_rate": 0.00020288049876812943, + "loss": 0.0026, + "num_input_tokens_seen": 193937776, + "step": 89780 + }, + { + "epoch": 14.646818923327896, + "grad_norm": 0.003417432773858309, + "learning_rate": 0.00020282325266552536, + "loss": 0.1563, + "num_input_tokens_seen": 193948976, + "step": 89785 + }, + { + "epoch": 14.647634584013051, + "grad_norm": 0.014804985374212265, + "learning_rate": 0.0002027660125855847, + "loss": 0.0036, + "num_input_tokens_seen": 193959408, + "step": 89790 + }, + { + "epoch": 14.648450244698205, + "grad_norm": 0.0008979029953479767, + "learning_rate": 0.00020270877852946817, + "loss": 0.0023, + "num_input_tokens_seen": 193971536, + "step": 89795 + }, + { + "epoch": 14.649265905383361, + "grad_norm": 0.0022123174276202917, + "learning_rate": 0.0002026515504983351, + "loss": 0.007, + "num_input_tokens_seen": 193981392, + "step": 89800 + }, + { + "epoch": 14.650081566068515, + "grad_norm": 0.0016149893635883927, + "learning_rate": 0.00020259432849334592, + "loss": 0.0039, + "num_input_tokens_seen": 193992592, + "step": 89805 + }, + { + "epoch": 14.65089722675367, + "grad_norm": 0.4601063132286072, + "learning_rate": 0.00020253711251565953, + "loss": 0.1179, + "num_input_tokens_seen": 194003280, + "step": 89810 + }, + { + "epoch": 14.651712887438826, + "grad_norm": 0.0017725087236613035, + "learning_rate": 0.00020247990256643634, + "loss": 0.0016, + "num_input_tokens_seen": 194014160, + "step": 89815 + }, + { + "epoch": 14.65252854812398, + "grad_norm": 0.0016594589687883854, + "learning_rate": 0.000202422698646835, + "loss": 0.0603, + "num_input_tokens_seen": 194025072, + "step": 89820 + }, + { + "epoch": 14.653344208809136, + "grad_norm": 0.004949385765939951, + "learning_rate": 0.00020236550075801535, + "loss": 0.0142, + "num_input_tokens_seen": 194034384, + "step": 89825 + }, + { + "epoch": 14.65415986949429, + "grad_norm": 0.002924638567492366, + "learning_rate": 0.0002023083089011364, + "loss": 0.0047, + "num_input_tokens_seen": 194046064, + "step": 89830 + }, + { + "epoch": 14.654975530179446, + "grad_norm": 0.5376624464988708, + "learning_rate": 0.00020225112307735717, + "loss": 0.1602, + "num_input_tokens_seen": 194056016, + "step": 89835 + }, + { + "epoch": 14.655791190864601, + "grad_norm": 0.0024497162085026503, + "learning_rate": 0.00020219394328783668, + "loss": 0.0051, + "num_input_tokens_seen": 194067056, + "step": 89840 + }, + { + "epoch": 14.656606851549755, + "grad_norm": 0.057129036635160446, + "learning_rate": 0.00020213676953373372, + "loss": 0.0018, + "num_input_tokens_seen": 194077712, + "step": 89845 + }, + { + "epoch": 14.65742251223491, + "grad_norm": 0.006286917254328728, + "learning_rate": 0.00020207960181620706, + "loss": 0.003, + "num_input_tokens_seen": 194087312, + "step": 89850 + }, + { + "epoch": 14.658238172920065, + "grad_norm": 0.05066222324967384, + "learning_rate": 0.00020202244013641513, + "loss": 0.0055, + "num_input_tokens_seen": 194098800, + "step": 89855 + }, + { + "epoch": 14.65905383360522, + "grad_norm": 0.004280565306544304, + "learning_rate": 0.0002019652844955165, + "loss": 0.002, + "num_input_tokens_seen": 194110704, + "step": 89860 + }, + { + "epoch": 14.659869494290374, + "grad_norm": 0.0028599584475159645, + "learning_rate": 0.00020190813489466943, + "loss": 0.1685, + "num_input_tokens_seen": 194122000, + "step": 89865 + }, + { + "epoch": 14.66068515497553, + "grad_norm": 0.003662576898932457, + "learning_rate": 0.00020185099133503216, + "loss": 0.0109, + "num_input_tokens_seen": 194132336, + "step": 89870 + }, + { + "epoch": 14.661500815660686, + "grad_norm": 0.0418967604637146, + "learning_rate": 0.00020179385381776283, + "loss": 0.0035, + "num_input_tokens_seen": 194144080, + "step": 89875 + }, + { + "epoch": 14.66231647634584, + "grad_norm": 0.009954468347132206, + "learning_rate": 0.00020173672234401928, + "loss": 0.0032, + "num_input_tokens_seen": 194153200, + "step": 89880 + }, + { + "epoch": 14.663132137030995, + "grad_norm": 0.023417534306645393, + "learning_rate": 0.00020167959691495946, + "loss": 0.0078, + "num_input_tokens_seen": 194165104, + "step": 89885 + }, + { + "epoch": 14.66394779771615, + "grad_norm": 0.008343107998371124, + "learning_rate": 0.00020162247753174105, + "loss": 0.0016, + "num_input_tokens_seen": 194175600, + "step": 89890 + }, + { + "epoch": 14.664763458401305, + "grad_norm": 0.006546036805957556, + "learning_rate": 0.00020156536419552168, + "loss": 0.0035, + "num_input_tokens_seen": 194184880, + "step": 89895 + }, + { + "epoch": 14.66557911908646, + "grad_norm": 0.005429636221379042, + "learning_rate": 0.00020150825690745883, + "loss": 0.0197, + "num_input_tokens_seen": 194195152, + "step": 89900 + }, + { + "epoch": 14.666394779771615, + "grad_norm": 0.003119829809293151, + "learning_rate": 0.00020145115566870975, + "loss": 0.0068, + "num_input_tokens_seen": 194205744, + "step": 89905 + }, + { + "epoch": 14.66721044045677, + "grad_norm": 0.14407505095005035, + "learning_rate": 0.00020139406048043173, + "loss": 0.0078, + "num_input_tokens_seen": 194215920, + "step": 89910 + }, + { + "epoch": 14.668026101141924, + "grad_norm": 0.1059744581580162, + "learning_rate": 0.00020133697134378176, + "loss": 0.0368, + "num_input_tokens_seen": 194226736, + "step": 89915 + }, + { + "epoch": 14.66884176182708, + "grad_norm": 0.011918950825929642, + "learning_rate": 0.0002012798882599173, + "loss": 0.0051, + "num_input_tokens_seen": 194238320, + "step": 89920 + }, + { + "epoch": 14.669657422512234, + "grad_norm": 0.05558808147907257, + "learning_rate": 0.00020122281122999443, + "loss": 0.0112, + "num_input_tokens_seen": 194249744, + "step": 89925 + }, + { + "epoch": 14.67047308319739, + "grad_norm": 0.0026128387544304132, + "learning_rate": 0.00020116574025517053, + "loss": 0.0203, + "num_input_tokens_seen": 194259664, + "step": 89930 + }, + { + "epoch": 14.671288743882545, + "grad_norm": 0.5910449028015137, + "learning_rate": 0.00020110867533660204, + "loss": 0.0156, + "num_input_tokens_seen": 194271600, + "step": 89935 + }, + { + "epoch": 14.6721044045677, + "grad_norm": 0.005343761760741472, + "learning_rate": 0.00020105161647544534, + "loss": 0.0017, + "num_input_tokens_seen": 194281840, + "step": 89940 + }, + { + "epoch": 14.672920065252855, + "grad_norm": 0.016192588955163956, + "learning_rate": 0.00020099456367285695, + "loss": 0.1246, + "num_input_tokens_seen": 194292816, + "step": 89945 + }, + { + "epoch": 14.673735725938009, + "grad_norm": 0.009091212414205074, + "learning_rate": 0.00020093751692999302, + "loss": 0.0199, + "num_input_tokens_seen": 194304176, + "step": 89950 + }, + { + "epoch": 14.674551386623165, + "grad_norm": 0.06575567275285721, + "learning_rate": 0.00020088047624800966, + "loss": 0.0068, + "num_input_tokens_seen": 194314128, + "step": 89955 + }, + { + "epoch": 14.67536704730832, + "grad_norm": 0.07140957564115524, + "learning_rate": 0.00020082344162806293, + "loss": 0.0035, + "num_input_tokens_seen": 194324976, + "step": 89960 + }, + { + "epoch": 14.676182707993474, + "grad_norm": 0.05011884495615959, + "learning_rate": 0.00020076641307130872, + "loss": 0.0036, + "num_input_tokens_seen": 194336112, + "step": 89965 + }, + { + "epoch": 14.67699836867863, + "grad_norm": 0.007385524921119213, + "learning_rate": 0.00020070939057890275, + "loss": 0.0217, + "num_input_tokens_seen": 194347728, + "step": 89970 + }, + { + "epoch": 14.677814029363784, + "grad_norm": 0.01539340615272522, + "learning_rate": 0.00020065237415200062, + "loss": 0.0039, + "num_input_tokens_seen": 194358576, + "step": 89975 + }, + { + "epoch": 14.67862969004894, + "grad_norm": 0.03188261017203331, + "learning_rate": 0.00020059536379175792, + "loss": 0.0048, + "num_input_tokens_seen": 194368464, + "step": 89980 + }, + { + "epoch": 14.679445350734095, + "grad_norm": 0.8307998180389404, + "learning_rate": 0.0002005383594993299, + "loss": 0.0216, + "num_input_tokens_seen": 194378352, + "step": 89985 + }, + { + "epoch": 14.68026101141925, + "grad_norm": 0.02968890219926834, + "learning_rate": 0.00020048136127587203, + "loss": 0.0483, + "num_input_tokens_seen": 194388720, + "step": 89990 + }, + { + "epoch": 14.681076672104405, + "grad_norm": 0.002068957546725869, + "learning_rate": 0.0002004243691225393, + "loss": 0.0022, + "num_input_tokens_seen": 194400080, + "step": 89995 + }, + { + "epoch": 14.681892332789559, + "grad_norm": 0.008313358761370182, + "learning_rate": 0.00020036738304048674, + "loss": 0.0016, + "num_input_tokens_seen": 194410832, + "step": 90000 + }, + { + "epoch": 14.682707993474715, + "grad_norm": 0.013613566756248474, + "learning_rate": 0.00020031040303086932, + "loss": 0.0134, + "num_input_tokens_seen": 194421840, + "step": 90005 + }, + { + "epoch": 14.68352365415987, + "grad_norm": 0.0009858844568952918, + "learning_rate": 0.00020025342909484173, + "loss": 0.0019, + "num_input_tokens_seen": 194432752, + "step": 90010 + }, + { + "epoch": 14.684339314845024, + "grad_norm": 0.06172553449869156, + "learning_rate": 0.00020019646123355868, + "loss": 0.0295, + "num_input_tokens_seen": 194443344, + "step": 90015 + }, + { + "epoch": 14.68515497553018, + "grad_norm": 0.009558094665408134, + "learning_rate": 0.00020013949944817466, + "loss": 0.0682, + "num_input_tokens_seen": 194454544, + "step": 90020 + }, + { + "epoch": 14.685970636215334, + "grad_norm": 0.021997565403580666, + "learning_rate": 0.00020008254373984408, + "loss": 0.0024, + "num_input_tokens_seen": 194465136, + "step": 90025 + }, + { + "epoch": 14.68678629690049, + "grad_norm": 0.004823493305593729, + "learning_rate": 0.00020002559410972121, + "loss": 0.0025, + "num_input_tokens_seen": 194476016, + "step": 90030 + }, + { + "epoch": 14.687601957585644, + "grad_norm": 0.009510107338428497, + "learning_rate": 0.00019996865055896008, + "loss": 0.0052, + "num_input_tokens_seen": 194486320, + "step": 90035 + }, + { + "epoch": 14.6884176182708, + "grad_norm": 0.0036022786516696215, + "learning_rate": 0.0001999117130887152, + "loss": 0.0129, + "num_input_tokens_seen": 194497232, + "step": 90040 + }, + { + "epoch": 14.689233278955955, + "grad_norm": 0.01022917777299881, + "learning_rate": 0.00019985478170013977, + "loss": 0.0053, + "num_input_tokens_seen": 194508784, + "step": 90045 + }, + { + "epoch": 14.690048939641109, + "grad_norm": 0.17686966061592102, + "learning_rate": 0.00019979785639438836, + "loss": 0.0376, + "num_input_tokens_seen": 194519120, + "step": 90050 + }, + { + "epoch": 14.690864600326265, + "grad_norm": 0.08314191550016403, + "learning_rate": 0.00019974093717261383, + "loss": 0.004, + "num_input_tokens_seen": 194528144, + "step": 90055 + }, + { + "epoch": 14.691680261011419, + "grad_norm": 0.057678647339344025, + "learning_rate": 0.0001996840240359703, + "loss": 0.0194, + "num_input_tokens_seen": 194539344, + "step": 90060 + }, + { + "epoch": 14.692495921696574, + "grad_norm": 0.015334702096879482, + "learning_rate": 0.00019962711698561097, + "loss": 0.0109, + "num_input_tokens_seen": 194550160, + "step": 90065 + }, + { + "epoch": 14.69331158238173, + "grad_norm": 0.011460235342383385, + "learning_rate": 0.0001995702160226892, + "loss": 0.0034, + "num_input_tokens_seen": 194561936, + "step": 90070 + }, + { + "epoch": 14.694127243066884, + "grad_norm": 0.0077322013676166534, + "learning_rate": 0.00019951332114835808, + "loss": 0.0071, + "num_input_tokens_seen": 194572368, + "step": 90075 + }, + { + "epoch": 14.69494290375204, + "grad_norm": 0.008056914433836937, + "learning_rate": 0.00019945643236377074, + "loss": 0.002, + "num_input_tokens_seen": 194583152, + "step": 90080 + }, + { + "epoch": 14.695758564437194, + "grad_norm": 0.014877298846840858, + "learning_rate": 0.00019939954967008005, + "loss": 0.1314, + "num_input_tokens_seen": 194594768, + "step": 90085 + }, + { + "epoch": 14.69657422512235, + "grad_norm": 0.015234891325235367, + "learning_rate": 0.00019934267306843885, + "loss": 0.0028, + "num_input_tokens_seen": 194606032, + "step": 90090 + }, + { + "epoch": 14.697389885807503, + "grad_norm": 0.02361419051885605, + "learning_rate": 0.0001992858025599998, + "loss": 0.0027, + "num_input_tokens_seen": 194617328, + "step": 90095 + }, + { + "epoch": 14.698205546492659, + "grad_norm": 0.006071890238672495, + "learning_rate": 0.00019922893814591541, + "loss": 0.005, + "num_input_tokens_seen": 194627920, + "step": 90100 + }, + { + "epoch": 14.699021207177815, + "grad_norm": 0.0035932869650423527, + "learning_rate": 0.00019917207982733814, + "loss": 0.0022, + "num_input_tokens_seen": 194637936, + "step": 90105 + }, + { + "epoch": 14.699836867862969, + "grad_norm": 0.0846271961927414, + "learning_rate": 0.00019911522760542028, + "loss": 0.0045, + "num_input_tokens_seen": 194648688, + "step": 90110 + }, + { + "epoch": 14.700652528548124, + "grad_norm": 0.0042757634073495865, + "learning_rate": 0.0001990583814813141, + "loss": 0.003, + "num_input_tokens_seen": 194659184, + "step": 90115 + }, + { + "epoch": 14.701468189233278, + "grad_norm": 0.0037720445543527603, + "learning_rate": 0.00019900154145617157, + "loss": 0.0032, + "num_input_tokens_seen": 194670384, + "step": 90120 + }, + { + "epoch": 14.702283849918434, + "grad_norm": 0.002661908743903041, + "learning_rate": 0.00019894470753114456, + "loss": 0.0047, + "num_input_tokens_seen": 194680400, + "step": 90125 + }, + { + "epoch": 14.70309951060359, + "grad_norm": 0.31255388259887695, + "learning_rate": 0.00019888787970738508, + "loss": 0.0032, + "num_input_tokens_seen": 194691152, + "step": 90130 + }, + { + "epoch": 14.703915171288743, + "grad_norm": 0.0013531736331060529, + "learning_rate": 0.00019883105798604468, + "loss": 0.0049, + "num_input_tokens_seen": 194701744, + "step": 90135 + }, + { + "epoch": 14.7047308319739, + "grad_norm": 0.07724149525165558, + "learning_rate": 0.00019877424236827473, + "loss": 0.0075, + "num_input_tokens_seen": 194713488, + "step": 90140 + }, + { + "epoch": 14.705546492659053, + "grad_norm": 0.00285876146517694, + "learning_rate": 0.00019871743285522725, + "loss": 0.0008, + "num_input_tokens_seen": 194723664, + "step": 90145 + }, + { + "epoch": 14.706362153344209, + "grad_norm": 0.5335696339607239, + "learning_rate": 0.0001986606294480529, + "loss": 0.0215, + "num_input_tokens_seen": 194733712, + "step": 90150 + }, + { + "epoch": 14.707177814029365, + "grad_norm": 0.007888701744377613, + "learning_rate": 0.00019860383214790345, + "loss": 0.0039, + "num_input_tokens_seen": 194743504, + "step": 90155 + }, + { + "epoch": 14.707993474714518, + "grad_norm": 0.005462154280394316, + "learning_rate": 0.0001985470409559294, + "loss": 0.0038, + "num_input_tokens_seen": 194755248, + "step": 90160 + }, + { + "epoch": 14.708809135399674, + "grad_norm": 0.0036751835141330957, + "learning_rate": 0.00019849025587328228, + "loss": 0.0024, + "num_input_tokens_seen": 194766128, + "step": 90165 + }, + { + "epoch": 14.709624796084828, + "grad_norm": 0.22061067819595337, + "learning_rate": 0.00019843347690111235, + "loss": 0.0081, + "num_input_tokens_seen": 194777360, + "step": 90170 + }, + { + "epoch": 14.710440456769984, + "grad_norm": 0.032692890614271164, + "learning_rate": 0.00019837670404057085, + "loss": 0.0117, + "num_input_tokens_seen": 194788656, + "step": 90175 + }, + { + "epoch": 14.71125611745514, + "grad_norm": 0.004137896467000246, + "learning_rate": 0.00019831993729280774, + "loss": 0.0007, + "num_input_tokens_seen": 194798096, + "step": 90180 + }, + { + "epoch": 14.712071778140293, + "grad_norm": 0.0014908617595210671, + "learning_rate": 0.0001982631766589742, + "loss": 0.0024, + "num_input_tokens_seen": 194809456, + "step": 90185 + }, + { + "epoch": 14.71288743882545, + "grad_norm": 0.01189829409122467, + "learning_rate": 0.00019820642214021979, + "loss": 0.0035, + "num_input_tokens_seen": 194820080, + "step": 90190 + }, + { + "epoch": 14.713703099510603, + "grad_norm": 0.47292107343673706, + "learning_rate": 0.00019814967373769544, + "loss": 0.0814, + "num_input_tokens_seen": 194830320, + "step": 90195 + }, + { + "epoch": 14.714518760195759, + "grad_norm": 0.001293840236030519, + "learning_rate": 0.00019809293145255048, + "loss": 0.079, + "num_input_tokens_seen": 194841776, + "step": 90200 + }, + { + "epoch": 14.715334420880914, + "grad_norm": 0.14460763335227966, + "learning_rate": 0.00019803619528593547, + "loss": 0.0077, + "num_input_tokens_seen": 194851408, + "step": 90205 + }, + { + "epoch": 14.716150081566068, + "grad_norm": 0.03119935840368271, + "learning_rate": 0.00019797946523900006, + "loss": 0.0019, + "num_input_tokens_seen": 194861200, + "step": 90210 + }, + { + "epoch": 14.716965742251224, + "grad_norm": 0.020736441016197205, + "learning_rate": 0.0001979227413128939, + "loss": 0.0027, + "num_input_tokens_seen": 194871248, + "step": 90215 + }, + { + "epoch": 14.717781402936378, + "grad_norm": 0.8582524657249451, + "learning_rate": 0.0001978660235087666, + "loss": 0.1751, + "num_input_tokens_seen": 194882416, + "step": 90220 + }, + { + "epoch": 14.718597063621534, + "grad_norm": 0.006416116375476122, + "learning_rate": 0.00019780931182776762, + "loss": 0.0031, + "num_input_tokens_seen": 194892816, + "step": 90225 + }, + { + "epoch": 14.719412724306688, + "grad_norm": 0.10771431028842926, + "learning_rate": 0.0001977526062710463, + "loss": 0.0054, + "num_input_tokens_seen": 194904176, + "step": 90230 + }, + { + "epoch": 14.720228384991843, + "grad_norm": 0.3856275677680969, + "learning_rate": 0.0001976959068397518, + "loss": 0.0062, + "num_input_tokens_seen": 194916112, + "step": 90235 + }, + { + "epoch": 14.721044045676999, + "grad_norm": 0.0013380798045545816, + "learning_rate": 0.00019763921353503335, + "loss": 0.0036, + "num_input_tokens_seen": 194926416, + "step": 90240 + }, + { + "epoch": 14.721859706362153, + "grad_norm": 0.031052274629473686, + "learning_rate": 0.0001975825263580397, + "loss": 0.003, + "num_input_tokens_seen": 194936464, + "step": 90245 + }, + { + "epoch": 14.722675367047309, + "grad_norm": 0.1070781797170639, + "learning_rate": 0.00019752584530991984, + "loss": 0.0328, + "num_input_tokens_seen": 194947376, + "step": 90250 + }, + { + "epoch": 14.723491027732463, + "grad_norm": 0.016105569899082184, + "learning_rate": 0.00019746917039182226, + "loss": 0.0081, + "num_input_tokens_seen": 194957456, + "step": 90255 + }, + { + "epoch": 14.724306688417618, + "grad_norm": 0.6800005435943604, + "learning_rate": 0.0001974125016048961, + "loss": 0.0457, + "num_input_tokens_seen": 194969200, + "step": 90260 + }, + { + "epoch": 14.725122349102774, + "grad_norm": 0.00817457027733326, + "learning_rate": 0.0001973558389502891, + "loss": 0.005, + "num_input_tokens_seen": 194980560, + "step": 90265 + }, + { + "epoch": 14.725938009787928, + "grad_norm": 0.02452375739812851, + "learning_rate": 0.0001972991824291503, + "loss": 0.0254, + "num_input_tokens_seen": 194989968, + "step": 90270 + }, + { + "epoch": 14.726753670473084, + "grad_norm": 0.018420519307255745, + "learning_rate": 0.00019724253204262717, + "loss": 0.0177, + "num_input_tokens_seen": 195001264, + "step": 90275 + }, + { + "epoch": 14.727569331158238, + "grad_norm": 0.005502955988049507, + "learning_rate": 0.00019718588779186864, + "loss": 0.0017, + "num_input_tokens_seen": 195014000, + "step": 90280 + }, + { + "epoch": 14.728384991843393, + "grad_norm": 0.0030444420408457518, + "learning_rate": 0.00019712924967802182, + "loss": 0.0044, + "num_input_tokens_seen": 195025712, + "step": 90285 + }, + { + "epoch": 14.729200652528547, + "grad_norm": 0.004858762491494417, + "learning_rate": 0.00019707261770223532, + "loss": 0.0019, + "num_input_tokens_seen": 195036912, + "step": 90290 + }, + { + "epoch": 14.730016313213703, + "grad_norm": 0.0944218784570694, + "learning_rate": 0.00019701599186565621, + "loss": 0.0063, + "num_input_tokens_seen": 195048112, + "step": 90295 + }, + { + "epoch": 14.730831973898859, + "grad_norm": 0.004670348484069109, + "learning_rate": 0.00019695937216943272, + "loss": 0.0027, + "num_input_tokens_seen": 195058736, + "step": 90300 + }, + { + "epoch": 14.731647634584013, + "grad_norm": 0.05685482174158096, + "learning_rate": 0.00019690275861471168, + "loss": 0.004, + "num_input_tokens_seen": 195068144, + "step": 90305 + }, + { + "epoch": 14.732463295269168, + "grad_norm": 0.004699467681348324, + "learning_rate": 0.00019684615120264104, + "loss": 0.0027, + "num_input_tokens_seen": 195078608, + "step": 90310 + }, + { + "epoch": 14.733278955954322, + "grad_norm": 0.011271055787801743, + "learning_rate": 0.00019678954993436736, + "loss": 0.0064, + "num_input_tokens_seen": 195089072, + "step": 90315 + }, + { + "epoch": 14.734094616639478, + "grad_norm": 0.4532552659511566, + "learning_rate": 0.00019673295481103847, + "loss": 0.0568, + "num_input_tokens_seen": 195101648, + "step": 90320 + }, + { + "epoch": 14.734910277324634, + "grad_norm": 0.017157189548015594, + "learning_rate": 0.00019667636583380066, + "loss": 0.0249, + "num_input_tokens_seen": 195113520, + "step": 90325 + }, + { + "epoch": 14.735725938009788, + "grad_norm": 0.0010682785650715232, + "learning_rate": 0.0001966197830038014, + "loss": 0.0061, + "num_input_tokens_seen": 195125008, + "step": 90330 + }, + { + "epoch": 14.736541598694943, + "grad_norm": 0.007584839593619108, + "learning_rate": 0.00019656320632218676, + "loss": 0.0029, + "num_input_tokens_seen": 195136208, + "step": 90335 + }, + { + "epoch": 14.737357259380097, + "grad_norm": 0.018214622512459755, + "learning_rate": 0.00019650663579010401, + "loss": 0.0089, + "num_input_tokens_seen": 195145264, + "step": 90340 + }, + { + "epoch": 14.738172920065253, + "grad_norm": 0.031968094408512115, + "learning_rate": 0.00019645007140869897, + "loss": 0.0028, + "num_input_tokens_seen": 195156912, + "step": 90345 + }, + { + "epoch": 14.738988580750409, + "grad_norm": 0.009707508608698845, + "learning_rate": 0.00019639351317911853, + "loss": 0.0061, + "num_input_tokens_seen": 195168464, + "step": 90350 + }, + { + "epoch": 14.739804241435563, + "grad_norm": 0.5486598610877991, + "learning_rate": 0.00019633696110250864, + "loss": 0.0153, + "num_input_tokens_seen": 195179888, + "step": 90355 + }, + { + "epoch": 14.740619902120718, + "grad_norm": 0.002595171332359314, + "learning_rate": 0.0001962804151800155, + "loss": 0.0026, + "num_input_tokens_seen": 195190352, + "step": 90360 + }, + { + "epoch": 14.741435562805872, + "grad_norm": 0.04814111813902855, + "learning_rate": 0.00019622387541278497, + "loss": 0.002, + "num_input_tokens_seen": 195200880, + "step": 90365 + }, + { + "epoch": 14.742251223491028, + "grad_norm": 0.00787262711673975, + "learning_rate": 0.000196167341801963, + "loss": 0.0028, + "num_input_tokens_seen": 195211600, + "step": 90370 + }, + { + "epoch": 14.743066884176184, + "grad_norm": 0.001716782571747899, + "learning_rate": 0.00019611081434869532, + "loss": 0.0017, + "num_input_tokens_seen": 195222480, + "step": 90375 + }, + { + "epoch": 14.743882544861338, + "grad_norm": 0.004030111711472273, + "learning_rate": 0.00019605429305412746, + "loss": 0.0051, + "num_input_tokens_seen": 195232976, + "step": 90380 + }, + { + "epoch": 14.744698205546493, + "grad_norm": 0.0008914527716115117, + "learning_rate": 0.00019599777791940497, + "loss": 0.0011, + "num_input_tokens_seen": 195243856, + "step": 90385 + }, + { + "epoch": 14.745513866231647, + "grad_norm": 0.016651881858706474, + "learning_rate": 0.00019594126894567315, + "loss": 0.0102, + "num_input_tokens_seen": 195255216, + "step": 90390 + }, + { + "epoch": 14.746329526916803, + "grad_norm": 0.009090129286050797, + "learning_rate": 0.00019588476613407725, + "loss": 0.0296, + "num_input_tokens_seen": 195265712, + "step": 90395 + }, + { + "epoch": 14.747145187601957, + "grad_norm": 0.000620881502982229, + "learning_rate": 0.00019582826948576215, + "loss": 0.001, + "num_input_tokens_seen": 195276944, + "step": 90400 + }, + { + "epoch": 14.747960848287113, + "grad_norm": 0.004584138281643391, + "learning_rate": 0.00019577177900187342, + "loss": 0.0025, + "num_input_tokens_seen": 195285136, + "step": 90405 + }, + { + "epoch": 14.748776508972268, + "grad_norm": 0.0013352871173992753, + "learning_rate": 0.0001957152946835552, + "loss": 0.1063, + "num_input_tokens_seen": 195294832, + "step": 90410 + }, + { + "epoch": 14.749592169657422, + "grad_norm": 0.013338768854737282, + "learning_rate": 0.00019565881653195284, + "loss": 0.0036, + "num_input_tokens_seen": 195306384, + "step": 90415 + }, + { + "epoch": 14.750407830342578, + "grad_norm": 0.009870841167867184, + "learning_rate": 0.00019560234454821034, + "loss": 0.0019, + "num_input_tokens_seen": 195317488, + "step": 90420 + }, + { + "epoch": 14.751223491027732, + "grad_norm": 0.007064263802021742, + "learning_rate": 0.0001955458787334728, + "loss": 0.1473, + "num_input_tokens_seen": 195327824, + "step": 90425 + }, + { + "epoch": 14.752039151712887, + "grad_norm": 0.0038319004233926535, + "learning_rate": 0.00019548941908888396, + "loss": 0.0048, + "num_input_tokens_seen": 195338288, + "step": 90430 + }, + { + "epoch": 14.752854812398043, + "grad_norm": 0.011020504869520664, + "learning_rate": 0.00019543296561558865, + "loss": 0.0028, + "num_input_tokens_seen": 195347760, + "step": 90435 + }, + { + "epoch": 14.753670473083197, + "grad_norm": 0.10672824084758759, + "learning_rate": 0.0001953765183147303, + "loss": 0.0732, + "num_input_tokens_seen": 195359120, + "step": 90440 + }, + { + "epoch": 14.754486133768353, + "grad_norm": 0.005035056732594967, + "learning_rate": 0.00019532007718745366, + "loss": 0.0029, + "num_input_tokens_seen": 195369872, + "step": 90445 + }, + { + "epoch": 14.755301794453507, + "grad_norm": 0.027038784697651863, + "learning_rate": 0.00019526364223490172, + "loss": 0.0021, + "num_input_tokens_seen": 195380816, + "step": 90450 + }, + { + "epoch": 14.756117455138662, + "grad_norm": 0.009903905913233757, + "learning_rate": 0.00019520721345821907, + "loss": 0.0022, + "num_input_tokens_seen": 195392080, + "step": 90455 + }, + { + "epoch": 14.756933115823816, + "grad_norm": 0.013743668794631958, + "learning_rate": 0.00019515079085854854, + "loss": 0.0027, + "num_input_tokens_seen": 195403408, + "step": 90460 + }, + { + "epoch": 14.757748776508972, + "grad_norm": 0.002215584507212043, + "learning_rate": 0.00019509437443703415, + "loss": 0.0148, + "num_input_tokens_seen": 195414480, + "step": 90465 + }, + { + "epoch": 14.758564437194128, + "grad_norm": 0.021999172866344452, + "learning_rate": 0.00019503796419481908, + "loss": 0.0038, + "num_input_tokens_seen": 195425520, + "step": 90470 + }, + { + "epoch": 14.759380097879282, + "grad_norm": 0.01619911380112171, + "learning_rate": 0.00019498156013304647, + "loss": 0.0028, + "num_input_tokens_seen": 195435792, + "step": 90475 + }, + { + "epoch": 14.760195758564437, + "grad_norm": 0.022966833785176277, + "learning_rate": 0.0001949251622528595, + "loss": 0.0046, + "num_input_tokens_seen": 195446512, + "step": 90480 + }, + { + "epoch": 14.761011419249591, + "grad_norm": 0.06447285413742065, + "learning_rate": 0.0001948687705554012, + "loss": 0.0033, + "num_input_tokens_seen": 195456656, + "step": 90485 + }, + { + "epoch": 14.761827079934747, + "grad_norm": 0.005138483829796314, + "learning_rate": 0.00019481238504181431, + "loss": 0.0013, + "num_input_tokens_seen": 195468336, + "step": 90490 + }, + { + "epoch": 14.762642740619903, + "grad_norm": 0.0013741077855229378, + "learning_rate": 0.0001947560057132416, + "loss": 0.156, + "num_input_tokens_seen": 195479056, + "step": 90495 + }, + { + "epoch": 14.763458401305057, + "grad_norm": 0.0013339656870812178, + "learning_rate": 0.00019469963257082564, + "loss": 0.0012, + "num_input_tokens_seen": 195490512, + "step": 90500 + }, + { + "epoch": 14.764274061990212, + "grad_norm": 0.4919218122959137, + "learning_rate": 0.00019464326561570894, + "loss": 0.0458, + "num_input_tokens_seen": 195502704, + "step": 90505 + }, + { + "epoch": 14.765089722675366, + "grad_norm": 0.0020044157281517982, + "learning_rate": 0.0001945869048490338, + "loss": 0.0013, + "num_input_tokens_seen": 195512816, + "step": 90510 + }, + { + "epoch": 14.765905383360522, + "grad_norm": 0.010413050651550293, + "learning_rate": 0.00019453055027194256, + "loss": 0.0029, + "num_input_tokens_seen": 195524592, + "step": 90515 + }, + { + "epoch": 14.766721044045678, + "grad_norm": 0.12600289285182953, + "learning_rate": 0.00019447420188557714, + "loss": 0.0035, + "num_input_tokens_seen": 195535024, + "step": 90520 + }, + { + "epoch": 14.767536704730832, + "grad_norm": 0.0043631913140416145, + "learning_rate": 0.00019441785969107967, + "loss": 0.0052, + "num_input_tokens_seen": 195546384, + "step": 90525 + }, + { + "epoch": 14.768352365415987, + "grad_norm": 0.0018164021894335747, + "learning_rate": 0.00019436152368959193, + "loss": 0.0518, + "num_input_tokens_seen": 195556240, + "step": 90530 + }, + { + "epoch": 14.769168026101141, + "grad_norm": 0.005169565323740244, + "learning_rate": 0.0001943051938822556, + "loss": 0.016, + "num_input_tokens_seen": 195566864, + "step": 90535 + }, + { + "epoch": 14.769983686786297, + "grad_norm": 0.0052770026959478855, + "learning_rate": 0.00019424887027021237, + "loss": 0.0055, + "num_input_tokens_seen": 195578672, + "step": 90540 + }, + { + "epoch": 14.770799347471453, + "grad_norm": 0.004595869220793247, + "learning_rate": 0.00019419255285460347, + "loss": 0.0011, + "num_input_tokens_seen": 195588496, + "step": 90545 + }, + { + "epoch": 14.771615008156607, + "grad_norm": 0.3980657756328583, + "learning_rate": 0.00019413624163657072, + "loss": 0.0161, + "num_input_tokens_seen": 195598160, + "step": 90550 + }, + { + "epoch": 14.772430668841762, + "grad_norm": 0.01790624111890793, + "learning_rate": 0.00019407993661725475, + "loss": 0.0038, + "num_input_tokens_seen": 195608432, + "step": 90555 + }, + { + "epoch": 14.773246329526916, + "grad_norm": 0.1566634178161621, + "learning_rate": 0.0001940236377977973, + "loss": 0.137, + "num_input_tokens_seen": 195619504, + "step": 90560 + }, + { + "epoch": 14.774061990212072, + "grad_norm": 1.9931586980819702, + "learning_rate": 0.00019396734517933867, + "loss": 0.1028, + "num_input_tokens_seen": 195629968, + "step": 90565 + }, + { + "epoch": 14.774877650897226, + "grad_norm": 0.013670021668076515, + "learning_rate": 0.00019391105876302012, + "loss": 0.1662, + "num_input_tokens_seen": 195642320, + "step": 90570 + }, + { + "epoch": 14.775693311582382, + "grad_norm": 0.00712958350777626, + "learning_rate": 0.00019385477854998235, + "loss": 0.0095, + "num_input_tokens_seen": 195651984, + "step": 90575 + }, + { + "epoch": 14.776508972267537, + "grad_norm": 0.45859119296073914, + "learning_rate": 0.00019379850454136582, + "loss": 0.125, + "num_input_tokens_seen": 195662672, + "step": 90580 + }, + { + "epoch": 14.777324632952691, + "grad_norm": 0.09679024666547775, + "learning_rate": 0.00019374223673831103, + "loss": 0.0069, + "num_input_tokens_seen": 195674640, + "step": 90585 + }, + { + "epoch": 14.778140293637847, + "grad_norm": 0.01906924694776535, + "learning_rate": 0.00019368597514195834, + "loss": 0.0382, + "num_input_tokens_seen": 195685424, + "step": 90590 + }, + { + "epoch": 14.778955954323001, + "grad_norm": 0.005224125925451517, + "learning_rate": 0.00019362971975344796, + "loss": 0.0224, + "num_input_tokens_seen": 195696944, + "step": 90595 + }, + { + "epoch": 14.779771615008157, + "grad_norm": 0.004919872619211674, + "learning_rate": 0.00019357347057391994, + "loss": 0.005, + "num_input_tokens_seen": 195707920, + "step": 90600 + }, + { + "epoch": 14.780587275693312, + "grad_norm": 0.014207074418663979, + "learning_rate": 0.0001935172276045143, + "loss": 0.0041, + "num_input_tokens_seen": 195718544, + "step": 90605 + }, + { + "epoch": 14.781402936378466, + "grad_norm": 0.09853319078683853, + "learning_rate": 0.0001934609908463708, + "loss": 0.1058, + "num_input_tokens_seen": 195728304, + "step": 90610 + }, + { + "epoch": 14.782218597063622, + "grad_norm": 0.011572792194783688, + "learning_rate": 0.00019340476030062925, + "loss": 0.0095, + "num_input_tokens_seen": 195739344, + "step": 90615 + }, + { + "epoch": 14.783034257748776, + "grad_norm": 0.003625387093052268, + "learning_rate": 0.00019334853596842915, + "loss": 0.0042, + "num_input_tokens_seen": 195749936, + "step": 90620 + }, + { + "epoch": 14.783849918433932, + "grad_norm": 0.0010916964383795857, + "learning_rate": 0.00019329231785090994, + "loss": 0.0017, + "num_input_tokens_seen": 195760048, + "step": 90625 + }, + { + "epoch": 14.784665579119086, + "grad_norm": 0.002343985252082348, + "learning_rate": 0.0001932361059492111, + "loss": 0.0037, + "num_input_tokens_seen": 195771632, + "step": 90630 + }, + { + "epoch": 14.785481239804241, + "grad_norm": 0.009995860978960991, + "learning_rate": 0.00019317990026447164, + "loss": 0.0018, + "num_input_tokens_seen": 195781520, + "step": 90635 + }, + { + "epoch": 14.786296900489397, + "grad_norm": 0.0010385174537077546, + "learning_rate": 0.00019312370079783075, + "loss": 0.0154, + "num_input_tokens_seen": 195793648, + "step": 90640 + }, + { + "epoch": 14.78711256117455, + "grad_norm": 0.055788811296224594, + "learning_rate": 0.0001930675075504274, + "loss": 0.0134, + "num_input_tokens_seen": 195802928, + "step": 90645 + }, + { + "epoch": 14.787928221859707, + "grad_norm": 0.010686542838811874, + "learning_rate": 0.00019301132052340031, + "loss": 0.0212, + "num_input_tokens_seen": 195814352, + "step": 90650 + }, + { + "epoch": 14.78874388254486, + "grad_norm": 0.014930814504623413, + "learning_rate": 0.0001929551397178883, + "loss": 0.0014, + "num_input_tokens_seen": 195825232, + "step": 90655 + }, + { + "epoch": 14.789559543230016, + "grad_norm": 0.00482224440202117, + "learning_rate": 0.00019289896513502991, + "loss": 0.0022, + "num_input_tokens_seen": 195836880, + "step": 90660 + }, + { + "epoch": 14.790375203915172, + "grad_norm": 0.7234705090522766, + "learning_rate": 0.00019284279677596355, + "loss": 0.0291, + "num_input_tokens_seen": 195847952, + "step": 90665 + }, + { + "epoch": 14.791190864600326, + "grad_norm": 0.005445053800940514, + "learning_rate": 0.0001927866346418276, + "loss": 0.0556, + "num_input_tokens_seen": 195858960, + "step": 90670 + }, + { + "epoch": 14.792006525285482, + "grad_norm": 0.8060768842697144, + "learning_rate": 0.00019273047873376005, + "loss": 0.0477, + "num_input_tokens_seen": 195869264, + "step": 90675 + }, + { + "epoch": 14.792822185970635, + "grad_norm": 0.3666798770427704, + "learning_rate": 0.00019267432905289945, + "loss": 0.0736, + "num_input_tokens_seen": 195880368, + "step": 90680 + }, + { + "epoch": 14.793637846655791, + "grad_norm": 0.03639459237456322, + "learning_rate": 0.00019261818560038313, + "loss": 0.0041, + "num_input_tokens_seen": 195890416, + "step": 90685 + }, + { + "epoch": 14.794453507340947, + "grad_norm": 0.010454765520989895, + "learning_rate": 0.00019256204837734937, + "loss": 0.0109, + "num_input_tokens_seen": 195900464, + "step": 90690 + }, + { + "epoch": 14.7952691680261, + "grad_norm": 0.004342780914157629, + "learning_rate": 0.00019250591738493572, + "loss": 0.0071, + "num_input_tokens_seen": 195911536, + "step": 90695 + }, + { + "epoch": 14.796084828711257, + "grad_norm": 0.055346082895994186, + "learning_rate": 0.00019244979262427974, + "loss": 0.0048, + "num_input_tokens_seen": 195922704, + "step": 90700 + }, + { + "epoch": 14.79690048939641, + "grad_norm": 0.015100730583071709, + "learning_rate": 0.00019239367409651893, + "loss": 0.0173, + "num_input_tokens_seen": 195933296, + "step": 90705 + }, + { + "epoch": 14.797716150081566, + "grad_norm": 0.015843048691749573, + "learning_rate": 0.00019233756180279043, + "loss": 0.0058, + "num_input_tokens_seen": 195942768, + "step": 90710 + }, + { + "epoch": 14.798531810766722, + "grad_norm": 0.012738611549139023, + "learning_rate": 0.00019228145574423162, + "loss": 0.0016, + "num_input_tokens_seen": 195953808, + "step": 90715 + }, + { + "epoch": 14.799347471451876, + "grad_norm": 0.6077333688735962, + "learning_rate": 0.00019222535592197944, + "loss": 0.1671, + "num_input_tokens_seen": 195964464, + "step": 90720 + }, + { + "epoch": 14.800163132137031, + "grad_norm": 0.30826112627983093, + "learning_rate": 0.00019216926233717085, + "loss": 0.0347, + "num_input_tokens_seen": 195975664, + "step": 90725 + }, + { + "epoch": 14.800978792822185, + "grad_norm": 0.13063177466392517, + "learning_rate": 0.0001921131749909427, + "loss": 0.0084, + "num_input_tokens_seen": 195986320, + "step": 90730 + }, + { + "epoch": 14.801794453507341, + "grad_norm": 0.014707125723361969, + "learning_rate": 0.00019205709388443165, + "loss": 0.0025, + "num_input_tokens_seen": 195996784, + "step": 90735 + }, + { + "epoch": 14.802610114192497, + "grad_norm": 0.003288878360763192, + "learning_rate": 0.00019200101901877426, + "loss": 0.0039, + "num_input_tokens_seen": 196008304, + "step": 90740 + }, + { + "epoch": 14.80342577487765, + "grad_norm": 0.011088044382631779, + "learning_rate": 0.0001919449503951069, + "loss": 0.0785, + "num_input_tokens_seen": 196018864, + "step": 90745 + }, + { + "epoch": 14.804241435562806, + "grad_norm": 0.0074303289875388145, + "learning_rate": 0.00019188888801456594, + "loss": 0.0039, + "num_input_tokens_seen": 196029648, + "step": 90750 + }, + { + "epoch": 14.80505709624796, + "grad_norm": 0.0014132543001323938, + "learning_rate": 0.0001918328318782875, + "loss": 0.0068, + "num_input_tokens_seen": 196039568, + "step": 90755 + }, + { + "epoch": 14.805872756933116, + "grad_norm": 0.013476034626364708, + "learning_rate": 0.00019177678198740766, + "loss": 0.0211, + "num_input_tokens_seen": 196051280, + "step": 90760 + }, + { + "epoch": 14.80668841761827, + "grad_norm": 0.03059094212949276, + "learning_rate": 0.00019172073834306235, + "loss": 0.0076, + "num_input_tokens_seen": 196062928, + "step": 90765 + }, + { + "epoch": 14.807504078303426, + "grad_norm": 0.04523979872465134, + "learning_rate": 0.00019166470094638739, + "loss": 0.0035, + "num_input_tokens_seen": 196073680, + "step": 90770 + }, + { + "epoch": 14.808319738988581, + "grad_norm": 0.005500610917806625, + "learning_rate": 0.00019160866979851842, + "loss": 0.0064, + "num_input_tokens_seen": 196083984, + "step": 90775 + }, + { + "epoch": 14.809135399673735, + "grad_norm": 0.010868792422115803, + "learning_rate": 0.00019155264490059077, + "loss": 0.0038, + "num_input_tokens_seen": 196093968, + "step": 90780 + }, + { + "epoch": 14.809951060358891, + "grad_norm": 0.022862639278173447, + "learning_rate": 0.00019149662625374042, + "loss": 0.0029, + "num_input_tokens_seen": 196103824, + "step": 90785 + }, + { + "epoch": 14.810766721044045, + "grad_norm": 0.00445165578275919, + "learning_rate": 0.00019144061385910195, + "loss": 0.0017, + "num_input_tokens_seen": 196113104, + "step": 90790 + }, + { + "epoch": 14.8115823817292, + "grad_norm": 0.0044853598810732365, + "learning_rate": 0.00019138460771781125, + "loss": 0.0017, + "num_input_tokens_seen": 196124560, + "step": 90795 + }, + { + "epoch": 14.812398042414356, + "grad_norm": 0.0008106532623060048, + "learning_rate": 0.0001913286078310026, + "loss": 0.0019, + "num_input_tokens_seen": 196134928, + "step": 90800 + }, + { + "epoch": 14.81321370309951, + "grad_norm": 0.0041127754375338554, + "learning_rate": 0.00019127261419981168, + "loss": 0.0035, + "num_input_tokens_seen": 196146352, + "step": 90805 + }, + { + "epoch": 14.814029363784666, + "grad_norm": 0.011095692403614521, + "learning_rate": 0.0001912166268253725, + "loss": 0.0121, + "num_input_tokens_seen": 196156368, + "step": 90810 + }, + { + "epoch": 14.81484502446982, + "grad_norm": 0.005023022647947073, + "learning_rate": 0.0001911606457088204, + "loss": 0.0019, + "num_input_tokens_seen": 196167696, + "step": 90815 + }, + { + "epoch": 14.815660685154976, + "grad_norm": 0.008457830175757408, + "learning_rate": 0.00019110467085128936, + "loss": 0.0026, + "num_input_tokens_seen": 196178512, + "step": 90820 + }, + { + "epoch": 14.81647634584013, + "grad_norm": 0.011036441661417484, + "learning_rate": 0.00019104870225391412, + "loss": 0.0028, + "num_input_tokens_seen": 196188432, + "step": 90825 + }, + { + "epoch": 14.817292006525285, + "grad_norm": 0.036544330418109894, + "learning_rate": 0.0001909927399178289, + "loss": 0.0047, + "num_input_tokens_seen": 196199184, + "step": 90830 + }, + { + "epoch": 14.818107667210441, + "grad_norm": 0.026460448279976845, + "learning_rate": 0.0001909367838441678, + "loss": 0.0035, + "num_input_tokens_seen": 196209840, + "step": 90835 + }, + { + "epoch": 14.818923327895595, + "grad_norm": 0.3815443813800812, + "learning_rate": 0.00019088083403406486, + "loss": 0.0906, + "num_input_tokens_seen": 196221584, + "step": 90840 + }, + { + "epoch": 14.81973898858075, + "grad_norm": 0.009295003488659859, + "learning_rate": 0.00019082489048865393, + "loss": 0.0016, + "num_input_tokens_seen": 196233328, + "step": 90845 + }, + { + "epoch": 14.820554649265905, + "grad_norm": 1.4162017107009888, + "learning_rate": 0.00019076895320906885, + "loss": 0.0921, + "num_input_tokens_seen": 196244144, + "step": 90850 + }, + { + "epoch": 14.82137030995106, + "grad_norm": 0.003560206387192011, + "learning_rate": 0.0001907130221964432, + "loss": 0.0157, + "num_input_tokens_seen": 196254704, + "step": 90855 + }, + { + "epoch": 14.822185970636216, + "grad_norm": 0.006497807335108519, + "learning_rate": 0.0001906570974519105, + "loss": 0.0937, + "num_input_tokens_seen": 196265648, + "step": 90860 + }, + { + "epoch": 14.82300163132137, + "grad_norm": 0.0016851173713803291, + "learning_rate": 0.00019060117897660417, + "loss": 0.0042, + "num_input_tokens_seen": 196276368, + "step": 90865 + }, + { + "epoch": 14.823817292006526, + "grad_norm": 0.4738226532936096, + "learning_rate": 0.00019054526677165744, + "loss": 0.142, + "num_input_tokens_seen": 196287696, + "step": 90870 + }, + { + "epoch": 14.82463295269168, + "grad_norm": 0.00117388810031116, + "learning_rate": 0.00019048936083820346, + "loss": 0.0048, + "num_input_tokens_seen": 196298352, + "step": 90875 + }, + { + "epoch": 14.825448613376835, + "grad_norm": 0.0461152046918869, + "learning_rate": 0.00019043346117737526, + "loss": 0.0053, + "num_input_tokens_seen": 196309296, + "step": 90880 + }, + { + "epoch": 14.826264274061991, + "grad_norm": 0.0075841969810426235, + "learning_rate": 0.00019037756779030545, + "loss": 0.0027, + "num_input_tokens_seen": 196319568, + "step": 90885 + }, + { + "epoch": 14.827079934747145, + "grad_norm": 0.013668366707861423, + "learning_rate": 0.00019032168067812738, + "loss": 0.0026, + "num_input_tokens_seen": 196330352, + "step": 90890 + }, + { + "epoch": 14.8278955954323, + "grad_norm": 0.0021600218024104834, + "learning_rate": 0.00019026579984197296, + "loss": 0.0052, + "num_input_tokens_seen": 196341296, + "step": 90895 + }, + { + "epoch": 14.828711256117455, + "grad_norm": 0.010189131833612919, + "learning_rate": 0.00019020992528297537, + "loss": 0.0042, + "num_input_tokens_seen": 196352560, + "step": 90900 + }, + { + "epoch": 14.82952691680261, + "grad_norm": 0.043539393693208694, + "learning_rate": 0.0001901540570022663, + "loss": 0.0047, + "num_input_tokens_seen": 196363312, + "step": 90905 + }, + { + "epoch": 14.830342577487766, + "grad_norm": 0.022573234513401985, + "learning_rate": 0.0001900981950009787, + "loss": 0.0865, + "num_input_tokens_seen": 196374352, + "step": 90910 + }, + { + "epoch": 14.83115823817292, + "grad_norm": 0.004776300862431526, + "learning_rate": 0.00019004233928024395, + "loss": 0.0066, + "num_input_tokens_seen": 196384656, + "step": 90915 + }, + { + "epoch": 14.831973898858076, + "grad_norm": 0.02242133766412735, + "learning_rate": 0.0001899864898411947, + "loss": 0.002, + "num_input_tokens_seen": 196396240, + "step": 90920 + }, + { + "epoch": 14.83278955954323, + "grad_norm": 0.003272157395258546, + "learning_rate": 0.00018993064668496225, + "loss": 0.0036, + "num_input_tokens_seen": 196405808, + "step": 90925 + }, + { + "epoch": 14.833605220228385, + "grad_norm": 0.004656730219721794, + "learning_rate": 0.00018987480981267892, + "loss": 0.002, + "num_input_tokens_seen": 196417104, + "step": 90930 + }, + { + "epoch": 14.83442088091354, + "grad_norm": 0.0012749811867251992, + "learning_rate": 0.00018981897922547565, + "loss": 0.0258, + "num_input_tokens_seen": 196427952, + "step": 90935 + }, + { + "epoch": 14.835236541598695, + "grad_norm": 0.15742255747318268, + "learning_rate": 0.00018976315492448453, + "loss": 0.0064, + "num_input_tokens_seen": 196439024, + "step": 90940 + }, + { + "epoch": 14.83605220228385, + "grad_norm": 0.013149394653737545, + "learning_rate": 0.00018970733691083637, + "loss": 0.0031, + "num_input_tokens_seen": 196449584, + "step": 90945 + }, + { + "epoch": 14.836867862969005, + "grad_norm": 0.057830024510622025, + "learning_rate": 0.000189651525185663, + "loss": 0.0076, + "num_input_tokens_seen": 196460528, + "step": 90950 + }, + { + "epoch": 14.83768352365416, + "grad_norm": 0.001897350768558681, + "learning_rate": 0.00018959571975009481, + "loss": 0.0017, + "num_input_tokens_seen": 196471856, + "step": 90955 + }, + { + "epoch": 14.838499184339314, + "grad_norm": 0.6547830104827881, + "learning_rate": 0.00018953992060526348, + "loss": 0.0512, + "num_input_tokens_seen": 196482096, + "step": 90960 + }, + { + "epoch": 14.83931484502447, + "grad_norm": 0.07979469001293182, + "learning_rate": 0.00018948412775229918, + "loss": 0.0674, + "num_input_tokens_seen": 196492272, + "step": 90965 + }, + { + "epoch": 14.840130505709626, + "grad_norm": 0.00814758613705635, + "learning_rate": 0.0001894283411923331, + "loss": 0.0046, + "num_input_tokens_seen": 196501872, + "step": 90970 + }, + { + "epoch": 14.84094616639478, + "grad_norm": 0.001418950268998742, + "learning_rate": 0.0001893725609264957, + "loss": 0.0048, + "num_input_tokens_seen": 196514000, + "step": 90975 + }, + { + "epoch": 14.841761827079935, + "grad_norm": 0.0042955600656569, + "learning_rate": 0.00018931678695591742, + "loss": 0.0545, + "num_input_tokens_seen": 196524400, + "step": 90980 + }, + { + "epoch": 14.84257748776509, + "grad_norm": 0.0012596363667398691, + "learning_rate": 0.00018926101928172856, + "loss": 0.0149, + "num_input_tokens_seen": 196535472, + "step": 90985 + }, + { + "epoch": 14.843393148450245, + "grad_norm": 0.007855188101530075, + "learning_rate": 0.00018920525790505933, + "loss": 0.0038, + "num_input_tokens_seen": 196546576, + "step": 90990 + }, + { + "epoch": 14.844208809135399, + "grad_norm": 0.016350040212273598, + "learning_rate": 0.00018914950282703985, + "loss": 0.0028, + "num_input_tokens_seen": 196557808, + "step": 90995 + }, + { + "epoch": 14.845024469820554, + "grad_norm": 0.06966894865036011, + "learning_rate": 0.00018909375404879998, + "loss": 0.0054, + "num_input_tokens_seen": 196568112, + "step": 91000 + }, + { + "epoch": 14.84584013050571, + "grad_norm": 0.01267112884670496, + "learning_rate": 0.00018903801157146965, + "loss": 0.0114, + "num_input_tokens_seen": 196579856, + "step": 91005 + }, + { + "epoch": 14.846655791190864, + "grad_norm": 0.012737995944917202, + "learning_rate": 0.00018898227539617852, + "loss": 0.0022, + "num_input_tokens_seen": 196591152, + "step": 91010 + }, + { + "epoch": 14.84747145187602, + "grad_norm": 0.07722505927085876, + "learning_rate": 0.0001889265455240561, + "loss": 0.0092, + "num_input_tokens_seen": 196601136, + "step": 91015 + }, + { + "epoch": 14.848287112561174, + "grad_norm": 0.18848595023155212, + "learning_rate": 0.00018887082195623167, + "loss": 0.0077, + "num_input_tokens_seen": 196612112, + "step": 91020 + }, + { + "epoch": 14.84910277324633, + "grad_norm": 0.002311618998646736, + "learning_rate": 0.00018881510469383506, + "loss": 0.0181, + "num_input_tokens_seen": 196621040, + "step": 91025 + }, + { + "epoch": 14.849918433931485, + "grad_norm": 0.019284280017018318, + "learning_rate": 0.00018875939373799483, + "loss": 0.0037, + "num_input_tokens_seen": 196631632, + "step": 91030 + }, + { + "epoch": 14.850734094616639, + "grad_norm": 0.25687074661254883, + "learning_rate": 0.00018870368908984063, + "loss": 0.0112, + "num_input_tokens_seen": 196641904, + "step": 91035 + }, + { + "epoch": 14.851549755301795, + "grad_norm": 0.009678049944341183, + "learning_rate": 0.00018864799075050078, + "loss": 0.0269, + "num_input_tokens_seen": 196652624, + "step": 91040 + }, + { + "epoch": 14.852365415986949, + "grad_norm": 0.1337086260318756, + "learning_rate": 0.00018859229872110467, + "loss": 0.1025, + "num_input_tokens_seen": 196663152, + "step": 91045 + }, + { + "epoch": 14.853181076672104, + "grad_norm": 0.6179363131523132, + "learning_rate": 0.00018853661300278034, + "loss": 0.0868, + "num_input_tokens_seen": 196674384, + "step": 91050 + }, + { + "epoch": 14.85399673735726, + "grad_norm": 0.006021140608936548, + "learning_rate": 0.00018848093359665703, + "loss": 0.0043, + "num_input_tokens_seen": 196684528, + "step": 91055 + }, + { + "epoch": 14.854812398042414, + "grad_norm": 0.44819143414497375, + "learning_rate": 0.0001884252605038624, + "loss": 0.0606, + "num_input_tokens_seen": 196695600, + "step": 91060 + }, + { + "epoch": 14.85562805872757, + "grad_norm": 0.001804789761081338, + "learning_rate": 0.00018836959372552553, + "loss": 0.0017, + "num_input_tokens_seen": 196706224, + "step": 91065 + }, + { + "epoch": 14.856443719412724, + "grad_norm": 0.01530960202217102, + "learning_rate": 0.0001883139332627738, + "loss": 0.003, + "num_input_tokens_seen": 196716432, + "step": 91070 + }, + { + "epoch": 14.85725938009788, + "grad_norm": 0.4987078607082367, + "learning_rate": 0.00018825827911673592, + "loss": 0.1173, + "num_input_tokens_seen": 196728016, + "step": 91075 + }, + { + "epoch": 14.858075040783035, + "grad_norm": 0.008473776280879974, + "learning_rate": 0.0001882026312885392, + "loss": 0.0012, + "num_input_tokens_seen": 196738544, + "step": 91080 + }, + { + "epoch": 14.858890701468189, + "grad_norm": 0.49949583411216736, + "learning_rate": 0.00018814698977931204, + "loss": 0.0146, + "num_input_tokens_seen": 196749072, + "step": 91085 + }, + { + "epoch": 14.859706362153345, + "grad_norm": 0.03785784915089607, + "learning_rate": 0.0001880913545901814, + "loss": 0.0025, + "num_input_tokens_seen": 196759280, + "step": 91090 + }, + { + "epoch": 14.860522022838499, + "grad_norm": 0.013276136480271816, + "learning_rate": 0.00018803572572227546, + "loss": 0.0053, + "num_input_tokens_seen": 196770832, + "step": 91095 + }, + { + "epoch": 14.861337683523654, + "grad_norm": 0.00885779783129692, + "learning_rate": 0.000187980103176721, + "loss": 0.0979, + "num_input_tokens_seen": 196780944, + "step": 91100 + }, + { + "epoch": 14.86215334420881, + "grad_norm": 0.07836762815713882, + "learning_rate": 0.0001879244869546457, + "loss": 0.018, + "num_input_tokens_seen": 196792464, + "step": 91105 + }, + { + "epoch": 14.862969004893964, + "grad_norm": 0.0056850542314350605, + "learning_rate": 0.00018786887705717658, + "loss": 0.0039, + "num_input_tokens_seen": 196803216, + "step": 91110 + }, + { + "epoch": 14.86378466557912, + "grad_norm": 0.003115960629656911, + "learning_rate": 0.00018781327348544065, + "loss": 0.0021, + "num_input_tokens_seen": 196814032, + "step": 91115 + }, + { + "epoch": 14.864600326264274, + "grad_norm": 0.005707794800400734, + "learning_rate": 0.00018775767624056472, + "loss": 0.0128, + "num_input_tokens_seen": 196825072, + "step": 91120 + }, + { + "epoch": 14.86541598694943, + "grad_norm": 0.10761536657810211, + "learning_rate": 0.0001877020853236756, + "loss": 0.0104, + "num_input_tokens_seen": 196835888, + "step": 91125 + }, + { + "epoch": 14.866231647634583, + "grad_norm": 0.0006443029851652682, + "learning_rate": 0.00018764650073589995, + "loss": 0.005, + "num_input_tokens_seen": 196846544, + "step": 91130 + }, + { + "epoch": 14.867047308319739, + "grad_norm": 0.004941369406878948, + "learning_rate": 0.0001875909224783642, + "loss": 0.0057, + "num_input_tokens_seen": 196857168, + "step": 91135 + }, + { + "epoch": 14.867862969004895, + "grad_norm": 0.031196700409054756, + "learning_rate": 0.00018753535055219468, + "loss": 0.0199, + "num_input_tokens_seen": 196867760, + "step": 91140 + }, + { + "epoch": 14.868678629690049, + "grad_norm": 0.011366274207830429, + "learning_rate": 0.0001874797849585177, + "loss": 0.1045, + "num_input_tokens_seen": 196878928, + "step": 91145 + }, + { + "epoch": 14.869494290375204, + "grad_norm": 1.00128972530365, + "learning_rate": 0.00018742422569845935, + "loss": 0.0436, + "num_input_tokens_seen": 196888944, + "step": 91150 + }, + { + "epoch": 14.870309951060358, + "grad_norm": 0.45802080631256104, + "learning_rate": 0.00018736867277314556, + "loss": 0.0808, + "num_input_tokens_seen": 196898960, + "step": 91155 + }, + { + "epoch": 14.871125611745514, + "grad_norm": 0.016989484429359436, + "learning_rate": 0.00018731312618370228, + "loss": 0.0442, + "num_input_tokens_seen": 196909808, + "step": 91160 + }, + { + "epoch": 14.87194127243067, + "grad_norm": 0.002204073593020439, + "learning_rate": 0.0001872575859312549, + "loss": 0.007, + "num_input_tokens_seen": 196919728, + "step": 91165 + }, + { + "epoch": 14.872756933115824, + "grad_norm": 0.006088990718126297, + "learning_rate": 0.00018720205201692975, + "loss": 0.0027, + "num_input_tokens_seen": 196929456, + "step": 91170 + }, + { + "epoch": 14.87357259380098, + "grad_norm": 0.01547847967594862, + "learning_rate": 0.00018714652444185137, + "loss": 0.0046, + "num_input_tokens_seen": 196939248, + "step": 91175 + }, + { + "epoch": 14.874388254486133, + "grad_norm": 0.02324049361050129, + "learning_rate": 0.00018709100320714594, + "loss": 0.0148, + "num_input_tokens_seen": 196950544, + "step": 91180 + }, + { + "epoch": 14.875203915171289, + "grad_norm": 0.6144483089447021, + "learning_rate": 0.00018703548831393795, + "loss": 0.0567, + "num_input_tokens_seen": 196961232, + "step": 91185 + }, + { + "epoch": 14.876019575856443, + "grad_norm": 0.006280634086579084, + "learning_rate": 0.00018697997976335317, + "loss": 0.0671, + "num_input_tokens_seen": 196971536, + "step": 91190 + }, + { + "epoch": 14.876835236541599, + "grad_norm": 0.015296096913516521, + "learning_rate": 0.0001869244775565158, + "loss": 0.0144, + "num_input_tokens_seen": 196982160, + "step": 91195 + }, + { + "epoch": 14.877650897226754, + "grad_norm": 0.003120235400274396, + "learning_rate": 0.00018686898169455147, + "loss": 0.0078, + "num_input_tokens_seen": 196992944, + "step": 91200 + }, + { + "epoch": 14.878466557911908, + "grad_norm": 0.043369755148887634, + "learning_rate": 0.00018681349217858408, + "loss": 0.019, + "num_input_tokens_seen": 197003792, + "step": 91205 + }, + { + "epoch": 14.879282218597064, + "grad_norm": 0.004237358458340168, + "learning_rate": 0.00018675800900973876, + "loss": 0.004, + "num_input_tokens_seen": 197014960, + "step": 91210 + }, + { + "epoch": 14.880097879282218, + "grad_norm": 0.0031959593761712313, + "learning_rate": 0.00018670253218913975, + "loss": 0.0043, + "num_input_tokens_seen": 197025552, + "step": 91215 + }, + { + "epoch": 14.880913539967374, + "grad_norm": 0.008891470730304718, + "learning_rate": 0.00018664706171791134, + "loss": 0.0025, + "num_input_tokens_seen": 197036752, + "step": 91220 + }, + { + "epoch": 14.88172920065253, + "grad_norm": 0.00029773113783448935, + "learning_rate": 0.0001865915975971778, + "loss": 0.0059, + "num_input_tokens_seen": 197047024, + "step": 91225 + }, + { + "epoch": 14.882544861337683, + "grad_norm": 0.005044011864811182, + "learning_rate": 0.00018653613982806311, + "loss": 0.0023, + "num_input_tokens_seen": 197057392, + "step": 91230 + }, + { + "epoch": 14.883360522022839, + "grad_norm": 0.009152603335678577, + "learning_rate": 0.0001864806884116912, + "loss": 0.0021, + "num_input_tokens_seen": 197066896, + "step": 91235 + }, + { + "epoch": 14.884176182707993, + "grad_norm": 0.004073834978044033, + "learning_rate": 0.00018642524334918582, + "loss": 0.002, + "num_input_tokens_seen": 197079248, + "step": 91240 + }, + { + "epoch": 14.884991843393149, + "grad_norm": 0.0020526114385575056, + "learning_rate": 0.00018636980464167076, + "loss": 0.0039, + "num_input_tokens_seen": 197090288, + "step": 91245 + }, + { + "epoch": 14.885807504078304, + "grad_norm": 0.0018403942231088877, + "learning_rate": 0.00018631437229026942, + "loss": 0.002, + "num_input_tokens_seen": 197100784, + "step": 91250 + }, + { + "epoch": 14.886623164763458, + "grad_norm": 0.0024247504770755768, + "learning_rate": 0.0001862589462961053, + "loss": 0.0106, + "num_input_tokens_seen": 197112752, + "step": 91255 + }, + { + "epoch": 14.887438825448614, + "grad_norm": 0.049593936651945114, + "learning_rate": 0.0001862035266603016, + "loss": 0.0054, + "num_input_tokens_seen": 197123920, + "step": 91260 + }, + { + "epoch": 14.888254486133768, + "grad_norm": 0.009330598637461662, + "learning_rate": 0.00018614811338398153, + "loss": 0.0018, + "num_input_tokens_seen": 197133584, + "step": 91265 + }, + { + "epoch": 14.889070146818923, + "grad_norm": 0.012051105499267578, + "learning_rate": 0.0001860927064682681, + "loss": 0.0025, + "num_input_tokens_seen": 197143856, + "step": 91270 + }, + { + "epoch": 14.88988580750408, + "grad_norm": 0.06999044865369797, + "learning_rate": 0.0001860373059142842, + "loss": 0.0062, + "num_input_tokens_seen": 197154640, + "step": 91275 + }, + { + "epoch": 14.890701468189233, + "grad_norm": 0.0663766860961914, + "learning_rate": 0.00018598191172315253, + "loss": 0.0036, + "num_input_tokens_seen": 197165648, + "step": 91280 + }, + { + "epoch": 14.891517128874389, + "grad_norm": 0.07517898827791214, + "learning_rate": 0.00018592652389599583, + "loss": 0.0964, + "num_input_tokens_seen": 197176496, + "step": 91285 + }, + { + "epoch": 14.892332789559543, + "grad_norm": 0.002310275798663497, + "learning_rate": 0.00018587114243393655, + "loss": 0.0023, + "num_input_tokens_seen": 197186992, + "step": 91290 + }, + { + "epoch": 14.893148450244698, + "grad_norm": 0.6070193648338318, + "learning_rate": 0.00018581576733809707, + "loss": 0.0378, + "num_input_tokens_seen": 197197680, + "step": 91295 + }, + { + "epoch": 14.893964110929852, + "grad_norm": 0.004368333145976067, + "learning_rate": 0.00018576039860959966, + "loss": 0.0024, + "num_input_tokens_seen": 197209168, + "step": 91300 + }, + { + "epoch": 14.894779771615008, + "grad_norm": 0.06271976977586746, + "learning_rate": 0.00018570503624956635, + "loss": 0.0054, + "num_input_tokens_seen": 197219728, + "step": 91305 + }, + { + "epoch": 14.895595432300164, + "grad_norm": 0.05428668111562729, + "learning_rate": 0.00018564968025911905, + "loss": 0.0048, + "num_input_tokens_seen": 197229744, + "step": 91310 + }, + { + "epoch": 14.896411092985318, + "grad_norm": 0.005725554656237364, + "learning_rate": 0.00018559433063937997, + "loss": 0.0023, + "num_input_tokens_seen": 197240816, + "step": 91315 + }, + { + "epoch": 14.897226753670473, + "grad_norm": 0.00881729181855917, + "learning_rate": 0.00018553898739147057, + "loss": 0.0167, + "num_input_tokens_seen": 197252464, + "step": 91320 + }, + { + "epoch": 14.898042414355627, + "grad_norm": 0.001196861732751131, + "learning_rate": 0.00018548365051651255, + "loss": 0.0038, + "num_input_tokens_seen": 197262352, + "step": 91325 + }, + { + "epoch": 14.898858075040783, + "grad_norm": 0.031206313520669937, + "learning_rate": 0.00018542832001562732, + "loss": 0.0029, + "num_input_tokens_seen": 197273520, + "step": 91330 + }, + { + "epoch": 14.899673735725939, + "grad_norm": 0.20482459664344788, + "learning_rate": 0.00018537299588993627, + "loss": 0.1099, + "num_input_tokens_seen": 197283920, + "step": 91335 + }, + { + "epoch": 14.900489396411093, + "grad_norm": 0.007990190759301186, + "learning_rate": 0.0001853176781405606, + "loss": 0.0922, + "num_input_tokens_seen": 197294448, + "step": 91340 + }, + { + "epoch": 14.901305057096248, + "grad_norm": 0.0065262895077466965, + "learning_rate": 0.00018526236676862134, + "loss": 0.0049, + "num_input_tokens_seen": 197304016, + "step": 91345 + }, + { + "epoch": 14.902120717781402, + "grad_norm": 0.004501709248870611, + "learning_rate": 0.00018520706177523955, + "loss": 0.0022, + "num_input_tokens_seen": 197314544, + "step": 91350 + }, + { + "epoch": 14.902936378466558, + "grad_norm": 0.0008994314703159034, + "learning_rate": 0.000185151763161536, + "loss": 0.0356, + "num_input_tokens_seen": 197325808, + "step": 91355 + }, + { + "epoch": 14.903752039151712, + "grad_norm": 0.5988969206809998, + "learning_rate": 0.0001850964709286313, + "loss": 0.0955, + "num_input_tokens_seen": 197338352, + "step": 91360 + }, + { + "epoch": 14.904567699836868, + "grad_norm": 0.015295770950615406, + "learning_rate": 0.00018504118507764618, + "loss": 0.0065, + "num_input_tokens_seen": 197348976, + "step": 91365 + }, + { + "epoch": 14.905383360522023, + "grad_norm": 0.015201558358967304, + "learning_rate": 0.00018498590560970098, + "loss": 0.0028, + "num_input_tokens_seen": 197359472, + "step": 91370 + }, + { + "epoch": 14.906199021207177, + "grad_norm": 0.07447752356529236, + "learning_rate": 0.00018493063252591596, + "loss": 0.0136, + "num_input_tokens_seen": 197369776, + "step": 91375 + }, + { + "epoch": 14.907014681892333, + "grad_norm": 0.004664100240916014, + "learning_rate": 0.00018487536582741142, + "loss": 0.0034, + "num_input_tokens_seen": 197379536, + "step": 91380 + }, + { + "epoch": 14.907830342577487, + "grad_norm": 0.004202402196824551, + "learning_rate": 0.00018482010551530736, + "loss": 0.0065, + "num_input_tokens_seen": 197390160, + "step": 91385 + }, + { + "epoch": 14.908646003262643, + "grad_norm": 0.013313495554029942, + "learning_rate": 0.00018476485159072371, + "loss": 0.0083, + "num_input_tokens_seen": 197401008, + "step": 91390 + }, + { + "epoch": 14.909461663947798, + "grad_norm": 0.0597468763589859, + "learning_rate": 0.0001847096040547802, + "loss": 0.0085, + "num_input_tokens_seen": 197413520, + "step": 91395 + }, + { + "epoch": 14.910277324632952, + "grad_norm": 0.008388367481529713, + "learning_rate": 0.00018465436290859662, + "loss": 0.0088, + "num_input_tokens_seen": 197424240, + "step": 91400 + }, + { + "epoch": 14.911092985318108, + "grad_norm": 0.0027642918284982443, + "learning_rate": 0.00018459912815329234, + "loss": 0.087, + "num_input_tokens_seen": 197436816, + "step": 91405 + }, + { + "epoch": 14.911908646003262, + "grad_norm": 0.0009764356655068696, + "learning_rate": 0.00018454389978998686, + "loss": 0.0932, + "num_input_tokens_seen": 197448016, + "step": 91410 + }, + { + "epoch": 14.912724306688418, + "grad_norm": 0.0351264625787735, + "learning_rate": 0.00018448867781979943, + "loss": 0.0526, + "num_input_tokens_seen": 197457744, + "step": 91415 + }, + { + "epoch": 14.913539967373573, + "grad_norm": 0.008587658405303955, + "learning_rate": 0.00018443346224384906, + "loss": 0.0168, + "num_input_tokens_seen": 197469264, + "step": 91420 + }, + { + "epoch": 14.914355628058727, + "grad_norm": 0.005530293099582195, + "learning_rate": 0.00018437825306325524, + "loss": 0.0301, + "num_input_tokens_seen": 197478960, + "step": 91425 + }, + { + "epoch": 14.915171288743883, + "grad_norm": 0.030203763395547867, + "learning_rate": 0.00018432305027913615, + "loss": 0.002, + "num_input_tokens_seen": 197489616, + "step": 91430 + }, + { + "epoch": 14.915986949429037, + "grad_norm": 0.009578707627952099, + "learning_rate": 0.00018426785389261124, + "loss": 0.0161, + "num_input_tokens_seen": 197499376, + "step": 91435 + }, + { + "epoch": 14.916802610114193, + "grad_norm": 0.07021234184503555, + "learning_rate": 0.00018421266390479846, + "loss": 0.004, + "num_input_tokens_seen": 197509552, + "step": 91440 + }, + { + "epoch": 14.917618270799348, + "grad_norm": 0.0011171189835295081, + "learning_rate": 0.00018415748031681706, + "loss": 0.0352, + "num_input_tokens_seen": 197520656, + "step": 91445 + }, + { + "epoch": 14.918433931484502, + "grad_norm": 0.04959210380911827, + "learning_rate": 0.0001841023031297846, + "loss": 0.0396, + "num_input_tokens_seen": 197532432, + "step": 91450 + }, + { + "epoch": 14.919249592169658, + "grad_norm": 0.0012606215896084905, + "learning_rate": 0.0001840471323448199, + "loss": 0.0313, + "num_input_tokens_seen": 197543056, + "step": 91455 + }, + { + "epoch": 14.920065252854812, + "grad_norm": 0.0010699069825932384, + "learning_rate": 0.00018399196796304085, + "loss": 0.0691, + "num_input_tokens_seen": 197552880, + "step": 91460 + }, + { + "epoch": 14.920880913539968, + "grad_norm": 0.020792873576283455, + "learning_rate": 0.0001839368099855655, + "loss": 0.0429, + "num_input_tokens_seen": 197564560, + "step": 91465 + }, + { + "epoch": 14.921696574225122, + "grad_norm": 0.003776568453758955, + "learning_rate": 0.00018388165841351162, + "loss": 0.009, + "num_input_tokens_seen": 197575312, + "step": 91470 + }, + { + "epoch": 14.922512234910277, + "grad_norm": 0.09122282266616821, + "learning_rate": 0.000183826513247997, + "loss": 0.0097, + "num_input_tokens_seen": 197585264, + "step": 91475 + }, + { + "epoch": 14.923327895595433, + "grad_norm": 0.10566361993551254, + "learning_rate": 0.0001837713744901391, + "loss": 0.0121, + "num_input_tokens_seen": 197595728, + "step": 91480 + }, + { + "epoch": 14.924143556280587, + "grad_norm": 0.5424984693527222, + "learning_rate": 0.00018371624214105553, + "loss": 0.0201, + "num_input_tokens_seen": 197605296, + "step": 91485 + }, + { + "epoch": 14.924959216965743, + "grad_norm": 0.4760046601295471, + "learning_rate": 0.00018366111620186348, + "loss": 0.0254, + "num_input_tokens_seen": 197615792, + "step": 91490 + }, + { + "epoch": 14.925774877650896, + "grad_norm": 0.04116629436612129, + "learning_rate": 0.0001836059966736803, + "loss": 0.0137, + "num_input_tokens_seen": 197627312, + "step": 91495 + }, + { + "epoch": 14.926590538336052, + "grad_norm": 0.008484826423227787, + "learning_rate": 0.0001835508835576229, + "loss": 0.0026, + "num_input_tokens_seen": 197637584, + "step": 91500 + }, + { + "epoch": 14.927406199021208, + "grad_norm": 0.033912546932697296, + "learning_rate": 0.00018349577685480834, + "loss": 0.0092, + "num_input_tokens_seen": 197648560, + "step": 91505 + }, + { + "epoch": 14.928221859706362, + "grad_norm": 0.0016596310306340456, + "learning_rate": 0.0001834406765663534, + "loss": 0.0371, + "num_input_tokens_seen": 197659536, + "step": 91510 + }, + { + "epoch": 14.929037520391518, + "grad_norm": 0.034390322864055634, + "learning_rate": 0.00018338558269337464, + "loss": 0.0025, + "num_input_tokens_seen": 197669904, + "step": 91515 + }, + { + "epoch": 14.929853181076671, + "grad_norm": 0.2875620722770691, + "learning_rate": 0.00018333049523698876, + "loss": 0.0116, + "num_input_tokens_seen": 197679280, + "step": 91520 + }, + { + "epoch": 14.930668841761827, + "grad_norm": 0.03757292404770851, + "learning_rate": 0.00018327541419831196, + "loss": 0.0045, + "num_input_tokens_seen": 197690480, + "step": 91525 + }, + { + "epoch": 14.931484502446983, + "grad_norm": 0.008096279576420784, + "learning_rate": 0.00018322033957846097, + "loss": 0.0276, + "num_input_tokens_seen": 197701712, + "step": 91530 + }, + { + "epoch": 14.932300163132137, + "grad_norm": 0.06404435634613037, + "learning_rate": 0.00018316527137855138, + "loss": 0.0048, + "num_input_tokens_seen": 197712560, + "step": 91535 + }, + { + "epoch": 14.933115823817293, + "grad_norm": 0.0034364284947514534, + "learning_rate": 0.00018311020959969982, + "loss": 0.0017, + "num_input_tokens_seen": 197722640, + "step": 91540 + }, + { + "epoch": 14.933931484502446, + "grad_norm": 0.013405256904661655, + "learning_rate": 0.0001830551542430215, + "loss": 0.0016, + "num_input_tokens_seen": 197733616, + "step": 91545 + }, + { + "epoch": 14.934747145187602, + "grad_norm": 0.12012229114770889, + "learning_rate": 0.0001830001053096329, + "loss": 0.0743, + "num_input_tokens_seen": 197745552, + "step": 91550 + }, + { + "epoch": 14.935562805872756, + "grad_norm": 0.6721145510673523, + "learning_rate": 0.000182945062800649, + "loss": 0.0137, + "num_input_tokens_seen": 197755632, + "step": 91555 + }, + { + "epoch": 14.936378466557912, + "grad_norm": 0.021277163177728653, + "learning_rate": 0.0001828900267171859, + "loss": 0.0022, + "num_input_tokens_seen": 197765552, + "step": 91560 + }, + { + "epoch": 14.937194127243067, + "grad_norm": 0.14915668964385986, + "learning_rate": 0.0001828349970603584, + "loss": 0.0112, + "num_input_tokens_seen": 197776720, + "step": 91565 + }, + { + "epoch": 14.938009787928221, + "grad_norm": 0.0062555489130318165, + "learning_rate": 0.00018277997383128237, + "loss": 0.013, + "num_input_tokens_seen": 197787824, + "step": 91570 + }, + { + "epoch": 14.938825448613377, + "grad_norm": 0.005437719635665417, + "learning_rate": 0.00018272495703107222, + "loss": 0.0073, + "num_input_tokens_seen": 197797808, + "step": 91575 + }, + { + "epoch": 14.939641109298531, + "grad_norm": 0.0006717185606248677, + "learning_rate": 0.00018266994666084368, + "loss": 0.0265, + "num_input_tokens_seen": 197808624, + "step": 91580 + }, + { + "epoch": 14.940456769983687, + "grad_norm": 0.2600712776184082, + "learning_rate": 0.0001826149427217109, + "loss": 0.0136, + "num_input_tokens_seen": 197819824, + "step": 91585 + }, + { + "epoch": 14.941272430668842, + "grad_norm": 0.006795211229473352, + "learning_rate": 0.00018255994521478925, + "loss": 0.0013, + "num_input_tokens_seen": 197830800, + "step": 91590 + }, + { + "epoch": 14.942088091353996, + "grad_norm": 0.0022412503603845835, + "learning_rate": 0.00018250495414119273, + "loss": 0.0129, + "num_input_tokens_seen": 197841968, + "step": 91595 + }, + { + "epoch": 14.942903752039152, + "grad_norm": 0.10791927576065063, + "learning_rate": 0.0001824499695020362, + "loss": 0.0108, + "num_input_tokens_seen": 197852368, + "step": 91600 + }, + { + "epoch": 14.943719412724306, + "grad_norm": 0.005554559174925089, + "learning_rate": 0.0001823949912984339, + "loss": 0.0261, + "num_input_tokens_seen": 197862960, + "step": 91605 + }, + { + "epoch": 14.944535073409462, + "grad_norm": 0.007660820614546537, + "learning_rate": 0.00018234001953149997, + "loss": 0.0042, + "num_input_tokens_seen": 197874192, + "step": 91610 + }, + { + "epoch": 14.945350734094617, + "grad_norm": 0.0357113778591156, + "learning_rate": 0.00018228505420234858, + "loss": 0.0021, + "num_input_tokens_seen": 197885680, + "step": 91615 + }, + { + "epoch": 14.946166394779771, + "grad_norm": 0.0036698810290545225, + "learning_rate": 0.00018223009531209355, + "loss": 0.0027, + "num_input_tokens_seen": 197897040, + "step": 91620 + }, + { + "epoch": 14.946982055464927, + "grad_norm": 0.4504069685935974, + "learning_rate": 0.00018217514286184884, + "loss": 0.0453, + "num_input_tokens_seen": 197907952, + "step": 91625 + }, + { + "epoch": 14.947797716150081, + "grad_norm": 0.000682205252815038, + "learning_rate": 0.00018212019685272802, + "loss": 0.0193, + "num_input_tokens_seen": 197920048, + "step": 91630 + }, + { + "epoch": 14.948613376835237, + "grad_norm": 0.004739740863442421, + "learning_rate": 0.00018206525728584462, + "loss": 0.0064, + "num_input_tokens_seen": 197930672, + "step": 91635 + }, + { + "epoch": 14.949429037520392, + "grad_norm": 0.004977010656148195, + "learning_rate": 0.00018201032416231217, + "loss": 0.056, + "num_input_tokens_seen": 197942352, + "step": 91640 + }, + { + "epoch": 14.950244698205546, + "grad_norm": 0.6323051452636719, + "learning_rate": 0.00018195539748324386, + "loss": 0.0118, + "num_input_tokens_seen": 197952784, + "step": 91645 + }, + { + "epoch": 14.951060358890702, + "grad_norm": 0.02068396657705307, + "learning_rate": 0.00018190047724975271, + "loss": 0.0027, + "num_input_tokens_seen": 197963696, + "step": 91650 + }, + { + "epoch": 14.951876019575856, + "grad_norm": 0.0008221376338042319, + "learning_rate": 0.00018184556346295233, + "loss": 0.0018, + "num_input_tokens_seen": 197974960, + "step": 91655 + }, + { + "epoch": 14.952691680261012, + "grad_norm": 0.0007419289904646575, + "learning_rate": 0.00018179065612395484, + "loss": 0.0033, + "num_input_tokens_seen": 197985904, + "step": 91660 + }, + { + "epoch": 14.953507340946166, + "grad_norm": 0.23194904625415802, + "learning_rate": 0.0001817357552338737, + "loss": 0.0095, + "num_input_tokens_seen": 197996688, + "step": 91665 + }, + { + "epoch": 14.954323001631321, + "grad_norm": 0.00299538834951818, + "learning_rate": 0.0001816808607938209, + "loss": 0.0029, + "num_input_tokens_seen": 198007888, + "step": 91670 + }, + { + "epoch": 14.955138662316477, + "grad_norm": 0.01290897186845541, + "learning_rate": 0.00018162597280490966, + "loss": 0.0474, + "num_input_tokens_seen": 198019632, + "step": 91675 + }, + { + "epoch": 14.955954323001631, + "grad_norm": 0.00599845964461565, + "learning_rate": 0.00018157109126825156, + "loss": 0.0041, + "num_input_tokens_seen": 198029936, + "step": 91680 + }, + { + "epoch": 14.956769983686787, + "grad_norm": 0.005180804058909416, + "learning_rate": 0.0001815162161849596, + "loss": 0.0019, + "num_input_tokens_seen": 198041264, + "step": 91685 + }, + { + "epoch": 14.95758564437194, + "grad_norm": 0.01789415255188942, + "learning_rate": 0.00018146134755614524, + "loss": 0.0017, + "num_input_tokens_seen": 198052528, + "step": 91690 + }, + { + "epoch": 14.958401305057096, + "grad_norm": 0.003401143942028284, + "learning_rate": 0.0001814064853829211, + "loss": 0.0025, + "num_input_tokens_seen": 198063216, + "step": 91695 + }, + { + "epoch": 14.959216965742252, + "grad_norm": 0.03557276725769043, + "learning_rate": 0.00018135162966639835, + "loss": 0.0114, + "num_input_tokens_seen": 198074608, + "step": 91700 + }, + { + "epoch": 14.960032626427406, + "grad_norm": 0.0038898277562111616, + "learning_rate": 0.00018129678040768938, + "loss": 0.0034, + "num_input_tokens_seen": 198085680, + "step": 91705 + }, + { + "epoch": 14.960848287112562, + "grad_norm": 0.010417568497359753, + "learning_rate": 0.00018124193760790514, + "loss": 0.0051, + "num_input_tokens_seen": 198095888, + "step": 91710 + }, + { + "epoch": 14.961663947797716, + "grad_norm": 0.01605057343840599, + "learning_rate": 0.00018118710126815773, + "loss": 0.0442, + "num_input_tokens_seen": 198106576, + "step": 91715 + }, + { + "epoch": 14.962479608482871, + "grad_norm": 0.00376236904412508, + "learning_rate": 0.00018113227138955785, + "loss": 0.0061, + "num_input_tokens_seen": 198116368, + "step": 91720 + }, + { + "epoch": 14.963295269168025, + "grad_norm": 0.4293442368507385, + "learning_rate": 0.00018107744797321728, + "loss": 0.2416, + "num_input_tokens_seen": 198125552, + "step": 91725 + }, + { + "epoch": 14.964110929853181, + "grad_norm": 0.0034157487098127604, + "learning_rate": 0.00018102263102024653, + "loss": 0.0025, + "num_input_tokens_seen": 198136912, + "step": 91730 + }, + { + "epoch": 14.964926590538337, + "grad_norm": 0.1372259259223938, + "learning_rate": 0.00018096782053175715, + "loss": 0.0083, + "num_input_tokens_seen": 198146800, + "step": 91735 + }, + { + "epoch": 14.96574225122349, + "grad_norm": 0.7950195074081421, + "learning_rate": 0.00018091301650885922, + "loss": 0.1111, + "num_input_tokens_seen": 198157648, + "step": 91740 + }, + { + "epoch": 14.966557911908646, + "grad_norm": 0.012486966326832771, + "learning_rate": 0.00018085821895266402, + "loss": 0.0917, + "num_input_tokens_seen": 198170384, + "step": 91745 + }, + { + "epoch": 14.9673735725938, + "grad_norm": 0.00040594261372461915, + "learning_rate": 0.00018080342786428184, + "loss": 0.0339, + "num_input_tokens_seen": 198181104, + "step": 91750 + }, + { + "epoch": 14.968189233278956, + "grad_norm": 0.0035222459118813276, + "learning_rate": 0.00018074864324482315, + "loss": 0.0014, + "num_input_tokens_seen": 198192176, + "step": 91755 + }, + { + "epoch": 14.969004893964112, + "grad_norm": 0.9425178170204163, + "learning_rate": 0.0001806938650953982, + "loss": 0.0278, + "num_input_tokens_seen": 198203312, + "step": 91760 + }, + { + "epoch": 14.969820554649266, + "grad_norm": 0.005051793530583382, + "learning_rate": 0.00018063909341711716, + "loss": 0.0075, + "num_input_tokens_seen": 198213904, + "step": 91765 + }, + { + "epoch": 14.970636215334421, + "grad_norm": 0.0014896744396537542, + "learning_rate": 0.00018058432821109, + "loss": 0.0026, + "num_input_tokens_seen": 198225424, + "step": 91770 + }, + { + "epoch": 14.971451876019575, + "grad_norm": 0.011105705052614212, + "learning_rate": 0.00018052956947842665, + "loss": 0.0018, + "num_input_tokens_seen": 198236496, + "step": 91775 + }, + { + "epoch": 14.97226753670473, + "grad_norm": 0.003602163400501013, + "learning_rate": 0.0001804748172202368, + "loss": 0.0106, + "num_input_tokens_seen": 198248208, + "step": 91780 + }, + { + "epoch": 14.973083197389887, + "grad_norm": 0.002370339585468173, + "learning_rate": 0.00018042007143763018, + "loss": 0.0085, + "num_input_tokens_seen": 198259824, + "step": 91785 + }, + { + "epoch": 14.97389885807504, + "grad_norm": 0.005036045331507921, + "learning_rate": 0.00018036533213171618, + "loss": 0.1786, + "num_input_tokens_seen": 198271856, + "step": 91790 + }, + { + "epoch": 14.974714518760196, + "grad_norm": 0.027367407456040382, + "learning_rate": 0.0001803105993036041, + "loss": 0.0068, + "num_input_tokens_seen": 198282672, + "step": 91795 + }, + { + "epoch": 14.97553017944535, + "grad_norm": 0.0048852418549358845, + "learning_rate": 0.0001802558729544036, + "loss": 0.0082, + "num_input_tokens_seen": 198293552, + "step": 91800 + }, + { + "epoch": 14.976345840130506, + "grad_norm": 0.1048927903175354, + "learning_rate": 0.0001802011530852231, + "loss": 0.0703, + "num_input_tokens_seen": 198304272, + "step": 91805 + }, + { + "epoch": 14.977161500815662, + "grad_norm": 0.723358690738678, + "learning_rate": 0.00018014643969717231, + "loss": 0.1453, + "num_input_tokens_seen": 198314992, + "step": 91810 + }, + { + "epoch": 14.977977161500815, + "grad_norm": 0.0012425847817212343, + "learning_rate": 0.0001800917327913593, + "loss": 0.0109, + "num_input_tokens_seen": 198326352, + "step": 91815 + }, + { + "epoch": 14.978792822185971, + "grad_norm": 0.03811126574873924, + "learning_rate": 0.0001800370323688935, + "loss": 0.0044, + "num_input_tokens_seen": 198336816, + "step": 91820 + }, + { + "epoch": 14.979608482871125, + "grad_norm": 0.11717061698436737, + "learning_rate": 0.00017998233843088284, + "loss": 0.0076, + "num_input_tokens_seen": 198347536, + "step": 91825 + }, + { + "epoch": 14.98042414355628, + "grad_norm": 0.010843515396118164, + "learning_rate": 0.00017992765097843639, + "loss": 0.0031, + "num_input_tokens_seen": 198358576, + "step": 91830 + }, + { + "epoch": 14.981239804241435, + "grad_norm": 0.0015287426067516208, + "learning_rate": 0.00017987297001266172, + "loss": 0.002, + "num_input_tokens_seen": 198370064, + "step": 91835 + }, + { + "epoch": 14.98205546492659, + "grad_norm": 0.005267042201012373, + "learning_rate": 0.00017981829553466783, + "loss": 0.0157, + "num_input_tokens_seen": 198380656, + "step": 91840 + }, + { + "epoch": 14.982871125611746, + "grad_norm": 0.0034920715261250734, + "learning_rate": 0.00017976362754556203, + "loss": 0.0132, + "num_input_tokens_seen": 198392112, + "step": 91845 + }, + { + "epoch": 14.9836867862969, + "grad_norm": 0.005846341140568256, + "learning_rate": 0.0001797089660464527, + "loss": 0.0132, + "num_input_tokens_seen": 198403440, + "step": 91850 + }, + { + "epoch": 14.984502446982056, + "grad_norm": 0.004615858197212219, + "learning_rate": 0.00017965431103844753, + "loss": 0.0016, + "num_input_tokens_seen": 198413712, + "step": 91855 + }, + { + "epoch": 14.98531810766721, + "grad_norm": 0.00345953949727118, + "learning_rate": 0.00017959966252265407, + "loss": 0.0108, + "num_input_tokens_seen": 198425424, + "step": 91860 + }, + { + "epoch": 14.986133768352365, + "grad_norm": 0.04297134652733803, + "learning_rate": 0.00017954502050018, + "loss": 0.0034, + "num_input_tokens_seen": 198435920, + "step": 91865 + }, + { + "epoch": 14.986949429037521, + "grad_norm": 0.0170292966067791, + "learning_rate": 0.00017949038497213255, + "loss": 0.002, + "num_input_tokens_seen": 198448304, + "step": 91870 + }, + { + "epoch": 14.987765089722675, + "grad_norm": 0.0019456454319879413, + "learning_rate": 0.0001794357559396191, + "loss": 0.0047, + "num_input_tokens_seen": 198458512, + "step": 91875 + }, + { + "epoch": 14.98858075040783, + "grad_norm": 0.0030888323672115803, + "learning_rate": 0.00017938113340374662, + "loss": 0.0058, + "num_input_tokens_seen": 198469936, + "step": 91880 + }, + { + "epoch": 14.989396411092985, + "grad_norm": 0.008779522962868214, + "learning_rate": 0.00017932651736562226, + "loss": 0.0049, + "num_input_tokens_seen": 198480688, + "step": 91885 + }, + { + "epoch": 14.99021207177814, + "grad_norm": 0.1324496865272522, + "learning_rate": 0.00017927190782635283, + "loss": 0.1448, + "num_input_tokens_seen": 198492240, + "step": 91890 + }, + { + "epoch": 14.991027732463294, + "grad_norm": 0.11609335243701935, + "learning_rate": 0.00017921730478704506, + "loss": 0.0059, + "num_input_tokens_seen": 198502352, + "step": 91895 + }, + { + "epoch": 14.99184339314845, + "grad_norm": 0.004886234644800425, + "learning_rate": 0.0001791627082488056, + "loss": 0.0674, + "num_input_tokens_seen": 198514224, + "step": 91900 + }, + { + "epoch": 14.992659053833606, + "grad_norm": 0.03304458037018776, + "learning_rate": 0.00017910811821274082, + "loss": 0.008, + "num_input_tokens_seen": 198524720, + "step": 91905 + }, + { + "epoch": 14.99347471451876, + "grad_norm": 0.004803449381142855, + "learning_rate": 0.0001790535346799571, + "loss": 0.002, + "num_input_tokens_seen": 198536176, + "step": 91910 + }, + { + "epoch": 14.994290375203915, + "grad_norm": 0.004887313582003117, + "learning_rate": 0.00017899895765156065, + "loss": 0.0038, + "num_input_tokens_seen": 198547952, + "step": 91915 + }, + { + "epoch": 14.99510603588907, + "grad_norm": 0.0403841994702816, + "learning_rate": 0.00017894438712865753, + "loss": 0.0041, + "num_input_tokens_seen": 198559216, + "step": 91920 + }, + { + "epoch": 14.995921696574225, + "grad_norm": 0.0004432197310961783, + "learning_rate": 0.00017888982311235375, + "loss": 0.018, + "num_input_tokens_seen": 198570256, + "step": 91925 + }, + { + "epoch": 14.99673735725938, + "grad_norm": 0.04184507951140404, + "learning_rate": 0.00017883526560375502, + "loss": 0.0037, + "num_input_tokens_seen": 198581808, + "step": 91930 + }, + { + "epoch": 14.997553017944535, + "grad_norm": 0.01710711419582367, + "learning_rate": 0.00017878071460396706, + "loss": 0.0048, + "num_input_tokens_seen": 198593040, + "step": 91935 + }, + { + "epoch": 14.99836867862969, + "grad_norm": 0.1359815001487732, + "learning_rate": 0.0001787261701140952, + "loss": 0.0072, + "num_input_tokens_seen": 198601776, + "step": 91940 + }, + { + "epoch": 14.999184339314844, + "grad_norm": 0.32246536016464233, + "learning_rate": 0.00017867163213524545, + "loss": 0.0294, + "num_input_tokens_seen": 198611472, + "step": 91945 + }, + { + "epoch": 15.0, + "grad_norm": 0.08324826508760452, + "learning_rate": 0.00017861710066852237, + "loss": 0.1494, + "num_input_tokens_seen": 198621168, + "step": 91950 + }, + { + "epoch": 15.0, + "eval_loss": 0.2368113100528717, + "eval_runtime": 104.2294, + "eval_samples_per_second": 26.144, + "eval_steps_per_second": 6.543, + "num_input_tokens_seen": 198621168, + "step": 91950 + }, + { + "epoch": 15.000815660685156, + "grad_norm": 0.15005184710025787, + "learning_rate": 0.00017856257571503164, + "loss": 0.0103, + "num_input_tokens_seen": 198632528, + "step": 91955 + }, + { + "epoch": 15.00163132137031, + "grad_norm": 0.026279544457793236, + "learning_rate": 0.00017850805727587804, + "loss": 0.0176, + "num_input_tokens_seen": 198643888, + "step": 91960 + }, + { + "epoch": 15.002446982055465, + "grad_norm": 0.0065717375837266445, + "learning_rate": 0.00017845354535216658, + "loss": 0.004, + "num_input_tokens_seen": 198654416, + "step": 91965 + }, + { + "epoch": 15.00326264274062, + "grad_norm": 0.0031500456389039755, + "learning_rate": 0.00017839903994500185, + "loss": 0.0419, + "num_input_tokens_seen": 198666960, + "step": 91970 + }, + { + "epoch": 15.004078303425775, + "grad_norm": 0.005922715645283461, + "learning_rate": 0.0001783445410554886, + "loss": 0.0014, + "num_input_tokens_seen": 198677808, + "step": 91975 + }, + { + "epoch": 15.00489396411093, + "grad_norm": 0.0009123813943006098, + "learning_rate": 0.00017829004868473124, + "loss": 0.004, + "num_input_tokens_seen": 198688272, + "step": 91980 + }, + { + "epoch": 15.005709624796085, + "grad_norm": 0.009499759413301945, + "learning_rate": 0.00017823556283383418, + "loss": 0.0097, + "num_input_tokens_seen": 198699504, + "step": 91985 + }, + { + "epoch": 15.00652528548124, + "grad_norm": 0.0025711434427648783, + "learning_rate": 0.0001781810835039016, + "loss": 0.0026, + "num_input_tokens_seen": 198710544, + "step": 91990 + }, + { + "epoch": 15.007340946166394, + "grad_norm": 0.0020499620586633682, + "learning_rate": 0.0001781266106960377, + "loss": 0.0034, + "num_input_tokens_seen": 198720496, + "step": 91995 + }, + { + "epoch": 15.00815660685155, + "grad_norm": 0.015157933346927166, + "learning_rate": 0.00017807214441134628, + "loss": 0.0031, + "num_input_tokens_seen": 198729744, + "step": 92000 + }, + { + "epoch": 15.008972267536704, + "grad_norm": 0.01744169555604458, + "learning_rate": 0.00017801768465093126, + "loss": 0.0027, + "num_input_tokens_seen": 198740400, + "step": 92005 + }, + { + "epoch": 15.00978792822186, + "grad_norm": 0.0031720127444714308, + "learning_rate": 0.00017796323141589638, + "loss": 0.0011, + "num_input_tokens_seen": 198752368, + "step": 92010 + }, + { + "epoch": 15.010603588907015, + "grad_norm": 0.001125030335970223, + "learning_rate": 0.00017790878470734506, + "loss": 0.0023, + "num_input_tokens_seen": 198762704, + "step": 92015 + }, + { + "epoch": 15.01141924959217, + "grad_norm": 0.07347182184457779, + "learning_rate": 0.0001778543445263809, + "loss": 0.0078, + "num_input_tokens_seen": 198772656, + "step": 92020 + }, + { + "epoch": 15.012234910277325, + "grad_norm": 0.00362205458804965, + "learning_rate": 0.00017779991087410707, + "loss": 0.0066, + "num_input_tokens_seen": 198782864, + "step": 92025 + }, + { + "epoch": 15.013050570962479, + "grad_norm": 0.00837631057947874, + "learning_rate": 0.0001777454837516268, + "loss": 0.0309, + "num_input_tokens_seen": 198793776, + "step": 92030 + }, + { + "epoch": 15.013866231647635, + "grad_norm": 1.850484848022461, + "learning_rate": 0.00017769106316004314, + "loss": 0.0401, + "num_input_tokens_seen": 198804496, + "step": 92035 + }, + { + "epoch": 15.01468189233279, + "grad_norm": 0.009433651342988014, + "learning_rate": 0.0001776366491004589, + "loss": 0.1475, + "num_input_tokens_seen": 198815216, + "step": 92040 + }, + { + "epoch": 15.015497553017944, + "grad_norm": 0.004046480171382427, + "learning_rate": 0.00017758224157397696, + "loss": 0.0097, + "num_input_tokens_seen": 198825296, + "step": 92045 + }, + { + "epoch": 15.0163132137031, + "grad_norm": 0.06106355041265488, + "learning_rate": 0.00017752784058169992, + "loss": 0.0051, + "num_input_tokens_seen": 198836080, + "step": 92050 + }, + { + "epoch": 15.017128874388254, + "grad_norm": 0.029603945091366768, + "learning_rate": 0.00017747344612473022, + "loss": 0.0028, + "num_input_tokens_seen": 198847696, + "step": 92055 + }, + { + "epoch": 15.01794453507341, + "grad_norm": 0.0066187456250190735, + "learning_rate": 0.00017741905820417014, + "loss": 0.0053, + "num_input_tokens_seen": 198860080, + "step": 92060 + }, + { + "epoch": 15.018760195758565, + "grad_norm": 0.009210659191012383, + "learning_rate": 0.00017736467682112245, + "loss": 0.0023, + "num_input_tokens_seen": 198870640, + "step": 92065 + }, + { + "epoch": 15.01957585644372, + "grad_norm": 0.0005773415905423462, + "learning_rate": 0.00017731030197668847, + "loss": 0.0031, + "num_input_tokens_seen": 198881488, + "step": 92070 + }, + { + "epoch": 15.020391517128875, + "grad_norm": 0.02137085609138012, + "learning_rate": 0.00017725593367197095, + "loss": 0.0033, + "num_input_tokens_seen": 198892944, + "step": 92075 + }, + { + "epoch": 15.021207177814029, + "grad_norm": 0.014607875607907772, + "learning_rate": 0.00017720157190807107, + "loss": 0.0027, + "num_input_tokens_seen": 198903408, + "step": 92080 + }, + { + "epoch": 15.022022838499185, + "grad_norm": 0.003844099584966898, + "learning_rate": 0.00017714721668609095, + "loss": 0.0017, + "num_input_tokens_seen": 198914160, + "step": 92085 + }, + { + "epoch": 15.022838499184338, + "grad_norm": 0.003906742203980684, + "learning_rate": 0.00017709286800713202, + "loss": 0.025, + "num_input_tokens_seen": 198924304, + "step": 92090 + }, + { + "epoch": 15.023654159869494, + "grad_norm": 0.10455742478370667, + "learning_rate": 0.00017703852587229584, + "loss": 0.009, + "num_input_tokens_seen": 198935312, + "step": 92095 + }, + { + "epoch": 15.02446982055465, + "grad_norm": 0.00283225835300982, + "learning_rate": 0.00017698419028268358, + "loss": 0.001, + "num_input_tokens_seen": 198946640, + "step": 92100 + }, + { + "epoch": 15.025285481239804, + "grad_norm": 0.025962086394429207, + "learning_rate": 0.00017692986123939652, + "loss": 0.0076, + "num_input_tokens_seen": 198956656, + "step": 92105 + }, + { + "epoch": 15.02610114192496, + "grad_norm": 0.008207517676055431, + "learning_rate": 0.00017687553874353563, + "loss": 0.0013, + "num_input_tokens_seen": 198968304, + "step": 92110 + }, + { + "epoch": 15.026916802610113, + "grad_norm": 0.019631782546639442, + "learning_rate": 0.0001768212227962019, + "loss": 0.0019, + "num_input_tokens_seen": 198979632, + "step": 92115 + }, + { + "epoch": 15.02773246329527, + "grad_norm": 0.018773145973682404, + "learning_rate": 0.00017676691339849605, + "loss": 0.0069, + "num_input_tokens_seen": 198989808, + "step": 92120 + }, + { + "epoch": 15.028548123980425, + "grad_norm": 0.021854877471923828, + "learning_rate": 0.00017671261055151872, + "loss": 0.0047, + "num_input_tokens_seen": 199001040, + "step": 92125 + }, + { + "epoch": 15.029363784665579, + "grad_norm": 0.05074403062462807, + "learning_rate": 0.00017665831425637052, + "loss": 0.0109, + "num_input_tokens_seen": 199011568, + "step": 92130 + }, + { + "epoch": 15.030179445350734, + "grad_norm": 0.0030274391174316406, + "learning_rate": 0.0001766040245141517, + "loss": 0.0016, + "num_input_tokens_seen": 199023312, + "step": 92135 + }, + { + "epoch": 15.030995106035888, + "grad_norm": 0.0029113576747477055, + "learning_rate": 0.00017654974132596263, + "loss": 0.0733, + "num_input_tokens_seen": 199033296, + "step": 92140 + }, + { + "epoch": 15.031810766721044, + "grad_norm": 0.010478825308382511, + "learning_rate": 0.00017649546469290333, + "loss": 0.0025, + "num_input_tokens_seen": 199043856, + "step": 92145 + }, + { + "epoch": 15.0326264274062, + "grad_norm": 0.0034063730854541063, + "learning_rate": 0.00017644119461607388, + "loss": 0.0018, + "num_input_tokens_seen": 199055056, + "step": 92150 + }, + { + "epoch": 15.033442088091354, + "grad_norm": 0.0036105539184063673, + "learning_rate": 0.0001763869310965741, + "loss": 0.2094, + "num_input_tokens_seen": 199065520, + "step": 92155 + }, + { + "epoch": 15.03425774877651, + "grad_norm": 0.0027978713624179363, + "learning_rate": 0.00017633267413550362, + "loss": 0.0041, + "num_input_tokens_seen": 199076496, + "step": 92160 + }, + { + "epoch": 15.035073409461663, + "grad_norm": 0.0005292710848152637, + "learning_rate": 0.00017627842373396202, + "loss": 0.0058, + "num_input_tokens_seen": 199086096, + "step": 92165 + }, + { + "epoch": 15.035889070146819, + "grad_norm": 0.014616936445236206, + "learning_rate": 0.00017622417989304913, + "loss": 0.0051, + "num_input_tokens_seen": 199096016, + "step": 92170 + }, + { + "epoch": 15.036704730831975, + "grad_norm": 0.010652135126292706, + "learning_rate": 0.0001761699426138636, + "loss": 0.0075, + "num_input_tokens_seen": 199106832, + "step": 92175 + }, + { + "epoch": 15.037520391517129, + "grad_norm": 0.03607923537492752, + "learning_rate": 0.00017611571189750537, + "loss": 0.0137, + "num_input_tokens_seen": 199117680, + "step": 92180 + }, + { + "epoch": 15.038336052202284, + "grad_norm": 0.0021634928416460752, + "learning_rate": 0.00017606148774507274, + "loss": 0.0028, + "num_input_tokens_seen": 199128368, + "step": 92185 + }, + { + "epoch": 15.039151712887438, + "grad_norm": 0.004082173574715853, + "learning_rate": 0.0001760072701576654, + "loss": 0.0655, + "num_input_tokens_seen": 199137872, + "step": 92190 + }, + { + "epoch": 15.039967373572594, + "grad_norm": 0.003854207694530487, + "learning_rate": 0.00017595305913638138, + "loss": 0.0028, + "num_input_tokens_seen": 199148912, + "step": 92195 + }, + { + "epoch": 15.040783034257748, + "grad_norm": 0.027118144556879997, + "learning_rate": 0.00017589885468232002, + "loss": 0.0035, + "num_input_tokens_seen": 199159888, + "step": 92200 + }, + { + "epoch": 15.041598694942904, + "grad_norm": 0.07028216123580933, + "learning_rate": 0.00017584465679657918, + "loss": 0.0059, + "num_input_tokens_seen": 199170768, + "step": 92205 + }, + { + "epoch": 15.04241435562806, + "grad_norm": 0.001322569907642901, + "learning_rate": 0.00017579046548025796, + "loss": 0.0019, + "num_input_tokens_seen": 199182800, + "step": 92210 + }, + { + "epoch": 15.043230016313213, + "grad_norm": 0.008521920070052147, + "learning_rate": 0.00017573628073445393, + "loss": 0.0022, + "num_input_tokens_seen": 199194000, + "step": 92215 + }, + { + "epoch": 15.044045676998369, + "grad_norm": 0.007180997170507908, + "learning_rate": 0.00017568210256026578, + "loss": 0.0328, + "num_input_tokens_seen": 199203888, + "step": 92220 + }, + { + "epoch": 15.044861337683523, + "grad_norm": 0.005620477721095085, + "learning_rate": 0.000175627930958791, + "loss": 0.008, + "num_input_tokens_seen": 199215184, + "step": 92225 + }, + { + "epoch": 15.045676998368679, + "grad_norm": 0.0026418042834848166, + "learning_rate": 0.0001755737659311278, + "loss": 0.0286, + "num_input_tokens_seen": 199225616, + "step": 92230 + }, + { + "epoch": 15.046492659053834, + "grad_norm": 0.004119323566555977, + "learning_rate": 0.00017551960747837382, + "loss": 0.0017, + "num_input_tokens_seen": 199236304, + "step": 92235 + }, + { + "epoch": 15.047308319738988, + "grad_norm": 0.10706538707017899, + "learning_rate": 0.00017546545560162663, + "loss": 0.0644, + "num_input_tokens_seen": 199248496, + "step": 92240 + }, + { + "epoch": 15.048123980424144, + "grad_norm": 0.02891051024198532, + "learning_rate": 0.00017541131030198364, + "loss": 0.0026, + "num_input_tokens_seen": 199258608, + "step": 92245 + }, + { + "epoch": 15.048939641109298, + "grad_norm": 0.00028251283220015466, + "learning_rate": 0.00017535717158054226, + "loss": 0.004, + "num_input_tokens_seen": 199268080, + "step": 92250 + }, + { + "epoch": 15.049755301794454, + "grad_norm": 0.001709071220830083, + "learning_rate": 0.00017530303943839965, + "loss": 0.0129, + "num_input_tokens_seen": 199278800, + "step": 92255 + }, + { + "epoch": 15.05057096247961, + "grad_norm": 0.00671932240948081, + "learning_rate": 0.00017524891387665282, + "loss": 0.1309, + "num_input_tokens_seen": 199289360, + "step": 92260 + }, + { + "epoch": 15.051386623164763, + "grad_norm": 0.05627777799963951, + "learning_rate": 0.00017519479489639877, + "loss": 0.0045, + "num_input_tokens_seen": 199300112, + "step": 92265 + }, + { + "epoch": 15.052202283849919, + "grad_norm": 0.008651613257825375, + "learning_rate": 0.0001751406824987342, + "loss": 0.0092, + "num_input_tokens_seen": 199310768, + "step": 92270 + }, + { + "epoch": 15.053017944535073, + "grad_norm": 0.013139888644218445, + "learning_rate": 0.00017508657668475585, + "loss": 0.0018, + "num_input_tokens_seen": 199320976, + "step": 92275 + }, + { + "epoch": 15.053833605220229, + "grad_norm": 0.015267265029251575, + "learning_rate": 0.00017503247745556, + "loss": 0.003, + "num_input_tokens_seen": 199331984, + "step": 92280 + }, + { + "epoch": 15.054649265905383, + "grad_norm": 0.008942226879298687, + "learning_rate": 0.0001749783848122436, + "loss": 0.0028, + "num_input_tokens_seen": 199343568, + "step": 92285 + }, + { + "epoch": 15.055464926590538, + "grad_norm": 0.003952791448682547, + "learning_rate": 0.0001749242987559022, + "loss": 0.0265, + "num_input_tokens_seen": 199353168, + "step": 92290 + }, + { + "epoch": 15.056280587275694, + "grad_norm": 0.014960017055273056, + "learning_rate": 0.00017487021928763263, + "loss": 0.0047, + "num_input_tokens_seen": 199363728, + "step": 92295 + }, + { + "epoch": 15.057096247960848, + "grad_norm": 0.01710602268576622, + "learning_rate": 0.0001748161464085302, + "loss": 0.0973, + "num_input_tokens_seen": 199374192, + "step": 92300 + }, + { + "epoch": 15.057911908646004, + "grad_norm": 0.07812569290399551, + "learning_rate": 0.00017476208011969142, + "loss": 0.0051, + "num_input_tokens_seen": 199386000, + "step": 92305 + }, + { + "epoch": 15.058727569331158, + "grad_norm": 0.0018170730909332633, + "learning_rate": 0.0001747080204222113, + "loss": 0.0033, + "num_input_tokens_seen": 199396464, + "step": 92310 + }, + { + "epoch": 15.059543230016313, + "grad_norm": 0.021724211052060127, + "learning_rate": 0.00017465396731718619, + "loss": 0.0033, + "num_input_tokens_seen": 199408176, + "step": 92315 + }, + { + "epoch": 15.060358890701469, + "grad_norm": 0.005055665969848633, + "learning_rate": 0.0001745999208057108, + "loss": 0.0024, + "num_input_tokens_seen": 199419088, + "step": 92320 + }, + { + "epoch": 15.061174551386623, + "grad_norm": 0.05920695886015892, + "learning_rate": 0.00017454588088888117, + "loss": 0.0045, + "num_input_tokens_seen": 199430736, + "step": 92325 + }, + { + "epoch": 15.061990212071779, + "grad_norm": 0.0020981167908757925, + "learning_rate": 0.00017449184756779178, + "loss": 0.0041, + "num_input_tokens_seen": 199441456, + "step": 92330 + }, + { + "epoch": 15.062805872756933, + "grad_norm": 0.004952155519276857, + "learning_rate": 0.00017443782084353837, + "loss": 0.002, + "num_input_tokens_seen": 199451664, + "step": 92335 + }, + { + "epoch": 15.063621533442088, + "grad_norm": 0.04457241669297218, + "learning_rate": 0.0001743838007172152, + "loss": 0.004, + "num_input_tokens_seen": 199462800, + "step": 92340 + }, + { + "epoch": 15.064437194127244, + "grad_norm": 0.028170321136713028, + "learning_rate": 0.00017432978718991772, + "loss": 0.0059, + "num_input_tokens_seen": 199473200, + "step": 92345 + }, + { + "epoch": 15.065252854812398, + "grad_norm": 0.015449753031134605, + "learning_rate": 0.00017427578026273988, + "loss": 0.046, + "num_input_tokens_seen": 199484144, + "step": 92350 + }, + { + "epoch": 15.066068515497554, + "grad_norm": 0.0020990390330553055, + "learning_rate": 0.00017422177993677696, + "loss": 0.0032, + "num_input_tokens_seen": 199494800, + "step": 92355 + }, + { + "epoch": 15.066884176182707, + "grad_norm": 0.010104361921548843, + "learning_rate": 0.00017416778621312257, + "loss": 0.01, + "num_input_tokens_seen": 199505776, + "step": 92360 + }, + { + "epoch": 15.067699836867863, + "grad_norm": 0.010933980345726013, + "learning_rate": 0.00017411379909287167, + "loss": 0.0047, + "num_input_tokens_seen": 199516784, + "step": 92365 + }, + { + "epoch": 15.068515497553017, + "grad_norm": 0.18525753915309906, + "learning_rate": 0.00017405981857711772, + "loss": 0.0094, + "num_input_tokens_seen": 199528400, + "step": 92370 + }, + { + "epoch": 15.069331158238173, + "grad_norm": 0.0010363530600443482, + "learning_rate": 0.0001740058446669552, + "loss": 0.002, + "num_input_tokens_seen": 199540400, + "step": 92375 + }, + { + "epoch": 15.070146818923329, + "grad_norm": 0.2711713910102844, + "learning_rate": 0.00017395187736347778, + "loss": 0.01, + "num_input_tokens_seen": 199551408, + "step": 92380 + }, + { + "epoch": 15.070962479608482, + "grad_norm": 0.02081671543419361, + "learning_rate": 0.0001738979166677792, + "loss": 0.0022, + "num_input_tokens_seen": 199560880, + "step": 92385 + }, + { + "epoch": 15.071778140293638, + "grad_norm": 0.001128299511037767, + "learning_rate": 0.00017384396258095304, + "loss": 0.0006, + "num_input_tokens_seen": 199572144, + "step": 92390 + }, + { + "epoch": 15.072593800978792, + "grad_norm": 0.02086414210498333, + "learning_rate": 0.0001737900151040927, + "loss": 0.0079, + "num_input_tokens_seen": 199584048, + "step": 92395 + }, + { + "epoch": 15.073409461663948, + "grad_norm": 0.16544772684574127, + "learning_rate": 0.00017373607423829159, + "loss": 0.0068, + "num_input_tokens_seen": 199595824, + "step": 92400 + }, + { + "epoch": 15.074225122349104, + "grad_norm": 0.04024311900138855, + "learning_rate": 0.00017368213998464278, + "loss": 0.007, + "num_input_tokens_seen": 199607216, + "step": 92405 + }, + { + "epoch": 15.075040783034257, + "grad_norm": 0.000666849547997117, + "learning_rate": 0.00017362821234423936, + "loss": 0.0014, + "num_input_tokens_seen": 199618736, + "step": 92410 + }, + { + "epoch": 15.075856443719413, + "grad_norm": 0.007606880739331245, + "learning_rate": 0.00017357429131817432, + "loss": 0.0337, + "num_input_tokens_seen": 199629648, + "step": 92415 + }, + { + "epoch": 15.076672104404567, + "grad_norm": 0.001018756302073598, + "learning_rate": 0.0001735203769075403, + "loss": 0.0023, + "num_input_tokens_seen": 199639888, + "step": 92420 + }, + { + "epoch": 15.077487765089723, + "grad_norm": 0.00730155361816287, + "learning_rate": 0.00017346646911342985, + "loss": 0.0032, + "num_input_tokens_seen": 199651600, + "step": 92425 + }, + { + "epoch": 15.078303425774878, + "grad_norm": 0.0009757946827448905, + "learning_rate": 0.000173412567936936, + "loss": 0.002, + "num_input_tokens_seen": 199660848, + "step": 92430 + }, + { + "epoch": 15.079119086460032, + "grad_norm": 0.014223473146557808, + "learning_rate": 0.0001733586733791504, + "loss": 0.0021, + "num_input_tokens_seen": 199671184, + "step": 92435 + }, + { + "epoch": 15.079934747145188, + "grad_norm": 0.0027299304492771626, + "learning_rate": 0.000173304785441166, + "loss": 0.0022, + "num_input_tokens_seen": 199681136, + "step": 92440 + }, + { + "epoch": 15.080750407830342, + "grad_norm": 0.0019904670771211386, + "learning_rate": 0.00017325090412407423, + "loss": 0.0115, + "num_input_tokens_seen": 199691344, + "step": 92445 + }, + { + "epoch": 15.081566068515498, + "grad_norm": 0.004478312563151121, + "learning_rate": 0.00017319702942896777, + "loss": 0.005, + "num_input_tokens_seen": 199701456, + "step": 92450 + }, + { + "epoch": 15.082381729200652, + "grad_norm": 0.014595243148505688, + "learning_rate": 0.00017314316135693775, + "loss": 0.0165, + "num_input_tokens_seen": 199710800, + "step": 92455 + }, + { + "epoch": 15.083197389885807, + "grad_norm": 0.0018416885286569595, + "learning_rate": 0.00017308929990907652, + "loss": 0.0033, + "num_input_tokens_seen": 199721072, + "step": 92460 + }, + { + "epoch": 15.084013050570963, + "grad_norm": 0.005210902541875839, + "learning_rate": 0.000173035445086475, + "loss": 0.0015, + "num_input_tokens_seen": 199732816, + "step": 92465 + }, + { + "epoch": 15.084828711256117, + "grad_norm": 0.013290762901306152, + "learning_rate": 0.0001729815968902253, + "loss": 0.0069, + "num_input_tokens_seen": 199743152, + "step": 92470 + }, + { + "epoch": 15.085644371941273, + "grad_norm": 0.0032980344258248806, + "learning_rate": 0.0001729277553214181, + "loss": 0.0101, + "num_input_tokens_seen": 199753680, + "step": 92475 + }, + { + "epoch": 15.086460032626427, + "grad_norm": 0.006125927437096834, + "learning_rate": 0.00017287392038114514, + "loss": 0.0026, + "num_input_tokens_seen": 199764400, + "step": 92480 + }, + { + "epoch": 15.087275693311582, + "grad_norm": 0.00176598085090518, + "learning_rate": 0.00017282009207049686, + "loss": 0.0029, + "num_input_tokens_seen": 199774608, + "step": 92485 + }, + { + "epoch": 15.088091353996738, + "grad_norm": 0.0010756379924714565, + "learning_rate": 0.00017276627039056463, + "loss": 0.0109, + "num_input_tokens_seen": 199784912, + "step": 92490 + }, + { + "epoch": 15.088907014681892, + "grad_norm": 0.020969685167074203, + "learning_rate": 0.00017271245534243912, + "loss": 0.1436, + "num_input_tokens_seen": 199795024, + "step": 92495 + }, + { + "epoch": 15.089722675367048, + "grad_norm": 0.010281615890562534, + "learning_rate": 0.00017265864692721084, + "loss": 0.0009, + "num_input_tokens_seen": 199805904, + "step": 92500 + }, + { + "epoch": 15.090538336052202, + "grad_norm": 0.0017593882512301207, + "learning_rate": 0.00017260484514597035, + "loss": 0.0019, + "num_input_tokens_seen": 199816208, + "step": 92505 + }, + { + "epoch": 15.091353996737357, + "grad_norm": 0.001155554549768567, + "learning_rate": 0.00017255104999980799, + "loss": 0.0048, + "num_input_tokens_seen": 199826544, + "step": 92510 + }, + { + "epoch": 15.092169657422513, + "grad_norm": 0.010744703002274036, + "learning_rate": 0.00017249726148981399, + "loss": 0.0063, + "num_input_tokens_seen": 199837360, + "step": 92515 + }, + { + "epoch": 15.092985318107667, + "grad_norm": 0.011290965601801872, + "learning_rate": 0.00017244347961707852, + "loss": 0.0015, + "num_input_tokens_seen": 199848176, + "step": 92520 + }, + { + "epoch": 15.093800978792823, + "grad_norm": 0.006745223421603441, + "learning_rate": 0.00017238970438269142, + "loss": 0.0046, + "num_input_tokens_seen": 199859344, + "step": 92525 + }, + { + "epoch": 15.094616639477977, + "grad_norm": 0.003182594198733568, + "learning_rate": 0.00017233593578774254, + "loss": 0.0012, + "num_input_tokens_seen": 199869520, + "step": 92530 + }, + { + "epoch": 15.095432300163132, + "grad_norm": 0.1688506007194519, + "learning_rate": 0.00017228217383332163, + "loss": 0.008, + "num_input_tokens_seen": 199880592, + "step": 92535 + }, + { + "epoch": 15.096247960848286, + "grad_norm": 0.0019982391968369484, + "learning_rate": 0.00017222841852051817, + "loss": 0.0033, + "num_input_tokens_seen": 199889936, + "step": 92540 + }, + { + "epoch": 15.097063621533442, + "grad_norm": 0.01286247931420803, + "learning_rate": 0.0001721746698504217, + "loss": 0.0034, + "num_input_tokens_seen": 199899376, + "step": 92545 + }, + { + "epoch": 15.097879282218598, + "grad_norm": 0.0021901631262153387, + "learning_rate": 0.0001721209278241213, + "loss": 0.0013, + "num_input_tokens_seen": 199910640, + "step": 92550 + }, + { + "epoch": 15.098694942903752, + "grad_norm": 0.31956011056900024, + "learning_rate": 0.00017206719244270636, + "loss": 0.0241, + "num_input_tokens_seen": 199921872, + "step": 92555 + }, + { + "epoch": 15.099510603588907, + "grad_norm": 0.024079471826553345, + "learning_rate": 0.00017201346370726572, + "loss": 0.0425, + "num_input_tokens_seen": 199932624, + "step": 92560 + }, + { + "epoch": 15.100326264274061, + "grad_norm": 0.00888835173100233, + "learning_rate": 0.00017195974161888833, + "loss": 0.0023, + "num_input_tokens_seen": 199943664, + "step": 92565 + }, + { + "epoch": 15.101141924959217, + "grad_norm": 0.13645078241825104, + "learning_rate": 0.00017190602617866274, + "loss": 0.0059, + "num_input_tokens_seen": 199953488, + "step": 92570 + }, + { + "epoch": 15.101957585644373, + "grad_norm": 0.004840241279453039, + "learning_rate": 0.0001718523173876781, + "loss": 0.0028, + "num_input_tokens_seen": 199962768, + "step": 92575 + }, + { + "epoch": 15.102773246329527, + "grad_norm": 0.0016001993790268898, + "learning_rate": 0.00017179861524702216, + "loss": 0.0051, + "num_input_tokens_seen": 199974352, + "step": 92580 + }, + { + "epoch": 15.103588907014682, + "grad_norm": 0.004659554921090603, + "learning_rate": 0.000171744919757784, + "loss": 0.0037, + "num_input_tokens_seen": 199983824, + "step": 92585 + }, + { + "epoch": 15.104404567699836, + "grad_norm": 0.010251539759337902, + "learning_rate": 0.00017169123092105115, + "loss": 0.0019, + "num_input_tokens_seen": 199994704, + "step": 92590 + }, + { + "epoch": 15.105220228384992, + "grad_norm": 0.0009150686673820019, + "learning_rate": 0.0001716375487379121, + "loss": 0.0343, + "num_input_tokens_seen": 200007824, + "step": 92595 + }, + { + "epoch": 15.106035889070148, + "grad_norm": 0.0012699142098426819, + "learning_rate": 0.00017158387320945472, + "loss": 0.0291, + "num_input_tokens_seen": 200016880, + "step": 92600 + }, + { + "epoch": 15.106851549755302, + "grad_norm": 0.030986489728093147, + "learning_rate": 0.0001715302043367668, + "loss": 0.0025, + "num_input_tokens_seen": 200028464, + "step": 92605 + }, + { + "epoch": 15.107667210440457, + "grad_norm": 0.007337637711316347, + "learning_rate": 0.00017147654212093595, + "loss": 0.0026, + "num_input_tokens_seen": 200038480, + "step": 92610 + }, + { + "epoch": 15.108482871125611, + "grad_norm": 0.0179149117320776, + "learning_rate": 0.00017142288656304977, + "loss": 0.003, + "num_input_tokens_seen": 200049328, + "step": 92615 + }, + { + "epoch": 15.109298531810767, + "grad_norm": 0.010523026809096336, + "learning_rate": 0.0001713692376641956, + "loss": 0.1059, + "num_input_tokens_seen": 200059728, + "step": 92620 + }, + { + "epoch": 15.11011419249592, + "grad_norm": 0.006119042169302702, + "learning_rate": 0.0001713155954254607, + "loss": 0.0068, + "num_input_tokens_seen": 200069552, + "step": 92625 + }, + { + "epoch": 15.110929853181077, + "grad_norm": 0.01386276911944151, + "learning_rate": 0.00017126195984793225, + "loss": 0.0014, + "num_input_tokens_seen": 200079568, + "step": 92630 + }, + { + "epoch": 15.111745513866232, + "grad_norm": 0.011046777479350567, + "learning_rate": 0.0001712083309326972, + "loss": 0.005, + "num_input_tokens_seen": 200089776, + "step": 92635 + }, + { + "epoch": 15.112561174551386, + "grad_norm": 0.0011883076513186097, + "learning_rate": 0.0001711547086808425, + "loss": 0.0008, + "num_input_tokens_seen": 200101200, + "step": 92640 + }, + { + "epoch": 15.113376835236542, + "grad_norm": 0.01813393086194992, + "learning_rate": 0.00017110109309345468, + "loss": 0.0037, + "num_input_tokens_seen": 200111952, + "step": 92645 + }, + { + "epoch": 15.114192495921696, + "grad_norm": 0.003187261987477541, + "learning_rate": 0.00017104748417162054, + "loss": 0.0046, + "num_input_tokens_seen": 200123184, + "step": 92650 + }, + { + "epoch": 15.115008156606851, + "grad_norm": 0.06230101361870766, + "learning_rate": 0.0001709938819164264, + "loss": 0.0036, + "num_input_tokens_seen": 200134000, + "step": 92655 + }, + { + "epoch": 15.115823817292007, + "grad_norm": 0.08341676741838455, + "learning_rate": 0.00017094028632895863, + "loss": 0.0028, + "num_input_tokens_seen": 200144976, + "step": 92660 + }, + { + "epoch": 15.116639477977161, + "grad_norm": 0.007194597739726305, + "learning_rate": 0.0001708866974103034, + "loss": 0.0021, + "num_input_tokens_seen": 200157008, + "step": 92665 + }, + { + "epoch": 15.117455138662317, + "grad_norm": 0.08438636362552643, + "learning_rate": 0.0001708331151615467, + "loss": 0.124, + "num_input_tokens_seen": 200168656, + "step": 92670 + }, + { + "epoch": 15.11827079934747, + "grad_norm": 0.001630541984923184, + "learning_rate": 0.00017077953958377458, + "loss": 0.0029, + "num_input_tokens_seen": 200180368, + "step": 92675 + }, + { + "epoch": 15.119086460032626, + "grad_norm": 0.001464636530727148, + "learning_rate": 0.0001707259706780727, + "loss": 0.0052, + "num_input_tokens_seen": 200190864, + "step": 92680 + }, + { + "epoch": 15.119902120717782, + "grad_norm": 0.005892497021704912, + "learning_rate": 0.00017067240844552672, + "loss": 0.0014, + "num_input_tokens_seen": 200200528, + "step": 92685 + }, + { + "epoch": 15.120717781402936, + "grad_norm": 0.002837817883118987, + "learning_rate": 0.00017061885288722218, + "loss": 0.0073, + "num_input_tokens_seen": 200212144, + "step": 92690 + }, + { + "epoch": 15.121533442088092, + "grad_norm": 0.014140215702354908, + "learning_rate": 0.00017056530400424446, + "loss": 0.0016, + "num_input_tokens_seen": 200222864, + "step": 92695 + }, + { + "epoch": 15.122349102773246, + "grad_norm": 0.003915575798600912, + "learning_rate": 0.00017051176179767858, + "loss": 0.0063, + "num_input_tokens_seen": 200234320, + "step": 92700 + }, + { + "epoch": 15.123164763458401, + "grad_norm": 0.007146508898586035, + "learning_rate": 0.00017045822626861017, + "loss": 0.0653, + "num_input_tokens_seen": 200244912, + "step": 92705 + }, + { + "epoch": 15.123980424143557, + "grad_norm": 0.06835640966892242, + "learning_rate": 0.00017040469741812353, + "loss": 0.0118, + "num_input_tokens_seen": 200256784, + "step": 92710 + }, + { + "epoch": 15.124796084828711, + "grad_norm": 0.0016879525501281023, + "learning_rate": 0.00017035117524730398, + "loss": 0.0016, + "num_input_tokens_seen": 200267472, + "step": 92715 + }, + { + "epoch": 15.125611745513867, + "grad_norm": 0.005772008560597897, + "learning_rate": 0.00017029765975723604, + "loss": 0.0064, + "num_input_tokens_seen": 200278288, + "step": 92720 + }, + { + "epoch": 15.12642740619902, + "grad_norm": 0.0018967565847560763, + "learning_rate": 0.0001702441509490043, + "loss": 0.0148, + "num_input_tokens_seen": 200288976, + "step": 92725 + }, + { + "epoch": 15.127243066884176, + "grad_norm": 0.02684551104903221, + "learning_rate": 0.00017019064882369317, + "loss": 0.1522, + "num_input_tokens_seen": 200299376, + "step": 92730 + }, + { + "epoch": 15.12805872756933, + "grad_norm": 0.018632011488080025, + "learning_rate": 0.00017013715338238695, + "loss": 0.0055, + "num_input_tokens_seen": 200309680, + "step": 92735 + }, + { + "epoch": 15.128874388254486, + "grad_norm": 0.0026431684382259846, + "learning_rate": 0.00017008366462616976, + "loss": 0.0031, + "num_input_tokens_seen": 200319696, + "step": 92740 + }, + { + "epoch": 15.129690048939642, + "grad_norm": 0.014334792271256447, + "learning_rate": 0.00017003018255612562, + "loss": 0.0027, + "num_input_tokens_seen": 200330864, + "step": 92745 + }, + { + "epoch": 15.130505709624796, + "grad_norm": 0.0024962888564914465, + "learning_rate": 0.00016997670717333846, + "loss": 0.0063, + "num_input_tokens_seen": 200341648, + "step": 92750 + }, + { + "epoch": 15.131321370309951, + "grad_norm": 0.03874950855970383, + "learning_rate": 0.00016992323847889195, + "loss": 0.0116, + "num_input_tokens_seen": 200352816, + "step": 92755 + }, + { + "epoch": 15.132137030995105, + "grad_norm": 0.001166831818409264, + "learning_rate": 0.00016986977647386975, + "loss": 0.0049, + "num_input_tokens_seen": 200364208, + "step": 92760 + }, + { + "epoch": 15.132952691680261, + "grad_norm": 0.11291606724262238, + "learning_rate": 0.00016981632115935536, + "loss": 0.0046, + "num_input_tokens_seen": 200374224, + "step": 92765 + }, + { + "epoch": 15.133768352365417, + "grad_norm": 0.0012088071089237928, + "learning_rate": 0.00016976287253643208, + "loss": 0.0011, + "num_input_tokens_seen": 200386064, + "step": 92770 + }, + { + "epoch": 15.13458401305057, + "grad_norm": 0.0039956653490662575, + "learning_rate": 0.0001697094306061831, + "loss": 0.0059, + "num_input_tokens_seen": 200396240, + "step": 92775 + }, + { + "epoch": 15.135399673735726, + "grad_norm": 0.036707255989313126, + "learning_rate": 0.00016965599536969156, + "loss": 0.0064, + "num_input_tokens_seen": 200406672, + "step": 92780 + }, + { + "epoch": 15.13621533442088, + "grad_norm": 0.016374798491597176, + "learning_rate": 0.00016960256682804032, + "loss": 0.0026, + "num_input_tokens_seen": 200418320, + "step": 92785 + }, + { + "epoch": 15.137030995106036, + "grad_norm": 0.008576109074056149, + "learning_rate": 0.00016954914498231217, + "loss": 0.005, + "num_input_tokens_seen": 200429840, + "step": 92790 + }, + { + "epoch": 15.137846655791192, + "grad_norm": 0.0024898925330489874, + "learning_rate": 0.00016949572983358986, + "loss": 0.0056, + "num_input_tokens_seen": 200440784, + "step": 92795 + }, + { + "epoch": 15.138662316476346, + "grad_norm": 0.014025689102709293, + "learning_rate": 0.0001694423213829558, + "loss": 0.0013, + "num_input_tokens_seen": 200452048, + "step": 92800 + }, + { + "epoch": 15.139477977161501, + "grad_norm": 0.06928585469722748, + "learning_rate": 0.00016938891963149232, + "loss": 0.0084, + "num_input_tokens_seen": 200462000, + "step": 92805 + }, + { + "epoch": 15.140293637846655, + "grad_norm": 0.0028578825294971466, + "learning_rate": 0.00016933552458028213, + "loss": 0.0025, + "num_input_tokens_seen": 200472176, + "step": 92810 + }, + { + "epoch": 15.141109298531811, + "grad_norm": 0.04835427552461624, + "learning_rate": 0.0001692821362304066, + "loss": 0.0047, + "num_input_tokens_seen": 200483216, + "step": 92815 + }, + { + "epoch": 15.141924959216965, + "grad_norm": 0.0028014755807816982, + "learning_rate": 0.00016922875458294856, + "loss": 0.0016, + "num_input_tokens_seen": 200494160, + "step": 92820 + }, + { + "epoch": 15.14274061990212, + "grad_norm": 0.0011742091737687588, + "learning_rate": 0.00016917537963898903, + "loss": 0.0031, + "num_input_tokens_seen": 200504560, + "step": 92825 + }, + { + "epoch": 15.143556280587276, + "grad_norm": 0.007004075683653355, + "learning_rate": 0.0001691220113996105, + "loss": 0.0024, + "num_input_tokens_seen": 200515440, + "step": 92830 + }, + { + "epoch": 15.14437194127243, + "grad_norm": 0.01578759215772152, + "learning_rate": 0.00016906864986589377, + "loss": 0.0013, + "num_input_tokens_seen": 200525712, + "step": 92835 + }, + { + "epoch": 15.145187601957586, + "grad_norm": 0.0015252482844516635, + "learning_rate": 0.00016901529503892098, + "loss": 0.0041, + "num_input_tokens_seen": 200536784, + "step": 92840 + }, + { + "epoch": 15.14600326264274, + "grad_norm": 0.006211650092154741, + "learning_rate": 0.00016896194691977284, + "loss": 0.0032, + "num_input_tokens_seen": 200548176, + "step": 92845 + }, + { + "epoch": 15.146818923327896, + "grad_norm": 0.001195227261632681, + "learning_rate": 0.00016890860550953092, + "loss": 0.0018, + "num_input_tokens_seen": 200559920, + "step": 92850 + }, + { + "epoch": 15.147634584013051, + "grad_norm": 0.00874117948114872, + "learning_rate": 0.00016885527080927616, + "loss": 0.0021, + "num_input_tokens_seen": 200569616, + "step": 92855 + }, + { + "epoch": 15.148450244698205, + "grad_norm": 0.1447891891002655, + "learning_rate": 0.00016880194282008941, + "loss": 0.0041, + "num_input_tokens_seen": 200580464, + "step": 92860 + }, + { + "epoch": 15.149265905383361, + "grad_norm": 0.004833567887544632, + "learning_rate": 0.0001687486215430515, + "loss": 0.0038, + "num_input_tokens_seen": 200592272, + "step": 92865 + }, + { + "epoch": 15.150081566068515, + "grad_norm": 0.009706101380288601, + "learning_rate": 0.0001686953069792429, + "loss": 0.0021, + "num_input_tokens_seen": 200603824, + "step": 92870 + }, + { + "epoch": 15.15089722675367, + "grad_norm": 0.002112502697855234, + "learning_rate": 0.00016864199912974427, + "loss": 0.0032, + "num_input_tokens_seen": 200613584, + "step": 92875 + }, + { + "epoch": 15.151712887438826, + "grad_norm": 0.008575985208153725, + "learning_rate": 0.00016858869799563585, + "loss": 0.0034, + "num_input_tokens_seen": 200624464, + "step": 92880 + }, + { + "epoch": 15.15252854812398, + "grad_norm": 0.07412799447774887, + "learning_rate": 0.0001685354035779979, + "loss": 0.0036, + "num_input_tokens_seen": 200635280, + "step": 92885 + }, + { + "epoch": 15.153344208809136, + "grad_norm": 0.00945495069026947, + "learning_rate": 0.00016848211587791045, + "loss": 0.0007, + "num_input_tokens_seen": 200645968, + "step": 92890 + }, + { + "epoch": 15.15415986949429, + "grad_norm": 0.0040870546363294125, + "learning_rate": 0.00016842883489645355, + "loss": 0.0012, + "num_input_tokens_seen": 200656016, + "step": 92895 + }, + { + "epoch": 15.154975530179446, + "grad_norm": 0.07097148895263672, + "learning_rate": 0.00016837556063470688, + "loss": 0.0024, + "num_input_tokens_seen": 200665968, + "step": 92900 + }, + { + "epoch": 15.1557911908646, + "grad_norm": 0.21168480813503265, + "learning_rate": 0.0001683222930937502, + "loss": 0.0035, + "num_input_tokens_seen": 200677296, + "step": 92905 + }, + { + "epoch": 15.156606851549755, + "grad_norm": 0.009473263286054134, + "learning_rate": 0.00016826903227466284, + "loss": 0.0458, + "num_input_tokens_seen": 200687472, + "step": 92910 + }, + { + "epoch": 15.15742251223491, + "grad_norm": 0.16500073671340942, + "learning_rate": 0.00016821577817852473, + "loss": 0.0242, + "num_input_tokens_seen": 200698704, + "step": 92915 + }, + { + "epoch": 15.158238172920065, + "grad_norm": 0.013035625219345093, + "learning_rate": 0.00016816253080641441, + "loss": 0.0036, + "num_input_tokens_seen": 200708272, + "step": 92920 + }, + { + "epoch": 15.15905383360522, + "grad_norm": 0.08334468305110931, + "learning_rate": 0.00016810929015941174, + "loss": 0.0014, + "num_input_tokens_seen": 200718608, + "step": 92925 + }, + { + "epoch": 15.159869494290374, + "grad_norm": 0.0007003924110904336, + "learning_rate": 0.00016805605623859492, + "loss": 0.0013, + "num_input_tokens_seen": 200730288, + "step": 92930 + }, + { + "epoch": 15.16068515497553, + "grad_norm": 0.003557375865057111, + "learning_rate": 0.0001680028290450436, + "loss": 0.1876, + "num_input_tokens_seen": 200741360, + "step": 92935 + }, + { + "epoch": 15.161500815660686, + "grad_norm": 0.012556234374642372, + "learning_rate": 0.00016794960857983583, + "loss": 0.2872, + "num_input_tokens_seen": 200753968, + "step": 92940 + }, + { + "epoch": 15.16231647634584, + "grad_norm": 0.03704483434557915, + "learning_rate": 0.00016789639484405077, + "loss": 0.0028, + "num_input_tokens_seen": 200764112, + "step": 92945 + }, + { + "epoch": 15.163132137030995, + "grad_norm": 0.002027664100751281, + "learning_rate": 0.00016784318783876623, + "loss": 0.0012, + "num_input_tokens_seen": 200776208, + "step": 92950 + }, + { + "epoch": 15.16394779771615, + "grad_norm": 0.005491977091878653, + "learning_rate": 0.0001677899875650612, + "loss": 0.0114, + "num_input_tokens_seen": 200787152, + "step": 92955 + }, + { + "epoch": 15.164763458401305, + "grad_norm": 0.0017870229203253984, + "learning_rate": 0.00016773679402401321, + "loss": 0.0032, + "num_input_tokens_seen": 200799024, + "step": 92960 + }, + { + "epoch": 15.16557911908646, + "grad_norm": 0.038198426365852356, + "learning_rate": 0.0001676836072167009, + "loss": 0.0023, + "num_input_tokens_seen": 200810032, + "step": 92965 + }, + { + "epoch": 15.166394779771615, + "grad_norm": 0.002829183591529727, + "learning_rate": 0.0001676304271442015, + "loss": 0.0015, + "num_input_tokens_seen": 200820816, + "step": 92970 + }, + { + "epoch": 15.16721044045677, + "grad_norm": 0.19888825714588165, + "learning_rate": 0.00016757725380759354, + "loss": 0.1378, + "num_input_tokens_seen": 200830128, + "step": 92975 + }, + { + "epoch": 15.168026101141924, + "grad_norm": 0.01338116079568863, + "learning_rate": 0.00016752408720795386, + "loss": 0.0066, + "num_input_tokens_seen": 200840336, + "step": 92980 + }, + { + "epoch": 15.16884176182708, + "grad_norm": 0.00568029098212719, + "learning_rate": 0.00016747092734636067, + "loss": 0.0061, + "num_input_tokens_seen": 200851376, + "step": 92985 + }, + { + "epoch": 15.169657422512234, + "grad_norm": 0.1049458459019661, + "learning_rate": 0.0001674177742238906, + "loss": 0.0066, + "num_input_tokens_seen": 200863280, + "step": 92990 + }, + { + "epoch": 15.17047308319739, + "grad_norm": 0.002612957265228033, + "learning_rate": 0.0001673646278416215, + "loss": 0.0015, + "num_input_tokens_seen": 200874224, + "step": 92995 + }, + { + "epoch": 15.171288743882545, + "grad_norm": 0.0045676566660404205, + "learning_rate": 0.00016731148820063013, + "loss": 0.0707, + "num_input_tokens_seen": 200885104, + "step": 93000 + }, + { + "epoch": 15.1721044045677, + "grad_norm": 0.0016079711494967341, + "learning_rate": 0.00016725835530199352, + "loss": 0.0041, + "num_input_tokens_seen": 200895568, + "step": 93005 + }, + { + "epoch": 15.172920065252855, + "grad_norm": 0.002138703130185604, + "learning_rate": 0.00016720522914678843, + "loss": 0.0024, + "num_input_tokens_seen": 200907248, + "step": 93010 + }, + { + "epoch": 15.173735725938009, + "grad_norm": 0.005183606408536434, + "learning_rate": 0.00016715210973609158, + "loss": 0.0016, + "num_input_tokens_seen": 200918064, + "step": 93015 + }, + { + "epoch": 15.174551386623165, + "grad_norm": 0.026629121974110603, + "learning_rate": 0.00016709899707097948, + "loss": 0.0105, + "num_input_tokens_seen": 200928560, + "step": 93020 + }, + { + "epoch": 15.17536704730832, + "grad_norm": 0.018558355048298836, + "learning_rate": 0.0001670458911525285, + "loss": 0.0029, + "num_input_tokens_seen": 200938448, + "step": 93025 + }, + { + "epoch": 15.176182707993474, + "grad_norm": 0.009968779049813747, + "learning_rate": 0.00016699279198181493, + "loss": 0.0095, + "num_input_tokens_seen": 200949488, + "step": 93030 + }, + { + "epoch": 15.17699836867863, + "grad_norm": 0.0031037803273648024, + "learning_rate": 0.00016693969955991483, + "loss": 0.0059, + "num_input_tokens_seen": 200961072, + "step": 93035 + }, + { + "epoch": 15.177814029363784, + "grad_norm": 0.029598917812108994, + "learning_rate": 0.00016688661388790434, + "loss": 0.0806, + "num_input_tokens_seen": 200972752, + "step": 93040 + }, + { + "epoch": 15.17862969004894, + "grad_norm": 0.0018472730880603194, + "learning_rate": 0.00016683353496685895, + "loss": 0.0046, + "num_input_tokens_seen": 200983920, + "step": 93045 + }, + { + "epoch": 15.179445350734095, + "grad_norm": 0.018483439460396767, + "learning_rate": 0.00016678046279785497, + "loss": 0.0418, + "num_input_tokens_seen": 200995408, + "step": 93050 + }, + { + "epoch": 15.18026101141925, + "grad_norm": 0.06471030414104462, + "learning_rate": 0.00016672739738196734, + "loss": 0.0071, + "num_input_tokens_seen": 201005616, + "step": 93055 + }, + { + "epoch": 15.181076672104405, + "grad_norm": 0.000771304068621248, + "learning_rate": 0.0001666743387202721, + "loss": 0.0037, + "num_input_tokens_seen": 201015344, + "step": 93060 + }, + { + "epoch": 15.181892332789559, + "grad_norm": 0.0015893502859398723, + "learning_rate": 0.00016662128681384388, + "loss": 0.0027, + "num_input_tokens_seen": 201026448, + "step": 93065 + }, + { + "epoch": 15.182707993474715, + "grad_norm": 0.0014162855222821236, + "learning_rate": 0.00016656824166375855, + "loss": 0.0047, + "num_input_tokens_seen": 201037456, + "step": 93070 + }, + { + "epoch": 15.18352365415987, + "grad_norm": 0.01789838634431362, + "learning_rate": 0.0001665152032710905, + "loss": 0.0082, + "num_input_tokens_seen": 201048624, + "step": 93075 + }, + { + "epoch": 15.184339314845024, + "grad_norm": 0.0031051000114530325, + "learning_rate": 0.0001664621716369152, + "loss": 0.0024, + "num_input_tokens_seen": 201059760, + "step": 93080 + }, + { + "epoch": 15.18515497553018, + "grad_norm": 0.0025171549059450626, + "learning_rate": 0.00016640914676230677, + "loss": 0.0012, + "num_input_tokens_seen": 201072208, + "step": 93085 + }, + { + "epoch": 15.185970636215334, + "grad_norm": 0.08467067778110504, + "learning_rate": 0.00016635612864834048, + "loss": 0.0086, + "num_input_tokens_seen": 201082416, + "step": 93090 + }, + { + "epoch": 15.18678629690049, + "grad_norm": 0.012077942490577698, + "learning_rate": 0.00016630311729609026, + "loss": 0.0029, + "num_input_tokens_seen": 201092944, + "step": 93095 + }, + { + "epoch": 15.187601957585644, + "grad_norm": 0.10698788613080978, + "learning_rate": 0.00016625011270663098, + "loss": 0.0039, + "num_input_tokens_seen": 201103504, + "step": 93100 + }, + { + "epoch": 15.1884176182708, + "grad_norm": 0.005380884278565645, + "learning_rate": 0.00016619711488103622, + "loss": 0.0019, + "num_input_tokens_seen": 201115568, + "step": 93105 + }, + { + "epoch": 15.189233278955955, + "grad_norm": 0.025823216885328293, + "learning_rate": 0.0001661441238203807, + "loss": 0.0037, + "num_input_tokens_seen": 201126000, + "step": 93110 + }, + { + "epoch": 15.190048939641109, + "grad_norm": 0.012832976877689362, + "learning_rate": 0.00016609113952573774, + "loss": 0.003, + "num_input_tokens_seen": 201138064, + "step": 93115 + }, + { + "epoch": 15.190864600326265, + "grad_norm": 0.005685700569301844, + "learning_rate": 0.0001660381619981817, + "loss": 0.1171, + "num_input_tokens_seen": 201148368, + "step": 93120 + }, + { + "epoch": 15.191680261011419, + "grad_norm": 0.010949775576591492, + "learning_rate": 0.0001659851912387857, + "loss": 0.0028, + "num_input_tokens_seen": 201159280, + "step": 93125 + }, + { + "epoch": 15.192495921696574, + "grad_norm": 0.003707254771143198, + "learning_rate": 0.00016593222724862366, + "loss": 0.0025, + "num_input_tokens_seen": 201170032, + "step": 93130 + }, + { + "epoch": 15.19331158238173, + "grad_norm": 0.002400952624157071, + "learning_rate": 0.0001658792700287689, + "loss": 0.0016, + "num_input_tokens_seen": 201179984, + "step": 93135 + }, + { + "epoch": 15.194127243066884, + "grad_norm": 0.014919549226760864, + "learning_rate": 0.00016582631958029454, + "loss": 0.0039, + "num_input_tokens_seen": 201189872, + "step": 93140 + }, + { + "epoch": 15.19494290375204, + "grad_norm": 0.0005131821380928159, + "learning_rate": 0.00016577337590427372, + "loss": 0.0046, + "num_input_tokens_seen": 201201392, + "step": 93145 + }, + { + "epoch": 15.195758564437194, + "grad_norm": 0.029786646366119385, + "learning_rate": 0.00016572043900177946, + "loss": 0.0016, + "num_input_tokens_seen": 201213104, + "step": 93150 + }, + { + "epoch": 15.19657422512235, + "grad_norm": 0.004074044059962034, + "learning_rate": 0.0001656675088738846, + "loss": 0.0108, + "num_input_tokens_seen": 201224432, + "step": 93155 + }, + { + "epoch": 15.197389885807505, + "grad_norm": 0.007668066769838333, + "learning_rate": 0.00016561458552166174, + "loss": 0.0107, + "num_input_tokens_seen": 201235248, + "step": 93160 + }, + { + "epoch": 15.198205546492659, + "grad_norm": 0.8521307110786438, + "learning_rate": 0.00016556166894618352, + "loss": 0.1039, + "num_input_tokens_seen": 201245168, + "step": 93165 + }, + { + "epoch": 15.199021207177815, + "grad_norm": 0.004517595283687115, + "learning_rate": 0.00016550875914852237, + "loss": 0.0046, + "num_input_tokens_seen": 201254384, + "step": 93170 + }, + { + "epoch": 15.199836867862969, + "grad_norm": 0.0450996570289135, + "learning_rate": 0.00016545585612975051, + "loss": 0.0451, + "num_input_tokens_seen": 201265776, + "step": 93175 + }, + { + "epoch": 15.200652528548124, + "grad_norm": 0.0036346532870084047, + "learning_rate": 0.00016540295989094018, + "loss": 0.002, + "num_input_tokens_seen": 201277136, + "step": 93180 + }, + { + "epoch": 15.201468189233278, + "grad_norm": 0.008076614700257778, + "learning_rate": 0.0001653500704331633, + "loss": 0.0085, + "num_input_tokens_seen": 201287856, + "step": 93185 + }, + { + "epoch": 15.202283849918434, + "grad_norm": 0.001289551379159093, + "learning_rate": 0.0001652971877574916, + "loss": 0.0074, + "num_input_tokens_seen": 201297168, + "step": 93190 + }, + { + "epoch": 15.20309951060359, + "grad_norm": 0.1675768941640854, + "learning_rate": 0.00016524431186499733, + "loss": 0.0058, + "num_input_tokens_seen": 201307920, + "step": 93195 + }, + { + "epoch": 15.203915171288743, + "grad_norm": 0.00890275463461876, + "learning_rate": 0.0001651914427567514, + "loss": 0.1466, + "num_input_tokens_seen": 201318704, + "step": 93200 + }, + { + "epoch": 15.2047308319739, + "grad_norm": 0.0075667728669941425, + "learning_rate": 0.000165138580433826, + "loss": 0.0086, + "num_input_tokens_seen": 201329360, + "step": 93205 + }, + { + "epoch": 15.205546492659053, + "grad_norm": 0.019372614100575447, + "learning_rate": 0.00016508572489729172, + "loss": 0.0034, + "num_input_tokens_seen": 201340528, + "step": 93210 + }, + { + "epoch": 15.206362153344209, + "grad_norm": 0.013845077715814114, + "learning_rate": 0.00016503287614822042, + "loss": 0.003, + "num_input_tokens_seen": 201351632, + "step": 93215 + }, + { + "epoch": 15.207177814029365, + "grad_norm": 0.012783776968717575, + "learning_rate": 0.00016498003418768248, + "loss": 0.0031, + "num_input_tokens_seen": 201362704, + "step": 93220 + }, + { + "epoch": 15.207993474714518, + "grad_norm": 0.044286951422691345, + "learning_rate": 0.00016492719901674947, + "loss": 0.0031, + "num_input_tokens_seen": 201373264, + "step": 93225 + }, + { + "epoch": 15.208809135399674, + "grad_norm": 0.0009127430967055261, + "learning_rate": 0.00016487437063649152, + "loss": 0.0031, + "num_input_tokens_seen": 201382928, + "step": 93230 + }, + { + "epoch": 15.209624796084828, + "grad_norm": 0.017166294157505035, + "learning_rate": 0.00016482154904797974, + "loss": 0.0664, + "num_input_tokens_seen": 201394896, + "step": 93235 + }, + { + "epoch": 15.210440456769984, + "grad_norm": 0.0013074681628495455, + "learning_rate": 0.0001647687342522845, + "loss": 0.0015, + "num_input_tokens_seen": 201403952, + "step": 93240 + }, + { + "epoch": 15.21125611745514, + "grad_norm": 0.0034039251040667295, + "learning_rate": 0.00016471592625047615, + "loss": 0.0053, + "num_input_tokens_seen": 201415856, + "step": 93245 + }, + { + "epoch": 15.212071778140293, + "grad_norm": 0.00026853723102249205, + "learning_rate": 0.00016466312504362485, + "loss": 0.0038, + "num_input_tokens_seen": 201426800, + "step": 93250 + }, + { + "epoch": 15.21288743882545, + "grad_norm": 0.02092001773416996, + "learning_rate": 0.00016461033063280074, + "loss": 0.035, + "num_input_tokens_seen": 201437488, + "step": 93255 + }, + { + "epoch": 15.213703099510603, + "grad_norm": 0.0905800610780716, + "learning_rate": 0.00016455754301907376, + "loss": 0.0037, + "num_input_tokens_seen": 201448336, + "step": 93260 + }, + { + "epoch": 15.214518760195759, + "grad_norm": 0.013709750957787037, + "learning_rate": 0.00016450476220351368, + "loss": 0.0034, + "num_input_tokens_seen": 201459536, + "step": 93265 + }, + { + "epoch": 15.215334420880913, + "grad_norm": 0.017409684136509895, + "learning_rate": 0.00016445198818719025, + "loss": 0.0073, + "num_input_tokens_seen": 201468944, + "step": 93270 + }, + { + "epoch": 15.216150081566068, + "grad_norm": 0.0011702359188348055, + "learning_rate": 0.00016439922097117294, + "loss": 0.0016, + "num_input_tokens_seen": 201479664, + "step": 93275 + }, + { + "epoch": 15.216965742251224, + "grad_norm": 0.012537084519863129, + "learning_rate": 0.00016434646055653112, + "loss": 0.0149, + "num_input_tokens_seen": 201490064, + "step": 93280 + }, + { + "epoch": 15.217781402936378, + "grad_norm": 0.01967203989624977, + "learning_rate": 0.0001642937069443341, + "loss": 0.0025, + "num_input_tokens_seen": 201501776, + "step": 93285 + }, + { + "epoch": 15.218597063621534, + "grad_norm": 0.05715855583548546, + "learning_rate": 0.00016424096013565098, + "loss": 0.0031, + "num_input_tokens_seen": 201513040, + "step": 93290 + }, + { + "epoch": 15.219412724306688, + "grad_norm": 0.024029148742556572, + "learning_rate": 0.00016418822013155077, + "loss": 0.0024, + "num_input_tokens_seen": 201523856, + "step": 93295 + }, + { + "epoch": 15.220228384991843, + "grad_norm": 0.5616940855979919, + "learning_rate": 0.00016413548693310225, + "loss": 0.1153, + "num_input_tokens_seen": 201535376, + "step": 93300 + }, + { + "epoch": 15.221044045676999, + "grad_norm": 0.005037497729063034, + "learning_rate": 0.00016408276054137417, + "loss": 0.0673, + "num_input_tokens_seen": 201546064, + "step": 93305 + }, + { + "epoch": 15.221859706362153, + "grad_norm": 0.006846841424703598, + "learning_rate": 0.00016403004095743513, + "loss": 0.0037, + "num_input_tokens_seen": 201557392, + "step": 93310 + }, + { + "epoch": 15.222675367047309, + "grad_norm": 0.0015953175025060773, + "learning_rate": 0.00016397732818235344, + "loss": 0.0063, + "num_input_tokens_seen": 201568944, + "step": 93315 + }, + { + "epoch": 15.223491027732463, + "grad_norm": 0.022259226068854332, + "learning_rate": 0.0001639246222171975, + "loss": 0.0025, + "num_input_tokens_seen": 201579376, + "step": 93320 + }, + { + "epoch": 15.224306688417618, + "grad_norm": 0.01390728447586298, + "learning_rate": 0.0001638719230630355, + "loss": 0.0029, + "num_input_tokens_seen": 201590544, + "step": 93325 + }, + { + "epoch": 15.225122349102774, + "grad_norm": 0.00581662543118, + "learning_rate": 0.0001638192307209353, + "loss": 0.0026, + "num_input_tokens_seen": 201602064, + "step": 93330 + }, + { + "epoch": 15.225938009787928, + "grad_norm": 0.022805117070674896, + "learning_rate": 0.00016376654519196477, + "loss": 0.0054, + "num_input_tokens_seen": 201612944, + "step": 93335 + }, + { + "epoch": 15.226753670473084, + "grad_norm": 0.520837664604187, + "learning_rate": 0.00016371386647719182, + "loss": 0.0559, + "num_input_tokens_seen": 201622192, + "step": 93340 + }, + { + "epoch": 15.227569331158238, + "grad_norm": 0.003279214957728982, + "learning_rate": 0.00016366119457768407, + "loss": 0.0019, + "num_input_tokens_seen": 201632560, + "step": 93345 + }, + { + "epoch": 15.228384991843393, + "grad_norm": 0.008738663047552109, + "learning_rate": 0.00016360852949450882, + "loss": 0.0631, + "num_input_tokens_seen": 201643792, + "step": 93350 + }, + { + "epoch": 15.229200652528547, + "grad_norm": 0.014815381728112698, + "learning_rate": 0.00016355587122873349, + "loss": 0.0396, + "num_input_tokens_seen": 201654416, + "step": 93355 + }, + { + "epoch": 15.230016313213703, + "grad_norm": 0.07490991055965424, + "learning_rate": 0.00016350321978142525, + "loss": 0.0071, + "num_input_tokens_seen": 201665744, + "step": 93360 + }, + { + "epoch": 15.230831973898859, + "grad_norm": 0.022860554978251457, + "learning_rate": 0.00016345057515365115, + "loss": 0.0073, + "num_input_tokens_seen": 201675216, + "step": 93365 + }, + { + "epoch": 15.231647634584013, + "grad_norm": 0.011496799066662788, + "learning_rate": 0.00016339793734647807, + "loss": 0.0903, + "num_input_tokens_seen": 201685776, + "step": 93370 + }, + { + "epoch": 15.232463295269168, + "grad_norm": 0.008875560946762562, + "learning_rate": 0.00016334530636097277, + "loss": 0.0033, + "num_input_tokens_seen": 201697296, + "step": 93375 + }, + { + "epoch": 15.233278955954322, + "grad_norm": 0.004284188617020845, + "learning_rate": 0.00016329268219820192, + "loss": 0.1195, + "num_input_tokens_seen": 201707440, + "step": 93380 + }, + { + "epoch": 15.234094616639478, + "grad_norm": 1.4567502737045288, + "learning_rate": 0.00016324006485923204, + "loss": 0.0522, + "num_input_tokens_seen": 201718704, + "step": 93385 + }, + { + "epoch": 15.234910277324634, + "grad_norm": 0.020433912053704262, + "learning_rate": 0.00016318745434512944, + "loss": 0.0034, + "num_input_tokens_seen": 201728592, + "step": 93390 + }, + { + "epoch": 15.235725938009788, + "grad_norm": 0.004512346815317869, + "learning_rate": 0.00016313485065696037, + "loss": 0.006, + "num_input_tokens_seen": 201738320, + "step": 93395 + }, + { + "epoch": 15.236541598694943, + "grad_norm": 0.03213776648044586, + "learning_rate": 0.00016308225379579088, + "loss": 0.0155, + "num_input_tokens_seen": 201750160, + "step": 93400 + }, + { + "epoch": 15.237357259380097, + "grad_norm": 0.06638370454311371, + "learning_rate": 0.0001630296637626869, + "loss": 0.0061, + "num_input_tokens_seen": 201761360, + "step": 93405 + }, + { + "epoch": 15.238172920065253, + "grad_norm": 0.08439627289772034, + "learning_rate": 0.0001629770805587143, + "loss": 0.0067, + "num_input_tokens_seen": 201772496, + "step": 93410 + }, + { + "epoch": 15.238988580750409, + "grad_norm": 0.0017806835239753127, + "learning_rate": 0.0001629245041849387, + "loss": 0.1468, + "num_input_tokens_seen": 201782640, + "step": 93415 + }, + { + "epoch": 15.239804241435563, + "grad_norm": 0.03064138814806938, + "learning_rate": 0.0001628719346424256, + "loss": 0.009, + "num_input_tokens_seen": 201793968, + "step": 93420 + }, + { + "epoch": 15.240619902120718, + "grad_norm": 0.03552016243338585, + "learning_rate": 0.00016281937193224051, + "loss": 0.0065, + "num_input_tokens_seen": 201805264, + "step": 93425 + }, + { + "epoch": 15.241435562805872, + "grad_norm": 0.009931345470249653, + "learning_rate": 0.0001627668160554485, + "loss": 0.0039, + "num_input_tokens_seen": 201816112, + "step": 93430 + }, + { + "epoch": 15.242251223491028, + "grad_norm": 0.038621384650468826, + "learning_rate": 0.00016271426701311483, + "loss": 0.0474, + "num_input_tokens_seen": 201826064, + "step": 93435 + }, + { + "epoch": 15.243066884176184, + "grad_norm": 0.0006465500337071717, + "learning_rate": 0.00016266172480630436, + "loss": 0.0043, + "num_input_tokens_seen": 201837200, + "step": 93440 + }, + { + "epoch": 15.243882544861338, + "grad_norm": 0.007097797933965921, + "learning_rate": 0.0001626091894360819, + "loss": 0.0304, + "num_input_tokens_seen": 201848816, + "step": 93445 + }, + { + "epoch": 15.244698205546493, + "grad_norm": 0.02344132959842682, + "learning_rate": 0.00016255666090351245, + "loss": 0.0033, + "num_input_tokens_seen": 201858416, + "step": 93450 + }, + { + "epoch": 15.245513866231647, + "grad_norm": 0.6161487102508545, + "learning_rate": 0.00016250413920966013, + "loss": 0.0466, + "num_input_tokens_seen": 201868720, + "step": 93455 + }, + { + "epoch": 15.246329526916803, + "grad_norm": 0.010321940295398235, + "learning_rate": 0.0001624516243555898, + "loss": 0.0383, + "num_input_tokens_seen": 201879600, + "step": 93460 + }, + { + "epoch": 15.247145187601957, + "grad_norm": 0.003955533728003502, + "learning_rate": 0.00016239911634236527, + "loss": 0.1012, + "num_input_tokens_seen": 201892816, + "step": 93465 + }, + { + "epoch": 15.247960848287113, + "grad_norm": 0.001835820497944951, + "learning_rate": 0.00016234661517105115, + "loss": 0.0033, + "num_input_tokens_seen": 201904816, + "step": 93470 + }, + { + "epoch": 15.248776508972268, + "grad_norm": 0.00370815210044384, + "learning_rate": 0.00016229412084271095, + "loss": 0.0134, + "num_input_tokens_seen": 201915248, + "step": 93475 + }, + { + "epoch": 15.249592169657422, + "grad_norm": 0.005513612646609545, + "learning_rate": 0.00016224163335840897, + "loss": 0.0665, + "num_input_tokens_seen": 201926064, + "step": 93480 + }, + { + "epoch": 15.250407830342578, + "grad_norm": 0.0008234487031586468, + "learning_rate": 0.00016218915271920875, + "loss": 0.0033, + "num_input_tokens_seen": 201938000, + "step": 93485 + }, + { + "epoch": 15.251223491027732, + "grad_norm": 0.0024828226305544376, + "learning_rate": 0.00016213667892617394, + "loss": 0.0017, + "num_input_tokens_seen": 201948112, + "step": 93490 + }, + { + "epoch": 15.252039151712887, + "grad_norm": 0.004459597636014223, + "learning_rate": 0.00016208421198036789, + "loss": 0.0079, + "num_input_tokens_seen": 201959216, + "step": 93495 + }, + { + "epoch": 15.252854812398043, + "grad_norm": 0.32820403575897217, + "learning_rate": 0.00016203175188285397, + "loss": 0.0275, + "num_input_tokens_seen": 201969936, + "step": 93500 + }, + { + "epoch": 15.253670473083197, + "grad_norm": 0.0012011234648525715, + "learning_rate": 0.00016197929863469534, + "loss": 0.006, + "num_input_tokens_seen": 201981872, + "step": 93505 + }, + { + "epoch": 15.254486133768353, + "grad_norm": 0.006218986120074987, + "learning_rate": 0.0001619268522369551, + "loss": 0.0275, + "num_input_tokens_seen": 201992048, + "step": 93510 + }, + { + "epoch": 15.255301794453507, + "grad_norm": 0.004850749392062426, + "learning_rate": 0.00016187441269069596, + "loss": 0.0029, + "num_input_tokens_seen": 202003120, + "step": 93515 + }, + { + "epoch": 15.256117455138662, + "grad_norm": 0.0029057359788566828, + "learning_rate": 0.00016182197999698084, + "loss": 0.0017, + "num_input_tokens_seen": 202014320, + "step": 93520 + }, + { + "epoch": 15.256933115823816, + "grad_norm": 0.0069501763209700584, + "learning_rate": 0.00016176955415687233, + "loss": 0.0052, + "num_input_tokens_seen": 202024208, + "step": 93525 + }, + { + "epoch": 15.257748776508972, + "grad_norm": 0.01931832917034626, + "learning_rate": 0.00016171713517143288, + "loss": 0.0052, + "num_input_tokens_seen": 202034672, + "step": 93530 + }, + { + "epoch": 15.258564437194128, + "grad_norm": 0.02083902806043625, + "learning_rate": 0.0001616647230417248, + "loss": 0.012, + "num_input_tokens_seen": 202045008, + "step": 93535 + }, + { + "epoch": 15.259380097879282, + "grad_norm": 0.0012294130865484476, + "learning_rate": 0.0001616123177688103, + "loss": 0.0015, + "num_input_tokens_seen": 202056848, + "step": 93540 + }, + { + "epoch": 15.260195758564437, + "grad_norm": 0.0032329263631254435, + "learning_rate": 0.00016155991935375147, + "loss": 0.0272, + "num_input_tokens_seen": 202067600, + "step": 93545 + }, + { + "epoch": 15.261011419249591, + "grad_norm": 0.007929227314889431, + "learning_rate": 0.00016150752779761008, + "loss": 0.0032, + "num_input_tokens_seen": 202078288, + "step": 93550 + }, + { + "epoch": 15.261827079934747, + "grad_norm": 0.003024327801540494, + "learning_rate": 0.00016145514310144838, + "loss": 0.0029, + "num_input_tokens_seen": 202089104, + "step": 93555 + }, + { + "epoch": 15.262642740619903, + "grad_norm": 0.09575843811035156, + "learning_rate": 0.0001614027652663273, + "loss": 0.0046, + "num_input_tokens_seen": 202098576, + "step": 93560 + }, + { + "epoch": 15.263458401305057, + "grad_norm": 0.00582880387082696, + "learning_rate": 0.00016135039429330912, + "loss": 0.0043, + "num_input_tokens_seen": 202108528, + "step": 93565 + }, + { + "epoch": 15.264274061990212, + "grad_norm": 0.0019978994969278574, + "learning_rate": 0.0001612980301834544, + "loss": 0.0565, + "num_input_tokens_seen": 202119792, + "step": 93570 + }, + { + "epoch": 15.265089722675366, + "grad_norm": 0.040785159915685654, + "learning_rate": 0.00016124567293782517, + "loss": 0.0776, + "num_input_tokens_seen": 202131280, + "step": 93575 + }, + { + "epoch": 15.265905383360522, + "grad_norm": 0.012387178838253021, + "learning_rate": 0.00016119332255748177, + "loss": 0.0076, + "num_input_tokens_seen": 202140880, + "step": 93580 + }, + { + "epoch": 15.266721044045678, + "grad_norm": 0.00521023478358984, + "learning_rate": 0.0001611409790434858, + "loss": 0.0036, + "num_input_tokens_seen": 202152496, + "step": 93585 + }, + { + "epoch": 15.267536704730832, + "grad_norm": 0.5057440996170044, + "learning_rate": 0.00016108864239689746, + "loss": 0.0665, + "num_input_tokens_seen": 202162544, + "step": 93590 + }, + { + "epoch": 15.268352365415987, + "grad_norm": 0.23563796281814575, + "learning_rate": 0.00016103631261877799, + "loss": 0.01, + "num_input_tokens_seen": 202173584, + "step": 93595 + }, + { + "epoch": 15.269168026101141, + "grad_norm": 0.0431128591299057, + "learning_rate": 0.0001609839897101874, + "loss": 0.0067, + "num_input_tokens_seen": 202184176, + "step": 93600 + }, + { + "epoch": 15.269983686786297, + "grad_norm": 0.01184097956866026, + "learning_rate": 0.00016093167367218665, + "loss": 0.0169, + "num_input_tokens_seen": 202194544, + "step": 93605 + }, + { + "epoch": 15.270799347471453, + "grad_norm": 0.0048804692924022675, + "learning_rate": 0.0001608793645058353, + "loss": 0.0025, + "num_input_tokens_seen": 202206896, + "step": 93610 + }, + { + "epoch": 15.271615008156607, + "grad_norm": 0.007331953849643469, + "learning_rate": 0.0001608270622121942, + "loss": 0.0041, + "num_input_tokens_seen": 202216848, + "step": 93615 + }, + { + "epoch": 15.272430668841762, + "grad_norm": 0.0063942731358110905, + "learning_rate": 0.00016077476679232262, + "loss": 0.002, + "num_input_tokens_seen": 202228112, + "step": 93620 + }, + { + "epoch": 15.273246329526916, + "grad_norm": 0.003414425067603588, + "learning_rate": 0.00016072247824728086, + "loss": 0.0022, + "num_input_tokens_seen": 202239248, + "step": 93625 + }, + { + "epoch": 15.274061990212072, + "grad_norm": 0.026459679007530212, + "learning_rate": 0.00016067019657812852, + "loss": 0.0169, + "num_input_tokens_seen": 202251120, + "step": 93630 + }, + { + "epoch": 15.274877650897226, + "grad_norm": 0.06253305077552795, + "learning_rate": 0.0001606179217859251, + "loss": 0.0098, + "num_input_tokens_seen": 202260560, + "step": 93635 + }, + { + "epoch": 15.275693311582382, + "grad_norm": 0.029631705954670906, + "learning_rate": 0.00016056565387173005, + "loss": 0.0459, + "num_input_tokens_seen": 202271664, + "step": 93640 + }, + { + "epoch": 15.276508972267537, + "grad_norm": 0.005431040655821562, + "learning_rate": 0.0001605133928366026, + "loss": 0.0054, + "num_input_tokens_seen": 202282320, + "step": 93645 + }, + { + "epoch": 15.277324632952691, + "grad_norm": 0.00246419757604599, + "learning_rate": 0.00016046113868160194, + "loss": 0.01, + "num_input_tokens_seen": 202292816, + "step": 93650 + }, + { + "epoch": 15.278140293637847, + "grad_norm": 0.0032046616543084383, + "learning_rate": 0.00016040889140778703, + "loss": 0.001, + "num_input_tokens_seen": 202302896, + "step": 93655 + }, + { + "epoch": 15.278955954323001, + "grad_norm": 0.02585562691092491, + "learning_rate": 0.00016035665101621672, + "loss": 0.1417, + "num_input_tokens_seen": 202313840, + "step": 93660 + }, + { + "epoch": 15.279771615008157, + "grad_norm": 0.09804775565862656, + "learning_rate": 0.00016030441750794976, + "loss": 0.0678, + "num_input_tokens_seen": 202324080, + "step": 93665 + }, + { + "epoch": 15.280587275693312, + "grad_norm": 0.05777350068092346, + "learning_rate": 0.00016025219088404468, + "loss": 0.0118, + "num_input_tokens_seen": 202334960, + "step": 93670 + }, + { + "epoch": 15.281402936378466, + "grad_norm": 0.009660948067903519, + "learning_rate": 0.00016019997114555983, + "loss": 0.0801, + "num_input_tokens_seen": 202345648, + "step": 93675 + }, + { + "epoch": 15.282218597063622, + "grad_norm": 0.0101390415802598, + "learning_rate": 0.000160147758293554, + "loss": 0.0055, + "num_input_tokens_seen": 202357008, + "step": 93680 + }, + { + "epoch": 15.283034257748776, + "grad_norm": 0.010949664749205112, + "learning_rate": 0.00016009555232908456, + "loss": 0.0042, + "num_input_tokens_seen": 202367344, + "step": 93685 + }, + { + "epoch": 15.283849918433932, + "grad_norm": 0.19166868925094604, + "learning_rate": 0.00016004335325321033, + "loss": 0.0067, + "num_input_tokens_seen": 202378416, + "step": 93690 + }, + { + "epoch": 15.284665579119087, + "grad_norm": 0.0036083634477108717, + "learning_rate": 0.00015999116106698848, + "loss": 0.0073, + "num_input_tokens_seen": 202388848, + "step": 93695 + }, + { + "epoch": 15.285481239804241, + "grad_norm": 0.47052520513534546, + "learning_rate": 0.0001599389757714774, + "loss": 0.1437, + "num_input_tokens_seen": 202400048, + "step": 93700 + }, + { + "epoch": 15.286296900489397, + "grad_norm": 0.000593140721321106, + "learning_rate": 0.0001598867973677341, + "loss": 0.0132, + "num_input_tokens_seen": 202411216, + "step": 93705 + }, + { + "epoch": 15.28711256117455, + "grad_norm": 0.05543004721403122, + "learning_rate": 0.00015983462585681657, + "loss": 0.006, + "num_input_tokens_seen": 202421424, + "step": 93710 + }, + { + "epoch": 15.287928221859707, + "grad_norm": 0.008918811567127705, + "learning_rate": 0.00015978246123978158, + "loss": 0.0038, + "num_input_tokens_seen": 202433040, + "step": 93715 + }, + { + "epoch": 15.28874388254486, + "grad_norm": 0.15539592504501343, + "learning_rate": 0.0001597303035176869, + "loss": 0.0061, + "num_input_tokens_seen": 202443760, + "step": 93720 + }, + { + "epoch": 15.289559543230016, + "grad_norm": 0.0010111124720424414, + "learning_rate": 0.00015967815269158904, + "loss": 0.001, + "num_input_tokens_seen": 202454480, + "step": 93725 + }, + { + "epoch": 15.290375203915172, + "grad_norm": 0.006419785786420107, + "learning_rate": 0.0001596260087625454, + "loss": 0.0028, + "num_input_tokens_seen": 202464592, + "step": 93730 + }, + { + "epoch": 15.291190864600326, + "grad_norm": 0.08302279561758041, + "learning_rate": 0.0001595738717316122, + "loss": 0.0084, + "num_input_tokens_seen": 202476528, + "step": 93735 + }, + { + "epoch": 15.292006525285482, + "grad_norm": 0.0035879616625607014, + "learning_rate": 0.00015952174159984667, + "loss": 0.0317, + "num_input_tokens_seen": 202487440, + "step": 93740 + }, + { + "epoch": 15.292822185970635, + "grad_norm": 0.08168449252843857, + "learning_rate": 0.0001594696183683046, + "loss": 0.0077, + "num_input_tokens_seen": 202498192, + "step": 93745 + }, + { + "epoch": 15.293637846655791, + "grad_norm": 0.035135045647621155, + "learning_rate": 0.00015941750203804305, + "loss": 0.0095, + "num_input_tokens_seen": 202509616, + "step": 93750 + }, + { + "epoch": 15.294453507340947, + "grad_norm": 0.08051912486553192, + "learning_rate": 0.0001593653926101176, + "loss": 0.0068, + "num_input_tokens_seen": 202520816, + "step": 93755 + }, + { + "epoch": 15.2952691680261, + "grad_norm": 0.007569537963718176, + "learning_rate": 0.00015931329008558477, + "loss": 0.0029, + "num_input_tokens_seen": 202530928, + "step": 93760 + }, + { + "epoch": 15.296084828711257, + "grad_norm": 0.22661983966827393, + "learning_rate": 0.00015926119446550024, + "loss": 0.0096, + "num_input_tokens_seen": 202541648, + "step": 93765 + }, + { + "epoch": 15.29690048939641, + "grad_norm": 0.0014906668802723289, + "learning_rate": 0.0001592091057509199, + "loss": 0.0074, + "num_input_tokens_seen": 202553424, + "step": 93770 + }, + { + "epoch": 15.297716150081566, + "grad_norm": 0.00238198135048151, + "learning_rate": 0.00015915702394289933, + "loss": 0.0035, + "num_input_tokens_seen": 202564944, + "step": 93775 + }, + { + "epoch": 15.298531810766722, + "grad_norm": 0.026480555534362793, + "learning_rate": 0.00015910494904249411, + "loss": 0.0063, + "num_input_tokens_seen": 202575888, + "step": 93780 + }, + { + "epoch": 15.299347471451876, + "grad_norm": 0.14808295667171478, + "learning_rate": 0.0001590528810507595, + "loss": 0.0071, + "num_input_tokens_seen": 202586800, + "step": 93785 + }, + { + "epoch": 15.300163132137031, + "grad_norm": 0.011909517459571362, + "learning_rate": 0.00015900081996875082, + "loss": 0.0037, + "num_input_tokens_seen": 202597424, + "step": 93790 + }, + { + "epoch": 15.300978792822185, + "grad_norm": 0.00829069223254919, + "learning_rate": 0.0001589487657975231, + "loss": 0.0125, + "num_input_tokens_seen": 202608080, + "step": 93795 + }, + { + "epoch": 15.301794453507341, + "grad_norm": 0.013668195344507694, + "learning_rate": 0.00015889671853813126, + "loss": 0.0029, + "num_input_tokens_seen": 202618640, + "step": 93800 + }, + { + "epoch": 15.302610114192497, + "grad_norm": 0.6986984610557556, + "learning_rate": 0.0001588446781916302, + "loss": 0.0505, + "num_input_tokens_seen": 202629360, + "step": 93805 + }, + { + "epoch": 15.30342577487765, + "grad_norm": 0.016121450811624527, + "learning_rate": 0.00015879264475907447, + "loss": 0.0112, + "num_input_tokens_seen": 202641360, + "step": 93810 + }, + { + "epoch": 15.304241435562806, + "grad_norm": 0.005392593797296286, + "learning_rate": 0.00015874061824151865, + "loss": 0.0046, + "num_input_tokens_seen": 202651216, + "step": 93815 + }, + { + "epoch": 15.30505709624796, + "grad_norm": 0.020854827016592026, + "learning_rate": 0.00015868859864001693, + "loss": 0.0045, + "num_input_tokens_seen": 202663024, + "step": 93820 + }, + { + "epoch": 15.305872756933116, + "grad_norm": 0.00112466502469033, + "learning_rate": 0.00015863658595562414, + "loss": 0.0123, + "num_input_tokens_seen": 202675056, + "step": 93825 + }, + { + "epoch": 15.30668841761827, + "grad_norm": 0.0506683811545372, + "learning_rate": 0.00015858458018939365, + "loss": 0.0066, + "num_input_tokens_seen": 202685488, + "step": 93830 + }, + { + "epoch": 15.307504078303426, + "grad_norm": 0.029055537655949593, + "learning_rate": 0.00015853258134238007, + "loss": 0.0053, + "num_input_tokens_seen": 202697232, + "step": 93835 + }, + { + "epoch": 15.308319738988581, + "grad_norm": 0.002884516492486, + "learning_rate": 0.0001584805894156366, + "loss": 0.0041, + "num_input_tokens_seen": 202709008, + "step": 93840 + }, + { + "epoch": 15.309135399673735, + "grad_norm": 0.0017596816178411245, + "learning_rate": 0.0001584286044102175, + "loss": 0.0086, + "num_input_tokens_seen": 202720368, + "step": 93845 + }, + { + "epoch": 15.309951060358891, + "grad_norm": 0.021234875544905663, + "learning_rate": 0.00015837662632717575, + "loss": 0.008, + "num_input_tokens_seen": 202730448, + "step": 93850 + }, + { + "epoch": 15.310766721044045, + "grad_norm": 0.017943846061825752, + "learning_rate": 0.00015832465516756538, + "loss": 0.0038, + "num_input_tokens_seen": 202741552, + "step": 93855 + }, + { + "epoch": 15.3115823817292, + "grad_norm": 0.2247970998287201, + "learning_rate": 0.00015827269093243902, + "loss": 0.0067, + "num_input_tokens_seen": 202753168, + "step": 93860 + }, + { + "epoch": 15.312398042414356, + "grad_norm": 0.004060364793986082, + "learning_rate": 0.0001582207336228504, + "loss": 0.0697, + "num_input_tokens_seen": 202763760, + "step": 93865 + }, + { + "epoch": 15.31321370309951, + "grad_norm": 0.030661238357424736, + "learning_rate": 0.00015816878323985184, + "loss": 0.0777, + "num_input_tokens_seen": 202774448, + "step": 93870 + }, + { + "epoch": 15.314029363784666, + "grad_norm": 0.004571146331727505, + "learning_rate": 0.0001581168397844967, + "loss": 0.005, + "num_input_tokens_seen": 202785712, + "step": 93875 + }, + { + "epoch": 15.31484502446982, + "grad_norm": 0.0031279721297323704, + "learning_rate": 0.0001580649032578375, + "loss": 0.0027, + "num_input_tokens_seen": 202795248, + "step": 93880 + }, + { + "epoch": 15.315660685154976, + "grad_norm": 0.015604222193360329, + "learning_rate": 0.00015801297366092689, + "loss": 0.0112, + "num_input_tokens_seen": 202806288, + "step": 93885 + }, + { + "epoch": 15.31647634584013, + "grad_norm": 0.09192720055580139, + "learning_rate": 0.00015796105099481712, + "loss": 0.0516, + "num_input_tokens_seen": 202817872, + "step": 93890 + }, + { + "epoch": 15.317292006525285, + "grad_norm": 0.005644198041409254, + "learning_rate": 0.00015790913526056061, + "loss": 0.0031, + "num_input_tokens_seen": 202828080, + "step": 93895 + }, + { + "epoch": 15.318107667210441, + "grad_norm": 0.3908234238624573, + "learning_rate": 0.00015785722645920942, + "loss": 0.0095, + "num_input_tokens_seen": 202838192, + "step": 93900 + }, + { + "epoch": 15.318923327895595, + "grad_norm": 0.7623684406280518, + "learning_rate": 0.00015780532459181557, + "loss": 0.0279, + "num_input_tokens_seen": 202848976, + "step": 93905 + }, + { + "epoch": 15.31973898858075, + "grad_norm": 0.012597830034792423, + "learning_rate": 0.00015775342965943095, + "loss": 0.0026, + "num_input_tokens_seen": 202860560, + "step": 93910 + }, + { + "epoch": 15.320554649265905, + "grad_norm": 0.0024378118105232716, + "learning_rate": 0.00015770154166310724, + "loss": 0.0787, + "num_input_tokens_seen": 202872400, + "step": 93915 + }, + { + "epoch": 15.32137030995106, + "grad_norm": 0.5856313109397888, + "learning_rate": 0.00015764966060389602, + "loss": 0.0115, + "num_input_tokens_seen": 202883024, + "step": 93920 + }, + { + "epoch": 15.322185970636216, + "grad_norm": 0.0010387571528553963, + "learning_rate": 0.00015759778648284873, + "loss": 0.0031, + "num_input_tokens_seen": 202894416, + "step": 93925 + }, + { + "epoch": 15.32300163132137, + "grad_norm": 0.3006853461265564, + "learning_rate": 0.00015754591930101664, + "loss": 0.1149, + "num_input_tokens_seen": 202905360, + "step": 93930 + }, + { + "epoch": 15.323817292006526, + "grad_norm": 0.06582503020763397, + "learning_rate": 0.00015749405905945095, + "loss": 0.0048, + "num_input_tokens_seen": 202916240, + "step": 93935 + }, + { + "epoch": 15.32463295269168, + "grad_norm": 0.007767208386212587, + "learning_rate": 0.00015744220575920266, + "loss": 0.0031, + "num_input_tokens_seen": 202927920, + "step": 93940 + }, + { + "epoch": 15.325448613376835, + "grad_norm": 0.014898211695253849, + "learning_rate": 0.00015739035940132262, + "loss": 0.0024, + "num_input_tokens_seen": 202937840, + "step": 93945 + }, + { + "epoch": 15.326264274061991, + "grad_norm": 0.022433992475271225, + "learning_rate": 0.0001573385199868616, + "loss": 0.0144, + "num_input_tokens_seen": 202948912, + "step": 93950 + }, + { + "epoch": 15.327079934747145, + "grad_norm": 0.008677488192915916, + "learning_rate": 0.00015728668751687015, + "loss": 0.0054, + "num_input_tokens_seen": 202959344, + "step": 93955 + }, + { + "epoch": 15.3278955954323, + "grad_norm": 0.0584244430065155, + "learning_rate": 0.00015723486199239878, + "loss": 0.0034, + "num_input_tokens_seen": 202970736, + "step": 93960 + }, + { + "epoch": 15.328711256117455, + "grad_norm": 0.009163172915577888, + "learning_rate": 0.00015718304341449759, + "loss": 0.0107, + "num_input_tokens_seen": 202981776, + "step": 93965 + }, + { + "epoch": 15.32952691680261, + "grad_norm": 0.00910011027008295, + "learning_rate": 0.00015713123178421717, + "loss": 0.0011, + "num_input_tokens_seen": 202993328, + "step": 93970 + }, + { + "epoch": 15.330342577487766, + "grad_norm": 0.029195060953497887, + "learning_rate": 0.00015707942710260704, + "loss": 0.0023, + "num_input_tokens_seen": 203004336, + "step": 93975 + }, + { + "epoch": 15.33115823817292, + "grad_norm": 0.01511499285697937, + "learning_rate": 0.00015702762937071747, + "loss": 0.006, + "num_input_tokens_seen": 203014576, + "step": 93980 + }, + { + "epoch": 15.331973898858076, + "grad_norm": 0.008337062783539295, + "learning_rate": 0.00015697583858959813, + "loss": 0.0034, + "num_input_tokens_seen": 203025200, + "step": 93985 + }, + { + "epoch": 15.33278955954323, + "grad_norm": 0.01428698655217886, + "learning_rate": 0.00015692405476029853, + "loss": 0.0026, + "num_input_tokens_seen": 203036016, + "step": 93990 + }, + { + "epoch": 15.333605220228385, + "grad_norm": 0.05316992104053497, + "learning_rate": 0.00015687227788386822, + "loss": 0.005, + "num_input_tokens_seen": 203047248, + "step": 93995 + }, + { + "epoch": 15.33442088091354, + "grad_norm": 0.0014572610380128026, + "learning_rate": 0.00015682050796135644, + "loss": 0.0065, + "num_input_tokens_seen": 203057872, + "step": 94000 + }, + { + "epoch": 15.335236541598695, + "grad_norm": 0.0003803163126576692, + "learning_rate": 0.0001567687449938125, + "loss": 0.0064, + "num_input_tokens_seen": 203069136, + "step": 94005 + }, + { + "epoch": 15.33605220228385, + "grad_norm": 0.0005980475689284503, + "learning_rate": 0.0001567169889822853, + "loss": 0.009, + "num_input_tokens_seen": 203078576, + "step": 94010 + }, + { + "epoch": 15.336867862969005, + "grad_norm": 0.0017918755766004324, + "learning_rate": 0.00015666523992782384, + "loss": 0.006, + "num_input_tokens_seen": 203087888, + "step": 94015 + }, + { + "epoch": 15.33768352365416, + "grad_norm": 0.017859825864434242, + "learning_rate": 0.00015661349783147678, + "loss": 0.0018, + "num_input_tokens_seen": 203097904, + "step": 94020 + }, + { + "epoch": 15.338499184339314, + "grad_norm": 0.0014235659036785364, + "learning_rate": 0.00015656176269429283, + "loss": 0.0008, + "num_input_tokens_seen": 203109072, + "step": 94025 + }, + { + "epoch": 15.33931484502447, + "grad_norm": 0.0009418278350494802, + "learning_rate": 0.00015651003451732048, + "loss": 0.0034, + "num_input_tokens_seen": 203120208, + "step": 94030 + }, + { + "epoch": 15.340130505709626, + "grad_norm": 0.008652381598949432, + "learning_rate": 0.00015645831330160804, + "loss": 0.0018, + "num_input_tokens_seen": 203130928, + "step": 94035 + }, + { + "epoch": 15.34094616639478, + "grad_norm": 0.00047669251216575503, + "learning_rate": 0.00015640659904820364, + "loss": 0.0016, + "num_input_tokens_seen": 203141712, + "step": 94040 + }, + { + "epoch": 15.341761827079935, + "grad_norm": 0.0017961309058591723, + "learning_rate": 0.00015635489175815537, + "loss": 0.0025, + "num_input_tokens_seen": 203152688, + "step": 94045 + }, + { + "epoch": 15.34257748776509, + "grad_norm": 0.006201804615557194, + "learning_rate": 0.0001563031914325112, + "loss": 0.0014, + "num_input_tokens_seen": 203163408, + "step": 94050 + }, + { + "epoch": 15.343393148450245, + "grad_norm": 0.019376035779714584, + "learning_rate": 0.00015625149807231892, + "loss": 0.0015, + "num_input_tokens_seen": 203174320, + "step": 94055 + }, + { + "epoch": 15.3442088091354, + "grad_norm": 0.0009080119198188186, + "learning_rate": 0.00015619981167862602, + "loss": 0.001, + "num_input_tokens_seen": 203184816, + "step": 94060 + }, + { + "epoch": 15.345024469820554, + "grad_norm": 0.015443303622305393, + "learning_rate": 0.00015614813225248015, + "loss": 0.0068, + "num_input_tokens_seen": 203195056, + "step": 94065 + }, + { + "epoch": 15.34584013050571, + "grad_norm": 0.00035426352405920625, + "learning_rate": 0.00015609645979492855, + "loss": 0.0052, + "num_input_tokens_seen": 203206000, + "step": 94070 + }, + { + "epoch": 15.346655791190864, + "grad_norm": 0.11414900422096252, + "learning_rate": 0.00015604479430701845, + "loss": 0.0065, + "num_input_tokens_seen": 203217168, + "step": 94075 + }, + { + "epoch": 15.34747145187602, + "grad_norm": 0.004541793372482061, + "learning_rate": 0.00015599313578979696, + "loss": 0.0018, + "num_input_tokens_seen": 203227248, + "step": 94080 + }, + { + "epoch": 15.348287112561174, + "grad_norm": 0.003625446930527687, + "learning_rate": 0.00015594148424431076, + "loss": 0.003, + "num_input_tokens_seen": 203238672, + "step": 94085 + }, + { + "epoch": 15.34910277324633, + "grad_norm": 0.011053141206502914, + "learning_rate": 0.00015588983967160724, + "loss": 0.0023, + "num_input_tokens_seen": 203249552, + "step": 94090 + }, + { + "epoch": 15.349918433931485, + "grad_norm": 0.016718612983822823, + "learning_rate": 0.0001558382020727323, + "loss": 0.002, + "num_input_tokens_seen": 203259056, + "step": 94095 + }, + { + "epoch": 15.350734094616639, + "grad_norm": 0.011769182980060577, + "learning_rate": 0.00015578657144873316, + "loss": 0.0025, + "num_input_tokens_seen": 203269456, + "step": 94100 + }, + { + "epoch": 15.351549755301795, + "grad_norm": 0.03749024122953415, + "learning_rate": 0.00015573494780065543, + "loss": 0.0023, + "num_input_tokens_seen": 203278256, + "step": 94105 + }, + { + "epoch": 15.352365415986949, + "grad_norm": 0.005377328023314476, + "learning_rate": 0.00015568333112954592, + "loss": 0.0017, + "num_input_tokens_seen": 203288880, + "step": 94110 + }, + { + "epoch": 15.353181076672104, + "grad_norm": 0.0017177603440359235, + "learning_rate": 0.00015563172143645044, + "loss": 0.008, + "num_input_tokens_seen": 203298640, + "step": 94115 + }, + { + "epoch": 15.35399673735726, + "grad_norm": 0.0025564224924892187, + "learning_rate": 0.00015558011872241506, + "loss": 0.0048, + "num_input_tokens_seen": 203309168, + "step": 94120 + }, + { + "epoch": 15.354812398042414, + "grad_norm": 0.588083028793335, + "learning_rate": 0.00015552852298848546, + "loss": 0.0598, + "num_input_tokens_seen": 203320048, + "step": 94125 + }, + { + "epoch": 15.35562805872757, + "grad_norm": 0.0020723820198327303, + "learning_rate": 0.00015547693423570736, + "loss": 0.0016, + "num_input_tokens_seen": 203331408, + "step": 94130 + }, + { + "epoch": 15.356443719412724, + "grad_norm": 0.0025758659467101097, + "learning_rate": 0.00015542535246512623, + "loss": 0.001, + "num_input_tokens_seen": 203342928, + "step": 94135 + }, + { + "epoch": 15.35725938009788, + "grad_norm": 0.019694067537784576, + "learning_rate": 0.00015537377767778742, + "loss": 0.0019, + "num_input_tokens_seen": 203353552, + "step": 94140 + }, + { + "epoch": 15.358075040783035, + "grad_norm": 0.008567390032112598, + "learning_rate": 0.00015532220987473627, + "loss": 0.1234, + "num_input_tokens_seen": 203363280, + "step": 94145 + }, + { + "epoch": 15.358890701468189, + "grad_norm": 0.002558504231274128, + "learning_rate": 0.00015527064905701776, + "loss": 0.0033, + "num_input_tokens_seen": 203374288, + "step": 94150 + }, + { + "epoch": 15.359706362153345, + "grad_norm": 0.0029807421378791332, + "learning_rate": 0.00015521909522567685, + "loss": 0.0323, + "num_input_tokens_seen": 203385616, + "step": 94155 + }, + { + "epoch": 15.360522022838499, + "grad_norm": 0.0011462070979177952, + "learning_rate": 0.0001551675483817584, + "loss": 0.0462, + "num_input_tokens_seen": 203397104, + "step": 94160 + }, + { + "epoch": 15.361337683523654, + "grad_norm": 0.00590652646496892, + "learning_rate": 0.00015511600852630698, + "loss": 0.002, + "num_input_tokens_seen": 203408656, + "step": 94165 + }, + { + "epoch": 15.362153344208808, + "grad_norm": 0.011698364280164242, + "learning_rate": 0.0001550644756603672, + "loss": 0.0012, + "num_input_tokens_seen": 203419120, + "step": 94170 + }, + { + "epoch": 15.362969004893964, + "grad_norm": 0.0006024792673997581, + "learning_rate": 0.00015501294978498344, + "loss": 0.0015, + "num_input_tokens_seen": 203430960, + "step": 94175 + }, + { + "epoch": 15.36378466557912, + "grad_norm": 0.003224707907065749, + "learning_rate": 0.0001549614309011998, + "loss": 0.0043, + "num_input_tokens_seen": 203441616, + "step": 94180 + }, + { + "epoch": 15.364600326264274, + "grad_norm": 0.01984231546521187, + "learning_rate": 0.00015490991901006052, + "loss": 0.0039, + "num_input_tokens_seen": 203453008, + "step": 94185 + }, + { + "epoch": 15.36541598694943, + "grad_norm": 0.0009202081128023565, + "learning_rate": 0.00015485841411260937, + "loss": 0.0083, + "num_input_tokens_seen": 203463184, + "step": 94190 + }, + { + "epoch": 15.366231647634583, + "grad_norm": 0.0021524764597415924, + "learning_rate": 0.00015480691620989062, + "loss": 0.0013, + "num_input_tokens_seen": 203473936, + "step": 94195 + }, + { + "epoch": 15.367047308319739, + "grad_norm": 0.016492361202836037, + "learning_rate": 0.00015475542530294728, + "loss": 0.002, + "num_input_tokens_seen": 203485136, + "step": 94200 + }, + { + "epoch": 15.367862969004895, + "grad_norm": 0.02251831255853176, + "learning_rate": 0.00015470394139282357, + "loss": 0.0083, + "num_input_tokens_seen": 203495760, + "step": 94205 + }, + { + "epoch": 15.368678629690049, + "grad_norm": 0.004201612900942564, + "learning_rate": 0.0001546524644805622, + "loss": 0.0012, + "num_input_tokens_seen": 203507472, + "step": 94210 + }, + { + "epoch": 15.369494290375204, + "grad_norm": 0.001497769495472312, + "learning_rate": 0.00015460099456720706, + "loss": 0.0052, + "num_input_tokens_seen": 203518352, + "step": 94215 + }, + { + "epoch": 15.370309951060358, + "grad_norm": 0.0014730320544913411, + "learning_rate": 0.0001545495316538006, + "loss": 0.0016, + "num_input_tokens_seen": 203526960, + "step": 94220 + }, + { + "epoch": 15.371125611745514, + "grad_norm": 0.04781882092356682, + "learning_rate": 0.0001544980757413864, + "loss": 0.0025, + "num_input_tokens_seen": 203538448, + "step": 94225 + }, + { + "epoch": 15.37194127243067, + "grad_norm": 0.0022414957638829947, + "learning_rate": 0.00015444662683100676, + "loss": 0.0025, + "num_input_tokens_seen": 203548560, + "step": 94230 + }, + { + "epoch": 15.372756933115824, + "grad_norm": 0.0006568465032614768, + "learning_rate": 0.00015439518492370486, + "loss": 0.0023, + "num_input_tokens_seen": 203558640, + "step": 94235 + }, + { + "epoch": 15.37357259380098, + "grad_norm": 0.0073090302757918835, + "learning_rate": 0.00015434375002052264, + "loss": 0.0033, + "num_input_tokens_seen": 203569424, + "step": 94240 + }, + { + "epoch": 15.374388254486133, + "grad_norm": 0.002609936287626624, + "learning_rate": 0.00015429232212250317, + "loss": 0.0027, + "num_input_tokens_seen": 203579408, + "step": 94245 + }, + { + "epoch": 15.375203915171289, + "grad_norm": 0.0006220395443961024, + "learning_rate": 0.00015424090123068802, + "loss": 0.0039, + "num_input_tokens_seen": 203589840, + "step": 94250 + }, + { + "epoch": 15.376019575856443, + "grad_norm": 0.016170065850019455, + "learning_rate": 0.00015418948734611976, + "loss": 0.0371, + "num_input_tokens_seen": 203600208, + "step": 94255 + }, + { + "epoch": 15.376835236541599, + "grad_norm": 0.0014452317263931036, + "learning_rate": 0.0001541380804698403, + "loss": 0.0013, + "num_input_tokens_seen": 203611696, + "step": 94260 + }, + { + "epoch": 15.377650897226754, + "grad_norm": 0.001372209400869906, + "learning_rate": 0.00015408668060289132, + "loss": 0.002, + "num_input_tokens_seen": 203621648, + "step": 94265 + }, + { + "epoch": 15.378466557911908, + "grad_norm": 0.0006071311072446406, + "learning_rate": 0.00015403528774631463, + "loss": 0.0017, + "num_input_tokens_seen": 203632080, + "step": 94270 + }, + { + "epoch": 15.379282218597064, + "grad_norm": 0.03576105460524559, + "learning_rate": 0.00015398390190115175, + "loss": 0.1296, + "num_input_tokens_seen": 203642800, + "step": 94275 + }, + { + "epoch": 15.380097879282218, + "grad_norm": 0.0016932595754042268, + "learning_rate": 0.00015393252306844402, + "loss": 0.0318, + "num_input_tokens_seen": 203654160, + "step": 94280 + }, + { + "epoch": 15.380913539967374, + "grad_norm": 0.06304704397916794, + "learning_rate": 0.00015388115124923267, + "loss": 0.1331, + "num_input_tokens_seen": 203665040, + "step": 94285 + }, + { + "epoch": 15.38172920065253, + "grad_norm": 0.016461336985230446, + "learning_rate": 0.00015382978644455896, + "loss": 0.0363, + "num_input_tokens_seen": 203675024, + "step": 94290 + }, + { + "epoch": 15.382544861337683, + "grad_norm": 0.011744924820959568, + "learning_rate": 0.00015377842865546372, + "loss": 0.0041, + "num_input_tokens_seen": 203685616, + "step": 94295 + }, + { + "epoch": 15.383360522022839, + "grad_norm": 0.10838112235069275, + "learning_rate": 0.0001537270778829879, + "loss": 0.0064, + "num_input_tokens_seen": 203696944, + "step": 94300 + }, + { + "epoch": 15.384176182707993, + "grad_norm": 0.6037494540214539, + "learning_rate": 0.00015367573412817186, + "loss": 0.0831, + "num_input_tokens_seen": 203707344, + "step": 94305 + }, + { + "epoch": 15.384991843393149, + "grad_norm": 0.0010668321046978235, + "learning_rate": 0.0001536243973920568, + "loss": 0.0014, + "num_input_tokens_seen": 203717648, + "step": 94310 + }, + { + "epoch": 15.385807504078304, + "grad_norm": 0.002049540402367711, + "learning_rate": 0.00015357306767568242, + "loss": 0.0023, + "num_input_tokens_seen": 203728560, + "step": 94315 + }, + { + "epoch": 15.386623164763458, + "grad_norm": 0.006294840015470982, + "learning_rate": 0.00015352174498008963, + "loss": 0.0027, + "num_input_tokens_seen": 203740080, + "step": 94320 + }, + { + "epoch": 15.387438825448614, + "grad_norm": 0.01672750897705555, + "learning_rate": 0.00015347042930631788, + "loss": 0.0012, + "num_input_tokens_seen": 203751472, + "step": 94325 + }, + { + "epoch": 15.388254486133768, + "grad_norm": 0.008858496323227882, + "learning_rate": 0.0001534191206554078, + "loss": 0.0023, + "num_input_tokens_seen": 203763376, + "step": 94330 + }, + { + "epoch": 15.389070146818923, + "grad_norm": 0.02900371141731739, + "learning_rate": 0.00015336781902839858, + "loss": 0.0019, + "num_input_tokens_seen": 203774320, + "step": 94335 + }, + { + "epoch": 15.38988580750408, + "grad_norm": 0.001398382824845612, + "learning_rate": 0.00015331652442633053, + "loss": 0.0014, + "num_input_tokens_seen": 203785136, + "step": 94340 + }, + { + "epoch": 15.390701468189233, + "grad_norm": 0.03610919788479805, + "learning_rate": 0.00015326523685024263, + "loss": 0.013, + "num_input_tokens_seen": 203795280, + "step": 94345 + }, + { + "epoch": 15.391517128874389, + "grad_norm": 0.006073774769902229, + "learning_rate": 0.0001532139563011749, + "loss": 0.0009, + "num_input_tokens_seen": 203806000, + "step": 94350 + }, + { + "epoch": 15.392332789559543, + "grad_norm": 0.0019028345122933388, + "learning_rate": 0.00015316268278016594, + "loss": 0.0019, + "num_input_tokens_seen": 203816816, + "step": 94355 + }, + { + "epoch": 15.393148450244698, + "grad_norm": 0.0005348801496438682, + "learning_rate": 0.00015311141628825554, + "loss": 0.0038, + "num_input_tokens_seen": 203827472, + "step": 94360 + }, + { + "epoch": 15.393964110929852, + "grad_norm": 0.0017356444150209427, + "learning_rate": 0.000153060156826482, + "loss": 0.0012, + "num_input_tokens_seen": 203837680, + "step": 94365 + }, + { + "epoch": 15.394779771615008, + "grad_norm": 0.03037400357425213, + "learning_rate": 0.0001530089043958849, + "loss": 0.0235, + "num_input_tokens_seen": 203847760, + "step": 94370 + }, + { + "epoch": 15.395595432300164, + "grad_norm": 0.008682015351951122, + "learning_rate": 0.00015295765899750214, + "loss": 0.002, + "num_input_tokens_seen": 203857744, + "step": 94375 + }, + { + "epoch": 15.396411092985318, + "grad_norm": 0.029603829607367516, + "learning_rate": 0.00015290642063237302, + "loss": 0.0033, + "num_input_tokens_seen": 203868720, + "step": 94380 + }, + { + "epoch": 15.397226753670473, + "grad_norm": 0.0024776794016361237, + "learning_rate": 0.0001528551893015353, + "loss": 0.0895, + "num_input_tokens_seen": 203879248, + "step": 94385 + }, + { + "epoch": 15.398042414355627, + "grad_norm": 0.002934439340606332, + "learning_rate": 0.00015280396500602783, + "loss": 0.0011, + "num_input_tokens_seen": 203889296, + "step": 94390 + }, + { + "epoch": 15.398858075040783, + "grad_norm": 0.028770821169018745, + "learning_rate": 0.00015275274774688817, + "loss": 0.0012, + "num_input_tokens_seen": 203900848, + "step": 94395 + }, + { + "epoch": 15.399673735725939, + "grad_norm": 0.11703412979841232, + "learning_rate": 0.00015270153752515474, + "loss": 0.0099, + "num_input_tokens_seen": 203911408, + "step": 94400 + }, + { + "epoch": 15.400489396411093, + "grad_norm": 0.019773095846176147, + "learning_rate": 0.00015265033434186525, + "loss": 0.0271, + "num_input_tokens_seen": 203923184, + "step": 94405 + }, + { + "epoch": 15.401305057096248, + "grad_norm": 0.005440262146294117, + "learning_rate": 0.00015259913819805736, + "loss": 0.0019, + "num_input_tokens_seen": 203933552, + "step": 94410 + }, + { + "epoch": 15.402120717781402, + "grad_norm": 0.011708968318998814, + "learning_rate": 0.0001525479490947687, + "loss": 0.006, + "num_input_tokens_seen": 203944752, + "step": 94415 + }, + { + "epoch": 15.402936378466558, + "grad_norm": 0.0032694439869374037, + "learning_rate": 0.00015249676703303654, + "loss": 0.001, + "num_input_tokens_seen": 203955824, + "step": 94420 + }, + { + "epoch": 15.403752039151712, + "grad_norm": 0.0076155886054039, + "learning_rate": 0.0001524455920138983, + "loss": 0.0165, + "num_input_tokens_seen": 203965456, + "step": 94425 + }, + { + "epoch": 15.404567699836868, + "grad_norm": 0.0033928006887435913, + "learning_rate": 0.00015239442403839105, + "loss": 0.0024, + "num_input_tokens_seen": 203975536, + "step": 94430 + }, + { + "epoch": 15.405383360522023, + "grad_norm": 0.4387916922569275, + "learning_rate": 0.0001523432631075517, + "loss": 0.1567, + "num_input_tokens_seen": 203986256, + "step": 94435 + }, + { + "epoch": 15.406199021207177, + "grad_norm": 0.0012723475228995085, + "learning_rate": 0.00015229210922241721, + "loss": 0.1107, + "num_input_tokens_seen": 203997040, + "step": 94440 + }, + { + "epoch": 15.407014681892333, + "grad_norm": 0.004824480973184109, + "learning_rate": 0.0001522409623840242, + "loss": 0.0104, + "num_input_tokens_seen": 204007920, + "step": 94445 + }, + { + "epoch": 15.407830342577487, + "grad_norm": 0.0012717196950688958, + "learning_rate": 0.00015218982259340908, + "loss": 0.0067, + "num_input_tokens_seen": 204018448, + "step": 94450 + }, + { + "epoch": 15.408646003262643, + "grad_norm": 0.07703101634979248, + "learning_rate": 0.0001521386898516088, + "loss": 0.0332, + "num_input_tokens_seen": 204028752, + "step": 94455 + }, + { + "epoch": 15.409461663947798, + "grad_norm": 0.07954272627830505, + "learning_rate": 0.0001520875641596589, + "loss": 0.0054, + "num_input_tokens_seen": 204039344, + "step": 94460 + }, + { + "epoch": 15.410277324632952, + "grad_norm": 0.008622833527624607, + "learning_rate": 0.0001520364455185962, + "loss": 0.029, + "num_input_tokens_seen": 204051376, + "step": 94465 + }, + { + "epoch": 15.411092985318108, + "grad_norm": 0.2871306538581848, + "learning_rate": 0.00015198533392945602, + "loss": 0.019, + "num_input_tokens_seen": 204062672, + "step": 94470 + }, + { + "epoch": 15.411908646003262, + "grad_norm": 0.531629741191864, + "learning_rate": 0.00015193422939327488, + "loss": 0.2146, + "num_input_tokens_seen": 204073360, + "step": 94475 + }, + { + "epoch": 15.412724306688418, + "grad_norm": 0.42784038186073303, + "learning_rate": 0.00015188313191108783, + "loss": 0.0204, + "num_input_tokens_seen": 204084592, + "step": 94480 + }, + { + "epoch": 15.413539967373573, + "grad_norm": 0.07477138191461563, + "learning_rate": 0.00015183204148393103, + "loss": 0.0222, + "num_input_tokens_seen": 204095696, + "step": 94485 + }, + { + "epoch": 15.414355628058727, + "grad_norm": 0.0180678591132164, + "learning_rate": 0.00015178095811283927, + "loss": 0.0026, + "num_input_tokens_seen": 204106864, + "step": 94490 + }, + { + "epoch": 15.415171288743883, + "grad_norm": 0.04884007200598717, + "learning_rate": 0.00015172988179884846, + "loss": 0.0038, + "num_input_tokens_seen": 204118896, + "step": 94495 + }, + { + "epoch": 15.415986949429037, + "grad_norm": 0.00031529387342743576, + "learning_rate": 0.0001516788125429931, + "loss": 0.0027, + "num_input_tokens_seen": 204128016, + "step": 94500 + }, + { + "epoch": 15.416802610114193, + "grad_norm": 0.2522253394126892, + "learning_rate": 0.0001516277503463086, + "loss": 0.0153, + "num_input_tokens_seen": 204138672, + "step": 94505 + }, + { + "epoch": 15.417618270799348, + "grad_norm": 0.03922179341316223, + "learning_rate": 0.00015157669520982975, + "loss": 0.0071, + "num_input_tokens_seen": 204149840, + "step": 94510 + }, + { + "epoch": 15.418433931484502, + "grad_norm": 0.007353988941758871, + "learning_rate": 0.0001515256471345911, + "loss": 0.0041, + "num_input_tokens_seen": 204159664, + "step": 94515 + }, + { + "epoch": 15.419249592169658, + "grad_norm": 0.005019092466682196, + "learning_rate": 0.00015147460612162733, + "loss": 0.0016, + "num_input_tokens_seen": 204171056, + "step": 94520 + }, + { + "epoch": 15.420065252854812, + "grad_norm": 0.03900919482111931, + "learning_rate": 0.00015142357217197278, + "loss": 0.0064, + "num_input_tokens_seen": 204181296, + "step": 94525 + }, + { + "epoch": 15.420880913539968, + "grad_norm": 0.05264601483941078, + "learning_rate": 0.00015137254528666178, + "loss": 0.0039, + "num_input_tokens_seen": 204192304, + "step": 94530 + }, + { + "epoch": 15.421696574225122, + "grad_norm": 0.08832412958145142, + "learning_rate": 0.0001513215254667284, + "loss": 0.0091, + "num_input_tokens_seen": 204203536, + "step": 94535 + }, + { + "epoch": 15.422512234910277, + "grad_norm": 0.0050577265210449696, + "learning_rate": 0.00015127051271320664, + "loss": 0.0023, + "num_input_tokens_seen": 204214928, + "step": 94540 + }, + { + "epoch": 15.423327895595433, + "grad_norm": 0.0012761307880282402, + "learning_rate": 0.00015121950702713029, + "loss": 0.0014, + "num_input_tokens_seen": 204226000, + "step": 94545 + }, + { + "epoch": 15.424143556280587, + "grad_norm": 0.0036849654279649258, + "learning_rate": 0.00015116850840953311, + "loss": 0.0026, + "num_input_tokens_seen": 204237200, + "step": 94550 + }, + { + "epoch": 15.424959216965743, + "grad_norm": 0.0011683054035529494, + "learning_rate": 0.00015111751686144864, + "loss": 0.0022, + "num_input_tokens_seen": 204248144, + "step": 94555 + }, + { + "epoch": 15.425774877650896, + "grad_norm": 0.10220471024513245, + "learning_rate": 0.00015106653238391028, + "loss": 0.0118, + "num_input_tokens_seen": 204258768, + "step": 94560 + }, + { + "epoch": 15.426590538336052, + "grad_norm": 0.0024899379350245, + "learning_rate": 0.00015101555497795127, + "loss": 0.0048, + "num_input_tokens_seen": 204268496, + "step": 94565 + }, + { + "epoch": 15.427406199021208, + "grad_norm": 0.0045095449313521385, + "learning_rate": 0.00015096458464460482, + "loss": 0.0019, + "num_input_tokens_seen": 204278864, + "step": 94570 + }, + { + "epoch": 15.428221859706362, + "grad_norm": 0.01300779264420271, + "learning_rate": 0.0001509136213849038, + "loss": 0.0017, + "num_input_tokens_seen": 204289648, + "step": 94575 + }, + { + "epoch": 15.429037520391518, + "grad_norm": 0.0015890076756477356, + "learning_rate": 0.00015086266519988108, + "loss": 0.0153, + "num_input_tokens_seen": 204300528, + "step": 94580 + }, + { + "epoch": 15.429853181076671, + "grad_norm": 0.002844614442437887, + "learning_rate": 0.00015081171609056937, + "loss": 0.0044, + "num_input_tokens_seen": 204310896, + "step": 94585 + }, + { + "epoch": 15.430668841761827, + "grad_norm": 0.0012606673408299685, + "learning_rate": 0.00015076077405800126, + "loss": 0.0015, + "num_input_tokens_seen": 204322096, + "step": 94590 + }, + { + "epoch": 15.431484502446983, + "grad_norm": 0.0005698453169316053, + "learning_rate": 0.0001507098391032089, + "loss": 0.0008, + "num_input_tokens_seen": 204333616, + "step": 94595 + }, + { + "epoch": 15.432300163132137, + "grad_norm": 0.024903813377022743, + "learning_rate": 0.00015065891122722507, + "loss": 0.0044, + "num_input_tokens_seen": 204343824, + "step": 94600 + }, + { + "epoch": 15.433115823817293, + "grad_norm": 0.005087335593998432, + "learning_rate": 0.00015060799043108126, + "loss": 0.0015, + "num_input_tokens_seen": 204354128, + "step": 94605 + }, + { + "epoch": 15.433931484502446, + "grad_norm": 0.0013004738138988614, + "learning_rate": 0.00015055707671581008, + "loss": 0.0059, + "num_input_tokens_seen": 204365008, + "step": 94610 + }, + { + "epoch": 15.434747145187602, + "grad_norm": 0.006157164927572012, + "learning_rate": 0.00015050617008244272, + "loss": 0.0014, + "num_input_tokens_seen": 204376368, + "step": 94615 + }, + { + "epoch": 15.435562805872756, + "grad_norm": 0.0021772703621536493, + "learning_rate": 0.00015045527053201137, + "loss": 0.0025, + "num_input_tokens_seen": 204386288, + "step": 94620 + }, + { + "epoch": 15.436378466557912, + "grad_norm": 0.020397908985614777, + "learning_rate": 0.00015040437806554735, + "loss": 0.1338, + "num_input_tokens_seen": 204396784, + "step": 94625 + }, + { + "epoch": 15.437194127243067, + "grad_norm": 0.004979412537068129, + "learning_rate": 0.00015035349268408216, + "loss": 0.0911, + "num_input_tokens_seen": 204407184, + "step": 94630 + }, + { + "epoch": 15.438009787928221, + "grad_norm": 0.009470396675169468, + "learning_rate": 0.00015030261438864694, + "loss": 0.0013, + "num_input_tokens_seen": 204416400, + "step": 94635 + }, + { + "epoch": 15.438825448613377, + "grad_norm": 0.0030095677357167006, + "learning_rate": 0.0001502517431802729, + "loss": 0.0014, + "num_input_tokens_seen": 204426256, + "step": 94640 + }, + { + "epoch": 15.439641109298531, + "grad_norm": 0.041119664907455444, + "learning_rate": 0.00015020087905999097, + "loss": 0.0014, + "num_input_tokens_seen": 204437424, + "step": 94645 + }, + { + "epoch": 15.440456769983687, + "grad_norm": 0.008877326734364033, + "learning_rate": 0.00015015002202883193, + "loss": 0.0063, + "num_input_tokens_seen": 204446800, + "step": 94650 + }, + { + "epoch": 15.441272430668842, + "grad_norm": 0.07724998146295547, + "learning_rate": 0.00015009917208782657, + "loss": 0.017, + "num_input_tokens_seen": 204458576, + "step": 94655 + }, + { + "epoch": 15.442088091353996, + "grad_norm": 0.005192750133574009, + "learning_rate": 0.00015004832923800533, + "loss": 0.0254, + "num_input_tokens_seen": 204468752, + "step": 94660 + }, + { + "epoch": 15.442903752039152, + "grad_norm": 0.010963196866214275, + "learning_rate": 0.00014999749348039866, + "loss": 0.0034, + "num_input_tokens_seen": 204479024, + "step": 94665 + }, + { + "epoch": 15.443719412724306, + "grad_norm": 0.07558204233646393, + "learning_rate": 0.0001499466648160368, + "loss": 0.0059, + "num_input_tokens_seen": 204489744, + "step": 94670 + }, + { + "epoch": 15.444535073409462, + "grad_norm": 0.005794727709144354, + "learning_rate": 0.00014989584324594986, + "loss": 0.0898, + "num_input_tokens_seen": 204500592, + "step": 94675 + }, + { + "epoch": 15.445350734094617, + "grad_norm": 0.005403991788625717, + "learning_rate": 0.00014984502877116773, + "loss": 0.0019, + "num_input_tokens_seen": 204510960, + "step": 94680 + }, + { + "epoch": 15.446166394779771, + "grad_norm": 0.018055513501167297, + "learning_rate": 0.00014979422139272037, + "loss": 0.0028, + "num_input_tokens_seen": 204521776, + "step": 94685 + }, + { + "epoch": 15.446982055464927, + "grad_norm": 0.01070409920066595, + "learning_rate": 0.00014974342111163735, + "loss": 0.0025, + "num_input_tokens_seen": 204530896, + "step": 94690 + }, + { + "epoch": 15.447797716150081, + "grad_norm": 0.09509050846099854, + "learning_rate": 0.00014969262792894822, + "loss": 0.0031, + "num_input_tokens_seen": 204541648, + "step": 94695 + }, + { + "epoch": 15.448613376835237, + "grad_norm": 0.05279732868075371, + "learning_rate": 0.0001496418418456824, + "loss": 0.0027, + "num_input_tokens_seen": 204552784, + "step": 94700 + }, + { + "epoch": 15.449429037520392, + "grad_norm": 0.0030731274746358395, + "learning_rate": 0.0001495910628628691, + "loss": 0.0572, + "num_input_tokens_seen": 204564016, + "step": 94705 + }, + { + "epoch": 15.450244698205546, + "grad_norm": 0.0014273212291300297, + "learning_rate": 0.00014954029098153748, + "loss": 0.0031, + "num_input_tokens_seen": 204574032, + "step": 94710 + }, + { + "epoch": 15.451060358890702, + "grad_norm": 0.001934555359184742, + "learning_rate": 0.00014948952620271643, + "loss": 0.0017, + "num_input_tokens_seen": 204585552, + "step": 94715 + }, + { + "epoch": 15.451876019575856, + "grad_norm": 0.0016809795051813126, + "learning_rate": 0.00014943876852743475, + "loss": 0.0012, + "num_input_tokens_seen": 204596848, + "step": 94720 + }, + { + "epoch": 15.452691680261012, + "grad_norm": 0.0013799264561384916, + "learning_rate": 0.00014938801795672102, + "loss": 0.0047, + "num_input_tokens_seen": 204607792, + "step": 94725 + }, + { + "epoch": 15.453507340946166, + "grad_norm": 0.02855961211025715, + "learning_rate": 0.00014933727449160423, + "loss": 0.0382, + "num_input_tokens_seen": 204619184, + "step": 94730 + }, + { + "epoch": 15.454323001631321, + "grad_norm": 0.013205167837440968, + "learning_rate": 0.00014928653813311204, + "loss": 0.003, + "num_input_tokens_seen": 204629584, + "step": 94735 + }, + { + "epoch": 15.455138662316477, + "grad_norm": 0.0069459774531424046, + "learning_rate": 0.00014923580888227329, + "loss": 0.0044, + "num_input_tokens_seen": 204641424, + "step": 94740 + }, + { + "epoch": 15.455954323001631, + "grad_norm": 0.004139984026551247, + "learning_rate": 0.00014918508674011582, + "loss": 0.0041, + "num_input_tokens_seen": 204652208, + "step": 94745 + }, + { + "epoch": 15.456769983686787, + "grad_norm": 0.004607068374752998, + "learning_rate": 0.0001491343717076676, + "loss": 0.0024, + "num_input_tokens_seen": 204662608, + "step": 94750 + }, + { + "epoch": 15.45758564437194, + "grad_norm": 1.534938931465149, + "learning_rate": 0.00014908366378595645, + "loss": 0.0989, + "num_input_tokens_seen": 204674448, + "step": 94755 + }, + { + "epoch": 15.458401305057096, + "grad_norm": 0.002478801878169179, + "learning_rate": 0.00014903296297601, + "loss": 0.0011, + "num_input_tokens_seen": 204686096, + "step": 94760 + }, + { + "epoch": 15.459216965742252, + "grad_norm": 0.010276932269334793, + "learning_rate": 0.00014898226927885584, + "loss": 0.0018, + "num_input_tokens_seen": 204696048, + "step": 94765 + }, + { + "epoch": 15.460032626427406, + "grad_norm": 0.0010761553421616554, + "learning_rate": 0.00014893158269552127, + "loss": 0.0016, + "num_input_tokens_seen": 204706704, + "step": 94770 + }, + { + "epoch": 15.460848287112562, + "grad_norm": 0.02356075681746006, + "learning_rate": 0.00014888090322703353, + "loss": 0.0051, + "num_input_tokens_seen": 204716144, + "step": 94775 + }, + { + "epoch": 15.461663947797716, + "grad_norm": 0.023741189390420914, + "learning_rate": 0.00014883023087441965, + "loss": 0.0028, + "num_input_tokens_seen": 204727984, + "step": 94780 + }, + { + "epoch": 15.462479608482871, + "grad_norm": 0.5104894042015076, + "learning_rate": 0.0001487795656387067, + "loss": 0.0932, + "num_input_tokens_seen": 204738960, + "step": 94785 + }, + { + "epoch": 15.463295269168025, + "grad_norm": 0.006746853701770306, + "learning_rate": 0.00014872890752092144, + "loss": 0.0011, + "num_input_tokens_seen": 204749456, + "step": 94790 + }, + { + "epoch": 15.464110929853181, + "grad_norm": 0.023834139108657837, + "learning_rate": 0.00014867825652209045, + "loss": 0.0064, + "num_input_tokens_seen": 204760304, + "step": 94795 + }, + { + "epoch": 15.464926590538337, + "grad_norm": 0.002329958835616708, + "learning_rate": 0.00014862761264324025, + "loss": 0.0008, + "num_input_tokens_seen": 204771984, + "step": 94800 + }, + { + "epoch": 15.46574225122349, + "grad_norm": 0.007659323513507843, + "learning_rate": 0.00014857697588539727, + "loss": 0.0029, + "num_input_tokens_seen": 204782640, + "step": 94805 + }, + { + "epoch": 15.466557911908646, + "grad_norm": 0.0002704902726691216, + "learning_rate": 0.00014852634624958766, + "loss": 0.0101, + "num_input_tokens_seen": 204791632, + "step": 94810 + }, + { + "epoch": 15.4673735725938, + "grad_norm": 0.015242592431604862, + "learning_rate": 0.00014847572373683749, + "loss": 0.0133, + "num_input_tokens_seen": 204804080, + "step": 94815 + }, + { + "epoch": 15.468189233278956, + "grad_norm": 0.006385531276464462, + "learning_rate": 0.00014842510834817274, + "loss": 0.0027, + "num_input_tokens_seen": 204814672, + "step": 94820 + }, + { + "epoch": 15.469004893964112, + "grad_norm": 0.005965860094875097, + "learning_rate": 0.00014837450008461922, + "loss": 0.0024, + "num_input_tokens_seen": 204825552, + "step": 94825 + }, + { + "epoch": 15.469820554649266, + "grad_norm": 0.002931939670816064, + "learning_rate": 0.00014832389894720233, + "loss": 0.001, + "num_input_tokens_seen": 204835280, + "step": 94830 + }, + { + "epoch": 15.470636215334421, + "grad_norm": 0.0005909419851377606, + "learning_rate": 0.00014827330493694807, + "loss": 0.0078, + "num_input_tokens_seen": 204844752, + "step": 94835 + }, + { + "epoch": 15.471451876019575, + "grad_norm": 0.0040916381403803825, + "learning_rate": 0.0001482227180548812, + "loss": 0.004, + "num_input_tokens_seen": 204855472, + "step": 94840 + }, + { + "epoch": 15.47226753670473, + "grad_norm": 0.006552472244948149, + "learning_rate": 0.00014817213830202748, + "loss": 0.008, + "num_input_tokens_seen": 204866832, + "step": 94845 + }, + { + "epoch": 15.473083197389887, + "grad_norm": 0.00648926105350256, + "learning_rate": 0.00014812156567941143, + "loss": 0.0004, + "num_input_tokens_seen": 204877712, + "step": 94850 + }, + { + "epoch": 15.47389885807504, + "grad_norm": 0.2792307734489441, + "learning_rate": 0.00014807100018805853, + "loss": 0.0058, + "num_input_tokens_seen": 204887696, + "step": 94855 + }, + { + "epoch": 15.474714518760196, + "grad_norm": 0.017968228086829185, + "learning_rate": 0.00014802044182899294, + "loss": 0.0026, + "num_input_tokens_seen": 204899632, + "step": 94860 + }, + { + "epoch": 15.47553017944535, + "grad_norm": 0.00422089034691453, + "learning_rate": 0.00014796989060323997, + "loss": 0.0009, + "num_input_tokens_seen": 204910000, + "step": 94865 + }, + { + "epoch": 15.476345840130506, + "grad_norm": 0.0017101641278713942, + "learning_rate": 0.00014791934651182338, + "loss": 0.0653, + "num_input_tokens_seen": 204921168, + "step": 94870 + }, + { + "epoch": 15.477161500815662, + "grad_norm": 0.0047962963581085205, + "learning_rate": 0.0001478688095557682, + "loss": 0.0013, + "num_input_tokens_seen": 204930704, + "step": 94875 + }, + { + "epoch": 15.477977161500815, + "grad_norm": 0.02295534871518612, + "learning_rate": 0.00014781827973609803, + "loss": 0.027, + "num_input_tokens_seen": 204941872, + "step": 94880 + }, + { + "epoch": 15.478792822185971, + "grad_norm": 0.008626680821180344, + "learning_rate": 0.00014776775705383733, + "loss": 0.0009, + "num_input_tokens_seen": 204952208, + "step": 94885 + }, + { + "epoch": 15.479608482871125, + "grad_norm": 0.0017688804073259234, + "learning_rate": 0.00014771724151000986, + "loss": 0.0034, + "num_input_tokens_seen": 204963728, + "step": 94890 + }, + { + "epoch": 15.48042414355628, + "grad_norm": 0.019344795495271683, + "learning_rate": 0.00014766673310563945, + "loss": 0.0008, + "num_input_tokens_seen": 204975440, + "step": 94895 + }, + { + "epoch": 15.481239804241435, + "grad_norm": 0.013398679904639721, + "learning_rate": 0.0001476162318417496, + "loss": 0.0015, + "num_input_tokens_seen": 204986768, + "step": 94900 + }, + { + "epoch": 15.48205546492659, + "grad_norm": 0.05068299174308777, + "learning_rate": 0.00014756573771936382, + "loss": 0.0055, + "num_input_tokens_seen": 204998064, + "step": 94905 + }, + { + "epoch": 15.482871125611746, + "grad_norm": 0.002898820675909519, + "learning_rate": 0.0001475152507395055, + "loss": 0.0008, + "num_input_tokens_seen": 205008304, + "step": 94910 + }, + { + "epoch": 15.4836867862969, + "grad_norm": 0.016531746834516525, + "learning_rate": 0.00014746477090319781, + "loss": 0.0072, + "num_input_tokens_seen": 205018192, + "step": 94915 + }, + { + "epoch": 15.484502446982056, + "grad_norm": 0.06572934985160828, + "learning_rate": 0.00014741429821146375, + "loss": 0.0037, + "num_input_tokens_seen": 205028816, + "step": 94920 + }, + { + "epoch": 15.48531810766721, + "grad_norm": 0.001525534433312714, + "learning_rate": 0.00014736383266532622, + "loss": 0.0006, + "num_input_tokens_seen": 205039920, + "step": 94925 + }, + { + "epoch": 15.486133768352365, + "grad_norm": 0.0032828834373503923, + "learning_rate": 0.00014731337426580792, + "loss": 0.0048, + "num_input_tokens_seen": 205050032, + "step": 94930 + }, + { + "epoch": 15.486949429037521, + "grad_norm": 0.01835629530251026, + "learning_rate": 0.0001472629230139314, + "loss": 0.0046, + "num_input_tokens_seen": 205061968, + "step": 94935 + }, + { + "epoch": 15.487765089722675, + "grad_norm": 0.007460998836904764, + "learning_rate": 0.00014721247891071954, + "loss": 0.0015, + "num_input_tokens_seen": 205072976, + "step": 94940 + }, + { + "epoch": 15.48858075040783, + "grad_norm": 0.0013818942243233323, + "learning_rate": 0.00014716204195719396, + "loss": 0.0032, + "num_input_tokens_seen": 205082832, + "step": 94945 + }, + { + "epoch": 15.489396411092985, + "grad_norm": 0.008082177489995956, + "learning_rate": 0.00014711161215437757, + "loss": 0.0025, + "num_input_tokens_seen": 205094736, + "step": 94950 + }, + { + "epoch": 15.49021207177814, + "grad_norm": 0.0091333519667387, + "learning_rate": 0.00014706118950329173, + "loss": 0.0014, + "num_input_tokens_seen": 205105616, + "step": 94955 + }, + { + "epoch": 15.491027732463296, + "grad_norm": 0.001196563825942576, + "learning_rate": 0.00014701077400495894, + "loss": 0.0662, + "num_input_tokens_seen": 205115312, + "step": 94960 + }, + { + "epoch": 15.49184339314845, + "grad_norm": 0.0008588659111410379, + "learning_rate": 0.00014696036566040028, + "loss": 0.0265, + "num_input_tokens_seen": 205126800, + "step": 94965 + }, + { + "epoch": 15.492659053833606, + "grad_norm": 0.00046497659059241414, + "learning_rate": 0.00014690996447063798, + "loss": 0.0014, + "num_input_tokens_seen": 205137808, + "step": 94970 + }, + { + "epoch": 15.49347471451876, + "grad_norm": 0.001684483140707016, + "learning_rate": 0.00014685957043669283, + "loss": 0.0017, + "num_input_tokens_seen": 205148560, + "step": 94975 + }, + { + "epoch": 15.494290375203915, + "grad_norm": 0.08418180793523788, + "learning_rate": 0.00014680918355958683, + "loss": 0.0681, + "num_input_tokens_seen": 205160080, + "step": 94980 + }, + { + "epoch": 15.49510603588907, + "grad_norm": 0.0057790386490523815, + "learning_rate": 0.00014675880384034046, + "loss": 0.0023, + "num_input_tokens_seen": 205170544, + "step": 94985 + }, + { + "epoch": 15.495921696574225, + "grad_norm": 0.001607961137779057, + "learning_rate": 0.00014670843127997542, + "loss": 0.0033, + "num_input_tokens_seen": 205181712, + "step": 94990 + }, + { + "epoch": 15.49673735725938, + "grad_norm": 0.0023566484451293945, + "learning_rate": 0.0001466580658795118, + "loss": 0.0023, + "num_input_tokens_seen": 205192560, + "step": 94995 + }, + { + "epoch": 15.497553017944535, + "grad_norm": 0.14754217863082886, + "learning_rate": 0.00014660770763997105, + "loss": 0.106, + "num_input_tokens_seen": 205202032, + "step": 95000 + }, + { + "epoch": 15.49836867862969, + "grad_norm": 0.009285212494432926, + "learning_rate": 0.00014655735656237312, + "loss": 0.0017, + "num_input_tokens_seen": 205213104, + "step": 95005 + }, + { + "epoch": 15.499184339314844, + "grad_norm": 0.0009993219282478094, + "learning_rate": 0.00014650701264773907, + "loss": 0.0007, + "num_input_tokens_seen": 205224752, + "step": 95010 + }, + { + "epoch": 15.5, + "grad_norm": 0.04408176988363266, + "learning_rate": 0.0001464566758970885, + "loss": 0.0042, + "num_input_tokens_seen": 205236304, + "step": 95015 + }, + { + "epoch": 15.500815660685156, + "grad_norm": 0.0037940412294119596, + "learning_rate": 0.00014640634631144206, + "loss": 0.0014, + "num_input_tokens_seen": 205246224, + "step": 95020 + }, + { + "epoch": 15.50163132137031, + "grad_norm": 0.013590458780527115, + "learning_rate": 0.00014635602389181956, + "loss": 0.003, + "num_input_tokens_seen": 205257424, + "step": 95025 + }, + { + "epoch": 15.502446982055465, + "grad_norm": 0.008497872389853, + "learning_rate": 0.00014630570863924088, + "loss": 0.008, + "num_input_tokens_seen": 205268560, + "step": 95030 + }, + { + "epoch": 15.50326264274062, + "grad_norm": 0.005137204192578793, + "learning_rate": 0.0001462554005547257, + "loss": 0.0047, + "num_input_tokens_seen": 205279280, + "step": 95035 + }, + { + "epoch": 15.504078303425775, + "grad_norm": 0.012710629031062126, + "learning_rate": 0.00014620509963929362, + "loss": 0.0027, + "num_input_tokens_seen": 205289616, + "step": 95040 + }, + { + "epoch": 15.50489396411093, + "grad_norm": 0.03326995298266411, + "learning_rate": 0.00014615480589396396, + "loss": 0.0075, + "num_input_tokens_seen": 205300976, + "step": 95045 + }, + { + "epoch": 15.505709624796085, + "grad_norm": 0.0013846260262653232, + "learning_rate": 0.0001461045193197561, + "loss": 0.0344, + "num_input_tokens_seen": 205311568, + "step": 95050 + }, + { + "epoch": 15.50652528548124, + "grad_norm": 0.02186959609389305, + "learning_rate": 0.00014605423991768908, + "loss": 0.0199, + "num_input_tokens_seen": 205322032, + "step": 95055 + }, + { + "epoch": 15.507340946166394, + "grad_norm": 0.00837793666869402, + "learning_rate": 0.00014600396768878188, + "loss": 0.0045, + "num_input_tokens_seen": 205333104, + "step": 95060 + }, + { + "epoch": 15.50815660685155, + "grad_norm": 0.18578238785266876, + "learning_rate": 0.0001459537026340534, + "loss": 0.0141, + "num_input_tokens_seen": 205344496, + "step": 95065 + }, + { + "epoch": 15.508972267536706, + "grad_norm": 0.00431056646630168, + "learning_rate": 0.0001459034447545222, + "loss": 0.0303, + "num_input_tokens_seen": 205355248, + "step": 95070 + }, + { + "epoch": 15.50978792822186, + "grad_norm": 0.011880909092724323, + "learning_rate": 0.00014585319405120695, + "loss": 0.0025, + "num_input_tokens_seen": 205367600, + "step": 95075 + }, + { + "epoch": 15.510603588907015, + "grad_norm": 0.0075214398093521595, + "learning_rate": 0.0001458029505251258, + "loss": 0.0066, + "num_input_tokens_seen": 205376976, + "step": 95080 + }, + { + "epoch": 15.51141924959217, + "grad_norm": 0.01479738112539053, + "learning_rate": 0.0001457527141772975, + "loss": 0.0722, + "num_input_tokens_seen": 205388272, + "step": 95085 + }, + { + "epoch": 15.512234910277325, + "grad_norm": 0.0015288260765373707, + "learning_rate": 0.00014570248500873963, + "loss": 0.0023, + "num_input_tokens_seen": 205398864, + "step": 95090 + }, + { + "epoch": 15.513050570962479, + "grad_norm": 0.039064157754182816, + "learning_rate": 0.00014565226302047058, + "loss": 0.0097, + "num_input_tokens_seen": 205409872, + "step": 95095 + }, + { + "epoch": 15.513866231647635, + "grad_norm": 0.44162848591804504, + "learning_rate": 0.00014560204821350764, + "loss": 0.0117, + "num_input_tokens_seen": 205420208, + "step": 95100 + }, + { + "epoch": 15.51468189233279, + "grad_norm": 0.0027341239620000124, + "learning_rate": 0.00014555184058886905, + "loss": 0.0178, + "num_input_tokens_seen": 205431536, + "step": 95105 + }, + { + "epoch": 15.515497553017944, + "grad_norm": 0.008773371577262878, + "learning_rate": 0.00014550164014757183, + "loss": 0.0011, + "num_input_tokens_seen": 205443504, + "step": 95110 + }, + { + "epoch": 15.5163132137031, + "grad_norm": 0.6953399181365967, + "learning_rate": 0.00014545144689063382, + "loss": 0.2316, + "num_input_tokens_seen": 205453456, + "step": 95115 + }, + { + "epoch": 15.517128874388254, + "grad_norm": 0.7138208746910095, + "learning_rate": 0.0001454012608190718, + "loss": 0.072, + "num_input_tokens_seen": 205465136, + "step": 95120 + }, + { + "epoch": 15.51794453507341, + "grad_norm": 0.0015598111785948277, + "learning_rate": 0.0001453510819339033, + "loss": 0.0026, + "num_input_tokens_seen": 205475184, + "step": 95125 + }, + { + "epoch": 15.518760195758565, + "grad_norm": 0.00230622966773808, + "learning_rate": 0.0001453009102361447, + "loss": 0.0065, + "num_input_tokens_seen": 205484688, + "step": 95130 + }, + { + "epoch": 15.51957585644372, + "grad_norm": 0.02480519562959671, + "learning_rate": 0.0001452507457268135, + "loss": 0.0032, + "num_input_tokens_seen": 205494960, + "step": 95135 + }, + { + "epoch": 15.520391517128875, + "grad_norm": 0.0014786440879106522, + "learning_rate": 0.00014520058840692562, + "loss": 0.0009, + "num_input_tokens_seen": 205506064, + "step": 95140 + }, + { + "epoch": 15.521207177814029, + "grad_norm": 0.023763207718729973, + "learning_rate": 0.00014515043827749812, + "loss": 0.0023, + "num_input_tokens_seen": 205516944, + "step": 95145 + }, + { + "epoch": 15.522022838499185, + "grad_norm": 0.05934037268161774, + "learning_rate": 0.0001451002953395471, + "loss": 0.0033, + "num_input_tokens_seen": 205527056, + "step": 95150 + }, + { + "epoch": 15.522838499184338, + "grad_norm": 0.015051600523293018, + "learning_rate": 0.00014505015959408884, + "loss": 0.0016, + "num_input_tokens_seen": 205536944, + "step": 95155 + }, + { + "epoch": 15.523654159869494, + "grad_norm": 0.0194566547870636, + "learning_rate": 0.00014500003104213932, + "loss": 0.0033, + "num_input_tokens_seen": 205546768, + "step": 95160 + }, + { + "epoch": 15.52446982055465, + "grad_norm": 0.004796968307346106, + "learning_rate": 0.0001449499096847146, + "loss": 0.0064, + "num_input_tokens_seen": 205557200, + "step": 95165 + }, + { + "epoch": 15.525285481239804, + "grad_norm": 0.004861933179199696, + "learning_rate": 0.00014489979552283035, + "loss": 0.093, + "num_input_tokens_seen": 205568144, + "step": 95170 + }, + { + "epoch": 15.52610114192496, + "grad_norm": 0.023706277832388878, + "learning_rate": 0.0001448496885575022, + "loss": 0.0026, + "num_input_tokens_seen": 205579504, + "step": 95175 + }, + { + "epoch": 15.526916802610113, + "grad_norm": 0.010353012010455132, + "learning_rate": 0.00014479958878974564, + "loss": 0.0029, + "num_input_tokens_seen": 205590256, + "step": 95180 + }, + { + "epoch": 15.52773246329527, + "grad_norm": 0.018889334052801132, + "learning_rate": 0.00014474949622057603, + "loss": 0.002, + "num_input_tokens_seen": 205601520, + "step": 95185 + }, + { + "epoch": 15.528548123980425, + "grad_norm": 0.008149920962750912, + "learning_rate": 0.00014469941085100857, + "loss": 0.0015, + "num_input_tokens_seen": 205612304, + "step": 95190 + }, + { + "epoch": 15.529363784665579, + "grad_norm": 0.030473439022898674, + "learning_rate": 0.00014464933268205826, + "loss": 0.0017, + "num_input_tokens_seen": 205622768, + "step": 95195 + }, + { + "epoch": 15.530179445350734, + "grad_norm": 0.01042587123811245, + "learning_rate": 0.00014459926171474002, + "loss": 0.0077, + "num_input_tokens_seen": 205633072, + "step": 95200 + }, + { + "epoch": 15.530995106035888, + "grad_norm": 0.06183413416147232, + "learning_rate": 0.0001445491979500686, + "loss": 0.0107, + "num_input_tokens_seen": 205643120, + "step": 95205 + }, + { + "epoch": 15.531810766721044, + "grad_norm": 0.0014860248193144798, + "learning_rate": 0.0001444991413890586, + "loss": 0.0024, + "num_input_tokens_seen": 205653296, + "step": 95210 + }, + { + "epoch": 15.5326264274062, + "grad_norm": 0.0037677655927836895, + "learning_rate": 0.00014444909203272438, + "loss": 0.0344, + "num_input_tokens_seen": 205662672, + "step": 95215 + }, + { + "epoch": 15.533442088091354, + "grad_norm": 0.034878209233284, + "learning_rate": 0.0001443990498820806, + "loss": 0.0067, + "num_input_tokens_seen": 205672752, + "step": 95220 + }, + { + "epoch": 15.53425774877651, + "grad_norm": 0.366243839263916, + "learning_rate": 0.0001443490149381409, + "loss": 0.022, + "num_input_tokens_seen": 205683984, + "step": 95225 + }, + { + "epoch": 15.535073409461663, + "grad_norm": 0.003331542480736971, + "learning_rate": 0.0001442989872019199, + "loss": 0.0056, + "num_input_tokens_seen": 205694992, + "step": 95230 + }, + { + "epoch": 15.535889070146819, + "grad_norm": 0.0007717033731751144, + "learning_rate": 0.00014424896667443083, + "loss": 0.0027, + "num_input_tokens_seen": 205706512, + "step": 95235 + }, + { + "epoch": 15.536704730831975, + "grad_norm": 0.004548640456050634, + "learning_rate": 0.00014419895335668809, + "loss": 0.0032, + "num_input_tokens_seen": 205717968, + "step": 95240 + }, + { + "epoch": 15.537520391517129, + "grad_norm": 0.006112218368798494, + "learning_rate": 0.00014414894724970462, + "loss": 0.0018, + "num_input_tokens_seen": 205728912, + "step": 95245 + }, + { + "epoch": 15.538336052202284, + "grad_norm": 0.02950763888657093, + "learning_rate": 0.00014409894835449444, + "loss": 0.0033, + "num_input_tokens_seen": 205739888, + "step": 95250 + }, + { + "epoch": 15.539151712887438, + "grad_norm": 0.1370188146829605, + "learning_rate": 0.00014404895667207028, + "loss": 0.0057, + "num_input_tokens_seen": 205751376, + "step": 95255 + }, + { + "epoch": 15.539967373572594, + "grad_norm": 0.0022804271429777145, + "learning_rate": 0.00014399897220344576, + "loss": 0.0006, + "num_input_tokens_seen": 205761072, + "step": 95260 + }, + { + "epoch": 15.540783034257748, + "grad_norm": 0.001658109133131802, + "learning_rate": 0.00014394899494963364, + "loss": 0.0009, + "num_input_tokens_seen": 205771824, + "step": 95265 + }, + { + "epoch": 15.541598694942904, + "grad_norm": 0.0006953808479011059, + "learning_rate": 0.00014389902491164681, + "loss": 0.1072, + "num_input_tokens_seen": 205781616, + "step": 95270 + }, + { + "epoch": 15.54241435562806, + "grad_norm": 0.060906656086444855, + "learning_rate": 0.00014384906209049804, + "loss": 0.002, + "num_input_tokens_seen": 205792528, + "step": 95275 + }, + { + "epoch": 15.543230016313213, + "grad_norm": 0.008214866742491722, + "learning_rate": 0.0001437991064871998, + "loss": 0.1226, + "num_input_tokens_seen": 205803568, + "step": 95280 + }, + { + "epoch": 15.544045676998369, + "grad_norm": 0.0363452285528183, + "learning_rate": 0.0001437491581027645, + "loss": 0.0863, + "num_input_tokens_seen": 205813424, + "step": 95285 + }, + { + "epoch": 15.544861337683523, + "grad_norm": 0.0006147515960037708, + "learning_rate": 0.00014369921693820447, + "loss": 0.0011, + "num_input_tokens_seen": 205824688, + "step": 95290 + }, + { + "epoch": 15.545676998368679, + "grad_norm": 0.004925783723592758, + "learning_rate": 0.00014364928299453184, + "loss": 0.0018, + "num_input_tokens_seen": 205835760, + "step": 95295 + }, + { + "epoch": 15.546492659053834, + "grad_norm": 0.0010029467521235347, + "learning_rate": 0.00014359935627275856, + "loss": 0.0034, + "num_input_tokens_seen": 205846672, + "step": 95300 + }, + { + "epoch": 15.547308319738988, + "grad_norm": 0.0026191682554781437, + "learning_rate": 0.00014354943677389643, + "loss": 0.0013, + "num_input_tokens_seen": 205858384, + "step": 95305 + }, + { + "epoch": 15.548123980424144, + "grad_norm": 0.007862421683967113, + "learning_rate": 0.00014349952449895715, + "loss": 0.0016, + "num_input_tokens_seen": 205868720, + "step": 95310 + }, + { + "epoch": 15.548939641109298, + "grad_norm": 0.0005457144579850137, + "learning_rate": 0.00014344961944895223, + "loss": 0.0021, + "num_input_tokens_seen": 205877776, + "step": 95315 + }, + { + "epoch": 15.549755301794454, + "grad_norm": 0.011868278495967388, + "learning_rate": 0.00014339972162489317, + "loss": 0.015, + "num_input_tokens_seen": 205888912, + "step": 95320 + }, + { + "epoch": 15.550570962479608, + "grad_norm": 0.0013630108442157507, + "learning_rate": 0.0001433498310277911, + "loss": 0.0148, + "num_input_tokens_seen": 205900240, + "step": 95325 + }, + { + "epoch": 15.551386623164763, + "grad_norm": 0.004692132119089365, + "learning_rate": 0.0001432999476586571, + "loss": 0.0011, + "num_input_tokens_seen": 205911792, + "step": 95330 + }, + { + "epoch": 15.552202283849919, + "grad_norm": 0.002104677725583315, + "learning_rate": 0.00014325007151850218, + "loss": 0.0064, + "num_input_tokens_seen": 205922704, + "step": 95335 + }, + { + "epoch": 15.553017944535073, + "grad_norm": 0.030288465321063995, + "learning_rate": 0.00014320020260833716, + "loss": 0.0021, + "num_input_tokens_seen": 205934064, + "step": 95340 + }, + { + "epoch": 15.553833605220229, + "grad_norm": 0.0025799486320465803, + "learning_rate": 0.00014315034092917268, + "loss": 0.001, + "num_input_tokens_seen": 205943760, + "step": 95345 + }, + { + "epoch": 15.554649265905383, + "grad_norm": 0.0033469286281615496, + "learning_rate": 0.00014310048648201917, + "loss": 0.0049, + "num_input_tokens_seen": 205952528, + "step": 95350 + }, + { + "epoch": 15.555464926590538, + "grad_norm": 0.021118100732564926, + "learning_rate": 0.0001430506392678871, + "loss": 0.004, + "num_input_tokens_seen": 205963056, + "step": 95355 + }, + { + "epoch": 15.556280587275694, + "grad_norm": 0.024281244724988937, + "learning_rate": 0.00014300079928778646, + "loss": 0.0024, + "num_input_tokens_seen": 205973648, + "step": 95360 + }, + { + "epoch": 15.557096247960848, + "grad_norm": 0.09317111223936081, + "learning_rate": 0.00014295096654272772, + "loss": 0.0031, + "num_input_tokens_seen": 205984112, + "step": 95365 + }, + { + "epoch": 15.557911908646004, + "grad_norm": 0.0007803754997439682, + "learning_rate": 0.00014290114103372058, + "loss": 0.0567, + "num_input_tokens_seen": 205994704, + "step": 95370 + }, + { + "epoch": 15.558727569331158, + "grad_norm": 0.009101621806621552, + "learning_rate": 0.00014285132276177482, + "loss": 0.0037, + "num_input_tokens_seen": 206004784, + "step": 95375 + }, + { + "epoch": 15.559543230016313, + "grad_norm": 0.0009343404090031981, + "learning_rate": 0.00014280151172790006, + "loss": 0.0052, + "num_input_tokens_seen": 206015472, + "step": 95380 + }, + { + "epoch": 15.560358890701469, + "grad_norm": 0.0013652790803462267, + "learning_rate": 0.00014275170793310582, + "loss": 0.0072, + "num_input_tokens_seen": 206028336, + "step": 95385 + }, + { + "epoch": 15.561174551386623, + "grad_norm": 0.015270788222551346, + "learning_rate": 0.00014270191137840145, + "loss": 0.1079, + "num_input_tokens_seen": 206037424, + "step": 95390 + }, + { + "epoch": 15.561990212071779, + "grad_norm": 0.0003416137769818306, + "learning_rate": 0.00014265212206479604, + "loss": 0.014, + "num_input_tokens_seen": 206047600, + "step": 95395 + }, + { + "epoch": 15.562805872756933, + "grad_norm": 0.020423779264092445, + "learning_rate": 0.00014260233999329873, + "loss": 0.0035, + "num_input_tokens_seen": 206057488, + "step": 95400 + }, + { + "epoch": 15.563621533442088, + "grad_norm": 0.001411057892255485, + "learning_rate": 0.00014255256516491845, + "loss": 0.0022, + "num_input_tokens_seen": 206069008, + "step": 95405 + }, + { + "epoch": 15.564437194127244, + "grad_norm": 0.8168393969535828, + "learning_rate": 0.00014250279758066387, + "loss": 0.0089, + "num_input_tokens_seen": 206079984, + "step": 95410 + }, + { + "epoch": 15.565252854812398, + "grad_norm": 0.04632632061839104, + "learning_rate": 0.00014245303724154358, + "loss": 0.0034, + "num_input_tokens_seen": 206091248, + "step": 95415 + }, + { + "epoch": 15.566068515497554, + "grad_norm": 0.011034045368432999, + "learning_rate": 0.00014240328414856607, + "loss": 0.0018, + "num_input_tokens_seen": 206101328, + "step": 95420 + }, + { + "epoch": 15.566884176182707, + "grad_norm": 0.006134867202490568, + "learning_rate": 0.00014235353830273966, + "loss": 0.0027, + "num_input_tokens_seen": 206111472, + "step": 95425 + }, + { + "epoch": 15.567699836867863, + "grad_norm": 0.002034999430179596, + "learning_rate": 0.00014230379970507252, + "loss": 0.0046, + "num_input_tokens_seen": 206122736, + "step": 95430 + }, + { + "epoch": 15.568515497553017, + "grad_norm": 0.0007061712676659226, + "learning_rate": 0.00014225406835657262, + "loss": 0.0011, + "num_input_tokens_seen": 206134576, + "step": 95435 + }, + { + "epoch": 15.569331158238173, + "grad_norm": 0.04730561748147011, + "learning_rate": 0.00014220434425824785, + "loss": 0.0132, + "num_input_tokens_seen": 206145040, + "step": 95440 + }, + { + "epoch": 15.570146818923329, + "grad_norm": 0.006063089240342379, + "learning_rate": 0.00014215462741110597, + "loss": 0.0023, + "num_input_tokens_seen": 206156208, + "step": 95445 + }, + { + "epoch": 15.570962479608482, + "grad_norm": 0.5385159850120544, + "learning_rate": 0.00014210491781615453, + "loss": 0.0871, + "num_input_tokens_seen": 206167408, + "step": 95450 + }, + { + "epoch": 15.571778140293638, + "grad_norm": 0.002197818597778678, + "learning_rate": 0.00014205521547440092, + "loss": 0.0049, + "num_input_tokens_seen": 206177808, + "step": 95455 + }, + { + "epoch": 15.572593800978792, + "grad_norm": 0.009187380783259869, + "learning_rate": 0.00014200552038685249, + "loss": 0.0039, + "num_input_tokens_seen": 206188016, + "step": 95460 + }, + { + "epoch": 15.573409461663948, + "grad_norm": 0.00341814081184566, + "learning_rate": 0.00014195583255451633, + "loss": 0.0043, + "num_input_tokens_seen": 206199472, + "step": 95465 + }, + { + "epoch": 15.574225122349104, + "grad_norm": 0.0038523096591234207, + "learning_rate": 0.00014190615197839929, + "loss": 0.0035, + "num_input_tokens_seen": 206209520, + "step": 95470 + }, + { + "epoch": 15.575040783034257, + "grad_norm": 0.0005846923450008035, + "learning_rate": 0.00014185647865950861, + "loss": 0.0008, + "num_input_tokens_seen": 206220368, + "step": 95475 + }, + { + "epoch": 15.575856443719413, + "grad_norm": 0.22896532714366913, + "learning_rate": 0.00014180681259885048, + "loss": 0.1469, + "num_input_tokens_seen": 206231568, + "step": 95480 + }, + { + "epoch": 15.576672104404567, + "grad_norm": 0.0027704713866114616, + "learning_rate": 0.000141757153797432, + "loss": 0.001, + "num_input_tokens_seen": 206243376, + "step": 95485 + }, + { + "epoch": 15.577487765089723, + "grad_norm": 0.05355888605117798, + "learning_rate": 0.00014170750225625888, + "loss": 0.0442, + "num_input_tokens_seen": 206253360, + "step": 95490 + }, + { + "epoch": 15.578303425774878, + "grad_norm": 0.004209849517792463, + "learning_rate": 0.00014165785797633812, + "loss": 0.0015, + "num_input_tokens_seen": 206263856, + "step": 95495 + }, + { + "epoch": 15.579119086460032, + "grad_norm": 0.004551318474113941, + "learning_rate": 0.00014160822095867515, + "loss": 0.0031, + "num_input_tokens_seen": 206275376, + "step": 95500 + }, + { + "epoch": 15.579934747145188, + "grad_norm": 0.009526636451482773, + "learning_rate": 0.00014155859120427633, + "loss": 0.0051, + "num_input_tokens_seen": 206286608, + "step": 95505 + }, + { + "epoch": 15.580750407830342, + "grad_norm": 0.05764400586485863, + "learning_rate": 0.00014150896871414743, + "loss": 0.0085, + "num_input_tokens_seen": 206297616, + "step": 95510 + }, + { + "epoch": 15.581566068515498, + "grad_norm": 0.05977749451994896, + "learning_rate": 0.00014145935348929407, + "loss": 0.0058, + "num_input_tokens_seen": 206309232, + "step": 95515 + }, + { + "epoch": 15.582381729200652, + "grad_norm": 0.00463469885289669, + "learning_rate": 0.0001414097455307217, + "loss": 0.0054, + "num_input_tokens_seen": 206320528, + "step": 95520 + }, + { + "epoch": 15.583197389885807, + "grad_norm": 0.0017053117044270039, + "learning_rate": 0.00014136014483943576, + "loss": 0.0737, + "num_input_tokens_seen": 206330192, + "step": 95525 + }, + { + "epoch": 15.584013050570963, + "grad_norm": 0.02842816151678562, + "learning_rate": 0.0001413105514164415, + "loss": 0.0025, + "num_input_tokens_seen": 206341200, + "step": 95530 + }, + { + "epoch": 15.584828711256117, + "grad_norm": 0.006738144904375076, + "learning_rate": 0.0001412609652627439, + "loss": 0.0016, + "num_input_tokens_seen": 206351472, + "step": 95535 + }, + { + "epoch": 15.585644371941273, + "grad_norm": 0.13522782921791077, + "learning_rate": 0.00014121138637934795, + "loss": 0.0111, + "num_input_tokens_seen": 206362512, + "step": 95540 + }, + { + "epoch": 15.586460032626427, + "grad_norm": 0.01704743131995201, + "learning_rate": 0.00014116181476725838, + "loss": 0.0305, + "num_input_tokens_seen": 206374160, + "step": 95545 + }, + { + "epoch": 15.587275693311582, + "grad_norm": 0.0007164751878008246, + "learning_rate": 0.00014111225042747987, + "loss": 0.003, + "num_input_tokens_seen": 206383952, + "step": 95550 + }, + { + "epoch": 15.588091353996738, + "grad_norm": 0.0008742042118683457, + "learning_rate": 0.00014106269336101692, + "loss": 0.001, + "num_input_tokens_seen": 206393552, + "step": 95555 + }, + { + "epoch": 15.588907014681892, + "grad_norm": 0.0022135020699352026, + "learning_rate": 0.0001410131435688738, + "loss": 0.0014, + "num_input_tokens_seen": 206404112, + "step": 95560 + }, + { + "epoch": 15.589722675367048, + "grad_norm": 0.006435474380850792, + "learning_rate": 0.00014096360105205475, + "loss": 0.0063, + "num_input_tokens_seen": 206414704, + "step": 95565 + }, + { + "epoch": 15.590538336052202, + "grad_norm": 0.0010266860481351614, + "learning_rate": 0.00014091406581156373, + "loss": 0.0019, + "num_input_tokens_seen": 206425264, + "step": 95570 + }, + { + "epoch": 15.591353996737357, + "grad_norm": 0.00455155223608017, + "learning_rate": 0.00014086453784840463, + "loss": 0.0018, + "num_input_tokens_seen": 206434768, + "step": 95575 + }, + { + "epoch": 15.592169657422513, + "grad_norm": 0.0011100763222202659, + "learning_rate": 0.00014081501716358154, + "loss": 0.0017, + "num_input_tokens_seen": 206446928, + "step": 95580 + }, + { + "epoch": 15.592985318107667, + "grad_norm": 0.002654121723026037, + "learning_rate": 0.0001407655037580975, + "loss": 0.0012, + "num_input_tokens_seen": 206456880, + "step": 95585 + }, + { + "epoch": 15.593800978792823, + "grad_norm": 0.0031467049848288298, + "learning_rate": 0.0001407159976329565, + "loss": 0.001, + "num_input_tokens_seen": 206467216, + "step": 95590 + }, + { + "epoch": 15.594616639477977, + "grad_norm": 0.0007504248642362654, + "learning_rate": 0.00014066649878916133, + "loss": 0.0014, + "num_input_tokens_seen": 206478128, + "step": 95595 + }, + { + "epoch": 15.595432300163132, + "grad_norm": 0.0024569076485931873, + "learning_rate": 0.00014061700722771569, + "loss": 0.0006, + "num_input_tokens_seen": 206490128, + "step": 95600 + }, + { + "epoch": 15.596247960848288, + "grad_norm": 0.00822295993566513, + "learning_rate": 0.000140567522949622, + "loss": 0.0014, + "num_input_tokens_seen": 206501040, + "step": 95605 + }, + { + "epoch": 15.597063621533442, + "grad_norm": 0.052377086132764816, + "learning_rate": 0.00014051804595588375, + "loss": 0.002, + "num_input_tokens_seen": 206511696, + "step": 95610 + }, + { + "epoch": 15.597879282218598, + "grad_norm": 0.014150702394545078, + "learning_rate": 0.00014046857624750304, + "loss": 0.0034, + "num_input_tokens_seen": 206521936, + "step": 95615 + }, + { + "epoch": 15.598694942903752, + "grad_norm": 0.003785144304856658, + "learning_rate": 0.00014041911382548305, + "loss": 0.0012, + "num_input_tokens_seen": 206533200, + "step": 95620 + }, + { + "epoch": 15.599510603588907, + "grad_norm": 0.019707906991243362, + "learning_rate": 0.00014036965869082551, + "loss": 0.022, + "num_input_tokens_seen": 206544304, + "step": 95625 + }, + { + "epoch": 15.600326264274061, + "grad_norm": 0.01421266794204712, + "learning_rate": 0.00014032021084453344, + "loss": 0.0058, + "num_input_tokens_seen": 206555152, + "step": 95630 + }, + { + "epoch": 15.601141924959217, + "grad_norm": 0.01474962942302227, + "learning_rate": 0.0001402707702876082, + "loss": 0.0073, + "num_input_tokens_seen": 206567056, + "step": 95635 + }, + { + "epoch": 15.601957585644373, + "grad_norm": 0.0010720965219661593, + "learning_rate": 0.0001402213370210525, + "loss": 0.0015, + "num_input_tokens_seen": 206577520, + "step": 95640 + }, + { + "epoch": 15.602773246329527, + "grad_norm": 0.007528163492679596, + "learning_rate": 0.00014017191104586751, + "loss": 0.0059, + "num_input_tokens_seen": 206588720, + "step": 95645 + }, + { + "epoch": 15.603588907014682, + "grad_norm": 0.045820482075214386, + "learning_rate": 0.00014012249236305542, + "loss": 0.0033, + "num_input_tokens_seen": 206599760, + "step": 95650 + }, + { + "epoch": 15.604404567699836, + "grad_norm": 0.009412800893187523, + "learning_rate": 0.00014007308097361749, + "loss": 0.0043, + "num_input_tokens_seen": 206610192, + "step": 95655 + }, + { + "epoch": 15.605220228384992, + "grad_norm": 0.005069994367659092, + "learning_rate": 0.00014002367687855516, + "loss": 0.0017, + "num_input_tokens_seen": 206620240, + "step": 95660 + }, + { + "epoch": 15.606035889070148, + "grad_norm": 0.009124625474214554, + "learning_rate": 0.00013997428007886975, + "loss": 0.0048, + "num_input_tokens_seen": 206631056, + "step": 95665 + }, + { + "epoch": 15.606851549755302, + "grad_norm": 0.0022985092364251614, + "learning_rate": 0.00013992489057556223, + "loss": 0.0015, + "num_input_tokens_seen": 206642032, + "step": 95670 + }, + { + "epoch": 15.607667210440457, + "grad_norm": 0.002274824073538184, + "learning_rate": 0.00013987550836963358, + "loss": 0.0028, + "num_input_tokens_seen": 206652528, + "step": 95675 + }, + { + "epoch": 15.608482871125611, + "grad_norm": 0.1229882463812828, + "learning_rate": 0.0001398261334620846, + "loss": 0.0025, + "num_input_tokens_seen": 206662864, + "step": 95680 + }, + { + "epoch": 15.609298531810767, + "grad_norm": 0.0009678273927420378, + "learning_rate": 0.00013977676585391597, + "loss": 0.0022, + "num_input_tokens_seen": 206673840, + "step": 95685 + }, + { + "epoch": 15.61011419249592, + "grad_norm": 0.008544718846678734, + "learning_rate": 0.00013972740554612817, + "loss": 0.0042, + "num_input_tokens_seen": 206684240, + "step": 95690 + }, + { + "epoch": 15.610929853181077, + "grad_norm": 0.010948436334729195, + "learning_rate": 0.0001396780525397215, + "loss": 0.0606, + "num_input_tokens_seen": 206695440, + "step": 95695 + }, + { + "epoch": 15.611745513866232, + "grad_norm": 0.06881558895111084, + "learning_rate": 0.00013962870683569605, + "loss": 0.0048, + "num_input_tokens_seen": 206705840, + "step": 95700 + }, + { + "epoch": 15.612561174551386, + "grad_norm": 0.006645071320235729, + "learning_rate": 0.00013957936843505238, + "loss": 0.0306, + "num_input_tokens_seen": 206716176, + "step": 95705 + }, + { + "epoch": 15.613376835236542, + "grad_norm": 0.2631056606769562, + "learning_rate": 0.00013953003733878965, + "loss": 0.0179, + "num_input_tokens_seen": 206727376, + "step": 95710 + }, + { + "epoch": 15.614192495921696, + "grad_norm": 0.007076509762555361, + "learning_rate": 0.0001394807135479083, + "loss": 0.0041, + "num_input_tokens_seen": 206737648, + "step": 95715 + }, + { + "epoch": 15.615008156606851, + "grad_norm": 0.733444094657898, + "learning_rate": 0.0001394313970634074, + "loss": 0.1427, + "num_input_tokens_seen": 206748592, + "step": 95720 + }, + { + "epoch": 15.615823817292007, + "grad_norm": 0.0013660003896802664, + "learning_rate": 0.0001393820878862869, + "loss": 0.0773, + "num_input_tokens_seen": 206759120, + "step": 95725 + }, + { + "epoch": 15.616639477977161, + "grad_norm": 0.01996493898332119, + "learning_rate": 0.00013933278601754563, + "loss": 0.0134, + "num_input_tokens_seen": 206770896, + "step": 95730 + }, + { + "epoch": 15.617455138662317, + "grad_norm": 0.0015449258498847485, + "learning_rate": 0.00013928349145818326, + "loss": 0.0061, + "num_input_tokens_seen": 206782000, + "step": 95735 + }, + { + "epoch": 15.61827079934747, + "grad_norm": 0.014810550957918167, + "learning_rate": 0.00013923420420919823, + "loss": 0.0039, + "num_input_tokens_seen": 206793072, + "step": 95740 + }, + { + "epoch": 15.619086460032626, + "grad_norm": 0.0011037884978577495, + "learning_rate": 0.00013918492427159002, + "loss": 0.0278, + "num_input_tokens_seen": 206805168, + "step": 95745 + }, + { + "epoch": 15.619902120717782, + "grad_norm": 0.004637254402041435, + "learning_rate": 0.00013913565164635672, + "loss": 0.0012, + "num_input_tokens_seen": 206815440, + "step": 95750 + }, + { + "epoch": 15.620717781402936, + "grad_norm": 0.0011846721172332764, + "learning_rate": 0.00013908638633449756, + "loss": 0.0005, + "num_input_tokens_seen": 206825008, + "step": 95755 + }, + { + "epoch": 15.621533442088092, + "grad_norm": 0.0013002973282709718, + "learning_rate": 0.00013903712833701032, + "loss": 0.0044, + "num_input_tokens_seen": 206835536, + "step": 95760 + }, + { + "epoch": 15.622349102773246, + "grad_norm": 0.008994800969958305, + "learning_rate": 0.0001389878776548939, + "loss": 0.0023, + "num_input_tokens_seen": 206847344, + "step": 95765 + }, + { + "epoch": 15.623164763458401, + "grad_norm": 0.0006031625671312213, + "learning_rate": 0.00013893863428914583, + "loss": 0.0006, + "num_input_tokens_seen": 206858288, + "step": 95770 + }, + { + "epoch": 15.623980424143557, + "grad_norm": 0.10449585318565369, + "learning_rate": 0.00013888939824076464, + "loss": 0.0023, + "num_input_tokens_seen": 206870512, + "step": 95775 + }, + { + "epoch": 15.624796084828711, + "grad_norm": 0.010346177034080029, + "learning_rate": 0.00013884016951074758, + "loss": 0.0102, + "num_input_tokens_seen": 206882384, + "step": 95780 + }, + { + "epoch": 15.625611745513867, + "grad_norm": 0.0041154371574521065, + "learning_rate": 0.00013879094810009284, + "loss": 0.0018, + "num_input_tokens_seen": 206892784, + "step": 95785 + }, + { + "epoch": 15.62642740619902, + "grad_norm": 0.0015759262023493648, + "learning_rate": 0.00013874173400979772, + "loss": 0.0214, + "num_input_tokens_seen": 206904528, + "step": 95790 + }, + { + "epoch": 15.627243066884176, + "grad_norm": 0.01136757992208004, + "learning_rate": 0.00013869252724085974, + "loss": 0.0037, + "num_input_tokens_seen": 206916400, + "step": 95795 + }, + { + "epoch": 15.62805872756933, + "grad_norm": 0.1922260969877243, + "learning_rate": 0.00013864332779427597, + "loss": 0.007, + "num_input_tokens_seen": 206927440, + "step": 95800 + }, + { + "epoch": 15.628874388254486, + "grad_norm": 0.000996628892607987, + "learning_rate": 0.00013859413567104357, + "loss": 0.018, + "num_input_tokens_seen": 206938608, + "step": 95805 + }, + { + "epoch": 15.629690048939642, + "grad_norm": 0.00351770780980587, + "learning_rate": 0.00013854495087215951, + "loss": 0.0013, + "num_input_tokens_seen": 206949136, + "step": 95810 + }, + { + "epoch": 15.630505709624796, + "grad_norm": 0.0383528470993042, + "learning_rate": 0.00013849577339862057, + "loss": 0.0032, + "num_input_tokens_seen": 206958544, + "step": 95815 + }, + { + "epoch": 15.631321370309951, + "grad_norm": 0.06590034067630768, + "learning_rate": 0.00013844660325142334, + "loss": 0.0059, + "num_input_tokens_seen": 206969456, + "step": 95820 + }, + { + "epoch": 15.632137030995105, + "grad_norm": 0.0005166734335944057, + "learning_rate": 0.00013839744043156438, + "loss": 0.0008, + "num_input_tokens_seen": 206980176, + "step": 95825 + }, + { + "epoch": 15.632952691680261, + "grad_norm": 0.004043356981128454, + "learning_rate": 0.00013834828494004004, + "loss": 0.0086, + "num_input_tokens_seen": 206991184, + "step": 95830 + }, + { + "epoch": 15.633768352365417, + "grad_norm": 0.006381909362971783, + "learning_rate": 0.0001382991367778465, + "loss": 0.0057, + "num_input_tokens_seen": 207001360, + "step": 95835 + }, + { + "epoch": 15.63458401305057, + "grad_norm": 0.0019231840269640088, + "learning_rate": 0.00013824999594597975, + "loss": 0.0011, + "num_input_tokens_seen": 207010736, + "step": 95840 + }, + { + "epoch": 15.635399673735726, + "grad_norm": 0.00047763565089553595, + "learning_rate": 0.00013820086244543562, + "loss": 0.1042, + "num_input_tokens_seen": 207021840, + "step": 95845 + }, + { + "epoch": 15.63621533442088, + "grad_norm": 0.0834665521979332, + "learning_rate": 0.00013815173627721027, + "loss": 0.0058, + "num_input_tokens_seen": 207032848, + "step": 95850 + }, + { + "epoch": 15.637030995106036, + "grad_norm": 0.09207990765571594, + "learning_rate": 0.00013810261744229873, + "loss": 0.0052, + "num_input_tokens_seen": 207044336, + "step": 95855 + }, + { + "epoch": 15.63784665579119, + "grad_norm": 0.008700598031282425, + "learning_rate": 0.00013805350594169708, + "loss": 0.0021, + "num_input_tokens_seen": 207055376, + "step": 95860 + }, + { + "epoch": 15.638662316476346, + "grad_norm": 0.03562851995229721, + "learning_rate": 0.0001380044017764, + "loss": 0.0079, + "num_input_tokens_seen": 207065904, + "step": 95865 + }, + { + "epoch": 15.639477977161501, + "grad_norm": 0.00032584878499619663, + "learning_rate": 0.0001379553049474032, + "loss": 0.03, + "num_input_tokens_seen": 207076752, + "step": 95870 + }, + { + "epoch": 15.640293637846655, + "grad_norm": 0.025242719799280167, + "learning_rate": 0.00013790621545570114, + "loss": 0.004, + "num_input_tokens_seen": 207088016, + "step": 95875 + }, + { + "epoch": 15.641109298531811, + "grad_norm": 0.0008892813930287957, + "learning_rate": 0.00013785713330228928, + "loss": 0.0025, + "num_input_tokens_seen": 207098160, + "step": 95880 + }, + { + "epoch": 15.641924959216965, + "grad_norm": 0.030518537387251854, + "learning_rate": 0.00013780805848816175, + "loss": 0.0026, + "num_input_tokens_seen": 207108176, + "step": 95885 + }, + { + "epoch": 15.64274061990212, + "grad_norm": 0.016238387674093246, + "learning_rate": 0.0001377589910143135, + "loss": 0.008, + "num_input_tokens_seen": 207119440, + "step": 95890 + }, + { + "epoch": 15.643556280587276, + "grad_norm": 0.0018854053923860192, + "learning_rate": 0.00013770993088173884, + "loss": 0.0014, + "num_input_tokens_seen": 207129168, + "step": 95895 + }, + { + "epoch": 15.64437194127243, + "grad_norm": 0.00233276654034853, + "learning_rate": 0.000137660878091432, + "loss": 0.0017, + "num_input_tokens_seen": 207139600, + "step": 95900 + }, + { + "epoch": 15.645187601957586, + "grad_norm": 0.09753572940826416, + "learning_rate": 0.0001376118326443872, + "loss": 0.002, + "num_input_tokens_seen": 207150896, + "step": 95905 + }, + { + "epoch": 15.64600326264274, + "grad_norm": 0.1661272794008255, + "learning_rate": 0.00013756279454159827, + "loss": 0.0099, + "num_input_tokens_seen": 207160592, + "step": 95910 + }, + { + "epoch": 15.646818923327896, + "grad_norm": 0.000456740875961259, + "learning_rate": 0.0001375137637840591, + "loss": 0.0123, + "num_input_tokens_seen": 207171856, + "step": 95915 + }, + { + "epoch": 15.647634584013051, + "grad_norm": 0.006311553064733744, + "learning_rate": 0.00013746474037276335, + "loss": 0.0014, + "num_input_tokens_seen": 207183728, + "step": 95920 + }, + { + "epoch": 15.648450244698205, + "grad_norm": 0.0018103966722264886, + "learning_rate": 0.0001374157243087046, + "loss": 0.0022, + "num_input_tokens_seen": 207192976, + "step": 95925 + }, + { + "epoch": 15.649265905383361, + "grad_norm": 0.0052458480931818485, + "learning_rate": 0.00013736671559287612, + "loss": 0.0221, + "num_input_tokens_seen": 207204560, + "step": 95930 + }, + { + "epoch": 15.650081566068515, + "grad_norm": 0.35821104049682617, + "learning_rate": 0.0001373177142262712, + "loss": 0.0182, + "num_input_tokens_seen": 207215824, + "step": 95935 + }, + { + "epoch": 15.65089722675367, + "grad_norm": 0.014001819305121899, + "learning_rate": 0.0001372687202098829, + "loss": 0.0025, + "num_input_tokens_seen": 207226960, + "step": 95940 + }, + { + "epoch": 15.651712887438826, + "grad_norm": 0.0030215075239539146, + "learning_rate": 0.00013721973354470412, + "loss": 0.0032, + "num_input_tokens_seen": 207237552, + "step": 95945 + }, + { + "epoch": 15.65252854812398, + "grad_norm": 0.002811993472278118, + "learning_rate": 0.00013717075423172765, + "loss": 0.001, + "num_input_tokens_seen": 207249424, + "step": 95950 + }, + { + "epoch": 15.653344208809136, + "grad_norm": 0.04759371280670166, + "learning_rate": 0.00013712178227194617, + "loss": 0.0054, + "num_input_tokens_seen": 207259600, + "step": 95955 + }, + { + "epoch": 15.65415986949429, + "grad_norm": 0.0038091284222900867, + "learning_rate": 0.00013707281766635204, + "loss": 0.0682, + "num_input_tokens_seen": 207270448, + "step": 95960 + }, + { + "epoch": 15.654975530179446, + "grad_norm": 0.04629696160554886, + "learning_rate": 0.00013702386041593772, + "loss": 0.0065, + "num_input_tokens_seen": 207281424, + "step": 95965 + }, + { + "epoch": 15.655791190864601, + "grad_norm": 0.006149706896394491, + "learning_rate": 0.00013697491052169536, + "loss": 0.0024, + "num_input_tokens_seen": 207292976, + "step": 95970 + }, + { + "epoch": 15.656606851549755, + "grad_norm": 0.01826256327331066, + "learning_rate": 0.00013692596798461692, + "loss": 0.0016, + "num_input_tokens_seen": 207304240, + "step": 95975 + }, + { + "epoch": 15.65742251223491, + "grad_norm": 0.002373218536376953, + "learning_rate": 0.00013687703280569437, + "loss": 0.1386, + "num_input_tokens_seen": 207314896, + "step": 95980 + }, + { + "epoch": 15.658238172920065, + "grad_norm": 0.001200975151732564, + "learning_rate": 0.0001368281049859194, + "loss": 0.0011, + "num_input_tokens_seen": 207326512, + "step": 95985 + }, + { + "epoch": 15.65905383360522, + "grad_norm": 0.046008266508579254, + "learning_rate": 0.0001367791845262834, + "loss": 0.0039, + "num_input_tokens_seen": 207336944, + "step": 95990 + }, + { + "epoch": 15.659869494290374, + "grad_norm": 0.0028325358871370554, + "learning_rate": 0.0001367302714277784, + "loss": 0.0027, + "num_input_tokens_seen": 207348560, + "step": 95995 + }, + { + "epoch": 15.66068515497553, + "grad_norm": 0.0385119691491127, + "learning_rate": 0.00013668136569139488, + "loss": 0.0065, + "num_input_tokens_seen": 207359440, + "step": 96000 + }, + { + "epoch": 15.661500815660686, + "grad_norm": 0.0017604627646505833, + "learning_rate": 0.00013663246731812463, + "loss": 0.0062, + "num_input_tokens_seen": 207370768, + "step": 96005 + }, + { + "epoch": 15.66231647634584, + "grad_norm": 0.0032870511058717966, + "learning_rate": 0.00013658357630895834, + "loss": 0.0018, + "num_input_tokens_seen": 207382768, + "step": 96010 + }, + { + "epoch": 15.663132137030995, + "grad_norm": 0.0009842633735388517, + "learning_rate": 0.00013653469266488688, + "loss": 0.0025, + "num_input_tokens_seen": 207392976, + "step": 96015 + }, + { + "epoch": 15.66394779771615, + "grad_norm": 0.023544909432530403, + "learning_rate": 0.000136485816386901, + "loss": 0.0975, + "num_input_tokens_seen": 207404592, + "step": 96020 + }, + { + "epoch": 15.664763458401305, + "grad_norm": 0.0010194077622145414, + "learning_rate": 0.00013643694747599123, + "loss": 0.0015, + "num_input_tokens_seen": 207415408, + "step": 96025 + }, + { + "epoch": 15.66557911908646, + "grad_norm": 0.0012003988958895206, + "learning_rate": 0.0001363880859331479, + "loss": 0.0907, + "num_input_tokens_seen": 207425232, + "step": 96030 + }, + { + "epoch": 15.666394779771615, + "grad_norm": 0.001043745898641646, + "learning_rate": 0.00013633923175936124, + "loss": 0.0779, + "num_input_tokens_seen": 207437584, + "step": 96035 + }, + { + "epoch": 15.66721044045677, + "grad_norm": 0.025039061903953552, + "learning_rate": 0.00013629038495562145, + "loss": 0.1374, + "num_input_tokens_seen": 207448816, + "step": 96040 + }, + { + "epoch": 15.668026101141924, + "grad_norm": 0.0040556760504841805, + "learning_rate": 0.00013624154552291834, + "loss": 0.0025, + "num_input_tokens_seen": 207458960, + "step": 96045 + }, + { + "epoch": 15.66884176182708, + "grad_norm": 0.00263931299559772, + "learning_rate": 0.00013619271346224183, + "loss": 0.1143, + "num_input_tokens_seen": 207468656, + "step": 96050 + }, + { + "epoch": 15.669657422512234, + "grad_norm": 0.015455449931323528, + "learning_rate": 0.0001361438887745815, + "loss": 0.0032, + "num_input_tokens_seen": 207479408, + "step": 96055 + }, + { + "epoch": 15.67047308319739, + "grad_norm": 0.03252527117729187, + "learning_rate": 0.0001360950714609268, + "loss": 0.004, + "num_input_tokens_seen": 207489968, + "step": 96060 + }, + { + "epoch": 15.671288743882545, + "grad_norm": 0.003963053692132235, + "learning_rate": 0.00013604626152226719, + "loss": 0.0027, + "num_input_tokens_seen": 207502000, + "step": 96065 + }, + { + "epoch": 15.6721044045677, + "grad_norm": 0.008263356983661652, + "learning_rate": 0.00013599745895959175, + "loss": 0.0605, + "num_input_tokens_seen": 207512144, + "step": 96070 + }, + { + "epoch": 15.672920065252855, + "grad_norm": 0.45873066782951355, + "learning_rate": 0.00013594866377388958, + "loss": 0.0082, + "num_input_tokens_seen": 207523248, + "step": 96075 + }, + { + "epoch": 15.673735725938009, + "grad_norm": 0.02381652407348156, + "learning_rate": 0.0001358998759661496, + "loss": 0.0024, + "num_input_tokens_seen": 207532880, + "step": 96080 + }, + { + "epoch": 15.674551386623165, + "grad_norm": 0.020507873967289925, + "learning_rate": 0.00013585109553736053, + "loss": 0.0036, + "num_input_tokens_seen": 207543696, + "step": 96085 + }, + { + "epoch": 15.67536704730832, + "grad_norm": 0.03291149064898491, + "learning_rate": 0.00013580232248851094, + "loss": 0.0121, + "num_input_tokens_seen": 207554736, + "step": 96090 + }, + { + "epoch": 15.676182707993474, + "grad_norm": 0.011057699099183083, + "learning_rate": 0.00013575355682058932, + "loss": 0.0088, + "num_input_tokens_seen": 207565808, + "step": 96095 + }, + { + "epoch": 15.67699836867863, + "grad_norm": 0.01052065659314394, + "learning_rate": 0.0001357047985345839, + "loss": 0.0029, + "num_input_tokens_seen": 207575120, + "step": 96100 + }, + { + "epoch": 15.677814029363784, + "grad_norm": 0.0016519392374902964, + "learning_rate": 0.00013565604763148294, + "loss": 0.0057, + "num_input_tokens_seen": 207586480, + "step": 96105 + }, + { + "epoch": 15.67862969004894, + "grad_norm": 0.0008954678778536618, + "learning_rate": 0.00013560730411227417, + "loss": 0.0528, + "num_input_tokens_seen": 207596656, + "step": 96110 + }, + { + "epoch": 15.679445350734095, + "grad_norm": 0.010876849293708801, + "learning_rate": 0.000135558567977946, + "loss": 0.052, + "num_input_tokens_seen": 207607728, + "step": 96115 + }, + { + "epoch": 15.68026101141925, + "grad_norm": 0.007376998197287321, + "learning_rate": 0.00013550983922948546, + "loss": 0.0026, + "num_input_tokens_seen": 207617552, + "step": 96120 + }, + { + "epoch": 15.681076672104405, + "grad_norm": 0.5630950927734375, + "learning_rate": 0.00013546111786788073, + "loss": 0.0629, + "num_input_tokens_seen": 207628080, + "step": 96125 + }, + { + "epoch": 15.681892332789559, + "grad_norm": 0.005674498621374369, + "learning_rate": 0.00013541240389411857, + "loss": 0.0199, + "num_input_tokens_seen": 207639696, + "step": 96130 + }, + { + "epoch": 15.682707993474715, + "grad_norm": 0.002528025070205331, + "learning_rate": 0.00013536369730918668, + "loss": 0.0063, + "num_input_tokens_seen": 207649936, + "step": 96135 + }, + { + "epoch": 15.68352365415987, + "grad_norm": 0.024930205196142197, + "learning_rate": 0.00013531499811407212, + "loss": 0.0043, + "num_input_tokens_seen": 207660400, + "step": 96140 + }, + { + "epoch": 15.684339314845024, + "grad_norm": 0.0038991905748844147, + "learning_rate": 0.00013526630630976172, + "loss": 0.0043, + "num_input_tokens_seen": 207670608, + "step": 96145 + }, + { + "epoch": 15.68515497553018, + "grad_norm": 0.00235603260807693, + "learning_rate": 0.00013521762189724228, + "loss": 0.0058, + "num_input_tokens_seen": 207681584, + "step": 96150 + }, + { + "epoch": 15.685970636215334, + "grad_norm": 0.763839840888977, + "learning_rate": 0.00013516894487750053, + "loss": 0.0579, + "num_input_tokens_seen": 207692368, + "step": 96155 + }, + { + "epoch": 15.68678629690049, + "grad_norm": 0.003856255440041423, + "learning_rate": 0.00013512027525152293, + "loss": 0.002, + "num_input_tokens_seen": 207703408, + "step": 96160 + }, + { + "epoch": 15.687601957585644, + "grad_norm": 0.00043420374277047813, + "learning_rate": 0.00013507161302029586, + "loss": 0.0019, + "num_input_tokens_seen": 207714480, + "step": 96165 + }, + { + "epoch": 15.6884176182708, + "grad_norm": 0.02960328944027424, + "learning_rate": 0.00013502295818480548, + "loss": 0.0039, + "num_input_tokens_seen": 207724656, + "step": 96170 + }, + { + "epoch": 15.689233278955955, + "grad_norm": 0.0026452350430190563, + "learning_rate": 0.00013497431074603784, + "loss": 0.0284, + "num_input_tokens_seen": 207735984, + "step": 96175 + }, + { + "epoch": 15.690048939641109, + "grad_norm": 0.0021662067156285048, + "learning_rate": 0.00013492567070497885, + "loss": 0.0051, + "num_input_tokens_seen": 207746864, + "step": 96180 + }, + { + "epoch": 15.690864600326265, + "grad_norm": 0.013586881570518017, + "learning_rate": 0.0001348770380626143, + "loss": 0.0079, + "num_input_tokens_seen": 207757520, + "step": 96185 + }, + { + "epoch": 15.691680261011419, + "grad_norm": 0.39000317454338074, + "learning_rate": 0.00013482841281992975, + "loss": 0.1045, + "num_input_tokens_seen": 207769552, + "step": 96190 + }, + { + "epoch": 15.692495921696574, + "grad_norm": 0.02108220010995865, + "learning_rate": 0.00013477979497791064, + "loss": 0.0034, + "num_input_tokens_seen": 207781168, + "step": 96195 + }, + { + "epoch": 15.69331158238173, + "grad_norm": 0.001122990041039884, + "learning_rate": 0.00013473118453754236, + "loss": 0.0019, + "num_input_tokens_seen": 207792368, + "step": 96200 + }, + { + "epoch": 15.694127243066884, + "grad_norm": 0.004716485273092985, + "learning_rate": 0.00013468258149981, + "loss": 0.0148, + "num_input_tokens_seen": 207802896, + "step": 96205 + }, + { + "epoch": 15.69494290375204, + "grad_norm": 0.0714481994509697, + "learning_rate": 0.00013463398586569854, + "loss": 0.0023, + "num_input_tokens_seen": 207813552, + "step": 96210 + }, + { + "epoch": 15.695758564437194, + "grad_norm": 0.002846804680302739, + "learning_rate": 0.00013458539763619272, + "loss": 0.0113, + "num_input_tokens_seen": 207822768, + "step": 96215 + }, + { + "epoch": 15.69657422512235, + "grad_norm": 0.05376075953245163, + "learning_rate": 0.00013453681681227763, + "loss": 0.0125, + "num_input_tokens_seen": 207833008, + "step": 96220 + }, + { + "epoch": 15.697389885807503, + "grad_norm": 0.004778926260769367, + "learning_rate": 0.0001344882433949373, + "loss": 0.0266, + "num_input_tokens_seen": 207843920, + "step": 96225 + }, + { + "epoch": 15.698205546492659, + "grad_norm": 0.005003618076443672, + "learning_rate": 0.00013443967738515673, + "loss": 0.0039, + "num_input_tokens_seen": 207853840, + "step": 96230 + }, + { + "epoch": 15.699021207177815, + "grad_norm": 0.0037451880052685738, + "learning_rate": 0.00013439111878391953, + "loss": 0.0016, + "num_input_tokens_seen": 207865104, + "step": 96235 + }, + { + "epoch": 15.699836867862969, + "grad_norm": 0.5726847648620605, + "learning_rate": 0.00013434256759221037, + "loss": 0.066, + "num_input_tokens_seen": 207874224, + "step": 96240 + }, + { + "epoch": 15.700652528548124, + "grad_norm": 0.03150755539536476, + "learning_rate": 0.00013429402381101268, + "loss": 0.0015, + "num_input_tokens_seen": 207884752, + "step": 96245 + }, + { + "epoch": 15.701468189233278, + "grad_norm": 0.0010075062746182084, + "learning_rate": 0.00013424548744131088, + "loss": 0.001, + "num_input_tokens_seen": 207894448, + "step": 96250 + }, + { + "epoch": 15.702283849918434, + "grad_norm": 0.782049298286438, + "learning_rate": 0.00013419695848408792, + "loss": 0.0916, + "num_input_tokens_seen": 207906608, + "step": 96255 + }, + { + "epoch": 15.70309951060359, + "grad_norm": 0.026163609698414803, + "learning_rate": 0.00013414843694032792, + "loss": 0.0035, + "num_input_tokens_seen": 207918608, + "step": 96260 + }, + { + "epoch": 15.703915171288743, + "grad_norm": 0.004696679767221212, + "learning_rate": 0.00013409992281101368, + "loss": 0.0031, + "num_input_tokens_seen": 207930064, + "step": 96265 + }, + { + "epoch": 15.7047308319739, + "grad_norm": 0.0025974763557314873, + "learning_rate": 0.000134051416097129, + "loss": 0.0085, + "num_input_tokens_seen": 207942160, + "step": 96270 + }, + { + "epoch": 15.705546492659053, + "grad_norm": 0.039734333753585815, + "learning_rate": 0.00013400291679965633, + "loss": 0.0052, + "num_input_tokens_seen": 207951312, + "step": 96275 + }, + { + "epoch": 15.706362153344209, + "grad_norm": 0.057665541768074036, + "learning_rate": 0.000133954424919579, + "loss": 0.083, + "num_input_tokens_seen": 207962768, + "step": 96280 + }, + { + "epoch": 15.707177814029365, + "grad_norm": 0.23219189047813416, + "learning_rate": 0.00013390594045787957, + "loss": 0.0067, + "num_input_tokens_seen": 207973936, + "step": 96285 + }, + { + "epoch": 15.707993474714518, + "grad_norm": 0.022502528503537178, + "learning_rate": 0.00013385746341554067, + "loss": 0.0067, + "num_input_tokens_seen": 207984688, + "step": 96290 + }, + { + "epoch": 15.708809135399674, + "grad_norm": 0.1352599561214447, + "learning_rate": 0.0001338089937935448, + "loss": 0.0849, + "num_input_tokens_seen": 207996208, + "step": 96295 + }, + { + "epoch": 15.709624796084828, + "grad_norm": 0.19445081055164337, + "learning_rate": 0.0001337605315928742, + "loss": 0.0096, + "num_input_tokens_seen": 208007568, + "step": 96300 + }, + { + "epoch": 15.710440456769984, + "grad_norm": 0.4363537132740021, + "learning_rate": 0.00013371207681451102, + "loss": 0.0164, + "num_input_tokens_seen": 208016912, + "step": 96305 + }, + { + "epoch": 15.71125611745514, + "grad_norm": 0.001682588830590248, + "learning_rate": 0.00013366362945943733, + "loss": 0.0035, + "num_input_tokens_seen": 208026960, + "step": 96310 + }, + { + "epoch": 15.712071778140293, + "grad_norm": 0.001164038316346705, + "learning_rate": 0.00013361518952863488, + "loss": 0.0029, + "num_input_tokens_seen": 208037104, + "step": 96315 + }, + { + "epoch": 15.71288743882545, + "grad_norm": 0.0353374183177948, + "learning_rate": 0.00013356675702308541, + "loss": 0.088, + "num_input_tokens_seen": 208047632, + "step": 96320 + }, + { + "epoch": 15.713703099510603, + "grad_norm": 0.08301739394664764, + "learning_rate": 0.00013351833194377044, + "loss": 0.0057, + "num_input_tokens_seen": 208057744, + "step": 96325 + }, + { + "epoch": 15.714518760195759, + "grad_norm": 0.004812445491552353, + "learning_rate": 0.00013346991429167128, + "loss": 0.0105, + "num_input_tokens_seen": 208067632, + "step": 96330 + }, + { + "epoch": 15.715334420880914, + "grad_norm": 0.0030532728414982557, + "learning_rate": 0.00013342150406776953, + "loss": 0.0054, + "num_input_tokens_seen": 208078192, + "step": 96335 + }, + { + "epoch": 15.716150081566068, + "grad_norm": 0.00913258083164692, + "learning_rate": 0.00013337310127304575, + "loss": 0.0039, + "num_input_tokens_seen": 208088912, + "step": 96340 + }, + { + "epoch": 15.716965742251224, + "grad_norm": 0.0003174035809934139, + "learning_rate": 0.0001333247059084815, + "loss": 0.0012, + "num_input_tokens_seen": 208100464, + "step": 96345 + }, + { + "epoch": 15.717781402936378, + "grad_norm": 0.07068848609924316, + "learning_rate": 0.00013327631797505697, + "loss": 0.0048, + "num_input_tokens_seen": 208110704, + "step": 96350 + }, + { + "epoch": 15.718597063621534, + "grad_norm": 0.0047727604396641254, + "learning_rate": 0.00013322793747375333, + "loss": 0.0019, + "num_input_tokens_seen": 208122608, + "step": 96355 + }, + { + "epoch": 15.719412724306688, + "grad_norm": 0.0011388602433726192, + "learning_rate": 0.00013317956440555051, + "loss": 0.0012, + "num_input_tokens_seen": 208133648, + "step": 96360 + }, + { + "epoch": 15.720228384991843, + "grad_norm": 0.00047913682647049427, + "learning_rate": 0.00013313119877142947, + "loss": 0.003, + "num_input_tokens_seen": 208144752, + "step": 96365 + }, + { + "epoch": 15.721044045676999, + "grad_norm": 0.0011415015906095505, + "learning_rate": 0.00013308284057236984, + "loss": 0.0037, + "num_input_tokens_seen": 208154384, + "step": 96370 + }, + { + "epoch": 15.721859706362153, + "grad_norm": 0.2590673565864563, + "learning_rate": 0.00013303448980935218, + "loss": 0.0078, + "num_input_tokens_seen": 208165296, + "step": 96375 + }, + { + "epoch": 15.722675367047309, + "grad_norm": 0.025836044922471046, + "learning_rate": 0.00013298614648335583, + "loss": 0.0028, + "num_input_tokens_seen": 208176112, + "step": 96380 + }, + { + "epoch": 15.723491027732463, + "grad_norm": 0.021637318655848503, + "learning_rate": 0.0001329378105953611, + "loss": 0.0036, + "num_input_tokens_seen": 208185872, + "step": 96385 + }, + { + "epoch": 15.724306688417618, + "grad_norm": 0.019314365461468697, + "learning_rate": 0.00013288948214634698, + "loss": 0.004, + "num_input_tokens_seen": 208196976, + "step": 96390 + }, + { + "epoch": 15.725122349102774, + "grad_norm": 0.007318970747292042, + "learning_rate": 0.00013284116113729356, + "loss": 0.0037, + "num_input_tokens_seen": 208208336, + "step": 96395 + }, + { + "epoch": 15.725938009787928, + "grad_norm": 0.03209533914923668, + "learning_rate": 0.00013279284756917943, + "loss": 0.0022, + "num_input_tokens_seen": 208219856, + "step": 96400 + }, + { + "epoch": 15.726753670473084, + "grad_norm": 0.0057929218746721745, + "learning_rate": 0.00013274454144298438, + "loss": 0.003, + "num_input_tokens_seen": 208229808, + "step": 96405 + }, + { + "epoch": 15.727569331158238, + "grad_norm": 0.03650674223899841, + "learning_rate": 0.00013269624275968683, + "loss": 0.0682, + "num_input_tokens_seen": 208239568, + "step": 96410 + }, + { + "epoch": 15.728384991843393, + "grad_norm": 0.0009229037095792592, + "learning_rate": 0.00013264795152026615, + "loss": 0.0198, + "num_input_tokens_seen": 208250448, + "step": 96415 + }, + { + "epoch": 15.729200652528547, + "grad_norm": 0.1574191153049469, + "learning_rate": 0.00013259966772570048, + "loss": 0.0061, + "num_input_tokens_seen": 208261840, + "step": 96420 + }, + { + "epoch": 15.730016313213703, + "grad_norm": 0.026466218754649162, + "learning_rate": 0.00013255139137696874, + "loss": 0.0015, + "num_input_tokens_seen": 208272752, + "step": 96425 + }, + { + "epoch": 15.730831973898859, + "grad_norm": 0.0026206725742667913, + "learning_rate": 0.0001325031224750492, + "loss": 0.0023, + "num_input_tokens_seen": 208282800, + "step": 96430 + }, + { + "epoch": 15.731647634584013, + "grad_norm": 0.4331301748752594, + "learning_rate": 0.0001324548610209201, + "loss": 0.0106, + "num_input_tokens_seen": 208293072, + "step": 96435 + }, + { + "epoch": 15.732463295269168, + "grad_norm": 0.09264283627271652, + "learning_rate": 0.00013240660701555951, + "loss": 0.0031, + "num_input_tokens_seen": 208304560, + "step": 96440 + }, + { + "epoch": 15.733278955954322, + "grad_norm": 0.0013159823138266802, + "learning_rate": 0.00013235836045994532, + "loss": 0.0085, + "num_input_tokens_seen": 208316272, + "step": 96445 + }, + { + "epoch": 15.734094616639478, + "grad_norm": 0.012794621288776398, + "learning_rate": 0.00013231012135505538, + "loss": 0.004, + "num_input_tokens_seen": 208327184, + "step": 96450 + }, + { + "epoch": 15.734910277324634, + "grad_norm": 0.00966342631727457, + "learning_rate": 0.00013226188970186725, + "loss": 0.0041, + "num_input_tokens_seen": 208338736, + "step": 96455 + }, + { + "epoch": 15.735725938009788, + "grad_norm": 0.002653430448845029, + "learning_rate": 0.0001322136655013585, + "loss": 0.0048, + "num_input_tokens_seen": 208348272, + "step": 96460 + }, + { + "epoch": 15.736541598694943, + "grad_norm": 0.13813112676143646, + "learning_rate": 0.00013216544875450633, + "loss": 0.0456, + "num_input_tokens_seen": 208358672, + "step": 96465 + }, + { + "epoch": 15.737357259380097, + "grad_norm": 0.005092136561870575, + "learning_rate": 0.00013211723946228798, + "loss": 0.0037, + "num_input_tokens_seen": 208369264, + "step": 96470 + }, + { + "epoch": 15.738172920065253, + "grad_norm": 0.008314227685332298, + "learning_rate": 0.00013206903762568028, + "loss": 0.0011, + "num_input_tokens_seen": 208380048, + "step": 96475 + }, + { + "epoch": 15.738988580750409, + "grad_norm": 0.013196723535656929, + "learning_rate": 0.00013202084324566066, + "loss": 0.0043, + "num_input_tokens_seen": 208391024, + "step": 96480 + }, + { + "epoch": 15.739804241435563, + "grad_norm": 0.211594820022583, + "learning_rate": 0.0001319726563232051, + "loss": 0.0277, + "num_input_tokens_seen": 208401776, + "step": 96485 + }, + { + "epoch": 15.740619902120718, + "grad_norm": 0.005501213483512402, + "learning_rate": 0.00013192447685929088, + "loss": 0.0018, + "num_input_tokens_seen": 208413776, + "step": 96490 + }, + { + "epoch": 15.741435562805872, + "grad_norm": 0.0033411455806344748, + "learning_rate": 0.00013187630485489378, + "loss": 0.0014, + "num_input_tokens_seen": 208424112, + "step": 96495 + }, + { + "epoch": 15.742251223491028, + "grad_norm": 0.000817911874037236, + "learning_rate": 0.0001318281403109906, + "loss": 0.0006, + "num_input_tokens_seen": 208435152, + "step": 96500 + }, + { + "epoch": 15.743066884176184, + "grad_norm": 0.00817803479731083, + "learning_rate": 0.00013177998322855695, + "loss": 0.0022, + "num_input_tokens_seen": 208445552, + "step": 96505 + }, + { + "epoch": 15.743882544861338, + "grad_norm": 0.02054639346897602, + "learning_rate": 0.00013173183360856938, + "loss": 0.0016, + "num_input_tokens_seen": 208455536, + "step": 96510 + }, + { + "epoch": 15.744698205546493, + "grad_norm": 0.0038372152484953403, + "learning_rate": 0.00013168369145200303, + "loss": 0.0017, + "num_input_tokens_seen": 208466224, + "step": 96515 + }, + { + "epoch": 15.745513866231647, + "grad_norm": 0.040325090289115906, + "learning_rate": 0.0001316355567598343, + "loss": 0.0022, + "num_input_tokens_seen": 208476752, + "step": 96520 + }, + { + "epoch": 15.746329526916803, + "grad_norm": 0.0026046517305076122, + "learning_rate": 0.00013158742953303792, + "loss": 0.0162, + "num_input_tokens_seen": 208487280, + "step": 96525 + }, + { + "epoch": 15.747145187601957, + "grad_norm": 0.0007771203527227044, + "learning_rate": 0.00013153930977258987, + "loss": 0.0007, + "num_input_tokens_seen": 208497776, + "step": 96530 + }, + { + "epoch": 15.747960848287113, + "grad_norm": 0.00024037643743213266, + "learning_rate": 0.0001314911974794651, + "loss": 0.002, + "num_input_tokens_seen": 208509168, + "step": 96535 + }, + { + "epoch": 15.748776508972268, + "grad_norm": 0.2253538817167282, + "learning_rate": 0.00013144309265463873, + "loss": 0.0125, + "num_input_tokens_seen": 208519856, + "step": 96540 + }, + { + "epoch": 15.749592169657422, + "grad_norm": 0.0013409418752416968, + "learning_rate": 0.00013139499529908562, + "loss": 0.0541, + "num_input_tokens_seen": 208530704, + "step": 96545 + }, + { + "epoch": 15.750407830342578, + "grad_norm": 0.00018763738626148552, + "learning_rate": 0.00013134690541378053, + "loss": 0.0009, + "num_input_tokens_seen": 208541968, + "step": 96550 + }, + { + "epoch": 15.751223491027732, + "grad_norm": 0.0010253424989059567, + "learning_rate": 0.00013129882299969803, + "loss": 0.0012, + "num_input_tokens_seen": 208553648, + "step": 96555 + }, + { + "epoch": 15.752039151712887, + "grad_norm": 0.0036719876807183027, + "learning_rate": 0.00013125074805781268, + "loss": 0.0015, + "num_input_tokens_seen": 208564560, + "step": 96560 + }, + { + "epoch": 15.752854812398043, + "grad_norm": 0.001249134773388505, + "learning_rate": 0.0001312026805890987, + "loss": 0.0026, + "num_input_tokens_seen": 208574832, + "step": 96565 + }, + { + "epoch": 15.753670473083197, + "grad_norm": 0.005163257010281086, + "learning_rate": 0.00013115462059453022, + "loss": 0.0013, + "num_input_tokens_seen": 208584720, + "step": 96570 + }, + { + "epoch": 15.754486133768353, + "grad_norm": 0.0015362701378762722, + "learning_rate": 0.00013110656807508125, + "loss": 0.0016, + "num_input_tokens_seen": 208595696, + "step": 96575 + }, + { + "epoch": 15.755301794453507, + "grad_norm": 0.07348399609327316, + "learning_rate": 0.0001310585230317257, + "loss": 0.0024, + "num_input_tokens_seen": 208606352, + "step": 96580 + }, + { + "epoch": 15.756117455138662, + "grad_norm": 0.0003318371600471437, + "learning_rate": 0.0001310104854654372, + "loss": 0.0034, + "num_input_tokens_seen": 208615664, + "step": 96585 + }, + { + "epoch": 15.756933115823816, + "grad_norm": 0.01745942048728466, + "learning_rate": 0.0001309624553771893, + "loss": 0.0049, + "num_input_tokens_seen": 208626960, + "step": 96590 + }, + { + "epoch": 15.757748776508972, + "grad_norm": 0.010197905823588371, + "learning_rate": 0.00013091443276795544, + "loss": 0.0022, + "num_input_tokens_seen": 208637232, + "step": 96595 + }, + { + "epoch": 15.758564437194128, + "grad_norm": 0.0012122580083087087, + "learning_rate": 0.00013086641763870876, + "loss": 0.0067, + "num_input_tokens_seen": 208648048, + "step": 96600 + }, + { + "epoch": 15.759380097879282, + "grad_norm": 0.0027509070932865143, + "learning_rate": 0.00013081840999042244, + "loss": 0.0024, + "num_input_tokens_seen": 208658960, + "step": 96605 + }, + { + "epoch": 15.760195758564437, + "grad_norm": 0.002174974186345935, + "learning_rate": 0.0001307704098240694, + "loss": 0.015, + "num_input_tokens_seen": 208670064, + "step": 96610 + }, + { + "epoch": 15.761011419249591, + "grad_norm": 0.0011658243602141738, + "learning_rate": 0.0001307224171406224, + "loss": 0.0085, + "num_input_tokens_seen": 208681904, + "step": 96615 + }, + { + "epoch": 15.761827079934747, + "grad_norm": 0.0030715486500412226, + "learning_rate": 0.0001306744319410539, + "loss": 0.014, + "num_input_tokens_seen": 208693168, + "step": 96620 + }, + { + "epoch": 15.762642740619903, + "grad_norm": 0.11549370735883713, + "learning_rate": 0.00013062645422633683, + "loss": 0.0028, + "num_input_tokens_seen": 208703344, + "step": 96625 + }, + { + "epoch": 15.763458401305057, + "grad_norm": 0.0026244190521538258, + "learning_rate": 0.000130578483997443, + "loss": 0.0006, + "num_input_tokens_seen": 208712176, + "step": 96630 + }, + { + "epoch": 15.764274061990212, + "grad_norm": 0.0077323331497609615, + "learning_rate": 0.00013053052125534497, + "loss": 0.0013, + "num_input_tokens_seen": 208723344, + "step": 96635 + }, + { + "epoch": 15.765089722675366, + "grad_norm": 0.0006282702088356018, + "learning_rate": 0.00013048256600101465, + "loss": 0.002, + "num_input_tokens_seen": 208732624, + "step": 96640 + }, + { + "epoch": 15.765905383360522, + "grad_norm": 0.0021541621536016464, + "learning_rate": 0.00013043461823542387, + "loss": 0.0047, + "num_input_tokens_seen": 208744432, + "step": 96645 + }, + { + "epoch": 15.766721044045678, + "grad_norm": 0.0005122669972479343, + "learning_rate": 0.0001303866779595444, + "loss": 0.0022, + "num_input_tokens_seen": 208754864, + "step": 96650 + }, + { + "epoch": 15.767536704730832, + "grad_norm": 0.006448698695749044, + "learning_rate": 0.0001303387451743478, + "loss": 0.0038, + "num_input_tokens_seen": 208766544, + "step": 96655 + }, + { + "epoch": 15.768352365415987, + "grad_norm": 0.0061017731204628944, + "learning_rate": 0.00013029081988080545, + "loss": 0.1066, + "num_input_tokens_seen": 208777392, + "step": 96660 + }, + { + "epoch": 15.769168026101141, + "grad_norm": 0.06350026279687881, + "learning_rate": 0.00013024290207988866, + "loss": 0.0035, + "num_input_tokens_seen": 208787408, + "step": 96665 + }, + { + "epoch": 15.769983686786297, + "grad_norm": 0.012828153558075428, + "learning_rate": 0.00013019499177256848, + "loss": 0.1233, + "num_input_tokens_seen": 208797040, + "step": 96670 + }, + { + "epoch": 15.770799347471453, + "grad_norm": 0.005641296040266752, + "learning_rate": 0.00013014708895981597, + "loss": 0.0057, + "num_input_tokens_seen": 208809648, + "step": 96675 + }, + { + "epoch": 15.771615008156607, + "grad_norm": 0.0009565745131112635, + "learning_rate": 0.00013009919364260193, + "loss": 0.0085, + "num_input_tokens_seen": 208820592, + "step": 96680 + }, + { + "epoch": 15.772430668841762, + "grad_norm": 0.0075340899638831615, + "learning_rate": 0.0001300513058218969, + "loss": 0.0027, + "num_input_tokens_seen": 208830672, + "step": 96685 + }, + { + "epoch": 15.773246329526916, + "grad_norm": 0.005056384485214949, + "learning_rate": 0.0001300034254986715, + "loss": 0.0035, + "num_input_tokens_seen": 208841552, + "step": 96690 + }, + { + "epoch": 15.774061990212072, + "grad_norm": 0.01369437761604786, + "learning_rate": 0.00012995555267389608, + "loss": 0.0042, + "num_input_tokens_seen": 208852208, + "step": 96695 + }, + { + "epoch": 15.774877650897226, + "grad_norm": 3.37766170501709, + "learning_rate": 0.0001299076873485408, + "loss": 0.1299, + "num_input_tokens_seen": 208862672, + "step": 96700 + }, + { + "epoch": 15.775693311582382, + "grad_norm": 0.0019586030393838882, + "learning_rate": 0.00012985982952357577, + "loss": 0.0007, + "num_input_tokens_seen": 208873776, + "step": 96705 + }, + { + "epoch": 15.776508972267537, + "grad_norm": 0.00436317827552557, + "learning_rate": 0.00012981197919997078, + "loss": 0.0058, + "num_input_tokens_seen": 208884368, + "step": 96710 + }, + { + "epoch": 15.777324632952691, + "grad_norm": 0.0014498537639155984, + "learning_rate": 0.00012976413637869573, + "loss": 0.0038, + "num_input_tokens_seen": 208894512, + "step": 96715 + }, + { + "epoch": 15.778140293637847, + "grad_norm": 0.018102500587701797, + "learning_rate": 0.00012971630106072007, + "loss": 0.0027, + "num_input_tokens_seen": 208905488, + "step": 96720 + }, + { + "epoch": 15.778955954323001, + "grad_norm": 0.0548224151134491, + "learning_rate": 0.00012966847324701337, + "loss": 0.0031, + "num_input_tokens_seen": 208915760, + "step": 96725 + }, + { + "epoch": 15.779771615008157, + "grad_norm": 0.003279587486758828, + "learning_rate": 0.0001296206529385448, + "loss": 0.0067, + "num_input_tokens_seen": 208925808, + "step": 96730 + }, + { + "epoch": 15.780587275693312, + "grad_norm": 0.009405073709785938, + "learning_rate": 0.00012957284013628357, + "loss": 0.0014, + "num_input_tokens_seen": 208935024, + "step": 96735 + }, + { + "epoch": 15.781402936378466, + "grad_norm": 0.028064055368304253, + "learning_rate": 0.00012952503484119866, + "loss": 0.0022, + "num_input_tokens_seen": 208943728, + "step": 96740 + }, + { + "epoch": 15.782218597063622, + "grad_norm": 0.00217696325853467, + "learning_rate": 0.0001294772370542589, + "loss": 0.0102, + "num_input_tokens_seen": 208954096, + "step": 96745 + }, + { + "epoch": 15.783034257748776, + "grad_norm": 0.0025390072260051966, + "learning_rate": 0.00012942944677643282, + "loss": 0.001, + "num_input_tokens_seen": 208965136, + "step": 96750 + }, + { + "epoch": 15.783849918433932, + "grad_norm": 0.15127967298030853, + "learning_rate": 0.0001293816640086894, + "loss": 0.0625, + "num_input_tokens_seen": 208976112, + "step": 96755 + }, + { + "epoch": 15.784665579119086, + "grad_norm": 0.0010695239761844277, + "learning_rate": 0.00012933388875199643, + "loss": 0.0013, + "num_input_tokens_seen": 208986928, + "step": 96760 + }, + { + "epoch": 15.785481239804241, + "grad_norm": 0.00038432751898653805, + "learning_rate": 0.00012928612100732257, + "loss": 0.0007, + "num_input_tokens_seen": 208998512, + "step": 96765 + }, + { + "epoch": 15.786296900489397, + "grad_norm": 0.005832038354128599, + "learning_rate": 0.00012923836077563576, + "loss": 0.0014, + "num_input_tokens_seen": 209008208, + "step": 96770 + }, + { + "epoch": 15.78711256117455, + "grad_norm": 0.00981160532683134, + "learning_rate": 0.0001291906080579039, + "loss": 0.0009, + "num_input_tokens_seen": 209018480, + "step": 96775 + }, + { + "epoch": 15.787928221859707, + "grad_norm": 0.012114566750824451, + "learning_rate": 0.0001291428628550948, + "loss": 0.0192, + "num_input_tokens_seen": 209029072, + "step": 96780 + }, + { + "epoch": 15.78874388254486, + "grad_norm": 0.0163866113871336, + "learning_rate": 0.000129095125168176, + "loss": 0.0256, + "num_input_tokens_seen": 209040304, + "step": 96785 + }, + { + "epoch": 15.789559543230016, + "grad_norm": 0.0029381830245256424, + "learning_rate": 0.00012904739499811508, + "loss": 0.0048, + "num_input_tokens_seen": 209051152, + "step": 96790 + }, + { + "epoch": 15.790375203915172, + "grad_norm": 0.006611849181354046, + "learning_rate": 0.00012899967234587922, + "loss": 0.0292, + "num_input_tokens_seen": 209062000, + "step": 96795 + }, + { + "epoch": 15.791190864600326, + "grad_norm": 0.00876909215003252, + "learning_rate": 0.00012895195721243568, + "loss": 0.0016, + "num_input_tokens_seen": 209072304, + "step": 96800 + }, + { + "epoch": 15.792006525285482, + "grad_norm": 0.0037997523322701454, + "learning_rate": 0.00012890424959875147, + "loss": 0.0009, + "num_input_tokens_seen": 209083888, + "step": 96805 + }, + { + "epoch": 15.792822185970635, + "grad_norm": 0.005078902002424002, + "learning_rate": 0.0001288565495057934, + "loss": 0.001, + "num_input_tokens_seen": 209095888, + "step": 96810 + }, + { + "epoch": 15.793637846655791, + "grad_norm": 0.0032598378602415323, + "learning_rate": 0.00012880885693452814, + "loss": 0.0062, + "num_input_tokens_seen": 209106000, + "step": 96815 + }, + { + "epoch": 15.794453507340947, + "grad_norm": 0.0034630298614501953, + "learning_rate": 0.0001287611718859223, + "loss": 0.0016, + "num_input_tokens_seen": 209115952, + "step": 96820 + }, + { + "epoch": 15.7952691680261, + "grad_norm": 0.0033087453339248896, + "learning_rate": 0.00012871349436094226, + "loss": 0.001, + "num_input_tokens_seen": 209127248, + "step": 96825 + }, + { + "epoch": 15.796084828711257, + "grad_norm": 0.001051778206601739, + "learning_rate": 0.0001286658243605543, + "loss": 0.0026, + "num_input_tokens_seen": 209138288, + "step": 96830 + }, + { + "epoch": 15.79690048939641, + "grad_norm": 0.04102922976016998, + "learning_rate": 0.00012861816188572444, + "loss": 0.0039, + "num_input_tokens_seen": 209149296, + "step": 96835 + }, + { + "epoch": 15.797716150081566, + "grad_norm": 0.0023798923939466476, + "learning_rate": 0.00012857050693741866, + "loss": 0.005, + "num_input_tokens_seen": 209160496, + "step": 96840 + }, + { + "epoch": 15.798531810766722, + "grad_norm": 0.009345081634819508, + "learning_rate": 0.00012852285951660275, + "loss": 0.0018, + "num_input_tokens_seen": 209171440, + "step": 96845 + }, + { + "epoch": 15.799347471451876, + "grad_norm": 0.0020883092656731606, + "learning_rate": 0.00012847521962424237, + "loss": 0.0015, + "num_input_tokens_seen": 209182704, + "step": 96850 + }, + { + "epoch": 15.800163132137031, + "grad_norm": 0.007022218778729439, + "learning_rate": 0.00012842758726130281, + "loss": 0.0025, + "num_input_tokens_seen": 209194256, + "step": 96855 + }, + { + "epoch": 15.800978792822185, + "grad_norm": 0.0004571795871015638, + "learning_rate": 0.0001283799624287499, + "loss": 0.0095, + "num_input_tokens_seen": 209205264, + "step": 96860 + }, + { + "epoch": 15.801794453507341, + "grad_norm": 0.0013870035763829947, + "learning_rate": 0.00012833234512754817, + "loss": 0.001, + "num_input_tokens_seen": 209215952, + "step": 96865 + }, + { + "epoch": 15.802610114192497, + "grad_norm": 0.002285180613398552, + "learning_rate": 0.0001282847353586632, + "loss": 0.0024, + "num_input_tokens_seen": 209226928, + "step": 96870 + }, + { + "epoch": 15.80342577487765, + "grad_norm": 0.007267099339514971, + "learning_rate": 0.0001282371331230594, + "loss": 0.0032, + "num_input_tokens_seen": 209238704, + "step": 96875 + }, + { + "epoch": 15.804241435562806, + "grad_norm": 0.6497460007667542, + "learning_rate": 0.00012818953842170193, + "loss": 0.1171, + "num_input_tokens_seen": 209249808, + "step": 96880 + }, + { + "epoch": 15.80505709624796, + "grad_norm": 0.00022022541088517755, + "learning_rate": 0.0001281419512555549, + "loss": 0.0008, + "num_input_tokens_seen": 209259952, + "step": 96885 + }, + { + "epoch": 15.805872756933116, + "grad_norm": 0.0015587671659886837, + "learning_rate": 0.00012809437162558324, + "loss": 0.0659, + "num_input_tokens_seen": 209270576, + "step": 96890 + }, + { + "epoch": 15.80668841761827, + "grad_norm": 0.02945493347942829, + "learning_rate": 0.00012804679953275068, + "loss": 0.0022, + "num_input_tokens_seen": 209281424, + "step": 96895 + }, + { + "epoch": 15.807504078303426, + "grad_norm": 0.03355622664093971, + "learning_rate": 0.00012799923497802185, + "loss": 0.0051, + "num_input_tokens_seen": 209291664, + "step": 96900 + }, + { + "epoch": 15.808319738988581, + "grad_norm": 0.0016738567501306534, + "learning_rate": 0.00012795167796236012, + "loss": 0.0052, + "num_input_tokens_seen": 209303088, + "step": 96905 + }, + { + "epoch": 15.809135399673735, + "grad_norm": 0.0012268935097381473, + "learning_rate": 0.00012790412848672977, + "loss": 0.0013, + "num_input_tokens_seen": 209313328, + "step": 96910 + }, + { + "epoch": 15.809951060358891, + "grad_norm": 0.0003424619499128312, + "learning_rate": 0.0001278565865520943, + "loss": 0.0007, + "num_input_tokens_seen": 209324080, + "step": 96915 + }, + { + "epoch": 15.810766721044045, + "grad_norm": 0.5638118386268616, + "learning_rate": 0.00012780905215941724, + "loss": 0.0926, + "num_input_tokens_seen": 209334992, + "step": 96920 + }, + { + "epoch": 15.8115823817292, + "grad_norm": 0.000379926961613819, + "learning_rate": 0.00012776152530966184, + "loss": 0.003, + "num_input_tokens_seen": 209345680, + "step": 96925 + }, + { + "epoch": 15.812398042414356, + "grad_norm": 0.12855461239814758, + "learning_rate": 0.0001277140060037914, + "loss": 0.0069, + "num_input_tokens_seen": 209357072, + "step": 96930 + }, + { + "epoch": 15.81321370309951, + "grad_norm": 0.001596722868271172, + "learning_rate": 0.00012766649424276888, + "loss": 0.0004, + "num_input_tokens_seen": 209368688, + "step": 96935 + }, + { + "epoch": 15.814029363784666, + "grad_norm": 0.002013827906921506, + "learning_rate": 0.00012761899002755716, + "loss": 0.0013, + "num_input_tokens_seen": 209380176, + "step": 96940 + }, + { + "epoch": 15.81484502446982, + "grad_norm": 0.0006711476598866284, + "learning_rate": 0.00012757149335911906, + "loss": 0.004, + "num_input_tokens_seen": 209390000, + "step": 96945 + }, + { + "epoch": 15.815660685154976, + "grad_norm": 0.00945072341710329, + "learning_rate": 0.00012752400423841708, + "loss": 0.0013, + "num_input_tokens_seen": 209401008, + "step": 96950 + }, + { + "epoch": 15.81647634584013, + "grad_norm": 0.10607539117336273, + "learning_rate": 0.0001274765226664137, + "loss": 0.0026, + "num_input_tokens_seen": 209411216, + "step": 96955 + }, + { + "epoch": 15.817292006525285, + "grad_norm": 0.003964398056268692, + "learning_rate": 0.00012742904864407095, + "loss": 0.0188, + "num_input_tokens_seen": 209422384, + "step": 96960 + }, + { + "epoch": 15.818107667210441, + "grad_norm": 0.10050233453512192, + "learning_rate": 0.0001273815821723515, + "loss": 0.0032, + "num_input_tokens_seen": 209433488, + "step": 96965 + }, + { + "epoch": 15.818923327895595, + "grad_norm": 0.10801159590482712, + "learning_rate": 0.00012733412325221673, + "loss": 0.0035, + "num_input_tokens_seen": 209444496, + "step": 96970 + }, + { + "epoch": 15.81973898858075, + "grad_norm": 0.13759921491146088, + "learning_rate": 0.00012728667188462893, + "loss": 0.0072, + "num_input_tokens_seen": 209454960, + "step": 96975 + }, + { + "epoch": 15.820554649265905, + "grad_norm": 0.002384861698374152, + "learning_rate": 0.00012723922807054934, + "loss": 0.0039, + "num_input_tokens_seen": 209465456, + "step": 96980 + }, + { + "epoch": 15.82137030995106, + "grad_norm": 0.0007334126275964081, + "learning_rate": 0.00012719179181093992, + "loss": 0.0009, + "num_input_tokens_seen": 209476080, + "step": 96985 + }, + { + "epoch": 15.822185970636216, + "grad_norm": 0.007079676259309053, + "learning_rate": 0.00012714436310676147, + "loss": 0.0013, + "num_input_tokens_seen": 209487504, + "step": 96990 + }, + { + "epoch": 15.82300163132137, + "grad_norm": 0.010506193153560162, + "learning_rate": 0.00012709694195897587, + "loss": 0.0017, + "num_input_tokens_seen": 209498704, + "step": 96995 + }, + { + "epoch": 15.823817292006526, + "grad_norm": 0.007928671315312386, + "learning_rate": 0.00012704952836854345, + "loss": 0.0025, + "num_input_tokens_seen": 209509808, + "step": 97000 + }, + { + "epoch": 15.82463295269168, + "grad_norm": 0.005038387607783079, + "learning_rate": 0.00012700212233642577, + "loss": 0.0015, + "num_input_tokens_seen": 209518896, + "step": 97005 + }, + { + "epoch": 15.825448613376835, + "grad_norm": 0.19782593846321106, + "learning_rate": 0.00012695472386358293, + "loss": 0.0107, + "num_input_tokens_seen": 209529392, + "step": 97010 + }, + { + "epoch": 15.826264274061991, + "grad_norm": 0.013628569431602955, + "learning_rate": 0.00012690733295097617, + "loss": 0.0024, + "num_input_tokens_seen": 209541072, + "step": 97015 + }, + { + "epoch": 15.827079934747145, + "grad_norm": 0.0022729025222361088, + "learning_rate": 0.00012685994959956532, + "loss": 0.0037, + "num_input_tokens_seen": 209551568, + "step": 97020 + }, + { + "epoch": 15.8278955954323, + "grad_norm": 0.002973973285406828, + "learning_rate": 0.00012681257381031124, + "loss": 0.0018, + "num_input_tokens_seen": 209562480, + "step": 97025 + }, + { + "epoch": 15.828711256117455, + "grad_norm": 0.0014633465325459838, + "learning_rate": 0.00012676520558417347, + "loss": 0.0013, + "num_input_tokens_seen": 209572848, + "step": 97030 + }, + { + "epoch": 15.82952691680261, + "grad_norm": 0.0008202652097679675, + "learning_rate": 0.00012671784492211262, + "loss": 0.0026, + "num_input_tokens_seen": 209584048, + "step": 97035 + }, + { + "epoch": 15.830342577487766, + "grad_norm": 0.000434060872066766, + "learning_rate": 0.00012667049182508788, + "loss": 0.0005, + "num_input_tokens_seen": 209595088, + "step": 97040 + }, + { + "epoch": 15.83115823817292, + "grad_norm": 0.0006406829343177378, + "learning_rate": 0.00012662314629405936, + "loss": 0.0024, + "num_input_tokens_seen": 209605200, + "step": 97045 + }, + { + "epoch": 15.831973898858076, + "grad_norm": 0.003248979104682803, + "learning_rate": 0.00012657580832998644, + "loss": 0.0011, + "num_input_tokens_seen": 209615056, + "step": 97050 + }, + { + "epoch": 15.83278955954323, + "grad_norm": 0.0008684792555868626, + "learning_rate": 0.0001265284779338285, + "loss": 0.0022, + "num_input_tokens_seen": 209625552, + "step": 97055 + }, + { + "epoch": 15.833605220228385, + "grad_norm": 0.029019569978117943, + "learning_rate": 0.00012648115510654473, + "loss": 0.006, + "num_input_tokens_seen": 209636336, + "step": 97060 + }, + { + "epoch": 15.83442088091354, + "grad_norm": 0.0013905549421906471, + "learning_rate": 0.00012643383984909423, + "loss": 0.0009, + "num_input_tokens_seen": 209647056, + "step": 97065 + }, + { + "epoch": 15.835236541598695, + "grad_norm": 0.0020442737732082605, + "learning_rate": 0.0001263865321624358, + "loss": 0.0124, + "num_input_tokens_seen": 209658512, + "step": 97070 + }, + { + "epoch": 15.83605220228385, + "grad_norm": 0.0013671980705112219, + "learning_rate": 0.0001263392320475283, + "loss": 0.0774, + "num_input_tokens_seen": 209668400, + "step": 97075 + }, + { + "epoch": 15.836867862969005, + "grad_norm": 0.000905030348803848, + "learning_rate": 0.0001262919395053303, + "loss": 0.0014, + "num_input_tokens_seen": 209679920, + "step": 97080 + }, + { + "epoch": 15.83768352365416, + "grad_norm": 0.004797177854925394, + "learning_rate": 0.0001262446545368002, + "loss": 0.005, + "num_input_tokens_seen": 209690512, + "step": 97085 + }, + { + "epoch": 15.838499184339314, + "grad_norm": 0.0026325734797865152, + "learning_rate": 0.0001261973771428963, + "loss": 0.0012, + "num_input_tokens_seen": 209701872, + "step": 97090 + }, + { + "epoch": 15.83931484502447, + "grad_norm": 0.0004018640611320734, + "learning_rate": 0.00012615010732457677, + "loss": 0.0025, + "num_input_tokens_seen": 209713680, + "step": 97095 + }, + { + "epoch": 15.840130505709626, + "grad_norm": 0.01304841972887516, + "learning_rate": 0.00012610284508279956, + "loss": 0.0012, + "num_input_tokens_seen": 209725552, + "step": 97100 + }, + { + "epoch": 15.84094616639478, + "grad_norm": 0.03244048357009888, + "learning_rate": 0.00012605559041852245, + "loss": 0.0059, + "num_input_tokens_seen": 209735984, + "step": 97105 + }, + { + "epoch": 15.841761827079935, + "grad_norm": 0.02472323551774025, + "learning_rate": 0.0001260083433327034, + "loss": 0.0012, + "num_input_tokens_seen": 209747312, + "step": 97110 + }, + { + "epoch": 15.84257748776509, + "grad_norm": 0.002015564125031233, + "learning_rate": 0.00012596110382629943, + "loss": 0.0029, + "num_input_tokens_seen": 209757680, + "step": 97115 + }, + { + "epoch": 15.843393148450245, + "grad_norm": 0.0009261518134735525, + "learning_rate": 0.0001259138719002685, + "loss": 0.001, + "num_input_tokens_seen": 209769040, + "step": 97120 + }, + { + "epoch": 15.844208809135399, + "grad_norm": 0.0004853067803196609, + "learning_rate": 0.0001258666475555672, + "loss": 0.0016, + "num_input_tokens_seen": 209780624, + "step": 97125 + }, + { + "epoch": 15.845024469820554, + "grad_norm": 0.0011482738191261888, + "learning_rate": 0.00012581943079315323, + "loss": 0.0005, + "num_input_tokens_seen": 209791440, + "step": 97130 + }, + { + "epoch": 15.84584013050571, + "grad_norm": 0.012917263433337212, + "learning_rate": 0.00012577222161398288, + "loss": 0.0253, + "num_input_tokens_seen": 209802800, + "step": 97135 + }, + { + "epoch": 15.846655791190864, + "grad_norm": 0.012864846736192703, + "learning_rate": 0.00012572502001901347, + "loss": 0.0018, + "num_input_tokens_seen": 209813072, + "step": 97140 + }, + { + "epoch": 15.84747145187602, + "grad_norm": 0.0005281308549456298, + "learning_rate": 0.00012567782600920107, + "loss": 0.0015, + "num_input_tokens_seen": 209825232, + "step": 97145 + }, + { + "epoch": 15.848287112561174, + "grad_norm": 0.0009208358242176473, + "learning_rate": 0.0001256306395855027, + "loss": 0.0013, + "num_input_tokens_seen": 209835120, + "step": 97150 + }, + { + "epoch": 15.84910277324633, + "grad_norm": 0.02001389116048813, + "learning_rate": 0.000125583460748874, + "loss": 0.0016, + "num_input_tokens_seen": 209845200, + "step": 97155 + }, + { + "epoch": 15.849918433931485, + "grad_norm": 0.010557861998677254, + "learning_rate": 0.00012553628950027175, + "loss": 0.0028, + "num_input_tokens_seen": 209856336, + "step": 97160 + }, + { + "epoch": 15.850734094616639, + "grad_norm": 0.11188769340515137, + "learning_rate": 0.00012548912584065135, + "loss": 0.0019, + "num_input_tokens_seen": 209868240, + "step": 97165 + }, + { + "epoch": 15.851549755301795, + "grad_norm": 0.00966912042349577, + "learning_rate": 0.00012544196977096905, + "loss": 0.0004, + "num_input_tokens_seen": 209879120, + "step": 97170 + }, + { + "epoch": 15.852365415986949, + "grad_norm": 0.12375235557556152, + "learning_rate": 0.00012539482129218045, + "loss": 0.0029, + "num_input_tokens_seen": 209889712, + "step": 97175 + }, + { + "epoch": 15.853181076672104, + "grad_norm": 0.000904184824321419, + "learning_rate": 0.00012534768040524098, + "loss": 0.0005, + "num_input_tokens_seen": 209900816, + "step": 97180 + }, + { + "epoch": 15.85399673735726, + "grad_norm": 0.10164017230272293, + "learning_rate": 0.000125300547111106, + "loss": 0.004, + "num_input_tokens_seen": 209910320, + "step": 97185 + }, + { + "epoch": 15.854812398042414, + "grad_norm": 0.011888348497450352, + "learning_rate": 0.00012525342141073083, + "loss": 0.0014, + "num_input_tokens_seen": 209920944, + "step": 97190 + }, + { + "epoch": 15.85562805872757, + "grad_norm": 0.008458067663013935, + "learning_rate": 0.00012520630330507042, + "loss": 0.0013, + "num_input_tokens_seen": 209932272, + "step": 97195 + }, + { + "epoch": 15.856443719412724, + "grad_norm": 0.000870992022100836, + "learning_rate": 0.0001251591927950798, + "loss": 0.0007, + "num_input_tokens_seen": 209942736, + "step": 97200 + }, + { + "epoch": 15.85725938009788, + "grad_norm": 0.000473090389277786, + "learning_rate": 0.00012511208988171362, + "loss": 0.0005, + "num_input_tokens_seen": 209953072, + "step": 97205 + }, + { + "epoch": 15.858075040783035, + "grad_norm": 0.0010090031428262591, + "learning_rate": 0.0001250649945659265, + "loss": 0.0011, + "num_input_tokens_seen": 209964112, + "step": 97210 + }, + { + "epoch": 15.858890701468189, + "grad_norm": 0.015405405312776566, + "learning_rate": 0.00012501790684867292, + "loss": 0.0016, + "num_input_tokens_seen": 209975440, + "step": 97215 + }, + { + "epoch": 15.859706362153345, + "grad_norm": 0.010843812488019466, + "learning_rate": 0.0001249708267309072, + "loss": 0.001, + "num_input_tokens_seen": 209987088, + "step": 97220 + }, + { + "epoch": 15.860522022838499, + "grad_norm": 0.007993648760020733, + "learning_rate": 0.00012492375421358336, + "loss": 0.0026, + "num_input_tokens_seen": 209998832, + "step": 97225 + }, + { + "epoch": 15.861337683523654, + "grad_norm": 0.008369416929781437, + "learning_rate": 0.00012487668929765555, + "loss": 0.0046, + "num_input_tokens_seen": 210009328, + "step": 97230 + }, + { + "epoch": 15.86215334420881, + "grad_norm": 0.0016901845810934901, + "learning_rate": 0.00012482963198407742, + "loss": 0.0023, + "num_input_tokens_seen": 210020080, + "step": 97235 + }, + { + "epoch": 15.862969004893964, + "grad_norm": 0.004488547798246145, + "learning_rate": 0.00012478258227380262, + "loss": 0.0029, + "num_input_tokens_seen": 210030448, + "step": 97240 + }, + { + "epoch": 15.86378466557912, + "grad_norm": 0.003256887663155794, + "learning_rate": 0.0001247355401677851, + "loss": 0.0256, + "num_input_tokens_seen": 210043024, + "step": 97245 + }, + { + "epoch": 15.864600326264274, + "grad_norm": 0.00019677575619425625, + "learning_rate": 0.00012468850566697758, + "loss": 0.0015, + "num_input_tokens_seen": 210055024, + "step": 97250 + }, + { + "epoch": 15.86541598694943, + "grad_norm": 0.0008272241684608161, + "learning_rate": 0.00012464147877233394, + "loss": 0.0012, + "num_input_tokens_seen": 210066096, + "step": 97255 + }, + { + "epoch": 15.866231647634583, + "grad_norm": 0.007103449199348688, + "learning_rate": 0.00012459445948480663, + "loss": 0.0014, + "num_input_tokens_seen": 210076688, + "step": 97260 + }, + { + "epoch": 15.867047308319739, + "grad_norm": 0.021840078756213188, + "learning_rate": 0.0001245474478053491, + "loss": 0.0025, + "num_input_tokens_seen": 210086256, + "step": 97265 + }, + { + "epoch": 15.867862969004895, + "grad_norm": 0.003334933193400502, + "learning_rate": 0.00012450044373491355, + "loss": 0.0006, + "num_input_tokens_seen": 210097424, + "step": 97270 + }, + { + "epoch": 15.868678629690049, + "grad_norm": 0.0055555677972733974, + "learning_rate": 0.00012445344727445303, + "loss": 0.0016, + "num_input_tokens_seen": 210107024, + "step": 97275 + }, + { + "epoch": 15.869494290375204, + "grad_norm": 0.0009360619587823749, + "learning_rate": 0.00012440645842491977, + "loss": 0.0005, + "num_input_tokens_seen": 210116880, + "step": 97280 + }, + { + "epoch": 15.870309951060358, + "grad_norm": 0.002635387470945716, + "learning_rate": 0.0001243594771872661, + "loss": 0.1729, + "num_input_tokens_seen": 210127184, + "step": 97285 + }, + { + "epoch": 15.871125611745514, + "grad_norm": 0.06094209477305412, + "learning_rate": 0.00012431250356244422, + "loss": 0.0038, + "num_input_tokens_seen": 210138224, + "step": 97290 + }, + { + "epoch": 15.87194127243067, + "grad_norm": 0.0005155637627467513, + "learning_rate": 0.000124265537551406, + "loss": 0.0034, + "num_input_tokens_seen": 210148656, + "step": 97295 + }, + { + "epoch": 15.872756933115824, + "grad_norm": 0.4359850585460663, + "learning_rate": 0.00012421857915510332, + "loss": 0.0271, + "num_input_tokens_seen": 210160496, + "step": 97300 + }, + { + "epoch": 15.87357259380098, + "grad_norm": 0.5114566683769226, + "learning_rate": 0.00012417162837448787, + "loss": 0.0092, + "num_input_tokens_seen": 210172400, + "step": 97305 + }, + { + "epoch": 15.874388254486133, + "grad_norm": 0.0038012703880667686, + "learning_rate": 0.0001241246852105111, + "loss": 0.0284, + "num_input_tokens_seen": 210183984, + "step": 97310 + }, + { + "epoch": 15.875203915171289, + "grad_norm": 0.013922316022217274, + "learning_rate": 0.00012407774966412445, + "loss": 0.0075, + "num_input_tokens_seen": 210196112, + "step": 97315 + }, + { + "epoch": 15.876019575856443, + "grad_norm": 0.02212394028902054, + "learning_rate": 0.0001240308217362791, + "loss": 0.0011, + "num_input_tokens_seen": 210207408, + "step": 97320 + }, + { + "epoch": 15.876835236541599, + "grad_norm": 0.017958035692572594, + "learning_rate": 0.0001239839014279261, + "loss": 0.0014, + "num_input_tokens_seen": 210218832, + "step": 97325 + }, + { + "epoch": 15.877650897226754, + "grad_norm": 0.004952315706759691, + "learning_rate": 0.0001239369887400163, + "loss": 0.005, + "num_input_tokens_seen": 210230832, + "step": 97330 + }, + { + "epoch": 15.878466557911908, + "grad_norm": 0.10258053243160248, + "learning_rate": 0.0001238900836735005, + "loss": 0.0038, + "num_input_tokens_seen": 210241168, + "step": 97335 + }, + { + "epoch": 15.879282218597064, + "grad_norm": 0.010011304169893265, + "learning_rate": 0.00012384318622932932, + "loss": 0.0034, + "num_input_tokens_seen": 210252176, + "step": 97340 + }, + { + "epoch": 15.880097879282218, + "grad_norm": 0.000613482145126909, + "learning_rate": 0.00012379629640845314, + "loss": 0.0016, + "num_input_tokens_seen": 210262704, + "step": 97345 + }, + { + "epoch": 15.880913539967374, + "grad_norm": 0.0014483414124697447, + "learning_rate": 0.0001237494142118223, + "loss": 0.0008, + "num_input_tokens_seen": 210273104, + "step": 97350 + }, + { + "epoch": 15.88172920065253, + "grad_norm": 0.005220226943492889, + "learning_rate": 0.00012370253964038685, + "loss": 0.0024, + "num_input_tokens_seen": 210283728, + "step": 97355 + }, + { + "epoch": 15.882544861337683, + "grad_norm": 0.0020932487677782774, + "learning_rate": 0.0001236556726950968, + "loss": 0.0015, + "num_input_tokens_seen": 210295440, + "step": 97360 + }, + { + "epoch": 15.883360522022839, + "grad_norm": 0.03078819066286087, + "learning_rate": 0.000123608813376902, + "loss": 0.0018, + "num_input_tokens_seen": 210307440, + "step": 97365 + }, + { + "epoch": 15.884176182707993, + "grad_norm": 0.00039980438305065036, + "learning_rate": 0.00012356196168675205, + "loss": 0.0031, + "num_input_tokens_seen": 210317392, + "step": 97370 + }, + { + "epoch": 15.884991843393149, + "grad_norm": 0.0004841327026952058, + "learning_rate": 0.00012351511762559653, + "loss": 0.0012, + "num_input_tokens_seen": 210327728, + "step": 97375 + }, + { + "epoch": 15.885807504078304, + "grad_norm": 0.752201497554779, + "learning_rate": 0.0001234682811943847, + "loss": 0.0498, + "num_input_tokens_seen": 210338672, + "step": 97380 + }, + { + "epoch": 15.886623164763458, + "grad_norm": 0.005839809309691191, + "learning_rate": 0.00012342145239406573, + "loss": 0.0702, + "num_input_tokens_seen": 210350352, + "step": 97385 + }, + { + "epoch": 15.887438825448614, + "grad_norm": 0.0002493146457709372, + "learning_rate": 0.00012337463122558885, + "loss": 0.0014, + "num_input_tokens_seen": 210361872, + "step": 97390 + }, + { + "epoch": 15.888254486133768, + "grad_norm": 0.0055696722120046616, + "learning_rate": 0.00012332781768990286, + "loss": 0.0042, + "num_input_tokens_seen": 210373520, + "step": 97395 + }, + { + "epoch": 15.889070146818923, + "grad_norm": 0.001248400192707777, + "learning_rate": 0.00012328101178795648, + "loss": 0.0013, + "num_input_tokens_seen": 210384912, + "step": 97400 + }, + { + "epoch": 15.88988580750408, + "grad_norm": 0.058470193296670914, + "learning_rate": 0.0001232342135206983, + "loss": 0.024, + "num_input_tokens_seen": 210394896, + "step": 97405 + }, + { + "epoch": 15.890701468189233, + "grad_norm": 0.7765676379203796, + "learning_rate": 0.0001231874228890768, + "loss": 0.0108, + "num_input_tokens_seen": 210405136, + "step": 97410 + }, + { + "epoch": 15.891517128874389, + "grad_norm": 0.0009560906910337508, + "learning_rate": 0.00012314063989404012, + "loss": 0.0073, + "num_input_tokens_seen": 210416144, + "step": 97415 + }, + { + "epoch": 15.892332789559543, + "grad_norm": 0.0044351788237690926, + "learning_rate": 0.00012309386453653647, + "loss": 0.0407, + "num_input_tokens_seen": 210426960, + "step": 97420 + }, + { + "epoch": 15.893148450244698, + "grad_norm": 0.00936975609511137, + "learning_rate": 0.00012304709681751385, + "loss": 0.0128, + "num_input_tokens_seen": 210437680, + "step": 97425 + }, + { + "epoch": 15.893964110929852, + "grad_norm": 0.00683948677033186, + "learning_rate": 0.00012300033673792, + "loss": 0.0012, + "num_input_tokens_seen": 210449200, + "step": 97430 + }, + { + "epoch": 15.894779771615008, + "grad_norm": 0.00031043830676935613, + "learning_rate": 0.00012295358429870252, + "loss": 0.0055, + "num_input_tokens_seen": 210459824, + "step": 97435 + }, + { + "epoch": 15.895595432300164, + "grad_norm": 0.002923042280599475, + "learning_rate": 0.000122906839500809, + "loss": 0.0045, + "num_input_tokens_seen": 210469808, + "step": 97440 + }, + { + "epoch": 15.896411092985318, + "grad_norm": 0.0020240000449121, + "learning_rate": 0.0001228601023451868, + "loss": 0.0011, + "num_input_tokens_seen": 210479184, + "step": 97445 + }, + { + "epoch": 15.897226753670473, + "grad_norm": 0.0005645108758471906, + "learning_rate": 0.00012281337283278298, + "loss": 0.0317, + "num_input_tokens_seen": 210489264, + "step": 97450 + }, + { + "epoch": 15.898042414355627, + "grad_norm": 0.00976780615746975, + "learning_rate": 0.0001227666509645447, + "loss": 0.0029, + "num_input_tokens_seen": 210499280, + "step": 97455 + }, + { + "epoch": 15.898858075040783, + "grad_norm": 0.004547907970845699, + "learning_rate": 0.00012271993674141878, + "loss": 0.0153, + "num_input_tokens_seen": 210509648, + "step": 97460 + }, + { + "epoch": 15.899673735725939, + "grad_norm": 0.0006972053670324385, + "learning_rate": 0.000122673230164352, + "loss": 0.0013, + "num_input_tokens_seen": 210520752, + "step": 97465 + }, + { + "epoch": 15.900489396411093, + "grad_norm": 1.194573998451233, + "learning_rate": 0.00012262653123429085, + "loss": 0.0174, + "num_input_tokens_seen": 210532336, + "step": 97470 + }, + { + "epoch": 15.901305057096248, + "grad_norm": 0.005130276549607515, + "learning_rate": 0.0001225798399521818, + "loss": 0.0009, + "num_input_tokens_seen": 210543024, + "step": 97475 + }, + { + "epoch": 15.902120717781402, + "grad_norm": 0.010159132070839405, + "learning_rate": 0.00012253315631897106, + "loss": 0.0192, + "num_input_tokens_seen": 210553488, + "step": 97480 + }, + { + "epoch": 15.902936378466558, + "grad_norm": 0.01264498382806778, + "learning_rate": 0.00012248648033560473, + "loss": 0.0027, + "num_input_tokens_seen": 210565296, + "step": 97485 + }, + { + "epoch": 15.903752039151712, + "grad_norm": 0.062267009168863297, + "learning_rate": 0.00012243981200302885, + "loss": 0.0034, + "num_input_tokens_seen": 210575632, + "step": 97490 + }, + { + "epoch": 15.904567699836868, + "grad_norm": 0.012246742844581604, + "learning_rate": 0.00012239315132218898, + "loss": 0.0029, + "num_input_tokens_seen": 210587184, + "step": 97495 + }, + { + "epoch": 15.905383360522023, + "grad_norm": 0.0017103358404710889, + "learning_rate": 0.00012234649829403116, + "loss": 0.0685, + "num_input_tokens_seen": 210596912, + "step": 97500 + }, + { + "epoch": 15.906199021207177, + "grad_norm": 0.0008929313044063747, + "learning_rate": 0.0001222998529195004, + "loss": 0.0024, + "num_input_tokens_seen": 210607376, + "step": 97505 + }, + { + "epoch": 15.907014681892333, + "grad_norm": 0.015163707546889782, + "learning_rate": 0.00012225321519954258, + "loss": 0.0017, + "num_input_tokens_seen": 210618192, + "step": 97510 + }, + { + "epoch": 15.907830342577487, + "grad_norm": 0.0005488864844664931, + "learning_rate": 0.00012220658513510224, + "loss": 0.0007, + "num_input_tokens_seen": 210629904, + "step": 97515 + }, + { + "epoch": 15.908646003262643, + "grad_norm": 0.0006303298287093639, + "learning_rate": 0.00012215996272712498, + "loss": 0.0023, + "num_input_tokens_seen": 210640592, + "step": 97520 + }, + { + "epoch": 15.909461663947798, + "grad_norm": 0.005725410301238298, + "learning_rate": 0.00012211334797655515, + "loss": 0.0013, + "num_input_tokens_seen": 210651312, + "step": 97525 + }, + { + "epoch": 15.910277324632952, + "grad_norm": 0.0017852602759376168, + "learning_rate": 0.00012206674088433784, + "loss": 0.006, + "num_input_tokens_seen": 210660880, + "step": 97530 + }, + { + "epoch": 15.911092985318108, + "grad_norm": 0.046053625643253326, + "learning_rate": 0.00012202014145141749, + "loss": 0.0023, + "num_input_tokens_seen": 210671472, + "step": 97535 + }, + { + "epoch": 15.911908646003262, + "grad_norm": 0.0011109462939202785, + "learning_rate": 0.00012197354967873847, + "loss": 0.0006, + "num_input_tokens_seen": 210682768, + "step": 97540 + }, + { + "epoch": 15.912724306688418, + "grad_norm": 0.00119930156506598, + "learning_rate": 0.00012192696556724497, + "loss": 0.0007, + "num_input_tokens_seen": 210693296, + "step": 97545 + }, + { + "epoch": 15.913539967373573, + "grad_norm": 0.013788484036922455, + "learning_rate": 0.00012188038911788119, + "loss": 0.0024, + "num_input_tokens_seen": 210704080, + "step": 97550 + }, + { + "epoch": 15.914355628058727, + "grad_norm": 0.034569744020700455, + "learning_rate": 0.00012183382033159101, + "loss": 0.0018, + "num_input_tokens_seen": 210714704, + "step": 97555 + }, + { + "epoch": 15.915171288743883, + "grad_norm": 0.003412411315366626, + "learning_rate": 0.00012178725920931816, + "loss": 0.1418, + "num_input_tokens_seen": 210725264, + "step": 97560 + }, + { + "epoch": 15.915986949429037, + "grad_norm": 0.00040752938366495073, + "learning_rate": 0.0001217407057520063, + "loss": 0.0008, + "num_input_tokens_seen": 210736272, + "step": 97565 + }, + { + "epoch": 15.916802610114193, + "grad_norm": 0.003446828341111541, + "learning_rate": 0.0001216941599605989, + "loss": 0.0047, + "num_input_tokens_seen": 210746320, + "step": 97570 + }, + { + "epoch": 15.917618270799348, + "grad_norm": 0.02085341326892376, + "learning_rate": 0.00012164762183603928, + "loss": 0.0009, + "num_input_tokens_seen": 210757712, + "step": 97575 + }, + { + "epoch": 15.918433931484502, + "grad_norm": 0.0007540509686805308, + "learning_rate": 0.00012160109137927061, + "loss": 0.0015, + "num_input_tokens_seen": 210768560, + "step": 97580 + }, + { + "epoch": 15.919249592169658, + "grad_norm": 0.00683918921276927, + "learning_rate": 0.00012155456859123582, + "loss": 0.0037, + "num_input_tokens_seen": 210779440, + "step": 97585 + }, + { + "epoch": 15.920065252854812, + "grad_norm": 0.009121835231781006, + "learning_rate": 0.00012150805347287774, + "loss": 0.0957, + "num_input_tokens_seen": 210789616, + "step": 97590 + }, + { + "epoch": 15.920880913539968, + "grad_norm": 0.007207232527434826, + "learning_rate": 0.00012146154602513915, + "loss": 0.0037, + "num_input_tokens_seen": 210800560, + "step": 97595 + }, + { + "epoch": 15.921696574225122, + "grad_norm": 0.002390147652477026, + "learning_rate": 0.00012141504624896244, + "loss": 0.0044, + "num_input_tokens_seen": 210809488, + "step": 97600 + }, + { + "epoch": 15.922512234910277, + "grad_norm": 0.041978128254413605, + "learning_rate": 0.0001213685541452903, + "loss": 0.0018, + "num_input_tokens_seen": 210819696, + "step": 97605 + }, + { + "epoch": 15.923327895595433, + "grad_norm": 0.010881558991968632, + "learning_rate": 0.00012132206971506449, + "loss": 0.0018, + "num_input_tokens_seen": 210831440, + "step": 97610 + }, + { + "epoch": 15.924143556280587, + "grad_norm": 0.008309472352266312, + "learning_rate": 0.00012127559295922764, + "loss": 0.001, + "num_input_tokens_seen": 210841680, + "step": 97615 + }, + { + "epoch": 15.924959216965743, + "grad_norm": 0.00020515847427304834, + "learning_rate": 0.00012122912387872098, + "loss": 0.0013, + "num_input_tokens_seen": 210852528, + "step": 97620 + }, + { + "epoch": 15.925774877650896, + "grad_norm": 0.003146476112306118, + "learning_rate": 0.000121182662474487, + "loss": 0.0022, + "num_input_tokens_seen": 210863440, + "step": 97625 + }, + { + "epoch": 15.926590538336052, + "grad_norm": 0.0006061471067368984, + "learning_rate": 0.00012113620874746656, + "loss": 0.0008, + "num_input_tokens_seen": 210874640, + "step": 97630 + }, + { + "epoch": 15.927406199021208, + "grad_norm": 0.00795169360935688, + "learning_rate": 0.00012108976269860183, + "loss": 0.0568, + "num_input_tokens_seen": 210886064, + "step": 97635 + }, + { + "epoch": 15.928221859706362, + "grad_norm": 0.01079578511416912, + "learning_rate": 0.00012104332432883342, + "loss": 0.0019, + "num_input_tokens_seen": 210894896, + "step": 97640 + }, + { + "epoch": 15.929037520391518, + "grad_norm": 0.005672822240740061, + "learning_rate": 0.0001209968936391031, + "loss": 0.0252, + "num_input_tokens_seen": 210905296, + "step": 97645 + }, + { + "epoch": 15.929853181076671, + "grad_norm": 0.0013776031555607915, + "learning_rate": 0.00012095047063035119, + "loss": 0.0016, + "num_input_tokens_seen": 210916368, + "step": 97650 + }, + { + "epoch": 15.930668841761827, + "grad_norm": 0.003583358135074377, + "learning_rate": 0.00012090405530351916, + "loss": 0.0664, + "num_input_tokens_seen": 210926032, + "step": 97655 + }, + { + "epoch": 15.931484502446983, + "grad_norm": 0.20289726555347443, + "learning_rate": 0.0001208576476595471, + "loss": 0.0104, + "num_input_tokens_seen": 210937584, + "step": 97660 + }, + { + "epoch": 15.932300163132137, + "grad_norm": 0.0075626983307302, + "learning_rate": 0.00012081124769937607, + "loss": 0.0038, + "num_input_tokens_seen": 210950224, + "step": 97665 + }, + { + "epoch": 15.933115823817293, + "grad_norm": 0.07312007993459702, + "learning_rate": 0.00012076485542394583, + "loss": 0.0031, + "num_input_tokens_seen": 210961104, + "step": 97670 + }, + { + "epoch": 15.933931484502446, + "grad_norm": 0.04900915548205376, + "learning_rate": 0.00012071847083419708, + "loss": 0.0321, + "num_input_tokens_seen": 210972880, + "step": 97675 + }, + { + "epoch": 15.934747145187602, + "grad_norm": 0.006645069923251867, + "learning_rate": 0.00012067209393106959, + "loss": 0.0189, + "num_input_tokens_seen": 210984272, + "step": 97680 + }, + { + "epoch": 15.935562805872756, + "grad_norm": 0.0015154111897572875, + "learning_rate": 0.00012062572471550337, + "loss": 0.0016, + "num_input_tokens_seen": 210994320, + "step": 97685 + }, + { + "epoch": 15.936378466557912, + "grad_norm": 0.0016459511825814843, + "learning_rate": 0.00012057936318843816, + "loss": 0.0034, + "num_input_tokens_seen": 211005392, + "step": 97690 + }, + { + "epoch": 15.937194127243067, + "grad_norm": 0.0007665985031053424, + "learning_rate": 0.00012053300935081341, + "loss": 0.0039, + "num_input_tokens_seen": 211016336, + "step": 97695 + }, + { + "epoch": 15.938009787928221, + "grad_norm": 0.002678055316209793, + "learning_rate": 0.00012048666320356865, + "loss": 0.0012, + "num_input_tokens_seen": 211027312, + "step": 97700 + }, + { + "epoch": 15.938825448613377, + "grad_norm": 0.06500992923974991, + "learning_rate": 0.0001204403247476431, + "loss": 0.0033, + "num_input_tokens_seen": 211037456, + "step": 97705 + }, + { + "epoch": 15.939641109298531, + "grad_norm": 0.0016458886675536633, + "learning_rate": 0.00012039399398397588, + "loss": 0.0013, + "num_input_tokens_seen": 211048624, + "step": 97710 + }, + { + "epoch": 15.940456769983687, + "grad_norm": 0.11121902614831924, + "learning_rate": 0.00012034767091350591, + "loss": 0.0046, + "num_input_tokens_seen": 211059248, + "step": 97715 + }, + { + "epoch": 15.941272430668842, + "grad_norm": 0.0037498734891414642, + "learning_rate": 0.00012030135553717204, + "loss": 0.0221, + "num_input_tokens_seen": 211069808, + "step": 97720 + }, + { + "epoch": 15.942088091353996, + "grad_norm": 0.02941116690635681, + "learning_rate": 0.00012025504785591273, + "loss": 0.0391, + "num_input_tokens_seen": 211081264, + "step": 97725 + }, + { + "epoch": 15.942903752039152, + "grad_norm": 0.036870796233415604, + "learning_rate": 0.00012020874787066688, + "loss": 0.0061, + "num_input_tokens_seen": 211092688, + "step": 97730 + }, + { + "epoch": 15.943719412724306, + "grad_norm": 0.0026832197327166796, + "learning_rate": 0.00012016245558237232, + "loss": 0.0012, + "num_input_tokens_seen": 211102960, + "step": 97735 + }, + { + "epoch": 15.944535073409462, + "grad_norm": 0.012120860628783703, + "learning_rate": 0.0001201161709919677, + "loss": 0.0012, + "num_input_tokens_seen": 211114416, + "step": 97740 + }, + { + "epoch": 15.945350734094617, + "grad_norm": 0.0010019437177106738, + "learning_rate": 0.00012006989410039055, + "loss": 0.0397, + "num_input_tokens_seen": 211125936, + "step": 97745 + }, + { + "epoch": 15.946166394779771, + "grad_norm": 0.00462432811036706, + "learning_rate": 0.00012002362490857921, + "loss": 0.0298, + "num_input_tokens_seen": 211138608, + "step": 97750 + }, + { + "epoch": 15.946982055464927, + "grad_norm": 0.001551853958517313, + "learning_rate": 0.00011997736341747085, + "loss": 0.0032, + "num_input_tokens_seen": 211151184, + "step": 97755 + }, + { + "epoch": 15.947797716150081, + "grad_norm": 0.0018696035258471966, + "learning_rate": 0.00011993110962800363, + "loss": 0.0035, + "num_input_tokens_seen": 211161584, + "step": 97760 + }, + { + "epoch": 15.948613376835237, + "grad_norm": 0.0018065494950860739, + "learning_rate": 0.00011988486354111433, + "loss": 0.0109, + "num_input_tokens_seen": 211172496, + "step": 97765 + }, + { + "epoch": 15.949429037520392, + "grad_norm": 0.005451989360153675, + "learning_rate": 0.0001198386251577408, + "loss": 0.0009, + "num_input_tokens_seen": 211183536, + "step": 97770 + }, + { + "epoch": 15.950244698205546, + "grad_norm": 0.7213369011878967, + "learning_rate": 0.00011979239447881945, + "loss": 0.0131, + "num_input_tokens_seen": 211194480, + "step": 97775 + }, + { + "epoch": 15.951060358890702, + "grad_norm": 0.0018106530187651515, + "learning_rate": 0.00011974617150528788, + "loss": 0.0007, + "num_input_tokens_seen": 211204944, + "step": 97780 + }, + { + "epoch": 15.951876019575856, + "grad_norm": 0.002773257438093424, + "learning_rate": 0.00011969995623808221, + "loss": 0.0163, + "num_input_tokens_seen": 211215472, + "step": 97785 + }, + { + "epoch": 15.952691680261012, + "grad_norm": 0.016532791778445244, + "learning_rate": 0.00011965374867813972, + "loss": 0.0028, + "num_input_tokens_seen": 211227216, + "step": 97790 + }, + { + "epoch": 15.953507340946166, + "grad_norm": 0.0011013613548129797, + "learning_rate": 0.00011960754882639619, + "loss": 0.0986, + "num_input_tokens_seen": 211238768, + "step": 97795 + }, + { + "epoch": 15.954323001631321, + "grad_norm": 0.0032066667918115854, + "learning_rate": 0.00011956135668378853, + "loss": 0.0024, + "num_input_tokens_seen": 211249232, + "step": 97800 + }, + { + "epoch": 15.955138662316477, + "grad_norm": 0.0005308827967382967, + "learning_rate": 0.00011951517225125231, + "loss": 0.0011, + "num_input_tokens_seen": 211259792, + "step": 97805 + }, + { + "epoch": 15.955954323001631, + "grad_norm": 0.0034774949308484793, + "learning_rate": 0.00011946899552972395, + "loss": 0.0017, + "num_input_tokens_seen": 211270448, + "step": 97810 + }, + { + "epoch": 15.956769983686787, + "grad_norm": 0.0022737339604645967, + "learning_rate": 0.00011942282652013914, + "loss": 0.0017, + "num_input_tokens_seen": 211280624, + "step": 97815 + }, + { + "epoch": 15.95758564437194, + "grad_norm": 0.0004652600619010627, + "learning_rate": 0.00011937666522343354, + "loss": 0.0012, + "num_input_tokens_seen": 211291760, + "step": 97820 + }, + { + "epoch": 15.958401305057096, + "grad_norm": 0.001015088171698153, + "learning_rate": 0.0001193305116405427, + "loss": 0.0014, + "num_input_tokens_seen": 211301392, + "step": 97825 + }, + { + "epoch": 15.959216965742252, + "grad_norm": 0.002087209140881896, + "learning_rate": 0.00011928436577240193, + "loss": 0.0121, + "num_input_tokens_seen": 211312560, + "step": 97830 + }, + { + "epoch": 15.960032626427406, + "grad_norm": 0.0011707304511219263, + "learning_rate": 0.00011923822761994646, + "loss": 0.0021, + "num_input_tokens_seen": 211324784, + "step": 97835 + }, + { + "epoch": 15.960848287112562, + "grad_norm": 0.4978778660297394, + "learning_rate": 0.00011919209718411134, + "loss": 0.0095, + "num_input_tokens_seen": 211335216, + "step": 97840 + }, + { + "epoch": 15.961663947797716, + "grad_norm": 0.0035809699911624193, + "learning_rate": 0.00011914597446583147, + "loss": 0.0021, + "num_input_tokens_seen": 211346864, + "step": 97845 + }, + { + "epoch": 15.962479608482871, + "grad_norm": 0.0041794972494244576, + "learning_rate": 0.00011909985946604157, + "loss": 0.0073, + "num_input_tokens_seen": 211358192, + "step": 97850 + }, + { + "epoch": 15.963295269168025, + "grad_norm": 0.0012313545448705554, + "learning_rate": 0.00011905375218567621, + "loss": 0.0483, + "num_input_tokens_seen": 211369968, + "step": 97855 + }, + { + "epoch": 15.964110929853181, + "grad_norm": 0.014864086173474789, + "learning_rate": 0.00011900765262566988, + "loss": 0.0015, + "num_input_tokens_seen": 211380304, + "step": 97860 + }, + { + "epoch": 15.964926590538337, + "grad_norm": 0.0016739999409765005, + "learning_rate": 0.00011896156078695675, + "loss": 0.0012, + "num_input_tokens_seen": 211391888, + "step": 97865 + }, + { + "epoch": 15.96574225122349, + "grad_norm": 0.004763337317854166, + "learning_rate": 0.00011891547667047082, + "loss": 0.0037, + "num_input_tokens_seen": 211402800, + "step": 97870 + }, + { + "epoch": 15.966557911908646, + "grad_norm": 0.015505307354032993, + "learning_rate": 0.00011886940027714649, + "loss": 0.0027, + "num_input_tokens_seen": 211413776, + "step": 97875 + }, + { + "epoch": 15.9673735725938, + "grad_norm": 0.9275649785995483, + "learning_rate": 0.00011882333160791697, + "loss": 0.1289, + "num_input_tokens_seen": 211424816, + "step": 97880 + }, + { + "epoch": 15.968189233278956, + "grad_norm": 0.006364729721099138, + "learning_rate": 0.00011877727066371646, + "loss": 0.0037, + "num_input_tokens_seen": 211435568, + "step": 97885 + }, + { + "epoch": 15.969004893964112, + "grad_norm": 0.014689773321151733, + "learning_rate": 0.00011873121744547794, + "loss": 0.0689, + "num_input_tokens_seen": 211446704, + "step": 97890 + }, + { + "epoch": 15.969820554649266, + "grad_norm": 0.006804941687732935, + "learning_rate": 0.00011868517195413525, + "loss": 0.0021, + "num_input_tokens_seen": 211456304, + "step": 97895 + }, + { + "epoch": 15.970636215334421, + "grad_norm": 0.0007602769765071571, + "learning_rate": 0.00011863913419062095, + "loss": 0.0139, + "num_input_tokens_seen": 211467824, + "step": 97900 + }, + { + "epoch": 15.971451876019575, + "grad_norm": 0.0020225904881954193, + "learning_rate": 0.00011859310415586871, + "loss": 0.0684, + "num_input_tokens_seen": 211478672, + "step": 97905 + }, + { + "epoch": 15.97226753670473, + "grad_norm": 0.0005114731029607356, + "learning_rate": 0.00011854708185081076, + "loss": 0.001, + "num_input_tokens_seen": 211489520, + "step": 97910 + }, + { + "epoch": 15.973083197389887, + "grad_norm": 0.0024293966125696898, + "learning_rate": 0.00011850106727638026, + "loss": 0.0015, + "num_input_tokens_seen": 211501008, + "step": 97915 + }, + { + "epoch": 15.97389885807504, + "grad_norm": 0.1450122594833374, + "learning_rate": 0.00011845506043350956, + "loss": 0.008, + "num_input_tokens_seen": 211510928, + "step": 97920 + }, + { + "epoch": 15.974714518760196, + "grad_norm": 0.014834724366664886, + "learning_rate": 0.00011840906132313117, + "loss": 0.1368, + "num_input_tokens_seen": 211521264, + "step": 97925 + }, + { + "epoch": 15.97553017944535, + "grad_norm": 0.6983307003974915, + "learning_rate": 0.00011836306994617718, + "loss": 0.0056, + "num_input_tokens_seen": 211532272, + "step": 97930 + }, + { + "epoch": 15.976345840130506, + "grad_norm": 0.009043251164257526, + "learning_rate": 0.00011831708630357968, + "loss": 0.0009, + "num_input_tokens_seen": 211542992, + "step": 97935 + }, + { + "epoch": 15.977161500815662, + "grad_norm": 0.008625411428511143, + "learning_rate": 0.0001182711103962707, + "loss": 0.0021, + "num_input_tokens_seen": 211551312, + "step": 97940 + }, + { + "epoch": 15.977977161500815, + "grad_norm": 0.004919607657939196, + "learning_rate": 0.00011822514222518188, + "loss": 0.0007, + "num_input_tokens_seen": 211562672, + "step": 97945 + }, + { + "epoch": 15.978792822185971, + "grad_norm": 0.0008576642139814794, + "learning_rate": 0.00011817918179124487, + "loss": 0.0045, + "num_input_tokens_seen": 211574384, + "step": 97950 + }, + { + "epoch": 15.979608482871125, + "grad_norm": 0.0007568973815068603, + "learning_rate": 0.00011813322909539115, + "loss": 0.0119, + "num_input_tokens_seen": 211585168, + "step": 97955 + }, + { + "epoch": 15.98042414355628, + "grad_norm": 0.0010796175338327885, + "learning_rate": 0.0001180872841385519, + "loss": 0.0035, + "num_input_tokens_seen": 211594384, + "step": 97960 + }, + { + "epoch": 15.981239804241435, + "grad_norm": 0.09835665673017502, + "learning_rate": 0.00011804134692165841, + "loss": 0.0141, + "num_input_tokens_seen": 211604496, + "step": 97965 + }, + { + "epoch": 15.98205546492659, + "grad_norm": 0.03971698135137558, + "learning_rate": 0.00011799541744564151, + "loss": 0.0019, + "num_input_tokens_seen": 211615056, + "step": 97970 + }, + { + "epoch": 15.982871125611746, + "grad_norm": 0.020225724205374718, + "learning_rate": 0.00011794949571143215, + "loss": 0.001, + "num_input_tokens_seen": 211626832, + "step": 97975 + }, + { + "epoch": 15.9836867862969, + "grad_norm": 0.0011494300561025739, + "learning_rate": 0.00011790358171996086, + "loss": 0.0073, + "num_input_tokens_seen": 211636752, + "step": 97980 + }, + { + "epoch": 15.984502446982056, + "grad_norm": 0.00521137984469533, + "learning_rate": 0.00011785767547215825, + "loss": 0.006, + "num_input_tokens_seen": 211648272, + "step": 97985 + }, + { + "epoch": 15.98531810766721, + "grad_norm": 0.013017826713621616, + "learning_rate": 0.00011781177696895462, + "loss": 0.0018, + "num_input_tokens_seen": 211659344, + "step": 97990 + }, + { + "epoch": 15.986133768352365, + "grad_norm": 0.024100029841065407, + "learning_rate": 0.00011776588621128015, + "loss": 0.1615, + "num_input_tokens_seen": 211670544, + "step": 97995 + }, + { + "epoch": 15.986949429037521, + "grad_norm": 0.0009241014486178756, + "learning_rate": 0.00011772000320006493, + "loss": 0.0008, + "num_input_tokens_seen": 211681776, + "step": 98000 + }, + { + "epoch": 15.987765089722675, + "grad_norm": 0.2770422101020813, + "learning_rate": 0.00011767412793623878, + "loss": 0.0132, + "num_input_tokens_seen": 211691952, + "step": 98005 + }, + { + "epoch": 15.98858075040783, + "grad_norm": 0.004331182222813368, + "learning_rate": 0.00011762826042073144, + "loss": 0.001, + "num_input_tokens_seen": 211702288, + "step": 98010 + }, + { + "epoch": 15.989396411092985, + "grad_norm": 0.02343442477285862, + "learning_rate": 0.00011758240065447234, + "loss": 0.0029, + "num_input_tokens_seen": 211712496, + "step": 98015 + }, + { + "epoch": 15.99021207177814, + "grad_norm": 0.013188188895583153, + "learning_rate": 0.00011753654863839114, + "loss": 0.0017, + "num_input_tokens_seen": 211724688, + "step": 98020 + }, + { + "epoch": 15.991027732463294, + "grad_norm": 0.001225476386025548, + "learning_rate": 0.00011749070437341702, + "loss": 0.0013, + "num_input_tokens_seen": 211736656, + "step": 98025 + }, + { + "epoch": 15.99184339314845, + "grad_norm": 0.0011202679015696049, + "learning_rate": 0.00011744486786047898, + "loss": 0.0015, + "num_input_tokens_seen": 211746864, + "step": 98030 + }, + { + "epoch": 15.992659053833606, + "grad_norm": 0.002501759212464094, + "learning_rate": 0.00011739903910050603, + "loss": 0.0014, + "num_input_tokens_seen": 211758896, + "step": 98035 + }, + { + "epoch": 15.99347471451876, + "grad_norm": 0.4698786735534668, + "learning_rate": 0.00011735321809442689, + "loss": 0.0252, + "num_input_tokens_seen": 211769968, + "step": 98040 + }, + { + "epoch": 15.994290375203915, + "grad_norm": 0.466574490070343, + "learning_rate": 0.00011730740484317021, + "loss": 0.0282, + "num_input_tokens_seen": 211782512, + "step": 98045 + }, + { + "epoch": 15.99510603588907, + "grad_norm": 0.08782917261123657, + "learning_rate": 0.00011726159934766445, + "loss": 0.0015, + "num_input_tokens_seen": 211792528, + "step": 98050 + }, + { + "epoch": 15.995921696574225, + "grad_norm": 0.0014665591297671199, + "learning_rate": 0.00011721580160883794, + "loss": 0.0961, + "num_input_tokens_seen": 211803696, + "step": 98055 + }, + { + "epoch": 15.99673735725938, + "grad_norm": 0.005211786832660437, + "learning_rate": 0.00011717001162761881, + "loss": 0.0024, + "num_input_tokens_seen": 211814128, + "step": 98060 + }, + { + "epoch": 15.997553017944535, + "grad_norm": 0.0027232381980866194, + "learning_rate": 0.000117124229404935, + "loss": 0.0036, + "num_input_tokens_seen": 211825872, + "step": 98065 + }, + { + "epoch": 15.99836867862969, + "grad_norm": 0.001201624283567071, + "learning_rate": 0.00011707845494171443, + "loss": 0.0034, + "num_input_tokens_seen": 211837392, + "step": 98070 + }, + { + "epoch": 15.999184339314844, + "grad_norm": 0.0005471862968988717, + "learning_rate": 0.00011703268823888475, + "loss": 0.0009, + "num_input_tokens_seen": 211847184, + "step": 98075 + }, + { + "epoch": 16.0, + "grad_norm": 0.0004963973187841475, + "learning_rate": 0.00011698692929737348, + "loss": 0.0007, + "num_input_tokens_seen": 211855376, + "step": 98080 + }, + { + "epoch": 16.0, + "eval_loss": 0.262260377407074, + "eval_runtime": 104.0194, + "eval_samples_per_second": 26.197, + "eval_steps_per_second": 6.556, + "num_input_tokens_seen": 211855376, + "step": 98080 + }, + { + "epoch": 16.000815660685156, + "grad_norm": 0.015522826462984085, + "learning_rate": 0.00011694117811810795, + "loss": 0.0904, + "num_input_tokens_seen": 211865616, + "step": 98085 + }, + { + "epoch": 16.00163132137031, + "grad_norm": 0.005051231477409601, + "learning_rate": 0.00011689543470201536, + "loss": 0.001, + "num_input_tokens_seen": 211877392, + "step": 98090 + }, + { + "epoch": 16.002446982055464, + "grad_norm": 0.028706401586532593, + "learning_rate": 0.00011684969905002286, + "loss": 0.002, + "num_input_tokens_seen": 211887856, + "step": 98095 + }, + { + "epoch": 16.00326264274062, + "grad_norm": 0.015213343314826488, + "learning_rate": 0.00011680397116305719, + "loss": 0.0018, + "num_input_tokens_seen": 211898256, + "step": 98100 + }, + { + "epoch": 16.004078303425775, + "grad_norm": 1.6256518363952637, + "learning_rate": 0.00011675825104204523, + "loss": 0.044, + "num_input_tokens_seen": 211909136, + "step": 98105 + }, + { + "epoch": 16.00489396411093, + "grad_norm": 0.0015888881171122193, + "learning_rate": 0.00011671253868791343, + "loss": 0.0027, + "num_input_tokens_seen": 211920208, + "step": 98110 + }, + { + "epoch": 16.005709624796086, + "grad_norm": 0.0021806336008012295, + "learning_rate": 0.00011666683410158829, + "loss": 0.0019, + "num_input_tokens_seen": 211930512, + "step": 98115 + }, + { + "epoch": 16.00652528548124, + "grad_norm": 0.24877884984016418, + "learning_rate": 0.0001166211372839961, + "loss": 0.0057, + "num_input_tokens_seen": 211941968, + "step": 98120 + }, + { + "epoch": 16.007340946166394, + "grad_norm": 0.05665234103798866, + "learning_rate": 0.00011657544823606286, + "loss": 0.012, + "num_input_tokens_seen": 211953168, + "step": 98125 + }, + { + "epoch": 16.00815660685155, + "grad_norm": 0.007977227680385113, + "learning_rate": 0.00011652976695871459, + "loss": 0.0072, + "num_input_tokens_seen": 211964432, + "step": 98130 + }, + { + "epoch": 16.008972267536706, + "grad_norm": 0.0041960496455430984, + "learning_rate": 0.00011648409345287691, + "loss": 0.0011, + "num_input_tokens_seen": 211975504, + "step": 98135 + }, + { + "epoch": 16.00978792822186, + "grad_norm": 0.0007657354581169784, + "learning_rate": 0.00011643842771947588, + "loss": 0.0075, + "num_input_tokens_seen": 211987184, + "step": 98140 + }, + { + "epoch": 16.010603588907014, + "grad_norm": 0.04990185424685478, + "learning_rate": 0.00011639276975943641, + "loss": 0.0046, + "num_input_tokens_seen": 211998224, + "step": 98145 + }, + { + "epoch": 16.01141924959217, + "grad_norm": 0.08976588398218155, + "learning_rate": 0.00011634711957368438, + "loss": 0.0022, + "num_input_tokens_seen": 212008944, + "step": 98150 + }, + { + "epoch": 16.012234910277325, + "grad_norm": 0.012806740589439869, + "learning_rate": 0.00011630147716314443, + "loss": 0.0014, + "num_input_tokens_seen": 212020112, + "step": 98155 + }, + { + "epoch": 16.01305057096248, + "grad_norm": 0.0011214031837880611, + "learning_rate": 0.00011625584252874189, + "loss": 0.0007, + "num_input_tokens_seen": 212030928, + "step": 98160 + }, + { + "epoch": 16.013866231647636, + "grad_norm": 0.7840534448623657, + "learning_rate": 0.00011621021567140156, + "loss": 0.0302, + "num_input_tokens_seen": 212043152, + "step": 98165 + }, + { + "epoch": 16.01468189233279, + "grad_norm": 0.00202510179951787, + "learning_rate": 0.00011616459659204803, + "loss": 0.0026, + "num_input_tokens_seen": 212053936, + "step": 98170 + }, + { + "epoch": 16.015497553017944, + "grad_norm": 0.00578334229066968, + "learning_rate": 0.00011611898529160591, + "loss": 0.0014, + "num_input_tokens_seen": 212064016, + "step": 98175 + }, + { + "epoch": 16.0163132137031, + "grad_norm": 0.003464588662609458, + "learning_rate": 0.00011607338177099952, + "loss": 0.0013, + "num_input_tokens_seen": 212075216, + "step": 98180 + }, + { + "epoch": 16.017128874388256, + "grad_norm": 0.007143693510442972, + "learning_rate": 0.00011602778603115311, + "loss": 0.01, + "num_input_tokens_seen": 212086512, + "step": 98185 + }, + { + "epoch": 16.017944535073408, + "grad_norm": 0.00191160524263978, + "learning_rate": 0.00011598219807299076, + "loss": 0.0045, + "num_input_tokens_seen": 212097232, + "step": 98190 + }, + { + "epoch": 16.018760195758563, + "grad_norm": 0.010454155504703522, + "learning_rate": 0.00011593661789743626, + "loss": 0.0412, + "num_input_tokens_seen": 212107696, + "step": 98195 + }, + { + "epoch": 16.01957585644372, + "grad_norm": 0.005367065314203501, + "learning_rate": 0.00011589104550541346, + "loss": 0.0029, + "num_input_tokens_seen": 212116944, + "step": 98200 + }, + { + "epoch": 16.020391517128875, + "grad_norm": 0.003508794354274869, + "learning_rate": 0.00011584548089784585, + "loss": 0.0008, + "num_input_tokens_seen": 212126608, + "step": 98205 + }, + { + "epoch": 16.02120717781403, + "grad_norm": 0.003974012564867735, + "learning_rate": 0.00011579992407565698, + "loss": 0.001, + "num_input_tokens_seen": 212137040, + "step": 98210 + }, + { + "epoch": 16.022022838499183, + "grad_norm": 0.004293251316994429, + "learning_rate": 0.00011575437503976998, + "loss": 0.0016, + "num_input_tokens_seen": 212147792, + "step": 98215 + }, + { + "epoch": 16.02283849918434, + "grad_norm": 0.03774509206414223, + "learning_rate": 0.00011570883379110803, + "loss": 0.0074, + "num_input_tokens_seen": 212158032, + "step": 98220 + }, + { + "epoch": 16.023654159869494, + "grad_norm": 0.04184706136584282, + "learning_rate": 0.00011566330033059407, + "loss": 0.0055, + "num_input_tokens_seen": 212167536, + "step": 98225 + }, + { + "epoch": 16.02446982055465, + "grad_norm": 0.005745955277234316, + "learning_rate": 0.00011561777465915091, + "loss": 0.0009, + "num_input_tokens_seen": 212177712, + "step": 98230 + }, + { + "epoch": 16.025285481239806, + "grad_norm": 0.0024114667903631926, + "learning_rate": 0.00011557225677770116, + "loss": 0.0006, + "num_input_tokens_seen": 212188624, + "step": 98235 + }, + { + "epoch": 16.026101141924958, + "grad_norm": 0.0005438401130959392, + "learning_rate": 0.00011552674668716723, + "loss": 0.0017, + "num_input_tokens_seen": 212199504, + "step": 98240 + }, + { + "epoch": 16.026916802610113, + "grad_norm": 0.007553048897534609, + "learning_rate": 0.00011548124438847174, + "loss": 0.0711, + "num_input_tokens_seen": 212209552, + "step": 98245 + }, + { + "epoch": 16.02773246329527, + "grad_norm": 0.010238065384328365, + "learning_rate": 0.0001154357498825363, + "loss": 0.1005, + "num_input_tokens_seen": 212219376, + "step": 98250 + }, + { + "epoch": 16.028548123980425, + "grad_norm": 0.30445703864097595, + "learning_rate": 0.00011539026317028361, + "loss": 0.017, + "num_input_tokens_seen": 212229968, + "step": 98255 + }, + { + "epoch": 16.02936378466558, + "grad_norm": 0.0014553911751136184, + "learning_rate": 0.00011534478425263484, + "loss": 0.0026, + "num_input_tokens_seen": 212239792, + "step": 98260 + }, + { + "epoch": 16.030179445350733, + "grad_norm": 0.009973454289138317, + "learning_rate": 0.00011529931313051222, + "loss": 0.0037, + "num_input_tokens_seen": 212251632, + "step": 98265 + }, + { + "epoch": 16.03099510603589, + "grad_norm": 0.04578957334160805, + "learning_rate": 0.00011525384980483683, + "loss": 0.1222, + "num_input_tokens_seen": 212262864, + "step": 98270 + }, + { + "epoch": 16.031810766721044, + "grad_norm": 0.002932838397100568, + "learning_rate": 0.00011520839427653052, + "loss": 0.0057, + "num_input_tokens_seen": 212273328, + "step": 98275 + }, + { + "epoch": 16.0326264274062, + "grad_norm": 0.001764679211191833, + "learning_rate": 0.00011516294654651393, + "loss": 0.0034, + "num_input_tokens_seen": 212284880, + "step": 98280 + }, + { + "epoch": 16.033442088091356, + "grad_norm": 0.003940457943826914, + "learning_rate": 0.00011511750661570875, + "loss": 0.008, + "num_input_tokens_seen": 212294800, + "step": 98285 + }, + { + "epoch": 16.034257748776508, + "grad_norm": 0.00984375923871994, + "learning_rate": 0.00011507207448503526, + "loss": 0.0023, + "num_input_tokens_seen": 212305424, + "step": 98290 + }, + { + "epoch": 16.035073409461663, + "grad_norm": 0.003804681124165654, + "learning_rate": 0.00011502665015541481, + "loss": 0.0024, + "num_input_tokens_seen": 212316336, + "step": 98295 + }, + { + "epoch": 16.03588907014682, + "grad_norm": 0.048975877463817596, + "learning_rate": 0.0001149812336277673, + "loss": 0.0059, + "num_input_tokens_seen": 212327216, + "step": 98300 + }, + { + "epoch": 16.036704730831975, + "grad_norm": 0.0003695717023219913, + "learning_rate": 0.00011493582490301374, + "loss": 0.0137, + "num_input_tokens_seen": 212338576, + "step": 98305 + }, + { + "epoch": 16.03752039151713, + "grad_norm": 0.03170023486018181, + "learning_rate": 0.00011489042398207416, + "loss": 0.0088, + "num_input_tokens_seen": 212348496, + "step": 98310 + }, + { + "epoch": 16.038336052202283, + "grad_norm": 0.005363030359148979, + "learning_rate": 0.00011484503086586867, + "loss": 0.0016, + "num_input_tokens_seen": 212360240, + "step": 98315 + }, + { + "epoch": 16.03915171288744, + "grad_norm": 0.00753002380952239, + "learning_rate": 0.00011479964555531725, + "loss": 0.0023, + "num_input_tokens_seen": 212370256, + "step": 98320 + }, + { + "epoch": 16.039967373572594, + "grad_norm": 0.017763635143637657, + "learning_rate": 0.00011475426805133965, + "loss": 0.0016, + "num_input_tokens_seen": 212382288, + "step": 98325 + }, + { + "epoch": 16.04078303425775, + "grad_norm": 0.0341855026781559, + "learning_rate": 0.00011470889835485554, + "loss": 0.0038, + "num_input_tokens_seen": 212393008, + "step": 98330 + }, + { + "epoch": 16.041598694942905, + "grad_norm": 0.0011914388742297888, + "learning_rate": 0.0001146635364667844, + "loss": 0.0016, + "num_input_tokens_seen": 212404272, + "step": 98335 + }, + { + "epoch": 16.042414355628058, + "grad_norm": 0.005368915386497974, + "learning_rate": 0.0001146181823880455, + "loss": 0.0069, + "num_input_tokens_seen": 212415888, + "step": 98340 + }, + { + "epoch": 16.043230016313213, + "grad_norm": 0.0011746870586648583, + "learning_rate": 0.00011457283611955804, + "loss": 0.1048, + "num_input_tokens_seen": 212426320, + "step": 98345 + }, + { + "epoch": 16.04404567699837, + "grad_norm": 0.0012608602410182357, + "learning_rate": 0.00011452749766224102, + "loss": 0.0018, + "num_input_tokens_seen": 212436880, + "step": 98350 + }, + { + "epoch": 16.044861337683525, + "grad_norm": 0.20264336466789246, + "learning_rate": 0.00011448216701701309, + "loss": 0.0085, + "num_input_tokens_seen": 212446832, + "step": 98355 + }, + { + "epoch": 16.045676998368677, + "grad_norm": 0.005520314909517765, + "learning_rate": 0.00011443684418479344, + "loss": 0.007, + "num_input_tokens_seen": 212457424, + "step": 98360 + }, + { + "epoch": 16.046492659053833, + "grad_norm": 0.0019661204423755407, + "learning_rate": 0.00011439152916649992, + "loss": 0.0011, + "num_input_tokens_seen": 212468848, + "step": 98365 + }, + { + "epoch": 16.04730831973899, + "grad_norm": 0.004119037184864283, + "learning_rate": 0.00011434622196305156, + "loss": 0.0018, + "num_input_tokens_seen": 212480048, + "step": 98370 + }, + { + "epoch": 16.048123980424144, + "grad_norm": 0.01271512359380722, + "learning_rate": 0.00011430092257536596, + "loss": 0.0038, + "num_input_tokens_seen": 212490928, + "step": 98375 + }, + { + "epoch": 16.0489396411093, + "grad_norm": 0.06934128701686859, + "learning_rate": 0.00011425563100436175, + "loss": 0.0044, + "num_input_tokens_seen": 212500880, + "step": 98380 + }, + { + "epoch": 16.049755301794452, + "grad_norm": 0.06471730768680573, + "learning_rate": 0.00011421034725095625, + "loss": 0.0029, + "num_input_tokens_seen": 212511824, + "step": 98385 + }, + { + "epoch": 16.050570962479608, + "grad_norm": 0.0072989496402442455, + "learning_rate": 0.00011416507131606773, + "loss": 0.0069, + "num_input_tokens_seen": 212522032, + "step": 98390 + }, + { + "epoch": 16.051386623164763, + "grad_norm": 0.057368844747543335, + "learning_rate": 0.00011411980320061322, + "loss": 0.0044, + "num_input_tokens_seen": 212532016, + "step": 98395 + }, + { + "epoch": 16.05220228384992, + "grad_norm": 0.0406746082007885, + "learning_rate": 0.00011407454290551073, + "loss": 0.041, + "num_input_tokens_seen": 212542672, + "step": 98400 + }, + { + "epoch": 16.053017944535075, + "grad_norm": 0.0061463662423193455, + "learning_rate": 0.00011402929043167692, + "loss": 0.0019, + "num_input_tokens_seen": 212553104, + "step": 98405 + }, + { + "epoch": 16.053833605220227, + "grad_norm": 0.0020612115040421486, + "learning_rate": 0.00011398404578002946, + "loss": 0.0008, + "num_input_tokens_seen": 212564752, + "step": 98410 + }, + { + "epoch": 16.054649265905383, + "grad_norm": 0.005618616472929716, + "learning_rate": 0.00011393880895148473, + "loss": 0.0019, + "num_input_tokens_seen": 212575760, + "step": 98415 + }, + { + "epoch": 16.05546492659054, + "grad_norm": 0.0006880142027512193, + "learning_rate": 0.00011389357994696003, + "loss": 0.0012, + "num_input_tokens_seen": 212587376, + "step": 98420 + }, + { + "epoch": 16.056280587275694, + "grad_norm": 0.009598582983016968, + "learning_rate": 0.00011384835876737154, + "loss": 0.0031, + "num_input_tokens_seen": 212598896, + "step": 98425 + }, + { + "epoch": 16.05709624796085, + "grad_norm": 0.008436683565378189, + "learning_rate": 0.00011380314541363612, + "loss": 0.0015, + "num_input_tokens_seen": 212610448, + "step": 98430 + }, + { + "epoch": 16.057911908646002, + "grad_norm": 0.054017916321754456, + "learning_rate": 0.00011375793988666966, + "loss": 0.0122, + "num_input_tokens_seen": 212622032, + "step": 98435 + }, + { + "epoch": 16.058727569331158, + "grad_norm": 0.004722116515040398, + "learning_rate": 0.0001137127421873888, + "loss": 0.0016, + "num_input_tokens_seen": 212633904, + "step": 98440 + }, + { + "epoch": 16.059543230016313, + "grad_norm": 0.05647740885615349, + "learning_rate": 0.000113667552316709, + "loss": 0.0043, + "num_input_tokens_seen": 212645744, + "step": 98445 + }, + { + "epoch": 16.06035889070147, + "grad_norm": 0.008270112797617912, + "learning_rate": 0.00011362237027554645, + "loss": 0.0028, + "num_input_tokens_seen": 212655600, + "step": 98450 + }, + { + "epoch": 16.061174551386625, + "grad_norm": 0.005991316866129637, + "learning_rate": 0.00011357719606481675, + "loss": 0.005, + "num_input_tokens_seen": 212667408, + "step": 98455 + }, + { + "epoch": 16.061990212071777, + "grad_norm": 0.005687070544809103, + "learning_rate": 0.00011353202968543535, + "loss": 0.0023, + "num_input_tokens_seen": 212676304, + "step": 98460 + }, + { + "epoch": 16.062805872756933, + "grad_norm": 0.0009080614545382559, + "learning_rate": 0.00011348687113831768, + "loss": 0.0088, + "num_input_tokens_seen": 212686864, + "step": 98465 + }, + { + "epoch": 16.063621533442088, + "grad_norm": 0.007313088979572058, + "learning_rate": 0.00011344172042437889, + "loss": 0.0016, + "num_input_tokens_seen": 212697328, + "step": 98470 + }, + { + "epoch": 16.064437194127244, + "grad_norm": 0.7464372515678406, + "learning_rate": 0.00011339657754453398, + "loss": 0.0857, + "num_input_tokens_seen": 212708432, + "step": 98475 + }, + { + "epoch": 16.0652528548124, + "grad_norm": 0.003813547547906637, + "learning_rate": 0.00011335144249969793, + "loss": 0.0138, + "num_input_tokens_seen": 212719216, + "step": 98480 + }, + { + "epoch": 16.06606851549755, + "grad_norm": 0.0055252122692763805, + "learning_rate": 0.00011330631529078533, + "loss": 0.0006, + "num_input_tokens_seen": 212730320, + "step": 98485 + }, + { + "epoch": 16.066884176182707, + "grad_norm": 0.038586895912885666, + "learning_rate": 0.00011326119591871087, + "loss": 0.0049, + "num_input_tokens_seen": 212741520, + "step": 98490 + }, + { + "epoch": 16.067699836867863, + "grad_norm": 0.01220710203051567, + "learning_rate": 0.00011321608438438885, + "loss": 0.0022, + "num_input_tokens_seen": 212752112, + "step": 98495 + }, + { + "epoch": 16.06851549755302, + "grad_norm": 0.005802713334560394, + "learning_rate": 0.00011317098068873339, + "loss": 0.0017, + "num_input_tokens_seen": 212763280, + "step": 98500 + }, + { + "epoch": 16.069331158238175, + "grad_norm": 0.019641386345028877, + "learning_rate": 0.000113125884832659, + "loss": 0.015, + "num_input_tokens_seen": 212775120, + "step": 98505 + }, + { + "epoch": 16.070146818923327, + "grad_norm": 0.0012488446664065123, + "learning_rate": 0.00011308079681707911, + "loss": 0.0019, + "num_input_tokens_seen": 212785904, + "step": 98510 + }, + { + "epoch": 16.070962479608482, + "grad_norm": 0.3433908224105835, + "learning_rate": 0.00011303571664290801, + "loss": 0.017, + "num_input_tokens_seen": 212797296, + "step": 98515 + }, + { + "epoch": 16.071778140293638, + "grad_norm": 0.018292676657438278, + "learning_rate": 0.0001129906443110587, + "loss": 0.0328, + "num_input_tokens_seen": 212807312, + "step": 98520 + }, + { + "epoch": 16.072593800978794, + "grad_norm": 0.00970328226685524, + "learning_rate": 0.0001129455798224452, + "loss": 0.0027, + "num_input_tokens_seen": 212817712, + "step": 98525 + }, + { + "epoch": 16.07340946166395, + "grad_norm": 0.07746391743421555, + "learning_rate": 0.00011290052317798027, + "loss": 0.0747, + "num_input_tokens_seen": 212829104, + "step": 98530 + }, + { + "epoch": 16.0742251223491, + "grad_norm": 0.020664365962147713, + "learning_rate": 0.00011285547437857763, + "loss": 0.0029, + "num_input_tokens_seen": 212839856, + "step": 98535 + }, + { + "epoch": 16.075040783034257, + "grad_norm": 0.000956006464548409, + "learning_rate": 0.00011281043342514957, + "loss": 0.0046, + "num_input_tokens_seen": 212851280, + "step": 98540 + }, + { + "epoch": 16.075856443719413, + "grad_norm": 0.005796929355710745, + "learning_rate": 0.0001127654003186096, + "loss": 0.0014, + "num_input_tokens_seen": 212861808, + "step": 98545 + }, + { + "epoch": 16.07667210440457, + "grad_norm": 0.03431297466158867, + "learning_rate": 0.00011272037505986976, + "loss": 0.0066, + "num_input_tokens_seen": 212872496, + "step": 98550 + }, + { + "epoch": 16.07748776508972, + "grad_norm": 0.011982480064034462, + "learning_rate": 0.00011267535764984293, + "loss": 0.0029, + "num_input_tokens_seen": 212882160, + "step": 98555 + }, + { + "epoch": 16.078303425774877, + "grad_norm": 0.008244593627750874, + "learning_rate": 0.00011263034808944134, + "loss": 0.0027, + "num_input_tokens_seen": 212892720, + "step": 98560 + }, + { + "epoch": 16.079119086460032, + "grad_norm": 0.013054000213742256, + "learning_rate": 0.00011258534637957718, + "loss": 0.0031, + "num_input_tokens_seen": 212904208, + "step": 98565 + }, + { + "epoch": 16.079934747145188, + "grad_norm": 0.0004884781083092093, + "learning_rate": 0.0001125403525211624, + "loss": 0.0011, + "num_input_tokens_seen": 212913232, + "step": 98570 + }, + { + "epoch": 16.080750407830344, + "grad_norm": 0.0003876937844324857, + "learning_rate": 0.00011249536651510894, + "loss": 0.0263, + "num_input_tokens_seen": 212923280, + "step": 98575 + }, + { + "epoch": 16.081566068515496, + "grad_norm": 0.052026715129613876, + "learning_rate": 0.00011245038836232846, + "loss": 0.005, + "num_input_tokens_seen": 212933392, + "step": 98580 + }, + { + "epoch": 16.08238172920065, + "grad_norm": 0.011535944417119026, + "learning_rate": 0.0001124054180637325, + "loss": 0.0011, + "num_input_tokens_seen": 212944080, + "step": 98585 + }, + { + "epoch": 16.083197389885807, + "grad_norm": 0.006972441915422678, + "learning_rate": 0.00011236045562023245, + "loss": 0.005, + "num_input_tokens_seen": 212954128, + "step": 98590 + }, + { + "epoch": 16.084013050570963, + "grad_norm": 0.016211438924074173, + "learning_rate": 0.00011231550103273952, + "loss": 0.0022, + "num_input_tokens_seen": 212964784, + "step": 98595 + }, + { + "epoch": 16.08482871125612, + "grad_norm": 0.004645606502890587, + "learning_rate": 0.00011227055430216476, + "loss": 0.0059, + "num_input_tokens_seen": 212976048, + "step": 98600 + }, + { + "epoch": 16.08564437194127, + "grad_norm": 0.005268410313874483, + "learning_rate": 0.00011222561542941906, + "loss": 0.002, + "num_input_tokens_seen": 212985712, + "step": 98605 + }, + { + "epoch": 16.086460032626427, + "grad_norm": 0.009896847419440746, + "learning_rate": 0.00011218068441541323, + "loss": 0.0014, + "num_input_tokens_seen": 212996784, + "step": 98610 + }, + { + "epoch": 16.087275693311582, + "grad_norm": 0.003218175610527396, + "learning_rate": 0.0001121357612610578, + "loss": 0.0008, + "num_input_tokens_seen": 213007440, + "step": 98615 + }, + { + "epoch": 16.088091353996738, + "grad_norm": 0.011862734332680702, + "learning_rate": 0.0001120908459672632, + "loss": 0.0013, + "num_input_tokens_seen": 213018704, + "step": 98620 + }, + { + "epoch": 16.088907014681894, + "grad_norm": 0.10315841436386108, + "learning_rate": 0.00011204593853493978, + "loss": 0.0052, + "num_input_tokens_seen": 213029584, + "step": 98625 + }, + { + "epoch": 16.089722675367046, + "grad_norm": 0.0024018334224820137, + "learning_rate": 0.00011200103896499748, + "loss": 0.0012, + "num_input_tokens_seen": 213040816, + "step": 98630 + }, + { + "epoch": 16.0905383360522, + "grad_norm": 0.0007672040374018252, + "learning_rate": 0.00011195614725834636, + "loss": 0.002, + "num_input_tokens_seen": 213050928, + "step": 98635 + }, + { + "epoch": 16.091353996737357, + "grad_norm": 0.03508285805583, + "learning_rate": 0.0001119112634158962, + "loss": 0.0061, + "num_input_tokens_seen": 213061968, + "step": 98640 + }, + { + "epoch": 16.092169657422513, + "grad_norm": 0.002200368558987975, + "learning_rate": 0.00011186638743855643, + "loss": 0.0007, + "num_input_tokens_seen": 213073456, + "step": 98645 + }, + { + "epoch": 16.09298531810767, + "grad_norm": 0.012955213896930218, + "learning_rate": 0.00011182151932723706, + "loss": 0.0059, + "num_input_tokens_seen": 213084112, + "step": 98650 + }, + { + "epoch": 16.09380097879282, + "grad_norm": 0.003678369102999568, + "learning_rate": 0.00011177665908284667, + "loss": 0.0025, + "num_input_tokens_seen": 213095664, + "step": 98655 + }, + { + "epoch": 16.094616639477977, + "grad_norm": 0.00983490701764822, + "learning_rate": 0.00011173180670629496, + "loss": 0.0021, + "num_input_tokens_seen": 213106640, + "step": 98660 + }, + { + "epoch": 16.095432300163132, + "grad_norm": 0.0047003780491650105, + "learning_rate": 0.00011168696219849078, + "loss": 0.0037, + "num_input_tokens_seen": 213117840, + "step": 98665 + }, + { + "epoch": 16.096247960848288, + "grad_norm": 0.0013329943176358938, + "learning_rate": 0.00011164212556034287, + "loss": 0.0368, + "num_input_tokens_seen": 213129424, + "step": 98670 + }, + { + "epoch": 16.097063621533444, + "grad_norm": 0.004717966075986624, + "learning_rate": 0.00011159729679275999, + "loss": 0.0038, + "num_input_tokens_seen": 213140944, + "step": 98675 + }, + { + "epoch": 16.097879282218596, + "grad_norm": 0.15023410320281982, + "learning_rate": 0.00011155247589665057, + "loss": 0.0082, + "num_input_tokens_seen": 213151248, + "step": 98680 + }, + { + "epoch": 16.09869494290375, + "grad_norm": 0.020730411633849144, + "learning_rate": 0.00011150766287292302, + "loss": 0.0035, + "num_input_tokens_seen": 213162320, + "step": 98685 + }, + { + "epoch": 16.099510603588907, + "grad_norm": 0.004016099963337183, + "learning_rate": 0.00011146285772248555, + "loss": 0.0024, + "num_input_tokens_seen": 213172432, + "step": 98690 + }, + { + "epoch": 16.100326264274063, + "grad_norm": 0.006559828761965036, + "learning_rate": 0.00011141806044624614, + "loss": 0.0014, + "num_input_tokens_seen": 213183184, + "step": 98695 + }, + { + "epoch": 16.10114192495922, + "grad_norm": 0.0008914469508454204, + "learning_rate": 0.00011137327104511268, + "loss": 0.0074, + "num_input_tokens_seen": 213193072, + "step": 98700 + }, + { + "epoch": 16.10195758564437, + "grad_norm": 0.006500248797237873, + "learning_rate": 0.00011132848951999286, + "loss": 0.0014, + "num_input_tokens_seen": 213203824, + "step": 98705 + }, + { + "epoch": 16.102773246329527, + "grad_norm": 0.004205408971756697, + "learning_rate": 0.00011128371587179431, + "loss": 0.0017, + "num_input_tokens_seen": 213214800, + "step": 98710 + }, + { + "epoch": 16.103588907014682, + "grad_norm": 0.0021865079179406166, + "learning_rate": 0.00011123895010142437, + "loss": 0.0033, + "num_input_tokens_seen": 213225488, + "step": 98715 + }, + { + "epoch": 16.104404567699838, + "grad_norm": 0.0016050647245720029, + "learning_rate": 0.00011119419220979033, + "loss": 0.0768, + "num_input_tokens_seen": 213235760, + "step": 98720 + }, + { + "epoch": 16.10522022838499, + "grad_norm": 0.01331863272935152, + "learning_rate": 0.00011114944219779916, + "loss": 0.0025, + "num_input_tokens_seen": 213246704, + "step": 98725 + }, + { + "epoch": 16.106035889070146, + "grad_norm": 0.0021768363658338785, + "learning_rate": 0.00011110470006635781, + "loss": 0.0024, + "num_input_tokens_seen": 213258672, + "step": 98730 + }, + { + "epoch": 16.1068515497553, + "grad_norm": 0.19919277727603912, + "learning_rate": 0.00011105996581637312, + "loss": 0.0032, + "num_input_tokens_seen": 213270608, + "step": 98735 + }, + { + "epoch": 16.107667210440457, + "grad_norm": 0.007927102036774158, + "learning_rate": 0.00011101523944875163, + "loss": 0.0018, + "num_input_tokens_seen": 213281360, + "step": 98740 + }, + { + "epoch": 16.108482871125613, + "grad_norm": 0.018627608194947243, + "learning_rate": 0.00011097052096439974, + "loss": 0.0013, + "num_input_tokens_seen": 213292048, + "step": 98745 + }, + { + "epoch": 16.109298531810765, + "grad_norm": 0.002585037611424923, + "learning_rate": 0.00011092581036422378, + "loss": 0.0068, + "num_input_tokens_seen": 213303280, + "step": 98750 + }, + { + "epoch": 16.11011419249592, + "grad_norm": 0.005573483649641275, + "learning_rate": 0.00011088110764912984, + "loss": 0.0034, + "num_input_tokens_seen": 213313744, + "step": 98755 + }, + { + "epoch": 16.110929853181077, + "grad_norm": 0.7131521701812744, + "learning_rate": 0.00011083641282002387, + "loss": 0.0908, + "num_input_tokens_seen": 213324592, + "step": 98760 + }, + { + "epoch": 16.111745513866232, + "grad_norm": 0.005434775725007057, + "learning_rate": 0.00011079172587781172, + "loss": 0.0012, + "num_input_tokens_seen": 213336400, + "step": 98765 + }, + { + "epoch": 16.112561174551388, + "grad_norm": 0.007682743947952986, + "learning_rate": 0.00011074704682339897, + "loss": 0.0043, + "num_input_tokens_seen": 213346000, + "step": 98770 + }, + { + "epoch": 16.11337683523654, + "grad_norm": 0.008814748376607895, + "learning_rate": 0.00011070237565769097, + "loss": 0.0025, + "num_input_tokens_seen": 213358096, + "step": 98775 + }, + { + "epoch": 16.114192495921696, + "grad_norm": 0.004235987085849047, + "learning_rate": 0.0001106577123815935, + "loss": 0.0019, + "num_input_tokens_seen": 213367408, + "step": 98780 + }, + { + "epoch": 16.11500815660685, + "grad_norm": 2.6287970542907715, + "learning_rate": 0.0001106130569960111, + "loss": 0.0368, + "num_input_tokens_seen": 213377840, + "step": 98785 + }, + { + "epoch": 16.115823817292007, + "grad_norm": 0.04276340827345848, + "learning_rate": 0.00011056840950184921, + "loss": 0.0035, + "num_input_tokens_seen": 213388144, + "step": 98790 + }, + { + "epoch": 16.116639477977163, + "grad_norm": 0.5271095633506775, + "learning_rate": 0.00011052376990001256, + "loss": 0.0459, + "num_input_tokens_seen": 213399088, + "step": 98795 + }, + { + "epoch": 16.117455138662315, + "grad_norm": 0.007122796028852463, + "learning_rate": 0.00011047913819140576, + "loss": 0.0013, + "num_input_tokens_seen": 213409840, + "step": 98800 + }, + { + "epoch": 16.11827079934747, + "grad_norm": 0.13764910399913788, + "learning_rate": 0.00011043451437693342, + "loss": 0.013, + "num_input_tokens_seen": 213420528, + "step": 98805 + }, + { + "epoch": 16.119086460032626, + "grad_norm": 0.0022578334901481867, + "learning_rate": 0.00011038989845749981, + "loss": 0.0181, + "num_input_tokens_seen": 213430800, + "step": 98810 + }, + { + "epoch": 16.119902120717782, + "grad_norm": 0.004519890993833542, + "learning_rate": 0.00011034529043400915, + "loss": 0.0032, + "num_input_tokens_seen": 213441936, + "step": 98815 + }, + { + "epoch": 16.120717781402938, + "grad_norm": 0.0013229832984507084, + "learning_rate": 0.00011030069030736551, + "loss": 0.0023, + "num_input_tokens_seen": 213452688, + "step": 98820 + }, + { + "epoch": 16.12153344208809, + "grad_norm": 0.004833935294300318, + "learning_rate": 0.0001102560980784727, + "loss": 0.0035, + "num_input_tokens_seen": 213463312, + "step": 98825 + }, + { + "epoch": 16.122349102773246, + "grad_norm": 0.06656540185213089, + "learning_rate": 0.00011021151374823457, + "loss": 0.0899, + "num_input_tokens_seen": 213474192, + "step": 98830 + }, + { + "epoch": 16.1231647634584, + "grad_norm": 0.028403708711266518, + "learning_rate": 0.00011016693731755456, + "loss": 0.0032, + "num_input_tokens_seen": 213485232, + "step": 98835 + }, + { + "epoch": 16.123980424143557, + "grad_norm": 0.01657683216035366, + "learning_rate": 0.00011012236878733606, + "loss": 0.0035, + "num_input_tokens_seen": 213496048, + "step": 98840 + }, + { + "epoch": 16.124796084828713, + "grad_norm": 0.009340647608041763, + "learning_rate": 0.00011007780815848239, + "loss": 0.0103, + "num_input_tokens_seen": 213507472, + "step": 98845 + }, + { + "epoch": 16.125611745513865, + "grad_norm": 0.0011907644802704453, + "learning_rate": 0.00011003325543189663, + "loss": 0.0055, + "num_input_tokens_seen": 213517520, + "step": 98850 + }, + { + "epoch": 16.12642740619902, + "grad_norm": 0.0006321436958387494, + "learning_rate": 0.0001099887106084816, + "loss": 0.0009, + "num_input_tokens_seen": 213528336, + "step": 98855 + }, + { + "epoch": 16.127243066884176, + "grad_norm": 0.0071949586272239685, + "learning_rate": 0.00010994417368914011, + "loss": 0.0026, + "num_input_tokens_seen": 213539856, + "step": 98860 + }, + { + "epoch": 16.128058727569332, + "grad_norm": 0.0012756186770275235, + "learning_rate": 0.00010989964467477481, + "loss": 0.0006, + "num_input_tokens_seen": 213550448, + "step": 98865 + }, + { + "epoch": 16.128874388254488, + "grad_norm": 0.002935679629445076, + "learning_rate": 0.00010985512356628807, + "loss": 0.0026, + "num_input_tokens_seen": 213562192, + "step": 98870 + }, + { + "epoch": 16.12969004893964, + "grad_norm": 0.006117780692875385, + "learning_rate": 0.00010981061036458218, + "loss": 0.002, + "num_input_tokens_seen": 213573360, + "step": 98875 + }, + { + "epoch": 16.130505709624796, + "grad_norm": 0.029337430372834206, + "learning_rate": 0.00010976610507055906, + "loss": 0.0021, + "num_input_tokens_seen": 213584112, + "step": 98880 + }, + { + "epoch": 16.13132137030995, + "grad_norm": 0.14752079546451569, + "learning_rate": 0.00010972160768512123, + "loss": 0.0049, + "num_input_tokens_seen": 213595216, + "step": 98885 + }, + { + "epoch": 16.132137030995107, + "grad_norm": 0.004511414561420679, + "learning_rate": 0.00010967711820916982, + "loss": 0.1444, + "num_input_tokens_seen": 213606160, + "step": 98890 + }, + { + "epoch": 16.13295269168026, + "grad_norm": 0.0020912738982588053, + "learning_rate": 0.00010963263664360706, + "loss": 0.108, + "num_input_tokens_seen": 213615984, + "step": 98895 + }, + { + "epoch": 16.133768352365415, + "grad_norm": 0.03671063110232353, + "learning_rate": 0.00010958816298933383, + "loss": 0.0032, + "num_input_tokens_seen": 213627440, + "step": 98900 + }, + { + "epoch": 16.13458401305057, + "grad_norm": 0.0017622812883928418, + "learning_rate": 0.00010954369724725205, + "loss": 0.0024, + "num_input_tokens_seen": 213638384, + "step": 98905 + }, + { + "epoch": 16.135399673735726, + "grad_norm": 0.11492667347192764, + "learning_rate": 0.00010949923941826229, + "loss": 0.0057, + "num_input_tokens_seen": 213649040, + "step": 98910 + }, + { + "epoch": 16.136215334420882, + "grad_norm": 0.012901815585792065, + "learning_rate": 0.0001094547895032661, + "loss": 0.001, + "num_input_tokens_seen": 213658256, + "step": 98915 + }, + { + "epoch": 16.137030995106034, + "grad_norm": 0.014208734035491943, + "learning_rate": 0.00010941034750316375, + "loss": 0.003, + "num_input_tokens_seen": 213668560, + "step": 98920 + }, + { + "epoch": 16.13784665579119, + "grad_norm": 0.0005539971170946956, + "learning_rate": 0.00010936591341885648, + "loss": 0.0036, + "num_input_tokens_seen": 213678896, + "step": 98925 + }, + { + "epoch": 16.138662316476346, + "grad_norm": 0.2483779489994049, + "learning_rate": 0.0001093214872512443, + "loss": 0.0058, + "num_input_tokens_seen": 213689712, + "step": 98930 + }, + { + "epoch": 16.1394779771615, + "grad_norm": 0.023973651230335236, + "learning_rate": 0.00010927706900122791, + "loss": 0.0031, + "num_input_tokens_seen": 213699920, + "step": 98935 + }, + { + "epoch": 16.140293637846657, + "grad_norm": 0.018499545753002167, + "learning_rate": 0.00010923265866970739, + "loss": 0.0085, + "num_input_tokens_seen": 213710864, + "step": 98940 + }, + { + "epoch": 16.14110929853181, + "grad_norm": 0.0017729535466060042, + "learning_rate": 0.00010918825625758273, + "loss": 0.0044, + "num_input_tokens_seen": 213721200, + "step": 98945 + }, + { + "epoch": 16.141924959216965, + "grad_norm": 0.001103546703234315, + "learning_rate": 0.00010914386176575386, + "loss": 0.0074, + "num_input_tokens_seen": 213732752, + "step": 98950 + }, + { + "epoch": 16.14274061990212, + "grad_norm": 0.0006307591684162617, + "learning_rate": 0.00010909947519512048, + "loss": 0.0062, + "num_input_tokens_seen": 213743056, + "step": 98955 + }, + { + "epoch": 16.143556280587276, + "grad_norm": 0.0002562662702985108, + "learning_rate": 0.00010905509654658208, + "loss": 0.0016, + "num_input_tokens_seen": 213753584, + "step": 98960 + }, + { + "epoch": 16.144371941272432, + "grad_norm": 0.09816617518663406, + "learning_rate": 0.00010901072582103816, + "loss": 0.0754, + "num_input_tokens_seen": 213762992, + "step": 98965 + }, + { + "epoch": 16.145187601957584, + "grad_norm": 0.022168634459376335, + "learning_rate": 0.00010896636301938784, + "loss": 0.0032, + "num_input_tokens_seen": 213773072, + "step": 98970 + }, + { + "epoch": 16.14600326264274, + "grad_norm": 0.03289058804512024, + "learning_rate": 0.00010892200814253023, + "loss": 0.0017, + "num_input_tokens_seen": 213784816, + "step": 98975 + }, + { + "epoch": 16.146818923327896, + "grad_norm": 0.0013062867801636457, + "learning_rate": 0.00010887766119136427, + "loss": 0.0015, + "num_input_tokens_seen": 213795440, + "step": 98980 + }, + { + "epoch": 16.14763458401305, + "grad_norm": 0.3104763627052307, + "learning_rate": 0.00010883332216678853, + "loss": 0.0055, + "num_input_tokens_seen": 213807088, + "step": 98985 + }, + { + "epoch": 16.148450244698207, + "grad_norm": 0.005590509623289108, + "learning_rate": 0.00010878899106970203, + "loss": 0.0013, + "num_input_tokens_seen": 213818832, + "step": 98990 + }, + { + "epoch": 16.14926590538336, + "grad_norm": 0.5460083484649658, + "learning_rate": 0.00010874466790100268, + "loss": 0.0888, + "num_input_tokens_seen": 213829264, + "step": 98995 + }, + { + "epoch": 16.150081566068515, + "grad_norm": 0.027004707604646683, + "learning_rate": 0.00010870035266158918, + "loss": 0.003, + "num_input_tokens_seen": 213839792, + "step": 99000 + }, + { + "epoch": 16.15089722675367, + "grad_norm": 0.0023864214308559895, + "learning_rate": 0.00010865604535235918, + "loss": 0.0017, + "num_input_tokens_seen": 213850736, + "step": 99005 + }, + { + "epoch": 16.151712887438826, + "grad_norm": 0.020365918055176735, + "learning_rate": 0.0001086117459742112, + "loss": 0.0013, + "num_input_tokens_seen": 213862704, + "step": 99010 + }, + { + "epoch": 16.152528548123982, + "grad_norm": 0.029816294088959694, + "learning_rate": 0.00010856745452804234, + "loss": 0.047, + "num_input_tokens_seen": 213873296, + "step": 99015 + }, + { + "epoch": 16.153344208809134, + "grad_norm": 0.0036337687633931637, + "learning_rate": 0.0001085231710147509, + "loss": 0.0026, + "num_input_tokens_seen": 213884560, + "step": 99020 + }, + { + "epoch": 16.15415986949429, + "grad_norm": 0.005663975607603788, + "learning_rate": 0.00010847889543523376, + "loss": 0.0053, + "num_input_tokens_seen": 213894928, + "step": 99025 + }, + { + "epoch": 16.154975530179446, + "grad_norm": 0.001540932571515441, + "learning_rate": 0.00010843462779038876, + "loss": 0.0009, + "num_input_tokens_seen": 213905904, + "step": 99030 + }, + { + "epoch": 16.1557911908646, + "grad_norm": 0.008301756344735622, + "learning_rate": 0.00010839036808111246, + "loss": 0.0046, + "num_input_tokens_seen": 213915952, + "step": 99035 + }, + { + "epoch": 16.156606851549757, + "grad_norm": 0.009943228214979172, + "learning_rate": 0.00010834611630830244, + "loss": 0.0047, + "num_input_tokens_seen": 213927248, + "step": 99040 + }, + { + "epoch": 16.15742251223491, + "grad_norm": 0.01648455671966076, + "learning_rate": 0.00010830187247285489, + "loss": 0.0082, + "num_input_tokens_seen": 213938736, + "step": 99045 + }, + { + "epoch": 16.158238172920065, + "grad_norm": 0.0015992774860933423, + "learning_rate": 0.00010825763657566717, + "loss": 0.0024, + "num_input_tokens_seen": 213949840, + "step": 99050 + }, + { + "epoch": 16.15905383360522, + "grad_norm": 0.0430905781686306, + "learning_rate": 0.00010821340861763506, + "loss": 0.0041, + "num_input_tokens_seen": 213960112, + "step": 99055 + }, + { + "epoch": 16.159869494290376, + "grad_norm": 0.010663102380931377, + "learning_rate": 0.00010816918859965552, + "loss": 0.0016, + "num_input_tokens_seen": 213970416, + "step": 99060 + }, + { + "epoch": 16.160685154975532, + "grad_norm": 0.06955873966217041, + "learning_rate": 0.00010812497652262421, + "loss": 0.0065, + "num_input_tokens_seen": 213981264, + "step": 99065 + }, + { + "epoch": 16.161500815660684, + "grad_norm": 0.01041333470493555, + "learning_rate": 0.00010808077238743763, + "loss": 0.0023, + "num_input_tokens_seen": 213991888, + "step": 99070 + }, + { + "epoch": 16.16231647634584, + "grad_norm": 0.03263779357075691, + "learning_rate": 0.00010803657619499107, + "loss": 0.0015, + "num_input_tokens_seen": 214003600, + "step": 99075 + }, + { + "epoch": 16.163132137030995, + "grad_norm": 0.017389433458447456, + "learning_rate": 0.00010799238794618077, + "loss": 0.0039, + "num_input_tokens_seen": 214013808, + "step": 99080 + }, + { + "epoch": 16.16394779771615, + "grad_norm": 0.00429139519110322, + "learning_rate": 0.00010794820764190194, + "loss": 0.0013, + "num_input_tokens_seen": 214024624, + "step": 99085 + }, + { + "epoch": 16.164763458401303, + "grad_norm": 0.034365396946668625, + "learning_rate": 0.00010790403528305004, + "loss": 0.0044, + "num_input_tokens_seen": 214035376, + "step": 99090 + }, + { + "epoch": 16.16557911908646, + "grad_norm": 0.023165516555309296, + "learning_rate": 0.0001078598708705203, + "loss": 0.0413, + "num_input_tokens_seen": 214045104, + "step": 99095 + }, + { + "epoch": 16.166394779771615, + "grad_norm": 0.0035238233394920826, + "learning_rate": 0.00010781571440520777, + "loss": 0.001, + "num_input_tokens_seen": 214055536, + "step": 99100 + }, + { + "epoch": 16.16721044045677, + "grad_norm": 0.0005931582418270409, + "learning_rate": 0.00010777156588800724, + "loss": 0.0009, + "num_input_tokens_seen": 214065680, + "step": 99105 + }, + { + "epoch": 16.168026101141926, + "grad_norm": 0.07158027589321136, + "learning_rate": 0.00010772742531981356, + "loss": 0.0012, + "num_input_tokens_seen": 214074800, + "step": 99110 + }, + { + "epoch": 16.16884176182708, + "grad_norm": 0.0070646717213094234, + "learning_rate": 0.00010768329270152122, + "loss": 0.001, + "num_input_tokens_seen": 214085648, + "step": 99115 + }, + { + "epoch": 16.169657422512234, + "grad_norm": 0.007711863610893488, + "learning_rate": 0.00010763916803402463, + "loss": 0.0053, + "num_input_tokens_seen": 214095952, + "step": 99120 + }, + { + "epoch": 16.17047308319739, + "grad_norm": 0.0007553675677627325, + "learning_rate": 0.00010759505131821806, + "loss": 0.004, + "num_input_tokens_seen": 214107472, + "step": 99125 + }, + { + "epoch": 16.171288743882545, + "grad_norm": 0.017188599333167076, + "learning_rate": 0.00010755094255499542, + "loss": 0.0038, + "num_input_tokens_seen": 214120144, + "step": 99130 + }, + { + "epoch": 16.1721044045677, + "grad_norm": 0.01340003963559866, + "learning_rate": 0.00010750684174525111, + "loss": 0.0032, + "num_input_tokens_seen": 214131120, + "step": 99135 + }, + { + "epoch": 16.172920065252853, + "grad_norm": 0.009431690908968449, + "learning_rate": 0.00010746274888987822, + "loss": 0.0011, + "num_input_tokens_seen": 214141616, + "step": 99140 + }, + { + "epoch": 16.17373572593801, + "grad_norm": 0.062283746898174286, + "learning_rate": 0.00010741866398977101, + "loss": 0.0047, + "num_input_tokens_seen": 214152784, + "step": 99145 + }, + { + "epoch": 16.174551386623165, + "grad_norm": 0.009358714334666729, + "learning_rate": 0.00010737458704582232, + "loss": 0.0009, + "num_input_tokens_seen": 214163952, + "step": 99150 + }, + { + "epoch": 16.17536704730832, + "grad_norm": 0.013387867249548435, + "learning_rate": 0.00010733051805892602, + "loss": 0.0024, + "num_input_tokens_seen": 214173680, + "step": 99155 + }, + { + "epoch": 16.176182707993476, + "grad_norm": 0.00021368158922996372, + "learning_rate": 0.00010728645702997458, + "loss": 0.0011, + "num_input_tokens_seen": 214184912, + "step": 99160 + }, + { + "epoch": 16.17699836867863, + "grad_norm": 0.012210970744490623, + "learning_rate": 0.00010724240395986156, + "loss": 0.0225, + "num_input_tokens_seen": 214195152, + "step": 99165 + }, + { + "epoch": 16.177814029363784, + "grad_norm": 0.0026052501052618027, + "learning_rate": 0.00010719835884947921, + "loss": 0.0007, + "num_input_tokens_seen": 214205104, + "step": 99170 + }, + { + "epoch": 16.17862969004894, + "grad_norm": 0.0005025434657000005, + "learning_rate": 0.00010715432169972067, + "loss": 0.0549, + "num_input_tokens_seen": 214215184, + "step": 99175 + }, + { + "epoch": 16.179445350734095, + "grad_norm": 0.000986822065897286, + "learning_rate": 0.00010711029251147791, + "loss": 0.0007, + "num_input_tokens_seen": 214226896, + "step": 99180 + }, + { + "epoch": 16.18026101141925, + "grad_norm": 0.018340054899454117, + "learning_rate": 0.00010706627128564378, + "loss": 0.0014, + "num_input_tokens_seen": 214238672, + "step": 99185 + }, + { + "epoch": 16.181076672104403, + "grad_norm": 0.009344175457954407, + "learning_rate": 0.00010702225802310983, + "loss": 0.0025, + "num_input_tokens_seen": 214248432, + "step": 99190 + }, + { + "epoch": 16.18189233278956, + "grad_norm": 0.00024389364989474416, + "learning_rate": 0.00010697825272476847, + "loss": 0.0015, + "num_input_tokens_seen": 214259376, + "step": 99195 + }, + { + "epoch": 16.182707993474715, + "grad_norm": 0.0008997777476906776, + "learning_rate": 0.00010693425539151141, + "loss": 0.0005, + "num_input_tokens_seen": 214270256, + "step": 99200 + }, + { + "epoch": 16.18352365415987, + "grad_norm": 0.003580469638109207, + "learning_rate": 0.00010689026602423036, + "loss": 0.0013, + "num_input_tokens_seen": 214280624, + "step": 99205 + }, + { + "epoch": 16.184339314845026, + "grad_norm": 0.013014066033065319, + "learning_rate": 0.00010684628462381673, + "loss": 0.0023, + "num_input_tokens_seen": 214293232, + "step": 99210 + }, + { + "epoch": 16.18515497553018, + "grad_norm": 0.0009328118176199496, + "learning_rate": 0.00010680231119116185, + "loss": 0.0022, + "num_input_tokens_seen": 214302192, + "step": 99215 + }, + { + "epoch": 16.185970636215334, + "grad_norm": 0.00644258176907897, + "learning_rate": 0.00010675834572715698, + "loss": 0.0013, + "num_input_tokens_seen": 214313264, + "step": 99220 + }, + { + "epoch": 16.18678629690049, + "grad_norm": 0.029108474031090736, + "learning_rate": 0.00010671438823269314, + "loss": 0.0137, + "num_input_tokens_seen": 214323504, + "step": 99225 + }, + { + "epoch": 16.187601957585645, + "grad_norm": 0.045794326812028885, + "learning_rate": 0.00010667043870866105, + "loss": 0.0027, + "num_input_tokens_seen": 214332688, + "step": 99230 + }, + { + "epoch": 16.1884176182708, + "grad_norm": 0.0035296755377203226, + "learning_rate": 0.00010662649715595157, + "loss": 0.0025, + "num_input_tokens_seen": 214343728, + "step": 99235 + }, + { + "epoch": 16.189233278955953, + "grad_norm": 0.015425696969032288, + "learning_rate": 0.00010658256357545509, + "loss": 0.036, + "num_input_tokens_seen": 214354160, + "step": 99240 + }, + { + "epoch": 16.19004893964111, + "grad_norm": 0.0035507178399711847, + "learning_rate": 0.00010653863796806213, + "loss": 0.0011, + "num_input_tokens_seen": 214365232, + "step": 99245 + }, + { + "epoch": 16.190864600326265, + "grad_norm": 0.0019442274933680892, + "learning_rate": 0.00010649472033466273, + "loss": 0.0018, + "num_input_tokens_seen": 214377616, + "step": 99250 + }, + { + "epoch": 16.19168026101142, + "grad_norm": 0.0013061447534710169, + "learning_rate": 0.00010645081067614703, + "loss": 0.0015, + "num_input_tokens_seen": 214387856, + "step": 99255 + }, + { + "epoch": 16.192495921696572, + "grad_norm": 0.0017848755232989788, + "learning_rate": 0.00010640690899340494, + "loss": 0.0713, + "num_input_tokens_seen": 214398352, + "step": 99260 + }, + { + "epoch": 16.193311582381728, + "grad_norm": 0.0009831542847678065, + "learning_rate": 0.00010636301528732612, + "loss": 0.0012, + "num_input_tokens_seen": 214409520, + "step": 99265 + }, + { + "epoch": 16.194127243066884, + "grad_norm": 0.00133727234788239, + "learning_rate": 0.00010631912955880018, + "loss": 0.0013, + "num_input_tokens_seen": 214420400, + "step": 99270 + }, + { + "epoch": 16.19494290375204, + "grad_norm": 0.0031839951407164335, + "learning_rate": 0.00010627525180871633, + "loss": 0.0046, + "num_input_tokens_seen": 214431664, + "step": 99275 + }, + { + "epoch": 16.195758564437195, + "grad_norm": 0.00220080791041255, + "learning_rate": 0.00010623138203796429, + "loss": 0.0027, + "num_input_tokens_seen": 214441808, + "step": 99280 + }, + { + "epoch": 16.196574225122347, + "grad_norm": 0.032900307327508926, + "learning_rate": 0.00010618752024743255, + "loss": 0.0039, + "num_input_tokens_seen": 214453680, + "step": 99285 + }, + { + "epoch": 16.197389885807503, + "grad_norm": 0.0010238890536129475, + "learning_rate": 0.00010614366643801055, + "loss": 0.0055, + "num_input_tokens_seen": 214464176, + "step": 99290 + }, + { + "epoch": 16.19820554649266, + "grad_norm": 0.009197982028126717, + "learning_rate": 0.00010609982061058654, + "loss": 0.0081, + "num_input_tokens_seen": 214474000, + "step": 99295 + }, + { + "epoch": 16.199021207177815, + "grad_norm": 0.5025539398193359, + "learning_rate": 0.0001060559827660495, + "loss": 0.0729, + "num_input_tokens_seen": 214485328, + "step": 99300 + }, + { + "epoch": 16.19983686786297, + "grad_norm": 0.0005666270735673606, + "learning_rate": 0.0001060121529052877, + "loss": 0.001, + "num_input_tokens_seen": 214496080, + "step": 99305 + }, + { + "epoch": 16.200652528548122, + "grad_norm": 0.005442335736006498, + "learning_rate": 0.0001059683310291894, + "loss": 0.0013, + "num_input_tokens_seen": 214507216, + "step": 99310 + }, + { + "epoch": 16.201468189233278, + "grad_norm": 0.0619342066347599, + "learning_rate": 0.00010592451713864282, + "loss": 0.0031, + "num_input_tokens_seen": 214517648, + "step": 99315 + }, + { + "epoch": 16.202283849918434, + "grad_norm": 0.0022380822338163853, + "learning_rate": 0.00010588071123453574, + "loss": 0.0033, + "num_input_tokens_seen": 214528816, + "step": 99320 + }, + { + "epoch": 16.20309951060359, + "grad_norm": 0.12963344156742096, + "learning_rate": 0.00010583691331775608, + "loss": 0.0072, + "num_input_tokens_seen": 214539568, + "step": 99325 + }, + { + "epoch": 16.203915171288745, + "grad_norm": 0.008783910423517227, + "learning_rate": 0.0001057931233891914, + "loss": 0.0029, + "num_input_tokens_seen": 214551920, + "step": 99330 + }, + { + "epoch": 16.204730831973897, + "grad_norm": 0.005578738637268543, + "learning_rate": 0.00010574934144972908, + "loss": 0.0104, + "num_input_tokens_seen": 214562768, + "step": 99335 + }, + { + "epoch": 16.205546492659053, + "grad_norm": 0.0020929095335304737, + "learning_rate": 0.00010570556750025656, + "loss": 0.0014, + "num_input_tokens_seen": 214573520, + "step": 99340 + }, + { + "epoch": 16.20636215334421, + "grad_norm": 0.012415740638971329, + "learning_rate": 0.00010566180154166094, + "loss": 0.0031, + "num_input_tokens_seen": 214584336, + "step": 99345 + }, + { + "epoch": 16.207177814029365, + "grad_norm": 0.002348710782825947, + "learning_rate": 0.00010561804357482912, + "loss": 0.002, + "num_input_tokens_seen": 214594288, + "step": 99350 + }, + { + "epoch": 16.20799347471452, + "grad_norm": 0.011959983967244625, + "learning_rate": 0.00010557429360064796, + "loss": 0.0026, + "num_input_tokens_seen": 214605712, + "step": 99355 + }, + { + "epoch": 16.208809135399672, + "grad_norm": 0.04903974384069443, + "learning_rate": 0.00010553055162000414, + "loss": 0.0033, + "num_input_tokens_seen": 214616112, + "step": 99360 + }, + { + "epoch": 16.209624796084828, + "grad_norm": 0.001221096026711166, + "learning_rate": 0.0001054868176337841, + "loss": 0.0012, + "num_input_tokens_seen": 214626448, + "step": 99365 + }, + { + "epoch": 16.210440456769984, + "grad_norm": 0.008052774704992771, + "learning_rate": 0.00010544309164287418, + "loss": 0.0021, + "num_input_tokens_seen": 214636112, + "step": 99370 + }, + { + "epoch": 16.21125611745514, + "grad_norm": 0.051362331956624985, + "learning_rate": 0.00010539937364816049, + "loss": 0.0023, + "num_input_tokens_seen": 214646864, + "step": 99375 + }, + { + "epoch": 16.212071778140295, + "grad_norm": 0.0010595549829304218, + "learning_rate": 0.00010535566365052913, + "loss": 0.0026, + "num_input_tokens_seen": 214658192, + "step": 99380 + }, + { + "epoch": 16.212887438825447, + "grad_norm": 0.003621351206675172, + "learning_rate": 0.00010531196165086587, + "loss": 0.0025, + "num_input_tokens_seen": 214668976, + "step": 99385 + }, + { + "epoch": 16.213703099510603, + "grad_norm": 0.01673658937215805, + "learning_rate": 0.00010526826765005642, + "loss": 0.0016, + "num_input_tokens_seen": 214679728, + "step": 99390 + }, + { + "epoch": 16.21451876019576, + "grad_norm": 0.0043097264133393764, + "learning_rate": 0.00010522458164898624, + "loss": 0.0014, + "num_input_tokens_seen": 214691408, + "step": 99395 + }, + { + "epoch": 16.215334420880914, + "grad_norm": 0.00409234594553709, + "learning_rate": 0.00010518090364854077, + "loss": 0.0015, + "num_input_tokens_seen": 214702256, + "step": 99400 + }, + { + "epoch": 16.21615008156607, + "grad_norm": 0.00013403875345829874, + "learning_rate": 0.00010513723364960497, + "loss": 0.0004, + "num_input_tokens_seen": 214713456, + "step": 99405 + }, + { + "epoch": 16.216965742251222, + "grad_norm": 0.00076089589856565, + "learning_rate": 0.00010509357165306422, + "loss": 0.0016, + "num_input_tokens_seen": 214723664, + "step": 99410 + }, + { + "epoch": 16.217781402936378, + "grad_norm": 0.0016013638814911246, + "learning_rate": 0.00010504991765980321, + "loss": 0.008, + "num_input_tokens_seen": 214735792, + "step": 99415 + }, + { + "epoch": 16.218597063621534, + "grad_norm": 0.009542165324091911, + "learning_rate": 0.00010500627167070665, + "loss": 0.0018, + "num_input_tokens_seen": 214748272, + "step": 99420 + }, + { + "epoch": 16.21941272430669, + "grad_norm": 0.002033479744568467, + "learning_rate": 0.00010496263368665904, + "loss": 0.0011, + "num_input_tokens_seen": 214760112, + "step": 99425 + }, + { + "epoch": 16.22022838499184, + "grad_norm": 0.00081063894322142, + "learning_rate": 0.00010491900370854484, + "loss": 0.0045, + "num_input_tokens_seen": 214769712, + "step": 99430 + }, + { + "epoch": 16.221044045676997, + "grad_norm": 0.46691176295280457, + "learning_rate": 0.0001048753817372482, + "loss": 0.103, + "num_input_tokens_seen": 214781168, + "step": 99435 + }, + { + "epoch": 16.221859706362153, + "grad_norm": 0.017485421150922775, + "learning_rate": 0.00010483176777365322, + "loss": 0.0022, + "num_input_tokens_seen": 214793040, + "step": 99440 + }, + { + "epoch": 16.22267536704731, + "grad_norm": 0.008777834475040436, + "learning_rate": 0.00010478816181864376, + "loss": 0.0054, + "num_input_tokens_seen": 214804304, + "step": 99445 + }, + { + "epoch": 16.223491027732464, + "grad_norm": 0.003942748997360468, + "learning_rate": 0.0001047445638731036, + "loss": 0.0016, + "num_input_tokens_seen": 214814512, + "step": 99450 + }, + { + "epoch": 16.224306688417617, + "grad_norm": 0.027318790555000305, + "learning_rate": 0.00010470097393791622, + "loss": 0.0019, + "num_input_tokens_seen": 214824912, + "step": 99455 + }, + { + "epoch": 16.225122349102772, + "grad_norm": 0.030543800443410873, + "learning_rate": 0.00010465739201396512, + "loss": 0.0013, + "num_input_tokens_seen": 214835344, + "step": 99460 + }, + { + "epoch": 16.225938009787928, + "grad_norm": 0.000653235474601388, + "learning_rate": 0.00010461381810213344, + "loss": 0.0008, + "num_input_tokens_seen": 214846704, + "step": 99465 + }, + { + "epoch": 16.226753670473084, + "grad_norm": 0.008265483193099499, + "learning_rate": 0.00010457025220330435, + "loss": 0.0011, + "num_input_tokens_seen": 214857936, + "step": 99470 + }, + { + "epoch": 16.22756933115824, + "grad_norm": 0.0002337160549359396, + "learning_rate": 0.00010452669431836076, + "loss": 0.0115, + "num_input_tokens_seen": 214868208, + "step": 99475 + }, + { + "epoch": 16.22838499184339, + "grad_norm": 0.0013019460020586848, + "learning_rate": 0.00010448314444818541, + "loss": 0.0032, + "num_input_tokens_seen": 214878128, + "step": 99480 + }, + { + "epoch": 16.229200652528547, + "grad_norm": 0.002835572464391589, + "learning_rate": 0.00010443960259366081, + "loss": 0.0006, + "num_input_tokens_seen": 214887600, + "step": 99485 + }, + { + "epoch": 16.230016313213703, + "grad_norm": 0.0008782123913988471, + "learning_rate": 0.00010439606875566954, + "loss": 0.0017, + "num_input_tokens_seen": 214898992, + "step": 99490 + }, + { + "epoch": 16.23083197389886, + "grad_norm": 0.011612669564783573, + "learning_rate": 0.00010435254293509378, + "loss": 0.0016, + "num_input_tokens_seen": 214911024, + "step": 99495 + }, + { + "epoch": 16.231647634584014, + "grad_norm": 0.012568632140755653, + "learning_rate": 0.00010430902513281565, + "loss": 0.0015, + "num_input_tokens_seen": 214923376, + "step": 99500 + }, + { + "epoch": 16.232463295269167, + "grad_norm": 0.0033671578858047724, + "learning_rate": 0.00010426551534971706, + "loss": 0.0012, + "num_input_tokens_seen": 214934672, + "step": 99505 + }, + { + "epoch": 16.233278955954322, + "grad_norm": 0.008715744130313396, + "learning_rate": 0.00010422201358667987, + "loss": 0.0006, + "num_input_tokens_seen": 214946032, + "step": 99510 + }, + { + "epoch": 16.234094616639478, + "grad_norm": 0.0662357434630394, + "learning_rate": 0.00010417851984458565, + "loss": 0.001, + "num_input_tokens_seen": 214958096, + "step": 99515 + }, + { + "epoch": 16.234910277324634, + "grad_norm": 0.008330133743584156, + "learning_rate": 0.00010413503412431568, + "loss": 0.001, + "num_input_tokens_seen": 214970000, + "step": 99520 + }, + { + "epoch": 16.23572593800979, + "grad_norm": 0.001419969368726015, + "learning_rate": 0.00010409155642675178, + "loss": 0.0055, + "num_input_tokens_seen": 214980144, + "step": 99525 + }, + { + "epoch": 16.23654159869494, + "grad_norm": 0.002042067004367709, + "learning_rate": 0.00010404808675277444, + "loss": 0.0012, + "num_input_tokens_seen": 214990800, + "step": 99530 + }, + { + "epoch": 16.237357259380097, + "grad_norm": 0.008127299137413502, + "learning_rate": 0.00010400462510326513, + "loss": 0.0012, + "num_input_tokens_seen": 215000816, + "step": 99535 + }, + { + "epoch": 16.238172920065253, + "grad_norm": 0.0008488766034133732, + "learning_rate": 0.00010396117147910422, + "loss": 0.0023, + "num_input_tokens_seen": 215011632, + "step": 99540 + }, + { + "epoch": 16.23898858075041, + "grad_norm": 0.08584143966436386, + "learning_rate": 0.00010391772588117288, + "loss": 0.0028, + "num_input_tokens_seen": 215023504, + "step": 99545 + }, + { + "epoch": 16.239804241435564, + "grad_norm": 0.004345927853137255, + "learning_rate": 0.000103874288310351, + "loss": 0.0026, + "num_input_tokens_seen": 215033552, + "step": 99550 + }, + { + "epoch": 16.240619902120716, + "grad_norm": 0.40933969616889954, + "learning_rate": 0.0001038308587675193, + "loss": 0.0147, + "num_input_tokens_seen": 215044496, + "step": 99555 + }, + { + "epoch": 16.241435562805872, + "grad_norm": 0.0008070006733760238, + "learning_rate": 0.00010378743725355788, + "loss": 0.0006, + "num_input_tokens_seen": 215053776, + "step": 99560 + }, + { + "epoch": 16.242251223491028, + "grad_norm": 0.00782832596451044, + "learning_rate": 0.00010374402376934661, + "loss": 0.196, + "num_input_tokens_seen": 215064688, + "step": 99565 + }, + { + "epoch": 16.243066884176184, + "grad_norm": 0.0021462365984916687, + "learning_rate": 0.00010370061831576544, + "loss": 0.0457, + "num_input_tokens_seen": 215074576, + "step": 99570 + }, + { + "epoch": 16.24388254486134, + "grad_norm": 0.004718319047242403, + "learning_rate": 0.00010365722089369395, + "loss": 0.0008, + "num_input_tokens_seen": 215085744, + "step": 99575 + }, + { + "epoch": 16.24469820554649, + "grad_norm": 0.02573045901954174, + "learning_rate": 0.00010361383150401165, + "loss": 0.0048, + "num_input_tokens_seen": 215096432, + "step": 99580 + }, + { + "epoch": 16.245513866231647, + "grad_norm": 0.020224103704094887, + "learning_rate": 0.00010357045014759797, + "loss": 0.001, + "num_input_tokens_seen": 215107056, + "step": 99585 + }, + { + "epoch": 16.246329526916803, + "grad_norm": 0.011003488674759865, + "learning_rate": 0.00010352707682533197, + "loss": 0.0019, + "num_input_tokens_seen": 215118736, + "step": 99590 + }, + { + "epoch": 16.24714518760196, + "grad_norm": 0.0029920844826847315, + "learning_rate": 0.00010348371153809277, + "loss": 0.0007, + "num_input_tokens_seen": 215129680, + "step": 99595 + }, + { + "epoch": 16.247960848287114, + "grad_norm": 0.0012058455031365156, + "learning_rate": 0.00010344035428675914, + "loss": 0.0011, + "num_input_tokens_seen": 215140560, + "step": 99600 + }, + { + "epoch": 16.248776508972266, + "grad_norm": 0.0012687857961282134, + "learning_rate": 0.00010339700507220978, + "loss": 0.0112, + "num_input_tokens_seen": 215151984, + "step": 99605 + }, + { + "epoch": 16.249592169657422, + "grad_norm": 0.0020511329639703035, + "learning_rate": 0.0001033536638953233, + "loss": 0.0008, + "num_input_tokens_seen": 215163632, + "step": 99610 + }, + { + "epoch": 16.250407830342578, + "grad_norm": 0.02142617478966713, + "learning_rate": 0.00010331033075697793, + "loss": 0.0037, + "num_input_tokens_seen": 215174096, + "step": 99615 + }, + { + "epoch": 16.251223491027734, + "grad_norm": 0.007285781670361757, + "learning_rate": 0.00010326700565805197, + "loss": 0.0025, + "num_input_tokens_seen": 215185008, + "step": 99620 + }, + { + "epoch": 16.252039151712886, + "grad_norm": 0.0007665912853553891, + "learning_rate": 0.00010322368859942333, + "loss": 0.0049, + "num_input_tokens_seen": 215196464, + "step": 99625 + }, + { + "epoch": 16.25285481239804, + "grad_norm": 0.003922312520444393, + "learning_rate": 0.00010318037958197024, + "loss": 0.0016, + "num_input_tokens_seen": 215207536, + "step": 99630 + }, + { + "epoch": 16.253670473083197, + "grad_norm": 0.0018488741479814053, + "learning_rate": 0.0001031370786065699, + "loss": 0.0008, + "num_input_tokens_seen": 215216432, + "step": 99635 + }, + { + "epoch": 16.254486133768353, + "grad_norm": 0.0002045558503596112, + "learning_rate": 0.00010309378567410039, + "loss": 0.0011, + "num_input_tokens_seen": 215226384, + "step": 99640 + }, + { + "epoch": 16.25530179445351, + "grad_norm": 0.09213671088218689, + "learning_rate": 0.00010305050078543848, + "loss": 0.0045, + "num_input_tokens_seen": 215236752, + "step": 99645 + }, + { + "epoch": 16.25611745513866, + "grad_norm": 0.0008904563146643341, + "learning_rate": 0.00010300722394146212, + "loss": 0.0016, + "num_input_tokens_seen": 215246608, + "step": 99650 + }, + { + "epoch": 16.256933115823816, + "grad_norm": 0.007827579975128174, + "learning_rate": 0.00010296395514304763, + "loss": 0.0011, + "num_input_tokens_seen": 215257072, + "step": 99655 + }, + { + "epoch": 16.257748776508972, + "grad_norm": 0.0644720047712326, + "learning_rate": 0.00010292069439107254, + "loss": 0.003, + "num_input_tokens_seen": 215267120, + "step": 99660 + }, + { + "epoch": 16.258564437194128, + "grad_norm": 0.0004442843492142856, + "learning_rate": 0.00010287744168641311, + "loss": 0.0025, + "num_input_tokens_seen": 215277424, + "step": 99665 + }, + { + "epoch": 16.259380097879284, + "grad_norm": 0.0010822077747434378, + "learning_rate": 0.00010283419702994634, + "loss": 0.0021, + "num_input_tokens_seen": 215288112, + "step": 99670 + }, + { + "epoch": 16.260195758564436, + "grad_norm": 0.10656613856554031, + "learning_rate": 0.0001027909604225481, + "loss": 0.0023, + "num_input_tokens_seen": 215297776, + "step": 99675 + }, + { + "epoch": 16.26101141924959, + "grad_norm": 0.013543715700507164, + "learning_rate": 0.00010274773186509528, + "loss": 0.0018, + "num_input_tokens_seen": 215307280, + "step": 99680 + }, + { + "epoch": 16.261827079934747, + "grad_norm": 0.00781995989382267, + "learning_rate": 0.00010270451135846332, + "loss": 0.0007, + "num_input_tokens_seen": 215316880, + "step": 99685 + }, + { + "epoch": 16.262642740619903, + "grad_norm": 0.0015151110710576177, + "learning_rate": 0.00010266129890352872, + "loss": 0.0034, + "num_input_tokens_seen": 215328528, + "step": 99690 + }, + { + "epoch": 16.26345840130506, + "grad_norm": 0.0037482441402971745, + "learning_rate": 0.00010261809450116666, + "loss": 0.0048, + "num_input_tokens_seen": 215340720, + "step": 99695 + }, + { + "epoch": 16.26427406199021, + "grad_norm": 0.5841511487960815, + "learning_rate": 0.00010257489815225318, + "loss": 0.1694, + "num_input_tokens_seen": 215352624, + "step": 99700 + }, + { + "epoch": 16.265089722675366, + "grad_norm": 0.006499356124550104, + "learning_rate": 0.00010253170985766357, + "loss": 0.0012, + "num_input_tokens_seen": 215362480, + "step": 99705 + }, + { + "epoch": 16.265905383360522, + "grad_norm": 0.0037484378553926945, + "learning_rate": 0.00010248852961827309, + "loss": 0.0023, + "num_input_tokens_seen": 215372752, + "step": 99710 + }, + { + "epoch": 16.266721044045678, + "grad_norm": 0.011288094334304333, + "learning_rate": 0.00010244535743495681, + "loss": 0.0025, + "num_input_tokens_seen": 215383600, + "step": 99715 + }, + { + "epoch": 16.267536704730833, + "grad_norm": 0.008340914733707905, + "learning_rate": 0.00010240219330858969, + "loss": 0.0654, + "num_input_tokens_seen": 215393296, + "step": 99720 + }, + { + "epoch": 16.268352365415986, + "grad_norm": 0.055706895887851715, + "learning_rate": 0.00010235903724004652, + "loss": 0.0107, + "num_input_tokens_seen": 215402384, + "step": 99725 + }, + { + "epoch": 16.26916802610114, + "grad_norm": 0.02399086207151413, + "learning_rate": 0.00010231588923020196, + "loss": 0.0013, + "num_input_tokens_seen": 215413392, + "step": 99730 + }, + { + "epoch": 16.269983686786297, + "grad_norm": 0.014908327721059322, + "learning_rate": 0.00010227274927993035, + "loss": 0.0008, + "num_input_tokens_seen": 215424656, + "step": 99735 + }, + { + "epoch": 16.270799347471453, + "grad_norm": 0.008819537237286568, + "learning_rate": 0.000102229617390106, + "loss": 0.0035, + "num_input_tokens_seen": 215436272, + "step": 99740 + }, + { + "epoch": 16.27161500815661, + "grad_norm": 0.0023069872986525297, + "learning_rate": 0.00010218649356160314, + "loss": 0.0006, + "num_input_tokens_seen": 215447120, + "step": 99745 + }, + { + "epoch": 16.27243066884176, + "grad_norm": 0.007820419035851955, + "learning_rate": 0.00010214337779529548, + "loss": 0.0046, + "num_input_tokens_seen": 215457232, + "step": 99750 + }, + { + "epoch": 16.273246329526916, + "grad_norm": 0.0034470772370696068, + "learning_rate": 0.00010210027009205719, + "loss": 0.0013, + "num_input_tokens_seen": 215465840, + "step": 99755 + }, + { + "epoch": 16.274061990212072, + "grad_norm": 0.3569754660129547, + "learning_rate": 0.00010205717045276153, + "loss": 0.0156, + "num_input_tokens_seen": 215476272, + "step": 99760 + }, + { + "epoch": 16.274877650897228, + "grad_norm": 0.002355807228013873, + "learning_rate": 0.00010201407887828234, + "loss": 0.0064, + "num_input_tokens_seen": 215486128, + "step": 99765 + }, + { + "epoch": 16.275693311582383, + "grad_norm": 0.0006279262597672641, + "learning_rate": 0.0001019709953694925, + "loss": 0.0008, + "num_input_tokens_seen": 215496976, + "step": 99770 + }, + { + "epoch": 16.276508972267536, + "grad_norm": 0.000803434697445482, + "learning_rate": 0.00010192791992726558, + "loss": 0.0029, + "num_input_tokens_seen": 215506544, + "step": 99775 + }, + { + "epoch": 16.27732463295269, + "grad_norm": 0.0010221730917692184, + "learning_rate": 0.00010188485255247415, + "loss": 0.0012, + "num_input_tokens_seen": 215518384, + "step": 99780 + }, + { + "epoch": 16.278140293637847, + "grad_norm": 0.0014653302496299148, + "learning_rate": 0.00010184179324599147, + "loss": 0.0035, + "num_input_tokens_seen": 215527664, + "step": 99785 + }, + { + "epoch": 16.278955954323003, + "grad_norm": 0.0004814395506400615, + "learning_rate": 0.00010179874200868966, + "loss": 0.0053, + "num_input_tokens_seen": 215538640, + "step": 99790 + }, + { + "epoch": 16.27977161500816, + "grad_norm": 0.0009255227050743997, + "learning_rate": 0.00010175569884144182, + "loss": 0.0007, + "num_input_tokens_seen": 215549552, + "step": 99795 + }, + { + "epoch": 16.28058727569331, + "grad_norm": 0.0268571674823761, + "learning_rate": 0.00010171266374511962, + "loss": 0.0723, + "num_input_tokens_seen": 215561168, + "step": 99800 + }, + { + "epoch": 16.281402936378466, + "grad_norm": 0.006670699920505285, + "learning_rate": 0.00010166963672059588, + "loss": 0.0011, + "num_input_tokens_seen": 215573488, + "step": 99805 + }, + { + "epoch": 16.282218597063622, + "grad_norm": 0.0005371726001612842, + "learning_rate": 0.00010162661776874193, + "loss": 0.0012, + "num_input_tokens_seen": 215583536, + "step": 99810 + }, + { + "epoch": 16.283034257748778, + "grad_norm": 0.001128238276578486, + "learning_rate": 0.00010158360689043028, + "loss": 0.0213, + "num_input_tokens_seen": 215594384, + "step": 99815 + }, + { + "epoch": 16.28384991843393, + "grad_norm": 0.0018089942168444395, + "learning_rate": 0.00010154060408653198, + "loss": 0.0025, + "num_input_tokens_seen": 215604336, + "step": 99820 + }, + { + "epoch": 16.284665579119086, + "grad_norm": 0.0002369862631894648, + "learning_rate": 0.00010149760935791907, + "loss": 0.0004, + "num_input_tokens_seen": 215615984, + "step": 99825 + }, + { + "epoch": 16.28548123980424, + "grad_norm": 0.02218855917453766, + "learning_rate": 0.00010145462270546241, + "loss": 0.0025, + "num_input_tokens_seen": 215626576, + "step": 99830 + }, + { + "epoch": 16.286296900489397, + "grad_norm": 0.0026553068310022354, + "learning_rate": 0.00010141164413003351, + "loss": 0.0009, + "num_input_tokens_seen": 215636720, + "step": 99835 + }, + { + "epoch": 16.287112561174553, + "grad_norm": 0.013349421322345734, + "learning_rate": 0.00010136867363250329, + "loss": 0.002, + "num_input_tokens_seen": 215647152, + "step": 99840 + }, + { + "epoch": 16.287928221859705, + "grad_norm": 0.0031846666242927313, + "learning_rate": 0.00010132571121374257, + "loss": 0.0022, + "num_input_tokens_seen": 215657392, + "step": 99845 + }, + { + "epoch": 16.28874388254486, + "grad_norm": 0.002197221852838993, + "learning_rate": 0.00010128275687462212, + "loss": 0.1308, + "num_input_tokens_seen": 215669776, + "step": 99850 + }, + { + "epoch": 16.289559543230016, + "grad_norm": 0.0004804141935892403, + "learning_rate": 0.0001012398106160124, + "loss": 0.003, + "num_input_tokens_seen": 215679440, + "step": 99855 + }, + { + "epoch": 16.290375203915172, + "grad_norm": 0.004510819911956787, + "learning_rate": 0.00010119687243878379, + "loss": 0.034, + "num_input_tokens_seen": 215690000, + "step": 99860 + }, + { + "epoch": 16.291190864600328, + "grad_norm": 0.022695958614349365, + "learning_rate": 0.00010115394234380642, + "loss": 0.0013, + "num_input_tokens_seen": 215701808, + "step": 99865 + }, + { + "epoch": 16.29200652528548, + "grad_norm": 0.007637821137905121, + "learning_rate": 0.00010111102033195041, + "loss": 0.0124, + "num_input_tokens_seen": 215713200, + "step": 99870 + }, + { + "epoch": 16.292822185970635, + "grad_norm": 0.007616210263222456, + "learning_rate": 0.00010106810640408564, + "loss": 0.0028, + "num_input_tokens_seen": 215724208, + "step": 99875 + }, + { + "epoch": 16.29363784665579, + "grad_norm": 0.0030214902944862843, + "learning_rate": 0.00010102520056108172, + "loss": 0.0015, + "num_input_tokens_seen": 215735792, + "step": 99880 + }, + { + "epoch": 16.294453507340947, + "grad_norm": 0.026455482468008995, + "learning_rate": 0.00010098230280380826, + "loss": 0.003, + "num_input_tokens_seen": 215747024, + "step": 99885 + }, + { + "epoch": 16.295269168026103, + "grad_norm": 0.02323424257338047, + "learning_rate": 0.00010093941313313465, + "loss": 0.002, + "num_input_tokens_seen": 215756432, + "step": 99890 + }, + { + "epoch": 16.296084828711255, + "grad_norm": 0.020892612636089325, + "learning_rate": 0.00010089653154992994, + "loss": 0.0013, + "num_input_tokens_seen": 215766224, + "step": 99895 + }, + { + "epoch": 16.29690048939641, + "grad_norm": 0.0010717104887589812, + "learning_rate": 0.00010085365805506358, + "loss": 0.0027, + "num_input_tokens_seen": 215776944, + "step": 99900 + }, + { + "epoch": 16.297716150081566, + "grad_norm": 0.008391788229346275, + "learning_rate": 0.00010081079264940391, + "loss": 0.0614, + "num_input_tokens_seen": 215788432, + "step": 99905 + }, + { + "epoch": 16.298531810766722, + "grad_norm": 0.03581464663147926, + "learning_rate": 0.00010076793533382022, + "loss": 0.0028, + "num_input_tokens_seen": 215799600, + "step": 99910 + }, + { + "epoch": 16.299347471451878, + "grad_norm": 0.025882374495267868, + "learning_rate": 0.00010072508610918046, + "loss": 0.002, + "num_input_tokens_seen": 215811184, + "step": 99915 + }, + { + "epoch": 16.30016313213703, + "grad_norm": 0.31241723895072937, + "learning_rate": 0.00010068224497635369, + "loss": 0.0057, + "num_input_tokens_seen": 215822320, + "step": 99920 + }, + { + "epoch": 16.300978792822185, + "grad_norm": 0.043358415365219116, + "learning_rate": 0.00010063941193620751, + "loss": 0.0023, + "num_input_tokens_seen": 215833808, + "step": 99925 + }, + { + "epoch": 16.30179445350734, + "grad_norm": 0.003072767984122038, + "learning_rate": 0.0001005965869896105, + "loss": 0.0016, + "num_input_tokens_seen": 215845392, + "step": 99930 + }, + { + "epoch": 16.302610114192497, + "grad_norm": 0.031429387629032135, + "learning_rate": 0.00010055377013743012, + "loss": 0.0018, + "num_input_tokens_seen": 215855312, + "step": 99935 + }, + { + "epoch": 16.303425774877653, + "grad_norm": 0.0011349570704624057, + "learning_rate": 0.0001005109613805344, + "loss": 0.0035, + "num_input_tokens_seen": 215866224, + "step": 99940 + }, + { + "epoch": 16.304241435562805, + "grad_norm": 0.0012783968122676015, + "learning_rate": 0.00010046816071979087, + "loss": 0.002, + "num_input_tokens_seen": 215877424, + "step": 99945 + }, + { + "epoch": 16.30505709624796, + "grad_norm": 0.00905763078480959, + "learning_rate": 0.0001004253681560669, + "loss": 0.0014, + "num_input_tokens_seen": 215889488, + "step": 99950 + }, + { + "epoch": 16.305872756933116, + "grad_norm": 0.04101025313138962, + "learning_rate": 0.00010038258369022974, + "loss": 0.0225, + "num_input_tokens_seen": 215899280, + "step": 99955 + }, + { + "epoch": 16.306688417618272, + "grad_norm": 0.00823147501796484, + "learning_rate": 0.00010033980732314646, + "loss": 0.0009, + "num_input_tokens_seen": 215909648, + "step": 99960 + }, + { + "epoch": 16.307504078303428, + "grad_norm": 0.0012991069816052914, + "learning_rate": 0.00010029703905568399, + "loss": 0.0017, + "num_input_tokens_seen": 215920720, + "step": 99965 + }, + { + "epoch": 16.30831973898858, + "grad_norm": 0.008023583330214024, + "learning_rate": 0.00010025427888870909, + "loss": 0.001, + "num_input_tokens_seen": 215931120, + "step": 99970 + }, + { + "epoch": 16.309135399673735, + "grad_norm": 0.014352011494338512, + "learning_rate": 0.00010021152682308837, + "loss": 0.0121, + "num_input_tokens_seen": 215941680, + "step": 99975 + }, + { + "epoch": 16.30995106035889, + "grad_norm": 0.0030116417910903692, + "learning_rate": 0.00010016878285968816, + "loss": 0.001, + "num_input_tokens_seen": 215951152, + "step": 99980 + }, + { + "epoch": 16.310766721044047, + "grad_norm": 0.02265646867454052, + "learning_rate": 0.00010012604699937483, + "loss": 0.0031, + "num_input_tokens_seen": 215962544, + "step": 99985 + }, + { + "epoch": 16.3115823817292, + "grad_norm": 0.0016602237010374665, + "learning_rate": 0.00010008331924301445, + "loss": 0.0066, + "num_input_tokens_seen": 215972848, + "step": 99990 + }, + { + "epoch": 16.312398042414355, + "grad_norm": 0.022008635103702545, + "learning_rate": 0.00010004059959147293, + "loss": 0.001, + "num_input_tokens_seen": 215983824, + "step": 99995 + }, + { + "epoch": 16.31321370309951, + "grad_norm": 0.004182157106697559, + "learning_rate": 9.999788804561605e-05, + "loss": 0.001, + "num_input_tokens_seen": 215994512, + "step": 100000 + }, + { + "epoch": 16.314029363784666, + "grad_norm": 0.0018245892133563757, + "learning_rate": 9.995518460630937e-05, + "loss": 0.0033, + "num_input_tokens_seen": 216004976, + "step": 100005 + }, + { + "epoch": 16.31484502446982, + "grad_norm": 0.008289888501167297, + "learning_rate": 9.991248927441837e-05, + "loss": 0.0016, + "num_input_tokens_seen": 216016176, + "step": 100010 + }, + { + "epoch": 16.315660685154974, + "grad_norm": 0.001752890762872994, + "learning_rate": 9.986980205080837e-05, + "loss": 0.0025, + "num_input_tokens_seen": 216026480, + "step": 100015 + }, + { + "epoch": 16.31647634584013, + "grad_norm": 0.00022865060600452125, + "learning_rate": 9.982712293634438e-05, + "loss": 0.0101, + "num_input_tokens_seen": 216036336, + "step": 100020 + }, + { + "epoch": 16.317292006525285, + "grad_norm": 0.00168028159532696, + "learning_rate": 9.97844519318914e-05, + "loss": 0.0014, + "num_input_tokens_seen": 216046416, + "step": 100025 + }, + { + "epoch": 16.31810766721044, + "grad_norm": 0.0004960019723512232, + "learning_rate": 9.974178903831427e-05, + "loss": 0.0019, + "num_input_tokens_seen": 216057776, + "step": 100030 + }, + { + "epoch": 16.318923327895597, + "grad_norm": 0.003168502589687705, + "learning_rate": 9.969913425647747e-05, + "loss": 0.0031, + "num_input_tokens_seen": 216068560, + "step": 100035 + }, + { + "epoch": 16.31973898858075, + "grad_norm": 0.007589966524392366, + "learning_rate": 9.965648758724544e-05, + "loss": 0.0034, + "num_input_tokens_seen": 216079056, + "step": 100040 + }, + { + "epoch": 16.320554649265905, + "grad_norm": 0.00240236334502697, + "learning_rate": 9.961384903148269e-05, + "loss": 0.0015, + "num_input_tokens_seen": 216091728, + "step": 100045 + }, + { + "epoch": 16.32137030995106, + "grad_norm": 0.0010267647448927164, + "learning_rate": 9.957121859005324e-05, + "loss": 0.0009, + "num_input_tokens_seen": 216102448, + "step": 100050 + }, + { + "epoch": 16.322185970636216, + "grad_norm": 0.0273845586925745, + "learning_rate": 9.952859626382099e-05, + "loss": 0.0021, + "num_input_tokens_seen": 216114768, + "step": 100055 + }, + { + "epoch": 16.32300163132137, + "grad_norm": 0.00038628673064522445, + "learning_rate": 9.948598205364979e-05, + "loss": 0.0091, + "num_input_tokens_seen": 216125200, + "step": 100060 + }, + { + "epoch": 16.323817292006524, + "grad_norm": 0.0002461184049025178, + "learning_rate": 9.944337596040326e-05, + "loss": 0.0013, + "num_input_tokens_seen": 216134992, + "step": 100065 + }, + { + "epoch": 16.32463295269168, + "grad_norm": 0.0075553716160357, + "learning_rate": 9.940077798494485e-05, + "loss": 0.0012, + "num_input_tokens_seen": 216146544, + "step": 100070 + }, + { + "epoch": 16.325448613376835, + "grad_norm": 0.026246318593621254, + "learning_rate": 9.935818812813784e-05, + "loss": 0.0019, + "num_input_tokens_seen": 216156816, + "step": 100075 + }, + { + "epoch": 16.32626427406199, + "grad_norm": 0.005259836558252573, + "learning_rate": 9.931560639084541e-05, + "loss": 0.0011, + "num_input_tokens_seen": 216167600, + "step": 100080 + }, + { + "epoch": 16.327079934747147, + "grad_norm": 0.0017907143337652087, + "learning_rate": 9.927303277393051e-05, + "loss": 0.0053, + "num_input_tokens_seen": 216178096, + "step": 100085 + }, + { + "epoch": 16.3278955954323, + "grad_norm": 0.021919501945376396, + "learning_rate": 9.923046727825602e-05, + "loss": 0.0019, + "num_input_tokens_seen": 216187952, + "step": 100090 + }, + { + "epoch": 16.328711256117455, + "grad_norm": 0.06038171425461769, + "learning_rate": 9.918790990468446e-05, + "loss": 0.0047, + "num_input_tokens_seen": 216197712, + "step": 100095 + }, + { + "epoch": 16.32952691680261, + "grad_norm": 0.01833273656666279, + "learning_rate": 9.914536065407842e-05, + "loss": 0.0442, + "num_input_tokens_seen": 216209392, + "step": 100100 + }, + { + "epoch": 16.330342577487766, + "grad_norm": 0.0007771208183839917, + "learning_rate": 9.910281952730011e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216221136, + "step": 100105 + }, + { + "epoch": 16.33115823817292, + "grad_norm": 0.02714325487613678, + "learning_rate": 9.906028652521176e-05, + "loss": 0.0012, + "num_input_tokens_seen": 216231152, + "step": 100110 + }, + { + "epoch": 16.331973898858074, + "grad_norm": 0.00896800123155117, + "learning_rate": 9.901776164867538e-05, + "loss": 0.0014, + "num_input_tokens_seen": 216241488, + "step": 100115 + }, + { + "epoch": 16.33278955954323, + "grad_norm": 0.0016855057328939438, + "learning_rate": 9.89752448985527e-05, + "loss": 0.0026, + "num_input_tokens_seen": 216252400, + "step": 100120 + }, + { + "epoch": 16.333605220228385, + "grad_norm": 0.0003753203200176358, + "learning_rate": 9.893273627570542e-05, + "loss": 0.0022, + "num_input_tokens_seen": 216262320, + "step": 100125 + }, + { + "epoch": 16.33442088091354, + "grad_norm": 0.010976849123835564, + "learning_rate": 9.889023578099504e-05, + "loss": 0.001, + "num_input_tokens_seen": 216272912, + "step": 100130 + }, + { + "epoch": 16.335236541598697, + "grad_norm": 0.012816797941923141, + "learning_rate": 9.884774341528285e-05, + "loss": 0.001, + "num_input_tokens_seen": 216282928, + "step": 100135 + }, + { + "epoch": 16.33605220228385, + "grad_norm": 0.5955064296722412, + "learning_rate": 9.880525917943006e-05, + "loss": 0.0952, + "num_input_tokens_seen": 216293648, + "step": 100140 + }, + { + "epoch": 16.336867862969005, + "grad_norm": 0.03133242577314377, + "learning_rate": 9.876278307429764e-05, + "loss": 0.0027, + "num_input_tokens_seen": 216304240, + "step": 100145 + }, + { + "epoch": 16.33768352365416, + "grad_norm": 0.0009458880522288382, + "learning_rate": 9.872031510074625e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216313904, + "step": 100150 + }, + { + "epoch": 16.338499184339316, + "grad_norm": 0.03488912805914879, + "learning_rate": 9.867785525963707e-05, + "loss": 0.0011, + "num_input_tokens_seen": 216324176, + "step": 100155 + }, + { + "epoch": 16.339314845024468, + "grad_norm": 0.010793568566441536, + "learning_rate": 9.863540355182998e-05, + "loss": 0.0022, + "num_input_tokens_seen": 216334640, + "step": 100160 + }, + { + "epoch": 16.340130505709624, + "grad_norm": 0.032283537089824677, + "learning_rate": 9.859295997818585e-05, + "loss": 0.0082, + "num_input_tokens_seen": 216344912, + "step": 100165 + }, + { + "epoch": 16.34094616639478, + "grad_norm": 0.0003030995430890471, + "learning_rate": 9.855052453956437e-05, + "loss": 0.0009, + "num_input_tokens_seen": 216355760, + "step": 100170 + }, + { + "epoch": 16.341761827079935, + "grad_norm": 0.0007913812878541648, + "learning_rate": 9.850809723682603e-05, + "loss": 0.0007, + "num_input_tokens_seen": 216367760, + "step": 100175 + }, + { + "epoch": 16.34257748776509, + "grad_norm": 0.0012517020804807544, + "learning_rate": 9.846567807083018e-05, + "loss": 0.0011, + "num_input_tokens_seen": 216379600, + "step": 100180 + }, + { + "epoch": 16.343393148450243, + "grad_norm": 0.0006271583843044937, + "learning_rate": 9.842326704243682e-05, + "loss": 0.0011, + "num_input_tokens_seen": 216391856, + "step": 100185 + }, + { + "epoch": 16.3442088091354, + "grad_norm": 0.0003550504916347563, + "learning_rate": 9.838086415250547e-05, + "loss": 0.0101, + "num_input_tokens_seen": 216402512, + "step": 100190 + }, + { + "epoch": 16.345024469820554, + "grad_norm": 0.00036078577977605164, + "learning_rate": 9.833846940189533e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216414960, + "step": 100195 + }, + { + "epoch": 16.34584013050571, + "grad_norm": 0.005881108809262514, + "learning_rate": 9.829608279146568e-05, + "loss": 0.0012, + "num_input_tokens_seen": 216425904, + "step": 100200 + }, + { + "epoch": 16.346655791190866, + "grad_norm": 0.0021825244184583426, + "learning_rate": 9.825370432207554e-05, + "loss": 0.0013, + "num_input_tokens_seen": 216436656, + "step": 100205 + }, + { + "epoch": 16.347471451876018, + "grad_norm": 0.06234239786863327, + "learning_rate": 9.821133399458371e-05, + "loss": 0.0021, + "num_input_tokens_seen": 216446800, + "step": 100210 + }, + { + "epoch": 16.348287112561174, + "grad_norm": 0.008905709721148014, + "learning_rate": 9.81689718098489e-05, + "loss": 0.013, + "num_input_tokens_seen": 216457264, + "step": 100215 + }, + { + "epoch": 16.34910277324633, + "grad_norm": 0.558327853679657, + "learning_rate": 9.81266177687296e-05, + "loss": 0.1895, + "num_input_tokens_seen": 216467664, + "step": 100220 + }, + { + "epoch": 16.349918433931485, + "grad_norm": 0.0004579754895530641, + "learning_rate": 9.808427187208424e-05, + "loss": 0.003, + "num_input_tokens_seen": 216477936, + "step": 100225 + }, + { + "epoch": 16.35073409461664, + "grad_norm": 0.0029174680821597576, + "learning_rate": 9.8041934120771e-05, + "loss": 0.0014, + "num_input_tokens_seen": 216489488, + "step": 100230 + }, + { + "epoch": 16.351549755301793, + "grad_norm": 0.00225959368981421, + "learning_rate": 9.799960451564787e-05, + "loss": 0.0026, + "num_input_tokens_seen": 216498992, + "step": 100235 + }, + { + "epoch": 16.35236541598695, + "grad_norm": 0.0002525453455746174, + "learning_rate": 9.795728305757267e-05, + "loss": 0.002, + "num_input_tokens_seen": 216509936, + "step": 100240 + }, + { + "epoch": 16.353181076672104, + "grad_norm": 0.004050148651003838, + "learning_rate": 9.791496974740321e-05, + "loss": 0.0037, + "num_input_tokens_seen": 216521328, + "step": 100245 + }, + { + "epoch": 16.35399673735726, + "grad_norm": 0.000871855765581131, + "learning_rate": 9.787266458599697e-05, + "loss": 0.0021, + "num_input_tokens_seen": 216532592, + "step": 100250 + }, + { + "epoch": 16.354812398042416, + "grad_norm": 0.0029846071265637875, + "learning_rate": 9.783036757421132e-05, + "loss": 0.0008, + "num_input_tokens_seen": 216544432, + "step": 100255 + }, + { + "epoch": 16.355628058727568, + "grad_norm": 0.004262133967131376, + "learning_rate": 9.778807871290346e-05, + "loss": 0.0008, + "num_input_tokens_seen": 216556240, + "step": 100260 + }, + { + "epoch": 16.356443719412724, + "grad_norm": 0.001604230608791113, + "learning_rate": 9.774579800293026e-05, + "loss": 0.0014, + "num_input_tokens_seen": 216568464, + "step": 100265 + }, + { + "epoch": 16.35725938009788, + "grad_norm": 0.016766056418418884, + "learning_rate": 9.770352544514904e-05, + "loss": 0.0019, + "num_input_tokens_seen": 216580272, + "step": 100270 + }, + { + "epoch": 16.358075040783035, + "grad_norm": 0.0005941848503425717, + "learning_rate": 9.766126104041601e-05, + "loss": 0.001, + "num_input_tokens_seen": 216590800, + "step": 100275 + }, + { + "epoch": 16.35889070146819, + "grad_norm": 0.0018064542673528194, + "learning_rate": 9.761900478958813e-05, + "loss": 0.0012, + "num_input_tokens_seen": 216601648, + "step": 100280 + }, + { + "epoch": 16.359706362153343, + "grad_norm": 0.0018726956332102418, + "learning_rate": 9.757675669352133e-05, + "loss": 0.0008, + "num_input_tokens_seen": 216611600, + "step": 100285 + }, + { + "epoch": 16.3605220228385, + "grad_norm": 0.005994674749672413, + "learning_rate": 9.753451675307234e-05, + "loss": 0.0005, + "num_input_tokens_seen": 216622224, + "step": 100290 + }, + { + "epoch": 16.361337683523654, + "grad_norm": 0.0016363600734621286, + "learning_rate": 9.749228496909668e-05, + "loss": 0.0003, + "num_input_tokens_seen": 216633648, + "step": 100295 + }, + { + "epoch": 16.36215334420881, + "grad_norm": 0.0008571971557103097, + "learning_rate": 9.745006134245072e-05, + "loss": 0.0023, + "num_input_tokens_seen": 216644656, + "step": 100300 + }, + { + "epoch": 16.362969004893966, + "grad_norm": 0.2415996789932251, + "learning_rate": 9.740784587398965e-05, + "loss": 0.0177, + "num_input_tokens_seen": 216656176, + "step": 100305 + }, + { + "epoch": 16.363784665579118, + "grad_norm": 0.017015738412737846, + "learning_rate": 9.736563856456959e-05, + "loss": 0.0018, + "num_input_tokens_seen": 216666064, + "step": 100310 + }, + { + "epoch": 16.364600326264274, + "grad_norm": 0.0016154218465089798, + "learning_rate": 9.73234394150454e-05, + "loss": 0.003, + "num_input_tokens_seen": 216677136, + "step": 100315 + }, + { + "epoch": 16.36541598694943, + "grad_norm": 0.0032554895151406527, + "learning_rate": 9.728124842627278e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216689456, + "step": 100320 + }, + { + "epoch": 16.366231647634585, + "grad_norm": 0.09540196508169174, + "learning_rate": 9.723906559910634e-05, + "loss": 0.002, + "num_input_tokens_seen": 216701392, + "step": 100325 + }, + { + "epoch": 16.36704730831974, + "grad_norm": 0.0013432763516902924, + "learning_rate": 9.719689093440126e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216709264, + "step": 100330 + }, + { + "epoch": 16.367862969004893, + "grad_norm": 0.003039369825273752, + "learning_rate": 9.715472443301215e-05, + "loss": 0.0007, + "num_input_tokens_seen": 216720912, + "step": 100335 + }, + { + "epoch": 16.36867862969005, + "grad_norm": 0.014389106072485447, + "learning_rate": 9.711256609579367e-05, + "loss": 0.0013, + "num_input_tokens_seen": 216732848, + "step": 100340 + }, + { + "epoch": 16.369494290375204, + "grad_norm": 0.07118832319974899, + "learning_rate": 9.707041592360005e-05, + "loss": 0.0018, + "num_input_tokens_seen": 216743440, + "step": 100345 + }, + { + "epoch": 16.37030995106036, + "grad_norm": 0.0002033188648056239, + "learning_rate": 9.702827391728564e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216754320, + "step": 100350 + }, + { + "epoch": 16.371125611745512, + "grad_norm": 0.00933680310845375, + "learning_rate": 9.69861400777045e-05, + "loss": 0.0015, + "num_input_tokens_seen": 216764720, + "step": 100355 + }, + { + "epoch": 16.371941272430668, + "grad_norm": 0.0018059660214930773, + "learning_rate": 9.694401440571043e-05, + "loss": 0.0023, + "num_input_tokens_seen": 216774832, + "step": 100360 + }, + { + "epoch": 16.372756933115824, + "grad_norm": 0.0006134548457339406, + "learning_rate": 9.690189690215728e-05, + "loss": 0.0005, + "num_input_tokens_seen": 216784880, + "step": 100365 + }, + { + "epoch": 16.37357259380098, + "grad_norm": 0.0003548564272932708, + "learning_rate": 9.685978756789854e-05, + "loss": 0.006, + "num_input_tokens_seen": 216795504, + "step": 100370 + }, + { + "epoch": 16.374388254486135, + "grad_norm": 0.011922224424779415, + "learning_rate": 9.681768640378757e-05, + "loss": 0.0151, + "num_input_tokens_seen": 216805520, + "step": 100375 + }, + { + "epoch": 16.375203915171287, + "grad_norm": 0.00019268895266577601, + "learning_rate": 9.677559341067759e-05, + "loss": 0.0018, + "num_input_tokens_seen": 216816880, + "step": 100380 + }, + { + "epoch": 16.376019575856443, + "grad_norm": 0.0481029748916626, + "learning_rate": 9.673350858942198e-05, + "loss": 0.0063, + "num_input_tokens_seen": 216828080, + "step": 100385 + }, + { + "epoch": 16.3768352365416, + "grad_norm": 0.000992569257505238, + "learning_rate": 9.669143194087315e-05, + "loss": 0.0049, + "num_input_tokens_seen": 216838640, + "step": 100390 + }, + { + "epoch": 16.377650897226754, + "grad_norm": 0.0029242881573736668, + "learning_rate": 9.664936346588432e-05, + "loss": 0.0011, + "num_input_tokens_seen": 216850320, + "step": 100395 + }, + { + "epoch": 16.37846655791191, + "grad_norm": 0.0005835880292579532, + "learning_rate": 9.660730316530757e-05, + "loss": 0.0007, + "num_input_tokens_seen": 216862288, + "step": 100400 + }, + { + "epoch": 16.379282218597062, + "grad_norm": 0.0011815468315035105, + "learning_rate": 9.65652510399958e-05, + "loss": 0.0027, + "num_input_tokens_seen": 216871664, + "step": 100405 + }, + { + "epoch": 16.380097879282218, + "grad_norm": 0.0024670150596648455, + "learning_rate": 9.652320709080082e-05, + "loss": 0.0015, + "num_input_tokens_seen": 216882608, + "step": 100410 + }, + { + "epoch": 16.380913539967374, + "grad_norm": 0.002712622517719865, + "learning_rate": 9.648117131857509e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216894352, + "step": 100415 + }, + { + "epoch": 16.38172920065253, + "grad_norm": 0.06670382618904114, + "learning_rate": 9.643914372417011e-05, + "loss": 0.0026, + "num_input_tokens_seen": 216906480, + "step": 100420 + }, + { + "epoch": 16.382544861337685, + "grad_norm": 0.5531919002532959, + "learning_rate": 9.639712430843806e-05, + "loss": 0.0123, + "num_input_tokens_seen": 216917840, + "step": 100425 + }, + { + "epoch": 16.383360522022837, + "grad_norm": 0.17486710846424103, + "learning_rate": 9.635511307223005e-05, + "loss": 0.0099, + "num_input_tokens_seen": 216929104, + "step": 100430 + }, + { + "epoch": 16.384176182707993, + "grad_norm": 0.06316438317298889, + "learning_rate": 9.631311001639798e-05, + "loss": 0.0009, + "num_input_tokens_seen": 216939152, + "step": 100435 + }, + { + "epoch": 16.38499184339315, + "grad_norm": 0.0012215528404340148, + "learning_rate": 9.62711151417926e-05, + "loss": 0.0007, + "num_input_tokens_seen": 216949744, + "step": 100440 + }, + { + "epoch": 16.385807504078304, + "grad_norm": 0.004731375258415937, + "learning_rate": 9.622912844926551e-05, + "loss": 0.0019, + "num_input_tokens_seen": 216960912, + "step": 100445 + }, + { + "epoch": 16.38662316476346, + "grad_norm": 0.0028271775227040052, + "learning_rate": 9.618714993966704e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216972816, + "step": 100450 + }, + { + "epoch": 16.387438825448612, + "grad_norm": 0.010311473160982132, + "learning_rate": 9.614517961384856e-05, + "loss": 0.0034, + "num_input_tokens_seen": 216985040, + "step": 100455 + }, + { + "epoch": 16.388254486133768, + "grad_norm": 0.025720862671732903, + "learning_rate": 9.610321747266005e-05, + "loss": 0.0011, + "num_input_tokens_seen": 216994576, + "step": 100460 + }, + { + "epoch": 16.389070146818923, + "grad_norm": 0.008032547309994698, + "learning_rate": 9.60612635169525e-05, + "loss": 0.0021, + "num_input_tokens_seen": 217004816, + "step": 100465 + }, + { + "epoch": 16.38988580750408, + "grad_norm": 0.0005619633011519909, + "learning_rate": 9.601931774757561e-05, + "loss": 0.0011, + "num_input_tokens_seen": 217014064, + "step": 100470 + }, + { + "epoch": 16.390701468189235, + "grad_norm": 0.0003188513219356537, + "learning_rate": 9.597738016537988e-05, + "loss": 0.0025, + "num_input_tokens_seen": 217024528, + "step": 100475 + }, + { + "epoch": 16.391517128874387, + "grad_norm": 0.0011438900837674737, + "learning_rate": 9.593545077121507e-05, + "loss": 0.0005, + "num_input_tokens_seen": 217035472, + "step": 100480 + }, + { + "epoch": 16.392332789559543, + "grad_norm": 0.003715142607688904, + "learning_rate": 9.589352956593095e-05, + "loss": 0.0035, + "num_input_tokens_seen": 217046096, + "step": 100485 + }, + { + "epoch": 16.3931484502447, + "grad_norm": 0.041871681809425354, + "learning_rate": 9.585161655037705e-05, + "loss": 0.0072, + "num_input_tokens_seen": 217056368, + "step": 100490 + }, + { + "epoch": 16.393964110929854, + "grad_norm": 0.0028728186152875423, + "learning_rate": 9.580971172540287e-05, + "loss": 0.1086, + "num_input_tokens_seen": 217065808, + "step": 100495 + }, + { + "epoch": 16.39477977161501, + "grad_norm": 0.010508287698030472, + "learning_rate": 9.576781509185766e-05, + "loss": 0.0006, + "num_input_tokens_seen": 217077648, + "step": 100500 + }, + { + "epoch": 16.395595432300162, + "grad_norm": 0.004550436977297068, + "learning_rate": 9.572592665059043e-05, + "loss": 0.0045, + "num_input_tokens_seen": 217087952, + "step": 100505 + }, + { + "epoch": 16.396411092985318, + "grad_norm": 0.0050187078304588795, + "learning_rate": 9.568404640245022e-05, + "loss": 0.0017, + "num_input_tokens_seen": 217096688, + "step": 100510 + }, + { + "epoch": 16.397226753670473, + "grad_norm": 0.2578853666782379, + "learning_rate": 9.564217434828565e-05, + "loss": 0.0078, + "num_input_tokens_seen": 217106864, + "step": 100515 + }, + { + "epoch": 16.39804241435563, + "grad_norm": 0.0025490387342870235, + "learning_rate": 9.56003104889454e-05, + "loss": 0.0107, + "num_input_tokens_seen": 217118064, + "step": 100520 + }, + { + "epoch": 16.39885807504078, + "grad_norm": 0.029599115252494812, + "learning_rate": 9.55584548252778e-05, + "loss": 0.0018, + "num_input_tokens_seen": 217127376, + "step": 100525 + }, + { + "epoch": 16.399673735725937, + "grad_norm": 0.006668840069323778, + "learning_rate": 9.55166073581314e-05, + "loss": 0.0011, + "num_input_tokens_seen": 217136816, + "step": 100530 + }, + { + "epoch": 16.400489396411093, + "grad_norm": 0.1902245283126831, + "learning_rate": 9.547476808835381e-05, + "loss": 0.0056, + "num_input_tokens_seen": 217147952, + "step": 100535 + }, + { + "epoch": 16.40130505709625, + "grad_norm": 0.44840678572654724, + "learning_rate": 9.54329370167935e-05, + "loss": 0.024, + "num_input_tokens_seen": 217158800, + "step": 100540 + }, + { + "epoch": 16.402120717781404, + "grad_norm": 0.0007062299409881234, + "learning_rate": 9.539111414429769e-05, + "loss": 0.0004, + "num_input_tokens_seen": 217169040, + "step": 100545 + }, + { + "epoch": 16.402936378466556, + "grad_norm": 0.00029036731575615704, + "learning_rate": 9.53492994717145e-05, + "loss": 0.0014, + "num_input_tokens_seen": 217180496, + "step": 100550 + }, + { + "epoch": 16.403752039151712, + "grad_norm": 0.019752731546759605, + "learning_rate": 9.530749299989078e-05, + "loss": 0.0011, + "num_input_tokens_seen": 217191536, + "step": 100555 + }, + { + "epoch": 16.404567699836868, + "grad_norm": 0.0012440073769539595, + "learning_rate": 9.526569472967444e-05, + "loss": 0.0038, + "num_input_tokens_seen": 217203568, + "step": 100560 + }, + { + "epoch": 16.405383360522023, + "grad_norm": 0.031068088486790657, + "learning_rate": 9.522390466191194e-05, + "loss": 0.0022, + "num_input_tokens_seen": 217213328, + "step": 100565 + }, + { + "epoch": 16.40619902120718, + "grad_norm": 0.0025797896087169647, + "learning_rate": 9.518212279745075e-05, + "loss": 0.0012, + "num_input_tokens_seen": 217223504, + "step": 100570 + }, + { + "epoch": 16.40701468189233, + "grad_norm": 0.004863258916884661, + "learning_rate": 9.514034913713714e-05, + "loss": 0.0036, + "num_input_tokens_seen": 217234672, + "step": 100575 + }, + { + "epoch": 16.407830342577487, + "grad_norm": 0.005317374598234892, + "learning_rate": 9.509858368181812e-05, + "loss": 0.0019, + "num_input_tokens_seen": 217244944, + "step": 100580 + }, + { + "epoch": 16.408646003262643, + "grad_norm": 0.0019695402588695288, + "learning_rate": 9.505682643233993e-05, + "loss": 0.0011, + "num_input_tokens_seen": 217254864, + "step": 100585 + }, + { + "epoch": 16.4094616639478, + "grad_norm": 0.017480649054050446, + "learning_rate": 9.501507738954884e-05, + "loss": 0.0906, + "num_input_tokens_seen": 217264464, + "step": 100590 + }, + { + "epoch": 16.410277324632954, + "grad_norm": 0.0020491848699748516, + "learning_rate": 9.497333655429097e-05, + "loss": 0.0006, + "num_input_tokens_seen": 217275152, + "step": 100595 + }, + { + "epoch": 16.411092985318106, + "grad_norm": 0.44643938541412354, + "learning_rate": 9.493160392741229e-05, + "loss": 0.0839, + "num_input_tokens_seen": 217286096, + "step": 100600 + }, + { + "epoch": 16.411908646003262, + "grad_norm": 0.0006037737475708127, + "learning_rate": 9.488987950975847e-05, + "loss": 0.0016, + "num_input_tokens_seen": 217296400, + "step": 100605 + }, + { + "epoch": 16.412724306688418, + "grad_norm": 0.0008719302131794393, + "learning_rate": 9.484816330217522e-05, + "loss": 0.002, + "num_input_tokens_seen": 217307856, + "step": 100610 + }, + { + "epoch": 16.413539967373573, + "grad_norm": 0.0005770606803707778, + "learning_rate": 9.480645530550785e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217318320, + "step": 100615 + }, + { + "epoch": 16.41435562805873, + "grad_norm": 0.0018063917523249984, + "learning_rate": 9.47647555206017e-05, + "loss": 0.0006, + "num_input_tokens_seen": 217327888, + "step": 100620 + }, + { + "epoch": 16.41517128874388, + "grad_norm": 0.001203131745569408, + "learning_rate": 9.472306394830188e-05, + "loss": 0.0012, + "num_input_tokens_seen": 217339664, + "step": 100625 + }, + { + "epoch": 16.415986949429037, + "grad_norm": 0.007052603177726269, + "learning_rate": 9.46813805894533e-05, + "loss": 0.002, + "num_input_tokens_seen": 217351056, + "step": 100630 + }, + { + "epoch": 16.416802610114193, + "grad_norm": 0.003801350248977542, + "learning_rate": 9.46397054449007e-05, + "loss": 0.0769, + "num_input_tokens_seen": 217361552, + "step": 100635 + }, + { + "epoch": 16.41761827079935, + "grad_norm": 0.00352098629809916, + "learning_rate": 9.459803851548876e-05, + "loss": 0.0005, + "num_input_tokens_seen": 217371440, + "step": 100640 + }, + { + "epoch": 16.418433931484504, + "grad_norm": 0.0003427791816648096, + "learning_rate": 9.455637980206177e-05, + "loss": 0.0005, + "num_input_tokens_seen": 217381424, + "step": 100645 + }, + { + "epoch": 16.419249592169656, + "grad_norm": 0.02932632900774479, + "learning_rate": 9.451472930546417e-05, + "loss": 0.0009, + "num_input_tokens_seen": 217392368, + "step": 100650 + }, + { + "epoch": 16.420065252854812, + "grad_norm": 0.013364373706281185, + "learning_rate": 9.447308702653995e-05, + "loss": 0.0019, + "num_input_tokens_seen": 217402736, + "step": 100655 + }, + { + "epoch": 16.420880913539968, + "grad_norm": 0.007504095323383808, + "learning_rate": 9.443145296613303e-05, + "loss": 0.0027, + "num_input_tokens_seen": 217413712, + "step": 100660 + }, + { + "epoch": 16.421696574225123, + "grad_norm": 0.06577505171298981, + "learning_rate": 9.438982712508726e-05, + "loss": 0.0044, + "num_input_tokens_seen": 217425232, + "step": 100665 + }, + { + "epoch": 16.42251223491028, + "grad_norm": 0.0009927983628585935, + "learning_rate": 9.434820950424605e-05, + "loss": 0.0005, + "num_input_tokens_seen": 217435984, + "step": 100670 + }, + { + "epoch": 16.42332789559543, + "grad_norm": 0.002471206011250615, + "learning_rate": 9.430660010445325e-05, + "loss": 0.0428, + "num_input_tokens_seen": 217447568, + "step": 100675 + }, + { + "epoch": 16.424143556280587, + "grad_norm": 0.007938587106764317, + "learning_rate": 9.426499892655155e-05, + "loss": 0.0021, + "num_input_tokens_seen": 217458800, + "step": 100680 + }, + { + "epoch": 16.424959216965743, + "grad_norm": 0.0019349503563717008, + "learning_rate": 9.422340597138457e-05, + "loss": 0.0022, + "num_input_tokens_seen": 217468208, + "step": 100685 + }, + { + "epoch": 16.4257748776509, + "grad_norm": 0.004137630108743906, + "learning_rate": 9.418182123979496e-05, + "loss": 0.0018, + "num_input_tokens_seen": 217479120, + "step": 100690 + }, + { + "epoch": 16.42659053833605, + "grad_norm": 0.0010737567208707333, + "learning_rate": 9.414024473262561e-05, + "loss": 0.1038, + "num_input_tokens_seen": 217488976, + "step": 100695 + }, + { + "epoch": 16.427406199021206, + "grad_norm": 0.0011117482790723443, + "learning_rate": 9.409867645071901e-05, + "loss": 0.0024, + "num_input_tokens_seen": 217499728, + "step": 100700 + }, + { + "epoch": 16.428221859706362, + "grad_norm": 0.06709881126880646, + "learning_rate": 9.405711639491771e-05, + "loss": 0.1206, + "num_input_tokens_seen": 217510832, + "step": 100705 + }, + { + "epoch": 16.429037520391518, + "grad_norm": 0.6046932935714722, + "learning_rate": 9.401556456606392e-05, + "loss": 0.0548, + "num_input_tokens_seen": 217522128, + "step": 100710 + }, + { + "epoch": 16.429853181076673, + "grad_norm": 0.0038243194576352835, + "learning_rate": 9.397402096499973e-05, + "loss": 0.0022, + "num_input_tokens_seen": 217533104, + "step": 100715 + }, + { + "epoch": 16.430668841761825, + "grad_norm": 0.0012090579839423299, + "learning_rate": 9.393248559256706e-05, + "loss": 0.0023, + "num_input_tokens_seen": 217542896, + "step": 100720 + }, + { + "epoch": 16.43148450244698, + "grad_norm": 0.021008076146245003, + "learning_rate": 9.389095844960771e-05, + "loss": 0.0032, + "num_input_tokens_seen": 217554320, + "step": 100725 + }, + { + "epoch": 16.432300163132137, + "grad_norm": 0.0003240357618778944, + "learning_rate": 9.384943953696329e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217565328, + "step": 100730 + }, + { + "epoch": 16.433115823817293, + "grad_norm": 0.6940795183181763, + "learning_rate": 9.380792885547523e-05, + "loss": 0.0253, + "num_input_tokens_seen": 217576720, + "step": 100735 + }, + { + "epoch": 16.43393148450245, + "grad_norm": 0.035171881318092346, + "learning_rate": 9.376642640598476e-05, + "loss": 0.0024, + "num_input_tokens_seen": 217588336, + "step": 100740 + }, + { + "epoch": 16.4347471451876, + "grad_norm": 0.0009132300619967282, + "learning_rate": 9.372493218933303e-05, + "loss": 0.0047, + "num_input_tokens_seen": 217597840, + "step": 100745 + }, + { + "epoch": 16.435562805872756, + "grad_norm": 0.06404221057891846, + "learning_rate": 9.368344620636094e-05, + "loss": 0.0043, + "num_input_tokens_seen": 217609552, + "step": 100750 + }, + { + "epoch": 16.436378466557912, + "grad_norm": 0.005812915042042732, + "learning_rate": 9.364196845790924e-05, + "loss": 0.0019, + "num_input_tokens_seen": 217620752, + "step": 100755 + }, + { + "epoch": 16.437194127243067, + "grad_norm": 0.009950408712029457, + "learning_rate": 9.360049894481854e-05, + "loss": 0.0012, + "num_input_tokens_seen": 217631984, + "step": 100760 + }, + { + "epoch": 16.438009787928223, + "grad_norm": 0.1717950999736786, + "learning_rate": 9.355903766792929e-05, + "loss": 0.004, + "num_input_tokens_seen": 217643056, + "step": 100765 + }, + { + "epoch": 16.438825448613375, + "grad_norm": 0.0056233457289636135, + "learning_rate": 9.351758462808174e-05, + "loss": 0.0009, + "num_input_tokens_seen": 217653968, + "step": 100770 + }, + { + "epoch": 16.43964110929853, + "grad_norm": 0.0006802030256949365, + "learning_rate": 9.347613982611603e-05, + "loss": 0.001, + "num_input_tokens_seen": 217664432, + "step": 100775 + }, + { + "epoch": 16.440456769983687, + "grad_norm": 0.1902633160352707, + "learning_rate": 9.343470326287206e-05, + "loss": 0.051, + "num_input_tokens_seen": 217674768, + "step": 100780 + }, + { + "epoch": 16.441272430668842, + "grad_norm": 0.0022835908457636833, + "learning_rate": 9.339327493918958e-05, + "loss": 0.0009, + "num_input_tokens_seen": 217686256, + "step": 100785 + }, + { + "epoch": 16.442088091353998, + "grad_norm": 0.0061843437142670155, + "learning_rate": 9.335185485590807e-05, + "loss": 0.0011, + "num_input_tokens_seen": 217696784, + "step": 100790 + }, + { + "epoch": 16.44290375203915, + "grad_norm": 0.0036174890119582415, + "learning_rate": 9.331044301386732e-05, + "loss": 0.0014, + "num_input_tokens_seen": 217707056, + "step": 100795 + }, + { + "epoch": 16.443719412724306, + "grad_norm": 0.0004997859941795468, + "learning_rate": 9.326903941390613e-05, + "loss": 0.0022, + "num_input_tokens_seen": 217718864, + "step": 100800 + }, + { + "epoch": 16.44453507340946, + "grad_norm": 0.0004371833929326385, + "learning_rate": 9.322764405686412e-05, + "loss": 0.0006, + "num_input_tokens_seen": 217730160, + "step": 100805 + }, + { + "epoch": 16.445350734094617, + "grad_norm": 0.0010925378883257508, + "learning_rate": 9.318625694357962e-05, + "loss": 0.0006, + "num_input_tokens_seen": 217741936, + "step": 100810 + }, + { + "epoch": 16.446166394779773, + "grad_norm": 0.0005652908002957702, + "learning_rate": 9.314487807489186e-05, + "loss": 0.0005, + "num_input_tokens_seen": 217751664, + "step": 100815 + }, + { + "epoch": 16.446982055464925, + "grad_norm": 0.015928996726870537, + "learning_rate": 9.310350745163931e-05, + "loss": 0.0466, + "num_input_tokens_seen": 217762416, + "step": 100820 + }, + { + "epoch": 16.44779771615008, + "grad_norm": 0.002068012021481991, + "learning_rate": 9.306214507466032e-05, + "loss": 0.0022, + "num_input_tokens_seen": 217773552, + "step": 100825 + }, + { + "epoch": 16.448613376835237, + "grad_norm": 0.006545333191752434, + "learning_rate": 9.302079094479321e-05, + "loss": 0.0033, + "num_input_tokens_seen": 217784048, + "step": 100830 + }, + { + "epoch": 16.449429037520392, + "grad_norm": 0.007893134839832783, + "learning_rate": 9.297944506287609e-05, + "loss": 0.0713, + "num_input_tokens_seen": 217796400, + "step": 100835 + }, + { + "epoch": 16.450244698205548, + "grad_norm": 0.0014113986399024725, + "learning_rate": 9.293810742974679e-05, + "loss": 0.001, + "num_input_tokens_seen": 217807600, + "step": 100840 + }, + { + "epoch": 16.4510603588907, + "grad_norm": 0.0015867466572672129, + "learning_rate": 9.28967780462432e-05, + "loss": 0.0039, + "num_input_tokens_seen": 217818224, + "step": 100845 + }, + { + "epoch": 16.451876019575856, + "grad_norm": 0.004073529504239559, + "learning_rate": 9.28554569132028e-05, + "loss": 0.0017, + "num_input_tokens_seen": 217830032, + "step": 100850 + }, + { + "epoch": 16.45269168026101, + "grad_norm": 0.0011026524007320404, + "learning_rate": 9.28141440314631e-05, + "loss": 0.0011, + "num_input_tokens_seen": 217842448, + "step": 100855 + }, + { + "epoch": 16.453507340946167, + "grad_norm": 0.013522444292902946, + "learning_rate": 9.277283940186132e-05, + "loss": 0.0016, + "num_input_tokens_seen": 217853008, + "step": 100860 + }, + { + "epoch": 16.454323001631323, + "grad_norm": 0.013203301467001438, + "learning_rate": 9.273154302523456e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217864080, + "step": 100865 + }, + { + "epoch": 16.455138662316475, + "grad_norm": 0.03669867664575577, + "learning_rate": 9.269025490241972e-05, + "loss": 0.0028, + "num_input_tokens_seen": 217875536, + "step": 100870 + }, + { + "epoch": 16.45595432300163, + "grad_norm": 0.007512333802878857, + "learning_rate": 9.264897503425357e-05, + "loss": 0.0025, + "num_input_tokens_seen": 217885904, + "step": 100875 + }, + { + "epoch": 16.456769983686787, + "grad_norm": 0.0010369790252298117, + "learning_rate": 9.260770342157272e-05, + "loss": 0.0024, + "num_input_tokens_seen": 217895504, + "step": 100880 + }, + { + "epoch": 16.457585644371942, + "grad_norm": 0.0006969812093302608, + "learning_rate": 9.256644006521358e-05, + "loss": 0.1333, + "num_input_tokens_seen": 217906480, + "step": 100885 + }, + { + "epoch": 16.458401305057095, + "grad_norm": 0.009753060527145863, + "learning_rate": 9.252518496601237e-05, + "loss": 0.0016, + "num_input_tokens_seen": 217917776, + "step": 100890 + }, + { + "epoch": 16.45921696574225, + "grad_norm": 0.004010849166661501, + "learning_rate": 9.248393812480522e-05, + "loss": 0.0009, + "num_input_tokens_seen": 217928560, + "step": 100895 + }, + { + "epoch": 16.460032626427406, + "grad_norm": 0.028223834931850433, + "learning_rate": 9.244269954242806e-05, + "loss": 0.0016, + "num_input_tokens_seen": 217938288, + "step": 100900 + }, + { + "epoch": 16.46084828711256, + "grad_norm": 0.029094593599438667, + "learning_rate": 9.240146921971642e-05, + "loss": 0.0026, + "num_input_tokens_seen": 217948464, + "step": 100905 + }, + { + "epoch": 16.461663947797717, + "grad_norm": 0.000985774677246809, + "learning_rate": 9.23602471575064e-05, + "loss": 0.0488, + "num_input_tokens_seen": 217958128, + "step": 100910 + }, + { + "epoch": 16.46247960848287, + "grad_norm": 0.15060503780841827, + "learning_rate": 9.231903335663283e-05, + "loss": 0.0053, + "num_input_tokens_seen": 217968432, + "step": 100915 + }, + { + "epoch": 16.463295269168025, + "grad_norm": 0.0014211301458999515, + "learning_rate": 9.227782781793148e-05, + "loss": 0.0007, + "num_input_tokens_seen": 217979760, + "step": 100920 + }, + { + "epoch": 16.46411092985318, + "grad_norm": 0.002074574586004019, + "learning_rate": 9.223663054223692e-05, + "loss": 0.0039, + "num_input_tokens_seen": 217990128, + "step": 100925 + }, + { + "epoch": 16.464926590538337, + "grad_norm": 0.21957442164421082, + "learning_rate": 9.219544153038462e-05, + "loss": 0.0276, + "num_input_tokens_seen": 218001232, + "step": 100930 + }, + { + "epoch": 16.465742251223492, + "grad_norm": 0.0020057554356753826, + "learning_rate": 9.21542607832087e-05, + "loss": 0.0021, + "num_input_tokens_seen": 218011824, + "step": 100935 + }, + { + "epoch": 16.466557911908644, + "grad_norm": 0.001532508060336113, + "learning_rate": 9.211308830154441e-05, + "loss": 0.0053, + "num_input_tokens_seen": 218020624, + "step": 100940 + }, + { + "epoch": 16.4673735725938, + "grad_norm": 0.004052693955600262, + "learning_rate": 9.20719240862255e-05, + "loss": 0.0007, + "num_input_tokens_seen": 218031280, + "step": 100945 + }, + { + "epoch": 16.468189233278956, + "grad_norm": 0.0012395764933899045, + "learning_rate": 9.203076813808687e-05, + "loss": 0.0012, + "num_input_tokens_seen": 218041520, + "step": 100950 + }, + { + "epoch": 16.46900489396411, + "grad_norm": 0.003163372864946723, + "learning_rate": 9.198962045796195e-05, + "loss": 0.0022, + "num_input_tokens_seen": 218051856, + "step": 100955 + }, + { + "epoch": 16.469820554649267, + "grad_norm": 0.00656129838898778, + "learning_rate": 9.194848104668513e-05, + "loss": 0.0031, + "num_input_tokens_seen": 218062480, + "step": 100960 + }, + { + "epoch": 16.47063621533442, + "grad_norm": 0.0020579954143613577, + "learning_rate": 9.190734990508998e-05, + "loss": 0.0057, + "num_input_tokens_seen": 218073808, + "step": 100965 + }, + { + "epoch": 16.471451876019575, + "grad_norm": 0.002411720808595419, + "learning_rate": 9.18662270340101e-05, + "loss": 0.1435, + "num_input_tokens_seen": 218084336, + "step": 100970 + }, + { + "epoch": 16.47226753670473, + "grad_norm": 0.41781216859817505, + "learning_rate": 9.182511243427888e-05, + "loss": 0.0312, + "num_input_tokens_seen": 218095408, + "step": 100975 + }, + { + "epoch": 16.473083197389887, + "grad_norm": 0.035349469631910324, + "learning_rate": 9.178400610672954e-05, + "loss": 0.0021, + "num_input_tokens_seen": 218106416, + "step": 100980 + }, + { + "epoch": 16.473898858075042, + "grad_norm": 0.0007361548487097025, + "learning_rate": 9.174290805219521e-05, + "loss": 0.0053, + "num_input_tokens_seen": 218117840, + "step": 100985 + }, + { + "epoch": 16.474714518760194, + "grad_norm": 0.07083853334188461, + "learning_rate": 9.170181827150875e-05, + "loss": 0.0028, + "num_input_tokens_seen": 218127440, + "step": 100990 + }, + { + "epoch": 16.47553017944535, + "grad_norm": 0.0019853876437991858, + "learning_rate": 9.166073676550291e-05, + "loss": 0.0465, + "num_input_tokens_seen": 218139568, + "step": 100995 + }, + { + "epoch": 16.476345840130506, + "grad_norm": 0.0003953164559789002, + "learning_rate": 9.161966353501023e-05, + "loss": 0.0199, + "num_input_tokens_seen": 218149616, + "step": 101000 + }, + { + "epoch": 16.47716150081566, + "grad_norm": 0.008749466389417648, + "learning_rate": 9.157859858086315e-05, + "loss": 0.0054, + "num_input_tokens_seen": 218160560, + "step": 101005 + }, + { + "epoch": 16.477977161500817, + "grad_norm": 0.0012293050531297922, + "learning_rate": 9.153754190389379e-05, + "loss": 0.0016, + "num_input_tokens_seen": 218172304, + "step": 101010 + }, + { + "epoch": 16.47879282218597, + "grad_norm": 0.007884092628955841, + "learning_rate": 9.149649350493456e-05, + "loss": 0.079, + "num_input_tokens_seen": 218182288, + "step": 101015 + }, + { + "epoch": 16.479608482871125, + "grad_norm": 0.0007453529397025704, + "learning_rate": 9.145545338481682e-05, + "loss": 0.001, + "num_input_tokens_seen": 218193840, + "step": 101020 + }, + { + "epoch": 16.48042414355628, + "grad_norm": 0.006361248902976513, + "learning_rate": 9.141442154437286e-05, + "loss": 0.0017, + "num_input_tokens_seen": 218205456, + "step": 101025 + }, + { + "epoch": 16.481239804241437, + "grad_norm": 0.028780123218894005, + "learning_rate": 9.137339798443372e-05, + "loss": 0.0035, + "num_input_tokens_seen": 218214896, + "step": 101030 + }, + { + "epoch": 16.482055464926592, + "grad_norm": 0.0026272626128047705, + "learning_rate": 9.133238270583133e-05, + "loss": 0.0034, + "num_input_tokens_seen": 218224304, + "step": 101035 + }, + { + "epoch": 16.482871125611744, + "grad_norm": 0.0005200458108447492, + "learning_rate": 9.129137570939632e-05, + "loss": 0.0015, + "num_input_tokens_seen": 218235568, + "step": 101040 + }, + { + "epoch": 16.4836867862969, + "grad_norm": 0.00038884973037056625, + "learning_rate": 9.125037699596039e-05, + "loss": 0.0125, + "num_input_tokens_seen": 218247056, + "step": 101045 + }, + { + "epoch": 16.484502446982056, + "grad_norm": 0.007529503665864468, + "learning_rate": 9.12093865663538e-05, + "loss": 0.0008, + "num_input_tokens_seen": 218257840, + "step": 101050 + }, + { + "epoch": 16.48531810766721, + "grad_norm": 0.03659482300281525, + "learning_rate": 9.11684044214079e-05, + "loss": 0.0039, + "num_input_tokens_seen": 218269168, + "step": 101055 + }, + { + "epoch": 16.486133768352367, + "grad_norm": 0.008131932467222214, + "learning_rate": 9.112743056195261e-05, + "loss": 0.0494, + "num_input_tokens_seen": 218279440, + "step": 101060 + }, + { + "epoch": 16.48694942903752, + "grad_norm": 0.0013752073282375932, + "learning_rate": 9.10864649888189e-05, + "loss": 0.0017, + "num_input_tokens_seen": 218291472, + "step": 101065 + }, + { + "epoch": 16.487765089722675, + "grad_norm": 0.010680504143238068, + "learning_rate": 9.104550770283648e-05, + "loss": 0.0026, + "num_input_tokens_seen": 218302096, + "step": 101070 + }, + { + "epoch": 16.48858075040783, + "grad_norm": 0.020263204351067543, + "learning_rate": 9.100455870483587e-05, + "loss": 0.0015, + "num_input_tokens_seen": 218312720, + "step": 101075 + }, + { + "epoch": 16.489396411092986, + "grad_norm": 0.000559748790692538, + "learning_rate": 9.096361799564651e-05, + "loss": 0.0066, + "num_input_tokens_seen": 218323440, + "step": 101080 + }, + { + "epoch": 16.49021207177814, + "grad_norm": 0.028419995680451393, + "learning_rate": 9.092268557609856e-05, + "loss": 0.0115, + "num_input_tokens_seen": 218333616, + "step": 101085 + }, + { + "epoch": 16.491027732463294, + "grad_norm": 0.000426318816607818, + "learning_rate": 9.088176144702104e-05, + "loss": 0.0007, + "num_input_tokens_seen": 218345520, + "step": 101090 + }, + { + "epoch": 16.49184339314845, + "grad_norm": 0.013602585531771183, + "learning_rate": 9.084084560924394e-05, + "loss": 0.0716, + "num_input_tokens_seen": 218355984, + "step": 101095 + }, + { + "epoch": 16.492659053833606, + "grad_norm": 0.002315348945558071, + "learning_rate": 9.079993806359587e-05, + "loss": 0.1014, + "num_input_tokens_seen": 218366832, + "step": 101100 + }, + { + "epoch": 16.49347471451876, + "grad_norm": 0.02149783819913864, + "learning_rate": 9.075903881090636e-05, + "loss": 0.0027, + "num_input_tokens_seen": 218375568, + "step": 101105 + }, + { + "epoch": 16.494290375203914, + "grad_norm": 0.023237407207489014, + "learning_rate": 9.071814785200399e-05, + "loss": 0.0104, + "num_input_tokens_seen": 218386832, + "step": 101110 + }, + { + "epoch": 16.49510603588907, + "grad_norm": 0.0034335225354880095, + "learning_rate": 9.067726518771762e-05, + "loss": 0.0032, + "num_input_tokens_seen": 218398576, + "step": 101115 + }, + { + "epoch": 16.495921696574225, + "grad_norm": 0.00745142437517643, + "learning_rate": 9.063639081887576e-05, + "loss": 0.0021, + "num_input_tokens_seen": 218407600, + "step": 101120 + }, + { + "epoch": 16.49673735725938, + "grad_norm": 0.029559500515460968, + "learning_rate": 9.059552474630672e-05, + "loss": 0.002, + "num_input_tokens_seen": 218419376, + "step": 101125 + }, + { + "epoch": 16.497553017944536, + "grad_norm": 0.0032735865097492933, + "learning_rate": 9.055466697083875e-05, + "loss": 0.0013, + "num_input_tokens_seen": 218429744, + "step": 101130 + }, + { + "epoch": 16.49836867862969, + "grad_norm": 0.7694928050041199, + "learning_rate": 9.051381749329984e-05, + "loss": 0.0928, + "num_input_tokens_seen": 218439536, + "step": 101135 + }, + { + "epoch": 16.499184339314844, + "grad_norm": 0.15859928727149963, + "learning_rate": 9.04729763145179e-05, + "loss": 0.0169, + "num_input_tokens_seen": 218451184, + "step": 101140 + }, + { + "epoch": 16.5, + "grad_norm": 0.001213204930536449, + "learning_rate": 9.043214343532063e-05, + "loss": 0.01, + "num_input_tokens_seen": 218462448, + "step": 101145 + }, + { + "epoch": 16.500815660685156, + "grad_norm": 0.03677457943558693, + "learning_rate": 9.039131885653556e-05, + "loss": 0.0021, + "num_input_tokens_seen": 218474256, + "step": 101150 + }, + { + "epoch": 16.50163132137031, + "grad_norm": 0.002333037555217743, + "learning_rate": 9.035050257898991e-05, + "loss": 0.0083, + "num_input_tokens_seen": 218485520, + "step": 101155 + }, + { + "epoch": 16.502446982055464, + "grad_norm": 0.004182538483291864, + "learning_rate": 9.030969460351124e-05, + "loss": 0.0028, + "num_input_tokens_seen": 218496432, + "step": 101160 + }, + { + "epoch": 16.50326264274062, + "grad_norm": 0.000874399789609015, + "learning_rate": 9.026889493092605e-05, + "loss": 0.0056, + "num_input_tokens_seen": 218507344, + "step": 101165 + }, + { + "epoch": 16.504078303425775, + "grad_norm": 0.007135962136089802, + "learning_rate": 9.022810356206179e-05, + "loss": 0.0041, + "num_input_tokens_seen": 218518704, + "step": 101170 + }, + { + "epoch": 16.50489396411093, + "grad_norm": 0.3483419716358185, + "learning_rate": 9.018732049774459e-05, + "loss": 0.0233, + "num_input_tokens_seen": 218529744, + "step": 101175 + }, + { + "epoch": 16.505709624796086, + "grad_norm": 0.009324166923761368, + "learning_rate": 9.014654573880143e-05, + "loss": 0.0054, + "num_input_tokens_seen": 218540400, + "step": 101180 + }, + { + "epoch": 16.50652528548124, + "grad_norm": 0.004226782359182835, + "learning_rate": 9.010577928605823e-05, + "loss": 0.0026, + "num_input_tokens_seen": 218552272, + "step": 101185 + }, + { + "epoch": 16.507340946166394, + "grad_norm": 0.06776424497365952, + "learning_rate": 9.00650211403417e-05, + "loss": 0.0442, + "num_input_tokens_seen": 218561424, + "step": 101190 + }, + { + "epoch": 16.50815660685155, + "grad_norm": 0.03477175533771515, + "learning_rate": 9.002427130247726e-05, + "loss": 0.004, + "num_input_tokens_seen": 218572624, + "step": 101195 + }, + { + "epoch": 16.508972267536706, + "grad_norm": 0.00041841110214591026, + "learning_rate": 8.998352977329127e-05, + "loss": 0.001, + "num_input_tokens_seen": 218583376, + "step": 101200 + }, + { + "epoch": 16.50978792822186, + "grad_norm": 0.004782752133905888, + "learning_rate": 8.994279655360899e-05, + "loss": 0.0014, + "num_input_tokens_seen": 218594224, + "step": 101205 + }, + { + "epoch": 16.510603588907014, + "grad_norm": 0.37345457077026367, + "learning_rate": 8.99020716442564e-05, + "loss": 0.0122, + "num_input_tokens_seen": 218604880, + "step": 101210 + }, + { + "epoch": 16.51141924959217, + "grad_norm": 0.002796668093651533, + "learning_rate": 8.986135504605831e-05, + "loss": 0.0006, + "num_input_tokens_seen": 218615152, + "step": 101215 + }, + { + "epoch": 16.512234910277325, + "grad_norm": 2.6558995246887207, + "learning_rate": 8.982064675984025e-05, + "loss": 0.0671, + "num_input_tokens_seen": 218625904, + "step": 101220 + }, + { + "epoch": 16.51305057096248, + "grad_norm": 0.0028051333501935005, + "learning_rate": 8.977994678642714e-05, + "loss": 0.0047, + "num_input_tokens_seen": 218636976, + "step": 101225 + }, + { + "epoch": 16.513866231647633, + "grad_norm": 0.0031697454396635294, + "learning_rate": 8.973925512664383e-05, + "loss": 0.0067, + "num_input_tokens_seen": 218647408, + "step": 101230 + }, + { + "epoch": 16.51468189233279, + "grad_norm": 0.8315576910972595, + "learning_rate": 8.969857178131497e-05, + "loss": 0.0166, + "num_input_tokens_seen": 218658864, + "step": 101235 + }, + { + "epoch": 16.515497553017944, + "grad_norm": 0.011037657037377357, + "learning_rate": 8.965789675126501e-05, + "loss": 0.0027, + "num_input_tokens_seen": 218670032, + "step": 101240 + }, + { + "epoch": 16.5163132137031, + "grad_norm": 0.0005106102908030152, + "learning_rate": 8.961723003731837e-05, + "loss": 0.0023, + "num_input_tokens_seen": 218681424, + "step": 101245 + }, + { + "epoch": 16.517128874388256, + "grad_norm": 0.034446511417627335, + "learning_rate": 8.95765716402992e-05, + "loss": 0.0586, + "num_input_tokens_seen": 218692208, + "step": 101250 + }, + { + "epoch": 16.517944535073408, + "grad_norm": 0.007385551929473877, + "learning_rate": 8.953592156103141e-05, + "loss": 0.0023, + "num_input_tokens_seen": 218703024, + "step": 101255 + }, + { + "epoch": 16.518760195758563, + "grad_norm": 0.017408102750778198, + "learning_rate": 8.949527980033889e-05, + "loss": 0.007, + "num_input_tokens_seen": 218713840, + "step": 101260 + }, + { + "epoch": 16.51957585644372, + "grad_norm": 0.0040249088779091835, + "learning_rate": 8.945464635904532e-05, + "loss": 0.0009, + "num_input_tokens_seen": 218724624, + "step": 101265 + }, + { + "epoch": 16.520391517128875, + "grad_norm": 0.000684377911966294, + "learning_rate": 8.94140212379741e-05, + "loss": 0.0006, + "num_input_tokens_seen": 218736080, + "step": 101270 + }, + { + "epoch": 16.52120717781403, + "grad_norm": 0.06121218577027321, + "learning_rate": 8.937340443794867e-05, + "loss": 0.0047, + "num_input_tokens_seen": 218745872, + "step": 101275 + }, + { + "epoch": 16.522022838499183, + "grad_norm": 0.004632898606359959, + "learning_rate": 8.933279595979205e-05, + "loss": 0.0019, + "num_input_tokens_seen": 218757136, + "step": 101280 + }, + { + "epoch": 16.52283849918434, + "grad_norm": 0.002208688296377659, + "learning_rate": 8.929219580432735e-05, + "loss": 0.0092, + "num_input_tokens_seen": 218768080, + "step": 101285 + }, + { + "epoch": 16.523654159869494, + "grad_norm": 0.0014574574306607246, + "learning_rate": 8.925160397237725e-05, + "loss": 0.0044, + "num_input_tokens_seen": 218779472, + "step": 101290 + }, + { + "epoch": 16.52446982055465, + "grad_norm": 0.022507159039378166, + "learning_rate": 8.921102046476454e-05, + "loss": 0.0022, + "num_input_tokens_seen": 218790064, + "step": 101295 + }, + { + "epoch": 16.525285481239806, + "grad_norm": 0.22740155458450317, + "learning_rate": 8.917044528231145e-05, + "loss": 0.0156, + "num_input_tokens_seen": 218800720, + "step": 101300 + }, + { + "epoch": 16.526101141924958, + "grad_norm": 0.0901143029332161, + "learning_rate": 8.912987842584075e-05, + "loss": 0.0502, + "num_input_tokens_seen": 218810512, + "step": 101305 + }, + { + "epoch": 16.526916802610113, + "grad_norm": 0.03165145590901375, + "learning_rate": 8.908931989617403e-05, + "loss": 0.0027, + "num_input_tokens_seen": 218820432, + "step": 101310 + }, + { + "epoch": 16.52773246329527, + "grad_norm": 2.9440736770629883, + "learning_rate": 8.904876969413372e-05, + "loss": 0.0321, + "num_input_tokens_seen": 218831152, + "step": 101315 + }, + { + "epoch": 16.528548123980425, + "grad_norm": 0.015373525209724903, + "learning_rate": 8.900822782054124e-05, + "loss": 0.0378, + "num_input_tokens_seen": 218842640, + "step": 101320 + }, + { + "epoch": 16.52936378466558, + "grad_norm": 0.10055476427078247, + "learning_rate": 8.896769427621848e-05, + "loss": 0.0081, + "num_input_tokens_seen": 218852720, + "step": 101325 + }, + { + "epoch": 16.530179445350733, + "grad_norm": 0.004800902679562569, + "learning_rate": 8.892716906198683e-05, + "loss": 0.0021, + "num_input_tokens_seen": 218862288, + "step": 101330 + }, + { + "epoch": 16.53099510603589, + "grad_norm": 0.00658207293599844, + "learning_rate": 8.88866521786676e-05, + "loss": 0.0013, + "num_input_tokens_seen": 218873136, + "step": 101335 + }, + { + "epoch": 16.531810766721044, + "grad_norm": 0.0034926505759358406, + "learning_rate": 8.884614362708188e-05, + "loss": 0.001, + "num_input_tokens_seen": 218884496, + "step": 101340 + }, + { + "epoch": 16.5326264274062, + "grad_norm": 0.0007944152457639575, + "learning_rate": 8.88056434080507e-05, + "loss": 0.0017, + "num_input_tokens_seen": 218894800, + "step": 101345 + }, + { + "epoch": 16.533442088091356, + "grad_norm": 0.01701800711452961, + "learning_rate": 8.876515152239472e-05, + "loss": 0.0054, + "num_input_tokens_seen": 218904688, + "step": 101350 + }, + { + "epoch": 16.534257748776508, + "grad_norm": 0.008906069211661816, + "learning_rate": 8.872466797093464e-05, + "loss": 0.0028, + "num_input_tokens_seen": 218914896, + "step": 101355 + }, + { + "epoch": 16.535073409461663, + "grad_norm": 0.045917339622974396, + "learning_rate": 8.868419275449096e-05, + "loss": 0.0212, + "num_input_tokens_seen": 218927632, + "step": 101360 + }, + { + "epoch": 16.53588907014682, + "grad_norm": 0.053435854613780975, + "learning_rate": 8.864372587388387e-05, + "loss": 0.0033, + "num_input_tokens_seen": 218938928, + "step": 101365 + }, + { + "epoch": 16.536704730831975, + "grad_norm": 0.002096776617690921, + "learning_rate": 8.860326732993352e-05, + "loss": 0.0022, + "num_input_tokens_seen": 218949168, + "step": 101370 + }, + { + "epoch": 16.53752039151713, + "grad_norm": 0.0024299444630742073, + "learning_rate": 8.856281712345988e-05, + "loss": 0.0161, + "num_input_tokens_seen": 218959344, + "step": 101375 + }, + { + "epoch": 16.538336052202283, + "grad_norm": 0.0005039930110797286, + "learning_rate": 8.852237525528262e-05, + "loss": 0.0066, + "num_input_tokens_seen": 218970416, + "step": 101380 + }, + { + "epoch": 16.53915171288744, + "grad_norm": 0.0017340783961117268, + "learning_rate": 8.848194172622148e-05, + "loss": 0.0011, + "num_input_tokens_seen": 218982320, + "step": 101385 + }, + { + "epoch": 16.539967373572594, + "grad_norm": 0.14250528812408447, + "learning_rate": 8.844151653709581e-05, + "loss": 0.0057, + "num_input_tokens_seen": 218992848, + "step": 101390 + }, + { + "epoch": 16.54078303425775, + "grad_norm": 0.0025329969357699156, + "learning_rate": 8.840109968872495e-05, + "loss": 0.0005, + "num_input_tokens_seen": 219004464, + "step": 101395 + }, + { + "epoch": 16.541598694942905, + "grad_norm": 0.0013825197238475084, + "learning_rate": 8.836069118192791e-05, + "loss": 0.0019, + "num_input_tokens_seen": 219015952, + "step": 101400 + }, + { + "epoch": 16.542414355628058, + "grad_norm": 0.0008943129214458168, + "learning_rate": 8.83202910175237e-05, + "loss": 0.0019, + "num_input_tokens_seen": 219026320, + "step": 101405 + }, + { + "epoch": 16.543230016313213, + "grad_norm": 0.00017344093066640198, + "learning_rate": 8.827989919633106e-05, + "loss": 0.0024, + "num_input_tokens_seen": 219036624, + "step": 101410 + }, + { + "epoch": 16.54404567699837, + "grad_norm": 0.0071393647231161594, + "learning_rate": 8.82395157191685e-05, + "loss": 0.0187, + "num_input_tokens_seen": 219046896, + "step": 101415 + }, + { + "epoch": 16.544861337683525, + "grad_norm": 0.0163432527333498, + "learning_rate": 8.819914058685458e-05, + "loss": 0.0008, + "num_input_tokens_seen": 219057904, + "step": 101420 + }, + { + "epoch": 16.545676998368677, + "grad_norm": 0.5346035957336426, + "learning_rate": 8.815877380020743e-05, + "loss": 0.0643, + "num_input_tokens_seen": 219067760, + "step": 101425 + }, + { + "epoch": 16.546492659053833, + "grad_norm": 0.00447038421407342, + "learning_rate": 8.811841536004505e-05, + "loss": 0.0015, + "num_input_tokens_seen": 219078672, + "step": 101430 + }, + { + "epoch": 16.54730831973899, + "grad_norm": 0.02596464194357395, + "learning_rate": 8.807806526718565e-05, + "loss": 0.0014, + "num_input_tokens_seen": 219089424, + "step": 101435 + }, + { + "epoch": 16.548123980424144, + "grad_norm": 0.0009697464993223548, + "learning_rate": 8.803772352244683e-05, + "loss": 0.0009, + "num_input_tokens_seen": 219099920, + "step": 101440 + }, + { + "epoch": 16.5489396411093, + "grad_norm": 0.000986489118076861, + "learning_rate": 8.799739012664615e-05, + "loss": 0.0024, + "num_input_tokens_seen": 219110864, + "step": 101445 + }, + { + "epoch": 16.549755301794452, + "grad_norm": 0.03657018765807152, + "learning_rate": 8.795706508060102e-05, + "loss": 0.0038, + "num_input_tokens_seen": 219121616, + "step": 101450 + }, + { + "epoch": 16.550570962479608, + "grad_norm": 0.12137595564126968, + "learning_rate": 8.791674838512864e-05, + "loss": 0.0052, + "num_input_tokens_seen": 219133328, + "step": 101455 + }, + { + "epoch": 16.551386623164763, + "grad_norm": 0.0015651886351406574, + "learning_rate": 8.787644004104617e-05, + "loss": 0.0008, + "num_input_tokens_seen": 219144016, + "step": 101460 + }, + { + "epoch": 16.55220228384992, + "grad_norm": 0.002112816786393523, + "learning_rate": 8.78361400491704e-05, + "loss": 0.0012, + "num_input_tokens_seen": 219154992, + "step": 101465 + }, + { + "epoch": 16.553017944535075, + "grad_norm": 0.01039827335625887, + "learning_rate": 8.779584841031818e-05, + "loss": 0.0007, + "num_input_tokens_seen": 219165104, + "step": 101470 + }, + { + "epoch": 16.553833605220227, + "grad_norm": 0.009416126646101475, + "learning_rate": 8.775556512530597e-05, + "loss": 0.0055, + "num_input_tokens_seen": 219176496, + "step": 101475 + }, + { + "epoch": 16.554649265905383, + "grad_norm": 0.005137943662703037, + "learning_rate": 8.771529019495022e-05, + "loss": 0.0029, + "num_input_tokens_seen": 219185584, + "step": 101480 + }, + { + "epoch": 16.55546492659054, + "grad_norm": 0.0015324490377679467, + "learning_rate": 8.767502362006713e-05, + "loss": 0.0008, + "num_input_tokens_seen": 219196048, + "step": 101485 + }, + { + "epoch": 16.556280587275694, + "grad_norm": 0.006908372975885868, + "learning_rate": 8.763476540147275e-05, + "loss": 0.0007, + "num_input_tokens_seen": 219206704, + "step": 101490 + }, + { + "epoch": 16.55709624796085, + "grad_norm": 0.0011675909627228975, + "learning_rate": 8.759451553998299e-05, + "loss": 0.0061, + "num_input_tokens_seen": 219216880, + "step": 101495 + }, + { + "epoch": 16.557911908646002, + "grad_norm": 0.3666263818740845, + "learning_rate": 8.755427403641352e-05, + "loss": 0.0224, + "num_input_tokens_seen": 219227216, + "step": 101500 + }, + { + "epoch": 16.558727569331158, + "grad_norm": 0.13365083932876587, + "learning_rate": 8.751404089157993e-05, + "loss": 0.0062, + "num_input_tokens_seen": 219237136, + "step": 101505 + }, + { + "epoch": 16.559543230016313, + "grad_norm": 0.010968620888888836, + "learning_rate": 8.747381610629762e-05, + "loss": 0.0013, + "num_input_tokens_seen": 219247760, + "step": 101510 + }, + { + "epoch": 16.56035889070147, + "grad_norm": 0.02978862263262272, + "learning_rate": 8.74335996813817e-05, + "loss": 0.0021, + "num_input_tokens_seen": 219259408, + "step": 101515 + }, + { + "epoch": 16.561174551386625, + "grad_norm": 0.00066575180971995, + "learning_rate": 8.739339161764725e-05, + "loss": 0.0026, + "num_input_tokens_seen": 219271216, + "step": 101520 + }, + { + "epoch": 16.561990212071777, + "grad_norm": 0.680747389793396, + "learning_rate": 8.735319191590918e-05, + "loss": 0.14, + "num_input_tokens_seen": 219281904, + "step": 101525 + }, + { + "epoch": 16.562805872756933, + "grad_norm": 0.08297364413738251, + "learning_rate": 8.731300057698216e-05, + "loss": 0.0041, + "num_input_tokens_seen": 219292848, + "step": 101530 + }, + { + "epoch": 16.563621533442088, + "grad_norm": 0.005971312522888184, + "learning_rate": 8.727281760168055e-05, + "loss": 0.0015, + "num_input_tokens_seen": 219303888, + "step": 101535 + }, + { + "epoch": 16.564437194127244, + "grad_norm": 0.0019951933063566685, + "learning_rate": 8.723264299081912e-05, + "loss": 0.0009, + "num_input_tokens_seen": 219315120, + "step": 101540 + }, + { + "epoch": 16.5652528548124, + "grad_norm": 0.015224998816847801, + "learning_rate": 8.719247674521157e-05, + "loss": 0.0035, + "num_input_tokens_seen": 219325648, + "step": 101545 + }, + { + "epoch": 16.56606851549755, + "grad_norm": 0.009353390894830227, + "learning_rate": 8.715231886567248e-05, + "loss": 0.0731, + "num_input_tokens_seen": 219336496, + "step": 101550 + }, + { + "epoch": 16.566884176182707, + "grad_norm": 0.0951535701751709, + "learning_rate": 8.711216935301508e-05, + "loss": 0.0042, + "num_input_tokens_seen": 219347120, + "step": 101555 + }, + { + "epoch": 16.567699836867863, + "grad_norm": 0.0017404680838808417, + "learning_rate": 8.70720282080536e-05, + "loss": 0.0032, + "num_input_tokens_seen": 219357744, + "step": 101560 + }, + { + "epoch": 16.56851549755302, + "grad_norm": 0.005474665202200413, + "learning_rate": 8.703189543160106e-05, + "loss": 0.001, + "num_input_tokens_seen": 219368848, + "step": 101565 + }, + { + "epoch": 16.569331158238175, + "grad_norm": 0.07186252623796463, + "learning_rate": 8.699177102447126e-05, + "loss": 0.0842, + "num_input_tokens_seen": 219379760, + "step": 101570 + }, + { + "epoch": 16.570146818923327, + "grad_norm": 0.04543714597821236, + "learning_rate": 8.695165498747698e-05, + "loss": 0.005, + "num_input_tokens_seen": 219389488, + "step": 101575 + }, + { + "epoch": 16.570962479608482, + "grad_norm": 0.01591755822300911, + "learning_rate": 8.691154732143147e-05, + "loss": 0.0016, + "num_input_tokens_seen": 219400400, + "step": 101580 + }, + { + "epoch": 16.571778140293638, + "grad_norm": 0.2322196513414383, + "learning_rate": 8.687144802714753e-05, + "loss": 0.0085, + "num_input_tokens_seen": 219412208, + "step": 101585 + }, + { + "epoch": 16.572593800978794, + "grad_norm": 0.0029731979593634605, + "learning_rate": 8.683135710543777e-05, + "loss": 0.0034, + "num_input_tokens_seen": 219423888, + "step": 101590 + }, + { + "epoch": 16.57340946166395, + "grad_norm": 0.0015354871284216642, + "learning_rate": 8.679127455711466e-05, + "loss": 0.0008, + "num_input_tokens_seen": 219434128, + "step": 101595 + }, + { + "epoch": 16.5742251223491, + "grad_norm": 0.07060811668634415, + "learning_rate": 8.675120038299062e-05, + "loss": 0.0883, + "num_input_tokens_seen": 219445328, + "step": 101600 + }, + { + "epoch": 16.575040783034257, + "grad_norm": 0.0051761167123913765, + "learning_rate": 8.671113458387775e-05, + "loss": 0.0018, + "num_input_tokens_seen": 219456976, + "step": 101605 + }, + { + "epoch": 16.575856443719413, + "grad_norm": 0.5322887897491455, + "learning_rate": 8.667107716058798e-05, + "loss": 0.0237, + "num_input_tokens_seen": 219467856, + "step": 101610 + }, + { + "epoch": 16.57667210440457, + "grad_norm": 0.0018487609922885895, + "learning_rate": 8.66310281139332e-05, + "loss": 0.001, + "num_input_tokens_seen": 219476944, + "step": 101615 + }, + { + "epoch": 16.57748776508972, + "grad_norm": 0.017845647409558296, + "learning_rate": 8.659098744472505e-05, + "loss": 0.0113, + "num_input_tokens_seen": 219487728, + "step": 101620 + }, + { + "epoch": 16.578303425774877, + "grad_norm": 0.057124871760606766, + "learning_rate": 8.655095515377498e-05, + "loss": 0.0044, + "num_input_tokens_seen": 219499120, + "step": 101625 + }, + { + "epoch": 16.579119086460032, + "grad_norm": 0.0012420967686921358, + "learning_rate": 8.65109312418943e-05, + "loss": 0.0027, + "num_input_tokens_seen": 219510192, + "step": 101630 + }, + { + "epoch": 16.579934747145188, + "grad_norm": 0.00048453285126015544, + "learning_rate": 8.647091570989413e-05, + "loss": 0.0225, + "num_input_tokens_seen": 219520848, + "step": 101635 + }, + { + "epoch": 16.580750407830344, + "grad_norm": 0.0012041418813169003, + "learning_rate": 8.643090855858549e-05, + "loss": 0.0031, + "num_input_tokens_seen": 219530896, + "step": 101640 + }, + { + "epoch": 16.581566068515496, + "grad_norm": 0.03431824967265129, + "learning_rate": 8.639090978877912e-05, + "loss": 0.0115, + "num_input_tokens_seen": 219542576, + "step": 101645 + }, + { + "epoch": 16.58238172920065, + "grad_norm": 0.0006362311542034149, + "learning_rate": 8.635091940128548e-05, + "loss": 0.0017, + "num_input_tokens_seen": 219553776, + "step": 101650 + }, + { + "epoch": 16.583197389885807, + "grad_norm": 0.0020954282954335213, + "learning_rate": 8.631093739691553e-05, + "loss": 0.0027, + "num_input_tokens_seen": 219565200, + "step": 101655 + }, + { + "epoch": 16.584013050570963, + "grad_norm": 0.16877447068691254, + "learning_rate": 8.627096377647898e-05, + "loss": 0.0057, + "num_input_tokens_seen": 219576976, + "step": 101660 + }, + { + "epoch": 16.58482871125612, + "grad_norm": 0.004283056128770113, + "learning_rate": 8.623099854078643e-05, + "loss": 0.0015, + "num_input_tokens_seen": 219588336, + "step": 101665 + }, + { + "epoch": 16.58564437194127, + "grad_norm": 0.03166608139872551, + "learning_rate": 8.619104169064734e-05, + "loss": 0.0021, + "num_input_tokens_seen": 219600144, + "step": 101670 + }, + { + "epoch": 16.586460032626427, + "grad_norm": 0.03157994523644447, + "learning_rate": 8.615109322687203e-05, + "loss": 0.0021, + "num_input_tokens_seen": 219610384, + "step": 101675 + }, + { + "epoch": 16.587275693311582, + "grad_norm": 0.0213120449334383, + "learning_rate": 8.611115315026951e-05, + "loss": 0.0063, + "num_input_tokens_seen": 219620880, + "step": 101680 + }, + { + "epoch": 16.588091353996738, + "grad_norm": 0.2438468486070633, + "learning_rate": 8.607122146164986e-05, + "loss": 0.009, + "num_input_tokens_seen": 219631856, + "step": 101685 + }, + { + "epoch": 16.588907014681894, + "grad_norm": 0.027045302093029022, + "learning_rate": 8.60312981618217e-05, + "loss": 0.0024, + "num_input_tokens_seen": 219643376, + "step": 101690 + }, + { + "epoch": 16.589722675367046, + "grad_norm": 0.013801125809550285, + "learning_rate": 8.599138325159472e-05, + "loss": 0.0033, + "num_input_tokens_seen": 219655248, + "step": 101695 + }, + { + "epoch": 16.5905383360522, + "grad_norm": 0.00023734763090033084, + "learning_rate": 8.595147673177728e-05, + "loss": 0.0838, + "num_input_tokens_seen": 219665360, + "step": 101700 + }, + { + "epoch": 16.591353996737357, + "grad_norm": 0.0007523433305323124, + "learning_rate": 8.591157860317871e-05, + "loss": 0.0011, + "num_input_tokens_seen": 219676848, + "step": 101705 + }, + { + "epoch": 16.592169657422513, + "grad_norm": 0.0046799443662166595, + "learning_rate": 8.587168886660707e-05, + "loss": 0.0008, + "num_input_tokens_seen": 219688272, + "step": 101710 + }, + { + "epoch": 16.59298531810767, + "grad_norm": 0.051697228103876114, + "learning_rate": 8.583180752287123e-05, + "loss": 0.0031, + "num_input_tokens_seen": 219698160, + "step": 101715 + }, + { + "epoch": 16.59380097879282, + "grad_norm": 0.0009116280707530677, + "learning_rate": 8.579193457277895e-05, + "loss": 0.0042, + "num_input_tokens_seen": 219708400, + "step": 101720 + }, + { + "epoch": 16.594616639477977, + "grad_norm": 0.001496818382292986, + "learning_rate": 8.575207001713875e-05, + "loss": 0.0041, + "num_input_tokens_seen": 219718640, + "step": 101725 + }, + { + "epoch": 16.595432300163132, + "grad_norm": 0.007261293474584818, + "learning_rate": 8.571221385675832e-05, + "loss": 0.0027, + "num_input_tokens_seen": 219730608, + "step": 101730 + }, + { + "epoch": 16.596247960848288, + "grad_norm": 0.07469271868467331, + "learning_rate": 8.567236609244544e-05, + "loss": 0.004, + "num_input_tokens_seen": 219740656, + "step": 101735 + }, + { + "epoch": 16.597063621533444, + "grad_norm": 0.005149087402969599, + "learning_rate": 8.563252672500771e-05, + "loss": 0.0118, + "num_input_tokens_seen": 219750896, + "step": 101740 + }, + { + "epoch": 16.597879282218596, + "grad_norm": 0.011800228618085384, + "learning_rate": 8.559269575525247e-05, + "loss": 0.0018, + "num_input_tokens_seen": 219762256, + "step": 101745 + }, + { + "epoch": 16.59869494290375, + "grad_norm": 0.01077636331319809, + "learning_rate": 8.555287318398697e-05, + "loss": 0.0302, + "num_input_tokens_seen": 219773104, + "step": 101750 + }, + { + "epoch": 16.599510603588907, + "grad_norm": 0.01085708662867546, + "learning_rate": 8.551305901201822e-05, + "loss": 0.0022, + "num_input_tokens_seen": 219784592, + "step": 101755 + }, + { + "epoch": 16.600326264274063, + "grad_norm": 0.01687583513557911, + "learning_rate": 8.54732532401532e-05, + "loss": 0.0041, + "num_input_tokens_seen": 219796112, + "step": 101760 + }, + { + "epoch": 16.601141924959215, + "grad_norm": 0.01624632440507412, + "learning_rate": 8.543345586919854e-05, + "loss": 0.0039, + "num_input_tokens_seen": 219806288, + "step": 101765 + }, + { + "epoch": 16.60195758564437, + "grad_norm": 0.013760429807007313, + "learning_rate": 8.53936668999608e-05, + "loss": 0.0012, + "num_input_tokens_seen": 219816496, + "step": 101770 + }, + { + "epoch": 16.602773246329527, + "grad_norm": 0.004715532064437866, + "learning_rate": 8.535388633324625e-05, + "loss": 0.0031, + "num_input_tokens_seen": 219828336, + "step": 101775 + }, + { + "epoch": 16.603588907014682, + "grad_norm": 0.0017586436588317156, + "learning_rate": 8.531411416986152e-05, + "loss": 0.0012, + "num_input_tokens_seen": 219839568, + "step": 101780 + }, + { + "epoch": 16.604404567699838, + "grad_norm": 0.0021014027297496796, + "learning_rate": 8.5274350410612e-05, + "loss": 0.0637, + "num_input_tokens_seen": 219849680, + "step": 101785 + }, + { + "epoch": 16.605220228384994, + "grad_norm": 0.007624736521393061, + "learning_rate": 8.523459505630415e-05, + "loss": 0.0013, + "num_input_tokens_seen": 219861232, + "step": 101790 + }, + { + "epoch": 16.606035889070146, + "grad_norm": 0.0007187062292359769, + "learning_rate": 8.51948481077432e-05, + "loss": 0.0009, + "num_input_tokens_seen": 219872624, + "step": 101795 + }, + { + "epoch": 16.6068515497553, + "grad_norm": 0.02473929524421692, + "learning_rate": 8.515510956573507e-05, + "loss": 0.0026, + "num_input_tokens_seen": 219883952, + "step": 101800 + }, + { + "epoch": 16.607667210440457, + "grad_norm": 0.004971285816282034, + "learning_rate": 8.511537943108466e-05, + "loss": 0.0316, + "num_input_tokens_seen": 219895408, + "step": 101805 + }, + { + "epoch": 16.608482871125613, + "grad_norm": 0.0360880121588707, + "learning_rate": 8.507565770459769e-05, + "loss": 0.0052, + "num_input_tokens_seen": 219906224, + "step": 101810 + }, + { + "epoch": 16.609298531810765, + "grad_norm": 0.001713123987428844, + "learning_rate": 8.503594438707856e-05, + "loss": 0.0013, + "num_input_tokens_seen": 219919248, + "step": 101815 + }, + { + "epoch": 16.61011419249592, + "grad_norm": 0.0006942551117390394, + "learning_rate": 8.499623947933276e-05, + "loss": 0.0011, + "num_input_tokens_seen": 219930704, + "step": 101820 + }, + { + "epoch": 16.610929853181077, + "grad_norm": 0.00033700844505801797, + "learning_rate": 8.495654298216438e-05, + "loss": 0.0009, + "num_input_tokens_seen": 219941232, + "step": 101825 + }, + { + "epoch": 16.611745513866232, + "grad_norm": 0.00037762854481115937, + "learning_rate": 8.49168548963784e-05, + "loss": 0.0014, + "num_input_tokens_seen": 219951472, + "step": 101830 + }, + { + "epoch": 16.612561174551388, + "grad_norm": 0.009005128405988216, + "learning_rate": 8.487717522277872e-05, + "loss": 0.0056, + "num_input_tokens_seen": 219961936, + "step": 101835 + }, + { + "epoch": 16.61337683523654, + "grad_norm": 0.0003597979375626892, + "learning_rate": 8.483750396216988e-05, + "loss": 0.0989, + "num_input_tokens_seen": 219972976, + "step": 101840 + }, + { + "epoch": 16.614192495921696, + "grad_norm": 0.009030995890498161, + "learning_rate": 8.479784111535549e-05, + "loss": 0.0047, + "num_input_tokens_seen": 219983760, + "step": 101845 + }, + { + "epoch": 16.61500815660685, + "grad_norm": 0.025649599730968475, + "learning_rate": 8.475818668313984e-05, + "loss": 0.0085, + "num_input_tokens_seen": 219993808, + "step": 101850 + }, + { + "epoch": 16.615823817292007, + "grad_norm": 0.0020423270761966705, + "learning_rate": 8.471854066632607e-05, + "loss": 0.0196, + "num_input_tokens_seen": 220004560, + "step": 101855 + }, + { + "epoch": 16.616639477977163, + "grad_norm": 0.000799459929112345, + "learning_rate": 8.467890306571795e-05, + "loss": 0.0015, + "num_input_tokens_seen": 220016816, + "step": 101860 + }, + { + "epoch": 16.617455138662315, + "grad_norm": 0.006219548638910055, + "learning_rate": 8.463927388211878e-05, + "loss": 0.0029, + "num_input_tokens_seen": 220028304, + "step": 101865 + }, + { + "epoch": 16.61827079934747, + "grad_norm": 0.010150066576898098, + "learning_rate": 8.459965311633161e-05, + "loss": 0.0124, + "num_input_tokens_seen": 220038768, + "step": 101870 + }, + { + "epoch": 16.619086460032626, + "grad_norm": 0.12095730751752853, + "learning_rate": 8.456004076915952e-05, + "loss": 0.0057, + "num_input_tokens_seen": 220049136, + "step": 101875 + }, + { + "epoch": 16.619902120717782, + "grad_norm": 0.05160725861787796, + "learning_rate": 8.452043684140514e-05, + "loss": 0.0016, + "num_input_tokens_seen": 220060048, + "step": 101880 + }, + { + "epoch": 16.620717781402938, + "grad_norm": 0.08182302862405777, + "learning_rate": 8.448084133387124e-05, + "loss": 0.0067, + "num_input_tokens_seen": 220071088, + "step": 101885 + }, + { + "epoch": 16.62153344208809, + "grad_norm": 0.005083407275378704, + "learning_rate": 8.444125424736016e-05, + "loss": 0.0007, + "num_input_tokens_seen": 220081712, + "step": 101890 + }, + { + "epoch": 16.622349102773246, + "grad_norm": 0.004309537820518017, + "learning_rate": 8.440167558267431e-05, + "loss": 0.0007, + "num_input_tokens_seen": 220092016, + "step": 101895 + }, + { + "epoch": 16.6231647634584, + "grad_norm": 0.18933819234371185, + "learning_rate": 8.436210534061567e-05, + "loss": 0.0073, + "num_input_tokens_seen": 220104464, + "step": 101900 + }, + { + "epoch": 16.623980424143557, + "grad_norm": 0.07430551201105118, + "learning_rate": 8.432254352198626e-05, + "loss": 0.0036, + "num_input_tokens_seen": 220115184, + "step": 101905 + }, + { + "epoch": 16.624796084828713, + "grad_norm": 0.046566374599933624, + "learning_rate": 8.428299012758778e-05, + "loss": 0.0054, + "num_input_tokens_seen": 220125104, + "step": 101910 + }, + { + "epoch": 16.625611745513865, + "grad_norm": 0.047151170670986176, + "learning_rate": 8.424344515822197e-05, + "loss": 0.0031, + "num_input_tokens_seen": 220136496, + "step": 101915 + }, + { + "epoch": 16.62642740619902, + "grad_norm": 0.0035677056293934584, + "learning_rate": 8.420390861468996e-05, + "loss": 0.0015, + "num_input_tokens_seen": 220146928, + "step": 101920 + }, + { + "epoch": 16.627243066884176, + "grad_norm": 0.0034210113808512688, + "learning_rate": 8.416438049779351e-05, + "loss": 0.0005, + "num_input_tokens_seen": 220156752, + "step": 101925 + }, + { + "epoch": 16.628058727569332, + "grad_norm": 0.0017582608852535486, + "learning_rate": 8.412486080833315e-05, + "loss": 0.0005, + "num_input_tokens_seen": 220168048, + "step": 101930 + }, + { + "epoch": 16.628874388254488, + "grad_norm": 0.0009029234643094242, + "learning_rate": 8.408534954711034e-05, + "loss": 0.1427, + "num_input_tokens_seen": 220178448, + "step": 101935 + }, + { + "epoch": 16.62969004893964, + "grad_norm": 0.8179956078529358, + "learning_rate": 8.404584671492526e-05, + "loss": 0.0238, + "num_input_tokens_seen": 220189488, + "step": 101940 + }, + { + "epoch": 16.630505709624796, + "grad_norm": 0.0009043613681569695, + "learning_rate": 8.400635231257902e-05, + "loss": 0.0212, + "num_input_tokens_seen": 220201008, + "step": 101945 + }, + { + "epoch": 16.63132137030995, + "grad_norm": 0.043483562767505646, + "learning_rate": 8.396686634087159e-05, + "loss": 0.0069, + "num_input_tokens_seen": 220212464, + "step": 101950 + }, + { + "epoch": 16.632137030995107, + "grad_norm": 0.02974863536655903, + "learning_rate": 8.392738880060358e-05, + "loss": 0.0283, + "num_input_tokens_seen": 220221744, + "step": 101955 + }, + { + "epoch": 16.63295269168026, + "grad_norm": 0.036095499992370605, + "learning_rate": 8.388791969257458e-05, + "loss": 0.0014, + "num_input_tokens_seen": 220233232, + "step": 101960 + }, + { + "epoch": 16.633768352365415, + "grad_norm": 0.0013275218661874533, + "learning_rate": 8.384845901758498e-05, + "loss": 0.0014, + "num_input_tokens_seen": 220244240, + "step": 101965 + }, + { + "epoch": 16.63458401305057, + "grad_norm": 0.0053516267798841, + "learning_rate": 8.380900677643421e-05, + "loss": 0.0011, + "num_input_tokens_seen": 220255536, + "step": 101970 + }, + { + "epoch": 16.635399673735726, + "grad_norm": 0.0003697921638377011, + "learning_rate": 8.376956296992195e-05, + "loss": 0.0031, + "num_input_tokens_seen": 220267728, + "step": 101975 + }, + { + "epoch": 16.636215334420882, + "grad_norm": 0.0016742395237088203, + "learning_rate": 8.373012759884746e-05, + "loss": 0.0466, + "num_input_tokens_seen": 220279088, + "step": 101980 + }, + { + "epoch": 16.637030995106034, + "grad_norm": 0.01789051480591297, + "learning_rate": 8.369070066401003e-05, + "loss": 0.0015, + "num_input_tokens_seen": 220289296, + "step": 101985 + }, + { + "epoch": 16.63784665579119, + "grad_norm": 0.008225271478295326, + "learning_rate": 8.365128216620871e-05, + "loss": 0.001, + "num_input_tokens_seen": 220297744, + "step": 101990 + }, + { + "epoch": 16.638662316476346, + "grad_norm": 0.07070305198431015, + "learning_rate": 8.361187210624232e-05, + "loss": 0.0183, + "num_input_tokens_seen": 220309712, + "step": 101995 + }, + { + "epoch": 16.6394779771615, + "grad_norm": 3.6314432621002197, + "learning_rate": 8.357247048490957e-05, + "loss": 0.0253, + "num_input_tokens_seen": 220320848, + "step": 102000 + }, + { + "epoch": 16.640293637846657, + "grad_norm": 0.003030292922630906, + "learning_rate": 8.353307730300897e-05, + "loss": 0.0011, + "num_input_tokens_seen": 220331248, + "step": 102005 + }, + { + "epoch": 16.64110929853181, + "grad_norm": 0.0020804372616112232, + "learning_rate": 8.349369256133888e-05, + "loss": 0.0684, + "num_input_tokens_seen": 220341488, + "step": 102010 + }, + { + "epoch": 16.641924959216965, + "grad_norm": 0.008241880685091019, + "learning_rate": 8.345431626069744e-05, + "loss": 0.0013, + "num_input_tokens_seen": 220352656, + "step": 102015 + }, + { + "epoch": 16.64274061990212, + "grad_norm": 0.012514442205429077, + "learning_rate": 8.34149484018828e-05, + "loss": 0.0595, + "num_input_tokens_seen": 220364080, + "step": 102020 + }, + { + "epoch": 16.643556280587276, + "grad_norm": 0.013286514207720757, + "learning_rate": 8.337558898569264e-05, + "loss": 0.001, + "num_input_tokens_seen": 220375056, + "step": 102025 + }, + { + "epoch": 16.644371941272432, + "grad_norm": 0.003999212756752968, + "learning_rate": 8.333623801292472e-05, + "loss": 0.002, + "num_input_tokens_seen": 220386832, + "step": 102030 + }, + { + "epoch": 16.645187601957584, + "grad_norm": 0.013726749457418919, + "learning_rate": 8.329689548437652e-05, + "loss": 0.0026, + "num_input_tokens_seen": 220397776, + "step": 102035 + }, + { + "epoch": 16.64600326264274, + "grad_norm": 0.029906732961535454, + "learning_rate": 8.325756140084533e-05, + "loss": 0.0021, + "num_input_tokens_seen": 220408528, + "step": 102040 + }, + { + "epoch": 16.646818923327896, + "grad_norm": 0.0009055176051333547, + "learning_rate": 8.321823576312837e-05, + "loss": 0.0022, + "num_input_tokens_seen": 220419888, + "step": 102045 + }, + { + "epoch": 16.64763458401305, + "grad_norm": 0.009986934252083302, + "learning_rate": 8.317891857202253e-05, + "loss": 0.0007, + "num_input_tokens_seen": 220431056, + "step": 102050 + }, + { + "epoch": 16.648450244698207, + "grad_norm": 0.10870281606912613, + "learning_rate": 8.313960982832475e-05, + "loss": 0.0024, + "num_input_tokens_seen": 220441360, + "step": 102055 + }, + { + "epoch": 16.64926590538336, + "grad_norm": 0.03312503546476364, + "learning_rate": 8.310030953283154e-05, + "loss": 0.0016, + "num_input_tokens_seen": 220452528, + "step": 102060 + }, + { + "epoch": 16.650081566068515, + "grad_norm": 0.0009839548729360104, + "learning_rate": 8.30610176863394e-05, + "loss": 0.0014, + "num_input_tokens_seen": 220463600, + "step": 102065 + }, + { + "epoch": 16.65089722675367, + "grad_norm": 0.008478997275233269, + "learning_rate": 8.302173428964472e-05, + "loss": 0.0021, + "num_input_tokens_seen": 220474256, + "step": 102070 + }, + { + "epoch": 16.651712887438826, + "grad_norm": 0.00024375740031246096, + "learning_rate": 8.298245934354353e-05, + "loss": 0.0211, + "num_input_tokens_seen": 220484912, + "step": 102075 + }, + { + "epoch": 16.652528548123982, + "grad_norm": 0.007510208059102297, + "learning_rate": 8.29431928488319e-05, + "loss": 0.0009, + "num_input_tokens_seen": 220495280, + "step": 102080 + }, + { + "epoch": 16.653344208809134, + "grad_norm": 0.01156105101108551, + "learning_rate": 8.290393480630549e-05, + "loss": 0.0029, + "num_input_tokens_seen": 220505840, + "step": 102085 + }, + { + "epoch": 16.65415986949429, + "grad_norm": 0.0036057571414858103, + "learning_rate": 8.286468521676e-05, + "loss": 0.0031, + "num_input_tokens_seen": 220517584, + "step": 102090 + }, + { + "epoch": 16.654975530179446, + "grad_norm": 0.023752061650156975, + "learning_rate": 8.282544408099079e-05, + "loss": 0.0027, + "num_input_tokens_seen": 220528464, + "step": 102095 + }, + { + "epoch": 16.6557911908646, + "grad_norm": 0.0068184020929038525, + "learning_rate": 8.278621139979325e-05, + "loss": 0.001, + "num_input_tokens_seen": 220538864, + "step": 102100 + }, + { + "epoch": 16.656606851549757, + "grad_norm": 0.011158975772559643, + "learning_rate": 8.274698717396234e-05, + "loss": 0.006, + "num_input_tokens_seen": 220549264, + "step": 102105 + }, + { + "epoch": 16.65742251223491, + "grad_norm": 0.3075798451900482, + "learning_rate": 8.270777140429308e-05, + "loss": 0.0094, + "num_input_tokens_seen": 220560112, + "step": 102110 + }, + { + "epoch": 16.658238172920065, + "grad_norm": 0.05352885648608208, + "learning_rate": 8.266856409158025e-05, + "loss": 0.0013, + "num_input_tokens_seen": 220570960, + "step": 102115 + }, + { + "epoch": 16.65905383360522, + "grad_norm": 0.027840284630656242, + "learning_rate": 8.262936523661835e-05, + "loss": 0.0027, + "num_input_tokens_seen": 220582096, + "step": 102120 + }, + { + "epoch": 16.659869494290376, + "grad_norm": 0.07525905966758728, + "learning_rate": 8.259017484020181e-05, + "loss": 0.0033, + "num_input_tokens_seen": 220592016, + "step": 102125 + }, + { + "epoch": 16.660685154975532, + "grad_norm": 0.14387553930282593, + "learning_rate": 8.255099290312495e-05, + "loss": 0.006, + "num_input_tokens_seen": 220601936, + "step": 102130 + }, + { + "epoch": 16.661500815660684, + "grad_norm": 0.056067511439323425, + "learning_rate": 8.251181942618174e-05, + "loss": 0.0031, + "num_input_tokens_seen": 220612048, + "step": 102135 + }, + { + "epoch": 16.66231647634584, + "grad_norm": 0.002062713261693716, + "learning_rate": 8.247265441016621e-05, + "loss": 0.0008, + "num_input_tokens_seen": 220622608, + "step": 102140 + }, + { + "epoch": 16.663132137030995, + "grad_norm": 0.001355032087303698, + "learning_rate": 8.243349785587195e-05, + "loss": 0.0038, + "num_input_tokens_seen": 220633264, + "step": 102145 + }, + { + "epoch": 16.66394779771615, + "grad_norm": 0.018915260210633278, + "learning_rate": 8.23943497640926e-05, + "loss": 0.0021, + "num_input_tokens_seen": 220643696, + "step": 102150 + }, + { + "epoch": 16.664763458401303, + "grad_norm": 0.0024553509429097176, + "learning_rate": 8.235521013562148e-05, + "loss": 0.0262, + "num_input_tokens_seen": 220655248, + "step": 102155 + }, + { + "epoch": 16.66557911908646, + "grad_norm": 0.004102553240954876, + "learning_rate": 8.231607897125188e-05, + "loss": 0.0022, + "num_input_tokens_seen": 220664816, + "step": 102160 + }, + { + "epoch": 16.666394779771615, + "grad_norm": 0.001667728298343718, + "learning_rate": 8.227695627177678e-05, + "loss": 0.0028, + "num_input_tokens_seen": 220674448, + "step": 102165 + }, + { + "epoch": 16.66721044045677, + "grad_norm": 0.003264149883762002, + "learning_rate": 8.223784203798912e-05, + "loss": 0.0034, + "num_input_tokens_seen": 220685936, + "step": 102170 + }, + { + "epoch": 16.668026101141926, + "grad_norm": 0.023338552564382553, + "learning_rate": 8.219873627068141e-05, + "loss": 0.005, + "num_input_tokens_seen": 220696976, + "step": 102175 + }, + { + "epoch": 16.66884176182708, + "grad_norm": 0.0016530726570636034, + "learning_rate": 8.21596389706466e-05, + "loss": 0.0054, + "num_input_tokens_seen": 220707504, + "step": 102180 + }, + { + "epoch": 16.669657422512234, + "grad_norm": 0.003783087246119976, + "learning_rate": 8.212055013867654e-05, + "loss": 0.0005, + "num_input_tokens_seen": 220718928, + "step": 102185 + }, + { + "epoch": 16.67047308319739, + "grad_norm": 0.0009449265198782086, + "learning_rate": 8.208146977556386e-05, + "loss": 0.0008, + "num_input_tokens_seen": 220728784, + "step": 102190 + }, + { + "epoch": 16.671288743882545, + "grad_norm": 0.0008899805252440274, + "learning_rate": 8.204239788210011e-05, + "loss": 0.002, + "num_input_tokens_seen": 220740688, + "step": 102195 + }, + { + "epoch": 16.6721044045677, + "grad_norm": 0.000524374539963901, + "learning_rate": 8.200333445907766e-05, + "loss": 0.0033, + "num_input_tokens_seen": 220750000, + "step": 102200 + }, + { + "epoch": 16.672920065252853, + "grad_norm": 0.0010391840478405356, + "learning_rate": 8.196427950728763e-05, + "loss": 0.0019, + "num_input_tokens_seen": 220761136, + "step": 102205 + }, + { + "epoch": 16.67373572593801, + "grad_norm": 0.006226594094187021, + "learning_rate": 8.192523302752192e-05, + "loss": 0.0011, + "num_input_tokens_seen": 220771856, + "step": 102210 + }, + { + "epoch": 16.674551386623165, + "grad_norm": 0.0289426501840353, + "learning_rate": 8.188619502057176e-05, + "loss": 0.0014, + "num_input_tokens_seen": 220782512, + "step": 102215 + }, + { + "epoch": 16.67536704730832, + "grad_norm": 0.03636833652853966, + "learning_rate": 8.184716548722825e-05, + "loss": 0.0035, + "num_input_tokens_seen": 220793552, + "step": 102220 + }, + { + "epoch": 16.676182707993476, + "grad_norm": 0.005582255311310291, + "learning_rate": 8.180814442828238e-05, + "loss": 0.0568, + "num_input_tokens_seen": 220803792, + "step": 102225 + }, + { + "epoch": 16.67699836867863, + "grad_norm": 0.017085885629057884, + "learning_rate": 8.1769131844525e-05, + "loss": 0.002, + "num_input_tokens_seen": 220814896, + "step": 102230 + }, + { + "epoch": 16.677814029363784, + "grad_norm": 0.22158437967300415, + "learning_rate": 8.173012773674671e-05, + "loss": 0.0081, + "num_input_tokens_seen": 220825264, + "step": 102235 + }, + { + "epoch": 16.67862969004894, + "grad_norm": 0.013059341348707676, + "learning_rate": 8.169113210573803e-05, + "loss": 0.003, + "num_input_tokens_seen": 220835824, + "step": 102240 + }, + { + "epoch": 16.679445350734095, + "grad_norm": 0.012183960527181625, + "learning_rate": 8.165214495228918e-05, + "loss": 0.0026, + "num_input_tokens_seen": 220847536, + "step": 102245 + }, + { + "epoch": 16.68026101141925, + "grad_norm": 0.001767508452758193, + "learning_rate": 8.161316627719035e-05, + "loss": 0.0591, + "num_input_tokens_seen": 220858256, + "step": 102250 + }, + { + "epoch": 16.681076672104403, + "grad_norm": 0.01171040441840887, + "learning_rate": 8.157419608123145e-05, + "loss": 0.0016, + "num_input_tokens_seen": 220868784, + "step": 102255 + }, + { + "epoch": 16.68189233278956, + "grad_norm": 0.008576060645282269, + "learning_rate": 8.153523436520226e-05, + "loss": 0.0025, + "num_input_tokens_seen": 220880656, + "step": 102260 + }, + { + "epoch": 16.682707993474715, + "grad_norm": 0.07342652976512909, + "learning_rate": 8.149628112989243e-05, + "loss": 0.0014, + "num_input_tokens_seen": 220890448, + "step": 102265 + }, + { + "epoch": 16.68352365415987, + "grad_norm": 0.008088194765150547, + "learning_rate": 8.145733637609137e-05, + "loss": 0.0012, + "num_input_tokens_seen": 220901040, + "step": 102270 + }, + { + "epoch": 16.684339314845026, + "grad_norm": 0.0017258359584957361, + "learning_rate": 8.141840010458835e-05, + "loss": 0.0012, + "num_input_tokens_seen": 220911536, + "step": 102275 + }, + { + "epoch": 16.68515497553018, + "grad_norm": 0.4991852343082428, + "learning_rate": 8.137947231617237e-05, + "loss": 0.0058, + "num_input_tokens_seen": 220923056, + "step": 102280 + }, + { + "epoch": 16.685970636215334, + "grad_norm": 0.021041272208094597, + "learning_rate": 8.134055301163263e-05, + "loss": 0.0041, + "num_input_tokens_seen": 220933712, + "step": 102285 + }, + { + "epoch": 16.68678629690049, + "grad_norm": 0.0014039005618542433, + "learning_rate": 8.130164219175745e-05, + "loss": 0.0015, + "num_input_tokens_seen": 220945968, + "step": 102290 + }, + { + "epoch": 16.687601957585645, + "grad_norm": 0.0002282148489030078, + "learning_rate": 8.126273985733595e-05, + "loss": 0.0004, + "num_input_tokens_seen": 220956112, + "step": 102295 + }, + { + "epoch": 16.6884176182708, + "grad_norm": 0.0002533920051064342, + "learning_rate": 8.122384600915594e-05, + "loss": 0.0031, + "num_input_tokens_seen": 220966896, + "step": 102300 + }, + { + "epoch": 16.689233278955953, + "grad_norm": 0.0018143865745514631, + "learning_rate": 8.118496064800618e-05, + "loss": 0.0009, + "num_input_tokens_seen": 220977936, + "step": 102305 + }, + { + "epoch": 16.69004893964111, + "grad_norm": 0.02916126139461994, + "learning_rate": 8.11460837746743e-05, + "loss": 0.002, + "num_input_tokens_seen": 220989264, + "step": 102310 + }, + { + "epoch": 16.690864600326265, + "grad_norm": 0.024176111444830894, + "learning_rate": 8.110721538994859e-05, + "loss": 0.0031, + "num_input_tokens_seen": 220999600, + "step": 102315 + }, + { + "epoch": 16.69168026101142, + "grad_norm": 0.004567863419651985, + "learning_rate": 8.106835549461633e-05, + "loss": 0.0037, + "num_input_tokens_seen": 221010640, + "step": 102320 + }, + { + "epoch": 16.692495921696576, + "grad_norm": 0.0015767280710861087, + "learning_rate": 8.102950408946552e-05, + "loss": 0.0011, + "num_input_tokens_seen": 221021200, + "step": 102325 + }, + { + "epoch": 16.693311582381728, + "grad_norm": 1.0787469148635864, + "learning_rate": 8.099066117528308e-05, + "loss": 0.0256, + "num_input_tokens_seen": 221033968, + "step": 102330 + }, + { + "epoch": 16.694127243066884, + "grad_norm": 0.18266649544239044, + "learning_rate": 8.095182675285673e-05, + "loss": 0.0064, + "num_input_tokens_seen": 221043632, + "step": 102335 + }, + { + "epoch": 16.69494290375204, + "grad_norm": 0.0012430261122062802, + "learning_rate": 8.091300082297293e-05, + "loss": 0.0018, + "num_input_tokens_seen": 221054032, + "step": 102340 + }, + { + "epoch": 16.695758564437195, + "grad_norm": 0.0010876876767724752, + "learning_rate": 8.087418338641906e-05, + "loss": 0.001, + "num_input_tokens_seen": 221065072, + "step": 102345 + }, + { + "epoch": 16.696574225122347, + "grad_norm": 0.002364259911701083, + "learning_rate": 8.083537444398131e-05, + "loss": 0.0016, + "num_input_tokens_seen": 221074864, + "step": 102350 + }, + { + "epoch": 16.697389885807503, + "grad_norm": 0.00427822582423687, + "learning_rate": 8.079657399644664e-05, + "loss": 0.0054, + "num_input_tokens_seen": 221084784, + "step": 102355 + }, + { + "epoch": 16.69820554649266, + "grad_norm": 0.014106487855315208, + "learning_rate": 8.07577820446011e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221095248, + "step": 102360 + }, + { + "epoch": 16.699021207177815, + "grad_norm": 0.014190200716257095, + "learning_rate": 8.071899858923098e-05, + "loss": 0.0316, + "num_input_tokens_seen": 221105904, + "step": 102365 + }, + { + "epoch": 16.69983686786297, + "grad_norm": 0.0014781900681555271, + "learning_rate": 8.068022363112227e-05, + "loss": 0.0009, + "num_input_tokens_seen": 221116368, + "step": 102370 + }, + { + "epoch": 16.700652528548122, + "grad_norm": 0.005353093612939119, + "learning_rate": 8.064145717106075e-05, + "loss": 0.0816, + "num_input_tokens_seen": 221127472, + "step": 102375 + }, + { + "epoch": 16.701468189233278, + "grad_norm": 0.0054572150111198425, + "learning_rate": 8.06026992098321e-05, + "loss": 0.012, + "num_input_tokens_seen": 221139280, + "step": 102380 + }, + { + "epoch": 16.702283849918434, + "grad_norm": 0.00832337699830532, + "learning_rate": 8.056394974822185e-05, + "loss": 0.0045, + "num_input_tokens_seen": 221150416, + "step": 102385 + }, + { + "epoch": 16.70309951060359, + "grad_norm": 0.017912698909640312, + "learning_rate": 8.052520878701519e-05, + "loss": 0.002, + "num_input_tokens_seen": 221161296, + "step": 102390 + }, + { + "epoch": 16.703915171288745, + "grad_norm": 0.0015576289733871818, + "learning_rate": 8.04864763269973e-05, + "loss": 0.0005, + "num_input_tokens_seen": 221170480, + "step": 102395 + }, + { + "epoch": 16.704730831973897, + "grad_norm": 0.004138452000916004, + "learning_rate": 8.044775236895319e-05, + "loss": 0.0031, + "num_input_tokens_seen": 221179408, + "step": 102400 + }, + { + "epoch": 16.705546492659053, + "grad_norm": 0.005685943178832531, + "learning_rate": 8.040903691366753e-05, + "loss": 0.0016, + "num_input_tokens_seen": 221190192, + "step": 102405 + }, + { + "epoch": 16.70636215334421, + "grad_norm": 0.0030472618527710438, + "learning_rate": 8.037032996192522e-05, + "loss": 0.0003, + "num_input_tokens_seen": 221200912, + "step": 102410 + }, + { + "epoch": 16.707177814029365, + "grad_norm": 0.0015341610414907336, + "learning_rate": 8.033163151451028e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221211920, + "step": 102415 + }, + { + "epoch": 16.70799347471452, + "grad_norm": 0.004777231719344854, + "learning_rate": 8.029294157220746e-05, + "loss": 0.0181, + "num_input_tokens_seen": 221223280, + "step": 102420 + }, + { + "epoch": 16.708809135399672, + "grad_norm": 0.002837100997567177, + "learning_rate": 8.025426013580033e-05, + "loss": 0.0018, + "num_input_tokens_seen": 221234480, + "step": 102425 + }, + { + "epoch": 16.709624796084828, + "grad_norm": 0.0010288365883752704, + "learning_rate": 8.021558720607342e-05, + "loss": 0.0042, + "num_input_tokens_seen": 221243984, + "step": 102430 + }, + { + "epoch": 16.710440456769984, + "grad_norm": 0.0010139404330402613, + "learning_rate": 8.01769227838099e-05, + "loss": 0.0007, + "num_input_tokens_seen": 221255536, + "step": 102435 + }, + { + "epoch": 16.71125611745514, + "grad_norm": 0.002217318629845977, + "learning_rate": 8.013826686979381e-05, + "loss": 0.0004, + "num_input_tokens_seen": 221266832, + "step": 102440 + }, + { + "epoch": 16.712071778140295, + "grad_norm": 0.0004526945995166898, + "learning_rate": 8.00996194648082e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221277616, + "step": 102445 + }, + { + "epoch": 16.712887438825447, + "grad_norm": 0.01151892077177763, + "learning_rate": 8.006098056963668e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221289360, + "step": 102450 + }, + { + "epoch": 16.713703099510603, + "grad_norm": 0.002219117246568203, + "learning_rate": 8.002235018506194e-05, + "loss": 0.0009, + "num_input_tokens_seen": 221301104, + "step": 102455 + }, + { + "epoch": 16.71451876019576, + "grad_norm": 0.002388479420915246, + "learning_rate": 7.998372831186723e-05, + "loss": 0.0022, + "num_input_tokens_seen": 221311824, + "step": 102460 + }, + { + "epoch": 16.715334420880914, + "grad_norm": 0.2429102659225464, + "learning_rate": 7.99451149508349e-05, + "loss": 0.0069, + "num_input_tokens_seen": 221322192, + "step": 102465 + }, + { + "epoch": 16.71615008156607, + "grad_norm": 0.028374364599585533, + "learning_rate": 7.990651010274791e-05, + "loss": 0.0017, + "num_input_tokens_seen": 221333552, + "step": 102470 + }, + { + "epoch": 16.716965742251222, + "grad_norm": 0.025974059477448463, + "learning_rate": 7.98679137683882e-05, + "loss": 0.0024, + "num_input_tokens_seen": 221344368, + "step": 102475 + }, + { + "epoch": 16.717781402936378, + "grad_norm": 0.01371039729565382, + "learning_rate": 7.982932594853837e-05, + "loss": 0.0029, + "num_input_tokens_seen": 221355792, + "step": 102480 + }, + { + "epoch": 16.718597063621534, + "grad_norm": 0.0056204842403531075, + "learning_rate": 7.979074664398012e-05, + "loss": 0.0037, + "num_input_tokens_seen": 221365520, + "step": 102485 + }, + { + "epoch": 16.71941272430669, + "grad_norm": 0.00258276448585093, + "learning_rate": 7.975217585549566e-05, + "loss": 0.0026, + "num_input_tokens_seen": 221375856, + "step": 102490 + }, + { + "epoch": 16.72022838499184, + "grad_norm": 0.003028827253729105, + "learning_rate": 7.97136135838662e-05, + "loss": 0.0009, + "num_input_tokens_seen": 221386064, + "step": 102495 + }, + { + "epoch": 16.721044045676997, + "grad_norm": 0.027169395238161087, + "learning_rate": 7.967505982987372e-05, + "loss": 0.0087, + "num_input_tokens_seen": 221397392, + "step": 102500 + }, + { + "epoch": 16.721859706362153, + "grad_norm": 0.00019676069496199489, + "learning_rate": 7.963651459429932e-05, + "loss": 0.0019, + "num_input_tokens_seen": 221406288, + "step": 102505 + }, + { + "epoch": 16.72267536704731, + "grad_norm": 0.0002667378284968436, + "learning_rate": 7.959797787792428e-05, + "loss": 0.1659, + "num_input_tokens_seen": 221416720, + "step": 102510 + }, + { + "epoch": 16.723491027732464, + "grad_norm": 0.0024046706967055798, + "learning_rate": 7.955944968152951e-05, + "loss": 0.0011, + "num_input_tokens_seen": 221427536, + "step": 102515 + }, + { + "epoch": 16.724306688417617, + "grad_norm": 0.011399206705391407, + "learning_rate": 7.952093000589583e-05, + "loss": 0.0009, + "num_input_tokens_seen": 221437712, + "step": 102520 + }, + { + "epoch": 16.725122349102772, + "grad_norm": 0.041121955960989, + "learning_rate": 7.948241885180396e-05, + "loss": 0.0017, + "num_input_tokens_seen": 221448720, + "step": 102525 + }, + { + "epoch": 16.725938009787928, + "grad_norm": 0.0030209736432880163, + "learning_rate": 7.944391622003427e-05, + "loss": 0.0007, + "num_input_tokens_seen": 221460016, + "step": 102530 + }, + { + "epoch": 16.726753670473084, + "grad_norm": 0.004665852524340153, + "learning_rate": 7.94054221113672e-05, + "loss": 0.0005, + "num_input_tokens_seen": 221470128, + "step": 102535 + }, + { + "epoch": 16.72756933115824, + "grad_norm": 0.0013338279677554965, + "learning_rate": 7.936693652658278e-05, + "loss": 0.0003, + "num_input_tokens_seen": 221481488, + "step": 102540 + }, + { + "epoch": 16.72838499184339, + "grad_norm": 0.04637147858738899, + "learning_rate": 7.9328459466461e-05, + "loss": 0.0025, + "num_input_tokens_seen": 221492912, + "step": 102545 + }, + { + "epoch": 16.729200652528547, + "grad_norm": 0.0005551987560465932, + "learning_rate": 7.928999093178157e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221503856, + "step": 102550 + }, + { + "epoch": 16.730016313213703, + "grad_norm": 0.003815334988757968, + "learning_rate": 7.925153092332438e-05, + "loss": 0.0021, + "num_input_tokens_seen": 221514736, + "step": 102555 + }, + { + "epoch": 16.73083197389886, + "grad_norm": 0.0010202974081039429, + "learning_rate": 7.921307944186845e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221526256, + "step": 102560 + }, + { + "epoch": 16.731647634584014, + "grad_norm": 0.0013838201994076371, + "learning_rate": 7.91746364881935e-05, + "loss": 0.0016, + "num_input_tokens_seen": 221537168, + "step": 102565 + }, + { + "epoch": 16.732463295269167, + "grad_norm": 0.0036898739635944366, + "learning_rate": 7.913620206307814e-05, + "loss": 0.0022, + "num_input_tokens_seen": 221547632, + "step": 102570 + }, + { + "epoch": 16.733278955954322, + "grad_norm": 0.0027934997342526913, + "learning_rate": 7.909777616730185e-05, + "loss": 0.0014, + "num_input_tokens_seen": 221558832, + "step": 102575 + }, + { + "epoch": 16.734094616639478, + "grad_norm": 0.0016968920826911926, + "learning_rate": 7.905935880164278e-05, + "loss": 0.0029, + "num_input_tokens_seen": 221569136, + "step": 102580 + }, + { + "epoch": 16.734910277324634, + "grad_norm": 0.10568311810493469, + "learning_rate": 7.902094996688009e-05, + "loss": 0.0028, + "num_input_tokens_seen": 221579824, + "step": 102585 + }, + { + "epoch": 16.73572593800979, + "grad_norm": 0.003486029338091612, + "learning_rate": 7.89825496637916e-05, + "loss": 0.0006, + "num_input_tokens_seen": 221591632, + "step": 102590 + }, + { + "epoch": 16.73654159869494, + "grad_norm": 0.038367435336112976, + "learning_rate": 7.894415789315612e-05, + "loss": 0.0034, + "num_input_tokens_seen": 221600624, + "step": 102595 + }, + { + "epoch": 16.737357259380097, + "grad_norm": 0.01631920039653778, + "learning_rate": 7.890577465575121e-05, + "loss": 0.0204, + "num_input_tokens_seen": 221612016, + "step": 102600 + }, + { + "epoch": 16.738172920065253, + "grad_norm": 0.019952211529016495, + "learning_rate": 7.886739995235504e-05, + "loss": 0.0027, + "num_input_tokens_seen": 221623184, + "step": 102605 + }, + { + "epoch": 16.73898858075041, + "grad_norm": 0.001536556170322001, + "learning_rate": 7.882903378374528e-05, + "loss": 0.0004, + "num_input_tokens_seen": 221633520, + "step": 102610 + }, + { + "epoch": 16.739804241435564, + "grad_norm": 0.019367242231965065, + "learning_rate": 7.879067615069946e-05, + "loss": 0.0014, + "num_input_tokens_seen": 221646256, + "step": 102615 + }, + { + "epoch": 16.740619902120716, + "grad_norm": 0.2488294392824173, + "learning_rate": 7.875232705399488e-05, + "loss": 0.0085, + "num_input_tokens_seen": 221657232, + "step": 102620 + }, + { + "epoch": 16.741435562805872, + "grad_norm": 0.012795425951480865, + "learning_rate": 7.871398649440886e-05, + "loss": 0.0023, + "num_input_tokens_seen": 221669520, + "step": 102625 + }, + { + "epoch": 16.742251223491028, + "grad_norm": 0.9706597328186035, + "learning_rate": 7.867565447271829e-05, + "loss": 0.0807, + "num_input_tokens_seen": 221680144, + "step": 102630 + }, + { + "epoch": 16.743066884176184, + "grad_norm": 0.062246449291706085, + "learning_rate": 7.863733098970006e-05, + "loss": 0.0026, + "num_input_tokens_seen": 221690672, + "step": 102635 + }, + { + "epoch": 16.74388254486134, + "grad_norm": 0.0013690270716324449, + "learning_rate": 7.85990160461309e-05, + "loss": 0.004, + "num_input_tokens_seen": 221701296, + "step": 102640 + }, + { + "epoch": 16.74469820554649, + "grad_norm": 0.0025680058170109987, + "learning_rate": 7.856070964278722e-05, + "loss": 0.0009, + "num_input_tokens_seen": 221712816, + "step": 102645 + }, + { + "epoch": 16.745513866231647, + "grad_norm": 0.00345767755061388, + "learning_rate": 7.852241178044539e-05, + "loss": 0.0077, + "num_input_tokens_seen": 221724656, + "step": 102650 + }, + { + "epoch": 16.746329526916803, + "grad_norm": 0.011416045017540455, + "learning_rate": 7.848412245988157e-05, + "loss": 0.0007, + "num_input_tokens_seen": 221735568, + "step": 102655 + }, + { + "epoch": 16.74714518760196, + "grad_norm": 0.00027606345247477293, + "learning_rate": 7.84458416818718e-05, + "loss": 0.0062, + "num_input_tokens_seen": 221746448, + "step": 102660 + }, + { + "epoch": 16.747960848287114, + "grad_norm": 0.030321862548589706, + "learning_rate": 7.840756944719174e-05, + "loss": 0.0532, + "num_input_tokens_seen": 221757584, + "step": 102665 + }, + { + "epoch": 16.748776508972266, + "grad_norm": 0.0008409228757955134, + "learning_rate": 7.836930575661716e-05, + "loss": 0.0013, + "num_input_tokens_seen": 221769296, + "step": 102670 + }, + { + "epoch": 16.749592169657422, + "grad_norm": 0.02600860223174095, + "learning_rate": 7.83310506109235e-05, + "loss": 0.0053, + "num_input_tokens_seen": 221780400, + "step": 102675 + }, + { + "epoch": 16.750407830342578, + "grad_norm": 0.0010515855392441154, + "learning_rate": 7.829280401088601e-05, + "loss": 0.0036, + "num_input_tokens_seen": 221791312, + "step": 102680 + }, + { + "epoch": 16.751223491027734, + "grad_norm": 0.07031827419996262, + "learning_rate": 7.82545659572798e-05, + "loss": 0.002, + "num_input_tokens_seen": 221802192, + "step": 102685 + }, + { + "epoch": 16.752039151712886, + "grad_norm": 0.004942765459418297, + "learning_rate": 7.821633645087984e-05, + "loss": 0.002, + "num_input_tokens_seen": 221813520, + "step": 102690 + }, + { + "epoch": 16.75285481239804, + "grad_norm": 1.208173155784607, + "learning_rate": 7.817811549246079e-05, + "loss": 0.0556, + "num_input_tokens_seen": 221823376, + "step": 102695 + }, + { + "epoch": 16.753670473083197, + "grad_norm": 0.0008134068921208382, + "learning_rate": 7.813990308279755e-05, + "loss": 0.0021, + "num_input_tokens_seen": 221834352, + "step": 102700 + }, + { + "epoch": 16.754486133768353, + "grad_norm": 0.00394340418279171, + "learning_rate": 7.810169922266413e-05, + "loss": 0.0025, + "num_input_tokens_seen": 221845168, + "step": 102705 + }, + { + "epoch": 16.75530179445351, + "grad_norm": 0.041125066578388214, + "learning_rate": 7.806350391283507e-05, + "loss": 0.0022, + "num_input_tokens_seen": 221856112, + "step": 102710 + }, + { + "epoch": 16.75611745513866, + "grad_norm": 0.3884848356246948, + "learning_rate": 7.80253171540844e-05, + "loss": 0.0127, + "num_input_tokens_seen": 221867440, + "step": 102715 + }, + { + "epoch": 16.756933115823816, + "grad_norm": 0.004419300705194473, + "learning_rate": 7.798713894718602e-05, + "loss": 0.0017, + "num_input_tokens_seen": 221877072, + "step": 102720 + }, + { + "epoch": 16.757748776508972, + "grad_norm": 0.00803311262279749, + "learning_rate": 7.794896929291361e-05, + "loss": 0.0293, + "num_input_tokens_seen": 221886832, + "step": 102725 + }, + { + "epoch": 16.758564437194128, + "grad_norm": 0.0018697066698223352, + "learning_rate": 7.791080819204072e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221897552, + "step": 102730 + }, + { + "epoch": 16.759380097879284, + "grad_norm": 0.0029300868045538664, + "learning_rate": 7.78726556453408e-05, + "loss": 0.0007, + "num_input_tokens_seen": 221907728, + "step": 102735 + }, + { + "epoch": 16.760195758564436, + "grad_norm": 0.20307497680187225, + "learning_rate": 7.783451165358696e-05, + "loss": 0.012, + "num_input_tokens_seen": 221918480, + "step": 102740 + }, + { + "epoch": 16.76101141924959, + "grad_norm": 0.008771148510277271, + "learning_rate": 7.779637621755236e-05, + "loss": 0.002, + "num_input_tokens_seen": 221930160, + "step": 102745 + }, + { + "epoch": 16.761827079934747, + "grad_norm": 0.030420590192079544, + "learning_rate": 7.775824933800979e-05, + "loss": 0.0134, + "num_input_tokens_seen": 221940464, + "step": 102750 + }, + { + "epoch": 16.762642740619903, + "grad_norm": 0.00467941677197814, + "learning_rate": 7.772013101573195e-05, + "loss": 0.0015, + "num_input_tokens_seen": 221951920, + "step": 102755 + }, + { + "epoch": 16.76345840130506, + "grad_norm": 0.003154685953631997, + "learning_rate": 7.768202125149132e-05, + "loss": 0.0021, + "num_input_tokens_seen": 221962736, + "step": 102760 + }, + { + "epoch": 16.76427406199021, + "grad_norm": 0.0002088886103592813, + "learning_rate": 7.76439200460603e-05, + "loss": 0.0004, + "num_input_tokens_seen": 221974832, + "step": 102765 + }, + { + "epoch": 16.765089722675366, + "grad_norm": 0.000144814039231278, + "learning_rate": 7.7605827400211e-05, + "loss": 0.001, + "num_input_tokens_seen": 221984528, + "step": 102770 + }, + { + "epoch": 16.765905383360522, + "grad_norm": 0.29430752992630005, + "learning_rate": 7.75677433147155e-05, + "loss": 0.0105, + "num_input_tokens_seen": 221994768, + "step": 102775 + }, + { + "epoch": 16.766721044045678, + "grad_norm": 0.017759494483470917, + "learning_rate": 7.752966779034553e-05, + "loss": 0.003, + "num_input_tokens_seen": 222004752, + "step": 102780 + }, + { + "epoch": 16.767536704730833, + "grad_norm": 0.0003618684713728726, + "learning_rate": 7.749160082787283e-05, + "loss": 0.0011, + "num_input_tokens_seen": 222015408, + "step": 102785 + }, + { + "epoch": 16.768352365415986, + "grad_norm": 0.01978623867034912, + "learning_rate": 7.745354242806884e-05, + "loss": 0.0011, + "num_input_tokens_seen": 222026288, + "step": 102790 + }, + { + "epoch": 16.76916802610114, + "grad_norm": 0.0008822871604934335, + "learning_rate": 7.741549259170483e-05, + "loss": 0.0028, + "num_input_tokens_seen": 222037008, + "step": 102795 + }, + { + "epoch": 16.769983686786297, + "grad_norm": 0.0161877628415823, + "learning_rate": 7.737745131955192e-05, + "loss": 0.0022, + "num_input_tokens_seen": 222047280, + "step": 102800 + }, + { + "epoch": 16.770799347471453, + "grad_norm": 0.03637959808111191, + "learning_rate": 7.733941861238114e-05, + "loss": 0.0056, + "num_input_tokens_seen": 222058160, + "step": 102805 + }, + { + "epoch": 16.77161500815661, + "grad_norm": 0.003914504777640104, + "learning_rate": 7.730139447096319e-05, + "loss": 0.0007, + "num_input_tokens_seen": 222069840, + "step": 102810 + }, + { + "epoch": 16.77243066884176, + "grad_norm": 0.00933013390749693, + "learning_rate": 7.726337889606861e-05, + "loss": 0.0053, + "num_input_tokens_seen": 222080720, + "step": 102815 + }, + { + "epoch": 16.773246329526916, + "grad_norm": 0.006620690226554871, + "learning_rate": 7.722537188846817e-05, + "loss": 0.0008, + "num_input_tokens_seen": 222091472, + "step": 102820 + }, + { + "epoch": 16.774061990212072, + "grad_norm": 0.00023687862267252058, + "learning_rate": 7.718737344893167e-05, + "loss": 0.0014, + "num_input_tokens_seen": 222103952, + "step": 102825 + }, + { + "epoch": 16.774877650897228, + "grad_norm": 0.07730203866958618, + "learning_rate": 7.714938357822965e-05, + "loss": 0.0021, + "num_input_tokens_seen": 222115280, + "step": 102830 + }, + { + "epoch": 16.775693311582383, + "grad_norm": 0.0037358549889177084, + "learning_rate": 7.711140227713154e-05, + "loss": 0.0012, + "num_input_tokens_seen": 222125008, + "step": 102835 + }, + { + "epoch": 16.776508972267536, + "grad_norm": 0.004577254876494408, + "learning_rate": 7.70734295464075e-05, + "loss": 0.0288, + "num_input_tokens_seen": 222136848, + "step": 102840 + }, + { + "epoch": 16.77732463295269, + "grad_norm": 0.024739494547247887, + "learning_rate": 7.703546538682688e-05, + "loss": 0.0027, + "num_input_tokens_seen": 222148656, + "step": 102845 + }, + { + "epoch": 16.778140293637847, + "grad_norm": 0.0011499575339257717, + "learning_rate": 7.699750979915915e-05, + "loss": 0.0016, + "num_input_tokens_seen": 222158896, + "step": 102850 + }, + { + "epoch": 16.778955954323003, + "grad_norm": 0.007596036419272423, + "learning_rate": 7.695956278417349e-05, + "loss": 0.0078, + "num_input_tokens_seen": 222169680, + "step": 102855 + }, + { + "epoch": 16.77977161500816, + "grad_norm": 0.0020268235821276903, + "learning_rate": 7.692162434263894e-05, + "loss": 0.0018, + "num_input_tokens_seen": 222181776, + "step": 102860 + }, + { + "epoch": 16.78058727569331, + "grad_norm": 0.0006173772853799164, + "learning_rate": 7.688369447532444e-05, + "loss": 0.0024, + "num_input_tokens_seen": 222192752, + "step": 102865 + }, + { + "epoch": 16.781402936378466, + "grad_norm": 0.004046997986733913, + "learning_rate": 7.684577318299857e-05, + "loss": 0.0091, + "num_input_tokens_seen": 222203408, + "step": 102870 + }, + { + "epoch": 16.782218597063622, + "grad_norm": 0.010694226250052452, + "learning_rate": 7.680786046642996e-05, + "loss": 0.0009, + "num_input_tokens_seen": 222213904, + "step": 102875 + }, + { + "epoch": 16.783034257748778, + "grad_norm": 0.0009469189099036157, + "learning_rate": 7.676995632638689e-05, + "loss": 0.0013, + "num_input_tokens_seen": 222223920, + "step": 102880 + }, + { + "epoch": 16.78384991843393, + "grad_norm": 0.0009855440584942698, + "learning_rate": 7.67320607636376e-05, + "loss": 0.0004, + "num_input_tokens_seen": 222234672, + "step": 102885 + }, + { + "epoch": 16.784665579119086, + "grad_norm": 0.000437272887211293, + "learning_rate": 7.669417377894999e-05, + "loss": 0.0005, + "num_input_tokens_seen": 222245264, + "step": 102890 + }, + { + "epoch": 16.78548123980424, + "grad_norm": 0.00246535986661911, + "learning_rate": 7.665629537309199e-05, + "loss": 0.0007, + "num_input_tokens_seen": 222255856, + "step": 102895 + }, + { + "epoch": 16.786296900489397, + "grad_norm": 0.0025290593039244413, + "learning_rate": 7.661842554683124e-05, + "loss": 0.0005, + "num_input_tokens_seen": 222266416, + "step": 102900 + }, + { + "epoch": 16.787112561174553, + "grad_norm": 0.009743542410433292, + "learning_rate": 7.658056430093512e-05, + "loss": 0.001, + "num_input_tokens_seen": 222276048, + "step": 102905 + }, + { + "epoch": 16.787928221859705, + "grad_norm": 0.01540299877524376, + "learning_rate": 7.654271163617105e-05, + "loss": 0.0012, + "num_input_tokens_seen": 222286512, + "step": 102910 + }, + { + "epoch": 16.78874388254486, + "grad_norm": 0.06199616193771362, + "learning_rate": 7.650486755330616e-05, + "loss": 0.0094, + "num_input_tokens_seen": 222297072, + "step": 102915 + }, + { + "epoch": 16.789559543230016, + "grad_norm": 0.0010426411172375083, + "learning_rate": 7.646703205310718e-05, + "loss": 0.1021, + "num_input_tokens_seen": 222309328, + "step": 102920 + }, + { + "epoch": 16.790375203915172, + "grad_norm": 0.0029606884345412254, + "learning_rate": 7.642920513634138e-05, + "loss": 0.0032, + "num_input_tokens_seen": 222320272, + "step": 102925 + }, + { + "epoch": 16.791190864600328, + "grad_norm": 0.03106829896569252, + "learning_rate": 7.639138680377478e-05, + "loss": 0.0023, + "num_input_tokens_seen": 222331248, + "step": 102930 + }, + { + "epoch": 16.79200652528548, + "grad_norm": 0.013690001331269741, + "learning_rate": 7.63535770561744e-05, + "loss": 0.0032, + "num_input_tokens_seen": 222342160, + "step": 102935 + }, + { + "epoch": 16.792822185970635, + "grad_norm": 0.0004479142662603408, + "learning_rate": 7.631577589430593e-05, + "loss": 0.0007, + "num_input_tokens_seen": 222353008, + "step": 102940 + }, + { + "epoch": 16.79363784665579, + "grad_norm": 0.0002369188005104661, + "learning_rate": 7.627798331893604e-05, + "loss": 0.0006, + "num_input_tokens_seen": 222362864, + "step": 102945 + }, + { + "epoch": 16.794453507340947, + "grad_norm": 0.17374515533447266, + "learning_rate": 7.62401993308301e-05, + "loss": 0.0057, + "num_input_tokens_seen": 222373776, + "step": 102950 + }, + { + "epoch": 16.795269168026103, + "grad_norm": 0.0009877387201413512, + "learning_rate": 7.620242393075432e-05, + "loss": 0.1063, + "num_input_tokens_seen": 222382640, + "step": 102955 + }, + { + "epoch": 16.796084828711255, + "grad_norm": 0.017264485359191895, + "learning_rate": 7.61646571194738e-05, + "loss": 0.0024, + "num_input_tokens_seen": 222395056, + "step": 102960 + }, + { + "epoch": 16.79690048939641, + "grad_norm": 0.0007135859923437238, + "learning_rate": 7.612689889775443e-05, + "loss": 0.0008, + "num_input_tokens_seen": 222404304, + "step": 102965 + }, + { + "epoch": 16.797716150081566, + "grad_norm": 0.0024973410181701183, + "learning_rate": 7.60891492663609e-05, + "loss": 0.0013, + "num_input_tokens_seen": 222414448, + "step": 102970 + }, + { + "epoch": 16.798531810766722, + "grad_norm": 0.059601426124572754, + "learning_rate": 7.605140822605883e-05, + "loss": 0.0252, + "num_input_tokens_seen": 222424624, + "step": 102975 + }, + { + "epoch": 16.799347471451878, + "grad_norm": 0.0006366174784488976, + "learning_rate": 7.601367577761248e-05, + "loss": 0.033, + "num_input_tokens_seen": 222434576, + "step": 102980 + }, + { + "epoch": 16.80016313213703, + "grad_norm": 0.003634232562035322, + "learning_rate": 7.597595192178702e-05, + "loss": 0.0011, + "num_input_tokens_seen": 222444624, + "step": 102985 + }, + { + "epoch": 16.800978792822185, + "grad_norm": 0.6686355471611023, + "learning_rate": 7.59382366593468e-05, + "loss": 0.0108, + "num_input_tokens_seen": 222453552, + "step": 102990 + }, + { + "epoch": 16.80179445350734, + "grad_norm": 0.0005412403261289, + "learning_rate": 7.590052999105618e-05, + "loss": 0.0027, + "num_input_tokens_seen": 222465072, + "step": 102995 + }, + { + "epoch": 16.802610114192497, + "grad_norm": 0.02159789204597473, + "learning_rate": 7.586283191767929e-05, + "loss": 0.0104, + "num_input_tokens_seen": 222476496, + "step": 103000 + }, + { + "epoch": 16.803425774877653, + "grad_norm": 0.000245524977799505, + "learning_rate": 7.582514243998023e-05, + "loss": 0.0066, + "num_input_tokens_seen": 222487408, + "step": 103005 + }, + { + "epoch": 16.804241435562805, + "grad_norm": 0.00048794830217957497, + "learning_rate": 7.578746155872268e-05, + "loss": 0.0003, + "num_input_tokens_seen": 222499184, + "step": 103010 + }, + { + "epoch": 16.80505709624796, + "grad_norm": 0.009726781398057938, + "learning_rate": 7.574978927467046e-05, + "loss": 0.0056, + "num_input_tokens_seen": 222511120, + "step": 103015 + }, + { + "epoch": 16.805872756933116, + "grad_norm": 0.0008596468833275139, + "learning_rate": 7.571212558858692e-05, + "loss": 0.0013, + "num_input_tokens_seen": 222522096, + "step": 103020 + }, + { + "epoch": 16.806688417618272, + "grad_norm": 0.0003236684715375304, + "learning_rate": 7.567447050123538e-05, + "loss": 0.0009, + "num_input_tokens_seen": 222534416, + "step": 103025 + }, + { + "epoch": 16.807504078303424, + "grad_norm": 0.22375887632369995, + "learning_rate": 7.563682401337901e-05, + "loss": 0.0048, + "num_input_tokens_seen": 222546096, + "step": 103030 + }, + { + "epoch": 16.80831973898858, + "grad_norm": 0.0021393627393990755, + "learning_rate": 7.559918612578065e-05, + "loss": 0.0008, + "num_input_tokens_seen": 222556848, + "step": 103035 + }, + { + "epoch": 16.809135399673735, + "grad_norm": 0.011619689874351025, + "learning_rate": 7.55615568392034e-05, + "loss": 0.0873, + "num_input_tokens_seen": 222568272, + "step": 103040 + }, + { + "epoch": 16.80995106035889, + "grad_norm": 0.017800265923142433, + "learning_rate": 7.552393615440939e-05, + "loss": 0.001, + "num_input_tokens_seen": 222579248, + "step": 103045 + }, + { + "epoch": 16.810766721044047, + "grad_norm": 0.000227114578592591, + "learning_rate": 7.548632407216155e-05, + "loss": 0.0048, + "num_input_tokens_seen": 222590800, + "step": 103050 + }, + { + "epoch": 16.8115823817292, + "grad_norm": 0.0071326131001114845, + "learning_rate": 7.544872059322161e-05, + "loss": 0.0005, + "num_input_tokens_seen": 222601680, + "step": 103055 + }, + { + "epoch": 16.812398042414355, + "grad_norm": 0.3597804605960846, + "learning_rate": 7.541112571835218e-05, + "loss": 0.0067, + "num_input_tokens_seen": 222610352, + "step": 103060 + }, + { + "epoch": 16.81321370309951, + "grad_norm": 0.00383292930200696, + "learning_rate": 7.537353944831471e-05, + "loss": 0.043, + "num_input_tokens_seen": 222621648, + "step": 103065 + }, + { + "epoch": 16.814029363784666, + "grad_norm": 0.0040711634792387486, + "learning_rate": 7.533596178387136e-05, + "loss": 0.0041, + "num_input_tokens_seen": 222632528, + "step": 103070 + }, + { + "epoch": 16.81484502446982, + "grad_norm": 0.011220073327422142, + "learning_rate": 7.529839272578326e-05, + "loss": 0.0011, + "num_input_tokens_seen": 222642768, + "step": 103075 + }, + { + "epoch": 16.815660685154974, + "grad_norm": 0.7358731031417847, + "learning_rate": 7.526083227481223e-05, + "loss": 0.1654, + "num_input_tokens_seen": 222653392, + "step": 103080 + }, + { + "epoch": 16.81647634584013, + "grad_norm": 0.0004308177740313113, + "learning_rate": 7.522328043171899e-05, + "loss": 0.0012, + "num_input_tokens_seen": 222664112, + "step": 103085 + }, + { + "epoch": 16.817292006525285, + "grad_norm": 0.010764655657112598, + "learning_rate": 7.518573719726507e-05, + "loss": 0.009, + "num_input_tokens_seen": 222675408, + "step": 103090 + }, + { + "epoch": 16.81810766721044, + "grad_norm": 0.001309889485128224, + "learning_rate": 7.514820257221088e-05, + "loss": 0.0014, + "num_input_tokens_seen": 222685520, + "step": 103095 + }, + { + "epoch": 16.818923327895597, + "grad_norm": 0.009377938695251942, + "learning_rate": 7.511067655731757e-05, + "loss": 0.0075, + "num_input_tokens_seen": 222696592, + "step": 103100 + }, + { + "epoch": 16.81973898858075, + "grad_norm": 0.03656245023012161, + "learning_rate": 7.507315915334517e-05, + "loss": 0.01, + "num_input_tokens_seen": 222707888, + "step": 103105 + }, + { + "epoch": 16.820554649265905, + "grad_norm": 0.012941381894052029, + "learning_rate": 7.503565036105447e-05, + "loss": 0.0023, + "num_input_tokens_seen": 222718896, + "step": 103110 + }, + { + "epoch": 16.82137030995106, + "grad_norm": 0.0073400200344622135, + "learning_rate": 7.49981501812052e-05, + "loss": 0.0073, + "num_input_tokens_seen": 222729904, + "step": 103115 + }, + { + "epoch": 16.822185970636216, + "grad_norm": 0.02853931486606598, + "learning_rate": 7.496065861455786e-05, + "loss": 0.0019, + "num_input_tokens_seen": 222739984, + "step": 103120 + }, + { + "epoch": 16.82300163132137, + "grad_norm": 0.0017178819980472326, + "learning_rate": 7.492317566187167e-05, + "loss": 0.1302, + "num_input_tokens_seen": 222751280, + "step": 103125 + }, + { + "epoch": 16.823817292006524, + "grad_norm": 0.0007930384599603713, + "learning_rate": 7.48857013239067e-05, + "loss": 0.0028, + "num_input_tokens_seen": 222760656, + "step": 103130 + }, + { + "epoch": 16.82463295269168, + "grad_norm": 0.019918566569685936, + "learning_rate": 7.484823560142235e-05, + "loss": 0.0105, + "num_input_tokens_seen": 222772976, + "step": 103135 + }, + { + "epoch": 16.825448613376835, + "grad_norm": 0.004923573229461908, + "learning_rate": 7.481077849517776e-05, + "loss": 0.0048, + "num_input_tokens_seen": 222782896, + "step": 103140 + }, + { + "epoch": 16.82626427406199, + "grad_norm": 0.0015327023575082421, + "learning_rate": 7.477333000593218e-05, + "loss": 0.0007, + "num_input_tokens_seen": 222793808, + "step": 103145 + }, + { + "epoch": 16.827079934747147, + "grad_norm": 0.08445288985967636, + "learning_rate": 7.473589013444449e-05, + "loss": 0.005, + "num_input_tokens_seen": 222805200, + "step": 103150 + }, + { + "epoch": 16.8278955954323, + "grad_norm": 0.001118175801821053, + "learning_rate": 7.469845888147348e-05, + "loss": 0.0117, + "num_input_tokens_seen": 222816816, + "step": 103155 + }, + { + "epoch": 16.828711256117455, + "grad_norm": 0.20687885582447052, + "learning_rate": 7.466103624777776e-05, + "loss": 0.0045, + "num_input_tokens_seen": 222827376, + "step": 103160 + }, + { + "epoch": 16.82952691680261, + "grad_norm": 0.007650259882211685, + "learning_rate": 7.462362223411568e-05, + "loss": 0.0816, + "num_input_tokens_seen": 222838320, + "step": 103165 + }, + { + "epoch": 16.830342577487766, + "grad_norm": 0.0002957629330921918, + "learning_rate": 7.458621684124556e-05, + "loss": 0.0005, + "num_input_tokens_seen": 222849104, + "step": 103170 + }, + { + "epoch": 16.83115823817292, + "grad_norm": 0.006406435277312994, + "learning_rate": 7.454882006992541e-05, + "loss": 0.001, + "num_input_tokens_seen": 222860688, + "step": 103175 + }, + { + "epoch": 16.831973898858074, + "grad_norm": 0.00206724158488214, + "learning_rate": 7.451143192091304e-05, + "loss": 0.0024, + "num_input_tokens_seen": 222872208, + "step": 103180 + }, + { + "epoch": 16.83278955954323, + "grad_norm": 0.014522617682814598, + "learning_rate": 7.447405239496646e-05, + "loss": 0.0011, + "num_input_tokens_seen": 222881776, + "step": 103185 + }, + { + "epoch": 16.833605220228385, + "grad_norm": 0.006389922928065062, + "learning_rate": 7.443668149284289e-05, + "loss": 0.001, + "num_input_tokens_seen": 222892144, + "step": 103190 + }, + { + "epoch": 16.83442088091354, + "grad_norm": 0.020866891369223595, + "learning_rate": 7.439931921529996e-05, + "loss": 0.0867, + "num_input_tokens_seen": 222902608, + "step": 103195 + }, + { + "epoch": 16.835236541598697, + "grad_norm": 0.00029015709878876805, + "learning_rate": 7.436196556309454e-05, + "loss": 0.0008, + "num_input_tokens_seen": 222913648, + "step": 103200 + }, + { + "epoch": 16.83605220228385, + "grad_norm": 0.031761713325977325, + "learning_rate": 7.432462053698413e-05, + "loss": 0.0039, + "num_input_tokens_seen": 222924336, + "step": 103205 + }, + { + "epoch": 16.836867862969005, + "grad_norm": 0.0018911163788288832, + "learning_rate": 7.428728413772502e-05, + "loss": 0.0087, + "num_input_tokens_seen": 222934800, + "step": 103210 + }, + { + "epoch": 16.83768352365416, + "grad_norm": 0.034703124314546585, + "learning_rate": 7.42499563660744e-05, + "loss": 0.0156, + "num_input_tokens_seen": 222945488, + "step": 103215 + }, + { + "epoch": 16.838499184339316, + "grad_norm": 0.04697426036000252, + "learning_rate": 7.421263722278826e-05, + "loss": 0.0027, + "num_input_tokens_seen": 222954608, + "step": 103220 + }, + { + "epoch": 16.839314845024468, + "grad_norm": 0.9602540135383606, + "learning_rate": 7.417532670862343e-05, + "loss": 0.0421, + "num_input_tokens_seen": 222965712, + "step": 103225 + }, + { + "epoch": 16.840130505709624, + "grad_norm": 0.0008537370013073087, + "learning_rate": 7.413802482433557e-05, + "loss": 0.0017, + "num_input_tokens_seen": 222976400, + "step": 103230 + }, + { + "epoch": 16.84094616639478, + "grad_norm": 0.0003730040625669062, + "learning_rate": 7.41007315706811e-05, + "loss": 0.0023, + "num_input_tokens_seen": 222987184, + "step": 103235 + }, + { + "epoch": 16.841761827079935, + "grad_norm": 0.002069843467324972, + "learning_rate": 7.406344694841538e-05, + "loss": 0.0007, + "num_input_tokens_seen": 222999056, + "step": 103240 + }, + { + "epoch": 16.84257748776509, + "grad_norm": 0.0002810598525684327, + "learning_rate": 7.402617095829434e-05, + "loss": 0.0023, + "num_input_tokens_seen": 223008944, + "step": 103245 + }, + { + "epoch": 16.843393148450243, + "grad_norm": 0.010194149799644947, + "learning_rate": 7.398890360107336e-05, + "loss": 0.0027, + "num_input_tokens_seen": 223020464, + "step": 103250 + }, + { + "epoch": 16.8442088091354, + "grad_norm": 0.011366844177246094, + "learning_rate": 7.395164487750766e-05, + "loss": 0.0049, + "num_input_tokens_seen": 223031632, + "step": 103255 + }, + { + "epoch": 16.845024469820554, + "grad_norm": 0.002313476288691163, + "learning_rate": 7.391439478835233e-05, + "loss": 0.0026, + "num_input_tokens_seen": 223041840, + "step": 103260 + }, + { + "epoch": 16.84584013050571, + "grad_norm": 0.05588260293006897, + "learning_rate": 7.387715333436235e-05, + "loss": 0.0019, + "num_input_tokens_seen": 223052144, + "step": 103265 + }, + { + "epoch": 16.846655791190866, + "grad_norm": 0.0007918982882983983, + "learning_rate": 7.383992051629246e-05, + "loss": 0.0008, + "num_input_tokens_seen": 223062704, + "step": 103270 + }, + { + "epoch": 16.847471451876018, + "grad_norm": 0.004887265618890524, + "learning_rate": 7.380269633489717e-05, + "loss": 0.0007, + "num_input_tokens_seen": 223072720, + "step": 103275 + }, + { + "epoch": 16.848287112561174, + "grad_norm": 0.0006118956953287125, + "learning_rate": 7.376548079093087e-05, + "loss": 0.0033, + "num_input_tokens_seen": 223082480, + "step": 103280 + }, + { + "epoch": 16.84910277324633, + "grad_norm": 0.002460494404658675, + "learning_rate": 7.372827388514792e-05, + "loss": 0.0047, + "num_input_tokens_seen": 223092432, + "step": 103285 + }, + { + "epoch": 16.849918433931485, + "grad_norm": 0.0028719434048980474, + "learning_rate": 7.369107561830218e-05, + "loss": 0.0069, + "num_input_tokens_seen": 223102992, + "step": 103290 + }, + { + "epoch": 16.85073409461664, + "grad_norm": 0.0015683751553297043, + "learning_rate": 7.365388599114764e-05, + "loss": 0.0015, + "num_input_tokens_seen": 223113520, + "step": 103295 + }, + { + "epoch": 16.851549755301793, + "grad_norm": 0.0028194987680763006, + "learning_rate": 7.361670500443796e-05, + "loss": 0.0012, + "num_input_tokens_seen": 223125872, + "step": 103300 + }, + { + "epoch": 16.85236541598695, + "grad_norm": 0.01556948758661747, + "learning_rate": 7.357953265892665e-05, + "loss": 0.0013, + "num_input_tokens_seen": 223137936, + "step": 103305 + }, + { + "epoch": 16.853181076672104, + "grad_norm": 0.00034893525298684835, + "learning_rate": 7.354236895536704e-05, + "loss": 0.001, + "num_input_tokens_seen": 223148592, + "step": 103310 + }, + { + "epoch": 16.85399673735726, + "grad_norm": 0.045288246124982834, + "learning_rate": 7.350521389451231e-05, + "loss": 0.002, + "num_input_tokens_seen": 223159056, + "step": 103315 + }, + { + "epoch": 16.854812398042416, + "grad_norm": 0.0001437898463336751, + "learning_rate": 7.346806747711554e-05, + "loss": 0.001, + "num_input_tokens_seen": 223170096, + "step": 103320 + }, + { + "epoch": 16.855628058727568, + "grad_norm": 0.006058032624423504, + "learning_rate": 7.343092970392929e-05, + "loss": 0.0014, + "num_input_tokens_seen": 223179248, + "step": 103325 + }, + { + "epoch": 16.856443719412724, + "grad_norm": 0.02314470149576664, + "learning_rate": 7.339380057570666e-05, + "loss": 0.0055, + "num_input_tokens_seen": 223190384, + "step": 103330 + }, + { + "epoch": 16.85725938009788, + "grad_norm": 0.0012422037543728948, + "learning_rate": 7.335668009319962e-05, + "loss": 0.0115, + "num_input_tokens_seen": 223200752, + "step": 103335 + }, + { + "epoch": 16.858075040783035, + "grad_norm": 0.27252358198165894, + "learning_rate": 7.331956825716091e-05, + "loss": 0.0034, + "num_input_tokens_seen": 223212400, + "step": 103340 + }, + { + "epoch": 16.85889070146819, + "grad_norm": 0.7663934826850891, + "learning_rate": 7.328246506834224e-05, + "loss": 0.0342, + "num_input_tokens_seen": 223223888, + "step": 103345 + }, + { + "epoch": 16.859706362153343, + "grad_norm": 0.002288650721311569, + "learning_rate": 7.32453705274958e-05, + "loss": 0.0044, + "num_input_tokens_seen": 223234384, + "step": 103350 + }, + { + "epoch": 16.8605220228385, + "grad_norm": 0.04717063903808594, + "learning_rate": 7.320828463537333e-05, + "loss": 0.0056, + "num_input_tokens_seen": 223245392, + "step": 103355 + }, + { + "epoch": 16.861337683523654, + "grad_norm": 0.01188843697309494, + "learning_rate": 7.317120739272643e-05, + "loss": 0.0772, + "num_input_tokens_seen": 223255664, + "step": 103360 + }, + { + "epoch": 16.86215334420881, + "grad_norm": 0.006205637939274311, + "learning_rate": 7.313413880030645e-05, + "loss": 0.0006, + "num_input_tokens_seen": 223267024, + "step": 103365 + }, + { + "epoch": 16.862969004893966, + "grad_norm": 0.01393813919275999, + "learning_rate": 7.309707885886462e-05, + "loss": 0.001, + "num_input_tokens_seen": 223278192, + "step": 103370 + }, + { + "epoch": 16.863784665579118, + "grad_norm": 0.009371892549097538, + "learning_rate": 7.306002756915214e-05, + "loss": 0.0032, + "num_input_tokens_seen": 223288912, + "step": 103375 + }, + { + "epoch": 16.864600326264274, + "grad_norm": 0.0004683208535425365, + "learning_rate": 7.302298493191972e-05, + "loss": 0.0005, + "num_input_tokens_seen": 223300400, + "step": 103380 + }, + { + "epoch": 16.86541598694943, + "grad_norm": 0.059036415070295334, + "learning_rate": 7.298595094791826e-05, + "loss": 0.0727, + "num_input_tokens_seen": 223311856, + "step": 103385 + }, + { + "epoch": 16.866231647634585, + "grad_norm": 0.004718313459306955, + "learning_rate": 7.294892561789817e-05, + "loss": 0.138, + "num_input_tokens_seen": 223323248, + "step": 103390 + }, + { + "epoch": 16.86704730831974, + "grad_norm": 0.005941477138549089, + "learning_rate": 7.291190894260985e-05, + "loss": 0.0011, + "num_input_tokens_seen": 223335216, + "step": 103395 + }, + { + "epoch": 16.867862969004893, + "grad_norm": 0.007990571670234203, + "learning_rate": 7.287490092280346e-05, + "loss": 0.0116, + "num_input_tokens_seen": 223345808, + "step": 103400 + }, + { + "epoch": 16.86867862969005, + "grad_norm": 0.005993335507810116, + "learning_rate": 7.28379015592291e-05, + "loss": 0.0011, + "num_input_tokens_seen": 223357584, + "step": 103405 + }, + { + "epoch": 16.869494290375204, + "grad_norm": 0.019427742809057236, + "learning_rate": 7.280091085263657e-05, + "loss": 0.027, + "num_input_tokens_seen": 223367248, + "step": 103410 + }, + { + "epoch": 16.87030995106036, + "grad_norm": 0.6797969937324524, + "learning_rate": 7.276392880377548e-05, + "loss": 0.0292, + "num_input_tokens_seen": 223377584, + "step": 103415 + }, + { + "epoch": 16.871125611745512, + "grad_norm": 0.007113391533493996, + "learning_rate": 7.27269554133954e-05, + "loss": 0.0082, + "num_input_tokens_seen": 223389104, + "step": 103420 + }, + { + "epoch": 16.871941272430668, + "grad_norm": 0.0298901479691267, + "learning_rate": 7.268999068224557e-05, + "loss": 0.0033, + "num_input_tokens_seen": 223400112, + "step": 103425 + }, + { + "epoch": 16.872756933115824, + "grad_norm": 0.002857472514733672, + "learning_rate": 7.265303461107519e-05, + "loss": 0.0063, + "num_input_tokens_seen": 223411216, + "step": 103430 + }, + { + "epoch": 16.87357259380098, + "grad_norm": 0.001847639330662787, + "learning_rate": 7.261608720063317e-05, + "loss": 0.0025, + "num_input_tokens_seen": 223423440, + "step": 103435 + }, + { + "epoch": 16.874388254486135, + "grad_norm": 0.0019470597617328167, + "learning_rate": 7.25791484516683e-05, + "loss": 0.003, + "num_input_tokens_seen": 223432944, + "step": 103440 + }, + { + "epoch": 16.875203915171287, + "grad_norm": 0.002979752840474248, + "learning_rate": 7.254221836492925e-05, + "loss": 0.1159, + "num_input_tokens_seen": 223443824, + "step": 103445 + }, + { + "epoch": 16.876019575856443, + "grad_norm": 0.05077700689435005, + "learning_rate": 7.250529694116436e-05, + "loss": 0.0033, + "num_input_tokens_seen": 223455024, + "step": 103450 + }, + { + "epoch": 16.8768352365416, + "grad_norm": 0.0010402423795312643, + "learning_rate": 7.246838418112189e-05, + "loss": 0.0084, + "num_input_tokens_seen": 223464720, + "step": 103455 + }, + { + "epoch": 16.877650897226754, + "grad_norm": 0.007393400650471449, + "learning_rate": 7.243148008555017e-05, + "loss": 0.003, + "num_input_tokens_seen": 223475216, + "step": 103460 + }, + { + "epoch": 16.87846655791191, + "grad_norm": 0.00041530292946845293, + "learning_rate": 7.239458465519672e-05, + "loss": 0.0021, + "num_input_tokens_seen": 223485232, + "step": 103465 + }, + { + "epoch": 16.879282218597062, + "grad_norm": 0.00204356387257576, + "learning_rate": 7.235769789080954e-05, + "loss": 0.1144, + "num_input_tokens_seen": 223495696, + "step": 103470 + }, + { + "epoch": 16.880097879282218, + "grad_norm": 0.0011162912705913186, + "learning_rate": 7.232081979313615e-05, + "loss": 0.0077, + "num_input_tokens_seen": 223508336, + "step": 103475 + }, + { + "epoch": 16.880913539967374, + "grad_norm": 0.00022204949345905334, + "learning_rate": 7.228395036292384e-05, + "loss": 0.0016, + "num_input_tokens_seen": 223517680, + "step": 103480 + }, + { + "epoch": 16.88172920065253, + "grad_norm": 0.0025065632071346045, + "learning_rate": 7.224708960091992e-05, + "loss": 0.0014, + "num_input_tokens_seen": 223528240, + "step": 103485 + }, + { + "epoch": 16.882544861337685, + "grad_norm": 0.0005242445622570813, + "learning_rate": 7.221023750787136e-05, + "loss": 0.0004, + "num_input_tokens_seen": 223539248, + "step": 103490 + }, + { + "epoch": 16.883360522022837, + "grad_norm": 0.005537915974855423, + "learning_rate": 7.217339408452505e-05, + "loss": 0.0026, + "num_input_tokens_seen": 223549968, + "step": 103495 + }, + { + "epoch": 16.884176182707993, + "grad_norm": 0.0028699340764433146, + "learning_rate": 7.21365593316276e-05, + "loss": 0.0005, + "num_input_tokens_seen": 223561040, + "step": 103500 + }, + { + "epoch": 16.88499184339315, + "grad_norm": 0.0010814238339662552, + "learning_rate": 7.209973324992558e-05, + "loss": 0.0042, + "num_input_tokens_seen": 223571152, + "step": 103505 + }, + { + "epoch": 16.885807504078304, + "grad_norm": 0.04626893624663353, + "learning_rate": 7.206291584016533e-05, + "loss": 0.0028, + "num_input_tokens_seen": 223582032, + "step": 103510 + }, + { + "epoch": 16.88662316476346, + "grad_norm": 0.07623946666717529, + "learning_rate": 7.202610710309293e-05, + "loss": 0.0025, + "num_input_tokens_seen": 223594224, + "step": 103515 + }, + { + "epoch": 16.887438825448612, + "grad_norm": 0.008306603878736496, + "learning_rate": 7.198930703945439e-05, + "loss": 0.0021, + "num_input_tokens_seen": 223604144, + "step": 103520 + }, + { + "epoch": 16.888254486133768, + "grad_norm": 0.0008515430381521583, + "learning_rate": 7.19525156499955e-05, + "loss": 0.0033, + "num_input_tokens_seen": 223613616, + "step": 103525 + }, + { + "epoch": 16.889070146818923, + "grad_norm": 0.005168965086340904, + "learning_rate": 7.191573293546195e-05, + "loss": 0.0034, + "num_input_tokens_seen": 223625744, + "step": 103530 + }, + { + "epoch": 16.88988580750408, + "grad_norm": 0.04074247553944588, + "learning_rate": 7.187895889659906e-05, + "loss": 0.0018, + "num_input_tokens_seen": 223635632, + "step": 103535 + }, + { + "epoch": 16.890701468189235, + "grad_norm": 0.04196935519576073, + "learning_rate": 7.184219353415228e-05, + "loss": 0.0592, + "num_input_tokens_seen": 223647120, + "step": 103540 + }, + { + "epoch": 16.891517128874387, + "grad_norm": 0.005630762781947851, + "learning_rate": 7.180543684886654e-05, + "loss": 0.0018, + "num_input_tokens_seen": 223658416, + "step": 103545 + }, + { + "epoch": 16.892332789559543, + "grad_norm": 0.004152704030275345, + "learning_rate": 7.176868884148679e-05, + "loss": 0.0043, + "num_input_tokens_seen": 223669072, + "step": 103550 + }, + { + "epoch": 16.8931484502447, + "grad_norm": 0.1723194271326065, + "learning_rate": 7.173194951275786e-05, + "loss": 0.0052, + "num_input_tokens_seen": 223678096, + "step": 103555 + }, + { + "epoch": 16.893964110929854, + "grad_norm": 0.1672668755054474, + "learning_rate": 7.169521886342417e-05, + "loss": 0.0097, + "num_input_tokens_seen": 223688624, + "step": 103560 + }, + { + "epoch": 16.894779771615006, + "grad_norm": 0.017755307257175446, + "learning_rate": 7.165849689423043e-05, + "loss": 0.0029, + "num_input_tokens_seen": 223699216, + "step": 103565 + }, + { + "epoch": 16.895595432300162, + "grad_norm": 0.0071003190241754055, + "learning_rate": 7.162178360592037e-05, + "loss": 0.0035, + "num_input_tokens_seen": 223709904, + "step": 103570 + }, + { + "epoch": 16.896411092985318, + "grad_norm": 0.0005213333643041551, + "learning_rate": 7.15850789992386e-05, + "loss": 0.001, + "num_input_tokens_seen": 223721072, + "step": 103575 + }, + { + "epoch": 16.897226753670473, + "grad_norm": 0.012438364326953888, + "learning_rate": 7.154838307492839e-05, + "loss": 0.0216, + "num_input_tokens_seen": 223732784, + "step": 103580 + }, + { + "epoch": 16.89804241435563, + "grad_norm": 0.00046717922668904066, + "learning_rate": 7.151169583373402e-05, + "loss": 0.001, + "num_input_tokens_seen": 223743792, + "step": 103585 + }, + { + "epoch": 16.898858075040785, + "grad_norm": 0.08749958872795105, + "learning_rate": 7.147501727639844e-05, + "loss": 0.0107, + "num_input_tokens_seen": 223755344, + "step": 103590 + }, + { + "epoch": 16.899673735725937, + "grad_norm": 0.0009969095699489117, + "learning_rate": 7.14383474036655e-05, + "loss": 0.0079, + "num_input_tokens_seen": 223766032, + "step": 103595 + }, + { + "epoch": 16.900489396411093, + "grad_norm": 0.008744543418288231, + "learning_rate": 7.140168621627786e-05, + "loss": 0.0014, + "num_input_tokens_seen": 223775440, + "step": 103600 + }, + { + "epoch": 16.90130505709625, + "grad_norm": 0.0034252856858074665, + "learning_rate": 7.136503371497888e-05, + "loss": 0.0155, + "num_input_tokens_seen": 223785712, + "step": 103605 + }, + { + "epoch": 16.902120717781404, + "grad_norm": 0.0015179646434262395, + "learning_rate": 7.132838990051132e-05, + "loss": 0.001, + "num_input_tokens_seen": 223795312, + "step": 103610 + }, + { + "epoch": 16.902936378466556, + "grad_norm": 0.0007867804961279035, + "learning_rate": 7.129175477361766e-05, + "loss": 0.0014, + "num_input_tokens_seen": 223806736, + "step": 103615 + }, + { + "epoch": 16.903752039151712, + "grad_norm": 0.6858383417129517, + "learning_rate": 7.125512833504049e-05, + "loss": 0.0167, + "num_input_tokens_seen": 223817424, + "step": 103620 + }, + { + "epoch": 16.904567699836868, + "grad_norm": 0.004548117518424988, + "learning_rate": 7.121851058552209e-05, + "loss": 0.0011, + "num_input_tokens_seen": 223827856, + "step": 103625 + }, + { + "epoch": 16.905383360522023, + "grad_norm": 0.01413120049983263, + "learning_rate": 7.118190152580444e-05, + "loss": 0.0771, + "num_input_tokens_seen": 223837104, + "step": 103630 + }, + { + "epoch": 16.90619902120718, + "grad_norm": 0.0013508544070646167, + "learning_rate": 7.114530115662959e-05, + "loss": 0.0889, + "num_input_tokens_seen": 223848720, + "step": 103635 + }, + { + "epoch": 16.90701468189233, + "grad_norm": 0.006228696089237928, + "learning_rate": 7.110870947873926e-05, + "loss": 0.0014, + "num_input_tokens_seen": 223859216, + "step": 103640 + }, + { + "epoch": 16.907830342577487, + "grad_norm": 0.0016030416591092944, + "learning_rate": 7.107212649287497e-05, + "loss": 0.001, + "num_input_tokens_seen": 223870448, + "step": 103645 + }, + { + "epoch": 16.908646003262643, + "grad_norm": 0.005661866627633572, + "learning_rate": 7.103555219977825e-05, + "loss": 0.0006, + "num_input_tokens_seen": 223880720, + "step": 103650 + }, + { + "epoch": 16.9094616639478, + "grad_norm": 0.008992401883006096, + "learning_rate": 7.099898660019016e-05, + "loss": 0.0096, + "num_input_tokens_seen": 223891856, + "step": 103655 + }, + { + "epoch": 16.910277324632954, + "grad_norm": 0.0017499992391094565, + "learning_rate": 7.096242969485189e-05, + "loss": 0.0006, + "num_input_tokens_seen": 223903280, + "step": 103660 + }, + { + "epoch": 16.911092985318106, + "grad_norm": 0.016757963225245476, + "learning_rate": 7.092588148450413e-05, + "loss": 0.0029, + "num_input_tokens_seen": 223914160, + "step": 103665 + }, + { + "epoch": 16.911908646003262, + "grad_norm": 0.0017213866813108325, + "learning_rate": 7.088934196988795e-05, + "loss": 0.0004, + "num_input_tokens_seen": 223925424, + "step": 103670 + }, + { + "epoch": 16.912724306688418, + "grad_norm": 0.12674477696418762, + "learning_rate": 7.085281115174335e-05, + "loss": 0.0066, + "num_input_tokens_seen": 223936880, + "step": 103675 + }, + { + "epoch": 16.913539967373573, + "grad_norm": 0.07425151765346527, + "learning_rate": 7.081628903081116e-05, + "loss": 0.003, + "num_input_tokens_seen": 223947120, + "step": 103680 + }, + { + "epoch": 16.91435562805873, + "grad_norm": 0.0015521092573180795, + "learning_rate": 7.077977560783117e-05, + "loss": 0.0013, + "num_input_tokens_seen": 223957936, + "step": 103685 + }, + { + "epoch": 16.91517128874388, + "grad_norm": 0.003834107890725136, + "learning_rate": 7.074327088354371e-05, + "loss": 0.0017, + "num_input_tokens_seen": 223966544, + "step": 103690 + }, + { + "epoch": 16.915986949429037, + "grad_norm": 0.0003727386356331408, + "learning_rate": 7.070677485868821e-05, + "loss": 0.0007, + "num_input_tokens_seen": 223978640, + "step": 103695 + }, + { + "epoch": 16.916802610114193, + "grad_norm": 0.005801330786198378, + "learning_rate": 7.067028753400473e-05, + "loss": 0.0086, + "num_input_tokens_seen": 223988752, + "step": 103700 + }, + { + "epoch": 16.91761827079935, + "grad_norm": 0.264864981174469, + "learning_rate": 7.06338089102323e-05, + "loss": 0.0039, + "num_input_tokens_seen": 223999536, + "step": 103705 + }, + { + "epoch": 16.918433931484504, + "grad_norm": 0.00687580369412899, + "learning_rate": 7.05973389881106e-05, + "loss": 0.0169, + "num_input_tokens_seen": 224011088, + "step": 103710 + }, + { + "epoch": 16.919249592169656, + "grad_norm": 0.008090431801974773, + "learning_rate": 7.056087776837838e-05, + "loss": 0.0043, + "num_input_tokens_seen": 224021488, + "step": 103715 + }, + { + "epoch": 16.920065252854812, + "grad_norm": 0.10078676789999008, + "learning_rate": 7.052442525177499e-05, + "loss": 0.0035, + "num_input_tokens_seen": 224032528, + "step": 103720 + }, + { + "epoch": 16.920880913539968, + "grad_norm": 0.015244879759848118, + "learning_rate": 7.048798143903873e-05, + "loss": 0.0012, + "num_input_tokens_seen": 224044432, + "step": 103725 + }, + { + "epoch": 16.921696574225123, + "grad_norm": 0.007751537952572107, + "learning_rate": 7.045154633090861e-05, + "loss": 0.0162, + "num_input_tokens_seen": 224054992, + "step": 103730 + }, + { + "epoch": 16.92251223491028, + "grad_norm": 0.0004403532948344946, + "learning_rate": 7.041511992812255e-05, + "loss": 0.0827, + "num_input_tokens_seen": 224065136, + "step": 103735 + }, + { + "epoch": 16.92332789559543, + "grad_norm": 0.040228236466646194, + "learning_rate": 7.037870223141935e-05, + "loss": 0.0069, + "num_input_tokens_seen": 224077392, + "step": 103740 + }, + { + "epoch": 16.924143556280587, + "grad_norm": 0.05721784383058548, + "learning_rate": 7.034229324153652e-05, + "loss": 0.0022, + "num_input_tokens_seen": 224089040, + "step": 103745 + }, + { + "epoch": 16.924959216965743, + "grad_norm": 0.02705197036266327, + "learning_rate": 7.030589295921224e-05, + "loss": 0.0349, + "num_input_tokens_seen": 224100976, + "step": 103750 + }, + { + "epoch": 16.9257748776509, + "grad_norm": 0.0023379966150969267, + "learning_rate": 7.026950138518423e-05, + "loss": 0.0046, + "num_input_tokens_seen": 224112528, + "step": 103755 + }, + { + "epoch": 16.92659053833605, + "grad_norm": 0.002170360879972577, + "learning_rate": 7.023311852018988e-05, + "loss": 0.0008, + "num_input_tokens_seen": 224123024, + "step": 103760 + }, + { + "epoch": 16.927406199021206, + "grad_norm": 0.0021907107438892126, + "learning_rate": 7.019674436496653e-05, + "loss": 0.0013, + "num_input_tokens_seen": 224135216, + "step": 103765 + }, + { + "epoch": 16.928221859706362, + "grad_norm": 0.6404808759689331, + "learning_rate": 7.01603789202515e-05, + "loss": 0.0145, + "num_input_tokens_seen": 224147056, + "step": 103770 + }, + { + "epoch": 16.929037520391518, + "grad_norm": 0.0004183925047982484, + "learning_rate": 7.01240221867816e-05, + "loss": 0.0006, + "num_input_tokens_seen": 224158096, + "step": 103775 + }, + { + "epoch": 16.929853181076673, + "grad_norm": 0.00027834964566864073, + "learning_rate": 7.008767416529376e-05, + "loss": 0.0007, + "num_input_tokens_seen": 224168080, + "step": 103780 + }, + { + "epoch": 16.930668841761825, + "grad_norm": 0.7886783480644226, + "learning_rate": 7.00513348565246e-05, + "loss": 0.0603, + "num_input_tokens_seen": 224179248, + "step": 103785 + }, + { + "epoch": 16.93148450244698, + "grad_norm": 0.0031499317847192287, + "learning_rate": 7.001500426121055e-05, + "loss": 0.0295, + "num_input_tokens_seen": 224190960, + "step": 103790 + }, + { + "epoch": 16.932300163132137, + "grad_norm": 0.01849014312028885, + "learning_rate": 6.997868238008793e-05, + "loss": 0.0008, + "num_input_tokens_seen": 224200656, + "step": 103795 + }, + { + "epoch": 16.933115823817293, + "grad_norm": 0.007087129633873701, + "learning_rate": 6.994236921389268e-05, + "loss": 0.0026, + "num_input_tokens_seen": 224212016, + "step": 103800 + }, + { + "epoch": 16.93393148450245, + "grad_norm": 0.09033387899398804, + "learning_rate": 6.990606476336114e-05, + "loss": 0.0034, + "num_input_tokens_seen": 224222448, + "step": 103805 + }, + { + "epoch": 16.9347471451876, + "grad_norm": 0.011910875327885151, + "learning_rate": 6.98697690292286e-05, + "loss": 0.0434, + "num_input_tokens_seen": 224233008, + "step": 103810 + }, + { + "epoch": 16.935562805872756, + "grad_norm": 0.002567737130448222, + "learning_rate": 6.983348201223105e-05, + "loss": 0.0023, + "num_input_tokens_seen": 224245200, + "step": 103815 + }, + { + "epoch": 16.936378466557912, + "grad_norm": 0.0010787706123664975, + "learning_rate": 6.97972037131035e-05, + "loss": 0.0005, + "num_input_tokens_seen": 224255888, + "step": 103820 + }, + { + "epoch": 16.937194127243067, + "grad_norm": 0.010754414834082127, + "learning_rate": 6.976093413258156e-05, + "loss": 0.0013, + "num_input_tokens_seen": 224265904, + "step": 103825 + }, + { + "epoch": 16.938009787928223, + "grad_norm": 0.0027486232575029135, + "learning_rate": 6.972467327139987e-05, + "loss": 0.0028, + "num_input_tokens_seen": 224275664, + "step": 103830 + }, + { + "epoch": 16.938825448613375, + "grad_norm": 0.0008375774486921728, + "learning_rate": 6.968842113029372e-05, + "loss": 0.0018, + "num_input_tokens_seen": 224286768, + "step": 103835 + }, + { + "epoch": 16.93964110929853, + "grad_norm": 0.02906578592956066, + "learning_rate": 6.965217770999738e-05, + "loss": 0.0031, + "num_input_tokens_seen": 224297008, + "step": 103840 + }, + { + "epoch": 16.940456769983687, + "grad_norm": 0.005131200421601534, + "learning_rate": 6.961594301124585e-05, + "loss": 0.0058, + "num_input_tokens_seen": 224307216, + "step": 103845 + }, + { + "epoch": 16.941272430668842, + "grad_norm": 0.01488333661109209, + "learning_rate": 6.957971703477301e-05, + "loss": 0.0146, + "num_input_tokens_seen": 224317424, + "step": 103850 + }, + { + "epoch": 16.942088091353998, + "grad_norm": 0.12265031039714813, + "learning_rate": 6.954349978131342e-05, + "loss": 0.0061, + "num_input_tokens_seen": 224328688, + "step": 103855 + }, + { + "epoch": 16.94290375203915, + "grad_norm": 0.0015900880098342896, + "learning_rate": 6.950729125160066e-05, + "loss": 0.0003, + "num_input_tokens_seen": 224339312, + "step": 103860 + }, + { + "epoch": 16.943719412724306, + "grad_norm": 0.03072093427181244, + "learning_rate": 6.947109144636898e-05, + "loss": 0.0038, + "num_input_tokens_seen": 224351280, + "step": 103865 + }, + { + "epoch": 16.94453507340946, + "grad_norm": 0.002643781015649438, + "learning_rate": 6.943490036635158e-05, + "loss": 0.0006, + "num_input_tokens_seen": 224362064, + "step": 103870 + }, + { + "epoch": 16.945350734094617, + "grad_norm": 0.005371812731027603, + "learning_rate": 6.939871801228236e-05, + "loss": 0.0027, + "num_input_tokens_seen": 224372976, + "step": 103875 + }, + { + "epoch": 16.946166394779773, + "grad_norm": 0.007469065487384796, + "learning_rate": 6.936254438489414e-05, + "loss": 0.002, + "num_input_tokens_seen": 224384880, + "step": 103880 + }, + { + "epoch": 16.946982055464925, + "grad_norm": 0.012764902785420418, + "learning_rate": 6.932637948492038e-05, + "loss": 0.001, + "num_input_tokens_seen": 224396496, + "step": 103885 + }, + { + "epoch": 16.94779771615008, + "grad_norm": 0.07486124336719513, + "learning_rate": 6.929022331309392e-05, + "loss": 0.0041, + "num_input_tokens_seen": 224408208, + "step": 103890 + }, + { + "epoch": 16.948613376835237, + "grad_norm": 0.6385883688926697, + "learning_rate": 6.925407587014743e-05, + "loss": 0.0952, + "num_input_tokens_seen": 224418288, + "step": 103895 + }, + { + "epoch": 16.949429037520392, + "grad_norm": 0.5001348257064819, + "learning_rate": 6.921793715681358e-05, + "loss": 0.1072, + "num_input_tokens_seen": 224428688, + "step": 103900 + }, + { + "epoch": 16.950244698205548, + "grad_norm": 0.013521653600037098, + "learning_rate": 6.918180717382466e-05, + "loss": 0.0017, + "num_input_tokens_seen": 224439216, + "step": 103905 + }, + { + "epoch": 16.9510603588907, + "grad_norm": 0.003049998078495264, + "learning_rate": 6.914568592191301e-05, + "loss": 0.005, + "num_input_tokens_seen": 224450256, + "step": 103910 + }, + { + "epoch": 16.951876019575856, + "grad_norm": 0.020504631102085114, + "learning_rate": 6.910957340181056e-05, + "loss": 0.0039, + "num_input_tokens_seen": 224461872, + "step": 103915 + }, + { + "epoch": 16.95269168026101, + "grad_norm": 0.0012223842786625028, + "learning_rate": 6.907346961424926e-05, + "loss": 0.0754, + "num_input_tokens_seen": 224473712, + "step": 103920 + }, + { + "epoch": 16.953507340946167, + "grad_norm": 0.009138503111898899, + "learning_rate": 6.903737455996073e-05, + "loss": 0.0026, + "num_input_tokens_seen": 224484560, + "step": 103925 + }, + { + "epoch": 16.954323001631323, + "grad_norm": 0.005319240037351847, + "learning_rate": 6.900128823967655e-05, + "loss": 0.0008, + "num_input_tokens_seen": 224494608, + "step": 103930 + }, + { + "epoch": 16.955138662316475, + "grad_norm": 0.0023457880597561598, + "learning_rate": 6.896521065412803e-05, + "loss": 0.0019, + "num_input_tokens_seen": 224505488, + "step": 103935 + }, + { + "epoch": 16.95595432300163, + "grad_norm": 0.0007057767361402512, + "learning_rate": 6.89291418040463e-05, + "loss": 0.006, + "num_input_tokens_seen": 224516304, + "step": 103940 + }, + { + "epoch": 16.956769983686787, + "grad_norm": 0.0032573130447417498, + "learning_rate": 6.889308169016229e-05, + "loss": 0.0007, + "num_input_tokens_seen": 224527056, + "step": 103945 + }, + { + "epoch": 16.957585644371942, + "grad_norm": 0.05521783605217934, + "learning_rate": 6.885703031320706e-05, + "loss": 0.0149, + "num_input_tokens_seen": 224537360, + "step": 103950 + }, + { + "epoch": 16.958401305057095, + "grad_norm": 0.00016859463357832283, + "learning_rate": 6.882098767391087e-05, + "loss": 0.0048, + "num_input_tokens_seen": 224548240, + "step": 103955 + }, + { + "epoch": 16.95921696574225, + "grad_norm": 0.0033800648525357246, + "learning_rate": 6.878495377300453e-05, + "loss": 0.0009, + "num_input_tokens_seen": 224560336, + "step": 103960 + }, + { + "epoch": 16.960032626427406, + "grad_norm": 0.01001080870628357, + "learning_rate": 6.874892861121795e-05, + "loss": 0.0011, + "num_input_tokens_seen": 224572496, + "step": 103965 + }, + { + "epoch": 16.96084828711256, + "grad_norm": 0.004089967347681522, + "learning_rate": 6.871291218928166e-05, + "loss": 0.0012, + "num_input_tokens_seen": 224583216, + "step": 103970 + }, + { + "epoch": 16.961663947797717, + "grad_norm": 0.0007215813966467977, + "learning_rate": 6.867690450792508e-05, + "loss": 0.0007, + "num_input_tokens_seen": 224595696, + "step": 103975 + }, + { + "epoch": 16.96247960848287, + "grad_norm": 0.0013670484768226743, + "learning_rate": 6.864090556787838e-05, + "loss": 0.0016, + "num_input_tokens_seen": 224606288, + "step": 103980 + }, + { + "epoch": 16.963295269168025, + "grad_norm": 0.011604733765125275, + "learning_rate": 6.860491536987079e-05, + "loss": 0.0015, + "num_input_tokens_seen": 224617488, + "step": 103985 + }, + { + "epoch": 16.96411092985318, + "grad_norm": 0.04235439747571945, + "learning_rate": 6.856893391463192e-05, + "loss": 0.0052, + "num_input_tokens_seen": 224628720, + "step": 103990 + }, + { + "epoch": 16.964926590538337, + "grad_norm": 0.0010337868006899953, + "learning_rate": 6.853296120289094e-05, + "loss": 0.0016, + "num_input_tokens_seen": 224640432, + "step": 103995 + }, + { + "epoch": 16.965742251223492, + "grad_norm": 0.02236298657953739, + "learning_rate": 6.849699723537684e-05, + "loss": 0.0067, + "num_input_tokens_seen": 224651024, + "step": 104000 + }, + { + "epoch": 16.966557911908644, + "grad_norm": 0.00834821816533804, + "learning_rate": 6.84610420128185e-05, + "loss": 0.0031, + "num_input_tokens_seen": 224661904, + "step": 104005 + }, + { + "epoch": 16.9673735725938, + "grad_norm": 0.0009754026541486382, + "learning_rate": 6.842509553594462e-05, + "loss": 0.0054, + "num_input_tokens_seen": 224672592, + "step": 104010 + }, + { + "epoch": 16.968189233278956, + "grad_norm": 0.07079813629388809, + "learning_rate": 6.83891578054836e-05, + "loss": 0.0036, + "num_input_tokens_seen": 224683600, + "step": 104015 + }, + { + "epoch": 16.96900489396411, + "grad_norm": 0.0004905558307655156, + "learning_rate": 6.835322882216388e-05, + "loss": 0.0014, + "num_input_tokens_seen": 224694032, + "step": 104020 + }, + { + "epoch": 16.969820554649267, + "grad_norm": 0.00019701993733178824, + "learning_rate": 6.831730858671353e-05, + "loss": 0.0125, + "num_input_tokens_seen": 224705712, + "step": 104025 + }, + { + "epoch": 16.97063621533442, + "grad_norm": 0.018763495609164238, + "learning_rate": 6.828139709986058e-05, + "loss": 0.0194, + "num_input_tokens_seen": 224714768, + "step": 104030 + }, + { + "epoch": 16.971451876019575, + "grad_norm": 0.009430291131138802, + "learning_rate": 6.824549436233279e-05, + "loss": 0.0012, + "num_input_tokens_seen": 224725808, + "step": 104035 + }, + { + "epoch": 16.97226753670473, + "grad_norm": 0.003956033382564783, + "learning_rate": 6.820960037485779e-05, + "loss": 0.0015, + "num_input_tokens_seen": 224736752, + "step": 104040 + }, + { + "epoch": 16.973083197389887, + "grad_norm": 0.0390721932053566, + "learning_rate": 6.8173715138163e-05, + "loss": 0.002, + "num_input_tokens_seen": 224747728, + "step": 104045 + }, + { + "epoch": 16.973898858075042, + "grad_norm": 0.0026890782173722982, + "learning_rate": 6.813783865297563e-05, + "loss": 0.0016, + "num_input_tokens_seen": 224758320, + "step": 104050 + }, + { + "epoch": 16.974714518760194, + "grad_norm": 0.00021689318236894906, + "learning_rate": 6.810197092002285e-05, + "loss": 0.0009, + "num_input_tokens_seen": 224769232, + "step": 104055 + }, + { + "epoch": 16.97553017944535, + "grad_norm": 0.008539892733097076, + "learning_rate": 6.806611194003154e-05, + "loss": 0.0053, + "num_input_tokens_seen": 224780016, + "step": 104060 + }, + { + "epoch": 16.976345840130506, + "grad_norm": 0.002406371058896184, + "learning_rate": 6.803026171372845e-05, + "loss": 0.001, + "num_input_tokens_seen": 224792560, + "step": 104065 + }, + { + "epoch": 16.97716150081566, + "grad_norm": 0.011961296200752258, + "learning_rate": 6.799442024184005e-05, + "loss": 0.0015, + "num_input_tokens_seen": 224803216, + "step": 104070 + }, + { + "epoch": 16.977977161500817, + "grad_norm": 0.0037891874089837074, + "learning_rate": 6.795858752509276e-05, + "loss": 0.0011, + "num_input_tokens_seen": 224814768, + "step": 104075 + }, + { + "epoch": 16.97879282218597, + "grad_norm": 0.006421142257750034, + "learning_rate": 6.792276356421278e-05, + "loss": 0.001, + "num_input_tokens_seen": 224826128, + "step": 104080 + }, + { + "epoch": 16.979608482871125, + "grad_norm": 0.0007180146058090031, + "learning_rate": 6.788694835992615e-05, + "loss": 0.0108, + "num_input_tokens_seen": 224836848, + "step": 104085 + }, + { + "epoch": 16.98042414355628, + "grad_norm": 0.0014143181033432484, + "learning_rate": 6.785114191295854e-05, + "loss": 0.0017, + "num_input_tokens_seen": 224848816, + "step": 104090 + }, + { + "epoch": 16.981239804241437, + "grad_norm": 0.012540574185550213, + "learning_rate": 6.78153442240359e-05, + "loss": 0.0022, + "num_input_tokens_seen": 224858992, + "step": 104095 + }, + { + "epoch": 16.982055464926592, + "grad_norm": 0.045478545129299164, + "learning_rate": 6.777955529388358e-05, + "loss": 0.0078, + "num_input_tokens_seen": 224869616, + "step": 104100 + }, + { + "epoch": 16.982871125611744, + "grad_norm": 0.005564156919717789, + "learning_rate": 6.774377512322688e-05, + "loss": 0.0043, + "num_input_tokens_seen": 224880880, + "step": 104105 + }, + { + "epoch": 16.9836867862969, + "grad_norm": 0.004332674667239189, + "learning_rate": 6.77080037127909e-05, + "loss": 0.0024, + "num_input_tokens_seen": 224890896, + "step": 104110 + }, + { + "epoch": 16.984502446982056, + "grad_norm": 0.004451078828424215, + "learning_rate": 6.767224106330067e-05, + "loss": 0.0009, + "num_input_tokens_seen": 224901168, + "step": 104115 + }, + { + "epoch": 16.98531810766721, + "grad_norm": 0.0015942800091579556, + "learning_rate": 6.763648717548088e-05, + "loss": 0.1505, + "num_input_tokens_seen": 224912048, + "step": 104120 + }, + { + "epoch": 16.986133768352367, + "grad_norm": 0.00040785997407510877, + "learning_rate": 6.760074205005617e-05, + "loss": 0.0015, + "num_input_tokens_seen": 224922768, + "step": 104125 + }, + { + "epoch": 16.98694942903752, + "grad_norm": 0.00912250392138958, + "learning_rate": 6.756500568775098e-05, + "loss": 0.0708, + "num_input_tokens_seen": 224934064, + "step": 104130 + }, + { + "epoch": 16.987765089722675, + "grad_norm": 0.0014891140162944794, + "learning_rate": 6.752927808928955e-05, + "loss": 0.002, + "num_input_tokens_seen": 224944720, + "step": 104135 + }, + { + "epoch": 16.98858075040783, + "grad_norm": 0.01289412658661604, + "learning_rate": 6.749355925539591e-05, + "loss": 0.0025, + "num_input_tokens_seen": 224956144, + "step": 104140 + }, + { + "epoch": 16.989396411092986, + "grad_norm": 0.10449906438589096, + "learning_rate": 6.745784918679399e-05, + "loss": 0.1343, + "num_input_tokens_seen": 224966960, + "step": 104145 + }, + { + "epoch": 16.99021207177814, + "grad_norm": 0.0009628917323425412, + "learning_rate": 6.742214788420742e-05, + "loss": 0.0271, + "num_input_tokens_seen": 224976528, + "step": 104150 + }, + { + "epoch": 16.991027732463294, + "grad_norm": 0.0012101201573386788, + "learning_rate": 6.73864553483598e-05, + "loss": 0.0006, + "num_input_tokens_seen": 224989584, + "step": 104155 + }, + { + "epoch": 16.99184339314845, + "grad_norm": 0.0003157875908073038, + "learning_rate": 6.735077157997448e-05, + "loss": 0.0019, + "num_input_tokens_seen": 225000272, + "step": 104160 + }, + { + "epoch": 16.992659053833606, + "grad_norm": 0.1327991783618927, + "learning_rate": 6.731509657977464e-05, + "loss": 0.007, + "num_input_tokens_seen": 225011440, + "step": 104165 + }, + { + "epoch": 16.99347471451876, + "grad_norm": 0.009232837706804276, + "learning_rate": 6.727943034848327e-05, + "loss": 0.0019, + "num_input_tokens_seen": 225022096, + "step": 104170 + }, + { + "epoch": 16.994290375203914, + "grad_norm": 0.001391786616295576, + "learning_rate": 6.72437728868232e-05, + "loss": 0.0006, + "num_input_tokens_seen": 225033232, + "step": 104175 + }, + { + "epoch": 16.99510603588907, + "grad_norm": 0.0021746442653238773, + "learning_rate": 6.720812419551703e-05, + "loss": 0.0037, + "num_input_tokens_seen": 225043664, + "step": 104180 + }, + { + "epoch": 16.995921696574225, + "grad_norm": 0.0031335726380348206, + "learning_rate": 6.717248427528727e-05, + "loss": 0.0139, + "num_input_tokens_seen": 225053840, + "step": 104185 + }, + { + "epoch": 16.99673735725938, + "grad_norm": 0.002999882446601987, + "learning_rate": 6.713685312685619e-05, + "loss": 0.0019, + "num_input_tokens_seen": 225064688, + "step": 104190 + }, + { + "epoch": 16.997553017944536, + "grad_norm": 0.17880254983901978, + "learning_rate": 6.710123075094593e-05, + "loss": 0.0087, + "num_input_tokens_seen": 225075696, + "step": 104195 + }, + { + "epoch": 16.99836867862969, + "grad_norm": 0.0011067570885643363, + "learning_rate": 6.70656171482783e-05, + "loss": 0.0007, + "num_input_tokens_seen": 225084624, + "step": 104200 + }, + { + "epoch": 16.999184339314844, + "grad_norm": 0.9609259963035583, + "learning_rate": 6.703001231957535e-05, + "loss": 0.0853, + "num_input_tokens_seen": 225095632, + "step": 104205 + }, + { + "epoch": 17.0, + "grad_norm": 0.22522194683551788, + "learning_rate": 6.699441626555824e-05, + "loss": 0.0111, + "num_input_tokens_seen": 225105296, + "step": 104210 + }, + { + "epoch": 17.0, + "eval_loss": 0.29674801230430603, + "eval_runtime": 104.219, + "eval_samples_per_second": 26.147, + "eval_steps_per_second": 6.544, + "num_input_tokens_seen": 225105296, + "step": 104210 + }, + { + "epoch": 17.000815660685156, + "grad_norm": 0.0023162413854151964, + "learning_rate": 6.695882898694883e-05, + "loss": 0.0008, + "num_input_tokens_seen": 225116784, + "step": 104215 + }, + { + "epoch": 17.00163132137031, + "grad_norm": 0.0006858249544166028, + "learning_rate": 6.692325048446784e-05, + "loss": 0.0014, + "num_input_tokens_seen": 225129328, + "step": 104220 + }, + { + "epoch": 17.002446982055464, + "grad_norm": 0.7215015888214111, + "learning_rate": 6.688768075883683e-05, + "loss": 0.0454, + "num_input_tokens_seen": 225138544, + "step": 104225 + }, + { + "epoch": 17.00326264274062, + "grad_norm": 0.015421743504703045, + "learning_rate": 6.685211981077616e-05, + "loss": 0.0055, + "num_input_tokens_seen": 225149072, + "step": 104230 + }, + { + "epoch": 17.004078303425775, + "grad_norm": 0.001810228219255805, + "learning_rate": 6.68165676410069e-05, + "loss": 0.0718, + "num_input_tokens_seen": 225159344, + "step": 104235 + }, + { + "epoch": 17.00489396411093, + "grad_norm": 0.004929349757730961, + "learning_rate": 6.678102425024946e-05, + "loss": 0.0034, + "num_input_tokens_seen": 225170096, + "step": 104240 + }, + { + "epoch": 17.005709624796086, + "grad_norm": 0.030193855985999107, + "learning_rate": 6.674548963922412e-05, + "loss": 0.0033, + "num_input_tokens_seen": 225181104, + "step": 104245 + }, + { + "epoch": 17.00652528548124, + "grad_norm": 0.0012685685651376843, + "learning_rate": 6.670996380865101e-05, + "loss": 0.0022, + "num_input_tokens_seen": 225191376, + "step": 104250 + }, + { + "epoch": 17.007340946166394, + "grad_norm": 0.037792641669511795, + "learning_rate": 6.667444675925022e-05, + "loss": 0.0028, + "num_input_tokens_seen": 225202000, + "step": 104255 + }, + { + "epoch": 17.00815660685155, + "grad_norm": 0.008928779512643814, + "learning_rate": 6.663893849174147e-05, + "loss": 0.002, + "num_input_tokens_seen": 225212592, + "step": 104260 + }, + { + "epoch": 17.008972267536706, + "grad_norm": 0.02849671244621277, + "learning_rate": 6.660343900684434e-05, + "loss": 0.0034, + "num_input_tokens_seen": 225224912, + "step": 104265 + }, + { + "epoch": 17.00978792822186, + "grad_norm": 0.0014940741239115596, + "learning_rate": 6.656794830527835e-05, + "loss": 0.0019, + "num_input_tokens_seen": 225235856, + "step": 104270 + }, + { + "epoch": 17.010603588907014, + "grad_norm": 0.004284200258553028, + "learning_rate": 6.653246638776273e-05, + "loss": 0.0025, + "num_input_tokens_seen": 225246640, + "step": 104275 + }, + { + "epoch": 17.01141924959217, + "grad_norm": 0.15875405073165894, + "learning_rate": 6.649699325501657e-05, + "loss": 0.008, + "num_input_tokens_seen": 225258864, + "step": 104280 + }, + { + "epoch": 17.012234910277325, + "grad_norm": 0.1828366369009018, + "learning_rate": 6.64615289077588e-05, + "loss": 0.0078, + "num_input_tokens_seen": 225270416, + "step": 104285 + }, + { + "epoch": 17.01305057096248, + "grad_norm": 0.02164081484079361, + "learning_rate": 6.642607334670808e-05, + "loss": 0.0041, + "num_input_tokens_seen": 225280560, + "step": 104290 + }, + { + "epoch": 17.013866231647636, + "grad_norm": 0.01167318969964981, + "learning_rate": 6.639062657258305e-05, + "loss": 0.0009, + "num_input_tokens_seen": 225290864, + "step": 104295 + }, + { + "epoch": 17.01468189233279, + "grad_norm": 0.009521464817225933, + "learning_rate": 6.635518858610207e-05, + "loss": 0.0031, + "num_input_tokens_seen": 225302576, + "step": 104300 + }, + { + "epoch": 17.015497553017944, + "grad_norm": 0.02223174087703228, + "learning_rate": 6.631975938798312e-05, + "loss": 0.0019, + "num_input_tokens_seen": 225314000, + "step": 104305 + }, + { + "epoch": 17.0163132137031, + "grad_norm": 0.0020480211824178696, + "learning_rate": 6.62843389789447e-05, + "loss": 0.0011, + "num_input_tokens_seen": 225323952, + "step": 104310 + }, + { + "epoch": 17.017128874388256, + "grad_norm": 0.031527843326330185, + "learning_rate": 6.624892735970412e-05, + "loss": 0.0033, + "num_input_tokens_seen": 225334416, + "step": 104315 + }, + { + "epoch": 17.017944535073408, + "grad_norm": 0.018897900357842445, + "learning_rate": 6.621352453097951e-05, + "loss": 0.0012, + "num_input_tokens_seen": 225345008, + "step": 104320 + }, + { + "epoch": 17.018760195758563, + "grad_norm": 0.004922399763017893, + "learning_rate": 6.617813049348787e-05, + "loss": 0.0071, + "num_input_tokens_seen": 225357008, + "step": 104325 + }, + { + "epoch": 17.01957585644372, + "grad_norm": 0.003675678512081504, + "learning_rate": 6.6142745247947e-05, + "loss": 0.0025, + "num_input_tokens_seen": 225367312, + "step": 104330 + }, + { + "epoch": 17.020391517128875, + "grad_norm": 0.0006571310805156827, + "learning_rate": 6.610736879507356e-05, + "loss": 0.0049, + "num_input_tokens_seen": 225378224, + "step": 104335 + }, + { + "epoch": 17.02120717781403, + "grad_norm": 0.008297720924019814, + "learning_rate": 6.607200113558493e-05, + "loss": 0.001, + "num_input_tokens_seen": 225389936, + "step": 104340 + }, + { + "epoch": 17.022022838499183, + "grad_norm": 0.0017699478194117546, + "learning_rate": 6.603664227019745e-05, + "loss": 0.001, + "num_input_tokens_seen": 225400592, + "step": 104345 + }, + { + "epoch": 17.02283849918434, + "grad_norm": 0.0011303251376375556, + "learning_rate": 6.600129219962819e-05, + "loss": 0.0008, + "num_input_tokens_seen": 225411632, + "step": 104350 + }, + { + "epoch": 17.023654159869494, + "grad_norm": 0.031992051750421524, + "learning_rate": 6.596595092459307e-05, + "loss": 0.0027, + "num_input_tokens_seen": 225423088, + "step": 104355 + }, + { + "epoch": 17.02446982055465, + "grad_norm": 0.00496671674773097, + "learning_rate": 6.593061844580878e-05, + "loss": 0.0034, + "num_input_tokens_seen": 225434448, + "step": 104360 + }, + { + "epoch": 17.025285481239806, + "grad_norm": 0.014466633088886738, + "learning_rate": 6.589529476399097e-05, + "loss": 0.0022, + "num_input_tokens_seen": 225445456, + "step": 104365 + }, + { + "epoch": 17.026101141924958, + "grad_norm": 0.13525429368019104, + "learning_rate": 6.585997987985592e-05, + "loss": 0.043, + "num_input_tokens_seen": 225456400, + "step": 104370 + }, + { + "epoch": 17.026916802610113, + "grad_norm": 0.5867858529090881, + "learning_rate": 6.582467379411889e-05, + "loss": 0.0628, + "num_input_tokens_seen": 225466256, + "step": 104375 + }, + { + "epoch": 17.02773246329527, + "grad_norm": 0.0019362906459718943, + "learning_rate": 6.578937650749573e-05, + "loss": 0.0012, + "num_input_tokens_seen": 225477008, + "step": 104380 + }, + { + "epoch": 17.028548123980425, + "grad_norm": 0.006260257679969072, + "learning_rate": 6.575408802070171e-05, + "loss": 0.0039, + "num_input_tokens_seen": 225488592, + "step": 104385 + }, + { + "epoch": 17.02936378466558, + "grad_norm": 0.021634496748447418, + "learning_rate": 6.571880833445198e-05, + "loss": 0.0118, + "num_input_tokens_seen": 225499248, + "step": 104390 + }, + { + "epoch": 17.030179445350733, + "grad_norm": 0.0008673843694850802, + "learning_rate": 6.568353744946154e-05, + "loss": 0.0025, + "num_input_tokens_seen": 225509968, + "step": 104395 + }, + { + "epoch": 17.03099510603589, + "grad_norm": 0.0009401759598404169, + "learning_rate": 6.564827536644519e-05, + "loss": 0.0013, + "num_input_tokens_seen": 225520752, + "step": 104400 + }, + { + "epoch": 17.031810766721044, + "grad_norm": 0.014809912070631981, + "learning_rate": 6.561302208611752e-05, + "loss": 0.0016, + "num_input_tokens_seen": 225530736, + "step": 104405 + }, + { + "epoch": 17.0326264274062, + "grad_norm": 0.03141998499631882, + "learning_rate": 6.557777760919303e-05, + "loss": 0.0033, + "num_input_tokens_seen": 225542864, + "step": 104410 + }, + { + "epoch": 17.033442088091356, + "grad_norm": 0.001158158527687192, + "learning_rate": 6.554254193638598e-05, + "loss": 0.004, + "num_input_tokens_seen": 225553296, + "step": 104415 + }, + { + "epoch": 17.034257748776508, + "grad_norm": 0.49954646825790405, + "learning_rate": 6.550731506841046e-05, + "loss": 0.0202, + "num_input_tokens_seen": 225564016, + "step": 104420 + }, + { + "epoch": 17.035073409461663, + "grad_norm": 0.0024979086592793465, + "learning_rate": 6.54720970059804e-05, + "loss": 0.0082, + "num_input_tokens_seen": 225575024, + "step": 104425 + }, + { + "epoch": 17.03588907014682, + "grad_norm": 0.004868679214268923, + "learning_rate": 6.543688774980944e-05, + "loss": 0.0118, + "num_input_tokens_seen": 225586608, + "step": 104430 + }, + { + "epoch": 17.036704730831975, + "grad_norm": 0.008296649903059006, + "learning_rate": 6.540168730061141e-05, + "loss": 0.0011, + "num_input_tokens_seen": 225596592, + "step": 104435 + }, + { + "epoch": 17.03752039151713, + "grad_norm": 0.024006053805351257, + "learning_rate": 6.53664956590993e-05, + "loss": 0.0015, + "num_input_tokens_seen": 225607024, + "step": 104440 + }, + { + "epoch": 17.038336052202283, + "grad_norm": 0.008441988378763199, + "learning_rate": 6.533131282598676e-05, + "loss": 0.0037, + "num_input_tokens_seen": 225617968, + "step": 104445 + }, + { + "epoch": 17.03915171288744, + "grad_norm": 0.03972849249839783, + "learning_rate": 6.529613880198638e-05, + "loss": 0.0035, + "num_input_tokens_seen": 225628976, + "step": 104450 + }, + { + "epoch": 17.039967373572594, + "grad_norm": 0.01612004078924656, + "learning_rate": 6.526097358781141e-05, + "loss": 0.0494, + "num_input_tokens_seen": 225639088, + "step": 104455 + }, + { + "epoch": 17.04078303425775, + "grad_norm": 0.0036437015514820814, + "learning_rate": 6.522581718417409e-05, + "loss": 0.0086, + "num_input_tokens_seen": 225649008, + "step": 104460 + }, + { + "epoch": 17.041598694942905, + "grad_norm": 0.08305442333221436, + "learning_rate": 6.519066959178738e-05, + "loss": 0.0044, + "num_input_tokens_seen": 225659632, + "step": 104465 + }, + { + "epoch": 17.042414355628058, + "grad_norm": 0.000809130840934813, + "learning_rate": 6.515553081136311e-05, + "loss": 0.08, + "num_input_tokens_seen": 225670320, + "step": 104470 + }, + { + "epoch": 17.043230016313213, + "grad_norm": 0.01146237924695015, + "learning_rate": 6.512040084361388e-05, + "loss": 0.0071, + "num_input_tokens_seen": 225680080, + "step": 104475 + }, + { + "epoch": 17.04404567699837, + "grad_norm": 0.04106791317462921, + "learning_rate": 6.508527968925115e-05, + "loss": 0.0034, + "num_input_tokens_seen": 225691024, + "step": 104480 + }, + { + "epoch": 17.044861337683525, + "grad_norm": 0.00045756070176139474, + "learning_rate": 6.505016734898722e-05, + "loss": 0.0125, + "num_input_tokens_seen": 225701840, + "step": 104485 + }, + { + "epoch": 17.045676998368677, + "grad_norm": 0.007039316929876804, + "learning_rate": 6.501506382353317e-05, + "loss": 0.0033, + "num_input_tokens_seen": 225713680, + "step": 104490 + }, + { + "epoch": 17.046492659053833, + "grad_norm": 0.447841078042984, + "learning_rate": 6.497996911360093e-05, + "loss": 0.0284, + "num_input_tokens_seen": 225724336, + "step": 104495 + }, + { + "epoch": 17.04730831973899, + "grad_norm": 0.001014457899145782, + "learning_rate": 6.494488321990122e-05, + "loss": 0.0045, + "num_input_tokens_seen": 225735376, + "step": 104500 + }, + { + "epoch": 17.048123980424144, + "grad_norm": 0.06084701418876648, + "learning_rate": 6.490980614314556e-05, + "loss": 0.0037, + "num_input_tokens_seen": 225746736, + "step": 104505 + }, + { + "epoch": 17.0489396411093, + "grad_norm": 0.008512042462825775, + "learning_rate": 6.487473788404446e-05, + "loss": 0.0013, + "num_input_tokens_seen": 225757264, + "step": 104510 + }, + { + "epoch": 17.049755301794452, + "grad_norm": 0.003893906017765403, + "learning_rate": 6.483967844330901e-05, + "loss": 0.0015, + "num_input_tokens_seen": 225769104, + "step": 104515 + }, + { + "epoch": 17.050570962479608, + "grad_norm": 0.004594842437654734, + "learning_rate": 6.480462782164925e-05, + "loss": 0.003, + "num_input_tokens_seen": 225780464, + "step": 104520 + }, + { + "epoch": 17.051386623164763, + "grad_norm": 1.0909298658370972, + "learning_rate": 6.476958601977595e-05, + "loss": 0.0641, + "num_input_tokens_seen": 225790736, + "step": 104525 + }, + { + "epoch": 17.05220228384992, + "grad_norm": 0.001894684974104166, + "learning_rate": 6.473455303839909e-05, + "loss": 0.002, + "num_input_tokens_seen": 225801584, + "step": 104530 + }, + { + "epoch": 17.053017944535075, + "grad_norm": 0.0030393460765480995, + "learning_rate": 6.469952887822866e-05, + "loss": 0.0006, + "num_input_tokens_seen": 225811920, + "step": 104535 + }, + { + "epoch": 17.053833605220227, + "grad_norm": 0.007903835736215115, + "learning_rate": 6.466451353997455e-05, + "loss": 0.003, + "num_input_tokens_seen": 225822000, + "step": 104540 + }, + { + "epoch": 17.054649265905383, + "grad_norm": 0.012811913155019283, + "learning_rate": 6.462950702434633e-05, + "loss": 0.0116, + "num_input_tokens_seen": 225833456, + "step": 104545 + }, + { + "epoch": 17.05546492659054, + "grad_norm": 0.0014708518283441663, + "learning_rate": 6.459450933205346e-05, + "loss": 0.002, + "num_input_tokens_seen": 225845072, + "step": 104550 + }, + { + "epoch": 17.056280587275694, + "grad_norm": 0.0006639196653850377, + "learning_rate": 6.455952046380514e-05, + "loss": 0.0022, + "num_input_tokens_seen": 225854960, + "step": 104555 + }, + { + "epoch": 17.05709624796085, + "grad_norm": 0.02281191386282444, + "learning_rate": 6.452454042031059e-05, + "loss": 0.0023, + "num_input_tokens_seen": 225866544, + "step": 104560 + }, + { + "epoch": 17.057911908646002, + "grad_norm": 0.00603465223684907, + "learning_rate": 6.448956920227867e-05, + "loss": 0.0024, + "num_input_tokens_seen": 225876016, + "step": 104565 + }, + { + "epoch": 17.058727569331158, + "grad_norm": 0.007597712334245443, + "learning_rate": 6.445460681041815e-05, + "loss": 0.005, + "num_input_tokens_seen": 225888176, + "step": 104570 + }, + { + "epoch": 17.059543230016313, + "grad_norm": 0.0021226252429187298, + "learning_rate": 6.441965324543737e-05, + "loss": 0.0094, + "num_input_tokens_seen": 225898672, + "step": 104575 + }, + { + "epoch": 17.06035889070147, + "grad_norm": 0.005578478332608938, + "learning_rate": 6.438470850804512e-05, + "loss": 0.0033, + "num_input_tokens_seen": 225910256, + "step": 104580 + }, + { + "epoch": 17.061174551386625, + "grad_norm": 0.0020552859641611576, + "learning_rate": 6.43497725989492e-05, + "loss": 0.0037, + "num_input_tokens_seen": 225921968, + "step": 104585 + }, + { + "epoch": 17.061990212071777, + "grad_norm": 0.016926616430282593, + "learning_rate": 6.431484551885797e-05, + "loss": 0.0042, + "num_input_tokens_seen": 225932848, + "step": 104590 + }, + { + "epoch": 17.062805872756933, + "grad_norm": 0.6727308034896851, + "learning_rate": 6.427992726847892e-05, + "loss": 0.1606, + "num_input_tokens_seen": 225944240, + "step": 104595 + }, + { + "epoch": 17.063621533442088, + "grad_norm": 0.003617391223087907, + "learning_rate": 6.424501784852004e-05, + "loss": 0.003, + "num_input_tokens_seen": 225955856, + "step": 104600 + }, + { + "epoch": 17.064437194127244, + "grad_norm": 0.0032554580830037594, + "learning_rate": 6.421011725968856e-05, + "loss": 0.0086, + "num_input_tokens_seen": 225966160, + "step": 104605 + }, + { + "epoch": 17.0652528548124, + "grad_norm": 0.0010663648135960102, + "learning_rate": 6.4175225502692e-05, + "loss": 0.0017, + "num_input_tokens_seen": 225977328, + "step": 104610 + }, + { + "epoch": 17.06606851549755, + "grad_norm": 0.003959180787205696, + "learning_rate": 6.414034257823725e-05, + "loss": 0.003, + "num_input_tokens_seen": 225986480, + "step": 104615 + }, + { + "epoch": 17.066884176182707, + "grad_norm": 0.0037438899744302034, + "learning_rate": 6.410546848703153e-05, + "loss": 0.0056, + "num_input_tokens_seen": 225997840, + "step": 104620 + }, + { + "epoch": 17.067699836867863, + "grad_norm": 0.010389694944024086, + "learning_rate": 6.407060322978131e-05, + "loss": 0.0045, + "num_input_tokens_seen": 226009360, + "step": 104625 + }, + { + "epoch": 17.06851549755302, + "grad_norm": 0.0010346529306843877, + "learning_rate": 6.403574680719343e-05, + "loss": 0.0026, + "num_input_tokens_seen": 226020176, + "step": 104630 + }, + { + "epoch": 17.069331158238175, + "grad_norm": 0.00428399071097374, + "learning_rate": 6.400089921997415e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226031216, + "step": 104635 + }, + { + "epoch": 17.070146818923327, + "grad_norm": 0.007730559445917606, + "learning_rate": 6.39660604688298e-05, + "loss": 0.0014, + "num_input_tokens_seen": 226041680, + "step": 104640 + }, + { + "epoch": 17.070962479608482, + "grad_norm": 0.018052903935313225, + "learning_rate": 6.393123055446637e-05, + "loss": 0.0013, + "num_input_tokens_seen": 226052784, + "step": 104645 + }, + { + "epoch": 17.071778140293638, + "grad_norm": 0.092511385679245, + "learning_rate": 6.389640947758973e-05, + "loss": 0.0125, + "num_input_tokens_seen": 226063824, + "step": 104650 + }, + { + "epoch": 17.072593800978794, + "grad_norm": 0.005565250292420387, + "learning_rate": 6.38615972389056e-05, + "loss": 0.0027, + "num_input_tokens_seen": 226074896, + "step": 104655 + }, + { + "epoch": 17.07340946166395, + "grad_norm": 0.1231377050280571, + "learning_rate": 6.382679383911949e-05, + "loss": 0.0029, + "num_input_tokens_seen": 226084816, + "step": 104660 + }, + { + "epoch": 17.0742251223491, + "grad_norm": 0.001401619054377079, + "learning_rate": 6.37919992789367e-05, + "loss": 0.0017, + "num_input_tokens_seen": 226095376, + "step": 104665 + }, + { + "epoch": 17.075040783034257, + "grad_norm": 0.012654143385589123, + "learning_rate": 6.375721355906245e-05, + "loss": 0.0195, + "num_input_tokens_seen": 226105936, + "step": 104670 + }, + { + "epoch": 17.075856443719413, + "grad_norm": 0.000800961337517947, + "learning_rate": 6.372243668020167e-05, + "loss": 0.0029, + "num_input_tokens_seen": 226116368, + "step": 104675 + }, + { + "epoch": 17.07667210440457, + "grad_norm": 0.00023649254580959678, + "learning_rate": 6.368766864305914e-05, + "loss": 0.0043, + "num_input_tokens_seen": 226127056, + "step": 104680 + }, + { + "epoch": 17.07748776508972, + "grad_norm": 0.001044937875121832, + "learning_rate": 6.365290944833952e-05, + "loss": 0.0022, + "num_input_tokens_seen": 226137200, + "step": 104685 + }, + { + "epoch": 17.078303425774877, + "grad_norm": 0.17576022446155548, + "learning_rate": 6.361815909674722e-05, + "loss": 0.005, + "num_input_tokens_seen": 226147600, + "step": 104690 + }, + { + "epoch": 17.079119086460032, + "grad_norm": 0.002019402338191867, + "learning_rate": 6.358341758898656e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226156784, + "step": 104695 + }, + { + "epoch": 17.079934747145188, + "grad_norm": 0.012892481870949268, + "learning_rate": 6.354868492576154e-05, + "loss": 0.0011, + "num_input_tokens_seen": 226167600, + "step": 104700 + }, + { + "epoch": 17.080750407830344, + "grad_norm": 0.024450616911053658, + "learning_rate": 6.351396110777613e-05, + "loss": 0.0041, + "num_input_tokens_seen": 226179088, + "step": 104705 + }, + { + "epoch": 17.081566068515496, + "grad_norm": 0.00017877235950436443, + "learning_rate": 6.347924613573402e-05, + "loss": 0.0021, + "num_input_tokens_seen": 226190928, + "step": 104710 + }, + { + "epoch": 17.08238172920065, + "grad_norm": 0.0333293154835701, + "learning_rate": 6.344454001033873e-05, + "loss": 0.0037, + "num_input_tokens_seen": 226201072, + "step": 104715 + }, + { + "epoch": 17.083197389885807, + "grad_norm": 0.009895344264805317, + "learning_rate": 6.340984273229355e-05, + "loss": 0.0013, + "num_input_tokens_seen": 226212688, + "step": 104720 + }, + { + "epoch": 17.084013050570963, + "grad_norm": 0.0060686697252094746, + "learning_rate": 6.337515430230196e-05, + "loss": 0.0024, + "num_input_tokens_seen": 226223760, + "step": 104725 + }, + { + "epoch": 17.08482871125612, + "grad_norm": 0.00028824826586060226, + "learning_rate": 6.334047472106657e-05, + "loss": 0.0022, + "num_input_tokens_seen": 226233264, + "step": 104730 + }, + { + "epoch": 17.08564437194127, + "grad_norm": 0.0006094975979067385, + "learning_rate": 6.330580398929047e-05, + "loss": 0.0006, + "num_input_tokens_seen": 226243440, + "step": 104735 + }, + { + "epoch": 17.086460032626427, + "grad_norm": 0.010639664717018604, + "learning_rate": 6.327114210767632e-05, + "loss": 0.0015, + "num_input_tokens_seen": 226255504, + "step": 104740 + }, + { + "epoch": 17.087275693311582, + "grad_norm": 0.5435303449630737, + "learning_rate": 6.323648907692642e-05, + "loss": 0.0707, + "num_input_tokens_seen": 226266768, + "step": 104745 + }, + { + "epoch": 17.088091353996738, + "grad_norm": 0.04907776787877083, + "learning_rate": 6.320184489774317e-05, + "loss": 0.0057, + "num_input_tokens_seen": 226277616, + "step": 104750 + }, + { + "epoch": 17.088907014681894, + "grad_norm": 0.0444052629172802, + "learning_rate": 6.316720957082867e-05, + "loss": 0.0028, + "num_input_tokens_seen": 226288144, + "step": 104755 + }, + { + "epoch": 17.089722675367046, + "grad_norm": 0.008682828396558762, + "learning_rate": 6.31325830968848e-05, + "loss": 0.0018, + "num_input_tokens_seen": 226300560, + "step": 104760 + }, + { + "epoch": 17.0905383360522, + "grad_norm": 0.01425588596612215, + "learning_rate": 6.30979654766134e-05, + "loss": 0.0016, + "num_input_tokens_seen": 226311248, + "step": 104765 + }, + { + "epoch": 17.091353996737357, + "grad_norm": 0.05291926860809326, + "learning_rate": 6.306335671071589e-05, + "loss": 0.0026, + "num_input_tokens_seen": 226323056, + "step": 104770 + }, + { + "epoch": 17.092169657422513, + "grad_norm": 0.004806335549801588, + "learning_rate": 6.302875679989384e-05, + "loss": 0.0005, + "num_input_tokens_seen": 226333072, + "step": 104775 + }, + { + "epoch": 17.09298531810767, + "grad_norm": 0.0032293670810759068, + "learning_rate": 6.299416574484828e-05, + "loss": 0.0017, + "num_input_tokens_seen": 226343920, + "step": 104780 + }, + { + "epoch": 17.09380097879282, + "grad_norm": 0.008256429806351662, + "learning_rate": 6.29595835462804e-05, + "loss": 0.0042, + "num_input_tokens_seen": 226354288, + "step": 104785 + }, + { + "epoch": 17.094616639477977, + "grad_norm": 0.030211608856916428, + "learning_rate": 6.2925010204891e-05, + "loss": 0.0055, + "num_input_tokens_seen": 226365520, + "step": 104790 + }, + { + "epoch": 17.095432300163132, + "grad_norm": 0.000742889940738678, + "learning_rate": 6.289044572138069e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226376944, + "step": 104795 + }, + { + "epoch": 17.096247960848288, + "grad_norm": 0.00248258956708014, + "learning_rate": 6.285589009644999e-05, + "loss": 0.0345, + "num_input_tokens_seen": 226387824, + "step": 104800 + }, + { + "epoch": 17.097063621533444, + "grad_norm": 0.00329033937305212, + "learning_rate": 6.282134333079926e-05, + "loss": 0.0042, + "num_input_tokens_seen": 226399056, + "step": 104805 + }, + { + "epoch": 17.097879282218596, + "grad_norm": 0.47946974635124207, + "learning_rate": 6.278680542512866e-05, + "loss": 0.0237, + "num_input_tokens_seen": 226408624, + "step": 104810 + }, + { + "epoch": 17.09869494290375, + "grad_norm": 0.0005470985197462142, + "learning_rate": 6.275227638013803e-05, + "loss": 0.0034, + "num_input_tokens_seen": 226418640, + "step": 104815 + }, + { + "epoch": 17.099510603588907, + "grad_norm": 0.0025842119939625263, + "learning_rate": 6.271775619652719e-05, + "loss": 0.0012, + "num_input_tokens_seen": 226430576, + "step": 104820 + }, + { + "epoch": 17.100326264274063, + "grad_norm": 0.002082726452499628, + "learning_rate": 6.268324487499583e-05, + "loss": 0.0068, + "num_input_tokens_seen": 226441456, + "step": 104825 + }, + { + "epoch": 17.10114192495922, + "grad_norm": 0.0005375830223783851, + "learning_rate": 6.264874241624324e-05, + "loss": 0.0008, + "num_input_tokens_seen": 226453040, + "step": 104830 + }, + { + "epoch": 17.10195758564437, + "grad_norm": 0.0021206443198025227, + "learning_rate": 6.261424882096866e-05, + "loss": 0.0015, + "num_input_tokens_seen": 226464624, + "step": 104835 + }, + { + "epoch": 17.102773246329527, + "grad_norm": 0.0012728559086099267, + "learning_rate": 6.257976408987115e-05, + "loss": 0.0006, + "num_input_tokens_seen": 226476048, + "step": 104840 + }, + { + "epoch": 17.103588907014682, + "grad_norm": 0.0008652537362650037, + "learning_rate": 6.254528822364985e-05, + "loss": 0.0035, + "num_input_tokens_seen": 226487408, + "step": 104845 + }, + { + "epoch": 17.104404567699838, + "grad_norm": 0.001977971289306879, + "learning_rate": 6.2510821223003e-05, + "loss": 0.0152, + "num_input_tokens_seen": 226498896, + "step": 104850 + }, + { + "epoch": 17.10522022838499, + "grad_norm": 0.00024319304793607444, + "learning_rate": 6.247636308862953e-05, + "loss": 0.0007, + "num_input_tokens_seen": 226509456, + "step": 104855 + }, + { + "epoch": 17.106035889070146, + "grad_norm": 0.0021559372544288635, + "learning_rate": 6.244191382122744e-05, + "loss": 0.0014, + "num_input_tokens_seen": 226518640, + "step": 104860 + }, + { + "epoch": 17.1068515497553, + "grad_norm": 0.013561434112489223, + "learning_rate": 6.240747342149511e-05, + "loss": 0.0011, + "num_input_tokens_seen": 226528944, + "step": 104865 + }, + { + "epoch": 17.107667210440457, + "grad_norm": 0.01310847606509924, + "learning_rate": 6.237304189013049e-05, + "loss": 0.0011, + "num_input_tokens_seen": 226540528, + "step": 104870 + }, + { + "epoch": 17.108482871125613, + "grad_norm": 0.05319130793213844, + "learning_rate": 6.233861922783135e-05, + "loss": 0.0035, + "num_input_tokens_seen": 226551024, + "step": 104875 + }, + { + "epoch": 17.109298531810765, + "grad_norm": 0.005616676993668079, + "learning_rate": 6.230420543529525e-05, + "loss": 0.0015, + "num_input_tokens_seen": 226561616, + "step": 104880 + }, + { + "epoch": 17.11011419249592, + "grad_norm": 0.002177638467401266, + "learning_rate": 6.226980051321973e-05, + "loss": 0.0006, + "num_input_tokens_seen": 226572240, + "step": 104885 + }, + { + "epoch": 17.110929853181077, + "grad_norm": 0.0035934222396463156, + "learning_rate": 6.223540446230202e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226583728, + "step": 104890 + }, + { + "epoch": 17.111745513866232, + "grad_norm": 0.00031812474480830133, + "learning_rate": 6.220101728323913e-05, + "loss": 0.0012, + "num_input_tokens_seen": 226593392, + "step": 104895 + }, + { + "epoch": 17.112561174551388, + "grad_norm": 0.08002560585737228, + "learning_rate": 6.216663897672803e-05, + "loss": 0.0038, + "num_input_tokens_seen": 226605040, + "step": 104900 + }, + { + "epoch": 17.11337683523654, + "grad_norm": 0.0005137875559739769, + "learning_rate": 6.213226954346546e-05, + "loss": 0.0949, + "num_input_tokens_seen": 226614992, + "step": 104905 + }, + { + "epoch": 17.114192495921696, + "grad_norm": 0.00699897576123476, + "learning_rate": 6.209790898414785e-05, + "loss": 0.0015, + "num_input_tokens_seen": 226624432, + "step": 104910 + }, + { + "epoch": 17.11500815660685, + "grad_norm": 0.0014027615543454885, + "learning_rate": 6.206355729947171e-05, + "loss": 0.0036, + "num_input_tokens_seen": 226634928, + "step": 104915 + }, + { + "epoch": 17.115823817292007, + "grad_norm": 0.038768503814935684, + "learning_rate": 6.20292144901331e-05, + "loss": 0.002, + "num_input_tokens_seen": 226646736, + "step": 104920 + }, + { + "epoch": 17.116639477977163, + "grad_norm": 0.09800397604703903, + "learning_rate": 6.199488055682806e-05, + "loss": 0.0085, + "num_input_tokens_seen": 226657040, + "step": 104925 + }, + { + "epoch": 17.117455138662315, + "grad_norm": 0.0012403081636875868, + "learning_rate": 6.196055550025243e-05, + "loss": 0.0076, + "num_input_tokens_seen": 226668112, + "step": 104930 + }, + { + "epoch": 17.11827079934747, + "grad_norm": 0.00031859471346251667, + "learning_rate": 6.192623932110187e-05, + "loss": 0.0014, + "num_input_tokens_seen": 226679152, + "step": 104935 + }, + { + "epoch": 17.119086460032626, + "grad_norm": 0.0007942294469103217, + "learning_rate": 6.189193202007176e-05, + "loss": 0.001, + "num_input_tokens_seen": 226690224, + "step": 104940 + }, + { + "epoch": 17.119902120717782, + "grad_norm": 0.0008386078989133239, + "learning_rate": 6.185763359785729e-05, + "loss": 0.0002, + "num_input_tokens_seen": 226701264, + "step": 104945 + }, + { + "epoch": 17.120717781402938, + "grad_norm": 0.013233168050646782, + "learning_rate": 6.182334405515399e-05, + "loss": 0.0021, + "num_input_tokens_seen": 226711600, + "step": 104950 + }, + { + "epoch": 17.12153344208809, + "grad_norm": 0.005685046315193176, + "learning_rate": 6.178906339265622e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226722672, + "step": 104955 + }, + { + "epoch": 17.122349102773246, + "grad_norm": 0.002959656063467264, + "learning_rate": 6.175479161105923e-05, + "loss": 0.0063, + "num_input_tokens_seen": 226734064, + "step": 104960 + }, + { + "epoch": 17.1231647634584, + "grad_norm": 0.14378587901592255, + "learning_rate": 6.17205287110571e-05, + "loss": 0.0028, + "num_input_tokens_seen": 226745392, + "step": 104965 + }, + { + "epoch": 17.123980424143557, + "grad_norm": 0.0013684089062735438, + "learning_rate": 6.16862746933447e-05, + "loss": 0.0456, + "num_input_tokens_seen": 226754512, + "step": 104970 + }, + { + "epoch": 17.124796084828713, + "grad_norm": 0.0015672908630222082, + "learning_rate": 6.165202955861577e-05, + "loss": 0.0004, + "num_input_tokens_seen": 226765328, + "step": 104975 + }, + { + "epoch": 17.125611745513865, + "grad_norm": 0.0004411570553202182, + "learning_rate": 6.161779330756473e-05, + "loss": 0.0005, + "num_input_tokens_seen": 226775888, + "step": 104980 + }, + { + "epoch": 17.12642740619902, + "grad_norm": 0.00609734607860446, + "learning_rate": 6.158356594088504e-05, + "loss": 0.0011, + "num_input_tokens_seen": 226787728, + "step": 104985 + }, + { + "epoch": 17.127243066884176, + "grad_norm": 0.0029845749959349632, + "learning_rate": 6.154934745927076e-05, + "loss": 0.0018, + "num_input_tokens_seen": 226798736, + "step": 104990 + }, + { + "epoch": 17.128058727569332, + "grad_norm": 0.021898532286286354, + "learning_rate": 6.151513786341495e-05, + "loss": 0.0025, + "num_input_tokens_seen": 226808880, + "step": 104995 + }, + { + "epoch": 17.128874388254488, + "grad_norm": 0.0029983953572809696, + "learning_rate": 6.148093715401138e-05, + "loss": 0.0043, + "num_input_tokens_seen": 226819696, + "step": 105000 + }, + { + "epoch": 17.12969004893964, + "grad_norm": 0.001115454942919314, + "learning_rate": 6.144674533175265e-05, + "loss": 0.0011, + "num_input_tokens_seen": 226830608, + "step": 105005 + }, + { + "epoch": 17.130505709624796, + "grad_norm": 0.0005920961848460138, + "learning_rate": 6.141256239733212e-05, + "loss": 0.0014, + "num_input_tokens_seen": 226841712, + "step": 105010 + }, + { + "epoch": 17.13132137030995, + "grad_norm": 0.00042530731298029423, + "learning_rate": 6.137838835144239e-05, + "loss": 0.0014, + "num_input_tokens_seen": 226852432, + "step": 105015 + }, + { + "epoch": 17.132137030995107, + "grad_norm": 0.001647250261157751, + "learning_rate": 6.1344223194776e-05, + "loss": 0.0005, + "num_input_tokens_seen": 226861552, + "step": 105020 + }, + { + "epoch": 17.13295269168026, + "grad_norm": 0.013181711547076702, + "learning_rate": 6.13100669280255e-05, + "loss": 0.0023, + "num_input_tokens_seen": 226873392, + "step": 105025 + }, + { + "epoch": 17.133768352365415, + "grad_norm": 0.0009910253575071692, + "learning_rate": 6.127591955188295e-05, + "loss": 0.0029, + "num_input_tokens_seen": 226884368, + "step": 105030 + }, + { + "epoch": 17.13458401305057, + "grad_norm": 0.009880750440061092, + "learning_rate": 6.124178106704042e-05, + "loss": 0.0017, + "num_input_tokens_seen": 226895088, + "step": 105035 + }, + { + "epoch": 17.135399673735726, + "grad_norm": 0.00189596030395478, + "learning_rate": 6.120765147418989e-05, + "loss": 0.0019, + "num_input_tokens_seen": 226905872, + "step": 105040 + }, + { + "epoch": 17.136215334420882, + "grad_norm": 0.037216730415821075, + "learning_rate": 6.117353077402288e-05, + "loss": 0.003, + "num_input_tokens_seen": 226917072, + "step": 105045 + }, + { + "epoch": 17.137030995106034, + "grad_norm": 0.006446500774472952, + "learning_rate": 6.113941896723097e-05, + "loss": 0.0048, + "num_input_tokens_seen": 226929584, + "step": 105050 + }, + { + "epoch": 17.13784665579119, + "grad_norm": 0.03469831869006157, + "learning_rate": 6.110531605450548e-05, + "loss": 0.0021, + "num_input_tokens_seen": 226941456, + "step": 105055 + }, + { + "epoch": 17.138662316476346, + "grad_norm": 0.14603827893733978, + "learning_rate": 6.107122203653742e-05, + "loss": 0.0047, + "num_input_tokens_seen": 226950672, + "step": 105060 + }, + { + "epoch": 17.1394779771615, + "grad_norm": 0.0005901344702579081, + "learning_rate": 6.103713691401813e-05, + "loss": 0.0036, + "num_input_tokens_seen": 226961008, + "step": 105065 + }, + { + "epoch": 17.140293637846657, + "grad_norm": 0.001494377851486206, + "learning_rate": 6.1003060687637836e-05, + "loss": 0.0052, + "num_input_tokens_seen": 226971632, + "step": 105070 + }, + { + "epoch": 17.14110929853181, + "grad_norm": 0.0025044973008334637, + "learning_rate": 6.09689933580877e-05, + "loss": 0.002, + "num_input_tokens_seen": 226982192, + "step": 105075 + }, + { + "epoch": 17.141924959216965, + "grad_norm": 0.0009858324192464352, + "learning_rate": 6.0934934926057616e-05, + "loss": 0.0199, + "num_input_tokens_seen": 226992752, + "step": 105080 + }, + { + "epoch": 17.14274061990212, + "grad_norm": 0.008440673351287842, + "learning_rate": 6.0900885392238316e-05, + "loss": 0.0011, + "num_input_tokens_seen": 227003152, + "step": 105085 + }, + { + "epoch": 17.143556280587276, + "grad_norm": 0.023581720888614655, + "learning_rate": 6.086684475731935e-05, + "loss": 0.0122, + "num_input_tokens_seen": 227014128, + "step": 105090 + }, + { + "epoch": 17.144371941272432, + "grad_norm": 7.596343040466309, + "learning_rate": 6.083281302199112e-05, + "loss": 0.0715, + "num_input_tokens_seen": 227024336, + "step": 105095 + }, + { + "epoch": 17.145187601957584, + "grad_norm": 0.0012364864815026522, + "learning_rate": 6.0798790186942784e-05, + "loss": 0.0035, + "num_input_tokens_seen": 227035632, + "step": 105100 + }, + { + "epoch": 17.14600326264274, + "grad_norm": 0.004918968304991722, + "learning_rate": 6.0764776252864365e-05, + "loss": 0.0023, + "num_input_tokens_seen": 227046288, + "step": 105105 + }, + { + "epoch": 17.146818923327896, + "grad_norm": 0.04475180432200432, + "learning_rate": 6.073077122044479e-05, + "loss": 0.1003, + "num_input_tokens_seen": 227056624, + "step": 105110 + }, + { + "epoch": 17.14763458401305, + "grad_norm": 0.0032711310777813196, + "learning_rate": 6.069677509037358e-05, + "loss": 0.0035, + "num_input_tokens_seen": 227067984, + "step": 105115 + }, + { + "epoch": 17.148450244698207, + "grad_norm": 0.004042398650199175, + "learning_rate": 6.066278786333928e-05, + "loss": 0.0006, + "num_input_tokens_seen": 227078800, + "step": 105120 + }, + { + "epoch": 17.14926590538336, + "grad_norm": 0.00016070179117377847, + "learning_rate": 6.062880954003114e-05, + "loss": 0.0012, + "num_input_tokens_seen": 227091120, + "step": 105125 + }, + { + "epoch": 17.150081566068515, + "grad_norm": 0.0017621108563616872, + "learning_rate": 6.059484012113736e-05, + "loss": 0.0025, + "num_input_tokens_seen": 227100752, + "step": 105130 + }, + { + "epoch": 17.15089722675367, + "grad_norm": 0.024079062044620514, + "learning_rate": 6.0560879607346795e-05, + "loss": 0.0089, + "num_input_tokens_seen": 227112176, + "step": 105135 + }, + { + "epoch": 17.151712887438826, + "grad_norm": 0.010822267271578312, + "learning_rate": 6.0526927999347224e-05, + "loss": 0.0036, + "num_input_tokens_seen": 227123504, + "step": 105140 + }, + { + "epoch": 17.152528548123982, + "grad_norm": 0.04497012123465538, + "learning_rate": 6.049298529782721e-05, + "loss": 0.0014, + "num_input_tokens_seen": 227133712, + "step": 105145 + }, + { + "epoch": 17.153344208809134, + "grad_norm": 0.017783276736736298, + "learning_rate": 6.045905150347419e-05, + "loss": 0.0007, + "num_input_tokens_seen": 227144816, + "step": 105150 + }, + { + "epoch": 17.15415986949429, + "grad_norm": 0.004039814695715904, + "learning_rate": 6.0425126616976186e-05, + "loss": 0.0032, + "num_input_tokens_seen": 227155152, + "step": 105155 + }, + { + "epoch": 17.154975530179446, + "grad_norm": 0.0007679007248952985, + "learning_rate": 6.039121063902064e-05, + "loss": 0.006, + "num_input_tokens_seen": 227165680, + "step": 105160 + }, + { + "epoch": 17.1557911908646, + "grad_norm": 0.0743962973356247, + "learning_rate": 6.03573035702949e-05, + "loss": 0.004, + "num_input_tokens_seen": 227177168, + "step": 105165 + }, + { + "epoch": 17.156606851549757, + "grad_norm": 0.005738751031458378, + "learning_rate": 6.032340541148612e-05, + "loss": 0.0047, + "num_input_tokens_seen": 227186576, + "step": 105170 + }, + { + "epoch": 17.15742251223491, + "grad_norm": 0.0028034579008817673, + "learning_rate": 6.0289516163281264e-05, + "loss": 0.0023, + "num_input_tokens_seen": 227197264, + "step": 105175 + }, + { + "epoch": 17.158238172920065, + "grad_norm": 0.06388754397630692, + "learning_rate": 6.025563582636723e-05, + "loss": 0.0022, + "num_input_tokens_seen": 227207984, + "step": 105180 + }, + { + "epoch": 17.15905383360522, + "grad_norm": 0.00031546890386380255, + "learning_rate": 6.0221764401430565e-05, + "loss": 0.0045, + "num_input_tokens_seen": 227219568, + "step": 105185 + }, + { + "epoch": 17.159869494290376, + "grad_norm": 0.0043876804411411285, + "learning_rate": 6.0187901889157735e-05, + "loss": 0.0056, + "num_input_tokens_seen": 227231056, + "step": 105190 + }, + { + "epoch": 17.160685154975532, + "grad_norm": 0.016384651884436607, + "learning_rate": 6.015404829023502e-05, + "loss": 0.001, + "num_input_tokens_seen": 227243184, + "step": 105195 + }, + { + "epoch": 17.161500815660684, + "grad_norm": 0.0020360194612294436, + "learning_rate": 6.012020360534853e-05, + "loss": 0.0006, + "num_input_tokens_seen": 227255024, + "step": 105200 + }, + { + "epoch": 17.16231647634584, + "grad_norm": 0.0018645500531420112, + "learning_rate": 6.008636783518401e-05, + "loss": 0.001, + "num_input_tokens_seen": 227266576, + "step": 105205 + }, + { + "epoch": 17.163132137030995, + "grad_norm": 0.017587218433618546, + "learning_rate": 6.005254098042751e-05, + "loss": 0.0515, + "num_input_tokens_seen": 227277072, + "step": 105210 + }, + { + "epoch": 17.16394779771615, + "grad_norm": 0.0003726345603354275, + "learning_rate": 6.00187230417642e-05, + "loss": 0.0028, + "num_input_tokens_seen": 227287280, + "step": 105215 + }, + { + "epoch": 17.164763458401303, + "grad_norm": 0.015266234055161476, + "learning_rate": 5.998491401987982e-05, + "loss": 0.0011, + "num_input_tokens_seen": 227299632, + "step": 105220 + }, + { + "epoch": 17.16557911908646, + "grad_norm": 0.0002786066324915737, + "learning_rate": 5.9951113915459154e-05, + "loss": 0.0005, + "num_input_tokens_seen": 227311024, + "step": 105225 + }, + { + "epoch": 17.166394779771615, + "grad_norm": 0.037656597793102264, + "learning_rate": 5.9917322729187594e-05, + "loss": 0.0017, + "num_input_tokens_seen": 227321424, + "step": 105230 + }, + { + "epoch": 17.16721044045677, + "grad_norm": 0.0023824572563171387, + "learning_rate": 5.9883540461749596e-05, + "loss": 0.1298, + "num_input_tokens_seen": 227330864, + "step": 105235 + }, + { + "epoch": 17.168026101141926, + "grad_norm": 0.01473549846559763, + "learning_rate": 5.984976711383017e-05, + "loss": 0.0052, + "num_input_tokens_seen": 227342640, + "step": 105240 + }, + { + "epoch": 17.16884176182708, + "grad_norm": 0.46272391080856323, + "learning_rate": 5.981600268611337e-05, + "loss": 0.0114, + "num_input_tokens_seen": 227354864, + "step": 105245 + }, + { + "epoch": 17.169657422512234, + "grad_norm": 0.014566629193723202, + "learning_rate": 5.9782247179283875e-05, + "loss": 0.0105, + "num_input_tokens_seen": 227366000, + "step": 105250 + }, + { + "epoch": 17.17047308319739, + "grad_norm": 0.0011301173362880945, + "learning_rate": 5.9748500594025425e-05, + "loss": 0.0083, + "num_input_tokens_seen": 227375696, + "step": 105255 + }, + { + "epoch": 17.171288743882545, + "grad_norm": 0.01436126884073019, + "learning_rate": 5.971476293102229e-05, + "loss": 0.0459, + "num_input_tokens_seen": 227385936, + "step": 105260 + }, + { + "epoch": 17.1721044045677, + "grad_norm": 0.016061117872595787, + "learning_rate": 5.9681034190957886e-05, + "loss": 0.0011, + "num_input_tokens_seen": 227396720, + "step": 105265 + }, + { + "epoch": 17.172920065252853, + "grad_norm": 0.0018371932674199343, + "learning_rate": 5.964731437451593e-05, + "loss": 0.0013, + "num_input_tokens_seen": 227407472, + "step": 105270 + }, + { + "epoch": 17.17373572593801, + "grad_norm": 0.0007460727938450873, + "learning_rate": 5.961360348237982e-05, + "loss": 0.0033, + "num_input_tokens_seen": 227417616, + "step": 105275 + }, + { + "epoch": 17.174551386623165, + "grad_norm": 0.007591134402900934, + "learning_rate": 5.9579901515232684e-05, + "loss": 0.0052, + "num_input_tokens_seen": 227428816, + "step": 105280 + }, + { + "epoch": 17.17536704730832, + "grad_norm": 0.022664356976747513, + "learning_rate": 5.954620847375758e-05, + "loss": 0.0025, + "num_input_tokens_seen": 227438160, + "step": 105285 + }, + { + "epoch": 17.176182707993476, + "grad_norm": 0.004871509037911892, + "learning_rate": 5.9512524358637296e-05, + "loss": 0.001, + "num_input_tokens_seen": 227448144, + "step": 105290 + }, + { + "epoch": 17.17699836867863, + "grad_norm": 0.017250265926122665, + "learning_rate": 5.9478849170554513e-05, + "loss": 0.0011, + "num_input_tokens_seen": 227459856, + "step": 105295 + }, + { + "epoch": 17.177814029363784, + "grad_norm": 0.0011680923635140061, + "learning_rate": 5.944518291019168e-05, + "loss": 0.001, + "num_input_tokens_seen": 227470896, + "step": 105300 + }, + { + "epoch": 17.17862969004894, + "grad_norm": 9.958234295481816e-05, + "learning_rate": 5.9411525578231094e-05, + "loss": 0.0013, + "num_input_tokens_seen": 227481328, + "step": 105305 + }, + { + "epoch": 17.179445350734095, + "grad_norm": 0.006742374040186405, + "learning_rate": 5.9377877175354865e-05, + "loss": 0.0023, + "num_input_tokens_seen": 227493104, + "step": 105310 + }, + { + "epoch": 17.18026101141925, + "grad_norm": 0.027583172544836998, + "learning_rate": 5.934423770224495e-05, + "loss": 0.0021, + "num_input_tokens_seen": 227504400, + "step": 105315 + }, + { + "epoch": 17.181076672104403, + "grad_norm": 0.0003342593845445663, + "learning_rate": 5.931060715958309e-05, + "loss": 0.0065, + "num_input_tokens_seen": 227516272, + "step": 105320 + }, + { + "epoch": 17.18189233278956, + "grad_norm": 0.0027394210919737816, + "learning_rate": 5.9276985548050775e-05, + "loss": 0.0023, + "num_input_tokens_seen": 227526704, + "step": 105325 + }, + { + "epoch": 17.182707993474715, + "grad_norm": 0.008948412723839283, + "learning_rate": 5.924337286832948e-05, + "loss": 0.0009, + "num_input_tokens_seen": 227537520, + "step": 105330 + }, + { + "epoch": 17.18352365415987, + "grad_norm": 0.0016631630714982748, + "learning_rate": 5.9209769121100374e-05, + "loss": 0.0023, + "num_input_tokens_seen": 227548688, + "step": 105335 + }, + { + "epoch": 17.184339314845026, + "grad_norm": 0.0002861691755242646, + "learning_rate": 5.917617430704447e-05, + "loss": 0.0008, + "num_input_tokens_seen": 227558320, + "step": 105340 + }, + { + "epoch": 17.18515497553018, + "grad_norm": 0.00043132706196047366, + "learning_rate": 5.9142588426842615e-05, + "loss": 0.0028, + "num_input_tokens_seen": 227569392, + "step": 105345 + }, + { + "epoch": 17.185970636215334, + "grad_norm": 0.0013702671276405454, + "learning_rate": 5.9109011481175364e-05, + "loss": 0.0183, + "num_input_tokens_seen": 227580976, + "step": 105350 + }, + { + "epoch": 17.18678629690049, + "grad_norm": 0.009048002772033215, + "learning_rate": 5.907544347072352e-05, + "loss": 0.0019, + "num_input_tokens_seen": 227591216, + "step": 105355 + }, + { + "epoch": 17.187601957585645, + "grad_norm": 0.009810620918869972, + "learning_rate": 5.904188439616692e-05, + "loss": 0.001, + "num_input_tokens_seen": 227602288, + "step": 105360 + }, + { + "epoch": 17.1884176182708, + "grad_norm": 0.7481173276901245, + "learning_rate": 5.9008334258186195e-05, + "loss": 0.062, + "num_input_tokens_seen": 227614064, + "step": 105365 + }, + { + "epoch": 17.189233278955953, + "grad_norm": 0.007403769996017218, + "learning_rate": 5.897479305746079e-05, + "loss": 0.0053, + "num_input_tokens_seen": 227624432, + "step": 105370 + }, + { + "epoch": 17.19004893964111, + "grad_norm": 0.001949156867340207, + "learning_rate": 5.894126079467077e-05, + "loss": 0.0016, + "num_input_tokens_seen": 227635792, + "step": 105375 + }, + { + "epoch": 17.190864600326265, + "grad_norm": 0.0001903936208691448, + "learning_rate": 5.890773747049566e-05, + "loss": 0.0003, + "num_input_tokens_seen": 227645552, + "step": 105380 + }, + { + "epoch": 17.19168026101142, + "grad_norm": 0.019825341179966927, + "learning_rate": 5.88742230856148e-05, + "loss": 0.0017, + "num_input_tokens_seen": 227656304, + "step": 105385 + }, + { + "epoch": 17.192495921696572, + "grad_norm": 0.010199088603258133, + "learning_rate": 5.884071764070736e-05, + "loss": 0.0013, + "num_input_tokens_seen": 227666416, + "step": 105390 + }, + { + "epoch": 17.193311582381728, + "grad_norm": 0.007535295560956001, + "learning_rate": 5.880722113645248e-05, + "loss": 0.0056, + "num_input_tokens_seen": 227677424, + "step": 105395 + }, + { + "epoch": 17.194127243066884, + "grad_norm": 0.005390184931457043, + "learning_rate": 5.877373357352894e-05, + "loss": 0.0006, + "num_input_tokens_seen": 227688784, + "step": 105400 + }, + { + "epoch": 17.19494290375204, + "grad_norm": 0.0014657375868409872, + "learning_rate": 5.874025495261548e-05, + "loss": 0.0003, + "num_input_tokens_seen": 227698800, + "step": 105405 + }, + { + "epoch": 17.195758564437195, + "grad_norm": 0.0014141921419650316, + "learning_rate": 5.870678527439049e-05, + "loss": 0.0042, + "num_input_tokens_seen": 227708688, + "step": 105410 + }, + { + "epoch": 17.196574225122347, + "grad_norm": 0.0031896489672362804, + "learning_rate": 5.867332453953228e-05, + "loss": 0.1541, + "num_input_tokens_seen": 227719120, + "step": 105415 + }, + { + "epoch": 17.197389885807503, + "grad_norm": 0.5455310940742493, + "learning_rate": 5.863987274871907e-05, + "loss": 0.0074, + "num_input_tokens_seen": 227730128, + "step": 105420 + }, + { + "epoch": 17.19820554649266, + "grad_norm": 0.0009574625873938203, + "learning_rate": 5.860642990262871e-05, + "loss": 0.003, + "num_input_tokens_seen": 227742608, + "step": 105425 + }, + { + "epoch": 17.199021207177815, + "grad_norm": 0.010655845515429974, + "learning_rate": 5.857299600193899e-05, + "loss": 0.0007, + "num_input_tokens_seen": 227753680, + "step": 105430 + }, + { + "epoch": 17.19983686786297, + "grad_norm": 0.0006226776167750359, + "learning_rate": 5.853957104732749e-05, + "loss": 0.0049, + "num_input_tokens_seen": 227764592, + "step": 105435 + }, + { + "epoch": 17.200652528548122, + "grad_norm": 0.011091392487287521, + "learning_rate": 5.850615503947166e-05, + "loss": 0.0009, + "num_input_tokens_seen": 227776048, + "step": 105440 + }, + { + "epoch": 17.201468189233278, + "grad_norm": 0.00044521092786453664, + "learning_rate": 5.8472747979048665e-05, + "loss": 0.0007, + "num_input_tokens_seen": 227787024, + "step": 105445 + }, + { + "epoch": 17.202283849918434, + "grad_norm": 0.2557276785373688, + "learning_rate": 5.843934986673549e-05, + "loss": 0.0067, + "num_input_tokens_seen": 227799056, + "step": 105450 + }, + { + "epoch": 17.20309951060359, + "grad_norm": 0.0029599005356431007, + "learning_rate": 5.840596070320914e-05, + "loss": 0.0052, + "num_input_tokens_seen": 227809296, + "step": 105455 + }, + { + "epoch": 17.203915171288745, + "grad_norm": 0.00032506947172805667, + "learning_rate": 5.837258048914612e-05, + "loss": 0.0046, + "num_input_tokens_seen": 227820624, + "step": 105460 + }, + { + "epoch": 17.204730831973897, + "grad_norm": 0.00039597388240508735, + "learning_rate": 5.833920922522301e-05, + "loss": 0.0019, + "num_input_tokens_seen": 227830640, + "step": 105465 + }, + { + "epoch": 17.205546492659053, + "grad_norm": 0.0009330728207714856, + "learning_rate": 5.830584691211615e-05, + "loss": 0.0005, + "num_input_tokens_seen": 227841904, + "step": 105470 + }, + { + "epoch": 17.20636215334421, + "grad_norm": 0.0004881395725533366, + "learning_rate": 5.827249355050163e-05, + "loss": 0.0017, + "num_input_tokens_seen": 227851536, + "step": 105475 + }, + { + "epoch": 17.207177814029365, + "grad_norm": 0.0007340696756727993, + "learning_rate": 5.823914914105527e-05, + "loss": 0.0005, + "num_input_tokens_seen": 227862288, + "step": 105480 + }, + { + "epoch": 17.20799347471452, + "grad_norm": 0.008161459118127823, + "learning_rate": 5.820581368445316e-05, + "loss": 0.0044, + "num_input_tokens_seen": 227872720, + "step": 105485 + }, + { + "epoch": 17.208809135399672, + "grad_norm": 0.008229502476751804, + "learning_rate": 5.817248718137053e-05, + "loss": 0.0008, + "num_input_tokens_seen": 227884336, + "step": 105490 + }, + { + "epoch": 17.209624796084828, + "grad_norm": 0.0020968979224562645, + "learning_rate": 5.8139169632483e-05, + "loss": 0.0005, + "num_input_tokens_seen": 227895248, + "step": 105495 + }, + { + "epoch": 17.210440456769984, + "grad_norm": 0.0001975457853404805, + "learning_rate": 5.810586103846577e-05, + "loss": 0.0035, + "num_input_tokens_seen": 227906064, + "step": 105500 + }, + { + "epoch": 17.21125611745514, + "grad_norm": 0.0006039740983396769, + "learning_rate": 5.807256139999384e-05, + "loss": 0.0008, + "num_input_tokens_seen": 227917552, + "step": 105505 + }, + { + "epoch": 17.212071778140295, + "grad_norm": 0.0028502692002803087, + "learning_rate": 5.8039270717742065e-05, + "loss": 0.0009, + "num_input_tokens_seen": 227927792, + "step": 105510 + }, + { + "epoch": 17.212887438825447, + "grad_norm": 0.0009251784649677575, + "learning_rate": 5.8005988992385184e-05, + "loss": 0.0043, + "num_input_tokens_seen": 227938800, + "step": 105515 + }, + { + "epoch": 17.213703099510603, + "grad_norm": 0.011867698282003403, + "learning_rate": 5.79727162245976e-05, + "loss": 0.0007, + "num_input_tokens_seen": 227950160, + "step": 105520 + }, + { + "epoch": 17.21451876019576, + "grad_norm": 0.00040899330633692443, + "learning_rate": 5.7939452415053664e-05, + "loss": 0.0009, + "num_input_tokens_seen": 227961904, + "step": 105525 + }, + { + "epoch": 17.215334420880914, + "grad_norm": 0.0014331662096083164, + "learning_rate": 5.7906197564427557e-05, + "loss": 0.0168, + "num_input_tokens_seen": 227972944, + "step": 105530 + }, + { + "epoch": 17.21615008156607, + "grad_norm": 0.024769123643636703, + "learning_rate": 5.7872951673393184e-05, + "loss": 0.0035, + "num_input_tokens_seen": 227982800, + "step": 105535 + }, + { + "epoch": 17.216965742251222, + "grad_norm": 0.0008173759561032057, + "learning_rate": 5.7839714742624284e-05, + "loss": 0.0008, + "num_input_tokens_seen": 227993104, + "step": 105540 + }, + { + "epoch": 17.217781402936378, + "grad_norm": 0.0005532324430532753, + "learning_rate": 5.780648677279454e-05, + "loss": 0.0015, + "num_input_tokens_seen": 228004208, + "step": 105545 + }, + { + "epoch": 17.218597063621534, + "grad_norm": 0.008099708706140518, + "learning_rate": 5.777326776457725e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228014864, + "step": 105550 + }, + { + "epoch": 17.21941272430669, + "grad_norm": 0.011459157802164555, + "learning_rate": 5.774005771864571e-05, + "loss": 0.0009, + "num_input_tokens_seen": 228026480, + "step": 105555 + }, + { + "epoch": 17.22022838499184, + "grad_norm": 0.0037186089903116226, + "learning_rate": 5.7706856635672986e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228037328, + "step": 105560 + }, + { + "epoch": 17.221044045676997, + "grad_norm": 0.08368998020887375, + "learning_rate": 5.767366451633188e-05, + "loss": 0.0018, + "num_input_tokens_seen": 228048944, + "step": 105565 + }, + { + "epoch": 17.221859706362153, + "grad_norm": 0.0037311904598027468, + "learning_rate": 5.764048136129507e-05, + "loss": 0.0009, + "num_input_tokens_seen": 228058832, + "step": 105570 + }, + { + "epoch": 17.22267536704731, + "grad_norm": 0.003354444168508053, + "learning_rate": 5.760730717123508e-05, + "loss": 0.0008, + "num_input_tokens_seen": 228069424, + "step": 105575 + }, + { + "epoch": 17.223491027732464, + "grad_norm": 0.00479935435578227, + "learning_rate": 5.757414194682426e-05, + "loss": 0.026, + "num_input_tokens_seen": 228080464, + "step": 105580 + }, + { + "epoch": 17.224306688417617, + "grad_norm": 0.013407070189714432, + "learning_rate": 5.754098568873456e-05, + "loss": 0.0021, + "num_input_tokens_seen": 228091696, + "step": 105585 + }, + { + "epoch": 17.225122349102772, + "grad_norm": 0.511169970035553, + "learning_rate": 5.7507838397638346e-05, + "loss": 0.0432, + "num_input_tokens_seen": 228102800, + "step": 105590 + }, + { + "epoch": 17.225938009787928, + "grad_norm": 0.0041065155528485775, + "learning_rate": 5.7474700074206856e-05, + "loss": 0.0055, + "num_input_tokens_seen": 228113744, + "step": 105595 + }, + { + "epoch": 17.226753670473084, + "grad_norm": 0.004399343393743038, + "learning_rate": 5.7441570719112216e-05, + "loss": 0.0104, + "num_input_tokens_seen": 228124336, + "step": 105600 + }, + { + "epoch": 17.22756933115824, + "grad_norm": 0.018818650394678116, + "learning_rate": 5.740845033302533e-05, + "loss": 0.0077, + "num_input_tokens_seen": 228134576, + "step": 105605 + }, + { + "epoch": 17.22838499184339, + "grad_norm": 0.06135449558496475, + "learning_rate": 5.737533891661789e-05, + "loss": 0.0034, + "num_input_tokens_seen": 228145200, + "step": 105610 + }, + { + "epoch": 17.229200652528547, + "grad_norm": 0.025219673290848732, + "learning_rate": 5.734223647056053e-05, + "loss": 0.0023, + "num_input_tokens_seen": 228156048, + "step": 105615 + }, + { + "epoch": 17.230016313213703, + "grad_norm": 0.023373369127511978, + "learning_rate": 5.7309142995524475e-05, + "loss": 0.0034, + "num_input_tokens_seen": 228165808, + "step": 105620 + }, + { + "epoch": 17.23083197389886, + "grad_norm": 0.08417593687772751, + "learning_rate": 5.7276058492179984e-05, + "loss": 0.0059, + "num_input_tokens_seen": 228176592, + "step": 105625 + }, + { + "epoch": 17.231647634584014, + "grad_norm": 0.01127390656620264, + "learning_rate": 5.724298296119796e-05, + "loss": 0.0018, + "num_input_tokens_seen": 228187696, + "step": 105630 + }, + { + "epoch": 17.232463295269167, + "grad_norm": 0.03395693004131317, + "learning_rate": 5.7209916403248574e-05, + "loss": 0.0029, + "num_input_tokens_seen": 228198224, + "step": 105635 + }, + { + "epoch": 17.233278955954322, + "grad_norm": 0.005188298411667347, + "learning_rate": 5.717685881900192e-05, + "loss": 0.0085, + "num_input_tokens_seen": 228209040, + "step": 105640 + }, + { + "epoch": 17.234094616639478, + "grad_norm": 0.0007153578335419297, + "learning_rate": 5.714381020912801e-05, + "loss": 0.0043, + "num_input_tokens_seen": 228219984, + "step": 105645 + }, + { + "epoch": 17.234910277324634, + "grad_norm": 0.0007897784234955907, + "learning_rate": 5.711077057429659e-05, + "loss": 0.0018, + "num_input_tokens_seen": 228230544, + "step": 105650 + }, + { + "epoch": 17.23572593800979, + "grad_norm": 0.005037500057369471, + "learning_rate": 5.7077739915177226e-05, + "loss": 0.0008, + "num_input_tokens_seen": 228240880, + "step": 105655 + }, + { + "epoch": 17.23654159869494, + "grad_norm": 0.015429419465363026, + "learning_rate": 5.704471823243934e-05, + "loss": 0.0014, + "num_input_tokens_seen": 228250384, + "step": 105660 + }, + { + "epoch": 17.237357259380097, + "grad_norm": 0.001333926455117762, + "learning_rate": 5.701170552675217e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228261968, + "step": 105665 + }, + { + "epoch": 17.238172920065253, + "grad_norm": 0.008555333130061626, + "learning_rate": 5.6978701798784785e-05, + "loss": 0.0025, + "num_input_tokens_seen": 228271664, + "step": 105670 + }, + { + "epoch": 17.23898858075041, + "grad_norm": 0.1655566245317459, + "learning_rate": 5.6945707049205985e-05, + "loss": 0.0066, + "num_input_tokens_seen": 228282704, + "step": 105675 + }, + { + "epoch": 17.239804241435564, + "grad_norm": 0.0028315861709415913, + "learning_rate": 5.691272127868452e-05, + "loss": 0.0021, + "num_input_tokens_seen": 228294032, + "step": 105680 + }, + { + "epoch": 17.240619902120716, + "grad_norm": 0.007239778526127338, + "learning_rate": 5.6879744487888854e-05, + "loss": 0.0013, + "num_input_tokens_seen": 228305648, + "step": 105685 + }, + { + "epoch": 17.241435562805872, + "grad_norm": 0.00043697163346223533, + "learning_rate": 5.684677667748717e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228316304, + "step": 105690 + }, + { + "epoch": 17.242251223491028, + "grad_norm": 0.0034264670684933662, + "learning_rate": 5.681381784814799e-05, + "loss": 0.0017, + "num_input_tokens_seen": 228326608, + "step": 105695 + }, + { + "epoch": 17.243066884176184, + "grad_norm": 0.0002579323190730065, + "learning_rate": 5.678086800053878e-05, + "loss": 0.002, + "num_input_tokens_seen": 228337168, + "step": 105700 + }, + { + "epoch": 17.24388254486134, + "grad_norm": 0.003010801738128066, + "learning_rate": 5.674792713532772e-05, + "loss": 0.0018, + "num_input_tokens_seen": 228347632, + "step": 105705 + }, + { + "epoch": 17.24469820554649, + "grad_norm": 0.0034991370048373938, + "learning_rate": 5.671499525318208e-05, + "loss": 0.0256, + "num_input_tokens_seen": 228359408, + "step": 105710 + }, + { + "epoch": 17.245513866231647, + "grad_norm": 0.0008481157710775733, + "learning_rate": 5.668207235476957e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228370768, + "step": 105715 + }, + { + "epoch": 17.246329526916803, + "grad_norm": 0.004653229843825102, + "learning_rate": 5.664915844075702e-05, + "loss": 0.004, + "num_input_tokens_seen": 228382000, + "step": 105720 + }, + { + "epoch": 17.24714518760196, + "grad_norm": 0.0016604745760560036, + "learning_rate": 5.6616253511811934e-05, + "loss": 0.0016, + "num_input_tokens_seen": 228392688, + "step": 105725 + }, + { + "epoch": 17.247960848287114, + "grad_norm": 0.000807464646641165, + "learning_rate": 5.6583357568600776e-05, + "loss": 0.0008, + "num_input_tokens_seen": 228404240, + "step": 105730 + }, + { + "epoch": 17.248776508972266, + "grad_norm": 0.004214088898152113, + "learning_rate": 5.6550470611790584e-05, + "loss": 0.0027, + "num_input_tokens_seen": 228414992, + "step": 105735 + }, + { + "epoch": 17.249592169657422, + "grad_norm": 0.002003757981583476, + "learning_rate": 5.6517592642047424e-05, + "loss": 0.0013, + "num_input_tokens_seen": 228426224, + "step": 105740 + }, + { + "epoch": 17.250407830342578, + "grad_norm": 0.00016738157137297094, + "learning_rate": 5.648472366003804e-05, + "loss": 0.0055, + "num_input_tokens_seen": 228437552, + "step": 105745 + }, + { + "epoch": 17.251223491027734, + "grad_norm": 0.04558353126049042, + "learning_rate": 5.6451863666428236e-05, + "loss": 0.0024, + "num_input_tokens_seen": 228449072, + "step": 105750 + }, + { + "epoch": 17.252039151712886, + "grad_norm": 0.0008563185692764819, + "learning_rate": 5.6419012661884206e-05, + "loss": 0.0007, + "num_input_tokens_seen": 228460240, + "step": 105755 + }, + { + "epoch": 17.25285481239804, + "grad_norm": 0.0002939916157629341, + "learning_rate": 5.6386170647071464e-05, + "loss": 0.0151, + "num_input_tokens_seen": 228471088, + "step": 105760 + }, + { + "epoch": 17.253670473083197, + "grad_norm": 0.005507839843630791, + "learning_rate": 5.6353337622655935e-05, + "loss": 0.0009, + "num_input_tokens_seen": 228482224, + "step": 105765 + }, + { + "epoch": 17.254486133768353, + "grad_norm": 0.0018616120796650648, + "learning_rate": 5.632051358930263e-05, + "loss": 0.054, + "num_input_tokens_seen": 228493104, + "step": 105770 + }, + { + "epoch": 17.25530179445351, + "grad_norm": 0.06906536966562271, + "learning_rate": 5.628769854767707e-05, + "loss": 0.0026, + "num_input_tokens_seen": 228502800, + "step": 105775 + }, + { + "epoch": 17.25611745513866, + "grad_norm": 0.0015966150676831603, + "learning_rate": 5.6254892498444175e-05, + "loss": 0.0018, + "num_input_tokens_seen": 228511952, + "step": 105780 + }, + { + "epoch": 17.256933115823816, + "grad_norm": 0.000293926423182711, + "learning_rate": 5.6222095442268805e-05, + "loss": 0.0052, + "num_input_tokens_seen": 228522576, + "step": 105785 + }, + { + "epoch": 17.257748776508972, + "grad_norm": 0.006921887863427401, + "learning_rate": 5.6189307379815645e-05, + "loss": 0.0011, + "num_input_tokens_seen": 228532336, + "step": 105790 + }, + { + "epoch": 17.258564437194128, + "grad_norm": 0.0023210467770695686, + "learning_rate": 5.615652831174917e-05, + "loss": 0.0009, + "num_input_tokens_seen": 228544304, + "step": 105795 + }, + { + "epoch": 17.259380097879284, + "grad_norm": 0.004994812421500683, + "learning_rate": 5.612375823873373e-05, + "loss": 0.0014, + "num_input_tokens_seen": 228555664, + "step": 105800 + }, + { + "epoch": 17.260195758564436, + "grad_norm": 0.004979619290679693, + "learning_rate": 5.60909971614334e-05, + "loss": 0.0012, + "num_input_tokens_seen": 228565840, + "step": 105805 + }, + { + "epoch": 17.26101141924959, + "grad_norm": 0.004859395790845156, + "learning_rate": 5.605824508051216e-05, + "loss": 0.001, + "num_input_tokens_seen": 228577936, + "step": 105810 + }, + { + "epoch": 17.261827079934747, + "grad_norm": 0.21579575538635254, + "learning_rate": 5.602550199663381e-05, + "loss": 0.0092, + "num_input_tokens_seen": 228587760, + "step": 105815 + }, + { + "epoch": 17.262642740619903, + "grad_norm": 0.0003847281914204359, + "learning_rate": 5.599276791046182e-05, + "loss": 0.0007, + "num_input_tokens_seen": 228598192, + "step": 105820 + }, + { + "epoch": 17.26345840130506, + "grad_norm": 0.003210867289453745, + "learning_rate": 5.5960042822659596e-05, + "loss": 0.0025, + "num_input_tokens_seen": 228609744, + "step": 105825 + }, + { + "epoch": 17.26427406199021, + "grad_norm": 0.0023815941531211138, + "learning_rate": 5.592732673389056e-05, + "loss": 0.0004, + "num_input_tokens_seen": 228620816, + "step": 105830 + }, + { + "epoch": 17.265089722675366, + "grad_norm": 0.005627058446407318, + "learning_rate": 5.5894619644817455e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228631088, + "step": 105835 + }, + { + "epoch": 17.265905383360522, + "grad_norm": 0.045385442674160004, + "learning_rate": 5.586192155610342e-05, + "loss": 0.0016, + "num_input_tokens_seen": 228640368, + "step": 105840 + }, + { + "epoch": 17.266721044045678, + "grad_norm": 0.01856350153684616, + "learning_rate": 5.582923246841082e-05, + "loss": 0.0016, + "num_input_tokens_seen": 228651024, + "step": 105845 + }, + { + "epoch": 17.267536704730833, + "grad_norm": 0.01033297274261713, + "learning_rate": 5.5796552382402446e-05, + "loss": 0.0014, + "num_input_tokens_seen": 228661680, + "step": 105850 + }, + { + "epoch": 17.268352365415986, + "grad_norm": 0.0004607291193678975, + "learning_rate": 5.576388129874027e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228671664, + "step": 105855 + }, + { + "epoch": 17.26916802610114, + "grad_norm": 0.010909834876656532, + "learning_rate": 5.5731219218086824e-05, + "loss": 0.017, + "num_input_tokens_seen": 228681968, + "step": 105860 + }, + { + "epoch": 17.269983686786297, + "grad_norm": 0.0018032594816759229, + "learning_rate": 5.569856614110358e-05, + "loss": 0.002, + "num_input_tokens_seen": 228693616, + "step": 105865 + }, + { + "epoch": 17.270799347471453, + "grad_norm": 0.0004068401758559048, + "learning_rate": 5.566592206845272e-05, + "loss": 0.0012, + "num_input_tokens_seen": 228704400, + "step": 105870 + }, + { + "epoch": 17.27161500815661, + "grad_norm": 0.025432869791984558, + "learning_rate": 5.563328700079545e-05, + "loss": 0.0035, + "num_input_tokens_seen": 228714704, + "step": 105875 + }, + { + "epoch": 17.27243066884176, + "grad_norm": 0.003037866437807679, + "learning_rate": 5.560066093879351e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228724656, + "step": 105880 + }, + { + "epoch": 17.273246329526916, + "grad_norm": 0.00031391988159157336, + "learning_rate": 5.556804388310777e-05, + "loss": 0.0005, + "num_input_tokens_seen": 228735280, + "step": 105885 + }, + { + "epoch": 17.274061990212072, + "grad_norm": 0.0008085937006399035, + "learning_rate": 5.5535435834399626e-05, + "loss": 0.0047, + "num_input_tokens_seen": 228746288, + "step": 105890 + }, + { + "epoch": 17.274877650897228, + "grad_norm": 0.0009067684295587242, + "learning_rate": 5.550283679332951e-05, + "loss": 0.0014, + "num_input_tokens_seen": 228757200, + "step": 105895 + }, + { + "epoch": 17.275693311582383, + "grad_norm": 0.0005091895000077784, + "learning_rate": 5.5470246760558455e-05, + "loss": 0.0009, + "num_input_tokens_seen": 228768720, + "step": 105900 + }, + { + "epoch": 17.276508972267536, + "grad_norm": 0.00601581484079361, + "learning_rate": 5.543766573674663e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228778640, + "step": 105905 + }, + { + "epoch": 17.27732463295269, + "grad_norm": 0.00027671127463690937, + "learning_rate": 5.5405093722554534e-05, + "loss": 0.0483, + "num_input_tokens_seen": 228790256, + "step": 105910 + }, + { + "epoch": 17.278140293637847, + "grad_norm": 0.08887345343828201, + "learning_rate": 5.5372530718642235e-05, + "loss": 0.0021, + "num_input_tokens_seen": 228801232, + "step": 105915 + }, + { + "epoch": 17.278955954323003, + "grad_norm": 0.006647658068686724, + "learning_rate": 5.533997672566965e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228812528, + "step": 105920 + }, + { + "epoch": 17.27977161500816, + "grad_norm": 0.047590646892786026, + "learning_rate": 5.5307431744296534e-05, + "loss": 0.006, + "num_input_tokens_seen": 228822800, + "step": 105925 + }, + { + "epoch": 17.28058727569331, + "grad_norm": 0.002654826734215021, + "learning_rate": 5.5274895775182464e-05, + "loss": 0.0044, + "num_input_tokens_seen": 228833584, + "step": 105930 + }, + { + "epoch": 17.281402936378466, + "grad_norm": 0.004782018251717091, + "learning_rate": 5.524236881898681e-05, + "loss": 0.0032, + "num_input_tokens_seen": 228843984, + "step": 105935 + }, + { + "epoch": 17.282218597063622, + "grad_norm": 0.009541511535644531, + "learning_rate": 5.5209850876368705e-05, + "loss": 0.0007, + "num_input_tokens_seen": 228855824, + "step": 105940 + }, + { + "epoch": 17.283034257748778, + "grad_norm": 0.003033567452803254, + "learning_rate": 5.517734194798729e-05, + "loss": 0.0005, + "num_input_tokens_seen": 228866384, + "step": 105945 + }, + { + "epoch": 17.28384991843393, + "grad_norm": 0.017595946788787842, + "learning_rate": 5.514484203450132e-05, + "loss": 0.0296, + "num_input_tokens_seen": 228878000, + "step": 105950 + }, + { + "epoch": 17.284665579119086, + "grad_norm": 0.00025085004745051265, + "learning_rate": 5.511235113656943e-05, + "loss": 0.0023, + "num_input_tokens_seen": 228888240, + "step": 105955 + }, + { + "epoch": 17.28548123980424, + "grad_norm": 0.0006416584365069866, + "learning_rate": 5.50798692548502e-05, + "loss": 0.0134, + "num_input_tokens_seen": 228899920, + "step": 105960 + }, + { + "epoch": 17.286296900489397, + "grad_norm": 0.004831426776945591, + "learning_rate": 5.504739639000178e-05, + "loss": 0.0011, + "num_input_tokens_seen": 228911760, + "step": 105965 + }, + { + "epoch": 17.287112561174553, + "grad_norm": 0.0004655012162402272, + "learning_rate": 5.501493254268225e-05, + "loss": 0.0026, + "num_input_tokens_seen": 228922192, + "step": 105970 + }, + { + "epoch": 17.287928221859705, + "grad_norm": 0.0009831018978729844, + "learning_rate": 5.4982477713549806e-05, + "loss": 0.0009, + "num_input_tokens_seen": 228932848, + "step": 105975 + }, + { + "epoch": 17.28874388254486, + "grad_norm": 0.002096477197483182, + "learning_rate": 5.495003190326181e-05, + "loss": 0.0022, + "num_input_tokens_seen": 228944560, + "step": 105980 + }, + { + "epoch": 17.289559543230016, + "grad_norm": 0.0002526229072827846, + "learning_rate": 5.491759511247618e-05, + "loss": 0.0034, + "num_input_tokens_seen": 228955824, + "step": 105985 + }, + { + "epoch": 17.290375203915172, + "grad_norm": 0.0006310658063739538, + "learning_rate": 5.488516734184995e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228966672, + "step": 105990 + }, + { + "epoch": 17.291190864600328, + "grad_norm": 0.01685495860874653, + "learning_rate": 5.485274859204065e-05, + "loss": 0.0009, + "num_input_tokens_seen": 228978224, + "step": 105995 + }, + { + "epoch": 17.29200652528548, + "grad_norm": 0.013227180577814579, + "learning_rate": 5.482033886370491e-05, + "loss": 0.0009, + "num_input_tokens_seen": 228988880, + "step": 106000 + }, + { + "epoch": 17.292822185970635, + "grad_norm": 0.05939861759543419, + "learning_rate": 5.478793815749994e-05, + "loss": 0.0024, + "num_input_tokens_seen": 229000944, + "step": 106005 + }, + { + "epoch": 17.29363784665579, + "grad_norm": 0.011226335540413857, + "learning_rate": 5.4755546474082044e-05, + "loss": 0.0013, + "num_input_tokens_seen": 229011472, + "step": 106010 + }, + { + "epoch": 17.294453507340947, + "grad_norm": 0.00020121457055211067, + "learning_rate": 5.472316381410786e-05, + "loss": 0.0002, + "num_input_tokens_seen": 229022032, + "step": 106015 + }, + { + "epoch": 17.295269168026103, + "grad_norm": 0.002397694159299135, + "learning_rate": 5.46907901782337e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229033328, + "step": 106020 + }, + { + "epoch": 17.296084828711255, + "grad_norm": 0.002774233929812908, + "learning_rate": 5.4658425567115535e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229044656, + "step": 106025 + }, + { + "epoch": 17.29690048939641, + "grad_norm": 0.0003554042486939579, + "learning_rate": 5.4626069981409395e-05, + "loss": 0.0039, + "num_input_tokens_seen": 229056656, + "step": 106030 + }, + { + "epoch": 17.297716150081566, + "grad_norm": 0.06345248967409134, + "learning_rate": 5.459372342177088e-05, + "loss": 0.0025, + "num_input_tokens_seen": 229067440, + "step": 106035 + }, + { + "epoch": 17.298531810766722, + "grad_norm": 0.01404933538287878, + "learning_rate": 5.456138588885562e-05, + "loss": 0.0013, + "num_input_tokens_seen": 229077168, + "step": 106040 + }, + { + "epoch": 17.299347471451878, + "grad_norm": 0.0008357339538633823, + "learning_rate": 5.452905738331898e-05, + "loss": 0.0006, + "num_input_tokens_seen": 229087984, + "step": 106045 + }, + { + "epoch": 17.30016313213703, + "grad_norm": 0.03588249534368515, + "learning_rate": 5.449673790581611e-05, + "loss": 0.0018, + "num_input_tokens_seen": 229098448, + "step": 106050 + }, + { + "epoch": 17.300978792822185, + "grad_norm": 0.009078881703317165, + "learning_rate": 5.446442745700198e-05, + "loss": 0.0031, + "num_input_tokens_seen": 229110160, + "step": 106055 + }, + { + "epoch": 17.30179445350734, + "grad_norm": 0.11321654915809631, + "learning_rate": 5.443212603753145e-05, + "loss": 0.0039, + "num_input_tokens_seen": 229119024, + "step": 106060 + }, + { + "epoch": 17.302610114192497, + "grad_norm": 0.00385329220443964, + "learning_rate": 5.439983364805912e-05, + "loss": 0.0027, + "num_input_tokens_seen": 229130128, + "step": 106065 + }, + { + "epoch": 17.303425774877653, + "grad_norm": 0.0014652871759608388, + "learning_rate": 5.436755028923945e-05, + "loss": 0.0007, + "num_input_tokens_seen": 229141552, + "step": 106070 + }, + { + "epoch": 17.304241435562805, + "grad_norm": 0.0064964075572788715, + "learning_rate": 5.433527596172666e-05, + "loss": 0.0034, + "num_input_tokens_seen": 229152208, + "step": 106075 + }, + { + "epoch": 17.30505709624796, + "grad_norm": 0.002419215627014637, + "learning_rate": 5.430301066617493e-05, + "loss": 0.0009, + "num_input_tokens_seen": 229163504, + "step": 106080 + }, + { + "epoch": 17.305872756933116, + "grad_norm": 0.00044188229367136955, + "learning_rate": 5.4270754403238034e-05, + "loss": 0.0015, + "num_input_tokens_seen": 229174864, + "step": 106085 + }, + { + "epoch": 17.306688417618272, + "grad_norm": 0.12870584428310394, + "learning_rate": 5.4238507173569816e-05, + "loss": 0.0022, + "num_input_tokens_seen": 229185584, + "step": 106090 + }, + { + "epoch": 17.307504078303428, + "grad_norm": 0.0003019646101165563, + "learning_rate": 5.420626897782366e-05, + "loss": 0.0563, + "num_input_tokens_seen": 229196432, + "step": 106095 + }, + { + "epoch": 17.30831973898858, + "grad_norm": 0.002201332477852702, + "learning_rate": 5.417403981665309e-05, + "loss": 0.0031, + "num_input_tokens_seen": 229207088, + "step": 106100 + }, + { + "epoch": 17.309135399673735, + "grad_norm": 0.006552582141011953, + "learning_rate": 5.414181969071108e-05, + "loss": 0.0006, + "num_input_tokens_seen": 229216560, + "step": 106105 + }, + { + "epoch": 17.30995106035889, + "grad_norm": 0.0005127699696458876, + "learning_rate": 5.410960860065073e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229227600, + "step": 106110 + }, + { + "epoch": 17.310766721044047, + "grad_norm": 0.0056101856753230095, + "learning_rate": 5.407740654712473e-05, + "loss": 0.001, + "num_input_tokens_seen": 229236816, + "step": 106115 + }, + { + "epoch": 17.3115823817292, + "grad_norm": 0.03478972986340523, + "learning_rate": 5.4045213530785896e-05, + "loss": 0.0252, + "num_input_tokens_seen": 229247248, + "step": 106120 + }, + { + "epoch": 17.312398042414355, + "grad_norm": 0.0011379508068785071, + "learning_rate": 5.401302955228654e-05, + "loss": 0.0013, + "num_input_tokens_seen": 229257264, + "step": 106125 + }, + { + "epoch": 17.31321370309951, + "grad_norm": 0.00601399689912796, + "learning_rate": 5.398085461227886e-05, + "loss": 0.0029, + "num_input_tokens_seen": 229266416, + "step": 106130 + }, + { + "epoch": 17.314029363784666, + "grad_norm": 0.004694198723882437, + "learning_rate": 5.394868871141506e-05, + "loss": 0.0071, + "num_input_tokens_seen": 229275984, + "step": 106135 + }, + { + "epoch": 17.31484502446982, + "grad_norm": 0.000795087544247508, + "learning_rate": 5.3916531850346895e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229287568, + "step": 106140 + }, + { + "epoch": 17.315660685154974, + "grad_norm": 0.027289612218737602, + "learning_rate": 5.388438402972612e-05, + "loss": 0.0022, + "num_input_tokens_seen": 229297904, + "step": 106145 + }, + { + "epoch": 17.31647634584013, + "grad_norm": 0.0041520558297634125, + "learning_rate": 5.385224525020421e-05, + "loss": 0.0032, + "num_input_tokens_seen": 229309264, + "step": 106150 + }, + { + "epoch": 17.317292006525285, + "grad_norm": 0.004298088140785694, + "learning_rate": 5.382011551243254e-05, + "loss": 0.001, + "num_input_tokens_seen": 229319760, + "step": 106155 + }, + { + "epoch": 17.31810766721044, + "grad_norm": 0.00606426689773798, + "learning_rate": 5.3787994817062256e-05, + "loss": 0.0015, + "num_input_tokens_seen": 229330832, + "step": 106160 + }, + { + "epoch": 17.318923327895597, + "grad_norm": 0.000536845822352916, + "learning_rate": 5.3755883164744335e-05, + "loss": 0.0011, + "num_input_tokens_seen": 229341040, + "step": 106165 + }, + { + "epoch": 17.31973898858075, + "grad_norm": 0.11563540250062943, + "learning_rate": 5.372378055612953e-05, + "loss": 0.0039, + "num_input_tokens_seen": 229350992, + "step": 106170 + }, + { + "epoch": 17.320554649265905, + "grad_norm": 0.00938224047422409, + "learning_rate": 5.369168699186844e-05, + "loss": 0.0019, + "num_input_tokens_seen": 229361488, + "step": 106175 + }, + { + "epoch": 17.32137030995106, + "grad_norm": 0.002466683741658926, + "learning_rate": 5.365960247261148e-05, + "loss": 0.054, + "num_input_tokens_seen": 229371472, + "step": 106180 + }, + { + "epoch": 17.322185970636216, + "grad_norm": 0.726901113986969, + "learning_rate": 5.3627526999008966e-05, + "loss": 0.0237, + "num_input_tokens_seen": 229383216, + "step": 106185 + }, + { + "epoch": 17.32300163132137, + "grad_norm": 0.03403662145137787, + "learning_rate": 5.359546057171083e-05, + "loss": 0.0052, + "num_input_tokens_seen": 229393072, + "step": 106190 + }, + { + "epoch": 17.323817292006524, + "grad_norm": 0.0012271327432245016, + "learning_rate": 5.356340319136699e-05, + "loss": 0.0016, + "num_input_tokens_seen": 229403120, + "step": 106195 + }, + { + "epoch": 17.32463295269168, + "grad_norm": 0.002099724020808935, + "learning_rate": 5.353135485862715e-05, + "loss": 0.0013, + "num_input_tokens_seen": 229413872, + "step": 106200 + }, + { + "epoch": 17.325448613376835, + "grad_norm": 0.0009085267083719373, + "learning_rate": 5.3499315574140784e-05, + "loss": 0.1136, + "num_input_tokens_seen": 229424624, + "step": 106205 + }, + { + "epoch": 17.32626427406199, + "grad_norm": 0.008246471174061298, + "learning_rate": 5.3467285338557213e-05, + "loss": 0.0013, + "num_input_tokens_seen": 229435504, + "step": 106210 + }, + { + "epoch": 17.327079934747147, + "grad_norm": 0.03950975835323334, + "learning_rate": 5.343526415252553e-05, + "loss": 0.0026, + "num_input_tokens_seen": 229445456, + "step": 106215 + }, + { + "epoch": 17.3278955954323, + "grad_norm": 0.06826602667570114, + "learning_rate": 5.340325201669477e-05, + "loss": 0.0022, + "num_input_tokens_seen": 229454960, + "step": 106220 + }, + { + "epoch": 17.328711256117455, + "grad_norm": 0.0021825393196195364, + "learning_rate": 5.337124893171358e-05, + "loss": 0.0009, + "num_input_tokens_seen": 229466000, + "step": 106225 + }, + { + "epoch": 17.32952691680261, + "grad_norm": 0.002758385380730033, + "learning_rate": 5.333925489823077e-05, + "loss": 0.0021, + "num_input_tokens_seen": 229476912, + "step": 106230 + }, + { + "epoch": 17.330342577487766, + "grad_norm": 0.0008691848488524556, + "learning_rate": 5.330726991689439e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229486928, + "step": 106235 + }, + { + "epoch": 17.33115823817292, + "grad_norm": 0.06600486487150192, + "learning_rate": 5.327529398835307e-05, + "loss": 0.0087, + "num_input_tokens_seen": 229497072, + "step": 106240 + }, + { + "epoch": 17.331973898858074, + "grad_norm": 0.0016669132746756077, + "learning_rate": 5.324332711325447e-05, + "loss": 0.0025, + "num_input_tokens_seen": 229509232, + "step": 106245 + }, + { + "epoch": 17.33278955954323, + "grad_norm": 0.009230856783688068, + "learning_rate": 5.3211369292246735e-05, + "loss": 0.0027, + "num_input_tokens_seen": 229519952, + "step": 106250 + }, + { + "epoch": 17.333605220228385, + "grad_norm": 0.01821569725871086, + "learning_rate": 5.317942052597724e-05, + "loss": 0.0368, + "num_input_tokens_seen": 229530096, + "step": 106255 + }, + { + "epoch": 17.33442088091354, + "grad_norm": 0.029215874150395393, + "learning_rate": 5.3147480815093684e-05, + "loss": 0.0019, + "num_input_tokens_seen": 229540656, + "step": 106260 + }, + { + "epoch": 17.335236541598697, + "grad_norm": 0.04781011864542961, + "learning_rate": 5.311555016024328e-05, + "loss": 0.0021, + "num_input_tokens_seen": 229549968, + "step": 106265 + }, + { + "epoch": 17.33605220228385, + "grad_norm": 0.000586999929510057, + "learning_rate": 5.308362856207322e-05, + "loss": 0.0008, + "num_input_tokens_seen": 229561264, + "step": 106270 + }, + { + "epoch": 17.336867862969005, + "grad_norm": 0.0008068184251897037, + "learning_rate": 5.3051716021230375e-05, + "loss": 0.0007, + "num_input_tokens_seen": 229572272, + "step": 106275 + }, + { + "epoch": 17.33768352365416, + "grad_norm": 0.12466217577457428, + "learning_rate": 5.3019812538361466e-05, + "loss": 0.0032, + "num_input_tokens_seen": 229583408, + "step": 106280 + }, + { + "epoch": 17.338499184339316, + "grad_norm": 0.0025564394891262054, + "learning_rate": 5.298791811411313e-05, + "loss": 0.0008, + "num_input_tokens_seen": 229593200, + "step": 106285 + }, + { + "epoch": 17.339314845024468, + "grad_norm": 0.0067188916727900505, + "learning_rate": 5.295603274913169e-05, + "loss": 0.0015, + "num_input_tokens_seen": 229603280, + "step": 106290 + }, + { + "epoch": 17.340130505709624, + "grad_norm": 0.044277604669332504, + "learning_rate": 5.292415644406334e-05, + "loss": 0.0032, + "num_input_tokens_seen": 229613840, + "step": 106295 + }, + { + "epoch": 17.34094616639478, + "grad_norm": 0.0005162619636394083, + "learning_rate": 5.289228919955413e-05, + "loss": 0.0039, + "num_input_tokens_seen": 229624784, + "step": 106300 + }, + { + "epoch": 17.341761827079935, + "grad_norm": 0.0008603575988672674, + "learning_rate": 5.286043101624988e-05, + "loss": 0.0045, + "num_input_tokens_seen": 229636240, + "step": 106305 + }, + { + "epoch": 17.34257748776509, + "grad_norm": 0.00944200623780489, + "learning_rate": 5.2828581894796226e-05, + "loss": 0.0009, + "num_input_tokens_seen": 229647696, + "step": 106310 + }, + { + "epoch": 17.343393148450243, + "grad_norm": 0.01924579218029976, + "learning_rate": 5.2796741835838656e-05, + "loss": 0.0027, + "num_input_tokens_seen": 229657808, + "step": 106315 + }, + { + "epoch": 17.3442088091354, + "grad_norm": 0.0027068655472248793, + "learning_rate": 5.276491084002238e-05, + "loss": 0.0005, + "num_input_tokens_seen": 229668592, + "step": 106320 + }, + { + "epoch": 17.345024469820554, + "grad_norm": 0.0045636678114533424, + "learning_rate": 5.273308890799261e-05, + "loss": 0.0015, + "num_input_tokens_seen": 229679632, + "step": 106325 + }, + { + "epoch": 17.34584013050571, + "grad_norm": 0.010973965749144554, + "learning_rate": 5.270127604039404e-05, + "loss": 0.0029, + "num_input_tokens_seen": 229691408, + "step": 106330 + }, + { + "epoch": 17.346655791190866, + "grad_norm": 0.2658245861530304, + "learning_rate": 5.266947223787177e-05, + "loss": 0.0074, + "num_input_tokens_seen": 229701776, + "step": 106335 + }, + { + "epoch": 17.347471451876018, + "grad_norm": 0.02542303130030632, + "learning_rate": 5.263767750106996e-05, + "loss": 0.0022, + "num_input_tokens_seen": 229712720, + "step": 106340 + }, + { + "epoch": 17.348287112561174, + "grad_norm": 0.0022277773823589087, + "learning_rate": 5.2605891830633304e-05, + "loss": 0.0046, + "num_input_tokens_seen": 229722480, + "step": 106345 + }, + { + "epoch": 17.34910277324633, + "grad_norm": 0.005143773276358843, + "learning_rate": 5.257411522720562e-05, + "loss": 0.0019, + "num_input_tokens_seen": 229733456, + "step": 106350 + }, + { + "epoch": 17.349918433931485, + "grad_norm": 0.009331930428743362, + "learning_rate": 5.2542347691431235e-05, + "loss": 0.0007, + "num_input_tokens_seen": 229743600, + "step": 106355 + }, + { + "epoch": 17.35073409461664, + "grad_norm": 0.001864943071268499, + "learning_rate": 5.251058922395368e-05, + "loss": 0.0005, + "num_input_tokens_seen": 229754544, + "step": 106360 + }, + { + "epoch": 17.351549755301793, + "grad_norm": 0.00036566847120411694, + "learning_rate": 5.24788398254169e-05, + "loss": 0.004, + "num_input_tokens_seen": 229765200, + "step": 106365 + }, + { + "epoch": 17.35236541598695, + "grad_norm": 0.006779797375202179, + "learning_rate": 5.2447099496463925e-05, + "loss": 0.005, + "num_input_tokens_seen": 229775568, + "step": 106370 + }, + { + "epoch": 17.353181076672104, + "grad_norm": 0.0017445924459025264, + "learning_rate": 5.241536823773846e-05, + "loss": 0.0009, + "num_input_tokens_seen": 229787152, + "step": 106375 + }, + { + "epoch": 17.35399673735726, + "grad_norm": 0.022132201120257378, + "learning_rate": 5.238364604988316e-05, + "loss": 0.0015, + "num_input_tokens_seen": 229796560, + "step": 106380 + }, + { + "epoch": 17.354812398042416, + "grad_norm": 0.38006922602653503, + "learning_rate": 5.235193293354129e-05, + "loss": 0.0359, + "num_input_tokens_seen": 229806512, + "step": 106385 + }, + { + "epoch": 17.355628058727568, + "grad_norm": 0.0037477388978004456, + "learning_rate": 5.2320228889355224e-05, + "loss": 0.0028, + "num_input_tokens_seen": 229816816, + "step": 106390 + }, + { + "epoch": 17.356443719412724, + "grad_norm": 0.004561097361147404, + "learning_rate": 5.228853391796784e-05, + "loss": 0.0007, + "num_input_tokens_seen": 229826544, + "step": 106395 + }, + { + "epoch": 17.35725938009788, + "grad_norm": 0.015991326421499252, + "learning_rate": 5.225684802002106e-05, + "loss": 0.0009, + "num_input_tokens_seen": 229838064, + "step": 106400 + }, + { + "epoch": 17.358075040783035, + "grad_norm": 0.023101402446627617, + "learning_rate": 5.222517119615733e-05, + "loss": 0.0012, + "num_input_tokens_seen": 229848880, + "step": 106405 + }, + { + "epoch": 17.35889070146819, + "grad_norm": 0.000387304782634601, + "learning_rate": 5.2193503447018564e-05, + "loss": 0.0005, + "num_input_tokens_seen": 229860720, + "step": 106410 + }, + { + "epoch": 17.359706362153343, + "grad_norm": 0.0031212973408401012, + "learning_rate": 5.216184477324659e-05, + "loss": 0.0008, + "num_input_tokens_seen": 229871984, + "step": 106415 + }, + { + "epoch": 17.3605220228385, + "grad_norm": 0.0009755408391356468, + "learning_rate": 5.2130195175482896e-05, + "loss": 0.0005, + "num_input_tokens_seen": 229881808, + "step": 106420 + }, + { + "epoch": 17.361337683523654, + "grad_norm": 0.00044328568037599325, + "learning_rate": 5.209855465436897e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229891728, + "step": 106425 + }, + { + "epoch": 17.36215334420881, + "grad_norm": 0.015467526391148567, + "learning_rate": 5.2066923210546015e-05, + "loss": 0.0166, + "num_input_tokens_seen": 229902288, + "step": 106430 + }, + { + "epoch": 17.362969004893966, + "grad_norm": 0.0005860764067620039, + "learning_rate": 5.203530084465513e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229911824, + "step": 106435 + }, + { + "epoch": 17.363784665579118, + "grad_norm": 0.0004039146879222244, + "learning_rate": 5.20036875573372e-05, + "loss": 0.0085, + "num_input_tokens_seen": 229923472, + "step": 106440 + }, + { + "epoch": 17.364600326264274, + "grad_norm": 0.0005128368502482772, + "learning_rate": 5.197208334923281e-05, + "loss": 0.0011, + "num_input_tokens_seen": 229934064, + "step": 106445 + }, + { + "epoch": 17.36541598694943, + "grad_norm": 0.0003911785315722227, + "learning_rate": 5.1940488220982516e-05, + "loss": 0.0022, + "num_input_tokens_seen": 229944496, + "step": 106450 + }, + { + "epoch": 17.366231647634585, + "grad_norm": 0.7548543214797974, + "learning_rate": 5.1908902173226524e-05, + "loss": 0.0669, + "num_input_tokens_seen": 229955504, + "step": 106455 + }, + { + "epoch": 17.36704730831974, + "grad_norm": 0.008681437000632286, + "learning_rate": 5.1877325206605316e-05, + "loss": 0.0024, + "num_input_tokens_seen": 229966544, + "step": 106460 + }, + { + "epoch": 17.367862969004893, + "grad_norm": 0.0003187756519764662, + "learning_rate": 5.1845757321758394e-05, + "loss": 0.001, + "num_input_tokens_seen": 229977520, + "step": 106465 + }, + { + "epoch": 17.36867862969005, + "grad_norm": 0.0001434768782928586, + "learning_rate": 5.181419851932589e-05, + "loss": 0.0005, + "num_input_tokens_seen": 229989072, + "step": 106470 + }, + { + "epoch": 17.369494290375204, + "grad_norm": 0.0026647213380783796, + "learning_rate": 5.178264879994704e-05, + "loss": 0.002, + "num_input_tokens_seen": 229999984, + "step": 106475 + }, + { + "epoch": 17.37030995106036, + "grad_norm": 0.0023641285952180624, + "learning_rate": 5.17511081642616e-05, + "loss": 0.0026, + "num_input_tokens_seen": 230010704, + "step": 106480 + }, + { + "epoch": 17.371125611745512, + "grad_norm": 0.7835355997085571, + "learning_rate": 5.171957661290838e-05, + "loss": 0.0747, + "num_input_tokens_seen": 230022224, + "step": 106485 + }, + { + "epoch": 17.371941272430668, + "grad_norm": 0.003112988080829382, + "learning_rate": 5.1688054146526886e-05, + "loss": 0.0016, + "num_input_tokens_seen": 230033360, + "step": 106490 + }, + { + "epoch": 17.372756933115824, + "grad_norm": 0.0008296226733364165, + "learning_rate": 5.165654076575543e-05, + "loss": 0.0033, + "num_input_tokens_seen": 230044336, + "step": 106495 + }, + { + "epoch": 17.37357259380098, + "grad_norm": 0.010152243077754974, + "learning_rate": 5.162503647123318e-05, + "loss": 0.0008, + "num_input_tokens_seen": 230054288, + "step": 106500 + }, + { + "epoch": 17.374388254486135, + "grad_norm": 0.0011838224017992616, + "learning_rate": 5.159354126359816e-05, + "loss": 0.001, + "num_input_tokens_seen": 230066000, + "step": 106505 + }, + { + "epoch": 17.375203915171287, + "grad_norm": 0.0012060764711350203, + "learning_rate": 5.156205514348905e-05, + "loss": 0.0006, + "num_input_tokens_seen": 230075344, + "step": 106510 + }, + { + "epoch": 17.376019575856443, + "grad_norm": 0.003541109850630164, + "learning_rate": 5.1530578111543605e-05, + "loss": 0.0016, + "num_input_tokens_seen": 230086192, + "step": 106515 + }, + { + "epoch": 17.3768352365416, + "grad_norm": 0.001406823517754674, + "learning_rate": 5.149911016840009e-05, + "loss": 0.0058, + "num_input_tokens_seen": 230097808, + "step": 106520 + }, + { + "epoch": 17.377650897226754, + "grad_norm": 0.006291515659540892, + "learning_rate": 5.146765131469594e-05, + "loss": 0.005, + "num_input_tokens_seen": 230108176, + "step": 106525 + }, + { + "epoch": 17.37846655791191, + "grad_norm": 0.00061332545010373, + "learning_rate": 5.1436201551068987e-05, + "loss": 0.0009, + "num_input_tokens_seen": 230118672, + "step": 106530 + }, + { + "epoch": 17.379282218597062, + "grad_norm": 0.0031427890062332153, + "learning_rate": 5.140476087815621e-05, + "loss": 0.0013, + "num_input_tokens_seen": 230129712, + "step": 106535 + }, + { + "epoch": 17.380097879282218, + "grad_norm": 0.002123972401022911, + "learning_rate": 5.137332929659522e-05, + "loss": 0.0005, + "num_input_tokens_seen": 230141648, + "step": 106540 + }, + { + "epoch": 17.380913539967374, + "grad_norm": 0.000174164364580065, + "learning_rate": 5.134190680702278e-05, + "loss": 0.0152, + "num_input_tokens_seen": 230151504, + "step": 106545 + }, + { + "epoch": 17.38172920065253, + "grad_norm": 0.021573202684521675, + "learning_rate": 5.1310493410075765e-05, + "loss": 0.0011, + "num_input_tokens_seen": 230162512, + "step": 106550 + }, + { + "epoch": 17.382544861337685, + "grad_norm": 0.017056437209248543, + "learning_rate": 5.127908910639084e-05, + "loss": 0.031, + "num_input_tokens_seen": 230173968, + "step": 106555 + }, + { + "epoch": 17.383360522022837, + "grad_norm": 0.0004546472628135234, + "learning_rate": 5.1247693896604386e-05, + "loss": 0.021, + "num_input_tokens_seen": 230184016, + "step": 106560 + }, + { + "epoch": 17.384176182707993, + "grad_norm": 0.009080810472369194, + "learning_rate": 5.1216307781352724e-05, + "loss": 0.0009, + "num_input_tokens_seen": 230193648, + "step": 106565 + }, + { + "epoch": 17.38499184339315, + "grad_norm": 0.22605130076408386, + "learning_rate": 5.11849307612719e-05, + "loss": 0.0077, + "num_input_tokens_seen": 230204912, + "step": 106570 + }, + { + "epoch": 17.385807504078304, + "grad_norm": 0.04267728328704834, + "learning_rate": 5.115356283699779e-05, + "loss": 0.0012, + "num_input_tokens_seen": 230216368, + "step": 106575 + }, + { + "epoch": 17.38662316476346, + "grad_norm": 0.012378888204693794, + "learning_rate": 5.112220400916617e-05, + "loss": 0.0009, + "num_input_tokens_seen": 230227664, + "step": 106580 + }, + { + "epoch": 17.387438825448612, + "grad_norm": 0.005728128831833601, + "learning_rate": 5.109085427841248e-05, + "loss": 0.0012, + "num_input_tokens_seen": 230239312, + "step": 106585 + }, + { + "epoch": 17.388254486133768, + "grad_norm": 0.0034778222907334566, + "learning_rate": 5.1059513645372146e-05, + "loss": 0.0006, + "num_input_tokens_seen": 230249616, + "step": 106590 + }, + { + "epoch": 17.389070146818923, + "grad_norm": 0.000555566162802279, + "learning_rate": 5.1028182110680275e-05, + "loss": 0.0007, + "num_input_tokens_seen": 230260400, + "step": 106595 + }, + { + "epoch": 17.38988580750408, + "grad_norm": 0.001118313753977418, + "learning_rate": 5.0996859674971805e-05, + "loss": 0.0038, + "num_input_tokens_seen": 230271184, + "step": 106600 + }, + { + "epoch": 17.390701468189235, + "grad_norm": 0.002053825417533517, + "learning_rate": 5.096554633888173e-05, + "loss": 0.0006, + "num_input_tokens_seen": 230282576, + "step": 106605 + }, + { + "epoch": 17.391517128874387, + "grad_norm": 0.008048903197050095, + "learning_rate": 5.093424210304426e-05, + "loss": 0.0011, + "num_input_tokens_seen": 230293392, + "step": 106610 + }, + { + "epoch": 17.392332789559543, + "grad_norm": 0.003288182895630598, + "learning_rate": 5.090294696809428e-05, + "loss": 0.0004, + "num_input_tokens_seen": 230303216, + "step": 106615 + }, + { + "epoch": 17.3931484502447, + "grad_norm": 0.005758529528975487, + "learning_rate": 5.087166093466566e-05, + "loss": 0.0006, + "num_input_tokens_seen": 230315152, + "step": 106620 + }, + { + "epoch": 17.393964110929854, + "grad_norm": 0.6126771569252014, + "learning_rate": 5.0840384003392745e-05, + "loss": 0.0095, + "num_input_tokens_seen": 230325328, + "step": 106625 + }, + { + "epoch": 17.39477977161501, + "grad_norm": 0.001895575551316142, + "learning_rate": 5.080911617490902e-05, + "loss": 0.0013, + "num_input_tokens_seen": 230335152, + "step": 106630 + }, + { + "epoch": 17.395595432300162, + "grad_norm": 0.00044205409358255565, + "learning_rate": 5.0777857449848644e-05, + "loss": 0.0009, + "num_input_tokens_seen": 230346224, + "step": 106635 + }, + { + "epoch": 17.396411092985318, + "grad_norm": 0.005649374332278967, + "learning_rate": 5.074660782884461e-05, + "loss": 0.0008, + "num_input_tokens_seen": 230357744, + "step": 106640 + }, + { + "epoch": 17.397226753670473, + "grad_norm": 0.0003312532207928598, + "learning_rate": 5.071536731253074e-05, + "loss": 0.0019, + "num_input_tokens_seen": 230368912, + "step": 106645 + }, + { + "epoch": 17.39804241435563, + "grad_norm": 0.0009800927946344018, + "learning_rate": 5.0684135901539694e-05, + "loss": 0.0004, + "num_input_tokens_seen": 230379600, + "step": 106650 + }, + { + "epoch": 17.39885807504078, + "grad_norm": 0.002080594189465046, + "learning_rate": 5.0652913596504704e-05, + "loss": 0.001, + "num_input_tokens_seen": 230390704, + "step": 106655 + }, + { + "epoch": 17.399673735725937, + "grad_norm": 0.03588714450597763, + "learning_rate": 5.062170039805847e-05, + "loss": 0.0947, + "num_input_tokens_seen": 230402416, + "step": 106660 + }, + { + "epoch": 17.400489396411093, + "grad_norm": 0.0012130021350458264, + "learning_rate": 5.05904963068336e-05, + "loss": 0.0085, + "num_input_tokens_seen": 230413360, + "step": 106665 + }, + { + "epoch": 17.40130505709625, + "grad_norm": 0.044631388038396835, + "learning_rate": 5.055930132346237e-05, + "loss": 0.0032, + "num_input_tokens_seen": 230423792, + "step": 106670 + }, + { + "epoch": 17.402120717781404, + "grad_norm": 0.001390898018144071, + "learning_rate": 5.0528115448577105e-05, + "loss": 0.0007, + "num_input_tokens_seen": 230434352, + "step": 106675 + }, + { + "epoch": 17.402936378466556, + "grad_norm": 0.5931783318519592, + "learning_rate": 5.0496938682809744e-05, + "loss": 0.0803, + "num_input_tokens_seen": 230445680, + "step": 106680 + }, + { + "epoch": 17.403752039151712, + "grad_norm": 0.0012526774080470204, + "learning_rate": 5.0465771026792175e-05, + "loss": 0.001, + "num_input_tokens_seen": 230456336, + "step": 106685 + }, + { + "epoch": 17.404567699836868, + "grad_norm": 0.0003373456420376897, + "learning_rate": 5.043461248115605e-05, + "loss": 0.0042, + "num_input_tokens_seen": 230468368, + "step": 106690 + }, + { + "epoch": 17.405383360522023, + "grad_norm": 0.05024491995573044, + "learning_rate": 5.040346304653276e-05, + "loss": 0.057, + "num_input_tokens_seen": 230479408, + "step": 106695 + }, + { + "epoch": 17.40619902120718, + "grad_norm": 0.0005603748722933233, + "learning_rate": 5.037232272355369e-05, + "loss": 0.0018, + "num_input_tokens_seen": 230490800, + "step": 106700 + }, + { + "epoch": 17.40701468189233, + "grad_norm": 0.00028684749850071967, + "learning_rate": 5.034119151284988e-05, + "loss": 0.0014, + "num_input_tokens_seen": 230502832, + "step": 106705 + }, + { + "epoch": 17.407830342577487, + "grad_norm": 0.007525811903178692, + "learning_rate": 5.031006941505228e-05, + "loss": 0.0007, + "num_input_tokens_seen": 230513744, + "step": 106710 + }, + { + "epoch": 17.408646003262643, + "grad_norm": 0.00031510432017967105, + "learning_rate": 5.0278956430791555e-05, + "loss": 0.0008, + "num_input_tokens_seen": 230524336, + "step": 106715 + }, + { + "epoch": 17.4094616639478, + "grad_norm": 0.0023549527395516634, + "learning_rate": 5.0247852560698304e-05, + "loss": 0.0004, + "num_input_tokens_seen": 230535440, + "step": 106720 + }, + { + "epoch": 17.410277324632954, + "grad_norm": 0.0041503324173390865, + "learning_rate": 5.0216757805402856e-05, + "loss": 0.0012, + "num_input_tokens_seen": 230544336, + "step": 106725 + }, + { + "epoch": 17.411092985318106, + "grad_norm": 0.0039215851575136185, + "learning_rate": 5.018567216553543e-05, + "loss": 0.001, + "num_input_tokens_seen": 230555792, + "step": 106730 + }, + { + "epoch": 17.411908646003262, + "grad_norm": 0.005930093117058277, + "learning_rate": 5.015459564172597e-05, + "loss": 0.0129, + "num_input_tokens_seen": 230567536, + "step": 106735 + }, + { + "epoch": 17.412724306688418, + "grad_norm": 0.0019989213906228542, + "learning_rate": 5.0123528234604307e-05, + "loss": 0.0511, + "num_input_tokens_seen": 230578736, + "step": 106740 + }, + { + "epoch": 17.413539967373573, + "grad_norm": 0.049455612897872925, + "learning_rate": 5.009246994479999e-05, + "loss": 0.0012, + "num_input_tokens_seen": 230588432, + "step": 106745 + }, + { + "epoch": 17.41435562805873, + "grad_norm": 0.0006300982204265893, + "learning_rate": 5.006142077294268e-05, + "loss": 0.0125, + "num_input_tokens_seen": 230599120, + "step": 106750 + }, + { + "epoch": 17.41517128874388, + "grad_norm": 0.0014582215808331966, + "learning_rate": 5.003038071966126e-05, + "loss": 0.0017, + "num_input_tokens_seen": 230610960, + "step": 106755 + }, + { + "epoch": 17.415986949429037, + "grad_norm": 0.015856770798563957, + "learning_rate": 4.999934978558513e-05, + "loss": 0.08, + "num_input_tokens_seen": 230621136, + "step": 106760 + }, + { + "epoch": 17.416802610114193, + "grad_norm": 0.007289855740964413, + "learning_rate": 4.996832797134299e-05, + "loss": 0.0006, + "num_input_tokens_seen": 230632368, + "step": 106765 + }, + { + "epoch": 17.41761827079935, + "grad_norm": 0.002110978588461876, + "learning_rate": 4.9937315277563625e-05, + "loss": 0.0008, + "num_input_tokens_seen": 230643600, + "step": 106770 + }, + { + "epoch": 17.418433931484504, + "grad_norm": 0.012086848728358746, + "learning_rate": 4.990631170487553e-05, + "loss": 0.0022, + "num_input_tokens_seen": 230655696, + "step": 106775 + }, + { + "epoch": 17.419249592169656, + "grad_norm": 0.08439251780509949, + "learning_rate": 4.987531725390698e-05, + "loss": 0.0095, + "num_input_tokens_seen": 230667920, + "step": 106780 + }, + { + "epoch": 17.420065252854812, + "grad_norm": 0.1708887368440628, + "learning_rate": 4.9844331925286145e-05, + "loss": 0.0077, + "num_input_tokens_seen": 230678800, + "step": 106785 + }, + { + "epoch": 17.420880913539968, + "grad_norm": 0.009966623969376087, + "learning_rate": 4.981335571964102e-05, + "loss": 0.0005, + "num_input_tokens_seen": 230689360, + "step": 106790 + }, + { + "epoch": 17.421696574225123, + "grad_norm": 0.007955643348395824, + "learning_rate": 4.978238863759932e-05, + "loss": 0.0005, + "num_input_tokens_seen": 230698672, + "step": 106795 + }, + { + "epoch": 17.42251223491028, + "grad_norm": 0.00029836222529411316, + "learning_rate": 4.975143067978866e-05, + "loss": 0.0013, + "num_input_tokens_seen": 230709360, + "step": 106800 + }, + { + "epoch": 17.42332789559543, + "grad_norm": 0.0006177327013574541, + "learning_rate": 4.9720481846836416e-05, + "loss": 0.0015, + "num_input_tokens_seen": 230721008, + "step": 106805 + }, + { + "epoch": 17.424143556280587, + "grad_norm": 0.013303990475833416, + "learning_rate": 4.968954213936988e-05, + "loss": 0.0028, + "num_input_tokens_seen": 230733872, + "step": 106810 + }, + { + "epoch": 17.424959216965743, + "grad_norm": 0.003197494661435485, + "learning_rate": 4.9658611558015984e-05, + "loss": 0.0006, + "num_input_tokens_seen": 230744880, + "step": 106815 + }, + { + "epoch": 17.4257748776509, + "grad_norm": 0.00020710163516923785, + "learning_rate": 4.962769010340163e-05, + "loss": 0.0011, + "num_input_tokens_seen": 230755504, + "step": 106820 + }, + { + "epoch": 17.42659053833605, + "grad_norm": 0.00011777384497690946, + "learning_rate": 4.959677777615351e-05, + "loss": 0.0023, + "num_input_tokens_seen": 230765968, + "step": 106825 + }, + { + "epoch": 17.427406199021206, + "grad_norm": 0.03826236352324486, + "learning_rate": 4.956587457689804e-05, + "loss": 0.0132, + "num_input_tokens_seen": 230777296, + "step": 106830 + }, + { + "epoch": 17.428221859706362, + "grad_norm": 0.000653933675494045, + "learning_rate": 4.953498050626154e-05, + "loss": 0.0007, + "num_input_tokens_seen": 230788944, + "step": 106835 + }, + { + "epoch": 17.429037520391518, + "grad_norm": 0.0026093318592756987, + "learning_rate": 4.9504095564870124e-05, + "loss": 0.0008, + "num_input_tokens_seen": 230799984, + "step": 106840 + }, + { + "epoch": 17.429853181076673, + "grad_norm": 0.0002395760966464877, + "learning_rate": 4.947321975334967e-05, + "loss": 0.0008, + "num_input_tokens_seen": 230810096, + "step": 106845 + }, + { + "epoch": 17.430668841761825, + "grad_norm": 0.0016788537614047527, + "learning_rate": 4.944235307232597e-05, + "loss": 0.0008, + "num_input_tokens_seen": 230820688, + "step": 106850 + }, + { + "epoch": 17.43148450244698, + "grad_norm": 0.008425015024840832, + "learning_rate": 4.941149552242458e-05, + "loss": 0.0012, + "num_input_tokens_seen": 230832848, + "step": 106855 + }, + { + "epoch": 17.432300163132137, + "grad_norm": 0.0009443744784221053, + "learning_rate": 4.9380647104270814e-05, + "loss": 0.002, + "num_input_tokens_seen": 230843088, + "step": 106860 + }, + { + "epoch": 17.433115823817293, + "grad_norm": 0.0007148012518882751, + "learning_rate": 4.93498078184898e-05, + "loss": 0.001, + "num_input_tokens_seen": 230853616, + "step": 106865 + }, + { + "epoch": 17.43393148450245, + "grad_norm": 0.0006740608369000256, + "learning_rate": 4.9318977665706866e-05, + "loss": 0.0043, + "num_input_tokens_seen": 230864336, + "step": 106870 + }, + { + "epoch": 17.4347471451876, + "grad_norm": 0.0037371008656919003, + "learning_rate": 4.928815664654635e-05, + "loss": 0.0015, + "num_input_tokens_seen": 230875312, + "step": 106875 + }, + { + "epoch": 17.435562805872756, + "grad_norm": 0.036037005484104156, + "learning_rate": 4.9257344761633236e-05, + "loss": 0.0014, + "num_input_tokens_seen": 230885936, + "step": 106880 + }, + { + "epoch": 17.436378466557912, + "grad_norm": 0.0017335086595267057, + "learning_rate": 4.9226542011591716e-05, + "loss": 0.0003, + "num_input_tokens_seen": 230896688, + "step": 106885 + }, + { + "epoch": 17.437194127243067, + "grad_norm": 0.015234340913593769, + "learning_rate": 4.919574839704627e-05, + "loss": 0.0158, + "num_input_tokens_seen": 230907408, + "step": 106890 + }, + { + "epoch": 17.438009787928223, + "grad_norm": 0.005732585676014423, + "learning_rate": 4.916496391862085e-05, + "loss": 0.0337, + "num_input_tokens_seen": 230918256, + "step": 106895 + }, + { + "epoch": 17.438825448613375, + "grad_norm": 0.0019450652180239558, + "learning_rate": 4.913418857693936e-05, + "loss": 0.0017, + "num_input_tokens_seen": 230928944, + "step": 106900 + }, + { + "epoch": 17.43964110929853, + "grad_norm": 0.051235880702733994, + "learning_rate": 4.9103422372625496e-05, + "loss": 0.0013, + "num_input_tokens_seen": 230939408, + "step": 106905 + }, + { + "epoch": 17.440456769983687, + "grad_norm": 0.10393361747264862, + "learning_rate": 4.907266530630278e-05, + "loss": 0.0035, + "num_input_tokens_seen": 230949296, + "step": 106910 + }, + { + "epoch": 17.441272430668842, + "grad_norm": 0.001110541052184999, + "learning_rate": 4.904191737859454e-05, + "loss": 0.0005, + "num_input_tokens_seen": 230960592, + "step": 106915 + }, + { + "epoch": 17.442088091353998, + "grad_norm": 0.013865980319678783, + "learning_rate": 4.901117859012394e-05, + "loss": 0.0014, + "num_input_tokens_seen": 230971504, + "step": 106920 + }, + { + "epoch": 17.44290375203915, + "grad_norm": 0.0016207977896556258, + "learning_rate": 4.898044894151393e-05, + "loss": 0.0003, + "num_input_tokens_seen": 230981200, + "step": 106925 + }, + { + "epoch": 17.443719412724306, + "grad_norm": 0.00044690087088383734, + "learning_rate": 4.894972843338724e-05, + "loss": 0.0041, + "num_input_tokens_seen": 230991920, + "step": 106930 + }, + { + "epoch": 17.44453507340946, + "grad_norm": 0.00033451549825258553, + "learning_rate": 4.891901706636653e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231002480, + "step": 106935 + }, + { + "epoch": 17.445350734094617, + "grad_norm": 0.0024510840885341167, + "learning_rate": 4.88883148410742e-05, + "loss": 0.0008, + "num_input_tokens_seen": 231013904, + "step": 106940 + }, + { + "epoch": 17.446166394779773, + "grad_norm": 0.0023836391046643257, + "learning_rate": 4.885762175813241e-05, + "loss": 0.0009, + "num_input_tokens_seen": 231024528, + "step": 106945 + }, + { + "epoch": 17.446982055464925, + "grad_norm": 0.005944438744336367, + "learning_rate": 4.882693781816327e-05, + "loss": 0.0015, + "num_input_tokens_seen": 231036048, + "step": 106950 + }, + { + "epoch": 17.44779771615008, + "grad_norm": 0.0009455296094529331, + "learning_rate": 4.8796263021788524e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231047344, + "step": 106955 + }, + { + "epoch": 17.448613376835237, + "grad_norm": 0.026120547205209732, + "learning_rate": 4.876559736962999e-05, + "loss": 0.0009, + "num_input_tokens_seen": 231059440, + "step": 106960 + }, + { + "epoch": 17.449429037520392, + "grad_norm": 0.002475123852491379, + "learning_rate": 4.8734940862309006e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231071088, + "step": 106965 + }, + { + "epoch": 17.450244698205548, + "grad_norm": 0.045191384851932526, + "learning_rate": 4.8704293500446806e-05, + "loss": 0.0101, + "num_input_tokens_seen": 231082576, + "step": 106970 + }, + { + "epoch": 17.4510603588907, + "grad_norm": 0.007040271535515785, + "learning_rate": 4.867365528466477e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231092816, + "step": 106975 + }, + { + "epoch": 17.451876019575856, + "grad_norm": 0.0007439907640218735, + "learning_rate": 4.864302621558353e-05, + "loss": 0.003, + "num_input_tokens_seen": 231103760, + "step": 106980 + }, + { + "epoch": 17.45269168026101, + "grad_norm": 0.2785327434539795, + "learning_rate": 4.861240629382413e-05, + "loss": 0.0085, + "num_input_tokens_seen": 231114800, + "step": 106985 + }, + { + "epoch": 17.453507340946167, + "grad_norm": 0.030314238741993904, + "learning_rate": 4.858179552000674e-05, + "loss": 0.003, + "num_input_tokens_seen": 231126064, + "step": 106990 + }, + { + "epoch": 17.454323001631323, + "grad_norm": 0.0005355747998692095, + "learning_rate": 4.85511938947521e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231137424, + "step": 106995 + }, + { + "epoch": 17.455138662316475, + "grad_norm": 0.006188563071191311, + "learning_rate": 4.8520601418680085e-05, + "loss": 0.0011, + "num_input_tokens_seen": 231148464, + "step": 107000 + }, + { + "epoch": 17.45595432300163, + "grad_norm": 0.00108015863224864, + "learning_rate": 4.849001809241099e-05, + "loss": 0.0026, + "num_input_tokens_seen": 231159696, + "step": 107005 + }, + { + "epoch": 17.456769983686787, + "grad_norm": 0.0003742810513358563, + "learning_rate": 4.845944391656426e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231170096, + "step": 107010 + }, + { + "epoch": 17.457585644371942, + "grad_norm": 0.0034818367566913366, + "learning_rate": 4.84288788917599e-05, + "loss": 0.0004, + "num_input_tokens_seen": 231181232, + "step": 107015 + }, + { + "epoch": 17.458401305057095, + "grad_norm": 0.0006757063092663884, + "learning_rate": 4.839832301861696e-05, + "loss": 0.0005, + "num_input_tokens_seen": 231192848, + "step": 107020 + }, + { + "epoch": 17.45921696574225, + "grad_norm": 0.0012268743012100458, + "learning_rate": 4.836777629775513e-05, + "loss": 0.0044, + "num_input_tokens_seen": 231204144, + "step": 107025 + }, + { + "epoch": 17.460032626427406, + "grad_norm": 0.0005029301391914487, + "learning_rate": 4.833723872979306e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231215344, + "step": 107030 + }, + { + "epoch": 17.46084828711256, + "grad_norm": 0.0007763210451230407, + "learning_rate": 4.830671031534989e-05, + "loss": 0.0008, + "num_input_tokens_seen": 231226416, + "step": 107035 + }, + { + "epoch": 17.461663947797717, + "grad_norm": 0.010013229213654995, + "learning_rate": 4.827619105504427e-05, + "loss": 0.007, + "num_input_tokens_seen": 231237840, + "step": 107040 + }, + { + "epoch": 17.46247960848287, + "grad_norm": 0.00031955906888470054, + "learning_rate": 4.8245680949494664e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231247312, + "step": 107045 + }, + { + "epoch": 17.463295269168025, + "grad_norm": 0.00021452225337270647, + "learning_rate": 4.821517999931946e-05, + "loss": 0.0004, + "num_input_tokens_seen": 231258320, + "step": 107050 + }, + { + "epoch": 17.46411092985318, + "grad_norm": 0.08768285810947418, + "learning_rate": 4.8184688205136716e-05, + "loss": 0.0026, + "num_input_tokens_seen": 231268080, + "step": 107055 + }, + { + "epoch": 17.464926590538337, + "grad_norm": 0.0003200488572474569, + "learning_rate": 4.8154205567564503e-05, + "loss": 0.0014, + "num_input_tokens_seen": 231277552, + "step": 107060 + }, + { + "epoch": 17.465742251223492, + "grad_norm": 0.023660294711589813, + "learning_rate": 4.812373208722048e-05, + "loss": 0.0016, + "num_input_tokens_seen": 231289360, + "step": 107065 + }, + { + "epoch": 17.466557911908644, + "grad_norm": 0.002158315386623144, + "learning_rate": 4.809326776472228e-05, + "loss": 0.0026, + "num_input_tokens_seen": 231299888, + "step": 107070 + }, + { + "epoch": 17.4673735725938, + "grad_norm": 0.0004791323735844344, + "learning_rate": 4.806281260068729e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231311568, + "step": 107075 + }, + { + "epoch": 17.468189233278956, + "grad_norm": 0.0012294030748307705, + "learning_rate": 4.803236659573274e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231322032, + "step": 107080 + }, + { + "epoch": 17.46900489396411, + "grad_norm": 1.8501989841461182, + "learning_rate": 4.800192975047551e-05, + "loss": 0.0749, + "num_input_tokens_seen": 231332880, + "step": 107085 + }, + { + "epoch": 17.469820554649267, + "grad_norm": 0.016211597248911858, + "learning_rate": 4.79715020655328e-05, + "loss": 0.0043, + "num_input_tokens_seen": 231344560, + "step": 107090 + }, + { + "epoch": 17.47063621533442, + "grad_norm": 0.0010530364234000444, + "learning_rate": 4.794108354152082e-05, + "loss": 0.0008, + "num_input_tokens_seen": 231354896, + "step": 107095 + }, + { + "epoch": 17.471451876019575, + "grad_norm": 0.0028281863778829575, + "learning_rate": 4.791067417905648e-05, + "loss": 0.0016, + "num_input_tokens_seen": 231365648, + "step": 107100 + }, + { + "epoch": 17.47226753670473, + "grad_norm": 0.5425127744674683, + "learning_rate": 4.7880273978755606e-05, + "loss": 0.164, + "num_input_tokens_seen": 231375728, + "step": 107105 + }, + { + "epoch": 17.473083197389887, + "grad_norm": 0.00036838767118752003, + "learning_rate": 4.784988294123477e-05, + "loss": 0.0012, + "num_input_tokens_seen": 231387376, + "step": 107110 + }, + { + "epoch": 17.473898858075042, + "grad_norm": 0.022055508568882942, + "learning_rate": 4.781950106710942e-05, + "loss": 0.002, + "num_input_tokens_seen": 231397776, + "step": 107115 + }, + { + "epoch": 17.474714518760194, + "grad_norm": 0.0015256558544933796, + "learning_rate": 4.7789128356995727e-05, + "loss": 0.002, + "num_input_tokens_seen": 231408464, + "step": 107120 + }, + { + "epoch": 17.47553017944535, + "grad_norm": 0.8088774681091309, + "learning_rate": 4.775876481150887e-05, + "loss": 0.1319, + "num_input_tokens_seen": 231419312, + "step": 107125 + }, + { + "epoch": 17.476345840130506, + "grad_norm": 0.0034476907458156347, + "learning_rate": 4.772841043126447e-05, + "loss": 0.0004, + "num_input_tokens_seen": 231429648, + "step": 107130 + }, + { + "epoch": 17.47716150081566, + "grad_norm": 0.0049369195476174355, + "learning_rate": 4.769806521687742e-05, + "loss": 0.0005, + "num_input_tokens_seen": 231440208, + "step": 107135 + }, + { + "epoch": 17.477977161500817, + "grad_norm": 0.004842468071728945, + "learning_rate": 4.766772916896306e-05, + "loss": 0.021, + "num_input_tokens_seen": 231451856, + "step": 107140 + }, + { + "epoch": 17.47879282218597, + "grad_norm": 0.014109466224908829, + "learning_rate": 4.763740228813579e-05, + "loss": 0.0014, + "num_input_tokens_seen": 231463888, + "step": 107145 + }, + { + "epoch": 17.479608482871125, + "grad_norm": 0.008914794772863388, + "learning_rate": 4.760708457501062e-05, + "loss": 0.0042, + "num_input_tokens_seen": 231475184, + "step": 107150 + }, + { + "epoch": 17.48042414355628, + "grad_norm": 0.0373489186167717, + "learning_rate": 4.7576776030201606e-05, + "loss": 0.0017, + "num_input_tokens_seen": 231487312, + "step": 107155 + }, + { + "epoch": 17.481239804241437, + "grad_norm": 0.01696755364537239, + "learning_rate": 4.754647665432338e-05, + "loss": 0.0108, + "num_input_tokens_seen": 231497488, + "step": 107160 + }, + { + "epoch": 17.482055464926592, + "grad_norm": 0.0018327207071706653, + "learning_rate": 4.751618644798955e-05, + "loss": 0.0017, + "num_input_tokens_seen": 231508080, + "step": 107165 + }, + { + "epoch": 17.482871125611744, + "grad_norm": 0.00462839612737298, + "learning_rate": 4.7485905411814414e-05, + "loss": 0.0016, + "num_input_tokens_seen": 231518480, + "step": 107170 + }, + { + "epoch": 17.4836867862969, + "grad_norm": 0.00042224518256261945, + "learning_rate": 4.745563354641125e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231528496, + "step": 107175 + }, + { + "epoch": 17.484502446982056, + "grad_norm": 0.0018490392249077559, + "learning_rate": 4.74253708523939e-05, + "loss": 0.0008, + "num_input_tokens_seen": 231538128, + "step": 107180 + }, + { + "epoch": 17.48531810766721, + "grad_norm": 0.0025340570136904716, + "learning_rate": 4.7395117330375494e-05, + "loss": 0.0138, + "num_input_tokens_seen": 231548688, + "step": 107185 + }, + { + "epoch": 17.486133768352367, + "grad_norm": 0.024287715554237366, + "learning_rate": 4.7364872980969254e-05, + "loss": 0.0073, + "num_input_tokens_seen": 231560080, + "step": 107190 + }, + { + "epoch": 17.48694942903752, + "grad_norm": 0.0004691890790127218, + "learning_rate": 4.733463780478808e-05, + "loss": 0.02, + "num_input_tokens_seen": 231571952, + "step": 107195 + }, + { + "epoch": 17.487765089722675, + "grad_norm": 0.001160036539658904, + "learning_rate": 4.7304411802444656e-05, + "loss": 0.0012, + "num_input_tokens_seen": 231582736, + "step": 107200 + }, + { + "epoch": 17.48858075040783, + "grad_norm": 0.9447407126426697, + "learning_rate": 4.7274194974551656e-05, + "loss": 0.0331, + "num_input_tokens_seen": 231593872, + "step": 107205 + }, + { + "epoch": 17.489396411092986, + "grad_norm": 0.0005999091663397849, + "learning_rate": 4.724398732172142e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231603216, + "step": 107210 + }, + { + "epoch": 17.49021207177814, + "grad_norm": 0.006160971242934465, + "learning_rate": 4.721378884456612e-05, + "loss": 0.0251, + "num_input_tokens_seen": 231614544, + "step": 107215 + }, + { + "epoch": 17.491027732463294, + "grad_norm": 0.0011769005795940757, + "learning_rate": 4.718359954369783e-05, + "loss": 0.0015, + "num_input_tokens_seen": 231624048, + "step": 107220 + }, + { + "epoch": 17.49184339314845, + "grad_norm": 0.0006996638257987797, + "learning_rate": 4.7153419419728285e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231634800, + "step": 107225 + }, + { + "epoch": 17.492659053833606, + "grad_norm": 0.005700491834431887, + "learning_rate": 4.7123248473269096e-05, + "loss": 0.0013, + "num_input_tokens_seen": 231645456, + "step": 107230 + }, + { + "epoch": 17.49347471451876, + "grad_norm": 0.009123490191996098, + "learning_rate": 4.7093086704931955e-05, + "loss": 0.0028, + "num_input_tokens_seen": 231657136, + "step": 107235 + }, + { + "epoch": 17.494290375203914, + "grad_norm": 0.91878342628479, + "learning_rate": 4.7062934115327804e-05, + "loss": 0.005, + "num_input_tokens_seen": 231668368, + "step": 107240 + }, + { + "epoch": 17.49510603588907, + "grad_norm": 0.0010388504015281796, + "learning_rate": 4.7032790705068105e-05, + "loss": 0.0005, + "num_input_tokens_seen": 231679088, + "step": 107245 + }, + { + "epoch": 17.495921696574225, + "grad_norm": 0.0025188778527081013, + "learning_rate": 4.700265647476332e-05, + "loss": 0.001, + "num_input_tokens_seen": 231690480, + "step": 107250 + }, + { + "epoch": 17.49673735725938, + "grad_norm": 0.03646330535411835, + "learning_rate": 4.69725314250245e-05, + "loss": 0.0037, + "num_input_tokens_seen": 231701744, + "step": 107255 + }, + { + "epoch": 17.497553017944536, + "grad_norm": 0.004327393136918545, + "learning_rate": 4.6942415556461894e-05, + "loss": 0.0021, + "num_input_tokens_seen": 231712944, + "step": 107260 + }, + { + "epoch": 17.49836867862969, + "grad_norm": 0.007958785630762577, + "learning_rate": 4.691230886968617e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231723088, + "step": 107265 + }, + { + "epoch": 17.499184339314844, + "grad_norm": 0.0002876732323784381, + "learning_rate": 4.688221136530712e-05, + "loss": 0.0009, + "num_input_tokens_seen": 231733072, + "step": 107270 + }, + { + "epoch": 17.5, + "grad_norm": 0.015398185700178146, + "learning_rate": 4.6852123043935044e-05, + "loss": 0.0016, + "num_input_tokens_seen": 231743824, + "step": 107275 + }, + { + "epoch": 17.500815660685156, + "grad_norm": 0.0010171079775318503, + "learning_rate": 4.682204390617939e-05, + "loss": 0.0005, + "num_input_tokens_seen": 231754192, + "step": 107280 + }, + { + "epoch": 17.50163132137031, + "grad_norm": 0.01936420053243637, + "learning_rate": 4.6791973952650056e-05, + "loss": 0.001, + "num_input_tokens_seen": 231765200, + "step": 107285 + }, + { + "epoch": 17.502446982055464, + "grad_norm": 0.38633960485458374, + "learning_rate": 4.6761913183956175e-05, + "loss": 0.0156, + "num_input_tokens_seen": 231776368, + "step": 107290 + }, + { + "epoch": 17.50326264274062, + "grad_norm": 0.004747708793729544, + "learning_rate": 4.673186160070714e-05, + "loss": 0.0036, + "num_input_tokens_seen": 231788112, + "step": 107295 + }, + { + "epoch": 17.504078303425775, + "grad_norm": 0.02657368592917919, + "learning_rate": 4.6701819203511964e-05, + "loss": 0.0773, + "num_input_tokens_seen": 231798480, + "step": 107300 + }, + { + "epoch": 17.50489396411093, + "grad_norm": 0.00034374097594991326, + "learning_rate": 4.667178599297944e-05, + "loss": 0.1074, + "num_input_tokens_seen": 231810256, + "step": 107305 + }, + { + "epoch": 17.505709624796086, + "grad_norm": 0.002067964058369398, + "learning_rate": 4.664176196971831e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231821168, + "step": 107310 + }, + { + "epoch": 17.50652528548124, + "grad_norm": 0.0008781051146797836, + "learning_rate": 4.661174713433697e-05, + "loss": 0.0154, + "num_input_tokens_seen": 231831536, + "step": 107315 + }, + { + "epoch": 17.507340946166394, + "grad_norm": 0.006266510114073753, + "learning_rate": 4.6581741487443765e-05, + "loss": 0.002, + "num_input_tokens_seen": 231841520, + "step": 107320 + }, + { + "epoch": 17.50815660685155, + "grad_norm": 0.00011919608368771151, + "learning_rate": 4.655174502964676e-05, + "loss": 0.0013, + "num_input_tokens_seen": 231851184, + "step": 107325 + }, + { + "epoch": 17.508972267536706, + "grad_norm": 0.0016120661748573184, + "learning_rate": 4.6521757761553873e-05, + "loss": 0.0016, + "num_input_tokens_seen": 231862864, + "step": 107330 + }, + { + "epoch": 17.50978792822186, + "grad_norm": 0.0043755825608968735, + "learning_rate": 4.6491779683772825e-05, + "loss": 0.0008, + "num_input_tokens_seen": 231873456, + "step": 107335 + }, + { + "epoch": 17.510603588907014, + "grad_norm": 0.0005683773779310286, + "learning_rate": 4.64618107969112e-05, + "loss": 0.0249, + "num_input_tokens_seen": 231883792, + "step": 107340 + }, + { + "epoch": 17.51141924959217, + "grad_norm": 0.0006177395698614419, + "learning_rate": 4.643185110157633e-05, + "loss": 0.002, + "num_input_tokens_seen": 231894640, + "step": 107345 + }, + { + "epoch": 17.512234910277325, + "grad_norm": 0.010703234001994133, + "learning_rate": 4.640190059837535e-05, + "loss": 0.0071, + "num_input_tokens_seen": 231906000, + "step": 107350 + }, + { + "epoch": 17.51305057096248, + "grad_norm": 0.0019364446634426713, + "learning_rate": 4.637195928791532e-05, + "loss": 0.0029, + "num_input_tokens_seen": 231917232, + "step": 107355 + }, + { + "epoch": 17.513866231647633, + "grad_norm": 0.0028983517549932003, + "learning_rate": 4.634202717080305e-05, + "loss": 0.0261, + "num_input_tokens_seen": 231927120, + "step": 107360 + }, + { + "epoch": 17.51468189233279, + "grad_norm": 0.0004596057115122676, + "learning_rate": 4.6312104247645035e-05, + "loss": 0.0015, + "num_input_tokens_seen": 231937872, + "step": 107365 + }, + { + "epoch": 17.515497553017944, + "grad_norm": 0.0075641958974301815, + "learning_rate": 4.6282190519047805e-05, + "loss": 0.0016, + "num_input_tokens_seen": 231947984, + "step": 107370 + }, + { + "epoch": 17.5163132137031, + "grad_norm": 0.0040355888195335865, + "learning_rate": 4.625228598561748e-05, + "loss": 0.0005, + "num_input_tokens_seen": 231959248, + "step": 107375 + }, + { + "epoch": 17.517128874388256, + "grad_norm": 0.0004805214412044734, + "learning_rate": 4.6222390647960356e-05, + "loss": 0.0006, + "num_input_tokens_seen": 231970064, + "step": 107380 + }, + { + "epoch": 17.517944535073408, + "grad_norm": 0.0017403739038854837, + "learning_rate": 4.619250450668194e-05, + "loss": 0.0032, + "num_input_tokens_seen": 231980944, + "step": 107385 + }, + { + "epoch": 17.518760195758563, + "grad_norm": 0.010331138968467712, + "learning_rate": 4.616262756238837e-05, + "loss": 0.0011, + "num_input_tokens_seen": 231991088, + "step": 107390 + }, + { + "epoch": 17.51957585644372, + "grad_norm": 0.0025749632623046637, + "learning_rate": 4.613275981568465e-05, + "loss": 0.0012, + "num_input_tokens_seen": 232001968, + "step": 107395 + }, + { + "epoch": 17.520391517128875, + "grad_norm": 0.011596056632697582, + "learning_rate": 4.610290126717642e-05, + "loss": 0.0031, + "num_input_tokens_seen": 232012400, + "step": 107400 + }, + { + "epoch": 17.52120717781403, + "grad_norm": 0.0006934392149560153, + "learning_rate": 4.607305191746874e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232023088, + "step": 107405 + }, + { + "epoch": 17.522022838499183, + "grad_norm": 0.0007819249876774848, + "learning_rate": 4.604321176716647e-05, + "loss": 0.009, + "num_input_tokens_seen": 232035024, + "step": 107410 + }, + { + "epoch": 17.52283849918434, + "grad_norm": 0.000496567867230624, + "learning_rate": 4.6013380816874394e-05, + "loss": 0.0032, + "num_input_tokens_seen": 232045904, + "step": 107415 + }, + { + "epoch": 17.523654159869494, + "grad_norm": 0.021955974400043488, + "learning_rate": 4.598355906719709e-05, + "loss": 0.0014, + "num_input_tokens_seen": 232057872, + "step": 107420 + }, + { + "epoch": 17.52446982055465, + "grad_norm": 0.01769077777862549, + "learning_rate": 4.595374651873896e-05, + "loss": 0.0094, + "num_input_tokens_seen": 232069072, + "step": 107425 + }, + { + "epoch": 17.525285481239806, + "grad_norm": 0.0007907668477855623, + "learning_rate": 4.592394317210413e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232079728, + "step": 107430 + }, + { + "epoch": 17.526101141924958, + "grad_norm": 0.02364833652973175, + "learning_rate": 4.589414902789662e-05, + "loss": 0.0054, + "num_input_tokens_seen": 232090800, + "step": 107435 + }, + { + "epoch": 17.526916802610113, + "grad_norm": 0.0035746158100664616, + "learning_rate": 4.586436408672023e-05, + "loss": 0.0029, + "num_input_tokens_seen": 232101360, + "step": 107440 + }, + { + "epoch": 17.52773246329527, + "grad_norm": 0.19809827208518982, + "learning_rate": 4.583458834917864e-05, + "loss": 0.0083, + "num_input_tokens_seen": 232112336, + "step": 107445 + }, + { + "epoch": 17.528548123980425, + "grad_norm": 0.14923065900802612, + "learning_rate": 4.580482181587531e-05, + "loss": 0.0035, + "num_input_tokens_seen": 232123152, + "step": 107450 + }, + { + "epoch": 17.52936378466558, + "grad_norm": 0.7618481516838074, + "learning_rate": 4.5775064487413424e-05, + "loss": 0.0509, + "num_input_tokens_seen": 232133328, + "step": 107455 + }, + { + "epoch": 17.530179445350733, + "grad_norm": 0.003851557383313775, + "learning_rate": 4.574531636439605e-05, + "loss": 0.0036, + "num_input_tokens_seen": 232143824, + "step": 107460 + }, + { + "epoch": 17.53099510603589, + "grad_norm": 0.007336875889450312, + "learning_rate": 4.57155774474261e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232156080, + "step": 107465 + }, + { + "epoch": 17.531810766721044, + "grad_norm": 0.002769289305433631, + "learning_rate": 4.568584773710632e-05, + "loss": 0.0034, + "num_input_tokens_seen": 232165968, + "step": 107470 + }, + { + "epoch": 17.5326264274062, + "grad_norm": 0.1800115555524826, + "learning_rate": 4.565612723403911e-05, + "loss": 0.0078, + "num_input_tokens_seen": 232176752, + "step": 107475 + }, + { + "epoch": 17.533442088091356, + "grad_norm": 0.0010139649966731668, + "learning_rate": 4.562641593882694e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232187856, + "step": 107480 + }, + { + "epoch": 17.534257748776508, + "grad_norm": 0.0031493548303842545, + "learning_rate": 4.5596713852071816e-05, + "loss": 0.0013, + "num_input_tokens_seen": 232199696, + "step": 107485 + }, + { + "epoch": 17.535073409461663, + "grad_norm": 0.31990453600883484, + "learning_rate": 4.556702097437576e-05, + "loss": 0.0078, + "num_input_tokens_seen": 232210832, + "step": 107490 + }, + { + "epoch": 17.53588907014682, + "grad_norm": 0.005481668282300234, + "learning_rate": 4.5537337306340466e-05, + "loss": 0.0011, + "num_input_tokens_seen": 232222000, + "step": 107495 + }, + { + "epoch": 17.536704730831975, + "grad_norm": 0.00037945323856547475, + "learning_rate": 4.550766284856761e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232233584, + "step": 107500 + }, + { + "epoch": 17.53752039151713, + "grad_norm": 0.013014235533773899, + "learning_rate": 4.5477997601658384e-05, + "loss": 0.0033, + "num_input_tokens_seen": 232244464, + "step": 107505 + }, + { + "epoch": 17.538336052202283, + "grad_norm": 0.0016114584868773818, + "learning_rate": 4.5448341566214354e-05, + "loss": 0.0023, + "num_input_tokens_seen": 232255184, + "step": 107510 + }, + { + "epoch": 17.53915171288744, + "grad_norm": 0.001107938471250236, + "learning_rate": 4.541869474283616e-05, + "loss": 0.0005, + "num_input_tokens_seen": 232266896, + "step": 107515 + }, + { + "epoch": 17.539967373572594, + "grad_norm": 0.04263180494308472, + "learning_rate": 4.538905713212488e-05, + "loss": 0.0023, + "num_input_tokens_seen": 232276816, + "step": 107520 + }, + { + "epoch": 17.54078303425775, + "grad_norm": 0.0007417293963953853, + "learning_rate": 4.535942873468102e-05, + "loss": 0.0007, + "num_input_tokens_seen": 232287664, + "step": 107525 + }, + { + "epoch": 17.541598694942905, + "grad_norm": 0.0029360156040638685, + "learning_rate": 4.532980955110516e-05, + "loss": 0.0034, + "num_input_tokens_seen": 232298512, + "step": 107530 + }, + { + "epoch": 17.542414355628058, + "grad_norm": 0.009608111344277859, + "learning_rate": 4.530019958199744e-05, + "loss": 0.0012, + "num_input_tokens_seen": 232309520, + "step": 107535 + }, + { + "epoch": 17.543230016313213, + "grad_norm": 0.006099861580878496, + "learning_rate": 4.527059882795803e-05, + "loss": 0.0014, + "num_input_tokens_seen": 232320272, + "step": 107540 + }, + { + "epoch": 17.54404567699837, + "grad_norm": 0.002252019476145506, + "learning_rate": 4.52410072895868e-05, + "loss": 0.0011, + "num_input_tokens_seen": 232329808, + "step": 107545 + }, + { + "epoch": 17.544861337683525, + "grad_norm": 0.09089305996894836, + "learning_rate": 4.521142496748348e-05, + "loss": 0.0019, + "num_input_tokens_seen": 232340592, + "step": 107550 + }, + { + "epoch": 17.545676998368677, + "grad_norm": 0.03456374630331993, + "learning_rate": 4.5181851862247544e-05, + "loss": 0.0022, + "num_input_tokens_seen": 232350960, + "step": 107555 + }, + { + "epoch": 17.546492659053833, + "grad_norm": 0.0005953084328211844, + "learning_rate": 4.51522879744784e-05, + "loss": 0.0029, + "num_input_tokens_seen": 232362416, + "step": 107560 + }, + { + "epoch": 17.54730831973899, + "grad_norm": 0.00016484300431329757, + "learning_rate": 4.5122733304775124e-05, + "loss": 0.002, + "num_input_tokens_seen": 232373168, + "step": 107565 + }, + { + "epoch": 17.548123980424144, + "grad_norm": 0.012281266041100025, + "learning_rate": 4.509318785373667e-05, + "loss": 0.0007, + "num_input_tokens_seen": 232382352, + "step": 107570 + }, + { + "epoch": 17.5489396411093, + "grad_norm": 0.002700270852074027, + "learning_rate": 4.506365162196191e-05, + "loss": 0.0011, + "num_input_tokens_seen": 232392304, + "step": 107575 + }, + { + "epoch": 17.549755301794452, + "grad_norm": 0.00035837123868986964, + "learning_rate": 4.503412461004935e-05, + "loss": 0.0099, + "num_input_tokens_seen": 232403152, + "step": 107580 + }, + { + "epoch": 17.550570962479608, + "grad_norm": 0.0021404859144240618, + "learning_rate": 4.500460681859742e-05, + "loss": 0.0009, + "num_input_tokens_seen": 232414960, + "step": 107585 + }, + { + "epoch": 17.551386623164763, + "grad_norm": 0.00026893772883340716, + "learning_rate": 4.4975098248204394e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232425904, + "step": 107590 + }, + { + "epoch": 17.55220228384992, + "grad_norm": 0.6711235046386719, + "learning_rate": 4.494559889946814e-05, + "loss": 0.0272, + "num_input_tokens_seen": 232437328, + "step": 107595 + }, + { + "epoch": 17.553017944535075, + "grad_norm": 0.00047256724792532623, + "learning_rate": 4.4916108772986686e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232448720, + "step": 107600 + }, + { + "epoch": 17.553833605220227, + "grad_norm": 0.0007653324282728136, + "learning_rate": 4.48866278693576e-05, + "loss": 0.0214, + "num_input_tokens_seen": 232459184, + "step": 107605 + }, + { + "epoch": 17.554649265905383, + "grad_norm": 0.0006618410698138177, + "learning_rate": 4.485715618917818e-05, + "loss": 0.0078, + "num_input_tokens_seen": 232468880, + "step": 107610 + }, + { + "epoch": 17.55546492659054, + "grad_norm": 0.001561740762554109, + "learning_rate": 4.482769373304613e-05, + "loss": 0.0009, + "num_input_tokens_seen": 232479312, + "step": 107615 + }, + { + "epoch": 17.556280587275694, + "grad_norm": 0.7048082947731018, + "learning_rate": 4.4798240501558115e-05, + "loss": 0.0768, + "num_input_tokens_seen": 232489712, + "step": 107620 + }, + { + "epoch": 17.55709624796085, + "grad_norm": 0.05319645628333092, + "learning_rate": 4.4768796495311406e-05, + "loss": 0.0021, + "num_input_tokens_seen": 232500368, + "step": 107625 + }, + { + "epoch": 17.557911908646002, + "grad_norm": 0.0017085699364542961, + "learning_rate": 4.473936171490228e-05, + "loss": 0.0025, + "num_input_tokens_seen": 232509616, + "step": 107630 + }, + { + "epoch": 17.558727569331158, + "grad_norm": 0.2203325629234314, + "learning_rate": 4.470993616092778e-05, + "loss": 0.0061, + "num_input_tokens_seen": 232521072, + "step": 107635 + }, + { + "epoch": 17.559543230016313, + "grad_norm": 0.014941530302166939, + "learning_rate": 4.46805198339838e-05, + "loss": 0.0025, + "num_input_tokens_seen": 232531984, + "step": 107640 + }, + { + "epoch": 17.56035889070147, + "grad_norm": 0.002396009163931012, + "learning_rate": 4.4651112734666874e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232543728, + "step": 107645 + }, + { + "epoch": 17.561174551386625, + "grad_norm": 0.010223069228231907, + "learning_rate": 4.462171486357264e-05, + "loss": 0.0009, + "num_input_tokens_seen": 232553456, + "step": 107650 + }, + { + "epoch": 17.561990212071777, + "grad_norm": 0.00022783187159802765, + "learning_rate": 4.459232622129722e-05, + "loss": 0.001, + "num_input_tokens_seen": 232564432, + "step": 107655 + }, + { + "epoch": 17.562805872756933, + "grad_norm": 0.001957368105649948, + "learning_rate": 4.4562946808435864e-05, + "loss": 0.0005, + "num_input_tokens_seen": 232574992, + "step": 107660 + }, + { + "epoch": 17.563621533442088, + "grad_norm": 1.1116943359375, + "learning_rate": 4.453357662558422e-05, + "loss": 0.1027, + "num_input_tokens_seen": 232585392, + "step": 107665 + }, + { + "epoch": 17.564437194127244, + "grad_norm": 0.0035373906139284372, + "learning_rate": 4.450421567333746e-05, + "loss": 0.0015, + "num_input_tokens_seen": 232596528, + "step": 107670 + }, + { + "epoch": 17.5652528548124, + "grad_norm": 0.004156868439167738, + "learning_rate": 4.447486395229061e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232607344, + "step": 107675 + }, + { + "epoch": 17.56606851549755, + "grad_norm": 0.004597888793796301, + "learning_rate": 4.4445521463038486e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232617584, + "step": 107680 + }, + { + "epoch": 17.566884176182707, + "grad_norm": 0.005564799532294273, + "learning_rate": 4.441618820617582e-05, + "loss": 0.0009, + "num_input_tokens_seen": 232628592, + "step": 107685 + }, + { + "epoch": 17.567699836867863, + "grad_norm": 0.014463113620877266, + "learning_rate": 4.438686418229698e-05, + "loss": 0.0015, + "num_input_tokens_seen": 232640080, + "step": 107690 + }, + { + "epoch": 17.56851549755302, + "grad_norm": 0.007256032433360815, + "learning_rate": 4.4357549391996376e-05, + "loss": 0.0019, + "num_input_tokens_seen": 232651920, + "step": 107695 + }, + { + "epoch": 17.569331158238175, + "grad_norm": 0.008663265034556389, + "learning_rate": 4.432824383586809e-05, + "loss": 0.0043, + "num_input_tokens_seen": 232663600, + "step": 107700 + }, + { + "epoch": 17.570146818923327, + "grad_norm": 0.04024729132652283, + "learning_rate": 4.429894751450597e-05, + "loss": 0.0033, + "num_input_tokens_seen": 232673808, + "step": 107705 + }, + { + "epoch": 17.570962479608482, + "grad_norm": 0.11653250455856323, + "learning_rate": 4.4269660428503774e-05, + "loss": 0.0042, + "num_input_tokens_seen": 232684816, + "step": 107710 + }, + { + "epoch": 17.571778140293638, + "grad_norm": 0.002741542411968112, + "learning_rate": 4.4240382578454915e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232695728, + "step": 107715 + }, + { + "epoch": 17.572593800978794, + "grad_norm": 0.2372535616159439, + "learning_rate": 4.4211113964953144e-05, + "loss": 0.0104, + "num_input_tokens_seen": 232705840, + "step": 107720 + }, + { + "epoch": 17.57340946166395, + "grad_norm": 0.0019646419677883387, + "learning_rate": 4.4181854588591085e-05, + "loss": 0.0023, + "num_input_tokens_seen": 232716720, + "step": 107725 + }, + { + "epoch": 17.5742251223491, + "grad_norm": 0.0009456843254156411, + "learning_rate": 4.415260444996222e-05, + "loss": 0.0012, + "num_input_tokens_seen": 232728944, + "step": 107730 + }, + { + "epoch": 17.575040783034257, + "grad_norm": 0.004937485791742802, + "learning_rate": 4.4123363549658955e-05, + "loss": 0.0027, + "num_input_tokens_seen": 232738576, + "step": 107735 + }, + { + "epoch": 17.575856443719413, + "grad_norm": 0.0002907993330154568, + "learning_rate": 4.409413188827416e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232749616, + "step": 107740 + }, + { + "epoch": 17.57667210440457, + "grad_norm": 0.006005709525197744, + "learning_rate": 4.4064909466400014e-05, + "loss": 0.002, + "num_input_tokens_seen": 232760496, + "step": 107745 + }, + { + "epoch": 17.57748776508972, + "grad_norm": 0.007954063825309277, + "learning_rate": 4.4035696284629e-05, + "loss": 0.0057, + "num_input_tokens_seen": 232771632, + "step": 107750 + }, + { + "epoch": 17.578303425774877, + "grad_norm": 0.0007008819957263768, + "learning_rate": 4.4006492343552915e-05, + "loss": 0.0011, + "num_input_tokens_seen": 232781936, + "step": 107755 + }, + { + "epoch": 17.579119086460032, + "grad_norm": 0.004294464364647865, + "learning_rate": 4.39772976437639e-05, + "loss": 0.0017, + "num_input_tokens_seen": 232793040, + "step": 107760 + }, + { + "epoch": 17.579934747145188, + "grad_norm": 0.0009277364588342607, + "learning_rate": 4.394811218585326e-05, + "loss": 0.001, + "num_input_tokens_seen": 232803216, + "step": 107765 + }, + { + "epoch": 17.580750407830344, + "grad_norm": 0.022793620824813843, + "learning_rate": 4.3918935970412796e-05, + "loss": 0.0013, + "num_input_tokens_seen": 232814544, + "step": 107770 + }, + { + "epoch": 17.581566068515496, + "grad_norm": 0.01750621385872364, + "learning_rate": 4.38897689980336e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232825872, + "step": 107775 + }, + { + "epoch": 17.58238172920065, + "grad_norm": 0.00042929017217829823, + "learning_rate": 4.386061126930696e-05, + "loss": 0.0009, + "num_input_tokens_seen": 232837136, + "step": 107780 + }, + { + "epoch": 17.583197389885807, + "grad_norm": 0.0012211805442348123, + "learning_rate": 4.3831462784823525e-05, + "loss": 0.0005, + "num_input_tokens_seen": 232848208, + "step": 107785 + }, + { + "epoch": 17.584013050570963, + "grad_norm": 0.07960768789052963, + "learning_rate": 4.380232354517433e-05, + "loss": 0.0063, + "num_input_tokens_seen": 232859248, + "step": 107790 + }, + { + "epoch": 17.58482871125612, + "grad_norm": 0.0004789176455233246, + "learning_rate": 4.3773193550949664e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232870224, + "step": 107795 + }, + { + "epoch": 17.58564437194127, + "grad_norm": 0.038380883634090424, + "learning_rate": 4.374407280274007e-05, + "loss": 0.0748, + "num_input_tokens_seen": 232880464, + "step": 107800 + }, + { + "epoch": 17.586460032626427, + "grad_norm": 0.007192968390882015, + "learning_rate": 4.371496130113561e-05, + "loss": 0.0016, + "num_input_tokens_seen": 232889936, + "step": 107805 + }, + { + "epoch": 17.587275693311582, + "grad_norm": 0.0016420006286352873, + "learning_rate": 4.3685859046726284e-05, + "loss": 0.0006, + "num_input_tokens_seen": 232900752, + "step": 107810 + }, + { + "epoch": 17.588091353996738, + "grad_norm": 0.0026500627864152193, + "learning_rate": 4.3656766040101933e-05, + "loss": 0.0013, + "num_input_tokens_seen": 232911600, + "step": 107815 + }, + { + "epoch": 17.588907014681894, + "grad_norm": 0.0004607281007338315, + "learning_rate": 4.362768228185216e-05, + "loss": 0.0092, + "num_input_tokens_seen": 232922352, + "step": 107820 + }, + { + "epoch": 17.589722675367046, + "grad_norm": 0.01853001117706299, + "learning_rate": 4.35986077725663e-05, + "loss": 0.0026, + "num_input_tokens_seen": 232931920, + "step": 107825 + }, + { + "epoch": 17.5905383360522, + "grad_norm": 0.0008818014757707715, + "learning_rate": 4.3569542512833684e-05, + "loss": 0.0059, + "num_input_tokens_seen": 232942160, + "step": 107830 + }, + { + "epoch": 17.591353996737357, + "grad_norm": 0.005279912613332272, + "learning_rate": 4.354048650324327e-05, + "loss": 0.0793, + "num_input_tokens_seen": 232953424, + "step": 107835 + }, + { + "epoch": 17.592169657422513, + "grad_norm": 0.001021630479954183, + "learning_rate": 4.3511439744383984e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232964240, + "step": 107840 + }, + { + "epoch": 17.59298531810767, + "grad_norm": 0.002027069916948676, + "learning_rate": 4.348240223684447e-05, + "loss": 0.0032, + "num_input_tokens_seen": 232974576, + "step": 107845 + }, + { + "epoch": 17.59380097879282, + "grad_norm": 0.0007032614084891975, + "learning_rate": 4.3453373981213184e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232986032, + "step": 107850 + }, + { + "epoch": 17.594616639477977, + "grad_norm": 0.002503114752471447, + "learning_rate": 4.342435497807845e-05, + "loss": 0.0011, + "num_input_tokens_seen": 232997104, + "step": 107855 + }, + { + "epoch": 17.595432300163132, + "grad_norm": 0.0064894710667431355, + "learning_rate": 4.3395345228028294e-05, + "loss": 0.0008, + "num_input_tokens_seen": 233008208, + "step": 107860 + }, + { + "epoch": 17.596247960848288, + "grad_norm": 0.000594555342104286, + "learning_rate": 4.336634473165091e-05, + "loss": 0.0028, + "num_input_tokens_seen": 233019888, + "step": 107865 + }, + { + "epoch": 17.597063621533444, + "grad_norm": 0.5004954934120178, + "learning_rate": 4.3337353489533606e-05, + "loss": 0.0346, + "num_input_tokens_seen": 233029872, + "step": 107870 + }, + { + "epoch": 17.597879282218596, + "grad_norm": 0.00310205458663404, + "learning_rate": 4.3308371502264355e-05, + "loss": 0.0007, + "num_input_tokens_seen": 233041136, + "step": 107875 + }, + { + "epoch": 17.59869494290375, + "grad_norm": 0.010869835503399372, + "learning_rate": 4.327939877043013e-05, + "loss": 0.0012, + "num_input_tokens_seen": 233051216, + "step": 107880 + }, + { + "epoch": 17.599510603588907, + "grad_norm": 0.8642117977142334, + "learning_rate": 4.3250435294618473e-05, + "loss": 0.0312, + "num_input_tokens_seen": 233062064, + "step": 107885 + }, + { + "epoch": 17.600326264274063, + "grad_norm": 0.0018593736458569765, + "learning_rate": 4.322148107541596e-05, + "loss": 0.0011, + "num_input_tokens_seen": 233072208, + "step": 107890 + }, + { + "epoch": 17.601141924959215, + "grad_norm": 0.008606432005763054, + "learning_rate": 4.3192536113409785e-05, + "loss": 0.0014, + "num_input_tokens_seen": 233082224, + "step": 107895 + }, + { + "epoch": 17.60195758564437, + "grad_norm": 0.006274912506341934, + "learning_rate": 4.316360040918621e-05, + "loss": 0.0048, + "num_input_tokens_seen": 233092944, + "step": 107900 + }, + { + "epoch": 17.602773246329527, + "grad_norm": 0.0020932599436491728, + "learning_rate": 4.3134673963331985e-05, + "loss": 0.0009, + "num_input_tokens_seen": 233102576, + "step": 107905 + }, + { + "epoch": 17.603588907014682, + "grad_norm": 0.0031999878119677305, + "learning_rate": 4.310575677643297e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233112944, + "step": 107910 + }, + { + "epoch": 17.604404567699838, + "grad_norm": 0.050068553537130356, + "learning_rate": 4.307684884907559e-05, + "loss": 0.0041, + "num_input_tokens_seen": 233123536, + "step": 107915 + }, + { + "epoch": 17.605220228384994, + "grad_norm": 0.0012258957140147686, + "learning_rate": 4.304795018184537e-05, + "loss": 0.0007, + "num_input_tokens_seen": 233133744, + "step": 107920 + }, + { + "epoch": 17.606035889070146, + "grad_norm": 0.02541196160018444, + "learning_rate": 4.3019060775328186e-05, + "loss": 0.0008, + "num_input_tokens_seen": 233144720, + "step": 107925 + }, + { + "epoch": 17.6068515497553, + "grad_norm": 0.004853926599025726, + "learning_rate": 4.2990180630109455e-05, + "loss": 0.0009, + "num_input_tokens_seen": 233154576, + "step": 107930 + }, + { + "epoch": 17.607667210440457, + "grad_norm": 0.012128345668315887, + "learning_rate": 4.296130974677448e-05, + "loss": 0.0015, + "num_input_tokens_seen": 233164976, + "step": 107935 + }, + { + "epoch": 17.608482871125613, + "grad_norm": 0.006950197741389275, + "learning_rate": 4.293244812590835e-05, + "loss": 0.0029, + "num_input_tokens_seen": 233176528, + "step": 107940 + }, + { + "epoch": 17.609298531810765, + "grad_norm": 0.0022767765913158655, + "learning_rate": 4.2903595768095995e-05, + "loss": 0.0017, + "num_input_tokens_seen": 233187248, + "step": 107945 + }, + { + "epoch": 17.61011419249592, + "grad_norm": 0.0008330377168022096, + "learning_rate": 4.28747526739221e-05, + "loss": 0.0007, + "num_input_tokens_seen": 233197136, + "step": 107950 + }, + { + "epoch": 17.610929853181077, + "grad_norm": 0.00958797987550497, + "learning_rate": 4.284591884397132e-05, + "loss": 0.0006, + "num_input_tokens_seen": 233208720, + "step": 107955 + }, + { + "epoch": 17.611745513866232, + "grad_norm": 0.024692602455615997, + "learning_rate": 4.281709427882791e-05, + "loss": 0.0048, + "num_input_tokens_seen": 233218704, + "step": 107960 + }, + { + "epoch": 17.612561174551388, + "grad_norm": 0.017007293179631233, + "learning_rate": 4.2788278979076003e-05, + "loss": 0.0026, + "num_input_tokens_seen": 233229616, + "step": 107965 + }, + { + "epoch": 17.61337683523654, + "grad_norm": 0.0005782050429843366, + "learning_rate": 4.275947294529969e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233240080, + "step": 107970 + }, + { + "epoch": 17.614192495921696, + "grad_norm": 0.0035270596854388714, + "learning_rate": 4.2730676178082736e-05, + "loss": 0.0013, + "num_input_tokens_seen": 233250480, + "step": 107975 + }, + { + "epoch": 17.61500815660685, + "grad_norm": 0.0002987095504067838, + "learning_rate": 4.2701888678008674e-05, + "loss": 0.0012, + "num_input_tokens_seen": 233261456, + "step": 107980 + }, + { + "epoch": 17.615823817292007, + "grad_norm": 0.004291311372071505, + "learning_rate": 4.267311044566097e-05, + "loss": 0.001, + "num_input_tokens_seen": 233272144, + "step": 107985 + }, + { + "epoch": 17.616639477977163, + "grad_norm": 0.001111071789637208, + "learning_rate": 4.2644341481622825e-05, + "loss": 0.0039, + "num_input_tokens_seen": 233281200, + "step": 107990 + }, + { + "epoch": 17.617455138662315, + "grad_norm": 0.0010424138745293021, + "learning_rate": 4.2615581786477234e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233291888, + "step": 107995 + }, + { + "epoch": 17.61827079934747, + "grad_norm": 0.0060048531740903854, + "learning_rate": 4.2586831360807265e-05, + "loss": 0.0005, + "num_input_tokens_seen": 233302704, + "step": 108000 + }, + { + "epoch": 17.619086460032626, + "grad_norm": 0.011316410265862942, + "learning_rate": 4.25580902051953e-05, + "loss": 0.0026, + "num_input_tokens_seen": 233314192, + "step": 108005 + }, + { + "epoch": 17.619902120717782, + "grad_norm": 0.08108188211917877, + "learning_rate": 4.252935832022409e-05, + "loss": 0.0041, + "num_input_tokens_seen": 233322992, + "step": 108010 + }, + { + "epoch": 17.620717781402938, + "grad_norm": 0.0003150638658553362, + "learning_rate": 4.250063570647561e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233334352, + "step": 108015 + }, + { + "epoch": 17.62153344208809, + "grad_norm": 0.008647634647786617, + "learning_rate": 4.247192236453229e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233344784, + "step": 108020 + }, + { + "epoch": 17.622349102773246, + "grad_norm": 0.015910470858216286, + "learning_rate": 4.244321829497566e-05, + "loss": 0.0042, + "num_input_tokens_seen": 233356080, + "step": 108025 + }, + { + "epoch": 17.6231647634584, + "grad_norm": 0.001022401382215321, + "learning_rate": 4.2414523498387926e-05, + "loss": 0.0009, + "num_input_tokens_seen": 233367440, + "step": 108030 + }, + { + "epoch": 17.623980424143557, + "grad_norm": 0.0027995568234473467, + "learning_rate": 4.2385837975350115e-05, + "loss": 0.0122, + "num_input_tokens_seen": 233378640, + "step": 108035 + }, + { + "epoch": 17.624796084828713, + "grad_norm": 0.013003799133002758, + "learning_rate": 4.235716172644394e-05, + "loss": 0.0843, + "num_input_tokens_seen": 233388112, + "step": 108040 + }, + { + "epoch": 17.625611745513865, + "grad_norm": 0.002790980041027069, + "learning_rate": 4.232849475225048e-05, + "loss": 0.002, + "num_input_tokens_seen": 233398992, + "step": 108045 + }, + { + "epoch": 17.62642740619902, + "grad_norm": 0.0019258302636444569, + "learning_rate": 4.2299837053350606e-05, + "loss": 0.001, + "num_input_tokens_seen": 233409072, + "step": 108050 + }, + { + "epoch": 17.627243066884176, + "grad_norm": 0.002759368624538183, + "learning_rate": 4.2271188630325195e-05, + "loss": 0.0068, + "num_input_tokens_seen": 233419664, + "step": 108055 + }, + { + "epoch": 17.628058727569332, + "grad_norm": 0.0022524246014654636, + "learning_rate": 4.2242549483754836e-05, + "loss": 0.0038, + "num_input_tokens_seen": 233430480, + "step": 108060 + }, + { + "epoch": 17.628874388254488, + "grad_norm": 0.003963864874094725, + "learning_rate": 4.221391961421989e-05, + "loss": 0.0021, + "num_input_tokens_seen": 233440400, + "step": 108065 + }, + { + "epoch": 17.62969004893964, + "grad_norm": 0.00029403064399957657, + "learning_rate": 4.218529902230062e-05, + "loss": 0.0119, + "num_input_tokens_seen": 233450896, + "step": 108070 + }, + { + "epoch": 17.630505709624796, + "grad_norm": 0.003268659580498934, + "learning_rate": 4.2156687708577e-05, + "loss": 0.0006, + "num_input_tokens_seen": 233460048, + "step": 108075 + }, + { + "epoch": 17.63132137030995, + "grad_norm": 0.0030870982445776463, + "learning_rate": 4.212808567362897e-05, + "loss": 0.0006, + "num_input_tokens_seen": 233471568, + "step": 108080 + }, + { + "epoch": 17.632137030995107, + "grad_norm": 0.031759873032569885, + "learning_rate": 4.209949291803611e-05, + "loss": 0.0028, + "num_input_tokens_seen": 233481904, + "step": 108085 + }, + { + "epoch": 17.63295269168026, + "grad_norm": 0.0062539586797356606, + "learning_rate": 4.207090944237796e-05, + "loss": 0.0017, + "num_input_tokens_seen": 233491408, + "step": 108090 + }, + { + "epoch": 17.633768352365415, + "grad_norm": 0.010095684789121151, + "learning_rate": 4.204233524723372e-05, + "loss": 0.0016, + "num_input_tokens_seen": 233501168, + "step": 108095 + }, + { + "epoch": 17.63458401305057, + "grad_norm": 0.0003338803071528673, + "learning_rate": 4.201377033318249e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233512336, + "step": 108100 + }, + { + "epoch": 17.635399673735726, + "grad_norm": 0.05965923145413399, + "learning_rate": 4.198521470080324e-05, + "loss": 0.0024, + "num_input_tokens_seen": 233522256, + "step": 108105 + }, + { + "epoch": 17.636215334420882, + "grad_norm": 0.0004708467167802155, + "learning_rate": 4.195666835067463e-05, + "loss": 0.0247, + "num_input_tokens_seen": 233531824, + "step": 108110 + }, + { + "epoch": 17.637030995106034, + "grad_norm": 0.1438266485929489, + "learning_rate": 4.1928131283375246e-05, + "loss": 0.005, + "num_input_tokens_seen": 233544368, + "step": 108115 + }, + { + "epoch": 17.63784665579119, + "grad_norm": 0.0019570267759263515, + "learning_rate": 4.189960349948335e-05, + "loss": 0.0006, + "num_input_tokens_seen": 233555440, + "step": 108120 + }, + { + "epoch": 17.638662316476346, + "grad_norm": 0.002199590904638171, + "learning_rate": 4.1871084999577146e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233565840, + "step": 108125 + }, + { + "epoch": 17.6394779771615, + "grad_norm": 0.013834419660270214, + "learning_rate": 4.184257578423456e-05, + "loss": 0.0008, + "num_input_tokens_seen": 233575856, + "step": 108130 + }, + { + "epoch": 17.640293637846657, + "grad_norm": 0.0027103605680167675, + "learning_rate": 4.1814075854033405e-05, + "loss": 0.0011, + "num_input_tokens_seen": 233588624, + "step": 108135 + }, + { + "epoch": 17.64110929853181, + "grad_norm": 0.0007338287541642785, + "learning_rate": 4.178558520955117e-05, + "loss": 0.0013, + "num_input_tokens_seen": 233599696, + "step": 108140 + }, + { + "epoch": 17.641924959216965, + "grad_norm": 0.004689326509833336, + "learning_rate": 4.175710385136539e-05, + "loss": 0.0329, + "num_input_tokens_seen": 233610928, + "step": 108145 + }, + { + "epoch": 17.64274061990212, + "grad_norm": 0.0011317178141325712, + "learning_rate": 4.172863178005326e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233622160, + "step": 108150 + }, + { + "epoch": 17.643556280587276, + "grad_norm": 0.0009731571190059185, + "learning_rate": 4.1700168996191726e-05, + "loss": 0.0011, + "num_input_tokens_seen": 233632208, + "step": 108155 + }, + { + "epoch": 17.644371941272432, + "grad_norm": 0.0007394000422209501, + "learning_rate": 4.16717155003577e-05, + "loss": 0.0027, + "num_input_tokens_seen": 233642480, + "step": 108160 + }, + { + "epoch": 17.645187601957584, + "grad_norm": 0.00035144094727002084, + "learning_rate": 4.164327129312778e-05, + "loss": 0.0015, + "num_input_tokens_seen": 233653680, + "step": 108165 + }, + { + "epoch": 17.64600326264274, + "grad_norm": 0.004731602966785431, + "learning_rate": 4.161483637507846e-05, + "loss": 0.0005, + "num_input_tokens_seen": 233665712, + "step": 108170 + }, + { + "epoch": 17.646818923327896, + "grad_norm": 0.023019464686512947, + "learning_rate": 4.1586410746785927e-05, + "loss": 0.0017, + "num_input_tokens_seen": 233674896, + "step": 108175 + }, + { + "epoch": 17.64763458401305, + "grad_norm": 0.0024400795809924603, + "learning_rate": 4.155799440882635e-05, + "loss": 0.0031, + "num_input_tokens_seen": 233685424, + "step": 108180 + }, + { + "epoch": 17.648450244698207, + "grad_norm": 0.005999886896461248, + "learning_rate": 4.152958736177559e-05, + "loss": 0.0005, + "num_input_tokens_seen": 233696688, + "step": 108185 + }, + { + "epoch": 17.64926590538336, + "grad_norm": 0.0010004190262407064, + "learning_rate": 4.1501189606209356e-05, + "loss": 0.0008, + "num_input_tokens_seen": 233708304, + "step": 108190 + }, + { + "epoch": 17.650081566068515, + "grad_norm": 0.007027873769402504, + "learning_rate": 4.147280114270319e-05, + "loss": 0.0017, + "num_input_tokens_seen": 233720304, + "step": 108195 + }, + { + "epoch": 17.65089722675367, + "grad_norm": 0.007540034130215645, + "learning_rate": 4.1444421971832346e-05, + "loss": 0.0008, + "num_input_tokens_seen": 233729328, + "step": 108200 + }, + { + "epoch": 17.651712887438826, + "grad_norm": 0.004675657954066992, + "learning_rate": 4.1416052094171985e-05, + "loss": 0.0014, + "num_input_tokens_seen": 233739408, + "step": 108205 + }, + { + "epoch": 17.652528548123982, + "grad_norm": 0.0004989549051970243, + "learning_rate": 4.1387691510297146e-05, + "loss": 0.0015, + "num_input_tokens_seen": 233750544, + "step": 108210 + }, + { + "epoch": 17.653344208809134, + "grad_norm": 0.00018444911984261125, + "learning_rate": 4.1359340220782524e-05, + "loss": 0.0011, + "num_input_tokens_seen": 233761328, + "step": 108215 + }, + { + "epoch": 17.65415986949429, + "grad_norm": 0.004228262230753899, + "learning_rate": 4.133099822620268e-05, + "loss": 0.0022, + "num_input_tokens_seen": 233772176, + "step": 108220 + }, + { + "epoch": 17.654975530179446, + "grad_norm": 0.002176476875320077, + "learning_rate": 4.130266552713202e-05, + "loss": 0.0405, + "num_input_tokens_seen": 233782992, + "step": 108225 + }, + { + "epoch": 17.6557911908646, + "grad_norm": 0.0019155082991346717, + "learning_rate": 4.1274342124144713e-05, + "loss": 0.0012, + "num_input_tokens_seen": 233793936, + "step": 108230 + }, + { + "epoch": 17.656606851549757, + "grad_norm": 0.002272221725434065, + "learning_rate": 4.124602801781485e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233804336, + "step": 108235 + }, + { + "epoch": 17.65742251223491, + "grad_norm": 0.0009390924824401736, + "learning_rate": 4.1217723208716196e-05, + "loss": 0.001, + "num_input_tokens_seen": 233816304, + "step": 108240 + }, + { + "epoch": 17.658238172920065, + "grad_norm": 0.00035862938966602087, + "learning_rate": 4.118942769742234e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233826864, + "step": 108245 + }, + { + "epoch": 17.65905383360522, + "grad_norm": 0.6004579663276672, + "learning_rate": 4.116114148450673e-05, + "loss": 0.031, + "num_input_tokens_seen": 233838288, + "step": 108250 + }, + { + "epoch": 17.659869494290376, + "grad_norm": 0.0022010619286447763, + "learning_rate": 4.113286457054283e-05, + "loss": 0.0008, + "num_input_tokens_seen": 233848784, + "step": 108255 + }, + { + "epoch": 17.660685154975532, + "grad_norm": 0.0006814883090555668, + "learning_rate": 4.1104596956103356e-05, + "loss": 0.001, + "num_input_tokens_seen": 233859472, + "step": 108260 + }, + { + "epoch": 17.661500815660684, + "grad_norm": 0.06318142265081406, + "learning_rate": 4.107633864176158e-05, + "loss": 0.0045, + "num_input_tokens_seen": 233871280, + "step": 108265 + }, + { + "epoch": 17.66231647634584, + "grad_norm": 0.022026631981134415, + "learning_rate": 4.104808962808976e-05, + "loss": 0.001, + "num_input_tokens_seen": 233882160, + "step": 108270 + }, + { + "epoch": 17.663132137030995, + "grad_norm": 0.01632312871515751, + "learning_rate": 4.101984991566082e-05, + "loss": 0.0013, + "num_input_tokens_seen": 233892816, + "step": 108275 + }, + { + "epoch": 17.66394779771615, + "grad_norm": 0.00041980453534051776, + "learning_rate": 4.0991619505046764e-05, + "loss": 0.0007, + "num_input_tokens_seen": 233903984, + "step": 108280 + }, + { + "epoch": 17.664763458401303, + "grad_norm": 0.0003276610223110765, + "learning_rate": 4.096339839681984e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233914096, + "step": 108285 + }, + { + "epoch": 17.66557911908646, + "grad_norm": 0.021783655509352684, + "learning_rate": 4.0935186591552044e-05, + "loss": 0.0051, + "num_input_tokens_seen": 233925392, + "step": 108290 + }, + { + "epoch": 17.666394779771615, + "grad_norm": 0.0006872511585243046, + "learning_rate": 4.0906984089815026e-05, + "loss": 0.0056, + "num_input_tokens_seen": 233935888, + "step": 108295 + }, + { + "epoch": 17.66721044045677, + "grad_norm": 0.03984300047159195, + "learning_rate": 4.087879089218033e-05, + "loss": 0.0017, + "num_input_tokens_seen": 233946672, + "step": 108300 + }, + { + "epoch": 17.668026101141926, + "grad_norm": 0.0005269440589472651, + "learning_rate": 4.085060699921944e-05, + "loss": 0.0017, + "num_input_tokens_seen": 233958256, + "step": 108305 + }, + { + "epoch": 17.66884176182708, + "grad_norm": 0.0010156655916944146, + "learning_rate": 4.0822432411503464e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233969520, + "step": 108310 + }, + { + "epoch": 17.669657422512234, + "grad_norm": 0.0011098864488303661, + "learning_rate": 4.079426712960338e-05, + "loss": 0.0042, + "num_input_tokens_seen": 233979632, + "step": 108315 + }, + { + "epoch": 17.67047308319739, + "grad_norm": 0.006729105953127146, + "learning_rate": 4.076611115409001e-05, + "loss": 0.0247, + "num_input_tokens_seen": 233990416, + "step": 108320 + }, + { + "epoch": 17.671288743882545, + "grad_norm": 0.014925811439752579, + "learning_rate": 4.073796448553402e-05, + "loss": 0.002, + "num_input_tokens_seen": 234000624, + "step": 108325 + }, + { + "epoch": 17.6721044045677, + "grad_norm": 0.0018322813557460904, + "learning_rate": 4.070982712450571e-05, + "loss": 0.0016, + "num_input_tokens_seen": 234010928, + "step": 108330 + }, + { + "epoch": 17.672920065252853, + "grad_norm": 0.0005330965504981577, + "learning_rate": 4.068169907157548e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234021776, + "step": 108335 + }, + { + "epoch": 17.67373572593801, + "grad_norm": 1.1490570306777954, + "learning_rate": 4.065358032731331e-05, + "loss": 0.2548, + "num_input_tokens_seen": 234033168, + "step": 108340 + }, + { + "epoch": 17.674551386623165, + "grad_norm": 0.0013266679598018527, + "learning_rate": 4.062547089228902e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234044368, + "step": 108345 + }, + { + "epoch": 17.67536704730832, + "grad_norm": 0.0019488186808302999, + "learning_rate": 4.0597370767072315e-05, + "loss": 0.0014, + "num_input_tokens_seen": 234054544, + "step": 108350 + }, + { + "epoch": 17.676182707993476, + "grad_norm": 0.002594847697764635, + "learning_rate": 4.056927995223264e-05, + "loss": 0.0009, + "num_input_tokens_seen": 234065808, + "step": 108355 + }, + { + "epoch": 17.67699836867863, + "grad_norm": 0.04415489733219147, + "learning_rate": 4.054119844833948e-05, + "loss": 0.0021, + "num_input_tokens_seen": 234075920, + "step": 108360 + }, + { + "epoch": 17.677814029363784, + "grad_norm": 0.008037679828703403, + "learning_rate": 4.0513126255961594e-05, + "loss": 0.0006, + "num_input_tokens_seen": 234086608, + "step": 108365 + }, + { + "epoch": 17.67862969004894, + "grad_norm": 0.005952348466962576, + "learning_rate": 4.0485063375668316e-05, + "loss": 0.1419, + "num_input_tokens_seen": 234096560, + "step": 108370 + }, + { + "epoch": 17.679445350734095, + "grad_norm": 0.0006694883340969682, + "learning_rate": 4.045700980802802e-05, + "loss": 0.0012, + "num_input_tokens_seen": 234108528, + "step": 108375 + }, + { + "epoch": 17.68026101141925, + "grad_norm": 0.00230517890304327, + "learning_rate": 4.042896555360953e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234118416, + "step": 108380 + }, + { + "epoch": 17.681076672104403, + "grad_norm": 0.05230387672781944, + "learning_rate": 4.040093061298089e-05, + "loss": 0.004, + "num_input_tokens_seen": 234128688, + "step": 108385 + }, + { + "epoch": 17.68189233278956, + "grad_norm": 0.005648719146847725, + "learning_rate": 4.037290498671059e-05, + "loss": 0.0005, + "num_input_tokens_seen": 234139376, + "step": 108390 + }, + { + "epoch": 17.682707993474715, + "grad_norm": 0.035768892616033554, + "learning_rate": 4.0344888675366285e-05, + "loss": 0.0066, + "num_input_tokens_seen": 234149232, + "step": 108395 + }, + { + "epoch": 17.68352365415987, + "grad_norm": 0.010486296378076077, + "learning_rate": 4.031688167951614e-05, + "loss": 0.0038, + "num_input_tokens_seen": 234159248, + "step": 108400 + }, + { + "epoch": 17.684339314845026, + "grad_norm": 0.0009895983384922147, + "learning_rate": 4.02888839997273e-05, + "loss": 0.0093, + "num_input_tokens_seen": 234168208, + "step": 108405 + }, + { + "epoch": 17.68515497553018, + "grad_norm": 0.007241661660373211, + "learning_rate": 4.0260895636567654e-05, + "loss": 0.0007, + "num_input_tokens_seen": 234178768, + "step": 108410 + }, + { + "epoch": 17.685970636215334, + "grad_norm": 0.006733461283147335, + "learning_rate": 4.0232916590603964e-05, + "loss": 0.0011, + "num_input_tokens_seen": 234190096, + "step": 108415 + }, + { + "epoch": 17.68678629690049, + "grad_norm": 0.017089251428842545, + "learning_rate": 4.020494686240361e-05, + "loss": 0.122, + "num_input_tokens_seen": 234200592, + "step": 108420 + }, + { + "epoch": 17.687601957585645, + "grad_norm": 0.15010204911231995, + "learning_rate": 4.017698645253321e-05, + "loss": 0.004, + "num_input_tokens_seen": 234211120, + "step": 108425 + }, + { + "epoch": 17.6884176182708, + "grad_norm": 0.05645974352955818, + "learning_rate": 4.0149035361559504e-05, + "loss": 0.001, + "num_input_tokens_seen": 234222576, + "step": 108430 + }, + { + "epoch": 17.689233278955953, + "grad_norm": 0.006962623447179794, + "learning_rate": 4.0121093590049004e-05, + "loss": 0.0013, + "num_input_tokens_seen": 234233808, + "step": 108435 + }, + { + "epoch": 17.69004893964111, + "grad_norm": 0.0016587387071922421, + "learning_rate": 4.009316113856798e-05, + "loss": 0.0009, + "num_input_tokens_seen": 234244688, + "step": 108440 + }, + { + "epoch": 17.690864600326265, + "grad_norm": 0.0021991583053022623, + "learning_rate": 4.0065238007682414e-05, + "loss": 0.0019, + "num_input_tokens_seen": 234255024, + "step": 108445 + }, + { + "epoch": 17.69168026101142, + "grad_norm": 0.04479004442691803, + "learning_rate": 4.0037324197958304e-05, + "loss": 0.0021, + "num_input_tokens_seen": 234266480, + "step": 108450 + }, + { + "epoch": 17.692495921696576, + "grad_norm": 0.0026538416277617216, + "learning_rate": 4.00094197099613e-05, + "loss": 0.0011, + "num_input_tokens_seen": 234277456, + "step": 108455 + }, + { + "epoch": 17.693311582381728, + "grad_norm": 0.003920292016118765, + "learning_rate": 3.9981524544256964e-05, + "loss": 0.0593, + "num_input_tokens_seen": 234287536, + "step": 108460 + }, + { + "epoch": 17.694127243066884, + "grad_norm": 0.001578305964358151, + "learning_rate": 3.995363870141061e-05, + "loss": 0.001, + "num_input_tokens_seen": 234298352, + "step": 108465 + }, + { + "epoch": 17.69494290375204, + "grad_norm": 0.013417751528322697, + "learning_rate": 3.9925762181987345e-05, + "loss": 0.0026, + "num_input_tokens_seen": 234309488, + "step": 108470 + }, + { + "epoch": 17.695758564437195, + "grad_norm": 0.0006595024606212974, + "learning_rate": 3.9897894986552216e-05, + "loss": 0.0009, + "num_input_tokens_seen": 234319216, + "step": 108475 + }, + { + "epoch": 17.696574225122347, + "grad_norm": 0.005656297784298658, + "learning_rate": 3.987003711566978e-05, + "loss": 0.0275, + "num_input_tokens_seen": 234329776, + "step": 108480 + }, + { + "epoch": 17.697389885807503, + "grad_norm": 0.005397633649408817, + "learning_rate": 3.984218856990496e-05, + "loss": 0.0017, + "num_input_tokens_seen": 234341488, + "step": 108485 + }, + { + "epoch": 17.69820554649266, + "grad_norm": 0.0029238967690616846, + "learning_rate": 3.981434934982176e-05, + "loss": 0.001, + "num_input_tokens_seen": 234352144, + "step": 108490 + }, + { + "epoch": 17.699021207177815, + "grad_norm": 0.0007100607035681605, + "learning_rate": 3.978651945598472e-05, + "loss": 0.0681, + "num_input_tokens_seen": 234364208, + "step": 108495 + }, + { + "epoch": 17.69983686786297, + "grad_norm": 0.002662144135683775, + "learning_rate": 3.975869888895756e-05, + "loss": 0.0008, + "num_input_tokens_seen": 234374320, + "step": 108500 + }, + { + "epoch": 17.700652528548122, + "grad_norm": 0.000382350233849138, + "learning_rate": 3.973088764930433e-05, + "loss": 0.0054, + "num_input_tokens_seen": 234384368, + "step": 108505 + }, + { + "epoch": 17.701468189233278, + "grad_norm": 0.0005653170519508421, + "learning_rate": 3.9703085737588405e-05, + "loss": 0.0072, + "num_input_tokens_seen": 234395856, + "step": 108510 + }, + { + "epoch": 17.702283849918434, + "grad_norm": 0.11164949834346771, + "learning_rate": 3.967529315437357e-05, + "loss": 0.0031, + "num_input_tokens_seen": 234406576, + "step": 108515 + }, + { + "epoch": 17.70309951060359, + "grad_norm": 0.00023540828260593116, + "learning_rate": 3.96475099002227e-05, + "loss": 0.0034, + "num_input_tokens_seen": 234417904, + "step": 108520 + }, + { + "epoch": 17.703915171288745, + "grad_norm": 0.0013595453929156065, + "learning_rate": 3.9619735975699236e-05, + "loss": 0.0016, + "num_input_tokens_seen": 234428240, + "step": 108525 + }, + { + "epoch": 17.704730831973897, + "grad_norm": 0.004526201635599136, + "learning_rate": 3.9591971381365665e-05, + "loss": 0.0019, + "num_input_tokens_seen": 234439152, + "step": 108530 + }, + { + "epoch": 17.705546492659053, + "grad_norm": 0.07590554654598236, + "learning_rate": 3.956421611778499e-05, + "loss": 0.0046, + "num_input_tokens_seen": 234449744, + "step": 108535 + }, + { + "epoch": 17.70636215334421, + "grad_norm": 0.005136074032634497, + "learning_rate": 3.953647018551948e-05, + "loss": 0.001, + "num_input_tokens_seen": 234461520, + "step": 108540 + }, + { + "epoch": 17.707177814029365, + "grad_norm": 0.0015011136420071125, + "learning_rate": 3.950873358513168e-05, + "loss": 0.0014, + "num_input_tokens_seen": 234473232, + "step": 108545 + }, + { + "epoch": 17.70799347471452, + "grad_norm": 0.023294158279895782, + "learning_rate": 3.948100631718338e-05, + "loss": 0.0009, + "num_input_tokens_seen": 234483504, + "step": 108550 + }, + { + "epoch": 17.708809135399672, + "grad_norm": 0.0010147414868697524, + "learning_rate": 3.945328838223688e-05, + "loss": 0.0016, + "num_input_tokens_seen": 234494512, + "step": 108555 + }, + { + "epoch": 17.709624796084828, + "grad_norm": 0.12862026691436768, + "learning_rate": 3.942557978085354e-05, + "loss": 0.0126, + "num_input_tokens_seen": 234505008, + "step": 108560 + }, + { + "epoch": 17.710440456769984, + "grad_norm": 0.00031221084645949304, + "learning_rate": 3.939788051359522e-05, + "loss": 0.0023, + "num_input_tokens_seen": 234515664, + "step": 108565 + }, + { + "epoch": 17.71125611745514, + "grad_norm": 0.0004358456062618643, + "learning_rate": 3.93701905810232e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234523856, + "step": 108570 + }, + { + "epoch": 17.712071778140295, + "grad_norm": 0.0015146576333791018, + "learning_rate": 3.934250998369859e-05, + "loss": 0.0023, + "num_input_tokens_seen": 234534480, + "step": 108575 + }, + { + "epoch": 17.712887438825447, + "grad_norm": 0.2730502784252167, + "learning_rate": 3.931483872218239e-05, + "loss": 0.0069, + "num_input_tokens_seen": 234545040, + "step": 108580 + }, + { + "epoch": 17.713703099510603, + "grad_norm": 0.0036683762446045876, + "learning_rate": 3.928717679703542e-05, + "loss": 0.0008, + "num_input_tokens_seen": 234556176, + "step": 108585 + }, + { + "epoch": 17.71451876019576, + "grad_norm": 0.5490561127662659, + "learning_rate": 3.925952420881823e-05, + "loss": 0.0327, + "num_input_tokens_seen": 234567568, + "step": 108590 + }, + { + "epoch": 17.715334420880914, + "grad_norm": 0.0008095699595287442, + "learning_rate": 3.9231880958091325e-05, + "loss": 0.0007, + "num_input_tokens_seen": 234579792, + "step": 108595 + }, + { + "epoch": 17.71615008156607, + "grad_norm": 0.0022214106284081936, + "learning_rate": 3.920424704541481e-05, + "loss": 0.0417, + "num_input_tokens_seen": 234590448, + "step": 108600 + }, + { + "epoch": 17.716965742251222, + "grad_norm": 0.0017520349938422441, + "learning_rate": 3.9176622471348845e-05, + "loss": 0.0012, + "num_input_tokens_seen": 234602992, + "step": 108605 + }, + { + "epoch": 17.717781402936378, + "grad_norm": 0.006847703829407692, + "learning_rate": 3.9149007236453204e-05, + "loss": 0.0625, + "num_input_tokens_seen": 234613840, + "step": 108610 + }, + { + "epoch": 17.718597063621534, + "grad_norm": 0.02551284059882164, + "learning_rate": 3.912140134128761e-05, + "loss": 0.002, + "num_input_tokens_seen": 234624976, + "step": 108615 + }, + { + "epoch": 17.71941272430669, + "grad_norm": 0.03996856510639191, + "learning_rate": 3.909380478641139e-05, + "loss": 0.003, + "num_input_tokens_seen": 234635760, + "step": 108620 + }, + { + "epoch": 17.72022838499184, + "grad_norm": 0.014086296781897545, + "learning_rate": 3.906621757238393e-05, + "loss": 0.0031, + "num_input_tokens_seen": 234646832, + "step": 108625 + }, + { + "epoch": 17.721044045676997, + "grad_norm": 0.11001487076282501, + "learning_rate": 3.90386396997644e-05, + "loss": 0.0038, + "num_input_tokens_seen": 234657200, + "step": 108630 + }, + { + "epoch": 17.721859706362153, + "grad_norm": 0.0010492857545614243, + "learning_rate": 3.901107116911145e-05, + "loss": 0.0018, + "num_input_tokens_seen": 234666928, + "step": 108635 + }, + { + "epoch": 17.72267536704731, + "grad_norm": 0.0010918010957539082, + "learning_rate": 3.8983511980984154e-05, + "loss": 0.0006, + "num_input_tokens_seen": 234677424, + "step": 108640 + }, + { + "epoch": 17.723491027732464, + "grad_norm": 0.002043887274339795, + "learning_rate": 3.895596213594066e-05, + "loss": 0.0006, + "num_input_tokens_seen": 234688400, + "step": 108645 + }, + { + "epoch": 17.724306688417617, + "grad_norm": 0.010542036034166813, + "learning_rate": 3.892842163453964e-05, + "loss": 0.0021, + "num_input_tokens_seen": 234699792, + "step": 108650 + }, + { + "epoch": 17.725122349102772, + "grad_norm": 0.006847497075796127, + "learning_rate": 3.8900890477338856e-05, + "loss": 0.0021, + "num_input_tokens_seen": 234710736, + "step": 108655 + }, + { + "epoch": 17.725938009787928, + "grad_norm": 0.010414250195026398, + "learning_rate": 3.887336866489666e-05, + "loss": 0.0019, + "num_input_tokens_seen": 234722000, + "step": 108660 + }, + { + "epoch": 17.726753670473084, + "grad_norm": 0.009203067980706692, + "learning_rate": 3.884585619777048e-05, + "loss": 0.0024, + "num_input_tokens_seen": 234733328, + "step": 108665 + }, + { + "epoch": 17.72756933115824, + "grad_norm": 0.005820513237267733, + "learning_rate": 3.881835307651816e-05, + "loss": 0.0017, + "num_input_tokens_seen": 234743792, + "step": 108670 + }, + { + "epoch": 17.72838499184339, + "grad_norm": 0.0019853932317346334, + "learning_rate": 3.879085930169685e-05, + "loss": 0.0011, + "num_input_tokens_seen": 234755440, + "step": 108675 + }, + { + "epoch": 17.729200652528547, + "grad_norm": 0.01240463275462389, + "learning_rate": 3.8763374873863886e-05, + "loss": 0.0056, + "num_input_tokens_seen": 234765776, + "step": 108680 + }, + { + "epoch": 17.730016313213703, + "grad_norm": 0.06792809069156647, + "learning_rate": 3.873589979357633e-05, + "loss": 0.0013, + "num_input_tokens_seen": 234777008, + "step": 108685 + }, + { + "epoch": 17.73083197389886, + "grad_norm": 0.01229359582066536, + "learning_rate": 3.870843406139085e-05, + "loss": 0.0009, + "num_input_tokens_seen": 234787152, + "step": 108690 + }, + { + "epoch": 17.731647634584014, + "grad_norm": 0.00027565364143811166, + "learning_rate": 3.868097767786416e-05, + "loss": 0.002, + "num_input_tokens_seen": 234799472, + "step": 108695 + }, + { + "epoch": 17.732463295269167, + "grad_norm": 0.015476987697184086, + "learning_rate": 3.86535306435527e-05, + "loss": 0.0017, + "num_input_tokens_seen": 234809296, + "step": 108700 + }, + { + "epoch": 17.733278955954322, + "grad_norm": 0.0027482211589813232, + "learning_rate": 3.8626092959012706e-05, + "loss": 0.0013, + "num_input_tokens_seen": 234821296, + "step": 108705 + }, + { + "epoch": 17.734094616639478, + "grad_norm": 0.001372481812722981, + "learning_rate": 3.8598664624800215e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234831792, + "step": 108710 + }, + { + "epoch": 17.734910277324634, + "grad_norm": 0.008024675771594048, + "learning_rate": 3.857124564147113e-05, + "loss": 0.0012, + "num_input_tokens_seen": 234843408, + "step": 108715 + }, + { + "epoch": 17.73572593800979, + "grad_norm": 0.0030358731746673584, + "learning_rate": 3.8543836009581115e-05, + "loss": 0.0024, + "num_input_tokens_seen": 234854960, + "step": 108720 + }, + { + "epoch": 17.73654159869494, + "grad_norm": 0.018493857234716415, + "learning_rate": 3.851643572968566e-05, + "loss": 0.0041, + "num_input_tokens_seen": 234866224, + "step": 108725 + }, + { + "epoch": 17.737357259380097, + "grad_norm": 0.006060306448489428, + "learning_rate": 3.848904480234006e-05, + "loss": 0.0251, + "num_input_tokens_seen": 234875760, + "step": 108730 + }, + { + "epoch": 17.738172920065253, + "grad_norm": 0.008324912749230862, + "learning_rate": 3.846166322809941e-05, + "loss": 0.0051, + "num_input_tokens_seen": 234886768, + "step": 108735 + }, + { + "epoch": 17.73898858075041, + "grad_norm": 0.00037973938742652535, + "learning_rate": 3.8434291007518665e-05, + "loss": 0.0008, + "num_input_tokens_seen": 234897392, + "step": 108740 + }, + { + "epoch": 17.739804241435564, + "grad_norm": 0.0335724800825119, + "learning_rate": 3.8406928141152596e-05, + "loss": 0.0017, + "num_input_tokens_seen": 234909776, + "step": 108745 + }, + { + "epoch": 17.740619902120716, + "grad_norm": 0.0029921771492809057, + "learning_rate": 3.8379574629555656e-05, + "loss": 0.0054, + "num_input_tokens_seen": 234921360, + "step": 108750 + }, + { + "epoch": 17.741435562805872, + "grad_norm": 0.007268199231475592, + "learning_rate": 3.835223047328229e-05, + "loss": 0.0017, + "num_input_tokens_seen": 234931280, + "step": 108755 + }, + { + "epoch": 17.742251223491028, + "grad_norm": 0.0002096295211231336, + "learning_rate": 3.8324895672886554e-05, + "loss": 0.1414, + "num_input_tokens_seen": 234941584, + "step": 108760 + }, + { + "epoch": 17.743066884176184, + "grad_norm": 0.16069717705249786, + "learning_rate": 3.829757022892255e-05, + "loss": 0.0029, + "num_input_tokens_seen": 234951728, + "step": 108765 + }, + { + "epoch": 17.74388254486134, + "grad_norm": 0.05204736813902855, + "learning_rate": 3.827025414194385e-05, + "loss": 0.0024, + "num_input_tokens_seen": 234962000, + "step": 108770 + }, + { + "epoch": 17.74469820554649, + "grad_norm": 0.004038558341562748, + "learning_rate": 3.824294741250439e-05, + "loss": 0.0009, + "num_input_tokens_seen": 234973488, + "step": 108775 + }, + { + "epoch": 17.745513866231647, + "grad_norm": 0.0019828190561383963, + "learning_rate": 3.821565004115723e-05, + "loss": 0.0013, + "num_input_tokens_seen": 234984016, + "step": 108780 + }, + { + "epoch": 17.746329526916803, + "grad_norm": 0.00056139484513551, + "learning_rate": 3.8188362028455826e-05, + "loss": 0.0027, + "num_input_tokens_seen": 234994832, + "step": 108785 + }, + { + "epoch": 17.74714518760196, + "grad_norm": 1.5413379669189453, + "learning_rate": 3.8161083374953056e-05, + "loss": 0.0312, + "num_input_tokens_seen": 235004816, + "step": 108790 + }, + { + "epoch": 17.747960848287114, + "grad_norm": 0.026222562417387962, + "learning_rate": 3.8133814081201866e-05, + "loss": 0.0024, + "num_input_tokens_seen": 235015504, + "step": 108795 + }, + { + "epoch": 17.748776508972266, + "grad_norm": 0.0037302477285265923, + "learning_rate": 3.810655414775482e-05, + "loss": 0.0466, + "num_input_tokens_seen": 235027728, + "step": 108800 + }, + { + "epoch": 17.749592169657422, + "grad_norm": 0.00023018640058580786, + "learning_rate": 3.807930357516448e-05, + "loss": 0.0081, + "num_input_tokens_seen": 235039344, + "step": 108805 + }, + { + "epoch": 17.750407830342578, + "grad_norm": 0.004767855163663626, + "learning_rate": 3.8052062363982957e-05, + "loss": 0.0023, + "num_input_tokens_seen": 235050480, + "step": 108810 + }, + { + "epoch": 17.751223491027734, + "grad_norm": 0.004195967223495245, + "learning_rate": 3.8024830514762465e-05, + "loss": 0.0009, + "num_input_tokens_seen": 235062960, + "step": 108815 + }, + { + "epoch": 17.752039151712886, + "grad_norm": 0.05753675475716591, + "learning_rate": 3.79976080280548e-05, + "loss": 0.0172, + "num_input_tokens_seen": 235074416, + "step": 108820 + }, + { + "epoch": 17.75285481239804, + "grad_norm": 0.0006173241999931633, + "learning_rate": 3.7970394904411733e-05, + "loss": 0.0008, + "num_input_tokens_seen": 235084112, + "step": 108825 + }, + { + "epoch": 17.753670473083197, + "grad_norm": 0.694864809513092, + "learning_rate": 3.7943191144384716e-05, + "loss": 0.012, + "num_input_tokens_seen": 235094128, + "step": 108830 + }, + { + "epoch": 17.754486133768353, + "grad_norm": 0.09449499845504761, + "learning_rate": 3.7915996748525086e-05, + "loss": 0.0036, + "num_input_tokens_seen": 235106320, + "step": 108835 + }, + { + "epoch": 17.75530179445351, + "grad_norm": 0.003336785826832056, + "learning_rate": 3.788881171738401e-05, + "loss": 0.0031, + "num_input_tokens_seen": 235117584, + "step": 108840 + }, + { + "epoch": 17.75611745513866, + "grad_norm": 0.002545825904235244, + "learning_rate": 3.7861636051512385e-05, + "loss": 0.0159, + "num_input_tokens_seen": 235128528, + "step": 108845 + }, + { + "epoch": 17.756933115823816, + "grad_norm": 0.0005718961474485695, + "learning_rate": 3.783446975146099e-05, + "loss": 0.0008, + "num_input_tokens_seen": 235139984, + "step": 108850 + }, + { + "epoch": 17.757748776508972, + "grad_norm": 0.05380915477871895, + "learning_rate": 3.7807312817780325e-05, + "loss": 0.0029, + "num_input_tokens_seen": 235151408, + "step": 108855 + }, + { + "epoch": 17.758564437194128, + "grad_norm": 0.010704193264245987, + "learning_rate": 3.7780165251020794e-05, + "loss": 0.0022, + "num_input_tokens_seen": 235163056, + "step": 108860 + }, + { + "epoch": 17.759380097879284, + "grad_norm": 0.0006450397195294499, + "learning_rate": 3.7753027051732615e-05, + "loss": 0.0101, + "num_input_tokens_seen": 235174032, + "step": 108865 + }, + { + "epoch": 17.760195758564436, + "grad_norm": 0.004813667386770248, + "learning_rate": 3.772589822046568e-05, + "loss": 0.0033, + "num_input_tokens_seen": 235184848, + "step": 108870 + }, + { + "epoch": 17.76101141924959, + "grad_norm": 0.0434638075530529, + "learning_rate": 3.7698778757769944e-05, + "loss": 0.002, + "num_input_tokens_seen": 235195344, + "step": 108875 + }, + { + "epoch": 17.761827079934747, + "grad_norm": 0.002826629439368844, + "learning_rate": 3.767166866419486e-05, + "loss": 0.0029, + "num_input_tokens_seen": 235206064, + "step": 108880 + }, + { + "epoch": 17.762642740619903, + "grad_norm": 0.0023013916797935963, + "learning_rate": 3.764456794028992e-05, + "loss": 0.001, + "num_input_tokens_seen": 235215632, + "step": 108885 + }, + { + "epoch": 17.76345840130506, + "grad_norm": 0.00613722950220108, + "learning_rate": 3.7617476586604304e-05, + "loss": 0.0009, + "num_input_tokens_seen": 235226192, + "step": 108890 + }, + { + "epoch": 17.76427406199021, + "grad_norm": 0.0007096104673109949, + "learning_rate": 3.759039460368724e-05, + "loss": 0.0742, + "num_input_tokens_seen": 235236464, + "step": 108895 + }, + { + "epoch": 17.765089722675366, + "grad_norm": 0.0005393307656049728, + "learning_rate": 3.756332199208728e-05, + "loss": 0.0034, + "num_input_tokens_seen": 235246704, + "step": 108900 + }, + { + "epoch": 17.765905383360522, + "grad_norm": 0.9690297842025757, + "learning_rate": 3.753625875235345e-05, + "loss": 0.1084, + "num_input_tokens_seen": 235257008, + "step": 108905 + }, + { + "epoch": 17.766721044045678, + "grad_norm": 0.0005110432975925505, + "learning_rate": 3.750920488503379e-05, + "loss": 0.0134, + "num_input_tokens_seen": 235268976, + "step": 108910 + }, + { + "epoch": 17.767536704730833, + "grad_norm": 0.00060313317226246, + "learning_rate": 3.7482160390676866e-05, + "loss": 0.0049, + "num_input_tokens_seen": 235278768, + "step": 108915 + }, + { + "epoch": 17.768352365415986, + "grad_norm": 0.019515041261911392, + "learning_rate": 3.745512526983075e-05, + "loss": 0.0024, + "num_input_tokens_seen": 235288784, + "step": 108920 + }, + { + "epoch": 17.76916802610114, + "grad_norm": 0.13256117701530457, + "learning_rate": 3.7428099523043325e-05, + "loss": 0.1224, + "num_input_tokens_seen": 235299280, + "step": 108925 + }, + { + "epoch": 17.769983686786297, + "grad_norm": 0.0040601822547614574, + "learning_rate": 3.7401083150862216e-05, + "loss": 0.0041, + "num_input_tokens_seen": 235310032, + "step": 108930 + }, + { + "epoch": 17.770799347471453, + "grad_norm": 0.008540820330381393, + "learning_rate": 3.7374076153835033e-05, + "loss": 0.0053, + "num_input_tokens_seen": 235319952, + "step": 108935 + }, + { + "epoch": 17.77161500815661, + "grad_norm": 0.07201741635799408, + "learning_rate": 3.734707853250907e-05, + "loss": 0.0031, + "num_input_tokens_seen": 235331504, + "step": 108940 + }, + { + "epoch": 17.77243066884176, + "grad_norm": 0.00032880945946089923, + "learning_rate": 3.73200902874315e-05, + "loss": 0.004, + "num_input_tokens_seen": 235341488, + "step": 108945 + }, + { + "epoch": 17.773246329526916, + "grad_norm": 0.011850826442241669, + "learning_rate": 3.729311141914926e-05, + "loss": 0.0014, + "num_input_tokens_seen": 235352592, + "step": 108950 + }, + { + "epoch": 17.774061990212072, + "grad_norm": 0.004222301300615072, + "learning_rate": 3.72661419282091e-05, + "loss": 0.005, + "num_input_tokens_seen": 235362224, + "step": 108955 + }, + { + "epoch": 17.774877650897228, + "grad_norm": 0.014178330078721046, + "learning_rate": 3.723918181515756e-05, + "loss": 0.0013, + "num_input_tokens_seen": 235372688, + "step": 108960 + }, + { + "epoch": 17.775693311582383, + "grad_norm": 0.0010670581832528114, + "learning_rate": 3.721223108054106e-05, + "loss": 0.0039, + "num_input_tokens_seen": 235383440, + "step": 108965 + }, + { + "epoch": 17.776508972267536, + "grad_norm": 0.026346096768975258, + "learning_rate": 3.7185289724905814e-05, + "loss": 0.0012, + "num_input_tokens_seen": 235394640, + "step": 108970 + }, + { + "epoch": 17.77732463295269, + "grad_norm": 0.025496676564216614, + "learning_rate": 3.7158357748797775e-05, + "loss": 0.0028, + "num_input_tokens_seen": 235406736, + "step": 108975 + }, + { + "epoch": 17.778140293637847, + "grad_norm": 0.043573055416345596, + "learning_rate": 3.7131435152762735e-05, + "loss": 0.0013, + "num_input_tokens_seen": 235415600, + "step": 108980 + }, + { + "epoch": 17.778955954323003, + "grad_norm": 0.26204147934913635, + "learning_rate": 3.710452193734643e-05, + "loss": 0.005, + "num_input_tokens_seen": 235425776, + "step": 108985 + }, + { + "epoch": 17.77977161500816, + "grad_norm": 0.0021411122288554907, + "learning_rate": 3.707761810309418e-05, + "loss": 0.036, + "num_input_tokens_seen": 235436752, + "step": 108990 + }, + { + "epoch": 17.78058727569331, + "grad_norm": 0.0380295105278492, + "learning_rate": 3.705072365055112e-05, + "loss": 0.0039, + "num_input_tokens_seen": 235446608, + "step": 108995 + }, + { + "epoch": 17.781402936378466, + "grad_norm": 0.02980238012969494, + "learning_rate": 3.7023838580262706e-05, + "loss": 0.003, + "num_input_tokens_seen": 235457264, + "step": 109000 + }, + { + "epoch": 17.782218597063622, + "grad_norm": 0.010661580599844456, + "learning_rate": 3.699696289277327e-05, + "loss": 0.028, + "num_input_tokens_seen": 235468784, + "step": 109005 + }, + { + "epoch": 17.783034257748778, + "grad_norm": 0.009770001284778118, + "learning_rate": 3.697009658862793e-05, + "loss": 0.0009, + "num_input_tokens_seen": 235479952, + "step": 109010 + }, + { + "epoch": 17.78384991843393, + "grad_norm": 0.00064946518978104, + "learning_rate": 3.694323966837088e-05, + "loss": 0.0006, + "num_input_tokens_seen": 235491664, + "step": 109015 + }, + { + "epoch": 17.784665579119086, + "grad_norm": 0.010038640350103378, + "learning_rate": 3.6916392132546605e-05, + "loss": 0.0081, + "num_input_tokens_seen": 235502416, + "step": 109020 + }, + { + "epoch": 17.78548123980424, + "grad_norm": 0.0002301902131875977, + "learning_rate": 3.6889553981698966e-05, + "loss": 0.001, + "num_input_tokens_seen": 235514032, + "step": 109025 + }, + { + "epoch": 17.786296900489397, + "grad_norm": 0.010864865966141224, + "learning_rate": 3.6862725216372185e-05, + "loss": 0.0027, + "num_input_tokens_seen": 235525680, + "step": 109030 + }, + { + "epoch": 17.787112561174553, + "grad_norm": 0.046079590916633606, + "learning_rate": 3.683590583710961e-05, + "loss": 0.0014, + "num_input_tokens_seen": 235535056, + "step": 109035 + }, + { + "epoch": 17.787928221859705, + "grad_norm": 0.0027483527082949877, + "learning_rate": 3.6809095844455134e-05, + "loss": 0.003, + "num_input_tokens_seen": 235545968, + "step": 109040 + }, + { + "epoch": 17.78874388254486, + "grad_norm": 0.023552676662802696, + "learning_rate": 3.678229523895177e-05, + "loss": 0.0016, + "num_input_tokens_seen": 235555600, + "step": 109045 + }, + { + "epoch": 17.789559543230016, + "grad_norm": 0.0004201144038233906, + "learning_rate": 3.675550402114303e-05, + "loss": 0.0011, + "num_input_tokens_seen": 235566576, + "step": 109050 + }, + { + "epoch": 17.790375203915172, + "grad_norm": 0.018458297476172447, + "learning_rate": 3.6728722191571476e-05, + "loss": 0.0007, + "num_input_tokens_seen": 235576784, + "step": 109055 + }, + { + "epoch": 17.791190864600328, + "grad_norm": 0.007056164089590311, + "learning_rate": 3.670194975078017e-05, + "loss": 0.0011, + "num_input_tokens_seen": 235588400, + "step": 109060 + }, + { + "epoch": 17.79200652528548, + "grad_norm": 0.009251178242266178, + "learning_rate": 3.667518669931158e-05, + "loss": 0.001, + "num_input_tokens_seen": 235599024, + "step": 109065 + }, + { + "epoch": 17.792822185970635, + "grad_norm": 0.003141851397231221, + "learning_rate": 3.6648433037708094e-05, + "loss": 0.0038, + "num_input_tokens_seen": 235608944, + "step": 109070 + }, + { + "epoch": 17.79363784665579, + "grad_norm": 0.00041024110396392643, + "learning_rate": 3.66216887665119e-05, + "loss": 0.0029, + "num_input_tokens_seen": 235618896, + "step": 109075 + }, + { + "epoch": 17.794453507340947, + "grad_norm": 0.001330662053078413, + "learning_rate": 3.659495388626505e-05, + "loss": 0.0007, + "num_input_tokens_seen": 235629616, + "step": 109080 + }, + { + "epoch": 17.795269168026103, + "grad_norm": 0.018950335681438446, + "learning_rate": 3.6568228397509286e-05, + "loss": 0.0013, + "num_input_tokens_seen": 235641008, + "step": 109085 + }, + { + "epoch": 17.796084828711255, + "grad_norm": 0.044273439794778824, + "learning_rate": 3.654151230078628e-05, + "loss": 0.0111, + "num_input_tokens_seen": 235651408, + "step": 109090 + }, + { + "epoch": 17.79690048939641, + "grad_norm": 0.0021813130006194115, + "learning_rate": 3.6514805596637504e-05, + "loss": 0.0005, + "num_input_tokens_seen": 235661648, + "step": 109095 + }, + { + "epoch": 17.797716150081566, + "grad_norm": 0.013074532151222229, + "learning_rate": 3.648810828560417e-05, + "loss": 0.0008, + "num_input_tokens_seen": 235672880, + "step": 109100 + }, + { + "epoch": 17.798531810766722, + "grad_norm": 0.002100384095683694, + "learning_rate": 3.6461420368227304e-05, + "loss": 0.1017, + "num_input_tokens_seen": 235683408, + "step": 109105 + }, + { + "epoch": 17.799347471451878, + "grad_norm": 0.0026194609235972166, + "learning_rate": 3.643474184504775e-05, + "loss": 0.0029, + "num_input_tokens_seen": 235694608, + "step": 109110 + }, + { + "epoch": 17.80016313213703, + "grad_norm": 0.04093625396490097, + "learning_rate": 3.6408072716606344e-05, + "loss": 0.0027, + "num_input_tokens_seen": 235703984, + "step": 109115 + }, + { + "epoch": 17.800978792822185, + "grad_norm": 0.016903575509786606, + "learning_rate": 3.6381412983443277e-05, + "loss": 0.0063, + "num_input_tokens_seen": 235714704, + "step": 109120 + }, + { + "epoch": 17.80179445350734, + "grad_norm": 0.012498315423727036, + "learning_rate": 3.635476264609922e-05, + "loss": 0.0025, + "num_input_tokens_seen": 235726800, + "step": 109125 + }, + { + "epoch": 17.802610114192497, + "grad_norm": 0.01264413632452488, + "learning_rate": 3.6328121705113905e-05, + "loss": 0.0032, + "num_input_tokens_seen": 235738864, + "step": 109130 + }, + { + "epoch": 17.803425774877653, + "grad_norm": 0.8477015495300293, + "learning_rate": 3.6301490161027574e-05, + "loss": 0.0125, + "num_input_tokens_seen": 235750288, + "step": 109135 + }, + { + "epoch": 17.804241435562805, + "grad_norm": 0.01968962326645851, + "learning_rate": 3.6274868014379624e-05, + "loss": 0.0011, + "num_input_tokens_seen": 235761488, + "step": 109140 + }, + { + "epoch": 17.80505709624796, + "grad_norm": 0.02571706660091877, + "learning_rate": 3.6248255265709906e-05, + "loss": 0.0053, + "num_input_tokens_seen": 235772400, + "step": 109145 + }, + { + "epoch": 17.805872756933116, + "grad_norm": 0.001857051276601851, + "learning_rate": 3.6221651915557484e-05, + "loss": 0.015, + "num_input_tokens_seen": 235783376, + "step": 109150 + }, + { + "epoch": 17.806688417618272, + "grad_norm": 0.0020788402762264013, + "learning_rate": 3.6195057964461764e-05, + "loss": 0.0126, + "num_input_tokens_seen": 235794736, + "step": 109155 + }, + { + "epoch": 17.807504078303424, + "grad_norm": 0.0018391057383269072, + "learning_rate": 3.616847341296137e-05, + "loss": 0.0012, + "num_input_tokens_seen": 235805104, + "step": 109160 + }, + { + "epoch": 17.80831973898858, + "grad_norm": 0.0004060390347149223, + "learning_rate": 3.6141898261595475e-05, + "loss": 0.0049, + "num_input_tokens_seen": 235816912, + "step": 109165 + }, + { + "epoch": 17.809135399673735, + "grad_norm": 0.021962953731417656, + "learning_rate": 3.611533251090232e-05, + "loss": 0.0012, + "num_input_tokens_seen": 235826992, + "step": 109170 + }, + { + "epoch": 17.80995106035889, + "grad_norm": 0.013397028669714928, + "learning_rate": 3.608877616142053e-05, + "loss": 0.0025, + "num_input_tokens_seen": 235837136, + "step": 109175 + }, + { + "epoch": 17.810766721044047, + "grad_norm": 0.0003482665924821049, + "learning_rate": 3.606222921368807e-05, + "loss": 0.0022, + "num_input_tokens_seen": 235847536, + "step": 109180 + }, + { + "epoch": 17.8115823817292, + "grad_norm": 0.03608010709285736, + "learning_rate": 3.603569166824327e-05, + "loss": 0.039, + "num_input_tokens_seen": 235857712, + "step": 109185 + }, + { + "epoch": 17.812398042414355, + "grad_norm": 0.010204904712736607, + "learning_rate": 3.600916352562356e-05, + "loss": 0.0075, + "num_input_tokens_seen": 235866704, + "step": 109190 + }, + { + "epoch": 17.81321370309951, + "grad_norm": 0.0035822775680571795, + "learning_rate": 3.598264478636698e-05, + "loss": 0.0047, + "num_input_tokens_seen": 235877840, + "step": 109195 + }, + { + "epoch": 17.814029363784666, + "grad_norm": 0.0004829168610740453, + "learning_rate": 3.595613545101056e-05, + "loss": 0.0561, + "num_input_tokens_seen": 235888368, + "step": 109200 + }, + { + "epoch": 17.81484502446982, + "grad_norm": 0.0008832847815938294, + "learning_rate": 3.592963552009182e-05, + "loss": 0.0024, + "num_input_tokens_seen": 235898224, + "step": 109205 + }, + { + "epoch": 17.815660685154974, + "grad_norm": 0.03870289772748947, + "learning_rate": 3.590314499414771e-05, + "loss": 0.008, + "num_input_tokens_seen": 235909584, + "step": 109210 + }, + { + "epoch": 17.81647634584013, + "grad_norm": 0.0008303043432533741, + "learning_rate": 3.587666387371513e-05, + "loss": 0.0045, + "num_input_tokens_seen": 235921168, + "step": 109215 + }, + { + "epoch": 17.817292006525285, + "grad_norm": 0.007308437488973141, + "learning_rate": 3.585019215933072e-05, + "loss": 0.0012, + "num_input_tokens_seen": 235932240, + "step": 109220 + }, + { + "epoch": 17.81810766721044, + "grad_norm": 0.0020328452810645103, + "learning_rate": 3.5823729851530983e-05, + "loss": 0.0539, + "num_input_tokens_seen": 235943280, + "step": 109225 + }, + { + "epoch": 17.818923327895597, + "grad_norm": 0.0017934415955096483, + "learning_rate": 3.5797276950852276e-05, + "loss": 0.0013, + "num_input_tokens_seen": 235954192, + "step": 109230 + }, + { + "epoch": 17.81973898858075, + "grad_norm": 0.019754432141780853, + "learning_rate": 3.5770833457830554e-05, + "loss": 0.0014, + "num_input_tokens_seen": 235964784, + "step": 109235 + }, + { + "epoch": 17.820554649265905, + "grad_norm": 0.007611890789121389, + "learning_rate": 3.5744399373001834e-05, + "loss": 0.0008, + "num_input_tokens_seen": 235975536, + "step": 109240 + }, + { + "epoch": 17.82137030995106, + "grad_norm": 0.0005911207990720868, + "learning_rate": 3.57179746969018e-05, + "loss": 0.0024, + "num_input_tokens_seen": 235986576, + "step": 109245 + }, + { + "epoch": 17.822185970636216, + "grad_norm": 0.0020840333309024572, + "learning_rate": 3.569155943006602e-05, + "loss": 0.0091, + "num_input_tokens_seen": 235996944, + "step": 109250 + }, + { + "epoch": 17.82300163132137, + "grad_norm": 0.017575936391949654, + "learning_rate": 3.566515357302974e-05, + "loss": 0.0009, + "num_input_tokens_seen": 236008144, + "step": 109255 + }, + { + "epoch": 17.823817292006524, + "grad_norm": 0.0008498613606207073, + "learning_rate": 3.56387571263283e-05, + "loss": 0.0003, + "num_input_tokens_seen": 236019952, + "step": 109260 + }, + { + "epoch": 17.82463295269168, + "grad_norm": 0.0718621090054512, + "learning_rate": 3.561237009049639e-05, + "loss": 0.0163, + "num_input_tokens_seen": 236031568, + "step": 109265 + }, + { + "epoch": 17.825448613376835, + "grad_norm": 0.01773866079747677, + "learning_rate": 3.558599246606903e-05, + "loss": 0.0035, + "num_input_tokens_seen": 236042864, + "step": 109270 + }, + { + "epoch": 17.82626427406199, + "grad_norm": 0.016890283674001694, + "learning_rate": 3.555962425358056e-05, + "loss": 0.0025, + "num_input_tokens_seen": 236053296, + "step": 109275 + }, + { + "epoch": 17.827079934747147, + "grad_norm": 0.001565889222547412, + "learning_rate": 3.5533265453565664e-05, + "loss": 0.0066, + "num_input_tokens_seen": 236063760, + "step": 109280 + }, + { + "epoch": 17.8278955954323, + "grad_norm": 0.012458628974854946, + "learning_rate": 3.55069160665582e-05, + "loss": 0.0056, + "num_input_tokens_seen": 236075472, + "step": 109285 + }, + { + "epoch": 17.828711256117455, + "grad_norm": 0.019188063219189644, + "learning_rate": 3.5480576093092466e-05, + "loss": 0.0016, + "num_input_tokens_seen": 236086384, + "step": 109290 + }, + { + "epoch": 17.82952691680261, + "grad_norm": 0.0013830027310177684, + "learning_rate": 3.545424553370202e-05, + "loss": 0.001, + "num_input_tokens_seen": 236096720, + "step": 109295 + }, + { + "epoch": 17.830342577487766, + "grad_norm": 0.0006741559482179582, + "learning_rate": 3.5427924388920727e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236107696, + "step": 109300 + }, + { + "epoch": 17.83115823817292, + "grad_norm": 0.000454758177511394, + "learning_rate": 3.540161265928177e-05, + "loss": 0.0059, + "num_input_tokens_seen": 236118576, + "step": 109305 + }, + { + "epoch": 17.831973898858074, + "grad_norm": 0.0011603111634030938, + "learning_rate": 3.537531034531855e-05, + "loss": 0.0707, + "num_input_tokens_seen": 236129136, + "step": 109310 + }, + { + "epoch": 17.83278955954323, + "grad_norm": 0.004742736462503672, + "learning_rate": 3.5349017447564135e-05, + "loss": 0.0006, + "num_input_tokens_seen": 236139952, + "step": 109315 + }, + { + "epoch": 17.833605220228385, + "grad_norm": 0.013719492591917515, + "learning_rate": 3.532273396655128e-05, + "loss": 0.0009, + "num_input_tokens_seen": 236150352, + "step": 109320 + }, + { + "epoch": 17.83442088091354, + "grad_norm": 0.0017408907879143953, + "learning_rate": 3.5296459902812775e-05, + "loss": 0.0018, + "num_input_tokens_seen": 236160464, + "step": 109325 + }, + { + "epoch": 17.835236541598697, + "grad_norm": 0.0012018510606139898, + "learning_rate": 3.527019525688097e-05, + "loss": 0.0024, + "num_input_tokens_seen": 236170800, + "step": 109330 + }, + { + "epoch": 17.83605220228385, + "grad_norm": 0.0053267451003193855, + "learning_rate": 3.524394002928821e-05, + "loss": 0.0007, + "num_input_tokens_seen": 236181680, + "step": 109335 + }, + { + "epoch": 17.836867862969005, + "grad_norm": 0.00016377547581214458, + "learning_rate": 3.5217694220566644e-05, + "loss": 0.0009, + "num_input_tokens_seen": 236191952, + "step": 109340 + }, + { + "epoch": 17.83768352365416, + "grad_norm": 0.006869807373732328, + "learning_rate": 3.5191457831248054e-05, + "loss": 0.0715, + "num_input_tokens_seen": 236201936, + "step": 109345 + }, + { + "epoch": 17.838499184339316, + "grad_norm": 0.020623216405510902, + "learning_rate": 3.516523086186429e-05, + "loss": 0.0021, + "num_input_tokens_seen": 236212272, + "step": 109350 + }, + { + "epoch": 17.839314845024468, + "grad_norm": 0.0009061881573870778, + "learning_rate": 3.513901331294678e-05, + "loss": 0.0013, + "num_input_tokens_seen": 236221712, + "step": 109355 + }, + { + "epoch": 17.840130505709624, + "grad_norm": 0.0009150461410172284, + "learning_rate": 3.5112805185026853e-05, + "loss": 0.0019, + "num_input_tokens_seen": 236232400, + "step": 109360 + }, + { + "epoch": 17.84094616639478, + "grad_norm": 0.02719375491142273, + "learning_rate": 3.5086606478635706e-05, + "loss": 0.0024, + "num_input_tokens_seen": 236242512, + "step": 109365 + }, + { + "epoch": 17.841761827079935, + "grad_norm": 0.0005360327195376158, + "learning_rate": 3.506041719430425e-05, + "loss": 0.0006, + "num_input_tokens_seen": 236252752, + "step": 109370 + }, + { + "epoch": 17.84257748776509, + "grad_norm": 0.012039005756378174, + "learning_rate": 3.503423733256328e-05, + "loss": 0.0101, + "num_input_tokens_seen": 236263344, + "step": 109375 + }, + { + "epoch": 17.843393148450243, + "grad_norm": 0.002144909929484129, + "learning_rate": 3.500806689394337e-05, + "loss": 0.0043, + "num_input_tokens_seen": 236274704, + "step": 109380 + }, + { + "epoch": 17.8442088091354, + "grad_norm": 0.0027891851495951414, + "learning_rate": 3.4981905878974815e-05, + "loss": 0.0008, + "num_input_tokens_seen": 236284944, + "step": 109385 + }, + { + "epoch": 17.845024469820554, + "grad_norm": 0.004484755452722311, + "learning_rate": 3.495575428818787e-05, + "loss": 0.0008, + "num_input_tokens_seen": 236295856, + "step": 109390 + }, + { + "epoch": 17.84584013050571, + "grad_norm": 0.5081759691238403, + "learning_rate": 3.492961212211249e-05, + "loss": 0.0177, + "num_input_tokens_seen": 236306512, + "step": 109395 + }, + { + "epoch": 17.846655791190866, + "grad_norm": 0.0007591186440549791, + "learning_rate": 3.490347938127847e-05, + "loss": 0.0008, + "num_input_tokens_seen": 236317872, + "step": 109400 + }, + { + "epoch": 17.847471451876018, + "grad_norm": 0.03884272277355194, + "learning_rate": 3.4877356066215614e-05, + "loss": 0.0669, + "num_input_tokens_seen": 236327536, + "step": 109405 + }, + { + "epoch": 17.848287112561174, + "grad_norm": 0.0006161820492707193, + "learning_rate": 3.4851242177453e-05, + "loss": 0.0009, + "num_input_tokens_seen": 236339408, + "step": 109410 + }, + { + "epoch": 17.84910277324633, + "grad_norm": 0.013513866811990738, + "learning_rate": 3.482513771552021e-05, + "loss": 0.0023, + "num_input_tokens_seen": 236349904, + "step": 109415 + }, + { + "epoch": 17.849918433931485, + "grad_norm": 0.009950819425284863, + "learning_rate": 3.4799042680945966e-05, + "loss": 0.0016, + "num_input_tokens_seen": 236361296, + "step": 109420 + }, + { + "epoch": 17.85073409461664, + "grad_norm": 0.0011644375044852495, + "learning_rate": 3.477295707425937e-05, + "loss": 0.0005, + "num_input_tokens_seen": 236370928, + "step": 109425 + }, + { + "epoch": 17.851549755301793, + "grad_norm": 0.0018306487472727895, + "learning_rate": 3.474688089598893e-05, + "loss": 0.0017, + "num_input_tokens_seen": 236381200, + "step": 109430 + }, + { + "epoch": 17.85236541598695, + "grad_norm": 0.0012890893267467618, + "learning_rate": 3.4720814146663226e-05, + "loss": 0.0017, + "num_input_tokens_seen": 236391760, + "step": 109435 + }, + { + "epoch": 17.853181076672104, + "grad_norm": 0.014044544659554958, + "learning_rate": 3.469475682681045e-05, + "loss": 0.0018, + "num_input_tokens_seen": 236403472, + "step": 109440 + }, + { + "epoch": 17.85399673735726, + "grad_norm": 0.00026541019906289876, + "learning_rate": 3.466870893695867e-05, + "loss": 0.0007, + "num_input_tokens_seen": 236415088, + "step": 109445 + }, + { + "epoch": 17.854812398042416, + "grad_norm": 0.001360461232252419, + "learning_rate": 3.4642670477635866e-05, + "loss": 0.0008, + "num_input_tokens_seen": 236427216, + "step": 109450 + }, + { + "epoch": 17.855628058727568, + "grad_norm": 0.018865276128053665, + "learning_rate": 3.4616641449369656e-05, + "loss": 0.0022, + "num_input_tokens_seen": 236437232, + "step": 109455 + }, + { + "epoch": 17.856443719412724, + "grad_norm": 0.0013094799360260367, + "learning_rate": 3.459062185268763e-05, + "loss": 0.001, + "num_input_tokens_seen": 236447056, + "step": 109460 + }, + { + "epoch": 17.85725938009788, + "grad_norm": 0.20267651975154877, + "learning_rate": 3.456461168811703e-05, + "loss": 0.0058, + "num_input_tokens_seen": 236459120, + "step": 109465 + }, + { + "epoch": 17.858075040783035, + "grad_norm": 0.009335266426205635, + "learning_rate": 3.4538610956185044e-05, + "loss": 0.001, + "num_input_tokens_seen": 236471568, + "step": 109470 + }, + { + "epoch": 17.85889070146819, + "grad_norm": 0.0011765094241127372, + "learning_rate": 3.451261965741859e-05, + "loss": 0.0025, + "num_input_tokens_seen": 236483024, + "step": 109475 + }, + { + "epoch": 17.859706362153343, + "grad_norm": 0.06980552524328232, + "learning_rate": 3.44866377923444e-05, + "loss": 0.0036, + "num_input_tokens_seen": 236495248, + "step": 109480 + }, + { + "epoch": 17.8605220228385, + "grad_norm": 0.0011967658065259457, + "learning_rate": 3.446066536148901e-05, + "loss": 0.0018, + "num_input_tokens_seen": 236505296, + "step": 109485 + }, + { + "epoch": 17.861337683523654, + "grad_norm": 0.004047077614814043, + "learning_rate": 3.4434702365378825e-05, + "loss": 0.0078, + "num_input_tokens_seen": 236516272, + "step": 109490 + }, + { + "epoch": 17.86215334420881, + "grad_norm": 0.002524849260225892, + "learning_rate": 3.4408748804540034e-05, + "loss": 0.0021, + "num_input_tokens_seen": 236526160, + "step": 109495 + }, + { + "epoch": 17.862969004893966, + "grad_norm": 0.003883121768012643, + "learning_rate": 3.4382804679498616e-05, + "loss": 0.001, + "num_input_tokens_seen": 236535696, + "step": 109500 + }, + { + "epoch": 17.863784665579118, + "grad_norm": 0.0491781048476696, + "learning_rate": 3.4356869990780305e-05, + "loss": 0.0019, + "num_input_tokens_seen": 236547152, + "step": 109505 + }, + { + "epoch": 17.864600326264274, + "grad_norm": 0.005752094089984894, + "learning_rate": 3.4330944738910744e-05, + "loss": 0.0015, + "num_input_tokens_seen": 236559088, + "step": 109510 + }, + { + "epoch": 17.86541598694943, + "grad_norm": 0.008952261880040169, + "learning_rate": 3.430502892441528e-05, + "loss": 0.0656, + "num_input_tokens_seen": 236569744, + "step": 109515 + }, + { + "epoch": 17.866231647634585, + "grad_norm": 0.07735848426818848, + "learning_rate": 3.427912254781923e-05, + "loss": 0.0036, + "num_input_tokens_seen": 236580432, + "step": 109520 + }, + { + "epoch": 17.86704730831974, + "grad_norm": 0.01697462424635887, + "learning_rate": 3.425322560964761e-05, + "loss": 0.001, + "num_input_tokens_seen": 236590832, + "step": 109525 + }, + { + "epoch": 17.867862969004893, + "grad_norm": 0.0002120243152603507, + "learning_rate": 3.422733811042506e-05, + "loss": 0.0006, + "num_input_tokens_seen": 236602192, + "step": 109530 + }, + { + "epoch": 17.86867862969005, + "grad_norm": 0.012320012785494328, + "learning_rate": 3.420146005067659e-05, + "loss": 0.001, + "num_input_tokens_seen": 236614512, + "step": 109535 + }, + { + "epoch": 17.869494290375204, + "grad_norm": 0.010592760518193245, + "learning_rate": 3.4175591430926244e-05, + "loss": 0.0064, + "num_input_tokens_seen": 236624272, + "step": 109540 + }, + { + "epoch": 17.87030995106036, + "grad_norm": 0.02042955532670021, + "learning_rate": 3.414973225169854e-05, + "loss": 0.0042, + "num_input_tokens_seen": 236635152, + "step": 109545 + }, + { + "epoch": 17.871125611745512, + "grad_norm": 0.008258404210209846, + "learning_rate": 3.412388251351756e-05, + "loss": 0.0009, + "num_input_tokens_seen": 236646256, + "step": 109550 + }, + { + "epoch": 17.871941272430668, + "grad_norm": 0.0008917743107303977, + "learning_rate": 3.4098042216907045e-05, + "loss": 0.001, + "num_input_tokens_seen": 236657520, + "step": 109555 + }, + { + "epoch": 17.872756933115824, + "grad_norm": 0.05271517485380173, + "learning_rate": 3.4072211362390746e-05, + "loss": 0.0018, + "num_input_tokens_seen": 236668816, + "step": 109560 + }, + { + "epoch": 17.87357259380098, + "grad_norm": 0.01361636072397232, + "learning_rate": 3.40463899504922e-05, + "loss": 0.0016, + "num_input_tokens_seen": 236679280, + "step": 109565 + }, + { + "epoch": 17.874388254486135, + "grad_norm": 0.0656086653470993, + "learning_rate": 3.402057798173463e-05, + "loss": 0.0023, + "num_input_tokens_seen": 236689424, + "step": 109570 + }, + { + "epoch": 17.875203915171287, + "grad_norm": 0.000795271247625351, + "learning_rate": 3.39947754566412e-05, + "loss": 0.0068, + "num_input_tokens_seen": 236699856, + "step": 109575 + }, + { + "epoch": 17.876019575856443, + "grad_norm": 0.003363175317645073, + "learning_rate": 3.3968982375734813e-05, + "loss": 0.0022, + "num_input_tokens_seen": 236711280, + "step": 109580 + }, + { + "epoch": 17.8768352365416, + "grad_norm": 0.004550011362880468, + "learning_rate": 3.394319873953816e-05, + "loss": 0.0006, + "num_input_tokens_seen": 236723024, + "step": 109585 + }, + { + "epoch": 17.877650897226754, + "grad_norm": 0.003007990773767233, + "learning_rate": 3.391742454857388e-05, + "loss": 0.0046, + "num_input_tokens_seen": 236733072, + "step": 109590 + }, + { + "epoch": 17.87846655791191, + "grad_norm": 0.4620771110057831, + "learning_rate": 3.3891659803364225e-05, + "loss": 0.1076, + "num_input_tokens_seen": 236743056, + "step": 109595 + }, + { + "epoch": 17.879282218597062, + "grad_norm": 0.0010317835258319974, + "learning_rate": 3.386590450443139e-05, + "loss": 0.0013, + "num_input_tokens_seen": 236754512, + "step": 109600 + }, + { + "epoch": 17.880097879282218, + "grad_norm": 0.019852880388498306, + "learning_rate": 3.3840158652297335e-05, + "loss": 0.0009, + "num_input_tokens_seen": 236765936, + "step": 109605 + }, + { + "epoch": 17.880913539967374, + "grad_norm": 0.02316543459892273, + "learning_rate": 3.381442224748382e-05, + "loss": 0.0012, + "num_input_tokens_seen": 236776720, + "step": 109610 + }, + { + "epoch": 17.88172920065253, + "grad_norm": 0.0006642960361205041, + "learning_rate": 3.378869529051243e-05, + "loss": 0.0014, + "num_input_tokens_seen": 236787696, + "step": 109615 + }, + { + "epoch": 17.882544861337685, + "grad_norm": 0.07310860604047775, + "learning_rate": 3.376297778190457e-05, + "loss": 0.0035, + "num_input_tokens_seen": 236799056, + "step": 109620 + }, + { + "epoch": 17.883360522022837, + "grad_norm": 0.010377427563071251, + "learning_rate": 3.373726972218144e-05, + "loss": 0.0061, + "num_input_tokens_seen": 236810576, + "step": 109625 + }, + { + "epoch": 17.884176182707993, + "grad_norm": 0.005838080309331417, + "learning_rate": 3.3711571111864014e-05, + "loss": 0.0058, + "num_input_tokens_seen": 236821264, + "step": 109630 + }, + { + "epoch": 17.88499184339315, + "grad_norm": 0.003536843927577138, + "learning_rate": 3.3685881951473096e-05, + "loss": 0.0004, + "num_input_tokens_seen": 236833328, + "step": 109635 + }, + { + "epoch": 17.885807504078304, + "grad_norm": 0.006238589994609356, + "learning_rate": 3.366020224152949e-05, + "loss": 0.0006, + "num_input_tokens_seen": 236843600, + "step": 109640 + }, + { + "epoch": 17.88662316476346, + "grad_norm": 0.0026695330161601305, + "learning_rate": 3.363453198255328e-05, + "loss": 0.0009, + "num_input_tokens_seen": 236854800, + "step": 109645 + }, + { + "epoch": 17.887438825448612, + "grad_norm": 0.055036984384059906, + "learning_rate": 3.360887117506506e-05, + "loss": 0.0027, + "num_input_tokens_seen": 236865456, + "step": 109650 + }, + { + "epoch": 17.888254486133768, + "grad_norm": 0.0004414636641740799, + "learning_rate": 3.358321981958462e-05, + "loss": 0.001, + "num_input_tokens_seen": 236876208, + "step": 109655 + }, + { + "epoch": 17.889070146818923, + "grad_norm": 0.0011676463764160872, + "learning_rate": 3.3557577916632055e-05, + "loss": 0.0017, + "num_input_tokens_seen": 236887536, + "step": 109660 + }, + { + "epoch": 17.88988580750408, + "grad_norm": 0.0007017211173661053, + "learning_rate": 3.353194546672672e-05, + "loss": 0.0031, + "num_input_tokens_seen": 236898032, + "step": 109665 + }, + { + "epoch": 17.890701468189235, + "grad_norm": 0.0006231152801774442, + "learning_rate": 3.3506322470388426e-05, + "loss": 0.0114, + "num_input_tokens_seen": 236908656, + "step": 109670 + }, + { + "epoch": 17.891517128874387, + "grad_norm": 0.02516918070614338, + "learning_rate": 3.3480708928136204e-05, + "loss": 0.0015, + "num_input_tokens_seen": 236920400, + "step": 109675 + }, + { + "epoch": 17.892332789559543, + "grad_norm": 0.0003355523804202676, + "learning_rate": 3.34551048404893e-05, + "loss": 0.0023, + "num_input_tokens_seen": 236931024, + "step": 109680 + }, + { + "epoch": 17.8931484502447, + "grad_norm": 0.014823941513895988, + "learning_rate": 3.342951020796647e-05, + "loss": 0.0019, + "num_input_tokens_seen": 236940976, + "step": 109685 + }, + { + "epoch": 17.893964110929854, + "grad_norm": 0.0024333603214472532, + "learning_rate": 3.3403925031086525e-05, + "loss": 0.0012, + "num_input_tokens_seen": 236951984, + "step": 109690 + }, + { + "epoch": 17.894779771615006, + "grad_norm": 0.051741719245910645, + "learning_rate": 3.337834931036798e-05, + "loss": 0.007, + "num_input_tokens_seen": 236963472, + "step": 109695 + }, + { + "epoch": 17.895595432300162, + "grad_norm": 0.025518298149108887, + "learning_rate": 3.335278304632916e-05, + "loss": 0.0028, + "num_input_tokens_seen": 236973264, + "step": 109700 + }, + { + "epoch": 17.896411092985318, + "grad_norm": 0.00023219654394779354, + "learning_rate": 3.332722623948814e-05, + "loss": 0.0019, + "num_input_tokens_seen": 236983440, + "step": 109705 + }, + { + "epoch": 17.897226753670473, + "grad_norm": 0.0038019700441509485, + "learning_rate": 3.330167889036295e-05, + "loss": 0.0011, + "num_input_tokens_seen": 236994928, + "step": 109710 + }, + { + "epoch": 17.89804241435563, + "grad_norm": 0.0009693879983387887, + "learning_rate": 3.327614099947124e-05, + "loss": 0.0029, + "num_input_tokens_seen": 237006224, + "step": 109715 + }, + { + "epoch": 17.898858075040785, + "grad_norm": 0.0016091925790533423, + "learning_rate": 3.325061256733058e-05, + "loss": 0.0039, + "num_input_tokens_seen": 237017488, + "step": 109720 + }, + { + "epoch": 17.899673735725937, + "grad_norm": 0.07085609436035156, + "learning_rate": 3.3225093594458465e-05, + "loss": 0.0029, + "num_input_tokens_seen": 237027696, + "step": 109725 + }, + { + "epoch": 17.900489396411093, + "grad_norm": 0.00461050309240818, + "learning_rate": 3.319958408137192e-05, + "loss": 0.0038, + "num_input_tokens_seen": 237038896, + "step": 109730 + }, + { + "epoch": 17.90130505709625, + "grad_norm": 0.005827105604112148, + "learning_rate": 3.317408402858796e-05, + "loss": 0.1704, + "num_input_tokens_seen": 237049040, + "step": 109735 + }, + { + "epoch": 17.902120717781404, + "grad_norm": 0.015067823231220245, + "learning_rate": 3.314859343662335e-05, + "loss": 0.0012, + "num_input_tokens_seen": 237060048, + "step": 109740 + }, + { + "epoch": 17.902936378466556, + "grad_norm": 0.0008944774162955582, + "learning_rate": 3.312311230599491e-05, + "loss": 0.0031, + "num_input_tokens_seen": 237071472, + "step": 109745 + }, + { + "epoch": 17.903752039151712, + "grad_norm": 0.00048781235818751156, + "learning_rate": 3.3097640637218654e-05, + "loss": 0.001, + "num_input_tokens_seen": 237082896, + "step": 109750 + }, + { + "epoch": 17.904567699836868, + "grad_norm": 0.07903767377138138, + "learning_rate": 3.307217843081123e-05, + "loss": 0.0026, + "num_input_tokens_seen": 237093776, + "step": 109755 + }, + { + "epoch": 17.905383360522023, + "grad_norm": 0.00035915974876843393, + "learning_rate": 3.3046725687288285e-05, + "loss": 0.0022, + "num_input_tokens_seen": 237103440, + "step": 109760 + }, + { + "epoch": 17.90619902120718, + "grad_norm": 0.002234160201624036, + "learning_rate": 3.302128240716595e-05, + "loss": 0.0088, + "num_input_tokens_seen": 237113008, + "step": 109765 + }, + { + "epoch": 17.90701468189233, + "grad_norm": 0.03307786211371422, + "learning_rate": 3.299584859095961e-05, + "loss": 0.0023, + "num_input_tokens_seen": 237124016, + "step": 109770 + }, + { + "epoch": 17.907830342577487, + "grad_norm": 0.0002506078453734517, + "learning_rate": 3.297042423918495e-05, + "loss": 0.0031, + "num_input_tokens_seen": 237135888, + "step": 109775 + }, + { + "epoch": 17.908646003262643, + "grad_norm": 0.0012812531786039472, + "learning_rate": 3.2945009352357e-05, + "loss": 0.0009, + "num_input_tokens_seen": 237145968, + "step": 109780 + }, + { + "epoch": 17.9094616639478, + "grad_norm": 0.0002897421072702855, + "learning_rate": 3.291960393099108e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237155024, + "step": 109785 + }, + { + "epoch": 17.910277324632954, + "grad_norm": 0.0034674883354455233, + "learning_rate": 3.289420797560172e-05, + "loss": 0.0017, + "num_input_tokens_seen": 237165616, + "step": 109790 + }, + { + "epoch": 17.911092985318106, + "grad_norm": 0.018525205552577972, + "learning_rate": 3.2868821486704003e-05, + "loss": 0.007, + "num_input_tokens_seen": 237177200, + "step": 109795 + }, + { + "epoch": 17.911908646003262, + "grad_norm": 0.007207350339740515, + "learning_rate": 3.284344446481208e-05, + "loss": 0.0257, + "num_input_tokens_seen": 237188592, + "step": 109800 + }, + { + "epoch": 17.912724306688418, + "grad_norm": 0.0070750233717262745, + "learning_rate": 3.2818076910440476e-05, + "loss": 0.0028, + "num_input_tokens_seen": 237198320, + "step": 109805 + }, + { + "epoch": 17.913539967373573, + "grad_norm": 0.00534399040043354, + "learning_rate": 3.279271882410312e-05, + "loss": 0.001, + "num_input_tokens_seen": 237209616, + "step": 109810 + }, + { + "epoch": 17.91435562805873, + "grad_norm": 0.0017620498547330499, + "learning_rate": 3.27673702063141e-05, + "loss": 0.0007, + "num_input_tokens_seen": 237220720, + "step": 109815 + }, + { + "epoch": 17.91517128874388, + "grad_norm": 0.002240038476884365, + "learning_rate": 3.274203105758694e-05, + "loss": 0.0023, + "num_input_tokens_seen": 237231440, + "step": 109820 + }, + { + "epoch": 17.915986949429037, + "grad_norm": 0.0004176609800197184, + "learning_rate": 3.2716701378435355e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237242640, + "step": 109825 + }, + { + "epoch": 17.916802610114193, + "grad_norm": 0.004200051072984934, + "learning_rate": 3.269138116937259e-05, + "loss": 0.0016, + "num_input_tokens_seen": 237253744, + "step": 109830 + }, + { + "epoch": 17.91761827079935, + "grad_norm": 0.00025301595451310277, + "learning_rate": 3.2666070430911796e-05, + "loss": 0.0011, + "num_input_tokens_seen": 237265168, + "step": 109835 + }, + { + "epoch": 17.918433931484504, + "grad_norm": 0.0005692046834155917, + "learning_rate": 3.264076916356601e-05, + "loss": 0.0013, + "num_input_tokens_seen": 237276336, + "step": 109840 + }, + { + "epoch": 17.919249592169656, + "grad_norm": 0.004916089586913586, + "learning_rate": 3.2615477367847866e-05, + "loss": 0.0014, + "num_input_tokens_seen": 237288784, + "step": 109845 + }, + { + "epoch": 17.920065252854812, + "grad_norm": 0.001998396823182702, + "learning_rate": 3.2590195044269965e-05, + "loss": 0.0039, + "num_input_tokens_seen": 237300112, + "step": 109850 + }, + { + "epoch": 17.920880913539968, + "grad_norm": 0.002172433538362384, + "learning_rate": 3.256492219334478e-05, + "loss": 0.0048, + "num_input_tokens_seen": 237311248, + "step": 109855 + }, + { + "epoch": 17.921696574225123, + "grad_norm": 0.022449221462011337, + "learning_rate": 3.2539658815584404e-05, + "loss": 0.0008, + "num_input_tokens_seen": 237321552, + "step": 109860 + }, + { + "epoch": 17.92251223491028, + "grad_norm": 0.003850628389045596, + "learning_rate": 3.2514404911500814e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237331248, + "step": 109865 + }, + { + "epoch": 17.92332789559543, + "grad_norm": 0.46473076939582825, + "learning_rate": 3.248916048160588e-05, + "loss": 0.028, + "num_input_tokens_seen": 237342096, + "step": 109870 + }, + { + "epoch": 17.924143556280587, + "grad_norm": 0.0005388594581745565, + "learning_rate": 3.246392552641125e-05, + "loss": 0.0047, + "num_input_tokens_seen": 237352048, + "step": 109875 + }, + { + "epoch": 17.924959216965743, + "grad_norm": 0.0011852516327053308, + "learning_rate": 3.2438700046428185e-05, + "loss": 0.0022, + "num_input_tokens_seen": 237362992, + "step": 109880 + }, + { + "epoch": 17.9257748776509, + "grad_norm": 0.0035580755211412907, + "learning_rate": 3.2413484042167984e-05, + "loss": 0.0362, + "num_input_tokens_seen": 237373392, + "step": 109885 + }, + { + "epoch": 17.92659053833605, + "grad_norm": 0.0026270966045558453, + "learning_rate": 3.2388277514141864e-05, + "loss": 0.0005, + "num_input_tokens_seen": 237384432, + "step": 109890 + }, + { + "epoch": 17.927406199021206, + "grad_norm": 0.04227229207754135, + "learning_rate": 3.236308046286035e-05, + "loss": 0.0021, + "num_input_tokens_seen": 237395344, + "step": 109895 + }, + { + "epoch": 17.928221859706362, + "grad_norm": 0.194888174533844, + "learning_rate": 3.2337892888834375e-05, + "loss": 0.0175, + "num_input_tokens_seen": 237406192, + "step": 109900 + }, + { + "epoch": 17.929037520391518, + "grad_norm": 0.000787916244007647, + "learning_rate": 3.231271479257414e-05, + "loss": 0.0025, + "num_input_tokens_seen": 237416880, + "step": 109905 + }, + { + "epoch": 17.929853181076673, + "grad_norm": 0.00029855401953682303, + "learning_rate": 3.228754617459023e-05, + "loss": 0.0002, + "num_input_tokens_seen": 237427440, + "step": 109910 + }, + { + "epoch": 17.930668841761825, + "grad_norm": 0.01982448808848858, + "learning_rate": 3.2262387035392305e-05, + "loss": 0.0045, + "num_input_tokens_seen": 237437328, + "step": 109915 + }, + { + "epoch": 17.93148450244698, + "grad_norm": 0.019642792642116547, + "learning_rate": 3.2237237375490666e-05, + "loss": 0.0016, + "num_input_tokens_seen": 237448048, + "step": 109920 + }, + { + "epoch": 17.932300163132137, + "grad_norm": 0.0037098608445376158, + "learning_rate": 3.221209719539469e-05, + "loss": 0.0048, + "num_input_tokens_seen": 237459376, + "step": 109925 + }, + { + "epoch": 17.933115823817293, + "grad_norm": 0.0006825768505223095, + "learning_rate": 3.218696649561409e-05, + "loss": 0.1397, + "num_input_tokens_seen": 237470384, + "step": 109930 + }, + { + "epoch": 17.93393148450245, + "grad_norm": 0.0034533455036580563, + "learning_rate": 3.2161845276658e-05, + "loss": 0.0005, + "num_input_tokens_seen": 237480784, + "step": 109935 + }, + { + "epoch": 17.9347471451876, + "grad_norm": 0.00120734260417521, + "learning_rate": 3.213673353903568e-05, + "loss": 0.0007, + "num_input_tokens_seen": 237490864, + "step": 109940 + }, + { + "epoch": 17.935562805872756, + "grad_norm": 0.009501025080680847, + "learning_rate": 3.211163128325589e-05, + "loss": 0.0009, + "num_input_tokens_seen": 237502352, + "step": 109945 + }, + { + "epoch": 17.936378466557912, + "grad_norm": 0.0702584832906723, + "learning_rate": 3.208653850982746e-05, + "loss": 0.0028, + "num_input_tokens_seen": 237512432, + "step": 109950 + }, + { + "epoch": 17.937194127243067, + "grad_norm": 0.0038338962476700544, + "learning_rate": 3.206145521925896e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237521744, + "step": 109955 + }, + { + "epoch": 17.938009787928223, + "grad_norm": 0.16808141767978668, + "learning_rate": 3.2036381412058725e-05, + "loss": 0.0044, + "num_input_tokens_seen": 237531920, + "step": 109960 + }, + { + "epoch": 17.938825448613375, + "grad_norm": 0.006209705490618944, + "learning_rate": 3.2011317088734836e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237542640, + "step": 109965 + }, + { + "epoch": 17.93964110929853, + "grad_norm": 0.0027591967955231667, + "learning_rate": 3.1986262249795286e-05, + "loss": 0.0048, + "num_input_tokens_seen": 237552976, + "step": 109970 + }, + { + "epoch": 17.940456769983687, + "grad_norm": 0.10435988754034042, + "learning_rate": 3.196121689574782e-05, + "loss": 0.0028, + "num_input_tokens_seen": 237563408, + "step": 109975 + }, + { + "epoch": 17.941272430668842, + "grad_norm": 0.00376000814139843, + "learning_rate": 3.193618102710011e-05, + "loss": 0.0009, + "num_input_tokens_seen": 237574000, + "step": 109980 + }, + { + "epoch": 17.942088091353998, + "grad_norm": 0.6628631949424744, + "learning_rate": 3.191115464435945e-05, + "loss": 0.0781, + "num_input_tokens_seen": 237584976, + "step": 109985 + }, + { + "epoch": 17.94290375203915, + "grad_norm": 0.0003267792926635593, + "learning_rate": 3.188613774803306e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237595600, + "step": 109990 + }, + { + "epoch": 17.943719412724306, + "grad_norm": 0.0014076323714107275, + "learning_rate": 3.186113033862792e-05, + "loss": 0.0007, + "num_input_tokens_seen": 237606512, + "step": 109995 + }, + { + "epoch": 17.94453507340946, + "grad_norm": 0.002126255538314581, + "learning_rate": 3.1836132416650844e-05, + "loss": 0.0062, + "num_input_tokens_seen": 237616816, + "step": 110000 + }, + { + "epoch": 17.945350734094617, + "grad_norm": 0.004582113586366177, + "learning_rate": 3.1811143982608426e-05, + "loss": 0.0024, + "num_input_tokens_seen": 237627920, + "step": 110005 + }, + { + "epoch": 17.946166394779773, + "grad_norm": 0.004174842499196529, + "learning_rate": 3.1786165037007156e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237638800, + "step": 110010 + }, + { + "epoch": 17.946982055464925, + "grad_norm": 0.0007544121472164989, + "learning_rate": 3.176119558035323e-05, + "loss": 0.0008, + "num_input_tokens_seen": 237648304, + "step": 110015 + }, + { + "epoch": 17.94779771615008, + "grad_norm": 0.008781618438661098, + "learning_rate": 3.173623561315259e-05, + "loss": 0.0008, + "num_input_tokens_seen": 237659984, + "step": 110020 + }, + { + "epoch": 17.948613376835237, + "grad_norm": 0.007624879479408264, + "learning_rate": 3.171128513591132e-05, + "loss": 0.001, + "num_input_tokens_seen": 237670832, + "step": 110025 + }, + { + "epoch": 17.949429037520392, + "grad_norm": 0.00031044858042150736, + "learning_rate": 3.1686344149134735e-05, + "loss": 0.0034, + "num_input_tokens_seen": 237682672, + "step": 110030 + }, + { + "epoch": 17.950244698205548, + "grad_norm": 0.0014524642610922456, + "learning_rate": 3.1661412653328724e-05, + "loss": 0.0007, + "num_input_tokens_seen": 237694000, + "step": 110035 + }, + { + "epoch": 17.9510603588907, + "grad_norm": 0.0019769843202084303, + "learning_rate": 3.1636490648998095e-05, + "loss": 0.0026, + "num_input_tokens_seen": 237704336, + "step": 110040 + }, + { + "epoch": 17.951876019575856, + "grad_norm": 0.0006612506695091724, + "learning_rate": 3.1611578136648336e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237715216, + "step": 110045 + }, + { + "epoch": 17.95269168026101, + "grad_norm": 0.003691247198730707, + "learning_rate": 3.158667511678393e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237725744, + "step": 110050 + }, + { + "epoch": 17.953507340946167, + "grad_norm": 0.02169613167643547, + "learning_rate": 3.156178158990991e-05, + "loss": 0.0015, + "num_input_tokens_seen": 237736688, + "step": 110055 + }, + { + "epoch": 17.954323001631323, + "grad_norm": 0.004092794377356768, + "learning_rate": 3.153689755653061e-05, + "loss": 0.0095, + "num_input_tokens_seen": 237746640, + "step": 110060 + }, + { + "epoch": 17.955138662316475, + "grad_norm": 0.004916083998978138, + "learning_rate": 3.151202301715034e-05, + "loss": 0.0045, + "num_input_tokens_seen": 237757488, + "step": 110065 + }, + { + "epoch": 17.95595432300163, + "grad_norm": 0.005175785627216101, + "learning_rate": 3.148715797227331e-05, + "loss": 0.0021, + "num_input_tokens_seen": 237768016, + "step": 110070 + }, + { + "epoch": 17.956769983686787, + "grad_norm": 0.000567434064578265, + "learning_rate": 3.1462302422403334e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237779792, + "step": 110075 + }, + { + "epoch": 17.957585644371942, + "grad_norm": 0.0003794727090280503, + "learning_rate": 3.143745636804418e-05, + "loss": 0.0146, + "num_input_tokens_seen": 237790640, + "step": 110080 + }, + { + "epoch": 17.958401305057095, + "grad_norm": 0.00449990713968873, + "learning_rate": 3.14126198096994e-05, + "loss": 0.0022, + "num_input_tokens_seen": 237800240, + "step": 110085 + }, + { + "epoch": 17.95921696574225, + "grad_norm": 0.0035965435672551394, + "learning_rate": 3.138779274787235e-05, + "loss": 0.0005, + "num_input_tokens_seen": 237811216, + "step": 110090 + }, + { + "epoch": 17.960032626427406, + "grad_norm": 0.0003991451230831444, + "learning_rate": 3.136297518306614e-05, + "loss": 0.0008, + "num_input_tokens_seen": 237820528, + "step": 110095 + }, + { + "epoch": 17.96084828711256, + "grad_norm": 0.003139512613415718, + "learning_rate": 3.133816711578369e-05, + "loss": 0.0007, + "num_input_tokens_seen": 237831504, + "step": 110100 + }, + { + "epoch": 17.961663947797717, + "grad_norm": 0.00021856573584955186, + "learning_rate": 3.131336854652789e-05, + "loss": 0.0894, + "num_input_tokens_seen": 237842224, + "step": 110105 + }, + { + "epoch": 17.96247960848287, + "grad_norm": 0.0007067588157951832, + "learning_rate": 3.1288579475801215e-05, + "loss": 0.0012, + "num_input_tokens_seen": 237853680, + "step": 110110 + }, + { + "epoch": 17.963295269168025, + "grad_norm": 0.057319898158311844, + "learning_rate": 3.12637999041061e-05, + "loss": 0.0043, + "num_input_tokens_seen": 237863856, + "step": 110115 + }, + { + "epoch": 17.96411092985318, + "grad_norm": 0.027158288285136223, + "learning_rate": 3.123902983194471e-05, + "loss": 0.0013, + "num_input_tokens_seen": 237874800, + "step": 110120 + }, + { + "epoch": 17.964926590538337, + "grad_norm": 0.007293624337762594, + "learning_rate": 3.1214269259819014e-05, + "loss": 0.0047, + "num_input_tokens_seen": 237885616, + "step": 110125 + }, + { + "epoch": 17.965742251223492, + "grad_norm": 0.00030968463397584856, + "learning_rate": 3.11895181882309e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237896784, + "step": 110130 + }, + { + "epoch": 17.966557911908644, + "grad_norm": 0.044866591691970825, + "learning_rate": 3.116477661768191e-05, + "loss": 0.0017, + "num_input_tokens_seen": 237906768, + "step": 110135 + }, + { + "epoch": 17.9673735725938, + "grad_norm": 0.3906330466270447, + "learning_rate": 3.1140044548673476e-05, + "loss": 0.0076, + "num_input_tokens_seen": 237917456, + "step": 110140 + }, + { + "epoch": 17.968189233278956, + "grad_norm": 0.0008933874778449535, + "learning_rate": 3.11153219817068e-05, + "loss": 0.0012, + "num_input_tokens_seen": 237928848, + "step": 110145 + }, + { + "epoch": 17.96900489396411, + "grad_norm": 0.008592470549046993, + "learning_rate": 3.109060891728299e-05, + "loss": 0.0007, + "num_input_tokens_seen": 237938960, + "step": 110150 + }, + { + "epoch": 17.969820554649267, + "grad_norm": 0.008075353689491749, + "learning_rate": 3.1065905355902865e-05, + "loss": 0.0023, + "num_input_tokens_seen": 237949680, + "step": 110155 + }, + { + "epoch": 17.97063621533442, + "grad_norm": 0.005265532527118921, + "learning_rate": 3.104121129806697e-05, + "loss": 0.0007, + "num_input_tokens_seen": 237961424, + "step": 110160 + }, + { + "epoch": 17.971451876019575, + "grad_norm": 0.0018472732044756413, + "learning_rate": 3.101652674427585e-05, + "loss": 0.0005, + "num_input_tokens_seen": 237973232, + "step": 110165 + }, + { + "epoch": 17.97226753670473, + "grad_norm": 0.0158048328012228, + "learning_rate": 3.0991851695029825e-05, + "loss": 0.0064, + "num_input_tokens_seen": 237985296, + "step": 110170 + }, + { + "epoch": 17.973083197389887, + "grad_norm": 0.000548655865713954, + "learning_rate": 3.0967186150828886e-05, + "loss": 0.0014, + "num_input_tokens_seen": 237994096, + "step": 110175 + }, + { + "epoch": 17.973898858075042, + "grad_norm": 0.028464488685131073, + "learning_rate": 3.0942530112172905e-05, + "loss": 0.0023, + "num_input_tokens_seen": 238005392, + "step": 110180 + }, + { + "epoch": 17.974714518760194, + "grad_norm": 0.011310449801385403, + "learning_rate": 3.0917883579561604e-05, + "loss": 0.0013, + "num_input_tokens_seen": 238017072, + "step": 110185 + }, + { + "epoch": 17.97553017944535, + "grad_norm": 0.012373429723083973, + "learning_rate": 3.0893246553494516e-05, + "loss": 0.0015, + "num_input_tokens_seen": 238028112, + "step": 110190 + }, + { + "epoch": 17.976345840130506, + "grad_norm": 0.0010668218601495028, + "learning_rate": 3.08686190344708e-05, + "loss": 0.0271, + "num_input_tokens_seen": 238040240, + "step": 110195 + }, + { + "epoch": 17.97716150081566, + "grad_norm": 0.007802395615726709, + "learning_rate": 3.084400102298973e-05, + "loss": 0.0772, + "num_input_tokens_seen": 238051408, + "step": 110200 + }, + { + "epoch": 17.977977161500817, + "grad_norm": 0.011837205849587917, + "learning_rate": 3.0819392519550125e-05, + "loss": 0.001, + "num_input_tokens_seen": 238062160, + "step": 110205 + }, + { + "epoch": 17.97879282218597, + "grad_norm": 0.002454120898619294, + "learning_rate": 3.079479352465076e-05, + "loss": 0.0048, + "num_input_tokens_seen": 238073520, + "step": 110210 + }, + { + "epoch": 17.979608482871125, + "grad_norm": 0.0006663525127805769, + "learning_rate": 3.077020403879005e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238083952, + "step": 110215 + }, + { + "epoch": 17.98042414355628, + "grad_norm": 0.0008392453892156482, + "learning_rate": 3.07456240624665e-05, + "loss": 0.0012, + "num_input_tokens_seen": 238095120, + "step": 110220 + }, + { + "epoch": 17.981239804241437, + "grad_norm": 0.008441498503088951, + "learning_rate": 3.072105359617811e-05, + "loss": 0.0011, + "num_input_tokens_seen": 238105072, + "step": 110225 + }, + { + "epoch": 17.982055464926592, + "grad_norm": 0.000342810177244246, + "learning_rate": 3.0696492640422954e-05, + "loss": 0.0007, + "num_input_tokens_seen": 238114928, + "step": 110230 + }, + { + "epoch": 17.982871125611744, + "grad_norm": 0.0003783302381634712, + "learning_rate": 3.067194119569866e-05, + "loss": 0.001, + "num_input_tokens_seen": 238124944, + "step": 110235 + }, + { + "epoch": 17.9836867862969, + "grad_norm": 0.16300013661384583, + "learning_rate": 3.064739926250293e-05, + "loss": 0.0079, + "num_input_tokens_seen": 238135888, + "step": 110240 + }, + { + "epoch": 17.984502446982056, + "grad_norm": 0.0008707176893949509, + "learning_rate": 3.062286684133303e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238146448, + "step": 110245 + }, + { + "epoch": 17.98531810766721, + "grad_norm": 0.008672765456140041, + "learning_rate": 3.059834393268618e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238156976, + "step": 110250 + }, + { + "epoch": 17.986133768352367, + "grad_norm": 0.018309568986296654, + "learning_rate": 3.057383053705937e-05, + "loss": 0.0082, + "num_input_tokens_seen": 238168528, + "step": 110255 + }, + { + "epoch": 17.98694942903752, + "grad_norm": 0.030882669612765312, + "learning_rate": 3.054932665494936e-05, + "loss": 0.0028, + "num_input_tokens_seen": 238179952, + "step": 110260 + }, + { + "epoch": 17.987765089722675, + "grad_norm": 0.0036772945895791054, + "learning_rate": 3.052483228685282e-05, + "loss": 0.0029, + "num_input_tokens_seen": 238191312, + "step": 110265 + }, + { + "epoch": 17.98858075040783, + "grad_norm": 0.008443798869848251, + "learning_rate": 3.050034743326613e-05, + "loss": 0.0008, + "num_input_tokens_seen": 238203088, + "step": 110270 + }, + { + "epoch": 17.989396411092986, + "grad_norm": 0.0029752785339951515, + "learning_rate": 3.0475872094685443e-05, + "loss": 0.0004, + "num_input_tokens_seen": 238213968, + "step": 110275 + }, + { + "epoch": 17.99021207177814, + "grad_norm": 0.005904734134674072, + "learning_rate": 3.0451406271606974e-05, + "loss": 0.0012, + "num_input_tokens_seen": 238224752, + "step": 110280 + }, + { + "epoch": 17.991027732463294, + "grad_norm": 0.0010160219389945269, + "learning_rate": 3.0426949964526272e-05, + "loss": 0.0028, + "num_input_tokens_seen": 238235536, + "step": 110285 + }, + { + "epoch": 17.99184339314845, + "grad_norm": 0.12286140024662018, + "learning_rate": 3.0402503173939277e-05, + "loss": 0.002, + "num_input_tokens_seen": 238246448, + "step": 110290 + }, + { + "epoch": 17.992659053833606, + "grad_norm": 0.002489682286977768, + "learning_rate": 3.0378065900341146e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238257744, + "step": 110295 + }, + { + "epoch": 17.99347471451876, + "grad_norm": 0.0010223733261227608, + "learning_rate": 3.035363814422737e-05, + "loss": 0.0135, + "num_input_tokens_seen": 238267952, + "step": 110300 + }, + { + "epoch": 17.994290375203914, + "grad_norm": 0.00035545893479138613, + "learning_rate": 3.0329219906092776e-05, + "loss": 0.0017, + "num_input_tokens_seen": 238278288, + "step": 110305 + }, + { + "epoch": 17.99510603588907, + "grad_norm": 0.0034552181605249643, + "learning_rate": 3.030481118643247e-05, + "loss": 0.0009, + "num_input_tokens_seen": 238288752, + "step": 110310 + }, + { + "epoch": 17.995921696574225, + "grad_norm": 0.0017270646058022976, + "learning_rate": 3.0280411985740995e-05, + "loss": 0.0008, + "num_input_tokens_seen": 238299600, + "step": 110315 + }, + { + "epoch": 17.99673735725938, + "grad_norm": 0.0005409326404333115, + "learning_rate": 3.0256022304512854e-05, + "loss": 0.0037, + "num_input_tokens_seen": 238310128, + "step": 110320 + }, + { + "epoch": 17.997553017944536, + "grad_norm": 0.05676782503724098, + "learning_rate": 3.023164214324231e-05, + "loss": 0.0019, + "num_input_tokens_seen": 238321808, + "step": 110325 + }, + { + "epoch": 17.99836867862969, + "grad_norm": 0.0008575510582886636, + "learning_rate": 3.0207271502423527e-05, + "loss": 0.0053, + "num_input_tokens_seen": 238332176, + "step": 110330 + }, + { + "epoch": 17.999184339314844, + "grad_norm": 0.00040171988075599074, + "learning_rate": 3.018291038255033e-05, + "loss": 0.0031, + "num_input_tokens_seen": 238343280, + "step": 110335 + }, + { + "epoch": 18.0, + "grad_norm": 0.0528254434466362, + "learning_rate": 3.0158558784116442e-05, + "loss": 0.0617, + "num_input_tokens_seen": 238352272, + "step": 110340 + }, + { + "epoch": 18.0, + "eval_loss": 0.32154321670532227, + "eval_runtime": 103.778, + "eval_samples_per_second": 26.258, + "eval_steps_per_second": 6.572, + "num_input_tokens_seen": 238352272, + "step": 110340 + }, + { + "epoch": 18.000815660685156, + "grad_norm": 0.0019651330076158047, + "learning_rate": 3.0134216707615404e-05, + "loss": 0.0008, + "num_input_tokens_seen": 238364784, + "step": 110345 + }, + { + "epoch": 18.00163132137031, + "grad_norm": 0.0010777993593364954, + "learning_rate": 3.0109884153540545e-05, + "loss": 0.0012, + "num_input_tokens_seen": 238372880, + "step": 110350 + }, + { + "epoch": 18.002446982055464, + "grad_norm": 0.021584536880254745, + "learning_rate": 3.0085561122384974e-05, + "loss": 0.0026, + "num_input_tokens_seen": 238383504, + "step": 110355 + }, + { + "epoch": 18.00326264274062, + "grad_norm": 0.004007100127637386, + "learning_rate": 3.0061247614641684e-05, + "loss": 0.0026, + "num_input_tokens_seen": 238394192, + "step": 110360 + }, + { + "epoch": 18.004078303425775, + "grad_norm": 0.0018758628284558654, + "learning_rate": 3.0036943630803282e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238405584, + "step": 110365 + }, + { + "epoch": 18.00489396411093, + "grad_norm": 0.0010444171493873, + "learning_rate": 3.0012649171362482e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238417232, + "step": 110370 + }, + { + "epoch": 18.005709624796086, + "grad_norm": 0.0003816418757196516, + "learning_rate": 2.998836423681156e-05, + "loss": 0.0297, + "num_input_tokens_seen": 238429552, + "step": 110375 + }, + { + "epoch": 18.00652528548124, + "grad_norm": 0.009662347845733166, + "learning_rate": 2.9964088827642564e-05, + "loss": 0.0078, + "num_input_tokens_seen": 238440304, + "step": 110380 + }, + { + "epoch": 18.007340946166394, + "grad_norm": 0.027501266449689865, + "learning_rate": 2.993982294434777e-05, + "loss": 0.0017, + "num_input_tokens_seen": 238449552, + "step": 110385 + }, + { + "epoch": 18.00815660685155, + "grad_norm": 0.033807143568992615, + "learning_rate": 2.991556658741862e-05, + "loss": 0.0016, + "num_input_tokens_seen": 238460944, + "step": 110390 + }, + { + "epoch": 18.008972267536706, + "grad_norm": 0.0020302990451455116, + "learning_rate": 2.9891319757347047e-05, + "loss": 0.0099, + "num_input_tokens_seen": 238471568, + "step": 110395 + }, + { + "epoch": 18.00978792822186, + "grad_norm": 0.0008874621125869453, + "learning_rate": 2.986708245462405e-05, + "loss": 0.0012, + "num_input_tokens_seen": 238482128, + "step": 110400 + }, + { + "epoch": 18.010603588907014, + "grad_norm": 0.0006465526530519128, + "learning_rate": 2.984285467974124e-05, + "loss": 0.0018, + "num_input_tokens_seen": 238494288, + "step": 110405 + }, + { + "epoch": 18.01141924959217, + "grad_norm": 0.014971431344747543, + "learning_rate": 2.981863643318922e-05, + "loss": 0.0009, + "num_input_tokens_seen": 238505104, + "step": 110410 + }, + { + "epoch": 18.012234910277325, + "grad_norm": 0.003481280989944935, + "learning_rate": 2.979442771545915e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238516144, + "step": 110415 + }, + { + "epoch": 18.01305057096248, + "grad_norm": 0.007337215356528759, + "learning_rate": 2.9770228527041364e-05, + "loss": 0.0008, + "num_input_tokens_seen": 238528496, + "step": 110420 + }, + { + "epoch": 18.013866231647636, + "grad_norm": 0.01175409834831953, + "learning_rate": 2.9746038868426584e-05, + "loss": 0.0011, + "num_input_tokens_seen": 238538448, + "step": 110425 + }, + { + "epoch": 18.01468189233279, + "grad_norm": 0.0015376220690086484, + "learning_rate": 2.9721858740104747e-05, + "loss": 0.0004, + "num_input_tokens_seen": 238548432, + "step": 110430 + }, + { + "epoch": 18.015497553017944, + "grad_norm": 0.0022492543794214725, + "learning_rate": 2.9697688142566127e-05, + "loss": 0.0012, + "num_input_tokens_seen": 238557456, + "step": 110435 + }, + { + "epoch": 18.0163132137031, + "grad_norm": 0.012653055600821972, + "learning_rate": 2.967352707630039e-05, + "loss": 0.0024, + "num_input_tokens_seen": 238568464, + "step": 110440 + }, + { + "epoch": 18.017128874388256, + "grad_norm": 0.0003922838077414781, + "learning_rate": 2.9649375541797418e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238580464, + "step": 110445 + }, + { + "epoch": 18.017944535073408, + "grad_norm": 0.001494093332439661, + "learning_rate": 2.9625233539546326e-05, + "loss": 0.0482, + "num_input_tokens_seen": 238590864, + "step": 110450 + }, + { + "epoch": 18.018760195758563, + "grad_norm": 0.005561790894716978, + "learning_rate": 2.960110107003672e-05, + "loss": 0.0007, + "num_input_tokens_seen": 238603024, + "step": 110455 + }, + { + "epoch": 18.01957585644372, + "grad_norm": 0.0008520457777194679, + "learning_rate": 2.9576978133757536e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238614320, + "step": 110460 + }, + { + "epoch": 18.020391517128875, + "grad_norm": 0.7163333892822266, + "learning_rate": 2.955286473119767e-05, + "loss": 0.0741, + "num_input_tokens_seen": 238624720, + "step": 110465 + }, + { + "epoch": 18.02120717781403, + "grad_norm": 0.03220055624842644, + "learning_rate": 2.9528760862845783e-05, + "loss": 0.0014, + "num_input_tokens_seen": 238636848, + "step": 110470 + }, + { + "epoch": 18.022022838499183, + "grad_norm": 0.00888325646519661, + "learning_rate": 2.9504666529190426e-05, + "loss": 0.001, + "num_input_tokens_seen": 238648336, + "step": 110475 + }, + { + "epoch": 18.02283849918434, + "grad_norm": 0.007564366329461336, + "learning_rate": 2.9480581730719825e-05, + "loss": 0.0012, + "num_input_tokens_seen": 238658224, + "step": 110480 + }, + { + "epoch": 18.023654159869494, + "grad_norm": 0.0019379006698727608, + "learning_rate": 2.945650646792214e-05, + "loss": 0.0012, + "num_input_tokens_seen": 238669744, + "step": 110485 + }, + { + "epoch": 18.02446982055465, + "grad_norm": 0.0025463791098445654, + "learning_rate": 2.9432440741285314e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238680720, + "step": 110490 + }, + { + "epoch": 18.025285481239806, + "grad_norm": 0.38154155015945435, + "learning_rate": 2.940838455129696e-05, + "loss": 0.0078, + "num_input_tokens_seen": 238691504, + "step": 110495 + }, + { + "epoch": 18.026101141924958, + "grad_norm": 0.044939037412405014, + "learning_rate": 2.9384337898444747e-05, + "loss": 0.0017, + "num_input_tokens_seen": 238702160, + "step": 110500 + }, + { + "epoch": 18.026916802610113, + "grad_norm": 0.0029059057123959064, + "learning_rate": 2.9360300783215832e-05, + "loss": 0.0011, + "num_input_tokens_seen": 238712208, + "step": 110505 + }, + { + "epoch": 18.02773246329527, + "grad_norm": 0.007650961168110371, + "learning_rate": 2.9336273206097663e-05, + "loss": 0.0041, + "num_input_tokens_seen": 238721840, + "step": 110510 + }, + { + "epoch": 18.028548123980425, + "grad_norm": 0.0004907246329821646, + "learning_rate": 2.931225516757685e-05, + "loss": 0.0018, + "num_input_tokens_seen": 238732784, + "step": 110515 + }, + { + "epoch": 18.02936378466558, + "grad_norm": 0.006268959492444992, + "learning_rate": 2.9288246668140396e-05, + "loss": 0.0015, + "num_input_tokens_seen": 238743600, + "step": 110520 + }, + { + "epoch": 18.030179445350733, + "grad_norm": 0.216169536113739, + "learning_rate": 2.9264247708274628e-05, + "loss": 0.004, + "num_input_tokens_seen": 238754576, + "step": 110525 + }, + { + "epoch": 18.03099510603589, + "grad_norm": 0.0017121587879955769, + "learning_rate": 2.9240258288466215e-05, + "loss": 0.0018, + "num_input_tokens_seen": 238765680, + "step": 110530 + }, + { + "epoch": 18.031810766721044, + "grad_norm": 0.0071411821991205215, + "learning_rate": 2.921627840920099e-05, + "loss": 0.0022, + "num_input_tokens_seen": 238776464, + "step": 110535 + }, + { + "epoch": 18.0326264274062, + "grad_norm": 0.03119852766394615, + "learning_rate": 2.919230807096529e-05, + "loss": 0.0019, + "num_input_tokens_seen": 238787824, + "step": 110540 + }, + { + "epoch": 18.033442088091356, + "grad_norm": 0.003736414248123765, + "learning_rate": 2.916834727424461e-05, + "loss": 0.0021, + "num_input_tokens_seen": 238797872, + "step": 110545 + }, + { + "epoch": 18.034257748776508, + "grad_norm": 0.002718925941735506, + "learning_rate": 2.9144396019524788e-05, + "loss": 0.0017, + "num_input_tokens_seen": 238807248, + "step": 110550 + }, + { + "epoch": 18.035073409461663, + "grad_norm": 0.008032168261706829, + "learning_rate": 2.9120454307290933e-05, + "loss": 0.0566, + "num_input_tokens_seen": 238818256, + "step": 110555 + }, + { + "epoch": 18.03588907014682, + "grad_norm": 0.027552763000130653, + "learning_rate": 2.90965221380286e-05, + "loss": 0.0014, + "num_input_tokens_seen": 238828688, + "step": 110560 + }, + { + "epoch": 18.036704730831975, + "grad_norm": 0.040653783828020096, + "learning_rate": 2.9072599512222464e-05, + "loss": 0.052, + "num_input_tokens_seen": 238839696, + "step": 110565 + }, + { + "epoch": 18.03752039151713, + "grad_norm": 0.008090752176940441, + "learning_rate": 2.9048686430357685e-05, + "loss": 0.0017, + "num_input_tokens_seen": 238851440, + "step": 110570 + }, + { + "epoch": 18.038336052202283, + "grad_norm": 0.0007834371645003557, + "learning_rate": 2.9024782892918543e-05, + "loss": 0.0021, + "num_input_tokens_seen": 238862288, + "step": 110575 + }, + { + "epoch": 18.03915171288744, + "grad_norm": 0.00212163757532835, + "learning_rate": 2.9000888900389764e-05, + "loss": 0.001, + "num_input_tokens_seen": 238873776, + "step": 110580 + }, + { + "epoch": 18.039967373572594, + "grad_norm": 0.0008154436945915222, + "learning_rate": 2.8977004453255406e-05, + "loss": 0.0022, + "num_input_tokens_seen": 238884720, + "step": 110585 + }, + { + "epoch": 18.04078303425775, + "grad_norm": 0.0016963942907750607, + "learning_rate": 2.8953129551999634e-05, + "loss": 0.0009, + "num_input_tokens_seen": 238894320, + "step": 110590 + }, + { + "epoch": 18.041598694942905, + "grad_norm": 0.011484961025416851, + "learning_rate": 2.892926419710623e-05, + "loss": 0.0019, + "num_input_tokens_seen": 238903920, + "step": 110595 + }, + { + "epoch": 18.042414355628058, + "grad_norm": 0.03104168362915516, + "learning_rate": 2.8905408389058917e-05, + "loss": 0.0031, + "num_input_tokens_seen": 238914896, + "step": 110600 + }, + { + "epoch": 18.043230016313213, + "grad_norm": 0.0025729406625032425, + "learning_rate": 2.8881562128341088e-05, + "loss": 0.0012, + "num_input_tokens_seen": 238925552, + "step": 110605 + }, + { + "epoch": 18.04404567699837, + "grad_norm": 0.002775913570076227, + "learning_rate": 2.885772541543613e-05, + "loss": 0.0008, + "num_input_tokens_seen": 238936784, + "step": 110610 + }, + { + "epoch": 18.044861337683525, + "grad_norm": 0.001294884947128594, + "learning_rate": 2.8833898250826994e-05, + "loss": 0.0014, + "num_input_tokens_seen": 238948688, + "step": 110615 + }, + { + "epoch": 18.045676998368677, + "grad_norm": 0.0018858517287299037, + "learning_rate": 2.881008063499663e-05, + "loss": 0.0007, + "num_input_tokens_seen": 238959664, + "step": 110620 + }, + { + "epoch": 18.046492659053833, + "grad_norm": 0.0010302024893462658, + "learning_rate": 2.878627256842775e-05, + "loss": 0.001, + "num_input_tokens_seen": 238971440, + "step": 110625 + }, + { + "epoch": 18.04730831973899, + "grad_norm": 0.002369890222325921, + "learning_rate": 2.8762474051602816e-05, + "loss": 0.0009, + "num_input_tokens_seen": 238981552, + "step": 110630 + }, + { + "epoch": 18.048123980424144, + "grad_norm": 0.0025272388011217117, + "learning_rate": 2.8738685085004156e-05, + "loss": 0.0014, + "num_input_tokens_seen": 238993168, + "step": 110635 + }, + { + "epoch": 18.0489396411093, + "grad_norm": 0.0013844823697581887, + "learning_rate": 2.871490566911389e-05, + "loss": 0.0011, + "num_input_tokens_seen": 239004944, + "step": 110640 + }, + { + "epoch": 18.049755301794452, + "grad_norm": 0.0002905686560552567, + "learning_rate": 2.8691135804413905e-05, + "loss": 0.0005, + "num_input_tokens_seen": 239014320, + "step": 110645 + }, + { + "epoch": 18.050570962479608, + "grad_norm": 0.012505102902650833, + "learning_rate": 2.8667375491385928e-05, + "loss": 0.0049, + "num_input_tokens_seen": 239026064, + "step": 110650 + }, + { + "epoch": 18.051386623164763, + "grad_norm": 0.0537576824426651, + "learning_rate": 2.864362473051163e-05, + "loss": 0.0023, + "num_input_tokens_seen": 239037456, + "step": 110655 + }, + { + "epoch": 18.05220228384992, + "grad_norm": 0.0007542030070908368, + "learning_rate": 2.8619883522272072e-05, + "loss": 0.0016, + "num_input_tokens_seen": 239048976, + "step": 110660 + }, + { + "epoch": 18.053017944535075, + "grad_norm": 0.0008729331311769783, + "learning_rate": 2.85961518671487e-05, + "loss": 0.0006, + "num_input_tokens_seen": 239059216, + "step": 110665 + }, + { + "epoch": 18.053833605220227, + "grad_norm": 0.007356339134275913, + "learning_rate": 2.8572429765622243e-05, + "loss": 0.0016, + "num_input_tokens_seen": 239069648, + "step": 110670 + }, + { + "epoch": 18.054649265905383, + "grad_norm": 0.006332451477646828, + "learning_rate": 2.8548717218173647e-05, + "loss": 0.0405, + "num_input_tokens_seen": 239081744, + "step": 110675 + }, + { + "epoch": 18.05546492659054, + "grad_norm": 0.17264924943447113, + "learning_rate": 2.8525014225283195e-05, + "loss": 0.0091, + "num_input_tokens_seen": 239092144, + "step": 110680 + }, + { + "epoch": 18.056280587275694, + "grad_norm": 0.006838109809905291, + "learning_rate": 2.8501320787431673e-05, + "loss": 0.0019, + "num_input_tokens_seen": 239104016, + "step": 110685 + }, + { + "epoch": 18.05709624796085, + "grad_norm": 0.008148097433149815, + "learning_rate": 2.8477636905098802e-05, + "loss": 0.0021, + "num_input_tokens_seen": 239114576, + "step": 110690 + }, + { + "epoch": 18.057911908646002, + "grad_norm": 0.0005130280624143779, + "learning_rate": 2.845396257876487e-05, + "loss": 0.0039, + "num_input_tokens_seen": 239125264, + "step": 110695 + }, + { + "epoch": 18.058727569331158, + "grad_norm": 0.17097468674182892, + "learning_rate": 2.84302978089096e-05, + "loss": 0.0107, + "num_input_tokens_seen": 239136784, + "step": 110700 + }, + { + "epoch": 18.059543230016313, + "grad_norm": 0.000584763940423727, + "learning_rate": 2.840664259601261e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239146480, + "step": 110705 + }, + { + "epoch": 18.06035889070147, + "grad_norm": 0.036263592541217804, + "learning_rate": 2.838299694055324e-05, + "loss": 0.0317, + "num_input_tokens_seen": 239157840, + "step": 110710 + }, + { + "epoch": 18.061174551386625, + "grad_norm": 0.026984870433807373, + "learning_rate": 2.835936084301072e-05, + "loss": 0.0038, + "num_input_tokens_seen": 239167152, + "step": 110715 + }, + { + "epoch": 18.061990212071777, + "grad_norm": 0.06570431590080261, + "learning_rate": 2.8335734303864047e-05, + "loss": 0.0026, + "num_input_tokens_seen": 239177648, + "step": 110720 + }, + { + "epoch": 18.062805872756933, + "grad_norm": 0.004828798584640026, + "learning_rate": 2.8312117323592125e-05, + "loss": 0.0029, + "num_input_tokens_seen": 239188464, + "step": 110725 + }, + { + "epoch": 18.063621533442088, + "grad_norm": 0.000707502942532301, + "learning_rate": 2.8288509902673454e-05, + "loss": 0.0007, + "num_input_tokens_seen": 239198896, + "step": 110730 + }, + { + "epoch": 18.064437194127244, + "grad_norm": 0.0006841020658612251, + "learning_rate": 2.8264912041586598e-05, + "loss": 0.0017, + "num_input_tokens_seen": 239210640, + "step": 110735 + }, + { + "epoch": 18.0652528548124, + "grad_norm": 0.006858312990516424, + "learning_rate": 2.8241323740809676e-05, + "loss": 0.0015, + "num_input_tokens_seen": 239221360, + "step": 110740 + }, + { + "epoch": 18.06606851549755, + "grad_norm": 0.002047081710770726, + "learning_rate": 2.821774500082086e-05, + "loss": 0.0015, + "num_input_tokens_seen": 239231856, + "step": 110745 + }, + { + "epoch": 18.066884176182707, + "grad_norm": 0.001701177330687642, + "learning_rate": 2.819417582209788e-05, + "loss": 0.0015, + "num_input_tokens_seen": 239242384, + "step": 110750 + }, + { + "epoch": 18.067699836867863, + "grad_norm": 0.0004878344479948282, + "learning_rate": 2.8170616205118516e-05, + "loss": 0.0352, + "num_input_tokens_seen": 239253072, + "step": 110755 + }, + { + "epoch": 18.06851549755302, + "grad_norm": 0.0009230131399817765, + "learning_rate": 2.8147066150360167e-05, + "loss": 0.0023, + "num_input_tokens_seen": 239263760, + "step": 110760 + }, + { + "epoch": 18.069331158238175, + "grad_norm": 0.001374510582536459, + "learning_rate": 2.8123525658300066e-05, + "loss": 0.0307, + "num_input_tokens_seen": 239274288, + "step": 110765 + }, + { + "epoch": 18.070146818923327, + "grad_norm": 0.0009231427684426308, + "learning_rate": 2.8099994729415377e-05, + "loss": 0.108, + "num_input_tokens_seen": 239285776, + "step": 110770 + }, + { + "epoch": 18.070962479608482, + "grad_norm": 0.0004372471885289997, + "learning_rate": 2.8076473364182897e-05, + "loss": 0.0031, + "num_input_tokens_seen": 239295920, + "step": 110775 + }, + { + "epoch": 18.071778140293638, + "grad_norm": 0.0012657229090109468, + "learning_rate": 2.8052961563079403e-05, + "loss": 0.0009, + "num_input_tokens_seen": 239306800, + "step": 110780 + }, + { + "epoch": 18.072593800978794, + "grad_norm": 0.004266361240297556, + "learning_rate": 2.8029459326581353e-05, + "loss": 0.0014, + "num_input_tokens_seen": 239317488, + "step": 110785 + }, + { + "epoch": 18.07340946166395, + "grad_norm": 0.00956976879388094, + "learning_rate": 2.8005966655165026e-05, + "loss": 0.0014, + "num_input_tokens_seen": 239327312, + "step": 110790 + }, + { + "epoch": 18.0742251223491, + "grad_norm": 0.001034679007716477, + "learning_rate": 2.7982483549306435e-05, + "loss": 0.001, + "num_input_tokens_seen": 239338320, + "step": 110795 + }, + { + "epoch": 18.075040783034257, + "grad_norm": 0.0001628376339795068, + "learning_rate": 2.795901000948181e-05, + "loss": 0.0005, + "num_input_tokens_seen": 239347952, + "step": 110800 + }, + { + "epoch": 18.075856443719413, + "grad_norm": 0.0015392429195344448, + "learning_rate": 2.7935546036166548e-05, + "loss": 0.0013, + "num_input_tokens_seen": 239358384, + "step": 110805 + }, + { + "epoch": 18.07667210440457, + "grad_norm": 0.0050073969177901745, + "learning_rate": 2.7912091629836324e-05, + "loss": 0.0013, + "num_input_tokens_seen": 239369296, + "step": 110810 + }, + { + "epoch": 18.07748776508972, + "grad_norm": 0.0005766893737018108, + "learning_rate": 2.7888646790966476e-05, + "loss": 0.0057, + "num_input_tokens_seen": 239379344, + "step": 110815 + }, + { + "epoch": 18.078303425774877, + "grad_norm": 0.05712695047259331, + "learning_rate": 2.786521152003213e-05, + "loss": 0.0023, + "num_input_tokens_seen": 239390096, + "step": 110820 + }, + { + "epoch": 18.079119086460032, + "grad_norm": 0.0008328685071319342, + "learning_rate": 2.784178581750818e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239401328, + "step": 110825 + }, + { + "epoch": 18.079934747145188, + "grad_norm": 0.00018631898274179548, + "learning_rate": 2.781836968386947e-05, + "loss": 0.0012, + "num_input_tokens_seen": 239412688, + "step": 110830 + }, + { + "epoch": 18.080750407830344, + "grad_norm": 0.0006286951247602701, + "learning_rate": 2.7794963119590454e-05, + "loss": 0.0008, + "num_input_tokens_seen": 239422960, + "step": 110835 + }, + { + "epoch": 18.081566068515496, + "grad_norm": 0.003879460971802473, + "learning_rate": 2.7771566125145588e-05, + "loss": 0.0028, + "num_input_tokens_seen": 239434320, + "step": 110840 + }, + { + "epoch": 18.08238172920065, + "grad_norm": 0.0002762196818366647, + "learning_rate": 2.774817870100893e-05, + "loss": 0.0009, + "num_input_tokens_seen": 239445104, + "step": 110845 + }, + { + "epoch": 18.083197389885807, + "grad_norm": 0.0018938088323920965, + "learning_rate": 2.7724800847654608e-05, + "loss": 0.0006, + "num_input_tokens_seen": 239457584, + "step": 110850 + }, + { + "epoch": 18.084013050570963, + "grad_norm": 0.0024898534175008535, + "learning_rate": 2.7701432565556296e-05, + "loss": 0.0011, + "num_input_tokens_seen": 239469392, + "step": 110855 + }, + { + "epoch": 18.08482871125612, + "grad_norm": 0.015188485383987427, + "learning_rate": 2.767807385518756e-05, + "loss": 0.0011, + "num_input_tokens_seen": 239480176, + "step": 110860 + }, + { + "epoch": 18.08564437194127, + "grad_norm": 0.00016453674470540136, + "learning_rate": 2.765472471702185e-05, + "loss": 0.0038, + "num_input_tokens_seen": 239490928, + "step": 110865 + }, + { + "epoch": 18.086460032626427, + "grad_norm": 0.004059888422489166, + "learning_rate": 2.7631385151532405e-05, + "loss": 0.0025, + "num_input_tokens_seen": 239501200, + "step": 110870 + }, + { + "epoch": 18.087275693311582, + "grad_norm": 0.3584325313568115, + "learning_rate": 2.7608055159192125e-05, + "loss": 0.0082, + "num_input_tokens_seen": 239511856, + "step": 110875 + }, + { + "epoch": 18.088091353996738, + "grad_norm": 0.009047990664839745, + "learning_rate": 2.7584734740473905e-05, + "loss": 0.0014, + "num_input_tokens_seen": 239522736, + "step": 110880 + }, + { + "epoch": 18.088907014681894, + "grad_norm": 0.0010313192615285516, + "learning_rate": 2.756142389585037e-05, + "loss": 0.0008, + "num_input_tokens_seen": 239533744, + "step": 110885 + }, + { + "epoch": 18.089722675367046, + "grad_norm": 0.01068951841443777, + "learning_rate": 2.753812262579386e-05, + "loss": 0.0104, + "num_input_tokens_seen": 239543760, + "step": 110890 + }, + { + "epoch": 18.0905383360522, + "grad_norm": 0.0006580300396308303, + "learning_rate": 2.7514830930776667e-05, + "loss": 0.0023, + "num_input_tokens_seen": 239554544, + "step": 110895 + }, + { + "epoch": 18.091353996737357, + "grad_norm": 0.00027455881354399025, + "learning_rate": 2.749154881127086e-05, + "loss": 0.0012, + "num_input_tokens_seen": 239564880, + "step": 110900 + }, + { + "epoch": 18.092169657422513, + "grad_norm": 0.004377785138785839, + "learning_rate": 2.7468276267748172e-05, + "loss": 0.0018, + "num_input_tokens_seen": 239575696, + "step": 110905 + }, + { + "epoch": 18.09298531810767, + "grad_norm": 0.004063542932271957, + "learning_rate": 2.7445013300680333e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239586384, + "step": 110910 + }, + { + "epoch": 18.09380097879282, + "grad_norm": 0.002702921163290739, + "learning_rate": 2.7421759910538745e-05, + "loss": 0.0024, + "num_input_tokens_seen": 239596688, + "step": 110915 + }, + { + "epoch": 18.094616639477977, + "grad_norm": 0.001507585751824081, + "learning_rate": 2.739851609779481e-05, + "loss": 0.0009, + "num_input_tokens_seen": 239607120, + "step": 110920 + }, + { + "epoch": 18.095432300163132, + "grad_norm": 0.03425592556595802, + "learning_rate": 2.737528186291932e-05, + "loss": 0.0015, + "num_input_tokens_seen": 239617840, + "step": 110925 + }, + { + "epoch": 18.096247960848288, + "grad_norm": 0.011485468596220016, + "learning_rate": 2.735205720638351e-05, + "loss": 0.0007, + "num_input_tokens_seen": 239628880, + "step": 110930 + }, + { + "epoch": 18.097063621533444, + "grad_norm": 0.000180011527845636, + "learning_rate": 2.732884212865766e-05, + "loss": 0.0031, + "num_input_tokens_seen": 239639280, + "step": 110935 + }, + { + "epoch": 18.097879282218596, + "grad_norm": 0.0004578085499815643, + "learning_rate": 2.730563663021257e-05, + "loss": 0.0005, + "num_input_tokens_seen": 239650704, + "step": 110940 + }, + { + "epoch": 18.09869494290375, + "grad_norm": 0.02956857904791832, + "learning_rate": 2.7282440711518363e-05, + "loss": 0.0045, + "num_input_tokens_seen": 239661776, + "step": 110945 + }, + { + "epoch": 18.099510603588907, + "grad_norm": 0.003422102192416787, + "learning_rate": 2.725925437304522e-05, + "loss": 0.0006, + "num_input_tokens_seen": 239674512, + "step": 110950 + }, + { + "epoch": 18.100326264274063, + "grad_norm": 0.0030121582094579935, + "learning_rate": 2.7236077615262976e-05, + "loss": 0.0016, + "num_input_tokens_seen": 239686352, + "step": 110955 + }, + { + "epoch": 18.10114192495922, + "grad_norm": 0.00024480524007230997, + "learning_rate": 2.721291043864138e-05, + "loss": 0.0022, + "num_input_tokens_seen": 239698000, + "step": 110960 + }, + { + "epoch": 18.10195758564437, + "grad_norm": 0.0009636294562369585, + "learning_rate": 2.7189752843649885e-05, + "loss": 0.0035, + "num_input_tokens_seen": 239708304, + "step": 110965 + }, + { + "epoch": 18.102773246329527, + "grad_norm": 0.0029267354402691126, + "learning_rate": 2.716660483075789e-05, + "loss": 0.0039, + "num_input_tokens_seen": 239719568, + "step": 110970 + }, + { + "epoch": 18.103588907014682, + "grad_norm": 0.0013445314252749085, + "learning_rate": 2.714346640043447e-05, + "loss": 0.0054, + "num_input_tokens_seen": 239731184, + "step": 110975 + }, + { + "epoch": 18.104404567699838, + "grad_norm": 0.004225427284836769, + "learning_rate": 2.7120337553148578e-05, + "loss": 0.0093, + "num_input_tokens_seen": 239741872, + "step": 110980 + }, + { + "epoch": 18.10522022838499, + "grad_norm": 0.007663280237466097, + "learning_rate": 2.7097218289368896e-05, + "loss": 0.0012, + "num_input_tokens_seen": 239754032, + "step": 110985 + }, + { + "epoch": 18.106035889070146, + "grad_norm": 0.0012027625925838947, + "learning_rate": 2.7074108609564053e-05, + "loss": 0.0034, + "num_input_tokens_seen": 239765488, + "step": 110990 + }, + { + "epoch": 18.1068515497553, + "grad_norm": 0.0012016665423288941, + "learning_rate": 2.7051008514202336e-05, + "loss": 0.0049, + "num_input_tokens_seen": 239775824, + "step": 110995 + }, + { + "epoch": 18.107667210440457, + "grad_norm": 0.01750839501619339, + "learning_rate": 2.7027918003751873e-05, + "loss": 0.0166, + "num_input_tokens_seen": 239786608, + "step": 111000 + }, + { + "epoch": 18.108482871125613, + "grad_norm": 0.0005055178189650178, + "learning_rate": 2.7004837078680678e-05, + "loss": 0.0067, + "num_input_tokens_seen": 239797200, + "step": 111005 + }, + { + "epoch": 18.109298531810765, + "grad_norm": 0.0007495254976674914, + "learning_rate": 2.698176573945654e-05, + "loss": 0.0821, + "num_input_tokens_seen": 239806928, + "step": 111010 + }, + { + "epoch": 18.11011419249592, + "grad_norm": 0.0008099843980744481, + "learning_rate": 2.695870398654693e-05, + "loss": 0.0011, + "num_input_tokens_seen": 239817232, + "step": 111015 + }, + { + "epoch": 18.110929853181077, + "grad_norm": 0.0007534879259765148, + "learning_rate": 2.693565182041924e-05, + "loss": 0.0014, + "num_input_tokens_seen": 239827952, + "step": 111020 + }, + { + "epoch": 18.111745513866232, + "grad_norm": 0.017502669245004654, + "learning_rate": 2.6912609241540818e-05, + "loss": 0.0015, + "num_input_tokens_seen": 239839248, + "step": 111025 + }, + { + "epoch": 18.112561174551388, + "grad_norm": 0.003195826429873705, + "learning_rate": 2.688957625037841e-05, + "loss": 0.0006, + "num_input_tokens_seen": 239849904, + "step": 111030 + }, + { + "epoch": 18.11337683523654, + "grad_norm": 0.021487493067979813, + "learning_rate": 2.6866552847399028e-05, + "loss": 0.0011, + "num_input_tokens_seen": 239860976, + "step": 111035 + }, + { + "epoch": 18.114192495921696, + "grad_norm": 0.0024699419736862183, + "learning_rate": 2.684353903306902e-05, + "loss": 0.004, + "num_input_tokens_seen": 239872016, + "step": 111040 + }, + { + "epoch": 18.11500815660685, + "grad_norm": 0.0003210293361917138, + "learning_rate": 2.6820534807855124e-05, + "loss": 0.0013, + "num_input_tokens_seen": 239882416, + "step": 111045 + }, + { + "epoch": 18.115823817292007, + "grad_norm": 0.00045671319821849465, + "learning_rate": 2.679754017222319e-05, + "loss": 0.0025, + "num_input_tokens_seen": 239893872, + "step": 111050 + }, + { + "epoch": 18.116639477977163, + "grad_norm": 0.0006556420703418553, + "learning_rate": 2.677455512663951e-05, + "loss": 0.0008, + "num_input_tokens_seen": 239904304, + "step": 111055 + }, + { + "epoch": 18.117455138662315, + "grad_norm": 0.0008096517412923276, + "learning_rate": 2.6751579671569715e-05, + "loss": 0.001, + "num_input_tokens_seen": 239914896, + "step": 111060 + }, + { + "epoch": 18.11827079934747, + "grad_norm": 0.012913156300783157, + "learning_rate": 2.6728613807479594e-05, + "loss": 0.0055, + "num_input_tokens_seen": 239927184, + "step": 111065 + }, + { + "epoch": 18.119086460032626, + "grad_norm": 0.5544732213020325, + "learning_rate": 2.6705657534834394e-05, + "loss": 0.0975, + "num_input_tokens_seen": 239937488, + "step": 111070 + }, + { + "epoch": 18.119902120717782, + "grad_norm": 0.05267966538667679, + "learning_rate": 2.6682710854099623e-05, + "loss": 0.0438, + "num_input_tokens_seen": 239948432, + "step": 111075 + }, + { + "epoch": 18.120717781402938, + "grad_norm": 0.023334262892603874, + "learning_rate": 2.6659773765740025e-05, + "loss": 0.0025, + "num_input_tokens_seen": 239958864, + "step": 111080 + }, + { + "epoch": 18.12153344208809, + "grad_norm": 0.016666380688548088, + "learning_rate": 2.6636846270220615e-05, + "loss": 0.0018, + "num_input_tokens_seen": 239968976, + "step": 111085 + }, + { + "epoch": 18.122349102773246, + "grad_norm": 0.003353215055540204, + "learning_rate": 2.661392836800608e-05, + "loss": 0.0011, + "num_input_tokens_seen": 239978960, + "step": 111090 + }, + { + "epoch": 18.1231647634584, + "grad_norm": 0.006997089833021164, + "learning_rate": 2.6591020059560766e-05, + "loss": 0.0032, + "num_input_tokens_seen": 239989008, + "step": 111095 + }, + { + "epoch": 18.123980424143557, + "grad_norm": 0.005469950847327709, + "learning_rate": 2.656812134534897e-05, + "loss": 0.0005, + "num_input_tokens_seen": 240000432, + "step": 111100 + }, + { + "epoch": 18.124796084828713, + "grad_norm": 0.0023466164711862803, + "learning_rate": 2.6545232225834825e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240012272, + "step": 111105 + }, + { + "epoch": 18.125611745513865, + "grad_norm": 0.0005659526796080172, + "learning_rate": 2.6522352701482178e-05, + "loss": 0.0041, + "num_input_tokens_seen": 240022288, + "step": 111110 + }, + { + "epoch": 18.12642740619902, + "grad_norm": 0.0014040175592526793, + "learning_rate": 2.6499482772754714e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240032144, + "step": 111115 + }, + { + "epoch": 18.127243066884176, + "grad_norm": 0.06160569190979004, + "learning_rate": 2.6476622440115894e-05, + "loss": 0.0038, + "num_input_tokens_seen": 240042192, + "step": 111120 + }, + { + "epoch": 18.128058727569332, + "grad_norm": 0.0021232604049146175, + "learning_rate": 2.6453771704029017e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240052272, + "step": 111125 + }, + { + "epoch": 18.128874388254488, + "grad_norm": 0.0018104122718796134, + "learning_rate": 2.6430930564957213e-05, + "loss": 0.0009, + "num_input_tokens_seen": 240063632, + "step": 111130 + }, + { + "epoch": 18.12969004893964, + "grad_norm": 0.0006452035158872604, + "learning_rate": 2.6408099023363275e-05, + "loss": 0.0013, + "num_input_tokens_seen": 240073232, + "step": 111135 + }, + { + "epoch": 18.130505709624796, + "grad_norm": 0.0019445134093984962, + "learning_rate": 2.6385277079710113e-05, + "loss": 0.0005, + "num_input_tokens_seen": 240083344, + "step": 111140 + }, + { + "epoch": 18.13132137030995, + "grad_norm": 0.0015937142306938767, + "learning_rate": 2.6362464734460024e-05, + "loss": 0.0012, + "num_input_tokens_seen": 240093008, + "step": 111145 + }, + { + "epoch": 18.132137030995107, + "grad_norm": 0.48695728182792664, + "learning_rate": 2.633966198807558e-05, + "loss": 0.0239, + "num_input_tokens_seen": 240105200, + "step": 111150 + }, + { + "epoch": 18.13295269168026, + "grad_norm": 0.0005318346084095538, + "learning_rate": 2.631686884101864e-05, + "loss": 0.0013, + "num_input_tokens_seen": 240116720, + "step": 111155 + }, + { + "epoch": 18.133768352365415, + "grad_norm": 0.01507630106061697, + "learning_rate": 2.6294085293751435e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240127984, + "step": 111160 + }, + { + "epoch": 18.13458401305057, + "grad_norm": 0.002034904668107629, + "learning_rate": 2.6271311346735326e-05, + "loss": 0.0009, + "num_input_tokens_seen": 240137872, + "step": 111165 + }, + { + "epoch": 18.135399673735726, + "grad_norm": 0.0023250230588018894, + "learning_rate": 2.624854700043222e-05, + "loss": 0.0014, + "num_input_tokens_seen": 240149456, + "step": 111170 + }, + { + "epoch": 18.136215334420882, + "grad_norm": 0.0006091590621508658, + "learning_rate": 2.6225792255303195e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240159792, + "step": 111175 + }, + { + "epoch": 18.137030995106034, + "grad_norm": 0.004074991215020418, + "learning_rate": 2.6203047111809597e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240169360, + "step": 111180 + }, + { + "epoch": 18.13784665579119, + "grad_norm": 0.0071708871982991695, + "learning_rate": 2.6180311570412174e-05, + "loss": 0.0029, + "num_input_tokens_seen": 240180528, + "step": 111185 + }, + { + "epoch": 18.138662316476346, + "grad_norm": 0.0010574172483757138, + "learning_rate": 2.6157585631572e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240192016, + "step": 111190 + }, + { + "epoch": 18.1394779771615, + "grad_norm": 0.001377054606564343, + "learning_rate": 2.613486929574932e-05, + "loss": 0.0016, + "num_input_tokens_seen": 240202480, + "step": 111195 + }, + { + "epoch": 18.140293637846657, + "grad_norm": 0.0010247458703815937, + "learning_rate": 2.611216256340476e-05, + "loss": 0.0007, + "num_input_tokens_seen": 240212912, + "step": 111200 + }, + { + "epoch": 18.14110929853181, + "grad_norm": 0.006079982966184616, + "learning_rate": 2.6089465434998296e-05, + "loss": 0.0018, + "num_input_tokens_seen": 240224752, + "step": 111205 + }, + { + "epoch": 18.141924959216965, + "grad_norm": 0.04028640687465668, + "learning_rate": 2.6066777910990104e-05, + "loss": 0.0015, + "num_input_tokens_seen": 240235056, + "step": 111210 + }, + { + "epoch": 18.14274061990212, + "grad_norm": 0.0025856448337435722, + "learning_rate": 2.6044099991839766e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240245680, + "step": 111215 + }, + { + "epoch": 18.143556280587276, + "grad_norm": 0.0061494940891861916, + "learning_rate": 2.602143167800719e-05, + "loss": 0.0018, + "num_input_tokens_seen": 240255728, + "step": 111220 + }, + { + "epoch": 18.144371941272432, + "grad_norm": 0.002374051371589303, + "learning_rate": 2.59987729699514e-05, + "loss": 0.0021, + "num_input_tokens_seen": 240265264, + "step": 111225 + }, + { + "epoch": 18.145187601957584, + "grad_norm": 0.00037141350912861526, + "learning_rate": 2.5976123868131864e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240275856, + "step": 111230 + }, + { + "epoch": 18.14600326264274, + "grad_norm": 0.00015754564083181322, + "learning_rate": 2.5953484373007487e-05, + "loss": 0.0011, + "num_input_tokens_seen": 240285712, + "step": 111235 + }, + { + "epoch": 18.146818923327896, + "grad_norm": 0.004322631284594536, + "learning_rate": 2.5930854485037124e-05, + "loss": 0.0011, + "num_input_tokens_seen": 240296528, + "step": 111240 + }, + { + "epoch": 18.14763458401305, + "grad_norm": 0.0036023175343871117, + "learning_rate": 2.590823420467947e-05, + "loss": 0.1025, + "num_input_tokens_seen": 240307824, + "step": 111245 + }, + { + "epoch": 18.148450244698207, + "grad_norm": 0.0007313843816518784, + "learning_rate": 2.5885623532392823e-05, + "loss": 0.0009, + "num_input_tokens_seen": 240318672, + "step": 111250 + }, + { + "epoch": 18.14926590538336, + "grad_norm": 0.017052991315722466, + "learning_rate": 2.586302246863548e-05, + "loss": 0.0015, + "num_input_tokens_seen": 240329488, + "step": 111255 + }, + { + "epoch": 18.150081566068515, + "grad_norm": 0.0006939188460819423, + "learning_rate": 2.584043101386546e-05, + "loss": 0.0005, + "num_input_tokens_seen": 240341264, + "step": 111260 + }, + { + "epoch": 18.15089722675367, + "grad_norm": 0.0010113732423633337, + "learning_rate": 2.5817849168540576e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240352464, + "step": 111265 + }, + { + "epoch": 18.151712887438826, + "grad_norm": 0.003736126236617565, + "learning_rate": 2.5795276933118618e-05, + "loss": 0.0129, + "num_input_tokens_seen": 240363440, + "step": 111270 + }, + { + "epoch": 18.152528548123982, + "grad_norm": 0.02830549329519272, + "learning_rate": 2.5772714308056887e-05, + "loss": 0.002, + "num_input_tokens_seen": 240373616, + "step": 111275 + }, + { + "epoch": 18.153344208809134, + "grad_norm": 0.0023447242565453053, + "learning_rate": 2.5750161293812635e-05, + "loss": 0.0009, + "num_input_tokens_seen": 240384560, + "step": 111280 + }, + { + "epoch": 18.15415986949429, + "grad_norm": 0.005311821587383747, + "learning_rate": 2.572761789084316e-05, + "loss": 0.0012, + "num_input_tokens_seen": 240394960, + "step": 111285 + }, + { + "epoch": 18.154975530179446, + "grad_norm": 0.03360892832279205, + "learning_rate": 2.570508409960498e-05, + "loss": 0.0033, + "num_input_tokens_seen": 240403984, + "step": 111290 + }, + { + "epoch": 18.1557911908646, + "grad_norm": 0.03504948318004608, + "learning_rate": 2.5682559920555127e-05, + "loss": 0.0049, + "num_input_tokens_seen": 240414352, + "step": 111295 + }, + { + "epoch": 18.156606851549757, + "grad_norm": 0.0005736067541874945, + "learning_rate": 2.5660045354149786e-05, + "loss": 0.0007, + "num_input_tokens_seen": 240425392, + "step": 111300 + }, + { + "epoch": 18.15742251223491, + "grad_norm": 0.0002982286678161472, + "learning_rate": 2.5637540400845483e-05, + "loss": 0.0009, + "num_input_tokens_seen": 240436944, + "step": 111305 + }, + { + "epoch": 18.158238172920065, + "grad_norm": 0.042514994740486145, + "learning_rate": 2.561504506109802e-05, + "loss": 0.0025, + "num_input_tokens_seen": 240448272, + "step": 111310 + }, + { + "epoch": 18.15905383360522, + "grad_norm": 0.004912849515676498, + "learning_rate": 2.5592559335363696e-05, + "loss": 0.0009, + "num_input_tokens_seen": 240460368, + "step": 111315 + }, + { + "epoch": 18.159869494290376, + "grad_norm": 0.0012176425661891699, + "learning_rate": 2.5570083224097763e-05, + "loss": 0.0016, + "num_input_tokens_seen": 240472368, + "step": 111320 + }, + { + "epoch": 18.160685154975532, + "grad_norm": 0.03821130096912384, + "learning_rate": 2.554761672775613e-05, + "loss": 0.0015, + "num_input_tokens_seen": 240483056, + "step": 111325 + }, + { + "epoch": 18.161500815660684, + "grad_norm": 0.004230343271046877, + "learning_rate": 2.5525159846793822e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240494096, + "step": 111330 + }, + { + "epoch": 18.16231647634584, + "grad_norm": 0.07421465963125229, + "learning_rate": 2.550271258166609e-05, + "loss": 0.0047, + "num_input_tokens_seen": 240505168, + "step": 111335 + }, + { + "epoch": 18.163132137030995, + "grad_norm": 0.004852895624935627, + "learning_rate": 2.548027493282784e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240515632, + "step": 111340 + }, + { + "epoch": 18.16394779771615, + "grad_norm": 0.003312204033136368, + "learning_rate": 2.5457846900733774e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240526608, + "step": 111345 + }, + { + "epoch": 18.164763458401303, + "grad_norm": 0.0339200459420681, + "learning_rate": 2.5435428485838465e-05, + "loss": 0.0017, + "num_input_tokens_seen": 240538032, + "step": 111350 + }, + { + "epoch": 18.16557911908646, + "grad_norm": 0.0012343511916697025, + "learning_rate": 2.5413019688596218e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240549104, + "step": 111355 + }, + { + "epoch": 18.166394779771615, + "grad_norm": 0.002318829298019409, + "learning_rate": 2.539062050946117e-05, + "loss": 0.0017, + "num_input_tokens_seen": 240559120, + "step": 111360 + }, + { + "epoch": 18.16721044045677, + "grad_norm": 0.001546688610687852, + "learning_rate": 2.5368230948887295e-05, + "loss": 0.001, + "num_input_tokens_seen": 240571408, + "step": 111365 + }, + { + "epoch": 18.168026101141926, + "grad_norm": 0.003105068812146783, + "learning_rate": 2.5345851007328336e-05, + "loss": 0.0024, + "num_input_tokens_seen": 240581264, + "step": 111370 + }, + { + "epoch": 18.16884176182708, + "grad_norm": 0.0007292951340787113, + "learning_rate": 2.532348068523782e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240591664, + "step": 111375 + }, + { + "epoch": 18.169657422512234, + "grad_norm": 0.027069859206676483, + "learning_rate": 2.5301119983069165e-05, + "loss": 0.0013, + "num_input_tokens_seen": 240602224, + "step": 111380 + }, + { + "epoch": 18.17047308319739, + "grad_norm": 0.0005704367067664862, + "learning_rate": 2.5278768901275506e-05, + "loss": 0.003, + "num_input_tokens_seen": 240612880, + "step": 111385 + }, + { + "epoch": 18.171288743882545, + "grad_norm": 0.009828636422753334, + "learning_rate": 2.5256427440309815e-05, + "loss": 0.0033, + "num_input_tokens_seen": 240624752, + "step": 111390 + }, + { + "epoch": 18.1721044045677, + "grad_norm": 0.0319136418402195, + "learning_rate": 2.5234095600624896e-05, + "loss": 0.0029, + "num_input_tokens_seen": 240634672, + "step": 111395 + }, + { + "epoch": 18.172920065252853, + "grad_norm": 0.0002865030546672642, + "learning_rate": 2.5211773382673274e-05, + "loss": 0.0015, + "num_input_tokens_seen": 240645776, + "step": 111400 + }, + { + "epoch": 18.17373572593801, + "grad_norm": 0.025485774502158165, + "learning_rate": 2.5189460786907425e-05, + "loss": 0.0028, + "num_input_tokens_seen": 240655920, + "step": 111405 + }, + { + "epoch": 18.174551386623165, + "grad_norm": 0.0005207830108702183, + "learning_rate": 2.5167157813779485e-05, + "loss": 0.0458, + "num_input_tokens_seen": 240665392, + "step": 111410 + }, + { + "epoch": 18.17536704730832, + "grad_norm": 0.005462713073939085, + "learning_rate": 2.5144864463741423e-05, + "loss": 0.0059, + "num_input_tokens_seen": 240677040, + "step": 111415 + }, + { + "epoch": 18.176182707993476, + "grad_norm": 0.0015583484200760722, + "learning_rate": 2.5122580737245105e-05, + "loss": 0.0014, + "num_input_tokens_seen": 240688336, + "step": 111420 + }, + { + "epoch": 18.17699836867863, + "grad_norm": 0.06314843893051147, + "learning_rate": 2.5100306634742053e-05, + "loss": 0.0027, + "num_input_tokens_seen": 240699664, + "step": 111425 + }, + { + "epoch": 18.177814029363784, + "grad_norm": 0.16858816146850586, + "learning_rate": 2.5078042156683854e-05, + "loss": 0.0043, + "num_input_tokens_seen": 240708752, + "step": 111430 + }, + { + "epoch": 18.17862969004894, + "grad_norm": 0.0017226624768227339, + "learning_rate": 2.5055787303521483e-05, + "loss": 0.0018, + "num_input_tokens_seen": 240719248, + "step": 111435 + }, + { + "epoch": 18.179445350734095, + "grad_norm": 0.007539310026913881, + "learning_rate": 2.5033542075706184e-05, + "loss": 0.0022, + "num_input_tokens_seen": 240731536, + "step": 111440 + }, + { + "epoch": 18.18026101141925, + "grad_norm": 0.005234704352915287, + "learning_rate": 2.5011306473688656e-05, + "loss": 0.0011, + "num_input_tokens_seen": 240743088, + "step": 111445 + }, + { + "epoch": 18.181076672104403, + "grad_norm": 0.0010076353792101145, + "learning_rate": 2.4989080497919593e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240755056, + "step": 111450 + }, + { + "epoch": 18.18189233278956, + "grad_norm": 0.002447428647428751, + "learning_rate": 2.496686414884941e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240766576, + "step": 111455 + }, + { + "epoch": 18.182707993474715, + "grad_norm": 0.07020247727632523, + "learning_rate": 2.4944657426928306e-05, + "loss": 0.0072, + "num_input_tokens_seen": 240776720, + "step": 111460 + }, + { + "epoch": 18.18352365415987, + "grad_norm": 0.0051605477929115295, + "learning_rate": 2.492246033260642e-05, + "loss": 0.1114, + "num_input_tokens_seen": 240787760, + "step": 111465 + }, + { + "epoch": 18.184339314845026, + "grad_norm": 0.0043184030801057816, + "learning_rate": 2.490027286633356e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240798640, + "step": 111470 + }, + { + "epoch": 18.18515497553018, + "grad_norm": 0.002179432427510619, + "learning_rate": 2.487809502855931e-05, + "loss": 0.0082, + "num_input_tokens_seen": 240810480, + "step": 111475 + }, + { + "epoch": 18.185970636215334, + "grad_norm": 0.00016462391067761928, + "learning_rate": 2.4855926819733253e-05, + "loss": 0.0017, + "num_input_tokens_seen": 240821680, + "step": 111480 + }, + { + "epoch": 18.18678629690049, + "grad_norm": 0.16513219475746155, + "learning_rate": 2.4833768240304587e-05, + "loss": 0.0026, + "num_input_tokens_seen": 240832592, + "step": 111485 + }, + { + "epoch": 18.187601957585645, + "grad_norm": 0.00029663904570043087, + "learning_rate": 2.48116192907224e-05, + "loss": 0.0009, + "num_input_tokens_seen": 240844208, + "step": 111490 + }, + { + "epoch": 18.1884176182708, + "grad_norm": 0.014264887198805809, + "learning_rate": 2.4789479971435602e-05, + "loss": 0.0015, + "num_input_tokens_seen": 240855472, + "step": 111495 + }, + { + "epoch": 18.189233278955953, + "grad_norm": 0.02956242486834526, + "learning_rate": 2.4767350282892788e-05, + "loss": 0.0031, + "num_input_tokens_seen": 240866896, + "step": 111500 + }, + { + "epoch": 18.19004893964111, + "grad_norm": 0.08667551726102829, + "learning_rate": 2.4745230225542536e-05, + "loss": 0.0024, + "num_input_tokens_seen": 240877488, + "step": 111505 + }, + { + "epoch": 18.190864600326265, + "grad_norm": 0.017870064824819565, + "learning_rate": 2.472311979983305e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240887472, + "step": 111510 + }, + { + "epoch": 18.19168026101142, + "grad_norm": 0.0027186137158423662, + "learning_rate": 2.470101900621252e-05, + "loss": 0.0122, + "num_input_tokens_seen": 240899344, + "step": 111515 + }, + { + "epoch": 18.192495921696572, + "grad_norm": 0.009860222227871418, + "learning_rate": 2.4678927845128762e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240910480, + "step": 111520 + }, + { + "epoch": 18.193311582381728, + "grad_norm": 0.0051208180375397205, + "learning_rate": 2.4656846317029524e-05, + "loss": 0.0005, + "num_input_tokens_seen": 240921168, + "step": 111525 + }, + { + "epoch": 18.194127243066884, + "grad_norm": 0.12531216442584991, + "learning_rate": 2.463477442236234e-05, + "loss": 0.0018, + "num_input_tokens_seen": 240932272, + "step": 111530 + }, + { + "epoch": 18.19494290375204, + "grad_norm": 0.028876209631562233, + "learning_rate": 2.4612712161574457e-05, + "loss": 0.0123, + "num_input_tokens_seen": 240941712, + "step": 111535 + }, + { + "epoch": 18.195758564437195, + "grad_norm": 0.0025795248802751303, + "learning_rate": 2.459065953511308e-05, + "loss": 0.0013, + "num_input_tokens_seen": 240952816, + "step": 111540 + }, + { + "epoch": 18.196574225122347, + "grad_norm": 0.0012704171240329742, + "learning_rate": 2.456861654342507e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240962192, + "step": 111545 + }, + { + "epoch": 18.197389885807503, + "grad_norm": 0.0005869403248652816, + "learning_rate": 2.454658318695713e-05, + "loss": 0.0017, + "num_input_tokens_seen": 240972592, + "step": 111550 + }, + { + "epoch": 18.19820554649266, + "grad_norm": 0.0015953588299453259, + "learning_rate": 2.4524559466155838e-05, + "loss": 0.0007, + "num_input_tokens_seen": 240983536, + "step": 111555 + }, + { + "epoch": 18.199021207177815, + "grad_norm": 0.0008225612109526992, + "learning_rate": 2.450254538146762e-05, + "loss": 0.0021, + "num_input_tokens_seen": 240993968, + "step": 111560 + }, + { + "epoch": 18.19983686786297, + "grad_norm": 0.002442733384668827, + "learning_rate": 2.44805409333384e-05, + "loss": 0.0017, + "num_input_tokens_seen": 241004272, + "step": 111565 + }, + { + "epoch": 18.200652528548122, + "grad_norm": 0.00041253273957408965, + "learning_rate": 2.445854612221432e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241014928, + "step": 111570 + }, + { + "epoch": 18.201468189233278, + "grad_norm": 0.0012305235723033547, + "learning_rate": 2.443656094854113e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241025968, + "step": 111575 + }, + { + "epoch": 18.202283849918434, + "grad_norm": 0.0023012920282781124, + "learning_rate": 2.4414585412764255e-05, + "loss": 0.0019, + "num_input_tokens_seen": 241037040, + "step": 111580 + }, + { + "epoch": 18.20309951060359, + "grad_norm": 0.001884901081211865, + "learning_rate": 2.4392619515329173e-05, + "loss": 0.0009, + "num_input_tokens_seen": 241048080, + "step": 111585 + }, + { + "epoch": 18.203915171288745, + "grad_norm": 0.002780719194561243, + "learning_rate": 2.437066325668097e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241059216, + "step": 111590 + }, + { + "epoch": 18.204730831973897, + "grad_norm": 0.02738889679312706, + "learning_rate": 2.434871663726468e-05, + "loss": 0.0025, + "num_input_tokens_seen": 241069968, + "step": 111595 + }, + { + "epoch": 18.205546492659053, + "grad_norm": 0.013765150681138039, + "learning_rate": 2.4326779657525055e-05, + "loss": 0.0016, + "num_input_tokens_seen": 241081008, + "step": 111600 + }, + { + "epoch": 18.20636215334421, + "grad_norm": 0.00026065035490319133, + "learning_rate": 2.430485231790669e-05, + "loss": 0.003, + "num_input_tokens_seen": 241091056, + "step": 111605 + }, + { + "epoch": 18.207177814029365, + "grad_norm": 0.0007315054535865784, + "learning_rate": 2.428293461885389e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241102064, + "step": 111610 + }, + { + "epoch": 18.20799347471452, + "grad_norm": 0.0005587812629528344, + "learning_rate": 2.426102656081097e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241113456, + "step": 111615 + }, + { + "epoch": 18.208809135399672, + "grad_norm": 0.022680338472127914, + "learning_rate": 2.4239128144221857e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241123472, + "step": 111620 + }, + { + "epoch": 18.209624796084828, + "grad_norm": 0.007227160967886448, + "learning_rate": 2.4217239369530354e-05, + "loss": 0.0142, + "num_input_tokens_seen": 241133808, + "step": 111625 + }, + { + "epoch": 18.210440456769984, + "grad_norm": 0.03996798023581505, + "learning_rate": 2.4195360237180053e-05, + "loss": 0.0017, + "num_input_tokens_seen": 241144304, + "step": 111630 + }, + { + "epoch": 18.21125611745514, + "grad_norm": 0.002206821460276842, + "learning_rate": 2.417349074761438e-05, + "loss": 0.0015, + "num_input_tokens_seen": 241153264, + "step": 111635 + }, + { + "epoch": 18.212071778140295, + "grad_norm": 0.07152996957302094, + "learning_rate": 2.4151630901276534e-05, + "loss": 0.0014, + "num_input_tokens_seen": 241164816, + "step": 111640 + }, + { + "epoch": 18.212887438825447, + "grad_norm": 0.0010918622137978673, + "learning_rate": 2.4129780698609606e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241175952, + "step": 111645 + }, + { + "epoch": 18.213703099510603, + "grad_norm": 0.007130472920835018, + "learning_rate": 2.4107940140056294e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241186032, + "step": 111650 + }, + { + "epoch": 18.21451876019576, + "grad_norm": 0.0018399967812001705, + "learning_rate": 2.4086109226059305e-05, + "loss": 0.0019, + "num_input_tokens_seen": 241197584, + "step": 111655 + }, + { + "epoch": 18.215334420880914, + "grad_norm": 0.001044351258315146, + "learning_rate": 2.4064287957061003e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241209168, + "step": 111660 + }, + { + "epoch": 18.21615008156607, + "grad_norm": 0.00033728586276993155, + "learning_rate": 2.404247633350376e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241220304, + "step": 111665 + }, + { + "epoch": 18.216965742251222, + "grad_norm": 0.007836922071874142, + "learning_rate": 2.402067435582944e-05, + "loss": 0.0019, + "num_input_tokens_seen": 241231024, + "step": 111670 + }, + { + "epoch": 18.217781402936378, + "grad_norm": 0.006131039932370186, + "learning_rate": 2.3998882024480085e-05, + "loss": 0.0017, + "num_input_tokens_seen": 241242192, + "step": 111675 + }, + { + "epoch": 18.218597063621534, + "grad_norm": 0.0007795770070515573, + "learning_rate": 2.3977099339897112e-05, + "loss": 0.0012, + "num_input_tokens_seen": 241253104, + "step": 111680 + }, + { + "epoch": 18.21941272430669, + "grad_norm": 0.0077186450362205505, + "learning_rate": 2.395532630252223e-05, + "loss": 0.0031, + "num_input_tokens_seen": 241263920, + "step": 111685 + }, + { + "epoch": 18.22022838499184, + "grad_norm": 0.010755318216979504, + "learning_rate": 2.393356291279647e-05, + "loss": 0.0015, + "num_input_tokens_seen": 241274288, + "step": 111690 + }, + { + "epoch": 18.221044045676997, + "grad_norm": 0.003711380995810032, + "learning_rate": 2.391180917116109e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241285904, + "step": 111695 + }, + { + "epoch": 18.221859706362153, + "grad_norm": 0.0002724926162045449, + "learning_rate": 2.389006507805669e-05, + "loss": 0.0014, + "num_input_tokens_seen": 241298000, + "step": 111700 + }, + { + "epoch": 18.22267536704731, + "grad_norm": 0.002828385913744569, + "learning_rate": 2.3868330633924295e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241308496, + "step": 111705 + }, + { + "epoch": 18.223491027732464, + "grad_norm": 0.000973310845438391, + "learning_rate": 2.3846605839204062e-05, + "loss": 0.0108, + "num_input_tokens_seen": 241318960, + "step": 111710 + }, + { + "epoch": 18.224306688417617, + "grad_norm": 0.005908642895519733, + "learning_rate": 2.3824890694336467e-05, + "loss": 0.0297, + "num_input_tokens_seen": 241328656, + "step": 111715 + }, + { + "epoch": 18.225122349102772, + "grad_norm": 0.47186583280563354, + "learning_rate": 2.380318519976149e-05, + "loss": 0.0148, + "num_input_tokens_seen": 241340112, + "step": 111720 + }, + { + "epoch": 18.225938009787928, + "grad_norm": 0.04481413587927818, + "learning_rate": 2.3781489355919117e-05, + "loss": 0.0016, + "num_input_tokens_seen": 241351024, + "step": 111725 + }, + { + "epoch": 18.226753670473084, + "grad_norm": 0.0014045239659026265, + "learning_rate": 2.375980316324894e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241361616, + "step": 111730 + }, + { + "epoch": 18.22756933115824, + "grad_norm": 0.021581880748271942, + "learning_rate": 2.373812662219055e-05, + "loss": 0.0049, + "num_input_tokens_seen": 241371696, + "step": 111735 + }, + { + "epoch": 18.22838499184339, + "grad_norm": 0.001914651715196669, + "learning_rate": 2.3716459733183205e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241383120, + "step": 111740 + }, + { + "epoch": 18.229200652528547, + "grad_norm": 0.0057961605489254, + "learning_rate": 2.3694802496665945e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241392752, + "step": 111745 + }, + { + "epoch": 18.230016313213703, + "grad_norm": 0.004409853368997574, + "learning_rate": 2.367315491307781e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241403376, + "step": 111750 + }, + { + "epoch": 18.23083197389886, + "grad_norm": 0.009331168606877327, + "learning_rate": 2.3651516982857448e-05, + "loss": 0.0009, + "num_input_tokens_seen": 241413840, + "step": 111755 + }, + { + "epoch": 18.231647634584014, + "grad_norm": 0.0019217518856748939, + "learning_rate": 2.362988870644339e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241424752, + "step": 111760 + }, + { + "epoch": 18.232463295269167, + "grad_norm": 0.0011666314676404, + "learning_rate": 2.3608270084273853e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241435536, + "step": 111765 + }, + { + "epoch": 18.233278955954322, + "grad_norm": 0.14562870562076569, + "learning_rate": 2.3586661116787255e-05, + "loss": 0.0049, + "num_input_tokens_seen": 241446960, + "step": 111770 + }, + { + "epoch": 18.234094616639478, + "grad_norm": 0.01970742829144001, + "learning_rate": 2.3565061804421195e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241457264, + "step": 111775 + }, + { + "epoch": 18.234910277324634, + "grad_norm": 0.4335617125034332, + "learning_rate": 2.3543472147613654e-05, + "loss": 0.0163, + "num_input_tokens_seen": 241469104, + "step": 111780 + }, + { + "epoch": 18.23572593800979, + "grad_norm": 0.00016372000391129404, + "learning_rate": 2.3521892146801947e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241479472, + "step": 111785 + }, + { + "epoch": 18.23654159869494, + "grad_norm": 0.0005032969056628644, + "learning_rate": 2.350032180242373e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241490256, + "step": 111790 + }, + { + "epoch": 18.237357259380097, + "grad_norm": 0.03568139672279358, + "learning_rate": 2.3478761114915814e-05, + "loss": 0.0011, + "num_input_tokens_seen": 241501616, + "step": 111795 + }, + { + "epoch": 18.238172920065253, + "grad_norm": 0.0010190936736762524, + "learning_rate": 2.3457210084715462e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241514256, + "step": 111800 + }, + { + "epoch": 18.23898858075041, + "grad_norm": 0.0015556697035208344, + "learning_rate": 2.3435668712259105e-05, + "loss": 0.005, + "num_input_tokens_seen": 241525712, + "step": 111805 + }, + { + "epoch": 18.239804241435564, + "grad_norm": 0.002752800937741995, + "learning_rate": 2.341413699798367e-05, + "loss": 0.0009, + "num_input_tokens_seen": 241536656, + "step": 111810 + }, + { + "epoch": 18.240619902120716, + "grad_norm": 0.0019639593083411455, + "learning_rate": 2.3392614942325196e-05, + "loss": 0.0016, + "num_input_tokens_seen": 241547920, + "step": 111815 + }, + { + "epoch": 18.241435562805872, + "grad_norm": 0.06264805048704147, + "learning_rate": 2.3371102545720112e-05, + "loss": 0.0693, + "num_input_tokens_seen": 241559728, + "step": 111820 + }, + { + "epoch": 18.242251223491028, + "grad_norm": 0.10108703374862671, + "learning_rate": 2.3349599808604182e-05, + "loss": 0.003, + "num_input_tokens_seen": 241571216, + "step": 111825 + }, + { + "epoch": 18.243066884176184, + "grad_norm": 0.0005204555345699191, + "learning_rate": 2.332810673141339e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241583024, + "step": 111830 + }, + { + "epoch": 18.24388254486134, + "grad_norm": 0.003744245506823063, + "learning_rate": 2.3306623314583108e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241593712, + "step": 111835 + }, + { + "epoch": 18.24469820554649, + "grad_norm": 0.0042017437517642975, + "learning_rate": 2.3285149558548934e-05, + "loss": 0.002, + "num_input_tokens_seen": 241604240, + "step": 111840 + }, + { + "epoch": 18.245513866231647, + "grad_norm": 0.007984976284205914, + "learning_rate": 2.3263685463745854e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241615408, + "step": 111845 + }, + { + "epoch": 18.246329526916803, + "grad_norm": 0.004355450160801411, + "learning_rate": 2.324223103060913e-05, + "loss": 0.0035, + "num_input_tokens_seen": 241628016, + "step": 111850 + }, + { + "epoch": 18.24714518760196, + "grad_norm": 0.0002695178845897317, + "learning_rate": 2.322078625957319e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241639504, + "step": 111855 + }, + { + "epoch": 18.247960848287114, + "grad_norm": 0.037556588649749756, + "learning_rate": 2.319935115107302e-05, + "loss": 0.0262, + "num_input_tokens_seen": 241648880, + "step": 111860 + }, + { + "epoch": 18.248776508972266, + "grad_norm": 0.0008369534043595195, + "learning_rate": 2.317792570554278e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241660464, + "step": 111865 + }, + { + "epoch": 18.249592169657422, + "grad_norm": 0.0007577822543680668, + "learning_rate": 2.3156509923416778e-05, + "loss": 0.0009, + "num_input_tokens_seen": 241670064, + "step": 111870 + }, + { + "epoch": 18.250407830342578, + "grad_norm": 0.0008776098839007318, + "learning_rate": 2.3135103805129065e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241680432, + "step": 111875 + }, + { + "epoch": 18.251223491027734, + "grad_norm": 0.02049904502928257, + "learning_rate": 2.31137073511134e-05, + "loss": 0.0632, + "num_input_tokens_seen": 241690736, + "step": 111880 + }, + { + "epoch": 18.252039151712886, + "grad_norm": 0.009858843870460987, + "learning_rate": 2.3092320561803436e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241701264, + "step": 111885 + }, + { + "epoch": 18.25285481239804, + "grad_norm": 0.0036323664244264364, + "learning_rate": 2.3070943437632553e-05, + "loss": 0.0234, + "num_input_tokens_seen": 241711152, + "step": 111890 + }, + { + "epoch": 18.253670473083197, + "grad_norm": 0.00025795798865146935, + "learning_rate": 2.3049575979034066e-05, + "loss": 0.0013, + "num_input_tokens_seen": 241722640, + "step": 111895 + }, + { + "epoch": 18.254486133768353, + "grad_norm": 0.0005865280982106924, + "learning_rate": 2.3028218186440964e-05, + "loss": 0.0074, + "num_input_tokens_seen": 241733680, + "step": 111900 + }, + { + "epoch": 18.25530179445351, + "grad_norm": 1.113283634185791, + "learning_rate": 2.3006870060286123e-05, + "loss": 0.0298, + "num_input_tokens_seen": 241744240, + "step": 111905 + }, + { + "epoch": 18.25611745513866, + "grad_norm": 0.00035421474603936076, + "learning_rate": 2.2985531601002084e-05, + "loss": 0.0012, + "num_input_tokens_seen": 241753872, + "step": 111910 + }, + { + "epoch": 18.256933115823816, + "grad_norm": 0.0012218153569847345, + "learning_rate": 2.2964202809021563e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241764304, + "step": 111915 + }, + { + "epoch": 18.257748776508972, + "grad_norm": 0.006138972472399473, + "learning_rate": 2.2942883684776428e-05, + "loss": 0.0009, + "num_input_tokens_seen": 241774160, + "step": 111920 + }, + { + "epoch": 18.258564437194128, + "grad_norm": 0.0003647230041678995, + "learning_rate": 2.2921574228699116e-05, + "loss": 0.0063, + "num_input_tokens_seen": 241786128, + "step": 111925 + }, + { + "epoch": 18.259380097879284, + "grad_norm": 0.0019345534965395927, + "learning_rate": 2.290027444122117e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241796368, + "step": 111930 + }, + { + "epoch": 18.260195758564436, + "grad_norm": 0.005643834825605154, + "learning_rate": 2.2878984322774578e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241807216, + "step": 111935 + }, + { + "epoch": 18.26101141924959, + "grad_norm": 0.029966186732053757, + "learning_rate": 2.2857703873790435e-05, + "loss": 0.0017, + "num_input_tokens_seen": 241818608, + "step": 111940 + }, + { + "epoch": 18.261827079934747, + "grad_norm": 0.00763977924361825, + "learning_rate": 2.2836433094700405e-05, + "loss": 0.0035, + "num_input_tokens_seen": 241828656, + "step": 111945 + }, + { + "epoch": 18.262642740619903, + "grad_norm": 0.0040611946024000645, + "learning_rate": 2.2815171985935246e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241840208, + "step": 111950 + }, + { + "epoch": 18.26345840130506, + "grad_norm": 0.03236650675535202, + "learning_rate": 2.279392054792612e-05, + "loss": 0.0057, + "num_input_tokens_seen": 241851696, + "step": 111955 + }, + { + "epoch": 18.26427406199021, + "grad_norm": 0.0016628196462988853, + "learning_rate": 2.277267878110345e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241863056, + "step": 111960 + }, + { + "epoch": 18.265089722675366, + "grad_norm": 0.0033213666174560785, + "learning_rate": 2.275144668589796e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241873040, + "step": 111965 + }, + { + "epoch": 18.265905383360522, + "grad_norm": 0.008418967947363853, + "learning_rate": 2.2730224262739687e-05, + "loss": 0.0021, + "num_input_tokens_seen": 241884720, + "step": 111970 + }, + { + "epoch": 18.266721044045678, + "grad_norm": 0.004653456620872021, + "learning_rate": 2.270901151205895e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241894512, + "step": 111975 + }, + { + "epoch": 18.267536704730833, + "grad_norm": 0.00020115444203838706, + "learning_rate": 2.2687808434285585e-05, + "loss": 0.0039, + "num_input_tokens_seen": 241906544, + "step": 111980 + }, + { + "epoch": 18.268352365415986, + "grad_norm": 0.005816313438117504, + "learning_rate": 2.266661502984929e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241917456, + "step": 111985 + }, + { + "epoch": 18.26916802610114, + "grad_norm": 0.08827083557844162, + "learning_rate": 2.264543129917962e-05, + "loss": 0.004, + "num_input_tokens_seen": 241929200, + "step": 111990 + }, + { + "epoch": 18.269983686786297, + "grad_norm": 0.1319461166858673, + "learning_rate": 2.2624257242705838e-05, + "loss": 0.0034, + "num_input_tokens_seen": 241940016, + "step": 111995 + }, + { + "epoch": 18.270799347471453, + "grad_norm": 0.04293489083647728, + "learning_rate": 2.2603092860857045e-05, + "loss": 0.0015, + "num_input_tokens_seen": 241950928, + "step": 112000 + }, + { + "epoch": 18.27161500815661, + "grad_norm": 0.0004455571179278195, + "learning_rate": 2.258193815406223e-05, + "loss": 0.0078, + "num_input_tokens_seen": 241961488, + "step": 112005 + }, + { + "epoch": 18.27243066884176, + "grad_norm": 0.0009946267819032073, + "learning_rate": 2.2560793122750056e-05, + "loss": 0.0033, + "num_input_tokens_seen": 241972208, + "step": 112010 + }, + { + "epoch": 18.273246329526916, + "grad_norm": 0.00034329970367252827, + "learning_rate": 2.253965776734912e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241984176, + "step": 112015 + }, + { + "epoch": 18.274061990212072, + "grad_norm": 0.002764312084764242, + "learning_rate": 2.251853208828769e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241994064, + "step": 112020 + }, + { + "epoch": 18.274877650897228, + "grad_norm": 0.008333449251949787, + "learning_rate": 2.2497416085993983e-05, + "loss": 0.0012, + "num_input_tokens_seen": 242003600, + "step": 112025 + }, + { + "epoch": 18.275693311582383, + "grad_norm": 0.002981035504490137, + "learning_rate": 2.247630976089582e-05, + "loss": 0.001, + "num_input_tokens_seen": 242015152, + "step": 112030 + }, + { + "epoch": 18.276508972267536, + "grad_norm": 0.004562276415526867, + "learning_rate": 2.245521311342108e-05, + "loss": 0.0022, + "num_input_tokens_seen": 242026704, + "step": 112035 + }, + { + "epoch": 18.27732463295269, + "grad_norm": 0.007158250547945499, + "learning_rate": 2.2434126143997258e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242037552, + "step": 112040 + }, + { + "epoch": 18.278140293637847, + "grad_norm": 0.004958420526236296, + "learning_rate": 2.241304885305162e-05, + "loss": 0.0011, + "num_input_tokens_seen": 242048720, + "step": 112045 + }, + { + "epoch": 18.278955954323003, + "grad_norm": 0.0006449085776694119, + "learning_rate": 2.2391981241011495e-05, + "loss": 0.0016, + "num_input_tokens_seen": 242060656, + "step": 112050 + }, + { + "epoch": 18.27977161500816, + "grad_norm": 0.009936443530023098, + "learning_rate": 2.2370923308303702e-05, + "loss": 0.0009, + "num_input_tokens_seen": 242071184, + "step": 112055 + }, + { + "epoch": 18.28058727569331, + "grad_norm": 0.00033961181179620326, + "learning_rate": 2.234987505535513e-05, + "loss": 0.0016, + "num_input_tokens_seen": 242080560, + "step": 112060 + }, + { + "epoch": 18.281402936378466, + "grad_norm": 0.00022909794643055648, + "learning_rate": 2.2328836482592208e-05, + "loss": 0.0011, + "num_input_tokens_seen": 242091600, + "step": 112065 + }, + { + "epoch": 18.282218597063622, + "grad_norm": 0.0009541076142340899, + "learning_rate": 2.2307807590441486e-05, + "loss": 0.0011, + "num_input_tokens_seen": 242103088, + "step": 112070 + }, + { + "epoch": 18.283034257748778, + "grad_norm": 0.019319789484143257, + "learning_rate": 2.2286788379328905e-05, + "loss": 0.0031, + "num_input_tokens_seen": 242113392, + "step": 112075 + }, + { + "epoch": 18.28384991843393, + "grad_norm": 0.012067809700965881, + "learning_rate": 2.2265778849680673e-05, + "loss": 0.0008, + "num_input_tokens_seen": 242124208, + "step": 112080 + }, + { + "epoch": 18.284665579119086, + "grad_norm": 0.0042475382797420025, + "learning_rate": 2.2244779001922457e-05, + "loss": 0.0021, + "num_input_tokens_seen": 242134160, + "step": 112085 + }, + { + "epoch": 18.28548123980424, + "grad_norm": 0.21117204427719116, + "learning_rate": 2.222378883647985e-05, + "loss": 0.009, + "num_input_tokens_seen": 242145296, + "step": 112090 + }, + { + "epoch": 18.286296900489397, + "grad_norm": 0.0018978551961481571, + "learning_rate": 2.2202808353778302e-05, + "loss": 0.0043, + "num_input_tokens_seen": 242156848, + "step": 112095 + }, + { + "epoch": 18.287112561174553, + "grad_norm": 0.20463545620441437, + "learning_rate": 2.2181837554242968e-05, + "loss": 0.0046, + "num_input_tokens_seen": 242167632, + "step": 112100 + }, + { + "epoch": 18.287928221859705, + "grad_norm": 0.0037592577282339334, + "learning_rate": 2.216087643829884e-05, + "loss": 0.001, + "num_input_tokens_seen": 242178704, + "step": 112105 + }, + { + "epoch": 18.28874388254486, + "grad_norm": 0.01856519654393196, + "learning_rate": 2.213992500637074e-05, + "loss": 0.0083, + "num_input_tokens_seen": 242189712, + "step": 112110 + }, + { + "epoch": 18.289559543230016, + "grad_norm": 0.004871395882219076, + "learning_rate": 2.211898325888323e-05, + "loss": 0.0006, + "num_input_tokens_seen": 242201424, + "step": 112115 + }, + { + "epoch": 18.290375203915172, + "grad_norm": 0.02684687077999115, + "learning_rate": 2.2098051196260794e-05, + "loss": 0.0013, + "num_input_tokens_seen": 242212240, + "step": 112120 + }, + { + "epoch": 18.291190864600328, + "grad_norm": 0.0012460710713639855, + "learning_rate": 2.207712881892765e-05, + "loss": 0.0033, + "num_input_tokens_seen": 242221808, + "step": 112125 + }, + { + "epoch": 18.29200652528548, + "grad_norm": 0.002025953261181712, + "learning_rate": 2.205621612730774e-05, + "loss": 0.1082, + "num_input_tokens_seen": 242231760, + "step": 112130 + }, + { + "epoch": 18.292822185970635, + "grad_norm": 0.013695158064365387, + "learning_rate": 2.2035313121824884e-05, + "loss": 0.0012, + "num_input_tokens_seen": 242241104, + "step": 112135 + }, + { + "epoch": 18.29363784665579, + "grad_norm": 0.003423569491133094, + "learning_rate": 2.2014419802902808e-05, + "loss": 0.02, + "num_input_tokens_seen": 242250608, + "step": 112140 + }, + { + "epoch": 18.294453507340947, + "grad_norm": 0.00023604616580996662, + "learning_rate": 2.1993536170964832e-05, + "loss": 0.0008, + "num_input_tokens_seen": 242261584, + "step": 112145 + }, + { + "epoch": 18.295269168026103, + "grad_norm": 0.00926015805453062, + "learning_rate": 2.1972662226434292e-05, + "loss": 0.0011, + "num_input_tokens_seen": 242272144, + "step": 112150 + }, + { + "epoch": 18.296084828711255, + "grad_norm": 0.002171823987737298, + "learning_rate": 2.1951797969734178e-05, + "loss": 0.013, + "num_input_tokens_seen": 242283824, + "step": 112155 + }, + { + "epoch": 18.29690048939641, + "grad_norm": 0.004147836938500404, + "learning_rate": 2.193094340128726e-05, + "loss": 0.0011, + "num_input_tokens_seen": 242294128, + "step": 112160 + }, + { + "epoch": 18.297716150081566, + "grad_norm": 0.0004257794935256243, + "learning_rate": 2.191009852151632e-05, + "loss": 0.0005, + "num_input_tokens_seen": 242304080, + "step": 112165 + }, + { + "epoch": 18.298531810766722, + "grad_norm": 0.0004469923733267933, + "learning_rate": 2.188926333084368e-05, + "loss": 0.0017, + "num_input_tokens_seen": 242313328, + "step": 112170 + }, + { + "epoch": 18.299347471451878, + "grad_norm": 0.04959937185049057, + "learning_rate": 2.186843782969167e-05, + "loss": 0.0015, + "num_input_tokens_seen": 242323984, + "step": 112175 + }, + { + "epoch": 18.30016313213703, + "grad_norm": 0.0006796122179366648, + "learning_rate": 2.1847622018482283e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242334800, + "step": 112180 + }, + { + "epoch": 18.300978792822185, + "grad_norm": 0.01616012305021286, + "learning_rate": 2.182681589763741e-05, + "loss": 0.0009, + "num_input_tokens_seen": 242345936, + "step": 112185 + }, + { + "epoch": 18.30179445350734, + "grad_norm": 0.03500431403517723, + "learning_rate": 2.1806019467578765e-05, + "loss": 0.0138, + "num_input_tokens_seen": 242355504, + "step": 112190 + }, + { + "epoch": 18.302610114192497, + "grad_norm": 0.00026632804656401277, + "learning_rate": 2.1785232728727734e-05, + "loss": 0.0051, + "num_input_tokens_seen": 242366512, + "step": 112195 + }, + { + "epoch": 18.303425774877653, + "grad_norm": 0.00029793393332511187, + "learning_rate": 2.1764455681505645e-05, + "loss": 0.0064, + "num_input_tokens_seen": 242376528, + "step": 112200 + }, + { + "epoch": 18.304241435562805, + "grad_norm": 0.00025668280432000756, + "learning_rate": 2.1743688326333555e-05, + "loss": 0.0038, + "num_input_tokens_seen": 242388528, + "step": 112205 + }, + { + "epoch": 18.30505709624796, + "grad_norm": 0.0053967381827533245, + "learning_rate": 2.1722930663632344e-05, + "loss": 0.0015, + "num_input_tokens_seen": 242399312, + "step": 112210 + }, + { + "epoch": 18.305872756933116, + "grad_norm": 0.004073168616741896, + "learning_rate": 2.1702182693822625e-05, + "loss": 0.0009, + "num_input_tokens_seen": 242409808, + "step": 112215 + }, + { + "epoch": 18.306688417618272, + "grad_norm": 0.002889038994908333, + "learning_rate": 2.1681444417325004e-05, + "loss": 0.0014, + "num_input_tokens_seen": 242419632, + "step": 112220 + }, + { + "epoch": 18.307504078303428, + "grad_norm": 0.0007618418894708157, + "learning_rate": 2.166071583455964e-05, + "loss": 0.0004, + "num_input_tokens_seen": 242430096, + "step": 112225 + }, + { + "epoch": 18.30831973898858, + "grad_norm": 0.01946703903377056, + "learning_rate": 2.1639996945946706e-05, + "loss": 0.0025, + "num_input_tokens_seen": 242440592, + "step": 112230 + }, + { + "epoch": 18.309135399673735, + "grad_norm": 0.008552854880690575, + "learning_rate": 2.1619287751906135e-05, + "loss": 0.0008, + "num_input_tokens_seen": 242450288, + "step": 112235 + }, + { + "epoch": 18.30995106035889, + "grad_norm": 0.0020822572987526655, + "learning_rate": 2.1598588252857486e-05, + "loss": 0.0432, + "num_input_tokens_seen": 242460336, + "step": 112240 + }, + { + "epoch": 18.310766721044047, + "grad_norm": 0.0005850521847605705, + "learning_rate": 2.157789844922037e-05, + "loss": 0.0044, + "num_input_tokens_seen": 242471632, + "step": 112245 + }, + { + "epoch": 18.3115823817292, + "grad_norm": 0.00023612409131601453, + "learning_rate": 2.1557218341414055e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242482608, + "step": 112250 + }, + { + "epoch": 18.312398042414355, + "grad_norm": 0.019624780863523483, + "learning_rate": 2.1536547929857707e-05, + "loss": 0.0014, + "num_input_tokens_seen": 242493712, + "step": 112255 + }, + { + "epoch": 18.31321370309951, + "grad_norm": 0.0011469227029010653, + "learning_rate": 2.1515887214970165e-05, + "loss": 0.025, + "num_input_tokens_seen": 242505072, + "step": 112260 + }, + { + "epoch": 18.314029363784666, + "grad_norm": 0.09801533818244934, + "learning_rate": 2.1495236197170143e-05, + "loss": 0.0022, + "num_input_tokens_seen": 242516304, + "step": 112265 + }, + { + "epoch": 18.31484502446982, + "grad_norm": 0.00623701885342598, + "learning_rate": 2.1474594876876198e-05, + "loss": 0.0092, + "num_input_tokens_seen": 242526640, + "step": 112270 + }, + { + "epoch": 18.315660685154974, + "grad_norm": 0.000984379556030035, + "learning_rate": 2.1453963254506604e-05, + "loss": 0.002, + "num_input_tokens_seen": 242537648, + "step": 112275 + }, + { + "epoch": 18.31647634584013, + "grad_norm": 0.00510720070451498, + "learning_rate": 2.1433341330479583e-05, + "loss": 0.0054, + "num_input_tokens_seen": 242547792, + "step": 112280 + }, + { + "epoch": 18.317292006525285, + "grad_norm": 0.001066899043507874, + "learning_rate": 2.141272910521297e-05, + "loss": 0.0035, + "num_input_tokens_seen": 242558384, + "step": 112285 + }, + { + "epoch": 18.31810766721044, + "grad_norm": 0.012570103630423546, + "learning_rate": 2.1392126579124536e-05, + "loss": 0.0016, + "num_input_tokens_seen": 242569488, + "step": 112290 + }, + { + "epoch": 18.318923327895597, + "grad_norm": 0.15933559834957123, + "learning_rate": 2.1371533752631844e-05, + "loss": 0.0045, + "num_input_tokens_seen": 242580048, + "step": 112295 + }, + { + "epoch": 18.31973898858075, + "grad_norm": 0.003937386907637119, + "learning_rate": 2.135095062615211e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242590480, + "step": 112300 + }, + { + "epoch": 18.320554649265905, + "grad_norm": 0.06767729669809341, + "learning_rate": 2.1330377200102723e-05, + "loss": 0.0053, + "num_input_tokens_seen": 242602032, + "step": 112305 + }, + { + "epoch": 18.32137030995106, + "grad_norm": 0.0006834762170910835, + "learning_rate": 2.130981347490035e-05, + "loss": 0.0005, + "num_input_tokens_seen": 242612624, + "step": 112310 + }, + { + "epoch": 18.322185970636216, + "grad_norm": 1.1935534477233887, + "learning_rate": 2.1289259450961995e-05, + "loss": 0.1134, + "num_input_tokens_seen": 242620752, + "step": 112315 + }, + { + "epoch": 18.32300163132137, + "grad_norm": 0.003689026227220893, + "learning_rate": 2.1268715128703932e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242633200, + "step": 112320 + }, + { + "epoch": 18.323817292006524, + "grad_norm": 0.010268572717905045, + "learning_rate": 2.124818050854277e-05, + "loss": 0.0009, + "num_input_tokens_seen": 242643472, + "step": 112325 + }, + { + "epoch": 18.32463295269168, + "grad_norm": 0.0009742376278154552, + "learning_rate": 2.122765559089451e-05, + "loss": 0.1357, + "num_input_tokens_seen": 242654736, + "step": 112330 + }, + { + "epoch": 18.325448613376835, + "grad_norm": 0.013293848372995853, + "learning_rate": 2.1207140376175214e-05, + "loss": 0.0036, + "num_input_tokens_seen": 242665136, + "step": 112335 + }, + { + "epoch": 18.32626427406199, + "grad_norm": 0.0005903943674638867, + "learning_rate": 2.1186634864800603e-05, + "loss": 0.0005, + "num_input_tokens_seen": 242676272, + "step": 112340 + }, + { + "epoch": 18.327079934747147, + "grad_norm": 0.012852600775659084, + "learning_rate": 2.116613905718623e-05, + "loss": 0.0012, + "num_input_tokens_seen": 242688336, + "step": 112345 + }, + { + "epoch": 18.3278955954323, + "grad_norm": 0.0014939934480935335, + "learning_rate": 2.114565295374754e-05, + "loss": 0.0539, + "num_input_tokens_seen": 242698672, + "step": 112350 + }, + { + "epoch": 18.328711256117455, + "grad_norm": 0.00032311692484654486, + "learning_rate": 2.112517655489965e-05, + "loss": 0.0012, + "num_input_tokens_seen": 242709904, + "step": 112355 + }, + { + "epoch": 18.32952691680261, + "grad_norm": 0.004247542470693588, + "learning_rate": 2.110470986105756e-05, + "loss": 0.0009, + "num_input_tokens_seen": 242720720, + "step": 112360 + }, + { + "epoch": 18.330342577487766, + "grad_norm": 0.006098807789385319, + "learning_rate": 2.1084252872636046e-05, + "loss": 0.0068, + "num_input_tokens_seen": 242731056, + "step": 112365 + }, + { + "epoch": 18.33115823817292, + "grad_norm": 0.005680213216692209, + "learning_rate": 2.1063805590049667e-05, + "loss": 0.0163, + "num_input_tokens_seen": 242743312, + "step": 112370 + }, + { + "epoch": 18.331973898858074, + "grad_norm": 0.00022195794736035168, + "learning_rate": 2.1043368013712872e-05, + "loss": 0.0026, + "num_input_tokens_seen": 242753936, + "step": 112375 + }, + { + "epoch": 18.33278955954323, + "grad_norm": 0.0006564015056937933, + "learning_rate": 2.102294014403977e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242764272, + "step": 112380 + }, + { + "epoch": 18.333605220228385, + "grad_norm": 0.0004953066818416119, + "learning_rate": 2.1002521981444477e-05, + "loss": 0.001, + "num_input_tokens_seen": 242776048, + "step": 112385 + }, + { + "epoch": 18.33442088091354, + "grad_norm": 0.000778991321567446, + "learning_rate": 2.0982113526340662e-05, + "loss": 0.038, + "num_input_tokens_seen": 242786864, + "step": 112390 + }, + { + "epoch": 18.335236541598697, + "grad_norm": 0.001229040906764567, + "learning_rate": 2.0961714779142048e-05, + "loss": 0.0004, + "num_input_tokens_seen": 242797552, + "step": 112395 + }, + { + "epoch": 18.33605220228385, + "grad_norm": 0.0008231330430135131, + "learning_rate": 2.0941325740261975e-05, + "loss": 0.0033, + "num_input_tokens_seen": 242809488, + "step": 112400 + }, + { + "epoch": 18.336867862969005, + "grad_norm": 0.056143589317798615, + "learning_rate": 2.0920946410113604e-05, + "loss": 0.0025, + "num_input_tokens_seen": 242820656, + "step": 112405 + }, + { + "epoch": 18.33768352365416, + "grad_norm": 0.008879280649125576, + "learning_rate": 2.0900576789110116e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242831856, + "step": 112410 + }, + { + "epoch": 18.338499184339316, + "grad_norm": 0.007489972282201052, + "learning_rate": 2.0880216877664116e-05, + "loss": 0.0667, + "num_input_tokens_seen": 242842320, + "step": 112415 + }, + { + "epoch": 18.339314845024468, + "grad_norm": 0.0021333445329219103, + "learning_rate": 2.0859866676188445e-05, + "loss": 0.0012, + "num_input_tokens_seen": 242852784, + "step": 112420 + }, + { + "epoch": 18.340130505709624, + "grad_norm": 0.018818873912096024, + "learning_rate": 2.083952618509527e-05, + "loss": 0.0031, + "num_input_tokens_seen": 242862960, + "step": 112425 + }, + { + "epoch": 18.34094616639478, + "grad_norm": 0.13519856333732605, + "learning_rate": 2.0819195404797098e-05, + "loss": 0.0036, + "num_input_tokens_seen": 242872528, + "step": 112430 + }, + { + "epoch": 18.341761827079935, + "grad_norm": 0.07205647975206375, + "learning_rate": 2.0798874335705707e-05, + "loss": 0.0026, + "num_input_tokens_seen": 242882512, + "step": 112435 + }, + { + "epoch": 18.34257748776509, + "grad_norm": 0.0011894232593476772, + "learning_rate": 2.077856297823316e-05, + "loss": 0.0008, + "num_input_tokens_seen": 242893648, + "step": 112440 + }, + { + "epoch": 18.343393148450243, + "grad_norm": 0.00026534864446148276, + "learning_rate": 2.0758261332790796e-05, + "loss": 0.0004, + "num_input_tokens_seen": 242903792, + "step": 112445 + }, + { + "epoch": 18.3442088091354, + "grad_norm": 0.0033330917358398438, + "learning_rate": 2.0737969399790392e-05, + "loss": 0.0012, + "num_input_tokens_seen": 242914608, + "step": 112450 + }, + { + "epoch": 18.345024469820554, + "grad_norm": 0.00043053895933553576, + "learning_rate": 2.0717687179642896e-05, + "loss": 0.0003, + "num_input_tokens_seen": 242924176, + "step": 112455 + }, + { + "epoch": 18.34584013050571, + "grad_norm": 0.021750640124082565, + "learning_rate": 2.0697414672759596e-05, + "loss": 0.0017, + "num_input_tokens_seen": 242934992, + "step": 112460 + }, + { + "epoch": 18.346655791190866, + "grad_norm": 0.001099413144402206, + "learning_rate": 2.0677151879551103e-05, + "loss": 0.0009, + "num_input_tokens_seen": 242946832, + "step": 112465 + }, + { + "epoch": 18.347471451876018, + "grad_norm": 0.0010571812745183706, + "learning_rate": 2.0656898800428313e-05, + "loss": 0.0014, + "num_input_tokens_seen": 242955696, + "step": 112470 + }, + { + "epoch": 18.348287112561174, + "grad_norm": 0.0031790726352483034, + "learning_rate": 2.0636655435801455e-05, + "loss": 0.0011, + "num_input_tokens_seen": 242967632, + "step": 112475 + }, + { + "epoch": 18.34910277324633, + "grad_norm": 0.0003818267723545432, + "learning_rate": 2.061642178608092e-05, + "loss": 0.0016, + "num_input_tokens_seen": 242977584, + "step": 112480 + }, + { + "epoch": 18.349918433931485, + "grad_norm": 0.0007671648636460304, + "learning_rate": 2.0596197851676768e-05, + "loss": 0.0018, + "num_input_tokens_seen": 242987056, + "step": 112485 + }, + { + "epoch": 18.35073409461664, + "grad_norm": 0.0005823525134474039, + "learning_rate": 2.057598363299884e-05, + "loss": 0.0025, + "num_input_tokens_seen": 242998000, + "step": 112490 + }, + { + "epoch": 18.351549755301793, + "grad_norm": 0.010062271729111671, + "learning_rate": 2.055577913045675e-05, + "loss": 0.0007, + "num_input_tokens_seen": 243007600, + "step": 112495 + }, + { + "epoch": 18.35236541598695, + "grad_norm": 0.02205917239189148, + "learning_rate": 2.0535584344460066e-05, + "loss": 0.0015, + "num_input_tokens_seen": 243017840, + "step": 112500 + }, + { + "epoch": 18.353181076672104, + "grad_norm": 0.0054580941796302795, + "learning_rate": 2.0515399275417958e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243029072, + "step": 112505 + }, + { + "epoch": 18.35399673735726, + "grad_norm": 0.0038355544675141573, + "learning_rate": 2.0495223923739593e-05, + "loss": 0.001, + "num_input_tokens_seen": 243039472, + "step": 112510 + }, + { + "epoch": 18.354812398042416, + "grad_norm": 0.008391822688281536, + "learning_rate": 2.0475058289833815e-05, + "loss": 0.0011, + "num_input_tokens_seen": 243050384, + "step": 112515 + }, + { + "epoch": 18.355628058727568, + "grad_norm": 0.017565961927175522, + "learning_rate": 2.045490237410924e-05, + "loss": 0.0013, + "num_input_tokens_seen": 243060848, + "step": 112520 + }, + { + "epoch": 18.356443719412724, + "grad_norm": 0.01259287167340517, + "learning_rate": 2.043475617697449e-05, + "loss": 0.0034, + "num_input_tokens_seen": 243071952, + "step": 112525 + }, + { + "epoch": 18.35725938009788, + "grad_norm": 0.1289086937904358, + "learning_rate": 2.0414619698837677e-05, + "loss": 0.0029, + "num_input_tokens_seen": 243082736, + "step": 112530 + }, + { + "epoch": 18.358075040783035, + "grad_norm": 0.06562699377536774, + "learning_rate": 2.0394492940107144e-05, + "loss": 0.003, + "num_input_tokens_seen": 243093968, + "step": 112535 + }, + { + "epoch": 18.35889070146819, + "grad_norm": 0.0011165590258315206, + "learning_rate": 2.0374375901190456e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243104816, + "step": 112540 + }, + { + "epoch": 18.359706362153343, + "grad_norm": 0.0002490470069460571, + "learning_rate": 2.0354268582495673e-05, + "loss": 0.0008, + "num_input_tokens_seen": 243115792, + "step": 112545 + }, + { + "epoch": 18.3605220228385, + "grad_norm": 0.0014397975755855441, + "learning_rate": 2.0334170984429966e-05, + "loss": 0.0083, + "num_input_tokens_seen": 243127152, + "step": 112550 + }, + { + "epoch": 18.361337683523654, + "grad_norm": 0.01572202518582344, + "learning_rate": 2.0314083107400904e-05, + "loss": 0.0017, + "num_input_tokens_seen": 243138032, + "step": 112555 + }, + { + "epoch": 18.36215334420881, + "grad_norm": 0.0020776886958628893, + "learning_rate": 2.0294004951815324e-05, + "loss": 0.0866, + "num_input_tokens_seen": 243148432, + "step": 112560 + }, + { + "epoch": 18.362969004893966, + "grad_norm": 0.00019504585361573845, + "learning_rate": 2.027393651808046e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243159664, + "step": 112565 + }, + { + "epoch": 18.363784665579118, + "grad_norm": 0.0009072918328456581, + "learning_rate": 2.0253877806602648e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243170448, + "step": 112570 + }, + { + "epoch": 18.364600326264274, + "grad_norm": 0.00788530521094799, + "learning_rate": 2.0233828817788792e-05, + "loss": 0.0069, + "num_input_tokens_seen": 243180592, + "step": 112575 + }, + { + "epoch": 18.36541598694943, + "grad_norm": 0.0006791690248064697, + "learning_rate": 2.0213789552044893e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243191408, + "step": 112580 + }, + { + "epoch": 18.366231647634585, + "grad_norm": 0.0016448087990283966, + "learning_rate": 2.0193760009777295e-05, + "loss": 0.0091, + "num_input_tokens_seen": 243202160, + "step": 112585 + }, + { + "epoch": 18.36704730831974, + "grad_norm": 0.032738588750362396, + "learning_rate": 2.0173740191391732e-05, + "loss": 0.0014, + "num_input_tokens_seen": 243212464, + "step": 112590 + }, + { + "epoch": 18.367862969004893, + "grad_norm": 0.003810546128079295, + "learning_rate": 2.0153730097294153e-05, + "loss": 0.0009, + "num_input_tokens_seen": 243222768, + "step": 112595 + }, + { + "epoch": 18.36867862969005, + "grad_norm": 0.00023762752243783325, + "learning_rate": 2.0133729727889794e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243234064, + "step": 112600 + }, + { + "epoch": 18.369494290375204, + "grad_norm": 0.0093738604336977, + "learning_rate": 2.0113739083584327e-05, + "loss": 0.0009, + "num_input_tokens_seen": 243244208, + "step": 112605 + }, + { + "epoch": 18.37030995106036, + "grad_norm": 0.003199538215994835, + "learning_rate": 2.0093758164782595e-05, + "loss": 0.1295, + "num_input_tokens_seen": 243254512, + "step": 112610 + }, + { + "epoch": 18.371125611745512, + "grad_norm": 0.0027874563820660114, + "learning_rate": 2.0073786971889662e-05, + "loss": 0.0023, + "num_input_tokens_seen": 243265648, + "step": 112615 + }, + { + "epoch": 18.371941272430668, + "grad_norm": 0.0008438194054178894, + "learning_rate": 2.0053825505310318e-05, + "loss": 0.0012, + "num_input_tokens_seen": 243276560, + "step": 112620 + }, + { + "epoch": 18.372756933115824, + "grad_norm": 0.0001784580817911774, + "learning_rate": 2.0033873765449018e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243286608, + "step": 112625 + }, + { + "epoch": 18.37357259380098, + "grad_norm": 0.002208051038905978, + "learning_rate": 2.0013931752710214e-05, + "loss": 0.002, + "num_input_tokens_seen": 243298128, + "step": 112630 + }, + { + "epoch": 18.374388254486135, + "grad_norm": 0.001969917444512248, + "learning_rate": 1.9993999467497913e-05, + "loss": 0.0009, + "num_input_tokens_seen": 243309680, + "step": 112635 + }, + { + "epoch": 18.375203915171287, + "grad_norm": 0.0012371476041153073, + "learning_rate": 1.9974076910216188e-05, + "loss": 0.0012, + "num_input_tokens_seen": 243318608, + "step": 112640 + }, + { + "epoch": 18.376019575856443, + "grad_norm": 0.00016509677516296506, + "learning_rate": 1.995416408126871e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243328624, + "step": 112645 + }, + { + "epoch": 18.3768352365416, + "grad_norm": 0.03946005553007126, + "learning_rate": 1.9934260981059103e-05, + "loss": 0.0094, + "num_input_tokens_seen": 243339600, + "step": 112650 + }, + { + "epoch": 18.377650897226754, + "grad_norm": 0.023350073024630547, + "learning_rate": 1.9914367609990713e-05, + "loss": 0.0011, + "num_input_tokens_seen": 243349584, + "step": 112655 + }, + { + "epoch": 18.37846655791191, + "grad_norm": 0.013439115136861801, + "learning_rate": 1.9894483968466715e-05, + "loss": 0.0017, + "num_input_tokens_seen": 243361392, + "step": 112660 + }, + { + "epoch": 18.379282218597062, + "grad_norm": 0.0004325744812376797, + "learning_rate": 1.9874610056890007e-05, + "loss": 0.0011, + "num_input_tokens_seen": 243371408, + "step": 112665 + }, + { + "epoch": 18.380097879282218, + "grad_norm": 0.0010596549836918712, + "learning_rate": 1.9854745875663438e-05, + "loss": 0.0024, + "num_input_tokens_seen": 243382928, + "step": 112670 + }, + { + "epoch": 18.380913539967374, + "grad_norm": 0.003541856538504362, + "learning_rate": 1.983489142518946e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243393328, + "step": 112675 + }, + { + "epoch": 18.38172920065253, + "grad_norm": 0.018053626641631126, + "learning_rate": 1.9815046705870697e-05, + "loss": 0.0017, + "num_input_tokens_seen": 243404816, + "step": 112680 + }, + { + "epoch": 18.382544861337685, + "grad_norm": 0.010469280183315277, + "learning_rate": 1.979521171810905e-05, + "loss": 0.0007, + "num_input_tokens_seen": 243414640, + "step": 112685 + }, + { + "epoch": 18.383360522022837, + "grad_norm": 0.0004034289449919015, + "learning_rate": 1.9775386462306756e-05, + "loss": 0.0033, + "num_input_tokens_seen": 243425104, + "step": 112690 + }, + { + "epoch": 18.384176182707993, + "grad_norm": 0.01321091316640377, + "learning_rate": 1.9755570938865263e-05, + "loss": 0.0028, + "num_input_tokens_seen": 243436176, + "step": 112695 + }, + { + "epoch": 18.38499184339315, + "grad_norm": 0.0053224824368953705, + "learning_rate": 1.9735765148186536e-05, + "loss": 0.0538, + "num_input_tokens_seen": 243445456, + "step": 112700 + }, + { + "epoch": 18.385807504078304, + "grad_norm": 0.0077959164045751095, + "learning_rate": 1.9715969090671693e-05, + "loss": 0.0031, + "num_input_tokens_seen": 243456784, + "step": 112705 + }, + { + "epoch": 18.38662316476346, + "grad_norm": 0.018376147374510765, + "learning_rate": 1.969618276672208e-05, + "loss": 0.0082, + "num_input_tokens_seen": 243468656, + "step": 112710 + }, + { + "epoch": 18.387438825448612, + "grad_norm": 0.025106191635131836, + "learning_rate": 1.9676406176738547e-05, + "loss": 0.0013, + "num_input_tokens_seen": 243479824, + "step": 112715 + }, + { + "epoch": 18.388254486133768, + "grad_norm": 0.0026839233469218016, + "learning_rate": 1.965663932112205e-05, + "loss": 0.0025, + "num_input_tokens_seen": 243490512, + "step": 112720 + }, + { + "epoch": 18.389070146818923, + "grad_norm": 0.9623442888259888, + "learning_rate": 1.96368822002731e-05, + "loss": 0.0193, + "num_input_tokens_seen": 243501456, + "step": 112725 + }, + { + "epoch": 18.38988580750408, + "grad_norm": 0.031202662736177444, + "learning_rate": 1.9617134814592096e-05, + "loss": 0.0898, + "num_input_tokens_seen": 243513424, + "step": 112730 + }, + { + "epoch": 18.390701468189235, + "grad_norm": 0.0001352128601865843, + "learning_rate": 1.9597397164479282e-05, + "loss": 0.0021, + "num_input_tokens_seen": 243524880, + "step": 112735 + }, + { + "epoch": 18.391517128874387, + "grad_norm": 0.026948045939207077, + "learning_rate": 1.957766925033466e-05, + "loss": 0.0025, + "num_input_tokens_seen": 243535120, + "step": 112740 + }, + { + "epoch": 18.392332789559543, + "grad_norm": 0.0015484422910958529, + "learning_rate": 1.9557951072557978e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243545232, + "step": 112745 + }, + { + "epoch": 18.3931484502447, + "grad_norm": 0.02422100119292736, + "learning_rate": 1.9538242631548965e-05, + "loss": 0.0024, + "num_input_tokens_seen": 243555216, + "step": 112750 + }, + { + "epoch": 18.393964110929854, + "grad_norm": 0.00910177081823349, + "learning_rate": 1.9518543927706968e-05, + "loss": 0.0008, + "num_input_tokens_seen": 243564848, + "step": 112755 + }, + { + "epoch": 18.39477977161501, + "grad_norm": 0.0012397068785503507, + "learning_rate": 1.949885496143117e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243575120, + "step": 112760 + }, + { + "epoch": 18.395595432300162, + "grad_norm": 0.02492532506585121, + "learning_rate": 1.947917573312069e-05, + "loss": 0.0011, + "num_input_tokens_seen": 243585168, + "step": 112765 + }, + { + "epoch": 18.396411092985318, + "grad_norm": 0.000635263801086694, + "learning_rate": 1.945950624317422e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243595920, + "step": 112770 + }, + { + "epoch": 18.397226753670473, + "grad_norm": 0.001339736278168857, + "learning_rate": 1.943984649199054e-05, + "loss": 0.0009, + "num_input_tokens_seen": 243605424, + "step": 112775 + }, + { + "epoch": 18.39804241435563, + "grad_norm": 0.005495937541127205, + "learning_rate": 1.9420196479967957e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243616720, + "step": 112780 + }, + { + "epoch": 18.39885807504078, + "grad_norm": 0.0017825653776526451, + "learning_rate": 1.9400556207504805e-05, + "loss": 0.031, + "num_input_tokens_seen": 243627440, + "step": 112785 + }, + { + "epoch": 18.399673735725937, + "grad_norm": 0.0019627753645181656, + "learning_rate": 1.9380925674998995e-05, + "loss": 0.0012, + "num_input_tokens_seen": 243638096, + "step": 112790 + }, + { + "epoch": 18.400489396411093, + "grad_norm": 0.0011574667878448963, + "learning_rate": 1.9361304882848487e-05, + "loss": 0.0013, + "num_input_tokens_seen": 243648848, + "step": 112795 + }, + { + "epoch": 18.40130505709625, + "grad_norm": 0.0014631313970312476, + "learning_rate": 1.9341693831450847e-05, + "loss": 0.0012, + "num_input_tokens_seen": 243658768, + "step": 112800 + }, + { + "epoch": 18.402120717781404, + "grad_norm": 0.005520334001630545, + "learning_rate": 1.9322092521203537e-05, + "loss": 0.0008, + "num_input_tokens_seen": 243669104, + "step": 112805 + }, + { + "epoch": 18.402936378466556, + "grad_norm": 0.12357936054468155, + "learning_rate": 1.93025009525038e-05, + "loss": 0.0044, + "num_input_tokens_seen": 243679760, + "step": 112810 + }, + { + "epoch": 18.403752039151712, + "grad_norm": 0.0001971587771549821, + "learning_rate": 1.92829191257487e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243690928, + "step": 112815 + }, + { + "epoch": 18.404567699836868, + "grad_norm": 0.013569245114922523, + "learning_rate": 1.9263347041335033e-05, + "loss": 0.0063, + "num_input_tokens_seen": 243701776, + "step": 112820 + }, + { + "epoch": 18.405383360522023, + "grad_norm": 0.0014590908540412784, + "learning_rate": 1.9243784699659538e-05, + "loss": 0.0022, + "num_input_tokens_seen": 243712528, + "step": 112825 + }, + { + "epoch": 18.40619902120718, + "grad_norm": 0.00046936338185332716, + "learning_rate": 1.9224232101118623e-05, + "loss": 0.0019, + "num_input_tokens_seen": 243722576, + "step": 112830 + }, + { + "epoch": 18.40701468189233, + "grad_norm": 0.0006246300181373954, + "learning_rate": 1.9204689246108576e-05, + "loss": 0.0016, + "num_input_tokens_seen": 243732976, + "step": 112835 + }, + { + "epoch": 18.407830342577487, + "grad_norm": 0.0062375376001000404, + "learning_rate": 1.9185156135025417e-05, + "loss": 0.0022, + "num_input_tokens_seen": 243744464, + "step": 112840 + }, + { + "epoch": 18.408646003262643, + "grad_norm": 0.1260538250207901, + "learning_rate": 1.9165632768264994e-05, + "loss": 0.0035, + "num_input_tokens_seen": 243754544, + "step": 112845 + }, + { + "epoch": 18.4094616639478, + "grad_norm": 0.06947627663612366, + "learning_rate": 1.9146119146223052e-05, + "loss": 0.001, + "num_input_tokens_seen": 243765648, + "step": 112850 + }, + { + "epoch": 18.410277324632954, + "grad_norm": 0.00032668912899680436, + "learning_rate": 1.9126615269294988e-05, + "loss": 0.0012, + "num_input_tokens_seen": 243776816, + "step": 112855 + }, + { + "epoch": 18.411092985318106, + "grad_norm": 0.002180765848606825, + "learning_rate": 1.9107121137876106e-05, + "loss": 0.0009, + "num_input_tokens_seen": 243788400, + "step": 112860 + }, + { + "epoch": 18.411908646003262, + "grad_norm": 0.0003920606686733663, + "learning_rate": 1.908763675236147e-05, + "loss": 0.0006, + "num_input_tokens_seen": 243797904, + "step": 112865 + }, + { + "epoch": 18.412724306688418, + "grad_norm": 0.0018217455362901092, + "learning_rate": 1.906816211314599e-05, + "loss": 0.0017, + "num_input_tokens_seen": 243809392, + "step": 112870 + }, + { + "epoch": 18.413539967373573, + "grad_norm": 0.0001498242054367438, + "learning_rate": 1.9048697220624244e-05, + "loss": 0.001, + "num_input_tokens_seen": 243820464, + "step": 112875 + }, + { + "epoch": 18.41435562805873, + "grad_norm": 0.007615982089191675, + "learning_rate": 1.9029242075190856e-05, + "loss": 0.0015, + "num_input_tokens_seen": 243831408, + "step": 112880 + }, + { + "epoch": 18.41517128874388, + "grad_norm": 0.004452873952686787, + "learning_rate": 1.9009796677239953e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243841808, + "step": 112885 + }, + { + "epoch": 18.415986949429037, + "grad_norm": 0.0032919731456786394, + "learning_rate": 1.8990361027165726e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243853776, + "step": 112890 + }, + { + "epoch": 18.416802610114193, + "grad_norm": 0.0024367780424654484, + "learning_rate": 1.8970935125362076e-05, + "loss": 0.0042, + "num_input_tokens_seen": 243864432, + "step": 112895 + }, + { + "epoch": 18.41761827079935, + "grad_norm": 0.1128418818116188, + "learning_rate": 1.8951518972222637e-05, + "loss": 0.0025, + "num_input_tokens_seen": 243875504, + "step": 112900 + }, + { + "epoch": 18.418433931484504, + "grad_norm": 0.04299400746822357, + "learning_rate": 1.893211256814087e-05, + "loss": 0.0011, + "num_input_tokens_seen": 243885168, + "step": 112905 + }, + { + "epoch": 18.419249592169656, + "grad_norm": 0.007050327956676483, + "learning_rate": 1.891271591351018e-05, + "loss": 0.0011, + "num_input_tokens_seen": 243896208, + "step": 112910 + }, + { + "epoch": 18.420065252854812, + "grad_norm": 0.00047753899707458913, + "learning_rate": 1.8893329008723593e-05, + "loss": 0.0008, + "num_input_tokens_seen": 243907344, + "step": 112915 + }, + { + "epoch": 18.420880913539968, + "grad_norm": 0.6455734372138977, + "learning_rate": 1.8873951854173955e-05, + "loss": 0.0304, + "num_input_tokens_seen": 243918448, + "step": 112920 + }, + { + "epoch": 18.421696574225123, + "grad_norm": 0.00031255558133125305, + "learning_rate": 1.885458445025412e-05, + "loss": 0.0023, + "num_input_tokens_seen": 243927984, + "step": 112925 + }, + { + "epoch": 18.42251223491028, + "grad_norm": 0.0006976706790737808, + "learning_rate": 1.883522679735644e-05, + "loss": 0.062, + "num_input_tokens_seen": 243938160, + "step": 112930 + }, + { + "epoch": 18.42332789559543, + "grad_norm": 0.0007012999849393964, + "learning_rate": 1.8815878895873328e-05, + "loss": 0.0033, + "num_input_tokens_seen": 243949168, + "step": 112935 + }, + { + "epoch": 18.424143556280587, + "grad_norm": 0.00946141593158245, + "learning_rate": 1.87965407461968e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243960880, + "step": 112940 + }, + { + "epoch": 18.424959216965743, + "grad_norm": 0.002110017230734229, + "learning_rate": 1.877721234871893e-05, + "loss": 0.0026, + "num_input_tokens_seen": 243971120, + "step": 112945 + }, + { + "epoch": 18.4257748776509, + "grad_norm": 0.0002857319777831435, + "learning_rate": 1.8757893703831243e-05, + "loss": 0.0035, + "num_input_tokens_seen": 243981104, + "step": 112950 + }, + { + "epoch": 18.42659053833605, + "grad_norm": 0.009105951525270939, + "learning_rate": 1.8738584811925417e-05, + "loss": 0.0006, + "num_input_tokens_seen": 243992816, + "step": 112955 + }, + { + "epoch": 18.427406199021206, + "grad_norm": 0.0012741464888677, + "learning_rate": 1.8719285673392594e-05, + "loss": 0.0032, + "num_input_tokens_seen": 244003920, + "step": 112960 + }, + { + "epoch": 18.428221859706362, + "grad_norm": 0.021076209843158722, + "learning_rate": 1.869999628862401e-05, + "loss": 0.0018, + "num_input_tokens_seen": 244016528, + "step": 112965 + }, + { + "epoch": 18.429037520391518, + "grad_norm": 0.00040486734360456467, + "learning_rate": 1.8680716658010633e-05, + "loss": 0.0004, + "num_input_tokens_seen": 244027824, + "step": 112970 + }, + { + "epoch": 18.429853181076673, + "grad_norm": 0.00019396857533138245, + "learning_rate": 1.8661446781943093e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244039184, + "step": 112975 + }, + { + "epoch": 18.430668841761825, + "grad_norm": 0.004015372600406408, + "learning_rate": 1.8642186660811965e-05, + "loss": 0.0016, + "num_input_tokens_seen": 244051024, + "step": 112980 + }, + { + "epoch": 18.43148450244698, + "grad_norm": 0.0005323364166542888, + "learning_rate": 1.862293629500761e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244063248, + "step": 112985 + }, + { + "epoch": 18.432300163132137, + "grad_norm": 0.8979222774505615, + "learning_rate": 1.8603695684920042e-05, + "loss": 0.1016, + "num_input_tokens_seen": 244074000, + "step": 112990 + }, + { + "epoch": 18.433115823817293, + "grad_norm": 0.00034616264747455716, + "learning_rate": 1.858446483093934e-05, + "loss": 0.0037, + "num_input_tokens_seen": 244084752, + "step": 112995 + }, + { + "epoch": 18.43393148450245, + "grad_norm": 0.0007030692067928612, + "learning_rate": 1.856524373345514e-05, + "loss": 0.0016, + "num_input_tokens_seen": 244095856, + "step": 113000 + }, + { + "epoch": 18.4347471451876, + "grad_norm": 0.004537553526461124, + "learning_rate": 1.8546032392857014e-05, + "loss": 0.0179, + "num_input_tokens_seen": 244107696, + "step": 113005 + }, + { + "epoch": 18.435562805872756, + "grad_norm": 0.010070395655930042, + "learning_rate": 1.8526830809534377e-05, + "loss": 0.0023, + "num_input_tokens_seen": 244120528, + "step": 113010 + }, + { + "epoch": 18.436378466557912, + "grad_norm": 0.0016404170310124755, + "learning_rate": 1.8507638983876252e-05, + "loss": 0.0021, + "num_input_tokens_seen": 244131664, + "step": 113015 + }, + { + "epoch": 18.437194127243067, + "grad_norm": 0.002478382084518671, + "learning_rate": 1.84884569162716e-05, + "loss": 0.0022, + "num_input_tokens_seen": 244142288, + "step": 113020 + }, + { + "epoch": 18.438009787928223, + "grad_norm": 0.1491037756204605, + "learning_rate": 1.8469284607109282e-05, + "loss": 0.0029, + "num_input_tokens_seen": 244154480, + "step": 113025 + }, + { + "epoch": 18.438825448613375, + "grad_norm": 0.0012238170020282269, + "learning_rate": 1.8450122056777762e-05, + "loss": 0.0035, + "num_input_tokens_seen": 244164944, + "step": 113030 + }, + { + "epoch": 18.43964110929853, + "grad_norm": 0.00036461750278249383, + "learning_rate": 1.8430969265665398e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244175248, + "step": 113035 + }, + { + "epoch": 18.440456769983687, + "grad_norm": 0.002877857070416212, + "learning_rate": 1.8411826234160324e-05, + "loss": 0.0008, + "num_input_tokens_seen": 244185552, + "step": 113040 + }, + { + "epoch": 18.441272430668842, + "grad_norm": 0.010942216031253338, + "learning_rate": 1.8392692962650504e-05, + "loss": 0.0017, + "num_input_tokens_seen": 244195312, + "step": 113045 + }, + { + "epoch": 18.442088091353998, + "grad_norm": 0.007464864756911993, + "learning_rate": 1.8373569451523853e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244205840, + "step": 113050 + }, + { + "epoch": 18.44290375203915, + "grad_norm": 0.025183305144309998, + "learning_rate": 1.8354455701167672e-05, + "loss": 0.0017, + "num_input_tokens_seen": 244215216, + "step": 113055 + }, + { + "epoch": 18.443719412724306, + "grad_norm": 0.0012140703620389104, + "learning_rate": 1.833535171196954e-05, + "loss": 0.0019, + "num_input_tokens_seen": 244225648, + "step": 113060 + }, + { + "epoch": 18.44453507340946, + "grad_norm": 0.000981401652097702, + "learning_rate": 1.831625748431648e-05, + "loss": 0.0139, + "num_input_tokens_seen": 244237872, + "step": 113065 + }, + { + "epoch": 18.445350734094617, + "grad_norm": 0.00016673772188369185, + "learning_rate": 1.829717301859557e-05, + "loss": 0.0011, + "num_input_tokens_seen": 244248560, + "step": 113070 + }, + { + "epoch": 18.446166394779773, + "grad_norm": 0.014688815921545029, + "learning_rate": 1.8278098315193504e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244258960, + "step": 113075 + }, + { + "epoch": 18.446982055464925, + "grad_norm": 0.0007847630186006427, + "learning_rate": 1.8259033374496915e-05, + "loss": 0.0007, + "num_input_tokens_seen": 244269424, + "step": 113080 + }, + { + "epoch": 18.44779771615008, + "grad_norm": 0.00025912452838383615, + "learning_rate": 1.8239978196892105e-05, + "loss": 0.002, + "num_input_tokens_seen": 244281008, + "step": 113085 + }, + { + "epoch": 18.448613376835237, + "grad_norm": 0.0014495945069938898, + "learning_rate": 1.8220932782765377e-05, + "loss": 0.0027, + "num_input_tokens_seen": 244290544, + "step": 113090 + }, + { + "epoch": 18.449429037520392, + "grad_norm": 0.013381626456975937, + "learning_rate": 1.8201897132502476e-05, + "loss": 0.0067, + "num_input_tokens_seen": 244302000, + "step": 113095 + }, + { + "epoch": 18.450244698205548, + "grad_norm": 0.014497867785394192, + "learning_rate": 1.8182871246489487e-05, + "loss": 0.0007, + "num_input_tokens_seen": 244312336, + "step": 113100 + }, + { + "epoch": 18.4510603588907, + "grad_norm": 0.024804679676890373, + "learning_rate": 1.8163855125111707e-05, + "loss": 0.0024, + "num_input_tokens_seen": 244323600, + "step": 113105 + }, + { + "epoch": 18.451876019575856, + "grad_norm": 0.0011982301948592067, + "learning_rate": 1.8144848768754717e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244334704, + "step": 113110 + }, + { + "epoch": 18.45269168026101, + "grad_norm": 0.0005384967080317438, + "learning_rate": 1.8125852177803658e-05, + "loss": 0.0011, + "num_input_tokens_seen": 244344816, + "step": 113115 + }, + { + "epoch": 18.453507340946167, + "grad_norm": 0.003350053681060672, + "learning_rate": 1.8106865352643498e-05, + "loss": 0.0011, + "num_input_tokens_seen": 244355984, + "step": 113120 + }, + { + "epoch": 18.454323001631323, + "grad_norm": 0.01903350092470646, + "learning_rate": 1.808788829365904e-05, + "loss": 0.0014, + "num_input_tokens_seen": 244366320, + "step": 113125 + }, + { + "epoch": 18.455138662316475, + "grad_norm": 0.0018174276920035481, + "learning_rate": 1.8068921001234862e-05, + "loss": 0.0024, + "num_input_tokens_seen": 244377296, + "step": 113130 + }, + { + "epoch": 18.45595432300163, + "grad_norm": 0.0027823823038488626, + "learning_rate": 1.804996347575538e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244389392, + "step": 113135 + }, + { + "epoch": 18.456769983686787, + "grad_norm": 0.0007917169132269919, + "learning_rate": 1.8031015717604793e-05, + "loss": 0.0004, + "num_input_tokens_seen": 244399888, + "step": 113140 + }, + { + "epoch": 18.457585644371942, + "grad_norm": 0.563396155834198, + "learning_rate": 1.8012077727167065e-05, + "loss": 0.031, + "num_input_tokens_seen": 244410128, + "step": 113145 + }, + { + "epoch": 18.458401305057095, + "grad_norm": 0.03793555125594139, + "learning_rate": 1.7993149504826056e-05, + "loss": 0.0178, + "num_input_tokens_seen": 244421488, + "step": 113150 + }, + { + "epoch": 18.45921696574225, + "grad_norm": 0.010772444307804108, + "learning_rate": 1.7974231050965352e-05, + "loss": 0.002, + "num_input_tokens_seen": 244432464, + "step": 113155 + }, + { + "epoch": 18.460032626427406, + "grad_norm": 0.00041844710358418524, + "learning_rate": 1.7955322365968253e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244443984, + "step": 113160 + }, + { + "epoch": 18.46084828711256, + "grad_norm": 0.004533614031970501, + "learning_rate": 1.793642345021823e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244455056, + "step": 113165 + }, + { + "epoch": 18.461663947797717, + "grad_norm": 0.0014635130064561963, + "learning_rate": 1.7917534304097983e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244465808, + "step": 113170 + }, + { + "epoch": 18.46247960848287, + "grad_norm": 0.0006045596674084663, + "learning_rate": 1.7898654927990587e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244476080, + "step": 113175 + }, + { + "epoch": 18.463295269168025, + "grad_norm": 0.006617935374379158, + "learning_rate": 1.7879785322278408e-05, + "loss": 0.0035, + "num_input_tokens_seen": 244486608, + "step": 113180 + }, + { + "epoch": 18.46411092985318, + "grad_norm": 0.005817278753966093, + "learning_rate": 1.786092548734408e-05, + "loss": 0.0044, + "num_input_tokens_seen": 244497296, + "step": 113185 + }, + { + "epoch": 18.464926590538337, + "grad_norm": 0.003113929880782962, + "learning_rate": 1.7842075423569692e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244506288, + "step": 113190 + }, + { + "epoch": 18.465742251223492, + "grad_norm": 0.010674776509404182, + "learning_rate": 1.782323513133738e-05, + "loss": 0.0055, + "num_input_tokens_seen": 244517712, + "step": 113195 + }, + { + "epoch": 18.466557911908644, + "grad_norm": 0.0003821174323093146, + "learning_rate": 1.7804404611028778e-05, + "loss": 0.0044, + "num_input_tokens_seen": 244528912, + "step": 113200 + }, + { + "epoch": 18.4673735725938, + "grad_norm": 0.010828257538378239, + "learning_rate": 1.7785583863025757e-05, + "loss": 0.0022, + "num_input_tokens_seen": 244539984, + "step": 113205 + }, + { + "epoch": 18.468189233278956, + "grad_norm": 0.003220900660380721, + "learning_rate": 1.776677288770945e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244550864, + "step": 113210 + }, + { + "epoch": 18.46900489396411, + "grad_norm": 0.015322371385991573, + "learning_rate": 1.7747971685461383e-05, + "loss": 0.0023, + "num_input_tokens_seen": 244561392, + "step": 113215 + }, + { + "epoch": 18.469820554649267, + "grad_norm": 0.013095523230731487, + "learning_rate": 1.772918025666237e-05, + "loss": 0.0031, + "num_input_tokens_seen": 244571504, + "step": 113220 + }, + { + "epoch": 18.47063621533442, + "grad_norm": 0.004092440940439701, + "learning_rate": 1.7710398601693432e-05, + "loss": 0.0015, + "num_input_tokens_seen": 244581552, + "step": 113225 + }, + { + "epoch": 18.471451876019575, + "grad_norm": 0.0057231769897043705, + "learning_rate": 1.769162672093494e-05, + "loss": 0.0024, + "num_input_tokens_seen": 244592144, + "step": 113230 + }, + { + "epoch": 18.47226753670473, + "grad_norm": 0.255003958940506, + "learning_rate": 1.7672864614767636e-05, + "loss": 0.0078, + "num_input_tokens_seen": 244602448, + "step": 113235 + }, + { + "epoch": 18.473083197389887, + "grad_norm": 0.0010036260355263948, + "learning_rate": 1.7654112283571446e-05, + "loss": 0.001, + "num_input_tokens_seen": 244612368, + "step": 113240 + }, + { + "epoch": 18.473898858075042, + "grad_norm": 0.0030960855074226856, + "learning_rate": 1.7635369727726726e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244623184, + "step": 113245 + }, + { + "epoch": 18.474714518760194, + "grad_norm": 0.05858004838228226, + "learning_rate": 1.7616636947613063e-05, + "loss": 0.0031, + "num_input_tokens_seen": 244634032, + "step": 113250 + }, + { + "epoch": 18.47553017944535, + "grad_norm": 0.006544803269207478, + "learning_rate": 1.759791394361021e-05, + "loss": 0.0017, + "num_input_tokens_seen": 244645296, + "step": 113255 + }, + { + "epoch": 18.476345840130506, + "grad_norm": 0.000575044599827379, + "learning_rate": 1.757920071609764e-05, + "loss": 0.0013, + "num_input_tokens_seen": 244655632, + "step": 113260 + }, + { + "epoch": 18.47716150081566, + "grad_norm": 0.0021637838799506426, + "learning_rate": 1.75604972654545e-05, + "loss": 0.0099, + "num_input_tokens_seen": 244665776, + "step": 113265 + }, + { + "epoch": 18.477977161500817, + "grad_norm": 0.016059428453445435, + "learning_rate": 1.754180359205998e-05, + "loss": 0.0022, + "num_input_tokens_seen": 244677168, + "step": 113270 + }, + { + "epoch": 18.47879282218597, + "grad_norm": 0.025442641228437424, + "learning_rate": 1.752311969629278e-05, + "loss": 0.0018, + "num_input_tokens_seen": 244687632, + "step": 113275 + }, + { + "epoch": 18.479608482871125, + "grad_norm": 0.015501430258154869, + "learning_rate": 1.7504445578531703e-05, + "loss": 0.0033, + "num_input_tokens_seen": 244697328, + "step": 113280 + }, + { + "epoch": 18.48042414355628, + "grad_norm": 0.0016439296305179596, + "learning_rate": 1.7485781239155063e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244707952, + "step": 113285 + }, + { + "epoch": 18.481239804241437, + "grad_norm": 0.003533845068886876, + "learning_rate": 1.7467126678541223e-05, + "loss": 0.0016, + "num_input_tokens_seen": 244718896, + "step": 113290 + }, + { + "epoch": 18.482055464926592, + "grad_norm": 0.01935855858027935, + "learning_rate": 1.7448481897068158e-05, + "loss": 0.0012, + "num_input_tokens_seen": 244729104, + "step": 113295 + }, + { + "epoch": 18.482871125611744, + "grad_norm": 0.00022205821005627513, + "learning_rate": 1.742984689511379e-05, + "loss": 0.0032, + "num_input_tokens_seen": 244738320, + "step": 113300 + }, + { + "epoch": 18.4836867862969, + "grad_norm": 0.0034483519848436117, + "learning_rate": 1.7411221673055644e-05, + "loss": 0.0011, + "num_input_tokens_seen": 244749680, + "step": 113305 + }, + { + "epoch": 18.484502446982056, + "grad_norm": 0.004889797419309616, + "learning_rate": 1.739260623127148e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244760816, + "step": 113310 + }, + { + "epoch": 18.48531810766721, + "grad_norm": 0.008101309649646282, + "learning_rate": 1.737400057013827e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244771440, + "step": 113315 + }, + { + "epoch": 18.486133768352367, + "grad_norm": 0.0009130858816206455, + "learning_rate": 1.735540469003327e-05, + "loss": 0.1336, + "num_input_tokens_seen": 244782096, + "step": 113320 + }, + { + "epoch": 18.48694942903752, + "grad_norm": 0.055807098746299744, + "learning_rate": 1.733681859133318e-05, + "loss": 0.1183, + "num_input_tokens_seen": 244792144, + "step": 113325 + }, + { + "epoch": 18.487765089722675, + "grad_norm": 0.00047300878213718534, + "learning_rate": 1.7318242274414864e-05, + "loss": 0.0023, + "num_input_tokens_seen": 244801424, + "step": 113330 + }, + { + "epoch": 18.48858075040783, + "grad_norm": 0.0005087702884338796, + "learning_rate": 1.7299675739654575e-05, + "loss": 0.0036, + "num_input_tokens_seen": 244812560, + "step": 113335 + }, + { + "epoch": 18.489396411092986, + "grad_norm": 0.15531721711158752, + "learning_rate": 1.7281118987428847e-05, + "loss": 0.0044, + "num_input_tokens_seen": 244821808, + "step": 113340 + }, + { + "epoch": 18.49021207177814, + "grad_norm": 0.00020166859030723572, + "learning_rate": 1.7262572018113488e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244833424, + "step": 113345 + }, + { + "epoch": 18.491027732463294, + "grad_norm": 0.0009022870799526572, + "learning_rate": 1.7244034832084587e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244843504, + "step": 113350 + }, + { + "epoch": 18.49184339314845, + "grad_norm": 0.0002938243851531297, + "learning_rate": 1.722550742971768e-05, + "loss": 0.0004, + "num_input_tokens_seen": 244853360, + "step": 113355 + }, + { + "epoch": 18.492659053833606, + "grad_norm": 0.04535282030701637, + "learning_rate": 1.720698981138835e-05, + "loss": 0.0035, + "num_input_tokens_seen": 244864432, + "step": 113360 + }, + { + "epoch": 18.49347471451876, + "grad_norm": 0.003897402435541153, + "learning_rate": 1.7188481977471804e-05, + "loss": 0.0034, + "num_input_tokens_seen": 244874928, + "step": 113365 + }, + { + "epoch": 18.494290375203914, + "grad_norm": 0.01214703917503357, + "learning_rate": 1.716998392834318e-05, + "loss": 0.0013, + "num_input_tokens_seen": 244886640, + "step": 113370 + }, + { + "epoch": 18.49510603588907, + "grad_norm": 0.0006659848149865866, + "learning_rate": 1.715149566437735e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244896912, + "step": 113375 + }, + { + "epoch": 18.495921696574225, + "grad_norm": 0.008747376501560211, + "learning_rate": 1.7133017185949007e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244907728, + "step": 113380 + }, + { + "epoch": 18.49673735725938, + "grad_norm": 0.0059668924659490585, + "learning_rate": 1.711454849343258e-05, + "loss": 0.0677, + "num_input_tokens_seen": 244918352, + "step": 113385 + }, + { + "epoch": 18.497553017944536, + "grad_norm": 0.0030300829093903303, + "learning_rate": 1.709608958720249e-05, + "loss": 0.0044, + "num_input_tokens_seen": 244928336, + "step": 113390 + }, + { + "epoch": 18.49836867862969, + "grad_norm": 0.00733586261048913, + "learning_rate": 1.7077640467632714e-05, + "loss": 0.0014, + "num_input_tokens_seen": 244938512, + "step": 113395 + }, + { + "epoch": 18.499184339314844, + "grad_norm": 0.002480535302311182, + "learning_rate": 1.705920113509718e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244950128, + "step": 113400 + }, + { + "epoch": 18.5, + "grad_norm": 0.0012112815165892243, + "learning_rate": 1.7040771589969583e-05, + "loss": 0.0371, + "num_input_tokens_seen": 244960368, + "step": 113405 + }, + { + "epoch": 18.500815660685156, + "grad_norm": 0.002310445299372077, + "learning_rate": 1.7022351832623407e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244971440, + "step": 113410 + }, + { + "epoch": 18.50163132137031, + "grad_norm": 0.004520154092460871, + "learning_rate": 1.7003941863432014e-05, + "loss": 0.0026, + "num_input_tokens_seen": 244981456, + "step": 113415 + }, + { + "epoch": 18.502446982055464, + "grad_norm": 0.0012209441047161818, + "learning_rate": 1.6985541682768445e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244991216, + "step": 113420 + }, + { + "epoch": 18.50326264274062, + "grad_norm": 0.000785894924774766, + "learning_rate": 1.696715129100562e-05, + "loss": 0.0013, + "num_input_tokens_seen": 245002352, + "step": 113425 + }, + { + "epoch": 18.504078303425775, + "grad_norm": 0.008201303891837597, + "learning_rate": 1.6948770688516248e-05, + "loss": 0.0012, + "num_input_tokens_seen": 245013904, + "step": 113430 + }, + { + "epoch": 18.50489396411093, + "grad_norm": 0.0007624907302670181, + "learning_rate": 1.6930399875672853e-05, + "loss": 0.083, + "num_input_tokens_seen": 245025456, + "step": 113435 + }, + { + "epoch": 18.505709624796086, + "grad_norm": 0.11084222048521042, + "learning_rate": 1.69120388528477e-05, + "loss": 0.0091, + "num_input_tokens_seen": 245036560, + "step": 113440 + }, + { + "epoch": 18.50652528548124, + "grad_norm": 0.0011350169079378247, + "learning_rate": 1.6893687620412933e-05, + "loss": 0.0029, + "num_input_tokens_seen": 245047248, + "step": 113445 + }, + { + "epoch": 18.507340946166394, + "grad_norm": 0.0018546866485849023, + "learning_rate": 1.687534617874037e-05, + "loss": 0.0012, + "num_input_tokens_seen": 245059536, + "step": 113450 + }, + { + "epoch": 18.50815660685155, + "grad_norm": 0.0010910548735409975, + "learning_rate": 1.685701452820193e-05, + "loss": 0.0005, + "num_input_tokens_seen": 245068592, + "step": 113455 + }, + { + "epoch": 18.508972267536706, + "grad_norm": 0.0018128188094124198, + "learning_rate": 1.6838692669168876e-05, + "loss": 0.0014, + "num_input_tokens_seen": 245080912, + "step": 113460 + }, + { + "epoch": 18.50978792822186, + "grad_norm": 0.006234641652554274, + "learning_rate": 1.682038060201274e-05, + "loss": 0.0011, + "num_input_tokens_seen": 245091824, + "step": 113465 + }, + { + "epoch": 18.510603588907014, + "grad_norm": 0.00048175413394346833, + "learning_rate": 1.680207832710451e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245102928, + "step": 113470 + }, + { + "epoch": 18.51141924959217, + "grad_norm": 0.002115569543093443, + "learning_rate": 1.6783785844815157e-05, + "loss": 0.002, + "num_input_tokens_seen": 245113200, + "step": 113475 + }, + { + "epoch": 18.512234910277325, + "grad_norm": 0.007819109596312046, + "learning_rate": 1.6765503155515394e-05, + "loss": 0.0019, + "num_input_tokens_seen": 245125360, + "step": 113480 + }, + { + "epoch": 18.51305057096248, + "grad_norm": 0.0004998321528546512, + "learning_rate": 1.6747230259575696e-05, + "loss": 0.0016, + "num_input_tokens_seen": 245136016, + "step": 113485 + }, + { + "epoch": 18.513866231647633, + "grad_norm": 0.005353355780243874, + "learning_rate": 1.6728967157366492e-05, + "loss": 0.0038, + "num_input_tokens_seen": 245146896, + "step": 113490 + }, + { + "epoch": 18.51468189233279, + "grad_norm": 0.0007995230262167752, + "learning_rate": 1.671071384925782e-05, + "loss": 0.0005, + "num_input_tokens_seen": 245157104, + "step": 113495 + }, + { + "epoch": 18.515497553017944, + "grad_norm": 0.07687735557556152, + "learning_rate": 1.66924703356196e-05, + "loss": 0.0028, + "num_input_tokens_seen": 245167696, + "step": 113500 + }, + { + "epoch": 18.5163132137031, + "grad_norm": 0.03595279902219772, + "learning_rate": 1.6674236616821602e-05, + "loss": 0.0018, + "num_input_tokens_seen": 245178768, + "step": 113505 + }, + { + "epoch": 18.517128874388256, + "grad_norm": 0.2188006043434143, + "learning_rate": 1.6656012693233357e-05, + "loss": 0.0039, + "num_input_tokens_seen": 245189264, + "step": 113510 + }, + { + "epoch": 18.517944535073408, + "grad_norm": 0.005239508114755154, + "learning_rate": 1.6637798565224127e-05, + "loss": 0.0045, + "num_input_tokens_seen": 245199600, + "step": 113515 + }, + { + "epoch": 18.518760195758563, + "grad_norm": 0.0013815397396683693, + "learning_rate": 1.6619594233163172e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245211856, + "step": 113520 + }, + { + "epoch": 18.51957585644372, + "grad_norm": 0.0036403543781489134, + "learning_rate": 1.6601399697419306e-05, + "loss": 0.003, + "num_input_tokens_seen": 245222288, + "step": 113525 + }, + { + "epoch": 18.520391517128875, + "grad_norm": 0.0004640101979020983, + "learning_rate": 1.658321495836135e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245230896, + "step": 113530 + }, + { + "epoch": 18.52120717781403, + "grad_norm": 0.00018253900634590536, + "learning_rate": 1.6565040016357725e-05, + "loss": 0.0023, + "num_input_tokens_seen": 245242736, + "step": 113535 + }, + { + "epoch": 18.522022838499183, + "grad_norm": 0.0010609666351228952, + "learning_rate": 1.654687487177692e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245253616, + "step": 113540 + }, + { + "epoch": 18.52283849918434, + "grad_norm": 0.0076196757145226, + "learning_rate": 1.6528719524986967e-05, + "loss": 0.0029, + "num_input_tokens_seen": 245263792, + "step": 113545 + }, + { + "epoch": 18.523654159869494, + "grad_norm": 0.008771974593400955, + "learning_rate": 1.6510573976355858e-05, + "loss": 0.001, + "num_input_tokens_seen": 245273104, + "step": 113550 + }, + { + "epoch": 18.52446982055465, + "grad_norm": 0.0018892057705670595, + "learning_rate": 1.6492438226251295e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245283312, + "step": 113555 + }, + { + "epoch": 18.525285481239806, + "grad_norm": 0.0011229922529309988, + "learning_rate": 1.647431227504087e-05, + "loss": 0.0059, + "num_input_tokens_seen": 245293776, + "step": 113560 + }, + { + "epoch": 18.526101141924958, + "grad_norm": 0.0634569600224495, + "learning_rate": 1.645619612309185e-05, + "loss": 0.0013, + "num_input_tokens_seen": 245304816, + "step": 113565 + }, + { + "epoch": 18.526916802610113, + "grad_norm": 0.2746676504611969, + "learning_rate": 1.6438089770771435e-05, + "loss": 0.0055, + "num_input_tokens_seen": 245316112, + "step": 113570 + }, + { + "epoch": 18.52773246329527, + "grad_norm": 0.016933711245656013, + "learning_rate": 1.6419993218446673e-05, + "loss": 0.0026, + "num_input_tokens_seen": 245328016, + "step": 113575 + }, + { + "epoch": 18.528548123980425, + "grad_norm": 0.007328105624765158, + "learning_rate": 1.640190646648404e-05, + "loss": 0.0011, + "num_input_tokens_seen": 245339856, + "step": 113580 + }, + { + "epoch": 18.52936378466558, + "grad_norm": 0.002034168690443039, + "learning_rate": 1.638382951525047e-05, + "loss": 0.0018, + "num_input_tokens_seen": 245350160, + "step": 113585 + }, + { + "epoch": 18.530179445350733, + "grad_norm": 0.0021341920364648104, + "learning_rate": 1.6365762365111947e-05, + "loss": 0.0017, + "num_input_tokens_seen": 245361200, + "step": 113590 + }, + { + "epoch": 18.53099510603589, + "grad_norm": 0.0005403195391409099, + "learning_rate": 1.6347705016434844e-05, + "loss": 0.0022, + "num_input_tokens_seen": 245370832, + "step": 113595 + }, + { + "epoch": 18.531810766721044, + "grad_norm": 0.00026967190206050873, + "learning_rate": 1.6329657469585037e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245380688, + "step": 113600 + }, + { + "epoch": 18.5326264274062, + "grad_norm": 0.00889151357114315, + "learning_rate": 1.6311619724928283e-05, + "loss": 0.0054, + "num_input_tokens_seen": 245391920, + "step": 113605 + }, + { + "epoch": 18.533442088091356, + "grad_norm": 0.0009094868437387049, + "learning_rate": 1.6293591782830186e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245404144, + "step": 113610 + }, + { + "epoch": 18.534257748776508, + "grad_norm": 0.0718315988779068, + "learning_rate": 1.6275573643656115e-05, + "loss": 0.0088, + "num_input_tokens_seen": 245414064, + "step": 113615 + }, + { + "epoch": 18.535073409461663, + "grad_norm": 0.04146379604935646, + "learning_rate": 1.6257565307771115e-05, + "loss": 0.0023, + "num_input_tokens_seen": 245425424, + "step": 113620 + }, + { + "epoch": 18.53588907014682, + "grad_norm": 0.015665479004383087, + "learning_rate": 1.6239566775540283e-05, + "loss": 0.0028, + "num_input_tokens_seen": 245434992, + "step": 113625 + }, + { + "epoch": 18.536704730831975, + "grad_norm": 0.0012484738836064935, + "learning_rate": 1.6221578047328322e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245446288, + "step": 113630 + }, + { + "epoch": 18.53752039151713, + "grad_norm": 0.28304043412208557, + "learning_rate": 1.6203599123499778e-05, + "loss": 0.0022, + "num_input_tokens_seen": 245456816, + "step": 113635 + }, + { + "epoch": 18.538336052202283, + "grad_norm": 0.008190099149942398, + "learning_rate": 1.6185630004419027e-05, + "loss": 0.0044, + "num_input_tokens_seen": 245467408, + "step": 113640 + }, + { + "epoch": 18.53915171288744, + "grad_norm": 0.00023345145746134222, + "learning_rate": 1.6167670690450276e-05, + "loss": 0.0125, + "num_input_tokens_seen": 245478160, + "step": 113645 + }, + { + "epoch": 18.539967373572594, + "grad_norm": 0.012173679657280445, + "learning_rate": 1.6149721181957456e-05, + "loss": 0.0011, + "num_input_tokens_seen": 245488688, + "step": 113650 + }, + { + "epoch": 18.54078303425775, + "grad_norm": 0.00276687229052186, + "learning_rate": 1.6131781479304332e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245499600, + "step": 113655 + }, + { + "epoch": 18.541598694942905, + "grad_norm": 0.004582415334880352, + "learning_rate": 1.61138515828545e-05, + "loss": 0.0013, + "num_input_tokens_seen": 245510096, + "step": 113660 + }, + { + "epoch": 18.542414355628058, + "grad_norm": 0.012344161979854107, + "learning_rate": 1.6095931492971282e-05, + "loss": 0.0022, + "num_input_tokens_seen": 245519760, + "step": 113665 + }, + { + "epoch": 18.543230016313213, + "grad_norm": 0.0012308456934988499, + "learning_rate": 1.6078021210017945e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245530768, + "step": 113670 + }, + { + "epoch": 18.54404567699837, + "grad_norm": 0.004262410569936037, + "learning_rate": 1.6060120734357366e-05, + "loss": 0.0009, + "num_input_tokens_seen": 245541232, + "step": 113675 + }, + { + "epoch": 18.544861337683525, + "grad_norm": 0.006430953275412321, + "learning_rate": 1.604223006635236e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245552976, + "step": 113680 + }, + { + "epoch": 18.545676998368677, + "grad_norm": 0.0008573816157877445, + "learning_rate": 1.6024349206365475e-05, + "loss": 0.0023, + "num_input_tokens_seen": 245564272, + "step": 113685 + }, + { + "epoch": 18.546492659053833, + "grad_norm": 0.0015064050676301122, + "learning_rate": 1.6006478154759197e-05, + "loss": 0.003, + "num_input_tokens_seen": 245574896, + "step": 113690 + }, + { + "epoch": 18.54730831973899, + "grad_norm": 0.015532300807535648, + "learning_rate": 1.598861691189557e-05, + "loss": 0.0016, + "num_input_tokens_seen": 245586096, + "step": 113695 + }, + { + "epoch": 18.548123980424144, + "grad_norm": 0.0036405641585588455, + "learning_rate": 1.5970765478136696e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245596656, + "step": 113700 + }, + { + "epoch": 18.5489396411093, + "grad_norm": 0.14665931463241577, + "learning_rate": 1.5952923853844224e-05, + "loss": 0.0032, + "num_input_tokens_seen": 245608112, + "step": 113705 + }, + { + "epoch": 18.549755301794452, + "grad_norm": 0.0008804710232652724, + "learning_rate": 1.5935092039379874e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245619152, + "step": 113710 + }, + { + "epoch": 18.550570962479608, + "grad_norm": 0.006566312164068222, + "learning_rate": 1.5917270035104903e-05, + "loss": 0.0018, + "num_input_tokens_seen": 245629424, + "step": 113715 + }, + { + "epoch": 18.551386623164763, + "grad_norm": 0.0007352828979492188, + "learning_rate": 1.5899457841380637e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245640560, + "step": 113720 + }, + { + "epoch": 18.55220228384992, + "grad_norm": 0.00016575584595557302, + "learning_rate": 1.5881655458567847e-05, + "loss": 0.0012, + "num_input_tokens_seen": 245651568, + "step": 113725 + }, + { + "epoch": 18.553017944535075, + "grad_norm": 0.025155337527394295, + "learning_rate": 1.5863862887027626e-05, + "loss": 0.001, + "num_input_tokens_seen": 245662768, + "step": 113730 + }, + { + "epoch": 18.553833605220227, + "grad_norm": 0.04157517850399017, + "learning_rate": 1.5846080127120244e-05, + "loss": 0.004, + "num_input_tokens_seen": 245674128, + "step": 113735 + }, + { + "epoch": 18.554649265905383, + "grad_norm": 0.0004997915239073336, + "learning_rate": 1.58283071792063e-05, + "loss": 0.0424, + "num_input_tokens_seen": 245684912, + "step": 113740 + }, + { + "epoch": 18.55546492659054, + "grad_norm": 0.017312675714492798, + "learning_rate": 1.581054404364596e-05, + "loss": 0.0033, + "num_input_tokens_seen": 245695760, + "step": 113745 + }, + { + "epoch": 18.556280587275694, + "grad_norm": 0.12121226638555527, + "learning_rate": 1.5792790720799144e-05, + "loss": 0.0295, + "num_input_tokens_seen": 245706416, + "step": 113750 + }, + { + "epoch": 18.55709624796085, + "grad_norm": 0.0010711740469560027, + "learning_rate": 1.5775047211025685e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245717648, + "step": 113755 + }, + { + "epoch": 18.557911908646002, + "grad_norm": 0.007575131021440029, + "learning_rate": 1.575731351468518e-05, + "loss": 0.0034, + "num_input_tokens_seen": 245729008, + "step": 113760 + }, + { + "epoch": 18.558727569331158, + "grad_norm": 0.0685509443283081, + "learning_rate": 1.5739589632137006e-05, + "loss": 0.0027, + "num_input_tokens_seen": 245738416, + "step": 113765 + }, + { + "epoch": 18.559543230016313, + "grad_norm": 0.013830579817295074, + "learning_rate": 1.572187556374044e-05, + "loss": 0.0065, + "num_input_tokens_seen": 245747920, + "step": 113770 + }, + { + "epoch": 18.56035889070147, + "grad_norm": 0.016269782558083534, + "learning_rate": 1.5704171309854354e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245759920, + "step": 113775 + }, + { + "epoch": 18.561174551386625, + "grad_norm": 0.0017003034008666873, + "learning_rate": 1.568647687083763e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245770416, + "step": 113780 + }, + { + "epoch": 18.561990212071777, + "grad_norm": 0.0017265173373743892, + "learning_rate": 1.5668792247048868e-05, + "loss": 0.0009, + "num_input_tokens_seen": 245782384, + "step": 113785 + }, + { + "epoch": 18.562805872756933, + "grad_norm": 0.009840810671448708, + "learning_rate": 1.565111743884634e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245793744, + "step": 113790 + }, + { + "epoch": 18.563621533442088, + "grad_norm": 0.008591379038989544, + "learning_rate": 1.5633452446588537e-05, + "loss": 0.0009, + "num_input_tokens_seen": 245803824, + "step": 113795 + }, + { + "epoch": 18.564437194127244, + "grad_norm": 0.0034411675296723843, + "learning_rate": 1.5615797270633114e-05, + "loss": 0.001, + "num_input_tokens_seen": 245814832, + "step": 113800 + }, + { + "epoch": 18.5652528548124, + "grad_norm": 0.004118712618947029, + "learning_rate": 1.5598151911338176e-05, + "loss": 0.0335, + "num_input_tokens_seen": 245825072, + "step": 113805 + }, + { + "epoch": 18.56606851549755, + "grad_norm": 0.007064030971378088, + "learning_rate": 1.5580516369061103e-05, + "loss": 0.0008, + "num_input_tokens_seen": 245835312, + "step": 113810 + }, + { + "epoch": 18.566884176182707, + "grad_norm": 0.16438302397727966, + "learning_rate": 1.55628906441595e-05, + "loss": 0.003, + "num_input_tokens_seen": 245847056, + "step": 113815 + }, + { + "epoch": 18.567699836867863, + "grad_norm": 0.005964207462966442, + "learning_rate": 1.5545274736990354e-05, + "loss": 0.001, + "num_input_tokens_seen": 245858896, + "step": 113820 + }, + { + "epoch": 18.56851549755302, + "grad_norm": 0.0032159110996872187, + "learning_rate": 1.5527668647910886e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245867472, + "step": 113825 + }, + { + "epoch": 18.569331158238175, + "grad_norm": 0.05260089412331581, + "learning_rate": 1.5510072377277696e-05, + "loss": 0.0038, + "num_input_tokens_seen": 245877040, + "step": 113830 + }, + { + "epoch": 18.570146818923327, + "grad_norm": 0.4256327748298645, + "learning_rate": 1.5492485925447663e-05, + "loss": 0.0134, + "num_input_tokens_seen": 245887664, + "step": 113835 + }, + { + "epoch": 18.570962479608482, + "grad_norm": 0.055682551115751266, + "learning_rate": 1.5474909292776895e-05, + "loss": 0.0013, + "num_input_tokens_seen": 245899248, + "step": 113840 + }, + { + "epoch": 18.571778140293638, + "grad_norm": 0.005342925898730755, + "learning_rate": 1.5457342479621883e-05, + "loss": 0.0008, + "num_input_tokens_seen": 245909488, + "step": 113845 + }, + { + "epoch": 18.572593800978794, + "grad_norm": 0.0508258119225502, + "learning_rate": 1.5439785486338396e-05, + "loss": 0.0012, + "num_input_tokens_seen": 245921264, + "step": 113850 + }, + { + "epoch": 18.57340946166395, + "grad_norm": 0.0017754074651747942, + "learning_rate": 1.5422238313282434e-05, + "loss": 0.0076, + "num_input_tokens_seen": 245932880, + "step": 113855 + }, + { + "epoch": 18.5742251223491, + "grad_norm": 0.003250130685046315, + "learning_rate": 1.540470096080948e-05, + "loss": 0.0005, + "num_input_tokens_seen": 245943120, + "step": 113860 + }, + { + "epoch": 18.575040783034257, + "grad_norm": 0.039365023374557495, + "learning_rate": 1.538717342927509e-05, + "loss": 0.0042, + "num_input_tokens_seen": 245954256, + "step": 113865 + }, + { + "epoch": 18.575856443719413, + "grad_norm": 0.0029328095261007547, + "learning_rate": 1.536965571903437e-05, + "loss": 0.0018, + "num_input_tokens_seen": 245966000, + "step": 113870 + }, + { + "epoch": 18.57667210440457, + "grad_norm": 0.053971800953149796, + "learning_rate": 1.535214783044242e-05, + "loss": 0.0025, + "num_input_tokens_seen": 245976752, + "step": 113875 + }, + { + "epoch": 18.57748776508972, + "grad_norm": 0.002047772752121091, + "learning_rate": 1.5334649763853903e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245987888, + "step": 113880 + }, + { + "epoch": 18.578303425774877, + "grad_norm": 0.0011964102741330862, + "learning_rate": 1.5317161519623647e-05, + "loss": 0.005, + "num_input_tokens_seen": 245999216, + "step": 113885 + }, + { + "epoch": 18.579119086460032, + "grad_norm": 0.0003268739383202046, + "learning_rate": 1.529968309810592e-05, + "loss": 0.0057, + "num_input_tokens_seen": 246010032, + "step": 113890 + }, + { + "epoch": 18.579934747145188, + "grad_norm": 0.008182469755411148, + "learning_rate": 1.5282214499655055e-05, + "loss": 0.0068, + "num_input_tokens_seen": 246019952, + "step": 113895 + }, + { + "epoch": 18.580750407830344, + "grad_norm": 0.0009706748533062637, + "learning_rate": 1.526475572462499e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246030704, + "step": 113900 + }, + { + "epoch": 18.581566068515496, + "grad_norm": 0.016164537519216537, + "learning_rate": 1.5247306773369552e-05, + "loss": 0.0421, + "num_input_tokens_seen": 246040816, + "step": 113905 + }, + { + "epoch": 18.58238172920065, + "grad_norm": 0.0016942867077887058, + "learning_rate": 1.5229867646242457e-05, + "loss": 0.0023, + "num_input_tokens_seen": 246049808, + "step": 113910 + }, + { + "epoch": 18.583197389885807, + "grad_norm": 0.004356156103312969, + "learning_rate": 1.5212438343597036e-05, + "loss": 0.0015, + "num_input_tokens_seen": 246060816, + "step": 113915 + }, + { + "epoch": 18.584013050570963, + "grad_norm": 0.010752552188932896, + "learning_rate": 1.5195018865786559e-05, + "loss": 0.0013, + "num_input_tokens_seen": 246071408, + "step": 113920 + }, + { + "epoch": 18.58482871125612, + "grad_norm": 0.003911882638931274, + "learning_rate": 1.5177609213164023e-05, + "loss": 0.0014, + "num_input_tokens_seen": 246082800, + "step": 113925 + }, + { + "epoch": 18.58564437194127, + "grad_norm": 0.011551330797374249, + "learning_rate": 1.5160209386082314e-05, + "loss": 0.0008, + "num_input_tokens_seen": 246093872, + "step": 113930 + }, + { + "epoch": 18.586460032626427, + "grad_norm": 0.0020503744017332792, + "learning_rate": 1.5142819384893925e-05, + "loss": 0.0014, + "num_input_tokens_seen": 246103568, + "step": 113935 + }, + { + "epoch": 18.587275693311582, + "grad_norm": 0.004918430466204882, + "learning_rate": 1.512543920995152e-05, + "loss": 0.0013, + "num_input_tokens_seen": 246114256, + "step": 113940 + }, + { + "epoch": 18.588091353996738, + "grad_norm": 0.0029170692432671785, + "learning_rate": 1.5108068861607094e-05, + "loss": 0.0011, + "num_input_tokens_seen": 246125456, + "step": 113945 + }, + { + "epoch": 18.588907014681894, + "grad_norm": 0.002532045356929302, + "learning_rate": 1.5090708340212867e-05, + "loss": 0.0022, + "num_input_tokens_seen": 246135920, + "step": 113950 + }, + { + "epoch": 18.589722675367046, + "grad_norm": 0.00437668664380908, + "learning_rate": 1.5073357646120501e-05, + "loss": 0.0007, + "num_input_tokens_seen": 246146576, + "step": 113955 + }, + { + "epoch": 18.5905383360522, + "grad_norm": 0.010784292593598366, + "learning_rate": 1.5056016779681825e-05, + "loss": 0.0008, + "num_input_tokens_seen": 246157552, + "step": 113960 + }, + { + "epoch": 18.591353996737357, + "grad_norm": 0.009024699218571186, + "learning_rate": 1.5038685741248059e-05, + "loss": 0.002, + "num_input_tokens_seen": 246168880, + "step": 113965 + }, + { + "epoch": 18.592169657422513, + "grad_norm": 0.0006613527657464147, + "learning_rate": 1.502136453117059e-05, + "loss": 0.0202, + "num_input_tokens_seen": 246179024, + "step": 113970 + }, + { + "epoch": 18.59298531810767, + "grad_norm": 0.0016143594402819872, + "learning_rate": 1.5004053149800356e-05, + "loss": 0.0006, + "num_input_tokens_seen": 246190832, + "step": 113975 + }, + { + "epoch": 18.59380097879282, + "grad_norm": 0.0007201501284725964, + "learning_rate": 1.4986751597488357e-05, + "loss": 0.0036, + "num_input_tokens_seen": 246203216, + "step": 113980 + }, + { + "epoch": 18.594616639477977, + "grad_norm": 0.0003400088753551245, + "learning_rate": 1.4969459874585034e-05, + "loss": 0.0658, + "num_input_tokens_seen": 246213488, + "step": 113985 + }, + { + "epoch": 18.595432300163132, + "grad_norm": 0.0024805832654237747, + "learning_rate": 1.495217798144094e-05, + "loss": 0.0011, + "num_input_tokens_seen": 246224624, + "step": 113990 + }, + { + "epoch": 18.596247960848288, + "grad_norm": 0.06852786242961884, + "learning_rate": 1.4934905918406239e-05, + "loss": 0.0019, + "num_input_tokens_seen": 246234928, + "step": 113995 + }, + { + "epoch": 18.597063621533444, + "grad_norm": 0.30148282647132874, + "learning_rate": 1.491764368583104e-05, + "loss": 0.0127, + "num_input_tokens_seen": 246244784, + "step": 114000 + }, + { + "epoch": 18.597879282218596, + "grad_norm": 0.0019158965442329645, + "learning_rate": 1.4900391284065229e-05, + "loss": 0.0076, + "num_input_tokens_seen": 246254672, + "step": 114005 + }, + { + "epoch": 18.59869494290375, + "grad_norm": 0.00015638173499610275, + "learning_rate": 1.4883148713458306e-05, + "loss": 0.0009, + "num_input_tokens_seen": 246265904, + "step": 114010 + }, + { + "epoch": 18.599510603588907, + "grad_norm": 0.010793359018862247, + "learning_rate": 1.4865915974359823e-05, + "loss": 0.0018, + "num_input_tokens_seen": 246276240, + "step": 114015 + }, + { + "epoch": 18.600326264274063, + "grad_norm": 0.0045156884007155895, + "learning_rate": 1.4848693067119e-05, + "loss": 0.0022, + "num_input_tokens_seen": 246287472, + "step": 114020 + }, + { + "epoch": 18.601141924959215, + "grad_norm": 0.003806586842983961, + "learning_rate": 1.483147999208484e-05, + "loss": 0.0018, + "num_input_tokens_seen": 246297200, + "step": 114025 + }, + { + "epoch": 18.60195758564437, + "grad_norm": 0.03463774919509888, + "learning_rate": 1.4814276749606226e-05, + "loss": 0.0029, + "num_input_tokens_seen": 246307472, + "step": 114030 + }, + { + "epoch": 18.602773246329527, + "grad_norm": 0.0018811143236234784, + "learning_rate": 1.4797083340031769e-05, + "loss": 0.002, + "num_input_tokens_seen": 246319152, + "step": 114035 + }, + { + "epoch": 18.603588907014682, + "grad_norm": 0.15123361349105835, + "learning_rate": 1.477989976370997e-05, + "loss": 0.0057, + "num_input_tokens_seen": 246329648, + "step": 114040 + }, + { + "epoch": 18.604404567699838, + "grad_norm": 0.009590948931872845, + "learning_rate": 1.4762726020989047e-05, + "loss": 0.0011, + "num_input_tokens_seen": 246339472, + "step": 114045 + }, + { + "epoch": 18.605220228384994, + "grad_norm": 0.5494592189788818, + "learning_rate": 1.4745562112217059e-05, + "loss": 0.0209, + "num_input_tokens_seen": 246350224, + "step": 114050 + }, + { + "epoch": 18.606035889070146, + "grad_norm": 0.002357152756303549, + "learning_rate": 1.4728408037741836e-05, + "loss": 0.0023, + "num_input_tokens_seen": 246362224, + "step": 114055 + }, + { + "epoch": 18.6068515497553, + "grad_norm": 0.041462235152721405, + "learning_rate": 1.4711263797911045e-05, + "loss": 0.0024, + "num_input_tokens_seen": 246373584, + "step": 114060 + }, + { + "epoch": 18.607667210440457, + "grad_norm": 0.10488732159137726, + "learning_rate": 1.469412939307213e-05, + "loss": 0.0022, + "num_input_tokens_seen": 246383920, + "step": 114065 + }, + { + "epoch": 18.608482871125613, + "grad_norm": 0.038092661648988724, + "learning_rate": 1.4677004823572316e-05, + "loss": 0.0028, + "num_input_tokens_seen": 246394544, + "step": 114070 + }, + { + "epoch": 18.609298531810765, + "grad_norm": 0.0004575471393764019, + "learning_rate": 1.4659890089758654e-05, + "loss": 0.0005, + "num_input_tokens_seen": 246405200, + "step": 114075 + }, + { + "epoch": 18.61011419249592, + "grad_norm": 0.0010731341317296028, + "learning_rate": 1.4642785191978036e-05, + "loss": 0.0494, + "num_input_tokens_seen": 246414960, + "step": 114080 + }, + { + "epoch": 18.610929853181077, + "grad_norm": 0.0021726840641349554, + "learning_rate": 1.462569013057713e-05, + "loss": 0.0007, + "num_input_tokens_seen": 246426352, + "step": 114085 + }, + { + "epoch": 18.611745513866232, + "grad_norm": 0.00024937037960626185, + "learning_rate": 1.4608604905902268e-05, + "loss": 0.0006, + "num_input_tokens_seen": 246437872, + "step": 114090 + }, + { + "epoch": 18.612561174551388, + "grad_norm": 0.26687636971473694, + "learning_rate": 1.4591529518299896e-05, + "loss": 0.0089, + "num_input_tokens_seen": 246448816, + "step": 114095 + }, + { + "epoch": 18.61337683523654, + "grad_norm": 0.003532203147187829, + "learning_rate": 1.4574463968115903e-05, + "loss": 0.0012, + "num_input_tokens_seen": 246459696, + "step": 114100 + }, + { + "epoch": 18.614192495921696, + "grad_norm": 0.0039835479110479355, + "learning_rate": 1.4557408255696181e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246470032, + "step": 114105 + }, + { + "epoch": 18.61500815660685, + "grad_norm": 0.0008681021281518042, + "learning_rate": 1.4540362381386452e-05, + "loss": 0.0711, + "num_input_tokens_seen": 246481776, + "step": 114110 + }, + { + "epoch": 18.615823817292007, + "grad_norm": 0.0026605729945003986, + "learning_rate": 1.4523326345532163e-05, + "loss": 0.002, + "num_input_tokens_seen": 246491504, + "step": 114115 + }, + { + "epoch": 18.616639477977163, + "grad_norm": 0.00021350267343223095, + "learning_rate": 1.450630014847848e-05, + "loss": 0.0047, + "num_input_tokens_seen": 246501968, + "step": 114120 + }, + { + "epoch": 18.617455138662315, + "grad_norm": 0.008653911761939526, + "learning_rate": 1.4489283790570518e-05, + "loss": 0.0005, + "num_input_tokens_seen": 246512720, + "step": 114125 + }, + { + "epoch": 18.61827079934747, + "grad_norm": 0.001182371866889298, + "learning_rate": 1.4472277272153167e-05, + "loss": 0.0051, + "num_input_tokens_seen": 246522512, + "step": 114130 + }, + { + "epoch": 18.619086460032626, + "grad_norm": 0.0021306921262294054, + "learning_rate": 1.445528059357104e-05, + "loss": 0.1048, + "num_input_tokens_seen": 246534064, + "step": 114135 + }, + { + "epoch": 18.619902120717782, + "grad_norm": 0.001645290874876082, + "learning_rate": 1.4438293755168585e-05, + "loss": 0.0061, + "num_input_tokens_seen": 246544848, + "step": 114140 + }, + { + "epoch": 18.620717781402938, + "grad_norm": 0.0037784897722303867, + "learning_rate": 1.4421316757290082e-05, + "loss": 0.0196, + "num_input_tokens_seen": 246556272, + "step": 114145 + }, + { + "epoch": 18.62153344208809, + "grad_norm": 0.004677009768784046, + "learning_rate": 1.4404349600279642e-05, + "loss": 0.0012, + "num_input_tokens_seen": 246566288, + "step": 114150 + }, + { + "epoch": 18.622349102773246, + "grad_norm": 0.0036729995626956224, + "learning_rate": 1.4387392284481049e-05, + "loss": 0.0008, + "num_input_tokens_seen": 246577264, + "step": 114155 + }, + { + "epoch": 18.6231647634584, + "grad_norm": 0.0003251898742746562, + "learning_rate": 1.437044481023797e-05, + "loss": 0.0006, + "num_input_tokens_seen": 246587312, + "step": 114160 + }, + { + "epoch": 18.623980424143557, + "grad_norm": 0.10283267498016357, + "learning_rate": 1.4353507177893964e-05, + "loss": 0.0039, + "num_input_tokens_seen": 246597776, + "step": 114165 + }, + { + "epoch": 18.624796084828713, + "grad_norm": 0.039473868906497955, + "learning_rate": 1.4336579387792148e-05, + "loss": 0.0061, + "num_input_tokens_seen": 246611408, + "step": 114170 + }, + { + "epoch": 18.625611745513865, + "grad_norm": 1.0579966306686401, + "learning_rate": 1.4319661440275689e-05, + "loss": 0.1601, + "num_input_tokens_seen": 246621424, + "step": 114175 + }, + { + "epoch": 18.62642740619902, + "grad_norm": 0.006494295317679644, + "learning_rate": 1.4302753335687423e-05, + "loss": 0.0009, + "num_input_tokens_seen": 246632208, + "step": 114180 + }, + { + "epoch": 18.627243066884176, + "grad_norm": 0.06191675364971161, + "learning_rate": 1.4285855074370025e-05, + "loss": 0.0116, + "num_input_tokens_seen": 246643120, + "step": 114185 + }, + { + "epoch": 18.628058727569332, + "grad_norm": 0.0025623554829508066, + "learning_rate": 1.4268966656665938e-05, + "loss": 0.0007, + "num_input_tokens_seen": 246652336, + "step": 114190 + }, + { + "epoch": 18.628874388254488, + "grad_norm": 0.08002685755491257, + "learning_rate": 1.4252088082917391e-05, + "loss": 0.0024, + "num_input_tokens_seen": 246664176, + "step": 114195 + }, + { + "epoch": 18.62969004893964, + "grad_norm": 0.0005803200765512884, + "learning_rate": 1.4235219353466555e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246674864, + "step": 114200 + }, + { + "epoch": 18.630505709624796, + "grad_norm": 0.0005485960282385349, + "learning_rate": 1.4218360468655212e-05, + "loss": 0.0035, + "num_input_tokens_seen": 246686480, + "step": 114205 + }, + { + "epoch": 18.63132137030995, + "grad_norm": 0.0003624989476520568, + "learning_rate": 1.4201511428824976e-05, + "loss": 0.0683, + "num_input_tokens_seen": 246697648, + "step": 114210 + }, + { + "epoch": 18.632137030995107, + "grad_norm": 0.0008376438054256141, + "learning_rate": 1.4184672234317463e-05, + "loss": 0.0009, + "num_input_tokens_seen": 246707472, + "step": 114215 + }, + { + "epoch": 18.63295269168026, + "grad_norm": 0.005695714149624109, + "learning_rate": 1.4167842885473903e-05, + "loss": 0.0033, + "num_input_tokens_seen": 246717584, + "step": 114220 + }, + { + "epoch": 18.633768352365415, + "grad_norm": 0.013842624612152576, + "learning_rate": 1.4151023382635298e-05, + "loss": 0.001, + "num_input_tokens_seen": 246728272, + "step": 114225 + }, + { + "epoch": 18.63458401305057, + "grad_norm": 0.0036778177600353956, + "learning_rate": 1.4134213726142541e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246738736, + "step": 114230 + }, + { + "epoch": 18.635399673735726, + "grad_norm": 0.0011332188732922077, + "learning_rate": 1.4117413916336307e-05, + "loss": 0.004, + "num_input_tokens_seen": 246749424, + "step": 114235 + }, + { + "epoch": 18.636215334420882, + "grad_norm": 0.001157104386948049, + "learning_rate": 1.4100623953557045e-05, + "loss": 0.023, + "num_input_tokens_seen": 246760208, + "step": 114240 + }, + { + "epoch": 18.637030995106034, + "grad_norm": 0.009338432922959328, + "learning_rate": 1.4083843838145095e-05, + "loss": 0.0025, + "num_input_tokens_seen": 246771408, + "step": 114245 + }, + { + "epoch": 18.63784665579119, + "grad_norm": 0.00642793532460928, + "learning_rate": 1.4067073570440458e-05, + "loss": 0.0007, + "num_input_tokens_seen": 246782736, + "step": 114250 + }, + { + "epoch": 18.638662316476346, + "grad_norm": 0.032264117151498795, + "learning_rate": 1.4050313150782978e-05, + "loss": 0.0029, + "num_input_tokens_seen": 246793968, + "step": 114255 + }, + { + "epoch": 18.6394779771615, + "grad_norm": 0.011337845586240292, + "learning_rate": 1.4033562579512438e-05, + "loss": 0.0032, + "num_input_tokens_seen": 246804144, + "step": 114260 + }, + { + "epoch": 18.640293637846657, + "grad_norm": 0.008878304623067379, + "learning_rate": 1.4016821856968232e-05, + "loss": 0.0011, + "num_input_tokens_seen": 246815280, + "step": 114265 + }, + { + "epoch": 18.64110929853181, + "grad_norm": 0.0022187468130141497, + "learning_rate": 1.4000090983489588e-05, + "loss": 0.0007, + "num_input_tokens_seen": 246826064, + "step": 114270 + }, + { + "epoch": 18.641924959216965, + "grad_norm": 0.0002171692467527464, + "learning_rate": 1.3983369959415682e-05, + "loss": 0.0035, + "num_input_tokens_seen": 246837328, + "step": 114275 + }, + { + "epoch": 18.64274061990212, + "grad_norm": 0.00032952241599559784, + "learning_rate": 1.3966658785085352e-05, + "loss": 0.0007, + "num_input_tokens_seen": 246847568, + "step": 114280 + }, + { + "epoch": 18.643556280587276, + "grad_norm": 0.00012888593482784927, + "learning_rate": 1.394995746083727e-05, + "loss": 0.0007, + "num_input_tokens_seen": 246858704, + "step": 114285 + }, + { + "epoch": 18.644371941272432, + "grad_norm": 0.019832175225019455, + "learning_rate": 1.3933265987009836e-05, + "loss": 0.0022, + "num_input_tokens_seen": 246868688, + "step": 114290 + }, + { + "epoch": 18.645187601957584, + "grad_norm": 0.003522902261465788, + "learning_rate": 1.3916584363941442e-05, + "loss": 0.0013, + "num_input_tokens_seen": 246879728, + "step": 114295 + }, + { + "epoch": 18.64600326264274, + "grad_norm": 0.000486519857076928, + "learning_rate": 1.3899912591970099e-05, + "loss": 0.0032, + "num_input_tokens_seen": 246890096, + "step": 114300 + }, + { + "epoch": 18.646818923327896, + "grad_norm": 0.018939530476927757, + "learning_rate": 1.3883250671433645e-05, + "loss": 0.0102, + "num_input_tokens_seen": 246900016, + "step": 114305 + }, + { + "epoch": 18.64763458401305, + "grad_norm": 0.0028730384074151516, + "learning_rate": 1.3866598602669866e-05, + "loss": 0.0305, + "num_input_tokens_seen": 246910896, + "step": 114310 + }, + { + "epoch": 18.648450244698207, + "grad_norm": 0.00035425060195848346, + "learning_rate": 1.3849956386016049e-05, + "loss": 0.0041, + "num_input_tokens_seen": 246922256, + "step": 114315 + }, + { + "epoch": 18.64926590538336, + "grad_norm": 0.0004246874595992267, + "learning_rate": 1.3833324021809756e-05, + "loss": 0.0023, + "num_input_tokens_seen": 246933200, + "step": 114320 + }, + { + "epoch": 18.650081566068515, + "grad_norm": 0.0004184528661426157, + "learning_rate": 1.3816701510387775e-05, + "loss": 0.0007, + "num_input_tokens_seen": 246944368, + "step": 114325 + }, + { + "epoch": 18.65089722675367, + "grad_norm": 0.002552991034463048, + "learning_rate": 1.3800088852087166e-05, + "loss": 0.0013, + "num_input_tokens_seen": 246955280, + "step": 114330 + }, + { + "epoch": 18.651712887438826, + "grad_norm": 0.002675180323421955, + "learning_rate": 1.3783486047244497e-05, + "loss": 0.0245, + "num_input_tokens_seen": 246967920, + "step": 114335 + }, + { + "epoch": 18.652528548123982, + "grad_norm": 0.011710644699633121, + "learning_rate": 1.3766893096196386e-05, + "loss": 0.0034, + "num_input_tokens_seen": 246978576, + "step": 114340 + }, + { + "epoch": 18.653344208809134, + "grad_norm": 0.0022386705968528986, + "learning_rate": 1.3750309999278899e-05, + "loss": 0.0019, + "num_input_tokens_seen": 246989968, + "step": 114345 + }, + { + "epoch": 18.65415986949429, + "grad_norm": 0.017226964235305786, + "learning_rate": 1.373373675682832e-05, + "loss": 0.0046, + "num_input_tokens_seen": 247001872, + "step": 114350 + }, + { + "epoch": 18.654975530179446, + "grad_norm": 0.00021573618869297206, + "learning_rate": 1.371717336918038e-05, + "loss": 0.0025, + "num_input_tokens_seen": 247012400, + "step": 114355 + }, + { + "epoch": 18.6557911908646, + "grad_norm": 0.0006454604445025325, + "learning_rate": 1.3700619836670813e-05, + "loss": 0.0068, + "num_input_tokens_seen": 247023696, + "step": 114360 + }, + { + "epoch": 18.656606851549757, + "grad_norm": 0.00180201162584126, + "learning_rate": 1.3684076159635129e-05, + "loss": 0.001, + "num_input_tokens_seen": 247034128, + "step": 114365 + }, + { + "epoch": 18.65742251223491, + "grad_norm": 0.00018552408437244594, + "learning_rate": 1.3667542338408611e-05, + "loss": 0.0015, + "num_input_tokens_seen": 247044112, + "step": 114370 + }, + { + "epoch": 18.658238172920065, + "grad_norm": 0.0041071511805057526, + "learning_rate": 1.3651018373326219e-05, + "loss": 0.0005, + "num_input_tokens_seen": 247055696, + "step": 114375 + }, + { + "epoch": 18.65905383360522, + "grad_norm": 0.2897575795650482, + "learning_rate": 1.3634504264723013e-05, + "loss": 0.0035, + "num_input_tokens_seen": 247067280, + "step": 114380 + }, + { + "epoch": 18.659869494290376, + "grad_norm": 0.00036819299566559494, + "learning_rate": 1.3618000012933506e-05, + "loss": 0.0049, + "num_input_tokens_seen": 247078096, + "step": 114385 + }, + { + "epoch": 18.660685154975532, + "grad_norm": 0.002458331175148487, + "learning_rate": 1.3601505618292264e-05, + "loss": 0.0012, + "num_input_tokens_seen": 247088048, + "step": 114390 + }, + { + "epoch": 18.661500815660684, + "grad_norm": 0.0003740904794540256, + "learning_rate": 1.3585021081133575e-05, + "loss": 0.0009, + "num_input_tokens_seen": 247099184, + "step": 114395 + }, + { + "epoch": 18.66231647634584, + "grad_norm": 0.008161837235093117, + "learning_rate": 1.3568546401791449e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247109328, + "step": 114400 + }, + { + "epoch": 18.663132137030995, + "grad_norm": 0.008084426634013653, + "learning_rate": 1.355208158059984e-05, + "loss": 0.0095, + "num_input_tokens_seen": 247118064, + "step": 114405 + }, + { + "epoch": 18.66394779771615, + "grad_norm": 0.00394302187487483, + "learning_rate": 1.3535626617892426e-05, + "loss": 0.0037, + "num_input_tokens_seen": 247130224, + "step": 114410 + }, + { + "epoch": 18.664763458401303, + "grad_norm": 0.0034730606712400913, + "learning_rate": 1.3519181514002665e-05, + "loss": 0.0495, + "num_input_tokens_seen": 247140848, + "step": 114415 + }, + { + "epoch": 18.66557911908646, + "grad_norm": 0.001881223637610674, + "learning_rate": 1.3502746269263788e-05, + "loss": 0.0034, + "num_input_tokens_seen": 247151952, + "step": 114420 + }, + { + "epoch": 18.666394779771615, + "grad_norm": 0.00045266575762070715, + "learning_rate": 1.3486320884008918e-05, + "loss": 0.0103, + "num_input_tokens_seen": 247162352, + "step": 114425 + }, + { + "epoch": 18.66721044045677, + "grad_norm": 0.015825331211090088, + "learning_rate": 1.3469905358570956e-05, + "loss": 0.0006, + "num_input_tokens_seen": 247173904, + "step": 114430 + }, + { + "epoch": 18.668026101141926, + "grad_norm": 0.003665680531412363, + "learning_rate": 1.3453499693282633e-05, + "loss": 0.0008, + "num_input_tokens_seen": 247184976, + "step": 114435 + }, + { + "epoch": 18.66884176182708, + "grad_norm": 0.0014401959488168359, + "learning_rate": 1.3437103888476244e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247195824, + "step": 114440 + }, + { + "epoch": 18.669657422512234, + "grad_norm": 0.0011207032948732376, + "learning_rate": 1.342071794448435e-05, + "loss": 0.0005, + "num_input_tokens_seen": 247206064, + "step": 114445 + }, + { + "epoch": 18.67047308319739, + "grad_norm": 0.00031443015905097127, + "learning_rate": 1.340434186163869e-05, + "loss": 0.0008, + "num_input_tokens_seen": 247217360, + "step": 114450 + }, + { + "epoch": 18.671288743882545, + "grad_norm": 0.026929231360554695, + "learning_rate": 1.33879756402715e-05, + "loss": 0.0033, + "num_input_tokens_seen": 247228944, + "step": 114455 + }, + { + "epoch": 18.6721044045677, + "grad_norm": 0.00107129430398345, + "learning_rate": 1.3371619280714175e-05, + "loss": 0.0021, + "num_input_tokens_seen": 247238192, + "step": 114460 + }, + { + "epoch": 18.672920065252853, + "grad_norm": 0.001654400723055005, + "learning_rate": 1.3355272783298455e-05, + "loss": 0.0025, + "num_input_tokens_seen": 247250192, + "step": 114465 + }, + { + "epoch": 18.67373572593801, + "grad_norm": 0.00030322172096930444, + "learning_rate": 1.3338936148355351e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247261168, + "step": 114470 + }, + { + "epoch": 18.674551386623165, + "grad_norm": 0.002237173030152917, + "learning_rate": 1.3322609376216155e-05, + "loss": 0.0014, + "num_input_tokens_seen": 247273104, + "step": 114475 + }, + { + "epoch": 18.67536704730832, + "grad_norm": 0.004785965196788311, + "learning_rate": 1.33062924672116e-05, + "loss": 0.0011, + "num_input_tokens_seen": 247284336, + "step": 114480 + }, + { + "epoch": 18.676182707993476, + "grad_norm": 1.1634474992752075, + "learning_rate": 1.3289985421672534e-05, + "loss": 0.0547, + "num_input_tokens_seen": 247295312, + "step": 114485 + }, + { + "epoch": 18.67699836867863, + "grad_norm": 0.0024988525547087193, + "learning_rate": 1.3273688239929248e-05, + "loss": 0.0005, + "num_input_tokens_seen": 247306864, + "step": 114490 + }, + { + "epoch": 18.677814029363784, + "grad_norm": 0.0052915457636117935, + "learning_rate": 1.3257400922312258e-05, + "loss": 0.0056, + "num_input_tokens_seen": 247317520, + "step": 114495 + }, + { + "epoch": 18.67862969004894, + "grad_norm": 0.0011443269904702902, + "learning_rate": 1.3241123469151406e-05, + "loss": 0.0009, + "num_input_tokens_seen": 247328240, + "step": 114500 + }, + { + "epoch": 18.679445350734095, + "grad_norm": 0.0029852113220840693, + "learning_rate": 1.322485588077671e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247339952, + "step": 114505 + }, + { + "epoch": 18.68026101141925, + "grad_norm": 0.0009250577422790229, + "learning_rate": 1.3208598157517849e-05, + "loss": 0.0013, + "num_input_tokens_seen": 247350896, + "step": 114510 + }, + { + "epoch": 18.681076672104403, + "grad_norm": 0.0006689508445560932, + "learning_rate": 1.3192350299704225e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247361168, + "step": 114515 + }, + { + "epoch": 18.68189233278956, + "grad_norm": 0.00025370350340381265, + "learning_rate": 1.3176112307665245e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247373232, + "step": 114520 + }, + { + "epoch": 18.682707993474715, + "grad_norm": 0.00041831081034615636, + "learning_rate": 1.315988418172992e-05, + "loss": 0.0073, + "num_input_tokens_seen": 247384624, + "step": 114525 + }, + { + "epoch": 18.68352365415987, + "grad_norm": 0.0011938015231862664, + "learning_rate": 1.3143665922227155e-05, + "loss": 0.0049, + "num_input_tokens_seen": 247395248, + "step": 114530 + }, + { + "epoch": 18.684339314845026, + "grad_norm": 0.25502070784568787, + "learning_rate": 1.3127457529485576e-05, + "loss": 0.0059, + "num_input_tokens_seen": 247406096, + "step": 114535 + }, + { + "epoch": 18.68515497553018, + "grad_norm": 0.06968465447425842, + "learning_rate": 1.3111259003833753e-05, + "loss": 0.0026, + "num_input_tokens_seen": 247415568, + "step": 114540 + }, + { + "epoch": 18.685970636215334, + "grad_norm": 0.0003617781330831349, + "learning_rate": 1.3095070345599924e-05, + "loss": 0.0016, + "num_input_tokens_seen": 247426160, + "step": 114545 + }, + { + "epoch": 18.68678629690049, + "grad_norm": 0.0005903160781599581, + "learning_rate": 1.3078891555112161e-05, + "loss": 0.0014, + "num_input_tokens_seen": 247437712, + "step": 114550 + }, + { + "epoch": 18.687601957585645, + "grad_norm": 0.0013629597378894687, + "learning_rate": 1.306272263269831e-05, + "loss": 0.0052, + "num_input_tokens_seen": 247447536, + "step": 114555 + }, + { + "epoch": 18.6884176182708, + "grad_norm": 0.07200295478105545, + "learning_rate": 1.3046563578686222e-05, + "loss": 0.0036, + "num_input_tokens_seen": 247458768, + "step": 114560 + }, + { + "epoch": 18.689233278955953, + "grad_norm": 0.05303337797522545, + "learning_rate": 1.303041439340319e-05, + "loss": 0.004, + "num_input_tokens_seen": 247470032, + "step": 114565 + }, + { + "epoch": 18.69004893964111, + "grad_norm": 0.0004202370473649353, + "learning_rate": 1.3014275077176618e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247481200, + "step": 114570 + }, + { + "epoch": 18.690864600326265, + "grad_norm": 0.018010199069976807, + "learning_rate": 1.2998145630333469e-05, + "loss": 0.0019, + "num_input_tokens_seen": 247491824, + "step": 114575 + }, + { + "epoch": 18.69168026101142, + "grad_norm": 0.002485614735633135, + "learning_rate": 1.2982026053200813e-05, + "loss": 0.0024, + "num_input_tokens_seen": 247501904, + "step": 114580 + }, + { + "epoch": 18.692495921696576, + "grad_norm": 0.002339205238968134, + "learning_rate": 1.2965916346105166e-05, + "loss": 0.0043, + "num_input_tokens_seen": 247511600, + "step": 114585 + }, + { + "epoch": 18.693311582381728, + "grad_norm": 0.02498902939260006, + "learning_rate": 1.2949816509373102e-05, + "loss": 0.0008, + "num_input_tokens_seen": 247522832, + "step": 114590 + }, + { + "epoch": 18.694127243066884, + "grad_norm": 0.004603876266628504, + "learning_rate": 1.2933726543330804e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247533328, + "step": 114595 + }, + { + "epoch": 18.69494290375204, + "grad_norm": 0.041026998311281204, + "learning_rate": 1.2917646448304509e-05, + "loss": 0.0009, + "num_input_tokens_seen": 247544272, + "step": 114600 + }, + { + "epoch": 18.695758564437195, + "grad_norm": 0.04702477157115936, + "learning_rate": 1.2901576224619959e-05, + "loss": 0.0041, + "num_input_tokens_seen": 247554064, + "step": 114605 + }, + { + "epoch": 18.696574225122347, + "grad_norm": 0.02038668841123581, + "learning_rate": 1.2885515872602949e-05, + "loss": 0.0025, + "num_input_tokens_seen": 247565328, + "step": 114610 + }, + { + "epoch": 18.697389885807503, + "grad_norm": 0.0002236200380139053, + "learning_rate": 1.2869465392578828e-05, + "loss": 0.0012, + "num_input_tokens_seen": 247577136, + "step": 114615 + }, + { + "epoch": 18.69820554649266, + "grad_norm": 0.004946097731590271, + "learning_rate": 1.2853424784873059e-05, + "loss": 0.0033, + "num_input_tokens_seen": 247587184, + "step": 114620 + }, + { + "epoch": 18.699021207177815, + "grad_norm": 0.010659070685505867, + "learning_rate": 1.2837394049810547e-05, + "loss": 0.0009, + "num_input_tokens_seen": 247597040, + "step": 114625 + }, + { + "epoch": 18.69983686786297, + "grad_norm": 0.0010920735076069832, + "learning_rate": 1.2821373187716311e-05, + "loss": 0.001, + "num_input_tokens_seen": 247608944, + "step": 114630 + }, + { + "epoch": 18.700652528548122, + "grad_norm": 0.001096490304917097, + "learning_rate": 1.2805362198914872e-05, + "loss": 0.0015, + "num_input_tokens_seen": 247620528, + "step": 114635 + }, + { + "epoch": 18.701468189233278, + "grad_norm": 0.0010115448385477066, + "learning_rate": 1.2789361083730911e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247630800, + "step": 114640 + }, + { + "epoch": 18.702283849918434, + "grad_norm": 0.03550710901618004, + "learning_rate": 1.2773369842488614e-05, + "loss": 0.0024, + "num_input_tokens_seen": 247642256, + "step": 114645 + }, + { + "epoch": 18.70309951060359, + "grad_norm": 0.0003345920122228563, + "learning_rate": 1.2757388475512055e-05, + "loss": 0.0032, + "num_input_tokens_seen": 247653808, + "step": 114650 + }, + { + "epoch": 18.703915171288745, + "grad_norm": 0.014875334687530994, + "learning_rate": 1.2741416983125143e-05, + "loss": 0.0013, + "num_input_tokens_seen": 247664720, + "step": 114655 + }, + { + "epoch": 18.704730831973897, + "grad_norm": 0.0014985312009230256, + "learning_rate": 1.2725455365651507e-05, + "loss": 0.0008, + "num_input_tokens_seen": 247676112, + "step": 114660 + }, + { + "epoch": 18.705546492659053, + "grad_norm": 0.006818423047661781, + "learning_rate": 1.270950362341472e-05, + "loss": 0.0489, + "num_input_tokens_seen": 247687248, + "step": 114665 + }, + { + "epoch": 18.70636215334421, + "grad_norm": 0.0007490873686037958, + "learning_rate": 1.269356175673797e-05, + "loss": 0.0068, + "num_input_tokens_seen": 247697968, + "step": 114670 + }, + { + "epoch": 18.707177814029365, + "grad_norm": 0.0012531366664916277, + "learning_rate": 1.2677629765944387e-05, + "loss": 0.0026, + "num_input_tokens_seen": 247707920, + "step": 114675 + }, + { + "epoch": 18.70799347471452, + "grad_norm": 0.001863017096184194, + "learning_rate": 1.266170765135688e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247718992, + "step": 114680 + }, + { + "epoch": 18.708809135399672, + "grad_norm": 0.04067402333021164, + "learning_rate": 1.2645795413298078e-05, + "loss": 0.0209, + "num_input_tokens_seen": 247730800, + "step": 114685 + }, + { + "epoch": 18.709624796084828, + "grad_norm": 0.0016221472760662436, + "learning_rate": 1.2629893052090502e-05, + "loss": 0.0048, + "num_input_tokens_seen": 247740272, + "step": 114690 + }, + { + "epoch": 18.710440456769984, + "grad_norm": 0.07899585366249084, + "learning_rate": 1.2614000568056395e-05, + "loss": 0.003, + "num_input_tokens_seen": 247751504, + "step": 114695 + }, + { + "epoch": 18.71125611745514, + "grad_norm": 0.000546223483979702, + "learning_rate": 1.259811796151783e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247762480, + "step": 114700 + }, + { + "epoch": 18.712071778140295, + "grad_norm": 0.02150532603263855, + "learning_rate": 1.258224523279683e-05, + "loss": 0.0019, + "num_input_tokens_seen": 247773552, + "step": 114705 + }, + { + "epoch": 18.712887438825447, + "grad_norm": 0.008169742301106453, + "learning_rate": 1.2566382382214859e-05, + "loss": 0.0017, + "num_input_tokens_seen": 247783216, + "step": 114710 + }, + { + "epoch": 18.713703099510603, + "grad_norm": 0.09331442415714264, + "learning_rate": 1.2550529410093548e-05, + "loss": 0.0086, + "num_input_tokens_seen": 247793936, + "step": 114715 + }, + { + "epoch": 18.71451876019576, + "grad_norm": 0.04075554758310318, + "learning_rate": 1.2534686316754085e-05, + "loss": 0.0014, + "num_input_tokens_seen": 247805392, + "step": 114720 + }, + { + "epoch": 18.715334420880914, + "grad_norm": 0.0007709045894443989, + "learning_rate": 1.2518853102517657e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247817072, + "step": 114725 + }, + { + "epoch": 18.71615008156607, + "grad_norm": 0.050977423787117004, + "learning_rate": 1.250302976770501e-05, + "loss": 0.0013, + "num_input_tokens_seen": 247827888, + "step": 114730 + }, + { + "epoch": 18.716965742251222, + "grad_norm": 0.038305021822452545, + "learning_rate": 1.248721631263705e-05, + "loss": 0.0019, + "num_input_tokens_seen": 247838800, + "step": 114735 + }, + { + "epoch": 18.717781402936378, + "grad_norm": 0.008927847258746624, + "learning_rate": 1.2471412737633914e-05, + "loss": 0.0019, + "num_input_tokens_seen": 247849872, + "step": 114740 + }, + { + "epoch": 18.718597063621534, + "grad_norm": 0.02261229418218136, + "learning_rate": 1.2455619043016175e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247862064, + "step": 114745 + }, + { + "epoch": 18.71941272430669, + "grad_norm": 0.00020072948245797306, + "learning_rate": 1.2439835229103803e-05, + "loss": 0.0005, + "num_input_tokens_seen": 247871920, + "step": 114750 + }, + { + "epoch": 18.72022838499184, + "grad_norm": 0.001048129634000361, + "learning_rate": 1.242406129621665e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247882800, + "step": 114755 + }, + { + "epoch": 18.721044045676997, + "grad_norm": 0.003789189737290144, + "learning_rate": 1.240829724467446e-05, + "loss": 0.0014, + "num_input_tokens_seen": 247894288, + "step": 114760 + }, + { + "epoch": 18.721859706362153, + "grad_norm": 0.2328716516494751, + "learning_rate": 1.2392543074796702e-05, + "loss": 0.0041, + "num_input_tokens_seen": 247906672, + "step": 114765 + }, + { + "epoch": 18.72267536704731, + "grad_norm": 0.0024382395204156637, + "learning_rate": 1.2376798786902621e-05, + "loss": 0.001, + "num_input_tokens_seen": 247917936, + "step": 114770 + }, + { + "epoch": 18.723491027732464, + "grad_norm": 0.002633201191201806, + "learning_rate": 1.2361064381311293e-05, + "loss": 0.0011, + "num_input_tokens_seen": 247929680, + "step": 114775 + }, + { + "epoch": 18.724306688417617, + "grad_norm": 0.0014084518188610673, + "learning_rate": 1.2345339858341576e-05, + "loss": 0.0008, + "num_input_tokens_seen": 247940368, + "step": 114780 + }, + { + "epoch": 18.725122349102772, + "grad_norm": 0.00445615453645587, + "learning_rate": 1.2329625218312213e-05, + "loss": 0.0005, + "num_input_tokens_seen": 247951984, + "step": 114785 + }, + { + "epoch": 18.725938009787928, + "grad_norm": 0.0038615395314991474, + "learning_rate": 1.2313920461541672e-05, + "loss": 0.0016, + "num_input_tokens_seen": 247961456, + "step": 114790 + }, + { + "epoch": 18.726753670473084, + "grad_norm": 0.00038773167761974037, + "learning_rate": 1.22982255883482e-05, + "loss": 0.0015, + "num_input_tokens_seen": 247971440, + "step": 114795 + }, + { + "epoch": 18.72756933115824, + "grad_norm": 0.00029015023028478026, + "learning_rate": 1.2282540599049873e-05, + "loss": 0.0009, + "num_input_tokens_seen": 247982192, + "step": 114800 + }, + { + "epoch": 18.72838499184339, + "grad_norm": 0.004771945532411337, + "learning_rate": 1.2266865493964551e-05, + "loss": 0.0015, + "num_input_tokens_seen": 247993008, + "step": 114805 + }, + { + "epoch": 18.729200652528547, + "grad_norm": 0.003790721297264099, + "learning_rate": 1.2251200273409923e-05, + "loss": 0.0197, + "num_input_tokens_seen": 248004720, + "step": 114810 + }, + { + "epoch": 18.730016313213703, + "grad_norm": 0.0013847892405465245, + "learning_rate": 1.2235544937703513e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248015024, + "step": 114815 + }, + { + "epoch": 18.73083197389886, + "grad_norm": 0.004622996784746647, + "learning_rate": 1.2219899487162567e-05, + "loss": 0.0014, + "num_input_tokens_seen": 248025232, + "step": 114820 + }, + { + "epoch": 18.731647634584014, + "grad_norm": 0.031056227162480354, + "learning_rate": 1.2204263922104108e-05, + "loss": 0.0017, + "num_input_tokens_seen": 248036016, + "step": 114825 + }, + { + "epoch": 18.732463295269167, + "grad_norm": 0.0014217033749446273, + "learning_rate": 1.2188638242845108e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248046160, + "step": 114830 + }, + { + "epoch": 18.733278955954322, + "grad_norm": 0.00779396528378129, + "learning_rate": 1.2173022449702142e-05, + "loss": 0.0009, + "num_input_tokens_seen": 248055984, + "step": 114835 + }, + { + "epoch": 18.734094616639478, + "grad_norm": 0.1975318193435669, + "learning_rate": 1.215741654299174e-05, + "loss": 0.0079, + "num_input_tokens_seen": 248066512, + "step": 114840 + }, + { + "epoch": 18.734910277324634, + "grad_norm": 0.017688684165477753, + "learning_rate": 1.214182052303009e-05, + "loss": 0.0032, + "num_input_tokens_seen": 248075344, + "step": 114845 + }, + { + "epoch": 18.73572593800979, + "grad_norm": 0.003837002906948328, + "learning_rate": 1.2126234390133439e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248086832, + "step": 114850 + }, + { + "epoch": 18.73654159869494, + "grad_norm": 0.0023042631801217794, + "learning_rate": 1.2110658144617538e-05, + "loss": 0.0062, + "num_input_tokens_seen": 248097328, + "step": 114855 + }, + { + "epoch": 18.737357259380097, + "grad_norm": 0.0019547000993043184, + "learning_rate": 1.2095091786798074e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248107792, + "step": 114860 + }, + { + "epoch": 18.738172920065253, + "grad_norm": 0.7636070251464844, + "learning_rate": 1.207953531699052e-05, + "loss": 0.1813, + "num_input_tokens_seen": 248118608, + "step": 114865 + }, + { + "epoch": 18.73898858075041, + "grad_norm": 0.0049219997599720955, + "learning_rate": 1.206398873551018e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248130192, + "step": 114870 + }, + { + "epoch": 18.739804241435564, + "grad_norm": 0.020860247313976288, + "learning_rate": 1.2048452042672075e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248141232, + "step": 114875 + }, + { + "epoch": 18.740619902120716, + "grad_norm": 0.0031487110536545515, + "learning_rate": 1.2032925238791071e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248151856, + "step": 114880 + }, + { + "epoch": 18.741435562805872, + "grad_norm": 0.06586775928735733, + "learning_rate": 1.2017408324181911e-05, + "loss": 0.002, + "num_input_tokens_seen": 248162416, + "step": 114885 + }, + { + "epoch": 18.742251223491028, + "grad_norm": 0.0008164440514519811, + "learning_rate": 1.2001901299159013e-05, + "loss": 0.001, + "num_input_tokens_seen": 248174064, + "step": 114890 + }, + { + "epoch": 18.743066884176184, + "grad_norm": 0.0023708927910774946, + "learning_rate": 1.1986404164036679e-05, + "loss": 0.1441, + "num_input_tokens_seen": 248184432, + "step": 114895 + }, + { + "epoch": 18.74388254486134, + "grad_norm": 0.0004083328531123698, + "learning_rate": 1.1970916919128937e-05, + "loss": 0.001, + "num_input_tokens_seen": 248195792, + "step": 114900 + }, + { + "epoch": 18.74469820554649, + "grad_norm": 0.002401529112830758, + "learning_rate": 1.1955439564749649e-05, + "loss": 0.0026, + "num_input_tokens_seen": 248206768, + "step": 114905 + }, + { + "epoch": 18.745513866231647, + "grad_norm": 0.0043855938129127026, + "learning_rate": 1.1939972101212503e-05, + "loss": 0.001, + "num_input_tokens_seen": 248218672, + "step": 114910 + }, + { + "epoch": 18.746329526916803, + "grad_norm": 0.022415775805711746, + "learning_rate": 1.1924514528831032e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248229488, + "step": 114915 + }, + { + "epoch": 18.74714518760196, + "grad_norm": 0.007502218242734671, + "learning_rate": 1.190906684791837e-05, + "loss": 0.0013, + "num_input_tokens_seen": 248240176, + "step": 114920 + }, + { + "epoch": 18.747960848287114, + "grad_norm": 0.0006908404175192118, + "learning_rate": 1.1893629058787714e-05, + "loss": 0.0012, + "num_input_tokens_seen": 248249296, + "step": 114925 + }, + { + "epoch": 18.748776508972266, + "grad_norm": 0.0905354842543602, + "learning_rate": 1.187820116175181e-05, + "loss": 0.0041, + "num_input_tokens_seen": 248259696, + "step": 114930 + }, + { + "epoch": 18.749592169657422, + "grad_norm": 0.001892009051516652, + "learning_rate": 1.1862783157123413e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248270416, + "step": 114935 + }, + { + "epoch": 18.750407830342578, + "grad_norm": 0.010540666058659554, + "learning_rate": 1.1847375045214992e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248282000, + "step": 114940 + }, + { + "epoch": 18.751223491027734, + "grad_norm": 0.0007760238368064165, + "learning_rate": 1.1831976826338742e-05, + "loss": 0.0028, + "num_input_tokens_seen": 248294096, + "step": 114945 + }, + { + "epoch": 18.752039151712886, + "grad_norm": 0.004532194696366787, + "learning_rate": 1.1816588500806802e-05, + "loss": 0.0014, + "num_input_tokens_seen": 248305136, + "step": 114950 + }, + { + "epoch": 18.75285481239804, + "grad_norm": 0.0071411640383303165, + "learning_rate": 1.1801210068930923e-05, + "loss": 0.0023, + "num_input_tokens_seen": 248315632, + "step": 114955 + }, + { + "epoch": 18.753670473083197, + "grad_norm": 0.011389593593776226, + "learning_rate": 1.1785841531022968e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248326576, + "step": 114960 + }, + { + "epoch": 18.754486133768353, + "grad_norm": 0.05140427127480507, + "learning_rate": 1.177048288739413e-05, + "loss": 0.0049, + "num_input_tokens_seen": 248337488, + "step": 114965 + }, + { + "epoch": 18.75530179445351, + "grad_norm": 0.00020385747484397143, + "learning_rate": 1.1755134138355995e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248347728, + "step": 114970 + }, + { + "epoch": 18.75611745513866, + "grad_norm": 0.010403123684227467, + "learning_rate": 1.1739795284219256e-05, + "loss": 0.0012, + "num_input_tokens_seen": 248358448, + "step": 114975 + }, + { + "epoch": 18.756933115823816, + "grad_norm": 0.024493994191288948, + "learning_rate": 1.172446632529517e-05, + "loss": 0.0011, + "num_input_tokens_seen": 248369712, + "step": 114980 + }, + { + "epoch": 18.757748776508972, + "grad_norm": 0.0017098193056881428, + "learning_rate": 1.1709147261894037e-05, + "loss": 0.0024, + "num_input_tokens_seen": 248381232, + "step": 114985 + }, + { + "epoch": 18.758564437194128, + "grad_norm": 0.004953702911734581, + "learning_rate": 1.1693838094326502e-05, + "loss": 0.0035, + "num_input_tokens_seen": 248391536, + "step": 114990 + }, + { + "epoch": 18.759380097879284, + "grad_norm": 0.0007283523445948958, + "learning_rate": 1.1678538822902817e-05, + "loss": 0.0006, + "num_input_tokens_seen": 248402640, + "step": 114995 + }, + { + "epoch": 18.760195758564436, + "grad_norm": 0.00265447492711246, + "learning_rate": 1.1663249447933067e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248413040, + "step": 115000 + }, + { + "epoch": 18.76101141924959, + "grad_norm": 0.0027859921101480722, + "learning_rate": 1.1647969969727e-05, + "loss": 0.002, + "num_input_tokens_seen": 248425008, + "step": 115005 + }, + { + "epoch": 18.761827079934747, + "grad_norm": 0.0012829096522182226, + "learning_rate": 1.1632700388594375e-05, + "loss": 0.0017, + "num_input_tokens_seen": 248436368, + "step": 115010 + }, + { + "epoch": 18.762642740619903, + "grad_norm": 0.0012848442420363426, + "learning_rate": 1.1617440704844661e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248447920, + "step": 115015 + }, + { + "epoch": 18.76345840130506, + "grad_norm": 0.009974795393645763, + "learning_rate": 1.1602190918787004e-05, + "loss": 0.0023, + "num_input_tokens_seen": 248458768, + "step": 115020 + }, + { + "epoch": 18.76427406199021, + "grad_norm": 0.3886551856994629, + "learning_rate": 1.1586951030730542e-05, + "loss": 0.0298, + "num_input_tokens_seen": 248470384, + "step": 115025 + }, + { + "epoch": 18.765089722675366, + "grad_norm": 0.0006359002436511219, + "learning_rate": 1.1571721040984084e-05, + "loss": 0.0011, + "num_input_tokens_seen": 248481840, + "step": 115030 + }, + { + "epoch": 18.765905383360522, + "grad_norm": 0.0023894875776022673, + "learning_rate": 1.1556500949856386e-05, + "loss": 0.003, + "num_input_tokens_seen": 248492240, + "step": 115035 + }, + { + "epoch": 18.766721044045678, + "grad_norm": 0.028124431148171425, + "learning_rate": 1.1541290757655754e-05, + "loss": 0.0018, + "num_input_tokens_seen": 248503472, + "step": 115040 + }, + { + "epoch": 18.767536704730833, + "grad_norm": 0.00027735813637264073, + "learning_rate": 1.1526090464690553e-05, + "loss": 0.0035, + "num_input_tokens_seen": 248514288, + "step": 115045 + }, + { + "epoch": 18.768352365415986, + "grad_norm": 0.0014982965076342225, + "learning_rate": 1.1510900071268815e-05, + "loss": 0.001, + "num_input_tokens_seen": 248525456, + "step": 115050 + }, + { + "epoch": 18.76916802610114, + "grad_norm": 0.0015739202499389648, + "learning_rate": 1.149571957769835e-05, + "loss": 0.0006, + "num_input_tokens_seen": 248536976, + "step": 115055 + }, + { + "epoch": 18.769983686786297, + "grad_norm": 0.057564280927181244, + "learning_rate": 1.1480548984286853e-05, + "loss": 0.0028, + "num_input_tokens_seen": 248545904, + "step": 115060 + }, + { + "epoch": 18.770799347471453, + "grad_norm": 0.0035541686229407787, + "learning_rate": 1.1465388291341804e-05, + "loss": 0.0013, + "num_input_tokens_seen": 248556368, + "step": 115065 + }, + { + "epoch": 18.77161500815661, + "grad_norm": 0.0001766427740221843, + "learning_rate": 1.145023749917029e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248566640, + "step": 115070 + }, + { + "epoch": 18.77243066884176, + "grad_norm": 0.019232071936130524, + "learning_rate": 1.143509660807962e-05, + "loss": 0.0029, + "num_input_tokens_seen": 248579440, + "step": 115075 + }, + { + "epoch": 18.773246329526916, + "grad_norm": 0.0011684228666126728, + "learning_rate": 1.1419965618376383e-05, + "loss": 0.0013, + "num_input_tokens_seen": 248589008, + "step": 115080 + }, + { + "epoch": 18.774061990212072, + "grad_norm": 0.0053740390576422215, + "learning_rate": 1.1404844530367498e-05, + "loss": 0.0039, + "num_input_tokens_seen": 248598640, + "step": 115085 + }, + { + "epoch": 18.774877650897228, + "grad_norm": 0.0055008502677083015, + "learning_rate": 1.138973334435911e-05, + "loss": 0.0016, + "num_input_tokens_seen": 248610064, + "step": 115090 + }, + { + "epoch": 18.775693311582383, + "grad_norm": 0.0013196800136938691, + "learning_rate": 1.1374632060657753e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248620848, + "step": 115095 + }, + { + "epoch": 18.776508972267536, + "grad_norm": 0.0019128243438899517, + "learning_rate": 1.1359540679569236e-05, + "loss": 0.001, + "num_input_tokens_seen": 248631792, + "step": 115100 + }, + { + "epoch": 18.77732463295269, + "grad_norm": 0.0034028757363557816, + "learning_rate": 1.1344459201399592e-05, + "loss": 0.0016, + "num_input_tokens_seen": 248641264, + "step": 115105 + }, + { + "epoch": 18.778140293637847, + "grad_norm": 0.009112970903515816, + "learning_rate": 1.1329387626454358e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248652432, + "step": 115110 + }, + { + "epoch": 18.778955954323003, + "grad_norm": 0.003698774380609393, + "learning_rate": 1.1314325955039007e-05, + "loss": 0.0041, + "num_input_tokens_seen": 248663056, + "step": 115115 + }, + { + "epoch": 18.77977161500816, + "grad_norm": 0.002369646681472659, + "learning_rate": 1.1299274187458741e-05, + "loss": 0.0069, + "num_input_tokens_seen": 248673936, + "step": 115120 + }, + { + "epoch": 18.78058727569331, + "grad_norm": 0.0011856432538479567, + "learning_rate": 1.1284232324018761e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248683600, + "step": 115125 + }, + { + "epoch": 18.781402936378466, + "grad_norm": 0.0010567542631179094, + "learning_rate": 1.1269200365023657e-05, + "loss": 0.0011, + "num_input_tokens_seen": 248694256, + "step": 115130 + }, + { + "epoch": 18.782218597063622, + "grad_norm": 0.4526723623275757, + "learning_rate": 1.125417831077824e-05, + "loss": 0.021, + "num_input_tokens_seen": 248704560, + "step": 115135 + }, + { + "epoch": 18.783034257748778, + "grad_norm": 0.00026864337269216776, + "learning_rate": 1.1239166161586933e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248717232, + "step": 115140 + }, + { + "epoch": 18.78384991843393, + "grad_norm": 0.002488876460120082, + "learning_rate": 1.1224163917753993e-05, + "loss": 0.0085, + "num_input_tokens_seen": 248727728, + "step": 115145 + }, + { + "epoch": 18.784665579119086, + "grad_norm": 0.00027354180929251015, + "learning_rate": 1.1209171579583399e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248738544, + "step": 115150 + }, + { + "epoch": 18.78548123980424, + "grad_norm": 0.0005132790538482368, + "learning_rate": 1.1194189147379018e-05, + "loss": 0.0013, + "num_input_tokens_seen": 248749456, + "step": 115155 + }, + { + "epoch": 18.786296900489397, + "grad_norm": 0.0002152713859686628, + "learning_rate": 1.1179216621444499e-05, + "loss": 0.001, + "num_input_tokens_seen": 248760208, + "step": 115160 + }, + { + "epoch": 18.787112561174553, + "grad_norm": 0.007923472672700882, + "learning_rate": 1.1164254002083262e-05, + "loss": 0.001, + "num_input_tokens_seen": 248771600, + "step": 115165 + }, + { + "epoch": 18.787928221859705, + "grad_norm": 0.00116739550139755, + "learning_rate": 1.1149301289598569e-05, + "loss": 0.0011, + "num_input_tokens_seen": 248782416, + "step": 115170 + }, + { + "epoch": 18.78874388254486, + "grad_norm": 0.0004970782902091742, + "learning_rate": 1.1134358484293395e-05, + "loss": 0.0028, + "num_input_tokens_seen": 248793296, + "step": 115175 + }, + { + "epoch": 18.789559543230016, + "grad_norm": 0.0009810201590880752, + "learning_rate": 1.1119425586470667e-05, + "loss": 0.0249, + "num_input_tokens_seen": 248804528, + "step": 115180 + }, + { + "epoch": 18.790375203915172, + "grad_norm": 0.0015762445982545614, + "learning_rate": 1.1104502596432863e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248814608, + "step": 115185 + }, + { + "epoch": 18.791190864600328, + "grad_norm": 0.0027050410863012075, + "learning_rate": 1.1089589514482635e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248825680, + "step": 115190 + }, + { + "epoch": 18.79200652528548, + "grad_norm": 0.004898820538073778, + "learning_rate": 1.1074686340922068e-05, + "loss": 0.0011, + "num_input_tokens_seen": 248837744, + "step": 115195 + }, + { + "epoch": 18.792822185970635, + "grad_norm": 0.0005425384151749313, + "learning_rate": 1.105979307605326e-05, + "loss": 0.002, + "num_input_tokens_seen": 248845904, + "step": 115200 + }, + { + "epoch": 18.79363784665579, + "grad_norm": 0.0005339878844097257, + "learning_rate": 1.104490972017791e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248856720, + "step": 115205 + }, + { + "epoch": 18.794453507340947, + "grad_norm": 0.003522041952237487, + "learning_rate": 1.1030036273597888e-05, + "loss": 0.0012, + "num_input_tokens_seen": 248867536, + "step": 115210 + }, + { + "epoch": 18.795269168026103, + "grad_norm": 0.0015702954260632396, + "learning_rate": 1.1015172736614343e-05, + "loss": 0.0032, + "num_input_tokens_seen": 248878096, + "step": 115215 + }, + { + "epoch": 18.796084828711255, + "grad_norm": 0.0037163293454796076, + "learning_rate": 1.1000319109528755e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248889936, + "step": 115220 + }, + { + "epoch": 18.79690048939641, + "grad_norm": 0.0001467862311983481, + "learning_rate": 1.0985475392641941e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248900464, + "step": 115225 + }, + { + "epoch": 18.797716150081566, + "grad_norm": 0.0035111182369291782, + "learning_rate": 1.0970641586254937e-05, + "loss": 0.001, + "num_input_tokens_seen": 248911280, + "step": 115230 + }, + { + "epoch": 18.798531810766722, + "grad_norm": 0.0016056197928264737, + "learning_rate": 1.0955817690668169e-05, + "loss": 0.0009, + "num_input_tokens_seen": 248922864, + "step": 115235 + }, + { + "epoch": 18.799347471451878, + "grad_norm": 0.0003613026347011328, + "learning_rate": 1.094100370618223e-05, + "loss": 0.0019, + "num_input_tokens_seen": 248932560, + "step": 115240 + }, + { + "epoch": 18.80016313213703, + "grad_norm": 0.0015075618866831064, + "learning_rate": 1.0926199633097156e-05, + "loss": 0.0004, + "num_input_tokens_seen": 248943440, + "step": 115245 + }, + { + "epoch": 18.800978792822185, + "grad_norm": 0.003285182174295187, + "learning_rate": 1.091140547171321e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248953712, + "step": 115250 + }, + { + "epoch": 18.80179445350734, + "grad_norm": 0.006845667026937008, + "learning_rate": 1.0896621222329983e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248965392, + "step": 115255 + }, + { + "epoch": 18.802610114192497, + "grad_norm": 0.003190365619957447, + "learning_rate": 1.0881846885247293e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248975952, + "step": 115260 + }, + { + "epoch": 18.803425774877653, + "grad_norm": 0.00023938875528983772, + "learning_rate": 1.0867082460764343e-05, + "loss": 0.0032, + "num_input_tokens_seen": 248987184, + "step": 115265 + }, + { + "epoch": 18.804241435562805, + "grad_norm": 0.0011155434185639024, + "learning_rate": 1.0852327949180618e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248998448, + "step": 115270 + }, + { + "epoch": 18.80505709624796, + "grad_norm": 0.001338596804998815, + "learning_rate": 1.0837583350794878e-05, + "loss": 0.0027, + "num_input_tokens_seen": 249008592, + "step": 115275 + }, + { + "epoch": 18.805872756933116, + "grad_norm": 0.01313408650457859, + "learning_rate": 1.0822848665906104e-05, + "loss": 0.0024, + "num_input_tokens_seen": 249019216, + "step": 115280 + }, + { + "epoch": 18.806688417618272, + "grad_norm": 0.008214665576815605, + "learning_rate": 1.0808123894812838e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249031056, + "step": 115285 + }, + { + "epoch": 18.807504078303424, + "grad_norm": 0.0005401197704486549, + "learning_rate": 1.0793409037813562e-05, + "loss": 0.0405, + "num_input_tokens_seen": 249041712, + "step": 115290 + }, + { + "epoch": 18.80831973898858, + "grad_norm": 0.0016540754586458206, + "learning_rate": 1.0778704095206427e-05, + "loss": 0.002, + "num_input_tokens_seen": 249051504, + "step": 115295 + }, + { + "epoch": 18.809135399673735, + "grad_norm": 0.004030495882034302, + "learning_rate": 1.0764009067289526e-05, + "loss": 0.0005, + "num_input_tokens_seen": 249062960, + "step": 115300 + }, + { + "epoch": 18.80995106035889, + "grad_norm": 0.001202249201014638, + "learning_rate": 1.0749323954360568e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249072560, + "step": 115305 + }, + { + "epoch": 18.810766721044047, + "grad_norm": 0.04477335512638092, + "learning_rate": 1.0734648756717258e-05, + "loss": 0.001, + "num_input_tokens_seen": 249083056, + "step": 115310 + }, + { + "epoch": 18.8115823817292, + "grad_norm": 0.0677478015422821, + "learning_rate": 1.0719983474656914e-05, + "loss": 0.0036, + "num_input_tokens_seen": 249092688, + "step": 115315 + }, + { + "epoch": 18.812398042414355, + "grad_norm": 0.00028590558213181794, + "learning_rate": 1.0705328108476852e-05, + "loss": 0.0005, + "num_input_tokens_seen": 249103696, + "step": 115320 + }, + { + "epoch": 18.81321370309951, + "grad_norm": 0.0034891394898295403, + "learning_rate": 1.0690682658474004e-05, + "loss": 0.0011, + "num_input_tokens_seen": 249115120, + "step": 115325 + }, + { + "epoch": 18.814029363784666, + "grad_norm": 0.004498482681810856, + "learning_rate": 1.0676047124945187e-05, + "loss": 0.0013, + "num_input_tokens_seen": 249126128, + "step": 115330 + }, + { + "epoch": 18.81484502446982, + "grad_norm": 0.11555361747741699, + "learning_rate": 1.0661421508187109e-05, + "loss": 0.0038, + "num_input_tokens_seen": 249137360, + "step": 115335 + }, + { + "epoch": 18.815660685154974, + "grad_norm": 0.0011468741577118635, + "learning_rate": 1.0646805808495974e-05, + "loss": 0.0084, + "num_input_tokens_seen": 249148400, + "step": 115340 + }, + { + "epoch": 18.81647634584013, + "grad_norm": 0.0004126753192394972, + "learning_rate": 1.0632200026168215e-05, + "loss": 0.0002, + "num_input_tokens_seen": 249160656, + "step": 115345 + }, + { + "epoch": 18.817292006525285, + "grad_norm": 0.011663687415421009, + "learning_rate": 1.061760416149965e-05, + "loss": 0.0147, + "num_input_tokens_seen": 249172304, + "step": 115350 + }, + { + "epoch": 18.81810766721044, + "grad_norm": 0.07764124870300293, + "learning_rate": 1.0603018214786264e-05, + "loss": 0.0031, + "num_input_tokens_seen": 249182896, + "step": 115355 + }, + { + "epoch": 18.818923327895597, + "grad_norm": 0.0025914544239640236, + "learning_rate": 1.0588442186323433e-05, + "loss": 0.0028, + "num_input_tokens_seen": 249193712, + "step": 115360 + }, + { + "epoch": 18.81973898858075, + "grad_norm": 0.0012623146176338196, + "learning_rate": 1.0573876076406807e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249203696, + "step": 115365 + }, + { + "epoch": 18.820554649265905, + "grad_norm": 0.007144573610275984, + "learning_rate": 1.055931988533132e-05, + "loss": 0.0012, + "num_input_tokens_seen": 249213936, + "step": 115370 + }, + { + "epoch": 18.82137030995106, + "grad_norm": 0.010374244302511215, + "learning_rate": 1.0544773613392289e-05, + "loss": 0.0061, + "num_input_tokens_seen": 249225520, + "step": 115375 + }, + { + "epoch": 18.822185970636216, + "grad_norm": 0.0038773410487920046, + "learning_rate": 1.0530237260884146e-05, + "loss": 0.0084, + "num_input_tokens_seen": 249236368, + "step": 115380 + }, + { + "epoch": 18.82300163132137, + "grad_norm": 0.002093594754114747, + "learning_rate": 1.051571082810182e-05, + "loss": 0.0022, + "num_input_tokens_seen": 249247984, + "step": 115385 + }, + { + "epoch": 18.823817292006524, + "grad_norm": 0.0006376361125148833, + "learning_rate": 1.0501194315339523e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249258832, + "step": 115390 + }, + { + "epoch": 18.82463295269168, + "grad_norm": 0.5937981605529785, + "learning_rate": 1.048668772289152e-05, + "loss": 0.0298, + "num_input_tokens_seen": 249269776, + "step": 115395 + }, + { + "epoch": 18.825448613376835, + "grad_norm": 0.0035166044253855944, + "learning_rate": 1.0472191051051738e-05, + "loss": 0.0012, + "num_input_tokens_seen": 249280432, + "step": 115400 + }, + { + "epoch": 18.82626427406199, + "grad_norm": 0.003904164768755436, + "learning_rate": 1.0457704300114057e-05, + "loss": 0.0017, + "num_input_tokens_seen": 249291472, + "step": 115405 + }, + { + "epoch": 18.827079934747147, + "grad_norm": 0.00040897587314248085, + "learning_rate": 1.0443227470372018e-05, + "loss": 0.0004, + "num_input_tokens_seen": 249302768, + "step": 115410 + }, + { + "epoch": 18.8278955954323, + "grad_norm": 0.0001675260136835277, + "learning_rate": 1.0428760562119e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249315216, + "step": 115415 + }, + { + "epoch": 18.828711256117455, + "grad_norm": 0.10226722061634064, + "learning_rate": 1.041430357564821e-05, + "loss": 0.0026, + "num_input_tokens_seen": 249326128, + "step": 115420 + }, + { + "epoch": 18.82952691680261, + "grad_norm": 0.00045980140566825867, + "learning_rate": 1.0399856511252692e-05, + "loss": 0.0003, + "num_input_tokens_seen": 249338608, + "step": 115425 + }, + { + "epoch": 18.830342577487766, + "grad_norm": 0.0012188085820525885, + "learning_rate": 1.0385419369225157e-05, + "loss": 0.0011, + "num_input_tokens_seen": 249348496, + "step": 115430 + }, + { + "epoch": 18.83115823817292, + "grad_norm": 0.0006280313245952129, + "learning_rate": 1.0370992149858205e-05, + "loss": 0.0004, + "num_input_tokens_seen": 249358832, + "step": 115435 + }, + { + "epoch": 18.831973898858074, + "grad_norm": 0.0012271900195628405, + "learning_rate": 1.0356574853444211e-05, + "loss": 0.0007, + "num_input_tokens_seen": 249367760, + "step": 115440 + }, + { + "epoch": 18.83278955954323, + "grad_norm": 0.004578801337629557, + "learning_rate": 1.0342167480275444e-05, + "loss": 0.002, + "num_input_tokens_seen": 249378320, + "step": 115445 + }, + { + "epoch": 18.833605220228385, + "grad_norm": 0.0065257553942501545, + "learning_rate": 1.032777003064378e-05, + "loss": 0.0015, + "num_input_tokens_seen": 249389328, + "step": 115450 + }, + { + "epoch": 18.83442088091354, + "grad_norm": 0.549997866153717, + "learning_rate": 1.0313382504841096e-05, + "loss": 0.0268, + "num_input_tokens_seen": 249402192, + "step": 115455 + }, + { + "epoch": 18.835236541598697, + "grad_norm": 0.00017082234262488782, + "learning_rate": 1.0299004903158882e-05, + "loss": 0.0004, + "num_input_tokens_seen": 249414096, + "step": 115460 + }, + { + "epoch": 18.83605220228385, + "grad_norm": 0.0017220403533428907, + "learning_rate": 1.0284637225888626e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249425520, + "step": 115465 + }, + { + "epoch": 18.836867862969005, + "grad_norm": 0.0026273017283529043, + "learning_rate": 1.0270279473321375e-05, + "loss": 0.0015, + "num_input_tokens_seen": 249437872, + "step": 115470 + }, + { + "epoch": 18.83768352365416, + "grad_norm": 0.0008895907667465508, + "learning_rate": 1.0255931645748174e-05, + "loss": 0.0013, + "num_input_tokens_seen": 249448848, + "step": 115475 + }, + { + "epoch": 18.838499184339316, + "grad_norm": 0.0026277219876646996, + "learning_rate": 1.0241593743459898e-05, + "loss": 0.0013, + "num_input_tokens_seen": 249461008, + "step": 115480 + }, + { + "epoch": 18.839314845024468, + "grad_norm": 0.0016103885136544704, + "learning_rate": 1.0227265766746874e-05, + "loss": 0.0232, + "num_input_tokens_seen": 249471216, + "step": 115485 + }, + { + "epoch": 18.840130505709624, + "grad_norm": 0.0002219690359197557, + "learning_rate": 1.0212947715899757e-05, + "loss": 0.0048, + "num_input_tokens_seen": 249482256, + "step": 115490 + }, + { + "epoch": 18.84094616639478, + "grad_norm": 0.0006362979183904827, + "learning_rate": 1.0198639591208535e-05, + "loss": 0.0005, + "num_input_tokens_seen": 249493328, + "step": 115495 + }, + { + "epoch": 18.841761827079935, + "grad_norm": 0.004850686062127352, + "learning_rate": 1.0184341392963259e-05, + "loss": 0.002, + "num_input_tokens_seen": 249505360, + "step": 115500 + }, + { + "epoch": 18.84257748776509, + "grad_norm": 0.006848242599517107, + "learning_rate": 1.0170053121453694e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249515632, + "step": 115505 + }, + { + "epoch": 18.843393148450243, + "grad_norm": 0.0005007157451473176, + "learning_rate": 1.0155774776969385e-05, + "loss": 0.0014, + "num_input_tokens_seen": 249525680, + "step": 115510 + }, + { + "epoch": 18.8442088091354, + "grad_norm": 0.002938035409897566, + "learning_rate": 1.0141506359799712e-05, + "loss": 0.0019, + "num_input_tokens_seen": 249535824, + "step": 115515 + }, + { + "epoch": 18.845024469820554, + "grad_norm": 0.0021078032441437244, + "learning_rate": 1.0127247870233836e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249547376, + "step": 115520 + }, + { + "epoch": 18.84584013050571, + "grad_norm": 0.032018523663282394, + "learning_rate": 1.011299930856069e-05, + "loss": 0.0034, + "num_input_tokens_seen": 249557808, + "step": 115525 + }, + { + "epoch": 18.846655791190866, + "grad_norm": 0.0021978262811899185, + "learning_rate": 1.0098760675069151e-05, + "loss": 0.0195, + "num_input_tokens_seen": 249566864, + "step": 115530 + }, + { + "epoch": 18.847471451876018, + "grad_norm": 0.003689467441290617, + "learning_rate": 1.0084531970047662e-05, + "loss": 0.0014, + "num_input_tokens_seen": 249577328, + "step": 115535 + }, + { + "epoch": 18.848287112561174, + "grad_norm": 0.004149348940700293, + "learning_rate": 1.0070313193784653e-05, + "loss": 0.0048, + "num_input_tokens_seen": 249588784, + "step": 115540 + }, + { + "epoch": 18.84910277324633, + "grad_norm": 0.007302007172256708, + "learning_rate": 1.0056104346568285e-05, + "loss": 0.0013, + "num_input_tokens_seen": 249598928, + "step": 115545 + }, + { + "epoch": 18.849918433931485, + "grad_norm": 0.0032237458508461714, + "learning_rate": 1.0041905428686493e-05, + "loss": 0.0015, + "num_input_tokens_seen": 249610160, + "step": 115550 + }, + { + "epoch": 18.85073409461664, + "grad_norm": 0.06821348518133163, + "learning_rate": 1.0027716440427049e-05, + "loss": 0.0021, + "num_input_tokens_seen": 249621776, + "step": 115555 + }, + { + "epoch": 18.851549755301793, + "grad_norm": 0.01791023463010788, + "learning_rate": 1.0013537382077443e-05, + "loss": 0.004, + "num_input_tokens_seen": 249633072, + "step": 115560 + }, + { + "epoch": 18.85236541598695, + "grad_norm": 0.00621650880202651, + "learning_rate": 9.999368253925167e-06, + "loss": 0.0031, + "num_input_tokens_seen": 249643568, + "step": 115565 + }, + { + "epoch": 18.853181076672104, + "grad_norm": 0.012429935857653618, + "learning_rate": 9.985209056257272e-06, + "loss": 0.0153, + "num_input_tokens_seen": 249654832, + "step": 115570 + }, + { + "epoch": 18.85399673735726, + "grad_norm": 0.0016874076100066304, + "learning_rate": 9.971059789360749e-06, + "loss": 0.0004, + "num_input_tokens_seen": 249666416, + "step": 115575 + }, + { + "epoch": 18.854812398042416, + "grad_norm": 0.05518745630979538, + "learning_rate": 9.956920453522366e-06, + "loss": 0.0016, + "num_input_tokens_seen": 249677808, + "step": 115580 + }, + { + "epoch": 18.855628058727568, + "grad_norm": 0.015488969162106514, + "learning_rate": 9.942791049028621e-06, + "loss": 0.001, + "num_input_tokens_seen": 249688176, + "step": 115585 + }, + { + "epoch": 18.856443719412724, + "grad_norm": 0.01876658760011196, + "learning_rate": 9.928671576165893e-06, + "loss": 0.0032, + "num_input_tokens_seen": 249699920, + "step": 115590 + }, + { + "epoch": 18.85725938009788, + "grad_norm": 0.001732186763547361, + "learning_rate": 9.914562035220287e-06, + "loss": 0.0026, + "num_input_tokens_seen": 249710800, + "step": 115595 + }, + { + "epoch": 18.858075040783035, + "grad_norm": 0.010737020522356033, + "learning_rate": 9.900462426477908e-06, + "loss": 0.0021, + "num_input_tokens_seen": 249721712, + "step": 115600 + }, + { + "epoch": 18.85889070146819, + "grad_norm": 0.0032939244993031025, + "learning_rate": 9.886372750224304e-06, + "loss": 0.003, + "num_input_tokens_seen": 249733040, + "step": 115605 + }, + { + "epoch": 18.859706362153343, + "grad_norm": 0.000969278160482645, + "learning_rate": 9.872293006745192e-06, + "loss": 0.0481, + "num_input_tokens_seen": 249744848, + "step": 115610 + }, + { + "epoch": 18.8605220228385, + "grad_norm": 0.0010464123915880919, + "learning_rate": 9.858223196325789e-06, + "loss": 0.0004, + "num_input_tokens_seen": 249755248, + "step": 115615 + }, + { + "epoch": 18.861337683523654, + "grad_norm": 0.0052170101553201675, + "learning_rate": 9.844163319251253e-06, + "loss": 0.0023, + "num_input_tokens_seen": 249766352, + "step": 115620 + }, + { + "epoch": 18.86215334420881, + "grad_norm": 0.0025711439084261656, + "learning_rate": 9.830113375806582e-06, + "loss": 0.0005, + "num_input_tokens_seen": 249776912, + "step": 115625 + }, + { + "epoch": 18.862969004893966, + "grad_norm": 0.0010705140884965658, + "learning_rate": 9.816073366276545e-06, + "loss": 0.0006, + "num_input_tokens_seen": 249788272, + "step": 115630 + }, + { + "epoch": 18.863784665579118, + "grad_norm": 0.11389190703630447, + "learning_rate": 9.802043290945529e-06, + "loss": 0.0059, + "num_input_tokens_seen": 249800368, + "step": 115635 + }, + { + "epoch": 18.864600326264274, + "grad_norm": 0.02490387298166752, + "learning_rate": 9.788023150098024e-06, + "loss": 0.0035, + "num_input_tokens_seen": 249811408, + "step": 115640 + }, + { + "epoch": 18.86541598694943, + "grad_norm": 0.017327111214399338, + "learning_rate": 9.774012944018085e-06, + "loss": 0.0006, + "num_input_tokens_seen": 249821808, + "step": 115645 + }, + { + "epoch": 18.866231647634585, + "grad_norm": 0.059728048741817474, + "learning_rate": 9.760012672989704e-06, + "loss": 0.0044, + "num_input_tokens_seen": 249832528, + "step": 115650 + }, + { + "epoch": 18.86704730831974, + "grad_norm": 0.02098773419857025, + "learning_rate": 9.746022337296546e-06, + "loss": 0.0035, + "num_input_tokens_seen": 249842640, + "step": 115655 + }, + { + "epoch": 18.867862969004893, + "grad_norm": 0.00045805005356669426, + "learning_rate": 9.732041937222157e-06, + "loss": 0.0007, + "num_input_tokens_seen": 249852240, + "step": 115660 + }, + { + "epoch": 18.86867862969005, + "grad_norm": 0.002120513003319502, + "learning_rate": 9.718071473049927e-06, + "loss": 0.001, + "num_input_tokens_seen": 249863056, + "step": 115665 + }, + { + "epoch": 18.869494290375204, + "grad_norm": 0.016152381896972656, + "learning_rate": 9.70411094506296e-06, + "loss": 0.0011, + "num_input_tokens_seen": 249874224, + "step": 115670 + }, + { + "epoch": 18.87030995106036, + "grad_norm": 0.025782400742173195, + "learning_rate": 9.690160353544142e-06, + "loss": 0.0004, + "num_input_tokens_seen": 249882704, + "step": 115675 + }, + { + "epoch": 18.871125611745512, + "grad_norm": 0.009104576893150806, + "learning_rate": 9.67621969877619e-06, + "loss": 0.0009, + "num_input_tokens_seen": 249893424, + "step": 115680 + }, + { + "epoch": 18.871941272430668, + "grad_norm": 0.008896476589143276, + "learning_rate": 9.66228898104171e-06, + "loss": 0.0005, + "num_input_tokens_seen": 249902992, + "step": 115685 + }, + { + "epoch": 18.872756933115824, + "grad_norm": 0.002238509012386203, + "learning_rate": 9.64836820062298e-06, + "loss": 0.0012, + "num_input_tokens_seen": 249913072, + "step": 115690 + }, + { + "epoch": 18.87357259380098, + "grad_norm": 0.03264370560646057, + "learning_rate": 9.634457357802107e-06, + "loss": 0.001, + "num_input_tokens_seen": 249923312, + "step": 115695 + }, + { + "epoch": 18.874388254486135, + "grad_norm": 0.06711148470640182, + "learning_rate": 9.62055645286103e-06, + "loss": 0.0342, + "num_input_tokens_seen": 249933712, + "step": 115700 + }, + { + "epoch": 18.875203915171287, + "grad_norm": 0.0005814563482999802, + "learning_rate": 9.606665486081522e-06, + "loss": 0.0003, + "num_input_tokens_seen": 249943024, + "step": 115705 + }, + { + "epoch": 18.876019575856443, + "grad_norm": 0.0026038573123514652, + "learning_rate": 9.592784457744918e-06, + "loss": 0.0016, + "num_input_tokens_seen": 249953648, + "step": 115710 + }, + { + "epoch": 18.8768352365416, + "grad_norm": 0.0004891370190307498, + "learning_rate": 9.578913368132824e-06, + "loss": 0.0003, + "num_input_tokens_seen": 249963536, + "step": 115715 + }, + { + "epoch": 18.877650897226754, + "grad_norm": 0.00019704003352671862, + "learning_rate": 9.565052217526072e-06, + "loss": 0.0004, + "num_input_tokens_seen": 249974064, + "step": 115720 + }, + { + "epoch": 18.87846655791191, + "grad_norm": 0.19600987434387207, + "learning_rate": 9.551201006205767e-06, + "loss": 0.0057, + "num_input_tokens_seen": 249984240, + "step": 115725 + }, + { + "epoch": 18.879282218597062, + "grad_norm": 0.025361159816384315, + "learning_rate": 9.537359734452466e-06, + "loss": 0.0008, + "num_input_tokens_seen": 249996016, + "step": 115730 + }, + { + "epoch": 18.880097879282218, + "grad_norm": 0.0005024754791520536, + "learning_rate": 9.523528402546888e-06, + "loss": 0.0064, + "num_input_tokens_seen": 250007408, + "step": 115735 + }, + { + "epoch": 18.880913539967374, + "grad_norm": 0.0042595332488417625, + "learning_rate": 9.509707010769086e-06, + "loss": 0.0015, + "num_input_tokens_seen": 250017456, + "step": 115740 + }, + { + "epoch": 18.88172920065253, + "grad_norm": 0.019248517230153084, + "learning_rate": 9.495895559399449e-06, + "loss": 0.0541, + "num_input_tokens_seen": 250029296, + "step": 115745 + }, + { + "epoch": 18.882544861337685, + "grad_norm": 0.0002776541223283857, + "learning_rate": 9.482094048717637e-06, + "loss": 0.0017, + "num_input_tokens_seen": 250040336, + "step": 115750 + }, + { + "epoch": 18.883360522022837, + "grad_norm": 0.009291916154325008, + "learning_rate": 9.468302479003487e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250050896, + "step": 115755 + }, + { + "epoch": 18.884176182707993, + "grad_norm": 0.002779381349682808, + "learning_rate": 9.45452085053644e-06, + "loss": 0.004, + "num_input_tokens_seen": 250061168, + "step": 115760 + }, + { + "epoch": 18.88499184339315, + "grad_norm": 0.004881167318671942, + "learning_rate": 9.44074916359583e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250071632, + "step": 115765 + }, + { + "epoch": 18.885807504078304, + "grad_norm": 0.012340943329036236, + "learning_rate": 9.42698741846082e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250082768, + "step": 115770 + }, + { + "epoch": 18.88662316476346, + "grad_norm": 0.07426692545413971, + "learning_rate": 9.413235615410188e-06, + "loss": 0.0325, + "num_input_tokens_seen": 250093936, + "step": 115775 + }, + { + "epoch": 18.887438825448612, + "grad_norm": 0.0016281426651403308, + "learning_rate": 9.39949375472271e-06, + "loss": 0.0517, + "num_input_tokens_seen": 250105040, + "step": 115780 + }, + { + "epoch": 18.888254486133768, + "grad_norm": 0.009493221528828144, + "learning_rate": 9.385761836676832e-06, + "loss": 0.0015, + "num_input_tokens_seen": 250114960, + "step": 115785 + }, + { + "epoch": 18.889070146818923, + "grad_norm": 0.0010956472251564264, + "learning_rate": 9.37203986155094e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250126224, + "step": 115790 + }, + { + "epoch": 18.88988580750408, + "grad_norm": 0.0018236135365441442, + "learning_rate": 9.358327829623038e-06, + "loss": 0.0007, + "num_input_tokens_seen": 250135536, + "step": 115795 + }, + { + "epoch": 18.890701468189235, + "grad_norm": 0.17699794471263885, + "learning_rate": 9.344625741171009e-06, + "loss": 0.0035, + "num_input_tokens_seen": 250146128, + "step": 115800 + }, + { + "epoch": 18.891517128874387, + "grad_norm": 0.028981979936361313, + "learning_rate": 9.330933596472635e-06, + "loss": 0.0049, + "num_input_tokens_seen": 250157232, + "step": 115805 + }, + { + "epoch": 18.892332789559543, + "grad_norm": 0.0012324347626417875, + "learning_rate": 9.317251395805304e-06, + "loss": 0.0008, + "num_input_tokens_seen": 250167888, + "step": 115810 + }, + { + "epoch": 18.8931484502447, + "grad_norm": 0.004059400409460068, + "learning_rate": 9.303579139446349e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250179184, + "step": 115815 + }, + { + "epoch": 18.893964110929854, + "grad_norm": 0.006122584920376539, + "learning_rate": 9.28991682767294e-06, + "loss": 0.1101, + "num_input_tokens_seen": 250188624, + "step": 115820 + }, + { + "epoch": 18.894779771615006, + "grad_norm": 0.0018198771867901087, + "learning_rate": 9.27626446076174e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250197712, + "step": 115825 + }, + { + "epoch": 18.895595432300162, + "grad_norm": 0.04111974686384201, + "learning_rate": 9.2626220389897e-06, + "loss": 0.0015, + "num_input_tokens_seen": 250208272, + "step": 115830 + }, + { + "epoch": 18.896411092985318, + "grad_norm": 0.00017160769493784755, + "learning_rate": 9.248989562633037e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250218448, + "step": 115835 + }, + { + "epoch": 18.897226753670473, + "grad_norm": 0.0034603665117174387, + "learning_rate": 9.235367031968312e-06, + "loss": 0.0744, + "num_input_tokens_seen": 250229552, + "step": 115840 + }, + { + "epoch": 18.89804241435563, + "grad_norm": 0.0018716700142249465, + "learning_rate": 9.221754447271302e-06, + "loss": 0.002, + "num_input_tokens_seen": 250240080, + "step": 115845 + }, + { + "epoch": 18.898858075040785, + "grad_norm": 0.003498975420370698, + "learning_rate": 9.208151808818177e-06, + "loss": 0.001, + "num_input_tokens_seen": 250251152, + "step": 115850 + }, + { + "epoch": 18.899673735725937, + "grad_norm": 0.007458406500518322, + "learning_rate": 9.194559116884327e-06, + "loss": 0.0056, + "num_input_tokens_seen": 250262480, + "step": 115855 + }, + { + "epoch": 18.900489396411093, + "grad_norm": 0.0024467897601425648, + "learning_rate": 9.18097637174553e-06, + "loss": 0.1571, + "num_input_tokens_seen": 250273584, + "step": 115860 + }, + { + "epoch": 18.90130505709625, + "grad_norm": 0.0011468434240669012, + "learning_rate": 9.167403573676736e-06, + "loss": 0.054, + "num_input_tokens_seen": 250285488, + "step": 115865 + }, + { + "epoch": 18.902120717781404, + "grad_norm": 0.0010236542439088225, + "learning_rate": 9.153840722953278e-06, + "loss": 0.0011, + "num_input_tokens_seen": 250295536, + "step": 115870 + }, + { + "epoch": 18.902936378466556, + "grad_norm": 0.0005618699942715466, + "learning_rate": 9.14028781984988e-06, + "loss": 0.0083, + "num_input_tokens_seen": 250306288, + "step": 115875 + }, + { + "epoch": 18.903752039151712, + "grad_norm": 0.00021621494670398533, + "learning_rate": 9.126744864641267e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250318352, + "step": 115880 + }, + { + "epoch": 18.904567699836868, + "grad_norm": 0.0009274820913560688, + "learning_rate": 9.113211857601833e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250328528, + "step": 115885 + }, + { + "epoch": 18.905383360522023, + "grad_norm": 0.007432404439896345, + "learning_rate": 9.099688799005967e-06, + "loss": 0.0009, + "num_input_tokens_seen": 250339536, + "step": 115890 + }, + { + "epoch": 18.90619902120718, + "grad_norm": 0.0005458049126900733, + "learning_rate": 9.086175689127618e-06, + "loss": 0.0013, + "num_input_tokens_seen": 250350640, + "step": 115895 + }, + { + "epoch": 18.90701468189233, + "grad_norm": 0.0005109109915792942, + "learning_rate": 9.072672528240733e-06, + "loss": 0.0193, + "num_input_tokens_seen": 250360624, + "step": 115900 + }, + { + "epoch": 18.907830342577487, + "grad_norm": 0.006997792515903711, + "learning_rate": 9.059179316618871e-06, + "loss": 0.104, + "num_input_tokens_seen": 250371088, + "step": 115905 + }, + { + "epoch": 18.908646003262643, + "grad_norm": 0.04977473244071007, + "learning_rate": 9.045696054535535e-06, + "loss": 0.0017, + "num_input_tokens_seen": 250381872, + "step": 115910 + }, + { + "epoch": 18.9094616639478, + "grad_norm": 0.0013368047075346112, + "learning_rate": 9.032222742264008e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250391632, + "step": 115915 + }, + { + "epoch": 18.910277324632954, + "grad_norm": 0.009490997530519962, + "learning_rate": 9.018759380077346e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250403632, + "step": 115920 + }, + { + "epoch": 18.911092985318106, + "grad_norm": 0.0015856948448345065, + "learning_rate": 9.005305968248334e-06, + "loss": 0.0011, + "num_input_tokens_seen": 250414704, + "step": 115925 + }, + { + "epoch": 18.911908646003262, + "grad_norm": 0.0006589622935280204, + "learning_rate": 8.991862507049698e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250425872, + "step": 115930 + }, + { + "epoch": 18.912724306688418, + "grad_norm": 0.0006037901039235294, + "learning_rate": 8.978428996753885e-06, + "loss": 0.0007, + "num_input_tokens_seen": 250436752, + "step": 115935 + }, + { + "epoch": 18.913539967373573, + "grad_norm": 0.0019094824092462659, + "learning_rate": 8.965005437633067e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250447952, + "step": 115940 + }, + { + "epoch": 18.91435562805873, + "grad_norm": 0.018927903845906258, + "learning_rate": 8.95159182995936e-06, + "loss": 0.0012, + "num_input_tokens_seen": 250459888, + "step": 115945 + }, + { + "epoch": 18.91517128874388, + "grad_norm": 0.00041185764712281525, + "learning_rate": 8.938188174004602e-06, + "loss": 0.0022, + "num_input_tokens_seen": 250470704, + "step": 115950 + }, + { + "epoch": 18.915986949429037, + "grad_norm": 0.009737544693052769, + "learning_rate": 8.924794470040354e-06, + "loss": 0.0008, + "num_input_tokens_seen": 250480880, + "step": 115955 + }, + { + "epoch": 18.916802610114193, + "grad_norm": 0.0019982964731752872, + "learning_rate": 8.91141071833812e-06, + "loss": 0.0028, + "num_input_tokens_seen": 250491568, + "step": 115960 + }, + { + "epoch": 18.91761827079935, + "grad_norm": 0.01937304250895977, + "learning_rate": 8.89803691916924e-06, + "loss": 0.0052, + "num_input_tokens_seen": 250501840, + "step": 115965 + }, + { + "epoch": 18.918433931484504, + "grad_norm": 0.00334971328265965, + "learning_rate": 8.88467307280455e-06, + "loss": 0.0018, + "num_input_tokens_seen": 250513776, + "step": 115970 + }, + { + "epoch": 18.919249592169656, + "grad_norm": 0.013554212637245655, + "learning_rate": 8.871319179515058e-06, + "loss": 0.0008, + "num_input_tokens_seen": 250524400, + "step": 115975 + }, + { + "epoch": 18.920065252854812, + "grad_norm": 0.06993885338306427, + "learning_rate": 8.857975239571215e-06, + "loss": 0.0027, + "num_input_tokens_seen": 250535088, + "step": 115980 + }, + { + "epoch": 18.920880913539968, + "grad_norm": 0.005087228491902351, + "learning_rate": 8.84464125324369e-06, + "loss": 0.0029, + "num_input_tokens_seen": 250547056, + "step": 115985 + }, + { + "epoch": 18.921696574225123, + "grad_norm": 0.00482860766351223, + "learning_rate": 8.831317220802493e-06, + "loss": 0.0029, + "num_input_tokens_seen": 250558256, + "step": 115990 + }, + { + "epoch": 18.92251223491028, + "grad_norm": 0.0009906106861308217, + "learning_rate": 8.818003142517794e-06, + "loss": 0.0015, + "num_input_tokens_seen": 250569360, + "step": 115995 + }, + { + "epoch": 18.92332789559543, + "grad_norm": 0.004632228519767523, + "learning_rate": 8.804699018659324e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250581008, + "step": 116000 + }, + { + "epoch": 18.924143556280587, + "grad_norm": 0.0004046796530019492, + "learning_rate": 8.79140484949681e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250592272, + "step": 116005 + }, + { + "epoch": 18.924959216965743, + "grad_norm": 0.006622139364480972, + "learning_rate": 8.778120635299537e-06, + "loss": 0.0061, + "num_input_tokens_seen": 250603376, + "step": 116010 + }, + { + "epoch": 18.9257748776509, + "grad_norm": 0.0018909795908257365, + "learning_rate": 8.7648463763369e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250614224, + "step": 116015 + }, + { + "epoch": 18.92659053833605, + "grad_norm": 0.0008149382774718106, + "learning_rate": 8.751582072877739e-06, + "loss": 0.0008, + "num_input_tokens_seen": 250624752, + "step": 116020 + }, + { + "epoch": 18.927406199021206, + "grad_norm": 0.0009487331844866276, + "learning_rate": 8.738327725191064e-06, + "loss": 0.0011, + "num_input_tokens_seen": 250635504, + "step": 116025 + }, + { + "epoch": 18.928221859706362, + "grad_norm": 0.0005238814628683031, + "learning_rate": 8.725083333545326e-06, + "loss": 0.0007, + "num_input_tokens_seen": 250646096, + "step": 116030 + }, + { + "epoch": 18.929037520391518, + "grad_norm": 0.00510450080037117, + "learning_rate": 8.711848898208974e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250657424, + "step": 116035 + }, + { + "epoch": 18.929853181076673, + "grad_norm": 0.009839179925620556, + "learning_rate": 8.698624419450296e-06, + "loss": 0.002, + "num_input_tokens_seen": 250665936, + "step": 116040 + }, + { + "epoch": 18.930668841761825, + "grad_norm": 0.10089553147554398, + "learning_rate": 8.685409897537244e-06, + "loss": 0.0028, + "num_input_tokens_seen": 250676912, + "step": 116045 + }, + { + "epoch": 18.93148450244698, + "grad_norm": 0.0005019927630200982, + "learning_rate": 8.672205332737603e-06, + "loss": 0.0089, + "num_input_tokens_seen": 250688080, + "step": 116050 + }, + { + "epoch": 18.932300163132137, + "grad_norm": 0.00039481374551542103, + "learning_rate": 8.65901072531905e-06, + "loss": 0.0008, + "num_input_tokens_seen": 250698224, + "step": 116055 + }, + { + "epoch": 18.933115823817293, + "grad_norm": 0.0027844554278999567, + "learning_rate": 8.64582607554898e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250709776, + "step": 116060 + }, + { + "epoch": 18.93393148450245, + "grad_norm": 0.0012518571456894279, + "learning_rate": 8.632651383694513e-06, + "loss": 0.0011, + "num_input_tokens_seen": 250721360, + "step": 116065 + }, + { + "epoch": 18.9347471451876, + "grad_norm": 0.05422195792198181, + "learning_rate": 8.619486650022768e-06, + "loss": 0.0024, + "num_input_tokens_seen": 250732112, + "step": 116070 + }, + { + "epoch": 18.935562805872756, + "grad_norm": 0.0015667621046304703, + "learning_rate": 8.606331874800421e-06, + "loss": 0.0022, + "num_input_tokens_seen": 250742384, + "step": 116075 + }, + { + "epoch": 18.936378466557912, + "grad_norm": 0.0554991140961647, + "learning_rate": 8.593187058294205e-06, + "loss": 0.0018, + "num_input_tokens_seen": 250752048, + "step": 116080 + }, + { + "epoch": 18.937194127243067, + "grad_norm": 0.005774365738034248, + "learning_rate": 8.580052200770405e-06, + "loss": 0.0054, + "num_input_tokens_seen": 250762704, + "step": 116085 + }, + { + "epoch": 18.938009787928223, + "grad_norm": 0.00019944304949603975, + "learning_rate": 8.566927302495254e-06, + "loss": 0.001, + "num_input_tokens_seen": 250772336, + "step": 116090 + }, + { + "epoch": 18.938825448613375, + "grad_norm": 0.005691157653927803, + "learning_rate": 8.553812363734759e-06, + "loss": 0.0025, + "num_input_tokens_seen": 250783184, + "step": 116095 + }, + { + "epoch": 18.93964110929853, + "grad_norm": 0.0055681378580629826, + "learning_rate": 8.54070738475471e-06, + "loss": 0.001, + "num_input_tokens_seen": 250793360, + "step": 116100 + }, + { + "epoch": 18.940456769983687, + "grad_norm": 0.0003302092372905463, + "learning_rate": 8.527612365820613e-06, + "loss": 0.0007, + "num_input_tokens_seen": 250804880, + "step": 116105 + }, + { + "epoch": 18.941272430668842, + "grad_norm": 0.002624097280204296, + "learning_rate": 8.514527307198038e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250814416, + "step": 116110 + }, + { + "epoch": 18.942088091353998, + "grad_norm": 0.007586228661239147, + "learning_rate": 8.501452209151995e-06, + "loss": 0.0032, + "num_input_tokens_seen": 250825552, + "step": 116115 + }, + { + "epoch": 18.94290375203915, + "grad_norm": 0.0007885852828621864, + "learning_rate": 8.488387071947601e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250836336, + "step": 116120 + }, + { + "epoch": 18.943719412724306, + "grad_norm": 0.04808273911476135, + "learning_rate": 8.47533189584948e-06, + "loss": 0.0021, + "num_input_tokens_seen": 250847984, + "step": 116125 + }, + { + "epoch": 18.94453507340946, + "grad_norm": 0.00023655618133489043, + "learning_rate": 8.46228668112231e-06, + "loss": 0.001, + "num_input_tokens_seen": 250858512, + "step": 116130 + }, + { + "epoch": 18.945350734094617, + "grad_norm": 0.0001312753011006862, + "learning_rate": 8.449251428030492e-06, + "loss": 0.0012, + "num_input_tokens_seen": 250869776, + "step": 116135 + }, + { + "epoch": 18.946166394779773, + "grad_norm": 0.00039740095962770283, + "learning_rate": 8.436226136838198e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250880656, + "step": 116140 + }, + { + "epoch": 18.946982055464925, + "grad_norm": 0.003538029734045267, + "learning_rate": 8.423210807809333e-06, + "loss": 0.0029, + "num_input_tokens_seen": 250891376, + "step": 116145 + }, + { + "epoch": 18.94779771615008, + "grad_norm": 0.00038812385173514485, + "learning_rate": 8.410205441207741e-06, + "loss": 0.0501, + "num_input_tokens_seen": 250903600, + "step": 116150 + }, + { + "epoch": 18.948613376835237, + "grad_norm": 0.09095561504364014, + "learning_rate": 8.397210037296931e-06, + "loss": 0.0016, + "num_input_tokens_seen": 250914896, + "step": 116155 + }, + { + "epoch": 18.949429037520392, + "grad_norm": 0.9006114602088928, + "learning_rate": 8.384224596340306e-06, + "loss": 0.138, + "num_input_tokens_seen": 250924848, + "step": 116160 + }, + { + "epoch": 18.950244698205548, + "grad_norm": 0.1281556338071823, + "learning_rate": 8.371249118601043e-06, + "loss": 0.0046, + "num_input_tokens_seen": 250935472, + "step": 116165 + }, + { + "epoch": 18.9510603588907, + "grad_norm": 0.011393179185688496, + "learning_rate": 8.358283604342098e-06, + "loss": 0.0012, + "num_input_tokens_seen": 250946960, + "step": 116170 + }, + { + "epoch": 18.951876019575856, + "grad_norm": 0.0008362371590919793, + "learning_rate": 8.345328053826207e-06, + "loss": 0.0011, + "num_input_tokens_seen": 250957040, + "step": 116175 + }, + { + "epoch": 18.95269168026101, + "grad_norm": 0.00016408613009843975, + "learning_rate": 8.33238246731599e-06, + "loss": 0.0033, + "num_input_tokens_seen": 250968496, + "step": 116180 + }, + { + "epoch": 18.953507340946167, + "grad_norm": 0.0007871238049119711, + "learning_rate": 8.319446845073741e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250979600, + "step": 116185 + }, + { + "epoch": 18.954323001631323, + "grad_norm": 0.007136243861168623, + "learning_rate": 8.306521187361638e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250989200, + "step": 116190 + }, + { + "epoch": 18.955138662316475, + "grad_norm": 0.20850905776023865, + "learning_rate": 8.293605494441636e-06, + "loss": 0.0036, + "num_input_tokens_seen": 250999824, + "step": 116195 + }, + { + "epoch": 18.95595432300163, + "grad_norm": 0.0009187610703520477, + "learning_rate": 8.280699766575528e-06, + "loss": 0.0006, + "num_input_tokens_seen": 251010576, + "step": 116200 + }, + { + "epoch": 18.956769983686787, + "grad_norm": 0.08381669223308563, + "learning_rate": 8.26780400402477e-06, + "loss": 0.0037, + "num_input_tokens_seen": 251021808, + "step": 116205 + }, + { + "epoch": 18.957585644371942, + "grad_norm": 0.027391066774725914, + "learning_rate": 8.254918207050821e-06, + "loss": 0.0015, + "num_input_tokens_seen": 251033168, + "step": 116210 + }, + { + "epoch": 18.958401305057095, + "grad_norm": 0.002228178782388568, + "learning_rate": 8.242042375914748e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251044976, + "step": 116215 + }, + { + "epoch": 18.95921696574225, + "grad_norm": 0.00165777956135571, + "learning_rate": 8.229176510877512e-06, + "loss": 0.0018, + "num_input_tokens_seen": 251056816, + "step": 116220 + }, + { + "epoch": 18.960032626427406, + "grad_norm": 0.008388200774788857, + "learning_rate": 8.216320612199901e-06, + "loss": 0.0007, + "num_input_tokens_seen": 251067568, + "step": 116225 + }, + { + "epoch": 18.96084828711256, + "grad_norm": 0.008720295503735542, + "learning_rate": 8.203474680142431e-06, + "loss": 0.0083, + "num_input_tokens_seen": 251077456, + "step": 116230 + }, + { + "epoch": 18.961663947797717, + "grad_norm": 0.007424628362059593, + "learning_rate": 8.190638714965393e-06, + "loss": 0.0012, + "num_input_tokens_seen": 251089232, + "step": 116235 + }, + { + "epoch": 18.96247960848287, + "grad_norm": 0.005576164927333593, + "learning_rate": 8.177812716928967e-06, + "loss": 0.0004, + "num_input_tokens_seen": 251099536, + "step": 116240 + }, + { + "epoch": 18.963295269168025, + "grad_norm": 0.00031054625287652016, + "learning_rate": 8.164996686293114e-06, + "loss": 0.0009, + "num_input_tokens_seen": 251110128, + "step": 116245 + }, + { + "epoch": 18.96411092985318, + "grad_norm": 0.016679290682077408, + "learning_rate": 8.152190623317569e-06, + "loss": 0.0014, + "num_input_tokens_seen": 251120368, + "step": 116250 + }, + { + "epoch": 18.964926590538337, + "grad_norm": 0.07447069138288498, + "learning_rate": 8.13939452826179e-06, + "loss": 0.0028, + "num_input_tokens_seen": 251130672, + "step": 116255 + }, + { + "epoch": 18.965742251223492, + "grad_norm": 0.00019847380463033915, + "learning_rate": 8.126608401385183e-06, + "loss": 0.0708, + "num_input_tokens_seen": 251141264, + "step": 116260 + }, + { + "epoch": 18.966557911908644, + "grad_norm": 0.002280977787449956, + "learning_rate": 8.113832242946818e-06, + "loss": 0.005, + "num_input_tokens_seen": 251153040, + "step": 116265 + }, + { + "epoch": 18.9673735725938, + "grad_norm": 0.0031850580126047134, + "learning_rate": 8.101066053205653e-06, + "loss": 0.0013, + "num_input_tokens_seen": 251163504, + "step": 116270 + }, + { + "epoch": 18.968189233278956, + "grad_norm": 0.06683320552110672, + "learning_rate": 8.08830983242037e-06, + "loss": 0.0042, + "num_input_tokens_seen": 251174480, + "step": 116275 + }, + { + "epoch": 18.96900489396411, + "grad_norm": 0.0011318209581077099, + "learning_rate": 8.0755635808496e-06, + "loss": 0.001, + "num_input_tokens_seen": 251185200, + "step": 116280 + }, + { + "epoch": 18.969820554649267, + "grad_norm": 0.005109846591949463, + "learning_rate": 8.062827298751518e-06, + "loss": 0.0011, + "num_input_tokens_seen": 251197904, + "step": 116285 + }, + { + "epoch": 18.97063621533442, + "grad_norm": 0.00022063420328777283, + "learning_rate": 8.050100986384312e-06, + "loss": 0.0029, + "num_input_tokens_seen": 251207984, + "step": 116290 + }, + { + "epoch": 18.971451876019575, + "grad_norm": 0.006765791680663824, + "learning_rate": 8.037384644005941e-06, + "loss": 0.0023, + "num_input_tokens_seen": 251218704, + "step": 116295 + }, + { + "epoch": 18.97226753670473, + "grad_norm": 0.0005463669076561928, + "learning_rate": 8.024678271874031e-06, + "loss": 0.0189, + "num_input_tokens_seen": 251229136, + "step": 116300 + }, + { + "epoch": 18.973083197389887, + "grad_norm": 0.006524787284433842, + "learning_rate": 8.011981870246099e-06, + "loss": 0.0029, + "num_input_tokens_seen": 251240272, + "step": 116305 + }, + { + "epoch": 18.973898858075042, + "grad_norm": 0.026674775406718254, + "learning_rate": 7.99929543937955e-06, + "loss": 0.0042, + "num_input_tokens_seen": 251250800, + "step": 116310 + }, + { + "epoch": 18.974714518760194, + "grad_norm": 0.004923704545944929, + "learning_rate": 7.9866189795314e-06, + "loss": 0.0007, + "num_input_tokens_seen": 251260720, + "step": 116315 + }, + { + "epoch": 18.97553017944535, + "grad_norm": 0.0006612880388274789, + "learning_rate": 7.973952490958559e-06, + "loss": 0.0016, + "num_input_tokens_seen": 251272080, + "step": 116320 + }, + { + "epoch": 18.976345840130506, + "grad_norm": 0.40225768089294434, + "learning_rate": 7.961295973917759e-06, + "loss": 0.0155, + "num_input_tokens_seen": 251282800, + "step": 116325 + }, + { + "epoch": 18.97716150081566, + "grad_norm": 0.023114413022994995, + "learning_rate": 7.948649428665522e-06, + "loss": 0.001, + "num_input_tokens_seen": 251293648, + "step": 116330 + }, + { + "epoch": 18.977977161500817, + "grad_norm": 0.003399110399186611, + "learning_rate": 7.936012855458085e-06, + "loss": 0.0008, + "num_input_tokens_seen": 251304368, + "step": 116335 + }, + { + "epoch": 18.97879282218597, + "grad_norm": 0.01887008547782898, + "learning_rate": 7.923386254551523e-06, + "loss": 0.0142, + "num_input_tokens_seen": 251314992, + "step": 116340 + }, + { + "epoch": 18.979608482871125, + "grad_norm": 0.01707925647497177, + "learning_rate": 7.910769626201908e-06, + "loss": 0.0153, + "num_input_tokens_seen": 251325232, + "step": 116345 + }, + { + "epoch": 18.98042414355628, + "grad_norm": 0.00015691196313127875, + "learning_rate": 7.898162970664702e-06, + "loss": 0.0007, + "num_input_tokens_seen": 251335792, + "step": 116350 + }, + { + "epoch": 18.981239804241437, + "grad_norm": 0.004649725742638111, + "learning_rate": 7.88556628819559e-06, + "loss": 0.0005, + "num_input_tokens_seen": 251347120, + "step": 116355 + }, + { + "epoch": 18.982055464926592, + "grad_norm": 0.002170866122469306, + "learning_rate": 7.872979579049644e-06, + "loss": 0.0102, + "num_input_tokens_seen": 251357872, + "step": 116360 + }, + { + "epoch": 18.982871125611744, + "grad_norm": 0.0008006882853806019, + "learning_rate": 7.860402843482218e-06, + "loss": 0.0018, + "num_input_tokens_seen": 251368592, + "step": 116365 + }, + { + "epoch": 18.9836867862969, + "grad_norm": 0.023480042815208435, + "learning_rate": 7.847836081747939e-06, + "loss": 0.0038, + "num_input_tokens_seen": 251380112, + "step": 116370 + }, + { + "epoch": 18.984502446982056, + "grad_norm": 0.009893891401588917, + "learning_rate": 7.83527929410166e-06, + "loss": 0.0019, + "num_input_tokens_seen": 251392016, + "step": 116375 + }, + { + "epoch": 18.98531810766721, + "grad_norm": 0.004652200732380152, + "learning_rate": 7.822732480797734e-06, + "loss": 0.0044, + "num_input_tokens_seen": 251402384, + "step": 116380 + }, + { + "epoch": 18.986133768352367, + "grad_norm": 0.0024909900967031717, + "learning_rate": 7.810195642090568e-06, + "loss": 0.0038, + "num_input_tokens_seen": 251413136, + "step": 116385 + }, + { + "epoch": 18.98694942903752, + "grad_norm": 0.0004570172750391066, + "learning_rate": 7.797668778234179e-06, + "loss": 0.0031, + "num_input_tokens_seen": 251424240, + "step": 116390 + }, + { + "epoch": 18.987765089722675, + "grad_norm": 0.0004380632599350065, + "learning_rate": 7.785151889482422e-06, + "loss": 0.0006, + "num_input_tokens_seen": 251435472, + "step": 116395 + }, + { + "epoch": 18.98858075040783, + "grad_norm": 0.0006351694464683533, + "learning_rate": 7.772644976088982e-06, + "loss": 0.0012, + "num_input_tokens_seen": 251446768, + "step": 116400 + }, + { + "epoch": 18.989396411092986, + "grad_norm": 0.014859266579151154, + "learning_rate": 7.760148038307324e-06, + "loss": 0.0065, + "num_input_tokens_seen": 251457808, + "step": 116405 + }, + { + "epoch": 18.99021207177814, + "grad_norm": 0.004753198474645615, + "learning_rate": 7.747661076390688e-06, + "loss": 0.0008, + "num_input_tokens_seen": 251469936, + "step": 116410 + }, + { + "epoch": 18.991027732463294, + "grad_norm": 0.011917391791939735, + "learning_rate": 7.735184090592206e-06, + "loss": 0.0025, + "num_input_tokens_seen": 251480336, + "step": 116415 + }, + { + "epoch": 18.99184339314845, + "grad_norm": 0.0011199692962691188, + "learning_rate": 7.722717081164677e-06, + "loss": 0.0007, + "num_input_tokens_seen": 251490448, + "step": 116420 + }, + { + "epoch": 18.992659053833606, + "grad_norm": 0.0011024402920156717, + "learning_rate": 7.710260048360784e-06, + "loss": 0.0038, + "num_input_tokens_seen": 251501072, + "step": 116425 + }, + { + "epoch": 18.99347471451876, + "grad_norm": 0.006140429060906172, + "learning_rate": 7.697812992432996e-06, + "loss": 0.001, + "num_input_tokens_seen": 251513648, + "step": 116430 + }, + { + "epoch": 18.994290375203914, + "grad_norm": 0.06212307885289192, + "learning_rate": 7.685375913633607e-06, + "loss": 0.001, + "num_input_tokens_seen": 251523248, + "step": 116435 + }, + { + "epoch": 18.99510603588907, + "grad_norm": 0.02086738497018814, + "learning_rate": 7.67294881221453e-06, + "loss": 0.0007, + "num_input_tokens_seen": 251534256, + "step": 116440 + }, + { + "epoch": 18.995921696574225, + "grad_norm": 0.004113317932933569, + "learning_rate": 7.660531688427729e-06, + "loss": 0.0007, + "num_input_tokens_seen": 251543632, + "step": 116445 + }, + { + "epoch": 18.99673735725938, + "grad_norm": 0.0009120566537603736, + "learning_rate": 7.648124542524892e-06, + "loss": 0.0005, + "num_input_tokens_seen": 251554128, + "step": 116450 + }, + { + "epoch": 18.997553017944536, + "grad_norm": 0.06423819810152054, + "learning_rate": 7.635727374757318e-06, + "loss": 0.0035, + "num_input_tokens_seen": 251563728, + "step": 116455 + }, + { + "epoch": 18.99836867862969, + "grad_norm": 0.00021028223272878677, + "learning_rate": 7.623340185376415e-06, + "loss": 0.0014, + "num_input_tokens_seen": 251575088, + "step": 116460 + }, + { + "epoch": 18.999184339314844, + "grad_norm": 0.021248627454042435, + "learning_rate": 7.6109629746330955e-06, + "loss": 0.0011, + "num_input_tokens_seen": 251585328, + "step": 116465 + }, + { + "epoch": 19.0, + "grad_norm": 0.0254563819617033, + "learning_rate": 7.5985957427782695e-06, + "loss": 0.0019, + "num_input_tokens_seen": 251594480, + "step": 116470 + }, + { + "epoch": 19.0, + "eval_loss": 0.3440645635128021, + "eval_runtime": 103.9658, + "eval_samples_per_second": 26.211, + "eval_steps_per_second": 6.56, + "num_input_tokens_seen": 251594480, + "step": 116470 + }, + { + "epoch": 19.000815660685156, + "grad_norm": 0.001239095930941403, + "learning_rate": 7.5862384900625135e-06, + "loss": 0.0035, + "num_input_tokens_seen": 251605456, + "step": 116475 + }, + { + "epoch": 19.00163132137031, + "grad_norm": 0.0003935934801120311, + "learning_rate": 7.573891216736406e-06, + "loss": 0.0265, + "num_input_tokens_seen": 251616656, + "step": 116480 + }, + { + "epoch": 19.002446982055464, + "grad_norm": 0.0005455015343613923, + "learning_rate": 7.561553923049969e-06, + "loss": 0.0006, + "num_input_tokens_seen": 251627472, + "step": 116485 + }, + { + "epoch": 19.00326264274062, + "grad_norm": 0.00020809544366784394, + "learning_rate": 7.549226609253446e-06, + "loss": 0.0016, + "num_input_tokens_seen": 251638288, + "step": 116490 + }, + { + "epoch": 19.004078303425775, + "grad_norm": 0.12179408222436905, + "learning_rate": 7.536909275596471e-06, + "loss": 0.0041, + "num_input_tokens_seen": 251649104, + "step": 116495 + }, + { + "epoch": 19.00489396411093, + "grad_norm": 0.0023868621792644262, + "learning_rate": 7.524601922328844e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251659568, + "step": 116500 + }, + { + "epoch": 19.005709624796086, + "grad_norm": 0.001443555229343474, + "learning_rate": 7.512304549699811e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251670416, + "step": 116505 + }, + { + "epoch": 19.00652528548124, + "grad_norm": 0.05254826694726944, + "learning_rate": 7.500017157958838e-06, + "loss": 0.0148, + "num_input_tokens_seen": 251682224, + "step": 116510 + }, + { + "epoch": 19.007340946166394, + "grad_norm": 0.008038941770792007, + "learning_rate": 7.487739747354672e-06, + "loss": 0.0013, + "num_input_tokens_seen": 251693072, + "step": 116515 + }, + { + "epoch": 19.00815660685155, + "grad_norm": 0.0005029493477195501, + "learning_rate": 7.475472318136334e-06, + "loss": 0.0017, + "num_input_tokens_seen": 251703728, + "step": 116520 + }, + { + "epoch": 19.008972267536706, + "grad_norm": 0.025066372007131577, + "learning_rate": 7.4632148705522374e-06, + "loss": 0.0015, + "num_input_tokens_seen": 251715472, + "step": 116525 + }, + { + "epoch": 19.00978792822186, + "grad_norm": 0.012049240060150623, + "learning_rate": 7.450967404851017e-06, + "loss": 0.0009, + "num_input_tokens_seen": 251726032, + "step": 116530 + }, + { + "epoch": 19.010603588907014, + "grad_norm": 0.020044559612870216, + "learning_rate": 7.438729921280752e-06, + "loss": 0.0008, + "num_input_tokens_seen": 251736656, + "step": 116535 + }, + { + "epoch": 19.01141924959217, + "grad_norm": 0.007238124031573534, + "learning_rate": 7.42650242008952e-06, + "loss": 0.0008, + "num_input_tokens_seen": 251747216, + "step": 116540 + }, + { + "epoch": 19.012234910277325, + "grad_norm": 0.003442551242187619, + "learning_rate": 7.41428490152507e-06, + "loss": 0.0007, + "num_input_tokens_seen": 251757424, + "step": 116545 + }, + { + "epoch": 19.01305057096248, + "grad_norm": 0.0030905790627002716, + "learning_rate": 7.402077365835036e-06, + "loss": 0.0016, + "num_input_tokens_seen": 251769072, + "step": 116550 + }, + { + "epoch": 19.013866231647636, + "grad_norm": 0.006709754001349211, + "learning_rate": 7.389879813266831e-06, + "loss": 0.0004, + "num_input_tokens_seen": 251780240, + "step": 116555 + }, + { + "epoch": 19.01468189233279, + "grad_norm": 0.00024487529299221933, + "learning_rate": 7.377692244067591e-06, + "loss": 0.0009, + "num_input_tokens_seen": 251791344, + "step": 116560 + }, + { + "epoch": 19.015497553017944, + "grad_norm": 0.002104366896674037, + "learning_rate": 7.36551465848434e-06, + "loss": 0.0005, + "num_input_tokens_seen": 251803216, + "step": 116565 + }, + { + "epoch": 19.0163132137031, + "grad_norm": 0.00022835972777102143, + "learning_rate": 7.353347056763937e-06, + "loss": 0.0009, + "num_input_tokens_seen": 251814864, + "step": 116570 + }, + { + "epoch": 19.017128874388256, + "grad_norm": 0.0004103815299458802, + "learning_rate": 7.341189439152907e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251825680, + "step": 116575 + }, + { + "epoch": 19.017944535073408, + "grad_norm": 0.00012124201748520136, + "learning_rate": 7.329041805897551e-06, + "loss": 0.0027, + "num_input_tokens_seen": 251836016, + "step": 116580 + }, + { + "epoch": 19.018760195758563, + "grad_norm": 0.03281310573220253, + "learning_rate": 7.316904157244342e-06, + "loss": 0.002, + "num_input_tokens_seen": 251847536, + "step": 116585 + }, + { + "epoch": 19.01957585644372, + "grad_norm": 0.0035343714989721775, + "learning_rate": 7.304776493438914e-06, + "loss": 0.0004, + "num_input_tokens_seen": 251859440, + "step": 116590 + }, + { + "epoch": 19.020391517128875, + "grad_norm": 0.0038738809525966644, + "learning_rate": 7.2926588147273484e-06, + "loss": 0.002, + "num_input_tokens_seen": 251870352, + "step": 116595 + }, + { + "epoch": 19.02120717781403, + "grad_norm": 0.0016290287021547556, + "learning_rate": 7.280551121355005e-06, + "loss": 0.0005, + "num_input_tokens_seen": 251881584, + "step": 116600 + }, + { + "epoch": 19.022022838499183, + "grad_norm": 0.07534030079841614, + "learning_rate": 7.268453413567467e-06, + "loss": 0.0019, + "num_input_tokens_seen": 251891568, + "step": 116605 + }, + { + "epoch": 19.02283849918434, + "grad_norm": 0.5721881985664368, + "learning_rate": 7.256365691609645e-06, + "loss": 0.0873, + "num_input_tokens_seen": 251900272, + "step": 116610 + }, + { + "epoch": 19.023654159869494, + "grad_norm": 0.005474581383168697, + "learning_rate": 7.244287955726791e-06, + "loss": 0.1363, + "num_input_tokens_seen": 251911728, + "step": 116615 + }, + { + "epoch": 19.02446982055465, + "grad_norm": 0.0002741733333095908, + "learning_rate": 7.232220206163431e-06, + "loss": 0.0019, + "num_input_tokens_seen": 251922128, + "step": 116620 + }, + { + "epoch": 19.025285481239806, + "grad_norm": 0.008602812886238098, + "learning_rate": 7.220162443164369e-06, + "loss": 0.001, + "num_input_tokens_seen": 251932400, + "step": 116625 + }, + { + "epoch": 19.026101141924958, + "grad_norm": 0.0025156764313578606, + "learning_rate": 7.2081146669737416e-06, + "loss": 0.0005, + "num_input_tokens_seen": 251942416, + "step": 116630 + }, + { + "epoch": 19.026916802610113, + "grad_norm": 0.0007300799479708076, + "learning_rate": 7.196076877835911e-06, + "loss": 0.0058, + "num_input_tokens_seen": 251954064, + "step": 116635 + }, + { + "epoch": 19.02773246329527, + "grad_norm": 0.0036786433774977922, + "learning_rate": 7.1840490759946805e-06, + "loss": 0.0021, + "num_input_tokens_seen": 251964848, + "step": 116640 + }, + { + "epoch": 19.028548123980425, + "grad_norm": 0.0011658791918307543, + "learning_rate": 7.172031261693967e-06, + "loss": 0.0033, + "num_input_tokens_seen": 251974736, + "step": 116645 + }, + { + "epoch": 19.02936378466558, + "grad_norm": 0.001465832581743598, + "learning_rate": 7.160023435177132e-06, + "loss": 0.0008, + "num_input_tokens_seen": 251987024, + "step": 116650 + }, + { + "epoch": 19.030179445350733, + "grad_norm": 0.00024204261717386544, + "learning_rate": 7.148025596687702e-06, + "loss": 0.0222, + "num_input_tokens_seen": 251997648, + "step": 116655 + }, + { + "epoch": 19.03099510603589, + "grad_norm": 0.0005633268738165498, + "learning_rate": 7.136037746468704e-06, + "loss": 0.0016, + "num_input_tokens_seen": 252008784, + "step": 116660 + }, + { + "epoch": 19.031810766721044, + "grad_norm": 0.001010783831588924, + "learning_rate": 7.124059884763168e-06, + "loss": 0.0007, + "num_input_tokens_seen": 252018992, + "step": 116665 + }, + { + "epoch": 19.0326264274062, + "grad_norm": 0.008563019335269928, + "learning_rate": 7.112092011813842e-06, + "loss": 0.0015, + "num_input_tokens_seen": 252029328, + "step": 116670 + }, + { + "epoch": 19.033442088091356, + "grad_norm": 0.0015962064499035478, + "learning_rate": 7.1001341278632e-06, + "loss": 0.0013, + "num_input_tokens_seen": 252040144, + "step": 116675 + }, + { + "epoch": 19.034257748776508, + "grad_norm": 0.00032261203159578145, + "learning_rate": 7.08818623315366e-06, + "loss": 0.0025, + "num_input_tokens_seen": 252051472, + "step": 116680 + }, + { + "epoch": 19.035073409461663, + "grad_norm": 0.003594001056626439, + "learning_rate": 7.076248327927359e-06, + "loss": 0.0007, + "num_input_tokens_seen": 252062352, + "step": 116685 + }, + { + "epoch": 19.03588907014682, + "grad_norm": 0.005797537509351969, + "learning_rate": 7.064320412426162e-06, + "loss": 0.0077, + "num_input_tokens_seen": 252073904, + "step": 116690 + }, + { + "epoch": 19.036704730831975, + "grad_norm": 0.0007113271858543158, + "learning_rate": 7.052402486891818e-06, + "loss": 0.0006, + "num_input_tokens_seen": 252085264, + "step": 116695 + }, + { + "epoch": 19.03752039151713, + "grad_norm": 0.0007374466513283551, + "learning_rate": 7.040494551565912e-06, + "loss": 0.0025, + "num_input_tokens_seen": 252096240, + "step": 116700 + }, + { + "epoch": 19.038336052202283, + "grad_norm": 0.009614666923880577, + "learning_rate": 7.028596606689808e-06, + "loss": 0.0029, + "num_input_tokens_seen": 252106320, + "step": 116705 + }, + { + "epoch": 19.03915171288744, + "grad_norm": 0.005013478919863701, + "learning_rate": 7.016708652504477e-06, + "loss": 0.0006, + "num_input_tokens_seen": 252117136, + "step": 116710 + }, + { + "epoch": 19.039967373572594, + "grad_norm": 0.00930885411798954, + "learning_rate": 7.004830689251007e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252128464, + "step": 116715 + }, + { + "epoch": 19.04078303425775, + "grad_norm": 0.0016969816060736775, + "learning_rate": 6.992962717170038e-06, + "loss": 0.1444, + "num_input_tokens_seen": 252138576, + "step": 116720 + }, + { + "epoch": 19.041598694942905, + "grad_norm": 0.0009569272515363991, + "learning_rate": 6.981104736502042e-06, + "loss": 0.001, + "num_input_tokens_seen": 252150448, + "step": 116725 + }, + { + "epoch": 19.042414355628058, + "grad_norm": 0.00043876888230443, + "learning_rate": 6.969256747487496e-06, + "loss": 0.0009, + "num_input_tokens_seen": 252161424, + "step": 116730 + }, + { + "epoch": 19.043230016313213, + "grad_norm": 0.5943044424057007, + "learning_rate": 6.957418750366318e-06, + "loss": 0.0351, + "num_input_tokens_seen": 252173424, + "step": 116735 + }, + { + "epoch": 19.04404567699837, + "grad_norm": 0.024856556206941605, + "learning_rate": 6.945590745378594e-06, + "loss": 0.0016, + "num_input_tokens_seen": 252184656, + "step": 116740 + }, + { + "epoch": 19.044861337683525, + "grad_norm": 0.03356698527932167, + "learning_rate": 6.9337727327639096e-06, + "loss": 0.0014, + "num_input_tokens_seen": 252196880, + "step": 116745 + }, + { + "epoch": 19.045676998368677, + "grad_norm": 0.31922638416290283, + "learning_rate": 6.921964712761853e-06, + "loss": 0.0058, + "num_input_tokens_seen": 252206704, + "step": 116750 + }, + { + "epoch": 19.046492659053833, + "grad_norm": 0.004116969183087349, + "learning_rate": 6.910166685611674e-06, + "loss": 0.0012, + "num_input_tokens_seen": 252216912, + "step": 116755 + }, + { + "epoch": 19.04730831973899, + "grad_norm": 0.001806379295885563, + "learning_rate": 6.898378651552517e-06, + "loss": 0.0013, + "num_input_tokens_seen": 252227984, + "step": 116760 + }, + { + "epoch": 19.048123980424144, + "grad_norm": 0.02706415392458439, + "learning_rate": 6.88660061082319e-06, + "loss": 0.0032, + "num_input_tokens_seen": 252238128, + "step": 116765 + }, + { + "epoch": 19.0489396411093, + "grad_norm": 0.0019197918009012938, + "learning_rate": 6.874832563662559e-06, + "loss": 0.0013, + "num_input_tokens_seen": 252247024, + "step": 116770 + }, + { + "epoch": 19.049755301794452, + "grad_norm": 0.01021169126033783, + "learning_rate": 6.863074510308931e-06, + "loss": 0.0028, + "num_input_tokens_seen": 252258608, + "step": 116775 + }, + { + "epoch": 19.050570962479608, + "grad_norm": 0.06623980402946472, + "learning_rate": 6.851326451000783e-06, + "loss": 0.0032, + "num_input_tokens_seen": 252269136, + "step": 116780 + }, + { + "epoch": 19.051386623164763, + "grad_norm": 0.005052113905549049, + "learning_rate": 6.839588385976036e-06, + "loss": 0.0018, + "num_input_tokens_seen": 252278928, + "step": 116785 + }, + { + "epoch": 19.05220228384992, + "grad_norm": 0.02421603351831436, + "learning_rate": 6.827860315472667e-06, + "loss": 0.0016, + "num_input_tokens_seen": 252289040, + "step": 116790 + }, + { + "epoch": 19.053017944535075, + "grad_norm": 0.0010228599421679974, + "learning_rate": 6.816142239728373e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252297680, + "step": 116795 + }, + { + "epoch": 19.053833605220227, + "grad_norm": 0.013187268748879433, + "learning_rate": 6.804434158980577e-06, + "loss": 0.0033, + "num_input_tokens_seen": 252308176, + "step": 116800 + }, + { + "epoch": 19.054649265905383, + "grad_norm": 0.0018919931026175618, + "learning_rate": 6.792736073466587e-06, + "loss": 0.0012, + "num_input_tokens_seen": 252319024, + "step": 116805 + }, + { + "epoch": 19.05546492659054, + "grad_norm": 0.004553182981908321, + "learning_rate": 6.781047983423439e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252329360, + "step": 116810 + }, + { + "epoch": 19.056280587275694, + "grad_norm": 0.005015052855014801, + "learning_rate": 6.769369889088106e-06, + "loss": 0.0007, + "num_input_tokens_seen": 252338800, + "step": 116815 + }, + { + "epoch": 19.05709624796085, + "grad_norm": 0.002011285861954093, + "learning_rate": 6.75770179069718e-06, + "loss": 0.003, + "num_input_tokens_seen": 252348848, + "step": 116820 + }, + { + "epoch": 19.057911908646002, + "grad_norm": 0.0005088383913971484, + "learning_rate": 6.746043688487136e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252360048, + "step": 116825 + }, + { + "epoch": 19.058727569331158, + "grad_norm": 0.027116188779473305, + "learning_rate": 6.734395582694286e-06, + "loss": 0.0039, + "num_input_tokens_seen": 252371472, + "step": 116830 + }, + { + "epoch": 19.059543230016313, + "grad_norm": 0.030280839651823044, + "learning_rate": 6.722757473554608e-06, + "loss": 0.0017, + "num_input_tokens_seen": 252381520, + "step": 116835 + }, + { + "epoch": 19.06035889070147, + "grad_norm": 0.004732145462185144, + "learning_rate": 6.71112936130408e-06, + "loss": 0.0027, + "num_input_tokens_seen": 252391536, + "step": 116840 + }, + { + "epoch": 19.061174551386625, + "grad_norm": 0.025895684957504272, + "learning_rate": 6.6995112461782355e-06, + "loss": 0.0015, + "num_input_tokens_seen": 252402416, + "step": 116845 + }, + { + "epoch": 19.061990212071777, + "grad_norm": 0.00530358636751771, + "learning_rate": 6.6879031284126646e-06, + "loss": 0.0046, + "num_input_tokens_seen": 252413008, + "step": 116850 + }, + { + "epoch": 19.062805872756933, + "grad_norm": 0.0013073545414954424, + "learning_rate": 6.676305008242512e-06, + "loss": 0.0017, + "num_input_tokens_seen": 252424848, + "step": 116855 + }, + { + "epoch": 19.063621533442088, + "grad_norm": 0.00042659181053750217, + "learning_rate": 6.664716885902811e-06, + "loss": 0.0022, + "num_input_tokens_seen": 252435120, + "step": 116860 + }, + { + "epoch": 19.064437194127244, + "grad_norm": 0.001548528904095292, + "learning_rate": 6.653138761628541e-06, + "loss": 0.0036, + "num_input_tokens_seen": 252446320, + "step": 116865 + }, + { + "epoch": 19.0652528548124, + "grad_norm": 0.04929887875914574, + "learning_rate": 6.641570635654182e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252456944, + "step": 116870 + }, + { + "epoch": 19.06606851549755, + "grad_norm": 0.0002772485022433102, + "learning_rate": 6.630012508214322e-06, + "loss": 0.005, + "num_input_tokens_seen": 252466736, + "step": 116875 + }, + { + "epoch": 19.066884176182707, + "grad_norm": 0.00854497030377388, + "learning_rate": 6.618464379543166e-06, + "loss": 0.0027, + "num_input_tokens_seen": 252477328, + "step": 116880 + }, + { + "epoch": 19.067699836867863, + "grad_norm": 0.0005079619586467743, + "learning_rate": 6.6069262498746895e-06, + "loss": 0.0028, + "num_input_tokens_seen": 252487664, + "step": 116885 + }, + { + "epoch": 19.06851549755302, + "grad_norm": 0.003630138235166669, + "learning_rate": 6.595398119442764e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252499248, + "step": 116890 + }, + { + "epoch": 19.069331158238175, + "grad_norm": 0.004067980218678713, + "learning_rate": 6.583879988481034e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252509584, + "step": 116895 + }, + { + "epoch": 19.070146818923327, + "grad_norm": 0.0006684943800792098, + "learning_rate": 6.572371857222925e-06, + "loss": 0.0007, + "num_input_tokens_seen": 252520816, + "step": 116900 + }, + { + "epoch": 19.070962479608482, + "grad_norm": 0.00023432802117895335, + "learning_rate": 6.560873725901695e-06, + "loss": 0.0021, + "num_input_tokens_seen": 252532336, + "step": 116905 + }, + { + "epoch": 19.071778140293638, + "grad_norm": 0.002191467909142375, + "learning_rate": 6.5493855947502674e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252543344, + "step": 116910 + }, + { + "epoch": 19.072593800978794, + "grad_norm": 0.014515231363475323, + "learning_rate": 6.537907464001569e-06, + "loss": 0.0007, + "num_input_tokens_seen": 252553968, + "step": 116915 + }, + { + "epoch": 19.07340946166395, + "grad_norm": 0.004022237379103899, + "learning_rate": 6.5264393338881345e-06, + "loss": 0.0005, + "num_input_tokens_seen": 252564368, + "step": 116920 + }, + { + "epoch": 19.0742251223491, + "grad_norm": 0.002793958643451333, + "learning_rate": 6.514981204642445e-06, + "loss": 0.0009, + "num_input_tokens_seen": 252576304, + "step": 116925 + }, + { + "epoch": 19.075040783034257, + "grad_norm": 0.08884984999895096, + "learning_rate": 6.503533076496704e-06, + "loss": 0.0028, + "num_input_tokens_seen": 252586864, + "step": 116930 + }, + { + "epoch": 19.075856443719413, + "grad_norm": 0.0009011936490423977, + "learning_rate": 6.492094949682892e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252597360, + "step": 116935 + }, + { + "epoch": 19.07667210440457, + "grad_norm": 0.0028816265985369682, + "learning_rate": 6.480666824432879e-06, + "loss": 0.0006, + "num_input_tokens_seen": 252607888, + "step": 116940 + }, + { + "epoch": 19.07748776508972, + "grad_norm": 0.0019338749116286635, + "learning_rate": 6.469248700978148e-06, + "loss": 0.0009, + "num_input_tokens_seen": 252619056, + "step": 116945 + }, + { + "epoch": 19.078303425774877, + "grad_norm": 0.001564970356412232, + "learning_rate": 6.457840579550234e-06, + "loss": 0.0005, + "num_input_tokens_seen": 252629776, + "step": 116950 + }, + { + "epoch": 19.079119086460032, + "grad_norm": 0.0020377058535814285, + "learning_rate": 6.4464424603802865e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252641008, + "step": 116955 + }, + { + "epoch": 19.079934747145188, + "grad_norm": 0.04820120707154274, + "learning_rate": 6.435054343699287e-06, + "loss": 0.0025, + "num_input_tokens_seen": 252651408, + "step": 116960 + }, + { + "epoch": 19.080750407830344, + "grad_norm": 0.01028536818921566, + "learning_rate": 6.423676229738051e-06, + "loss": 0.001, + "num_input_tokens_seen": 252662160, + "step": 116965 + }, + { + "epoch": 19.081566068515496, + "grad_norm": 0.00021261714573483914, + "learning_rate": 6.412308118727117e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252673456, + "step": 116970 + }, + { + "epoch": 19.08238172920065, + "grad_norm": 0.005654108710587025, + "learning_rate": 6.400950010896966e-06, + "loss": 0.0016, + "num_input_tokens_seen": 252686000, + "step": 116975 + }, + { + "epoch": 19.083197389885807, + "grad_norm": 0.001454255892895162, + "learning_rate": 6.389601906477693e-06, + "loss": 0.0014, + "num_input_tokens_seen": 252697776, + "step": 116980 + }, + { + "epoch": 19.084013050570963, + "grad_norm": 0.0011134854285046458, + "learning_rate": 6.378263805699391e-06, + "loss": 0.0005, + "num_input_tokens_seen": 252708272, + "step": 116985 + }, + { + "epoch": 19.08482871125612, + "grad_norm": 0.015325321815907955, + "learning_rate": 6.36693570879171e-06, + "loss": 0.0019, + "num_input_tokens_seen": 252718416, + "step": 116990 + }, + { + "epoch": 19.08564437194127, + "grad_norm": 0.0018553230911493301, + "learning_rate": 6.355617615984355e-06, + "loss": 0.0018, + "num_input_tokens_seen": 252729968, + "step": 116995 + }, + { + "epoch": 19.086460032626427, + "grad_norm": 0.001695129438303411, + "learning_rate": 6.344309527506587e-06, + "loss": 0.0021, + "num_input_tokens_seen": 252740400, + "step": 117000 + }, + { + "epoch": 19.087275693311582, + "grad_norm": 0.01279063243418932, + "learning_rate": 6.333011443587722e-06, + "loss": 0.0022, + "num_input_tokens_seen": 252750992, + "step": 117005 + }, + { + "epoch": 19.088091353996738, + "grad_norm": 0.0003366192686371505, + "learning_rate": 6.3217233644565216e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252762224, + "step": 117010 + }, + { + "epoch": 19.088907014681894, + "grad_norm": 0.04421667009592056, + "learning_rate": 6.3104452903419704e-06, + "loss": 0.0045, + "num_input_tokens_seen": 252773872, + "step": 117015 + }, + { + "epoch": 19.089722675367046, + "grad_norm": 0.0018370678881183267, + "learning_rate": 6.299177221472496e-06, + "loss": 0.0012, + "num_input_tokens_seen": 252783792, + "step": 117020 + }, + { + "epoch": 19.0905383360522, + "grad_norm": 0.07757820188999176, + "learning_rate": 6.287919158076472e-06, + "loss": 0.0023, + "num_input_tokens_seen": 252794320, + "step": 117025 + }, + { + "epoch": 19.091353996737357, + "grad_norm": 0.00017441553063690662, + "learning_rate": 6.2766711003821035e-06, + "loss": 0.0049, + "num_input_tokens_seen": 252805552, + "step": 117030 + }, + { + "epoch": 19.092169657422513, + "grad_norm": 0.0007293216185644269, + "learning_rate": 6.265433048617375e-06, + "loss": 0.0707, + "num_input_tokens_seen": 252817648, + "step": 117035 + }, + { + "epoch": 19.09298531810767, + "grad_norm": 0.0004739946161862463, + "learning_rate": 6.254205003009938e-06, + "loss": 0.0035, + "num_input_tokens_seen": 252827408, + "step": 117040 + }, + { + "epoch": 19.09380097879282, + "grad_norm": 0.013173624873161316, + "learning_rate": 6.242986963787445e-06, + "loss": 0.0009, + "num_input_tokens_seen": 252838224, + "step": 117045 + }, + { + "epoch": 19.094616639477977, + "grad_norm": 0.0021853481885045767, + "learning_rate": 6.231778931177157e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252849648, + "step": 117050 + }, + { + "epoch": 19.095432300163132, + "grad_norm": 0.0002483553544152528, + "learning_rate": 6.220580905406226e-06, + "loss": 0.0011, + "num_input_tokens_seen": 252860816, + "step": 117055 + }, + { + "epoch": 19.096247960848288, + "grad_norm": 0.006167882587760687, + "learning_rate": 6.209392886701692e-06, + "loss": 0.0066, + "num_input_tokens_seen": 252871088, + "step": 117060 + }, + { + "epoch": 19.097063621533444, + "grad_norm": 0.013794321566820145, + "learning_rate": 6.198214875290209e-06, + "loss": 0.001, + "num_input_tokens_seen": 252880816, + "step": 117065 + }, + { + "epoch": 19.097879282218596, + "grad_norm": 0.0014956948580220342, + "learning_rate": 6.187046871398316e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252891472, + "step": 117070 + }, + { + "epoch": 19.09869494290375, + "grad_norm": 0.044382814317941666, + "learning_rate": 6.175888875252389e-06, + "loss": 0.0022, + "num_input_tokens_seen": 252902064, + "step": 117075 + }, + { + "epoch": 19.099510603588907, + "grad_norm": 0.003115827450528741, + "learning_rate": 6.1647408870785236e-06, + "loss": 0.0006, + "num_input_tokens_seen": 252912272, + "step": 117080 + }, + { + "epoch": 19.100326264274063, + "grad_norm": 0.02964218147099018, + "learning_rate": 6.1536029071025955e-06, + "loss": 0.0044, + "num_input_tokens_seen": 252923664, + "step": 117085 + }, + { + "epoch": 19.10114192495922, + "grad_norm": 0.5666136741638184, + "learning_rate": 6.142474935550535e-06, + "loss": 0.0195, + "num_input_tokens_seen": 252934128, + "step": 117090 + }, + { + "epoch": 19.10195758564437, + "grad_norm": 0.011689902283251286, + "learning_rate": 6.131356972647606e-06, + "loss": 0.0006, + "num_input_tokens_seen": 252944752, + "step": 117095 + }, + { + "epoch": 19.102773246329527, + "grad_norm": 0.0005122054717503488, + "learning_rate": 6.120249018619295e-06, + "loss": 0.0049, + "num_input_tokens_seen": 252955152, + "step": 117100 + }, + { + "epoch": 19.103588907014682, + "grad_norm": 0.0004246353928465396, + "learning_rate": 6.109151073690644e-06, + "loss": 0.0035, + "num_input_tokens_seen": 252966928, + "step": 117105 + }, + { + "epoch": 19.104404567699838, + "grad_norm": 0.000601739389821887, + "learning_rate": 6.0980631380866405e-06, + "loss": 0.0015, + "num_input_tokens_seen": 252977040, + "step": 117110 + }, + { + "epoch": 19.10522022838499, + "grad_norm": 0.007016733754426241, + "learning_rate": 6.086985212031881e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252988688, + "step": 117115 + }, + { + "epoch": 19.106035889070146, + "grad_norm": 0.0003089867241214961, + "learning_rate": 6.075917295750965e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252999824, + "step": 117120 + }, + { + "epoch": 19.1068515497553, + "grad_norm": 0.0007841411279514432, + "learning_rate": 6.064859389468158e-06, + "loss": 0.002, + "num_input_tokens_seen": 253010544, + "step": 117125 + }, + { + "epoch": 19.107667210440457, + "grad_norm": 0.006227858830243349, + "learning_rate": 6.053811493407613e-06, + "loss": 0.0294, + "num_input_tokens_seen": 253021456, + "step": 117130 + }, + { + "epoch": 19.108482871125613, + "grad_norm": 0.0027969330549240112, + "learning_rate": 6.04277360779315e-06, + "loss": 0.0867, + "num_input_tokens_seen": 253031632, + "step": 117135 + }, + { + "epoch": 19.109298531810765, + "grad_norm": 0.012334640137851238, + "learning_rate": 6.031745732848593e-06, + "loss": 0.0036, + "num_input_tokens_seen": 253043088, + "step": 117140 + }, + { + "epoch": 19.11011419249592, + "grad_norm": 0.0017199370777234435, + "learning_rate": 6.02072786879726e-06, + "loss": 0.0014, + "num_input_tokens_seen": 253055152, + "step": 117145 + }, + { + "epoch": 19.110929853181077, + "grad_norm": 0.002734170528128743, + "learning_rate": 6.009720015862585e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253065968, + "step": 117150 + }, + { + "epoch": 19.111745513866232, + "grad_norm": 0.01975974440574646, + "learning_rate": 5.9987221742675566e-06, + "loss": 0.0032, + "num_input_tokens_seen": 253078416, + "step": 117155 + }, + { + "epoch": 19.112561174551388, + "grad_norm": 0.003430173732340336, + "learning_rate": 5.987734344235107e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253089456, + "step": 117160 + }, + { + "epoch": 19.11337683523654, + "grad_norm": 0.04491540789604187, + "learning_rate": 5.976756525987948e-06, + "loss": 0.003, + "num_input_tokens_seen": 253100880, + "step": 117165 + }, + { + "epoch": 19.114192495921696, + "grad_norm": 0.006507876794785261, + "learning_rate": 5.965788719748566e-06, + "loss": 0.0048, + "num_input_tokens_seen": 253111280, + "step": 117170 + }, + { + "epoch": 19.11500815660685, + "grad_norm": 0.5028917789459229, + "learning_rate": 5.954830925739174e-06, + "loss": 0.0135, + "num_input_tokens_seen": 253121424, + "step": 117175 + }, + { + "epoch": 19.115823817292007, + "grad_norm": 0.004586064722388983, + "learning_rate": 5.943883144181872e-06, + "loss": 0.0013, + "num_input_tokens_seen": 253131920, + "step": 117180 + }, + { + "epoch": 19.116639477977163, + "grad_norm": 0.006693511735647917, + "learning_rate": 5.932945375298537e-06, + "loss": 0.0064, + "num_input_tokens_seen": 253142416, + "step": 117185 + }, + { + "epoch": 19.117455138662315, + "grad_norm": 0.0018724793335422873, + "learning_rate": 5.922017619310826e-06, + "loss": 0.0028, + "num_input_tokens_seen": 253152048, + "step": 117190 + }, + { + "epoch": 19.11827079934747, + "grad_norm": 0.02574349008500576, + "learning_rate": 5.911099876440173e-06, + "loss": 0.0024, + "num_input_tokens_seen": 253162480, + "step": 117195 + }, + { + "epoch": 19.119086460032626, + "grad_norm": 0.024611355736851692, + "learning_rate": 5.900192146907957e-06, + "loss": 0.0021, + "num_input_tokens_seen": 253172304, + "step": 117200 + }, + { + "epoch": 19.119902120717782, + "grad_norm": 0.0015057043638080359, + "learning_rate": 5.889294430935111e-06, + "loss": 0.021, + "num_input_tokens_seen": 253182832, + "step": 117205 + }, + { + "epoch": 19.120717781402938, + "grad_norm": 0.01581161841750145, + "learning_rate": 5.8784067287424584e-06, + "loss": 0.0009, + "num_input_tokens_seen": 253193776, + "step": 117210 + }, + { + "epoch": 19.12153344208809, + "grad_norm": 0.0052831522189080715, + "learning_rate": 5.8675290405508785e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253203792, + "step": 117215 + }, + { + "epoch": 19.122349102773246, + "grad_norm": 0.0007881993660703301, + "learning_rate": 5.856661366580584e-06, + "loss": 0.001, + "num_input_tokens_seen": 253213776, + "step": 117220 + }, + { + "epoch": 19.1231647634584, + "grad_norm": 0.03156473860144615, + "learning_rate": 5.845803707051955e-06, + "loss": 0.0077, + "num_input_tokens_seen": 253223984, + "step": 117225 + }, + { + "epoch": 19.123980424143557, + "grad_norm": 0.0015464631142094731, + "learning_rate": 5.834956062184926e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253233744, + "step": 117230 + }, + { + "epoch": 19.124796084828713, + "grad_norm": 0.0007315798429772258, + "learning_rate": 5.824118432199488e-06, + "loss": 0.0012, + "num_input_tokens_seen": 253243984, + "step": 117235 + }, + { + "epoch": 19.125611745513865, + "grad_norm": 0.000989689608104527, + "learning_rate": 5.813290817315131e-06, + "loss": 0.0033, + "num_input_tokens_seen": 253256176, + "step": 117240 + }, + { + "epoch": 19.12642740619902, + "grad_norm": 0.8543770909309387, + "learning_rate": 5.8024732177514585e-06, + "loss": 0.0732, + "num_input_tokens_seen": 253265616, + "step": 117245 + }, + { + "epoch": 19.127243066884176, + "grad_norm": 0.6602007150650024, + "learning_rate": 5.791665633727461e-06, + "loss": 0.0358, + "num_input_tokens_seen": 253276688, + "step": 117250 + }, + { + "epoch": 19.128058727569332, + "grad_norm": 0.6810281872749329, + "learning_rate": 5.780868065462408e-06, + "loss": 0.0384, + "num_input_tokens_seen": 253287792, + "step": 117255 + }, + { + "epoch": 19.128874388254488, + "grad_norm": 0.0007883647922426462, + "learning_rate": 5.770080513174958e-06, + "loss": 0.0013, + "num_input_tokens_seen": 253298512, + "step": 117260 + }, + { + "epoch": 19.12969004893964, + "grad_norm": 0.002425673883408308, + "learning_rate": 5.75930297708388e-06, + "loss": 0.0028, + "num_input_tokens_seen": 253309424, + "step": 117265 + }, + { + "epoch": 19.130505709624796, + "grad_norm": 0.0005828720168210566, + "learning_rate": 5.748535457407444e-06, + "loss": 0.0017, + "num_input_tokens_seen": 253319696, + "step": 117270 + }, + { + "epoch": 19.13132137030995, + "grad_norm": 0.001128783798776567, + "learning_rate": 5.737777954364032e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253330192, + "step": 117275 + }, + { + "epoch": 19.132137030995107, + "grad_norm": 0.0017047654837369919, + "learning_rate": 5.727030468171468e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253341264, + "step": 117280 + }, + { + "epoch": 19.13295269168026, + "grad_norm": 0.049430977553129196, + "learning_rate": 5.71629299904769e-06, + "loss": 0.0017, + "num_input_tokens_seen": 253352368, + "step": 117285 + }, + { + "epoch": 19.133768352365415, + "grad_norm": 0.40919792652130127, + "learning_rate": 5.705565547210301e-06, + "loss": 0.0129, + "num_input_tokens_seen": 253364208, + "step": 117290 + }, + { + "epoch": 19.13458401305057, + "grad_norm": 0.001255987910553813, + "learning_rate": 5.694848112876683e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253375440, + "step": 117295 + }, + { + "epoch": 19.135399673735726, + "grad_norm": 0.0008938516257330775, + "learning_rate": 5.684140696263995e-06, + "loss": 0.0008, + "num_input_tokens_seen": 253385360, + "step": 117300 + }, + { + "epoch": 19.136215334420882, + "grad_norm": 0.001965533709153533, + "learning_rate": 5.673443297589287e-06, + "loss": 0.0011, + "num_input_tokens_seen": 253396016, + "step": 117305 + }, + { + "epoch": 19.137030995106034, + "grad_norm": 0.03859832137823105, + "learning_rate": 5.662755917069384e-06, + "loss": 0.0035, + "num_input_tokens_seen": 253407152, + "step": 117310 + }, + { + "epoch": 19.13784665579119, + "grad_norm": 0.006483915261924267, + "learning_rate": 5.652078554920836e-06, + "loss": 0.0009, + "num_input_tokens_seen": 253417136, + "step": 117315 + }, + { + "epoch": 19.138662316476346, + "grad_norm": 0.0037106431555002928, + "learning_rate": 5.6414112113600254e-06, + "loss": 0.0051, + "num_input_tokens_seen": 253427376, + "step": 117320 + }, + { + "epoch": 19.1394779771615, + "grad_norm": 0.00029239041032269597, + "learning_rate": 5.630753886603168e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253437232, + "step": 117325 + }, + { + "epoch": 19.140293637846657, + "grad_norm": 0.007669747807085514, + "learning_rate": 5.6201065808662025e-06, + "loss": 0.0008, + "num_input_tokens_seen": 253448688, + "step": 117330 + }, + { + "epoch": 19.14110929853181, + "grad_norm": 0.004348631016910076, + "learning_rate": 5.609469294364955e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253459248, + "step": 117335 + }, + { + "epoch": 19.141924959216965, + "grad_norm": 0.0015068600187078118, + "learning_rate": 5.598842027315032e-06, + "loss": 0.0032, + "num_input_tokens_seen": 253470960, + "step": 117340 + }, + { + "epoch": 19.14274061990212, + "grad_norm": 0.0020891670137643814, + "learning_rate": 5.588224779931761e-06, + "loss": 0.0015, + "num_input_tokens_seen": 253480528, + "step": 117345 + }, + { + "epoch": 19.143556280587276, + "grad_norm": 0.004071689676493406, + "learning_rate": 5.577617552430303e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253491632, + "step": 117350 + }, + { + "epoch": 19.144371941272432, + "grad_norm": 0.0002320687344763428, + "learning_rate": 5.567020345025597e-06, + "loss": 0.0009, + "num_input_tokens_seen": 253502128, + "step": 117355 + }, + { + "epoch": 19.145187601957584, + "grad_norm": 0.5973049402236938, + "learning_rate": 5.556433157932528e-06, + "loss": 0.0255, + "num_input_tokens_seen": 253512880, + "step": 117360 + }, + { + "epoch": 19.14600326264274, + "grad_norm": 0.013899151235818863, + "learning_rate": 5.5458559913655335e-06, + "loss": 0.0009, + "num_input_tokens_seen": 253524656, + "step": 117365 + }, + { + "epoch": 19.146818923327896, + "grad_norm": 0.0036639608442783356, + "learning_rate": 5.5352888455390546e-06, + "loss": 0.0005, + "num_input_tokens_seen": 253535088, + "step": 117370 + }, + { + "epoch": 19.14763458401305, + "grad_norm": 0.00020154824596829712, + "learning_rate": 5.524731720667197e-06, + "loss": 0.0011, + "num_input_tokens_seen": 253545552, + "step": 117375 + }, + { + "epoch": 19.148450244698207, + "grad_norm": 0.0019946058746427298, + "learning_rate": 5.514184616964013e-06, + "loss": 0.0009, + "num_input_tokens_seen": 253556144, + "step": 117380 + }, + { + "epoch": 19.14926590538336, + "grad_norm": 0.0249673742800951, + "learning_rate": 5.503647534643108e-06, + "loss": 0.0017, + "num_input_tokens_seen": 253567920, + "step": 117385 + }, + { + "epoch": 19.150081566068515, + "grad_norm": 0.008728111162781715, + "learning_rate": 5.493120473918145e-06, + "loss": 0.0319, + "num_input_tokens_seen": 253579600, + "step": 117390 + }, + { + "epoch": 19.15089722675367, + "grad_norm": 0.0003968800010625273, + "learning_rate": 5.4826034350023426e-06, + "loss": 0.001, + "num_input_tokens_seen": 253589008, + "step": 117395 + }, + { + "epoch": 19.151712887438826, + "grad_norm": 0.00017136444512289017, + "learning_rate": 5.472096418108974e-06, + "loss": 0.0057, + "num_input_tokens_seen": 253598960, + "step": 117400 + }, + { + "epoch": 19.152528548123982, + "grad_norm": 0.004832593258470297, + "learning_rate": 5.461599423450924e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253609360, + "step": 117405 + }, + { + "epoch": 19.153344208809134, + "grad_norm": 0.0034371260553598404, + "learning_rate": 5.451112451240914e-06, + "loss": 0.001, + "num_input_tokens_seen": 253621616, + "step": 117410 + }, + { + "epoch": 19.15415986949429, + "grad_norm": 0.007875418290495872, + "learning_rate": 5.440635501691493e-06, + "loss": 0.0019, + "num_input_tokens_seen": 253632848, + "step": 117415 + }, + { + "epoch": 19.154975530179446, + "grad_norm": 0.036121610552072525, + "learning_rate": 5.4301685750149935e-06, + "loss": 0.0076, + "num_input_tokens_seen": 253642800, + "step": 117420 + }, + { + "epoch": 19.1557911908646, + "grad_norm": 0.0009760346729308367, + "learning_rate": 5.419711671423577e-06, + "loss": 0.0019, + "num_input_tokens_seen": 253654544, + "step": 117425 + }, + { + "epoch": 19.156606851549757, + "grad_norm": 0.002728499239310622, + "learning_rate": 5.409264791129076e-06, + "loss": 0.0021, + "num_input_tokens_seen": 253666000, + "step": 117430 + }, + { + "epoch": 19.15742251223491, + "grad_norm": 0.002243755152449012, + "learning_rate": 5.398827934343264e-06, + "loss": 0.0018, + "num_input_tokens_seen": 253675504, + "step": 117435 + }, + { + "epoch": 19.158238172920065, + "grad_norm": 0.07680145651102066, + "learning_rate": 5.38840110127764e-06, + "loss": 0.0041, + "num_input_tokens_seen": 253686768, + "step": 117440 + }, + { + "epoch": 19.15905383360522, + "grad_norm": 0.0037976547610014677, + "learning_rate": 5.377984292143534e-06, + "loss": 0.0013, + "num_input_tokens_seen": 253697520, + "step": 117445 + }, + { + "epoch": 19.159869494290376, + "grad_norm": 0.0003936160064768046, + "learning_rate": 5.367577507152055e-06, + "loss": 0.0015, + "num_input_tokens_seen": 253705776, + "step": 117450 + }, + { + "epoch": 19.160685154975532, + "grad_norm": 0.03844565153121948, + "learning_rate": 5.35718074651409e-06, + "loss": 0.002, + "num_input_tokens_seen": 253716752, + "step": 117455 + }, + { + "epoch": 19.161500815660684, + "grad_norm": 0.03414055332541466, + "learning_rate": 5.346794010440359e-06, + "loss": 0.0018, + "num_input_tokens_seen": 253727760, + "step": 117460 + }, + { + "epoch": 19.16231647634584, + "grad_norm": 0.001296155620366335, + "learning_rate": 5.336417299141361e-06, + "loss": 0.0038, + "num_input_tokens_seen": 253738992, + "step": 117465 + }, + { + "epoch": 19.163132137030995, + "grad_norm": 0.0953516960144043, + "learning_rate": 5.326050612827426e-06, + "loss": 0.0021, + "num_input_tokens_seen": 253749040, + "step": 117470 + }, + { + "epoch": 19.16394779771615, + "grad_norm": 0.00608966825529933, + "learning_rate": 5.315693951708555e-06, + "loss": 0.0012, + "num_input_tokens_seen": 253759856, + "step": 117475 + }, + { + "epoch": 19.164763458401303, + "grad_norm": 0.020399967208504677, + "learning_rate": 5.305347315994747e-06, + "loss": 0.0021, + "num_input_tokens_seen": 253769648, + "step": 117480 + }, + { + "epoch": 19.16557911908646, + "grad_norm": 0.004619190003722906, + "learning_rate": 5.295010705895609e-06, + "loss": 0.005, + "num_input_tokens_seen": 253780624, + "step": 117485 + }, + { + "epoch": 19.166394779771615, + "grad_norm": 0.0010359683074057102, + "learning_rate": 5.284684121620697e-06, + "loss": 0.0019, + "num_input_tokens_seen": 253791600, + "step": 117490 + }, + { + "epoch": 19.16721044045677, + "grad_norm": 0.0013771315570920706, + "learning_rate": 5.2743675633792345e-06, + "loss": 0.0038, + "num_input_tokens_seen": 253801584, + "step": 117495 + }, + { + "epoch": 19.168026101141926, + "grad_norm": 0.0016533228335902095, + "learning_rate": 5.264061031380274e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253813456, + "step": 117500 + }, + { + "epoch": 19.16884176182708, + "grad_norm": 0.004072641488164663, + "learning_rate": 5.253764525832761e-06, + "loss": 0.0019, + "num_input_tokens_seen": 253824464, + "step": 117505 + }, + { + "epoch": 19.169657422512234, + "grad_norm": 0.028276223689317703, + "learning_rate": 5.243478046945305e-06, + "loss": 0.0015, + "num_input_tokens_seen": 253835344, + "step": 117510 + }, + { + "epoch": 19.17047308319739, + "grad_norm": 0.002490977058187127, + "learning_rate": 5.233201594926462e-06, + "loss": 0.0008, + "num_input_tokens_seen": 253845360, + "step": 117515 + }, + { + "epoch": 19.171288743882545, + "grad_norm": 0.018174799159169197, + "learning_rate": 5.222935169984455e-06, + "loss": 0.0058, + "num_input_tokens_seen": 253856880, + "step": 117520 + }, + { + "epoch": 19.1721044045677, + "grad_norm": 0.00024710557772777975, + "learning_rate": 5.212678772327284e-06, + "loss": 0.0008, + "num_input_tokens_seen": 253867088, + "step": 117525 + }, + { + "epoch": 19.172920065252853, + "grad_norm": 0.0025143155362457037, + "learning_rate": 5.202432402162893e-06, + "loss": 0.0562, + "num_input_tokens_seen": 253878608, + "step": 117530 + }, + { + "epoch": 19.17373572593801, + "grad_norm": 0.08865071088075638, + "learning_rate": 5.192196059698895e-06, + "loss": 0.0031, + "num_input_tokens_seen": 253886896, + "step": 117535 + }, + { + "epoch": 19.174551386623165, + "grad_norm": 0.0015286827692762017, + "learning_rate": 5.18196974514279e-06, + "loss": 0.0027, + "num_input_tokens_seen": 253898352, + "step": 117540 + }, + { + "epoch": 19.17536704730832, + "grad_norm": 0.0010672721546143293, + "learning_rate": 5.1717534587017445e-06, + "loss": 0.002, + "num_input_tokens_seen": 253910448, + "step": 117545 + }, + { + "epoch": 19.176182707993476, + "grad_norm": 0.12418833374977112, + "learning_rate": 5.161547200582872e-06, + "loss": 0.0031, + "num_input_tokens_seen": 253921008, + "step": 117550 + }, + { + "epoch": 19.17699836867863, + "grad_norm": 0.00027891527861356735, + "learning_rate": 5.151350970993007e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253931600, + "step": 117555 + }, + { + "epoch": 19.177814029363784, + "grad_norm": 0.00248112459667027, + "learning_rate": 5.141164770138707e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253942512, + "step": 117560 + }, + { + "epoch": 19.17862969004894, + "grad_norm": 0.0024953444954007864, + "learning_rate": 5.130988598226527e-06, + "loss": 0.0013, + "num_input_tokens_seen": 253954032, + "step": 117565 + }, + { + "epoch": 19.179445350734095, + "grad_norm": 0.021138276904821396, + "learning_rate": 5.120822455462637e-06, + "loss": 0.0049, + "num_input_tokens_seen": 253965136, + "step": 117570 + }, + { + "epoch": 19.18026101141925, + "grad_norm": 0.001367824850603938, + "learning_rate": 5.110666342053094e-06, + "loss": 0.0119, + "num_input_tokens_seen": 253975408, + "step": 117575 + }, + { + "epoch": 19.181076672104403, + "grad_norm": 0.013493673875927925, + "learning_rate": 5.100520258203734e-06, + "loss": 0.0017, + "num_input_tokens_seen": 253984912, + "step": 117580 + }, + { + "epoch": 19.18189233278956, + "grad_norm": 0.02722967229783535, + "learning_rate": 5.090384204120113e-06, + "loss": 0.0046, + "num_input_tokens_seen": 253996656, + "step": 117585 + }, + { + "epoch": 19.182707993474715, + "grad_norm": 0.001604323973879218, + "learning_rate": 5.08025818000768e-06, + "loss": 0.0018, + "num_input_tokens_seen": 254007312, + "step": 117590 + }, + { + "epoch": 19.18352365415987, + "grad_norm": 0.00195878348313272, + "learning_rate": 5.0701421860717135e-06, + "loss": 0.0145, + "num_input_tokens_seen": 254018928, + "step": 117595 + }, + { + "epoch": 19.184339314845026, + "grad_norm": 0.0005671089165844023, + "learning_rate": 5.060036222517161e-06, + "loss": 0.001, + "num_input_tokens_seen": 254030160, + "step": 117600 + }, + { + "epoch": 19.18515497553018, + "grad_norm": 0.2616872191429138, + "learning_rate": 5.049940289548804e-06, + "loss": 0.0708, + "num_input_tokens_seen": 254040400, + "step": 117605 + }, + { + "epoch": 19.185970636215334, + "grad_norm": 0.00945583451539278, + "learning_rate": 5.039854387371368e-06, + "loss": 0.001, + "num_input_tokens_seen": 254051472, + "step": 117610 + }, + { + "epoch": 19.18678629690049, + "grad_norm": 0.003492174670100212, + "learning_rate": 5.0297785161891315e-06, + "loss": 0.1164, + "num_input_tokens_seen": 254062192, + "step": 117615 + }, + { + "epoch": 19.187601957585645, + "grad_norm": 0.0006185670499689877, + "learning_rate": 5.019712676206323e-06, + "loss": 0.0007, + "num_input_tokens_seen": 254073360, + "step": 117620 + }, + { + "epoch": 19.1884176182708, + "grad_norm": 0.0019063102081418037, + "learning_rate": 5.009656867627055e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254084112, + "step": 117625 + }, + { + "epoch": 19.189233278955953, + "grad_norm": 0.0005886392900720239, + "learning_rate": 4.999611090654943e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254094672, + "step": 117630 + }, + { + "epoch": 19.19004893964111, + "grad_norm": 0.0002696272567845881, + "learning_rate": 4.989575345493713e-06, + "loss": 0.0006, + "num_input_tokens_seen": 254105616, + "step": 117635 + }, + { + "epoch": 19.190864600326265, + "grad_norm": 0.005746932700276375, + "learning_rate": 4.979549632346702e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254116240, + "step": 117640 + }, + { + "epoch": 19.19168026101142, + "grad_norm": 0.00037611470906995237, + "learning_rate": 4.969533951417082e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254127088, + "step": 117645 + }, + { + "epoch": 19.192495921696572, + "grad_norm": 0.04209225997328758, + "learning_rate": 4.959528302907857e-06, + "loss": 0.0075, + "num_input_tokens_seen": 254138096, + "step": 117650 + }, + { + "epoch": 19.193311582381728, + "grad_norm": 0.00023832359875086695, + "learning_rate": 4.949532687021751e-06, + "loss": 0.0016, + "num_input_tokens_seen": 254149552, + "step": 117655 + }, + { + "epoch": 19.194127243066884, + "grad_norm": 0.008501997217535973, + "learning_rate": 4.939547103961439e-06, + "loss": 0.0012, + "num_input_tokens_seen": 254159184, + "step": 117660 + }, + { + "epoch": 19.19494290375204, + "grad_norm": 0.009387916885316372, + "learning_rate": 4.929571553929202e-06, + "loss": 0.0012, + "num_input_tokens_seen": 254170800, + "step": 117665 + }, + { + "epoch": 19.195758564437195, + "grad_norm": 0.003329535946249962, + "learning_rate": 4.919606037127267e-06, + "loss": 0.0065, + "num_input_tokens_seen": 254181584, + "step": 117670 + }, + { + "epoch": 19.196574225122347, + "grad_norm": 0.042027123272418976, + "learning_rate": 4.909650553757583e-06, + "loss": 0.0012, + "num_input_tokens_seen": 254191952, + "step": 117675 + }, + { + "epoch": 19.197389885807503, + "grad_norm": 0.0032699257135391235, + "learning_rate": 4.8997051040218235e-06, + "loss": 0.0013, + "num_input_tokens_seen": 254202864, + "step": 117680 + }, + { + "epoch": 19.19820554649266, + "grad_norm": 0.0007212890195660293, + "learning_rate": 4.889769688121715e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254212848, + "step": 117685 + }, + { + "epoch": 19.199021207177815, + "grad_norm": 0.00019408235675655305, + "learning_rate": 4.87984430625843e-06, + "loss": 0.0015, + "num_input_tokens_seen": 254223408, + "step": 117690 + }, + { + "epoch": 19.19983686786297, + "grad_norm": 0.0005199372535571456, + "learning_rate": 4.869928958633252e-06, + "loss": 0.0038, + "num_input_tokens_seen": 254234288, + "step": 117695 + }, + { + "epoch": 19.200652528548122, + "grad_norm": 0.003805541666224599, + "learning_rate": 4.860023645447076e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254245392, + "step": 117700 + }, + { + "epoch": 19.201468189233278, + "grad_norm": 0.03584432229399681, + "learning_rate": 4.85012836690063e-06, + "loss": 0.0896, + "num_input_tokens_seen": 254255728, + "step": 117705 + }, + { + "epoch": 19.202283849918434, + "grad_norm": 0.48327863216400146, + "learning_rate": 4.840243123194477e-06, + "loss": 0.1086, + "num_input_tokens_seen": 254265904, + "step": 117710 + }, + { + "epoch": 19.20309951060359, + "grad_norm": 0.011278538964688778, + "learning_rate": 4.83036791452901e-06, + "loss": 0.0007, + "num_input_tokens_seen": 254276944, + "step": 117715 + }, + { + "epoch": 19.203915171288745, + "grad_norm": 0.0011459417873993516, + "learning_rate": 4.820502741104238e-06, + "loss": 0.0035, + "num_input_tokens_seen": 254287760, + "step": 117720 + }, + { + "epoch": 19.204730831973897, + "grad_norm": 0.0020964513532817364, + "learning_rate": 4.810647603120166e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254299216, + "step": 117725 + }, + { + "epoch": 19.205546492659053, + "grad_norm": 0.00022387487115338445, + "learning_rate": 4.800802500776524e-06, + "loss": 0.0006, + "num_input_tokens_seen": 254309328, + "step": 117730 + }, + { + "epoch": 19.20636215334421, + "grad_norm": 0.009423289448022842, + "learning_rate": 4.790967434272819e-06, + "loss": 0.0006, + "num_input_tokens_seen": 254319280, + "step": 117735 + }, + { + "epoch": 19.207177814029365, + "grad_norm": 0.004757652059197426, + "learning_rate": 4.781142403808392e-06, + "loss": 0.0041, + "num_input_tokens_seen": 254329584, + "step": 117740 + }, + { + "epoch": 19.20799347471452, + "grad_norm": 0.0001649027253733948, + "learning_rate": 4.771327409582305e-06, + "loss": 0.0013, + "num_input_tokens_seen": 254339888, + "step": 117745 + }, + { + "epoch": 19.208809135399672, + "grad_norm": 0.0006828425102867186, + "learning_rate": 4.761522451793565e-06, + "loss": 0.0015, + "num_input_tokens_seen": 254351280, + "step": 117750 + }, + { + "epoch": 19.209624796084828, + "grad_norm": 0.18796727061271667, + "learning_rate": 4.751727530640793e-06, + "loss": 0.0026, + "num_input_tokens_seen": 254362640, + "step": 117755 + }, + { + "epoch": 19.210440456769984, + "grad_norm": 0.00041193258948624134, + "learning_rate": 4.74194264632255e-06, + "loss": 0.0378, + "num_input_tokens_seen": 254373136, + "step": 117760 + }, + { + "epoch": 19.21125611745514, + "grad_norm": 0.0002577627310529351, + "learning_rate": 4.732167799037068e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254383792, + "step": 117765 + }, + { + "epoch": 19.212071778140295, + "grad_norm": 0.010403187945485115, + "learning_rate": 4.722402988982577e-06, + "loss": 0.1138, + "num_input_tokens_seen": 254393648, + "step": 117770 + }, + { + "epoch": 19.212887438825447, + "grad_norm": 0.005965354852378368, + "learning_rate": 4.7126482163568075e-06, + "loss": 0.0006, + "num_input_tokens_seen": 254403728, + "step": 117775 + }, + { + "epoch": 19.213703099510603, + "grad_norm": 0.0038343167398124933, + "learning_rate": 4.702903481357601e-06, + "loss": 0.0496, + "num_input_tokens_seen": 254414544, + "step": 117780 + }, + { + "epoch": 19.21451876019576, + "grad_norm": 0.002071639057248831, + "learning_rate": 4.693168784182356e-06, + "loss": 0.0015, + "num_input_tokens_seen": 254425872, + "step": 117785 + }, + { + "epoch": 19.215334420880914, + "grad_norm": 0.0016836397117003798, + "learning_rate": 4.6834441250284135e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254436464, + "step": 117790 + }, + { + "epoch": 19.21615008156607, + "grad_norm": 1.1679688692092896, + "learning_rate": 4.673729504092783e-06, + "loss": 0.0623, + "num_input_tokens_seen": 254446512, + "step": 117795 + }, + { + "epoch": 19.216965742251222, + "grad_norm": 0.00035125756403431296, + "learning_rate": 4.664024921572419e-06, + "loss": 0.0024, + "num_input_tokens_seen": 254458320, + "step": 117800 + }, + { + "epoch": 19.217781402936378, + "grad_norm": 0.002369961701333523, + "learning_rate": 4.654330377663996e-06, + "loss": 0.0014, + "num_input_tokens_seen": 254468624, + "step": 117805 + }, + { + "epoch": 19.218597063621534, + "grad_norm": 0.008754052221775055, + "learning_rate": 4.644645872563913e-06, + "loss": 0.003, + "num_input_tokens_seen": 254478480, + "step": 117810 + }, + { + "epoch": 19.21941272430669, + "grad_norm": 0.007495723199099302, + "learning_rate": 4.634971406468514e-06, + "loss": 0.0021, + "num_input_tokens_seen": 254490416, + "step": 117815 + }, + { + "epoch": 19.22022838499184, + "grad_norm": 0.05519472435116768, + "learning_rate": 4.625306979573807e-06, + "loss": 0.0013, + "num_input_tokens_seen": 254500720, + "step": 117820 + }, + { + "epoch": 19.221044045676997, + "grad_norm": 0.020412957295775414, + "learning_rate": 4.615652592075747e-06, + "loss": 0.0018, + "num_input_tokens_seen": 254512304, + "step": 117825 + }, + { + "epoch": 19.221859706362153, + "grad_norm": 0.0004900472704321146, + "learning_rate": 4.606008244169846e-06, + "loss": 0.0017, + "num_input_tokens_seen": 254522608, + "step": 117830 + }, + { + "epoch": 19.22267536704731, + "grad_norm": 0.36650630831718445, + "learning_rate": 4.596373936051667e-06, + "loss": 0.0111, + "num_input_tokens_seen": 254533040, + "step": 117835 + }, + { + "epoch": 19.223491027732464, + "grad_norm": 0.01161511242389679, + "learning_rate": 4.586749667916446e-06, + "loss": 0.0018, + "num_input_tokens_seen": 254543728, + "step": 117840 + }, + { + "epoch": 19.224306688417617, + "grad_norm": 0.000983836012892425, + "learning_rate": 4.57713543995919e-06, + "loss": 0.0027, + "num_input_tokens_seen": 254553360, + "step": 117845 + }, + { + "epoch": 19.225122349102772, + "grad_norm": 0.05470266193151474, + "learning_rate": 4.567531252374801e-06, + "loss": 0.0025, + "num_input_tokens_seen": 254565328, + "step": 117850 + }, + { + "epoch": 19.225938009787928, + "grad_norm": 0.010444153100252151, + "learning_rate": 4.557937105357901e-06, + "loss": 0.0104, + "num_input_tokens_seen": 254577552, + "step": 117855 + }, + { + "epoch": 19.226753670473084, + "grad_norm": 0.0179997980594635, + "learning_rate": 4.54835299910289e-06, + "loss": 0.0028, + "num_input_tokens_seen": 254589712, + "step": 117860 + }, + { + "epoch": 19.22756933115824, + "grad_norm": 0.0017869179137051105, + "learning_rate": 4.5387789338040555e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254601744, + "step": 117865 + }, + { + "epoch": 19.22838499184339, + "grad_norm": 0.00039110815851017833, + "learning_rate": 4.529214909655355e-06, + "loss": 0.0013, + "num_input_tokens_seen": 254613904, + "step": 117870 + }, + { + "epoch": 19.229200652528547, + "grad_norm": 0.04959358274936676, + "learning_rate": 4.519660926850744e-06, + "loss": 0.0027, + "num_input_tokens_seen": 254625200, + "step": 117875 + }, + { + "epoch": 19.230016313213703, + "grad_norm": 0.011024784296751022, + "learning_rate": 4.510116985583679e-06, + "loss": 0.001, + "num_input_tokens_seen": 254636560, + "step": 117880 + }, + { + "epoch": 19.23083197389886, + "grad_norm": 0.2488701045513153, + "learning_rate": 4.500583086047782e-06, + "loss": 0.0047, + "num_input_tokens_seen": 254647440, + "step": 117885 + }, + { + "epoch": 19.231647634584014, + "grad_norm": 0.02225778065621853, + "learning_rate": 4.491059228436012e-06, + "loss": 0.0024, + "num_input_tokens_seen": 254657296, + "step": 117890 + }, + { + "epoch": 19.232463295269167, + "grad_norm": 0.931378960609436, + "learning_rate": 4.481545412941657e-06, + "loss": 0.126, + "num_input_tokens_seen": 254668432, + "step": 117895 + }, + { + "epoch": 19.233278955954322, + "grad_norm": 0.0007507778936997056, + "learning_rate": 4.472041639757285e-06, + "loss": 0.0009, + "num_input_tokens_seen": 254680208, + "step": 117900 + }, + { + "epoch": 19.234094616639478, + "grad_norm": 0.003547506872564554, + "learning_rate": 4.462547909075687e-06, + "loss": 0.0032, + "num_input_tokens_seen": 254690800, + "step": 117905 + }, + { + "epoch": 19.234910277324634, + "grad_norm": 0.00012341790716163814, + "learning_rate": 4.453064221089154e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254702576, + "step": 117910 + }, + { + "epoch": 19.23572593800979, + "grad_norm": 0.010750551708042622, + "learning_rate": 4.44359057598992e-06, + "loss": 0.0007, + "num_input_tokens_seen": 254714480, + "step": 117915 + }, + { + "epoch": 19.23654159869494, + "grad_norm": 0.012489181011915207, + "learning_rate": 4.434126973969998e-06, + "loss": 0.0216, + "num_input_tokens_seen": 254725456, + "step": 117920 + }, + { + "epoch": 19.237357259380097, + "grad_norm": 0.0013013400603085756, + "learning_rate": 4.424673415221181e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254736560, + "step": 117925 + }, + { + "epoch": 19.238172920065253, + "grad_norm": 0.015076026320457458, + "learning_rate": 4.41522989993498e-06, + "loss": 0.0011, + "num_input_tokens_seen": 254748624, + "step": 117930 + }, + { + "epoch": 19.23898858075041, + "grad_norm": 0.001261144527234137, + "learning_rate": 4.405796428302855e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254759664, + "step": 117935 + }, + { + "epoch": 19.239804241435564, + "grad_norm": 0.001300856121815741, + "learning_rate": 4.396373000515986e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254770352, + "step": 117940 + }, + { + "epoch": 19.240619902120716, + "grad_norm": 0.0004666333843488246, + "learning_rate": 4.3869596167653296e-06, + "loss": 0.0016, + "num_input_tokens_seen": 254780880, + "step": 117945 + }, + { + "epoch": 19.241435562805872, + "grad_norm": 0.018931837752461433, + "learning_rate": 4.377556277241679e-06, + "loss": 0.005, + "num_input_tokens_seen": 254793008, + "step": 117950 + }, + { + "epoch": 19.242251223491028, + "grad_norm": 0.0013247689930722117, + "learning_rate": 4.368162982135604e-06, + "loss": 0.0015, + "num_input_tokens_seen": 254803312, + "step": 117955 + }, + { + "epoch": 19.243066884176184, + "grad_norm": 0.06686493009328842, + "learning_rate": 4.3587797316373965e-06, + "loss": 0.0038, + "num_input_tokens_seen": 254814032, + "step": 117960 + }, + { + "epoch": 19.24388254486134, + "grad_norm": 0.0002263520291307941, + "learning_rate": 4.34940652593735e-06, + "loss": 0.0012, + "num_input_tokens_seen": 254824816, + "step": 117965 + }, + { + "epoch": 19.24469820554649, + "grad_norm": 0.007901106961071491, + "learning_rate": 4.34004336522531e-06, + "loss": 0.0033, + "num_input_tokens_seen": 254835952, + "step": 117970 + }, + { + "epoch": 19.245513866231647, + "grad_norm": 0.04995737969875336, + "learning_rate": 4.330690249691127e-06, + "loss": 0.0035, + "num_input_tokens_seen": 254845584, + "step": 117975 + }, + { + "epoch": 19.246329526916803, + "grad_norm": 0.0003960870089940727, + "learning_rate": 4.321347179524316e-06, + "loss": 0.0007, + "num_input_tokens_seen": 254856656, + "step": 117980 + }, + { + "epoch": 19.24714518760196, + "grad_norm": 0.0004914596793241799, + "learning_rate": 4.312014154914113e-06, + "loss": 0.0018, + "num_input_tokens_seen": 254866256, + "step": 117985 + }, + { + "epoch": 19.247960848287114, + "grad_norm": 0.003282026154920459, + "learning_rate": 4.302691176049922e-06, + "loss": 0.0042, + "num_input_tokens_seen": 254875888, + "step": 117990 + }, + { + "epoch": 19.248776508972266, + "grad_norm": 0.00018431748321745545, + "learning_rate": 4.293378243120371e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254886672, + "step": 117995 + }, + { + "epoch": 19.249592169657422, + "grad_norm": 0.0007251430070027709, + "learning_rate": 4.284075356314476e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254898096, + "step": 118000 + }, + { + "epoch": 19.250407830342578, + "grad_norm": 0.011223108507692814, + "learning_rate": 4.2747825158205855e-06, + "loss": 0.0019, + "num_input_tokens_seen": 254908080, + "step": 118005 + }, + { + "epoch": 19.251223491027734, + "grad_norm": 0.0019064913503825665, + "learning_rate": 4.265499721827159e-06, + "loss": 0.001, + "num_input_tokens_seen": 254918256, + "step": 118010 + }, + { + "epoch": 19.252039151712886, + "grad_norm": 0.00043139405897818506, + "learning_rate": 4.256226974522215e-06, + "loss": 0.0009, + "num_input_tokens_seen": 254928272, + "step": 118015 + }, + { + "epoch": 19.25285481239804, + "grad_norm": 0.0022782967425882816, + "learning_rate": 4.246964274093767e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254940016, + "step": 118020 + }, + { + "epoch": 19.253670473083197, + "grad_norm": 0.014259060844779015, + "learning_rate": 4.237711620729501e-06, + "loss": 0.001, + "num_input_tokens_seen": 254950800, + "step": 118025 + }, + { + "epoch": 19.254486133768353, + "grad_norm": 0.0016648249002173543, + "learning_rate": 4.228469014616931e-06, + "loss": 0.0036, + "num_input_tokens_seen": 254962992, + "step": 118030 + }, + { + "epoch": 19.25530179445351, + "grad_norm": 0.010028230026364326, + "learning_rate": 4.219236455943298e-06, + "loss": 0.0023, + "num_input_tokens_seen": 254973392, + "step": 118035 + }, + { + "epoch": 19.25611745513866, + "grad_norm": 0.0038817732129245996, + "learning_rate": 4.210013944895841e-06, + "loss": 0.0058, + "num_input_tokens_seen": 254984368, + "step": 118040 + }, + { + "epoch": 19.256933115823816, + "grad_norm": 0.0060980357229709625, + "learning_rate": 4.2008014816613534e-06, + "loss": 0.0013, + "num_input_tokens_seen": 254995280, + "step": 118045 + }, + { + "epoch": 19.257748776508972, + "grad_norm": 0.0006417598924599588, + "learning_rate": 4.191599066426632e-06, + "loss": 0.0018, + "num_input_tokens_seen": 255006832, + "step": 118050 + }, + { + "epoch": 19.258564437194128, + "grad_norm": 0.00042296203901059926, + "learning_rate": 4.182406699378138e-06, + "loss": 0.0022, + "num_input_tokens_seen": 255017424, + "step": 118055 + }, + { + "epoch": 19.259380097879284, + "grad_norm": 0.00021238908811938018, + "learning_rate": 4.173224380702112e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255027184, + "step": 118060 + }, + { + "epoch": 19.260195758564436, + "grad_norm": 0.0052239480428397655, + "learning_rate": 4.164052110584737e-06, + "loss": 0.0014, + "num_input_tokens_seen": 255036816, + "step": 118065 + }, + { + "epoch": 19.26101141924959, + "grad_norm": 0.00019243801943957806, + "learning_rate": 4.154889889211866e-06, + "loss": 0.0014, + "num_input_tokens_seen": 255047408, + "step": 118070 + }, + { + "epoch": 19.261827079934747, + "grad_norm": 0.001658376189880073, + "learning_rate": 4.145737716769182e-06, + "loss": 0.0012, + "num_input_tokens_seen": 255057168, + "step": 118075 + }, + { + "epoch": 19.262642740619903, + "grad_norm": 0.0030029506888240576, + "learning_rate": 4.136595593442149e-06, + "loss": 0.0078, + "num_input_tokens_seen": 255068752, + "step": 118080 + }, + { + "epoch": 19.26345840130506, + "grad_norm": 0.00022957536566536874, + "learning_rate": 4.1274635194160086e-06, + "loss": 0.0006, + "num_input_tokens_seen": 255079760, + "step": 118085 + }, + { + "epoch": 19.26427406199021, + "grad_norm": 0.0007117181667126715, + "learning_rate": 4.118341494875944e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255091088, + "step": 118090 + }, + { + "epoch": 19.265089722675366, + "grad_norm": 0.027176441624760628, + "learning_rate": 4.1092295200066966e-06, + "loss": 0.0159, + "num_input_tokens_seen": 255101968, + "step": 118095 + }, + { + "epoch": 19.265905383360522, + "grad_norm": 0.07800480723381042, + "learning_rate": 4.100127594993064e-06, + "loss": 0.0029, + "num_input_tokens_seen": 255112976, + "step": 118100 + }, + { + "epoch": 19.266721044045678, + "grad_norm": 0.009774110279977322, + "learning_rate": 4.091035720019398e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255122288, + "step": 118105 + }, + { + "epoch": 19.267536704730833, + "grad_norm": 0.0008097323589026928, + "learning_rate": 4.081953895269996e-06, + "loss": 0.0011, + "num_input_tokens_seen": 255132048, + "step": 118110 + }, + { + "epoch": 19.268352365415986, + "grad_norm": 0.0004488844715524465, + "learning_rate": 4.072882120928933e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255143152, + "step": 118115 + }, + { + "epoch": 19.26916802610114, + "grad_norm": 0.0007533471216447651, + "learning_rate": 4.063820397180007e-06, + "loss": 0.0054, + "num_input_tokens_seen": 255155664, + "step": 118120 + }, + { + "epoch": 19.269983686786297, + "grad_norm": 0.04722285270690918, + "learning_rate": 4.054768724206958e-06, + "loss": 0.0015, + "num_input_tokens_seen": 255165872, + "step": 118125 + }, + { + "epoch": 19.270799347471453, + "grad_norm": 0.005354705266654491, + "learning_rate": 4.045727102193087e-06, + "loss": 0.0021, + "num_input_tokens_seen": 255176560, + "step": 118130 + }, + { + "epoch": 19.27161500815661, + "grad_norm": 0.0037820693105459213, + "learning_rate": 4.036695531321799e-06, + "loss": 0.0012, + "num_input_tokens_seen": 255188176, + "step": 118135 + }, + { + "epoch": 19.27243066884176, + "grad_norm": 0.0006078414153307676, + "learning_rate": 4.027674011776006e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255198736, + "step": 118140 + }, + { + "epoch": 19.273246329526916, + "grad_norm": 0.0028573654126375914, + "learning_rate": 4.018662543738616e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255210384, + "step": 118145 + }, + { + "epoch": 19.274061990212072, + "grad_norm": 0.001126722665503621, + "learning_rate": 4.009661127392206e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255220688, + "step": 118150 + }, + { + "epoch": 19.274877650897228, + "grad_norm": 0.010454751551151276, + "learning_rate": 4.00066976291924e-06, + "loss": 0.0034, + "num_input_tokens_seen": 255230608, + "step": 118155 + }, + { + "epoch": 19.275693311582383, + "grad_norm": 0.023937121033668518, + "learning_rate": 3.9916884505019065e-06, + "loss": 0.0018, + "num_input_tokens_seen": 255240848, + "step": 118160 + }, + { + "epoch": 19.276508972267536, + "grad_norm": 0.0342971608042717, + "learning_rate": 3.982717190322227e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255251344, + "step": 118165 + }, + { + "epoch": 19.27732463295269, + "grad_norm": 0.0011292777489870787, + "learning_rate": 3.973755982562055e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255261904, + "step": 118170 + }, + { + "epoch": 19.278140293637847, + "grad_norm": 0.0016227407613769174, + "learning_rate": 3.964804827402913e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255273008, + "step": 118175 + }, + { + "epoch": 19.278955954323003, + "grad_norm": 0.007771195378154516, + "learning_rate": 3.955863725026321e-06, + "loss": 0.0006, + "num_input_tokens_seen": 255283664, + "step": 118180 + }, + { + "epoch": 19.27977161500816, + "grad_norm": 0.0009318848024122417, + "learning_rate": 3.946932675613413e-06, + "loss": 0.0022, + "num_input_tokens_seen": 255293456, + "step": 118185 + }, + { + "epoch": 19.28058727569331, + "grad_norm": 0.0019882982596755028, + "learning_rate": 3.93801167934521e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255304528, + "step": 118190 + }, + { + "epoch": 19.281402936378466, + "grad_norm": 0.018294580280780792, + "learning_rate": 3.929100736402513e-06, + "loss": 0.0006, + "num_input_tokens_seen": 255316368, + "step": 118195 + }, + { + "epoch": 19.282218597063622, + "grad_norm": 0.0001411344128428027, + "learning_rate": 3.920199846965844e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255328432, + "step": 118200 + }, + { + "epoch": 19.283034257748778, + "grad_norm": 0.00018316751811653376, + "learning_rate": 3.911309011215725e-06, + "loss": 0.0009, + "num_input_tokens_seen": 255340080, + "step": 118205 + }, + { + "epoch": 19.28384991843393, + "grad_norm": 0.0008493968634866178, + "learning_rate": 3.902428229332233e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255350512, + "step": 118210 + }, + { + "epoch": 19.284665579119086, + "grad_norm": 0.0007020276971161366, + "learning_rate": 3.8935575014953374e-06, + "loss": 0.0024, + "num_input_tokens_seen": 255362128, + "step": 118215 + }, + { + "epoch": 19.28548123980424, + "grad_norm": 0.002223310759291053, + "learning_rate": 3.884696827884893e-06, + "loss": 0.0007, + "num_input_tokens_seen": 255373200, + "step": 118220 + }, + { + "epoch": 19.286296900489397, + "grad_norm": 0.006080263294279575, + "learning_rate": 3.8758462086804225e-06, + "loss": 0.0006, + "num_input_tokens_seen": 255383696, + "step": 118225 + }, + { + "epoch": 19.287112561174553, + "grad_norm": 0.001246536965481937, + "learning_rate": 3.867005644061283e-06, + "loss": 0.0024, + "num_input_tokens_seen": 255394448, + "step": 118230 + }, + { + "epoch": 19.287928221859705, + "grad_norm": 0.012224650010466576, + "learning_rate": 3.8581751342067205e-06, + "loss": 0.0013, + "num_input_tokens_seen": 255404592, + "step": 118235 + }, + { + "epoch": 19.28874388254486, + "grad_norm": 0.09165532886981964, + "learning_rate": 3.849354679295591e-06, + "loss": 0.0036, + "num_input_tokens_seen": 255414928, + "step": 118240 + }, + { + "epoch": 19.289559543230016, + "grad_norm": 0.011029092594981194, + "learning_rate": 3.840544279506753e-06, + "loss": 0.0052, + "num_input_tokens_seen": 255426992, + "step": 118245 + }, + { + "epoch": 19.290375203915172, + "grad_norm": 0.00046244170516729355, + "learning_rate": 3.831743935018672e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255436880, + "step": 118250 + }, + { + "epoch": 19.291190864600328, + "grad_norm": 0.8931393623352051, + "learning_rate": 3.822953646009708e-06, + "loss": 0.0574, + "num_input_tokens_seen": 255447024, + "step": 118255 + }, + { + "epoch": 19.29200652528548, + "grad_norm": 0.012058882042765617, + "learning_rate": 3.8141734126580505e-06, + "loss": 0.001, + "num_input_tokens_seen": 255457264, + "step": 118260 + }, + { + "epoch": 19.292822185970635, + "grad_norm": 0.0006178324692882597, + "learning_rate": 3.805403235141669e-06, + "loss": 0.0007, + "num_input_tokens_seen": 255467408, + "step": 118265 + }, + { + "epoch": 19.29363784665579, + "grad_norm": 0.0005874041235074401, + "learning_rate": 3.7966431136381985e-06, + "loss": 0.0009, + "num_input_tokens_seen": 255478032, + "step": 118270 + }, + { + "epoch": 19.294453507340947, + "grad_norm": 0.0011451609898358583, + "learning_rate": 3.7878930483252195e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255489328, + "step": 118275 + }, + { + "epoch": 19.295269168026103, + "grad_norm": 0.007449743337929249, + "learning_rate": 3.7791530393801456e-06, + "loss": 0.0015, + "num_input_tokens_seen": 255500272, + "step": 118280 + }, + { + "epoch": 19.296084828711255, + "grad_norm": 0.00993234384804964, + "learning_rate": 3.7704230869800015e-06, + "loss": 0.0007, + "num_input_tokens_seen": 255511920, + "step": 118285 + }, + { + "epoch": 19.29690048939641, + "grad_norm": 0.0009379129041917622, + "learning_rate": 3.7617031913017573e-06, + "loss": 0.0014, + "num_input_tokens_seen": 255521936, + "step": 118290 + }, + { + "epoch": 19.297716150081566, + "grad_norm": 0.0034401898737996817, + "learning_rate": 3.752993352522105e-06, + "loss": 0.0009, + "num_input_tokens_seen": 255532432, + "step": 118295 + }, + { + "epoch": 19.298531810766722, + "grad_norm": 0.025949660688638687, + "learning_rate": 3.7442935708176253e-06, + "loss": 0.0654, + "num_input_tokens_seen": 255542960, + "step": 118300 + }, + { + "epoch": 19.299347471451878, + "grad_norm": 0.004528548568487167, + "learning_rate": 3.7356038463645105e-06, + "loss": 0.0044, + "num_input_tokens_seen": 255553296, + "step": 118305 + }, + { + "epoch": 19.30016313213703, + "grad_norm": 0.012715993449091911, + "learning_rate": 3.7269241793390084e-06, + "loss": 0.002, + "num_input_tokens_seen": 255562832, + "step": 118310 + }, + { + "epoch": 19.300978792822185, + "grad_norm": 0.0012263595126569271, + "learning_rate": 3.7182545699169236e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255571888, + "step": 118315 + }, + { + "epoch": 19.30179445350734, + "grad_norm": 0.011231029406189919, + "learning_rate": 3.7095950182739478e-06, + "loss": 0.0044, + "num_input_tokens_seen": 255583408, + "step": 118320 + }, + { + "epoch": 19.302610114192497, + "grad_norm": 0.001041249604895711, + "learning_rate": 3.700945524585664e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255594608, + "step": 118325 + }, + { + "epoch": 19.303425774877653, + "grad_norm": 0.00013112963642925024, + "learning_rate": 3.6923060890273195e-06, + "loss": 0.0187, + "num_input_tokens_seen": 255605296, + "step": 118330 + }, + { + "epoch": 19.304241435562805, + "grad_norm": 0.0004950486472807825, + "learning_rate": 3.683676711773998e-06, + "loss": 0.0111, + "num_input_tokens_seen": 255615120, + "step": 118335 + }, + { + "epoch": 19.30505709624796, + "grad_norm": 0.018098052591085434, + "learning_rate": 3.6750573930005583e-06, + "loss": 0.0017, + "num_input_tokens_seen": 255626160, + "step": 118340 + }, + { + "epoch": 19.305872756933116, + "grad_norm": 0.0003636969195213169, + "learning_rate": 3.66644813288175e-06, + "loss": 0.0007, + "num_input_tokens_seen": 255637488, + "step": 118345 + }, + { + "epoch": 19.306688417618272, + "grad_norm": 0.00021313635807018727, + "learning_rate": 3.6578489315919893e-06, + "loss": 0.0011, + "num_input_tokens_seen": 255648272, + "step": 118350 + }, + { + "epoch": 19.307504078303428, + "grad_norm": 0.07795961946249008, + "learning_rate": 3.6492597893056367e-06, + "loss": 0.0048, + "num_input_tokens_seen": 255659632, + "step": 118355 + }, + { + "epoch": 19.30831973898858, + "grad_norm": 0.001691961195319891, + "learning_rate": 3.6406807061966085e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255671504, + "step": 118360 + }, + { + "epoch": 19.309135399673735, + "grad_norm": 0.021926045417785645, + "learning_rate": 3.6321116824388767e-06, + "loss": 0.0017, + "num_input_tokens_seen": 255683152, + "step": 118365 + }, + { + "epoch": 19.30995106035889, + "grad_norm": 0.0014312977436929941, + "learning_rate": 3.6235527182061912e-06, + "loss": 0.0007, + "num_input_tokens_seen": 255695280, + "step": 118370 + }, + { + "epoch": 19.310766721044047, + "grad_norm": 0.0011881274404004216, + "learning_rate": 3.615003813671802e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255706320, + "step": 118375 + }, + { + "epoch": 19.3115823817292, + "grad_norm": 0.002549724653363228, + "learning_rate": 3.6064649690091268e-06, + "loss": 0.0006, + "num_input_tokens_seen": 255717744, + "step": 118380 + }, + { + "epoch": 19.312398042414355, + "grad_norm": 0.027324769645929337, + "learning_rate": 3.5979361843910817e-06, + "loss": 0.0012, + "num_input_tokens_seen": 255727344, + "step": 118385 + }, + { + "epoch": 19.31321370309951, + "grad_norm": 0.24201883375644684, + "learning_rate": 3.589417459990696e-06, + "loss": 0.0021, + "num_input_tokens_seen": 255737520, + "step": 118390 + }, + { + "epoch": 19.314029363784666, + "grad_norm": 0.04044476896524429, + "learning_rate": 3.580908795980442e-06, + "loss": 0.0016, + "num_input_tokens_seen": 255747920, + "step": 118395 + }, + { + "epoch": 19.31484502446982, + "grad_norm": 0.05046987533569336, + "learning_rate": 3.572410192532849e-06, + "loss": 0.0019, + "num_input_tokens_seen": 255758896, + "step": 118400 + }, + { + "epoch": 19.315660685154974, + "grad_norm": 0.0012489468790590763, + "learning_rate": 3.563921649820112e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255769392, + "step": 118405 + }, + { + "epoch": 19.31647634584013, + "grad_norm": 0.041108161211013794, + "learning_rate": 3.555443168014261e-06, + "loss": 0.0023, + "num_input_tokens_seen": 255779440, + "step": 118410 + }, + { + "epoch": 19.317292006525285, + "grad_norm": 0.3899226784706116, + "learning_rate": 3.5469747472871574e-06, + "loss": 0.0115, + "num_input_tokens_seen": 255790800, + "step": 118415 + }, + { + "epoch": 19.31810766721044, + "grad_norm": 0.0016491117421537638, + "learning_rate": 3.5385163878103864e-06, + "loss": 0.0041, + "num_input_tokens_seen": 255801296, + "step": 118420 + }, + { + "epoch": 19.318923327895597, + "grad_norm": 0.001680854824371636, + "learning_rate": 3.5300680897554226e-06, + "loss": 0.0021, + "num_input_tokens_seen": 255812400, + "step": 118425 + }, + { + "epoch": 19.31973898858075, + "grad_norm": 0.08602673560380936, + "learning_rate": 3.5216298532934068e-06, + "loss": 0.0047, + "num_input_tokens_seen": 255822992, + "step": 118430 + }, + { + "epoch": 19.320554649265905, + "grad_norm": 0.0004549971781671047, + "learning_rate": 3.5132016785954235e-06, + "loss": 0.0014, + "num_input_tokens_seen": 255833616, + "step": 118435 + }, + { + "epoch": 19.32137030995106, + "grad_norm": 0.00023607736511621624, + "learning_rate": 3.504783565832226e-06, + "loss": 0.0028, + "num_input_tokens_seen": 255845008, + "step": 118440 + }, + { + "epoch": 19.322185970636216, + "grad_norm": 0.009514588862657547, + "learning_rate": 3.496375515174455e-06, + "loss": 0.0012, + "num_input_tokens_seen": 255855440, + "step": 118445 + }, + { + "epoch": 19.32300163132137, + "grad_norm": 0.009080817922949791, + "learning_rate": 3.4879775267925297e-06, + "loss": 0.0031, + "num_input_tokens_seen": 255866608, + "step": 118450 + }, + { + "epoch": 19.323817292006524, + "grad_norm": 0.04134466499090195, + "learning_rate": 3.4795896008565363e-06, + "loss": 0.0035, + "num_input_tokens_seen": 255878416, + "step": 118455 + }, + { + "epoch": 19.32463295269168, + "grad_norm": 0.019892286509275436, + "learning_rate": 3.4712117375365615e-06, + "loss": 0.0011, + "num_input_tokens_seen": 255888848, + "step": 118460 + }, + { + "epoch": 19.325448613376835, + "grad_norm": 0.0010795299895107746, + "learning_rate": 3.4628439370024133e-06, + "loss": 0.0858, + "num_input_tokens_seen": 255899312, + "step": 118465 + }, + { + "epoch": 19.32626427406199, + "grad_norm": 0.0002479618415236473, + "learning_rate": 3.454486199423568e-06, + "loss": 0.0012, + "num_input_tokens_seen": 255910640, + "step": 118470 + }, + { + "epoch": 19.327079934747147, + "grad_norm": 0.001029806793667376, + "learning_rate": 3.4461385249695e-06, + "loss": 0.0092, + "num_input_tokens_seen": 255921840, + "step": 118475 + }, + { + "epoch": 19.3278955954323, + "grad_norm": 0.005694256164133549, + "learning_rate": 3.4378009138093524e-06, + "loss": 0.0031, + "num_input_tokens_seen": 255931472, + "step": 118480 + }, + { + "epoch": 19.328711256117455, + "grad_norm": 0.031680673360824585, + "learning_rate": 3.429473366112157e-06, + "loss": 0.0023, + "num_input_tokens_seen": 255941360, + "step": 118485 + }, + { + "epoch": 19.32952691680261, + "grad_norm": 0.0053010135889053345, + "learning_rate": 3.421155882046556e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255951632, + "step": 118490 + }, + { + "epoch": 19.330342577487766, + "grad_norm": 0.002608070382848382, + "learning_rate": 3.4128484617812482e-06, + "loss": 0.0012, + "num_input_tokens_seen": 255961584, + "step": 118495 + }, + { + "epoch": 19.33115823817292, + "grad_norm": 0.0017077375669032335, + "learning_rate": 3.404551105484488e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255971248, + "step": 118500 + }, + { + "epoch": 19.331973898858074, + "grad_norm": 0.03870732709765434, + "learning_rate": 3.3962638133245296e-06, + "loss": 0.0016, + "num_input_tokens_seen": 255982384, + "step": 118505 + }, + { + "epoch": 19.33278955954323, + "grad_norm": 0.00011174618703080341, + "learning_rate": 3.3879865854691825e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255992976, + "step": 118510 + }, + { + "epoch": 19.333605220228385, + "grad_norm": 0.012585737742483616, + "learning_rate": 3.3797194220863694e-06, + "loss": 0.0023, + "num_input_tokens_seen": 256004080, + "step": 118515 + }, + { + "epoch": 19.33442088091354, + "grad_norm": 0.002018422121182084, + "learning_rate": 3.371462323343455e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256015184, + "step": 118520 + }, + { + "epoch": 19.335236541598697, + "grad_norm": 0.07069176435470581, + "learning_rate": 3.3632152894079727e-06, + "loss": 0.0013, + "num_input_tokens_seen": 256026192, + "step": 118525 + }, + { + "epoch": 19.33605220228385, + "grad_norm": 0.006027981173247099, + "learning_rate": 3.3549783204469e-06, + "loss": 0.0018, + "num_input_tokens_seen": 256036496, + "step": 118530 + }, + { + "epoch": 19.336867862969005, + "grad_norm": 0.0005176325212232769, + "learning_rate": 3.3467514166272696e-06, + "loss": 0.0015, + "num_input_tokens_seen": 256046992, + "step": 118535 + }, + { + "epoch": 19.33768352365416, + "grad_norm": 0.05079817399382591, + "learning_rate": 3.338534578115726e-06, + "loss": 0.0023, + "num_input_tokens_seen": 256056176, + "step": 118540 + }, + { + "epoch": 19.338499184339316, + "grad_norm": 0.015587205067276955, + "learning_rate": 3.3303278050789143e-06, + "loss": 0.0006, + "num_input_tokens_seen": 256066768, + "step": 118545 + }, + { + "epoch": 19.339314845024468, + "grad_norm": 0.07898391783237457, + "learning_rate": 3.3221310976829787e-06, + "loss": 0.0059, + "num_input_tokens_seen": 256076880, + "step": 118550 + }, + { + "epoch": 19.340130505709624, + "grad_norm": 0.003357226261869073, + "learning_rate": 3.313944456094231e-06, + "loss": 0.0061, + "num_input_tokens_seen": 256088528, + "step": 118555 + }, + { + "epoch": 19.34094616639478, + "grad_norm": 0.0023918889928609133, + "learning_rate": 3.3057678804784276e-06, + "loss": 0.0015, + "num_input_tokens_seen": 256098800, + "step": 118560 + }, + { + "epoch": 19.341761827079935, + "grad_norm": 0.015937641263008118, + "learning_rate": 3.29760137100138e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256109488, + "step": 118565 + }, + { + "epoch": 19.34257748776509, + "grad_norm": 0.08707918971776962, + "learning_rate": 3.289444927828511e-06, + "loss": 0.0018, + "num_input_tokens_seen": 256119600, + "step": 118570 + }, + { + "epoch": 19.343393148450243, + "grad_norm": 0.0558028407394886, + "learning_rate": 3.281298551125189e-06, + "loss": 0.003, + "num_input_tokens_seen": 256130096, + "step": 118575 + }, + { + "epoch": 19.3442088091354, + "grad_norm": 0.0014487484004348516, + "learning_rate": 3.2731622410565043e-06, + "loss": 0.0006, + "num_input_tokens_seen": 256141264, + "step": 118580 + }, + { + "epoch": 19.345024469820554, + "grad_norm": 0.0015977158909663558, + "learning_rate": 3.265035997787269e-06, + "loss": 0.0006, + "num_input_tokens_seen": 256151120, + "step": 118585 + }, + { + "epoch": 19.34584013050571, + "grad_norm": 0.0008332631550729275, + "learning_rate": 3.256919821482296e-06, + "loss": 0.0005, + "num_input_tokens_seen": 256162448, + "step": 118590 + }, + { + "epoch": 19.346655791190866, + "grad_norm": 0.002260663080960512, + "learning_rate": 3.2488137123059537e-06, + "loss": 0.0004, + "num_input_tokens_seen": 256173296, + "step": 118595 + }, + { + "epoch": 19.347471451876018, + "grad_norm": 0.02533086948096752, + "learning_rate": 3.2407176704226102e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256184080, + "step": 118600 + }, + { + "epoch": 19.348287112561174, + "grad_norm": 0.005144801922142506, + "learning_rate": 3.2326316959962463e-06, + "loss": 0.0014, + "num_input_tokens_seen": 256195184, + "step": 118605 + }, + { + "epoch": 19.34910277324633, + "grad_norm": 0.012791904620826244, + "learning_rate": 3.224555789190897e-06, + "loss": 0.0014, + "num_input_tokens_seen": 256206640, + "step": 118610 + }, + { + "epoch": 19.349918433931485, + "grad_norm": 0.00018290229490958154, + "learning_rate": 3.216489950170043e-06, + "loss": 0.0013, + "num_input_tokens_seen": 256218288, + "step": 118615 + }, + { + "epoch": 19.35073409461664, + "grad_norm": 0.00029546156292781234, + "learning_rate": 3.208434179097275e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256229904, + "step": 118620 + }, + { + "epoch": 19.351549755301793, + "grad_norm": 0.0011031778994947672, + "learning_rate": 3.200388476135796e-06, + "loss": 0.0096, + "num_input_tokens_seen": 256239152, + "step": 118625 + }, + { + "epoch": 19.35236541598695, + "grad_norm": 0.013649040833115578, + "learning_rate": 3.1923528414487535e-06, + "loss": 0.0011, + "num_input_tokens_seen": 256250288, + "step": 118630 + }, + { + "epoch": 19.353181076672104, + "grad_norm": 0.0035270475782454014, + "learning_rate": 3.184327275198795e-06, + "loss": 0.0005, + "num_input_tokens_seen": 256260880, + "step": 118635 + }, + { + "epoch": 19.35399673735726, + "grad_norm": 0.2129955291748047, + "learning_rate": 3.1763117775487903e-06, + "loss": 0.0048, + "num_input_tokens_seen": 256272240, + "step": 118640 + }, + { + "epoch": 19.354812398042416, + "grad_norm": 0.0009809770854189992, + "learning_rate": 3.168306348661054e-06, + "loss": 0.0006, + "num_input_tokens_seen": 256282704, + "step": 118645 + }, + { + "epoch": 19.355628058727568, + "grad_norm": 0.005014079622924328, + "learning_rate": 3.160310988697901e-06, + "loss": 0.0015, + "num_input_tokens_seen": 256292784, + "step": 118650 + }, + { + "epoch": 19.356443719412724, + "grad_norm": 0.0024872953072190285, + "learning_rate": 3.152325697821312e-06, + "loss": 0.0031, + "num_input_tokens_seen": 256302576, + "step": 118655 + }, + { + "epoch": 19.35725938009788, + "grad_norm": 0.0008488795720040798, + "learning_rate": 3.1443504761931585e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256313456, + "step": 118660 + }, + { + "epoch": 19.358075040783035, + "grad_norm": 0.005055803805589676, + "learning_rate": 3.1363853239750327e-06, + "loss": 0.0019, + "num_input_tokens_seen": 256323216, + "step": 118665 + }, + { + "epoch": 19.35889070146819, + "grad_norm": 0.005085945129394531, + "learning_rate": 3.1284302413283615e-06, + "loss": 0.0042, + "num_input_tokens_seen": 256333616, + "step": 118670 + }, + { + "epoch": 19.359706362153343, + "grad_norm": 0.03696412593126297, + "learning_rate": 3.1204852284143493e-06, + "loss": 0.0023, + "num_input_tokens_seen": 256343760, + "step": 118675 + }, + { + "epoch": 19.3605220228385, + "grad_norm": 0.0008415200281888247, + "learning_rate": 3.1125502853941444e-06, + "loss": 0.001, + "num_input_tokens_seen": 256355344, + "step": 118680 + }, + { + "epoch": 19.361337683523654, + "grad_norm": 0.005204454530030489, + "learning_rate": 3.1046254124283413e-06, + "loss": 0.0016, + "num_input_tokens_seen": 256366032, + "step": 118685 + }, + { + "epoch": 19.36215334420881, + "grad_norm": 0.02970193885266781, + "learning_rate": 3.0967106096777e-06, + "loss": 0.0017, + "num_input_tokens_seen": 256376208, + "step": 118690 + }, + { + "epoch": 19.362969004893966, + "grad_norm": 0.0011499657994136214, + "learning_rate": 3.088805877302592e-06, + "loss": 0.0005, + "num_input_tokens_seen": 256387280, + "step": 118695 + }, + { + "epoch": 19.363784665579118, + "grad_norm": 0.0023238202556967735, + "learning_rate": 3.0809112154632226e-06, + "loss": 0.0195, + "num_input_tokens_seen": 256397648, + "step": 118700 + }, + { + "epoch": 19.364600326264274, + "grad_norm": 0.00012586277443915606, + "learning_rate": 3.073026624319575e-06, + "loss": 0.002, + "num_input_tokens_seen": 256407472, + "step": 118705 + }, + { + "epoch": 19.36541598694943, + "grad_norm": 0.5267165899276733, + "learning_rate": 3.06515210403141e-06, + "loss": 0.0105, + "num_input_tokens_seen": 256418096, + "step": 118710 + }, + { + "epoch": 19.366231647634585, + "grad_norm": 0.00432355422526598, + "learning_rate": 3.0572876547583785e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256427568, + "step": 118715 + }, + { + "epoch": 19.36704730831974, + "grad_norm": 0.02034470997750759, + "learning_rate": 3.0494332766597967e-06, + "loss": 0.004, + "num_input_tokens_seen": 256438736, + "step": 118720 + }, + { + "epoch": 19.367862969004893, + "grad_norm": 0.00162877154070884, + "learning_rate": 3.0415889698949262e-06, + "loss": 0.0012, + "num_input_tokens_seen": 256448592, + "step": 118725 + }, + { + "epoch": 19.36867862969005, + "grad_norm": 0.007950716651976109, + "learning_rate": 3.0337547346226404e-06, + "loss": 0.0053, + "num_input_tokens_seen": 256459472, + "step": 118730 + }, + { + "epoch": 19.369494290375204, + "grad_norm": 0.05686101317405701, + "learning_rate": 3.025930571001756e-06, + "loss": 0.0053, + "num_input_tokens_seen": 256469296, + "step": 118735 + }, + { + "epoch": 19.37030995106036, + "grad_norm": 0.028149202466011047, + "learning_rate": 3.018116479190869e-06, + "loss": 0.0017, + "num_input_tokens_seen": 256479248, + "step": 118740 + }, + { + "epoch": 19.371125611745512, + "grad_norm": 0.0007396186701953411, + "learning_rate": 3.0103124593483522e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256491344, + "step": 118745 + }, + { + "epoch": 19.371941272430668, + "grad_norm": 0.00017671390378382057, + "learning_rate": 3.002518511632246e-06, + "loss": 0.0005, + "num_input_tokens_seen": 256501776, + "step": 118750 + }, + { + "epoch": 19.372756933115824, + "grad_norm": 0.024652238935232162, + "learning_rate": 2.9947346362006466e-06, + "loss": 0.0038, + "num_input_tokens_seen": 256512944, + "step": 118755 + }, + { + "epoch": 19.37357259380098, + "grad_norm": 0.0003830741043202579, + "learning_rate": 2.986960833211205e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256522064, + "step": 118760 + }, + { + "epoch": 19.374388254486135, + "grad_norm": 0.0011993915541097522, + "learning_rate": 2.9791971028215737e-06, + "loss": 0.0037, + "num_input_tokens_seen": 256533456, + "step": 118765 + }, + { + "epoch": 19.375203915171287, + "grad_norm": 0.04528145119547844, + "learning_rate": 2.9714434451889595e-06, + "loss": 0.0014, + "num_input_tokens_seen": 256542896, + "step": 118770 + }, + { + "epoch": 19.376019575856443, + "grad_norm": 0.008095352910459042, + "learning_rate": 2.9636998604706255e-06, + "loss": 0.0038, + "num_input_tokens_seen": 256553264, + "step": 118775 + }, + { + "epoch": 19.3768352365416, + "grad_norm": 0.003776898607611656, + "learning_rate": 2.955966348823391e-06, + "loss": 0.0008, + "num_input_tokens_seen": 256563536, + "step": 118780 + }, + { + "epoch": 19.377650897226754, + "grad_norm": 0.06061271205544472, + "learning_rate": 2.948242910404131e-06, + "loss": 0.0021, + "num_input_tokens_seen": 256573808, + "step": 118785 + }, + { + "epoch": 19.37846655791191, + "grad_norm": 0.0016545297112315893, + "learning_rate": 2.9405295453692195e-06, + "loss": 0.0031, + "num_input_tokens_seen": 256584304, + "step": 118790 + }, + { + "epoch": 19.379282218597062, + "grad_norm": 0.002400145400315523, + "learning_rate": 2.9328262538750316e-06, + "loss": 0.0031, + "num_input_tokens_seen": 256595024, + "step": 118795 + }, + { + "epoch": 19.380097879282218, + "grad_norm": 0.061130374670028687, + "learning_rate": 2.9251330360777205e-06, + "loss": 0.0021, + "num_input_tokens_seen": 256605776, + "step": 118800 + }, + { + "epoch": 19.380913539967374, + "grad_norm": 0.0015816733939573169, + "learning_rate": 2.9174498921331616e-06, + "loss": 0.0096, + "num_input_tokens_seen": 256615920, + "step": 118805 + }, + { + "epoch": 19.38172920065253, + "grad_norm": 0.07032934576272964, + "learning_rate": 2.909776822197063e-06, + "loss": 0.0015, + "num_input_tokens_seen": 256627504, + "step": 118810 + }, + { + "epoch": 19.382544861337685, + "grad_norm": 0.0010459988843649626, + "learning_rate": 2.902113826424968e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256639856, + "step": 118815 + }, + { + "epoch": 19.383360522022837, + "grad_norm": 0.0010384476045146585, + "learning_rate": 2.8944609049721406e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256650096, + "step": 118820 + }, + { + "epoch": 19.384176182707993, + "grad_norm": 0.0006806451710872352, + "learning_rate": 2.8868180579936787e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256659760, + "step": 118825 + }, + { + "epoch": 19.38499184339315, + "grad_norm": 0.012895084917545319, + "learning_rate": 2.8791852856445143e-06, + "loss": 0.0008, + "num_input_tokens_seen": 256670512, + "step": 118830 + }, + { + "epoch": 19.385807504078304, + "grad_norm": 0.005823103711009026, + "learning_rate": 2.8715625880792463e-06, + "loss": 0.0005, + "num_input_tokens_seen": 256681744, + "step": 118835 + }, + { + "epoch": 19.38662316476346, + "grad_norm": 0.0037850309163331985, + "learning_rate": 2.8639499654524724e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256692912, + "step": 118840 + }, + { + "epoch": 19.387438825448612, + "grad_norm": 0.02443789690732956, + "learning_rate": 2.856347417918348e-06, + "loss": 0.0018, + "num_input_tokens_seen": 256704400, + "step": 118845 + }, + { + "epoch": 19.388254486133768, + "grad_norm": 0.008325217291712761, + "learning_rate": 2.8487549456310824e-06, + "loss": 0.0025, + "num_input_tokens_seen": 256714704, + "step": 118850 + }, + { + "epoch": 19.389070146818923, + "grad_norm": 0.09903115034103394, + "learning_rate": 2.841172548744442e-06, + "loss": 0.0031, + "num_input_tokens_seen": 256725488, + "step": 118855 + }, + { + "epoch": 19.38988580750408, + "grad_norm": 0.0006401181453838944, + "learning_rate": 2.8336002274121365e-06, + "loss": 0.0005, + "num_input_tokens_seen": 256735184, + "step": 118860 + }, + { + "epoch": 19.390701468189235, + "grad_norm": 0.0005234939744696021, + "learning_rate": 2.8260379817875993e-06, + "loss": 0.0006, + "num_input_tokens_seen": 256746160, + "step": 118865 + }, + { + "epoch": 19.391517128874387, + "grad_norm": 0.007503115572035313, + "learning_rate": 2.818485812024152e-06, + "loss": 0.0008, + "num_input_tokens_seen": 256757808, + "step": 118870 + }, + { + "epoch": 19.392332789559543, + "grad_norm": 0.00616914639249444, + "learning_rate": 2.810943718274783e-06, + "loss": 0.0004, + "num_input_tokens_seen": 256769296, + "step": 118875 + }, + { + "epoch": 19.3931484502447, + "grad_norm": 0.0006514721899293363, + "learning_rate": 2.8034117006924264e-06, + "loss": 0.0012, + "num_input_tokens_seen": 256780144, + "step": 118880 + }, + { + "epoch": 19.393964110929854, + "grad_norm": 0.030497943982481956, + "learning_rate": 2.795889759429626e-06, + "loss": 0.0025, + "num_input_tokens_seen": 256790832, + "step": 118885 + }, + { + "epoch": 19.39477977161501, + "grad_norm": 0.14161071181297302, + "learning_rate": 2.788377894638816e-06, + "loss": 0.0029, + "num_input_tokens_seen": 256801232, + "step": 118890 + }, + { + "epoch": 19.395595432300162, + "grad_norm": 0.008544592186808586, + "learning_rate": 2.7808761064723186e-06, + "loss": 0.0014, + "num_input_tokens_seen": 256812880, + "step": 118895 + }, + { + "epoch": 19.396411092985318, + "grad_norm": 0.13467442989349365, + "learning_rate": 2.773384395082179e-06, + "loss": 0.0047, + "num_input_tokens_seen": 256823888, + "step": 118900 + }, + { + "epoch": 19.397226753670473, + "grad_norm": 0.006914534140378237, + "learning_rate": 2.765902760620165e-06, + "loss": 0.0017, + "num_input_tokens_seen": 256835600, + "step": 118905 + }, + { + "epoch": 19.39804241435563, + "grad_norm": 0.008217011578381062, + "learning_rate": 2.758431203237877e-06, + "loss": 0.0013, + "num_input_tokens_seen": 256845424, + "step": 118910 + }, + { + "epoch": 19.39885807504078, + "grad_norm": 0.005663623567670584, + "learning_rate": 2.7509697230868048e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256857168, + "step": 118915 + }, + { + "epoch": 19.399673735725937, + "grad_norm": 0.0005932076601311564, + "learning_rate": 2.7435183203181613e-06, + "loss": 0.0043, + "num_input_tokens_seen": 256868496, + "step": 118920 + }, + { + "epoch": 19.400489396411093, + "grad_norm": 0.0005741248605772853, + "learning_rate": 2.7360769950828814e-06, + "loss": 0.0006, + "num_input_tokens_seen": 256878544, + "step": 118925 + }, + { + "epoch": 19.40130505709625, + "grad_norm": 0.001585560035891831, + "learning_rate": 2.728645747531844e-06, + "loss": 0.0164, + "num_input_tokens_seen": 256889264, + "step": 118930 + }, + { + "epoch": 19.402120717781404, + "grad_norm": 0.011332179419696331, + "learning_rate": 2.721224577815651e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256901296, + "step": 118935 + }, + { + "epoch": 19.402936378466556, + "grad_norm": 0.010597261600196362, + "learning_rate": 2.713813486084682e-06, + "loss": 0.0023, + "num_input_tokens_seen": 256912048, + "step": 118940 + }, + { + "epoch": 19.403752039151712, + "grad_norm": 0.1503845602273941, + "learning_rate": 2.7064124724891505e-06, + "loss": 0.0027, + "num_input_tokens_seen": 256922960, + "step": 118945 + }, + { + "epoch": 19.404567699836868, + "grad_norm": 0.0053455098532140255, + "learning_rate": 2.6990215371789916e-06, + "loss": 0.002, + "num_input_tokens_seen": 256932656, + "step": 118950 + }, + { + "epoch": 19.405383360522023, + "grad_norm": 0.03206378594040871, + "learning_rate": 2.691640680304086e-06, + "loss": 0.0036, + "num_input_tokens_seen": 256941712, + "step": 118955 + }, + { + "epoch": 19.40619902120718, + "grad_norm": 0.0004055328026879579, + "learning_rate": 2.684269902013925e-06, + "loss": 0.0017, + "num_input_tokens_seen": 256952592, + "step": 118960 + }, + { + "epoch": 19.40701468189233, + "grad_norm": 0.0035495799966156483, + "learning_rate": 2.676909202457889e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256963920, + "step": 118965 + }, + { + "epoch": 19.407830342577487, + "grad_norm": 0.0012541578616946936, + "learning_rate": 2.6695585817852476e-06, + "loss": 0.0029, + "num_input_tokens_seen": 256975312, + "step": 118970 + }, + { + "epoch": 19.408646003262643, + "grad_norm": 0.020598217844963074, + "learning_rate": 2.6622180401448815e-06, + "loss": 0.016, + "num_input_tokens_seen": 256984304, + "step": 118975 + }, + { + "epoch": 19.4094616639478, + "grad_norm": 0.0014313864521682262, + "learning_rate": 2.6548875776856163e-06, + "loss": 0.0012, + "num_input_tokens_seen": 256996624, + "step": 118980 + }, + { + "epoch": 19.410277324632954, + "grad_norm": 0.009284721687436104, + "learning_rate": 2.6475671945559442e-06, + "loss": 0.0017, + "num_input_tokens_seen": 257008144, + "step": 118985 + }, + { + "epoch": 19.411092985318106, + "grad_norm": 0.02313319407403469, + "learning_rate": 2.6402568909042467e-06, + "loss": 0.001, + "num_input_tokens_seen": 257020048, + "step": 118990 + }, + { + "epoch": 19.411908646003262, + "grad_norm": 0.002016447950154543, + "learning_rate": 2.6329566668787384e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257031952, + "step": 118995 + }, + { + "epoch": 19.412724306688418, + "grad_norm": 0.001090245321393013, + "learning_rate": 2.625666522627301e-06, + "loss": 0.0005, + "num_input_tokens_seen": 257042928, + "step": 119000 + }, + { + "epoch": 19.413539967373573, + "grad_norm": 0.026525719091296196, + "learning_rate": 2.6183864582976503e-06, + "loss": 0.0012, + "num_input_tokens_seen": 257053872, + "step": 119005 + }, + { + "epoch": 19.41435562805873, + "grad_norm": 0.0006216730689629912, + "learning_rate": 2.611116474037445e-06, + "loss": 0.001, + "num_input_tokens_seen": 257064528, + "step": 119010 + }, + { + "epoch": 19.41517128874388, + "grad_norm": 0.07049921900033951, + "learning_rate": 2.603856569993901e-06, + "loss": 0.0021, + "num_input_tokens_seen": 257076304, + "step": 119015 + }, + { + "epoch": 19.415986949429037, + "grad_norm": 0.006914778146892786, + "learning_rate": 2.596606746314234e-06, + "loss": 0.0008, + "num_input_tokens_seen": 257088400, + "step": 119020 + }, + { + "epoch": 19.416802610114193, + "grad_norm": 0.02052348665893078, + "learning_rate": 2.589367003145271e-06, + "loss": 0.0007, + "num_input_tokens_seen": 257099728, + "step": 119025 + }, + { + "epoch": 19.41761827079935, + "grad_norm": 0.0031544077210128307, + "learning_rate": 2.5821373406338387e-06, + "loss": 0.0021, + "num_input_tokens_seen": 257110800, + "step": 119030 + }, + { + "epoch": 19.418433931484504, + "grad_norm": 0.010480822063982487, + "learning_rate": 2.574917758926376e-06, + "loss": 0.0011, + "num_input_tokens_seen": 257121968, + "step": 119035 + }, + { + "epoch": 19.419249592169656, + "grad_norm": 0.003024548990651965, + "learning_rate": 2.5677082581692657e-06, + "loss": 0.0043, + "num_input_tokens_seen": 257131952, + "step": 119040 + }, + { + "epoch": 19.420065252854812, + "grad_norm": 0.03305144980549812, + "learning_rate": 2.5605088385085573e-06, + "loss": 0.0015, + "num_input_tokens_seen": 257142864, + "step": 119045 + }, + { + "epoch": 19.420880913539968, + "grad_norm": 0.0013253577053546906, + "learning_rate": 2.553319500090245e-06, + "loss": 0.0019, + "num_input_tokens_seen": 257153904, + "step": 119050 + }, + { + "epoch": 19.421696574225123, + "grad_norm": 0.009747570380568504, + "learning_rate": 2.5461402430599357e-06, + "loss": 0.0013, + "num_input_tokens_seen": 257166032, + "step": 119055 + }, + { + "epoch": 19.42251223491028, + "grad_norm": 0.00021776130597572774, + "learning_rate": 2.5389710675631227e-06, + "loss": 0.0076, + "num_input_tokens_seen": 257177296, + "step": 119060 + }, + { + "epoch": 19.42332789559543, + "grad_norm": 0.0012730876915156841, + "learning_rate": 2.5318119737451905e-06, + "loss": 0.0008, + "num_input_tokens_seen": 257187824, + "step": 119065 + }, + { + "epoch": 19.424143556280587, + "grad_norm": 0.0005625274498015642, + "learning_rate": 2.524662961751134e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257197616, + "step": 119070 + }, + { + "epoch": 19.424959216965743, + "grad_norm": 0.0018120675813406706, + "learning_rate": 2.517524031725893e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257207216, + "step": 119075 + }, + { + "epoch": 19.4257748776509, + "grad_norm": 0.001958174630999565, + "learning_rate": 2.5103951838141292e-06, + "loss": 0.0079, + "num_input_tokens_seen": 257217840, + "step": 119080 + }, + { + "epoch": 19.42659053833605, + "grad_norm": 0.0011971911881119013, + "learning_rate": 2.503276418160283e-06, + "loss": 0.0018, + "num_input_tokens_seen": 257229136, + "step": 119085 + }, + { + "epoch": 19.427406199021206, + "grad_norm": 0.013330371119081974, + "learning_rate": 2.496167734908683e-06, + "loss": 0.1763, + "num_input_tokens_seen": 257239664, + "step": 119090 + }, + { + "epoch": 19.428221859706362, + "grad_norm": 0.00020406786643434316, + "learning_rate": 2.489069134203381e-06, + "loss": 0.0005, + "num_input_tokens_seen": 257250544, + "step": 119095 + }, + { + "epoch": 19.429037520391518, + "grad_norm": 0.025051139295101166, + "learning_rate": 2.481980616188262e-06, + "loss": 0.0021, + "num_input_tokens_seen": 257261168, + "step": 119100 + }, + { + "epoch": 19.429853181076673, + "grad_norm": 0.011714156717061996, + "learning_rate": 2.474902181006877e-06, + "loss": 0.0024, + "num_input_tokens_seen": 257272624, + "step": 119105 + }, + { + "epoch": 19.430668841761825, + "grad_norm": 0.00022403965704143047, + "learning_rate": 2.467833828802779e-06, + "loss": 0.0023, + "num_input_tokens_seen": 257283344, + "step": 119110 + }, + { + "epoch": 19.43148450244698, + "grad_norm": 0.0003524889179971069, + "learning_rate": 2.4607755597192417e-06, + "loss": 0.0023, + "num_input_tokens_seen": 257293840, + "step": 119115 + }, + { + "epoch": 19.432300163132137, + "grad_norm": 0.001508756191469729, + "learning_rate": 2.453727373899206e-06, + "loss": 0.0013, + "num_input_tokens_seen": 257305168, + "step": 119120 + }, + { + "epoch": 19.433115823817293, + "grad_norm": 0.011893289163708687, + "learning_rate": 2.4466892714856137e-06, + "loss": 0.0044, + "num_input_tokens_seen": 257316752, + "step": 119125 + }, + { + "epoch": 19.43393148450245, + "grad_norm": 0.23823878169059753, + "learning_rate": 2.439661252621017e-06, + "loss": 0.0065, + "num_input_tokens_seen": 257328880, + "step": 119130 + }, + { + "epoch": 19.4347471451876, + "grad_norm": 0.0054446193389594555, + "learning_rate": 2.4326433174479133e-06, + "loss": 0.0005, + "num_input_tokens_seen": 257340208, + "step": 119135 + }, + { + "epoch": 19.435562805872756, + "grad_norm": 0.004071327392011881, + "learning_rate": 2.4256354661084666e-06, + "loss": 0.0039, + "num_input_tokens_seen": 257349904, + "step": 119140 + }, + { + "epoch": 19.436378466557912, + "grad_norm": 0.0009328233427368104, + "learning_rate": 2.4186376987447857e-06, + "loss": 0.0017, + "num_input_tokens_seen": 257361552, + "step": 119145 + }, + { + "epoch": 19.437194127243067, + "grad_norm": 0.0005683921044692397, + "learning_rate": 2.41165001549859e-06, + "loss": 0.0015, + "num_input_tokens_seen": 257373072, + "step": 119150 + }, + { + "epoch": 19.438009787928223, + "grad_norm": 0.005114169325679541, + "learning_rate": 2.4046724165115998e-06, + "loss": 0.0005, + "num_input_tokens_seen": 257383984, + "step": 119155 + }, + { + "epoch": 19.438825448613375, + "grad_norm": 0.08982323110103607, + "learning_rate": 2.3977049019250907e-06, + "loss": 0.0094, + "num_input_tokens_seen": 257395152, + "step": 119160 + }, + { + "epoch": 19.43964110929853, + "grad_norm": 0.0284635778516531, + "learning_rate": 2.3907474718803944e-06, + "loss": 0.0021, + "num_input_tokens_seen": 257404240, + "step": 119165 + }, + { + "epoch": 19.440456769983687, + "grad_norm": 0.039190005511045456, + "learning_rate": 2.383800126518454e-06, + "loss": 0.0013, + "num_input_tokens_seen": 257413904, + "step": 119170 + }, + { + "epoch": 19.441272430668842, + "grad_norm": 0.1472269892692566, + "learning_rate": 2.3768628659801005e-06, + "loss": 0.0035, + "num_input_tokens_seen": 257425008, + "step": 119175 + }, + { + "epoch": 19.442088091353998, + "grad_norm": 0.00017574279627297074, + "learning_rate": 2.3699356904058334e-06, + "loss": 0.0015, + "num_input_tokens_seen": 257436112, + "step": 119180 + }, + { + "epoch": 19.44290375203915, + "grad_norm": 0.001107974792830646, + "learning_rate": 2.363018599936151e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257448144, + "step": 119185 + }, + { + "epoch": 19.443719412724306, + "grad_norm": 1.321487545967102, + "learning_rate": 2.3561115947111635e-06, + "loss": 0.0023, + "num_input_tokens_seen": 257458608, + "step": 119190 + }, + { + "epoch": 19.44453507340946, + "grad_norm": 0.0002932049101218581, + "learning_rate": 2.349214674870925e-06, + "loss": 0.0038, + "num_input_tokens_seen": 257469936, + "step": 119195 + }, + { + "epoch": 19.445350734094617, + "grad_norm": 0.023268211632966995, + "learning_rate": 2.3423278405551583e-06, + "loss": 0.0033, + "num_input_tokens_seen": 257478736, + "step": 119200 + }, + { + "epoch": 19.446166394779773, + "grad_norm": 0.004889782518148422, + "learning_rate": 2.335451091903418e-06, + "loss": 0.0024, + "num_input_tokens_seen": 257489424, + "step": 119205 + }, + { + "epoch": 19.446982055464925, + "grad_norm": 0.020940110087394714, + "learning_rate": 2.3285844290550916e-06, + "loss": 0.0017, + "num_input_tokens_seen": 257499888, + "step": 119210 + }, + { + "epoch": 19.44779771615008, + "grad_norm": 0.0025529158301651478, + "learning_rate": 2.321727852149402e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257509840, + "step": 119215 + }, + { + "epoch": 19.448613376835237, + "grad_norm": 0.0006794088985770941, + "learning_rate": 2.314881361325183e-06, + "loss": 0.0005, + "num_input_tokens_seen": 257521680, + "step": 119220 + }, + { + "epoch": 19.449429037520392, + "grad_norm": 0.0012869905913248658, + "learning_rate": 2.308044956721267e-06, + "loss": 0.0009, + "num_input_tokens_seen": 257530672, + "step": 119225 + }, + { + "epoch": 19.450244698205548, + "grad_norm": 0.03023373894393444, + "learning_rate": 2.30121863847621e-06, + "loss": 0.0013, + "num_input_tokens_seen": 257540720, + "step": 119230 + }, + { + "epoch": 19.4510603588907, + "grad_norm": 0.00452503701671958, + "learning_rate": 2.294402406728291e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257552048, + "step": 119235 + }, + { + "epoch": 19.451876019575856, + "grad_norm": 0.0002490385086275637, + "learning_rate": 2.2875962616157318e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257562416, + "step": 119240 + }, + { + "epoch": 19.45269168026101, + "grad_norm": 0.02928837016224861, + "learning_rate": 2.2808002032763676e-06, + "loss": 0.0011, + "num_input_tokens_seen": 257572592, + "step": 119245 + }, + { + "epoch": 19.453507340946167, + "grad_norm": 0.004705357365310192, + "learning_rate": 2.2740142318480873e-06, + "loss": 0.0064, + "num_input_tokens_seen": 257583856, + "step": 119250 + }, + { + "epoch": 19.454323001631323, + "grad_norm": 0.0028875924181193113, + "learning_rate": 2.267238347468226e-06, + "loss": 0.0007, + "num_input_tokens_seen": 257594640, + "step": 119255 + }, + { + "epoch": 19.455138662316475, + "grad_norm": 0.0010396402794867754, + "learning_rate": 2.2604725502742286e-06, + "loss": 0.0006, + "num_input_tokens_seen": 257605712, + "step": 119260 + }, + { + "epoch": 19.45595432300163, + "grad_norm": 0.00031200837111100554, + "learning_rate": 2.2537168404032082e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257617232, + "step": 119265 + }, + { + "epoch": 19.456769983686787, + "grad_norm": 0.0012279873481020331, + "learning_rate": 2.2469712179920555e-06, + "loss": 0.0022, + "num_input_tokens_seen": 257628976, + "step": 119270 + }, + { + "epoch": 19.457585644371942, + "grad_norm": 0.037878625094890594, + "learning_rate": 2.2402356831774383e-06, + "loss": 0.0011, + "num_input_tokens_seen": 257639696, + "step": 119275 + }, + { + "epoch": 19.458401305057095, + "grad_norm": 0.0029649846255779266, + "learning_rate": 2.2335102360959148e-06, + "loss": 0.0025, + "num_input_tokens_seen": 257651280, + "step": 119280 + }, + { + "epoch": 19.45921696574225, + "grad_norm": 0.019405025988817215, + "learning_rate": 2.226794876883764e-06, + "loss": 0.0167, + "num_input_tokens_seen": 257661808, + "step": 119285 + }, + { + "epoch": 19.460032626427406, + "grad_norm": 0.05081169307231903, + "learning_rate": 2.2200896056771004e-06, + "loss": 0.002, + "num_input_tokens_seen": 257672912, + "step": 119290 + }, + { + "epoch": 19.46084828711256, + "grad_norm": 0.007771750912070274, + "learning_rate": 2.2133944226117587e-06, + "loss": 0.0014, + "num_input_tokens_seen": 257685008, + "step": 119295 + }, + { + "epoch": 19.461663947797717, + "grad_norm": 0.0009954735869541764, + "learning_rate": 2.2067093278235194e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257695760, + "step": 119300 + }, + { + "epoch": 19.46247960848287, + "grad_norm": 0.0010892596328631043, + "learning_rate": 2.2000343214477746e-06, + "loss": 0.0019, + "num_input_tokens_seen": 257707280, + "step": 119305 + }, + { + "epoch": 19.463295269168025, + "grad_norm": 0.0006688821013085544, + "learning_rate": 2.1933694036198605e-06, + "loss": 0.0009, + "num_input_tokens_seen": 257717328, + "step": 119310 + }, + { + "epoch": 19.46411092985318, + "grad_norm": 0.12306705862283707, + "learning_rate": 2.1867145744747796e-06, + "loss": 0.0038, + "num_input_tokens_seen": 257727760, + "step": 119315 + }, + { + "epoch": 19.464926590538337, + "grad_norm": 0.00043416867265477777, + "learning_rate": 2.1800698341475355e-06, + "loss": 0.0007, + "num_input_tokens_seen": 257737040, + "step": 119320 + }, + { + "epoch": 19.465742251223492, + "grad_norm": 0.00032600079430267215, + "learning_rate": 2.173435182772632e-06, + "loss": 0.0006, + "num_input_tokens_seen": 257747888, + "step": 119325 + }, + { + "epoch": 19.466557911908644, + "grad_norm": 0.007647597696632147, + "learning_rate": 2.166810620484627e-06, + "loss": 0.0387, + "num_input_tokens_seen": 257758672, + "step": 119330 + }, + { + "epoch": 19.4673735725938, + "grad_norm": 0.031011521816253662, + "learning_rate": 2.160196147417748e-06, + "loss": 0.002, + "num_input_tokens_seen": 257768720, + "step": 119335 + }, + { + "epoch": 19.468189233278956, + "grad_norm": 0.008647006005048752, + "learning_rate": 2.153591763706053e-06, + "loss": 0.0024, + "num_input_tokens_seen": 257779376, + "step": 119340 + }, + { + "epoch": 19.46900489396411, + "grad_norm": 0.0007197380182333291, + "learning_rate": 2.1469974694833805e-06, + "loss": 0.0222, + "num_input_tokens_seen": 257790672, + "step": 119345 + }, + { + "epoch": 19.469820554649267, + "grad_norm": 0.007564597297459841, + "learning_rate": 2.140413264883401e-06, + "loss": 0.0036, + "num_input_tokens_seen": 257801296, + "step": 119350 + }, + { + "epoch": 19.47063621533442, + "grad_norm": 0.002344276290386915, + "learning_rate": 2.1338391500394516e-06, + "loss": 0.0356, + "num_input_tokens_seen": 257813072, + "step": 119355 + }, + { + "epoch": 19.471451876019575, + "grad_norm": 0.00835796445608139, + "learning_rate": 2.1272751250849263e-06, + "loss": 0.0009, + "num_input_tokens_seen": 257824720, + "step": 119360 + }, + { + "epoch": 19.47226753670473, + "grad_norm": 0.003108770353719592, + "learning_rate": 2.120721190152719e-06, + "loss": 0.0018, + "num_input_tokens_seen": 257835312, + "step": 119365 + }, + { + "epoch": 19.473083197389887, + "grad_norm": 0.0004182121774647385, + "learning_rate": 2.114177345375723e-06, + "loss": 0.0023, + "num_input_tokens_seen": 257846544, + "step": 119370 + }, + { + "epoch": 19.473898858075042, + "grad_norm": 0.03978124260902405, + "learning_rate": 2.1076435908864986e-06, + "loss": 0.0021, + "num_input_tokens_seen": 257857776, + "step": 119375 + }, + { + "epoch": 19.474714518760194, + "grad_norm": 0.0012924002949148417, + "learning_rate": 2.1011199268175517e-06, + "loss": 0.017, + "num_input_tokens_seen": 257868592, + "step": 119380 + }, + { + "epoch": 19.47553017944535, + "grad_norm": 0.002158062532544136, + "learning_rate": 2.0946063533009986e-06, + "loss": 0.0006, + "num_input_tokens_seen": 257879504, + "step": 119385 + }, + { + "epoch": 19.476345840130506, + "grad_norm": 0.005779027007520199, + "learning_rate": 2.0881028704688997e-06, + "loss": 0.0005, + "num_input_tokens_seen": 257891152, + "step": 119390 + }, + { + "epoch": 19.47716150081566, + "grad_norm": 0.019146548584103584, + "learning_rate": 2.0816094784530394e-06, + "loss": 0.0013, + "num_input_tokens_seen": 257901456, + "step": 119395 + }, + { + "epoch": 19.477977161500817, + "grad_norm": 0.010578310117125511, + "learning_rate": 2.075126177385034e-06, + "loss": 0.0021, + "num_input_tokens_seen": 257912144, + "step": 119400 + }, + { + "epoch": 19.47879282218597, + "grad_norm": 0.0001685271126916632, + "learning_rate": 2.0686529673962784e-06, + "loss": 0.0015, + "num_input_tokens_seen": 257921584, + "step": 119405 + }, + { + "epoch": 19.479608482871125, + "grad_norm": 0.0007612688350491226, + "learning_rate": 2.06218984861789e-06, + "loss": 0.0069, + "num_input_tokens_seen": 257930800, + "step": 119410 + }, + { + "epoch": 19.48042414355628, + "grad_norm": 0.0007769337389618158, + "learning_rate": 2.0557368211809314e-06, + "loss": 0.0075, + "num_input_tokens_seen": 257940848, + "step": 119415 + }, + { + "epoch": 19.481239804241437, + "grad_norm": 0.001466889400035143, + "learning_rate": 2.0492938852161304e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257952304, + "step": 119420 + }, + { + "epoch": 19.482055464926592, + "grad_norm": 0.016221707686781883, + "learning_rate": 2.042861040854105e-06, + "loss": 0.0008, + "num_input_tokens_seen": 257963056, + "step": 119425 + }, + { + "epoch": 19.482871125611744, + "grad_norm": 0.03494711592793465, + "learning_rate": 2.0364382882251952e-06, + "loss": 0.0013, + "num_input_tokens_seen": 257973744, + "step": 119430 + }, + { + "epoch": 19.4836867862969, + "grad_norm": 0.0013683143770322204, + "learning_rate": 2.030025627459575e-06, + "loss": 0.0018, + "num_input_tokens_seen": 257984912, + "step": 119435 + }, + { + "epoch": 19.484502446982056, + "grad_norm": 0.001632693805731833, + "learning_rate": 2.023623058687196e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257996048, + "step": 119440 + }, + { + "epoch": 19.48531810766721, + "grad_norm": 0.0575888529419899, + "learning_rate": 2.0172305820378434e-06, + "loss": 0.0027, + "num_input_tokens_seen": 258005936, + "step": 119445 + }, + { + "epoch": 19.486133768352367, + "grad_norm": 0.0022574923932552338, + "learning_rate": 2.010848197641024e-06, + "loss": 0.0011, + "num_input_tokens_seen": 258016336, + "step": 119450 + }, + { + "epoch": 19.48694942903752, + "grad_norm": 0.00851681362837553, + "learning_rate": 2.0044759056261354e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258026384, + "step": 119455 + }, + { + "epoch": 19.487765089722675, + "grad_norm": 0.0007707860204391181, + "learning_rate": 1.9981137061222954e-06, + "loss": 0.008, + "num_input_tokens_seen": 258036784, + "step": 119460 + }, + { + "epoch": 19.48858075040783, + "grad_norm": 0.00015356663789134473, + "learning_rate": 1.9917615992584017e-06, + "loss": 0.1028, + "num_input_tokens_seen": 258048688, + "step": 119465 + }, + { + "epoch": 19.489396411092986, + "grad_norm": 0.6992786526679993, + "learning_rate": 1.985419585163295e-06, + "loss": 0.0374, + "num_input_tokens_seen": 258059376, + "step": 119470 + }, + { + "epoch": 19.49021207177814, + "grad_norm": 0.0010557199129834771, + "learning_rate": 1.9790876639653733e-06, + "loss": 0.0299, + "num_input_tokens_seen": 258071472, + "step": 119475 + }, + { + "epoch": 19.491027732463294, + "grad_norm": 0.014713788405060768, + "learning_rate": 1.972765835793089e-06, + "loss": 0.0016, + "num_input_tokens_seen": 258082448, + "step": 119480 + }, + { + "epoch": 19.49184339314845, + "grad_norm": 0.0009430281934328377, + "learning_rate": 1.9664541007744508e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258093232, + "step": 119485 + }, + { + "epoch": 19.492659053833606, + "grad_norm": 0.0018127475632354617, + "learning_rate": 1.960152459037412e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258104880, + "step": 119490 + }, + { + "epoch": 19.49347471451876, + "grad_norm": 0.008685811422765255, + "learning_rate": 1.953860910709704e-06, + "loss": 0.0016, + "num_input_tokens_seen": 258115984, + "step": 119495 + }, + { + "epoch": 19.494290375203914, + "grad_norm": 0.006860397756099701, + "learning_rate": 1.9475794559188354e-06, + "loss": 0.0013, + "num_input_tokens_seen": 258125520, + "step": 119500 + }, + { + "epoch": 19.49510603588907, + "grad_norm": 0.0003657161723822355, + "learning_rate": 1.9413080947920934e-06, + "loss": 0.0028, + "num_input_tokens_seen": 258137712, + "step": 119505 + }, + { + "epoch": 19.495921696574225, + "grad_norm": 0.0007896169554442167, + "learning_rate": 1.9350468274565434e-06, + "loss": 0.0004, + "num_input_tokens_seen": 258148272, + "step": 119510 + }, + { + "epoch": 19.49673735725938, + "grad_norm": 0.0059143430553376675, + "learning_rate": 1.9287956540391395e-06, + "loss": 0.0202, + "num_input_tokens_seen": 258159120, + "step": 119515 + }, + { + "epoch": 19.497553017944536, + "grad_norm": 0.03275555744767189, + "learning_rate": 1.9225545746665575e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258170384, + "step": 119520 + }, + { + "epoch": 19.49836867862969, + "grad_norm": 0.0034429405350238085, + "learning_rate": 1.9163235894651965e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258181808, + "step": 119525 + }, + { + "epoch": 19.499184339314844, + "grad_norm": 0.0001575792266521603, + "learning_rate": 1.9101026985614558e-06, + "loss": 0.0013, + "num_input_tokens_seen": 258191984, + "step": 119530 + }, + { + "epoch": 19.5, + "grad_norm": 0.00021977766300551593, + "learning_rate": 1.903891902081345e-06, + "loss": 0.0008, + "num_input_tokens_seen": 258202736, + "step": 119535 + }, + { + "epoch": 19.500815660685156, + "grad_norm": 0.0011759491171687841, + "learning_rate": 1.8976912001507084e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258214576, + "step": 119540 + }, + { + "epoch": 19.50163132137031, + "grad_norm": 0.0012104709167033434, + "learning_rate": 1.8915005928953344e-06, + "loss": 0.001, + "num_input_tokens_seen": 258224368, + "step": 119545 + }, + { + "epoch": 19.502446982055464, + "grad_norm": 0.0025493651628494263, + "learning_rate": 1.8853200804405113e-06, + "loss": 0.0011, + "num_input_tokens_seen": 258235056, + "step": 119550 + }, + { + "epoch": 19.50326264274062, + "grad_norm": 0.008421454578638077, + "learning_rate": 1.879149662911639e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258246544, + "step": 119555 + }, + { + "epoch": 19.504078303425775, + "grad_norm": 0.00022099376656115055, + "learning_rate": 1.8729893404336728e-06, + "loss": 0.0008, + "num_input_tokens_seen": 258258352, + "step": 119560 + }, + { + "epoch": 19.50489396411093, + "grad_norm": 0.007480515167117119, + "learning_rate": 1.8668391131315133e-06, + "loss": 0.0008, + "num_input_tokens_seen": 258269520, + "step": 119565 + }, + { + "epoch": 19.505709624796086, + "grad_norm": 0.0013238782994449139, + "learning_rate": 1.8606989811297824e-06, + "loss": 0.0017, + "num_input_tokens_seen": 258279600, + "step": 119570 + }, + { + "epoch": 19.50652528548124, + "grad_norm": 0.01068634632974863, + "learning_rate": 1.8545689445528813e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258291664, + "step": 119575 + }, + { + "epoch": 19.507340946166394, + "grad_norm": 0.053680844604969025, + "learning_rate": 1.8484490035251544e-06, + "loss": 0.0034, + "num_input_tokens_seen": 258303152, + "step": 119580 + }, + { + "epoch": 19.50815660685155, + "grad_norm": 0.0013580488739535213, + "learning_rate": 1.842339158170503e-06, + "loss": 0.0082, + "num_input_tokens_seen": 258312528, + "step": 119585 + }, + { + "epoch": 19.508972267536706, + "grad_norm": 0.00019802911265287548, + "learning_rate": 1.8362394086128276e-06, + "loss": 0.0011, + "num_input_tokens_seen": 258323792, + "step": 119590 + }, + { + "epoch": 19.50978792822186, + "grad_norm": 0.07218465209007263, + "learning_rate": 1.8301497549757518e-06, + "loss": 0.0024, + "num_input_tokens_seen": 258334064, + "step": 119595 + }, + { + "epoch": 19.510603588907014, + "grad_norm": 0.0058241235092282295, + "learning_rate": 1.8240701973826213e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258342704, + "step": 119600 + }, + { + "epoch": 19.51141924959217, + "grad_norm": 0.2714917063713074, + "learning_rate": 1.8180007359567263e-06, + "loss": 0.0048, + "num_input_tokens_seen": 258353296, + "step": 119605 + }, + { + "epoch": 19.512234910277325, + "grad_norm": 0.0011957393726333976, + "learning_rate": 1.8119413708210243e-06, + "loss": 0.002, + "num_input_tokens_seen": 258364944, + "step": 119610 + }, + { + "epoch": 19.51305057096248, + "grad_norm": 0.0009538216982036829, + "learning_rate": 1.8058921020983055e-06, + "loss": 0.0008, + "num_input_tokens_seen": 258376592, + "step": 119615 + }, + { + "epoch": 19.513866231647633, + "grad_norm": 0.0010454310104250908, + "learning_rate": 1.7998529299111944e-06, + "loss": 0.0062, + "num_input_tokens_seen": 258387280, + "step": 119620 + }, + { + "epoch": 19.51468189233279, + "grad_norm": 0.006765805184841156, + "learning_rate": 1.7938238543820928e-06, + "loss": 0.0009, + "num_input_tokens_seen": 258397104, + "step": 119625 + }, + { + "epoch": 19.515497553017944, + "grad_norm": 0.011556816287338734, + "learning_rate": 1.7878048756331256e-06, + "loss": 0.0594, + "num_input_tokens_seen": 258407344, + "step": 119630 + }, + { + "epoch": 19.5163132137031, + "grad_norm": 0.048716992139816284, + "learning_rate": 1.7817959937863615e-06, + "loss": 0.0569, + "num_input_tokens_seen": 258418416, + "step": 119635 + }, + { + "epoch": 19.517128874388256, + "grad_norm": 0.01016552746295929, + "learning_rate": 1.7757972089635367e-06, + "loss": 0.0012, + "num_input_tokens_seen": 258428176, + "step": 119640 + }, + { + "epoch": 19.517944535073408, + "grad_norm": 0.005958016030490398, + "learning_rate": 1.7698085212862203e-06, + "loss": 0.002, + "num_input_tokens_seen": 258439952, + "step": 119645 + }, + { + "epoch": 19.518760195758563, + "grad_norm": 0.0012907384661957622, + "learning_rate": 1.76382993087576e-06, + "loss": 0.0026, + "num_input_tokens_seen": 258449424, + "step": 119650 + }, + { + "epoch": 19.51957585644372, + "grad_norm": 0.0009790941840037704, + "learning_rate": 1.7578614378533365e-06, + "loss": 0.0015, + "num_input_tokens_seen": 258461264, + "step": 119655 + }, + { + "epoch": 19.520391517128875, + "grad_norm": 0.03771071508526802, + "learning_rate": 1.751903042339964e-06, + "loss": 0.0021, + "num_input_tokens_seen": 258472688, + "step": 119660 + }, + { + "epoch": 19.52120717781403, + "grad_norm": 0.0106875104829669, + "learning_rate": 1.745954744456324e-06, + "loss": 0.0124, + "num_input_tokens_seen": 258484080, + "step": 119665 + }, + { + "epoch": 19.522022838499183, + "grad_norm": 0.0014177068369463086, + "learning_rate": 1.7400165443229865e-06, + "loss": 0.12, + "num_input_tokens_seen": 258494512, + "step": 119670 + }, + { + "epoch": 19.52283849918434, + "grad_norm": 0.010459963232278824, + "learning_rate": 1.7340884420603e-06, + "loss": 0.0485, + "num_input_tokens_seen": 258505200, + "step": 119675 + }, + { + "epoch": 19.523654159869494, + "grad_norm": 0.006741983816027641, + "learning_rate": 1.7281704377884454e-06, + "loss": 0.0012, + "num_input_tokens_seen": 258516656, + "step": 119680 + }, + { + "epoch": 19.52446982055465, + "grad_norm": 0.0004489065904635936, + "learning_rate": 1.7222625316272723e-06, + "loss": 0.0018, + "num_input_tokens_seen": 258527408, + "step": 119685 + }, + { + "epoch": 19.525285481239806, + "grad_norm": 0.0006436831317842007, + "learning_rate": 1.7163647236965728e-06, + "loss": 0.0022, + "num_input_tokens_seen": 258538064, + "step": 119690 + }, + { + "epoch": 19.526101141924958, + "grad_norm": 0.000623969070147723, + "learning_rate": 1.7104770141158631e-06, + "loss": 0.0021, + "num_input_tokens_seen": 258550448, + "step": 119695 + }, + { + "epoch": 19.526916802610113, + "grad_norm": 0.00015011659706942737, + "learning_rate": 1.704599403004492e-06, + "loss": 0.0002, + "num_input_tokens_seen": 258561488, + "step": 119700 + }, + { + "epoch": 19.52773246329527, + "grad_norm": 0.026546292006969452, + "learning_rate": 1.6987318904814753e-06, + "loss": 0.0012, + "num_input_tokens_seen": 258571792, + "step": 119705 + }, + { + "epoch": 19.528548123980425, + "grad_norm": 0.0041951765306293964, + "learning_rate": 1.6928744766658844e-06, + "loss": 0.0025, + "num_input_tokens_seen": 258582288, + "step": 119710 + }, + { + "epoch": 19.52936378466558, + "grad_norm": 0.00497661717236042, + "learning_rate": 1.687027161676291e-06, + "loss": 0.0029, + "num_input_tokens_seen": 258593104, + "step": 119715 + }, + { + "epoch": 19.530179445350733, + "grad_norm": 0.021917784586548805, + "learning_rate": 1.6811899456312119e-06, + "loss": 0.0011, + "num_input_tokens_seen": 258603728, + "step": 119720 + }, + { + "epoch": 19.53099510603589, + "grad_norm": 0.0025708836037665606, + "learning_rate": 1.6753628286490518e-06, + "loss": 0.0002, + "num_input_tokens_seen": 258613968, + "step": 119725 + }, + { + "epoch": 19.531810766721044, + "grad_norm": 0.00024284885148517787, + "learning_rate": 1.6695458108477724e-06, + "loss": 0.0016, + "num_input_tokens_seen": 258624304, + "step": 119730 + }, + { + "epoch": 19.5326264274062, + "grad_norm": 0.009805253706872463, + "learning_rate": 1.66373889234539e-06, + "loss": 0.0016, + "num_input_tokens_seen": 258636432, + "step": 119735 + }, + { + "epoch": 19.533442088091356, + "grad_norm": 0.0007772601675242186, + "learning_rate": 1.6579420732594774e-06, + "loss": 0.0018, + "num_input_tokens_seen": 258647248, + "step": 119740 + }, + { + "epoch": 19.534257748776508, + "grad_norm": 0.016048630699515343, + "learning_rate": 1.6521553537075518e-06, + "loss": 0.0014, + "num_input_tokens_seen": 258658768, + "step": 119745 + }, + { + "epoch": 19.535073409461663, + "grad_norm": 0.07577808946371078, + "learning_rate": 1.646378733806908e-06, + "loss": 0.0022, + "num_input_tokens_seen": 258669648, + "step": 119750 + }, + { + "epoch": 19.53588907014682, + "grad_norm": 0.00451402785256505, + "learning_rate": 1.6406122136746193e-06, + "loss": 0.0009, + "num_input_tokens_seen": 258679376, + "step": 119755 + }, + { + "epoch": 19.536704730831975, + "grad_norm": 0.0001716844126349315, + "learning_rate": 1.634855793427481e-06, + "loss": 0.0022, + "num_input_tokens_seen": 258690288, + "step": 119760 + }, + { + "epoch": 19.53752039151713, + "grad_norm": 0.0004944111569784582, + "learning_rate": 1.6291094731822886e-06, + "loss": 0.0014, + "num_input_tokens_seen": 258701584, + "step": 119765 + }, + { + "epoch": 19.538336052202283, + "grad_norm": 0.0008311573183164, + "learning_rate": 1.6233732530553935e-06, + "loss": 0.0018, + "num_input_tokens_seen": 258711920, + "step": 119770 + }, + { + "epoch": 19.53915171288744, + "grad_norm": 0.005382438190281391, + "learning_rate": 1.6176471331630915e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258722224, + "step": 119775 + }, + { + "epoch": 19.539967373572594, + "grad_norm": 0.007278508972376585, + "learning_rate": 1.6119311136213455e-06, + "loss": 0.0005, + "num_input_tokens_seen": 258733520, + "step": 119780 + }, + { + "epoch": 19.54078303425775, + "grad_norm": 0.0013596662320196629, + "learning_rate": 1.6062251945461737e-06, + "loss": 0.0008, + "num_input_tokens_seen": 258744368, + "step": 119785 + }, + { + "epoch": 19.541598694942905, + "grad_norm": 0.04996887966990471, + "learning_rate": 1.6005293760530393e-06, + "loss": 0.0204, + "num_input_tokens_seen": 258755088, + "step": 119790 + }, + { + "epoch": 19.542414355628058, + "grad_norm": 0.030599098652601242, + "learning_rate": 1.594843658257461e-06, + "loss": 0.0014, + "num_input_tokens_seen": 258764976, + "step": 119795 + }, + { + "epoch": 19.543230016313213, + "grad_norm": 0.006039235275238752, + "learning_rate": 1.5891680412746246e-06, + "loss": 0.0008, + "num_input_tokens_seen": 258776528, + "step": 119800 + }, + { + "epoch": 19.54404567699837, + "grad_norm": 0.0015127577353268862, + "learning_rate": 1.5835025252196044e-06, + "loss": 0.0029, + "num_input_tokens_seen": 258787248, + "step": 119805 + }, + { + "epoch": 19.544861337683525, + "grad_norm": 0.005725604481995106, + "learning_rate": 1.5778471102071423e-06, + "loss": 0.0005, + "num_input_tokens_seen": 258798416, + "step": 119810 + }, + { + "epoch": 19.545676998368677, + "grad_norm": 0.015134379267692566, + "learning_rate": 1.572201796351924e-06, + "loss": 0.0028, + "num_input_tokens_seen": 258809680, + "step": 119815 + }, + { + "epoch": 19.546492659053833, + "grad_norm": 0.010965199209749699, + "learning_rate": 1.5665665837683584e-06, + "loss": 0.003, + "num_input_tokens_seen": 258820528, + "step": 119820 + }, + { + "epoch": 19.54730831973899, + "grad_norm": 0.000573936675209552, + "learning_rate": 1.5609414725706317e-06, + "loss": 0.0026, + "num_input_tokens_seen": 258831056, + "step": 119825 + }, + { + "epoch": 19.548123980424144, + "grad_norm": 0.011310549452900887, + "learning_rate": 1.5553264628727082e-06, + "loss": 0.0009, + "num_input_tokens_seen": 258842736, + "step": 119830 + }, + { + "epoch": 19.5489396411093, + "grad_norm": 0.013286259956657887, + "learning_rate": 1.5497215547884414e-06, + "loss": 0.0804, + "num_input_tokens_seen": 258853872, + "step": 119835 + }, + { + "epoch": 19.549755301794452, + "grad_norm": 0.02167029306292534, + "learning_rate": 1.544126748431407e-06, + "loss": 0.0234, + "num_input_tokens_seen": 258864144, + "step": 119840 + }, + { + "epoch": 19.550570962479608, + "grad_norm": 0.0003503776097204536, + "learning_rate": 1.538542043914959e-06, + "loss": 0.0055, + "num_input_tokens_seen": 258874992, + "step": 119845 + }, + { + "epoch": 19.551386623164763, + "grad_norm": 0.0004293158417567611, + "learning_rate": 1.5329674413522843e-06, + "loss": 0.0004, + "num_input_tokens_seen": 258884880, + "step": 119850 + }, + { + "epoch": 19.55220228384992, + "grad_norm": 0.007084133103489876, + "learning_rate": 1.527402940856404e-06, + "loss": 0.0009, + "num_input_tokens_seen": 258896016, + "step": 119855 + }, + { + "epoch": 19.553017944535075, + "grad_norm": 0.13451212644577026, + "learning_rate": 1.5218485425400607e-06, + "loss": 0.0141, + "num_input_tokens_seen": 258906096, + "step": 119860 + }, + { + "epoch": 19.553833605220227, + "grad_norm": 0.0023989281617105007, + "learning_rate": 1.516304246515776e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258916432, + "step": 119865 + }, + { + "epoch": 19.554649265905383, + "grad_norm": 0.007038596551865339, + "learning_rate": 1.5107700528960156e-06, + "loss": 0.0015, + "num_input_tokens_seen": 258926416, + "step": 119870 + }, + { + "epoch": 19.55546492659054, + "grad_norm": 0.0006149518303573132, + "learning_rate": 1.505245961792856e-06, + "loss": 0.0015, + "num_input_tokens_seen": 258938608, + "step": 119875 + }, + { + "epoch": 19.556280587275694, + "grad_norm": 0.0030068473424762487, + "learning_rate": 1.4997319733182636e-06, + "loss": 0.0008, + "num_input_tokens_seen": 258948880, + "step": 119880 + }, + { + "epoch": 19.55709624796085, + "grad_norm": 0.0009674095781520009, + "learning_rate": 1.494228087583982e-06, + "loss": 0.0012, + "num_input_tokens_seen": 258957936, + "step": 119885 + }, + { + "epoch": 19.557911908646002, + "grad_norm": 0.0016425395151600242, + "learning_rate": 1.4887343047016444e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258969552, + "step": 119890 + }, + { + "epoch": 19.558727569331158, + "grad_norm": 0.010517501272261143, + "learning_rate": 1.4832506247824396e-06, + "loss": 0.0051, + "num_input_tokens_seen": 258979856, + "step": 119895 + }, + { + "epoch": 19.559543230016313, + "grad_norm": 0.0007454265141859651, + "learning_rate": 1.4777770479376118e-06, + "loss": 0.0016, + "num_input_tokens_seen": 258990160, + "step": 119900 + }, + { + "epoch": 19.56035889070147, + "grad_norm": 0.0038657390978187323, + "learning_rate": 1.472313574278017e-06, + "loss": 0.0009, + "num_input_tokens_seen": 259001168, + "step": 119905 + }, + { + "epoch": 19.561174551386625, + "grad_norm": 0.0014202585443854332, + "learning_rate": 1.4668602039144551e-06, + "loss": 0.0006, + "num_input_tokens_seen": 259012432, + "step": 119910 + }, + { + "epoch": 19.561990212071777, + "grad_norm": 0.11280523985624313, + "learning_rate": 1.4614169369573382e-06, + "loss": 0.0048, + "num_input_tokens_seen": 259023824, + "step": 119915 + }, + { + "epoch": 19.562805872756933, + "grad_norm": 0.002691589528694749, + "learning_rate": 1.4559837735171333e-06, + "loss": 0.0015, + "num_input_tokens_seen": 259034896, + "step": 119920 + }, + { + "epoch": 19.563621533442088, + "grad_norm": 0.001120995031669736, + "learning_rate": 1.450560713703808e-06, + "loss": 0.0016, + "num_input_tokens_seen": 259045680, + "step": 119925 + }, + { + "epoch": 19.564437194127244, + "grad_norm": 0.14628851413726807, + "learning_rate": 1.4451477576273298e-06, + "loss": 0.0029, + "num_input_tokens_seen": 259056944, + "step": 119930 + }, + { + "epoch": 19.5652528548124, + "grad_norm": 0.0006048278883099556, + "learning_rate": 1.4397449053973888e-06, + "loss": 0.0006, + "num_input_tokens_seen": 259067120, + "step": 119935 + }, + { + "epoch": 19.56606851549755, + "grad_norm": 0.005681201349943876, + "learning_rate": 1.4343521571235086e-06, + "loss": 0.0012, + "num_input_tokens_seen": 259077296, + "step": 119940 + }, + { + "epoch": 19.566884176182707, + "grad_norm": 0.0018342856783419847, + "learning_rate": 1.4289695129149349e-06, + "loss": 0.0024, + "num_input_tokens_seen": 259087632, + "step": 119945 + }, + { + "epoch": 19.567699836867863, + "grad_norm": 0.005485404282808304, + "learning_rate": 1.423596972880803e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259096656, + "step": 119950 + }, + { + "epoch": 19.56851549755302, + "grad_norm": 0.25319576263427734, + "learning_rate": 1.4182345371299699e-06, + "loss": 0.0058, + "num_input_tokens_seen": 259108304, + "step": 119955 + }, + { + "epoch": 19.569331158238175, + "grad_norm": 0.002403180580586195, + "learning_rate": 1.412882205771071e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259117456, + "step": 119960 + }, + { + "epoch": 19.570146818923327, + "grad_norm": 0.001975890714675188, + "learning_rate": 1.4075399789126308e-06, + "loss": 0.0006, + "num_input_tokens_seen": 259128528, + "step": 119965 + }, + { + "epoch": 19.570962479608482, + "grad_norm": 0.00043759553227573633, + "learning_rate": 1.4022078566629515e-06, + "loss": 0.0016, + "num_input_tokens_seen": 259140016, + "step": 119970 + }, + { + "epoch": 19.571778140293638, + "grad_norm": 0.2719343602657318, + "learning_rate": 1.396885839130002e-06, + "loss": 0.0081, + "num_input_tokens_seen": 259150064, + "step": 119975 + }, + { + "epoch": 19.572593800978794, + "grad_norm": 0.0024640439078211784, + "learning_rate": 1.3915739264216964e-06, + "loss": 0.0032, + "num_input_tokens_seen": 259161360, + "step": 119980 + }, + { + "epoch": 19.57340946166395, + "grad_norm": 0.009985164739191532, + "learning_rate": 1.3862721186456706e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259172720, + "step": 119985 + }, + { + "epoch": 19.5742251223491, + "grad_norm": 0.0014474820345640182, + "learning_rate": 1.3809804159093386e-06, + "loss": 0.0012, + "num_input_tokens_seen": 259184336, + "step": 119990 + }, + { + "epoch": 19.575040783034257, + "grad_norm": 0.0026534050703048706, + "learning_rate": 1.3756988183200037e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259195120, + "step": 119995 + }, + { + "epoch": 19.575856443719413, + "grad_norm": 0.00017664516053628176, + "learning_rate": 1.3704273259847467e-06, + "loss": 0.0011, + "num_input_tokens_seen": 259207024, + "step": 120000 + }, + { + "epoch": 19.57667210440457, + "grad_norm": 0.06056229770183563, + "learning_rate": 1.36516593901026e-06, + "loss": 0.0053, + "num_input_tokens_seen": 259217872, + "step": 120005 + }, + { + "epoch": 19.57748776508972, + "grad_norm": 0.008258544839918613, + "learning_rate": 1.3599146575032363e-06, + "loss": 0.001, + "num_input_tokens_seen": 259228304, + "step": 120010 + }, + { + "epoch": 19.578303425774877, + "grad_norm": 0.0049018110148608685, + "learning_rate": 1.3546734815702012e-06, + "loss": 0.0012, + "num_input_tokens_seen": 259239632, + "step": 120015 + }, + { + "epoch": 19.579119086460032, + "grad_norm": 0.0018762719118967652, + "learning_rate": 1.349442411317181e-06, + "loss": 0.0017, + "num_input_tokens_seen": 259251024, + "step": 120020 + }, + { + "epoch": 19.579934747145188, + "grad_norm": 0.06191835179924965, + "learning_rate": 1.3442214468503688e-06, + "loss": 0.0029, + "num_input_tokens_seen": 259259856, + "step": 120025 + }, + { + "epoch": 19.580750407830344, + "grad_norm": 0.0006233254680410028, + "learning_rate": 1.3390105882754577e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259269936, + "step": 120030 + }, + { + "epoch": 19.581566068515496, + "grad_norm": 0.00015882418665569276, + "learning_rate": 1.333809835698141e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259279888, + "step": 120035 + }, + { + "epoch": 19.58238172920065, + "grad_norm": 0.01703408546745777, + "learning_rate": 1.3286191892237231e-06, + "loss": 0.0041, + "num_input_tokens_seen": 259290608, + "step": 120040 + }, + { + "epoch": 19.583197389885807, + "grad_norm": 0.00424957275390625, + "learning_rate": 1.323438648957509e-06, + "loss": 0.0563, + "num_input_tokens_seen": 259301808, + "step": 120045 + }, + { + "epoch": 19.584013050570963, + "grad_norm": 0.04442833736538887, + "learning_rate": 1.318268215004359e-06, + "loss": 0.0027, + "num_input_tokens_seen": 259311472, + "step": 120050 + }, + { + "epoch": 19.58482871125612, + "grad_norm": 0.006586727686226368, + "learning_rate": 1.3131078874691337e-06, + "loss": 0.0032, + "num_input_tokens_seen": 259322416, + "step": 120055 + }, + { + "epoch": 19.58564437194127, + "grad_norm": 0.00016416041762568057, + "learning_rate": 1.3079576664564163e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259333616, + "step": 120060 + }, + { + "epoch": 19.586460032626427, + "grad_norm": 0.002335761673748493, + "learning_rate": 1.302817552070623e-06, + "loss": 0.0009, + "num_input_tokens_seen": 259343248, + "step": 120065 + }, + { + "epoch": 19.587275693311582, + "grad_norm": 0.0020196493715047836, + "learning_rate": 1.297687544415782e-06, + "loss": 0.0008, + "num_input_tokens_seen": 259353008, + "step": 120070 + }, + { + "epoch": 19.588091353996738, + "grad_norm": 0.03180438652634621, + "learning_rate": 1.292567643596032e-06, + "loss": 0.0011, + "num_input_tokens_seen": 259365040, + "step": 120075 + }, + { + "epoch": 19.588907014681894, + "grad_norm": 0.0011726239463314414, + "learning_rate": 1.2874578497150125e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259376688, + "step": 120080 + }, + { + "epoch": 19.589722675367046, + "grad_norm": 0.0019082275684922934, + "learning_rate": 1.282358162876307e-06, + "loss": 0.0122, + "num_input_tokens_seen": 259388592, + "step": 120085 + }, + { + "epoch": 19.5905383360522, + "grad_norm": 0.0001999034866457805, + "learning_rate": 1.277268583183333e-06, + "loss": 0.0011, + "num_input_tokens_seen": 259399408, + "step": 120090 + }, + { + "epoch": 19.591353996737357, + "grad_norm": 0.0022268639877438545, + "learning_rate": 1.2721891107391192e-06, + "loss": 0.0008, + "num_input_tokens_seen": 259410224, + "step": 120095 + }, + { + "epoch": 19.592169657422513, + "grad_norm": 0.0030834621284157038, + "learning_rate": 1.2671197456467497e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259420048, + "step": 120100 + }, + { + "epoch": 19.59298531810767, + "grad_norm": 0.0024934473913162947, + "learning_rate": 1.2620604880088093e-06, + "loss": 0.0017, + "num_input_tokens_seen": 259430864, + "step": 120105 + }, + { + "epoch": 19.59380097879282, + "grad_norm": 0.019052933901548386, + "learning_rate": 1.2570113379279936e-06, + "loss": 0.0006, + "num_input_tokens_seen": 259442128, + "step": 120110 + }, + { + "epoch": 19.594616639477977, + "grad_norm": 0.051151152700185776, + "learning_rate": 1.2519722955064982e-06, + "loss": 0.0058, + "num_input_tokens_seen": 259453360, + "step": 120115 + }, + { + "epoch": 19.595432300163132, + "grad_norm": 0.0011440202360972762, + "learning_rate": 1.2469433608464642e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259464112, + "step": 120120 + }, + { + "epoch": 19.596247960848288, + "grad_norm": 0.0026353909634053707, + "learning_rate": 1.2419245340498652e-06, + "loss": 0.0007, + "num_input_tokens_seen": 259475088, + "step": 120125 + }, + { + "epoch": 19.597063621533444, + "grad_norm": 0.0006127303349785507, + "learning_rate": 1.236915815218398e-06, + "loss": 0.023, + "num_input_tokens_seen": 259486160, + "step": 120130 + }, + { + "epoch": 19.597879282218596, + "grad_norm": 0.0013539056526497006, + "learning_rate": 1.2319172044535365e-06, + "loss": 0.0016, + "num_input_tokens_seen": 259496880, + "step": 120135 + }, + { + "epoch": 19.59869494290375, + "grad_norm": 0.0010810773819684982, + "learning_rate": 1.2269287018565888e-06, + "loss": 0.0006, + "num_input_tokens_seen": 259508336, + "step": 120140 + }, + { + "epoch": 19.599510603588907, + "grad_norm": 0.00026650080690160394, + "learning_rate": 1.2219503075286963e-06, + "loss": 0.0007, + "num_input_tokens_seen": 259518896, + "step": 120145 + }, + { + "epoch": 19.600326264274063, + "grad_norm": 0.001171777956187725, + "learning_rate": 1.2169820215707228e-06, + "loss": 0.0022, + "num_input_tokens_seen": 259531184, + "step": 120150 + }, + { + "epoch": 19.601141924959215, + "grad_norm": 0.0004032772849313915, + "learning_rate": 1.2120238440833653e-06, + "loss": 0.0067, + "num_input_tokens_seen": 259542768, + "step": 120155 + }, + { + "epoch": 19.60195758564437, + "grad_norm": 0.0009708287543617189, + "learning_rate": 1.207075775167099e-06, + "loss": 0.0022, + "num_input_tokens_seen": 259551856, + "step": 120160 + }, + { + "epoch": 19.602773246329527, + "grad_norm": 0.0016535356407985091, + "learning_rate": 1.2021378149221773e-06, + "loss": 0.0008, + "num_input_tokens_seen": 259561328, + "step": 120165 + }, + { + "epoch": 19.603588907014682, + "grad_norm": 0.003574906848371029, + "learning_rate": 1.1972099634487422e-06, + "loss": 0.0403, + "num_input_tokens_seen": 259571248, + "step": 120170 + }, + { + "epoch": 19.604404567699838, + "grad_norm": 0.0005903760902583599, + "learning_rate": 1.1922922208466026e-06, + "loss": 0.0007, + "num_input_tokens_seen": 259582544, + "step": 120175 + }, + { + "epoch": 19.605220228384994, + "grad_norm": 0.0017175710527226329, + "learning_rate": 1.1873845872154565e-06, + "loss": 0.0078, + "num_input_tokens_seen": 259593936, + "step": 120180 + }, + { + "epoch": 19.606035889070146, + "grad_norm": 0.00047921930672600865, + "learning_rate": 1.1824870626547247e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259604336, + "step": 120185 + }, + { + "epoch": 19.6068515497553, + "grad_norm": 0.006056176032871008, + "learning_rate": 1.1775996472637163e-06, + "loss": 0.0007, + "num_input_tokens_seen": 259615696, + "step": 120190 + }, + { + "epoch": 19.607667210440457, + "grad_norm": 0.00036932036164216697, + "learning_rate": 1.1727223411414078e-06, + "loss": 0.0012, + "num_input_tokens_seen": 259627536, + "step": 120195 + }, + { + "epoch": 19.608482871125613, + "grad_norm": 0.001396836363710463, + "learning_rate": 1.1678551443867203e-06, + "loss": 0.0008, + "num_input_tokens_seen": 259637040, + "step": 120200 + }, + { + "epoch": 19.609298531810765, + "grad_norm": 0.004054496064782143, + "learning_rate": 1.1629980570982967e-06, + "loss": 0.0018, + "num_input_tokens_seen": 259648336, + "step": 120205 + }, + { + "epoch": 19.61011419249592, + "grad_norm": 0.017525173723697662, + "learning_rate": 1.1581510793745032e-06, + "loss": 0.0015, + "num_input_tokens_seen": 259660304, + "step": 120210 + }, + { + "epoch": 19.610929853181077, + "grad_norm": 0.00820534024387598, + "learning_rate": 1.153314211313594e-06, + "loss": 0.0008, + "num_input_tokens_seen": 259671120, + "step": 120215 + }, + { + "epoch": 19.611745513866232, + "grad_norm": 0.0011833886383101344, + "learning_rate": 1.1484874530136025e-06, + "loss": 0.0006, + "num_input_tokens_seen": 259681968, + "step": 120220 + }, + { + "epoch": 19.612561174551388, + "grad_norm": 0.01561807096004486, + "learning_rate": 1.1436708045723388e-06, + "loss": 0.001, + "num_input_tokens_seen": 259691568, + "step": 120225 + }, + { + "epoch": 19.61337683523654, + "grad_norm": 0.0010521980002522469, + "learning_rate": 1.1388642660875025e-06, + "loss": 0.0012, + "num_input_tokens_seen": 259703248, + "step": 120230 + }, + { + "epoch": 19.614192495921696, + "grad_norm": 0.05352751538157463, + "learning_rate": 1.1340678376563495e-06, + "loss": 0.0025, + "num_input_tokens_seen": 259714224, + "step": 120235 + }, + { + "epoch": 19.61500815660685, + "grad_norm": 0.00045512960059568286, + "learning_rate": 1.1292815193761907e-06, + "loss": 0.0014, + "num_input_tokens_seen": 259725808, + "step": 120240 + }, + { + "epoch": 19.615823817292007, + "grad_norm": 0.27281224727630615, + "learning_rate": 1.1245053113440596e-06, + "loss": 0.0098, + "num_input_tokens_seen": 259737168, + "step": 120245 + }, + { + "epoch": 19.616639477977163, + "grad_norm": 0.002132084220647812, + "learning_rate": 1.1197392136566565e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259747760, + "step": 120250 + }, + { + "epoch": 19.617455138662315, + "grad_norm": 0.038451775908470154, + "learning_rate": 1.114983226410571e-06, + "loss": 0.0009, + "num_input_tokens_seen": 259759632, + "step": 120255 + }, + { + "epoch": 19.61827079934747, + "grad_norm": 1.293935775756836, + "learning_rate": 1.110237349702281e-06, + "loss": 0.1449, + "num_input_tokens_seen": 259770672, + "step": 120260 + }, + { + "epoch": 19.619086460032626, + "grad_norm": 0.003990166820585728, + "learning_rate": 1.1055015836279326e-06, + "loss": 0.0027, + "num_input_tokens_seen": 259781360, + "step": 120265 + }, + { + "epoch": 19.619902120717782, + "grad_norm": 0.0009797315578907728, + "learning_rate": 1.1007759282834484e-06, + "loss": 0.0006, + "num_input_tokens_seen": 259792656, + "step": 120270 + }, + { + "epoch": 19.620717781402938, + "grad_norm": 0.03197610005736351, + "learning_rate": 1.096060383764641e-06, + "loss": 0.0007, + "num_input_tokens_seen": 259802288, + "step": 120275 + }, + { + "epoch": 19.62153344208809, + "grad_norm": 0.01582932658493519, + "learning_rate": 1.0913549501671004e-06, + "loss": 0.0007, + "num_input_tokens_seen": 259812400, + "step": 120280 + }, + { + "epoch": 19.622349102773246, + "grad_norm": 0.0015432540094479918, + "learning_rate": 1.0866596275861395e-06, + "loss": 0.0071, + "num_input_tokens_seen": 259823728, + "step": 120285 + }, + { + "epoch": 19.6231647634584, + "grad_norm": 0.0003914940753020346, + "learning_rate": 1.0819744161169597e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259835408, + "step": 120290 + }, + { + "epoch": 19.623980424143557, + "grad_norm": 0.013478913344442844, + "learning_rate": 1.0772993158544297e-06, + "loss": 0.0032, + "num_input_tokens_seen": 259845616, + "step": 120295 + }, + { + "epoch": 19.624796084828713, + "grad_norm": 0.0010959201026707888, + "learning_rate": 1.072634326893418e-06, + "loss": 0.0033, + "num_input_tokens_seen": 259856464, + "step": 120300 + }, + { + "epoch": 19.625611745513865, + "grad_norm": 0.005378579255193472, + "learning_rate": 1.0679794493284045e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259868720, + "step": 120305 + }, + { + "epoch": 19.62642740619902, + "grad_norm": 0.015822935849428177, + "learning_rate": 1.0633346832537026e-06, + "loss": 0.0024, + "num_input_tokens_seen": 259879600, + "step": 120310 + }, + { + "epoch": 19.627243066884176, + "grad_norm": 0.0004820248286705464, + "learning_rate": 1.0587000287634596e-06, + "loss": 0.0008, + "num_input_tokens_seen": 259889232, + "step": 120315 + }, + { + "epoch": 19.628058727569332, + "grad_norm": 0.010135025717318058, + "learning_rate": 1.0540754859516554e-06, + "loss": 0.0008, + "num_input_tokens_seen": 259900240, + "step": 120320 + }, + { + "epoch": 19.628874388254488, + "grad_norm": 0.07188961654901505, + "learning_rate": 1.0494610549119377e-06, + "loss": 0.0028, + "num_input_tokens_seen": 259910992, + "step": 120325 + }, + { + "epoch": 19.62969004893964, + "grad_norm": 0.010499064810574055, + "learning_rate": 1.0448567357378424e-06, + "loss": 0.0016, + "num_input_tokens_seen": 259922224, + "step": 120330 + }, + { + "epoch": 19.630505709624796, + "grad_norm": 0.01909225806593895, + "learning_rate": 1.0402625285227396e-06, + "loss": 0.0015, + "num_input_tokens_seen": 259932944, + "step": 120335 + }, + { + "epoch": 19.63132137030995, + "grad_norm": 0.20919649302959442, + "learning_rate": 1.0356784333596658e-06, + "loss": 0.0064, + "num_input_tokens_seen": 259944272, + "step": 120340 + }, + { + "epoch": 19.632137030995107, + "grad_norm": 0.1292264610528946, + "learning_rate": 1.0311044503415468e-06, + "loss": 0.0029, + "num_input_tokens_seen": 259953232, + "step": 120345 + }, + { + "epoch": 19.63295269168026, + "grad_norm": 0.05426434054970741, + "learning_rate": 1.026540579561086e-06, + "loss": 0.002, + "num_input_tokens_seen": 259963856, + "step": 120350 + }, + { + "epoch": 19.633768352365415, + "grad_norm": 0.00942949764430523, + "learning_rate": 1.0219868211108208e-06, + "loss": 0.0013, + "num_input_tokens_seen": 259974992, + "step": 120355 + }, + { + "epoch": 19.63458401305057, + "grad_norm": 0.0067382687702775, + "learning_rate": 1.0174431750828993e-06, + "loss": 0.0065, + "num_input_tokens_seen": 259986800, + "step": 120360 + }, + { + "epoch": 19.635399673735726, + "grad_norm": 0.014512408524751663, + "learning_rate": 1.0129096415695816e-06, + "loss": 0.0056, + "num_input_tokens_seen": 259997552, + "step": 120365 + }, + { + "epoch": 19.636215334420882, + "grad_norm": 0.006454018875956535, + "learning_rate": 1.008386220662627e-06, + "loss": 0.0009, + "num_input_tokens_seen": 260008912, + "step": 120370 + }, + { + "epoch": 19.637030995106034, + "grad_norm": 0.0028814023826271296, + "learning_rate": 1.0038729124537405e-06, + "loss": 0.0027, + "num_input_tokens_seen": 260020752, + "step": 120375 + }, + { + "epoch": 19.63784665579119, + "grad_norm": 0.00044552632607519627, + "learning_rate": 9.993697170343485e-07, + "loss": 0.0009, + "num_input_tokens_seen": 260032176, + "step": 120380 + }, + { + "epoch": 19.638662316476346, + "grad_norm": 0.07475344091653824, + "learning_rate": 9.948766344958227e-07, + "loss": 0.003, + "num_input_tokens_seen": 260043216, + "step": 120385 + }, + { + "epoch": 19.6394779771615, + "grad_norm": 0.000331522838678211, + "learning_rate": 9.9039366492909e-07, + "loss": 0.0008, + "num_input_tokens_seen": 260054192, + "step": 120390 + }, + { + "epoch": 19.640293637846657, + "grad_norm": 0.0012437768746167421, + "learning_rate": 9.859208084251337e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260063792, + "step": 120395 + }, + { + "epoch": 19.64110929853181, + "grad_norm": 0.0010564016411080956, + "learning_rate": 9.81458065074492e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260075120, + "step": 120400 + }, + { + "epoch": 19.641924959216965, + "grad_norm": 0.0017424465622752905, + "learning_rate": 9.770054349677037e-07, + "loss": 0.0006, + "num_input_tokens_seen": 260085456, + "step": 120405 + }, + { + "epoch": 19.64274061990212, + "grad_norm": 0.0012548412196338177, + "learning_rate": 9.725629181949192e-07, + "loss": 0.001, + "num_input_tokens_seen": 260096080, + "step": 120410 + }, + { + "epoch": 19.643556280587276, + "grad_norm": 0.005961594637483358, + "learning_rate": 9.681305148462328e-07, + "loss": 0.0012, + "num_input_tokens_seen": 260106320, + "step": 120415 + }, + { + "epoch": 19.644371941272432, + "grad_norm": 0.0003299217496532947, + "learning_rate": 9.63708225011406e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260118096, + "step": 120420 + }, + { + "epoch": 19.645187601957584, + "grad_norm": 0.0025435383431613445, + "learning_rate": 9.59296048780145e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260129232, + "step": 120425 + }, + { + "epoch": 19.64600326264274, + "grad_norm": 0.00037977020838297904, + "learning_rate": 9.54893986241767e-07, + "loss": 0.0079, + "num_input_tokens_seen": 260139728, + "step": 120430 + }, + { + "epoch": 19.646818923327896, + "grad_norm": 0.003552139736711979, + "learning_rate": 9.505020374855899e-07, + "loss": 0.0009, + "num_input_tokens_seen": 260150800, + "step": 120435 + }, + { + "epoch": 19.64763458401305, + "grad_norm": 0.0030579909216612577, + "learning_rate": 9.461202026005978e-07, + "loss": 0.0035, + "num_input_tokens_seen": 260162224, + "step": 120440 + }, + { + "epoch": 19.648450244698207, + "grad_norm": 0.010102360509335995, + "learning_rate": 9.417484816755528e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260171984, + "step": 120445 + }, + { + "epoch": 19.64926590538336, + "grad_norm": 0.003113190643489361, + "learning_rate": 9.37386874799051e-07, + "loss": 0.0034, + "num_input_tokens_seen": 260182672, + "step": 120450 + }, + { + "epoch": 19.650081566068515, + "grad_norm": 0.002211271785199642, + "learning_rate": 9.330353820595217e-07, + "loss": 0.0107, + "num_input_tokens_seen": 260192912, + "step": 120455 + }, + { + "epoch": 19.65089722675367, + "grad_norm": 0.00032293720869347453, + "learning_rate": 9.286940035451718e-07, + "loss": 0.001, + "num_input_tokens_seen": 260204912, + "step": 120460 + }, + { + "epoch": 19.651712887438826, + "grad_norm": 0.07494331896305084, + "learning_rate": 9.243627393439313e-07, + "loss": 0.0024, + "num_input_tokens_seen": 260216976, + "step": 120465 + }, + { + "epoch": 19.652528548123982, + "grad_norm": 0.0016844982746988535, + "learning_rate": 9.200415895436187e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260227984, + "step": 120470 + }, + { + "epoch": 19.653344208809134, + "grad_norm": 0.0008055089274421334, + "learning_rate": 9.157305542317751e-07, + "loss": 0.0028, + "num_input_tokens_seen": 260238544, + "step": 120475 + }, + { + "epoch": 19.65415986949429, + "grad_norm": 0.007196464110165834, + "learning_rate": 9.11429633495775e-07, + "loss": 0.0132, + "num_input_tokens_seen": 260249360, + "step": 120480 + }, + { + "epoch": 19.654975530179446, + "grad_norm": 0.001205752487294376, + "learning_rate": 9.071388274228264e-07, + "loss": 0.0009, + "num_input_tokens_seen": 260260400, + "step": 120485 + }, + { + "epoch": 19.6557911908646, + "grad_norm": 0.009604084305465221, + "learning_rate": 9.028581360998045e-07, + "loss": 0.0031, + "num_input_tokens_seen": 260270032, + "step": 120490 + }, + { + "epoch": 19.656606851549757, + "grad_norm": 0.005923762917518616, + "learning_rate": 8.985875596135285e-07, + "loss": 0.0006, + "num_input_tokens_seen": 260281264, + "step": 120495 + }, + { + "epoch": 19.65742251223491, + "grad_norm": 0.003814230440184474, + "learning_rate": 8.943270980505957e-07, + "loss": 0.0009, + "num_input_tokens_seen": 260291568, + "step": 120500 + }, + { + "epoch": 19.658238172920065, + "grad_norm": 0.5724601149559021, + "learning_rate": 8.900767514972152e-07, + "loss": 0.0137, + "num_input_tokens_seen": 260303920, + "step": 120505 + }, + { + "epoch": 19.65905383360522, + "grad_norm": 0.004697203170508146, + "learning_rate": 8.858365200395957e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260314032, + "step": 120510 + }, + { + "epoch": 19.659869494290376, + "grad_norm": 0.22248674929141998, + "learning_rate": 8.816064037636684e-07, + "loss": 0.0052, + "num_input_tokens_seen": 260325008, + "step": 120515 + }, + { + "epoch": 19.660685154975532, + "grad_norm": 0.018114212900400162, + "learning_rate": 8.773864027551981e-07, + "loss": 0.0021, + "num_input_tokens_seen": 260336400, + "step": 120520 + }, + { + "epoch": 19.661500815660684, + "grad_norm": 0.0033652205020189285, + "learning_rate": 8.73176517099672e-07, + "loss": 0.0051, + "num_input_tokens_seen": 260347888, + "step": 120525 + }, + { + "epoch": 19.66231647634584, + "grad_norm": 0.019522542133927345, + "learning_rate": 8.689767468824105e-07, + "loss": 0.0008, + "num_input_tokens_seen": 260359216, + "step": 120530 + }, + { + "epoch": 19.663132137030995, + "grad_norm": 0.023196915164589882, + "learning_rate": 8.647870921885126e-07, + "loss": 0.004, + "num_input_tokens_seen": 260370096, + "step": 120535 + }, + { + "epoch": 19.66394779771615, + "grad_norm": 0.3506559431552887, + "learning_rate": 8.606075531029101e-07, + "loss": 0.0067, + "num_input_tokens_seen": 260380432, + "step": 120540 + }, + { + "epoch": 19.664763458401303, + "grad_norm": 0.003931544255465269, + "learning_rate": 8.564381297102575e-07, + "loss": 0.0171, + "num_input_tokens_seen": 260391536, + "step": 120545 + }, + { + "epoch": 19.66557911908646, + "grad_norm": 0.003058247035369277, + "learning_rate": 8.522788220951538e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260402320, + "step": 120550 + }, + { + "epoch": 19.666394779771615, + "grad_norm": 0.004594247788190842, + "learning_rate": 8.481296303418096e-07, + "loss": 0.001, + "num_input_tokens_seen": 260413776, + "step": 120555 + }, + { + "epoch": 19.66721044045677, + "grad_norm": 0.006736138369888067, + "learning_rate": 8.439905545343796e-07, + "loss": 0.0006, + "num_input_tokens_seen": 260425232, + "step": 120560 + }, + { + "epoch": 19.668026101141926, + "grad_norm": 0.003108308184891939, + "learning_rate": 8.398615947566302e-07, + "loss": 0.0018, + "num_input_tokens_seen": 260435792, + "step": 120565 + }, + { + "epoch": 19.66884176182708, + "grad_norm": 0.0003533684357535094, + "learning_rate": 8.357427510923832e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260446320, + "step": 120570 + }, + { + "epoch": 19.669657422512234, + "grad_norm": 0.0026117609813809395, + "learning_rate": 8.316340236249609e-07, + "loss": 0.0015, + "num_input_tokens_seen": 260458160, + "step": 120575 + }, + { + "epoch": 19.67047308319739, + "grad_norm": 0.0008695587166585028, + "learning_rate": 8.275354124377965e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260469712, + "step": 120580 + }, + { + "epoch": 19.671288743882545, + "grad_norm": 0.002099714009091258, + "learning_rate": 8.234469176138238e-07, + "loss": 0.0014, + "num_input_tokens_seen": 260481296, + "step": 120585 + }, + { + "epoch": 19.6721044045677, + "grad_norm": 0.008611966855823994, + "learning_rate": 8.193685392359762e-07, + "loss": 0.0008, + "num_input_tokens_seen": 260490928, + "step": 120590 + }, + { + "epoch": 19.672920065252853, + "grad_norm": 0.0001795227435650304, + "learning_rate": 8.153002773868546e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260501648, + "step": 120595 + }, + { + "epoch": 19.67373572593801, + "grad_norm": 0.005488082300871611, + "learning_rate": 8.112421321489483e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260512272, + "step": 120600 + }, + { + "epoch": 19.674551386623165, + "grad_norm": 0.0002953264338430017, + "learning_rate": 8.07194103604525e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260523088, + "step": 120605 + }, + { + "epoch": 19.67536704730832, + "grad_norm": 0.009819312021136284, + "learning_rate": 8.03156191835519e-07, + "loss": 0.0009, + "num_input_tokens_seen": 260534512, + "step": 120610 + }, + { + "epoch": 19.676182707993476, + "grad_norm": 0.0009091253159567714, + "learning_rate": 7.99128396923865e-07, + "loss": 0.0023, + "num_input_tokens_seen": 260543280, + "step": 120615 + }, + { + "epoch": 19.67699836867863, + "grad_norm": 0.022837691009044647, + "learning_rate": 7.951107189511641e-07, + "loss": 0.0013, + "num_input_tokens_seen": 260553712, + "step": 120620 + }, + { + "epoch": 19.677814029363784, + "grad_norm": 0.0009270032169297338, + "learning_rate": 7.91103157998796e-07, + "loss": 0.0028, + "num_input_tokens_seen": 260563632, + "step": 120625 + }, + { + "epoch": 19.67862969004894, + "grad_norm": 0.4371551275253296, + "learning_rate": 7.871057141480287e-07, + "loss": 0.0252, + "num_input_tokens_seen": 260573808, + "step": 120630 + }, + { + "epoch": 19.679445350734095, + "grad_norm": 0.0009796569356694818, + "learning_rate": 7.831183874798531e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260585808, + "step": 120635 + }, + { + "epoch": 19.68026101141925, + "grad_norm": 0.00811771210283041, + "learning_rate": 7.791411780750935e-07, + "loss": 0.0006, + "num_input_tokens_seen": 260596400, + "step": 120640 + }, + { + "epoch": 19.681076672104403, + "grad_norm": 0.20137394964694977, + "learning_rate": 7.751740860143519e-07, + "loss": 0.0064, + "num_input_tokens_seen": 260606640, + "step": 120645 + }, + { + "epoch": 19.68189233278956, + "grad_norm": 0.013613549061119556, + "learning_rate": 7.712171113780086e-07, + "loss": 0.0008, + "num_input_tokens_seen": 260618000, + "step": 120650 + }, + { + "epoch": 19.682707993474715, + "grad_norm": 0.0029773954302072525, + "learning_rate": 7.672702542462773e-07, + "loss": 0.0006, + "num_input_tokens_seen": 260629264, + "step": 120655 + }, + { + "epoch": 19.68352365415987, + "grad_norm": 0.0026561871636658907, + "learning_rate": 7.633335146991493e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260640368, + "step": 120660 + }, + { + "epoch": 19.684339314845026, + "grad_norm": 0.009900989942252636, + "learning_rate": 7.594068928163944e-07, + "loss": 0.0137, + "num_input_tokens_seen": 260650768, + "step": 120665 + }, + { + "epoch": 19.68515497553018, + "grad_norm": 0.018907103687524796, + "learning_rate": 7.554903886775599e-07, + "loss": 0.0014, + "num_input_tokens_seen": 260661680, + "step": 120670 + }, + { + "epoch": 19.685970636215334, + "grad_norm": 0.0027872032951563597, + "learning_rate": 7.515840023620824e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260673584, + "step": 120675 + }, + { + "epoch": 19.68678629690049, + "grad_norm": 0.011443251743912697, + "learning_rate": 7.476877339490651e-07, + "loss": 0.0011, + "num_input_tokens_seen": 260684912, + "step": 120680 + }, + { + "epoch": 19.687601957585645, + "grad_norm": 0.06114175543189049, + "learning_rate": 7.438015835175005e-07, + "loss": 0.0086, + "num_input_tokens_seen": 260694864, + "step": 120685 + }, + { + "epoch": 19.6884176182708, + "grad_norm": 0.0009734915802255273, + "learning_rate": 7.399255511461589e-07, + "loss": 0.0014, + "num_input_tokens_seen": 260704592, + "step": 120690 + }, + { + "epoch": 19.689233278955953, + "grad_norm": 0.0013609203742817044, + "learning_rate": 7.360596369135886e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260715888, + "step": 120695 + }, + { + "epoch": 19.69004893964111, + "grad_norm": 0.003440382657572627, + "learning_rate": 7.322038408981157e-07, + "loss": 0.0044, + "num_input_tokens_seen": 260725840, + "step": 120700 + }, + { + "epoch": 19.690864600326265, + "grad_norm": 0.0012171886628493667, + "learning_rate": 7.283581631779002e-07, + "loss": 0.0071, + "num_input_tokens_seen": 260736368, + "step": 120705 + }, + { + "epoch": 19.69168026101142, + "grad_norm": 0.015224111266434193, + "learning_rate": 7.245226038308794e-07, + "loss": 0.0876, + "num_input_tokens_seen": 260745648, + "step": 120710 + }, + { + "epoch": 19.692495921696576, + "grad_norm": 0.007610964123159647, + "learning_rate": 7.206971629348246e-07, + "loss": 0.0017, + "num_input_tokens_seen": 260755888, + "step": 120715 + }, + { + "epoch": 19.693311582381728, + "grad_norm": 0.0033253964502364397, + "learning_rate": 7.16881840567174e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260766768, + "step": 120720 + }, + { + "epoch": 19.694127243066884, + "grad_norm": 0.0012907941127195954, + "learning_rate": 7.130766368053099e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260778576, + "step": 120725 + }, + { + "epoch": 19.69494290375204, + "grad_norm": 0.03420671820640564, + "learning_rate": 7.092815517263373e-07, + "loss": 0.0015, + "num_input_tokens_seen": 260789136, + "step": 120730 + }, + { + "epoch": 19.695758564437195, + "grad_norm": 0.00766712473705411, + "learning_rate": 7.054965854071948e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260800496, + "step": 120735 + }, + { + "epoch": 19.696574225122347, + "grad_norm": 0.0032816394232213497, + "learning_rate": 7.017217379245433e-07, + "loss": 0.0006, + "num_input_tokens_seen": 260812048, + "step": 120740 + }, + { + "epoch": 19.697389885807503, + "grad_norm": 0.0004926707479171455, + "learning_rate": 6.979570093548771e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260823984, + "step": 120745 + }, + { + "epoch": 19.69820554649266, + "grad_norm": 0.0021867440082132816, + "learning_rate": 6.942023997745794e-07, + "loss": 0.0431, + "num_input_tokens_seen": 260834800, + "step": 120750 + }, + { + "epoch": 19.699021207177815, + "grad_norm": 0.0003217519260942936, + "learning_rate": 6.904579092596452e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260845744, + "step": 120755 + }, + { + "epoch": 19.69983686786297, + "grad_norm": 0.003933432046324015, + "learning_rate": 6.867235378860137e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260856112, + "step": 120760 + }, + { + "epoch": 19.700652528548122, + "grad_norm": 0.0009351072367280722, + "learning_rate": 6.829992857293465e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260867440, + "step": 120765 + }, + { + "epoch": 19.701468189233278, + "grad_norm": 0.022774334996938705, + "learning_rate": 6.792851528651389e-07, + "loss": 0.0032, + "num_input_tokens_seen": 260877936, + "step": 120770 + }, + { + "epoch": 19.702283849918434, + "grad_norm": 0.0001839139877120033, + "learning_rate": 6.755811393686084e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260889808, + "step": 120775 + }, + { + "epoch": 19.70309951060359, + "grad_norm": 0.012218792922794819, + "learning_rate": 6.718872453149172e-07, + "loss": 0.002, + "num_input_tokens_seen": 260901520, + "step": 120780 + }, + { + "epoch": 19.703915171288745, + "grad_norm": 0.009069071151316166, + "learning_rate": 6.682034707788386e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260911184, + "step": 120785 + }, + { + "epoch": 19.704730831973897, + "grad_norm": 0.0019039036706089973, + "learning_rate": 6.645298158350909e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260921008, + "step": 120790 + }, + { + "epoch": 19.705546492659053, + "grad_norm": 0.002577113453298807, + "learning_rate": 6.608662805580589e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260933104, + "step": 120795 + }, + { + "epoch": 19.70636215334421, + "grad_norm": 0.004389368463307619, + "learning_rate": 6.572128650220721e-07, + "loss": 0.0044, + "num_input_tokens_seen": 260943728, + "step": 120800 + }, + { + "epoch": 19.707177814029365, + "grad_norm": 0.00035338502493686974, + "learning_rate": 6.535695693011268e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260953008, + "step": 120805 + }, + { + "epoch": 19.70799347471452, + "grad_norm": 0.0008030373719520867, + "learning_rate": 6.499363934690528e-07, + "loss": 0.0013, + "num_input_tokens_seen": 260964400, + "step": 120810 + }, + { + "epoch": 19.708809135399672, + "grad_norm": 0.00022977576008997858, + "learning_rate": 6.463133375994579e-07, + "loss": 0.0008, + "num_input_tokens_seen": 260974096, + "step": 120815 + }, + { + "epoch": 19.709624796084828, + "grad_norm": 0.0035287451464682817, + "learning_rate": 6.427004017658389e-07, + "loss": 0.0013, + "num_input_tokens_seen": 260985296, + "step": 120820 + }, + { + "epoch": 19.710440456769984, + "grad_norm": 0.011943004094064236, + "learning_rate": 6.390975860413594e-07, + "loss": 0.003, + "num_input_tokens_seen": 260996368, + "step": 120825 + }, + { + "epoch": 19.71125611745514, + "grad_norm": 0.06614907085895538, + "learning_rate": 6.355048904990724e-07, + "loss": 0.0653, + "num_input_tokens_seen": 261006608, + "step": 120830 + }, + { + "epoch": 19.712071778140295, + "grad_norm": 0.006899102125316858, + "learning_rate": 6.319223152117526e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261018032, + "step": 120835 + }, + { + "epoch": 19.712887438825447, + "grad_norm": 0.010936878621578217, + "learning_rate": 6.283498602520088e-07, + "loss": 0.001, + "num_input_tokens_seen": 261029104, + "step": 120840 + }, + { + "epoch": 19.713703099510603, + "grad_norm": 0.0007386531797237694, + "learning_rate": 6.247875256922275e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261038480, + "step": 120845 + }, + { + "epoch": 19.71451876019576, + "grad_norm": 0.24150149524211884, + "learning_rate": 6.212353116046843e-07, + "loss": 0.0099, + "num_input_tokens_seen": 261048656, + "step": 120850 + }, + { + "epoch": 19.715334420880914, + "grad_norm": 0.001007181708700955, + "learning_rate": 6.17693218061266e-07, + "loss": 0.1039, + "num_input_tokens_seen": 261060816, + "step": 120855 + }, + { + "epoch": 19.71615008156607, + "grad_norm": 0.0009667161502875388, + "learning_rate": 6.141612451338596e-07, + "loss": 0.0017, + "num_input_tokens_seen": 261071568, + "step": 120860 + }, + { + "epoch": 19.716965742251222, + "grad_norm": 0.019680418074131012, + "learning_rate": 6.106393928939635e-07, + "loss": 0.0016, + "num_input_tokens_seen": 261083184, + "step": 120865 + }, + { + "epoch": 19.717781402936378, + "grad_norm": 0.000474753585876897, + "learning_rate": 6.07127661412965e-07, + "loss": 0.0021, + "num_input_tokens_seen": 261094928, + "step": 120870 + }, + { + "epoch": 19.718597063621534, + "grad_norm": 0.0027427682653069496, + "learning_rate": 6.036260507620849e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261105840, + "step": 120875 + }, + { + "epoch": 19.71941272430669, + "grad_norm": 0.03328926861286163, + "learning_rate": 6.001345610122111e-07, + "loss": 0.005, + "num_input_tokens_seen": 261116336, + "step": 120880 + }, + { + "epoch": 19.72022838499184, + "grad_norm": 0.0030891122296452522, + "learning_rate": 5.966531922341756e-07, + "loss": 0.0005, + "num_input_tokens_seen": 261128496, + "step": 120885 + }, + { + "epoch": 19.721044045676997, + "grad_norm": 0.0007997844368219376, + "learning_rate": 5.931819444984777e-07, + "loss": 0.0016, + "num_input_tokens_seen": 261139440, + "step": 120890 + }, + { + "epoch": 19.721859706362153, + "grad_norm": 0.0012421332066878676, + "learning_rate": 5.897208178755054e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261150192, + "step": 120895 + }, + { + "epoch": 19.72267536704731, + "grad_norm": 0.0008974650991149247, + "learning_rate": 5.862698124353694e-07, + "loss": 0.0004, + "num_input_tokens_seen": 261159824, + "step": 120900 + }, + { + "epoch": 19.723491027732464, + "grad_norm": 0.08313114941120148, + "learning_rate": 5.828289282480692e-07, + "loss": 0.0048, + "num_input_tokens_seen": 261170960, + "step": 120905 + }, + { + "epoch": 19.724306688417617, + "grad_norm": 0.010251346975564957, + "learning_rate": 5.793981653832714e-07, + "loss": 0.0006, + "num_input_tokens_seen": 261182000, + "step": 120910 + }, + { + "epoch": 19.725122349102772, + "grad_norm": 0.0011222073808312416, + "learning_rate": 5.759775239105314e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261193168, + "step": 120915 + }, + { + "epoch": 19.725938009787928, + "grad_norm": 0.0014564159791916609, + "learning_rate": 5.72567003899127e-07, + "loss": 0.0018, + "num_input_tokens_seen": 261203856, + "step": 120920 + }, + { + "epoch": 19.726753670473084, + "grad_norm": 0.0033955418039113283, + "learning_rate": 5.691666054182809e-07, + "loss": 0.0008, + "num_input_tokens_seen": 261215088, + "step": 120925 + }, + { + "epoch": 19.72756933115824, + "grad_norm": 0.0031751454807817936, + "learning_rate": 5.657763285368267e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261226512, + "step": 120930 + }, + { + "epoch": 19.72838499184339, + "grad_norm": 0.00040086961234919727, + "learning_rate": 5.623961733234873e-07, + "loss": 0.0005, + "num_input_tokens_seen": 261237040, + "step": 120935 + }, + { + "epoch": 19.729200652528547, + "grad_norm": 0.005273323971778154, + "learning_rate": 5.590261398467633e-07, + "loss": 0.0005, + "num_input_tokens_seen": 261248624, + "step": 120940 + }, + { + "epoch": 19.730016313213703, + "grad_norm": 0.00022064868244342506, + "learning_rate": 5.556662281749891e-07, + "loss": 0.0026, + "num_input_tokens_seen": 261259248, + "step": 120945 + }, + { + "epoch": 19.73083197389886, + "grad_norm": 0.011284386739134789, + "learning_rate": 5.523164383762213e-07, + "loss": 0.0179, + "num_input_tokens_seen": 261269584, + "step": 120950 + }, + { + "epoch": 19.731647634584014, + "grad_norm": 0.19131779670715332, + "learning_rate": 5.489767705183501e-07, + "loss": 0.038, + "num_input_tokens_seen": 261280688, + "step": 120955 + }, + { + "epoch": 19.732463295269167, + "grad_norm": 0.0009722855174914002, + "learning_rate": 5.456472246690436e-07, + "loss": 0.001, + "num_input_tokens_seen": 261290512, + "step": 120960 + }, + { + "epoch": 19.733278955954322, + "grad_norm": 0.03766850382089615, + "learning_rate": 5.423278008958032e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261300880, + "step": 120965 + }, + { + "epoch": 19.734094616639478, + "grad_norm": 0.05344652757048607, + "learning_rate": 5.390184992659641e-07, + "loss": 0.0024, + "num_input_tokens_seen": 261311760, + "step": 120970 + }, + { + "epoch": 19.734910277324634, + "grad_norm": 0.002851302269846201, + "learning_rate": 5.357193198464727e-07, + "loss": 0.001, + "num_input_tokens_seen": 261323760, + "step": 120975 + }, + { + "epoch": 19.73572593800979, + "grad_norm": 0.28350090980529785, + "learning_rate": 5.324302627042199e-07, + "loss": 0.0059, + "num_input_tokens_seen": 261334704, + "step": 120980 + }, + { + "epoch": 19.73654159869494, + "grad_norm": 0.005798796657472849, + "learning_rate": 5.291513279059301e-07, + "loss": 0.0009, + "num_input_tokens_seen": 261345616, + "step": 120985 + }, + { + "epoch": 19.737357259380097, + "grad_norm": 0.004781166557222605, + "learning_rate": 5.258825155179948e-07, + "loss": 0.0006, + "num_input_tokens_seen": 261357168, + "step": 120990 + }, + { + "epoch": 19.738172920065253, + "grad_norm": 0.001854881877079606, + "learning_rate": 5.226238256066384e-07, + "loss": 0.1473, + "num_input_tokens_seen": 261368848, + "step": 120995 + }, + { + "epoch": 19.73898858075041, + "grad_norm": 0.019145376980304718, + "learning_rate": 5.193752582379752e-07, + "loss": 0.0018, + "num_input_tokens_seen": 261379504, + "step": 121000 + }, + { + "epoch": 19.739804241435564, + "grad_norm": 0.0002677353622857481, + "learning_rate": 5.16136813477841e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261390224, + "step": 121005 + }, + { + "epoch": 19.740619902120716, + "grad_norm": 0.0040374575182795525, + "learning_rate": 5.129084913917948e-07, + "loss": 0.0016, + "num_input_tokens_seen": 261401424, + "step": 121010 + }, + { + "epoch": 19.741435562805872, + "grad_norm": 0.11591839045286179, + "learning_rate": 5.096902920453395e-07, + "loss": 0.0055, + "num_input_tokens_seen": 261412752, + "step": 121015 + }, + { + "epoch": 19.742251223491028, + "grad_norm": 0.0011770358541980386, + "learning_rate": 5.064822155036453e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261424208, + "step": 121020 + }, + { + "epoch": 19.743066884176184, + "grad_norm": 0.0017884820699691772, + "learning_rate": 5.032842618317157e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261435344, + "step": 121025 + }, + { + "epoch": 19.74388254486134, + "grad_norm": 0.2122792750597, + "learning_rate": 5.000964310943878e-07, + "loss": 0.0038, + "num_input_tokens_seen": 261445520, + "step": 121030 + }, + { + "epoch": 19.74469820554649, + "grad_norm": 0.0012699973303824663, + "learning_rate": 4.969187233562767e-07, + "loss": 0.0006, + "num_input_tokens_seen": 261455728, + "step": 121035 + }, + { + "epoch": 19.745513866231647, + "grad_norm": 0.0003483338514342904, + "learning_rate": 4.937511386817751e-07, + "loss": 0.0004, + "num_input_tokens_seen": 261466512, + "step": 121040 + }, + { + "epoch": 19.746329526916803, + "grad_norm": 0.00047351993271149695, + "learning_rate": 4.905936771351094e-07, + "loss": 0.0016, + "num_input_tokens_seen": 261477936, + "step": 121045 + }, + { + "epoch": 19.74714518760196, + "grad_norm": 0.0007477999897673726, + "learning_rate": 4.874463387801731e-07, + "loss": 0.0024, + "num_input_tokens_seen": 261488912, + "step": 121050 + }, + { + "epoch": 19.747960848287114, + "grad_norm": 0.032901830971241, + "learning_rate": 4.843091236808594e-07, + "loss": 0.001, + "num_input_tokens_seen": 261499024, + "step": 121055 + }, + { + "epoch": 19.748776508972266, + "grad_norm": 0.0010255652014166117, + "learning_rate": 4.811820319006732e-07, + "loss": 0.0029, + "num_input_tokens_seen": 261509328, + "step": 121060 + }, + { + "epoch": 19.749592169657422, + "grad_norm": 0.05027930438518524, + "learning_rate": 4.780650635030081e-07, + "loss": 0.0014, + "num_input_tokens_seen": 261521136, + "step": 121065 + }, + { + "epoch": 19.750407830342578, + "grad_norm": 0.0001708085328573361, + "learning_rate": 4.7495821855109145e-07, + "loss": 0.0016, + "num_input_tokens_seen": 261531824, + "step": 121070 + }, + { + "epoch": 19.751223491027734, + "grad_norm": 0.0003755021607503295, + "learning_rate": 4.718614971078172e-07, + "loss": 0.0016, + "num_input_tokens_seen": 261543120, + "step": 121075 + }, + { + "epoch": 19.752039151712886, + "grad_norm": 0.0013738623820245266, + "learning_rate": 4.6877489923596863e-07, + "loss": 0.0013, + "num_input_tokens_seen": 261552464, + "step": 121080 + }, + { + "epoch": 19.75285481239804, + "grad_norm": 0.00029178871773183346, + "learning_rate": 4.6569842499805113e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261563088, + "step": 121085 + }, + { + "epoch": 19.753670473083197, + "grad_norm": 0.0012256011832505465, + "learning_rate": 4.626320744565149e-07, + "loss": 0.0037, + "num_input_tokens_seen": 261574672, + "step": 121090 + }, + { + "epoch": 19.754486133768353, + "grad_norm": 0.06535517424345016, + "learning_rate": 4.5957584767342133e-07, + "loss": 0.0027, + "num_input_tokens_seen": 261586224, + "step": 121095 + }, + { + "epoch": 19.75530179445351, + "grad_norm": 0.437174916267395, + "learning_rate": 4.5652974471077637e-07, + "loss": 0.0119, + "num_input_tokens_seen": 261597552, + "step": 121100 + }, + { + "epoch": 19.75611745513866, + "grad_norm": 0.0011823754757642746, + "learning_rate": 4.534937656301974e-07, + "loss": 0.0008, + "num_input_tokens_seen": 261609264, + "step": 121105 + }, + { + "epoch": 19.756933115823816, + "grad_norm": 0.00021741993259638548, + "learning_rate": 4.5046791049335733e-07, + "loss": 0.0019, + "num_input_tokens_seen": 261620400, + "step": 121110 + }, + { + "epoch": 19.757748776508972, + "grad_norm": 0.007211287505924702, + "learning_rate": 4.47452179361485e-07, + "loss": 0.0036, + "num_input_tokens_seen": 261631632, + "step": 121115 + }, + { + "epoch": 19.758564437194128, + "grad_norm": 0.02904735691845417, + "learning_rate": 4.444465722956981e-07, + "loss": 0.1387, + "num_input_tokens_seen": 261641168, + "step": 121120 + }, + { + "epoch": 19.759380097879284, + "grad_norm": 0.02278851345181465, + "learning_rate": 4.414510893569479e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261652016, + "step": 121125 + }, + { + "epoch": 19.760195758564436, + "grad_norm": 0.0007043814403004944, + "learning_rate": 4.384657306059636e-07, + "loss": 0.0027, + "num_input_tokens_seen": 261662544, + "step": 121130 + }, + { + "epoch": 19.76101141924959, + "grad_norm": 0.0009921282762661576, + "learning_rate": 4.354904961031414e-07, + "loss": 0.0096, + "num_input_tokens_seen": 261671888, + "step": 121135 + }, + { + "epoch": 19.761827079934747, + "grad_norm": 0.017717812210321426, + "learning_rate": 4.3252538590893285e-07, + "loss": 0.001, + "num_input_tokens_seen": 261682928, + "step": 121140 + }, + { + "epoch": 19.762642740619903, + "grad_norm": 0.04009336233139038, + "learning_rate": 4.2957040008323456e-07, + "loss": 0.0018, + "num_input_tokens_seen": 261693520, + "step": 121145 + }, + { + "epoch": 19.76345840130506, + "grad_norm": 0.1430460810661316, + "learning_rate": 4.266255386861095e-07, + "loss": 0.0026, + "num_input_tokens_seen": 261704432, + "step": 121150 + }, + { + "epoch": 19.76427406199021, + "grad_norm": 0.011360394768416882, + "learning_rate": 4.2369080177717676e-07, + "loss": 0.0069, + "num_input_tokens_seen": 261715568, + "step": 121155 + }, + { + "epoch": 19.765089722675366, + "grad_norm": 0.006870058830827475, + "learning_rate": 4.2076618941588875e-07, + "loss": 0.0013, + "num_input_tokens_seen": 261725008, + "step": 121160 + }, + { + "epoch": 19.765905383360522, + "grad_norm": 0.01938176155090332, + "learning_rate": 4.178517016615313e-07, + "loss": 0.0018, + "num_input_tokens_seen": 261734992, + "step": 121165 + }, + { + "epoch": 19.766721044045678, + "grad_norm": 0.002593178069218993, + "learning_rate": 4.1494733857322385e-07, + "loss": 0.0121, + "num_input_tokens_seen": 261746864, + "step": 121170 + }, + { + "epoch": 19.767536704730833, + "grad_norm": 0.00021053437376394868, + "learning_rate": 4.120531002096972e-07, + "loss": 0.0008, + "num_input_tokens_seen": 261757648, + "step": 121175 + }, + { + "epoch": 19.768352365415986, + "grad_norm": 0.0394943468272686, + "learning_rate": 4.091689866297377e-07, + "loss": 0.0032, + "num_input_tokens_seen": 261769104, + "step": 121180 + }, + { + "epoch": 19.76916802610114, + "grad_norm": 0.02225523442029953, + "learning_rate": 4.0629499789174293e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261780432, + "step": 121185 + }, + { + "epoch": 19.769983686786297, + "grad_norm": 0.00019127337145619094, + "learning_rate": 4.034311340539443e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261791312, + "step": 121190 + }, + { + "epoch": 19.770799347471453, + "grad_norm": 0.0008985060267150402, + "learning_rate": 4.005773951744063e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261802416, + "step": 121195 + }, + { + "epoch": 19.77161500815661, + "grad_norm": 0.208012193441391, + "learning_rate": 3.977337813109716e-07, + "loss": 0.0042, + "num_input_tokens_seen": 261813328, + "step": 121200 + }, + { + "epoch": 19.77243066884176, + "grad_norm": 0.0006497858557850122, + "learning_rate": 3.949002925212053e-07, + "loss": 0.0006, + "num_input_tokens_seen": 261824464, + "step": 121205 + }, + { + "epoch": 19.773246329526916, + "grad_norm": 0.0009609381668269634, + "learning_rate": 3.920769288626169e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261834416, + "step": 121210 + }, + { + "epoch": 19.774061990212072, + "grad_norm": 0.013297447003424168, + "learning_rate": 3.8926369039238295e-07, + "loss": 0.0395, + "num_input_tokens_seen": 261844944, + "step": 121215 + }, + { + "epoch": 19.774877650897228, + "grad_norm": 0.0035913216415792704, + "learning_rate": 3.864605771675134e-07, + "loss": 0.0034, + "num_input_tokens_seen": 261855120, + "step": 121220 + }, + { + "epoch": 19.775693311582383, + "grad_norm": 0.0012953771511092782, + "learning_rate": 3.8366758924479605e-07, + "loss": 0.0014, + "num_input_tokens_seen": 261865488, + "step": 121225 + }, + { + "epoch": 19.776508972267536, + "grad_norm": 0.0031643963884562254, + "learning_rate": 3.808847266809079e-07, + "loss": 0.0058, + "num_input_tokens_seen": 261876752, + "step": 121230 + }, + { + "epoch": 19.77732463295269, + "grad_norm": 0.0012785486178472638, + "learning_rate": 3.781119895321927e-07, + "loss": 0.0013, + "num_input_tokens_seen": 261887728, + "step": 121235 + }, + { + "epoch": 19.778140293637847, + "grad_norm": 0.1394270658493042, + "learning_rate": 3.753493778548278e-07, + "loss": 0.0021, + "num_input_tokens_seen": 261899056, + "step": 121240 + }, + { + "epoch": 19.778955954323003, + "grad_norm": 0.18898367881774902, + "learning_rate": 3.725968917048794e-07, + "loss": 0.0079, + "num_input_tokens_seen": 261909200, + "step": 121245 + }, + { + "epoch": 19.77977161500816, + "grad_norm": 0.047763291746377945, + "learning_rate": 3.6985453113802525e-07, + "loss": 0.0452, + "num_input_tokens_seen": 261920304, + "step": 121250 + }, + { + "epoch": 19.78058727569331, + "grad_norm": 0.006899984087795019, + "learning_rate": 3.6712229620988744e-07, + "loss": 0.0005, + "num_input_tokens_seen": 261931952, + "step": 121255 + }, + { + "epoch": 19.781402936378466, + "grad_norm": 0.0034642857499420643, + "learning_rate": 3.644001869758662e-07, + "loss": 0.0015, + "num_input_tokens_seen": 261944080, + "step": 121260 + }, + { + "epoch": 19.782218597063622, + "grad_norm": 0.2206059992313385, + "learning_rate": 3.616882034911395e-07, + "loss": 0.0048, + "num_input_tokens_seen": 261955888, + "step": 121265 + }, + { + "epoch": 19.783034257748778, + "grad_norm": 0.007103486452251673, + "learning_rate": 3.58986345810608e-07, + "loss": 0.0023, + "num_input_tokens_seen": 261966960, + "step": 121270 + }, + { + "epoch": 19.78384991843393, + "grad_norm": 0.0023222120944410563, + "learning_rate": 3.56294613989061e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261978224, + "step": 121275 + }, + { + "epoch": 19.784665579119086, + "grad_norm": 0.0007937622140161693, + "learning_rate": 3.5361300808106625e-07, + "loss": 0.0017, + "num_input_tokens_seen": 261988208, + "step": 121280 + }, + { + "epoch": 19.78548123980424, + "grad_norm": 0.08231380581855774, + "learning_rate": 3.509415281409134e-07, + "loss": 0.0082, + "num_input_tokens_seen": 261998960, + "step": 121285 + }, + { + "epoch": 19.786296900489397, + "grad_norm": 0.001557769370265305, + "learning_rate": 3.4828017422278146e-07, + "loss": 0.0012, + "num_input_tokens_seen": 262010288, + "step": 121290 + }, + { + "epoch": 19.787112561174553, + "grad_norm": 0.00967491790652275, + "learning_rate": 3.4562894638062727e-07, + "loss": 0.0039, + "num_input_tokens_seen": 262020816, + "step": 121295 + }, + { + "epoch": 19.787928221859705, + "grad_norm": 0.06240087002515793, + "learning_rate": 3.4298784466818553e-07, + "loss": 0.0029, + "num_input_tokens_seen": 262032528, + "step": 121300 + }, + { + "epoch": 19.78874388254486, + "grad_norm": 0.015026670880615711, + "learning_rate": 3.403568691389136e-07, + "loss": 0.0316, + "num_input_tokens_seen": 262043600, + "step": 121305 + }, + { + "epoch": 19.789559543230016, + "grad_norm": 0.018473364412784576, + "learning_rate": 3.3773601984615766e-07, + "loss": 0.0026, + "num_input_tokens_seen": 262055952, + "step": 121310 + }, + { + "epoch": 19.790375203915172, + "grad_norm": 0.007226514630019665, + "learning_rate": 3.3512529684309736e-07, + "loss": 0.0008, + "num_input_tokens_seen": 262067280, + "step": 121315 + }, + { + "epoch": 19.791190864600328, + "grad_norm": 0.004439678508788347, + "learning_rate": 3.325247001825793e-07, + "loss": 0.0079, + "num_input_tokens_seen": 262077968, + "step": 121320 + }, + { + "epoch": 19.79200652528548, + "grad_norm": 0.15969239175319672, + "learning_rate": 3.299342299172836e-07, + "loss": 0.0034, + "num_input_tokens_seen": 262089072, + "step": 121325 + }, + { + "epoch": 19.792822185970635, + "grad_norm": 0.004921938292682171, + "learning_rate": 3.2735388609977936e-07, + "loss": 0.0004, + "num_input_tokens_seen": 262099184, + "step": 121330 + }, + { + "epoch": 19.79363784665579, + "grad_norm": 0.0014474753988906741, + "learning_rate": 3.24783668782358e-07, + "loss": 0.0012, + "num_input_tokens_seen": 262110480, + "step": 121335 + }, + { + "epoch": 19.794453507340947, + "grad_norm": 0.0009749328601174057, + "learning_rate": 3.222235780170335e-07, + "loss": 0.0009, + "num_input_tokens_seen": 262120240, + "step": 121340 + }, + { + "epoch": 19.795269168026103, + "grad_norm": 0.0023331588599830866, + "learning_rate": 3.196736138557088e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262132272, + "step": 121345 + }, + { + "epoch": 19.796084828711255, + "grad_norm": 0.0006216101464815438, + "learning_rate": 3.171337763501203e-07, + "loss": 0.0071, + "num_input_tokens_seen": 262142736, + "step": 121350 + }, + { + "epoch": 19.79690048939641, + "grad_norm": 0.0431019552052021, + "learning_rate": 3.146040655517268e-07, + "loss": 0.0047, + "num_input_tokens_seen": 262153552, + "step": 121355 + }, + { + "epoch": 19.797716150081566, + "grad_norm": 0.0003282705438323319, + "learning_rate": 3.1208448151176516e-07, + "loss": 0.0016, + "num_input_tokens_seen": 262164240, + "step": 121360 + }, + { + "epoch": 19.798531810766722, + "grad_norm": 0.0002833571925293654, + "learning_rate": 3.0957502428130557e-07, + "loss": 0.0004, + "num_input_tokens_seen": 262176656, + "step": 121365 + }, + { + "epoch": 19.799347471451878, + "grad_norm": 0.0010412463452666998, + "learning_rate": 3.070756939111963e-07, + "loss": 0.0011, + "num_input_tokens_seen": 262186448, + "step": 121370 + }, + { + "epoch": 19.80016313213703, + "grad_norm": 0.0002267042436869815, + "learning_rate": 3.0458649045211895e-07, + "loss": 0.0004, + "num_input_tokens_seen": 262196592, + "step": 121375 + }, + { + "epoch": 19.800978792822185, + "grad_norm": 0.03307168558239937, + "learning_rate": 3.021074139545332e-07, + "loss": 0.0019, + "num_input_tokens_seen": 262207728, + "step": 121380 + }, + { + "epoch": 19.80179445350734, + "grad_norm": 0.007719150744378567, + "learning_rate": 2.996384644686212e-07, + "loss": 0.0411, + "num_input_tokens_seen": 262219760, + "step": 121385 + }, + { + "epoch": 19.802610114192497, + "grad_norm": 0.0004898576298728585, + "learning_rate": 2.971796420444539e-07, + "loss": 0.0013, + "num_input_tokens_seen": 262230256, + "step": 121390 + }, + { + "epoch": 19.803425774877653, + "grad_norm": 0.15931858122348785, + "learning_rate": 2.947309467318804e-07, + "loss": 0.005, + "num_input_tokens_seen": 262241776, + "step": 121395 + }, + { + "epoch": 19.804241435562805, + "grad_norm": 0.0005706818192265928, + "learning_rate": 2.922923785804721e-07, + "loss": 0.0006, + "num_input_tokens_seen": 262252304, + "step": 121400 + }, + { + "epoch": 19.80505709624796, + "grad_norm": 0.0008127331384457648, + "learning_rate": 2.898639376396894e-07, + "loss": 0.0013, + "num_input_tokens_seen": 262262736, + "step": 121405 + }, + { + "epoch": 19.805872756933116, + "grad_norm": 0.014661543071269989, + "learning_rate": 2.8744562395877083e-07, + "loss": 0.0016, + "num_input_tokens_seen": 262275280, + "step": 121410 + }, + { + "epoch": 19.806688417618272, + "grad_norm": 0.00010798094444908202, + "learning_rate": 2.850374375866216e-07, + "loss": 0.0009, + "num_input_tokens_seen": 262284560, + "step": 121415 + }, + { + "epoch": 19.807504078303424, + "grad_norm": 0.015252824872732162, + "learning_rate": 2.826393785722026e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262294864, + "step": 121420 + }, + { + "epoch": 19.80831973898858, + "grad_norm": 0.024584434926509857, + "learning_rate": 2.80251446963975e-07, + "loss": 0.0019, + "num_input_tokens_seen": 262305584, + "step": 121425 + }, + { + "epoch": 19.809135399673735, + "grad_norm": 0.000190110455150716, + "learning_rate": 2.778736428104556e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262317040, + "step": 121430 + }, + { + "epoch": 19.80995106035889, + "grad_norm": 0.030826276168227196, + "learning_rate": 2.75505966159717e-07, + "loss": 0.0023, + "num_input_tokens_seen": 262328336, + "step": 121435 + }, + { + "epoch": 19.810766721044047, + "grad_norm": 0.00027534199762158096, + "learning_rate": 2.73148417059832e-07, + "loss": 0.0045, + "num_input_tokens_seen": 262338928, + "step": 121440 + }, + { + "epoch": 19.8115823817292, + "grad_norm": 0.0006019662832841277, + "learning_rate": 2.708009955584845e-07, + "loss": 0.0008, + "num_input_tokens_seen": 262350896, + "step": 121445 + }, + { + "epoch": 19.812398042414355, + "grad_norm": 0.00021458462288137525, + "learning_rate": 2.684637017033587e-07, + "loss": 0.0009, + "num_input_tokens_seen": 262360976, + "step": 121450 + }, + { + "epoch": 19.81321370309951, + "grad_norm": 0.0041677881963551044, + "learning_rate": 2.6613653554175e-07, + "loss": 0.0008, + "num_input_tokens_seen": 262371824, + "step": 121455 + }, + { + "epoch": 19.814029363784666, + "grad_norm": 0.00046735754585824907, + "learning_rate": 2.6381949712089846e-07, + "loss": 0.017, + "num_input_tokens_seen": 262382320, + "step": 121460 + }, + { + "epoch": 19.81484502446982, + "grad_norm": 0.0006864400929771364, + "learning_rate": 2.6151258648765553e-07, + "loss": 0.001, + "num_input_tokens_seen": 262392624, + "step": 121465 + }, + { + "epoch": 19.815660685154974, + "grad_norm": 0.0004593665653374046, + "learning_rate": 2.59215803688817e-07, + "loss": 0.0009, + "num_input_tokens_seen": 262403792, + "step": 121470 + }, + { + "epoch": 19.81647634584013, + "grad_norm": 0.00036628826637752354, + "learning_rate": 2.5692914877090135e-07, + "loss": 0.001, + "num_input_tokens_seen": 262414800, + "step": 121475 + }, + { + "epoch": 19.817292006525285, + "grad_norm": 0.002507440047338605, + "learning_rate": 2.546526217803713e-07, + "loss": 0.0037, + "num_input_tokens_seen": 262425776, + "step": 121480 + }, + { + "epoch": 19.81810766721044, + "grad_norm": 0.001124211703427136, + "learning_rate": 2.5238622276319014e-07, + "loss": 0.0008, + "num_input_tokens_seen": 262435728, + "step": 121485 + }, + { + "epoch": 19.818923327895597, + "grad_norm": 0.004534002393484116, + "learning_rate": 2.501299517654321e-07, + "loss": 0.0088, + "num_input_tokens_seen": 262446608, + "step": 121490 + }, + { + "epoch": 19.81973898858075, + "grad_norm": 0.002117312513291836, + "learning_rate": 2.4788380883278285e-07, + "loss": 0.0007, + "num_input_tokens_seen": 262458512, + "step": 121495 + }, + { + "epoch": 19.820554649265905, + "grad_norm": 0.0016950422432273626, + "learning_rate": 2.4564779401070604e-07, + "loss": 0.0021, + "num_input_tokens_seen": 262468688, + "step": 121500 + }, + { + "epoch": 19.82137030995106, + "grad_norm": 0.005130813457071781, + "learning_rate": 2.434219073445543e-07, + "loss": 0.0008, + "num_input_tokens_seen": 262478992, + "step": 121505 + }, + { + "epoch": 19.822185970636216, + "grad_norm": 0.17918558418750763, + "learning_rate": 2.412061488795136e-07, + "loss": 0.0012, + "num_input_tokens_seen": 262490224, + "step": 121510 + }, + { + "epoch": 19.82300163132137, + "grad_norm": 0.03207606077194214, + "learning_rate": 2.390005186603261e-07, + "loss": 0.001, + "num_input_tokens_seen": 262501040, + "step": 121515 + }, + { + "epoch": 19.823817292006524, + "grad_norm": 0.0009639065247029066, + "learning_rate": 2.3680501673184474e-07, + "loss": 0.0007, + "num_input_tokens_seen": 262511152, + "step": 121520 + }, + { + "epoch": 19.82463295269168, + "grad_norm": 0.0005210313247516751, + "learning_rate": 2.346196431384784e-07, + "loss": 0.0008, + "num_input_tokens_seen": 262521744, + "step": 121525 + }, + { + "epoch": 19.825448613376835, + "grad_norm": 0.0011603363091126084, + "learning_rate": 2.324443979245805e-07, + "loss": 0.0008, + "num_input_tokens_seen": 262532368, + "step": 121530 + }, + { + "epoch": 19.82626427406199, + "grad_norm": 0.0002771125582512468, + "learning_rate": 2.302792811341714e-07, + "loss": 0.0008, + "num_input_tokens_seen": 262543152, + "step": 121535 + }, + { + "epoch": 19.827079934747147, + "grad_norm": 0.00139376032166183, + "learning_rate": 2.2812429281116043e-07, + "loss": 0.0003, + "num_input_tokens_seen": 262554512, + "step": 121540 + }, + { + "epoch": 19.8278955954323, + "grad_norm": 0.017726287245750427, + "learning_rate": 2.2597943299923484e-07, + "loss": 0.0015, + "num_input_tokens_seen": 262564592, + "step": 121545 + }, + { + "epoch": 19.828711256117455, + "grad_norm": 0.02565399929881096, + "learning_rate": 2.2384470174180438e-07, + "loss": 0.0028, + "num_input_tokens_seen": 262574192, + "step": 121550 + }, + { + "epoch": 19.82952691680261, + "grad_norm": 0.0008128905319608748, + "learning_rate": 2.2172009908216772e-07, + "loss": 0.0083, + "num_input_tokens_seen": 262585200, + "step": 121555 + }, + { + "epoch": 19.830342577487766, + "grad_norm": 0.002057405421510339, + "learning_rate": 2.1960562506340153e-07, + "loss": 0.0007, + "num_input_tokens_seen": 262596496, + "step": 121560 + }, + { + "epoch": 19.83115823817292, + "grad_norm": 0.0012084591435268521, + "learning_rate": 2.1750127972836042e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262608144, + "step": 121565 + }, + { + "epoch": 19.831973898858074, + "grad_norm": 0.001822744612582028, + "learning_rate": 2.1540706311967695e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262619312, + "step": 121570 + }, + { + "epoch": 19.83278955954323, + "grad_norm": 0.0852610394358635, + "learning_rate": 2.1332297527976164e-07, + "loss": 0.006, + "num_input_tokens_seen": 262630000, + "step": 121575 + }, + { + "epoch": 19.833605220228385, + "grad_norm": 0.002231738530099392, + "learning_rate": 2.1124901625091397e-07, + "loss": 0.0006, + "num_input_tokens_seen": 262640784, + "step": 121580 + }, + { + "epoch": 19.83442088091354, + "grad_norm": 0.029305459931492805, + "learning_rate": 2.091851860751004e-07, + "loss": 0.0027, + "num_input_tokens_seen": 262651664, + "step": 121585 + }, + { + "epoch": 19.835236541598697, + "grad_norm": 0.00031926666270010173, + "learning_rate": 2.071314847941763e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262662288, + "step": 121590 + }, + { + "epoch": 19.83605220228385, + "grad_norm": 0.0077890572138130665, + "learning_rate": 2.050879124498306e-07, + "loss": 0.0006, + "num_input_tokens_seen": 262672080, + "step": 121595 + }, + { + "epoch": 19.836867862969005, + "grad_norm": 0.07575695961713791, + "learning_rate": 2.0305446908336355e-07, + "loss": 0.0029, + "num_input_tokens_seen": 262683952, + "step": 121600 + }, + { + "epoch": 19.83768352365416, + "grad_norm": 0.010576908476650715, + "learning_rate": 2.0103115473601996e-07, + "loss": 0.0006, + "num_input_tokens_seen": 262693904, + "step": 121605 + }, + { + "epoch": 19.838499184339316, + "grad_norm": 0.008228904567658901, + "learning_rate": 1.9901796944882254e-07, + "loss": 0.0011, + "num_input_tokens_seen": 262706064, + "step": 121610 + }, + { + "epoch": 19.839314845024468, + "grad_norm": 0.000713883840944618, + "learning_rate": 1.9701491326257203e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262716688, + "step": 121615 + }, + { + "epoch": 19.840130505709624, + "grad_norm": 0.0005843836115673184, + "learning_rate": 1.9502198621790257e-07, + "loss": 0.0027, + "num_input_tokens_seen": 262726800, + "step": 121620 + }, + { + "epoch": 19.84094616639478, + "grad_norm": 0.00699404114857316, + "learning_rate": 1.9303918835511526e-07, + "loss": 0.0032, + "num_input_tokens_seen": 262737968, + "step": 121625 + }, + { + "epoch": 19.841761827079935, + "grad_norm": 0.0425538644194603, + "learning_rate": 1.9106651971445564e-07, + "loss": 0.0016, + "num_input_tokens_seen": 262747696, + "step": 121630 + }, + { + "epoch": 19.84257748776509, + "grad_norm": 0.013981866650283337, + "learning_rate": 1.8910398033589182e-07, + "loss": 0.0991, + "num_input_tokens_seen": 262759440, + "step": 121635 + }, + { + "epoch": 19.843393148450243, + "grad_norm": 0.0002504394797142595, + "learning_rate": 1.8715157025916972e-07, + "loss": 0.002, + "num_input_tokens_seen": 262770896, + "step": 121640 + }, + { + "epoch": 19.8442088091354, + "grad_norm": 0.00026449389406479895, + "learning_rate": 1.8520928952386885e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262782192, + "step": 121645 + }, + { + "epoch": 19.845024469820554, + "grad_norm": 0.007747107185423374, + "learning_rate": 1.8327713816940207e-07, + "loss": 0.0004, + "num_input_tokens_seen": 262792624, + "step": 121650 + }, + { + "epoch": 19.84584013050571, + "grad_norm": 0.0021923938766121864, + "learning_rate": 1.8135511623484925e-07, + "loss": 0.0006, + "num_input_tokens_seen": 262803888, + "step": 121655 + }, + { + "epoch": 19.846655791190866, + "grad_norm": 0.018954308703541756, + "learning_rate": 1.7944322375923472e-07, + "loss": 0.0011, + "num_input_tokens_seen": 262814480, + "step": 121660 + }, + { + "epoch": 19.847471451876018, + "grad_norm": 0.0002542664296925068, + "learning_rate": 1.7754146078124976e-07, + "loss": 0.0003, + "num_input_tokens_seen": 262824752, + "step": 121665 + }, + { + "epoch": 19.848287112561174, + "grad_norm": 0.004432213492691517, + "learning_rate": 1.7564982733947465e-07, + "loss": 0.001, + "num_input_tokens_seen": 262835536, + "step": 121670 + }, + { + "epoch": 19.84910277324633, + "grad_norm": 0.0007810618262737989, + "learning_rate": 1.7376832347221206e-07, + "loss": 0.0011, + "num_input_tokens_seen": 262846320, + "step": 121675 + }, + { + "epoch": 19.849918433931485, + "grad_norm": 0.002990703098475933, + "learning_rate": 1.7189694921759813e-07, + "loss": 0.0007, + "num_input_tokens_seen": 262856656, + "step": 121680 + }, + { + "epoch": 19.85073409461664, + "grad_norm": 0.00043039917363785207, + "learning_rate": 1.700357046136025e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262866128, + "step": 121685 + }, + { + "epoch": 19.851549755301793, + "grad_norm": 0.09391754120588303, + "learning_rate": 1.6818458969786177e-07, + "loss": 0.0046, + "num_input_tokens_seen": 262876976, + "step": 121690 + }, + { + "epoch": 19.85236541598695, + "grad_norm": 0.0021796554792672396, + "learning_rate": 1.6634360450795694e-07, + "loss": 0.0006, + "num_input_tokens_seen": 262887664, + "step": 121695 + }, + { + "epoch": 19.853181076672104, + "grad_norm": 0.004136047791689634, + "learning_rate": 1.6451274908124703e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262899408, + "step": 121700 + }, + { + "epoch": 19.85399673735726, + "grad_norm": 0.029257941991090775, + "learning_rate": 1.6269202345470247e-07, + "loss": 0.0119, + "num_input_tokens_seen": 262911120, + "step": 121705 + }, + { + "epoch": 19.854812398042416, + "grad_norm": 0.0014244935009628534, + "learning_rate": 1.6088142766529367e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262923120, + "step": 121710 + }, + { + "epoch": 19.855628058727568, + "grad_norm": 0.0018149535171687603, + "learning_rate": 1.5908096174976904e-07, + "loss": 0.0014, + "num_input_tokens_seen": 262933552, + "step": 121715 + }, + { + "epoch": 19.856443719412724, + "grad_norm": 0.0005949955666437745, + "learning_rate": 1.5729062574448838e-07, + "loss": 0.0006, + "num_input_tokens_seen": 262944784, + "step": 121720 + }, + { + "epoch": 19.85725938009788, + "grad_norm": 0.03684856742620468, + "learning_rate": 1.55510419685867e-07, + "loss": 0.0007, + "num_input_tokens_seen": 262957040, + "step": 121725 + }, + { + "epoch": 19.858075040783035, + "grad_norm": 0.03020774945616722, + "learning_rate": 1.5374034360993162e-07, + "loss": 0.0104, + "num_input_tokens_seen": 262969136, + "step": 121730 + }, + { + "epoch": 19.85889070146819, + "grad_norm": 0.0009404148440808058, + "learning_rate": 1.5198039755248693e-07, + "loss": 0.001, + "num_input_tokens_seen": 262980080, + "step": 121735 + }, + { + "epoch": 19.859706362153343, + "grad_norm": 0.009650173597037792, + "learning_rate": 1.5023058154928216e-07, + "loss": 0.0013, + "num_input_tokens_seen": 262990576, + "step": 121740 + }, + { + "epoch": 19.8605220228385, + "grad_norm": 0.0007336720591410995, + "learning_rate": 1.4849089563578888e-07, + "loss": 0.0014, + "num_input_tokens_seen": 263000240, + "step": 121745 + }, + { + "epoch": 19.861337683523654, + "grad_norm": 0.0011980609269812703, + "learning_rate": 1.467613398472567e-07, + "loss": 0.0031, + "num_input_tokens_seen": 263011280, + "step": 121750 + }, + { + "epoch": 19.86215334420881, + "grad_norm": 0.0014163218438625336, + "learning_rate": 1.4504191421865765e-07, + "loss": 0.0004, + "num_input_tokens_seen": 263021200, + "step": 121755 + }, + { + "epoch": 19.862969004893966, + "grad_norm": 0.4547708034515381, + "learning_rate": 1.433326187849082e-07, + "loss": 0.0037, + "num_input_tokens_seen": 263032560, + "step": 121760 + }, + { + "epoch": 19.863784665579118, + "grad_norm": 0.0008637936552986503, + "learning_rate": 1.416334535806474e-07, + "loss": 0.0148, + "num_input_tokens_seen": 263043984, + "step": 121765 + }, + { + "epoch": 19.864600326264274, + "grad_norm": 0.006428821943700314, + "learning_rate": 1.3994441864029206e-07, + "loss": 0.0031, + "num_input_tokens_seen": 263055152, + "step": 121770 + }, + { + "epoch": 19.86541598694943, + "grad_norm": 0.0065291267819702625, + "learning_rate": 1.3826551399809263e-07, + "loss": 0.0008, + "num_input_tokens_seen": 263065904, + "step": 121775 + }, + { + "epoch": 19.866231647634585, + "grad_norm": 0.0003827828913927078, + "learning_rate": 1.3659673968802188e-07, + "loss": 0.0023, + "num_input_tokens_seen": 263077232, + "step": 121780 + }, + { + "epoch": 19.86704730831974, + "grad_norm": 0.001834007678553462, + "learning_rate": 1.3493809574399717e-07, + "loss": 0.0004, + "num_input_tokens_seen": 263088816, + "step": 121785 + }, + { + "epoch": 19.867862969004893, + "grad_norm": 0.00888867024332285, + "learning_rate": 1.3328958219954724e-07, + "loss": 0.0005, + "num_input_tokens_seen": 263100912, + "step": 121790 + }, + { + "epoch": 19.86867862969005, + "grad_norm": 0.006172510329633951, + "learning_rate": 1.3165119908808976e-07, + "loss": 0.001, + "num_input_tokens_seen": 263112176, + "step": 121795 + }, + { + "epoch": 19.869494290375204, + "grad_norm": 0.025691168382763863, + "learning_rate": 1.3002294644287593e-07, + "loss": 0.0026, + "num_input_tokens_seen": 263123248, + "step": 121800 + }, + { + "epoch": 19.87030995106036, + "grad_norm": 0.00295065576210618, + "learning_rate": 1.284048242968794e-07, + "loss": 0.0007, + "num_input_tokens_seen": 263135440, + "step": 121805 + }, + { + "epoch": 19.871125611745512, + "grad_norm": 0.0006291031604632735, + "learning_rate": 1.267968326829072e-07, + "loss": 0.0061, + "num_input_tokens_seen": 263145584, + "step": 121810 + }, + { + "epoch": 19.871941272430668, + "grad_norm": 0.0017852471210062504, + "learning_rate": 1.2519897163348894e-07, + "loss": 0.0006, + "num_input_tokens_seen": 263156112, + "step": 121815 + }, + { + "epoch": 19.872756933115824, + "grad_norm": 0.00026627699844539165, + "learning_rate": 1.2361124118109856e-07, + "loss": 0.0016, + "num_input_tokens_seen": 263167856, + "step": 121820 + }, + { + "epoch": 19.87357259380098, + "grad_norm": 0.0008435967029072344, + "learning_rate": 1.220336413578216e-07, + "loss": 0.0022, + "num_input_tokens_seen": 263179248, + "step": 121825 + }, + { + "epoch": 19.874388254486135, + "grad_norm": 0.014502918347716331, + "learning_rate": 1.204661721956879e-07, + "loss": 0.0008, + "num_input_tokens_seen": 263191120, + "step": 121830 + }, + { + "epoch": 19.875203915171287, + "grad_norm": 0.0047426181845366955, + "learning_rate": 1.1890883372644989e-07, + "loss": 0.0011, + "num_input_tokens_seen": 263202352, + "step": 121835 + }, + { + "epoch": 19.876019575856443, + "grad_norm": 0.006645440123975277, + "learning_rate": 1.1736162598163791e-07, + "loss": 0.001, + "num_input_tokens_seen": 263214032, + "step": 121840 + }, + { + "epoch": 19.8768352365416, + "grad_norm": 0.026239193975925446, + "learning_rate": 1.1582454899267126e-07, + "loss": 0.0018, + "num_input_tokens_seen": 263225712, + "step": 121845 + }, + { + "epoch": 19.877650897226754, + "grad_norm": 0.00017447816208004951, + "learning_rate": 1.1429760279069168e-07, + "loss": 0.0022, + "num_input_tokens_seen": 263235632, + "step": 121850 + }, + { + "epoch": 19.87846655791191, + "grad_norm": 0.014652963727712631, + "learning_rate": 1.1278078740656339e-07, + "loss": 0.0016, + "num_input_tokens_seen": 263245872, + "step": 121855 + }, + { + "epoch": 19.879282218597062, + "grad_norm": 0.35925230383872986, + "learning_rate": 1.1127410287115059e-07, + "loss": 0.0068, + "num_input_tokens_seen": 263255984, + "step": 121860 + }, + { + "epoch": 19.880097879282218, + "grad_norm": 0.000630914350040257, + "learning_rate": 1.0977754921487337e-07, + "loss": 0.0011, + "num_input_tokens_seen": 263266800, + "step": 121865 + }, + { + "epoch": 19.880913539967374, + "grad_norm": 0.003280578413978219, + "learning_rate": 1.0829112646809635e-07, + "loss": 0.0008, + "num_input_tokens_seen": 263277744, + "step": 121870 + }, + { + "epoch": 19.88172920065253, + "grad_norm": 0.0055033001117408276, + "learning_rate": 1.068148346610176e-07, + "loss": 0.0103, + "num_input_tokens_seen": 263287120, + "step": 121875 + }, + { + "epoch": 19.882544861337685, + "grad_norm": 0.014537639915943146, + "learning_rate": 1.0534867382344659e-07, + "loss": 0.0009, + "num_input_tokens_seen": 263297744, + "step": 121880 + }, + { + "epoch": 19.883360522022837, + "grad_norm": 0.010632148012518883, + "learning_rate": 1.0389264398519283e-07, + "loss": 0.0022, + "num_input_tokens_seen": 263308464, + "step": 121885 + }, + { + "epoch": 19.884176182707993, + "grad_norm": 0.009531443938612938, + "learning_rate": 1.024467451756772e-07, + "loss": 0.001, + "num_input_tokens_seen": 263319504, + "step": 121890 + }, + { + "epoch": 19.88499184339315, + "grad_norm": 0.0028590448200702667, + "learning_rate": 1.0101097742426513e-07, + "loss": 0.0019, + "num_input_tokens_seen": 263330384, + "step": 121895 + }, + { + "epoch": 19.885807504078304, + "grad_norm": 0.001724202185869217, + "learning_rate": 9.958534075998893e-08, + "loss": 0.0034, + "num_input_tokens_seen": 263342160, + "step": 121900 + }, + { + "epoch": 19.88662316476346, + "grad_norm": 0.0016756814438849688, + "learning_rate": 9.816983521182543e-08, + "loss": 0.0034, + "num_input_tokens_seen": 263352944, + "step": 121905 + }, + { + "epoch": 19.887438825448612, + "grad_norm": 0.021503252908587456, + "learning_rate": 9.676446080841839e-08, + "loss": 0.0038, + "num_input_tokens_seen": 263364752, + "step": 121910 + }, + { + "epoch": 19.888254486133768, + "grad_norm": 0.06410571932792664, + "learning_rate": 9.536921757824502e-08, + "loss": 0.0013, + "num_input_tokens_seen": 263375536, + "step": 121915 + }, + { + "epoch": 19.889070146818923, + "grad_norm": 0.0006257572094909847, + "learning_rate": 9.39841055495605e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263386256, + "step": 121920 + }, + { + "epoch": 19.88988580750408, + "grad_norm": 0.0012954578269273043, + "learning_rate": 9.260912475050898e-08, + "loss": 0.001, + "num_input_tokens_seen": 263396912, + "step": 121925 + }, + { + "epoch": 19.890701468189235, + "grad_norm": 0.005243930034339428, + "learning_rate": 9.124427520890155e-08, + "loss": 0.0006, + "num_input_tokens_seen": 263408944, + "step": 121930 + }, + { + "epoch": 19.891517128874387, + "grad_norm": 0.0003207038389518857, + "learning_rate": 8.988955695238277e-08, + "loss": 0.0062, + "num_input_tokens_seen": 263420048, + "step": 121935 + }, + { + "epoch": 19.892332789559543, + "grad_norm": 0.0029847382102161646, + "learning_rate": 8.854497000843065e-08, + "loss": 0.0149, + "num_input_tokens_seen": 263430480, + "step": 121940 + }, + { + "epoch": 19.8931484502447, + "grad_norm": 0.007486116606742144, + "learning_rate": 8.721051440435668e-08, + "loss": 0.0029, + "num_input_tokens_seen": 263441136, + "step": 121945 + }, + { + "epoch": 19.893964110929854, + "grad_norm": 0.0015292003517970443, + "learning_rate": 8.588619016708377e-08, + "loss": 0.0033, + "num_input_tokens_seen": 263450256, + "step": 121950 + }, + { + "epoch": 19.894779771615006, + "grad_norm": 0.0005196183919906616, + "learning_rate": 8.457199732353482e-08, + "loss": 0.001, + "num_input_tokens_seen": 263461872, + "step": 121955 + }, + { + "epoch": 19.895595432300162, + "grad_norm": 0.000307762180455029, + "learning_rate": 8.32679359003552e-08, + "loss": 0.0082, + "num_input_tokens_seen": 263473104, + "step": 121960 + }, + { + "epoch": 19.896411092985318, + "grad_norm": 0.00352225243113935, + "learning_rate": 8.197400592391268e-08, + "loss": 0.0003, + "num_input_tokens_seen": 263482032, + "step": 121965 + }, + { + "epoch": 19.897226753670473, + "grad_norm": 0.0018733406905084848, + "learning_rate": 8.069020742040855e-08, + "loss": 0.0066, + "num_input_tokens_seen": 263492688, + "step": 121970 + }, + { + "epoch": 19.89804241435563, + "grad_norm": 0.0029799570329487324, + "learning_rate": 7.941654041598856e-08, + "loss": 0.0454, + "num_input_tokens_seen": 263503504, + "step": 121975 + }, + { + "epoch": 19.898858075040785, + "grad_norm": 0.0007707338081672788, + "learning_rate": 7.815300493635436e-08, + "loss": 0.0009, + "num_input_tokens_seen": 263514288, + "step": 121980 + }, + { + "epoch": 19.899673735725937, + "grad_norm": 0.0009643675875850022, + "learning_rate": 7.68996010071521e-08, + "loss": 0.0007, + "num_input_tokens_seen": 263524592, + "step": 121985 + }, + { + "epoch": 19.900489396411093, + "grad_norm": 0.014110865071415901, + "learning_rate": 7.565632865375039e-08, + "loss": 0.0021, + "num_input_tokens_seen": 263534640, + "step": 121990 + }, + { + "epoch": 19.90130505709625, + "grad_norm": 0.00029828620608896017, + "learning_rate": 7.442318790140679e-08, + "loss": 0.0003, + "num_input_tokens_seen": 263545488, + "step": 121995 + }, + { + "epoch": 19.902120717781404, + "grad_norm": 0.02362486906349659, + "learning_rate": 7.32001787750458e-08, + "loss": 0.0322, + "num_input_tokens_seen": 263556592, + "step": 122000 + }, + { + "epoch": 19.902936378466556, + "grad_norm": 0.024227816611528397, + "learning_rate": 7.198730129948094e-08, + "loss": 0.0015, + "num_input_tokens_seen": 263566960, + "step": 122005 + }, + { + "epoch": 19.903752039151712, + "grad_norm": 0.025522449985146523, + "learning_rate": 7.078455549935914e-08, + "loss": 0.0012, + "num_input_tokens_seen": 263577968, + "step": 122010 + }, + { + "epoch": 19.904567699836868, + "grad_norm": 0.0007530459552071989, + "learning_rate": 6.959194139893876e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263588656, + "step": 122015 + }, + { + "epoch": 19.905383360522023, + "grad_norm": 0.025915952399373055, + "learning_rate": 6.840945902242268e-08, + "loss": 0.0011, + "num_input_tokens_seen": 263599984, + "step": 122020 + }, + { + "epoch": 19.90619902120718, + "grad_norm": 0.06031077727675438, + "learning_rate": 6.723710839384723e-08, + "loss": 0.0025, + "num_input_tokens_seen": 263611792, + "step": 122025 + }, + { + "epoch": 19.90701468189233, + "grad_norm": 0.0003311052278149873, + "learning_rate": 6.607488953691565e-08, + "loss": 0.0014, + "num_input_tokens_seen": 263623088, + "step": 122030 + }, + { + "epoch": 19.907830342577487, + "grad_norm": 0.001318062306381762, + "learning_rate": 6.492280247516469e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263633232, + "step": 122035 + }, + { + "epoch": 19.908646003262643, + "grad_norm": 0.003854473354294896, + "learning_rate": 6.378084723196453e-08, + "loss": 0.0022, + "num_input_tokens_seen": 263643120, + "step": 122040 + }, + { + "epoch": 19.9094616639478, + "grad_norm": 0.0028099711053073406, + "learning_rate": 6.264902383051885e-08, + "loss": 0.0022, + "num_input_tokens_seen": 263653712, + "step": 122045 + }, + { + "epoch": 19.910277324632954, + "grad_norm": 0.008752093650400639, + "learning_rate": 6.152733229364272e-08, + "loss": 0.0008, + "num_input_tokens_seen": 263664752, + "step": 122050 + }, + { + "epoch": 19.911092985318106, + "grad_norm": 0.002545823808759451, + "learning_rate": 6.041577264415122e-08, + "loss": 0.0015, + "num_input_tokens_seen": 263676272, + "step": 122055 + }, + { + "epoch": 19.911908646003262, + "grad_norm": 0.0033062314614653587, + "learning_rate": 5.9314344904581876e-08, + "loss": 0.0006, + "num_input_tokens_seen": 263687088, + "step": 122060 + }, + { + "epoch": 19.912724306688418, + "grad_norm": 0.0010838387534022331, + "learning_rate": 5.822304909719467e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263697072, + "step": 122065 + }, + { + "epoch": 19.913539967373573, + "grad_norm": 0.07013796269893646, + "learning_rate": 5.714188524413855e-08, + "loss": 0.0036, + "num_input_tokens_seen": 263707728, + "step": 122070 + }, + { + "epoch": 19.91435562805873, + "grad_norm": 0.004880748223513365, + "learning_rate": 5.6070853367284903e-08, + "loss": 0.0163, + "num_input_tokens_seen": 263718000, + "step": 122075 + }, + { + "epoch": 19.91517128874388, + "grad_norm": 0.0007239999831654131, + "learning_rate": 5.500995348844962e-08, + "loss": 0.0101, + "num_input_tokens_seen": 263729040, + "step": 122080 + }, + { + "epoch": 19.915986949429037, + "grad_norm": 0.06177964434027672, + "learning_rate": 5.395918562900448e-08, + "loss": 0.0057, + "num_input_tokens_seen": 263740144, + "step": 122085 + }, + { + "epoch": 19.916802610114193, + "grad_norm": 0.004266651347279549, + "learning_rate": 5.2918549810376806e-08, + "loss": 0.0027, + "num_input_tokens_seen": 263750448, + "step": 122090 + }, + { + "epoch": 19.91761827079935, + "grad_norm": 0.16771961748600006, + "learning_rate": 5.188804605349429e-08, + "loss": 0.0067, + "num_input_tokens_seen": 263760080, + "step": 122095 + }, + { + "epoch": 19.918433931484504, + "grad_norm": 0.013668928295373917, + "learning_rate": 5.086767437939566e-08, + "loss": 0.0016, + "num_input_tokens_seen": 263770160, + "step": 122100 + }, + { + "epoch": 19.919249592169656, + "grad_norm": 0.005185098387300968, + "learning_rate": 4.985743480867555e-08, + "loss": 0.0009, + "num_input_tokens_seen": 263781104, + "step": 122105 + }, + { + "epoch": 19.920065252854812, + "grad_norm": 0.019272323697805405, + "learning_rate": 4.885732736181758e-08, + "loss": 0.0018, + "num_input_tokens_seen": 263791440, + "step": 122110 + }, + { + "epoch": 19.920880913539968, + "grad_norm": 0.0020935176871716976, + "learning_rate": 4.7867352059138835e-08, + "loss": 0.0008, + "num_input_tokens_seen": 263801328, + "step": 122115 + }, + { + "epoch": 19.921696574225123, + "grad_norm": 0.0010540640214458108, + "learning_rate": 4.688750892062332e-08, + "loss": 0.001, + "num_input_tokens_seen": 263812528, + "step": 122120 + }, + { + "epoch": 19.92251223491028, + "grad_norm": 0.00029742918559350073, + "learning_rate": 4.5917797966144037e-08, + "loss": 0.0012, + "num_input_tokens_seen": 263823024, + "step": 122125 + }, + { + "epoch": 19.92332789559543, + "grad_norm": 0.0015542684122920036, + "learning_rate": 4.495821921540744e-08, + "loss": 0.0011, + "num_input_tokens_seen": 263833680, + "step": 122130 + }, + { + "epoch": 19.924143556280587, + "grad_norm": 0.8919483423233032, + "learning_rate": 4.400877268784242e-08, + "loss": 0.0234, + "num_input_tokens_seen": 263844784, + "step": 122135 + }, + { + "epoch": 19.924959216965743, + "grad_norm": 0.00047430527047254145, + "learning_rate": 4.306945840265586e-08, + "loss": 0.0007, + "num_input_tokens_seen": 263855984, + "step": 122140 + }, + { + "epoch": 19.9257748776509, + "grad_norm": 0.001867034356109798, + "learning_rate": 4.2140276378943576e-08, + "loss": 0.0027, + "num_input_tokens_seen": 263867216, + "step": 122145 + }, + { + "epoch": 19.92659053833605, + "grad_norm": 0.29011815786361694, + "learning_rate": 4.1221226635468345e-08, + "loss": 0.0101, + "num_input_tokens_seen": 263877392, + "step": 122150 + }, + { + "epoch": 19.927406199021206, + "grad_norm": 0.0008424674742855132, + "learning_rate": 4.031230919088191e-08, + "loss": 0.0008, + "num_input_tokens_seen": 263888464, + "step": 122155 + }, + { + "epoch": 19.928221859706362, + "grad_norm": 0.0022897582966834307, + "learning_rate": 3.941352406361398e-08, + "loss": 0.001, + "num_input_tokens_seen": 263900144, + "step": 122160 + }, + { + "epoch": 19.929037520391518, + "grad_norm": 0.003309818683192134, + "learning_rate": 3.852487127187221e-08, + "loss": 0.0008, + "num_input_tokens_seen": 263910800, + "step": 122165 + }, + { + "epoch": 19.929853181076673, + "grad_norm": 0.021318409591913223, + "learning_rate": 3.7646350833697715e-08, + "loss": 0.0254, + "num_input_tokens_seen": 263921488, + "step": 122170 + }, + { + "epoch": 19.930668841761825, + "grad_norm": 0.00903343502432108, + "learning_rate": 3.677796276685408e-08, + "loss": 0.0016, + "num_input_tokens_seen": 263932176, + "step": 122175 + }, + { + "epoch": 19.93148450244698, + "grad_norm": 0.0011127277975901961, + "learning_rate": 3.591970708893832e-08, + "loss": 0.0008, + "num_input_tokens_seen": 263943024, + "step": 122180 + }, + { + "epoch": 19.932300163132137, + "grad_norm": 0.032411057502031326, + "learning_rate": 3.507158381738096e-08, + "loss": 0.0025, + "num_input_tokens_seen": 263954416, + "step": 122185 + }, + { + "epoch": 19.933115823817293, + "grad_norm": 0.0004720069991890341, + "learning_rate": 3.4233592969334926e-08, + "loss": 0.0011, + "num_input_tokens_seen": 263962640, + "step": 122190 + }, + { + "epoch": 19.93393148450245, + "grad_norm": 0.0994650274515152, + "learning_rate": 3.340573456184215e-08, + "loss": 0.0073, + "num_input_tokens_seen": 263973296, + "step": 122195 + }, + { + "epoch": 19.9347471451876, + "grad_norm": 0.000907588517293334, + "learning_rate": 3.258800861155598e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263984464, + "step": 122200 + }, + { + "epoch": 19.935562805872756, + "grad_norm": 0.1028270572423935, + "learning_rate": 3.178041513518526e-08, + "loss": 0.0035, + "num_input_tokens_seen": 263994480, + "step": 122205 + }, + { + "epoch": 19.936378466557912, + "grad_norm": 0.00754641043022275, + "learning_rate": 3.098295414899477e-08, + "loss": 0.0008, + "num_input_tokens_seen": 264005936, + "step": 122210 + }, + { + "epoch": 19.937194127243067, + "grad_norm": 0.0007687432807870209, + "learning_rate": 3.019562566924927e-08, + "loss": 0.0038, + "num_input_tokens_seen": 264016816, + "step": 122215 + }, + { + "epoch": 19.938009787928223, + "grad_norm": 0.00819560419768095, + "learning_rate": 2.9418429711769445e-08, + "loss": 0.0022, + "num_input_tokens_seen": 264027952, + "step": 122220 + }, + { + "epoch": 19.938825448613375, + "grad_norm": 0.044117119163274765, + "learning_rate": 2.865136629243148e-08, + "loss": 0.0023, + "num_input_tokens_seen": 264038992, + "step": 122225 + }, + { + "epoch": 19.93964110929853, + "grad_norm": 0.005589211825281382, + "learning_rate": 2.7894435426722988e-08, + "loss": 0.0088, + "num_input_tokens_seen": 264049776, + "step": 122230 + }, + { + "epoch": 19.940456769983687, + "grad_norm": 0.003952317405492067, + "learning_rate": 2.7147637130020553e-08, + "loss": 0.001, + "num_input_tokens_seen": 264060432, + "step": 122235 + }, + { + "epoch": 19.941272430668842, + "grad_norm": 0.0002749775012489408, + "learning_rate": 2.6410971417423214e-08, + "loss": 0.0003, + "num_input_tokens_seen": 264071856, + "step": 122240 + }, + { + "epoch": 19.942088091353998, + "grad_norm": 0.012595501728355885, + "learning_rate": 2.5684438303807955e-08, + "loss": 0.0018, + "num_input_tokens_seen": 264083344, + "step": 122245 + }, + { + "epoch": 19.94290375203915, + "grad_norm": 0.005414798855781555, + "learning_rate": 2.496803780405177e-08, + "loss": 0.0016, + "num_input_tokens_seen": 264094384, + "step": 122250 + }, + { + "epoch": 19.943719412724306, + "grad_norm": 0.0005905579309910536, + "learning_rate": 2.426176993253204e-08, + "loss": 0.0005, + "num_input_tokens_seen": 264105200, + "step": 122255 + }, + { + "epoch": 19.94453507340946, + "grad_norm": 0.0037936638109385967, + "learning_rate": 2.356563470357065e-08, + "loss": 0.0005, + "num_input_tokens_seen": 264115472, + "step": 122260 + }, + { + "epoch": 19.945350734094617, + "grad_norm": 0.0001786290085874498, + "learning_rate": 2.287963213137845e-08, + "loss": 0.0007, + "num_input_tokens_seen": 264125648, + "step": 122265 + }, + { + "epoch": 19.946166394779773, + "grad_norm": 0.003722716588526964, + "learning_rate": 2.2203762229777713e-08, + "loss": 0.0033, + "num_input_tokens_seen": 264136976, + "step": 122270 + }, + { + "epoch": 19.946982055464925, + "grad_norm": 0.005986783653497696, + "learning_rate": 2.15380250124797e-08, + "loss": 0.0033, + "num_input_tokens_seen": 264146768, + "step": 122275 + }, + { + "epoch": 19.94779771615008, + "grad_norm": 0.02588781714439392, + "learning_rate": 2.0882420493029132e-08, + "loss": 0.0009, + "num_input_tokens_seen": 264157616, + "step": 122280 + }, + { + "epoch": 19.948613376835237, + "grad_norm": 0.14042320847511292, + "learning_rate": 2.0236948684582147e-08, + "loss": 0.0139, + "num_input_tokens_seen": 264168176, + "step": 122285 + }, + { + "epoch": 19.949429037520392, + "grad_norm": 0.002619031583890319, + "learning_rate": 1.96016096003504e-08, + "loss": 0.0013, + "num_input_tokens_seen": 264177264, + "step": 122290 + }, + { + "epoch": 19.950244698205548, + "grad_norm": 0.005070169921964407, + "learning_rate": 1.8976403253156972e-08, + "loss": 0.0006, + "num_input_tokens_seen": 264186352, + "step": 122295 + }, + { + "epoch": 19.9510603588907, + "grad_norm": 0.011310932226479053, + "learning_rate": 1.836132965571391e-08, + "loss": 0.0018, + "num_input_tokens_seen": 264197232, + "step": 122300 + }, + { + "epoch": 19.951876019575856, + "grad_norm": 0.0019945164676755667, + "learning_rate": 1.7756388820400205e-08, + "loss": 0.1224, + "num_input_tokens_seen": 264207056, + "step": 122305 + }, + { + "epoch": 19.95269168026101, + "grad_norm": 0.03021303005516529, + "learning_rate": 1.716158075953933e-08, + "loss": 0.0014, + "num_input_tokens_seen": 264218288, + "step": 122310 + }, + { + "epoch": 19.953507340946167, + "grad_norm": 0.002650508191436529, + "learning_rate": 1.6576905485177206e-08, + "loss": 0.0152, + "num_input_tokens_seen": 264228816, + "step": 122315 + }, + { + "epoch": 19.954323001631323, + "grad_norm": 0.011295244097709656, + "learning_rate": 1.6002363009137712e-08, + "loss": 0.0009, + "num_input_tokens_seen": 264239856, + "step": 122320 + }, + { + "epoch": 19.955138662316475, + "grad_norm": 0.02210673689842224, + "learning_rate": 1.5437953343078182e-08, + "loss": 0.0016, + "num_input_tokens_seen": 264248848, + "step": 122325 + }, + { + "epoch": 19.95595432300163, + "grad_norm": 0.0006285077542997897, + "learning_rate": 1.488367649848943e-08, + "loss": 0.0005, + "num_input_tokens_seen": 264259632, + "step": 122330 + }, + { + "epoch": 19.956769983686787, + "grad_norm": 0.059016335755586624, + "learning_rate": 1.4339532486529195e-08, + "loss": 0.0028, + "num_input_tokens_seen": 264271152, + "step": 122335 + }, + { + "epoch": 19.957585644371942, + "grad_norm": 0.000976592069491744, + "learning_rate": 1.3805521318244196e-08, + "loss": 0.0009, + "num_input_tokens_seen": 264281744, + "step": 122340 + }, + { + "epoch": 19.958401305057095, + "grad_norm": 0.3123403489589691, + "learning_rate": 1.3281643004514621e-08, + "loss": 0.0512, + "num_input_tokens_seen": 264292656, + "step": 122345 + }, + { + "epoch": 19.95921696574225, + "grad_norm": 0.035995569080114365, + "learning_rate": 1.2767897555887587e-08, + "loss": 0.0026, + "num_input_tokens_seen": 264303664, + "step": 122350 + }, + { + "epoch": 19.960032626427406, + "grad_norm": 0.0020898154471069574, + "learning_rate": 1.2264284982743679e-08, + "loss": 0.0009, + "num_input_tokens_seen": 264314768, + "step": 122355 + }, + { + "epoch": 19.96084828711256, + "grad_norm": 0.004529369994997978, + "learning_rate": 1.1770805295407972e-08, + "loss": 0.001, + "num_input_tokens_seen": 264324976, + "step": 122360 + }, + { + "epoch": 19.961663947797717, + "grad_norm": 0.04026377946138382, + "learning_rate": 1.1287458503816961e-08, + "loss": 0.0018, + "num_input_tokens_seen": 264335792, + "step": 122365 + }, + { + "epoch": 19.96247960848287, + "grad_norm": 0.10382266342639923, + "learning_rate": 1.0814244617740609e-08, + "loss": 0.0057, + "num_input_tokens_seen": 264346512, + "step": 122370 + }, + { + "epoch": 19.963295269168025, + "grad_norm": 0.043546292930841446, + "learning_rate": 1.0351163646782346e-08, + "loss": 0.002, + "num_input_tokens_seen": 264356944, + "step": 122375 + }, + { + "epoch": 19.96411092985318, + "grad_norm": 0.06786137819290161, + "learning_rate": 9.898215600379068e-09, + "loss": 0.0028, + "num_input_tokens_seen": 264367376, + "step": 122380 + }, + { + "epoch": 19.964926590538337, + "grad_norm": 0.0008918531239032745, + "learning_rate": 9.455400487634602e-09, + "loss": 0.0003, + "num_input_tokens_seen": 264378864, + "step": 122385 + }, + { + "epoch": 19.965742251223492, + "grad_norm": 0.016499491408467293, + "learning_rate": 9.022718317597267e-09, + "loss": 0.0013, + "num_input_tokens_seen": 264389424, + "step": 122390 + }, + { + "epoch": 19.966557911908644, + "grad_norm": 0.21509645879268646, + "learning_rate": 8.600169098982313e-09, + "loss": 0.0066, + "num_input_tokens_seen": 264399376, + "step": 122395 + }, + { + "epoch": 19.9673735725938, + "grad_norm": 0.004423909820616245, + "learning_rate": 8.187752840338458e-09, + "loss": 0.0013, + "num_input_tokens_seen": 264411056, + "step": 122400 + }, + { + "epoch": 19.968189233278956, + "grad_norm": 0.0020240589510649443, + "learning_rate": 7.785469550103397e-09, + "loss": 0.0005, + "num_input_tokens_seen": 264422640, + "step": 122405 + }, + { + "epoch": 19.96900489396411, + "grad_norm": 0.0003290712193120271, + "learning_rate": 7.393319236326246e-09, + "loss": 0.0218, + "num_input_tokens_seen": 264434640, + "step": 122410 + }, + { + "epoch": 19.969820554649267, + "grad_norm": 0.004327591508626938, + "learning_rate": 7.011301907056122e-09, + "loss": 0.0027, + "num_input_tokens_seen": 264444624, + "step": 122415 + }, + { + "epoch": 19.97063621533442, + "grad_norm": 0.011982999742031097, + "learning_rate": 6.639417570009076e-09, + "loss": 0.0025, + "num_input_tokens_seen": 264454544, + "step": 122420 + }, + { + "epoch": 19.971451876019575, + "grad_norm": 0.025622902438044548, + "learning_rate": 6.2776662326236025e-09, + "loss": 0.0612, + "num_input_tokens_seen": 264466608, + "step": 122425 + }, + { + "epoch": 19.97226753670473, + "grad_norm": 0.001789126661606133, + "learning_rate": 5.926047902393705e-09, + "loss": 0.002, + "num_input_tokens_seen": 264477040, + "step": 122430 + }, + { + "epoch": 19.973083197389887, + "grad_norm": 0.026516800746321678, + "learning_rate": 5.584562586313791e-09, + "loss": 0.0016, + "num_input_tokens_seen": 264487856, + "step": 122435 + }, + { + "epoch": 19.973898858075042, + "grad_norm": 0.005369944963604212, + "learning_rate": 5.253210291322752e-09, + "loss": 0.0032, + "num_input_tokens_seen": 264497936, + "step": 122440 + }, + { + "epoch": 19.974714518760194, + "grad_norm": 0.007590974681079388, + "learning_rate": 4.93199102419295e-09, + "loss": 0.0005, + "num_input_tokens_seen": 264507600, + "step": 122445 + }, + { + "epoch": 19.97553017944535, + "grad_norm": 0.014951630495488644, + "learning_rate": 4.620904791419189e-09, + "loss": 0.0008, + "num_input_tokens_seen": 264519120, + "step": 122450 + }, + { + "epoch": 19.976345840130506, + "grad_norm": 0.0008193363901227713, + "learning_rate": 4.31995159927423e-09, + "loss": 0.0005, + "num_input_tokens_seen": 264530320, + "step": 122455 + }, + { + "epoch": 19.97716150081566, + "grad_norm": 0.010377529077231884, + "learning_rate": 4.029131453864299e-09, + "loss": 0.0036, + "num_input_tokens_seen": 264540912, + "step": 122460 + }, + { + "epoch": 19.977977161500817, + "grad_norm": 0.2446034699678421, + "learning_rate": 3.748444361129088e-09, + "loss": 0.0049, + "num_input_tokens_seen": 264550928, + "step": 122465 + }, + { + "epoch": 19.97879282218597, + "grad_norm": 0.00831429474055767, + "learning_rate": 3.477890326675226e-09, + "loss": 0.0021, + "num_input_tokens_seen": 264561584, + "step": 122470 + }, + { + "epoch": 19.979608482871125, + "grad_norm": 0.08868524432182312, + "learning_rate": 3.217469356053826e-09, + "loss": 0.001, + "num_input_tokens_seen": 264572464, + "step": 122475 + }, + { + "epoch": 19.98042414355628, + "grad_norm": 0.00023401925864163786, + "learning_rate": 2.9671814545384477e-09, + "loss": 0.0006, + "num_input_tokens_seen": 264582672, + "step": 122480 + }, + { + "epoch": 19.981239804241437, + "grad_norm": 0.000985664431937039, + "learning_rate": 2.7270266271806065e-09, + "loss": 0.0005, + "num_input_tokens_seen": 264593840, + "step": 122485 + }, + { + "epoch": 19.982055464926592, + "grad_norm": 0.0008842953247949481, + "learning_rate": 2.4970048788652833e-09, + "loss": 0.0004, + "num_input_tokens_seen": 264605136, + "step": 122490 + }, + { + "epoch": 19.982871125611744, + "grad_norm": 0.00036722770892083645, + "learning_rate": 2.2771162141999036e-09, + "loss": 0.0005, + "num_input_tokens_seen": 264617584, + "step": 122495 + }, + { + "epoch": 19.9836867862969, + "grad_norm": 0.049424611032009125, + "learning_rate": 2.0673606376808707e-09, + "loss": 0.0016, + "num_input_tokens_seen": 264627760, + "step": 122500 + }, + { + "epoch": 19.984502446982056, + "grad_norm": 0.01784713752567768, + "learning_rate": 1.8677381535825435e-09, + "loss": 0.0008, + "num_input_tokens_seen": 264638864, + "step": 122505 + }, + { + "epoch": 19.98531810766721, + "grad_norm": 0.001981490757316351, + "learning_rate": 1.6782487659572354e-09, + "loss": 0.0005, + "num_input_tokens_seen": 264648880, + "step": 122510 + }, + { + "epoch": 19.986133768352367, + "grad_norm": 0.0023713652044534683, + "learning_rate": 1.4988924785797053e-09, + "loss": 0.002, + "num_input_tokens_seen": 264659984, + "step": 122515 + }, + { + "epoch": 19.98694942903752, + "grad_norm": 0.1398274153470993, + "learning_rate": 1.329669295113689e-09, + "loss": 0.0055, + "num_input_tokens_seen": 264671504, + "step": 122520 + }, + { + "epoch": 19.987765089722675, + "grad_norm": 0.3532371520996094, + "learning_rate": 1.1705792190008778e-09, + "loss": 0.0134, + "num_input_tokens_seen": 264682704, + "step": 122525 + }, + { + "epoch": 19.98858075040783, + "grad_norm": 0.014912966638803482, + "learning_rate": 1.0216222534609189e-09, + "loss": 0.0028, + "num_input_tokens_seen": 264693232, + "step": 122530 + }, + { + "epoch": 19.989396411092986, + "grad_norm": 0.0006106890505179763, + "learning_rate": 8.827984014914137e-10, + "loss": 0.0024, + "num_input_tokens_seen": 264704400, + "step": 122535 + }, + { + "epoch": 19.99021207177814, + "grad_norm": 0.0008242643089033663, + "learning_rate": 7.541076659234314e-10, + "loss": 0.0018, + "num_input_tokens_seen": 264713648, + "step": 122540 + }, + { + "epoch": 19.991027732463294, + "grad_norm": 0.00012639925989788026, + "learning_rate": 6.355500494215072e-10, + "loss": 0.0094, + "num_input_tokens_seen": 264724496, + "step": 122545 + }, + { + "epoch": 19.99184339314845, + "grad_norm": 0.01107293926179409, + "learning_rate": 5.271255543171094e-10, + "loss": 0.002, + "num_input_tokens_seen": 264735760, + "step": 122550 + }, + { + "epoch": 19.992659053833606, + "grad_norm": 0.0008368192939087749, + "learning_rate": 4.2883418277517293e-10, + "loss": 0.0007, + "num_input_tokens_seen": 264744688, + "step": 122555 + }, + { + "epoch": 19.99347471451876, + "grad_norm": 0.09536808729171753, + "learning_rate": 3.4067593690512154e-10, + "loss": 0.0036, + "num_input_tokens_seen": 264755280, + "step": 122560 + }, + { + "epoch": 19.994290375203914, + "grad_norm": 0.004233692307025194, + "learning_rate": 2.6265081837228976e-10, + "loss": 0.0017, + "num_input_tokens_seen": 264764784, + "step": 122565 + }, + { + "epoch": 19.99510603588907, + "grad_norm": 0.003642668481916189, + "learning_rate": 1.9475882884201212e-10, + "loss": 0.0005, + "num_input_tokens_seen": 264774864, + "step": 122570 + }, + { + "epoch": 19.995921696574225, + "grad_norm": 0.00029860870563425124, + "learning_rate": 1.3699996964655626e-10, + "loss": 0.0003, + "num_input_tokens_seen": 264786800, + "step": 122575 + }, + { + "epoch": 19.99673735725938, + "grad_norm": 0.0004550835001282394, + "learning_rate": 8.937424195165634e-11, + "loss": 0.001, + "num_input_tokens_seen": 264798096, + "step": 122580 + }, + { + "epoch": 19.997553017944536, + "grad_norm": 0.0013587478315457702, + "learning_rate": 5.188164675651308e-11, + "loss": 0.006, + "num_input_tokens_seen": 264810128, + "step": 122585 + }, + { + "epoch": 19.99836867862969, + "grad_norm": 0.00026625217287801206, + "learning_rate": 2.4522184838282614e-11, + "loss": 0.0002, + "num_input_tokens_seen": 264819248, + "step": 122590 + }, + { + "epoch": 19.999184339314844, + "grad_norm": 2.0659940242767334, + "learning_rate": 7.295856696565295e-12, + "loss": 0.0159, + "num_input_tokens_seen": 264831024, + "step": 122595 + }, + { + "epoch": 20.0, + "grad_norm": 0.0002738155599217862, + "learning_rate": 2.0266266442803271e-13, + "loss": 0.0014, + "num_input_tokens_seen": 264840880, + "step": 122600 + }, + { + "epoch": 20.0, + "eval_loss": 0.3499123454093933, + "eval_runtime": 103.9372, + "eval_samples_per_second": 26.218, + "eval_steps_per_second": 6.562, + "num_input_tokens_seen": 264840880, + "step": 122600 + }, + { + "epoch": 20.0, + "num_input_tokens_seen": 264840880, + "step": 122600, + "total_flos": 1.1925665126797148e+19, + "train_loss": 0.0669187841472106, + "train_runtime": 45650.2803, + "train_samples_per_second": 10.742, + "train_steps_per_second": 2.686 + } + ], + "logging_steps": 5, + "max_steps": 122600, + "num_input_tokens_seen": 264840880, + "num_train_epochs": 20, + "save_steps": 6130, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1925665126797148e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}