{ "best_global_step": 11960, "best_metric": 0.49523019790649414, "best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_gsm8k_123_1760637708/checkpoint-11960", "epoch": 20.0, "eval_steps": 2990, "global_step": 29900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033444816053511705, "grad_norm": 46.91719055175781, "learning_rate": 1.3377926421404683e-08, "loss": 5.0676, "num_input_tokens_seen": 5120, "step": 5 }, { "epoch": 0.006688963210702341, "grad_norm": 47.75601577758789, "learning_rate": 3.010033444816054e-08, "loss": 5.2401, "num_input_tokens_seen": 10432, "step": 10 }, { "epoch": 0.010033444816053512, "grad_norm": 38.735809326171875, "learning_rate": 4.682274247491639e-08, "loss": 5.4302, "num_input_tokens_seen": 15872, "step": 15 }, { "epoch": 0.013377926421404682, "grad_norm": 46.26157760620117, "learning_rate": 6.354515050167225e-08, "loss": 5.3156, "num_input_tokens_seen": 21568, "step": 20 }, { "epoch": 0.016722408026755852, "grad_norm": 53.5979118347168, "learning_rate": 8.02675585284281e-08, "loss": 5.0741, "num_input_tokens_seen": 26784, "step": 25 }, { "epoch": 0.020066889632107024, "grad_norm": 55.685302734375, "learning_rate": 9.698996655518395e-08, "loss": 5.1124, "num_input_tokens_seen": 32576, "step": 30 }, { "epoch": 0.023411371237458192, "grad_norm": 36.38007736206055, "learning_rate": 1.1371237458193981e-07, "loss": 5.049, "num_input_tokens_seen": 36992, "step": 35 }, { "epoch": 0.026755852842809364, "grad_norm": 50.47294616699219, "learning_rate": 1.3043478260869566e-07, "loss": 5.0839, "num_input_tokens_seen": 42464, "step": 40 }, { "epoch": 0.030100334448160536, "grad_norm": 43.063255310058594, "learning_rate": 1.4715719063545153e-07, "loss": 4.9538, "num_input_tokens_seen": 47712, "step": 45 }, { "epoch": 0.033444816053511704, "grad_norm": 40.59852600097656, "learning_rate": 1.6387959866220735e-07, "loss": 5.1145, "num_input_tokens_seen": 52960, "step": 50 }, { "epoch": 0.03678929765886288, "grad_norm": 38.615867614746094, "learning_rate": 1.8060200668896325e-07, "loss": 4.6514, "num_input_tokens_seen": 59232, "step": 55 }, { "epoch": 0.04013377926421405, "grad_norm": 33.774166107177734, "learning_rate": 1.973244147157191e-07, "loss": 4.867, "num_input_tokens_seen": 63776, "step": 60 }, { "epoch": 0.043478260869565216, "grad_norm": 38.725101470947266, "learning_rate": 2.1404682274247494e-07, "loss": 4.7758, "num_input_tokens_seen": 68896, "step": 65 }, { "epoch": 0.046822742474916385, "grad_norm": 42.17696762084961, "learning_rate": 2.307692307692308e-07, "loss": 4.7666, "num_input_tokens_seen": 75552, "step": 70 }, { "epoch": 0.05016722408026756, "grad_norm": 28.672748565673828, "learning_rate": 2.474916387959866e-07, "loss": 4.7401, "num_input_tokens_seen": 80928, "step": 75 }, { "epoch": 0.05351170568561873, "grad_norm": 31.82779312133789, "learning_rate": 2.642140468227425e-07, "loss": 4.5955, "num_input_tokens_seen": 86560, "step": 80 }, { "epoch": 0.056856187290969896, "grad_norm": 25.630165100097656, "learning_rate": 2.8093645484949837e-07, "loss": 4.7533, "num_input_tokens_seen": 91456, "step": 85 }, { "epoch": 0.06020066889632107, "grad_norm": 34.48438262939453, "learning_rate": 2.976588628762542e-07, "loss": 4.6029, "num_input_tokens_seen": 96288, "step": 90 }, { "epoch": 0.06354515050167224, "grad_norm": 25.781822204589844, "learning_rate": 3.1438127090301006e-07, "loss": 4.7045, "num_input_tokens_seen": 100384, "step": 95 }, { "epoch": 0.06688963210702341, "grad_norm": 38.139766693115234, "learning_rate": 3.3110367892976593e-07, "loss": 4.5248, "num_input_tokens_seen": 105376, "step": 100 }, { "epoch": 0.07023411371237458, "grad_norm": 26.370643615722656, "learning_rate": 3.4782608695652175e-07, "loss": 4.4776, "num_input_tokens_seen": 109632, "step": 105 }, { "epoch": 0.07357859531772576, "grad_norm": 23.291044235229492, "learning_rate": 3.645484949832776e-07, "loss": 4.3531, "num_input_tokens_seen": 114880, "step": 110 }, { "epoch": 0.07692307692307693, "grad_norm": 26.41623306274414, "learning_rate": 3.8127090301003344e-07, "loss": 4.505, "num_input_tokens_seen": 120768, "step": 115 }, { "epoch": 0.0802675585284281, "grad_norm": 24.707361221313477, "learning_rate": 3.979933110367893e-07, "loss": 4.2106, "num_input_tokens_seen": 125536, "step": 120 }, { "epoch": 0.08361204013377926, "grad_norm": 19.375080108642578, "learning_rate": 4.1471571906354524e-07, "loss": 4.1522, "num_input_tokens_seen": 130240, "step": 125 }, { "epoch": 0.08695652173913043, "grad_norm": 19.71665382385254, "learning_rate": 4.3143812709030106e-07, "loss": 4.2341, "num_input_tokens_seen": 134848, "step": 130 }, { "epoch": 0.0903010033444816, "grad_norm": 20.46674156188965, "learning_rate": 4.481605351170569e-07, "loss": 4.2017, "num_input_tokens_seen": 139456, "step": 135 }, { "epoch": 0.09364548494983277, "grad_norm": 17.671037673950195, "learning_rate": 4.6488294314381275e-07, "loss": 4.1377, "num_input_tokens_seen": 144960, "step": 140 }, { "epoch": 0.09698996655518395, "grad_norm": 19.32286262512207, "learning_rate": 4.816053511705686e-07, "loss": 4.0746, "num_input_tokens_seen": 150336, "step": 145 }, { "epoch": 0.10033444816053512, "grad_norm": 19.63654136657715, "learning_rate": 4.983277591973244e-07, "loss": 4.059, "num_input_tokens_seen": 155104, "step": 150 }, { "epoch": 0.10367892976588629, "grad_norm": 17.617090225219727, "learning_rate": 5.150501672240804e-07, "loss": 3.7655, "num_input_tokens_seen": 160416, "step": 155 }, { "epoch": 0.10702341137123746, "grad_norm": 19.981536865234375, "learning_rate": 5.317725752508362e-07, "loss": 3.942, "num_input_tokens_seen": 165280, "step": 160 }, { "epoch": 0.11036789297658862, "grad_norm": 21.321786880493164, "learning_rate": 5.48494983277592e-07, "loss": 3.6218, "num_input_tokens_seen": 171424, "step": 165 }, { "epoch": 0.11371237458193979, "grad_norm": 15.141613960266113, "learning_rate": 5.652173913043478e-07, "loss": 3.6184, "num_input_tokens_seen": 177280, "step": 170 }, { "epoch": 0.11705685618729098, "grad_norm": 21.012407302856445, "learning_rate": 5.819397993311037e-07, "loss": 3.7153, "num_input_tokens_seen": 183424, "step": 175 }, { "epoch": 0.12040133779264214, "grad_norm": 20.948381423950195, "learning_rate": 5.986622073578596e-07, "loss": 3.8428, "num_input_tokens_seen": 188032, "step": 180 }, { "epoch": 0.12374581939799331, "grad_norm": 13.795903205871582, "learning_rate": 6.153846153846155e-07, "loss": 3.6535, "num_input_tokens_seen": 193184, "step": 185 }, { "epoch": 0.12709030100334448, "grad_norm": 15.03226089477539, "learning_rate": 6.321070234113713e-07, "loss": 3.7664, "num_input_tokens_seen": 198336, "step": 190 }, { "epoch": 0.13043478260869565, "grad_norm": 14.706256866455078, "learning_rate": 6.488294314381271e-07, "loss": 3.5343, "num_input_tokens_seen": 203776, "step": 195 }, { "epoch": 0.13377926421404682, "grad_norm": 17.53951072692871, "learning_rate": 6.65551839464883e-07, "loss": 3.5736, "num_input_tokens_seen": 208448, "step": 200 }, { "epoch": 0.13712374581939799, "grad_norm": 16.01492691040039, "learning_rate": 6.822742474916389e-07, "loss": 3.4191, "num_input_tokens_seen": 213792, "step": 205 }, { "epoch": 0.14046822742474915, "grad_norm": 13.654372215270996, "learning_rate": 6.989966555183947e-07, "loss": 3.591, "num_input_tokens_seen": 218496, "step": 210 }, { "epoch": 0.14381270903010032, "grad_norm": 16.52764320373535, "learning_rate": 7.157190635451505e-07, "loss": 3.2559, "num_input_tokens_seen": 224256, "step": 215 }, { "epoch": 0.14715719063545152, "grad_norm": 14.287187576293945, "learning_rate": 7.324414715719063e-07, "loss": 3.3404, "num_input_tokens_seen": 229760, "step": 220 }, { "epoch": 0.1505016722408027, "grad_norm": 12.822321891784668, "learning_rate": 7.491638795986622e-07, "loss": 3.3863, "num_input_tokens_seen": 235424, "step": 225 }, { "epoch": 0.15384615384615385, "grad_norm": 15.983246803283691, "learning_rate": 7.658862876254181e-07, "loss": 3.1805, "num_input_tokens_seen": 239840, "step": 230 }, { "epoch": 0.15719063545150502, "grad_norm": 13.211336135864258, "learning_rate": 7.82608695652174e-07, "loss": 3.1406, "num_input_tokens_seen": 245856, "step": 235 }, { "epoch": 0.1605351170568562, "grad_norm": 10.732369422912598, "learning_rate": 7.993311036789299e-07, "loss": 3.2777, "num_input_tokens_seen": 250496, "step": 240 }, { "epoch": 0.16387959866220736, "grad_norm": 16.439023971557617, "learning_rate": 8.160535117056857e-07, "loss": 3.3529, "num_input_tokens_seen": 255008, "step": 245 }, { "epoch": 0.16722408026755853, "grad_norm": 21.353416442871094, "learning_rate": 8.327759197324416e-07, "loss": 3.1757, "num_input_tokens_seen": 259968, "step": 250 }, { "epoch": 0.1705685618729097, "grad_norm": 15.413599014282227, "learning_rate": 8.494983277591974e-07, "loss": 3.1353, "num_input_tokens_seen": 265088, "step": 255 }, { "epoch": 0.17391304347826086, "grad_norm": 13.31151008605957, "learning_rate": 8.662207357859533e-07, "loss": 3.0596, "num_input_tokens_seen": 269888, "step": 260 }, { "epoch": 0.17725752508361203, "grad_norm": 13.30479621887207, "learning_rate": 8.829431438127091e-07, "loss": 2.919, "num_input_tokens_seen": 275328, "step": 265 }, { "epoch": 0.1806020066889632, "grad_norm": 14.571383476257324, "learning_rate": 8.996655518394649e-07, "loss": 3.0223, "num_input_tokens_seen": 280512, "step": 270 }, { "epoch": 0.18394648829431437, "grad_norm": 20.667600631713867, "learning_rate": 9.163879598662208e-07, "loss": 2.812, "num_input_tokens_seen": 285856, "step": 275 }, { "epoch": 0.18729096989966554, "grad_norm": 15.86732292175293, "learning_rate": 9.331103678929766e-07, "loss": 2.8313, "num_input_tokens_seen": 291008, "step": 280 }, { "epoch": 0.19063545150501673, "grad_norm": 11.822955131530762, "learning_rate": 9.498327759197325e-07, "loss": 2.937, "num_input_tokens_seen": 295808, "step": 285 }, { "epoch": 0.1939799331103679, "grad_norm": 12.861940383911133, "learning_rate": 9.665551839464883e-07, "loss": 2.9218, "num_input_tokens_seen": 300704, "step": 290 }, { "epoch": 0.19732441471571907, "grad_norm": 10.936447143554688, "learning_rate": 9.832775919732443e-07, "loss": 2.7944, "num_input_tokens_seen": 305344, "step": 295 }, { "epoch": 0.20066889632107024, "grad_norm": 16.990398406982422, "learning_rate": 1.0000000000000002e-06, "loss": 2.8137, "num_input_tokens_seen": 310560, "step": 300 }, { "epoch": 0.2040133779264214, "grad_norm": 18.51235008239746, "learning_rate": 1.016722408026756e-06, "loss": 2.8597, "num_input_tokens_seen": 315616, "step": 305 }, { "epoch": 0.20735785953177258, "grad_norm": 14.541281700134277, "learning_rate": 1.0334448160535118e-06, "loss": 2.7206, "num_input_tokens_seen": 321280, "step": 310 }, { "epoch": 0.21070234113712374, "grad_norm": 20.195690155029297, "learning_rate": 1.0501672240802676e-06, "loss": 2.8363, "num_input_tokens_seen": 326368, "step": 315 }, { "epoch": 0.2140468227424749, "grad_norm": 14.451964378356934, "learning_rate": 1.0668896321070234e-06, "loss": 2.5526, "num_input_tokens_seen": 330816, "step": 320 }, { "epoch": 0.21739130434782608, "grad_norm": 17.903043746948242, "learning_rate": 1.0836120401337793e-06, "loss": 2.6044, "num_input_tokens_seen": 336704, "step": 325 }, { "epoch": 0.22073578595317725, "grad_norm": 14.424607276916504, "learning_rate": 1.1003344481605353e-06, "loss": 2.6032, "num_input_tokens_seen": 341216, "step": 330 }, { "epoch": 0.22408026755852842, "grad_norm": 19.351394653320312, "learning_rate": 1.1170568561872911e-06, "loss": 2.5098, "num_input_tokens_seen": 346208, "step": 335 }, { "epoch": 0.22742474916387959, "grad_norm": 12.429352760314941, "learning_rate": 1.133779264214047e-06, "loss": 2.5597, "num_input_tokens_seen": 351456, "step": 340 }, { "epoch": 0.23076923076923078, "grad_norm": 16.16684913635254, "learning_rate": 1.1505016722408027e-06, "loss": 2.5652, "num_input_tokens_seen": 356448, "step": 345 }, { "epoch": 0.23411371237458195, "grad_norm": 41.474639892578125, "learning_rate": 1.1672240802675586e-06, "loss": 2.6448, "num_input_tokens_seen": 361920, "step": 350 }, { "epoch": 0.23745819397993312, "grad_norm": 12.980622291564941, "learning_rate": 1.1839464882943144e-06, "loss": 2.4251, "num_input_tokens_seen": 367456, "step": 355 }, { "epoch": 0.2408026755852843, "grad_norm": 26.087112426757812, "learning_rate": 1.2006688963210704e-06, "loss": 2.4366, "num_input_tokens_seen": 373536, "step": 360 }, { "epoch": 0.24414715719063546, "grad_norm": 15.603264808654785, "learning_rate": 1.2173913043478262e-06, "loss": 2.3765, "num_input_tokens_seen": 379328, "step": 365 }, { "epoch": 0.24749163879598662, "grad_norm": 14.955710411071777, "learning_rate": 1.234113712374582e-06, "loss": 2.355, "num_input_tokens_seen": 384800, "step": 370 }, { "epoch": 0.2508361204013378, "grad_norm": 13.720016479492188, "learning_rate": 1.2508361204013379e-06, "loss": 2.4043, "num_input_tokens_seen": 390336, "step": 375 }, { "epoch": 0.25418060200668896, "grad_norm": 20.2301025390625, "learning_rate": 1.2675585284280937e-06, "loss": 2.492, "num_input_tokens_seen": 394912, "step": 380 }, { "epoch": 0.25752508361204013, "grad_norm": 15.004314422607422, "learning_rate": 1.2842809364548495e-06, "loss": 2.2857, "num_input_tokens_seen": 399712, "step": 385 }, { "epoch": 0.2608695652173913, "grad_norm": 11.531144142150879, "learning_rate": 1.3010033444816055e-06, "loss": 2.2848, "num_input_tokens_seen": 404320, "step": 390 }, { "epoch": 0.26421404682274247, "grad_norm": 17.532400131225586, "learning_rate": 1.3177257525083614e-06, "loss": 2.1806, "num_input_tokens_seen": 409088, "step": 395 }, { "epoch": 0.26755852842809363, "grad_norm": 23.666322708129883, "learning_rate": 1.3344481605351172e-06, "loss": 2.378, "num_input_tokens_seen": 413632, "step": 400 }, { "epoch": 0.2709030100334448, "grad_norm": 11.490878105163574, "learning_rate": 1.3511705685618732e-06, "loss": 2.3094, "num_input_tokens_seen": 418272, "step": 405 }, { "epoch": 0.27424749163879597, "grad_norm": 18.241029739379883, "learning_rate": 1.367892976588629e-06, "loss": 2.3547, "num_input_tokens_seen": 424128, "step": 410 }, { "epoch": 0.27759197324414714, "grad_norm": 15.462986946105957, "learning_rate": 1.3846153846153848e-06, "loss": 2.1965, "num_input_tokens_seen": 428928, "step": 415 }, { "epoch": 0.2809364548494983, "grad_norm": 16.310638427734375, "learning_rate": 1.4013377926421407e-06, "loss": 2.17, "num_input_tokens_seen": 434272, "step": 420 }, { "epoch": 0.2842809364548495, "grad_norm": 21.123184204101562, "learning_rate": 1.4180602006688965e-06, "loss": 2.1836, "num_input_tokens_seen": 439424, "step": 425 }, { "epoch": 0.28762541806020064, "grad_norm": 16.914058685302734, "learning_rate": 1.4347826086956523e-06, "loss": 2.1514, "num_input_tokens_seen": 444576, "step": 430 }, { "epoch": 0.2909698996655518, "grad_norm": 17.51774787902832, "learning_rate": 1.4515050167224081e-06, "loss": 2.1801, "num_input_tokens_seen": 451200, "step": 435 }, { "epoch": 0.29431438127090304, "grad_norm": 20.416851043701172, "learning_rate": 1.468227424749164e-06, "loss": 2.2465, "num_input_tokens_seen": 456096, "step": 440 }, { "epoch": 0.2976588628762542, "grad_norm": 21.33445930480957, "learning_rate": 1.4849498327759198e-06, "loss": 2.0032, "num_input_tokens_seen": 460704, "step": 445 }, { "epoch": 0.3010033444816054, "grad_norm": 18.084840774536133, "learning_rate": 1.5016722408026758e-06, "loss": 2.0422, "num_input_tokens_seen": 466688, "step": 450 }, { "epoch": 0.30434782608695654, "grad_norm": 16.742563247680664, "learning_rate": 1.5183946488294316e-06, "loss": 2.0842, "num_input_tokens_seen": 471840, "step": 455 }, { "epoch": 0.3076923076923077, "grad_norm": 10.680274963378906, "learning_rate": 1.5351170568561874e-06, "loss": 2.0517, "num_input_tokens_seen": 477120, "step": 460 }, { "epoch": 0.3110367892976589, "grad_norm": 12.628313064575195, "learning_rate": 1.5518394648829432e-06, "loss": 1.9775, "num_input_tokens_seen": 481760, "step": 465 }, { "epoch": 0.31438127090301005, "grad_norm": 13.627203941345215, "learning_rate": 1.568561872909699e-06, "loss": 2.0602, "num_input_tokens_seen": 487296, "step": 470 }, { "epoch": 0.3177257525083612, "grad_norm": 18.350616455078125, "learning_rate": 1.5852842809364549e-06, "loss": 2.0795, "num_input_tokens_seen": 493568, "step": 475 }, { "epoch": 0.3210702341137124, "grad_norm": 16.93640899658203, "learning_rate": 1.6020066889632107e-06, "loss": 2.0475, "num_input_tokens_seen": 498464, "step": 480 }, { "epoch": 0.32441471571906355, "grad_norm": 11.04397201538086, "learning_rate": 1.6187290969899665e-06, "loss": 1.9556, "num_input_tokens_seen": 503072, "step": 485 }, { "epoch": 0.3277591973244147, "grad_norm": 16.210094451904297, "learning_rate": 1.6354515050167226e-06, "loss": 1.9766, "num_input_tokens_seen": 508256, "step": 490 }, { "epoch": 0.3311036789297659, "grad_norm": 13.323111534118652, "learning_rate": 1.6521739130434784e-06, "loss": 1.9545, "num_input_tokens_seen": 513568, "step": 495 }, { "epoch": 0.33444816053511706, "grad_norm": 16.6793270111084, "learning_rate": 1.6688963210702342e-06, "loss": 1.9692, "num_input_tokens_seen": 518272, "step": 500 }, { "epoch": 0.3377926421404682, "grad_norm": 17.930749893188477, "learning_rate": 1.68561872909699e-06, "loss": 1.9265, "num_input_tokens_seen": 523232, "step": 505 }, { "epoch": 0.3411371237458194, "grad_norm": 9.621335983276367, "learning_rate": 1.7023411371237458e-06, "loss": 1.9555, "num_input_tokens_seen": 528992, "step": 510 }, { "epoch": 0.34448160535117056, "grad_norm": 18.964805603027344, "learning_rate": 1.7190635451505019e-06, "loss": 1.7462, "num_input_tokens_seen": 534144, "step": 515 }, { "epoch": 0.34782608695652173, "grad_norm": 16.21211051940918, "learning_rate": 1.7357859531772579e-06, "loss": 1.9619, "num_input_tokens_seen": 539680, "step": 520 }, { "epoch": 0.3511705685618729, "grad_norm": 13.877176284790039, "learning_rate": 1.7525083612040137e-06, "loss": 1.8706, "num_input_tokens_seen": 544448, "step": 525 }, { "epoch": 0.35451505016722407, "grad_norm": 15.488062858581543, "learning_rate": 1.7692307692307695e-06, "loss": 1.899, "num_input_tokens_seen": 549696, "step": 530 }, { "epoch": 0.35785953177257523, "grad_norm": 12.403740882873535, "learning_rate": 1.7859531772575253e-06, "loss": 1.8059, "num_input_tokens_seen": 555232, "step": 535 }, { "epoch": 0.3612040133779264, "grad_norm": 21.587539672851562, "learning_rate": 1.8026755852842812e-06, "loss": 1.9005, "num_input_tokens_seen": 560064, "step": 540 }, { "epoch": 0.36454849498327757, "grad_norm": 12.645978927612305, "learning_rate": 1.819397993311037e-06, "loss": 1.6958, "num_input_tokens_seen": 565152, "step": 545 }, { "epoch": 0.36789297658862874, "grad_norm": 14.05691146850586, "learning_rate": 1.8361204013377928e-06, "loss": 1.82, "num_input_tokens_seen": 570304, "step": 550 }, { "epoch": 0.3712374581939799, "grad_norm": 16.616361618041992, "learning_rate": 1.8528428093645486e-06, "loss": 1.79, "num_input_tokens_seen": 575072, "step": 555 }, { "epoch": 0.3745819397993311, "grad_norm": 12.412001609802246, "learning_rate": 1.8695652173913044e-06, "loss": 1.6241, "num_input_tokens_seen": 580768, "step": 560 }, { "epoch": 0.3779264214046823, "grad_norm": 21.37675666809082, "learning_rate": 1.8862876254180605e-06, "loss": 1.6695, "num_input_tokens_seen": 585216, "step": 565 }, { "epoch": 0.38127090301003347, "grad_norm": 17.882158279418945, "learning_rate": 1.9030100334448163e-06, "loss": 1.882, "num_input_tokens_seen": 589568, "step": 570 }, { "epoch": 0.38461538461538464, "grad_norm": 12.399723052978516, "learning_rate": 1.919732441471572e-06, "loss": 1.752, "num_input_tokens_seen": 595328, "step": 575 }, { "epoch": 0.3879598662207358, "grad_norm": 18.835952758789062, "learning_rate": 1.9364548494983277e-06, "loss": 1.789, "num_input_tokens_seen": 599648, "step": 580 }, { "epoch": 0.391304347826087, "grad_norm": 10.001666069030762, "learning_rate": 1.953177257525084e-06, "loss": 1.8492, "num_input_tokens_seen": 604320, "step": 585 }, { "epoch": 0.39464882943143814, "grad_norm": 22.201248168945312, "learning_rate": 1.9698996655518398e-06, "loss": 1.6613, "num_input_tokens_seen": 609440, "step": 590 }, { "epoch": 0.3979933110367893, "grad_norm": 12.578019142150879, "learning_rate": 1.9866220735785956e-06, "loss": 1.6773, "num_input_tokens_seen": 614944, "step": 595 }, { "epoch": 0.4013377926421405, "grad_norm": 11.23897933959961, "learning_rate": 2.0033444816053514e-06, "loss": 1.6663, "num_input_tokens_seen": 619936, "step": 600 }, { "epoch": 0.40468227424749165, "grad_norm": 14.444059371948242, "learning_rate": 2.0200668896321072e-06, "loss": 1.775, "num_input_tokens_seen": 624608, "step": 605 }, { "epoch": 0.4080267558528428, "grad_norm": 13.118213653564453, "learning_rate": 2.036789297658863e-06, "loss": 1.8352, "num_input_tokens_seen": 629568, "step": 610 }, { "epoch": 0.411371237458194, "grad_norm": 14.107900619506836, "learning_rate": 2.053511705685619e-06, "loss": 1.6347, "num_input_tokens_seen": 634432, "step": 615 }, { "epoch": 0.41471571906354515, "grad_norm": 16.069061279296875, "learning_rate": 2.0702341137123747e-06, "loss": 1.5784, "num_input_tokens_seen": 638624, "step": 620 }, { "epoch": 0.4180602006688963, "grad_norm": 12.647886276245117, "learning_rate": 2.0869565217391305e-06, "loss": 1.6598, "num_input_tokens_seen": 643936, "step": 625 }, { "epoch": 0.4214046822742475, "grad_norm": 10.329437255859375, "learning_rate": 2.1036789297658863e-06, "loss": 1.6022, "num_input_tokens_seen": 649376, "step": 630 }, { "epoch": 0.42474916387959866, "grad_norm": 13.997169494628906, "learning_rate": 2.120401337792642e-06, "loss": 1.5478, "num_input_tokens_seen": 654240, "step": 635 }, { "epoch": 0.4280936454849498, "grad_norm": 11.933616638183594, "learning_rate": 2.1371237458193984e-06, "loss": 1.6614, "num_input_tokens_seen": 659776, "step": 640 }, { "epoch": 0.431438127090301, "grad_norm": 15.924729347229004, "learning_rate": 2.153846153846154e-06, "loss": 1.4946, "num_input_tokens_seen": 664576, "step": 645 }, { "epoch": 0.43478260869565216, "grad_norm": 14.248810768127441, "learning_rate": 2.17056856187291e-06, "loss": 1.55, "num_input_tokens_seen": 670048, "step": 650 }, { "epoch": 0.43812709030100333, "grad_norm": 12.983882904052734, "learning_rate": 2.187290969899666e-06, "loss": 1.7925, "num_input_tokens_seen": 674496, "step": 655 }, { "epoch": 0.4414715719063545, "grad_norm": 10.202300071716309, "learning_rate": 2.2040133779264217e-06, "loss": 1.585, "num_input_tokens_seen": 680256, "step": 660 }, { "epoch": 0.44481605351170567, "grad_norm": 12.72861099243164, "learning_rate": 2.2207357859531775e-06, "loss": 1.5671, "num_input_tokens_seen": 684800, "step": 665 }, { "epoch": 0.44816053511705684, "grad_norm": 13.916579246520996, "learning_rate": 2.2374581939799333e-06, "loss": 1.5144, "num_input_tokens_seen": 690048, "step": 670 }, { "epoch": 0.451505016722408, "grad_norm": 24.23007583618164, "learning_rate": 2.254180602006689e-06, "loss": 1.5144, "num_input_tokens_seen": 695680, "step": 675 }, { "epoch": 0.45484949832775917, "grad_norm": 16.445409774780273, "learning_rate": 2.270903010033445e-06, "loss": 1.3728, "num_input_tokens_seen": 701248, "step": 680 }, { "epoch": 0.45819397993311034, "grad_norm": 12.378650665283203, "learning_rate": 2.2876254180602008e-06, "loss": 1.43, "num_input_tokens_seen": 706336, "step": 685 }, { "epoch": 0.46153846153846156, "grad_norm": 19.546852111816406, "learning_rate": 2.3043478260869566e-06, "loss": 1.371, "num_input_tokens_seen": 711232, "step": 690 }, { "epoch": 0.46488294314381273, "grad_norm": 26.259363174438477, "learning_rate": 2.3210702341137124e-06, "loss": 1.5486, "num_input_tokens_seen": 716032, "step": 695 }, { "epoch": 0.4682274247491639, "grad_norm": 15.69704532623291, "learning_rate": 2.337792642140468e-06, "loss": 1.4306, "num_input_tokens_seen": 721280, "step": 700 }, { "epoch": 0.47157190635451507, "grad_norm": 13.286911964416504, "learning_rate": 2.3545150501672245e-06, "loss": 1.447, "num_input_tokens_seen": 725920, "step": 705 }, { "epoch": 0.47491638795986624, "grad_norm": 26.451047897338867, "learning_rate": 2.3712374581939803e-06, "loss": 1.5464, "num_input_tokens_seen": 730240, "step": 710 }, { "epoch": 0.4782608695652174, "grad_norm": 13.555302619934082, "learning_rate": 2.387959866220736e-06, "loss": 1.398, "num_input_tokens_seen": 734752, "step": 715 }, { "epoch": 0.4816053511705686, "grad_norm": 14.83349323272705, "learning_rate": 2.404682274247492e-06, "loss": 1.6378, "num_input_tokens_seen": 739616, "step": 720 }, { "epoch": 0.48494983277591974, "grad_norm": 12.991209983825684, "learning_rate": 2.4214046822742477e-06, "loss": 1.4082, "num_input_tokens_seen": 744960, "step": 725 }, { "epoch": 0.4882943143812709, "grad_norm": 15.707015991210938, "learning_rate": 2.4381270903010035e-06, "loss": 1.3533, "num_input_tokens_seen": 749952, "step": 730 }, { "epoch": 0.4916387959866221, "grad_norm": 10.250972747802734, "learning_rate": 2.4548494983277594e-06, "loss": 1.5279, "num_input_tokens_seen": 755200, "step": 735 }, { "epoch": 0.49498327759197325, "grad_norm": 17.963144302368164, "learning_rate": 2.471571906354515e-06, "loss": 1.2379, "num_input_tokens_seen": 760960, "step": 740 }, { "epoch": 0.4983277591973244, "grad_norm": 9.692252159118652, "learning_rate": 2.488294314381271e-06, "loss": 1.2212, "num_input_tokens_seen": 766496, "step": 745 }, { "epoch": 0.5016722408026756, "grad_norm": 11.664288520812988, "learning_rate": 2.505016722408027e-06, "loss": 1.3268, "num_input_tokens_seen": 771136, "step": 750 }, { "epoch": 0.5050167224080268, "grad_norm": 18.005577087402344, "learning_rate": 2.5217391304347826e-06, "loss": 1.253, "num_input_tokens_seen": 776640, "step": 755 }, { "epoch": 0.5083612040133779, "grad_norm": 19.216371536254883, "learning_rate": 2.5384615384615385e-06, "loss": 1.3665, "num_input_tokens_seen": 781152, "step": 760 }, { "epoch": 0.5117056856187291, "grad_norm": 11.787178993225098, "learning_rate": 2.5551839464882943e-06, "loss": 1.3137, "num_input_tokens_seen": 786368, "step": 765 }, { "epoch": 0.5150501672240803, "grad_norm": 16.074304580688477, "learning_rate": 2.57190635451505e-06, "loss": 1.345, "num_input_tokens_seen": 791808, "step": 770 }, { "epoch": 0.5183946488294314, "grad_norm": 12.382522583007812, "learning_rate": 2.588628762541806e-06, "loss": 1.2418, "num_input_tokens_seen": 796960, "step": 775 }, { "epoch": 0.5217391304347826, "grad_norm": 10.478669166564941, "learning_rate": 2.6053511705685617e-06, "loss": 1.4289, "num_input_tokens_seen": 802368, "step": 780 }, { "epoch": 0.5250836120401338, "grad_norm": 10.458620071411133, "learning_rate": 2.6220735785953176e-06, "loss": 1.2073, "num_input_tokens_seen": 808192, "step": 785 }, { "epoch": 0.5284280936454849, "grad_norm": 18.054931640625, "learning_rate": 2.638795986622074e-06, "loss": 1.2676, "num_input_tokens_seen": 812960, "step": 790 }, { "epoch": 0.5317725752508361, "grad_norm": 15.138144493103027, "learning_rate": 2.6555183946488296e-06, "loss": 1.3536, "num_input_tokens_seen": 817312, "step": 795 }, { "epoch": 0.5351170568561873, "grad_norm": 12.555214881896973, "learning_rate": 2.6722408026755854e-06, "loss": 1.3324, "num_input_tokens_seen": 822368, "step": 800 }, { "epoch": 0.5384615384615384, "grad_norm": 16.051115036010742, "learning_rate": 2.6889632107023413e-06, "loss": 1.1917, "num_input_tokens_seen": 827808, "step": 805 }, { "epoch": 0.5418060200668896, "grad_norm": 18.412708282470703, "learning_rate": 2.705685618729097e-06, "loss": 1.3941, "num_input_tokens_seen": 833120, "step": 810 }, { "epoch": 0.5451505016722408, "grad_norm": 13.265836715698242, "learning_rate": 2.722408026755853e-06, "loss": 1.3566, "num_input_tokens_seen": 838560, "step": 815 }, { "epoch": 0.5484949832775919, "grad_norm": 12.541476249694824, "learning_rate": 2.7391304347826087e-06, "loss": 1.2877, "num_input_tokens_seen": 843616, "step": 820 }, { "epoch": 0.5518394648829431, "grad_norm": 12.373580932617188, "learning_rate": 2.755852842809365e-06, "loss": 1.2312, "num_input_tokens_seen": 848576, "step": 825 }, { "epoch": 0.5551839464882943, "grad_norm": 14.771387100219727, "learning_rate": 2.7725752508361208e-06, "loss": 1.1331, "num_input_tokens_seen": 853664, "step": 830 }, { "epoch": 0.5585284280936454, "grad_norm": 12.879298210144043, "learning_rate": 2.7892976588628766e-06, "loss": 1.1935, "num_input_tokens_seen": 859200, "step": 835 }, { "epoch": 0.5618729096989966, "grad_norm": 11.85440731048584, "learning_rate": 2.8060200668896324e-06, "loss": 1.0872, "num_input_tokens_seen": 864192, "step": 840 }, { "epoch": 0.5652173913043478, "grad_norm": 9.21300983428955, "learning_rate": 2.8227424749163882e-06, "loss": 1.2896, "num_input_tokens_seen": 868896, "step": 845 }, { "epoch": 0.568561872909699, "grad_norm": 12.081624984741211, "learning_rate": 2.8394648829431445e-06, "loss": 1.2097, "num_input_tokens_seen": 874208, "step": 850 }, { "epoch": 0.5719063545150501, "grad_norm": 7.712004661560059, "learning_rate": 2.8561872909699003e-06, "loss": 1.1583, "num_input_tokens_seen": 879232, "step": 855 }, { "epoch": 0.5752508361204013, "grad_norm": 9.859228134155273, "learning_rate": 2.872909698996656e-06, "loss": 1.1439, "num_input_tokens_seen": 884928, "step": 860 }, { "epoch": 0.5785953177257525, "grad_norm": 10.455513000488281, "learning_rate": 2.889632107023412e-06, "loss": 1.2576, "num_input_tokens_seen": 890208, "step": 865 }, { "epoch": 0.5819397993311036, "grad_norm": 10.541882514953613, "learning_rate": 2.9063545150501677e-06, "loss": 1.0347, "num_input_tokens_seen": 895616, "step": 870 }, { "epoch": 0.5852842809364549, "grad_norm": 13.379380226135254, "learning_rate": 2.9230769230769236e-06, "loss": 1.2442, "num_input_tokens_seen": 901472, "step": 875 }, { "epoch": 0.5886287625418061, "grad_norm": 11.232973098754883, "learning_rate": 2.9397993311036794e-06, "loss": 1.2588, "num_input_tokens_seen": 906752, "step": 880 }, { "epoch": 0.5919732441471572, "grad_norm": 15.976790428161621, "learning_rate": 2.956521739130435e-06, "loss": 1.2706, "num_input_tokens_seen": 912160, "step": 885 }, { "epoch": 0.5953177257525084, "grad_norm": 12.609136581420898, "learning_rate": 2.973244147157191e-06, "loss": 1.2219, "num_input_tokens_seen": 916960, "step": 890 }, { "epoch": 0.5986622073578596, "grad_norm": 7.879086971282959, "learning_rate": 2.989966555183947e-06, "loss": 1.1944, "num_input_tokens_seen": 921856, "step": 895 }, { "epoch": 0.6020066889632107, "grad_norm": 10.047253608703613, "learning_rate": 3.0066889632107027e-06, "loss": 1.079, "num_input_tokens_seen": 927552, "step": 900 }, { "epoch": 0.6053511705685619, "grad_norm": 10.821419715881348, "learning_rate": 3.0234113712374585e-06, "loss": 1.1135, "num_input_tokens_seen": 932576, "step": 905 }, { "epoch": 0.6086956521739131, "grad_norm": 7.886102676391602, "learning_rate": 3.0401337792642143e-06, "loss": 1.1078, "num_input_tokens_seen": 937792, "step": 910 }, { "epoch": 0.6120401337792643, "grad_norm": 10.040949821472168, "learning_rate": 3.05685618729097e-06, "loss": 1.0704, "num_input_tokens_seen": 943392, "step": 915 }, { "epoch": 0.6153846153846154, "grad_norm": 13.12746810913086, "learning_rate": 3.073578595317726e-06, "loss": 1.0764, "num_input_tokens_seen": 948128, "step": 920 }, { "epoch": 0.6187290969899666, "grad_norm": 8.63747787475586, "learning_rate": 3.0903010033444818e-06, "loss": 0.9575, "num_input_tokens_seen": 952736, "step": 925 }, { "epoch": 0.6220735785953178, "grad_norm": 11.882269859313965, "learning_rate": 3.1070234113712376e-06, "loss": 1.169, "num_input_tokens_seen": 957152, "step": 930 }, { "epoch": 0.6254180602006689, "grad_norm": 11.146638870239258, "learning_rate": 3.1237458193979934e-06, "loss": 1.1502, "num_input_tokens_seen": 962496, "step": 935 }, { "epoch": 0.6287625418060201, "grad_norm": 10.263800621032715, "learning_rate": 3.1404682274247496e-06, "loss": 1.1785, "num_input_tokens_seen": 967296, "step": 940 }, { "epoch": 0.6321070234113713, "grad_norm": 10.74228286743164, "learning_rate": 3.1571906354515055e-06, "loss": 1.144, "num_input_tokens_seen": 972896, "step": 945 }, { "epoch": 0.6354515050167224, "grad_norm": 15.597986221313477, "learning_rate": 3.1739130434782613e-06, "loss": 1.0607, "num_input_tokens_seen": 977248, "step": 950 }, { "epoch": 0.6387959866220736, "grad_norm": 9.045693397521973, "learning_rate": 3.190635451505017e-06, "loss": 1.0388, "num_input_tokens_seen": 982624, "step": 955 }, { "epoch": 0.6421404682274248, "grad_norm": 14.396576881408691, "learning_rate": 3.207357859531773e-06, "loss": 1.2078, "num_input_tokens_seen": 987616, "step": 960 }, { "epoch": 0.6454849498327759, "grad_norm": 14.256489753723145, "learning_rate": 3.2240802675585287e-06, "loss": 1.2244, "num_input_tokens_seen": 992864, "step": 965 }, { "epoch": 0.6488294314381271, "grad_norm": 7.376067161560059, "learning_rate": 3.2408026755852845e-06, "loss": 1.0221, "num_input_tokens_seen": 998432, "step": 970 }, { "epoch": 0.6521739130434783, "grad_norm": 15.261981964111328, "learning_rate": 3.2575250836120404e-06, "loss": 0.9818, "num_input_tokens_seen": 1003680, "step": 975 }, { "epoch": 0.6555183946488294, "grad_norm": 10.31662368774414, "learning_rate": 3.274247491638796e-06, "loss": 1.2564, "num_input_tokens_seen": 1008544, "step": 980 }, { "epoch": 0.6588628762541806, "grad_norm": 14.325188636779785, "learning_rate": 3.290969899665552e-06, "loss": 1.103, "num_input_tokens_seen": 1013632, "step": 985 }, { "epoch": 0.6622073578595318, "grad_norm": 7.981444835662842, "learning_rate": 3.307692307692308e-06, "loss": 1.0996, "num_input_tokens_seen": 1018624, "step": 990 }, { "epoch": 0.6655518394648829, "grad_norm": 13.790352821350098, "learning_rate": 3.3244147157190636e-06, "loss": 1.1226, "num_input_tokens_seen": 1023616, "step": 995 }, { "epoch": 0.6688963210702341, "grad_norm": 9.917171478271484, "learning_rate": 3.3411371237458195e-06, "loss": 1.0273, "num_input_tokens_seen": 1028480, "step": 1000 }, { "epoch": 0.6722408026755853, "grad_norm": 7.525999546051025, "learning_rate": 3.3578595317725753e-06, "loss": 1.1049, "num_input_tokens_seen": 1033248, "step": 1005 }, { "epoch": 0.6755852842809364, "grad_norm": 8.67750072479248, "learning_rate": 3.374581939799331e-06, "loss": 0.9202, "num_input_tokens_seen": 1038752, "step": 1010 }, { "epoch": 0.6789297658862876, "grad_norm": 6.778685569763184, "learning_rate": 3.391304347826087e-06, "loss": 0.9343, "num_input_tokens_seen": 1044864, "step": 1015 }, { "epoch": 0.6822742474916388, "grad_norm": 8.803627014160156, "learning_rate": 3.4080267558528427e-06, "loss": 0.9254, "num_input_tokens_seen": 1050272, "step": 1020 }, { "epoch": 0.68561872909699, "grad_norm": 7.662417411804199, "learning_rate": 3.424749163879599e-06, "loss": 1.0082, "num_input_tokens_seen": 1055264, "step": 1025 }, { "epoch": 0.6889632107023411, "grad_norm": 9.675888061523438, "learning_rate": 3.441471571906355e-06, "loss": 1.0065, "num_input_tokens_seen": 1060256, "step": 1030 }, { "epoch": 0.6923076923076923, "grad_norm": 7.009413719177246, "learning_rate": 3.4581939799331106e-06, "loss": 1.0334, "num_input_tokens_seen": 1066048, "step": 1035 }, { "epoch": 0.6956521739130435, "grad_norm": 8.006351470947266, "learning_rate": 3.4749163879598664e-06, "loss": 1.0621, "num_input_tokens_seen": 1071584, "step": 1040 }, { "epoch": 0.6989966555183946, "grad_norm": 10.013736724853516, "learning_rate": 3.4916387959866222e-06, "loss": 1.0259, "num_input_tokens_seen": 1076352, "step": 1045 }, { "epoch": 0.7023411371237458, "grad_norm": 8.86385440826416, "learning_rate": 3.508361204013378e-06, "loss": 0.8857, "num_input_tokens_seen": 1081184, "step": 1050 }, { "epoch": 0.705685618729097, "grad_norm": 7.961977481842041, "learning_rate": 3.525083612040134e-06, "loss": 0.9202, "num_input_tokens_seen": 1086272, "step": 1055 }, { "epoch": 0.7090301003344481, "grad_norm": 11.252603530883789, "learning_rate": 3.5418060200668897e-06, "loss": 1.0382, "num_input_tokens_seen": 1091744, "step": 1060 }, { "epoch": 0.7123745819397993, "grad_norm": 9.424860954284668, "learning_rate": 3.5585284280936455e-06, "loss": 1.0581, "num_input_tokens_seen": 1096896, "step": 1065 }, { "epoch": 0.7157190635451505, "grad_norm": 6.342419624328613, "learning_rate": 3.5752508361204013e-06, "loss": 0.8869, "num_input_tokens_seen": 1102176, "step": 1070 }, { "epoch": 0.7190635451505016, "grad_norm": 6.7544379234313965, "learning_rate": 3.5919732441471576e-06, "loss": 0.9877, "num_input_tokens_seen": 1107904, "step": 1075 }, { "epoch": 0.7224080267558528, "grad_norm": 5.873295783996582, "learning_rate": 3.6086956521739134e-06, "loss": 0.9118, "num_input_tokens_seen": 1112736, "step": 1080 }, { "epoch": 0.725752508361204, "grad_norm": 8.399951934814453, "learning_rate": 3.6254180602006696e-06, "loss": 1.0902, "num_input_tokens_seen": 1118080, "step": 1085 }, { "epoch": 0.7290969899665551, "grad_norm": 7.357285976409912, "learning_rate": 3.6421404682274255e-06, "loss": 0.8189, "num_input_tokens_seen": 1123200, "step": 1090 }, { "epoch": 0.7324414715719063, "grad_norm": 10.625144958496094, "learning_rate": 3.6588628762541813e-06, "loss": 1.0188, "num_input_tokens_seen": 1128608, "step": 1095 }, { "epoch": 0.7357859531772575, "grad_norm": 9.750059127807617, "learning_rate": 3.675585284280937e-06, "loss": 0.9209, "num_input_tokens_seen": 1133248, "step": 1100 }, { "epoch": 0.7391304347826086, "grad_norm": 9.086219787597656, "learning_rate": 3.692307692307693e-06, "loss": 1.0199, "num_input_tokens_seen": 1138496, "step": 1105 }, { "epoch": 0.7424749163879598, "grad_norm": 5.9698991775512695, "learning_rate": 3.7090301003344487e-06, "loss": 0.9057, "num_input_tokens_seen": 1143488, "step": 1110 }, { "epoch": 0.745819397993311, "grad_norm": 7.923586845397949, "learning_rate": 3.7257525083612046e-06, "loss": 1.0432, "num_input_tokens_seen": 1150272, "step": 1115 }, { "epoch": 0.7491638795986622, "grad_norm": 6.354238986968994, "learning_rate": 3.7424749163879604e-06, "loss": 0.9447, "num_input_tokens_seen": 1156160, "step": 1120 }, { "epoch": 0.7525083612040134, "grad_norm": 6.590362548828125, "learning_rate": 3.759197324414716e-06, "loss": 0.8319, "num_input_tokens_seen": 1160960, "step": 1125 }, { "epoch": 0.7558528428093646, "grad_norm": 9.05449390411377, "learning_rate": 3.775919732441472e-06, "loss": 1.0092, "num_input_tokens_seen": 1166464, "step": 1130 }, { "epoch": 0.7591973244147158, "grad_norm": 11.567673683166504, "learning_rate": 3.792642140468228e-06, "loss": 0.762, "num_input_tokens_seen": 1172224, "step": 1135 }, { "epoch": 0.7625418060200669, "grad_norm": 6.4344563484191895, "learning_rate": 3.8093645484949837e-06, "loss": 0.8252, "num_input_tokens_seen": 1176960, "step": 1140 }, { "epoch": 0.7658862876254181, "grad_norm": 8.45168685913086, "learning_rate": 3.8260869565217395e-06, "loss": 0.8521, "num_input_tokens_seen": 1181984, "step": 1145 }, { "epoch": 0.7692307692307693, "grad_norm": 12.30045223236084, "learning_rate": 3.842809364548496e-06, "loss": 0.7479, "num_input_tokens_seen": 1187040, "step": 1150 }, { "epoch": 0.7725752508361204, "grad_norm": 8.064915657043457, "learning_rate": 3.859531772575251e-06, "loss": 0.8993, "num_input_tokens_seen": 1192256, "step": 1155 }, { "epoch": 0.7759197324414716, "grad_norm": 9.782032012939453, "learning_rate": 3.876254180602007e-06, "loss": 0.9506, "num_input_tokens_seen": 1197472, "step": 1160 }, { "epoch": 0.7792642140468228, "grad_norm": 7.67346715927124, "learning_rate": 3.892976588628763e-06, "loss": 0.8695, "num_input_tokens_seen": 1202688, "step": 1165 }, { "epoch": 0.782608695652174, "grad_norm": 7.361164569854736, "learning_rate": 3.909698996655519e-06, "loss": 0.8347, "num_input_tokens_seen": 1207872, "step": 1170 }, { "epoch": 0.7859531772575251, "grad_norm": 8.24132251739502, "learning_rate": 3.926421404682274e-06, "loss": 0.7358, "num_input_tokens_seen": 1213568, "step": 1175 }, { "epoch": 0.7892976588628763, "grad_norm": 12.980637550354004, "learning_rate": 3.943143812709031e-06, "loss": 0.8533, "num_input_tokens_seen": 1218784, "step": 1180 }, { "epoch": 0.7926421404682275, "grad_norm": 11.378439903259277, "learning_rate": 3.959866220735786e-06, "loss": 0.7165, "num_input_tokens_seen": 1223456, "step": 1185 }, { "epoch": 0.7959866220735786, "grad_norm": 6.376906394958496, "learning_rate": 3.976588628762542e-06, "loss": 0.8624, "num_input_tokens_seen": 1228736, "step": 1190 }, { "epoch": 0.7993311036789298, "grad_norm": 5.006546497344971, "learning_rate": 3.993311036789298e-06, "loss": 0.8125, "num_input_tokens_seen": 1233792, "step": 1195 }, { "epoch": 0.802675585284281, "grad_norm": 6.199674606323242, "learning_rate": 4.010033444816054e-06, "loss": 0.6895, "num_input_tokens_seen": 1238976, "step": 1200 }, { "epoch": 0.8060200668896321, "grad_norm": 7.953625679016113, "learning_rate": 4.026755852842809e-06, "loss": 0.7548, "num_input_tokens_seen": 1243776, "step": 1205 }, { "epoch": 0.8093645484949833, "grad_norm": 7.88476037979126, "learning_rate": 4.0434782608695655e-06, "loss": 0.7194, "num_input_tokens_seen": 1248608, "step": 1210 }, { "epoch": 0.8127090301003345, "grad_norm": 3.8683128356933594, "learning_rate": 4.060200668896322e-06, "loss": 0.7529, "num_input_tokens_seen": 1253952, "step": 1215 }, { "epoch": 0.8160535117056856, "grad_norm": 6.188815593719482, "learning_rate": 4.076923076923077e-06, "loss": 0.9157, "num_input_tokens_seen": 1259136, "step": 1220 }, { "epoch": 0.8193979933110368, "grad_norm": 5.902515411376953, "learning_rate": 4.093645484949833e-06, "loss": 0.7879, "num_input_tokens_seen": 1264128, "step": 1225 }, { "epoch": 0.822742474916388, "grad_norm": 6.4732513427734375, "learning_rate": 4.110367892976589e-06, "loss": 0.7743, "num_input_tokens_seen": 1269472, "step": 1230 }, { "epoch": 0.8260869565217391, "grad_norm": 6.7476725578308105, "learning_rate": 4.127090301003345e-06, "loss": 0.7945, "num_input_tokens_seen": 1274400, "step": 1235 }, { "epoch": 0.8294314381270903, "grad_norm": 6.793126106262207, "learning_rate": 4.1438127090301005e-06, "loss": 0.7599, "num_input_tokens_seen": 1279712, "step": 1240 }, { "epoch": 0.8327759197324415, "grad_norm": 7.240893363952637, "learning_rate": 4.160535117056857e-06, "loss": 0.7256, "num_input_tokens_seen": 1285184, "step": 1245 }, { "epoch": 0.8361204013377926, "grad_norm": 7.388341426849365, "learning_rate": 4.177257525083612e-06, "loss": 0.7054, "num_input_tokens_seen": 1290208, "step": 1250 }, { "epoch": 0.8394648829431438, "grad_norm": 3.962902545928955, "learning_rate": 4.193979933110368e-06, "loss": 0.677, "num_input_tokens_seen": 1296032, "step": 1255 }, { "epoch": 0.842809364548495, "grad_norm": 8.128307342529297, "learning_rate": 4.210702341137124e-06, "loss": 0.7759, "num_input_tokens_seen": 1300640, "step": 1260 }, { "epoch": 0.8461538461538461, "grad_norm": 5.309257984161377, "learning_rate": 4.22742474916388e-06, "loss": 0.753, "num_input_tokens_seen": 1305280, "step": 1265 }, { "epoch": 0.8494983277591973, "grad_norm": 7.213605880737305, "learning_rate": 4.244147157190635e-06, "loss": 0.7245, "num_input_tokens_seen": 1310144, "step": 1270 }, { "epoch": 0.8528428093645485, "grad_norm": 6.535955905914307, "learning_rate": 4.260869565217392e-06, "loss": 0.8358, "num_input_tokens_seen": 1315488, "step": 1275 }, { "epoch": 0.8561872909698997, "grad_norm": 5.262463092803955, "learning_rate": 4.277591973244147e-06, "loss": 0.7324, "num_input_tokens_seen": 1320480, "step": 1280 }, { "epoch": 0.8595317725752508, "grad_norm": 6.527382850646973, "learning_rate": 4.294314381270903e-06, "loss": 0.7044, "num_input_tokens_seen": 1324896, "step": 1285 }, { "epoch": 0.862876254180602, "grad_norm": 5.295207500457764, "learning_rate": 4.311036789297659e-06, "loss": 0.7829, "num_input_tokens_seen": 1330304, "step": 1290 }, { "epoch": 0.8662207357859532, "grad_norm": 4.176196098327637, "learning_rate": 4.327759197324415e-06, "loss": 0.7071, "num_input_tokens_seen": 1334944, "step": 1295 }, { "epoch": 0.8695652173913043, "grad_norm": 5.87897253036499, "learning_rate": 4.34448160535117e-06, "loss": 0.5894, "num_input_tokens_seen": 1340288, "step": 1300 }, { "epoch": 0.8729096989966555, "grad_norm": 5.938285827636719, "learning_rate": 4.3612040133779265e-06, "loss": 0.6789, "num_input_tokens_seen": 1344704, "step": 1305 }, { "epoch": 0.8762541806020067, "grad_norm": 5.427924633026123, "learning_rate": 4.377926421404683e-06, "loss": 0.6571, "num_input_tokens_seen": 1350112, "step": 1310 }, { "epoch": 0.8795986622073578, "grad_norm": 5.237860679626465, "learning_rate": 4.394648829431438e-06, "loss": 0.6915, "num_input_tokens_seen": 1355264, "step": 1315 }, { "epoch": 0.882943143812709, "grad_norm": 4.768033504486084, "learning_rate": 4.411371237458194e-06, "loss": 0.5931, "num_input_tokens_seen": 1360064, "step": 1320 }, { "epoch": 0.8862876254180602, "grad_norm": 6.414586544036865, "learning_rate": 4.428093645484951e-06, "loss": 0.707, "num_input_tokens_seen": 1364992, "step": 1325 }, { "epoch": 0.8896321070234113, "grad_norm": 4.842050075531006, "learning_rate": 4.444816053511706e-06, "loss": 0.6849, "num_input_tokens_seen": 1370304, "step": 1330 }, { "epoch": 0.8929765886287625, "grad_norm": 4.919260025024414, "learning_rate": 4.461538461538462e-06, "loss": 0.6535, "num_input_tokens_seen": 1374816, "step": 1335 }, { "epoch": 0.8963210702341137, "grad_norm": 5.803635120391846, "learning_rate": 4.478260869565218e-06, "loss": 0.6496, "num_input_tokens_seen": 1379360, "step": 1340 }, { "epoch": 0.8996655518394648, "grad_norm": 4.614714622497559, "learning_rate": 4.494983277591974e-06, "loss": 0.5919, "num_input_tokens_seen": 1384832, "step": 1345 }, { "epoch": 0.903010033444816, "grad_norm": 4.060434818267822, "learning_rate": 4.511705685618729e-06, "loss": 0.6588, "num_input_tokens_seen": 1390720, "step": 1350 }, { "epoch": 0.9063545150501672, "grad_norm": 6.341435432434082, "learning_rate": 4.5284280936454856e-06, "loss": 0.6397, "num_input_tokens_seen": 1395136, "step": 1355 }, { "epoch": 0.9096989966555183, "grad_norm": 4.321681976318359, "learning_rate": 4.545150501672241e-06, "loss": 0.5533, "num_input_tokens_seen": 1400640, "step": 1360 }, { "epoch": 0.9130434782608695, "grad_norm": 4.226893424987793, "learning_rate": 4.561872909698997e-06, "loss": 0.6388, "num_input_tokens_seen": 1405600, "step": 1365 }, { "epoch": 0.9163879598662207, "grad_norm": 5.056398868560791, "learning_rate": 4.5785953177257534e-06, "loss": 0.6831, "num_input_tokens_seen": 1409632, "step": 1370 }, { "epoch": 0.919732441471572, "grad_norm": 4.640850067138672, "learning_rate": 4.595317725752509e-06, "loss": 0.6222, "num_input_tokens_seen": 1414592, "step": 1375 }, { "epoch": 0.9230769230769231, "grad_norm": 3.7539353370666504, "learning_rate": 4.612040133779265e-06, "loss": 0.6474, "num_input_tokens_seen": 1419424, "step": 1380 }, { "epoch": 0.9264214046822743, "grad_norm": 3.2621681690216064, "learning_rate": 4.6287625418060205e-06, "loss": 0.7318, "num_input_tokens_seen": 1425344, "step": 1385 }, { "epoch": 0.9297658862876255, "grad_norm": 3.921999931335449, "learning_rate": 4.645484949832777e-06, "loss": 0.7246, "num_input_tokens_seen": 1431232, "step": 1390 }, { "epoch": 0.9331103678929766, "grad_norm": 3.857172727584839, "learning_rate": 4.662207357859532e-06, "loss": 0.7312, "num_input_tokens_seen": 1437824, "step": 1395 }, { "epoch": 0.9364548494983278, "grad_norm": 3.6673998832702637, "learning_rate": 4.678929765886288e-06, "loss": 0.7557, "num_input_tokens_seen": 1443232, "step": 1400 }, { "epoch": 0.939799331103679, "grad_norm": 2.914734363555908, "learning_rate": 4.695652173913044e-06, "loss": 0.617, "num_input_tokens_seen": 1448512, "step": 1405 }, { "epoch": 0.9431438127090301, "grad_norm": 4.472668647766113, "learning_rate": 4.7123745819398e-06, "loss": 0.6958, "num_input_tokens_seen": 1453824, "step": 1410 }, { "epoch": 0.9464882943143813, "grad_norm": 7.034492015838623, "learning_rate": 4.729096989966555e-06, "loss": 0.6955, "num_input_tokens_seen": 1458720, "step": 1415 }, { "epoch": 0.9498327759197325, "grad_norm": 5.455876350402832, "learning_rate": 4.745819397993312e-06, "loss": 0.7788, "num_input_tokens_seen": 1463840, "step": 1420 }, { "epoch": 0.9531772575250836, "grad_norm": 4.197169303894043, "learning_rate": 4.762541806020067e-06, "loss": 0.7576, "num_input_tokens_seen": 1469440, "step": 1425 }, { "epoch": 0.9565217391304348, "grad_norm": 2.947695016860962, "learning_rate": 4.779264214046823e-06, "loss": 0.6155, "num_input_tokens_seen": 1474400, "step": 1430 }, { "epoch": 0.959866220735786, "grad_norm": 4.8886213302612305, "learning_rate": 4.795986622073579e-06, "loss": 0.6744, "num_input_tokens_seen": 1479520, "step": 1435 }, { "epoch": 0.9632107023411371, "grad_norm": 4.535935401916504, "learning_rate": 4.812709030100335e-06, "loss": 0.6107, "num_input_tokens_seen": 1484128, "step": 1440 }, { "epoch": 0.9665551839464883, "grad_norm": 8.316730499267578, "learning_rate": 4.82943143812709e-06, "loss": 0.5909, "num_input_tokens_seen": 1489120, "step": 1445 }, { "epoch": 0.9698996655518395, "grad_norm": 3.2777669429779053, "learning_rate": 4.8461538461538465e-06, "loss": 0.5736, "num_input_tokens_seen": 1494240, "step": 1450 }, { "epoch": 0.9732441471571907, "grad_norm": 4.152234077453613, "learning_rate": 4.862876254180603e-06, "loss": 0.5698, "num_input_tokens_seen": 1499872, "step": 1455 }, { "epoch": 0.9765886287625418, "grad_norm": 6.473057746887207, "learning_rate": 4.879598662207358e-06, "loss": 0.6502, "num_input_tokens_seen": 1504576, "step": 1460 }, { "epoch": 0.979933110367893, "grad_norm": 4.25331974029541, "learning_rate": 4.896321070234114e-06, "loss": 0.7363, "num_input_tokens_seen": 1509792, "step": 1465 }, { "epoch": 0.9832775919732442, "grad_norm": 2.2631959915161133, "learning_rate": 4.91304347826087e-06, "loss": 0.6566, "num_input_tokens_seen": 1515008, "step": 1470 }, { "epoch": 0.9866220735785953, "grad_norm": 3.1087570190429688, "learning_rate": 4.929765886287626e-06, "loss": 0.6655, "num_input_tokens_seen": 1520544, "step": 1475 }, { "epoch": 0.9899665551839465, "grad_norm": 3.448456287384033, "learning_rate": 4.9464882943143815e-06, "loss": 0.5738, "num_input_tokens_seen": 1525248, "step": 1480 }, { "epoch": 0.9933110367892977, "grad_norm": 2.988557815551758, "learning_rate": 4.963210702341138e-06, "loss": 0.5618, "num_input_tokens_seen": 1530272, "step": 1485 }, { "epoch": 0.9966555183946488, "grad_norm": 5.77734899520874, "learning_rate": 4.979933110367893e-06, "loss": 0.7452, "num_input_tokens_seen": 1535296, "step": 1490 }, { "epoch": 1.0, "grad_norm": 8.819687843322754, "learning_rate": 4.996655518394649e-06, "loss": 0.6866, "num_input_tokens_seen": 1539648, "step": 1495 }, { "epoch": 1.0033444816053512, "grad_norm": 5.239814281463623, "learning_rate": 5.0133779264214056e-06, "loss": 0.6154, "num_input_tokens_seen": 1545536, "step": 1500 }, { "epoch": 1.0066889632107023, "grad_norm": 2.3264353275299072, "learning_rate": 5.030100334448161e-06, "loss": 0.6378, "num_input_tokens_seen": 1550720, "step": 1505 }, { "epoch": 1.0100334448160535, "grad_norm": 5.39306116104126, "learning_rate": 5.046822742474917e-06, "loss": 0.6523, "num_input_tokens_seen": 1555584, "step": 1510 }, { "epoch": 1.0133779264214047, "grad_norm": 3.7427237033843994, "learning_rate": 5.0635451505016735e-06, "loss": 0.5624, "num_input_tokens_seen": 1560384, "step": 1515 }, { "epoch": 1.0167224080267558, "grad_norm": 3.541205883026123, "learning_rate": 5.080267558528429e-06, "loss": 0.7238, "num_input_tokens_seen": 1564768, "step": 1520 }, { "epoch": 1.020066889632107, "grad_norm": 3.8866236209869385, "learning_rate": 5.096989966555185e-06, "loss": 0.5636, "num_input_tokens_seen": 1569856, "step": 1525 }, { "epoch": 1.0234113712374582, "grad_norm": 3.620760917663574, "learning_rate": 5.1137123745819405e-06, "loss": 0.6511, "num_input_tokens_seen": 1574592, "step": 1530 }, { "epoch": 1.0267558528428093, "grad_norm": 4.951235771179199, "learning_rate": 5.130434782608697e-06, "loss": 0.5938, "num_input_tokens_seen": 1579808, "step": 1535 }, { "epoch": 1.0301003344481605, "grad_norm": 2.7784481048583984, "learning_rate": 5.147157190635452e-06, "loss": 0.579, "num_input_tokens_seen": 1585248, "step": 1540 }, { "epoch": 1.0334448160535117, "grad_norm": 4.090806007385254, "learning_rate": 5.163879598662208e-06, "loss": 0.5938, "num_input_tokens_seen": 1590496, "step": 1545 }, { "epoch": 1.0367892976588629, "grad_norm": 3.06355881690979, "learning_rate": 5.180602006688964e-06, "loss": 0.605, "num_input_tokens_seen": 1595104, "step": 1550 }, { "epoch": 1.040133779264214, "grad_norm": 4.2053608894348145, "learning_rate": 5.19732441471572e-06, "loss": 0.6268, "num_input_tokens_seen": 1600448, "step": 1555 }, { "epoch": 1.0434782608695652, "grad_norm": 3.513134479522705, "learning_rate": 5.214046822742475e-06, "loss": 0.6169, "num_input_tokens_seen": 1604992, "step": 1560 }, { "epoch": 1.0468227424749164, "grad_norm": 4.1357879638671875, "learning_rate": 5.230769230769232e-06, "loss": 0.5165, "num_input_tokens_seen": 1610048, "step": 1565 }, { "epoch": 1.0501672240802675, "grad_norm": 3.7662084102630615, "learning_rate": 5.247491638795987e-06, "loss": 0.7257, "num_input_tokens_seen": 1615104, "step": 1570 }, { "epoch": 1.0535117056856187, "grad_norm": 3.2445006370544434, "learning_rate": 5.264214046822743e-06, "loss": 0.5659, "num_input_tokens_seen": 1620608, "step": 1575 }, { "epoch": 1.0568561872909699, "grad_norm": 2.9676058292388916, "learning_rate": 5.280936454849499e-06, "loss": 0.5937, "num_input_tokens_seen": 1626592, "step": 1580 }, { "epoch": 1.060200668896321, "grad_norm": 3.118917226791382, "learning_rate": 5.297658862876255e-06, "loss": 0.6121, "num_input_tokens_seen": 1631040, "step": 1585 }, { "epoch": 1.0635451505016722, "grad_norm": 3.115529775619507, "learning_rate": 5.31438127090301e-06, "loss": 0.5953, "num_input_tokens_seen": 1635616, "step": 1590 }, { "epoch": 1.0668896321070234, "grad_norm": 3.2771220207214355, "learning_rate": 5.3311036789297666e-06, "loss": 0.7061, "num_input_tokens_seen": 1640768, "step": 1595 }, { "epoch": 1.0702341137123745, "grad_norm": 5.024900913238525, "learning_rate": 5.347826086956523e-06, "loss": 0.6652, "num_input_tokens_seen": 1645984, "step": 1600 }, { "epoch": 1.0735785953177257, "grad_norm": 3.264113426208496, "learning_rate": 5.364548494983278e-06, "loss": 0.5454, "num_input_tokens_seen": 1652064, "step": 1605 }, { "epoch": 1.0769230769230769, "grad_norm": 5.750743389129639, "learning_rate": 5.3812709030100344e-06, "loss": 0.6428, "num_input_tokens_seen": 1657088, "step": 1610 }, { "epoch": 1.080267558528428, "grad_norm": 5.106245517730713, "learning_rate": 5.39799331103679e-06, "loss": 0.6612, "num_input_tokens_seen": 1661760, "step": 1615 }, { "epoch": 1.0836120401337792, "grad_norm": 3.913825035095215, "learning_rate": 5.414715719063546e-06, "loss": 0.7073, "num_input_tokens_seen": 1666944, "step": 1620 }, { "epoch": 1.0869565217391304, "grad_norm": 2.914896011352539, "learning_rate": 5.4314381270903015e-06, "loss": 0.5902, "num_input_tokens_seen": 1671776, "step": 1625 }, { "epoch": 1.0903010033444815, "grad_norm": 2.6663718223571777, "learning_rate": 5.448160535117058e-06, "loss": 0.566, "num_input_tokens_seen": 1676992, "step": 1630 }, { "epoch": 1.0936454849498327, "grad_norm": 3.021545886993408, "learning_rate": 5.464882943143813e-06, "loss": 0.5777, "num_input_tokens_seen": 1682272, "step": 1635 }, { "epoch": 1.0969899665551839, "grad_norm": 5.076983451843262, "learning_rate": 5.481605351170569e-06, "loss": 0.5842, "num_input_tokens_seen": 1686784, "step": 1640 }, { "epoch": 1.100334448160535, "grad_norm": 2.9786298274993896, "learning_rate": 5.498327759197325e-06, "loss": 0.4982, "num_input_tokens_seen": 1691520, "step": 1645 }, { "epoch": 1.1036789297658862, "grad_norm": 2.8256843090057373, "learning_rate": 5.515050167224081e-06, "loss": 0.5094, "num_input_tokens_seen": 1696480, "step": 1650 }, { "epoch": 1.1070234113712374, "grad_norm": 3.36544132232666, "learning_rate": 5.531772575250836e-06, "loss": 0.6223, "num_input_tokens_seen": 1701376, "step": 1655 }, { "epoch": 1.1103678929765886, "grad_norm": 2.4573075771331787, "learning_rate": 5.548494983277593e-06, "loss": 0.4916, "num_input_tokens_seen": 1705920, "step": 1660 }, { "epoch": 1.1137123745819397, "grad_norm": 3.262324571609497, "learning_rate": 5.565217391304348e-06, "loss": 0.6132, "num_input_tokens_seen": 1711200, "step": 1665 }, { "epoch": 1.117056856187291, "grad_norm": 3.248863697052002, "learning_rate": 5.581939799331104e-06, "loss": 0.5355, "num_input_tokens_seen": 1716256, "step": 1670 }, { "epoch": 1.120401337792642, "grad_norm": 3.9440805912017822, "learning_rate": 5.59866220735786e-06, "loss": 0.6336, "num_input_tokens_seen": 1720384, "step": 1675 }, { "epoch": 1.1237458193979932, "grad_norm": 3.037598133087158, "learning_rate": 5.615384615384616e-06, "loss": 0.6706, "num_input_tokens_seen": 1725504, "step": 1680 }, { "epoch": 1.1270903010033444, "grad_norm": 2.9753499031066895, "learning_rate": 5.632107023411372e-06, "loss": 0.5571, "num_input_tokens_seen": 1730592, "step": 1685 }, { "epoch": 1.1304347826086956, "grad_norm": 1.996120572090149, "learning_rate": 5.6488294314381275e-06, "loss": 0.603, "num_input_tokens_seen": 1736256, "step": 1690 }, { "epoch": 1.1337792642140467, "grad_norm": 2.1611342430114746, "learning_rate": 5.665551839464884e-06, "loss": 0.454, "num_input_tokens_seen": 1740992, "step": 1695 }, { "epoch": 1.137123745819398, "grad_norm": 3.167947292327881, "learning_rate": 5.682274247491639e-06, "loss": 0.5589, "num_input_tokens_seen": 1745568, "step": 1700 }, { "epoch": 1.140468227424749, "grad_norm": 2.6374645233154297, "learning_rate": 5.698996655518395e-06, "loss": 0.6181, "num_input_tokens_seen": 1751040, "step": 1705 }, { "epoch": 1.1438127090301002, "grad_norm": 4.323640823364258, "learning_rate": 5.715719063545151e-06, "loss": 0.6169, "num_input_tokens_seen": 1756672, "step": 1710 }, { "epoch": 1.1471571906354514, "grad_norm": 3.6537044048309326, "learning_rate": 5.732441471571907e-06, "loss": 0.6278, "num_input_tokens_seen": 1761536, "step": 1715 }, { "epoch": 1.1505016722408028, "grad_norm": 3.05830717086792, "learning_rate": 5.7491638795986624e-06, "loss": 0.6966, "num_input_tokens_seen": 1767104, "step": 1720 }, { "epoch": 1.1538461538461537, "grad_norm": 2.357998847961426, "learning_rate": 5.765886287625419e-06, "loss": 0.6399, "num_input_tokens_seen": 1772896, "step": 1725 }, { "epoch": 1.1571906354515051, "grad_norm": 2.4320290088653564, "learning_rate": 5.782608695652174e-06, "loss": 0.6436, "num_input_tokens_seen": 1778048, "step": 1730 }, { "epoch": 1.160535117056856, "grad_norm": 2.842409372329712, "learning_rate": 5.79933110367893e-06, "loss": 0.5342, "num_input_tokens_seen": 1783072, "step": 1735 }, { "epoch": 1.1638795986622075, "grad_norm": 1.9841961860656738, "learning_rate": 5.816053511705686e-06, "loss": 0.7294, "num_input_tokens_seen": 1788800, "step": 1740 }, { "epoch": 1.1672240802675584, "grad_norm": 3.2121965885162354, "learning_rate": 5.832775919732442e-06, "loss": 0.6572, "num_input_tokens_seen": 1794368, "step": 1745 }, { "epoch": 1.1705685618729098, "grad_norm": 2.4315860271453857, "learning_rate": 5.849498327759197e-06, "loss": 0.5381, "num_input_tokens_seen": 1799904, "step": 1750 }, { "epoch": 1.1739130434782608, "grad_norm": 4.655223369598389, "learning_rate": 5.866220735785954e-06, "loss": 0.6013, "num_input_tokens_seen": 1804864, "step": 1755 }, { "epoch": 1.1772575250836121, "grad_norm": 2.846898317337036, "learning_rate": 5.882943143812709e-06, "loss": 0.7049, "num_input_tokens_seen": 1810016, "step": 1760 }, { "epoch": 1.180602006688963, "grad_norm": 3.83738112449646, "learning_rate": 5.899665551839465e-06, "loss": 0.5842, "num_input_tokens_seen": 1815648, "step": 1765 }, { "epoch": 1.1839464882943145, "grad_norm": 1.9964828491210938, "learning_rate": 5.916387959866221e-06, "loss": 0.5937, "num_input_tokens_seen": 1821312, "step": 1770 }, { "epoch": 1.1872909698996654, "grad_norm": 2.4199867248535156, "learning_rate": 5.933110367892977e-06, "loss": 0.6311, "num_input_tokens_seen": 1826944, "step": 1775 }, { "epoch": 1.1906354515050168, "grad_norm": 1.9372090101242065, "learning_rate": 5.949832775919733e-06, "loss": 0.5438, "num_input_tokens_seen": 1832384, "step": 1780 }, { "epoch": 1.193979933110368, "grad_norm": 2.520470142364502, "learning_rate": 5.9665551839464885e-06, "loss": 0.5882, "num_input_tokens_seen": 1837664, "step": 1785 }, { "epoch": 1.1973244147157192, "grad_norm": 2.586111545562744, "learning_rate": 5.983277591973245e-06, "loss": 0.5901, "num_input_tokens_seen": 1841728, "step": 1790 }, { "epoch": 1.2006688963210703, "grad_norm": 2.333714008331299, "learning_rate": 6e-06, "loss": 0.5021, "num_input_tokens_seen": 1846848, "step": 1795 }, { "epoch": 1.2040133779264215, "grad_norm": 3.139904499053955, "learning_rate": 6.016722408026756e-06, "loss": 0.6279, "num_input_tokens_seen": 1851936, "step": 1800 }, { "epoch": 1.2073578595317727, "grad_norm": 3.2322020530700684, "learning_rate": 6.033444816053512e-06, "loss": 0.481, "num_input_tokens_seen": 1856800, "step": 1805 }, { "epoch": 1.2107023411371238, "grad_norm": 3.0939781665802, "learning_rate": 6.050167224080268e-06, "loss": 0.5793, "num_input_tokens_seen": 1861664, "step": 1810 }, { "epoch": 1.214046822742475, "grad_norm": 2.28116512298584, "learning_rate": 6.0668896321070234e-06, "loss": 0.645, "num_input_tokens_seen": 1867744, "step": 1815 }, { "epoch": 1.2173913043478262, "grad_norm": 2.8389298915863037, "learning_rate": 6.08361204013378e-06, "loss": 0.5687, "num_input_tokens_seen": 1872544, "step": 1820 }, { "epoch": 1.2207357859531773, "grad_norm": 2.7351632118225098, "learning_rate": 6.100334448160535e-06, "loss": 0.6141, "num_input_tokens_seen": 1878368, "step": 1825 }, { "epoch": 1.2240802675585285, "grad_norm": 3.8266429901123047, "learning_rate": 6.117056856187291e-06, "loss": 0.6235, "num_input_tokens_seen": 1883392, "step": 1830 }, { "epoch": 1.2274247491638797, "grad_norm": 3.1974124908447266, "learning_rate": 6.133779264214047e-06, "loss": 0.5465, "num_input_tokens_seen": 1887808, "step": 1835 }, { "epoch": 1.2307692307692308, "grad_norm": 4.306275367736816, "learning_rate": 6.150501672240803e-06, "loss": 0.7515, "num_input_tokens_seen": 1893248, "step": 1840 }, { "epoch": 1.234113712374582, "grad_norm": 2.23360276222229, "learning_rate": 6.167224080267558e-06, "loss": 0.6077, "num_input_tokens_seen": 1898752, "step": 1845 }, { "epoch": 1.2374581939799332, "grad_norm": 2.4644248485565186, "learning_rate": 6.183946488294315e-06, "loss": 0.6315, "num_input_tokens_seen": 1904448, "step": 1850 }, { "epoch": 1.2408026755852843, "grad_norm": 1.6613271236419678, "learning_rate": 6.20066889632107e-06, "loss": 0.5007, "num_input_tokens_seen": 1909440, "step": 1855 }, { "epoch": 1.2441471571906355, "grad_norm": 2.7582147121429443, "learning_rate": 6.217391304347826e-06, "loss": 0.5884, "num_input_tokens_seen": 1914784, "step": 1860 }, { "epoch": 1.2474916387959867, "grad_norm": 2.096349000930786, "learning_rate": 6.2341137123745825e-06, "loss": 0.579, "num_input_tokens_seen": 1920192, "step": 1865 }, { "epoch": 1.2508361204013378, "grad_norm": 3.867453098297119, "learning_rate": 6.250836120401338e-06, "loss": 0.6846, "num_input_tokens_seen": 1925504, "step": 1870 }, { "epoch": 1.254180602006689, "grad_norm": 2.0656120777130127, "learning_rate": 6.267558528428094e-06, "loss": 0.4947, "num_input_tokens_seen": 1930304, "step": 1875 }, { "epoch": 1.2575250836120402, "grad_norm": 2.7704501152038574, "learning_rate": 6.2842809364548495e-06, "loss": 0.7035, "num_input_tokens_seen": 1935328, "step": 1880 }, { "epoch": 1.2608695652173914, "grad_norm": 3.3248684406280518, "learning_rate": 6.301003344481606e-06, "loss": 0.5838, "num_input_tokens_seen": 1940352, "step": 1885 }, { "epoch": 1.2642140468227425, "grad_norm": 3.7076594829559326, "learning_rate": 6.317725752508361e-06, "loss": 0.6071, "num_input_tokens_seen": 1945184, "step": 1890 }, { "epoch": 1.2675585284280937, "grad_norm": 3.2820332050323486, "learning_rate": 6.334448160535117e-06, "loss": 0.5251, "num_input_tokens_seen": 1950272, "step": 1895 }, { "epoch": 1.2709030100334449, "grad_norm": 2.9523653984069824, "learning_rate": 6.351170568561873e-06, "loss": 0.5442, "num_input_tokens_seen": 1955808, "step": 1900 }, { "epoch": 1.274247491638796, "grad_norm": 2.7227025032043457, "learning_rate": 6.367892976588629e-06, "loss": 0.6323, "num_input_tokens_seen": 1961280, "step": 1905 }, { "epoch": 1.2775919732441472, "grad_norm": 2.9452011585235596, "learning_rate": 6.384615384615384e-06, "loss": 0.535, "num_input_tokens_seen": 1966624, "step": 1910 }, { "epoch": 1.2809364548494984, "grad_norm": 1.9344284534454346, "learning_rate": 6.401337792642141e-06, "loss": 0.5344, "num_input_tokens_seen": 1971360, "step": 1915 }, { "epoch": 1.2842809364548495, "grad_norm": 2.076200246810913, "learning_rate": 6.418060200668896e-06, "loss": 0.6233, "num_input_tokens_seen": 1976608, "step": 1920 }, { "epoch": 1.2876254180602007, "grad_norm": 2.2363173961639404, "learning_rate": 6.434782608695652e-06, "loss": 0.6497, "num_input_tokens_seen": 1981824, "step": 1925 }, { "epoch": 1.2909698996655519, "grad_norm": 3.398129463195801, "learning_rate": 6.451505016722408e-06, "loss": 0.6486, "num_input_tokens_seen": 1987136, "step": 1930 }, { "epoch": 1.294314381270903, "grad_norm": 3.2124457359313965, "learning_rate": 6.468227424749164e-06, "loss": 0.5612, "num_input_tokens_seen": 1992224, "step": 1935 }, { "epoch": 1.2976588628762542, "grad_norm": 2.8153951168060303, "learning_rate": 6.484949832775919e-06, "loss": 0.5952, "num_input_tokens_seen": 1997824, "step": 1940 }, { "epoch": 1.3010033444816054, "grad_norm": 2.286677360534668, "learning_rate": 6.501672240802676e-06, "loss": 0.6026, "num_input_tokens_seen": 2003040, "step": 1945 }, { "epoch": 1.3043478260869565, "grad_norm": 3.958369731903076, "learning_rate": 6.518394648829433e-06, "loss": 0.5966, "num_input_tokens_seen": 2007712, "step": 1950 }, { "epoch": 1.3076923076923077, "grad_norm": 2.8128883838653564, "learning_rate": 6.535117056856188e-06, "loss": 0.4473, "num_input_tokens_seen": 2012992, "step": 1955 }, { "epoch": 1.3110367892976589, "grad_norm": 2.572955846786499, "learning_rate": 6.551839464882944e-06, "loss": 0.6034, "num_input_tokens_seen": 2018176, "step": 1960 }, { "epoch": 1.31438127090301, "grad_norm": 3.1513288021087646, "learning_rate": 6.5685618729097e-06, "loss": 0.6735, "num_input_tokens_seen": 2023424, "step": 1965 }, { "epoch": 1.3177257525083612, "grad_norm": 2.627446413040161, "learning_rate": 6.585284280936456e-06, "loss": 0.5986, "num_input_tokens_seen": 2027712, "step": 1970 }, { "epoch": 1.3210702341137124, "grad_norm": 3.112898111343384, "learning_rate": 6.602006688963211e-06, "loss": 0.5007, "num_input_tokens_seen": 2032640, "step": 1975 }, { "epoch": 1.3244147157190636, "grad_norm": 2.9700725078582764, "learning_rate": 6.6187290969899676e-06, "loss": 0.5472, "num_input_tokens_seen": 2037440, "step": 1980 }, { "epoch": 1.3277591973244147, "grad_norm": 2.3957598209381104, "learning_rate": 6.635451505016724e-06, "loss": 0.6206, "num_input_tokens_seen": 2042496, "step": 1985 }, { "epoch": 1.3311036789297659, "grad_norm": 1.9358699321746826, "learning_rate": 6.652173913043479e-06, "loss": 0.5468, "num_input_tokens_seen": 2047616, "step": 1990 }, { "epoch": 1.334448160535117, "grad_norm": 3.6269307136535645, "learning_rate": 6.6688963210702354e-06, "loss": 0.6944, "num_input_tokens_seen": 2052096, "step": 1995 }, { "epoch": 1.3377926421404682, "grad_norm": 1.906773567199707, "learning_rate": 6.685618729096991e-06, "loss": 0.6066, "num_input_tokens_seen": 2056608, "step": 2000 }, { "epoch": 1.3411371237458194, "grad_norm": 2.908466100692749, "learning_rate": 6.702341137123747e-06, "loss": 0.5574, "num_input_tokens_seen": 2061632, "step": 2005 }, { "epoch": 1.3444816053511706, "grad_norm": 2.8376402854919434, "learning_rate": 6.7190635451505025e-06, "loss": 0.7057, "num_input_tokens_seen": 2066944, "step": 2010 }, { "epoch": 1.3478260869565217, "grad_norm": 2.377216339111328, "learning_rate": 6.735785953177259e-06, "loss": 0.5403, "num_input_tokens_seen": 2073024, "step": 2015 }, { "epoch": 1.351170568561873, "grad_norm": 2.24552059173584, "learning_rate": 6.752508361204014e-06, "loss": 0.4987, "num_input_tokens_seen": 2077120, "step": 2020 }, { "epoch": 1.354515050167224, "grad_norm": 4.012527942657471, "learning_rate": 6.76923076923077e-06, "loss": 0.5985, "num_input_tokens_seen": 2082656, "step": 2025 }, { "epoch": 1.3578595317725752, "grad_norm": 2.3137714862823486, "learning_rate": 6.785953177257526e-06, "loss": 0.6031, "num_input_tokens_seen": 2087168, "step": 2030 }, { "epoch": 1.3612040133779264, "grad_norm": 2.323391914367676, "learning_rate": 6.802675585284282e-06, "loss": 0.5172, "num_input_tokens_seen": 2092448, "step": 2035 }, { "epoch": 1.3645484949832776, "grad_norm": 2.5172781944274902, "learning_rate": 6.819397993311037e-06, "loss": 0.5741, "num_input_tokens_seen": 2097472, "step": 2040 }, { "epoch": 1.3678929765886287, "grad_norm": 1.8199257850646973, "learning_rate": 6.836120401337794e-06, "loss": 0.5347, "num_input_tokens_seen": 2102560, "step": 2045 }, { "epoch": 1.37123745819398, "grad_norm": 3.0208420753479004, "learning_rate": 6.852842809364549e-06, "loss": 0.554, "num_input_tokens_seen": 2106656, "step": 2050 }, { "epoch": 1.374581939799331, "grad_norm": 2.5065739154815674, "learning_rate": 6.869565217391305e-06, "loss": 0.582, "num_input_tokens_seen": 2111936, "step": 2055 }, { "epoch": 1.3779264214046822, "grad_norm": 4.499659061431885, "learning_rate": 6.886287625418061e-06, "loss": 0.6038, "num_input_tokens_seen": 2117216, "step": 2060 }, { "epoch": 1.3812709030100334, "grad_norm": 3.416724920272827, "learning_rate": 6.903010033444817e-06, "loss": 0.6736, "num_input_tokens_seen": 2123360, "step": 2065 }, { "epoch": 1.3846153846153846, "grad_norm": 2.8338065147399902, "learning_rate": 6.919732441471573e-06, "loss": 0.5955, "num_input_tokens_seen": 2127904, "step": 2070 }, { "epoch": 1.3879598662207357, "grad_norm": 1.6290894746780396, "learning_rate": 6.9364548494983285e-06, "loss": 0.5437, "num_input_tokens_seen": 2134144, "step": 2075 }, { "epoch": 1.391304347826087, "grad_norm": 3.5717952251434326, "learning_rate": 6.953177257525085e-06, "loss": 0.5956, "num_input_tokens_seen": 2138368, "step": 2080 }, { "epoch": 1.394648829431438, "grad_norm": 2.8933279514312744, "learning_rate": 6.96989966555184e-06, "loss": 0.563, "num_input_tokens_seen": 2142720, "step": 2085 }, { "epoch": 1.3979933110367893, "grad_norm": 1.9319531917572021, "learning_rate": 6.9866220735785964e-06, "loss": 0.5445, "num_input_tokens_seen": 2147584, "step": 2090 }, { "epoch": 1.4013377926421404, "grad_norm": 2.2988717555999756, "learning_rate": 7.003344481605352e-06, "loss": 0.5723, "num_input_tokens_seen": 2152512, "step": 2095 }, { "epoch": 1.4046822742474916, "grad_norm": 3.593855619430542, "learning_rate": 7.020066889632108e-06, "loss": 0.551, "num_input_tokens_seen": 2157536, "step": 2100 }, { "epoch": 1.4080267558528428, "grad_norm": 2.5556607246398926, "learning_rate": 7.0367892976588635e-06, "loss": 0.5332, "num_input_tokens_seen": 2163072, "step": 2105 }, { "epoch": 1.411371237458194, "grad_norm": 2.9890267848968506, "learning_rate": 7.05351170568562e-06, "loss": 0.5391, "num_input_tokens_seen": 2168640, "step": 2110 }, { "epoch": 1.414715719063545, "grad_norm": 2.4829821586608887, "learning_rate": 7.070234113712375e-06, "loss": 0.6528, "num_input_tokens_seen": 2172736, "step": 2115 }, { "epoch": 1.4180602006688963, "grad_norm": 2.0071589946746826, "learning_rate": 7.086956521739131e-06, "loss": 0.6119, "num_input_tokens_seen": 2177920, "step": 2120 }, { "epoch": 1.4214046822742474, "grad_norm": 2.5345239639282227, "learning_rate": 7.103678929765887e-06, "loss": 0.5933, "num_input_tokens_seen": 2183104, "step": 2125 }, { "epoch": 1.4247491638795986, "grad_norm": 2.527320146560669, "learning_rate": 7.120401337792643e-06, "loss": 0.5582, "num_input_tokens_seen": 2188384, "step": 2130 }, { "epoch": 1.4280936454849498, "grad_norm": 2.586773633956909, "learning_rate": 7.137123745819398e-06, "loss": 0.6123, "num_input_tokens_seen": 2192768, "step": 2135 }, { "epoch": 1.431438127090301, "grad_norm": 3.2184250354766846, "learning_rate": 7.153846153846155e-06, "loss": 0.4799, "num_input_tokens_seen": 2197216, "step": 2140 }, { "epoch": 1.434782608695652, "grad_norm": 2.292337656021118, "learning_rate": 7.17056856187291e-06, "loss": 0.5604, "num_input_tokens_seen": 2202976, "step": 2145 }, { "epoch": 1.4381270903010033, "grad_norm": 2.401183843612671, "learning_rate": 7.187290969899666e-06, "loss": 0.5421, "num_input_tokens_seen": 2207968, "step": 2150 }, { "epoch": 1.4414715719063544, "grad_norm": 3.673666477203369, "learning_rate": 7.2040133779264225e-06, "loss": 0.6328, "num_input_tokens_seen": 2213312, "step": 2155 }, { "epoch": 1.4448160535117056, "grad_norm": 2.834078311920166, "learning_rate": 7.220735785953178e-06, "loss": 0.6002, "num_input_tokens_seen": 2218912, "step": 2160 }, { "epoch": 1.4481605351170568, "grad_norm": 2.5652060508728027, "learning_rate": 7.237458193979934e-06, "loss": 0.5117, "num_input_tokens_seen": 2223840, "step": 2165 }, { "epoch": 1.451505016722408, "grad_norm": 3.786206007003784, "learning_rate": 7.2541806020066895e-06, "loss": 0.5547, "num_input_tokens_seen": 2229344, "step": 2170 }, { "epoch": 1.4548494983277591, "grad_norm": 3.4987480640411377, "learning_rate": 7.270903010033446e-06, "loss": 0.549, "num_input_tokens_seen": 2234336, "step": 2175 }, { "epoch": 1.4581939799331103, "grad_norm": 3.0915720462799072, "learning_rate": 7.287625418060201e-06, "loss": 0.59, "num_input_tokens_seen": 2239712, "step": 2180 }, { "epoch": 1.4615384615384617, "grad_norm": 2.401930093765259, "learning_rate": 7.304347826086957e-06, "loss": 0.5354, "num_input_tokens_seen": 2244608, "step": 2185 }, { "epoch": 1.4648829431438126, "grad_norm": 2.3195245265960693, "learning_rate": 7.321070234113713e-06, "loss": 0.5285, "num_input_tokens_seen": 2249280, "step": 2190 }, { "epoch": 1.468227424749164, "grad_norm": 2.974088430404663, "learning_rate": 7.337792642140469e-06, "loss": 0.5919, "num_input_tokens_seen": 2255360, "step": 2195 }, { "epoch": 1.471571906354515, "grad_norm": 1.925618290901184, "learning_rate": 7.3545150501672244e-06, "loss": 0.5894, "num_input_tokens_seen": 2259648, "step": 2200 }, { "epoch": 1.4749163879598663, "grad_norm": 2.267645835876465, "learning_rate": 7.371237458193981e-06, "loss": 0.5405, "num_input_tokens_seen": 2265664, "step": 2205 }, { "epoch": 1.4782608695652173, "grad_norm": 2.2234129905700684, "learning_rate": 7.387959866220736e-06, "loss": 0.4974, "num_input_tokens_seen": 2270496, "step": 2210 }, { "epoch": 1.4816053511705687, "grad_norm": 2.854872703552246, "learning_rate": 7.404682274247492e-06, "loss": 0.4524, "num_input_tokens_seen": 2275488, "step": 2215 }, { "epoch": 1.4849498327759196, "grad_norm": 2.4369559288024902, "learning_rate": 7.421404682274248e-06, "loss": 0.5677, "num_input_tokens_seen": 2280448, "step": 2220 }, { "epoch": 1.488294314381271, "grad_norm": 2.629380226135254, "learning_rate": 7.438127090301004e-06, "loss": 0.4955, "num_input_tokens_seen": 2285216, "step": 2225 }, { "epoch": 1.491638795986622, "grad_norm": 2.1333413124084473, "learning_rate": 7.454849498327759e-06, "loss": 0.54, "num_input_tokens_seen": 2291040, "step": 2230 }, { "epoch": 1.4949832775919734, "grad_norm": 1.6409038305282593, "learning_rate": 7.471571906354516e-06, "loss": 0.5, "num_input_tokens_seen": 2296544, "step": 2235 }, { "epoch": 1.4983277591973243, "grad_norm": 1.8903368711471558, "learning_rate": 7.488294314381271e-06, "loss": 0.5856, "num_input_tokens_seen": 2301792, "step": 2240 }, { "epoch": 1.5016722408026757, "grad_norm": 2.362424373626709, "learning_rate": 7.505016722408027e-06, "loss": 0.6439, "num_input_tokens_seen": 2306496, "step": 2245 }, { "epoch": 1.5050167224080266, "grad_norm": 2.0560414791107178, "learning_rate": 7.5217391304347835e-06, "loss": 0.5879, "num_input_tokens_seen": 2312032, "step": 2250 }, { "epoch": 1.508361204013378, "grad_norm": 1.7975596189498901, "learning_rate": 7.538461538461539e-06, "loss": 0.5618, "num_input_tokens_seen": 2317312, "step": 2255 }, { "epoch": 1.511705685618729, "grad_norm": 2.3156912326812744, "learning_rate": 7.555183946488295e-06, "loss": 0.5742, "num_input_tokens_seen": 2322112, "step": 2260 }, { "epoch": 1.5150501672240804, "grad_norm": 1.9195119142532349, "learning_rate": 7.5719063545150505e-06, "loss": 0.6096, "num_input_tokens_seen": 2327520, "step": 2265 }, { "epoch": 1.5183946488294313, "grad_norm": 2.1273512840270996, "learning_rate": 7.588628762541807e-06, "loss": 0.5401, "num_input_tokens_seen": 2333024, "step": 2270 }, { "epoch": 1.5217391304347827, "grad_norm": 1.8048537969589233, "learning_rate": 7.605351170568562e-06, "loss": 0.5797, "num_input_tokens_seen": 2338400, "step": 2275 }, { "epoch": 1.5250836120401337, "grad_norm": 3.1229827404022217, "learning_rate": 7.622073578595318e-06, "loss": 0.5795, "num_input_tokens_seen": 2343584, "step": 2280 }, { "epoch": 1.528428093645485, "grad_norm": 2.1641931533813477, "learning_rate": 7.638795986622075e-06, "loss": 0.4543, "num_input_tokens_seen": 2348928, "step": 2285 }, { "epoch": 1.531772575250836, "grad_norm": 1.4358283281326294, "learning_rate": 7.65551839464883e-06, "loss": 0.4947, "num_input_tokens_seen": 2354368, "step": 2290 }, { "epoch": 1.5351170568561874, "grad_norm": 1.7273856401443481, "learning_rate": 7.672240802675585e-06, "loss": 0.6738, "num_input_tokens_seen": 2359424, "step": 2295 }, { "epoch": 1.5384615384615383, "grad_norm": 1.8432101011276245, "learning_rate": 7.688963210702342e-06, "loss": 0.5156, "num_input_tokens_seen": 2364864, "step": 2300 }, { "epoch": 1.5418060200668897, "grad_norm": 2.528012752532959, "learning_rate": 7.705685618729098e-06, "loss": 0.5953, "num_input_tokens_seen": 2369856, "step": 2305 }, { "epoch": 1.5451505016722407, "grad_norm": 1.8891600370407104, "learning_rate": 7.722408026755852e-06, "loss": 0.4257, "num_input_tokens_seen": 2374720, "step": 2310 }, { "epoch": 1.548494983277592, "grad_norm": 2.19561767578125, "learning_rate": 7.739130434782609e-06, "loss": 0.647, "num_input_tokens_seen": 2379904, "step": 2315 }, { "epoch": 1.551839464882943, "grad_norm": 2.186650276184082, "learning_rate": 7.755852842809365e-06, "loss": 0.5976, "num_input_tokens_seen": 2384576, "step": 2320 }, { "epoch": 1.5551839464882944, "grad_norm": 1.9150667190551758, "learning_rate": 7.772575250836121e-06, "loss": 0.5544, "num_input_tokens_seen": 2389440, "step": 2325 }, { "epoch": 1.5585284280936453, "grad_norm": 1.4499436616897583, "learning_rate": 7.789297658862877e-06, "loss": 0.5284, "num_input_tokens_seen": 2395328, "step": 2330 }, { "epoch": 1.5618729096989967, "grad_norm": 1.682037591934204, "learning_rate": 7.806020066889632e-06, "loss": 0.535, "num_input_tokens_seen": 2400800, "step": 2335 }, { "epoch": 1.5652173913043477, "grad_norm": 1.9106417894363403, "learning_rate": 7.822742474916388e-06, "loss": 0.5098, "num_input_tokens_seen": 2406528, "step": 2340 }, { "epoch": 1.568561872909699, "grad_norm": 1.8523521423339844, "learning_rate": 7.839464882943144e-06, "loss": 0.517, "num_input_tokens_seen": 2412352, "step": 2345 }, { "epoch": 1.57190635451505, "grad_norm": 2.0089356899261475, "learning_rate": 7.8561872909699e-06, "loss": 0.5296, "num_input_tokens_seen": 2417152, "step": 2350 }, { "epoch": 1.5752508361204014, "grad_norm": 2.2337710857391357, "learning_rate": 7.872909698996655e-06, "loss": 0.5353, "num_input_tokens_seen": 2422400, "step": 2355 }, { "epoch": 1.5785953177257523, "grad_norm": 4.249716758728027, "learning_rate": 7.889632107023411e-06, "loss": 0.6171, "num_input_tokens_seen": 2427456, "step": 2360 }, { "epoch": 1.5819397993311037, "grad_norm": 1.2874336242675781, "learning_rate": 7.906354515050168e-06, "loss": 0.5441, "num_input_tokens_seen": 2432544, "step": 2365 }, { "epoch": 1.585284280936455, "grad_norm": 2.1092848777770996, "learning_rate": 7.923076923076924e-06, "loss": 0.5608, "num_input_tokens_seen": 2437152, "step": 2370 }, { "epoch": 1.588628762541806, "grad_norm": 3.2494397163391113, "learning_rate": 7.939799331103679e-06, "loss": 0.5347, "num_input_tokens_seen": 2441888, "step": 2375 }, { "epoch": 1.5919732441471572, "grad_norm": 1.5807664394378662, "learning_rate": 7.956521739130435e-06, "loss": 0.4705, "num_input_tokens_seen": 2448512, "step": 2380 }, { "epoch": 1.5953177257525084, "grad_norm": 2.7644150257110596, "learning_rate": 7.973244147157191e-06, "loss": 0.6161, "num_input_tokens_seen": 2453088, "step": 2385 }, { "epoch": 1.5986622073578596, "grad_norm": 4.532840728759766, "learning_rate": 7.989966555183947e-06, "loss": 0.6872, "num_input_tokens_seen": 2457728, "step": 2390 }, { "epoch": 1.6020066889632107, "grad_norm": 2.91133451461792, "learning_rate": 8.006688963210702e-06, "loss": 0.5734, "num_input_tokens_seen": 2462944, "step": 2395 }, { "epoch": 1.605351170568562, "grad_norm": 3.0326640605926514, "learning_rate": 8.023411371237458e-06, "loss": 0.4845, "num_input_tokens_seen": 2468256, "step": 2400 }, { "epoch": 1.608695652173913, "grad_norm": 3.9772727489471436, "learning_rate": 8.040133779264214e-06, "loss": 0.6082, "num_input_tokens_seen": 2472672, "step": 2405 }, { "epoch": 1.6120401337792643, "grad_norm": 1.620176076889038, "learning_rate": 8.05685618729097e-06, "loss": 0.6866, "num_input_tokens_seen": 2478048, "step": 2410 }, { "epoch": 1.6153846153846154, "grad_norm": 1.9549012184143066, "learning_rate": 8.073578595317727e-06, "loss": 0.4908, "num_input_tokens_seen": 2483520, "step": 2415 }, { "epoch": 1.6187290969899666, "grad_norm": 2.692873239517212, "learning_rate": 8.090301003344481e-06, "loss": 0.5983, "num_input_tokens_seen": 2489472, "step": 2420 }, { "epoch": 1.6220735785953178, "grad_norm": 2.2534921169281006, "learning_rate": 8.107023411371238e-06, "loss": 0.6694, "num_input_tokens_seen": 2494112, "step": 2425 }, { "epoch": 1.625418060200669, "grad_norm": 1.8375470638275146, "learning_rate": 8.123745819397994e-06, "loss": 0.568, "num_input_tokens_seen": 2499616, "step": 2430 }, { "epoch": 1.62876254180602, "grad_norm": 1.9575871229171753, "learning_rate": 8.14046822742475e-06, "loss": 0.5943, "num_input_tokens_seen": 2504672, "step": 2435 }, { "epoch": 1.6321070234113713, "grad_norm": 2.888976812362671, "learning_rate": 8.157190635451505e-06, "loss": 0.5727, "num_input_tokens_seen": 2509664, "step": 2440 }, { "epoch": 1.6354515050167224, "grad_norm": 1.9570945501327515, "learning_rate": 8.173913043478263e-06, "loss": 0.507, "num_input_tokens_seen": 2515744, "step": 2445 }, { "epoch": 1.6387959866220736, "grad_norm": 2.108715772628784, "learning_rate": 8.190635451505019e-06, "loss": 0.4923, "num_input_tokens_seen": 2520704, "step": 2450 }, { "epoch": 1.6421404682274248, "grad_norm": 2.1967694759368896, "learning_rate": 8.207357859531773e-06, "loss": 0.5423, "num_input_tokens_seen": 2525376, "step": 2455 }, { "epoch": 1.645484949832776, "grad_norm": 2.7484993934631348, "learning_rate": 8.22408026755853e-06, "loss": 0.5476, "num_input_tokens_seen": 2530624, "step": 2460 }, { "epoch": 1.648829431438127, "grad_norm": 4.776459693908691, "learning_rate": 8.240802675585286e-06, "loss": 0.5049, "num_input_tokens_seen": 2535104, "step": 2465 }, { "epoch": 1.6521739130434783, "grad_norm": 2.5438809394836426, "learning_rate": 8.257525083612042e-06, "loss": 0.5658, "num_input_tokens_seen": 2539936, "step": 2470 }, { "epoch": 1.6555183946488294, "grad_norm": 2.5061213970184326, "learning_rate": 8.274247491638797e-06, "loss": 0.5931, "num_input_tokens_seen": 2544992, "step": 2475 }, { "epoch": 1.6588628762541806, "grad_norm": 2.412627935409546, "learning_rate": 8.290969899665553e-06, "loss": 0.5613, "num_input_tokens_seen": 2550144, "step": 2480 }, { "epoch": 1.6622073578595318, "grad_norm": 3.6248841285705566, "learning_rate": 8.307692307692309e-06, "loss": 0.5951, "num_input_tokens_seen": 2555136, "step": 2485 }, { "epoch": 1.665551839464883, "grad_norm": 3.2327167987823486, "learning_rate": 8.324414715719065e-06, "loss": 0.5605, "num_input_tokens_seen": 2560928, "step": 2490 }, { "epoch": 1.6688963210702341, "grad_norm": 2.197124481201172, "learning_rate": 8.34113712374582e-06, "loss": 0.5526, "num_input_tokens_seen": 2565984, "step": 2495 }, { "epoch": 1.6722408026755853, "grad_norm": 3.4529519081115723, "learning_rate": 8.357859531772576e-06, "loss": 0.5695, "num_input_tokens_seen": 2570816, "step": 2500 }, { "epoch": 1.6755852842809364, "grad_norm": 2.6004669666290283, "learning_rate": 8.374581939799332e-06, "loss": 0.6079, "num_input_tokens_seen": 2576064, "step": 2505 }, { "epoch": 1.6789297658862876, "grad_norm": 1.8591876029968262, "learning_rate": 8.391304347826089e-06, "loss": 0.6071, "num_input_tokens_seen": 2581984, "step": 2510 }, { "epoch": 1.6822742474916388, "grad_norm": 2.7183196544647217, "learning_rate": 8.408026755852843e-06, "loss": 0.5544, "num_input_tokens_seen": 2587104, "step": 2515 }, { "epoch": 1.68561872909699, "grad_norm": 4.634718418121338, "learning_rate": 8.4247491638796e-06, "loss": 0.6214, "num_input_tokens_seen": 2592448, "step": 2520 }, { "epoch": 1.6889632107023411, "grad_norm": 1.9271985292434692, "learning_rate": 8.441471571906356e-06, "loss": 0.4846, "num_input_tokens_seen": 2597184, "step": 2525 }, { "epoch": 1.6923076923076923, "grad_norm": 2.118332624435425, "learning_rate": 8.458193979933112e-06, "loss": 0.5533, "num_input_tokens_seen": 2602048, "step": 2530 }, { "epoch": 1.6956521739130435, "grad_norm": 2.440281867980957, "learning_rate": 8.474916387959868e-06, "loss": 0.5393, "num_input_tokens_seen": 2607104, "step": 2535 }, { "epoch": 1.6989966555183946, "grad_norm": 1.9326103925704956, "learning_rate": 8.491638795986623e-06, "loss": 0.5614, "num_input_tokens_seen": 2612192, "step": 2540 }, { "epoch": 1.7023411371237458, "grad_norm": 2.0723791122436523, "learning_rate": 8.508361204013379e-06, "loss": 0.6189, "num_input_tokens_seen": 2617632, "step": 2545 }, { "epoch": 1.705685618729097, "grad_norm": 1.5601929426193237, "learning_rate": 8.525083612040135e-06, "loss": 0.5398, "num_input_tokens_seen": 2622752, "step": 2550 }, { "epoch": 1.7090301003344481, "grad_norm": 1.661224126815796, "learning_rate": 8.541806020066891e-06, "loss": 0.4767, "num_input_tokens_seen": 2627904, "step": 2555 }, { "epoch": 1.7123745819397993, "grad_norm": 3.0421369075775146, "learning_rate": 8.558528428093646e-06, "loss": 0.4786, "num_input_tokens_seen": 2633440, "step": 2560 }, { "epoch": 1.7157190635451505, "grad_norm": 2.5981807708740234, "learning_rate": 8.575250836120402e-06, "loss": 0.478, "num_input_tokens_seen": 2638720, "step": 2565 }, { "epoch": 1.7190635451505016, "grad_norm": 2.9157519340515137, "learning_rate": 8.591973244147158e-06, "loss": 0.5886, "num_input_tokens_seen": 2643232, "step": 2570 }, { "epoch": 1.7224080267558528, "grad_norm": 2.5606119632720947, "learning_rate": 8.608695652173915e-06, "loss": 0.5819, "num_input_tokens_seen": 2647936, "step": 2575 }, { "epoch": 1.725752508361204, "grad_norm": 1.885940432548523, "learning_rate": 8.62541806020067e-06, "loss": 0.4355, "num_input_tokens_seen": 2653184, "step": 2580 }, { "epoch": 1.7290969899665551, "grad_norm": 2.8506417274475098, "learning_rate": 8.642140468227425e-06, "loss": 0.5085, "num_input_tokens_seen": 2657920, "step": 2585 }, { "epoch": 1.7324414715719063, "grad_norm": 1.6913057565689087, "learning_rate": 8.658862876254182e-06, "loss": 0.4856, "num_input_tokens_seen": 2662016, "step": 2590 }, { "epoch": 1.7357859531772575, "grad_norm": 2.097841262817383, "learning_rate": 8.675585284280938e-06, "loss": 0.5825, "num_input_tokens_seen": 2667584, "step": 2595 }, { "epoch": 1.7391304347826086, "grad_norm": 1.5609009265899658, "learning_rate": 8.692307692307692e-06, "loss": 0.4452, "num_input_tokens_seen": 2673248, "step": 2600 }, { "epoch": 1.7424749163879598, "grad_norm": 1.87007737159729, "learning_rate": 8.709030100334449e-06, "loss": 0.5168, "num_input_tokens_seen": 2678752, "step": 2605 }, { "epoch": 1.745819397993311, "grad_norm": 2.4884285926818848, "learning_rate": 8.725752508361205e-06, "loss": 0.5672, "num_input_tokens_seen": 2683648, "step": 2610 }, { "epoch": 1.7491638795986622, "grad_norm": 1.9547523260116577, "learning_rate": 8.742474916387961e-06, "loss": 0.5116, "num_input_tokens_seen": 2689216, "step": 2615 }, { "epoch": 1.7525083612040135, "grad_norm": 1.9719293117523193, "learning_rate": 8.759197324414716e-06, "loss": 0.6338, "num_input_tokens_seen": 2693216, "step": 2620 }, { "epoch": 1.7558528428093645, "grad_norm": 2.4037771224975586, "learning_rate": 8.775919732441472e-06, "loss": 0.482, "num_input_tokens_seen": 2697472, "step": 2625 }, { "epoch": 1.7591973244147159, "grad_norm": 1.7648074626922607, "learning_rate": 8.792642140468228e-06, "loss": 0.534, "num_input_tokens_seen": 2702560, "step": 2630 }, { "epoch": 1.7625418060200668, "grad_norm": 2.1394588947296143, "learning_rate": 8.809364548494984e-06, "loss": 0.5376, "num_input_tokens_seen": 2707776, "step": 2635 }, { "epoch": 1.7658862876254182, "grad_norm": 1.5230865478515625, "learning_rate": 8.82608695652174e-06, "loss": 0.5611, "num_input_tokens_seen": 2712768, "step": 2640 }, { "epoch": 1.7692307692307692, "grad_norm": 1.7575663328170776, "learning_rate": 8.842809364548495e-06, "loss": 0.546, "num_input_tokens_seen": 2719296, "step": 2645 }, { "epoch": 1.7725752508361206, "grad_norm": 2.33321213722229, "learning_rate": 8.859531772575252e-06, "loss": 0.6433, "num_input_tokens_seen": 2724160, "step": 2650 }, { "epoch": 1.7759197324414715, "grad_norm": 2.7204771041870117, "learning_rate": 8.876254180602008e-06, "loss": 0.5275, "num_input_tokens_seen": 2729184, "step": 2655 }, { "epoch": 1.779264214046823, "grad_norm": 3.311382293701172, "learning_rate": 8.892976588628764e-06, "loss": 0.5544, "num_input_tokens_seen": 2734656, "step": 2660 }, { "epoch": 1.7826086956521738, "grad_norm": 2.3531582355499268, "learning_rate": 8.909698996655519e-06, "loss": 0.5788, "num_input_tokens_seen": 2740160, "step": 2665 }, { "epoch": 1.7859531772575252, "grad_norm": 1.7714669704437256, "learning_rate": 8.926421404682275e-06, "loss": 0.5967, "num_input_tokens_seen": 2745632, "step": 2670 }, { "epoch": 1.7892976588628762, "grad_norm": 1.633301019668579, "learning_rate": 8.943143812709031e-06, "loss": 0.5802, "num_input_tokens_seen": 2750048, "step": 2675 }, { "epoch": 1.7926421404682276, "grad_norm": 1.5225788354873657, "learning_rate": 8.959866220735787e-06, "loss": 0.6193, "num_input_tokens_seen": 2754720, "step": 2680 }, { "epoch": 1.7959866220735785, "grad_norm": 1.6047158241271973, "learning_rate": 8.976588628762542e-06, "loss": 0.511, "num_input_tokens_seen": 2759456, "step": 2685 }, { "epoch": 1.79933110367893, "grad_norm": 1.3513319492340088, "learning_rate": 8.993311036789298e-06, "loss": 0.5311, "num_input_tokens_seen": 2764608, "step": 2690 }, { "epoch": 1.8026755852842808, "grad_norm": 1.9756256341934204, "learning_rate": 9.010033444816054e-06, "loss": 0.543, "num_input_tokens_seen": 2769344, "step": 2695 }, { "epoch": 1.8060200668896322, "grad_norm": 2.026555061340332, "learning_rate": 9.02675585284281e-06, "loss": 0.4329, "num_input_tokens_seen": 2774432, "step": 2700 }, { "epoch": 1.8093645484949832, "grad_norm": 2.585559368133545, "learning_rate": 9.043478260869565e-06, "loss": 0.5428, "num_input_tokens_seen": 2779584, "step": 2705 }, { "epoch": 1.8127090301003346, "grad_norm": 1.808197021484375, "learning_rate": 9.060200668896321e-06, "loss": 0.5255, "num_input_tokens_seen": 2784896, "step": 2710 }, { "epoch": 1.8160535117056855, "grad_norm": 2.179985761642456, "learning_rate": 9.076923076923078e-06, "loss": 0.5571, "num_input_tokens_seen": 2790304, "step": 2715 }, { "epoch": 1.819397993311037, "grad_norm": 1.8703956604003906, "learning_rate": 9.093645484949834e-06, "loss": 0.5509, "num_input_tokens_seen": 2796256, "step": 2720 }, { "epoch": 1.8227424749163879, "grad_norm": 2.417612075805664, "learning_rate": 9.11036789297659e-06, "loss": 0.4751, "num_input_tokens_seen": 2801088, "step": 2725 }, { "epoch": 1.8260869565217392, "grad_norm": 1.7000787258148193, "learning_rate": 9.127090301003345e-06, "loss": 0.6928, "num_input_tokens_seen": 2805920, "step": 2730 }, { "epoch": 1.8294314381270902, "grad_norm": 2.1750431060791016, "learning_rate": 9.143812709030101e-06, "loss": 0.5759, "num_input_tokens_seen": 2811392, "step": 2735 }, { "epoch": 1.8327759197324416, "grad_norm": 1.894718050956726, "learning_rate": 9.160535117056857e-06, "loss": 0.5782, "num_input_tokens_seen": 2816160, "step": 2740 }, { "epoch": 1.8361204013377925, "grad_norm": 1.2475227117538452, "learning_rate": 9.177257525083613e-06, "loss": 0.6389, "num_input_tokens_seen": 2820832, "step": 2745 }, { "epoch": 1.839464882943144, "grad_norm": 1.9611231088638306, "learning_rate": 9.193979933110368e-06, "loss": 0.5796, "num_input_tokens_seen": 2826688, "step": 2750 }, { "epoch": 1.8428093645484949, "grad_norm": 1.7836278676986694, "learning_rate": 9.210702341137124e-06, "loss": 0.5448, "num_input_tokens_seen": 2832576, "step": 2755 }, { "epoch": 1.8461538461538463, "grad_norm": 1.988349199295044, "learning_rate": 9.22742474916388e-06, "loss": 0.4707, "num_input_tokens_seen": 2837920, "step": 2760 }, { "epoch": 1.8494983277591972, "grad_norm": 2.154717206954956, "learning_rate": 9.244147157190637e-06, "loss": 0.5399, "num_input_tokens_seen": 2843328, "step": 2765 }, { "epoch": 1.8528428093645486, "grad_norm": 1.9529693126678467, "learning_rate": 9.260869565217391e-06, "loss": 0.6024, "num_input_tokens_seen": 2848160, "step": 2770 }, { "epoch": 1.8561872909698995, "grad_norm": 2.3039426803588867, "learning_rate": 9.277591973244147e-06, "loss": 0.6384, "num_input_tokens_seen": 2852864, "step": 2775 }, { "epoch": 1.859531772575251, "grad_norm": 1.7799960374832153, "learning_rate": 9.294314381270904e-06, "loss": 0.4958, "num_input_tokens_seen": 2857504, "step": 2780 }, { "epoch": 1.8628762541806019, "grad_norm": 1.7436339855194092, "learning_rate": 9.31103678929766e-06, "loss": 0.5651, "num_input_tokens_seen": 2861920, "step": 2785 }, { "epoch": 1.8662207357859533, "grad_norm": 1.0296903848648071, "learning_rate": 9.327759197324414e-06, "loss": 0.4292, "num_input_tokens_seen": 2867072, "step": 2790 }, { "epoch": 1.8695652173913042, "grad_norm": 1.3697514533996582, "learning_rate": 9.34448160535117e-06, "loss": 0.5287, "num_input_tokens_seen": 2872768, "step": 2795 }, { "epoch": 1.8729096989966556, "grad_norm": 2.3225924968719482, "learning_rate": 9.361204013377927e-06, "loss": 0.5739, "num_input_tokens_seen": 2877856, "step": 2800 }, { "epoch": 1.8762541806020065, "grad_norm": 1.5034071207046509, "learning_rate": 9.377926421404683e-06, "loss": 0.4837, "num_input_tokens_seen": 2883168, "step": 2805 }, { "epoch": 1.879598662207358, "grad_norm": 1.397858738899231, "learning_rate": 9.39464882943144e-06, "loss": 0.5818, "num_input_tokens_seen": 2889024, "step": 2810 }, { "epoch": 1.8829431438127089, "grad_norm": 2.1793863773345947, "learning_rate": 9.411371237458194e-06, "loss": 0.462, "num_input_tokens_seen": 2893760, "step": 2815 }, { "epoch": 1.8862876254180603, "grad_norm": 1.8835233449935913, "learning_rate": 9.42809364548495e-06, "loss": 0.5527, "num_input_tokens_seen": 2898720, "step": 2820 }, { "epoch": 1.8896321070234112, "grad_norm": 2.151020050048828, "learning_rate": 9.444816053511706e-06, "loss": 0.5947, "num_input_tokens_seen": 2903232, "step": 2825 }, { "epoch": 1.8929765886287626, "grad_norm": 1.8735629320144653, "learning_rate": 9.461538461538463e-06, "loss": 0.4659, "num_input_tokens_seen": 2909536, "step": 2830 }, { "epoch": 1.8963210702341136, "grad_norm": 2.694094181060791, "learning_rate": 9.478260869565217e-06, "loss": 0.4817, "num_input_tokens_seen": 2913664, "step": 2835 }, { "epoch": 1.899665551839465, "grad_norm": 1.6713290214538574, "learning_rate": 9.494983277591973e-06, "loss": 0.4855, "num_input_tokens_seen": 2919104, "step": 2840 }, { "epoch": 1.903010033444816, "grad_norm": 2.0527498722076416, "learning_rate": 9.51170568561873e-06, "loss": 0.4238, "num_input_tokens_seen": 2923712, "step": 2845 }, { "epoch": 1.9063545150501673, "grad_norm": 2.2317285537719727, "learning_rate": 9.528428093645486e-06, "loss": 0.5644, "num_input_tokens_seen": 2928608, "step": 2850 }, { "epoch": 1.9096989966555182, "grad_norm": 2.6343657970428467, "learning_rate": 9.54515050167224e-06, "loss": 0.6654, "num_input_tokens_seen": 2934528, "step": 2855 }, { "epoch": 1.9130434782608696, "grad_norm": 1.494659185409546, "learning_rate": 9.561872909698997e-06, "loss": 0.5684, "num_input_tokens_seen": 2940128, "step": 2860 }, { "epoch": 1.9163879598662206, "grad_norm": 1.7828717231750488, "learning_rate": 9.578595317725753e-06, "loss": 0.6237, "num_input_tokens_seen": 2945152, "step": 2865 }, { "epoch": 1.919732441471572, "grad_norm": 2.0950381755828857, "learning_rate": 9.59531772575251e-06, "loss": 0.5309, "num_input_tokens_seen": 2950208, "step": 2870 }, { "epoch": 1.9230769230769231, "grad_norm": 1.3486195802688599, "learning_rate": 9.612040133779264e-06, "loss": 0.5291, "num_input_tokens_seen": 2956192, "step": 2875 }, { "epoch": 1.9264214046822743, "grad_norm": 1.8326590061187744, "learning_rate": 9.62876254180602e-06, "loss": 0.4367, "num_input_tokens_seen": 2962208, "step": 2880 }, { "epoch": 1.9297658862876255, "grad_norm": 1.7152191400527954, "learning_rate": 9.645484949832776e-06, "loss": 0.5231, "num_input_tokens_seen": 2967776, "step": 2885 }, { "epoch": 1.9331103678929766, "grad_norm": 1.5462260246276855, "learning_rate": 9.662207357859533e-06, "loss": 0.512, "num_input_tokens_seen": 2973216, "step": 2890 }, { "epoch": 1.9364548494983278, "grad_norm": 2.2408957481384277, "learning_rate": 9.678929765886289e-06, "loss": 0.5325, "num_input_tokens_seen": 2978016, "step": 2895 }, { "epoch": 1.939799331103679, "grad_norm": 1.789434790611267, "learning_rate": 9.695652173913043e-06, "loss": 0.6124, "num_input_tokens_seen": 2983456, "step": 2900 }, { "epoch": 1.9431438127090301, "grad_norm": 1.3208471536636353, "learning_rate": 9.7123745819398e-06, "loss": 0.5096, "num_input_tokens_seen": 2988896, "step": 2905 }, { "epoch": 1.9464882943143813, "grad_norm": 2.471282720565796, "learning_rate": 9.729096989966556e-06, "loss": 0.5723, "num_input_tokens_seen": 2993344, "step": 2910 }, { "epoch": 1.9498327759197325, "grad_norm": 1.845829963684082, "learning_rate": 9.745819397993312e-06, "loss": 0.5179, "num_input_tokens_seen": 2998400, "step": 2915 }, { "epoch": 1.9531772575250836, "grad_norm": 2.1256611347198486, "learning_rate": 9.762541806020067e-06, "loss": 0.6154, "num_input_tokens_seen": 3003552, "step": 2920 }, { "epoch": 1.9565217391304348, "grad_norm": 2.5855233669281006, "learning_rate": 9.779264214046823e-06, "loss": 0.5372, "num_input_tokens_seen": 3008128, "step": 2925 }, { "epoch": 1.959866220735786, "grad_norm": 1.5413987636566162, "learning_rate": 9.795986622073579e-06, "loss": 0.5187, "num_input_tokens_seen": 3014080, "step": 2930 }, { "epoch": 1.9632107023411371, "grad_norm": 1.5255895853042603, "learning_rate": 9.812709030100335e-06, "loss": 0.534, "num_input_tokens_seen": 3019712, "step": 2935 }, { "epoch": 1.9665551839464883, "grad_norm": 1.505606770515442, "learning_rate": 9.82943143812709e-06, "loss": 0.5578, "num_input_tokens_seen": 3024640, "step": 2940 }, { "epoch": 1.9698996655518395, "grad_norm": 2.288947343826294, "learning_rate": 9.846153846153848e-06, "loss": 0.5083, "num_input_tokens_seen": 3029728, "step": 2945 }, { "epoch": 1.9732441471571907, "grad_norm": 3.3660731315612793, "learning_rate": 9.862876254180604e-06, "loss": 0.555, "num_input_tokens_seen": 3035392, "step": 2950 }, { "epoch": 1.9765886287625418, "grad_norm": 1.5082626342773438, "learning_rate": 9.879598662207359e-06, "loss": 0.4863, "num_input_tokens_seen": 3040608, "step": 2955 }, { "epoch": 1.979933110367893, "grad_norm": 1.6702715158462524, "learning_rate": 9.896321070234115e-06, "loss": 0.6187, "num_input_tokens_seen": 3045056, "step": 2960 }, { "epoch": 1.9832775919732442, "grad_norm": 2.3047637939453125, "learning_rate": 9.913043478260871e-06, "loss": 0.484, "num_input_tokens_seen": 3050592, "step": 2965 }, { "epoch": 1.9866220735785953, "grad_norm": 2.957206964492798, "learning_rate": 9.929765886287627e-06, "loss": 0.5206, "num_input_tokens_seen": 3055456, "step": 2970 }, { "epoch": 1.9899665551839465, "grad_norm": 3.057056427001953, "learning_rate": 9.946488294314382e-06, "loss": 0.6283, "num_input_tokens_seen": 3059744, "step": 2975 }, { "epoch": 1.9933110367892977, "grad_norm": 1.2653470039367676, "learning_rate": 9.963210702341138e-06, "loss": 0.4904, "num_input_tokens_seen": 3064736, "step": 2980 }, { "epoch": 1.9966555183946488, "grad_norm": 2.095123529434204, "learning_rate": 9.979933110367894e-06, "loss": 0.5425, "num_input_tokens_seen": 3069440, "step": 2985 }, { "epoch": 2.0, "grad_norm": 2.148932933807373, "learning_rate": 9.99665551839465e-06, "loss": 0.4825, "num_input_tokens_seen": 3073904, "step": 2990 }, { "epoch": 2.0, "eval_loss": 0.5422992706298828, "eval_runtime": 37.5948, "eval_samples_per_second": 39.766, "eval_steps_per_second": 9.948, "num_input_tokens_seen": 3073904, "step": 2990 }, { "epoch": 2.0033444816053514, "grad_norm": 1.8691082000732422, "learning_rate": 9.999999454829518e-06, "loss": 0.4913, "num_input_tokens_seen": 3078544, "step": 2995 }, { "epoch": 2.0066889632107023, "grad_norm": 2.274009943008423, "learning_rate": 9.999997240074637e-06, "loss": 0.4754, "num_input_tokens_seen": 3084048, "step": 3000 }, { "epoch": 2.0100334448160537, "grad_norm": 1.9465736150741577, "learning_rate": 9.999993321662959e-06, "loss": 0.5268, "num_input_tokens_seen": 3089264, "step": 3005 }, { "epoch": 2.0133779264214047, "grad_norm": 2.032933473587036, "learning_rate": 9.999987699595816e-06, "loss": 0.5399, "num_input_tokens_seen": 3094608, "step": 3010 }, { "epoch": 2.016722408026756, "grad_norm": 1.8117295503616333, "learning_rate": 9.999980373875125e-06, "loss": 0.5735, "num_input_tokens_seen": 3099600, "step": 3015 }, { "epoch": 2.020066889632107, "grad_norm": 1.3177047967910767, "learning_rate": 9.99997134450338e-06, "loss": 0.5196, "num_input_tokens_seen": 3105072, "step": 3020 }, { "epoch": 2.0234113712374584, "grad_norm": 1.583908200263977, "learning_rate": 9.99996061148366e-06, "loss": 0.5906, "num_input_tokens_seen": 3110064, "step": 3025 }, { "epoch": 2.0267558528428093, "grad_norm": 1.744986891746521, "learning_rate": 9.999948174819623e-06, "loss": 0.5494, "num_input_tokens_seen": 3115536, "step": 3030 }, { "epoch": 2.0301003344481607, "grad_norm": 1.284173607826233, "learning_rate": 9.999934034515504e-06, "loss": 0.5026, "num_input_tokens_seen": 3120752, "step": 3035 }, { "epoch": 2.0334448160535117, "grad_norm": 1.535584807395935, "learning_rate": 9.999918190576121e-06, "loss": 0.4376, "num_input_tokens_seen": 3125264, "step": 3040 }, { "epoch": 2.036789297658863, "grad_norm": 1.5154166221618652, "learning_rate": 9.999900643006875e-06, "loss": 0.5478, "num_input_tokens_seen": 3129936, "step": 3045 }, { "epoch": 2.040133779264214, "grad_norm": 1.676486611366272, "learning_rate": 9.999881391813742e-06, "loss": 0.5089, "num_input_tokens_seen": 3134672, "step": 3050 }, { "epoch": 2.0434782608695654, "grad_norm": 2.048649549484253, "learning_rate": 9.999860437003284e-06, "loss": 0.5812, "num_input_tokens_seen": 3140496, "step": 3055 }, { "epoch": 2.0468227424749164, "grad_norm": 1.520688533782959, "learning_rate": 9.999837778582641e-06, "loss": 0.4107, "num_input_tokens_seen": 3145104, "step": 3060 }, { "epoch": 2.0501672240802677, "grad_norm": 1.6133724451065063, "learning_rate": 9.99981341655953e-06, "loss": 0.5084, "num_input_tokens_seen": 3149904, "step": 3065 }, { "epoch": 2.0535117056856187, "grad_norm": 1.7276908159255981, "learning_rate": 9.999787350942257e-06, "loss": 0.514, "num_input_tokens_seen": 3155312, "step": 3070 }, { "epoch": 2.05685618729097, "grad_norm": 1.9027116298675537, "learning_rate": 9.999759581739699e-06, "loss": 0.4999, "num_input_tokens_seen": 3159728, "step": 3075 }, { "epoch": 2.060200668896321, "grad_norm": 1.5756064653396606, "learning_rate": 9.999730108961321e-06, "loss": 0.5776, "num_input_tokens_seen": 3164816, "step": 3080 }, { "epoch": 2.0635451505016724, "grad_norm": 1.2740424871444702, "learning_rate": 9.999698932617163e-06, "loss": 0.5268, "num_input_tokens_seen": 3169936, "step": 3085 }, { "epoch": 2.0668896321070234, "grad_norm": 1.4884837865829468, "learning_rate": 9.999666052717849e-06, "loss": 0.5815, "num_input_tokens_seen": 3175824, "step": 3090 }, { "epoch": 2.0702341137123748, "grad_norm": 2.3653104305267334, "learning_rate": 9.99963146927458e-06, "loss": 0.4778, "num_input_tokens_seen": 3180816, "step": 3095 }, { "epoch": 2.0735785953177257, "grad_norm": 2.6233749389648438, "learning_rate": 9.999595182299144e-06, "loss": 0.5383, "num_input_tokens_seen": 3186512, "step": 3100 }, { "epoch": 2.076923076923077, "grad_norm": 1.345012903213501, "learning_rate": 9.999557191803901e-06, "loss": 0.5839, "num_input_tokens_seen": 3192688, "step": 3105 }, { "epoch": 2.080267558528428, "grad_norm": 1.4181674718856812, "learning_rate": 9.999517497801798e-06, "loss": 0.6324, "num_input_tokens_seen": 3198032, "step": 3110 }, { "epoch": 2.0836120401337794, "grad_norm": 1.6220489740371704, "learning_rate": 9.99947610030636e-06, "loss": 0.4292, "num_input_tokens_seen": 3203536, "step": 3115 }, { "epoch": 2.0869565217391304, "grad_norm": 1.2016857862472534, "learning_rate": 9.99943299933169e-06, "loss": 0.4994, "num_input_tokens_seen": 3209168, "step": 3120 }, { "epoch": 2.0903010033444818, "grad_norm": 1.658652663230896, "learning_rate": 9.999388194892478e-06, "loss": 0.5216, "num_input_tokens_seen": 3214480, "step": 3125 }, { "epoch": 2.0936454849498327, "grad_norm": 2.2161900997161865, "learning_rate": 9.999341687003986e-06, "loss": 0.4887, "num_input_tokens_seen": 3219472, "step": 3130 }, { "epoch": 2.096989966555184, "grad_norm": 3.269920825958252, "learning_rate": 9.999293475682063e-06, "loss": 0.5303, "num_input_tokens_seen": 3223632, "step": 3135 }, { "epoch": 2.100334448160535, "grad_norm": 2.1268792152404785, "learning_rate": 9.999243560943134e-06, "loss": 0.5069, "num_input_tokens_seen": 3228208, "step": 3140 }, { "epoch": 2.1036789297658864, "grad_norm": 2.4108035564422607, "learning_rate": 9.999191942804211e-06, "loss": 0.5212, "num_input_tokens_seen": 3233200, "step": 3145 }, { "epoch": 2.1070234113712374, "grad_norm": 1.6486009359359741, "learning_rate": 9.999138621282878e-06, "loss": 0.5935, "num_input_tokens_seen": 3238672, "step": 3150 }, { "epoch": 2.1103678929765888, "grad_norm": 3.127081871032715, "learning_rate": 9.999083596397303e-06, "loss": 0.5402, "num_input_tokens_seen": 3244048, "step": 3155 }, { "epoch": 2.1137123745819397, "grad_norm": 2.462867259979248, "learning_rate": 9.999026868166238e-06, "loss": 0.5025, "num_input_tokens_seen": 3249872, "step": 3160 }, { "epoch": 2.117056856187291, "grad_norm": 1.81485116481781, "learning_rate": 9.998968436609009e-06, "loss": 0.5567, "num_input_tokens_seen": 3254480, "step": 3165 }, { "epoch": 2.120401337792642, "grad_norm": 1.6496129035949707, "learning_rate": 9.998908301745527e-06, "loss": 0.493, "num_input_tokens_seen": 3260016, "step": 3170 }, { "epoch": 2.1237458193979935, "grad_norm": 2.759265899658203, "learning_rate": 9.998846463596281e-06, "loss": 0.53, "num_input_tokens_seen": 3265040, "step": 3175 }, { "epoch": 2.1270903010033444, "grad_norm": 1.3989485502243042, "learning_rate": 9.998782922182345e-06, "loss": 0.435, "num_input_tokens_seen": 3270640, "step": 3180 }, { "epoch": 2.130434782608696, "grad_norm": 1.6625112295150757, "learning_rate": 9.998717677525362e-06, "loss": 0.5326, "num_input_tokens_seen": 3276208, "step": 3185 }, { "epoch": 2.1337792642140467, "grad_norm": 1.519641637802124, "learning_rate": 9.998650729647569e-06, "loss": 0.6087, "num_input_tokens_seen": 3281104, "step": 3190 }, { "epoch": 2.137123745819398, "grad_norm": 1.224361538887024, "learning_rate": 9.998582078571774e-06, "loss": 0.5017, "num_input_tokens_seen": 3286128, "step": 3195 }, { "epoch": 2.140468227424749, "grad_norm": 1.6894862651824951, "learning_rate": 9.998511724321373e-06, "loss": 0.539, "num_input_tokens_seen": 3290928, "step": 3200 }, { "epoch": 2.1438127090301005, "grad_norm": 1.992734432220459, "learning_rate": 9.998439666920333e-06, "loss": 0.5696, "num_input_tokens_seen": 3295856, "step": 3205 }, { "epoch": 2.1471571906354514, "grad_norm": 2.030518054962158, "learning_rate": 9.998365906393208e-06, "loss": 0.5362, "num_input_tokens_seen": 3300752, "step": 3210 }, { "epoch": 2.150501672240803, "grad_norm": 2.50451922416687, "learning_rate": 9.998290442765133e-06, "loss": 0.4776, "num_input_tokens_seen": 3305232, "step": 3215 }, { "epoch": 2.1538461538461537, "grad_norm": 1.1443310976028442, "learning_rate": 9.998213276061816e-06, "loss": 0.4591, "num_input_tokens_seen": 3310448, "step": 3220 }, { "epoch": 2.157190635451505, "grad_norm": 1.5334336757659912, "learning_rate": 9.998134406309555e-06, "loss": 0.5111, "num_input_tokens_seen": 3315344, "step": 3225 }, { "epoch": 2.160535117056856, "grad_norm": 2.8452951908111572, "learning_rate": 9.99805383353522e-06, "loss": 0.4492, "num_input_tokens_seen": 3320368, "step": 3230 }, { "epoch": 2.1638795986622075, "grad_norm": 2.118476390838623, "learning_rate": 9.997971557766263e-06, "loss": 0.5501, "num_input_tokens_seen": 3325712, "step": 3235 }, { "epoch": 2.1672240802675584, "grad_norm": 1.6566609144210815, "learning_rate": 9.997887579030726e-06, "loss": 0.516, "num_input_tokens_seen": 3331696, "step": 3240 }, { "epoch": 2.17056856187291, "grad_norm": 1.582995057106018, "learning_rate": 9.997801897357215e-06, "loss": 0.5693, "num_input_tokens_seen": 3337392, "step": 3245 }, { "epoch": 2.1739130434782608, "grad_norm": 1.3430899381637573, "learning_rate": 9.997714512774928e-06, "loss": 0.4501, "num_input_tokens_seen": 3342000, "step": 3250 }, { "epoch": 2.177257525083612, "grad_norm": 1.5479960441589355, "learning_rate": 9.997625425313638e-06, "loss": 0.543, "num_input_tokens_seen": 3347088, "step": 3255 }, { "epoch": 2.180602006688963, "grad_norm": 1.5842000246047974, "learning_rate": 9.997534635003702e-06, "loss": 0.51, "num_input_tokens_seen": 3352368, "step": 3260 }, { "epoch": 2.1839464882943145, "grad_norm": 1.9477663040161133, "learning_rate": 9.997442141876055e-06, "loss": 0.5521, "num_input_tokens_seen": 3358064, "step": 3265 }, { "epoch": 2.1872909698996654, "grad_norm": 1.1707500219345093, "learning_rate": 9.99734794596221e-06, "loss": 0.5847, "num_input_tokens_seen": 3364080, "step": 3270 }, { "epoch": 2.190635451505017, "grad_norm": 1.6332650184631348, "learning_rate": 9.997252047294263e-06, "loss": 0.5456, "num_input_tokens_seen": 3368784, "step": 3275 }, { "epoch": 2.1939799331103678, "grad_norm": 1.950452208518982, "learning_rate": 9.997154445904893e-06, "loss": 0.4734, "num_input_tokens_seen": 3374256, "step": 3280 }, { "epoch": 2.197324414715719, "grad_norm": 1.6316304206848145, "learning_rate": 9.997055141827352e-06, "loss": 0.4805, "num_input_tokens_seen": 3379760, "step": 3285 }, { "epoch": 2.20066889632107, "grad_norm": 2.2603535652160645, "learning_rate": 9.99695413509548e-06, "loss": 0.5061, "num_input_tokens_seen": 3384880, "step": 3290 }, { "epoch": 2.2040133779264215, "grad_norm": 1.5288833379745483, "learning_rate": 9.99685142574369e-06, "loss": 0.4398, "num_input_tokens_seen": 3389328, "step": 3295 }, { "epoch": 2.2073578595317724, "grad_norm": 2.5834877490997314, "learning_rate": 9.996747013806976e-06, "loss": 0.535, "num_input_tokens_seen": 3393808, "step": 3300 }, { "epoch": 2.210702341137124, "grad_norm": 1.6601572036743164, "learning_rate": 9.99664089932092e-06, "loss": 0.5029, "num_input_tokens_seen": 3399056, "step": 3305 }, { "epoch": 2.2140468227424748, "grad_norm": 1.7608741521835327, "learning_rate": 9.996533082321675e-06, "loss": 0.6349, "num_input_tokens_seen": 3404336, "step": 3310 }, { "epoch": 2.217391304347826, "grad_norm": 2.5549707412719727, "learning_rate": 9.996423562845981e-06, "loss": 0.5527, "num_input_tokens_seen": 3408848, "step": 3315 }, { "epoch": 2.220735785953177, "grad_norm": 2.038078546524048, "learning_rate": 9.996312340931152e-06, "loss": 0.5582, "num_input_tokens_seen": 3413712, "step": 3320 }, { "epoch": 2.2240802675585285, "grad_norm": 2.204190969467163, "learning_rate": 9.996199416615083e-06, "loss": 0.5083, "num_input_tokens_seen": 3420400, "step": 3325 }, { "epoch": 2.2274247491638794, "grad_norm": 1.5039803981781006, "learning_rate": 9.996084789936254e-06, "loss": 0.5653, "num_input_tokens_seen": 3426128, "step": 3330 }, { "epoch": 2.230769230769231, "grad_norm": 1.2664977312088013, "learning_rate": 9.995968460933723e-06, "loss": 0.5448, "num_input_tokens_seen": 3432368, "step": 3335 }, { "epoch": 2.234113712374582, "grad_norm": 1.9954105615615845, "learning_rate": 9.995850429647122e-06, "loss": 0.611, "num_input_tokens_seen": 3438160, "step": 3340 }, { "epoch": 2.237458193979933, "grad_norm": 2.4608354568481445, "learning_rate": 9.995730696116674e-06, "loss": 0.5113, "num_input_tokens_seen": 3442800, "step": 3345 }, { "epoch": 2.240802675585284, "grad_norm": 1.4868571758270264, "learning_rate": 9.99560926038317e-06, "loss": 0.4401, "num_input_tokens_seen": 3448336, "step": 3350 }, { "epoch": 2.2441471571906355, "grad_norm": 1.381503701210022, "learning_rate": 9.995486122487992e-06, "loss": 0.596, "num_input_tokens_seen": 3453616, "step": 3355 }, { "epoch": 2.2474916387959865, "grad_norm": 2.5108320713043213, "learning_rate": 9.995361282473095e-06, "loss": 0.5167, "num_input_tokens_seen": 3458928, "step": 3360 }, { "epoch": 2.250836120401338, "grad_norm": 1.6137175559997559, "learning_rate": 9.995234740381016e-06, "loss": 0.4401, "num_input_tokens_seen": 3464016, "step": 3365 }, { "epoch": 2.254180602006689, "grad_norm": 2.8481945991516113, "learning_rate": 9.995106496254872e-06, "loss": 0.4558, "num_input_tokens_seen": 3469168, "step": 3370 }, { "epoch": 2.25752508361204, "grad_norm": 1.9670405387878418, "learning_rate": 9.994976550138358e-06, "loss": 0.4972, "num_input_tokens_seen": 3474416, "step": 3375 }, { "epoch": 2.260869565217391, "grad_norm": 2.1028685569763184, "learning_rate": 9.994844902075754e-06, "loss": 0.5935, "num_input_tokens_seen": 3479824, "step": 3380 }, { "epoch": 2.2642140468227425, "grad_norm": 2.5825467109680176, "learning_rate": 9.994711552111912e-06, "loss": 0.5505, "num_input_tokens_seen": 3484656, "step": 3385 }, { "epoch": 2.2675585284280935, "grad_norm": 2.04675030708313, "learning_rate": 9.994576500292275e-06, "loss": 0.5434, "num_input_tokens_seen": 3489680, "step": 3390 }, { "epoch": 2.270903010033445, "grad_norm": 2.0894558429718018, "learning_rate": 9.994439746662855e-06, "loss": 0.5538, "num_input_tokens_seen": 3495248, "step": 3395 }, { "epoch": 2.274247491638796, "grad_norm": 2.3275341987609863, "learning_rate": 9.994301291270249e-06, "loss": 0.5128, "num_input_tokens_seen": 3500208, "step": 3400 }, { "epoch": 2.277591973244147, "grad_norm": 1.6889287233352661, "learning_rate": 9.994161134161635e-06, "loss": 0.4986, "num_input_tokens_seen": 3505776, "step": 3405 }, { "epoch": 2.280936454849498, "grad_norm": 1.662778615951538, "learning_rate": 9.994019275384765e-06, "loss": 0.5086, "num_input_tokens_seen": 3510416, "step": 3410 }, { "epoch": 2.2842809364548495, "grad_norm": 1.8161821365356445, "learning_rate": 9.993875714987977e-06, "loss": 0.5195, "num_input_tokens_seen": 3516144, "step": 3415 }, { "epoch": 2.2876254180602005, "grad_norm": 2.307082414627075, "learning_rate": 9.993730453020187e-06, "loss": 0.5312, "num_input_tokens_seen": 3521296, "step": 3420 }, { "epoch": 2.290969899665552, "grad_norm": 2.083392858505249, "learning_rate": 9.993583489530892e-06, "loss": 0.4947, "num_input_tokens_seen": 3526384, "step": 3425 }, { "epoch": 2.294314381270903, "grad_norm": 2.262705087661743, "learning_rate": 9.993434824570163e-06, "loss": 0.5293, "num_input_tokens_seen": 3532912, "step": 3430 }, { "epoch": 2.297658862876254, "grad_norm": 1.600924015045166, "learning_rate": 9.993284458188657e-06, "loss": 0.5442, "num_input_tokens_seen": 3537808, "step": 3435 }, { "epoch": 2.3010033444816056, "grad_norm": 1.2768847942352295, "learning_rate": 9.993132390437608e-06, "loss": 0.4252, "num_input_tokens_seen": 3543344, "step": 3440 }, { "epoch": 2.3043478260869565, "grad_norm": 1.6489802598953247, "learning_rate": 9.992978621368832e-06, "loss": 0.498, "num_input_tokens_seen": 3550224, "step": 3445 }, { "epoch": 2.3076923076923075, "grad_norm": 1.204017162322998, "learning_rate": 9.99282315103472e-06, "loss": 0.5209, "num_input_tokens_seen": 3555536, "step": 3450 }, { "epoch": 2.311036789297659, "grad_norm": 2.8220558166503906, "learning_rate": 9.992665979488249e-06, "loss": 0.415, "num_input_tokens_seen": 3561008, "step": 3455 }, { "epoch": 2.3143812709030103, "grad_norm": 2.8189949989318848, "learning_rate": 9.99250710678297e-06, "loss": 0.4615, "num_input_tokens_seen": 3566224, "step": 3460 }, { "epoch": 2.317725752508361, "grad_norm": 1.2150259017944336, "learning_rate": 9.992346532973017e-06, "loss": 0.4696, "num_input_tokens_seen": 3571152, "step": 3465 }, { "epoch": 2.321070234113712, "grad_norm": 2.515000820159912, "learning_rate": 9.992184258113103e-06, "loss": 0.5391, "num_input_tokens_seen": 3576048, "step": 3470 }, { "epoch": 2.3244147157190636, "grad_norm": 3.497997283935547, "learning_rate": 9.992020282258517e-06, "loss": 0.492, "num_input_tokens_seen": 3579984, "step": 3475 }, { "epoch": 2.327759197324415, "grad_norm": 1.740172266960144, "learning_rate": 9.991854605465135e-06, "loss": 0.4651, "num_input_tokens_seen": 3584944, "step": 3480 }, { "epoch": 2.331103678929766, "grad_norm": 1.795430302619934, "learning_rate": 9.991687227789407e-06, "loss": 0.5252, "num_input_tokens_seen": 3589744, "step": 3485 }, { "epoch": 2.334448160535117, "grad_norm": 3.0910141468048096, "learning_rate": 9.991518149288361e-06, "loss": 0.5047, "num_input_tokens_seen": 3595696, "step": 3490 }, { "epoch": 2.3377926421404682, "grad_norm": 3.21907901763916, "learning_rate": 9.991347370019611e-06, "loss": 0.5277, "num_input_tokens_seen": 3600784, "step": 3495 }, { "epoch": 2.3411371237458196, "grad_norm": 1.6298397779464722, "learning_rate": 9.991174890041344e-06, "loss": 0.5024, "num_input_tokens_seen": 3606256, "step": 3500 }, { "epoch": 2.3444816053511706, "grad_norm": 1.9687687158584595, "learning_rate": 9.991000709412333e-06, "loss": 0.5342, "num_input_tokens_seen": 3612464, "step": 3505 }, { "epoch": 2.3478260869565215, "grad_norm": 2.288707971572876, "learning_rate": 9.990824828191922e-06, "loss": 0.4949, "num_input_tokens_seen": 3618224, "step": 3510 }, { "epoch": 2.351170568561873, "grad_norm": 1.688173532485962, "learning_rate": 9.990647246440046e-06, "loss": 0.6959, "num_input_tokens_seen": 3623728, "step": 3515 }, { "epoch": 2.3545150501672243, "grad_norm": 3.1135685443878174, "learning_rate": 9.990467964217206e-06, "loss": 0.5387, "num_input_tokens_seen": 3628656, "step": 3520 }, { "epoch": 2.3578595317725752, "grad_norm": 1.0547856092453003, "learning_rate": 9.990286981584492e-06, "loss": 0.5257, "num_input_tokens_seen": 3633584, "step": 3525 }, { "epoch": 2.361204013377926, "grad_norm": 1.7259275913238525, "learning_rate": 9.99010429860357e-06, "loss": 0.5675, "num_input_tokens_seen": 3638608, "step": 3530 }, { "epoch": 2.3645484949832776, "grad_norm": 2.0148942470550537, "learning_rate": 9.989919915336687e-06, "loss": 0.5162, "num_input_tokens_seen": 3643504, "step": 3535 }, { "epoch": 2.367892976588629, "grad_norm": 1.4457169771194458, "learning_rate": 9.989733831846667e-06, "loss": 0.5245, "num_input_tokens_seen": 3648016, "step": 3540 }, { "epoch": 2.37123745819398, "grad_norm": 1.4870789051055908, "learning_rate": 9.989546048196914e-06, "loss": 0.5552, "num_input_tokens_seen": 3653520, "step": 3545 }, { "epoch": 2.374581939799331, "grad_norm": 2.101405382156372, "learning_rate": 9.989356564451415e-06, "loss": 0.5472, "num_input_tokens_seen": 3658992, "step": 3550 }, { "epoch": 2.3779264214046822, "grad_norm": 1.9766501188278198, "learning_rate": 9.98916538067473e-06, "loss": 0.509, "num_input_tokens_seen": 3664752, "step": 3555 }, { "epoch": 2.3812709030100336, "grad_norm": 2.969620704650879, "learning_rate": 9.988972496932001e-06, "loss": 0.6046, "num_input_tokens_seen": 3670640, "step": 3560 }, { "epoch": 2.3846153846153846, "grad_norm": 1.547834038734436, "learning_rate": 9.98877791328895e-06, "loss": 0.4999, "num_input_tokens_seen": 3676528, "step": 3565 }, { "epoch": 2.387959866220736, "grad_norm": 2.300459146499634, "learning_rate": 9.988581629811879e-06, "loss": 0.557, "num_input_tokens_seen": 3681744, "step": 3570 }, { "epoch": 2.391304347826087, "grad_norm": 1.7051680088043213, "learning_rate": 9.98838364656767e-06, "loss": 0.4102, "num_input_tokens_seen": 3686256, "step": 3575 }, { "epoch": 2.3946488294314383, "grad_norm": 2.784123420715332, "learning_rate": 9.988183963623777e-06, "loss": 0.4925, "num_input_tokens_seen": 3690864, "step": 3580 }, { "epoch": 2.3979933110367893, "grad_norm": 1.1824426651000977, "learning_rate": 9.987982581048243e-06, "loss": 0.4782, "num_input_tokens_seen": 3695504, "step": 3585 }, { "epoch": 2.4013377926421406, "grad_norm": 1.697507619857788, "learning_rate": 9.98777949890968e-06, "loss": 0.5862, "num_input_tokens_seen": 3700688, "step": 3590 }, { "epoch": 2.4046822742474916, "grad_norm": 1.0430241823196411, "learning_rate": 9.987574717277291e-06, "loss": 0.4334, "num_input_tokens_seen": 3706640, "step": 3595 }, { "epoch": 2.408026755852843, "grad_norm": 1.9729982614517212, "learning_rate": 9.987368236220848e-06, "loss": 0.4817, "num_input_tokens_seen": 3711280, "step": 3600 }, { "epoch": 2.411371237458194, "grad_norm": 2.155010223388672, "learning_rate": 9.987160055810703e-06, "loss": 0.5237, "num_input_tokens_seen": 3715856, "step": 3605 }, { "epoch": 2.4147157190635453, "grad_norm": 2.006974458694458, "learning_rate": 9.986950176117795e-06, "loss": 0.5506, "num_input_tokens_seen": 3721584, "step": 3610 }, { "epoch": 2.4180602006688963, "grad_norm": 1.3970003128051758, "learning_rate": 9.986738597213633e-06, "loss": 0.4899, "num_input_tokens_seen": 3726928, "step": 3615 }, { "epoch": 2.4214046822742477, "grad_norm": 1.6773433685302734, "learning_rate": 9.98652531917031e-06, "loss": 0.5753, "num_input_tokens_seen": 3732496, "step": 3620 }, { "epoch": 2.4247491638795986, "grad_norm": 2.1741251945495605, "learning_rate": 9.986310342060499e-06, "loss": 0.5491, "num_input_tokens_seen": 3737296, "step": 3625 }, { "epoch": 2.42809364548495, "grad_norm": 1.2344439029693604, "learning_rate": 9.986093665957444e-06, "loss": 0.5442, "num_input_tokens_seen": 3742480, "step": 3630 }, { "epoch": 2.431438127090301, "grad_norm": 2.4036009311676025, "learning_rate": 9.985875290934977e-06, "loss": 0.5232, "num_input_tokens_seen": 3747248, "step": 3635 }, { "epoch": 2.4347826086956523, "grad_norm": 1.914934754371643, "learning_rate": 9.985655217067504e-06, "loss": 0.5497, "num_input_tokens_seen": 3752112, "step": 3640 }, { "epoch": 2.4381270903010033, "grad_norm": 2.008012533187866, "learning_rate": 9.985433444430011e-06, "loss": 0.4878, "num_input_tokens_seen": 3757392, "step": 3645 }, { "epoch": 2.4414715719063547, "grad_norm": 1.7503114938735962, "learning_rate": 9.985209973098064e-06, "loss": 0.4808, "num_input_tokens_seen": 3762608, "step": 3650 }, { "epoch": 2.4448160535117056, "grad_norm": 1.6985198259353638, "learning_rate": 9.984984803147807e-06, "loss": 0.4577, "num_input_tokens_seen": 3768144, "step": 3655 }, { "epoch": 2.448160535117057, "grad_norm": 2.1960630416870117, "learning_rate": 9.984757934655962e-06, "loss": 0.5207, "num_input_tokens_seen": 3773168, "step": 3660 }, { "epoch": 2.451505016722408, "grad_norm": 1.6843701601028442, "learning_rate": 9.98452936769983e-06, "loss": 0.5222, "num_input_tokens_seen": 3778096, "step": 3665 }, { "epoch": 2.4548494983277593, "grad_norm": 2.281360387802124, "learning_rate": 9.984299102357292e-06, "loss": 0.6042, "num_input_tokens_seen": 3783152, "step": 3670 }, { "epoch": 2.4581939799331103, "grad_norm": 1.786924958229065, "learning_rate": 9.984067138706803e-06, "loss": 0.5904, "num_input_tokens_seen": 3788944, "step": 3675 }, { "epoch": 2.4615384615384617, "grad_norm": 1.1074966192245483, "learning_rate": 9.983833476827404e-06, "loss": 0.546, "num_input_tokens_seen": 3794544, "step": 3680 }, { "epoch": 2.4648829431438126, "grad_norm": 1.1233205795288086, "learning_rate": 9.98359811679871e-06, "loss": 0.6084, "num_input_tokens_seen": 3800432, "step": 3685 }, { "epoch": 2.468227424749164, "grad_norm": 1.8534961938858032, "learning_rate": 9.983361058700916e-06, "loss": 0.5208, "num_input_tokens_seen": 3805808, "step": 3690 }, { "epoch": 2.471571906354515, "grad_norm": 1.8997340202331543, "learning_rate": 9.983122302614793e-06, "loss": 0.5127, "num_input_tokens_seen": 3811248, "step": 3695 }, { "epoch": 2.4749163879598663, "grad_norm": 1.467244267463684, "learning_rate": 9.982881848621697e-06, "loss": 0.4403, "num_input_tokens_seen": 3816464, "step": 3700 }, { "epoch": 2.4782608695652173, "grad_norm": 1.396279215812683, "learning_rate": 9.982639696803555e-06, "loss": 0.5274, "num_input_tokens_seen": 3820880, "step": 3705 }, { "epoch": 2.4816053511705687, "grad_norm": 1.4927785396575928, "learning_rate": 9.982395847242877e-06, "loss": 0.6341, "num_input_tokens_seen": 3826160, "step": 3710 }, { "epoch": 2.4849498327759196, "grad_norm": 1.9265084266662598, "learning_rate": 9.982150300022748e-06, "loss": 0.5361, "num_input_tokens_seen": 3831760, "step": 3715 }, { "epoch": 2.488294314381271, "grad_norm": 1.5338785648345947, "learning_rate": 9.981903055226836e-06, "loss": 0.5527, "num_input_tokens_seen": 3836720, "step": 3720 }, { "epoch": 2.491638795986622, "grad_norm": 1.8446769714355469, "learning_rate": 9.981654112939386e-06, "loss": 0.5189, "num_input_tokens_seen": 3842320, "step": 3725 }, { "epoch": 2.4949832775919734, "grad_norm": 2.0833640098571777, "learning_rate": 9.981403473245218e-06, "loss": 0.472, "num_input_tokens_seen": 3846448, "step": 3730 }, { "epoch": 2.4983277591973243, "grad_norm": 1.777630090713501, "learning_rate": 9.981151136229731e-06, "loss": 0.5103, "num_input_tokens_seen": 3851632, "step": 3735 }, { "epoch": 2.5016722408026757, "grad_norm": 1.5390561819076538, "learning_rate": 9.980897101978911e-06, "loss": 0.5033, "num_input_tokens_seen": 3856944, "step": 3740 }, { "epoch": 2.5050167224080266, "grad_norm": 1.4436044692993164, "learning_rate": 9.98064137057931e-06, "loss": 0.5428, "num_input_tokens_seen": 3861776, "step": 3745 }, { "epoch": 2.508361204013378, "grad_norm": 1.7429101467132568, "learning_rate": 9.980383942118066e-06, "loss": 0.6213, "num_input_tokens_seen": 3868016, "step": 3750 }, { "epoch": 2.511705685618729, "grad_norm": 1.2476568222045898, "learning_rate": 9.980124816682891e-06, "loss": 0.5502, "num_input_tokens_seen": 3873520, "step": 3755 }, { "epoch": 2.5150501672240804, "grad_norm": 1.5218292474746704, "learning_rate": 9.979863994362078e-06, "loss": 0.4265, "num_input_tokens_seen": 3878160, "step": 3760 }, { "epoch": 2.5183946488294313, "grad_norm": 1.5158296823501587, "learning_rate": 9.9796014752445e-06, "loss": 0.5051, "num_input_tokens_seen": 3883408, "step": 3765 }, { "epoch": 2.5217391304347827, "grad_norm": 1.5983504056930542, "learning_rate": 9.979337259419602e-06, "loss": 0.5044, "num_input_tokens_seen": 3888592, "step": 3770 }, { "epoch": 2.5250836120401337, "grad_norm": 1.8046156167984009, "learning_rate": 9.979071346977414e-06, "loss": 0.6101, "num_input_tokens_seen": 3893456, "step": 3775 }, { "epoch": 2.528428093645485, "grad_norm": 3.148212194442749, "learning_rate": 9.978803738008536e-06, "loss": 0.5161, "num_input_tokens_seen": 3898544, "step": 3780 }, { "epoch": 2.531772575250836, "grad_norm": 1.391937494277954, "learning_rate": 9.978534432604155e-06, "loss": 0.5318, "num_input_tokens_seen": 3903472, "step": 3785 }, { "epoch": 2.5351170568561874, "grad_norm": 2.0109899044036865, "learning_rate": 9.97826343085603e-06, "loss": 0.4983, "num_input_tokens_seen": 3908560, "step": 3790 }, { "epoch": 2.5384615384615383, "grad_norm": 1.4073151350021362, "learning_rate": 9.977990732856502e-06, "loss": 0.512, "num_input_tokens_seen": 3913712, "step": 3795 }, { "epoch": 2.5418060200668897, "grad_norm": 1.4403799772262573, "learning_rate": 9.977716338698485e-06, "loss": 0.5654, "num_input_tokens_seen": 3918960, "step": 3800 }, { "epoch": 2.5451505016722407, "grad_norm": 1.3154020309448242, "learning_rate": 9.977440248475475e-06, "loss": 0.5622, "num_input_tokens_seen": 3923856, "step": 3805 }, { "epoch": 2.548494983277592, "grad_norm": 1.486240267753601, "learning_rate": 9.977162462281544e-06, "loss": 0.5702, "num_input_tokens_seen": 3930224, "step": 3810 }, { "epoch": 2.551839464882943, "grad_norm": 1.8069722652435303, "learning_rate": 9.976882980211345e-06, "loss": 0.4982, "num_input_tokens_seen": 3935472, "step": 3815 }, { "epoch": 2.5551839464882944, "grad_norm": 1.428803563117981, "learning_rate": 9.976601802360102e-06, "loss": 0.5416, "num_input_tokens_seen": 3940912, "step": 3820 }, { "epoch": 2.5585284280936453, "grad_norm": 1.7699735164642334, "learning_rate": 9.976318928823625e-06, "loss": 0.478, "num_input_tokens_seen": 3946128, "step": 3825 }, { "epoch": 2.5618729096989967, "grad_norm": 1.2180256843566895, "learning_rate": 9.976034359698296e-06, "loss": 0.4247, "num_input_tokens_seen": 3951024, "step": 3830 }, { "epoch": 2.5652173913043477, "grad_norm": 2.1061630249023438, "learning_rate": 9.975748095081078e-06, "loss": 0.5593, "num_input_tokens_seen": 3956304, "step": 3835 }, { "epoch": 2.568561872909699, "grad_norm": 1.3598883152008057, "learning_rate": 9.975460135069509e-06, "loss": 0.4648, "num_input_tokens_seen": 3961168, "step": 3840 }, { "epoch": 2.57190635451505, "grad_norm": 1.8531746864318848, "learning_rate": 9.975170479761706e-06, "loss": 0.5719, "num_input_tokens_seen": 3966192, "step": 3845 }, { "epoch": 2.5752508361204014, "grad_norm": 1.3966381549835205, "learning_rate": 9.974879129256365e-06, "loss": 0.473, "num_input_tokens_seen": 3971248, "step": 3850 }, { "epoch": 2.5785953177257523, "grad_norm": 1.312935709953308, "learning_rate": 9.974586083652758e-06, "loss": 0.5266, "num_input_tokens_seen": 3976464, "step": 3855 }, { "epoch": 2.5819397993311037, "grad_norm": 2.0301895141601562, "learning_rate": 9.974291343050735e-06, "loss": 0.4974, "num_input_tokens_seen": 3982384, "step": 3860 }, { "epoch": 2.585284280936455, "grad_norm": 2.068810224533081, "learning_rate": 9.973994907550722e-06, "loss": 0.5755, "num_input_tokens_seen": 3987760, "step": 3865 }, { "epoch": 2.588628762541806, "grad_norm": 1.2280423641204834, "learning_rate": 9.973696777253726e-06, "loss": 0.5, "num_input_tokens_seen": 3994064, "step": 3870 }, { "epoch": 2.591973244147157, "grad_norm": 1.3790693283081055, "learning_rate": 9.973396952261327e-06, "loss": 0.4783, "num_input_tokens_seen": 3999120, "step": 3875 }, { "epoch": 2.5953177257525084, "grad_norm": 1.721253752708435, "learning_rate": 9.973095432675687e-06, "loss": 0.5619, "num_input_tokens_seen": 4003952, "step": 3880 }, { "epoch": 2.59866220735786, "grad_norm": 1.709930419921875, "learning_rate": 9.972792218599543e-06, "loss": 0.4066, "num_input_tokens_seen": 4008432, "step": 3885 }, { "epoch": 2.6020066889632107, "grad_norm": 1.7424228191375732, "learning_rate": 9.972487310136207e-06, "loss": 0.4396, "num_input_tokens_seen": 4013136, "step": 3890 }, { "epoch": 2.6053511705685617, "grad_norm": 1.5966991186141968, "learning_rate": 9.972180707389574e-06, "loss": 0.5298, "num_input_tokens_seen": 4017904, "step": 3895 }, { "epoch": 2.608695652173913, "grad_norm": 1.572526454925537, "learning_rate": 9.971872410464111e-06, "loss": 0.5443, "num_input_tokens_seen": 4023312, "step": 3900 }, { "epoch": 2.6120401337792645, "grad_norm": 2.1163768768310547, "learning_rate": 9.971562419464868e-06, "loss": 0.5644, "num_input_tokens_seen": 4028784, "step": 3905 }, { "epoch": 2.6153846153846154, "grad_norm": 1.5274107456207275, "learning_rate": 9.971250734497463e-06, "loss": 0.4936, "num_input_tokens_seen": 4033520, "step": 3910 }, { "epoch": 2.6187290969899664, "grad_norm": 1.636322021484375, "learning_rate": 9.970937355668104e-06, "loss": 0.4693, "num_input_tokens_seen": 4038672, "step": 3915 }, { "epoch": 2.6220735785953178, "grad_norm": 1.7022308111190796, "learning_rate": 9.970622283083564e-06, "loss": 0.5405, "num_input_tokens_seen": 4043984, "step": 3920 }, { "epoch": 2.625418060200669, "grad_norm": 2.1801552772521973, "learning_rate": 9.970305516851199e-06, "loss": 0.5195, "num_input_tokens_seen": 4048464, "step": 3925 }, { "epoch": 2.62876254180602, "grad_norm": 1.3403230905532837, "learning_rate": 9.969987057078942e-06, "loss": 0.4515, "num_input_tokens_seen": 4053264, "step": 3930 }, { "epoch": 2.632107023411371, "grad_norm": 1.3730032444000244, "learning_rate": 9.969666903875301e-06, "loss": 0.6012, "num_input_tokens_seen": 4057936, "step": 3935 }, { "epoch": 2.6354515050167224, "grad_norm": 1.4381893873214722, "learning_rate": 9.969345057349365e-06, "loss": 0.4998, "num_input_tokens_seen": 4063792, "step": 3940 }, { "epoch": 2.638795986622074, "grad_norm": 2.1932754516601562, "learning_rate": 9.969021517610794e-06, "loss": 0.5929, "num_input_tokens_seen": 4069328, "step": 3945 }, { "epoch": 2.6421404682274248, "grad_norm": 1.6316767930984497, "learning_rate": 9.96869628476983e-06, "loss": 0.6676, "num_input_tokens_seen": 4074736, "step": 3950 }, { "epoch": 2.6454849498327757, "grad_norm": 1.4866325855255127, "learning_rate": 9.96836935893729e-06, "loss": 0.4816, "num_input_tokens_seen": 4080208, "step": 3955 }, { "epoch": 2.648829431438127, "grad_norm": 1.285139799118042, "learning_rate": 9.968040740224569e-06, "loss": 0.5804, "num_input_tokens_seen": 4085680, "step": 3960 }, { "epoch": 2.6521739130434785, "grad_norm": 1.6043790578842163, "learning_rate": 9.967710428743636e-06, "loss": 0.5021, "num_input_tokens_seen": 4090256, "step": 3965 }, { "epoch": 2.6555183946488294, "grad_norm": 1.923613429069519, "learning_rate": 9.967378424607037e-06, "loss": 0.5872, "num_input_tokens_seen": 4095088, "step": 3970 }, { "epoch": 2.6588628762541804, "grad_norm": 1.8946343660354614, "learning_rate": 9.9670447279279e-06, "loss": 0.4645, "num_input_tokens_seen": 4100272, "step": 3975 }, { "epoch": 2.6622073578595318, "grad_norm": 1.1805360317230225, "learning_rate": 9.966709338819925e-06, "loss": 0.5237, "num_input_tokens_seen": 4105968, "step": 3980 }, { "epoch": 2.665551839464883, "grad_norm": 1.427530288696289, "learning_rate": 9.966372257397387e-06, "loss": 0.468, "num_input_tokens_seen": 4110320, "step": 3985 }, { "epoch": 2.668896321070234, "grad_norm": 2.8043975830078125, "learning_rate": 9.966033483775146e-06, "loss": 0.6066, "num_input_tokens_seen": 4116240, "step": 3990 }, { "epoch": 2.672240802675585, "grad_norm": 2.3061139583587646, "learning_rate": 9.965693018068625e-06, "loss": 0.5172, "num_input_tokens_seen": 4122032, "step": 3995 }, { "epoch": 2.6755852842809364, "grad_norm": 1.6675879955291748, "learning_rate": 9.96535086039384e-06, "loss": 0.5652, "num_input_tokens_seen": 4127312, "step": 4000 }, { "epoch": 2.678929765886288, "grad_norm": 1.398923397064209, "learning_rate": 9.965007010867366e-06, "loss": 0.5447, "num_input_tokens_seen": 4132400, "step": 4005 }, { "epoch": 2.682274247491639, "grad_norm": 1.7074272632598877, "learning_rate": 9.96466146960637e-06, "loss": 0.5957, "num_input_tokens_seen": 4137840, "step": 4010 }, { "epoch": 2.6856187290969897, "grad_norm": 2.5418455600738525, "learning_rate": 9.964314236728587e-06, "loss": 0.5223, "num_input_tokens_seen": 4143152, "step": 4015 }, { "epoch": 2.688963210702341, "grad_norm": 3.0198843479156494, "learning_rate": 9.963965312352328e-06, "loss": 0.4924, "num_input_tokens_seen": 4148528, "step": 4020 }, { "epoch": 2.6923076923076925, "grad_norm": 1.5188360214233398, "learning_rate": 9.963614696596486e-06, "loss": 0.4905, "num_input_tokens_seen": 4153840, "step": 4025 }, { "epoch": 2.6956521739130435, "grad_norm": 1.3100861310958862, "learning_rate": 9.963262389580527e-06, "loss": 0.3818, "num_input_tokens_seen": 4159376, "step": 4030 }, { "epoch": 2.6989966555183944, "grad_norm": 1.2499024868011475, "learning_rate": 9.962908391424488e-06, "loss": 0.5241, "num_input_tokens_seen": 4164816, "step": 4035 }, { "epoch": 2.702341137123746, "grad_norm": 1.2074987888336182, "learning_rate": 9.962552702248993e-06, "loss": 0.4625, "num_input_tokens_seen": 4169392, "step": 4040 }, { "epoch": 2.705685618729097, "grad_norm": 1.4957832098007202, "learning_rate": 9.962195322175231e-06, "loss": 0.5116, "num_input_tokens_seen": 4174128, "step": 4045 }, { "epoch": 2.709030100334448, "grad_norm": 1.9006519317626953, "learning_rate": 9.961836251324979e-06, "loss": 0.4785, "num_input_tokens_seen": 4179152, "step": 4050 }, { "epoch": 2.712374581939799, "grad_norm": 1.7682338953018188, "learning_rate": 9.96147548982058e-06, "loss": 0.5289, "num_input_tokens_seen": 4184112, "step": 4055 }, { "epoch": 2.7157190635451505, "grad_norm": 1.5431747436523438, "learning_rate": 9.961113037784958e-06, "loss": 0.5529, "num_input_tokens_seen": 4188944, "step": 4060 }, { "epoch": 2.719063545150502, "grad_norm": 1.8076133728027344, "learning_rate": 9.96074889534161e-06, "loss": 0.5059, "num_input_tokens_seen": 4194128, "step": 4065 }, { "epoch": 2.722408026755853, "grad_norm": 1.9092835187911987, "learning_rate": 9.960383062614614e-06, "loss": 0.5187, "num_input_tokens_seen": 4199632, "step": 4070 }, { "epoch": 2.7257525083612038, "grad_norm": 1.6259812116622925, "learning_rate": 9.960015539728617e-06, "loss": 0.6147, "num_input_tokens_seen": 4205424, "step": 4075 }, { "epoch": 2.729096989966555, "grad_norm": 1.6639596223831177, "learning_rate": 9.959646326808848e-06, "loss": 0.6046, "num_input_tokens_seen": 4210416, "step": 4080 }, { "epoch": 2.7324414715719065, "grad_norm": 1.1819326877593994, "learning_rate": 9.95927542398111e-06, "loss": 0.4712, "num_input_tokens_seen": 4216080, "step": 4085 }, { "epoch": 2.7357859531772575, "grad_norm": 1.7918132543563843, "learning_rate": 9.95890283137178e-06, "loss": 0.5201, "num_input_tokens_seen": 4220880, "step": 4090 }, { "epoch": 2.7391304347826084, "grad_norm": 2.133619785308838, "learning_rate": 9.958528549107812e-06, "loss": 0.5162, "num_input_tokens_seen": 4225168, "step": 4095 }, { "epoch": 2.74247491638796, "grad_norm": 1.717909812927246, "learning_rate": 9.958152577316736e-06, "loss": 0.5393, "num_input_tokens_seen": 4229584, "step": 4100 }, { "epoch": 2.745819397993311, "grad_norm": 2.080984115600586, "learning_rate": 9.957774916126657e-06, "loss": 0.5384, "num_input_tokens_seen": 4235568, "step": 4105 }, { "epoch": 2.749163879598662, "grad_norm": 1.4108389616012573, "learning_rate": 9.957395565666256e-06, "loss": 0.4586, "num_input_tokens_seen": 4240336, "step": 4110 }, { "epoch": 2.7525083612040135, "grad_norm": 2.899001121520996, "learning_rate": 9.957014526064794e-06, "loss": 0.5019, "num_input_tokens_seen": 4245360, "step": 4115 }, { "epoch": 2.7558528428093645, "grad_norm": 1.522618293762207, "learning_rate": 9.956631797452096e-06, "loss": 0.4969, "num_input_tokens_seen": 4250736, "step": 4120 }, { "epoch": 2.759197324414716, "grad_norm": 2.4425594806671143, "learning_rate": 9.956247379958575e-06, "loss": 0.4193, "num_input_tokens_seen": 4255600, "step": 4125 }, { "epoch": 2.762541806020067, "grad_norm": 1.345755934715271, "learning_rate": 9.955861273715213e-06, "loss": 0.4701, "num_input_tokens_seen": 4260848, "step": 4130 }, { "epoch": 2.765886287625418, "grad_norm": 1.5376501083374023, "learning_rate": 9.955473478853567e-06, "loss": 0.4582, "num_input_tokens_seen": 4264976, "step": 4135 }, { "epoch": 2.769230769230769, "grad_norm": 1.1733580827713013, "learning_rate": 9.955083995505772e-06, "loss": 0.5082, "num_input_tokens_seen": 4270064, "step": 4140 }, { "epoch": 2.7725752508361206, "grad_norm": 1.506137490272522, "learning_rate": 9.954692823804537e-06, "loss": 0.5422, "num_input_tokens_seen": 4274800, "step": 4145 }, { "epoch": 2.7759197324414715, "grad_norm": 1.4906256198883057, "learning_rate": 9.954299963883148e-06, "loss": 0.4785, "num_input_tokens_seen": 4278896, "step": 4150 }, { "epoch": 2.779264214046823, "grad_norm": 1.1847907304763794, "learning_rate": 9.953905415875462e-06, "loss": 0.447, "num_input_tokens_seen": 4284208, "step": 4155 }, { "epoch": 2.782608695652174, "grad_norm": 1.5204858779907227, "learning_rate": 9.953509179915917e-06, "loss": 0.5527, "num_input_tokens_seen": 4288784, "step": 4160 }, { "epoch": 2.7859531772575252, "grad_norm": 1.7963597774505615, "learning_rate": 9.95311125613952e-06, "loss": 0.4637, "num_input_tokens_seen": 4293872, "step": 4165 }, { "epoch": 2.789297658862876, "grad_norm": 1.473549246788025, "learning_rate": 9.952711644681859e-06, "loss": 0.5134, "num_input_tokens_seen": 4298832, "step": 4170 }, { "epoch": 2.7926421404682276, "grad_norm": 2.0215342044830322, "learning_rate": 9.952310345679093e-06, "loss": 0.5377, "num_input_tokens_seen": 4303888, "step": 4175 }, { "epoch": 2.7959866220735785, "grad_norm": 1.7222745418548584, "learning_rate": 9.951907359267957e-06, "loss": 0.5551, "num_input_tokens_seen": 4309072, "step": 4180 }, { "epoch": 2.79933110367893, "grad_norm": 1.8279701471328735, "learning_rate": 9.95150268558576e-06, "loss": 0.5277, "num_input_tokens_seen": 4314288, "step": 4185 }, { "epoch": 2.802675585284281, "grad_norm": 2.98844838142395, "learning_rate": 9.95109632477039e-06, "loss": 0.4122, "num_input_tokens_seen": 4320240, "step": 4190 }, { "epoch": 2.8060200668896322, "grad_norm": 2.4137275218963623, "learning_rate": 9.950688276960306e-06, "loss": 0.488, "num_input_tokens_seen": 4326096, "step": 4195 }, { "epoch": 2.809364548494983, "grad_norm": 3.412095308303833, "learning_rate": 9.95027854229454e-06, "loss": 0.5285, "num_input_tokens_seen": 4330352, "step": 4200 }, { "epoch": 2.8127090301003346, "grad_norm": 2.1294093132019043, "learning_rate": 9.949867120912705e-06, "loss": 0.4656, "num_input_tokens_seen": 4335440, "step": 4205 }, { "epoch": 2.8160535117056855, "grad_norm": 2.210012435913086, "learning_rate": 9.949454012954985e-06, "loss": 0.4324, "num_input_tokens_seen": 4340272, "step": 4210 }, { "epoch": 2.819397993311037, "grad_norm": 1.2658634185791016, "learning_rate": 9.949039218562138e-06, "loss": 0.6158, "num_input_tokens_seen": 4345360, "step": 4215 }, { "epoch": 2.822742474916388, "grad_norm": 1.0943318605422974, "learning_rate": 9.948622737875496e-06, "loss": 0.5065, "num_input_tokens_seen": 4350608, "step": 4220 }, { "epoch": 2.8260869565217392, "grad_norm": 2.4344067573547363, "learning_rate": 9.948204571036968e-06, "loss": 0.5129, "num_input_tokens_seen": 4355472, "step": 4225 }, { "epoch": 2.82943143812709, "grad_norm": 2.122096538543701, "learning_rate": 9.94778471818904e-06, "loss": 0.55, "num_input_tokens_seen": 4359856, "step": 4230 }, { "epoch": 2.8327759197324416, "grad_norm": 1.84500253200531, "learning_rate": 9.947363179474765e-06, "loss": 0.5661, "num_input_tokens_seen": 4365360, "step": 4235 }, { "epoch": 2.8361204013377925, "grad_norm": 1.3218168020248413, "learning_rate": 9.946939955037776e-06, "loss": 0.5616, "num_input_tokens_seen": 4371312, "step": 4240 }, { "epoch": 2.839464882943144, "grad_norm": 1.4573712348937988, "learning_rate": 9.946515045022278e-06, "loss": 0.5638, "num_input_tokens_seen": 4376560, "step": 4245 }, { "epoch": 2.842809364548495, "grad_norm": 1.6367191076278687, "learning_rate": 9.946088449573052e-06, "loss": 0.5028, "num_input_tokens_seen": 4382448, "step": 4250 }, { "epoch": 2.8461538461538463, "grad_norm": 2.0350191593170166, "learning_rate": 9.945660168835451e-06, "loss": 0.4924, "num_input_tokens_seen": 4387952, "step": 4255 }, { "epoch": 2.849498327759197, "grad_norm": 2.8543221950531006, "learning_rate": 9.945230202955408e-06, "loss": 0.5147, "num_input_tokens_seen": 4393392, "step": 4260 }, { "epoch": 2.8528428093645486, "grad_norm": 1.9280180931091309, "learning_rate": 9.944798552079422e-06, "loss": 0.6042, "num_input_tokens_seen": 4398064, "step": 4265 }, { "epoch": 2.8561872909698995, "grad_norm": 1.807050347328186, "learning_rate": 9.944365216354573e-06, "loss": 0.5735, "num_input_tokens_seen": 4403056, "step": 4270 }, { "epoch": 2.859531772575251, "grad_norm": 1.479434847831726, "learning_rate": 9.94393019592851e-06, "loss": 0.5536, "num_input_tokens_seen": 4408784, "step": 4275 }, { "epoch": 2.862876254180602, "grad_norm": 2.081637382507324, "learning_rate": 9.943493490949456e-06, "loss": 0.5614, "num_input_tokens_seen": 4414256, "step": 4280 }, { "epoch": 2.8662207357859533, "grad_norm": 1.2996973991394043, "learning_rate": 9.943055101566215e-06, "loss": 0.5002, "num_input_tokens_seen": 4419824, "step": 4285 }, { "epoch": 2.869565217391304, "grad_norm": 1.4030979871749878, "learning_rate": 9.942615027928157e-06, "loss": 0.4858, "num_input_tokens_seen": 4425040, "step": 4290 }, { "epoch": 2.8729096989966556, "grad_norm": 1.627907156944275, "learning_rate": 9.942173270185228e-06, "loss": 0.4659, "num_input_tokens_seen": 4430096, "step": 4295 }, { "epoch": 2.8762541806020065, "grad_norm": 2.371788501739502, "learning_rate": 9.941729828487953e-06, "loss": 0.5111, "num_input_tokens_seen": 4434864, "step": 4300 }, { "epoch": 2.879598662207358, "grad_norm": 1.8241878747940063, "learning_rate": 9.941284702987426e-06, "loss": 0.4807, "num_input_tokens_seen": 4440400, "step": 4305 }, { "epoch": 2.882943143812709, "grad_norm": 2.138018846511841, "learning_rate": 9.940837893835312e-06, "loss": 0.5167, "num_input_tokens_seen": 4445264, "step": 4310 }, { "epoch": 2.8862876254180603, "grad_norm": 1.7725707292556763, "learning_rate": 9.940389401183854e-06, "loss": 0.5863, "num_input_tokens_seen": 4451184, "step": 4315 }, { "epoch": 2.8896321070234112, "grad_norm": 1.4269754886627197, "learning_rate": 9.939939225185868e-06, "loss": 0.5467, "num_input_tokens_seen": 4456432, "step": 4320 }, { "epoch": 2.8929765886287626, "grad_norm": 1.3487333059310913, "learning_rate": 9.939487365994744e-06, "loss": 0.5262, "num_input_tokens_seen": 4461296, "step": 4325 }, { "epoch": 2.8963210702341136, "grad_norm": 2.5628864765167236, "learning_rate": 9.939033823764443e-06, "loss": 0.525, "num_input_tokens_seen": 4465744, "step": 4330 }, { "epoch": 2.899665551839465, "grad_norm": 2.3834850788116455, "learning_rate": 9.938578598649502e-06, "loss": 0.5374, "num_input_tokens_seen": 4470832, "step": 4335 }, { "epoch": 2.903010033444816, "grad_norm": 2.4154272079467773, "learning_rate": 9.938121690805031e-06, "loss": 0.4475, "num_input_tokens_seen": 4475280, "step": 4340 }, { "epoch": 2.9063545150501673, "grad_norm": 1.419203281402588, "learning_rate": 9.937663100386715e-06, "loss": 0.5022, "num_input_tokens_seen": 4480208, "step": 4345 }, { "epoch": 2.9096989966555182, "grad_norm": 1.9614945650100708, "learning_rate": 9.937202827550804e-06, "loss": 0.4917, "num_input_tokens_seen": 4485712, "step": 4350 }, { "epoch": 2.9130434782608696, "grad_norm": 1.2155195474624634, "learning_rate": 9.936740872454134e-06, "loss": 0.5412, "num_input_tokens_seen": 4490928, "step": 4355 }, { "epoch": 2.9163879598662206, "grad_norm": 1.4752168655395508, "learning_rate": 9.936277235254105e-06, "loss": 0.5047, "num_input_tokens_seen": 4496304, "step": 4360 }, { "epoch": 2.919732441471572, "grad_norm": 1.8482636213302612, "learning_rate": 9.93581191610869e-06, "loss": 0.5322, "num_input_tokens_seen": 4501520, "step": 4365 }, { "epoch": 2.9230769230769234, "grad_norm": 1.5897181034088135, "learning_rate": 9.935344915176441e-06, "loss": 0.5591, "num_input_tokens_seen": 4506512, "step": 4370 }, { "epoch": 2.9264214046822743, "grad_norm": 1.905950665473938, "learning_rate": 9.934876232616482e-06, "loss": 0.5923, "num_input_tokens_seen": 4511088, "step": 4375 }, { "epoch": 2.9297658862876252, "grad_norm": 2.2638299465179443, "learning_rate": 9.934405868588506e-06, "loss": 0.5945, "num_input_tokens_seen": 4516848, "step": 4380 }, { "epoch": 2.9331103678929766, "grad_norm": 1.6494619846343994, "learning_rate": 9.933933823252777e-06, "loss": 0.564, "num_input_tokens_seen": 4521680, "step": 4385 }, { "epoch": 2.936454849498328, "grad_norm": 1.5054208040237427, "learning_rate": 9.933460096770143e-06, "loss": 0.4365, "num_input_tokens_seen": 4526256, "step": 4390 }, { "epoch": 2.939799331103679, "grad_norm": 1.7637344598770142, "learning_rate": 9.932984689302012e-06, "loss": 0.4733, "num_input_tokens_seen": 4531728, "step": 4395 }, { "epoch": 2.94314381270903, "grad_norm": 1.7248412370681763, "learning_rate": 9.93250760101037e-06, "loss": 0.484, "num_input_tokens_seen": 4536400, "step": 4400 }, { "epoch": 2.9464882943143813, "grad_norm": 1.4501943588256836, "learning_rate": 9.93202883205778e-06, "loss": 0.4826, "num_input_tokens_seen": 4541712, "step": 4405 }, { "epoch": 2.9498327759197327, "grad_norm": 1.2202249765396118, "learning_rate": 9.931548382607372e-06, "loss": 0.527, "num_input_tokens_seen": 4546928, "step": 4410 }, { "epoch": 2.9531772575250836, "grad_norm": 1.249009609222412, "learning_rate": 9.931066252822849e-06, "loss": 0.4959, "num_input_tokens_seen": 4552848, "step": 4415 }, { "epoch": 2.9565217391304346, "grad_norm": 1.8208673000335693, "learning_rate": 9.93058244286849e-06, "loss": 0.4927, "num_input_tokens_seen": 4557200, "step": 4420 }, { "epoch": 2.959866220735786, "grad_norm": 2.1093506813049316, "learning_rate": 9.930096952909144e-06, "loss": 0.3977, "num_input_tokens_seen": 4563152, "step": 4425 }, { "epoch": 2.9632107023411374, "grad_norm": 1.7818119525909424, "learning_rate": 9.92960978311023e-06, "loss": 0.4518, "num_input_tokens_seen": 4568912, "step": 4430 }, { "epoch": 2.9665551839464883, "grad_norm": 1.4608683586120605, "learning_rate": 9.929120933637745e-06, "loss": 0.7123, "num_input_tokens_seen": 4574160, "step": 4435 }, { "epoch": 2.9698996655518393, "grad_norm": 1.6326828002929688, "learning_rate": 9.928630404658255e-06, "loss": 0.5298, "num_input_tokens_seen": 4579152, "step": 4440 }, { "epoch": 2.9732441471571907, "grad_norm": 2.322756290435791, "learning_rate": 9.928138196338898e-06, "loss": 0.4718, "num_input_tokens_seen": 4583824, "step": 4445 }, { "epoch": 2.976588628762542, "grad_norm": 1.9390547275543213, "learning_rate": 9.927644308847384e-06, "loss": 0.4809, "num_input_tokens_seen": 4588304, "step": 4450 }, { "epoch": 2.979933110367893, "grad_norm": 2.2761590480804443, "learning_rate": 9.927148742351999e-06, "loss": 0.5886, "num_input_tokens_seen": 4593136, "step": 4455 }, { "epoch": 2.983277591973244, "grad_norm": 1.0437663793563843, "learning_rate": 9.926651497021595e-06, "loss": 0.4887, "num_input_tokens_seen": 4597936, "step": 4460 }, { "epoch": 2.9866220735785953, "grad_norm": 2.2117159366607666, "learning_rate": 9.9261525730256e-06, "loss": 0.5626, "num_input_tokens_seen": 4603024, "step": 4465 }, { "epoch": 2.9899665551839467, "grad_norm": 1.1276172399520874, "learning_rate": 9.925651970534013e-06, "loss": 0.489, "num_input_tokens_seen": 4608176, "step": 4470 }, { "epoch": 2.9933110367892977, "grad_norm": 1.641218662261963, "learning_rate": 9.925149689717407e-06, "loss": 0.5034, "num_input_tokens_seen": 4613072, "step": 4475 }, { "epoch": 2.9966555183946486, "grad_norm": 1.0405648946762085, "learning_rate": 9.924645730746924e-06, "loss": 0.4031, "num_input_tokens_seen": 4618800, "step": 4480 }, { "epoch": 3.0, "grad_norm": 4.257623672485352, "learning_rate": 9.924140093794279e-06, "loss": 0.4355, "num_input_tokens_seen": 4623472, "step": 4485 }, { "epoch": 3.0033444816053514, "grad_norm": 1.5287542343139648, "learning_rate": 9.923632779031757e-06, "loss": 0.4718, "num_input_tokens_seen": 4628400, "step": 4490 }, { "epoch": 3.0066889632107023, "grad_norm": 1.181339144706726, "learning_rate": 9.923123786632217e-06, "loss": 0.4058, "num_input_tokens_seen": 4633360, "step": 4495 }, { "epoch": 3.0100334448160537, "grad_norm": 1.8018229007720947, "learning_rate": 9.922613116769087e-06, "loss": 0.4726, "num_input_tokens_seen": 4638128, "step": 4500 }, { "epoch": 3.0133779264214047, "grad_norm": 1.754859209060669, "learning_rate": 9.922100769616371e-06, "loss": 0.3964, "num_input_tokens_seen": 4642992, "step": 4505 }, { "epoch": 3.016722408026756, "grad_norm": 1.3727753162384033, "learning_rate": 9.921586745348641e-06, "loss": 0.5267, "num_input_tokens_seen": 4649136, "step": 4510 }, { "epoch": 3.020066889632107, "grad_norm": 1.6532602310180664, "learning_rate": 9.921071044141041e-06, "loss": 0.4723, "num_input_tokens_seen": 4654448, "step": 4515 }, { "epoch": 3.0234113712374584, "grad_norm": 2.2799839973449707, "learning_rate": 9.920553666169288e-06, "loss": 0.5743, "num_input_tokens_seen": 4659344, "step": 4520 }, { "epoch": 3.0267558528428093, "grad_norm": 1.232277750968933, "learning_rate": 9.920034611609667e-06, "loss": 0.5582, "num_input_tokens_seen": 4664880, "step": 4525 }, { "epoch": 3.0301003344481607, "grad_norm": 1.5043185949325562, "learning_rate": 9.919513880639036e-06, "loss": 0.4243, "num_input_tokens_seen": 4670000, "step": 4530 }, { "epoch": 3.0334448160535117, "grad_norm": 1.3569217920303345, "learning_rate": 9.918991473434827e-06, "loss": 0.4905, "num_input_tokens_seen": 4674352, "step": 4535 }, { "epoch": 3.036789297658863, "grad_norm": 1.8018499612808228, "learning_rate": 9.918467390175037e-06, "loss": 0.487, "num_input_tokens_seen": 4678928, "step": 4540 }, { "epoch": 3.040133779264214, "grad_norm": 1.2763596773147583, "learning_rate": 9.917941631038242e-06, "loss": 0.4741, "num_input_tokens_seen": 4683440, "step": 4545 }, { "epoch": 3.0434782608695654, "grad_norm": 1.4673000574111938, "learning_rate": 9.917414196203582e-06, "loss": 0.4488, "num_input_tokens_seen": 4688944, "step": 4550 }, { "epoch": 3.0468227424749164, "grad_norm": 1.5118495225906372, "learning_rate": 9.91688508585077e-06, "loss": 0.6479, "num_input_tokens_seen": 4694896, "step": 4555 }, { "epoch": 3.0501672240802677, "grad_norm": 1.5155723094940186, "learning_rate": 9.916354300160095e-06, "loss": 0.5181, "num_input_tokens_seen": 4699248, "step": 4560 }, { "epoch": 3.0535117056856187, "grad_norm": 1.7571488618850708, "learning_rate": 9.915821839312408e-06, "loss": 0.4595, "num_input_tokens_seen": 4704400, "step": 4565 }, { "epoch": 3.05685618729097, "grad_norm": 1.6198861598968506, "learning_rate": 9.915287703489134e-06, "loss": 0.5109, "num_input_tokens_seen": 4709552, "step": 4570 }, { "epoch": 3.060200668896321, "grad_norm": 1.4340420961380005, "learning_rate": 9.914751892872274e-06, "loss": 0.5949, "num_input_tokens_seen": 4715056, "step": 4575 }, { "epoch": 3.0635451505016724, "grad_norm": 1.556863784790039, "learning_rate": 9.914214407644397e-06, "loss": 0.4692, "num_input_tokens_seen": 4719952, "step": 4580 }, { "epoch": 3.0668896321070234, "grad_norm": 2.8103208541870117, "learning_rate": 9.913675247988634e-06, "loss": 0.6633, "num_input_tokens_seen": 4725200, "step": 4585 }, { "epoch": 3.0702341137123748, "grad_norm": 1.931642770767212, "learning_rate": 9.913134414088698e-06, "loss": 0.4715, "num_input_tokens_seen": 4730576, "step": 4590 }, { "epoch": 3.0735785953177257, "grad_norm": 2.7724666595458984, "learning_rate": 9.91259190612887e-06, "loss": 0.4771, "num_input_tokens_seen": 4734960, "step": 4595 }, { "epoch": 3.076923076923077, "grad_norm": 2.4383366107940674, "learning_rate": 9.912047724293998e-06, "loss": 0.6347, "num_input_tokens_seen": 4739920, "step": 4600 }, { "epoch": 3.080267558528428, "grad_norm": 1.253860592842102, "learning_rate": 9.9115018687695e-06, "loss": 0.5666, "num_input_tokens_seen": 4745360, "step": 4605 }, { "epoch": 3.0836120401337794, "grad_norm": 1.0692201852798462, "learning_rate": 9.910954339741369e-06, "loss": 0.4878, "num_input_tokens_seen": 4750128, "step": 4610 }, { "epoch": 3.0869565217391304, "grad_norm": 2.204390048980713, "learning_rate": 9.910405137396164e-06, "loss": 0.6028, "num_input_tokens_seen": 4754704, "step": 4615 }, { "epoch": 3.0903010033444818, "grad_norm": 1.4671112298965454, "learning_rate": 9.909854261921014e-06, "loss": 0.4966, "num_input_tokens_seen": 4760176, "step": 4620 }, { "epoch": 3.0936454849498327, "grad_norm": 1.4039055109024048, "learning_rate": 9.909301713503624e-06, "loss": 0.5366, "num_input_tokens_seen": 4765520, "step": 4625 }, { "epoch": 3.096989966555184, "grad_norm": 1.859808087348938, "learning_rate": 9.90874749233226e-06, "loss": 0.5171, "num_input_tokens_seen": 4770320, "step": 4630 }, { "epoch": 3.100334448160535, "grad_norm": 1.669434905052185, "learning_rate": 9.908191598595765e-06, "loss": 0.4723, "num_input_tokens_seen": 4775024, "step": 4635 }, { "epoch": 3.1036789297658864, "grad_norm": 1.4926934242248535, "learning_rate": 9.90763403248355e-06, "loss": 0.5457, "num_input_tokens_seen": 4779984, "step": 4640 }, { "epoch": 3.1070234113712374, "grad_norm": 1.4968053102493286, "learning_rate": 9.907074794185594e-06, "loss": 0.4439, "num_input_tokens_seen": 4784848, "step": 4645 }, { "epoch": 3.1103678929765888, "grad_norm": 2.0791711807250977, "learning_rate": 9.906513883892448e-06, "loss": 0.4898, "num_input_tokens_seen": 4789872, "step": 4650 }, { "epoch": 3.1137123745819397, "grad_norm": 1.3147464990615845, "learning_rate": 9.905951301795231e-06, "loss": 0.6206, "num_input_tokens_seen": 4794768, "step": 4655 }, { "epoch": 3.117056856187291, "grad_norm": 1.1579997539520264, "learning_rate": 9.905387048085633e-06, "loss": 0.4795, "num_input_tokens_seen": 4800336, "step": 4660 }, { "epoch": 3.120401337792642, "grad_norm": 1.632167100906372, "learning_rate": 9.904821122955914e-06, "loss": 0.5058, "num_input_tokens_seen": 4806064, "step": 4665 }, { "epoch": 3.1237458193979935, "grad_norm": 1.287974238395691, "learning_rate": 9.904253526598902e-06, "loss": 0.494, "num_input_tokens_seen": 4810288, "step": 4670 }, { "epoch": 3.1270903010033444, "grad_norm": 1.6255135536193848, "learning_rate": 9.903684259207994e-06, "loss": 0.5447, "num_input_tokens_seen": 4815696, "step": 4675 }, { "epoch": 3.130434782608696, "grad_norm": 2.399186611175537, "learning_rate": 9.903113320977156e-06, "loss": 0.4583, "num_input_tokens_seen": 4820592, "step": 4680 }, { "epoch": 3.1337792642140467, "grad_norm": 1.9571901559829712, "learning_rate": 9.902540712100929e-06, "loss": 0.4628, "num_input_tokens_seen": 4826320, "step": 4685 }, { "epoch": 3.137123745819398, "grad_norm": 1.4064723253250122, "learning_rate": 9.901966432774415e-06, "loss": 0.4764, "num_input_tokens_seen": 4831632, "step": 4690 }, { "epoch": 3.140468227424749, "grad_norm": 1.3289896249771118, "learning_rate": 9.901390483193291e-06, "loss": 0.4778, "num_input_tokens_seen": 4836496, "step": 4695 }, { "epoch": 3.1438127090301005, "grad_norm": 1.397327184677124, "learning_rate": 9.900812863553801e-06, "loss": 0.542, "num_input_tokens_seen": 4841328, "step": 4700 }, { "epoch": 3.1471571906354514, "grad_norm": 3.0277066230773926, "learning_rate": 9.90023357405276e-06, "loss": 0.6008, "num_input_tokens_seen": 4846384, "step": 4705 }, { "epoch": 3.150501672240803, "grad_norm": 1.4910900592803955, "learning_rate": 9.899652614887545e-06, "loss": 0.4087, "num_input_tokens_seen": 4851152, "step": 4710 }, { "epoch": 3.1538461538461537, "grad_norm": 2.0014545917510986, "learning_rate": 9.899069986256112e-06, "loss": 0.5841, "num_input_tokens_seen": 4856848, "step": 4715 }, { "epoch": 3.157190635451505, "grad_norm": 0.9623741507530212, "learning_rate": 9.89848568835698e-06, "loss": 0.4888, "num_input_tokens_seen": 4862832, "step": 4720 }, { "epoch": 3.160535117056856, "grad_norm": 1.4733625650405884, "learning_rate": 9.897899721389236e-06, "loss": 0.3986, "num_input_tokens_seen": 4867440, "step": 4725 }, { "epoch": 3.1638795986622075, "grad_norm": 1.9975076913833618, "learning_rate": 9.897312085552539e-06, "loss": 0.4391, "num_input_tokens_seen": 4873104, "step": 4730 }, { "epoch": 3.1672240802675584, "grad_norm": 2.717914581298828, "learning_rate": 9.896722781047114e-06, "loss": 0.5419, "num_input_tokens_seen": 4877808, "step": 4735 }, { "epoch": 3.17056856187291, "grad_norm": 2.2851083278656006, "learning_rate": 9.896131808073756e-06, "loss": 0.5045, "num_input_tokens_seen": 4883248, "step": 4740 }, { "epoch": 3.1739130434782608, "grad_norm": 1.9456638097763062, "learning_rate": 9.895539166833829e-06, "loss": 0.43, "num_input_tokens_seen": 4888016, "step": 4745 }, { "epoch": 3.177257525083612, "grad_norm": 3.184546709060669, "learning_rate": 9.894944857529262e-06, "loss": 0.4061, "num_input_tokens_seen": 4892848, "step": 4750 }, { "epoch": 3.180602006688963, "grad_norm": 2.27677845954895, "learning_rate": 9.894348880362561e-06, "loss": 0.5126, "num_input_tokens_seen": 4897648, "step": 4755 }, { "epoch": 3.1839464882943145, "grad_norm": 1.6046174764633179, "learning_rate": 9.893751235536785e-06, "loss": 0.4926, "num_input_tokens_seen": 4902448, "step": 4760 }, { "epoch": 3.1872909698996654, "grad_norm": 2.149508237838745, "learning_rate": 9.89315192325558e-06, "loss": 0.5214, "num_input_tokens_seen": 4907888, "step": 4765 }, { "epoch": 3.190635451505017, "grad_norm": 1.6241689920425415, "learning_rate": 9.892550943723143e-06, "loss": 0.4677, "num_input_tokens_seen": 4913904, "step": 4770 }, { "epoch": 3.1939799331103678, "grad_norm": 1.031356692314148, "learning_rate": 9.89194829714425e-06, "loss": 0.5511, "num_input_tokens_seen": 4919792, "step": 4775 }, { "epoch": 3.197324414715719, "grad_norm": 1.447738528251648, "learning_rate": 9.891343983724245e-06, "loss": 0.4538, "num_input_tokens_seen": 4924464, "step": 4780 }, { "epoch": 3.20066889632107, "grad_norm": 1.5096760988235474, "learning_rate": 9.890738003669029e-06, "loss": 0.5036, "num_input_tokens_seen": 4929520, "step": 4785 }, { "epoch": 3.2040133779264215, "grad_norm": 1.8870794773101807, "learning_rate": 9.890130357185084e-06, "loss": 0.482, "num_input_tokens_seen": 4934768, "step": 4790 }, { "epoch": 3.2073578595317724, "grad_norm": 1.7567918300628662, "learning_rate": 9.889521044479453e-06, "loss": 0.4613, "num_input_tokens_seen": 4940176, "step": 4795 }, { "epoch": 3.210702341137124, "grad_norm": 1.1236510276794434, "learning_rate": 9.888910065759749e-06, "loss": 0.4854, "num_input_tokens_seen": 4945680, "step": 4800 }, { "epoch": 3.2140468227424748, "grad_norm": 1.726742148399353, "learning_rate": 9.888297421234148e-06, "loss": 0.3816, "num_input_tokens_seen": 4950768, "step": 4805 }, { "epoch": 3.217391304347826, "grad_norm": 1.3189054727554321, "learning_rate": 9.887683111111402e-06, "loss": 0.4492, "num_input_tokens_seen": 4956432, "step": 4810 }, { "epoch": 3.220735785953177, "grad_norm": 2.137077569961548, "learning_rate": 9.887067135600826e-06, "loss": 0.5106, "num_input_tokens_seen": 4962256, "step": 4815 }, { "epoch": 3.2240802675585285, "grad_norm": 2.047978162765503, "learning_rate": 9.886449494912296e-06, "loss": 0.4944, "num_input_tokens_seen": 4967568, "step": 4820 }, { "epoch": 3.2274247491638794, "grad_norm": 1.3084993362426758, "learning_rate": 9.885830189256268e-06, "loss": 0.5191, "num_input_tokens_seen": 4972784, "step": 4825 }, { "epoch": 3.230769230769231, "grad_norm": 1.9615478515625, "learning_rate": 9.885209218843757e-06, "loss": 0.5329, "num_input_tokens_seen": 4977808, "step": 4830 }, { "epoch": 3.234113712374582, "grad_norm": 1.2043405771255493, "learning_rate": 9.884586583886347e-06, "loss": 0.4508, "num_input_tokens_seen": 4983984, "step": 4835 }, { "epoch": 3.237458193979933, "grad_norm": 1.6162434816360474, "learning_rate": 9.883962284596189e-06, "loss": 0.4805, "num_input_tokens_seen": 4989520, "step": 4840 }, { "epoch": 3.240802675585284, "grad_norm": 1.2602462768554688, "learning_rate": 9.883336321186e-06, "loss": 0.4839, "num_input_tokens_seen": 4993648, "step": 4845 }, { "epoch": 3.2441471571906355, "grad_norm": 1.30415678024292, "learning_rate": 9.882708693869071e-06, "loss": 0.5138, "num_input_tokens_seen": 4998960, "step": 4850 }, { "epoch": 3.2474916387959865, "grad_norm": 2.0692200660705566, "learning_rate": 9.88207940285925e-06, "loss": 0.592, "num_input_tokens_seen": 5004432, "step": 4855 }, { "epoch": 3.250836120401338, "grad_norm": 1.5071173906326294, "learning_rate": 9.881448448370956e-06, "loss": 0.5619, "num_input_tokens_seen": 5009520, "step": 4860 }, { "epoch": 3.254180602006689, "grad_norm": 1.2172918319702148, "learning_rate": 9.880815830619176e-06, "loss": 0.4027, "num_input_tokens_seen": 5014480, "step": 4865 }, { "epoch": 3.25752508361204, "grad_norm": 1.2421648502349854, "learning_rate": 9.880181549819463e-06, "loss": 0.4752, "num_input_tokens_seen": 5019696, "step": 4870 }, { "epoch": 3.260869565217391, "grad_norm": 1.501676082611084, "learning_rate": 9.879545606187938e-06, "loss": 0.479, "num_input_tokens_seen": 5024144, "step": 4875 }, { "epoch": 3.2642140468227425, "grad_norm": 1.564211130142212, "learning_rate": 9.878907999941285e-06, "loss": 0.5095, "num_input_tokens_seen": 5029040, "step": 4880 }, { "epoch": 3.2675585284280935, "grad_norm": 1.0060217380523682, "learning_rate": 9.878268731296756e-06, "loss": 0.431, "num_input_tokens_seen": 5034608, "step": 4885 }, { "epoch": 3.270903010033445, "grad_norm": 1.8949129581451416, "learning_rate": 9.877627800472172e-06, "loss": 0.5015, "num_input_tokens_seen": 5040048, "step": 4890 }, { "epoch": 3.274247491638796, "grad_norm": 1.842297077178955, "learning_rate": 9.876985207685917e-06, "loss": 0.4171, "num_input_tokens_seen": 5044528, "step": 4895 }, { "epoch": 3.277591973244147, "grad_norm": 1.9099494218826294, "learning_rate": 9.876340953156945e-06, "loss": 0.4341, "num_input_tokens_seen": 5049008, "step": 4900 }, { "epoch": 3.280936454849498, "grad_norm": 1.6980699300765991, "learning_rate": 9.875695037104768e-06, "loss": 0.451, "num_input_tokens_seen": 5054224, "step": 4905 }, { "epoch": 3.2842809364548495, "grad_norm": 1.6943267583847046, "learning_rate": 9.875047459749477e-06, "loss": 0.5853, "num_input_tokens_seen": 5060048, "step": 4910 }, { "epoch": 3.2876254180602005, "grad_norm": 1.4849843978881836, "learning_rate": 9.87439822131172e-06, "loss": 0.4993, "num_input_tokens_seen": 5063984, "step": 4915 }, { "epoch": 3.290969899665552, "grad_norm": 1.4667692184448242, "learning_rate": 9.87374732201271e-06, "loss": 0.4635, "num_input_tokens_seen": 5069264, "step": 4920 }, { "epoch": 3.294314381270903, "grad_norm": 1.6538485288619995, "learning_rate": 9.873094762074229e-06, "loss": 0.5027, "num_input_tokens_seen": 5074704, "step": 4925 }, { "epoch": 3.297658862876254, "grad_norm": 1.42667818069458, "learning_rate": 9.87244054171863e-06, "loss": 0.5309, "num_input_tokens_seen": 5080656, "step": 4930 }, { "epoch": 3.3010033444816056, "grad_norm": 1.0501630306243896, "learning_rate": 9.871784661168822e-06, "loss": 0.4965, "num_input_tokens_seen": 5086192, "step": 4935 }, { "epoch": 3.3043478260869565, "grad_norm": 1.3860963582992554, "learning_rate": 9.871127120648285e-06, "loss": 0.471, "num_input_tokens_seen": 5091568, "step": 4940 }, { "epoch": 3.3076923076923075, "grad_norm": 1.957303524017334, "learning_rate": 9.870467920381063e-06, "loss": 0.57, "num_input_tokens_seen": 5097264, "step": 4945 }, { "epoch": 3.311036789297659, "grad_norm": 1.5922647714614868, "learning_rate": 9.869807060591769e-06, "loss": 0.6123, "num_input_tokens_seen": 5103536, "step": 4950 }, { "epoch": 3.3143812709030103, "grad_norm": 2.156745672225952, "learning_rate": 9.869144541505578e-06, "loss": 0.4204, "num_input_tokens_seen": 5108176, "step": 4955 }, { "epoch": 3.317725752508361, "grad_norm": 2.118527412414551, "learning_rate": 9.86848036334823e-06, "loss": 0.4646, "num_input_tokens_seen": 5113488, "step": 4960 }, { "epoch": 3.321070234113712, "grad_norm": 1.6313546895980835, "learning_rate": 9.867814526346032e-06, "loss": 0.4416, "num_input_tokens_seen": 5118544, "step": 4965 }, { "epoch": 3.3244147157190636, "grad_norm": 2.0728533267974854, "learning_rate": 9.867147030725854e-06, "loss": 0.5222, "num_input_tokens_seen": 5123280, "step": 4970 }, { "epoch": 3.327759197324415, "grad_norm": 3.880662202835083, "learning_rate": 9.866477876715136e-06, "loss": 0.5568, "num_input_tokens_seen": 5128080, "step": 4975 }, { "epoch": 3.331103678929766, "grad_norm": 2.037964105606079, "learning_rate": 9.865807064541878e-06, "loss": 0.5163, "num_input_tokens_seen": 5133776, "step": 4980 }, { "epoch": 3.334448160535117, "grad_norm": 1.605203628540039, "learning_rate": 9.865134594434647e-06, "loss": 0.576, "num_input_tokens_seen": 5138992, "step": 4985 }, { "epoch": 3.3377926421404682, "grad_norm": 2.275679111480713, "learning_rate": 9.864460466622574e-06, "loss": 0.5393, "num_input_tokens_seen": 5144240, "step": 4990 }, { "epoch": 3.3411371237458196, "grad_norm": 1.313429594039917, "learning_rate": 9.863784681335357e-06, "loss": 0.5336, "num_input_tokens_seen": 5150416, "step": 4995 }, { "epoch": 3.3444816053511706, "grad_norm": 1.7274702787399292, "learning_rate": 9.863107238803258e-06, "loss": 0.5039, "num_input_tokens_seen": 5155632, "step": 5000 }, { "epoch": 3.3478260869565215, "grad_norm": 1.148226261138916, "learning_rate": 9.862428139257101e-06, "loss": 0.6248, "num_input_tokens_seen": 5160176, "step": 5005 }, { "epoch": 3.351170568561873, "grad_norm": 1.737484097480774, "learning_rate": 9.861747382928277e-06, "loss": 0.5142, "num_input_tokens_seen": 5164528, "step": 5010 }, { "epoch": 3.3545150501672243, "grad_norm": 1.5518507957458496, "learning_rate": 9.861064970048744e-06, "loss": 0.4313, "num_input_tokens_seen": 5169584, "step": 5015 }, { "epoch": 3.3578595317725752, "grad_norm": 2.272183895111084, "learning_rate": 9.860380900851017e-06, "loss": 0.4753, "num_input_tokens_seen": 5174864, "step": 5020 }, { "epoch": 3.361204013377926, "grad_norm": 1.8239854574203491, "learning_rate": 9.859695175568183e-06, "loss": 0.5498, "num_input_tokens_seen": 5180976, "step": 5025 }, { "epoch": 3.3645484949832776, "grad_norm": 2.322892665863037, "learning_rate": 9.85900779443389e-06, "loss": 0.4963, "num_input_tokens_seen": 5186480, "step": 5030 }, { "epoch": 3.367892976588629, "grad_norm": 1.737628698348999, "learning_rate": 9.858318757682348e-06, "loss": 0.468, "num_input_tokens_seen": 5191376, "step": 5035 }, { "epoch": 3.37123745819398, "grad_norm": 2.384023904800415, "learning_rate": 9.857628065548338e-06, "loss": 0.5119, "num_input_tokens_seen": 5196048, "step": 5040 }, { "epoch": 3.374581939799331, "grad_norm": 1.6822787523269653, "learning_rate": 9.856935718267196e-06, "loss": 0.4824, "num_input_tokens_seen": 5201552, "step": 5045 }, { "epoch": 3.3779264214046822, "grad_norm": 1.9165892601013184, "learning_rate": 9.856241716074831e-06, "loss": 0.471, "num_input_tokens_seen": 5206608, "step": 5050 }, { "epoch": 3.3812709030100336, "grad_norm": 2.168318033218384, "learning_rate": 9.855546059207706e-06, "loss": 0.5018, "num_input_tokens_seen": 5211216, "step": 5055 }, { "epoch": 3.3846153846153846, "grad_norm": 1.6796622276306152, "learning_rate": 9.85484874790286e-06, "loss": 0.5802, "num_input_tokens_seen": 5216400, "step": 5060 }, { "epoch": 3.387959866220736, "grad_norm": 2.0643298625946045, "learning_rate": 9.854149782397882e-06, "loss": 0.4637, "num_input_tokens_seen": 5222448, "step": 5065 }, { "epoch": 3.391304347826087, "grad_norm": 1.4689455032348633, "learning_rate": 9.853449162930936e-06, "loss": 0.4628, "num_input_tokens_seen": 5227632, "step": 5070 }, { "epoch": 3.3946488294314383, "grad_norm": 1.2522757053375244, "learning_rate": 9.852746889740745e-06, "loss": 0.4964, "num_input_tokens_seen": 5232368, "step": 5075 }, { "epoch": 3.3979933110367893, "grad_norm": 1.2293400764465332, "learning_rate": 9.852042963066595e-06, "loss": 0.527, "num_input_tokens_seen": 5236848, "step": 5080 }, { "epoch": 3.4013377926421406, "grad_norm": 2.1833133697509766, "learning_rate": 9.851337383148333e-06, "loss": 0.4829, "num_input_tokens_seen": 5241360, "step": 5085 }, { "epoch": 3.4046822742474916, "grad_norm": 1.4400677680969238, "learning_rate": 9.850630150226378e-06, "loss": 0.5077, "num_input_tokens_seen": 5246160, "step": 5090 }, { "epoch": 3.408026755852843, "grad_norm": 1.3804783821105957, "learning_rate": 9.849921264541703e-06, "loss": 0.4599, "num_input_tokens_seen": 5251088, "step": 5095 }, { "epoch": 3.411371237458194, "grad_norm": 2.3913140296936035, "learning_rate": 9.849210726335848e-06, "loss": 0.4979, "num_input_tokens_seen": 5255696, "step": 5100 }, { "epoch": 3.4147157190635453, "grad_norm": 3.0297675132751465, "learning_rate": 9.848498535850915e-06, "loss": 0.5711, "num_input_tokens_seen": 5260976, "step": 5105 }, { "epoch": 3.4180602006688963, "grad_norm": 1.693497896194458, "learning_rate": 9.847784693329571e-06, "loss": 0.4549, "num_input_tokens_seen": 5265808, "step": 5110 }, { "epoch": 3.4214046822742477, "grad_norm": 2.0493242740631104, "learning_rate": 9.847069199015047e-06, "loss": 0.5429, "num_input_tokens_seen": 5270352, "step": 5115 }, { "epoch": 3.4247491638795986, "grad_norm": 1.812986135482788, "learning_rate": 9.846352053151131e-06, "loss": 0.5622, "num_input_tokens_seen": 5275856, "step": 5120 }, { "epoch": 3.42809364548495, "grad_norm": 1.5017971992492676, "learning_rate": 9.845633255982177e-06, "loss": 0.5106, "num_input_tokens_seen": 5280848, "step": 5125 }, { "epoch": 3.431438127090301, "grad_norm": 2.1124420166015625, "learning_rate": 9.844912807753105e-06, "loss": 0.5769, "num_input_tokens_seen": 5286064, "step": 5130 }, { "epoch": 3.4347826086956523, "grad_norm": 1.5297260284423828, "learning_rate": 9.84419070870939e-06, "loss": 0.5719, "num_input_tokens_seen": 5292048, "step": 5135 }, { "epoch": 3.4381270903010033, "grad_norm": 2.0536255836486816, "learning_rate": 9.843466959097078e-06, "loss": 0.4731, "num_input_tokens_seen": 5297840, "step": 5140 }, { "epoch": 3.4414715719063547, "grad_norm": 1.892147183418274, "learning_rate": 9.842741559162771e-06, "loss": 0.509, "num_input_tokens_seen": 5302480, "step": 5145 }, { "epoch": 3.4448160535117056, "grad_norm": 1.6155548095703125, "learning_rate": 9.842014509153639e-06, "loss": 0.506, "num_input_tokens_seen": 5307408, "step": 5150 }, { "epoch": 3.448160535117057, "grad_norm": 1.332179307937622, "learning_rate": 9.841285809317407e-06, "loss": 0.5184, "num_input_tokens_seen": 5311664, "step": 5155 }, { "epoch": 3.451505016722408, "grad_norm": 1.4980521202087402, "learning_rate": 9.840555459902365e-06, "loss": 0.5359, "num_input_tokens_seen": 5316496, "step": 5160 }, { "epoch": 3.4548494983277593, "grad_norm": 1.313234567642212, "learning_rate": 9.839823461157372e-06, "loss": 0.4906, "num_input_tokens_seen": 5322000, "step": 5165 }, { "epoch": 3.4581939799331103, "grad_norm": 2.3224241733551025, "learning_rate": 9.839089813331838e-06, "loss": 0.4592, "num_input_tokens_seen": 5326960, "step": 5170 }, { "epoch": 3.4615384615384617, "grad_norm": 1.457208275794983, "learning_rate": 9.83835451667574e-06, "loss": 0.4372, "num_input_tokens_seen": 5333008, "step": 5175 }, { "epoch": 3.4648829431438126, "grad_norm": 1.8405667543411255, "learning_rate": 9.837617571439621e-06, "loss": 0.5439, "num_input_tokens_seen": 5337616, "step": 5180 }, { "epoch": 3.468227424749164, "grad_norm": 1.7291055917739868, "learning_rate": 9.836878977874578e-06, "loss": 0.4494, "num_input_tokens_seen": 5342896, "step": 5185 }, { "epoch": 3.471571906354515, "grad_norm": 2.1511900424957275, "learning_rate": 9.836138736232272e-06, "loss": 0.4577, "num_input_tokens_seen": 5347472, "step": 5190 }, { "epoch": 3.4749163879598663, "grad_norm": 1.4473142623901367, "learning_rate": 9.83539684676493e-06, "loss": 0.4404, "num_input_tokens_seen": 5353424, "step": 5195 }, { "epoch": 3.4782608695652173, "grad_norm": 3.3428585529327393, "learning_rate": 9.834653309725335e-06, "loss": 0.5082, "num_input_tokens_seen": 5358160, "step": 5200 }, { "epoch": 3.4816053511705687, "grad_norm": 1.8757314682006836, "learning_rate": 9.833908125366835e-06, "loss": 0.506, "num_input_tokens_seen": 5362992, "step": 5205 }, { "epoch": 3.4849498327759196, "grad_norm": 1.5226325988769531, "learning_rate": 9.833161293943337e-06, "loss": 0.504, "num_input_tokens_seen": 5368400, "step": 5210 }, { "epoch": 3.488294314381271, "grad_norm": 1.5825929641723633, "learning_rate": 9.83241281570931e-06, "loss": 0.5343, "num_input_tokens_seen": 5372752, "step": 5215 }, { "epoch": 3.491638795986622, "grad_norm": 1.8278789520263672, "learning_rate": 9.831662690919785e-06, "loss": 0.4495, "num_input_tokens_seen": 5377872, "step": 5220 }, { "epoch": 3.4949832775919734, "grad_norm": 1.6247365474700928, "learning_rate": 9.83091091983035e-06, "loss": 0.4556, "num_input_tokens_seen": 5383216, "step": 5225 }, { "epoch": 3.4983277591973243, "grad_norm": 1.7720402479171753, "learning_rate": 9.830157502697161e-06, "loss": 0.5765, "num_input_tokens_seen": 5388976, "step": 5230 }, { "epoch": 3.5016722408026757, "grad_norm": 1.653566837310791, "learning_rate": 9.829402439776931e-06, "loss": 0.5241, "num_input_tokens_seen": 5393712, "step": 5235 }, { "epoch": 3.5050167224080266, "grad_norm": 1.6086119413375854, "learning_rate": 9.82864573132693e-06, "loss": 0.5109, "num_input_tokens_seen": 5399152, "step": 5240 }, { "epoch": 3.508361204013378, "grad_norm": 1.3989070653915405, "learning_rate": 9.827887377604995e-06, "loss": 0.4432, "num_input_tokens_seen": 5405264, "step": 5245 }, { "epoch": 3.511705685618729, "grad_norm": 1.9417558908462524, "learning_rate": 9.827127378869523e-06, "loss": 0.5401, "num_input_tokens_seen": 5409776, "step": 5250 }, { "epoch": 3.5150501672240804, "grad_norm": 1.7641913890838623, "learning_rate": 9.826365735379464e-06, "loss": 0.5007, "num_input_tokens_seen": 5414928, "step": 5255 }, { "epoch": 3.5183946488294313, "grad_norm": 1.994534969329834, "learning_rate": 9.82560244739434e-06, "loss": 0.4717, "num_input_tokens_seen": 5420688, "step": 5260 }, { "epoch": 3.5217391304347827, "grad_norm": 2.7398014068603516, "learning_rate": 9.824837515174223e-06, "loss": 0.5829, "num_input_tokens_seen": 5425776, "step": 5265 }, { "epoch": 3.5250836120401337, "grad_norm": 1.2241755723953247, "learning_rate": 9.82407093897975e-06, "loss": 0.5589, "num_input_tokens_seen": 5431088, "step": 5270 }, { "epoch": 3.528428093645485, "grad_norm": 1.3863645792007446, "learning_rate": 9.82330271907212e-06, "loss": 0.5502, "num_input_tokens_seen": 5436208, "step": 5275 }, { "epoch": 3.531772575250836, "grad_norm": 1.3552296161651611, "learning_rate": 9.822532855713089e-06, "loss": 0.4171, "num_input_tokens_seen": 5442544, "step": 5280 }, { "epoch": 3.5351170568561874, "grad_norm": 3.0040085315704346, "learning_rate": 9.821761349164973e-06, "loss": 0.467, "num_input_tokens_seen": 5447376, "step": 5285 }, { "epoch": 3.5384615384615383, "grad_norm": 2.003485679626465, "learning_rate": 9.82098819969065e-06, "loss": 0.5093, "num_input_tokens_seen": 5452784, "step": 5290 }, { "epoch": 3.5418060200668897, "grad_norm": 1.8779956102371216, "learning_rate": 9.820213407553553e-06, "loss": 0.408, "num_input_tokens_seen": 5457744, "step": 5295 }, { "epoch": 3.5451505016722407, "grad_norm": 1.5724279880523682, "learning_rate": 9.819436973017683e-06, "loss": 0.5094, "num_input_tokens_seen": 5463088, "step": 5300 }, { "epoch": 3.548494983277592, "grad_norm": 2.038264513015747, "learning_rate": 9.818658896347591e-06, "loss": 0.5654, "num_input_tokens_seen": 5467344, "step": 5305 }, { "epoch": 3.551839464882943, "grad_norm": 1.4645026922225952, "learning_rate": 9.817879177808396e-06, "loss": 0.5246, "num_input_tokens_seen": 5472496, "step": 5310 }, { "epoch": 3.5551839464882944, "grad_norm": 1.0094009637832642, "learning_rate": 9.817097817665771e-06, "loss": 0.491, "num_input_tokens_seen": 5477808, "step": 5315 }, { "epoch": 3.5585284280936453, "grad_norm": 1.9784679412841797, "learning_rate": 9.81631481618595e-06, "loss": 0.4939, "num_input_tokens_seen": 5482640, "step": 5320 }, { "epoch": 3.5618729096989967, "grad_norm": 1.0357229709625244, "learning_rate": 9.815530173635725e-06, "loss": 0.4671, "num_input_tokens_seen": 5488400, "step": 5325 }, { "epoch": 3.5652173913043477, "grad_norm": 3.3723011016845703, "learning_rate": 9.814743890282452e-06, "loss": 0.5233, "num_input_tokens_seen": 5494384, "step": 5330 }, { "epoch": 3.568561872909699, "grad_norm": 2.0841281414031982, "learning_rate": 9.81395596639404e-06, "loss": 0.5332, "num_input_tokens_seen": 5499152, "step": 5335 }, { "epoch": 3.57190635451505, "grad_norm": 2.384793519973755, "learning_rate": 9.813166402238958e-06, "loss": 0.5155, "num_input_tokens_seen": 5503824, "step": 5340 }, { "epoch": 3.5752508361204014, "grad_norm": 1.4450467824935913, "learning_rate": 9.81237519808624e-06, "loss": 0.5461, "num_input_tokens_seen": 5509136, "step": 5345 }, { "epoch": 3.5785953177257523, "grad_norm": 1.2538515329360962, "learning_rate": 9.811582354205472e-06, "loss": 0.4416, "num_input_tokens_seen": 5513776, "step": 5350 }, { "epoch": 3.5819397993311037, "grad_norm": 1.705736756324768, "learning_rate": 9.810787870866798e-06, "loss": 0.5238, "num_input_tokens_seen": 5519536, "step": 5355 }, { "epoch": 3.585284280936455, "grad_norm": 1.072234034538269, "learning_rate": 9.809991748340928e-06, "loss": 0.4453, "num_input_tokens_seen": 5525168, "step": 5360 }, { "epoch": 3.588628762541806, "grad_norm": 1.6405885219573975, "learning_rate": 9.809193986899124e-06, "loss": 0.4299, "num_input_tokens_seen": 5529680, "step": 5365 }, { "epoch": 3.591973244147157, "grad_norm": 1.6142538785934448, "learning_rate": 9.808394586813209e-06, "loss": 0.4267, "num_input_tokens_seen": 5534608, "step": 5370 }, { "epoch": 3.5953177257525084, "grad_norm": 1.2995401620864868, "learning_rate": 9.807593548355562e-06, "loss": 0.4951, "num_input_tokens_seen": 5540560, "step": 5375 }, { "epoch": 3.59866220735786, "grad_norm": 1.3209867477416992, "learning_rate": 9.806790871799125e-06, "loss": 0.4952, "num_input_tokens_seen": 5545680, "step": 5380 }, { "epoch": 3.6020066889632107, "grad_norm": 1.7328054904937744, "learning_rate": 9.805986557417396e-06, "loss": 0.5964, "num_input_tokens_seen": 5550704, "step": 5385 }, { "epoch": 3.6053511705685617, "grad_norm": 2.264732599258423, "learning_rate": 9.805180605484424e-06, "loss": 0.4307, "num_input_tokens_seen": 5555824, "step": 5390 }, { "epoch": 3.608695652173913, "grad_norm": 1.4156454801559448, "learning_rate": 9.804373016274828e-06, "loss": 0.4879, "num_input_tokens_seen": 5560816, "step": 5395 }, { "epoch": 3.6120401337792645, "grad_norm": 0.9660384654998779, "learning_rate": 9.803563790063777e-06, "loss": 0.3695, "num_input_tokens_seen": 5565936, "step": 5400 }, { "epoch": 3.6153846153846154, "grad_norm": 1.5310328006744385, "learning_rate": 9.802752927127001e-06, "loss": 0.5008, "num_input_tokens_seen": 5571600, "step": 5405 }, { "epoch": 3.6187290969899664, "grad_norm": 1.2535970211029053, "learning_rate": 9.801940427740786e-06, "loss": 0.4389, "num_input_tokens_seen": 5576144, "step": 5410 }, { "epoch": 3.6220735785953178, "grad_norm": 1.6618144512176514, "learning_rate": 9.801126292181977e-06, "loss": 0.5411, "num_input_tokens_seen": 5582160, "step": 5415 }, { "epoch": 3.625418060200669, "grad_norm": 1.3602298498153687, "learning_rate": 9.800310520727972e-06, "loss": 0.4708, "num_input_tokens_seen": 5586896, "step": 5420 }, { "epoch": 3.62876254180602, "grad_norm": 0.9810209274291992, "learning_rate": 9.799493113656735e-06, "loss": 0.5982, "num_input_tokens_seen": 5591280, "step": 5425 }, { "epoch": 3.632107023411371, "grad_norm": 1.7698974609375, "learning_rate": 9.798674071246781e-06, "loss": 0.4359, "num_input_tokens_seen": 5595792, "step": 5430 }, { "epoch": 3.6354515050167224, "grad_norm": 2.0567147731781006, "learning_rate": 9.797853393777182e-06, "loss": 0.5243, "num_input_tokens_seen": 5601328, "step": 5435 }, { "epoch": 3.638795986622074, "grad_norm": 1.9074183702468872, "learning_rate": 9.797031081527568e-06, "loss": 0.4885, "num_input_tokens_seen": 5607536, "step": 5440 }, { "epoch": 3.6421404682274248, "grad_norm": 1.5998531579971313, "learning_rate": 9.79620713477813e-06, "loss": 0.5533, "num_input_tokens_seen": 5612560, "step": 5445 }, { "epoch": 3.6454849498327757, "grad_norm": 1.0191113948822021, "learning_rate": 9.795381553809612e-06, "loss": 0.5029, "num_input_tokens_seen": 5617936, "step": 5450 }, { "epoch": 3.648829431438127, "grad_norm": 1.4899442195892334, "learning_rate": 9.79455433890331e-06, "loss": 0.4198, "num_input_tokens_seen": 5622928, "step": 5455 }, { "epoch": 3.6521739130434785, "grad_norm": 1.7782179117202759, "learning_rate": 9.79372549034109e-06, "loss": 0.3947, "num_input_tokens_seen": 5627792, "step": 5460 }, { "epoch": 3.6555183946488294, "grad_norm": 1.6837379932403564, "learning_rate": 9.792895008405362e-06, "loss": 0.4244, "num_input_tokens_seen": 5632720, "step": 5465 }, { "epoch": 3.6588628762541804, "grad_norm": 1.4839823246002197, "learning_rate": 9.792062893379102e-06, "loss": 0.4304, "num_input_tokens_seen": 5637808, "step": 5470 }, { "epoch": 3.6622073578595318, "grad_norm": 1.3600717782974243, "learning_rate": 9.791229145545832e-06, "loss": 0.4955, "num_input_tokens_seen": 5642512, "step": 5475 }, { "epoch": 3.665551839464883, "grad_norm": 1.2577606439590454, "learning_rate": 9.790393765189638e-06, "loss": 0.5158, "num_input_tokens_seen": 5647696, "step": 5480 }, { "epoch": 3.668896321070234, "grad_norm": 1.2105118036270142, "learning_rate": 9.789556752595164e-06, "loss": 0.5132, "num_input_tokens_seen": 5652624, "step": 5485 }, { "epoch": 3.672240802675585, "grad_norm": 1.3363617658615112, "learning_rate": 9.788718108047603e-06, "loss": 0.4193, "num_input_tokens_seen": 5658064, "step": 5490 }, { "epoch": 3.6755852842809364, "grad_norm": 1.9777674674987793, "learning_rate": 9.787877831832708e-06, "loss": 0.5314, "num_input_tokens_seen": 5663152, "step": 5495 }, { "epoch": 3.678929765886288, "grad_norm": 2.324758291244507, "learning_rate": 9.787035924236789e-06, "loss": 0.537, "num_input_tokens_seen": 5668016, "step": 5500 }, { "epoch": 3.682274247491639, "grad_norm": 1.6935850381851196, "learning_rate": 9.786192385546708e-06, "loss": 0.4576, "num_input_tokens_seen": 5673776, "step": 5505 }, { "epoch": 3.6856187290969897, "grad_norm": 1.5067476034164429, "learning_rate": 9.78534721604989e-06, "loss": 0.4309, "num_input_tokens_seen": 5679216, "step": 5510 }, { "epoch": 3.688963210702341, "grad_norm": 1.3417198657989502, "learning_rate": 9.784500416034306e-06, "loss": 0.4979, "num_input_tokens_seen": 5684272, "step": 5515 }, { "epoch": 3.6923076923076925, "grad_norm": 1.306695580482483, "learning_rate": 9.783651985788488e-06, "loss": 0.4706, "num_input_tokens_seen": 5689008, "step": 5520 }, { "epoch": 3.6956521739130435, "grad_norm": 1.2987645864486694, "learning_rate": 9.782801925601526e-06, "loss": 0.4349, "num_input_tokens_seen": 5694320, "step": 5525 }, { "epoch": 3.6989966555183944, "grad_norm": 1.8576610088348389, "learning_rate": 9.78195023576306e-06, "loss": 0.4847, "num_input_tokens_seen": 5699024, "step": 5530 }, { "epoch": 3.702341137123746, "grad_norm": 1.5296783447265625, "learning_rate": 9.78109691656329e-06, "loss": 0.4334, "num_input_tokens_seen": 5704464, "step": 5535 }, { "epoch": 3.705685618729097, "grad_norm": 1.387617826461792, "learning_rate": 9.780241968292963e-06, "loss": 0.4242, "num_input_tokens_seen": 5709680, "step": 5540 }, { "epoch": 3.709030100334448, "grad_norm": 1.875403642654419, "learning_rate": 9.779385391243394e-06, "loss": 0.4729, "num_input_tokens_seen": 5715856, "step": 5545 }, { "epoch": 3.712374581939799, "grad_norm": 1.4875361919403076, "learning_rate": 9.778527185706441e-06, "loss": 0.4612, "num_input_tokens_seen": 5721712, "step": 5550 }, { "epoch": 3.7157190635451505, "grad_norm": 1.404881477355957, "learning_rate": 9.777667351974525e-06, "loss": 0.5203, "num_input_tokens_seen": 5726864, "step": 5555 }, { "epoch": 3.719063545150502, "grad_norm": 1.2675316333770752, "learning_rate": 9.776805890340615e-06, "loss": 0.5504, "num_input_tokens_seen": 5732080, "step": 5560 }, { "epoch": 3.722408026755853, "grad_norm": 1.182377576828003, "learning_rate": 9.775942801098241e-06, "loss": 0.5664, "num_input_tokens_seen": 5737232, "step": 5565 }, { "epoch": 3.7257525083612038, "grad_norm": 1.4187140464782715, "learning_rate": 9.775078084541483e-06, "loss": 0.5247, "num_input_tokens_seen": 5742512, "step": 5570 }, { "epoch": 3.729096989966555, "grad_norm": 1.9517536163330078, "learning_rate": 9.77421174096498e-06, "loss": 0.4507, "num_input_tokens_seen": 5746896, "step": 5575 }, { "epoch": 3.7324414715719065, "grad_norm": 1.985817790031433, "learning_rate": 9.773343770663919e-06, "loss": 0.4425, "num_input_tokens_seen": 5752208, "step": 5580 }, { "epoch": 3.7357859531772575, "grad_norm": 1.1771377325057983, "learning_rate": 9.772474173934046e-06, "loss": 0.5057, "num_input_tokens_seen": 5756752, "step": 5585 }, { "epoch": 3.7391304347826084, "grad_norm": 1.4143214225769043, "learning_rate": 9.77160295107166e-06, "loss": 0.4596, "num_input_tokens_seen": 5761424, "step": 5590 }, { "epoch": 3.74247491638796, "grad_norm": 1.4484542608261108, "learning_rate": 9.770730102373615e-06, "loss": 0.4941, "num_input_tokens_seen": 5766128, "step": 5595 }, { "epoch": 3.745819397993311, "grad_norm": 1.337950348854065, "learning_rate": 9.769855628137319e-06, "loss": 0.4486, "num_input_tokens_seen": 5770896, "step": 5600 }, { "epoch": 3.749163879598662, "grad_norm": 2.701982021331787, "learning_rate": 9.76897952866073e-06, "loss": 0.5267, "num_input_tokens_seen": 5776240, "step": 5605 }, { "epoch": 3.7525083612040135, "grad_norm": 1.0447405576705933, "learning_rate": 9.768101804242364e-06, "loss": 0.4125, "num_input_tokens_seen": 5780560, "step": 5610 }, { "epoch": 3.7558528428093645, "grad_norm": 1.3567088842391968, "learning_rate": 9.767222455181291e-06, "loss": 0.5616, "num_input_tokens_seen": 5785840, "step": 5615 }, { "epoch": 3.759197324414716, "grad_norm": 1.8748680353164673, "learning_rate": 9.766341481777132e-06, "loss": 0.4837, "num_input_tokens_seen": 5791472, "step": 5620 }, { "epoch": 3.762541806020067, "grad_norm": 1.3224585056304932, "learning_rate": 9.765458884330061e-06, "loss": 0.477, "num_input_tokens_seen": 5796688, "step": 5625 }, { "epoch": 3.765886287625418, "grad_norm": 1.5901349782943726, "learning_rate": 9.764574663140807e-06, "loss": 0.5162, "num_input_tokens_seen": 5801264, "step": 5630 }, { "epoch": 3.769230769230769, "grad_norm": 1.4338562488555908, "learning_rate": 9.763688818510654e-06, "loss": 0.444, "num_input_tokens_seen": 5805968, "step": 5635 }, { "epoch": 3.7725752508361206, "grad_norm": 1.2785922288894653, "learning_rate": 9.762801350741438e-06, "loss": 0.4854, "num_input_tokens_seen": 5811984, "step": 5640 }, { "epoch": 3.7759197324414715, "grad_norm": 1.232063889503479, "learning_rate": 9.761912260135543e-06, "loss": 0.552, "num_input_tokens_seen": 5817200, "step": 5645 }, { "epoch": 3.779264214046823, "grad_norm": 1.0574508905410767, "learning_rate": 9.761021546995913e-06, "loss": 0.4127, "num_input_tokens_seen": 5821776, "step": 5650 }, { "epoch": 3.782608695652174, "grad_norm": 1.8493643999099731, "learning_rate": 9.760129211626041e-06, "loss": 0.4892, "num_input_tokens_seen": 5826032, "step": 5655 }, { "epoch": 3.7859531772575252, "grad_norm": 1.8231760263442993, "learning_rate": 9.759235254329978e-06, "loss": 0.4602, "num_input_tokens_seen": 5831824, "step": 5660 }, { "epoch": 3.789297658862876, "grad_norm": 1.8435661792755127, "learning_rate": 9.758339675412316e-06, "loss": 0.5093, "num_input_tokens_seen": 5836656, "step": 5665 }, { "epoch": 3.7926421404682276, "grad_norm": 7.036864757537842, "learning_rate": 9.757442475178213e-06, "loss": 0.578, "num_input_tokens_seen": 5841392, "step": 5670 }, { "epoch": 3.7959866220735785, "grad_norm": 1.4494678974151611, "learning_rate": 9.75654365393337e-06, "loss": 0.5527, "num_input_tokens_seen": 5846640, "step": 5675 }, { "epoch": 3.79933110367893, "grad_norm": 1.4112205505371094, "learning_rate": 9.755643211984047e-06, "loss": 0.4582, "num_input_tokens_seen": 5851824, "step": 5680 }, { "epoch": 3.802675585284281, "grad_norm": 1.8925427198410034, "learning_rate": 9.754741149637051e-06, "loss": 0.4541, "num_input_tokens_seen": 5856304, "step": 5685 }, { "epoch": 3.8060200668896322, "grad_norm": 1.7518441677093506, "learning_rate": 9.75383746719974e-06, "loss": 0.548, "num_input_tokens_seen": 5862032, "step": 5690 }, { "epoch": 3.809364548494983, "grad_norm": 1.9883683919906616, "learning_rate": 9.752932164980033e-06, "loss": 0.4866, "num_input_tokens_seen": 5867536, "step": 5695 }, { "epoch": 3.8127090301003346, "grad_norm": 1.3592345714569092, "learning_rate": 9.752025243286393e-06, "loss": 0.4711, "num_input_tokens_seen": 5873040, "step": 5700 }, { "epoch": 3.8160535117056855, "grad_norm": 2.184706211090088, "learning_rate": 9.751116702427836e-06, "loss": 0.5674, "num_input_tokens_seen": 5877872, "step": 5705 }, { "epoch": 3.819397993311037, "grad_norm": 1.155957579612732, "learning_rate": 9.75020654271393e-06, "loss": 0.4637, "num_input_tokens_seen": 5883824, "step": 5710 }, { "epoch": 3.822742474916388, "grad_norm": 1.4869898557662964, "learning_rate": 9.749294764454796e-06, "loss": 0.488, "num_input_tokens_seen": 5888912, "step": 5715 }, { "epoch": 3.8260869565217392, "grad_norm": 1.5178481340408325, "learning_rate": 9.748381367961103e-06, "loss": 0.4127, "num_input_tokens_seen": 5893520, "step": 5720 }, { "epoch": 3.82943143812709, "grad_norm": 1.4387930631637573, "learning_rate": 9.74746635354408e-06, "loss": 0.471, "num_input_tokens_seen": 5898896, "step": 5725 }, { "epoch": 3.8327759197324416, "grad_norm": 1.391258955001831, "learning_rate": 9.746549721515497e-06, "loss": 0.5295, "num_input_tokens_seen": 5904272, "step": 5730 }, { "epoch": 3.8361204013377925, "grad_norm": 1.616763710975647, "learning_rate": 9.74563147218768e-06, "loss": 0.518, "num_input_tokens_seen": 5909104, "step": 5735 }, { "epoch": 3.839464882943144, "grad_norm": 1.7596286535263062, "learning_rate": 9.744711605873504e-06, "loss": 0.4669, "num_input_tokens_seen": 5914608, "step": 5740 }, { "epoch": 3.842809364548495, "grad_norm": 1.516993761062622, "learning_rate": 9.7437901228864e-06, "loss": 0.4934, "num_input_tokens_seen": 5919952, "step": 5745 }, { "epoch": 3.8461538461538463, "grad_norm": 1.1247273683547974, "learning_rate": 9.742867023540346e-06, "loss": 0.664, "num_input_tokens_seen": 5925968, "step": 5750 }, { "epoch": 3.849498327759197, "grad_norm": 1.1489135026931763, "learning_rate": 9.741942308149867e-06, "loss": 0.4559, "num_input_tokens_seen": 5930320, "step": 5755 }, { "epoch": 3.8528428093645486, "grad_norm": 3.405710220336914, "learning_rate": 9.741015977030046e-06, "loss": 0.5295, "num_input_tokens_seen": 5935152, "step": 5760 }, { "epoch": 3.8561872909698995, "grad_norm": 1.36209237575531, "learning_rate": 9.740088030496512e-06, "loss": 0.4505, "num_input_tokens_seen": 5939632, "step": 5765 }, { "epoch": 3.859531772575251, "grad_norm": 1.562391996383667, "learning_rate": 9.739158468865447e-06, "loss": 0.4941, "num_input_tokens_seen": 5944336, "step": 5770 }, { "epoch": 3.862876254180602, "grad_norm": 2.0025877952575684, "learning_rate": 9.738227292453582e-06, "loss": 0.4785, "num_input_tokens_seen": 5949360, "step": 5775 }, { "epoch": 3.8662207357859533, "grad_norm": 1.184965968132019, "learning_rate": 9.737294501578196e-06, "loss": 0.4875, "num_input_tokens_seen": 5953776, "step": 5780 }, { "epoch": 3.869565217391304, "grad_norm": 2.3055343627929688, "learning_rate": 9.736360096557122e-06, "loss": 0.4683, "num_input_tokens_seen": 5960048, "step": 5785 }, { "epoch": 3.8729096989966556, "grad_norm": 1.7038992643356323, "learning_rate": 9.735424077708741e-06, "loss": 0.5544, "num_input_tokens_seen": 5964944, "step": 5790 }, { "epoch": 3.8762541806020065, "grad_norm": 2.089632511138916, "learning_rate": 9.734486445351983e-06, "loss": 0.5654, "num_input_tokens_seen": 5970640, "step": 5795 }, { "epoch": 3.879598662207358, "grad_norm": 1.5968559980392456, "learning_rate": 9.733547199806332e-06, "loss": 0.4576, "num_input_tokens_seen": 5975152, "step": 5800 }, { "epoch": 3.882943143812709, "grad_norm": 1.660024881362915, "learning_rate": 9.732606341391815e-06, "loss": 0.4877, "num_input_tokens_seen": 5980464, "step": 5805 }, { "epoch": 3.8862876254180603, "grad_norm": 1.21007239818573, "learning_rate": 9.731663870429013e-06, "loss": 0.4169, "num_input_tokens_seen": 5984752, "step": 5810 }, { "epoch": 3.8896321070234112, "grad_norm": 2.1161608695983887, "learning_rate": 9.730719787239059e-06, "loss": 0.5587, "num_input_tokens_seen": 5990096, "step": 5815 }, { "epoch": 3.8929765886287626, "grad_norm": 1.0427769422531128, "learning_rate": 9.729774092143627e-06, "loss": 0.4517, "num_input_tokens_seen": 5995184, "step": 5820 }, { "epoch": 3.8963210702341136, "grad_norm": 1.3088239431381226, "learning_rate": 9.728826785464948e-06, "loss": 0.4939, "num_input_tokens_seen": 6000336, "step": 5825 }, { "epoch": 3.899665551839465, "grad_norm": 2.0905685424804688, "learning_rate": 9.727877867525799e-06, "loss": 0.5303, "num_input_tokens_seen": 6005488, "step": 5830 }, { "epoch": 3.903010033444816, "grad_norm": 1.5095936059951782, "learning_rate": 9.726927338649506e-06, "loss": 0.4094, "num_input_tokens_seen": 6010800, "step": 5835 }, { "epoch": 3.9063545150501673, "grad_norm": 1.2459386587142944, "learning_rate": 9.725975199159943e-06, "loss": 0.4677, "num_input_tokens_seen": 6015376, "step": 5840 }, { "epoch": 3.9096989966555182, "grad_norm": 1.1933516263961792, "learning_rate": 9.725021449381536e-06, "loss": 0.5271, "num_input_tokens_seen": 6021360, "step": 5845 }, { "epoch": 3.9130434782608696, "grad_norm": 2.234840154647827, "learning_rate": 9.724066089639257e-06, "loss": 0.5068, "num_input_tokens_seen": 6026096, "step": 5850 }, { "epoch": 3.9163879598662206, "grad_norm": 1.8481135368347168, "learning_rate": 9.723109120258624e-06, "loss": 0.5274, "num_input_tokens_seen": 6032368, "step": 5855 }, { "epoch": 3.919732441471572, "grad_norm": 1.4460917711257935, "learning_rate": 9.722150541565714e-06, "loss": 0.5538, "num_input_tokens_seen": 6036944, "step": 5860 }, { "epoch": 3.9230769230769234, "grad_norm": 1.7921055555343628, "learning_rate": 9.721190353887139e-06, "loss": 0.5559, "num_input_tokens_seen": 6041488, "step": 5865 }, { "epoch": 3.9264214046822743, "grad_norm": 1.4526629447937012, "learning_rate": 9.720228557550065e-06, "loss": 0.4326, "num_input_tokens_seen": 6046672, "step": 5870 }, { "epoch": 3.9297658862876252, "grad_norm": 2.352360963821411, "learning_rate": 9.71926515288221e-06, "loss": 0.4747, "num_input_tokens_seen": 6051920, "step": 5875 }, { "epoch": 3.9331103678929766, "grad_norm": 1.3111037015914917, "learning_rate": 9.718300140211833e-06, "loss": 0.508, "num_input_tokens_seen": 6056784, "step": 5880 }, { "epoch": 3.936454849498328, "grad_norm": 1.3950104713439941, "learning_rate": 9.717333519867747e-06, "loss": 0.4818, "num_input_tokens_seen": 6062064, "step": 5885 }, { "epoch": 3.939799331103679, "grad_norm": 2.0210623741149902, "learning_rate": 9.716365292179309e-06, "loss": 0.5164, "num_input_tokens_seen": 6067600, "step": 5890 }, { "epoch": 3.94314381270903, "grad_norm": 1.781867504119873, "learning_rate": 9.715395457476423e-06, "loss": 0.5251, "num_input_tokens_seen": 6072976, "step": 5895 }, { "epoch": 3.9464882943143813, "grad_norm": 2.280592679977417, "learning_rate": 9.714424016089545e-06, "loss": 0.4339, "num_input_tokens_seen": 6078032, "step": 5900 }, { "epoch": 3.9498327759197327, "grad_norm": 1.2254137992858887, "learning_rate": 9.713450968349674e-06, "loss": 0.4767, "num_input_tokens_seen": 6083728, "step": 5905 }, { "epoch": 3.9531772575250836, "grad_norm": 2.2282426357269287, "learning_rate": 9.71247631458836e-06, "loss": 0.4088, "num_input_tokens_seen": 6089776, "step": 5910 }, { "epoch": 3.9565217391304346, "grad_norm": 1.1553339958190918, "learning_rate": 9.711500055137695e-06, "loss": 0.502, "num_input_tokens_seen": 6094320, "step": 5915 }, { "epoch": 3.959866220735786, "grad_norm": 2.2221975326538086, "learning_rate": 9.710522190330324e-06, "loss": 0.4046, "num_input_tokens_seen": 6099056, "step": 5920 }, { "epoch": 3.9632107023411374, "grad_norm": 1.8267689943313599, "learning_rate": 9.709542720499436e-06, "loss": 0.4413, "num_input_tokens_seen": 6104176, "step": 5925 }, { "epoch": 3.9665551839464883, "grad_norm": 1.5283068418502808, "learning_rate": 9.708561645978766e-06, "loss": 0.4279, "num_input_tokens_seen": 6109392, "step": 5930 }, { "epoch": 3.9698996655518393, "grad_norm": 1.529542326927185, "learning_rate": 9.7075789671026e-06, "loss": 0.5, "num_input_tokens_seen": 6113904, "step": 5935 }, { "epoch": 3.9732441471571907, "grad_norm": 1.300517201423645, "learning_rate": 9.706594684205764e-06, "loss": 0.4904, "num_input_tokens_seen": 6118960, "step": 5940 }, { "epoch": 3.976588628762542, "grad_norm": 1.593293309211731, "learning_rate": 9.705608797623636e-06, "loss": 0.5499, "num_input_tokens_seen": 6124400, "step": 5945 }, { "epoch": 3.979933110367893, "grad_norm": 1.2507573366165161, "learning_rate": 9.704621307692139e-06, "loss": 0.4537, "num_input_tokens_seen": 6130000, "step": 5950 }, { "epoch": 3.983277591973244, "grad_norm": 1.5646883249282837, "learning_rate": 9.703632214747742e-06, "loss": 0.5156, "num_input_tokens_seen": 6135088, "step": 5955 }, { "epoch": 3.9866220735785953, "grad_norm": 1.966611385345459, "learning_rate": 9.702641519127459e-06, "loss": 0.4843, "num_input_tokens_seen": 6139760, "step": 5960 }, { "epoch": 3.9899665551839467, "grad_norm": 1.2783758640289307, "learning_rate": 9.701649221168852e-06, "loss": 0.4765, "num_input_tokens_seen": 6145520, "step": 5965 }, { "epoch": 3.9933110367892977, "grad_norm": 1.2103081941604614, "learning_rate": 9.700655321210029e-06, "loss": 0.4964, "num_input_tokens_seen": 6150736, "step": 5970 }, { "epoch": 3.9966555183946486, "grad_norm": 1.7009046077728271, "learning_rate": 9.699659819589641e-06, "loss": 0.5198, "num_input_tokens_seen": 6155984, "step": 5975 }, { "epoch": 4.0, "grad_norm": 1.642556071281433, "learning_rate": 9.698662716646889e-06, "loss": 0.4576, "num_input_tokens_seen": 6160576, "step": 5980 }, { "epoch": 4.0, "eval_loss": 0.5076645612716675, "eval_runtime": 37.5499, "eval_samples_per_second": 39.814, "eval_steps_per_second": 9.96, "num_input_tokens_seen": 6160576, "step": 5980 }, { "epoch": 4.003344481605351, "grad_norm": 1.2400630712509155, "learning_rate": 9.697664012721515e-06, "loss": 0.5003, "num_input_tokens_seen": 6165824, "step": 5985 }, { "epoch": 4.006688963210703, "grad_norm": 2.98922061920166, "learning_rate": 9.696663708153813e-06, "loss": 0.4783, "num_input_tokens_seen": 6170368, "step": 5990 }, { "epoch": 4.010033444816053, "grad_norm": 1.2936654090881348, "learning_rate": 9.695661803284615e-06, "loss": 0.4929, "num_input_tokens_seen": 6175456, "step": 5995 }, { "epoch": 4.013377926421405, "grad_norm": 1.7832502126693726, "learning_rate": 9.694658298455302e-06, "loss": 0.4976, "num_input_tokens_seen": 6180640, "step": 6000 }, { "epoch": 4.016722408026756, "grad_norm": 1.681467056274414, "learning_rate": 9.693653194007799e-06, "loss": 0.492, "num_input_tokens_seen": 6186400, "step": 6005 }, { "epoch": 4.0200668896321075, "grad_norm": 1.6252071857452393, "learning_rate": 9.692646490284578e-06, "loss": 0.5434, "num_input_tokens_seen": 6192000, "step": 6010 }, { "epoch": 4.023411371237458, "grad_norm": 1.2054234743118286, "learning_rate": 9.691638187628656e-06, "loss": 0.5014, "num_input_tokens_seen": 6197568, "step": 6015 }, { "epoch": 4.026755852842809, "grad_norm": 1.2368043661117554, "learning_rate": 9.690628286383593e-06, "loss": 0.4688, "num_input_tokens_seen": 6202656, "step": 6020 }, { "epoch": 4.030100334448161, "grad_norm": 1.265557885169983, "learning_rate": 9.689616786893491e-06, "loss": 0.5132, "num_input_tokens_seen": 6207360, "step": 6025 }, { "epoch": 4.033444816053512, "grad_norm": 1.193787693977356, "learning_rate": 9.688603689503002e-06, "loss": 0.4714, "num_input_tokens_seen": 6212640, "step": 6030 }, { "epoch": 4.036789297658863, "grad_norm": 1.1785221099853516, "learning_rate": 9.687588994557322e-06, "loss": 0.5408, "num_input_tokens_seen": 6218112, "step": 6035 }, { "epoch": 4.040133779264214, "grad_norm": 2.599172353744507, "learning_rate": 9.686572702402188e-06, "loss": 0.4407, "num_input_tokens_seen": 6223392, "step": 6040 }, { "epoch": 4.043478260869565, "grad_norm": 1.3384788036346436, "learning_rate": 9.685554813383882e-06, "loss": 0.4429, "num_input_tokens_seen": 6228480, "step": 6045 }, { "epoch": 4.046822742474917, "grad_norm": 1.5288225412368774, "learning_rate": 9.684535327849231e-06, "loss": 0.3955, "num_input_tokens_seen": 6233472, "step": 6050 }, { "epoch": 4.050167224080267, "grad_norm": 1.7179043292999268, "learning_rate": 9.683514246145607e-06, "loss": 0.5402, "num_input_tokens_seen": 6238592, "step": 6055 }, { "epoch": 4.053511705685619, "grad_norm": 1.6944324970245361, "learning_rate": 9.682491568620927e-06, "loss": 0.4227, "num_input_tokens_seen": 6243264, "step": 6060 }, { "epoch": 4.05685618729097, "grad_norm": 2.2160239219665527, "learning_rate": 9.681467295623643e-06, "loss": 0.4725, "num_input_tokens_seen": 6248896, "step": 6065 }, { "epoch": 4.0602006688963215, "grad_norm": 1.2820944786071777, "learning_rate": 9.680441427502763e-06, "loss": 0.5137, "num_input_tokens_seen": 6253184, "step": 6070 }, { "epoch": 4.063545150501672, "grad_norm": 1.8075705766677856, "learning_rate": 9.67941396460783e-06, "loss": 0.4113, "num_input_tokens_seen": 6257344, "step": 6075 }, { "epoch": 4.066889632107023, "grad_norm": 1.728959321975708, "learning_rate": 9.678384907288933e-06, "loss": 0.5008, "num_input_tokens_seen": 6262304, "step": 6080 }, { "epoch": 4.070234113712375, "grad_norm": 2.597890615463257, "learning_rate": 9.677354255896706e-06, "loss": 0.4599, "num_input_tokens_seen": 6266848, "step": 6085 }, { "epoch": 4.073578595317726, "grad_norm": 1.357690453529358, "learning_rate": 9.676322010782322e-06, "loss": 0.3794, "num_input_tokens_seen": 6272448, "step": 6090 }, { "epoch": 4.076923076923077, "grad_norm": 2.3607265949249268, "learning_rate": 9.675288172297502e-06, "loss": 0.4279, "num_input_tokens_seen": 6277760, "step": 6095 }, { "epoch": 4.080267558528428, "grad_norm": 1.8288695812225342, "learning_rate": 9.674252740794506e-06, "loss": 0.4405, "num_input_tokens_seen": 6281952, "step": 6100 }, { "epoch": 4.083612040133779, "grad_norm": 1.9306398630142212, "learning_rate": 9.673215716626137e-06, "loss": 0.5007, "num_input_tokens_seen": 6287424, "step": 6105 }, { "epoch": 4.086956521739131, "grad_norm": 1.1984425783157349, "learning_rate": 9.672177100145745e-06, "loss": 0.4091, "num_input_tokens_seen": 6293152, "step": 6110 }, { "epoch": 4.090301003344481, "grad_norm": 1.7067756652832031, "learning_rate": 9.671136891707216e-06, "loss": 0.4243, "num_input_tokens_seen": 6297536, "step": 6115 }, { "epoch": 4.093645484949833, "grad_norm": 1.8803962469100952, "learning_rate": 9.670095091664986e-06, "loss": 0.4927, "num_input_tokens_seen": 6302464, "step": 6120 }, { "epoch": 4.096989966555184, "grad_norm": 1.3017323017120361, "learning_rate": 9.669051700374024e-06, "loss": 0.5441, "num_input_tokens_seen": 6307808, "step": 6125 }, { "epoch": 4.1003344481605355, "grad_norm": 1.6946041584014893, "learning_rate": 9.66800671818985e-06, "loss": 0.5102, "num_input_tokens_seen": 6312768, "step": 6130 }, { "epoch": 4.103678929765886, "grad_norm": 1.780734896659851, "learning_rate": 9.666960145468522e-06, "loss": 0.478, "num_input_tokens_seen": 6317120, "step": 6135 }, { "epoch": 4.107023411371237, "grad_norm": 1.2490663528442383, "learning_rate": 9.665911982566638e-06, "loss": 0.4933, "num_input_tokens_seen": 6322880, "step": 6140 }, { "epoch": 4.110367892976589, "grad_norm": 1.3344132900238037, "learning_rate": 9.664862229841342e-06, "loss": 0.3889, "num_input_tokens_seen": 6328160, "step": 6145 }, { "epoch": 4.11371237458194, "grad_norm": 1.2594189643859863, "learning_rate": 9.66381088765032e-06, "loss": 0.4579, "num_input_tokens_seen": 6333056, "step": 6150 }, { "epoch": 4.117056856187291, "grad_norm": 1.6823636293411255, "learning_rate": 9.662757956351793e-06, "loss": 0.4817, "num_input_tokens_seen": 6338272, "step": 6155 }, { "epoch": 4.120401337792642, "grad_norm": 1.5642259120941162, "learning_rate": 9.66170343630453e-06, "loss": 0.4809, "num_input_tokens_seen": 6343712, "step": 6160 }, { "epoch": 4.1237458193979935, "grad_norm": 1.0961823463439941, "learning_rate": 9.66064732786784e-06, "loss": 0.4066, "num_input_tokens_seen": 6348832, "step": 6165 }, { "epoch": 4.127090301003345, "grad_norm": 1.4806993007659912, "learning_rate": 9.65958963140157e-06, "loss": 0.5347, "num_input_tokens_seen": 6354144, "step": 6170 }, { "epoch": 4.130434782608695, "grad_norm": 1.4625810384750366, "learning_rate": 9.658530347266115e-06, "loss": 0.4063, "num_input_tokens_seen": 6359424, "step": 6175 }, { "epoch": 4.133779264214047, "grad_norm": 1.6625574827194214, "learning_rate": 9.657469475822404e-06, "loss": 0.4865, "num_input_tokens_seen": 6364384, "step": 6180 }, { "epoch": 4.137123745819398, "grad_norm": 1.7619338035583496, "learning_rate": 9.656407017431906e-06, "loss": 0.462, "num_input_tokens_seen": 6370240, "step": 6185 }, { "epoch": 4.1404682274247495, "grad_norm": 1.9442641735076904, "learning_rate": 9.65534297245664e-06, "loss": 0.5534, "num_input_tokens_seen": 6375328, "step": 6190 }, { "epoch": 4.1438127090301, "grad_norm": 1.4111064672470093, "learning_rate": 9.654277341259156e-06, "loss": 0.4556, "num_input_tokens_seen": 6380128, "step": 6195 }, { "epoch": 4.147157190635451, "grad_norm": 2.546337127685547, "learning_rate": 9.65321012420255e-06, "loss": 0.4761, "num_input_tokens_seen": 6384928, "step": 6200 }, { "epoch": 4.150501672240803, "grad_norm": 1.584388017654419, "learning_rate": 9.652141321650454e-06, "loss": 0.4815, "num_input_tokens_seen": 6389504, "step": 6205 }, { "epoch": 4.153846153846154, "grad_norm": 1.6077768802642822, "learning_rate": 9.651070933967047e-06, "loss": 0.4189, "num_input_tokens_seen": 6395136, "step": 6210 }, { "epoch": 4.157190635451505, "grad_norm": 1.6233656406402588, "learning_rate": 9.64999896151704e-06, "loss": 0.5596, "num_input_tokens_seen": 6399680, "step": 6215 }, { "epoch": 4.160535117056856, "grad_norm": 1.4124398231506348, "learning_rate": 9.648925404665688e-06, "loss": 0.5818, "num_input_tokens_seen": 6404320, "step": 6220 }, { "epoch": 4.1638795986622075, "grad_norm": 1.6069576740264893, "learning_rate": 9.647850263778787e-06, "loss": 0.4515, "num_input_tokens_seen": 6410016, "step": 6225 }, { "epoch": 4.167224080267559, "grad_norm": 1.7171790599822998, "learning_rate": 9.646773539222672e-06, "loss": 0.418, "num_input_tokens_seen": 6414880, "step": 6230 }, { "epoch": 4.170568561872909, "grad_norm": 2.3344407081604004, "learning_rate": 9.645695231364217e-06, "loss": 0.4745, "num_input_tokens_seen": 6419680, "step": 6235 }, { "epoch": 4.173913043478261, "grad_norm": 1.4356908798217773, "learning_rate": 9.644615340570833e-06, "loss": 0.4924, "num_input_tokens_seen": 6425120, "step": 6240 }, { "epoch": 4.177257525083612, "grad_norm": 1.1561342477798462, "learning_rate": 9.643533867210477e-06, "loss": 0.445, "num_input_tokens_seen": 6430880, "step": 6245 }, { "epoch": 4.1806020066889635, "grad_norm": 1.2842859029769897, "learning_rate": 9.642450811651635e-06, "loss": 0.4736, "num_input_tokens_seen": 6435424, "step": 6250 }, { "epoch": 4.183946488294314, "grad_norm": 1.819689393043518, "learning_rate": 9.641366174263345e-06, "loss": 0.5015, "num_input_tokens_seen": 6439904, "step": 6255 }, { "epoch": 4.187290969899665, "grad_norm": 1.3667181730270386, "learning_rate": 9.640279955415174e-06, "loss": 0.4045, "num_input_tokens_seen": 6444928, "step": 6260 }, { "epoch": 4.190635451505017, "grad_norm": 1.5514203310012817, "learning_rate": 9.63919215547723e-06, "loss": 0.5292, "num_input_tokens_seen": 6449888, "step": 6265 }, { "epoch": 4.193979933110368, "grad_norm": 1.5031533241271973, "learning_rate": 9.638102774820162e-06, "loss": 0.4959, "num_input_tokens_seen": 6454368, "step": 6270 }, { "epoch": 4.197324414715719, "grad_norm": 1.507818341255188, "learning_rate": 9.637011813815155e-06, "loss": 0.4988, "num_input_tokens_seen": 6460096, "step": 6275 }, { "epoch": 4.20066889632107, "grad_norm": 1.59291410446167, "learning_rate": 9.635919272833938e-06, "loss": 0.5157, "num_input_tokens_seen": 6465664, "step": 6280 }, { "epoch": 4.2040133779264215, "grad_norm": 1.362639307975769, "learning_rate": 9.63482515224877e-06, "loss": 0.487, "num_input_tokens_seen": 6471040, "step": 6285 }, { "epoch": 4.207357859531773, "grad_norm": 1.2472792863845825, "learning_rate": 9.633729452432452e-06, "loss": 0.4003, "num_input_tokens_seen": 6476512, "step": 6290 }, { "epoch": 4.210702341137123, "grad_norm": 1.5739467144012451, "learning_rate": 9.632632173758327e-06, "loss": 0.4386, "num_input_tokens_seen": 6482432, "step": 6295 }, { "epoch": 4.214046822742475, "grad_norm": 1.5346927642822266, "learning_rate": 9.63153331660027e-06, "loss": 0.5174, "num_input_tokens_seen": 6487840, "step": 6300 }, { "epoch": 4.217391304347826, "grad_norm": 1.4912936687469482, "learning_rate": 9.630432881332697e-06, "loss": 0.4646, "num_input_tokens_seen": 6492704, "step": 6305 }, { "epoch": 4.2207357859531776, "grad_norm": 2.1781370639801025, "learning_rate": 9.62933086833056e-06, "loss": 0.5162, "num_input_tokens_seen": 6497184, "step": 6310 }, { "epoch": 4.224080267558528, "grad_norm": 1.8224375247955322, "learning_rate": 9.628227277969351e-06, "loss": 0.4381, "num_input_tokens_seen": 6501792, "step": 6315 }, { "epoch": 4.2274247491638794, "grad_norm": 1.3855311870574951, "learning_rate": 9.627122110625098e-06, "loss": 0.4722, "num_input_tokens_seen": 6506912, "step": 6320 }, { "epoch": 4.230769230769231, "grad_norm": 2.305814504623413, "learning_rate": 9.626015366674366e-06, "loss": 0.5477, "num_input_tokens_seen": 6511584, "step": 6325 }, { "epoch": 4.234113712374582, "grad_norm": 1.6104249954223633, "learning_rate": 9.62490704649426e-06, "loss": 0.4896, "num_input_tokens_seen": 6516704, "step": 6330 }, { "epoch": 4.237458193979933, "grad_norm": 1.1994508504867554, "learning_rate": 9.623797150462412e-06, "loss": 0.5255, "num_input_tokens_seen": 6522080, "step": 6335 }, { "epoch": 4.240802675585284, "grad_norm": 1.6614757776260376, "learning_rate": 9.622685678957007e-06, "loss": 0.4369, "num_input_tokens_seen": 6527968, "step": 6340 }, { "epoch": 4.2441471571906355, "grad_norm": 1.677627682685852, "learning_rate": 9.621572632356754e-06, "loss": 0.4874, "num_input_tokens_seen": 6532800, "step": 6345 }, { "epoch": 4.247491638795987, "grad_norm": 2.1486198902130127, "learning_rate": 9.620458011040906e-06, "loss": 0.4975, "num_input_tokens_seen": 6538752, "step": 6350 }, { "epoch": 4.250836120401337, "grad_norm": 2.5883986949920654, "learning_rate": 9.619341815389245e-06, "loss": 0.5086, "num_input_tokens_seen": 6543936, "step": 6355 }, { "epoch": 4.254180602006689, "grad_norm": 1.7424430847167969, "learning_rate": 9.618224045782098e-06, "loss": 0.5685, "num_input_tokens_seen": 6548992, "step": 6360 }, { "epoch": 4.25752508361204, "grad_norm": 1.8502362966537476, "learning_rate": 9.617104702600324e-06, "loss": 0.4388, "num_input_tokens_seen": 6554400, "step": 6365 }, { "epoch": 4.260869565217392, "grad_norm": 2.161695718765259, "learning_rate": 9.615983786225319e-06, "loss": 0.412, "num_input_tokens_seen": 6558656, "step": 6370 }, { "epoch": 4.264214046822742, "grad_norm": 1.5280736684799194, "learning_rate": 9.614861297039012e-06, "loss": 0.5433, "num_input_tokens_seen": 6563040, "step": 6375 }, { "epoch": 4.2675585284280935, "grad_norm": 1.4314244985580444, "learning_rate": 9.613737235423872e-06, "loss": 0.472, "num_input_tokens_seen": 6568064, "step": 6380 }, { "epoch": 4.270903010033445, "grad_norm": 1.356626272201538, "learning_rate": 9.612611601762902e-06, "loss": 0.4909, "num_input_tokens_seen": 6573952, "step": 6385 }, { "epoch": 4.274247491638796, "grad_norm": 1.718016266822815, "learning_rate": 9.611484396439641e-06, "loss": 0.5086, "num_input_tokens_seen": 6579008, "step": 6390 }, { "epoch": 4.277591973244147, "grad_norm": 2.313969135284424, "learning_rate": 9.610355619838162e-06, "loss": 0.5508, "num_input_tokens_seen": 6584096, "step": 6395 }, { "epoch": 4.280936454849498, "grad_norm": 1.3796037435531616, "learning_rate": 9.60922527234308e-06, "loss": 0.5237, "num_input_tokens_seen": 6588928, "step": 6400 }, { "epoch": 4.2842809364548495, "grad_norm": 2.177358865737915, "learning_rate": 9.608093354339535e-06, "loss": 0.5049, "num_input_tokens_seen": 6593728, "step": 6405 }, { "epoch": 4.287625418060201, "grad_norm": 1.9864119291305542, "learning_rate": 9.606959866213206e-06, "loss": 0.5178, "num_input_tokens_seen": 6598848, "step": 6410 }, { "epoch": 4.290969899665551, "grad_norm": 1.505090594291687, "learning_rate": 9.605824808350314e-06, "loss": 0.491, "num_input_tokens_seen": 6603200, "step": 6415 }, { "epoch": 4.294314381270903, "grad_norm": 1.979443907737732, "learning_rate": 9.604688181137603e-06, "loss": 0.4775, "num_input_tokens_seen": 6608192, "step": 6420 }, { "epoch": 4.297658862876254, "grad_norm": 1.5992522239685059, "learning_rate": 9.603549984962362e-06, "loss": 0.5758, "num_input_tokens_seen": 6613824, "step": 6425 }, { "epoch": 4.301003344481606, "grad_norm": 1.2837152481079102, "learning_rate": 9.602410220212409e-06, "loss": 0.4637, "num_input_tokens_seen": 6618848, "step": 6430 }, { "epoch": 4.304347826086957, "grad_norm": 2.988360643386841, "learning_rate": 9.601268887276097e-06, "loss": 0.552, "num_input_tokens_seen": 6623680, "step": 6435 }, { "epoch": 4.3076923076923075, "grad_norm": 1.2519264221191406, "learning_rate": 9.600125986542314e-06, "loss": 0.3961, "num_input_tokens_seen": 6628384, "step": 6440 }, { "epoch": 4.311036789297659, "grad_norm": 1.890238881111145, "learning_rate": 9.598981518400485e-06, "loss": 0.598, "num_input_tokens_seen": 6633856, "step": 6445 }, { "epoch": 4.31438127090301, "grad_norm": 1.3389090299606323, "learning_rate": 9.597835483240562e-06, "loss": 0.4686, "num_input_tokens_seen": 6638880, "step": 6450 }, { "epoch": 4.317725752508361, "grad_norm": 1.4635651111602783, "learning_rate": 9.596687881453041e-06, "loss": 0.4153, "num_input_tokens_seen": 6644224, "step": 6455 }, { "epoch": 4.321070234113712, "grad_norm": 1.109466552734375, "learning_rate": 9.59553871342894e-06, "loss": 0.453, "num_input_tokens_seen": 6649120, "step": 6460 }, { "epoch": 4.3244147157190636, "grad_norm": 2.0185964107513428, "learning_rate": 9.594387979559822e-06, "loss": 0.5106, "num_input_tokens_seen": 6654368, "step": 6465 }, { "epoch": 4.327759197324415, "grad_norm": 1.834173560142517, "learning_rate": 9.593235680237775e-06, "loss": 0.5237, "num_input_tokens_seen": 6659552, "step": 6470 }, { "epoch": 4.331103678929766, "grad_norm": 2.406099796295166, "learning_rate": 9.592081815855425e-06, "loss": 0.5, "num_input_tokens_seen": 6664832, "step": 6475 }, { "epoch": 4.334448160535117, "grad_norm": 1.5880799293518066, "learning_rate": 9.59092638680593e-06, "loss": 0.4369, "num_input_tokens_seen": 6669600, "step": 6480 }, { "epoch": 4.337792642140468, "grad_norm": 1.5513989925384521, "learning_rate": 9.58976939348298e-06, "loss": 0.4124, "num_input_tokens_seen": 6674592, "step": 6485 }, { "epoch": 4.34113712374582, "grad_norm": 1.1949419975280762, "learning_rate": 9.5886108362808e-06, "loss": 0.534, "num_input_tokens_seen": 6680224, "step": 6490 }, { "epoch": 4.34448160535117, "grad_norm": 1.9786787033081055, "learning_rate": 9.587450715594148e-06, "loss": 0.4746, "num_input_tokens_seen": 6685760, "step": 6495 }, { "epoch": 4.3478260869565215, "grad_norm": 2.3513665199279785, "learning_rate": 9.586289031818311e-06, "loss": 0.4656, "num_input_tokens_seen": 6690688, "step": 6500 }, { "epoch": 4.351170568561873, "grad_norm": 2.0711708068847656, "learning_rate": 9.585125785349115e-06, "loss": 0.3907, "num_input_tokens_seen": 6696192, "step": 6505 }, { "epoch": 4.354515050167224, "grad_norm": 1.1734645366668701, "learning_rate": 9.583960976582914e-06, "loss": 0.4545, "num_input_tokens_seen": 6701120, "step": 6510 }, { "epoch": 4.357859531772576, "grad_norm": 1.5836637020111084, "learning_rate": 9.58279460591659e-06, "loss": 0.4914, "num_input_tokens_seen": 6706176, "step": 6515 }, { "epoch": 4.361204013377926, "grad_norm": 1.6099859476089478, "learning_rate": 9.58162667374757e-06, "loss": 0.5001, "num_input_tokens_seen": 6711744, "step": 6520 }, { "epoch": 4.364548494983278, "grad_norm": 3.060180187225342, "learning_rate": 9.580457180473798e-06, "loss": 0.4643, "num_input_tokens_seen": 6717184, "step": 6525 }, { "epoch": 4.367892976588629, "grad_norm": 2.118075370788574, "learning_rate": 9.579286126493766e-06, "loss": 0.5649, "num_input_tokens_seen": 6722848, "step": 6530 }, { "epoch": 4.3712374581939795, "grad_norm": 1.0872541666030884, "learning_rate": 9.57811351220648e-06, "loss": 0.5421, "num_input_tokens_seen": 6728960, "step": 6535 }, { "epoch": 4.374581939799331, "grad_norm": 2.1907739639282227, "learning_rate": 9.57693933801149e-06, "loss": 0.5396, "num_input_tokens_seen": 6733504, "step": 6540 }, { "epoch": 4.377926421404682, "grad_norm": 1.5354816913604736, "learning_rate": 9.575763604308876e-06, "loss": 0.4431, "num_input_tokens_seen": 6738752, "step": 6545 }, { "epoch": 4.381270903010034, "grad_norm": 1.0632613897323608, "learning_rate": 9.574586311499246e-06, "loss": 0.5142, "num_input_tokens_seen": 6743840, "step": 6550 }, { "epoch": 4.384615384615385, "grad_norm": 1.4548654556274414, "learning_rate": 9.57340745998374e-06, "loss": 0.4083, "num_input_tokens_seen": 6748704, "step": 6555 }, { "epoch": 4.3879598662207355, "grad_norm": 1.7164644002914429, "learning_rate": 9.572227050164034e-06, "loss": 0.4759, "num_input_tokens_seen": 6754112, "step": 6560 }, { "epoch": 4.391304347826087, "grad_norm": 1.6633714437484741, "learning_rate": 9.571045082442327e-06, "loss": 0.5178, "num_input_tokens_seen": 6758816, "step": 6565 }, { "epoch": 4.394648829431438, "grad_norm": 1.168656826019287, "learning_rate": 9.569861557221352e-06, "loss": 0.4515, "num_input_tokens_seen": 6764288, "step": 6570 }, { "epoch": 4.39799331103679, "grad_norm": 1.5694248676300049, "learning_rate": 9.568676474904375e-06, "loss": 0.4278, "num_input_tokens_seen": 6768480, "step": 6575 }, { "epoch": 4.40133779264214, "grad_norm": 1.3797178268432617, "learning_rate": 9.567489835895192e-06, "loss": 0.4248, "num_input_tokens_seen": 6773568, "step": 6580 }, { "epoch": 4.404682274247492, "grad_norm": 1.6312572956085205, "learning_rate": 9.566301640598127e-06, "loss": 0.465, "num_input_tokens_seen": 6778944, "step": 6585 }, { "epoch": 4.408026755852843, "grad_norm": 1.4370825290679932, "learning_rate": 9.565111889418035e-06, "loss": 0.4391, "num_input_tokens_seen": 6783456, "step": 6590 }, { "epoch": 4.411371237458194, "grad_norm": 2.053605079650879, "learning_rate": 9.563920582760304e-06, "loss": 0.4289, "num_input_tokens_seen": 6788832, "step": 6595 }, { "epoch": 4.414715719063545, "grad_norm": 1.9435783624649048, "learning_rate": 9.562727721030846e-06, "loss": 0.5394, "num_input_tokens_seen": 6794784, "step": 6600 }, { "epoch": 4.418060200668896, "grad_norm": 1.215030312538147, "learning_rate": 9.561533304636111e-06, "loss": 0.495, "num_input_tokens_seen": 6800352, "step": 6605 }, { "epoch": 4.421404682274248, "grad_norm": 1.9213085174560547, "learning_rate": 9.560337333983072e-06, "loss": 0.4906, "num_input_tokens_seen": 6804800, "step": 6610 }, { "epoch": 4.424749163879599, "grad_norm": 1.9635891914367676, "learning_rate": 9.559139809479235e-06, "loss": 0.4723, "num_input_tokens_seen": 6810176, "step": 6615 }, { "epoch": 4.4280936454849495, "grad_norm": 1.752457857131958, "learning_rate": 9.55794073153263e-06, "loss": 0.4847, "num_input_tokens_seen": 6814848, "step": 6620 }, { "epoch": 4.431438127090301, "grad_norm": 1.4075608253479004, "learning_rate": 9.556740100551829e-06, "loss": 0.4988, "num_input_tokens_seen": 6820160, "step": 6625 }, { "epoch": 4.434782608695652, "grad_norm": 1.2876145839691162, "learning_rate": 9.555537916945917e-06, "loss": 0.4522, "num_input_tokens_seen": 6825440, "step": 6630 }, { "epoch": 4.438127090301004, "grad_norm": 1.4780402183532715, "learning_rate": 9.55433418112452e-06, "loss": 0.507, "num_input_tokens_seen": 6830400, "step": 6635 }, { "epoch": 4.441471571906354, "grad_norm": 1.22413969039917, "learning_rate": 9.553128893497788e-06, "loss": 0.4823, "num_input_tokens_seen": 6834880, "step": 6640 }, { "epoch": 4.444816053511706, "grad_norm": 1.1377878189086914, "learning_rate": 9.551922054476402e-06, "loss": 0.4001, "num_input_tokens_seen": 6840096, "step": 6645 }, { "epoch": 4.448160535117057, "grad_norm": 1.4049345254898071, "learning_rate": 9.550713664471566e-06, "loss": 0.4888, "num_input_tokens_seen": 6844928, "step": 6650 }, { "epoch": 4.451505016722408, "grad_norm": 1.3615474700927734, "learning_rate": 9.54950372389502e-06, "loss": 0.4951, "num_input_tokens_seen": 6851136, "step": 6655 }, { "epoch": 4.454849498327759, "grad_norm": 1.834377408027649, "learning_rate": 9.548292233159027e-06, "loss": 0.5162, "num_input_tokens_seen": 6856800, "step": 6660 }, { "epoch": 4.45819397993311, "grad_norm": 1.0012223720550537, "learning_rate": 9.547079192676382e-06, "loss": 0.3996, "num_input_tokens_seen": 6862720, "step": 6665 }, { "epoch": 4.461538461538462, "grad_norm": 1.3678277730941772, "learning_rate": 9.545864602860406e-06, "loss": 0.4995, "num_input_tokens_seen": 6867904, "step": 6670 }, { "epoch": 4.464882943143813, "grad_norm": 1.838553547859192, "learning_rate": 9.544648464124946e-06, "loss": 0.4612, "num_input_tokens_seen": 6872736, "step": 6675 }, { "epoch": 4.468227424749164, "grad_norm": 1.6102614402770996, "learning_rate": 9.543430776884378e-06, "loss": 0.5381, "num_input_tokens_seen": 6877696, "step": 6680 }, { "epoch": 4.471571906354515, "grad_norm": 1.6173020601272583, "learning_rate": 9.542211541553613e-06, "loss": 0.4946, "num_input_tokens_seen": 6882336, "step": 6685 }, { "epoch": 4.474916387959866, "grad_norm": 1.4223144054412842, "learning_rate": 9.540990758548077e-06, "loss": 0.492, "num_input_tokens_seen": 6887872, "step": 6690 }, { "epoch": 4.478260869565218, "grad_norm": 1.5512763261795044, "learning_rate": 9.539768428283731e-06, "loss": 0.5353, "num_input_tokens_seen": 6893696, "step": 6695 }, { "epoch": 4.481605351170568, "grad_norm": 1.555184006690979, "learning_rate": 9.53854455117706e-06, "loss": 0.4364, "num_input_tokens_seen": 6898560, "step": 6700 }, { "epoch": 4.48494983277592, "grad_norm": 1.849364995956421, "learning_rate": 9.53731912764508e-06, "loss": 0.5108, "num_input_tokens_seen": 6904064, "step": 6705 }, { "epoch": 4.488294314381271, "grad_norm": 2.1166446208953857, "learning_rate": 9.53609215810533e-06, "loss": 0.5092, "num_input_tokens_seen": 6909056, "step": 6710 }, { "epoch": 4.491638795986622, "grad_norm": 1.1863092184066772, "learning_rate": 9.534863642975878e-06, "loss": 0.4262, "num_input_tokens_seen": 6914336, "step": 6715 }, { "epoch": 4.494983277591973, "grad_norm": 1.90667724609375, "learning_rate": 9.533633582675316e-06, "loss": 0.501, "num_input_tokens_seen": 6919296, "step": 6720 }, { "epoch": 4.498327759197324, "grad_norm": 1.688531517982483, "learning_rate": 9.532401977622768e-06, "loss": 0.5333, "num_input_tokens_seen": 6925344, "step": 6725 }, { "epoch": 4.501672240802676, "grad_norm": 1.2783350944519043, "learning_rate": 9.531168828237878e-06, "loss": 0.4087, "num_input_tokens_seen": 6930656, "step": 6730 }, { "epoch": 4.505016722408027, "grad_norm": 1.7271029949188232, "learning_rate": 9.529934134940819e-06, "loss": 0.5203, "num_input_tokens_seen": 6935392, "step": 6735 }, { "epoch": 4.508361204013378, "grad_norm": 1.0698912143707275, "learning_rate": 9.52869789815229e-06, "loss": 0.573, "num_input_tokens_seen": 6940704, "step": 6740 }, { "epoch": 4.511705685618729, "grad_norm": 1.3498759269714355, "learning_rate": 9.527460118293515e-06, "loss": 0.3459, "num_input_tokens_seen": 6945248, "step": 6745 }, { "epoch": 4.51505016722408, "grad_norm": 1.6644046306610107, "learning_rate": 9.526220795786248e-06, "loss": 0.4029, "num_input_tokens_seen": 6950624, "step": 6750 }, { "epoch": 4.518394648829432, "grad_norm": 1.4025763273239136, "learning_rate": 9.524979931052763e-06, "loss": 0.5682, "num_input_tokens_seen": 6954880, "step": 6755 }, { "epoch": 4.521739130434782, "grad_norm": 1.2670841217041016, "learning_rate": 9.52373752451586e-06, "loss": 0.5169, "num_input_tokens_seen": 6960192, "step": 6760 }, { "epoch": 4.525083612040134, "grad_norm": 1.4969706535339355, "learning_rate": 9.52249357659887e-06, "loss": 0.4197, "num_input_tokens_seen": 6965216, "step": 6765 }, { "epoch": 4.528428093645485, "grad_norm": 2.064723014831543, "learning_rate": 9.521248087725641e-06, "loss": 0.4414, "num_input_tokens_seen": 6969760, "step": 6770 }, { "epoch": 4.531772575250836, "grad_norm": 1.6110395193099976, "learning_rate": 9.520001058320554e-06, "loss": 0.4249, "num_input_tokens_seen": 6974688, "step": 6775 }, { "epoch": 4.535117056856187, "grad_norm": 1.6168118715286255, "learning_rate": 9.51875248880851e-06, "loss": 0.5324, "num_input_tokens_seen": 6979776, "step": 6780 }, { "epoch": 4.538461538461538, "grad_norm": 2.2229127883911133, "learning_rate": 9.517502379614936e-06, "loss": 0.4876, "num_input_tokens_seen": 6985312, "step": 6785 }, { "epoch": 4.54180602006689, "grad_norm": 1.8537768125534058, "learning_rate": 9.516250731165783e-06, "loss": 0.441, "num_input_tokens_seen": 6990752, "step": 6790 }, { "epoch": 4.545150501672241, "grad_norm": 1.1929916143417358, "learning_rate": 9.51499754388753e-06, "loss": 0.369, "num_input_tokens_seen": 6995712, "step": 6795 }, { "epoch": 4.548494983277592, "grad_norm": 1.193973183631897, "learning_rate": 9.513742818207173e-06, "loss": 0.4808, "num_input_tokens_seen": 7000544, "step": 6800 }, { "epoch": 4.551839464882943, "grad_norm": 1.4485467672348022, "learning_rate": 9.512486554552238e-06, "loss": 0.5258, "num_input_tokens_seen": 7005760, "step": 6805 }, { "epoch": 4.555183946488294, "grad_norm": 1.3461065292358398, "learning_rate": 9.511228753350774e-06, "loss": 0.462, "num_input_tokens_seen": 7010752, "step": 6810 }, { "epoch": 4.558528428093646, "grad_norm": 1.1360877752304077, "learning_rate": 9.509969415031356e-06, "loss": 0.4765, "num_input_tokens_seen": 7015584, "step": 6815 }, { "epoch": 4.561872909698996, "grad_norm": 1.8250608444213867, "learning_rate": 9.508708540023077e-06, "loss": 0.3581, "num_input_tokens_seen": 7020288, "step": 6820 }, { "epoch": 4.565217391304348, "grad_norm": 1.0680062770843506, "learning_rate": 9.50744612875556e-06, "loss": 0.3853, "num_input_tokens_seen": 7024992, "step": 6825 }, { "epoch": 4.568561872909699, "grad_norm": 1.23674738407135, "learning_rate": 9.506182181658944e-06, "loss": 0.4721, "num_input_tokens_seen": 7031360, "step": 6830 }, { "epoch": 4.5719063545150505, "grad_norm": 1.9438718557357788, "learning_rate": 9.5049166991639e-06, "loss": 0.5314, "num_input_tokens_seen": 7036608, "step": 6835 }, { "epoch": 4.575250836120401, "grad_norm": 1.6289198398590088, "learning_rate": 9.503649681701614e-06, "loss": 0.5305, "num_input_tokens_seen": 7041920, "step": 6840 }, { "epoch": 4.578595317725752, "grad_norm": 1.5990676879882812, "learning_rate": 9.502381129703801e-06, "loss": 0.4167, "num_input_tokens_seen": 7047072, "step": 6845 }, { "epoch": 4.581939799331104, "grad_norm": 1.2810171842575073, "learning_rate": 9.5011110436027e-06, "loss": 0.4325, "num_input_tokens_seen": 7052736, "step": 6850 }, { "epoch": 4.585284280936455, "grad_norm": 1.2154302597045898, "learning_rate": 9.499839423831062e-06, "loss": 0.4195, "num_input_tokens_seen": 7058240, "step": 6855 }, { "epoch": 4.588628762541806, "grad_norm": 1.6751405000686646, "learning_rate": 9.498566270822172e-06, "loss": 0.5478, "num_input_tokens_seen": 7063520, "step": 6860 }, { "epoch": 4.591973244147157, "grad_norm": 1.8421123027801514, "learning_rate": 9.497291585009834e-06, "loss": 0.499, "num_input_tokens_seen": 7068832, "step": 6865 }, { "epoch": 4.595317725752508, "grad_norm": 1.798077940940857, "learning_rate": 9.496015366828373e-06, "loss": 0.5115, "num_input_tokens_seen": 7074528, "step": 6870 }, { "epoch": 4.59866220735786, "grad_norm": 2.2505433559417725, "learning_rate": 9.494737616712638e-06, "loss": 0.521, "num_input_tokens_seen": 7079296, "step": 6875 }, { "epoch": 4.602006688963211, "grad_norm": 2.336430311203003, "learning_rate": 9.493458335097996e-06, "loss": 0.5122, "num_input_tokens_seen": 7084512, "step": 6880 }, { "epoch": 4.605351170568562, "grad_norm": 1.5947612524032593, "learning_rate": 9.492177522420341e-06, "loss": 0.4449, "num_input_tokens_seen": 7089728, "step": 6885 }, { "epoch": 4.608695652173913, "grad_norm": 1.1312522888183594, "learning_rate": 9.490895179116085e-06, "loss": 0.4116, "num_input_tokens_seen": 7094976, "step": 6890 }, { "epoch": 4.6120401337792645, "grad_norm": 1.6248265504837036, "learning_rate": 9.489611305622162e-06, "loss": 0.4705, "num_input_tokens_seen": 7100352, "step": 6895 }, { "epoch": 4.615384615384615, "grad_norm": 1.2886854410171509, "learning_rate": 9.48832590237603e-06, "loss": 0.4837, "num_input_tokens_seen": 7106816, "step": 6900 }, { "epoch": 4.618729096989966, "grad_norm": 2.0500171184539795, "learning_rate": 9.487038969815665e-06, "loss": 0.3386, "num_input_tokens_seen": 7111136, "step": 6905 }, { "epoch": 4.622073578595318, "grad_norm": 1.474870204925537, "learning_rate": 9.485750508379568e-06, "loss": 0.45, "num_input_tokens_seen": 7116512, "step": 6910 }, { "epoch": 4.625418060200669, "grad_norm": 1.255368947982788, "learning_rate": 9.484460518506756e-06, "loss": 0.3791, "num_input_tokens_seen": 7121952, "step": 6915 }, { "epoch": 4.6287625418060205, "grad_norm": 0.9428399205207825, "learning_rate": 9.48316900063677e-06, "loss": 0.4886, "num_input_tokens_seen": 7127232, "step": 6920 }, { "epoch": 4.632107023411371, "grad_norm": 1.5717159509658813, "learning_rate": 9.481875955209671e-06, "loss": 0.423, "num_input_tokens_seen": 7132000, "step": 6925 }, { "epoch": 4.635451505016722, "grad_norm": 2.5163042545318604, "learning_rate": 9.480581382666041e-06, "loss": 0.4792, "num_input_tokens_seen": 7136928, "step": 6930 }, { "epoch": 4.638795986622074, "grad_norm": 2.189089298248291, "learning_rate": 9.47928528344698e-06, "loss": 0.5547, "num_input_tokens_seen": 7141824, "step": 6935 }, { "epoch": 4.642140468227424, "grad_norm": 2.1505751609802246, "learning_rate": 9.47798765799411e-06, "loss": 0.5703, "num_input_tokens_seen": 7147200, "step": 6940 }, { "epoch": 4.645484949832776, "grad_norm": 1.4321726560592651, "learning_rate": 9.476688506749576e-06, "loss": 0.387, "num_input_tokens_seen": 7152224, "step": 6945 }, { "epoch": 4.648829431438127, "grad_norm": 1.9888168573379517, "learning_rate": 9.475387830156038e-06, "loss": 0.6709, "num_input_tokens_seen": 7157888, "step": 6950 }, { "epoch": 4.6521739130434785, "grad_norm": 2.3119983673095703, "learning_rate": 9.474085628656675e-06, "loss": 0.5137, "num_input_tokens_seen": 7163168, "step": 6955 }, { "epoch": 4.65551839464883, "grad_norm": 1.8454924821853638, "learning_rate": 9.472781902695192e-06, "loss": 0.506, "num_input_tokens_seen": 7167936, "step": 6960 }, { "epoch": 4.65886287625418, "grad_norm": 1.3630167245864868, "learning_rate": 9.471476652715805e-06, "loss": 0.3927, "num_input_tokens_seen": 7173088, "step": 6965 }, { "epoch": 4.662207357859532, "grad_norm": 1.3978244066238403, "learning_rate": 9.47016987916326e-06, "loss": 0.4943, "num_input_tokens_seen": 7178432, "step": 6970 }, { "epoch": 4.665551839464883, "grad_norm": 1.3343729972839355, "learning_rate": 9.468861582482811e-06, "loss": 0.475, "num_input_tokens_seen": 7183360, "step": 6975 }, { "epoch": 4.668896321070234, "grad_norm": 1.2478084564208984, "learning_rate": 9.467551763120237e-06, "loss": 0.5557, "num_input_tokens_seen": 7188992, "step": 6980 }, { "epoch": 4.672240802675585, "grad_norm": 1.3869543075561523, "learning_rate": 9.466240421521837e-06, "loss": 0.4224, "num_input_tokens_seen": 7193888, "step": 6985 }, { "epoch": 4.6755852842809364, "grad_norm": 1.5446810722351074, "learning_rate": 9.464927558134424e-06, "loss": 0.5034, "num_input_tokens_seen": 7199008, "step": 6990 }, { "epoch": 4.678929765886288, "grad_norm": 1.4780374765396118, "learning_rate": 9.463613173405335e-06, "loss": 0.5078, "num_input_tokens_seen": 7203328, "step": 6995 }, { "epoch": 4.682274247491639, "grad_norm": 2.0998375415802, "learning_rate": 9.462297267782418e-06, "loss": 0.5252, "num_input_tokens_seen": 7208768, "step": 7000 }, { "epoch": 4.68561872909699, "grad_norm": 1.7586781978607178, "learning_rate": 9.460979841714047e-06, "loss": 0.5368, "num_input_tokens_seen": 7213664, "step": 7005 }, { "epoch": 4.688963210702341, "grad_norm": 1.7496862411499023, "learning_rate": 9.459660895649107e-06, "loss": 0.5535, "num_input_tokens_seen": 7219456, "step": 7010 }, { "epoch": 4.6923076923076925, "grad_norm": 2.1350183486938477, "learning_rate": 9.45834043003701e-06, "loss": 0.4257, "num_input_tokens_seen": 7223872, "step": 7015 }, { "epoch": 4.695652173913043, "grad_norm": 1.4220963716506958, "learning_rate": 9.457018445327674e-06, "loss": 0.5079, "num_input_tokens_seen": 7229120, "step": 7020 }, { "epoch": 4.698996655518394, "grad_norm": 1.3177319765090942, "learning_rate": 9.455694941971548e-06, "loss": 0.5645, "num_input_tokens_seen": 7234560, "step": 7025 }, { "epoch": 4.702341137123746, "grad_norm": 1.751773476600647, "learning_rate": 9.454369920419584e-06, "loss": 0.5172, "num_input_tokens_seen": 7240256, "step": 7030 }, { "epoch": 4.705685618729097, "grad_norm": 1.566832423210144, "learning_rate": 9.453043381123264e-06, "loss": 0.4897, "num_input_tokens_seen": 7245088, "step": 7035 }, { "epoch": 4.709030100334449, "grad_norm": 2.033029794692993, "learning_rate": 9.45171532453458e-06, "loss": 0.5201, "num_input_tokens_seen": 7249952, "step": 7040 }, { "epoch": 4.712374581939799, "grad_norm": 1.2945904731750488, "learning_rate": 9.450385751106042e-06, "loss": 0.4724, "num_input_tokens_seen": 7255168, "step": 7045 }, { "epoch": 4.7157190635451505, "grad_norm": 1.4723953008651733, "learning_rate": 9.449054661290677e-06, "loss": 0.4358, "num_input_tokens_seen": 7259872, "step": 7050 }, { "epoch": 4.719063545150502, "grad_norm": 1.755897045135498, "learning_rate": 9.447722055542032e-06, "loss": 0.5533, "num_input_tokens_seen": 7265216, "step": 7055 }, { "epoch": 4.722408026755852, "grad_norm": 2.0898756980895996, "learning_rate": 9.446387934314167e-06, "loss": 0.4592, "num_input_tokens_seen": 7271008, "step": 7060 }, { "epoch": 4.725752508361204, "grad_norm": 1.373331069946289, "learning_rate": 9.445052298061657e-06, "loss": 0.5225, "num_input_tokens_seen": 7276096, "step": 7065 }, { "epoch": 4.729096989966555, "grad_norm": 1.9651139974594116, "learning_rate": 9.443715147239598e-06, "loss": 0.4648, "num_input_tokens_seen": 7280608, "step": 7070 }, { "epoch": 4.7324414715719065, "grad_norm": 2.028477430343628, "learning_rate": 9.442376482303598e-06, "loss": 0.4316, "num_input_tokens_seen": 7285248, "step": 7075 }, { "epoch": 4.735785953177258, "grad_norm": 1.111504077911377, "learning_rate": 9.441036303709782e-06, "loss": 0.4327, "num_input_tokens_seen": 7290816, "step": 7080 }, { "epoch": 4.739130434782608, "grad_norm": 1.2003682851791382, "learning_rate": 9.439694611914796e-06, "loss": 0.4376, "num_input_tokens_seen": 7296032, "step": 7085 }, { "epoch": 4.74247491638796, "grad_norm": 2.060974359512329, "learning_rate": 9.438351407375788e-06, "loss": 0.5109, "num_input_tokens_seen": 7301280, "step": 7090 }, { "epoch": 4.745819397993311, "grad_norm": 1.940112829208374, "learning_rate": 9.437006690550438e-06, "loss": 0.4278, "num_input_tokens_seen": 7306560, "step": 7095 }, { "epoch": 4.749163879598662, "grad_norm": 1.6315807104110718, "learning_rate": 9.435660461896928e-06, "loss": 0.5079, "num_input_tokens_seen": 7312096, "step": 7100 }, { "epoch": 4.752508361204013, "grad_norm": 1.2846170663833618, "learning_rate": 9.434312721873965e-06, "loss": 0.4697, "num_input_tokens_seen": 7317120, "step": 7105 }, { "epoch": 4.7558528428093645, "grad_norm": 1.301581859588623, "learning_rate": 9.432963470940762e-06, "loss": 0.5332, "num_input_tokens_seen": 7321856, "step": 7110 }, { "epoch": 4.759197324414716, "grad_norm": 1.2097184658050537, "learning_rate": 9.431612709557054e-06, "loss": 0.6094, "num_input_tokens_seen": 7327680, "step": 7115 }, { "epoch": 4.762541806020067, "grad_norm": 1.3200092315673828, "learning_rate": 9.43026043818309e-06, "loss": 0.4823, "num_input_tokens_seen": 7332992, "step": 7120 }, { "epoch": 4.765886287625418, "grad_norm": 1.3179675340652466, "learning_rate": 9.428906657279629e-06, "loss": 0.4516, "num_input_tokens_seen": 7338144, "step": 7125 }, { "epoch": 4.769230769230769, "grad_norm": 1.8302197456359863, "learning_rate": 9.427551367307945e-06, "loss": 0.413, "num_input_tokens_seen": 7343328, "step": 7130 }, { "epoch": 4.7725752508361206, "grad_norm": 1.3593873977661133, "learning_rate": 9.426194568729832e-06, "loss": 0.5087, "num_input_tokens_seen": 7348448, "step": 7135 }, { "epoch": 4.775919732441472, "grad_norm": 1.222615122795105, "learning_rate": 9.42483626200759e-06, "loss": 0.3907, "num_input_tokens_seen": 7352928, "step": 7140 }, { "epoch": 4.7792642140468224, "grad_norm": 1.3376291990280151, "learning_rate": 9.423476447604042e-06, "loss": 0.3866, "num_input_tokens_seen": 7358336, "step": 7145 }, { "epoch": 4.782608695652174, "grad_norm": 1.2224581241607666, "learning_rate": 9.422115125982516e-06, "loss": 0.4518, "num_input_tokens_seen": 7363040, "step": 7150 }, { "epoch": 4.785953177257525, "grad_norm": 1.9049434661865234, "learning_rate": 9.420752297606857e-06, "loss": 0.4393, "num_input_tokens_seen": 7367392, "step": 7155 }, { "epoch": 4.789297658862877, "grad_norm": 1.9196215867996216, "learning_rate": 9.419387962941426e-06, "loss": 0.5216, "num_input_tokens_seen": 7373984, "step": 7160 }, { "epoch": 4.792642140468227, "grad_norm": 1.8743829727172852, "learning_rate": 9.418022122451093e-06, "loss": 0.4961, "num_input_tokens_seen": 7379712, "step": 7165 }, { "epoch": 4.7959866220735785, "grad_norm": 2.272832155227661, "learning_rate": 9.416654776601245e-06, "loss": 0.5237, "num_input_tokens_seen": 7384992, "step": 7170 }, { "epoch": 4.79933110367893, "grad_norm": 2.0784687995910645, "learning_rate": 9.415285925857778e-06, "loss": 0.4933, "num_input_tokens_seen": 7390208, "step": 7175 }, { "epoch": 4.802675585284281, "grad_norm": 1.3294384479522705, "learning_rate": 9.413915570687102e-06, "loss": 0.474, "num_input_tokens_seen": 7395776, "step": 7180 }, { "epoch": 4.806020066889632, "grad_norm": 1.3701108694076538, "learning_rate": 9.41254371155614e-06, "loss": 0.4712, "num_input_tokens_seen": 7400544, "step": 7185 }, { "epoch": 4.809364548494983, "grad_norm": 2.66517972946167, "learning_rate": 9.411170348932333e-06, "loss": 0.536, "num_input_tokens_seen": 7405696, "step": 7190 }, { "epoch": 4.812709030100335, "grad_norm": 1.3986282348632812, "learning_rate": 9.409795483283622e-06, "loss": 0.5064, "num_input_tokens_seen": 7410528, "step": 7195 }, { "epoch": 4.816053511705686, "grad_norm": 1.3543009757995605, "learning_rate": 9.40841911507847e-06, "loss": 0.5044, "num_input_tokens_seen": 7416288, "step": 7200 }, { "epoch": 4.8193979933110365, "grad_norm": 1.5855482816696167, "learning_rate": 9.407041244785851e-06, "loss": 0.497, "num_input_tokens_seen": 7421824, "step": 7205 }, { "epoch": 4.822742474916388, "grad_norm": 1.9397317171096802, "learning_rate": 9.405661872875245e-06, "loss": 0.481, "num_input_tokens_seen": 7426528, "step": 7210 }, { "epoch": 4.826086956521739, "grad_norm": 2.323287010192871, "learning_rate": 9.40428099981665e-06, "loss": 0.4297, "num_input_tokens_seen": 7430496, "step": 7215 }, { "epoch": 4.829431438127091, "grad_norm": 1.2335481643676758, "learning_rate": 9.402898626080573e-06, "loss": 0.442, "num_input_tokens_seen": 7435872, "step": 7220 }, { "epoch": 4.832775919732441, "grad_norm": 1.2244621515274048, "learning_rate": 9.401514752138033e-06, "loss": 0.4699, "num_input_tokens_seen": 7441312, "step": 7225 }, { "epoch": 4.8361204013377925, "grad_norm": 0.9128587245941162, "learning_rate": 9.400129378460556e-06, "loss": 0.3746, "num_input_tokens_seen": 7446432, "step": 7230 }, { "epoch": 4.839464882943144, "grad_norm": 2.5148046016693115, "learning_rate": 9.398742505520186e-06, "loss": 0.4427, "num_input_tokens_seen": 7450656, "step": 7235 }, { "epoch": 4.842809364548495, "grad_norm": 2.595768451690674, "learning_rate": 9.397354133789474e-06, "loss": 0.4415, "num_input_tokens_seen": 7455616, "step": 7240 }, { "epoch": 4.846153846153846, "grad_norm": 1.5398954153060913, "learning_rate": 9.395964263741482e-06, "loss": 0.5522, "num_input_tokens_seen": 7460800, "step": 7245 }, { "epoch": 4.849498327759197, "grad_norm": 2.076134443283081, "learning_rate": 9.39457289584978e-06, "loss": 0.5287, "num_input_tokens_seen": 7465664, "step": 7250 }, { "epoch": 4.852842809364549, "grad_norm": 1.7908252477645874, "learning_rate": 9.393180030588454e-06, "loss": 0.4971, "num_input_tokens_seen": 7471072, "step": 7255 }, { "epoch": 4.8561872909699, "grad_norm": 1.5391790866851807, "learning_rate": 9.391785668432094e-06, "loss": 0.5881, "num_input_tokens_seen": 7476832, "step": 7260 }, { "epoch": 4.8595317725752505, "grad_norm": 1.6968743801116943, "learning_rate": 9.390389809855806e-06, "loss": 0.4672, "num_input_tokens_seen": 7481632, "step": 7265 }, { "epoch": 4.862876254180602, "grad_norm": 1.9306151866912842, "learning_rate": 9.388992455335204e-06, "loss": 0.3779, "num_input_tokens_seen": 7486560, "step": 7270 }, { "epoch": 4.866220735785953, "grad_norm": 1.4776610136032104, "learning_rate": 9.387593605346408e-06, "loss": 0.5788, "num_input_tokens_seen": 7492384, "step": 7275 }, { "epoch": 4.869565217391305, "grad_norm": 2.1678974628448486, "learning_rate": 9.386193260366052e-06, "loss": 0.5053, "num_input_tokens_seen": 7497344, "step": 7280 }, { "epoch": 4.872909698996655, "grad_norm": 1.2845898866653442, "learning_rate": 9.384791420871275e-06, "loss": 0.4297, "num_input_tokens_seen": 7502112, "step": 7285 }, { "epoch": 4.8762541806020065, "grad_norm": 3.526175022125244, "learning_rate": 9.383388087339732e-06, "loss": 0.5659, "num_input_tokens_seen": 7507104, "step": 7290 }, { "epoch": 4.879598662207358, "grad_norm": 0.998020350933075, "learning_rate": 9.381983260249578e-06, "loss": 0.4725, "num_input_tokens_seen": 7512672, "step": 7295 }, { "epoch": 4.882943143812709, "grad_norm": 1.6014738082885742, "learning_rate": 9.380576940079488e-06, "loss": 0.5197, "num_input_tokens_seen": 7517952, "step": 7300 }, { "epoch": 4.88628762541806, "grad_norm": 1.2158056497573853, "learning_rate": 9.379169127308635e-06, "loss": 0.481, "num_input_tokens_seen": 7523040, "step": 7305 }, { "epoch": 4.889632107023411, "grad_norm": 2.011754274368286, "learning_rate": 9.377759822416708e-06, "loss": 0.4894, "num_input_tokens_seen": 7528416, "step": 7310 }, { "epoch": 4.892976588628763, "grad_norm": 1.4055380821228027, "learning_rate": 9.3763490258839e-06, "loss": 0.4783, "num_input_tokens_seen": 7534048, "step": 7315 }, { "epoch": 4.896321070234114, "grad_norm": 1.1158394813537598, "learning_rate": 9.374936738190913e-06, "loss": 0.5346, "num_input_tokens_seen": 7538912, "step": 7320 }, { "epoch": 4.8996655518394645, "grad_norm": 2.5755176544189453, "learning_rate": 9.37352295981896e-06, "loss": 0.5526, "num_input_tokens_seen": 7543968, "step": 7325 }, { "epoch": 4.903010033444816, "grad_norm": 1.5829404592514038, "learning_rate": 9.37210769124976e-06, "loss": 0.4126, "num_input_tokens_seen": 7548864, "step": 7330 }, { "epoch": 4.906354515050167, "grad_norm": 1.6693886518478394, "learning_rate": 9.370690932965538e-06, "loss": 0.4415, "num_input_tokens_seen": 7554048, "step": 7335 }, { "epoch": 4.909698996655519, "grad_norm": 1.5409700870513916, "learning_rate": 9.36927268544903e-06, "loss": 0.5105, "num_input_tokens_seen": 7559648, "step": 7340 }, { "epoch": 4.913043478260869, "grad_norm": 1.6620979309082031, "learning_rate": 9.367852949183473e-06, "loss": 0.3778, "num_input_tokens_seen": 7565344, "step": 7345 }, { "epoch": 4.916387959866221, "grad_norm": 2.477195978164673, "learning_rate": 9.366431724652623e-06, "loss": 0.4658, "num_input_tokens_seen": 7570752, "step": 7350 }, { "epoch": 4.919732441471572, "grad_norm": 1.6785011291503906, "learning_rate": 9.365009012340734e-06, "loss": 0.481, "num_input_tokens_seen": 7576864, "step": 7355 }, { "epoch": 4.923076923076923, "grad_norm": 2.231140613555908, "learning_rate": 9.363584812732564e-06, "loss": 0.4537, "num_input_tokens_seen": 7581728, "step": 7360 }, { "epoch": 4.926421404682275, "grad_norm": 1.6439242362976074, "learning_rate": 9.362159126313388e-06, "loss": 0.4484, "num_input_tokens_seen": 7586304, "step": 7365 }, { "epoch": 4.929765886287625, "grad_norm": 3.648841142654419, "learning_rate": 9.360731953568982e-06, "loss": 0.5443, "num_input_tokens_seen": 7591264, "step": 7370 }, { "epoch": 4.933110367892977, "grad_norm": 2.5836665630340576, "learning_rate": 9.359303294985624e-06, "loss": 0.4843, "num_input_tokens_seen": 7596416, "step": 7375 }, { "epoch": 4.936454849498328, "grad_norm": 1.588281512260437, "learning_rate": 9.35787315105011e-06, "loss": 0.3969, "num_input_tokens_seen": 7601728, "step": 7380 }, { "epoch": 4.9397993311036785, "grad_norm": 2.0987513065338135, "learning_rate": 9.35644152224973e-06, "loss": 0.4577, "num_input_tokens_seen": 7606752, "step": 7385 }, { "epoch": 4.94314381270903, "grad_norm": 2.368539810180664, "learning_rate": 9.355008409072288e-06, "loss": 0.4657, "num_input_tokens_seen": 7611712, "step": 7390 }, { "epoch": 4.946488294314381, "grad_norm": 1.3942734003067017, "learning_rate": 9.353573812006088e-06, "loss": 0.4814, "num_input_tokens_seen": 7617120, "step": 7395 }, { "epoch": 4.949832775919733, "grad_norm": 1.7438561916351318, "learning_rate": 9.352137731539945e-06, "loss": 0.4717, "num_input_tokens_seen": 7622464, "step": 7400 }, { "epoch": 4.953177257525084, "grad_norm": 1.2996070384979248, "learning_rate": 9.350700168163176e-06, "loss": 0.511, "num_input_tokens_seen": 7627104, "step": 7405 }, { "epoch": 4.956521739130435, "grad_norm": 2.068943977355957, "learning_rate": 9.349261122365605e-06, "loss": 0.4903, "num_input_tokens_seen": 7632512, "step": 7410 }, { "epoch": 4.959866220735786, "grad_norm": 1.4103658199310303, "learning_rate": 9.34782059463756e-06, "loss": 0.4708, "num_input_tokens_seen": 7638048, "step": 7415 }, { "epoch": 4.963210702341137, "grad_norm": 1.3768534660339355, "learning_rate": 9.34637858546987e-06, "loss": 0.4326, "num_input_tokens_seen": 7643296, "step": 7420 }, { "epoch": 4.966555183946488, "grad_norm": 1.5131514072418213, "learning_rate": 9.34493509535388e-06, "loss": 0.532, "num_input_tokens_seen": 7648960, "step": 7425 }, { "epoch": 4.969899665551839, "grad_norm": 1.3950443267822266, "learning_rate": 9.34349012478143e-06, "loss": 0.4457, "num_input_tokens_seen": 7654240, "step": 7430 }, { "epoch": 4.973244147157191, "grad_norm": 2.0458672046661377, "learning_rate": 9.342043674244866e-06, "loss": 0.5125, "num_input_tokens_seen": 7659328, "step": 7435 }, { "epoch": 4.976588628762542, "grad_norm": 1.13773775100708, "learning_rate": 9.340595744237037e-06, "loss": 0.4635, "num_input_tokens_seen": 7664352, "step": 7440 }, { "epoch": 4.979933110367893, "grad_norm": 3.5716166496276855, "learning_rate": 9.339146335251306e-06, "loss": 0.426, "num_input_tokens_seen": 7669280, "step": 7445 }, { "epoch": 4.983277591973244, "grad_norm": 2.115762948989868, "learning_rate": 9.337695447781525e-06, "loss": 0.5166, "num_input_tokens_seen": 7674496, "step": 7450 }, { "epoch": 4.986622073578595, "grad_norm": 1.261256456375122, "learning_rate": 9.336243082322058e-06, "loss": 0.5291, "num_input_tokens_seen": 7680000, "step": 7455 }, { "epoch": 4.989966555183947, "grad_norm": 1.5900349617004395, "learning_rate": 9.334789239367776e-06, "loss": 0.4515, "num_input_tokens_seen": 7685408, "step": 7460 }, { "epoch": 4.993311036789297, "grad_norm": 1.342415690422058, "learning_rate": 9.333333919414047e-06, "loss": 0.522, "num_input_tokens_seen": 7690880, "step": 7465 }, { "epoch": 4.996655518394649, "grad_norm": 1.3339735269546509, "learning_rate": 9.331877122956743e-06, "loss": 0.4665, "num_input_tokens_seen": 7696640, "step": 7470 }, { "epoch": 5.0, "grad_norm": 2.5137646198272705, "learning_rate": 9.330418850492242e-06, "loss": 0.4917, "num_input_tokens_seen": 7701664, "step": 7475 }, { "epoch": 5.003344481605351, "grad_norm": 1.4681588411331177, "learning_rate": 9.328959102517422e-06, "loss": 0.4395, "num_input_tokens_seen": 7706784, "step": 7480 }, { "epoch": 5.006688963210703, "grad_norm": 1.8979225158691406, "learning_rate": 9.327497879529668e-06, "loss": 0.4836, "num_input_tokens_seen": 7711808, "step": 7485 }, { "epoch": 5.010033444816053, "grad_norm": 1.2003192901611328, "learning_rate": 9.32603518202686e-06, "loss": 0.5152, "num_input_tokens_seen": 7717792, "step": 7490 }, { "epoch": 5.013377926421405, "grad_norm": 1.8588308095932007, "learning_rate": 9.32457101050739e-06, "loss": 0.4504, "num_input_tokens_seen": 7723200, "step": 7495 }, { "epoch": 5.016722408026756, "grad_norm": 1.4277549982070923, "learning_rate": 9.323105365470147e-06, "loss": 0.4869, "num_input_tokens_seen": 7727520, "step": 7500 }, { "epoch": 5.0200668896321075, "grad_norm": 1.516800880432129, "learning_rate": 9.32163824741452e-06, "loss": 0.4637, "num_input_tokens_seen": 7732896, "step": 7505 }, { "epoch": 5.023411371237458, "grad_norm": 1.743240237236023, "learning_rate": 9.320169656840403e-06, "loss": 0.5146, "num_input_tokens_seen": 7738368, "step": 7510 }, { "epoch": 5.026755852842809, "grad_norm": 2.126352548599243, "learning_rate": 9.318699594248192e-06, "loss": 0.4491, "num_input_tokens_seen": 7742656, "step": 7515 }, { "epoch": 5.030100334448161, "grad_norm": 1.5875076055526733, "learning_rate": 9.317228060138783e-06, "loss": 0.481, "num_input_tokens_seen": 7749248, "step": 7520 }, { "epoch": 5.033444816053512, "grad_norm": 1.423811912536621, "learning_rate": 9.315755055013575e-06, "loss": 0.3696, "num_input_tokens_seen": 7754016, "step": 7525 }, { "epoch": 5.036789297658863, "grad_norm": 1.6936719417572021, "learning_rate": 9.314280579374466e-06, "loss": 0.494, "num_input_tokens_seen": 7758880, "step": 7530 }, { "epoch": 5.040133779264214, "grad_norm": 1.2312949895858765, "learning_rate": 9.312804633723857e-06, "loss": 0.4783, "num_input_tokens_seen": 7763648, "step": 7535 }, { "epoch": 5.043478260869565, "grad_norm": 1.256499171257019, "learning_rate": 9.31132721856465e-06, "loss": 0.4236, "num_input_tokens_seen": 7768928, "step": 7540 }, { "epoch": 5.046822742474917, "grad_norm": 1.430989384651184, "learning_rate": 9.309848334400247e-06, "loss": 0.4823, "num_input_tokens_seen": 7773792, "step": 7545 }, { "epoch": 5.050167224080267, "grad_norm": 1.495910406112671, "learning_rate": 9.308367981734549e-06, "loss": 0.4777, "num_input_tokens_seen": 7779040, "step": 7550 }, { "epoch": 5.053511705685619, "grad_norm": 1.4839998483657837, "learning_rate": 9.30688616107196e-06, "loss": 0.429, "num_input_tokens_seen": 7784160, "step": 7555 }, { "epoch": 5.05685618729097, "grad_norm": 1.9294341802597046, "learning_rate": 9.305402872917383e-06, "loss": 0.4541, "num_input_tokens_seen": 7789376, "step": 7560 }, { "epoch": 5.0602006688963215, "grad_norm": 1.4179716110229492, "learning_rate": 9.30391811777622e-06, "loss": 0.4451, "num_input_tokens_seen": 7794080, "step": 7565 }, { "epoch": 5.063545150501672, "grad_norm": 2.196928024291992, "learning_rate": 9.302431896154374e-06, "loss": 0.4899, "num_input_tokens_seen": 7800128, "step": 7570 }, { "epoch": 5.066889632107023, "grad_norm": 2.287666082382202, "learning_rate": 9.30094420855825e-06, "loss": 0.4111, "num_input_tokens_seen": 7804800, "step": 7575 }, { "epoch": 5.070234113712375, "grad_norm": 1.3264356851577759, "learning_rate": 9.299455055494747e-06, "loss": 0.4599, "num_input_tokens_seen": 7809536, "step": 7580 }, { "epoch": 5.073578595317726, "grad_norm": 1.6332911252975464, "learning_rate": 9.297964437471268e-06, "loss": 0.5099, "num_input_tokens_seen": 7814688, "step": 7585 }, { "epoch": 5.076923076923077, "grad_norm": 1.1584880352020264, "learning_rate": 9.296472354995714e-06, "loss": 0.4305, "num_input_tokens_seen": 7819136, "step": 7590 }, { "epoch": 5.080267558528428, "grad_norm": 1.7341855764389038, "learning_rate": 9.294978808576484e-06, "loss": 0.4997, "num_input_tokens_seen": 7823872, "step": 7595 }, { "epoch": 5.083612040133779, "grad_norm": 1.5801247358322144, "learning_rate": 9.293483798722476e-06, "loss": 0.4411, "num_input_tokens_seen": 7829024, "step": 7600 }, { "epoch": 5.086956521739131, "grad_norm": 1.4417084455490112, "learning_rate": 9.291987325943089e-06, "loss": 0.4418, "num_input_tokens_seen": 7834304, "step": 7605 }, { "epoch": 5.090301003344481, "grad_norm": 1.5613797903060913, "learning_rate": 9.290489390748215e-06, "loss": 0.4061, "num_input_tokens_seen": 7839264, "step": 7610 }, { "epoch": 5.093645484949833, "grad_norm": 1.5238829851150513, "learning_rate": 9.28898999364825e-06, "loss": 0.4308, "num_input_tokens_seen": 7844288, "step": 7615 }, { "epoch": 5.096989966555184, "grad_norm": 1.8185436725616455, "learning_rate": 9.287489135154083e-06, "loss": 0.5501, "num_input_tokens_seen": 7849184, "step": 7620 }, { "epoch": 5.1003344481605355, "grad_norm": 1.464849591255188, "learning_rate": 9.28598681577711e-06, "loss": 0.4489, "num_input_tokens_seen": 7854560, "step": 7625 }, { "epoch": 5.103678929765886, "grad_norm": 1.3062994480133057, "learning_rate": 9.284483036029215e-06, "loss": 0.4443, "num_input_tokens_seen": 7859520, "step": 7630 }, { "epoch": 5.107023411371237, "grad_norm": 1.8647994995117188, "learning_rate": 9.282977796422782e-06, "loss": 0.4674, "num_input_tokens_seen": 7864800, "step": 7635 }, { "epoch": 5.110367892976589, "grad_norm": 2.1398794651031494, "learning_rate": 9.281471097470695e-06, "loss": 0.4986, "num_input_tokens_seen": 7869664, "step": 7640 }, { "epoch": 5.11371237458194, "grad_norm": 1.2125740051269531, "learning_rate": 9.279962939686333e-06, "loss": 0.4409, "num_input_tokens_seen": 7875520, "step": 7645 }, { "epoch": 5.117056856187291, "grad_norm": 1.700692057609558, "learning_rate": 9.278453323583575e-06, "loss": 0.4941, "num_input_tokens_seen": 7880896, "step": 7650 }, { "epoch": 5.120401337792642, "grad_norm": 1.0796988010406494, "learning_rate": 9.276942249676792e-06, "loss": 0.4278, "num_input_tokens_seen": 7886240, "step": 7655 }, { "epoch": 5.1237458193979935, "grad_norm": 2.0424928665161133, "learning_rate": 9.275429718480858e-06, "loss": 0.5134, "num_input_tokens_seen": 7891648, "step": 7660 }, { "epoch": 5.127090301003345, "grad_norm": 1.7802672386169434, "learning_rate": 9.273915730511136e-06, "loss": 0.4781, "num_input_tokens_seen": 7896128, "step": 7665 }, { "epoch": 5.130434782608695, "grad_norm": 1.7728599309921265, "learning_rate": 9.272400286283492e-06, "loss": 0.4927, "num_input_tokens_seen": 7900864, "step": 7670 }, { "epoch": 5.133779264214047, "grad_norm": 1.3636387586593628, "learning_rate": 9.270883386314285e-06, "loss": 0.365, "num_input_tokens_seen": 7906816, "step": 7675 }, { "epoch": 5.137123745819398, "grad_norm": 1.7500463724136353, "learning_rate": 9.269365031120372e-06, "loss": 0.4562, "num_input_tokens_seen": 7912960, "step": 7680 }, { "epoch": 5.1404682274247495, "grad_norm": 1.6663762331008911, "learning_rate": 9.267845221219103e-06, "loss": 0.4692, "num_input_tokens_seen": 7918560, "step": 7685 }, { "epoch": 5.1438127090301, "grad_norm": 1.4362033605575562, "learning_rate": 9.266323957128326e-06, "loss": 0.4041, "num_input_tokens_seen": 7923296, "step": 7690 }, { "epoch": 5.147157190635451, "grad_norm": 2.1293604373931885, "learning_rate": 9.26480123936638e-06, "loss": 0.462, "num_input_tokens_seen": 7928448, "step": 7695 }, { "epoch": 5.150501672240803, "grad_norm": 1.4489935636520386, "learning_rate": 9.26327706845211e-06, "loss": 0.4927, "num_input_tokens_seen": 7933728, "step": 7700 }, { "epoch": 5.153846153846154, "grad_norm": 1.1307203769683838, "learning_rate": 9.261751444904846e-06, "loss": 0.3967, "num_input_tokens_seen": 7939040, "step": 7705 }, { "epoch": 5.157190635451505, "grad_norm": 1.5958564281463623, "learning_rate": 9.260224369244414e-06, "loss": 0.4961, "num_input_tokens_seen": 7944384, "step": 7710 }, { "epoch": 5.160535117056856, "grad_norm": 2.668889284133911, "learning_rate": 9.258695841991137e-06, "loss": 0.4979, "num_input_tokens_seen": 7948992, "step": 7715 }, { "epoch": 5.1638795986622075, "grad_norm": 1.3662147521972656, "learning_rate": 9.257165863665833e-06, "loss": 0.4914, "num_input_tokens_seen": 7953152, "step": 7720 }, { "epoch": 5.167224080267559, "grad_norm": 2.0190577507019043, "learning_rate": 9.255634434789818e-06, "loss": 0.4526, "num_input_tokens_seen": 7957792, "step": 7725 }, { "epoch": 5.170568561872909, "grad_norm": 1.4211814403533936, "learning_rate": 9.254101555884892e-06, "loss": 0.4812, "num_input_tokens_seen": 7962720, "step": 7730 }, { "epoch": 5.173913043478261, "grad_norm": 1.2561393976211548, "learning_rate": 9.25256722747336e-06, "loss": 0.4027, "num_input_tokens_seen": 7968384, "step": 7735 }, { "epoch": 5.177257525083612, "grad_norm": 1.505723237991333, "learning_rate": 9.251031450078012e-06, "loss": 0.5419, "num_input_tokens_seen": 7974048, "step": 7740 }, { "epoch": 5.1806020066889635, "grad_norm": 1.1575965881347656, "learning_rate": 9.249494224222139e-06, "loss": 0.3734, "num_input_tokens_seen": 7979680, "step": 7745 }, { "epoch": 5.183946488294314, "grad_norm": 2.563859462738037, "learning_rate": 9.247955550429521e-06, "loss": 0.5813, "num_input_tokens_seen": 7984256, "step": 7750 }, { "epoch": 5.187290969899665, "grad_norm": 1.3632855415344238, "learning_rate": 9.246415429224433e-06, "loss": 0.4731, "num_input_tokens_seen": 7989120, "step": 7755 }, { "epoch": 5.190635451505017, "grad_norm": 1.5552411079406738, "learning_rate": 9.244873861131643e-06, "loss": 0.4129, "num_input_tokens_seen": 7994688, "step": 7760 }, { "epoch": 5.193979933110368, "grad_norm": 1.892443060874939, "learning_rate": 9.243330846676411e-06, "loss": 0.5514, "num_input_tokens_seen": 7999488, "step": 7765 }, { "epoch": 5.197324414715719, "grad_norm": 1.4979747533798218, "learning_rate": 9.24178638638449e-06, "loss": 0.5199, "num_input_tokens_seen": 8004608, "step": 7770 }, { "epoch": 5.20066889632107, "grad_norm": 1.6110140085220337, "learning_rate": 9.24024048078213e-06, "loss": 0.3769, "num_input_tokens_seen": 8009408, "step": 7775 }, { "epoch": 5.2040133779264215, "grad_norm": 1.4697246551513672, "learning_rate": 9.238693130396068e-06, "loss": 0.5177, "num_input_tokens_seen": 8014880, "step": 7780 }, { "epoch": 5.207357859531773, "grad_norm": 1.5050846338272095, "learning_rate": 9.237144335753534e-06, "loss": 0.5059, "num_input_tokens_seen": 8020832, "step": 7785 }, { "epoch": 5.210702341137123, "grad_norm": 1.6624624729156494, "learning_rate": 9.23559409738225e-06, "loss": 0.4118, "num_input_tokens_seen": 8026080, "step": 7790 }, { "epoch": 5.214046822742475, "grad_norm": 1.708404541015625, "learning_rate": 9.234042415810435e-06, "loss": 0.4324, "num_input_tokens_seen": 8030784, "step": 7795 }, { "epoch": 5.217391304347826, "grad_norm": 1.4676328897476196, "learning_rate": 9.232489291566792e-06, "loss": 0.4049, "num_input_tokens_seen": 8035680, "step": 7800 }, { "epoch": 5.2207357859531776, "grad_norm": 1.4558930397033691, "learning_rate": 9.230934725180522e-06, "loss": 0.4287, "num_input_tokens_seen": 8040480, "step": 7805 }, { "epoch": 5.224080267558528, "grad_norm": 2.354083776473999, "learning_rate": 9.229378717181316e-06, "loss": 0.5034, "num_input_tokens_seen": 8045952, "step": 7810 }, { "epoch": 5.2274247491638794, "grad_norm": 1.5408235788345337, "learning_rate": 9.22782126809935e-06, "loss": 0.4458, "num_input_tokens_seen": 8051040, "step": 7815 }, { "epoch": 5.230769230769231, "grad_norm": 1.6682641506195068, "learning_rate": 9.226262378465301e-06, "loss": 0.4849, "num_input_tokens_seen": 8055232, "step": 7820 }, { "epoch": 5.234113712374582, "grad_norm": 1.5036274194717407, "learning_rate": 9.22470204881033e-06, "loss": 0.3629, "num_input_tokens_seen": 8060416, "step": 7825 }, { "epoch": 5.237458193979933, "grad_norm": 1.334657073020935, "learning_rate": 9.223140279666089e-06, "loss": 0.4354, "num_input_tokens_seen": 8065248, "step": 7830 }, { "epoch": 5.240802675585284, "grad_norm": 2.5903258323669434, "learning_rate": 9.221577071564725e-06, "loss": 0.4025, "num_input_tokens_seen": 8070272, "step": 7835 }, { "epoch": 5.2441471571906355, "grad_norm": 1.5574710369110107, "learning_rate": 9.22001242503887e-06, "loss": 0.5058, "num_input_tokens_seen": 8075488, "step": 7840 }, { "epoch": 5.247491638795987, "grad_norm": 1.8121180534362793, "learning_rate": 9.218446340621649e-06, "loss": 0.428, "num_input_tokens_seen": 8079744, "step": 7845 }, { "epoch": 5.250836120401337, "grad_norm": 1.370565414428711, "learning_rate": 9.216878818846677e-06, "loss": 0.4749, "num_input_tokens_seen": 8085312, "step": 7850 }, { "epoch": 5.254180602006689, "grad_norm": 2.003783941268921, "learning_rate": 9.215309860248058e-06, "loss": 0.501, "num_input_tokens_seen": 8090560, "step": 7855 }, { "epoch": 5.25752508361204, "grad_norm": 1.4061458110809326, "learning_rate": 9.213739465360385e-06, "loss": 0.4431, "num_input_tokens_seen": 8095424, "step": 7860 }, { "epoch": 5.260869565217392, "grad_norm": 2.0860495567321777, "learning_rate": 9.212167634718743e-06, "loss": 0.5041, "num_input_tokens_seen": 8100608, "step": 7865 }, { "epoch": 5.264214046822742, "grad_norm": 1.4211267232894897, "learning_rate": 9.210594368858701e-06, "loss": 0.4037, "num_input_tokens_seen": 8105824, "step": 7870 }, { "epoch": 5.2675585284280935, "grad_norm": 1.4998711347579956, "learning_rate": 9.209019668316322e-06, "loss": 0.4475, "num_input_tokens_seen": 8111808, "step": 7875 }, { "epoch": 5.270903010033445, "grad_norm": 1.4667445421218872, "learning_rate": 9.207443533628158e-06, "loss": 0.4198, "num_input_tokens_seen": 8117120, "step": 7880 }, { "epoch": 5.274247491638796, "grad_norm": 1.3381679058074951, "learning_rate": 9.205865965331244e-06, "loss": 0.4399, "num_input_tokens_seen": 8123104, "step": 7885 }, { "epoch": 5.277591973244147, "grad_norm": 2.0226316452026367, "learning_rate": 9.204286963963112e-06, "loss": 0.4615, "num_input_tokens_seen": 8127968, "step": 7890 }, { "epoch": 5.280936454849498, "grad_norm": 1.4246488809585571, "learning_rate": 9.202706530061774e-06, "loss": 0.5418, "num_input_tokens_seen": 8132672, "step": 7895 }, { "epoch": 5.2842809364548495, "grad_norm": 1.6738170385360718, "learning_rate": 9.201124664165733e-06, "loss": 0.5245, "num_input_tokens_seen": 8137504, "step": 7900 }, { "epoch": 5.287625418060201, "grad_norm": 1.6256632804870605, "learning_rate": 9.199541366813984e-06, "loss": 0.4065, "num_input_tokens_seen": 8142656, "step": 7905 }, { "epoch": 5.290969899665551, "grad_norm": 2.122511386871338, "learning_rate": 9.197956638546003e-06, "loss": 0.4317, "num_input_tokens_seen": 8147104, "step": 7910 }, { "epoch": 5.294314381270903, "grad_norm": 2.0598039627075195, "learning_rate": 9.19637047990176e-06, "loss": 0.5303, "num_input_tokens_seen": 8151968, "step": 7915 }, { "epoch": 5.297658862876254, "grad_norm": 1.2745777368545532, "learning_rate": 9.194782891421707e-06, "loss": 0.4077, "num_input_tokens_seen": 8157472, "step": 7920 }, { "epoch": 5.301003344481606, "grad_norm": 1.8488960266113281, "learning_rate": 9.193193873646786e-06, "loss": 0.5066, "num_input_tokens_seen": 8162848, "step": 7925 }, { "epoch": 5.304347826086957, "grad_norm": 1.454786777496338, "learning_rate": 9.191603427118427e-06, "loss": 0.5173, "num_input_tokens_seen": 8168128, "step": 7930 }, { "epoch": 5.3076923076923075, "grad_norm": 1.3585588932037354, "learning_rate": 9.190011552378544e-06, "loss": 0.4062, "num_input_tokens_seen": 8172896, "step": 7935 }, { "epoch": 5.311036789297659, "grad_norm": 1.5168088674545288, "learning_rate": 9.188418249969539e-06, "loss": 0.516, "num_input_tokens_seen": 8179072, "step": 7940 }, { "epoch": 5.31438127090301, "grad_norm": 0.8852132558822632, "learning_rate": 9.1868235204343e-06, "loss": 0.45, "num_input_tokens_seen": 8184288, "step": 7945 }, { "epoch": 5.317725752508361, "grad_norm": 1.773711919784546, "learning_rate": 9.185227364316201e-06, "loss": 0.5237, "num_input_tokens_seen": 8190304, "step": 7950 }, { "epoch": 5.321070234113712, "grad_norm": 1.7731226682662964, "learning_rate": 9.183629782159104e-06, "loss": 0.4557, "num_input_tokens_seen": 8195648, "step": 7955 }, { "epoch": 5.3244147157190636, "grad_norm": 1.5293031930923462, "learning_rate": 9.182030774507357e-06, "loss": 0.4829, "num_input_tokens_seen": 8201184, "step": 7960 }, { "epoch": 5.327759197324415, "grad_norm": 1.488731861114502, "learning_rate": 9.18043034190579e-06, "loss": 0.4434, "num_input_tokens_seen": 8206432, "step": 7965 }, { "epoch": 5.331103678929766, "grad_norm": 2.6140828132629395, "learning_rate": 9.178828484899724e-06, "loss": 0.4927, "num_input_tokens_seen": 8210752, "step": 7970 }, { "epoch": 5.334448160535117, "grad_norm": 1.5976941585540771, "learning_rate": 9.177225204034957e-06, "loss": 0.4447, "num_input_tokens_seen": 8215872, "step": 7975 }, { "epoch": 5.337792642140468, "grad_norm": 1.5300625562667847, "learning_rate": 9.175620499857782e-06, "loss": 0.4822, "num_input_tokens_seen": 8221408, "step": 7980 }, { "epoch": 5.34113712374582, "grad_norm": 1.6390708684921265, "learning_rate": 9.17401437291497e-06, "loss": 0.4507, "num_input_tokens_seen": 8226880, "step": 7985 }, { "epoch": 5.34448160535117, "grad_norm": 1.0866694450378418, "learning_rate": 9.172406823753778e-06, "loss": 0.4314, "num_input_tokens_seen": 8232608, "step": 7990 }, { "epoch": 5.3478260869565215, "grad_norm": 1.2826067209243774, "learning_rate": 9.170797852921953e-06, "loss": 0.4609, "num_input_tokens_seen": 8238304, "step": 7995 }, { "epoch": 5.351170568561873, "grad_norm": 2.416844129562378, "learning_rate": 9.169187460967718e-06, "loss": 0.4636, "num_input_tokens_seen": 8243648, "step": 8000 }, { "epoch": 5.354515050167224, "grad_norm": 1.467926263809204, "learning_rate": 9.167575648439788e-06, "loss": 0.4542, "num_input_tokens_seen": 8250336, "step": 8005 }, { "epoch": 5.357859531772576, "grad_norm": 1.75364351272583, "learning_rate": 9.165962415887356e-06, "loss": 0.4351, "num_input_tokens_seen": 8254688, "step": 8010 }, { "epoch": 5.361204013377926, "grad_norm": 1.5056664943695068, "learning_rate": 9.164347763860099e-06, "loss": 0.4825, "num_input_tokens_seen": 8259616, "step": 8015 }, { "epoch": 5.364548494983278, "grad_norm": 1.4575203657150269, "learning_rate": 9.162731692908185e-06, "loss": 0.4415, "num_input_tokens_seen": 8264672, "step": 8020 }, { "epoch": 5.367892976588629, "grad_norm": 1.893418312072754, "learning_rate": 9.161114203582256e-06, "loss": 0.4002, "num_input_tokens_seen": 8270176, "step": 8025 }, { "epoch": 5.3712374581939795, "grad_norm": 1.3419684171676636, "learning_rate": 9.159495296433445e-06, "loss": 0.4549, "num_input_tokens_seen": 8275328, "step": 8030 }, { "epoch": 5.374581939799331, "grad_norm": 1.2808330059051514, "learning_rate": 9.157874972013361e-06, "loss": 0.4684, "num_input_tokens_seen": 8280352, "step": 8035 }, { "epoch": 5.377926421404682, "grad_norm": 1.651841163635254, "learning_rate": 9.156253230874104e-06, "loss": 0.4582, "num_input_tokens_seen": 8285472, "step": 8040 }, { "epoch": 5.381270903010034, "grad_norm": 1.5683919191360474, "learning_rate": 9.15463007356825e-06, "loss": 0.366, "num_input_tokens_seen": 8290944, "step": 8045 }, { "epoch": 5.384615384615385, "grad_norm": 1.5789616107940674, "learning_rate": 9.153005500648858e-06, "loss": 0.4224, "num_input_tokens_seen": 8296224, "step": 8050 }, { "epoch": 5.3879598662207355, "grad_norm": 1.484159231185913, "learning_rate": 9.151379512669474e-06, "loss": 0.4655, "num_input_tokens_seen": 8302240, "step": 8055 }, { "epoch": 5.391304347826087, "grad_norm": 2.131770610809326, "learning_rate": 9.149752110184125e-06, "loss": 0.4632, "num_input_tokens_seen": 8307104, "step": 8060 }, { "epoch": 5.394648829431438, "grad_norm": 1.5468286275863647, "learning_rate": 9.148123293747314e-06, "loss": 0.4266, "num_input_tokens_seen": 8312672, "step": 8065 }, { "epoch": 5.39799331103679, "grad_norm": 1.4492199420928955, "learning_rate": 9.146493063914032e-06, "loss": 0.4607, "num_input_tokens_seen": 8317376, "step": 8070 }, { "epoch": 5.40133779264214, "grad_norm": 1.4321430921554565, "learning_rate": 9.144861421239752e-06, "loss": 0.4263, "num_input_tokens_seen": 8323360, "step": 8075 }, { "epoch": 5.404682274247492, "grad_norm": 1.2344093322753906, "learning_rate": 9.143228366280424e-06, "loss": 0.4567, "num_input_tokens_seen": 8329536, "step": 8080 }, { "epoch": 5.408026755852843, "grad_norm": 1.2446072101593018, "learning_rate": 9.14159389959248e-06, "loss": 0.5577, "num_input_tokens_seen": 8334944, "step": 8085 }, { "epoch": 5.411371237458194, "grad_norm": 2.135686159133911, "learning_rate": 9.139958021732835e-06, "loss": 0.4249, "num_input_tokens_seen": 8339808, "step": 8090 }, { "epoch": 5.414715719063545, "grad_norm": 1.4557743072509766, "learning_rate": 9.138320733258887e-06, "loss": 0.4526, "num_input_tokens_seen": 8344128, "step": 8095 }, { "epoch": 5.418060200668896, "grad_norm": 3.1503713130950928, "learning_rate": 9.136682034728508e-06, "loss": 0.4782, "num_input_tokens_seen": 8348832, "step": 8100 }, { "epoch": 5.421404682274248, "grad_norm": 1.7721061706542969, "learning_rate": 9.135041926700057e-06, "loss": 0.5813, "num_input_tokens_seen": 8354816, "step": 8105 }, { "epoch": 5.424749163879599, "grad_norm": 2.1029703617095947, "learning_rate": 9.13340040973237e-06, "loss": 0.5461, "num_input_tokens_seen": 8359648, "step": 8110 }, { "epoch": 5.4280936454849495, "grad_norm": 1.3784056901931763, "learning_rate": 9.131757484384765e-06, "loss": 0.4884, "num_input_tokens_seen": 8365376, "step": 8115 }, { "epoch": 5.431438127090301, "grad_norm": 2.3794610500335693, "learning_rate": 9.130113151217034e-06, "loss": 0.4416, "num_input_tokens_seen": 8370176, "step": 8120 }, { "epoch": 5.434782608695652, "grad_norm": 1.7030954360961914, "learning_rate": 9.128467410789457e-06, "loss": 0.5003, "num_input_tokens_seen": 8374912, "step": 8125 }, { "epoch": 5.438127090301004, "grad_norm": 1.522966980934143, "learning_rate": 9.12682026366279e-06, "loss": 0.4889, "num_input_tokens_seen": 8380128, "step": 8130 }, { "epoch": 5.441471571906354, "grad_norm": 1.3133797645568848, "learning_rate": 9.125171710398263e-06, "loss": 0.4484, "num_input_tokens_seen": 8385376, "step": 8135 }, { "epoch": 5.444816053511706, "grad_norm": 1.3532077074050903, "learning_rate": 9.123521751557598e-06, "loss": 0.4977, "num_input_tokens_seen": 8390720, "step": 8140 }, { "epoch": 5.448160535117057, "grad_norm": 1.2209579944610596, "learning_rate": 9.121870387702982e-06, "loss": 0.5705, "num_input_tokens_seen": 8395360, "step": 8145 }, { "epoch": 5.451505016722408, "grad_norm": 1.7687441110610962, "learning_rate": 9.120217619397087e-06, "loss": 0.469, "num_input_tokens_seen": 8402048, "step": 8150 }, { "epoch": 5.454849498327759, "grad_norm": 1.9285640716552734, "learning_rate": 9.118563447203067e-06, "loss": 0.5085, "num_input_tokens_seen": 8406816, "step": 8155 }, { "epoch": 5.45819397993311, "grad_norm": 2.2152419090270996, "learning_rate": 9.116907871684548e-06, "loss": 0.4503, "num_input_tokens_seen": 8411264, "step": 8160 }, { "epoch": 5.461538461538462, "grad_norm": 1.8113096952438354, "learning_rate": 9.115250893405637e-06, "loss": 0.5353, "num_input_tokens_seen": 8417280, "step": 8165 }, { "epoch": 5.464882943143813, "grad_norm": 1.2662086486816406, "learning_rate": 9.11359251293092e-06, "loss": 0.5164, "num_input_tokens_seen": 8422336, "step": 8170 }, { "epoch": 5.468227424749164, "grad_norm": 1.7053003311157227, "learning_rate": 9.111932730825457e-06, "loss": 0.4111, "num_input_tokens_seen": 8426752, "step": 8175 }, { "epoch": 5.471571906354515, "grad_norm": 1.8352965116500854, "learning_rate": 9.11027154765479e-06, "loss": 0.5029, "num_input_tokens_seen": 8431968, "step": 8180 }, { "epoch": 5.474916387959866, "grad_norm": 1.5054869651794434, "learning_rate": 9.108608963984937e-06, "loss": 0.3882, "num_input_tokens_seen": 8436704, "step": 8185 }, { "epoch": 5.478260869565218, "grad_norm": 1.6315858364105225, "learning_rate": 9.106944980382392e-06, "loss": 0.5051, "num_input_tokens_seen": 8441408, "step": 8190 }, { "epoch": 5.481605351170568, "grad_norm": 1.6847459077835083, "learning_rate": 9.105279597414127e-06, "loss": 0.4649, "num_input_tokens_seen": 8446944, "step": 8195 }, { "epoch": 5.48494983277592, "grad_norm": 1.8319268226623535, "learning_rate": 9.10361281564759e-06, "loss": 0.4627, "num_input_tokens_seen": 8452672, "step": 8200 }, { "epoch": 5.488294314381271, "grad_norm": 2.065199375152588, "learning_rate": 9.101944635650705e-06, "loss": 0.4563, "num_input_tokens_seen": 8457504, "step": 8205 }, { "epoch": 5.491638795986622, "grad_norm": 1.8306289911270142, "learning_rate": 9.100275057991877e-06, "loss": 0.4057, "num_input_tokens_seen": 8462240, "step": 8210 }, { "epoch": 5.494983277591973, "grad_norm": 1.3885146379470825, "learning_rate": 9.098604083239981e-06, "loss": 0.5136, "num_input_tokens_seen": 8467936, "step": 8215 }, { "epoch": 5.498327759197324, "grad_norm": 1.2928400039672852, "learning_rate": 9.096931711964371e-06, "loss": 0.4285, "num_input_tokens_seen": 8472832, "step": 8220 }, { "epoch": 5.501672240802676, "grad_norm": 1.6358349323272705, "learning_rate": 9.095257944734879e-06, "loss": 0.4876, "num_input_tokens_seen": 8478592, "step": 8225 }, { "epoch": 5.505016722408027, "grad_norm": 1.6675282716751099, "learning_rate": 9.093582782121805e-06, "loss": 0.39, "num_input_tokens_seen": 8483424, "step": 8230 }, { "epoch": 5.508361204013378, "grad_norm": 3.1465630531311035, "learning_rate": 9.091906224695935e-06, "loss": 0.4104, "num_input_tokens_seen": 8488064, "step": 8235 }, { "epoch": 5.511705685618729, "grad_norm": 1.0608646869659424, "learning_rate": 9.090228273028524e-06, "loss": 0.4125, "num_input_tokens_seen": 8492992, "step": 8240 }, { "epoch": 5.51505016722408, "grad_norm": 2.161069631576538, "learning_rate": 9.088548927691301e-06, "loss": 0.4538, "num_input_tokens_seen": 8497504, "step": 8245 }, { "epoch": 5.518394648829432, "grad_norm": 1.601861834526062, "learning_rate": 9.086868189256475e-06, "loss": 0.4933, "num_input_tokens_seen": 8503808, "step": 8250 }, { "epoch": 5.521739130434782, "grad_norm": 1.893071174621582, "learning_rate": 9.085186058296721e-06, "loss": 0.5233, "num_input_tokens_seen": 8510112, "step": 8255 }, { "epoch": 5.525083612040134, "grad_norm": 1.4591319561004639, "learning_rate": 9.083502535385202e-06, "loss": 0.4048, "num_input_tokens_seen": 8515232, "step": 8260 }, { "epoch": 5.528428093645485, "grad_norm": 1.8037793636322021, "learning_rate": 9.081817621095541e-06, "loss": 0.3341, "num_input_tokens_seen": 8520096, "step": 8265 }, { "epoch": 5.531772575250836, "grad_norm": 1.9141961336135864, "learning_rate": 9.080131316001846e-06, "loss": 0.5169, "num_input_tokens_seen": 8525088, "step": 8270 }, { "epoch": 5.535117056856187, "grad_norm": 2.194817304611206, "learning_rate": 9.07844362067869e-06, "loss": 0.486, "num_input_tokens_seen": 8529728, "step": 8275 }, { "epoch": 5.538461538461538, "grad_norm": 1.584045648574829, "learning_rate": 9.076754535701127e-06, "loss": 0.4908, "num_input_tokens_seen": 8535360, "step": 8280 }, { "epoch": 5.54180602006689, "grad_norm": 1.4310566186904907, "learning_rate": 9.07506406164468e-06, "loss": 0.4976, "num_input_tokens_seen": 8541152, "step": 8285 }, { "epoch": 5.545150501672241, "grad_norm": 2.172606945037842, "learning_rate": 9.073372199085347e-06, "loss": 0.4969, "num_input_tokens_seen": 8545984, "step": 8290 }, { "epoch": 5.548494983277592, "grad_norm": 1.0383250713348389, "learning_rate": 9.0716789485996e-06, "loss": 0.4578, "num_input_tokens_seen": 8550880, "step": 8295 }, { "epoch": 5.551839464882943, "grad_norm": 2.870756149291992, "learning_rate": 9.069984310764383e-06, "loss": 0.5146, "num_input_tokens_seen": 8556480, "step": 8300 }, { "epoch": 5.555183946488294, "grad_norm": 1.5285913944244385, "learning_rate": 9.068288286157111e-06, "loss": 0.4971, "num_input_tokens_seen": 8562368, "step": 8305 }, { "epoch": 5.558528428093646, "grad_norm": 1.1484415531158447, "learning_rate": 9.066590875355674e-06, "loss": 0.4403, "num_input_tokens_seen": 8567936, "step": 8310 }, { "epoch": 5.561872909698996, "grad_norm": 1.3716316223144531, "learning_rate": 9.064892078938434e-06, "loss": 0.4523, "num_input_tokens_seen": 8573344, "step": 8315 }, { "epoch": 5.565217391304348, "grad_norm": 1.6218584775924683, "learning_rate": 9.063191897484225e-06, "loss": 0.5398, "num_input_tokens_seen": 8578016, "step": 8320 }, { "epoch": 5.568561872909699, "grad_norm": 1.5854213237762451, "learning_rate": 9.061490331572349e-06, "loss": 0.4928, "num_input_tokens_seen": 8583680, "step": 8325 }, { "epoch": 5.5719063545150505, "grad_norm": 1.821065902709961, "learning_rate": 9.059787381782585e-06, "loss": 0.4032, "num_input_tokens_seen": 8588512, "step": 8330 }, { "epoch": 5.575250836120401, "grad_norm": 1.7067406177520752, "learning_rate": 9.058083048695185e-06, "loss": 0.4319, "num_input_tokens_seen": 8592864, "step": 8335 }, { "epoch": 5.578595317725752, "grad_norm": 1.2451709508895874, "learning_rate": 9.056377332890864e-06, "loss": 0.4298, "num_input_tokens_seen": 8598816, "step": 8340 }, { "epoch": 5.581939799331104, "grad_norm": 1.4651281833648682, "learning_rate": 9.054670234950818e-06, "loss": 0.4321, "num_input_tokens_seen": 8603520, "step": 8345 }, { "epoch": 5.585284280936455, "grad_norm": 1.522429347038269, "learning_rate": 9.052961755456705e-06, "loss": 0.4587, "num_input_tokens_seen": 8608864, "step": 8350 }, { "epoch": 5.588628762541806, "grad_norm": 1.826324701309204, "learning_rate": 9.05125189499066e-06, "loss": 0.44, "num_input_tokens_seen": 8613728, "step": 8355 }, { "epoch": 5.591973244147157, "grad_norm": 1.8754403591156006, "learning_rate": 9.049540654135285e-06, "loss": 0.4645, "num_input_tokens_seen": 8618368, "step": 8360 }, { "epoch": 5.595317725752508, "grad_norm": 2.008901834487915, "learning_rate": 9.047828033473656e-06, "loss": 0.4811, "num_input_tokens_seen": 8623552, "step": 8365 }, { "epoch": 5.59866220735786, "grad_norm": 2.145059823989868, "learning_rate": 9.046114033589313e-06, "loss": 0.6006, "num_input_tokens_seen": 8628544, "step": 8370 }, { "epoch": 5.602006688963211, "grad_norm": 1.3920568227767944, "learning_rate": 9.044398655066276e-06, "loss": 0.5442, "num_input_tokens_seen": 8633472, "step": 8375 }, { "epoch": 5.605351170568562, "grad_norm": 2.450925350189209, "learning_rate": 9.042681898489022e-06, "loss": 0.5074, "num_input_tokens_seen": 8638432, "step": 8380 }, { "epoch": 5.608695652173913, "grad_norm": 1.7415133714675903, "learning_rate": 9.040963764442508e-06, "loss": 0.4827, "num_input_tokens_seen": 8643360, "step": 8385 }, { "epoch": 5.6120401337792645, "grad_norm": 2.1440589427948, "learning_rate": 9.039244253512157e-06, "loss": 0.5064, "num_input_tokens_seen": 8648288, "step": 8390 }, { "epoch": 5.615384615384615, "grad_norm": 1.7118918895721436, "learning_rate": 9.037523366283856e-06, "loss": 0.5496, "num_input_tokens_seen": 8654112, "step": 8395 }, { "epoch": 5.618729096989966, "grad_norm": 2.0899698734283447, "learning_rate": 9.035801103343971e-06, "loss": 0.4833, "num_input_tokens_seen": 8659168, "step": 8400 }, { "epoch": 5.622073578595318, "grad_norm": 1.9279167652130127, "learning_rate": 9.034077465279329e-06, "loss": 0.4814, "num_input_tokens_seen": 8664544, "step": 8405 }, { "epoch": 5.625418060200669, "grad_norm": 1.9347976446151733, "learning_rate": 9.032352452677227e-06, "loss": 0.3843, "num_input_tokens_seen": 8669728, "step": 8410 }, { "epoch": 5.6287625418060205, "grad_norm": 1.0203874111175537, "learning_rate": 9.030626066125432e-06, "loss": 0.3417, "num_input_tokens_seen": 8674560, "step": 8415 }, { "epoch": 5.632107023411371, "grad_norm": 1.2870310544967651, "learning_rate": 9.028898306212179e-06, "loss": 0.4491, "num_input_tokens_seen": 8680352, "step": 8420 }, { "epoch": 5.635451505016722, "grad_norm": 1.1263657808303833, "learning_rate": 9.02716917352617e-06, "loss": 0.5064, "num_input_tokens_seen": 8686208, "step": 8425 }, { "epoch": 5.638795986622074, "grad_norm": 1.89320707321167, "learning_rate": 9.025438668656574e-06, "loss": 0.5116, "num_input_tokens_seen": 8691424, "step": 8430 }, { "epoch": 5.642140468227424, "grad_norm": 1.4478257894515991, "learning_rate": 9.023706792193032e-06, "loss": 0.4304, "num_input_tokens_seen": 8697056, "step": 8435 }, { "epoch": 5.645484949832776, "grad_norm": 2.210106134414673, "learning_rate": 9.021973544725644e-06, "loss": 0.4529, "num_input_tokens_seen": 8701472, "step": 8440 }, { "epoch": 5.648829431438127, "grad_norm": 1.998112678527832, "learning_rate": 9.020238926844985e-06, "loss": 0.4502, "num_input_tokens_seen": 8706144, "step": 8445 }, { "epoch": 5.6521739130434785, "grad_norm": 2.3317670822143555, "learning_rate": 9.018502939142094e-06, "loss": 0.4754, "num_input_tokens_seen": 8711232, "step": 8450 }, { "epoch": 5.65551839464883, "grad_norm": 1.6557039022445679, "learning_rate": 9.016765582208475e-06, "loss": 0.5042, "num_input_tokens_seen": 8715904, "step": 8455 }, { "epoch": 5.65886287625418, "grad_norm": 1.4810632467269897, "learning_rate": 9.015026856636104e-06, "loss": 0.4825, "num_input_tokens_seen": 8721568, "step": 8460 }, { "epoch": 5.662207357859532, "grad_norm": 3.546990156173706, "learning_rate": 9.013286763017415e-06, "loss": 0.4814, "num_input_tokens_seen": 8727008, "step": 8465 }, { "epoch": 5.665551839464883, "grad_norm": 2.7937676906585693, "learning_rate": 9.011545301945316e-06, "loss": 0.4656, "num_input_tokens_seen": 8732384, "step": 8470 }, { "epoch": 5.668896321070234, "grad_norm": 1.7633367776870728, "learning_rate": 9.009802474013176e-06, "loss": 0.5204, "num_input_tokens_seen": 8737856, "step": 8475 }, { "epoch": 5.672240802675585, "grad_norm": 2.003121852874756, "learning_rate": 9.008058279814833e-06, "loss": 0.4224, "num_input_tokens_seen": 8742528, "step": 8480 }, { "epoch": 5.6755852842809364, "grad_norm": 2.2571427822113037, "learning_rate": 9.006312719944588e-06, "loss": 0.5115, "num_input_tokens_seen": 8747840, "step": 8485 }, { "epoch": 5.678929765886288, "grad_norm": 1.503438949584961, "learning_rate": 9.004565794997209e-06, "loss": 0.3912, "num_input_tokens_seen": 8753024, "step": 8490 }, { "epoch": 5.682274247491639, "grad_norm": 2.0031168460845947, "learning_rate": 9.00281750556793e-06, "loss": 0.4823, "num_input_tokens_seen": 8757920, "step": 8495 }, { "epoch": 5.68561872909699, "grad_norm": 1.5460946559906006, "learning_rate": 9.001067852252441e-06, "loss": 0.3693, "num_input_tokens_seen": 8763168, "step": 8500 }, { "epoch": 5.688963210702341, "grad_norm": 2.3061702251434326, "learning_rate": 8.999316835646914e-06, "loss": 0.4289, "num_input_tokens_seen": 8768000, "step": 8505 }, { "epoch": 5.6923076923076925, "grad_norm": 1.3855236768722534, "learning_rate": 8.997564456347969e-06, "loss": 0.4378, "num_input_tokens_seen": 8773216, "step": 8510 }, { "epoch": 5.695652173913043, "grad_norm": 1.4631720781326294, "learning_rate": 8.9958107149527e-06, "loss": 0.5418, "num_input_tokens_seen": 8777632, "step": 8515 }, { "epoch": 5.698996655518394, "grad_norm": 1.4705572128295898, "learning_rate": 8.994055612058662e-06, "loss": 0.4521, "num_input_tokens_seen": 8782720, "step": 8520 }, { "epoch": 5.702341137123746, "grad_norm": 2.934230089187622, "learning_rate": 8.99229914826387e-06, "loss": 0.4701, "num_input_tokens_seen": 8787968, "step": 8525 }, { "epoch": 5.705685618729097, "grad_norm": 1.2412205934524536, "learning_rate": 8.99054132416681e-06, "loss": 0.3658, "num_input_tokens_seen": 8793216, "step": 8530 }, { "epoch": 5.709030100334449, "grad_norm": 1.6965080499649048, "learning_rate": 8.98878214036643e-06, "loss": 0.4492, "num_input_tokens_seen": 8798176, "step": 8535 }, { "epoch": 5.712374581939799, "grad_norm": 1.6632157564163208, "learning_rate": 8.987021597462136e-06, "loss": 0.5347, "num_input_tokens_seen": 8804640, "step": 8540 }, { "epoch": 5.7157190635451505, "grad_norm": 1.527340292930603, "learning_rate": 8.985259696053802e-06, "loss": 0.5191, "num_input_tokens_seen": 8809056, "step": 8545 }, { "epoch": 5.719063545150502, "grad_norm": 1.8900188207626343, "learning_rate": 8.98349643674176e-06, "loss": 0.5637, "num_input_tokens_seen": 8814432, "step": 8550 }, { "epoch": 5.722408026755852, "grad_norm": 1.4076902866363525, "learning_rate": 8.981731820126816e-06, "loss": 0.3641, "num_input_tokens_seen": 8818688, "step": 8555 }, { "epoch": 5.725752508361204, "grad_norm": 1.2062666416168213, "learning_rate": 8.979965846810221e-06, "loss": 0.4126, "num_input_tokens_seen": 8824448, "step": 8560 }, { "epoch": 5.729096989966555, "grad_norm": 2.07356333732605, "learning_rate": 8.978198517393705e-06, "loss": 0.4109, "num_input_tokens_seen": 8829376, "step": 8565 }, { "epoch": 5.7324414715719065, "grad_norm": 2.273754596710205, "learning_rate": 8.97642983247945e-06, "loss": 0.4826, "num_input_tokens_seen": 8834656, "step": 8570 }, { "epoch": 5.735785953177258, "grad_norm": 1.712378978729248, "learning_rate": 8.974659792670102e-06, "loss": 0.5348, "num_input_tokens_seen": 8840768, "step": 8575 }, { "epoch": 5.739130434782608, "grad_norm": 2.1953060626983643, "learning_rate": 8.972888398568772e-06, "loss": 0.4804, "num_input_tokens_seen": 8845248, "step": 8580 }, { "epoch": 5.74247491638796, "grad_norm": 1.5902749300003052, "learning_rate": 8.971115650779027e-06, "loss": 0.4128, "num_input_tokens_seen": 8850912, "step": 8585 }, { "epoch": 5.745819397993311, "grad_norm": 1.2933145761489868, "learning_rate": 8.9693415499049e-06, "loss": 0.5063, "num_input_tokens_seen": 8855616, "step": 8590 }, { "epoch": 5.749163879598662, "grad_norm": 1.0463885068893433, "learning_rate": 8.967566096550884e-06, "loss": 0.4997, "num_input_tokens_seen": 8861088, "step": 8595 }, { "epoch": 5.752508361204013, "grad_norm": 1.3752377033233643, "learning_rate": 8.965789291321928e-06, "loss": 0.4629, "num_input_tokens_seen": 8865984, "step": 8600 }, { "epoch": 5.7558528428093645, "grad_norm": 1.349895715713501, "learning_rate": 8.96401113482345e-06, "loss": 0.5327, "num_input_tokens_seen": 8871136, "step": 8605 }, { "epoch": 5.759197324414716, "grad_norm": 1.3582960367202759, "learning_rate": 8.962231627661323e-06, "loss": 0.4702, "num_input_tokens_seen": 8876864, "step": 8610 }, { "epoch": 5.762541806020067, "grad_norm": 1.3462733030319214, "learning_rate": 8.960450770441877e-06, "loss": 0.4288, "num_input_tokens_seen": 8881696, "step": 8615 }, { "epoch": 5.765886287625418, "grad_norm": 1.7130992412567139, "learning_rate": 8.958668563771911e-06, "loss": 0.4836, "num_input_tokens_seen": 8887328, "step": 8620 }, { "epoch": 5.769230769230769, "grad_norm": 1.1183334589004517, "learning_rate": 8.956885008258678e-06, "loss": 0.3849, "num_input_tokens_seen": 8892480, "step": 8625 }, { "epoch": 5.7725752508361206, "grad_norm": 1.2899771928787231, "learning_rate": 8.955100104509891e-06, "loss": 0.446, "num_input_tokens_seen": 8897984, "step": 8630 }, { "epoch": 5.775919732441472, "grad_norm": 1.8671852350234985, "learning_rate": 8.953313853133724e-06, "loss": 0.3757, "num_input_tokens_seen": 8902688, "step": 8635 }, { "epoch": 5.7792642140468224, "grad_norm": 1.3371633291244507, "learning_rate": 8.95152625473881e-06, "loss": 0.5077, "num_input_tokens_seen": 8906944, "step": 8640 }, { "epoch": 5.782608695652174, "grad_norm": 1.318237066268921, "learning_rate": 8.949737309934236e-06, "loss": 0.4648, "num_input_tokens_seen": 8912192, "step": 8645 }, { "epoch": 5.785953177257525, "grad_norm": 1.600753903388977, "learning_rate": 8.947947019329554e-06, "loss": 0.3768, "num_input_tokens_seen": 8917344, "step": 8650 }, { "epoch": 5.789297658862877, "grad_norm": 3.518185615539551, "learning_rate": 8.946155383534775e-06, "loss": 0.5271, "num_input_tokens_seen": 8922688, "step": 8655 }, { "epoch": 5.792642140468227, "grad_norm": 1.6606130599975586, "learning_rate": 8.944362403160362e-06, "loss": 0.4705, "num_input_tokens_seen": 8927040, "step": 8660 }, { "epoch": 5.7959866220735785, "grad_norm": 1.352050542831421, "learning_rate": 8.942568078817245e-06, "loss": 0.3743, "num_input_tokens_seen": 8931968, "step": 8665 }, { "epoch": 5.79933110367893, "grad_norm": 1.7626763582229614, "learning_rate": 8.940772411116802e-06, "loss": 0.4612, "num_input_tokens_seen": 8936864, "step": 8670 }, { "epoch": 5.802675585284281, "grad_norm": 1.4050284624099731, "learning_rate": 8.938975400670876e-06, "loss": 0.4894, "num_input_tokens_seen": 8941888, "step": 8675 }, { "epoch": 5.806020066889632, "grad_norm": 1.1239392757415771, "learning_rate": 8.937177048091763e-06, "loss": 0.4963, "num_input_tokens_seen": 8947136, "step": 8680 }, { "epoch": 5.809364548494983, "grad_norm": 1.2877812385559082, "learning_rate": 8.935377353992222e-06, "loss": 0.4155, "num_input_tokens_seen": 8952480, "step": 8685 }, { "epoch": 5.812709030100335, "grad_norm": 1.279719591140747, "learning_rate": 8.933576318985462e-06, "loss": 0.5259, "num_input_tokens_seen": 8958752, "step": 8690 }, { "epoch": 5.816053511705686, "grad_norm": 1.474095106124878, "learning_rate": 8.931773943685155e-06, "loss": 0.5489, "num_input_tokens_seen": 8963648, "step": 8695 }, { "epoch": 5.8193979933110365, "grad_norm": 1.1043641567230225, "learning_rate": 8.929970228705425e-06, "loss": 0.4239, "num_input_tokens_seen": 8968416, "step": 8700 }, { "epoch": 5.822742474916388, "grad_norm": 1.5388708114624023, "learning_rate": 8.928165174660858e-06, "loss": 0.4481, "num_input_tokens_seen": 8974528, "step": 8705 }, { "epoch": 5.826086956521739, "grad_norm": 2.0242998600006104, "learning_rate": 8.926358782166488e-06, "loss": 0.4767, "num_input_tokens_seen": 8979840, "step": 8710 }, { "epoch": 5.829431438127091, "grad_norm": 2.4862582683563232, "learning_rate": 8.924551051837815e-06, "loss": 0.4266, "num_input_tokens_seen": 8984576, "step": 8715 }, { "epoch": 5.832775919732441, "grad_norm": 1.8180546760559082, "learning_rate": 8.922741984290786e-06, "loss": 0.4318, "num_input_tokens_seen": 8989664, "step": 8720 }, { "epoch": 5.8361204013377925, "grad_norm": 1.8669755458831787, "learning_rate": 8.92093158014181e-06, "loss": 0.4275, "num_input_tokens_seen": 8994016, "step": 8725 }, { "epoch": 5.839464882943144, "grad_norm": 1.974863052368164, "learning_rate": 8.919119840007747e-06, "loss": 0.5529, "num_input_tokens_seen": 8999296, "step": 8730 }, { "epoch": 5.842809364548495, "grad_norm": 1.6832934617996216, "learning_rate": 8.917306764505914e-06, "loss": 0.4411, "num_input_tokens_seen": 9004256, "step": 8735 }, { "epoch": 5.846153846153846, "grad_norm": 1.6550806760787964, "learning_rate": 8.915492354254082e-06, "loss": 0.5708, "num_input_tokens_seen": 9010144, "step": 8740 }, { "epoch": 5.849498327759197, "grad_norm": 1.470442295074463, "learning_rate": 8.913676609870481e-06, "loss": 0.5131, "num_input_tokens_seen": 9015712, "step": 8745 }, { "epoch": 5.852842809364549, "grad_norm": 1.6008931398391724, "learning_rate": 8.91185953197379e-06, "loss": 0.5507, "num_input_tokens_seen": 9020320, "step": 8750 }, { "epoch": 5.8561872909699, "grad_norm": 2.39780592918396, "learning_rate": 8.910041121183147e-06, "loss": 0.4202, "num_input_tokens_seen": 9025504, "step": 8755 }, { "epoch": 5.8595317725752505, "grad_norm": 1.6968436241149902, "learning_rate": 8.908221378118138e-06, "loss": 0.4664, "num_input_tokens_seen": 9031168, "step": 8760 }, { "epoch": 5.862876254180602, "grad_norm": 1.2803676128387451, "learning_rate": 8.906400303398811e-06, "loss": 0.4869, "num_input_tokens_seen": 9036704, "step": 8765 }, { "epoch": 5.866220735785953, "grad_norm": 1.3223991394042969, "learning_rate": 8.904577897645663e-06, "loss": 0.4803, "num_input_tokens_seen": 9041824, "step": 8770 }, { "epoch": 5.869565217391305, "grad_norm": 3.174455404281616, "learning_rate": 8.902754161479641e-06, "loss": 0.4218, "num_input_tokens_seen": 9046496, "step": 8775 }, { "epoch": 5.872909698996655, "grad_norm": 2.007244348526001, "learning_rate": 8.900929095522154e-06, "loss": 0.4781, "num_input_tokens_seen": 9051584, "step": 8780 }, { "epoch": 5.8762541806020065, "grad_norm": 1.9949675798416138, "learning_rate": 8.899102700395059e-06, "loss": 0.4958, "num_input_tokens_seen": 9058048, "step": 8785 }, { "epoch": 5.879598662207358, "grad_norm": 2.325917959213257, "learning_rate": 8.897274976720665e-06, "loss": 0.4856, "num_input_tokens_seen": 9063552, "step": 8790 }, { "epoch": 5.882943143812709, "grad_norm": 2.5829100608825684, "learning_rate": 8.895445925121736e-06, "loss": 0.4848, "num_input_tokens_seen": 9068672, "step": 8795 }, { "epoch": 5.88628762541806, "grad_norm": 1.552636742591858, "learning_rate": 8.893615546221487e-06, "loss": 0.4477, "num_input_tokens_seen": 9073472, "step": 8800 }, { "epoch": 5.889632107023411, "grad_norm": 1.983546495437622, "learning_rate": 8.891783840643585e-06, "loss": 0.3585, "num_input_tokens_seen": 9078336, "step": 8805 }, { "epoch": 5.892976588628763, "grad_norm": 1.6726534366607666, "learning_rate": 8.889950809012152e-06, "loss": 0.4918, "num_input_tokens_seen": 9083072, "step": 8810 }, { "epoch": 5.896321070234114, "grad_norm": 1.5000845193862915, "learning_rate": 8.888116451951755e-06, "loss": 0.4762, "num_input_tokens_seen": 9088960, "step": 8815 }, { "epoch": 5.8996655518394645, "grad_norm": 1.458548665046692, "learning_rate": 8.886280770087426e-06, "loss": 0.5358, "num_input_tokens_seen": 9094304, "step": 8820 }, { "epoch": 5.903010033444816, "grad_norm": 2.008014440536499, "learning_rate": 8.884443764044632e-06, "loss": 0.5593, "num_input_tokens_seen": 9099296, "step": 8825 }, { "epoch": 5.906354515050167, "grad_norm": 1.670749306678772, "learning_rate": 8.882605434449303e-06, "loss": 0.4391, "num_input_tokens_seen": 9104160, "step": 8830 }, { "epoch": 5.909698996655519, "grad_norm": 2.062469959259033, "learning_rate": 8.880765781927814e-06, "loss": 0.5065, "num_input_tokens_seen": 9109344, "step": 8835 }, { "epoch": 5.913043478260869, "grad_norm": 1.6437709331512451, "learning_rate": 8.878924807106992e-06, "loss": 0.4661, "num_input_tokens_seen": 9115264, "step": 8840 }, { "epoch": 5.916387959866221, "grad_norm": 1.7360163927078247, "learning_rate": 8.877082510614116e-06, "loss": 0.4813, "num_input_tokens_seen": 9120512, "step": 8845 }, { "epoch": 5.919732441471572, "grad_norm": 1.4818371534347534, "learning_rate": 8.875238893076916e-06, "loss": 0.4252, "num_input_tokens_seen": 9125312, "step": 8850 }, { "epoch": 5.923076923076923, "grad_norm": 2.2554337978363037, "learning_rate": 8.87339395512357e-06, "loss": 0.5024, "num_input_tokens_seen": 9130752, "step": 8855 }, { "epoch": 5.926421404682275, "grad_norm": 1.502153754234314, "learning_rate": 8.871547697382705e-06, "loss": 0.4715, "num_input_tokens_seen": 9135328, "step": 8860 }, { "epoch": 5.929765886287625, "grad_norm": 1.5427992343902588, "learning_rate": 8.8697001204834e-06, "loss": 0.4163, "num_input_tokens_seen": 9140160, "step": 8865 }, { "epoch": 5.933110367892977, "grad_norm": 1.6810916662216187, "learning_rate": 8.867851225055185e-06, "loss": 0.5401, "num_input_tokens_seen": 9145312, "step": 8870 }, { "epoch": 5.936454849498328, "grad_norm": 2.7552640438079834, "learning_rate": 8.866001011728032e-06, "loss": 0.4682, "num_input_tokens_seen": 9150336, "step": 8875 }, { "epoch": 5.9397993311036785, "grad_norm": 1.3024989366531372, "learning_rate": 8.86414948113237e-06, "loss": 0.4643, "num_input_tokens_seen": 9155392, "step": 8880 }, { "epoch": 5.94314381270903, "grad_norm": 1.6013189554214478, "learning_rate": 8.862296633899079e-06, "loss": 0.3991, "num_input_tokens_seen": 9160288, "step": 8885 }, { "epoch": 5.946488294314381, "grad_norm": 1.7112786769866943, "learning_rate": 8.860442470659474e-06, "loss": 0.5037, "num_input_tokens_seen": 9166560, "step": 8890 }, { "epoch": 5.949832775919733, "grad_norm": 1.7894610166549683, "learning_rate": 8.858586992045329e-06, "loss": 0.4848, "num_input_tokens_seen": 9171712, "step": 8895 }, { "epoch": 5.953177257525084, "grad_norm": 2.876410484313965, "learning_rate": 8.856730198688867e-06, "loss": 0.5356, "num_input_tokens_seen": 9176544, "step": 8900 }, { "epoch": 5.956521739130435, "grad_norm": 1.6017924547195435, "learning_rate": 8.854872091222755e-06, "loss": 0.4479, "num_input_tokens_seen": 9181056, "step": 8905 }, { "epoch": 5.959866220735786, "grad_norm": 1.766409158706665, "learning_rate": 8.853012670280108e-06, "loss": 0.3677, "num_input_tokens_seen": 9186496, "step": 8910 }, { "epoch": 5.963210702341137, "grad_norm": 2.1555745601654053, "learning_rate": 8.85115193649449e-06, "loss": 0.4348, "num_input_tokens_seen": 9192288, "step": 8915 }, { "epoch": 5.966555183946488, "grad_norm": 2.4467339515686035, "learning_rate": 8.849289890499912e-06, "loss": 0.4656, "num_input_tokens_seen": 9197312, "step": 8920 }, { "epoch": 5.969899665551839, "grad_norm": 1.690392255783081, "learning_rate": 8.84742653293083e-06, "loss": 0.4806, "num_input_tokens_seen": 9203200, "step": 8925 }, { "epoch": 5.973244147157191, "grad_norm": 1.390846610069275, "learning_rate": 8.845561864422151e-06, "loss": 0.4571, "num_input_tokens_seen": 9208352, "step": 8930 }, { "epoch": 5.976588628762542, "grad_norm": 2.3230254650115967, "learning_rate": 8.843695885609224e-06, "loss": 0.4853, "num_input_tokens_seen": 9212704, "step": 8935 }, { "epoch": 5.979933110367893, "grad_norm": 2.110374927520752, "learning_rate": 8.84182859712785e-06, "loss": 0.5139, "num_input_tokens_seen": 9217280, "step": 8940 }, { "epoch": 5.983277591973244, "grad_norm": 1.2385427951812744, "learning_rate": 8.839959999614272e-06, "loss": 0.5134, "num_input_tokens_seen": 9222688, "step": 8945 }, { "epoch": 5.986622073578595, "grad_norm": 1.6432825326919556, "learning_rate": 8.838090093705177e-06, "loss": 0.4754, "num_input_tokens_seen": 9227552, "step": 8950 }, { "epoch": 5.989966555183947, "grad_norm": 1.6412642002105713, "learning_rate": 8.836218880037704e-06, "loss": 0.5064, "num_input_tokens_seen": 9232544, "step": 8955 }, { "epoch": 5.993311036789297, "grad_norm": 1.3797013759613037, "learning_rate": 8.834346359249435e-06, "loss": 0.4144, "num_input_tokens_seen": 9237376, "step": 8960 }, { "epoch": 5.996655518394649, "grad_norm": 2.1457598209381104, "learning_rate": 8.832472531978396e-06, "loss": 0.4637, "num_input_tokens_seen": 9242432, "step": 8965 }, { "epoch": 6.0, "grad_norm": 2.2900378704071045, "learning_rate": 8.830597398863056e-06, "loss": 0.5076, "num_input_tokens_seen": 9246832, "step": 8970 }, { "epoch": 6.0, "eval_loss": 0.49793174862861633, "eval_runtime": 37.5585, "eval_samples_per_second": 39.805, "eval_steps_per_second": 9.958, "num_input_tokens_seen": 9246832, "step": 8970 }, { "epoch": 6.003344481605351, "grad_norm": 1.0624960660934448, "learning_rate": 8.828720960542339e-06, "loss": 0.3929, "num_input_tokens_seen": 9251824, "step": 8975 }, { "epoch": 6.006688963210703, "grad_norm": 1.641251564025879, "learning_rate": 8.826843217655601e-06, "loss": 0.4596, "num_input_tokens_seen": 9256208, "step": 8980 }, { "epoch": 6.010033444816053, "grad_norm": 1.6594079732894897, "learning_rate": 8.82496417084265e-06, "loss": 0.4183, "num_input_tokens_seen": 9261424, "step": 8985 }, { "epoch": 6.013377926421405, "grad_norm": 2.3157143592834473, "learning_rate": 8.823083820743733e-06, "loss": 0.4132, "num_input_tokens_seen": 9266064, "step": 8990 }, { "epoch": 6.016722408026756, "grad_norm": 1.5109078884124756, "learning_rate": 8.821202167999553e-06, "loss": 0.4776, "num_input_tokens_seen": 9270992, "step": 8995 }, { "epoch": 6.0200668896321075, "grad_norm": 1.32416570186615, "learning_rate": 8.81931921325124e-06, "loss": 0.461, "num_input_tokens_seen": 9275984, "step": 9000 }, { "epoch": 6.023411371237458, "grad_norm": 1.3434029817581177, "learning_rate": 8.817434957140382e-06, "loss": 0.4466, "num_input_tokens_seen": 9281648, "step": 9005 }, { "epoch": 6.026755852842809, "grad_norm": 2.0896499156951904, "learning_rate": 8.815549400309002e-06, "loss": 0.4582, "num_input_tokens_seen": 9286672, "step": 9010 }, { "epoch": 6.030100334448161, "grad_norm": 2.0412590503692627, "learning_rate": 8.813662543399567e-06, "loss": 0.4434, "num_input_tokens_seen": 9292048, "step": 9015 }, { "epoch": 6.033444816053512, "grad_norm": 1.650475025177002, "learning_rate": 8.811774387054992e-06, "loss": 0.5869, "num_input_tokens_seen": 9297904, "step": 9020 }, { "epoch": 6.036789297658863, "grad_norm": 2.020643949508667, "learning_rate": 8.809884931918628e-06, "loss": 0.4816, "num_input_tokens_seen": 9302736, "step": 9025 }, { "epoch": 6.040133779264214, "grad_norm": 1.5567835569381714, "learning_rate": 8.807994178634276e-06, "loss": 0.3918, "num_input_tokens_seen": 9307536, "step": 9030 }, { "epoch": 6.043478260869565, "grad_norm": 1.414307713508606, "learning_rate": 8.806102127846172e-06, "loss": 0.4213, "num_input_tokens_seen": 9312560, "step": 9035 }, { "epoch": 6.046822742474917, "grad_norm": 1.6176843643188477, "learning_rate": 8.804208780198998e-06, "loss": 0.4518, "num_input_tokens_seen": 9317520, "step": 9040 }, { "epoch": 6.050167224080267, "grad_norm": 1.1575047969818115, "learning_rate": 8.802314136337878e-06, "loss": 0.449, "num_input_tokens_seen": 9322960, "step": 9045 }, { "epoch": 6.053511705685619, "grad_norm": 2.472853899002075, "learning_rate": 8.800418196908378e-06, "loss": 0.4539, "num_input_tokens_seen": 9329104, "step": 9050 }, { "epoch": 6.05685618729097, "grad_norm": 2.262665033340454, "learning_rate": 8.798520962556502e-06, "loss": 0.4553, "num_input_tokens_seen": 9334224, "step": 9055 }, { "epoch": 6.0602006688963215, "grad_norm": 1.6750305891036987, "learning_rate": 8.796622433928699e-06, "loss": 0.4576, "num_input_tokens_seen": 9340624, "step": 9060 }, { "epoch": 6.063545150501672, "grad_norm": 1.243656039237976, "learning_rate": 8.794722611671857e-06, "loss": 0.4076, "num_input_tokens_seen": 9346416, "step": 9065 }, { "epoch": 6.066889632107023, "grad_norm": 1.9687036275863647, "learning_rate": 8.792821496433306e-06, "loss": 0.4606, "num_input_tokens_seen": 9351248, "step": 9070 }, { "epoch": 6.070234113712375, "grad_norm": 1.7611194849014282, "learning_rate": 8.790919088860815e-06, "loss": 0.4533, "num_input_tokens_seen": 9357040, "step": 9075 }, { "epoch": 6.073578595317726, "grad_norm": 1.0752229690551758, "learning_rate": 8.789015389602595e-06, "loss": 0.422, "num_input_tokens_seen": 9361008, "step": 9080 }, { "epoch": 6.076923076923077, "grad_norm": 1.6893852949142456, "learning_rate": 8.787110399307298e-06, "loss": 0.4229, "num_input_tokens_seen": 9366384, "step": 9085 }, { "epoch": 6.080267558528428, "grad_norm": 1.2485300302505493, "learning_rate": 8.78520411862401e-06, "loss": 0.3984, "num_input_tokens_seen": 9371216, "step": 9090 }, { "epoch": 6.083612040133779, "grad_norm": 1.3419232368469238, "learning_rate": 8.783296548202265e-06, "loss": 0.4953, "num_input_tokens_seen": 9377136, "step": 9095 }, { "epoch": 6.086956521739131, "grad_norm": 2.241464376449585, "learning_rate": 8.781387688692032e-06, "loss": 0.5204, "num_input_tokens_seen": 9381840, "step": 9100 }, { "epoch": 6.090301003344481, "grad_norm": 1.4823520183563232, "learning_rate": 8.779477540743716e-06, "loss": 0.4758, "num_input_tokens_seen": 9386416, "step": 9105 }, { "epoch": 6.093645484949833, "grad_norm": 1.6503386497497559, "learning_rate": 8.777566105008168e-06, "loss": 0.4868, "num_input_tokens_seen": 9390832, "step": 9110 }, { "epoch": 6.096989966555184, "grad_norm": 2.6265552043914795, "learning_rate": 8.775653382136676e-06, "loss": 0.4374, "num_input_tokens_seen": 9395824, "step": 9115 }, { "epoch": 6.1003344481605355, "grad_norm": 1.8568916320800781, "learning_rate": 8.77373937278096e-06, "loss": 0.504, "num_input_tokens_seen": 9400784, "step": 9120 }, { "epoch": 6.103678929765886, "grad_norm": 1.6031605005264282, "learning_rate": 8.771824077593187e-06, "loss": 0.4339, "num_input_tokens_seen": 9405840, "step": 9125 }, { "epoch": 6.107023411371237, "grad_norm": 1.4718222618103027, "learning_rate": 8.769907497225958e-06, "loss": 0.4144, "num_input_tokens_seen": 9410448, "step": 9130 }, { "epoch": 6.110367892976589, "grad_norm": 1.9305813312530518, "learning_rate": 8.767989632332312e-06, "loss": 0.3992, "num_input_tokens_seen": 9415696, "step": 9135 }, { "epoch": 6.11371237458194, "grad_norm": 1.6074272394180298, "learning_rate": 8.766070483565726e-06, "loss": 0.4234, "num_input_tokens_seen": 9421136, "step": 9140 }, { "epoch": 6.117056856187291, "grad_norm": 2.4720864295959473, "learning_rate": 8.764150051580115e-06, "loss": 0.5133, "num_input_tokens_seen": 9426864, "step": 9145 }, { "epoch": 6.120401337792642, "grad_norm": 2.2886815071105957, "learning_rate": 8.76222833702983e-06, "loss": 0.3718, "num_input_tokens_seen": 9431952, "step": 9150 }, { "epoch": 6.1237458193979935, "grad_norm": 1.511730432510376, "learning_rate": 8.760305340569661e-06, "loss": 0.5525, "num_input_tokens_seen": 9436848, "step": 9155 }, { "epoch": 6.127090301003345, "grad_norm": 2.035834312438965, "learning_rate": 8.758381062854832e-06, "loss": 0.5275, "num_input_tokens_seen": 9441424, "step": 9160 }, { "epoch": 6.130434782608695, "grad_norm": 1.3398444652557373, "learning_rate": 8.756455504541006e-06, "loss": 0.4029, "num_input_tokens_seen": 9446672, "step": 9165 }, { "epoch": 6.133779264214047, "grad_norm": 1.8198007345199585, "learning_rate": 8.75452866628428e-06, "loss": 0.4368, "num_input_tokens_seen": 9451984, "step": 9170 }, { "epoch": 6.137123745819398, "grad_norm": 2.362762689590454, "learning_rate": 8.752600548741193e-06, "loss": 0.5048, "num_input_tokens_seen": 9457424, "step": 9175 }, { "epoch": 6.1404682274247495, "grad_norm": 1.3967076539993286, "learning_rate": 8.75067115256871e-06, "loss": 0.4193, "num_input_tokens_seen": 9462544, "step": 9180 }, { "epoch": 6.1438127090301, "grad_norm": 2.2940759658813477, "learning_rate": 8.748740478424238e-06, "loss": 0.5097, "num_input_tokens_seen": 9467984, "step": 9185 }, { "epoch": 6.147157190635451, "grad_norm": 2.554215669631958, "learning_rate": 8.746808526965623e-06, "loss": 0.4993, "num_input_tokens_seen": 9473776, "step": 9190 }, { "epoch": 6.150501672240803, "grad_norm": 1.6885852813720703, "learning_rate": 8.744875298851138e-06, "loss": 0.4302, "num_input_tokens_seen": 9479312, "step": 9195 }, { "epoch": 6.153846153846154, "grad_norm": 1.5401407480239868, "learning_rate": 8.742940794739496e-06, "loss": 0.5115, "num_input_tokens_seen": 9484016, "step": 9200 }, { "epoch": 6.157190635451505, "grad_norm": 2.2714009284973145, "learning_rate": 8.741005015289843e-06, "loss": 0.411, "num_input_tokens_seen": 9489008, "step": 9205 }, { "epoch": 6.160535117056856, "grad_norm": 2.021226406097412, "learning_rate": 8.73906796116176e-06, "loss": 0.4454, "num_input_tokens_seen": 9494352, "step": 9210 }, { "epoch": 6.1638795986622075, "grad_norm": 1.986635446548462, "learning_rate": 8.737129633015264e-06, "loss": 0.4315, "num_input_tokens_seen": 9499024, "step": 9215 }, { "epoch": 6.167224080267559, "grad_norm": 2.2940797805786133, "learning_rate": 8.735190031510803e-06, "loss": 0.5008, "num_input_tokens_seen": 9504112, "step": 9220 }, { "epoch": 6.170568561872909, "grad_norm": 1.3385021686553955, "learning_rate": 8.73324915730926e-06, "loss": 0.4748, "num_input_tokens_seen": 9509584, "step": 9225 }, { "epoch": 6.173913043478261, "grad_norm": 2.185323476791382, "learning_rate": 8.731307011071954e-06, "loss": 0.4592, "num_input_tokens_seen": 9515024, "step": 9230 }, { "epoch": 6.177257525083612, "grad_norm": 2.297966718673706, "learning_rate": 8.729363593460636e-06, "loss": 0.4767, "num_input_tokens_seen": 9519984, "step": 9235 }, { "epoch": 6.1806020066889635, "grad_norm": 1.1729880571365356, "learning_rate": 8.727418905137486e-06, "loss": 0.4074, "num_input_tokens_seen": 9525296, "step": 9240 }, { "epoch": 6.183946488294314, "grad_norm": 1.42449152469635, "learning_rate": 8.725472946765122e-06, "loss": 0.4766, "num_input_tokens_seen": 9530704, "step": 9245 }, { "epoch": 6.187290969899665, "grad_norm": 1.6896183490753174, "learning_rate": 8.723525719006596e-06, "loss": 0.4846, "num_input_tokens_seen": 9535184, "step": 9250 }, { "epoch": 6.190635451505017, "grad_norm": 1.527845025062561, "learning_rate": 8.721577222525388e-06, "loss": 0.4249, "num_input_tokens_seen": 9539792, "step": 9255 }, { "epoch": 6.193979933110368, "grad_norm": 1.9275014400482178, "learning_rate": 8.719627457985411e-06, "loss": 0.438, "num_input_tokens_seen": 9544368, "step": 9260 }, { "epoch": 6.197324414715719, "grad_norm": 2.0124576091766357, "learning_rate": 8.717676426051012e-06, "loss": 0.4649, "num_input_tokens_seen": 9549360, "step": 9265 }, { "epoch": 6.20066889632107, "grad_norm": 1.6250569820404053, "learning_rate": 8.715724127386971e-06, "loss": 0.4905, "num_input_tokens_seen": 9553808, "step": 9270 }, { "epoch": 6.2040133779264215, "grad_norm": 1.679789662361145, "learning_rate": 8.713770562658497e-06, "loss": 0.4656, "num_input_tokens_seen": 9559088, "step": 9275 }, { "epoch": 6.207357859531773, "grad_norm": 1.724823236465454, "learning_rate": 8.71181573253123e-06, "loss": 0.5064, "num_input_tokens_seen": 9564144, "step": 9280 }, { "epoch": 6.210702341137123, "grad_norm": 1.3340386152267456, "learning_rate": 8.709859637671243e-06, "loss": 0.3923, "num_input_tokens_seen": 9569584, "step": 9285 }, { "epoch": 6.214046822742475, "grad_norm": 2.2466657161712646, "learning_rate": 8.70790227874504e-06, "loss": 0.4489, "num_input_tokens_seen": 9574832, "step": 9290 }, { "epoch": 6.217391304347826, "grad_norm": 1.3826985359191895, "learning_rate": 8.705943656419553e-06, "loss": 0.4109, "num_input_tokens_seen": 9580304, "step": 9295 }, { "epoch": 6.2207357859531776, "grad_norm": 1.560774326324463, "learning_rate": 8.70398377136215e-06, "loss": 0.4068, "num_input_tokens_seen": 9585968, "step": 9300 }, { "epoch": 6.224080267558528, "grad_norm": 2.0126020908355713, "learning_rate": 8.70202262424062e-06, "loss": 0.4813, "num_input_tokens_seen": 9591664, "step": 9305 }, { "epoch": 6.2274247491638794, "grad_norm": 1.5577737092971802, "learning_rate": 8.700060215723192e-06, "loss": 0.3875, "num_input_tokens_seen": 9596208, "step": 9310 }, { "epoch": 6.230769230769231, "grad_norm": 2.5347259044647217, "learning_rate": 8.69809654647852e-06, "loss": 0.5702, "num_input_tokens_seen": 9601712, "step": 9315 }, { "epoch": 6.234113712374582, "grad_norm": 1.6496638059616089, "learning_rate": 8.696131617175686e-06, "loss": 0.4993, "num_input_tokens_seen": 9606512, "step": 9320 }, { "epoch": 6.237458193979933, "grad_norm": 1.808294653892517, "learning_rate": 8.694165428484206e-06, "loss": 0.4125, "num_input_tokens_seen": 9611024, "step": 9325 }, { "epoch": 6.240802675585284, "grad_norm": 2.137178897857666, "learning_rate": 8.69219798107402e-06, "loss": 0.4636, "num_input_tokens_seen": 9616496, "step": 9330 }, { "epoch": 6.2441471571906355, "grad_norm": 1.8492131233215332, "learning_rate": 8.690229275615503e-06, "loss": 0.4642, "num_input_tokens_seen": 9622512, "step": 9335 }, { "epoch": 6.247491638795987, "grad_norm": 1.8589178323745728, "learning_rate": 8.688259312779453e-06, "loss": 0.4578, "num_input_tokens_seen": 9626960, "step": 9340 }, { "epoch": 6.250836120401337, "grad_norm": 2.6945409774780273, "learning_rate": 8.686288093237095e-06, "loss": 0.5195, "num_input_tokens_seen": 9631856, "step": 9345 }, { "epoch": 6.254180602006689, "grad_norm": 2.051678419113159, "learning_rate": 8.684315617660091e-06, "loss": 0.4257, "num_input_tokens_seen": 9636240, "step": 9350 }, { "epoch": 6.25752508361204, "grad_norm": 1.114547848701477, "learning_rate": 8.682341886720524e-06, "loss": 0.3988, "num_input_tokens_seen": 9640976, "step": 9355 }, { "epoch": 6.260869565217392, "grad_norm": 1.7501230239868164, "learning_rate": 8.680366901090906e-06, "loss": 0.4667, "num_input_tokens_seen": 9645584, "step": 9360 }, { "epoch": 6.264214046822742, "grad_norm": 2.170306921005249, "learning_rate": 8.678390661444175e-06, "loss": 0.4782, "num_input_tokens_seen": 9651312, "step": 9365 }, { "epoch": 6.2675585284280935, "grad_norm": 1.9101121425628662, "learning_rate": 8.676413168453704e-06, "loss": 0.4297, "num_input_tokens_seen": 9656048, "step": 9370 }, { "epoch": 6.270903010033445, "grad_norm": 1.3097602128982544, "learning_rate": 8.67443442279328e-06, "loss": 0.4399, "num_input_tokens_seen": 9660272, "step": 9375 }, { "epoch": 6.274247491638796, "grad_norm": 1.013252854347229, "learning_rate": 8.672454425137128e-06, "loss": 0.4248, "num_input_tokens_seen": 9665360, "step": 9380 }, { "epoch": 6.277591973244147, "grad_norm": 1.321293592453003, "learning_rate": 8.670473176159897e-06, "loss": 0.5323, "num_input_tokens_seen": 9670544, "step": 9385 }, { "epoch": 6.280936454849498, "grad_norm": 1.8000905513763428, "learning_rate": 8.668490676536658e-06, "loss": 0.5579, "num_input_tokens_seen": 9675408, "step": 9390 }, { "epoch": 6.2842809364548495, "grad_norm": 1.595774531364441, "learning_rate": 8.666506926942912e-06, "loss": 0.5587, "num_input_tokens_seen": 9680368, "step": 9395 }, { "epoch": 6.287625418060201, "grad_norm": 1.8899153470993042, "learning_rate": 8.664521928054585e-06, "loss": 0.4498, "num_input_tokens_seen": 9685264, "step": 9400 }, { "epoch": 6.290969899665551, "grad_norm": 1.4100334644317627, "learning_rate": 8.66253568054803e-06, "loss": 0.3741, "num_input_tokens_seen": 9690800, "step": 9405 }, { "epoch": 6.294314381270903, "grad_norm": 1.3874787092208862, "learning_rate": 8.660548185100022e-06, "loss": 0.4871, "num_input_tokens_seen": 9695792, "step": 9410 }, { "epoch": 6.297658862876254, "grad_norm": 1.2416882514953613, "learning_rate": 8.658559442387766e-06, "loss": 0.4026, "num_input_tokens_seen": 9701520, "step": 9415 }, { "epoch": 6.301003344481606, "grad_norm": 1.9967982769012451, "learning_rate": 8.656569453088887e-06, "loss": 0.5097, "num_input_tokens_seen": 9707216, "step": 9420 }, { "epoch": 6.304347826086957, "grad_norm": 1.4704114198684692, "learning_rate": 8.654578217881441e-06, "loss": 0.4467, "num_input_tokens_seen": 9712336, "step": 9425 }, { "epoch": 6.3076923076923075, "grad_norm": 2.0274460315704346, "learning_rate": 8.6525857374439e-06, "loss": 0.5174, "num_input_tokens_seen": 9717520, "step": 9430 }, { "epoch": 6.311036789297659, "grad_norm": 1.428174614906311, "learning_rate": 8.650592012455167e-06, "loss": 0.446, "num_input_tokens_seen": 9723248, "step": 9435 }, { "epoch": 6.31438127090301, "grad_norm": 1.5866273641586304, "learning_rate": 8.648597043594567e-06, "loss": 0.4445, "num_input_tokens_seen": 9728432, "step": 9440 }, { "epoch": 6.317725752508361, "grad_norm": 1.6537847518920898, "learning_rate": 8.646600831541847e-06, "loss": 0.4038, "num_input_tokens_seen": 9734096, "step": 9445 }, { "epoch": 6.321070234113712, "grad_norm": 1.185329794883728, "learning_rate": 8.644603376977184e-06, "loss": 0.4834, "num_input_tokens_seen": 9738736, "step": 9450 }, { "epoch": 6.3244147157190636, "grad_norm": 1.6009186506271362, "learning_rate": 8.64260468058117e-06, "loss": 0.4475, "num_input_tokens_seen": 9744752, "step": 9455 }, { "epoch": 6.327759197324415, "grad_norm": 1.6254929304122925, "learning_rate": 8.640604743034824e-06, "loss": 0.5205, "num_input_tokens_seen": 9750256, "step": 9460 }, { "epoch": 6.331103678929766, "grad_norm": 1.5399832725524902, "learning_rate": 8.638603565019588e-06, "loss": 0.4773, "num_input_tokens_seen": 9755888, "step": 9465 }, { "epoch": 6.334448160535117, "grad_norm": 3.2129695415496826, "learning_rate": 8.636601147217327e-06, "loss": 0.3553, "num_input_tokens_seen": 9761456, "step": 9470 }, { "epoch": 6.337792642140468, "grad_norm": 1.6217591762542725, "learning_rate": 8.63459749031033e-06, "loss": 0.4957, "num_input_tokens_seen": 9766832, "step": 9475 }, { "epoch": 6.34113712374582, "grad_norm": 1.5679925680160522, "learning_rate": 8.632592594981303e-06, "loss": 0.4373, "num_input_tokens_seen": 9772272, "step": 9480 }, { "epoch": 6.34448160535117, "grad_norm": 1.8120938539505005, "learning_rate": 8.630586461913378e-06, "loss": 0.5248, "num_input_tokens_seen": 9776880, "step": 9485 }, { "epoch": 6.3478260869565215, "grad_norm": 2.0365657806396484, "learning_rate": 8.628579091790108e-06, "loss": 0.4, "num_input_tokens_seen": 9781520, "step": 9490 }, { "epoch": 6.351170568561873, "grad_norm": 1.3286190032958984, "learning_rate": 8.62657048529547e-06, "loss": 0.3732, "num_input_tokens_seen": 9787056, "step": 9495 }, { "epoch": 6.354515050167224, "grad_norm": 1.8949153423309326, "learning_rate": 8.624560643113852e-06, "loss": 0.4313, "num_input_tokens_seen": 9791824, "step": 9500 }, { "epoch": 6.357859531772576, "grad_norm": 1.3946754932403564, "learning_rate": 8.622549565930081e-06, "loss": 0.458, "num_input_tokens_seen": 9797968, "step": 9505 }, { "epoch": 6.361204013377926, "grad_norm": 1.6004213094711304, "learning_rate": 8.620537254429386e-06, "loss": 0.5062, "num_input_tokens_seen": 9802768, "step": 9510 }, { "epoch": 6.364548494983278, "grad_norm": 1.746078610420227, "learning_rate": 8.618523709297426e-06, "loss": 0.5654, "num_input_tokens_seen": 9808080, "step": 9515 }, { "epoch": 6.367892976588629, "grad_norm": 1.9338748455047607, "learning_rate": 8.616508931220285e-06, "loss": 0.4703, "num_input_tokens_seen": 9812912, "step": 9520 }, { "epoch": 6.3712374581939795, "grad_norm": 2.118875503540039, "learning_rate": 8.614492920884457e-06, "loss": 0.4531, "num_input_tokens_seen": 9818224, "step": 9525 }, { "epoch": 6.374581939799331, "grad_norm": 1.4847183227539062, "learning_rate": 8.612475678976861e-06, "loss": 0.5231, "num_input_tokens_seen": 9823888, "step": 9530 }, { "epoch": 6.377926421404682, "grad_norm": 2.1377294063568115, "learning_rate": 8.610457206184835e-06, "loss": 0.4172, "num_input_tokens_seen": 9828016, "step": 9535 }, { "epoch": 6.381270903010034, "grad_norm": 2.1097629070281982, "learning_rate": 8.608437503196136e-06, "loss": 0.4758, "num_input_tokens_seen": 9833680, "step": 9540 }, { "epoch": 6.384615384615385, "grad_norm": 1.6777780055999756, "learning_rate": 8.606416570698943e-06, "loss": 0.5301, "num_input_tokens_seen": 9839472, "step": 9545 }, { "epoch": 6.3879598662207355, "grad_norm": 3.021489381790161, "learning_rate": 8.604394409381849e-06, "loss": 0.5049, "num_input_tokens_seen": 9844368, "step": 9550 }, { "epoch": 6.391304347826087, "grad_norm": 1.6647714376449585, "learning_rate": 8.602371019933867e-06, "loss": 0.5094, "num_input_tokens_seen": 9849392, "step": 9555 }, { "epoch": 6.394648829431438, "grad_norm": 2.0660290718078613, "learning_rate": 8.600346403044433e-06, "loss": 0.4252, "num_input_tokens_seen": 9854352, "step": 9560 }, { "epoch": 6.39799331103679, "grad_norm": 1.3247846364974976, "learning_rate": 8.598320559403394e-06, "loss": 0.3528, "num_input_tokens_seen": 9859696, "step": 9565 }, { "epoch": 6.40133779264214, "grad_norm": 1.0202311277389526, "learning_rate": 8.596293489701021e-06, "loss": 0.3481, "num_input_tokens_seen": 9865232, "step": 9570 }, { "epoch": 6.404682274247492, "grad_norm": 1.7442502975463867, "learning_rate": 8.594265194628003e-06, "loss": 0.508, "num_input_tokens_seen": 9870736, "step": 9575 }, { "epoch": 6.408026755852843, "grad_norm": 3.2929131984710693, "learning_rate": 8.592235674875442e-06, "loss": 0.45, "num_input_tokens_seen": 9875184, "step": 9580 }, { "epoch": 6.411371237458194, "grad_norm": 1.6672353744506836, "learning_rate": 8.590204931134856e-06, "loss": 0.4892, "num_input_tokens_seen": 9880624, "step": 9585 }, { "epoch": 6.414715719063545, "grad_norm": 1.779634714126587, "learning_rate": 8.588172964098188e-06, "loss": 0.4363, "num_input_tokens_seen": 9886896, "step": 9590 }, { "epoch": 6.418060200668896, "grad_norm": 1.3937158584594727, "learning_rate": 8.586139774457791e-06, "loss": 0.5221, "num_input_tokens_seen": 9893360, "step": 9595 }, { "epoch": 6.421404682274248, "grad_norm": 1.3436044454574585, "learning_rate": 8.584105362906438e-06, "loss": 0.5367, "num_input_tokens_seen": 9899056, "step": 9600 }, { "epoch": 6.424749163879599, "grad_norm": 2.0152881145477295, "learning_rate": 8.582069730137319e-06, "loss": 0.4485, "num_input_tokens_seen": 9904048, "step": 9605 }, { "epoch": 6.4280936454849495, "grad_norm": 0.7424044013023376, "learning_rate": 8.580032876844033e-06, "loss": 0.4053, "num_input_tokens_seen": 9909136, "step": 9610 }, { "epoch": 6.431438127090301, "grad_norm": 1.695774793624878, "learning_rate": 8.577994803720605e-06, "loss": 0.5301, "num_input_tokens_seen": 9914640, "step": 9615 }, { "epoch": 6.434782608695652, "grad_norm": 1.3966072797775269, "learning_rate": 8.575955511461471e-06, "loss": 0.4355, "num_input_tokens_seen": 9918608, "step": 9620 }, { "epoch": 6.438127090301004, "grad_norm": 1.3316956758499146, "learning_rate": 8.573915000761478e-06, "loss": 0.5196, "num_input_tokens_seen": 9923216, "step": 9625 }, { "epoch": 6.441471571906354, "grad_norm": 1.2608354091644287, "learning_rate": 8.571873272315895e-06, "loss": 0.4549, "num_input_tokens_seen": 9928240, "step": 9630 }, { "epoch": 6.444816053511706, "grad_norm": 1.4439243078231812, "learning_rate": 8.569830326820403e-06, "loss": 0.4956, "num_input_tokens_seen": 9933360, "step": 9635 }, { "epoch": 6.448160535117057, "grad_norm": 1.379343867301941, "learning_rate": 8.567786164971098e-06, "loss": 0.4917, "num_input_tokens_seen": 9938448, "step": 9640 }, { "epoch": 6.451505016722408, "grad_norm": 1.8130829334259033, "learning_rate": 8.56574078746449e-06, "loss": 0.4779, "num_input_tokens_seen": 9944016, "step": 9645 }, { "epoch": 6.454849498327759, "grad_norm": 1.3577078580856323, "learning_rate": 8.563694194997506e-06, "loss": 0.4147, "num_input_tokens_seen": 9950128, "step": 9650 }, { "epoch": 6.45819397993311, "grad_norm": 1.4700647592544556, "learning_rate": 8.561646388267482e-06, "loss": 0.4188, "num_input_tokens_seen": 9956016, "step": 9655 }, { "epoch": 6.461538461538462, "grad_norm": 1.6943763494491577, "learning_rate": 8.559597367972168e-06, "loss": 0.4579, "num_input_tokens_seen": 9961520, "step": 9660 }, { "epoch": 6.464882943143813, "grad_norm": 1.7829777002334595, "learning_rate": 8.557547134809736e-06, "loss": 0.3833, "num_input_tokens_seen": 9966672, "step": 9665 }, { "epoch": 6.468227424749164, "grad_norm": 2.967881679534912, "learning_rate": 8.555495689478762e-06, "loss": 0.4806, "num_input_tokens_seen": 9972240, "step": 9670 }, { "epoch": 6.471571906354515, "grad_norm": 1.127137303352356, "learning_rate": 8.553443032678237e-06, "loss": 0.3597, "num_input_tokens_seen": 9977392, "step": 9675 }, { "epoch": 6.474916387959866, "grad_norm": 1.7953500747680664, "learning_rate": 8.55138916510757e-06, "loss": 0.4774, "num_input_tokens_seen": 9982224, "step": 9680 }, { "epoch": 6.478260869565218, "grad_norm": 2.060637950897217, "learning_rate": 8.549334087466571e-06, "loss": 0.4764, "num_input_tokens_seen": 9987632, "step": 9685 }, { "epoch": 6.481605351170568, "grad_norm": 1.6653746366500854, "learning_rate": 8.547277800455477e-06, "loss": 0.5044, "num_input_tokens_seen": 9993008, "step": 9690 }, { "epoch": 6.48494983277592, "grad_norm": 1.3381489515304565, "learning_rate": 8.545220304774927e-06, "loss": 0.4562, "num_input_tokens_seen": 9997968, "step": 9695 }, { "epoch": 6.488294314381271, "grad_norm": 1.9488985538482666, "learning_rate": 8.543161601125974e-06, "loss": 0.4331, "num_input_tokens_seen": 10002992, "step": 9700 }, { "epoch": 6.491638795986622, "grad_norm": 2.022674083709717, "learning_rate": 8.541101690210086e-06, "loss": 0.4774, "num_input_tokens_seen": 10009040, "step": 9705 }, { "epoch": 6.494983277591973, "grad_norm": 2.086988687515259, "learning_rate": 8.539040572729135e-06, "loss": 0.4852, "num_input_tokens_seen": 10012944, "step": 9710 }, { "epoch": 6.498327759197324, "grad_norm": 2.331167221069336, "learning_rate": 8.536978249385415e-06, "loss": 0.5163, "num_input_tokens_seen": 10017744, "step": 9715 }, { "epoch": 6.501672240802676, "grad_norm": 1.8095107078552246, "learning_rate": 8.53491472088162e-06, "loss": 0.376, "num_input_tokens_seen": 10022352, "step": 9720 }, { "epoch": 6.505016722408027, "grad_norm": 1.713326334953308, "learning_rate": 8.532849987920859e-06, "loss": 0.458, "num_input_tokens_seen": 10027280, "step": 9725 }, { "epoch": 6.508361204013378, "grad_norm": 1.3994109630584717, "learning_rate": 8.530784051206654e-06, "loss": 0.384, "num_input_tokens_seen": 10031920, "step": 9730 }, { "epoch": 6.511705685618729, "grad_norm": 2.496594190597534, "learning_rate": 8.528716911442934e-06, "loss": 0.4479, "num_input_tokens_seen": 10037136, "step": 9735 }, { "epoch": 6.51505016722408, "grad_norm": 1.3036377429962158, "learning_rate": 8.52664856933404e-06, "loss": 0.367, "num_input_tokens_seen": 10042288, "step": 9740 }, { "epoch": 6.518394648829432, "grad_norm": 2.493680715560913, "learning_rate": 8.524579025584719e-06, "loss": 0.4782, "num_input_tokens_seen": 10047440, "step": 9745 }, { "epoch": 6.521739130434782, "grad_norm": 1.481765627861023, "learning_rate": 8.52250828090013e-06, "loss": 0.4241, "num_input_tokens_seen": 10052944, "step": 9750 }, { "epoch": 6.525083612040134, "grad_norm": 1.7084243297576904, "learning_rate": 8.520436335985843e-06, "loss": 0.3915, "num_input_tokens_seen": 10057328, "step": 9755 }, { "epoch": 6.528428093645485, "grad_norm": 1.504419207572937, "learning_rate": 8.518363191547833e-06, "loss": 0.4855, "num_input_tokens_seen": 10063536, "step": 9760 }, { "epoch": 6.531772575250836, "grad_norm": 2.3640339374542236, "learning_rate": 8.516288848292486e-06, "loss": 0.506, "num_input_tokens_seen": 10068048, "step": 9765 }, { "epoch": 6.535117056856187, "grad_norm": 2.051978349685669, "learning_rate": 8.5142133069266e-06, "loss": 0.4863, "num_input_tokens_seen": 10073360, "step": 9770 }, { "epoch": 6.538461538461538, "grad_norm": 2.2082555294036865, "learning_rate": 8.512136568157373e-06, "loss": 0.4889, "num_input_tokens_seen": 10078416, "step": 9775 }, { "epoch": 6.54180602006689, "grad_norm": 1.4468470811843872, "learning_rate": 8.510058632692415e-06, "loss": 0.4524, "num_input_tokens_seen": 10083632, "step": 9780 }, { "epoch": 6.545150501672241, "grad_norm": 1.3890846967697144, "learning_rate": 8.507979501239747e-06, "loss": 0.3634, "num_input_tokens_seen": 10089776, "step": 9785 }, { "epoch": 6.548494983277592, "grad_norm": 2.470379114151001, "learning_rate": 8.505899174507793e-06, "loss": 0.4349, "num_input_tokens_seen": 10094960, "step": 9790 }, { "epoch": 6.551839464882943, "grad_norm": 1.4843478202819824, "learning_rate": 8.503817653205388e-06, "loss": 0.4262, "num_input_tokens_seen": 10100336, "step": 9795 }, { "epoch": 6.555183946488294, "grad_norm": 2.230928659439087, "learning_rate": 8.501734938041769e-06, "loss": 0.4954, "num_input_tokens_seen": 10105008, "step": 9800 }, { "epoch": 6.558528428093646, "grad_norm": 1.1538678407669067, "learning_rate": 8.499651029726586e-06, "loss": 0.5028, "num_input_tokens_seen": 10110096, "step": 9805 }, { "epoch": 6.561872909698996, "grad_norm": 1.5814902782440186, "learning_rate": 8.497565928969889e-06, "loss": 0.4399, "num_input_tokens_seen": 10115024, "step": 9810 }, { "epoch": 6.565217391304348, "grad_norm": 1.4239223003387451, "learning_rate": 8.495479636482138e-06, "loss": 0.4385, "num_input_tokens_seen": 10120016, "step": 9815 }, { "epoch": 6.568561872909699, "grad_norm": 1.7726631164550781, "learning_rate": 8.493392152974203e-06, "loss": 0.4306, "num_input_tokens_seen": 10124784, "step": 9820 }, { "epoch": 6.5719063545150505, "grad_norm": 1.52203369140625, "learning_rate": 8.491303479157349e-06, "loss": 0.5247, "num_input_tokens_seen": 10130352, "step": 9825 }, { "epoch": 6.575250836120401, "grad_norm": 1.673000454902649, "learning_rate": 8.489213615743258e-06, "loss": 0.51, "num_input_tokens_seen": 10135888, "step": 9830 }, { "epoch": 6.578595317725752, "grad_norm": 1.5759220123291016, "learning_rate": 8.48712256344401e-06, "loss": 0.4862, "num_input_tokens_seen": 10140112, "step": 9835 }, { "epoch": 6.581939799331104, "grad_norm": 1.4635721445083618, "learning_rate": 8.485030322972094e-06, "loss": 0.5021, "num_input_tokens_seen": 10146288, "step": 9840 }, { "epoch": 6.585284280936455, "grad_norm": 1.4909467697143555, "learning_rate": 8.482936895040403e-06, "loss": 0.4195, "num_input_tokens_seen": 10152496, "step": 9845 }, { "epoch": 6.588628762541806, "grad_norm": 1.7496534585952759, "learning_rate": 8.480842280362229e-06, "loss": 0.4496, "num_input_tokens_seen": 10157456, "step": 9850 }, { "epoch": 6.591973244147157, "grad_norm": 1.7579928636550903, "learning_rate": 8.47874647965128e-06, "loss": 0.4593, "num_input_tokens_seen": 10162192, "step": 9855 }, { "epoch": 6.595317725752508, "grad_norm": 4.597160339355469, "learning_rate": 8.476649493621655e-06, "loss": 0.4997, "num_input_tokens_seen": 10167504, "step": 9860 }, { "epoch": 6.59866220735786, "grad_norm": 1.9277442693710327, "learning_rate": 8.47455132298787e-06, "loss": 0.3965, "num_input_tokens_seen": 10171760, "step": 9865 }, { "epoch": 6.602006688963211, "grad_norm": 1.7790672779083252, "learning_rate": 8.472451968464834e-06, "loss": 0.4652, "num_input_tokens_seen": 10177264, "step": 9870 }, { "epoch": 6.605351170568562, "grad_norm": 1.8424761295318604, "learning_rate": 8.470351430767862e-06, "loss": 0.4204, "num_input_tokens_seen": 10181936, "step": 9875 }, { "epoch": 6.608695652173913, "grad_norm": 2.0402145385742188, "learning_rate": 8.468249710612677e-06, "loss": 0.4668, "num_input_tokens_seen": 10187216, "step": 9880 }, { "epoch": 6.6120401337792645, "grad_norm": 1.9922560453414917, "learning_rate": 8.466146808715399e-06, "loss": 0.4025, "num_input_tokens_seen": 10191856, "step": 9885 }, { "epoch": 6.615384615384615, "grad_norm": 2.7372450828552246, "learning_rate": 8.464042725792553e-06, "loss": 0.4292, "num_input_tokens_seen": 10195984, "step": 9890 }, { "epoch": 6.618729096989966, "grad_norm": 1.9748528003692627, "learning_rate": 8.461937462561068e-06, "loss": 0.4082, "num_input_tokens_seen": 10201104, "step": 9895 }, { "epoch": 6.622073578595318, "grad_norm": 1.4465012550354004, "learning_rate": 8.459831019738271e-06, "loss": 0.4902, "num_input_tokens_seen": 10206352, "step": 9900 }, { "epoch": 6.625418060200669, "grad_norm": 1.4568190574645996, "learning_rate": 8.457723398041897e-06, "loss": 0.4603, "num_input_tokens_seen": 10212272, "step": 9905 }, { "epoch": 6.6287625418060205, "grad_norm": 1.5454537868499756, "learning_rate": 8.455614598190076e-06, "loss": 0.4793, "num_input_tokens_seen": 10217072, "step": 9910 }, { "epoch": 6.632107023411371, "grad_norm": 1.7950433492660522, "learning_rate": 8.453504620901345e-06, "loss": 0.4287, "num_input_tokens_seen": 10222352, "step": 9915 }, { "epoch": 6.635451505016722, "grad_norm": 2.7090463638305664, "learning_rate": 8.451393466894638e-06, "loss": 0.4322, "num_input_tokens_seen": 10227248, "step": 9920 }, { "epoch": 6.638795986622074, "grad_norm": 1.6359044313430786, "learning_rate": 8.449281136889293e-06, "loss": 0.4613, "num_input_tokens_seen": 10232048, "step": 9925 }, { "epoch": 6.642140468227424, "grad_norm": 1.6028143167495728, "learning_rate": 8.447167631605046e-06, "loss": 0.4098, "num_input_tokens_seen": 10238032, "step": 9930 }, { "epoch": 6.645484949832776, "grad_norm": 1.6207913160324097, "learning_rate": 8.445052951762039e-06, "loss": 0.483, "num_input_tokens_seen": 10242672, "step": 9935 }, { "epoch": 6.648829431438127, "grad_norm": 1.2581844329833984, "learning_rate": 8.442937098080804e-06, "loss": 0.4002, "num_input_tokens_seen": 10247600, "step": 9940 }, { "epoch": 6.6521739130434785, "grad_norm": 2.118093252182007, "learning_rate": 8.440820071282283e-06, "loss": 0.4781, "num_input_tokens_seen": 10252432, "step": 9945 }, { "epoch": 6.65551839464883, "grad_norm": 1.8837403059005737, "learning_rate": 8.438701872087813e-06, "loss": 0.4465, "num_input_tokens_seen": 10257552, "step": 9950 }, { "epoch": 6.65886287625418, "grad_norm": 1.270293951034546, "learning_rate": 8.436582501219133e-06, "loss": 0.4681, "num_input_tokens_seen": 10262352, "step": 9955 }, { "epoch": 6.662207357859532, "grad_norm": 1.50408935546875, "learning_rate": 8.434461959398377e-06, "loss": 0.46, "num_input_tokens_seen": 10267792, "step": 9960 }, { "epoch": 6.665551839464883, "grad_norm": 2.0761146545410156, "learning_rate": 8.432340247348081e-06, "loss": 0.5057, "num_input_tokens_seen": 10272304, "step": 9965 }, { "epoch": 6.668896321070234, "grad_norm": 1.658987283706665, "learning_rate": 8.43021736579118e-06, "loss": 0.4472, "num_input_tokens_seen": 10277616, "step": 9970 }, { "epoch": 6.672240802675585, "grad_norm": 1.80363130569458, "learning_rate": 8.428093315451006e-06, "loss": 0.3852, "num_input_tokens_seen": 10282832, "step": 9975 }, { "epoch": 6.6755852842809364, "grad_norm": 2.9619832038879395, "learning_rate": 8.425968097051291e-06, "loss": 0.4377, "num_input_tokens_seen": 10287440, "step": 9980 }, { "epoch": 6.678929765886288, "grad_norm": 1.4916666746139526, "learning_rate": 8.423841711316163e-06, "loss": 0.4617, "num_input_tokens_seen": 10292144, "step": 9985 }, { "epoch": 6.682274247491639, "grad_norm": 1.611107349395752, "learning_rate": 8.421714158970148e-06, "loss": 0.3987, "num_input_tokens_seen": 10297200, "step": 9990 }, { "epoch": 6.68561872909699, "grad_norm": 1.5733658075332642, "learning_rate": 8.419585440738172e-06, "loss": 0.4859, "num_input_tokens_seen": 10302000, "step": 9995 }, { "epoch": 6.688963210702341, "grad_norm": 1.7468034029006958, "learning_rate": 8.417455557345555e-06, "loss": 0.3705, "num_input_tokens_seen": 10307152, "step": 10000 }, { "epoch": 6.6923076923076925, "grad_norm": 1.7027318477630615, "learning_rate": 8.415324509518017e-06, "loss": 0.3692, "num_input_tokens_seen": 10311664, "step": 10005 }, { "epoch": 6.695652173913043, "grad_norm": 2.3174588680267334, "learning_rate": 8.413192297981672e-06, "loss": 0.5032, "num_input_tokens_seen": 10316080, "step": 10010 }, { "epoch": 6.698996655518394, "grad_norm": 1.2337104082107544, "learning_rate": 8.41105892346303e-06, "loss": 0.445, "num_input_tokens_seen": 10322000, "step": 10015 }, { "epoch": 6.702341137123746, "grad_norm": 3.367541551589966, "learning_rate": 8.408924386689003e-06, "loss": 0.4573, "num_input_tokens_seen": 10326928, "step": 10020 }, { "epoch": 6.705685618729097, "grad_norm": 1.6263145208358765, "learning_rate": 8.406788688386893e-06, "loss": 0.4457, "num_input_tokens_seen": 10332080, "step": 10025 }, { "epoch": 6.709030100334449, "grad_norm": 1.6832255125045776, "learning_rate": 8.4046518292844e-06, "loss": 0.4033, "num_input_tokens_seen": 10337520, "step": 10030 }, { "epoch": 6.712374581939799, "grad_norm": 1.9814958572387695, "learning_rate": 8.402513810109619e-06, "loss": 0.4292, "num_input_tokens_seen": 10342800, "step": 10035 }, { "epoch": 6.7157190635451505, "grad_norm": 2.086766242980957, "learning_rate": 8.40037463159104e-06, "loss": 0.4831, "num_input_tokens_seen": 10347664, "step": 10040 }, { "epoch": 6.719063545150502, "grad_norm": 1.4190149307250977, "learning_rate": 8.39823429445755e-06, "loss": 0.4198, "num_input_tokens_seen": 10352592, "step": 10045 }, { "epoch": 6.722408026755852, "grad_norm": 2.8310093879699707, "learning_rate": 8.396092799438429e-06, "loss": 0.3817, "num_input_tokens_seen": 10358064, "step": 10050 }, { "epoch": 6.725752508361204, "grad_norm": 2.1832275390625, "learning_rate": 8.39395014726335e-06, "loss": 0.4769, "num_input_tokens_seen": 10363408, "step": 10055 }, { "epoch": 6.729096989966555, "grad_norm": 1.362168788909912, "learning_rate": 8.391806338662386e-06, "loss": 0.5003, "num_input_tokens_seen": 10368464, "step": 10060 }, { "epoch": 6.7324414715719065, "grad_norm": 2.1759605407714844, "learning_rate": 8.389661374365998e-06, "loss": 0.4721, "num_input_tokens_seen": 10373616, "step": 10065 }, { "epoch": 6.735785953177258, "grad_norm": 1.5756832361221313, "learning_rate": 8.387515255105043e-06, "loss": 0.4423, "num_input_tokens_seen": 10379024, "step": 10070 }, { "epoch": 6.739130434782608, "grad_norm": 2.759611129760742, "learning_rate": 8.385367981610771e-06, "loss": 0.445, "num_input_tokens_seen": 10384848, "step": 10075 }, { "epoch": 6.74247491638796, "grad_norm": 1.6195664405822754, "learning_rate": 8.383219554614826e-06, "loss": 0.418, "num_input_tokens_seen": 10389616, "step": 10080 }, { "epoch": 6.745819397993311, "grad_norm": 1.3172379732131958, "learning_rate": 8.381069974849244e-06, "loss": 0.4983, "num_input_tokens_seen": 10394640, "step": 10085 }, { "epoch": 6.749163879598662, "grad_norm": 1.9042677879333496, "learning_rate": 8.378919243046457e-06, "loss": 0.5596, "num_input_tokens_seen": 10399536, "step": 10090 }, { "epoch": 6.752508361204013, "grad_norm": 1.5020009279251099, "learning_rate": 8.376767359939286e-06, "loss": 0.3754, "num_input_tokens_seen": 10404400, "step": 10095 }, { "epoch": 6.7558528428093645, "grad_norm": 1.9981955289840698, "learning_rate": 8.374614326260946e-06, "loss": 0.3881, "num_input_tokens_seen": 10409712, "step": 10100 }, { "epoch": 6.759197324414716, "grad_norm": 1.6879515647888184, "learning_rate": 8.372460142745045e-06, "loss": 0.4235, "num_input_tokens_seen": 10414704, "step": 10105 }, { "epoch": 6.762541806020067, "grad_norm": 1.6623598337173462, "learning_rate": 8.370304810125576e-06, "loss": 0.5259, "num_input_tokens_seen": 10421424, "step": 10110 }, { "epoch": 6.765886287625418, "grad_norm": 1.607630968093872, "learning_rate": 8.368148329136935e-06, "loss": 0.4019, "num_input_tokens_seen": 10426160, "step": 10115 }, { "epoch": 6.769230769230769, "grad_norm": 1.5585445165634155, "learning_rate": 8.365990700513898e-06, "loss": 0.4592, "num_input_tokens_seen": 10432304, "step": 10120 }, { "epoch": 6.7725752508361206, "grad_norm": 1.2344584465026855, "learning_rate": 8.363831924991641e-06, "loss": 0.3888, "num_input_tokens_seen": 10437360, "step": 10125 }, { "epoch": 6.775919732441472, "grad_norm": 1.5095933675765991, "learning_rate": 8.361672003305722e-06, "loss": 0.4543, "num_input_tokens_seen": 10442192, "step": 10130 }, { "epoch": 6.7792642140468224, "grad_norm": 1.4413963556289673, "learning_rate": 8.359510936192102e-06, "loss": 0.418, "num_input_tokens_seen": 10447792, "step": 10135 }, { "epoch": 6.782608695652174, "grad_norm": 1.7826417684555054, "learning_rate": 8.357348724387118e-06, "loss": 0.4023, "num_input_tokens_seen": 10452400, "step": 10140 }, { "epoch": 6.785953177257525, "grad_norm": 2.203733205795288, "learning_rate": 8.355185368627509e-06, "loss": 0.4573, "num_input_tokens_seen": 10457232, "step": 10145 }, { "epoch": 6.789297658862877, "grad_norm": 1.6544252634048462, "learning_rate": 8.353020869650393e-06, "loss": 0.4074, "num_input_tokens_seen": 10461808, "step": 10150 }, { "epoch": 6.792642140468227, "grad_norm": 1.328650951385498, "learning_rate": 8.350855228193287e-06, "loss": 0.4031, "num_input_tokens_seen": 10467152, "step": 10155 }, { "epoch": 6.7959866220735785, "grad_norm": 1.822534441947937, "learning_rate": 8.348688444994092e-06, "loss": 0.4521, "num_input_tokens_seen": 10472208, "step": 10160 }, { "epoch": 6.79933110367893, "grad_norm": 1.4608197212219238, "learning_rate": 8.346520520791101e-06, "loss": 0.5555, "num_input_tokens_seen": 10477680, "step": 10165 }, { "epoch": 6.802675585284281, "grad_norm": 1.641100525856018, "learning_rate": 8.344351456322992e-06, "loss": 0.4978, "num_input_tokens_seen": 10483568, "step": 10170 }, { "epoch": 6.806020066889632, "grad_norm": 1.8540756702423096, "learning_rate": 8.342181252328834e-06, "loss": 0.4171, "num_input_tokens_seen": 10488752, "step": 10175 }, { "epoch": 6.809364548494983, "grad_norm": 1.922379970550537, "learning_rate": 8.340009909548086e-06, "loss": 0.4501, "num_input_tokens_seen": 10493616, "step": 10180 }, { "epoch": 6.812709030100335, "grad_norm": 1.9041005373001099, "learning_rate": 8.33783742872059e-06, "loss": 0.4552, "num_input_tokens_seen": 10498672, "step": 10185 }, { "epoch": 6.816053511705686, "grad_norm": 2.1951098442077637, "learning_rate": 8.33566381058658e-06, "loss": 0.4285, "num_input_tokens_seen": 10503312, "step": 10190 }, { "epoch": 6.8193979933110365, "grad_norm": 1.0847289562225342, "learning_rate": 8.333489055886677e-06, "loss": 0.436, "num_input_tokens_seen": 10508784, "step": 10195 }, { "epoch": 6.822742474916388, "grad_norm": 1.1100106239318848, "learning_rate": 8.33131316536189e-06, "loss": 0.4225, "num_input_tokens_seen": 10513936, "step": 10200 }, { "epoch": 6.826086956521739, "grad_norm": 1.4818718433380127, "learning_rate": 8.329136139753609e-06, "loss": 0.4549, "num_input_tokens_seen": 10518384, "step": 10205 }, { "epoch": 6.829431438127091, "grad_norm": 1.5397876501083374, "learning_rate": 8.326957979803618e-06, "loss": 0.4265, "num_input_tokens_seen": 10523632, "step": 10210 }, { "epoch": 6.832775919732441, "grad_norm": 1.4887181520462036, "learning_rate": 8.324778686254086e-06, "loss": 0.4565, "num_input_tokens_seen": 10528752, "step": 10215 }, { "epoch": 6.8361204013377925, "grad_norm": 1.5267382860183716, "learning_rate": 8.322598259847567e-06, "loss": 0.4183, "num_input_tokens_seen": 10533872, "step": 10220 }, { "epoch": 6.839464882943144, "grad_norm": 1.9219560623168945, "learning_rate": 8.320416701326998e-06, "loss": 0.4518, "num_input_tokens_seen": 10538960, "step": 10225 }, { "epoch": 6.842809364548495, "grad_norm": 1.3861024379730225, "learning_rate": 8.318234011435706e-06, "loss": 0.4543, "num_input_tokens_seen": 10544624, "step": 10230 }, { "epoch": 6.846153846153846, "grad_norm": 1.4232869148254395, "learning_rate": 8.316050190917406e-06, "loss": 0.4259, "num_input_tokens_seen": 10549136, "step": 10235 }, { "epoch": 6.849498327759197, "grad_norm": 1.9185823202133179, "learning_rate": 8.313865240516187e-06, "loss": 0.4625, "num_input_tokens_seen": 10554576, "step": 10240 }, { "epoch": 6.852842809364549, "grad_norm": 1.5595492124557495, "learning_rate": 8.31167916097654e-06, "loss": 0.3645, "num_input_tokens_seen": 10560336, "step": 10245 }, { "epoch": 6.8561872909699, "grad_norm": 1.195313572883606, "learning_rate": 8.309491953043323e-06, "loss": 0.4307, "num_input_tokens_seen": 10566704, "step": 10250 }, { "epoch": 6.8595317725752505, "grad_norm": 2.4286909103393555, "learning_rate": 8.307303617461791e-06, "loss": 0.4923, "num_input_tokens_seen": 10570928, "step": 10255 }, { "epoch": 6.862876254180602, "grad_norm": 1.5543702840805054, "learning_rate": 8.305114154977577e-06, "loss": 0.468, "num_input_tokens_seen": 10575632, "step": 10260 }, { "epoch": 6.866220735785953, "grad_norm": 2.5817644596099854, "learning_rate": 8.302923566336704e-06, "loss": 0.4882, "num_input_tokens_seen": 10580272, "step": 10265 }, { "epoch": 6.869565217391305, "grad_norm": 1.7986260652542114, "learning_rate": 8.30073185228557e-06, "loss": 0.487, "num_input_tokens_seen": 10585584, "step": 10270 }, { "epoch": 6.872909698996655, "grad_norm": 1.5517115592956543, "learning_rate": 8.298539013570962e-06, "loss": 0.455, "num_input_tokens_seen": 10590512, "step": 10275 }, { "epoch": 6.8762541806020065, "grad_norm": 1.5319901704788208, "learning_rate": 8.296345050940052e-06, "loss": 0.4026, "num_input_tokens_seen": 10595472, "step": 10280 }, { "epoch": 6.879598662207358, "grad_norm": 1.725091814994812, "learning_rate": 8.294149965140387e-06, "loss": 0.5013, "num_input_tokens_seen": 10600272, "step": 10285 }, { "epoch": 6.882943143812709, "grad_norm": 1.4665840864181519, "learning_rate": 8.291953756919908e-06, "loss": 0.5093, "num_input_tokens_seen": 10605584, "step": 10290 }, { "epoch": 6.88628762541806, "grad_norm": 1.7665191888809204, "learning_rate": 8.28975642702693e-06, "loss": 0.4526, "num_input_tokens_seen": 10610928, "step": 10295 }, { "epoch": 6.889632107023411, "grad_norm": 1.992263674736023, "learning_rate": 8.28755797621015e-06, "loss": 0.3607, "num_input_tokens_seen": 10615216, "step": 10300 }, { "epoch": 6.892976588628763, "grad_norm": 2.326448678970337, "learning_rate": 8.285358405218655e-06, "loss": 0.4474, "num_input_tokens_seen": 10620016, "step": 10305 }, { "epoch": 6.896321070234114, "grad_norm": 1.9595775604248047, "learning_rate": 8.283157714801903e-06, "loss": 0.4986, "num_input_tokens_seen": 10625456, "step": 10310 }, { "epoch": 6.8996655518394645, "grad_norm": 2.20910906791687, "learning_rate": 8.280955905709743e-06, "loss": 0.4605, "num_input_tokens_seen": 10630352, "step": 10315 }, { "epoch": 6.903010033444816, "grad_norm": 1.8682491779327393, "learning_rate": 8.278752978692396e-06, "loss": 0.4081, "num_input_tokens_seen": 10635920, "step": 10320 }, { "epoch": 6.906354515050167, "grad_norm": 1.5209521055221558, "learning_rate": 8.276548934500472e-06, "loss": 0.4124, "num_input_tokens_seen": 10640912, "step": 10325 }, { "epoch": 6.909698996655519, "grad_norm": 1.6327146291732788, "learning_rate": 8.274343773884958e-06, "loss": 0.3992, "num_input_tokens_seen": 10646352, "step": 10330 }, { "epoch": 6.913043478260869, "grad_norm": 1.3910561800003052, "learning_rate": 8.272137497597224e-06, "loss": 0.4207, "num_input_tokens_seen": 10651696, "step": 10335 }, { "epoch": 6.916387959866221, "grad_norm": 1.9551655054092407, "learning_rate": 8.269930106389013e-06, "loss": 0.5057, "num_input_tokens_seen": 10656336, "step": 10340 }, { "epoch": 6.919732441471572, "grad_norm": 1.6701486110687256, "learning_rate": 8.267721601012454e-06, "loss": 0.464, "num_input_tokens_seen": 10661072, "step": 10345 }, { "epoch": 6.923076923076923, "grad_norm": 1.3276352882385254, "learning_rate": 8.265511982220059e-06, "loss": 0.5122, "num_input_tokens_seen": 10666128, "step": 10350 }, { "epoch": 6.926421404682275, "grad_norm": 1.3907994031906128, "learning_rate": 8.263301250764711e-06, "loss": 0.4216, "num_input_tokens_seen": 10671088, "step": 10355 }, { "epoch": 6.929765886287625, "grad_norm": 1.8094497919082642, "learning_rate": 8.261089407399676e-06, "loss": 0.4146, "num_input_tokens_seen": 10675952, "step": 10360 }, { "epoch": 6.933110367892977, "grad_norm": 1.608851671218872, "learning_rate": 8.258876452878599e-06, "loss": 0.4661, "num_input_tokens_seen": 10680880, "step": 10365 }, { "epoch": 6.936454849498328, "grad_norm": 1.2088913917541504, "learning_rate": 8.256662387955502e-06, "loss": 0.4365, "num_input_tokens_seen": 10687120, "step": 10370 }, { "epoch": 6.9397993311036785, "grad_norm": 1.3339202404022217, "learning_rate": 8.254447213384791e-06, "loss": 0.4434, "num_input_tokens_seen": 10693072, "step": 10375 }, { "epoch": 6.94314381270903, "grad_norm": 2.1991164684295654, "learning_rate": 8.252230929921243e-06, "loss": 0.3987, "num_input_tokens_seen": 10698480, "step": 10380 }, { "epoch": 6.946488294314381, "grad_norm": 1.2396364212036133, "learning_rate": 8.250013538320017e-06, "loss": 0.3986, "num_input_tokens_seen": 10703632, "step": 10385 }, { "epoch": 6.949832775919733, "grad_norm": 2.4793293476104736, "learning_rate": 8.247795039336646e-06, "loss": 0.54, "num_input_tokens_seen": 10708816, "step": 10390 }, { "epoch": 6.953177257525084, "grad_norm": 1.6594310998916626, "learning_rate": 8.245575433727044e-06, "loss": 0.4595, "num_input_tokens_seen": 10713168, "step": 10395 }, { "epoch": 6.956521739130435, "grad_norm": 1.7132583856582642, "learning_rate": 8.243354722247502e-06, "loss": 0.4299, "num_input_tokens_seen": 10718480, "step": 10400 }, { "epoch": 6.959866220735786, "grad_norm": 1.2599695920944214, "learning_rate": 8.241132905654685e-06, "loss": 0.4591, "num_input_tokens_seen": 10723248, "step": 10405 }, { "epoch": 6.963210702341137, "grad_norm": 1.530475378036499, "learning_rate": 8.238909984705635e-06, "loss": 0.4439, "num_input_tokens_seen": 10728176, "step": 10410 }, { "epoch": 6.966555183946488, "grad_norm": 1.2620426416397095, "learning_rate": 8.236685960157773e-06, "loss": 0.4477, "num_input_tokens_seen": 10733680, "step": 10415 }, { "epoch": 6.969899665551839, "grad_norm": 1.9853748083114624, "learning_rate": 8.234460832768893e-06, "loss": 0.4914, "num_input_tokens_seen": 10739120, "step": 10420 }, { "epoch": 6.973244147157191, "grad_norm": 1.6894268989562988, "learning_rate": 8.232234603297167e-06, "loss": 0.461, "num_input_tokens_seen": 10744816, "step": 10425 }, { "epoch": 6.976588628762542, "grad_norm": 1.7880918979644775, "learning_rate": 8.230007272501141e-06, "loss": 0.4976, "num_input_tokens_seen": 10750384, "step": 10430 }, { "epoch": 6.979933110367893, "grad_norm": 1.309501051902771, "learning_rate": 8.22777884113974e-06, "loss": 0.4508, "num_input_tokens_seen": 10756144, "step": 10435 }, { "epoch": 6.983277591973244, "grad_norm": 1.6588644981384277, "learning_rate": 8.225549309972256e-06, "loss": 0.4158, "num_input_tokens_seen": 10761008, "step": 10440 }, { "epoch": 6.986622073578595, "grad_norm": 2.130462169647217, "learning_rate": 8.223318679758362e-06, "loss": 0.485, "num_input_tokens_seen": 10766480, "step": 10445 }, { "epoch": 6.989966555183947, "grad_norm": 0.9777175188064575, "learning_rate": 8.221086951258107e-06, "loss": 0.5131, "num_input_tokens_seen": 10772592, "step": 10450 }, { "epoch": 6.993311036789297, "grad_norm": 2.5042524337768555, "learning_rate": 8.218854125231908e-06, "loss": 0.5137, "num_input_tokens_seen": 10778352, "step": 10455 }, { "epoch": 6.996655518394649, "grad_norm": 1.3152674436569214, "learning_rate": 8.21662020244056e-06, "loss": 0.4738, "num_input_tokens_seen": 10784432, "step": 10460 }, { "epoch": 7.0, "grad_norm": 2.179715633392334, "learning_rate": 8.214385183645231e-06, "loss": 0.398, "num_input_tokens_seen": 10789408, "step": 10465 }, { "epoch": 7.003344481605351, "grad_norm": 1.633862853050232, "learning_rate": 8.212149069607465e-06, "loss": 0.4442, "num_input_tokens_seen": 10794336, "step": 10470 }, { "epoch": 7.006688963210703, "grad_norm": 1.4570140838623047, "learning_rate": 8.209911861089172e-06, "loss": 0.3649, "num_input_tokens_seen": 10799712, "step": 10475 }, { "epoch": 7.010033444816053, "grad_norm": 2.112142562866211, "learning_rate": 8.207673558852644e-06, "loss": 0.428, "num_input_tokens_seen": 10804736, "step": 10480 }, { "epoch": 7.013377926421405, "grad_norm": 1.477744460105896, "learning_rate": 8.205434163660539e-06, "loss": 0.3836, "num_input_tokens_seen": 10810144, "step": 10485 }, { "epoch": 7.016722408026756, "grad_norm": 1.6077238321304321, "learning_rate": 8.20319367627589e-06, "loss": 0.4227, "num_input_tokens_seen": 10815808, "step": 10490 }, { "epoch": 7.0200668896321075, "grad_norm": 1.5879218578338623, "learning_rate": 8.200952097462098e-06, "loss": 0.4618, "num_input_tokens_seen": 10820864, "step": 10495 }, { "epoch": 7.023411371237458, "grad_norm": 1.9065064191818237, "learning_rate": 8.198709427982946e-06, "loss": 0.321, "num_input_tokens_seen": 10825472, "step": 10500 }, { "epoch": 7.026755852842809, "grad_norm": 1.7862392663955688, "learning_rate": 8.19646566860258e-06, "loss": 0.4414, "num_input_tokens_seen": 10830720, "step": 10505 }, { "epoch": 7.030100334448161, "grad_norm": 2.163635015487671, "learning_rate": 8.194220820085517e-06, "loss": 0.4738, "num_input_tokens_seen": 10835936, "step": 10510 }, { "epoch": 7.033444816053512, "grad_norm": 1.5194921493530273, "learning_rate": 8.19197488319665e-06, "loss": 0.4152, "num_input_tokens_seen": 10840704, "step": 10515 }, { "epoch": 7.036789297658863, "grad_norm": 1.3265511989593506, "learning_rate": 8.18972785870124e-06, "loss": 0.3262, "num_input_tokens_seen": 10845792, "step": 10520 }, { "epoch": 7.040133779264214, "grad_norm": 3.866384744644165, "learning_rate": 8.187479747364922e-06, "loss": 0.4482, "num_input_tokens_seen": 10851136, "step": 10525 }, { "epoch": 7.043478260869565, "grad_norm": 3.9039084911346436, "learning_rate": 8.185230549953693e-06, "loss": 0.4859, "num_input_tokens_seen": 10855808, "step": 10530 }, { "epoch": 7.046822742474917, "grad_norm": 1.8762120008468628, "learning_rate": 8.18298026723393e-06, "loss": 0.4848, "num_input_tokens_seen": 10861152, "step": 10535 }, { "epoch": 7.050167224080267, "grad_norm": 1.901007890701294, "learning_rate": 8.180728899972372e-06, "loss": 0.3655, "num_input_tokens_seen": 10865792, "step": 10540 }, { "epoch": 7.053511705685619, "grad_norm": 2.0388667583465576, "learning_rate": 8.178476448936133e-06, "loss": 0.4736, "num_input_tokens_seen": 10871200, "step": 10545 }, { "epoch": 7.05685618729097, "grad_norm": 1.6242493391036987, "learning_rate": 8.176222914892694e-06, "loss": 0.4396, "num_input_tokens_seen": 10876928, "step": 10550 }, { "epoch": 7.0602006688963215, "grad_norm": 1.2057000398635864, "learning_rate": 8.173968298609904e-06, "loss": 0.463, "num_input_tokens_seen": 10882880, "step": 10555 }, { "epoch": 7.063545150501672, "grad_norm": 2.2681732177734375, "learning_rate": 8.171712600855984e-06, "loss": 0.4618, "num_input_tokens_seen": 10887936, "step": 10560 }, { "epoch": 7.066889632107023, "grad_norm": 1.4817224740982056, "learning_rate": 8.169455822399519e-06, "loss": 0.404, "num_input_tokens_seen": 10892928, "step": 10565 }, { "epoch": 7.070234113712375, "grad_norm": 1.8949525356292725, "learning_rate": 8.167197964009465e-06, "loss": 0.3843, "num_input_tokens_seen": 10897376, "step": 10570 }, { "epoch": 7.073578595317726, "grad_norm": 1.1975903511047363, "learning_rate": 8.164939026455147e-06, "loss": 0.409, "num_input_tokens_seen": 10903840, "step": 10575 }, { "epoch": 7.076923076923077, "grad_norm": 1.8223683834075928, "learning_rate": 8.162679010506256e-06, "loss": 0.4002, "num_input_tokens_seen": 10908544, "step": 10580 }, { "epoch": 7.080267558528428, "grad_norm": 1.7208613157272339, "learning_rate": 8.160417916932851e-06, "loss": 0.498, "num_input_tokens_seen": 10913888, "step": 10585 }, { "epoch": 7.083612040133779, "grad_norm": 2.00213885307312, "learning_rate": 8.158155746505354e-06, "loss": 0.4468, "num_input_tokens_seen": 10918528, "step": 10590 }, { "epoch": 7.086956521739131, "grad_norm": 1.464988112449646, "learning_rate": 8.155892499994562e-06, "loss": 0.4283, "num_input_tokens_seen": 10923584, "step": 10595 }, { "epoch": 7.090301003344481, "grad_norm": 3.0970523357391357, "learning_rate": 8.153628178171635e-06, "loss": 0.4199, "num_input_tokens_seen": 10929120, "step": 10600 }, { "epoch": 7.093645484949833, "grad_norm": 2.0083465576171875, "learning_rate": 8.151362781808096e-06, "loss": 0.4761, "num_input_tokens_seen": 10934176, "step": 10605 }, { "epoch": 7.096989966555184, "grad_norm": 2.441114664077759, "learning_rate": 8.14909631167584e-06, "loss": 0.3468, "num_input_tokens_seen": 10938656, "step": 10610 }, { "epoch": 7.1003344481605355, "grad_norm": 1.810867190361023, "learning_rate": 8.146828768547123e-06, "loss": 0.4095, "num_input_tokens_seen": 10944320, "step": 10615 }, { "epoch": 7.103678929765886, "grad_norm": 1.5708378553390503, "learning_rate": 8.144560153194567e-06, "loss": 0.4715, "num_input_tokens_seen": 10949120, "step": 10620 }, { "epoch": 7.107023411371237, "grad_norm": 1.6490168571472168, "learning_rate": 8.142290466391165e-06, "loss": 0.4342, "num_input_tokens_seen": 10953568, "step": 10625 }, { "epoch": 7.110367892976589, "grad_norm": 1.9702552556991577, "learning_rate": 8.140019708910266e-06, "loss": 0.5087, "num_input_tokens_seen": 10959552, "step": 10630 }, { "epoch": 7.11371237458194, "grad_norm": 1.5312057733535767, "learning_rate": 8.137747881525593e-06, "loss": 0.4803, "num_input_tokens_seen": 10964448, "step": 10635 }, { "epoch": 7.117056856187291, "grad_norm": 1.756800889968872, "learning_rate": 8.135474985011225e-06, "loss": 0.5004, "num_input_tokens_seen": 10970272, "step": 10640 }, { "epoch": 7.120401337792642, "grad_norm": 1.5097423791885376, "learning_rate": 8.133201020141615e-06, "loss": 0.4225, "num_input_tokens_seen": 10975840, "step": 10645 }, { "epoch": 7.1237458193979935, "grad_norm": 2.273027181625366, "learning_rate": 8.13092598769157e-06, "loss": 0.4621, "num_input_tokens_seen": 10980224, "step": 10650 }, { "epoch": 7.127090301003345, "grad_norm": 2.6369376182556152, "learning_rate": 8.128649888436266e-06, "loss": 0.4497, "num_input_tokens_seen": 10985984, "step": 10655 }, { "epoch": 7.130434782608695, "grad_norm": 1.898877501487732, "learning_rate": 8.126372723151244e-06, "loss": 0.4477, "num_input_tokens_seen": 10990848, "step": 10660 }, { "epoch": 7.133779264214047, "grad_norm": 1.7442400455474854, "learning_rate": 8.124094492612405e-06, "loss": 0.3971, "num_input_tokens_seen": 10996160, "step": 10665 }, { "epoch": 7.137123745819398, "grad_norm": 1.595585584640503, "learning_rate": 8.121815197596012e-06, "loss": 0.3827, "num_input_tokens_seen": 11001536, "step": 10670 }, { "epoch": 7.1404682274247495, "grad_norm": 1.9161709547042847, "learning_rate": 8.119534838878695e-06, "loss": 0.4258, "num_input_tokens_seen": 11006080, "step": 10675 }, { "epoch": 7.1438127090301, "grad_norm": 2.5182554721832275, "learning_rate": 8.117253417237445e-06, "loss": 0.4336, "num_input_tokens_seen": 11010880, "step": 10680 }, { "epoch": 7.147157190635451, "grad_norm": 1.464194893836975, "learning_rate": 8.114970933449612e-06, "loss": 0.4003, "num_input_tokens_seen": 11017152, "step": 10685 }, { "epoch": 7.150501672240803, "grad_norm": 2.3501057624816895, "learning_rate": 8.11268738829291e-06, "loss": 0.4204, "num_input_tokens_seen": 11022080, "step": 10690 }, { "epoch": 7.153846153846154, "grad_norm": 2.0594708919525146, "learning_rate": 8.110402782545419e-06, "loss": 0.4885, "num_input_tokens_seen": 11027136, "step": 10695 }, { "epoch": 7.157190635451505, "grad_norm": 1.808695912361145, "learning_rate": 8.108117116985571e-06, "loss": 0.4918, "num_input_tokens_seen": 11032928, "step": 10700 }, { "epoch": 7.160535117056856, "grad_norm": 1.8349246978759766, "learning_rate": 8.105830392392168e-06, "loss": 0.4007, "num_input_tokens_seen": 11038080, "step": 10705 }, { "epoch": 7.1638795986622075, "grad_norm": 1.682847023010254, "learning_rate": 8.103542609544366e-06, "loss": 0.4753, "num_input_tokens_seen": 11043648, "step": 10710 }, { "epoch": 7.167224080267559, "grad_norm": 1.5099014043807983, "learning_rate": 8.101253769221688e-06, "loss": 0.4557, "num_input_tokens_seen": 11048512, "step": 10715 }, { "epoch": 7.170568561872909, "grad_norm": 1.7214109897613525, "learning_rate": 8.098963872204014e-06, "loss": 0.4222, "num_input_tokens_seen": 11053984, "step": 10720 }, { "epoch": 7.173913043478261, "grad_norm": 1.9395562410354614, "learning_rate": 8.096672919271581e-06, "loss": 0.3568, "num_input_tokens_seen": 11059392, "step": 10725 }, { "epoch": 7.177257525083612, "grad_norm": 2.3733999729156494, "learning_rate": 8.094380911204992e-06, "loss": 0.4534, "num_input_tokens_seen": 11064640, "step": 10730 }, { "epoch": 7.1806020066889635, "grad_norm": 1.8679676055908203, "learning_rate": 8.092087848785204e-06, "loss": 0.4442, "num_input_tokens_seen": 11069952, "step": 10735 }, { "epoch": 7.183946488294314, "grad_norm": 3.8517251014709473, "learning_rate": 8.089793732793538e-06, "loss": 0.4186, "num_input_tokens_seen": 11074944, "step": 10740 }, { "epoch": 7.187290969899665, "grad_norm": 2.110501527786255, "learning_rate": 8.087498564011672e-06, "loss": 0.555, "num_input_tokens_seen": 11080512, "step": 10745 }, { "epoch": 7.190635451505017, "grad_norm": 1.416579008102417, "learning_rate": 8.085202343221639e-06, "loss": 0.4658, "num_input_tokens_seen": 11086112, "step": 10750 }, { "epoch": 7.193979933110368, "grad_norm": 2.788884401321411, "learning_rate": 8.082905071205835e-06, "loss": 0.421, "num_input_tokens_seen": 11090688, "step": 10755 }, { "epoch": 7.197324414715719, "grad_norm": 1.942811131477356, "learning_rate": 8.080606748747017e-06, "loss": 0.421, "num_input_tokens_seen": 11095424, "step": 10760 }, { "epoch": 7.20066889632107, "grad_norm": 2.691427230834961, "learning_rate": 8.078307376628292e-06, "loss": 0.4959, "num_input_tokens_seen": 11099840, "step": 10765 }, { "epoch": 7.2040133779264215, "grad_norm": 1.3947632312774658, "learning_rate": 8.076006955633129e-06, "loss": 0.451, "num_input_tokens_seen": 11105472, "step": 10770 }, { "epoch": 7.207357859531773, "grad_norm": 2.5085299015045166, "learning_rate": 8.073705486545356e-06, "loss": 0.3706, "num_input_tokens_seen": 11110016, "step": 10775 }, { "epoch": 7.210702341137123, "grad_norm": 2.2269866466522217, "learning_rate": 8.071402970149153e-06, "loss": 0.3816, "num_input_tokens_seen": 11115200, "step": 10780 }, { "epoch": 7.214046822742475, "grad_norm": 1.3080979585647583, "learning_rate": 8.069099407229064e-06, "loss": 0.3467, "num_input_tokens_seen": 11119488, "step": 10785 }, { "epoch": 7.217391304347826, "grad_norm": 2.2707865238189697, "learning_rate": 8.066794798569982e-06, "loss": 0.4456, "num_input_tokens_seen": 11124256, "step": 10790 }, { "epoch": 7.2207357859531776, "grad_norm": 2.0768187046051025, "learning_rate": 8.064489144957162e-06, "loss": 0.4709, "num_input_tokens_seen": 11128992, "step": 10795 }, { "epoch": 7.224080267558528, "grad_norm": 2.235584259033203, "learning_rate": 8.06218244717621e-06, "loss": 0.432, "num_input_tokens_seen": 11133792, "step": 10800 }, { "epoch": 7.2274247491638794, "grad_norm": 1.5118613243103027, "learning_rate": 8.059874706013094e-06, "loss": 0.4587, "num_input_tokens_seen": 11139040, "step": 10805 }, { "epoch": 7.230769230769231, "grad_norm": 1.6416518688201904, "learning_rate": 8.057565922254133e-06, "loss": 0.365, "num_input_tokens_seen": 11144352, "step": 10810 }, { "epoch": 7.234113712374582, "grad_norm": 2.741614818572998, "learning_rate": 8.055256096686004e-06, "loss": 0.4197, "num_input_tokens_seen": 11150368, "step": 10815 }, { "epoch": 7.237458193979933, "grad_norm": 2.1200950145721436, "learning_rate": 8.052945230095735e-06, "loss": 0.4829, "num_input_tokens_seen": 11155200, "step": 10820 }, { "epoch": 7.240802675585284, "grad_norm": 1.855482578277588, "learning_rate": 8.050633323270711e-06, "loss": 0.4283, "num_input_tokens_seen": 11160096, "step": 10825 }, { "epoch": 7.2441471571906355, "grad_norm": 2.2887606620788574, "learning_rate": 8.048320376998675e-06, "loss": 0.3863, "num_input_tokens_seen": 11165248, "step": 10830 }, { "epoch": 7.247491638795987, "grad_norm": 1.5864449739456177, "learning_rate": 8.046006392067716e-06, "loss": 0.4319, "num_input_tokens_seen": 11171104, "step": 10835 }, { "epoch": 7.250836120401337, "grad_norm": 2.3799288272857666, "learning_rate": 8.043691369266284e-06, "loss": 0.4431, "num_input_tokens_seen": 11175520, "step": 10840 }, { "epoch": 7.254180602006689, "grad_norm": 1.4831745624542236, "learning_rate": 8.04137530938318e-06, "loss": 0.4068, "num_input_tokens_seen": 11180576, "step": 10845 }, { "epoch": 7.25752508361204, "grad_norm": 1.9962395429611206, "learning_rate": 8.039058213207562e-06, "loss": 0.4427, "num_input_tokens_seen": 11185056, "step": 10850 }, { "epoch": 7.260869565217392, "grad_norm": 1.8776735067367554, "learning_rate": 8.036740081528934e-06, "loss": 0.4443, "num_input_tokens_seen": 11189568, "step": 10855 }, { "epoch": 7.264214046822742, "grad_norm": 1.7220909595489502, "learning_rate": 8.034420915137156e-06, "loss": 0.3643, "num_input_tokens_seen": 11193856, "step": 10860 }, { "epoch": 7.2675585284280935, "grad_norm": 1.5683417320251465, "learning_rate": 8.032100714822443e-06, "loss": 0.4024, "num_input_tokens_seen": 11198944, "step": 10865 }, { "epoch": 7.270903010033445, "grad_norm": 1.7447001934051514, "learning_rate": 8.029779481375361e-06, "loss": 0.4557, "num_input_tokens_seen": 11204128, "step": 10870 }, { "epoch": 7.274247491638796, "grad_norm": 2.1060853004455566, "learning_rate": 8.027457215586827e-06, "loss": 0.4431, "num_input_tokens_seen": 11209024, "step": 10875 }, { "epoch": 7.277591973244147, "grad_norm": 1.844673752784729, "learning_rate": 8.025133918248109e-06, "loss": 0.5186, "num_input_tokens_seen": 11215360, "step": 10880 }, { "epoch": 7.280936454849498, "grad_norm": 1.4973790645599365, "learning_rate": 8.022809590150828e-06, "loss": 0.3963, "num_input_tokens_seen": 11220800, "step": 10885 }, { "epoch": 7.2842809364548495, "grad_norm": 2.555605888366699, "learning_rate": 8.020484232086956e-06, "loss": 0.4661, "num_input_tokens_seen": 11226144, "step": 10890 }, { "epoch": 7.287625418060201, "grad_norm": 1.7935569286346436, "learning_rate": 8.018157844848817e-06, "loss": 0.4676, "num_input_tokens_seen": 11231456, "step": 10895 }, { "epoch": 7.290969899665551, "grad_norm": 1.3329365253448486, "learning_rate": 8.015830429229083e-06, "loss": 0.4802, "num_input_tokens_seen": 11237664, "step": 10900 }, { "epoch": 7.294314381270903, "grad_norm": 1.9210727214813232, "learning_rate": 8.01350198602078e-06, "loss": 0.5085, "num_input_tokens_seen": 11242272, "step": 10905 }, { "epoch": 7.297658862876254, "grad_norm": 1.4306650161743164, "learning_rate": 8.01117251601728e-06, "loss": 0.4054, "num_input_tokens_seen": 11247680, "step": 10910 }, { "epoch": 7.301003344481606, "grad_norm": 2.144798755645752, "learning_rate": 8.008842020012306e-06, "loss": 0.4829, "num_input_tokens_seen": 11252864, "step": 10915 }, { "epoch": 7.304347826086957, "grad_norm": 1.306321620941162, "learning_rate": 8.006510498799935e-06, "loss": 0.3567, "num_input_tokens_seen": 11257344, "step": 10920 }, { "epoch": 7.3076923076923075, "grad_norm": 2.360748291015625, "learning_rate": 8.004177953174587e-06, "loss": 0.4989, "num_input_tokens_seen": 11262048, "step": 10925 }, { "epoch": 7.311036789297659, "grad_norm": 1.9580018520355225, "learning_rate": 8.001844383931037e-06, "loss": 0.4439, "num_input_tokens_seen": 11266720, "step": 10930 }, { "epoch": 7.31438127090301, "grad_norm": 2.0139167308807373, "learning_rate": 7.999509791864402e-06, "loss": 0.4704, "num_input_tokens_seen": 11271808, "step": 10935 }, { "epoch": 7.317725752508361, "grad_norm": 1.7993543148040771, "learning_rate": 7.997174177770153e-06, "loss": 0.3808, "num_input_tokens_seen": 11276704, "step": 10940 }, { "epoch": 7.321070234113712, "grad_norm": 1.5604089498519897, "learning_rate": 7.994837542444105e-06, "loss": 0.432, "num_input_tokens_seen": 11281856, "step": 10945 }, { "epoch": 7.3244147157190636, "grad_norm": 1.953301191329956, "learning_rate": 7.992499886682428e-06, "loss": 0.4362, "num_input_tokens_seen": 11286656, "step": 10950 }, { "epoch": 7.327759197324415, "grad_norm": 2.0000336170196533, "learning_rate": 7.990161211281634e-06, "loss": 0.4401, "num_input_tokens_seen": 11292192, "step": 10955 }, { "epoch": 7.331103678929766, "grad_norm": 2.05879282951355, "learning_rate": 7.987821517038578e-06, "loss": 0.4108, "num_input_tokens_seen": 11297376, "step": 10960 }, { "epoch": 7.334448160535117, "grad_norm": 1.0581467151641846, "learning_rate": 7.985480804750472e-06, "loss": 0.3482, "num_input_tokens_seen": 11302656, "step": 10965 }, { "epoch": 7.337792642140468, "grad_norm": 1.638556957244873, "learning_rate": 7.983139075214872e-06, "loss": 0.3798, "num_input_tokens_seen": 11308320, "step": 10970 }, { "epoch": 7.34113712374582, "grad_norm": 2.2861204147338867, "learning_rate": 7.980796329229678e-06, "loss": 0.4808, "num_input_tokens_seen": 11313024, "step": 10975 }, { "epoch": 7.34448160535117, "grad_norm": 1.540519118309021, "learning_rate": 7.978452567593134e-06, "loss": 0.3686, "num_input_tokens_seen": 11318400, "step": 10980 }, { "epoch": 7.3478260869565215, "grad_norm": 1.4441369771957397, "learning_rate": 7.976107791103838e-06, "loss": 0.4163, "num_input_tokens_seen": 11323040, "step": 10985 }, { "epoch": 7.351170568561873, "grad_norm": 1.6223058700561523, "learning_rate": 7.973762000560728e-06, "loss": 0.398, "num_input_tokens_seen": 11328192, "step": 10990 }, { "epoch": 7.354515050167224, "grad_norm": 1.7249279022216797, "learning_rate": 7.971415196763088e-06, "loss": 0.5237, "num_input_tokens_seen": 11333568, "step": 10995 }, { "epoch": 7.357859531772576, "grad_norm": 1.6336772441864014, "learning_rate": 7.969067380510549e-06, "loss": 0.4155, "num_input_tokens_seen": 11338752, "step": 11000 }, { "epoch": 7.361204013377926, "grad_norm": 1.7949731349945068, "learning_rate": 7.966718552603086e-06, "loss": 0.4806, "num_input_tokens_seen": 11343904, "step": 11005 }, { "epoch": 7.364548494983278, "grad_norm": 2.1546132564544678, "learning_rate": 7.964368713841016e-06, "loss": 0.4322, "num_input_tokens_seen": 11349952, "step": 11010 }, { "epoch": 7.367892976588629, "grad_norm": 1.2809538841247559, "learning_rate": 7.962017865025007e-06, "loss": 0.3687, "num_input_tokens_seen": 11355520, "step": 11015 }, { "epoch": 7.3712374581939795, "grad_norm": 1.5103577375411987, "learning_rate": 7.959666006956066e-06, "loss": 0.3914, "num_input_tokens_seen": 11359904, "step": 11020 }, { "epoch": 7.374581939799331, "grad_norm": 1.7707921266555786, "learning_rate": 7.957313140435545e-06, "loss": 0.4129, "num_input_tokens_seen": 11364192, "step": 11025 }, { "epoch": 7.377926421404682, "grad_norm": 2.438535451889038, "learning_rate": 7.954959266265141e-06, "loss": 0.4054, "num_input_tokens_seen": 11368768, "step": 11030 }, { "epoch": 7.381270903010034, "grad_norm": 1.7314119338989258, "learning_rate": 7.952604385246891e-06, "loss": 0.4834, "num_input_tokens_seen": 11373920, "step": 11035 }, { "epoch": 7.384615384615385, "grad_norm": 1.3710391521453857, "learning_rate": 7.95024849818318e-06, "loss": 0.3966, "num_input_tokens_seen": 11379584, "step": 11040 }, { "epoch": 7.3879598662207355, "grad_norm": 1.4919712543487549, "learning_rate": 7.94789160587673e-06, "loss": 0.3472, "num_input_tokens_seen": 11384704, "step": 11045 }, { "epoch": 7.391304347826087, "grad_norm": 1.5653749704360962, "learning_rate": 7.94553370913061e-06, "loss": 0.4044, "num_input_tokens_seen": 11389344, "step": 11050 }, { "epoch": 7.394648829431438, "grad_norm": 2.668077230453491, "learning_rate": 7.943174808748231e-06, "loss": 0.4514, "num_input_tokens_seen": 11394432, "step": 11055 }, { "epoch": 7.39799331103679, "grad_norm": 1.993961215019226, "learning_rate": 7.940814905533342e-06, "loss": 0.378, "num_input_tokens_seen": 11399200, "step": 11060 }, { "epoch": 7.40133779264214, "grad_norm": 1.2715630531311035, "learning_rate": 7.93845400029004e-06, "loss": 0.4602, "num_input_tokens_seen": 11405024, "step": 11065 }, { "epoch": 7.404682274247492, "grad_norm": 1.667510747909546, "learning_rate": 7.936092093822758e-06, "loss": 0.4293, "num_input_tokens_seen": 11410336, "step": 11070 }, { "epoch": 7.408026755852843, "grad_norm": 1.8009988069534302, "learning_rate": 7.933729186936273e-06, "loss": 0.3925, "num_input_tokens_seen": 11415584, "step": 11075 }, { "epoch": 7.411371237458194, "grad_norm": 1.9544541835784912, "learning_rate": 7.931365280435698e-06, "loss": 0.3586, "num_input_tokens_seen": 11420448, "step": 11080 }, { "epoch": 7.414715719063545, "grad_norm": 1.7325178384780884, "learning_rate": 7.929000375126496e-06, "loss": 0.3846, "num_input_tokens_seen": 11425664, "step": 11085 }, { "epoch": 7.418060200668896, "grad_norm": 2.8482143878936768, "learning_rate": 7.926634471814461e-06, "loss": 0.4754, "num_input_tokens_seen": 11431040, "step": 11090 }, { "epoch": 7.421404682274248, "grad_norm": 2.096487522125244, "learning_rate": 7.924267571305733e-06, "loss": 0.3608, "num_input_tokens_seen": 11436064, "step": 11095 }, { "epoch": 7.424749163879599, "grad_norm": 2.947629451751709, "learning_rate": 7.921899674406787e-06, "loss": 0.5395, "num_input_tokens_seen": 11440544, "step": 11100 }, { "epoch": 7.4280936454849495, "grad_norm": 1.5193411111831665, "learning_rate": 7.919530781924445e-06, "loss": 0.4131, "num_input_tokens_seen": 11446816, "step": 11105 }, { "epoch": 7.431438127090301, "grad_norm": 1.447790503501892, "learning_rate": 7.917160894665859e-06, "loss": 0.444, "num_input_tokens_seen": 11452256, "step": 11110 }, { "epoch": 7.434782608695652, "grad_norm": 1.5558345317840576, "learning_rate": 7.914790013438525e-06, "loss": 0.4305, "num_input_tokens_seen": 11456864, "step": 11115 }, { "epoch": 7.438127090301004, "grad_norm": 3.4202256202697754, "learning_rate": 7.91241813905028e-06, "loss": 0.502, "num_input_tokens_seen": 11461760, "step": 11120 }, { "epoch": 7.441471571906354, "grad_norm": 2.5432989597320557, "learning_rate": 7.910045272309295e-06, "loss": 0.5231, "num_input_tokens_seen": 11466976, "step": 11125 }, { "epoch": 7.444816053511706, "grad_norm": 2.0270299911499023, "learning_rate": 7.907671414024078e-06, "loss": 0.376, "num_input_tokens_seen": 11471776, "step": 11130 }, { "epoch": 7.448160535117057, "grad_norm": 2.1986923217773438, "learning_rate": 7.90529656500348e-06, "loss": 0.549, "num_input_tokens_seen": 11476608, "step": 11135 }, { "epoch": 7.451505016722408, "grad_norm": 1.8812744617462158, "learning_rate": 7.902920726056686e-06, "loss": 0.4438, "num_input_tokens_seen": 11481376, "step": 11140 }, { "epoch": 7.454849498327759, "grad_norm": 1.3094449043273926, "learning_rate": 7.900543897993218e-06, "loss": 0.4839, "num_input_tokens_seen": 11487520, "step": 11145 }, { "epoch": 7.45819397993311, "grad_norm": 1.6306368112564087, "learning_rate": 7.89816608162294e-06, "loss": 0.4292, "num_input_tokens_seen": 11492608, "step": 11150 }, { "epoch": 7.461538461538462, "grad_norm": 2.208804130554199, "learning_rate": 7.895787277756047e-06, "loss": 0.4286, "num_input_tokens_seen": 11498304, "step": 11155 }, { "epoch": 7.464882943143813, "grad_norm": 2.457801103591919, "learning_rate": 7.893407487203072e-06, "loss": 0.3582, "num_input_tokens_seen": 11503360, "step": 11160 }, { "epoch": 7.468227424749164, "grad_norm": 1.8054341077804565, "learning_rate": 7.891026710774884e-06, "loss": 0.4599, "num_input_tokens_seen": 11508832, "step": 11165 }, { "epoch": 7.471571906354515, "grad_norm": 1.9987479448318481, "learning_rate": 7.888644949282692e-06, "loss": 0.5623, "num_input_tokens_seen": 11514976, "step": 11170 }, { "epoch": 7.474916387959866, "grad_norm": 1.3804206848144531, "learning_rate": 7.886262203538033e-06, "loss": 0.4631, "num_input_tokens_seen": 11520320, "step": 11175 }, { "epoch": 7.478260869565218, "grad_norm": 1.67451810836792, "learning_rate": 7.883878474352788e-06, "loss": 0.4787, "num_input_tokens_seen": 11526432, "step": 11180 }, { "epoch": 7.481605351170568, "grad_norm": 1.578735589981079, "learning_rate": 7.881493762539166e-06, "loss": 0.4925, "num_input_tokens_seen": 11531840, "step": 11185 }, { "epoch": 7.48494983277592, "grad_norm": 1.4870153665542603, "learning_rate": 7.879108068909712e-06, "loss": 0.3546, "num_input_tokens_seen": 11537120, "step": 11190 }, { "epoch": 7.488294314381271, "grad_norm": 1.4845739603042603, "learning_rate": 7.87672139427731e-06, "loss": 0.343, "num_input_tokens_seen": 11541888, "step": 11195 }, { "epoch": 7.491638795986622, "grad_norm": 1.7146248817443848, "learning_rate": 7.874333739455176e-06, "loss": 0.5064, "num_input_tokens_seen": 11547008, "step": 11200 }, { "epoch": 7.494983277591973, "grad_norm": 1.8108323812484741, "learning_rate": 7.871945105256856e-06, "loss": 0.5122, "num_input_tokens_seen": 11552256, "step": 11205 }, { "epoch": 7.498327759197324, "grad_norm": 2.248457431793213, "learning_rate": 7.869555492496235e-06, "loss": 0.3855, "num_input_tokens_seen": 11556768, "step": 11210 }, { "epoch": 7.501672240802676, "grad_norm": 1.349004864692688, "learning_rate": 7.86716490198753e-06, "loss": 0.4027, "num_input_tokens_seen": 11561056, "step": 11215 }, { "epoch": 7.505016722408027, "grad_norm": 1.4223964214324951, "learning_rate": 7.86477333454529e-06, "loss": 0.4267, "num_input_tokens_seen": 11565600, "step": 11220 }, { "epoch": 7.508361204013378, "grad_norm": 1.8929978609085083, "learning_rate": 7.862380790984396e-06, "loss": 0.4528, "num_input_tokens_seen": 11570592, "step": 11225 }, { "epoch": 7.511705685618729, "grad_norm": 1.1719964742660522, "learning_rate": 7.859987272120067e-06, "loss": 0.4393, "num_input_tokens_seen": 11576000, "step": 11230 }, { "epoch": 7.51505016722408, "grad_norm": 2.092027187347412, "learning_rate": 7.857592778767845e-06, "loss": 0.4487, "num_input_tokens_seen": 11581696, "step": 11235 }, { "epoch": 7.518394648829432, "grad_norm": 1.74602210521698, "learning_rate": 7.855197311743613e-06, "loss": 0.4458, "num_input_tokens_seen": 11586560, "step": 11240 }, { "epoch": 7.521739130434782, "grad_norm": 1.9451953172683716, "learning_rate": 7.852800871863581e-06, "loss": 0.4286, "num_input_tokens_seen": 11591744, "step": 11245 }, { "epoch": 7.525083612040134, "grad_norm": 2.2591168880462646, "learning_rate": 7.850403459944293e-06, "loss": 0.4027, "num_input_tokens_seen": 11596832, "step": 11250 }, { "epoch": 7.528428093645485, "grad_norm": 2.2013704776763916, "learning_rate": 7.848005076802622e-06, "loss": 0.4706, "num_input_tokens_seen": 11601792, "step": 11255 }, { "epoch": 7.531772575250836, "grad_norm": 1.803783893585205, "learning_rate": 7.845605723255774e-06, "loss": 0.447, "num_input_tokens_seen": 11607424, "step": 11260 }, { "epoch": 7.535117056856187, "grad_norm": 2.062805652618408, "learning_rate": 7.843205400121281e-06, "loss": 0.4135, "num_input_tokens_seen": 11613344, "step": 11265 }, { "epoch": 7.538461538461538, "grad_norm": 1.9991261959075928, "learning_rate": 7.840804108217011e-06, "loss": 0.4095, "num_input_tokens_seen": 11618368, "step": 11270 }, { "epoch": 7.54180602006689, "grad_norm": 2.2242624759674072, "learning_rate": 7.838401848361161e-06, "loss": 0.5084, "num_input_tokens_seen": 11622624, "step": 11275 }, { "epoch": 7.545150501672241, "grad_norm": 2.024291753768921, "learning_rate": 7.835998621372256e-06, "loss": 0.5202, "num_input_tokens_seen": 11628480, "step": 11280 }, { "epoch": 7.548494983277592, "grad_norm": 2.0879366397857666, "learning_rate": 7.83359442806915e-06, "loss": 0.4544, "num_input_tokens_seen": 11633440, "step": 11285 }, { "epoch": 7.551839464882943, "grad_norm": 2.2484443187713623, "learning_rate": 7.831189269271029e-06, "loss": 0.5111, "num_input_tokens_seen": 11637920, "step": 11290 }, { "epoch": 7.555183946488294, "grad_norm": 2.052734375, "learning_rate": 7.828783145797405e-06, "loss": 0.4261, "num_input_tokens_seen": 11643296, "step": 11295 }, { "epoch": 7.558528428093646, "grad_norm": 2.944340229034424, "learning_rate": 7.82637605846812e-06, "loss": 0.4545, "num_input_tokens_seen": 11647200, "step": 11300 }, { "epoch": 7.561872909698996, "grad_norm": 1.9821316003799438, "learning_rate": 7.823968008103348e-06, "loss": 0.4014, "num_input_tokens_seen": 11652576, "step": 11305 }, { "epoch": 7.565217391304348, "grad_norm": 1.6474378108978271, "learning_rate": 7.821558995523586e-06, "loss": 0.4523, "num_input_tokens_seen": 11658368, "step": 11310 }, { "epoch": 7.568561872909699, "grad_norm": 2.4949991703033447, "learning_rate": 7.819149021549657e-06, "loss": 0.3468, "num_input_tokens_seen": 11662880, "step": 11315 }, { "epoch": 7.5719063545150505, "grad_norm": 2.270537853240967, "learning_rate": 7.81673808700272e-06, "loss": 0.4774, "num_input_tokens_seen": 11668512, "step": 11320 }, { "epoch": 7.575250836120401, "grad_norm": 1.8741745948791504, "learning_rate": 7.814326192704255e-06, "loss": 0.4758, "num_input_tokens_seen": 11674400, "step": 11325 }, { "epoch": 7.578595317725752, "grad_norm": 2.3063602447509766, "learning_rate": 7.81191333947607e-06, "loss": 0.4874, "num_input_tokens_seen": 11680064, "step": 11330 }, { "epoch": 7.581939799331104, "grad_norm": 1.1782103776931763, "learning_rate": 7.809499528140299e-06, "loss": 0.3925, "num_input_tokens_seen": 11685408, "step": 11335 }, { "epoch": 7.585284280936455, "grad_norm": 1.6044411659240723, "learning_rate": 7.807084759519405e-06, "loss": 0.4665, "num_input_tokens_seen": 11691360, "step": 11340 }, { "epoch": 7.588628762541806, "grad_norm": 1.8892303705215454, "learning_rate": 7.804669034436177e-06, "loss": 0.397, "num_input_tokens_seen": 11696544, "step": 11345 }, { "epoch": 7.591973244147157, "grad_norm": 1.6535238027572632, "learning_rate": 7.802252353713727e-06, "loss": 0.4266, "num_input_tokens_seen": 11701952, "step": 11350 }, { "epoch": 7.595317725752508, "grad_norm": 2.3164522647857666, "learning_rate": 7.799834718175494e-06, "loss": 0.395, "num_input_tokens_seen": 11707168, "step": 11355 }, { "epoch": 7.59866220735786, "grad_norm": 2.1250038146972656, "learning_rate": 7.797416128645246e-06, "loss": 0.5416, "num_input_tokens_seen": 11713376, "step": 11360 }, { "epoch": 7.602006688963211, "grad_norm": 1.8125447034835815, "learning_rate": 7.794996585947067e-06, "loss": 0.372, "num_input_tokens_seen": 11717792, "step": 11365 }, { "epoch": 7.605351170568562, "grad_norm": 1.3478038311004639, "learning_rate": 7.792576090905377e-06, "loss": 0.487, "num_input_tokens_seen": 11722752, "step": 11370 }, { "epoch": 7.608695652173913, "grad_norm": 2.360029697418213, "learning_rate": 7.790154644344913e-06, "loss": 0.531, "num_input_tokens_seen": 11727520, "step": 11375 }, { "epoch": 7.6120401337792645, "grad_norm": 1.3904234170913696, "learning_rate": 7.787732247090738e-06, "loss": 0.4333, "num_input_tokens_seen": 11732160, "step": 11380 }, { "epoch": 7.615384615384615, "grad_norm": 1.697792410850525, "learning_rate": 7.785308899968239e-06, "loss": 0.4346, "num_input_tokens_seen": 11736896, "step": 11385 }, { "epoch": 7.618729096989966, "grad_norm": 1.7067928314208984, "learning_rate": 7.782884603803128e-06, "loss": 0.4646, "num_input_tokens_seen": 11742560, "step": 11390 }, { "epoch": 7.622073578595318, "grad_norm": 1.4917443990707397, "learning_rate": 7.780459359421437e-06, "loss": 0.3999, "num_input_tokens_seen": 11747712, "step": 11395 }, { "epoch": 7.625418060200669, "grad_norm": 1.7835136651992798, "learning_rate": 7.778033167649526e-06, "loss": 0.3634, "num_input_tokens_seen": 11752192, "step": 11400 }, { "epoch": 7.6287625418060205, "grad_norm": 1.8925244808197021, "learning_rate": 7.775606029314073e-06, "loss": 0.4207, "num_input_tokens_seen": 11757376, "step": 11405 }, { "epoch": 7.632107023411371, "grad_norm": 1.6292707920074463, "learning_rate": 7.773177945242081e-06, "loss": 0.4881, "num_input_tokens_seen": 11763040, "step": 11410 }, { "epoch": 7.635451505016722, "grad_norm": 1.8114680051803589, "learning_rate": 7.770748916260875e-06, "loss": 0.473, "num_input_tokens_seen": 11769056, "step": 11415 }, { "epoch": 7.638795986622074, "grad_norm": 1.8360590934753418, "learning_rate": 7.768318943198103e-06, "loss": 0.4764, "num_input_tokens_seen": 11774464, "step": 11420 }, { "epoch": 7.642140468227424, "grad_norm": 1.2683007717132568, "learning_rate": 7.76588802688173e-06, "loss": 0.4603, "num_input_tokens_seen": 11779200, "step": 11425 }, { "epoch": 7.645484949832776, "grad_norm": 2.176302194595337, "learning_rate": 7.76345616814005e-06, "loss": 0.5668, "num_input_tokens_seen": 11783936, "step": 11430 }, { "epoch": 7.648829431438127, "grad_norm": 1.7593235969543457, "learning_rate": 7.761023367801672e-06, "loss": 0.4325, "num_input_tokens_seen": 11789120, "step": 11435 }, { "epoch": 7.6521739130434785, "grad_norm": 2.065037727355957, "learning_rate": 7.75858962669553e-06, "loss": 0.4315, "num_input_tokens_seen": 11793888, "step": 11440 }, { "epoch": 7.65551839464883, "grad_norm": 1.2082003355026245, "learning_rate": 7.756154945650872e-06, "loss": 0.3821, "num_input_tokens_seen": 11800544, "step": 11445 }, { "epoch": 7.65886287625418, "grad_norm": 1.3558093309402466, "learning_rate": 7.753719325497272e-06, "loss": 0.3699, "num_input_tokens_seen": 11805120, "step": 11450 }, { "epoch": 7.662207357859532, "grad_norm": 1.9007718563079834, "learning_rate": 7.751282767064627e-06, "loss": 0.5571, "num_input_tokens_seen": 11809600, "step": 11455 }, { "epoch": 7.665551839464883, "grad_norm": 1.5762200355529785, "learning_rate": 7.748845271183145e-06, "loss": 0.466, "num_input_tokens_seen": 11814976, "step": 11460 }, { "epoch": 7.668896321070234, "grad_norm": 1.6859931945800781, "learning_rate": 7.746406838683358e-06, "loss": 0.5002, "num_input_tokens_seen": 11819872, "step": 11465 }, { "epoch": 7.672240802675585, "grad_norm": 2.6089580059051514, "learning_rate": 7.743967470396119e-06, "loss": 0.392, "num_input_tokens_seen": 11824064, "step": 11470 }, { "epoch": 7.6755852842809364, "grad_norm": 1.694269061088562, "learning_rate": 7.741527167152595e-06, "loss": 0.4043, "num_input_tokens_seen": 11829440, "step": 11475 }, { "epoch": 7.678929765886288, "grad_norm": 2.3463642597198486, "learning_rate": 7.739085929784276e-06, "loss": 0.4228, "num_input_tokens_seen": 11834592, "step": 11480 }, { "epoch": 7.682274247491639, "grad_norm": 2.1163008213043213, "learning_rate": 7.736643759122968e-06, "loss": 0.5298, "num_input_tokens_seen": 11840640, "step": 11485 }, { "epoch": 7.68561872909699, "grad_norm": 2.6921918392181396, "learning_rate": 7.734200656000798e-06, "loss": 0.4557, "num_input_tokens_seen": 11846592, "step": 11490 }, { "epoch": 7.688963210702341, "grad_norm": 2.1192383766174316, "learning_rate": 7.731756621250204e-06, "loss": 0.4338, "num_input_tokens_seen": 11851328, "step": 11495 }, { "epoch": 7.6923076923076925, "grad_norm": 1.8543051481246948, "learning_rate": 7.72931165570395e-06, "loss": 0.4417, "num_input_tokens_seen": 11856736, "step": 11500 }, { "epoch": 7.695652173913043, "grad_norm": 2.1668310165405273, "learning_rate": 7.726865760195107e-06, "loss": 0.4997, "num_input_tokens_seen": 11861504, "step": 11505 }, { "epoch": 7.698996655518394, "grad_norm": 1.665808916091919, "learning_rate": 7.724418935557076e-06, "loss": 0.5209, "num_input_tokens_seen": 11866848, "step": 11510 }, { "epoch": 7.702341137123746, "grad_norm": 1.523970603942871, "learning_rate": 7.721971182623565e-06, "loss": 0.3919, "num_input_tokens_seen": 11871616, "step": 11515 }, { "epoch": 7.705685618729097, "grad_norm": 2.345893144607544, "learning_rate": 7.719522502228597e-06, "loss": 0.4573, "num_input_tokens_seen": 11877088, "step": 11520 }, { "epoch": 7.709030100334449, "grad_norm": 1.8084367513656616, "learning_rate": 7.717072895206519e-06, "loss": 0.285, "num_input_tokens_seen": 11882112, "step": 11525 }, { "epoch": 7.712374581939799, "grad_norm": 1.3489680290222168, "learning_rate": 7.714622362391985e-06, "loss": 0.4629, "num_input_tokens_seen": 11886816, "step": 11530 }, { "epoch": 7.7157190635451505, "grad_norm": 2.7476437091827393, "learning_rate": 7.712170904619973e-06, "loss": 0.4251, "num_input_tokens_seen": 11891552, "step": 11535 }, { "epoch": 7.719063545150502, "grad_norm": 1.512682557106018, "learning_rate": 7.709718522725771e-06, "loss": 0.3823, "num_input_tokens_seen": 11896864, "step": 11540 }, { "epoch": 7.722408026755852, "grad_norm": 2.136207103729248, "learning_rate": 7.707265217544982e-06, "loss": 0.4863, "num_input_tokens_seen": 11902208, "step": 11545 }, { "epoch": 7.725752508361204, "grad_norm": 2.020038366317749, "learning_rate": 7.704810989913524e-06, "loss": 0.4525, "num_input_tokens_seen": 11907040, "step": 11550 }, { "epoch": 7.729096989966555, "grad_norm": 2.862947463989258, "learning_rate": 7.702355840667633e-06, "loss": 0.4181, "num_input_tokens_seen": 11911680, "step": 11555 }, { "epoch": 7.7324414715719065, "grad_norm": 2.2197067737579346, "learning_rate": 7.699899770643851e-06, "loss": 0.4502, "num_input_tokens_seen": 11916960, "step": 11560 }, { "epoch": 7.735785953177258, "grad_norm": 1.6080433130264282, "learning_rate": 7.697442780679043e-06, "loss": 0.4481, "num_input_tokens_seen": 11923552, "step": 11565 }, { "epoch": 7.739130434782608, "grad_norm": 1.554800033569336, "learning_rate": 7.69498487161038e-06, "loss": 0.4808, "num_input_tokens_seen": 11929024, "step": 11570 }, { "epoch": 7.74247491638796, "grad_norm": 1.4199128150939941, "learning_rate": 7.69252604427535e-06, "loss": 0.3971, "num_input_tokens_seen": 11934080, "step": 11575 }, { "epoch": 7.745819397993311, "grad_norm": 2.22676944732666, "learning_rate": 7.690066299511753e-06, "loss": 0.5527, "num_input_tokens_seen": 11938816, "step": 11580 }, { "epoch": 7.749163879598662, "grad_norm": 1.7330403327941895, "learning_rate": 7.687605638157702e-06, "loss": 0.5781, "num_input_tokens_seen": 11944480, "step": 11585 }, { "epoch": 7.752508361204013, "grad_norm": 1.6726648807525635, "learning_rate": 7.685144061051622e-06, "loss": 0.4445, "num_input_tokens_seen": 11949760, "step": 11590 }, { "epoch": 7.7558528428093645, "grad_norm": 1.9297138452529907, "learning_rate": 7.68268156903225e-06, "loss": 0.4983, "num_input_tokens_seen": 11954624, "step": 11595 }, { "epoch": 7.759197324414716, "grad_norm": 1.8022173643112183, "learning_rate": 7.680218162938633e-06, "loss": 0.464, "num_input_tokens_seen": 11959808, "step": 11600 }, { "epoch": 7.762541806020067, "grad_norm": 1.8339824676513672, "learning_rate": 7.677753843610134e-06, "loss": 0.4139, "num_input_tokens_seen": 11964672, "step": 11605 }, { "epoch": 7.765886287625418, "grad_norm": 2.1775503158569336, "learning_rate": 7.675288611886423e-06, "loss": 0.5198, "num_input_tokens_seen": 11969280, "step": 11610 }, { "epoch": 7.769230769230769, "grad_norm": 1.406556487083435, "learning_rate": 7.672822468607482e-06, "loss": 0.3939, "num_input_tokens_seen": 11973728, "step": 11615 }, { "epoch": 7.7725752508361206, "grad_norm": 2.5054385662078857, "learning_rate": 7.670355414613604e-06, "loss": 0.4772, "num_input_tokens_seen": 11978464, "step": 11620 }, { "epoch": 7.775919732441472, "grad_norm": 2.1709344387054443, "learning_rate": 7.667887450745392e-06, "loss": 0.4669, "num_input_tokens_seen": 11983328, "step": 11625 }, { "epoch": 7.7792642140468224, "grad_norm": 1.8589493036270142, "learning_rate": 7.66541857784376e-06, "loss": 0.5068, "num_input_tokens_seen": 11988096, "step": 11630 }, { "epoch": 7.782608695652174, "grad_norm": 2.239929676055908, "learning_rate": 7.66294879674993e-06, "loss": 0.5189, "num_input_tokens_seen": 11993792, "step": 11635 }, { "epoch": 7.785953177257525, "grad_norm": 1.951370120048523, "learning_rate": 7.660478108305433e-06, "loss": 0.5017, "num_input_tokens_seen": 11998304, "step": 11640 }, { "epoch": 7.789297658862877, "grad_norm": 1.118665337562561, "learning_rate": 7.658006513352112e-06, "loss": 0.3817, "num_input_tokens_seen": 12003456, "step": 11645 }, { "epoch": 7.792642140468227, "grad_norm": 1.1258615255355835, "learning_rate": 7.655534012732118e-06, "loss": 0.3969, "num_input_tokens_seen": 12008320, "step": 11650 }, { "epoch": 7.7959866220735785, "grad_norm": 1.9989771842956543, "learning_rate": 7.65306060728791e-06, "loss": 0.4354, "num_input_tokens_seen": 12012992, "step": 11655 }, { "epoch": 7.79933110367893, "grad_norm": 1.7664490938186646, "learning_rate": 7.650586297862254e-06, "loss": 0.4587, "num_input_tokens_seen": 12018464, "step": 11660 }, { "epoch": 7.802675585284281, "grad_norm": 1.0374042987823486, "learning_rate": 7.648111085298226e-06, "loss": 0.4377, "num_input_tokens_seen": 12024672, "step": 11665 }, { "epoch": 7.806020066889632, "grad_norm": 1.5486338138580322, "learning_rate": 7.64563497043921e-06, "loss": 0.4667, "num_input_tokens_seen": 12029152, "step": 11670 }, { "epoch": 7.809364548494983, "grad_norm": 2.7589213848114014, "learning_rate": 7.643157954128895e-06, "loss": 0.4025, "num_input_tokens_seen": 12033696, "step": 11675 }, { "epoch": 7.812709030100335, "grad_norm": 1.6991045475006104, "learning_rate": 7.640680037211279e-06, "loss": 0.4883, "num_input_tokens_seen": 12039488, "step": 11680 }, { "epoch": 7.816053511705686, "grad_norm": 1.868175983428955, "learning_rate": 7.638201220530664e-06, "loss": 0.4104, "num_input_tokens_seen": 12044192, "step": 11685 }, { "epoch": 7.8193979933110365, "grad_norm": 1.9119703769683838, "learning_rate": 7.635721504931666e-06, "loss": 0.4399, "num_input_tokens_seen": 12049824, "step": 11690 }, { "epoch": 7.822742474916388, "grad_norm": 1.6608282327651978, "learning_rate": 7.633240891259198e-06, "loss": 0.4338, "num_input_tokens_seen": 12054240, "step": 11695 }, { "epoch": 7.826086956521739, "grad_norm": 1.703325867652893, "learning_rate": 7.630759380358488e-06, "loss": 0.4832, "num_input_tokens_seen": 12059040, "step": 11700 }, { "epoch": 7.829431438127091, "grad_norm": 1.6433923244476318, "learning_rate": 7.6282769730750595e-06, "loss": 0.4353, "num_input_tokens_seen": 12064128, "step": 11705 }, { "epoch": 7.832775919732441, "grad_norm": 1.5768266916275024, "learning_rate": 7.625793670254751e-06, "loss": 0.4637, "num_input_tokens_seen": 12069120, "step": 11710 }, { "epoch": 7.8361204013377925, "grad_norm": 1.583559274673462, "learning_rate": 7.623309472743701e-06, "loss": 0.523, "num_input_tokens_seen": 12074464, "step": 11715 }, { "epoch": 7.839464882943144, "grad_norm": 1.491636872291565, "learning_rate": 7.620824381388352e-06, "loss": 0.4586, "num_input_tokens_seen": 12080352, "step": 11720 }, { "epoch": 7.842809364548495, "grad_norm": 1.5474878549575806, "learning_rate": 7.618338397035457e-06, "loss": 0.4014, "num_input_tokens_seen": 12085632, "step": 11725 }, { "epoch": 7.846153846153846, "grad_norm": 1.324763536453247, "learning_rate": 7.615851520532065e-06, "loss": 0.4243, "num_input_tokens_seen": 12090592, "step": 11730 }, { "epoch": 7.849498327759197, "grad_norm": 1.4818320274353027, "learning_rate": 7.613363752725536e-06, "loss": 0.4811, "num_input_tokens_seen": 12095392, "step": 11735 }, { "epoch": 7.852842809364549, "grad_norm": 1.6378170251846313, "learning_rate": 7.61087509446353e-06, "loss": 0.4152, "num_input_tokens_seen": 12100992, "step": 11740 }, { "epoch": 7.8561872909699, "grad_norm": 1.809996247291565, "learning_rate": 7.608385546594011e-06, "loss": 0.406, "num_input_tokens_seen": 12106304, "step": 11745 }, { "epoch": 7.8595317725752505, "grad_norm": 2.2278645038604736, "learning_rate": 7.605895109965249e-06, "loss": 0.4254, "num_input_tokens_seen": 12111136, "step": 11750 }, { "epoch": 7.862876254180602, "grad_norm": 2.2025935649871826, "learning_rate": 7.6034037854258115e-06, "loss": 0.4512, "num_input_tokens_seen": 12116480, "step": 11755 }, { "epoch": 7.866220735785953, "grad_norm": 1.7095664739608765, "learning_rate": 7.600911573824575e-06, "loss": 0.4502, "num_input_tokens_seen": 12120960, "step": 11760 }, { "epoch": 7.869565217391305, "grad_norm": 4.023007869720459, "learning_rate": 7.598418476010708e-06, "loss": 0.4113, "num_input_tokens_seen": 12125536, "step": 11765 }, { "epoch": 7.872909698996655, "grad_norm": 1.92029869556427, "learning_rate": 7.595924492833694e-06, "loss": 0.5364, "num_input_tokens_seen": 12130912, "step": 11770 }, { "epoch": 7.8762541806020065, "grad_norm": 1.7115769386291504, "learning_rate": 7.593429625143308e-06, "loss": 0.4758, "num_input_tokens_seen": 12136096, "step": 11775 }, { "epoch": 7.879598662207358, "grad_norm": 2.2440595626831055, "learning_rate": 7.590933873789632e-06, "loss": 0.386, "num_input_tokens_seen": 12140736, "step": 11780 }, { "epoch": 7.882943143812709, "grad_norm": 1.4279260635375977, "learning_rate": 7.588437239623048e-06, "loss": 0.4463, "num_input_tokens_seen": 12146720, "step": 11785 }, { "epoch": 7.88628762541806, "grad_norm": 1.5580711364746094, "learning_rate": 7.585939723494234e-06, "loss": 0.4904, "num_input_tokens_seen": 12153216, "step": 11790 }, { "epoch": 7.889632107023411, "grad_norm": 1.348020076751709, "learning_rate": 7.583441326254177e-06, "loss": 0.4469, "num_input_tokens_seen": 12159104, "step": 11795 }, { "epoch": 7.892976588628763, "grad_norm": 1.6912119388580322, "learning_rate": 7.580942048754158e-06, "loss": 0.4534, "num_input_tokens_seen": 12164768, "step": 11800 }, { "epoch": 7.896321070234114, "grad_norm": 2.406179189682007, "learning_rate": 7.5784418918457605e-06, "loss": 0.4549, "num_input_tokens_seen": 12169472, "step": 11805 }, { "epoch": 7.8996655518394645, "grad_norm": 2.574861764907837, "learning_rate": 7.575940856380863e-06, "loss": 0.5123, "num_input_tokens_seen": 12175072, "step": 11810 }, { "epoch": 7.903010033444816, "grad_norm": 1.7274551391601562, "learning_rate": 7.573438943211653e-06, "loss": 0.4711, "num_input_tokens_seen": 12181024, "step": 11815 }, { "epoch": 7.906354515050167, "grad_norm": 2.052267551422119, "learning_rate": 7.570936153190608e-06, "loss": 0.5081, "num_input_tokens_seen": 12185664, "step": 11820 }, { "epoch": 7.909698996655519, "grad_norm": 1.6551740169525146, "learning_rate": 7.568432487170507e-06, "loss": 0.4029, "num_input_tokens_seen": 12190400, "step": 11825 }, { "epoch": 7.913043478260869, "grad_norm": 2.1303768157958984, "learning_rate": 7.56592794600443e-06, "loss": 0.4101, "num_input_tokens_seen": 12195264, "step": 11830 }, { "epoch": 7.916387959866221, "grad_norm": 2.5616707801818848, "learning_rate": 7.563422530545752e-06, "loss": 0.51, "num_input_tokens_seen": 12200096, "step": 11835 }, { "epoch": 7.919732441471572, "grad_norm": 1.648867130279541, "learning_rate": 7.560916241648146e-06, "loss": 0.4509, "num_input_tokens_seen": 12205024, "step": 11840 }, { "epoch": 7.923076923076923, "grad_norm": 1.6571381092071533, "learning_rate": 7.558409080165586e-06, "loss": 0.5023, "num_input_tokens_seen": 12210368, "step": 11845 }, { "epoch": 7.926421404682275, "grad_norm": 2.06504225730896, "learning_rate": 7.555901046952341e-06, "loss": 0.5222, "num_input_tokens_seen": 12216320, "step": 11850 }, { "epoch": 7.929765886287625, "grad_norm": 1.6207913160324097, "learning_rate": 7.553392142862974e-06, "loss": 0.4077, "num_input_tokens_seen": 12221344, "step": 11855 }, { "epoch": 7.933110367892977, "grad_norm": 1.7329052686691284, "learning_rate": 7.550882368752351e-06, "loss": 0.4514, "num_input_tokens_seen": 12226560, "step": 11860 }, { "epoch": 7.936454849498328, "grad_norm": 1.411930799484253, "learning_rate": 7.54837172547563e-06, "loss": 0.4504, "num_input_tokens_seen": 12231136, "step": 11865 }, { "epoch": 7.9397993311036785, "grad_norm": 1.2123756408691406, "learning_rate": 7.545860213888265e-06, "loss": 0.4315, "num_input_tokens_seen": 12237312, "step": 11870 }, { "epoch": 7.94314381270903, "grad_norm": 1.7272698879241943, "learning_rate": 7.543347834846011e-06, "loss": 0.4455, "num_input_tokens_seen": 12241600, "step": 11875 }, { "epoch": 7.946488294314381, "grad_norm": 1.540729284286499, "learning_rate": 7.540834589204911e-06, "loss": 0.4229, "num_input_tokens_seen": 12246848, "step": 11880 }, { "epoch": 7.949832775919733, "grad_norm": 1.791489601135254, "learning_rate": 7.5383204778213085e-06, "loss": 0.4656, "num_input_tokens_seen": 12252768, "step": 11885 }, { "epoch": 7.953177257525084, "grad_norm": 2.2808501720428467, "learning_rate": 7.535805501551838e-06, "loss": 0.4313, "num_input_tokens_seen": 12259008, "step": 11890 }, { "epoch": 7.956521739130435, "grad_norm": 1.883506417274475, "learning_rate": 7.533289661253438e-06, "loss": 0.4877, "num_input_tokens_seen": 12263872, "step": 11895 }, { "epoch": 7.959866220735786, "grad_norm": 2.9339494705200195, "learning_rate": 7.530772957783328e-06, "loss": 0.5226, "num_input_tokens_seen": 12268640, "step": 11900 }, { "epoch": 7.963210702341137, "grad_norm": 2.105761766433716, "learning_rate": 7.528255391999032e-06, "loss": 0.438, "num_input_tokens_seen": 12273664, "step": 11905 }, { "epoch": 7.966555183946488, "grad_norm": 1.894331693649292, "learning_rate": 7.525736964758361e-06, "loss": 0.3927, "num_input_tokens_seen": 12279040, "step": 11910 }, { "epoch": 7.969899665551839, "grad_norm": 1.8854933977127075, "learning_rate": 7.5232176769194276e-06, "loss": 0.4149, "num_input_tokens_seen": 12284000, "step": 11915 }, { "epoch": 7.973244147157191, "grad_norm": 1.5139704942703247, "learning_rate": 7.5206975293406255e-06, "loss": 0.4584, "num_input_tokens_seen": 12289888, "step": 11920 }, { "epoch": 7.976588628762542, "grad_norm": 1.8250125646591187, "learning_rate": 7.5181765228806555e-06, "loss": 0.4423, "num_input_tokens_seen": 12295616, "step": 11925 }, { "epoch": 7.979933110367893, "grad_norm": 3.9882543087005615, "learning_rate": 7.515654658398499e-06, "loss": 0.4403, "num_input_tokens_seen": 12300384, "step": 11930 }, { "epoch": 7.983277591973244, "grad_norm": 1.3257229328155518, "learning_rate": 7.513131936753439e-06, "loss": 0.532, "num_input_tokens_seen": 12305600, "step": 11935 }, { "epoch": 7.986622073578595, "grad_norm": 2.509579658508301, "learning_rate": 7.510608358805043e-06, "loss": 0.3603, "num_input_tokens_seen": 12310752, "step": 11940 }, { "epoch": 7.989966555183947, "grad_norm": 1.396632432937622, "learning_rate": 7.5080839254131735e-06, "loss": 0.4045, "num_input_tokens_seen": 12315744, "step": 11945 }, { "epoch": 7.993311036789297, "grad_norm": 1.4081140756607056, "learning_rate": 7.5055586374379876e-06, "loss": 0.4412, "num_input_tokens_seen": 12320640, "step": 11950 }, { "epoch": 7.996655518394649, "grad_norm": 1.615337610244751, "learning_rate": 7.503032495739927e-06, "loss": 0.3938, "num_input_tokens_seen": 12326368, "step": 11955 }, { "epoch": 8.0, "grad_norm": 3.7142181396484375, "learning_rate": 7.500505501179731e-06, "loss": 0.3616, "num_input_tokens_seen": 12330800, "step": 11960 }, { "epoch": 8.0, "eval_loss": 0.49523019790649414, "eval_runtime": 37.5453, "eval_samples_per_second": 39.819, "eval_steps_per_second": 9.961, "num_input_tokens_seen": 12330800, "step": 11960 }, { "epoch": 8.003344481605351, "grad_norm": 1.81753671169281, "learning_rate": 7.497977654618424e-06, "loss": 0.3952, "num_input_tokens_seen": 12335696, "step": 11965 }, { "epoch": 8.006688963210703, "grad_norm": 1.8132741451263428, "learning_rate": 7.495448956917326e-06, "loss": 0.4022, "num_input_tokens_seen": 12340848, "step": 11970 }, { "epoch": 8.010033444816054, "grad_norm": 2.3978006839752197, "learning_rate": 7.492919408938041e-06, "loss": 0.4761, "num_input_tokens_seen": 12345264, "step": 11975 }, { "epoch": 8.013377926421406, "grad_norm": 2.237151622772217, "learning_rate": 7.490389011542467e-06, "loss": 0.4075, "num_input_tokens_seen": 12350576, "step": 11980 }, { "epoch": 8.016722408026755, "grad_norm": 2.2264015674591064, "learning_rate": 7.48785776559279e-06, "loss": 0.4196, "num_input_tokens_seen": 12355920, "step": 11985 }, { "epoch": 8.020066889632107, "grad_norm": 2.062119960784912, "learning_rate": 7.4853256719514854e-06, "loss": 0.3149, "num_input_tokens_seen": 12360720, "step": 11990 }, { "epoch": 8.023411371237458, "grad_norm": 1.957173466682434, "learning_rate": 7.482792731481319e-06, "loss": 0.3739, "num_input_tokens_seen": 12365552, "step": 11995 }, { "epoch": 8.02675585284281, "grad_norm": 2.5110485553741455, "learning_rate": 7.4802589450453415e-06, "loss": 0.4548, "num_input_tokens_seen": 12370768, "step": 12000 }, { "epoch": 8.03010033444816, "grad_norm": 1.3504815101623535, "learning_rate": 7.477724313506893e-06, "loss": 0.441, "num_input_tokens_seen": 12376528, "step": 12005 }, { "epoch": 8.033444816053512, "grad_norm": 1.4946762323379517, "learning_rate": 7.4751888377296055e-06, "loss": 0.4017, "num_input_tokens_seen": 12382384, "step": 12010 }, { "epoch": 8.036789297658864, "grad_norm": 1.6907827854156494, "learning_rate": 7.472652518577395e-06, "loss": 0.4712, "num_input_tokens_seen": 12387760, "step": 12015 }, { "epoch": 8.040133779264215, "grad_norm": 2.32226300239563, "learning_rate": 7.470115356914461e-06, "loss": 0.4727, "num_input_tokens_seen": 12393008, "step": 12020 }, { "epoch": 8.043478260869565, "grad_norm": 1.459417462348938, "learning_rate": 7.4675773536053005e-06, "loss": 0.4317, "num_input_tokens_seen": 12397808, "step": 12025 }, { "epoch": 8.046822742474916, "grad_norm": 1.4920282363891602, "learning_rate": 7.465038509514688e-06, "loss": 0.4798, "num_input_tokens_seen": 12402608, "step": 12030 }, { "epoch": 8.050167224080267, "grad_norm": 1.672348141670227, "learning_rate": 7.462498825507689e-06, "loss": 0.4995, "num_input_tokens_seen": 12407504, "step": 12035 }, { "epoch": 8.053511705685619, "grad_norm": 2.4171910285949707, "learning_rate": 7.459958302449653e-06, "loss": 0.4519, "num_input_tokens_seen": 12412720, "step": 12040 }, { "epoch": 8.05685618729097, "grad_norm": 1.6987279653549194, "learning_rate": 7.457416941206217e-06, "loss": 0.4156, "num_input_tokens_seen": 12417232, "step": 12045 }, { "epoch": 8.060200668896321, "grad_norm": 3.485194683074951, "learning_rate": 7.454874742643303e-06, "loss": 0.4922, "num_input_tokens_seen": 12422032, "step": 12050 }, { "epoch": 8.063545150501673, "grad_norm": 2.481616258621216, "learning_rate": 7.4523317076271175e-06, "loss": 0.4762, "num_input_tokens_seen": 12427760, "step": 12055 }, { "epoch": 8.066889632107024, "grad_norm": 1.3867028951644897, "learning_rate": 7.449787837024154e-06, "loss": 0.4166, "num_input_tokens_seen": 12432592, "step": 12060 }, { "epoch": 8.070234113712374, "grad_norm": 2.1254751682281494, "learning_rate": 7.447243131701187e-06, "loss": 0.428, "num_input_tokens_seen": 12437392, "step": 12065 }, { "epoch": 8.073578595317725, "grad_norm": 2.5818636417388916, "learning_rate": 7.444697592525283e-06, "loss": 0.3442, "num_input_tokens_seen": 12442032, "step": 12070 }, { "epoch": 8.076923076923077, "grad_norm": 1.837427020072937, "learning_rate": 7.442151220363781e-06, "loss": 0.3737, "num_input_tokens_seen": 12447568, "step": 12075 }, { "epoch": 8.080267558528428, "grad_norm": 1.2705451250076294, "learning_rate": 7.439604016084315e-06, "loss": 0.4184, "num_input_tokens_seen": 12452912, "step": 12080 }, { "epoch": 8.08361204013378, "grad_norm": 1.2988488674163818, "learning_rate": 7.437055980554796e-06, "loss": 0.3929, "num_input_tokens_seen": 12458256, "step": 12085 }, { "epoch": 8.08695652173913, "grad_norm": 1.9992731809616089, "learning_rate": 7.43450711464342e-06, "loss": 0.3793, "num_input_tokens_seen": 12463888, "step": 12090 }, { "epoch": 8.090301003344482, "grad_norm": 1.9918222427368164, "learning_rate": 7.4319574192186675e-06, "loss": 0.4093, "num_input_tokens_seen": 12468880, "step": 12095 }, { "epoch": 8.093645484949834, "grad_norm": 1.8626224994659424, "learning_rate": 7.4294068951492985e-06, "loss": 0.4696, "num_input_tokens_seen": 12473968, "step": 12100 }, { "epoch": 8.096989966555183, "grad_norm": 1.5839523077011108, "learning_rate": 7.426855543304358e-06, "loss": 0.4956, "num_input_tokens_seen": 12479216, "step": 12105 }, { "epoch": 8.100334448160535, "grad_norm": 1.395649790763855, "learning_rate": 7.42430336455317e-06, "loss": 0.3798, "num_input_tokens_seen": 12484592, "step": 12110 }, { "epoch": 8.103678929765886, "grad_norm": 2.545696496963501, "learning_rate": 7.421750359765346e-06, "loss": 0.4109, "num_input_tokens_seen": 12489360, "step": 12115 }, { "epoch": 8.107023411371237, "grad_norm": 2.1503050327301025, "learning_rate": 7.419196529810773e-06, "loss": 0.4104, "num_input_tokens_seen": 12494960, "step": 12120 }, { "epoch": 8.110367892976589, "grad_norm": 2.651764154434204, "learning_rate": 7.416641875559622e-06, "loss": 0.4448, "num_input_tokens_seen": 12499792, "step": 12125 }, { "epoch": 8.11371237458194, "grad_norm": 1.7993875741958618, "learning_rate": 7.414086397882343e-06, "loss": 0.4427, "num_input_tokens_seen": 12504848, "step": 12130 }, { "epoch": 8.117056856187292, "grad_norm": 1.7212083339691162, "learning_rate": 7.411530097649669e-06, "loss": 0.374, "num_input_tokens_seen": 12509744, "step": 12135 }, { "epoch": 8.120401337792643, "grad_norm": 2.88818621635437, "learning_rate": 7.408972975732612e-06, "loss": 0.4101, "num_input_tokens_seen": 12514640, "step": 12140 }, { "epoch": 8.123745819397993, "grad_norm": 1.3476558923721313, "learning_rate": 7.406415033002465e-06, "loss": 0.3917, "num_input_tokens_seen": 12519888, "step": 12145 }, { "epoch": 8.127090301003344, "grad_norm": 1.901872158050537, "learning_rate": 7.403856270330798e-06, "loss": 0.4504, "num_input_tokens_seen": 12525392, "step": 12150 }, { "epoch": 8.130434782608695, "grad_norm": 2.2421581745147705, "learning_rate": 7.401296688589463e-06, "loss": 0.4255, "num_input_tokens_seen": 12529968, "step": 12155 }, { "epoch": 8.133779264214047, "grad_norm": 1.629091739654541, "learning_rate": 7.398736288650591e-06, "loss": 0.4038, "num_input_tokens_seen": 12535632, "step": 12160 }, { "epoch": 8.137123745819398, "grad_norm": 1.4186826944351196, "learning_rate": 7.3961750713865895e-06, "loss": 0.4889, "num_input_tokens_seen": 12540848, "step": 12165 }, { "epoch": 8.14046822742475, "grad_norm": 1.58741295337677, "learning_rate": 7.393613037670148e-06, "loss": 0.4055, "num_input_tokens_seen": 12546288, "step": 12170 }, { "epoch": 8.143812709030101, "grad_norm": 1.6888569593429565, "learning_rate": 7.3910501883742314e-06, "loss": 0.4843, "num_input_tokens_seen": 12550736, "step": 12175 }, { "epoch": 8.147157190635452, "grad_norm": 2.1505672931671143, "learning_rate": 7.388486524372082e-06, "loss": 0.3788, "num_input_tokens_seen": 12557072, "step": 12180 }, { "epoch": 8.150501672240802, "grad_norm": 3.5645532608032227, "learning_rate": 7.385922046537222e-06, "loss": 0.6039, "num_input_tokens_seen": 12561776, "step": 12185 }, { "epoch": 8.153846153846153, "grad_norm": 2.1617703437805176, "learning_rate": 7.3833567557434495e-06, "loss": 0.4234, "num_input_tokens_seen": 12566608, "step": 12190 }, { "epoch": 8.157190635451505, "grad_norm": 1.5035542249679565, "learning_rate": 7.380790652864842e-06, "loss": 0.4125, "num_input_tokens_seen": 12571664, "step": 12195 }, { "epoch": 8.160535117056856, "grad_norm": 1.6348680257797241, "learning_rate": 7.378223738775749e-06, "loss": 0.5097, "num_input_tokens_seen": 12577232, "step": 12200 }, { "epoch": 8.163879598662207, "grad_norm": 1.7533519268035889, "learning_rate": 7.375656014350801e-06, "loss": 0.3948, "num_input_tokens_seen": 12581968, "step": 12205 }, { "epoch": 8.167224080267559, "grad_norm": 2.4362704753875732, "learning_rate": 7.3730874804649e-06, "loss": 0.4612, "num_input_tokens_seen": 12587248, "step": 12210 }, { "epoch": 8.17056856187291, "grad_norm": 2.1649205684661865, "learning_rate": 7.370518137993231e-06, "loss": 0.4325, "num_input_tokens_seen": 12593360, "step": 12215 }, { "epoch": 8.173913043478262, "grad_norm": 2.0087461471557617, "learning_rate": 7.3679479878112466e-06, "loss": 0.4239, "num_input_tokens_seen": 12598192, "step": 12220 }, { "epoch": 8.177257525083611, "grad_norm": 1.6109318733215332, "learning_rate": 7.365377030794678e-06, "loss": 0.4416, "num_input_tokens_seen": 12603536, "step": 12225 }, { "epoch": 8.180602006688963, "grad_norm": 1.7796685695648193, "learning_rate": 7.362805267819533e-06, "loss": 0.4154, "num_input_tokens_seen": 12608048, "step": 12230 }, { "epoch": 8.183946488294314, "grad_norm": 1.8259289264678955, "learning_rate": 7.360232699762091e-06, "loss": 0.3905, "num_input_tokens_seen": 12612752, "step": 12235 }, { "epoch": 8.187290969899665, "grad_norm": 1.9006556272506714, "learning_rate": 7.3576593274989094e-06, "loss": 0.4219, "num_input_tokens_seen": 12617904, "step": 12240 }, { "epoch": 8.190635451505017, "grad_norm": 1.496366262435913, "learning_rate": 7.355085151906814e-06, "loss": 0.4517, "num_input_tokens_seen": 12623568, "step": 12245 }, { "epoch": 8.193979933110368, "grad_norm": 1.4489977359771729, "learning_rate": 7.352510173862912e-06, "loss": 0.3849, "num_input_tokens_seen": 12629104, "step": 12250 }, { "epoch": 8.19732441471572, "grad_norm": 2.877790689468384, "learning_rate": 7.349934394244575e-06, "loss": 0.4164, "num_input_tokens_seen": 12634480, "step": 12255 }, { "epoch": 8.200668896321071, "grad_norm": 1.6188857555389404, "learning_rate": 7.347357813929455e-06, "loss": 0.3618, "num_input_tokens_seen": 12639760, "step": 12260 }, { "epoch": 8.20401337792642, "grad_norm": 1.7794820070266724, "learning_rate": 7.344780433795473e-06, "loss": 0.4256, "num_input_tokens_seen": 12644752, "step": 12265 }, { "epoch": 8.207357859531772, "grad_norm": 2.0122666358947754, "learning_rate": 7.342202254720825e-06, "loss": 0.325, "num_input_tokens_seen": 12649648, "step": 12270 }, { "epoch": 8.210702341137123, "grad_norm": 2.109706163406372, "learning_rate": 7.339623277583977e-06, "loss": 0.458, "num_input_tokens_seen": 12654256, "step": 12275 }, { "epoch": 8.214046822742475, "grad_norm": 1.9570966958999634, "learning_rate": 7.337043503263668e-06, "loss": 0.3904, "num_input_tokens_seen": 12660144, "step": 12280 }, { "epoch": 8.217391304347826, "grad_norm": 1.9230117797851562, "learning_rate": 7.334462932638908e-06, "loss": 0.4499, "num_input_tokens_seen": 12665488, "step": 12285 }, { "epoch": 8.220735785953178, "grad_norm": 3.158494472503662, "learning_rate": 7.33188156658898e-06, "loss": 0.4276, "num_input_tokens_seen": 12671184, "step": 12290 }, { "epoch": 8.224080267558529, "grad_norm": 2.8143365383148193, "learning_rate": 7.329299405993436e-06, "loss": 0.388, "num_input_tokens_seen": 12676848, "step": 12295 }, { "epoch": 8.22742474916388, "grad_norm": 3.105764865875244, "learning_rate": 7.3267164517321e-06, "loss": 0.4176, "num_input_tokens_seen": 12681904, "step": 12300 }, { "epoch": 8.23076923076923, "grad_norm": 1.9554705619812012, "learning_rate": 7.3241327046850675e-06, "loss": 0.4659, "num_input_tokens_seen": 12687088, "step": 12305 }, { "epoch": 8.234113712374581, "grad_norm": 2.0177981853485107, "learning_rate": 7.321548165732698e-06, "loss": 0.4918, "num_input_tokens_seen": 12692112, "step": 12310 }, { "epoch": 8.237458193979933, "grad_norm": 1.8841158151626587, "learning_rate": 7.318962835755631e-06, "loss": 0.4132, "num_input_tokens_seen": 12697936, "step": 12315 }, { "epoch": 8.240802675585284, "grad_norm": 1.3956669569015503, "learning_rate": 7.316376715634768e-06, "loss": 0.3527, "num_input_tokens_seen": 12702512, "step": 12320 }, { "epoch": 8.244147157190636, "grad_norm": 1.436929702758789, "learning_rate": 7.313789806251279e-06, "loss": 0.4686, "num_input_tokens_seen": 12707280, "step": 12325 }, { "epoch": 8.247491638795987, "grad_norm": 1.9174138307571411, "learning_rate": 7.311202108486608e-06, "loss": 0.4019, "num_input_tokens_seen": 12712112, "step": 12330 }, { "epoch": 8.250836120401338, "grad_norm": 1.854927659034729, "learning_rate": 7.3086136232224645e-06, "loss": 0.4318, "num_input_tokens_seen": 12716304, "step": 12335 }, { "epoch": 8.25418060200669, "grad_norm": 2.203683376312256, "learning_rate": 7.306024351340829e-06, "loss": 0.437, "num_input_tokens_seen": 12722384, "step": 12340 }, { "epoch": 8.25752508361204, "grad_norm": 1.9753721952438354, "learning_rate": 7.303434293723946e-06, "loss": 0.4249, "num_input_tokens_seen": 12727632, "step": 12345 }, { "epoch": 8.26086956521739, "grad_norm": 1.8141440153121948, "learning_rate": 7.3008434512543325e-06, "loss": 0.4115, "num_input_tokens_seen": 12732944, "step": 12350 }, { "epoch": 8.264214046822742, "grad_norm": 2.040980815887451, "learning_rate": 7.298251824814766e-06, "loss": 0.4042, "num_input_tokens_seen": 12737328, "step": 12355 }, { "epoch": 8.267558528428093, "grad_norm": 1.2863554954528809, "learning_rate": 7.2956594152883e-06, "loss": 0.4305, "num_input_tokens_seen": 12742736, "step": 12360 }, { "epoch": 8.270903010033445, "grad_norm": 1.7101777791976929, "learning_rate": 7.293066223558247e-06, "loss": 0.3989, "num_input_tokens_seen": 12747216, "step": 12365 }, { "epoch": 8.274247491638796, "grad_norm": 1.8409538269042969, "learning_rate": 7.290472250508189e-06, "loss": 0.451, "num_input_tokens_seen": 12751760, "step": 12370 }, { "epoch": 8.277591973244148, "grad_norm": 2.036289691925049, "learning_rate": 7.287877497021978e-06, "loss": 0.4615, "num_input_tokens_seen": 12756976, "step": 12375 }, { "epoch": 8.280936454849499, "grad_norm": 1.4345567226409912, "learning_rate": 7.285281963983723e-06, "loss": 0.3683, "num_input_tokens_seen": 12763792, "step": 12380 }, { "epoch": 8.284280936454849, "grad_norm": 2.1833369731903076, "learning_rate": 7.282685652277809e-06, "loss": 0.4197, "num_input_tokens_seen": 12768784, "step": 12385 }, { "epoch": 8.2876254180602, "grad_norm": 1.8598382472991943, "learning_rate": 7.280088562788879e-06, "loss": 0.465, "num_input_tokens_seen": 12774128, "step": 12390 }, { "epoch": 8.290969899665551, "grad_norm": 2.4078781604766846, "learning_rate": 7.277490696401843e-06, "loss": 0.4238, "num_input_tokens_seen": 12778992, "step": 12395 }, { "epoch": 8.294314381270903, "grad_norm": 1.991844654083252, "learning_rate": 7.274892054001878e-06, "loss": 0.409, "num_input_tokens_seen": 12784080, "step": 12400 }, { "epoch": 8.297658862876254, "grad_norm": 2.024970531463623, "learning_rate": 7.272292636474422e-06, "loss": 0.5076, "num_input_tokens_seen": 12788912, "step": 12405 }, { "epoch": 8.301003344481606, "grad_norm": 1.349092960357666, "learning_rate": 7.269692444705179e-06, "loss": 0.4289, "num_input_tokens_seen": 12793808, "step": 12410 }, { "epoch": 8.304347826086957, "grad_norm": 1.8037618398666382, "learning_rate": 7.267091479580114e-06, "loss": 0.4037, "num_input_tokens_seen": 12798064, "step": 12415 }, { "epoch": 8.307692307692308, "grad_norm": 1.9478538036346436, "learning_rate": 7.264489741985462e-06, "loss": 0.4569, "num_input_tokens_seen": 12803600, "step": 12420 }, { "epoch": 8.31103678929766, "grad_norm": 2.5949454307556152, "learning_rate": 7.261887232807714e-06, "loss": 0.4144, "num_input_tokens_seen": 12808976, "step": 12425 }, { "epoch": 8.31438127090301, "grad_norm": 2.489222764968872, "learning_rate": 7.259283952933628e-06, "loss": 0.4094, "num_input_tokens_seen": 12813648, "step": 12430 }, { "epoch": 8.31772575250836, "grad_norm": 2.122499704360962, "learning_rate": 7.256679903250224e-06, "loss": 0.458, "num_input_tokens_seen": 12819088, "step": 12435 }, { "epoch": 8.321070234113712, "grad_norm": 1.9282491207122803, "learning_rate": 7.254075084644783e-06, "loss": 0.4142, "num_input_tokens_seen": 12823856, "step": 12440 }, { "epoch": 8.324414715719064, "grad_norm": 2.2084856033325195, "learning_rate": 7.2514694980048485e-06, "loss": 0.4322, "num_input_tokens_seen": 12828304, "step": 12445 }, { "epoch": 8.327759197324415, "grad_norm": 2.0690948963165283, "learning_rate": 7.248863144218229e-06, "loss": 0.4164, "num_input_tokens_seen": 12832816, "step": 12450 }, { "epoch": 8.331103678929766, "grad_norm": 1.6378192901611328, "learning_rate": 7.246256024172989e-06, "loss": 0.4294, "num_input_tokens_seen": 12837648, "step": 12455 }, { "epoch": 8.334448160535118, "grad_norm": 1.5919502973556519, "learning_rate": 7.243648138757455e-06, "loss": 0.3994, "num_input_tokens_seen": 12842544, "step": 12460 }, { "epoch": 8.337792642140467, "grad_norm": 2.0113112926483154, "learning_rate": 7.2410394888602194e-06, "loss": 0.3539, "num_input_tokens_seen": 12846960, "step": 12465 }, { "epoch": 8.341137123745819, "grad_norm": 2.348142147064209, "learning_rate": 7.23843007537013e-06, "loss": 0.4575, "num_input_tokens_seen": 12852976, "step": 12470 }, { "epoch": 8.34448160535117, "grad_norm": 1.7055916786193848, "learning_rate": 7.235819899176295e-06, "loss": 0.5864, "num_input_tokens_seen": 12858256, "step": 12475 }, { "epoch": 8.347826086956522, "grad_norm": 2.1762967109680176, "learning_rate": 7.2332089611680855e-06, "loss": 0.3396, "num_input_tokens_seen": 12863024, "step": 12480 }, { "epoch": 8.351170568561873, "grad_norm": 1.6153085231781006, "learning_rate": 7.23059726223513e-06, "loss": 0.4456, "num_input_tokens_seen": 12867632, "step": 12485 }, { "epoch": 8.354515050167224, "grad_norm": 1.9463804960250854, "learning_rate": 7.227984803267316e-06, "loss": 0.349, "num_input_tokens_seen": 12871920, "step": 12490 }, { "epoch": 8.357859531772576, "grad_norm": 1.9644638299942017, "learning_rate": 7.225371585154792e-06, "loss": 0.4241, "num_input_tokens_seen": 12876624, "step": 12495 }, { "epoch": 8.361204013377927, "grad_norm": 2.1788766384124756, "learning_rate": 7.2227576087879635e-06, "loss": 0.4596, "num_input_tokens_seen": 12881776, "step": 12500 }, { "epoch": 8.364548494983278, "grad_norm": 2.0132718086242676, "learning_rate": 7.2201428750574935e-06, "loss": 0.3402, "num_input_tokens_seen": 12887280, "step": 12505 }, { "epoch": 8.367892976588628, "grad_norm": 1.449885606765747, "learning_rate": 7.217527384854306e-06, "loss": 0.4208, "num_input_tokens_seen": 12892304, "step": 12510 }, { "epoch": 8.37123745819398, "grad_norm": 2.5155036449432373, "learning_rate": 7.214911139069578e-06, "loss": 0.3847, "num_input_tokens_seen": 12897200, "step": 12515 }, { "epoch": 8.37458193979933, "grad_norm": 1.846990704536438, "learning_rate": 7.212294138594752e-06, "loss": 0.4629, "num_input_tokens_seen": 12902640, "step": 12520 }, { "epoch": 8.377926421404682, "grad_norm": 1.2910641431808472, "learning_rate": 7.209676384321518e-06, "loss": 0.354, "num_input_tokens_seen": 12907984, "step": 12525 }, { "epoch": 8.381270903010034, "grad_norm": 1.2653398513793945, "learning_rate": 7.2070578771418284e-06, "loss": 0.4286, "num_input_tokens_seen": 12913136, "step": 12530 }, { "epoch": 8.384615384615385, "grad_norm": 1.9012477397918701, "learning_rate": 7.204438617947893e-06, "loss": 0.4604, "num_input_tokens_seen": 12917616, "step": 12535 }, { "epoch": 8.387959866220736, "grad_norm": 1.3552813529968262, "learning_rate": 7.201818607632176e-06, "loss": 0.3331, "num_input_tokens_seen": 12922352, "step": 12540 }, { "epoch": 8.391304347826088, "grad_norm": 1.768593192100525, "learning_rate": 7.199197847087396e-06, "loss": 0.4695, "num_input_tokens_seen": 12928112, "step": 12545 }, { "epoch": 8.394648829431437, "grad_norm": 2.400218963623047, "learning_rate": 7.1965763372065285e-06, "loss": 0.5142, "num_input_tokens_seen": 12932720, "step": 12550 }, { "epoch": 8.397993311036789, "grad_norm": 2.058717727661133, "learning_rate": 7.193954078882808e-06, "loss": 0.4252, "num_input_tokens_seen": 12937776, "step": 12555 }, { "epoch": 8.40133779264214, "grad_norm": 1.5163886547088623, "learning_rate": 7.191331073009716e-06, "loss": 0.3879, "num_input_tokens_seen": 12942768, "step": 12560 }, { "epoch": 8.404682274247492, "grad_norm": 1.6348421573638916, "learning_rate": 7.188707320480997e-06, "loss": 0.4547, "num_input_tokens_seen": 12948272, "step": 12565 }, { "epoch": 8.408026755852843, "grad_norm": 2.415982961654663, "learning_rate": 7.186082822190643e-06, "loss": 0.4349, "num_input_tokens_seen": 12953168, "step": 12570 }, { "epoch": 8.411371237458194, "grad_norm": 1.7136379480361938, "learning_rate": 7.183457579032907e-06, "loss": 0.3908, "num_input_tokens_seen": 12958416, "step": 12575 }, { "epoch": 8.414715719063546, "grad_norm": 1.3061034679412842, "learning_rate": 7.180831591902289e-06, "loss": 0.4024, "num_input_tokens_seen": 12964208, "step": 12580 }, { "epoch": 8.418060200668897, "grad_norm": 5.006861686706543, "learning_rate": 7.178204861693546e-06, "loss": 0.3882, "num_input_tokens_seen": 12968528, "step": 12585 }, { "epoch": 8.421404682274247, "grad_norm": 2.5113630294799805, "learning_rate": 7.17557738930169e-06, "loss": 0.4081, "num_input_tokens_seen": 12973360, "step": 12590 }, { "epoch": 8.424749163879598, "grad_norm": 1.859150767326355, "learning_rate": 7.172949175621984e-06, "loss": 0.4053, "num_input_tokens_seen": 12977936, "step": 12595 }, { "epoch": 8.42809364548495, "grad_norm": 2.004937171936035, "learning_rate": 7.17032022154994e-06, "loss": 0.3929, "num_input_tokens_seen": 12982480, "step": 12600 }, { "epoch": 8.431438127090301, "grad_norm": 2.9251632690429688, "learning_rate": 7.167690527981328e-06, "loss": 0.496, "num_input_tokens_seen": 12987504, "step": 12605 }, { "epoch": 8.434782608695652, "grad_norm": 1.8723804950714111, "learning_rate": 7.165060095812168e-06, "loss": 0.4615, "num_input_tokens_seen": 12993104, "step": 12610 }, { "epoch": 8.438127090301004, "grad_norm": 2.90128755569458, "learning_rate": 7.162428925938729e-06, "loss": 0.3616, "num_input_tokens_seen": 12997840, "step": 12615 }, { "epoch": 8.441471571906355, "grad_norm": 1.6658961772918701, "learning_rate": 7.159797019257536e-06, "loss": 0.364, "num_input_tokens_seen": 13003568, "step": 12620 }, { "epoch": 8.444816053511706, "grad_norm": 2.2901787757873535, "learning_rate": 7.15716437666536e-06, "loss": 0.4162, "num_input_tokens_seen": 13009520, "step": 12625 }, { "epoch": 8.448160535117056, "grad_norm": 2.091348648071289, "learning_rate": 7.154530999059227e-06, "loss": 0.435, "num_input_tokens_seen": 13015408, "step": 12630 }, { "epoch": 8.451505016722408, "grad_norm": 2.485903263092041, "learning_rate": 7.151896887336412e-06, "loss": 0.4584, "num_input_tokens_seen": 13020464, "step": 12635 }, { "epoch": 8.454849498327759, "grad_norm": 1.613696575164795, "learning_rate": 7.14926204239444e-06, "loss": 0.3834, "num_input_tokens_seen": 13026192, "step": 12640 }, { "epoch": 8.45819397993311, "grad_norm": 1.5784118175506592, "learning_rate": 7.146626465131087e-06, "loss": 0.3961, "num_input_tokens_seen": 13031248, "step": 12645 }, { "epoch": 8.461538461538462, "grad_norm": 2.0907719135284424, "learning_rate": 7.143990156444374e-06, "loss": 0.4213, "num_input_tokens_seen": 13036528, "step": 12650 }, { "epoch": 8.464882943143813, "grad_norm": 1.576827049255371, "learning_rate": 7.141353117232577e-06, "loss": 0.3592, "num_input_tokens_seen": 13041744, "step": 12655 }, { "epoch": 8.468227424749164, "grad_norm": 1.554663896560669, "learning_rate": 7.138715348394216e-06, "loss": 0.4361, "num_input_tokens_seen": 13046992, "step": 12660 }, { "epoch": 8.471571906354516, "grad_norm": 1.8823044300079346, "learning_rate": 7.136076850828067e-06, "loss": 0.4115, "num_input_tokens_seen": 13051984, "step": 12665 }, { "epoch": 8.474916387959865, "grad_norm": 1.9619334936141968, "learning_rate": 7.133437625433143e-06, "loss": 0.4729, "num_input_tokens_seen": 13057680, "step": 12670 }, { "epoch": 8.478260869565217, "grad_norm": 2.094433069229126, "learning_rate": 7.130797673108715e-06, "loss": 0.4201, "num_input_tokens_seen": 13062896, "step": 12675 }, { "epoch": 8.481605351170568, "grad_norm": 2.1009669303894043, "learning_rate": 7.128156994754299e-06, "loss": 0.3651, "num_input_tokens_seen": 13067568, "step": 12680 }, { "epoch": 8.48494983277592, "grad_norm": 2.207223892211914, "learning_rate": 7.125515591269655e-06, "loss": 0.4958, "num_input_tokens_seen": 13072400, "step": 12685 }, { "epoch": 8.488294314381271, "grad_norm": 1.5486934185028076, "learning_rate": 7.122873463554795e-06, "loss": 0.4595, "num_input_tokens_seen": 13078224, "step": 12690 }, { "epoch": 8.491638795986622, "grad_norm": 2.399306297302246, "learning_rate": 7.120230612509972e-06, "loss": 0.4575, "num_input_tokens_seen": 13083664, "step": 12695 }, { "epoch": 8.494983277591974, "grad_norm": 1.6386127471923828, "learning_rate": 7.117587039035692e-06, "loss": 0.4649, "num_input_tokens_seen": 13088688, "step": 12700 }, { "epoch": 8.498327759197325, "grad_norm": 1.5134387016296387, "learning_rate": 7.1149427440327e-06, "loss": 0.4587, "num_input_tokens_seen": 13093872, "step": 12705 }, { "epoch": 8.501672240802675, "grad_norm": 2.153994083404541, "learning_rate": 7.112297728401996e-06, "loss": 0.3747, "num_input_tokens_seen": 13098800, "step": 12710 }, { "epoch": 8.505016722408026, "grad_norm": 1.4326529502868652, "learning_rate": 7.1096519930448145e-06, "loss": 0.4784, "num_input_tokens_seen": 13103728, "step": 12715 }, { "epoch": 8.508361204013378, "grad_norm": 1.8429007530212402, "learning_rate": 7.107005538862647e-06, "loss": 0.4072, "num_input_tokens_seen": 13108528, "step": 12720 }, { "epoch": 8.511705685618729, "grad_norm": 1.2937647104263306, "learning_rate": 7.104358366757218e-06, "loss": 0.3999, "num_input_tokens_seen": 13114320, "step": 12725 }, { "epoch": 8.51505016722408, "grad_norm": 1.7682876586914062, "learning_rate": 7.101710477630507e-06, "loss": 0.4418, "num_input_tokens_seen": 13119760, "step": 12730 }, { "epoch": 8.518394648829432, "grad_norm": 2.5442380905151367, "learning_rate": 7.099061872384732e-06, "loss": 0.46, "num_input_tokens_seen": 13124848, "step": 12735 }, { "epoch": 8.521739130434783, "grad_norm": 2.033372402191162, "learning_rate": 7.096412551922355e-06, "loss": 0.4548, "num_input_tokens_seen": 13130064, "step": 12740 }, { "epoch": 8.525083612040135, "grad_norm": 2.713651657104492, "learning_rate": 7.0937625171460844e-06, "loss": 0.424, "num_input_tokens_seen": 13135152, "step": 12745 }, { "epoch": 8.528428093645484, "grad_norm": 2.430670976638794, "learning_rate": 7.09111176895887e-06, "loss": 0.5112, "num_input_tokens_seen": 13140560, "step": 12750 }, { "epoch": 8.531772575250836, "grad_norm": 1.861690640449524, "learning_rate": 7.088460308263907e-06, "loss": 0.4066, "num_input_tokens_seen": 13145168, "step": 12755 }, { "epoch": 8.535117056856187, "grad_norm": 2.4494130611419678, "learning_rate": 7.0858081359646295e-06, "loss": 0.431, "num_input_tokens_seen": 13150192, "step": 12760 }, { "epoch": 8.538461538461538, "grad_norm": 1.3433518409729004, "learning_rate": 7.083155252964717e-06, "loss": 0.3974, "num_input_tokens_seen": 13154736, "step": 12765 }, { "epoch": 8.54180602006689, "grad_norm": 1.454276442527771, "learning_rate": 7.080501660168091e-06, "loss": 0.4043, "num_input_tokens_seen": 13159728, "step": 12770 }, { "epoch": 8.545150501672241, "grad_norm": 2.025784492492676, "learning_rate": 7.077847358478914e-06, "loss": 0.3731, "num_input_tokens_seen": 13165296, "step": 12775 }, { "epoch": 8.548494983277592, "grad_norm": 2.299159526824951, "learning_rate": 7.075192348801591e-06, "loss": 0.3956, "num_input_tokens_seen": 13170768, "step": 12780 }, { "epoch": 8.551839464882944, "grad_norm": 1.4891818761825562, "learning_rate": 7.0725366320407655e-06, "loss": 0.4636, "num_input_tokens_seen": 13175760, "step": 12785 }, { "epoch": 8.555183946488294, "grad_norm": 1.3051397800445557, "learning_rate": 7.069880209101327e-06, "loss": 0.4447, "num_input_tokens_seen": 13181488, "step": 12790 }, { "epoch": 8.558528428093645, "grad_norm": 2.012730121612549, "learning_rate": 7.067223080888399e-06, "loss": 0.3733, "num_input_tokens_seen": 13186288, "step": 12795 }, { "epoch": 8.561872909698996, "grad_norm": 1.7560752630233765, "learning_rate": 7.064565248307351e-06, "loss": 0.4057, "num_input_tokens_seen": 13192432, "step": 12800 }, { "epoch": 8.565217391304348, "grad_norm": 1.3587099313735962, "learning_rate": 7.0619067122637905e-06, "loss": 0.4161, "num_input_tokens_seen": 13197488, "step": 12805 }, { "epoch": 8.568561872909699, "grad_norm": 1.6180505752563477, "learning_rate": 7.059247473663566e-06, "loss": 0.4726, "num_input_tokens_seen": 13202832, "step": 12810 }, { "epoch": 8.57190635451505, "grad_norm": 2.2463412284851074, "learning_rate": 7.05658753341276e-06, "loss": 0.4574, "num_input_tokens_seen": 13208400, "step": 12815 }, { "epoch": 8.575250836120402, "grad_norm": 1.7088643312454224, "learning_rate": 7.053926892417702e-06, "loss": 0.4374, "num_input_tokens_seen": 13212688, "step": 12820 }, { "epoch": 8.578595317725753, "grad_norm": 3.1655220985412598, "learning_rate": 7.051265551584954e-06, "loss": 0.4124, "num_input_tokens_seen": 13217648, "step": 12825 }, { "epoch": 8.581939799331103, "grad_norm": 1.7259924411773682, "learning_rate": 7.048603511821318e-06, "loss": 0.4254, "num_input_tokens_seen": 13222896, "step": 12830 }, { "epoch": 8.585284280936454, "grad_norm": 2.0548036098480225, "learning_rate": 7.045940774033838e-06, "loss": 0.4721, "num_input_tokens_seen": 13228016, "step": 12835 }, { "epoch": 8.588628762541806, "grad_norm": 1.7063795328140259, "learning_rate": 7.04327733912979e-06, "loss": 0.5987, "num_input_tokens_seen": 13233168, "step": 12840 }, { "epoch": 8.591973244147157, "grad_norm": 1.6173629760742188, "learning_rate": 7.040613208016693e-06, "loss": 0.2989, "num_input_tokens_seen": 13238256, "step": 12845 }, { "epoch": 8.595317725752508, "grad_norm": 1.8309365510940552, "learning_rate": 7.037948381602299e-06, "loss": 0.3506, "num_input_tokens_seen": 13243856, "step": 12850 }, { "epoch": 8.59866220735786, "grad_norm": 1.8616670370101929, "learning_rate": 7.035282860794598e-06, "loss": 0.4072, "num_input_tokens_seen": 13248752, "step": 12855 }, { "epoch": 8.602006688963211, "grad_norm": 1.2747433185577393, "learning_rate": 7.032616646501816e-06, "loss": 0.4285, "num_input_tokens_seen": 13254032, "step": 12860 }, { "epoch": 8.605351170568563, "grad_norm": 2.160489320755005, "learning_rate": 7.02994973963242e-06, "loss": 0.3917, "num_input_tokens_seen": 13258608, "step": 12865 }, { "epoch": 8.608695652173914, "grad_norm": 2.1952831745147705, "learning_rate": 7.027282141095106e-06, "loss": 0.3849, "num_input_tokens_seen": 13262896, "step": 12870 }, { "epoch": 8.612040133779264, "grad_norm": 2.363926887512207, "learning_rate": 7.024613851798808e-06, "loss": 0.4416, "num_input_tokens_seen": 13267536, "step": 12875 }, { "epoch": 8.615384615384615, "grad_norm": 2.4676408767700195, "learning_rate": 7.021944872652701e-06, "loss": 0.4475, "num_input_tokens_seen": 13272752, "step": 12880 }, { "epoch": 8.618729096989966, "grad_norm": 1.8562729358673096, "learning_rate": 7.0192752045661874e-06, "loss": 0.5237, "num_input_tokens_seen": 13279024, "step": 12885 }, { "epoch": 8.622073578595318, "grad_norm": 1.9269660711288452, "learning_rate": 7.016604848448907e-06, "loss": 0.3873, "num_input_tokens_seen": 13285680, "step": 12890 }, { "epoch": 8.62541806020067, "grad_norm": 2.06024432182312, "learning_rate": 7.0139338052107355e-06, "loss": 0.487, "num_input_tokens_seen": 13290992, "step": 12895 }, { "epoch": 8.62876254180602, "grad_norm": 2.039161443710327, "learning_rate": 7.011262075761782e-06, "loss": 0.471, "num_input_tokens_seen": 13296240, "step": 12900 }, { "epoch": 8.632107023411372, "grad_norm": 1.9570988416671753, "learning_rate": 7.008589661012387e-06, "loss": 0.4352, "num_input_tokens_seen": 13301488, "step": 12905 }, { "epoch": 8.635451505016722, "grad_norm": 1.5088870525360107, "learning_rate": 7.005916561873129e-06, "loss": 0.3554, "num_input_tokens_seen": 13306000, "step": 12910 }, { "epoch": 8.638795986622073, "grad_norm": 1.7654547691345215, "learning_rate": 7.003242779254814e-06, "loss": 0.4075, "num_input_tokens_seen": 13311632, "step": 12915 }, { "epoch": 8.642140468227424, "grad_norm": 1.6345295906066895, "learning_rate": 7.000568314068488e-06, "loss": 0.3752, "num_input_tokens_seen": 13316784, "step": 12920 }, { "epoch": 8.645484949832776, "grad_norm": 1.8942328691482544, "learning_rate": 6.9978931672254216e-06, "loss": 0.415, "num_input_tokens_seen": 13322672, "step": 12925 }, { "epoch": 8.648829431438127, "grad_norm": 1.8501007556915283, "learning_rate": 6.9952173396371245e-06, "loss": 0.4219, "num_input_tokens_seen": 13328176, "step": 12930 }, { "epoch": 8.652173913043478, "grad_norm": 2.037166118621826, "learning_rate": 6.9925408322153355e-06, "loss": 0.347, "num_input_tokens_seen": 13333040, "step": 12935 }, { "epoch": 8.65551839464883, "grad_norm": 1.358325481414795, "learning_rate": 6.989863645872024e-06, "loss": 0.425, "num_input_tokens_seen": 13337968, "step": 12940 }, { "epoch": 8.658862876254181, "grad_norm": 1.96000075340271, "learning_rate": 6.987185781519394e-06, "loss": 0.5044, "num_input_tokens_seen": 13343728, "step": 12945 }, { "epoch": 8.662207357859533, "grad_norm": 1.8903765678405762, "learning_rate": 6.984507240069874e-06, "loss": 0.4255, "num_input_tokens_seen": 13348816, "step": 12950 }, { "epoch": 8.665551839464882, "grad_norm": 2.6804122924804688, "learning_rate": 6.981828022436132e-06, "loss": 0.4277, "num_input_tokens_seen": 13354064, "step": 12955 }, { "epoch": 8.668896321070234, "grad_norm": 1.7979450225830078, "learning_rate": 6.979148129531059e-06, "loss": 0.5561, "num_input_tokens_seen": 13360112, "step": 12960 }, { "epoch": 8.672240802675585, "grad_norm": 1.5871381759643555, "learning_rate": 6.976467562267783e-06, "loss": 0.4241, "num_input_tokens_seen": 13364624, "step": 12965 }, { "epoch": 8.675585284280936, "grad_norm": 3.161895751953125, "learning_rate": 6.973786321559652e-06, "loss": 0.4549, "num_input_tokens_seen": 13368976, "step": 12970 }, { "epoch": 8.678929765886288, "grad_norm": 2.329197883605957, "learning_rate": 6.971104408320253e-06, "loss": 0.356, "num_input_tokens_seen": 13373744, "step": 12975 }, { "epoch": 8.68227424749164, "grad_norm": 1.7030612230300903, "learning_rate": 6.968421823463399e-06, "loss": 0.3785, "num_input_tokens_seen": 13378800, "step": 12980 }, { "epoch": 8.68561872909699, "grad_norm": 2.177703619003296, "learning_rate": 6.9657385679031284e-06, "loss": 0.4529, "num_input_tokens_seen": 13384112, "step": 12985 }, { "epoch": 8.68896321070234, "grad_norm": 1.9055744409561157, "learning_rate": 6.963054642553716e-06, "loss": 0.4753, "num_input_tokens_seen": 13390032, "step": 12990 }, { "epoch": 8.692307692307692, "grad_norm": 2.4378252029418945, "learning_rate": 6.960370048329654e-06, "loss": 0.3784, "num_input_tokens_seen": 13394800, "step": 12995 }, { "epoch": 8.695652173913043, "grad_norm": 2.5953752994537354, "learning_rate": 6.957684786145672e-06, "loss": 0.5157, "num_input_tokens_seen": 13399568, "step": 13000 }, { "epoch": 8.698996655518394, "grad_norm": 2.1339240074157715, "learning_rate": 6.954998856916721e-06, "loss": 0.4166, "num_input_tokens_seen": 13404720, "step": 13005 }, { "epoch": 8.702341137123746, "grad_norm": 2.0757880210876465, "learning_rate": 6.952312261557986e-06, "loss": 0.4209, "num_input_tokens_seen": 13410448, "step": 13010 }, { "epoch": 8.705685618729097, "grad_norm": 1.873246669769287, "learning_rate": 6.949625000984871e-06, "loss": 0.3596, "num_input_tokens_seen": 13415536, "step": 13015 }, { "epoch": 8.709030100334449, "grad_norm": 1.957047700881958, "learning_rate": 6.946937076113012e-06, "loss": 0.433, "num_input_tokens_seen": 13420944, "step": 13020 }, { "epoch": 8.7123745819398, "grad_norm": 2.588878870010376, "learning_rate": 6.944248487858267e-06, "loss": 0.4738, "num_input_tokens_seen": 13425584, "step": 13025 }, { "epoch": 8.715719063545151, "grad_norm": 2.0865724086761475, "learning_rate": 6.941559237136727e-06, "loss": 0.3823, "num_input_tokens_seen": 13430544, "step": 13030 }, { "epoch": 8.719063545150501, "grad_norm": 1.9257714748382568, "learning_rate": 6.938869324864704e-06, "loss": 0.5266, "num_input_tokens_seen": 13435920, "step": 13035 }, { "epoch": 8.722408026755852, "grad_norm": 1.6818827390670776, "learning_rate": 6.936178751958732e-06, "loss": 0.4083, "num_input_tokens_seen": 13440976, "step": 13040 }, { "epoch": 8.725752508361204, "grad_norm": 1.5880154371261597, "learning_rate": 6.933487519335579e-06, "loss": 0.393, "num_input_tokens_seen": 13445360, "step": 13045 }, { "epoch": 8.729096989966555, "grad_norm": 1.5959460735321045, "learning_rate": 6.93079562791223e-06, "loss": 0.4073, "num_input_tokens_seen": 13450480, "step": 13050 }, { "epoch": 8.732441471571907, "grad_norm": 1.7547301054000854, "learning_rate": 6.928103078605898e-06, "loss": 0.4215, "num_input_tokens_seen": 13455568, "step": 13055 }, { "epoch": 8.735785953177258, "grad_norm": 1.7604809999465942, "learning_rate": 6.925409872334019e-06, "loss": 0.4421, "num_input_tokens_seen": 13460656, "step": 13060 }, { "epoch": 8.73913043478261, "grad_norm": 1.8926364183425903, "learning_rate": 6.922716010014256e-06, "loss": 0.3841, "num_input_tokens_seen": 13466096, "step": 13065 }, { "epoch": 8.742474916387959, "grad_norm": 2.640096426010132, "learning_rate": 6.9200214925644884e-06, "loss": 0.4215, "num_input_tokens_seen": 13470896, "step": 13070 }, { "epoch": 8.74581939799331, "grad_norm": 1.4967350959777832, "learning_rate": 6.917326320902825e-06, "loss": 0.43, "num_input_tokens_seen": 13477360, "step": 13075 }, { "epoch": 8.749163879598662, "grad_norm": 2.561598062515259, "learning_rate": 6.914630495947599e-06, "loss": 0.4162, "num_input_tokens_seen": 13482736, "step": 13080 }, { "epoch": 8.752508361204013, "grad_norm": 1.450588345527649, "learning_rate": 6.911934018617359e-06, "loss": 0.3795, "num_input_tokens_seen": 13488656, "step": 13085 }, { "epoch": 8.755852842809364, "grad_norm": 1.4549700021743774, "learning_rate": 6.909236889830882e-06, "loss": 0.4423, "num_input_tokens_seen": 13493808, "step": 13090 }, { "epoch": 8.759197324414716, "grad_norm": 1.227466106414795, "learning_rate": 6.906539110507163e-06, "loss": 0.3222, "num_input_tokens_seen": 13499344, "step": 13095 }, { "epoch": 8.762541806020067, "grad_norm": 2.379993438720703, "learning_rate": 6.903840681565424e-06, "loss": 0.5311, "num_input_tokens_seen": 13505264, "step": 13100 }, { "epoch": 8.765886287625419, "grad_norm": 2.0277340412139893, "learning_rate": 6.9011416039251e-06, "loss": 0.3728, "num_input_tokens_seen": 13509776, "step": 13105 }, { "epoch": 8.76923076923077, "grad_norm": 2.6140074729919434, "learning_rate": 6.898441878505857e-06, "loss": 0.4437, "num_input_tokens_seen": 13515280, "step": 13110 }, { "epoch": 8.77257525083612, "grad_norm": 2.0579185485839844, "learning_rate": 6.895741506227573e-06, "loss": 0.4862, "num_input_tokens_seen": 13520688, "step": 13115 }, { "epoch": 8.775919732441471, "grad_norm": 1.887974500656128, "learning_rate": 6.893040488010351e-06, "loss": 0.4516, "num_input_tokens_seen": 13526384, "step": 13120 }, { "epoch": 8.779264214046822, "grad_norm": 1.9197932481765747, "learning_rate": 6.890338824774513e-06, "loss": 0.45, "num_input_tokens_seen": 13531408, "step": 13125 }, { "epoch": 8.782608695652174, "grad_norm": 2.2594985961914062, "learning_rate": 6.8876365174406004e-06, "loss": 0.4697, "num_input_tokens_seen": 13536496, "step": 13130 }, { "epoch": 8.785953177257525, "grad_norm": 1.596926212310791, "learning_rate": 6.884933566929377e-06, "loss": 0.3824, "num_input_tokens_seen": 13541776, "step": 13135 }, { "epoch": 8.789297658862877, "grad_norm": 1.8633873462677002, "learning_rate": 6.88222997416182e-06, "loss": 0.4702, "num_input_tokens_seen": 13546288, "step": 13140 }, { "epoch": 8.792642140468228, "grad_norm": 1.8035392761230469, "learning_rate": 6.879525740059133e-06, "loss": 0.4177, "num_input_tokens_seen": 13551952, "step": 13145 }, { "epoch": 8.79598662207358, "grad_norm": 2.2709736824035645, "learning_rate": 6.876820865542729e-06, "loss": 0.5005, "num_input_tokens_seen": 13557520, "step": 13150 }, { "epoch": 8.799331103678929, "grad_norm": 2.2983832359313965, "learning_rate": 6.874115351534248e-06, "loss": 0.4394, "num_input_tokens_seen": 13562416, "step": 13155 }, { "epoch": 8.80267558528428, "grad_norm": 1.9603608846664429, "learning_rate": 6.871409198955542e-06, "loss": 0.3923, "num_input_tokens_seen": 13568464, "step": 13160 }, { "epoch": 8.806020066889632, "grad_norm": 1.4646795988082886, "learning_rate": 6.868702408728682e-06, "loss": 0.436, "num_input_tokens_seen": 13574160, "step": 13165 }, { "epoch": 8.809364548494983, "grad_norm": 2.860186815261841, "learning_rate": 6.865994981775958e-06, "loss": 0.5123, "num_input_tokens_seen": 13579120, "step": 13170 }, { "epoch": 8.812709030100335, "grad_norm": 1.6294277906417847, "learning_rate": 6.8632869190198744e-06, "loss": 0.4121, "num_input_tokens_seen": 13584176, "step": 13175 }, { "epoch": 8.816053511705686, "grad_norm": 1.2088279724121094, "learning_rate": 6.860578221383156e-06, "loss": 0.3847, "num_input_tokens_seen": 13589232, "step": 13180 }, { "epoch": 8.819397993311037, "grad_norm": 2.1920177936553955, "learning_rate": 6.85786888978874e-06, "loss": 0.4575, "num_input_tokens_seen": 13594672, "step": 13185 }, { "epoch": 8.822742474916389, "grad_norm": 1.665204405784607, "learning_rate": 6.855158925159783e-06, "loss": 0.4414, "num_input_tokens_seen": 13599056, "step": 13190 }, { "epoch": 8.826086956521738, "grad_norm": 2.1963579654693604, "learning_rate": 6.852448328419652e-06, "loss": 0.4374, "num_input_tokens_seen": 13604304, "step": 13195 }, { "epoch": 8.82943143812709, "grad_norm": 2.491549015045166, "learning_rate": 6.849737100491934e-06, "loss": 0.4435, "num_input_tokens_seen": 13609776, "step": 13200 }, { "epoch": 8.832775919732441, "grad_norm": 2.6172397136688232, "learning_rate": 6.847025242300434e-06, "loss": 0.5466, "num_input_tokens_seen": 13614992, "step": 13205 }, { "epoch": 8.836120401337793, "grad_norm": 2.104421377182007, "learning_rate": 6.84431275476916e-06, "loss": 0.4121, "num_input_tokens_seen": 13619408, "step": 13210 }, { "epoch": 8.839464882943144, "grad_norm": 2.133547067642212, "learning_rate": 6.8415996388223475e-06, "loss": 0.4736, "num_input_tokens_seen": 13624688, "step": 13215 }, { "epoch": 8.842809364548495, "grad_norm": 1.715575098991394, "learning_rate": 6.8388858953844395e-06, "loss": 0.3985, "num_input_tokens_seen": 13630928, "step": 13220 }, { "epoch": 8.846153846153847, "grad_norm": 2.0372414588928223, "learning_rate": 6.836171525380096e-06, "loss": 0.5315, "num_input_tokens_seen": 13635600, "step": 13225 }, { "epoch": 8.849498327759198, "grad_norm": 2.020875930786133, "learning_rate": 6.833456529734184e-06, "loss": 0.3484, "num_input_tokens_seen": 13641200, "step": 13230 }, { "epoch": 8.852842809364548, "grad_norm": 1.5820250511169434, "learning_rate": 6.830740909371791e-06, "loss": 0.4814, "num_input_tokens_seen": 13646544, "step": 13235 }, { "epoch": 8.856187290969899, "grad_norm": 1.7139840126037598, "learning_rate": 6.828024665218215e-06, "loss": 0.5234, "num_input_tokens_seen": 13651760, "step": 13240 }, { "epoch": 8.85953177257525, "grad_norm": 1.9562780857086182, "learning_rate": 6.825307798198965e-06, "loss": 0.4424, "num_input_tokens_seen": 13656688, "step": 13245 }, { "epoch": 8.862876254180602, "grad_norm": 1.6171314716339111, "learning_rate": 6.822590309239764e-06, "loss": 0.4261, "num_input_tokens_seen": 13661552, "step": 13250 }, { "epoch": 8.866220735785953, "grad_norm": 2.0044872760772705, "learning_rate": 6.819872199266544e-06, "loss": 0.3997, "num_input_tokens_seen": 13666000, "step": 13255 }, { "epoch": 8.869565217391305, "grad_norm": 1.5383135080337524, "learning_rate": 6.817153469205456e-06, "loss": 0.3692, "num_input_tokens_seen": 13671536, "step": 13260 }, { "epoch": 8.872909698996656, "grad_norm": 2.6949164867401123, "learning_rate": 6.814434119982849e-06, "loss": 0.415, "num_input_tokens_seen": 13676304, "step": 13265 }, { "epoch": 8.876254180602007, "grad_norm": 1.7027153968811035, "learning_rate": 6.8117141525252985e-06, "loss": 0.4468, "num_input_tokens_seen": 13680976, "step": 13270 }, { "epoch": 8.879598662207357, "grad_norm": 1.7813365459442139, "learning_rate": 6.808993567759579e-06, "loss": 0.4374, "num_input_tokens_seen": 13684912, "step": 13275 }, { "epoch": 8.882943143812708, "grad_norm": 1.4169347286224365, "learning_rate": 6.806272366612683e-06, "loss": 0.4482, "num_input_tokens_seen": 13690800, "step": 13280 }, { "epoch": 8.88628762541806, "grad_norm": 1.6946053504943848, "learning_rate": 6.803550550011806e-06, "loss": 0.4818, "num_input_tokens_seen": 13695888, "step": 13285 }, { "epoch": 8.889632107023411, "grad_norm": 1.3578767776489258, "learning_rate": 6.800828118884359e-06, "loss": 0.472, "num_input_tokens_seen": 13701360, "step": 13290 }, { "epoch": 8.892976588628763, "grad_norm": 2.1570794582366943, "learning_rate": 6.798105074157959e-06, "loss": 0.3793, "num_input_tokens_seen": 13707024, "step": 13295 }, { "epoch": 8.896321070234114, "grad_norm": 3.30557918548584, "learning_rate": 6.795381416760436e-06, "loss": 0.4562, "num_input_tokens_seen": 13711792, "step": 13300 }, { "epoch": 8.899665551839465, "grad_norm": 1.8220593929290771, "learning_rate": 6.792657147619822e-06, "loss": 0.4755, "num_input_tokens_seen": 13717104, "step": 13305 }, { "epoch": 8.903010033444817, "grad_norm": 2.353569746017456, "learning_rate": 6.789932267664363e-06, "loss": 0.4852, "num_input_tokens_seen": 13722672, "step": 13310 }, { "epoch": 8.906354515050166, "grad_norm": 1.5344587564468384, "learning_rate": 6.7872067778225125e-06, "loss": 0.446, "num_input_tokens_seen": 13727408, "step": 13315 }, { "epoch": 8.909698996655518, "grad_norm": 1.7372244596481323, "learning_rate": 6.784480679022929e-06, "loss": 0.3904, "num_input_tokens_seen": 13732912, "step": 13320 }, { "epoch": 8.91304347826087, "grad_norm": 1.6965737342834473, "learning_rate": 6.7817539721944825e-06, "loss": 0.3649, "num_input_tokens_seen": 13738032, "step": 13325 }, { "epoch": 8.91638795986622, "grad_norm": 1.766006350517273, "learning_rate": 6.779026658266247e-06, "loss": 0.4707, "num_input_tokens_seen": 13743312, "step": 13330 }, { "epoch": 8.919732441471572, "grad_norm": 1.979134202003479, "learning_rate": 6.776298738167505e-06, "loss": 0.4804, "num_input_tokens_seen": 13750000, "step": 13335 }, { "epoch": 8.923076923076923, "grad_norm": 3.2430081367492676, "learning_rate": 6.773570212827743e-06, "loss": 0.4759, "num_input_tokens_seen": 13755728, "step": 13340 }, { "epoch": 8.926421404682275, "grad_norm": 2.168548345565796, "learning_rate": 6.770841083176659e-06, "loss": 0.3806, "num_input_tokens_seen": 13759792, "step": 13345 }, { "epoch": 8.929765886287626, "grad_norm": 1.58558988571167, "learning_rate": 6.768111350144151e-06, "loss": 0.4432, "num_input_tokens_seen": 13764752, "step": 13350 }, { "epoch": 8.933110367892976, "grad_norm": 1.4791998863220215, "learning_rate": 6.765381014660325e-06, "loss": 0.4833, "num_input_tokens_seen": 13770736, "step": 13355 }, { "epoch": 8.936454849498327, "grad_norm": 1.3599170446395874, "learning_rate": 6.762650077655495e-06, "loss": 0.3616, "num_input_tokens_seen": 13775632, "step": 13360 }, { "epoch": 8.939799331103679, "grad_norm": 1.5451555252075195, "learning_rate": 6.759918540060173e-06, "loss": 0.4396, "num_input_tokens_seen": 13781040, "step": 13365 }, { "epoch": 8.94314381270903, "grad_norm": 1.874598503112793, "learning_rate": 6.7571864028050835e-06, "loss": 0.4685, "num_input_tokens_seen": 13786032, "step": 13370 }, { "epoch": 8.946488294314381, "grad_norm": 2.722316026687622, "learning_rate": 6.754453666821152e-06, "loss": 0.3673, "num_input_tokens_seen": 13790832, "step": 13375 }, { "epoch": 8.949832775919733, "grad_norm": 1.7991838455200195, "learning_rate": 6.7517203330395065e-06, "loss": 0.3991, "num_input_tokens_seen": 13795952, "step": 13380 }, { "epoch": 8.953177257525084, "grad_norm": 1.9286409616470337, "learning_rate": 6.74898640239148e-06, "loss": 0.3996, "num_input_tokens_seen": 13800432, "step": 13385 }, { "epoch": 8.956521739130435, "grad_norm": 1.5311920642852783, "learning_rate": 6.746251875808609e-06, "loss": 0.4176, "num_input_tokens_seen": 13805168, "step": 13390 }, { "epoch": 8.959866220735787, "grad_norm": 2.486757516860962, "learning_rate": 6.743516754222635e-06, "loss": 0.4441, "num_input_tokens_seen": 13810256, "step": 13395 }, { "epoch": 8.963210702341136, "grad_norm": 3.3513951301574707, "learning_rate": 6.740781038565497e-06, "loss": 0.4038, "num_input_tokens_seen": 13815216, "step": 13400 }, { "epoch": 8.966555183946488, "grad_norm": 2.208669900894165, "learning_rate": 6.738044729769342e-06, "loss": 0.5271, "num_input_tokens_seen": 13820656, "step": 13405 }, { "epoch": 8.96989966555184, "grad_norm": 2.078373908996582, "learning_rate": 6.735307828766515e-06, "loss": 0.4946, "num_input_tokens_seen": 13825744, "step": 13410 }, { "epoch": 8.97324414715719, "grad_norm": 2.002655506134033, "learning_rate": 6.732570336489565e-06, "loss": 0.4868, "num_input_tokens_seen": 13830032, "step": 13415 }, { "epoch": 8.976588628762542, "grad_norm": 2.6925394535064697, "learning_rate": 6.729832253871245e-06, "loss": 0.444, "num_input_tokens_seen": 13835408, "step": 13420 }, { "epoch": 8.979933110367893, "grad_norm": 1.8687348365783691, "learning_rate": 6.727093581844502e-06, "loss": 0.4005, "num_input_tokens_seen": 13840432, "step": 13425 }, { "epoch": 8.983277591973245, "grad_norm": 1.7339158058166504, "learning_rate": 6.724354321342489e-06, "loss": 0.4306, "num_input_tokens_seen": 13845200, "step": 13430 }, { "epoch": 8.986622073578594, "grad_norm": 1.836778163909912, "learning_rate": 6.72161447329856e-06, "loss": 0.3792, "num_input_tokens_seen": 13850224, "step": 13435 }, { "epoch": 8.989966555183946, "grad_norm": 1.6976479291915894, "learning_rate": 6.718874038646266e-06, "loss": 0.4985, "num_input_tokens_seen": 13855536, "step": 13440 }, { "epoch": 8.993311036789297, "grad_norm": 2.1666784286499023, "learning_rate": 6.7161330183193595e-06, "loss": 0.5829, "num_input_tokens_seen": 13860688, "step": 13445 }, { "epoch": 8.996655518394649, "grad_norm": 1.6753357648849487, "learning_rate": 6.713391413251795e-06, "loss": 0.4014, "num_input_tokens_seen": 13866096, "step": 13450 }, { "epoch": 9.0, "grad_norm": 3.063601016998291, "learning_rate": 6.710649224377721e-06, "loss": 0.5916, "num_input_tokens_seen": 13870528, "step": 13455 }, { "epoch": 9.003344481605351, "grad_norm": 2.3567161560058594, "learning_rate": 6.70790645263149e-06, "loss": 0.4989, "num_input_tokens_seen": 13876192, "step": 13460 }, { "epoch": 9.006688963210703, "grad_norm": 1.853701114654541, "learning_rate": 6.705163098947648e-06, "loss": 0.3157, "num_input_tokens_seen": 13880672, "step": 13465 }, { "epoch": 9.010033444816054, "grad_norm": 1.2897777557373047, "learning_rate": 6.702419164260944e-06, "loss": 0.4207, "num_input_tokens_seen": 13885952, "step": 13470 }, { "epoch": 9.013377926421406, "grad_norm": 2.3482112884521484, "learning_rate": 6.699674649506323e-06, "loss": 0.5249, "num_input_tokens_seen": 13891136, "step": 13475 }, { "epoch": 9.016722408026755, "grad_norm": 2.197655439376831, "learning_rate": 6.6969295556189295e-06, "loss": 0.3858, "num_input_tokens_seen": 13896000, "step": 13480 }, { "epoch": 9.020066889632107, "grad_norm": 1.4763957262039185, "learning_rate": 6.6941838835341e-06, "loss": 0.3833, "num_input_tokens_seen": 13901440, "step": 13485 }, { "epoch": 9.023411371237458, "grad_norm": 1.737815499305725, "learning_rate": 6.691437634187375e-06, "loss": 0.4479, "num_input_tokens_seen": 13906496, "step": 13490 }, { "epoch": 9.02675585284281, "grad_norm": 2.2451767921447754, "learning_rate": 6.688690808514485e-06, "loss": 0.4326, "num_input_tokens_seen": 13911584, "step": 13495 }, { "epoch": 9.03010033444816, "grad_norm": 2.0775835514068604, "learning_rate": 6.685943407451363e-06, "loss": 0.3852, "num_input_tokens_seen": 13918912, "step": 13500 }, { "epoch": 9.033444816053512, "grad_norm": 2.3734641075134277, "learning_rate": 6.683195431934134e-06, "loss": 0.4186, "num_input_tokens_seen": 13923840, "step": 13505 }, { "epoch": 9.036789297658864, "grad_norm": 1.4846686124801636, "learning_rate": 6.68044688289912e-06, "loss": 0.3785, "num_input_tokens_seen": 13929376, "step": 13510 }, { "epoch": 9.040133779264215, "grad_norm": 2.2182748317718506, "learning_rate": 6.677697761282837e-06, "loss": 0.3769, "num_input_tokens_seen": 13934144, "step": 13515 }, { "epoch": 9.043478260869565, "grad_norm": 1.7604520320892334, "learning_rate": 6.6749480680220005e-06, "loss": 0.42, "num_input_tokens_seen": 13939168, "step": 13520 }, { "epoch": 9.046822742474916, "grad_norm": 2.077197551727295, "learning_rate": 6.672197804053516e-06, "loss": 0.4573, "num_input_tokens_seen": 13944832, "step": 13525 }, { "epoch": 9.050167224080267, "grad_norm": 1.8374803066253662, "learning_rate": 6.669446970314486e-06, "loss": 0.4129, "num_input_tokens_seen": 13949856, "step": 13530 }, { "epoch": 9.053511705685619, "grad_norm": 2.0526459217071533, "learning_rate": 6.666695567742204e-06, "loss": 0.3639, "num_input_tokens_seen": 13954560, "step": 13535 }, { "epoch": 9.05685618729097, "grad_norm": 2.365516185760498, "learning_rate": 6.663943597274163e-06, "loss": 0.3488, "num_input_tokens_seen": 13959584, "step": 13540 }, { "epoch": 9.060200668896321, "grad_norm": 1.960218071937561, "learning_rate": 6.661191059848043e-06, "loss": 0.383, "num_input_tokens_seen": 13964672, "step": 13545 }, { "epoch": 9.063545150501673, "grad_norm": 1.7084643840789795, "learning_rate": 6.658437956401723e-06, "loss": 0.3498, "num_input_tokens_seen": 13969216, "step": 13550 }, { "epoch": 9.066889632107024, "grad_norm": 2.1228485107421875, "learning_rate": 6.65568428787327e-06, "loss": 0.5561, "num_input_tokens_seen": 13974720, "step": 13555 }, { "epoch": 9.070234113712374, "grad_norm": 3.1347382068634033, "learning_rate": 6.652930055200948e-06, "loss": 0.4208, "num_input_tokens_seen": 13979552, "step": 13560 }, { "epoch": 9.073578595317725, "grad_norm": 1.7126866579055786, "learning_rate": 6.650175259323208e-06, "loss": 0.4191, "num_input_tokens_seen": 13985728, "step": 13565 }, { "epoch": 9.076923076923077, "grad_norm": 1.9820269346237183, "learning_rate": 6.647419901178699e-06, "loss": 0.3656, "num_input_tokens_seen": 13991232, "step": 13570 }, { "epoch": 9.080267558528428, "grad_norm": 1.7362456321716309, "learning_rate": 6.644663981706257e-06, "loss": 0.4875, "num_input_tokens_seen": 13996256, "step": 13575 }, { "epoch": 9.08361204013378, "grad_norm": 2.1245131492614746, "learning_rate": 6.641907501844909e-06, "loss": 0.4242, "num_input_tokens_seen": 14001440, "step": 13580 }, { "epoch": 9.08695652173913, "grad_norm": 2.190023422241211, "learning_rate": 6.639150462533879e-06, "loss": 0.5206, "num_input_tokens_seen": 14006432, "step": 13585 }, { "epoch": 9.090301003344482, "grad_norm": 1.8137192726135254, "learning_rate": 6.636392864712573e-06, "loss": 0.3965, "num_input_tokens_seen": 14011488, "step": 13590 }, { "epoch": 9.093645484949834, "grad_norm": 1.9932972192764282, "learning_rate": 6.633634709320594e-06, "loss": 0.4081, "num_input_tokens_seen": 14016608, "step": 13595 }, { "epoch": 9.096989966555183, "grad_norm": 1.8413419723510742, "learning_rate": 6.630875997297731e-06, "loss": 0.3931, "num_input_tokens_seen": 14021664, "step": 13600 }, { "epoch": 9.100334448160535, "grad_norm": 2.0699479579925537, "learning_rate": 6.628116729583967e-06, "loss": 0.4062, "num_input_tokens_seen": 14026880, "step": 13605 }, { "epoch": 9.103678929765886, "grad_norm": 3.098787307739258, "learning_rate": 6.625356907119467e-06, "loss": 0.4696, "num_input_tokens_seen": 14032672, "step": 13610 }, { "epoch": 9.107023411371237, "grad_norm": 1.8605972528457642, "learning_rate": 6.622596530844592e-06, "loss": 0.3524, "num_input_tokens_seen": 14037312, "step": 13615 }, { "epoch": 9.110367892976589, "grad_norm": 1.7319400310516357, "learning_rate": 6.61983560169989e-06, "loss": 0.3721, "num_input_tokens_seen": 14042752, "step": 13620 }, { "epoch": 9.11371237458194, "grad_norm": 1.8472241163253784, "learning_rate": 6.617074120626097e-06, "loss": 0.4448, "num_input_tokens_seen": 14047712, "step": 13625 }, { "epoch": 9.117056856187292, "grad_norm": 2.6668527126312256, "learning_rate": 6.614312088564137e-06, "loss": 0.5083, "num_input_tokens_seen": 14052320, "step": 13630 }, { "epoch": 9.120401337792643, "grad_norm": 2.754051923751831, "learning_rate": 6.6115495064551175e-06, "loss": 0.4492, "num_input_tokens_seen": 14058080, "step": 13635 }, { "epoch": 9.123745819397993, "grad_norm": 1.5666545629501343, "learning_rate": 6.608786375240343e-06, "loss": 0.3303, "num_input_tokens_seen": 14063040, "step": 13640 }, { "epoch": 9.127090301003344, "grad_norm": 2.03092098236084, "learning_rate": 6.606022695861294e-06, "loss": 0.4347, "num_input_tokens_seen": 14068000, "step": 13645 }, { "epoch": 9.130434782608695, "grad_norm": 2.315955400466919, "learning_rate": 6.603258469259648e-06, "loss": 0.4056, "num_input_tokens_seen": 14072736, "step": 13650 }, { "epoch": 9.133779264214047, "grad_norm": 1.803722620010376, "learning_rate": 6.600493696377259e-06, "loss": 0.4001, "num_input_tokens_seen": 14077760, "step": 13655 }, { "epoch": 9.137123745819398, "grad_norm": 2.0990278720855713, "learning_rate": 6.597728378156178e-06, "loss": 0.4395, "num_input_tokens_seen": 14083264, "step": 13660 }, { "epoch": 9.14046822742475, "grad_norm": 2.2141237258911133, "learning_rate": 6.594962515538633e-06, "loss": 0.4904, "num_input_tokens_seen": 14088736, "step": 13665 }, { "epoch": 9.143812709030101, "grad_norm": 1.6382428407669067, "learning_rate": 6.592196109467042e-06, "loss": 0.4385, "num_input_tokens_seen": 14093568, "step": 13670 }, { "epoch": 9.147157190635452, "grad_norm": 1.9724823236465454, "learning_rate": 6.589429160884004e-06, "loss": 0.3456, "num_input_tokens_seen": 14098464, "step": 13675 }, { "epoch": 9.150501672240802, "grad_norm": 1.8365464210510254, "learning_rate": 6.586661670732309e-06, "loss": 0.4131, "num_input_tokens_seen": 14103328, "step": 13680 }, { "epoch": 9.153846153846153, "grad_norm": 2.2677059173583984, "learning_rate": 6.583893639954928e-06, "loss": 0.4165, "num_input_tokens_seen": 14108640, "step": 13685 }, { "epoch": 9.157190635451505, "grad_norm": 1.3206037282943726, "learning_rate": 6.5811250694950134e-06, "loss": 0.3801, "num_input_tokens_seen": 14114560, "step": 13690 }, { "epoch": 9.160535117056856, "grad_norm": 1.9638116359710693, "learning_rate": 6.578355960295908e-06, "loss": 0.4356, "num_input_tokens_seen": 14119936, "step": 13695 }, { "epoch": 9.163879598662207, "grad_norm": 1.5139052867889404, "learning_rate": 6.575586313301132e-06, "loss": 0.3744, "num_input_tokens_seen": 14124672, "step": 13700 }, { "epoch": 9.167224080267559, "grad_norm": 2.1570050716400146, "learning_rate": 6.572816129454394e-06, "loss": 0.4272, "num_input_tokens_seen": 14129792, "step": 13705 }, { "epoch": 9.17056856187291, "grad_norm": 1.5435590744018555, "learning_rate": 6.57004540969958e-06, "loss": 0.4075, "num_input_tokens_seen": 14135040, "step": 13710 }, { "epoch": 9.173913043478262, "grad_norm": 1.4996953010559082, "learning_rate": 6.567274154980764e-06, "loss": 0.3801, "num_input_tokens_seen": 14140640, "step": 13715 }, { "epoch": 9.177257525083611, "grad_norm": 1.5521142482757568, "learning_rate": 6.5645023662422005e-06, "loss": 0.4153, "num_input_tokens_seen": 14145728, "step": 13720 }, { "epoch": 9.180602006688963, "grad_norm": 2.6018409729003906, "learning_rate": 6.561730044428322e-06, "loss": 0.4209, "num_input_tokens_seen": 14150464, "step": 13725 }, { "epoch": 9.183946488294314, "grad_norm": 2.5460429191589355, "learning_rate": 6.558957190483749e-06, "loss": 0.4884, "num_input_tokens_seen": 14155968, "step": 13730 }, { "epoch": 9.187290969899665, "grad_norm": 1.5934900045394897, "learning_rate": 6.556183805353279e-06, "loss": 0.4482, "num_input_tokens_seen": 14162080, "step": 13735 }, { "epoch": 9.190635451505017, "grad_norm": 2.2763800621032715, "learning_rate": 6.553409889981892e-06, "loss": 0.3687, "num_input_tokens_seen": 14167296, "step": 13740 }, { "epoch": 9.193979933110368, "grad_norm": 1.7509610652923584, "learning_rate": 6.550635445314749e-06, "loss": 0.3185, "num_input_tokens_seen": 14172544, "step": 13745 }, { "epoch": 9.19732441471572, "grad_norm": 1.6185505390167236, "learning_rate": 6.547860472297192e-06, "loss": 0.388, "num_input_tokens_seen": 14178400, "step": 13750 }, { "epoch": 9.200668896321071, "grad_norm": 2.280730962753296, "learning_rate": 6.545084971874738e-06, "loss": 0.4123, "num_input_tokens_seen": 14184224, "step": 13755 }, { "epoch": 9.20401337792642, "grad_norm": 1.9906786680221558, "learning_rate": 6.54230894499309e-06, "loss": 0.4004, "num_input_tokens_seen": 14189696, "step": 13760 }, { "epoch": 9.207357859531772, "grad_norm": 1.6508346796035767, "learning_rate": 6.539532392598129e-06, "loss": 0.3965, "num_input_tokens_seen": 14194432, "step": 13765 }, { "epoch": 9.210702341137123, "grad_norm": 2.077120542526245, "learning_rate": 6.536755315635912e-06, "loss": 0.3953, "num_input_tokens_seen": 14200160, "step": 13770 }, { "epoch": 9.214046822742475, "grad_norm": 3.493245840072632, "learning_rate": 6.53397771505268e-06, "loss": 0.3756, "num_input_tokens_seen": 14205440, "step": 13775 }, { "epoch": 9.217391304347826, "grad_norm": 2.014150857925415, "learning_rate": 6.531199591794843e-06, "loss": 0.4413, "num_input_tokens_seen": 14211456, "step": 13780 }, { "epoch": 9.220735785953178, "grad_norm": 2.3263471126556396, "learning_rate": 6.5284209468090024e-06, "loss": 0.4763, "num_input_tokens_seen": 14217408, "step": 13785 }, { "epoch": 9.224080267558529, "grad_norm": 1.7507615089416504, "learning_rate": 6.5256417810419236e-06, "loss": 0.4149, "num_input_tokens_seen": 14223008, "step": 13790 }, { "epoch": 9.22742474916388, "grad_norm": 2.156179189682007, "learning_rate": 6.5228620954405606e-06, "loss": 0.3808, "num_input_tokens_seen": 14227488, "step": 13795 }, { "epoch": 9.23076923076923, "grad_norm": 2.459981918334961, "learning_rate": 6.520081890952039e-06, "loss": 0.3761, "num_input_tokens_seen": 14232544, "step": 13800 }, { "epoch": 9.234113712374581, "grad_norm": 1.7372465133666992, "learning_rate": 6.517301168523662e-06, "loss": 0.3948, "num_input_tokens_seen": 14237184, "step": 13805 }, { "epoch": 9.237458193979933, "grad_norm": 2.0877113342285156, "learning_rate": 6.514519929102908e-06, "loss": 0.381, "num_input_tokens_seen": 14242240, "step": 13810 }, { "epoch": 9.240802675585284, "grad_norm": 2.078265428543091, "learning_rate": 6.511738173637434e-06, "loss": 0.4119, "num_input_tokens_seen": 14247488, "step": 13815 }, { "epoch": 9.244147157190636, "grad_norm": 4.059589385986328, "learning_rate": 6.508955903075074e-06, "loss": 0.4404, "num_input_tokens_seen": 14253152, "step": 13820 }, { "epoch": 9.247491638795987, "grad_norm": 2.3975865840911865, "learning_rate": 6.506173118363832e-06, "loss": 0.3264, "num_input_tokens_seen": 14258688, "step": 13825 }, { "epoch": 9.250836120401338, "grad_norm": 3.743756055831909, "learning_rate": 6.503389820451893e-06, "loss": 0.4545, "num_input_tokens_seen": 14263392, "step": 13830 }, { "epoch": 9.25418060200669, "grad_norm": 2.251527786254883, "learning_rate": 6.500606010287611e-06, "loss": 0.4475, "num_input_tokens_seen": 14268864, "step": 13835 }, { "epoch": 9.25752508361204, "grad_norm": 1.498293399810791, "learning_rate": 6.497821688819524e-06, "loss": 0.4225, "num_input_tokens_seen": 14274048, "step": 13840 }, { "epoch": 9.26086956521739, "grad_norm": 2.2947607040405273, "learning_rate": 6.495036856996332e-06, "loss": 0.4857, "num_input_tokens_seen": 14279168, "step": 13845 }, { "epoch": 9.264214046822742, "grad_norm": 1.4711503982543945, "learning_rate": 6.492251515766919e-06, "loss": 0.3328, "num_input_tokens_seen": 14283808, "step": 13850 }, { "epoch": 9.267558528428093, "grad_norm": 2.0633697509765625, "learning_rate": 6.489465666080334e-06, "loss": 0.3845, "num_input_tokens_seen": 14288672, "step": 13855 }, { "epoch": 9.270903010033445, "grad_norm": 1.6183737516403198, "learning_rate": 6.48667930888581e-06, "loss": 0.3473, "num_input_tokens_seen": 14293760, "step": 13860 }, { "epoch": 9.274247491638796, "grad_norm": 1.6289358139038086, "learning_rate": 6.483892445132743e-06, "loss": 0.4944, "num_input_tokens_seen": 14298336, "step": 13865 }, { "epoch": 9.277591973244148, "grad_norm": 1.8331323862075806, "learning_rate": 6.481105075770706e-06, "loss": 0.4384, "num_input_tokens_seen": 14303808, "step": 13870 }, { "epoch": 9.280936454849499, "grad_norm": 2.072610855102539, "learning_rate": 6.478317201749446e-06, "loss": 0.5018, "num_input_tokens_seen": 14309216, "step": 13875 }, { "epoch": 9.284280936454849, "grad_norm": 1.7632187604904175, "learning_rate": 6.475528824018875e-06, "loss": 0.3751, "num_input_tokens_seen": 14314912, "step": 13880 }, { "epoch": 9.2876254180602, "grad_norm": 1.8405674695968628, "learning_rate": 6.472739943529085e-06, "loss": 0.4196, "num_input_tokens_seen": 14320288, "step": 13885 }, { "epoch": 9.290969899665551, "grad_norm": 1.6823267936706543, "learning_rate": 6.469950561230334e-06, "loss": 0.3467, "num_input_tokens_seen": 14325088, "step": 13890 }, { "epoch": 9.294314381270903, "grad_norm": 1.8458948135375977, "learning_rate": 6.467160678073054e-06, "loss": 0.339, "num_input_tokens_seen": 14329376, "step": 13895 }, { "epoch": 9.297658862876254, "grad_norm": 2.42490553855896, "learning_rate": 6.464370295007843e-06, "loss": 0.3836, "num_input_tokens_seen": 14334688, "step": 13900 }, { "epoch": 9.301003344481606, "grad_norm": 2.2662546634674072, "learning_rate": 6.461579412985477e-06, "loss": 0.409, "num_input_tokens_seen": 14339328, "step": 13905 }, { "epoch": 9.304347826086957, "grad_norm": 2.216392993927002, "learning_rate": 6.458788032956893e-06, "loss": 0.3104, "num_input_tokens_seen": 14344192, "step": 13910 }, { "epoch": 9.307692307692308, "grad_norm": 1.8619201183319092, "learning_rate": 6.455996155873206e-06, "loss": 0.3771, "num_input_tokens_seen": 14349184, "step": 13915 }, { "epoch": 9.31103678929766, "grad_norm": 3.1537837982177734, "learning_rate": 6.453203782685695e-06, "loss": 0.4841, "num_input_tokens_seen": 14355360, "step": 13920 }, { "epoch": 9.31438127090301, "grad_norm": 1.7140637636184692, "learning_rate": 6.4504109143458085e-06, "loss": 0.3624, "num_input_tokens_seen": 14360288, "step": 13925 }, { "epoch": 9.31772575250836, "grad_norm": 2.003743886947632, "learning_rate": 6.447617551805167e-06, "loss": 0.3766, "num_input_tokens_seen": 14365344, "step": 13930 }, { "epoch": 9.321070234113712, "grad_norm": 1.8885451555252075, "learning_rate": 6.444823696015557e-06, "loss": 0.4217, "num_input_tokens_seen": 14371072, "step": 13935 }, { "epoch": 9.324414715719064, "grad_norm": 3.2244114875793457, "learning_rate": 6.442029347928932e-06, "loss": 0.4277, "num_input_tokens_seen": 14376544, "step": 13940 }, { "epoch": 9.327759197324415, "grad_norm": 2.278738498687744, "learning_rate": 6.439234508497414e-06, "loss": 0.3772, "num_input_tokens_seen": 14380768, "step": 13945 }, { "epoch": 9.331103678929766, "grad_norm": 2.5203754901885986, "learning_rate": 6.436439178673296e-06, "loss": 0.4554, "num_input_tokens_seen": 14385856, "step": 13950 }, { "epoch": 9.334448160535118, "grad_norm": 1.6342099905014038, "learning_rate": 6.433643359409032e-06, "loss": 0.4133, "num_input_tokens_seen": 14390496, "step": 13955 }, { "epoch": 9.337792642140467, "grad_norm": 2.2300021648406982, "learning_rate": 6.430847051657248e-06, "loss": 0.3573, "num_input_tokens_seen": 14394784, "step": 13960 }, { "epoch": 9.341137123745819, "grad_norm": 1.401651382446289, "learning_rate": 6.428050256370732e-06, "loss": 0.4541, "num_input_tokens_seen": 14401024, "step": 13965 }, { "epoch": 9.34448160535117, "grad_norm": 1.755732536315918, "learning_rate": 6.425252974502441e-06, "loss": 0.3858, "num_input_tokens_seen": 14405888, "step": 13970 }, { "epoch": 9.347826086956522, "grad_norm": 1.9629168510437012, "learning_rate": 6.4224552070055e-06, "loss": 0.45, "num_input_tokens_seen": 14411808, "step": 13975 }, { "epoch": 9.351170568561873, "grad_norm": 2.53058123588562, "learning_rate": 6.419656954833193e-06, "loss": 0.3654, "num_input_tokens_seen": 14416704, "step": 13980 }, { "epoch": 9.354515050167224, "grad_norm": 2.9861302375793457, "learning_rate": 6.416858218938976e-06, "loss": 0.3921, "num_input_tokens_seen": 14421952, "step": 13985 }, { "epoch": 9.357859531772576, "grad_norm": 2.444615364074707, "learning_rate": 6.4140590002764636e-06, "loss": 0.3912, "num_input_tokens_seen": 14426976, "step": 13990 }, { "epoch": 9.361204013377927, "grad_norm": 3.0138766765594482, "learning_rate": 6.4112592997994405e-06, "loss": 0.3073, "num_input_tokens_seen": 14432000, "step": 13995 }, { "epoch": 9.364548494983278, "grad_norm": 1.9574121236801147, "learning_rate": 6.408459118461851e-06, "loss": 0.4721, "num_input_tokens_seen": 14437504, "step": 14000 }, { "epoch": 9.367892976588628, "grad_norm": 1.8259978294372559, "learning_rate": 6.405658457217805e-06, "loss": 0.4162, "num_input_tokens_seen": 14443360, "step": 14005 }, { "epoch": 9.37123745819398, "grad_norm": 2.3450875282287598, "learning_rate": 6.402857317021577e-06, "loss": 0.3875, "num_input_tokens_seen": 14448096, "step": 14010 }, { "epoch": 9.37458193979933, "grad_norm": 1.93203866481781, "learning_rate": 6.4000556988276045e-06, "loss": 0.3904, "num_input_tokens_seen": 14452896, "step": 14015 }, { "epoch": 9.377926421404682, "grad_norm": 1.6230477094650269, "learning_rate": 6.397253603590488e-06, "loss": 0.4108, "num_input_tokens_seen": 14458176, "step": 14020 }, { "epoch": 9.381270903010034, "grad_norm": 2.2408437728881836, "learning_rate": 6.3944510322649855e-06, "loss": 0.4719, "num_input_tokens_seen": 14463104, "step": 14025 }, { "epoch": 9.384615384615385, "grad_norm": 2.0987496376037598, "learning_rate": 6.391647985806026e-06, "loss": 0.4857, "num_input_tokens_seen": 14468640, "step": 14030 }, { "epoch": 9.387959866220736, "grad_norm": 1.9427828788757324, "learning_rate": 6.388844465168693e-06, "loss": 0.4006, "num_input_tokens_seen": 14473664, "step": 14035 }, { "epoch": 9.391304347826088, "grad_norm": 1.8999366760253906, "learning_rate": 6.386040471308238e-06, "loss": 0.4369, "num_input_tokens_seen": 14479296, "step": 14040 }, { "epoch": 9.394648829431437, "grad_norm": 2.018799304962158, "learning_rate": 6.383236005180066e-06, "loss": 0.4322, "num_input_tokens_seen": 14484416, "step": 14045 }, { "epoch": 9.397993311036789, "grad_norm": 2.3327879905700684, "learning_rate": 6.3804310677397495e-06, "loss": 0.4539, "num_input_tokens_seen": 14489408, "step": 14050 }, { "epoch": 9.40133779264214, "grad_norm": 1.9840338230133057, "learning_rate": 6.377625659943018e-06, "loss": 0.3927, "num_input_tokens_seen": 14494528, "step": 14055 }, { "epoch": 9.404682274247492, "grad_norm": 2.3071680068969727, "learning_rate": 6.374819782745763e-06, "loss": 0.4721, "num_input_tokens_seen": 14499264, "step": 14060 }, { "epoch": 9.408026755852843, "grad_norm": 2.870378255844116, "learning_rate": 6.372013437104036e-06, "loss": 0.4593, "num_input_tokens_seen": 14504032, "step": 14065 }, { "epoch": 9.411371237458194, "grad_norm": 2.0794589519500732, "learning_rate": 6.369206623974045e-06, "loss": 0.3952, "num_input_tokens_seen": 14509216, "step": 14070 }, { "epoch": 9.414715719063546, "grad_norm": 1.6976816654205322, "learning_rate": 6.366399344312164e-06, "loss": 0.3834, "num_input_tokens_seen": 14513568, "step": 14075 }, { "epoch": 9.418060200668897, "grad_norm": 2.867100715637207, "learning_rate": 6.363591599074918e-06, "loss": 0.3887, "num_input_tokens_seen": 14518432, "step": 14080 }, { "epoch": 9.421404682274247, "grad_norm": 1.7543115615844727, "learning_rate": 6.360783389218996e-06, "loss": 0.3849, "num_input_tokens_seen": 14524192, "step": 14085 }, { "epoch": 9.424749163879598, "grad_norm": 2.147096872329712, "learning_rate": 6.357974715701243e-06, "loss": 0.4776, "num_input_tokens_seen": 14529440, "step": 14090 }, { "epoch": 9.42809364548495, "grad_norm": 1.6295567750930786, "learning_rate": 6.355165579478664e-06, "loss": 0.3847, "num_input_tokens_seen": 14534784, "step": 14095 }, { "epoch": 9.431438127090301, "grad_norm": 1.8185008764266968, "learning_rate": 6.35235598150842e-06, "loss": 0.4536, "num_input_tokens_seen": 14540064, "step": 14100 }, { "epoch": 9.434782608695652, "grad_norm": 1.6384692192077637, "learning_rate": 6.349545922747827e-06, "loss": 0.371, "num_input_tokens_seen": 14544608, "step": 14105 }, { "epoch": 9.438127090301004, "grad_norm": 1.7011072635650635, "learning_rate": 6.346735404154364e-06, "loss": 0.4502, "num_input_tokens_seen": 14548960, "step": 14110 }, { "epoch": 9.441471571906355, "grad_norm": 3.2206311225891113, "learning_rate": 6.343924426685661e-06, "loss": 0.4962, "num_input_tokens_seen": 14554464, "step": 14115 }, { "epoch": 9.444816053511706, "grad_norm": 1.6783078908920288, "learning_rate": 6.3411129912995095e-06, "loss": 0.5248, "num_input_tokens_seen": 14559520, "step": 14120 }, { "epoch": 9.448160535117056, "grad_norm": 1.6878688335418701, "learning_rate": 6.338301098953852e-06, "loss": 0.4241, "num_input_tokens_seen": 14564288, "step": 14125 }, { "epoch": 9.451505016722408, "grad_norm": 1.619963526725769, "learning_rate": 6.33548875060679e-06, "loss": 0.4306, "num_input_tokens_seen": 14569888, "step": 14130 }, { "epoch": 9.454849498327759, "grad_norm": 2.0868773460388184, "learning_rate": 6.3326759472165775e-06, "loss": 0.3802, "num_input_tokens_seen": 14575040, "step": 14135 }, { "epoch": 9.45819397993311, "grad_norm": 2.4588606357574463, "learning_rate": 6.329862689741628e-06, "loss": 0.4723, "num_input_tokens_seen": 14580928, "step": 14140 }, { "epoch": 9.461538461538462, "grad_norm": 2.1707608699798584, "learning_rate": 6.3270489791405055e-06, "loss": 0.3589, "num_input_tokens_seen": 14585952, "step": 14145 }, { "epoch": 9.464882943143813, "grad_norm": 1.2275971174240112, "learning_rate": 6.32423481637193e-06, "loss": 0.4141, "num_input_tokens_seen": 14591904, "step": 14150 }, { "epoch": 9.468227424749164, "grad_norm": 1.4706838130950928, "learning_rate": 6.3214202023947745e-06, "loss": 0.4357, "num_input_tokens_seen": 14597664, "step": 14155 }, { "epoch": 9.471571906354516, "grad_norm": 1.8291071653366089, "learning_rate": 6.318605138168069e-06, "loss": 0.3195, "num_input_tokens_seen": 14602464, "step": 14160 }, { "epoch": 9.474916387959865, "grad_norm": 2.24533748626709, "learning_rate": 6.315789624650994e-06, "loss": 0.4354, "num_input_tokens_seen": 14606944, "step": 14165 }, { "epoch": 9.478260869565217, "grad_norm": 1.9569623470306396, "learning_rate": 6.312973662802884e-06, "loss": 0.4922, "num_input_tokens_seen": 14612032, "step": 14170 }, { "epoch": 9.481605351170568, "grad_norm": 1.3162283897399902, "learning_rate": 6.3101572535832245e-06, "loss": 0.4083, "num_input_tokens_seen": 14616672, "step": 14175 }, { "epoch": 9.48494983277592, "grad_norm": 1.6338623762130737, "learning_rate": 6.3073403979516555e-06, "loss": 0.375, "num_input_tokens_seen": 14621920, "step": 14180 }, { "epoch": 9.488294314381271, "grad_norm": 1.429764747619629, "learning_rate": 6.30452309686797e-06, "loss": 0.3673, "num_input_tokens_seen": 14627264, "step": 14185 }, { "epoch": 9.491638795986622, "grad_norm": 1.7673211097717285, "learning_rate": 6.301705351292111e-06, "loss": 0.4444, "num_input_tokens_seen": 14633376, "step": 14190 }, { "epoch": 9.494983277591974, "grad_norm": 2.0412983894348145, "learning_rate": 6.298887162184172e-06, "loss": 0.4221, "num_input_tokens_seen": 14638592, "step": 14195 }, { "epoch": 9.498327759197325, "grad_norm": 2.712646245956421, "learning_rate": 6.2960685305044e-06, "loss": 0.4216, "num_input_tokens_seen": 14643808, "step": 14200 }, { "epoch": 9.501672240802675, "grad_norm": 2.002514362335205, "learning_rate": 6.293249457213191e-06, "loss": 0.4902, "num_input_tokens_seen": 14649024, "step": 14205 }, { "epoch": 9.505016722408026, "grad_norm": 1.7008137702941895, "learning_rate": 6.290429943271093e-06, "loss": 0.4092, "num_input_tokens_seen": 14654048, "step": 14210 }, { "epoch": 9.508361204013378, "grad_norm": 2.9521477222442627, "learning_rate": 6.287609989638801e-06, "loss": 0.4334, "num_input_tokens_seen": 14658336, "step": 14215 }, { "epoch": 9.511705685618729, "grad_norm": 1.8148033618927002, "learning_rate": 6.284789597277168e-06, "loss": 0.4306, "num_input_tokens_seen": 14663584, "step": 14220 }, { "epoch": 9.51505016722408, "grad_norm": 2.1990411281585693, "learning_rate": 6.281968767147183e-06, "loss": 0.3962, "num_input_tokens_seen": 14669248, "step": 14225 }, { "epoch": 9.518394648829432, "grad_norm": 1.4502174854278564, "learning_rate": 6.279147500209998e-06, "loss": 0.4212, "num_input_tokens_seen": 14673920, "step": 14230 }, { "epoch": 9.521739130434783, "grad_norm": 1.9807977676391602, "learning_rate": 6.276325797426905e-06, "loss": 0.3812, "num_input_tokens_seen": 14678432, "step": 14235 }, { "epoch": 9.525083612040135, "grad_norm": 2.4695773124694824, "learning_rate": 6.273503659759347e-06, "loss": 0.5034, "num_input_tokens_seen": 14683968, "step": 14240 }, { "epoch": 9.528428093645484, "grad_norm": 1.567945122718811, "learning_rate": 6.270681088168916e-06, "loss": 0.4111, "num_input_tokens_seen": 14688992, "step": 14245 }, { "epoch": 9.531772575250836, "grad_norm": 1.6294076442718506, "learning_rate": 6.267858083617351e-06, "loss": 0.3699, "num_input_tokens_seen": 14694176, "step": 14250 }, { "epoch": 9.535117056856187, "grad_norm": 2.517285108566284, "learning_rate": 6.265034647066537e-06, "loss": 0.3752, "num_input_tokens_seen": 14699712, "step": 14255 }, { "epoch": 9.538461538461538, "grad_norm": 2.759955406188965, "learning_rate": 6.26221077947851e-06, "loss": 0.3525, "num_input_tokens_seen": 14704384, "step": 14260 }, { "epoch": 9.54180602006689, "grad_norm": 2.412463903427124, "learning_rate": 6.259386481815452e-06, "loss": 0.3932, "num_input_tokens_seen": 14710368, "step": 14265 }, { "epoch": 9.545150501672241, "grad_norm": 1.740065574645996, "learning_rate": 6.2565617550396875e-06, "loss": 0.4288, "num_input_tokens_seen": 14715264, "step": 14270 }, { "epoch": 9.548494983277592, "grad_norm": 3.933805227279663, "learning_rate": 6.253736600113693e-06, "loss": 0.4253, "num_input_tokens_seen": 14720480, "step": 14275 }, { "epoch": 9.551839464882944, "grad_norm": 2.867543935775757, "learning_rate": 6.2509110180000835e-06, "loss": 0.5528, "num_input_tokens_seen": 14725504, "step": 14280 }, { "epoch": 9.555183946488294, "grad_norm": 1.6286518573760986, "learning_rate": 6.248085009661629e-06, "loss": 0.3973, "num_input_tokens_seen": 14731904, "step": 14285 }, { "epoch": 9.558528428093645, "grad_norm": 2.176163911819458, "learning_rate": 6.245258576061236e-06, "loss": 0.3475, "num_input_tokens_seen": 14736896, "step": 14290 }, { "epoch": 9.561872909698996, "grad_norm": 2.3159213066101074, "learning_rate": 6.24243171816196e-06, "loss": 0.4274, "num_input_tokens_seen": 14742272, "step": 14295 }, { "epoch": 9.565217391304348, "grad_norm": 1.8921695947647095, "learning_rate": 6.239604436927001e-06, "loss": 0.3679, "num_input_tokens_seen": 14747072, "step": 14300 }, { "epoch": 9.568561872909699, "grad_norm": 2.0268449783325195, "learning_rate": 6.236776733319705e-06, "loss": 0.5148, "num_input_tokens_seen": 14751648, "step": 14305 }, { "epoch": 9.57190635451505, "grad_norm": 1.9835011959075928, "learning_rate": 6.233948608303559e-06, "loss": 0.3593, "num_input_tokens_seen": 14756000, "step": 14310 }, { "epoch": 9.575250836120402, "grad_norm": 2.5723068714141846, "learning_rate": 6.231120062842192e-06, "loss": 0.4448, "num_input_tokens_seen": 14760352, "step": 14315 }, { "epoch": 9.578595317725753, "grad_norm": 2.8529486656188965, "learning_rate": 6.228291097899382e-06, "loss": 0.4501, "num_input_tokens_seen": 14765280, "step": 14320 }, { "epoch": 9.581939799331103, "grad_norm": 2.657670736312866, "learning_rate": 6.225461714439043e-06, "loss": 0.4813, "num_input_tokens_seen": 14770624, "step": 14325 }, { "epoch": 9.585284280936454, "grad_norm": 1.8814544677734375, "learning_rate": 6.222631913425238e-06, "loss": 0.3687, "num_input_tokens_seen": 14775008, "step": 14330 }, { "epoch": 9.588628762541806, "grad_norm": 1.6711384057998657, "learning_rate": 6.21980169582217e-06, "loss": 0.337, "num_input_tokens_seen": 14781120, "step": 14335 }, { "epoch": 9.591973244147157, "grad_norm": 1.825457215309143, "learning_rate": 6.216971062594179e-06, "loss": 0.4631, "num_input_tokens_seen": 14786208, "step": 14340 }, { "epoch": 9.595317725752508, "grad_norm": 2.0661723613739014, "learning_rate": 6.214140014705757e-06, "loss": 0.4382, "num_input_tokens_seen": 14790272, "step": 14345 }, { "epoch": 9.59866220735786, "grad_norm": 2.0415165424346924, "learning_rate": 6.2113085531215265e-06, "loss": 0.394, "num_input_tokens_seen": 14795648, "step": 14350 }, { "epoch": 9.602006688963211, "grad_norm": 3.682448148727417, "learning_rate": 6.208476678806259e-06, "loss": 0.4564, "num_input_tokens_seen": 14800992, "step": 14355 }, { "epoch": 9.605351170568563, "grad_norm": 2.4872920513153076, "learning_rate": 6.20564439272486e-06, "loss": 0.4729, "num_input_tokens_seen": 14806528, "step": 14360 }, { "epoch": 9.608695652173914, "grad_norm": 1.7228834629058838, "learning_rate": 6.2028116958423835e-06, "loss": 0.5047, "num_input_tokens_seen": 14811424, "step": 14365 }, { "epoch": 9.612040133779264, "grad_norm": 2.491793632507324, "learning_rate": 6.199978589124014e-06, "loss": 0.4167, "num_input_tokens_seen": 14816704, "step": 14370 }, { "epoch": 9.615384615384615, "grad_norm": 1.88848876953125, "learning_rate": 6.197145073535085e-06, "loss": 0.3783, "num_input_tokens_seen": 14821248, "step": 14375 }, { "epoch": 9.618729096989966, "grad_norm": 2.141126871109009, "learning_rate": 6.19431115004106e-06, "loss": 0.415, "num_input_tokens_seen": 14826720, "step": 14380 }, { "epoch": 9.622073578595318, "grad_norm": 2.1529009342193604, "learning_rate": 6.191476819607548e-06, "loss": 0.4166, "num_input_tokens_seen": 14832224, "step": 14385 }, { "epoch": 9.62541806020067, "grad_norm": 2.3816897869110107, "learning_rate": 6.188642083200297e-06, "loss": 0.408, "num_input_tokens_seen": 14836832, "step": 14390 }, { "epoch": 9.62876254180602, "grad_norm": 4.366697311401367, "learning_rate": 6.185806941785187e-06, "loss": 0.4685, "num_input_tokens_seen": 14842048, "step": 14395 }, { "epoch": 9.632107023411372, "grad_norm": 2.1622042655944824, "learning_rate": 6.182971396328242e-06, "loss": 0.3723, "num_input_tokens_seen": 14846784, "step": 14400 }, { "epoch": 9.635451505016722, "grad_norm": 1.9305660724639893, "learning_rate": 6.180135447795622e-06, "loss": 0.4294, "num_input_tokens_seen": 14852896, "step": 14405 }, { "epoch": 9.638795986622073, "grad_norm": 2.055896759033203, "learning_rate": 6.177299097153626e-06, "loss": 0.478, "num_input_tokens_seen": 14856992, "step": 14410 }, { "epoch": 9.642140468227424, "grad_norm": 1.7987654209136963, "learning_rate": 6.174462345368685e-06, "loss": 0.4195, "num_input_tokens_seen": 14863104, "step": 14415 }, { "epoch": 9.645484949832776, "grad_norm": 1.5559903383255005, "learning_rate": 6.171625193407371e-06, "loss": 0.3994, "num_input_tokens_seen": 14868160, "step": 14420 }, { "epoch": 9.648829431438127, "grad_norm": 2.979375123977661, "learning_rate": 6.168787642236393e-06, "loss": 0.4718, "num_input_tokens_seen": 14872704, "step": 14425 }, { "epoch": 9.652173913043478, "grad_norm": 3.2310128211975098, "learning_rate": 6.16594969282259e-06, "loss": 0.4127, "num_input_tokens_seen": 14877568, "step": 14430 }, { "epoch": 9.65551839464883, "grad_norm": 1.6707264184951782, "learning_rate": 6.163111346132946e-06, "loss": 0.3728, "num_input_tokens_seen": 14882784, "step": 14435 }, { "epoch": 9.658862876254181, "grad_norm": 2.3661255836486816, "learning_rate": 6.160272603134571e-06, "loss": 0.4607, "num_input_tokens_seen": 14888608, "step": 14440 }, { "epoch": 9.662207357859533, "grad_norm": 1.84927499294281, "learning_rate": 6.157433464794717e-06, "loss": 0.4511, "num_input_tokens_seen": 14893792, "step": 14445 }, { "epoch": 9.665551839464882, "grad_norm": 1.615002989768982, "learning_rate": 6.1545939320807645e-06, "loss": 0.3828, "num_input_tokens_seen": 14898848, "step": 14450 }, { "epoch": 9.668896321070234, "grad_norm": 1.6950459480285645, "learning_rate": 6.151754005960236e-06, "loss": 0.4118, "num_input_tokens_seen": 14904832, "step": 14455 }, { "epoch": 9.672240802675585, "grad_norm": 2.568969249725342, "learning_rate": 6.148913687400781e-06, "loss": 0.4585, "num_input_tokens_seen": 14910848, "step": 14460 }, { "epoch": 9.675585284280936, "grad_norm": 1.099061131477356, "learning_rate": 6.14607297737019e-06, "loss": 0.456, "num_input_tokens_seen": 14915904, "step": 14465 }, { "epoch": 9.678929765886288, "grad_norm": 2.2661705017089844, "learning_rate": 6.143231876836377e-06, "loss": 0.4305, "num_input_tokens_seen": 14921056, "step": 14470 }, { "epoch": 9.68227424749164, "grad_norm": 1.5595937967300415, "learning_rate": 6.140390386767398e-06, "loss": 0.434, "num_input_tokens_seen": 14926240, "step": 14475 }, { "epoch": 9.68561872909699, "grad_norm": 1.8128973245620728, "learning_rate": 6.137548508131437e-06, "loss": 0.3404, "num_input_tokens_seen": 14931456, "step": 14480 }, { "epoch": 9.68896321070234, "grad_norm": 1.726056694984436, "learning_rate": 6.134706241896811e-06, "loss": 0.4338, "num_input_tokens_seen": 14936448, "step": 14485 }, { "epoch": 9.692307692307692, "grad_norm": 1.4552268981933594, "learning_rate": 6.131863589031973e-06, "loss": 0.3846, "num_input_tokens_seen": 14941376, "step": 14490 }, { "epoch": 9.695652173913043, "grad_norm": 1.8162224292755127, "learning_rate": 6.1290205505055e-06, "loss": 0.4327, "num_input_tokens_seen": 14947616, "step": 14495 }, { "epoch": 9.698996655518394, "grad_norm": 2.1247878074645996, "learning_rate": 6.126177127286108e-06, "loss": 0.4314, "num_input_tokens_seen": 14952160, "step": 14500 }, { "epoch": 9.702341137123746, "grad_norm": 1.6849143505096436, "learning_rate": 6.123333320342639e-06, "loss": 0.4279, "num_input_tokens_seen": 14957312, "step": 14505 }, { "epoch": 9.705685618729097, "grad_norm": 1.4544166326522827, "learning_rate": 6.120489130644071e-06, "loss": 0.3848, "num_input_tokens_seen": 14962848, "step": 14510 }, { "epoch": 9.709030100334449, "grad_norm": 2.4177050590515137, "learning_rate": 6.117644559159508e-06, "loss": 0.4455, "num_input_tokens_seen": 14967456, "step": 14515 }, { "epoch": 9.7123745819398, "grad_norm": 1.6382144689559937, "learning_rate": 6.1147996068581815e-06, "loss": 0.3549, "num_input_tokens_seen": 14972704, "step": 14520 }, { "epoch": 9.715719063545151, "grad_norm": 2.2248876094818115, "learning_rate": 6.111954274709462e-06, "loss": 0.376, "num_input_tokens_seen": 14977920, "step": 14525 }, { "epoch": 9.719063545150501, "grad_norm": 2.057406187057495, "learning_rate": 6.1091085636828385e-06, "loss": 0.4863, "num_input_tokens_seen": 14983168, "step": 14530 }, { "epoch": 9.722408026755852, "grad_norm": 2.6840198040008545, "learning_rate": 6.106262474747939e-06, "loss": 0.3936, "num_input_tokens_seen": 14987872, "step": 14535 }, { "epoch": 9.725752508361204, "grad_norm": 2.368051052093506, "learning_rate": 6.1034160088745144e-06, "loss": 0.4351, "num_input_tokens_seen": 14993184, "step": 14540 }, { "epoch": 9.729096989966555, "grad_norm": 3.254365921020508, "learning_rate": 6.100569167032444e-06, "loss": 0.414, "num_input_tokens_seen": 14997984, "step": 14545 }, { "epoch": 9.732441471571907, "grad_norm": 2.179638385772705, "learning_rate": 6.097721950191738e-06, "loss": 0.3772, "num_input_tokens_seen": 15003168, "step": 14550 }, { "epoch": 9.735785953177258, "grad_norm": 2.690885305404663, "learning_rate": 6.094874359322534e-06, "loss": 0.3511, "num_input_tokens_seen": 15008096, "step": 14555 }, { "epoch": 9.73913043478261, "grad_norm": 1.9872318506240845, "learning_rate": 6.092026395395092e-06, "loss": 0.3643, "num_input_tokens_seen": 15012832, "step": 14560 }, { "epoch": 9.742474916387959, "grad_norm": 2.484468460083008, "learning_rate": 6.089178059379809e-06, "loss": 0.4419, "num_input_tokens_seen": 15017856, "step": 14565 }, { "epoch": 9.74581939799331, "grad_norm": 1.7683196067810059, "learning_rate": 6.086329352247199e-06, "loss": 0.4005, "num_input_tokens_seen": 15023328, "step": 14570 }, { "epoch": 9.749163879598662, "grad_norm": 1.9876457452774048, "learning_rate": 6.083480274967907e-06, "loss": 0.4667, "num_input_tokens_seen": 15029600, "step": 14575 }, { "epoch": 9.752508361204013, "grad_norm": 1.6568912267684937, "learning_rate": 6.080630828512703e-06, "loss": 0.3433, "num_input_tokens_seen": 15035040, "step": 14580 }, { "epoch": 9.755852842809364, "grad_norm": 1.7594068050384521, "learning_rate": 6.077781013852485e-06, "loss": 0.3722, "num_input_tokens_seen": 15040992, "step": 14585 }, { "epoch": 9.759197324414716, "grad_norm": 1.8803513050079346, "learning_rate": 6.074930831958274e-06, "loss": 0.4324, "num_input_tokens_seen": 15045792, "step": 14590 }, { "epoch": 9.762541806020067, "grad_norm": 1.679745078086853, "learning_rate": 6.072080283801216e-06, "loss": 0.4686, "num_input_tokens_seen": 15050592, "step": 14595 }, { "epoch": 9.765886287625419, "grad_norm": 2.121248483657837, "learning_rate": 6.069229370352584e-06, "loss": 0.5015, "num_input_tokens_seen": 15055840, "step": 14600 }, { "epoch": 9.76923076923077, "grad_norm": 2.7856781482696533, "learning_rate": 6.066378092583772e-06, "loss": 0.4123, "num_input_tokens_seen": 15060832, "step": 14605 }, { "epoch": 9.77257525083612, "grad_norm": 1.915785551071167, "learning_rate": 6.063526451466304e-06, "loss": 0.4674, "num_input_tokens_seen": 15065856, "step": 14610 }, { "epoch": 9.775919732441471, "grad_norm": 2.363466262817383, "learning_rate": 6.0606744479718225e-06, "loss": 0.4942, "num_input_tokens_seen": 15070784, "step": 14615 }, { "epoch": 9.779264214046822, "grad_norm": 2.01228404045105, "learning_rate": 6.057822083072092e-06, "loss": 0.4123, "num_input_tokens_seen": 15075712, "step": 14620 }, { "epoch": 9.782608695652174, "grad_norm": 2.0287182331085205, "learning_rate": 6.054969357739008e-06, "loss": 0.4232, "num_input_tokens_seen": 15080640, "step": 14625 }, { "epoch": 9.785953177257525, "grad_norm": 1.770209550857544, "learning_rate": 6.0521162729445804e-06, "loss": 0.4713, "num_input_tokens_seen": 15087360, "step": 14630 }, { "epoch": 9.789297658862877, "grad_norm": 2.399038791656494, "learning_rate": 6.049262829660948e-06, "loss": 0.4019, "num_input_tokens_seen": 15092256, "step": 14635 }, { "epoch": 9.792642140468228, "grad_norm": 3.2990775108337402, "learning_rate": 6.046409028860365e-06, "loss": 0.3952, "num_input_tokens_seen": 15097344, "step": 14640 }, { "epoch": 9.79598662207358, "grad_norm": 2.552884101867676, "learning_rate": 6.043554871515216e-06, "loss": 0.4201, "num_input_tokens_seen": 15102112, "step": 14645 }, { "epoch": 9.799331103678929, "grad_norm": 2.476630687713623, "learning_rate": 6.040700358598e-06, "loss": 0.5483, "num_input_tokens_seen": 15106752, "step": 14650 }, { "epoch": 9.80267558528428, "grad_norm": 2.2045047283172607, "learning_rate": 6.037845491081339e-06, "loss": 0.4917, "num_input_tokens_seen": 15111808, "step": 14655 }, { "epoch": 9.806020066889632, "grad_norm": 1.652010440826416, "learning_rate": 6.034990269937978e-06, "loss": 0.3669, "num_input_tokens_seen": 15117728, "step": 14660 }, { "epoch": 9.809364548494983, "grad_norm": 1.9166711568832397, "learning_rate": 6.032134696140779e-06, "loss": 0.4254, "num_input_tokens_seen": 15123264, "step": 14665 }, { "epoch": 9.812709030100335, "grad_norm": 1.486627459526062, "learning_rate": 6.0292787706627295e-06, "loss": 0.4343, "num_input_tokens_seen": 15128544, "step": 14670 }, { "epoch": 9.816053511705686, "grad_norm": 2.1795384883880615, "learning_rate": 6.0264224944769305e-06, "loss": 0.4019, "num_input_tokens_seen": 15133568, "step": 14675 }, { "epoch": 9.819397993311037, "grad_norm": 1.3085042238235474, "learning_rate": 6.023565868556605e-06, "loss": 0.3961, "num_input_tokens_seen": 15138720, "step": 14680 }, { "epoch": 9.822742474916389, "grad_norm": 1.5757650136947632, "learning_rate": 6.020708893875098e-06, "loss": 0.5067, "num_input_tokens_seen": 15143296, "step": 14685 }, { "epoch": 9.826086956521738, "grad_norm": 1.491220474243164, "learning_rate": 6.01785157140587e-06, "loss": 0.4379, "num_input_tokens_seen": 15148544, "step": 14690 }, { "epoch": 9.82943143812709, "grad_norm": 2.538580894470215, "learning_rate": 6.014993902122499e-06, "loss": 0.4078, "num_input_tokens_seen": 15153024, "step": 14695 }, { "epoch": 9.832775919732441, "grad_norm": 2.75288724899292, "learning_rate": 6.012135886998684e-06, "loss": 0.43, "num_input_tokens_seen": 15158464, "step": 14700 }, { "epoch": 9.836120401337793, "grad_norm": 1.8440603017807007, "learning_rate": 6.009277527008244e-06, "loss": 0.4089, "num_input_tokens_seen": 15163840, "step": 14705 }, { "epoch": 9.839464882943144, "grad_norm": 2.198467254638672, "learning_rate": 6.0064188231251084e-06, "loss": 0.4065, "num_input_tokens_seen": 15168480, "step": 14710 }, { "epoch": 9.842809364548495, "grad_norm": 2.8434669971466064, "learning_rate": 6.003559776323331e-06, "loss": 0.4661, "num_input_tokens_seen": 15173600, "step": 14715 }, { "epoch": 9.846153846153847, "grad_norm": 1.6975178718566895, "learning_rate": 6.000700387577075e-06, "loss": 0.3897, "num_input_tokens_seen": 15178592, "step": 14720 }, { "epoch": 9.849498327759198, "grad_norm": 2.3052005767822266, "learning_rate": 5.9978406578606295e-06, "loss": 0.4152, "num_input_tokens_seen": 15183776, "step": 14725 }, { "epoch": 9.852842809364548, "grad_norm": 1.9360620975494385, "learning_rate": 5.994980588148391e-06, "loss": 0.389, "num_input_tokens_seen": 15188672, "step": 14730 }, { "epoch": 9.856187290969899, "grad_norm": 2.222188711166382, "learning_rate": 5.992120179414878e-06, "loss": 0.3948, "num_input_tokens_seen": 15194144, "step": 14735 }, { "epoch": 9.85953177257525, "grad_norm": 2.3698973655700684, "learning_rate": 5.989259432634717e-06, "loss": 0.4883, "num_input_tokens_seen": 15200736, "step": 14740 }, { "epoch": 9.862876254180602, "grad_norm": 1.9306340217590332, "learning_rate": 5.986398348782661e-06, "loss": 0.3752, "num_input_tokens_seen": 15205696, "step": 14745 }, { "epoch": 9.866220735785953, "grad_norm": 1.7260961532592773, "learning_rate": 5.983536928833569e-06, "loss": 0.4212, "num_input_tokens_seen": 15211552, "step": 14750 }, { "epoch": 9.869565217391305, "grad_norm": 2.2457797527313232, "learning_rate": 5.980675173762415e-06, "loss": 0.4519, "num_input_tokens_seen": 15216768, "step": 14755 }, { "epoch": 9.872909698996656, "grad_norm": 2.6995480060577393, "learning_rate": 5.9778130845442926e-06, "loss": 0.3852, "num_input_tokens_seen": 15221536, "step": 14760 }, { "epoch": 9.876254180602007, "grad_norm": 2.256145715713501, "learning_rate": 5.974950662154403e-06, "loss": 0.4314, "num_input_tokens_seen": 15227168, "step": 14765 }, { "epoch": 9.879598662207357, "grad_norm": 2.2391929626464844, "learning_rate": 5.972087907568067e-06, "loss": 0.4188, "num_input_tokens_seen": 15232352, "step": 14770 }, { "epoch": 9.882943143812708, "grad_norm": 1.9712690114974976, "learning_rate": 5.9692248217607115e-06, "loss": 0.4423, "num_input_tokens_seen": 15238304, "step": 14775 }, { "epoch": 9.88628762541806, "grad_norm": 2.2939863204956055, "learning_rate": 5.966361405707885e-06, "loss": 0.434, "num_input_tokens_seen": 15243200, "step": 14780 }, { "epoch": 9.889632107023411, "grad_norm": 2.6639747619628906, "learning_rate": 5.9634976603852375e-06, "loss": 0.4486, "num_input_tokens_seen": 15247232, "step": 14785 }, { "epoch": 9.892976588628763, "grad_norm": 2.2053024768829346, "learning_rate": 5.9606335867685424e-06, "loss": 0.4271, "num_input_tokens_seen": 15252128, "step": 14790 }, { "epoch": 9.896321070234114, "grad_norm": 1.888987421989441, "learning_rate": 5.957769185833678e-06, "loss": 0.4103, "num_input_tokens_seen": 15257504, "step": 14795 }, { "epoch": 9.899665551839465, "grad_norm": 2.7931435108184814, "learning_rate": 5.954904458556636e-06, "loss": 0.4383, "num_input_tokens_seen": 15263168, "step": 14800 }, { "epoch": 9.903010033444817, "grad_norm": 2.296342134475708, "learning_rate": 5.952039405913522e-06, "loss": 0.4008, "num_input_tokens_seen": 15267872, "step": 14805 }, { "epoch": 9.906354515050166, "grad_norm": 1.6344292163848877, "learning_rate": 5.949174028880546e-06, "loss": 0.4019, "num_input_tokens_seen": 15272736, "step": 14810 }, { "epoch": 9.909698996655518, "grad_norm": 3.208158493041992, "learning_rate": 5.946308328434034e-06, "loss": 0.4949, "num_input_tokens_seen": 15277568, "step": 14815 }, { "epoch": 9.91304347826087, "grad_norm": 1.7836436033248901, "learning_rate": 5.94344230555042e-06, "loss": 0.4711, "num_input_tokens_seen": 15282496, "step": 14820 }, { "epoch": 9.91638795986622, "grad_norm": 2.8325531482696533, "learning_rate": 5.94057596120625e-06, "loss": 0.3517, "num_input_tokens_seen": 15287264, "step": 14825 }, { "epoch": 9.919732441471572, "grad_norm": 1.6406798362731934, "learning_rate": 5.937709296378175e-06, "loss": 0.3754, "num_input_tokens_seen": 15292960, "step": 14830 }, { "epoch": 9.923076923076923, "grad_norm": 1.8791959285736084, "learning_rate": 5.934842312042962e-06, "loss": 0.3724, "num_input_tokens_seen": 15297888, "step": 14835 }, { "epoch": 9.926421404682275, "grad_norm": 2.804340362548828, "learning_rate": 5.93197500917748e-06, "loss": 0.3443, "num_input_tokens_seen": 15302784, "step": 14840 }, { "epoch": 9.929765886287626, "grad_norm": 2.4341230392456055, "learning_rate": 5.929107388758709e-06, "loss": 0.4012, "num_input_tokens_seen": 15308736, "step": 14845 }, { "epoch": 9.933110367892976, "grad_norm": 1.5551165342330933, "learning_rate": 5.926239451763743e-06, "loss": 0.3664, "num_input_tokens_seen": 15313920, "step": 14850 }, { "epoch": 9.936454849498327, "grad_norm": 2.195364236831665, "learning_rate": 5.923371199169771e-06, "loss": 0.4094, "num_input_tokens_seen": 15319136, "step": 14855 }, { "epoch": 9.939799331103679, "grad_norm": 1.5532732009887695, "learning_rate": 5.920502631954104e-06, "loss": 0.5579, "num_input_tokens_seen": 15324736, "step": 14860 }, { "epoch": 9.94314381270903, "grad_norm": 3.0384106636047363, "learning_rate": 5.917633751094149e-06, "loss": 0.4388, "num_input_tokens_seen": 15329728, "step": 14865 }, { "epoch": 9.946488294314381, "grad_norm": 2.7514231204986572, "learning_rate": 5.9147645575674254e-06, "loss": 0.4833, "num_input_tokens_seen": 15334688, "step": 14870 }, { "epoch": 9.949832775919733, "grad_norm": 2.4857075214385986, "learning_rate": 5.911895052351557e-06, "loss": 0.4398, "num_input_tokens_seen": 15340096, "step": 14875 }, { "epoch": 9.953177257525084, "grad_norm": 2.2142558097839355, "learning_rate": 5.909025236424278e-06, "loss": 0.4509, "num_input_tokens_seen": 15345056, "step": 14880 }, { "epoch": 9.956521739130435, "grad_norm": 2.0397989749908447, "learning_rate": 5.906155110763423e-06, "loss": 0.4653, "num_input_tokens_seen": 15350272, "step": 14885 }, { "epoch": 9.959866220735787, "grad_norm": 2.0914430618286133, "learning_rate": 5.903284676346934e-06, "loss": 0.5215, "num_input_tokens_seen": 15354880, "step": 14890 }, { "epoch": 9.963210702341136, "grad_norm": 2.652387857437134, "learning_rate": 5.900413934152859e-06, "loss": 0.4192, "num_input_tokens_seen": 15359872, "step": 14895 }, { "epoch": 9.966555183946488, "grad_norm": 1.7076020240783691, "learning_rate": 5.89754288515935e-06, "loss": 0.4696, "num_input_tokens_seen": 15365152, "step": 14900 }, { "epoch": 9.96989966555184, "grad_norm": 2.782809019088745, "learning_rate": 5.894671530344665e-06, "loss": 0.4089, "num_input_tokens_seen": 15369920, "step": 14905 }, { "epoch": 9.97324414715719, "grad_norm": 1.990537405014038, "learning_rate": 5.891799870687164e-06, "loss": 0.3887, "num_input_tokens_seen": 15374784, "step": 14910 }, { "epoch": 9.976588628762542, "grad_norm": 2.4042739868164062, "learning_rate": 5.888927907165314e-06, "loss": 0.4354, "num_input_tokens_seen": 15379648, "step": 14915 }, { "epoch": 9.979933110367893, "grad_norm": 2.1644749641418457, "learning_rate": 5.88605564075768e-06, "loss": 0.4138, "num_input_tokens_seen": 15384896, "step": 14920 }, { "epoch": 9.983277591973245, "grad_norm": 2.8443443775177, "learning_rate": 5.883183072442938e-06, "loss": 0.4946, "num_input_tokens_seen": 15389120, "step": 14925 }, { "epoch": 9.986622073578594, "grad_norm": 2.2358109951019287, "learning_rate": 5.880310203199859e-06, "loss": 0.413, "num_input_tokens_seen": 15394336, "step": 14930 }, { "epoch": 9.989966555183946, "grad_norm": 3.631425380706787, "learning_rate": 5.877437034007324e-06, "loss": 0.4197, "num_input_tokens_seen": 15400032, "step": 14935 }, { "epoch": 9.993311036789297, "grad_norm": 1.4611217975616455, "learning_rate": 5.8745635658443075e-06, "loss": 0.3704, "num_input_tokens_seen": 15405408, "step": 14940 }, { "epoch": 9.996655518394649, "grad_norm": 2.031480312347412, "learning_rate": 5.871689799689895e-06, "loss": 0.4074, "num_input_tokens_seen": 15410176, "step": 14945 }, { "epoch": 10.0, "grad_norm": 2.4587225914001465, "learning_rate": 5.868815736523269e-06, "loss": 0.4245, "num_input_tokens_seen": 15414800, "step": 14950 }, { "epoch": 10.0, "eval_loss": 0.4962001442909241, "eval_runtime": 37.5659, "eval_samples_per_second": 39.797, "eval_steps_per_second": 9.956, "num_input_tokens_seen": 15414800, "step": 14950 }, { "epoch": 10.003344481605351, "grad_norm": 1.6822718381881714, "learning_rate": 5.865941377323709e-06, "loss": 0.4257, "num_input_tokens_seen": 15420400, "step": 14955 }, { "epoch": 10.006688963210703, "grad_norm": 2.9343557357788086, "learning_rate": 5.8630667230706065e-06, "loss": 0.3598, "num_input_tokens_seen": 15425936, "step": 14960 }, { "epoch": 10.010033444816054, "grad_norm": 2.5386884212493896, "learning_rate": 5.860191774743442e-06, "loss": 0.4428, "num_input_tokens_seen": 15431984, "step": 14965 }, { "epoch": 10.013377926421406, "grad_norm": 2.480494499206543, "learning_rate": 5.857316533321803e-06, "loss": 0.387, "num_input_tokens_seen": 15437040, "step": 14970 }, { "epoch": 10.016722408026755, "grad_norm": 1.8378052711486816, "learning_rate": 5.854440999785374e-06, "loss": 0.4261, "num_input_tokens_seen": 15442096, "step": 14975 }, { "epoch": 10.020066889632107, "grad_norm": 2.351938247680664, "learning_rate": 5.851565175113942e-06, "loss": 0.3738, "num_input_tokens_seen": 15446736, "step": 14980 }, { "epoch": 10.023411371237458, "grad_norm": 2.150003433227539, "learning_rate": 5.848689060287388e-06, "loss": 0.3891, "num_input_tokens_seen": 15452368, "step": 14985 }, { "epoch": 10.02675585284281, "grad_norm": 1.7239530086517334, "learning_rate": 5.845812656285698e-06, "loss": 0.3796, "num_input_tokens_seen": 15458000, "step": 14990 }, { "epoch": 10.03010033444816, "grad_norm": 2.0227484703063965, "learning_rate": 5.84293596408895e-06, "loss": 0.401, "num_input_tokens_seen": 15462832, "step": 14995 }, { "epoch": 10.033444816053512, "grad_norm": 2.0436782836914062, "learning_rate": 5.840058984677328e-06, "loss": 0.344, "num_input_tokens_seen": 15468656, "step": 15000 }, { "epoch": 10.036789297658864, "grad_norm": 2.0127246379852295, "learning_rate": 5.837181719031109e-06, "loss": 0.3601, "num_input_tokens_seen": 15474064, "step": 15005 }, { "epoch": 10.040133779264215, "grad_norm": 2.095538854598999, "learning_rate": 5.834304168130664e-06, "loss": 0.4444, "num_input_tokens_seen": 15479664, "step": 15010 }, { "epoch": 10.043478260869565, "grad_norm": 1.801506757736206, "learning_rate": 5.831426332956471e-06, "loss": 0.4506, "num_input_tokens_seen": 15485616, "step": 15015 }, { "epoch": 10.046822742474916, "grad_norm": 1.9086862802505493, "learning_rate": 5.828548214489096e-06, "loss": 0.3741, "num_input_tokens_seen": 15491408, "step": 15020 }, { "epoch": 10.050167224080267, "grad_norm": 1.994661569595337, "learning_rate": 5.825669813709205e-06, "loss": 0.3422, "num_input_tokens_seen": 15497104, "step": 15025 }, { "epoch": 10.053511705685619, "grad_norm": 2.0672106742858887, "learning_rate": 5.82279113159756e-06, "loss": 0.3879, "num_input_tokens_seen": 15502096, "step": 15030 }, { "epoch": 10.05685618729097, "grad_norm": 2.1842877864837646, "learning_rate": 5.81991216913502e-06, "loss": 0.3364, "num_input_tokens_seen": 15506448, "step": 15035 }, { "epoch": 10.060200668896321, "grad_norm": 1.8687657117843628, "learning_rate": 5.817032927302537e-06, "loss": 0.3106, "num_input_tokens_seen": 15511568, "step": 15040 }, { "epoch": 10.063545150501673, "grad_norm": 1.8452956676483154, "learning_rate": 5.814153407081159e-06, "loss": 0.4783, "num_input_tokens_seen": 15517104, "step": 15045 }, { "epoch": 10.066889632107024, "grad_norm": 3.004192590713501, "learning_rate": 5.811273609452033e-06, "loss": 0.4202, "num_input_tokens_seen": 15522160, "step": 15050 }, { "epoch": 10.070234113712374, "grad_norm": 3.1726787090301514, "learning_rate": 5.808393535396391e-06, "loss": 0.4729, "num_input_tokens_seen": 15527312, "step": 15055 }, { "epoch": 10.073578595317725, "grad_norm": 2.2084407806396484, "learning_rate": 5.805513185895572e-06, "loss": 0.3717, "num_input_tokens_seen": 15531792, "step": 15060 }, { "epoch": 10.076923076923077, "grad_norm": 1.873142957687378, "learning_rate": 5.802632561930995e-06, "loss": 0.3732, "num_input_tokens_seen": 15536528, "step": 15065 }, { "epoch": 10.080267558528428, "grad_norm": 2.302353858947754, "learning_rate": 5.799751664484184e-06, "loss": 0.3452, "num_input_tokens_seen": 15540848, "step": 15070 }, { "epoch": 10.08361204013378, "grad_norm": 1.9142686128616333, "learning_rate": 5.796870494536751e-06, "loss": 0.3686, "num_input_tokens_seen": 15545360, "step": 15075 }, { "epoch": 10.08695652173913, "grad_norm": 2.149008274078369, "learning_rate": 5.793989053070401e-06, "loss": 0.4258, "num_input_tokens_seen": 15550864, "step": 15080 }, { "epoch": 10.090301003344482, "grad_norm": 1.6481643915176392, "learning_rate": 5.7911073410669315e-06, "loss": 0.4895, "num_input_tokens_seen": 15556368, "step": 15085 }, { "epoch": 10.093645484949834, "grad_norm": 2.669559955596924, "learning_rate": 5.7882253595082326e-06, "loss": 0.4291, "num_input_tokens_seen": 15562096, "step": 15090 }, { "epoch": 10.096989966555183, "grad_norm": 1.8999112844467163, "learning_rate": 5.785343109376287e-06, "loss": 0.4313, "num_input_tokens_seen": 15566544, "step": 15095 }, { "epoch": 10.100334448160535, "grad_norm": 1.9113131761550903, "learning_rate": 5.782460591653168e-06, "loss": 0.4228, "num_input_tokens_seen": 15571920, "step": 15100 }, { "epoch": 10.103678929765886, "grad_norm": 2.222836971282959, "learning_rate": 5.779577807321042e-06, "loss": 0.4589, "num_input_tokens_seen": 15577552, "step": 15105 }, { "epoch": 10.107023411371237, "grad_norm": 2.233785390853882, "learning_rate": 5.776694757362161e-06, "loss": 0.3976, "num_input_tokens_seen": 15583696, "step": 15110 }, { "epoch": 10.110367892976589, "grad_norm": 1.7170300483703613, "learning_rate": 5.773811442758874e-06, "loss": 0.3966, "num_input_tokens_seen": 15588016, "step": 15115 }, { "epoch": 10.11371237458194, "grad_norm": 1.7710685729980469, "learning_rate": 5.7709278644936164e-06, "loss": 0.4265, "num_input_tokens_seen": 15592784, "step": 15120 }, { "epoch": 10.117056856187292, "grad_norm": 2.5902884006500244, "learning_rate": 5.768044023548914e-06, "loss": 0.4077, "num_input_tokens_seen": 15597808, "step": 15125 }, { "epoch": 10.120401337792643, "grad_norm": 2.088146924972534, "learning_rate": 5.765159920907384e-06, "loss": 0.4034, "num_input_tokens_seen": 15602544, "step": 15130 }, { "epoch": 10.123745819397993, "grad_norm": 5.540666103363037, "learning_rate": 5.762275557551728e-06, "loss": 0.4262, "num_input_tokens_seen": 15606896, "step": 15135 }, { "epoch": 10.127090301003344, "grad_norm": 3.791783094406128, "learning_rate": 5.759390934464741e-06, "loss": 0.4158, "num_input_tokens_seen": 15611440, "step": 15140 }, { "epoch": 10.130434782608695, "grad_norm": 1.6938694715499878, "learning_rate": 5.756506052629306e-06, "loss": 0.4071, "num_input_tokens_seen": 15617136, "step": 15145 }, { "epoch": 10.133779264214047, "grad_norm": 2.6614696979522705, "learning_rate": 5.753620913028394e-06, "loss": 0.4736, "num_input_tokens_seen": 15622544, "step": 15150 }, { "epoch": 10.137123745819398, "grad_norm": 1.7641748189926147, "learning_rate": 5.75073551664506e-06, "loss": 0.4166, "num_input_tokens_seen": 15627696, "step": 15155 }, { "epoch": 10.14046822742475, "grad_norm": 2.0741305351257324, "learning_rate": 5.747849864462453e-06, "loss": 0.3938, "num_input_tokens_seen": 15633424, "step": 15160 }, { "epoch": 10.143812709030101, "grad_norm": 2.043344259262085, "learning_rate": 5.744963957463802e-06, "loss": 0.3882, "num_input_tokens_seen": 15638768, "step": 15165 }, { "epoch": 10.147157190635452, "grad_norm": 1.8321954011917114, "learning_rate": 5.74207779663243e-06, "loss": 0.4796, "num_input_tokens_seen": 15644336, "step": 15170 }, { "epoch": 10.150501672240802, "grad_norm": 3.326425552368164, "learning_rate": 5.739191382951742e-06, "loss": 0.3964, "num_input_tokens_seen": 15649776, "step": 15175 }, { "epoch": 10.153846153846153, "grad_norm": 1.6243329048156738, "learning_rate": 5.73630471740523e-06, "loss": 0.4965, "num_input_tokens_seen": 15655120, "step": 15180 }, { "epoch": 10.157190635451505, "grad_norm": 1.5887538194656372, "learning_rate": 5.733417800976471e-06, "loss": 0.3948, "num_input_tokens_seen": 15660752, "step": 15185 }, { "epoch": 10.160535117056856, "grad_norm": 1.5486788749694824, "learning_rate": 5.73053063464913e-06, "loss": 0.3415, "num_input_tokens_seen": 15666096, "step": 15190 }, { "epoch": 10.163879598662207, "grad_norm": 2.075780153274536, "learning_rate": 5.727643219406955e-06, "loss": 0.3024, "num_input_tokens_seen": 15671792, "step": 15195 }, { "epoch": 10.167224080267559, "grad_norm": 1.5429292917251587, "learning_rate": 5.724755556233778e-06, "loss": 0.3454, "num_input_tokens_seen": 15677456, "step": 15200 }, { "epoch": 10.17056856187291, "grad_norm": 1.789829969406128, "learning_rate": 5.72186764611352e-06, "loss": 0.4211, "num_input_tokens_seen": 15682416, "step": 15205 }, { "epoch": 10.173913043478262, "grad_norm": 4.06287956237793, "learning_rate": 5.718979490030181e-06, "loss": 0.4079, "num_input_tokens_seen": 15687472, "step": 15210 }, { "epoch": 10.177257525083611, "grad_norm": 2.3423709869384766, "learning_rate": 5.716091088967848e-06, "loss": 0.4084, "num_input_tokens_seen": 15691664, "step": 15215 }, { "epoch": 10.180602006688963, "grad_norm": 2.4249541759490967, "learning_rate": 5.71320244391069e-06, "loss": 0.3909, "num_input_tokens_seen": 15696592, "step": 15220 }, { "epoch": 10.183946488294314, "grad_norm": 2.228808641433716, "learning_rate": 5.710313555842959e-06, "loss": 0.4041, "num_input_tokens_seen": 15701424, "step": 15225 }, { "epoch": 10.187290969899665, "grad_norm": 2.303605079650879, "learning_rate": 5.707424425748991e-06, "loss": 0.3807, "num_input_tokens_seen": 15706608, "step": 15230 }, { "epoch": 10.190635451505017, "grad_norm": 4.739552974700928, "learning_rate": 5.704535054613202e-06, "loss": 0.5006, "num_input_tokens_seen": 15711216, "step": 15235 }, { "epoch": 10.193979933110368, "grad_norm": 1.6487942934036255, "learning_rate": 5.701645443420094e-06, "loss": 0.3779, "num_input_tokens_seen": 15716080, "step": 15240 }, { "epoch": 10.19732441471572, "grad_norm": 1.8267710208892822, "learning_rate": 5.698755593154248e-06, "loss": 0.4061, "num_input_tokens_seen": 15721360, "step": 15245 }, { "epoch": 10.200668896321071, "grad_norm": 2.684081554412842, "learning_rate": 5.695865504800328e-06, "loss": 0.3538, "num_input_tokens_seen": 15726704, "step": 15250 }, { "epoch": 10.20401337792642, "grad_norm": 2.7792062759399414, "learning_rate": 5.6929751793430765e-06, "loss": 0.4121, "num_input_tokens_seen": 15732240, "step": 15255 }, { "epoch": 10.207357859531772, "grad_norm": 2.417271614074707, "learning_rate": 5.69008461776732e-06, "loss": 0.4318, "num_input_tokens_seen": 15738320, "step": 15260 }, { "epoch": 10.210702341137123, "grad_norm": 2.3066461086273193, "learning_rate": 5.687193821057963e-06, "loss": 0.4353, "num_input_tokens_seen": 15742896, "step": 15265 }, { "epoch": 10.214046822742475, "grad_norm": 2.20462703704834, "learning_rate": 5.684302790199992e-06, "loss": 0.2906, "num_input_tokens_seen": 15747152, "step": 15270 }, { "epoch": 10.217391304347826, "grad_norm": 2.0719263553619385, "learning_rate": 5.681411526178473e-06, "loss": 0.42, "num_input_tokens_seen": 15752912, "step": 15275 }, { "epoch": 10.220735785953178, "grad_norm": 3.070694923400879, "learning_rate": 5.678520029978548e-06, "loss": 0.4539, "num_input_tokens_seen": 15758544, "step": 15280 }, { "epoch": 10.224080267558529, "grad_norm": 2.3109688758850098, "learning_rate": 5.675628302585443e-06, "loss": 0.4232, "num_input_tokens_seen": 15763984, "step": 15285 }, { "epoch": 10.22742474916388, "grad_norm": 2.4323692321777344, "learning_rate": 5.672736344984463e-06, "loss": 0.4047, "num_input_tokens_seen": 15768816, "step": 15290 }, { "epoch": 10.23076923076923, "grad_norm": 1.4479376077651978, "learning_rate": 5.669844158160984e-06, "loss": 0.3779, "num_input_tokens_seen": 15773872, "step": 15295 }, { "epoch": 10.234113712374581, "grad_norm": 1.5777853727340698, "learning_rate": 5.666951743100469e-06, "loss": 0.4149, "num_input_tokens_seen": 15778864, "step": 15300 }, { "epoch": 10.237458193979933, "grad_norm": 1.9595694541931152, "learning_rate": 5.664059100788456e-06, "loss": 0.4086, "num_input_tokens_seen": 15783440, "step": 15305 }, { "epoch": 10.240802675585284, "grad_norm": 1.9327319860458374, "learning_rate": 5.661166232210555e-06, "loss": 0.4406, "num_input_tokens_seen": 15788048, "step": 15310 }, { "epoch": 10.244147157190636, "grad_norm": 1.9446675777435303, "learning_rate": 5.6582731383524625e-06, "loss": 0.352, "num_input_tokens_seen": 15792784, "step": 15315 }, { "epoch": 10.247491638795987, "grad_norm": 1.5804858207702637, "learning_rate": 5.655379820199944e-06, "loss": 0.3977, "num_input_tokens_seen": 15797360, "step": 15320 }, { "epoch": 10.250836120401338, "grad_norm": 1.9978960752487183, "learning_rate": 5.652486278738845e-06, "loss": 0.3806, "num_input_tokens_seen": 15802000, "step": 15325 }, { "epoch": 10.25418060200669, "grad_norm": 2.3794660568237305, "learning_rate": 5.649592514955087e-06, "loss": 0.4244, "num_input_tokens_seen": 15806448, "step": 15330 }, { "epoch": 10.25752508361204, "grad_norm": 1.9027925729751587, "learning_rate": 5.6466985298346634e-06, "loss": 0.3795, "num_input_tokens_seen": 15812368, "step": 15335 }, { "epoch": 10.26086956521739, "grad_norm": 2.5947229862213135, "learning_rate": 5.64380432436365e-06, "loss": 0.4062, "num_input_tokens_seen": 15817840, "step": 15340 }, { "epoch": 10.264214046822742, "grad_norm": 2.3069260120391846, "learning_rate": 5.640909899528192e-06, "loss": 0.4044, "num_input_tokens_seen": 15822096, "step": 15345 }, { "epoch": 10.267558528428093, "grad_norm": 2.362356185913086, "learning_rate": 5.638015256314513e-06, "loss": 0.4084, "num_input_tokens_seen": 15826960, "step": 15350 }, { "epoch": 10.270903010033445, "grad_norm": 2.1143288612365723, "learning_rate": 5.635120395708907e-06, "loss": 0.5324, "num_input_tokens_seen": 15832752, "step": 15355 }, { "epoch": 10.274247491638796, "grad_norm": 1.884077548980713, "learning_rate": 5.632225318697747e-06, "loss": 0.3968, "num_input_tokens_seen": 15838064, "step": 15360 }, { "epoch": 10.277591973244148, "grad_norm": 2.5907163619995117, "learning_rate": 5.629330026267475e-06, "loss": 0.4053, "num_input_tokens_seen": 15842512, "step": 15365 }, { "epoch": 10.280936454849499, "grad_norm": 2.978778600692749, "learning_rate": 5.626434519404609e-06, "loss": 0.4039, "num_input_tokens_seen": 15848016, "step": 15370 }, { "epoch": 10.284280936454849, "grad_norm": 2.2785050868988037, "learning_rate": 5.62353879909574e-06, "loss": 0.4316, "num_input_tokens_seen": 15852848, "step": 15375 }, { "epoch": 10.2876254180602, "grad_norm": 2.2986092567443848, "learning_rate": 5.620642866327529e-06, "loss": 0.3383, "num_input_tokens_seen": 15857648, "step": 15380 }, { "epoch": 10.290969899665551, "grad_norm": 1.9743772745132446, "learning_rate": 5.6177467220867145e-06, "loss": 0.4148, "num_input_tokens_seen": 15862544, "step": 15385 }, { "epoch": 10.294314381270903, "grad_norm": 1.703893780708313, "learning_rate": 5.614850367360103e-06, "loss": 0.4129, "num_input_tokens_seen": 15867632, "step": 15390 }, { "epoch": 10.297658862876254, "grad_norm": 2.4506542682647705, "learning_rate": 5.611953803134574e-06, "loss": 0.403, "num_input_tokens_seen": 15872176, "step": 15395 }, { "epoch": 10.301003344481606, "grad_norm": 1.542394995689392, "learning_rate": 5.609057030397079e-06, "loss": 0.3383, "num_input_tokens_seen": 15876816, "step": 15400 }, { "epoch": 10.304347826086957, "grad_norm": 1.9014216661453247, "learning_rate": 5.606160050134639e-06, "loss": 0.4343, "num_input_tokens_seen": 15882128, "step": 15405 }, { "epoch": 10.307692307692308, "grad_norm": 2.489668846130371, "learning_rate": 5.6032628633343475e-06, "loss": 0.38, "num_input_tokens_seen": 15887184, "step": 15410 }, { "epoch": 10.31103678929766, "grad_norm": 3.151484966278076, "learning_rate": 5.600365470983366e-06, "loss": 0.3651, "num_input_tokens_seen": 15893040, "step": 15415 }, { "epoch": 10.31438127090301, "grad_norm": 1.804494857788086, "learning_rate": 5.597467874068929e-06, "loss": 0.395, "num_input_tokens_seen": 15898320, "step": 15420 }, { "epoch": 10.31772575250836, "grad_norm": 3.4656968116760254, "learning_rate": 5.594570073578338e-06, "loss": 0.4315, "num_input_tokens_seen": 15903600, "step": 15425 }, { "epoch": 10.321070234113712, "grad_norm": 2.3015220165252686, "learning_rate": 5.591672070498966e-06, "loss": 0.4441, "num_input_tokens_seen": 15908656, "step": 15430 }, { "epoch": 10.324414715719064, "grad_norm": 2.8689136505126953, "learning_rate": 5.588773865818253e-06, "loss": 0.4134, "num_input_tokens_seen": 15913040, "step": 15435 }, { "epoch": 10.327759197324415, "grad_norm": 2.033482074737549, "learning_rate": 5.5858754605237074e-06, "loss": 0.3029, "num_input_tokens_seen": 15918064, "step": 15440 }, { "epoch": 10.331103678929766, "grad_norm": 2.331908941268921, "learning_rate": 5.582976855602911e-06, "loss": 0.4368, "num_input_tokens_seen": 15923376, "step": 15445 }, { "epoch": 10.334448160535118, "grad_norm": 2.1288628578186035, "learning_rate": 5.580078052043509e-06, "loss": 0.3924, "num_input_tokens_seen": 15928848, "step": 15450 }, { "epoch": 10.337792642140467, "grad_norm": 2.920254707336426, "learning_rate": 5.577179050833215e-06, "loss": 0.4797, "num_input_tokens_seen": 15934544, "step": 15455 }, { "epoch": 10.341137123745819, "grad_norm": 3.011934757232666, "learning_rate": 5.574279852959807e-06, "loss": 0.3279, "num_input_tokens_seen": 15939472, "step": 15460 }, { "epoch": 10.34448160535117, "grad_norm": 2.2516303062438965, "learning_rate": 5.571380459411137e-06, "loss": 0.3958, "num_input_tokens_seen": 15944144, "step": 15465 }, { "epoch": 10.347826086956522, "grad_norm": 2.001085042953491, "learning_rate": 5.568480871175119e-06, "loss": 0.4666, "num_input_tokens_seen": 15949808, "step": 15470 }, { "epoch": 10.351170568561873, "grad_norm": 1.8688968420028687, "learning_rate": 5.565581089239734e-06, "loss": 0.3845, "num_input_tokens_seen": 15954800, "step": 15475 }, { "epoch": 10.354515050167224, "grad_norm": 2.6207973957061768, "learning_rate": 5.562681114593028e-06, "loss": 0.458, "num_input_tokens_seen": 15959920, "step": 15480 }, { "epoch": 10.357859531772576, "grad_norm": 2.577979803085327, "learning_rate": 5.559780948223116e-06, "loss": 0.4794, "num_input_tokens_seen": 15964848, "step": 15485 }, { "epoch": 10.361204013377927, "grad_norm": 3.0056238174438477, "learning_rate": 5.5568805911181735e-06, "loss": 0.3989, "num_input_tokens_seen": 15969488, "step": 15490 }, { "epoch": 10.364548494983278, "grad_norm": 2.1765146255493164, "learning_rate": 5.5539800442664475e-06, "loss": 0.4005, "num_input_tokens_seen": 15974576, "step": 15495 }, { "epoch": 10.367892976588628, "grad_norm": 2.3435792922973633, "learning_rate": 5.551079308656242e-06, "loss": 0.3414, "num_input_tokens_seen": 15979280, "step": 15500 }, { "epoch": 10.37123745819398, "grad_norm": 1.7201104164123535, "learning_rate": 5.54817838527593e-06, "loss": 0.3458, "num_input_tokens_seen": 15984144, "step": 15505 }, { "epoch": 10.37458193979933, "grad_norm": 2.121624231338501, "learning_rate": 5.5452772751139496e-06, "loss": 0.4782, "num_input_tokens_seen": 15988432, "step": 15510 }, { "epoch": 10.377926421404682, "grad_norm": 2.9617819786071777, "learning_rate": 5.542375979158798e-06, "loss": 0.3543, "num_input_tokens_seen": 15993072, "step": 15515 }, { "epoch": 10.381270903010034, "grad_norm": 2.504840135574341, "learning_rate": 5.539474498399041e-06, "loss": 0.4134, "num_input_tokens_seen": 15997904, "step": 15520 }, { "epoch": 10.384615384615385, "grad_norm": 2.0747621059417725, "learning_rate": 5.536572833823301e-06, "loss": 0.4892, "num_input_tokens_seen": 16003856, "step": 15525 }, { "epoch": 10.387959866220736, "grad_norm": 2.017176389694214, "learning_rate": 5.533670986420272e-06, "loss": 0.3484, "num_input_tokens_seen": 16009040, "step": 15530 }, { "epoch": 10.391304347826088, "grad_norm": 1.8709152936935425, "learning_rate": 5.530768957178698e-06, "loss": 0.4081, "num_input_tokens_seen": 16014320, "step": 15535 }, { "epoch": 10.394648829431437, "grad_norm": 2.3478238582611084, "learning_rate": 5.527866747087398e-06, "loss": 0.4029, "num_input_tokens_seen": 16018992, "step": 15540 }, { "epoch": 10.397993311036789, "grad_norm": 2.66007924079895, "learning_rate": 5.524964357135243e-06, "loss": 0.3876, "num_input_tokens_seen": 16024464, "step": 15545 }, { "epoch": 10.40133779264214, "grad_norm": 3.9663798809051514, "learning_rate": 5.52206178831117e-06, "loss": 0.3244, "num_input_tokens_seen": 16029680, "step": 15550 }, { "epoch": 10.404682274247492, "grad_norm": 1.9726812839508057, "learning_rate": 5.519159041604177e-06, "loss": 0.3406, "num_input_tokens_seen": 16034512, "step": 15555 }, { "epoch": 10.408026755852843, "grad_norm": 1.8685123920440674, "learning_rate": 5.516256118003321e-06, "loss": 0.4103, "num_input_tokens_seen": 16039664, "step": 15560 }, { "epoch": 10.411371237458194, "grad_norm": 1.917542815208435, "learning_rate": 5.513353018497717e-06, "loss": 0.3472, "num_input_tokens_seen": 16044592, "step": 15565 }, { "epoch": 10.414715719063546, "grad_norm": 2.2461612224578857, "learning_rate": 5.510449744076544e-06, "loss": 0.4146, "num_input_tokens_seen": 16049904, "step": 15570 }, { "epoch": 10.418060200668897, "grad_norm": 2.0593502521514893, "learning_rate": 5.5075462957290405e-06, "loss": 0.4044, "num_input_tokens_seen": 16055632, "step": 15575 }, { "epoch": 10.421404682274247, "grad_norm": 1.5721224546432495, "learning_rate": 5.504642674444501e-06, "loss": 0.4811, "num_input_tokens_seen": 16061232, "step": 15580 }, { "epoch": 10.424749163879598, "grad_norm": 2.5337791442871094, "learning_rate": 5.501738881212283e-06, "loss": 0.4034, "num_input_tokens_seen": 16065904, "step": 15585 }, { "epoch": 10.42809364548495, "grad_norm": 2.315070152282715, "learning_rate": 5.498834917021798e-06, "loss": 0.4111, "num_input_tokens_seen": 16071536, "step": 15590 }, { "epoch": 10.431438127090301, "grad_norm": 1.7803761959075928, "learning_rate": 5.495930782862522e-06, "loss": 0.387, "num_input_tokens_seen": 16076560, "step": 15595 }, { "epoch": 10.434782608695652, "grad_norm": 1.5809656381607056, "learning_rate": 5.493026479723982e-06, "loss": 0.3606, "num_input_tokens_seen": 16081584, "step": 15600 }, { "epoch": 10.438127090301004, "grad_norm": 2.0641684532165527, "learning_rate": 5.490122008595765e-06, "loss": 0.4665, "num_input_tokens_seen": 16087536, "step": 15605 }, { "epoch": 10.441471571906355, "grad_norm": 2.370121717453003, "learning_rate": 5.487217370467518e-06, "loss": 0.427, "num_input_tokens_seen": 16092272, "step": 15610 }, { "epoch": 10.444816053511706, "grad_norm": 2.511359214782715, "learning_rate": 5.484312566328942e-06, "loss": 0.3986, "num_input_tokens_seen": 16097456, "step": 15615 }, { "epoch": 10.448160535117056, "grad_norm": 1.8821392059326172, "learning_rate": 5.481407597169796e-06, "loss": 0.4323, "num_input_tokens_seen": 16103120, "step": 15620 }, { "epoch": 10.451505016722408, "grad_norm": 1.3208166360855103, "learning_rate": 5.478502463979895e-06, "loss": 0.4248, "num_input_tokens_seen": 16106992, "step": 15625 }, { "epoch": 10.454849498327759, "grad_norm": 1.9058200120925903, "learning_rate": 5.4755971677491085e-06, "loss": 0.3957, "num_input_tokens_seen": 16113712, "step": 15630 }, { "epoch": 10.45819397993311, "grad_norm": 1.7464529275894165, "learning_rate": 5.472691709467362e-06, "loss": 0.3953, "num_input_tokens_seen": 16120080, "step": 15635 }, { "epoch": 10.461538461538462, "grad_norm": 1.8838226795196533, "learning_rate": 5.4697860901246365e-06, "loss": 0.3423, "num_input_tokens_seen": 16124464, "step": 15640 }, { "epoch": 10.464882943143813, "grad_norm": 2.3438773155212402, "learning_rate": 5.4668803107109716e-06, "loss": 0.5121, "num_input_tokens_seen": 16129168, "step": 15645 }, { "epoch": 10.468227424749164, "grad_norm": 2.3603909015655518, "learning_rate": 5.463974372216454e-06, "loss": 0.3496, "num_input_tokens_seen": 16134224, "step": 15650 }, { "epoch": 10.471571906354516, "grad_norm": 2.4822323322296143, "learning_rate": 5.461068275631233e-06, "loss": 0.4078, "num_input_tokens_seen": 16139504, "step": 15655 }, { "epoch": 10.474916387959865, "grad_norm": 2.209035634994507, "learning_rate": 5.458162021945502e-06, "loss": 0.3571, "num_input_tokens_seen": 16144144, "step": 15660 }, { "epoch": 10.478260869565217, "grad_norm": 2.7890124320983887, "learning_rate": 5.455255612149517e-06, "loss": 0.4693, "num_input_tokens_seen": 16148688, "step": 15665 }, { "epoch": 10.481605351170568, "grad_norm": 1.8744240999221802, "learning_rate": 5.452349047233583e-06, "loss": 0.4352, "num_input_tokens_seen": 16153520, "step": 15670 }, { "epoch": 10.48494983277592, "grad_norm": 1.8401919603347778, "learning_rate": 5.449442328188058e-06, "loss": 0.4311, "num_input_tokens_seen": 16158256, "step": 15675 }, { "epoch": 10.488294314381271, "grad_norm": 2.5911741256713867, "learning_rate": 5.446535456003353e-06, "loss": 0.4121, "num_input_tokens_seen": 16163280, "step": 15680 }, { "epoch": 10.491638795986622, "grad_norm": 2.248587131500244, "learning_rate": 5.443628431669929e-06, "loss": 0.4391, "num_input_tokens_seen": 16168624, "step": 15685 }, { "epoch": 10.494983277591974, "grad_norm": 2.6896023750305176, "learning_rate": 5.440721256178304e-06, "loss": 0.4041, "num_input_tokens_seen": 16173552, "step": 15690 }, { "epoch": 10.498327759197325, "grad_norm": 2.061610460281372, "learning_rate": 5.437813930519043e-06, "loss": 0.3257, "num_input_tokens_seen": 16178096, "step": 15695 }, { "epoch": 10.501672240802675, "grad_norm": 2.1566898822784424, "learning_rate": 5.434906455682765e-06, "loss": 0.4542, "num_input_tokens_seen": 16183216, "step": 15700 }, { "epoch": 10.505016722408026, "grad_norm": 1.73021399974823, "learning_rate": 5.431998832660136e-06, "loss": 0.4037, "num_input_tokens_seen": 16189136, "step": 15705 }, { "epoch": 10.508361204013378, "grad_norm": 2.203994035720825, "learning_rate": 5.429091062441877e-06, "loss": 0.4196, "num_input_tokens_seen": 16194480, "step": 15710 }, { "epoch": 10.511705685618729, "grad_norm": 1.7771425247192383, "learning_rate": 5.4261831460187545e-06, "loss": 0.3651, "num_input_tokens_seen": 16199856, "step": 15715 }, { "epoch": 10.51505016722408, "grad_norm": 2.0837225914001465, "learning_rate": 5.423275084381591e-06, "loss": 0.432, "num_input_tokens_seen": 16204336, "step": 15720 }, { "epoch": 10.518394648829432, "grad_norm": 2.334095001220703, "learning_rate": 5.420366878521251e-06, "loss": 0.4622, "num_input_tokens_seen": 16209200, "step": 15725 }, { "epoch": 10.521739130434783, "grad_norm": 2.2978532314300537, "learning_rate": 5.4174585294286555e-06, "loss": 0.4249, "num_input_tokens_seen": 16214192, "step": 15730 }, { "epoch": 10.525083612040135, "grad_norm": 2.212284803390503, "learning_rate": 5.414550038094767e-06, "loss": 0.475, "num_input_tokens_seen": 16219632, "step": 15735 }, { "epoch": 10.528428093645484, "grad_norm": 3.167762517929077, "learning_rate": 5.411641405510603e-06, "loss": 0.4647, "num_input_tokens_seen": 16224976, "step": 15740 }, { "epoch": 10.531772575250836, "grad_norm": 2.7274134159088135, "learning_rate": 5.408732632667226e-06, "loss": 0.3906, "num_input_tokens_seen": 16229968, "step": 15745 }, { "epoch": 10.535117056856187, "grad_norm": 2.3569579124450684, "learning_rate": 5.405823720555746e-06, "loss": 0.4478, "num_input_tokens_seen": 16235344, "step": 15750 }, { "epoch": 10.538461538461538, "grad_norm": 3.128546953201294, "learning_rate": 5.402914670167323e-06, "loss": 0.3802, "num_input_tokens_seen": 16240016, "step": 15755 }, { "epoch": 10.54180602006689, "grad_norm": 1.9136086702346802, "learning_rate": 5.400005482493159e-06, "loss": 0.4256, "num_input_tokens_seen": 16246320, "step": 15760 }, { "epoch": 10.545150501672241, "grad_norm": 2.3235023021698, "learning_rate": 5.397096158524509e-06, "loss": 0.4301, "num_input_tokens_seen": 16252752, "step": 15765 }, { "epoch": 10.548494983277592, "grad_norm": 1.6883673667907715, "learning_rate": 5.39418669925267e-06, "loss": 0.3674, "num_input_tokens_seen": 16257456, "step": 15770 }, { "epoch": 10.551839464882944, "grad_norm": 1.7366161346435547, "learning_rate": 5.391277105668987e-06, "loss": 0.4116, "num_input_tokens_seen": 16262672, "step": 15775 }, { "epoch": 10.555183946488294, "grad_norm": 3.389986515045166, "learning_rate": 5.3883673787648496e-06, "loss": 0.445, "num_input_tokens_seen": 16267888, "step": 15780 }, { "epoch": 10.558528428093645, "grad_norm": 2.5009593963623047, "learning_rate": 5.385457519531694e-06, "loss": 0.5235, "num_input_tokens_seen": 16273552, "step": 15785 }, { "epoch": 10.561872909698996, "grad_norm": 1.8432172536849976, "learning_rate": 5.382547528961002e-06, "loss": 0.4567, "num_input_tokens_seen": 16278928, "step": 15790 }, { "epoch": 10.565217391304348, "grad_norm": 2.28790545463562, "learning_rate": 5.379637408044297e-06, "loss": 0.4225, "num_input_tokens_seen": 16283664, "step": 15795 }, { "epoch": 10.568561872909699, "grad_norm": 1.9823359251022339, "learning_rate": 5.376727157773152e-06, "loss": 0.4533, "num_input_tokens_seen": 16288304, "step": 15800 }, { "epoch": 10.57190635451505, "grad_norm": 1.8529399633407593, "learning_rate": 5.373816779139177e-06, "loss": 0.368, "num_input_tokens_seen": 16293264, "step": 15805 }, { "epoch": 10.575250836120402, "grad_norm": 2.4355809688568115, "learning_rate": 5.370906273134034e-06, "loss": 0.3901, "num_input_tokens_seen": 16298480, "step": 15810 }, { "epoch": 10.578595317725753, "grad_norm": 2.30256724357605, "learning_rate": 5.367995640749421e-06, "loss": 0.3973, "num_input_tokens_seen": 16303472, "step": 15815 }, { "epoch": 10.581939799331103, "grad_norm": 2.1635568141937256, "learning_rate": 5.365084882977086e-06, "loss": 0.3905, "num_input_tokens_seen": 16308240, "step": 15820 }, { "epoch": 10.585284280936454, "grad_norm": 1.9987077713012695, "learning_rate": 5.362174000808813e-06, "loss": 0.4582, "num_input_tokens_seen": 16313616, "step": 15825 }, { "epoch": 10.588628762541806, "grad_norm": 2.2541494369506836, "learning_rate": 5.359262995236432e-06, "loss": 0.3875, "num_input_tokens_seen": 16319696, "step": 15830 }, { "epoch": 10.591973244147157, "grad_norm": 1.8304399251937866, "learning_rate": 5.356351867251814e-06, "loss": 0.4578, "num_input_tokens_seen": 16324912, "step": 15835 }, { "epoch": 10.595317725752508, "grad_norm": 2.0912106037139893, "learning_rate": 5.353440617846871e-06, "loss": 0.3974, "num_input_tokens_seen": 16330096, "step": 15840 }, { "epoch": 10.59866220735786, "grad_norm": 1.8922537565231323, "learning_rate": 5.350529248013562e-06, "loss": 0.4375, "num_input_tokens_seen": 16335664, "step": 15845 }, { "epoch": 10.602006688963211, "grad_norm": 2.74857234954834, "learning_rate": 5.347617758743878e-06, "loss": 0.3965, "num_input_tokens_seen": 16341264, "step": 15850 }, { "epoch": 10.605351170568563, "grad_norm": 2.4120888710021973, "learning_rate": 5.344706151029858e-06, "loss": 0.3512, "num_input_tokens_seen": 16346256, "step": 15855 }, { "epoch": 10.608695652173914, "grad_norm": 1.390073537826538, "learning_rate": 5.341794425863577e-06, "loss": 0.3818, "num_input_tokens_seen": 16352432, "step": 15860 }, { "epoch": 10.612040133779264, "grad_norm": 3.1201114654541016, "learning_rate": 5.3388825842371525e-06, "loss": 0.392, "num_input_tokens_seen": 16357200, "step": 15865 }, { "epoch": 10.615384615384615, "grad_norm": 2.164295196533203, "learning_rate": 5.33597062714274e-06, "loss": 0.3856, "num_input_tokens_seen": 16362320, "step": 15870 }, { "epoch": 10.618729096989966, "grad_norm": 1.91082763671875, "learning_rate": 5.333058555572538e-06, "loss": 0.387, "num_input_tokens_seen": 16367184, "step": 15875 }, { "epoch": 10.622073578595318, "grad_norm": 2.4057679176330566, "learning_rate": 5.330146370518776e-06, "loss": 0.4057, "num_input_tokens_seen": 16372304, "step": 15880 }, { "epoch": 10.62541806020067, "grad_norm": 2.1062254905700684, "learning_rate": 5.327234072973731e-06, "loss": 0.4209, "num_input_tokens_seen": 16376464, "step": 15885 }, { "epoch": 10.62876254180602, "grad_norm": 2.5156219005584717, "learning_rate": 5.324321663929715e-06, "loss": 0.3228, "num_input_tokens_seen": 16381872, "step": 15890 }, { "epoch": 10.632107023411372, "grad_norm": 2.386213541030884, "learning_rate": 5.321409144379074e-06, "loss": 0.402, "num_input_tokens_seen": 16387024, "step": 15895 }, { "epoch": 10.635451505016722, "grad_norm": 1.9594053030014038, "learning_rate": 5.3184965153142e-06, "loss": 0.3445, "num_input_tokens_seen": 16391920, "step": 15900 }, { "epoch": 10.638795986622073, "grad_norm": 1.826812744140625, "learning_rate": 5.3155837777275135e-06, "loss": 0.401, "num_input_tokens_seen": 16397520, "step": 15905 }, { "epoch": 10.642140468227424, "grad_norm": 2.793506145477295, "learning_rate": 5.312670932611479e-06, "loss": 0.4123, "num_input_tokens_seen": 16402736, "step": 15910 }, { "epoch": 10.645484949832776, "grad_norm": 1.349852442741394, "learning_rate": 5.309757980958593e-06, "loss": 0.3411, "num_input_tokens_seen": 16408720, "step": 15915 }, { "epoch": 10.648829431438127, "grad_norm": 2.552964210510254, "learning_rate": 5.306844923761391e-06, "loss": 0.4439, "num_input_tokens_seen": 16414032, "step": 15920 }, { "epoch": 10.652173913043478, "grad_norm": 2.737945079803467, "learning_rate": 5.3039317620124446e-06, "loss": 0.52, "num_input_tokens_seen": 16419440, "step": 15925 }, { "epoch": 10.65551839464883, "grad_norm": 1.7196967601776123, "learning_rate": 5.301018496704355e-06, "loss": 0.3556, "num_input_tokens_seen": 16424944, "step": 15930 }, { "epoch": 10.658862876254181, "grad_norm": 2.5396852493286133, "learning_rate": 5.298105128829771e-06, "loss": 0.374, "num_input_tokens_seen": 16430384, "step": 15935 }, { "epoch": 10.662207357859533, "grad_norm": 2.0383827686309814, "learning_rate": 5.295191659381362e-06, "loss": 0.358, "num_input_tokens_seen": 16435184, "step": 15940 }, { "epoch": 10.665551839464882, "grad_norm": 3.0452728271484375, "learning_rate": 5.2922780893518445e-06, "loss": 0.4576, "num_input_tokens_seen": 16439728, "step": 15945 }, { "epoch": 10.668896321070234, "grad_norm": 2.8356001377105713, "learning_rate": 5.289364419733959e-06, "loss": 0.3943, "num_input_tokens_seen": 16444016, "step": 15950 }, { "epoch": 10.672240802675585, "grad_norm": 1.9778510332107544, "learning_rate": 5.286450651520488e-06, "loss": 0.3986, "num_input_tokens_seen": 16448720, "step": 15955 }, { "epoch": 10.675585284280936, "grad_norm": 2.5352370738983154, "learning_rate": 5.283536785704243e-06, "loss": 0.3424, "num_input_tokens_seen": 16453584, "step": 15960 }, { "epoch": 10.678929765886288, "grad_norm": 2.3092148303985596, "learning_rate": 5.280622823278071e-06, "loss": 0.4651, "num_input_tokens_seen": 16458064, "step": 15965 }, { "epoch": 10.68227424749164, "grad_norm": 1.958724021911621, "learning_rate": 5.277708765234849e-06, "loss": 0.337, "num_input_tokens_seen": 16463120, "step": 15970 }, { "epoch": 10.68561872909699, "grad_norm": 2.840040922164917, "learning_rate": 5.2747946125674894e-06, "loss": 0.4701, "num_input_tokens_seen": 16467824, "step": 15975 }, { "epoch": 10.68896321070234, "grad_norm": 2.225987195968628, "learning_rate": 5.271880366268935e-06, "loss": 0.4392, "num_input_tokens_seen": 16473712, "step": 15980 }, { "epoch": 10.692307692307692, "grad_norm": 1.810852289199829, "learning_rate": 5.268966027332164e-06, "loss": 0.4141, "num_input_tokens_seen": 16479088, "step": 15985 }, { "epoch": 10.695652173913043, "grad_norm": 2.803969383239746, "learning_rate": 5.266051596750184e-06, "loss": 0.3469, "num_input_tokens_seen": 16485488, "step": 15990 }, { "epoch": 10.698996655518394, "grad_norm": 2.1128151416778564, "learning_rate": 5.263137075516028e-06, "loss": 0.3915, "num_input_tokens_seen": 16490832, "step": 15995 }, { "epoch": 10.702341137123746, "grad_norm": 2.192253351211548, "learning_rate": 5.260222464622772e-06, "loss": 0.3427, "num_input_tokens_seen": 16496144, "step": 16000 }, { "epoch": 10.705685618729097, "grad_norm": 2.6171982288360596, "learning_rate": 5.257307765063511e-06, "loss": 0.5548, "num_input_tokens_seen": 16501552, "step": 16005 }, { "epoch": 10.709030100334449, "grad_norm": 2.2168571949005127, "learning_rate": 5.2543929778313775e-06, "loss": 0.464, "num_input_tokens_seen": 16506960, "step": 16010 }, { "epoch": 10.7123745819398, "grad_norm": 2.0817856788635254, "learning_rate": 5.251478103919532e-06, "loss": 0.3433, "num_input_tokens_seen": 16511856, "step": 16015 }, { "epoch": 10.715719063545151, "grad_norm": 3.318959951400757, "learning_rate": 5.24856314432116e-06, "loss": 0.4191, "num_input_tokens_seen": 16516912, "step": 16020 }, { "epoch": 10.719063545150501, "grad_norm": 2.375417709350586, "learning_rate": 5.245648100029484e-06, "loss": 0.4443, "num_input_tokens_seen": 16522448, "step": 16025 }, { "epoch": 10.722408026755852, "grad_norm": 3.4846444129943848, "learning_rate": 5.242732972037752e-06, "loss": 0.3966, "num_input_tokens_seen": 16527504, "step": 16030 }, { "epoch": 10.725752508361204, "grad_norm": 2.0429091453552246, "learning_rate": 5.239817761339239e-06, "loss": 0.4134, "num_input_tokens_seen": 16532496, "step": 16035 }, { "epoch": 10.729096989966555, "grad_norm": 3.1514720916748047, "learning_rate": 5.236902468927248e-06, "loss": 0.3636, "num_input_tokens_seen": 16537744, "step": 16040 }, { "epoch": 10.732441471571907, "grad_norm": 2.6311964988708496, "learning_rate": 5.233987095795111e-06, "loss": 0.5082, "num_input_tokens_seen": 16542864, "step": 16045 }, { "epoch": 10.735785953177258, "grad_norm": 2.329080581665039, "learning_rate": 5.23107164293619e-06, "loss": 0.4047, "num_input_tokens_seen": 16547600, "step": 16050 }, { "epoch": 10.73913043478261, "grad_norm": 1.828668475151062, "learning_rate": 5.228156111343871e-06, "loss": 0.426, "num_input_tokens_seen": 16553936, "step": 16055 }, { "epoch": 10.742474916387959, "grad_norm": 2.282644510269165, "learning_rate": 5.225240502011566e-06, "loss": 0.3834, "num_input_tokens_seen": 16558672, "step": 16060 }, { "epoch": 10.74581939799331, "grad_norm": 4.651180744171143, "learning_rate": 5.222324815932717e-06, "loss": 0.4776, "num_input_tokens_seen": 16563536, "step": 16065 }, { "epoch": 10.749163879598662, "grad_norm": 2.1495420932769775, "learning_rate": 5.219409054100789e-06, "loss": 0.4305, "num_input_tokens_seen": 16568304, "step": 16070 }, { "epoch": 10.752508361204013, "grad_norm": 2.797856569290161, "learning_rate": 5.216493217509273e-06, "loss": 0.4072, "num_input_tokens_seen": 16573584, "step": 16075 }, { "epoch": 10.755852842809364, "grad_norm": 1.6804885864257812, "learning_rate": 5.213577307151688e-06, "loss": 0.35, "num_input_tokens_seen": 16579312, "step": 16080 }, { "epoch": 10.759197324414716, "grad_norm": 1.6977133750915527, "learning_rate": 5.210661324021577e-06, "loss": 0.4547, "num_input_tokens_seen": 16584464, "step": 16085 }, { "epoch": 10.762541806020067, "grad_norm": 2.5801217555999756, "learning_rate": 5.207745269112509e-06, "loss": 0.3998, "num_input_tokens_seen": 16589360, "step": 16090 }, { "epoch": 10.765886287625419, "grad_norm": 2.2394065856933594, "learning_rate": 5.204829143418072e-06, "loss": 0.4505, "num_input_tokens_seen": 16594064, "step": 16095 }, { "epoch": 10.76923076923077, "grad_norm": 2.5743861198425293, "learning_rate": 5.201912947931884e-06, "loss": 0.3697, "num_input_tokens_seen": 16598064, "step": 16100 }, { "epoch": 10.77257525083612, "grad_norm": 2.7204437255859375, "learning_rate": 5.198996683647585e-06, "loss": 0.5448, "num_input_tokens_seen": 16603600, "step": 16105 }, { "epoch": 10.775919732441471, "grad_norm": 1.5812489986419678, "learning_rate": 5.196080351558839e-06, "loss": 0.3957, "num_input_tokens_seen": 16609232, "step": 16110 }, { "epoch": 10.779264214046822, "grad_norm": 1.780667781829834, "learning_rate": 5.193163952659331e-06, "loss": 0.3788, "num_input_tokens_seen": 16614928, "step": 16115 }, { "epoch": 10.782608695652174, "grad_norm": 1.6747726202011108, "learning_rate": 5.190247487942768e-06, "loss": 0.3592, "num_input_tokens_seen": 16620176, "step": 16120 }, { "epoch": 10.785953177257525, "grad_norm": 2.6862220764160156, "learning_rate": 5.1873309584028854e-06, "loss": 0.4159, "num_input_tokens_seen": 16624912, "step": 16125 }, { "epoch": 10.789297658862877, "grad_norm": 1.7788383960723877, "learning_rate": 5.184414365033434e-06, "loss": 0.4013, "num_input_tokens_seen": 16629328, "step": 16130 }, { "epoch": 10.792642140468228, "grad_norm": 2.412071466445923, "learning_rate": 5.181497708828194e-06, "loss": 0.363, "num_input_tokens_seen": 16634000, "step": 16135 }, { "epoch": 10.79598662207358, "grad_norm": 1.6611449718475342, "learning_rate": 5.178580990780956e-06, "loss": 0.3694, "num_input_tokens_seen": 16639120, "step": 16140 }, { "epoch": 10.799331103678929, "grad_norm": 2.264477252960205, "learning_rate": 5.175664211885542e-06, "loss": 0.5011, "num_input_tokens_seen": 16644144, "step": 16145 }, { "epoch": 10.80267558528428, "grad_norm": 2.641725540161133, "learning_rate": 5.172747373135788e-06, "loss": 0.4515, "num_input_tokens_seen": 16649040, "step": 16150 }, { "epoch": 10.806020066889632, "grad_norm": 2.068740129470825, "learning_rate": 5.169830475525556e-06, "loss": 0.4711, "num_input_tokens_seen": 16654448, "step": 16155 }, { "epoch": 10.809364548494983, "grad_norm": 2.3425488471984863, "learning_rate": 5.166913520048722e-06, "loss": 0.4035, "num_input_tokens_seen": 16660432, "step": 16160 }, { "epoch": 10.812709030100335, "grad_norm": 1.840779423713684, "learning_rate": 5.163996507699185e-06, "loss": 0.4297, "num_input_tokens_seen": 16665648, "step": 16165 }, { "epoch": 10.816053511705686, "grad_norm": 2.031325340270996, "learning_rate": 5.1610794394708665e-06, "loss": 0.3646, "num_input_tokens_seen": 16670672, "step": 16170 }, { "epoch": 10.819397993311037, "grad_norm": 1.9697837829589844, "learning_rate": 5.158162316357699e-06, "loss": 0.3263, "num_input_tokens_seen": 16675696, "step": 16175 }, { "epoch": 10.822742474916389, "grad_norm": 4.081700325012207, "learning_rate": 5.155245139353641e-06, "loss": 0.5434, "num_input_tokens_seen": 16680880, "step": 16180 }, { "epoch": 10.826086956521738, "grad_norm": 2.3415074348449707, "learning_rate": 5.1523279094526655e-06, "loss": 0.4588, "num_input_tokens_seen": 16686096, "step": 16185 }, { "epoch": 10.82943143812709, "grad_norm": 2.1319329738616943, "learning_rate": 5.149410627648768e-06, "loss": 0.349, "num_input_tokens_seen": 16691696, "step": 16190 }, { "epoch": 10.832775919732441, "grad_norm": 3.4432055950164795, "learning_rate": 5.146493294935954e-06, "loss": 0.3832, "num_input_tokens_seen": 16696528, "step": 16195 }, { "epoch": 10.836120401337793, "grad_norm": 2.848508596420288, "learning_rate": 5.143575912308254e-06, "loss": 0.3996, "num_input_tokens_seen": 16701552, "step": 16200 }, { "epoch": 10.839464882943144, "grad_norm": 2.6907763481140137, "learning_rate": 5.140658480759711e-06, "loss": 0.4722, "num_input_tokens_seen": 16707024, "step": 16205 }, { "epoch": 10.842809364548495, "grad_norm": 2.176926374435425, "learning_rate": 5.137741001284386e-06, "loss": 0.4151, "num_input_tokens_seen": 16711504, "step": 16210 }, { "epoch": 10.846153846153847, "grad_norm": 1.77402925491333, "learning_rate": 5.134823474876357e-06, "loss": 0.4385, "num_input_tokens_seen": 16716464, "step": 16215 }, { "epoch": 10.849498327759198, "grad_norm": 1.6165764331817627, "learning_rate": 5.131905902529716e-06, "loss": 0.3893, "num_input_tokens_seen": 16721392, "step": 16220 }, { "epoch": 10.852842809364548, "grad_norm": 2.0907716751098633, "learning_rate": 5.128988285238572e-06, "loss": 0.3836, "num_input_tokens_seen": 16726608, "step": 16225 }, { "epoch": 10.856187290969899, "grad_norm": 2.3272247314453125, "learning_rate": 5.126070623997051e-06, "loss": 0.435, "num_input_tokens_seen": 16731312, "step": 16230 }, { "epoch": 10.85953177257525, "grad_norm": 2.373774766921997, "learning_rate": 5.123152919799291e-06, "loss": 0.3936, "num_input_tokens_seen": 16736208, "step": 16235 }, { "epoch": 10.862876254180602, "grad_norm": 2.1847405433654785, "learning_rate": 5.120235173639445e-06, "loss": 0.4428, "num_input_tokens_seen": 16741168, "step": 16240 }, { "epoch": 10.866220735785953, "grad_norm": 2.5744876861572266, "learning_rate": 5.1173173865116844e-06, "loss": 0.5193, "num_input_tokens_seen": 16746000, "step": 16245 }, { "epoch": 10.869565217391305, "grad_norm": 1.5137031078338623, "learning_rate": 5.114399559410188e-06, "loss": 0.4167, "num_input_tokens_seen": 16751984, "step": 16250 }, { "epoch": 10.872909698996656, "grad_norm": 2.4489920139312744, "learning_rate": 5.1114816933291524e-06, "loss": 0.3802, "num_input_tokens_seen": 16757008, "step": 16255 }, { "epoch": 10.876254180602007, "grad_norm": 1.7691580057144165, "learning_rate": 5.108563789262787e-06, "loss": 0.4218, "num_input_tokens_seen": 16762512, "step": 16260 }, { "epoch": 10.879598662207357, "grad_norm": 1.7834291458129883, "learning_rate": 5.1056458482053125e-06, "loss": 0.5331, "num_input_tokens_seen": 16767632, "step": 16265 }, { "epoch": 10.882943143812708, "grad_norm": 1.8558640480041504, "learning_rate": 5.102727871150966e-06, "loss": 0.3969, "num_input_tokens_seen": 16773872, "step": 16270 }, { "epoch": 10.88628762541806, "grad_norm": 2.2604143619537354, "learning_rate": 5.099809859093993e-06, "loss": 0.4955, "num_input_tokens_seen": 16778608, "step": 16275 }, { "epoch": 10.889632107023411, "grad_norm": 2.0216431617736816, "learning_rate": 5.096891813028652e-06, "loss": 0.3463, "num_input_tokens_seen": 16783504, "step": 16280 }, { "epoch": 10.892976588628763, "grad_norm": 2.365074634552002, "learning_rate": 5.093973733949212e-06, "loss": 0.451, "num_input_tokens_seen": 16788432, "step": 16285 }, { "epoch": 10.896321070234114, "grad_norm": 3.033843994140625, "learning_rate": 5.091055622849958e-06, "loss": 0.4303, "num_input_tokens_seen": 16793616, "step": 16290 }, { "epoch": 10.899665551839465, "grad_norm": 2.6142570972442627, "learning_rate": 5.0881374807251805e-06, "loss": 0.3694, "num_input_tokens_seen": 16798224, "step": 16295 }, { "epoch": 10.903010033444817, "grad_norm": 2.481255054473877, "learning_rate": 5.085219308569183e-06, "loss": 0.3785, "num_input_tokens_seen": 16803824, "step": 16300 }, { "epoch": 10.906354515050166, "grad_norm": 2.3732728958129883, "learning_rate": 5.082301107376279e-06, "loss": 0.4647, "num_input_tokens_seen": 16809616, "step": 16305 }, { "epoch": 10.909698996655518, "grad_norm": 1.7608847618103027, "learning_rate": 5.0793828781407895e-06, "loss": 0.3831, "num_input_tokens_seen": 16814416, "step": 16310 }, { "epoch": 10.91304347826087, "grad_norm": 1.8155325651168823, "learning_rate": 5.07646462185705e-06, "loss": 0.3129, "num_input_tokens_seen": 16820560, "step": 16315 }, { "epoch": 10.91638795986622, "grad_norm": 2.0642452239990234, "learning_rate": 5.073546339519401e-06, "loss": 0.3634, "num_input_tokens_seen": 16825584, "step": 16320 }, { "epoch": 10.919732441471572, "grad_norm": 2.6611766815185547, "learning_rate": 5.070628032122193e-06, "loss": 0.4693, "num_input_tokens_seen": 16830704, "step": 16325 }, { "epoch": 10.923076923076923, "grad_norm": 1.6822015047073364, "learning_rate": 5.067709700659788e-06, "loss": 0.3127, "num_input_tokens_seen": 16836304, "step": 16330 }, { "epoch": 10.926421404682275, "grad_norm": 2.1717686653137207, "learning_rate": 5.064791346126552e-06, "loss": 0.4743, "num_input_tokens_seen": 16841104, "step": 16335 }, { "epoch": 10.929765886287626, "grad_norm": 2.5634405612945557, "learning_rate": 5.061872969516859e-06, "loss": 0.3228, "num_input_tokens_seen": 16845872, "step": 16340 }, { "epoch": 10.933110367892976, "grad_norm": 3.2949459552764893, "learning_rate": 5.0589545718250955e-06, "loss": 0.337, "num_input_tokens_seen": 16850800, "step": 16345 }, { "epoch": 10.936454849498327, "grad_norm": 2.1253392696380615, "learning_rate": 5.056036154045648e-06, "loss": 0.4876, "num_input_tokens_seen": 16856080, "step": 16350 }, { "epoch": 10.939799331103679, "grad_norm": 1.9195805788040161, "learning_rate": 5.053117717172915e-06, "loss": 0.3868, "num_input_tokens_seen": 16861200, "step": 16355 }, { "epoch": 10.94314381270903, "grad_norm": 2.642395257949829, "learning_rate": 5.0501992622013e-06, "loss": 0.4378, "num_input_tokens_seen": 16865616, "step": 16360 }, { "epoch": 10.946488294314381, "grad_norm": 1.974463939666748, "learning_rate": 5.0472807901252126e-06, "loss": 0.3951, "num_input_tokens_seen": 16871056, "step": 16365 }, { "epoch": 10.949832775919733, "grad_norm": 2.0017929077148438, "learning_rate": 5.044362301939069e-06, "loss": 0.3656, "num_input_tokens_seen": 16877392, "step": 16370 }, { "epoch": 10.953177257525084, "grad_norm": 2.1010828018188477, "learning_rate": 5.04144379863729e-06, "loss": 0.4785, "num_input_tokens_seen": 16882224, "step": 16375 }, { "epoch": 10.956521739130435, "grad_norm": 1.6359747648239136, "learning_rate": 5.038525281214299e-06, "loss": 0.4288, "num_input_tokens_seen": 16887984, "step": 16380 }, { "epoch": 10.959866220735787, "grad_norm": 2.1845715045928955, "learning_rate": 5.035606750664531e-06, "loss": 0.458, "num_input_tokens_seen": 16892080, "step": 16385 }, { "epoch": 10.963210702341136, "grad_norm": 1.789440393447876, "learning_rate": 5.03268820798242e-06, "loss": 0.4617, "num_input_tokens_seen": 16897168, "step": 16390 }, { "epoch": 10.966555183946488, "grad_norm": 1.8401515483856201, "learning_rate": 5.029769654162405e-06, "loss": 0.4317, "num_input_tokens_seen": 16902704, "step": 16395 }, { "epoch": 10.96989966555184, "grad_norm": 3.332435131072998, "learning_rate": 5.026851090198929e-06, "loss": 0.3754, "num_input_tokens_seen": 16906864, "step": 16400 }, { "epoch": 10.97324414715719, "grad_norm": 2.282296657562256, "learning_rate": 5.023932517086442e-06, "loss": 0.3685, "num_input_tokens_seen": 16911920, "step": 16405 }, { "epoch": 10.976588628762542, "grad_norm": 2.3827710151672363, "learning_rate": 5.0210139358193885e-06, "loss": 0.3337, "num_input_tokens_seen": 16916208, "step": 16410 }, { "epoch": 10.979933110367893, "grad_norm": 2.0195095539093018, "learning_rate": 5.018095347392226e-06, "loss": 0.4023, "num_input_tokens_seen": 16921680, "step": 16415 }, { "epoch": 10.983277591973245, "grad_norm": 3.1821391582489014, "learning_rate": 5.015176752799405e-06, "loss": 0.4354, "num_input_tokens_seen": 16926064, "step": 16420 }, { "epoch": 10.986622073578594, "grad_norm": 1.6717790365219116, "learning_rate": 5.012258153035388e-06, "loss": 0.4429, "num_input_tokens_seen": 16931568, "step": 16425 }, { "epoch": 10.989966555183946, "grad_norm": 1.7625988721847534, "learning_rate": 5.00933954909463e-06, "loss": 0.3649, "num_input_tokens_seen": 16936336, "step": 16430 }, { "epoch": 10.993311036789297, "grad_norm": 2.9083657264709473, "learning_rate": 5.006420941971595e-06, "loss": 0.4416, "num_input_tokens_seen": 16941712, "step": 16435 }, { "epoch": 10.996655518394649, "grad_norm": 1.6246402263641357, "learning_rate": 5.003502332660742e-06, "loss": 0.3434, "num_input_tokens_seen": 16947152, "step": 16440 }, { "epoch": 11.0, "grad_norm": 1.732689619064331, "learning_rate": 5.000583722156533e-06, "loss": 0.3571, "num_input_tokens_seen": 16951600, "step": 16445 }, { "epoch": 11.003344481605351, "grad_norm": 2.5175890922546387, "learning_rate": 4.997665111453432e-06, "loss": 0.4694, "num_input_tokens_seen": 16956528, "step": 16450 }, { "epoch": 11.006688963210703, "grad_norm": 2.4943437576293945, "learning_rate": 4.994746501545901e-06, "loss": 0.4452, "num_input_tokens_seen": 16961712, "step": 16455 }, { "epoch": 11.010033444816054, "grad_norm": 2.378167152404785, "learning_rate": 4.991827893428401e-06, "loss": 0.4748, "num_input_tokens_seen": 16966864, "step": 16460 }, { "epoch": 11.013377926421406, "grad_norm": 2.3770766258239746, "learning_rate": 4.988909288095397e-06, "loss": 0.408, "num_input_tokens_seen": 16971792, "step": 16465 }, { "epoch": 11.016722408026755, "grad_norm": 2.0196993350982666, "learning_rate": 4.985990686541349e-06, "loss": 0.3616, "num_input_tokens_seen": 16976912, "step": 16470 }, { "epoch": 11.020066889632107, "grad_norm": 2.0898356437683105, "learning_rate": 4.983072089760716e-06, "loss": 0.3965, "num_input_tokens_seen": 16982448, "step": 16475 }, { "epoch": 11.023411371237458, "grad_norm": 1.7312588691711426, "learning_rate": 4.9801534987479545e-06, "loss": 0.3895, "num_input_tokens_seen": 16988368, "step": 16480 }, { "epoch": 11.02675585284281, "grad_norm": 1.94546377658844, "learning_rate": 4.977234914497522e-06, "loss": 0.4038, "num_input_tokens_seen": 16993296, "step": 16485 }, { "epoch": 11.03010033444816, "grad_norm": 1.8264715671539307, "learning_rate": 4.974316338003875e-06, "loss": 0.4066, "num_input_tokens_seen": 16998160, "step": 16490 }, { "epoch": 11.033444816053512, "grad_norm": 1.7680492401123047, "learning_rate": 4.97139777026146e-06, "loss": 0.3848, "num_input_tokens_seen": 17004016, "step": 16495 }, { "epoch": 11.036789297658864, "grad_norm": 3.231663227081299, "learning_rate": 4.968479212264729e-06, "loss": 0.391, "num_input_tokens_seen": 17008816, "step": 16500 }, { "epoch": 11.040133779264215, "grad_norm": 2.152585744857788, "learning_rate": 4.965560665008121e-06, "loss": 0.3778, "num_input_tokens_seen": 17013744, "step": 16505 }, { "epoch": 11.043478260869565, "grad_norm": 3.0162672996520996, "learning_rate": 4.962642129486084e-06, "loss": 0.4, "num_input_tokens_seen": 17018640, "step": 16510 }, { "epoch": 11.046822742474916, "grad_norm": 1.7502175569534302, "learning_rate": 4.959723606693051e-06, "loss": 0.3909, "num_input_tokens_seen": 17023760, "step": 16515 }, { "epoch": 11.050167224080267, "grad_norm": 2.0816709995269775, "learning_rate": 4.956805097623456e-06, "loss": 0.3217, "num_input_tokens_seen": 17029040, "step": 16520 }, { "epoch": 11.053511705685619, "grad_norm": 2.9935781955718994, "learning_rate": 4.9538866032717255e-06, "loss": 0.3728, "num_input_tokens_seen": 17033584, "step": 16525 }, { "epoch": 11.05685618729097, "grad_norm": 2.3428375720977783, "learning_rate": 4.950968124632285e-06, "loss": 0.4329, "num_input_tokens_seen": 17039216, "step": 16530 }, { "epoch": 11.060200668896321, "grad_norm": 2.440293073654175, "learning_rate": 4.9480496626995525e-06, "loss": 0.4022, "num_input_tokens_seen": 17045328, "step": 16535 }, { "epoch": 11.063545150501673, "grad_norm": 2.295064926147461, "learning_rate": 4.945131218467939e-06, "loss": 0.3478, "num_input_tokens_seen": 17051376, "step": 16540 }, { "epoch": 11.066889632107024, "grad_norm": 1.5590147972106934, "learning_rate": 4.94221279293185e-06, "loss": 0.4139, "num_input_tokens_seen": 17056496, "step": 16545 }, { "epoch": 11.070234113712374, "grad_norm": 2.146726131439209, "learning_rate": 4.939294387085684e-06, "loss": 0.3805, "num_input_tokens_seen": 17061200, "step": 16550 }, { "epoch": 11.073578595317725, "grad_norm": 1.5646990537643433, "learning_rate": 4.936376001923836e-06, "loss": 0.2979, "num_input_tokens_seen": 17066416, "step": 16555 }, { "epoch": 11.076923076923077, "grad_norm": 2.6358227729797363, "learning_rate": 4.933457638440693e-06, "loss": 0.4106, "num_input_tokens_seen": 17072176, "step": 16560 }, { "epoch": 11.080267558528428, "grad_norm": 2.0400052070617676, "learning_rate": 4.93053929763063e-06, "loss": 0.4159, "num_input_tokens_seen": 17077072, "step": 16565 }, { "epoch": 11.08361204013378, "grad_norm": 2.0469155311584473, "learning_rate": 4.92762098048802e-06, "loss": 0.3983, "num_input_tokens_seen": 17081680, "step": 16570 }, { "epoch": 11.08695652173913, "grad_norm": 1.8481718301773071, "learning_rate": 4.924702688007224e-06, "loss": 0.3473, "num_input_tokens_seen": 17086768, "step": 16575 }, { "epoch": 11.090301003344482, "grad_norm": 2.354912281036377, "learning_rate": 4.921784421182601e-06, "loss": 0.4033, "num_input_tokens_seen": 17091824, "step": 16580 }, { "epoch": 11.093645484949834, "grad_norm": 2.8961057662963867, "learning_rate": 4.918866181008492e-06, "loss": 0.3732, "num_input_tokens_seen": 17097392, "step": 16585 }, { "epoch": 11.096989966555183, "grad_norm": 2.1482465267181396, "learning_rate": 4.915947968479235e-06, "loss": 0.462, "num_input_tokens_seen": 17102192, "step": 16590 }, { "epoch": 11.100334448160535, "grad_norm": 1.9035751819610596, "learning_rate": 4.913029784589154e-06, "loss": 0.325, "num_input_tokens_seen": 17107088, "step": 16595 }, { "epoch": 11.103678929765886, "grad_norm": 2.2896361351013184, "learning_rate": 4.910111630332572e-06, "loss": 0.3761, "num_input_tokens_seen": 17111664, "step": 16600 }, { "epoch": 11.107023411371237, "grad_norm": 3.2190091609954834, "learning_rate": 4.907193506703793e-06, "loss": 0.3753, "num_input_tokens_seen": 17116528, "step": 16605 }, { "epoch": 11.110367892976589, "grad_norm": 1.8896830081939697, "learning_rate": 4.904275414697115e-06, "loss": 0.3734, "num_input_tokens_seen": 17121744, "step": 16610 }, { "epoch": 11.11371237458194, "grad_norm": 2.3395087718963623, "learning_rate": 4.90135735530682e-06, "loss": 0.3766, "num_input_tokens_seen": 17127408, "step": 16615 }, { "epoch": 11.117056856187292, "grad_norm": 2.6681699752807617, "learning_rate": 4.898439329527188e-06, "loss": 0.4481, "num_input_tokens_seen": 17132592, "step": 16620 }, { "epoch": 11.120401337792643, "grad_norm": 2.6525328159332275, "learning_rate": 4.895521338352479e-06, "loss": 0.468, "num_input_tokens_seen": 17137008, "step": 16625 }, { "epoch": 11.123745819397993, "grad_norm": 2.347923517227173, "learning_rate": 4.892603382776948e-06, "loss": 0.4073, "num_input_tokens_seen": 17141456, "step": 16630 }, { "epoch": 11.127090301003344, "grad_norm": 2.692141532897949, "learning_rate": 4.889685463794833e-06, "loss": 0.3574, "num_input_tokens_seen": 17146160, "step": 16635 }, { "epoch": 11.130434782608695, "grad_norm": 1.814361810684204, "learning_rate": 4.886767582400358e-06, "loss": 0.4042, "num_input_tokens_seen": 17151344, "step": 16640 }, { "epoch": 11.133779264214047, "grad_norm": 3.398090362548828, "learning_rate": 4.8838497395877425e-06, "loss": 0.4667, "num_input_tokens_seen": 17156176, "step": 16645 }, { "epoch": 11.137123745819398, "grad_norm": 1.6642072200775146, "learning_rate": 4.880931936351184e-06, "loss": 0.4913, "num_input_tokens_seen": 17161872, "step": 16650 }, { "epoch": 11.14046822742475, "grad_norm": 2.474745512008667, "learning_rate": 4.878014173684872e-06, "loss": 0.3687, "num_input_tokens_seen": 17166352, "step": 16655 }, { "epoch": 11.143812709030101, "grad_norm": 2.159926652908325, "learning_rate": 4.875096452582977e-06, "loss": 0.3695, "num_input_tokens_seen": 17172048, "step": 16660 }, { "epoch": 11.147157190635452, "grad_norm": 2.1008689403533936, "learning_rate": 4.872178774039663e-06, "loss": 0.4733, "num_input_tokens_seen": 17176880, "step": 16665 }, { "epoch": 11.150501672240802, "grad_norm": 3.3123788833618164, "learning_rate": 4.869261139049073e-06, "loss": 0.4004, "num_input_tokens_seen": 17182096, "step": 16670 }, { "epoch": 11.153846153846153, "grad_norm": 2.0878283977508545, "learning_rate": 4.8663435486053375e-06, "loss": 0.4339, "num_input_tokens_seen": 17187152, "step": 16675 }, { "epoch": 11.157190635451505, "grad_norm": 2.7422144412994385, "learning_rate": 4.863426003702572e-06, "loss": 0.4595, "num_input_tokens_seen": 17192528, "step": 16680 }, { "epoch": 11.160535117056856, "grad_norm": 2.094635009765625, "learning_rate": 4.860508505334873e-06, "loss": 0.4519, "num_input_tokens_seen": 17197328, "step": 16685 }, { "epoch": 11.163879598662207, "grad_norm": 3.145259380340576, "learning_rate": 4.85759105449633e-06, "loss": 0.352, "num_input_tokens_seen": 17201680, "step": 16690 }, { "epoch": 11.167224080267559, "grad_norm": 2.074958324432373, "learning_rate": 4.854673652181007e-06, "loss": 0.3457, "num_input_tokens_seen": 17206320, "step": 16695 }, { "epoch": 11.17056856187291, "grad_norm": 3.2402632236480713, "learning_rate": 4.851756299382955e-06, "loss": 0.4611, "num_input_tokens_seen": 17210864, "step": 16700 }, { "epoch": 11.173913043478262, "grad_norm": 2.683561086654663, "learning_rate": 4.848838997096208e-06, "loss": 0.4458, "num_input_tokens_seen": 17215600, "step": 16705 }, { "epoch": 11.177257525083611, "grad_norm": 2.1373980045318604, "learning_rate": 4.845921746314783e-06, "loss": 0.5358, "num_input_tokens_seen": 17220816, "step": 16710 }, { "epoch": 11.180602006688963, "grad_norm": 2.6757490634918213, "learning_rate": 4.843004548032682e-06, "loss": 0.4115, "num_input_tokens_seen": 17226192, "step": 16715 }, { "epoch": 11.183946488294314, "grad_norm": 2.152510404586792, "learning_rate": 4.840087403243883e-06, "loss": 0.3607, "num_input_tokens_seen": 17231280, "step": 16720 }, { "epoch": 11.187290969899665, "grad_norm": 2.6090986728668213, "learning_rate": 4.837170312942351e-06, "loss": 0.4042, "num_input_tokens_seen": 17236624, "step": 16725 }, { "epoch": 11.190635451505017, "grad_norm": 2.8472824096679688, "learning_rate": 4.834253278122028e-06, "loss": 0.4266, "num_input_tokens_seen": 17242512, "step": 16730 }, { "epoch": 11.193979933110368, "grad_norm": 3.1745126247406006, "learning_rate": 4.8313362997768456e-06, "loss": 0.4219, "num_input_tokens_seen": 17248176, "step": 16735 }, { "epoch": 11.19732441471572, "grad_norm": 1.7578251361846924, "learning_rate": 4.828419378900705e-06, "loss": 0.3631, "num_input_tokens_seen": 17253104, "step": 16740 }, { "epoch": 11.200668896321071, "grad_norm": 2.49222469329834, "learning_rate": 4.825502516487497e-06, "loss": 0.4123, "num_input_tokens_seen": 17258288, "step": 16745 }, { "epoch": 11.20401337792642, "grad_norm": 2.634587049484253, "learning_rate": 4.822585713531083e-06, "loss": 0.4555, "num_input_tokens_seen": 17263568, "step": 16750 }, { "epoch": 11.207357859531772, "grad_norm": 2.3218681812286377, "learning_rate": 4.819668971025316e-06, "loss": 0.3873, "num_input_tokens_seen": 17268816, "step": 16755 }, { "epoch": 11.210702341137123, "grad_norm": 2.168395519256592, "learning_rate": 4.81675228996402e-06, "loss": 0.3704, "num_input_tokens_seen": 17273968, "step": 16760 }, { "epoch": 11.214046822742475, "grad_norm": 3.6642701625823975, "learning_rate": 4.813835671340999e-06, "loss": 0.4671, "num_input_tokens_seen": 17278384, "step": 16765 }, { "epoch": 11.217391304347826, "grad_norm": 1.720310091972351, "learning_rate": 4.810919116150039e-06, "loss": 0.3915, "num_input_tokens_seen": 17284176, "step": 16770 }, { "epoch": 11.220735785953178, "grad_norm": 1.9353846311569214, "learning_rate": 4.808002625384898e-06, "loss": 0.385, "num_input_tokens_seen": 17289840, "step": 16775 }, { "epoch": 11.224080267558529, "grad_norm": 2.150707721710205, "learning_rate": 4.8050862000393225e-06, "loss": 0.3843, "num_input_tokens_seen": 17295056, "step": 16780 }, { "epoch": 11.22742474916388, "grad_norm": 1.784733533859253, "learning_rate": 4.802169841107027e-06, "loss": 0.386, "num_input_tokens_seen": 17300240, "step": 16785 }, { "epoch": 11.23076923076923, "grad_norm": 2.015521287918091, "learning_rate": 4.799253549581709e-06, "loss": 0.3426, "num_input_tokens_seen": 17305072, "step": 16790 }, { "epoch": 11.234113712374581, "grad_norm": 3.154649019241333, "learning_rate": 4.796337326457036e-06, "loss": 0.3574, "num_input_tokens_seen": 17310096, "step": 16795 }, { "epoch": 11.237458193979933, "grad_norm": 2.3813560009002686, "learning_rate": 4.793421172726665e-06, "loss": 0.3497, "num_input_tokens_seen": 17315984, "step": 16800 }, { "epoch": 11.240802675585284, "grad_norm": 2.118762969970703, "learning_rate": 4.790505089384217e-06, "loss": 0.3208, "num_input_tokens_seen": 17320816, "step": 16805 }, { "epoch": 11.244147157190636, "grad_norm": 2.738624334335327, "learning_rate": 4.787589077423294e-06, "loss": 0.3966, "num_input_tokens_seen": 17326000, "step": 16810 }, { "epoch": 11.247491638795987, "grad_norm": 1.6788599491119385, "learning_rate": 4.784673137837472e-06, "loss": 0.3254, "num_input_tokens_seen": 17330480, "step": 16815 }, { "epoch": 11.250836120401338, "grad_norm": 2.1295764446258545, "learning_rate": 4.781757271620306e-06, "loss": 0.3534, "num_input_tokens_seen": 17335504, "step": 16820 }, { "epoch": 11.25418060200669, "grad_norm": 2.3041021823883057, "learning_rate": 4.7788414797653215e-06, "loss": 0.4204, "num_input_tokens_seen": 17340624, "step": 16825 }, { "epoch": 11.25752508361204, "grad_norm": 2.295619249343872, "learning_rate": 4.775925763266024e-06, "loss": 0.3454, "num_input_tokens_seen": 17346000, "step": 16830 }, { "epoch": 11.26086956521739, "grad_norm": 2.5254931449890137, "learning_rate": 4.773010123115888e-06, "loss": 0.3833, "num_input_tokens_seen": 17351280, "step": 16835 }, { "epoch": 11.264214046822742, "grad_norm": 2.5696969032287598, "learning_rate": 4.770094560308361e-06, "loss": 0.3445, "num_input_tokens_seen": 17356848, "step": 16840 }, { "epoch": 11.267558528428093, "grad_norm": 2.0274648666381836, "learning_rate": 4.767179075836873e-06, "loss": 0.4723, "num_input_tokens_seen": 17362736, "step": 16845 }, { "epoch": 11.270903010033445, "grad_norm": 2.1742002964019775, "learning_rate": 4.764263670694819e-06, "loss": 0.3875, "num_input_tokens_seen": 17367408, "step": 16850 }, { "epoch": 11.274247491638796, "grad_norm": 2.525179624557495, "learning_rate": 4.7613483458755695e-06, "loss": 0.3359, "num_input_tokens_seen": 17372368, "step": 16855 }, { "epoch": 11.277591973244148, "grad_norm": 2.5246658325195312, "learning_rate": 4.758433102372466e-06, "loss": 0.3695, "num_input_tokens_seen": 17378448, "step": 16860 }, { "epoch": 11.280936454849499, "grad_norm": 1.5710396766662598, "learning_rate": 4.755517941178826e-06, "loss": 0.3611, "num_input_tokens_seen": 17384176, "step": 16865 }, { "epoch": 11.284280936454849, "grad_norm": 2.040842056274414, "learning_rate": 4.752602863287935e-06, "loss": 0.3929, "num_input_tokens_seen": 17389104, "step": 16870 }, { "epoch": 11.2876254180602, "grad_norm": 2.9099326133728027, "learning_rate": 4.749687869693056e-06, "loss": 0.3881, "num_input_tokens_seen": 17394192, "step": 16875 }, { "epoch": 11.290969899665551, "grad_norm": 1.805770754814148, "learning_rate": 4.746772961387416e-06, "loss": 0.3901, "num_input_tokens_seen": 17399664, "step": 16880 }, { "epoch": 11.294314381270903, "grad_norm": 2.6661548614501953, "learning_rate": 4.743858139364214e-06, "loss": 0.4079, "num_input_tokens_seen": 17404880, "step": 16885 }, { "epoch": 11.297658862876254, "grad_norm": 2.509906530380249, "learning_rate": 4.740943404616628e-06, "loss": 0.3476, "num_input_tokens_seen": 17410128, "step": 16890 }, { "epoch": 11.301003344481606, "grad_norm": 1.8929409980773926, "learning_rate": 4.738028758137797e-06, "loss": 0.3156, "num_input_tokens_seen": 17415312, "step": 16895 }, { "epoch": 11.304347826086957, "grad_norm": 2.101052761077881, "learning_rate": 4.735114200920832e-06, "loss": 0.4408, "num_input_tokens_seen": 17421104, "step": 16900 }, { "epoch": 11.307692307692308, "grad_norm": 2.561795711517334, "learning_rate": 4.7321997339588125e-06, "loss": 0.3866, "num_input_tokens_seen": 17426544, "step": 16905 }, { "epoch": 11.31103678929766, "grad_norm": 1.8270487785339355, "learning_rate": 4.729285358244795e-06, "loss": 0.3694, "num_input_tokens_seen": 17431440, "step": 16910 }, { "epoch": 11.31438127090301, "grad_norm": 2.66367244720459, "learning_rate": 4.726371074771797e-06, "loss": 0.3555, "num_input_tokens_seen": 17436272, "step": 16915 }, { "epoch": 11.31772575250836, "grad_norm": 2.9984138011932373, "learning_rate": 4.7234568845328045e-06, "loss": 0.481, "num_input_tokens_seen": 17441360, "step": 16920 }, { "epoch": 11.321070234113712, "grad_norm": 1.8395224809646606, "learning_rate": 4.720542788520777e-06, "loss": 0.4192, "num_input_tokens_seen": 17446800, "step": 16925 }, { "epoch": 11.324414715719064, "grad_norm": 2.2683496475219727, "learning_rate": 4.717628787728635e-06, "loss": 0.3734, "num_input_tokens_seen": 17452336, "step": 16930 }, { "epoch": 11.327759197324415, "grad_norm": 2.2108006477355957, "learning_rate": 4.714714883149276e-06, "loss": 0.3207, "num_input_tokens_seen": 17457040, "step": 16935 }, { "epoch": 11.331103678929766, "grad_norm": 2.201411724090576, "learning_rate": 4.711801075775557e-06, "loss": 0.3916, "num_input_tokens_seen": 17462320, "step": 16940 }, { "epoch": 11.334448160535118, "grad_norm": 2.2449710369110107, "learning_rate": 4.7088873666003025e-06, "loss": 0.4003, "num_input_tokens_seen": 17467472, "step": 16945 }, { "epoch": 11.337792642140467, "grad_norm": 2.3006138801574707, "learning_rate": 4.705973756616304e-06, "loss": 0.4207, "num_input_tokens_seen": 17472208, "step": 16950 }, { "epoch": 11.341137123745819, "grad_norm": 2.7310755252838135, "learning_rate": 4.703060246816326e-06, "loss": 0.4303, "num_input_tokens_seen": 17477520, "step": 16955 }, { "epoch": 11.34448160535117, "grad_norm": 2.0392887592315674, "learning_rate": 4.700146838193089e-06, "loss": 0.3838, "num_input_tokens_seen": 17482992, "step": 16960 }, { "epoch": 11.347826086956522, "grad_norm": 1.5489648580551147, "learning_rate": 4.697233531739282e-06, "loss": 0.3384, "num_input_tokens_seen": 17487952, "step": 16965 }, { "epoch": 11.351170568561873, "grad_norm": 2.2157270908355713, "learning_rate": 4.694320328447565e-06, "loss": 0.3607, "num_input_tokens_seen": 17492656, "step": 16970 }, { "epoch": 11.354515050167224, "grad_norm": 1.77234947681427, "learning_rate": 4.691407229310553e-06, "loss": 0.3012, "num_input_tokens_seen": 17497296, "step": 16975 }, { "epoch": 11.357859531772576, "grad_norm": 2.554543972015381, "learning_rate": 4.688494235320835e-06, "loss": 0.4476, "num_input_tokens_seen": 17501840, "step": 16980 }, { "epoch": 11.361204013377927, "grad_norm": 2.992461919784546, "learning_rate": 4.685581347470959e-06, "loss": 0.4227, "num_input_tokens_seen": 17507472, "step": 16985 }, { "epoch": 11.364548494983278, "grad_norm": 2.329174280166626, "learning_rate": 4.682668566753436e-06, "loss": 0.3804, "num_input_tokens_seen": 17512208, "step": 16990 }, { "epoch": 11.367892976588628, "grad_norm": 2.233022689819336, "learning_rate": 4.679755894160743e-06, "loss": 0.4376, "num_input_tokens_seen": 17517456, "step": 16995 }, { "epoch": 11.37123745819398, "grad_norm": 3.9548819065093994, "learning_rate": 4.676843330685321e-06, "loss": 0.3675, "num_input_tokens_seen": 17522000, "step": 17000 }, { "epoch": 11.37458193979933, "grad_norm": 1.6034001111984253, "learning_rate": 4.6739308773195715e-06, "loss": 0.4558, "num_input_tokens_seen": 17526384, "step": 17005 }, { "epoch": 11.377926421404682, "grad_norm": 2.51603627204895, "learning_rate": 4.671018535055858e-06, "loss": 0.3755, "num_input_tokens_seen": 17531536, "step": 17010 }, { "epoch": 11.381270903010034, "grad_norm": 2.0292110443115234, "learning_rate": 4.668106304886509e-06, "loss": 0.3744, "num_input_tokens_seen": 17536624, "step": 17015 }, { "epoch": 11.384615384615385, "grad_norm": 2.1303136348724365, "learning_rate": 4.6651941878038106e-06, "loss": 0.3267, "num_input_tokens_seen": 17541520, "step": 17020 }, { "epoch": 11.387959866220736, "grad_norm": 2.1131551265716553, "learning_rate": 4.662282184800017e-06, "loss": 0.2921, "num_input_tokens_seen": 17546960, "step": 17025 }, { "epoch": 11.391304347826088, "grad_norm": 6.671249866485596, "learning_rate": 4.659370296867337e-06, "loss": 0.5217, "num_input_tokens_seen": 17552016, "step": 17030 }, { "epoch": 11.394648829431437, "grad_norm": 2.8301432132720947, "learning_rate": 4.656458524997943e-06, "loss": 0.377, "num_input_tokens_seen": 17556464, "step": 17035 }, { "epoch": 11.397993311036789, "grad_norm": 2.3492558002471924, "learning_rate": 4.653546870183965e-06, "loss": 0.3937, "num_input_tokens_seen": 17561328, "step": 17040 }, { "epoch": 11.40133779264214, "grad_norm": 3.125396490097046, "learning_rate": 4.650635333417501e-06, "loss": 0.4651, "num_input_tokens_seen": 17567280, "step": 17045 }, { "epoch": 11.404682274247492, "grad_norm": 2.0760557651519775, "learning_rate": 4.6477239156906e-06, "loss": 0.4109, "num_input_tokens_seen": 17572016, "step": 17050 }, { "epoch": 11.408026755852843, "grad_norm": 2.0689637660980225, "learning_rate": 4.644812617995275e-06, "loss": 0.3437, "num_input_tokens_seen": 17576656, "step": 17055 }, { "epoch": 11.411371237458194, "grad_norm": 3.469085454940796, "learning_rate": 4.6419014413234954e-06, "loss": 0.4878, "num_input_tokens_seen": 17582416, "step": 17060 }, { "epoch": 11.414715719063546, "grad_norm": 2.188737392425537, "learning_rate": 4.638990386667192e-06, "loss": 0.4132, "num_input_tokens_seen": 17587056, "step": 17065 }, { "epoch": 11.418060200668897, "grad_norm": 3.463507890701294, "learning_rate": 4.636079455018253e-06, "loss": 0.4263, "num_input_tokens_seen": 17592368, "step": 17070 }, { "epoch": 11.421404682274247, "grad_norm": 3.6958165168762207, "learning_rate": 4.633168647368526e-06, "loss": 0.4661, "num_input_tokens_seen": 17598512, "step": 17075 }, { "epoch": 11.424749163879598, "grad_norm": 3.656039237976074, "learning_rate": 4.630257964709813e-06, "loss": 0.4159, "num_input_tokens_seen": 17603984, "step": 17080 }, { "epoch": 11.42809364548495, "grad_norm": 2.2759246826171875, "learning_rate": 4.6273474080338734e-06, "loss": 0.347, "num_input_tokens_seen": 17609008, "step": 17085 }, { "epoch": 11.431438127090301, "grad_norm": 2.1530797481536865, "learning_rate": 4.624436978332432e-06, "loss": 0.4408, "num_input_tokens_seen": 17613648, "step": 17090 }, { "epoch": 11.434782608695652, "grad_norm": 1.6461498737335205, "learning_rate": 4.6215266765971586e-06, "loss": 0.3555, "num_input_tokens_seen": 17619280, "step": 17095 }, { "epoch": 11.438127090301004, "grad_norm": 2.016359329223633, "learning_rate": 4.618616503819688e-06, "loss": 0.5128, "num_input_tokens_seen": 17624688, "step": 17100 }, { "epoch": 11.441471571906355, "grad_norm": 2.2511374950408936, "learning_rate": 4.615706460991604e-06, "loss": 0.472, "num_input_tokens_seen": 17629872, "step": 17105 }, { "epoch": 11.444816053511706, "grad_norm": 2.496462345123291, "learning_rate": 4.612796549104454e-06, "loss": 0.4915, "num_input_tokens_seen": 17635600, "step": 17110 }, { "epoch": 11.448160535117056, "grad_norm": 2.6464617252349854, "learning_rate": 4.609886769149734e-06, "loss": 0.3825, "num_input_tokens_seen": 17640880, "step": 17115 }, { "epoch": 11.451505016722408, "grad_norm": 1.9739677906036377, "learning_rate": 4.6069771221189e-06, "loss": 0.3708, "num_input_tokens_seen": 17646512, "step": 17120 }, { "epoch": 11.454849498327759, "grad_norm": 2.6356358528137207, "learning_rate": 4.6040676090033596e-06, "loss": 0.4342, "num_input_tokens_seen": 17651280, "step": 17125 }, { "epoch": 11.45819397993311, "grad_norm": 2.212538242340088, "learning_rate": 4.601158230794473e-06, "loss": 0.3713, "num_input_tokens_seen": 17656080, "step": 17130 }, { "epoch": 11.461538461538462, "grad_norm": 1.707119107246399, "learning_rate": 4.598248988483563e-06, "loss": 0.3331, "num_input_tokens_seen": 17661552, "step": 17135 }, { "epoch": 11.464882943143813, "grad_norm": 3.082146644592285, "learning_rate": 4.595339883061898e-06, "loss": 0.4391, "num_input_tokens_seen": 17666672, "step": 17140 }, { "epoch": 11.468227424749164, "grad_norm": 2.7088098526000977, "learning_rate": 4.592430915520699e-06, "loss": 0.4244, "num_input_tokens_seen": 17672432, "step": 17145 }, { "epoch": 11.471571906354516, "grad_norm": 2.347698211669922, "learning_rate": 4.589522086851146e-06, "loss": 0.4324, "num_input_tokens_seen": 17677648, "step": 17150 }, { "epoch": 11.474916387959865, "grad_norm": 1.8488495349884033, "learning_rate": 4.586613398044365e-06, "loss": 0.4057, "num_input_tokens_seen": 17682384, "step": 17155 }, { "epoch": 11.478260869565217, "grad_norm": 2.13875412940979, "learning_rate": 4.583704850091443e-06, "loss": 0.3539, "num_input_tokens_seen": 17687952, "step": 17160 }, { "epoch": 11.481605351170568, "grad_norm": 2.6776797771453857, "learning_rate": 4.580796443983411e-06, "loss": 0.3762, "num_input_tokens_seen": 17692688, "step": 17165 }, { "epoch": 11.48494983277592, "grad_norm": 2.9940500259399414, "learning_rate": 4.577888180711257e-06, "loss": 0.5627, "num_input_tokens_seen": 17698992, "step": 17170 }, { "epoch": 11.488294314381271, "grad_norm": 2.6192829608917236, "learning_rate": 4.574980061265913e-06, "loss": 0.41, "num_input_tokens_seen": 17705168, "step": 17175 }, { "epoch": 11.491638795986622, "grad_norm": 2.473541736602783, "learning_rate": 4.5720720866382745e-06, "loss": 0.3821, "num_input_tokens_seen": 17710160, "step": 17180 }, { "epoch": 11.494983277591974, "grad_norm": 2.1028168201446533, "learning_rate": 4.569164257819175e-06, "loss": 0.3548, "num_input_tokens_seen": 17715408, "step": 17185 }, { "epoch": 11.498327759197325, "grad_norm": 2.5455589294433594, "learning_rate": 4.566256575799406e-06, "loss": 0.3046, "num_input_tokens_seen": 17720496, "step": 17190 }, { "epoch": 11.501672240802675, "grad_norm": 3.26926851272583, "learning_rate": 4.5633490415697045e-06, "loss": 0.3485, "num_input_tokens_seen": 17725424, "step": 17195 }, { "epoch": 11.505016722408026, "grad_norm": 2.0850884914398193, "learning_rate": 4.560441656120758e-06, "loss": 0.4062, "num_input_tokens_seen": 17730960, "step": 17200 }, { "epoch": 11.508361204013378, "grad_norm": 2.7097363471984863, "learning_rate": 4.557534420443209e-06, "loss": 0.4053, "num_input_tokens_seen": 17736272, "step": 17205 }, { "epoch": 11.511705685618729, "grad_norm": 2.327479600906372, "learning_rate": 4.55462733552764e-06, "loss": 0.3893, "num_input_tokens_seen": 17740912, "step": 17210 }, { "epoch": 11.51505016722408, "grad_norm": 2.370560646057129, "learning_rate": 4.551720402364589e-06, "loss": 0.3684, "num_input_tokens_seen": 17745488, "step": 17215 }, { "epoch": 11.518394648829432, "grad_norm": 1.5592178106307983, "learning_rate": 4.548813621944538e-06, "loss": 0.3645, "num_input_tokens_seen": 17750320, "step": 17220 }, { "epoch": 11.521739130434783, "grad_norm": 2.2264926433563232, "learning_rate": 4.545906995257922e-06, "loss": 0.3639, "num_input_tokens_seen": 17754800, "step": 17225 }, { "epoch": 11.525083612040135, "grad_norm": 2.3830368518829346, "learning_rate": 4.543000523295119e-06, "loss": 0.358, "num_input_tokens_seen": 17759920, "step": 17230 }, { "epoch": 11.528428093645484, "grad_norm": 2.696014642715454, "learning_rate": 4.540094207046454e-06, "loss": 0.414, "num_input_tokens_seen": 17764688, "step": 17235 }, { "epoch": 11.531772575250836, "grad_norm": 2.2080206871032715, "learning_rate": 4.537188047502202e-06, "loss": 0.4003, "num_input_tokens_seen": 17770672, "step": 17240 }, { "epoch": 11.535117056856187, "grad_norm": 2.563408851623535, "learning_rate": 4.534282045652581e-06, "loss": 0.4567, "num_input_tokens_seen": 17775472, "step": 17245 }, { "epoch": 11.538461538461538, "grad_norm": 1.7683014869689941, "learning_rate": 4.531376202487762e-06, "loss": 0.3708, "num_input_tokens_seen": 17780720, "step": 17250 }, { "epoch": 11.54180602006689, "grad_norm": 2.9138119220733643, "learning_rate": 4.528470518997854e-06, "loss": 0.4086, "num_input_tokens_seen": 17785968, "step": 17255 }, { "epoch": 11.545150501672241, "grad_norm": 2.6178154945373535, "learning_rate": 4.525564996172915e-06, "loss": 0.4249, "num_input_tokens_seen": 17791184, "step": 17260 }, { "epoch": 11.548494983277592, "grad_norm": 2.120774269104004, "learning_rate": 4.52265963500295e-06, "loss": 0.3874, "num_input_tokens_seen": 17795984, "step": 17265 }, { "epoch": 11.551839464882944, "grad_norm": 3.135554313659668, "learning_rate": 4.519754436477906e-06, "loss": 0.3656, "num_input_tokens_seen": 17800624, "step": 17270 }, { "epoch": 11.555183946488294, "grad_norm": 1.9469536542892456, "learning_rate": 4.516849401587677e-06, "loss": 0.3797, "num_input_tokens_seen": 17806000, "step": 17275 }, { "epoch": 11.558528428093645, "grad_norm": 2.640050172805786, "learning_rate": 4.5139445313220995e-06, "loss": 0.4367, "num_input_tokens_seen": 17810832, "step": 17280 }, { "epoch": 11.561872909698996, "grad_norm": 2.0420446395874023, "learning_rate": 4.511039826670954e-06, "loss": 0.4155, "num_input_tokens_seen": 17815792, "step": 17285 }, { "epoch": 11.565217391304348, "grad_norm": 2.4906442165374756, "learning_rate": 4.508135288623964e-06, "loss": 0.3868, "num_input_tokens_seen": 17820752, "step": 17290 }, { "epoch": 11.568561872909699, "grad_norm": 1.8983429670333862, "learning_rate": 4.5052309181708e-06, "loss": 0.3846, "num_input_tokens_seen": 17825200, "step": 17295 }, { "epoch": 11.57190635451505, "grad_norm": 1.9133961200714111, "learning_rate": 4.502326716301071e-06, "loss": 0.42, "num_input_tokens_seen": 17829840, "step": 17300 }, { "epoch": 11.575250836120402, "grad_norm": 2.892026424407959, "learning_rate": 4.499422684004329e-06, "loss": 0.4032, "num_input_tokens_seen": 17834320, "step": 17305 }, { "epoch": 11.578595317725753, "grad_norm": 3.1100409030914307, "learning_rate": 4.496518822270071e-06, "loss": 0.4708, "num_input_tokens_seen": 17839856, "step": 17310 }, { "epoch": 11.581939799331103, "grad_norm": 2.296496868133545, "learning_rate": 4.493615132087734e-06, "loss": 0.3385, "num_input_tokens_seen": 17844560, "step": 17315 }, { "epoch": 11.585284280936454, "grad_norm": 2.2085518836975098, "learning_rate": 4.490711614446698e-06, "loss": 0.4109, "num_input_tokens_seen": 17850000, "step": 17320 }, { "epoch": 11.588628762541806, "grad_norm": 2.1289138793945312, "learning_rate": 4.4878082703362824e-06, "loss": 0.3815, "num_input_tokens_seen": 17854448, "step": 17325 }, { "epoch": 11.591973244147157, "grad_norm": 2.0412473678588867, "learning_rate": 4.484905100745747e-06, "loss": 0.3938, "num_input_tokens_seen": 17859024, "step": 17330 }, { "epoch": 11.595317725752508, "grad_norm": 1.8897936344146729, "learning_rate": 4.482002106664292e-06, "loss": 0.4134, "num_input_tokens_seen": 17864528, "step": 17335 }, { "epoch": 11.59866220735786, "grad_norm": 2.2198803424835205, "learning_rate": 4.479099289081063e-06, "loss": 0.3762, "num_input_tokens_seen": 17870032, "step": 17340 }, { "epoch": 11.602006688963211, "grad_norm": 2.061789035797119, "learning_rate": 4.47619664898514e-06, "loss": 0.4135, "num_input_tokens_seen": 17874928, "step": 17345 }, { "epoch": 11.605351170568563, "grad_norm": 1.8808023929595947, "learning_rate": 4.473294187365542e-06, "loss": 0.2687, "num_input_tokens_seen": 17880528, "step": 17350 }, { "epoch": 11.608695652173914, "grad_norm": 1.9597983360290527, "learning_rate": 4.470391905211231e-06, "loss": 0.3613, "num_input_tokens_seen": 17885328, "step": 17355 }, { "epoch": 11.612040133779264, "grad_norm": 2.2686567306518555, "learning_rate": 4.467489803511107e-06, "loss": 0.3743, "num_input_tokens_seen": 17891024, "step": 17360 }, { "epoch": 11.615384615384615, "grad_norm": 1.6370792388916016, "learning_rate": 4.4645878832540055e-06, "loss": 0.3887, "num_input_tokens_seen": 17896784, "step": 17365 }, { "epoch": 11.618729096989966, "grad_norm": 2.1404638290405273, "learning_rate": 4.4616861454287046e-06, "loss": 0.4379, "num_input_tokens_seen": 17903216, "step": 17370 }, { "epoch": 11.622073578595318, "grad_norm": 2.0300004482269287, "learning_rate": 4.458784591023916e-06, "loss": 0.3053, "num_input_tokens_seen": 17908688, "step": 17375 }, { "epoch": 11.62541806020067, "grad_norm": 2.1717939376831055, "learning_rate": 4.45588322102829e-06, "loss": 0.3373, "num_input_tokens_seen": 17913584, "step": 17380 }, { "epoch": 11.62876254180602, "grad_norm": 2.126563787460327, "learning_rate": 4.452982036430418e-06, "loss": 0.369, "num_input_tokens_seen": 17918192, "step": 17385 }, { "epoch": 11.632107023411372, "grad_norm": 1.8957985639572144, "learning_rate": 4.450081038218823e-06, "loss": 0.4152, "num_input_tokens_seen": 17923280, "step": 17390 }, { "epoch": 11.635451505016722, "grad_norm": 2.2755510807037354, "learning_rate": 4.447180227381968e-06, "loss": 0.3918, "num_input_tokens_seen": 17927856, "step": 17395 }, { "epoch": 11.638795986622073, "grad_norm": 2.2094264030456543, "learning_rate": 4.444279604908247e-06, "loss": 0.3474, "num_input_tokens_seen": 17933456, "step": 17400 }, { "epoch": 11.642140468227424, "grad_norm": 3.400768995285034, "learning_rate": 4.441379171785999e-06, "loss": 0.3999, "num_input_tokens_seen": 17937872, "step": 17405 }, { "epoch": 11.645484949832776, "grad_norm": 2.1294331550598145, "learning_rate": 4.43847892900349e-06, "loss": 0.313, "num_input_tokens_seen": 17942576, "step": 17410 }, { "epoch": 11.648829431438127, "grad_norm": 1.9845519065856934, "learning_rate": 4.435578877548925e-06, "loss": 0.3779, "num_input_tokens_seen": 17947472, "step": 17415 }, { "epoch": 11.652173913043478, "grad_norm": 2.181323528289795, "learning_rate": 4.43267901841044e-06, "loss": 0.3243, "num_input_tokens_seen": 17953488, "step": 17420 }, { "epoch": 11.65551839464883, "grad_norm": 1.8584058284759521, "learning_rate": 4.429779352576114e-06, "loss": 0.3513, "num_input_tokens_seen": 17957936, "step": 17425 }, { "epoch": 11.658862876254181, "grad_norm": 1.9858335256576538, "learning_rate": 4.426879881033952e-06, "loss": 0.5357, "num_input_tokens_seen": 17963792, "step": 17430 }, { "epoch": 11.662207357859533, "grad_norm": 2.673706293106079, "learning_rate": 4.423980604771895e-06, "loss": 0.4022, "num_input_tokens_seen": 17968656, "step": 17435 }, { "epoch": 11.665551839464882, "grad_norm": 2.8796684741973877, "learning_rate": 4.421081524777818e-06, "loss": 0.4817, "num_input_tokens_seen": 17973584, "step": 17440 }, { "epoch": 11.668896321070234, "grad_norm": 2.0663607120513916, "learning_rate": 4.418182642039528e-06, "loss": 0.3649, "num_input_tokens_seen": 17978512, "step": 17445 }, { "epoch": 11.672240802675585, "grad_norm": 1.8641117811203003, "learning_rate": 4.4152839575447684e-06, "loss": 0.4276, "num_input_tokens_seen": 17983728, "step": 17450 }, { "epoch": 11.675585284280936, "grad_norm": 2.1143813133239746, "learning_rate": 4.41238547228121e-06, "loss": 0.3481, "num_input_tokens_seen": 17989840, "step": 17455 }, { "epoch": 11.678929765886288, "grad_norm": 2.0811374187469482, "learning_rate": 4.40948718723646e-06, "loss": 0.3787, "num_input_tokens_seen": 17994256, "step": 17460 }, { "epoch": 11.68227424749164, "grad_norm": 2.0163278579711914, "learning_rate": 4.406589103398054e-06, "loss": 0.4422, "num_input_tokens_seen": 17999280, "step": 17465 }, { "epoch": 11.68561872909699, "grad_norm": 2.5975077152252197, "learning_rate": 4.403691221753461e-06, "loss": 0.3239, "num_input_tokens_seen": 18003664, "step": 17470 }, { "epoch": 11.68896321070234, "grad_norm": 2.160566806793213, "learning_rate": 4.400793543290083e-06, "loss": 0.3659, "num_input_tokens_seen": 18009168, "step": 17475 }, { "epoch": 11.692307692307692, "grad_norm": 2.1679253578186035, "learning_rate": 4.397896068995248e-06, "loss": 0.4301, "num_input_tokens_seen": 18014672, "step": 17480 }, { "epoch": 11.695652173913043, "grad_norm": 2.357771873474121, "learning_rate": 4.394998799856217e-06, "loss": 0.3343, "num_input_tokens_seen": 18020080, "step": 17485 }, { "epoch": 11.698996655518394, "grad_norm": 2.0926287174224854, "learning_rate": 4.392101736860179e-06, "loss": 0.3661, "num_input_tokens_seen": 18024784, "step": 17490 }, { "epoch": 11.702341137123746, "grad_norm": 2.630113124847412, "learning_rate": 4.38920488099426e-06, "loss": 0.4013, "num_input_tokens_seen": 18029936, "step": 17495 }, { "epoch": 11.705685618729097, "grad_norm": 1.602410912513733, "learning_rate": 4.386308233245508e-06, "loss": 0.3859, "num_input_tokens_seen": 18035248, "step": 17500 }, { "epoch": 11.709030100334449, "grad_norm": 1.9334864616394043, "learning_rate": 4.383411794600899e-06, "loss": 0.4244, "num_input_tokens_seen": 18040208, "step": 17505 }, { "epoch": 11.7123745819398, "grad_norm": 2.8880069255828857, "learning_rate": 4.380515566047345e-06, "loss": 0.3572, "num_input_tokens_seen": 18044784, "step": 17510 }, { "epoch": 11.715719063545151, "grad_norm": 2.6221773624420166, "learning_rate": 4.37761954857168e-06, "loss": 0.4415, "num_input_tokens_seen": 18049488, "step": 17515 }, { "epoch": 11.719063545150501, "grad_norm": 3.436469554901123, "learning_rate": 4.374723743160671e-06, "loss": 0.3459, "num_input_tokens_seen": 18054480, "step": 17520 }, { "epoch": 11.722408026755852, "grad_norm": 2.482581615447998, "learning_rate": 4.3718281508010095e-06, "loss": 0.3673, "num_input_tokens_seen": 18059312, "step": 17525 }, { "epoch": 11.725752508361204, "grad_norm": 3.1528804302215576, "learning_rate": 4.3689327724793144e-06, "loss": 0.4529, "num_input_tokens_seen": 18065008, "step": 17530 }, { "epoch": 11.729096989966555, "grad_norm": 2.5770862102508545, "learning_rate": 4.36603760918213e-06, "loss": 0.4942, "num_input_tokens_seen": 18070448, "step": 17535 }, { "epoch": 11.732441471571907, "grad_norm": 2.2947206497192383, "learning_rate": 4.363142661895934e-06, "loss": 0.412, "num_input_tokens_seen": 18075664, "step": 17540 }, { "epoch": 11.735785953177258, "grad_norm": 2.6444151401519775, "learning_rate": 4.360247931607125e-06, "loss": 0.4714, "num_input_tokens_seen": 18081808, "step": 17545 }, { "epoch": 11.73913043478261, "grad_norm": 2.2226390838623047, "learning_rate": 4.357353419302028e-06, "loss": 0.3521, "num_input_tokens_seen": 18086256, "step": 17550 }, { "epoch": 11.742474916387959, "grad_norm": 2.3638668060302734, "learning_rate": 4.354459125966894e-06, "loss": 0.3991, "num_input_tokens_seen": 18091056, "step": 17555 }, { "epoch": 11.74581939799331, "grad_norm": 2.1254215240478516, "learning_rate": 4.351565052587903e-06, "loss": 0.5137, "num_input_tokens_seen": 18096112, "step": 17560 }, { "epoch": 11.749163879598662, "grad_norm": 3.1893210411071777, "learning_rate": 4.348671200151155e-06, "loss": 0.4069, "num_input_tokens_seen": 18102064, "step": 17565 }, { "epoch": 11.752508361204013, "grad_norm": 2.453279733657837, "learning_rate": 4.345777569642677e-06, "loss": 0.4184, "num_input_tokens_seen": 18107056, "step": 17570 }, { "epoch": 11.755852842809364, "grad_norm": 2.611475706100464, "learning_rate": 4.342884162048421e-06, "loss": 0.3836, "num_input_tokens_seen": 18112560, "step": 17575 }, { "epoch": 11.759197324414716, "grad_norm": 1.9869892597198486, "learning_rate": 4.33999097835426e-06, "loss": 0.3328, "num_input_tokens_seen": 18117136, "step": 17580 }, { "epoch": 11.762541806020067, "grad_norm": 2.1253185272216797, "learning_rate": 4.337098019545996e-06, "loss": 0.357, "num_input_tokens_seen": 18121808, "step": 17585 }, { "epoch": 11.765886287625419, "grad_norm": 3.2696948051452637, "learning_rate": 4.334205286609349e-06, "loss": 0.3937, "num_input_tokens_seen": 18126480, "step": 17590 }, { "epoch": 11.76923076923077, "grad_norm": 2.754746198654175, "learning_rate": 4.331312780529968e-06, "loss": 0.456, "num_input_tokens_seen": 18132496, "step": 17595 }, { "epoch": 11.77257525083612, "grad_norm": 1.6907844543457031, "learning_rate": 4.328420502293414e-06, "loss": 0.4378, "num_input_tokens_seen": 18136976, "step": 17600 }, { "epoch": 11.775919732441471, "grad_norm": 2.0181450843811035, "learning_rate": 4.325528452885184e-06, "loss": 0.4314, "num_input_tokens_seen": 18141712, "step": 17605 }, { "epoch": 11.779264214046822, "grad_norm": 2.255474805831909, "learning_rate": 4.322636633290688e-06, "loss": 0.4381, "num_input_tokens_seen": 18146384, "step": 17610 }, { "epoch": 11.782608695652174, "grad_norm": 1.9673527479171753, "learning_rate": 4.319745044495262e-06, "loss": 0.3929, "num_input_tokens_seen": 18152240, "step": 17615 }, { "epoch": 11.785953177257525, "grad_norm": 1.8228658437728882, "learning_rate": 4.3168536874841585e-06, "loss": 0.4124, "num_input_tokens_seen": 18157328, "step": 17620 }, { "epoch": 11.789297658862877, "grad_norm": 2.104818105697632, "learning_rate": 4.313962563242554e-06, "loss": 0.3863, "num_input_tokens_seen": 18162864, "step": 17625 }, { "epoch": 11.792642140468228, "grad_norm": 2.3380725383758545, "learning_rate": 4.311071672755549e-06, "loss": 0.4735, "num_input_tokens_seen": 18168240, "step": 17630 }, { "epoch": 11.79598662207358, "grad_norm": 2.5061028003692627, "learning_rate": 4.30818101700816e-06, "loss": 0.3913, "num_input_tokens_seen": 18173776, "step": 17635 }, { "epoch": 11.799331103678929, "grad_norm": 2.2257795333862305, "learning_rate": 4.305290596985324e-06, "loss": 0.3976, "num_input_tokens_seen": 18179120, "step": 17640 }, { "epoch": 11.80267558528428, "grad_norm": 3.064922332763672, "learning_rate": 4.3024004136718955e-06, "loss": 0.333, "num_input_tokens_seen": 18184368, "step": 17645 }, { "epoch": 11.806020066889632, "grad_norm": 1.9713389873504639, "learning_rate": 4.299510468052656e-06, "loss": 0.3775, "num_input_tokens_seen": 18190000, "step": 17650 }, { "epoch": 11.809364548494983, "grad_norm": 2.5146753787994385, "learning_rate": 4.296620761112299e-06, "loss": 0.3737, "num_input_tokens_seen": 18194768, "step": 17655 }, { "epoch": 11.812709030100335, "grad_norm": 2.1623570919036865, "learning_rate": 4.293731293835438e-06, "loss": 0.3822, "num_input_tokens_seen": 18199536, "step": 17660 }, { "epoch": 11.816053511705686, "grad_norm": 2.7634904384613037, "learning_rate": 4.290842067206608e-06, "loss": 0.3926, "num_input_tokens_seen": 18204336, "step": 17665 }, { "epoch": 11.819397993311037, "grad_norm": 4.645077228546143, "learning_rate": 4.287953082210256e-06, "loss": 0.4911, "num_input_tokens_seen": 18208976, "step": 17670 }, { "epoch": 11.822742474916389, "grad_norm": 1.8431670665740967, "learning_rate": 4.285064339830753e-06, "loss": 0.3079, "num_input_tokens_seen": 18214352, "step": 17675 }, { "epoch": 11.826086956521738, "grad_norm": 1.4393447637557983, "learning_rate": 4.282175841052386e-06, "loss": 0.3653, "num_input_tokens_seen": 18219120, "step": 17680 }, { "epoch": 11.82943143812709, "grad_norm": 3.0457839965820312, "learning_rate": 4.279287586859355e-06, "loss": 0.4513, "num_input_tokens_seen": 18224272, "step": 17685 }, { "epoch": 11.832775919732441, "grad_norm": 2.7025721073150635, "learning_rate": 4.276399578235778e-06, "loss": 0.3975, "num_input_tokens_seen": 18229104, "step": 17690 }, { "epoch": 11.836120401337793, "grad_norm": 3.326321601867676, "learning_rate": 4.273511816165695e-06, "loss": 0.4134, "num_input_tokens_seen": 18234416, "step": 17695 }, { "epoch": 11.839464882943144, "grad_norm": 2.1425178050994873, "learning_rate": 4.270624301633056e-06, "loss": 0.4637, "num_input_tokens_seen": 18240496, "step": 17700 }, { "epoch": 11.842809364548495, "grad_norm": 2.727480173110962, "learning_rate": 4.267737035621726e-06, "loss": 0.4409, "num_input_tokens_seen": 18245616, "step": 17705 }, { "epoch": 11.846153846153847, "grad_norm": 3.002971887588501, "learning_rate": 4.26485001911549e-06, "loss": 0.4652, "num_input_tokens_seen": 18250224, "step": 17710 }, { "epoch": 11.849498327759198, "grad_norm": 2.650115489959717, "learning_rate": 4.261963253098043e-06, "loss": 0.4266, "num_input_tokens_seen": 18254800, "step": 17715 }, { "epoch": 11.852842809364548, "grad_norm": 2.7099475860595703, "learning_rate": 4.259076738553002e-06, "loss": 0.4139, "num_input_tokens_seen": 18260560, "step": 17720 }, { "epoch": 11.856187290969899, "grad_norm": 1.4136652946472168, "learning_rate": 4.25619047646389e-06, "loss": 0.3461, "num_input_tokens_seen": 18265136, "step": 17725 }, { "epoch": 11.85953177257525, "grad_norm": 1.9488970041275024, "learning_rate": 4.253304467814149e-06, "loss": 0.4083, "num_input_tokens_seen": 18271152, "step": 17730 }, { "epoch": 11.862876254180602, "grad_norm": 2.412869453430176, "learning_rate": 4.25041871358713e-06, "loss": 0.4142, "num_input_tokens_seen": 18275888, "step": 17735 }, { "epoch": 11.866220735785953, "grad_norm": 1.8184924125671387, "learning_rate": 4.247533214766105e-06, "loss": 0.3965, "num_input_tokens_seen": 18280816, "step": 17740 }, { "epoch": 11.869565217391305, "grad_norm": 2.163316011428833, "learning_rate": 4.244647972334252e-06, "loss": 0.4214, "num_input_tokens_seen": 18286224, "step": 17745 }, { "epoch": 11.872909698996656, "grad_norm": 3.0854604244232178, "learning_rate": 4.241762987274664e-06, "loss": 0.4185, "num_input_tokens_seen": 18290960, "step": 17750 }, { "epoch": 11.876254180602007, "grad_norm": 2.5083367824554443, "learning_rate": 4.2388782605703485e-06, "loss": 0.4503, "num_input_tokens_seen": 18296048, "step": 17755 }, { "epoch": 11.879598662207357, "grad_norm": 1.9735842943191528, "learning_rate": 4.235993793204219e-06, "loss": 0.406, "num_input_tokens_seen": 18301584, "step": 17760 }, { "epoch": 11.882943143812708, "grad_norm": 1.8311303853988647, "learning_rate": 4.233109586159108e-06, "loss": 0.3688, "num_input_tokens_seen": 18306608, "step": 17765 }, { "epoch": 11.88628762541806, "grad_norm": 1.7657073736190796, "learning_rate": 4.230225640417756e-06, "loss": 0.4664, "num_input_tokens_seen": 18311792, "step": 17770 }, { "epoch": 11.889632107023411, "grad_norm": 2.4329922199249268, "learning_rate": 4.227341956962814e-06, "loss": 0.4322, "num_input_tokens_seen": 18316656, "step": 17775 }, { "epoch": 11.892976588628763, "grad_norm": 2.2813913822174072, "learning_rate": 4.22445853677684e-06, "loss": 0.3793, "num_input_tokens_seen": 18322416, "step": 17780 }, { "epoch": 11.896321070234114, "grad_norm": 2.6667003631591797, "learning_rate": 4.22157538084231e-06, "loss": 0.3536, "num_input_tokens_seen": 18327088, "step": 17785 }, { "epoch": 11.899665551839465, "grad_norm": 3.170212984085083, "learning_rate": 4.218692490141607e-06, "loss": 0.3206, "num_input_tokens_seen": 18331984, "step": 17790 }, { "epoch": 11.903010033444817, "grad_norm": 2.166489362716675, "learning_rate": 4.215809865657021e-06, "loss": 0.3597, "num_input_tokens_seen": 18336720, "step": 17795 }, { "epoch": 11.906354515050166, "grad_norm": 4.8045783042907715, "learning_rate": 4.212927508370752e-06, "loss": 0.4076, "num_input_tokens_seen": 18341840, "step": 17800 }, { "epoch": 11.909698996655518, "grad_norm": 3.216792345046997, "learning_rate": 4.210045419264911e-06, "loss": 0.4514, "num_input_tokens_seen": 18347120, "step": 17805 }, { "epoch": 11.91304347826087, "grad_norm": 3.261932373046875, "learning_rate": 4.2071635993215175e-06, "loss": 0.4177, "num_input_tokens_seen": 18352048, "step": 17810 }, { "epoch": 11.91638795986622, "grad_norm": 2.2958621978759766, "learning_rate": 4.204282049522499e-06, "loss": 0.4714, "num_input_tokens_seen": 18357680, "step": 17815 }, { "epoch": 11.919732441471572, "grad_norm": 2.95310378074646, "learning_rate": 4.201400770849689e-06, "loss": 0.5058, "num_input_tokens_seen": 18363632, "step": 17820 }, { "epoch": 11.923076923076923, "grad_norm": 2.907644510269165, "learning_rate": 4.198519764284829e-06, "loss": 0.5111, "num_input_tokens_seen": 18368848, "step": 17825 }, { "epoch": 11.926421404682275, "grad_norm": 2.0665056705474854, "learning_rate": 4.1956390308095725e-06, "loss": 0.3866, "num_input_tokens_seen": 18375024, "step": 17830 }, { "epoch": 11.929765886287626, "grad_norm": 1.9785560369491577, "learning_rate": 4.192758571405474e-06, "loss": 0.3381, "num_input_tokens_seen": 18379888, "step": 17835 }, { "epoch": 11.933110367892976, "grad_norm": 2.486271381378174, "learning_rate": 4.1898783870539975e-06, "loss": 0.4462, "num_input_tokens_seen": 18386576, "step": 17840 }, { "epoch": 11.936454849498327, "grad_norm": 2.2560324668884277, "learning_rate": 4.18699847873651e-06, "loss": 0.3714, "num_input_tokens_seen": 18391344, "step": 17845 }, { "epoch": 11.939799331103679, "grad_norm": 2.687905788421631, "learning_rate": 4.18411884743429e-06, "loss": 0.4358, "num_input_tokens_seen": 18396528, "step": 17850 }, { "epoch": 11.94314381270903, "grad_norm": 2.1554625034332275, "learning_rate": 4.181239494128517e-06, "loss": 0.3753, "num_input_tokens_seen": 18401392, "step": 17855 }, { "epoch": 11.946488294314381, "grad_norm": 2.0327248573303223, "learning_rate": 4.178360419800281e-06, "loss": 0.4572, "num_input_tokens_seen": 18407728, "step": 17860 }, { "epoch": 11.949832775919733, "grad_norm": 2.5753159523010254, "learning_rate": 4.17548162543057e-06, "loss": 0.4314, "num_input_tokens_seen": 18413168, "step": 17865 }, { "epoch": 11.953177257525084, "grad_norm": 2.576591730117798, "learning_rate": 4.1726031120002795e-06, "loss": 0.3294, "num_input_tokens_seen": 18418896, "step": 17870 }, { "epoch": 11.956521739130435, "grad_norm": 2.343768358230591, "learning_rate": 4.169724880490213e-06, "loss": 0.3204, "num_input_tokens_seen": 18424272, "step": 17875 }, { "epoch": 11.959866220735787, "grad_norm": 13.849658012390137, "learning_rate": 4.166846931881074e-06, "loss": 0.3449, "num_input_tokens_seen": 18429616, "step": 17880 }, { "epoch": 11.963210702341136, "grad_norm": 2.615030527114868, "learning_rate": 4.163969267153469e-06, "loss": 0.4171, "num_input_tokens_seen": 18434544, "step": 17885 }, { "epoch": 11.966555183946488, "grad_norm": 2.0797579288482666, "learning_rate": 4.161091887287908e-06, "loss": 0.4057, "num_input_tokens_seen": 18439632, "step": 17890 }, { "epoch": 11.96989966555184, "grad_norm": 1.9268749952316284, "learning_rate": 4.158214793264808e-06, "loss": 0.3508, "num_input_tokens_seen": 18444496, "step": 17895 }, { "epoch": 11.97324414715719, "grad_norm": 2.9853975772857666, "learning_rate": 4.155337986064485e-06, "loss": 0.408, "num_input_tokens_seen": 18450576, "step": 17900 }, { "epoch": 11.976588628762542, "grad_norm": 2.5129621028900146, "learning_rate": 4.152461466667156e-06, "loss": 0.4495, "num_input_tokens_seen": 18455792, "step": 17905 }, { "epoch": 11.979933110367893, "grad_norm": 2.676039457321167, "learning_rate": 4.149585236052945e-06, "loss": 0.5094, "num_input_tokens_seen": 18461232, "step": 17910 }, { "epoch": 11.983277591973245, "grad_norm": 2.8399453163146973, "learning_rate": 4.146709295201871e-06, "loss": 0.41, "num_input_tokens_seen": 18466832, "step": 17915 }, { "epoch": 11.986622073578594, "grad_norm": 2.61948823928833, "learning_rate": 4.1438336450938606e-06, "loss": 0.3994, "num_input_tokens_seen": 18471856, "step": 17920 }, { "epoch": 11.989966555183946, "grad_norm": 2.9620893001556396, "learning_rate": 4.140958286708738e-06, "loss": 0.4737, "num_input_tokens_seen": 18478224, "step": 17925 }, { "epoch": 11.993311036789297, "grad_norm": 2.6813156604766846, "learning_rate": 4.138083221026229e-06, "loss": 0.3612, "num_input_tokens_seen": 18482576, "step": 17930 }, { "epoch": 11.996655518394649, "grad_norm": 3.1160168647766113, "learning_rate": 4.135208449025955e-06, "loss": 0.3985, "num_input_tokens_seen": 18487408, "step": 17935 }, { "epoch": 12.0, "grad_norm": 2.4410834312438965, "learning_rate": 4.132333971687448e-06, "loss": 0.3441, "num_input_tokens_seen": 18492000, "step": 17940 }, { "epoch": 12.0, "eval_loss": 0.5091866254806519, "eval_runtime": 37.5677, "eval_samples_per_second": 39.795, "eval_steps_per_second": 9.955, "num_input_tokens_seen": 18492000, "step": 17940 }, { "epoch": 12.003344481605351, "grad_norm": 2.4330894947052, "learning_rate": 4.129459789990129e-06, "loss": 0.3748, "num_input_tokens_seen": 18497376, "step": 17945 }, { "epoch": 12.006688963210703, "grad_norm": 2.9107534885406494, "learning_rate": 4.126585904913322e-06, "loss": 0.3937, "num_input_tokens_seen": 18502368, "step": 17950 }, { "epoch": 12.010033444816054, "grad_norm": 2.0924570560455322, "learning_rate": 4.123712317436254e-06, "loss": 0.3047, "num_input_tokens_seen": 18506432, "step": 17955 }, { "epoch": 12.013377926421406, "grad_norm": 3.7489235401153564, "learning_rate": 4.1208390285380405e-06, "loss": 0.4997, "num_input_tokens_seen": 18511104, "step": 17960 }, { "epoch": 12.016722408026755, "grad_norm": 3.3290326595306396, "learning_rate": 4.117966039197709e-06, "loss": 0.35, "num_input_tokens_seen": 18516096, "step": 17965 }, { "epoch": 12.020066889632107, "grad_norm": 2.081197738647461, "learning_rate": 4.115093350394174e-06, "loss": 0.3232, "num_input_tokens_seen": 18521536, "step": 17970 }, { "epoch": 12.023411371237458, "grad_norm": 2.475625991821289, "learning_rate": 4.112220963106252e-06, "loss": 0.3769, "num_input_tokens_seen": 18526848, "step": 17975 }, { "epoch": 12.02675585284281, "grad_norm": 2.0953683853149414, "learning_rate": 4.109348878312652e-06, "loss": 0.3346, "num_input_tokens_seen": 18531936, "step": 17980 }, { "epoch": 12.03010033444816, "grad_norm": 2.114516496658325, "learning_rate": 4.10647709699199e-06, "loss": 0.3652, "num_input_tokens_seen": 18537216, "step": 17985 }, { "epoch": 12.033444816053512, "grad_norm": 1.772823452949524, "learning_rate": 4.103605620122771e-06, "loss": 0.3708, "num_input_tokens_seen": 18542496, "step": 17990 }, { "epoch": 12.036789297658864, "grad_norm": 2.368745803833008, "learning_rate": 4.100734448683395e-06, "loss": 0.3955, "num_input_tokens_seen": 18547616, "step": 17995 }, { "epoch": 12.040133779264215, "grad_norm": 2.0965728759765625, "learning_rate": 4.097863583652162e-06, "loss": 0.307, "num_input_tokens_seen": 18551712, "step": 18000 }, { "epoch": 12.043478260869565, "grad_norm": 2.2650833129882812, "learning_rate": 4.094993026007267e-06, "loss": 0.4076, "num_input_tokens_seen": 18557408, "step": 18005 }, { "epoch": 12.046822742474916, "grad_norm": 2.0998902320861816, "learning_rate": 4.092122776726798e-06, "loss": 0.3474, "num_input_tokens_seen": 18562912, "step": 18010 }, { "epoch": 12.050167224080267, "grad_norm": 1.78520929813385, "learning_rate": 4.089252836788742e-06, "loss": 0.3358, "num_input_tokens_seen": 18567424, "step": 18015 }, { "epoch": 12.053511705685619, "grad_norm": 2.9249002933502197, "learning_rate": 4.086383207170977e-06, "loss": 0.4088, "num_input_tokens_seen": 18572320, "step": 18020 }, { "epoch": 12.05685618729097, "grad_norm": 2.428623914718628, "learning_rate": 4.0835138888512735e-06, "loss": 0.3444, "num_input_tokens_seen": 18577312, "step": 18025 }, { "epoch": 12.060200668896321, "grad_norm": 1.577877163887024, "learning_rate": 4.080644882807304e-06, "loss": 0.396, "num_input_tokens_seen": 18583104, "step": 18030 }, { "epoch": 12.063545150501673, "grad_norm": 2.0409109592437744, "learning_rate": 4.077776190016626e-06, "loss": 0.4126, "num_input_tokens_seen": 18588256, "step": 18035 }, { "epoch": 12.066889632107024, "grad_norm": 2.448516368865967, "learning_rate": 4.074907811456695e-06, "loss": 0.4349, "num_input_tokens_seen": 18593440, "step": 18040 }, { "epoch": 12.070234113712374, "grad_norm": 3.4912593364715576, "learning_rate": 4.072039748104856e-06, "loss": 0.4131, "num_input_tokens_seen": 18598336, "step": 18045 }, { "epoch": 12.073578595317725, "grad_norm": 3.000560998916626, "learning_rate": 4.06917200093835e-06, "loss": 0.4026, "num_input_tokens_seen": 18604256, "step": 18050 }, { "epoch": 12.076923076923077, "grad_norm": 2.2229607105255127, "learning_rate": 4.0663045709343085e-06, "loss": 0.4913, "num_input_tokens_seen": 18609408, "step": 18055 }, { "epoch": 12.080267558528428, "grad_norm": 2.7636733055114746, "learning_rate": 4.063437459069757e-06, "loss": 0.3891, "num_input_tokens_seen": 18614432, "step": 18060 }, { "epoch": 12.08361204013378, "grad_norm": 1.9106943607330322, "learning_rate": 4.060570666321611e-06, "loss": 0.398, "num_input_tokens_seen": 18619296, "step": 18065 }, { "epoch": 12.08695652173913, "grad_norm": 2.264176845550537, "learning_rate": 4.057704193666672e-06, "loss": 0.354, "num_input_tokens_seen": 18623904, "step": 18070 }, { "epoch": 12.090301003344482, "grad_norm": 2.2015414237976074, "learning_rate": 4.054838042081645e-06, "loss": 0.3416, "num_input_tokens_seen": 18629024, "step": 18075 }, { "epoch": 12.093645484949834, "grad_norm": 2.0068130493164062, "learning_rate": 4.051972212543114e-06, "loss": 0.3125, "num_input_tokens_seen": 18633920, "step": 18080 }, { "epoch": 12.096989966555183, "grad_norm": 2.0108776092529297, "learning_rate": 4.049106706027559e-06, "loss": 0.409, "num_input_tokens_seen": 18639328, "step": 18085 }, { "epoch": 12.100334448160535, "grad_norm": 2.039717674255371, "learning_rate": 4.046241523511347e-06, "loss": 0.3511, "num_input_tokens_seen": 18644736, "step": 18090 }, { "epoch": 12.103678929765886, "grad_norm": 1.4846794605255127, "learning_rate": 4.0433766659707375e-06, "loss": 0.3921, "num_input_tokens_seen": 18650016, "step": 18095 }, { "epoch": 12.107023411371237, "grad_norm": 2.2388525009155273, "learning_rate": 4.040512134381876e-06, "loss": 0.3909, "num_input_tokens_seen": 18655264, "step": 18100 }, { "epoch": 12.110367892976589, "grad_norm": 2.9089627265930176, "learning_rate": 4.037647929720803e-06, "loss": 0.4114, "num_input_tokens_seen": 18660512, "step": 18105 }, { "epoch": 12.11371237458194, "grad_norm": 2.684321403503418, "learning_rate": 4.034784052963439e-06, "loss": 0.4486, "num_input_tokens_seen": 18665728, "step": 18110 }, { "epoch": 12.117056856187292, "grad_norm": 2.0142722129821777, "learning_rate": 4.0319205050855965e-06, "loss": 0.3589, "num_input_tokens_seen": 18670560, "step": 18115 }, { "epoch": 12.120401337792643, "grad_norm": 2.0041942596435547, "learning_rate": 4.029057287062982e-06, "loss": 0.3074, "num_input_tokens_seen": 18675744, "step": 18120 }, { "epoch": 12.123745819397993, "grad_norm": 2.034862518310547, "learning_rate": 4.02619439987118e-06, "loss": 0.376, "num_input_tokens_seen": 18680768, "step": 18125 }, { "epoch": 12.127090301003344, "grad_norm": 2.3219897747039795, "learning_rate": 4.023331844485669e-06, "loss": 0.4118, "num_input_tokens_seen": 18686304, "step": 18130 }, { "epoch": 12.130434782608695, "grad_norm": 2.4500131607055664, "learning_rate": 4.020469621881809e-06, "loss": 0.3844, "num_input_tokens_seen": 18690912, "step": 18135 }, { "epoch": 12.133779264214047, "grad_norm": 2.337099313735962, "learning_rate": 4.01760773303485e-06, "loss": 0.4186, "num_input_tokens_seen": 18695648, "step": 18140 }, { "epoch": 12.137123745819398, "grad_norm": 1.5821959972381592, "learning_rate": 4.01474617891993e-06, "loss": 0.3765, "num_input_tokens_seen": 18700960, "step": 18145 }, { "epoch": 12.14046822742475, "grad_norm": 2.438857316970825, "learning_rate": 4.011884960512068e-06, "loss": 0.3661, "num_input_tokens_seen": 18706944, "step": 18150 }, { "epoch": 12.143812709030101, "grad_norm": 2.310631513595581, "learning_rate": 4.009024078786175e-06, "loss": 0.3246, "num_input_tokens_seen": 18711168, "step": 18155 }, { "epoch": 12.147157190635452, "grad_norm": 2.248894453048706, "learning_rate": 4.006163534717039e-06, "loss": 0.3845, "num_input_tokens_seen": 18717312, "step": 18160 }, { "epoch": 12.150501672240802, "grad_norm": 2.5751075744628906, "learning_rate": 4.003303329279342e-06, "loss": 0.3787, "num_input_tokens_seen": 18723776, "step": 18165 }, { "epoch": 12.153846153846153, "grad_norm": 1.9850194454193115, "learning_rate": 4.000443463447645e-06, "loss": 0.3563, "num_input_tokens_seen": 18728544, "step": 18170 }, { "epoch": 12.157190635451505, "grad_norm": 3.4277353286743164, "learning_rate": 3.9975839381963935e-06, "loss": 0.4496, "num_input_tokens_seen": 18734560, "step": 18175 }, { "epoch": 12.160535117056856, "grad_norm": 2.1490695476531982, "learning_rate": 3.994724754499919e-06, "loss": 0.3905, "num_input_tokens_seen": 18739840, "step": 18180 }, { "epoch": 12.163879598662207, "grad_norm": 2.2422850131988525, "learning_rate": 3.991865913332432e-06, "loss": 0.319, "num_input_tokens_seen": 18745856, "step": 18185 }, { "epoch": 12.167224080267559, "grad_norm": 2.348118782043457, "learning_rate": 3.989007415668035e-06, "loss": 0.4077, "num_input_tokens_seen": 18751360, "step": 18190 }, { "epoch": 12.17056856187291, "grad_norm": 2.349949836730957, "learning_rate": 3.986149262480706e-06, "loss": 0.3589, "num_input_tokens_seen": 18755296, "step": 18195 }, { "epoch": 12.173913043478262, "grad_norm": 2.554227828979492, "learning_rate": 3.983291454744308e-06, "loss": 0.4912, "num_input_tokens_seen": 18760960, "step": 18200 }, { "epoch": 12.177257525083611, "grad_norm": 1.9932503700256348, "learning_rate": 3.980433993432586e-06, "loss": 0.3457, "num_input_tokens_seen": 18765440, "step": 18205 }, { "epoch": 12.180602006688963, "grad_norm": 2.326932668685913, "learning_rate": 3.977576879519168e-06, "loss": 0.3562, "num_input_tokens_seen": 18771104, "step": 18210 }, { "epoch": 12.183946488294314, "grad_norm": 4.355728626251221, "learning_rate": 3.974720113977562e-06, "loss": 0.4014, "num_input_tokens_seen": 18776000, "step": 18215 }, { "epoch": 12.187290969899665, "grad_norm": 2.3416078090667725, "learning_rate": 3.97186369778116e-06, "loss": 0.3876, "num_input_tokens_seen": 18780576, "step": 18220 }, { "epoch": 12.190635451505017, "grad_norm": 2.2778737545013428, "learning_rate": 3.969007631903232e-06, "loss": 0.4346, "num_input_tokens_seen": 18785376, "step": 18225 }, { "epoch": 12.193979933110368, "grad_norm": 2.14288330078125, "learning_rate": 3.966151917316927e-06, "loss": 0.3143, "num_input_tokens_seen": 18789824, "step": 18230 }, { "epoch": 12.19732441471572, "grad_norm": 1.425777792930603, "learning_rate": 3.9632965549952805e-06, "loss": 0.3615, "num_input_tokens_seen": 18794912, "step": 18235 }, { "epoch": 12.200668896321071, "grad_norm": 1.8313013315200806, "learning_rate": 3.960441545911205e-06, "loss": 0.4082, "num_input_tokens_seen": 18800096, "step": 18240 }, { "epoch": 12.20401337792642, "grad_norm": 1.860439658164978, "learning_rate": 3.957586891037489e-06, "loss": 0.4176, "num_input_tokens_seen": 18805952, "step": 18245 }, { "epoch": 12.207357859531772, "grad_norm": 2.0582590103149414, "learning_rate": 3.954732591346805e-06, "loss": 0.3635, "num_input_tokens_seen": 18811776, "step": 18250 }, { "epoch": 12.210702341137123, "grad_norm": 2.128669500350952, "learning_rate": 3.951878647811704e-06, "loss": 0.3554, "num_input_tokens_seen": 18817984, "step": 18255 }, { "epoch": 12.214046822742475, "grad_norm": 2.5186009407043457, "learning_rate": 3.949025061404613e-06, "loss": 0.364, "num_input_tokens_seen": 18823136, "step": 18260 }, { "epoch": 12.217391304347826, "grad_norm": 2.525939464569092, "learning_rate": 3.946171833097841e-06, "loss": 0.438, "num_input_tokens_seen": 18828256, "step": 18265 }, { "epoch": 12.220735785953178, "grad_norm": 3.2395198345184326, "learning_rate": 3.943318963863571e-06, "loss": 0.4559, "num_input_tokens_seen": 18833664, "step": 18270 }, { "epoch": 12.224080267558529, "grad_norm": 2.6605026721954346, "learning_rate": 3.940466454673864e-06, "loss": 0.325, "num_input_tokens_seen": 18838592, "step": 18275 }, { "epoch": 12.22742474916388, "grad_norm": 2.4474098682403564, "learning_rate": 3.9376143065006645e-06, "loss": 0.4912, "num_input_tokens_seen": 18843520, "step": 18280 }, { "epoch": 12.23076923076923, "grad_norm": 1.731505036354065, "learning_rate": 3.934762520315786e-06, "loss": 0.3855, "num_input_tokens_seen": 18848512, "step": 18285 }, { "epoch": 12.234113712374581, "grad_norm": 2.0340847969055176, "learning_rate": 3.931911097090923e-06, "loss": 0.3918, "num_input_tokens_seen": 18853632, "step": 18290 }, { "epoch": 12.237458193979933, "grad_norm": 2.569443941116333, "learning_rate": 3.929060037797644e-06, "loss": 0.3593, "num_input_tokens_seen": 18858560, "step": 18295 }, { "epoch": 12.240802675585284, "grad_norm": 2.044375419616699, "learning_rate": 3.926209343407397e-06, "loss": 0.3705, "num_input_tokens_seen": 18863936, "step": 18300 }, { "epoch": 12.244147157190636, "grad_norm": 2.8227055072784424, "learning_rate": 3.923359014891503e-06, "loss": 0.327, "num_input_tokens_seen": 18868736, "step": 18305 }, { "epoch": 12.247491638795987, "grad_norm": 2.588841438293457, "learning_rate": 3.920509053221158e-06, "loss": 0.3916, "num_input_tokens_seen": 18873824, "step": 18310 }, { "epoch": 12.250836120401338, "grad_norm": 2.3057923316955566, "learning_rate": 3.9176594593674355e-06, "loss": 0.3239, "num_input_tokens_seen": 18879584, "step": 18315 }, { "epoch": 12.25418060200669, "grad_norm": 2.418508291244507, "learning_rate": 3.914810234301278e-06, "loss": 0.4025, "num_input_tokens_seen": 18884864, "step": 18320 }, { "epoch": 12.25752508361204, "grad_norm": 2.5321555137634277, "learning_rate": 3.911961378993511e-06, "loss": 0.4006, "num_input_tokens_seen": 18890208, "step": 18325 }, { "epoch": 12.26086956521739, "grad_norm": 1.7184879779815674, "learning_rate": 3.9091128944148276e-06, "loss": 0.3399, "num_input_tokens_seen": 18895552, "step": 18330 }, { "epoch": 12.264214046822742, "grad_norm": 2.415681838989258, "learning_rate": 3.906264781535796e-06, "loss": 0.4856, "num_input_tokens_seen": 18901408, "step": 18335 }, { "epoch": 12.267558528428093, "grad_norm": 2.106527090072632, "learning_rate": 3.9034170413268565e-06, "loss": 0.3408, "num_input_tokens_seen": 18907168, "step": 18340 }, { "epoch": 12.270903010033445, "grad_norm": 3.3386545181274414, "learning_rate": 3.9005696747583286e-06, "loss": 0.3646, "num_input_tokens_seen": 18912192, "step": 18345 }, { "epoch": 12.274247491638796, "grad_norm": 3.8911237716674805, "learning_rate": 3.897722682800395e-06, "loss": 0.3424, "num_input_tokens_seen": 18916736, "step": 18350 }, { "epoch": 12.277591973244148, "grad_norm": 1.9564213752746582, "learning_rate": 3.89487606642312e-06, "loss": 0.3957, "num_input_tokens_seen": 18922560, "step": 18355 }, { "epoch": 12.280936454849499, "grad_norm": 2.1146631240844727, "learning_rate": 3.892029826596434e-06, "loss": 0.3064, "num_input_tokens_seen": 18927648, "step": 18360 }, { "epoch": 12.284280936454849, "grad_norm": 2.1644763946533203, "learning_rate": 3.8891839642901366e-06, "loss": 0.3319, "num_input_tokens_seen": 18933504, "step": 18365 }, { "epoch": 12.2876254180602, "grad_norm": 2.283723831176758, "learning_rate": 3.886338480473909e-06, "loss": 0.3391, "num_input_tokens_seen": 18938368, "step": 18370 }, { "epoch": 12.290969899665551, "grad_norm": 1.9168474674224854, "learning_rate": 3.883493376117295e-06, "loss": 0.3562, "num_input_tokens_seen": 18943552, "step": 18375 }, { "epoch": 12.294314381270903, "grad_norm": 2.0680196285247803, "learning_rate": 3.880648652189711e-06, "loss": 0.5375, "num_input_tokens_seen": 18948576, "step": 18380 }, { "epoch": 12.297658862876254, "grad_norm": 3.2659833431243896, "learning_rate": 3.877804309660443e-06, "loss": 0.432, "num_input_tokens_seen": 18953536, "step": 18385 }, { "epoch": 12.301003344481606, "grad_norm": 2.1869757175445557, "learning_rate": 3.874960349498651e-06, "loss": 0.4401, "num_input_tokens_seen": 18958016, "step": 18390 }, { "epoch": 12.304347826086957, "grad_norm": 2.6276540756225586, "learning_rate": 3.87211677267336e-06, "loss": 0.3758, "num_input_tokens_seen": 18962560, "step": 18395 }, { "epoch": 12.307692307692308, "grad_norm": 5.350683689117432, "learning_rate": 3.869273580153468e-06, "loss": 0.42, "num_input_tokens_seen": 18966944, "step": 18400 }, { "epoch": 12.31103678929766, "grad_norm": 1.9183595180511475, "learning_rate": 3.866430772907737e-06, "loss": 0.4604, "num_input_tokens_seen": 18972512, "step": 18405 }, { "epoch": 12.31438127090301, "grad_norm": 2.929917573928833, "learning_rate": 3.863588351904804e-06, "loss": 0.3044, "num_input_tokens_seen": 18976960, "step": 18410 }, { "epoch": 12.31772575250836, "grad_norm": 2.469601631164551, "learning_rate": 3.860746318113174e-06, "loss": 0.4128, "num_input_tokens_seen": 18982272, "step": 18415 }, { "epoch": 12.321070234113712, "grad_norm": 2.511817216873169, "learning_rate": 3.857904672501212e-06, "loss": 0.4188, "num_input_tokens_seen": 18988192, "step": 18420 }, { "epoch": 12.324414715719064, "grad_norm": 2.3341848850250244, "learning_rate": 3.855063416037159e-06, "loss": 0.3959, "num_input_tokens_seen": 18993408, "step": 18425 }, { "epoch": 12.327759197324415, "grad_norm": 2.459217071533203, "learning_rate": 3.8522225496891194e-06, "loss": 0.392, "num_input_tokens_seen": 18999136, "step": 18430 }, { "epoch": 12.331103678929766, "grad_norm": 2.8426220417022705, "learning_rate": 3.849382074425069e-06, "loss": 0.2924, "num_input_tokens_seen": 19003488, "step": 18435 }, { "epoch": 12.334448160535118, "grad_norm": 2.515528678894043, "learning_rate": 3.846541991212845e-06, "loss": 0.364, "num_input_tokens_seen": 19008640, "step": 18440 }, { "epoch": 12.337792642140467, "grad_norm": 1.9131704568862915, "learning_rate": 3.8437023010201525e-06, "loss": 0.3567, "num_input_tokens_seen": 19013504, "step": 18445 }, { "epoch": 12.341137123745819, "grad_norm": 2.1063997745513916, "learning_rate": 3.840863004814565e-06, "loss": 0.3779, "num_input_tokens_seen": 19018464, "step": 18450 }, { "epoch": 12.34448160535117, "grad_norm": 3.4328577518463135, "learning_rate": 3.838024103563521e-06, "loss": 0.4177, "num_input_tokens_seen": 19023712, "step": 18455 }, { "epoch": 12.347826086956522, "grad_norm": 3.3592071533203125, "learning_rate": 3.835185598234323e-06, "loss": 0.3594, "num_input_tokens_seen": 19028736, "step": 18460 }, { "epoch": 12.351170568561873, "grad_norm": 2.55268931388855, "learning_rate": 3.8323474897941395e-06, "loss": 0.4153, "num_input_tokens_seen": 19033536, "step": 18465 }, { "epoch": 12.354515050167224, "grad_norm": 2.5598485469818115, "learning_rate": 3.829509779210002e-06, "loss": 0.3981, "num_input_tokens_seen": 19039072, "step": 18470 }, { "epoch": 12.357859531772576, "grad_norm": 2.6362526416778564, "learning_rate": 3.826672467448808e-06, "loss": 0.4359, "num_input_tokens_seen": 19043808, "step": 18475 }, { "epoch": 12.361204013377927, "grad_norm": 2.201216220855713, "learning_rate": 3.8238355554773234e-06, "loss": 0.26, "num_input_tokens_seen": 19048512, "step": 18480 }, { "epoch": 12.364548494983278, "grad_norm": 2.4865050315856934, "learning_rate": 3.820999044262169e-06, "loss": 0.3321, "num_input_tokens_seen": 19053728, "step": 18485 }, { "epoch": 12.367892976588628, "grad_norm": 3.0450799465179443, "learning_rate": 3.818162934769835e-06, "loss": 0.3648, "num_input_tokens_seen": 19058464, "step": 18490 }, { "epoch": 12.37123745819398, "grad_norm": 1.9385415315628052, "learning_rate": 3.815327227966673e-06, "loss": 0.3931, "num_input_tokens_seen": 19063840, "step": 18495 }, { "epoch": 12.37458193979933, "grad_norm": 3.101008653640747, "learning_rate": 3.8124919248188994e-06, "loss": 0.3591, "num_input_tokens_seen": 19068672, "step": 18500 }, { "epoch": 12.377926421404682, "grad_norm": 3.0191566944122314, "learning_rate": 3.8096570262925913e-06, "loss": 0.3698, "num_input_tokens_seen": 19075264, "step": 18505 }, { "epoch": 12.381270903010034, "grad_norm": 2.632033586502075, "learning_rate": 3.8068225333536867e-06, "loss": 0.4171, "num_input_tokens_seen": 19081248, "step": 18510 }, { "epoch": 12.384615384615385, "grad_norm": 1.9718315601348877, "learning_rate": 3.8039884469679873e-06, "loss": 0.3682, "num_input_tokens_seen": 19086432, "step": 18515 }, { "epoch": 12.387959866220736, "grad_norm": 2.5569992065429688, "learning_rate": 3.8011547681011535e-06, "loss": 0.4812, "num_input_tokens_seen": 19091488, "step": 18520 }, { "epoch": 12.391304347826088, "grad_norm": 2.7144522666931152, "learning_rate": 3.7983214977187135e-06, "loss": 0.4653, "num_input_tokens_seen": 19096576, "step": 18525 }, { "epoch": 12.394648829431437, "grad_norm": 2.4000468254089355, "learning_rate": 3.79548863678605e-06, "loss": 0.4604, "num_input_tokens_seen": 19101632, "step": 18530 }, { "epoch": 12.397993311036789, "grad_norm": 2.595189332962036, "learning_rate": 3.792656186268406e-06, "loss": 0.3768, "num_input_tokens_seen": 19106176, "step": 18535 }, { "epoch": 12.40133779264214, "grad_norm": 2.7182888984680176, "learning_rate": 3.7898241471308888e-06, "loss": 0.427, "num_input_tokens_seen": 19111712, "step": 18540 }, { "epoch": 12.404682274247492, "grad_norm": 2.164313316345215, "learning_rate": 3.786992520338464e-06, "loss": 0.3665, "num_input_tokens_seen": 19116416, "step": 18545 }, { "epoch": 12.408026755852843, "grad_norm": 2.0594875812530518, "learning_rate": 3.7841613068559545e-06, "loss": 0.396, "num_input_tokens_seen": 19122784, "step": 18550 }, { "epoch": 12.411371237458194, "grad_norm": 1.9311069250106812, "learning_rate": 3.781330507648046e-06, "loss": 0.3499, "num_input_tokens_seen": 19127872, "step": 18555 }, { "epoch": 12.414715719063546, "grad_norm": 1.7295207977294922, "learning_rate": 3.7785001236792802e-06, "loss": 0.32, "num_input_tokens_seen": 19132864, "step": 18560 }, { "epoch": 12.418060200668897, "grad_norm": 2.0506465435028076, "learning_rate": 3.7756701559140557e-06, "loss": 0.3963, "num_input_tokens_seen": 19137600, "step": 18565 }, { "epoch": 12.421404682274247, "grad_norm": 3.5790927410125732, "learning_rate": 3.772840605316636e-06, "loss": 0.4194, "num_input_tokens_seen": 19143232, "step": 18570 }, { "epoch": 12.424749163879598, "grad_norm": 2.412940740585327, "learning_rate": 3.770011472851137e-06, "loss": 0.3326, "num_input_tokens_seen": 19147808, "step": 18575 }, { "epoch": 12.42809364548495, "grad_norm": 2.233647584915161, "learning_rate": 3.7671827594815322e-06, "loss": 0.3831, "num_input_tokens_seen": 19153184, "step": 18580 }, { "epoch": 12.431438127090301, "grad_norm": 2.5261406898498535, "learning_rate": 3.7643544661716518e-06, "loss": 0.3894, "num_input_tokens_seen": 19158432, "step": 18585 }, { "epoch": 12.434782608695652, "grad_norm": 2.6480281352996826, "learning_rate": 3.7615265938851898e-06, "loss": 0.4625, "num_input_tokens_seen": 19164544, "step": 18590 }, { "epoch": 12.438127090301004, "grad_norm": 3.269878625869751, "learning_rate": 3.758699143585686e-06, "loss": 0.4106, "num_input_tokens_seen": 19169056, "step": 18595 }, { "epoch": 12.441471571906355, "grad_norm": 2.1167526245117188, "learning_rate": 3.755872116236547e-06, "loss": 0.3462, "num_input_tokens_seen": 19173664, "step": 18600 }, { "epoch": 12.444816053511706, "grad_norm": 2.1584765911102295, "learning_rate": 3.7530455128010266e-06, "loss": 0.2634, "num_input_tokens_seen": 19179232, "step": 18605 }, { "epoch": 12.448160535117056, "grad_norm": 2.0033910274505615, "learning_rate": 3.750219334242237e-06, "loss": 0.3441, "num_input_tokens_seen": 19184800, "step": 18610 }, { "epoch": 12.451505016722408, "grad_norm": 2.0451529026031494, "learning_rate": 3.74739358152315e-06, "loss": 0.3048, "num_input_tokens_seen": 19189600, "step": 18615 }, { "epoch": 12.454849498327759, "grad_norm": 2.230607032775879, "learning_rate": 3.7445682556065877e-06, "loss": 0.3261, "num_input_tokens_seen": 19194304, "step": 18620 }, { "epoch": 12.45819397993311, "grad_norm": 2.1684751510620117, "learning_rate": 3.7417433574552266e-06, "loss": 0.3533, "num_input_tokens_seen": 19199328, "step": 18625 }, { "epoch": 12.461538461538462, "grad_norm": 2.0880072116851807, "learning_rate": 3.7389188880315962e-06, "loss": 0.342, "num_input_tokens_seen": 19203936, "step": 18630 }, { "epoch": 12.464882943143813, "grad_norm": 2.680037021636963, "learning_rate": 3.7360948482980875e-06, "loss": 0.3944, "num_input_tokens_seen": 19209504, "step": 18635 }, { "epoch": 12.468227424749164, "grad_norm": 3.443521499633789, "learning_rate": 3.733271239216937e-06, "loss": 0.4258, "num_input_tokens_seen": 19215840, "step": 18640 }, { "epoch": 12.471571906354516, "grad_norm": 2.650686264038086, "learning_rate": 3.7304480617502387e-06, "loss": 0.4, "num_input_tokens_seen": 19221024, "step": 18645 }, { "epoch": 12.474916387959865, "grad_norm": 3.2576515674591064, "learning_rate": 3.7276253168599375e-06, "loss": 0.361, "num_input_tokens_seen": 19226304, "step": 18650 }, { "epoch": 12.478260869565217, "grad_norm": 2.2471871376037598, "learning_rate": 3.724803005507829e-06, "loss": 0.3878, "num_input_tokens_seen": 19230656, "step": 18655 }, { "epoch": 12.481605351170568, "grad_norm": 2.5538182258605957, "learning_rate": 3.721981128655569e-06, "loss": 0.4219, "num_input_tokens_seen": 19236544, "step": 18660 }, { "epoch": 12.48494983277592, "grad_norm": 2.052699565887451, "learning_rate": 3.7191596872646574e-06, "loss": 0.3898, "num_input_tokens_seen": 19241600, "step": 18665 }, { "epoch": 12.488294314381271, "grad_norm": 3.134464979171753, "learning_rate": 3.7163386822964474e-06, "loss": 0.3696, "num_input_tokens_seen": 19246560, "step": 18670 }, { "epoch": 12.491638795986622, "grad_norm": 2.4905788898468018, "learning_rate": 3.7135181147121433e-06, "loss": 0.3709, "num_input_tokens_seen": 19252800, "step": 18675 }, { "epoch": 12.494983277591974, "grad_norm": 3.2149806022644043, "learning_rate": 3.710697985472806e-06, "loss": 0.4243, "num_input_tokens_seen": 19258464, "step": 18680 }, { "epoch": 12.498327759197325, "grad_norm": 2.679199457168579, "learning_rate": 3.7078782955393396e-06, "loss": 0.3828, "num_input_tokens_seen": 19263744, "step": 18685 }, { "epoch": 12.501672240802675, "grad_norm": 1.6522332429885864, "learning_rate": 3.7050590458725e-06, "loss": 0.3695, "num_input_tokens_seen": 19269152, "step": 18690 }, { "epoch": 12.505016722408026, "grad_norm": 2.9271557331085205, "learning_rate": 3.7022402374328983e-06, "loss": 0.4065, "num_input_tokens_seen": 19275008, "step": 18695 }, { "epoch": 12.508361204013378, "grad_norm": 2.516289234161377, "learning_rate": 3.699421871180986e-06, "loss": 0.4121, "num_input_tokens_seen": 19279968, "step": 18700 }, { "epoch": 12.511705685618729, "grad_norm": 2.4186785221099854, "learning_rate": 3.6966039480770756e-06, "loss": 0.3419, "num_input_tokens_seen": 19285216, "step": 18705 }, { "epoch": 12.51505016722408, "grad_norm": 2.0871734619140625, "learning_rate": 3.693786469081319e-06, "loss": 0.3608, "num_input_tokens_seen": 19290464, "step": 18710 }, { "epoch": 12.518394648829432, "grad_norm": 2.473860025405884, "learning_rate": 3.6909694351537205e-06, "loss": 0.4156, "num_input_tokens_seen": 19295872, "step": 18715 }, { "epoch": 12.521739130434783, "grad_norm": 2.178633451461792, "learning_rate": 3.6881528472541318e-06, "loss": 0.454, "num_input_tokens_seen": 19301248, "step": 18720 }, { "epoch": 12.525083612040135, "grad_norm": 2.7786190509796143, "learning_rate": 3.685336706342255e-06, "loss": 0.3984, "num_input_tokens_seen": 19307072, "step": 18725 }, { "epoch": 12.528428093645484, "grad_norm": 2.1293210983276367, "learning_rate": 3.682521013377637e-06, "loss": 0.3996, "num_input_tokens_seen": 19313184, "step": 18730 }, { "epoch": 12.531772575250836, "grad_norm": 2.6657867431640625, "learning_rate": 3.679705769319673e-06, "loss": 0.4046, "num_input_tokens_seen": 19318336, "step": 18735 }, { "epoch": 12.535117056856187, "grad_norm": 2.717149019241333, "learning_rate": 3.676890975127607e-06, "loss": 0.3033, "num_input_tokens_seen": 19323648, "step": 18740 }, { "epoch": 12.538461538461538, "grad_norm": 2.540158987045288, "learning_rate": 3.6740766317605248e-06, "loss": 0.3167, "num_input_tokens_seen": 19328960, "step": 18745 }, { "epoch": 12.54180602006689, "grad_norm": 2.312988758087158, "learning_rate": 3.671262740177366e-06, "loss": 0.4209, "num_input_tokens_seen": 19333952, "step": 18750 }, { "epoch": 12.545150501672241, "grad_norm": 1.9308929443359375, "learning_rate": 3.6684493013369115e-06, "loss": 0.3641, "num_input_tokens_seen": 19338880, "step": 18755 }, { "epoch": 12.548494983277592, "grad_norm": 4.138387680053711, "learning_rate": 3.665636316197787e-06, "loss": 0.3244, "num_input_tokens_seen": 19343296, "step": 18760 }, { "epoch": 12.551839464882944, "grad_norm": 2.395165205001831, "learning_rate": 3.6628237857184636e-06, "loss": 0.3029, "num_input_tokens_seen": 19348256, "step": 18765 }, { "epoch": 12.555183946488294, "grad_norm": 3.806480884552002, "learning_rate": 3.6600117108572643e-06, "loss": 0.4033, "num_input_tokens_seen": 19353920, "step": 18770 }, { "epoch": 12.558528428093645, "grad_norm": 2.724895477294922, "learning_rate": 3.6572000925723484e-06, "loss": 0.3629, "num_input_tokens_seen": 19358752, "step": 18775 }, { "epoch": 12.561872909698996, "grad_norm": 2.082066535949707, "learning_rate": 3.654388931821723e-06, "loss": 0.4755, "num_input_tokens_seen": 19364288, "step": 18780 }, { "epoch": 12.565217391304348, "grad_norm": 3.341468334197998, "learning_rate": 3.6515782295632397e-06, "loss": 0.3998, "num_input_tokens_seen": 19368928, "step": 18785 }, { "epoch": 12.568561872909699, "grad_norm": 1.9886484146118164, "learning_rate": 3.6487679867545927e-06, "loss": 0.3209, "num_input_tokens_seen": 19374240, "step": 18790 }, { "epoch": 12.57190635451505, "grad_norm": 2.6093640327453613, "learning_rate": 3.6459582043533215e-06, "loss": 0.4402, "num_input_tokens_seen": 19378528, "step": 18795 }, { "epoch": 12.575250836120402, "grad_norm": 2.056644916534424, "learning_rate": 3.6431488833168084e-06, "loss": 0.4145, "num_input_tokens_seen": 19383424, "step": 18800 }, { "epoch": 12.578595317725753, "grad_norm": 2.0538694858551025, "learning_rate": 3.6403400246022765e-06, "loss": 0.4593, "num_input_tokens_seen": 19388736, "step": 18805 }, { "epoch": 12.581939799331103, "grad_norm": 2.7911815643310547, "learning_rate": 3.63753162916679e-06, "loss": 0.3829, "num_input_tokens_seen": 19393696, "step": 18810 }, { "epoch": 12.585284280936454, "grad_norm": 2.6921467781066895, "learning_rate": 3.634723697967263e-06, "loss": 0.3577, "num_input_tokens_seen": 19399296, "step": 18815 }, { "epoch": 12.588628762541806, "grad_norm": 2.7552459239959717, "learning_rate": 3.631916231960444e-06, "loss": 0.3425, "num_input_tokens_seen": 19405216, "step": 18820 }, { "epoch": 12.591973244147157, "grad_norm": 2.4132721424102783, "learning_rate": 3.6291092321029244e-06, "loss": 0.3243, "num_input_tokens_seen": 19410464, "step": 18825 }, { "epoch": 12.595317725752508, "grad_norm": 3.0659570693969727, "learning_rate": 3.6263026993511376e-06, "loss": 0.54, "num_input_tokens_seen": 19415648, "step": 18830 }, { "epoch": 12.59866220735786, "grad_norm": 2.6519935131073, "learning_rate": 3.623496634661358e-06, "loss": 0.4556, "num_input_tokens_seen": 19421632, "step": 18835 }, { "epoch": 12.602006688963211, "grad_norm": 3.787041664123535, "learning_rate": 3.620691038989701e-06, "loss": 0.4841, "num_input_tokens_seen": 19427648, "step": 18840 }, { "epoch": 12.605351170568563, "grad_norm": 2.5120561122894287, "learning_rate": 3.617885913292122e-06, "loss": 0.4144, "num_input_tokens_seen": 19432704, "step": 18845 }, { "epoch": 12.608695652173914, "grad_norm": 2.1541035175323486, "learning_rate": 3.6150812585244154e-06, "loss": 0.3319, "num_input_tokens_seen": 19437664, "step": 18850 }, { "epoch": 12.612040133779264, "grad_norm": 1.7862422466278076, "learning_rate": 3.6122770756422133e-06, "loss": 0.4518, "num_input_tokens_seen": 19442240, "step": 18855 }, { "epoch": 12.615384615384615, "grad_norm": 1.95845627784729, "learning_rate": 3.6094733656009933e-06, "loss": 0.3549, "num_input_tokens_seen": 19448416, "step": 18860 }, { "epoch": 12.618729096989966, "grad_norm": 2.0499558448791504, "learning_rate": 3.6066701293560656e-06, "loss": 0.3703, "num_input_tokens_seen": 19454464, "step": 18865 }, { "epoch": 12.622073578595318, "grad_norm": 2.1519978046417236, "learning_rate": 3.6038673678625817e-06, "loss": 0.3298, "num_input_tokens_seen": 19460256, "step": 18870 }, { "epoch": 12.62541806020067, "grad_norm": 2.9994425773620605, "learning_rate": 3.601065082075531e-06, "loss": 0.39, "num_input_tokens_seen": 19465728, "step": 18875 }, { "epoch": 12.62876254180602, "grad_norm": 2.97330641746521, "learning_rate": 3.5982632729497375e-06, "loss": 0.4762, "num_input_tokens_seen": 19470592, "step": 18880 }, { "epoch": 12.632107023411372, "grad_norm": 3.45505428314209, "learning_rate": 3.595461941439871e-06, "loss": 0.3948, "num_input_tokens_seen": 19475168, "step": 18885 }, { "epoch": 12.635451505016722, "grad_norm": 2.3907079696655273, "learning_rate": 3.5926610885004303e-06, "loss": 0.4018, "num_input_tokens_seen": 19480000, "step": 18890 }, { "epoch": 12.638795986622073, "grad_norm": 2.292288064956665, "learning_rate": 3.5898607150857566e-06, "loss": 0.4237, "num_input_tokens_seen": 19484416, "step": 18895 }, { "epoch": 12.642140468227424, "grad_norm": 2.711426019668579, "learning_rate": 3.587060822150022e-06, "loss": 0.4536, "num_input_tokens_seen": 19489600, "step": 18900 }, { "epoch": 12.645484949832776, "grad_norm": 2.250962734222412, "learning_rate": 3.584261410647243e-06, "loss": 0.3846, "num_input_tokens_seen": 19494304, "step": 18905 }, { "epoch": 12.648829431438127, "grad_norm": 2.438964366912842, "learning_rate": 3.581462481531264e-06, "loss": 0.4062, "num_input_tokens_seen": 19499264, "step": 18910 }, { "epoch": 12.652173913043478, "grad_norm": 3.0889251232147217, "learning_rate": 3.57866403575577e-06, "loss": 0.4371, "num_input_tokens_seen": 19504512, "step": 18915 }, { "epoch": 12.65551839464883, "grad_norm": 2.277015447616577, "learning_rate": 3.575866074274277e-06, "loss": 0.4895, "num_input_tokens_seen": 19510848, "step": 18920 }, { "epoch": 12.658862876254181, "grad_norm": 1.907348394393921, "learning_rate": 3.573068598040144e-06, "loss": 0.4021, "num_input_tokens_seen": 19516320, "step": 18925 }, { "epoch": 12.662207357859533, "grad_norm": 2.1626274585723877, "learning_rate": 3.5702716080065546e-06, "loss": 0.4092, "num_input_tokens_seen": 19521632, "step": 18930 }, { "epoch": 12.665551839464882, "grad_norm": 2.142920970916748, "learning_rate": 3.567475105126533e-06, "loss": 0.4407, "num_input_tokens_seen": 19526816, "step": 18935 }, { "epoch": 12.668896321070234, "grad_norm": 2.7795534133911133, "learning_rate": 3.564679090352937e-06, "loss": 0.4883, "num_input_tokens_seen": 19531936, "step": 18940 }, { "epoch": 12.672240802675585, "grad_norm": 2.588700771331787, "learning_rate": 3.561883564638455e-06, "loss": 0.4703, "num_input_tokens_seen": 19537120, "step": 18945 }, { "epoch": 12.675585284280936, "grad_norm": 2.7914700508117676, "learning_rate": 3.559088528935614e-06, "loss": 0.3537, "num_input_tokens_seen": 19542272, "step": 18950 }, { "epoch": 12.678929765886288, "grad_norm": 2.016448497772217, "learning_rate": 3.5562939841967682e-06, "loss": 0.3683, "num_input_tokens_seen": 19547616, "step": 18955 }, { "epoch": 12.68227424749164, "grad_norm": 3.0858404636383057, "learning_rate": 3.553499931374108e-06, "loss": 0.323, "num_input_tokens_seen": 19552896, "step": 18960 }, { "epoch": 12.68561872909699, "grad_norm": 2.6706645488739014, "learning_rate": 3.5507063714196537e-06, "loss": 0.4233, "num_input_tokens_seen": 19558912, "step": 18965 }, { "epoch": 12.68896321070234, "grad_norm": 2.6325581073760986, "learning_rate": 3.5479133052852627e-06, "loss": 0.3596, "num_input_tokens_seen": 19564736, "step": 18970 }, { "epoch": 12.692307692307692, "grad_norm": 3.342085361480713, "learning_rate": 3.545120733922619e-06, "loss": 0.4527, "num_input_tokens_seen": 19570336, "step": 18975 }, { "epoch": 12.695652173913043, "grad_norm": 3.018923282623291, "learning_rate": 3.5423286582832393e-06, "loss": 0.4084, "num_input_tokens_seen": 19575040, "step": 18980 }, { "epoch": 12.698996655518394, "grad_norm": 3.015791416168213, "learning_rate": 3.5395370793184714e-06, "loss": 0.3405, "num_input_tokens_seen": 19580512, "step": 18985 }, { "epoch": 12.702341137123746, "grad_norm": 1.3350446224212646, "learning_rate": 3.5367459979794948e-06, "loss": 0.3114, "num_input_tokens_seen": 19586208, "step": 18990 }, { "epoch": 12.705685618729097, "grad_norm": 2.6818084716796875, "learning_rate": 3.5339554152173195e-06, "loss": 0.471, "num_input_tokens_seen": 19591520, "step": 18995 }, { "epoch": 12.709030100334449, "grad_norm": 3.196460723876953, "learning_rate": 3.531165331982786e-06, "loss": 0.4095, "num_input_tokens_seen": 19596384, "step": 19000 }, { "epoch": 12.7123745819398, "grad_norm": 1.9189506769180298, "learning_rate": 3.5283757492265627e-06, "loss": 0.3596, "num_input_tokens_seen": 19601088, "step": 19005 }, { "epoch": 12.715719063545151, "grad_norm": 2.459819793701172, "learning_rate": 3.5255866678991456e-06, "loss": 0.3801, "num_input_tokens_seen": 19605824, "step": 19010 }, { "epoch": 12.719063545150501, "grad_norm": 2.3799362182617188, "learning_rate": 3.522798088950868e-06, "loss": 0.3746, "num_input_tokens_seen": 19611296, "step": 19015 }, { "epoch": 12.722408026755852, "grad_norm": 3.3111157417297363, "learning_rate": 3.5200100133318836e-06, "loss": 0.4587, "num_input_tokens_seen": 19615616, "step": 19020 }, { "epoch": 12.725752508361204, "grad_norm": 2.823941946029663, "learning_rate": 3.5172224419921783e-06, "loss": 0.3856, "num_input_tokens_seen": 19620160, "step": 19025 }, { "epoch": 12.729096989966555, "grad_norm": 2.5651047229766846, "learning_rate": 3.514435375881565e-06, "loss": 0.3797, "num_input_tokens_seen": 19624736, "step": 19030 }, { "epoch": 12.732441471571907, "grad_norm": 2.0712387561798096, "learning_rate": 3.5116488159496854e-06, "loss": 0.3801, "num_input_tokens_seen": 19629344, "step": 19035 }, { "epoch": 12.735785953177258, "grad_norm": 2.6272995471954346, "learning_rate": 3.5088627631460087e-06, "loss": 0.4111, "num_input_tokens_seen": 19634464, "step": 19040 }, { "epoch": 12.73913043478261, "grad_norm": 2.5632340908050537, "learning_rate": 3.506077218419832e-06, "loss": 0.3816, "num_input_tokens_seen": 19639136, "step": 19045 }, { "epoch": 12.742474916387959, "grad_norm": 2.625465154647827, "learning_rate": 3.5032921827202764e-06, "loss": 0.3872, "num_input_tokens_seen": 19644160, "step": 19050 }, { "epoch": 12.74581939799331, "grad_norm": 2.404714584350586, "learning_rate": 3.5005076569962903e-06, "loss": 0.4242, "num_input_tokens_seen": 19649248, "step": 19055 }, { "epoch": 12.749163879598662, "grad_norm": 3.7503461837768555, "learning_rate": 3.497723642196654e-06, "loss": 0.4096, "num_input_tokens_seen": 19654080, "step": 19060 }, { "epoch": 12.752508361204013, "grad_norm": 2.841054677963257, "learning_rate": 3.4949401392699657e-06, "loss": 0.4822, "num_input_tokens_seen": 19658624, "step": 19065 }, { "epoch": 12.755852842809364, "grad_norm": 3.7848474979400635, "learning_rate": 3.492157149164653e-06, "loss": 0.36, "num_input_tokens_seen": 19663552, "step": 19070 }, { "epoch": 12.759197324414716, "grad_norm": 1.949486255645752, "learning_rate": 3.489374672828969e-06, "loss": 0.3055, "num_input_tokens_seen": 19669280, "step": 19075 }, { "epoch": 12.762541806020067, "grad_norm": 2.1079864501953125, "learning_rate": 3.486592711210991e-06, "loss": 0.3636, "num_input_tokens_seen": 19674016, "step": 19080 }, { "epoch": 12.765886287625419, "grad_norm": 2.479020118713379, "learning_rate": 3.4838112652586213e-06, "loss": 0.455, "num_input_tokens_seen": 19679616, "step": 19085 }, { "epoch": 12.76923076923077, "grad_norm": 2.421849012374878, "learning_rate": 3.481030335919586e-06, "loss": 0.4427, "num_input_tokens_seen": 19684448, "step": 19090 }, { "epoch": 12.77257525083612, "grad_norm": 2.5239572525024414, "learning_rate": 3.4782499241414357e-06, "loss": 0.3694, "num_input_tokens_seen": 19690560, "step": 19095 }, { "epoch": 12.775919732441471, "grad_norm": 1.6397439241409302, "learning_rate": 3.4754700308715426e-06, "loss": 0.3786, "num_input_tokens_seen": 19696160, "step": 19100 }, { "epoch": 12.779264214046822, "grad_norm": 2.936098098754883, "learning_rate": 3.4726906570571075e-06, "loss": 0.4148, "num_input_tokens_seen": 19701696, "step": 19105 }, { "epoch": 12.782608695652174, "grad_norm": 2.6210949420928955, "learning_rate": 3.4699118036451496e-06, "loss": 0.3434, "num_input_tokens_seen": 19706528, "step": 19110 }, { "epoch": 12.785953177257525, "grad_norm": 4.183880805969238, "learning_rate": 3.4671334715825107e-06, "loss": 0.5187, "num_input_tokens_seen": 19712352, "step": 19115 }, { "epoch": 12.789297658862877, "grad_norm": 1.8971291780471802, "learning_rate": 3.464355661815857e-06, "loss": 0.4254, "num_input_tokens_seen": 19718336, "step": 19120 }, { "epoch": 12.792642140468228, "grad_norm": 2.4516022205352783, "learning_rate": 3.4615783752916733e-06, "loss": 0.404, "num_input_tokens_seen": 19722784, "step": 19125 }, { "epoch": 12.79598662207358, "grad_norm": 3.0697615146636963, "learning_rate": 3.458801612956273e-06, "loss": 0.4388, "num_input_tokens_seen": 19727840, "step": 19130 }, { "epoch": 12.799331103678929, "grad_norm": 2.0831451416015625, "learning_rate": 3.4560253757557836e-06, "loss": 0.3306, "num_input_tokens_seen": 19732384, "step": 19135 }, { "epoch": 12.80267558528428, "grad_norm": 2.6441173553466797, "learning_rate": 3.4532496646361603e-06, "loss": 0.4701, "num_input_tokens_seen": 19737632, "step": 19140 }, { "epoch": 12.806020066889632, "grad_norm": 2.856513500213623, "learning_rate": 3.4504744805431696e-06, "loss": 0.3839, "num_input_tokens_seen": 19743712, "step": 19145 }, { "epoch": 12.809364548494983, "grad_norm": 2.357394218444824, "learning_rate": 3.44769982442241e-06, "loss": 0.4702, "num_input_tokens_seen": 19747904, "step": 19150 }, { "epoch": 12.812709030100335, "grad_norm": 2.7813334465026855, "learning_rate": 3.4449256972192924e-06, "loss": 0.4091, "num_input_tokens_seen": 19752928, "step": 19155 }, { "epoch": 12.816053511705686, "grad_norm": 2.130976438522339, "learning_rate": 3.442152099879048e-06, "loss": 0.3669, "num_input_tokens_seen": 19758208, "step": 19160 }, { "epoch": 12.819397993311037, "grad_norm": 2.4455790519714355, "learning_rate": 3.4393790333467314e-06, "loss": 0.3444, "num_input_tokens_seen": 19762944, "step": 19165 }, { "epoch": 12.822742474916389, "grad_norm": 2.800164222717285, "learning_rate": 3.43660649856721e-06, "loss": 0.4122, "num_input_tokens_seen": 19767520, "step": 19170 }, { "epoch": 12.826086956521738, "grad_norm": 1.8198246955871582, "learning_rate": 3.4338344964851778e-06, "loss": 0.4128, "num_input_tokens_seen": 19772640, "step": 19175 }, { "epoch": 12.82943143812709, "grad_norm": 2.3183751106262207, "learning_rate": 3.4310630280451407e-06, "loss": 0.3907, "num_input_tokens_seen": 19777248, "step": 19180 }, { "epoch": 12.832775919732441, "grad_norm": 3.078228235244751, "learning_rate": 3.4282920941914284e-06, "loss": 0.4142, "num_input_tokens_seen": 19782528, "step": 19185 }, { "epoch": 12.836120401337793, "grad_norm": 2.8708653450012207, "learning_rate": 3.4255216958681826e-06, "loss": 0.3427, "num_input_tokens_seen": 19787808, "step": 19190 }, { "epoch": 12.839464882943144, "grad_norm": 3.373671293258667, "learning_rate": 3.422751834019366e-06, "loss": 0.4113, "num_input_tokens_seen": 19792320, "step": 19195 }, { "epoch": 12.842809364548495, "grad_norm": 1.9104933738708496, "learning_rate": 3.4199825095887597e-06, "loss": 0.3081, "num_input_tokens_seen": 19796992, "step": 19200 }, { "epoch": 12.846153846153847, "grad_norm": 2.2689290046691895, "learning_rate": 3.417213723519959e-06, "loss": 0.4413, "num_input_tokens_seen": 19802848, "step": 19205 }, { "epoch": 12.849498327759198, "grad_norm": 2.9225106239318848, "learning_rate": 3.4144454767563755e-06, "loss": 0.4425, "num_input_tokens_seen": 19808160, "step": 19210 }, { "epoch": 12.852842809364548, "grad_norm": 2.4960367679595947, "learning_rate": 3.4116777702412374e-06, "loss": 0.3148, "num_input_tokens_seen": 19812928, "step": 19215 }, { "epoch": 12.856187290969899, "grad_norm": 4.1172709465026855, "learning_rate": 3.4089106049175934e-06, "loss": 0.3669, "num_input_tokens_seen": 19818688, "step": 19220 }, { "epoch": 12.85953177257525, "grad_norm": 2.667694091796875, "learning_rate": 3.4061439817283014e-06, "loss": 0.3767, "num_input_tokens_seen": 19823616, "step": 19225 }, { "epoch": 12.862876254180602, "grad_norm": 3.7066402435302734, "learning_rate": 3.4033779016160374e-06, "loss": 0.4218, "num_input_tokens_seen": 19828256, "step": 19230 }, { "epoch": 12.866220735785953, "grad_norm": 2.8610432147979736, "learning_rate": 3.4006123655232914e-06, "loss": 0.3597, "num_input_tokens_seen": 19833216, "step": 19235 }, { "epoch": 12.869565217391305, "grad_norm": 2.0861315727233887, "learning_rate": 3.3978473743923702e-06, "loss": 0.3815, "num_input_tokens_seen": 19838016, "step": 19240 }, { "epoch": 12.872909698996656, "grad_norm": 2.663048505783081, "learning_rate": 3.3950829291653943e-06, "loss": 0.4212, "num_input_tokens_seen": 19842816, "step": 19245 }, { "epoch": 12.876254180602007, "grad_norm": 2.4305648803710938, "learning_rate": 3.3923190307842957e-06, "loss": 0.48, "num_input_tokens_seen": 19847840, "step": 19250 }, { "epoch": 12.879598662207357, "grad_norm": 3.2817025184631348, "learning_rate": 3.389555680190823e-06, "loss": 0.3955, "num_input_tokens_seen": 19853376, "step": 19255 }, { "epoch": 12.882943143812708, "grad_norm": 2.358675003051758, "learning_rate": 3.3867928783265334e-06, "loss": 0.3857, "num_input_tokens_seen": 19858112, "step": 19260 }, { "epoch": 12.88628762541806, "grad_norm": 2.0943408012390137, "learning_rate": 3.384030626132806e-06, "loss": 0.3779, "num_input_tokens_seen": 19863072, "step": 19265 }, { "epoch": 12.889632107023411, "grad_norm": 2.1757302284240723, "learning_rate": 3.3812689245508247e-06, "loss": 0.3593, "num_input_tokens_seen": 19868064, "step": 19270 }, { "epoch": 12.892976588628763, "grad_norm": 1.6339545249938965, "learning_rate": 3.378507774521587e-06, "loss": 0.3157, "num_input_tokens_seen": 19873152, "step": 19275 }, { "epoch": 12.896321070234114, "grad_norm": 2.351951837539673, "learning_rate": 3.375747176985906e-06, "loss": 0.3924, "num_input_tokens_seen": 19877568, "step": 19280 }, { "epoch": 12.899665551839465, "grad_norm": 2.3322901725769043, "learning_rate": 3.372987132884403e-06, "loss": 0.4848, "num_input_tokens_seen": 19883776, "step": 19285 }, { "epoch": 12.903010033444817, "grad_norm": 2.230436325073242, "learning_rate": 3.3702276431575133e-06, "loss": 0.4035, "num_input_tokens_seen": 19889344, "step": 19290 }, { "epoch": 12.906354515050166, "grad_norm": 2.7766852378845215, "learning_rate": 3.3674687087454817e-06, "loss": 0.4288, "num_input_tokens_seen": 19894496, "step": 19295 }, { "epoch": 12.909698996655518, "grad_norm": 2.966548442840576, "learning_rate": 3.3647103305883645e-06, "loss": 0.4015, "num_input_tokens_seen": 19899392, "step": 19300 }, { "epoch": 12.91304347826087, "grad_norm": 2.171281099319458, "learning_rate": 3.3619525096260253e-06, "loss": 0.2846, "num_input_tokens_seen": 19903808, "step": 19305 }, { "epoch": 12.91638795986622, "grad_norm": 3.370980978012085, "learning_rate": 3.3591952467981446e-06, "loss": 0.4355, "num_input_tokens_seen": 19908544, "step": 19310 }, { "epoch": 12.919732441471572, "grad_norm": 2.4830026626586914, "learning_rate": 3.3564385430442074e-06, "loss": 0.362, "num_input_tokens_seen": 19914208, "step": 19315 }, { "epoch": 12.923076923076923, "grad_norm": 3.063664674758911, "learning_rate": 3.3536823993035097e-06, "loss": 0.4248, "num_input_tokens_seen": 19918912, "step": 19320 }, { "epoch": 12.926421404682275, "grad_norm": 2.2659473419189453, "learning_rate": 3.3509268165151543e-06, "loss": 0.4407, "num_input_tokens_seen": 19924768, "step": 19325 }, { "epoch": 12.929765886287626, "grad_norm": 2.154362678527832, "learning_rate": 3.3481717956180595e-06, "loss": 0.4464, "num_input_tokens_seen": 19930368, "step": 19330 }, { "epoch": 12.933110367892976, "grad_norm": 2.956406593322754, "learning_rate": 3.345417337550945e-06, "loss": 0.3277, "num_input_tokens_seen": 19935584, "step": 19335 }, { "epoch": 12.936454849498327, "grad_norm": 3.3068065643310547, "learning_rate": 3.3426634432523424e-06, "loss": 0.5121, "num_input_tokens_seen": 19940832, "step": 19340 }, { "epoch": 12.939799331103679, "grad_norm": 2.8649048805236816, "learning_rate": 3.3399101136605906e-06, "loss": 0.4095, "num_input_tokens_seen": 19946400, "step": 19345 }, { "epoch": 12.94314381270903, "grad_norm": 2.4316842555999756, "learning_rate": 3.3371573497138334e-06, "loss": 0.3774, "num_input_tokens_seen": 19950496, "step": 19350 }, { "epoch": 12.946488294314381, "grad_norm": 2.968625068664551, "learning_rate": 3.334405152350028e-06, "loss": 0.4205, "num_input_tokens_seen": 19955520, "step": 19355 }, { "epoch": 12.949832775919733, "grad_norm": 2.353546619415283, "learning_rate": 3.3316535225069335e-06, "loss": 0.3606, "num_input_tokens_seen": 19960544, "step": 19360 }, { "epoch": 12.953177257525084, "grad_norm": 2.398613691329956, "learning_rate": 3.328902461122117e-06, "loss": 0.3735, "num_input_tokens_seen": 19966144, "step": 19365 }, { "epoch": 12.956521739130435, "grad_norm": 3.001547336578369, "learning_rate": 3.326151969132949e-06, "loss": 0.3681, "num_input_tokens_seen": 19970848, "step": 19370 }, { "epoch": 12.959866220735787, "grad_norm": 2.620790958404541, "learning_rate": 3.3234020474766137e-06, "loss": 0.4273, "num_input_tokens_seen": 19976128, "step": 19375 }, { "epoch": 12.963210702341136, "grad_norm": 1.7781904935836792, "learning_rate": 3.320652697090093e-06, "loss": 0.3948, "num_input_tokens_seen": 19981728, "step": 19380 }, { "epoch": 12.966555183946488, "grad_norm": 2.106689214706421, "learning_rate": 3.31790391891018e-06, "loss": 0.3875, "num_input_tokens_seen": 19986464, "step": 19385 }, { "epoch": 12.96989966555184, "grad_norm": 1.6821658611297607, "learning_rate": 3.3151557138734657e-06, "loss": 0.3885, "num_input_tokens_seen": 19991168, "step": 19390 }, { "epoch": 12.97324414715719, "grad_norm": 1.9980988502502441, "learning_rate": 3.312408082916355e-06, "loss": 0.2981, "num_input_tokens_seen": 19995904, "step": 19395 }, { "epoch": 12.976588628762542, "grad_norm": 2.7835910320281982, "learning_rate": 3.3096610269750507e-06, "loss": 0.3632, "num_input_tokens_seen": 20000512, "step": 19400 }, { "epoch": 12.979933110367893, "grad_norm": 3.0797462463378906, "learning_rate": 3.306914546985561e-06, "loss": 0.3565, "num_input_tokens_seen": 20005152, "step": 19405 }, { "epoch": 12.983277591973245, "grad_norm": 2.35672926902771, "learning_rate": 3.3041686438836984e-06, "loss": 0.4798, "num_input_tokens_seen": 20010624, "step": 19410 }, { "epoch": 12.986622073578594, "grad_norm": 3.224050998687744, "learning_rate": 3.301423318605077e-06, "loss": 0.4358, "num_input_tokens_seen": 20015264, "step": 19415 }, { "epoch": 12.989966555183946, "grad_norm": 3.184638500213623, "learning_rate": 3.298678572085119e-06, "loss": 0.4016, "num_input_tokens_seen": 20019680, "step": 19420 }, { "epoch": 12.993311036789297, "grad_norm": 1.5263893604278564, "learning_rate": 3.2959344052590445e-06, "loss": 0.3694, "num_input_tokens_seen": 20025472, "step": 19425 }, { "epoch": 12.996655518394649, "grad_norm": 1.8902182579040527, "learning_rate": 3.293190819061878e-06, "loss": 0.4341, "num_input_tokens_seen": 20030880, "step": 19430 }, { "epoch": 13.0, "grad_norm": 2.4174721240997314, "learning_rate": 3.290447814428445e-06, "loss": 0.3675, "num_input_tokens_seen": 20035648, "step": 19435 }, { "epoch": 13.003344481605351, "grad_norm": 2.8239448070526123, "learning_rate": 3.287705392293373e-06, "loss": 0.3733, "num_input_tokens_seen": 20040832, "step": 19440 }, { "epoch": 13.006688963210703, "grad_norm": 2.207956314086914, "learning_rate": 3.284963553591096e-06, "loss": 0.389, "num_input_tokens_seen": 20045952, "step": 19445 }, { "epoch": 13.010033444816054, "grad_norm": 1.800156831741333, "learning_rate": 3.282222299255842e-06, "loss": 0.3101, "num_input_tokens_seen": 20050272, "step": 19450 }, { "epoch": 13.013377926421406, "grad_norm": 2.476438522338867, "learning_rate": 3.2794816302216416e-06, "loss": 0.3349, "num_input_tokens_seen": 20055776, "step": 19455 }, { "epoch": 13.016722408026755, "grad_norm": 3.122170925140381, "learning_rate": 3.2767415474223276e-06, "loss": 0.4415, "num_input_tokens_seen": 20060800, "step": 19460 }, { "epoch": 13.020066889632107, "grad_norm": 2.1951043605804443, "learning_rate": 3.2740020517915348e-06, "loss": 0.3772, "num_input_tokens_seen": 20066048, "step": 19465 }, { "epoch": 13.023411371237458, "grad_norm": 3.2813801765441895, "learning_rate": 3.271263144262695e-06, "loss": 0.3572, "num_input_tokens_seen": 20070336, "step": 19470 }, { "epoch": 13.02675585284281, "grad_norm": 2.220101833343506, "learning_rate": 3.2685248257690393e-06, "loss": 0.3997, "num_input_tokens_seen": 20076384, "step": 19475 }, { "epoch": 13.03010033444816, "grad_norm": 2.3740406036376953, "learning_rate": 3.2657870972436e-06, "loss": 0.4037, "num_input_tokens_seen": 20081504, "step": 19480 }, { "epoch": 13.033444816053512, "grad_norm": 1.87953519821167, "learning_rate": 3.2630499596192065e-06, "loss": 0.4245, "num_input_tokens_seen": 20086816, "step": 19485 }, { "epoch": 13.036789297658864, "grad_norm": 2.3584933280944824, "learning_rate": 3.2603134138284908e-06, "loss": 0.4117, "num_input_tokens_seen": 20092000, "step": 19490 }, { "epoch": 13.040133779264215, "grad_norm": 2.9605541229248047, "learning_rate": 3.257577460803878e-06, "loss": 0.3606, "num_input_tokens_seen": 20096832, "step": 19495 }, { "epoch": 13.043478260869565, "grad_norm": 2.282744884490967, "learning_rate": 3.2548421014775943e-06, "loss": 0.3007, "num_input_tokens_seen": 20101664, "step": 19500 }, { "epoch": 13.046822742474916, "grad_norm": 2.2817952632904053, "learning_rate": 3.2521073367816603e-06, "loss": 0.3978, "num_input_tokens_seen": 20106656, "step": 19505 }, { "epoch": 13.050167224080267, "grad_norm": 2.598379611968994, "learning_rate": 3.2493731676479014e-06, "loss": 0.4528, "num_input_tokens_seen": 20112864, "step": 19510 }, { "epoch": 13.053511705685619, "grad_norm": 2.6118197441101074, "learning_rate": 3.2466395950079326e-06, "loss": 0.4773, "num_input_tokens_seen": 20117568, "step": 19515 }, { "epoch": 13.05685618729097, "grad_norm": 2.3873836994171143, "learning_rate": 3.243906619793168e-06, "loss": 0.374, "num_input_tokens_seen": 20122976, "step": 19520 }, { "epoch": 13.060200668896321, "grad_norm": 3.195404052734375, "learning_rate": 3.2411742429348194e-06, "loss": 0.3166, "num_input_tokens_seen": 20128512, "step": 19525 }, { "epoch": 13.063545150501673, "grad_norm": 2.3537518978118896, "learning_rate": 3.2384424653638934e-06, "loss": 0.3796, "num_input_tokens_seen": 20134400, "step": 19530 }, { "epoch": 13.066889632107024, "grad_norm": 2.2353532314300537, "learning_rate": 3.2357112880111923e-06, "loss": 0.3371, "num_input_tokens_seen": 20138976, "step": 19535 }, { "epoch": 13.070234113712374, "grad_norm": 3.1864047050476074, "learning_rate": 3.2329807118073162e-06, "loss": 0.3255, "num_input_tokens_seen": 20145152, "step": 19540 }, { "epoch": 13.073578595317725, "grad_norm": 3.1038308143615723, "learning_rate": 3.2302507376826575e-06, "loss": 0.4548, "num_input_tokens_seen": 20151104, "step": 19545 }, { "epoch": 13.076923076923077, "grad_norm": 2.3935978412628174, "learning_rate": 3.227521366567402e-06, "loss": 0.502, "num_input_tokens_seen": 20156768, "step": 19550 }, { "epoch": 13.080267558528428, "grad_norm": 2.7219629287719727, "learning_rate": 3.2247925993915363e-06, "loss": 0.2992, "num_input_tokens_seen": 20161888, "step": 19555 }, { "epoch": 13.08361204013378, "grad_norm": 2.3005154132843018, "learning_rate": 3.2220644370848376e-06, "loss": 0.351, "num_input_tokens_seen": 20167904, "step": 19560 }, { "epoch": 13.08695652173913, "grad_norm": 3.392695426940918, "learning_rate": 3.2193368805768745e-06, "loss": 0.3776, "num_input_tokens_seen": 20173440, "step": 19565 }, { "epoch": 13.090301003344482, "grad_norm": 3.1613476276397705, "learning_rate": 3.21660993079701e-06, "loss": 0.3458, "num_input_tokens_seen": 20177728, "step": 19570 }, { "epoch": 13.093645484949834, "grad_norm": 3.1024458408355713, "learning_rate": 3.2138835886744064e-06, "loss": 0.3985, "num_input_tokens_seen": 20182720, "step": 19575 }, { "epoch": 13.096989966555183, "grad_norm": 1.769409418106079, "learning_rate": 3.2111578551380117e-06, "loss": 0.4234, "num_input_tokens_seen": 20188032, "step": 19580 }, { "epoch": 13.100334448160535, "grad_norm": 2.1796138286590576, "learning_rate": 3.2084327311165707e-06, "loss": 0.3458, "num_input_tokens_seen": 20192544, "step": 19585 }, { "epoch": 13.103678929765886, "grad_norm": 3.0279598236083984, "learning_rate": 3.2057082175386182e-06, "loss": 0.3687, "num_input_tokens_seen": 20198048, "step": 19590 }, { "epoch": 13.107023411371237, "grad_norm": 3.0305683612823486, "learning_rate": 3.202984315332481e-06, "loss": 0.4028, "num_input_tokens_seen": 20202816, "step": 19595 }, { "epoch": 13.110367892976589, "grad_norm": 1.9774885177612305, "learning_rate": 3.2002610254262812e-06, "loss": 0.3741, "num_input_tokens_seen": 20208128, "step": 19600 }, { "epoch": 13.11371237458194, "grad_norm": 2.2860867977142334, "learning_rate": 3.197538348747927e-06, "loss": 0.3842, "num_input_tokens_seen": 20213312, "step": 19605 }, { "epoch": 13.117056856187292, "grad_norm": 2.842475414276123, "learning_rate": 3.1948162862251226e-06, "loss": 0.4063, "num_input_tokens_seen": 20218240, "step": 19610 }, { "epoch": 13.120401337792643, "grad_norm": 3.16322922706604, "learning_rate": 3.1920948387853567e-06, "loss": 0.4003, "num_input_tokens_seen": 20223424, "step": 19615 }, { "epoch": 13.123745819397993, "grad_norm": 2.1996309757232666, "learning_rate": 3.189374007355917e-06, "loss": 0.3545, "num_input_tokens_seen": 20229440, "step": 19620 }, { "epoch": 13.127090301003344, "grad_norm": 2.4279279708862305, "learning_rate": 3.186653792863873e-06, "loss": 0.3797, "num_input_tokens_seen": 20235040, "step": 19625 }, { "epoch": 13.130434782608695, "grad_norm": 2.5271174907684326, "learning_rate": 3.18393419623609e-06, "loss": 0.4007, "num_input_tokens_seen": 20240064, "step": 19630 }, { "epoch": 13.133779264214047, "grad_norm": 2.6561810970306396, "learning_rate": 3.1812152183992207e-06, "loss": 0.3625, "num_input_tokens_seen": 20244928, "step": 19635 }, { "epoch": 13.137123745819398, "grad_norm": 1.9817192554473877, "learning_rate": 3.1784968602797023e-06, "loss": 0.3008, "num_input_tokens_seen": 20249824, "step": 19640 }, { "epoch": 13.14046822742475, "grad_norm": 2.6359145641326904, "learning_rate": 3.1757791228037703e-06, "loss": 0.3947, "num_input_tokens_seen": 20254976, "step": 19645 }, { "epoch": 13.143812709030101, "grad_norm": 2.5244996547698975, "learning_rate": 3.1730620068974418e-06, "loss": 0.3569, "num_input_tokens_seen": 20259616, "step": 19650 }, { "epoch": 13.147157190635452, "grad_norm": 2.1294050216674805, "learning_rate": 3.170345513486523e-06, "loss": 0.3845, "num_input_tokens_seen": 20265152, "step": 19655 }, { "epoch": 13.150501672240802, "grad_norm": 2.6758205890655518, "learning_rate": 3.1676296434966082e-06, "loss": 0.4194, "num_input_tokens_seen": 20271168, "step": 19660 }, { "epoch": 13.153846153846153, "grad_norm": 3.0617477893829346, "learning_rate": 3.164914397853083e-06, "loss": 0.4135, "num_input_tokens_seen": 20277216, "step": 19665 }, { "epoch": 13.157190635451505, "grad_norm": 1.928092122077942, "learning_rate": 3.1621997774811155e-06, "loss": 0.3902, "num_input_tokens_seen": 20281568, "step": 19670 }, { "epoch": 13.160535117056856, "grad_norm": 2.743966579437256, "learning_rate": 3.1594857833056613e-06, "loss": 0.3197, "num_input_tokens_seen": 20286560, "step": 19675 }, { "epoch": 13.163879598662207, "grad_norm": 2.133603811264038, "learning_rate": 3.156772416251466e-06, "loss": 0.3187, "num_input_tokens_seen": 20292096, "step": 19680 }, { "epoch": 13.167224080267559, "grad_norm": 3.633084297180176, "learning_rate": 3.1540596772430567e-06, "loss": 0.4468, "num_input_tokens_seen": 20297024, "step": 19685 }, { "epoch": 13.17056856187291, "grad_norm": 2.184556484222412, "learning_rate": 3.1513475672047534e-06, "loss": 0.3784, "num_input_tokens_seen": 20302400, "step": 19690 }, { "epoch": 13.173913043478262, "grad_norm": 2.742868423461914, "learning_rate": 3.1486360870606547e-06, "loss": 0.4824, "num_input_tokens_seen": 20308000, "step": 19695 }, { "epoch": 13.177257525083611, "grad_norm": 1.6252145767211914, "learning_rate": 3.145925237734647e-06, "loss": 0.4329, "num_input_tokens_seen": 20314176, "step": 19700 }, { "epoch": 13.180602006688963, "grad_norm": 2.2045161724090576, "learning_rate": 3.143215020150402e-06, "loss": 0.3318, "num_input_tokens_seen": 20319840, "step": 19705 }, { "epoch": 13.183946488294314, "grad_norm": 2.5331525802612305, "learning_rate": 3.140505435231379e-06, "loss": 0.413, "num_input_tokens_seen": 20325312, "step": 19710 }, { "epoch": 13.187290969899665, "grad_norm": 2.069420099258423, "learning_rate": 3.1377964839008173e-06, "loss": 0.3408, "num_input_tokens_seen": 20330656, "step": 19715 }, { "epoch": 13.190635451505017, "grad_norm": 2.9304358959198, "learning_rate": 3.1350881670817415e-06, "loss": 0.3711, "num_input_tokens_seen": 20335232, "step": 19720 }, { "epoch": 13.193979933110368, "grad_norm": 2.428997278213501, "learning_rate": 3.132380485696962e-06, "loss": 0.3542, "num_input_tokens_seen": 20341152, "step": 19725 }, { "epoch": 13.19732441471572, "grad_norm": 2.6811137199401855, "learning_rate": 3.129673440669069e-06, "loss": 0.362, "num_input_tokens_seen": 20346240, "step": 19730 }, { "epoch": 13.200668896321071, "grad_norm": 2.8844594955444336, "learning_rate": 3.12696703292044e-06, "loss": 0.361, "num_input_tokens_seen": 20351360, "step": 19735 }, { "epoch": 13.20401337792642, "grad_norm": 2.114347219467163, "learning_rate": 3.1242612633732337e-06, "loss": 0.4226, "num_input_tokens_seen": 20356512, "step": 19740 }, { "epoch": 13.207357859531772, "grad_norm": 3.2151219844818115, "learning_rate": 3.1215561329493914e-06, "loss": 0.3456, "num_input_tokens_seen": 20361472, "step": 19745 }, { "epoch": 13.210702341137123, "grad_norm": 1.5796067714691162, "learning_rate": 3.1188516425706334e-06, "loss": 0.316, "num_input_tokens_seen": 20367168, "step": 19750 }, { "epoch": 13.214046822742475, "grad_norm": 2.1169850826263428, "learning_rate": 3.116147793158469e-06, "loss": 0.3797, "num_input_tokens_seen": 20372544, "step": 19755 }, { "epoch": 13.217391304347826, "grad_norm": 2.352609395980835, "learning_rate": 3.113444585634183e-06, "loss": 0.3197, "num_input_tokens_seen": 20377184, "step": 19760 }, { "epoch": 13.220735785953178, "grad_norm": 2.838265895843506, "learning_rate": 3.1107420209188432e-06, "loss": 0.33, "num_input_tokens_seen": 20382304, "step": 19765 }, { "epoch": 13.224080267558529, "grad_norm": 3.477466344833374, "learning_rate": 3.108040099933298e-06, "loss": 0.4426, "num_input_tokens_seen": 20386976, "step": 19770 }, { "epoch": 13.22742474916388, "grad_norm": 3.2671852111816406, "learning_rate": 3.1053388235981785e-06, "loss": 0.3219, "num_input_tokens_seen": 20392704, "step": 19775 }, { "epoch": 13.23076923076923, "grad_norm": 3.4392731189727783, "learning_rate": 3.1026381928338944e-06, "loss": 0.4356, "num_input_tokens_seen": 20397888, "step": 19780 }, { "epoch": 13.234113712374581, "grad_norm": 4.084686279296875, "learning_rate": 3.0999382085606366e-06, "loss": 0.3393, "num_input_tokens_seen": 20402720, "step": 19785 }, { "epoch": 13.237458193979933, "grad_norm": 2.100078821182251, "learning_rate": 3.097238871698374e-06, "loss": 0.4097, "num_input_tokens_seen": 20408608, "step": 19790 }, { "epoch": 13.240802675585284, "grad_norm": 3.055196762084961, "learning_rate": 3.0945401831668544e-06, "loss": 0.389, "num_input_tokens_seen": 20413728, "step": 19795 }, { "epoch": 13.244147157190636, "grad_norm": 2.2112488746643066, "learning_rate": 3.091842143885609e-06, "loss": 0.3475, "num_input_tokens_seen": 20418368, "step": 19800 }, { "epoch": 13.247491638795987, "grad_norm": 3.3298230171203613, "learning_rate": 3.089144754773944e-06, "loss": 0.3979, "num_input_tokens_seen": 20423936, "step": 19805 }, { "epoch": 13.250836120401338, "grad_norm": 2.8117263317108154, "learning_rate": 3.086448016750944e-06, "loss": 0.3614, "num_input_tokens_seen": 20427872, "step": 19810 }, { "epoch": 13.25418060200669, "grad_norm": 5.36260986328125, "learning_rate": 3.0837519307354723e-06, "loss": 0.4351, "num_input_tokens_seen": 20433024, "step": 19815 }, { "epoch": 13.25752508361204, "grad_norm": 2.8456592559814453, "learning_rate": 3.0810564976461723e-06, "loss": 0.4502, "num_input_tokens_seen": 20438176, "step": 19820 }, { "epoch": 13.26086956521739, "grad_norm": 2.9327926635742188, "learning_rate": 3.0783617184014613e-06, "loss": 0.4407, "num_input_tokens_seen": 20442976, "step": 19825 }, { "epoch": 13.264214046822742, "grad_norm": 3.293954610824585, "learning_rate": 3.075667593919538e-06, "loss": 0.3127, "num_input_tokens_seen": 20448288, "step": 19830 }, { "epoch": 13.267558528428093, "grad_norm": 2.3868088722229004, "learning_rate": 3.0729741251183742e-06, "loss": 0.4531, "num_input_tokens_seen": 20454240, "step": 19835 }, { "epoch": 13.270903010033445, "grad_norm": 2.02669095993042, "learning_rate": 3.070281312915717e-06, "loss": 0.4872, "num_input_tokens_seen": 20459424, "step": 19840 }, { "epoch": 13.274247491638796, "grad_norm": 2.3157570362091064, "learning_rate": 3.0675891582290973e-06, "loss": 0.3096, "num_input_tokens_seen": 20465920, "step": 19845 }, { "epoch": 13.277591973244148, "grad_norm": 1.9949488639831543, "learning_rate": 3.0648976619758145e-06, "loss": 0.4662, "num_input_tokens_seen": 20471712, "step": 19850 }, { "epoch": 13.280936454849499, "grad_norm": 2.682711601257324, "learning_rate": 3.0622068250729475e-06, "loss": 0.3263, "num_input_tokens_seen": 20476736, "step": 19855 }, { "epoch": 13.284280936454849, "grad_norm": 2.6725199222564697, "learning_rate": 3.0595166484373484e-06, "loss": 0.3738, "num_input_tokens_seen": 20481888, "step": 19860 }, { "epoch": 13.2876254180602, "grad_norm": 1.9642763137817383, "learning_rate": 3.0568271329856425e-06, "loss": 0.4353, "num_input_tokens_seen": 20487136, "step": 19865 }, { "epoch": 13.290969899665551, "grad_norm": 2.386380195617676, "learning_rate": 3.0541382796342377e-06, "loss": 0.4, "num_input_tokens_seen": 20492064, "step": 19870 }, { "epoch": 13.294314381270903, "grad_norm": 2.765810489654541, "learning_rate": 3.051450089299308e-06, "loss": 0.3344, "num_input_tokens_seen": 20496704, "step": 19875 }, { "epoch": 13.297658862876254, "grad_norm": 3.1553306579589844, "learning_rate": 3.0487625628968074e-06, "loss": 0.3609, "num_input_tokens_seen": 20500896, "step": 19880 }, { "epoch": 13.301003344481606, "grad_norm": 2.9581453800201416, "learning_rate": 3.0460757013424556e-06, "loss": 0.3969, "num_input_tokens_seen": 20505952, "step": 19885 }, { "epoch": 13.304347826086957, "grad_norm": 2.3556602001190186, "learning_rate": 3.043389505551758e-06, "loss": 0.3696, "num_input_tokens_seen": 20510336, "step": 19890 }, { "epoch": 13.307692307692308, "grad_norm": 2.9144346714019775, "learning_rate": 3.040703976439982e-06, "loss": 0.3375, "num_input_tokens_seen": 20515200, "step": 19895 }, { "epoch": 13.31103678929766, "grad_norm": 1.8798946142196655, "learning_rate": 3.038019114922173e-06, "loss": 0.3824, "num_input_tokens_seen": 20519872, "step": 19900 }, { "epoch": 13.31438127090301, "grad_norm": 2.4361329078674316, "learning_rate": 3.0353349219131455e-06, "loss": 0.4593, "num_input_tokens_seen": 20525216, "step": 19905 }, { "epoch": 13.31772575250836, "grad_norm": 2.1000354290008545, "learning_rate": 3.032651398327494e-06, "loss": 0.4421, "num_input_tokens_seen": 20529632, "step": 19910 }, { "epoch": 13.321070234113712, "grad_norm": 1.9519093036651611, "learning_rate": 3.029968545079577e-06, "loss": 0.4232, "num_input_tokens_seen": 20535264, "step": 19915 }, { "epoch": 13.324414715719064, "grad_norm": 3.107285737991333, "learning_rate": 3.0272863630835246e-06, "loss": 0.3548, "num_input_tokens_seen": 20540288, "step": 19920 }, { "epoch": 13.327759197324415, "grad_norm": 2.189059257507324, "learning_rate": 3.024604853253246e-06, "loss": 0.373, "num_input_tokens_seen": 20545344, "step": 19925 }, { "epoch": 13.331103678929766, "grad_norm": 4.028112411499023, "learning_rate": 3.02192401650241e-06, "loss": 0.338, "num_input_tokens_seen": 20549856, "step": 19930 }, { "epoch": 13.334448160535118, "grad_norm": 2.440335512161255, "learning_rate": 3.019243853744468e-06, "loss": 0.3496, "num_input_tokens_seen": 20555008, "step": 19935 }, { "epoch": 13.337792642140467, "grad_norm": 2.830875873565674, "learning_rate": 3.016564365892634e-06, "loss": 0.3783, "num_input_tokens_seen": 20559200, "step": 19940 }, { "epoch": 13.341137123745819, "grad_norm": 4.124083042144775, "learning_rate": 3.0138855538598944e-06, "loss": 0.4411, "num_input_tokens_seen": 20564640, "step": 19945 }, { "epoch": 13.34448160535117, "grad_norm": 3.2866861820220947, "learning_rate": 3.011207418559001e-06, "loss": 0.3579, "num_input_tokens_seen": 20568928, "step": 19950 }, { "epoch": 13.347826086956522, "grad_norm": 2.2049202919006348, "learning_rate": 3.0085299609024856e-06, "loss": 0.4309, "num_input_tokens_seen": 20574016, "step": 19955 }, { "epoch": 13.351170568561873, "grad_norm": 1.9342689514160156, "learning_rate": 3.005853181802638e-06, "loss": 0.3533, "num_input_tokens_seen": 20579360, "step": 19960 }, { "epoch": 13.354515050167224, "grad_norm": 2.3103768825531006, "learning_rate": 3.0031770821715233e-06, "loss": 0.3412, "num_input_tokens_seen": 20584032, "step": 19965 }, { "epoch": 13.357859531772576, "grad_norm": 2.263387680053711, "learning_rate": 3.0005016629209715e-06, "loss": 0.4192, "num_input_tokens_seen": 20588576, "step": 19970 }, { "epoch": 13.361204013377927, "grad_norm": 2.5100910663604736, "learning_rate": 2.9978269249625824e-06, "loss": 0.3719, "num_input_tokens_seen": 20593952, "step": 19975 }, { "epoch": 13.364548494983278, "grad_norm": 2.4640281200408936, "learning_rate": 2.995152869207725e-06, "loss": 0.3196, "num_input_tokens_seen": 20598688, "step": 19980 }, { "epoch": 13.367892976588628, "grad_norm": 3.598522186279297, "learning_rate": 2.992479496567534e-06, "loss": 0.3704, "num_input_tokens_seen": 20603424, "step": 19985 }, { "epoch": 13.37123745819398, "grad_norm": 3.204672336578369, "learning_rate": 2.9898068079529125e-06, "loss": 0.4096, "num_input_tokens_seen": 20607776, "step": 19990 }, { "epoch": 13.37458193979933, "grad_norm": 2.575134515762329, "learning_rate": 2.987134804274526e-06, "loss": 0.3363, "num_input_tokens_seen": 20612672, "step": 19995 }, { "epoch": 13.377926421404682, "grad_norm": 3.3167672157287598, "learning_rate": 2.9844634864428156e-06, "loss": 0.327, "num_input_tokens_seen": 20617728, "step": 20000 }, { "epoch": 13.381270903010034, "grad_norm": 2.883955955505371, "learning_rate": 2.981792855367982e-06, "loss": 0.3763, "num_input_tokens_seen": 20623488, "step": 20005 }, { "epoch": 13.384615384615385, "grad_norm": 2.8099236488342285, "learning_rate": 2.9791229119599918e-06, "loss": 0.4404, "num_input_tokens_seen": 20628832, "step": 20010 }, { "epoch": 13.387959866220736, "grad_norm": 1.8078253269195557, "learning_rate": 2.9764536571285797e-06, "loss": 0.4259, "num_input_tokens_seen": 20635040, "step": 20015 }, { "epoch": 13.391304347826088, "grad_norm": 1.854294776916504, "learning_rate": 2.973785091783245e-06, "loss": 0.3692, "num_input_tokens_seen": 20640960, "step": 20020 }, { "epoch": 13.394648829431437, "grad_norm": 2.335533618927002, "learning_rate": 2.9711172168332526e-06, "loss": 0.3311, "num_input_tokens_seen": 20645728, "step": 20025 }, { "epoch": 13.397993311036789, "grad_norm": 2.8289504051208496, "learning_rate": 2.968450033187632e-06, "loss": 0.4639, "num_input_tokens_seen": 20650464, "step": 20030 }, { "epoch": 13.40133779264214, "grad_norm": 2.0035111904144287, "learning_rate": 2.965783541755176e-06, "loss": 0.3849, "num_input_tokens_seen": 20655456, "step": 20035 }, { "epoch": 13.404682274247492, "grad_norm": 2.7325172424316406, "learning_rate": 2.9631177434444413e-06, "loss": 0.3593, "num_input_tokens_seen": 20661376, "step": 20040 }, { "epoch": 13.408026755852843, "grad_norm": 2.5729575157165527, "learning_rate": 2.9604526391637523e-06, "loss": 0.3534, "num_input_tokens_seen": 20666240, "step": 20045 }, { "epoch": 13.411371237458194, "grad_norm": 2.5239274501800537, "learning_rate": 2.9577882298211924e-06, "loss": 0.4008, "num_input_tokens_seen": 20670816, "step": 20050 }, { "epoch": 13.414715719063546, "grad_norm": 3.3778154850006104, "learning_rate": 2.95512451632461e-06, "loss": 0.3615, "num_input_tokens_seen": 20676736, "step": 20055 }, { "epoch": 13.418060200668897, "grad_norm": 2.027315616607666, "learning_rate": 2.952461499581616e-06, "loss": 0.3615, "num_input_tokens_seen": 20681920, "step": 20060 }, { "epoch": 13.421404682274247, "grad_norm": 2.426948070526123, "learning_rate": 2.9497991804995844e-06, "loss": 0.3208, "num_input_tokens_seen": 20687264, "step": 20065 }, { "epoch": 13.424749163879598, "grad_norm": 2.589132785797119, "learning_rate": 2.947137559985652e-06, "loss": 0.3885, "num_input_tokens_seen": 20692160, "step": 20070 }, { "epoch": 13.42809364548495, "grad_norm": 2.9061641693115234, "learning_rate": 2.944476638946716e-06, "loss": 0.3909, "num_input_tokens_seen": 20696704, "step": 20075 }, { "epoch": 13.431438127090301, "grad_norm": 2.2042524814605713, "learning_rate": 2.9418164182894383e-06, "loss": 0.467, "num_input_tokens_seen": 20702432, "step": 20080 }, { "epoch": 13.434782608695652, "grad_norm": 3.0367698669433594, "learning_rate": 2.939156898920235e-06, "loss": 0.3461, "num_input_tokens_seen": 20707040, "step": 20085 }, { "epoch": 13.438127090301004, "grad_norm": 2.832839250564575, "learning_rate": 2.9364980817452936e-06, "loss": 0.4659, "num_input_tokens_seen": 20712512, "step": 20090 }, { "epoch": 13.441471571906355, "grad_norm": 2.104599714279175, "learning_rate": 2.9338399676705553e-06, "loss": 0.3058, "num_input_tokens_seen": 20717888, "step": 20095 }, { "epoch": 13.444816053511706, "grad_norm": 1.5978599786758423, "learning_rate": 2.9311825576017234e-06, "loss": 0.3498, "num_input_tokens_seen": 20723904, "step": 20100 }, { "epoch": 13.448160535117056, "grad_norm": 2.162217617034912, "learning_rate": 2.9285258524442605e-06, "loss": 0.3091, "num_input_tokens_seen": 20729152, "step": 20105 }, { "epoch": 13.451505016722408, "grad_norm": 2.82277250289917, "learning_rate": 2.925869853103388e-06, "loss": 0.404, "num_input_tokens_seen": 20733600, "step": 20110 }, { "epoch": 13.454849498327759, "grad_norm": 1.6983453035354614, "learning_rate": 2.9232145604840934e-06, "loss": 0.2695, "num_input_tokens_seen": 20738304, "step": 20115 }, { "epoch": 13.45819397993311, "grad_norm": 3.005681276321411, "learning_rate": 2.920559975491115e-06, "loss": 0.3502, "num_input_tokens_seen": 20742592, "step": 20120 }, { "epoch": 13.461538461538462, "grad_norm": 2.3134312629699707, "learning_rate": 2.9179060990289565e-06, "loss": 0.3938, "num_input_tokens_seen": 20748000, "step": 20125 }, { "epoch": 13.464882943143813, "grad_norm": 2.314673900604248, "learning_rate": 2.9152529320018743e-06, "loss": 0.3495, "num_input_tokens_seen": 20754016, "step": 20130 }, { "epoch": 13.468227424749164, "grad_norm": 2.0926222801208496, "learning_rate": 2.912600475313888e-06, "loss": 0.3747, "num_input_tokens_seen": 20759520, "step": 20135 }, { "epoch": 13.471571906354516, "grad_norm": 3.5115838050842285, "learning_rate": 2.9099487298687745e-06, "loss": 0.428, "num_input_tokens_seen": 20764288, "step": 20140 }, { "epoch": 13.474916387959865, "grad_norm": 2.637112617492676, "learning_rate": 2.907297696570063e-06, "loss": 0.4108, "num_input_tokens_seen": 20769216, "step": 20145 }, { "epoch": 13.478260869565217, "grad_norm": 2.29227614402771, "learning_rate": 2.904647376321047e-06, "loss": 0.4474, "num_input_tokens_seen": 20774720, "step": 20150 }, { "epoch": 13.481605351170568, "grad_norm": 3.5856008529663086, "learning_rate": 2.901997770024774e-06, "loss": 0.3815, "num_input_tokens_seen": 20779392, "step": 20155 }, { "epoch": 13.48494983277592, "grad_norm": 2.2068209648132324, "learning_rate": 2.8993488785840483e-06, "loss": 0.3767, "num_input_tokens_seen": 20784288, "step": 20160 }, { "epoch": 13.488294314381271, "grad_norm": 2.6816699504852295, "learning_rate": 2.8967007029014327e-06, "loss": 0.4519, "num_input_tokens_seen": 20789728, "step": 20165 }, { "epoch": 13.491638795986622, "grad_norm": 4.01925802230835, "learning_rate": 2.89405324387924e-06, "loss": 0.4083, "num_input_tokens_seen": 20794336, "step": 20170 }, { "epoch": 13.494983277591974, "grad_norm": 2.459385633468628, "learning_rate": 2.891406502419546e-06, "loss": 0.4042, "num_input_tokens_seen": 20799872, "step": 20175 }, { "epoch": 13.498327759197325, "grad_norm": 2.229933738708496, "learning_rate": 2.888760479424178e-06, "loss": 0.3429, "num_input_tokens_seen": 20805088, "step": 20180 }, { "epoch": 13.501672240802675, "grad_norm": 4.362517356872559, "learning_rate": 2.886115175794722e-06, "loss": 0.3069, "num_input_tokens_seen": 20809856, "step": 20185 }, { "epoch": 13.505016722408026, "grad_norm": 2.4013559818267822, "learning_rate": 2.883470592432512e-06, "loss": 0.389, "num_input_tokens_seen": 20814976, "step": 20190 }, { "epoch": 13.508361204013378, "grad_norm": 2.866361141204834, "learning_rate": 2.880826730238643e-06, "loss": 0.402, "num_input_tokens_seen": 20820512, "step": 20195 }, { "epoch": 13.511705685618729, "grad_norm": 1.7081583738327026, "learning_rate": 2.878183590113963e-06, "loss": 0.3936, "num_input_tokens_seen": 20826496, "step": 20200 }, { "epoch": 13.51505016722408, "grad_norm": 4.214730262756348, "learning_rate": 2.8755411729590722e-06, "loss": 0.4251, "num_input_tokens_seen": 20831968, "step": 20205 }, { "epoch": 13.518394648829432, "grad_norm": 2.825387477874756, "learning_rate": 2.872899479674328e-06, "loss": 0.502, "num_input_tokens_seen": 20836416, "step": 20210 }, { "epoch": 13.521739130434783, "grad_norm": 2.629171133041382, "learning_rate": 2.8702585111598345e-06, "loss": 0.4519, "num_input_tokens_seen": 20841728, "step": 20215 }, { "epoch": 13.525083612040135, "grad_norm": 2.280170202255249, "learning_rate": 2.8676182683154563e-06, "loss": 0.3095, "num_input_tokens_seen": 20846528, "step": 20220 }, { "epoch": 13.528428093645484, "grad_norm": 2.9790542125701904, "learning_rate": 2.864978752040805e-06, "loss": 0.5068, "num_input_tokens_seen": 20851360, "step": 20225 }, { "epoch": 13.531772575250836, "grad_norm": 2.895307779312134, "learning_rate": 2.862339963235251e-06, "loss": 0.4037, "num_input_tokens_seen": 20856704, "step": 20230 }, { "epoch": 13.535117056856187, "grad_norm": 2.1776275634765625, "learning_rate": 2.859701902797908e-06, "loss": 0.4098, "num_input_tokens_seen": 20862528, "step": 20235 }, { "epoch": 13.538461538461538, "grad_norm": 2.79632306098938, "learning_rate": 2.857064571627648e-06, "loss": 0.4099, "num_input_tokens_seen": 20867552, "step": 20240 }, { "epoch": 13.54180602006689, "grad_norm": 2.6332030296325684, "learning_rate": 2.854427970623094e-06, "loss": 0.3437, "num_input_tokens_seen": 20873472, "step": 20245 }, { "epoch": 13.545150501672241, "grad_norm": 2.693507432937622, "learning_rate": 2.851792100682619e-06, "loss": 0.3851, "num_input_tokens_seen": 20878208, "step": 20250 }, { "epoch": 13.548494983277592, "grad_norm": 3.651235342025757, "learning_rate": 2.8491569627043487e-06, "loss": 0.369, "num_input_tokens_seen": 20883744, "step": 20255 }, { "epoch": 13.551839464882944, "grad_norm": 2.784569025039673, "learning_rate": 2.8465225575861533e-06, "loss": 0.458, "num_input_tokens_seen": 20889216, "step": 20260 }, { "epoch": 13.555183946488294, "grad_norm": 3.0214290618896484, "learning_rate": 2.8438888862256607e-06, "loss": 0.3895, "num_input_tokens_seen": 20893792, "step": 20265 }, { "epoch": 13.558528428093645, "grad_norm": 2.247727870941162, "learning_rate": 2.841255949520245e-06, "loss": 0.4286, "num_input_tokens_seen": 20898592, "step": 20270 }, { "epoch": 13.561872909698996, "grad_norm": 3.019260883331299, "learning_rate": 2.8386237483670338e-06, "loss": 0.3304, "num_input_tokens_seen": 20903200, "step": 20275 }, { "epoch": 13.565217391304348, "grad_norm": 1.993800401687622, "learning_rate": 2.835992283662896e-06, "loss": 0.3649, "num_input_tokens_seen": 20908480, "step": 20280 }, { "epoch": 13.568561872909699, "grad_norm": 3.023793935775757, "learning_rate": 2.833361556304457e-06, "loss": 0.3445, "num_input_tokens_seen": 20913184, "step": 20285 }, { "epoch": 13.57190635451505, "grad_norm": 3.9071826934814453, "learning_rate": 2.8307315671880884e-06, "loss": 0.343, "num_input_tokens_seen": 20917536, "step": 20290 }, { "epoch": 13.575250836120402, "grad_norm": 2.826423406600952, "learning_rate": 2.8281023172099108e-06, "loss": 0.336, "num_input_tokens_seen": 20922176, "step": 20295 }, { "epoch": 13.578595317725753, "grad_norm": 2.735137701034546, "learning_rate": 2.825473807265795e-06, "loss": 0.3251, "num_input_tokens_seen": 20927744, "step": 20300 }, { "epoch": 13.581939799331103, "grad_norm": 2.843139171600342, "learning_rate": 2.822846038251352e-06, "loss": 0.3412, "num_input_tokens_seen": 20933120, "step": 20305 }, { "epoch": 13.585284280936454, "grad_norm": 2.231327533721924, "learning_rate": 2.820219011061949e-06, "loss": 0.4374, "num_input_tokens_seen": 20938656, "step": 20310 }, { "epoch": 13.588628762541806, "grad_norm": 3.0755207538604736, "learning_rate": 2.8175927265926965e-06, "loss": 0.3962, "num_input_tokens_seen": 20944544, "step": 20315 }, { "epoch": 13.591973244147157, "grad_norm": 2.5612571239471436, "learning_rate": 2.8149671857384544e-06, "loss": 0.3542, "num_input_tokens_seen": 20949600, "step": 20320 }, { "epoch": 13.595317725752508, "grad_norm": 2.2558231353759766, "learning_rate": 2.812342389393823e-06, "loss": 0.3312, "num_input_tokens_seen": 20954752, "step": 20325 }, { "epoch": 13.59866220735786, "grad_norm": 2.564406633377075, "learning_rate": 2.8097183384531556e-06, "loss": 0.3995, "num_input_tokens_seen": 20960288, "step": 20330 }, { "epoch": 13.602006688963211, "grad_norm": 2.1134376525878906, "learning_rate": 2.8070950338105494e-06, "loss": 0.3466, "num_input_tokens_seen": 20965696, "step": 20335 }, { "epoch": 13.605351170568563, "grad_norm": 2.8765885829925537, "learning_rate": 2.8044724763598485e-06, "loss": 0.4853, "num_input_tokens_seen": 20970080, "step": 20340 }, { "epoch": 13.608695652173914, "grad_norm": 2.1998209953308105, "learning_rate": 2.801850666994637e-06, "loss": 0.4431, "num_input_tokens_seen": 20976384, "step": 20345 }, { "epoch": 13.612040133779264, "grad_norm": 2.986476421356201, "learning_rate": 2.7992296066082504e-06, "loss": 0.4327, "num_input_tokens_seen": 20980896, "step": 20350 }, { "epoch": 13.615384615384615, "grad_norm": 2.747837781906128, "learning_rate": 2.796609296093767e-06, "loss": 0.3152, "num_input_tokens_seen": 20986048, "step": 20355 }, { "epoch": 13.618729096989966, "grad_norm": 1.846983551979065, "learning_rate": 2.7939897363440083e-06, "loss": 0.3548, "num_input_tokens_seen": 20991552, "step": 20360 }, { "epoch": 13.622073578595318, "grad_norm": 2.143587827682495, "learning_rate": 2.7913709282515434e-06, "loss": 0.4117, "num_input_tokens_seen": 20996544, "step": 20365 }, { "epoch": 13.62541806020067, "grad_norm": 2.5347824096679688, "learning_rate": 2.7887528727086786e-06, "loss": 0.3458, "num_input_tokens_seen": 21001664, "step": 20370 }, { "epoch": 13.62876254180602, "grad_norm": 2.6317524909973145, "learning_rate": 2.786135570607473e-06, "loss": 0.3709, "num_input_tokens_seen": 21006816, "step": 20375 }, { "epoch": 13.632107023411372, "grad_norm": 2.4268243312835693, "learning_rate": 2.783519022839716e-06, "loss": 0.4078, "num_input_tokens_seen": 21012064, "step": 20380 }, { "epoch": 13.635451505016722, "grad_norm": 4.821837425231934, "learning_rate": 2.7809032302969587e-06, "loss": 0.4148, "num_input_tokens_seen": 21017664, "step": 20385 }, { "epoch": 13.638795986622073, "grad_norm": 2.17348051071167, "learning_rate": 2.778288193870476e-06, "loss": 0.3915, "num_input_tokens_seen": 21023136, "step": 20390 }, { "epoch": 13.642140468227424, "grad_norm": 2.9565954208374023, "learning_rate": 2.7756739144512957e-06, "loss": 0.3218, "num_input_tokens_seen": 21029120, "step": 20395 }, { "epoch": 13.645484949832776, "grad_norm": 2.2349231243133545, "learning_rate": 2.773060392930186e-06, "loss": 0.3456, "num_input_tokens_seen": 21033856, "step": 20400 }, { "epoch": 13.648829431438127, "grad_norm": 2.1441354751586914, "learning_rate": 2.7704476301976555e-06, "loss": 0.3493, "num_input_tokens_seen": 21039168, "step": 20405 }, { "epoch": 13.652173913043478, "grad_norm": 2.9371843338012695, "learning_rate": 2.7678356271439572e-06, "loss": 0.4482, "num_input_tokens_seen": 21044576, "step": 20410 }, { "epoch": 13.65551839464883, "grad_norm": 2.2415668964385986, "learning_rate": 2.7652243846590786e-06, "loss": 0.331, "num_input_tokens_seen": 21049856, "step": 20415 }, { "epoch": 13.658862876254181, "grad_norm": 2.6977601051330566, "learning_rate": 2.762613903632755e-06, "loss": 0.3314, "num_input_tokens_seen": 21054912, "step": 20420 }, { "epoch": 13.662207357859533, "grad_norm": 2.4481422901153564, "learning_rate": 2.760004184954459e-06, "loss": 0.4002, "num_input_tokens_seen": 21059744, "step": 20425 }, { "epoch": 13.665551839464882, "grad_norm": 2.240353584289551, "learning_rate": 2.7573952295134066e-06, "loss": 0.4158, "num_input_tokens_seen": 21064864, "step": 20430 }, { "epoch": 13.668896321070234, "grad_norm": 1.9239236116409302, "learning_rate": 2.7547870381985463e-06, "loss": 0.4013, "num_input_tokens_seen": 21070208, "step": 20435 }, { "epoch": 13.672240802675585, "grad_norm": 2.2266125679016113, "learning_rate": 2.7521796118985744e-06, "loss": 0.3603, "num_input_tokens_seen": 21075328, "step": 20440 }, { "epoch": 13.675585284280936, "grad_norm": 3.4743876457214355, "learning_rate": 2.749572951501923e-06, "loss": 0.4161, "num_input_tokens_seen": 21079968, "step": 20445 }, { "epoch": 13.678929765886288, "grad_norm": 2.291890859603882, "learning_rate": 2.7469670578967635e-06, "loss": 0.3105, "num_input_tokens_seen": 21085440, "step": 20450 }, { "epoch": 13.68227424749164, "grad_norm": 2.532275915145874, "learning_rate": 2.744361931971007e-06, "loss": 0.47, "num_input_tokens_seen": 21089920, "step": 20455 }, { "epoch": 13.68561872909699, "grad_norm": 2.346853256225586, "learning_rate": 2.741757574612299e-06, "loss": 0.3116, "num_input_tokens_seen": 21094560, "step": 20460 }, { "epoch": 13.68896321070234, "grad_norm": 3.451488733291626, "learning_rate": 2.7391539867080286e-06, "loss": 0.3116, "num_input_tokens_seen": 21099296, "step": 20465 }, { "epoch": 13.692307692307692, "grad_norm": 3.062701463699341, "learning_rate": 2.7365511691453195e-06, "loss": 0.4413, "num_input_tokens_seen": 21104800, "step": 20470 }, { "epoch": 13.695652173913043, "grad_norm": 2.6588916778564453, "learning_rate": 2.733949122811036e-06, "loss": 0.5063, "num_input_tokens_seen": 21111136, "step": 20475 }, { "epoch": 13.698996655518394, "grad_norm": 2.9699442386627197, "learning_rate": 2.731347848591774e-06, "loss": 0.3537, "num_input_tokens_seen": 21116288, "step": 20480 }, { "epoch": 13.702341137123746, "grad_norm": 2.402008056640625, "learning_rate": 2.7287473473738707e-06, "loss": 0.3645, "num_input_tokens_seen": 21121472, "step": 20485 }, { "epoch": 13.705685618729097, "grad_norm": 2.1312673091888428, "learning_rate": 2.7261476200433994e-06, "loss": 0.4389, "num_input_tokens_seen": 21126816, "step": 20490 }, { "epoch": 13.709030100334449, "grad_norm": 4.534657955169678, "learning_rate": 2.72354866748617e-06, "loss": 0.4017, "num_input_tokens_seen": 21132032, "step": 20495 }, { "epoch": 13.7123745819398, "grad_norm": 3.390652894973755, "learning_rate": 2.7209504905877283e-06, "loss": 0.3243, "num_input_tokens_seen": 21136448, "step": 20500 }, { "epoch": 13.715719063545151, "grad_norm": 3.183903217315674, "learning_rate": 2.718353090233351e-06, "loss": 0.3506, "num_input_tokens_seen": 21141632, "step": 20505 }, { "epoch": 13.719063545150501, "grad_norm": 3.7841405868530273, "learning_rate": 2.7157564673080583e-06, "loss": 0.3334, "num_input_tokens_seen": 21146336, "step": 20510 }, { "epoch": 13.722408026755852, "grad_norm": 1.967186689376831, "learning_rate": 2.713160622696599e-06, "loss": 0.3337, "num_input_tokens_seen": 21151520, "step": 20515 }, { "epoch": 13.725752508361204, "grad_norm": 2.3428261280059814, "learning_rate": 2.710565557283463e-06, "loss": 0.4232, "num_input_tokens_seen": 21156640, "step": 20520 }, { "epoch": 13.729096989966555, "grad_norm": 2.456737756729126, "learning_rate": 2.7079712719528674e-06, "loss": 0.4018, "num_input_tokens_seen": 21162080, "step": 20525 }, { "epoch": 13.732441471571907, "grad_norm": 2.7527353763580322, "learning_rate": 2.705377767588767e-06, "loss": 0.3371, "num_input_tokens_seen": 21166784, "step": 20530 }, { "epoch": 13.735785953177258, "grad_norm": 3.0318360328674316, "learning_rate": 2.7027850450748528e-06, "loss": 0.3755, "num_input_tokens_seen": 21171776, "step": 20535 }, { "epoch": 13.73913043478261, "grad_norm": 2.2210991382598877, "learning_rate": 2.700193105294545e-06, "loss": 0.3544, "num_input_tokens_seen": 21177248, "step": 20540 }, { "epoch": 13.742474916387959, "grad_norm": 2.6407415866851807, "learning_rate": 2.6976019491310025e-06, "loss": 0.3528, "num_input_tokens_seen": 21183136, "step": 20545 }, { "epoch": 13.74581939799331, "grad_norm": 2.594068765640259, "learning_rate": 2.69501157746711e-06, "loss": 0.3893, "num_input_tokens_seen": 21188064, "step": 20550 }, { "epoch": 13.749163879598662, "grad_norm": 3.227682113647461, "learning_rate": 2.6924219911854906e-06, "loss": 0.3598, "num_input_tokens_seen": 21193280, "step": 20555 }, { "epoch": 13.752508361204013, "grad_norm": 2.651024341583252, "learning_rate": 2.6898331911684976e-06, "loss": 0.3882, "num_input_tokens_seen": 21198016, "step": 20560 }, { "epoch": 13.755852842809364, "grad_norm": 2.496243953704834, "learning_rate": 2.68724517829822e-06, "loss": 0.4441, "num_input_tokens_seen": 21202176, "step": 20565 }, { "epoch": 13.759197324414716, "grad_norm": 3.275977373123169, "learning_rate": 2.68465795345647e-06, "loss": 0.3601, "num_input_tokens_seen": 21207968, "step": 20570 }, { "epoch": 13.762541806020067, "grad_norm": 5.352959156036377, "learning_rate": 2.6820715175248017e-06, "loss": 0.4336, "num_input_tokens_seen": 21213120, "step": 20575 }, { "epoch": 13.765886287625419, "grad_norm": 2.9104249477386475, "learning_rate": 2.6794858713844895e-06, "loss": 0.4547, "num_input_tokens_seen": 21218336, "step": 20580 }, { "epoch": 13.76923076923077, "grad_norm": 2.369330406188965, "learning_rate": 2.676901015916552e-06, "loss": 0.3155, "num_input_tokens_seen": 21223712, "step": 20585 }, { "epoch": 13.77257525083612, "grad_norm": 1.7345045804977417, "learning_rate": 2.6743169520017254e-06, "loss": 0.2999, "num_input_tokens_seen": 21228896, "step": 20590 }, { "epoch": 13.775919732441471, "grad_norm": 2.7463231086730957, "learning_rate": 2.671733680520483e-06, "loss": 0.3049, "num_input_tokens_seen": 21233952, "step": 20595 }, { "epoch": 13.779264214046822, "grad_norm": 2.329868793487549, "learning_rate": 2.6691512023530285e-06, "loss": 0.3348, "num_input_tokens_seen": 21239488, "step": 20600 }, { "epoch": 13.782608695652174, "grad_norm": 2.0139057636260986, "learning_rate": 2.6665695183792922e-06, "loss": 0.3435, "num_input_tokens_seen": 21244992, "step": 20605 }, { "epoch": 13.785953177257525, "grad_norm": 3.3940088748931885, "learning_rate": 2.6639886294789374e-06, "loss": 0.3881, "num_input_tokens_seen": 21249312, "step": 20610 }, { "epoch": 13.789297658862877, "grad_norm": 3.817420482635498, "learning_rate": 2.661408536531352e-06, "loss": 0.5167, "num_input_tokens_seen": 21254368, "step": 20615 }, { "epoch": 13.792642140468228, "grad_norm": 3.7559800148010254, "learning_rate": 2.658829240415657e-06, "loss": 0.3552, "num_input_tokens_seen": 21259200, "step": 20620 }, { "epoch": 13.79598662207358, "grad_norm": 3.2337100505828857, "learning_rate": 2.656250742010695e-06, "loss": 0.3719, "num_input_tokens_seen": 21264416, "step": 20625 }, { "epoch": 13.799331103678929, "grad_norm": 2.957136392593384, "learning_rate": 2.6536730421950496e-06, "loss": 0.374, "num_input_tokens_seen": 21269632, "step": 20630 }, { "epoch": 13.80267558528428, "grad_norm": 2.6941640377044678, "learning_rate": 2.6510961418470183e-06, "loss": 0.3223, "num_input_tokens_seen": 21274912, "step": 20635 }, { "epoch": 13.806020066889632, "grad_norm": 2.3903849124908447, "learning_rate": 2.6485200418446345e-06, "loss": 0.4132, "num_input_tokens_seen": 21279872, "step": 20640 }, { "epoch": 13.809364548494983, "grad_norm": 3.1728341579437256, "learning_rate": 2.645944743065657e-06, "loss": 0.3887, "num_input_tokens_seen": 21285376, "step": 20645 }, { "epoch": 13.812709030100335, "grad_norm": 3.2137367725372314, "learning_rate": 2.6433702463875703e-06, "loss": 0.4392, "num_input_tokens_seen": 21290784, "step": 20650 }, { "epoch": 13.816053511705686, "grad_norm": 2.6397087574005127, "learning_rate": 2.64079655268759e-06, "loss": 0.4338, "num_input_tokens_seen": 21295104, "step": 20655 }, { "epoch": 13.819397993311037, "grad_norm": 2.439218044281006, "learning_rate": 2.63822366284265e-06, "loss": 0.3889, "num_input_tokens_seen": 21300768, "step": 20660 }, { "epoch": 13.822742474916389, "grad_norm": 2.8288209438323975, "learning_rate": 2.635651577729419e-06, "loss": 0.3642, "num_input_tokens_seen": 21305344, "step": 20665 }, { "epoch": 13.826086956521738, "grad_norm": 2.3322672843933105, "learning_rate": 2.633080298224282e-06, "loss": 0.2907, "num_input_tokens_seen": 21310080, "step": 20670 }, { "epoch": 13.82943143812709, "grad_norm": 2.137132406234741, "learning_rate": 2.6305098252033625e-06, "loss": 0.4465, "num_input_tokens_seen": 21316128, "step": 20675 }, { "epoch": 13.832775919732441, "grad_norm": 2.9462168216705322, "learning_rate": 2.6279401595424958e-06, "loss": 0.3722, "num_input_tokens_seen": 21321376, "step": 20680 }, { "epoch": 13.836120401337793, "grad_norm": 1.9878876209259033, "learning_rate": 2.62537130211725e-06, "loss": 0.3312, "num_input_tokens_seen": 21326720, "step": 20685 }, { "epoch": 13.839464882943144, "grad_norm": 2.195809841156006, "learning_rate": 2.622803253802917e-06, "loss": 0.3906, "num_input_tokens_seen": 21331200, "step": 20690 }, { "epoch": 13.842809364548495, "grad_norm": 3.4534308910369873, "learning_rate": 2.6202360154745106e-06, "loss": 0.4231, "num_input_tokens_seen": 21336352, "step": 20695 }, { "epoch": 13.846153846153847, "grad_norm": 2.5370559692382812, "learning_rate": 2.617669588006772e-06, "loss": 0.4087, "num_input_tokens_seen": 21341472, "step": 20700 }, { "epoch": 13.849498327759198, "grad_norm": 2.864912509918213, "learning_rate": 2.6151039722741606e-06, "loss": 0.3921, "num_input_tokens_seen": 21346560, "step": 20705 }, { "epoch": 13.852842809364548, "grad_norm": 2.5510072708129883, "learning_rate": 2.612539169150866e-06, "loss": 0.4688, "num_input_tokens_seen": 21352544, "step": 20710 }, { "epoch": 13.856187290969899, "grad_norm": 2.5612316131591797, "learning_rate": 2.6099751795107925e-06, "loss": 0.4046, "num_input_tokens_seen": 21357504, "step": 20715 }, { "epoch": 13.85953177257525, "grad_norm": 3.394857883453369, "learning_rate": 2.6074120042275793e-06, "loss": 0.3346, "num_input_tokens_seen": 21362048, "step": 20720 }, { "epoch": 13.862876254180602, "grad_norm": 2.3052048683166504, "learning_rate": 2.6048496441745753e-06, "loss": 0.4366, "num_input_tokens_seen": 21366752, "step": 20725 }, { "epoch": 13.866220735785953, "grad_norm": 2.807206392288208, "learning_rate": 2.602288100224859e-06, "loss": 0.3344, "num_input_tokens_seen": 21371904, "step": 20730 }, { "epoch": 13.869565217391305, "grad_norm": 2.727569818496704, "learning_rate": 2.5997273732512308e-06, "loss": 0.3974, "num_input_tokens_seen": 21376448, "step": 20735 }, { "epoch": 13.872909698996656, "grad_norm": 2.8546366691589355, "learning_rate": 2.597167464126209e-06, "loss": 0.464, "num_input_tokens_seen": 21381472, "step": 20740 }, { "epoch": 13.876254180602007, "grad_norm": 3.0246269702911377, "learning_rate": 2.594608373722039e-06, "loss": 0.3515, "num_input_tokens_seen": 21387136, "step": 20745 }, { "epoch": 13.879598662207357, "grad_norm": 3.4010426998138428, "learning_rate": 2.592050102910679e-06, "loss": 0.4063, "num_input_tokens_seen": 21392096, "step": 20750 }, { "epoch": 13.882943143812708, "grad_norm": 2.5608320236206055, "learning_rate": 2.5894926525638164e-06, "loss": 0.3799, "num_input_tokens_seen": 21397088, "step": 20755 }, { "epoch": 13.88628762541806, "grad_norm": 2.1354570388793945, "learning_rate": 2.586936023552851e-06, "loss": 0.3443, "num_input_tokens_seen": 21403072, "step": 20760 }, { "epoch": 13.889632107023411, "grad_norm": 2.5311660766601562, "learning_rate": 2.584380216748912e-06, "loss": 0.3897, "num_input_tokens_seen": 21408320, "step": 20765 }, { "epoch": 13.892976588628763, "grad_norm": 2.9334826469421387, "learning_rate": 2.581825233022839e-06, "loss": 0.3216, "num_input_tokens_seen": 21413376, "step": 20770 }, { "epoch": 13.896321070234114, "grad_norm": 3.1800272464752197, "learning_rate": 2.5792710732452e-06, "loss": 0.4117, "num_input_tokens_seen": 21418656, "step": 20775 }, { "epoch": 13.899665551839465, "grad_norm": 1.948033332824707, "learning_rate": 2.576717738286271e-06, "loss": 0.3091, "num_input_tokens_seen": 21423328, "step": 20780 }, { "epoch": 13.903010033444817, "grad_norm": 3.1873786449432373, "learning_rate": 2.5741652290160613e-06, "loss": 0.3607, "num_input_tokens_seen": 21428608, "step": 20785 }, { "epoch": 13.906354515050166, "grad_norm": 1.8308944702148438, "learning_rate": 2.5716135463042867e-06, "loss": 0.3672, "num_input_tokens_seen": 21433600, "step": 20790 }, { "epoch": 13.909698996655518, "grad_norm": 2.216583013534546, "learning_rate": 2.5690626910203876e-06, "loss": 0.4411, "num_input_tokens_seen": 21439264, "step": 20795 }, { "epoch": 13.91304347826087, "grad_norm": 1.7332139015197754, "learning_rate": 2.5665126640335227e-06, "loss": 0.4309, "num_input_tokens_seen": 21444960, "step": 20800 }, { "epoch": 13.91638795986622, "grad_norm": 3.4916999340057373, "learning_rate": 2.56396346621256e-06, "loss": 0.397, "num_input_tokens_seen": 21449536, "step": 20805 }, { "epoch": 13.919732441471572, "grad_norm": 3.0333616733551025, "learning_rate": 2.561415098426101e-06, "loss": 0.4847, "num_input_tokens_seen": 21455104, "step": 20810 }, { "epoch": 13.923076923076923, "grad_norm": 2.537282943725586, "learning_rate": 2.5588675615424472e-06, "loss": 0.3788, "num_input_tokens_seen": 21460224, "step": 20815 }, { "epoch": 13.926421404682275, "grad_norm": 2.438859701156616, "learning_rate": 2.5563208564296306e-06, "loss": 0.3083, "num_input_tokens_seen": 21464928, "step": 20820 }, { "epoch": 13.929765886287626, "grad_norm": 4.1559295654296875, "learning_rate": 2.553774983955387e-06, "loss": 0.4467, "num_input_tokens_seen": 21469568, "step": 20825 }, { "epoch": 13.933110367892976, "grad_norm": 1.7677022218704224, "learning_rate": 2.5512299449871835e-06, "loss": 0.4002, "num_input_tokens_seen": 21475104, "step": 20830 }, { "epoch": 13.936454849498327, "grad_norm": 3.9947268962860107, "learning_rate": 2.5486857403921896e-06, "loss": 0.3973, "num_input_tokens_seen": 21480416, "step": 20835 }, { "epoch": 13.939799331103679, "grad_norm": 2.714169979095459, "learning_rate": 2.5461423710372967e-06, "loss": 0.3321, "num_input_tokens_seen": 21484800, "step": 20840 }, { "epoch": 13.94314381270903, "grad_norm": 1.7516371011734009, "learning_rate": 2.5435998377891148e-06, "loss": 0.355, "num_input_tokens_seen": 21490688, "step": 20845 }, { "epoch": 13.946488294314381, "grad_norm": 1.9397162199020386, "learning_rate": 2.541058141513959e-06, "loss": 0.3801, "num_input_tokens_seen": 21495808, "step": 20850 }, { "epoch": 13.949832775919733, "grad_norm": 2.9363811016082764, "learning_rate": 2.538517283077872e-06, "loss": 0.3494, "num_input_tokens_seen": 21500544, "step": 20855 }, { "epoch": 13.953177257525084, "grad_norm": 2.466747283935547, "learning_rate": 2.5359772633465997e-06, "loss": 0.397, "num_input_tokens_seen": 21507008, "step": 20860 }, { "epoch": 13.956521739130435, "grad_norm": 3.1239888668060303, "learning_rate": 2.53343808318561e-06, "loss": 0.3463, "num_input_tokens_seen": 21513504, "step": 20865 }, { "epoch": 13.959866220735787, "grad_norm": 1.8904762268066406, "learning_rate": 2.5308997434600768e-06, "loss": 0.3963, "num_input_tokens_seen": 21517856, "step": 20870 }, { "epoch": 13.963210702341136, "grad_norm": 3.7763514518737793, "learning_rate": 2.5283622450348994e-06, "loss": 0.3437, "num_input_tokens_seen": 21523136, "step": 20875 }, { "epoch": 13.966555183946488, "grad_norm": 1.8824881315231323, "learning_rate": 2.5258255887746786e-06, "loss": 0.3313, "num_input_tokens_seen": 21527904, "step": 20880 }, { "epoch": 13.96989966555184, "grad_norm": 5.182324409484863, "learning_rate": 2.5232897755437345e-06, "loss": 0.3998, "num_input_tokens_seen": 21533184, "step": 20885 }, { "epoch": 13.97324414715719, "grad_norm": 2.926920175552368, "learning_rate": 2.5207548062060986e-06, "loss": 0.3678, "num_input_tokens_seen": 21538528, "step": 20890 }, { "epoch": 13.976588628762542, "grad_norm": 2.331726551055908, "learning_rate": 2.5182206816255155e-06, "loss": 0.3765, "num_input_tokens_seen": 21543616, "step": 20895 }, { "epoch": 13.979933110367893, "grad_norm": 2.75775408744812, "learning_rate": 2.5156874026654432e-06, "loss": 0.5666, "num_input_tokens_seen": 21549184, "step": 20900 }, { "epoch": 13.983277591973245, "grad_norm": 3.7082252502441406, "learning_rate": 2.5131549701890457e-06, "loss": 0.4444, "num_input_tokens_seen": 21553984, "step": 20905 }, { "epoch": 13.986622073578594, "grad_norm": 2.7772316932678223, "learning_rate": 2.510623385059206e-06, "loss": 0.3225, "num_input_tokens_seen": 21559168, "step": 20910 }, { "epoch": 13.989966555183946, "grad_norm": 1.756565809249878, "learning_rate": 2.5080926481385104e-06, "loss": 0.3716, "num_input_tokens_seen": 21564608, "step": 20915 }, { "epoch": 13.993311036789297, "grad_norm": 4.097029685974121, "learning_rate": 2.5055627602892675e-06, "loss": 0.341, "num_input_tokens_seen": 21569376, "step": 20920 }, { "epoch": 13.996655518394649, "grad_norm": 1.8366941213607788, "learning_rate": 2.5030337223734845e-06, "loss": 0.3314, "num_input_tokens_seen": 21575104, "step": 20925 }, { "epoch": 14.0, "grad_norm": 3.5983245372772217, "learning_rate": 2.5005055352528862e-06, "loss": 0.3093, "num_input_tokens_seen": 21579648, "step": 20930 }, { "epoch": 14.0, "eval_loss": 0.5209661722183228, "eval_runtime": 37.5832, "eval_samples_per_second": 39.778, "eval_steps_per_second": 9.951, "num_input_tokens_seen": 21579648, "step": 20930 }, { "epoch": 14.003344481605351, "grad_norm": 2.8994176387786865, "learning_rate": 2.4979781997889064e-06, "loss": 0.3227, "num_input_tokens_seen": 21584992, "step": 20935 }, { "epoch": 14.006688963210703, "grad_norm": 2.4677696228027344, "learning_rate": 2.4954517168426867e-06, "loss": 0.3543, "num_input_tokens_seen": 21590016, "step": 20940 }, { "epoch": 14.010033444816054, "grad_norm": 2.7812387943267822, "learning_rate": 2.4929260872750826e-06, "loss": 0.3966, "num_input_tokens_seen": 21595008, "step": 20945 }, { "epoch": 14.013377926421406, "grad_norm": 2.9451589584350586, "learning_rate": 2.4904013119466518e-06, "loss": 0.3633, "num_input_tokens_seen": 21600608, "step": 20950 }, { "epoch": 14.016722408026755, "grad_norm": 3.425732135772705, "learning_rate": 2.487877391717668e-06, "loss": 0.3792, "num_input_tokens_seen": 21605280, "step": 20955 }, { "epoch": 14.020066889632107, "grad_norm": 2.620386838912964, "learning_rate": 2.485354327448106e-06, "loss": 0.3853, "num_input_tokens_seen": 21610208, "step": 20960 }, { "epoch": 14.023411371237458, "grad_norm": 2.108945369720459, "learning_rate": 2.4828321199976595e-06, "loss": 0.4481, "num_input_tokens_seen": 21615040, "step": 20965 }, { "epoch": 14.02675585284281, "grad_norm": 3.262967824935913, "learning_rate": 2.4803107702257196e-06, "loss": 0.3642, "num_input_tokens_seen": 21619776, "step": 20970 }, { "epoch": 14.03010033444816, "grad_norm": 2.1556899547576904, "learning_rate": 2.477790278991391e-06, "loss": 0.4035, "num_input_tokens_seen": 21625024, "step": 20975 }, { "epoch": 14.033444816053512, "grad_norm": 2.709331512451172, "learning_rate": 2.4752706471534842e-06, "loss": 0.3933, "num_input_tokens_seen": 21629344, "step": 20980 }, { "epoch": 14.036789297658864, "grad_norm": 2.7017245292663574, "learning_rate": 2.4727518755705206e-06, "loss": 0.3451, "num_input_tokens_seen": 21634144, "step": 20985 }, { "epoch": 14.040133779264215, "grad_norm": 2.2766237258911133, "learning_rate": 2.47023396510072e-06, "loss": 0.3534, "num_input_tokens_seen": 21639168, "step": 20990 }, { "epoch": 14.043478260869565, "grad_norm": 2.179649591445923, "learning_rate": 2.4677169166020164e-06, "loss": 0.4107, "num_input_tokens_seen": 21644608, "step": 20995 }, { "epoch": 14.046822742474916, "grad_norm": 3.1840152740478516, "learning_rate": 2.4652007309320497e-06, "loss": 0.4016, "num_input_tokens_seen": 21649632, "step": 21000 }, { "epoch": 14.050167224080267, "grad_norm": 3.1332709789276123, "learning_rate": 2.462685408948158e-06, "loss": 0.4181, "num_input_tokens_seen": 21655424, "step": 21005 }, { "epoch": 14.053511705685619, "grad_norm": 2.6959750652313232, "learning_rate": 2.4601709515073972e-06, "loss": 0.3651, "num_input_tokens_seen": 21660544, "step": 21010 }, { "epoch": 14.05685618729097, "grad_norm": 3.444103240966797, "learning_rate": 2.457657359466518e-06, "loss": 0.3774, "num_input_tokens_seen": 21665408, "step": 21015 }, { "epoch": 14.060200668896321, "grad_norm": 1.7345198392868042, "learning_rate": 2.455144633681984e-06, "loss": 0.3475, "num_input_tokens_seen": 21670880, "step": 21020 }, { "epoch": 14.063545150501673, "grad_norm": 2.7047502994537354, "learning_rate": 2.4526327750099544e-06, "loss": 0.3762, "num_input_tokens_seen": 21675904, "step": 21025 }, { "epoch": 14.066889632107024, "grad_norm": 2.705610752105713, "learning_rate": 2.4501217843063065e-06, "loss": 0.4015, "num_input_tokens_seen": 21681120, "step": 21030 }, { "epoch": 14.070234113712374, "grad_norm": 2.1698341369628906, "learning_rate": 2.4476116624266075e-06, "loss": 0.2973, "num_input_tokens_seen": 21686464, "step": 21035 }, { "epoch": 14.073578595317725, "grad_norm": 3.678636312484741, "learning_rate": 2.4451024102261385e-06, "loss": 0.3955, "num_input_tokens_seen": 21692736, "step": 21040 }, { "epoch": 14.076923076923077, "grad_norm": 2.996055841445923, "learning_rate": 2.442594028559881e-06, "loss": 0.3776, "num_input_tokens_seen": 21697888, "step": 21045 }, { "epoch": 14.080267558528428, "grad_norm": 1.9891077280044556, "learning_rate": 2.440086518282515e-06, "loss": 0.4214, "num_input_tokens_seen": 21703776, "step": 21050 }, { "epoch": 14.08361204013378, "grad_norm": 2.3847429752349854, "learning_rate": 2.4375798802484365e-06, "loss": 0.2942, "num_input_tokens_seen": 21709408, "step": 21055 }, { "epoch": 14.08695652173913, "grad_norm": 2.9671566486358643, "learning_rate": 2.4350741153117297e-06, "loss": 0.4056, "num_input_tokens_seen": 21713952, "step": 21060 }, { "epoch": 14.090301003344482, "grad_norm": 2.387356758117676, "learning_rate": 2.432569224326191e-06, "loss": 0.3938, "num_input_tokens_seen": 21719872, "step": 21065 }, { "epoch": 14.093645484949834, "grad_norm": 3.0932130813598633, "learning_rate": 2.4300652081453117e-06, "loss": 0.3499, "num_input_tokens_seen": 21724832, "step": 21070 }, { "epoch": 14.096989966555183, "grad_norm": 3.4465630054473877, "learning_rate": 2.4275620676222956e-06, "loss": 0.4241, "num_input_tokens_seen": 21730080, "step": 21075 }, { "epoch": 14.100334448160535, "grad_norm": 2.574636697769165, "learning_rate": 2.425059803610036e-06, "loss": 0.3167, "num_input_tokens_seen": 21735136, "step": 21080 }, { "epoch": 14.103678929765886, "grad_norm": 2.5919511318206787, "learning_rate": 2.4225584169611353e-06, "loss": 0.3536, "num_input_tokens_seen": 21742016, "step": 21085 }, { "epoch": 14.107023411371237, "grad_norm": 2.5926268100738525, "learning_rate": 2.420057908527897e-06, "loss": 0.3773, "num_input_tokens_seen": 21747040, "step": 21090 }, { "epoch": 14.110367892976589, "grad_norm": 2.242666482925415, "learning_rate": 2.417558279162317e-06, "loss": 0.3899, "num_input_tokens_seen": 21752256, "step": 21095 }, { "epoch": 14.11371237458194, "grad_norm": 2.3725481033325195, "learning_rate": 2.4150595297161054e-06, "loss": 0.3671, "num_input_tokens_seen": 21757440, "step": 21100 }, { "epoch": 14.117056856187292, "grad_norm": 2.5251877307891846, "learning_rate": 2.41256166104066e-06, "loss": 0.3514, "num_input_tokens_seen": 21762464, "step": 21105 }, { "epoch": 14.120401337792643, "grad_norm": 1.9918144941329956, "learning_rate": 2.4100646739870865e-06, "loss": 0.3545, "num_input_tokens_seen": 21767648, "step": 21110 }, { "epoch": 14.123745819397993, "grad_norm": 2.040797710418701, "learning_rate": 2.407568569406182e-06, "loss": 0.3675, "num_input_tokens_seen": 21773632, "step": 21115 }, { "epoch": 14.127090301003344, "grad_norm": 3.21767258644104, "learning_rate": 2.4050733481484556e-06, "loss": 0.3953, "num_input_tokens_seen": 21778336, "step": 21120 }, { "epoch": 14.130434782608695, "grad_norm": 2.5230746269226074, "learning_rate": 2.4025790110641016e-06, "loss": 0.4088, "num_input_tokens_seen": 21783488, "step": 21125 }, { "epoch": 14.133779264214047, "grad_norm": 1.8623287677764893, "learning_rate": 2.400085559003022e-06, "loss": 0.324, "num_input_tokens_seen": 21788896, "step": 21130 }, { "epoch": 14.137123745819398, "grad_norm": 2.956456422805786, "learning_rate": 2.397592992814816e-06, "loss": 0.4061, "num_input_tokens_seen": 21793888, "step": 21135 }, { "epoch": 14.14046822742475, "grad_norm": 1.9853652715682983, "learning_rate": 2.395101313348775e-06, "loss": 0.3253, "num_input_tokens_seen": 21798976, "step": 21140 }, { "epoch": 14.143812709030101, "grad_norm": 3.6338911056518555, "learning_rate": 2.392610521453898e-06, "loss": 0.4379, "num_input_tokens_seen": 21803776, "step": 21145 }, { "epoch": 14.147157190635452, "grad_norm": 2.210015296936035, "learning_rate": 2.390120617978873e-06, "loss": 0.3721, "num_input_tokens_seen": 21809056, "step": 21150 }, { "epoch": 14.150501672240802, "grad_norm": 2.5459344387054443, "learning_rate": 2.3876316037720915e-06, "loss": 0.3228, "num_input_tokens_seen": 21814752, "step": 21155 }, { "epoch": 14.153846153846153, "grad_norm": 2.2417447566986084, "learning_rate": 2.385143479681633e-06, "loss": 0.2968, "num_input_tokens_seen": 21820416, "step": 21160 }, { "epoch": 14.157190635451505, "grad_norm": 3.8925058841705322, "learning_rate": 2.382656246555289e-06, "loss": 0.4044, "num_input_tokens_seen": 21825344, "step": 21165 }, { "epoch": 14.160535117056856, "grad_norm": 2.4920008182525635, "learning_rate": 2.3801699052405303e-06, "loss": 0.3628, "num_input_tokens_seen": 21830656, "step": 21170 }, { "epoch": 14.163879598662207, "grad_norm": 2.571228504180908, "learning_rate": 2.377684456584536e-06, "loss": 0.2999, "num_input_tokens_seen": 21836192, "step": 21175 }, { "epoch": 14.167224080267559, "grad_norm": 2.1774814128875732, "learning_rate": 2.3751999014341778e-06, "loss": 0.2875, "num_input_tokens_seen": 21841728, "step": 21180 }, { "epoch": 14.17056856187291, "grad_norm": 3.0835187435150146, "learning_rate": 2.3727162406360157e-06, "loss": 0.4206, "num_input_tokens_seen": 21846240, "step": 21185 }, { "epoch": 14.173913043478262, "grad_norm": 3.7064461708068848, "learning_rate": 2.3702334750363193e-06, "loss": 0.3771, "num_input_tokens_seen": 21850912, "step": 21190 }, { "epoch": 14.177257525083611, "grad_norm": 2.1009864807128906, "learning_rate": 2.3677516054810403e-06, "loss": 0.3992, "num_input_tokens_seen": 21856320, "step": 21195 }, { "epoch": 14.180602006688963, "grad_norm": 1.9309539794921875, "learning_rate": 2.365270632815833e-06, "loss": 0.4224, "num_input_tokens_seen": 21861888, "step": 21200 }, { "epoch": 14.183946488294314, "grad_norm": 2.2220582962036133, "learning_rate": 2.362790557886037e-06, "loss": 0.3814, "num_input_tokens_seen": 21866688, "step": 21205 }, { "epoch": 14.187290969899665, "grad_norm": 2.049846887588501, "learning_rate": 2.3603113815367006e-06, "loss": 0.3729, "num_input_tokens_seen": 21871840, "step": 21210 }, { "epoch": 14.190635451505017, "grad_norm": 3.488056182861328, "learning_rate": 2.3578331046125513e-06, "loss": 0.3899, "num_input_tokens_seen": 21876416, "step": 21215 }, { "epoch": 14.193979933110368, "grad_norm": 3.638653039932251, "learning_rate": 2.35535572795802e-06, "loss": 0.4161, "num_input_tokens_seen": 21881312, "step": 21220 }, { "epoch": 14.19732441471572, "grad_norm": 2.7357959747314453, "learning_rate": 2.3528792524172235e-06, "loss": 0.3528, "num_input_tokens_seen": 21886720, "step": 21225 }, { "epoch": 14.200668896321071, "grad_norm": 3.058527946472168, "learning_rate": 2.3504036788339763e-06, "loss": 0.3344, "num_input_tokens_seen": 21891680, "step": 21230 }, { "epoch": 14.20401337792642, "grad_norm": 3.4054205417633057, "learning_rate": 2.347929008051785e-06, "loss": 0.4412, "num_input_tokens_seen": 21896928, "step": 21235 }, { "epoch": 14.207357859531772, "grad_norm": 3.1789462566375732, "learning_rate": 2.3454552409138483e-06, "loss": 0.3981, "num_input_tokens_seen": 21902368, "step": 21240 }, { "epoch": 14.210702341137123, "grad_norm": 2.9702131748199463, "learning_rate": 2.342982378263057e-06, "loss": 0.4023, "num_input_tokens_seen": 21907232, "step": 21245 }, { "epoch": 14.214046822742475, "grad_norm": 3.7735276222229004, "learning_rate": 2.3405104209419904e-06, "loss": 0.3138, "num_input_tokens_seen": 21913088, "step": 21250 }, { "epoch": 14.217391304347826, "grad_norm": 2.972484827041626, "learning_rate": 2.3380393697929277e-06, "loss": 0.3453, "num_input_tokens_seen": 21918240, "step": 21255 }, { "epoch": 14.220735785953178, "grad_norm": 2.4384913444519043, "learning_rate": 2.3355692256578293e-06, "loss": 0.371, "num_input_tokens_seen": 21923168, "step": 21260 }, { "epoch": 14.224080267558529, "grad_norm": 2.915663242340088, "learning_rate": 2.3330999893783552e-06, "loss": 0.3817, "num_input_tokens_seen": 21928672, "step": 21265 }, { "epoch": 14.22742474916388, "grad_norm": 2.525409698486328, "learning_rate": 2.3306316617958473e-06, "loss": 0.3683, "num_input_tokens_seen": 21933152, "step": 21270 }, { "epoch": 14.23076923076923, "grad_norm": 2.5183961391448975, "learning_rate": 2.3281642437513457e-06, "loss": 0.416, "num_input_tokens_seen": 21937824, "step": 21275 }, { "epoch": 14.234113712374581, "grad_norm": 2.9798264503479004, "learning_rate": 2.3256977360855764e-06, "loss": 0.3403, "num_input_tokens_seen": 21942816, "step": 21280 }, { "epoch": 14.237458193979933, "grad_norm": 2.4335715770721436, "learning_rate": 2.323232139638958e-06, "loss": 0.3375, "num_input_tokens_seen": 21948256, "step": 21285 }, { "epoch": 14.240802675585284, "grad_norm": 1.9779711961746216, "learning_rate": 2.320767455251597e-06, "loss": 0.4304, "num_input_tokens_seen": 21954848, "step": 21290 }, { "epoch": 14.244147157190636, "grad_norm": 2.9777920246124268, "learning_rate": 2.318303683763285e-06, "loss": 0.3374, "num_input_tokens_seen": 21959776, "step": 21295 }, { "epoch": 14.247491638795987, "grad_norm": 3.274977207183838, "learning_rate": 2.3158408260135127e-06, "loss": 0.3977, "num_input_tokens_seen": 21964256, "step": 21300 }, { "epoch": 14.250836120401338, "grad_norm": 2.3950648307800293, "learning_rate": 2.31337888284145e-06, "loss": 0.3821, "num_input_tokens_seen": 21969216, "step": 21305 }, { "epoch": 14.25418060200669, "grad_norm": 2.708754301071167, "learning_rate": 2.31091785508596e-06, "loss": 0.284, "num_input_tokens_seen": 21974176, "step": 21310 }, { "epoch": 14.25752508361204, "grad_norm": 2.550452470779419, "learning_rate": 2.3084577435855904e-06, "loss": 0.3673, "num_input_tokens_seen": 21979936, "step": 21315 }, { "epoch": 14.26086956521739, "grad_norm": 2.093228816986084, "learning_rate": 2.3059985491785797e-06, "loss": 0.3502, "num_input_tokens_seen": 21985888, "step": 21320 }, { "epoch": 14.264214046822742, "grad_norm": 4.048472881317139, "learning_rate": 2.303540272702853e-06, "loss": 0.4215, "num_input_tokens_seen": 21989888, "step": 21325 }, { "epoch": 14.267558528428093, "grad_norm": 2.1512515544891357, "learning_rate": 2.301082914996024e-06, "loss": 0.3386, "num_input_tokens_seen": 21994528, "step": 21330 }, { "epoch": 14.270903010033445, "grad_norm": 2.271233320236206, "learning_rate": 2.2986264768953924e-06, "loss": 0.3781, "num_input_tokens_seen": 21999520, "step": 21335 }, { "epoch": 14.274247491638796, "grad_norm": 3.1027402877807617, "learning_rate": 2.2961709592379387e-06, "loss": 0.359, "num_input_tokens_seen": 22004448, "step": 21340 }, { "epoch": 14.277591973244148, "grad_norm": 2.640098810195923, "learning_rate": 2.2937163628603437e-06, "loss": 0.3743, "num_input_tokens_seen": 22009952, "step": 21345 }, { "epoch": 14.280936454849499, "grad_norm": 2.785472869873047, "learning_rate": 2.291262688598959e-06, "loss": 0.3907, "num_input_tokens_seen": 22015616, "step": 21350 }, { "epoch": 14.284280936454849, "grad_norm": 2.591214895248413, "learning_rate": 2.288809937289834e-06, "loss": 0.3797, "num_input_tokens_seen": 22021280, "step": 21355 }, { "epoch": 14.2876254180602, "grad_norm": 3.142350912094116, "learning_rate": 2.286358109768693e-06, "loss": 0.3572, "num_input_tokens_seen": 22026464, "step": 21360 }, { "epoch": 14.290969899665551, "grad_norm": 2.3348233699798584, "learning_rate": 2.283907206870954e-06, "loss": 0.3141, "num_input_tokens_seen": 22031456, "step": 21365 }, { "epoch": 14.294314381270903, "grad_norm": 3.1997132301330566, "learning_rate": 2.281457229431717e-06, "loss": 0.3581, "num_input_tokens_seen": 22036640, "step": 21370 }, { "epoch": 14.297658862876254, "grad_norm": 2.1550445556640625, "learning_rate": 2.2790081782857655e-06, "loss": 0.4396, "num_input_tokens_seen": 22042272, "step": 21375 }, { "epoch": 14.301003344481606, "grad_norm": 3.1162960529327393, "learning_rate": 2.2765600542675714e-06, "loss": 0.3882, "num_input_tokens_seen": 22047232, "step": 21380 }, { "epoch": 14.304347826086957, "grad_norm": 2.1220719814300537, "learning_rate": 2.274112858211282e-06, "loss": 0.2952, "num_input_tokens_seen": 22051840, "step": 21385 }, { "epoch": 14.307692307692308, "grad_norm": 2.281576156616211, "learning_rate": 2.2716665909507415e-06, "loss": 0.4034, "num_input_tokens_seen": 22057632, "step": 21390 }, { "epoch": 14.31103678929766, "grad_norm": 2.4449985027313232, "learning_rate": 2.2692212533194645e-06, "loss": 0.3587, "num_input_tokens_seen": 22062752, "step": 21395 }, { "epoch": 14.31438127090301, "grad_norm": 2.5771467685699463, "learning_rate": 2.266776846150659e-06, "loss": 0.2767, "num_input_tokens_seen": 22067104, "step": 21400 }, { "epoch": 14.31772575250836, "grad_norm": 2.114297389984131, "learning_rate": 2.2643333702772057e-06, "loss": 0.4062, "num_input_tokens_seen": 22072576, "step": 21405 }, { "epoch": 14.321070234113712, "grad_norm": 2.5139071941375732, "learning_rate": 2.261890826531681e-06, "loss": 0.3222, "num_input_tokens_seen": 22077024, "step": 21410 }, { "epoch": 14.324414715719064, "grad_norm": 3.155858278274536, "learning_rate": 2.259449215746332e-06, "loss": 0.3628, "num_input_tokens_seen": 22081216, "step": 21415 }, { "epoch": 14.327759197324415, "grad_norm": 2.2619380950927734, "learning_rate": 2.2570085387530953e-06, "loss": 0.3986, "num_input_tokens_seen": 22085792, "step": 21420 }, { "epoch": 14.331103678929766, "grad_norm": 1.898226261138916, "learning_rate": 2.254568796383584e-06, "loss": 0.4352, "num_input_tokens_seen": 22091424, "step": 21425 }, { "epoch": 14.334448160535118, "grad_norm": 1.991965413093567, "learning_rate": 2.2521299894690956e-06, "loss": 0.4456, "num_input_tokens_seen": 22096544, "step": 21430 }, { "epoch": 14.337792642140467, "grad_norm": 2.8581035137176514, "learning_rate": 2.2496921188406097e-06, "loss": 0.46, "num_input_tokens_seen": 22102368, "step": 21435 }, { "epoch": 14.341137123745819, "grad_norm": 2.3177545070648193, "learning_rate": 2.2472551853287848e-06, "loss": 0.2874, "num_input_tokens_seen": 22108064, "step": 21440 }, { "epoch": 14.34448160535117, "grad_norm": 3.4431097507476807, "learning_rate": 2.2448191897639637e-06, "loss": 0.3618, "num_input_tokens_seen": 22112960, "step": 21445 }, { "epoch": 14.347826086956522, "grad_norm": 2.646815776824951, "learning_rate": 2.2423841329761614e-06, "loss": 0.3097, "num_input_tokens_seen": 22117376, "step": 21450 }, { "epoch": 14.351170568561873, "grad_norm": 2.3748931884765625, "learning_rate": 2.2399500157950855e-06, "loss": 0.3249, "num_input_tokens_seen": 22122624, "step": 21455 }, { "epoch": 14.354515050167224, "grad_norm": 2.689420223236084, "learning_rate": 2.2375168390501113e-06, "loss": 0.3423, "num_input_tokens_seen": 22127360, "step": 21460 }, { "epoch": 14.357859531772576, "grad_norm": 2.469095230102539, "learning_rate": 2.2350846035703015e-06, "loss": 0.4033, "num_input_tokens_seen": 22133056, "step": 21465 }, { "epoch": 14.361204013377927, "grad_norm": 2.003603458404541, "learning_rate": 2.232653310184394e-06, "loss": 0.356, "num_input_tokens_seen": 22138336, "step": 21470 }, { "epoch": 14.364548494983278, "grad_norm": 2.686004638671875, "learning_rate": 2.230222959720807e-06, "loss": 0.375, "num_input_tokens_seen": 22143840, "step": 21475 }, { "epoch": 14.367892976588628, "grad_norm": 3.5698118209838867, "learning_rate": 2.2277935530076384e-06, "loss": 0.3716, "num_input_tokens_seen": 22148576, "step": 21480 }, { "epoch": 14.37123745819398, "grad_norm": 2.5476112365722656, "learning_rate": 2.225365090872664e-06, "loss": 0.4549, "num_input_tokens_seen": 22153120, "step": 21485 }, { "epoch": 14.37458193979933, "grad_norm": 2.9272072315216064, "learning_rate": 2.222937574143338e-06, "loss": 0.436, "num_input_tokens_seen": 22157440, "step": 21490 }, { "epoch": 14.377926421404682, "grad_norm": 3.6755402088165283, "learning_rate": 2.2205110036467875e-06, "loss": 0.3864, "num_input_tokens_seen": 22162656, "step": 21495 }, { "epoch": 14.381270903010034, "grad_norm": 3.766490936279297, "learning_rate": 2.218085380209829e-06, "loss": 0.3715, "num_input_tokens_seen": 22167456, "step": 21500 }, { "epoch": 14.384615384615385, "grad_norm": 2.8625576496124268, "learning_rate": 2.2156607046589433e-06, "loss": 0.3781, "num_input_tokens_seen": 22172544, "step": 21505 }, { "epoch": 14.387959866220736, "grad_norm": 3.3048744201660156, "learning_rate": 2.2132369778202972e-06, "loss": 0.4657, "num_input_tokens_seen": 22177056, "step": 21510 }, { "epoch": 14.391304347826088, "grad_norm": 2.821667194366455, "learning_rate": 2.2108142005197275e-06, "loss": 0.3791, "num_input_tokens_seen": 22182304, "step": 21515 }, { "epoch": 14.394648829431437, "grad_norm": 2.322190046310425, "learning_rate": 2.208392373582753e-06, "loss": 0.362, "num_input_tokens_seen": 22187744, "step": 21520 }, { "epoch": 14.397993311036789, "grad_norm": 1.8668708801269531, "learning_rate": 2.205971497834566e-06, "loss": 0.3416, "num_input_tokens_seen": 22193344, "step": 21525 }, { "epoch": 14.40133779264214, "grad_norm": 2.5366475582122803, "learning_rate": 2.2035515741000353e-06, "loss": 0.4084, "num_input_tokens_seen": 22198656, "step": 21530 }, { "epoch": 14.404682274247492, "grad_norm": 3.1490585803985596, "learning_rate": 2.2011326032037065e-06, "loss": 0.3825, "num_input_tokens_seen": 22203328, "step": 21535 }, { "epoch": 14.408026755852843, "grad_norm": 3.7169339656829834, "learning_rate": 2.1987145859697956e-06, "loss": 0.425, "num_input_tokens_seen": 22208384, "step": 21540 }, { "epoch": 14.411371237458194, "grad_norm": 2.2442924976348877, "learning_rate": 2.1962975232222035e-06, "loss": 0.3687, "num_input_tokens_seen": 22213536, "step": 21545 }, { "epoch": 14.414715719063546, "grad_norm": 2.795647621154785, "learning_rate": 2.193881415784494e-06, "loss": 0.3567, "num_input_tokens_seen": 22218112, "step": 21550 }, { "epoch": 14.418060200668897, "grad_norm": 4.644344329833984, "learning_rate": 2.191466264479915e-06, "loss": 0.3915, "num_input_tokens_seen": 22223488, "step": 21555 }, { "epoch": 14.421404682274247, "grad_norm": 3.6385910511016846, "learning_rate": 2.189052070131381e-06, "loss": 0.3188, "num_input_tokens_seen": 22228256, "step": 21560 }, { "epoch": 14.424749163879598, "grad_norm": 3.2297451496124268, "learning_rate": 2.186638833561487e-06, "loss": 0.3269, "num_input_tokens_seen": 22233696, "step": 21565 }, { "epoch": 14.42809364548495, "grad_norm": 2.2410082817077637, "learning_rate": 2.1842265555924975e-06, "loss": 0.4217, "num_input_tokens_seen": 22239776, "step": 21570 }, { "epoch": 14.431438127090301, "grad_norm": 3.1528701782226562, "learning_rate": 2.1818152370463523e-06, "loss": 0.335, "num_input_tokens_seen": 22244288, "step": 21575 }, { "epoch": 14.434782608695652, "grad_norm": 2.8052568435668945, "learning_rate": 2.179404878744665e-06, "loss": 0.4276, "num_input_tokens_seen": 22249472, "step": 21580 }, { "epoch": 14.438127090301004, "grad_norm": 3.7147488594055176, "learning_rate": 2.176995481508717e-06, "loss": 0.4348, "num_input_tokens_seen": 22254336, "step": 21585 }, { "epoch": 14.441471571906355, "grad_norm": 4.047427654266357, "learning_rate": 2.174587046159471e-06, "loss": 0.3631, "num_input_tokens_seen": 22260224, "step": 21590 }, { "epoch": 14.444816053511706, "grad_norm": 3.0326178073883057, "learning_rate": 2.172179573517552e-06, "loss": 0.4107, "num_input_tokens_seen": 22265120, "step": 21595 }, { "epoch": 14.448160535117056, "grad_norm": 3.1159744262695312, "learning_rate": 2.1697730644032665e-06, "loss": 0.4205, "num_input_tokens_seen": 22269920, "step": 21600 }, { "epoch": 14.451505016722408, "grad_norm": 3.7558488845825195, "learning_rate": 2.167367519636584e-06, "loss": 0.4871, "num_input_tokens_seen": 22275264, "step": 21605 }, { "epoch": 14.454849498327759, "grad_norm": 2.670222043991089, "learning_rate": 2.1649629400371516e-06, "loss": 0.3873, "num_input_tokens_seen": 22280384, "step": 21610 }, { "epoch": 14.45819397993311, "grad_norm": 3.372131109237671, "learning_rate": 2.1625593264242838e-06, "loss": 0.4, "num_input_tokens_seen": 22285536, "step": 21615 }, { "epoch": 14.461538461538462, "grad_norm": 2.7282559871673584, "learning_rate": 2.1601566796169697e-06, "loss": 0.3151, "num_input_tokens_seen": 22290720, "step": 21620 }, { "epoch": 14.464882943143813, "grad_norm": 3.540212869644165, "learning_rate": 2.157755000433867e-06, "loss": 0.4573, "num_input_tokens_seen": 22296032, "step": 21625 }, { "epoch": 14.468227424749164, "grad_norm": 2.498199701309204, "learning_rate": 2.155354289693302e-06, "loss": 0.352, "num_input_tokens_seen": 22302016, "step": 21630 }, { "epoch": 14.471571906354516, "grad_norm": 1.9566571712493896, "learning_rate": 2.1529545482132717e-06, "loss": 0.3103, "num_input_tokens_seen": 22307424, "step": 21635 }, { "epoch": 14.474916387959865, "grad_norm": 2.4760799407958984, "learning_rate": 2.1505557768114456e-06, "loss": 0.3233, "num_input_tokens_seen": 22311872, "step": 21640 }, { "epoch": 14.478260869565217, "grad_norm": 3.1026246547698975, "learning_rate": 2.1481579763051623e-06, "loss": 0.4345, "num_input_tokens_seen": 22316736, "step": 21645 }, { "epoch": 14.481605351170568, "grad_norm": 2.180898904800415, "learning_rate": 2.145761147511424e-06, "loss": 0.3404, "num_input_tokens_seen": 22321632, "step": 21650 }, { "epoch": 14.48494983277592, "grad_norm": 3.025258779525757, "learning_rate": 2.1433652912469085e-06, "loss": 0.3652, "num_input_tokens_seen": 22327136, "step": 21655 }, { "epoch": 14.488294314381271, "grad_norm": 2.205596923828125, "learning_rate": 2.1409704083279586e-06, "loss": 0.3476, "num_input_tokens_seen": 22332256, "step": 21660 }, { "epoch": 14.491638795986622, "grad_norm": 3.0424530506134033, "learning_rate": 2.1385764995705886e-06, "loss": 0.3782, "num_input_tokens_seen": 22336576, "step": 21665 }, { "epoch": 14.494983277591974, "grad_norm": 2.1846976280212402, "learning_rate": 2.136183565790476e-06, "loss": 0.313, "num_input_tokens_seen": 22341952, "step": 21670 }, { "epoch": 14.498327759197325, "grad_norm": 3.2136919498443604, "learning_rate": 2.13379160780297e-06, "loss": 0.3711, "num_input_tokens_seen": 22346624, "step": 21675 }, { "epoch": 14.501672240802675, "grad_norm": 2.4333064556121826, "learning_rate": 2.1314006264230853e-06, "loss": 0.3484, "num_input_tokens_seen": 22351776, "step": 21680 }, { "epoch": 14.505016722408026, "grad_norm": 2.2342190742492676, "learning_rate": 2.1290106224655057e-06, "loss": 0.3888, "num_input_tokens_seen": 22357056, "step": 21685 }, { "epoch": 14.508361204013378, "grad_norm": 2.3781332969665527, "learning_rate": 2.1266215967445823e-06, "loss": 0.4033, "num_input_tokens_seen": 22361856, "step": 21690 }, { "epoch": 14.511705685618729, "grad_norm": 2.6706271171569824, "learning_rate": 2.1242335500743283e-06, "loss": 0.3617, "num_input_tokens_seen": 22367232, "step": 21695 }, { "epoch": 14.51505016722408, "grad_norm": 3.337675094604492, "learning_rate": 2.121846483268429e-06, "loss": 0.3669, "num_input_tokens_seen": 22372096, "step": 21700 }, { "epoch": 14.518394648829432, "grad_norm": 2.310453176498413, "learning_rate": 2.119460397140231e-06, "loss": 0.38, "num_input_tokens_seen": 22378048, "step": 21705 }, { "epoch": 14.521739130434783, "grad_norm": 2.832688570022583, "learning_rate": 2.1170752925027536e-06, "loss": 0.3773, "num_input_tokens_seen": 22383808, "step": 21710 }, { "epoch": 14.525083612040135, "grad_norm": 2.5630853176116943, "learning_rate": 2.1146911701686724e-06, "loss": 0.3964, "num_input_tokens_seen": 22388928, "step": 21715 }, { "epoch": 14.528428093645484, "grad_norm": 2.388521909713745, "learning_rate": 2.1123080309503348e-06, "loss": 0.3494, "num_input_tokens_seen": 22394880, "step": 21720 }, { "epoch": 14.531772575250836, "grad_norm": 2.3919615745544434, "learning_rate": 2.1099258756597513e-06, "loss": 0.3511, "num_input_tokens_seen": 22400384, "step": 21725 }, { "epoch": 14.535117056856187, "grad_norm": 3.3845977783203125, "learning_rate": 2.1075447051085985e-06, "loss": 0.3552, "num_input_tokens_seen": 22405344, "step": 21730 }, { "epoch": 14.538461538461538, "grad_norm": 3.8166275024414062, "learning_rate": 2.1051645201082167e-06, "loss": 0.3843, "num_input_tokens_seen": 22410464, "step": 21735 }, { "epoch": 14.54180602006689, "grad_norm": 2.37899112701416, "learning_rate": 2.102785321469607e-06, "loss": 0.3368, "num_input_tokens_seen": 22415520, "step": 21740 }, { "epoch": 14.545150501672241, "grad_norm": 3.0690269470214844, "learning_rate": 2.1004071100034394e-06, "loss": 0.4156, "num_input_tokens_seen": 22420480, "step": 21745 }, { "epoch": 14.548494983277592, "grad_norm": 2.7353856563568115, "learning_rate": 2.098029886520046e-06, "loss": 0.4209, "num_input_tokens_seen": 22425760, "step": 21750 }, { "epoch": 14.551839464882944, "grad_norm": 3.3391618728637695, "learning_rate": 2.095653651829423e-06, "loss": 0.3853, "num_input_tokens_seen": 22430592, "step": 21755 }, { "epoch": 14.555183946488294, "grad_norm": 2.3667633533477783, "learning_rate": 2.0932784067412247e-06, "loss": 0.2968, "num_input_tokens_seen": 22434752, "step": 21760 }, { "epoch": 14.558528428093645, "grad_norm": 2.012530565261841, "learning_rate": 2.0909041520647743e-06, "loss": 0.4324, "num_input_tokens_seen": 22439488, "step": 21765 }, { "epoch": 14.561872909698996, "grad_norm": 2.003225326538086, "learning_rate": 2.088530888609055e-06, "loss": 0.3831, "num_input_tokens_seen": 22444768, "step": 21770 }, { "epoch": 14.565217391304348, "grad_norm": 2.6322991847991943, "learning_rate": 2.0861586171827132e-06, "loss": 0.4108, "num_input_tokens_seen": 22450432, "step": 21775 }, { "epoch": 14.568561872909699, "grad_norm": 2.5308096408843994, "learning_rate": 2.083787338594057e-06, "loss": 0.3164, "num_input_tokens_seen": 22455712, "step": 21780 }, { "epoch": 14.57190635451505, "grad_norm": 2.2000582218170166, "learning_rate": 2.081417053651054e-06, "loss": 0.3672, "num_input_tokens_seen": 22460800, "step": 21785 }, { "epoch": 14.575250836120402, "grad_norm": 2.6642348766326904, "learning_rate": 2.079047763161336e-06, "loss": 0.4259, "num_input_tokens_seen": 22465888, "step": 21790 }, { "epoch": 14.578595317725753, "grad_norm": 3.267486810684204, "learning_rate": 2.0766794679321946e-06, "loss": 0.4197, "num_input_tokens_seen": 22471168, "step": 21795 }, { "epoch": 14.581939799331103, "grad_norm": 3.1198768615722656, "learning_rate": 2.0743121687705847e-06, "loss": 0.3871, "num_input_tokens_seen": 22476480, "step": 21800 }, { "epoch": 14.585284280936454, "grad_norm": 5.082079887390137, "learning_rate": 2.0719458664831164e-06, "loss": 0.3032, "num_input_tokens_seen": 22482560, "step": 21805 }, { "epoch": 14.588628762541806, "grad_norm": 2.727372169494629, "learning_rate": 2.069580561876066e-06, "loss": 0.3395, "num_input_tokens_seen": 22487584, "step": 21810 }, { "epoch": 14.591973244147157, "grad_norm": 2.5446689128875732, "learning_rate": 2.0672162557553667e-06, "loss": 0.3982, "num_input_tokens_seen": 22492736, "step": 21815 }, { "epoch": 14.595317725752508, "grad_norm": 3.3702282905578613, "learning_rate": 2.0648529489266116e-06, "loss": 0.4002, "num_input_tokens_seen": 22497984, "step": 21820 }, { "epoch": 14.59866220735786, "grad_norm": 2.722425699234009, "learning_rate": 2.0624906421950563e-06, "loss": 0.3793, "num_input_tokens_seen": 22503328, "step": 21825 }, { "epoch": 14.602006688963211, "grad_norm": 2.8732945919036865, "learning_rate": 2.06012933636561e-06, "loss": 0.3742, "num_input_tokens_seen": 22508192, "step": 21830 }, { "epoch": 14.605351170568563, "grad_norm": 2.6422441005706787, "learning_rate": 2.0577690322428455e-06, "loss": 0.3613, "num_input_tokens_seen": 22513856, "step": 21835 }, { "epoch": 14.608695652173914, "grad_norm": 2.5965402126312256, "learning_rate": 2.055409730630993e-06, "loss": 0.3059, "num_input_tokens_seen": 22519744, "step": 21840 }, { "epoch": 14.612040133779264, "grad_norm": 2.4763104915618896, "learning_rate": 2.053051432333943e-06, "loss": 0.3853, "num_input_tokens_seen": 22524384, "step": 21845 }, { "epoch": 14.615384615384615, "grad_norm": 3.8386847972869873, "learning_rate": 2.0506941381552377e-06, "loss": 0.4115, "num_input_tokens_seen": 22529568, "step": 21850 }, { "epoch": 14.618729096989966, "grad_norm": 2.654418706893921, "learning_rate": 2.0483378488980845e-06, "loss": 0.4355, "num_input_tokens_seen": 22534336, "step": 21855 }, { "epoch": 14.622073578595318, "grad_norm": 2.5387582778930664, "learning_rate": 2.045982565365344e-06, "loss": 0.3908, "num_input_tokens_seen": 22540384, "step": 21860 }, { "epoch": 14.62541806020067, "grad_norm": 2.2148356437683105, "learning_rate": 2.0436282883595387e-06, "loss": 0.3171, "num_input_tokens_seen": 22545056, "step": 21865 }, { "epoch": 14.62876254180602, "grad_norm": 2.0908608436584473, "learning_rate": 2.0412750186828416e-06, "loss": 0.3087, "num_input_tokens_seen": 22550112, "step": 21870 }, { "epoch": 14.632107023411372, "grad_norm": 2.0314440727233887, "learning_rate": 2.0389227571370867e-06, "loss": 0.3571, "num_input_tokens_seen": 22555264, "step": 21875 }, { "epoch": 14.635451505016722, "grad_norm": 2.2390329837799072, "learning_rate": 2.0365715045237646e-06, "loss": 0.3533, "num_input_tokens_seen": 22561152, "step": 21880 }, { "epoch": 14.638795986622073, "grad_norm": 2.510849952697754, "learning_rate": 2.0342212616440206e-06, "loss": 0.3332, "num_input_tokens_seen": 22567168, "step": 21885 }, { "epoch": 14.642140468227424, "grad_norm": 2.38985538482666, "learning_rate": 2.0318720292986584e-06, "loss": 0.3384, "num_input_tokens_seen": 22571616, "step": 21890 }, { "epoch": 14.645484949832776, "grad_norm": 2.5774810314178467, "learning_rate": 2.0295238082881324e-06, "loss": 0.3976, "num_input_tokens_seen": 22576864, "step": 21895 }, { "epoch": 14.648829431438127, "grad_norm": 1.9128280878067017, "learning_rate": 2.0271765994125573e-06, "loss": 0.3847, "num_input_tokens_seen": 22581856, "step": 21900 }, { "epoch": 14.652173913043478, "grad_norm": 2.5937623977661133, "learning_rate": 2.0248304034717007e-06, "loss": 0.4005, "num_input_tokens_seen": 22587392, "step": 21905 }, { "epoch": 14.65551839464883, "grad_norm": 2.6332571506500244, "learning_rate": 2.0224852212649885e-06, "loss": 0.3604, "num_input_tokens_seen": 22592192, "step": 21910 }, { "epoch": 14.658862876254181, "grad_norm": 2.3569390773773193, "learning_rate": 2.0201410535914934e-06, "loss": 0.2778, "num_input_tokens_seen": 22596352, "step": 21915 }, { "epoch": 14.662207357859533, "grad_norm": 3.212186336517334, "learning_rate": 2.01779790124995e-06, "loss": 0.4135, "num_input_tokens_seen": 22601728, "step": 21920 }, { "epoch": 14.665551839464882, "grad_norm": 3.5783419609069824, "learning_rate": 2.0154557650387437e-06, "loss": 0.3853, "num_input_tokens_seen": 22605984, "step": 21925 }, { "epoch": 14.668896321070234, "grad_norm": 3.0264062881469727, "learning_rate": 2.013114645755914e-06, "loss": 0.3764, "num_input_tokens_seen": 22610912, "step": 21930 }, { "epoch": 14.672240802675585, "grad_norm": 2.1847851276397705, "learning_rate": 2.0107745441991568e-06, "loss": 0.2979, "num_input_tokens_seen": 22616416, "step": 21935 }, { "epoch": 14.675585284280936, "grad_norm": 2.5522053241729736, "learning_rate": 2.0084354611658142e-06, "loss": 0.4151, "num_input_tokens_seen": 22622336, "step": 21940 }, { "epoch": 14.678929765886288, "grad_norm": 2.3627729415893555, "learning_rate": 2.0060973974528873e-06, "loss": 0.3623, "num_input_tokens_seen": 22627776, "step": 21945 }, { "epoch": 14.68227424749164, "grad_norm": 2.4634928703308105, "learning_rate": 2.0037603538570286e-06, "loss": 0.325, "num_input_tokens_seen": 22632512, "step": 21950 }, { "epoch": 14.68561872909699, "grad_norm": 1.9173074960708618, "learning_rate": 2.0014243311745446e-06, "loss": 0.3048, "num_input_tokens_seen": 22637472, "step": 21955 }, { "epoch": 14.68896321070234, "grad_norm": 3.2968828678131104, "learning_rate": 1.9990893302013877e-06, "loss": 0.4298, "num_input_tokens_seen": 22643232, "step": 21960 }, { "epoch": 14.692307692307692, "grad_norm": 1.812362551689148, "learning_rate": 1.9967553517331684e-06, "loss": 0.3104, "num_input_tokens_seen": 22649280, "step": 21965 }, { "epoch": 14.695652173913043, "grad_norm": 3.1654491424560547, "learning_rate": 1.9944223965651466e-06, "loss": 0.3899, "num_input_tokens_seen": 22654944, "step": 21970 }, { "epoch": 14.698996655518394, "grad_norm": 4.029391765594482, "learning_rate": 1.992090465492234e-06, "loss": 0.3692, "num_input_tokens_seen": 22659776, "step": 21975 }, { "epoch": 14.702341137123746, "grad_norm": 3.0492982864379883, "learning_rate": 1.9897595593089946e-06, "loss": 0.2871, "num_input_tokens_seen": 22664512, "step": 21980 }, { "epoch": 14.705685618729097, "grad_norm": 1.7005929946899414, "learning_rate": 1.987429678809639e-06, "loss": 0.3799, "num_input_tokens_seen": 22670176, "step": 21985 }, { "epoch": 14.709030100334449, "grad_norm": 2.0955655574798584, "learning_rate": 1.9851008247880315e-06, "loss": 0.3382, "num_input_tokens_seen": 22675296, "step": 21990 }, { "epoch": 14.7123745819398, "grad_norm": 2.829554319381714, "learning_rate": 1.982772998037687e-06, "loss": 0.35, "num_input_tokens_seen": 22680704, "step": 21995 }, { "epoch": 14.715719063545151, "grad_norm": 2.238177537918091, "learning_rate": 1.9804461993517708e-06, "loss": 0.3661, "num_input_tokens_seen": 22686656, "step": 22000 }, { "epoch": 14.719063545150501, "grad_norm": 3.080648183822632, "learning_rate": 1.9781204295230937e-06, "loss": 0.403, "num_input_tokens_seen": 22691776, "step": 22005 }, { "epoch": 14.722408026755852, "grad_norm": 2.8083136081695557, "learning_rate": 1.97579568934412e-06, "loss": 0.3387, "num_input_tokens_seen": 22696096, "step": 22010 }, { "epoch": 14.725752508361204, "grad_norm": 2.6404364109039307, "learning_rate": 1.9734719796069626e-06, "loss": 0.3867, "num_input_tokens_seen": 22701056, "step": 22015 }, { "epoch": 14.729096989966555, "grad_norm": 2.1645240783691406, "learning_rate": 1.9711493011033817e-06, "loss": 0.4089, "num_input_tokens_seen": 22706624, "step": 22020 }, { "epoch": 14.732441471571907, "grad_norm": 2.7487239837646484, "learning_rate": 1.9688276546247895e-06, "loss": 0.4383, "num_input_tokens_seen": 22711808, "step": 22025 }, { "epoch": 14.735785953177258, "grad_norm": 2.7837722301483154, "learning_rate": 1.966507040962241e-06, "loss": 0.4411, "num_input_tokens_seen": 22717120, "step": 22030 }, { "epoch": 14.73913043478261, "grad_norm": 2.244584798812866, "learning_rate": 1.9641874609064443e-06, "loss": 0.3723, "num_input_tokens_seen": 22722464, "step": 22035 }, { "epoch": 14.742474916387959, "grad_norm": 2.636951208114624, "learning_rate": 1.9618689152477527e-06, "loss": 0.3546, "num_input_tokens_seen": 22727008, "step": 22040 }, { "epoch": 14.74581939799331, "grad_norm": 2.929155111312866, "learning_rate": 1.9595514047761703e-06, "loss": 0.3165, "num_input_tokens_seen": 22732288, "step": 22045 }, { "epoch": 14.749163879598662, "grad_norm": 3.1384506225585938, "learning_rate": 1.9572349302813428e-06, "loss": 0.3945, "num_input_tokens_seen": 22737376, "step": 22050 }, { "epoch": 14.752508361204013, "grad_norm": 2.619856119155884, "learning_rate": 1.9549194925525667e-06, "loss": 0.3654, "num_input_tokens_seen": 22742272, "step": 22055 }, { "epoch": 14.755852842809364, "grad_norm": 1.9518264532089233, "learning_rate": 1.9526050923787855e-06, "loss": 0.3268, "num_input_tokens_seen": 22747360, "step": 22060 }, { "epoch": 14.759197324414716, "grad_norm": 2.197465181350708, "learning_rate": 1.950291730548588e-06, "loss": 0.3994, "num_input_tokens_seen": 22752128, "step": 22065 }, { "epoch": 14.762541806020067, "grad_norm": 2.546509265899658, "learning_rate": 1.9479794078502116e-06, "loss": 0.3639, "num_input_tokens_seen": 22757408, "step": 22070 }, { "epoch": 14.765886287625419, "grad_norm": 3.2488436698913574, "learning_rate": 1.9456681250715346e-06, "loss": 0.3703, "num_input_tokens_seen": 22763008, "step": 22075 }, { "epoch": 14.76923076923077, "grad_norm": 2.5811831951141357, "learning_rate": 1.943357883000085e-06, "loss": 0.3214, "num_input_tokens_seen": 22767840, "step": 22080 }, { "epoch": 14.77257525083612, "grad_norm": 2.3508899211883545, "learning_rate": 1.9410486824230353e-06, "loss": 0.4242, "num_input_tokens_seen": 22774048, "step": 22085 }, { "epoch": 14.775919732441471, "grad_norm": 3.4608922004699707, "learning_rate": 1.938740524127204e-06, "loss": 0.3316, "num_input_tokens_seen": 22778944, "step": 22090 }, { "epoch": 14.779264214046822, "grad_norm": 2.533243179321289, "learning_rate": 1.936433408899051e-06, "loss": 0.4657, "num_input_tokens_seen": 22783680, "step": 22095 }, { "epoch": 14.782608695652174, "grad_norm": 2.932924747467041, "learning_rate": 1.934127337524685e-06, "loss": 0.3425, "num_input_tokens_seen": 22788800, "step": 22100 }, { "epoch": 14.785953177257525, "grad_norm": 2.7039780616760254, "learning_rate": 1.931822310789857e-06, "loss": 0.4181, "num_input_tokens_seen": 22795392, "step": 22105 }, { "epoch": 14.789297658862877, "grad_norm": 2.8503706455230713, "learning_rate": 1.9295183294799642e-06, "loss": 0.3487, "num_input_tokens_seen": 22800736, "step": 22110 }, { "epoch": 14.792642140468228, "grad_norm": 2.9168920516967773, "learning_rate": 1.9272153943800413e-06, "loss": 0.4817, "num_input_tokens_seen": 22805632, "step": 22115 }, { "epoch": 14.79598662207358, "grad_norm": 2.200786590576172, "learning_rate": 1.924913506274774e-06, "loss": 0.2895, "num_input_tokens_seen": 22810304, "step": 22120 }, { "epoch": 14.799331103678929, "grad_norm": 2.7523458003997803, "learning_rate": 1.9226126659484873e-06, "loss": 0.3177, "num_input_tokens_seen": 22814976, "step": 22125 }, { "epoch": 14.80267558528428, "grad_norm": 2.812518358230591, "learning_rate": 1.9203128741851508e-06, "loss": 0.4135, "num_input_tokens_seen": 22819648, "step": 22130 }, { "epoch": 14.806020066889632, "grad_norm": 2.1314210891723633, "learning_rate": 1.918014131768377e-06, "loss": 0.2923, "num_input_tokens_seen": 22824064, "step": 22135 }, { "epoch": 14.809364548494983, "grad_norm": 2.959484577178955, "learning_rate": 1.9157164394814177e-06, "loss": 0.4015, "num_input_tokens_seen": 22828768, "step": 22140 }, { "epoch": 14.812709030100335, "grad_norm": 2.499985933303833, "learning_rate": 1.9134197981071694e-06, "loss": 0.4096, "num_input_tokens_seen": 22833824, "step": 22145 }, { "epoch": 14.816053511705686, "grad_norm": 3.182616949081421, "learning_rate": 1.9111242084281713e-06, "loss": 0.3646, "num_input_tokens_seen": 22838624, "step": 22150 }, { "epoch": 14.819397993311037, "grad_norm": 3.73376202583313, "learning_rate": 1.9088296712266046e-06, "loss": 0.365, "num_input_tokens_seen": 22843872, "step": 22155 }, { "epoch": 14.822742474916389, "grad_norm": 2.633007287979126, "learning_rate": 1.9065361872842869e-06, "loss": 0.3609, "num_input_tokens_seen": 22848736, "step": 22160 }, { "epoch": 14.826086956521738, "grad_norm": 3.0432236194610596, "learning_rate": 1.9042437573826828e-06, "loss": 0.3271, "num_input_tokens_seen": 22853760, "step": 22165 }, { "epoch": 14.82943143812709, "grad_norm": 2.8370769023895264, "learning_rate": 1.9019523823028952e-06, "loss": 0.454, "num_input_tokens_seen": 22859552, "step": 22170 }, { "epoch": 14.832775919732441, "grad_norm": 2.286487579345703, "learning_rate": 1.8996620628256678e-06, "loss": 0.3963, "num_input_tokens_seen": 22864384, "step": 22175 }, { "epoch": 14.836120401337793, "grad_norm": 1.706779956817627, "learning_rate": 1.8973727997313862e-06, "loss": 0.4032, "num_input_tokens_seen": 22869088, "step": 22180 }, { "epoch": 14.839464882943144, "grad_norm": 2.751224994659424, "learning_rate": 1.8950845938000717e-06, "loss": 0.4651, "num_input_tokens_seen": 22873824, "step": 22185 }, { "epoch": 14.842809364548495, "grad_norm": 2.9916465282440186, "learning_rate": 1.8927974458113896e-06, "loss": 0.3801, "num_input_tokens_seen": 22878592, "step": 22190 }, { "epoch": 14.846153846153847, "grad_norm": 3.0745415687561035, "learning_rate": 1.890511356544643e-06, "loss": 0.3944, "num_input_tokens_seen": 22884160, "step": 22195 }, { "epoch": 14.849498327759198, "grad_norm": 2.3877501487731934, "learning_rate": 1.888226326778777e-06, "loss": 0.3748, "num_input_tokens_seen": 22889216, "step": 22200 }, { "epoch": 14.852842809364548, "grad_norm": 2.3972275257110596, "learning_rate": 1.8859423572923702e-06, "loss": 0.3065, "num_input_tokens_seen": 22894272, "step": 22205 }, { "epoch": 14.856187290969899, "grad_norm": 2.4244918823242188, "learning_rate": 1.8836594488636434e-06, "loss": 0.3668, "num_input_tokens_seen": 22899424, "step": 22210 }, { "epoch": 14.85953177257525, "grad_norm": 3.1402485370635986, "learning_rate": 1.8813776022704572e-06, "loss": 0.426, "num_input_tokens_seen": 22904448, "step": 22215 }, { "epoch": 14.862876254180602, "grad_norm": 3.06447434425354, "learning_rate": 1.8790968182903074e-06, "loss": 0.3455, "num_input_tokens_seen": 22909312, "step": 22220 }, { "epoch": 14.866220735785953, "grad_norm": 3.9482431411743164, "learning_rate": 1.8768170977003313e-06, "loss": 0.3369, "num_input_tokens_seen": 22913888, "step": 22225 }, { "epoch": 14.869565217391305, "grad_norm": 2.8571932315826416, "learning_rate": 1.8745384412772983e-06, "loss": 0.4602, "num_input_tokens_seen": 22918944, "step": 22230 }, { "epoch": 14.872909698996656, "grad_norm": 3.9879798889160156, "learning_rate": 1.8722608497976196e-06, "loss": 0.4344, "num_input_tokens_seen": 22924576, "step": 22235 }, { "epoch": 14.876254180602007, "grad_norm": 3.219695568084717, "learning_rate": 1.869984324037344e-06, "loss": 0.4134, "num_input_tokens_seen": 22930144, "step": 22240 }, { "epoch": 14.879598662207357, "grad_norm": 3.802640914916992, "learning_rate": 1.8677088647721553e-06, "loss": 0.386, "num_input_tokens_seen": 22935712, "step": 22245 }, { "epoch": 14.882943143812708, "grad_norm": 2.3978376388549805, "learning_rate": 1.8654344727773722e-06, "loss": 0.3516, "num_input_tokens_seen": 22941024, "step": 22250 }, { "epoch": 14.88628762541806, "grad_norm": 3.1077842712402344, "learning_rate": 1.8631611488279532e-06, "loss": 0.33, "num_input_tokens_seen": 22946240, "step": 22255 }, { "epoch": 14.889632107023411, "grad_norm": 1.9172199964523315, "learning_rate": 1.860888893698492e-06, "loss": 0.3795, "num_input_tokens_seen": 22951168, "step": 22260 }, { "epoch": 14.892976588628763, "grad_norm": 2.076634168624878, "learning_rate": 1.8586177081632162e-06, "loss": 0.4013, "num_input_tokens_seen": 22956384, "step": 22265 }, { "epoch": 14.896321070234114, "grad_norm": 2.2368581295013428, "learning_rate": 1.8563475929959934e-06, "loss": 0.3373, "num_input_tokens_seen": 22962016, "step": 22270 }, { "epoch": 14.899665551839465, "grad_norm": 1.8971315622329712, "learning_rate": 1.8540785489703195e-06, "loss": 0.388, "num_input_tokens_seen": 22967392, "step": 22275 }, { "epoch": 14.903010033444817, "grad_norm": 2.09491229057312, "learning_rate": 1.8518105768593315e-06, "loss": 0.3608, "num_input_tokens_seen": 22972320, "step": 22280 }, { "epoch": 14.906354515050166, "grad_norm": 2.5173916816711426, "learning_rate": 1.8495436774357984e-06, "loss": 0.3178, "num_input_tokens_seen": 22977056, "step": 22285 }, { "epoch": 14.909698996655518, "grad_norm": 2.323277473449707, "learning_rate": 1.8472778514721263e-06, "loss": 0.3155, "num_input_tokens_seen": 22982176, "step": 22290 }, { "epoch": 14.91304347826087, "grad_norm": 3.047659397125244, "learning_rate": 1.8450130997403503e-06, "loss": 0.3913, "num_input_tokens_seen": 22987200, "step": 22295 }, { "epoch": 14.91638795986622, "grad_norm": 9.316410064697266, "learning_rate": 1.8427494230121462e-06, "loss": 0.3982, "num_input_tokens_seen": 22992704, "step": 22300 }, { "epoch": 14.919732441471572, "grad_norm": 2.277191400527954, "learning_rate": 1.840486822058815e-06, "loss": 0.3989, "num_input_tokens_seen": 22997664, "step": 22305 }, { "epoch": 14.923076923076923, "grad_norm": 3.545344114303589, "learning_rate": 1.8382252976513031e-06, "loss": 0.3744, "num_input_tokens_seen": 23002528, "step": 22310 }, { "epoch": 14.926421404682275, "grad_norm": 2.29274582862854, "learning_rate": 1.8359648505601775e-06, "loss": 0.3578, "num_input_tokens_seen": 23007808, "step": 22315 }, { "epoch": 14.929765886287626, "grad_norm": 2.8882904052734375, "learning_rate": 1.833705481555646e-06, "loss": 0.3436, "num_input_tokens_seen": 23012480, "step": 22320 }, { "epoch": 14.933110367892976, "grad_norm": 3.0070605278015137, "learning_rate": 1.831447191407547e-06, "loss": 0.4168, "num_input_tokens_seen": 23016960, "step": 22325 }, { "epoch": 14.936454849498327, "grad_norm": 3.3220183849334717, "learning_rate": 1.8291899808853513e-06, "loss": 0.4127, "num_input_tokens_seen": 23021984, "step": 22330 }, { "epoch": 14.939799331103679, "grad_norm": 2.4092085361480713, "learning_rate": 1.8269338507581629e-06, "loss": 0.3499, "num_input_tokens_seen": 23027360, "step": 22335 }, { "epoch": 14.94314381270903, "grad_norm": 2.7378227710723877, "learning_rate": 1.8246788017947138e-06, "loss": 0.4084, "num_input_tokens_seen": 23032864, "step": 22340 }, { "epoch": 14.946488294314381, "grad_norm": 3.1928205490112305, "learning_rate": 1.8224248347633732e-06, "loss": 0.4341, "num_input_tokens_seen": 23037408, "step": 22345 }, { "epoch": 14.949832775919733, "grad_norm": 3.0316762924194336, "learning_rate": 1.8201719504321336e-06, "loss": 0.3678, "num_input_tokens_seen": 23043040, "step": 22350 }, { "epoch": 14.953177257525084, "grad_norm": 2.6587369441986084, "learning_rate": 1.8179201495686311e-06, "loss": 0.5057, "num_input_tokens_seen": 23048128, "step": 22355 }, { "epoch": 14.956521739130435, "grad_norm": 2.801959753036499, "learning_rate": 1.8156694329401192e-06, "loss": 0.3449, "num_input_tokens_seen": 23054016, "step": 22360 }, { "epoch": 14.959866220735787, "grad_norm": 3.2539403438568115, "learning_rate": 1.8134198013134914e-06, "loss": 0.4828, "num_input_tokens_seen": 23059168, "step": 22365 }, { "epoch": 14.963210702341136, "grad_norm": 2.861140727996826, "learning_rate": 1.8111712554552664e-06, "loss": 0.4342, "num_input_tokens_seen": 23064064, "step": 22370 }, { "epoch": 14.966555183946488, "grad_norm": 3.0996551513671875, "learning_rate": 1.8089237961315958e-06, "loss": 0.3842, "num_input_tokens_seen": 23069184, "step": 22375 }, { "epoch": 14.96989966555184, "grad_norm": 2.7711377143859863, "learning_rate": 1.8066774241082612e-06, "loss": 0.3689, "num_input_tokens_seen": 23074880, "step": 22380 }, { "epoch": 14.97324414715719, "grad_norm": 2.813981056213379, "learning_rate": 1.8044321401506693e-06, "loss": 0.3504, "num_input_tokens_seen": 23079712, "step": 22385 }, { "epoch": 14.976588628762542, "grad_norm": 2.826932668685913, "learning_rate": 1.8021879450238606e-06, "loss": 0.4038, "num_input_tokens_seen": 23084160, "step": 22390 }, { "epoch": 14.979933110367893, "grad_norm": 2.3162107467651367, "learning_rate": 1.7999448394925033e-06, "loss": 0.4397, "num_input_tokens_seen": 23089440, "step": 22395 }, { "epoch": 14.983277591973245, "grad_norm": 4.661110877990723, "learning_rate": 1.7977028243208961e-06, "loss": 0.4053, "num_input_tokens_seen": 23093760, "step": 22400 }, { "epoch": 14.986622073578594, "grad_norm": 2.268627405166626, "learning_rate": 1.7954619002729607e-06, "loss": 0.3707, "num_input_tokens_seen": 23100128, "step": 22405 }, { "epoch": 14.989966555183946, "grad_norm": 3.0494842529296875, "learning_rate": 1.793222068112252e-06, "loss": 0.263, "num_input_tokens_seen": 23105696, "step": 22410 }, { "epoch": 14.993311036789297, "grad_norm": 2.5674889087677, "learning_rate": 1.7909833286019518e-06, "loss": 0.3516, "num_input_tokens_seen": 23111136, "step": 22415 }, { "epoch": 14.996655518394649, "grad_norm": 2.8387351036071777, "learning_rate": 1.7887456825048694e-06, "loss": 0.4316, "num_input_tokens_seen": 23117760, "step": 22420 }, { "epoch": 15.0, "grad_norm": 3.9425208568573, "learning_rate": 1.7865091305834431e-06, "loss": 0.3763, "num_input_tokens_seen": 23122720, "step": 22425 }, { "epoch": 15.003344481605351, "grad_norm": 2.4115467071533203, "learning_rate": 1.784273673599733e-06, "loss": 0.392, "num_input_tokens_seen": 23127744, "step": 22430 }, { "epoch": 15.006688963210703, "grad_norm": 2.360506534576416, "learning_rate": 1.7820393123154316e-06, "loss": 0.3329, "num_input_tokens_seen": 23132160, "step": 22435 }, { "epoch": 15.010033444816054, "grad_norm": 3.509146213531494, "learning_rate": 1.7798060474918571e-06, "loss": 0.3159, "num_input_tokens_seen": 23136416, "step": 22440 }, { "epoch": 15.013377926421406, "grad_norm": 2.327461004257202, "learning_rate": 1.777573879889954e-06, "loss": 0.2723, "num_input_tokens_seen": 23141152, "step": 22445 }, { "epoch": 15.016722408026755, "grad_norm": 2.9493699073791504, "learning_rate": 1.7753428102702885e-06, "loss": 0.3599, "num_input_tokens_seen": 23146080, "step": 22450 }, { "epoch": 15.020066889632107, "grad_norm": 2.992288112640381, "learning_rate": 1.7731128393930592e-06, "loss": 0.3559, "num_input_tokens_seen": 23150592, "step": 22455 }, { "epoch": 15.023411371237458, "grad_norm": 2.040471076965332, "learning_rate": 1.7708839680180866e-06, "loss": 0.329, "num_input_tokens_seen": 23155872, "step": 22460 }, { "epoch": 15.02675585284281, "grad_norm": 4.29910135269165, "learning_rate": 1.7686561969048188e-06, "loss": 0.3732, "num_input_tokens_seen": 23160896, "step": 22465 }, { "epoch": 15.03010033444816, "grad_norm": 3.7995007038116455, "learning_rate": 1.7664295268123278e-06, "loss": 0.2484, "num_input_tokens_seen": 23165984, "step": 22470 }, { "epoch": 15.033444816053512, "grad_norm": 2.9668233394622803, "learning_rate": 1.7642039584993077e-06, "loss": 0.4282, "num_input_tokens_seen": 23171072, "step": 22475 }, { "epoch": 15.036789297658864, "grad_norm": 3.3660571575164795, "learning_rate": 1.7619794927240818e-06, "loss": 0.4663, "num_input_tokens_seen": 23176256, "step": 22480 }, { "epoch": 15.040133779264215, "grad_norm": 2.3409883975982666, "learning_rate": 1.7597561302445953e-06, "loss": 0.4136, "num_input_tokens_seen": 23180672, "step": 22485 }, { "epoch": 15.043478260869565, "grad_norm": 3.8607499599456787, "learning_rate": 1.7575338718184198e-06, "loss": 0.4568, "num_input_tokens_seen": 23185952, "step": 22490 }, { "epoch": 15.046822742474916, "grad_norm": 2.6016552448272705, "learning_rate": 1.7553127182027457e-06, "loss": 0.3185, "num_input_tokens_seen": 23191616, "step": 22495 }, { "epoch": 15.050167224080267, "grad_norm": 2.176076650619507, "learning_rate": 1.753092670154391e-06, "loss": 0.4, "num_input_tokens_seen": 23198112, "step": 22500 }, { "epoch": 15.053511705685619, "grad_norm": 3.015354633331299, "learning_rate": 1.7508737284297973e-06, "loss": 0.3738, "num_input_tokens_seen": 23203392, "step": 22505 }, { "epoch": 15.05685618729097, "grad_norm": 2.582557201385498, "learning_rate": 1.748655893785029e-06, "loss": 0.3223, "num_input_tokens_seen": 23207808, "step": 22510 }, { "epoch": 15.060200668896321, "grad_norm": 2.706380605697632, "learning_rate": 1.7464391669757691e-06, "loss": 0.3565, "num_input_tokens_seen": 23212896, "step": 22515 }, { "epoch": 15.063545150501673, "grad_norm": 2.9888274669647217, "learning_rate": 1.7442235487573278e-06, "loss": 0.3333, "num_input_tokens_seen": 23218528, "step": 22520 }, { "epoch": 15.066889632107024, "grad_norm": 2.6558492183685303, "learning_rate": 1.7420090398846362e-06, "loss": 0.3967, "num_input_tokens_seen": 23223616, "step": 22525 }, { "epoch": 15.070234113712374, "grad_norm": 2.4702353477478027, "learning_rate": 1.739795641112248e-06, "loss": 0.4132, "num_input_tokens_seen": 23228160, "step": 22530 }, { "epoch": 15.073578595317725, "grad_norm": 1.860079050064087, "learning_rate": 1.7375833531943388e-06, "loss": 0.341, "num_input_tokens_seen": 23234208, "step": 22535 }, { "epoch": 15.076923076923077, "grad_norm": 2.721834897994995, "learning_rate": 1.735372176884702e-06, "loss": 0.364, "num_input_tokens_seen": 23239296, "step": 22540 }, { "epoch": 15.080267558528428, "grad_norm": 2.357954502105713, "learning_rate": 1.7331621129367583e-06, "loss": 0.3834, "num_input_tokens_seen": 23244160, "step": 22545 }, { "epoch": 15.08361204013378, "grad_norm": 2.3798766136169434, "learning_rate": 1.7309531621035408e-06, "loss": 0.3896, "num_input_tokens_seen": 23248960, "step": 22550 }, { "epoch": 15.08695652173913, "grad_norm": 1.965569257736206, "learning_rate": 1.7287453251377168e-06, "loss": 0.2719, "num_input_tokens_seen": 23253824, "step": 22555 }, { "epoch": 15.090301003344482, "grad_norm": 2.6310667991638184, "learning_rate": 1.7265386027915592e-06, "loss": 0.3448, "num_input_tokens_seen": 23258944, "step": 22560 }, { "epoch": 15.093645484949834, "grad_norm": 2.964017629623413, "learning_rate": 1.7243329958169708e-06, "loss": 0.3922, "num_input_tokens_seen": 23264064, "step": 22565 }, { "epoch": 15.096989966555183, "grad_norm": 2.758631944656372, "learning_rate": 1.7221285049654713e-06, "loss": 0.4858, "num_input_tokens_seen": 23269888, "step": 22570 }, { "epoch": 15.100334448160535, "grad_norm": 3.255436897277832, "learning_rate": 1.7199251309882004e-06, "loss": 0.3879, "num_input_tokens_seen": 23274912, "step": 22575 }, { "epoch": 15.103678929765886, "grad_norm": 2.556586980819702, "learning_rate": 1.717722874635918e-06, "loss": 0.3056, "num_input_tokens_seen": 23280224, "step": 22580 }, { "epoch": 15.107023411371237, "grad_norm": 1.9309314489364624, "learning_rate": 1.7155217366589993e-06, "loss": 0.359, "num_input_tokens_seen": 23284928, "step": 22585 }, { "epoch": 15.110367892976589, "grad_norm": 2.7153940200805664, "learning_rate": 1.7133217178074457e-06, "loss": 0.2974, "num_input_tokens_seen": 23290432, "step": 22590 }, { "epoch": 15.11371237458194, "grad_norm": 2.4031713008880615, "learning_rate": 1.7111228188308665e-06, "loss": 0.3546, "num_input_tokens_seen": 23296192, "step": 22595 }, { "epoch": 15.117056856187292, "grad_norm": 2.3421804904937744, "learning_rate": 1.708925040478504e-06, "loss": 0.3792, "num_input_tokens_seen": 23302336, "step": 22600 }, { "epoch": 15.120401337792643, "grad_norm": 4.433670520782471, "learning_rate": 1.7067283834992048e-06, "loss": 0.4138, "num_input_tokens_seen": 23307744, "step": 22605 }, { "epoch": 15.123745819397993, "grad_norm": 4.121286392211914, "learning_rate": 1.7045328486414414e-06, "loss": 0.3666, "num_input_tokens_seen": 23312256, "step": 22610 }, { "epoch": 15.127090301003344, "grad_norm": 3.347019672393799, "learning_rate": 1.7023384366533013e-06, "loss": 0.3313, "num_input_tokens_seen": 23316640, "step": 22615 }, { "epoch": 15.130434782608695, "grad_norm": 2.5975406169891357, "learning_rate": 1.7001451482824898e-06, "loss": 0.3234, "num_input_tokens_seen": 23322080, "step": 22620 }, { "epoch": 15.133779264214047, "grad_norm": 2.307852029800415, "learning_rate": 1.697952984276332e-06, "loss": 0.3508, "num_input_tokens_seen": 23326528, "step": 22625 }, { "epoch": 15.137123745819398, "grad_norm": 4.328629493713379, "learning_rate": 1.695761945381763e-06, "loss": 0.4189, "num_input_tokens_seen": 23332672, "step": 22630 }, { "epoch": 15.14046822742475, "grad_norm": 1.6220848560333252, "learning_rate": 1.6935720323453431e-06, "loss": 0.3323, "num_input_tokens_seen": 23338464, "step": 22635 }, { "epoch": 15.143812709030101, "grad_norm": 2.5817439556121826, "learning_rate": 1.691383245913239e-06, "loss": 0.3632, "num_input_tokens_seen": 23344960, "step": 22640 }, { "epoch": 15.147157190635452, "grad_norm": 2.7425689697265625, "learning_rate": 1.6891955868312465e-06, "loss": 0.45, "num_input_tokens_seen": 23350688, "step": 22645 }, { "epoch": 15.150501672240802, "grad_norm": 1.760685682296753, "learning_rate": 1.6870090558447644e-06, "loss": 0.3661, "num_input_tokens_seen": 23355616, "step": 22650 }, { "epoch": 15.153846153846153, "grad_norm": 2.845898389816284, "learning_rate": 1.6848236536988156e-06, "loss": 0.3024, "num_input_tokens_seen": 23361088, "step": 22655 }, { "epoch": 15.157190635451505, "grad_norm": 2.6579697132110596, "learning_rate": 1.6826393811380342e-06, "loss": 0.3489, "num_input_tokens_seen": 23366368, "step": 22660 }, { "epoch": 15.160535117056856, "grad_norm": 2.3370561599731445, "learning_rate": 1.6804562389066714e-06, "loss": 0.3144, "num_input_tokens_seen": 23371808, "step": 22665 }, { "epoch": 15.163879598662207, "grad_norm": 3.0028293132781982, "learning_rate": 1.6782742277485947e-06, "loss": 0.3447, "num_input_tokens_seen": 23376320, "step": 22670 }, { "epoch": 15.167224080267559, "grad_norm": 2.5639936923980713, "learning_rate": 1.67609334840728e-06, "loss": 0.3354, "num_input_tokens_seen": 23381088, "step": 22675 }, { "epoch": 15.17056856187291, "grad_norm": 2.558643341064453, "learning_rate": 1.6739136016258257e-06, "loss": 0.3978, "num_input_tokens_seen": 23386208, "step": 22680 }, { "epoch": 15.173913043478262, "grad_norm": 3.107038736343384, "learning_rate": 1.6717349881469347e-06, "loss": 0.397, "num_input_tokens_seen": 23391424, "step": 22685 }, { "epoch": 15.177257525083611, "grad_norm": 2.050222158432007, "learning_rate": 1.6695575087129367e-06, "loss": 0.3675, "num_input_tokens_seen": 23397216, "step": 22690 }, { "epoch": 15.180602006688963, "grad_norm": 2.6907269954681396, "learning_rate": 1.6673811640657621e-06, "loss": 0.2987, "num_input_tokens_seen": 23402016, "step": 22695 }, { "epoch": 15.183946488294314, "grad_norm": 2.6900546550750732, "learning_rate": 1.6652059549469625e-06, "loss": 0.3509, "num_input_tokens_seen": 23407232, "step": 22700 }, { "epoch": 15.187290969899665, "grad_norm": 2.6111857891082764, "learning_rate": 1.6630318820976994e-06, "loss": 0.3239, "num_input_tokens_seen": 23412032, "step": 22705 }, { "epoch": 15.190635451505017, "grad_norm": 2.590925931930542, "learning_rate": 1.6608589462587482e-06, "loss": 0.2897, "num_input_tokens_seen": 23417056, "step": 22710 }, { "epoch": 15.193979933110368, "grad_norm": 2.41616153717041, "learning_rate": 1.6586871481704985e-06, "loss": 0.3316, "num_input_tokens_seen": 23422080, "step": 22715 }, { "epoch": 15.19732441471572, "grad_norm": 2.5793752670288086, "learning_rate": 1.656516488572948e-06, "loss": 0.3613, "num_input_tokens_seen": 23426464, "step": 22720 }, { "epoch": 15.200668896321071, "grad_norm": 3.0792665481567383, "learning_rate": 1.6543469682057105e-06, "loss": 0.3837, "num_input_tokens_seen": 23431232, "step": 22725 }, { "epoch": 15.20401337792642, "grad_norm": 2.7252297401428223, "learning_rate": 1.652178587808006e-06, "loss": 0.4333, "num_input_tokens_seen": 23436736, "step": 22730 }, { "epoch": 15.207357859531772, "grad_norm": 3.051496982574463, "learning_rate": 1.650011348118677e-06, "loss": 0.4379, "num_input_tokens_seen": 23442112, "step": 22735 }, { "epoch": 15.210702341137123, "grad_norm": 2.160569190979004, "learning_rate": 1.6478452498761654e-06, "loss": 0.3284, "num_input_tokens_seen": 23447424, "step": 22740 }, { "epoch": 15.214046822742475, "grad_norm": 3.123487710952759, "learning_rate": 1.6456802938185317e-06, "loss": 0.3404, "num_input_tokens_seen": 23452768, "step": 22745 }, { "epoch": 15.217391304347826, "grad_norm": 2.5394628047943115, "learning_rate": 1.643516480683441e-06, "loss": 0.3214, "num_input_tokens_seen": 23458272, "step": 22750 }, { "epoch": 15.220735785953178, "grad_norm": 2.985506057739258, "learning_rate": 1.6413538112081777e-06, "loss": 0.3278, "num_input_tokens_seen": 23463840, "step": 22755 }, { "epoch": 15.224080267558529, "grad_norm": 2.6730918884277344, "learning_rate": 1.6391922861296283e-06, "loss": 0.3115, "num_input_tokens_seen": 23468992, "step": 22760 }, { "epoch": 15.22742474916388, "grad_norm": 2.6875030994415283, "learning_rate": 1.6370319061842933e-06, "loss": 0.4675, "num_input_tokens_seen": 23473600, "step": 22765 }, { "epoch": 15.23076923076923, "grad_norm": 2.887617349624634, "learning_rate": 1.6348726721082836e-06, "loss": 0.3192, "num_input_tokens_seen": 23478368, "step": 22770 }, { "epoch": 15.234113712374581, "grad_norm": 2.4843056201934814, "learning_rate": 1.6327145846373144e-06, "loss": 0.3171, "num_input_tokens_seen": 23483168, "step": 22775 }, { "epoch": 15.237458193979933, "grad_norm": 2.660395860671997, "learning_rate": 1.63055764450672e-06, "loss": 0.3976, "num_input_tokens_seen": 23488512, "step": 22780 }, { "epoch": 15.240802675585284, "grad_norm": 2.620049476623535, "learning_rate": 1.6284018524514333e-06, "loss": 0.3207, "num_input_tokens_seen": 23492864, "step": 22785 }, { "epoch": 15.244147157190636, "grad_norm": 3.5718941688537598, "learning_rate": 1.626247209206004e-06, "loss": 0.3326, "num_input_tokens_seen": 23498272, "step": 22790 }, { "epoch": 15.247491638795987, "grad_norm": 2.3332245349884033, "learning_rate": 1.6240937155045822e-06, "loss": 0.3697, "num_input_tokens_seen": 23505024, "step": 22795 }, { "epoch": 15.250836120401338, "grad_norm": 2.7596898078918457, "learning_rate": 1.6219413720809374e-06, "loss": 0.3667, "num_input_tokens_seen": 23510432, "step": 22800 }, { "epoch": 15.25418060200669, "grad_norm": 2.5898585319519043, "learning_rate": 1.6197901796684363e-06, "loss": 0.4024, "num_input_tokens_seen": 23515712, "step": 22805 }, { "epoch": 15.25752508361204, "grad_norm": 2.8235714435577393, "learning_rate": 1.6176401390000595e-06, "loss": 0.3838, "num_input_tokens_seen": 23520480, "step": 22810 }, { "epoch": 15.26086956521739, "grad_norm": 2.95745849609375, "learning_rate": 1.6154912508083954e-06, "loss": 0.3211, "num_input_tokens_seen": 23525280, "step": 22815 }, { "epoch": 15.264214046822742, "grad_norm": 2.8992748260498047, "learning_rate": 1.6133435158256327e-06, "loss": 0.287, "num_input_tokens_seen": 23530272, "step": 22820 }, { "epoch": 15.267558528428093, "grad_norm": 3.0401480197906494, "learning_rate": 1.6111969347835794e-06, "loss": 0.3493, "num_input_tokens_seen": 23535744, "step": 22825 }, { "epoch": 15.270903010033445, "grad_norm": 2.223546266555786, "learning_rate": 1.6090515084136388e-06, "loss": 0.3788, "num_input_tokens_seen": 23540736, "step": 22830 }, { "epoch": 15.274247491638796, "grad_norm": 3.283289909362793, "learning_rate": 1.606907237446828e-06, "loss": 0.4848, "num_input_tokens_seen": 23545472, "step": 22835 }, { "epoch": 15.277591973244148, "grad_norm": 3.6468005180358887, "learning_rate": 1.6047641226137628e-06, "loss": 0.3434, "num_input_tokens_seen": 23550336, "step": 22840 }, { "epoch": 15.280936454849499, "grad_norm": 2.9622795581817627, "learning_rate": 1.602622164644677e-06, "loss": 0.3164, "num_input_tokens_seen": 23554784, "step": 22845 }, { "epoch": 15.284280936454849, "grad_norm": 2.4677443504333496, "learning_rate": 1.6004813642693983e-06, "loss": 0.3411, "num_input_tokens_seen": 23559136, "step": 22850 }, { "epoch": 15.2876254180602, "grad_norm": 2.292013645172119, "learning_rate": 1.5983417222173664e-06, "loss": 0.4312, "num_input_tokens_seen": 23563904, "step": 22855 }, { "epoch": 15.290969899665551, "grad_norm": 3.4150643348693848, "learning_rate": 1.596203239217627e-06, "loss": 0.3965, "num_input_tokens_seen": 23568864, "step": 22860 }, { "epoch": 15.294314381270903, "grad_norm": 2.528709650039673, "learning_rate": 1.5940659159988226e-06, "loss": 0.3237, "num_input_tokens_seen": 23573920, "step": 22865 }, { "epoch": 15.297658862876254, "grad_norm": 3.5185649394989014, "learning_rate": 1.591929753289214e-06, "loss": 0.3364, "num_input_tokens_seen": 23578752, "step": 22870 }, { "epoch": 15.301003344481606, "grad_norm": 2.405583620071411, "learning_rate": 1.589794751816655e-06, "loss": 0.4174, "num_input_tokens_seen": 23584704, "step": 22875 }, { "epoch": 15.304347826086957, "grad_norm": 4.046800136566162, "learning_rate": 1.58766091230861e-06, "loss": 0.4562, "num_input_tokens_seen": 23590400, "step": 22880 }, { "epoch": 15.307692307692308, "grad_norm": 3.018465518951416, "learning_rate": 1.585528235492141e-06, "loss": 0.5048, "num_input_tokens_seen": 23596640, "step": 22885 }, { "epoch": 15.31103678929766, "grad_norm": 2.740177869796753, "learning_rate": 1.583396722093925e-06, "loss": 0.45, "num_input_tokens_seen": 23601376, "step": 22890 }, { "epoch": 15.31438127090301, "grad_norm": 3.2642345428466797, "learning_rate": 1.581266372840231e-06, "loss": 0.3812, "num_input_tokens_seen": 23606144, "step": 22895 }, { "epoch": 15.31772575250836, "grad_norm": 2.6576414108276367, "learning_rate": 1.5791371884569373e-06, "loss": 0.3624, "num_input_tokens_seen": 23611008, "step": 22900 }, { "epoch": 15.321070234113712, "grad_norm": 2.422093391418457, "learning_rate": 1.5770091696695245e-06, "loss": 0.3771, "num_input_tokens_seen": 23616256, "step": 22905 }, { "epoch": 15.324414715719064, "grad_norm": 3.458652973175049, "learning_rate": 1.5748823172030758e-06, "loss": 0.3667, "num_input_tokens_seen": 23620896, "step": 22910 }, { "epoch": 15.327759197324415, "grad_norm": 2.717298984527588, "learning_rate": 1.572756631782279e-06, "loss": 0.412, "num_input_tokens_seen": 23627008, "step": 22915 }, { "epoch": 15.331103678929766, "grad_norm": 3.363353967666626, "learning_rate": 1.5706321141314179e-06, "loss": 0.3956, "num_input_tokens_seen": 23632512, "step": 22920 }, { "epoch": 15.334448160535118, "grad_norm": 2.5281713008880615, "learning_rate": 1.5685087649743868e-06, "loss": 0.3679, "num_input_tokens_seen": 23637536, "step": 22925 }, { "epoch": 15.337792642140467, "grad_norm": 3.0305287837982178, "learning_rate": 1.5663865850346721e-06, "loss": 0.3536, "num_input_tokens_seen": 23642304, "step": 22930 }, { "epoch": 15.341137123745819, "grad_norm": 3.850722074508667, "learning_rate": 1.5642655750353746e-06, "loss": 0.3968, "num_input_tokens_seen": 23647776, "step": 22935 }, { "epoch": 15.34448160535117, "grad_norm": 3.1091108322143555, "learning_rate": 1.5621457356991849e-06, "loss": 0.3734, "num_input_tokens_seen": 23652992, "step": 22940 }, { "epoch": 15.347826086956522, "grad_norm": 4.710226535797119, "learning_rate": 1.5600270677484008e-06, "loss": 0.4474, "num_input_tokens_seen": 23657760, "step": 22945 }, { "epoch": 15.351170568561873, "grad_norm": 2.206853151321411, "learning_rate": 1.5579095719049165e-06, "loss": 0.4188, "num_input_tokens_seen": 23663520, "step": 22950 }, { "epoch": 15.354515050167224, "grad_norm": 2.655700445175171, "learning_rate": 1.5557932488902338e-06, "loss": 0.3222, "num_input_tokens_seen": 23668704, "step": 22955 }, { "epoch": 15.357859531772576, "grad_norm": 1.7708029747009277, "learning_rate": 1.553678099425448e-06, "loss": 0.2897, "num_input_tokens_seen": 23673568, "step": 22960 }, { "epoch": 15.361204013377927, "grad_norm": 2.667823553085327, "learning_rate": 1.5515641242312573e-06, "loss": 0.4827, "num_input_tokens_seen": 23678912, "step": 22965 }, { "epoch": 15.364548494983278, "grad_norm": 3.0303268432617188, "learning_rate": 1.5494513240279624e-06, "loss": 0.4397, "num_input_tokens_seen": 23684128, "step": 22970 }, { "epoch": 15.367892976588628, "grad_norm": 2.8801631927490234, "learning_rate": 1.5473396995354556e-06, "loss": 0.3454, "num_input_tokens_seen": 23690272, "step": 22975 }, { "epoch": 15.37123745819398, "grad_norm": 3.4553143978118896, "learning_rate": 1.5452292514732409e-06, "loss": 0.3971, "num_input_tokens_seen": 23695584, "step": 22980 }, { "epoch": 15.37458193979933, "grad_norm": 3.093435764312744, "learning_rate": 1.5431199805604086e-06, "loss": 0.3595, "num_input_tokens_seen": 23700224, "step": 22985 }, { "epoch": 15.377926421404682, "grad_norm": 3.084420919418335, "learning_rate": 1.5410118875156588e-06, "loss": 0.3112, "num_input_tokens_seen": 23705312, "step": 22990 }, { "epoch": 15.381270903010034, "grad_norm": 2.71762752532959, "learning_rate": 1.5389049730572786e-06, "loss": 0.4189, "num_input_tokens_seen": 23710816, "step": 22995 }, { "epoch": 15.384615384615385, "grad_norm": 2.2684881687164307, "learning_rate": 1.5367992379031683e-06, "loss": 0.4148, "num_input_tokens_seen": 23715840, "step": 23000 }, { "epoch": 15.387959866220736, "grad_norm": 2.6997838020324707, "learning_rate": 1.534694682770812e-06, "loss": 0.3809, "num_input_tokens_seen": 23720992, "step": 23005 }, { "epoch": 15.391304347826088, "grad_norm": 3.860238790512085, "learning_rate": 1.5325913083773008e-06, "loss": 0.413, "num_input_tokens_seen": 23726400, "step": 23010 }, { "epoch": 15.394648829431437, "grad_norm": 3.298625946044922, "learning_rate": 1.530489115439322e-06, "loss": 0.3711, "num_input_tokens_seen": 23731456, "step": 23015 }, { "epoch": 15.397993311036789, "grad_norm": 2.2555735111236572, "learning_rate": 1.5283881046731541e-06, "loss": 0.3237, "num_input_tokens_seen": 23737120, "step": 23020 }, { "epoch": 15.40133779264214, "grad_norm": 2.3798630237579346, "learning_rate": 1.5262882767946841e-06, "loss": 0.388, "num_input_tokens_seen": 23741792, "step": 23025 }, { "epoch": 15.404682274247492, "grad_norm": 2.4586100578308105, "learning_rate": 1.524189632519385e-06, "loss": 0.3054, "num_input_tokens_seen": 23747552, "step": 23030 }, { "epoch": 15.408026755852843, "grad_norm": 2.8150665760040283, "learning_rate": 1.5220921725623344e-06, "loss": 0.3714, "num_input_tokens_seen": 23752352, "step": 23035 }, { "epoch": 15.411371237458194, "grad_norm": 2.5014877319335938, "learning_rate": 1.519995897638198e-06, "loss": 0.3288, "num_input_tokens_seen": 23757248, "step": 23040 }, { "epoch": 15.414715719063546, "grad_norm": 2.288118362426758, "learning_rate": 1.517900808461249e-06, "loss": 0.3783, "num_input_tokens_seen": 23762656, "step": 23045 }, { "epoch": 15.418060200668897, "grad_norm": 3.349846601486206, "learning_rate": 1.5158069057453461e-06, "loss": 0.3282, "num_input_tokens_seen": 23767424, "step": 23050 }, { "epoch": 15.421404682274247, "grad_norm": 2.8696722984313965, "learning_rate": 1.5137141902039493e-06, "loss": 0.3808, "num_input_tokens_seen": 23772416, "step": 23055 }, { "epoch": 15.424749163879598, "grad_norm": 2.675002336502075, "learning_rate": 1.5116226625501145e-06, "loss": 0.402, "num_input_tokens_seen": 23778592, "step": 23060 }, { "epoch": 15.42809364548495, "grad_norm": 2.7368597984313965, "learning_rate": 1.5095323234964864e-06, "loss": 0.3368, "num_input_tokens_seen": 23784384, "step": 23065 }, { "epoch": 15.431438127090301, "grad_norm": 2.207899570465088, "learning_rate": 1.5074431737553158e-06, "loss": 0.3032, "num_input_tokens_seen": 23789824, "step": 23070 }, { "epoch": 15.434782608695652, "grad_norm": 2.1895387172698975, "learning_rate": 1.505355214038437e-06, "loss": 0.3507, "num_input_tokens_seen": 23794112, "step": 23075 }, { "epoch": 15.438127090301004, "grad_norm": 3.776470899581909, "learning_rate": 1.503268445057287e-06, "loss": 0.4551, "num_input_tokens_seen": 23799232, "step": 23080 }, { "epoch": 15.441471571906355, "grad_norm": 2.134068250656128, "learning_rate": 1.5011828675228894e-06, "loss": 0.4024, "num_input_tokens_seen": 23804928, "step": 23085 }, { "epoch": 15.444816053511706, "grad_norm": 2.8867499828338623, "learning_rate": 1.499098482145872e-06, "loss": 0.3585, "num_input_tokens_seen": 23810496, "step": 23090 }, { "epoch": 15.448160535117056, "grad_norm": 4.820174694061279, "learning_rate": 1.4970152896364465e-06, "loss": 0.4692, "num_input_tokens_seen": 23815872, "step": 23095 }, { "epoch": 15.451505016722408, "grad_norm": 2.7974367141723633, "learning_rate": 1.4949332907044239e-06, "loss": 0.3034, "num_input_tokens_seen": 23821792, "step": 23100 }, { "epoch": 15.454849498327759, "grad_norm": 2.8220362663269043, "learning_rate": 1.492852486059208e-06, "loss": 0.3698, "num_input_tokens_seen": 23827360, "step": 23105 }, { "epoch": 15.45819397993311, "grad_norm": 2.370034694671631, "learning_rate": 1.4907728764097902e-06, "loss": 0.3104, "num_input_tokens_seen": 23833408, "step": 23110 }, { "epoch": 15.461538461538462, "grad_norm": 3.8732802867889404, "learning_rate": 1.4886944624647647e-06, "loss": 0.3524, "num_input_tokens_seen": 23839200, "step": 23115 }, { "epoch": 15.464882943143813, "grad_norm": 2.7800419330596924, "learning_rate": 1.486617244932309e-06, "loss": 0.4398, "num_input_tokens_seen": 23845472, "step": 23120 }, { "epoch": 15.468227424749164, "grad_norm": 3.963639974594116, "learning_rate": 1.4845412245201995e-06, "loss": 0.2956, "num_input_tokens_seen": 23851296, "step": 23125 }, { "epoch": 15.471571906354516, "grad_norm": 2.916152238845825, "learning_rate": 1.4824664019357965e-06, "loss": 0.3437, "num_input_tokens_seen": 23855968, "step": 23130 }, { "epoch": 15.474916387959865, "grad_norm": 3.7186686992645264, "learning_rate": 1.4803927778860649e-06, "loss": 0.375, "num_input_tokens_seen": 23861120, "step": 23135 }, { "epoch": 15.478260869565217, "grad_norm": 4.027297019958496, "learning_rate": 1.4783203530775481e-06, "loss": 0.4018, "num_input_tokens_seen": 23865952, "step": 23140 }, { "epoch": 15.481605351170568, "grad_norm": 3.1258904933929443, "learning_rate": 1.4762491282163888e-06, "loss": 0.3848, "num_input_tokens_seen": 23871040, "step": 23145 }, { "epoch": 15.48494983277592, "grad_norm": 2.8078320026397705, "learning_rate": 1.4741791040083204e-06, "loss": 0.4221, "num_input_tokens_seen": 23876800, "step": 23150 }, { "epoch": 15.488294314381271, "grad_norm": 2.7087178230285645, "learning_rate": 1.4721102811586613e-06, "loss": 0.3301, "num_input_tokens_seen": 23881760, "step": 23155 }, { "epoch": 15.491638795986622, "grad_norm": 3.0728447437286377, "learning_rate": 1.4700426603723272e-06, "loss": 0.3876, "num_input_tokens_seen": 23887104, "step": 23160 }, { "epoch": 15.494983277591974, "grad_norm": 3.0873544216156006, "learning_rate": 1.4679762423538213e-06, "loss": 0.4028, "num_input_tokens_seen": 23892224, "step": 23165 }, { "epoch": 15.498327759197325, "grad_norm": 2.8759284019470215, "learning_rate": 1.4659110278072392e-06, "loss": 0.363, "num_input_tokens_seen": 23897728, "step": 23170 }, { "epoch": 15.501672240802675, "grad_norm": 2.7121479511260986, "learning_rate": 1.4638470174362601e-06, "loss": 0.3615, "num_input_tokens_seen": 23902848, "step": 23175 }, { "epoch": 15.505016722408026, "grad_norm": 2.570598602294922, "learning_rate": 1.4617842119441634e-06, "loss": 0.4021, "num_input_tokens_seen": 23907424, "step": 23180 }, { "epoch": 15.508361204013378, "grad_norm": 2.5194602012634277, "learning_rate": 1.459722612033807e-06, "loss": 0.3382, "num_input_tokens_seen": 23911680, "step": 23185 }, { "epoch": 15.511705685618729, "grad_norm": 2.0954222679138184, "learning_rate": 1.4576622184076467e-06, "loss": 0.3775, "num_input_tokens_seen": 23917120, "step": 23190 }, { "epoch": 15.51505016722408, "grad_norm": 2.534639596939087, "learning_rate": 1.4556030317677206e-06, "loss": 0.3506, "num_input_tokens_seen": 23921824, "step": 23195 }, { "epoch": 15.518394648829432, "grad_norm": 3.4993467330932617, "learning_rate": 1.4535450528156598e-06, "loss": 0.3309, "num_input_tokens_seen": 23926816, "step": 23200 }, { "epoch": 15.521739130434783, "grad_norm": 3.236996650695801, "learning_rate": 1.4514882822526827e-06, "loss": 0.4528, "num_input_tokens_seen": 23932864, "step": 23205 }, { "epoch": 15.525083612040135, "grad_norm": 1.9561166763305664, "learning_rate": 1.4494327207795955e-06, "loss": 0.3665, "num_input_tokens_seen": 23938272, "step": 23210 }, { "epoch": 15.528428093645484, "grad_norm": 2.2112908363342285, "learning_rate": 1.4473783690967958e-06, "loss": 0.363, "num_input_tokens_seen": 23943456, "step": 23215 }, { "epoch": 15.531772575250836, "grad_norm": 3.949557304382324, "learning_rate": 1.4453252279042596e-06, "loss": 0.4142, "num_input_tokens_seen": 23948608, "step": 23220 }, { "epoch": 15.535117056856187, "grad_norm": 2.6318445205688477, "learning_rate": 1.4432732979015641e-06, "loss": 0.3332, "num_input_tokens_seen": 23953024, "step": 23225 }, { "epoch": 15.538461538461538, "grad_norm": 4.001376152038574, "learning_rate": 1.4412225797878615e-06, "loss": 0.3961, "num_input_tokens_seen": 23958016, "step": 23230 }, { "epoch": 15.54180602006689, "grad_norm": 2.5729095935821533, "learning_rate": 1.4391730742618997e-06, "loss": 0.4201, "num_input_tokens_seen": 23963040, "step": 23235 }, { "epoch": 15.545150501672241, "grad_norm": 2.160754442214966, "learning_rate": 1.4371247820220064e-06, "loss": 0.3131, "num_input_tokens_seen": 23968512, "step": 23240 }, { "epoch": 15.548494983277592, "grad_norm": 2.8479466438293457, "learning_rate": 1.4350777037661012e-06, "loss": 0.3831, "num_input_tokens_seen": 23974208, "step": 23245 }, { "epoch": 15.551839464882944, "grad_norm": 2.7131781578063965, "learning_rate": 1.4330318401916882e-06, "loss": 0.4185, "num_input_tokens_seen": 23979136, "step": 23250 }, { "epoch": 15.555183946488294, "grad_norm": 2.676649808883667, "learning_rate": 1.4309871919958574e-06, "loss": 0.4314, "num_input_tokens_seen": 23984672, "step": 23255 }, { "epoch": 15.558528428093645, "grad_norm": 2.307349681854248, "learning_rate": 1.428943759875287e-06, "loss": 0.4112, "num_input_tokens_seen": 23990400, "step": 23260 }, { "epoch": 15.561872909698996, "grad_norm": 3.3961949348449707, "learning_rate": 1.4269015445262336e-06, "loss": 0.3623, "num_input_tokens_seen": 23995264, "step": 23265 }, { "epoch": 15.565217391304348, "grad_norm": 3.048286199569702, "learning_rate": 1.424860546644551e-06, "loss": 0.372, "num_input_tokens_seen": 24000000, "step": 23270 }, { "epoch": 15.568561872909699, "grad_norm": 2.7635679244995117, "learning_rate": 1.4228207669256665e-06, "loss": 0.372, "num_input_tokens_seen": 24005024, "step": 23275 }, { "epoch": 15.57190635451505, "grad_norm": 2.510690212249756, "learning_rate": 1.420782206064601e-06, "loss": 0.3988, "num_input_tokens_seen": 24010848, "step": 23280 }, { "epoch": 15.575250836120402, "grad_norm": 2.9125001430511475, "learning_rate": 1.4187448647559532e-06, "loss": 0.3612, "num_input_tokens_seen": 24015424, "step": 23285 }, { "epoch": 15.578595317725753, "grad_norm": 2.3948020935058594, "learning_rate": 1.416708743693911e-06, "loss": 0.3903, "num_input_tokens_seen": 24021088, "step": 23290 }, { "epoch": 15.581939799331103, "grad_norm": 2.6617255210876465, "learning_rate": 1.4146738435722445e-06, "loss": 0.363, "num_input_tokens_seen": 24026240, "step": 23295 }, { "epoch": 15.585284280936454, "grad_norm": 2.8476345539093018, "learning_rate": 1.4126401650843096e-06, "loss": 0.4061, "num_input_tokens_seen": 24031744, "step": 23300 }, { "epoch": 15.588628762541806, "grad_norm": 2.6521525382995605, "learning_rate": 1.4106077089230458e-06, "loss": 0.4336, "num_input_tokens_seen": 24037312, "step": 23305 }, { "epoch": 15.591973244147157, "grad_norm": 2.416618585586548, "learning_rate": 1.40857647578097e-06, "loss": 0.354, "num_input_tokens_seen": 24042144, "step": 23310 }, { "epoch": 15.595317725752508, "grad_norm": 2.3732523918151855, "learning_rate": 1.4065464663501937e-06, "loss": 0.4196, "num_input_tokens_seen": 24047040, "step": 23315 }, { "epoch": 15.59866220735786, "grad_norm": 2.4243781566619873, "learning_rate": 1.404517681322401e-06, "loss": 0.3413, "num_input_tokens_seen": 24051488, "step": 23320 }, { "epoch": 15.602006688963211, "grad_norm": 3.7990670204162598, "learning_rate": 1.4024901213888658e-06, "loss": 0.3663, "num_input_tokens_seen": 24056992, "step": 23325 }, { "epoch": 15.605351170568563, "grad_norm": 3.5433850288391113, "learning_rate": 1.4004637872404381e-06, "loss": 0.4259, "num_input_tokens_seen": 24062176, "step": 23330 }, { "epoch": 15.608695652173914, "grad_norm": 3.579505205154419, "learning_rate": 1.3984386795675564e-06, "loss": 0.4055, "num_input_tokens_seen": 24067904, "step": 23335 }, { "epoch": 15.612040133779264, "grad_norm": 3.025109052658081, "learning_rate": 1.396414799060238e-06, "loss": 0.3501, "num_input_tokens_seen": 24072448, "step": 23340 }, { "epoch": 15.615384615384615, "grad_norm": 2.6501669883728027, "learning_rate": 1.3943921464080834e-06, "loss": 0.3617, "num_input_tokens_seen": 24077664, "step": 23345 }, { "epoch": 15.618729096989966, "grad_norm": 1.6451029777526855, "learning_rate": 1.3923707223002748e-06, "loss": 0.3288, "num_input_tokens_seen": 24083264, "step": 23350 }, { "epoch": 15.622073578595318, "grad_norm": 3.6251962184906006, "learning_rate": 1.3903505274255719e-06, "loss": 0.4299, "num_input_tokens_seen": 24088768, "step": 23355 }, { "epoch": 15.62541806020067, "grad_norm": 2.8391525745391846, "learning_rate": 1.3883315624723236e-06, "loss": 0.4103, "num_input_tokens_seen": 24093792, "step": 23360 }, { "epoch": 15.62876254180602, "grad_norm": 4.10461950302124, "learning_rate": 1.386313828128451e-06, "loss": 0.4225, "num_input_tokens_seen": 24098656, "step": 23365 }, { "epoch": 15.632107023411372, "grad_norm": 2.338287591934204, "learning_rate": 1.3842973250814634e-06, "loss": 0.4073, "num_input_tokens_seen": 24103776, "step": 23370 }, { "epoch": 15.635451505016722, "grad_norm": 2.317196846008301, "learning_rate": 1.3822820540184418e-06, "loss": 0.3401, "num_input_tokens_seen": 24109888, "step": 23375 }, { "epoch": 15.638795986622073, "grad_norm": 2.573955774307251, "learning_rate": 1.3802680156260584e-06, "loss": 0.3301, "num_input_tokens_seen": 24114496, "step": 23380 }, { "epoch": 15.642140468227424, "grad_norm": 2.729769229888916, "learning_rate": 1.3782552105905562e-06, "loss": 0.3587, "num_input_tokens_seen": 24120128, "step": 23385 }, { "epoch": 15.645484949832776, "grad_norm": 2.4075372219085693, "learning_rate": 1.376243639597763e-06, "loss": 0.4059, "num_input_tokens_seen": 24125088, "step": 23390 }, { "epoch": 15.648829431438127, "grad_norm": 2.26216197013855, "learning_rate": 1.3742333033330823e-06, "loss": 0.4591, "num_input_tokens_seen": 24130112, "step": 23395 }, { "epoch": 15.652173913043478, "grad_norm": 3.0767343044281006, "learning_rate": 1.3722242024815008e-06, "loss": 0.392, "num_input_tokens_seen": 24135808, "step": 23400 }, { "epoch": 15.65551839464883, "grad_norm": 3.121659278869629, "learning_rate": 1.3702163377275823e-06, "loss": 0.3538, "num_input_tokens_seen": 24140640, "step": 23405 }, { "epoch": 15.658862876254181, "grad_norm": 2.6062123775482178, "learning_rate": 1.3682097097554692e-06, "loss": 0.4135, "num_input_tokens_seen": 24145632, "step": 23410 }, { "epoch": 15.662207357859533, "grad_norm": 2.3635990619659424, "learning_rate": 1.366204319248885e-06, "loss": 0.3438, "num_input_tokens_seen": 24150400, "step": 23415 }, { "epoch": 15.665551839464882, "grad_norm": 2.554725170135498, "learning_rate": 1.3642001668911248e-06, "loss": 0.4518, "num_input_tokens_seen": 24155072, "step": 23420 }, { "epoch": 15.668896321070234, "grad_norm": 3.81839656829834, "learning_rate": 1.3621972533650728e-06, "loss": 0.4221, "num_input_tokens_seen": 24160256, "step": 23425 }, { "epoch": 15.672240802675585, "grad_norm": 3.449396848678589, "learning_rate": 1.3601955793531802e-06, "loss": 0.3475, "num_input_tokens_seen": 24164640, "step": 23430 }, { "epoch": 15.675585284280936, "grad_norm": 3.637834310531616, "learning_rate": 1.358195145537483e-06, "loss": 0.436, "num_input_tokens_seen": 24170816, "step": 23435 }, { "epoch": 15.678929765886288, "grad_norm": 3.6536412239074707, "learning_rate": 1.3561959525995894e-06, "loss": 0.3711, "num_input_tokens_seen": 24176096, "step": 23440 }, { "epoch": 15.68227424749164, "grad_norm": 2.843606948852539, "learning_rate": 1.3541980012206895e-06, "loss": 0.3725, "num_input_tokens_seen": 24181312, "step": 23445 }, { "epoch": 15.68561872909699, "grad_norm": 1.956017017364502, "learning_rate": 1.352201292081548e-06, "loss": 0.3367, "num_input_tokens_seen": 24187072, "step": 23450 }, { "epoch": 15.68896321070234, "grad_norm": 3.3414182662963867, "learning_rate": 1.3502058258625066e-06, "loss": 0.2908, "num_input_tokens_seen": 24192672, "step": 23455 }, { "epoch": 15.692307692307692, "grad_norm": 2.8718855381011963, "learning_rate": 1.348211603243485e-06, "loss": 0.3502, "num_input_tokens_seen": 24197696, "step": 23460 }, { "epoch": 15.695652173913043, "grad_norm": 2.33302640914917, "learning_rate": 1.3462186249039732e-06, "loss": 0.337, "num_input_tokens_seen": 24203456, "step": 23465 }, { "epoch": 15.698996655518394, "grad_norm": 2.3326427936553955, "learning_rate": 1.3442268915230488e-06, "loss": 0.3964, "num_input_tokens_seen": 24208832, "step": 23470 }, { "epoch": 15.702341137123746, "grad_norm": 2.6478610038757324, "learning_rate": 1.3422364037793523e-06, "loss": 0.4283, "num_input_tokens_seen": 24214144, "step": 23475 }, { "epoch": 15.705685618729097, "grad_norm": 2.6255767345428467, "learning_rate": 1.3402471623511092e-06, "loss": 0.3535, "num_input_tokens_seen": 24220032, "step": 23480 }, { "epoch": 15.709030100334449, "grad_norm": 3.3011717796325684, "learning_rate": 1.3382591679161144e-06, "loss": 0.3545, "num_input_tokens_seen": 24224800, "step": 23485 }, { "epoch": 15.7123745819398, "grad_norm": 2.3198583126068115, "learning_rate": 1.336272421151741e-06, "loss": 0.3683, "num_input_tokens_seen": 24230016, "step": 23490 }, { "epoch": 15.715719063545151, "grad_norm": 2.2979860305786133, "learning_rate": 1.334286922734937e-06, "loss": 0.4506, "num_input_tokens_seen": 24234720, "step": 23495 }, { "epoch": 15.719063545150501, "grad_norm": 2.365553140640259, "learning_rate": 1.3323026733422233e-06, "loss": 0.3947, "num_input_tokens_seen": 24240032, "step": 23500 }, { "epoch": 15.722408026755852, "grad_norm": 2.214193105697632, "learning_rate": 1.3303196736496987e-06, "loss": 0.3396, "num_input_tokens_seen": 24245408, "step": 23505 }, { "epoch": 15.725752508361204, "grad_norm": 2.5856211185455322, "learning_rate": 1.3283379243330292e-06, "loss": 0.3264, "num_input_tokens_seen": 24250528, "step": 23510 }, { "epoch": 15.729096989966555, "grad_norm": 2.399590492248535, "learning_rate": 1.326357426067465e-06, "loss": 0.3237, "num_input_tokens_seen": 24255296, "step": 23515 }, { "epoch": 15.732441471571907, "grad_norm": 3.6487529277801514, "learning_rate": 1.3243781795278198e-06, "loss": 0.3845, "num_input_tokens_seen": 24260128, "step": 23520 }, { "epoch": 15.735785953177258, "grad_norm": 3.4562478065490723, "learning_rate": 1.3224001853884889e-06, "loss": 0.3672, "num_input_tokens_seen": 24264960, "step": 23525 }, { "epoch": 15.73913043478261, "grad_norm": 3.03216290473938, "learning_rate": 1.3204234443234338e-06, "loss": 0.4, "num_input_tokens_seen": 24270976, "step": 23530 }, { "epoch": 15.742474916387959, "grad_norm": 2.3081581592559814, "learning_rate": 1.3184479570061937e-06, "loss": 0.3433, "num_input_tokens_seen": 24275904, "step": 23535 }, { "epoch": 15.74581939799331, "grad_norm": 3.690248966217041, "learning_rate": 1.31647372410988e-06, "loss": 0.4193, "num_input_tokens_seen": 24280384, "step": 23540 }, { "epoch": 15.749163879598662, "grad_norm": 3.107180595397949, "learning_rate": 1.3145007463071763e-06, "loss": 0.3651, "num_input_tokens_seen": 24286048, "step": 23545 }, { "epoch": 15.752508361204013, "grad_norm": 2.3972017765045166, "learning_rate": 1.3125290242703392e-06, "loss": 0.325, "num_input_tokens_seen": 24291648, "step": 23550 }, { "epoch": 15.755852842809364, "grad_norm": 3.9494900703430176, "learning_rate": 1.3105585586711927e-06, "loss": 0.4272, "num_input_tokens_seen": 24297216, "step": 23555 }, { "epoch": 15.759197324414716, "grad_norm": 2.229585886001587, "learning_rate": 1.3085893501811426e-06, "loss": 0.3649, "num_input_tokens_seen": 24301728, "step": 23560 }, { "epoch": 15.762541806020067, "grad_norm": 3.5817902088165283, "learning_rate": 1.3066213994711552e-06, "loss": 0.2995, "num_input_tokens_seen": 24306656, "step": 23565 }, { "epoch": 15.765886287625419, "grad_norm": 2.4706170558929443, "learning_rate": 1.3046547072117772e-06, "loss": 0.4459, "num_input_tokens_seen": 24311808, "step": 23570 }, { "epoch": 15.76923076923077, "grad_norm": 2.7151808738708496, "learning_rate": 1.3026892740731196e-06, "loss": 0.3411, "num_input_tokens_seen": 24316800, "step": 23575 }, { "epoch": 15.77257525083612, "grad_norm": 3.2479047775268555, "learning_rate": 1.3007251007248678e-06, "loss": 0.3749, "num_input_tokens_seen": 24322464, "step": 23580 }, { "epoch": 15.775919732441471, "grad_norm": 4.086181163787842, "learning_rate": 1.2987621878362793e-06, "loss": 0.3648, "num_input_tokens_seen": 24327648, "step": 23585 }, { "epoch": 15.779264214046822, "grad_norm": 2.332033634185791, "learning_rate": 1.296800536076181e-06, "loss": 0.2758, "num_input_tokens_seen": 24332064, "step": 23590 }, { "epoch": 15.782608695652174, "grad_norm": 3.11957049369812, "learning_rate": 1.2948401461129667e-06, "loss": 0.3038, "num_input_tokens_seen": 24337024, "step": 23595 }, { "epoch": 15.785953177257525, "grad_norm": 2.643131732940674, "learning_rate": 1.2928810186146047e-06, "loss": 0.4099, "num_input_tokens_seen": 24342976, "step": 23600 }, { "epoch": 15.789297658862877, "grad_norm": 2.417048215866089, "learning_rate": 1.2909231542486312e-06, "loss": 0.3531, "num_input_tokens_seen": 24347904, "step": 23605 }, { "epoch": 15.792642140468228, "grad_norm": 2.4065115451812744, "learning_rate": 1.2889665536821527e-06, "loss": 0.3097, "num_input_tokens_seen": 24352544, "step": 23610 }, { "epoch": 15.79598662207358, "grad_norm": 2.835625648498535, "learning_rate": 1.287011217581846e-06, "loss": 0.3548, "num_input_tokens_seen": 24358112, "step": 23615 }, { "epoch": 15.799331103678929, "grad_norm": 2.972475528717041, "learning_rate": 1.2850571466139534e-06, "loss": 0.3226, "num_input_tokens_seen": 24363712, "step": 23620 }, { "epoch": 15.80267558528428, "grad_norm": 2.4812967777252197, "learning_rate": 1.2831043414442896e-06, "loss": 0.3576, "num_input_tokens_seen": 24368448, "step": 23625 }, { "epoch": 15.806020066889632, "grad_norm": 2.083871364593506, "learning_rate": 1.2811528027382369e-06, "loss": 0.3183, "num_input_tokens_seen": 24374272, "step": 23630 }, { "epoch": 15.809364548494983, "grad_norm": 3.350778579711914, "learning_rate": 1.2792025311607475e-06, "loss": 0.4082, "num_input_tokens_seen": 24378848, "step": 23635 }, { "epoch": 15.812709030100335, "grad_norm": 2.2673802375793457, "learning_rate": 1.2772535273763375e-06, "loss": 0.4055, "num_input_tokens_seen": 24384448, "step": 23640 }, { "epoch": 15.816053511705686, "grad_norm": 2.0358030796051025, "learning_rate": 1.2753057920490952e-06, "loss": 0.3505, "num_input_tokens_seen": 24390624, "step": 23645 }, { "epoch": 15.819397993311037, "grad_norm": 2.1556291580200195, "learning_rate": 1.2733593258426763e-06, "loss": 0.3654, "num_input_tokens_seen": 24395648, "step": 23650 }, { "epoch": 15.822742474916389, "grad_norm": 3.436542510986328, "learning_rate": 1.2714141294203032e-06, "loss": 0.3807, "num_input_tokens_seen": 24400000, "step": 23655 }, { "epoch": 15.826086956521738, "grad_norm": 2.1163790225982666, "learning_rate": 1.2694702034447659e-06, "loss": 0.3392, "num_input_tokens_seen": 24404896, "step": 23660 }, { "epoch": 15.82943143812709, "grad_norm": 2.9174184799194336, "learning_rate": 1.2675275485784194e-06, "loss": 0.3795, "num_input_tokens_seen": 24409536, "step": 23665 }, { "epoch": 15.832775919732441, "grad_norm": 3.4913885593414307, "learning_rate": 1.2655861654831892e-06, "loss": 0.3613, "num_input_tokens_seen": 24413792, "step": 23670 }, { "epoch": 15.836120401337793, "grad_norm": 2.348085641860962, "learning_rate": 1.2636460548205647e-06, "loss": 0.3539, "num_input_tokens_seen": 24418208, "step": 23675 }, { "epoch": 15.839464882943144, "grad_norm": 1.8946928977966309, "learning_rate": 1.2617072172516054e-06, "loss": 0.382, "num_input_tokens_seen": 24423520, "step": 23680 }, { "epoch": 15.842809364548495, "grad_norm": 3.049415111541748, "learning_rate": 1.2597696534369309e-06, "loss": 0.3201, "num_input_tokens_seen": 24427712, "step": 23685 }, { "epoch": 15.846153846153847, "grad_norm": 3.611459732055664, "learning_rate": 1.2578333640367318e-06, "loss": 0.328, "num_input_tokens_seen": 24433088, "step": 23690 }, { "epoch": 15.849498327759198, "grad_norm": 2.2893383502960205, "learning_rate": 1.2558983497107629e-06, "loss": 0.4053, "num_input_tokens_seen": 24437856, "step": 23695 }, { "epoch": 15.852842809364548, "grad_norm": 2.7843446731567383, "learning_rate": 1.2539646111183452e-06, "loss": 0.3815, "num_input_tokens_seen": 24443584, "step": 23700 }, { "epoch": 15.856187290969899, "grad_norm": 3.6024105548858643, "learning_rate": 1.2520321489183652e-06, "loss": 0.3678, "num_input_tokens_seen": 24449536, "step": 23705 }, { "epoch": 15.85953177257525, "grad_norm": 2.1697075366973877, "learning_rate": 1.2501009637692712e-06, "loss": 0.3349, "num_input_tokens_seen": 24454336, "step": 23710 }, { "epoch": 15.862876254180602, "grad_norm": 2.709049701690674, "learning_rate": 1.24817105632908e-06, "loss": 0.396, "num_input_tokens_seen": 24460000, "step": 23715 }, { "epoch": 15.866220735785953, "grad_norm": 2.0169990062713623, "learning_rate": 1.2462424272553725e-06, "loss": 0.3637, "num_input_tokens_seen": 24466272, "step": 23720 }, { "epoch": 15.869565217391305, "grad_norm": 2.357724666595459, "learning_rate": 1.2443150772052948e-06, "loss": 0.4262, "num_input_tokens_seen": 24471744, "step": 23725 }, { "epoch": 15.872909698996656, "grad_norm": 3.7895638942718506, "learning_rate": 1.2423890068355526e-06, "loss": 0.3075, "num_input_tokens_seen": 24476064, "step": 23730 }, { "epoch": 15.876254180602007, "grad_norm": 2.921623468399048, "learning_rate": 1.2404642168024206e-06, "loss": 0.3623, "num_input_tokens_seen": 24480544, "step": 23735 }, { "epoch": 15.879598662207357, "grad_norm": 2.327751398086548, "learning_rate": 1.238540707761735e-06, "loss": 0.355, "num_input_tokens_seen": 24484992, "step": 23740 }, { "epoch": 15.882943143812708, "grad_norm": 3.5016887187957764, "learning_rate": 1.2366184803688969e-06, "loss": 0.3802, "num_input_tokens_seen": 24490048, "step": 23745 }, { "epoch": 15.88628762541806, "grad_norm": 4.064520359039307, "learning_rate": 1.2346975352788708e-06, "loss": 0.3691, "num_input_tokens_seen": 24494752, "step": 23750 }, { "epoch": 15.889632107023411, "grad_norm": 2.5937228202819824, "learning_rate": 1.2327778731461803e-06, "loss": 0.3544, "num_input_tokens_seen": 24499488, "step": 23755 }, { "epoch": 15.892976588628763, "grad_norm": 2.090352773666382, "learning_rate": 1.2308594946249163e-06, "loss": 0.3254, "num_input_tokens_seen": 24505312, "step": 23760 }, { "epoch": 15.896321070234114, "grad_norm": 3.379826545715332, "learning_rate": 1.2289424003687307e-06, "loss": 0.3702, "num_input_tokens_seen": 24510688, "step": 23765 }, { "epoch": 15.899665551839465, "grad_norm": 2.910230875015259, "learning_rate": 1.22702659103084e-06, "loss": 0.3227, "num_input_tokens_seen": 24515424, "step": 23770 }, { "epoch": 15.903010033444817, "grad_norm": 2.1824464797973633, "learning_rate": 1.2251120672640176e-06, "loss": 0.3578, "num_input_tokens_seen": 24521184, "step": 23775 }, { "epoch": 15.906354515050166, "grad_norm": 2.7883825302124023, "learning_rate": 1.2231988297206032e-06, "loss": 0.3981, "num_input_tokens_seen": 24526784, "step": 23780 }, { "epoch": 15.909698996655518, "grad_norm": 3.2658700942993164, "learning_rate": 1.2212868790524985e-06, "loss": 0.4237, "num_input_tokens_seen": 24530944, "step": 23785 }, { "epoch": 15.91304347826087, "grad_norm": 2.3687307834625244, "learning_rate": 1.2193762159111639e-06, "loss": 0.3831, "num_input_tokens_seen": 24536320, "step": 23790 }, { "epoch": 15.91638795986622, "grad_norm": 2.79469895362854, "learning_rate": 1.2174668409476253e-06, "loss": 0.3466, "num_input_tokens_seen": 24541504, "step": 23795 }, { "epoch": 15.919732441471572, "grad_norm": 2.5555341243743896, "learning_rate": 1.2155587548124631e-06, "loss": 0.4318, "num_input_tokens_seen": 24546528, "step": 23800 }, { "epoch": 15.923076923076923, "grad_norm": 3.8658015727996826, "learning_rate": 1.2136519581558243e-06, "loss": 0.3402, "num_input_tokens_seen": 24552192, "step": 23805 }, { "epoch": 15.926421404682275, "grad_norm": 2.557978630065918, "learning_rate": 1.2117464516274147e-06, "loss": 0.3273, "num_input_tokens_seen": 24557280, "step": 23810 }, { "epoch": 15.929765886287626, "grad_norm": 3.4846060276031494, "learning_rate": 1.2098422358765021e-06, "loss": 0.4378, "num_input_tokens_seen": 24562976, "step": 23815 }, { "epoch": 15.933110367892976, "grad_norm": 3.139674663543701, "learning_rate": 1.2079393115519094e-06, "loss": 0.3409, "num_input_tokens_seen": 24568384, "step": 23820 }, { "epoch": 15.936454849498327, "grad_norm": 2.484815835952759, "learning_rate": 1.2060376793020257e-06, "loss": 0.451, "num_input_tokens_seen": 24572736, "step": 23825 }, { "epoch": 15.939799331103679, "grad_norm": 3.059589385986328, "learning_rate": 1.2041373397747953e-06, "loss": 0.4233, "num_input_tokens_seen": 24577984, "step": 23830 }, { "epoch": 15.94314381270903, "grad_norm": 2.400993585586548, "learning_rate": 1.2022382936177263e-06, "loss": 0.4155, "num_input_tokens_seen": 24582368, "step": 23835 }, { "epoch": 15.946488294314381, "grad_norm": 3.5449352264404297, "learning_rate": 1.2003405414778807e-06, "loss": 0.3356, "num_input_tokens_seen": 24587584, "step": 23840 }, { "epoch": 15.949832775919733, "grad_norm": 4.692881107330322, "learning_rate": 1.198444084001884e-06, "loss": 0.3629, "num_input_tokens_seen": 24592256, "step": 23845 }, { "epoch": 15.953177257525084, "grad_norm": 4.573724269866943, "learning_rate": 1.1965489218359195e-06, "loss": 0.3682, "num_input_tokens_seen": 24597152, "step": 23850 }, { "epoch": 15.956521739130435, "grad_norm": 2.7193446159362793, "learning_rate": 1.1946550556257275e-06, "loss": 0.3445, "num_input_tokens_seen": 24601856, "step": 23855 }, { "epoch": 15.959866220735787, "grad_norm": 3.5432753562927246, "learning_rate": 1.1927624860166104e-06, "loss": 0.3453, "num_input_tokens_seen": 24606784, "step": 23860 }, { "epoch": 15.963210702341136, "grad_norm": 2.453021287918091, "learning_rate": 1.1908712136534228e-06, "loss": 0.3376, "num_input_tokens_seen": 24611648, "step": 23865 }, { "epoch": 15.966555183946488, "grad_norm": 2.7967028617858887, "learning_rate": 1.1889812391805832e-06, "loss": 0.3545, "num_input_tokens_seen": 24616640, "step": 23870 }, { "epoch": 15.96989966555184, "grad_norm": 2.4699785709381104, "learning_rate": 1.187092563242065e-06, "loss": 0.3513, "num_input_tokens_seen": 24621600, "step": 23875 }, { "epoch": 15.97324414715719, "grad_norm": 2.4377903938293457, "learning_rate": 1.1852051864814013e-06, "loss": 0.4027, "num_input_tokens_seen": 24626560, "step": 23880 }, { "epoch": 15.976588628762542, "grad_norm": 3.430870532989502, "learning_rate": 1.1833191095416773e-06, "loss": 0.3743, "num_input_tokens_seen": 24631968, "step": 23885 }, { "epoch": 15.979933110367893, "grad_norm": 3.6098215579986572, "learning_rate": 1.1814343330655415e-06, "loss": 0.365, "num_input_tokens_seen": 24637056, "step": 23890 }, { "epoch": 15.983277591973245, "grad_norm": 2.7078917026519775, "learning_rate": 1.1795508576951958e-06, "loss": 0.4111, "num_input_tokens_seen": 24642336, "step": 23895 }, { "epoch": 15.986622073578594, "grad_norm": 2.7761523723602295, "learning_rate": 1.1776686840724004e-06, "loss": 0.3586, "num_input_tokens_seen": 24647616, "step": 23900 }, { "epoch": 15.989966555183946, "grad_norm": 3.1803951263427734, "learning_rate": 1.175787812838472e-06, "loss": 0.3634, "num_input_tokens_seen": 24652704, "step": 23905 }, { "epoch": 15.993311036789297, "grad_norm": 2.8112590312957764, "learning_rate": 1.1739082446342802e-06, "loss": 0.3852, "num_input_tokens_seen": 24657888, "step": 23910 }, { "epoch": 15.996655518394649, "grad_norm": 3.1201484203338623, "learning_rate": 1.172029980100255e-06, "loss": 0.3887, "num_input_tokens_seen": 24664288, "step": 23915 }, { "epoch": 16.0, "grad_norm": 3.733642339706421, "learning_rate": 1.1701530198763794e-06, "loss": 0.3145, "num_input_tokens_seen": 24669056, "step": 23920 }, { "epoch": 16.0, "eval_loss": 0.5284302830696106, "eval_runtime": 37.6217, "eval_samples_per_second": 39.738, "eval_steps_per_second": 9.941, "num_input_tokens_seen": 24669056, "step": 23920 }, { "epoch": 16.00334448160535, "grad_norm": 3.629271984100342, "learning_rate": 1.1682773646021955e-06, "loss": 0.3663, "num_input_tokens_seen": 24673856, "step": 23925 }, { "epoch": 16.006688963210703, "grad_norm": 3.2001729011535645, "learning_rate": 1.1664030149167943e-06, "loss": 0.3879, "num_input_tokens_seen": 24679168, "step": 23930 }, { "epoch": 16.010033444816052, "grad_norm": 3.085829019546509, "learning_rate": 1.1645299714588282e-06, "loss": 0.3344, "num_input_tokens_seen": 24683872, "step": 23935 }, { "epoch": 16.013377926421406, "grad_norm": 2.6943254470825195, "learning_rate": 1.1626582348665016e-06, "loss": 0.3561, "num_input_tokens_seen": 24689632, "step": 23940 }, { "epoch": 16.016722408026755, "grad_norm": 3.082434892654419, "learning_rate": 1.160787805777575e-06, "loss": 0.4768, "num_input_tokens_seen": 24695072, "step": 23945 }, { "epoch": 16.02006688963211, "grad_norm": 2.7115979194641113, "learning_rate": 1.1589186848293633e-06, "loss": 0.3944, "num_input_tokens_seen": 24701024, "step": 23950 }, { "epoch": 16.023411371237458, "grad_norm": 2.4310100078582764, "learning_rate": 1.157050872658732e-06, "loss": 0.2903, "num_input_tokens_seen": 24705312, "step": 23955 }, { "epoch": 16.02675585284281, "grad_norm": 2.886064291000366, "learning_rate": 1.1551843699021055e-06, "loss": 0.3951, "num_input_tokens_seen": 24710528, "step": 23960 }, { "epoch": 16.03010033444816, "grad_norm": 2.3764772415161133, "learning_rate": 1.1533191771954599e-06, "loss": 0.3821, "num_input_tokens_seen": 24715744, "step": 23965 }, { "epoch": 16.03344481605351, "grad_norm": 2.4368550777435303, "learning_rate": 1.1514552951743268e-06, "loss": 0.3977, "num_input_tokens_seen": 24722176, "step": 23970 }, { "epoch": 16.036789297658864, "grad_norm": 2.480401039123535, "learning_rate": 1.149592724473787e-06, "loss": 0.3983, "num_input_tokens_seen": 24727808, "step": 23975 }, { "epoch": 16.040133779264213, "grad_norm": 2.3687057495117188, "learning_rate": 1.1477314657284777e-06, "loss": 0.3099, "num_input_tokens_seen": 24732512, "step": 23980 }, { "epoch": 16.043478260869566, "grad_norm": 2.2911934852600098, "learning_rate": 1.1458715195725894e-06, "loss": 0.3237, "num_input_tokens_seen": 24737152, "step": 23985 }, { "epoch": 16.046822742474916, "grad_norm": 2.487067461013794, "learning_rate": 1.1440128866398643e-06, "loss": 0.3745, "num_input_tokens_seen": 24742080, "step": 23990 }, { "epoch": 16.05016722408027, "grad_norm": 3.073190927505493, "learning_rate": 1.1421555675635976e-06, "loss": 0.2704, "num_input_tokens_seen": 24746592, "step": 23995 }, { "epoch": 16.05351170568562, "grad_norm": 2.692095994949341, "learning_rate": 1.140299562976635e-06, "loss": 0.3525, "num_input_tokens_seen": 24751072, "step": 24000 }, { "epoch": 16.05685618729097, "grad_norm": 2.708618402481079, "learning_rate": 1.1384448735113767e-06, "loss": 0.4048, "num_input_tokens_seen": 24756384, "step": 24005 }, { "epoch": 16.06020066889632, "grad_norm": 2.8098418712615967, "learning_rate": 1.136591499799774e-06, "loss": 0.3778, "num_input_tokens_seen": 24761760, "step": 24010 }, { "epoch": 16.06354515050167, "grad_norm": 2.338905096054077, "learning_rate": 1.1347394424733316e-06, "loss": 0.332, "num_input_tokens_seen": 24767072, "step": 24015 }, { "epoch": 16.066889632107024, "grad_norm": 2.7633180618286133, "learning_rate": 1.1328887021630997e-06, "loss": 0.3847, "num_input_tokens_seen": 24772640, "step": 24020 }, { "epoch": 16.070234113712374, "grad_norm": 2.986258029937744, "learning_rate": 1.1310392794996867e-06, "loss": 0.4043, "num_input_tokens_seen": 24777600, "step": 24025 }, { "epoch": 16.073578595317727, "grad_norm": 4.187742233276367, "learning_rate": 1.129191175113249e-06, "loss": 0.393, "num_input_tokens_seen": 24782752, "step": 24030 }, { "epoch": 16.076923076923077, "grad_norm": 2.5010159015655518, "learning_rate": 1.1273443896334946e-06, "loss": 0.3202, "num_input_tokens_seen": 24787328, "step": 24035 }, { "epoch": 16.08026755852843, "grad_norm": 3.599031448364258, "learning_rate": 1.1254989236896797e-06, "loss": 0.3442, "num_input_tokens_seen": 24792480, "step": 24040 }, { "epoch": 16.08361204013378, "grad_norm": 3.1682333946228027, "learning_rate": 1.1236547779106139e-06, "loss": 0.4234, "num_input_tokens_seen": 24798400, "step": 24045 }, { "epoch": 16.08695652173913, "grad_norm": 4.411017417907715, "learning_rate": 1.1218119529246556e-06, "loss": 0.407, "num_input_tokens_seen": 24804608, "step": 24050 }, { "epoch": 16.090301003344482, "grad_norm": 3.155557155609131, "learning_rate": 1.119970449359714e-06, "loss": 0.3149, "num_input_tokens_seen": 24809248, "step": 24055 }, { "epoch": 16.093645484949832, "grad_norm": 2.4352171421051025, "learning_rate": 1.1181302678432481e-06, "loss": 0.3706, "num_input_tokens_seen": 24814624, "step": 24060 }, { "epoch": 16.096989966555185, "grad_norm": 2.560615301132202, "learning_rate": 1.1162914090022636e-06, "loss": 0.2971, "num_input_tokens_seen": 24819584, "step": 24065 }, { "epoch": 16.100334448160535, "grad_norm": 3.0251240730285645, "learning_rate": 1.1144538734633191e-06, "loss": 0.3729, "num_input_tokens_seen": 24824864, "step": 24070 }, { "epoch": 16.103678929765888, "grad_norm": 3.253610610961914, "learning_rate": 1.1126176618525202e-06, "loss": 0.352, "num_input_tokens_seen": 24829888, "step": 24075 }, { "epoch": 16.107023411371237, "grad_norm": 3.5367391109466553, "learning_rate": 1.1107827747955247e-06, "loss": 0.2802, "num_input_tokens_seen": 24835008, "step": 24080 }, { "epoch": 16.110367892976587, "grad_norm": 2.280301332473755, "learning_rate": 1.1089492129175329e-06, "loss": 0.3237, "num_input_tokens_seen": 24839488, "step": 24085 }, { "epoch": 16.11371237458194, "grad_norm": 3.8701422214508057, "learning_rate": 1.1071169768432983e-06, "loss": 0.4278, "num_input_tokens_seen": 24844576, "step": 24090 }, { "epoch": 16.11705685618729, "grad_norm": 4.389606475830078, "learning_rate": 1.1052860671971221e-06, "loss": 0.3793, "num_input_tokens_seen": 24848800, "step": 24095 }, { "epoch": 16.120401337792643, "grad_norm": 2.2383873462677, "learning_rate": 1.1034564846028533e-06, "loss": 0.2934, "num_input_tokens_seen": 24854112, "step": 24100 }, { "epoch": 16.123745819397993, "grad_norm": 2.447228193283081, "learning_rate": 1.1016282296838887e-06, "loss": 0.3839, "num_input_tokens_seen": 24859680, "step": 24105 }, { "epoch": 16.127090301003346, "grad_norm": 2.9173007011413574, "learning_rate": 1.099801303063171e-06, "loss": 0.377, "num_input_tokens_seen": 24864928, "step": 24110 }, { "epoch": 16.130434782608695, "grad_norm": 3.248995065689087, "learning_rate": 1.0979757053631918e-06, "loss": 0.344, "num_input_tokens_seen": 24869120, "step": 24115 }, { "epoch": 16.13377926421405, "grad_norm": 2.112170696258545, "learning_rate": 1.0961514372059907e-06, "loss": 0.3543, "num_input_tokens_seen": 24873600, "step": 24120 }, { "epoch": 16.137123745819398, "grad_norm": 3.604665994644165, "learning_rate": 1.0943284992131548e-06, "loss": 0.4534, "num_input_tokens_seen": 24878944, "step": 24125 }, { "epoch": 16.140468227424748, "grad_norm": 2.2789766788482666, "learning_rate": 1.092506892005813e-06, "loss": 0.298, "num_input_tokens_seen": 24884512, "step": 24130 }, { "epoch": 16.1438127090301, "grad_norm": 2.080951690673828, "learning_rate": 1.0906866162046465e-06, "loss": 0.3025, "num_input_tokens_seen": 24890368, "step": 24135 }, { "epoch": 16.14715719063545, "grad_norm": 2.552341938018799, "learning_rate": 1.0888676724298808e-06, "loss": 0.3755, "num_input_tokens_seen": 24895552, "step": 24140 }, { "epoch": 16.150501672240804, "grad_norm": 2.079772710800171, "learning_rate": 1.0870500613012868e-06, "loss": 0.3079, "num_input_tokens_seen": 24900288, "step": 24145 }, { "epoch": 16.153846153846153, "grad_norm": 2.7861478328704834, "learning_rate": 1.0852337834381837e-06, "loss": 0.4065, "num_input_tokens_seen": 24905504, "step": 24150 }, { "epoch": 16.157190635451506, "grad_norm": 3.0014336109161377, "learning_rate": 1.0834188394594319e-06, "loss": 0.3312, "num_input_tokens_seen": 24911840, "step": 24155 }, { "epoch": 16.160535117056856, "grad_norm": 3.318146228790283, "learning_rate": 1.0816052299834418e-06, "loss": 0.3693, "num_input_tokens_seen": 24917248, "step": 24160 }, { "epoch": 16.163879598662206, "grad_norm": 3.9274208545684814, "learning_rate": 1.0797929556281662e-06, "loss": 0.3788, "num_input_tokens_seen": 24921664, "step": 24165 }, { "epoch": 16.16722408026756, "grad_norm": 3.4039535522460938, "learning_rate": 1.0779820170111067e-06, "loss": 0.4245, "num_input_tokens_seen": 24927712, "step": 24170 }, { "epoch": 16.17056856187291, "grad_norm": 2.3368985652923584, "learning_rate": 1.076172414749304e-06, "loss": 0.3578, "num_input_tokens_seen": 24933376, "step": 24175 }, { "epoch": 16.17391304347826, "grad_norm": 2.993318796157837, "learning_rate": 1.0743641494593482e-06, "loss": 0.3505, "num_input_tokens_seen": 24937824, "step": 24180 }, { "epoch": 16.17725752508361, "grad_norm": 2.9615745544433594, "learning_rate": 1.072557221757372e-06, "loss": 0.2952, "num_input_tokens_seen": 24942400, "step": 24185 }, { "epoch": 16.180602006688964, "grad_norm": 2.3973984718322754, "learning_rate": 1.0707516322590532e-06, "loss": 0.3585, "num_input_tokens_seen": 24948160, "step": 24190 }, { "epoch": 16.183946488294314, "grad_norm": 3.632411479949951, "learning_rate": 1.0689473815796141e-06, "loss": 0.3849, "num_input_tokens_seen": 24953568, "step": 24195 }, { "epoch": 16.187290969899667, "grad_norm": 2.0468475818634033, "learning_rate": 1.0671444703338168e-06, "loss": 0.3867, "num_input_tokens_seen": 24959360, "step": 24200 }, { "epoch": 16.190635451505017, "grad_norm": 3.105344772338867, "learning_rate": 1.0653428991359726e-06, "loss": 0.4244, "num_input_tokens_seen": 24964384, "step": 24205 }, { "epoch": 16.193979933110366, "grad_norm": 3.547250986099243, "learning_rate": 1.0635426685999323e-06, "loss": 0.3326, "num_input_tokens_seen": 24969344, "step": 24210 }, { "epoch": 16.19732441471572, "grad_norm": 2.9367263317108154, "learning_rate": 1.0617437793390934e-06, "loss": 0.4378, "num_input_tokens_seen": 24974208, "step": 24215 }, { "epoch": 16.20066889632107, "grad_norm": 3.074988842010498, "learning_rate": 1.0599462319663906e-06, "loss": 0.4202, "num_input_tokens_seen": 24978592, "step": 24220 }, { "epoch": 16.204013377926422, "grad_norm": 2.5165653228759766, "learning_rate": 1.0581500270943074e-06, "loss": 0.3162, "num_input_tokens_seen": 24984064, "step": 24225 }, { "epoch": 16.207357859531772, "grad_norm": 3.1425111293792725, "learning_rate": 1.0563551653348675e-06, "loss": 0.3792, "num_input_tokens_seen": 24989568, "step": 24230 }, { "epoch": 16.210702341137125, "grad_norm": 2.606689453125, "learning_rate": 1.0545616472996357e-06, "loss": 0.4022, "num_input_tokens_seen": 24994784, "step": 24235 }, { "epoch": 16.214046822742475, "grad_norm": 2.350782871246338, "learning_rate": 1.0527694735997228e-06, "loss": 0.382, "num_input_tokens_seen": 25001280, "step": 24240 }, { "epoch": 16.217391304347824, "grad_norm": 3.0670573711395264, "learning_rate": 1.0509786448457753e-06, "loss": 0.3962, "num_input_tokens_seen": 25006656, "step": 24245 }, { "epoch": 16.220735785953178, "grad_norm": 3.518531084060669, "learning_rate": 1.0491891616479872e-06, "loss": 0.3223, "num_input_tokens_seen": 25011648, "step": 24250 }, { "epoch": 16.224080267558527, "grad_norm": 1.9204667806625366, "learning_rate": 1.0474010246160916e-06, "loss": 0.3311, "num_input_tokens_seen": 25016032, "step": 24255 }, { "epoch": 16.22742474916388, "grad_norm": 3.2940921783447266, "learning_rate": 1.0456142343593645e-06, "loss": 0.2942, "num_input_tokens_seen": 25020864, "step": 24260 }, { "epoch": 16.23076923076923, "grad_norm": 1.9738794565200806, "learning_rate": 1.0438287914866185e-06, "loss": 0.4138, "num_input_tokens_seen": 25026336, "step": 24265 }, { "epoch": 16.234113712374583, "grad_norm": 2.8219900131225586, "learning_rate": 1.0420446966062137e-06, "loss": 0.4931, "num_input_tokens_seen": 25031296, "step": 24270 }, { "epoch": 16.237458193979933, "grad_norm": 2.3649051189422607, "learning_rate": 1.040261950326043e-06, "loss": 0.3885, "num_input_tokens_seen": 25037152, "step": 24275 }, { "epoch": 16.240802675585286, "grad_norm": 2.6575851440429688, "learning_rate": 1.0384805532535502e-06, "loss": 0.3151, "num_input_tokens_seen": 25042016, "step": 24280 }, { "epoch": 16.244147157190636, "grad_norm": 4.370341777801514, "learning_rate": 1.0367005059957097e-06, "loss": 0.3722, "num_input_tokens_seen": 25046880, "step": 24285 }, { "epoch": 16.247491638795985, "grad_norm": 2.634918451309204, "learning_rate": 1.0349218091590396e-06, "loss": 0.2738, "num_input_tokens_seen": 25051936, "step": 24290 }, { "epoch": 16.25083612040134, "grad_norm": 2.239478826522827, "learning_rate": 1.0331444633495997e-06, "loss": 0.3945, "num_input_tokens_seen": 25056096, "step": 24295 }, { "epoch": 16.254180602006688, "grad_norm": 2.9482791423797607, "learning_rate": 1.0313684691729868e-06, "loss": 0.4105, "num_input_tokens_seen": 25061088, "step": 24300 }, { "epoch": 16.25752508361204, "grad_norm": 2.292365550994873, "learning_rate": 1.0295938272343397e-06, "loss": 0.3826, "num_input_tokens_seen": 25066048, "step": 24305 }, { "epoch": 16.26086956521739, "grad_norm": 3.240323305130005, "learning_rate": 1.0278205381383327e-06, "loss": 0.4042, "num_input_tokens_seen": 25070688, "step": 24310 }, { "epoch": 16.264214046822744, "grad_norm": 2.299826145172119, "learning_rate": 1.026048602489183e-06, "loss": 0.3974, "num_input_tokens_seen": 25077152, "step": 24315 }, { "epoch": 16.267558528428093, "grad_norm": 2.1502976417541504, "learning_rate": 1.0242780208906422e-06, "loss": 0.3159, "num_input_tokens_seen": 25082432, "step": 24320 }, { "epoch": 16.270903010033443, "grad_norm": 3.135802745819092, "learning_rate": 1.0225087939460077e-06, "loss": 0.3635, "num_input_tokens_seen": 25087168, "step": 24325 }, { "epoch": 16.274247491638796, "grad_norm": 3.050062417984009, "learning_rate": 1.0207409222581073e-06, "loss": 0.379, "num_input_tokens_seen": 25092896, "step": 24330 }, { "epoch": 16.277591973244146, "grad_norm": 2.4590439796447754, "learning_rate": 1.0189744064293123e-06, "loss": 0.3739, "num_input_tokens_seen": 25098912, "step": 24335 }, { "epoch": 16.2809364548495, "grad_norm": 2.98655366897583, "learning_rate": 1.0172092470615297e-06, "loss": 0.343, "num_input_tokens_seen": 25104192, "step": 24340 }, { "epoch": 16.28428093645485, "grad_norm": 2.3827359676361084, "learning_rate": 1.0154454447562051e-06, "loss": 0.3226, "num_input_tokens_seen": 25108864, "step": 24345 }, { "epoch": 16.287625418060202, "grad_norm": 4.0246100425720215, "learning_rate": 1.0136830001143233e-06, "loss": 0.3872, "num_input_tokens_seen": 25113440, "step": 24350 }, { "epoch": 16.29096989966555, "grad_norm": 2.2640767097473145, "learning_rate": 1.0119219137364017e-06, "loss": 0.3752, "num_input_tokens_seen": 25119040, "step": 24355 }, { "epoch": 16.294314381270905, "grad_norm": 4.039259433746338, "learning_rate": 1.0101621862225013e-06, "loss": 0.3668, "num_input_tokens_seen": 25124096, "step": 24360 }, { "epoch": 16.297658862876254, "grad_norm": 2.7153849601745605, "learning_rate": 1.0084038181722122e-06, "loss": 0.3524, "num_input_tokens_seen": 25130400, "step": 24365 }, { "epoch": 16.301003344481604, "grad_norm": 3.334522008895874, "learning_rate": 1.0066468101846716e-06, "loss": 0.3525, "num_input_tokens_seen": 25135168, "step": 24370 }, { "epoch": 16.304347826086957, "grad_norm": 2.505263566970825, "learning_rate": 1.0048911628585433e-06, "loss": 0.3571, "num_input_tokens_seen": 25140672, "step": 24375 }, { "epoch": 16.307692307692307, "grad_norm": 2.6098344326019287, "learning_rate": 1.0031368767920329e-06, "loss": 0.3482, "num_input_tokens_seen": 25146368, "step": 24380 }, { "epoch": 16.31103678929766, "grad_norm": 3.338235855102539, "learning_rate": 1.0013839525828811e-06, "loss": 0.3681, "num_input_tokens_seen": 25151232, "step": 24385 }, { "epoch": 16.31438127090301, "grad_norm": 3.214531660079956, "learning_rate": 9.99632390828365e-07, "loss": 0.3414, "num_input_tokens_seen": 25156000, "step": 24390 }, { "epoch": 16.317725752508363, "grad_norm": 2.8107378482818604, "learning_rate": 9.97882192125298e-07, "loss": 0.3801, "num_input_tokens_seen": 25160480, "step": 24395 }, { "epoch": 16.321070234113712, "grad_norm": 3.5227808952331543, "learning_rate": 9.96133357070025e-07, "loss": 0.3576, "num_input_tokens_seen": 25164640, "step": 24400 }, { "epoch": 16.324414715719065, "grad_norm": 2.270547389984131, "learning_rate": 9.943858862584304e-07, "loss": 0.337, "num_input_tokens_seen": 25169664, "step": 24405 }, { "epoch": 16.327759197324415, "grad_norm": 3.739663600921631, "learning_rate": 9.926397802859333e-07, "loss": 0.4593, "num_input_tokens_seen": 25175168, "step": 24410 }, { "epoch": 16.331103678929765, "grad_norm": 2.7104947566986084, "learning_rate": 9.908950397474882e-07, "loss": 0.3775, "num_input_tokens_seen": 25180160, "step": 24415 }, { "epoch": 16.334448160535118, "grad_norm": 2.8708181381225586, "learning_rate": 9.89151665237581e-07, "loss": 0.3882, "num_input_tokens_seen": 25185120, "step": 24420 }, { "epoch": 16.337792642140467, "grad_norm": 3.550506591796875, "learning_rate": 9.874096573502346e-07, "loss": 0.3625, "num_input_tokens_seen": 25189536, "step": 24425 }, { "epoch": 16.34113712374582, "grad_norm": 3.1024820804595947, "learning_rate": 9.856690166790072e-07, "loss": 0.3604, "num_input_tokens_seen": 25194016, "step": 24430 }, { "epoch": 16.34448160535117, "grad_norm": 2.74210524559021, "learning_rate": 9.839297438169893e-07, "loss": 0.3433, "num_input_tokens_seen": 25199072, "step": 24435 }, { "epoch": 16.347826086956523, "grad_norm": 2.0597941875457764, "learning_rate": 9.82191839356808e-07, "loss": 0.3519, "num_input_tokens_seen": 25204832, "step": 24440 }, { "epoch": 16.351170568561873, "grad_norm": 2.5643556118011475, "learning_rate": 9.804553038906184e-07, "loss": 0.4028, "num_input_tokens_seen": 25210624, "step": 24445 }, { "epoch": 16.354515050167223, "grad_norm": 3.509040355682373, "learning_rate": 9.787201380101157e-07, "loss": 0.3354, "num_input_tokens_seen": 25214912, "step": 24450 }, { "epoch": 16.357859531772576, "grad_norm": 2.818128824234009, "learning_rate": 9.769863423065246e-07, "loss": 0.357, "num_input_tokens_seen": 25220352, "step": 24455 }, { "epoch": 16.361204013377925, "grad_norm": 2.992680549621582, "learning_rate": 9.752539173706055e-07, "loss": 0.3773, "num_input_tokens_seen": 25225184, "step": 24460 }, { "epoch": 16.36454849498328, "grad_norm": 3.287404775619507, "learning_rate": 9.735228637926474e-07, "loss": 0.3558, "num_input_tokens_seen": 25230304, "step": 24465 }, { "epoch": 16.367892976588628, "grad_norm": 3.2532646656036377, "learning_rate": 9.717931821624787e-07, "loss": 0.4183, "num_input_tokens_seen": 25235296, "step": 24470 }, { "epoch": 16.37123745819398, "grad_norm": 3.1091253757476807, "learning_rate": 9.700648730694512e-07, "loss": 0.514, "num_input_tokens_seen": 25240128, "step": 24475 }, { "epoch": 16.37458193979933, "grad_norm": 3.00070858001709, "learning_rate": 9.683379371024598e-07, "loss": 0.328, "num_input_tokens_seen": 25245568, "step": 24480 }, { "epoch": 16.377926421404684, "grad_norm": 3.700117588043213, "learning_rate": 9.666123748499228e-07, "loss": 0.3512, "num_input_tokens_seen": 25249728, "step": 24485 }, { "epoch": 16.381270903010034, "grad_norm": 3.594212532043457, "learning_rate": 9.648881868997944e-07, "loss": 0.3633, "num_input_tokens_seen": 25254624, "step": 24490 }, { "epoch": 16.384615384615383, "grad_norm": 2.5851142406463623, "learning_rate": 9.631653738395602e-07, "loss": 0.4182, "num_input_tokens_seen": 25259872, "step": 24495 }, { "epoch": 16.387959866220736, "grad_norm": 1.8761624097824097, "learning_rate": 9.614439362562367e-07, "loss": 0.3905, "num_input_tokens_seen": 25265344, "step": 24500 }, { "epoch": 16.391304347826086, "grad_norm": 2.011486768722534, "learning_rate": 9.59723874736373e-07, "loss": 0.3768, "num_input_tokens_seen": 25270464, "step": 24505 }, { "epoch": 16.39464882943144, "grad_norm": 3.414583206176758, "learning_rate": 9.580051898660464e-07, "loss": 0.3978, "num_input_tokens_seen": 25275744, "step": 24510 }, { "epoch": 16.39799331103679, "grad_norm": 3.1134698390960693, "learning_rate": 9.562878822308696e-07, "loss": 0.3206, "num_input_tokens_seen": 25280832, "step": 24515 }, { "epoch": 16.401337792642142, "grad_norm": 2.8406975269317627, "learning_rate": 9.545719524159792e-07, "loss": 0.3799, "num_input_tokens_seen": 25286144, "step": 24520 }, { "epoch": 16.40468227424749, "grad_norm": 3.1123952865600586, "learning_rate": 9.528574010060515e-07, "loss": 0.3251, "num_input_tokens_seen": 25291072, "step": 24525 }, { "epoch": 16.40802675585284, "grad_norm": 3.8732986450195312, "learning_rate": 9.511442285852857e-07, "loss": 0.394, "num_input_tokens_seen": 25296640, "step": 24530 }, { "epoch": 16.411371237458194, "grad_norm": 2.5501930713653564, "learning_rate": 9.494324357374135e-07, "loss": 0.3484, "num_input_tokens_seen": 25301856, "step": 24535 }, { "epoch": 16.414715719063544, "grad_norm": 2.5640859603881836, "learning_rate": 9.477220230456974e-07, "loss": 0.3872, "num_input_tokens_seen": 25307168, "step": 24540 }, { "epoch": 16.418060200668897, "grad_norm": 3.9921367168426514, "learning_rate": 9.460129910929289e-07, "loss": 0.3685, "num_input_tokens_seen": 25312096, "step": 24545 }, { "epoch": 16.421404682274247, "grad_norm": 3.148799419403076, "learning_rate": 9.443053404614305e-07, "loss": 0.4452, "num_input_tokens_seen": 25317568, "step": 24550 }, { "epoch": 16.4247491638796, "grad_norm": 4.991943359375, "learning_rate": 9.425990717330496e-07, "loss": 0.3304, "num_input_tokens_seen": 25321696, "step": 24555 }, { "epoch": 16.42809364548495, "grad_norm": 2.0066492557525635, "learning_rate": 9.408941854891695e-07, "loss": 0.3182, "num_input_tokens_seen": 25326848, "step": 24560 }, { "epoch": 16.431438127090303, "grad_norm": 2.371852397918701, "learning_rate": 9.391906823106933e-07, "loss": 0.386, "num_input_tokens_seen": 25332160, "step": 24565 }, { "epoch": 16.434782608695652, "grad_norm": 2.275447130203247, "learning_rate": 9.37488562778065e-07, "loss": 0.3593, "num_input_tokens_seen": 25336992, "step": 24570 }, { "epoch": 16.438127090301002, "grad_norm": 3.9943039417266846, "learning_rate": 9.357878274712451e-07, "loss": 0.4689, "num_input_tokens_seen": 25341760, "step": 24575 }, { "epoch": 16.441471571906355, "grad_norm": 1.865609884262085, "learning_rate": 9.3408847696973e-07, "loss": 0.3936, "num_input_tokens_seen": 25347552, "step": 24580 }, { "epoch": 16.444816053511705, "grad_norm": 3.5618104934692383, "learning_rate": 9.323905118525417e-07, "loss": 0.3543, "num_input_tokens_seen": 25352160, "step": 24585 }, { "epoch": 16.448160535117058, "grad_norm": 2.7180020809173584, "learning_rate": 9.306939326982306e-07, "loss": 0.2976, "num_input_tokens_seen": 25356576, "step": 24590 }, { "epoch": 16.451505016722408, "grad_norm": 3.126164197921753, "learning_rate": 9.28998740084876e-07, "loss": 0.3503, "num_input_tokens_seen": 25361792, "step": 24595 }, { "epoch": 16.45484949832776, "grad_norm": 1.972257375717163, "learning_rate": 9.273049345900808e-07, "loss": 0.2996, "num_input_tokens_seen": 25367104, "step": 24600 }, { "epoch": 16.45819397993311, "grad_norm": 3.6074671745300293, "learning_rate": 9.256125167909807e-07, "loss": 0.446, "num_input_tokens_seen": 25372768, "step": 24605 }, { "epoch": 16.46153846153846, "grad_norm": 2.65557861328125, "learning_rate": 9.239214872642321e-07, "loss": 0.307, "num_input_tokens_seen": 25377792, "step": 24610 }, { "epoch": 16.464882943143813, "grad_norm": 2.5528430938720703, "learning_rate": 9.222318465860269e-07, "loss": 0.4299, "num_input_tokens_seen": 25382464, "step": 24615 }, { "epoch": 16.468227424749163, "grad_norm": 1.9503519535064697, "learning_rate": 9.205435953320751e-07, "loss": 0.3263, "num_input_tokens_seen": 25387616, "step": 24620 }, { "epoch": 16.471571906354516, "grad_norm": 2.5758025646209717, "learning_rate": 9.188567340776189e-07, "loss": 0.344, "num_input_tokens_seen": 25394080, "step": 24625 }, { "epoch": 16.474916387959865, "grad_norm": 2.766636371612549, "learning_rate": 9.17171263397425e-07, "loss": 0.282, "num_input_tokens_seen": 25399840, "step": 24630 }, { "epoch": 16.47826086956522, "grad_norm": 2.6197423934936523, "learning_rate": 9.154871838657859e-07, "loss": 0.3522, "num_input_tokens_seen": 25405440, "step": 24635 }, { "epoch": 16.48160535117057, "grad_norm": 5.095878601074219, "learning_rate": 9.138044960565229e-07, "loss": 0.4503, "num_input_tokens_seen": 25410912, "step": 24640 }, { "epoch": 16.48494983277592, "grad_norm": 2.9524896144866943, "learning_rate": 9.121232005429776e-07, "loss": 0.323, "num_input_tokens_seen": 25415360, "step": 24645 }, { "epoch": 16.48829431438127, "grad_norm": 2.001269578933716, "learning_rate": 9.104432978980227e-07, "loss": 0.3578, "num_input_tokens_seen": 25420576, "step": 24650 }, { "epoch": 16.49163879598662, "grad_norm": 3.3282248973846436, "learning_rate": 9.087647886940509e-07, "loss": 0.3186, "num_input_tokens_seen": 25425120, "step": 24655 }, { "epoch": 16.494983277591974, "grad_norm": 4.1521453857421875, "learning_rate": 9.070876735029877e-07, "loss": 0.3801, "num_input_tokens_seen": 25430944, "step": 24660 }, { "epoch": 16.498327759197323, "grad_norm": 2.657423734664917, "learning_rate": 9.054119528962757e-07, "loss": 0.3007, "num_input_tokens_seen": 25435104, "step": 24665 }, { "epoch": 16.501672240802677, "grad_norm": 3.350080728530884, "learning_rate": 9.037376274448873e-07, "loss": 0.3964, "num_input_tokens_seen": 25440064, "step": 24670 }, { "epoch": 16.505016722408026, "grad_norm": 3.6141974925994873, "learning_rate": 9.020646977193176e-07, "loss": 0.3302, "num_input_tokens_seen": 25445824, "step": 24675 }, { "epoch": 16.50836120401338, "grad_norm": 2.0526044368743896, "learning_rate": 9.00393164289588e-07, "loss": 0.2865, "num_input_tokens_seen": 25450304, "step": 24680 }, { "epoch": 16.51170568561873, "grad_norm": 2.741049289703369, "learning_rate": 8.987230277252395e-07, "loss": 0.3203, "num_input_tokens_seen": 25455488, "step": 24685 }, { "epoch": 16.51505016722408, "grad_norm": 2.6515276432037354, "learning_rate": 8.970542885953426e-07, "loss": 0.3565, "num_input_tokens_seen": 25459936, "step": 24690 }, { "epoch": 16.51839464882943, "grad_norm": 3.0719995498657227, "learning_rate": 8.953869474684901e-07, "loss": 0.3993, "num_input_tokens_seen": 25465344, "step": 24695 }, { "epoch": 16.52173913043478, "grad_norm": 3.6493077278137207, "learning_rate": 8.937210049127942e-07, "loss": 0.3665, "num_input_tokens_seen": 25470752, "step": 24700 }, { "epoch": 16.525083612040135, "grad_norm": 4.473995685577393, "learning_rate": 8.920564614958982e-07, "loss": 0.3995, "num_input_tokens_seen": 25475104, "step": 24705 }, { "epoch": 16.528428093645484, "grad_norm": 2.2576444149017334, "learning_rate": 8.903933177849621e-07, "loss": 0.3309, "num_input_tokens_seen": 25480288, "step": 24710 }, { "epoch": 16.531772575250837, "grad_norm": 2.1954398155212402, "learning_rate": 8.887315743466735e-07, "loss": 0.3401, "num_input_tokens_seen": 25485184, "step": 24715 }, { "epoch": 16.535117056856187, "grad_norm": 2.7243125438690186, "learning_rate": 8.870712317472364e-07, "loss": 0.2924, "num_input_tokens_seen": 25490048, "step": 24720 }, { "epoch": 16.53846153846154, "grad_norm": 1.8261899948120117, "learning_rate": 8.854122905523876e-07, "loss": 0.3568, "num_input_tokens_seen": 25494784, "step": 24725 }, { "epoch": 16.54180602006689, "grad_norm": 3.3849997520446777, "learning_rate": 8.83754751327377e-07, "loss": 0.4239, "num_input_tokens_seen": 25500096, "step": 24730 }, { "epoch": 16.54515050167224, "grad_norm": 2.1838877201080322, "learning_rate": 8.820986146369814e-07, "loss": 0.4143, "num_input_tokens_seen": 25504960, "step": 24735 }, { "epoch": 16.548494983277592, "grad_norm": 2.737046241760254, "learning_rate": 8.804438810454997e-07, "loss": 0.4226, "num_input_tokens_seen": 25509696, "step": 24740 }, { "epoch": 16.551839464882942, "grad_norm": 2.8311526775360107, "learning_rate": 8.787905511167489e-07, "loss": 0.4027, "num_input_tokens_seen": 25516160, "step": 24745 }, { "epoch": 16.555183946488295, "grad_norm": 2.7862601280212402, "learning_rate": 8.771386254140757e-07, "loss": 0.3605, "num_input_tokens_seen": 25522784, "step": 24750 }, { "epoch": 16.558528428093645, "grad_norm": 2.3765640258789062, "learning_rate": 8.754881045003388e-07, "loss": 0.2884, "num_input_tokens_seen": 25528448, "step": 24755 }, { "epoch": 16.561872909698998, "grad_norm": 2.6729230880737305, "learning_rate": 8.738389889379256e-07, "loss": 0.3148, "num_input_tokens_seen": 25533696, "step": 24760 }, { "epoch": 16.565217391304348, "grad_norm": 2.676478147506714, "learning_rate": 8.721912792887383e-07, "loss": 0.3404, "num_input_tokens_seen": 25538976, "step": 24765 }, { "epoch": 16.568561872909697, "grad_norm": 2.8446295261383057, "learning_rate": 8.705449761142082e-07, "loss": 0.3692, "num_input_tokens_seen": 25544000, "step": 24770 }, { "epoch": 16.57190635451505, "grad_norm": 2.9085693359375, "learning_rate": 8.68900079975279e-07, "loss": 0.3406, "num_input_tokens_seen": 25549184, "step": 24775 }, { "epoch": 16.5752508361204, "grad_norm": 3.226311445236206, "learning_rate": 8.672565914324205e-07, "loss": 0.4072, "num_input_tokens_seen": 25554400, "step": 24780 }, { "epoch": 16.578595317725753, "grad_norm": 2.523179531097412, "learning_rate": 8.656145110456216e-07, "loss": 0.3752, "num_input_tokens_seen": 25559776, "step": 24785 }, { "epoch": 16.581939799331103, "grad_norm": 2.931645393371582, "learning_rate": 8.639738393743874e-07, "loss": 0.3192, "num_input_tokens_seen": 25564736, "step": 24790 }, { "epoch": 16.585284280936456, "grad_norm": 3.176537275314331, "learning_rate": 8.623345769777514e-07, "loss": 0.3419, "num_input_tokens_seen": 25570432, "step": 24795 }, { "epoch": 16.588628762541806, "grad_norm": 2.4809343814849854, "learning_rate": 8.606967244142589e-07, "loss": 0.3374, "num_input_tokens_seen": 25575488, "step": 24800 }, { "epoch": 16.59197324414716, "grad_norm": 2.9618935585021973, "learning_rate": 8.590602822419797e-07, "loss": 0.3423, "num_input_tokens_seen": 25581376, "step": 24805 }, { "epoch": 16.59531772575251, "grad_norm": 3.288827419281006, "learning_rate": 8.574252510184977e-07, "loss": 0.398, "num_input_tokens_seen": 25586816, "step": 24810 }, { "epoch": 16.598662207357858, "grad_norm": 2.676222801208496, "learning_rate": 8.557916313009251e-07, "loss": 0.3178, "num_input_tokens_seen": 25592096, "step": 24815 }, { "epoch": 16.60200668896321, "grad_norm": 3.3135664463043213, "learning_rate": 8.541594236458834e-07, "loss": 0.3646, "num_input_tokens_seen": 25596992, "step": 24820 }, { "epoch": 16.60535117056856, "grad_norm": 2.561161518096924, "learning_rate": 8.52528628609518e-07, "loss": 0.435, "num_input_tokens_seen": 25602176, "step": 24825 }, { "epoch": 16.608695652173914, "grad_norm": 2.851534366607666, "learning_rate": 8.508992467474947e-07, "loss": 0.3134, "num_input_tokens_seen": 25606624, "step": 24830 }, { "epoch": 16.612040133779264, "grad_norm": 3.437619686126709, "learning_rate": 8.492712786149909e-07, "loss": 0.3282, "num_input_tokens_seen": 25611360, "step": 24835 }, { "epoch": 16.615384615384617, "grad_norm": 2.774716377258301, "learning_rate": 8.476447247667119e-07, "loss": 0.329, "num_input_tokens_seen": 25617120, "step": 24840 }, { "epoch": 16.618729096989966, "grad_norm": 3.2342565059661865, "learning_rate": 8.460195857568715e-07, "loss": 0.3393, "num_input_tokens_seen": 25622592, "step": 24845 }, { "epoch": 16.62207357859532, "grad_norm": 4.696656227111816, "learning_rate": 8.443958621392096e-07, "loss": 0.4155, "num_input_tokens_seen": 25627872, "step": 24850 }, { "epoch": 16.62541806020067, "grad_norm": 2.727928400039673, "learning_rate": 8.427735544669747e-07, "loss": 0.3927, "num_input_tokens_seen": 25632928, "step": 24855 }, { "epoch": 16.62876254180602, "grad_norm": 3.749755620956421, "learning_rate": 8.411526632929446e-07, "loss": 0.457, "num_input_tokens_seen": 25638624, "step": 24860 }, { "epoch": 16.632107023411372, "grad_norm": 2.0583698749542236, "learning_rate": 8.395331891694036e-07, "loss": 0.3832, "num_input_tokens_seen": 25644576, "step": 24865 }, { "epoch": 16.63545150501672, "grad_norm": 2.2108466625213623, "learning_rate": 8.379151326481588e-07, "loss": 0.4339, "num_input_tokens_seen": 25649760, "step": 24870 }, { "epoch": 16.638795986622075, "grad_norm": 3.39298677444458, "learning_rate": 8.362984942805336e-07, "loss": 0.2944, "num_input_tokens_seen": 25654816, "step": 24875 }, { "epoch": 16.642140468227424, "grad_norm": 2.5769264698028564, "learning_rate": 8.346832746173677e-07, "loss": 0.2818, "num_input_tokens_seen": 25660064, "step": 24880 }, { "epoch": 16.645484949832777, "grad_norm": 2.2627744674682617, "learning_rate": 8.330694742090178e-07, "loss": 0.3278, "num_input_tokens_seen": 25665216, "step": 24885 }, { "epoch": 16.648829431438127, "grad_norm": 2.762423515319824, "learning_rate": 8.314570936053551e-07, "loss": 0.3853, "num_input_tokens_seen": 25670368, "step": 24890 }, { "epoch": 16.652173913043477, "grad_norm": 2.441540002822876, "learning_rate": 8.298461333557706e-07, "loss": 0.3294, "num_input_tokens_seen": 25675328, "step": 24895 }, { "epoch": 16.65551839464883, "grad_norm": 2.967411994934082, "learning_rate": 8.282365940091653e-07, "loss": 0.3926, "num_input_tokens_seen": 25679488, "step": 24900 }, { "epoch": 16.65886287625418, "grad_norm": 2.3838207721710205, "learning_rate": 8.266284761139653e-07, "loss": 0.3482, "num_input_tokens_seen": 25684448, "step": 24905 }, { "epoch": 16.662207357859533, "grad_norm": 2.8241047859191895, "learning_rate": 8.250217802181037e-07, "loss": 0.3459, "num_input_tokens_seen": 25689760, "step": 24910 }, { "epoch": 16.665551839464882, "grad_norm": 2.7135584354400635, "learning_rate": 8.234165068690347e-07, "loss": 0.3629, "num_input_tokens_seen": 25693952, "step": 24915 }, { "epoch": 16.668896321070235, "grad_norm": 2.674812078475952, "learning_rate": 8.218126566137213e-07, "loss": 0.3107, "num_input_tokens_seen": 25698304, "step": 24920 }, { "epoch": 16.672240802675585, "grad_norm": 2.6515989303588867, "learning_rate": 8.202102299986514e-07, "loss": 0.3115, "num_input_tokens_seen": 25704064, "step": 24925 }, { "epoch": 16.675585284280935, "grad_norm": 2.339395523071289, "learning_rate": 8.186092275698188e-07, "loss": 0.3812, "num_input_tokens_seen": 25709568, "step": 24930 }, { "epoch": 16.678929765886288, "grad_norm": 2.926694869995117, "learning_rate": 8.170096498727359e-07, "loss": 0.34, "num_input_tokens_seen": 25715264, "step": 24935 }, { "epoch": 16.682274247491637, "grad_norm": 2.8131649494171143, "learning_rate": 8.154114974524313e-07, "loss": 0.2761, "num_input_tokens_seen": 25720192, "step": 24940 }, { "epoch": 16.68561872909699, "grad_norm": 2.418215274810791, "learning_rate": 8.138147708534422e-07, "loss": 0.3552, "num_input_tokens_seen": 25726304, "step": 24945 }, { "epoch": 16.68896321070234, "grad_norm": 2.621734619140625, "learning_rate": 8.122194706198289e-07, "loss": 0.3218, "num_input_tokens_seen": 25731264, "step": 24950 }, { "epoch": 16.692307692307693, "grad_norm": 4.211713790893555, "learning_rate": 8.106255972951554e-07, "loss": 0.3669, "num_input_tokens_seen": 25735840, "step": 24955 }, { "epoch": 16.695652173913043, "grad_norm": 3.3362741470336914, "learning_rate": 8.090331514225091e-07, "loss": 0.4739, "num_input_tokens_seen": 25741472, "step": 24960 }, { "epoch": 16.698996655518396, "grad_norm": 2.6854324340820312, "learning_rate": 8.07442133544481e-07, "loss": 0.2872, "num_input_tokens_seen": 25745600, "step": 24965 }, { "epoch": 16.702341137123746, "grad_norm": 2.273538589477539, "learning_rate": 8.058525442031872e-07, "loss": 0.4268, "num_input_tokens_seen": 25750688, "step": 24970 }, { "epoch": 16.705685618729095, "grad_norm": 2.577995538711548, "learning_rate": 8.04264383940247e-07, "loss": 0.3975, "num_input_tokens_seen": 25755840, "step": 24975 }, { "epoch": 16.70903010033445, "grad_norm": 3.3822474479675293, "learning_rate": 8.026776532967978e-07, "loss": 0.3759, "num_input_tokens_seen": 25761216, "step": 24980 }, { "epoch": 16.712374581939798, "grad_norm": 3.6907715797424316, "learning_rate": 8.010923528134901e-07, "loss": 0.3226, "num_input_tokens_seen": 25766528, "step": 24985 }, { "epoch": 16.71571906354515, "grad_norm": 2.7533340454101562, "learning_rate": 7.995084830304823e-07, "loss": 0.2975, "num_input_tokens_seen": 25771872, "step": 24990 }, { "epoch": 16.7190635451505, "grad_norm": 2.361006736755371, "learning_rate": 7.979260444874526e-07, "loss": 0.3545, "num_input_tokens_seen": 25777472, "step": 24995 }, { "epoch": 16.722408026755854, "grad_norm": 2.9958064556121826, "learning_rate": 7.963450377235859e-07, "loss": 0.3422, "num_input_tokens_seen": 25781600, "step": 25000 }, { "epoch": 16.725752508361204, "grad_norm": 3.020777702331543, "learning_rate": 7.94765463277582e-07, "loss": 0.3317, "num_input_tokens_seen": 25785984, "step": 25005 }, { "epoch": 16.729096989966557, "grad_norm": 3.0047874450683594, "learning_rate": 7.931873216876485e-07, "loss": 0.3315, "num_input_tokens_seen": 25791008, "step": 25010 }, { "epoch": 16.732441471571907, "grad_norm": 2.5165646076202393, "learning_rate": 7.916106134915136e-07, "loss": 0.4083, "num_input_tokens_seen": 25796416, "step": 25015 }, { "epoch": 16.735785953177256, "grad_norm": 2.4270365238189697, "learning_rate": 7.900353392264071e-07, "loss": 0.3698, "num_input_tokens_seen": 25801792, "step": 25020 }, { "epoch": 16.73913043478261, "grad_norm": 3.4901366233825684, "learning_rate": 7.884614994290757e-07, "loss": 0.4375, "num_input_tokens_seen": 25806816, "step": 25025 }, { "epoch": 16.74247491638796, "grad_norm": 2.316911458969116, "learning_rate": 7.868890946357782e-07, "loss": 0.3822, "num_input_tokens_seen": 25812096, "step": 25030 }, { "epoch": 16.745819397993312, "grad_norm": 2.822195053100586, "learning_rate": 7.853181253822784e-07, "loss": 0.3848, "num_input_tokens_seen": 25816640, "step": 25035 }, { "epoch": 16.74916387959866, "grad_norm": 3.4164514541625977, "learning_rate": 7.837485922038602e-07, "loss": 0.3219, "num_input_tokens_seen": 25821248, "step": 25040 }, { "epoch": 16.752508361204015, "grad_norm": 3.1147053241729736, "learning_rate": 7.821804956353091e-07, "loss": 0.3956, "num_input_tokens_seen": 25826208, "step": 25045 }, { "epoch": 16.755852842809364, "grad_norm": 3.2668228149414062, "learning_rate": 7.806138362109278e-07, "loss": 0.3265, "num_input_tokens_seen": 25830752, "step": 25050 }, { "epoch": 16.759197324414714, "grad_norm": 2.80185866355896, "learning_rate": 7.790486144645226e-07, "loss": 0.3768, "num_input_tokens_seen": 25836768, "step": 25055 }, { "epoch": 16.762541806020067, "grad_norm": 3.830017566680908, "learning_rate": 7.774848309294197e-07, "loss": 0.3899, "num_input_tokens_seen": 25841888, "step": 25060 }, { "epoch": 16.765886287625417, "grad_norm": 3.3142504692077637, "learning_rate": 7.759224861384446e-07, "loss": 0.4657, "num_input_tokens_seen": 25847968, "step": 25065 }, { "epoch": 16.76923076923077, "grad_norm": 2.6612207889556885, "learning_rate": 7.743615806239396e-07, "loss": 0.4054, "num_input_tokens_seen": 25853184, "step": 25070 }, { "epoch": 16.77257525083612, "grad_norm": 2.563140630722046, "learning_rate": 7.72802114917755e-07, "loss": 0.2686, "num_input_tokens_seen": 25858240, "step": 25075 }, { "epoch": 16.775919732441473, "grad_norm": 2.801412582397461, "learning_rate": 7.712440895512469e-07, "loss": 0.4326, "num_input_tokens_seen": 25863744, "step": 25080 }, { "epoch": 16.779264214046822, "grad_norm": 2.2692341804504395, "learning_rate": 7.696875050552876e-07, "loss": 0.3976, "num_input_tokens_seen": 25868992, "step": 25085 }, { "epoch": 16.782608695652176, "grad_norm": 2.2018826007843018, "learning_rate": 7.681323619602526e-07, "loss": 0.3127, "num_input_tokens_seen": 25874624, "step": 25090 }, { "epoch": 16.785953177257525, "grad_norm": 2.270524501800537, "learning_rate": 7.66578660796029e-07, "loss": 0.3898, "num_input_tokens_seen": 25880256, "step": 25095 }, { "epoch": 16.789297658862875, "grad_norm": 3.18825364112854, "learning_rate": 7.65026402092009e-07, "loss": 0.2981, "num_input_tokens_seen": 25885024, "step": 25100 }, { "epoch": 16.792642140468228, "grad_norm": 3.655409812927246, "learning_rate": 7.634755863771004e-07, "loss": 0.3878, "num_input_tokens_seen": 25889984, "step": 25105 }, { "epoch": 16.795986622073578, "grad_norm": 2.4890918731689453, "learning_rate": 7.619262141797123e-07, "loss": 0.4011, "num_input_tokens_seen": 25895616, "step": 25110 }, { "epoch": 16.79933110367893, "grad_norm": 3.5871522426605225, "learning_rate": 7.603782860277659e-07, "loss": 0.3786, "num_input_tokens_seen": 25900672, "step": 25115 }, { "epoch": 16.80267558528428, "grad_norm": 3.7081072330474854, "learning_rate": 7.58831802448688e-07, "loss": 0.4812, "num_input_tokens_seen": 25905344, "step": 25120 }, { "epoch": 16.806020066889634, "grad_norm": 1.9663074016571045, "learning_rate": 7.572867639694148e-07, "loss": 0.3396, "num_input_tokens_seen": 25910912, "step": 25125 }, { "epoch": 16.809364548494983, "grad_norm": 2.827430486679077, "learning_rate": 7.557431711163899e-07, "loss": 0.3282, "num_input_tokens_seen": 25915840, "step": 25130 }, { "epoch": 16.812709030100333, "grad_norm": 2.9255499839782715, "learning_rate": 7.542010244155634e-07, "loss": 0.431, "num_input_tokens_seen": 25920896, "step": 25135 }, { "epoch": 16.816053511705686, "grad_norm": 2.350419044494629, "learning_rate": 7.526603243923958e-07, "loss": 0.3025, "num_input_tokens_seen": 25926848, "step": 25140 }, { "epoch": 16.819397993311036, "grad_norm": 2.448719024658203, "learning_rate": 7.511210715718476e-07, "loss": 0.3894, "num_input_tokens_seen": 25932160, "step": 25145 }, { "epoch": 16.82274247491639, "grad_norm": 2.6702075004577637, "learning_rate": 7.495832664783959e-07, "loss": 0.3528, "num_input_tokens_seen": 25937248, "step": 25150 }, { "epoch": 16.82608695652174, "grad_norm": 2.8322598934173584, "learning_rate": 7.48046909636016e-07, "loss": 0.3478, "num_input_tokens_seen": 25941536, "step": 25155 }, { "epoch": 16.82943143812709, "grad_norm": 2.072099208831787, "learning_rate": 7.465120015681954e-07, "loss": 0.3842, "num_input_tokens_seen": 25947200, "step": 25160 }, { "epoch": 16.83277591973244, "grad_norm": 3.7177863121032715, "learning_rate": 7.449785427979234e-07, "loss": 0.3946, "num_input_tokens_seen": 25954080, "step": 25165 }, { "epoch": 16.836120401337794, "grad_norm": 2.0873184204101562, "learning_rate": 7.434465338476987e-07, "loss": 0.3597, "num_input_tokens_seen": 25958912, "step": 25170 }, { "epoch": 16.839464882943144, "grad_norm": 1.994895577430725, "learning_rate": 7.41915975239525e-07, "loss": 0.3147, "num_input_tokens_seen": 25964224, "step": 25175 }, { "epoch": 16.842809364548494, "grad_norm": 2.6045548915863037, "learning_rate": 7.403868674949122e-07, "loss": 0.2431, "num_input_tokens_seen": 25968928, "step": 25180 }, { "epoch": 16.846153846153847, "grad_norm": 2.74088716506958, "learning_rate": 7.38859211134877e-07, "loss": 0.4076, "num_input_tokens_seen": 25973888, "step": 25185 }, { "epoch": 16.849498327759196, "grad_norm": 3.1566436290740967, "learning_rate": 7.373330066799361e-07, "loss": 0.3431, "num_input_tokens_seen": 25978368, "step": 25190 }, { "epoch": 16.85284280936455, "grad_norm": 3.214036703109741, "learning_rate": 7.3580825465012e-07, "loss": 0.4814, "num_input_tokens_seen": 25983904, "step": 25195 }, { "epoch": 16.8561872909699, "grad_norm": 4.108069896697998, "learning_rate": 7.342849555649573e-07, "loss": 0.2953, "num_input_tokens_seen": 25988032, "step": 25200 }, { "epoch": 16.859531772575252, "grad_norm": 2.6768362522125244, "learning_rate": 7.327631099434851e-07, "loss": 0.3685, "num_input_tokens_seen": 25993024, "step": 25205 }, { "epoch": 16.862876254180602, "grad_norm": 2.9384102821350098, "learning_rate": 7.31242718304243e-07, "loss": 0.3877, "num_input_tokens_seen": 25997824, "step": 25210 }, { "epoch": 16.86622073578595, "grad_norm": 2.118694305419922, "learning_rate": 7.297237811652768e-07, "loss": 0.4155, "num_input_tokens_seen": 26002944, "step": 25215 }, { "epoch": 16.869565217391305, "grad_norm": 3.6297404766082764, "learning_rate": 7.282062990441368e-07, "loss": 0.3532, "num_input_tokens_seen": 26007360, "step": 25220 }, { "epoch": 16.872909698996654, "grad_norm": 2.6186087131500244, "learning_rate": 7.266902724578767e-07, "loss": 0.3797, "num_input_tokens_seen": 26012576, "step": 25225 }, { "epoch": 16.876254180602007, "grad_norm": 3.2625815868377686, "learning_rate": 7.25175701923056e-07, "loss": 0.3897, "num_input_tokens_seen": 26018208, "step": 25230 }, { "epoch": 16.879598662207357, "grad_norm": 2.607262372970581, "learning_rate": 7.23662587955733e-07, "loss": 0.3893, "num_input_tokens_seen": 26024128, "step": 25235 }, { "epoch": 16.88294314381271, "grad_norm": 3.9715428352355957, "learning_rate": 7.221509310714781e-07, "loss": 0.393, "num_input_tokens_seen": 26028512, "step": 25240 }, { "epoch": 16.88628762541806, "grad_norm": 3.7609024047851562, "learning_rate": 7.20640731785357e-07, "loss": 0.3324, "num_input_tokens_seen": 26032768, "step": 25245 }, { "epoch": 16.889632107023413, "grad_norm": 3.4340977668762207, "learning_rate": 7.191319906119443e-07, "loss": 0.3543, "num_input_tokens_seen": 26037568, "step": 25250 }, { "epoch": 16.892976588628763, "grad_norm": 2.5595579147338867, "learning_rate": 7.176247080653143e-07, "loss": 0.3295, "num_input_tokens_seen": 26043296, "step": 25255 }, { "epoch": 16.896321070234112, "grad_norm": 3.1792330741882324, "learning_rate": 7.161188846590455e-07, "loss": 0.4008, "num_input_tokens_seen": 26049504, "step": 25260 }, { "epoch": 16.899665551839465, "grad_norm": 3.400355100631714, "learning_rate": 7.146145209062205e-07, "loss": 0.3589, "num_input_tokens_seen": 26053632, "step": 25265 }, { "epoch": 16.903010033444815, "grad_norm": 2.364424467086792, "learning_rate": 7.13111617319423e-07, "loss": 0.2936, "num_input_tokens_seen": 26058304, "step": 25270 }, { "epoch": 16.906354515050168, "grad_norm": 3.2909069061279297, "learning_rate": 7.116101744107406e-07, "loss": 0.3267, "num_input_tokens_seen": 26064032, "step": 25275 }, { "epoch": 16.909698996655518, "grad_norm": 2.5394091606140137, "learning_rate": 7.101101926917597e-07, "loss": 0.2709, "num_input_tokens_seen": 26069632, "step": 25280 }, { "epoch": 16.91304347826087, "grad_norm": 2.517638683319092, "learning_rate": 7.086116726735753e-07, "loss": 0.3617, "num_input_tokens_seen": 26075072, "step": 25285 }, { "epoch": 16.91638795986622, "grad_norm": 2.338113307952881, "learning_rate": 7.071146148667773e-07, "loss": 0.3432, "num_input_tokens_seen": 26079648, "step": 25290 }, { "epoch": 16.919732441471574, "grad_norm": 3.149212598800659, "learning_rate": 7.056190197814628e-07, "loss": 0.4534, "num_input_tokens_seen": 26085088, "step": 25295 }, { "epoch": 16.923076923076923, "grad_norm": 2.0086846351623535, "learning_rate": 7.04124887927225e-07, "loss": 0.3467, "num_input_tokens_seen": 26090432, "step": 25300 }, { "epoch": 16.926421404682273, "grad_norm": 2.457916736602783, "learning_rate": 7.026322198131647e-07, "loss": 0.3583, "num_input_tokens_seen": 26095104, "step": 25305 }, { "epoch": 16.929765886287626, "grad_norm": 2.6387665271759033, "learning_rate": 7.011410159478793e-07, "loss": 0.4035, "num_input_tokens_seen": 26100192, "step": 25310 }, { "epoch": 16.933110367892976, "grad_norm": 4.700411319732666, "learning_rate": 6.996512768394703e-07, "loss": 0.3508, "num_input_tokens_seen": 26105344, "step": 25315 }, { "epoch": 16.93645484949833, "grad_norm": 2.5867772102355957, "learning_rate": 6.981630029955389e-07, "loss": 0.4087, "num_input_tokens_seen": 26110784, "step": 25320 }, { "epoch": 16.93979933110368, "grad_norm": 2.870310068130493, "learning_rate": 6.966761949231854e-07, "loss": 0.3116, "num_input_tokens_seen": 26115936, "step": 25325 }, { "epoch": 16.94314381270903, "grad_norm": 2.7303102016448975, "learning_rate": 6.951908531290131e-07, "loss": 0.3123, "num_input_tokens_seen": 26121248, "step": 25330 }, { "epoch": 16.94648829431438, "grad_norm": 2.599771022796631, "learning_rate": 6.937069781191247e-07, "loss": 0.353, "num_input_tokens_seen": 26126560, "step": 25335 }, { "epoch": 16.94983277591973, "grad_norm": 2.7404086589813232, "learning_rate": 6.922245703991248e-07, "loss": 0.4915, "num_input_tokens_seen": 26131904, "step": 25340 }, { "epoch": 16.953177257525084, "grad_norm": 1.996504306793213, "learning_rate": 6.907436304741139e-07, "loss": 0.3318, "num_input_tokens_seen": 26136672, "step": 25345 }, { "epoch": 16.956521739130434, "grad_norm": 2.5525808334350586, "learning_rate": 6.892641588486959e-07, "loss": 0.392, "num_input_tokens_seen": 26141824, "step": 25350 }, { "epoch": 16.959866220735787, "grad_norm": 2.859408140182495, "learning_rate": 6.877861560269733e-07, "loss": 0.3784, "num_input_tokens_seen": 26147392, "step": 25355 }, { "epoch": 16.963210702341136, "grad_norm": 2.553497552871704, "learning_rate": 6.863096225125504e-07, "loss": 0.3475, "num_input_tokens_seen": 26152704, "step": 25360 }, { "epoch": 16.96655518394649, "grad_norm": 4.328196048736572, "learning_rate": 6.848345588085253e-07, "loss": 0.4764, "num_input_tokens_seen": 26157312, "step": 25365 }, { "epoch": 16.96989966555184, "grad_norm": 2.4099955558776855, "learning_rate": 6.833609654175005e-07, "loss": 0.318, "num_input_tokens_seen": 26162432, "step": 25370 }, { "epoch": 16.97324414715719, "grad_norm": 2.382784366607666, "learning_rate": 6.81888842841576e-07, "loss": 0.4001, "num_input_tokens_seen": 26167552, "step": 25375 }, { "epoch": 16.976588628762542, "grad_norm": 3.096759796142578, "learning_rate": 6.804181915823499e-07, "loss": 0.3605, "num_input_tokens_seen": 26173536, "step": 25380 }, { "epoch": 16.97993311036789, "grad_norm": 2.6648945808410645, "learning_rate": 6.789490121409209e-07, "loss": 0.3545, "num_input_tokens_seen": 26179360, "step": 25385 }, { "epoch": 16.983277591973245, "grad_norm": 3.2271554470062256, "learning_rate": 6.774813050178813e-07, "loss": 0.4535, "num_input_tokens_seen": 26184928, "step": 25390 }, { "epoch": 16.986622073578594, "grad_norm": 2.9809157848358154, "learning_rate": 6.760150707133295e-07, "loss": 0.3426, "num_input_tokens_seen": 26189728, "step": 25395 }, { "epoch": 16.989966555183948, "grad_norm": 3.3966925144195557, "learning_rate": 6.745503097268547e-07, "loss": 0.3629, "num_input_tokens_seen": 26194816, "step": 25400 }, { "epoch": 16.993311036789297, "grad_norm": 2.43463134765625, "learning_rate": 6.730870225575492e-07, "loss": 0.4416, "num_input_tokens_seen": 26201152, "step": 25405 }, { "epoch": 16.99665551839465, "grad_norm": 2.762629508972168, "learning_rate": 6.716252097039994e-07, "loss": 0.3559, "num_input_tokens_seen": 26206400, "step": 25410 }, { "epoch": 17.0, "grad_norm": 2.9447309970855713, "learning_rate": 6.701648716642916e-07, "loss": 0.2539, "num_input_tokens_seen": 26210880, "step": 25415 }, { "epoch": 17.00334448160535, "grad_norm": 3.8603789806365967, "learning_rate": 6.687060089360092e-07, "loss": 0.4087, "num_input_tokens_seen": 26216000, "step": 25420 }, { "epoch": 17.006688963210703, "grad_norm": 3.0275087356567383, "learning_rate": 6.672486220162328e-07, "loss": 0.289, "num_input_tokens_seen": 26220640, "step": 25425 }, { "epoch": 17.010033444816052, "grad_norm": 2.606198787689209, "learning_rate": 6.657927114015411e-07, "loss": 0.397, "num_input_tokens_seen": 26225152, "step": 25430 }, { "epoch": 17.013377926421406, "grad_norm": 3.0655198097229004, "learning_rate": 6.643382775880053e-07, "loss": 0.291, "num_input_tokens_seen": 26229696, "step": 25435 }, { "epoch": 17.016722408026755, "grad_norm": 2.598104476928711, "learning_rate": 6.628853210712022e-07, "loss": 0.3048, "num_input_tokens_seen": 26235328, "step": 25440 }, { "epoch": 17.02006688963211, "grad_norm": 3.2058401107788086, "learning_rate": 6.614338423461958e-07, "loss": 0.3658, "num_input_tokens_seen": 26239968, "step": 25445 }, { "epoch": 17.023411371237458, "grad_norm": 2.8476672172546387, "learning_rate": 6.599838419075527e-07, "loss": 0.4017, "num_input_tokens_seen": 26245536, "step": 25450 }, { "epoch": 17.02675585284281, "grad_norm": 3.234945297241211, "learning_rate": 6.585353202493322e-07, "loss": 0.4002, "num_input_tokens_seen": 26250528, "step": 25455 }, { "epoch": 17.03010033444816, "grad_norm": 2.4428491592407227, "learning_rate": 6.570882778650922e-07, "loss": 0.3262, "num_input_tokens_seen": 26255648, "step": 25460 }, { "epoch": 17.03344481605351, "grad_norm": 2.2728919982910156, "learning_rate": 6.556427152478856e-07, "loss": 0.3678, "num_input_tokens_seen": 26260320, "step": 25465 }, { "epoch": 17.036789297658864, "grad_norm": 3.7877461910247803, "learning_rate": 6.541986328902611e-07, "loss": 0.3162, "num_input_tokens_seen": 26265152, "step": 25470 }, { "epoch": 17.040133779264213, "grad_norm": 2.1359617710113525, "learning_rate": 6.527560312842646e-07, "loss": 0.318, "num_input_tokens_seen": 26269920, "step": 25475 }, { "epoch": 17.043478260869566, "grad_norm": 3.5349371433258057, "learning_rate": 6.513149109214323e-07, "loss": 0.4298, "num_input_tokens_seen": 26274848, "step": 25480 }, { "epoch": 17.046822742474916, "grad_norm": 3.249490261077881, "learning_rate": 6.498752722928042e-07, "loss": 0.2986, "num_input_tokens_seen": 26279712, "step": 25485 }, { "epoch": 17.05016722408027, "grad_norm": 3.3673667907714844, "learning_rate": 6.484371158889069e-07, "loss": 0.3597, "num_input_tokens_seen": 26284672, "step": 25490 }, { "epoch": 17.05351170568562, "grad_norm": 2.5447213649749756, "learning_rate": 6.470004421997678e-07, "loss": 0.296, "num_input_tokens_seen": 26290240, "step": 25495 }, { "epoch": 17.05685618729097, "grad_norm": 2.975886344909668, "learning_rate": 6.455652517149053e-07, "loss": 0.2837, "num_input_tokens_seen": 26295584, "step": 25500 }, { "epoch": 17.06020066889632, "grad_norm": 3.5378379821777344, "learning_rate": 6.441315449233343e-07, "loss": 0.4764, "num_input_tokens_seen": 26300576, "step": 25505 }, { "epoch": 17.06354515050167, "grad_norm": 2.7076470851898193, "learning_rate": 6.426993223135647e-07, "loss": 0.3452, "num_input_tokens_seen": 26305408, "step": 25510 }, { "epoch": 17.066889632107024, "grad_norm": 2.47765851020813, "learning_rate": 6.412685843735999e-07, "loss": 0.3115, "num_input_tokens_seen": 26310432, "step": 25515 }, { "epoch": 17.070234113712374, "grad_norm": 2.6444225311279297, "learning_rate": 6.398393315909379e-07, "loss": 0.3443, "num_input_tokens_seen": 26315168, "step": 25520 }, { "epoch": 17.073578595317727, "grad_norm": 2.4625871181488037, "learning_rate": 6.384115644525674e-07, "loss": 0.3477, "num_input_tokens_seen": 26320064, "step": 25525 }, { "epoch": 17.076923076923077, "grad_norm": 4.169886112213135, "learning_rate": 6.369852834449786e-07, "loss": 0.4383, "num_input_tokens_seen": 26325632, "step": 25530 }, { "epoch": 17.08026755852843, "grad_norm": 3.1266300678253174, "learning_rate": 6.355604890541461e-07, "loss": 0.319, "num_input_tokens_seen": 26331520, "step": 25535 }, { "epoch": 17.08361204013378, "grad_norm": 4.132995128631592, "learning_rate": 6.341371817655451e-07, "loss": 0.4074, "num_input_tokens_seen": 26336608, "step": 25540 }, { "epoch": 17.08695652173913, "grad_norm": 3.949069023132324, "learning_rate": 6.327153620641385e-07, "loss": 0.3507, "num_input_tokens_seen": 26340864, "step": 25545 }, { "epoch": 17.090301003344482, "grad_norm": 2.521876096725464, "learning_rate": 6.312950304343874e-07, "loss": 0.435, "num_input_tokens_seen": 26347360, "step": 25550 }, { "epoch": 17.093645484949832, "grad_norm": 2.5377485752105713, "learning_rate": 6.298761873602427e-07, "loss": 0.4501, "num_input_tokens_seen": 26352864, "step": 25555 }, { "epoch": 17.096989966555185, "grad_norm": 1.7340998649597168, "learning_rate": 6.284588333251501e-07, "loss": 0.4115, "num_input_tokens_seen": 26358176, "step": 25560 }, { "epoch": 17.100334448160535, "grad_norm": 2.2214128971099854, "learning_rate": 6.27042968812045e-07, "loss": 0.3887, "num_input_tokens_seen": 26363712, "step": 25565 }, { "epoch": 17.103678929765888, "grad_norm": 2.6339685916900635, "learning_rate": 6.25628594303358e-07, "loss": 0.2533, "num_input_tokens_seen": 26368800, "step": 25570 }, { "epoch": 17.107023411371237, "grad_norm": 3.113070487976074, "learning_rate": 6.242157102810109e-07, "loss": 0.3298, "num_input_tokens_seen": 26373472, "step": 25575 }, { "epoch": 17.110367892976587, "grad_norm": 2.4194350242614746, "learning_rate": 6.228043172264181e-07, "loss": 0.2768, "num_input_tokens_seen": 26378848, "step": 25580 }, { "epoch": 17.11371237458194, "grad_norm": 2.854334592819214, "learning_rate": 6.213944156204871e-07, "loss": 0.3187, "num_input_tokens_seen": 26384224, "step": 25585 }, { "epoch": 17.11705685618729, "grad_norm": 2.317852020263672, "learning_rate": 6.199860059436136e-07, "loss": 0.3049, "num_input_tokens_seen": 26390496, "step": 25590 }, { "epoch": 17.120401337792643, "grad_norm": 3.0224313735961914, "learning_rate": 6.185790886756882e-07, "loss": 0.399, "num_input_tokens_seen": 26395776, "step": 25595 }, { "epoch": 17.123745819397993, "grad_norm": 2.7671709060668945, "learning_rate": 6.171736642960912e-07, "loss": 0.3172, "num_input_tokens_seen": 26401216, "step": 25600 }, { "epoch": 17.127090301003346, "grad_norm": 5.00736665725708, "learning_rate": 6.157697332836976e-07, "loss": 0.4429, "num_input_tokens_seen": 26406400, "step": 25605 }, { "epoch": 17.130434782608695, "grad_norm": 2.848792552947998, "learning_rate": 6.143672961168673e-07, "loss": 0.3783, "num_input_tokens_seen": 26412096, "step": 25610 }, { "epoch": 17.13377926421405, "grad_norm": 3.7387349605560303, "learning_rate": 6.12966353273457e-07, "loss": 0.3692, "num_input_tokens_seen": 26417504, "step": 25615 }, { "epoch": 17.137123745819398, "grad_norm": 3.2522084712982178, "learning_rate": 6.11566905230812e-07, "loss": 0.3944, "num_input_tokens_seen": 26423328, "step": 25620 }, { "epoch": 17.140468227424748, "grad_norm": 3.2183446884155273, "learning_rate": 6.101689524657683e-07, "loss": 0.3541, "num_input_tokens_seen": 26429088, "step": 25625 }, { "epoch": 17.1438127090301, "grad_norm": 2.3780040740966797, "learning_rate": 6.087724954546532e-07, "loss": 0.373, "num_input_tokens_seen": 26433952, "step": 25630 }, { "epoch": 17.14715719063545, "grad_norm": 3.2939085960388184, "learning_rate": 6.073775346732819e-07, "loss": 0.3397, "num_input_tokens_seen": 26439328, "step": 25635 }, { "epoch": 17.150501672240804, "grad_norm": 3.7770888805389404, "learning_rate": 6.059840705969627e-07, "loss": 0.4494, "num_input_tokens_seen": 26444096, "step": 25640 }, { "epoch": 17.153846153846153, "grad_norm": 3.311227321624756, "learning_rate": 6.045921037004926e-07, "loss": 0.3528, "num_input_tokens_seen": 26449536, "step": 25645 }, { "epoch": 17.157190635451506, "grad_norm": 2.2311084270477295, "learning_rate": 6.032016344581598e-07, "loss": 0.3885, "num_input_tokens_seen": 26453984, "step": 25650 }, { "epoch": 17.160535117056856, "grad_norm": 3.469985246658325, "learning_rate": 6.018126633437388e-07, "loss": 0.3739, "num_input_tokens_seen": 26459520, "step": 25655 }, { "epoch": 17.163879598662206, "grad_norm": 2.5440144538879395, "learning_rate": 6.00425190830497e-07, "loss": 0.3769, "num_input_tokens_seen": 26465344, "step": 25660 }, { "epoch": 17.16722408026756, "grad_norm": 2.026184320449829, "learning_rate": 5.990392173911896e-07, "loss": 0.2559, "num_input_tokens_seen": 26471360, "step": 25665 }, { "epoch": 17.17056856187291, "grad_norm": 2.4698426723480225, "learning_rate": 5.976547434980623e-07, "loss": 0.3682, "num_input_tokens_seen": 26476160, "step": 25670 }, { "epoch": 17.17391304347826, "grad_norm": 2.5833241939544678, "learning_rate": 5.962717696228498e-07, "loss": 0.3767, "num_input_tokens_seen": 26480608, "step": 25675 }, { "epoch": 17.17725752508361, "grad_norm": 2.559049129486084, "learning_rate": 5.948902962367726e-07, "loss": 0.3641, "num_input_tokens_seen": 26485600, "step": 25680 }, { "epoch": 17.180602006688964, "grad_norm": 2.5209550857543945, "learning_rate": 5.935103238105433e-07, "loss": 0.3794, "num_input_tokens_seen": 26491072, "step": 25685 }, { "epoch": 17.183946488294314, "grad_norm": 2.7731122970581055, "learning_rate": 5.921318528143622e-07, "loss": 0.4131, "num_input_tokens_seen": 26495936, "step": 25690 }, { "epoch": 17.187290969899667, "grad_norm": 2.2855842113494873, "learning_rate": 5.907548837179189e-07, "loss": 0.3295, "num_input_tokens_seen": 26501152, "step": 25695 }, { "epoch": 17.190635451505017, "grad_norm": 2.6421382427215576, "learning_rate": 5.893794169903877e-07, "loss": 0.3924, "num_input_tokens_seen": 26506816, "step": 25700 }, { "epoch": 17.193979933110366, "grad_norm": 2.9634101390838623, "learning_rate": 5.880054531004353e-07, "loss": 0.3326, "num_input_tokens_seen": 26512096, "step": 25705 }, { "epoch": 17.19732441471572, "grad_norm": 4.598107814788818, "learning_rate": 5.866329925162145e-07, "loss": 0.3942, "num_input_tokens_seen": 26517408, "step": 25710 }, { "epoch": 17.20066889632107, "grad_norm": 3.314965009689331, "learning_rate": 5.852620357053651e-07, "loss": 0.4025, "num_input_tokens_seen": 26522048, "step": 25715 }, { "epoch": 17.204013377926422, "grad_norm": 2.0895845890045166, "learning_rate": 5.838925831350173e-07, "loss": 0.3076, "num_input_tokens_seen": 26527136, "step": 25720 }, { "epoch": 17.207357859531772, "grad_norm": 2.874371290206909, "learning_rate": 5.825246352717839e-07, "loss": 0.3278, "num_input_tokens_seen": 26531776, "step": 25725 }, { "epoch": 17.210702341137125, "grad_norm": 2.383301258087158, "learning_rate": 5.811581925817694e-07, "loss": 0.3629, "num_input_tokens_seen": 26536896, "step": 25730 }, { "epoch": 17.214046822742475, "grad_norm": 2.9208152294158936, "learning_rate": 5.797932555305641e-07, "loss": 0.3745, "num_input_tokens_seen": 26541088, "step": 25735 }, { "epoch": 17.217391304347824, "grad_norm": 3.667370080947876, "learning_rate": 5.784298245832459e-07, "loss": 0.3382, "num_input_tokens_seen": 26545536, "step": 25740 }, { "epoch": 17.220735785953178, "grad_norm": 2.7226505279541016, "learning_rate": 5.770679002043766e-07, "loss": 0.2987, "num_input_tokens_seen": 26550624, "step": 25745 }, { "epoch": 17.224080267558527, "grad_norm": 2.1027069091796875, "learning_rate": 5.757074828580078e-07, "loss": 0.3811, "num_input_tokens_seen": 26556064, "step": 25750 }, { "epoch": 17.22742474916388, "grad_norm": 2.9979166984558105, "learning_rate": 5.743485730076769e-07, "loss": 0.3387, "num_input_tokens_seen": 26561760, "step": 25755 }, { "epoch": 17.23076923076923, "grad_norm": 2.080165147781372, "learning_rate": 5.729911711164083e-07, "loss": 0.35, "num_input_tokens_seen": 26566720, "step": 25760 }, { "epoch": 17.234113712374583, "grad_norm": 3.3334267139434814, "learning_rate": 5.716352776467088e-07, "loss": 0.3846, "num_input_tokens_seen": 26571840, "step": 25765 }, { "epoch": 17.237458193979933, "grad_norm": 2.524887800216675, "learning_rate": 5.702808930605763e-07, "loss": 0.3437, "num_input_tokens_seen": 26577920, "step": 25770 }, { "epoch": 17.240802675585286, "grad_norm": 3.2918007373809814, "learning_rate": 5.689280178194923e-07, "loss": 0.2633, "num_input_tokens_seen": 26582784, "step": 25775 }, { "epoch": 17.244147157190636, "grad_norm": 2.7322418689727783, "learning_rate": 5.675766523844233e-07, "loss": 0.3493, "num_input_tokens_seen": 26587680, "step": 25780 }, { "epoch": 17.247491638795985, "grad_norm": 3.3751847743988037, "learning_rate": 5.66226797215823e-07, "loss": 0.3899, "num_input_tokens_seen": 26592672, "step": 25785 }, { "epoch": 17.25083612040134, "grad_norm": 3.0043177604675293, "learning_rate": 5.648784527736289e-07, "loss": 0.4272, "num_input_tokens_seen": 26599840, "step": 25790 }, { "epoch": 17.254180602006688, "grad_norm": 2.9909844398498535, "learning_rate": 5.635316195172641e-07, "loss": 0.3599, "num_input_tokens_seen": 26604864, "step": 25795 }, { "epoch": 17.25752508361204, "grad_norm": 2.388495683670044, "learning_rate": 5.62186297905638e-07, "loss": 0.3206, "num_input_tokens_seen": 26609728, "step": 25800 }, { "epoch": 17.26086956521739, "grad_norm": 3.184117317199707, "learning_rate": 5.608424883971453e-07, "loss": 0.3773, "num_input_tokens_seen": 26614176, "step": 25805 }, { "epoch": 17.264214046822744, "grad_norm": 3.390270233154297, "learning_rate": 5.595001914496612e-07, "loss": 0.3771, "num_input_tokens_seen": 26619424, "step": 25810 }, { "epoch": 17.267558528428093, "grad_norm": 2.740673780441284, "learning_rate": 5.58159407520551e-07, "loss": 0.3983, "num_input_tokens_seen": 26624640, "step": 25815 }, { "epoch": 17.270903010033443, "grad_norm": 2.0285866260528564, "learning_rate": 5.56820137066661e-07, "loss": 0.3131, "num_input_tokens_seen": 26630208, "step": 25820 }, { "epoch": 17.274247491638796, "grad_norm": 3.0646541118621826, "learning_rate": 5.554823805443232e-07, "loss": 0.3381, "num_input_tokens_seen": 26635968, "step": 25825 }, { "epoch": 17.277591973244146, "grad_norm": 3.411842107772827, "learning_rate": 5.541461384093549e-07, "loss": 0.4346, "num_input_tokens_seen": 26641728, "step": 25830 }, { "epoch": 17.2809364548495, "grad_norm": 2.575279951095581, "learning_rate": 5.52811411117053e-07, "loss": 0.2751, "num_input_tokens_seen": 26646592, "step": 25835 }, { "epoch": 17.28428093645485, "grad_norm": 3.859177589416504, "learning_rate": 5.514781991222035e-07, "loss": 0.3384, "num_input_tokens_seen": 26651616, "step": 25840 }, { "epoch": 17.287625418060202, "grad_norm": 2.709665060043335, "learning_rate": 5.501465028790726e-07, "loss": 0.3917, "num_input_tokens_seen": 26657216, "step": 25845 }, { "epoch": 17.29096989966555, "grad_norm": 2.5314199924468994, "learning_rate": 5.488163228414128e-07, "loss": 0.3354, "num_input_tokens_seen": 26662016, "step": 25850 }, { "epoch": 17.294314381270905, "grad_norm": 3.1551313400268555, "learning_rate": 5.474876594624562e-07, "loss": 0.3811, "num_input_tokens_seen": 26666432, "step": 25855 }, { "epoch": 17.297658862876254, "grad_norm": 2.5943918228149414, "learning_rate": 5.461605131949216e-07, "loss": 0.3088, "num_input_tokens_seen": 26672544, "step": 25860 }, { "epoch": 17.301003344481604, "grad_norm": 2.5149435997009277, "learning_rate": 5.448348844910095e-07, "loss": 0.3579, "num_input_tokens_seen": 26677632, "step": 25865 }, { "epoch": 17.304347826086957, "grad_norm": 2.1968061923980713, "learning_rate": 5.435107738024032e-07, "loss": 0.3389, "num_input_tokens_seen": 26683808, "step": 25870 }, { "epoch": 17.307692307692307, "grad_norm": 3.4281272888183594, "learning_rate": 5.421881815802704e-07, "loss": 0.352, "num_input_tokens_seen": 26688608, "step": 25875 }, { "epoch": 17.31103678929766, "grad_norm": 2.3845536708831787, "learning_rate": 5.408671082752575e-07, "loss": 0.3645, "num_input_tokens_seen": 26694272, "step": 25880 }, { "epoch": 17.31438127090301, "grad_norm": 3.736241579055786, "learning_rate": 5.395475543374973e-07, "loss": 0.4007, "num_input_tokens_seen": 26699744, "step": 25885 }, { "epoch": 17.317725752508363, "grad_norm": 3.2079246044158936, "learning_rate": 5.382295202166027e-07, "loss": 0.4164, "num_input_tokens_seen": 26704224, "step": 25890 }, { "epoch": 17.321070234113712, "grad_norm": 3.735924243927002, "learning_rate": 5.369130063616718e-07, "loss": 0.3552, "num_input_tokens_seen": 26710336, "step": 25895 }, { "epoch": 17.324414715719065, "grad_norm": 2.2364532947540283, "learning_rate": 5.35598013221279e-07, "loss": 0.3779, "num_input_tokens_seen": 26715584, "step": 25900 }, { "epoch": 17.327759197324415, "grad_norm": 2.8190343379974365, "learning_rate": 5.34284541243486e-07, "loss": 0.4492, "num_input_tokens_seen": 26720544, "step": 25905 }, { "epoch": 17.331103678929765, "grad_norm": 2.6779308319091797, "learning_rate": 5.329725908758338e-07, "loss": 0.362, "num_input_tokens_seen": 26726368, "step": 25910 }, { "epoch": 17.334448160535118, "grad_norm": 5.153196811676025, "learning_rate": 5.316621625653445e-07, "loss": 0.3751, "num_input_tokens_seen": 26731200, "step": 25915 }, { "epoch": 17.337792642140467, "grad_norm": 3.7757532596588135, "learning_rate": 5.303532567585246e-07, "loss": 0.4014, "num_input_tokens_seen": 26736544, "step": 25920 }, { "epoch": 17.34113712374582, "grad_norm": 1.9959275722503662, "learning_rate": 5.290458739013571e-07, "loss": 0.402, "num_input_tokens_seen": 26742048, "step": 25925 }, { "epoch": 17.34448160535117, "grad_norm": 2.9574410915374756, "learning_rate": 5.277400144393097e-07, "loss": 0.316, "num_input_tokens_seen": 26747296, "step": 25930 }, { "epoch": 17.347826086956523, "grad_norm": 2.7321536540985107, "learning_rate": 5.264356788173297e-07, "loss": 0.4089, "num_input_tokens_seen": 26752448, "step": 25935 }, { "epoch": 17.351170568561873, "grad_norm": 2.869497537612915, "learning_rate": 5.251328674798461e-07, "loss": 0.2616, "num_input_tokens_seen": 26756896, "step": 25940 }, { "epoch": 17.354515050167223, "grad_norm": 2.072929859161377, "learning_rate": 5.238315808707667e-07, "loss": 0.3896, "num_input_tokens_seen": 26761952, "step": 25945 }, { "epoch": 17.357859531772576, "grad_norm": 2.74006986618042, "learning_rate": 5.225318194334816e-07, "loss": 0.3288, "num_input_tokens_seen": 26766400, "step": 25950 }, { "epoch": 17.361204013377925, "grad_norm": 4.209254741668701, "learning_rate": 5.212335836108596e-07, "loss": 0.4089, "num_input_tokens_seen": 26771744, "step": 25955 }, { "epoch": 17.36454849498328, "grad_norm": 2.649075508117676, "learning_rate": 5.199368738452521e-07, "loss": 0.409, "num_input_tokens_seen": 26777760, "step": 25960 }, { "epoch": 17.367892976588628, "grad_norm": 2.634570598602295, "learning_rate": 5.186416905784886e-07, "loss": 0.335, "num_input_tokens_seen": 26783008, "step": 25965 }, { "epoch": 17.37123745819398, "grad_norm": 2.4498138427734375, "learning_rate": 5.173480342518777e-07, "loss": 0.3626, "num_input_tokens_seen": 26788224, "step": 25970 }, { "epoch": 17.37458193979933, "grad_norm": 3.8630869388580322, "learning_rate": 5.160559053062103e-07, "loss": 0.4114, "num_input_tokens_seen": 26793088, "step": 25975 }, { "epoch": 17.377926421404684, "grad_norm": 2.106945037841797, "learning_rate": 5.147653041817546e-07, "loss": 0.3802, "num_input_tokens_seen": 26798176, "step": 25980 }, { "epoch": 17.381270903010034, "grad_norm": 2.1831419467926025, "learning_rate": 5.1347623131826e-07, "loss": 0.3662, "num_input_tokens_seen": 26803008, "step": 25985 }, { "epoch": 17.384615384615383, "grad_norm": 2.6376209259033203, "learning_rate": 5.121886871549537e-07, "loss": 0.3919, "num_input_tokens_seen": 26808480, "step": 25990 }, { "epoch": 17.387959866220736, "grad_norm": 3.1019089221954346, "learning_rate": 5.109026721305421e-07, "loss": 0.4222, "num_input_tokens_seen": 26813728, "step": 25995 }, { "epoch": 17.391304347826086, "grad_norm": 2.3857150077819824, "learning_rate": 5.096181866832118e-07, "loss": 0.3996, "num_input_tokens_seen": 26818752, "step": 26000 }, { "epoch": 17.39464882943144, "grad_norm": 2.559023857116699, "learning_rate": 5.083352312506284e-07, "loss": 0.2835, "num_input_tokens_seen": 26824416, "step": 26005 }, { "epoch": 17.39799331103679, "grad_norm": 2.1951868534088135, "learning_rate": 5.070538062699337e-07, "loss": 0.3692, "num_input_tokens_seen": 26829184, "step": 26010 }, { "epoch": 17.401337792642142, "grad_norm": 3.1788671016693115, "learning_rate": 5.0577391217775e-07, "loss": 0.3775, "num_input_tokens_seen": 26834208, "step": 26015 }, { "epoch": 17.40468227424749, "grad_norm": 2.536694288253784, "learning_rate": 5.044955494101778e-07, "loss": 0.3459, "num_input_tokens_seen": 26839456, "step": 26020 }, { "epoch": 17.40802675585284, "grad_norm": 3.1971490383148193, "learning_rate": 5.032187184027953e-07, "loss": 0.3451, "num_input_tokens_seen": 26843584, "step": 26025 }, { "epoch": 17.411371237458194, "grad_norm": 3.251396417617798, "learning_rate": 5.019434195906608e-07, "loss": 0.3789, "num_input_tokens_seen": 26848608, "step": 26030 }, { "epoch": 17.414715719063544, "grad_norm": 3.332590103149414, "learning_rate": 5.00669653408306e-07, "loss": 0.4155, "num_input_tokens_seen": 26853920, "step": 26035 }, { "epoch": 17.418060200668897, "grad_norm": 3.491316795349121, "learning_rate": 4.993974202897456e-07, "loss": 0.4128, "num_input_tokens_seen": 26858752, "step": 26040 }, { "epoch": 17.421404682274247, "grad_norm": 3.9224555492401123, "learning_rate": 4.981267206684681e-07, "loss": 0.3366, "num_input_tokens_seen": 26864064, "step": 26045 }, { "epoch": 17.4247491638796, "grad_norm": 3.1132140159606934, "learning_rate": 4.968575549774424e-07, "loss": 0.4011, "num_input_tokens_seen": 26869696, "step": 26050 }, { "epoch": 17.42809364548495, "grad_norm": 2.0852458477020264, "learning_rate": 4.955899236491112e-07, "loss": 0.3626, "num_input_tokens_seen": 26875456, "step": 26055 }, { "epoch": 17.431438127090303, "grad_norm": 2.4789326190948486, "learning_rate": 4.943238271153983e-07, "loss": 0.4202, "num_input_tokens_seen": 26880352, "step": 26060 }, { "epoch": 17.434782608695652, "grad_norm": 2.3699045181274414, "learning_rate": 4.930592658077022e-07, "loss": 0.3776, "num_input_tokens_seen": 26886080, "step": 26065 }, { "epoch": 17.438127090301002, "grad_norm": 3.045459508895874, "learning_rate": 4.917962401568982e-07, "loss": 0.4243, "num_input_tokens_seen": 26892608, "step": 26070 }, { "epoch": 17.441471571906355, "grad_norm": 3.2243432998657227, "learning_rate": 4.905347505933405e-07, "loss": 0.3208, "num_input_tokens_seen": 26897376, "step": 26075 }, { "epoch": 17.444816053511705, "grad_norm": 2.4128942489624023, "learning_rate": 4.89274797546857e-07, "loss": 0.3409, "num_input_tokens_seen": 26902592, "step": 26080 }, { "epoch": 17.448160535117058, "grad_norm": 2.8123741149902344, "learning_rate": 4.880163814467537e-07, "loss": 0.3891, "num_input_tokens_seen": 26907488, "step": 26085 }, { "epoch": 17.451505016722408, "grad_norm": 2.689891815185547, "learning_rate": 4.867595027218125e-07, "loss": 0.3856, "num_input_tokens_seen": 26912704, "step": 26090 }, { "epoch": 17.45484949832776, "grad_norm": 2.955148935317993, "learning_rate": 4.855041618002931e-07, "loss": 0.3841, "num_input_tokens_seen": 26917728, "step": 26095 }, { "epoch": 17.45819397993311, "grad_norm": 2.7226526737213135, "learning_rate": 4.842503591099279e-07, "loss": 0.3881, "num_input_tokens_seen": 26923360, "step": 26100 }, { "epoch": 17.46153846153846, "grad_norm": 3.246213912963867, "learning_rate": 4.829980950779273e-07, "loss": 0.3314, "num_input_tokens_seen": 26928608, "step": 26105 }, { "epoch": 17.464882943143813, "grad_norm": 2.2637345790863037, "learning_rate": 4.817473701309783e-07, "loss": 0.3584, "num_input_tokens_seen": 26934016, "step": 26110 }, { "epoch": 17.468227424749163, "grad_norm": 3.2573986053466797, "learning_rate": 4.804981846952411e-07, "loss": 0.3628, "num_input_tokens_seen": 26939200, "step": 26115 }, { "epoch": 17.471571906354516, "grad_norm": 2.863738775253296, "learning_rate": 4.792505391963543e-07, "loss": 0.3122, "num_input_tokens_seen": 26944000, "step": 26120 }, { "epoch": 17.474916387959865, "grad_norm": 3.2334811687469482, "learning_rate": 4.78004434059428e-07, "loss": 0.4088, "num_input_tokens_seen": 26948736, "step": 26125 }, { "epoch": 17.47826086956522, "grad_norm": 3.5517146587371826, "learning_rate": 4.767598697090503e-07, "loss": 0.3739, "num_input_tokens_seen": 26953600, "step": 26130 }, { "epoch": 17.48160535117057, "grad_norm": 2.45636248588562, "learning_rate": 4.7551684656928386e-07, "loss": 0.4237, "num_input_tokens_seen": 26958272, "step": 26135 }, { "epoch": 17.48494983277592, "grad_norm": 2.7968764305114746, "learning_rate": 4.7427536506366665e-07, "loss": 0.3445, "num_input_tokens_seen": 26963072, "step": 26140 }, { "epoch": 17.48829431438127, "grad_norm": 2.7632458209991455, "learning_rate": 4.730354256152092e-07, "loss": 0.2898, "num_input_tokens_seen": 26967520, "step": 26145 }, { "epoch": 17.49163879598662, "grad_norm": 2.987610101699829, "learning_rate": 4.71797028646398e-07, "loss": 0.3573, "num_input_tokens_seen": 26972512, "step": 26150 }, { "epoch": 17.494983277591974, "grad_norm": 2.9588425159454346, "learning_rate": 4.7056017457919435e-07, "loss": 0.4025, "num_input_tokens_seen": 26977824, "step": 26155 }, { "epoch": 17.498327759197323, "grad_norm": 3.97499418258667, "learning_rate": 4.6932486383503397e-07, "loss": 0.3817, "num_input_tokens_seen": 26982848, "step": 26160 }, { "epoch": 17.501672240802677, "grad_norm": 2.460561752319336, "learning_rate": 4.680910968348262e-07, "loss": 0.3181, "num_input_tokens_seen": 26988512, "step": 26165 }, { "epoch": 17.505016722408026, "grad_norm": 3.002453088760376, "learning_rate": 4.6685887399895326e-07, "loss": 0.3679, "num_input_tokens_seen": 26993248, "step": 26170 }, { "epoch": 17.50836120401338, "grad_norm": 2.3040804862976074, "learning_rate": 4.6562819574727304e-07, "loss": 0.3259, "num_input_tokens_seen": 26998400, "step": 26175 }, { "epoch": 17.51170568561873, "grad_norm": 3.062741279602051, "learning_rate": 4.6439906249911627e-07, "loss": 0.359, "num_input_tokens_seen": 27003456, "step": 26180 }, { "epoch": 17.51505016722408, "grad_norm": 2.4640729427337646, "learning_rate": 4.631714746732885e-07, "loss": 0.2681, "num_input_tokens_seen": 27009088, "step": 26185 }, { "epoch": 17.51839464882943, "grad_norm": 2.8301851749420166, "learning_rate": 4.619454326880657e-07, "loss": 0.3667, "num_input_tokens_seen": 27014112, "step": 26190 }, { "epoch": 17.52173913043478, "grad_norm": 3.1483218669891357, "learning_rate": 4.6072093696120036e-07, "loss": 0.3736, "num_input_tokens_seen": 27019104, "step": 26195 }, { "epoch": 17.525083612040135, "grad_norm": 4.266463756561279, "learning_rate": 4.594979879099165e-07, "loss": 0.3367, "num_input_tokens_seen": 27023424, "step": 26200 }, { "epoch": 17.528428093645484, "grad_norm": 3.1139426231384277, "learning_rate": 4.5827658595091254e-07, "loss": 0.354, "num_input_tokens_seen": 27027968, "step": 26205 }, { "epoch": 17.531772575250837, "grad_norm": 3.1524407863616943, "learning_rate": 4.57056731500356e-07, "loss": 0.3479, "num_input_tokens_seen": 27033184, "step": 26210 }, { "epoch": 17.535117056856187, "grad_norm": 3.258150577545166, "learning_rate": 4.5583842497389173e-07, "loss": 0.387, "num_input_tokens_seen": 27039328, "step": 26215 }, { "epoch": 17.53846153846154, "grad_norm": 3.5466649532318115, "learning_rate": 4.546216667866349e-07, "loss": 0.363, "num_input_tokens_seen": 27044384, "step": 26220 }, { "epoch": 17.54180602006689, "grad_norm": 2.5575640201568604, "learning_rate": 4.5340645735317377e-07, "loss": 0.361, "num_input_tokens_seen": 27049408, "step": 26225 }, { "epoch": 17.54515050167224, "grad_norm": 3.178215503692627, "learning_rate": 4.5219279708756834e-07, "loss": 0.2491, "num_input_tokens_seen": 27053696, "step": 26230 }, { "epoch": 17.548494983277592, "grad_norm": 3.27681565284729, "learning_rate": 4.5098068640335003e-07, "loss": 0.3148, "num_input_tokens_seen": 27058688, "step": 26235 }, { "epoch": 17.551839464882942, "grad_norm": 2.946981906890869, "learning_rate": 4.4977012571352453e-07, "loss": 0.3181, "num_input_tokens_seen": 27063584, "step": 26240 }, { "epoch": 17.555183946488295, "grad_norm": 2.8063805103302, "learning_rate": 4.485611154305658e-07, "loss": 0.3768, "num_input_tokens_seen": 27068512, "step": 26245 }, { "epoch": 17.558528428093645, "grad_norm": 3.4723684787750244, "learning_rate": 4.4735365596642543e-07, "loss": 0.4306, "num_input_tokens_seen": 27073728, "step": 26250 }, { "epoch": 17.561872909698998, "grad_norm": 3.0994420051574707, "learning_rate": 4.461477477325193e-07, "loss": 0.3297, "num_input_tokens_seen": 27078496, "step": 26255 }, { "epoch": 17.565217391304348, "grad_norm": 2.541870594024658, "learning_rate": 4.449433911397405e-07, "loss": 0.3457, "num_input_tokens_seen": 27083136, "step": 26260 }, { "epoch": 17.568561872909697, "grad_norm": 3.3515431880950928, "learning_rate": 4.437405865984512e-07, "loss": 0.4637, "num_input_tokens_seen": 27089344, "step": 26265 }, { "epoch": 17.57190635451505, "grad_norm": 3.947697877883911, "learning_rate": 4.4253933451848365e-07, "loss": 0.3256, "num_input_tokens_seen": 27093824, "step": 26270 }, { "epoch": 17.5752508361204, "grad_norm": 4.343869686126709, "learning_rate": 4.4133963530914427e-07, "loss": 0.3914, "num_input_tokens_seen": 27098592, "step": 26275 }, { "epoch": 17.578595317725753, "grad_norm": 2.9637269973754883, "learning_rate": 4.4014148937920667e-07, "loss": 0.3241, "num_input_tokens_seen": 27103456, "step": 26280 }, { "epoch": 17.581939799331103, "grad_norm": 3.01529860496521, "learning_rate": 4.3894489713691865e-07, "loss": 0.3329, "num_input_tokens_seen": 27108736, "step": 26285 }, { "epoch": 17.585284280936456, "grad_norm": 2.657837152481079, "learning_rate": 4.3774985898999366e-07, "loss": 0.3431, "num_input_tokens_seen": 27114272, "step": 26290 }, { "epoch": 17.588628762541806, "grad_norm": 2.6162660121917725, "learning_rate": 4.3655637534562313e-07, "loss": 0.315, "num_input_tokens_seen": 27119616, "step": 26295 }, { "epoch": 17.59197324414716, "grad_norm": 3.264482259750366, "learning_rate": 4.353644466104612e-07, "loss": 0.388, "num_input_tokens_seen": 27124640, "step": 26300 }, { "epoch": 17.59531772575251, "grad_norm": 2.711045026779175, "learning_rate": 4.341740731906374e-07, "loss": 0.344, "num_input_tokens_seen": 27129632, "step": 26305 }, { "epoch": 17.598662207357858, "grad_norm": 3.7007501125335693, "learning_rate": 4.3298525549174906e-07, "loss": 0.3968, "num_input_tokens_seen": 27135360, "step": 26310 }, { "epoch": 17.60200668896321, "grad_norm": 2.72100830078125, "learning_rate": 4.317979939188638e-07, "loss": 0.3548, "num_input_tokens_seen": 27140224, "step": 26315 }, { "epoch": 17.60535117056856, "grad_norm": 2.5653281211853027, "learning_rate": 4.306122888765202e-07, "loss": 0.308, "num_input_tokens_seen": 27145888, "step": 26320 }, { "epoch": 17.608695652173914, "grad_norm": 3.9730653762817383, "learning_rate": 4.294281407687234e-07, "loss": 0.3144, "num_input_tokens_seen": 27150080, "step": 26325 }, { "epoch": 17.612040133779264, "grad_norm": 2.988020420074463, "learning_rate": 4.2824554999895184e-07, "loss": 0.3418, "num_input_tokens_seen": 27154560, "step": 26330 }, { "epoch": 17.615384615384617, "grad_norm": 3.357340097427368, "learning_rate": 4.270645169701493e-07, "loss": 0.3585, "num_input_tokens_seen": 27159264, "step": 26335 }, { "epoch": 17.618729096989966, "grad_norm": 2.3457272052764893, "learning_rate": 4.2588504208473323e-07, "loss": 0.4003, "num_input_tokens_seen": 27164544, "step": 26340 }, { "epoch": 17.62207357859532, "grad_norm": 2.374265193939209, "learning_rate": 4.2470712574458674e-07, "loss": 0.3623, "num_input_tokens_seen": 27170720, "step": 26345 }, { "epoch": 17.62541806020067, "grad_norm": 2.14633846282959, "learning_rate": 4.2353076835106367e-07, "loss": 0.3722, "num_input_tokens_seen": 27176096, "step": 26350 }, { "epoch": 17.62876254180602, "grad_norm": 2.892108201980591, "learning_rate": 4.223559703049851e-07, "loss": 0.3544, "num_input_tokens_seen": 27181664, "step": 26355 }, { "epoch": 17.632107023411372, "grad_norm": 3.470841407775879, "learning_rate": 4.2118273200664304e-07, "loss": 0.3459, "num_input_tokens_seen": 27186592, "step": 26360 }, { "epoch": 17.63545150501672, "grad_norm": 2.1408581733703613, "learning_rate": 4.2001105385579664e-07, "loss": 0.3703, "num_input_tokens_seen": 27192160, "step": 26365 }, { "epoch": 17.638795986622075, "grad_norm": 2.6092872619628906, "learning_rate": 4.1884093625167267e-07, "loss": 0.3952, "num_input_tokens_seen": 27197504, "step": 26370 }, { "epoch": 17.642140468227424, "grad_norm": 2.9234089851379395, "learning_rate": 4.176723795929677e-07, "loss": 0.3563, "num_input_tokens_seen": 27202496, "step": 26375 }, { "epoch": 17.645484949832777, "grad_norm": 3.576784610748291, "learning_rate": 4.16505384277846e-07, "loss": 0.305, "num_input_tokens_seen": 27207584, "step": 26380 }, { "epoch": 17.648829431438127, "grad_norm": 3.523498773574829, "learning_rate": 4.1533995070394017e-07, "loss": 0.4427, "num_input_tokens_seen": 27213344, "step": 26385 }, { "epoch": 17.652173913043477, "grad_norm": 2.6870100498199463, "learning_rate": 4.141760792683486e-07, "loss": 0.3608, "num_input_tokens_seen": 27218592, "step": 26390 }, { "epoch": 17.65551839464883, "grad_norm": 3.1496951580047607, "learning_rate": 4.1301377036763954e-07, "loss": 0.4168, "num_input_tokens_seen": 27224064, "step": 26395 }, { "epoch": 17.65886287625418, "grad_norm": 2.2026453018188477, "learning_rate": 4.1185302439784914e-07, "loss": 0.3679, "num_input_tokens_seen": 27228704, "step": 26400 }, { "epoch": 17.662207357859533, "grad_norm": 3.3282673358917236, "learning_rate": 4.1069384175448035e-07, "loss": 0.371, "num_input_tokens_seen": 27233888, "step": 26405 }, { "epoch": 17.665551839464882, "grad_norm": 2.1672654151916504, "learning_rate": 4.0953622283250226e-07, "loss": 0.3141, "num_input_tokens_seen": 27239008, "step": 26410 }, { "epoch": 17.668896321070235, "grad_norm": 3.8941681385040283, "learning_rate": 4.083801680263522e-07, "loss": 0.3568, "num_input_tokens_seen": 27243392, "step": 26415 }, { "epoch": 17.672240802675585, "grad_norm": 3.2703094482421875, "learning_rate": 4.0722567772993495e-07, "loss": 0.4072, "num_input_tokens_seen": 27248352, "step": 26420 }, { "epoch": 17.675585284280935, "grad_norm": 2.8009729385375977, "learning_rate": 4.0607275233662144e-07, "loss": 0.3131, "num_input_tokens_seen": 27253504, "step": 26425 }, { "epoch": 17.678929765886288, "grad_norm": 3.767996311187744, "learning_rate": 4.049213922392509e-07, "loss": 0.3502, "num_input_tokens_seen": 27258368, "step": 26430 }, { "epoch": 17.682274247491637, "grad_norm": 2.9382364749908447, "learning_rate": 4.0377159783012653e-07, "loss": 0.3038, "num_input_tokens_seen": 27262976, "step": 26435 }, { "epoch": 17.68561872909699, "grad_norm": 2.4415152072906494, "learning_rate": 4.0262336950102066e-07, "loss": 0.4087, "num_input_tokens_seen": 27267520, "step": 26440 }, { "epoch": 17.68896321070234, "grad_norm": 3.175532102584839, "learning_rate": 4.0147670764316915e-07, "loss": 0.402, "num_input_tokens_seen": 27273888, "step": 26445 }, { "epoch": 17.692307692307693, "grad_norm": 2.1862359046936035, "learning_rate": 4.003316126472784e-07, "loss": 0.3762, "num_input_tokens_seen": 27279552, "step": 26450 }, { "epoch": 17.695652173913043, "grad_norm": 2.1581060886383057, "learning_rate": 3.9918808490351623e-07, "loss": 0.3745, "num_input_tokens_seen": 27285376, "step": 26455 }, { "epoch": 17.698996655518396, "grad_norm": 3.4872305393218994, "learning_rate": 3.980461248015194e-07, "loss": 0.46, "num_input_tokens_seen": 27290432, "step": 26460 }, { "epoch": 17.702341137123746, "grad_norm": 3.992245674133301, "learning_rate": 3.969057327303899e-07, "loss": 0.4464, "num_input_tokens_seen": 27295584, "step": 26465 }, { "epoch": 17.705685618729095, "grad_norm": 3.410428762435913, "learning_rate": 3.957669090786953e-07, "loss": 0.3354, "num_input_tokens_seen": 27300576, "step": 26470 }, { "epoch": 17.70903010033445, "grad_norm": 3.2711095809936523, "learning_rate": 3.946296542344696e-07, "loss": 0.3911, "num_input_tokens_seen": 27305184, "step": 26475 }, { "epoch": 17.712374581939798, "grad_norm": 2.5402631759643555, "learning_rate": 3.934939685852096e-07, "loss": 0.3577, "num_input_tokens_seen": 27309792, "step": 26480 }, { "epoch": 17.71571906354515, "grad_norm": 4.1055402755737305, "learning_rate": 3.9235985251788065e-07, "loss": 0.3328, "num_input_tokens_seen": 27314944, "step": 26485 }, { "epoch": 17.7190635451505, "grad_norm": 2.962442398071289, "learning_rate": 3.9122730641890984e-07, "loss": 0.2995, "num_input_tokens_seen": 27320384, "step": 26490 }, { "epoch": 17.722408026755854, "grad_norm": 2.851284980773926, "learning_rate": 3.900963306741945e-07, "loss": 0.358, "num_input_tokens_seen": 27326304, "step": 26495 }, { "epoch": 17.725752508361204, "grad_norm": 3.5226290225982666, "learning_rate": 3.889669256690914e-07, "loss": 0.3957, "num_input_tokens_seen": 27331872, "step": 26500 }, { "epoch": 17.729096989966557, "grad_norm": 2.9179720878601074, "learning_rate": 3.8783909178842504e-07, "loss": 0.3787, "num_input_tokens_seen": 27337536, "step": 26505 }, { "epoch": 17.732441471571907, "grad_norm": 2.8252599239349365, "learning_rate": 3.8671282941648403e-07, "loss": 0.3482, "num_input_tokens_seen": 27341696, "step": 26510 }, { "epoch": 17.735785953177256, "grad_norm": 3.743553876876831, "learning_rate": 3.855881389370214e-07, "loss": 0.3799, "num_input_tokens_seen": 27346560, "step": 26515 }, { "epoch": 17.73913043478261, "grad_norm": 2.399656057357788, "learning_rate": 3.844650207332562e-07, "loss": 0.2969, "num_input_tokens_seen": 27351584, "step": 26520 }, { "epoch": 17.74247491638796, "grad_norm": 3.158066511154175, "learning_rate": 3.833434751878673e-07, "loss": 0.2899, "num_input_tokens_seen": 27357024, "step": 26525 }, { "epoch": 17.745819397993312, "grad_norm": 2.227442741394043, "learning_rate": 3.8222350268300347e-07, "loss": 0.3092, "num_input_tokens_seen": 27361824, "step": 26530 }, { "epoch": 17.74916387959866, "grad_norm": 1.7744247913360596, "learning_rate": 3.8110510360027163e-07, "loss": 0.3386, "num_input_tokens_seen": 27367936, "step": 26535 }, { "epoch": 17.752508361204015, "grad_norm": 2.7445316314697266, "learning_rate": 3.7998827832074925e-07, "loss": 0.4043, "num_input_tokens_seen": 27373632, "step": 26540 }, { "epoch": 17.755852842809364, "grad_norm": 2.3141212463378906, "learning_rate": 3.7887302722497134e-07, "loss": 0.2741, "num_input_tokens_seen": 27378176, "step": 26545 }, { "epoch": 17.759197324414714, "grad_norm": 2.376004934310913, "learning_rate": 3.7775935069293943e-07, "loss": 0.3615, "num_input_tokens_seen": 27382624, "step": 26550 }, { "epoch": 17.762541806020067, "grad_norm": 2.7069082260131836, "learning_rate": 3.76647249104119e-07, "loss": 0.4498, "num_input_tokens_seen": 27388192, "step": 26555 }, { "epoch": 17.765886287625417, "grad_norm": 3.853266477584839, "learning_rate": 3.755367228374379e-07, "loss": 0.4069, "num_input_tokens_seen": 27393504, "step": 26560 }, { "epoch": 17.76923076923077, "grad_norm": 3.024411916732788, "learning_rate": 3.744277722712886e-07, "loss": 0.4574, "num_input_tokens_seen": 27398848, "step": 26565 }, { "epoch": 17.77257525083612, "grad_norm": 3.7372515201568604, "learning_rate": 3.733203977835231e-07, "loss": 0.3625, "num_input_tokens_seen": 27404128, "step": 26570 }, { "epoch": 17.775919732441473, "grad_norm": 3.4879963397979736, "learning_rate": 3.722145997514609e-07, "loss": 0.4279, "num_input_tokens_seen": 27408480, "step": 26575 }, { "epoch": 17.779264214046822, "grad_norm": 2.709519147872925, "learning_rate": 3.711103785518799e-07, "loss": 0.3886, "num_input_tokens_seen": 27413952, "step": 26580 }, { "epoch": 17.782608695652176, "grad_norm": 3.0222105979919434, "learning_rate": 3.7000773456102645e-07, "loss": 0.389, "num_input_tokens_seen": 27419776, "step": 26585 }, { "epoch": 17.785953177257525, "grad_norm": 2.7174031734466553, "learning_rate": 3.689066681546033e-07, "loss": 0.3283, "num_input_tokens_seen": 27425120, "step": 26590 }, { "epoch": 17.789297658862875, "grad_norm": 2.0532681941986084, "learning_rate": 3.6780717970777977e-07, "loss": 0.3715, "num_input_tokens_seen": 27430336, "step": 26595 }, { "epoch": 17.792642140468228, "grad_norm": 3.307645559310913, "learning_rate": 3.6670926959518626e-07, "loss": 0.3615, "num_input_tokens_seen": 27435328, "step": 26600 }, { "epoch": 17.795986622073578, "grad_norm": 2.2436881065368652, "learning_rate": 3.656129381909146e-07, "loss": 0.3734, "num_input_tokens_seen": 27440960, "step": 26605 }, { "epoch": 17.79933110367893, "grad_norm": 2.2582664489746094, "learning_rate": 3.645181858685215e-07, "loss": 0.3111, "num_input_tokens_seen": 27445888, "step": 26610 }, { "epoch": 17.80267558528428, "grad_norm": 3.110405445098877, "learning_rate": 3.6342501300102087e-07, "loss": 0.3564, "num_input_tokens_seen": 27450944, "step": 26615 }, { "epoch": 17.806020066889634, "grad_norm": 2.363375663757324, "learning_rate": 3.623334199608941e-07, "loss": 0.3527, "num_input_tokens_seen": 27456480, "step": 26620 }, { "epoch": 17.809364548494983, "grad_norm": 2.488579511642456, "learning_rate": 3.612434071200771e-07, "loss": 0.374, "num_input_tokens_seen": 27462176, "step": 26625 }, { "epoch": 17.812709030100333, "grad_norm": 3.7381134033203125, "learning_rate": 3.6015497484997665e-07, "loss": 0.3833, "num_input_tokens_seen": 27467360, "step": 26630 }, { "epoch": 17.816053511705686, "grad_norm": 4.072372913360596, "learning_rate": 3.590681235214527e-07, "loss": 0.4132, "num_input_tokens_seen": 27472032, "step": 26635 }, { "epoch": 17.819397993311036, "grad_norm": 2.6450347900390625, "learning_rate": 3.579828535048313e-07, "loss": 0.3648, "num_input_tokens_seen": 27476960, "step": 26640 }, { "epoch": 17.82274247491639, "grad_norm": 2.7704360485076904, "learning_rate": 3.568991651698961e-07, "loss": 0.4395, "num_input_tokens_seen": 27482176, "step": 26645 }, { "epoch": 17.82608695652174, "grad_norm": 3.2628862857818604, "learning_rate": 3.5581705888589724e-07, "loss": 0.3876, "num_input_tokens_seen": 27487424, "step": 26650 }, { "epoch": 17.82943143812709, "grad_norm": 3.593308210372925, "learning_rate": 3.547365350215398e-07, "loss": 0.3524, "num_input_tokens_seen": 27492896, "step": 26655 }, { "epoch": 17.83277591973244, "grad_norm": 3.3468940258026123, "learning_rate": 3.536575939449932e-07, "loss": 0.276, "num_input_tokens_seen": 27496768, "step": 26660 }, { "epoch": 17.836120401337794, "grad_norm": 2.370194911956787, "learning_rate": 3.5258023602388724e-07, "loss": 0.378, "num_input_tokens_seen": 27501376, "step": 26665 }, { "epoch": 17.839464882943144, "grad_norm": 3.8651552200317383, "learning_rate": 3.515044616253094e-07, "loss": 0.3843, "num_input_tokens_seen": 27506976, "step": 26670 }, { "epoch": 17.842809364548494, "grad_norm": 4.491761207580566, "learning_rate": 3.504302711158136e-07, "loss": 0.3324, "num_input_tokens_seen": 27511968, "step": 26675 }, { "epoch": 17.846153846153847, "grad_norm": 3.2208080291748047, "learning_rate": 3.493576648614078e-07, "loss": 0.3119, "num_input_tokens_seen": 27516736, "step": 26680 }, { "epoch": 17.849498327759196, "grad_norm": 3.7073745727539062, "learning_rate": 3.4828664322756446e-07, "loss": 0.347, "num_input_tokens_seen": 27521216, "step": 26685 }, { "epoch": 17.85284280936455, "grad_norm": 2.3522911071777344, "learning_rate": 3.4721720657921234e-07, "loss": 0.354, "num_input_tokens_seen": 27526912, "step": 26690 }, { "epoch": 17.8561872909699, "grad_norm": 2.6282129287719727, "learning_rate": 3.4614935528074436e-07, "loss": 0.3319, "num_input_tokens_seen": 27532288, "step": 26695 }, { "epoch": 17.859531772575252, "grad_norm": 3.0195014476776123, "learning_rate": 3.4508308969600946e-07, "loss": 0.2999, "num_input_tokens_seen": 27537536, "step": 26700 }, { "epoch": 17.862876254180602, "grad_norm": 3.378124237060547, "learning_rate": 3.440184101883193e-07, "loss": 0.383, "num_input_tokens_seen": 27542592, "step": 26705 }, { "epoch": 17.86622073578595, "grad_norm": 3.7438080310821533, "learning_rate": 3.429553171204436e-07, "loss": 0.2457, "num_input_tokens_seen": 27546400, "step": 26710 }, { "epoch": 17.869565217391305, "grad_norm": 3.472693681716919, "learning_rate": 3.418938108546099e-07, "loss": 0.3375, "num_input_tokens_seen": 27551328, "step": 26715 }, { "epoch": 17.872909698996654, "grad_norm": 3.8510396480560303, "learning_rate": 3.4083389175251005e-07, "loss": 0.3266, "num_input_tokens_seen": 27556288, "step": 26720 }, { "epoch": 17.876254180602007, "grad_norm": 2.8919482231140137, "learning_rate": 3.397755601752889e-07, "loss": 0.3479, "num_input_tokens_seen": 27562432, "step": 26725 }, { "epoch": 17.879598662207357, "grad_norm": 2.8666791915893555, "learning_rate": 3.3871881648355586e-07, "loss": 0.3534, "num_input_tokens_seen": 27567872, "step": 26730 }, { "epoch": 17.88294314381271, "grad_norm": 3.607072353363037, "learning_rate": 3.37663661037374e-07, "loss": 0.3141, "num_input_tokens_seen": 27572160, "step": 26735 }, { "epoch": 17.88628762541806, "grad_norm": 2.7552285194396973, "learning_rate": 3.366100941962713e-07, "loss": 0.3587, "num_input_tokens_seen": 27577280, "step": 26740 }, { "epoch": 17.889632107023413, "grad_norm": 2.8255012035369873, "learning_rate": 3.3555811631922895e-07, "loss": 0.3361, "num_input_tokens_seen": 27583008, "step": 26745 }, { "epoch": 17.892976588628763, "grad_norm": 2.9902384281158447, "learning_rate": 3.3450772776468965e-07, "loss": 0.4897, "num_input_tokens_seen": 27588736, "step": 26750 }, { "epoch": 17.896321070234112, "grad_norm": 1.7283596992492676, "learning_rate": 3.334589288905549e-07, "loss": 0.3083, "num_input_tokens_seen": 27594528, "step": 26755 }, { "epoch": 17.899665551839465, "grad_norm": 2.642033576965332, "learning_rate": 3.32411720054181e-07, "loss": 0.4057, "num_input_tokens_seen": 27599744, "step": 26760 }, { "epoch": 17.903010033444815, "grad_norm": 2.662970781326294, "learning_rate": 3.313661016123881e-07, "loss": 0.339, "num_input_tokens_seen": 27604704, "step": 26765 }, { "epoch": 17.906354515050168, "grad_norm": 3.138103723526001, "learning_rate": 3.303220739214491e-07, "loss": 0.3714, "num_input_tokens_seen": 27609408, "step": 26770 }, { "epoch": 17.909698996655518, "grad_norm": 2.9056785106658936, "learning_rate": 3.292796373370993e-07, "loss": 0.4842, "num_input_tokens_seen": 27614048, "step": 26775 }, { "epoch": 17.91304347826087, "grad_norm": 3.036236524581909, "learning_rate": 3.282387922145264e-07, "loss": 0.3568, "num_input_tokens_seen": 27619712, "step": 26780 }, { "epoch": 17.91638795986622, "grad_norm": 2.407609462738037, "learning_rate": 3.271995389083832e-07, "loss": 0.4356, "num_input_tokens_seen": 27624576, "step": 26785 }, { "epoch": 17.919732441471574, "grad_norm": 2.6463518142700195, "learning_rate": 3.261618777727732e-07, "loss": 0.3199, "num_input_tokens_seen": 27629952, "step": 26790 }, { "epoch": 17.923076923076923, "grad_norm": 3.294696569442749, "learning_rate": 3.2512580916126135e-07, "loss": 0.3767, "num_input_tokens_seen": 27635264, "step": 26795 }, { "epoch": 17.926421404682273, "grad_norm": 2.658893346786499, "learning_rate": 3.2409133342686914e-07, "loss": 0.3136, "num_input_tokens_seen": 27639840, "step": 26800 }, { "epoch": 17.929765886287626, "grad_norm": 2.4652132987976074, "learning_rate": 3.230584509220735e-07, "loss": 0.3003, "num_input_tokens_seen": 27644384, "step": 26805 }, { "epoch": 17.933110367892976, "grad_norm": 3.989793539047241, "learning_rate": 3.22027161998813e-07, "loss": 0.3218, "num_input_tokens_seen": 27649056, "step": 26810 }, { "epoch": 17.93645484949833, "grad_norm": 2.4756739139556885, "learning_rate": 3.209974670084776e-07, "loss": 0.3996, "num_input_tokens_seen": 27654176, "step": 26815 }, { "epoch": 17.93979933110368, "grad_norm": 3.031580686569214, "learning_rate": 3.1996936630191876e-07, "loss": 0.3996, "num_input_tokens_seen": 27659328, "step": 26820 }, { "epoch": 17.94314381270903, "grad_norm": 2.7135612964630127, "learning_rate": 3.1894286022944034e-07, "loss": 0.4471, "num_input_tokens_seen": 27664800, "step": 26825 }, { "epoch": 17.94648829431438, "grad_norm": 2.0941789150238037, "learning_rate": 3.1791794914080854e-07, "loss": 0.2967, "num_input_tokens_seen": 27669792, "step": 26830 }, { "epoch": 17.94983277591973, "grad_norm": 3.096346855163574, "learning_rate": 3.1689463338524017e-07, "loss": 0.4102, "num_input_tokens_seen": 27674976, "step": 26835 }, { "epoch": 17.953177257525084, "grad_norm": 2.1522271633148193, "learning_rate": 3.15872913311413e-07, "loss": 0.3147, "num_input_tokens_seen": 27680544, "step": 26840 }, { "epoch": 17.956521739130434, "grad_norm": 2.5939419269561768, "learning_rate": 3.148527892674591e-07, "loss": 0.3587, "num_input_tokens_seen": 27685024, "step": 26845 }, { "epoch": 17.959866220735787, "grad_norm": 2.300123453140259, "learning_rate": 3.138342616009654e-07, "loss": 0.354, "num_input_tokens_seen": 27689664, "step": 26850 }, { "epoch": 17.963210702341136, "grad_norm": 2.8341562747955322, "learning_rate": 3.128173306589777e-07, "loss": 0.4382, "num_input_tokens_seen": 27695232, "step": 26855 }, { "epoch": 17.96655518394649, "grad_norm": 2.5549826622009277, "learning_rate": 3.118019967879965e-07, "loss": 0.3976, "num_input_tokens_seen": 27700960, "step": 26860 }, { "epoch": 17.96989966555184, "grad_norm": 2.5226120948791504, "learning_rate": 3.1078826033397845e-07, "loss": 0.3144, "num_input_tokens_seen": 27706176, "step": 26865 }, { "epoch": 17.97324414715719, "grad_norm": 2.7115609645843506, "learning_rate": 3.0977612164233285e-07, "loss": 0.4291, "num_input_tokens_seen": 27711424, "step": 26870 }, { "epoch": 17.976588628762542, "grad_norm": 3.0915896892547607, "learning_rate": 3.0876558105793154e-07, "loss": 0.4118, "num_input_tokens_seen": 27717088, "step": 26875 }, { "epoch": 17.97993311036789, "grad_norm": 2.963258743286133, "learning_rate": 3.077566389250941e-07, "loss": 0.3742, "num_input_tokens_seen": 27722080, "step": 26880 }, { "epoch": 17.983277591973245, "grad_norm": 2.4926843643188477, "learning_rate": 3.0674929558760116e-07, "loss": 0.3313, "num_input_tokens_seen": 27728288, "step": 26885 }, { "epoch": 17.986622073578594, "grad_norm": 2.725224018096924, "learning_rate": 3.057435513886836e-07, "loss": 0.3722, "num_input_tokens_seen": 27733504, "step": 26890 }, { "epoch": 17.989966555183948, "grad_norm": 2.9989352226257324, "learning_rate": 3.047394066710341e-07, "loss": 0.2708, "num_input_tokens_seen": 27738368, "step": 26895 }, { "epoch": 17.993311036789297, "grad_norm": 2.8770220279693604, "learning_rate": 3.037368617767933e-07, "loss": 0.2999, "num_input_tokens_seen": 27742944, "step": 26900 }, { "epoch": 17.99665551839465, "grad_norm": 2.269383430480957, "learning_rate": 3.027359170475608e-07, "loss": 0.3078, "num_input_tokens_seen": 27748096, "step": 26905 }, { "epoch": 18.0, "grad_norm": 3.089867115020752, "learning_rate": 3.0173657282439097e-07, "loss": 0.3427, "num_input_tokens_seen": 27752896, "step": 26910 }, { "epoch": 18.0, "eval_loss": 0.5339722037315369, "eval_runtime": 37.6271, "eval_samples_per_second": 39.732, "eval_steps_per_second": 9.94, "num_input_tokens_seen": 27752896, "step": 26910 }, { "epoch": 18.00334448160535, "grad_norm": 3.118483066558838, "learning_rate": 3.0073882944778976e-07, "loss": 0.3729, "num_input_tokens_seen": 27758752, "step": 26915 }, { "epoch": 18.006688963210703, "grad_norm": 2.9888556003570557, "learning_rate": 2.997426872577225e-07, "loss": 0.3309, "num_input_tokens_seen": 27765088, "step": 26920 }, { "epoch": 18.010033444816052, "grad_norm": 4.298999786376953, "learning_rate": 2.987481465936043e-07, "loss": 0.4562, "num_input_tokens_seen": 27769600, "step": 26925 }, { "epoch": 18.013377926421406, "grad_norm": 3.565962791442871, "learning_rate": 2.97755207794308e-07, "loss": 0.4013, "num_input_tokens_seen": 27774688, "step": 26930 }, { "epoch": 18.016722408026755, "grad_norm": 3.3824894428253174, "learning_rate": 2.967638711981569e-07, "loss": 0.3568, "num_input_tokens_seen": 27779840, "step": 26935 }, { "epoch": 18.02006688963211, "grad_norm": 2.535773277282715, "learning_rate": 2.9577413714293345e-07, "loss": 0.3089, "num_input_tokens_seen": 27784928, "step": 26940 }, { "epoch": 18.023411371237458, "grad_norm": 2.1090404987335205, "learning_rate": 2.9478600596586915e-07, "loss": 0.3054, "num_input_tokens_seen": 27790432, "step": 26945 }, { "epoch": 18.02675585284281, "grad_norm": 2.9566783905029297, "learning_rate": 2.9379947800365236e-07, "loss": 0.3602, "num_input_tokens_seen": 27795232, "step": 26950 }, { "epoch": 18.03010033444816, "grad_norm": 3.4654476642608643, "learning_rate": 2.9281455359242483e-07, "loss": 0.3925, "num_input_tokens_seen": 27800384, "step": 26955 }, { "epoch": 18.03344481605351, "grad_norm": 3.6175355911254883, "learning_rate": 2.9183123306777917e-07, "loss": 0.4778, "num_input_tokens_seen": 27805824, "step": 26960 }, { "epoch": 18.036789297658864, "grad_norm": 2.4407706260681152, "learning_rate": 2.908495167647668e-07, "loss": 0.3838, "num_input_tokens_seen": 27811424, "step": 26965 }, { "epoch": 18.040133779264213, "grad_norm": 3.4304537773132324, "learning_rate": 2.8986940501788685e-07, "loss": 0.347, "num_input_tokens_seen": 27816736, "step": 26970 }, { "epoch": 18.043478260869566, "grad_norm": 3.022050380706787, "learning_rate": 2.8889089816109605e-07, "loss": 0.368, "num_input_tokens_seen": 27822464, "step": 26975 }, { "epoch": 18.046822742474916, "grad_norm": 3.1210286617279053, "learning_rate": 2.8791399652780103e-07, "loss": 0.3361, "num_input_tokens_seen": 27827776, "step": 26980 }, { "epoch": 18.05016722408027, "grad_norm": 3.8440659046173096, "learning_rate": 2.8693870045086446e-07, "loss": 0.3564, "num_input_tokens_seen": 27832512, "step": 26985 }, { "epoch": 18.05351170568562, "grad_norm": 2.249793767929077, "learning_rate": 2.859650102625999e-07, "loss": 0.3113, "num_input_tokens_seen": 27837088, "step": 26990 }, { "epoch": 18.05685618729097, "grad_norm": 3.8934578895568848, "learning_rate": 2.849929262947737e-07, "loss": 0.3205, "num_input_tokens_seen": 27841792, "step": 26995 }, { "epoch": 18.06020066889632, "grad_norm": 2.471362352371216, "learning_rate": 2.8402244887860696e-07, "loss": 0.405, "num_input_tokens_seen": 27847200, "step": 27000 }, { "epoch": 18.06354515050167, "grad_norm": 2.5225203037261963, "learning_rate": 2.8305357834476967e-07, "loss": 0.4069, "num_input_tokens_seen": 27852384, "step": 27005 }, { "epoch": 18.066889632107024, "grad_norm": 2.8218748569488525, "learning_rate": 2.8208631502338937e-07, "loss": 0.3419, "num_input_tokens_seen": 27857792, "step": 27010 }, { "epoch": 18.070234113712374, "grad_norm": 2.8435633182525635, "learning_rate": 2.8112065924404075e-07, "loss": 0.3506, "num_input_tokens_seen": 27862656, "step": 27015 }, { "epoch": 18.073578595317727, "grad_norm": 3.251857042312622, "learning_rate": 2.801566113357557e-07, "loss": 0.277, "num_input_tokens_seen": 27867136, "step": 27020 }, { "epoch": 18.076923076923077, "grad_norm": 4.466986656188965, "learning_rate": 2.791941716270125e-07, "loss": 0.3974, "num_input_tokens_seen": 27872160, "step": 27025 }, { "epoch": 18.08026755852843, "grad_norm": 3.9916412830352783, "learning_rate": 2.7823334044574766e-07, "loss": 0.3641, "num_input_tokens_seen": 27877536, "step": 27030 }, { "epoch": 18.08361204013378, "grad_norm": 3.0087008476257324, "learning_rate": 2.7727411811934504e-07, "loss": 0.3888, "num_input_tokens_seen": 27882976, "step": 27035 }, { "epoch": 18.08695652173913, "grad_norm": 3.6096251010894775, "learning_rate": 2.763165049746425e-07, "loss": 0.4686, "num_input_tokens_seen": 27889664, "step": 27040 }, { "epoch": 18.090301003344482, "grad_norm": 3.6307902336120605, "learning_rate": 2.753605013379301e-07, "loss": 0.3702, "num_input_tokens_seen": 27894944, "step": 27045 }, { "epoch": 18.093645484949832, "grad_norm": 2.5474932193756104, "learning_rate": 2.7440610753494524e-07, "loss": 0.3678, "num_input_tokens_seen": 27900352, "step": 27050 }, { "epoch": 18.096989966555185, "grad_norm": 2.759382486343384, "learning_rate": 2.734533238908843e-07, "loss": 0.3338, "num_input_tokens_seen": 27904864, "step": 27055 }, { "epoch": 18.100334448160535, "grad_norm": 2.376635789871216, "learning_rate": 2.725021507303871e-07, "loss": 0.3363, "num_input_tokens_seen": 27909408, "step": 27060 }, { "epoch": 18.103678929765888, "grad_norm": 2.390961170196533, "learning_rate": 2.7155258837755105e-07, "loss": 0.4335, "num_input_tokens_seen": 27914720, "step": 27065 }, { "epoch": 18.107023411371237, "grad_norm": 4.121286392211914, "learning_rate": 2.70604637155919e-07, "loss": 0.3613, "num_input_tokens_seen": 27919424, "step": 27070 }, { "epoch": 18.110367892976587, "grad_norm": 2.546677589416504, "learning_rate": 2.6965829738849093e-07, "loss": 0.3542, "num_input_tokens_seen": 27924800, "step": 27075 }, { "epoch": 18.11371237458194, "grad_norm": 2.9099209308624268, "learning_rate": 2.687135693977128e-07, "loss": 0.4234, "num_input_tokens_seen": 27929312, "step": 27080 }, { "epoch": 18.11705685618729, "grad_norm": 2.6546478271484375, "learning_rate": 2.6777045350548425e-07, "loss": 0.32, "num_input_tokens_seen": 27934304, "step": 27085 }, { "epoch": 18.120401337792643, "grad_norm": 5.740106582641602, "learning_rate": 2.6682895003315336e-07, "loss": 0.3339, "num_input_tokens_seen": 27939008, "step": 27090 }, { "epoch": 18.123745819397993, "grad_norm": 2.6479508876800537, "learning_rate": 2.6588905930152063e-07, "loss": 0.3834, "num_input_tokens_seen": 27944384, "step": 27095 }, { "epoch": 18.127090301003346, "grad_norm": 4.102469444274902, "learning_rate": 2.6495078163083666e-07, "loss": 0.4577, "num_input_tokens_seen": 27950080, "step": 27100 }, { "epoch": 18.130434782608695, "grad_norm": 2.563875436782837, "learning_rate": 2.6401411734080286e-07, "loss": 0.3184, "num_input_tokens_seen": 27955808, "step": 27105 }, { "epoch": 18.13377926421405, "grad_norm": 2.574625015258789, "learning_rate": 2.6307906675057003e-07, "loss": 0.313, "num_input_tokens_seen": 27960608, "step": 27110 }, { "epoch": 18.137123745819398, "grad_norm": 2.104480743408203, "learning_rate": 2.6214563017873727e-07, "loss": 0.3935, "num_input_tokens_seen": 27966112, "step": 27115 }, { "epoch": 18.140468227424748, "grad_norm": 2.9469902515411377, "learning_rate": 2.612138079433596e-07, "loss": 0.3418, "num_input_tokens_seen": 27970752, "step": 27120 }, { "epoch": 18.1438127090301, "grad_norm": 2.431156873703003, "learning_rate": 2.6028360036193525e-07, "loss": 0.329, "num_input_tokens_seen": 27975488, "step": 27125 }, { "epoch": 18.14715719063545, "grad_norm": 2.5102291107177734, "learning_rate": 2.5935500775141733e-07, "loss": 0.3804, "num_input_tokens_seen": 27980064, "step": 27130 }, { "epoch": 18.150501672240804, "grad_norm": 2.991731882095337, "learning_rate": 2.58428030428205e-07, "loss": 0.341, "num_input_tokens_seen": 27985312, "step": 27135 }, { "epoch": 18.153846153846153, "grad_norm": 3.21444034576416, "learning_rate": 2.575026687081489e-07, "loss": 0.3561, "num_input_tokens_seen": 27990112, "step": 27140 }, { "epoch": 18.157190635451506, "grad_norm": 2.2684895992279053, "learning_rate": 2.565789229065496e-07, "loss": 0.3496, "num_input_tokens_seen": 27995040, "step": 27145 }, { "epoch": 18.160535117056856, "grad_norm": 2.8236892223358154, "learning_rate": 2.5565679333815653e-07, "loss": 0.3452, "num_input_tokens_seen": 28000128, "step": 27150 }, { "epoch": 18.163879598662206, "grad_norm": 2.647655725479126, "learning_rate": 2.5473628031716823e-07, "loss": 0.3993, "num_input_tokens_seen": 28005792, "step": 27155 }, { "epoch": 18.16722408026756, "grad_norm": 3.374457359313965, "learning_rate": 2.538173841572311e-07, "loss": 0.4222, "num_input_tokens_seen": 28011904, "step": 27160 }, { "epoch": 18.17056856187291, "grad_norm": 2.755333423614502, "learning_rate": 2.5290010517144524e-07, "loss": 0.359, "num_input_tokens_seen": 28017088, "step": 27165 }, { "epoch": 18.17391304347826, "grad_norm": 4.016746520996094, "learning_rate": 2.519844436723534e-07, "loss": 0.3767, "num_input_tokens_seen": 28022848, "step": 27170 }, { "epoch": 18.17725752508361, "grad_norm": 3.4739363193511963, "learning_rate": 2.5107039997195215e-07, "loss": 0.3073, "num_input_tokens_seen": 28028160, "step": 27175 }, { "epoch": 18.180602006688964, "grad_norm": 2.309520959854126, "learning_rate": 2.5015797438168344e-07, "loss": 0.2667, "num_input_tokens_seen": 28033696, "step": 27180 }, { "epoch": 18.183946488294314, "grad_norm": 3.6656923294067383, "learning_rate": 2.492471672124408e-07, "loss": 0.4046, "num_input_tokens_seen": 28038336, "step": 27185 }, { "epoch": 18.187290969899667, "grad_norm": 3.588515520095825, "learning_rate": 2.4833797877456487e-07, "loss": 0.3611, "num_input_tokens_seen": 28043616, "step": 27190 }, { "epoch": 18.190635451505017, "grad_norm": 3.7462494373321533, "learning_rate": 2.47430409377844e-07, "loss": 0.3713, "num_input_tokens_seen": 28048992, "step": 27195 }, { "epoch": 18.193979933110366, "grad_norm": 2.585254192352295, "learning_rate": 2.4652445933151693e-07, "loss": 0.3558, "num_input_tokens_seen": 28053728, "step": 27200 }, { "epoch": 18.19732441471572, "grad_norm": 2.1428720951080322, "learning_rate": 2.4562012894426776e-07, "loss": 0.3619, "num_input_tokens_seen": 28058720, "step": 27205 }, { "epoch": 18.20066889632107, "grad_norm": 2.9681038856506348, "learning_rate": 2.447174185242324e-07, "loss": 0.3234, "num_input_tokens_seen": 28063840, "step": 27210 }, { "epoch": 18.204013377926422, "grad_norm": 4.389678955078125, "learning_rate": 2.438163283789913e-07, "loss": 0.4094, "num_input_tokens_seen": 28068928, "step": 27215 }, { "epoch": 18.207357859531772, "grad_norm": 2.6448841094970703, "learning_rate": 2.4291685881557504e-07, "loss": 0.4003, "num_input_tokens_seen": 28074528, "step": 27220 }, { "epoch": 18.210702341137125, "grad_norm": 3.512587070465088, "learning_rate": 2.4201901014046014e-07, "loss": 0.334, "num_input_tokens_seen": 28079296, "step": 27225 }, { "epoch": 18.214046822742475, "grad_norm": 3.1821982860565186, "learning_rate": 2.4112278265957356e-07, "loss": 0.3752, "num_input_tokens_seen": 28084096, "step": 27230 }, { "epoch": 18.217391304347824, "grad_norm": 3.6304821968078613, "learning_rate": 2.402281766782866e-07, "loss": 0.3834, "num_input_tokens_seen": 28088896, "step": 27235 }, { "epoch": 18.220735785953178, "grad_norm": 2.5688705444335938, "learning_rate": 2.3933519250142143e-07, "loss": 0.3463, "num_input_tokens_seen": 28094176, "step": 27240 }, { "epoch": 18.224080267558527, "grad_norm": 2.9629275798797607, "learning_rate": 2.3844383043324537e-07, "loss": 0.3925, "num_input_tokens_seen": 28098816, "step": 27245 }, { "epoch": 18.22742474916388, "grad_norm": 3.643768548965454, "learning_rate": 2.3755409077747205e-07, "loss": 0.3042, "num_input_tokens_seen": 28103840, "step": 27250 }, { "epoch": 18.23076923076923, "grad_norm": 2.9389991760253906, "learning_rate": 2.3666597383726675e-07, "loss": 0.3908, "num_input_tokens_seen": 28109376, "step": 27255 }, { "epoch": 18.234113712374583, "grad_norm": 3.7355237007141113, "learning_rate": 2.357794799152363e-07, "loss": 0.3938, "num_input_tokens_seen": 28114368, "step": 27260 }, { "epoch": 18.237458193979933, "grad_norm": 2.9103119373321533, "learning_rate": 2.3489460931343911e-07, "loss": 0.3784, "num_input_tokens_seen": 28120928, "step": 27265 }, { "epoch": 18.240802675585286, "grad_norm": 2.635392427444458, "learning_rate": 2.340113623333773e-07, "loss": 0.3164, "num_input_tokens_seen": 28126144, "step": 27270 }, { "epoch": 18.244147157190636, "grad_norm": 2.1451776027679443, "learning_rate": 2.3312973927600069e-07, "loss": 0.3175, "num_input_tokens_seen": 28130752, "step": 27275 }, { "epoch": 18.247491638795985, "grad_norm": 2.066183567047119, "learning_rate": 2.3224974044170679e-07, "loss": 0.3561, "num_input_tokens_seen": 28135328, "step": 27280 }, { "epoch": 18.25083612040134, "grad_norm": 2.491046905517578, "learning_rate": 2.3137136613033962e-07, "loss": 0.4077, "num_input_tokens_seen": 28140192, "step": 27285 }, { "epoch": 18.254180602006688, "grad_norm": 3.0128965377807617, "learning_rate": 2.3049461664118754e-07, "loss": 0.3628, "num_input_tokens_seen": 28145792, "step": 27290 }, { "epoch": 18.25752508361204, "grad_norm": 2.708712100982666, "learning_rate": 2.2961949227298718e-07, "loss": 0.337, "num_input_tokens_seen": 28151008, "step": 27295 }, { "epoch": 18.26086956521739, "grad_norm": 3.1186795234680176, "learning_rate": 2.2874599332392165e-07, "loss": 0.3266, "num_input_tokens_seen": 28155552, "step": 27300 }, { "epoch": 18.264214046822744, "grad_norm": 3.333993434906006, "learning_rate": 2.2787412009161903e-07, "loss": 0.3588, "num_input_tokens_seen": 28160640, "step": 27305 }, { "epoch": 18.267558528428093, "grad_norm": 2.093212604522705, "learning_rate": 2.2700387287315494e-07, "loss": 0.3282, "num_input_tokens_seen": 28165472, "step": 27310 }, { "epoch": 18.270903010033443, "grad_norm": 2.8786909580230713, "learning_rate": 2.2613525196504893e-07, "loss": 0.413, "num_input_tokens_seen": 28171168, "step": 27315 }, { "epoch": 18.274247491638796, "grad_norm": 2.7211430072784424, "learning_rate": 2.2526825766326754e-07, "loss": 0.4201, "num_input_tokens_seen": 28176704, "step": 27320 }, { "epoch": 18.277591973244146, "grad_norm": 2.8561689853668213, "learning_rate": 2.2440289026322393e-07, "loss": 0.3732, "num_input_tokens_seen": 28180992, "step": 27325 }, { "epoch": 18.2809364548495, "grad_norm": 3.0466296672821045, "learning_rate": 2.235391500597761e-07, "loss": 0.3715, "num_input_tokens_seen": 28185760, "step": 27330 }, { "epoch": 18.28428093645485, "grad_norm": 2.6958539485931396, "learning_rate": 2.2267703734722644e-07, "loss": 0.2999, "num_input_tokens_seen": 28190656, "step": 27335 }, { "epoch": 18.287625418060202, "grad_norm": 2.584773540496826, "learning_rate": 2.2181655241932498e-07, "loss": 0.3335, "num_input_tokens_seen": 28196416, "step": 27340 }, { "epoch": 18.29096989966555, "grad_norm": 2.3111534118652344, "learning_rate": 2.209576955692655e-07, "loss": 0.3747, "num_input_tokens_seen": 28200864, "step": 27345 }, { "epoch": 18.294314381270905, "grad_norm": 2.069149971008301, "learning_rate": 2.2010046708968778e-07, "loss": 0.4053, "num_input_tokens_seen": 28206784, "step": 27350 }, { "epoch": 18.297658862876254, "grad_norm": 3.345275640487671, "learning_rate": 2.1924486727267713e-07, "loss": 0.3935, "num_input_tokens_seen": 28212576, "step": 27355 }, { "epoch": 18.301003344481604, "grad_norm": 2.1684470176696777, "learning_rate": 2.1839089640976252e-07, "loss": 0.3584, "num_input_tokens_seen": 28217536, "step": 27360 }, { "epoch": 18.304347826086957, "grad_norm": 3.112337827682495, "learning_rate": 2.175385547919201e-07, "loss": 0.3713, "num_input_tokens_seen": 28223104, "step": 27365 }, { "epoch": 18.307692307692307, "grad_norm": 2.550351142883301, "learning_rate": 2.1668784270956754e-07, "loss": 0.4041, "num_input_tokens_seen": 28228512, "step": 27370 }, { "epoch": 18.31103678929766, "grad_norm": 3.0988481044769287, "learning_rate": 2.158387604525719e-07, "loss": 0.3779, "num_input_tokens_seen": 28234176, "step": 27375 }, { "epoch": 18.31438127090301, "grad_norm": 2.921675205230713, "learning_rate": 2.149913083102395e-07, "loss": 0.3484, "num_input_tokens_seen": 28239232, "step": 27380 }, { "epoch": 18.317725752508363, "grad_norm": 2.526888370513916, "learning_rate": 2.141454865713255e-07, "loss": 0.3201, "num_input_tokens_seen": 28244576, "step": 27385 }, { "epoch": 18.321070234113712, "grad_norm": 3.3381104469299316, "learning_rate": 2.1330129552402823e-07, "loss": 0.3215, "num_input_tokens_seen": 28249376, "step": 27390 }, { "epoch": 18.324414715719065, "grad_norm": 5.136075019836426, "learning_rate": 2.1245873545598928e-07, "loss": 0.3984, "num_input_tokens_seen": 28254080, "step": 27395 }, { "epoch": 18.327759197324415, "grad_norm": 2.319983720779419, "learning_rate": 2.1161780665429732e-07, "loss": 0.3627, "num_input_tokens_seen": 28259808, "step": 27400 }, { "epoch": 18.331103678929765, "grad_norm": 2.6909544467926025, "learning_rate": 2.107785094054804e-07, "loss": 0.3661, "num_input_tokens_seen": 28264736, "step": 27405 }, { "epoch": 18.334448160535118, "grad_norm": 4.114666938781738, "learning_rate": 2.099408439955164e-07, "loss": 0.3609, "num_input_tokens_seen": 28269600, "step": 27410 }, { "epoch": 18.337792642140467, "grad_norm": 2.4226865768432617, "learning_rate": 2.0910481070982257e-07, "loss": 0.3553, "num_input_tokens_seen": 28275072, "step": 27415 }, { "epoch": 18.34113712374582, "grad_norm": 3.6677732467651367, "learning_rate": 2.082704098332633e-07, "loss": 0.3137, "num_input_tokens_seen": 28279360, "step": 27420 }, { "epoch": 18.34448160535117, "grad_norm": 2.5761067867279053, "learning_rate": 2.0743764165014334e-07, "loss": 0.302, "num_input_tokens_seen": 28285152, "step": 27425 }, { "epoch": 18.347826086956523, "grad_norm": 2.0030317306518555, "learning_rate": 2.0660650644421465e-07, "loss": 0.3504, "num_input_tokens_seen": 28289920, "step": 27430 }, { "epoch": 18.351170568561873, "grad_norm": 2.4865713119506836, "learning_rate": 2.057770044986701e-07, "loss": 0.4285, "num_input_tokens_seen": 28295808, "step": 27435 }, { "epoch": 18.354515050167223, "grad_norm": 2.8016512393951416, "learning_rate": 2.049491360961481e-07, "loss": 0.3229, "num_input_tokens_seen": 28300640, "step": 27440 }, { "epoch": 18.357859531772576, "grad_norm": 3.464533805847168, "learning_rate": 2.0412290151873014e-07, "loss": 0.346, "num_input_tokens_seen": 28306208, "step": 27445 }, { "epoch": 18.361204013377925, "grad_norm": 3.1923158168792725, "learning_rate": 2.032983010479378e-07, "loss": 0.402, "num_input_tokens_seen": 28312224, "step": 27450 }, { "epoch": 18.36454849498328, "grad_norm": 2.636868953704834, "learning_rate": 2.0247533496474126e-07, "loss": 0.2965, "num_input_tokens_seen": 28317632, "step": 27455 }, { "epoch": 18.367892976588628, "grad_norm": 3.493466854095459, "learning_rate": 2.0165400354954955e-07, "loss": 0.3924, "num_input_tokens_seen": 28323136, "step": 27460 }, { "epoch": 18.37123745819398, "grad_norm": 3.1105494499206543, "learning_rate": 2.0083430708221774e-07, "loss": 0.3768, "num_input_tokens_seen": 28328384, "step": 27465 }, { "epoch": 18.37458193979933, "grad_norm": 3.0096492767333984, "learning_rate": 2.000162458420396e-07, "loss": 0.4207, "num_input_tokens_seen": 28333344, "step": 27470 }, { "epoch": 18.377926421404684, "grad_norm": 3.427438974380493, "learning_rate": 1.991998201077566e-07, "loss": 0.3658, "num_input_tokens_seen": 28338528, "step": 27475 }, { "epoch": 18.381270903010034, "grad_norm": 3.3616650104522705, "learning_rate": 1.9838503015754963e-07, "loss": 0.3204, "num_input_tokens_seen": 28343808, "step": 27480 }, { "epoch": 18.384615384615383, "grad_norm": 2.0484235286712646, "learning_rate": 1.9757187626904373e-07, "loss": 0.3683, "num_input_tokens_seen": 28348576, "step": 27485 }, { "epoch": 18.387959866220736, "grad_norm": 2.3465545177459717, "learning_rate": 1.9676035871930676e-07, "loss": 0.4258, "num_input_tokens_seen": 28354272, "step": 27490 }, { "epoch": 18.391304347826086, "grad_norm": 2.8168106079101562, "learning_rate": 1.95950477784847e-07, "loss": 0.2878, "num_input_tokens_seen": 28358208, "step": 27495 }, { "epoch": 18.39464882943144, "grad_norm": 3.1918365955352783, "learning_rate": 1.9514223374161757e-07, "loss": 0.2602, "num_input_tokens_seen": 28363520, "step": 27500 }, { "epoch": 18.39799331103679, "grad_norm": 2.457789659500122, "learning_rate": 1.9433562686501206e-07, "loss": 0.353, "num_input_tokens_seen": 28368768, "step": 27505 }, { "epoch": 18.401337792642142, "grad_norm": 2.4520158767700195, "learning_rate": 1.9353065742986733e-07, "loss": 0.3472, "num_input_tokens_seen": 28373504, "step": 27510 }, { "epoch": 18.40468227424749, "grad_norm": 2.517347574234009, "learning_rate": 1.9272732571046116e-07, "loss": 0.3024, "num_input_tokens_seen": 28379232, "step": 27515 }, { "epoch": 18.40802675585284, "grad_norm": 3.6610405445098877, "learning_rate": 1.9192563198051463e-07, "loss": 0.4173, "num_input_tokens_seen": 28384576, "step": 27520 }, { "epoch": 18.411371237458194, "grad_norm": 2.8377771377563477, "learning_rate": 1.911255765131903e-07, "loss": 0.436, "num_input_tokens_seen": 28389120, "step": 27525 }, { "epoch": 18.414715719063544, "grad_norm": 2.3701107501983643, "learning_rate": 1.9032715958109181e-07, "loss": 0.4912, "num_input_tokens_seen": 28394304, "step": 27530 }, { "epoch": 18.418060200668897, "grad_norm": 3.2208213806152344, "learning_rate": 1.895303814562649e-07, "loss": 0.3506, "num_input_tokens_seen": 28399520, "step": 27535 }, { "epoch": 18.421404682274247, "grad_norm": 2.6630802154541016, "learning_rate": 1.8873524241019736e-07, "loss": 0.2176, "num_input_tokens_seen": 28404192, "step": 27540 }, { "epoch": 18.4247491638796, "grad_norm": 2.0311195850372314, "learning_rate": 1.8794174271381803e-07, "loss": 0.3575, "num_input_tokens_seen": 28409440, "step": 27545 }, { "epoch": 18.42809364548495, "grad_norm": 2.708509683609009, "learning_rate": 1.8714988263749677e-07, "loss": 0.3774, "num_input_tokens_seen": 28414592, "step": 27550 }, { "epoch": 18.431438127090303, "grad_norm": 3.0030744075775146, "learning_rate": 1.8635966245104663e-07, "loss": 0.4037, "num_input_tokens_seen": 28420704, "step": 27555 }, { "epoch": 18.434782608695652, "grad_norm": 2.190293788909912, "learning_rate": 1.8557108242371946e-07, "loss": 0.394, "num_input_tokens_seen": 28426528, "step": 27560 }, { "epoch": 18.438127090301002, "grad_norm": 3.0065486431121826, "learning_rate": 1.8478414282420976e-07, "loss": 0.3643, "num_input_tokens_seen": 28431136, "step": 27565 }, { "epoch": 18.441471571906355, "grad_norm": 2.965111255645752, "learning_rate": 1.8399884392065193e-07, "loss": 0.3706, "num_input_tokens_seen": 28436768, "step": 27570 }, { "epoch": 18.444816053511705, "grad_norm": 4.314136981964111, "learning_rate": 1.832151859806236e-07, "loss": 0.3222, "num_input_tokens_seen": 28441952, "step": 27575 }, { "epoch": 18.448160535117058, "grad_norm": 2.6702425479888916, "learning_rate": 1.8243316927114007e-07, "loss": 0.39, "num_input_tokens_seen": 28446720, "step": 27580 }, { "epoch": 18.451505016722408, "grad_norm": 2.0896964073181152, "learning_rate": 1.8165279405866043e-07, "loss": 0.2977, "num_input_tokens_seen": 28452000, "step": 27585 }, { "epoch": 18.45484949832776, "grad_norm": 2.973546266555786, "learning_rate": 1.8087406060908198e-07, "loss": 0.3531, "num_input_tokens_seen": 28457920, "step": 27590 }, { "epoch": 18.45819397993311, "grad_norm": 2.954986333847046, "learning_rate": 1.8009696918774466e-07, "loss": 0.3204, "num_input_tokens_seen": 28463392, "step": 27595 }, { "epoch": 18.46153846153846, "grad_norm": 2.836819648742676, "learning_rate": 1.793215200594284e-07, "loss": 0.3715, "num_input_tokens_seen": 28468096, "step": 27600 }, { "epoch": 18.464882943143813, "grad_norm": 2.344592809677124, "learning_rate": 1.7854771348835175e-07, "loss": 0.3616, "num_input_tokens_seen": 28472832, "step": 27605 }, { "epoch": 18.468227424749163, "grad_norm": 2.8301639556884766, "learning_rate": 1.7777554973817558e-07, "loss": 0.465, "num_input_tokens_seen": 28477824, "step": 27610 }, { "epoch": 18.471571906354516, "grad_norm": 3.3576531410217285, "learning_rate": 1.7700502907200102e-07, "loss": 0.47, "num_input_tokens_seen": 28483456, "step": 27615 }, { "epoch": 18.474916387959865, "grad_norm": 3.838115692138672, "learning_rate": 1.762361517523692e-07, "loss": 0.3122, "num_input_tokens_seen": 28488544, "step": 27620 }, { "epoch": 18.47826086956522, "grad_norm": 2.5671160221099854, "learning_rate": 1.7546891804125888e-07, "loss": 0.3323, "num_input_tokens_seen": 28494368, "step": 27625 }, { "epoch": 18.48160535117057, "grad_norm": 2.078718662261963, "learning_rate": 1.747033282000926e-07, "loss": 0.3524, "num_input_tokens_seen": 28499424, "step": 27630 }, { "epoch": 18.48494983277592, "grad_norm": 2.6253039836883545, "learning_rate": 1.739393824897301e-07, "loss": 0.4078, "num_input_tokens_seen": 28504768, "step": 27635 }, { "epoch": 18.48829431438127, "grad_norm": 2.21329665184021, "learning_rate": 1.7317708117047194e-07, "loss": 0.3319, "num_input_tokens_seen": 28509920, "step": 27640 }, { "epoch": 18.49163879598662, "grad_norm": 3.25176739692688, "learning_rate": 1.724164245020593e-07, "loss": 0.3335, "num_input_tokens_seen": 28514688, "step": 27645 }, { "epoch": 18.494983277591974, "grad_norm": 2.8544552326202393, "learning_rate": 1.7165741274367042e-07, "loss": 0.3844, "num_input_tokens_seen": 28518976, "step": 27650 }, { "epoch": 18.498327759197323, "grad_norm": 2.7378573417663574, "learning_rate": 1.7090004615392453e-07, "loss": 0.4143, "num_input_tokens_seen": 28523264, "step": 27655 }, { "epoch": 18.501672240802677, "grad_norm": 2.898261547088623, "learning_rate": 1.701443249908813e-07, "loss": 0.3933, "num_input_tokens_seen": 28527968, "step": 27660 }, { "epoch": 18.505016722408026, "grad_norm": 3.545356512069702, "learning_rate": 1.6939024951203863e-07, "loss": 0.3632, "num_input_tokens_seen": 28533312, "step": 27665 }, { "epoch": 18.50836120401338, "grad_norm": 2.6528987884521484, "learning_rate": 1.6863781997433327e-07, "loss": 0.3624, "num_input_tokens_seen": 28538016, "step": 27670 }, { "epoch": 18.51170568561873, "grad_norm": 2.277604818344116, "learning_rate": 1.6788703663414175e-07, "loss": 0.3121, "num_input_tokens_seen": 28543424, "step": 27675 }, { "epoch": 18.51505016722408, "grad_norm": 2.6720962524414062, "learning_rate": 1.6713789974728e-07, "loss": 0.3783, "num_input_tokens_seen": 28549088, "step": 27680 }, { "epoch": 18.51839464882943, "grad_norm": 2.6753017902374268, "learning_rate": 1.6639040956900222e-07, "loss": 0.3503, "num_input_tokens_seen": 28554464, "step": 27685 }, { "epoch": 18.52173913043478, "grad_norm": 2.9709744453430176, "learning_rate": 1.6564456635400295e-07, "loss": 0.3136, "num_input_tokens_seen": 28559520, "step": 27690 }, { "epoch": 18.525083612040135, "grad_norm": 2.238208532333374, "learning_rate": 1.6490037035641338e-07, "loss": 0.2908, "num_input_tokens_seen": 28564256, "step": 27695 }, { "epoch": 18.528428093645484, "grad_norm": 3.4864416122436523, "learning_rate": 1.6415782182980455e-07, "loss": 0.3433, "num_input_tokens_seen": 28568512, "step": 27700 }, { "epoch": 18.531772575250837, "grad_norm": 3.298772096633911, "learning_rate": 1.634169210271863e-07, "loss": 0.4829, "num_input_tokens_seen": 28574336, "step": 27705 }, { "epoch": 18.535117056856187, "grad_norm": 2.91448712348938, "learning_rate": 1.6267766820100784e-07, "loss": 0.4305, "num_input_tokens_seen": 28579296, "step": 27710 }, { "epoch": 18.53846153846154, "grad_norm": 2.9554038047790527, "learning_rate": 1.619400636031543e-07, "loss": 0.3963, "num_input_tokens_seen": 28584128, "step": 27715 }, { "epoch": 18.54180602006689, "grad_norm": 2.593282699584961, "learning_rate": 1.6120410748495242e-07, "loss": 0.318, "num_input_tokens_seen": 28589696, "step": 27720 }, { "epoch": 18.54515050167224, "grad_norm": 3.6591856479644775, "learning_rate": 1.604698000971644e-07, "loss": 0.3194, "num_input_tokens_seen": 28594432, "step": 27725 }, { "epoch": 18.548494983277592, "grad_norm": 2.3227438926696777, "learning_rate": 1.5973714168999344e-07, "loss": 0.3303, "num_input_tokens_seen": 28599168, "step": 27730 }, { "epoch": 18.551839464882942, "grad_norm": 2.246168375015259, "learning_rate": 1.5900613251307762e-07, "loss": 0.3579, "num_input_tokens_seen": 28604992, "step": 27735 }, { "epoch": 18.555183946488295, "grad_norm": 4.550625324249268, "learning_rate": 1.5827677281549548e-07, "loss": 0.388, "num_input_tokens_seen": 28610464, "step": 27740 }, { "epoch": 18.558528428093645, "grad_norm": 2.40155029296875, "learning_rate": 1.575490628457632e-07, "loss": 0.3918, "num_input_tokens_seen": 28615552, "step": 27745 }, { "epoch": 18.561872909698998, "grad_norm": 2.5772085189819336, "learning_rate": 1.5682300285183415e-07, "loss": 0.375, "num_input_tokens_seen": 28620832, "step": 27750 }, { "epoch": 18.565217391304348, "grad_norm": 3.516836643218994, "learning_rate": 1.5609859308110098e-07, "loss": 0.3223, "num_input_tokens_seen": 28626432, "step": 27755 }, { "epoch": 18.568561872909697, "grad_norm": 2.3670153617858887, "learning_rate": 1.5537583378039123e-07, "loss": 0.3384, "num_input_tokens_seen": 28631616, "step": 27760 }, { "epoch": 18.57190635451505, "grad_norm": 2.6341166496276855, "learning_rate": 1.5465472519597292e-07, "loss": 0.4098, "num_input_tokens_seen": 28637600, "step": 27765 }, { "epoch": 18.5752508361204, "grad_norm": 2.1857945919036865, "learning_rate": 1.5393526757355004e-07, "loss": 0.3914, "num_input_tokens_seen": 28643200, "step": 27770 }, { "epoch": 18.578595317725753, "grad_norm": 4.2500433921813965, "learning_rate": 1.5321746115826543e-07, "loss": 0.364, "num_input_tokens_seen": 28648256, "step": 27775 }, { "epoch": 18.581939799331103, "grad_norm": 3.0416839122772217, "learning_rate": 1.525013061946967e-07, "loss": 0.3955, "num_input_tokens_seen": 28652992, "step": 27780 }, { "epoch": 18.585284280936456, "grad_norm": 2.5709738731384277, "learning_rate": 1.5178680292686155e-07, "loss": 0.3583, "num_input_tokens_seen": 28658240, "step": 27785 }, { "epoch": 18.588628762541806, "grad_norm": 2.4047720432281494, "learning_rate": 1.5107395159821347e-07, "loss": 0.326, "num_input_tokens_seen": 28663520, "step": 27790 }, { "epoch": 18.59197324414716, "grad_norm": 2.4420840740203857, "learning_rate": 1.5036275245164377e-07, "loss": 0.443, "num_input_tokens_seen": 28668576, "step": 27795 }, { "epoch": 18.59531772575251, "grad_norm": 3.512861967086792, "learning_rate": 1.4965320572948084e-07, "loss": 0.3307, "num_input_tokens_seen": 28673344, "step": 27800 }, { "epoch": 18.598662207357858, "grad_norm": 2.653677463531494, "learning_rate": 1.4894531167348792e-07, "loss": 0.3245, "num_input_tokens_seen": 28678336, "step": 27805 }, { "epoch": 18.60200668896321, "grad_norm": 3.172494888305664, "learning_rate": 1.482390705248682e-07, "loss": 0.3311, "num_input_tokens_seen": 28683584, "step": 27810 }, { "epoch": 18.60535117056856, "grad_norm": 3.303407907485962, "learning_rate": 1.4753448252425972e-07, "loss": 0.3537, "num_input_tokens_seen": 28689376, "step": 27815 }, { "epoch": 18.608695652173914, "grad_norm": 3.276592254638672, "learning_rate": 1.4683154791173883e-07, "loss": 0.293, "num_input_tokens_seen": 28693856, "step": 27820 }, { "epoch": 18.612040133779264, "grad_norm": 2.4849157333374023, "learning_rate": 1.4613026692681663e-07, "loss": 0.3438, "num_input_tokens_seen": 28699008, "step": 27825 }, { "epoch": 18.615384615384617, "grad_norm": 2.5213818550109863, "learning_rate": 1.4543063980844153e-07, "loss": 0.3386, "num_input_tokens_seen": 28704512, "step": 27830 }, { "epoch": 18.618729096989966, "grad_norm": 2.401353597640991, "learning_rate": 1.4473266679499888e-07, "loss": 0.304, "num_input_tokens_seen": 28709600, "step": 27835 }, { "epoch": 18.62207357859532, "grad_norm": 2.509195566177368, "learning_rate": 1.440363481243101e-07, "loss": 0.3322, "num_input_tokens_seen": 28715936, "step": 27840 }, { "epoch": 18.62541806020067, "grad_norm": 2.64635968208313, "learning_rate": 1.433416840336338e-07, "loss": 0.4956, "num_input_tokens_seen": 28720992, "step": 27845 }, { "epoch": 18.62876254180602, "grad_norm": 2.4113271236419678, "learning_rate": 1.4264867475966227e-07, "loss": 0.3151, "num_input_tokens_seen": 28725952, "step": 27850 }, { "epoch": 18.632107023411372, "grad_norm": 2.6997733116149902, "learning_rate": 1.4195732053852663e-07, "loss": 0.403, "num_input_tokens_seen": 28731232, "step": 27855 }, { "epoch": 18.63545150501672, "grad_norm": 3.6960337162017822, "learning_rate": 1.4126762160579287e-07, "loss": 0.4673, "num_input_tokens_seen": 28736096, "step": 27860 }, { "epoch": 18.638795986622075, "grad_norm": 3.091120481491089, "learning_rate": 1.4057957819646417e-07, "loss": 0.4055, "num_input_tokens_seen": 28741184, "step": 27865 }, { "epoch": 18.642140468227424, "grad_norm": 3.3220715522766113, "learning_rate": 1.3989319054497686e-07, "loss": 0.3917, "num_input_tokens_seen": 28745856, "step": 27870 }, { "epoch": 18.645484949832777, "grad_norm": 3.3722782135009766, "learning_rate": 1.392084588852055e-07, "loss": 0.2992, "num_input_tokens_seen": 28751008, "step": 27875 }, { "epoch": 18.648829431438127, "grad_norm": 4.239439964294434, "learning_rate": 1.3852538345046019e-07, "loss": 0.3484, "num_input_tokens_seen": 28756032, "step": 27880 }, { "epoch": 18.652173913043477, "grad_norm": 2.492382287979126, "learning_rate": 1.378439644734858e-07, "loss": 0.3125, "num_input_tokens_seen": 28760000, "step": 27885 }, { "epoch": 18.65551839464883, "grad_norm": 4.95097017288208, "learning_rate": 1.3716420218646442e-07, "loss": 0.3448, "num_input_tokens_seen": 28764384, "step": 27890 }, { "epoch": 18.65886287625418, "grad_norm": 2.3789222240448, "learning_rate": 1.3648609682101077e-07, "loss": 0.3692, "num_input_tokens_seen": 28769216, "step": 27895 }, { "epoch": 18.662207357859533, "grad_norm": 2.8298864364624023, "learning_rate": 1.358096486081778e-07, "loss": 0.3459, "num_input_tokens_seen": 28774080, "step": 27900 }, { "epoch": 18.665551839464882, "grad_norm": 3.4503538608551025, "learning_rate": 1.3513485777845225e-07, "loss": 0.4181, "num_input_tokens_seen": 28778816, "step": 27905 }, { "epoch": 18.668896321070235, "grad_norm": 3.5455589294433594, "learning_rate": 1.3446172456175745e-07, "loss": 0.4046, "num_input_tokens_seen": 28783360, "step": 27910 }, { "epoch": 18.672240802675585, "grad_norm": 3.476839780807495, "learning_rate": 1.3379024918745042e-07, "loss": 0.3529, "num_input_tokens_seen": 28788928, "step": 27915 }, { "epoch": 18.675585284280935, "grad_norm": 2.565764904022217, "learning_rate": 1.3312043188432378e-07, "loss": 0.3745, "num_input_tokens_seen": 28794240, "step": 27920 }, { "epoch": 18.678929765886288, "grad_norm": 3.0090155601501465, "learning_rate": 1.3245227288060603e-07, "loss": 0.3443, "num_input_tokens_seen": 28799456, "step": 27925 }, { "epoch": 18.682274247491637, "grad_norm": 2.167170524597168, "learning_rate": 1.3178577240396063e-07, "loss": 0.3378, "num_input_tokens_seen": 28803904, "step": 27930 }, { "epoch": 18.68561872909699, "grad_norm": 2.7744452953338623, "learning_rate": 1.3112093068148368e-07, "loss": 0.3782, "num_input_tokens_seen": 28808864, "step": 27935 }, { "epoch": 18.68896321070234, "grad_norm": 3.0826618671417236, "learning_rate": 1.3045774793970843e-07, "loss": 0.3547, "num_input_tokens_seen": 28813664, "step": 27940 }, { "epoch": 18.692307692307693, "grad_norm": 3.2369067668914795, "learning_rate": 1.297962244046025e-07, "loss": 0.3919, "num_input_tokens_seen": 28817984, "step": 27945 }, { "epoch": 18.695652173913043, "grad_norm": 3.500418186187744, "learning_rate": 1.2913636030156772e-07, "loss": 0.388, "num_input_tokens_seen": 28823392, "step": 27950 }, { "epoch": 18.698996655518396, "grad_norm": 2.5564024448394775, "learning_rate": 1.2847815585544044e-07, "loss": 0.3231, "num_input_tokens_seen": 28828160, "step": 27955 }, { "epoch": 18.702341137123746, "grad_norm": 3.353745937347412, "learning_rate": 1.2782161129049176e-07, "loss": 0.3378, "num_input_tokens_seen": 28833504, "step": 27960 }, { "epoch": 18.705685618729095, "grad_norm": 2.7409517765045166, "learning_rate": 1.2716672683042719e-07, "loss": 0.3822, "num_input_tokens_seen": 28839072, "step": 27965 }, { "epoch": 18.70903010033445, "grad_norm": 3.758579730987549, "learning_rate": 1.2651350269838603e-07, "loss": 0.3532, "num_input_tokens_seen": 28843872, "step": 27970 }, { "epoch": 18.712374581939798, "grad_norm": 2.4215264320373535, "learning_rate": 1.2586193911694355e-07, "loss": 0.3, "num_input_tokens_seen": 28849184, "step": 27975 }, { "epoch": 18.71571906354515, "grad_norm": 2.393103837966919, "learning_rate": 1.2521203630810664e-07, "loss": 0.3983, "num_input_tokens_seen": 28854688, "step": 27980 }, { "epoch": 18.7190635451505, "grad_norm": 2.90238094329834, "learning_rate": 1.2456379449331813e-07, "loss": 0.417, "num_input_tokens_seen": 28860160, "step": 27985 }, { "epoch": 18.722408026755854, "grad_norm": 3.6830101013183594, "learning_rate": 1.2391721389345468e-07, "loss": 0.3144, "num_input_tokens_seen": 28864768, "step": 27990 }, { "epoch": 18.725752508361204, "grad_norm": 2.3266496658325195, "learning_rate": 1.2327229472882675e-07, "loss": 0.3374, "num_input_tokens_seen": 28870368, "step": 27995 }, { "epoch": 18.729096989966557, "grad_norm": 2.8045413494110107, "learning_rate": 1.2262903721917907e-07, "loss": 0.2787, "num_input_tokens_seen": 28875552, "step": 28000 }, { "epoch": 18.732441471571907, "grad_norm": 2.1127946376800537, "learning_rate": 1.2198744158368858e-07, "loss": 0.4221, "num_input_tokens_seen": 28881696, "step": 28005 }, { "epoch": 18.735785953177256, "grad_norm": 2.0748963356018066, "learning_rate": 1.2134750804096818e-07, "loss": 0.2969, "num_input_tokens_seen": 28887520, "step": 28010 }, { "epoch": 18.73913043478261, "grad_norm": 2.85498309135437, "learning_rate": 1.2070923680906232e-07, "loss": 0.4046, "num_input_tokens_seen": 28892480, "step": 28015 }, { "epoch": 18.74247491638796, "grad_norm": 2.6468214988708496, "learning_rate": 1.2007262810545205e-07, "loss": 0.3969, "num_input_tokens_seen": 28897600, "step": 28020 }, { "epoch": 18.745819397993312, "grad_norm": 2.5014712810516357, "learning_rate": 1.1943768214704775e-07, "loss": 0.3552, "num_input_tokens_seen": 28903296, "step": 28025 }, { "epoch": 18.74916387959866, "grad_norm": 3.8430392742156982, "learning_rate": 1.1880439915019693e-07, "loss": 0.3601, "num_input_tokens_seen": 28908096, "step": 28030 }, { "epoch": 18.752508361204015, "grad_norm": 2.94846773147583, "learning_rate": 1.181727793306786e-07, "loss": 0.3359, "num_input_tokens_seen": 28913440, "step": 28035 }, { "epoch": 18.755852842809364, "grad_norm": 3.2154958248138428, "learning_rate": 1.1754282290370623e-07, "loss": 0.3645, "num_input_tokens_seen": 28918496, "step": 28040 }, { "epoch": 18.759197324414714, "grad_norm": 3.309054136276245, "learning_rate": 1.1691453008392528e-07, "loss": 0.3186, "num_input_tokens_seen": 28922816, "step": 28045 }, { "epoch": 18.762541806020067, "grad_norm": 3.260646104812622, "learning_rate": 1.1628790108541455e-07, "loss": 0.3151, "num_input_tokens_seen": 28927840, "step": 28050 }, { "epoch": 18.765886287625417, "grad_norm": 3.684763193130493, "learning_rate": 1.1566293612168711e-07, "loss": 0.3986, "num_input_tokens_seen": 28932800, "step": 28055 }, { "epoch": 18.76923076923077, "grad_norm": 3.234811782836914, "learning_rate": 1.1503963540568764e-07, "loss": 0.3681, "num_input_tokens_seen": 28938400, "step": 28060 }, { "epoch": 18.77257525083612, "grad_norm": 3.036238193511963, "learning_rate": 1.1441799914979568e-07, "loss": 0.3941, "num_input_tokens_seen": 28943168, "step": 28065 }, { "epoch": 18.775919732441473, "grad_norm": 3.3144750595092773, "learning_rate": 1.1379802756582014e-07, "loss": 0.3649, "num_input_tokens_seen": 28948640, "step": 28070 }, { "epoch": 18.779264214046822, "grad_norm": 4.005862236022949, "learning_rate": 1.1317972086500594e-07, "loss": 0.401, "num_input_tokens_seen": 28953632, "step": 28075 }, { "epoch": 18.782608695652176, "grad_norm": 3.3332056999206543, "learning_rate": 1.1256307925803012e-07, "loss": 0.3351, "num_input_tokens_seen": 28959200, "step": 28080 }, { "epoch": 18.785953177257525, "grad_norm": 3.7930140495300293, "learning_rate": 1.119481029550018e-07, "loss": 0.3234, "num_input_tokens_seen": 28963616, "step": 28085 }, { "epoch": 18.789297658862875, "grad_norm": 3.804994821548462, "learning_rate": 1.1133479216546229e-07, "loss": 0.3595, "num_input_tokens_seen": 28968704, "step": 28090 }, { "epoch": 18.792642140468228, "grad_norm": 3.394510507583618, "learning_rate": 1.107231470983866e-07, "loss": 0.3538, "num_input_tokens_seen": 28973920, "step": 28095 }, { "epoch": 18.795986622073578, "grad_norm": 3.9669876098632812, "learning_rate": 1.101131679621803e-07, "loss": 0.3919, "num_input_tokens_seen": 28978496, "step": 28100 }, { "epoch": 18.79933110367893, "grad_norm": 2.32727313041687, "learning_rate": 1.0950485496468377e-07, "loss": 0.3155, "num_input_tokens_seen": 28984384, "step": 28105 }, { "epoch": 18.80267558528428, "grad_norm": 2.867769241333008, "learning_rate": 1.0889820831316844e-07, "loss": 0.3541, "num_input_tokens_seen": 28989440, "step": 28110 }, { "epoch": 18.806020066889634, "grad_norm": 2.9695823192596436, "learning_rate": 1.082932282143373e-07, "loss": 0.4422, "num_input_tokens_seen": 28994432, "step": 28115 }, { "epoch": 18.809364548494983, "grad_norm": 2.4217326641082764, "learning_rate": 1.07689914874326e-07, "loss": 0.3605, "num_input_tokens_seen": 28999456, "step": 28120 }, { "epoch": 18.812709030100333, "grad_norm": 3.7515146732330322, "learning_rate": 1.0708826849870291e-07, "loss": 0.3203, "num_input_tokens_seen": 29003712, "step": 28125 }, { "epoch": 18.816053511705686, "grad_norm": 3.9476799964904785, "learning_rate": 1.064882892924679e-07, "loss": 0.3376, "num_input_tokens_seen": 29008992, "step": 28130 }, { "epoch": 18.819397993311036, "grad_norm": 4.011290550231934, "learning_rate": 1.0588997746005303e-07, "loss": 0.3345, "num_input_tokens_seen": 29014336, "step": 28135 }, { "epoch": 18.82274247491639, "grad_norm": 3.339750289916992, "learning_rate": 1.0529333320532131e-07, "loss": 0.3195, "num_input_tokens_seen": 29019232, "step": 28140 }, { "epoch": 18.82608695652174, "grad_norm": 2.5582942962646484, "learning_rate": 1.0469835673156847e-07, "loss": 0.3949, "num_input_tokens_seen": 29024160, "step": 28145 }, { "epoch": 18.82943143812709, "grad_norm": 3.6601672172546387, "learning_rate": 1.0410504824152235e-07, "loss": 0.3972, "num_input_tokens_seen": 29029216, "step": 28150 }, { "epoch": 18.83277591973244, "grad_norm": 2.6997413635253906, "learning_rate": 1.0351340793734178e-07, "loss": 0.3931, "num_input_tokens_seen": 29034112, "step": 28155 }, { "epoch": 18.836120401337794, "grad_norm": 4.269837856292725, "learning_rate": 1.029234360206166e-07, "loss": 0.287, "num_input_tokens_seen": 29039200, "step": 28160 }, { "epoch": 18.839464882943144, "grad_norm": 9.654271125793457, "learning_rate": 1.0233513269236994e-07, "loss": 0.3443, "num_input_tokens_seen": 29044896, "step": 28165 }, { "epoch": 18.842809364548494, "grad_norm": 2.7149507999420166, "learning_rate": 1.017484981530531e-07, "loss": 0.3372, "num_input_tokens_seen": 29050752, "step": 28170 }, { "epoch": 18.846153846153847, "grad_norm": 3.3108773231506348, "learning_rate": 1.0116353260255396e-07, "loss": 0.4205, "num_input_tokens_seen": 29056448, "step": 28175 }, { "epoch": 18.849498327759196, "grad_norm": 3.1718368530273438, "learning_rate": 1.0058023624018642e-07, "loss": 0.4194, "num_input_tokens_seen": 29061440, "step": 28180 }, { "epoch": 18.85284280936455, "grad_norm": 3.407350778579712, "learning_rate": 9.999860926469928e-08, "loss": 0.3483, "num_input_tokens_seen": 29065952, "step": 28185 }, { "epoch": 18.8561872909699, "grad_norm": 2.512174129486084, "learning_rate": 9.941865187427013e-08, "loss": 0.4162, "num_input_tokens_seen": 29072416, "step": 28190 }, { "epoch": 18.859531772575252, "grad_norm": 2.124760389328003, "learning_rate": 9.884036426650979e-08, "loss": 0.3314, "num_input_tokens_seen": 29078272, "step": 28195 }, { "epoch": 18.862876254180602, "grad_norm": 2.7904701232910156, "learning_rate": 9.826374663845895e-08, "loss": 0.35, "num_input_tokens_seen": 29083616, "step": 28200 }, { "epoch": 18.86622073578595, "grad_norm": 3.321428060531616, "learning_rate": 9.768879918658825e-08, "loss": 0.3287, "num_input_tokens_seen": 29088288, "step": 28205 }, { "epoch": 18.869565217391305, "grad_norm": 2.8844196796417236, "learning_rate": 9.711552210680209e-08, "loss": 0.2809, "num_input_tokens_seen": 29093280, "step": 28210 }, { "epoch": 18.872909698996654, "grad_norm": 2.922067642211914, "learning_rate": 9.654391559443255e-08, "loss": 0.2772, "num_input_tokens_seen": 29097664, "step": 28215 }, { "epoch": 18.876254180602007, "grad_norm": 2.3847687244415283, "learning_rate": 9.597397984424551e-08, "loss": 0.2955, "num_input_tokens_seen": 29102688, "step": 28220 }, { "epoch": 18.879598662207357, "grad_norm": 2.937831401824951, "learning_rate": 9.54057150504345e-08, "loss": 0.3461, "num_input_tokens_seen": 29107232, "step": 28225 }, { "epoch": 18.88294314381271, "grad_norm": 4.805811882019043, "learning_rate": 9.483912140662633e-08, "loss": 0.3819, "num_input_tokens_seen": 29111808, "step": 28230 }, { "epoch": 18.88628762541806, "grad_norm": 2.747859477996826, "learning_rate": 9.42741991058771e-08, "loss": 0.3007, "num_input_tokens_seen": 29115808, "step": 28235 }, { "epoch": 18.889632107023413, "grad_norm": 2.7025790214538574, "learning_rate": 9.37109483406734e-08, "loss": 0.3503, "num_input_tokens_seen": 29120576, "step": 28240 }, { "epoch": 18.892976588628763, "grad_norm": 2.6607742309570312, "learning_rate": 9.314936930293283e-08, "loss": 0.4146, "num_input_tokens_seen": 29125792, "step": 28245 }, { "epoch": 18.896321070234112, "grad_norm": 2.9316577911376953, "learning_rate": 9.258946218400289e-08, "loss": 0.346, "num_input_tokens_seen": 29131264, "step": 28250 }, { "epoch": 18.899665551839465, "grad_norm": 2.4368507862091064, "learning_rate": 9.203122717466206e-08, "loss": 0.283, "num_input_tokens_seen": 29135968, "step": 28255 }, { "epoch": 18.903010033444815, "grad_norm": 4.056362628936768, "learning_rate": 9.147466446511765e-08, "loss": 0.3845, "num_input_tokens_seen": 29140352, "step": 28260 }, { "epoch": 18.906354515050168, "grad_norm": 2.76108455657959, "learning_rate": 9.091977424500908e-08, "loss": 0.4409, "num_input_tokens_seen": 29145632, "step": 28265 }, { "epoch": 18.909698996655518, "grad_norm": 3.3178069591522217, "learning_rate": 9.036655670340456e-08, "loss": 0.3525, "num_input_tokens_seen": 29150400, "step": 28270 }, { "epoch": 18.91304347826087, "grad_norm": 3.3695645332336426, "learning_rate": 8.981501202880271e-08, "loss": 0.4279, "num_input_tokens_seen": 29154976, "step": 28275 }, { "epoch": 18.91638795986622, "grad_norm": 2.8363969326019287, "learning_rate": 8.926514040913215e-08, "loss": 0.3837, "num_input_tokens_seen": 29159936, "step": 28280 }, { "epoch": 18.919732441471574, "grad_norm": 3.769765615463257, "learning_rate": 8.87169420317513e-08, "loss": 0.3533, "num_input_tokens_seen": 29165088, "step": 28285 }, { "epoch": 18.923076923076923, "grad_norm": 3.008042097091675, "learning_rate": 8.81704170834502e-08, "loss": 0.3692, "num_input_tokens_seen": 29169952, "step": 28290 }, { "epoch": 18.926421404682273, "grad_norm": 3.0938141345977783, "learning_rate": 8.76255657504449e-08, "loss": 0.3276, "num_input_tokens_seen": 29174368, "step": 28295 }, { "epoch": 18.929765886287626, "grad_norm": 3.169790506362915, "learning_rate": 8.708238821838466e-08, "loss": 0.3129, "num_input_tokens_seen": 29180096, "step": 28300 }, { "epoch": 18.933110367892976, "grad_norm": 2.739128828048706, "learning_rate": 8.654088467234645e-08, "loss": 0.3232, "num_input_tokens_seen": 29185088, "step": 28305 }, { "epoch": 18.93645484949833, "grad_norm": 4.022238731384277, "learning_rate": 8.600105529683933e-08, "loss": 0.4219, "num_input_tokens_seen": 29190432, "step": 28310 }, { "epoch": 18.93979933110368, "grad_norm": 2.760007858276367, "learning_rate": 8.54629002757984e-08, "loss": 0.3808, "num_input_tokens_seen": 29195520, "step": 28315 }, { "epoch": 18.94314381270903, "grad_norm": 3.204455614089966, "learning_rate": 8.492641979259031e-08, "loss": 0.3673, "num_input_tokens_seen": 29200640, "step": 28320 }, { "epoch": 18.94648829431438, "grad_norm": 3.4720730781555176, "learning_rate": 8.439161403001162e-08, "loss": 0.3232, "num_input_tokens_seen": 29206560, "step": 28325 }, { "epoch": 18.94983277591973, "grad_norm": 2.454893112182617, "learning_rate": 8.385848317028711e-08, "loss": 0.3308, "num_input_tokens_seen": 29211552, "step": 28330 }, { "epoch": 18.953177257525084, "grad_norm": 3.0681591033935547, "learning_rate": 8.332702739507147e-08, "loss": 0.3145, "num_input_tokens_seen": 29216160, "step": 28335 }, { "epoch": 18.956521739130434, "grad_norm": 3.661038637161255, "learning_rate": 8.279724688544821e-08, "loss": 0.371, "num_input_tokens_seen": 29222176, "step": 28340 }, { "epoch": 18.959866220735787, "grad_norm": 3.0251624584198, "learning_rate": 8.226914182193014e-08, "loss": 0.4119, "num_input_tokens_seen": 29227328, "step": 28345 }, { "epoch": 18.963210702341136, "grad_norm": 2.7928824424743652, "learning_rate": 8.174271238445941e-08, "loss": 0.334, "num_input_tokens_seen": 29232192, "step": 28350 }, { "epoch": 18.96655518394649, "grad_norm": 2.8150217533111572, "learning_rate": 8.121795875240756e-08, "loss": 0.297, "num_input_tokens_seen": 29236832, "step": 28355 }, { "epoch": 18.96989966555184, "grad_norm": 2.412914514541626, "learning_rate": 8.069488110457436e-08, "loss": 0.3092, "num_input_tokens_seen": 29242272, "step": 28360 }, { "epoch": 18.97324414715719, "grad_norm": 2.274728775024414, "learning_rate": 8.017347961918887e-08, "loss": 0.323, "num_input_tokens_seen": 29247328, "step": 28365 }, { "epoch": 18.976588628762542, "grad_norm": 3.0638976097106934, "learning_rate": 7.965375447390899e-08, "loss": 0.4115, "num_input_tokens_seen": 29251776, "step": 28370 }, { "epoch": 18.97993311036789, "grad_norm": 3.0474298000335693, "learning_rate": 7.913570584582197e-08, "loss": 0.3317, "num_input_tokens_seen": 29257888, "step": 28375 }, { "epoch": 18.983277591973245, "grad_norm": 2.5960168838500977, "learning_rate": 7.861933391144272e-08, "loss": 0.3522, "num_input_tokens_seen": 29263552, "step": 28380 }, { "epoch": 18.986622073578594, "grad_norm": 3.51774001121521, "learning_rate": 7.810463884671549e-08, "loss": 0.4331, "num_input_tokens_seen": 29269120, "step": 28385 }, { "epoch": 18.989966555183948, "grad_norm": 2.16789174079895, "learning_rate": 7.759162082701278e-08, "loss": 0.2639, "num_input_tokens_seen": 29274016, "step": 28390 }, { "epoch": 18.993311036789297, "grad_norm": 2.71773099899292, "learning_rate": 7.708028002713697e-08, "loss": 0.3337, "num_input_tokens_seen": 29279296, "step": 28395 }, { "epoch": 18.99665551839465, "grad_norm": 2.8619790077209473, "learning_rate": 7.657061662131815e-08, "loss": 0.3138, "num_input_tokens_seen": 29284128, "step": 28400 }, { "epoch": 19.0, "grad_norm": 4.045263767242432, "learning_rate": 7.606263078321352e-08, "loss": 0.3392, "num_input_tokens_seen": 29289872, "step": 28405 }, { "epoch": 19.00334448160535, "grad_norm": 2.7233142852783203, "learning_rate": 7.555632268591073e-08, "loss": 0.3131, "num_input_tokens_seen": 29295088, "step": 28410 }, { "epoch": 19.006688963210703, "grad_norm": 2.5943379402160645, "learning_rate": 7.505169250192345e-08, "loss": 0.3204, "num_input_tokens_seen": 29299504, "step": 28415 }, { "epoch": 19.010033444816052, "grad_norm": 3.2390530109405518, "learning_rate": 7.454874040319749e-08, "loss": 0.3895, "num_input_tokens_seen": 29304176, "step": 28420 }, { "epoch": 19.013377926421406, "grad_norm": 3.4616546630859375, "learning_rate": 7.404746656110296e-08, "loss": 0.3745, "num_input_tokens_seen": 29308912, "step": 28425 }, { "epoch": 19.016722408026755, "grad_norm": 2.1142005920410156, "learning_rate": 7.354787114644047e-08, "loss": 0.343, "num_input_tokens_seen": 29314768, "step": 28430 }, { "epoch": 19.02006688963211, "grad_norm": 2.9139301776885986, "learning_rate": 7.304995432943662e-08, "loss": 0.4473, "num_input_tokens_seen": 29320272, "step": 28435 }, { "epoch": 19.023411371237458, "grad_norm": 3.0113933086395264, "learning_rate": 7.255371627974906e-08, "loss": 0.4117, "num_input_tokens_seen": 29325712, "step": 28440 }, { "epoch": 19.02675585284281, "grad_norm": 2.3548622131347656, "learning_rate": 7.205915716646083e-08, "loss": 0.3235, "num_input_tokens_seen": 29331216, "step": 28445 }, { "epoch": 19.03010033444816, "grad_norm": 2.534407377243042, "learning_rate": 7.156627715808384e-08, "loss": 0.3588, "num_input_tokens_seen": 29335760, "step": 28450 }, { "epoch": 19.03344481605351, "grad_norm": 3.1700165271759033, "learning_rate": 7.107507642255818e-08, "loss": 0.3023, "num_input_tokens_seen": 29340784, "step": 28455 }, { "epoch": 19.036789297658864, "grad_norm": 3.1525864601135254, "learning_rate": 7.058555512724997e-08, "loss": 0.3844, "num_input_tokens_seen": 29345424, "step": 28460 }, { "epoch": 19.040133779264213, "grad_norm": 3.0418291091918945, "learning_rate": 7.009771343895744e-08, "loss": 0.3594, "num_input_tokens_seen": 29349744, "step": 28465 }, { "epoch": 19.043478260869566, "grad_norm": 2.111104726791382, "learning_rate": 6.961155152390098e-08, "loss": 0.338, "num_input_tokens_seen": 29354480, "step": 28470 }, { "epoch": 19.046822742474916, "grad_norm": 3.3721232414245605, "learning_rate": 6.912706954773251e-08, "loss": 0.4372, "num_input_tokens_seen": 29359152, "step": 28475 }, { "epoch": 19.05016722408027, "grad_norm": 3.924013614654541, "learning_rate": 6.864426767553001e-08, "loss": 0.2891, "num_input_tokens_seen": 29364400, "step": 28480 }, { "epoch": 19.05351170568562, "grad_norm": 3.0678210258483887, "learning_rate": 6.816314607179963e-08, "loss": 0.3624, "num_input_tokens_seen": 29369552, "step": 28485 }, { "epoch": 19.05685618729097, "grad_norm": 2.9756381511688232, "learning_rate": 6.768370490047471e-08, "loss": 0.4626, "num_input_tokens_seen": 29375216, "step": 28490 }, { "epoch": 19.06020066889632, "grad_norm": 2.7092018127441406, "learning_rate": 6.720594432491568e-08, "loss": 0.4312, "num_input_tokens_seen": 29380336, "step": 28495 }, { "epoch": 19.06354515050167, "grad_norm": 2.879879951477051, "learning_rate": 6.672986450791064e-08, "loss": 0.401, "num_input_tokens_seen": 29385680, "step": 28500 }, { "epoch": 19.066889632107024, "grad_norm": 3.3858063220977783, "learning_rate": 6.625546561167484e-08, "loss": 0.3861, "num_input_tokens_seen": 29390512, "step": 28505 }, { "epoch": 19.070234113712374, "grad_norm": 2.961867332458496, "learning_rate": 6.57827477978512e-08, "loss": 0.3708, "num_input_tokens_seen": 29395504, "step": 28510 }, { "epoch": 19.073578595317727, "grad_norm": 4.868546962738037, "learning_rate": 6.531171122750979e-08, "loss": 0.355, "num_input_tokens_seen": 29400592, "step": 28515 }, { "epoch": 19.076923076923077, "grad_norm": 3.9028589725494385, "learning_rate": 6.484235606114719e-08, "loss": 0.3446, "num_input_tokens_seen": 29406160, "step": 28520 }, { "epoch": 19.08026755852843, "grad_norm": 2.2104008197784424, "learning_rate": 6.437468245868772e-08, "loss": 0.3829, "num_input_tokens_seen": 29411312, "step": 28525 }, { "epoch": 19.08361204013378, "grad_norm": 2.7689566612243652, "learning_rate": 6.390869057948279e-08, "loss": 0.3989, "num_input_tokens_seen": 29417072, "step": 28530 }, { "epoch": 19.08695652173913, "grad_norm": 2.887188196182251, "learning_rate": 6.34443805823104e-08, "loss": 0.4134, "num_input_tokens_seen": 29423792, "step": 28535 }, { "epoch": 19.090301003344482, "grad_norm": 4.321640491485596, "learning_rate": 6.29817526253751e-08, "loss": 0.3741, "num_input_tokens_seen": 29429328, "step": 28540 }, { "epoch": 19.093645484949832, "grad_norm": 2.7754344940185547, "learning_rate": 6.252080686630968e-08, "loss": 0.3749, "num_input_tokens_seen": 29434128, "step": 28545 }, { "epoch": 19.096989966555185, "grad_norm": 2.7166574001312256, "learning_rate": 6.206154346217185e-08, "loss": 0.4247, "num_input_tokens_seen": 29439632, "step": 28550 }, { "epoch": 19.100334448160535, "grad_norm": 2.877570629119873, "learning_rate": 6.160396256944868e-08, "loss": 0.4445, "num_input_tokens_seen": 29444496, "step": 28555 }, { "epoch": 19.103678929765888, "grad_norm": 2.500948667526245, "learning_rate": 6.114806434405096e-08, "loss": 0.3477, "num_input_tokens_seen": 29450288, "step": 28560 }, { "epoch": 19.107023411371237, "grad_norm": 2.9104249477386475, "learning_rate": 6.06938489413178e-08, "loss": 0.3374, "num_input_tokens_seen": 29455184, "step": 28565 }, { "epoch": 19.110367892976587, "grad_norm": 3.015611171722412, "learning_rate": 6.02413165160154e-08, "loss": 0.3613, "num_input_tokens_seen": 29460624, "step": 28570 }, { "epoch": 19.11371237458194, "grad_norm": 2.9365670680999756, "learning_rate": 5.97904672223354e-08, "loss": 0.3904, "num_input_tokens_seen": 29465648, "step": 28575 }, { "epoch": 19.11705685618729, "grad_norm": 2.400989294052124, "learning_rate": 5.9341301213896586e-08, "loss": 0.3648, "num_input_tokens_seen": 29470544, "step": 28580 }, { "epoch": 19.120401337792643, "grad_norm": 2.8032076358795166, "learning_rate": 5.889381864374322e-08, "loss": 0.4191, "num_input_tokens_seen": 29475600, "step": 28585 }, { "epoch": 19.123745819397993, "grad_norm": 2.201586961746216, "learning_rate": 5.844801966434832e-08, "loss": 0.3389, "num_input_tokens_seen": 29480912, "step": 28590 }, { "epoch": 19.127090301003346, "grad_norm": 2.3926126956939697, "learning_rate": 5.800390442760761e-08, "loss": 0.2933, "num_input_tokens_seen": 29486256, "step": 28595 }, { "epoch": 19.130434782608695, "grad_norm": 4.00234842300415, "learning_rate": 5.7561473084847274e-08, "loss": 0.4432, "num_input_tokens_seen": 29491824, "step": 28600 }, { "epoch": 19.13377926421405, "grad_norm": 2.5408599376678467, "learning_rate": 5.712072578681616e-08, "loss": 0.4158, "num_input_tokens_seen": 29497296, "step": 28605 }, { "epoch": 19.137123745819398, "grad_norm": 3.306546688079834, "learning_rate": 5.6681662683691354e-08, "loss": 0.3725, "num_input_tokens_seen": 29502448, "step": 28610 }, { "epoch": 19.140468227424748, "grad_norm": 2.7097249031066895, "learning_rate": 5.624428392507486e-08, "loss": 0.3296, "num_input_tokens_seen": 29506864, "step": 28615 }, { "epoch": 19.1438127090301, "grad_norm": 2.733806848526001, "learning_rate": 5.5808589659996914e-08, "loss": 0.281, "num_input_tokens_seen": 29512624, "step": 28620 }, { "epoch": 19.14715719063545, "grad_norm": 3.2803287506103516, "learning_rate": 5.537458003691043e-08, "loss": 0.317, "num_input_tokens_seen": 29517392, "step": 28625 }, { "epoch": 19.150501672240804, "grad_norm": 2.5400614738464355, "learning_rate": 5.494225520369767e-08, "loss": 0.3145, "num_input_tokens_seen": 29523408, "step": 28630 }, { "epoch": 19.153846153846153, "grad_norm": 2.920459508895874, "learning_rate": 5.451161530766469e-08, "loss": 0.3026, "num_input_tokens_seen": 29529072, "step": 28635 }, { "epoch": 19.157190635451506, "grad_norm": 3.5369975566864014, "learning_rate": 5.408266049554356e-08, "loss": 0.4138, "num_input_tokens_seen": 29534288, "step": 28640 }, { "epoch": 19.160535117056856, "grad_norm": 2.4864368438720703, "learning_rate": 5.365539091349403e-08, "loss": 0.3706, "num_input_tokens_seen": 29539472, "step": 28645 }, { "epoch": 19.163879598662206, "grad_norm": 3.047930955886841, "learning_rate": 5.32298067070991e-08, "loss": 0.4734, "num_input_tokens_seen": 29544400, "step": 28650 }, { "epoch": 19.16722408026756, "grad_norm": 2.6986570358276367, "learning_rate": 5.2805908021369424e-08, "loss": 0.3649, "num_input_tokens_seen": 29549616, "step": 28655 }, { "epoch": 19.17056856187291, "grad_norm": 3.1503522396087646, "learning_rate": 5.238369500074003e-08, "loss": 0.3754, "num_input_tokens_seen": 29554992, "step": 28660 }, { "epoch": 19.17391304347826, "grad_norm": 3.404108762741089, "learning_rate": 5.196316778907362e-08, "loss": 0.335, "num_input_tokens_seen": 29559984, "step": 28665 }, { "epoch": 19.17725752508361, "grad_norm": 3.121502637863159, "learning_rate": 5.154432652965558e-08, "loss": 0.4422, "num_input_tokens_seen": 29565648, "step": 28670 }, { "epoch": 19.180602006688964, "grad_norm": 2.842651605606079, "learning_rate": 5.1127171365198404e-08, "loss": 0.3458, "num_input_tokens_seen": 29570768, "step": 28675 }, { "epoch": 19.183946488294314, "grad_norm": 3.0090818405151367, "learning_rate": 5.071170243784118e-08, "loss": 0.3164, "num_input_tokens_seen": 29575312, "step": 28680 }, { "epoch": 19.187290969899667, "grad_norm": 3.3768320083618164, "learning_rate": 5.029791988914623e-08, "loss": 0.344, "num_input_tokens_seen": 29580208, "step": 28685 }, { "epoch": 19.190635451505017, "grad_norm": 3.3201422691345215, "learning_rate": 4.988582386010299e-08, "loss": 0.3456, "num_input_tokens_seen": 29585104, "step": 28690 }, { "epoch": 19.193979933110366, "grad_norm": 2.3097689151763916, "learning_rate": 4.9475414491124696e-08, "loss": 0.3678, "num_input_tokens_seen": 29590960, "step": 28695 }, { "epoch": 19.19732441471572, "grad_norm": 2.8817391395568848, "learning_rate": 4.906669192205227e-08, "loss": 0.3587, "num_input_tokens_seen": 29595760, "step": 28700 }, { "epoch": 19.20066889632107, "grad_norm": 2.7894768714904785, "learning_rate": 4.865965629214819e-08, "loss": 0.4625, "num_input_tokens_seen": 29601456, "step": 28705 }, { "epoch": 19.204013377926422, "grad_norm": 2.3698508739471436, "learning_rate": 4.825430774010487e-08, "loss": 0.303, "num_input_tokens_seen": 29605744, "step": 28710 }, { "epoch": 19.207357859531772, "grad_norm": 3.162661552429199, "learning_rate": 4.785064640403514e-08, "loss": 0.3553, "num_input_tokens_seen": 29609840, "step": 28715 }, { "epoch": 19.210702341137125, "grad_norm": 3.3396637439727783, "learning_rate": 4.7448672421480655e-08, "loss": 0.376, "num_input_tokens_seen": 29615408, "step": 28720 }, { "epoch": 19.214046822742475, "grad_norm": 2.557102918624878, "learning_rate": 4.704838592940575e-08, "loss": 0.3847, "num_input_tokens_seen": 29620304, "step": 28725 }, { "epoch": 19.217391304347824, "grad_norm": 2.5960476398468018, "learning_rate": 4.6649787064200755e-08, "loss": 0.3371, "num_input_tokens_seen": 29625808, "step": 28730 }, { "epoch": 19.220735785953178, "grad_norm": 3.2272188663482666, "learning_rate": 4.6252875961681484e-08, "loss": 0.4003, "num_input_tokens_seen": 29631664, "step": 28735 }, { "epoch": 19.224080267558527, "grad_norm": 5.8862175941467285, "learning_rate": 4.585765275708753e-08, "loss": 0.3201, "num_input_tokens_seen": 29636944, "step": 28740 }, { "epoch": 19.22742474916388, "grad_norm": 2.785120964050293, "learning_rate": 4.546411758508451e-08, "loss": 0.3994, "num_input_tokens_seen": 29642256, "step": 28745 }, { "epoch": 19.23076923076923, "grad_norm": 2.4157474040985107, "learning_rate": 4.507227057976127e-08, "loss": 0.3594, "num_input_tokens_seen": 29647088, "step": 28750 }, { "epoch": 19.234113712374583, "grad_norm": 3.5476088523864746, "learning_rate": 4.4682111874633226e-08, "loss": 0.3569, "num_input_tokens_seen": 29651184, "step": 28755 }, { "epoch": 19.237458193979933, "grad_norm": 3.4974775314331055, "learning_rate": 4.4293641602639604e-08, "loss": 0.3278, "num_input_tokens_seen": 29656432, "step": 28760 }, { "epoch": 19.240802675585286, "grad_norm": 2.113687515258789, "learning_rate": 4.390685989614396e-08, "loss": 0.2893, "num_input_tokens_seen": 29662352, "step": 28765 }, { "epoch": 19.244147157190636, "grad_norm": 3.0355870723724365, "learning_rate": 4.3521766886936434e-08, "loss": 0.3296, "num_input_tokens_seen": 29667696, "step": 28770 }, { "epoch": 19.247491638795985, "grad_norm": 2.2841877937316895, "learning_rate": 4.3138362706228174e-08, "loss": 0.3453, "num_input_tokens_seen": 29672752, "step": 28775 }, { "epoch": 19.25083612040134, "grad_norm": 2.622424364089966, "learning_rate": 4.275664748465913e-08, "loss": 0.4273, "num_input_tokens_seen": 29677840, "step": 28780 }, { "epoch": 19.254180602006688, "grad_norm": 3.199936628341675, "learning_rate": 4.2376621352290814e-08, "loss": 0.3625, "num_input_tokens_seen": 29682512, "step": 28785 }, { "epoch": 19.25752508361204, "grad_norm": 3.60046124458313, "learning_rate": 4.1998284438610205e-08, "loss": 0.3936, "num_input_tokens_seen": 29687696, "step": 28790 }, { "epoch": 19.26086956521739, "grad_norm": 3.554980516433716, "learning_rate": 4.1621636872528073e-08, "loss": 0.3158, "num_input_tokens_seen": 29693520, "step": 28795 }, { "epoch": 19.264214046822744, "grad_norm": 4.860376358032227, "learning_rate": 4.1246678782381757e-08, "loss": 0.3618, "num_input_tokens_seen": 29698768, "step": 28800 }, { "epoch": 19.267558528428093, "grad_norm": 3.3299014568328857, "learning_rate": 4.087341029592962e-08, "loss": 0.3279, "num_input_tokens_seen": 29703760, "step": 28805 }, { "epoch": 19.270903010033443, "grad_norm": 4.078242301940918, "learning_rate": 4.0501831540356586e-08, "loss": 0.3566, "num_input_tokens_seen": 29708624, "step": 28810 }, { "epoch": 19.274247491638796, "grad_norm": 2.399233102798462, "learning_rate": 4.013194264227138e-08, "loss": 0.3602, "num_input_tokens_seen": 29714256, "step": 28815 }, { "epoch": 19.277591973244146, "grad_norm": 2.78979754447937, "learning_rate": 3.9763743727706524e-08, "loss": 0.338, "num_input_tokens_seen": 29719952, "step": 28820 }, { "epoch": 19.2809364548495, "grad_norm": 2.66276478767395, "learning_rate": 3.939723492211944e-08, "loss": 0.3559, "num_input_tokens_seen": 29725072, "step": 28825 }, { "epoch": 19.28428093645485, "grad_norm": 2.2843918800354004, "learning_rate": 3.903241635039079e-08, "loss": 0.2979, "num_input_tokens_seen": 29730768, "step": 28830 }, { "epoch": 19.287625418060202, "grad_norm": 2.2603938579559326, "learning_rate": 3.8669288136826135e-08, "loss": 0.2737, "num_input_tokens_seen": 29736080, "step": 28835 }, { "epoch": 19.29096989966555, "grad_norm": 3.370823383331299, "learning_rate": 3.830785040515428e-08, "loss": 0.3505, "num_input_tokens_seen": 29740752, "step": 28840 }, { "epoch": 19.294314381270905, "grad_norm": 2.491934061050415, "learning_rate": 3.794810327852949e-08, "loss": 0.2798, "num_input_tokens_seen": 29746192, "step": 28845 }, { "epoch": 19.297658862876254, "grad_norm": 3.0472280979156494, "learning_rate": 3.759004687952761e-08, "loss": 0.4358, "num_input_tokens_seen": 29751344, "step": 28850 }, { "epoch": 19.301003344481604, "grad_norm": 3.056729316711426, "learning_rate": 3.723368133015048e-08, "loss": 0.3135, "num_input_tokens_seen": 29756752, "step": 28855 }, { "epoch": 19.304347826086957, "grad_norm": 3.5792505741119385, "learning_rate": 3.6879006751822656e-08, "loss": 0.3853, "num_input_tokens_seen": 29762224, "step": 28860 }, { "epoch": 19.307692307692307, "grad_norm": 2.342440605163574, "learning_rate": 3.652602326539412e-08, "loss": 0.3262, "num_input_tokens_seen": 29767024, "step": 28865 }, { "epoch": 19.31103678929766, "grad_norm": 2.6170225143432617, "learning_rate": 3.617473099113589e-08, "loss": 0.3609, "num_input_tokens_seen": 29772368, "step": 28870 }, { "epoch": 19.31438127090301, "grad_norm": 4.66478967666626, "learning_rate": 3.582513004874555e-08, "loss": 0.3455, "num_input_tokens_seen": 29777456, "step": 28875 }, { "epoch": 19.317725752508363, "grad_norm": 2.1554157733917236, "learning_rate": 3.5477220557342264e-08, "loss": 0.358, "num_input_tokens_seen": 29782768, "step": 28880 }, { "epoch": 19.321070234113712, "grad_norm": 2.3780360221862793, "learning_rate": 3.513100263547009e-08, "loss": 0.3118, "num_input_tokens_seen": 29787568, "step": 28885 }, { "epoch": 19.324414715719065, "grad_norm": 2.5123448371887207, "learning_rate": 3.478647640109745e-08, "loss": 0.3474, "num_input_tokens_seen": 29793360, "step": 28890 }, { "epoch": 19.327759197324415, "grad_norm": 2.728336811065674, "learning_rate": 3.444364197161376e-08, "loss": 0.3669, "num_input_tokens_seen": 29798384, "step": 28895 }, { "epoch": 19.331103678929765, "grad_norm": 2.60402512550354, "learning_rate": 3.4102499463833925e-08, "loss": 0.4353, "num_input_tokens_seen": 29803504, "step": 28900 }, { "epoch": 19.334448160535118, "grad_norm": 2.3179233074188232, "learning_rate": 3.3763048993996073e-08, "loss": 0.3229, "num_input_tokens_seen": 29808656, "step": 28905 }, { "epoch": 19.337792642140467, "grad_norm": 2.8489763736724854, "learning_rate": 3.3425290677762124e-08, "loss": 0.3652, "num_input_tokens_seen": 29813392, "step": 28910 }, { "epoch": 19.34113712374582, "grad_norm": 3.436647415161133, "learning_rate": 3.3089224630216684e-08, "loss": 0.3244, "num_input_tokens_seen": 29817872, "step": 28915 }, { "epoch": 19.34448160535117, "grad_norm": 2.7486605644226074, "learning_rate": 3.275485096586761e-08, "loss": 0.3741, "num_input_tokens_seen": 29822928, "step": 28920 }, { "epoch": 19.347826086956523, "grad_norm": 2.3950986862182617, "learning_rate": 3.2422169798647654e-08, "loss": 0.3976, "num_input_tokens_seen": 29828688, "step": 28925 }, { "epoch": 19.351170568561873, "grad_norm": 3.123725652694702, "learning_rate": 3.2091181241910016e-08, "loss": 0.4188, "num_input_tokens_seen": 29833840, "step": 28930 }, { "epoch": 19.354515050167223, "grad_norm": 1.6433255672454834, "learning_rate": 3.1761885408435055e-08, "loss": 0.3063, "num_input_tokens_seen": 29839536, "step": 28935 }, { "epoch": 19.357859531772576, "grad_norm": 3.039651393890381, "learning_rate": 3.143428241042301e-08, "loss": 0.4066, "num_input_tokens_seen": 29844816, "step": 28940 }, { "epoch": 19.361204013377925, "grad_norm": 3.513615369796753, "learning_rate": 3.110837235949849e-08, "loss": 0.3929, "num_input_tokens_seen": 29849136, "step": 28945 }, { "epoch": 19.36454849498328, "grad_norm": 3.337338447570801, "learning_rate": 3.0784155366709315e-08, "loss": 0.3904, "num_input_tokens_seen": 29854224, "step": 28950 }, { "epoch": 19.367892976588628, "grad_norm": 3.3699426651000977, "learning_rate": 3.046163154252713e-08, "loss": 0.3732, "num_input_tokens_seen": 29858704, "step": 28955 }, { "epoch": 19.37123745819398, "grad_norm": 3.3739848136901855, "learning_rate": 3.014080099684569e-08, "loss": 0.3438, "num_input_tokens_seen": 29864688, "step": 28960 }, { "epoch": 19.37458193979933, "grad_norm": 3.0788891315460205, "learning_rate": 2.9821663838981994e-08, "loss": 0.2791, "num_input_tokens_seen": 29869360, "step": 28965 }, { "epoch": 19.377926421404684, "grad_norm": 2.673893690109253, "learning_rate": 2.9504220177675737e-08, "loss": 0.337, "num_input_tokens_seen": 29874704, "step": 28970 }, { "epoch": 19.381270903010034, "grad_norm": 3.1487200260162354, "learning_rate": 2.9188470121090385e-08, "loss": 0.3873, "num_input_tokens_seen": 29879344, "step": 28975 }, { "epoch": 19.384615384615383, "grad_norm": 4.029144287109375, "learning_rate": 2.8874413776812103e-08, "loss": 0.2993, "num_input_tokens_seen": 29884240, "step": 28980 }, { "epoch": 19.387959866220736, "grad_norm": 2.2031543254852295, "learning_rate": 2.8562051251849743e-08, "loss": 0.327, "num_input_tokens_seen": 29889488, "step": 28985 }, { "epoch": 19.391304347826086, "grad_norm": 2.794222593307495, "learning_rate": 2.8251382652634828e-08, "loss": 0.4113, "num_input_tokens_seen": 29894064, "step": 28990 }, { "epoch": 19.39464882943144, "grad_norm": 3.858759641647339, "learning_rate": 2.7942408085022134e-08, "loss": 0.3393, "num_input_tokens_seen": 29899216, "step": 28995 }, { "epoch": 19.39799331103679, "grad_norm": 2.990041971206665, "learning_rate": 2.7635127654289108e-08, "loss": 0.3142, "num_input_tokens_seen": 29903920, "step": 29000 }, { "epoch": 19.401337792642142, "grad_norm": 3.3783669471740723, "learning_rate": 2.7329541465135335e-08, "loss": 0.2974, "num_input_tokens_seen": 29908528, "step": 29005 }, { "epoch": 19.40468227424749, "grad_norm": 3.135364055633545, "learning_rate": 2.702564962168419e-08, "loss": 0.3534, "num_input_tokens_seen": 29914096, "step": 29010 }, { "epoch": 19.40802675585284, "grad_norm": 4.038005828857422, "learning_rate": 2.6723452227481182e-08, "loss": 0.4152, "num_input_tokens_seen": 29919408, "step": 29015 }, { "epoch": 19.411371237458194, "grad_norm": 3.086914539337158, "learning_rate": 2.64229493854945e-08, "loss": 0.2819, "num_input_tokens_seen": 29925488, "step": 29020 }, { "epoch": 19.414715719063544, "grad_norm": 2.6637461185455322, "learning_rate": 2.6124141198115016e-08, "loss": 0.3631, "num_input_tokens_seen": 29930640, "step": 29025 }, { "epoch": 19.418060200668897, "grad_norm": 2.650932550430298, "learning_rate": 2.582702776715573e-08, "loss": 0.363, "num_input_tokens_seen": 29935600, "step": 29030 }, { "epoch": 19.421404682274247, "grad_norm": 2.589440107345581, "learning_rate": 2.553160919385289e-08, "loss": 0.3507, "num_input_tokens_seen": 29940784, "step": 29035 }, { "epoch": 19.4247491638796, "grad_norm": 2.9270224571228027, "learning_rate": 2.523788557886431e-08, "loss": 0.3292, "num_input_tokens_seen": 29946000, "step": 29040 }, { "epoch": 19.42809364548495, "grad_norm": 3.1263527870178223, "learning_rate": 2.4945857022272147e-08, "loss": 0.3631, "num_input_tokens_seen": 29951184, "step": 29045 }, { "epoch": 19.431438127090303, "grad_norm": 2.874084234237671, "learning_rate": 2.4655523623578482e-08, "loss": 0.3762, "num_input_tokens_seen": 29957104, "step": 29050 }, { "epoch": 19.434782608695652, "grad_norm": 3.051471471786499, "learning_rate": 2.436688548171029e-08, "loss": 0.2963, "num_input_tokens_seen": 29961744, "step": 29055 }, { "epoch": 19.438127090301002, "grad_norm": 2.3569774627685547, "learning_rate": 2.4079942695014458e-08, "loss": 0.3063, "num_input_tokens_seen": 29967024, "step": 29060 }, { "epoch": 19.441471571906355, "grad_norm": 2.2057971954345703, "learning_rate": 2.3794695361261667e-08, "loss": 0.3016, "num_input_tokens_seen": 29972368, "step": 29065 }, { "epoch": 19.444816053511705, "grad_norm": 2.72109055519104, "learning_rate": 2.351114357764528e-08, "loss": 0.3637, "num_input_tokens_seen": 29977200, "step": 29070 }, { "epoch": 19.448160535117058, "grad_norm": 3.81846284866333, "learning_rate": 2.3229287440780234e-08, "loss": 0.3483, "num_input_tokens_seen": 29981552, "step": 29075 }, { "epoch": 19.451505016722408, "grad_norm": 2.7553980350494385, "learning_rate": 2.29491270467036e-08, "loss": 0.4043, "num_input_tokens_seen": 29986800, "step": 29080 }, { "epoch": 19.45484949832776, "grad_norm": 3.6420419216156006, "learning_rate": 2.267066249087457e-08, "loss": 0.3643, "num_input_tokens_seen": 29992336, "step": 29085 }, { "epoch": 19.45819397993311, "grad_norm": 2.62103271484375, "learning_rate": 2.2393893868175585e-08, "loss": 0.374, "num_input_tokens_seen": 29997712, "step": 29090 }, { "epoch": 19.46153846153846, "grad_norm": 3.5398426055908203, "learning_rate": 2.2118821272909542e-08, "loss": 0.3717, "num_input_tokens_seen": 30002992, "step": 29095 }, { "epoch": 19.464882943143813, "grad_norm": 2.1502397060394287, "learning_rate": 2.1845444798803127e-08, "loss": 0.3308, "num_input_tokens_seen": 30008144, "step": 29100 }, { "epoch": 19.468227424749163, "grad_norm": 3.0668375492095947, "learning_rate": 2.1573764539004062e-08, "loss": 0.2963, "num_input_tokens_seen": 30013232, "step": 29105 }, { "epoch": 19.471571906354516, "grad_norm": 2.1839802265167236, "learning_rate": 2.1303780586081624e-08, "loss": 0.3758, "num_input_tokens_seen": 30019568, "step": 29110 }, { "epoch": 19.474916387959865, "grad_norm": 3.3127241134643555, "learning_rate": 2.1035493032028898e-08, "loss": 0.3837, "num_input_tokens_seen": 30024688, "step": 29115 }, { "epoch": 19.47826086956522, "grad_norm": 2.1576309204101562, "learning_rate": 2.076890196825998e-08, "loss": 0.4211, "num_input_tokens_seen": 30029776, "step": 29120 }, { "epoch": 19.48160535117057, "grad_norm": 3.1283316612243652, "learning_rate": 2.050400748560999e-08, "loss": 0.3591, "num_input_tokens_seen": 30034832, "step": 29125 }, { "epoch": 19.48494983277592, "grad_norm": 2.264983654022217, "learning_rate": 2.024080967433728e-08, "loss": 0.3727, "num_input_tokens_seen": 30040272, "step": 29130 }, { "epoch": 19.48829431438127, "grad_norm": 2.262467622756958, "learning_rate": 1.9979308624121783e-08, "loss": 0.3578, "num_input_tokens_seen": 30045648, "step": 29135 }, { "epoch": 19.49163879598662, "grad_norm": 3.6533379554748535, "learning_rate": 1.9719504424065005e-08, "loss": 0.3719, "num_input_tokens_seen": 30050832, "step": 29140 }, { "epoch": 19.494983277591974, "grad_norm": 4.060452938079834, "learning_rate": 1.9461397162690577e-08, "loss": 0.3776, "num_input_tokens_seen": 30055376, "step": 29145 }, { "epoch": 19.498327759197323, "grad_norm": 3.097447156906128, "learning_rate": 1.9204986927943703e-08, "loss": 0.3786, "num_input_tokens_seen": 30060304, "step": 29150 }, { "epoch": 19.501672240802677, "grad_norm": 2.2602779865264893, "learning_rate": 1.8950273807191166e-08, "loss": 0.3406, "num_input_tokens_seen": 30066000, "step": 29155 }, { "epoch": 19.505016722408026, "grad_norm": 3.146143913269043, "learning_rate": 1.8697257887221876e-08, "loss": 0.353, "num_input_tokens_seen": 30070960, "step": 29160 }, { "epoch": 19.50836120401338, "grad_norm": 3.570070505142212, "learning_rate": 1.844593925424687e-08, "loss": 0.4355, "num_input_tokens_seen": 30076208, "step": 29165 }, { "epoch": 19.51170568561873, "grad_norm": 3.0296647548675537, "learning_rate": 1.819631799389765e-08, "loss": 0.3648, "num_input_tokens_seen": 30080976, "step": 29170 }, { "epoch": 19.51505016722408, "grad_norm": 3.077130079269409, "learning_rate": 1.79483941912284e-08, "loss": 0.3907, "num_input_tokens_seen": 30085136, "step": 29175 }, { "epoch": 19.51839464882943, "grad_norm": 3.3505325317382812, "learning_rate": 1.7702167930714888e-08, "loss": 0.4079, "num_input_tokens_seen": 30089968, "step": 29180 }, { "epoch": 19.52173913043478, "grad_norm": 2.2780985832214355, "learning_rate": 1.7457639296253327e-08, "loss": 0.4213, "num_input_tokens_seen": 30095408, "step": 29185 }, { "epoch": 19.525083612040135, "grad_norm": 2.1840896606445312, "learning_rate": 1.7214808371163184e-08, "loss": 0.3147, "num_input_tokens_seen": 30100016, "step": 29190 }, { "epoch": 19.528428093645484, "grad_norm": 3.4377355575561523, "learning_rate": 1.6973675238183828e-08, "loss": 0.4685, "num_input_tokens_seen": 30105168, "step": 29195 }, { "epoch": 19.531772575250837, "grad_norm": 3.0167343616485596, "learning_rate": 1.6734239979477318e-08, "loss": 0.3494, "num_input_tokens_seen": 30110608, "step": 29200 }, { "epoch": 19.535117056856187, "grad_norm": 2.8103904724121094, "learning_rate": 1.6496502676626725e-08, "loss": 0.3269, "num_input_tokens_seen": 30116176, "step": 29205 }, { "epoch": 19.53846153846154, "grad_norm": 3.124032735824585, "learning_rate": 1.6260463410637252e-08, "loss": 0.3695, "num_input_tokens_seen": 30121616, "step": 29210 }, { "epoch": 19.54180602006689, "grad_norm": 2.600574493408203, "learning_rate": 1.6026122261934007e-08, "loss": 0.3762, "num_input_tokens_seen": 30127280, "step": 29215 }, { "epoch": 19.54515050167224, "grad_norm": 3.6601710319519043, "learning_rate": 1.5793479310364788e-08, "loss": 0.3596, "num_input_tokens_seen": 30132688, "step": 29220 }, { "epoch": 19.548494983277592, "grad_norm": 2.8179430961608887, "learning_rate": 1.5562534635198413e-08, "loss": 0.3622, "num_input_tokens_seen": 30137648, "step": 29225 }, { "epoch": 19.551839464882942, "grad_norm": 3.063349962234497, "learning_rate": 1.5333288315125262e-08, "loss": 0.2905, "num_input_tokens_seen": 30142416, "step": 29230 }, { "epoch": 19.555183946488295, "grad_norm": 3.1969153881073, "learning_rate": 1.5105740428256744e-08, "loss": 0.3345, "num_input_tokens_seen": 30148528, "step": 29235 }, { "epoch": 19.558528428093645, "grad_norm": 3.33769154548645, "learning_rate": 1.4879891052124728e-08, "loss": 0.4055, "num_input_tokens_seen": 30153776, "step": 29240 }, { "epoch": 19.561872909698998, "grad_norm": 3.2527108192443848, "learning_rate": 1.4655740263684327e-08, "loss": 0.4256, "num_input_tokens_seen": 30159472, "step": 29245 }, { "epoch": 19.565217391304348, "grad_norm": 2.5983760356903076, "learning_rate": 1.4433288139310553e-08, "loss": 0.3232, "num_input_tokens_seen": 30165296, "step": 29250 }, { "epoch": 19.568561872909697, "grad_norm": 2.4854719638824463, "learning_rate": 1.4212534754799445e-08, "loss": 0.3823, "num_input_tokens_seen": 30170608, "step": 29255 }, { "epoch": 19.57190635451505, "grad_norm": 3.3057219982147217, "learning_rate": 1.3993480185368613e-08, "loss": 0.326, "num_input_tokens_seen": 30175824, "step": 29260 }, { "epoch": 19.5752508361204, "grad_norm": 3.119497299194336, "learning_rate": 1.377612450565724e-08, "loss": 0.2676, "num_input_tokens_seen": 30180400, "step": 29265 }, { "epoch": 19.578595317725753, "grad_norm": 2.921090602874756, "learning_rate": 1.3560467789725529e-08, "loss": 0.3425, "num_input_tokens_seen": 30185552, "step": 29270 }, { "epoch": 19.581939799331103, "grad_norm": 3.469829797744751, "learning_rate": 1.334651011105359e-08, "loss": 0.3509, "num_input_tokens_seen": 30190512, "step": 29275 }, { "epoch": 19.585284280936456, "grad_norm": 2.9631266593933105, "learning_rate": 1.3134251542544774e-08, "loss": 0.3869, "num_input_tokens_seen": 30195440, "step": 29280 }, { "epoch": 19.588628762541806, "grad_norm": 2.9174654483795166, "learning_rate": 1.2923692156520674e-08, "loss": 0.3793, "num_input_tokens_seen": 30201008, "step": 29285 }, { "epoch": 19.59197324414716, "grad_norm": 2.3062820434570312, "learning_rate": 1.271483202472723e-08, "loss": 0.2986, "num_input_tokens_seen": 30205264, "step": 29290 }, { "epoch": 19.59531772575251, "grad_norm": 3.2611005306243896, "learning_rate": 1.2507671218328631e-08, "loss": 0.4728, "num_input_tokens_seen": 30210672, "step": 29295 }, { "epoch": 19.598662207357858, "grad_norm": 2.824624538421631, "learning_rate": 1.2302209807911192e-08, "loss": 0.3092, "num_input_tokens_seen": 30216464, "step": 29300 }, { "epoch": 19.60200668896321, "grad_norm": 2.4010772705078125, "learning_rate": 1.2098447863482243e-08, "loss": 0.2963, "num_input_tokens_seen": 30220912, "step": 29305 }, { "epoch": 19.60535117056856, "grad_norm": 3.7379214763641357, "learning_rate": 1.1896385454470138e-08, "loss": 0.4232, "num_input_tokens_seen": 30225744, "step": 29310 }, { "epoch": 19.608695652173914, "grad_norm": 3.051614761352539, "learning_rate": 1.1696022649723693e-08, "loss": 0.4133, "num_input_tokens_seen": 30230416, "step": 29315 }, { "epoch": 19.612040133779264, "grad_norm": 2.6716296672821045, "learning_rate": 1.1497359517512741e-08, "loss": 0.3088, "num_input_tokens_seen": 30235600, "step": 29320 }, { "epoch": 19.615384615384617, "grad_norm": 3.2485148906707764, "learning_rate": 1.1300396125528135e-08, "loss": 0.3319, "num_input_tokens_seen": 30240496, "step": 29325 }, { "epoch": 19.618729096989966, "grad_norm": 2.499432325363159, "learning_rate": 1.1105132540881747e-08, "loss": 0.3669, "num_input_tokens_seen": 30245616, "step": 29330 }, { "epoch": 19.62207357859532, "grad_norm": 2.565582752227783, "learning_rate": 1.0911568830105356e-08, "loss": 0.2783, "num_input_tokens_seen": 30250480, "step": 29335 }, { "epoch": 19.62541806020067, "grad_norm": 2.4588592052459717, "learning_rate": 1.0719705059153429e-08, "loss": 0.3014, "num_input_tokens_seen": 30256368, "step": 29340 }, { "epoch": 19.62876254180602, "grad_norm": 2.7278316020965576, "learning_rate": 1.052954129339867e-08, "loss": 0.3236, "num_input_tokens_seen": 30261072, "step": 29345 }, { "epoch": 19.632107023411372, "grad_norm": 3.0020668506622314, "learning_rate": 1.0341077597637028e-08, "loss": 0.3826, "num_input_tokens_seen": 30265456, "step": 29350 }, { "epoch": 19.63545150501672, "grad_norm": 3.805356979370117, "learning_rate": 1.0154314036083247e-08, "loss": 0.3962, "num_input_tokens_seen": 30270192, "step": 29355 }, { "epoch": 19.638795986622075, "grad_norm": 3.764726400375366, "learning_rate": 9.969250672373642e-09, "loss": 0.2684, "num_input_tokens_seen": 30274800, "step": 29360 }, { "epoch": 19.642140468227424, "grad_norm": 2.596707344055176, "learning_rate": 9.785887569565555e-09, "loss": 0.3827, "num_input_tokens_seen": 30279792, "step": 29365 }, { "epoch": 19.645484949832777, "grad_norm": 2.630265951156616, "learning_rate": 9.60422479013623e-09, "loss": 0.3445, "num_input_tokens_seen": 30286192, "step": 29370 }, { "epoch": 19.648829431438127, "grad_norm": 1.9405320882797241, "learning_rate": 9.42426239598393e-09, "loss": 0.4213, "num_input_tokens_seen": 30291408, "step": 29375 }, { "epoch": 19.652173913043477, "grad_norm": 2.505587577819824, "learning_rate": 9.246000448427938e-09, "loss": 0.4268, "num_input_tokens_seen": 30296496, "step": 29380 }, { "epoch": 19.65551839464883, "grad_norm": 2.4868004322052, "learning_rate": 9.069439008207447e-09, "loss": 0.3056, "num_input_tokens_seen": 30301072, "step": 29385 }, { "epoch": 19.65886287625418, "grad_norm": 3.41448974609375, "learning_rate": 8.894578135482113e-09, "loss": 0.4047, "num_input_tokens_seen": 30306288, "step": 29390 }, { "epoch": 19.662207357859533, "grad_norm": 2.5116333961486816, "learning_rate": 8.721417889833161e-09, "loss": 0.3911, "num_input_tokens_seen": 30311152, "step": 29395 }, { "epoch": 19.665551839464882, "grad_norm": 2.806391954421997, "learning_rate": 8.549958330261177e-09, "loss": 0.3308, "num_input_tokens_seen": 30315824, "step": 29400 }, { "epoch": 19.668896321070235, "grad_norm": 3.8424184322357178, "learning_rate": 8.380199515188314e-09, "loss": 0.3172, "num_input_tokens_seen": 30320432, "step": 29405 }, { "epoch": 19.672240802675585, "grad_norm": 2.2641568183898926, "learning_rate": 8.212141502456639e-09, "loss": 0.3749, "num_input_tokens_seen": 30325360, "step": 29410 }, { "epoch": 19.675585284280935, "grad_norm": 2.701085090637207, "learning_rate": 8.045784349329234e-09, "loss": 0.3028, "num_input_tokens_seen": 30330224, "step": 29415 }, { "epoch": 19.678929765886288, "grad_norm": 2.628476858139038, "learning_rate": 7.88112811248798e-09, "loss": 0.3134, "num_input_tokens_seen": 30336016, "step": 29420 }, { "epoch": 19.682274247491637, "grad_norm": 2.627042531967163, "learning_rate": 7.718172848037443e-09, "loss": 0.294, "num_input_tokens_seen": 30341648, "step": 29425 }, { "epoch": 19.68561872909699, "grad_norm": 2.189242362976074, "learning_rate": 7.556918611501541e-09, "loss": 0.3769, "num_input_tokens_seen": 30346448, "step": 29430 }, { "epoch": 19.68896321070234, "grad_norm": 2.89530348777771, "learning_rate": 7.397365457824657e-09, "loss": 0.4766, "num_input_tokens_seen": 30351920, "step": 29435 }, { "epoch": 19.692307692307693, "grad_norm": 4.354038715362549, "learning_rate": 7.239513441371637e-09, "loss": 0.3826, "num_input_tokens_seen": 30356752, "step": 29440 }, { "epoch": 19.695652173913043, "grad_norm": 2.993360757827759, "learning_rate": 7.083362615927236e-09, "loss": 0.3387, "num_input_tokens_seen": 30362352, "step": 29445 }, { "epoch": 19.698996655518396, "grad_norm": 2.4794421195983887, "learning_rate": 6.9289130346972264e-09, "loss": 0.3028, "num_input_tokens_seen": 30367536, "step": 29450 }, { "epoch": 19.702341137123746, "grad_norm": 2.7609283924102783, "learning_rate": 6.77616475030729e-09, "loss": 0.3633, "num_input_tokens_seen": 30372496, "step": 29455 }, { "epoch": 19.705685618729095, "grad_norm": 3.704930067062378, "learning_rate": 6.625117814804128e-09, "loss": 0.3953, "num_input_tokens_seen": 30377904, "step": 29460 }, { "epoch": 19.70903010033445, "grad_norm": 2.443020820617676, "learning_rate": 6.475772279653236e-09, "loss": 0.3429, "num_input_tokens_seen": 30382928, "step": 29465 }, { "epoch": 19.712374581939798, "grad_norm": 3.2346551418304443, "learning_rate": 6.328128195742245e-09, "loss": 0.3277, "num_input_tokens_seen": 30387632, "step": 29470 }, { "epoch": 19.71571906354515, "grad_norm": 2.581219434738159, "learning_rate": 6.1821856133781335e-09, "loss": 0.4047, "num_input_tokens_seen": 30392464, "step": 29475 }, { "epoch": 19.7190635451505, "grad_norm": 3.1281142234802246, "learning_rate": 6.037944582287236e-09, "loss": 0.344, "num_input_tokens_seen": 30397520, "step": 29480 }, { "epoch": 19.722408026755854, "grad_norm": 2.9812989234924316, "learning_rate": 5.895405151618017e-09, "loss": 0.4076, "num_input_tokens_seen": 30402768, "step": 29485 }, { "epoch": 19.725752508361204, "grad_norm": 2.5425899028778076, "learning_rate": 5.754567369937736e-09, "loss": 0.322, "num_input_tokens_seen": 30407664, "step": 29490 }, { "epoch": 19.729096989966557, "grad_norm": 2.2114884853363037, "learning_rate": 5.615431285234673e-09, "loss": 0.3315, "num_input_tokens_seen": 30412880, "step": 29495 }, { "epoch": 19.732441471571907, "grad_norm": 3.6691346168518066, "learning_rate": 5.477996944915909e-09, "loss": 0.3787, "num_input_tokens_seen": 30417648, "step": 29500 }, { "epoch": 19.735785953177256, "grad_norm": 2.339900016784668, "learning_rate": 5.342264395810648e-09, "loss": 0.3735, "num_input_tokens_seen": 30424080, "step": 29505 }, { "epoch": 19.73913043478261, "grad_norm": 2.5371265411376953, "learning_rate": 5.2082336841663414e-09, "loss": 0.2646, "num_input_tokens_seen": 30429072, "step": 29510 }, { "epoch": 19.74247491638796, "grad_norm": 2.4558043479919434, "learning_rate": 5.075904855652569e-09, "loss": 0.3474, "num_input_tokens_seen": 30434672, "step": 29515 }, { "epoch": 19.745819397993312, "grad_norm": 3.280967950820923, "learning_rate": 4.945277955357153e-09, "loss": 0.2835, "num_input_tokens_seen": 30438448, "step": 29520 }, { "epoch": 19.74916387959866, "grad_norm": 2.7340774536132812, "learning_rate": 4.816353027788933e-09, "loss": 0.3221, "num_input_tokens_seen": 30443824, "step": 29525 }, { "epoch": 19.752508361204015, "grad_norm": 3.4945666790008545, "learning_rate": 4.689130116877217e-09, "loss": 0.3712, "num_input_tokens_seen": 30450256, "step": 29530 }, { "epoch": 19.755852842809364, "grad_norm": 2.8573999404907227, "learning_rate": 4.563609265970104e-09, "loss": 0.3352, "num_input_tokens_seen": 30456400, "step": 29535 }, { "epoch": 19.759197324414714, "grad_norm": 2.395535945892334, "learning_rate": 4.439790517836717e-09, "loss": 0.3916, "num_input_tokens_seen": 30461008, "step": 29540 }, { "epoch": 19.762541806020067, "grad_norm": 2.443305253982544, "learning_rate": 4.317673914666087e-09, "loss": 0.4042, "num_input_tokens_seen": 30466384, "step": 29545 }, { "epoch": 19.765886287625417, "grad_norm": 2.3830368518829346, "learning_rate": 4.197259498067707e-09, "loss": 0.3404, "num_input_tokens_seen": 30472048, "step": 29550 }, { "epoch": 19.76923076923077, "grad_norm": 2.4559826850891113, "learning_rate": 4.078547309069869e-09, "loss": 0.3287, "num_input_tokens_seen": 30477232, "step": 29555 }, { "epoch": 19.77257525083612, "grad_norm": 2.8812737464904785, "learning_rate": 3.961537388121328e-09, "loss": 0.344, "num_input_tokens_seen": 30482480, "step": 29560 }, { "epoch": 19.775919732441473, "grad_norm": 6.013533592224121, "learning_rate": 3.846229775091859e-09, "loss": 0.4453, "num_input_tokens_seen": 30486704, "step": 29565 }, { "epoch": 19.779264214046822, "grad_norm": 3.1723828315734863, "learning_rate": 3.732624509270033e-09, "loss": 0.3947, "num_input_tokens_seen": 30491664, "step": 29570 }, { "epoch": 19.782608695652176, "grad_norm": 3.0184576511383057, "learning_rate": 3.6207216293648873e-09, "loss": 0.3131, "num_input_tokens_seen": 30497200, "step": 29575 }, { "epoch": 19.785953177257525, "grad_norm": 1.9807571172714233, "learning_rate": 3.5105211735053656e-09, "loss": 0.3952, "num_input_tokens_seen": 30502352, "step": 29580 }, { "epoch": 19.789297658862875, "grad_norm": 2.69755220413208, "learning_rate": 3.4020231792397663e-09, "loss": 0.364, "num_input_tokens_seen": 30507600, "step": 29585 }, { "epoch": 19.792642140468228, "grad_norm": 2.370529890060425, "learning_rate": 3.29522768353685e-09, "loss": 0.2903, "num_input_tokens_seen": 30513008, "step": 29590 }, { "epoch": 19.795986622073578, "grad_norm": 3.1194701194763184, "learning_rate": 3.1901347227858425e-09, "loss": 0.3517, "num_input_tokens_seen": 30518160, "step": 29595 }, { "epoch": 19.79933110367893, "grad_norm": 2.3848788738250732, "learning_rate": 3.0867443327942117e-09, "loss": 0.3953, "num_input_tokens_seen": 30523664, "step": 29600 }, { "epoch": 19.80267558528428, "grad_norm": 2.170250654220581, "learning_rate": 2.985056548791554e-09, "loss": 0.4069, "num_input_tokens_seen": 30529520, "step": 29605 }, { "epoch": 19.806020066889634, "grad_norm": 3.0275821685791016, "learning_rate": 2.885071405425155e-09, "loss": 0.4094, "num_input_tokens_seen": 30535632, "step": 29610 }, { "epoch": 19.809364548494983, "grad_norm": 2.2793543338775635, "learning_rate": 2.7867889367633182e-09, "loss": 0.3664, "num_input_tokens_seen": 30542352, "step": 29615 }, { "epoch": 19.812709030100333, "grad_norm": 3.173166036605835, "learning_rate": 2.690209176294256e-09, "loss": 0.3597, "num_input_tokens_seen": 30548304, "step": 29620 }, { "epoch": 19.816053511705686, "grad_norm": 2.8849968910217285, "learning_rate": 2.595332156925534e-09, "loss": 0.3049, "num_input_tokens_seen": 30553136, "step": 29625 }, { "epoch": 19.819397993311036, "grad_norm": 2.78950572013855, "learning_rate": 2.502157910984626e-09, "loss": 0.3933, "num_input_tokens_seen": 30559152, "step": 29630 }, { "epoch": 19.82274247491639, "grad_norm": 2.418653964996338, "learning_rate": 2.410686470218915e-09, "loss": 0.3681, "num_input_tokens_seen": 30563888, "step": 29635 }, { "epoch": 19.82608695652174, "grad_norm": 3.12142014503479, "learning_rate": 2.320917865795691e-09, "loss": 0.3376, "num_input_tokens_seen": 30568720, "step": 29640 }, { "epoch": 19.82943143812709, "grad_norm": 2.7482118606567383, "learning_rate": 2.2328521283027093e-09, "loss": 0.3297, "num_input_tokens_seen": 30573840, "step": 29645 }, { "epoch": 19.83277591973244, "grad_norm": 2.389814615249634, "learning_rate": 2.1464892877454126e-09, "loss": 0.2868, "num_input_tokens_seen": 30579472, "step": 29650 }, { "epoch": 19.836120401337794, "grad_norm": 2.8045356273651123, "learning_rate": 2.0618293735513717e-09, "loss": 0.4006, "num_input_tokens_seen": 30584464, "step": 29655 }, { "epoch": 19.839464882943144, "grad_norm": 2.8696401119232178, "learning_rate": 1.9788724145664018e-09, "loss": 0.363, "num_input_tokens_seen": 30588976, "step": 29660 }, { "epoch": 19.842809364548494, "grad_norm": 2.6656696796417236, "learning_rate": 1.897618439056781e-09, "loss": 0.3832, "num_input_tokens_seen": 30594000, "step": 29665 }, { "epoch": 19.846153846153847, "grad_norm": 2.9932310581207275, "learning_rate": 1.8180674747075856e-09, "loss": 0.2821, "num_input_tokens_seen": 30598608, "step": 29670 }, { "epoch": 19.849498327759196, "grad_norm": 3.17958402633667, "learning_rate": 1.7402195486254658e-09, "loss": 0.3542, "num_input_tokens_seen": 30603248, "step": 29675 }, { "epoch": 19.85284280936455, "grad_norm": 2.7408335208892822, "learning_rate": 1.6640746873353152e-09, "loss": 0.4237, "num_input_tokens_seen": 30608976, "step": 29680 }, { "epoch": 19.8561872909699, "grad_norm": 2.453921318054199, "learning_rate": 1.5896329167813807e-09, "loss": 0.4306, "num_input_tokens_seen": 30613936, "step": 29685 }, { "epoch": 19.859531772575252, "grad_norm": 2.2274742126464844, "learning_rate": 1.5168942623294825e-09, "loss": 0.3155, "num_input_tokens_seen": 30619408, "step": 29690 }, { "epoch": 19.862876254180602, "grad_norm": 2.1897008419036865, "learning_rate": 1.4458587487631293e-09, "loss": 0.3346, "num_input_tokens_seen": 30624560, "step": 29695 }, { "epoch": 19.86622073578595, "grad_norm": 4.051370143890381, "learning_rate": 1.3765264002862933e-09, "loss": 0.3023, "num_input_tokens_seen": 30630128, "step": 29700 }, { "epoch": 19.869565217391305, "grad_norm": 2.581756114959717, "learning_rate": 1.3088972405234101e-09, "loss": 0.2897, "num_input_tokens_seen": 30635760, "step": 29705 }, { "epoch": 19.872909698996654, "grad_norm": 2.5068516731262207, "learning_rate": 1.2429712925171588e-09, "loss": 0.4416, "num_input_tokens_seen": 30641072, "step": 29710 }, { "epoch": 19.876254180602007, "grad_norm": 3.1305394172668457, "learning_rate": 1.1787485787306818e-09, "loss": 0.4411, "num_input_tokens_seen": 30647088, "step": 29715 }, { "epoch": 19.879598662207357, "grad_norm": 2.9900197982788086, "learning_rate": 1.11622912104703e-09, "loss": 0.4324, "num_input_tokens_seen": 30651216, "step": 29720 }, { "epoch": 19.88294314381271, "grad_norm": 2.60184383392334, "learning_rate": 1.0554129407686076e-09, "loss": 0.3383, "num_input_tokens_seen": 30655984, "step": 29725 }, { "epoch": 19.88628762541806, "grad_norm": 2.384425163269043, "learning_rate": 9.963000586171722e-10, "loss": 0.3286, "num_input_tokens_seen": 30661520, "step": 29730 }, { "epoch": 19.889632107023413, "grad_norm": 3.434180974960327, "learning_rate": 9.38890494733835e-10, "loss": 0.3335, "num_input_tokens_seen": 30665936, "step": 29735 }, { "epoch": 19.892976588628763, "grad_norm": 3.1718671321868896, "learning_rate": 8.831842686807258e-10, "loss": 0.33, "num_input_tokens_seen": 30670960, "step": 29740 }, { "epoch": 19.896321070234112, "grad_norm": 2.5357494354248047, "learning_rate": 8.291813994387721e-10, "loss": 0.3258, "num_input_tokens_seen": 30675824, "step": 29745 }, { "epoch": 19.899665551839465, "grad_norm": 3.4253036975860596, "learning_rate": 7.768819054077004e-10, "loss": 0.3846, "num_input_tokens_seen": 30681296, "step": 29750 }, { "epoch": 19.903010033444815, "grad_norm": 2.9087538719177246, "learning_rate": 7.262858044077003e-10, "loss": 0.2916, "num_input_tokens_seen": 30685744, "step": 29755 }, { "epoch": 19.906354515050168, "grad_norm": 3.3166041374206543, "learning_rate": 6.773931136794254e-10, "loss": 0.4333, "num_input_tokens_seen": 30691920, "step": 29760 }, { "epoch": 19.909698996655518, "grad_norm": 2.0703933238983154, "learning_rate": 6.302038498806618e-10, "loss": 0.3699, "num_input_tokens_seen": 30697392, "step": 29765 }, { "epoch": 19.91304347826087, "grad_norm": 3.0315968990325928, "learning_rate": 5.847180290913246e-10, "loss": 0.3523, "num_input_tokens_seen": 30702544, "step": 29770 }, { "epoch": 19.91638795986622, "grad_norm": 3.786302089691162, "learning_rate": 5.409356668101273e-10, "loss": 0.4242, "num_input_tokens_seen": 30707984, "step": 29775 }, { "epoch": 19.919732441471574, "grad_norm": 3.7521369457244873, "learning_rate": 4.988567779540266e-10, "loss": 0.3663, "num_input_tokens_seen": 30712752, "step": 29780 }, { "epoch": 19.923076923076923, "grad_norm": 3.720855474472046, "learning_rate": 4.584813768609975e-10, "loss": 0.3607, "num_input_tokens_seen": 30717136, "step": 29785 }, { "epoch": 19.926421404682273, "grad_norm": 2.619049310684204, "learning_rate": 4.1980947728892385e-10, "loss": 0.3451, "num_input_tokens_seen": 30721744, "step": 29790 }, { "epoch": 19.929765886287626, "grad_norm": 2.54358172416687, "learning_rate": 3.828410924133774e-10, "loss": 0.2614, "num_input_tokens_seen": 30726544, "step": 29795 }, { "epoch": 19.933110367892976, "grad_norm": 2.488623857498169, "learning_rate": 3.4757623483205885e-10, "loss": 0.4823, "num_input_tokens_seen": 30731920, "step": 29800 }, { "epoch": 19.93645484949833, "grad_norm": 2.4751482009887695, "learning_rate": 3.1401491655980167e-10, "loss": 0.3823, "num_input_tokens_seen": 30736784, "step": 29805 }, { "epoch": 19.93979933110368, "grad_norm": 2.5375969409942627, "learning_rate": 2.821571490319031e-10, "loss": 0.3133, "num_input_tokens_seen": 30742288, "step": 29810 }, { "epoch": 19.94314381270903, "grad_norm": 2.267002820968628, "learning_rate": 2.5200294310356866e-10, "loss": 0.2895, "num_input_tokens_seen": 30747632, "step": 29815 }, { "epoch": 19.94648829431438, "grad_norm": 3.8229663372039795, "learning_rate": 2.2355230904991254e-10, "loss": 0.3703, "num_input_tokens_seen": 30753904, "step": 29820 }, { "epoch": 19.94983277591973, "grad_norm": 2.3507566452026367, "learning_rate": 1.9680525656429196e-10, "loss": 0.4401, "num_input_tokens_seen": 30758896, "step": 29825 }, { "epoch": 19.953177257525084, "grad_norm": 2.9404056072235107, "learning_rate": 1.7176179475997257e-10, "loss": 0.3573, "num_input_tokens_seen": 30764048, "step": 29830 }, { "epoch": 19.956521739130434, "grad_norm": 2.6927638053894043, "learning_rate": 1.484219321712388e-10, "loss": 0.3393, "num_input_tokens_seen": 30769200, "step": 29835 }, { "epoch": 19.959866220735787, "grad_norm": 2.5626354217529297, "learning_rate": 1.2678567674950793e-10, "loss": 0.4903, "num_input_tokens_seen": 30774288, "step": 29840 }, { "epoch": 19.963210702341136, "grad_norm": 2.316054344177246, "learning_rate": 1.0685303586721596e-10, "loss": 0.3682, "num_input_tokens_seen": 30779536, "step": 29845 }, { "epoch": 19.96655518394649, "grad_norm": 3.13614559173584, "learning_rate": 8.862401631670736e-11, "loss": 0.3191, "num_input_tokens_seen": 30784720, "step": 29850 }, { "epoch": 19.96989966555184, "grad_norm": 2.7688746452331543, "learning_rate": 7.209862430801462e-11, "loss": 0.4022, "num_input_tokens_seen": 30790032, "step": 29855 }, { "epoch": 19.97324414715719, "grad_norm": 2.947218418121338, "learning_rate": 5.727686547329914e-11, "loss": 0.3466, "num_input_tokens_seen": 30795312, "step": 29860 }, { "epoch": 19.976588628762542, "grad_norm": 3.235051393508911, "learning_rate": 4.415874486185523e-11, "loss": 0.3945, "num_input_tokens_seen": 30800656, "step": 29865 }, { "epoch": 19.97993311036789, "grad_norm": 3.05672287940979, "learning_rate": 3.274426694344079e-11, "loss": 0.3107, "num_input_tokens_seen": 30806672, "step": 29870 }, { "epoch": 19.983277591973245, "grad_norm": 2.5723724365234375, "learning_rate": 2.3033435607167087e-11, "loss": 0.3061, "num_input_tokens_seen": 30811984, "step": 29875 }, { "epoch": 19.986622073578594, "grad_norm": 2.638225555419922, "learning_rate": 1.5026254162608943e-11, "loss": 0.3375, "num_input_tokens_seen": 30816368, "step": 29880 }, { "epoch": 19.989966555183948, "grad_norm": 2.0039429664611816, "learning_rate": 8.722725337584337e-12, "loss": 0.3478, "num_input_tokens_seen": 30822320, "step": 29885 }, { "epoch": 19.993311036789297, "grad_norm": 3.206867218017578, "learning_rate": 4.122851279819706e-12, "loss": 0.3662, "num_input_tokens_seen": 30827952, "step": 29890 }, { "epoch": 19.99665551839465, "grad_norm": 4.139222145080566, "learning_rate": 1.2266335569499633e-12, "loss": 0.3845, "num_input_tokens_seen": 30833296, "step": 29895 }, { "epoch": 20.0, "grad_norm": 2.530097723007202, "learning_rate": 3.407315596337668e-14, "loss": 0.3472, "num_input_tokens_seen": 30837792, "step": 29900 }, { "epoch": 20.0, "eval_loss": 0.5358567833900452, "eval_runtime": 37.6984, "eval_samples_per_second": 39.657, "eval_steps_per_second": 9.921, "num_input_tokens_seen": 30837792, "step": 29900 }, { "epoch": 20.0, "num_input_tokens_seen": 30837792, "step": 29900, "total_flos": 1.3886118360648253e+18, "train_loss": 0.49428450607934526, "train_runtime": 7026.5426, "train_samples_per_second": 17.015, "train_steps_per_second": 4.255 } ], "logging_steps": 5, "max_steps": 29900, "num_input_tokens_seen": 30837792, "num_train_epochs": 20, "save_steps": 2990, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3886118360648253e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }