{ "best_global_step": 10176, "best_metric": 0.5032580494880676, "best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_codealpacapy_101112_1770403545/checkpoint-10176", "epoch": 20.0, "eval_steps": 3392, "global_step": 33920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00294811320754717, "grad_norm": 74.06575775146484, "learning_rate": 1.179245283018868e-08, "loss": 7.0485, "num_input_tokens_seen": 3360, "step": 5 }, { "epoch": 0.00589622641509434, "grad_norm": 63.695701599121094, "learning_rate": 2.6533018867924528e-08, "loss": 6.4131, "num_input_tokens_seen": 8608, "step": 10 }, { "epoch": 0.00884433962264151, "grad_norm": 114.82685852050781, "learning_rate": 4.127358490566038e-08, "loss": 7.1672, "num_input_tokens_seen": 12000, "step": 15 }, { "epoch": 0.01179245283018868, "grad_norm": 97.50489807128906, "learning_rate": 5.601415094339623e-08, "loss": 7.0605, "num_input_tokens_seen": 18880, "step": 20 }, { "epoch": 0.01474056603773585, "grad_norm": 103.65096282958984, "learning_rate": 7.075471698113208e-08, "loss": 7.543, "num_input_tokens_seen": 21664, "step": 25 }, { "epoch": 0.01768867924528302, "grad_norm": 79.8503646850586, "learning_rate": 8.549528301886793e-08, "loss": 6.7796, "num_input_tokens_seen": 24864, "step": 30 }, { "epoch": 0.020636792452830188, "grad_norm": 87.39107513427734, "learning_rate": 1.0023584905660378e-07, "loss": 6.6321, "num_input_tokens_seen": 28448, "step": 35 }, { "epoch": 0.02358490566037736, "grad_norm": 63.60541915893555, "learning_rate": 1.1497641509433962e-07, "loss": 6.8088, "num_input_tokens_seen": 31648, "step": 40 }, { "epoch": 0.02653301886792453, "grad_norm": 78.94878387451172, "learning_rate": 1.297169811320755e-07, "loss": 7.0507, "num_input_tokens_seen": 35392, "step": 45 }, { "epoch": 0.0294811320754717, "grad_norm": 86.38248443603516, "learning_rate": 1.4445754716981135e-07, "loss": 6.4107, "num_input_tokens_seen": 39104, "step": 50 }, { "epoch": 0.03242924528301887, "grad_norm": 74.74366760253906, "learning_rate": 1.591981132075472e-07, "loss": 6.4995, "num_input_tokens_seen": 43584, "step": 55 }, { "epoch": 0.03537735849056604, "grad_norm": 58.59898376464844, "learning_rate": 1.7393867924528304e-07, "loss": 6.1427, "num_input_tokens_seen": 47520, "step": 60 }, { "epoch": 0.038325471698113206, "grad_norm": 80.27046203613281, "learning_rate": 1.886792452830189e-07, "loss": 6.4795, "num_input_tokens_seen": 50720, "step": 65 }, { "epoch": 0.041273584905660375, "grad_norm": 72.09033966064453, "learning_rate": 2.0341981132075473e-07, "loss": 6.0678, "num_input_tokens_seen": 53824, "step": 70 }, { "epoch": 0.044221698113207544, "grad_norm": 56.44142150878906, "learning_rate": 2.1816037735849058e-07, "loss": 6.7139, "num_input_tokens_seen": 57056, "step": 75 }, { "epoch": 0.04716981132075472, "grad_norm": 56.32357406616211, "learning_rate": 2.3290094339622643e-07, "loss": 6.0458, "num_input_tokens_seen": 60864, "step": 80 }, { "epoch": 0.05011792452830189, "grad_norm": 73.15631866455078, "learning_rate": 2.476415094339623e-07, "loss": 6.1236, "num_input_tokens_seen": 64160, "step": 85 }, { "epoch": 0.05306603773584906, "grad_norm": 59.23871612548828, "learning_rate": 2.6238207547169815e-07, "loss": 5.914, "num_input_tokens_seen": 68064, "step": 90 }, { "epoch": 0.05601415094339623, "grad_norm": 66.3110122680664, "learning_rate": 2.7712264150943397e-07, "loss": 6.4511, "num_input_tokens_seen": 71840, "step": 95 }, { "epoch": 0.0589622641509434, "grad_norm": 57.93452835083008, "learning_rate": 2.9186320754716984e-07, "loss": 6.0237, "num_input_tokens_seen": 74976, "step": 100 }, { "epoch": 0.061910377358490566, "grad_norm": 75.299072265625, "learning_rate": 3.0660377358490567e-07, "loss": 6.0977, "num_input_tokens_seen": 77536, "step": 105 }, { "epoch": 0.06485849056603774, "grad_norm": 54.79448318481445, "learning_rate": 3.213443396226416e-07, "loss": 5.7799, "num_input_tokens_seen": 80352, "step": 110 }, { "epoch": 0.06780660377358491, "grad_norm": 37.4268798828125, "learning_rate": 3.3608490566037736e-07, "loss": 5.5223, "num_input_tokens_seen": 83360, "step": 115 }, { "epoch": 0.07075471698113207, "grad_norm": 52.39466857910156, "learning_rate": 3.508254716981133e-07, "loss": 5.1928, "num_input_tokens_seen": 85792, "step": 120 }, { "epoch": 0.07370283018867925, "grad_norm": 58.12413787841797, "learning_rate": 3.6556603773584905e-07, "loss": 5.3914, "num_input_tokens_seen": 88576, "step": 125 }, { "epoch": 0.07665094339622641, "grad_norm": 40.727622985839844, "learning_rate": 3.80306603773585e-07, "loss": 5.116, "num_input_tokens_seen": 92032, "step": 130 }, { "epoch": 0.07959905660377359, "grad_norm": 44.57987594604492, "learning_rate": 3.9504716981132075e-07, "loss": 5.0268, "num_input_tokens_seen": 96416, "step": 135 }, { "epoch": 0.08254716981132075, "grad_norm": 40.94643783569336, "learning_rate": 4.097877358490567e-07, "loss": 4.6469, "num_input_tokens_seen": 100032, "step": 140 }, { "epoch": 0.08549528301886793, "grad_norm": 38.7252082824707, "learning_rate": 4.2452830188679244e-07, "loss": 4.7991, "num_input_tokens_seen": 102368, "step": 145 }, { "epoch": 0.08844339622641509, "grad_norm": 40.15935516357422, "learning_rate": 4.3926886792452837e-07, "loss": 4.7584, "num_input_tokens_seen": 106016, "step": 150 }, { "epoch": 0.09139150943396226, "grad_norm": 43.73161697387695, "learning_rate": 4.5400943396226414e-07, "loss": 4.6684, "num_input_tokens_seen": 108928, "step": 155 }, { "epoch": 0.09433962264150944, "grad_norm": 37.12959289550781, "learning_rate": 4.6875000000000006e-07, "loss": 4.4541, "num_input_tokens_seen": 112192, "step": 160 }, { "epoch": 0.0972877358490566, "grad_norm": 28.593889236450195, "learning_rate": 4.834905660377359e-07, "loss": 4.4469, "num_input_tokens_seen": 116160, "step": 165 }, { "epoch": 0.10023584905660378, "grad_norm": 37.64503479003906, "learning_rate": 4.982311320754717e-07, "loss": 4.9826, "num_input_tokens_seen": 118848, "step": 170 }, { "epoch": 0.10318396226415094, "grad_norm": 30.13062286376953, "learning_rate": 5.129716981132076e-07, "loss": 4.3071, "num_input_tokens_seen": 121664, "step": 175 }, { "epoch": 0.10613207547169812, "grad_norm": 37.148170471191406, "learning_rate": 5.277122641509435e-07, "loss": 3.8958, "num_input_tokens_seen": 124544, "step": 180 }, { "epoch": 0.10908018867924528, "grad_norm": 37.15571212768555, "learning_rate": 5.424528301886793e-07, "loss": 3.9678, "num_input_tokens_seen": 127104, "step": 185 }, { "epoch": 0.11202830188679246, "grad_norm": 32.422607421875, "learning_rate": 5.571933962264151e-07, "loss": 4.0399, "num_input_tokens_seen": 129984, "step": 190 }, { "epoch": 0.11497641509433962, "grad_norm": 24.724512100219727, "learning_rate": 5.71933962264151e-07, "loss": 3.961, "num_input_tokens_seen": 135328, "step": 195 }, { "epoch": 0.1179245283018868, "grad_norm": 29.6192569732666, "learning_rate": 5.866745283018868e-07, "loss": 4.127, "num_input_tokens_seen": 137920, "step": 200 }, { "epoch": 0.12087264150943396, "grad_norm": 25.562232971191406, "learning_rate": 6.014150943396227e-07, "loss": 3.7078, "num_input_tokens_seen": 140896, "step": 205 }, { "epoch": 0.12382075471698113, "grad_norm": 20.49237823486328, "learning_rate": 6.161556603773585e-07, "loss": 3.9372, "num_input_tokens_seen": 143808, "step": 210 }, { "epoch": 0.1267688679245283, "grad_norm": 32.59312438964844, "learning_rate": 6.308962264150945e-07, "loss": 3.675, "num_input_tokens_seen": 146944, "step": 215 }, { "epoch": 0.12971698113207547, "grad_norm": 23.946186065673828, "learning_rate": 6.456367924528302e-07, "loss": 3.7759, "num_input_tokens_seen": 149376, "step": 220 }, { "epoch": 0.13266509433962265, "grad_norm": 21.426753997802734, "learning_rate": 6.603773584905661e-07, "loss": 3.9826, "num_input_tokens_seen": 152448, "step": 225 }, { "epoch": 0.13561320754716982, "grad_norm": 21.669078826904297, "learning_rate": 6.75117924528302e-07, "loss": 3.65, "num_input_tokens_seen": 155552, "step": 230 }, { "epoch": 0.13856132075471697, "grad_norm": 38.23433303833008, "learning_rate": 6.898584905660379e-07, "loss": 3.5417, "num_input_tokens_seen": 158688, "step": 235 }, { "epoch": 0.14150943396226415, "grad_norm": 22.041948318481445, "learning_rate": 7.045990566037736e-07, "loss": 4.1353, "num_input_tokens_seen": 161568, "step": 240 }, { "epoch": 0.14445754716981132, "grad_norm": 33.50838088989258, "learning_rate": 7.193396226415095e-07, "loss": 3.5293, "num_input_tokens_seen": 164224, "step": 245 }, { "epoch": 0.1474056603773585, "grad_norm": 26.110963821411133, "learning_rate": 7.340801886792454e-07, "loss": 3.6508, "num_input_tokens_seen": 167168, "step": 250 }, { "epoch": 0.15035377358490565, "grad_norm": 23.420310974121094, "learning_rate": 7.488207547169812e-07, "loss": 3.5122, "num_input_tokens_seen": 170560, "step": 255 }, { "epoch": 0.15330188679245282, "grad_norm": 36.95894241333008, "learning_rate": 7.63561320754717e-07, "loss": 3.2788, "num_input_tokens_seen": 173472, "step": 260 }, { "epoch": 0.15625, "grad_norm": 23.66181755065918, "learning_rate": 7.783018867924529e-07, "loss": 3.6387, "num_input_tokens_seen": 177472, "step": 265 }, { "epoch": 0.15919811320754718, "grad_norm": 22.023290634155273, "learning_rate": 7.930424528301888e-07, "loss": 3.4332, "num_input_tokens_seen": 180224, "step": 270 }, { "epoch": 0.16214622641509435, "grad_norm": 24.26153564453125, "learning_rate": 8.077830188679246e-07, "loss": 3.4263, "num_input_tokens_seen": 183584, "step": 275 }, { "epoch": 0.1650943396226415, "grad_norm": 22.44004249572754, "learning_rate": 8.225235849056605e-07, "loss": 3.7345, "num_input_tokens_seen": 185984, "step": 280 }, { "epoch": 0.16804245283018868, "grad_norm": 23.551084518432617, "learning_rate": 8.372641509433963e-07, "loss": 3.3136, "num_input_tokens_seen": 188672, "step": 285 }, { "epoch": 0.17099056603773585, "grad_norm": 26.49611473083496, "learning_rate": 8.520047169811321e-07, "loss": 3.2689, "num_input_tokens_seen": 191520, "step": 290 }, { "epoch": 0.17393867924528303, "grad_norm": 21.189117431640625, "learning_rate": 8.66745283018868e-07, "loss": 3.0941, "num_input_tokens_seen": 195008, "step": 295 }, { "epoch": 0.17688679245283018, "grad_norm": 20.543933868408203, "learning_rate": 8.814858490566039e-07, "loss": 3.1132, "num_input_tokens_seen": 198336, "step": 300 }, { "epoch": 0.17983490566037735, "grad_norm": 21.540475845336914, "learning_rate": 8.962264150943397e-07, "loss": 2.916, "num_input_tokens_seen": 201408, "step": 305 }, { "epoch": 0.18278301886792453, "grad_norm": 19.49152183532715, "learning_rate": 9.109669811320755e-07, "loss": 2.8331, "num_input_tokens_seen": 208480, "step": 310 }, { "epoch": 0.1857311320754717, "grad_norm": 57.73466110229492, "learning_rate": 9.257075471698114e-07, "loss": 3.1466, "num_input_tokens_seen": 210944, "step": 315 }, { "epoch": 0.18867924528301888, "grad_norm": 51.177791595458984, "learning_rate": 9.404481132075473e-07, "loss": 3.2353, "num_input_tokens_seen": 213888, "step": 320 }, { "epoch": 0.19162735849056603, "grad_norm": 19.382444381713867, "learning_rate": 9.551886792452833e-07, "loss": 3.1111, "num_input_tokens_seen": 217568, "step": 325 }, { "epoch": 0.1945754716981132, "grad_norm": 18.758729934692383, "learning_rate": 9.699292452830188e-07, "loss": 2.7765, "num_input_tokens_seen": 221152, "step": 330 }, { "epoch": 0.19752358490566038, "grad_norm": 20.98778533935547, "learning_rate": 9.846698113207548e-07, "loss": 2.8761, "num_input_tokens_seen": 223712, "step": 335 }, { "epoch": 0.20047169811320756, "grad_norm": 26.127901077270508, "learning_rate": 9.994103773584906e-07, "loss": 2.6667, "num_input_tokens_seen": 226848, "step": 340 }, { "epoch": 0.2034198113207547, "grad_norm": 27.323904037475586, "learning_rate": 1.0141509433962265e-06, "loss": 2.6245, "num_input_tokens_seen": 230112, "step": 345 }, { "epoch": 0.20636792452830188, "grad_norm": 16.280197143554688, "learning_rate": 1.0288915094339623e-06, "loss": 2.5657, "num_input_tokens_seen": 233536, "step": 350 }, { "epoch": 0.20931603773584906, "grad_norm": 27.93326759338379, "learning_rate": 1.043632075471698e-06, "loss": 2.9909, "num_input_tokens_seen": 236704, "step": 355 }, { "epoch": 0.21226415094339623, "grad_norm": 28.224987030029297, "learning_rate": 1.058372641509434e-06, "loss": 2.4521, "num_input_tokens_seen": 239616, "step": 360 }, { "epoch": 0.21521226415094338, "grad_norm": 19.11638069152832, "learning_rate": 1.07311320754717e-06, "loss": 2.6179, "num_input_tokens_seen": 243200, "step": 365 }, { "epoch": 0.21816037735849056, "grad_norm": 23.735010147094727, "learning_rate": 1.0878537735849056e-06, "loss": 3.0654, "num_input_tokens_seen": 246560, "step": 370 }, { "epoch": 0.22110849056603774, "grad_norm": 30.926237106323242, "learning_rate": 1.1025943396226416e-06, "loss": 2.6246, "num_input_tokens_seen": 248832, "step": 375 }, { "epoch": 0.2240566037735849, "grad_norm": 20.028623580932617, "learning_rate": 1.1173349056603773e-06, "loss": 2.4666, "num_input_tokens_seen": 252512, "step": 380 }, { "epoch": 0.2270047169811321, "grad_norm": 20.758230209350586, "learning_rate": 1.1320754716981133e-06, "loss": 2.4523, "num_input_tokens_seen": 255104, "step": 385 }, { "epoch": 0.22995283018867924, "grad_norm": 27.85914421081543, "learning_rate": 1.1468160377358493e-06, "loss": 2.514, "num_input_tokens_seen": 259584, "step": 390 }, { "epoch": 0.2329009433962264, "grad_norm": 19.635202407836914, "learning_rate": 1.1615566037735849e-06, "loss": 2.7188, "num_input_tokens_seen": 262752, "step": 395 }, { "epoch": 0.2358490566037736, "grad_norm": 23.203166961669922, "learning_rate": 1.1762971698113208e-06, "loss": 2.4774, "num_input_tokens_seen": 265472, "step": 400 }, { "epoch": 0.23879716981132076, "grad_norm": 24.519420623779297, "learning_rate": 1.1910377358490568e-06, "loss": 2.3511, "num_input_tokens_seen": 269184, "step": 405 }, { "epoch": 0.2417452830188679, "grad_norm": 27.509197235107422, "learning_rate": 1.2057783018867926e-06, "loss": 2.2869, "num_input_tokens_seen": 272512, "step": 410 }, { "epoch": 0.2446933962264151, "grad_norm": 26.423038482666016, "learning_rate": 1.2205188679245284e-06, "loss": 2.2368, "num_input_tokens_seen": 275392, "step": 415 }, { "epoch": 0.24764150943396226, "grad_norm": 21.470245361328125, "learning_rate": 1.2352594339622641e-06, "loss": 2.1817, "num_input_tokens_seen": 278528, "step": 420 }, { "epoch": 0.2505896226415094, "grad_norm": 26.609819412231445, "learning_rate": 1.25e-06, "loss": 2.1442, "num_input_tokens_seen": 281728, "step": 425 }, { "epoch": 0.2535377358490566, "grad_norm": 21.011613845825195, "learning_rate": 1.264740566037736e-06, "loss": 1.9372, "num_input_tokens_seen": 285024, "step": 430 }, { "epoch": 0.25648584905660377, "grad_norm": 19.21404457092285, "learning_rate": 1.2794811320754718e-06, "loss": 1.9145, "num_input_tokens_seen": 289728, "step": 435 }, { "epoch": 0.25943396226415094, "grad_norm": 18.226816177368164, "learning_rate": 1.2942216981132078e-06, "loss": 2.1572, "num_input_tokens_seen": 292352, "step": 440 }, { "epoch": 0.2623820754716981, "grad_norm": 24.116395950317383, "learning_rate": 1.3089622641509436e-06, "loss": 2.2892, "num_input_tokens_seen": 295808, "step": 445 }, { "epoch": 0.2653301886792453, "grad_norm": 14.305411338806152, "learning_rate": 1.3237028301886792e-06, "loss": 2.4007, "num_input_tokens_seen": 299264, "step": 450 }, { "epoch": 0.26827830188679247, "grad_norm": 18.555932998657227, "learning_rate": 1.3384433962264151e-06, "loss": 1.9879, "num_input_tokens_seen": 302304, "step": 455 }, { "epoch": 0.27122641509433965, "grad_norm": 17.686676025390625, "learning_rate": 1.353183962264151e-06, "loss": 2.0023, "num_input_tokens_seen": 306368, "step": 460 }, { "epoch": 0.27417452830188677, "grad_norm": 25.136276245117188, "learning_rate": 1.3679245283018869e-06, "loss": 1.8714, "num_input_tokens_seen": 310880, "step": 465 }, { "epoch": 0.27712264150943394, "grad_norm": 25.394203186035156, "learning_rate": 1.3826650943396229e-06, "loss": 1.8218, "num_input_tokens_seen": 313760, "step": 470 }, { "epoch": 0.2800707547169811, "grad_norm": 40.08849334716797, "learning_rate": 1.3974056603773586e-06, "loss": 1.8238, "num_input_tokens_seen": 317056, "step": 475 }, { "epoch": 0.2830188679245283, "grad_norm": 28.043527603149414, "learning_rate": 1.4121462264150946e-06, "loss": 1.8621, "num_input_tokens_seen": 320288, "step": 480 }, { "epoch": 0.28596698113207547, "grad_norm": 36.6243896484375, "learning_rate": 1.4268867924528304e-06, "loss": 1.8917, "num_input_tokens_seen": 323616, "step": 485 }, { "epoch": 0.28891509433962265, "grad_norm": 23.966020584106445, "learning_rate": 1.4416273584905664e-06, "loss": 2.3002, "num_input_tokens_seen": 326592, "step": 490 }, { "epoch": 0.2918632075471698, "grad_norm": 23.04029655456543, "learning_rate": 1.456367924528302e-06, "loss": 1.8434, "num_input_tokens_seen": 329472, "step": 495 }, { "epoch": 0.294811320754717, "grad_norm": 17.38574981689453, "learning_rate": 1.4711084905660377e-06, "loss": 1.4601, "num_input_tokens_seen": 332960, "step": 500 }, { "epoch": 0.2977594339622642, "grad_norm": 21.232189178466797, "learning_rate": 1.4858490566037737e-06, "loss": 1.6417, "num_input_tokens_seen": 336128, "step": 505 }, { "epoch": 0.3007075471698113, "grad_norm": 22.355833053588867, "learning_rate": 1.5005896226415096e-06, "loss": 1.5932, "num_input_tokens_seen": 340896, "step": 510 }, { "epoch": 0.30365566037735847, "grad_norm": 21.422412872314453, "learning_rate": 1.5153301886792454e-06, "loss": 1.785, "num_input_tokens_seen": 344288, "step": 515 }, { "epoch": 0.30660377358490565, "grad_norm": 21.26073455810547, "learning_rate": 1.5300707547169814e-06, "loss": 1.2948, "num_input_tokens_seen": 346816, "step": 520 }, { "epoch": 0.3095518867924528, "grad_norm": 17.090290069580078, "learning_rate": 1.5448113207547172e-06, "loss": 1.2109, "num_input_tokens_seen": 349344, "step": 525 }, { "epoch": 0.3125, "grad_norm": 19.889816284179688, "learning_rate": 1.5595518867924531e-06, "loss": 1.5124, "num_input_tokens_seen": 352160, "step": 530 }, { "epoch": 0.3154481132075472, "grad_norm": 15.621281623840332, "learning_rate": 1.574292452830189e-06, "loss": 1.4123, "num_input_tokens_seen": 356320, "step": 535 }, { "epoch": 0.31839622641509435, "grad_norm": 14.42752742767334, "learning_rate": 1.5890330188679245e-06, "loss": 1.4001, "num_input_tokens_seen": 359616, "step": 540 }, { "epoch": 0.32134433962264153, "grad_norm": 12.689046859741211, "learning_rate": 1.6037735849056604e-06, "loss": 1.4984, "num_input_tokens_seen": 363328, "step": 545 }, { "epoch": 0.3242924528301887, "grad_norm": 22.177303314208984, "learning_rate": 1.6185141509433964e-06, "loss": 1.3878, "num_input_tokens_seen": 366400, "step": 550 }, { "epoch": 0.3272405660377358, "grad_norm": 12.109625816345215, "learning_rate": 1.6332547169811322e-06, "loss": 1.065, "num_input_tokens_seen": 369216, "step": 555 }, { "epoch": 0.330188679245283, "grad_norm": 12.59526252746582, "learning_rate": 1.6479952830188682e-06, "loss": 0.8755, "num_input_tokens_seen": 374400, "step": 560 }, { "epoch": 0.3331367924528302, "grad_norm": 21.529996871948242, "learning_rate": 1.662735849056604e-06, "loss": 1.2639, "num_input_tokens_seen": 377248, "step": 565 }, { "epoch": 0.33608490566037735, "grad_norm": 18.110389709472656, "learning_rate": 1.67747641509434e-06, "loss": 1.0775, "num_input_tokens_seen": 380384, "step": 570 }, { "epoch": 0.33903301886792453, "grad_norm": 12.778611183166504, "learning_rate": 1.6922169811320757e-06, "loss": 1.3044, "num_input_tokens_seen": 383872, "step": 575 }, { "epoch": 0.3419811320754717, "grad_norm": 12.57426929473877, "learning_rate": 1.7069575471698112e-06, "loss": 1.1424, "num_input_tokens_seen": 386784, "step": 580 }, { "epoch": 0.3449292452830189, "grad_norm": 24.159107208251953, "learning_rate": 1.7216981132075472e-06, "loss": 1.051, "num_input_tokens_seen": 389984, "step": 585 }, { "epoch": 0.34787735849056606, "grad_norm": 19.060834884643555, "learning_rate": 1.736438679245283e-06, "loss": 1.311, "num_input_tokens_seen": 392544, "step": 590 }, { "epoch": 0.35082547169811323, "grad_norm": 26.77260971069336, "learning_rate": 1.751179245283019e-06, "loss": 1.6017, "num_input_tokens_seen": 394784, "step": 595 }, { "epoch": 0.35377358490566035, "grad_norm": 15.615082740783691, "learning_rate": 1.765919811320755e-06, "loss": 1.0358, "num_input_tokens_seen": 397024, "step": 600 }, { "epoch": 0.35672169811320753, "grad_norm": 16.0369930267334, "learning_rate": 1.7806603773584907e-06, "loss": 1.1311, "num_input_tokens_seen": 400768, "step": 605 }, { "epoch": 0.3596698113207547, "grad_norm": 13.076753616333008, "learning_rate": 1.7954009433962267e-06, "loss": 1.0454, "num_input_tokens_seen": 403616, "step": 610 }, { "epoch": 0.3626179245283019, "grad_norm": 16.56241798400879, "learning_rate": 1.8101415094339625e-06, "loss": 0.9561, "num_input_tokens_seen": 406560, "step": 615 }, { "epoch": 0.36556603773584906, "grad_norm": 13.586681365966797, "learning_rate": 1.8248820754716984e-06, "loss": 1.0149, "num_input_tokens_seen": 409504, "step": 620 }, { "epoch": 0.36851415094339623, "grad_norm": 10.52236557006836, "learning_rate": 1.839622641509434e-06, "loss": 0.9459, "num_input_tokens_seen": 412224, "step": 625 }, { "epoch": 0.3714622641509434, "grad_norm": 7.650209903717041, "learning_rate": 1.8543632075471698e-06, "loss": 0.8419, "num_input_tokens_seen": 414976, "step": 630 }, { "epoch": 0.3744103773584906, "grad_norm": 21.7857723236084, "learning_rate": 1.8691037735849057e-06, "loss": 1.0625, "num_input_tokens_seen": 418976, "step": 635 }, { "epoch": 0.37735849056603776, "grad_norm": 13.57638931274414, "learning_rate": 1.8838443396226417e-06, "loss": 0.9751, "num_input_tokens_seen": 422080, "step": 640 }, { "epoch": 0.3803066037735849, "grad_norm": 11.957155227661133, "learning_rate": 1.8985849056603775e-06, "loss": 0.5874, "num_input_tokens_seen": 424864, "step": 645 }, { "epoch": 0.38325471698113206, "grad_norm": 13.441313743591309, "learning_rate": 1.9133254716981133e-06, "loss": 1.014, "num_input_tokens_seen": 428352, "step": 650 }, { "epoch": 0.38620283018867924, "grad_norm": 11.791839599609375, "learning_rate": 1.9280660377358494e-06, "loss": 0.9594, "num_input_tokens_seen": 431072, "step": 655 }, { "epoch": 0.3891509433962264, "grad_norm": 9.357067108154297, "learning_rate": 1.9428066037735852e-06, "loss": 1.0233, "num_input_tokens_seen": 434880, "step": 660 }, { "epoch": 0.3920990566037736, "grad_norm": 10.246585845947266, "learning_rate": 1.957547169811321e-06, "loss": 1.0989, "num_input_tokens_seen": 438624, "step": 665 }, { "epoch": 0.39504716981132076, "grad_norm": 9.284098625183105, "learning_rate": 1.9722877358490568e-06, "loss": 0.7568, "num_input_tokens_seen": 441632, "step": 670 }, { "epoch": 0.39799528301886794, "grad_norm": 8.835054397583008, "learning_rate": 1.9870283018867925e-06, "loss": 0.8523, "num_input_tokens_seen": 444672, "step": 675 }, { "epoch": 0.4009433962264151, "grad_norm": 10.199869155883789, "learning_rate": 2.0017688679245283e-06, "loss": 0.7668, "num_input_tokens_seen": 447712, "step": 680 }, { "epoch": 0.40389150943396224, "grad_norm": 9.748979568481445, "learning_rate": 2.0165094339622645e-06, "loss": 0.7897, "num_input_tokens_seen": 451424, "step": 685 }, { "epoch": 0.4068396226415094, "grad_norm": 12.172663688659668, "learning_rate": 2.0312500000000002e-06, "loss": 0.9239, "num_input_tokens_seen": 454656, "step": 690 }, { "epoch": 0.4097877358490566, "grad_norm": 9.179244041442871, "learning_rate": 2.045990566037736e-06, "loss": 0.9547, "num_input_tokens_seen": 457760, "step": 695 }, { "epoch": 0.41273584905660377, "grad_norm": 9.277763366699219, "learning_rate": 2.0607311320754718e-06, "loss": 1.1594, "num_input_tokens_seen": 461568, "step": 700 }, { "epoch": 0.41568396226415094, "grad_norm": 7.869194984436035, "learning_rate": 2.075471698113208e-06, "loss": 0.7731, "num_input_tokens_seen": 464896, "step": 705 }, { "epoch": 0.4186320754716981, "grad_norm": 61.0346794128418, "learning_rate": 2.0902122641509437e-06, "loss": 0.8348, "num_input_tokens_seen": 467648, "step": 710 }, { "epoch": 0.4215801886792453, "grad_norm": 8.038278579711914, "learning_rate": 2.1049528301886795e-06, "loss": 0.7873, "num_input_tokens_seen": 469888, "step": 715 }, { "epoch": 0.42452830188679247, "grad_norm": 5.806300163269043, "learning_rate": 2.1196933962264153e-06, "loss": 0.7727, "num_input_tokens_seen": 472480, "step": 720 }, { "epoch": 0.42747641509433965, "grad_norm": 24.881826400756836, "learning_rate": 2.134433962264151e-06, "loss": 0.8489, "num_input_tokens_seen": 476736, "step": 725 }, { "epoch": 0.43042452830188677, "grad_norm": 14.31634521484375, "learning_rate": 2.149174528301887e-06, "loss": 0.8342, "num_input_tokens_seen": 480640, "step": 730 }, { "epoch": 0.43337264150943394, "grad_norm": 17.595922470092773, "learning_rate": 2.163915094339623e-06, "loss": 0.7452, "num_input_tokens_seen": 483936, "step": 735 }, { "epoch": 0.4363207547169811, "grad_norm": 8.841264724731445, "learning_rate": 2.1786556603773588e-06, "loss": 0.9013, "num_input_tokens_seen": 488192, "step": 740 }, { "epoch": 0.4392688679245283, "grad_norm": 9.401267051696777, "learning_rate": 2.1933962264150945e-06, "loss": 0.8042, "num_input_tokens_seen": 491104, "step": 745 }, { "epoch": 0.44221698113207547, "grad_norm": 9.340909957885742, "learning_rate": 2.2081367924528303e-06, "loss": 0.7833, "num_input_tokens_seen": 494144, "step": 750 }, { "epoch": 0.44516509433962265, "grad_norm": 15.403031349182129, "learning_rate": 2.2228773584905665e-06, "loss": 0.7735, "num_input_tokens_seen": 496704, "step": 755 }, { "epoch": 0.4481132075471698, "grad_norm": 17.41267204284668, "learning_rate": 2.237617924528302e-06, "loss": 0.7322, "num_input_tokens_seen": 500416, "step": 760 }, { "epoch": 0.451061320754717, "grad_norm": 6.003551006317139, "learning_rate": 2.252358490566038e-06, "loss": 0.6981, "num_input_tokens_seen": 503712, "step": 765 }, { "epoch": 0.4540094339622642, "grad_norm": 5.3718085289001465, "learning_rate": 2.267099056603774e-06, "loss": 0.794, "num_input_tokens_seen": 507424, "step": 770 }, { "epoch": 0.4569575471698113, "grad_norm": 8.144166946411133, "learning_rate": 2.2818396226415096e-06, "loss": 0.6019, "num_input_tokens_seen": 511168, "step": 775 }, { "epoch": 0.45990566037735847, "grad_norm": 6.1777777671813965, "learning_rate": 2.2965801886792453e-06, "loss": 0.829, "num_input_tokens_seen": 516480, "step": 780 }, { "epoch": 0.46285377358490565, "grad_norm": 5.932247161865234, "learning_rate": 2.3113207547169815e-06, "loss": 0.6215, "num_input_tokens_seen": 519488, "step": 785 }, { "epoch": 0.4658018867924528, "grad_norm": 5.8605804443359375, "learning_rate": 2.3260613207547173e-06, "loss": 0.6425, "num_input_tokens_seen": 522464, "step": 790 }, { "epoch": 0.46875, "grad_norm": 7.752542495727539, "learning_rate": 2.340801886792453e-06, "loss": 0.5934, "num_input_tokens_seen": 525696, "step": 795 }, { "epoch": 0.4716981132075472, "grad_norm": 13.058610916137695, "learning_rate": 2.355542452830189e-06, "loss": 0.7002, "num_input_tokens_seen": 529472, "step": 800 }, { "epoch": 0.47464622641509435, "grad_norm": 8.909782409667969, "learning_rate": 2.3702830188679246e-06, "loss": 0.7548, "num_input_tokens_seen": 531840, "step": 805 }, { "epoch": 0.47759433962264153, "grad_norm": 4.06205940246582, "learning_rate": 2.3850235849056604e-06, "loss": 0.5393, "num_input_tokens_seen": 534784, "step": 810 }, { "epoch": 0.4805424528301887, "grad_norm": 24.176931381225586, "learning_rate": 2.3997641509433966e-06, "loss": 0.659, "num_input_tokens_seen": 537824, "step": 815 }, { "epoch": 0.4834905660377358, "grad_norm": 9.467673301696777, "learning_rate": 2.4145047169811323e-06, "loss": 0.8124, "num_input_tokens_seen": 540864, "step": 820 }, { "epoch": 0.486438679245283, "grad_norm": 5.78130578994751, "learning_rate": 2.429245283018868e-06, "loss": 0.7031, "num_input_tokens_seen": 543296, "step": 825 }, { "epoch": 0.4893867924528302, "grad_norm": 5.5926408767700195, "learning_rate": 2.443985849056604e-06, "loss": 0.6525, "num_input_tokens_seen": 546496, "step": 830 }, { "epoch": 0.49233490566037735, "grad_norm": 5.509466648101807, "learning_rate": 2.45872641509434e-06, "loss": 0.5902, "num_input_tokens_seen": 549280, "step": 835 }, { "epoch": 0.49528301886792453, "grad_norm": 8.17666244506836, "learning_rate": 2.473466981132076e-06, "loss": 0.7358, "num_input_tokens_seen": 552288, "step": 840 }, { "epoch": 0.4982311320754717, "grad_norm": 10.840197563171387, "learning_rate": 2.4882075471698116e-06, "loss": 0.6275, "num_input_tokens_seen": 554912, "step": 845 }, { "epoch": 0.5011792452830188, "grad_norm": 10.688712120056152, "learning_rate": 2.5029481132075474e-06, "loss": 0.5852, "num_input_tokens_seen": 559264, "step": 850 }, { "epoch": 0.504127358490566, "grad_norm": 8.286870002746582, "learning_rate": 2.517688679245283e-06, "loss": 0.7165, "num_input_tokens_seen": 562688, "step": 855 }, { "epoch": 0.5070754716981132, "grad_norm": 8.09310245513916, "learning_rate": 2.532429245283019e-06, "loss": 0.5305, "num_input_tokens_seen": 565856, "step": 860 }, { "epoch": 0.5100235849056604, "grad_norm": 2.4835100173950195, "learning_rate": 2.547169811320755e-06, "loss": 0.6202, "num_input_tokens_seen": 570080, "step": 865 }, { "epoch": 0.5129716981132075, "grad_norm": 9.383463859558105, "learning_rate": 2.561910377358491e-06, "loss": 0.6608, "num_input_tokens_seen": 573344, "step": 870 }, { "epoch": 0.5159198113207547, "grad_norm": 5.455860137939453, "learning_rate": 2.5766509433962266e-06, "loss": 0.6334, "num_input_tokens_seen": 575808, "step": 875 }, { "epoch": 0.5188679245283019, "grad_norm": 30.33428382873535, "learning_rate": 2.5913915094339624e-06, "loss": 0.6556, "num_input_tokens_seen": 579552, "step": 880 }, { "epoch": 0.5218160377358491, "grad_norm": 2.819800853729248, "learning_rate": 2.6061320754716986e-06, "loss": 0.6354, "num_input_tokens_seen": 583232, "step": 885 }, { "epoch": 0.5247641509433962, "grad_norm": 5.631868839263916, "learning_rate": 2.6208726415094343e-06, "loss": 0.5886, "num_input_tokens_seen": 586496, "step": 890 }, { "epoch": 0.5277122641509434, "grad_norm": 3.033212184906006, "learning_rate": 2.63561320754717e-06, "loss": 0.6205, "num_input_tokens_seen": 590144, "step": 895 }, { "epoch": 0.5306603773584906, "grad_norm": 20.86394500732422, "learning_rate": 2.650353773584906e-06, "loss": 0.6408, "num_input_tokens_seen": 594432, "step": 900 }, { "epoch": 0.5336084905660378, "grad_norm": 7.1308512687683105, "learning_rate": 2.665094339622642e-06, "loss": 0.6731, "num_input_tokens_seen": 598976, "step": 905 }, { "epoch": 0.5365566037735849, "grad_norm": 7.186617374420166, "learning_rate": 2.679834905660378e-06, "loss": 0.6789, "num_input_tokens_seen": 602464, "step": 910 }, { "epoch": 0.5395047169811321, "grad_norm": 4.175388813018799, "learning_rate": 2.694575471698113e-06, "loss": 0.6237, "num_input_tokens_seen": 604832, "step": 915 }, { "epoch": 0.5424528301886793, "grad_norm": 5.964530944824219, "learning_rate": 2.709316037735849e-06, "loss": 0.5631, "num_input_tokens_seen": 607904, "step": 920 }, { "epoch": 0.5454009433962265, "grad_norm": 10.120810508728027, "learning_rate": 2.724056603773585e-06, "loss": 0.7237, "num_input_tokens_seen": 611264, "step": 925 }, { "epoch": 0.5483490566037735, "grad_norm": 17.5157470703125, "learning_rate": 2.738797169811321e-06, "loss": 0.7266, "num_input_tokens_seen": 614336, "step": 930 }, { "epoch": 0.5512971698113207, "grad_norm": 2.9105162620544434, "learning_rate": 2.7535377358490567e-06, "loss": 1.0816, "num_input_tokens_seen": 616960, "step": 935 }, { "epoch": 0.5542452830188679, "grad_norm": 5.336266040802002, "learning_rate": 2.7682783018867925e-06, "loss": 0.9082, "num_input_tokens_seen": 620512, "step": 940 }, { "epoch": 0.5571933962264151, "grad_norm": 5.177180290222168, "learning_rate": 2.7830188679245286e-06, "loss": 0.8995, "num_input_tokens_seen": 623584, "step": 945 }, { "epoch": 0.5601415094339622, "grad_norm": 10.397789001464844, "learning_rate": 2.7977594339622644e-06, "loss": 0.697, "num_input_tokens_seen": 626080, "step": 950 }, { "epoch": 0.5630896226415094, "grad_norm": 5.686207294464111, "learning_rate": 2.8125e-06, "loss": 0.7366, "num_input_tokens_seen": 628864, "step": 955 }, { "epoch": 0.5660377358490566, "grad_norm": 13.82198715209961, "learning_rate": 2.827240566037736e-06, "loss": 0.541, "num_input_tokens_seen": 632064, "step": 960 }, { "epoch": 0.5689858490566038, "grad_norm": 3.5186896324157715, "learning_rate": 2.841981132075472e-06, "loss": 0.8597, "num_input_tokens_seen": 635904, "step": 965 }, { "epoch": 0.5719339622641509, "grad_norm": 7.5879292488098145, "learning_rate": 2.856721698113208e-06, "loss": 0.8266, "num_input_tokens_seen": 638528, "step": 970 }, { "epoch": 0.5748820754716981, "grad_norm": 2.935241937637329, "learning_rate": 2.8714622641509437e-06, "loss": 0.3579, "num_input_tokens_seen": 641088, "step": 975 }, { "epoch": 0.5778301886792453, "grad_norm": 5.042152404785156, "learning_rate": 2.8862028301886794e-06, "loss": 0.5708, "num_input_tokens_seen": 645056, "step": 980 }, { "epoch": 0.5807783018867925, "grad_norm": 8.911943435668945, "learning_rate": 2.9009433962264156e-06, "loss": 0.7935, "num_input_tokens_seen": 649088, "step": 985 }, { "epoch": 0.5837264150943396, "grad_norm": 3.710848808288574, "learning_rate": 2.9156839622641514e-06, "loss": 0.4963, "num_input_tokens_seen": 651904, "step": 990 }, { "epoch": 0.5866745283018868, "grad_norm": 2.808746099472046, "learning_rate": 2.930424528301887e-06, "loss": 0.5753, "num_input_tokens_seen": 656000, "step": 995 }, { "epoch": 0.589622641509434, "grad_norm": 3.8598711490631104, "learning_rate": 2.9451650943396225e-06, "loss": 0.5815, "num_input_tokens_seen": 658688, "step": 1000 }, { "epoch": 0.5925707547169812, "grad_norm": 11.66132640838623, "learning_rate": 2.9599056603773587e-06, "loss": 0.7194, "num_input_tokens_seen": 662144, "step": 1005 }, { "epoch": 0.5955188679245284, "grad_norm": 4.273825645446777, "learning_rate": 2.9746462264150945e-06, "loss": 0.5695, "num_input_tokens_seen": 665600, "step": 1010 }, { "epoch": 0.5984669811320755, "grad_norm": 7.704720497131348, "learning_rate": 2.9893867924528302e-06, "loss": 0.668, "num_input_tokens_seen": 668960, "step": 1015 }, { "epoch": 0.6014150943396226, "grad_norm": 4.512171745300293, "learning_rate": 3.004127358490566e-06, "loss": 0.7512, "num_input_tokens_seen": 671872, "step": 1020 }, { "epoch": 0.6043632075471698, "grad_norm": 4.809904098510742, "learning_rate": 3.018867924528302e-06, "loss": 0.5939, "num_input_tokens_seen": 674944, "step": 1025 }, { "epoch": 0.6073113207547169, "grad_norm": 9.664993286132812, "learning_rate": 3.033608490566038e-06, "loss": 0.6629, "num_input_tokens_seen": 677600, "step": 1030 }, { "epoch": 0.6102594339622641, "grad_norm": 3.8864848613739014, "learning_rate": 3.0483490566037737e-06, "loss": 0.6016, "num_input_tokens_seen": 681664, "step": 1035 }, { "epoch": 0.6132075471698113, "grad_norm": 7.045229434967041, "learning_rate": 3.0630896226415095e-06, "loss": 0.677, "num_input_tokens_seen": 684448, "step": 1040 }, { "epoch": 0.6161556603773585, "grad_norm": 4.36606502532959, "learning_rate": 3.0778301886792457e-06, "loss": 0.7471, "num_input_tokens_seen": 687872, "step": 1045 }, { "epoch": 0.6191037735849056, "grad_norm": 2.931795358657837, "learning_rate": 3.0925707547169815e-06, "loss": 0.5413, "num_input_tokens_seen": 691744, "step": 1050 }, { "epoch": 0.6220518867924528, "grad_norm": 3.3371224403381348, "learning_rate": 3.1073113207547172e-06, "loss": 0.6149, "num_input_tokens_seen": 695072, "step": 1055 }, { "epoch": 0.625, "grad_norm": 4.360918998718262, "learning_rate": 3.122051886792453e-06, "loss": 0.5662, "num_input_tokens_seen": 698144, "step": 1060 }, { "epoch": 0.6279481132075472, "grad_norm": 7.037140846252441, "learning_rate": 3.136792452830189e-06, "loss": 0.7171, "num_input_tokens_seen": 701440, "step": 1065 }, { "epoch": 0.6308962264150944, "grad_norm": 9.752073287963867, "learning_rate": 3.151533018867925e-06, "loss": 0.4936, "num_input_tokens_seen": 704448, "step": 1070 }, { "epoch": 0.6338443396226415, "grad_norm": 4.396730422973633, "learning_rate": 3.1662735849056607e-06, "loss": 0.6647, "num_input_tokens_seen": 708128, "step": 1075 }, { "epoch": 0.6367924528301887, "grad_norm": 5.527029514312744, "learning_rate": 3.181014150943397e-06, "loss": 0.7612, "num_input_tokens_seen": 711840, "step": 1080 }, { "epoch": 0.6397405660377359, "grad_norm": 5.059622764587402, "learning_rate": 3.1957547169811327e-06, "loss": 0.6341, "num_input_tokens_seen": 713952, "step": 1085 }, { "epoch": 0.6426886792452831, "grad_norm": 3.5978844165802, "learning_rate": 3.210495283018868e-06, "loss": 0.5817, "num_input_tokens_seen": 718272, "step": 1090 }, { "epoch": 0.6456367924528302, "grad_norm": 6.026699542999268, "learning_rate": 3.225235849056604e-06, "loss": 0.6333, "num_input_tokens_seen": 721280, "step": 1095 }, { "epoch": 0.6485849056603774, "grad_norm": 5.304384708404541, "learning_rate": 3.2399764150943396e-06, "loss": 0.5597, "num_input_tokens_seen": 723904, "step": 1100 }, { "epoch": 0.6515330188679245, "grad_norm": 3.9573209285736084, "learning_rate": 3.2547169811320758e-06, "loss": 0.5991, "num_input_tokens_seen": 727968, "step": 1105 }, { "epoch": 0.6544811320754716, "grad_norm": 5.994421005249023, "learning_rate": 3.2694575471698115e-06, "loss": 0.5628, "num_input_tokens_seen": 732000, "step": 1110 }, { "epoch": 0.6574292452830188, "grad_norm": 5.7126665115356445, "learning_rate": 3.2841981132075473e-06, "loss": 0.5879, "num_input_tokens_seen": 734848, "step": 1115 }, { "epoch": 0.660377358490566, "grad_norm": 5.206673622131348, "learning_rate": 3.298938679245283e-06, "loss": 0.8727, "num_input_tokens_seen": 738976, "step": 1120 }, { "epoch": 0.6633254716981132, "grad_norm": 7.565757751464844, "learning_rate": 3.3136792452830192e-06, "loss": 0.5133, "num_input_tokens_seen": 741600, "step": 1125 }, { "epoch": 0.6662735849056604, "grad_norm": 3.4068069458007812, "learning_rate": 3.328419811320755e-06, "loss": 0.6511, "num_input_tokens_seen": 744160, "step": 1130 }, { "epoch": 0.6692216981132075, "grad_norm": 4.800451755523682, "learning_rate": 3.3431603773584908e-06, "loss": 0.6371, "num_input_tokens_seen": 747296, "step": 1135 }, { "epoch": 0.6721698113207547, "grad_norm": 3.9116437435150146, "learning_rate": 3.3579009433962266e-06, "loss": 0.577, "num_input_tokens_seen": 750752, "step": 1140 }, { "epoch": 0.6751179245283019, "grad_norm": 5.729433536529541, "learning_rate": 3.3726415094339627e-06, "loss": 0.5607, "num_input_tokens_seen": 753760, "step": 1145 }, { "epoch": 0.6780660377358491, "grad_norm": 5.800758361816406, "learning_rate": 3.3873820754716985e-06, "loss": 0.7251, "num_input_tokens_seen": 756512, "step": 1150 }, { "epoch": 0.6810141509433962, "grad_norm": 7.973551273345947, "learning_rate": 3.4021226415094343e-06, "loss": 0.5444, "num_input_tokens_seen": 758912, "step": 1155 }, { "epoch": 0.6839622641509434, "grad_norm": 2.8155453205108643, "learning_rate": 3.4168632075471705e-06, "loss": 0.5289, "num_input_tokens_seen": 762368, "step": 1160 }, { "epoch": 0.6869103773584906, "grad_norm": 9.483343124389648, "learning_rate": 3.4316037735849062e-06, "loss": 0.6568, "num_input_tokens_seen": 764544, "step": 1165 }, { "epoch": 0.6898584905660378, "grad_norm": 6.933462619781494, "learning_rate": 3.446344339622642e-06, "loss": 0.5403, "num_input_tokens_seen": 767328, "step": 1170 }, { "epoch": 0.6928066037735849, "grad_norm": 3.9723830223083496, "learning_rate": 3.4610849056603778e-06, "loss": 0.5028, "num_input_tokens_seen": 770496, "step": 1175 }, { "epoch": 0.6957547169811321, "grad_norm": 3.0466511249542236, "learning_rate": 3.475825471698113e-06, "loss": 0.6033, "num_input_tokens_seen": 775136, "step": 1180 }, { "epoch": 0.6987028301886793, "grad_norm": 16.413108825683594, "learning_rate": 3.4905660377358493e-06, "loss": 0.6507, "num_input_tokens_seen": 778944, "step": 1185 }, { "epoch": 0.7016509433962265, "grad_norm": 6.43879508972168, "learning_rate": 3.505306603773585e-06, "loss": 0.4441, "num_input_tokens_seen": 781536, "step": 1190 }, { "epoch": 0.7045990566037735, "grad_norm": 7.547950267791748, "learning_rate": 3.520047169811321e-06, "loss": 0.8131, "num_input_tokens_seen": 784352, "step": 1195 }, { "epoch": 0.7075471698113207, "grad_norm": 4.815586090087891, "learning_rate": 3.5347877358490566e-06, "loss": 0.5998, "num_input_tokens_seen": 788224, "step": 1200 }, { "epoch": 0.7104952830188679, "grad_norm": 5.432223320007324, "learning_rate": 3.549528301886793e-06, "loss": 0.7764, "num_input_tokens_seen": 791424, "step": 1205 }, { "epoch": 0.7134433962264151, "grad_norm": 3.8161978721618652, "learning_rate": 3.5642688679245286e-06, "loss": 0.5999, "num_input_tokens_seen": 794656, "step": 1210 }, { "epoch": 0.7163915094339622, "grad_norm": 8.441664695739746, "learning_rate": 3.5790094339622643e-06, "loss": 0.5389, "num_input_tokens_seen": 797984, "step": 1215 }, { "epoch": 0.7193396226415094, "grad_norm": 15.016979217529297, "learning_rate": 3.59375e-06, "loss": 0.5855, "num_input_tokens_seen": 802112, "step": 1220 }, { "epoch": 0.7222877358490566, "grad_norm": 5.264836311340332, "learning_rate": 3.6084905660377363e-06, "loss": 0.7197, "num_input_tokens_seen": 805184, "step": 1225 }, { "epoch": 0.7252358490566038, "grad_norm": 4.0827250480651855, "learning_rate": 3.623231132075472e-06, "loss": 0.6156, "num_input_tokens_seen": 808288, "step": 1230 }, { "epoch": 0.7281839622641509, "grad_norm": 7.188789367675781, "learning_rate": 3.637971698113208e-06, "loss": 0.7839, "num_input_tokens_seen": 811840, "step": 1235 }, { "epoch": 0.7311320754716981, "grad_norm": 3.9460668563842773, "learning_rate": 3.652712264150944e-06, "loss": 0.6031, "num_input_tokens_seen": 814816, "step": 1240 }, { "epoch": 0.7340801886792453, "grad_norm": 3.3640682697296143, "learning_rate": 3.66745283018868e-06, "loss": 0.5941, "num_input_tokens_seen": 818784, "step": 1245 }, { "epoch": 0.7370283018867925, "grad_norm": 9.034252166748047, "learning_rate": 3.6821933962264156e-06, "loss": 0.5696, "num_input_tokens_seen": 821472, "step": 1250 }, { "epoch": 0.7399764150943396, "grad_norm": 3.7004358768463135, "learning_rate": 3.6969339622641513e-06, "loss": 0.5555, "num_input_tokens_seen": 828384, "step": 1255 }, { "epoch": 0.7429245283018868, "grad_norm": 3.9819512367248535, "learning_rate": 3.7116745283018875e-06, "loss": 0.5052, "num_input_tokens_seen": 832416, "step": 1260 }, { "epoch": 0.745872641509434, "grad_norm": 3.6677780151367188, "learning_rate": 3.726415094339623e-06, "loss": 0.5224, "num_input_tokens_seen": 835744, "step": 1265 }, { "epoch": 0.7488207547169812, "grad_norm": 7.2073516845703125, "learning_rate": 3.7411556603773586e-06, "loss": 0.7468, "num_input_tokens_seen": 838816, "step": 1270 }, { "epoch": 0.7517688679245284, "grad_norm": 6.950973033905029, "learning_rate": 3.7558962264150944e-06, "loss": 0.7075, "num_input_tokens_seen": 843232, "step": 1275 }, { "epoch": 0.7547169811320755, "grad_norm": 4.660774230957031, "learning_rate": 3.77063679245283e-06, "loss": 0.5449, "num_input_tokens_seen": 845856, "step": 1280 }, { "epoch": 0.7576650943396226, "grad_norm": 5.415475368499756, "learning_rate": 3.7853773584905664e-06, "loss": 0.5481, "num_input_tokens_seen": 849440, "step": 1285 }, { "epoch": 0.7606132075471698, "grad_norm": 3.3951733112335205, "learning_rate": 3.800117924528302e-06, "loss": 0.4872, "num_input_tokens_seen": 852768, "step": 1290 }, { "epoch": 0.7635613207547169, "grad_norm": 5.599150657653809, "learning_rate": 3.814858490566038e-06, "loss": 0.549, "num_input_tokens_seen": 856256, "step": 1295 }, { "epoch": 0.7665094339622641, "grad_norm": 3.983771562576294, "learning_rate": 3.829599056603774e-06, "loss": 0.5188, "num_input_tokens_seen": 860096, "step": 1300 }, { "epoch": 0.7694575471698113, "grad_norm": 4.226012229919434, "learning_rate": 3.8443396226415094e-06, "loss": 0.6904, "num_input_tokens_seen": 863520, "step": 1305 }, { "epoch": 0.7724056603773585, "grad_norm": 5.358924865722656, "learning_rate": 3.859080188679246e-06, "loss": 0.5564, "num_input_tokens_seen": 866784, "step": 1310 }, { "epoch": 0.7753537735849056, "grad_norm": 4.4242706298828125, "learning_rate": 3.873820754716982e-06, "loss": 0.5411, "num_input_tokens_seen": 870208, "step": 1315 }, { "epoch": 0.7783018867924528, "grad_norm": 7.503829479217529, "learning_rate": 3.888561320754717e-06, "loss": 0.5672, "num_input_tokens_seen": 874112, "step": 1320 }, { "epoch": 0.78125, "grad_norm": 7.712329864501953, "learning_rate": 3.903301886792453e-06, "loss": 0.5843, "num_input_tokens_seen": 877728, "step": 1325 }, { "epoch": 0.7841981132075472, "grad_norm": 3.9674909114837646, "learning_rate": 3.9180424528301895e-06, "loss": 0.5098, "num_input_tokens_seen": 882048, "step": 1330 }, { "epoch": 0.7871462264150944, "grad_norm": 3.0964205265045166, "learning_rate": 3.932783018867925e-06, "loss": 0.6791, "num_input_tokens_seen": 886400, "step": 1335 }, { "epoch": 0.7900943396226415, "grad_norm": 6.021134853363037, "learning_rate": 3.947523584905661e-06, "loss": 0.4979, "num_input_tokens_seen": 889152, "step": 1340 }, { "epoch": 0.7930424528301887, "grad_norm": 5.005767345428467, "learning_rate": 3.962264150943396e-06, "loss": 0.5016, "num_input_tokens_seen": 891936, "step": 1345 }, { "epoch": 0.7959905660377359, "grad_norm": 5.763684272766113, "learning_rate": 3.977004716981133e-06, "loss": 0.6329, "num_input_tokens_seen": 894464, "step": 1350 }, { "epoch": 0.7989386792452831, "grad_norm": 3.277376651763916, "learning_rate": 3.991745283018868e-06, "loss": 0.4907, "num_input_tokens_seen": 897600, "step": 1355 }, { "epoch": 0.8018867924528302, "grad_norm": 4.561283588409424, "learning_rate": 4.006485849056604e-06, "loss": 0.5093, "num_input_tokens_seen": 900864, "step": 1360 }, { "epoch": 0.8048349056603774, "grad_norm": 3.305145263671875, "learning_rate": 4.0212264150943395e-06, "loss": 0.5818, "num_input_tokens_seen": 903808, "step": 1365 }, { "epoch": 0.8077830188679245, "grad_norm": 11.04648494720459, "learning_rate": 4.035966981132076e-06, "loss": 0.6685, "num_input_tokens_seen": 906176, "step": 1370 }, { "epoch": 0.8107311320754716, "grad_norm": 10.513497352600098, "learning_rate": 4.050707547169812e-06, "loss": 0.6661, "num_input_tokens_seen": 909152, "step": 1375 }, { "epoch": 0.8136792452830188, "grad_norm": 2.893483877182007, "learning_rate": 4.065448113207547e-06, "loss": 0.6104, "num_input_tokens_seen": 911680, "step": 1380 }, { "epoch": 0.816627358490566, "grad_norm": 3.4582347869873047, "learning_rate": 4.080188679245283e-06, "loss": 0.4818, "num_input_tokens_seen": 916128, "step": 1385 }, { "epoch": 0.8195754716981132, "grad_norm": 5.020617485046387, "learning_rate": 4.094929245283019e-06, "loss": 0.6485, "num_input_tokens_seen": 918880, "step": 1390 }, { "epoch": 0.8225235849056604, "grad_norm": 4.992088317871094, "learning_rate": 4.109669811320755e-06, "loss": 0.5349, "num_input_tokens_seen": 921216, "step": 1395 }, { "epoch": 0.8254716981132075, "grad_norm": 2.666390895843506, "learning_rate": 4.124410377358491e-06, "loss": 0.6605, "num_input_tokens_seen": 923776, "step": 1400 }, { "epoch": 0.8284198113207547, "grad_norm": 4.313969612121582, "learning_rate": 4.1391509433962265e-06, "loss": 0.6443, "num_input_tokens_seen": 926336, "step": 1405 }, { "epoch": 0.8313679245283019, "grad_norm": 2.7690999507904053, "learning_rate": 4.153891509433963e-06, "loss": 0.5567, "num_input_tokens_seen": 929824, "step": 1410 }, { "epoch": 0.8343160377358491, "grad_norm": 7.014973163604736, "learning_rate": 4.168632075471699e-06, "loss": 0.5194, "num_input_tokens_seen": 932800, "step": 1415 }, { "epoch": 0.8372641509433962, "grad_norm": 25.9439754486084, "learning_rate": 4.183372641509434e-06, "loss": 0.5714, "num_input_tokens_seen": 936544, "step": 1420 }, { "epoch": 0.8402122641509434, "grad_norm": 6.638407230377197, "learning_rate": 4.19811320754717e-06, "loss": 0.6441, "num_input_tokens_seen": 940160, "step": 1425 }, { "epoch": 0.8431603773584906, "grad_norm": 2.5558292865753174, "learning_rate": 4.212853773584907e-06, "loss": 0.4841, "num_input_tokens_seen": 943840, "step": 1430 }, { "epoch": 0.8461084905660378, "grad_norm": 9.15099811553955, "learning_rate": 4.227594339622642e-06, "loss": 0.5715, "num_input_tokens_seen": 946304, "step": 1435 }, { "epoch": 0.8490566037735849, "grad_norm": 3.2901201248168945, "learning_rate": 4.242334905660378e-06, "loss": 0.6108, "num_input_tokens_seen": 949184, "step": 1440 }, { "epoch": 0.8520047169811321, "grad_norm": 35.17498779296875, "learning_rate": 4.2570754716981135e-06, "loss": 0.5685, "num_input_tokens_seen": 953216, "step": 1445 }, { "epoch": 0.8549528301886793, "grad_norm": 9.862936019897461, "learning_rate": 4.271816037735849e-06, "loss": 0.5629, "num_input_tokens_seen": 956224, "step": 1450 }, { "epoch": 0.8579009433962265, "grad_norm": 4.42941427230835, "learning_rate": 4.286556603773585e-06, "loss": 0.5377, "num_input_tokens_seen": 959136, "step": 1455 }, { "epoch": 0.8608490566037735, "grad_norm": 2.1845648288726807, "learning_rate": 4.301297169811321e-06, "loss": 0.4088, "num_input_tokens_seen": 962656, "step": 1460 }, { "epoch": 0.8637971698113207, "grad_norm": 8.4141263961792, "learning_rate": 4.3160377358490565e-06, "loss": 0.5489, "num_input_tokens_seen": 965792, "step": 1465 }, { "epoch": 0.8667452830188679, "grad_norm": 16.000232696533203, "learning_rate": 4.330778301886793e-06, "loss": 0.6472, "num_input_tokens_seen": 968384, "step": 1470 }, { "epoch": 0.8696933962264151, "grad_norm": 4.5645527839660645, "learning_rate": 4.345518867924529e-06, "loss": 0.593, "num_input_tokens_seen": 971008, "step": 1475 }, { "epoch": 0.8726415094339622, "grad_norm": 3.2946958541870117, "learning_rate": 4.360259433962264e-06, "loss": 0.6604, "num_input_tokens_seen": 974368, "step": 1480 }, { "epoch": 0.8755896226415094, "grad_norm": 3.351793050765991, "learning_rate": 4.3750000000000005e-06, "loss": 0.5269, "num_input_tokens_seen": 977184, "step": 1485 }, { "epoch": 0.8785377358490566, "grad_norm": 11.141051292419434, "learning_rate": 4.389740566037737e-06, "loss": 0.6957, "num_input_tokens_seen": 980352, "step": 1490 }, { "epoch": 0.8814858490566038, "grad_norm": 2.412328004837036, "learning_rate": 4.404481132075472e-06, "loss": 0.6435, "num_input_tokens_seen": 984960, "step": 1495 }, { "epoch": 0.8844339622641509, "grad_norm": 2.933180093765259, "learning_rate": 4.419221698113208e-06, "loss": 0.5184, "num_input_tokens_seen": 989184, "step": 1500 }, { "epoch": 0.8873820754716981, "grad_norm": 40.39529800415039, "learning_rate": 4.4339622641509435e-06, "loss": 0.5649, "num_input_tokens_seen": 992224, "step": 1505 }, { "epoch": 0.8903301886792453, "grad_norm": 4.4944167137146, "learning_rate": 4.44870283018868e-06, "loss": 0.5511, "num_input_tokens_seen": 996672, "step": 1510 }, { "epoch": 0.8932783018867925, "grad_norm": 2.537010908126831, "learning_rate": 4.463443396226416e-06, "loss": 0.4889, "num_input_tokens_seen": 1000448, "step": 1515 }, { "epoch": 0.8962264150943396, "grad_norm": 3.1248507499694824, "learning_rate": 4.478183962264151e-06, "loss": 0.5878, "num_input_tokens_seen": 1004512, "step": 1520 }, { "epoch": 0.8991745283018868, "grad_norm": 6.637165546417236, "learning_rate": 4.4929245283018875e-06, "loss": 0.5581, "num_input_tokens_seen": 1008032, "step": 1525 }, { "epoch": 0.902122641509434, "grad_norm": 2.397888422012329, "learning_rate": 4.507665094339623e-06, "loss": 0.4896, "num_input_tokens_seen": 1011360, "step": 1530 }, { "epoch": 0.9050707547169812, "grad_norm": 5.424832820892334, "learning_rate": 4.522405660377359e-06, "loss": 0.6629, "num_input_tokens_seen": 1014688, "step": 1535 }, { "epoch": 0.9080188679245284, "grad_norm": 4.193342685699463, "learning_rate": 4.537146226415094e-06, "loss": 0.6012, "num_input_tokens_seen": 1017984, "step": 1540 }, { "epoch": 0.9109669811320755, "grad_norm": 6.288898468017578, "learning_rate": 4.5518867924528305e-06, "loss": 0.5833, "num_input_tokens_seen": 1020928, "step": 1545 }, { "epoch": 0.9139150943396226, "grad_norm": 3.6982738971710205, "learning_rate": 4.566627358490566e-06, "loss": 0.4773, "num_input_tokens_seen": 1023872, "step": 1550 }, { "epoch": 0.9168632075471698, "grad_norm": 3.738917112350464, "learning_rate": 4.581367924528302e-06, "loss": 0.4867, "num_input_tokens_seen": 1026432, "step": 1555 }, { "epoch": 0.9198113207547169, "grad_norm": 6.977383613586426, "learning_rate": 4.596108490566038e-06, "loss": 0.6858, "num_input_tokens_seen": 1029440, "step": 1560 }, { "epoch": 0.9227594339622641, "grad_norm": 7.859252452850342, "learning_rate": 4.610849056603774e-06, "loss": 0.5502, "num_input_tokens_seen": 1031712, "step": 1565 }, { "epoch": 0.9257075471698113, "grad_norm": 3.975205421447754, "learning_rate": 4.62558962264151e-06, "loss": 0.5964, "num_input_tokens_seen": 1035104, "step": 1570 }, { "epoch": 0.9286556603773585, "grad_norm": 3.459134578704834, "learning_rate": 4.640330188679246e-06, "loss": 0.4874, "num_input_tokens_seen": 1037728, "step": 1575 }, { "epoch": 0.9316037735849056, "grad_norm": 3.05485200881958, "learning_rate": 4.655070754716981e-06, "loss": 0.4169, "num_input_tokens_seen": 1040768, "step": 1580 }, { "epoch": 0.9345518867924528, "grad_norm": 3.625337600708008, "learning_rate": 4.6698113207547175e-06, "loss": 0.8154, "num_input_tokens_seen": 1043520, "step": 1585 }, { "epoch": 0.9375, "grad_norm": 6.160689830780029, "learning_rate": 4.684551886792454e-06, "loss": 0.5269, "num_input_tokens_seen": 1046656, "step": 1590 }, { "epoch": 0.9404481132075472, "grad_norm": 7.351727485656738, "learning_rate": 4.699292452830189e-06, "loss": 0.5194, "num_input_tokens_seen": 1052480, "step": 1595 }, { "epoch": 0.9433962264150944, "grad_norm": 3.5215978622436523, "learning_rate": 4.714033018867925e-06, "loss": 0.4395, "num_input_tokens_seen": 1055232, "step": 1600 }, { "epoch": 0.9463443396226415, "grad_norm": 6.998531818389893, "learning_rate": 4.728773584905661e-06, "loss": 0.5247, "num_input_tokens_seen": 1059136, "step": 1605 }, { "epoch": 0.9492924528301887, "grad_norm": 3.434100389480591, "learning_rate": 4.743514150943397e-06, "loss": 0.4669, "num_input_tokens_seen": 1062336, "step": 1610 }, { "epoch": 0.9522405660377359, "grad_norm": 2.658396005630493, "learning_rate": 4.758254716981133e-06, "loss": 0.3875, "num_input_tokens_seen": 1066336, "step": 1615 }, { "epoch": 0.9551886792452831, "grad_norm": 6.252284049987793, "learning_rate": 4.772995283018868e-06, "loss": 0.7762, "num_input_tokens_seen": 1069664, "step": 1620 }, { "epoch": 0.9581367924528302, "grad_norm": 3.977449655532837, "learning_rate": 4.787735849056604e-06, "loss": 0.5502, "num_input_tokens_seen": 1073504, "step": 1625 }, { "epoch": 0.9610849056603774, "grad_norm": 3.0226118564605713, "learning_rate": 4.80247641509434e-06, "loss": 0.4894, "num_input_tokens_seen": 1076224, "step": 1630 }, { "epoch": 0.9640330188679245, "grad_norm": 2.4303197860717773, "learning_rate": 4.817216981132076e-06, "loss": 0.4446, "num_input_tokens_seen": 1078912, "step": 1635 }, { "epoch": 0.9669811320754716, "grad_norm": 3.2032933235168457, "learning_rate": 4.831957547169811e-06, "loss": 0.6151, "num_input_tokens_seen": 1081632, "step": 1640 }, { "epoch": 0.9699292452830188, "grad_norm": 3.626514434814453, "learning_rate": 4.8466981132075476e-06, "loss": 0.5985, "num_input_tokens_seen": 1084000, "step": 1645 }, { "epoch": 0.972877358490566, "grad_norm": 2.6256682872772217, "learning_rate": 4.861438679245283e-06, "loss": 0.5093, "num_input_tokens_seen": 1087968, "step": 1650 }, { "epoch": 0.9758254716981132, "grad_norm": 1.9504079818725586, "learning_rate": 4.876179245283019e-06, "loss": 0.5324, "num_input_tokens_seen": 1091424, "step": 1655 }, { "epoch": 0.9787735849056604, "grad_norm": 5.960786819458008, "learning_rate": 4.890919811320755e-06, "loss": 0.5994, "num_input_tokens_seen": 1093984, "step": 1660 }, { "epoch": 0.9817216981132075, "grad_norm": 3.8546032905578613, "learning_rate": 4.905660377358491e-06, "loss": 0.6093, "num_input_tokens_seen": 1096512, "step": 1665 }, { "epoch": 0.9846698113207547, "grad_norm": 5.857081413269043, "learning_rate": 4.920400943396227e-06, "loss": 0.5195, "num_input_tokens_seen": 1099296, "step": 1670 }, { "epoch": 0.9876179245283019, "grad_norm": 3.008486270904541, "learning_rate": 4.935141509433963e-06, "loss": 0.6718, "num_input_tokens_seen": 1101984, "step": 1675 }, { "epoch": 0.9905660377358491, "grad_norm": 3.2688705921173096, "learning_rate": 4.949882075471698e-06, "loss": 0.5319, "num_input_tokens_seen": 1105632, "step": 1680 }, { "epoch": 0.9935141509433962, "grad_norm": 4.00997257232666, "learning_rate": 4.9646226415094346e-06, "loss": 0.558, "num_input_tokens_seen": 1108032, "step": 1685 }, { "epoch": 0.9964622641509434, "grad_norm": 3.1069600582122803, "learning_rate": 4.979363207547171e-06, "loss": 0.5533, "num_input_tokens_seen": 1111552, "step": 1690 }, { "epoch": 0.9994103773584906, "grad_norm": 5.2922821044921875, "learning_rate": 4.994103773584906e-06, "loss": 0.5527, "num_input_tokens_seen": 1114176, "step": 1695 }, { "epoch": 1.0023584905660377, "grad_norm": 4.249703884124756, "learning_rate": 5.0088443396226414e-06, "loss": 0.4423, "num_input_tokens_seen": 1116768, "step": 1700 }, { "epoch": 1.005306603773585, "grad_norm": 4.103577136993408, "learning_rate": 5.023584905660378e-06, "loss": 0.5543, "num_input_tokens_seen": 1119968, "step": 1705 }, { "epoch": 1.008254716981132, "grad_norm": 3.851033926010132, "learning_rate": 5.038325471698113e-06, "loss": 0.6666, "num_input_tokens_seen": 1123840, "step": 1710 }, { "epoch": 1.0112028301886793, "grad_norm": 3.7682266235351562, "learning_rate": 5.05306603773585e-06, "loss": 0.5371, "num_input_tokens_seen": 1126752, "step": 1715 }, { "epoch": 1.0141509433962264, "grad_norm": 4.232779502868652, "learning_rate": 5.067806603773585e-06, "loss": 0.4704, "num_input_tokens_seen": 1129568, "step": 1720 }, { "epoch": 1.0170990566037736, "grad_norm": 2.967118740081787, "learning_rate": 5.0825471698113216e-06, "loss": 0.5763, "num_input_tokens_seen": 1133536, "step": 1725 }, { "epoch": 1.0200471698113207, "grad_norm": 5.696288108825684, "learning_rate": 5.097287735849057e-06, "loss": 0.6713, "num_input_tokens_seen": 1137024, "step": 1730 }, { "epoch": 1.022995283018868, "grad_norm": 4.217639446258545, "learning_rate": 5.112028301886793e-06, "loss": 0.6222, "num_input_tokens_seen": 1140032, "step": 1735 }, { "epoch": 1.025943396226415, "grad_norm": 3.0691213607788086, "learning_rate": 5.1267688679245284e-06, "loss": 0.5244, "num_input_tokens_seen": 1144544, "step": 1740 }, { "epoch": 1.0288915094339623, "grad_norm": 4.341266632080078, "learning_rate": 5.1415094339622655e-06, "loss": 0.5728, "num_input_tokens_seen": 1148032, "step": 1745 }, { "epoch": 1.0318396226415094, "grad_norm": 3.4031808376312256, "learning_rate": 5.156250000000001e-06, "loss": 0.6018, "num_input_tokens_seen": 1152064, "step": 1750 }, { "epoch": 1.0347877358490567, "grad_norm": 1.8209701776504517, "learning_rate": 5.170990566037736e-06, "loss": 0.5697, "num_input_tokens_seen": 1155040, "step": 1755 }, { "epoch": 1.0377358490566038, "grad_norm": 4.900368690490723, "learning_rate": 5.185731132075472e-06, "loss": 0.6185, "num_input_tokens_seen": 1157728, "step": 1760 }, { "epoch": 1.040683962264151, "grad_norm": 4.575941562652588, "learning_rate": 5.200471698113208e-06, "loss": 0.6399, "num_input_tokens_seen": 1160480, "step": 1765 }, { "epoch": 1.0436320754716981, "grad_norm": 10.43868637084961, "learning_rate": 5.215212264150944e-06, "loss": 0.8053, "num_input_tokens_seen": 1163424, "step": 1770 }, { "epoch": 1.0465801886792452, "grad_norm": 3.881624937057495, "learning_rate": 5.229952830188679e-06, "loss": 0.6192, "num_input_tokens_seen": 1167136, "step": 1775 }, { "epoch": 1.0495283018867925, "grad_norm": 2.6529641151428223, "learning_rate": 5.2446933962264154e-06, "loss": 0.59, "num_input_tokens_seen": 1170720, "step": 1780 }, { "epoch": 1.0524764150943395, "grad_norm": 2.8498570919036865, "learning_rate": 5.259433962264151e-06, "loss": 0.5706, "num_input_tokens_seen": 1174144, "step": 1785 }, { "epoch": 1.0554245283018868, "grad_norm": 1.846171498298645, "learning_rate": 5.274174528301888e-06, "loss": 0.8162, "num_input_tokens_seen": 1176704, "step": 1790 }, { "epoch": 1.0583726415094339, "grad_norm": 3.792846202850342, "learning_rate": 5.288915094339623e-06, "loss": 0.6045, "num_input_tokens_seen": 1179264, "step": 1795 }, { "epoch": 1.0613207547169812, "grad_norm": 5.151941299438477, "learning_rate": 5.303655660377359e-06, "loss": 0.5421, "num_input_tokens_seen": 1181888, "step": 1800 }, { "epoch": 1.0642688679245282, "grad_norm": 2.3455300331115723, "learning_rate": 5.318396226415095e-06, "loss": 0.6233, "num_input_tokens_seen": 1185184, "step": 1805 }, { "epoch": 1.0672169811320755, "grad_norm": 2.107667922973633, "learning_rate": 5.333136792452831e-06, "loss": 0.5784, "num_input_tokens_seen": 1188576, "step": 1810 }, { "epoch": 1.0701650943396226, "grad_norm": 2.338409185409546, "learning_rate": 5.347877358490566e-06, "loss": 0.4903, "num_input_tokens_seen": 1193696, "step": 1815 }, { "epoch": 1.0731132075471699, "grad_norm": 3.872992753982544, "learning_rate": 5.362617924528302e-06, "loss": 0.6233, "num_input_tokens_seen": 1196864, "step": 1820 }, { "epoch": 1.076061320754717, "grad_norm": 22.63856315612793, "learning_rate": 5.377358490566038e-06, "loss": 0.4625, "num_input_tokens_seen": 1199488, "step": 1825 }, { "epoch": 1.0790094339622642, "grad_norm": 2.4325506687164307, "learning_rate": 5.392099056603775e-06, "loss": 0.4523, "num_input_tokens_seen": 1202560, "step": 1830 }, { "epoch": 1.0819575471698113, "grad_norm": 2.471057891845703, "learning_rate": 5.40683962264151e-06, "loss": 0.5561, "num_input_tokens_seen": 1206912, "step": 1835 }, { "epoch": 1.0849056603773586, "grad_norm": 4.980198860168457, "learning_rate": 5.4215801886792455e-06, "loss": 0.5805, "num_input_tokens_seen": 1214336, "step": 1840 }, { "epoch": 1.0878537735849056, "grad_norm": 3.3913519382476807, "learning_rate": 5.436320754716982e-06, "loss": 0.4879, "num_input_tokens_seen": 1217248, "step": 1845 }, { "epoch": 1.0908018867924527, "grad_norm": 6.771216869354248, "learning_rate": 5.451061320754717e-06, "loss": 0.5373, "num_input_tokens_seen": 1220992, "step": 1850 }, { "epoch": 1.09375, "grad_norm": 2.413623094558716, "learning_rate": 5.465801886792453e-06, "loss": 0.5705, "num_input_tokens_seen": 1225120, "step": 1855 }, { "epoch": 1.0966981132075473, "grad_norm": 2.9067020416259766, "learning_rate": 5.4805424528301886e-06, "loss": 0.4831, "num_input_tokens_seen": 1228832, "step": 1860 }, { "epoch": 1.0996462264150944, "grad_norm": 5.140554904937744, "learning_rate": 5.495283018867925e-06, "loss": 0.5371, "num_input_tokens_seen": 1232960, "step": 1865 }, { "epoch": 1.1025943396226414, "grad_norm": 3.314495086669922, "learning_rate": 5.51002358490566e-06, "loss": 0.5514, "num_input_tokens_seen": 1237408, "step": 1870 }, { "epoch": 1.1055424528301887, "grad_norm": 4.50612735748291, "learning_rate": 5.524764150943397e-06, "loss": 0.6155, "num_input_tokens_seen": 1240384, "step": 1875 }, { "epoch": 1.1084905660377358, "grad_norm": 2.173063278198242, "learning_rate": 5.5395047169811325e-06, "loss": 0.4352, "num_input_tokens_seen": 1243904, "step": 1880 }, { "epoch": 1.111438679245283, "grad_norm": 4.385035037994385, "learning_rate": 5.554245283018869e-06, "loss": 0.5924, "num_input_tokens_seen": 1246720, "step": 1885 }, { "epoch": 1.1143867924528301, "grad_norm": 4.968380451202393, "learning_rate": 5.568985849056604e-06, "loss": 0.6294, "num_input_tokens_seen": 1249120, "step": 1890 }, { "epoch": 1.1173349056603774, "grad_norm": 3.5222890377044678, "learning_rate": 5.58372641509434e-06, "loss": 0.4898, "num_input_tokens_seen": 1252224, "step": 1895 }, { "epoch": 1.1202830188679245, "grad_norm": 3.6813976764678955, "learning_rate": 5.5984669811320755e-06, "loss": 0.508, "num_input_tokens_seen": 1255360, "step": 1900 }, { "epoch": 1.1232311320754718, "grad_norm": 3.5292656421661377, "learning_rate": 5.613207547169813e-06, "loss": 0.4975, "num_input_tokens_seen": 1258016, "step": 1905 }, { "epoch": 1.1261792452830188, "grad_norm": 2.9576783180236816, "learning_rate": 5.627948113207548e-06, "loss": 0.5165, "num_input_tokens_seen": 1262560, "step": 1910 }, { "epoch": 1.1291273584905661, "grad_norm": 3.691248893737793, "learning_rate": 5.642688679245284e-06, "loss": 0.5123, "num_input_tokens_seen": 1265568, "step": 1915 }, { "epoch": 1.1320754716981132, "grad_norm": 5.523294925689697, "learning_rate": 5.6574292452830195e-06, "loss": 0.4834, "num_input_tokens_seen": 1268512, "step": 1920 }, { "epoch": 1.1350235849056605, "grad_norm": 3.2392354011535645, "learning_rate": 5.672169811320756e-06, "loss": 0.5192, "num_input_tokens_seen": 1271520, "step": 1925 }, { "epoch": 1.1379716981132075, "grad_norm": 3.8587400913238525, "learning_rate": 5.686910377358491e-06, "loss": 0.4993, "num_input_tokens_seen": 1274784, "step": 1930 }, { "epoch": 1.1409198113207548, "grad_norm": 1.6959421634674072, "learning_rate": 5.701650943396226e-06, "loss": 0.4692, "num_input_tokens_seen": 1277984, "step": 1935 }, { "epoch": 1.1438679245283019, "grad_norm": 2.152656316757202, "learning_rate": 5.7163915094339625e-06, "loss": 0.6395, "num_input_tokens_seen": 1280736, "step": 1940 }, { "epoch": 1.146816037735849, "grad_norm": 2.790085792541504, "learning_rate": 5.731132075471698e-06, "loss": 0.5326, "num_input_tokens_seen": 1284320, "step": 1945 }, { "epoch": 1.1497641509433962, "grad_norm": 2.3330836296081543, "learning_rate": 5.745872641509435e-06, "loss": 0.5145, "num_input_tokens_seen": 1287008, "step": 1950 }, { "epoch": 1.1527122641509433, "grad_norm": 2.0764760971069336, "learning_rate": 5.76061320754717e-06, "loss": 0.4969, "num_input_tokens_seen": 1289568, "step": 1955 }, { "epoch": 1.1556603773584906, "grad_norm": 5.520761489868164, "learning_rate": 5.7753537735849065e-06, "loss": 0.5207, "num_input_tokens_seen": 1292224, "step": 1960 }, { "epoch": 1.1586084905660377, "grad_norm": 4.548455715179443, "learning_rate": 5.790094339622642e-06, "loss": 0.6401, "num_input_tokens_seen": 1294496, "step": 1965 }, { "epoch": 1.161556603773585, "grad_norm": 2.090322494506836, "learning_rate": 5.804834905660378e-06, "loss": 0.5149, "num_input_tokens_seen": 1297888, "step": 1970 }, { "epoch": 1.164504716981132, "grad_norm": 2.509202718734741, "learning_rate": 5.819575471698113e-06, "loss": 0.4931, "num_input_tokens_seen": 1301504, "step": 1975 }, { "epoch": 1.1674528301886793, "grad_norm": 2.7064290046691895, "learning_rate": 5.8343160377358495e-06, "loss": 0.5105, "num_input_tokens_seen": 1305472, "step": 1980 }, { "epoch": 1.1704009433962264, "grad_norm": 2.9242172241210938, "learning_rate": 5.849056603773585e-06, "loss": 0.5799, "num_input_tokens_seen": 1308768, "step": 1985 }, { "epoch": 1.1733490566037736, "grad_norm": 2.3623175621032715, "learning_rate": 5.863797169811322e-06, "loss": 0.5767, "num_input_tokens_seen": 1313984, "step": 1990 }, { "epoch": 1.1762971698113207, "grad_norm": 3.6169967651367188, "learning_rate": 5.878537735849057e-06, "loss": 0.6357, "num_input_tokens_seen": 1317152, "step": 1995 }, { "epoch": 1.179245283018868, "grad_norm": 3.7577714920043945, "learning_rate": 5.8932783018867934e-06, "loss": 0.3919, "num_input_tokens_seen": 1320672, "step": 2000 }, { "epoch": 1.182193396226415, "grad_norm": 3.693960666656494, "learning_rate": 5.908018867924529e-06, "loss": 0.4994, "num_input_tokens_seen": 1323840, "step": 2005 }, { "epoch": 1.1851415094339623, "grad_norm": 3.1659798622131348, "learning_rate": 5.922759433962265e-06, "loss": 0.5483, "num_input_tokens_seen": 1326752, "step": 2010 }, { "epoch": 1.1880896226415094, "grad_norm": 3.9818222522735596, "learning_rate": 5.9375e-06, "loss": 0.483, "num_input_tokens_seen": 1329408, "step": 2015 }, { "epoch": 1.1910377358490567, "grad_norm": 4.175352573394775, "learning_rate": 5.952240566037736e-06, "loss": 0.5483, "num_input_tokens_seen": 1331360, "step": 2020 }, { "epoch": 1.1939858490566038, "grad_norm": 6.184230804443359, "learning_rate": 5.966981132075472e-06, "loss": 0.7314, "num_input_tokens_seen": 1334304, "step": 2025 }, { "epoch": 1.196933962264151, "grad_norm": 3.197706937789917, "learning_rate": 5.981721698113207e-06, "loss": 0.506, "num_input_tokens_seen": 1340832, "step": 2030 }, { "epoch": 1.1998820754716981, "grad_norm": 4.534255504608154, "learning_rate": 5.996462264150944e-06, "loss": 0.5492, "num_input_tokens_seen": 1343552, "step": 2035 }, { "epoch": 1.2028301886792452, "grad_norm": 3.6178128719329834, "learning_rate": 6.01120283018868e-06, "loss": 0.4167, "num_input_tokens_seen": 1347584, "step": 2040 }, { "epoch": 1.2057783018867925, "grad_norm": 3.2608139514923096, "learning_rate": 6.025943396226416e-06, "loss": 0.5045, "num_input_tokens_seen": 1350624, "step": 2045 }, { "epoch": 1.2087264150943395, "grad_norm": 4.287358283996582, "learning_rate": 6.040683962264151e-06, "loss": 0.5541, "num_input_tokens_seen": 1353408, "step": 2050 }, { "epoch": 1.2116745283018868, "grad_norm": 4.255808353424072, "learning_rate": 6.055424528301887e-06, "loss": 0.6644, "num_input_tokens_seen": 1357056, "step": 2055 }, { "epoch": 1.2146226415094339, "grad_norm": 2.3566532135009766, "learning_rate": 6.070165094339623e-06, "loss": 0.5401, "num_input_tokens_seen": 1359744, "step": 2060 }, { "epoch": 1.2175707547169812, "grad_norm": 3.019322395324707, "learning_rate": 6.08490566037736e-06, "loss": 0.5824, "num_input_tokens_seen": 1363104, "step": 2065 }, { "epoch": 1.2205188679245282, "grad_norm": 5.312661647796631, "learning_rate": 6.099646226415095e-06, "loss": 0.5176, "num_input_tokens_seen": 1365856, "step": 2070 }, { "epoch": 1.2234669811320755, "grad_norm": 2.836784839630127, "learning_rate": 6.114386792452831e-06, "loss": 0.4889, "num_input_tokens_seen": 1369056, "step": 2075 }, { "epoch": 1.2264150943396226, "grad_norm": 2.703174591064453, "learning_rate": 6.129127358490567e-06, "loss": 0.5321, "num_input_tokens_seen": 1372192, "step": 2080 }, { "epoch": 1.2293632075471699, "grad_norm": 3.301712989807129, "learning_rate": 6.143867924528303e-06, "loss": 0.6158, "num_input_tokens_seen": 1375840, "step": 2085 }, { "epoch": 1.232311320754717, "grad_norm": 2.398902654647827, "learning_rate": 6.158608490566038e-06, "loss": 0.5352, "num_input_tokens_seen": 1380032, "step": 2090 }, { "epoch": 1.2352594339622642, "grad_norm": 4.16459846496582, "learning_rate": 6.173349056603774e-06, "loss": 0.4882, "num_input_tokens_seen": 1382944, "step": 2095 }, { "epoch": 1.2382075471698113, "grad_norm": 3.7614545822143555, "learning_rate": 6.18808962264151e-06, "loss": 0.5135, "num_input_tokens_seen": 1385536, "step": 2100 }, { "epoch": 1.2411556603773586, "grad_norm": 2.9860689640045166, "learning_rate": 6.202830188679245e-06, "loss": 0.5549, "num_input_tokens_seen": 1388224, "step": 2105 }, { "epoch": 1.2441037735849056, "grad_norm": 1.8587725162506104, "learning_rate": 6.217570754716982e-06, "loss": 0.5414, "num_input_tokens_seen": 1391520, "step": 2110 }, { "epoch": 1.2470518867924527, "grad_norm": 3.4099864959716797, "learning_rate": 6.232311320754717e-06, "loss": 0.4993, "num_input_tokens_seen": 1395136, "step": 2115 }, { "epoch": 1.25, "grad_norm": 2.187669038772583, "learning_rate": 6.2470518867924536e-06, "loss": 0.3933, "num_input_tokens_seen": 1398272, "step": 2120 }, { "epoch": 1.2529481132075473, "grad_norm": 3.6227307319641113, "learning_rate": 6.261792452830189e-06, "loss": 0.4502, "num_input_tokens_seen": 1401152, "step": 2125 }, { "epoch": 1.2558962264150944, "grad_norm": 2.010763168334961, "learning_rate": 6.276533018867925e-06, "loss": 0.4865, "num_input_tokens_seen": 1404320, "step": 2130 }, { "epoch": 1.2588443396226414, "grad_norm": 5.0804619789123535, "learning_rate": 6.2912735849056604e-06, "loss": 0.623, "num_input_tokens_seen": 1406784, "step": 2135 }, { "epoch": 1.2617924528301887, "grad_norm": 2.9033353328704834, "learning_rate": 6.306014150943397e-06, "loss": 0.4657, "num_input_tokens_seen": 1409536, "step": 2140 }, { "epoch": 1.2647405660377358, "grad_norm": 4.2069993019104, "learning_rate": 6.320754716981132e-06, "loss": 0.5535, "num_input_tokens_seen": 1412608, "step": 2145 }, { "epoch": 1.267688679245283, "grad_norm": 2.6685538291931152, "learning_rate": 6.335495283018869e-06, "loss": 0.4679, "num_input_tokens_seen": 1415104, "step": 2150 }, { "epoch": 1.2706367924528301, "grad_norm": 2.4269840717315674, "learning_rate": 6.350235849056604e-06, "loss": 0.474, "num_input_tokens_seen": 1418496, "step": 2155 }, { "epoch": 1.2735849056603774, "grad_norm": 3.048978090286255, "learning_rate": 6.3649764150943406e-06, "loss": 0.7338, "num_input_tokens_seen": 1421760, "step": 2160 }, { "epoch": 1.2765330188679245, "grad_norm": 3.914550304412842, "learning_rate": 6.379716981132076e-06, "loss": 0.5287, "num_input_tokens_seen": 1425888, "step": 2165 }, { "epoch": 1.2794811320754718, "grad_norm": 2.482325315475464, "learning_rate": 6.394457547169812e-06, "loss": 0.4954, "num_input_tokens_seen": 1429440, "step": 2170 }, { "epoch": 1.2824292452830188, "grad_norm": 1.682214379310608, "learning_rate": 6.4091981132075474e-06, "loss": 0.5981, "num_input_tokens_seen": 1433664, "step": 2175 }, { "epoch": 1.2853773584905661, "grad_norm": 3.9419360160827637, "learning_rate": 6.423938679245284e-06, "loss": 0.6603, "num_input_tokens_seen": 1436704, "step": 2180 }, { "epoch": 1.2883254716981132, "grad_norm": 4.337841510772705, "learning_rate": 6.438679245283019e-06, "loss": 0.7338, "num_input_tokens_seen": 1439776, "step": 2185 }, { "epoch": 1.2912735849056602, "grad_norm": 2.785484552383423, "learning_rate": 6.453419811320756e-06, "loss": 0.5674, "num_input_tokens_seen": 1442848, "step": 2190 }, { "epoch": 1.2942216981132075, "grad_norm": 2.8729817867279053, "learning_rate": 6.468160377358491e-06, "loss": 0.5426, "num_input_tokens_seen": 1446400, "step": 2195 }, { "epoch": 1.2971698113207548, "grad_norm": 3.360976219177246, "learning_rate": 6.482900943396227e-06, "loss": 0.4204, "num_input_tokens_seen": 1450112, "step": 2200 }, { "epoch": 1.3001179245283019, "grad_norm": 16.757238388061523, "learning_rate": 6.497641509433963e-06, "loss": 0.5361, "num_input_tokens_seen": 1452832, "step": 2205 }, { "epoch": 1.303066037735849, "grad_norm": 3.2596757411956787, "learning_rate": 6.512382075471698e-06, "loss": 0.5846, "num_input_tokens_seen": 1456288, "step": 2210 }, { "epoch": 1.3060141509433962, "grad_norm": 2.6916353702545166, "learning_rate": 6.5271226415094344e-06, "loss": 0.5415, "num_input_tokens_seen": 1458944, "step": 2215 }, { "epoch": 1.3089622641509435, "grad_norm": 3.620497226715088, "learning_rate": 6.54186320754717e-06, "loss": 0.5442, "num_input_tokens_seen": 1462176, "step": 2220 }, { "epoch": 1.3119103773584906, "grad_norm": 3.2965245246887207, "learning_rate": 6.556603773584907e-06, "loss": 0.5742, "num_input_tokens_seen": 1465664, "step": 2225 }, { "epoch": 1.3148584905660377, "grad_norm": 2.5756919384002686, "learning_rate": 6.571344339622641e-06, "loss": 0.5088, "num_input_tokens_seen": 1468480, "step": 2230 }, { "epoch": 1.317806603773585, "grad_norm": 3.298975944519043, "learning_rate": 6.586084905660378e-06, "loss": 0.5049, "num_input_tokens_seen": 1472640, "step": 2235 }, { "epoch": 1.320754716981132, "grad_norm": 2.3547816276550293, "learning_rate": 6.600825471698114e-06, "loss": 0.4926, "num_input_tokens_seen": 1476160, "step": 2240 }, { "epoch": 1.3237028301886793, "grad_norm": 1.6435917615890503, "learning_rate": 6.61556603773585e-06, "loss": 0.5787, "num_input_tokens_seen": 1479168, "step": 2245 }, { "epoch": 1.3266509433962264, "grad_norm": 4.282700538635254, "learning_rate": 6.630306603773585e-06, "loss": 0.5289, "num_input_tokens_seen": 1482656, "step": 2250 }, { "epoch": 1.3295990566037736, "grad_norm": 3.351133346557617, "learning_rate": 6.645047169811321e-06, "loss": 0.4083, "num_input_tokens_seen": 1485600, "step": 2255 }, { "epoch": 1.3325471698113207, "grad_norm": 2.189128875732422, "learning_rate": 6.659787735849057e-06, "loss": 0.5556, "num_input_tokens_seen": 1488768, "step": 2260 }, { "epoch": 1.335495283018868, "grad_norm": 1.6451798677444458, "learning_rate": 6.674528301886794e-06, "loss": 0.4465, "num_input_tokens_seen": 1492224, "step": 2265 }, { "epoch": 1.338443396226415, "grad_norm": 3.4212090969085693, "learning_rate": 6.689268867924529e-06, "loss": 0.6671, "num_input_tokens_seen": 1494656, "step": 2270 }, { "epoch": 1.3413915094339623, "grad_norm": 3.1692099571228027, "learning_rate": 6.704009433962265e-06, "loss": 0.7953, "num_input_tokens_seen": 1498048, "step": 2275 }, { "epoch": 1.3443396226415094, "grad_norm": 4.251041889190674, "learning_rate": 6.718750000000001e-06, "loss": 0.5746, "num_input_tokens_seen": 1500864, "step": 2280 }, { "epoch": 1.3472877358490565, "grad_norm": 3.548417329788208, "learning_rate": 6.733490566037736e-06, "loss": 0.7002, "num_input_tokens_seen": 1503456, "step": 2285 }, { "epoch": 1.3502358490566038, "grad_norm": 2.8523647785186768, "learning_rate": 6.748231132075472e-06, "loss": 0.4461, "num_input_tokens_seen": 1507520, "step": 2290 }, { "epoch": 1.353183962264151, "grad_norm": 8.364376068115234, "learning_rate": 6.7629716981132076e-06, "loss": 0.5449, "num_input_tokens_seen": 1510272, "step": 2295 }, { "epoch": 1.3561320754716981, "grad_norm": 1.7106280326843262, "learning_rate": 6.777712264150944e-06, "loss": 0.6811, "num_input_tokens_seen": 1514016, "step": 2300 }, { "epoch": 1.3590801886792452, "grad_norm": 5.7316436767578125, "learning_rate": 6.792452830188679e-06, "loss": 0.5663, "num_input_tokens_seen": 1516384, "step": 2305 }, { "epoch": 1.3620283018867925, "grad_norm": 1.8173402547836304, "learning_rate": 6.807193396226416e-06, "loss": 0.5634, "num_input_tokens_seen": 1520000, "step": 2310 }, { "epoch": 1.3649764150943398, "grad_norm": 3.874575614929199, "learning_rate": 6.8219339622641515e-06, "loss": 0.5711, "num_input_tokens_seen": 1523424, "step": 2315 }, { "epoch": 1.3679245283018868, "grad_norm": 1.8740822076797485, "learning_rate": 6.836674528301888e-06, "loss": 0.483, "num_input_tokens_seen": 1527712, "step": 2320 }, { "epoch": 1.3708726415094339, "grad_norm": 1.5051612854003906, "learning_rate": 6.851415094339623e-06, "loss": 0.4839, "num_input_tokens_seen": 1530336, "step": 2325 }, { "epoch": 1.3738207547169812, "grad_norm": 2.4557623863220215, "learning_rate": 6.866155660377359e-06, "loss": 0.5352, "num_input_tokens_seen": 1533600, "step": 2330 }, { "epoch": 1.3767688679245282, "grad_norm": 2.603081464767456, "learning_rate": 6.8808962264150946e-06, "loss": 0.5141, "num_input_tokens_seen": 1536608, "step": 2335 }, { "epoch": 1.3797169811320755, "grad_norm": 5.052565097808838, "learning_rate": 6.895636792452831e-06, "loss": 0.5159, "num_input_tokens_seen": 1539840, "step": 2340 }, { "epoch": 1.3826650943396226, "grad_norm": 3.450303792953491, "learning_rate": 6.910377358490566e-06, "loss": 0.5546, "num_input_tokens_seen": 1542848, "step": 2345 }, { "epoch": 1.3856132075471699, "grad_norm": 5.4785051345825195, "learning_rate": 6.925117924528303e-06, "loss": 0.5306, "num_input_tokens_seen": 1545632, "step": 2350 }, { "epoch": 1.388561320754717, "grad_norm": 2.52748703956604, "learning_rate": 6.9398584905660385e-06, "loss": 0.4381, "num_input_tokens_seen": 1548608, "step": 2355 }, { "epoch": 1.3915094339622642, "grad_norm": 7.0988664627075195, "learning_rate": 6.954599056603775e-06, "loss": 0.5801, "num_input_tokens_seen": 1551040, "step": 2360 }, { "epoch": 1.3944575471698113, "grad_norm": 4.77442741394043, "learning_rate": 6.96933962264151e-06, "loss": 0.5325, "num_input_tokens_seen": 1554016, "step": 2365 }, { "epoch": 1.3974056603773586, "grad_norm": 3.5484859943389893, "learning_rate": 6.984080188679245e-06, "loss": 0.5848, "num_input_tokens_seen": 1556448, "step": 2370 }, { "epoch": 1.4003537735849056, "grad_norm": 2.26281476020813, "learning_rate": 6.9988207547169815e-06, "loss": 0.4444, "num_input_tokens_seen": 1559616, "step": 2375 }, { "epoch": 1.4033018867924527, "grad_norm": 1.4687451124191284, "learning_rate": 7.013561320754717e-06, "loss": 0.5111, "num_input_tokens_seen": 1562848, "step": 2380 }, { "epoch": 1.40625, "grad_norm": 3.5146713256835938, "learning_rate": 7.028301886792454e-06, "loss": 0.4979, "num_input_tokens_seen": 1566464, "step": 2385 }, { "epoch": 1.4091981132075473, "grad_norm": 3.4217355251312256, "learning_rate": 7.043042452830188e-06, "loss": 0.6476, "num_input_tokens_seen": 1569824, "step": 2390 }, { "epoch": 1.4121462264150944, "grad_norm": 2.4154245853424072, "learning_rate": 7.0577830188679255e-06, "loss": 0.528, "num_input_tokens_seen": 1573216, "step": 2395 }, { "epoch": 1.4150943396226414, "grad_norm": 1.2288198471069336, "learning_rate": 7.072523584905661e-06, "loss": 0.525, "num_input_tokens_seen": 1576576, "step": 2400 }, { "epoch": 1.4180424528301887, "grad_norm": 1.8532792329788208, "learning_rate": 7.087264150943397e-06, "loss": 0.6561, "num_input_tokens_seen": 1581824, "step": 2405 }, { "epoch": 1.4209905660377358, "grad_norm": 1.5900774002075195, "learning_rate": 7.102004716981132e-06, "loss": 0.637, "num_input_tokens_seen": 1584448, "step": 2410 }, { "epoch": 1.423938679245283, "grad_norm": 2.3724169731140137, "learning_rate": 7.1167452830188685e-06, "loss": 0.5577, "num_input_tokens_seen": 1587680, "step": 2415 }, { "epoch": 1.4268867924528301, "grad_norm": 2.141202211380005, "learning_rate": 7.131485849056604e-06, "loss": 0.4997, "num_input_tokens_seen": 1592352, "step": 2420 }, { "epoch": 1.4298349056603774, "grad_norm": 2.9847638607025146, "learning_rate": 7.146226415094341e-06, "loss": 0.5602, "num_input_tokens_seen": 1595296, "step": 2425 }, { "epoch": 1.4327830188679245, "grad_norm": 2.8324172496795654, "learning_rate": 7.160966981132076e-06, "loss": 0.62, "num_input_tokens_seen": 1598336, "step": 2430 }, { "epoch": 1.4357311320754718, "grad_norm": 4.628724098205566, "learning_rate": 7.1757075471698125e-06, "loss": 0.6017, "num_input_tokens_seen": 1600864, "step": 2435 }, { "epoch": 1.4386792452830188, "grad_norm": 2.555259943008423, "learning_rate": 7.190448113207548e-06, "loss": 0.4879, "num_input_tokens_seen": 1603200, "step": 2440 }, { "epoch": 1.4416273584905661, "grad_norm": 4.385376930236816, "learning_rate": 7.205188679245284e-06, "loss": 0.5204, "num_input_tokens_seen": 1605920, "step": 2445 }, { "epoch": 1.4445754716981132, "grad_norm": 2.0349044799804688, "learning_rate": 7.219929245283019e-06, "loss": 0.4936, "num_input_tokens_seen": 1609952, "step": 2450 }, { "epoch": 1.4475235849056602, "grad_norm": 2.761909246444702, "learning_rate": 7.2346698113207555e-06, "loss": 0.4876, "num_input_tokens_seen": 1612832, "step": 2455 }, { "epoch": 1.4504716981132075, "grad_norm": 2.9955990314483643, "learning_rate": 7.249410377358491e-06, "loss": 0.551, "num_input_tokens_seen": 1616032, "step": 2460 }, { "epoch": 1.4534198113207548, "grad_norm": 4.810715198516846, "learning_rate": 7.264150943396226e-06, "loss": 0.6721, "num_input_tokens_seen": 1618656, "step": 2465 }, { "epoch": 1.4563679245283019, "grad_norm": 3.773735284805298, "learning_rate": 7.278891509433963e-06, "loss": 0.4621, "num_input_tokens_seen": 1622016, "step": 2470 }, { "epoch": 1.459316037735849, "grad_norm": 2.9410014152526855, "learning_rate": 7.293632075471699e-06, "loss": 0.522, "num_input_tokens_seen": 1624960, "step": 2475 }, { "epoch": 1.4622641509433962, "grad_norm": 2.5756518840789795, "learning_rate": 7.308372641509435e-06, "loss": 0.5426, "num_input_tokens_seen": 1627616, "step": 2480 }, { "epoch": 1.4652122641509435, "grad_norm": 2.486736536026001, "learning_rate": 7.32311320754717e-06, "loss": 0.5014, "num_input_tokens_seen": 1631552, "step": 2485 }, { "epoch": 1.4681603773584906, "grad_norm": 2.4909157752990723, "learning_rate": 7.337853773584906e-06, "loss": 0.5222, "num_input_tokens_seen": 1635168, "step": 2490 }, { "epoch": 1.4711084905660377, "grad_norm": 2.5154950618743896, "learning_rate": 7.352594339622642e-06, "loss": 0.6117, "num_input_tokens_seen": 1638496, "step": 2495 }, { "epoch": 1.474056603773585, "grad_norm": 4.234388828277588, "learning_rate": 7.367334905660378e-06, "loss": 0.4517, "num_input_tokens_seen": 1641056, "step": 2500 }, { "epoch": 1.477004716981132, "grad_norm": 4.872511863708496, "learning_rate": 7.382075471698113e-06, "loss": 0.4142, "num_input_tokens_seen": 1643456, "step": 2505 }, { "epoch": 1.4799528301886793, "grad_norm": 3.017025947570801, "learning_rate": 7.39681603773585e-06, "loss": 0.6685, "num_input_tokens_seen": 1646048, "step": 2510 }, { "epoch": 1.4829009433962264, "grad_norm": 2.021003007888794, "learning_rate": 7.411556603773586e-06, "loss": 0.5022, "num_input_tokens_seen": 1648736, "step": 2515 }, { "epoch": 1.4858490566037736, "grad_norm": 4.787374496459961, "learning_rate": 7.426297169811322e-06, "loss": 0.7628, "num_input_tokens_seen": 1651744, "step": 2520 }, { "epoch": 1.4887971698113207, "grad_norm": 4.024120330810547, "learning_rate": 7.441037735849057e-06, "loss": 0.5596, "num_input_tokens_seen": 1655360, "step": 2525 }, { "epoch": 1.491745283018868, "grad_norm": 1.8238476514816284, "learning_rate": 7.455778301886793e-06, "loss": 0.4784, "num_input_tokens_seen": 1658496, "step": 2530 }, { "epoch": 1.494693396226415, "grad_norm": 1.660233497619629, "learning_rate": 7.470518867924529e-06, "loss": 0.4774, "num_input_tokens_seen": 1662464, "step": 2535 }, { "epoch": 1.4976415094339623, "grad_norm": 2.269698143005371, "learning_rate": 7.485259433962266e-06, "loss": 0.5315, "num_input_tokens_seen": 1666240, "step": 2540 }, { "epoch": 1.5005896226415094, "grad_norm": 4.825011253356934, "learning_rate": 7.500000000000001e-06, "loss": 0.5831, "num_input_tokens_seen": 1669664, "step": 2545 }, { "epoch": 1.5035377358490565, "grad_norm": 2.543957471847534, "learning_rate": 7.5147405660377355e-06, "loss": 0.6485, "num_input_tokens_seen": 1672480, "step": 2550 }, { "epoch": 1.5064858490566038, "grad_norm": 1.3370933532714844, "learning_rate": 7.5294811320754726e-06, "loss": 0.5187, "num_input_tokens_seen": 1676928, "step": 2555 }, { "epoch": 1.509433962264151, "grad_norm": 2.7509067058563232, "learning_rate": 7.544221698113208e-06, "loss": 0.5311, "num_input_tokens_seen": 1680416, "step": 2560 }, { "epoch": 1.5123820754716981, "grad_norm": 2.8606014251708984, "learning_rate": 7.558962264150944e-06, "loss": 0.5865, "num_input_tokens_seen": 1683936, "step": 2565 }, { "epoch": 1.5153301886792452, "grad_norm": 1.9248907566070557, "learning_rate": 7.5737028301886795e-06, "loss": 0.5626, "num_input_tokens_seen": 1686592, "step": 2570 }, { "epoch": 1.5182783018867925, "grad_norm": 2.226353883743286, "learning_rate": 7.588443396226416e-06, "loss": 0.4807, "num_input_tokens_seen": 1690080, "step": 2575 }, { "epoch": 1.5212264150943398, "grad_norm": 2.218029737472534, "learning_rate": 7.603183962264151e-06, "loss": 0.5007, "num_input_tokens_seen": 1693184, "step": 2580 }, { "epoch": 1.5241745283018868, "grad_norm": 2.605360984802246, "learning_rate": 7.617924528301888e-06, "loss": 0.5945, "num_input_tokens_seen": 1696416, "step": 2585 }, { "epoch": 1.5271226415094339, "grad_norm": 3.2108309268951416, "learning_rate": 7.632665094339623e-06, "loss": 0.6371, "num_input_tokens_seen": 1699936, "step": 2590 }, { "epoch": 1.5300707547169812, "grad_norm": 3.3554418087005615, "learning_rate": 7.64740566037736e-06, "loss": 0.4407, "num_input_tokens_seen": 1702848, "step": 2595 }, { "epoch": 1.5330188679245285, "grad_norm": 1.4536768198013306, "learning_rate": 7.662146226415095e-06, "loss": 0.3951, "num_input_tokens_seen": 1706016, "step": 2600 }, { "epoch": 1.5359669811320755, "grad_norm": 3.0723934173583984, "learning_rate": 7.676886792452832e-06, "loss": 0.5948, "num_input_tokens_seen": 1708448, "step": 2605 }, { "epoch": 1.5389150943396226, "grad_norm": 1.5783774852752686, "learning_rate": 7.691627358490567e-06, "loss": 0.4562, "num_input_tokens_seen": 1711648, "step": 2610 }, { "epoch": 1.5418632075471699, "grad_norm": 2.3228297233581543, "learning_rate": 7.706367924528303e-06, "loss": 0.5881, "num_input_tokens_seen": 1715456, "step": 2615 }, { "epoch": 1.544811320754717, "grad_norm": 2.5645196437835693, "learning_rate": 7.721108490566038e-06, "loss": 0.4879, "num_input_tokens_seen": 1719808, "step": 2620 }, { "epoch": 1.547759433962264, "grad_norm": 2.7866580486297607, "learning_rate": 7.735849056603775e-06, "loss": 0.4888, "num_input_tokens_seen": 1724320, "step": 2625 }, { "epoch": 1.5507075471698113, "grad_norm": 1.4965835809707642, "learning_rate": 7.75058962264151e-06, "loss": 0.5349, "num_input_tokens_seen": 1727392, "step": 2630 }, { "epoch": 1.5536556603773586, "grad_norm": 2.3611865043640137, "learning_rate": 7.765330188679246e-06, "loss": 0.4513, "num_input_tokens_seen": 1730336, "step": 2635 }, { "epoch": 1.5566037735849056, "grad_norm": 1.8291480541229248, "learning_rate": 7.780070754716981e-06, "loss": 0.4757, "num_input_tokens_seen": 1733344, "step": 2640 }, { "epoch": 1.5595518867924527, "grad_norm": 11.644481658935547, "learning_rate": 7.794811320754716e-06, "loss": 0.5294, "num_input_tokens_seen": 1735872, "step": 2645 }, { "epoch": 1.5625, "grad_norm": 3.3983981609344482, "learning_rate": 7.809551886792453e-06, "loss": 0.3726, "num_input_tokens_seen": 1738464, "step": 2650 }, { "epoch": 1.5654481132075473, "grad_norm": 2.53676700592041, "learning_rate": 7.824292452830189e-06, "loss": 0.5604, "num_input_tokens_seen": 1742240, "step": 2655 }, { "epoch": 1.5683962264150944, "grad_norm": 4.125581741333008, "learning_rate": 7.839033018867926e-06, "loss": 0.4881, "num_input_tokens_seen": 1744992, "step": 2660 }, { "epoch": 1.5713443396226414, "grad_norm": 1.8486053943634033, "learning_rate": 7.853773584905661e-06, "loss": 0.5349, "num_input_tokens_seen": 1748128, "step": 2665 }, { "epoch": 1.5742924528301887, "grad_norm": 3.089700222015381, "learning_rate": 7.868514150943397e-06, "loss": 0.5573, "num_input_tokens_seen": 1751456, "step": 2670 }, { "epoch": 1.577240566037736, "grad_norm": 3.634348154067993, "learning_rate": 7.883254716981132e-06, "loss": 0.4328, "num_input_tokens_seen": 1754048, "step": 2675 }, { "epoch": 1.580188679245283, "grad_norm": 2.200222969055176, "learning_rate": 7.897995283018869e-06, "loss": 0.5056, "num_input_tokens_seen": 1757920, "step": 2680 }, { "epoch": 1.5831367924528301, "grad_norm": 2.7570862770080566, "learning_rate": 7.912735849056604e-06, "loss": 0.5528, "num_input_tokens_seen": 1761344, "step": 2685 }, { "epoch": 1.5860849056603774, "grad_norm": 2.2776217460632324, "learning_rate": 7.927476415094341e-06, "loss": 0.5093, "num_input_tokens_seen": 1764992, "step": 2690 }, { "epoch": 1.5890330188679245, "grad_norm": 2.871467351913452, "learning_rate": 7.942216981132077e-06, "loss": 0.4772, "num_input_tokens_seen": 1768160, "step": 2695 }, { "epoch": 1.5919811320754715, "grad_norm": 1.8279430866241455, "learning_rate": 7.956957547169812e-06, "loss": 0.5784, "num_input_tokens_seen": 1771264, "step": 2700 }, { "epoch": 1.5949292452830188, "grad_norm": 1.9546269178390503, "learning_rate": 7.971698113207547e-06, "loss": 0.5148, "num_input_tokens_seen": 1773760, "step": 2705 }, { "epoch": 1.5978773584905661, "grad_norm": 7.57034158706665, "learning_rate": 7.986438679245284e-06, "loss": 0.4268, "num_input_tokens_seen": 1777248, "step": 2710 }, { "epoch": 1.6008254716981132, "grad_norm": 3.619108200073242, "learning_rate": 8.00117924528302e-06, "loss": 0.5673, "num_input_tokens_seen": 1779872, "step": 2715 }, { "epoch": 1.6037735849056602, "grad_norm": 2.7996795177459717, "learning_rate": 8.015919811320757e-06, "loss": 0.4967, "num_input_tokens_seen": 1782720, "step": 2720 }, { "epoch": 1.6067216981132075, "grad_norm": 3.921999216079712, "learning_rate": 8.030660377358492e-06, "loss": 0.5563, "num_input_tokens_seen": 1785632, "step": 2725 }, { "epoch": 1.6096698113207548, "grad_norm": 1.4991048574447632, "learning_rate": 8.045400943396227e-06, "loss": 0.501, "num_input_tokens_seen": 1788864, "step": 2730 }, { "epoch": 1.6126179245283019, "grad_norm": 3.3404085636138916, "learning_rate": 8.060141509433963e-06, "loss": 0.463, "num_input_tokens_seen": 1791136, "step": 2735 }, { "epoch": 1.615566037735849, "grad_norm": 2.3893890380859375, "learning_rate": 8.074882075471698e-06, "loss": 0.5265, "num_input_tokens_seen": 1794720, "step": 2740 }, { "epoch": 1.6185141509433962, "grad_norm": 1.633077621459961, "learning_rate": 8.089622641509435e-06, "loss": 0.4236, "num_input_tokens_seen": 1797824, "step": 2745 }, { "epoch": 1.6214622641509435, "grad_norm": 1.8737014532089233, "learning_rate": 8.10436320754717e-06, "loss": 0.5063, "num_input_tokens_seen": 1801184, "step": 2750 }, { "epoch": 1.6244103773584906, "grad_norm": 1.7135396003723145, "learning_rate": 8.119103773584906e-06, "loss": 0.4877, "num_input_tokens_seen": 1804960, "step": 2755 }, { "epoch": 1.6273584905660377, "grad_norm": 1.846476435661316, "learning_rate": 8.133844339622641e-06, "loss": 0.4982, "num_input_tokens_seen": 1809024, "step": 2760 }, { "epoch": 1.630306603773585, "grad_norm": 2.5734829902648926, "learning_rate": 8.148584905660378e-06, "loss": 0.4752, "num_input_tokens_seen": 1815296, "step": 2765 }, { "epoch": 1.6332547169811322, "grad_norm": 2.227144956588745, "learning_rate": 8.163325471698114e-06, "loss": 0.4258, "num_input_tokens_seen": 1819680, "step": 2770 }, { "epoch": 1.6362028301886793, "grad_norm": 3.5401036739349365, "learning_rate": 8.17806603773585e-06, "loss": 0.6924, "num_input_tokens_seen": 1824384, "step": 2775 }, { "epoch": 1.6391509433962264, "grad_norm": 2.811753273010254, "learning_rate": 8.192806603773586e-06, "loss": 0.4186, "num_input_tokens_seen": 1828032, "step": 2780 }, { "epoch": 1.6420990566037736, "grad_norm": 2.1543385982513428, "learning_rate": 8.207547169811321e-06, "loss": 0.614, "num_input_tokens_seen": 1831104, "step": 2785 }, { "epoch": 1.6450471698113207, "grad_norm": 1.954701542854309, "learning_rate": 8.222287735849057e-06, "loss": 0.4369, "num_input_tokens_seen": 1834016, "step": 2790 }, { "epoch": 1.6479952830188678, "grad_norm": 1.7077847719192505, "learning_rate": 8.237028301886794e-06, "loss": 0.5592, "num_input_tokens_seen": 1837696, "step": 2795 }, { "epoch": 1.650943396226415, "grad_norm": 2.0054714679718018, "learning_rate": 8.251768867924529e-06, "loss": 0.5354, "num_input_tokens_seen": 1840640, "step": 2800 }, { "epoch": 1.6538915094339623, "grad_norm": 2.5063631534576416, "learning_rate": 8.266509433962266e-06, "loss": 0.4845, "num_input_tokens_seen": 1843488, "step": 2805 }, { "epoch": 1.6568396226415094, "grad_norm": 2.524466037750244, "learning_rate": 8.281250000000001e-06, "loss": 0.4428, "num_input_tokens_seen": 1846432, "step": 2810 }, { "epoch": 1.6597877358490565, "grad_norm": 2.179178476333618, "learning_rate": 8.295990566037737e-06, "loss": 0.6294, "num_input_tokens_seen": 1849280, "step": 2815 }, { "epoch": 1.6627358490566038, "grad_norm": 3.2956972122192383, "learning_rate": 8.310731132075472e-06, "loss": 0.5306, "num_input_tokens_seen": 1852160, "step": 2820 }, { "epoch": 1.665683962264151, "grad_norm": 1.7134501934051514, "learning_rate": 8.325471698113207e-06, "loss": 0.6061, "num_input_tokens_seen": 1855328, "step": 2825 }, { "epoch": 1.6686320754716981, "grad_norm": 1.9560868740081787, "learning_rate": 8.340212264150944e-06, "loss": 0.4788, "num_input_tokens_seen": 1858784, "step": 2830 }, { "epoch": 1.6715801886792452, "grad_norm": 1.5478752851486206, "learning_rate": 8.35495283018868e-06, "loss": 0.5168, "num_input_tokens_seen": 1861696, "step": 2835 }, { "epoch": 1.6745283018867925, "grad_norm": 5.275365829467773, "learning_rate": 8.369693396226415e-06, "loss": 0.5878, "num_input_tokens_seen": 1864416, "step": 2840 }, { "epoch": 1.6774764150943398, "grad_norm": 2.1445727348327637, "learning_rate": 8.38443396226415e-06, "loss": 0.6567, "num_input_tokens_seen": 1867456, "step": 2845 }, { "epoch": 1.6804245283018868, "grad_norm": 2.275371551513672, "learning_rate": 8.399174528301888e-06, "loss": 0.5824, "num_input_tokens_seen": 1870496, "step": 2850 }, { "epoch": 1.6833726415094339, "grad_norm": 9.323149681091309, "learning_rate": 8.413915094339623e-06, "loss": 0.4872, "num_input_tokens_seen": 1873152, "step": 2855 }, { "epoch": 1.6863207547169812, "grad_norm": 2.843740224838257, "learning_rate": 8.42865566037736e-06, "loss": 0.48, "num_input_tokens_seen": 1876736, "step": 2860 }, { "epoch": 1.6892688679245285, "grad_norm": 1.4440834522247314, "learning_rate": 8.443396226415095e-06, "loss": 0.631, "num_input_tokens_seen": 1879840, "step": 2865 }, { "epoch": 1.6922169811320755, "grad_norm": 6.7249369621276855, "learning_rate": 8.45813679245283e-06, "loss": 0.5754, "num_input_tokens_seen": 1882720, "step": 2870 }, { "epoch": 1.6951650943396226, "grad_norm": 7.0283522605896, "learning_rate": 8.472877358490566e-06, "loss": 0.4906, "num_input_tokens_seen": 1886080, "step": 2875 }, { "epoch": 1.6981132075471699, "grad_norm": 2.0130412578582764, "learning_rate": 8.487617924528303e-06, "loss": 0.6151, "num_input_tokens_seen": 1890112, "step": 2880 }, { "epoch": 1.701061320754717, "grad_norm": 2.456204891204834, "learning_rate": 8.502358490566038e-06, "loss": 0.6087, "num_input_tokens_seen": 1892672, "step": 2885 }, { "epoch": 1.704009433962264, "grad_norm": 1.53376305103302, "learning_rate": 8.517099056603775e-06, "loss": 0.4272, "num_input_tokens_seen": 1895456, "step": 2890 }, { "epoch": 1.7069575471698113, "grad_norm": 5.201976299285889, "learning_rate": 8.53183962264151e-06, "loss": 0.5427, "num_input_tokens_seen": 1898176, "step": 2895 }, { "epoch": 1.7099056603773586, "grad_norm": 2.7541418075561523, "learning_rate": 8.546580188679246e-06, "loss": 0.5708, "num_input_tokens_seen": 1900960, "step": 2900 }, { "epoch": 1.7128537735849056, "grad_norm": 4.639934539794922, "learning_rate": 8.561320754716981e-06, "loss": 0.5078, "num_input_tokens_seen": 1904448, "step": 2905 }, { "epoch": 1.7158018867924527, "grad_norm": 2.0125160217285156, "learning_rate": 8.576061320754717e-06, "loss": 0.4228, "num_input_tokens_seen": 1907328, "step": 2910 }, { "epoch": 1.71875, "grad_norm": 2.5006237030029297, "learning_rate": 8.590801886792454e-06, "loss": 0.4491, "num_input_tokens_seen": 1910240, "step": 2915 }, { "epoch": 1.7216981132075473, "grad_norm": 1.5151746273040771, "learning_rate": 8.605542452830189e-06, "loss": 0.5179, "num_input_tokens_seen": 1913696, "step": 2920 }, { "epoch": 1.7246462264150944, "grad_norm": 3.6472792625427246, "learning_rate": 8.620283018867926e-06, "loss": 0.5002, "num_input_tokens_seen": 1917632, "step": 2925 }, { "epoch": 1.7275943396226414, "grad_norm": 5.584592819213867, "learning_rate": 8.635023584905662e-06, "loss": 0.3783, "num_input_tokens_seen": 1920768, "step": 2930 }, { "epoch": 1.7305424528301887, "grad_norm": 3.0399558544158936, "learning_rate": 8.649764150943397e-06, "loss": 0.8715, "num_input_tokens_seen": 1923648, "step": 2935 }, { "epoch": 1.733490566037736, "grad_norm": 4.560445308685303, "learning_rate": 8.664504716981132e-06, "loss": 0.6, "num_input_tokens_seen": 1926752, "step": 2940 }, { "epoch": 1.736438679245283, "grad_norm": 4.461677551269531, "learning_rate": 8.67924528301887e-06, "loss": 0.571, "num_input_tokens_seen": 1929184, "step": 2945 }, { "epoch": 1.7393867924528301, "grad_norm": 2.237555503845215, "learning_rate": 8.693985849056605e-06, "loss": 0.4753, "num_input_tokens_seen": 1933888, "step": 2950 }, { "epoch": 1.7423349056603774, "grad_norm": 4.473301887512207, "learning_rate": 8.70872641509434e-06, "loss": 0.6886, "num_input_tokens_seen": 1936864, "step": 2955 }, { "epoch": 1.7452830188679245, "grad_norm": 1.9088764190673828, "learning_rate": 8.723466981132075e-06, "loss": 0.4881, "num_input_tokens_seen": 1940288, "step": 2960 }, { "epoch": 1.7482311320754715, "grad_norm": 1.7761468887329102, "learning_rate": 8.738207547169812e-06, "loss": 0.3978, "num_input_tokens_seen": 1943104, "step": 2965 }, { "epoch": 1.7511792452830188, "grad_norm": 1.9424920082092285, "learning_rate": 8.752948113207548e-06, "loss": 0.361, "num_input_tokens_seen": 1946848, "step": 2970 }, { "epoch": 1.7541273584905661, "grad_norm": 1.890637993812561, "learning_rate": 8.767688679245285e-06, "loss": 0.4214, "num_input_tokens_seen": 1949056, "step": 2975 }, { "epoch": 1.7570754716981132, "grad_norm": 6.959601879119873, "learning_rate": 8.78242924528302e-06, "loss": 0.6241, "num_input_tokens_seen": 1951904, "step": 2980 }, { "epoch": 1.7600235849056602, "grad_norm": 1.3371315002441406, "learning_rate": 8.797169811320755e-06, "loss": 0.4553, "num_input_tokens_seen": 1956000, "step": 2985 }, { "epoch": 1.7629716981132075, "grad_norm": 1.5840321779251099, "learning_rate": 8.81191037735849e-06, "loss": 0.5739, "num_input_tokens_seen": 1958592, "step": 2990 }, { "epoch": 1.7659198113207548, "grad_norm": 1.4681565761566162, "learning_rate": 8.826650943396226e-06, "loss": 0.6707, "num_input_tokens_seen": 1962304, "step": 2995 }, { "epoch": 1.7688679245283019, "grad_norm": 1.6061550378799438, "learning_rate": 8.841391509433963e-06, "loss": 0.578, "num_input_tokens_seen": 1965568, "step": 3000 }, { "epoch": 1.771816037735849, "grad_norm": 1.5175234079360962, "learning_rate": 8.856132075471698e-06, "loss": 0.6034, "num_input_tokens_seen": 1969600, "step": 3005 }, { "epoch": 1.7747641509433962, "grad_norm": 1.9135304689407349, "learning_rate": 8.870872641509435e-06, "loss": 0.5001, "num_input_tokens_seen": 1972192, "step": 3010 }, { "epoch": 1.7777122641509435, "grad_norm": 1.9181616306304932, "learning_rate": 8.88561320754717e-06, "loss": 0.4339, "num_input_tokens_seen": 1975744, "step": 3015 }, { "epoch": 1.7806603773584906, "grad_norm": 1.5820478200912476, "learning_rate": 8.900353773584906e-06, "loss": 0.6109, "num_input_tokens_seen": 1979968, "step": 3020 }, { "epoch": 1.7836084905660377, "grad_norm": 1.6426547765731812, "learning_rate": 8.915094339622642e-06, "loss": 0.4785, "num_input_tokens_seen": 1984032, "step": 3025 }, { "epoch": 1.786556603773585, "grad_norm": 2.2906746864318848, "learning_rate": 8.929834905660379e-06, "loss": 0.3181, "num_input_tokens_seen": 1987904, "step": 3030 }, { "epoch": 1.7895047169811322, "grad_norm": 1.6604082584381104, "learning_rate": 8.944575471698114e-06, "loss": 0.4585, "num_input_tokens_seen": 1991008, "step": 3035 }, { "epoch": 1.7924528301886793, "grad_norm": 1.8780136108398438, "learning_rate": 8.959316037735851e-06, "loss": 0.5018, "num_input_tokens_seen": 1995520, "step": 3040 }, { "epoch": 1.7954009433962264, "grad_norm": 1.599363923072815, "learning_rate": 8.974056603773586e-06, "loss": 0.6045, "num_input_tokens_seen": 1998560, "step": 3045 }, { "epoch": 1.7983490566037736, "grad_norm": 1.8744579553604126, "learning_rate": 8.988797169811322e-06, "loss": 0.4225, "num_input_tokens_seen": 2002592, "step": 3050 }, { "epoch": 1.8012971698113207, "grad_norm": 2.9709784984588623, "learning_rate": 9.003537735849057e-06, "loss": 0.4773, "num_input_tokens_seen": 2005408, "step": 3055 }, { "epoch": 1.8042452830188678, "grad_norm": 1.6364822387695312, "learning_rate": 9.018278301886794e-06, "loss": 0.4632, "num_input_tokens_seen": 2008512, "step": 3060 }, { "epoch": 1.807193396226415, "grad_norm": 1.7016874551773071, "learning_rate": 9.03301886792453e-06, "loss": 0.5515, "num_input_tokens_seen": 2011904, "step": 3065 }, { "epoch": 1.8101415094339623, "grad_norm": 2.5323383808135986, "learning_rate": 9.047759433962265e-06, "loss": 0.5158, "num_input_tokens_seen": 2014592, "step": 3070 }, { "epoch": 1.8130896226415094, "grad_norm": 2.6336135864257812, "learning_rate": 9.0625e-06, "loss": 0.4885, "num_input_tokens_seen": 2017632, "step": 3075 }, { "epoch": 1.8160377358490565, "grad_norm": 2.089392900466919, "learning_rate": 9.077240566037735e-06, "loss": 0.3941, "num_input_tokens_seen": 2021376, "step": 3080 }, { "epoch": 1.8189858490566038, "grad_norm": 2.859149217605591, "learning_rate": 9.091981132075472e-06, "loss": 0.5868, "num_input_tokens_seen": 2024320, "step": 3085 }, { "epoch": 1.821933962264151, "grad_norm": 1.745100498199463, "learning_rate": 9.106721698113208e-06, "loss": 0.6129, "num_input_tokens_seen": 2027936, "step": 3090 }, { "epoch": 1.8248820754716981, "grad_norm": 2.116115093231201, "learning_rate": 9.121462264150945e-06, "loss": 0.4957, "num_input_tokens_seen": 2035744, "step": 3095 }, { "epoch": 1.8278301886792452, "grad_norm": 2.132314682006836, "learning_rate": 9.13620283018868e-06, "loss": 0.4346, "num_input_tokens_seen": 2038560, "step": 3100 }, { "epoch": 1.8307783018867925, "grad_norm": 2.0081193447113037, "learning_rate": 9.150943396226416e-06, "loss": 0.5564, "num_input_tokens_seen": 2041344, "step": 3105 }, { "epoch": 1.8337264150943398, "grad_norm": 1.266995906829834, "learning_rate": 9.165683962264151e-06, "loss": 0.4318, "num_input_tokens_seen": 2044480, "step": 3110 }, { "epoch": 1.8366745283018868, "grad_norm": 3.8343594074249268, "learning_rate": 9.180424528301888e-06, "loss": 0.7092, "num_input_tokens_seen": 2047488, "step": 3115 }, { "epoch": 1.8396226415094339, "grad_norm": 2.9083545207977295, "learning_rate": 9.195165094339623e-06, "loss": 0.5216, "num_input_tokens_seen": 2051104, "step": 3120 }, { "epoch": 1.8425707547169812, "grad_norm": 2.225029945373535, "learning_rate": 9.20990566037736e-06, "loss": 0.7074, "num_input_tokens_seen": 2054144, "step": 3125 }, { "epoch": 1.8455188679245285, "grad_norm": 1.5978819131851196, "learning_rate": 9.224646226415096e-06, "loss": 0.6309, "num_input_tokens_seen": 2057280, "step": 3130 }, { "epoch": 1.8484669811320755, "grad_norm": 2.462926149368286, "learning_rate": 9.239386792452831e-06, "loss": 0.4646, "num_input_tokens_seen": 2059584, "step": 3135 }, { "epoch": 1.8514150943396226, "grad_norm": 1.5843557119369507, "learning_rate": 9.254127358490566e-06, "loss": 0.5217, "num_input_tokens_seen": 2062432, "step": 3140 }, { "epoch": 1.8543632075471699, "grad_norm": 3.339601516723633, "learning_rate": 9.268867924528303e-06, "loss": 0.5981, "num_input_tokens_seen": 2066080, "step": 3145 }, { "epoch": 1.857311320754717, "grad_norm": 4.309798717498779, "learning_rate": 9.283608490566039e-06, "loss": 0.6047, "num_input_tokens_seen": 2069312, "step": 3150 }, { "epoch": 1.860259433962264, "grad_norm": 2.889320135116577, "learning_rate": 9.298349056603774e-06, "loss": 0.5519, "num_input_tokens_seen": 2072096, "step": 3155 }, { "epoch": 1.8632075471698113, "grad_norm": 2.3506481647491455, "learning_rate": 9.31308962264151e-06, "loss": 0.5016, "num_input_tokens_seen": 2075808, "step": 3160 }, { "epoch": 1.8661556603773586, "grad_norm": 5.7969889640808105, "learning_rate": 9.327830188679245e-06, "loss": 0.4151, "num_input_tokens_seen": 2078880, "step": 3165 }, { "epoch": 1.8691037735849056, "grad_norm": 2.598409414291382, "learning_rate": 9.342570754716982e-06, "loss": 0.4899, "num_input_tokens_seen": 2082432, "step": 3170 }, { "epoch": 1.8720518867924527, "grad_norm": 1.1234333515167236, "learning_rate": 9.357311320754717e-06, "loss": 0.4333, "num_input_tokens_seen": 2085632, "step": 3175 }, { "epoch": 1.875, "grad_norm": 1.4729344844818115, "learning_rate": 9.372051886792454e-06, "loss": 0.3981, "num_input_tokens_seen": 2088832, "step": 3180 }, { "epoch": 1.8779481132075473, "grad_norm": 4.173734188079834, "learning_rate": 9.38679245283019e-06, "loss": 0.4762, "num_input_tokens_seen": 2091968, "step": 3185 }, { "epoch": 1.8808962264150944, "grad_norm": 1.7485204935073853, "learning_rate": 9.401533018867925e-06, "loss": 0.4879, "num_input_tokens_seen": 2095200, "step": 3190 }, { "epoch": 1.8838443396226414, "grad_norm": 1.9946104288101196, "learning_rate": 9.41627358490566e-06, "loss": 0.4569, "num_input_tokens_seen": 2098432, "step": 3195 }, { "epoch": 1.8867924528301887, "grad_norm": 1.1985485553741455, "learning_rate": 9.431014150943397e-06, "loss": 0.4064, "num_input_tokens_seen": 2101408, "step": 3200 }, { "epoch": 1.889740566037736, "grad_norm": 2.045720338821411, "learning_rate": 9.445754716981133e-06, "loss": 0.4511, "num_input_tokens_seen": 2105568, "step": 3205 }, { "epoch": 1.892688679245283, "grad_norm": 1.5403004884719849, "learning_rate": 9.46049528301887e-06, "loss": 0.3954, "num_input_tokens_seen": 2109376, "step": 3210 }, { "epoch": 1.8956367924528301, "grad_norm": 1.8964297771453857, "learning_rate": 9.475235849056605e-06, "loss": 0.5335, "num_input_tokens_seen": 2112864, "step": 3215 }, { "epoch": 1.8985849056603774, "grad_norm": 2.230834722518921, "learning_rate": 9.48997641509434e-06, "loss": 0.5059, "num_input_tokens_seen": 2115712, "step": 3220 }, { "epoch": 1.9015330188679245, "grad_norm": 2.6552414894104004, "learning_rate": 9.504716981132076e-06, "loss": 0.5664, "num_input_tokens_seen": 2119360, "step": 3225 }, { "epoch": 1.9044811320754715, "grad_norm": 3.7138631343841553, "learning_rate": 9.519457547169813e-06, "loss": 0.4637, "num_input_tokens_seen": 2122144, "step": 3230 }, { "epoch": 1.9074292452830188, "grad_norm": 4.601591110229492, "learning_rate": 9.534198113207548e-06, "loss": 0.5663, "num_input_tokens_seen": 2125376, "step": 3235 }, { "epoch": 1.9103773584905661, "grad_norm": 2.3130199909210205, "learning_rate": 9.548938679245285e-06, "loss": 0.5443, "num_input_tokens_seen": 2129568, "step": 3240 }, { "epoch": 1.9133254716981132, "grad_norm": 1.836625099182129, "learning_rate": 9.56367924528302e-06, "loss": 0.399, "num_input_tokens_seen": 2132416, "step": 3245 }, { "epoch": 1.9162735849056602, "grad_norm": 2.152608871459961, "learning_rate": 9.578419811320756e-06, "loss": 0.4479, "num_input_tokens_seen": 2134976, "step": 3250 }, { "epoch": 1.9192216981132075, "grad_norm": 1.9451475143432617, "learning_rate": 9.593160377358491e-06, "loss": 0.5688, "num_input_tokens_seen": 2138016, "step": 3255 }, { "epoch": 1.9221698113207548, "grad_norm": 1.452512502670288, "learning_rate": 9.607900943396226e-06, "loss": 0.4379, "num_input_tokens_seen": 2140640, "step": 3260 }, { "epoch": 1.9251179245283019, "grad_norm": 2.0216588973999023, "learning_rate": 9.622641509433963e-06, "loss": 0.3492, "num_input_tokens_seen": 2143872, "step": 3265 }, { "epoch": 1.928066037735849, "grad_norm": 1.5202784538269043, "learning_rate": 9.637382075471699e-06, "loss": 0.5455, "num_input_tokens_seen": 2146816, "step": 3270 }, { "epoch": 1.9310141509433962, "grad_norm": 1.656038761138916, "learning_rate": 9.652122641509434e-06, "loss": 0.3682, "num_input_tokens_seen": 2149984, "step": 3275 }, { "epoch": 1.9339622641509435, "grad_norm": 2.969346046447754, "learning_rate": 9.66686320754717e-06, "loss": 0.6113, "num_input_tokens_seen": 2152864, "step": 3280 }, { "epoch": 1.9369103773584906, "grad_norm": 2.0021791458129883, "learning_rate": 9.681603773584907e-06, "loss": 0.5135, "num_input_tokens_seen": 2156192, "step": 3285 }, { "epoch": 1.9398584905660377, "grad_norm": 5.803738594055176, "learning_rate": 9.696344339622642e-06, "loss": 0.3924, "num_input_tokens_seen": 2159968, "step": 3290 }, { "epoch": 1.942806603773585, "grad_norm": 1.8151659965515137, "learning_rate": 9.711084905660379e-06, "loss": 0.5655, "num_input_tokens_seen": 2163136, "step": 3295 }, { "epoch": 1.9457547169811322, "grad_norm": 1.6342276334762573, "learning_rate": 9.725825471698114e-06, "loss": 0.513, "num_input_tokens_seen": 2166304, "step": 3300 }, { "epoch": 1.9487028301886793, "grad_norm": 1.9232072830200195, "learning_rate": 9.74056603773585e-06, "loss": 0.5464, "num_input_tokens_seen": 2169728, "step": 3305 }, { "epoch": 1.9516509433962264, "grad_norm": 3.9776339530944824, "learning_rate": 9.755306603773585e-06, "loss": 0.6543, "num_input_tokens_seen": 2172960, "step": 3310 }, { "epoch": 1.9545990566037736, "grad_norm": 1.8554483652114868, "learning_rate": 9.770047169811322e-06, "loss": 0.4088, "num_input_tokens_seen": 2175616, "step": 3315 }, { "epoch": 1.9575471698113207, "grad_norm": 2.4779863357543945, "learning_rate": 9.784787735849057e-06, "loss": 0.4068, "num_input_tokens_seen": 2180288, "step": 3320 }, { "epoch": 1.9604952830188678, "grad_norm": 1.2540879249572754, "learning_rate": 9.799528301886794e-06, "loss": 0.3793, "num_input_tokens_seen": 2183296, "step": 3325 }, { "epoch": 1.963443396226415, "grad_norm": 1.6181501150131226, "learning_rate": 9.81426886792453e-06, "loss": 0.5146, "num_input_tokens_seen": 2186752, "step": 3330 }, { "epoch": 1.9663915094339623, "grad_norm": 3.1527295112609863, "learning_rate": 9.829009433962265e-06, "loss": 0.3888, "num_input_tokens_seen": 2190944, "step": 3335 }, { "epoch": 1.9693396226415094, "grad_norm": 5.1117119789123535, "learning_rate": 9.84375e-06, "loss": 0.4057, "num_input_tokens_seen": 2194080, "step": 3340 }, { "epoch": 1.9722877358490565, "grad_norm": 1.1307283639907837, "learning_rate": 9.858490566037736e-06, "loss": 0.4615, "num_input_tokens_seen": 2197760, "step": 3345 }, { "epoch": 1.9752358490566038, "grad_norm": 4.283483505249023, "learning_rate": 9.873231132075473e-06, "loss": 0.57, "num_input_tokens_seen": 2201024, "step": 3350 }, { "epoch": 1.978183962264151, "grad_norm": 1.9049365520477295, "learning_rate": 9.887971698113208e-06, "loss": 0.6146, "num_input_tokens_seen": 2204000, "step": 3355 }, { "epoch": 1.9811320754716981, "grad_norm": 0.9755533337593079, "learning_rate": 9.902712264150945e-06, "loss": 0.4192, "num_input_tokens_seen": 2207008, "step": 3360 }, { "epoch": 1.9840801886792452, "grad_norm": 2.1306710243225098, "learning_rate": 9.917452830188679e-06, "loss": 0.5144, "num_input_tokens_seen": 2210048, "step": 3365 }, { "epoch": 1.9870283018867925, "grad_norm": 3.8325035572052, "learning_rate": 9.932193396226416e-06, "loss": 0.4688, "num_input_tokens_seen": 2212256, "step": 3370 }, { "epoch": 1.9899764150943398, "grad_norm": 2.5169591903686523, "learning_rate": 9.946933962264151e-06, "loss": 0.4716, "num_input_tokens_seen": 2214976, "step": 3375 }, { "epoch": 1.9929245283018868, "grad_norm": 1.1625642776489258, "learning_rate": 9.961674528301888e-06, "loss": 0.4401, "num_input_tokens_seen": 2218304, "step": 3380 }, { "epoch": 1.9958726415094339, "grad_norm": 3.0977630615234375, "learning_rate": 9.976415094339624e-06, "loss": 0.6739, "num_input_tokens_seen": 2220768, "step": 3385 }, { "epoch": 1.9988207547169812, "grad_norm": 4.551213264465332, "learning_rate": 9.991155660377359e-06, "loss": 0.5307, "num_input_tokens_seen": 2224032, "step": 3390 }, { "epoch": 2.0, "eval_loss": 0.5229928493499756, "eval_runtime": 18.5141, "eval_samples_per_second": 91.606, "eval_steps_per_second": 22.901, "num_input_tokens_seen": 2224576, "step": 3392 }, { "epoch": 2.0017688679245285, "grad_norm": 3.4274628162384033, "learning_rate": 9.999999894098275e-06, "loss": 0.5148, "num_input_tokens_seen": 2226016, "step": 3395 }, { "epoch": 2.0047169811320753, "grad_norm": 1.3424423933029175, "learning_rate": 9.99999870270391e-06, "loss": 0.4848, "num_input_tokens_seen": 2230432, "step": 3400 }, { "epoch": 2.0076650943396226, "grad_norm": 2.1738641262054443, "learning_rate": 9.999996187538341e-06, "loss": 0.3188, "num_input_tokens_seen": 2234368, "step": 3405 }, { "epoch": 2.01061320754717, "grad_norm": 2.4024107456207275, "learning_rate": 9.999992348602233e-06, "loss": 0.4725, "num_input_tokens_seen": 2237600, "step": 3410 }, { "epoch": 2.013561320754717, "grad_norm": 1.9780548810958862, "learning_rate": 9.999987185896598e-06, "loss": 0.4642, "num_input_tokens_seen": 2241696, "step": 3415 }, { "epoch": 2.016509433962264, "grad_norm": 2.547482490539551, "learning_rate": 9.99998069942281e-06, "loss": 0.5733, "num_input_tokens_seen": 2244640, "step": 3420 }, { "epoch": 2.0194575471698113, "grad_norm": 4.573434829711914, "learning_rate": 9.999972889182583e-06, "loss": 0.6758, "num_input_tokens_seen": 2248096, "step": 3425 }, { "epoch": 2.0224056603773586, "grad_norm": 3.4897468090057373, "learning_rate": 9.999963755177984e-06, "loss": 0.5173, "num_input_tokens_seen": 2251584, "step": 3430 }, { "epoch": 2.025353773584906, "grad_norm": 2.9240996837615967, "learning_rate": 9.999953297411434e-06, "loss": 0.4628, "num_input_tokens_seen": 2254592, "step": 3435 }, { "epoch": 2.0283018867924527, "grad_norm": 3.145596981048584, "learning_rate": 9.999941515885699e-06, "loss": 0.4743, "num_input_tokens_seen": 2257824, "step": 3440 }, { "epoch": 2.03125, "grad_norm": 2.0552804470062256, "learning_rate": 9.999928410603897e-06, "loss": 0.4709, "num_input_tokens_seen": 2260384, "step": 3445 }, { "epoch": 2.0341981132075473, "grad_norm": 4.6283488273620605, "learning_rate": 9.999913981569502e-06, "loss": 0.6251, "num_input_tokens_seen": 2263200, "step": 3450 }, { "epoch": 2.037146226415094, "grad_norm": 1.9133756160736084, "learning_rate": 9.999898228786332e-06, "loss": 0.4169, "num_input_tokens_seen": 2266048, "step": 3455 }, { "epoch": 2.0400943396226414, "grad_norm": 2.6543383598327637, "learning_rate": 9.999881152258557e-06, "loss": 0.4654, "num_input_tokens_seen": 2268960, "step": 3460 }, { "epoch": 2.0430424528301887, "grad_norm": 1.723082184791565, "learning_rate": 9.999862751990697e-06, "loss": 0.5, "num_input_tokens_seen": 2271840, "step": 3465 }, { "epoch": 2.045990566037736, "grad_norm": 1.753151297569275, "learning_rate": 9.999843027987628e-06, "loss": 0.4522, "num_input_tokens_seen": 2275200, "step": 3470 }, { "epoch": 2.048938679245283, "grad_norm": 1.5743434429168701, "learning_rate": 9.999821980254567e-06, "loss": 0.5106, "num_input_tokens_seen": 2279328, "step": 3475 }, { "epoch": 2.05188679245283, "grad_norm": 1.5645747184753418, "learning_rate": 9.99979960879709e-06, "loss": 0.4368, "num_input_tokens_seen": 2281952, "step": 3480 }, { "epoch": 2.0548349056603774, "grad_norm": 1.4349006414413452, "learning_rate": 9.99977591362112e-06, "loss": 0.3825, "num_input_tokens_seen": 2285824, "step": 3485 }, { "epoch": 2.0577830188679247, "grad_norm": 4.52356481552124, "learning_rate": 9.999750894732927e-06, "loss": 0.5268, "num_input_tokens_seen": 2289184, "step": 3490 }, { "epoch": 2.0607311320754715, "grad_norm": 2.1512105464935303, "learning_rate": 9.999724552139136e-06, "loss": 0.4594, "num_input_tokens_seen": 2292128, "step": 3495 }, { "epoch": 2.063679245283019, "grad_norm": 2.38511323928833, "learning_rate": 9.999696885846724e-06, "loss": 0.4469, "num_input_tokens_seen": 2295360, "step": 3500 }, { "epoch": 2.066627358490566, "grad_norm": 2.903155565261841, "learning_rate": 9.999667895863012e-06, "loss": 0.5415, "num_input_tokens_seen": 2298272, "step": 3505 }, { "epoch": 2.0695754716981134, "grad_norm": 2.4086902141571045, "learning_rate": 9.99963758219568e-06, "loss": 0.7515, "num_input_tokens_seen": 2301088, "step": 3510 }, { "epoch": 2.0725235849056602, "grad_norm": 2.6394271850585938, "learning_rate": 9.999605944852749e-06, "loss": 0.483, "num_input_tokens_seen": 2304384, "step": 3515 }, { "epoch": 2.0754716981132075, "grad_norm": 3.9606471061706543, "learning_rate": 9.999572983842599e-06, "loss": 0.4892, "num_input_tokens_seen": 2307616, "step": 3520 }, { "epoch": 2.078419811320755, "grad_norm": 1.6987196207046509, "learning_rate": 9.999538699173951e-06, "loss": 0.5054, "num_input_tokens_seen": 2311136, "step": 3525 }, { "epoch": 2.081367924528302, "grad_norm": 3.1527609825134277, "learning_rate": 9.99950309085589e-06, "loss": 0.5054, "num_input_tokens_seen": 2314208, "step": 3530 }, { "epoch": 2.084316037735849, "grad_norm": 1.383882761001587, "learning_rate": 9.999466158897835e-06, "loss": 0.5016, "num_input_tokens_seen": 2317472, "step": 3535 }, { "epoch": 2.0872641509433962, "grad_norm": 3.591376781463623, "learning_rate": 9.999427903309569e-06, "loss": 0.4769, "num_input_tokens_seen": 2320320, "step": 3540 }, { "epoch": 2.0902122641509435, "grad_norm": 4.1779680252075195, "learning_rate": 9.99938832410122e-06, "loss": 0.5156, "num_input_tokens_seen": 2323552, "step": 3545 }, { "epoch": 2.0931603773584904, "grad_norm": 3.4292712211608887, "learning_rate": 9.999347421283267e-06, "loss": 0.4255, "num_input_tokens_seen": 2327360, "step": 3550 }, { "epoch": 2.0961084905660377, "grad_norm": 5.304132461547852, "learning_rate": 9.999305194866538e-06, "loss": 0.4308, "num_input_tokens_seen": 2330336, "step": 3555 }, { "epoch": 2.099056603773585, "grad_norm": 1.3875356912612915, "learning_rate": 9.99926164486221e-06, "loss": 0.5474, "num_input_tokens_seen": 2334688, "step": 3560 }, { "epoch": 2.1020047169811322, "grad_norm": 2.6074681282043457, "learning_rate": 9.99921677128182e-06, "loss": 0.4579, "num_input_tokens_seen": 2338528, "step": 3565 }, { "epoch": 2.104952830188679, "grad_norm": 1.7006899118423462, "learning_rate": 9.99917057413724e-06, "loss": 0.5921, "num_input_tokens_seen": 2342432, "step": 3570 }, { "epoch": 2.1079009433962264, "grad_norm": 1.5385116338729858, "learning_rate": 9.99912305344071e-06, "loss": 0.38, "num_input_tokens_seen": 2345600, "step": 3575 }, { "epoch": 2.1108490566037736, "grad_norm": 1.95740807056427, "learning_rate": 9.999074209204803e-06, "loss": 0.4083, "num_input_tokens_seen": 2348352, "step": 3580 }, { "epoch": 2.113797169811321, "grad_norm": 2.3072099685668945, "learning_rate": 9.999024041442455e-06, "loss": 0.4648, "num_input_tokens_seen": 2352960, "step": 3585 }, { "epoch": 2.1167452830188678, "grad_norm": 1.7021610736846924, "learning_rate": 9.998972550166948e-06, "loss": 0.5222, "num_input_tokens_seen": 2356576, "step": 3590 }, { "epoch": 2.119693396226415, "grad_norm": 2.24062442779541, "learning_rate": 9.998919735391915e-06, "loss": 0.3892, "num_input_tokens_seen": 2359168, "step": 3595 }, { "epoch": 2.1226415094339623, "grad_norm": 1.3790653944015503, "learning_rate": 9.998865597131336e-06, "loss": 0.4876, "num_input_tokens_seen": 2362208, "step": 3600 }, { "epoch": 2.1255896226415096, "grad_norm": 5.552567958831787, "learning_rate": 9.998810135399545e-06, "loss": 0.4852, "num_input_tokens_seen": 2365600, "step": 3605 }, { "epoch": 2.1285377358490565, "grad_norm": 2.0687756538391113, "learning_rate": 9.99875335021123e-06, "loss": 0.506, "num_input_tokens_seen": 2368608, "step": 3610 }, { "epoch": 2.1314858490566038, "grad_norm": 2.2535817623138428, "learning_rate": 9.998695241581423e-06, "loss": 0.4315, "num_input_tokens_seen": 2371136, "step": 3615 }, { "epoch": 2.134433962264151, "grad_norm": 2.108121156692505, "learning_rate": 9.998635809525504e-06, "loss": 0.4412, "num_input_tokens_seen": 2373728, "step": 3620 }, { "epoch": 2.137382075471698, "grad_norm": 2.0042285919189453, "learning_rate": 9.998575054059212e-06, "loss": 0.5155, "num_input_tokens_seen": 2376320, "step": 3625 }, { "epoch": 2.140330188679245, "grad_norm": 3.4304375648498535, "learning_rate": 9.998512975198633e-06, "loss": 0.4381, "num_input_tokens_seen": 2379264, "step": 3630 }, { "epoch": 2.1432783018867925, "grad_norm": 4.519167900085449, "learning_rate": 9.998449572960202e-06, "loss": 0.6305, "num_input_tokens_seen": 2381984, "step": 3635 }, { "epoch": 2.1462264150943398, "grad_norm": 1.31942617893219, "learning_rate": 9.998384847360705e-06, "loss": 0.5849, "num_input_tokens_seen": 2385568, "step": 3640 }, { "epoch": 2.1491745283018866, "grad_norm": 2.228872537612915, "learning_rate": 9.998318798417276e-06, "loss": 0.507, "num_input_tokens_seen": 2389760, "step": 3645 }, { "epoch": 2.152122641509434, "grad_norm": 2.5871310234069824, "learning_rate": 9.998251426147403e-06, "loss": 0.4777, "num_input_tokens_seen": 2392448, "step": 3650 }, { "epoch": 2.155070754716981, "grad_norm": 1.3766682147979736, "learning_rate": 9.998182730568927e-06, "loss": 0.4284, "num_input_tokens_seen": 2395232, "step": 3655 }, { "epoch": 2.1580188679245285, "grad_norm": 1.707181692123413, "learning_rate": 9.998112711700028e-06, "loss": 0.4436, "num_input_tokens_seen": 2397760, "step": 3660 }, { "epoch": 2.1609669811320753, "grad_norm": 2.6673526763916016, "learning_rate": 9.99804136955925e-06, "loss": 0.5474, "num_input_tokens_seen": 2401280, "step": 3665 }, { "epoch": 2.1639150943396226, "grad_norm": 1.8203492164611816, "learning_rate": 9.99796870416548e-06, "loss": 0.4092, "num_input_tokens_seen": 2403904, "step": 3670 }, { "epoch": 2.16686320754717, "grad_norm": 2.0368471145629883, "learning_rate": 9.997894715537953e-06, "loss": 0.3913, "num_input_tokens_seen": 2406976, "step": 3675 }, { "epoch": 2.169811320754717, "grad_norm": 1.7933189868927002, "learning_rate": 9.997819403696263e-06, "loss": 0.5152, "num_input_tokens_seen": 2410272, "step": 3680 }, { "epoch": 2.172759433962264, "grad_norm": 2.912187099456787, "learning_rate": 9.997742768660345e-06, "loss": 0.4064, "num_input_tokens_seen": 2412416, "step": 3685 }, { "epoch": 2.1757075471698113, "grad_norm": 2.646860361099243, "learning_rate": 9.99766481045049e-06, "loss": 0.63, "num_input_tokens_seen": 2415328, "step": 3690 }, { "epoch": 2.1786556603773586, "grad_norm": 4.212270736694336, "learning_rate": 9.997585529087338e-06, "loss": 0.5876, "num_input_tokens_seen": 2418432, "step": 3695 }, { "epoch": 2.1816037735849054, "grad_norm": 2.838362455368042, "learning_rate": 9.997504924591878e-06, "loss": 0.4302, "num_input_tokens_seen": 2420896, "step": 3700 }, { "epoch": 2.1845518867924527, "grad_norm": 2.0587148666381836, "learning_rate": 9.99742299698545e-06, "loss": 0.5209, "num_input_tokens_seen": 2424320, "step": 3705 }, { "epoch": 2.1875, "grad_norm": 1.5177526473999023, "learning_rate": 9.997339746289749e-06, "loss": 0.4466, "num_input_tokens_seen": 2427072, "step": 3710 }, { "epoch": 2.1904481132075473, "grad_norm": 5.8244709968566895, "learning_rate": 9.997255172526812e-06, "loss": 0.5641, "num_input_tokens_seen": 2429408, "step": 3715 }, { "epoch": 2.1933962264150946, "grad_norm": 0.8839884400367737, "learning_rate": 9.99716927571903e-06, "loss": 0.3304, "num_input_tokens_seen": 2432608, "step": 3720 }, { "epoch": 2.1963443396226414, "grad_norm": 3.2401795387268066, "learning_rate": 9.997082055889147e-06, "loss": 0.6328, "num_input_tokens_seen": 2436736, "step": 3725 }, { "epoch": 2.1992924528301887, "grad_norm": 1.7188471555709839, "learning_rate": 9.996993513060252e-06, "loss": 0.5497, "num_input_tokens_seen": 2440800, "step": 3730 }, { "epoch": 2.202240566037736, "grad_norm": 3.3555569648742676, "learning_rate": 9.996903647255789e-06, "loss": 0.5915, "num_input_tokens_seen": 2444224, "step": 3735 }, { "epoch": 2.205188679245283, "grad_norm": 2.1207683086395264, "learning_rate": 9.99681245849955e-06, "loss": 0.413, "num_input_tokens_seen": 2447328, "step": 3740 }, { "epoch": 2.20813679245283, "grad_norm": 2.0389370918273926, "learning_rate": 9.996719946815679e-06, "loss": 0.5299, "num_input_tokens_seen": 2450432, "step": 3745 }, { "epoch": 2.2110849056603774, "grad_norm": 1.301500678062439, "learning_rate": 9.996626112228665e-06, "loss": 0.4386, "num_input_tokens_seen": 2454176, "step": 3750 }, { "epoch": 2.2140330188679247, "grad_norm": 1.6435264348983765, "learning_rate": 9.996530954763355e-06, "loss": 0.5481, "num_input_tokens_seen": 2457312, "step": 3755 }, { "epoch": 2.2169811320754715, "grad_norm": 4.53391170501709, "learning_rate": 9.99643447444494e-06, "loss": 0.495, "num_input_tokens_seen": 2459680, "step": 3760 }, { "epoch": 2.219929245283019, "grad_norm": 1.5793169736862183, "learning_rate": 9.996336671298965e-06, "loss": 0.5397, "num_input_tokens_seen": 2462528, "step": 3765 }, { "epoch": 2.222877358490566, "grad_norm": 3.54730486869812, "learning_rate": 9.996237545351323e-06, "loss": 0.5371, "num_input_tokens_seen": 2465920, "step": 3770 }, { "epoch": 2.2258254716981134, "grad_norm": 2.6879775524139404, "learning_rate": 9.996137096628259e-06, "loss": 0.5453, "num_input_tokens_seen": 2469184, "step": 3775 }, { "epoch": 2.2287735849056602, "grad_norm": 1.8114336729049683, "learning_rate": 9.996035325156366e-06, "loss": 0.4694, "num_input_tokens_seen": 2472768, "step": 3780 }, { "epoch": 2.2317216981132075, "grad_norm": 2.1791703701019287, "learning_rate": 9.995932230962589e-06, "loss": 0.3645, "num_input_tokens_seen": 2475328, "step": 3785 }, { "epoch": 2.234669811320755, "grad_norm": 1.6069248914718628, "learning_rate": 9.995827814074223e-06, "loss": 0.5007, "num_input_tokens_seen": 2478560, "step": 3790 }, { "epoch": 2.237617924528302, "grad_norm": 1.981142520904541, "learning_rate": 9.995722074518913e-06, "loss": 0.5015, "num_input_tokens_seen": 2481824, "step": 3795 }, { "epoch": 2.240566037735849, "grad_norm": 4.170042991638184, "learning_rate": 9.99561501232465e-06, "loss": 0.4337, "num_input_tokens_seen": 2485760, "step": 3800 }, { "epoch": 2.2435141509433962, "grad_norm": 1.5080641508102417, "learning_rate": 9.995506627519786e-06, "loss": 0.4192, "num_input_tokens_seen": 2488384, "step": 3805 }, { "epoch": 2.2464622641509435, "grad_norm": 0.27209609746932983, "learning_rate": 9.995396920133012e-06, "loss": 0.4028, "num_input_tokens_seen": 2494880, "step": 3810 }, { "epoch": 2.2494103773584904, "grad_norm": 3.6554861068725586, "learning_rate": 9.995285890193373e-06, "loss": 0.4032, "num_input_tokens_seen": 2497984, "step": 3815 }, { "epoch": 2.2523584905660377, "grad_norm": 1.7393091917037964, "learning_rate": 9.995173537730267e-06, "loss": 0.4764, "num_input_tokens_seen": 2501504, "step": 3820 }, { "epoch": 2.255306603773585, "grad_norm": 2.786332368850708, "learning_rate": 9.99505986277344e-06, "loss": 0.6307, "num_input_tokens_seen": 2504224, "step": 3825 }, { "epoch": 2.2582547169811322, "grad_norm": 1.6791085004806519, "learning_rate": 9.994944865352986e-06, "loss": 0.5972, "num_input_tokens_seen": 2506688, "step": 3830 }, { "epoch": 2.261202830188679, "grad_norm": 2.1040167808532715, "learning_rate": 9.994828545499351e-06, "loss": 0.4188, "num_input_tokens_seen": 2510720, "step": 3835 }, { "epoch": 2.2641509433962264, "grad_norm": 1.264478325843811, "learning_rate": 9.994710903243334e-06, "loss": 0.558, "num_input_tokens_seen": 2513824, "step": 3840 }, { "epoch": 2.2670990566037736, "grad_norm": 1.727280855178833, "learning_rate": 9.994591938616079e-06, "loss": 0.4676, "num_input_tokens_seen": 2516544, "step": 3845 }, { "epoch": 2.270047169811321, "grad_norm": 1.1869617700576782, "learning_rate": 9.994471651649082e-06, "loss": 0.3614, "num_input_tokens_seen": 2520320, "step": 3850 }, { "epoch": 2.2729952830188678, "grad_norm": 2.0498361587524414, "learning_rate": 9.99435004237419e-06, "loss": 0.5245, "num_input_tokens_seen": 2523232, "step": 3855 }, { "epoch": 2.275943396226415, "grad_norm": 2.3524208068847656, "learning_rate": 9.9942271108236e-06, "loss": 0.6327, "num_input_tokens_seen": 2525856, "step": 3860 }, { "epoch": 2.2788915094339623, "grad_norm": 4.213478088378906, "learning_rate": 9.994102857029859e-06, "loss": 0.479, "num_input_tokens_seen": 2530304, "step": 3865 }, { "epoch": 2.2818396226415096, "grad_norm": 1.9294328689575195, "learning_rate": 9.993977281025862e-06, "loss": 0.5224, "num_input_tokens_seen": 2532928, "step": 3870 }, { "epoch": 2.2847877358490565, "grad_norm": 1.6951313018798828, "learning_rate": 9.993850382844858e-06, "loss": 0.5705, "num_input_tokens_seen": 2536576, "step": 3875 }, { "epoch": 2.2877358490566038, "grad_norm": 2.046959400177002, "learning_rate": 9.993722162520443e-06, "loss": 0.6713, "num_input_tokens_seen": 2540384, "step": 3880 }, { "epoch": 2.290683962264151, "grad_norm": 3.408194065093994, "learning_rate": 9.993592620086564e-06, "loss": 0.4646, "num_input_tokens_seen": 2543040, "step": 3885 }, { "epoch": 2.293632075471698, "grad_norm": 2.0603668689727783, "learning_rate": 9.993461755577518e-06, "loss": 0.6142, "num_input_tokens_seen": 2546208, "step": 3890 }, { "epoch": 2.296580188679245, "grad_norm": 2.022984504699707, "learning_rate": 9.99332956902795e-06, "loss": 0.3432, "num_input_tokens_seen": 2549568, "step": 3895 }, { "epoch": 2.2995283018867925, "grad_norm": 1.1646318435668945, "learning_rate": 9.993196060472859e-06, "loss": 0.4817, "num_input_tokens_seen": 2552512, "step": 3900 }, { "epoch": 2.3024764150943398, "grad_norm": 1.804474115371704, "learning_rate": 9.993061229947591e-06, "loss": 0.5383, "num_input_tokens_seen": 2556864, "step": 3905 }, { "epoch": 2.3054245283018866, "grad_norm": 2.6117749214172363, "learning_rate": 9.992925077487845e-06, "loss": 0.6356, "num_input_tokens_seen": 2560000, "step": 3910 }, { "epoch": 2.308372641509434, "grad_norm": 1.864414095878601, "learning_rate": 9.992787603129666e-06, "loss": 0.6255, "num_input_tokens_seen": 2562400, "step": 3915 }, { "epoch": 2.311320754716981, "grad_norm": 1.3924446105957031, "learning_rate": 9.99264880690945e-06, "loss": 0.4274, "num_input_tokens_seen": 2566080, "step": 3920 }, { "epoch": 2.3142688679245285, "grad_norm": 2.0780627727508545, "learning_rate": 9.992508688863947e-06, "loss": 0.3886, "num_input_tokens_seen": 2569248, "step": 3925 }, { "epoch": 2.3172169811320753, "grad_norm": 1.269562840461731, "learning_rate": 9.99236724903025e-06, "loss": 0.538, "num_input_tokens_seen": 2572224, "step": 3930 }, { "epoch": 2.3201650943396226, "grad_norm": 2.956913948059082, "learning_rate": 9.992224487445809e-06, "loss": 0.5086, "num_input_tokens_seen": 2574624, "step": 3935 }, { "epoch": 2.32311320754717, "grad_norm": 1.9473118782043457, "learning_rate": 9.99208040414842e-06, "loss": 0.6148, "num_input_tokens_seen": 2577760, "step": 3940 }, { "epoch": 2.326061320754717, "grad_norm": 1.6173003911972046, "learning_rate": 9.99193499917623e-06, "loss": 0.4513, "num_input_tokens_seen": 2580992, "step": 3945 }, { "epoch": 2.329009433962264, "grad_norm": 3.894285202026367, "learning_rate": 9.991788272567735e-06, "loss": 0.4585, "num_input_tokens_seen": 2586400, "step": 3950 }, { "epoch": 2.3319575471698113, "grad_norm": 1.399957299232483, "learning_rate": 9.991640224361781e-06, "loss": 0.5969, "num_input_tokens_seen": 2589984, "step": 3955 }, { "epoch": 2.3349056603773586, "grad_norm": 2.7772164344787598, "learning_rate": 9.991490854597563e-06, "loss": 0.5806, "num_input_tokens_seen": 2593216, "step": 3960 }, { "epoch": 2.3378537735849054, "grad_norm": 1.1663191318511963, "learning_rate": 9.991340163314632e-06, "loss": 0.5218, "num_input_tokens_seen": 2597664, "step": 3965 }, { "epoch": 2.3408018867924527, "grad_norm": 1.9577068090438843, "learning_rate": 9.991188150552878e-06, "loss": 0.4184, "num_input_tokens_seen": 2601280, "step": 3970 }, { "epoch": 2.34375, "grad_norm": 1.413246750831604, "learning_rate": 9.991034816352553e-06, "loss": 0.4686, "num_input_tokens_seen": 2604032, "step": 3975 }, { "epoch": 2.3466981132075473, "grad_norm": 5.386325836181641, "learning_rate": 9.990880160754248e-06, "loss": 0.6512, "num_input_tokens_seen": 2606912, "step": 3980 }, { "epoch": 2.3496462264150946, "grad_norm": 1.8213728666305542, "learning_rate": 9.990724183798914e-06, "loss": 0.4396, "num_input_tokens_seen": 2610880, "step": 3985 }, { "epoch": 2.3525943396226414, "grad_norm": 5.505479335784912, "learning_rate": 9.990566885527841e-06, "loss": 0.3905, "num_input_tokens_seen": 2614176, "step": 3990 }, { "epoch": 2.3555424528301887, "grad_norm": 2.9704174995422363, "learning_rate": 9.99040826598268e-06, "loss": 0.4794, "num_input_tokens_seen": 2618240, "step": 3995 }, { "epoch": 2.358490566037736, "grad_norm": 2.2574639320373535, "learning_rate": 9.990248325205418e-06, "loss": 0.4433, "num_input_tokens_seen": 2625888, "step": 4000 }, { "epoch": 2.361438679245283, "grad_norm": 1.8538453578948975, "learning_rate": 9.990087063238408e-06, "loss": 0.4336, "num_input_tokens_seen": 2629280, "step": 4005 }, { "epoch": 2.36438679245283, "grad_norm": 2.030524253845215, "learning_rate": 9.989924480124342e-06, "loss": 0.4642, "num_input_tokens_seen": 2634880, "step": 4010 }, { "epoch": 2.3673349056603774, "grad_norm": 3.1902835369110107, "learning_rate": 9.989760575906264e-06, "loss": 0.5844, "num_input_tokens_seen": 2638368, "step": 4015 }, { "epoch": 2.3702830188679247, "grad_norm": 3.394543409347534, "learning_rate": 9.98959535062757e-06, "loss": 0.4077, "num_input_tokens_seen": 2641440, "step": 4020 }, { "epoch": 2.3732311320754715, "grad_norm": 1.7151886224746704, "learning_rate": 9.989428804332002e-06, "loss": 0.5419, "num_input_tokens_seen": 2644576, "step": 4025 }, { "epoch": 2.376179245283019, "grad_norm": 1.7761021852493286, "learning_rate": 9.989260937063656e-06, "loss": 0.4191, "num_input_tokens_seen": 2648160, "step": 4030 }, { "epoch": 2.379127358490566, "grad_norm": 5.007516384124756, "learning_rate": 9.989091748866973e-06, "loss": 0.7213, "num_input_tokens_seen": 2650976, "step": 4035 }, { "epoch": 2.3820754716981134, "grad_norm": 3.113544225692749, "learning_rate": 9.988921239786748e-06, "loss": 0.4164, "num_input_tokens_seen": 2653824, "step": 4040 }, { "epoch": 2.3850235849056602, "grad_norm": 2.5968940258026123, "learning_rate": 9.988749409868124e-06, "loss": 0.5158, "num_input_tokens_seen": 2657856, "step": 4045 }, { "epoch": 2.3879716981132075, "grad_norm": 1.7517516613006592, "learning_rate": 9.988576259156593e-06, "loss": 0.5011, "num_input_tokens_seen": 2661952, "step": 4050 }, { "epoch": 2.390919811320755, "grad_norm": 1.1566736698150635, "learning_rate": 9.988401787697996e-06, "loss": 0.4946, "num_input_tokens_seen": 2666336, "step": 4055 }, { "epoch": 2.393867924528302, "grad_norm": 0.9496446251869202, "learning_rate": 9.98822599553853e-06, "loss": 0.5097, "num_input_tokens_seen": 2669664, "step": 4060 }, { "epoch": 2.396816037735849, "grad_norm": 1.50706946849823, "learning_rate": 9.988048882724732e-06, "loss": 0.5281, "num_input_tokens_seen": 2671936, "step": 4065 }, { "epoch": 2.3997641509433962, "grad_norm": 2.407719612121582, "learning_rate": 9.987870449303497e-06, "loss": 0.6123, "num_input_tokens_seen": 2676256, "step": 4070 }, { "epoch": 2.4027122641509435, "grad_norm": 1.6305583715438843, "learning_rate": 9.98769069532206e-06, "loss": 0.5245, "num_input_tokens_seen": 2679488, "step": 4075 }, { "epoch": 2.4056603773584904, "grad_norm": 1.664167046546936, "learning_rate": 9.98750962082802e-06, "loss": 0.4474, "num_input_tokens_seen": 2682592, "step": 4080 }, { "epoch": 2.4086084905660377, "grad_norm": 1.8277117013931274, "learning_rate": 9.987327225869312e-06, "loss": 0.4682, "num_input_tokens_seen": 2685440, "step": 4085 }, { "epoch": 2.411556603773585, "grad_norm": 2.37996244430542, "learning_rate": 9.987143510494225e-06, "loss": 0.5124, "num_input_tokens_seen": 2688480, "step": 4090 }, { "epoch": 2.4145047169811322, "grad_norm": 1.5540119409561157, "learning_rate": 9.9869584747514e-06, "loss": 0.5205, "num_input_tokens_seen": 2692960, "step": 4095 }, { "epoch": 2.417452830188679, "grad_norm": 3.061523675918579, "learning_rate": 9.986772118689828e-06, "loss": 0.5716, "num_input_tokens_seen": 2695584, "step": 4100 }, { "epoch": 2.4204009433962264, "grad_norm": 3.363091468811035, "learning_rate": 9.986584442358845e-06, "loss": 0.5137, "num_input_tokens_seen": 2698912, "step": 4105 }, { "epoch": 2.4233490566037736, "grad_norm": 1.8133829832077026, "learning_rate": 9.98639544580814e-06, "loss": 0.4734, "num_input_tokens_seen": 2701984, "step": 4110 }, { "epoch": 2.426297169811321, "grad_norm": 1.2667860984802246, "learning_rate": 9.98620512908775e-06, "loss": 0.7524, "num_input_tokens_seen": 2706656, "step": 4115 }, { "epoch": 2.4292452830188678, "grad_norm": 1.2848924398422241, "learning_rate": 9.986013492248064e-06, "loss": 0.5976, "num_input_tokens_seen": 2709952, "step": 4120 }, { "epoch": 2.432193396226415, "grad_norm": 2.5398523807525635, "learning_rate": 9.985820535339817e-06, "loss": 0.5278, "num_input_tokens_seen": 2713472, "step": 4125 }, { "epoch": 2.4351415094339623, "grad_norm": 1.7317676544189453, "learning_rate": 9.985626258414093e-06, "loss": 0.4655, "num_input_tokens_seen": 2716544, "step": 4130 }, { "epoch": 2.4380896226415096, "grad_norm": 1.427561640739441, "learning_rate": 9.985430661522333e-06, "loss": 0.5529, "num_input_tokens_seen": 2720224, "step": 4135 }, { "epoch": 2.4410377358490565, "grad_norm": 1.726494312286377, "learning_rate": 9.98523374471632e-06, "loss": 0.5476, "num_input_tokens_seen": 2722848, "step": 4140 }, { "epoch": 2.4439858490566038, "grad_norm": 1.4170986413955688, "learning_rate": 9.985035508048186e-06, "loss": 0.3983, "num_input_tokens_seen": 2725920, "step": 4145 }, { "epoch": 2.446933962264151, "grad_norm": 1.3028337955474854, "learning_rate": 9.984835951570417e-06, "loss": 0.392, "num_input_tokens_seen": 2728928, "step": 4150 }, { "epoch": 2.449882075471698, "grad_norm": 1.7039188146591187, "learning_rate": 9.984635075335847e-06, "loss": 0.3956, "num_input_tokens_seen": 2732256, "step": 4155 }, { "epoch": 2.452830188679245, "grad_norm": 2.366997241973877, "learning_rate": 9.984432879397659e-06, "loss": 0.5402, "num_input_tokens_seen": 2735616, "step": 4160 }, { "epoch": 2.4557783018867925, "grad_norm": 3.883598804473877, "learning_rate": 9.984229363809383e-06, "loss": 0.5191, "num_input_tokens_seen": 2738016, "step": 4165 }, { "epoch": 2.4587264150943398, "grad_norm": 2.537175178527832, "learning_rate": 9.984024528624904e-06, "loss": 0.5519, "num_input_tokens_seen": 2740736, "step": 4170 }, { "epoch": 2.4616745283018866, "grad_norm": 1.943121314048767, "learning_rate": 9.98381837389845e-06, "loss": 0.5104, "num_input_tokens_seen": 2743840, "step": 4175 }, { "epoch": 2.464622641509434, "grad_norm": 2.0019259452819824, "learning_rate": 9.983610899684601e-06, "loss": 0.5907, "num_input_tokens_seen": 2746560, "step": 4180 }, { "epoch": 2.467570754716981, "grad_norm": 1.7394171953201294, "learning_rate": 9.983402106038292e-06, "loss": 0.4261, "num_input_tokens_seen": 2750336, "step": 4185 }, { "epoch": 2.4705188679245285, "grad_norm": 2.3498876094818115, "learning_rate": 9.983191993014793e-06, "loss": 0.4624, "num_input_tokens_seen": 2753024, "step": 4190 }, { "epoch": 2.4734669811320753, "grad_norm": 2.6195321083068848, "learning_rate": 9.982980560669742e-06, "loss": 0.4563, "num_input_tokens_seen": 2755712, "step": 4195 }, { "epoch": 2.4764150943396226, "grad_norm": 2.0730204582214355, "learning_rate": 9.98276780905911e-06, "loss": 0.6915, "num_input_tokens_seen": 2758048, "step": 4200 }, { "epoch": 2.47936320754717, "grad_norm": 1.8546154499053955, "learning_rate": 9.982553738239225e-06, "loss": 0.5612, "num_input_tokens_seen": 2761632, "step": 4205 }, { "epoch": 2.482311320754717, "grad_norm": 3.520881175994873, "learning_rate": 9.982338348266766e-06, "loss": 0.4078, "num_input_tokens_seen": 2763968, "step": 4210 }, { "epoch": 2.485259433962264, "grad_norm": 2.2719197273254395, "learning_rate": 9.982121639198756e-06, "loss": 0.7031, "num_input_tokens_seen": 2768192, "step": 4215 }, { "epoch": 2.4882075471698113, "grad_norm": 2.2567808628082275, "learning_rate": 9.98190361109257e-06, "loss": 0.6114, "num_input_tokens_seen": 2770880, "step": 4220 }, { "epoch": 2.4911556603773586, "grad_norm": 1.7961490154266357, "learning_rate": 9.981684264005934e-06, "loss": 0.5489, "num_input_tokens_seen": 2774240, "step": 4225 }, { "epoch": 2.4941037735849054, "grad_norm": 1.3754783868789673, "learning_rate": 9.981463597996917e-06, "loss": 0.6279, "num_input_tokens_seen": 2779200, "step": 4230 }, { "epoch": 2.4970518867924527, "grad_norm": 1.9009672403335571, "learning_rate": 9.981241613123944e-06, "loss": 0.4477, "num_input_tokens_seen": 2782272, "step": 4235 }, { "epoch": 2.5, "grad_norm": 1.3478950262069702, "learning_rate": 9.981018309445785e-06, "loss": 0.3731, "num_input_tokens_seen": 2785184, "step": 4240 }, { "epoch": 2.5029481132075473, "grad_norm": 1.643349289894104, "learning_rate": 9.980793687021564e-06, "loss": 0.4499, "num_input_tokens_seen": 2788736, "step": 4245 }, { "epoch": 2.5058962264150946, "grad_norm": 1.0082978010177612, "learning_rate": 9.980567745910746e-06, "loss": 0.7001, "num_input_tokens_seen": 2791840, "step": 4250 }, { "epoch": 2.5088443396226414, "grad_norm": 1.4014298915863037, "learning_rate": 9.980340486173155e-06, "loss": 0.5493, "num_input_tokens_seen": 2795712, "step": 4255 }, { "epoch": 2.5117924528301887, "grad_norm": 0.7935531735420227, "learning_rate": 9.980111907868954e-06, "loss": 0.334, "num_input_tokens_seen": 2799488, "step": 4260 }, { "epoch": 2.514740566037736, "grad_norm": 1.6023436784744263, "learning_rate": 9.979882011058662e-06, "loss": 0.4401, "num_input_tokens_seen": 2801984, "step": 4265 }, { "epoch": 2.517688679245283, "grad_norm": 1.9043166637420654, "learning_rate": 9.979650795803146e-06, "loss": 0.4888, "num_input_tokens_seen": 2805376, "step": 4270 }, { "epoch": 2.52063679245283, "grad_norm": 1.7135614156723022, "learning_rate": 9.979418262163621e-06, "loss": 0.3877, "num_input_tokens_seen": 2808256, "step": 4275 }, { "epoch": 2.5235849056603774, "grad_norm": 2.6715550422668457, "learning_rate": 9.979184410201652e-06, "loss": 0.4876, "num_input_tokens_seen": 2811488, "step": 4280 }, { "epoch": 2.5265330188679247, "grad_norm": 1.9309089183807373, "learning_rate": 9.97894923997915e-06, "loss": 0.6545, "num_input_tokens_seen": 2814304, "step": 4285 }, { "epoch": 2.5294811320754715, "grad_norm": 1.3302161693572998, "learning_rate": 9.97871275155838e-06, "loss": 0.618, "num_input_tokens_seen": 2817472, "step": 4290 }, { "epoch": 2.532429245283019, "grad_norm": 3.891223430633545, "learning_rate": 9.978474945001949e-06, "loss": 0.5778, "num_input_tokens_seen": 2820000, "step": 4295 }, { "epoch": 2.535377358490566, "grad_norm": 1.4964137077331543, "learning_rate": 9.978235820372822e-06, "loss": 0.5118, "num_input_tokens_seen": 2823424, "step": 4300 }, { "epoch": 2.538325471698113, "grad_norm": 1.3578648567199707, "learning_rate": 9.977995377734307e-06, "loss": 0.4734, "num_input_tokens_seen": 2826016, "step": 4305 }, { "epoch": 2.5412735849056602, "grad_norm": 1.0773439407348633, "learning_rate": 9.977753617150061e-06, "loss": 0.4401, "num_input_tokens_seen": 2829344, "step": 4310 }, { "epoch": 2.5442216981132075, "grad_norm": 1.4034388065338135, "learning_rate": 9.977510538684094e-06, "loss": 0.5156, "num_input_tokens_seen": 2832928, "step": 4315 }, { "epoch": 2.547169811320755, "grad_norm": 1.2843680381774902, "learning_rate": 9.977266142400757e-06, "loss": 0.3426, "num_input_tokens_seen": 2836032, "step": 4320 }, { "epoch": 2.550117924528302, "grad_norm": 1.2812039852142334, "learning_rate": 9.977020428364759e-06, "loss": 0.3762, "num_input_tokens_seen": 2838848, "step": 4325 }, { "epoch": 2.553066037735849, "grad_norm": 1.1480623483657837, "learning_rate": 9.976773396641154e-06, "loss": 0.4205, "num_input_tokens_seen": 2843744, "step": 4330 }, { "epoch": 2.5560141509433962, "grad_norm": 2.663217544555664, "learning_rate": 9.976525047295342e-06, "loss": 0.4991, "num_input_tokens_seen": 2846944, "step": 4335 }, { "epoch": 2.5589622641509435, "grad_norm": 1.2820947170257568, "learning_rate": 9.976275380393077e-06, "loss": 0.4477, "num_input_tokens_seen": 2849984, "step": 4340 }, { "epoch": 2.5619103773584904, "grad_norm": 1.1909037828445435, "learning_rate": 9.976024396000459e-06, "loss": 0.3811, "num_input_tokens_seen": 2853952, "step": 4345 }, { "epoch": 2.5648584905660377, "grad_norm": 10.396376609802246, "learning_rate": 9.975772094183935e-06, "loss": 0.4469, "num_input_tokens_seen": 2856992, "step": 4350 }, { "epoch": 2.567806603773585, "grad_norm": 1.8128471374511719, "learning_rate": 9.975518475010306e-06, "loss": 0.5455, "num_input_tokens_seen": 2859968, "step": 4355 }, { "epoch": 2.5707547169811322, "grad_norm": 1.5180341005325317, "learning_rate": 9.975263538546717e-06, "loss": 0.4638, "num_input_tokens_seen": 2863296, "step": 4360 }, { "epoch": 2.5737028301886795, "grad_norm": 2.2465033531188965, "learning_rate": 9.975007284860664e-06, "loss": 0.384, "num_input_tokens_seen": 2866400, "step": 4365 }, { "epoch": 2.5766509433962264, "grad_norm": 2.800170660018921, "learning_rate": 9.974749714019993e-06, "loss": 0.4717, "num_input_tokens_seen": 2869280, "step": 4370 }, { "epoch": 2.5795990566037736, "grad_norm": 1.452401876449585, "learning_rate": 9.974490826092894e-06, "loss": 0.4207, "num_input_tokens_seen": 2872096, "step": 4375 }, { "epoch": 2.5825471698113205, "grad_norm": 1.3614392280578613, "learning_rate": 9.974230621147907e-06, "loss": 0.5925, "num_input_tokens_seen": 2875776, "step": 4380 }, { "epoch": 2.5854952830188678, "grad_norm": 2.188351631164551, "learning_rate": 9.973969099253928e-06, "loss": 0.5752, "num_input_tokens_seen": 2879328, "step": 4385 }, { "epoch": 2.588443396226415, "grad_norm": 2.349144697189331, "learning_rate": 9.973706260480194e-06, "loss": 0.4779, "num_input_tokens_seen": 2882432, "step": 4390 }, { "epoch": 2.5913915094339623, "grad_norm": 1.125180721282959, "learning_rate": 9.97344210489629e-06, "loss": 0.4263, "num_input_tokens_seen": 2885280, "step": 4395 }, { "epoch": 2.5943396226415096, "grad_norm": 1.4721330404281616, "learning_rate": 9.973176632572158e-06, "loss": 0.393, "num_input_tokens_seen": 2888736, "step": 4400 }, { "epoch": 2.5972877358490565, "grad_norm": 2.057450532913208, "learning_rate": 9.972909843578076e-06, "loss": 0.428, "num_input_tokens_seen": 2891872, "step": 4405 }, { "epoch": 2.6002358490566038, "grad_norm": 1.8667659759521484, "learning_rate": 9.972641737984681e-06, "loss": 0.4962, "num_input_tokens_seen": 2894944, "step": 4410 }, { "epoch": 2.603183962264151, "grad_norm": 1.0966248512268066, "learning_rate": 9.972372315862956e-06, "loss": 0.6699, "num_input_tokens_seen": 2898464, "step": 4415 }, { "epoch": 2.606132075471698, "grad_norm": 1.7781461477279663, "learning_rate": 9.97210157728423e-06, "loss": 0.4888, "num_input_tokens_seen": 2901984, "step": 4420 }, { "epoch": 2.609080188679245, "grad_norm": 1.006951093673706, "learning_rate": 9.971829522320185e-06, "loss": 0.4885, "num_input_tokens_seen": 2905952, "step": 4425 }, { "epoch": 2.6120283018867925, "grad_norm": 1.8876290321350098, "learning_rate": 9.971556151042843e-06, "loss": 0.5429, "num_input_tokens_seen": 2908800, "step": 4430 }, { "epoch": 2.6149764150943398, "grad_norm": 1.478520393371582, "learning_rate": 9.971281463524588e-06, "loss": 0.5021, "num_input_tokens_seen": 2912800, "step": 4435 }, { "epoch": 2.617924528301887, "grad_norm": 2.534208059310913, "learning_rate": 9.971005459838136e-06, "loss": 0.4052, "num_input_tokens_seen": 2915776, "step": 4440 }, { "epoch": 2.620872641509434, "grad_norm": 2.212191343307495, "learning_rate": 9.970728140056567e-06, "loss": 0.4707, "num_input_tokens_seen": 2919232, "step": 4445 }, { "epoch": 2.623820754716981, "grad_norm": 1.5253533124923706, "learning_rate": 9.9704495042533e-06, "loss": 0.4161, "num_input_tokens_seen": 2923808, "step": 4450 }, { "epoch": 2.6267688679245285, "grad_norm": 2.1895623207092285, "learning_rate": 9.970169552502105e-06, "loss": 0.5572, "num_input_tokens_seen": 2926272, "step": 4455 }, { "epoch": 2.6297169811320753, "grad_norm": 1.6530238389968872, "learning_rate": 9.969888284877102e-06, "loss": 0.4364, "num_input_tokens_seen": 2930336, "step": 4460 }, { "epoch": 2.6326650943396226, "grad_norm": 2.6359267234802246, "learning_rate": 9.969605701452757e-06, "loss": 0.502, "num_input_tokens_seen": 2933824, "step": 4465 }, { "epoch": 2.63561320754717, "grad_norm": 1.7491228580474854, "learning_rate": 9.969321802303882e-06, "loss": 0.4168, "num_input_tokens_seen": 2938240, "step": 4470 }, { "epoch": 2.638561320754717, "grad_norm": 2.4501898288726807, "learning_rate": 9.969036587505644e-06, "loss": 0.4042, "num_input_tokens_seen": 2940768, "step": 4475 }, { "epoch": 2.641509433962264, "grad_norm": 2.341637134552002, "learning_rate": 9.968750057133555e-06, "loss": 0.4657, "num_input_tokens_seen": 2944064, "step": 4480 }, { "epoch": 2.6444575471698113, "grad_norm": 2.284288167953491, "learning_rate": 9.968462211263474e-06, "loss": 0.4561, "num_input_tokens_seen": 2947520, "step": 4485 }, { "epoch": 2.6474056603773586, "grad_norm": 1.943724513053894, "learning_rate": 9.96817304997161e-06, "loss": 0.4557, "num_input_tokens_seen": 2950720, "step": 4490 }, { "epoch": 2.6503537735849054, "grad_norm": 2.285425901412964, "learning_rate": 9.967882573334519e-06, "loss": 0.4388, "num_input_tokens_seen": 2953056, "step": 4495 }, { "epoch": 2.6533018867924527, "grad_norm": 2.7565972805023193, "learning_rate": 9.967590781429106e-06, "loss": 0.558, "num_input_tokens_seen": 2956864, "step": 4500 }, { "epoch": 2.65625, "grad_norm": 2.4901981353759766, "learning_rate": 9.967297674332625e-06, "loss": 0.5854, "num_input_tokens_seen": 2960288, "step": 4505 }, { "epoch": 2.6591981132075473, "grad_norm": 1.9111524820327759, "learning_rate": 9.967003252122675e-06, "loss": 0.4283, "num_input_tokens_seen": 2963808, "step": 4510 }, { "epoch": 2.6621462264150946, "grad_norm": 2.0595595836639404, "learning_rate": 9.96670751487721e-06, "loss": 0.4931, "num_input_tokens_seen": 2967424, "step": 4515 }, { "epoch": 2.6650943396226414, "grad_norm": 2.4858038425445557, "learning_rate": 9.966410462674525e-06, "loss": 0.5651, "num_input_tokens_seen": 2969888, "step": 4520 }, { "epoch": 2.6680424528301887, "grad_norm": 1.7285395860671997, "learning_rate": 9.966112095593264e-06, "loss": 0.4848, "num_input_tokens_seen": 2973600, "step": 4525 }, { "epoch": 2.670990566037736, "grad_norm": 1.5551965236663818, "learning_rate": 9.965812413712425e-06, "loss": 0.4489, "num_input_tokens_seen": 2977888, "step": 4530 }, { "epoch": 2.673938679245283, "grad_norm": 2.0597550868988037, "learning_rate": 9.965511417111346e-06, "loss": 0.5459, "num_input_tokens_seen": 2980704, "step": 4535 }, { "epoch": 2.67688679245283, "grad_norm": 1.9914778470993042, "learning_rate": 9.96520910586972e-06, "loss": 0.6998, "num_input_tokens_seen": 2983808, "step": 4540 }, { "epoch": 2.6798349056603774, "grad_norm": 2.5155093669891357, "learning_rate": 9.964905480067585e-06, "loss": 0.4636, "num_input_tokens_seen": 2987072, "step": 4545 }, { "epoch": 2.6827830188679247, "grad_norm": 1.7781126499176025, "learning_rate": 9.964600539785328e-06, "loss": 0.5595, "num_input_tokens_seen": 2989696, "step": 4550 }, { "epoch": 2.6857311320754715, "grad_norm": 2.3087291717529297, "learning_rate": 9.96429428510368e-06, "loss": 0.5438, "num_input_tokens_seen": 2992800, "step": 4555 }, { "epoch": 2.688679245283019, "grad_norm": 1.8486661911010742, "learning_rate": 9.963986716103724e-06, "loss": 0.3966, "num_input_tokens_seen": 2995872, "step": 4560 }, { "epoch": 2.691627358490566, "grad_norm": 1.22743821144104, "learning_rate": 9.963677832866893e-06, "loss": 0.534, "num_input_tokens_seen": 2998880, "step": 4565 }, { "epoch": 2.694575471698113, "grad_norm": 1.736434817314148, "learning_rate": 9.963367635474962e-06, "loss": 0.2894, "num_input_tokens_seen": 3001728, "step": 4570 }, { "epoch": 2.6975235849056602, "grad_norm": 3.1284735202789307, "learning_rate": 9.96305612401006e-06, "loss": 0.628, "num_input_tokens_seen": 3004512, "step": 4575 }, { "epoch": 2.7004716981132075, "grad_norm": 1.3413945436477661, "learning_rate": 9.96274329855466e-06, "loss": 0.3895, "num_input_tokens_seen": 3007904, "step": 4580 }, { "epoch": 2.703419811320755, "grad_norm": 1.2921295166015625, "learning_rate": 9.962429159191583e-06, "loss": 0.4565, "num_input_tokens_seen": 3011200, "step": 4585 }, { "epoch": 2.706367924528302, "grad_norm": 2.2340164184570312, "learning_rate": 9.962113706003997e-06, "loss": 0.3832, "num_input_tokens_seen": 3013984, "step": 4590 }, { "epoch": 2.709316037735849, "grad_norm": 1.245511770248413, "learning_rate": 9.961796939075424e-06, "loss": 0.5362, "num_input_tokens_seen": 3016544, "step": 4595 }, { "epoch": 2.7122641509433962, "grad_norm": 1.6616170406341553, "learning_rate": 9.961478858489728e-06, "loss": 0.5939, "num_input_tokens_seen": 3020192, "step": 4600 }, { "epoch": 2.7152122641509435, "grad_norm": 2.590428113937378, "learning_rate": 9.961159464331119e-06, "loss": 0.528, "num_input_tokens_seen": 3022848, "step": 4605 }, { "epoch": 2.7181603773584904, "grad_norm": 1.934630036354065, "learning_rate": 9.960838756684161e-06, "loss": 0.6667, "num_input_tokens_seen": 3025536, "step": 4610 }, { "epoch": 2.7211084905660377, "grad_norm": 1.5918610095977783, "learning_rate": 9.960516735633764e-06, "loss": 0.4638, "num_input_tokens_seen": 3029120, "step": 4615 }, { "epoch": 2.724056603773585, "grad_norm": 1.9903554916381836, "learning_rate": 9.960193401265181e-06, "loss": 0.4786, "num_input_tokens_seen": 3031552, "step": 4620 }, { "epoch": 2.7270047169811322, "grad_norm": 1.7555742263793945, "learning_rate": 9.959868753664018e-06, "loss": 0.4291, "num_input_tokens_seen": 3034464, "step": 4625 }, { "epoch": 2.7299528301886795, "grad_norm": 2.064436912536621, "learning_rate": 9.959542792916227e-06, "loss": 0.5141, "num_input_tokens_seen": 3037344, "step": 4630 }, { "epoch": 2.7329009433962264, "grad_norm": 1.5838605165481567, "learning_rate": 9.959215519108108e-06, "loss": 0.41, "num_input_tokens_seen": 3039968, "step": 4635 }, { "epoch": 2.7358490566037736, "grad_norm": 1.3867851495742798, "learning_rate": 9.958886932326306e-06, "loss": 0.4529, "num_input_tokens_seen": 3043136, "step": 4640 }, { "epoch": 2.7387971698113205, "grad_norm": 2.1044561862945557, "learning_rate": 9.958557032657817e-06, "loss": 0.4091, "num_input_tokens_seen": 3050368, "step": 4645 }, { "epoch": 2.7417452830188678, "grad_norm": 1.7976959943771362, "learning_rate": 9.958225820189984e-06, "loss": 0.4782, "num_input_tokens_seen": 3053440, "step": 4650 }, { "epoch": 2.744693396226415, "grad_norm": 1.3850997686386108, "learning_rate": 9.957893295010495e-06, "loss": 0.437, "num_input_tokens_seen": 3056800, "step": 4655 }, { "epoch": 2.7476415094339623, "grad_norm": 2.8802378177642822, "learning_rate": 9.957559457207391e-06, "loss": 0.4638, "num_input_tokens_seen": 3060576, "step": 4660 }, { "epoch": 2.7505896226415096, "grad_norm": 12.48519229888916, "learning_rate": 9.957224306869053e-06, "loss": 0.5833, "num_input_tokens_seen": 3064128, "step": 4665 }, { "epoch": 2.7535377358490565, "grad_norm": 2.182999849319458, "learning_rate": 9.956887844084216e-06, "loss": 0.4817, "num_input_tokens_seen": 3067872, "step": 4670 }, { "epoch": 2.7564858490566038, "grad_norm": 1.2213979959487915, "learning_rate": 9.956550068941958e-06, "loss": 0.6334, "num_input_tokens_seen": 3071328, "step": 4675 }, { "epoch": 2.759433962264151, "grad_norm": 3.3232877254486084, "learning_rate": 9.95621098153171e-06, "loss": 0.5386, "num_input_tokens_seen": 3074144, "step": 4680 }, { "epoch": 2.762382075471698, "grad_norm": 1.4227296113967896, "learning_rate": 9.955870581943243e-06, "loss": 0.5161, "num_input_tokens_seen": 3077056, "step": 4685 }, { "epoch": 2.765330188679245, "grad_norm": 2.935396194458008, "learning_rate": 9.955528870266681e-06, "loss": 0.6025, "num_input_tokens_seen": 3079904, "step": 4690 }, { "epoch": 2.7682783018867925, "grad_norm": 1.7990391254425049, "learning_rate": 9.955185846592495e-06, "loss": 0.6579, "num_input_tokens_seen": 3082752, "step": 4695 }, { "epoch": 2.7712264150943398, "grad_norm": 1.6266095638275146, "learning_rate": 9.9548415110115e-06, "loss": 0.4705, "num_input_tokens_seen": 3086176, "step": 4700 }, { "epoch": 2.774174528301887, "grad_norm": 1.1081061363220215, "learning_rate": 9.95449586361486e-06, "loss": 0.4125, "num_input_tokens_seen": 3089472, "step": 4705 }, { "epoch": 2.777122641509434, "grad_norm": 4.16483211517334, "learning_rate": 9.954148904494085e-06, "loss": 0.7046, "num_input_tokens_seen": 3092320, "step": 4710 }, { "epoch": 2.780070754716981, "grad_norm": 1.427153468132019, "learning_rate": 9.95380063374104e-06, "loss": 0.5102, "num_input_tokens_seen": 3094912, "step": 4715 }, { "epoch": 2.7830188679245285, "grad_norm": 1.3087785243988037, "learning_rate": 9.953451051447927e-06, "loss": 0.31, "num_input_tokens_seen": 3100288, "step": 4720 }, { "epoch": 2.7859669811320753, "grad_norm": 1.9905997514724731, "learning_rate": 9.953100157707299e-06, "loss": 0.424, "num_input_tokens_seen": 3104288, "step": 4725 }, { "epoch": 2.7889150943396226, "grad_norm": 1.2326284646987915, "learning_rate": 9.952747952612056e-06, "loss": 0.512, "num_input_tokens_seen": 3107488, "step": 4730 }, { "epoch": 2.79186320754717, "grad_norm": 1.023764967918396, "learning_rate": 9.952394436255451e-06, "loss": 0.6179, "num_input_tokens_seen": 3111104, "step": 4735 }, { "epoch": 2.794811320754717, "grad_norm": 1.8258326053619385, "learning_rate": 9.952039608731072e-06, "loss": 0.41, "num_input_tokens_seen": 3113632, "step": 4740 }, { "epoch": 2.797759433962264, "grad_norm": 2.997077703475952, "learning_rate": 9.951683470132868e-06, "loss": 0.5081, "num_input_tokens_seen": 3116448, "step": 4745 }, { "epoch": 2.8007075471698113, "grad_norm": 1.5128456354141235, "learning_rate": 9.951326020555122e-06, "loss": 0.4662, "num_input_tokens_seen": 3119104, "step": 4750 }, { "epoch": 2.8036556603773586, "grad_norm": 1.2902052402496338, "learning_rate": 9.950967260092473e-06, "loss": 0.3918, "num_input_tokens_seen": 3122848, "step": 4755 }, { "epoch": 2.8066037735849054, "grad_norm": 2.1645607948303223, "learning_rate": 9.950607188839905e-06, "loss": 0.554, "num_input_tokens_seen": 3126304, "step": 4760 }, { "epoch": 2.8095518867924527, "grad_norm": 2.7367639541625977, "learning_rate": 9.950245806892749e-06, "loss": 0.6729, "num_input_tokens_seen": 3129184, "step": 4765 }, { "epoch": 2.8125, "grad_norm": 2.7969813346862793, "learning_rate": 9.94988311434668e-06, "loss": 0.4256, "num_input_tokens_seen": 3132384, "step": 4770 }, { "epoch": 2.8154481132075473, "grad_norm": 1.2100876569747925, "learning_rate": 9.949519111297723e-06, "loss": 0.4173, "num_input_tokens_seen": 3135744, "step": 4775 }, { "epoch": 2.8183962264150946, "grad_norm": 1.6002990007400513, "learning_rate": 9.949153797842252e-06, "loss": 0.4267, "num_input_tokens_seen": 3138528, "step": 4780 }, { "epoch": 2.8213443396226414, "grad_norm": 5.960033416748047, "learning_rate": 9.948787174076982e-06, "loss": 0.6426, "num_input_tokens_seen": 3141632, "step": 4785 }, { "epoch": 2.8242924528301887, "grad_norm": 1.295791506767273, "learning_rate": 9.948419240098978e-06, "loss": 0.3517, "num_input_tokens_seen": 3146880, "step": 4790 }, { "epoch": 2.827240566037736, "grad_norm": 1.8094828128814697, "learning_rate": 9.948049996005657e-06, "loss": 0.4726, "num_input_tokens_seen": 3149312, "step": 4795 }, { "epoch": 2.830188679245283, "grad_norm": 1.1480863094329834, "learning_rate": 9.947679441894773e-06, "loss": 0.4049, "num_input_tokens_seen": 3152896, "step": 4800 }, { "epoch": 2.83313679245283, "grad_norm": 2.19973087310791, "learning_rate": 9.947307577864433e-06, "loss": 0.4449, "num_input_tokens_seen": 3157728, "step": 4805 }, { "epoch": 2.8360849056603774, "grad_norm": 2.5014805793762207, "learning_rate": 9.94693440401309e-06, "loss": 0.4305, "num_input_tokens_seen": 3160704, "step": 4810 }, { "epoch": 2.8390330188679247, "grad_norm": 2.3153488636016846, "learning_rate": 9.946559920439545e-06, "loss": 0.4522, "num_input_tokens_seen": 3164352, "step": 4815 }, { "epoch": 2.8419811320754715, "grad_norm": 2.2521612644195557, "learning_rate": 9.946184127242942e-06, "loss": 0.4742, "num_input_tokens_seen": 3167776, "step": 4820 }, { "epoch": 2.844929245283019, "grad_norm": 2.772606372833252, "learning_rate": 9.945807024522774e-06, "loss": 0.5169, "num_input_tokens_seen": 3171296, "step": 4825 }, { "epoch": 2.847877358490566, "grad_norm": 2.061849594116211, "learning_rate": 9.945428612378881e-06, "loss": 0.5031, "num_input_tokens_seen": 3174688, "step": 4830 }, { "epoch": 2.850825471698113, "grad_norm": 1.945185661315918, "learning_rate": 9.94504889091145e-06, "loss": 0.3562, "num_input_tokens_seen": 3177856, "step": 4835 }, { "epoch": 2.8537735849056602, "grad_norm": 1.5330348014831543, "learning_rate": 9.944667860221013e-06, "loss": 0.4684, "num_input_tokens_seen": 3180992, "step": 4840 }, { "epoch": 2.8567216981132075, "grad_norm": 1.953518271446228, "learning_rate": 9.944285520408448e-06, "loss": 0.5614, "num_input_tokens_seen": 3184608, "step": 4845 }, { "epoch": 2.859669811320755, "grad_norm": 1.3589810132980347, "learning_rate": 9.943901871574984e-06, "loss": 0.6009, "num_input_tokens_seen": 3187616, "step": 4850 }, { "epoch": 2.862617924528302, "grad_norm": 1.7746999263763428, "learning_rate": 9.943516913822192e-06, "loss": 0.4873, "num_input_tokens_seen": 3191008, "step": 4855 }, { "epoch": 2.865566037735849, "grad_norm": 4.118255138397217, "learning_rate": 9.943130647251994e-06, "loss": 0.4991, "num_input_tokens_seen": 3194592, "step": 4860 }, { "epoch": 2.8685141509433962, "grad_norm": 1.8358838558197021, "learning_rate": 9.94274307196665e-06, "loss": 0.4599, "num_input_tokens_seen": 3197792, "step": 4865 }, { "epoch": 2.8714622641509435, "grad_norm": 1.4452322721481323, "learning_rate": 9.942354188068778e-06, "loss": 0.4436, "num_input_tokens_seen": 3201568, "step": 4870 }, { "epoch": 2.8744103773584904, "grad_norm": 1.5798513889312744, "learning_rate": 9.941963995661333e-06, "loss": 0.4922, "num_input_tokens_seen": 3204224, "step": 4875 }, { "epoch": 2.8773584905660377, "grad_norm": 1.526102066040039, "learning_rate": 9.941572494847622e-06, "loss": 0.5057, "num_input_tokens_seen": 3207200, "step": 4880 }, { "epoch": 2.880306603773585, "grad_norm": 1.79891037940979, "learning_rate": 9.941179685731297e-06, "loss": 0.4936, "num_input_tokens_seen": 3211488, "step": 4885 }, { "epoch": 2.8832547169811322, "grad_norm": 1.6274664402008057, "learning_rate": 9.940785568416354e-06, "loss": 0.5424, "num_input_tokens_seen": 3214976, "step": 4890 }, { "epoch": 2.8862028301886795, "grad_norm": 2.215467691421509, "learning_rate": 9.940390143007137e-06, "loss": 0.4499, "num_input_tokens_seen": 3218944, "step": 4895 }, { "epoch": 2.8891509433962264, "grad_norm": 3.0093634128570557, "learning_rate": 9.939993409608339e-06, "loss": 0.4112, "num_input_tokens_seen": 3221920, "step": 4900 }, { "epoch": 2.8920990566037736, "grad_norm": 1.3105334043502808, "learning_rate": 9.939595368324996e-06, "loss": 0.4143, "num_input_tokens_seen": 3225344, "step": 4905 }, { "epoch": 2.8950471698113205, "grad_norm": 1.8488482236862183, "learning_rate": 9.93919601926249e-06, "loss": 0.4615, "num_input_tokens_seen": 3228064, "step": 4910 }, { "epoch": 2.8979952830188678, "grad_norm": 1.0685195922851562, "learning_rate": 9.938795362526552e-06, "loss": 0.5836, "num_input_tokens_seen": 3230688, "step": 4915 }, { "epoch": 2.900943396226415, "grad_norm": 1.639742374420166, "learning_rate": 9.938393398223255e-06, "loss": 0.4116, "num_input_tokens_seen": 3233344, "step": 4920 }, { "epoch": 2.9038915094339623, "grad_norm": 1.1223746538162231, "learning_rate": 9.937990126459024e-06, "loss": 0.6701, "num_input_tokens_seen": 3236512, "step": 4925 }, { "epoch": 2.9068396226415096, "grad_norm": 1.2443132400512695, "learning_rate": 9.937585547340624e-06, "loss": 0.3986, "num_input_tokens_seen": 3239616, "step": 4930 }, { "epoch": 2.9097877358490565, "grad_norm": 1.7467105388641357, "learning_rate": 9.937179660975174e-06, "loss": 0.6043, "num_input_tokens_seen": 3242944, "step": 4935 }, { "epoch": 2.9127358490566038, "grad_norm": 2.159405469894409, "learning_rate": 9.936772467470127e-06, "loss": 0.5736, "num_input_tokens_seen": 3246016, "step": 4940 }, { "epoch": 2.915683962264151, "grad_norm": 1.0849558115005493, "learning_rate": 9.936363966933294e-06, "loss": 0.5185, "num_input_tokens_seen": 3250240, "step": 4945 }, { "epoch": 2.918632075471698, "grad_norm": 2.3183724880218506, "learning_rate": 9.935954159472828e-06, "loss": 0.5477, "num_input_tokens_seen": 3253696, "step": 4950 }, { "epoch": 2.921580188679245, "grad_norm": 1.4328734874725342, "learning_rate": 9.935543045197222e-06, "loss": 0.4762, "num_input_tokens_seen": 3257376, "step": 4955 }, { "epoch": 2.9245283018867925, "grad_norm": 2.2161431312561035, "learning_rate": 9.935130624215326e-06, "loss": 0.5166, "num_input_tokens_seen": 3260960, "step": 4960 }, { "epoch": 2.9274764150943398, "grad_norm": 1.389668583869934, "learning_rate": 9.934716896636329e-06, "loss": 0.5227, "num_input_tokens_seen": 3264608, "step": 4965 }, { "epoch": 2.930424528301887, "grad_norm": 2.7577316761016846, "learning_rate": 9.934301862569764e-06, "loss": 0.5639, "num_input_tokens_seen": 3267808, "step": 4970 }, { "epoch": 2.933372641509434, "grad_norm": 1.84599769115448, "learning_rate": 9.933885522125517e-06, "loss": 0.486, "num_input_tokens_seen": 3271200, "step": 4975 }, { "epoch": 2.936320754716981, "grad_norm": 1.3649576902389526, "learning_rate": 9.933467875413813e-06, "loss": 0.5199, "num_input_tokens_seen": 3273920, "step": 4980 }, { "epoch": 2.9392688679245285, "grad_norm": 1.8558845520019531, "learning_rate": 9.933048922545227e-06, "loss": 0.6719, "num_input_tokens_seen": 3276800, "step": 4985 }, { "epoch": 2.9422169811320753, "grad_norm": 2.8185601234436035, "learning_rate": 9.932628663630679e-06, "loss": 0.5416, "num_input_tokens_seen": 3279360, "step": 4990 }, { "epoch": 2.9451650943396226, "grad_norm": 3.7125041484832764, "learning_rate": 9.932207098781432e-06, "loss": 0.4781, "num_input_tokens_seen": 3282048, "step": 4995 }, { "epoch": 2.94811320754717, "grad_norm": 1.6568915843963623, "learning_rate": 9.931784228109102e-06, "loss": 0.424, "num_input_tokens_seen": 3285088, "step": 5000 }, { "epoch": 2.951061320754717, "grad_norm": 1.1679878234863281, "learning_rate": 9.93136005172564e-06, "loss": 0.5663, "num_input_tokens_seen": 3288928, "step": 5005 }, { "epoch": 2.954009433962264, "grad_norm": 2.139047861099243, "learning_rate": 9.930934569743354e-06, "loss": 0.564, "num_input_tokens_seen": 3292160, "step": 5010 }, { "epoch": 2.9569575471698113, "grad_norm": 3.1792049407958984, "learning_rate": 9.930507782274888e-06, "loss": 0.4427, "num_input_tokens_seen": 3295584, "step": 5015 }, { "epoch": 2.9599056603773586, "grad_norm": 2.7713193893432617, "learning_rate": 9.930079689433236e-06, "loss": 0.4886, "num_input_tokens_seen": 3300192, "step": 5020 }, { "epoch": 2.9628537735849054, "grad_norm": 1.7680079936981201, "learning_rate": 9.92965029133174e-06, "loss": 0.5079, "num_input_tokens_seen": 3303104, "step": 5025 }, { "epoch": 2.9658018867924527, "grad_norm": 1.4279202222824097, "learning_rate": 9.929219588084084e-06, "loss": 0.5045, "num_input_tokens_seen": 3305952, "step": 5030 }, { "epoch": 2.96875, "grad_norm": 2.182008743286133, "learning_rate": 9.9287875798043e-06, "loss": 0.4194, "num_input_tokens_seen": 3309440, "step": 5035 }, { "epoch": 2.9716981132075473, "grad_norm": 3.9686636924743652, "learning_rate": 9.92835426660676e-06, "loss": 0.5496, "num_input_tokens_seen": 3312544, "step": 5040 }, { "epoch": 2.9746462264150946, "grad_norm": 2.8005213737487793, "learning_rate": 9.927919648606188e-06, "loss": 0.4636, "num_input_tokens_seen": 3315200, "step": 5045 }, { "epoch": 2.9775943396226414, "grad_norm": 1.5265860557556152, "learning_rate": 9.927483725917652e-06, "loss": 0.4807, "num_input_tokens_seen": 3317952, "step": 5050 }, { "epoch": 2.9805424528301887, "grad_norm": 2.3304340839385986, "learning_rate": 9.927046498656562e-06, "loss": 0.4946, "num_input_tokens_seen": 3320704, "step": 5055 }, { "epoch": 2.983490566037736, "grad_norm": 2.047696113586426, "learning_rate": 9.926607966938679e-06, "loss": 0.4438, "num_input_tokens_seen": 3324064, "step": 5060 }, { "epoch": 2.986438679245283, "grad_norm": 1.3531242609024048, "learning_rate": 9.926168130880103e-06, "loss": 0.3641, "num_input_tokens_seen": 3326720, "step": 5065 }, { "epoch": 2.98938679245283, "grad_norm": 1.5491362810134888, "learning_rate": 9.925726990597283e-06, "loss": 0.6691, "num_input_tokens_seen": 3330688, "step": 5070 }, { "epoch": 2.9923349056603774, "grad_norm": 1.8570971488952637, "learning_rate": 9.925284546207015e-06, "loss": 0.4811, "num_input_tokens_seen": 3333344, "step": 5075 }, { "epoch": 2.9952830188679247, "grad_norm": 2.240931272506714, "learning_rate": 9.924840797826436e-06, "loss": 0.4183, "num_input_tokens_seen": 3336960, "step": 5080 }, { "epoch": 2.9982311320754715, "grad_norm": 1.6752545833587646, "learning_rate": 9.924395745573029e-06, "loss": 0.4927, "num_input_tokens_seen": 3339776, "step": 5085 }, { "epoch": 3.001179245283019, "grad_norm": 1.3970379829406738, "learning_rate": 9.923949389564629e-06, "loss": 0.6013, "num_input_tokens_seen": 3342304, "step": 5090 }, { "epoch": 3.004127358490566, "grad_norm": 1.4951919317245483, "learning_rate": 9.923501729919404e-06, "loss": 0.4931, "num_input_tokens_seen": 3345632, "step": 5095 }, { "epoch": 3.0070754716981134, "grad_norm": 1.112223744392395, "learning_rate": 9.923052766755878e-06, "loss": 0.4941, "num_input_tokens_seen": 3348800, "step": 5100 }, { "epoch": 3.0100235849056602, "grad_norm": 1.2696794271469116, "learning_rate": 9.922602500192914e-06, "loss": 0.4785, "num_input_tokens_seen": 3352160, "step": 5105 }, { "epoch": 3.0129716981132075, "grad_norm": 1.219128966331482, "learning_rate": 9.922150930349725e-06, "loss": 0.5013, "num_input_tokens_seen": 3355360, "step": 5110 }, { "epoch": 3.015919811320755, "grad_norm": 1.2822624444961548, "learning_rate": 9.921698057345863e-06, "loss": 0.5104, "num_input_tokens_seen": 3358560, "step": 5115 }, { "epoch": 3.018867924528302, "grad_norm": 1.6639991998672485, "learning_rate": 9.921243881301229e-06, "loss": 0.4714, "num_input_tokens_seen": 3361664, "step": 5120 }, { "epoch": 3.021816037735849, "grad_norm": 2.7235310077667236, "learning_rate": 9.920788402336068e-06, "loss": 0.4314, "num_input_tokens_seen": 3364640, "step": 5125 }, { "epoch": 3.0247641509433962, "grad_norm": 1.2460765838623047, "learning_rate": 9.92033162057097e-06, "loss": 0.4231, "num_input_tokens_seen": 3368608, "step": 5130 }, { "epoch": 3.0277122641509435, "grad_norm": 1.6496349573135376, "learning_rate": 9.919873536126869e-06, "loss": 0.3804, "num_input_tokens_seen": 3371712, "step": 5135 }, { "epoch": 3.0306603773584904, "grad_norm": 2.1768150329589844, "learning_rate": 9.919414149125046e-06, "loss": 0.4291, "num_input_tokens_seen": 3374880, "step": 5140 }, { "epoch": 3.0336084905660377, "grad_norm": 1.6689797639846802, "learning_rate": 9.918953459687126e-06, "loss": 0.3712, "num_input_tokens_seen": 3380768, "step": 5145 }, { "epoch": 3.036556603773585, "grad_norm": 2.894310235977173, "learning_rate": 9.918491467935078e-06, "loss": 0.3785, "num_input_tokens_seen": 3383744, "step": 5150 }, { "epoch": 3.0395047169811322, "grad_norm": 1.0824196338653564, "learning_rate": 9.918028173991218e-06, "loss": 0.3651, "num_input_tokens_seen": 3386912, "step": 5155 }, { "epoch": 3.042452830188679, "grad_norm": 1.9404481649398804, "learning_rate": 9.917563577978202e-06, "loss": 0.4172, "num_input_tokens_seen": 3391072, "step": 5160 }, { "epoch": 3.0454009433962264, "grad_norm": 1.5985602140426636, "learning_rate": 9.917097680019035e-06, "loss": 0.5447, "num_input_tokens_seen": 3393824, "step": 5165 }, { "epoch": 3.0483490566037736, "grad_norm": 1.8965239524841309, "learning_rate": 9.916630480237066e-06, "loss": 0.3909, "num_input_tokens_seen": 3396832, "step": 5170 }, { "epoch": 3.051297169811321, "grad_norm": 1.0448908805847168, "learning_rate": 9.916161978755988e-06, "loss": 0.4754, "num_input_tokens_seen": 3400128, "step": 5175 }, { "epoch": 3.0542452830188678, "grad_norm": 3.1997549533843994, "learning_rate": 9.915692175699838e-06, "loss": 0.5048, "num_input_tokens_seen": 3403712, "step": 5180 }, { "epoch": 3.057193396226415, "grad_norm": 1.875261664390564, "learning_rate": 9.915221071193e-06, "loss": 0.521, "num_input_tokens_seen": 3406720, "step": 5185 }, { "epoch": 3.0601415094339623, "grad_norm": 1.9905083179473877, "learning_rate": 9.914748665360199e-06, "loss": 0.5251, "num_input_tokens_seen": 3409568, "step": 5190 }, { "epoch": 3.0630896226415096, "grad_norm": 1.9075783491134644, "learning_rate": 9.914274958326507e-06, "loss": 0.5182, "num_input_tokens_seen": 3412480, "step": 5195 }, { "epoch": 3.0660377358490565, "grad_norm": 1.583446979522705, "learning_rate": 9.913799950217341e-06, "loss": 0.5258, "num_input_tokens_seen": 3416288, "step": 5200 }, { "epoch": 3.0689858490566038, "grad_norm": 3.345060348510742, "learning_rate": 9.91332364115846e-06, "loss": 0.5167, "num_input_tokens_seen": 3419584, "step": 5205 }, { "epoch": 3.071933962264151, "grad_norm": 1.637128472328186, "learning_rate": 9.912846031275972e-06, "loss": 0.5845, "num_input_tokens_seen": 3422784, "step": 5210 }, { "epoch": 3.074882075471698, "grad_norm": 2.889280319213867, "learning_rate": 9.912367120696322e-06, "loss": 0.4552, "num_input_tokens_seen": 3425184, "step": 5215 }, { "epoch": 3.077830188679245, "grad_norm": 2.022205352783203, "learning_rate": 9.911886909546307e-06, "loss": 0.5608, "num_input_tokens_seen": 3428032, "step": 5220 }, { "epoch": 3.0807783018867925, "grad_norm": 1.0598067045211792, "learning_rate": 9.911405397953063e-06, "loss": 0.3748, "num_input_tokens_seen": 3432096, "step": 5225 }, { "epoch": 3.0837264150943398, "grad_norm": 2.2717177867889404, "learning_rate": 9.910922586044073e-06, "loss": 0.5234, "num_input_tokens_seen": 3434848, "step": 5230 }, { "epoch": 3.0866745283018866, "grad_norm": 1.9052342176437378, "learning_rate": 9.910438473947163e-06, "loss": 0.4048, "num_input_tokens_seen": 3437312, "step": 5235 }, { "epoch": 3.089622641509434, "grad_norm": 2.0371739864349365, "learning_rate": 9.909953061790506e-06, "loss": 0.5122, "num_input_tokens_seen": 3440800, "step": 5240 }, { "epoch": 3.092570754716981, "grad_norm": 1.294731616973877, "learning_rate": 9.909466349702613e-06, "loss": 0.7618, "num_input_tokens_seen": 3444192, "step": 5245 }, { "epoch": 3.0955188679245285, "grad_norm": 1.3287591934204102, "learning_rate": 9.908978337812348e-06, "loss": 0.5609, "num_input_tokens_seen": 3447232, "step": 5250 }, { "epoch": 3.0984669811320753, "grad_norm": 1.6527490615844727, "learning_rate": 9.908489026248909e-06, "loss": 0.5503, "num_input_tokens_seen": 3450592, "step": 5255 }, { "epoch": 3.1014150943396226, "grad_norm": 2.03985333442688, "learning_rate": 9.907998415141846e-06, "loss": 0.4299, "num_input_tokens_seen": 3453632, "step": 5260 }, { "epoch": 3.10436320754717, "grad_norm": 1.2423347234725952, "learning_rate": 9.907506504621052e-06, "loss": 0.5229, "num_input_tokens_seen": 3456384, "step": 5265 }, { "epoch": 3.107311320754717, "grad_norm": 2.142000198364258, "learning_rate": 9.907013294816759e-06, "loss": 0.5522, "num_input_tokens_seen": 3459104, "step": 5270 }, { "epoch": 3.110259433962264, "grad_norm": 2.6554131507873535, "learning_rate": 9.906518785859548e-06, "loss": 0.4367, "num_input_tokens_seen": 3461920, "step": 5275 }, { "epoch": 3.1132075471698113, "grad_norm": 1.6144039630889893, "learning_rate": 9.906022977880344e-06, "loss": 0.4205, "num_input_tokens_seen": 3464736, "step": 5280 }, { "epoch": 3.1161556603773586, "grad_norm": 2.9695677757263184, "learning_rate": 9.905525871010412e-06, "loss": 0.4689, "num_input_tokens_seen": 3468096, "step": 5285 }, { "epoch": 3.119103773584906, "grad_norm": 1.6211638450622559, "learning_rate": 9.905027465381363e-06, "loss": 0.3391, "num_input_tokens_seen": 3471200, "step": 5290 }, { "epoch": 3.1220518867924527, "grad_norm": 2.400726318359375, "learning_rate": 9.904527761125155e-06, "loss": 0.6238, "num_input_tokens_seen": 3475104, "step": 5295 }, { "epoch": 3.125, "grad_norm": 2.1816389560699463, "learning_rate": 9.904026758374083e-06, "loss": 0.3958, "num_input_tokens_seen": 3478688, "step": 5300 }, { "epoch": 3.1279481132075473, "grad_norm": 3.17679762840271, "learning_rate": 9.903524457260794e-06, "loss": 0.4787, "num_input_tokens_seen": 3482336, "step": 5305 }, { "epoch": 3.1308962264150946, "grad_norm": 1.7362056970596313, "learning_rate": 9.90302085791827e-06, "loss": 0.5141, "num_input_tokens_seen": 3485824, "step": 5310 }, { "epoch": 3.1338443396226414, "grad_norm": 1.6322377920150757, "learning_rate": 9.902515960479844e-06, "loss": 0.5199, "num_input_tokens_seen": 3488768, "step": 5315 }, { "epoch": 3.1367924528301887, "grad_norm": 3.6361188888549805, "learning_rate": 9.902009765079188e-06, "loss": 0.6753, "num_input_tokens_seen": 3491456, "step": 5320 }, { "epoch": 3.139740566037736, "grad_norm": 2.024271249771118, "learning_rate": 9.90150227185032e-06, "loss": 0.3746, "num_input_tokens_seen": 3493888, "step": 5325 }, { "epoch": 3.142688679245283, "grad_norm": 1.3443235158920288, "learning_rate": 9.900993480927603e-06, "loss": 0.5664, "num_input_tokens_seen": 3497312, "step": 5330 }, { "epoch": 3.14563679245283, "grad_norm": 2.301670789718628, "learning_rate": 9.90048339244574e-06, "loss": 0.4733, "num_input_tokens_seen": 3500608, "step": 5335 }, { "epoch": 3.1485849056603774, "grad_norm": 1.2122799158096313, "learning_rate": 9.899972006539776e-06, "loss": 0.557, "num_input_tokens_seen": 3504128, "step": 5340 }, { "epoch": 3.1515330188679247, "grad_norm": 1.9267774820327759, "learning_rate": 9.899459323345106e-06, "loss": 0.4494, "num_input_tokens_seen": 3507392, "step": 5345 }, { "epoch": 3.1544811320754715, "grad_norm": 2.446427822113037, "learning_rate": 9.898945342997467e-06, "loss": 0.4936, "num_input_tokens_seen": 3510528, "step": 5350 }, { "epoch": 3.157429245283019, "grad_norm": 1.4240705966949463, "learning_rate": 9.898430065632933e-06, "loss": 0.4219, "num_input_tokens_seen": 3513920, "step": 5355 }, { "epoch": 3.160377358490566, "grad_norm": 1.725625991821289, "learning_rate": 9.897913491387929e-06, "loss": 0.5765, "num_input_tokens_seen": 3516832, "step": 5360 }, { "epoch": 3.1633254716981134, "grad_norm": 2.459773302078247, "learning_rate": 9.897395620399219e-06, "loss": 0.5293, "num_input_tokens_seen": 3519648, "step": 5365 }, { "epoch": 3.1662735849056602, "grad_norm": 1.116410732269287, "learning_rate": 9.896876452803913e-06, "loss": 0.4163, "num_input_tokens_seen": 3522304, "step": 5370 }, { "epoch": 3.1692216981132075, "grad_norm": 1.3724499940872192, "learning_rate": 9.896355988739461e-06, "loss": 0.3675, "num_input_tokens_seen": 3525728, "step": 5375 }, { "epoch": 3.172169811320755, "grad_norm": 2.9763824939727783, "learning_rate": 9.895834228343658e-06, "loss": 0.4451, "num_input_tokens_seen": 3528448, "step": 5380 }, { "epoch": 3.175117924528302, "grad_norm": 1.85183846950531, "learning_rate": 9.895311171754644e-06, "loss": 0.4109, "num_input_tokens_seen": 3532832, "step": 5385 }, { "epoch": 3.178066037735849, "grad_norm": 1.3029963970184326, "learning_rate": 9.8947868191109e-06, "loss": 0.635, "num_input_tokens_seen": 3537440, "step": 5390 }, { "epoch": 3.1810141509433962, "grad_norm": 2.6454505920410156, "learning_rate": 9.894261170551249e-06, "loss": 0.4232, "num_input_tokens_seen": 3540160, "step": 5395 }, { "epoch": 3.1839622641509435, "grad_norm": 0.8761743307113647, "learning_rate": 9.893734226214861e-06, "loss": 0.4372, "num_input_tokens_seen": 3543616, "step": 5400 }, { "epoch": 3.1869103773584904, "grad_norm": 2.9375221729278564, "learning_rate": 9.893205986241246e-06, "loss": 0.5006, "num_input_tokens_seen": 3547520, "step": 5405 }, { "epoch": 3.1898584905660377, "grad_norm": 1.719679594039917, "learning_rate": 9.892676450770257e-06, "loss": 0.3915, "num_input_tokens_seen": 3550016, "step": 5410 }, { "epoch": 3.192806603773585, "grad_norm": 1.4372625350952148, "learning_rate": 9.892145619942092e-06, "loss": 0.4061, "num_input_tokens_seen": 3553056, "step": 5415 }, { "epoch": 3.1957547169811322, "grad_norm": 1.7672806978225708, "learning_rate": 9.891613493897289e-06, "loss": 0.4427, "num_input_tokens_seen": 3556384, "step": 5420 }, { "epoch": 3.198702830188679, "grad_norm": 2.5836451053619385, "learning_rate": 9.891080072776733e-06, "loss": 0.4176, "num_input_tokens_seen": 3559008, "step": 5425 }, { "epoch": 3.2016509433962264, "grad_norm": 1.144535779953003, "learning_rate": 9.890545356721649e-06, "loss": 0.3677, "num_input_tokens_seen": 3562752, "step": 5430 }, { "epoch": 3.2045990566037736, "grad_norm": 2.594238042831421, "learning_rate": 9.890009345873603e-06, "loss": 0.5071, "num_input_tokens_seen": 3566624, "step": 5435 }, { "epoch": 3.207547169811321, "grad_norm": 5.060590744018555, "learning_rate": 9.889472040374509e-06, "loss": 0.3751, "num_input_tokens_seen": 3569760, "step": 5440 }, { "epoch": 3.2104952830188678, "grad_norm": 1.412011981010437, "learning_rate": 9.88893344036662e-06, "loss": 0.4028, "num_input_tokens_seen": 3573280, "step": 5445 }, { "epoch": 3.213443396226415, "grad_norm": 1.7627023458480835, "learning_rate": 9.888393545992531e-06, "loss": 0.3511, "num_input_tokens_seen": 3576352, "step": 5450 }, { "epoch": 3.2163915094339623, "grad_norm": 1.2453337907791138, "learning_rate": 9.887852357395184e-06, "loss": 0.4917, "num_input_tokens_seen": 3580448, "step": 5455 }, { "epoch": 3.2193396226415096, "grad_norm": 1.3632738590240479, "learning_rate": 9.88730987471786e-06, "loss": 0.3643, "num_input_tokens_seen": 3584000, "step": 5460 }, { "epoch": 3.2222877358490565, "grad_norm": 1.5906684398651123, "learning_rate": 9.886766098104183e-06, "loss": 0.5025, "num_input_tokens_seen": 3587040, "step": 5465 }, { "epoch": 3.2252358490566038, "grad_norm": 3.1291592121124268, "learning_rate": 9.886221027698122e-06, "loss": 0.3698, "num_input_tokens_seen": 3590752, "step": 5470 }, { "epoch": 3.228183962264151, "grad_norm": 2.1609694957733154, "learning_rate": 9.885674663643983e-06, "loss": 0.4715, "num_input_tokens_seen": 3594080, "step": 5475 }, { "epoch": 3.231132075471698, "grad_norm": 2.0530693531036377, "learning_rate": 9.885127006086423e-06, "loss": 0.5441, "num_input_tokens_seen": 3597024, "step": 5480 }, { "epoch": 3.234080188679245, "grad_norm": 1.3710923194885254, "learning_rate": 9.884578055170434e-06, "loss": 0.4411, "num_input_tokens_seen": 3600160, "step": 5485 }, { "epoch": 3.2370283018867925, "grad_norm": 1.5020803213119507, "learning_rate": 9.884027811041353e-06, "loss": 0.4467, "num_input_tokens_seen": 3603328, "step": 5490 }, { "epoch": 3.2399764150943398, "grad_norm": 1.2672921419143677, "learning_rate": 9.883476273844861e-06, "loss": 0.4186, "num_input_tokens_seen": 3606464, "step": 5495 }, { "epoch": 3.2429245283018866, "grad_norm": 1.564405918121338, "learning_rate": 9.882923443726977e-06, "loss": 0.342, "num_input_tokens_seen": 3609632, "step": 5500 }, { "epoch": 3.245872641509434, "grad_norm": 3.2800121307373047, "learning_rate": 9.882369320834068e-06, "loss": 0.6523, "num_input_tokens_seen": 3612704, "step": 5505 }, { "epoch": 3.248820754716981, "grad_norm": 1.2520514726638794, "learning_rate": 9.88181390531284e-06, "loss": 0.4496, "num_input_tokens_seen": 3615328, "step": 5510 }, { "epoch": 3.2517688679245285, "grad_norm": 3.684345245361328, "learning_rate": 9.88125719731034e-06, "loss": 0.463, "num_input_tokens_seen": 3619136, "step": 5515 }, { "epoch": 3.2547169811320753, "grad_norm": 1.43419349193573, "learning_rate": 9.880699196973962e-06, "loss": 0.5809, "num_input_tokens_seen": 3623424, "step": 5520 }, { "epoch": 3.2576650943396226, "grad_norm": 1.3932815790176392, "learning_rate": 9.880139904451436e-06, "loss": 0.4946, "num_input_tokens_seen": 3626272, "step": 5525 }, { "epoch": 3.26061320754717, "grad_norm": 1.1739698648452759, "learning_rate": 9.879579319890838e-06, "loss": 0.4275, "num_input_tokens_seen": 3629952, "step": 5530 }, { "epoch": 3.263561320754717, "grad_norm": 1.5688470602035522, "learning_rate": 9.879017443440584e-06, "loss": 0.5178, "num_input_tokens_seen": 3633216, "step": 5535 }, { "epoch": 3.266509433962264, "grad_norm": 2.3204853534698486, "learning_rate": 9.878454275249436e-06, "loss": 0.538, "num_input_tokens_seen": 3636128, "step": 5540 }, { "epoch": 3.2694575471698113, "grad_norm": 1.3634610176086426, "learning_rate": 9.877889815466493e-06, "loss": 0.5195, "num_input_tokens_seen": 3639392, "step": 5545 }, { "epoch": 3.2724056603773586, "grad_norm": 1.6105164289474487, "learning_rate": 9.877324064241198e-06, "loss": 0.6111, "num_input_tokens_seen": 3642400, "step": 5550 }, { "epoch": 3.2753537735849054, "grad_norm": 1.2571817636489868, "learning_rate": 9.876757021723338e-06, "loss": 0.5202, "num_input_tokens_seen": 3646880, "step": 5555 }, { "epoch": 3.2783018867924527, "grad_norm": 1.9303092956542969, "learning_rate": 9.876188688063038e-06, "loss": 0.349, "num_input_tokens_seen": 3649696, "step": 5560 }, { "epoch": 3.28125, "grad_norm": 1.2904233932495117, "learning_rate": 9.875619063410768e-06, "loss": 0.4766, "num_input_tokens_seen": 3654272, "step": 5565 }, { "epoch": 3.2841981132075473, "grad_norm": 1.8048505783081055, "learning_rate": 9.875048147917339e-06, "loss": 0.5191, "num_input_tokens_seen": 3656992, "step": 5570 }, { "epoch": 3.2871462264150946, "grad_norm": 1.08250093460083, "learning_rate": 9.874475941733902e-06, "loss": 0.4387, "num_input_tokens_seen": 3659456, "step": 5575 }, { "epoch": 3.2900943396226414, "grad_norm": 2.644730567932129, "learning_rate": 9.873902445011952e-06, "loss": 0.484, "num_input_tokens_seen": 3661888, "step": 5580 }, { "epoch": 3.2930424528301887, "grad_norm": 2.8981356620788574, "learning_rate": 9.873327657903324e-06, "loss": 0.3877, "num_input_tokens_seen": 3664864, "step": 5585 }, { "epoch": 3.295990566037736, "grad_norm": 1.432806372642517, "learning_rate": 9.872751580560194e-06, "loss": 0.5104, "num_input_tokens_seen": 3667584, "step": 5590 }, { "epoch": 3.298938679245283, "grad_norm": 1.5874006748199463, "learning_rate": 9.872174213135084e-06, "loss": 0.4405, "num_input_tokens_seen": 3670432, "step": 5595 }, { "epoch": 3.30188679245283, "grad_norm": 2.1102523803710938, "learning_rate": 9.871595555780855e-06, "loss": 0.5244, "num_input_tokens_seen": 3673920, "step": 5600 }, { "epoch": 3.3048349056603774, "grad_norm": 1.930662989616394, "learning_rate": 9.871015608650705e-06, "loss": 0.4765, "num_input_tokens_seen": 3676672, "step": 5605 }, { "epoch": 3.3077830188679247, "grad_norm": 2.367018938064575, "learning_rate": 9.870434371898182e-06, "loss": 0.6418, "num_input_tokens_seen": 3683776, "step": 5610 }, { "epoch": 3.3107311320754715, "grad_norm": 1.6988190412521362, "learning_rate": 9.869851845677165e-06, "loss": 0.443, "num_input_tokens_seen": 3686752, "step": 5615 }, { "epoch": 3.313679245283019, "grad_norm": 1.4109981060028076, "learning_rate": 9.869268030141886e-06, "loss": 0.5254, "num_input_tokens_seen": 3690464, "step": 5620 }, { "epoch": 3.316627358490566, "grad_norm": 8.77749252319336, "learning_rate": 9.86868292544691e-06, "loss": 0.4536, "num_input_tokens_seen": 3694112, "step": 5625 }, { "epoch": 3.3195754716981134, "grad_norm": 1.6550644636154175, "learning_rate": 9.868096531747149e-06, "loss": 0.3649, "num_input_tokens_seen": 3696928, "step": 5630 }, { "epoch": 3.3225235849056602, "grad_norm": 3.6202762126922607, "learning_rate": 9.867508849197848e-06, "loss": 0.417, "num_input_tokens_seen": 3700480, "step": 5635 }, { "epoch": 3.3254716981132075, "grad_norm": 2.17661714553833, "learning_rate": 9.866919877954602e-06, "loss": 0.4521, "num_input_tokens_seen": 3704128, "step": 5640 }, { "epoch": 3.328419811320755, "grad_norm": 3.737755060195923, "learning_rate": 9.866329618173344e-06, "loss": 0.5264, "num_input_tokens_seen": 3707488, "step": 5645 }, { "epoch": 3.331367924528302, "grad_norm": 1.2112324237823486, "learning_rate": 9.865738070010346e-06, "loss": 0.4532, "num_input_tokens_seen": 3711200, "step": 5650 }, { "epoch": 3.334316037735849, "grad_norm": 2.3200461864471436, "learning_rate": 9.865145233622223e-06, "loss": 0.358, "num_input_tokens_seen": 3713824, "step": 5655 }, { "epoch": 3.3372641509433962, "grad_norm": 1.6822304725646973, "learning_rate": 9.864551109165935e-06, "loss": 0.5073, "num_input_tokens_seen": 3716416, "step": 5660 }, { "epoch": 3.3402122641509435, "grad_norm": 1.8871996402740479, "learning_rate": 9.863955696798773e-06, "loss": 0.6263, "num_input_tokens_seen": 3719712, "step": 5665 }, { "epoch": 3.3431603773584904, "grad_norm": 1.7739585638046265, "learning_rate": 9.863358996678378e-06, "loss": 0.4863, "num_input_tokens_seen": 3722720, "step": 5670 }, { "epoch": 3.3461084905660377, "grad_norm": 1.9164766073226929, "learning_rate": 9.86276100896273e-06, "loss": 0.5034, "num_input_tokens_seen": 3725408, "step": 5675 }, { "epoch": 3.349056603773585, "grad_norm": 1.1103459596633911, "learning_rate": 9.862161733810147e-06, "loss": 0.4648, "num_input_tokens_seen": 3729600, "step": 5680 }, { "epoch": 3.3520047169811322, "grad_norm": 1.1326202154159546, "learning_rate": 9.86156117137929e-06, "loss": 0.4823, "num_input_tokens_seen": 3733120, "step": 5685 }, { "epoch": 3.354952830188679, "grad_norm": 2.2033803462982178, "learning_rate": 9.860959321829159e-06, "loss": 0.5835, "num_input_tokens_seen": 3735968, "step": 5690 }, { "epoch": 3.3579009433962264, "grad_norm": 1.1811704635620117, "learning_rate": 9.860356185319102e-06, "loss": 0.3785, "num_input_tokens_seen": 3740960, "step": 5695 }, { "epoch": 3.3608490566037736, "grad_norm": 0.7767415642738342, "learning_rate": 9.859751762008796e-06, "loss": 0.4382, "num_input_tokens_seen": 3744192, "step": 5700 }, { "epoch": 3.363797169811321, "grad_norm": 3.2022221088409424, "learning_rate": 9.859146052058266e-06, "loss": 0.37, "num_input_tokens_seen": 3746912, "step": 5705 }, { "epoch": 3.3667452830188678, "grad_norm": 1.8211758136749268, "learning_rate": 9.858539055627876e-06, "loss": 0.6268, "num_input_tokens_seen": 3749920, "step": 5710 }, { "epoch": 3.369693396226415, "grad_norm": 0.9084793925285339, "learning_rate": 9.857930772878333e-06, "loss": 0.4571, "num_input_tokens_seen": 3752928, "step": 5715 }, { "epoch": 3.3726415094339623, "grad_norm": 6.581668853759766, "learning_rate": 9.857321203970682e-06, "loss": 0.5333, "num_input_tokens_seen": 3755968, "step": 5720 }, { "epoch": 3.3755896226415096, "grad_norm": 1.9784777164459229, "learning_rate": 9.856710349066307e-06, "loss": 0.4443, "num_input_tokens_seen": 3758336, "step": 5725 }, { "epoch": 3.3785377358490565, "grad_norm": 3.289888381958008, "learning_rate": 9.856098208326937e-06, "loss": 0.4655, "num_input_tokens_seen": 3761728, "step": 5730 }, { "epoch": 3.3814858490566038, "grad_norm": 1.7536498308181763, "learning_rate": 9.855484781914639e-06, "loss": 0.4973, "num_input_tokens_seen": 3765312, "step": 5735 }, { "epoch": 3.384433962264151, "grad_norm": 2.5265510082244873, "learning_rate": 9.854870069991817e-06, "loss": 0.5498, "num_input_tokens_seen": 3770176, "step": 5740 }, { "epoch": 3.387382075471698, "grad_norm": 1.5065233707427979, "learning_rate": 9.854254072721222e-06, "loss": 0.4566, "num_input_tokens_seen": 3772896, "step": 5745 }, { "epoch": 3.390330188679245, "grad_norm": 1.3384212255477905, "learning_rate": 9.853636790265938e-06, "loss": 0.4236, "num_input_tokens_seen": 3776000, "step": 5750 }, { "epoch": 3.3932783018867925, "grad_norm": 1.774455189704895, "learning_rate": 9.853018222789397e-06, "loss": 0.4282, "num_input_tokens_seen": 3779008, "step": 5755 }, { "epoch": 3.3962264150943398, "grad_norm": 1.5046420097351074, "learning_rate": 9.852398370455367e-06, "loss": 0.4457, "num_input_tokens_seen": 3781856, "step": 5760 }, { "epoch": 3.3991745283018866, "grad_norm": 1.7315226793289185, "learning_rate": 9.851777233427955e-06, "loss": 0.3952, "num_input_tokens_seen": 3785504, "step": 5765 }, { "epoch": 3.402122641509434, "grad_norm": 2.0062856674194336, "learning_rate": 9.85115481187161e-06, "loss": 0.5202, "num_input_tokens_seen": 3789440, "step": 5770 }, { "epoch": 3.405070754716981, "grad_norm": 1.9203672409057617, "learning_rate": 9.850531105951123e-06, "loss": 0.6009, "num_input_tokens_seen": 3792480, "step": 5775 }, { "epoch": 3.4080188679245285, "grad_norm": 1.225533366203308, "learning_rate": 9.84990611583162e-06, "loss": 0.5994, "num_input_tokens_seen": 3795552, "step": 5780 }, { "epoch": 3.4109669811320753, "grad_norm": 4.6309814453125, "learning_rate": 9.849279841678572e-06, "loss": 0.5078, "num_input_tokens_seen": 3799008, "step": 5785 }, { "epoch": 3.4139150943396226, "grad_norm": 2.3755407333374023, "learning_rate": 9.848652283657785e-06, "loss": 0.4884, "num_input_tokens_seen": 3802080, "step": 5790 }, { "epoch": 3.41686320754717, "grad_norm": 1.1990405321121216, "learning_rate": 9.848023441935411e-06, "loss": 0.4383, "num_input_tokens_seen": 3804960, "step": 5795 }, { "epoch": 3.419811320754717, "grad_norm": 1.2227355241775513, "learning_rate": 9.847393316677935e-06, "loss": 0.5677, "num_input_tokens_seen": 3807488, "step": 5800 }, { "epoch": 3.422759433962264, "grad_norm": 1.5516400337219238, "learning_rate": 9.846761908052188e-06, "loss": 0.4407, "num_input_tokens_seen": 3810944, "step": 5805 }, { "epoch": 3.4257075471698113, "grad_norm": 1.6039471626281738, "learning_rate": 9.846129216225338e-06, "loss": 0.3562, "num_input_tokens_seen": 3815200, "step": 5810 }, { "epoch": 3.4286556603773586, "grad_norm": 1.7080188989639282, "learning_rate": 9.845495241364892e-06, "loss": 0.6217, "num_input_tokens_seen": 3818368, "step": 5815 }, { "epoch": 3.4316037735849054, "grad_norm": 1.9222426414489746, "learning_rate": 9.844859983638696e-06, "loss": 0.4727, "num_input_tokens_seen": 3821888, "step": 5820 }, { "epoch": 3.4345518867924527, "grad_norm": 1.4508196115493774, "learning_rate": 9.844223443214942e-06, "loss": 0.3746, "num_input_tokens_seen": 3825760, "step": 5825 }, { "epoch": 3.4375, "grad_norm": 1.4531629085540771, "learning_rate": 9.843585620262153e-06, "loss": 0.4409, "num_input_tokens_seen": 3827968, "step": 5830 }, { "epoch": 3.4404481132075473, "grad_norm": 1.8350129127502441, "learning_rate": 9.842946514949197e-06, "loss": 0.4489, "num_input_tokens_seen": 3830368, "step": 5835 }, { "epoch": 3.4433962264150946, "grad_norm": 1.6987487077713013, "learning_rate": 9.842306127445279e-06, "loss": 0.4516, "num_input_tokens_seen": 3833568, "step": 5840 }, { "epoch": 3.4463443396226414, "grad_norm": 1.071814775466919, "learning_rate": 9.841664457919944e-06, "loss": 0.32, "num_input_tokens_seen": 3836672, "step": 5845 }, { "epoch": 3.4492924528301887, "grad_norm": 1.3683668375015259, "learning_rate": 9.841021506543079e-06, "loss": 0.4361, "num_input_tokens_seen": 3839808, "step": 5850 }, { "epoch": 3.452240566037736, "grad_norm": 1.2174758911132812, "learning_rate": 9.840377273484904e-06, "loss": 0.4268, "num_input_tokens_seen": 3844320, "step": 5855 }, { "epoch": 3.455188679245283, "grad_norm": 2.112711191177368, "learning_rate": 9.839731758915986e-06, "loss": 0.4648, "num_input_tokens_seen": 3847392, "step": 5860 }, { "epoch": 3.45813679245283, "grad_norm": 1.5011917352676392, "learning_rate": 9.839084963007226e-06, "loss": 0.6937, "num_input_tokens_seen": 3850304, "step": 5865 }, { "epoch": 3.4610849056603774, "grad_norm": 1.7322611808776855, "learning_rate": 9.838436885929868e-06, "loss": 0.4577, "num_input_tokens_seen": 3853440, "step": 5870 }, { "epoch": 3.4640330188679247, "grad_norm": 1.904584527015686, "learning_rate": 9.837787527855492e-06, "loss": 0.4963, "num_input_tokens_seen": 3856512, "step": 5875 }, { "epoch": 3.4669811320754715, "grad_norm": 3.0128376483917236, "learning_rate": 9.837136888956017e-06, "loss": 0.5292, "num_input_tokens_seen": 3859264, "step": 5880 }, { "epoch": 3.469929245283019, "grad_norm": 1.731585144996643, "learning_rate": 9.836484969403705e-06, "loss": 0.6351, "num_input_tokens_seen": 3862528, "step": 5885 }, { "epoch": 3.472877358490566, "grad_norm": 1.4423681497573853, "learning_rate": 9.835831769371152e-06, "loss": 0.3795, "num_input_tokens_seen": 3865280, "step": 5890 }, { "epoch": 3.4758254716981134, "grad_norm": 2.2970969676971436, "learning_rate": 9.835177289031298e-06, "loss": 0.4103, "num_input_tokens_seen": 3869024, "step": 5895 }, { "epoch": 3.4787735849056602, "grad_norm": 1.5829449892044067, "learning_rate": 9.834521528557419e-06, "loss": 0.3535, "num_input_tokens_seen": 3872160, "step": 5900 }, { "epoch": 3.4817216981132075, "grad_norm": 1.3686167001724243, "learning_rate": 9.833864488123128e-06, "loss": 0.5087, "num_input_tokens_seen": 3874752, "step": 5905 }, { "epoch": 3.484669811320755, "grad_norm": 1.9928351640701294, "learning_rate": 9.83320616790238e-06, "loss": 0.4137, "num_input_tokens_seen": 3878592, "step": 5910 }, { "epoch": 3.487617924528302, "grad_norm": 1.3079016208648682, "learning_rate": 9.832546568069472e-06, "loss": 0.4066, "num_input_tokens_seen": 3881344, "step": 5915 }, { "epoch": 3.490566037735849, "grad_norm": 2.3188681602478027, "learning_rate": 9.831885688799031e-06, "loss": 0.5626, "num_input_tokens_seen": 3884992, "step": 5920 }, { "epoch": 3.4935141509433962, "grad_norm": 1.0436007976531982, "learning_rate": 9.83122353026603e-06, "loss": 0.4574, "num_input_tokens_seen": 3888384, "step": 5925 }, { "epoch": 3.4964622641509435, "grad_norm": 2.4589107036590576, "learning_rate": 9.830560092645778e-06, "loss": 0.5224, "num_input_tokens_seen": 3892544, "step": 5930 }, { "epoch": 3.4994103773584904, "grad_norm": 1.3309943675994873, "learning_rate": 9.829895376113923e-06, "loss": 0.5574, "num_input_tokens_seen": 3896672, "step": 5935 }, { "epoch": 3.5023584905660377, "grad_norm": 5.762523174285889, "learning_rate": 9.829229380846452e-06, "loss": 0.4483, "num_input_tokens_seen": 3901824, "step": 5940 }, { "epoch": 3.505306603773585, "grad_norm": 1.4023840427398682, "learning_rate": 9.82856210701969e-06, "loss": 0.3994, "num_input_tokens_seen": 3904864, "step": 5945 }, { "epoch": 3.5082547169811322, "grad_norm": 3.000561237335205, "learning_rate": 9.827893554810298e-06, "loss": 0.4136, "num_input_tokens_seen": 3907168, "step": 5950 }, { "epoch": 3.5112028301886795, "grad_norm": 1.616960048675537, "learning_rate": 9.827223724395281e-06, "loss": 0.4938, "num_input_tokens_seen": 3910880, "step": 5955 }, { "epoch": 3.5141509433962264, "grad_norm": 2.1842644214630127, "learning_rate": 9.82655261595198e-06, "loss": 0.5869, "num_input_tokens_seen": 3914368, "step": 5960 }, { "epoch": 3.5170990566037736, "grad_norm": 2.396101713180542, "learning_rate": 9.825880229658073e-06, "loss": 0.6059, "num_input_tokens_seen": 3918048, "step": 5965 }, { "epoch": 3.5200471698113205, "grad_norm": 2.680884599685669, "learning_rate": 9.825206565691576e-06, "loss": 0.5197, "num_input_tokens_seen": 3922112, "step": 5970 }, { "epoch": 3.5229952830188678, "grad_norm": 1.0902928113937378, "learning_rate": 9.824531624230844e-06, "loss": 0.5199, "num_input_tokens_seen": 3925792, "step": 5975 }, { "epoch": 3.525943396226415, "grad_norm": 4.570259094238281, "learning_rate": 9.823855405454573e-06, "loss": 0.5727, "num_input_tokens_seen": 3928096, "step": 5980 }, { "epoch": 3.5288915094339623, "grad_norm": 1.115546703338623, "learning_rate": 9.823177909541795e-06, "loss": 0.5291, "num_input_tokens_seen": 3932000, "step": 5985 }, { "epoch": 3.5318396226415096, "grad_norm": 2.281398057937622, "learning_rate": 9.822499136671877e-06, "loss": 0.4789, "num_input_tokens_seen": 3934784, "step": 5990 }, { "epoch": 3.5347877358490565, "grad_norm": 1.6087472438812256, "learning_rate": 9.82181908702453e-06, "loss": 0.4996, "num_input_tokens_seen": 3937664, "step": 5995 }, { "epoch": 3.5377358490566038, "grad_norm": 1.7004281282424927, "learning_rate": 9.821137760779797e-06, "loss": 0.5472, "num_input_tokens_seen": 3944832, "step": 6000 }, { "epoch": 3.540683962264151, "grad_norm": 1.9369601011276245, "learning_rate": 9.820455158118065e-06, "loss": 0.4636, "num_input_tokens_seen": 3949632, "step": 6005 }, { "epoch": 3.543632075471698, "grad_norm": 1.1951167583465576, "learning_rate": 9.819771279220053e-06, "loss": 0.4264, "num_input_tokens_seen": 3954144, "step": 6010 }, { "epoch": 3.546580188679245, "grad_norm": 1.7883455753326416, "learning_rate": 9.819086124266825e-06, "loss": 0.5205, "num_input_tokens_seen": 3957440, "step": 6015 }, { "epoch": 3.5495283018867925, "grad_norm": 2.631777286529541, "learning_rate": 9.818399693439778e-06, "loss": 0.4055, "num_input_tokens_seen": 3960032, "step": 6020 }, { "epoch": 3.5524764150943398, "grad_norm": 1.6458097696304321, "learning_rate": 9.817711986920644e-06, "loss": 0.5613, "num_input_tokens_seen": 3963424, "step": 6025 }, { "epoch": 3.555424528301887, "grad_norm": 2.813098430633545, "learning_rate": 9.817023004891497e-06, "loss": 0.4482, "num_input_tokens_seen": 3966112, "step": 6030 }, { "epoch": 3.558372641509434, "grad_norm": 1.246803879737854, "learning_rate": 9.816332747534752e-06, "loss": 0.404, "num_input_tokens_seen": 3969152, "step": 6035 }, { "epoch": 3.561320754716981, "grad_norm": 1.169307827949524, "learning_rate": 9.815641215033153e-06, "loss": 0.4537, "num_input_tokens_seen": 3973248, "step": 6040 }, { "epoch": 3.5642688679245285, "grad_norm": 2.7057249546051025, "learning_rate": 9.814948407569789e-06, "loss": 0.4465, "num_input_tokens_seen": 3976128, "step": 6045 }, { "epoch": 3.5672169811320753, "grad_norm": 1.65903902053833, "learning_rate": 9.814254325328082e-06, "loss": 0.3929, "num_input_tokens_seen": 3979200, "step": 6050 }, { "epoch": 3.5701650943396226, "grad_norm": 2.1885879039764404, "learning_rate": 9.813558968491794e-06, "loss": 0.4134, "num_input_tokens_seen": 3981856, "step": 6055 }, { "epoch": 3.57311320754717, "grad_norm": 1.221183180809021, "learning_rate": 9.812862337245024e-06, "loss": 0.5178, "num_input_tokens_seen": 3986112, "step": 6060 }, { "epoch": 3.576061320754717, "grad_norm": 3.1700096130371094, "learning_rate": 9.812164431772208e-06, "loss": 0.4006, "num_input_tokens_seen": 3988512, "step": 6065 }, { "epoch": 3.579009433962264, "grad_norm": 0.6524559855461121, "learning_rate": 9.81146525225812e-06, "loss": 0.4336, "num_input_tokens_seen": 3992384, "step": 6070 }, { "epoch": 3.5819575471698113, "grad_norm": 4.495879173278809, "learning_rate": 9.810764798887868e-06, "loss": 0.4361, "num_input_tokens_seen": 3995424, "step": 6075 }, { "epoch": 3.5849056603773586, "grad_norm": 1.5187890529632568, "learning_rate": 9.810063071846905e-06, "loss": 0.3952, "num_input_tokens_seen": 3998208, "step": 6080 }, { "epoch": 3.5878537735849054, "grad_norm": 2.743954658508301, "learning_rate": 9.809360071321013e-06, "loss": 0.5214, "num_input_tokens_seen": 4000896, "step": 6085 }, { "epoch": 3.5908018867924527, "grad_norm": 2.2126975059509277, "learning_rate": 9.808655797496314e-06, "loss": 0.4162, "num_input_tokens_seen": 4003744, "step": 6090 }, { "epoch": 3.59375, "grad_norm": 1.5971733331680298, "learning_rate": 9.807950250559268e-06, "loss": 0.7132, "num_input_tokens_seen": 4006592, "step": 6095 }, { "epoch": 3.5966981132075473, "grad_norm": 1.4377169609069824, "learning_rate": 9.807243430696673e-06, "loss": 0.4284, "num_input_tokens_seen": 4009792, "step": 6100 }, { "epoch": 3.5996462264150946, "grad_norm": 0.7257506251335144, "learning_rate": 9.806535338095661e-06, "loss": 0.4369, "num_input_tokens_seen": 4012224, "step": 6105 }, { "epoch": 3.6025943396226414, "grad_norm": 1.7616901397705078, "learning_rate": 9.805825972943706e-06, "loss": 0.408, "num_input_tokens_seen": 4014752, "step": 6110 }, { "epoch": 3.6055424528301887, "grad_norm": 2.6620984077453613, "learning_rate": 9.80511533542861e-06, "loss": 0.4548, "num_input_tokens_seen": 4017376, "step": 6115 }, { "epoch": 3.608490566037736, "grad_norm": 1.671464204788208, "learning_rate": 9.80440342573852e-06, "loss": 0.4671, "num_input_tokens_seen": 4020992, "step": 6120 }, { "epoch": 3.611438679245283, "grad_norm": 1.7619001865386963, "learning_rate": 9.803690244061919e-06, "loss": 0.4978, "num_input_tokens_seen": 4024160, "step": 6125 }, { "epoch": 3.61438679245283, "grad_norm": 1.1873208284378052, "learning_rate": 9.802975790587621e-06, "loss": 0.4204, "num_input_tokens_seen": 4027008, "step": 6130 }, { "epoch": 3.6173349056603774, "grad_norm": 1.459019422531128, "learning_rate": 9.802260065504783e-06, "loss": 0.4072, "num_input_tokens_seen": 4030208, "step": 6135 }, { "epoch": 3.6202830188679247, "grad_norm": 3.0008325576782227, "learning_rate": 9.801543069002897e-06, "loss": 0.4926, "num_input_tokens_seen": 4034240, "step": 6140 }, { "epoch": 3.6232311320754715, "grad_norm": 2.18261981010437, "learning_rate": 9.80082480127179e-06, "loss": 0.4808, "num_input_tokens_seen": 4036832, "step": 6145 }, { "epoch": 3.626179245283019, "grad_norm": 1.4397802352905273, "learning_rate": 9.800105262501628e-06, "loss": 0.5372, "num_input_tokens_seen": 4039904, "step": 6150 }, { "epoch": 3.629127358490566, "grad_norm": 2.1018924713134766, "learning_rate": 9.799384452882907e-06, "loss": 0.5189, "num_input_tokens_seen": 4042912, "step": 6155 }, { "epoch": 3.632075471698113, "grad_norm": 2.759394407272339, "learning_rate": 9.798662372606469e-06, "loss": 0.427, "num_input_tokens_seen": 4045472, "step": 6160 }, { "epoch": 3.6350235849056602, "grad_norm": 1.0626555681228638, "learning_rate": 9.797939021863487e-06, "loss": 0.3912, "num_input_tokens_seen": 4049056, "step": 6165 }, { "epoch": 3.6379716981132075, "grad_norm": 1.7403539419174194, "learning_rate": 9.797214400845472e-06, "loss": 0.5173, "num_input_tokens_seen": 4052608, "step": 6170 }, { "epoch": 3.640919811320755, "grad_norm": 4.4174275398254395, "learning_rate": 9.796488509744269e-06, "loss": 0.4482, "num_input_tokens_seen": 4055552, "step": 6175 }, { "epoch": 3.643867924528302, "grad_norm": 2.0735886096954346, "learning_rate": 9.79576134875206e-06, "loss": 0.5202, "num_input_tokens_seen": 4059392, "step": 6180 }, { "epoch": 3.646816037735849, "grad_norm": 1.3971874713897705, "learning_rate": 9.795032918061367e-06, "loss": 0.4929, "num_input_tokens_seen": 4062432, "step": 6185 }, { "epoch": 3.6497641509433962, "grad_norm": 2.0408716201782227, "learning_rate": 9.794303217865041e-06, "loss": 0.4417, "num_input_tokens_seen": 4065216, "step": 6190 }, { "epoch": 3.6527122641509435, "grad_norm": 2.2572004795074463, "learning_rate": 9.79357224835628e-06, "loss": 0.5959, "num_input_tokens_seen": 4068416, "step": 6195 }, { "epoch": 3.6556603773584904, "grad_norm": 1.1689705848693848, "learning_rate": 9.792840009728605e-06, "loss": 0.4282, "num_input_tokens_seen": 4071424, "step": 6200 }, { "epoch": 3.6586084905660377, "grad_norm": 0.9085462093353271, "learning_rate": 9.79210650217588e-06, "loss": 0.4768, "num_input_tokens_seen": 4074432, "step": 6205 }, { "epoch": 3.661556603773585, "grad_norm": 1.5150179862976074, "learning_rate": 9.791371725892307e-06, "loss": 0.4749, "num_input_tokens_seen": 4078400, "step": 6210 }, { "epoch": 3.6645047169811322, "grad_norm": 1.280336618423462, "learning_rate": 9.79063568107242e-06, "loss": 0.4805, "num_input_tokens_seen": 4082464, "step": 6215 }, { "epoch": 3.6674528301886795, "grad_norm": 1.178101658821106, "learning_rate": 9.78989836791109e-06, "loss": 0.4123, "num_input_tokens_seen": 4085888, "step": 6220 }, { "epoch": 3.6704009433962264, "grad_norm": 1.207004189491272, "learning_rate": 9.789159786603524e-06, "loss": 0.4826, "num_input_tokens_seen": 4088672, "step": 6225 }, { "epoch": 3.6733490566037736, "grad_norm": 1.1889616250991821, "learning_rate": 9.788419937345263e-06, "loss": 0.5717, "num_input_tokens_seen": 4091936, "step": 6230 }, { "epoch": 3.6762971698113205, "grad_norm": 1.0485732555389404, "learning_rate": 9.787678820332188e-06, "loss": 0.422, "num_input_tokens_seen": 4095136, "step": 6235 }, { "epoch": 3.6792452830188678, "grad_norm": 1.8850973844528198, "learning_rate": 9.78693643576051e-06, "loss": 0.5566, "num_input_tokens_seen": 4098400, "step": 6240 }, { "epoch": 3.682193396226415, "grad_norm": 5.144676685333252, "learning_rate": 9.786192783826782e-06, "loss": 0.4567, "num_input_tokens_seen": 4100992, "step": 6245 }, { "epoch": 3.6851415094339623, "grad_norm": 1.5446677207946777, "learning_rate": 9.785447864727887e-06, "loss": 0.5884, "num_input_tokens_seen": 4103552, "step": 6250 }, { "epoch": 3.6880896226415096, "grad_norm": 2.3305587768554688, "learning_rate": 9.784701678661045e-06, "loss": 0.4799, "num_input_tokens_seen": 4106464, "step": 6255 }, { "epoch": 3.6910377358490565, "grad_norm": 1.0175330638885498, "learning_rate": 9.783954225823813e-06, "loss": 0.3432, "num_input_tokens_seen": 4110112, "step": 6260 }, { "epoch": 3.6939858490566038, "grad_norm": 1.424338698387146, "learning_rate": 9.783205506414082e-06, "loss": 0.5481, "num_input_tokens_seen": 4113568, "step": 6265 }, { "epoch": 3.696933962264151, "grad_norm": 2.0393760204315186, "learning_rate": 9.782455520630079e-06, "loss": 0.4827, "num_input_tokens_seen": 4119232, "step": 6270 }, { "epoch": 3.699882075471698, "grad_norm": 2.862187147140503, "learning_rate": 9.781704268670364e-06, "loss": 0.5763, "num_input_tokens_seen": 4121664, "step": 6275 }, { "epoch": 3.702830188679245, "grad_norm": 1.135124921798706, "learning_rate": 9.780951750733837e-06, "loss": 0.4523, "num_input_tokens_seen": 4124928, "step": 6280 }, { "epoch": 3.7057783018867925, "grad_norm": 1.3624074459075928, "learning_rate": 9.780197967019728e-06, "loss": 0.3927, "num_input_tokens_seen": 4128416, "step": 6285 }, { "epoch": 3.7087264150943398, "grad_norm": 1.171388864517212, "learning_rate": 9.779442917727608e-06, "loss": 0.4667, "num_input_tokens_seen": 4131904, "step": 6290 }, { "epoch": 3.711674528301887, "grad_norm": 2.751817464828491, "learning_rate": 9.778686603057377e-06, "loss": 0.4574, "num_input_tokens_seen": 4134752, "step": 6295 }, { "epoch": 3.714622641509434, "grad_norm": 3.459742784500122, "learning_rate": 9.777929023209271e-06, "loss": 0.5254, "num_input_tokens_seen": 4137120, "step": 6300 }, { "epoch": 3.717570754716981, "grad_norm": 1.526181697845459, "learning_rate": 9.777170178383866e-06, "loss": 0.3908, "num_input_tokens_seen": 4140032, "step": 6305 }, { "epoch": 3.7205188679245285, "grad_norm": 1.2823666334152222, "learning_rate": 9.776410068782068e-06, "loss": 0.4568, "num_input_tokens_seen": 4143072, "step": 6310 }, { "epoch": 3.7234669811320753, "grad_norm": 1.2535607814788818, "learning_rate": 9.775648694605118e-06, "loss": 0.4885, "num_input_tokens_seen": 4146400, "step": 6315 }, { "epoch": 3.7264150943396226, "grad_norm": 1.212941288948059, "learning_rate": 9.774886056054593e-06, "loss": 0.4844, "num_input_tokens_seen": 4149728, "step": 6320 }, { "epoch": 3.72936320754717, "grad_norm": 1.4960203170776367, "learning_rate": 9.774122153332408e-06, "loss": 0.534, "num_input_tokens_seen": 4152896, "step": 6325 }, { "epoch": 3.732311320754717, "grad_norm": 1.291388750076294, "learning_rate": 9.773356986640807e-06, "loss": 0.3873, "num_input_tokens_seen": 4156160, "step": 6330 }, { "epoch": 3.735259433962264, "grad_norm": 1.5232936143875122, "learning_rate": 9.772590556182373e-06, "loss": 0.4497, "num_input_tokens_seen": 4158944, "step": 6335 }, { "epoch": 3.7382075471698113, "grad_norm": 1.0840086936950684, "learning_rate": 9.77182286216002e-06, "loss": 0.5511, "num_input_tokens_seen": 4161760, "step": 6340 }, { "epoch": 3.7411556603773586, "grad_norm": 2.469583511352539, "learning_rate": 9.771053904776998e-06, "loss": 0.3563, "num_input_tokens_seen": 4164096, "step": 6345 }, { "epoch": 3.7441037735849054, "grad_norm": 2.086482524871826, "learning_rate": 9.770283684236891e-06, "loss": 0.5166, "num_input_tokens_seen": 4167072, "step": 6350 }, { "epoch": 3.7470518867924527, "grad_norm": 2.8709936141967773, "learning_rate": 9.769512200743623e-06, "loss": 0.521, "num_input_tokens_seen": 4170144, "step": 6355 }, { "epoch": 3.75, "grad_norm": 1.4724977016448975, "learning_rate": 9.768739454501444e-06, "loss": 0.4993, "num_input_tokens_seen": 4173376, "step": 6360 }, { "epoch": 3.7529481132075473, "grad_norm": 1.6236610412597656, "learning_rate": 9.76796544571494e-06, "loss": 0.4332, "num_input_tokens_seen": 4177760, "step": 6365 }, { "epoch": 3.7558962264150946, "grad_norm": 2.8545961380004883, "learning_rate": 9.767190174589036e-06, "loss": 0.4719, "num_input_tokens_seen": 4180000, "step": 6370 }, { "epoch": 3.7588443396226414, "grad_norm": 1.243948221206665, "learning_rate": 9.76641364132899e-06, "loss": 0.3893, "num_input_tokens_seen": 4183488, "step": 6375 }, { "epoch": 3.7617924528301887, "grad_norm": 1.1202269792556763, "learning_rate": 9.765635846140389e-06, "loss": 0.5285, "num_input_tokens_seen": 4186784, "step": 6380 }, { "epoch": 3.764740566037736, "grad_norm": 1.2434024810791016, "learning_rate": 9.764856789229157e-06, "loss": 0.4565, "num_input_tokens_seen": 4190208, "step": 6385 }, { "epoch": 3.767688679245283, "grad_norm": 1.3668067455291748, "learning_rate": 9.764076470801557e-06, "loss": 0.4799, "num_input_tokens_seen": 4194624, "step": 6390 }, { "epoch": 3.77063679245283, "grad_norm": 1.4432018995285034, "learning_rate": 9.763294891064182e-06, "loss": 0.7332, "num_input_tokens_seen": 4197440, "step": 6395 }, { "epoch": 3.7735849056603774, "grad_norm": 2.8919355869293213, "learning_rate": 9.762512050223951e-06, "loss": 0.535, "num_input_tokens_seen": 4201312, "step": 6400 }, { "epoch": 3.7765330188679247, "grad_norm": 2.311163902282715, "learning_rate": 9.761727948488132e-06, "loss": 0.4768, "num_input_tokens_seen": 4206592, "step": 6405 }, { "epoch": 3.7794811320754715, "grad_norm": 1.5516276359558105, "learning_rate": 9.760942586064315e-06, "loss": 0.5042, "num_input_tokens_seen": 4209888, "step": 6410 }, { "epoch": 3.782429245283019, "grad_norm": 1.652915596961975, "learning_rate": 9.760155963160431e-06, "loss": 0.466, "num_input_tokens_seen": 4213760, "step": 6415 }, { "epoch": 3.785377358490566, "grad_norm": 1.37106454372406, "learning_rate": 9.759368079984741e-06, "loss": 0.3701, "num_input_tokens_seen": 4217760, "step": 6420 }, { "epoch": 3.788325471698113, "grad_norm": 2.1211583614349365, "learning_rate": 9.758578936745839e-06, "loss": 0.4374, "num_input_tokens_seen": 4220704, "step": 6425 }, { "epoch": 3.7912735849056602, "grad_norm": 1.8582839965820312, "learning_rate": 9.757788533652656e-06, "loss": 0.4982, "num_input_tokens_seen": 4224384, "step": 6430 }, { "epoch": 3.7942216981132075, "grad_norm": 2.4204187393188477, "learning_rate": 9.756996870914454e-06, "loss": 0.4626, "num_input_tokens_seen": 4227360, "step": 6435 }, { "epoch": 3.797169811320755, "grad_norm": 1.308604121208191, "learning_rate": 9.756203948740828e-06, "loss": 0.5646, "num_input_tokens_seen": 4231392, "step": 6440 }, { "epoch": 3.800117924528302, "grad_norm": 1.425087571144104, "learning_rate": 9.755409767341709e-06, "loss": 0.5406, "num_input_tokens_seen": 4234112, "step": 6445 }, { "epoch": 3.803066037735849, "grad_norm": 1.1699360609054565, "learning_rate": 9.75461432692736e-06, "loss": 0.3709, "num_input_tokens_seen": 4237568, "step": 6450 }, { "epoch": 3.8060141509433962, "grad_norm": 2.0224366188049316, "learning_rate": 9.753817627708375e-06, "loss": 0.4366, "num_input_tokens_seen": 4240160, "step": 6455 }, { "epoch": 3.8089622641509435, "grad_norm": 1.2260260581970215, "learning_rate": 9.753019669895686e-06, "loss": 0.4863, "num_input_tokens_seen": 4244128, "step": 6460 }, { "epoch": 3.8119103773584904, "grad_norm": 1.2025717496871948, "learning_rate": 9.752220453700556e-06, "loss": 0.4184, "num_input_tokens_seen": 4248320, "step": 6465 }, { "epoch": 3.8148584905660377, "grad_norm": 2.328355312347412, "learning_rate": 9.75141997933458e-06, "loss": 0.455, "num_input_tokens_seen": 4250848, "step": 6470 }, { "epoch": 3.817806603773585, "grad_norm": 1.2289149761199951, "learning_rate": 9.750618247009685e-06, "loss": 0.4073, "num_input_tokens_seen": 4253696, "step": 6475 }, { "epoch": 3.8207547169811322, "grad_norm": 1.3951596021652222, "learning_rate": 9.749815256938138e-06, "loss": 0.4943, "num_input_tokens_seen": 4257376, "step": 6480 }, { "epoch": 3.8237028301886795, "grad_norm": 2.130927801132202, "learning_rate": 9.749011009332529e-06, "loss": 0.7096, "num_input_tokens_seen": 4259840, "step": 6485 }, { "epoch": 3.8266509433962264, "grad_norm": 1.9767720699310303, "learning_rate": 9.748205504405787e-06, "loss": 0.5053, "num_input_tokens_seen": 4262464, "step": 6490 }, { "epoch": 3.8295990566037736, "grad_norm": 1.810316801071167, "learning_rate": 9.747398742371177e-06, "loss": 0.4777, "num_input_tokens_seen": 4265504, "step": 6495 }, { "epoch": 3.8325471698113205, "grad_norm": 1.5256664752960205, "learning_rate": 9.746590723442289e-06, "loss": 0.4115, "num_input_tokens_seen": 4268736, "step": 6500 }, { "epoch": 3.8354952830188678, "grad_norm": 2.6901981830596924, "learning_rate": 9.745781447833049e-06, "loss": 0.6523, "num_input_tokens_seen": 4272224, "step": 6505 }, { "epoch": 3.838443396226415, "grad_norm": 1.377010464668274, "learning_rate": 9.74497091575772e-06, "loss": 0.5696, "num_input_tokens_seen": 4275360, "step": 6510 }, { "epoch": 3.8413915094339623, "grad_norm": 1.6121355295181274, "learning_rate": 9.744159127430888e-06, "loss": 0.3967, "num_input_tokens_seen": 4278112, "step": 6515 }, { "epoch": 3.8443396226415096, "grad_norm": 1.6417827606201172, "learning_rate": 9.743346083067482e-06, "loss": 0.5714, "num_input_tokens_seen": 4282720, "step": 6520 }, { "epoch": 3.8472877358490565, "grad_norm": 1.6774868965148926, "learning_rate": 9.742531782882758e-06, "loss": 0.5025, "num_input_tokens_seen": 4285824, "step": 6525 }, { "epoch": 3.8502358490566038, "grad_norm": 2.2523553371429443, "learning_rate": 9.741716227092305e-06, "loss": 0.589, "num_input_tokens_seen": 4288640, "step": 6530 }, { "epoch": 3.853183962264151, "grad_norm": 1.8369334936141968, "learning_rate": 9.740899415912048e-06, "loss": 0.4048, "num_input_tokens_seen": 4292224, "step": 6535 }, { "epoch": 3.856132075471698, "grad_norm": 1.8653901815414429, "learning_rate": 9.740081349558236e-06, "loss": 0.5014, "num_input_tokens_seen": 4296480, "step": 6540 }, { "epoch": 3.859080188679245, "grad_norm": 2.2307662963867188, "learning_rate": 9.739262028247459e-06, "loss": 0.342, "num_input_tokens_seen": 4298976, "step": 6545 }, { "epoch": 3.8620283018867925, "grad_norm": 1.672662377357483, "learning_rate": 9.738441452196633e-06, "loss": 0.6395, "num_input_tokens_seen": 4302112, "step": 6550 }, { "epoch": 3.8649764150943398, "grad_norm": 1.9552925825119019, "learning_rate": 9.737619621623013e-06, "loss": 0.4289, "num_input_tokens_seen": 4306048, "step": 6555 }, { "epoch": 3.867924528301887, "grad_norm": 1.4112449884414673, "learning_rate": 9.73679653674418e-06, "loss": 0.5088, "num_input_tokens_seen": 4310016, "step": 6560 }, { "epoch": 3.870872641509434, "grad_norm": 1.6735306978225708, "learning_rate": 9.735972197778047e-06, "loss": 0.674, "num_input_tokens_seen": 4312864, "step": 6565 }, { "epoch": 3.873820754716981, "grad_norm": 1.4183337688446045, "learning_rate": 9.735146604942867e-06, "loss": 0.3849, "num_input_tokens_seen": 4316192, "step": 6570 }, { "epoch": 3.8767688679245285, "grad_norm": 0.44019144773483276, "learning_rate": 9.734319758457214e-06, "loss": 0.4406, "num_input_tokens_seen": 4320992, "step": 6575 }, { "epoch": 3.8797169811320753, "grad_norm": 1.7620376348495483, "learning_rate": 9.733491658540001e-06, "loss": 0.6825, "num_input_tokens_seen": 4324320, "step": 6580 }, { "epoch": 3.8826650943396226, "grad_norm": 1.8355919122695923, "learning_rate": 9.732662305410474e-06, "loss": 0.5229, "num_input_tokens_seen": 4328032, "step": 6585 }, { "epoch": 3.88561320754717, "grad_norm": 1.2099289894104004, "learning_rate": 9.731831699288203e-06, "loss": 0.391, "num_input_tokens_seen": 4330976, "step": 6590 }, { "epoch": 3.888561320754717, "grad_norm": 1.2339755296707153, "learning_rate": 9.730999840393096e-06, "loss": 0.5233, "num_input_tokens_seen": 4334592, "step": 6595 }, { "epoch": 3.891509433962264, "grad_norm": 1.0515230894088745, "learning_rate": 9.730166728945391e-06, "loss": 0.7261, "num_input_tokens_seen": 4337792, "step": 6600 }, { "epoch": 3.8944575471698113, "grad_norm": 3.278333902359009, "learning_rate": 9.72933236516566e-06, "loss": 0.4317, "num_input_tokens_seen": 4340064, "step": 6605 }, { "epoch": 3.8974056603773586, "grad_norm": 1.8952993154525757, "learning_rate": 9.728496749274806e-06, "loss": 0.5908, "num_input_tokens_seen": 4343104, "step": 6610 }, { "epoch": 3.9003537735849054, "grad_norm": 1.053807258605957, "learning_rate": 9.727659881494054e-06, "loss": 0.3715, "num_input_tokens_seen": 4346048, "step": 6615 }, { "epoch": 3.9033018867924527, "grad_norm": 2.553382158279419, "learning_rate": 9.726821762044975e-06, "loss": 0.3827, "num_input_tokens_seen": 4348832, "step": 6620 }, { "epoch": 3.90625, "grad_norm": 1.8967511653900146, "learning_rate": 9.725982391149465e-06, "loss": 0.4671, "num_input_tokens_seen": 4351552, "step": 6625 }, { "epoch": 3.9091981132075473, "grad_norm": 2.283376693725586, "learning_rate": 9.725141769029747e-06, "loss": 0.5115, "num_input_tokens_seen": 4355072, "step": 6630 }, { "epoch": 3.9121462264150946, "grad_norm": 1.472892165184021, "learning_rate": 9.72429989590838e-06, "loss": 0.6414, "num_input_tokens_seen": 4358208, "step": 6635 }, { "epoch": 3.9150943396226414, "grad_norm": 1.8523184061050415, "learning_rate": 9.723456772008257e-06, "loss": 0.7653, "num_input_tokens_seen": 4361472, "step": 6640 }, { "epoch": 3.9180424528301887, "grad_norm": 1.7857452630996704, "learning_rate": 9.722612397552598e-06, "loss": 0.5553, "num_input_tokens_seen": 4364256, "step": 6645 }, { "epoch": 3.920990566037736, "grad_norm": 1.515962839126587, "learning_rate": 9.72176677276495e-06, "loss": 0.3625, "num_input_tokens_seen": 4368096, "step": 6650 }, { "epoch": 3.923938679245283, "grad_norm": 1.1936960220336914, "learning_rate": 9.7209198978692e-06, "loss": 0.3539, "num_input_tokens_seen": 4371360, "step": 6655 }, { "epoch": 3.92688679245283, "grad_norm": 1.758622407913208, "learning_rate": 9.720071773089564e-06, "loss": 0.4349, "num_input_tokens_seen": 4376032, "step": 6660 }, { "epoch": 3.9298349056603774, "grad_norm": 2.6767189502716064, "learning_rate": 9.71922239865058e-06, "loss": 0.3996, "num_input_tokens_seen": 4380160, "step": 6665 }, { "epoch": 3.9327830188679247, "grad_norm": 1.184960126876831, "learning_rate": 9.718371774777131e-06, "loss": 0.5788, "num_input_tokens_seen": 4383424, "step": 6670 }, { "epoch": 3.9357311320754715, "grad_norm": 2.0997672080993652, "learning_rate": 9.717519901694416e-06, "loss": 0.6405, "num_input_tokens_seen": 4386688, "step": 6675 }, { "epoch": 3.938679245283019, "grad_norm": 2.223879098892212, "learning_rate": 9.716666779627978e-06, "loss": 0.5596, "num_input_tokens_seen": 4389504, "step": 6680 }, { "epoch": 3.941627358490566, "grad_norm": 1.9806514978408813, "learning_rate": 9.715812408803681e-06, "loss": 0.5393, "num_input_tokens_seen": 4394496, "step": 6685 }, { "epoch": 3.944575471698113, "grad_norm": 1.384469985961914, "learning_rate": 9.714956789447726e-06, "loss": 0.448, "num_input_tokens_seen": 4397632, "step": 6690 }, { "epoch": 3.9475235849056602, "grad_norm": 1.94249427318573, "learning_rate": 9.71409992178664e-06, "loss": 0.3952, "num_input_tokens_seen": 4402144, "step": 6695 }, { "epoch": 3.9504716981132075, "grad_norm": 1.6410030126571655, "learning_rate": 9.713241806047282e-06, "loss": 0.4768, "num_input_tokens_seen": 4405664, "step": 6700 }, { "epoch": 3.953419811320755, "grad_norm": 1.0340516567230225, "learning_rate": 9.712382442456845e-06, "loss": 0.5473, "num_input_tokens_seen": 4409184, "step": 6705 }, { "epoch": 3.956367924528302, "grad_norm": 1.151318073272705, "learning_rate": 9.711521831242846e-06, "loss": 0.5303, "num_input_tokens_seen": 4412224, "step": 6710 }, { "epoch": 3.959316037735849, "grad_norm": 1.4608516693115234, "learning_rate": 9.710659972633137e-06, "loss": 0.4308, "num_input_tokens_seen": 4414400, "step": 6715 }, { "epoch": 3.9622641509433962, "grad_norm": 3.330353260040283, "learning_rate": 9.709796866855899e-06, "loss": 0.5633, "num_input_tokens_seen": 4417216, "step": 6720 }, { "epoch": 3.9652122641509435, "grad_norm": 1.3990044593811035, "learning_rate": 9.70893251413964e-06, "loss": 0.448, "num_input_tokens_seen": 4420512, "step": 6725 }, { "epoch": 3.9681603773584904, "grad_norm": 2.729924440383911, "learning_rate": 9.708066914713205e-06, "loss": 0.4792, "num_input_tokens_seen": 4424256, "step": 6730 }, { "epoch": 3.9711084905660377, "grad_norm": 1.4475140571594238, "learning_rate": 9.707200068805764e-06, "loss": 0.446, "num_input_tokens_seen": 4427648, "step": 6735 }, { "epoch": 3.974056603773585, "grad_norm": 2.0537967681884766, "learning_rate": 9.706331976646817e-06, "loss": 0.5296, "num_input_tokens_seen": 4430400, "step": 6740 }, { "epoch": 3.9770047169811322, "grad_norm": 1.4512430429458618, "learning_rate": 9.705462638466197e-06, "loss": 0.4226, "num_input_tokens_seen": 4433472, "step": 6745 }, { "epoch": 3.9799528301886795, "grad_norm": 1.2606934309005737, "learning_rate": 9.704592054494065e-06, "loss": 0.3674, "num_input_tokens_seen": 4436960, "step": 6750 }, { "epoch": 3.9829009433962264, "grad_norm": 1.8082282543182373, "learning_rate": 9.703720224960909e-06, "loss": 0.5934, "num_input_tokens_seen": 4439616, "step": 6755 }, { "epoch": 3.9858490566037736, "grad_norm": 2.312819242477417, "learning_rate": 9.702847150097552e-06, "loss": 0.5712, "num_input_tokens_seen": 4442432, "step": 6760 }, { "epoch": 3.9887971698113205, "grad_norm": 1.3700541257858276, "learning_rate": 9.701972830135143e-06, "loss": 0.4913, "num_input_tokens_seen": 4445312, "step": 6765 }, { "epoch": 3.9917452830188678, "grad_norm": 0.840997576713562, "learning_rate": 9.701097265305164e-06, "loss": 0.3389, "num_input_tokens_seen": 4449312, "step": 6770 }, { "epoch": 3.994693396226415, "grad_norm": 1.7987314462661743, "learning_rate": 9.700220455839422e-06, "loss": 0.5194, "num_input_tokens_seen": 4452576, "step": 6775 }, { "epoch": 3.9976415094339623, "grad_norm": 1.2682267427444458, "learning_rate": 9.69934240197006e-06, "loss": 0.456, "num_input_tokens_seen": 4457024, "step": 6780 }, { "epoch": 4.0, "eval_loss": 0.5043152570724487, "eval_runtime": 18.633, "eval_samples_per_second": 91.021, "eval_steps_per_second": 22.755, "num_input_tokens_seen": 4459088, "step": 6784 }, { "epoch": 4.00058962264151, "grad_norm": 1.5461894273757935, "learning_rate": 9.698463103929542e-06, "loss": 0.589, "num_input_tokens_seen": 4459664, "step": 6785 }, { "epoch": 4.003537735849057, "grad_norm": 1.086746335029602, "learning_rate": 9.697582561950669e-06, "loss": 0.4109, "num_input_tokens_seen": 4462672, "step": 6790 }, { "epoch": 4.006485849056604, "grad_norm": 2.0280590057373047, "learning_rate": 9.696700776266568e-06, "loss": 0.4905, "num_input_tokens_seen": 4465712, "step": 6795 }, { "epoch": 4.009433962264151, "grad_norm": 2.8882734775543213, "learning_rate": 9.695817747110694e-06, "loss": 0.3599, "num_input_tokens_seen": 4468496, "step": 6800 }, { "epoch": 4.012382075471698, "grad_norm": 2.305008888244629, "learning_rate": 9.694933474716831e-06, "loss": 0.4119, "num_input_tokens_seen": 4471216, "step": 6805 }, { "epoch": 4.015330188679245, "grad_norm": 1.8435958623886108, "learning_rate": 9.6940479593191e-06, "loss": 0.4654, "num_input_tokens_seen": 4474128, "step": 6810 }, { "epoch": 4.0182783018867925, "grad_norm": 2.2425060272216797, "learning_rate": 9.693161201151942e-06, "loss": 0.4734, "num_input_tokens_seen": 4477136, "step": 6815 }, { "epoch": 4.02122641509434, "grad_norm": 1.3822534084320068, "learning_rate": 9.692273200450128e-06, "loss": 0.4158, "num_input_tokens_seen": 4480464, "step": 6820 }, { "epoch": 4.024174528301887, "grad_norm": 1.089919090270996, "learning_rate": 9.69138395744876e-06, "loss": 0.5026, "num_input_tokens_seen": 4482960, "step": 6825 }, { "epoch": 4.027122641509434, "grad_norm": 2.6308724880218506, "learning_rate": 9.690493472383274e-06, "loss": 0.4445, "num_input_tokens_seen": 4486224, "step": 6830 }, { "epoch": 4.030070754716981, "grad_norm": 1.8857111930847168, "learning_rate": 9.689601745489423e-06, "loss": 0.3758, "num_input_tokens_seen": 4489296, "step": 6835 }, { "epoch": 4.033018867924528, "grad_norm": 1.7671762704849243, "learning_rate": 9.6887087770033e-06, "loss": 0.4647, "num_input_tokens_seen": 4491984, "step": 6840 }, { "epoch": 4.035966981132075, "grad_norm": 3.709575891494751, "learning_rate": 9.687814567161322e-06, "loss": 0.4483, "num_input_tokens_seen": 4494672, "step": 6845 }, { "epoch": 4.038915094339623, "grad_norm": 1.4687050580978394, "learning_rate": 9.686919116200232e-06, "loss": 0.3743, "num_input_tokens_seen": 4498128, "step": 6850 }, { "epoch": 4.04186320754717, "grad_norm": 1.514596939086914, "learning_rate": 9.686022424357108e-06, "loss": 0.4441, "num_input_tokens_seen": 4500464, "step": 6855 }, { "epoch": 4.044811320754717, "grad_norm": 1.8026905059814453, "learning_rate": 9.685124491869353e-06, "loss": 0.4936, "num_input_tokens_seen": 4503824, "step": 6860 }, { "epoch": 4.0477594339622645, "grad_norm": 2.5161027908325195, "learning_rate": 9.684225318974696e-06, "loss": 0.4461, "num_input_tokens_seen": 4506320, "step": 6865 }, { "epoch": 4.050707547169812, "grad_norm": 2.3022210597991943, "learning_rate": 9.683324905911197e-06, "loss": 0.4928, "num_input_tokens_seen": 4509200, "step": 6870 }, { "epoch": 4.053655660377358, "grad_norm": 1.0517865419387817, "learning_rate": 9.682423252917245e-06, "loss": 0.4423, "num_input_tokens_seen": 4512496, "step": 6875 }, { "epoch": 4.056603773584905, "grad_norm": 1.1717090606689453, "learning_rate": 9.681520360231557e-06, "loss": 0.5886, "num_input_tokens_seen": 4515504, "step": 6880 }, { "epoch": 4.059551886792453, "grad_norm": 5.2859039306640625, "learning_rate": 9.680616228093178e-06, "loss": 0.4416, "num_input_tokens_seen": 4518352, "step": 6885 }, { "epoch": 4.0625, "grad_norm": 1.690291166305542, "learning_rate": 9.67971085674148e-06, "loss": 0.5169, "num_input_tokens_seen": 4521232, "step": 6890 }, { "epoch": 4.065448113207547, "grad_norm": 2.0380616188049316, "learning_rate": 9.678804246416164e-06, "loss": 0.3679, "num_input_tokens_seen": 4523472, "step": 6895 }, { "epoch": 4.068396226415095, "grad_norm": 1.5193833112716675, "learning_rate": 9.677896397357259e-06, "loss": 0.6056, "num_input_tokens_seen": 4530736, "step": 6900 }, { "epoch": 4.071344339622642, "grad_norm": 1.3998491764068604, "learning_rate": 9.676987309805121e-06, "loss": 0.4431, "num_input_tokens_seen": 4533744, "step": 6905 }, { "epoch": 4.074292452830188, "grad_norm": 2.6273834705352783, "learning_rate": 9.67607698400044e-06, "loss": 0.5069, "num_input_tokens_seen": 4537488, "step": 6910 }, { "epoch": 4.0772405660377355, "grad_norm": 1.2349148988723755, "learning_rate": 9.67516542018422e-06, "loss": 0.5714, "num_input_tokens_seen": 4540336, "step": 6915 }, { "epoch": 4.080188679245283, "grad_norm": 0.935984194278717, "learning_rate": 9.67425261859781e-06, "loss": 0.5363, "num_input_tokens_seen": 4544240, "step": 6920 }, { "epoch": 4.08313679245283, "grad_norm": 1.164027214050293, "learning_rate": 9.673338579482871e-06, "loss": 0.5169, "num_input_tokens_seen": 4547376, "step": 6925 }, { "epoch": 4.086084905660377, "grad_norm": 1.9964708089828491, "learning_rate": 9.672423303081404e-06, "loss": 0.4317, "num_input_tokens_seen": 4550608, "step": 6930 }, { "epoch": 4.089033018867925, "grad_norm": 1.1801226139068604, "learning_rate": 9.67150678963573e-06, "loss": 0.5296, "num_input_tokens_seen": 4553936, "step": 6935 }, { "epoch": 4.091981132075472, "grad_norm": 1.8962206840515137, "learning_rate": 9.670589039388501e-06, "loss": 0.5834, "num_input_tokens_seen": 4556912, "step": 6940 }, { "epoch": 4.094929245283019, "grad_norm": 1.6872659921646118, "learning_rate": 9.669670052582695e-06, "loss": 0.4752, "num_input_tokens_seen": 4559632, "step": 6945 }, { "epoch": 4.097877358490566, "grad_norm": 1.4694007635116577, "learning_rate": 9.668749829461617e-06, "loss": 0.434, "num_input_tokens_seen": 4562640, "step": 6950 }, { "epoch": 4.100825471698113, "grad_norm": 1.2596594095230103, "learning_rate": 9.667828370268898e-06, "loss": 0.5308, "num_input_tokens_seen": 4566704, "step": 6955 }, { "epoch": 4.10377358490566, "grad_norm": 4.518300533294678, "learning_rate": 9.666905675248505e-06, "loss": 0.4344, "num_input_tokens_seen": 4569680, "step": 6960 }, { "epoch": 4.1067216981132075, "grad_norm": 2.4284656047821045, "learning_rate": 9.66598174464472e-06, "loss": 0.5149, "num_input_tokens_seen": 4572048, "step": 6965 }, { "epoch": 4.109669811320755, "grad_norm": 1.2399482727050781, "learning_rate": 9.665056578702157e-06, "loss": 0.4531, "num_input_tokens_seen": 4575760, "step": 6970 }, { "epoch": 4.112617924528302, "grad_norm": 1.9687081575393677, "learning_rate": 9.66413017766576e-06, "loss": 0.4277, "num_input_tokens_seen": 4579024, "step": 6975 }, { "epoch": 4.115566037735849, "grad_norm": 2.369271755218506, "learning_rate": 9.663202541780799e-06, "loss": 0.5376, "num_input_tokens_seen": 4581456, "step": 6980 }, { "epoch": 4.118514150943396, "grad_norm": 1.6652177572250366, "learning_rate": 9.662273671292866e-06, "loss": 0.4927, "num_input_tokens_seen": 4584688, "step": 6985 }, { "epoch": 4.121462264150943, "grad_norm": 1.5802111625671387, "learning_rate": 9.661343566447886e-06, "loss": 0.6459, "num_input_tokens_seen": 4588528, "step": 6990 }, { "epoch": 4.12441037735849, "grad_norm": 1.0057978630065918, "learning_rate": 9.660412227492107e-06, "loss": 0.4495, "num_input_tokens_seen": 4593424, "step": 6995 }, { "epoch": 4.127358490566038, "grad_norm": 1.4510605335235596, "learning_rate": 9.659479654672106e-06, "loss": 0.4918, "num_input_tokens_seen": 4595440, "step": 7000 }, { "epoch": 4.130306603773585, "grad_norm": 5.410662651062012, "learning_rate": 9.658545848234784e-06, "loss": 0.5195, "num_input_tokens_seen": 4598352, "step": 7005 }, { "epoch": 4.133254716981132, "grad_norm": 1.2003024816513062, "learning_rate": 9.657610808427372e-06, "loss": 0.3618, "num_input_tokens_seen": 4601968, "step": 7010 }, { "epoch": 4.1362028301886795, "grad_norm": 1.2876616716384888, "learning_rate": 9.656674535497425e-06, "loss": 0.4284, "num_input_tokens_seen": 4605168, "step": 7015 }, { "epoch": 4.139150943396227, "grad_norm": 1.4739691019058228, "learning_rate": 9.655737029692827e-06, "loss": 0.4595, "num_input_tokens_seen": 4609264, "step": 7020 }, { "epoch": 4.142099056603773, "grad_norm": 1.519716739654541, "learning_rate": 9.654798291261785e-06, "loss": 0.483, "num_input_tokens_seen": 4612752, "step": 7025 }, { "epoch": 4.1450471698113205, "grad_norm": 1.112308144569397, "learning_rate": 9.653858320452833e-06, "loss": 0.4348, "num_input_tokens_seen": 4615696, "step": 7030 }, { "epoch": 4.147995283018868, "grad_norm": 1.7578504085540771, "learning_rate": 9.652917117514836e-06, "loss": 0.4205, "num_input_tokens_seen": 4619472, "step": 7035 }, { "epoch": 4.150943396226415, "grad_norm": 1.784793734550476, "learning_rate": 9.651974682696975e-06, "loss": 0.6183, "num_input_tokens_seen": 4622704, "step": 7040 }, { "epoch": 4.153891509433962, "grad_norm": 2.011791467666626, "learning_rate": 9.651031016248773e-06, "loss": 0.4734, "num_input_tokens_seen": 4625968, "step": 7045 }, { "epoch": 4.15683962264151, "grad_norm": 2.2613046169281006, "learning_rate": 9.65008611842006e-06, "loss": 0.4302, "num_input_tokens_seen": 4629872, "step": 7050 }, { "epoch": 4.159787735849057, "grad_norm": 2.621907949447632, "learning_rate": 9.64913998946101e-06, "loss": 0.4518, "num_input_tokens_seen": 4632688, "step": 7055 }, { "epoch": 4.162735849056604, "grad_norm": 1.8560426235198975, "learning_rate": 9.648192629622109e-06, "loss": 0.6041, "num_input_tokens_seen": 4635824, "step": 7060 }, { "epoch": 4.165683962264151, "grad_norm": 0.8749855160713196, "learning_rate": 9.647244039154178e-06, "loss": 0.4776, "num_input_tokens_seen": 4638928, "step": 7065 }, { "epoch": 4.168632075471698, "grad_norm": 3.2430994510650635, "learning_rate": 9.64629421830836e-06, "loss": 0.4166, "num_input_tokens_seen": 4642224, "step": 7070 }, { "epoch": 4.171580188679245, "grad_norm": 1.2130906581878662, "learning_rate": 9.64534316733612e-06, "loss": 0.4654, "num_input_tokens_seen": 4645200, "step": 7075 }, { "epoch": 4.1745283018867925, "grad_norm": 1.1892516613006592, "learning_rate": 9.644390886489258e-06, "loss": 0.5592, "num_input_tokens_seen": 4648272, "step": 7080 }, { "epoch": 4.17747641509434, "grad_norm": 1.7081327438354492, "learning_rate": 9.643437376019893e-06, "loss": 0.6862, "num_input_tokens_seen": 4651216, "step": 7085 }, { "epoch": 4.180424528301887, "grad_norm": 1.2532811164855957, "learning_rate": 9.64248263618047e-06, "loss": 0.4146, "num_input_tokens_seen": 4655504, "step": 7090 }, { "epoch": 4.183372641509434, "grad_norm": 2.4869630336761475, "learning_rate": 9.64152666722376e-06, "loss": 0.465, "num_input_tokens_seen": 4658320, "step": 7095 }, { "epoch": 4.186320754716981, "grad_norm": 1.298412799835205, "learning_rate": 9.640569469402863e-06, "loss": 0.6232, "num_input_tokens_seen": 4662416, "step": 7100 }, { "epoch": 4.189268867924528, "grad_norm": 4.714798927307129, "learning_rate": 9.639611042971198e-06, "loss": 0.4249, "num_input_tokens_seen": 4666320, "step": 7105 }, { "epoch": 4.192216981132075, "grad_norm": 1.2748571634292603, "learning_rate": 9.638651388182514e-06, "loss": 0.4458, "num_input_tokens_seen": 4669552, "step": 7110 }, { "epoch": 4.195165094339623, "grad_norm": 1.3534326553344727, "learning_rate": 9.637690505290884e-06, "loss": 0.3796, "num_input_tokens_seen": 4672592, "step": 7115 }, { "epoch": 4.19811320754717, "grad_norm": 2.5860061645507812, "learning_rate": 9.636728394550705e-06, "loss": 0.4804, "num_input_tokens_seen": 4674928, "step": 7120 }, { "epoch": 4.201061320754717, "grad_norm": 1.4592465162277222, "learning_rate": 9.6357650562167e-06, "loss": 0.4148, "num_input_tokens_seen": 4679568, "step": 7125 }, { "epoch": 4.2040094339622645, "grad_norm": 1.3465533256530762, "learning_rate": 9.634800490543918e-06, "loss": 0.5359, "num_input_tokens_seen": 4682448, "step": 7130 }, { "epoch": 4.206957547169812, "grad_norm": 12.346698760986328, "learning_rate": 9.633834697787731e-06, "loss": 0.5987, "num_input_tokens_seen": 4685968, "step": 7135 }, { "epoch": 4.209905660377358, "grad_norm": 0.8936570286750793, "learning_rate": 9.632867678203836e-06, "loss": 0.3254, "num_input_tokens_seen": 4689968, "step": 7140 }, { "epoch": 4.212853773584905, "grad_norm": 2.4491353034973145, "learning_rate": 9.631899432048258e-06, "loss": 0.5341, "num_input_tokens_seen": 4692880, "step": 7145 }, { "epoch": 4.215801886792453, "grad_norm": 0.8480998873710632, "learning_rate": 9.630929959577343e-06, "loss": 0.4405, "num_input_tokens_seen": 4697840, "step": 7150 }, { "epoch": 4.21875, "grad_norm": 1.730553150177002, "learning_rate": 9.629959261047764e-06, "loss": 0.3683, "num_input_tokens_seen": 4702096, "step": 7155 }, { "epoch": 4.221698113207547, "grad_norm": 1.8823014497756958, "learning_rate": 9.628987336716513e-06, "loss": 0.3919, "num_input_tokens_seen": 4705808, "step": 7160 }, { "epoch": 4.224646226415095, "grad_norm": 1.047648549079895, "learning_rate": 9.628014186840918e-06, "loss": 0.3579, "num_input_tokens_seen": 4708144, "step": 7165 }, { "epoch": 4.227594339622642, "grad_norm": 1.887251615524292, "learning_rate": 9.62703981167862e-06, "loss": 0.3378, "num_input_tokens_seen": 4710640, "step": 7170 }, { "epoch": 4.230542452830188, "grad_norm": 1.6187584400177002, "learning_rate": 9.626064211487592e-06, "loss": 0.3649, "num_input_tokens_seen": 4717200, "step": 7175 }, { "epoch": 4.2334905660377355, "grad_norm": 2.7126846313476562, "learning_rate": 9.625087386526125e-06, "loss": 0.4909, "num_input_tokens_seen": 4720368, "step": 7180 }, { "epoch": 4.236438679245283, "grad_norm": 1.8572458028793335, "learning_rate": 9.624109337052839e-06, "loss": 0.43, "num_input_tokens_seen": 4724016, "step": 7185 }, { "epoch": 4.23938679245283, "grad_norm": 1.4514567852020264, "learning_rate": 9.623130063326678e-06, "loss": 0.4489, "num_input_tokens_seen": 4727248, "step": 7190 }, { "epoch": 4.242334905660377, "grad_norm": 1.4859769344329834, "learning_rate": 9.622149565606909e-06, "loss": 0.5464, "num_input_tokens_seen": 4730384, "step": 7195 }, { "epoch": 4.245283018867925, "grad_norm": 4.6082682609558105, "learning_rate": 9.621167844153122e-06, "loss": 0.5448, "num_input_tokens_seen": 4733008, "step": 7200 }, { "epoch": 4.248231132075472, "grad_norm": 1.2318919897079468, "learning_rate": 9.620184899225231e-06, "loss": 0.5343, "num_input_tokens_seen": 4735952, "step": 7205 }, { "epoch": 4.251179245283019, "grad_norm": 2.223651885986328, "learning_rate": 9.619200731083477e-06, "loss": 0.5068, "num_input_tokens_seen": 4738928, "step": 7210 }, { "epoch": 4.254127358490566, "grad_norm": 2.8110647201538086, "learning_rate": 9.618215339988422e-06, "loss": 0.5609, "num_input_tokens_seen": 4742256, "step": 7215 }, { "epoch": 4.257075471698113, "grad_norm": 1.6345478296279907, "learning_rate": 9.617228726200951e-06, "loss": 0.3972, "num_input_tokens_seen": 4746352, "step": 7220 }, { "epoch": 4.26002358490566, "grad_norm": 2.902766466140747, "learning_rate": 9.616240889982277e-06, "loss": 0.561, "num_input_tokens_seen": 4749648, "step": 7225 }, { "epoch": 4.2629716981132075, "grad_norm": 1.0840940475463867, "learning_rate": 9.61525183159393e-06, "loss": 0.3472, "num_input_tokens_seen": 4752912, "step": 7230 }, { "epoch": 4.265919811320755, "grad_norm": 1.3957514762878418, "learning_rate": 9.614261551297774e-06, "loss": 0.4117, "num_input_tokens_seen": 4756208, "step": 7235 }, { "epoch": 4.268867924528302, "grad_norm": 1.7862718105316162, "learning_rate": 9.613270049355983e-06, "loss": 0.4585, "num_input_tokens_seen": 4759600, "step": 7240 }, { "epoch": 4.271816037735849, "grad_norm": 1.0677130222320557, "learning_rate": 9.612277326031065e-06, "loss": 0.3853, "num_input_tokens_seen": 4762576, "step": 7245 }, { "epoch": 4.274764150943396, "grad_norm": 5.0528178215026855, "learning_rate": 9.611283381585848e-06, "loss": 0.4549, "num_input_tokens_seen": 4765808, "step": 7250 }, { "epoch": 4.277712264150943, "grad_norm": 4.799801349639893, "learning_rate": 9.61028821628348e-06, "loss": 0.444, "num_input_tokens_seen": 4768656, "step": 7255 }, { "epoch": 4.28066037735849, "grad_norm": 2.1277430057525635, "learning_rate": 9.609291830387439e-06, "loss": 0.4217, "num_input_tokens_seen": 4772144, "step": 7260 }, { "epoch": 4.283608490566038, "grad_norm": 2.4338104724884033, "learning_rate": 9.608294224161523e-06, "loss": 0.4485, "num_input_tokens_seen": 4775024, "step": 7265 }, { "epoch": 4.286556603773585, "grad_norm": 2.5002269744873047, "learning_rate": 9.607295397869847e-06, "loss": 0.4766, "num_input_tokens_seen": 4777936, "step": 7270 }, { "epoch": 4.289504716981132, "grad_norm": 1.7630901336669922, "learning_rate": 9.60629535177686e-06, "loss": 0.4228, "num_input_tokens_seen": 4782256, "step": 7275 }, { "epoch": 4.2924528301886795, "grad_norm": 1.8223686218261719, "learning_rate": 9.605294086147325e-06, "loss": 0.4907, "num_input_tokens_seen": 4785744, "step": 7280 }, { "epoch": 4.295400943396227, "grad_norm": 1.2157161235809326, "learning_rate": 9.604291601246333e-06, "loss": 0.5409, "num_input_tokens_seen": 4789712, "step": 7285 }, { "epoch": 4.298349056603773, "grad_norm": 2.884395122528076, "learning_rate": 9.603287897339299e-06, "loss": 0.519, "num_input_tokens_seen": 4792848, "step": 7290 }, { "epoch": 4.3012971698113205, "grad_norm": 1.6056814193725586, "learning_rate": 9.602282974691953e-06, "loss": 0.5106, "num_input_tokens_seen": 4796240, "step": 7295 }, { "epoch": 4.304245283018868, "grad_norm": 1.0748456716537476, "learning_rate": 9.601276833570355e-06, "loss": 0.4785, "num_input_tokens_seen": 4799216, "step": 7300 }, { "epoch": 4.307193396226415, "grad_norm": 1.9577796459197998, "learning_rate": 9.600269474240885e-06, "loss": 0.3806, "num_input_tokens_seen": 4802384, "step": 7305 }, { "epoch": 4.310141509433962, "grad_norm": 1.2991138696670532, "learning_rate": 9.599260896970246e-06, "loss": 0.4711, "num_input_tokens_seen": 4805552, "step": 7310 }, { "epoch": 4.31308962264151, "grad_norm": 2.744274854660034, "learning_rate": 9.598251102025463e-06, "loss": 0.4856, "num_input_tokens_seen": 4807952, "step": 7315 }, { "epoch": 4.316037735849057, "grad_norm": 1.339870572090149, "learning_rate": 9.597240089673882e-06, "loss": 0.4603, "num_input_tokens_seen": 4810896, "step": 7320 }, { "epoch": 4.318985849056604, "grad_norm": 1.8189408779144287, "learning_rate": 9.596227860183175e-06, "loss": 0.5311, "num_input_tokens_seen": 4814096, "step": 7325 }, { "epoch": 4.321933962264151, "grad_norm": 2.0885603427886963, "learning_rate": 9.595214413821334e-06, "loss": 0.4051, "num_input_tokens_seen": 4818192, "step": 7330 }, { "epoch": 4.324882075471698, "grad_norm": 1.1570916175842285, "learning_rate": 9.59419975085667e-06, "loss": 0.4824, "num_input_tokens_seen": 4821392, "step": 7335 }, { "epoch": 4.327830188679245, "grad_norm": 1.4041286706924438, "learning_rate": 9.593183871557826e-06, "loss": 0.4254, "num_input_tokens_seen": 4825232, "step": 7340 }, { "epoch": 4.3307783018867925, "grad_norm": 0.9202308654785156, "learning_rate": 9.592166776193754e-06, "loss": 0.4183, "num_input_tokens_seen": 4828432, "step": 7345 }, { "epoch": 4.33372641509434, "grad_norm": 5.073817729949951, "learning_rate": 9.591148465033738e-06, "loss": 0.4999, "num_input_tokens_seen": 4831344, "step": 7350 }, { "epoch": 4.336674528301887, "grad_norm": 1.356613278388977, "learning_rate": 9.590128938347378e-06, "loss": 0.3983, "num_input_tokens_seen": 4834448, "step": 7355 }, { "epoch": 4.339622641509434, "grad_norm": 2.8097305297851562, "learning_rate": 9.589108196404599e-06, "loss": 0.3894, "num_input_tokens_seen": 4837424, "step": 7360 }, { "epoch": 4.342570754716981, "grad_norm": 1.986074447631836, "learning_rate": 9.588086239475649e-06, "loss": 0.4636, "num_input_tokens_seen": 4840496, "step": 7365 }, { "epoch": 4.345518867924528, "grad_norm": 1.7731753587722778, "learning_rate": 9.587063067831092e-06, "loss": 0.5164, "num_input_tokens_seen": 4843952, "step": 7370 }, { "epoch": 4.348466981132075, "grad_norm": 2.5233607292175293, "learning_rate": 9.586038681741818e-06, "loss": 0.4627, "num_input_tokens_seen": 4848400, "step": 7375 }, { "epoch": 4.351415094339623, "grad_norm": 1.864503026008606, "learning_rate": 9.58501308147904e-06, "loss": 0.4496, "num_input_tokens_seen": 4852688, "step": 7380 }, { "epoch": 4.35436320754717, "grad_norm": 1.2735915184020996, "learning_rate": 9.583986267314288e-06, "loss": 0.3345, "num_input_tokens_seen": 4855696, "step": 7385 }, { "epoch": 4.357311320754717, "grad_norm": 1.4488096237182617, "learning_rate": 9.582958239519416e-06, "loss": 0.4972, "num_input_tokens_seen": 4859184, "step": 7390 }, { "epoch": 4.3602594339622645, "grad_norm": 2.3386640548706055, "learning_rate": 9.581928998366597e-06, "loss": 0.4746, "num_input_tokens_seen": 4862160, "step": 7395 }, { "epoch": 4.363207547169811, "grad_norm": 1.426688551902771, "learning_rate": 9.58089854412833e-06, "loss": 0.3592, "num_input_tokens_seen": 4865136, "step": 7400 }, { "epoch": 4.366155660377358, "grad_norm": 2.126584768295288, "learning_rate": 9.579866877077431e-06, "loss": 0.683, "num_input_tokens_seen": 4868848, "step": 7405 }, { "epoch": 4.369103773584905, "grad_norm": 1.602827548980713, "learning_rate": 9.578833997487038e-06, "loss": 0.4814, "num_input_tokens_seen": 4871120, "step": 7410 }, { "epoch": 4.372051886792453, "grad_norm": 2.1881253719329834, "learning_rate": 9.57779990563061e-06, "loss": 0.6006, "num_input_tokens_seen": 4874288, "step": 7415 }, { "epoch": 4.375, "grad_norm": 1.9868035316467285, "learning_rate": 9.576764601781928e-06, "loss": 0.4885, "num_input_tokens_seen": 4877712, "step": 7420 }, { "epoch": 4.377948113207547, "grad_norm": 2.176298141479492, "learning_rate": 9.575728086215093e-06, "loss": 0.4214, "num_input_tokens_seen": 4880336, "step": 7425 }, { "epoch": 4.380896226415095, "grad_norm": 1.3110195398330688, "learning_rate": 9.574690359204527e-06, "loss": 0.4674, "num_input_tokens_seen": 4884016, "step": 7430 }, { "epoch": 4.383844339622642, "grad_norm": 1.5313737392425537, "learning_rate": 9.573651421024972e-06, "loss": 0.3764, "num_input_tokens_seen": 4886704, "step": 7435 }, { "epoch": 4.386792452830189, "grad_norm": 1.1421900987625122, "learning_rate": 9.572611271951494e-06, "loss": 0.4025, "num_input_tokens_seen": 4890544, "step": 7440 }, { "epoch": 4.3897405660377355, "grad_norm": 1.2437127828598022, "learning_rate": 9.571569912259473e-06, "loss": 0.4352, "num_input_tokens_seen": 4895376, "step": 7445 }, { "epoch": 4.392688679245283, "grad_norm": 1.30611252784729, "learning_rate": 9.570527342224614e-06, "loss": 0.3868, "num_input_tokens_seen": 4898288, "step": 7450 }, { "epoch": 4.39563679245283, "grad_norm": 1.8360904455184937, "learning_rate": 9.569483562122945e-06, "loss": 0.5566, "num_input_tokens_seen": 4901808, "step": 7455 }, { "epoch": 4.398584905660377, "grad_norm": 1.3088781833648682, "learning_rate": 9.568438572230811e-06, "loss": 0.3774, "num_input_tokens_seen": 4904752, "step": 7460 }, { "epoch": 4.401533018867925, "grad_norm": 1.2663037776947021, "learning_rate": 9.567392372824873e-06, "loss": 0.5882, "num_input_tokens_seen": 4909424, "step": 7465 }, { "epoch": 4.404481132075472, "grad_norm": 1.2295280694961548, "learning_rate": 9.566344964182123e-06, "loss": 0.4239, "num_input_tokens_seen": 4912944, "step": 7470 }, { "epoch": 4.407429245283019, "grad_norm": 1.188954472541809, "learning_rate": 9.565296346579862e-06, "loss": 0.5083, "num_input_tokens_seen": 4916048, "step": 7475 }, { "epoch": 4.410377358490566, "grad_norm": 2.6439573764801025, "learning_rate": 9.564246520295719e-06, "loss": 0.5368, "num_input_tokens_seen": 4919152, "step": 7480 }, { "epoch": 4.413325471698113, "grad_norm": 1.2376055717468262, "learning_rate": 9.563195485607638e-06, "loss": 0.4164, "num_input_tokens_seen": 4922096, "step": 7485 }, { "epoch": 4.41627358490566, "grad_norm": 2.1889255046844482, "learning_rate": 9.562143242793885e-06, "loss": 0.5233, "num_input_tokens_seen": 4924784, "step": 7490 }, { "epoch": 4.4192216981132075, "grad_norm": 1.291057825088501, "learning_rate": 9.561089792133048e-06, "loss": 0.5917, "num_input_tokens_seen": 4927696, "step": 7495 }, { "epoch": 4.422169811320755, "grad_norm": 2.325979471206665, "learning_rate": 9.560035133904031e-06, "loss": 0.6092, "num_input_tokens_seen": 4930576, "step": 7500 }, { "epoch": 4.425117924528302, "grad_norm": 1.805397629737854, "learning_rate": 9.55897926838606e-06, "loss": 0.4936, "num_input_tokens_seen": 4933872, "step": 7505 }, { "epoch": 4.428066037735849, "grad_norm": 1.725646734237671, "learning_rate": 9.55792219585868e-06, "loss": 0.5222, "num_input_tokens_seen": 4937712, "step": 7510 }, { "epoch": 4.431014150943396, "grad_norm": 1.3465760946273804, "learning_rate": 9.556863916601754e-06, "loss": 0.4565, "num_input_tokens_seen": 4941104, "step": 7515 }, { "epoch": 4.433962264150943, "grad_norm": 5.542476654052734, "learning_rate": 9.555804430895467e-06, "loss": 0.6596, "num_input_tokens_seen": 4944208, "step": 7520 }, { "epoch": 4.43691037735849, "grad_norm": 1.7914153337478638, "learning_rate": 9.554743739020325e-06, "loss": 0.767, "num_input_tokens_seen": 4947504, "step": 7525 }, { "epoch": 4.439858490566038, "grad_norm": 1.4295704364776611, "learning_rate": 9.553681841257146e-06, "loss": 0.3794, "num_input_tokens_seen": 4950416, "step": 7530 }, { "epoch": 4.442806603773585, "grad_norm": 1.4698543548583984, "learning_rate": 9.552618737887073e-06, "loss": 0.4164, "num_input_tokens_seen": 4954832, "step": 7535 }, { "epoch": 4.445754716981132, "grad_norm": 1.2953099012374878, "learning_rate": 9.55155442919157e-06, "loss": 0.4324, "num_input_tokens_seen": 4958704, "step": 7540 }, { "epoch": 4.4487028301886795, "grad_norm": 1.1383936405181885, "learning_rate": 9.550488915452416e-06, "loss": 0.3654, "num_input_tokens_seen": 4962064, "step": 7545 }, { "epoch": 4.451650943396227, "grad_norm": 2.048032522201538, "learning_rate": 9.54942219695171e-06, "loss": 0.4939, "num_input_tokens_seen": 4965488, "step": 7550 }, { "epoch": 4.454599056603773, "grad_norm": 1.1095963716506958, "learning_rate": 9.54835427397187e-06, "loss": 0.4234, "num_input_tokens_seen": 4969008, "step": 7555 }, { "epoch": 4.4575471698113205, "grad_norm": 1.8654985427856445, "learning_rate": 9.547285146795634e-06, "loss": 0.4529, "num_input_tokens_seen": 4972688, "step": 7560 }, { "epoch": 4.460495283018868, "grad_norm": 1.7255550622940063, "learning_rate": 9.546214815706059e-06, "loss": 0.4513, "num_input_tokens_seen": 4976048, "step": 7565 }, { "epoch": 4.463443396226415, "grad_norm": 2.648359537124634, "learning_rate": 9.545143280986518e-06, "loss": 0.4642, "num_input_tokens_seen": 4978736, "step": 7570 }, { "epoch": 4.466391509433962, "grad_norm": 1.5671294927597046, "learning_rate": 9.544070542920703e-06, "loss": 0.4231, "num_input_tokens_seen": 4981680, "step": 7575 }, { "epoch": 4.46933962264151, "grad_norm": 1.2335697412490845, "learning_rate": 9.542996601792629e-06, "loss": 0.4848, "num_input_tokens_seen": 4984912, "step": 7580 }, { "epoch": 4.472287735849057, "grad_norm": 1.7566808462142944, "learning_rate": 9.541921457886624e-06, "loss": 0.475, "num_input_tokens_seen": 4988208, "step": 7585 }, { "epoch": 4.475235849056604, "grad_norm": 1.729846715927124, "learning_rate": 9.54084511148734e-06, "loss": 0.5998, "num_input_tokens_seen": 4991056, "step": 7590 }, { "epoch": 4.478183962264151, "grad_norm": 1.466539740562439, "learning_rate": 9.539767562879742e-06, "loss": 0.503, "num_input_tokens_seen": 4994448, "step": 7595 }, { "epoch": 4.481132075471698, "grad_norm": 1.9281237125396729, "learning_rate": 9.538688812349118e-06, "loss": 0.4528, "num_input_tokens_seen": 4997648, "step": 7600 }, { "epoch": 4.484080188679245, "grad_norm": 1.5650978088378906, "learning_rate": 9.537608860181069e-06, "loss": 0.4767, "num_input_tokens_seen": 5000496, "step": 7605 }, { "epoch": 4.4870283018867925, "grad_norm": 1.7806435823440552, "learning_rate": 9.536527706661519e-06, "loss": 0.5834, "num_input_tokens_seen": 5003088, "step": 7610 }, { "epoch": 4.48997641509434, "grad_norm": 2.490299940109253, "learning_rate": 9.535445352076707e-06, "loss": 0.5007, "num_input_tokens_seen": 5008144, "step": 7615 }, { "epoch": 4.492924528301887, "grad_norm": 6.744018077850342, "learning_rate": 9.534361796713191e-06, "loss": 0.4766, "num_input_tokens_seen": 5010192, "step": 7620 }, { "epoch": 4.495872641509434, "grad_norm": 2.1024386882781982, "learning_rate": 9.533277040857847e-06, "loss": 0.4667, "num_input_tokens_seen": 5013680, "step": 7625 }, { "epoch": 4.498820754716981, "grad_norm": 1.5739455223083496, "learning_rate": 9.53219108479787e-06, "loss": 0.5628, "num_input_tokens_seen": 5017584, "step": 7630 }, { "epoch": 4.501768867924528, "grad_norm": 1.3126907348632812, "learning_rate": 9.53110392882077e-06, "loss": 0.4397, "num_input_tokens_seen": 5021296, "step": 7635 }, { "epoch": 4.504716981132075, "grad_norm": 1.7906467914581299, "learning_rate": 9.530015573214378e-06, "loss": 0.4848, "num_input_tokens_seen": 5023696, "step": 7640 }, { "epoch": 4.507665094339623, "grad_norm": 1.5669918060302734, "learning_rate": 9.528926018266837e-06, "loss": 0.639, "num_input_tokens_seen": 5030896, "step": 7645 }, { "epoch": 4.51061320754717, "grad_norm": 1.6288657188415527, "learning_rate": 9.527835264266617e-06, "loss": 0.4097, "num_input_tokens_seen": 5033904, "step": 7650 }, { "epoch": 4.513561320754717, "grad_norm": 1.53730046749115, "learning_rate": 9.526743311502496e-06, "loss": 0.5569, "num_input_tokens_seen": 5038224, "step": 7655 }, { "epoch": 4.5165094339622645, "grad_norm": 1.0161722898483276, "learning_rate": 9.525650160263573e-06, "loss": 0.3815, "num_input_tokens_seen": 5042224, "step": 7660 }, { "epoch": 4.519457547169811, "grad_norm": 1.353582501411438, "learning_rate": 9.524555810839267e-06, "loss": 0.4611, "num_input_tokens_seen": 5045936, "step": 7665 }, { "epoch": 4.522405660377358, "grad_norm": 3.2348413467407227, "learning_rate": 9.523460263519309e-06, "loss": 0.4826, "num_input_tokens_seen": 5049008, "step": 7670 }, { "epoch": 4.525353773584905, "grad_norm": 1.898800015449524, "learning_rate": 9.522363518593753e-06, "loss": 0.5131, "num_input_tokens_seen": 5052816, "step": 7675 }, { "epoch": 4.528301886792453, "grad_norm": 2.8719465732574463, "learning_rate": 9.521265576352963e-06, "loss": 0.5245, "num_input_tokens_seen": 5057232, "step": 7680 }, { "epoch": 4.53125, "grad_norm": 1.2584196329116821, "learning_rate": 9.520166437087628e-06, "loss": 0.3925, "num_input_tokens_seen": 5060848, "step": 7685 }, { "epoch": 4.534198113207547, "grad_norm": 4.630710124969482, "learning_rate": 9.519066101088748e-06, "loss": 0.5486, "num_input_tokens_seen": 5064176, "step": 7690 }, { "epoch": 4.537146226415095, "grad_norm": 1.8673012256622314, "learning_rate": 9.51796456864764e-06, "loss": 0.3918, "num_input_tokens_seen": 5067888, "step": 7695 }, { "epoch": 4.540094339622642, "grad_norm": 1.8319356441497803, "learning_rate": 9.516861840055942e-06, "loss": 0.4913, "num_input_tokens_seen": 5071376, "step": 7700 }, { "epoch": 4.543042452830189, "grad_norm": 1.590044379234314, "learning_rate": 9.515757915605604e-06, "loss": 0.7609, "num_input_tokens_seen": 5074288, "step": 7705 }, { "epoch": 4.5459905660377355, "grad_norm": 1.2905527353286743, "learning_rate": 9.514652795588899e-06, "loss": 0.3969, "num_input_tokens_seen": 5077264, "step": 7710 }, { "epoch": 4.548938679245283, "grad_norm": 1.3399442434310913, "learning_rate": 9.513546480298405e-06, "loss": 0.4056, "num_input_tokens_seen": 5080240, "step": 7715 }, { "epoch": 4.55188679245283, "grad_norm": 1.0752267837524414, "learning_rate": 9.512438970027032e-06, "loss": 0.322, "num_input_tokens_seen": 5083024, "step": 7720 }, { "epoch": 4.554834905660377, "grad_norm": 2.1169474124908447, "learning_rate": 9.511330265067992e-06, "loss": 0.3791, "num_input_tokens_seen": 5086096, "step": 7725 }, { "epoch": 4.557783018867925, "grad_norm": 1.3301210403442383, "learning_rate": 9.510220365714822e-06, "loss": 0.4524, "num_input_tokens_seen": 5090576, "step": 7730 }, { "epoch": 4.560731132075472, "grad_norm": 1.7512493133544922, "learning_rate": 9.509109272261373e-06, "loss": 0.4174, "num_input_tokens_seen": 5093392, "step": 7735 }, { "epoch": 4.563679245283019, "grad_norm": 1.042447566986084, "learning_rate": 9.50799698500181e-06, "loss": 0.4262, "num_input_tokens_seen": 5096080, "step": 7740 }, { "epoch": 4.566627358490566, "grad_norm": 1.564965844154358, "learning_rate": 9.506883504230618e-06, "loss": 0.5047, "num_input_tokens_seen": 5098928, "step": 7745 }, { "epoch": 4.569575471698113, "grad_norm": 2.2095179557800293, "learning_rate": 9.505768830242593e-06, "loss": 0.401, "num_input_tokens_seen": 5105488, "step": 7750 }, { "epoch": 4.57252358490566, "grad_norm": 2.2624337673187256, "learning_rate": 9.504652963332852e-06, "loss": 0.4131, "num_input_tokens_seen": 5107856, "step": 7755 }, { "epoch": 4.5754716981132075, "grad_norm": 1.4758784770965576, "learning_rate": 9.503535903796825e-06, "loss": 0.4845, "num_input_tokens_seen": 5111504, "step": 7760 }, { "epoch": 4.578419811320755, "grad_norm": 1.5360649824142456, "learning_rate": 9.50241765193026e-06, "loss": 0.4886, "num_input_tokens_seen": 5115056, "step": 7765 }, { "epoch": 4.581367924528302, "grad_norm": 2.339620590209961, "learning_rate": 9.501298208029214e-06, "loss": 0.5979, "num_input_tokens_seen": 5117776, "step": 7770 }, { "epoch": 4.584316037735849, "grad_norm": 3.24383282661438, "learning_rate": 9.500177572390071e-06, "loss": 0.6179, "num_input_tokens_seen": 5120816, "step": 7775 }, { "epoch": 4.587264150943396, "grad_norm": 1.3192423582077026, "learning_rate": 9.49905574530952e-06, "loss": 0.3553, "num_input_tokens_seen": 5124464, "step": 7780 }, { "epoch": 4.590212264150943, "grad_norm": 1.9130979776382446, "learning_rate": 9.497932727084571e-06, "loss": 0.4624, "num_input_tokens_seen": 5127056, "step": 7785 }, { "epoch": 4.59316037735849, "grad_norm": 1.4089545011520386, "learning_rate": 9.496808518012545e-06, "loss": 0.5399, "num_input_tokens_seen": 5129808, "step": 7790 }, { "epoch": 4.596108490566038, "grad_norm": 3.0495827198028564, "learning_rate": 9.495683118391087e-06, "loss": 0.4768, "num_input_tokens_seen": 5132752, "step": 7795 }, { "epoch": 4.599056603773585, "grad_norm": 1.4719892740249634, "learning_rate": 9.494556528518146e-06, "loss": 0.3666, "num_input_tokens_seen": 5135184, "step": 7800 }, { "epoch": 4.602004716981132, "grad_norm": 1.9230979681015015, "learning_rate": 9.493428748691995e-06, "loss": 0.5205, "num_input_tokens_seen": 5138480, "step": 7805 }, { "epoch": 4.6049528301886795, "grad_norm": 2.259202241897583, "learning_rate": 9.492299779211215e-06, "loss": 0.4475, "num_input_tokens_seen": 5142544, "step": 7810 }, { "epoch": 4.607900943396227, "grad_norm": 1.8813408613204956, "learning_rate": 9.49116962037471e-06, "loss": 0.5665, "num_input_tokens_seen": 5147408, "step": 7815 }, { "epoch": 4.610849056603773, "grad_norm": 1.7434443235397339, "learning_rate": 9.490038272481691e-06, "loss": 0.558, "num_input_tokens_seen": 5150032, "step": 7820 }, { "epoch": 4.6137971698113205, "grad_norm": 1.9399977922439575, "learning_rate": 9.488905735831689e-06, "loss": 0.5014, "num_input_tokens_seen": 5153520, "step": 7825 }, { "epoch": 4.616745283018868, "grad_norm": 1.103170394897461, "learning_rate": 9.487772010724548e-06, "loss": 0.3344, "num_input_tokens_seen": 5156016, "step": 7830 }, { "epoch": 4.619693396226415, "grad_norm": 1.3416720628738403, "learning_rate": 9.486637097460425e-06, "loss": 0.4504, "num_input_tokens_seen": 5158992, "step": 7835 }, { "epoch": 4.622641509433962, "grad_norm": 1.0713697671890259, "learning_rate": 9.485500996339793e-06, "loss": 0.4294, "num_input_tokens_seen": 5162608, "step": 7840 }, { "epoch": 4.62558962264151, "grad_norm": 2.0718729496002197, "learning_rate": 9.484363707663443e-06, "loss": 0.3206, "num_input_tokens_seen": 5165776, "step": 7845 }, { "epoch": 4.628537735849057, "grad_norm": 2.1446614265441895, "learning_rate": 9.483225231732474e-06, "loss": 0.4754, "num_input_tokens_seen": 5170032, "step": 7850 }, { "epoch": 4.631485849056604, "grad_norm": 2.390486240386963, "learning_rate": 9.482085568848302e-06, "loss": 0.436, "num_input_tokens_seen": 5172464, "step": 7855 }, { "epoch": 4.634433962264151, "grad_norm": 2.3467483520507812, "learning_rate": 9.480944719312659e-06, "loss": 0.4208, "num_input_tokens_seen": 5175504, "step": 7860 }, { "epoch": 4.637382075471698, "grad_norm": 1.7380807399749756, "learning_rate": 9.47980268342759e-06, "loss": 0.3527, "num_input_tokens_seen": 5178544, "step": 7865 }, { "epoch": 4.640330188679245, "grad_norm": 2.004349708557129, "learning_rate": 9.478659461495456e-06, "loss": 0.4799, "num_input_tokens_seen": 5182992, "step": 7870 }, { "epoch": 4.6432783018867925, "grad_norm": 2.578930616378784, "learning_rate": 9.477515053818926e-06, "loss": 0.3726, "num_input_tokens_seen": 5185776, "step": 7875 }, { "epoch": 4.64622641509434, "grad_norm": 1.5195249319076538, "learning_rate": 9.476369460700988e-06, "loss": 0.3813, "num_input_tokens_seen": 5189488, "step": 7880 }, { "epoch": 4.649174528301887, "grad_norm": 1.561284065246582, "learning_rate": 9.475222682444944e-06, "loss": 0.4654, "num_input_tokens_seen": 5193232, "step": 7885 }, { "epoch": 4.652122641509434, "grad_norm": 1.950819730758667, "learning_rate": 9.474074719354406e-06, "loss": 0.4408, "num_input_tokens_seen": 5196080, "step": 7890 }, { "epoch": 4.655070754716981, "grad_norm": 2.089343786239624, "learning_rate": 9.472925571733306e-06, "loss": 0.451, "num_input_tokens_seen": 5199344, "step": 7895 }, { "epoch": 4.658018867924528, "grad_norm": 3.2086057662963867, "learning_rate": 9.471775239885883e-06, "loss": 0.6927, "num_input_tokens_seen": 5202608, "step": 7900 }, { "epoch": 4.660966981132075, "grad_norm": 1.5780502557754517, "learning_rate": 9.470623724116693e-06, "loss": 0.5081, "num_input_tokens_seen": 5205648, "step": 7905 }, { "epoch": 4.663915094339623, "grad_norm": 1.449953317642212, "learning_rate": 9.469471024730606e-06, "loss": 0.5185, "num_input_tokens_seen": 5208912, "step": 7910 }, { "epoch": 4.66686320754717, "grad_norm": 1.284179925918579, "learning_rate": 9.4683171420328e-06, "loss": 0.3712, "num_input_tokens_seen": 5212368, "step": 7915 }, { "epoch": 4.669811320754717, "grad_norm": 1.9851982593536377, "learning_rate": 9.467162076328776e-06, "loss": 0.4515, "num_input_tokens_seen": 5214672, "step": 7920 }, { "epoch": 4.6727594339622645, "grad_norm": 2.2076258659362793, "learning_rate": 9.466005827924337e-06, "loss": 0.3621, "num_input_tokens_seen": 5218768, "step": 7925 }, { "epoch": 4.675707547169811, "grad_norm": 1.5396060943603516, "learning_rate": 9.46484839712561e-06, "loss": 0.3804, "num_input_tokens_seen": 5221680, "step": 7930 }, { "epoch": 4.678655660377358, "grad_norm": 2.0493810176849365, "learning_rate": 9.463689784239026e-06, "loss": 0.4517, "num_input_tokens_seen": 5224944, "step": 7935 }, { "epoch": 4.681603773584905, "grad_norm": 1.3834885358810425, "learning_rate": 9.462529989571334e-06, "loss": 0.4206, "num_input_tokens_seen": 5228816, "step": 7940 }, { "epoch": 4.684551886792453, "grad_norm": 1.6657599210739136, "learning_rate": 9.461369013429595e-06, "loss": 0.4019, "num_input_tokens_seen": 5231728, "step": 7945 }, { "epoch": 4.6875, "grad_norm": 2.114285469055176, "learning_rate": 9.460206856121183e-06, "loss": 0.5814, "num_input_tokens_seen": 5234544, "step": 7950 }, { "epoch": 4.690448113207547, "grad_norm": 1.639106035232544, "learning_rate": 9.459043517953786e-06, "loss": 0.5839, "num_input_tokens_seen": 5237712, "step": 7955 }, { "epoch": 4.693396226415095, "grad_norm": 3.187288999557495, "learning_rate": 9.457878999235396e-06, "loss": 0.484, "num_input_tokens_seen": 5239984, "step": 7960 }, { "epoch": 4.696344339622642, "grad_norm": 1.900347113609314, "learning_rate": 9.45671330027433e-06, "loss": 0.4795, "num_input_tokens_seen": 5243664, "step": 7965 }, { "epoch": 4.699292452830189, "grad_norm": 2.086153268814087, "learning_rate": 9.45554642137921e-06, "loss": 0.5157, "num_input_tokens_seen": 5247504, "step": 7970 }, { "epoch": 4.7022405660377355, "grad_norm": 2.2201762199401855, "learning_rate": 9.454378362858974e-06, "loss": 0.6193, "num_input_tokens_seen": 5250576, "step": 7975 }, { "epoch": 4.705188679245283, "grad_norm": 1.8865718841552734, "learning_rate": 9.453209125022867e-06, "loss": 0.3854, "num_input_tokens_seen": 5256208, "step": 7980 }, { "epoch": 4.70813679245283, "grad_norm": 1.6674343347549438, "learning_rate": 9.452038708180453e-06, "loss": 0.6148, "num_input_tokens_seen": 5258896, "step": 7985 }, { "epoch": 4.711084905660377, "grad_norm": 2.013676643371582, "learning_rate": 9.450867112641603e-06, "loss": 0.47, "num_input_tokens_seen": 5261488, "step": 7990 }, { "epoch": 4.714033018867925, "grad_norm": 1.5758405923843384, "learning_rate": 9.449694338716506e-06, "loss": 0.4509, "num_input_tokens_seen": 5264240, "step": 7995 }, { "epoch": 4.716981132075472, "grad_norm": 3.4339053630828857, "learning_rate": 9.448520386715653e-06, "loss": 0.4761, "num_input_tokens_seen": 5266704, "step": 8000 }, { "epoch": 4.719929245283019, "grad_norm": 1.3561302423477173, "learning_rate": 9.447345256949855e-06, "loss": 0.3082, "num_input_tokens_seen": 5269200, "step": 8005 }, { "epoch": 4.722877358490566, "grad_norm": 1.813444972038269, "learning_rate": 9.446168949730234e-06, "loss": 0.3932, "num_input_tokens_seen": 5272272, "step": 8010 }, { "epoch": 4.725825471698113, "grad_norm": 1.7298909425735474, "learning_rate": 9.444991465368223e-06, "loss": 0.5184, "num_input_tokens_seen": 5276048, "step": 8015 }, { "epoch": 4.72877358490566, "grad_norm": 2.6634299755096436, "learning_rate": 9.443812804175562e-06, "loss": 0.4658, "num_input_tokens_seen": 5279376, "step": 8020 }, { "epoch": 4.7317216981132075, "grad_norm": 1.9125677347183228, "learning_rate": 9.44263296646431e-06, "loss": 0.4218, "num_input_tokens_seen": 5282192, "step": 8025 }, { "epoch": 4.734669811320755, "grad_norm": 1.6728181838989258, "learning_rate": 9.441451952546835e-06, "loss": 0.5399, "num_input_tokens_seen": 5285488, "step": 8030 }, { "epoch": 4.737617924528302, "grad_norm": 2.0912487506866455, "learning_rate": 9.440269762735814e-06, "loss": 0.6133, "num_input_tokens_seen": 5288912, "step": 8035 }, { "epoch": 4.740566037735849, "grad_norm": 1.6771467924118042, "learning_rate": 9.439086397344236e-06, "loss": 0.5829, "num_input_tokens_seen": 5293264, "step": 8040 }, { "epoch": 4.743514150943396, "grad_norm": 1.6020715236663818, "learning_rate": 9.437901856685404e-06, "loss": 0.4765, "num_input_tokens_seen": 5295984, "step": 8045 }, { "epoch": 4.746462264150943, "grad_norm": 1.9634795188903809, "learning_rate": 9.436716141072925e-06, "loss": 0.3924, "num_input_tokens_seen": 5300304, "step": 8050 }, { "epoch": 4.74941037735849, "grad_norm": 1.4742497205734253, "learning_rate": 9.435529250820732e-06, "loss": 0.4164, "num_input_tokens_seen": 5303696, "step": 8055 }, { "epoch": 4.752358490566038, "grad_norm": 1.0706861019134521, "learning_rate": 9.43434118624305e-06, "loss": 0.419, "num_input_tokens_seen": 5306960, "step": 8060 }, { "epoch": 4.755306603773585, "grad_norm": 1.3815747499465942, "learning_rate": 9.433151947654428e-06, "loss": 0.5653, "num_input_tokens_seen": 5310064, "step": 8065 }, { "epoch": 4.758254716981132, "grad_norm": 1.3191930055618286, "learning_rate": 9.431961535369724e-06, "loss": 0.3927, "num_input_tokens_seen": 5312944, "step": 8070 }, { "epoch": 4.7612028301886795, "grad_norm": 3.365647792816162, "learning_rate": 9.430769949704103e-06, "loss": 0.4595, "num_input_tokens_seen": 5316432, "step": 8075 }, { "epoch": 4.764150943396227, "grad_norm": 2.1225709915161133, "learning_rate": 9.42957719097304e-06, "loss": 0.5296, "num_input_tokens_seen": 5319536, "step": 8080 }, { "epoch": 4.767099056603773, "grad_norm": 1.1064833402633667, "learning_rate": 9.42838325949233e-06, "loss": 0.4522, "num_input_tokens_seen": 5324240, "step": 8085 }, { "epoch": 4.7700471698113205, "grad_norm": 2.113281726837158, "learning_rate": 9.427188155578062e-06, "loss": 0.4178, "num_input_tokens_seen": 5326256, "step": 8090 }, { "epoch": 4.772995283018868, "grad_norm": 1.4357326030731201, "learning_rate": 9.42599187954665e-06, "loss": 0.4934, "num_input_tokens_seen": 5330032, "step": 8095 }, { "epoch": 4.775943396226415, "grad_norm": 1.619777798652649, "learning_rate": 9.424794431714814e-06, "loss": 0.3955, "num_input_tokens_seen": 5333392, "step": 8100 }, { "epoch": 4.778891509433962, "grad_norm": 2.848904609680176, "learning_rate": 9.423595812399581e-06, "loss": 0.3685, "num_input_tokens_seen": 5337456, "step": 8105 }, { "epoch": 4.78183962264151, "grad_norm": 3.2984020709991455, "learning_rate": 9.422396021918296e-06, "loss": 0.442, "num_input_tokens_seen": 5340528, "step": 8110 }, { "epoch": 4.784787735849057, "grad_norm": 1.6184638738632202, "learning_rate": 9.421195060588602e-06, "loss": 0.4391, "num_input_tokens_seen": 5344080, "step": 8115 }, { "epoch": 4.787735849056604, "grad_norm": 1.1662230491638184, "learning_rate": 9.419992928728461e-06, "loss": 0.3062, "num_input_tokens_seen": 5346928, "step": 8120 }, { "epoch": 4.790683962264151, "grad_norm": 1.9551913738250732, "learning_rate": 9.418789626656144e-06, "loss": 0.4647, "num_input_tokens_seen": 5350416, "step": 8125 }, { "epoch": 4.793632075471698, "grad_norm": 1.8574638366699219, "learning_rate": 9.417585154690229e-06, "loss": 0.3898, "num_input_tokens_seen": 5353680, "step": 8130 }, { "epoch": 4.796580188679245, "grad_norm": 2.7301266193389893, "learning_rate": 9.416379513149605e-06, "loss": 0.5461, "num_input_tokens_seen": 5356048, "step": 8135 }, { "epoch": 4.7995283018867925, "grad_norm": 1.9225398302078247, "learning_rate": 9.415172702353471e-06, "loss": 0.3301, "num_input_tokens_seen": 5359728, "step": 8140 }, { "epoch": 4.80247641509434, "grad_norm": 1.9850400686264038, "learning_rate": 9.413964722621339e-06, "loss": 0.5131, "num_input_tokens_seen": 5362960, "step": 8145 }, { "epoch": 4.805424528301887, "grad_norm": 1.40363347530365, "learning_rate": 9.41275557427302e-06, "loss": 0.6782, "num_input_tokens_seen": 5365808, "step": 8150 }, { "epoch": 4.808372641509434, "grad_norm": 2.162151575088501, "learning_rate": 9.411545257628646e-06, "loss": 0.4405, "num_input_tokens_seen": 5368816, "step": 8155 }, { "epoch": 4.811320754716981, "grad_norm": 1.3315588235855103, "learning_rate": 9.410333773008653e-06, "loss": 0.5479, "num_input_tokens_seen": 5372080, "step": 8160 }, { "epoch": 4.814268867924528, "grad_norm": 1.4131205081939697, "learning_rate": 9.409121120733784e-06, "loss": 0.3199, "num_input_tokens_seen": 5375696, "step": 8165 }, { "epoch": 4.817216981132075, "grad_norm": 1.8479303121566772, "learning_rate": 9.4079073011251e-06, "loss": 0.5162, "num_input_tokens_seen": 5378032, "step": 8170 }, { "epoch": 4.820165094339623, "grad_norm": 1.937137484550476, "learning_rate": 9.406692314503956e-06, "loss": 0.5375, "num_input_tokens_seen": 5381392, "step": 8175 }, { "epoch": 4.82311320754717, "grad_norm": 2.6042585372924805, "learning_rate": 9.405476161192033e-06, "loss": 0.3843, "num_input_tokens_seen": 5384016, "step": 8180 }, { "epoch": 4.826061320754717, "grad_norm": 1.570969820022583, "learning_rate": 9.40425884151131e-06, "loss": 0.4272, "num_input_tokens_seen": 5386768, "step": 8185 }, { "epoch": 4.8290094339622645, "grad_norm": 0.8907636404037476, "learning_rate": 9.403040355784076e-06, "loss": 0.4296, "num_input_tokens_seen": 5390160, "step": 8190 }, { "epoch": 4.831957547169811, "grad_norm": 1.3845713138580322, "learning_rate": 9.401820704332932e-06, "loss": 0.4112, "num_input_tokens_seen": 5393744, "step": 8195 }, { "epoch": 4.834905660377358, "grad_norm": 2.673092842102051, "learning_rate": 9.400599887480786e-06, "loss": 0.5785, "num_input_tokens_seen": 5396144, "step": 8200 }, { "epoch": 4.837853773584905, "grad_norm": 3.063844919204712, "learning_rate": 9.399377905550854e-06, "loss": 0.4685, "num_input_tokens_seen": 5399440, "step": 8205 }, { "epoch": 4.840801886792453, "grad_norm": 1.263213872909546, "learning_rate": 9.398154758866662e-06, "loss": 0.5205, "num_input_tokens_seen": 5402608, "step": 8210 }, { "epoch": 4.84375, "grad_norm": 1.333776831626892, "learning_rate": 9.396930447752041e-06, "loss": 0.4616, "num_input_tokens_seen": 5405840, "step": 8215 }, { "epoch": 4.846698113207547, "grad_norm": 2.7155349254608154, "learning_rate": 9.395704972531137e-06, "loss": 0.331, "num_input_tokens_seen": 5408528, "step": 8220 }, { "epoch": 4.849646226415095, "grad_norm": 2.3511574268341064, "learning_rate": 9.394478333528396e-06, "loss": 0.429, "num_input_tokens_seen": 5411632, "step": 8225 }, { "epoch": 4.852594339622642, "grad_norm": 2.213747978210449, "learning_rate": 9.393250531068576e-06, "loss": 0.396, "num_input_tokens_seen": 5415696, "step": 8230 }, { "epoch": 4.855542452830189, "grad_norm": 1.9442403316497803, "learning_rate": 9.392021565476744e-06, "loss": 0.3882, "num_input_tokens_seen": 5419120, "step": 8235 }, { "epoch": 4.8584905660377355, "grad_norm": 0.9535264372825623, "learning_rate": 9.390791437078274e-06, "loss": 0.3237, "num_input_tokens_seen": 5422128, "step": 8240 }, { "epoch": 4.861438679245283, "grad_norm": 2.583608388900757, "learning_rate": 9.38956014619885e-06, "loss": 0.6273, "num_input_tokens_seen": 5424752, "step": 8245 }, { "epoch": 4.86438679245283, "grad_norm": 2.02195405960083, "learning_rate": 9.388327693164456e-06, "loss": 0.3997, "num_input_tokens_seen": 5427504, "step": 8250 }, { "epoch": 4.867334905660377, "grad_norm": 1.3079392910003662, "learning_rate": 9.387094078301395e-06, "loss": 0.3529, "num_input_tokens_seen": 5430480, "step": 8255 }, { "epoch": 4.870283018867925, "grad_norm": 2.8589696884155273, "learning_rate": 9.385859301936269e-06, "loss": 0.4004, "num_input_tokens_seen": 5434320, "step": 8260 }, { "epoch": 4.873231132075472, "grad_norm": 2.1956682205200195, "learning_rate": 9.38462336439599e-06, "loss": 0.5985, "num_input_tokens_seen": 5437584, "step": 8265 }, { "epoch": 4.876179245283019, "grad_norm": 1.6611027717590332, "learning_rate": 9.383386266007779e-06, "loss": 0.4534, "num_input_tokens_seen": 5440368, "step": 8270 }, { "epoch": 4.879127358490566, "grad_norm": 2.6482629776000977, "learning_rate": 9.382148007099164e-06, "loss": 0.4533, "num_input_tokens_seen": 5442928, "step": 8275 }, { "epoch": 4.882075471698113, "grad_norm": 0.9031383991241455, "learning_rate": 9.380908587997977e-06, "loss": 0.2839, "num_input_tokens_seen": 5446352, "step": 8280 }, { "epoch": 4.88502358490566, "grad_norm": 1.5544438362121582, "learning_rate": 9.37966800903236e-06, "loss": 0.6264, "num_input_tokens_seen": 5450064, "step": 8285 }, { "epoch": 4.8879716981132075, "grad_norm": 1.7318519353866577, "learning_rate": 9.378426270530762e-06, "loss": 0.4368, "num_input_tokens_seen": 5454416, "step": 8290 }, { "epoch": 4.890919811320755, "grad_norm": 2.077016592025757, "learning_rate": 9.37718337282194e-06, "loss": 0.4378, "num_input_tokens_seen": 5457360, "step": 8295 }, { "epoch": 4.893867924528302, "grad_norm": 2.068427085876465, "learning_rate": 9.375939316234956e-06, "loss": 0.4288, "num_input_tokens_seen": 5462128, "step": 8300 }, { "epoch": 4.896816037735849, "grad_norm": 1.2548197507858276, "learning_rate": 9.374694101099178e-06, "loss": 0.4951, "num_input_tokens_seen": 5465488, "step": 8305 }, { "epoch": 4.899764150943396, "grad_norm": 2.061397075653076, "learning_rate": 9.373447727744282e-06, "loss": 0.5077, "num_input_tokens_seen": 5468624, "step": 8310 }, { "epoch": 4.902712264150943, "grad_norm": 2.966580629348755, "learning_rate": 9.372200196500253e-06, "loss": 0.4766, "num_input_tokens_seen": 5471568, "step": 8315 }, { "epoch": 4.90566037735849, "grad_norm": 2.573957920074463, "learning_rate": 9.37095150769738e-06, "loss": 0.5496, "num_input_tokens_seen": 5474960, "step": 8320 }, { "epoch": 4.908608490566038, "grad_norm": 1.2371724843978882, "learning_rate": 9.369701661666255e-06, "loss": 0.4403, "num_input_tokens_seen": 5477456, "step": 8325 }, { "epoch": 4.911556603773585, "grad_norm": 1.8308464288711548, "learning_rate": 9.368450658737782e-06, "loss": 0.435, "num_input_tokens_seen": 5480816, "step": 8330 }, { "epoch": 4.914504716981132, "grad_norm": 1.6951231956481934, "learning_rate": 9.367198499243173e-06, "loss": 0.5364, "num_input_tokens_seen": 5483760, "step": 8335 }, { "epoch": 4.9174528301886795, "grad_norm": 2.129976511001587, "learning_rate": 9.365945183513938e-06, "loss": 0.4262, "num_input_tokens_seen": 5486128, "step": 8340 }, { "epoch": 4.920400943396227, "grad_norm": 1.422292709350586, "learning_rate": 9.3646907118819e-06, "loss": 0.4999, "num_input_tokens_seen": 5488720, "step": 8345 }, { "epoch": 4.923349056603773, "grad_norm": 1.5703325271606445, "learning_rate": 9.363435084679185e-06, "loss": 0.4543, "num_input_tokens_seen": 5491408, "step": 8350 }, { "epoch": 4.9262971698113205, "grad_norm": 1.027166724205017, "learning_rate": 9.362178302238227e-06, "loss": 0.3936, "num_input_tokens_seen": 5494384, "step": 8355 }, { "epoch": 4.929245283018868, "grad_norm": 1.9237626791000366, "learning_rate": 9.360920364891762e-06, "loss": 0.4219, "num_input_tokens_seen": 5497616, "step": 8360 }, { "epoch": 4.932193396226415, "grad_norm": 2.1197714805603027, "learning_rate": 9.359661272972836e-06, "loss": 0.3672, "num_input_tokens_seen": 5501360, "step": 8365 }, { "epoch": 4.935141509433962, "grad_norm": 1.3406232595443726, "learning_rate": 9.3584010268148e-06, "loss": 0.3765, "num_input_tokens_seen": 5504816, "step": 8370 }, { "epoch": 4.93808962264151, "grad_norm": 3.009965181350708, "learning_rate": 9.357139626751308e-06, "loss": 0.5661, "num_input_tokens_seen": 5508304, "step": 8375 }, { "epoch": 4.941037735849057, "grad_norm": 1.2815191745758057, "learning_rate": 9.355877073116321e-06, "loss": 0.4537, "num_input_tokens_seen": 5511568, "step": 8380 }, { "epoch": 4.943985849056604, "grad_norm": 1.3864115476608276, "learning_rate": 9.354613366244108e-06, "loss": 0.5969, "num_input_tokens_seen": 5515088, "step": 8385 }, { "epoch": 4.946933962264151, "grad_norm": 1.4687821865081787, "learning_rate": 9.353348506469236e-06, "loss": 0.4878, "num_input_tokens_seen": 5518000, "step": 8390 }, { "epoch": 4.949882075471698, "grad_norm": 2.2492079734802246, "learning_rate": 9.352082494126586e-06, "loss": 0.3904, "num_input_tokens_seen": 5521168, "step": 8395 }, { "epoch": 4.952830188679245, "grad_norm": 1.7199009656906128, "learning_rate": 9.350815329551341e-06, "loss": 0.4169, "num_input_tokens_seen": 5523856, "step": 8400 }, { "epoch": 4.9557783018867925, "grad_norm": 1.6847994327545166, "learning_rate": 9.349547013078986e-06, "loss": 0.4046, "num_input_tokens_seen": 5526640, "step": 8405 }, { "epoch": 4.95872641509434, "grad_norm": 1.5493295192718506, "learning_rate": 9.348277545045312e-06, "loss": 0.4932, "num_input_tokens_seen": 5529136, "step": 8410 }, { "epoch": 4.961674528301887, "grad_norm": 1.0601571798324585, "learning_rate": 9.347006925786418e-06, "loss": 0.434, "num_input_tokens_seen": 5531952, "step": 8415 }, { "epoch": 4.964622641509434, "grad_norm": 1.1718776226043701, "learning_rate": 9.34573515563871e-06, "loss": 0.4168, "num_input_tokens_seen": 5535632, "step": 8420 }, { "epoch": 4.967570754716981, "grad_norm": 1.7182163000106812, "learning_rate": 9.344462234938885e-06, "loss": 0.5061, "num_input_tokens_seen": 5539568, "step": 8425 }, { "epoch": 4.970518867924528, "grad_norm": 2.2491843700408936, "learning_rate": 9.343188164023962e-06, "loss": 0.4781, "num_input_tokens_seen": 5543056, "step": 8430 }, { "epoch": 4.973466981132075, "grad_norm": 1.534488320350647, "learning_rate": 9.341912943231256e-06, "loss": 0.4588, "num_input_tokens_seen": 5546960, "step": 8435 }, { "epoch": 4.976415094339623, "grad_norm": 2.6739718914031982, "learning_rate": 9.340636572898383e-06, "loss": 0.4274, "num_input_tokens_seen": 5549712, "step": 8440 }, { "epoch": 4.97936320754717, "grad_norm": 1.533219814300537, "learning_rate": 9.339359053363272e-06, "loss": 0.5231, "num_input_tokens_seen": 5552880, "step": 8445 }, { "epoch": 4.982311320754717, "grad_norm": 4.011496067047119, "learning_rate": 9.338080384964148e-06, "loss": 0.4758, "num_input_tokens_seen": 5555696, "step": 8450 }, { "epoch": 4.9852594339622645, "grad_norm": 1.9001705646514893, "learning_rate": 9.336800568039548e-06, "loss": 0.4446, "num_input_tokens_seen": 5561040, "step": 8455 }, { "epoch": 4.988207547169811, "grad_norm": 1.291440486907959, "learning_rate": 9.335519602928307e-06, "loss": 0.4016, "num_input_tokens_seen": 5564592, "step": 8460 }, { "epoch": 4.991155660377358, "grad_norm": 1.4741010665893555, "learning_rate": 9.334237489969565e-06, "loss": 0.3865, "num_input_tokens_seen": 5567376, "step": 8465 }, { "epoch": 4.994103773584905, "grad_norm": 2.562795400619507, "learning_rate": 9.332954229502768e-06, "loss": 0.4431, "num_input_tokens_seen": 5570320, "step": 8470 }, { "epoch": 4.997051886792453, "grad_norm": 1.1750456094741821, "learning_rate": 9.331669821867665e-06, "loss": 0.3592, "num_input_tokens_seen": 5573232, "step": 8475 }, { "epoch": 5.0, "grad_norm": 2.7815613746643066, "learning_rate": 9.33038426740431e-06, "loss": 0.3355, "num_input_tokens_seen": 5576208, "step": 8480 }, { "epoch": 5.002948113207547, "grad_norm": 2.2509920597076416, "learning_rate": 9.329097566453055e-06, "loss": 0.599, "num_input_tokens_seen": 5579760, "step": 8485 }, { "epoch": 5.005896226415095, "grad_norm": 3.286219358444214, "learning_rate": 9.327809719354564e-06, "loss": 0.4043, "num_input_tokens_seen": 5582864, "step": 8490 }, { "epoch": 5.008844339622642, "grad_norm": 2.1692628860473633, "learning_rate": 9.326520726449795e-06, "loss": 0.7048, "num_input_tokens_seen": 5586544, "step": 8495 }, { "epoch": 5.011792452830188, "grad_norm": 1.1213809251785278, "learning_rate": 9.32523058808002e-06, "loss": 0.5253, "num_input_tokens_seen": 5590352, "step": 8500 }, { "epoch": 5.0147405660377355, "grad_norm": 1.030902624130249, "learning_rate": 9.323939304586806e-06, "loss": 0.5322, "num_input_tokens_seen": 5593392, "step": 8505 }, { "epoch": 5.017688679245283, "grad_norm": 1.767309308052063, "learning_rate": 9.322646876312025e-06, "loss": 0.5101, "num_input_tokens_seen": 5595920, "step": 8510 }, { "epoch": 5.02063679245283, "grad_norm": 1.544469952583313, "learning_rate": 9.321353303597854e-06, "loss": 0.4391, "num_input_tokens_seen": 5599792, "step": 8515 }, { "epoch": 5.023584905660377, "grad_norm": 1.9386332035064697, "learning_rate": 9.320058586786771e-06, "loss": 0.4172, "num_input_tokens_seen": 5603056, "step": 8520 }, { "epoch": 5.026533018867925, "grad_norm": 0.8280078768730164, "learning_rate": 9.31876272622156e-06, "loss": 0.4822, "num_input_tokens_seen": 5608528, "step": 8525 }, { "epoch": 5.029481132075472, "grad_norm": 1.9066338539123535, "learning_rate": 9.317465722245305e-06, "loss": 0.4107, "num_input_tokens_seen": 5610768, "step": 8530 }, { "epoch": 5.032429245283019, "grad_norm": 1.6511282920837402, "learning_rate": 9.316167575201391e-06, "loss": 0.4612, "num_input_tokens_seen": 5613872, "step": 8535 }, { "epoch": 5.035377358490566, "grad_norm": 1.6795272827148438, "learning_rate": 9.31486828543351e-06, "loss": 0.3964, "num_input_tokens_seen": 5616816, "step": 8540 }, { "epoch": 5.038325471698113, "grad_norm": 2.0182554721832275, "learning_rate": 9.313567853285656e-06, "loss": 0.4989, "num_input_tokens_seen": 5619856, "step": 8545 }, { "epoch": 5.04127358490566, "grad_norm": 1.7882273197174072, "learning_rate": 9.31226627910212e-06, "loss": 0.4947, "num_input_tokens_seen": 5623376, "step": 8550 }, { "epoch": 5.0442216981132075, "grad_norm": 1.623075246810913, "learning_rate": 9.310963563227504e-06, "loss": 0.5136, "num_input_tokens_seen": 5626800, "step": 8555 }, { "epoch": 5.047169811320755, "grad_norm": 1.3902735710144043, "learning_rate": 9.309659706006704e-06, "loss": 0.4532, "num_input_tokens_seen": 5630672, "step": 8560 }, { "epoch": 5.050117924528302, "grad_norm": 1.0947688817977905, "learning_rate": 9.308354707784925e-06, "loss": 0.4068, "num_input_tokens_seen": 5633840, "step": 8565 }, { "epoch": 5.053066037735849, "grad_norm": 1.8611057996749878, "learning_rate": 9.307048568907669e-06, "loss": 0.4989, "num_input_tokens_seen": 5636880, "step": 8570 }, { "epoch": 5.056014150943396, "grad_norm": 4.94350004196167, "learning_rate": 9.30574128972074e-06, "loss": 0.4181, "num_input_tokens_seen": 5640336, "step": 8575 }, { "epoch": 5.058962264150943, "grad_norm": 1.6376441717147827, "learning_rate": 9.304432870570247e-06, "loss": 0.5415, "num_input_tokens_seen": 5644272, "step": 8580 }, { "epoch": 5.06191037735849, "grad_norm": 1.513922095298767, "learning_rate": 9.303123311802605e-06, "loss": 0.4002, "num_input_tokens_seen": 5651184, "step": 8585 }, { "epoch": 5.064858490566038, "grad_norm": 1.4587358236312866, "learning_rate": 9.301812613764516e-06, "loss": 0.3707, "num_input_tokens_seen": 5653776, "step": 8590 }, { "epoch": 5.067806603773585, "grad_norm": 1.9446543455123901, "learning_rate": 9.300500776803001e-06, "loss": 0.5782, "num_input_tokens_seen": 5656784, "step": 8595 }, { "epoch": 5.070754716981132, "grad_norm": 1.240234136581421, "learning_rate": 9.29918780126537e-06, "loss": 0.4463, "num_input_tokens_seen": 5660336, "step": 8600 }, { "epoch": 5.0737028301886795, "grad_norm": 4.062267303466797, "learning_rate": 9.297873687499239e-06, "loss": 0.3619, "num_input_tokens_seen": 5663472, "step": 8605 }, { "epoch": 5.076650943396227, "grad_norm": 1.2446560859680176, "learning_rate": 9.296558435852528e-06, "loss": 0.4536, "num_input_tokens_seen": 5667120, "step": 8610 }, { "epoch": 5.079599056603773, "grad_norm": 3.8689823150634766, "learning_rate": 9.295242046673454e-06, "loss": 0.4227, "num_input_tokens_seen": 5670352, "step": 8615 }, { "epoch": 5.0825471698113205, "grad_norm": 3.9903786182403564, "learning_rate": 9.293924520310535e-06, "loss": 0.6503, "num_input_tokens_seen": 5673328, "step": 8620 }, { "epoch": 5.085495283018868, "grad_norm": 2.2605702877044678, "learning_rate": 9.292605857112595e-06, "loss": 0.4477, "num_input_tokens_seen": 5677232, "step": 8625 }, { "epoch": 5.088443396226415, "grad_norm": 2.202570915222168, "learning_rate": 9.291286057428755e-06, "loss": 0.484, "num_input_tokens_seen": 5681008, "step": 8630 }, { "epoch": 5.091391509433962, "grad_norm": 1.4465112686157227, "learning_rate": 9.289965121608436e-06, "loss": 0.4302, "num_input_tokens_seen": 5684176, "step": 8635 }, { "epoch": 5.09433962264151, "grad_norm": 1.5815354585647583, "learning_rate": 9.288643050001362e-06, "loss": 0.527, "num_input_tokens_seen": 5687568, "step": 8640 }, { "epoch": 5.097287735849057, "grad_norm": 2.746887683868408, "learning_rate": 9.287319842957557e-06, "loss": 0.4622, "num_input_tokens_seen": 5691152, "step": 8645 }, { "epoch": 5.100235849056604, "grad_norm": 1.680882215499878, "learning_rate": 9.285995500827348e-06, "loss": 0.3863, "num_input_tokens_seen": 5694512, "step": 8650 }, { "epoch": 5.103183962264151, "grad_norm": 1.3339046239852905, "learning_rate": 9.284670023961355e-06, "loss": 0.5104, "num_input_tokens_seen": 5698192, "step": 8655 }, { "epoch": 5.106132075471698, "grad_norm": 1.4597092866897583, "learning_rate": 9.28334341271051e-06, "loss": 0.3519, "num_input_tokens_seen": 5701680, "step": 8660 }, { "epoch": 5.109080188679245, "grad_norm": 1.4530248641967773, "learning_rate": 9.282015667426036e-06, "loss": 0.5751, "num_input_tokens_seen": 5704656, "step": 8665 }, { "epoch": 5.1120283018867925, "grad_norm": 1.615359902381897, "learning_rate": 9.280686788459461e-06, "loss": 0.4526, "num_input_tokens_seen": 5707408, "step": 8670 }, { "epoch": 5.11497641509434, "grad_norm": 1.4398856163024902, "learning_rate": 9.279356776162606e-06, "loss": 0.4581, "num_input_tokens_seen": 5711024, "step": 8675 }, { "epoch": 5.117924528301887, "grad_norm": 1.2901521921157837, "learning_rate": 9.278025630887607e-06, "loss": 0.4647, "num_input_tokens_seen": 5714064, "step": 8680 }, { "epoch": 5.120872641509434, "grad_norm": 2.1178243160247803, "learning_rate": 9.27669335298688e-06, "loss": 0.4528, "num_input_tokens_seen": 5717264, "step": 8685 }, { "epoch": 5.123820754716981, "grad_norm": 1.4925237894058228, "learning_rate": 9.275359942813158e-06, "loss": 0.5211, "num_input_tokens_seen": 5720368, "step": 8690 }, { "epoch": 5.126768867924528, "grad_norm": 2.2151763439178467, "learning_rate": 9.274025400719466e-06, "loss": 0.3644, "num_input_tokens_seen": 5723408, "step": 8695 }, { "epoch": 5.129716981132075, "grad_norm": 4.057134628295898, "learning_rate": 9.27268972705913e-06, "loss": 0.4587, "num_input_tokens_seen": 5725840, "step": 8700 }, { "epoch": 5.132665094339623, "grad_norm": 5.500420570373535, "learning_rate": 9.271352922185772e-06, "loss": 0.4497, "num_input_tokens_seen": 5728208, "step": 8705 }, { "epoch": 5.13561320754717, "grad_norm": 2.0715699195861816, "learning_rate": 9.270014986453321e-06, "loss": 0.2845, "num_input_tokens_seen": 5731728, "step": 8710 }, { "epoch": 5.138561320754717, "grad_norm": 2.5610973834991455, "learning_rate": 9.268675920215999e-06, "loss": 0.402, "num_input_tokens_seen": 5734576, "step": 8715 }, { "epoch": 5.1415094339622645, "grad_norm": 1.2906702756881714, "learning_rate": 9.26733572382833e-06, "loss": 0.4756, "num_input_tokens_seen": 5737808, "step": 8720 }, { "epoch": 5.144457547169812, "grad_norm": 1.1267014741897583, "learning_rate": 9.265994397645137e-06, "loss": 0.4056, "num_input_tokens_seen": 5741456, "step": 8725 }, { "epoch": 5.147405660377358, "grad_norm": 2.0421361923217773, "learning_rate": 9.264651942021543e-06, "loss": 0.5338, "num_input_tokens_seen": 5744368, "step": 8730 }, { "epoch": 5.150353773584905, "grad_norm": 2.3602747917175293, "learning_rate": 9.263308357312966e-06, "loss": 0.3279, "num_input_tokens_seen": 5747952, "step": 8735 }, { "epoch": 5.153301886792453, "grad_norm": 2.3990132808685303, "learning_rate": 9.26196364387513e-06, "loss": 0.5392, "num_input_tokens_seen": 5750832, "step": 8740 }, { "epoch": 5.15625, "grad_norm": 1.2207131385803223, "learning_rate": 9.26061780206405e-06, "loss": 0.4033, "num_input_tokens_seen": 5753968, "step": 8745 }, { "epoch": 5.159198113207547, "grad_norm": 1.903184175491333, "learning_rate": 9.259270832236043e-06, "loss": 0.3124, "num_input_tokens_seen": 5756912, "step": 8750 }, { "epoch": 5.162146226415095, "grad_norm": 1.6476223468780518, "learning_rate": 9.257922734747729e-06, "loss": 0.3071, "num_input_tokens_seen": 5760368, "step": 8755 }, { "epoch": 5.165094339622642, "grad_norm": 1.2602455615997314, "learning_rate": 9.256573509956018e-06, "loss": 0.4645, "num_input_tokens_seen": 5763824, "step": 8760 }, { "epoch": 5.168042452830188, "grad_norm": 1.8353837728500366, "learning_rate": 9.255223158218127e-06, "loss": 0.55, "num_input_tokens_seen": 5768656, "step": 8765 }, { "epoch": 5.1709905660377355, "grad_norm": 1.1687804460525513, "learning_rate": 9.253871679891566e-06, "loss": 0.4195, "num_input_tokens_seen": 5772912, "step": 8770 }, { "epoch": 5.173938679245283, "grad_norm": 2.6070749759674072, "learning_rate": 9.252519075334143e-06, "loss": 0.4258, "num_input_tokens_seen": 5776400, "step": 8775 }, { "epoch": 5.17688679245283, "grad_norm": 1.296957015991211, "learning_rate": 9.251165344903969e-06, "loss": 0.5511, "num_input_tokens_seen": 5783472, "step": 8780 }, { "epoch": 5.179834905660377, "grad_norm": 1.4126909971237183, "learning_rate": 9.249810488959448e-06, "loss": 0.4457, "num_input_tokens_seen": 5786672, "step": 8785 }, { "epoch": 5.182783018867925, "grad_norm": 3.5645840167999268, "learning_rate": 9.248454507859285e-06, "loss": 0.5094, "num_input_tokens_seen": 5789488, "step": 8790 }, { "epoch": 5.185731132075472, "grad_norm": 3.515427589416504, "learning_rate": 9.247097401962482e-06, "loss": 0.3814, "num_input_tokens_seen": 5791984, "step": 8795 }, { "epoch": 5.188679245283019, "grad_norm": 2.7136270999908447, "learning_rate": 9.245739171628335e-06, "loss": 0.4139, "num_input_tokens_seen": 5794992, "step": 8800 }, { "epoch": 5.191627358490566, "grad_norm": 1.7092665433883667, "learning_rate": 9.244379817216447e-06, "loss": 0.442, "num_input_tokens_seen": 5798608, "step": 8805 }, { "epoch": 5.194575471698113, "grad_norm": 1.666703701019287, "learning_rate": 9.243019339086708e-06, "loss": 0.46, "num_input_tokens_seen": 5801008, "step": 8810 }, { "epoch": 5.19752358490566, "grad_norm": 1.5970726013183594, "learning_rate": 9.241657737599313e-06, "loss": 0.5199, "num_input_tokens_seen": 5804816, "step": 8815 }, { "epoch": 5.2004716981132075, "grad_norm": 0.7520119547843933, "learning_rate": 9.240295013114752e-06, "loss": 0.3152, "num_input_tokens_seen": 5808464, "step": 8820 }, { "epoch": 5.203419811320755, "grad_norm": 0.9182219505310059, "learning_rate": 9.238931165993811e-06, "loss": 0.3854, "num_input_tokens_seen": 5811408, "step": 8825 }, { "epoch": 5.206367924528302, "grad_norm": 1.1959481239318848, "learning_rate": 9.237566196597577e-06, "loss": 0.5178, "num_input_tokens_seen": 5814576, "step": 8830 }, { "epoch": 5.209316037735849, "grad_norm": 1.7995961904525757, "learning_rate": 9.236200105287427e-06, "loss": 0.4638, "num_input_tokens_seen": 5817776, "step": 8835 }, { "epoch": 5.212264150943396, "grad_norm": 2.0534157752990723, "learning_rate": 9.234832892425042e-06, "loss": 0.4361, "num_input_tokens_seen": 5820560, "step": 8840 }, { "epoch": 5.215212264150943, "grad_norm": 1.2805962562561035, "learning_rate": 9.2334645583724e-06, "loss": 0.4192, "num_input_tokens_seen": 5824784, "step": 8845 }, { "epoch": 5.21816037735849, "grad_norm": 1.3162072896957397, "learning_rate": 9.23209510349177e-06, "loss": 0.406, "num_input_tokens_seen": 5829104, "step": 8850 }, { "epoch": 5.221108490566038, "grad_norm": 1.6399866342544556, "learning_rate": 9.230724528145722e-06, "loss": 0.5492, "num_input_tokens_seen": 5832304, "step": 8855 }, { "epoch": 5.224056603773585, "grad_norm": 2.0032639503479004, "learning_rate": 9.229352832697122e-06, "loss": 0.4996, "num_input_tokens_seen": 5834704, "step": 8860 }, { "epoch": 5.227004716981132, "grad_norm": 3.007368564605713, "learning_rate": 9.22798001750913e-06, "loss": 0.4579, "num_input_tokens_seen": 5837360, "step": 8865 }, { "epoch": 5.2299528301886795, "grad_norm": 3.841254711151123, "learning_rate": 9.226606082945209e-06, "loss": 0.4597, "num_input_tokens_seen": 5840048, "step": 8870 }, { "epoch": 5.232900943396227, "grad_norm": 1.3834813833236694, "learning_rate": 9.225231029369112e-06, "loss": 0.2982, "num_input_tokens_seen": 5843376, "step": 8875 }, { "epoch": 5.235849056603773, "grad_norm": 1.9613910913467407, "learning_rate": 9.22385485714489e-06, "loss": 0.387, "num_input_tokens_seen": 5846608, "step": 8880 }, { "epoch": 5.2387971698113205, "grad_norm": 0.6365675330162048, "learning_rate": 9.222477566636889e-06, "loss": 0.3492, "num_input_tokens_seen": 5850032, "step": 8885 }, { "epoch": 5.241745283018868, "grad_norm": 1.977371096611023, "learning_rate": 9.221099158209757e-06, "loss": 0.3907, "num_input_tokens_seen": 5852400, "step": 8890 }, { "epoch": 5.244693396226415, "grad_norm": 1.8447961807250977, "learning_rate": 9.219719632228429e-06, "loss": 0.3711, "num_input_tokens_seen": 5855056, "step": 8895 }, { "epoch": 5.247641509433962, "grad_norm": 2.0137274265289307, "learning_rate": 9.218338989058141e-06, "loss": 0.3799, "num_input_tokens_seen": 5858544, "step": 8900 }, { "epoch": 5.25058962264151, "grad_norm": 1.2164019346237183, "learning_rate": 9.21695722906443e-06, "loss": 0.5608, "num_input_tokens_seen": 5862000, "step": 8905 }, { "epoch": 5.253537735849057, "grad_norm": 1.4283004999160767, "learning_rate": 9.215574352613115e-06, "loss": 0.449, "num_input_tokens_seen": 5865776, "step": 8910 }, { "epoch": 5.256485849056604, "grad_norm": 1.7063111066818237, "learning_rate": 9.214190360070323e-06, "loss": 0.4139, "num_input_tokens_seen": 5868976, "step": 8915 }, { "epoch": 5.259433962264151, "grad_norm": 2.5049901008605957, "learning_rate": 9.212805251802471e-06, "loss": 0.4523, "num_input_tokens_seen": 5872464, "step": 8920 }, { "epoch": 5.262382075471698, "grad_norm": 2.001146078109741, "learning_rate": 9.211419028176273e-06, "loss": 0.5771, "num_input_tokens_seen": 5875696, "step": 8925 }, { "epoch": 5.265330188679245, "grad_norm": 2.7330288887023926, "learning_rate": 9.210031689558738e-06, "loss": 0.3827, "num_input_tokens_seen": 5882032, "step": 8930 }, { "epoch": 5.2682783018867925, "grad_norm": 1.5459678173065186, "learning_rate": 9.208643236317166e-06, "loss": 0.472, "num_input_tokens_seen": 5885232, "step": 8935 }, { "epoch": 5.27122641509434, "grad_norm": 0.9355018138885498, "learning_rate": 9.20725366881916e-06, "loss": 0.4651, "num_input_tokens_seen": 5889584, "step": 8940 }, { "epoch": 5.274174528301887, "grad_norm": 1.8899022340774536, "learning_rate": 9.205862987432614e-06, "loss": 0.4652, "num_input_tokens_seen": 5894192, "step": 8945 }, { "epoch": 5.277122641509434, "grad_norm": 2.2202565670013428, "learning_rate": 9.204471192525715e-06, "loss": 0.4208, "num_input_tokens_seen": 5897328, "step": 8950 }, { "epoch": 5.280070754716981, "grad_norm": 2.2768843173980713, "learning_rate": 9.203078284466949e-06, "loss": 0.5462, "num_input_tokens_seen": 5900496, "step": 8955 }, { "epoch": 5.283018867924528, "grad_norm": 2.5466558933258057, "learning_rate": 9.201684263625091e-06, "loss": 0.4242, "num_input_tokens_seen": 5903056, "step": 8960 }, { "epoch": 5.285966981132075, "grad_norm": 2.093646287918091, "learning_rate": 9.200289130369218e-06, "loss": 0.3505, "num_input_tokens_seen": 5907888, "step": 8965 }, { "epoch": 5.288915094339623, "grad_norm": 2.367992877960205, "learning_rate": 9.198892885068693e-06, "loss": 0.4285, "num_input_tokens_seen": 5910704, "step": 8970 }, { "epoch": 5.29186320754717, "grad_norm": 1.5037342309951782, "learning_rate": 9.197495528093182e-06, "loss": 0.4927, "num_input_tokens_seen": 5913648, "step": 8975 }, { "epoch": 5.294811320754717, "grad_norm": 0.9579063057899475, "learning_rate": 9.196097059812639e-06, "loss": 0.4222, "num_input_tokens_seen": 5917232, "step": 8980 }, { "epoch": 5.2977594339622645, "grad_norm": 1.8144636154174805, "learning_rate": 9.194697480597316e-06, "loss": 0.444, "num_input_tokens_seen": 5922096, "step": 8985 }, { "epoch": 5.300707547169811, "grad_norm": 2.158350706100464, "learning_rate": 9.193296790817755e-06, "loss": 0.4404, "num_input_tokens_seen": 5925872, "step": 8990 }, { "epoch": 5.303655660377358, "grad_norm": 1.6068655252456665, "learning_rate": 9.1918949908448e-06, "loss": 0.3288, "num_input_tokens_seen": 5928368, "step": 8995 }, { "epoch": 5.306603773584905, "grad_norm": 1.9473400115966797, "learning_rate": 9.190492081049578e-06, "loss": 0.4756, "num_input_tokens_seen": 5931888, "step": 9000 }, { "epoch": 5.309551886792453, "grad_norm": 1.0463579893112183, "learning_rate": 9.189088061803517e-06, "loss": 0.4872, "num_input_tokens_seen": 5935088, "step": 9005 }, { "epoch": 5.3125, "grad_norm": 1.6355481147766113, "learning_rate": 9.187682933478337e-06, "loss": 0.4778, "num_input_tokens_seen": 5938000, "step": 9010 }, { "epoch": 5.315448113207547, "grad_norm": 0.9411556124687195, "learning_rate": 9.186276696446054e-06, "loss": 0.3793, "num_input_tokens_seen": 5940944, "step": 9015 }, { "epoch": 5.318396226415095, "grad_norm": 1.8102288246154785, "learning_rate": 9.184869351078974e-06, "loss": 0.4695, "num_input_tokens_seen": 5943472, "step": 9020 }, { "epoch": 5.321344339622642, "grad_norm": 1.396687626838684, "learning_rate": 9.183460897749697e-06, "loss": 0.362, "num_input_tokens_seen": 5946352, "step": 9025 }, { "epoch": 5.324292452830189, "grad_norm": 1.8311779499053955, "learning_rate": 9.182051336831117e-06, "loss": 0.3996, "num_input_tokens_seen": 5949456, "step": 9030 }, { "epoch": 5.3272405660377355, "grad_norm": 1.737457275390625, "learning_rate": 9.180640668696424e-06, "loss": 0.6482, "num_input_tokens_seen": 5952176, "step": 9035 }, { "epoch": 5.330188679245283, "grad_norm": 2.9504659175872803, "learning_rate": 9.179228893719094e-06, "loss": 0.5487, "num_input_tokens_seen": 5955056, "step": 9040 }, { "epoch": 5.33313679245283, "grad_norm": 2.1129534244537354, "learning_rate": 9.177816012272904e-06, "loss": 0.4376, "num_input_tokens_seen": 5958032, "step": 9045 }, { "epoch": 5.336084905660377, "grad_norm": 1.2721350193023682, "learning_rate": 9.17640202473192e-06, "loss": 0.4716, "num_input_tokens_seen": 5960976, "step": 9050 }, { "epoch": 5.339033018867925, "grad_norm": 2.7213451862335205, "learning_rate": 9.1749869314705e-06, "loss": 0.645, "num_input_tokens_seen": 5964144, "step": 9055 }, { "epoch": 5.341981132075472, "grad_norm": 1.0529426336288452, "learning_rate": 9.173570732863295e-06, "loss": 0.4562, "num_input_tokens_seen": 5967472, "step": 9060 }, { "epoch": 5.344929245283019, "grad_norm": 0.8690145611763, "learning_rate": 9.172153429285254e-06, "loss": 0.5285, "num_input_tokens_seen": 5972368, "step": 9065 }, { "epoch": 5.347877358490566, "grad_norm": 2.0693228244781494, "learning_rate": 9.17073502111161e-06, "loss": 0.3722, "num_input_tokens_seen": 5975664, "step": 9070 }, { "epoch": 5.350825471698113, "grad_norm": 1.314242959022522, "learning_rate": 9.169315508717895e-06, "loss": 0.4695, "num_input_tokens_seen": 5978640, "step": 9075 }, { "epoch": 5.35377358490566, "grad_norm": 1.2657374143600464, "learning_rate": 9.167894892479932e-06, "loss": 0.4243, "num_input_tokens_seen": 5982096, "step": 9080 }, { "epoch": 5.3567216981132075, "grad_norm": 1.5723438262939453, "learning_rate": 9.16647317277383e-06, "loss": 0.3623, "num_input_tokens_seen": 5985808, "step": 9085 }, { "epoch": 5.359669811320755, "grad_norm": 1.4243769645690918, "learning_rate": 9.165050349976002e-06, "loss": 0.447, "num_input_tokens_seen": 5989456, "step": 9090 }, { "epoch": 5.362617924528302, "grad_norm": 2.9499151706695557, "learning_rate": 9.16362642446314e-06, "loss": 0.4747, "num_input_tokens_seen": 5993232, "step": 9095 }, { "epoch": 5.365566037735849, "grad_norm": 1.3168716430664062, "learning_rate": 9.162201396612242e-06, "loss": 0.417, "num_input_tokens_seen": 5996848, "step": 9100 }, { "epoch": 5.368514150943396, "grad_norm": 2.623413562774658, "learning_rate": 9.160775266800583e-06, "loss": 0.5268, "num_input_tokens_seen": 6000080, "step": 9105 }, { "epoch": 5.371462264150943, "grad_norm": 1.2791543006896973, "learning_rate": 9.159348035405742e-06, "loss": 0.366, "num_input_tokens_seen": 6003024, "step": 9110 }, { "epoch": 5.37441037735849, "grad_norm": 1.8034533262252808, "learning_rate": 9.157919702805582e-06, "loss": 0.3721, "num_input_tokens_seen": 6005776, "step": 9115 }, { "epoch": 5.377358490566038, "grad_norm": 2.4158236980438232, "learning_rate": 9.156490269378262e-06, "loss": 0.4677, "num_input_tokens_seen": 6008816, "step": 9120 }, { "epoch": 5.380306603773585, "grad_norm": 1.448272466659546, "learning_rate": 9.15505973550223e-06, "loss": 0.5648, "num_input_tokens_seen": 6011952, "step": 9125 }, { "epoch": 5.383254716981132, "grad_norm": 2.540178060531616, "learning_rate": 9.153628101556223e-06, "loss": 0.5351, "num_input_tokens_seen": 6015184, "step": 9130 }, { "epoch": 5.3862028301886795, "grad_norm": 2.09043025970459, "learning_rate": 9.152195367919277e-06, "loss": 0.5443, "num_input_tokens_seen": 6019600, "step": 9135 }, { "epoch": 5.389150943396227, "grad_norm": 2.39131760597229, "learning_rate": 9.150761534970713e-06, "loss": 0.5158, "num_input_tokens_seen": 6022576, "step": 9140 }, { "epoch": 5.392099056603773, "grad_norm": 1.2820415496826172, "learning_rate": 9.149326603090144e-06, "loss": 0.4649, "num_input_tokens_seen": 6026768, "step": 9145 }, { "epoch": 5.3950471698113205, "grad_norm": 1.8241009712219238, "learning_rate": 9.147890572657471e-06, "loss": 0.3435, "num_input_tokens_seen": 6029232, "step": 9150 }, { "epoch": 5.397995283018868, "grad_norm": 2.0417590141296387, "learning_rate": 9.146453444052895e-06, "loss": 0.3881, "num_input_tokens_seen": 6032272, "step": 9155 }, { "epoch": 5.400943396226415, "grad_norm": 1.1340543031692505, "learning_rate": 9.145015217656899e-06, "loss": 0.3687, "num_input_tokens_seen": 6035408, "step": 9160 }, { "epoch": 5.403891509433962, "grad_norm": 2.698467969894409, "learning_rate": 9.14357589385026e-06, "loss": 0.3509, "num_input_tokens_seen": 6039472, "step": 9165 }, { "epoch": 5.40683962264151, "grad_norm": 1.4927802085876465, "learning_rate": 9.142135473014046e-06, "loss": 0.3563, "num_input_tokens_seen": 6042064, "step": 9170 }, { "epoch": 5.409787735849057, "grad_norm": 2.756965160369873, "learning_rate": 9.140693955529614e-06, "loss": 0.4665, "num_input_tokens_seen": 6046256, "step": 9175 }, { "epoch": 5.412735849056604, "grad_norm": 1.4925962686538696, "learning_rate": 9.13925134177861e-06, "loss": 0.4125, "num_input_tokens_seen": 6048848, "step": 9180 }, { "epoch": 5.415683962264151, "grad_norm": 3.938459873199463, "learning_rate": 9.137807632142977e-06, "loss": 0.4743, "num_input_tokens_seen": 6051792, "step": 9185 }, { "epoch": 5.418632075471698, "grad_norm": 2.1098568439483643, "learning_rate": 9.136362827004937e-06, "loss": 0.4483, "num_input_tokens_seen": 6055504, "step": 9190 }, { "epoch": 5.421580188679245, "grad_norm": 4.759139537811279, "learning_rate": 9.134916926747015e-06, "loss": 0.4483, "num_input_tokens_seen": 6057872, "step": 9195 }, { "epoch": 5.4245283018867925, "grad_norm": 3.8554651737213135, "learning_rate": 9.133469931752016e-06, "loss": 0.4206, "num_input_tokens_seen": 6060752, "step": 9200 }, { "epoch": 5.42747641509434, "grad_norm": 1.46991765499115, "learning_rate": 9.132021842403035e-06, "loss": 0.509, "num_input_tokens_seen": 6063440, "step": 9205 }, { "epoch": 5.430424528301887, "grad_norm": 3.0889194011688232, "learning_rate": 9.130572659083465e-06, "loss": 0.4454, "num_input_tokens_seen": 6066160, "step": 9210 }, { "epoch": 5.433372641509434, "grad_norm": 1.2232657670974731, "learning_rate": 9.129122382176982e-06, "loss": 0.3615, "num_input_tokens_seen": 6069680, "step": 9215 }, { "epoch": 5.436320754716981, "grad_norm": 2.705768346786499, "learning_rate": 9.127671012067554e-06, "loss": 0.3992, "num_input_tokens_seen": 6073104, "step": 9220 }, { "epoch": 5.439268867924528, "grad_norm": 2.7858078479766846, "learning_rate": 9.126218549139434e-06, "loss": 0.4175, "num_input_tokens_seen": 6076112, "step": 9225 }, { "epoch": 5.442216981132075, "grad_norm": 2.035801649093628, "learning_rate": 9.124764993777171e-06, "loss": 0.4627, "num_input_tokens_seen": 6079696, "step": 9230 }, { "epoch": 5.445165094339623, "grad_norm": 2.6079564094543457, "learning_rate": 9.1233103463656e-06, "loss": 0.5036, "num_input_tokens_seen": 6082800, "step": 9235 }, { "epoch": 5.44811320754717, "grad_norm": 1.1703869104385376, "learning_rate": 9.121854607289842e-06, "loss": 0.4426, "num_input_tokens_seen": 6085616, "step": 9240 }, { "epoch": 5.451061320754717, "grad_norm": 2.774160385131836, "learning_rate": 9.120397776935314e-06, "loss": 0.4048, "num_input_tokens_seen": 6088464, "step": 9245 }, { "epoch": 5.4540094339622645, "grad_norm": 1.574312686920166, "learning_rate": 9.118939855687717e-06, "loss": 0.5299, "num_input_tokens_seen": 6090992, "step": 9250 }, { "epoch": 5.456957547169811, "grad_norm": 1.2978872060775757, "learning_rate": 9.117480843933043e-06, "loss": 0.3684, "num_input_tokens_seen": 6094512, "step": 9255 }, { "epoch": 5.459905660377358, "grad_norm": 2.0553863048553467, "learning_rate": 9.116020742057567e-06, "loss": 0.4666, "num_input_tokens_seen": 6098192, "step": 9260 }, { "epoch": 5.462853773584905, "grad_norm": 1.6784273386001587, "learning_rate": 9.114559550447863e-06, "loss": 0.3229, "num_input_tokens_seen": 6100656, "step": 9265 }, { "epoch": 5.465801886792453, "grad_norm": 1.4353039264678955, "learning_rate": 9.113097269490784e-06, "loss": 0.5456, "num_input_tokens_seen": 6103472, "step": 9270 }, { "epoch": 5.46875, "grad_norm": 2.214181900024414, "learning_rate": 9.111633899573476e-06, "loss": 0.4296, "num_input_tokens_seen": 6107376, "step": 9275 }, { "epoch": 5.471698113207547, "grad_norm": 1.3857688903808594, "learning_rate": 9.110169441083374e-06, "loss": 0.3932, "num_input_tokens_seen": 6110288, "step": 9280 }, { "epoch": 5.474646226415095, "grad_norm": 2.2996420860290527, "learning_rate": 9.108703894408198e-06, "loss": 0.5081, "num_input_tokens_seen": 6113776, "step": 9285 }, { "epoch": 5.477594339622642, "grad_norm": 1.8776628971099854, "learning_rate": 9.107237259935959e-06, "loss": 0.4433, "num_input_tokens_seen": 6119344, "step": 9290 }, { "epoch": 5.480542452830189, "grad_norm": 2.6586310863494873, "learning_rate": 9.105769538054954e-06, "loss": 0.5416, "num_input_tokens_seen": 6122352, "step": 9295 }, { "epoch": 5.4834905660377355, "grad_norm": 2.127720832824707, "learning_rate": 9.104300729153768e-06, "loss": 0.3742, "num_input_tokens_seen": 6125936, "step": 9300 }, { "epoch": 5.486438679245283, "grad_norm": 1.6624419689178467, "learning_rate": 9.102830833621277e-06, "loss": 0.4611, "num_input_tokens_seen": 6128496, "step": 9305 }, { "epoch": 5.48938679245283, "grad_norm": 0.9749917984008789, "learning_rate": 9.101359851846639e-06, "loss": 0.4429, "num_input_tokens_seen": 6132240, "step": 9310 }, { "epoch": 5.492334905660377, "grad_norm": 2.6871166229248047, "learning_rate": 9.099887784219305e-06, "loss": 0.3733, "num_input_tokens_seen": 6137744, "step": 9315 }, { "epoch": 5.495283018867925, "grad_norm": 2.200547933578491, "learning_rate": 9.098414631129012e-06, "loss": 0.4654, "num_input_tokens_seen": 6140144, "step": 9320 }, { "epoch": 5.498231132075472, "grad_norm": 1.6323367357254028, "learning_rate": 9.09694039296578e-06, "loss": 0.4913, "num_input_tokens_seen": 6143024, "step": 9325 }, { "epoch": 5.501179245283019, "grad_norm": 1.4288878440856934, "learning_rate": 9.095465070119924e-06, "loss": 0.6553, "num_input_tokens_seen": 6146352, "step": 9330 }, { "epoch": 5.504127358490566, "grad_norm": 0.9306563138961792, "learning_rate": 9.09398866298204e-06, "loss": 0.483, "num_input_tokens_seen": 6149712, "step": 9335 }, { "epoch": 5.507075471698113, "grad_norm": 1.8516314029693604, "learning_rate": 9.09251117194301e-06, "loss": 0.3064, "num_input_tokens_seen": 6153168, "step": 9340 }, { "epoch": 5.51002358490566, "grad_norm": 1.0056731700897217, "learning_rate": 9.091032597394012e-06, "loss": 0.4477, "num_input_tokens_seen": 6155920, "step": 9345 }, { "epoch": 5.5129716981132075, "grad_norm": 1.268384337425232, "learning_rate": 9.089552939726503e-06, "loss": 0.3981, "num_input_tokens_seen": 6159120, "step": 9350 }, { "epoch": 5.515919811320755, "grad_norm": 1.282435655593872, "learning_rate": 9.088072199332227e-06, "loss": 0.4092, "num_input_tokens_seen": 6162160, "step": 9355 }, { "epoch": 5.518867924528302, "grad_norm": 2.5922892093658447, "learning_rate": 9.08659037660322e-06, "loss": 0.4067, "num_input_tokens_seen": 6165904, "step": 9360 }, { "epoch": 5.521816037735849, "grad_norm": 5.17973518371582, "learning_rate": 9.085107471931797e-06, "loss": 0.3608, "num_input_tokens_seen": 6169104, "step": 9365 }, { "epoch": 5.524764150943396, "grad_norm": 1.1576613187789917, "learning_rate": 9.083623485710564e-06, "loss": 0.3577, "num_input_tokens_seen": 6172272, "step": 9370 }, { "epoch": 5.527712264150943, "grad_norm": 1.8527144193649292, "learning_rate": 9.082138418332416e-06, "loss": 0.4201, "num_input_tokens_seen": 6174896, "step": 9375 }, { "epoch": 5.53066037735849, "grad_norm": 1.7309021949768066, "learning_rate": 9.080652270190527e-06, "loss": 0.6526, "num_input_tokens_seen": 6177904, "step": 9380 }, { "epoch": 5.533608490566038, "grad_norm": 2.225705146789551, "learning_rate": 9.079165041678363e-06, "loss": 0.5011, "num_input_tokens_seen": 6180944, "step": 9385 }, { "epoch": 5.536556603773585, "grad_norm": 1.4892221689224243, "learning_rate": 9.077676733189675e-06, "loss": 0.6476, "num_input_tokens_seen": 6184432, "step": 9390 }, { "epoch": 5.539504716981132, "grad_norm": 1.9744480848312378, "learning_rate": 9.076187345118496e-06, "loss": 0.4543, "num_input_tokens_seen": 6188016, "step": 9395 }, { "epoch": 5.5424528301886795, "grad_norm": 1.5793561935424805, "learning_rate": 9.074696877859152e-06, "loss": 0.5431, "num_input_tokens_seen": 6191504, "step": 9400 }, { "epoch": 5.545400943396227, "grad_norm": 3.1090195178985596, "learning_rate": 9.073205331806248e-06, "loss": 0.469, "num_input_tokens_seen": 6194032, "step": 9405 }, { "epoch": 5.548349056603773, "grad_norm": 1.1047942638397217, "learning_rate": 9.071712707354676e-06, "loss": 0.4088, "num_input_tokens_seen": 6197648, "step": 9410 }, { "epoch": 5.5512971698113205, "grad_norm": 3.2159149646759033, "learning_rate": 9.070219004899618e-06, "loss": 0.4392, "num_input_tokens_seen": 6200688, "step": 9415 }, { "epoch": 5.554245283018868, "grad_norm": 1.847395658493042, "learning_rate": 9.068724224836538e-06, "loss": 0.5236, "num_input_tokens_seen": 6203600, "step": 9420 }, { "epoch": 5.557193396226415, "grad_norm": 1.169272541999817, "learning_rate": 9.067228367561182e-06, "loss": 0.436, "num_input_tokens_seen": 6206064, "step": 9425 }, { "epoch": 5.560141509433962, "grad_norm": 2.145601511001587, "learning_rate": 9.06573143346959e-06, "loss": 0.5005, "num_input_tokens_seen": 6209552, "step": 9430 }, { "epoch": 5.56308962264151, "grad_norm": 3.0411248207092285, "learning_rate": 9.064233422958078e-06, "loss": 0.4777, "num_input_tokens_seen": 6212560, "step": 9435 }, { "epoch": 5.566037735849057, "grad_norm": 1.9452571868896484, "learning_rate": 9.062734336423248e-06, "loss": 0.3728, "num_input_tokens_seen": 6215056, "step": 9440 }, { "epoch": 5.568985849056604, "grad_norm": 2.384667158126831, "learning_rate": 9.061234174261998e-06, "loss": 0.3833, "num_input_tokens_seen": 6217904, "step": 9445 }, { "epoch": 5.571933962264151, "grad_norm": 1.5629767179489136, "learning_rate": 9.059732936871493e-06, "loss": 0.4558, "num_input_tokens_seen": 6221200, "step": 9450 }, { "epoch": 5.574882075471698, "grad_norm": 1.5061869621276855, "learning_rate": 9.058230624649198e-06, "loss": 0.5113, "num_input_tokens_seen": 6223920, "step": 9455 }, { "epoch": 5.577830188679245, "grad_norm": 1.3580442667007446, "learning_rate": 9.056727237992856e-06, "loss": 0.3718, "num_input_tokens_seen": 6226320, "step": 9460 }, { "epoch": 5.5807783018867925, "grad_norm": 2.07110595703125, "learning_rate": 9.055222777300493e-06, "loss": 0.3905, "num_input_tokens_seen": 6228880, "step": 9465 }, { "epoch": 5.58372641509434, "grad_norm": 2.125802993774414, "learning_rate": 9.053717242970423e-06, "loss": 0.3834, "num_input_tokens_seen": 6231888, "step": 9470 }, { "epoch": 5.586674528301887, "grad_norm": 0.7862787246704102, "learning_rate": 9.052210635401244e-06, "loss": 0.3512, "num_input_tokens_seen": 6234704, "step": 9475 }, { "epoch": 5.589622641509434, "grad_norm": 1.209218144416809, "learning_rate": 9.050702954991833e-06, "loss": 0.5494, "num_input_tokens_seen": 6238832, "step": 9480 }, { "epoch": 5.592570754716981, "grad_norm": 2.1326193809509277, "learning_rate": 9.049194202141358e-06, "loss": 0.3268, "num_input_tokens_seen": 6241872, "step": 9485 }, { "epoch": 5.595518867924528, "grad_norm": 3.930342674255371, "learning_rate": 9.047684377249267e-06, "loss": 0.5192, "num_input_tokens_seen": 6244240, "step": 9490 }, { "epoch": 5.598466981132075, "grad_norm": 0.32992562651634216, "learning_rate": 9.046173480715292e-06, "loss": 0.4629, "num_input_tokens_seen": 6249808, "step": 9495 }, { "epoch": 5.601415094339623, "grad_norm": 2.2414891719818115, "learning_rate": 9.044661512939451e-06, "loss": 0.4625, "num_input_tokens_seen": 6255344, "step": 9500 }, { "epoch": 5.60436320754717, "grad_norm": 1.7880144119262695, "learning_rate": 9.043148474322043e-06, "loss": 0.523, "num_input_tokens_seen": 6259344, "step": 9505 }, { "epoch": 5.607311320754717, "grad_norm": 1.167341709136963, "learning_rate": 9.04163436526365e-06, "loss": 0.3994, "num_input_tokens_seen": 6262928, "step": 9510 }, { "epoch": 5.6102594339622645, "grad_norm": 0.9776925444602966, "learning_rate": 9.040119186165142e-06, "loss": 0.5453, "num_input_tokens_seen": 6265808, "step": 9515 }, { "epoch": 5.613207547169811, "grad_norm": 1.4401787519454956, "learning_rate": 9.038602937427665e-06, "loss": 0.5702, "num_input_tokens_seen": 6268592, "step": 9520 }, { "epoch": 5.616155660377358, "grad_norm": 1.45424485206604, "learning_rate": 9.037085619452658e-06, "loss": 0.3895, "num_input_tokens_seen": 6271280, "step": 9525 }, { "epoch": 5.619103773584905, "grad_norm": 1.8184254169464111, "learning_rate": 9.035567232641833e-06, "loss": 0.4078, "num_input_tokens_seen": 6273904, "step": 9530 }, { "epoch": 5.622051886792453, "grad_norm": 1.3406755924224854, "learning_rate": 9.03404777739719e-06, "loss": 0.4438, "num_input_tokens_seen": 6277616, "step": 9535 }, { "epoch": 5.625, "grad_norm": 1.330183744430542, "learning_rate": 9.032527254121013e-06, "loss": 0.4115, "num_input_tokens_seen": 6280368, "step": 9540 }, { "epoch": 5.627948113207547, "grad_norm": 1.6298859119415283, "learning_rate": 9.031005663215867e-06, "loss": 0.4991, "num_input_tokens_seen": 6282960, "step": 9545 }, { "epoch": 5.630896226415095, "grad_norm": 1.7503435611724854, "learning_rate": 9.029483005084595e-06, "loss": 0.3716, "num_input_tokens_seen": 6285392, "step": 9550 }, { "epoch": 5.633844339622642, "grad_norm": 1.099950909614563, "learning_rate": 9.027959280130337e-06, "loss": 0.5208, "num_input_tokens_seen": 6288656, "step": 9555 }, { "epoch": 5.636792452830189, "grad_norm": 2.0341150760650635, "learning_rate": 9.026434488756496e-06, "loss": 0.6023, "num_input_tokens_seen": 6293168, "step": 9560 }, { "epoch": 5.6397405660377355, "grad_norm": 2.3310720920562744, "learning_rate": 9.024908631366774e-06, "loss": 0.3795, "num_input_tokens_seen": 6295984, "step": 9565 }, { "epoch": 5.642688679245283, "grad_norm": 2.6796092987060547, "learning_rate": 9.023381708365143e-06, "loss": 0.4998, "num_input_tokens_seen": 6301168, "step": 9570 }, { "epoch": 5.64563679245283, "grad_norm": 1.7312116622924805, "learning_rate": 9.021853720155866e-06, "loss": 0.3828, "num_input_tokens_seen": 6304112, "step": 9575 }, { "epoch": 5.648584905660377, "grad_norm": 1.7049384117126465, "learning_rate": 9.020324667143483e-06, "loss": 0.5646, "num_input_tokens_seen": 6307088, "step": 9580 }, { "epoch": 5.651533018867925, "grad_norm": 1.662471055984497, "learning_rate": 9.018794549732819e-06, "loss": 0.3444, "num_input_tokens_seen": 6310768, "step": 9585 }, { "epoch": 5.654481132075472, "grad_norm": 1.007150411605835, "learning_rate": 9.017263368328977e-06, "loss": 0.4351, "num_input_tokens_seen": 6314096, "step": 9590 }, { "epoch": 5.657429245283019, "grad_norm": 0.8378129601478577, "learning_rate": 9.015731123337344e-06, "loss": 0.4739, "num_input_tokens_seen": 6317200, "step": 9595 }, { "epoch": 5.660377358490566, "grad_norm": 1.4330755472183228, "learning_rate": 9.01419781516359e-06, "loss": 0.4714, "num_input_tokens_seen": 6320560, "step": 9600 }, { "epoch": 5.663325471698113, "grad_norm": 2.032437801361084, "learning_rate": 9.012663444213664e-06, "loss": 0.4576, "num_input_tokens_seen": 6323248, "step": 9605 }, { "epoch": 5.66627358490566, "grad_norm": 1.2855286598205566, "learning_rate": 9.011128010893797e-06, "loss": 0.4664, "num_input_tokens_seen": 6326128, "step": 9610 }, { "epoch": 5.6692216981132075, "grad_norm": 1.9442650079727173, "learning_rate": 9.009591515610503e-06, "loss": 0.5128, "num_input_tokens_seen": 6330000, "step": 9615 }, { "epoch": 5.672169811320755, "grad_norm": 1.5026626586914062, "learning_rate": 9.008053958770575e-06, "loss": 0.5152, "num_input_tokens_seen": 6332976, "step": 9620 }, { "epoch": 5.675117924528302, "grad_norm": 1.449323058128357, "learning_rate": 9.006515340781087e-06, "loss": 0.42, "num_input_tokens_seen": 6336400, "step": 9625 }, { "epoch": 5.678066037735849, "grad_norm": 1.0975688695907593, "learning_rate": 9.004975662049396e-06, "loss": 0.3815, "num_input_tokens_seen": 6340432, "step": 9630 }, { "epoch": 5.681014150943396, "grad_norm": 1.401356816291809, "learning_rate": 9.003434922983138e-06, "loss": 0.3816, "num_input_tokens_seen": 6343728, "step": 9635 }, { "epoch": 5.683962264150943, "grad_norm": 0.9706073999404907, "learning_rate": 9.00189312399023e-06, "loss": 0.4598, "num_input_tokens_seen": 6346960, "step": 9640 }, { "epoch": 5.68691037735849, "grad_norm": 1.8955104351043701, "learning_rate": 9.00035026547887e-06, "loss": 0.366, "num_input_tokens_seen": 6350480, "step": 9645 }, { "epoch": 5.689858490566038, "grad_norm": 1.364055871963501, "learning_rate": 8.998806347857537e-06, "loss": 0.471, "num_input_tokens_seen": 6354352, "step": 9650 }, { "epoch": 5.692806603773585, "grad_norm": 4.001993656158447, "learning_rate": 8.99726137153499e-06, "loss": 0.6451, "num_input_tokens_seen": 6357616, "step": 9655 }, { "epoch": 5.695754716981132, "grad_norm": 3.944474220275879, "learning_rate": 8.995715336920266e-06, "loss": 0.5028, "num_input_tokens_seen": 6360720, "step": 9660 }, { "epoch": 5.6987028301886795, "grad_norm": 0.8762447237968445, "learning_rate": 8.994168244422687e-06, "loss": 0.4198, "num_input_tokens_seen": 6364336, "step": 9665 }, { "epoch": 5.701650943396227, "grad_norm": 1.8168234825134277, "learning_rate": 8.992620094451852e-06, "loss": 0.4628, "num_input_tokens_seen": 6368016, "step": 9670 }, { "epoch": 5.704599056603773, "grad_norm": 1.355422854423523, "learning_rate": 8.991070887417639e-06, "loss": 0.3498, "num_input_tokens_seen": 6370736, "step": 9675 }, { "epoch": 5.7075471698113205, "grad_norm": 1.3768229484558105, "learning_rate": 8.989520623730208e-06, "loss": 0.4299, "num_input_tokens_seen": 6373744, "step": 9680 }, { "epoch": 5.710495283018868, "grad_norm": 1.9405567646026611, "learning_rate": 8.987969303799998e-06, "loss": 0.6352, "num_input_tokens_seen": 6377200, "step": 9685 }, { "epoch": 5.713443396226415, "grad_norm": 2.7222249507904053, "learning_rate": 8.986416928037728e-06, "loss": 0.5039, "num_input_tokens_seen": 6380016, "step": 9690 }, { "epoch": 5.716391509433962, "grad_norm": 1.634670376777649, "learning_rate": 8.984863496854395e-06, "loss": 0.3197, "num_input_tokens_seen": 6382896, "step": 9695 }, { "epoch": 5.71933962264151, "grad_norm": 1.2253656387329102, "learning_rate": 8.983309010661279e-06, "loss": 0.4403, "num_input_tokens_seen": 6387024, "step": 9700 }, { "epoch": 5.722287735849057, "grad_norm": 3.3518271446228027, "learning_rate": 8.981753469869934e-06, "loss": 0.5371, "num_input_tokens_seen": 6390032, "step": 9705 }, { "epoch": 5.725235849056604, "grad_norm": 1.239074468612671, "learning_rate": 8.980196874892198e-06, "loss": 0.3687, "num_input_tokens_seen": 6392976, "step": 9710 }, { "epoch": 5.728183962264151, "grad_norm": 2.1361031532287598, "learning_rate": 8.978639226140184e-06, "loss": 0.4688, "num_input_tokens_seen": 6395728, "step": 9715 }, { "epoch": 5.731132075471698, "grad_norm": 1.9222701787948608, "learning_rate": 8.977080524026289e-06, "loss": 0.3937, "num_input_tokens_seen": 6398480, "step": 9720 }, { "epoch": 5.734080188679245, "grad_norm": 2.660845994949341, "learning_rate": 8.975520768963186e-06, "loss": 0.5398, "num_input_tokens_seen": 6401168, "step": 9725 }, { "epoch": 5.7370283018867925, "grad_norm": 1.133131980895996, "learning_rate": 8.973959961363825e-06, "loss": 0.5109, "num_input_tokens_seen": 6404112, "step": 9730 }, { "epoch": 5.73997641509434, "grad_norm": 1.628474235534668, "learning_rate": 8.972398101641438e-06, "loss": 0.5123, "num_input_tokens_seen": 6407664, "step": 9735 }, { "epoch": 5.742924528301887, "grad_norm": 2.907227039337158, "learning_rate": 8.970835190209532e-06, "loss": 0.5724, "num_input_tokens_seen": 6410768, "step": 9740 }, { "epoch": 5.745872641509434, "grad_norm": 1.3654794692993164, "learning_rate": 8.969271227481899e-06, "loss": 0.71, "num_input_tokens_seen": 6414192, "step": 9745 }, { "epoch": 5.748820754716981, "grad_norm": 2.315873861312866, "learning_rate": 8.967706213872599e-06, "loss": 0.482, "num_input_tokens_seen": 6417456, "step": 9750 }, { "epoch": 5.751768867924528, "grad_norm": 3.824618339538574, "learning_rate": 8.966140149795981e-06, "loss": 0.4541, "num_input_tokens_seen": 6421296, "step": 9755 }, { "epoch": 5.754716981132075, "grad_norm": 2.958275318145752, "learning_rate": 8.964573035666663e-06, "loss": 0.5624, "num_input_tokens_seen": 6424304, "step": 9760 }, { "epoch": 5.757665094339623, "grad_norm": 1.013025164604187, "learning_rate": 8.96300487189955e-06, "loss": 0.4553, "num_input_tokens_seen": 6426800, "step": 9765 }, { "epoch": 5.76061320754717, "grad_norm": 1.5834970474243164, "learning_rate": 8.961435658909816e-06, "loss": 0.5223, "num_input_tokens_seen": 6429616, "step": 9770 }, { "epoch": 5.763561320754717, "grad_norm": 1.6287704706192017, "learning_rate": 8.959865397112918e-06, "loss": 0.5921, "num_input_tokens_seen": 6433616, "step": 9775 }, { "epoch": 5.7665094339622645, "grad_norm": 1.1970707178115845, "learning_rate": 8.95829408692459e-06, "loss": 0.5567, "num_input_tokens_seen": 6436752, "step": 9780 }, { "epoch": 5.769457547169811, "grad_norm": 1.5430458784103394, "learning_rate": 8.956721728760845e-06, "loss": 0.548, "num_input_tokens_seen": 6439216, "step": 9785 }, { "epoch": 5.772405660377358, "grad_norm": 1.8468583822250366, "learning_rate": 8.95514832303797e-06, "loss": 0.5576, "num_input_tokens_seen": 6442544, "step": 9790 }, { "epoch": 5.775353773584905, "grad_norm": 2.292644739151001, "learning_rate": 8.953573870172528e-06, "loss": 0.4912, "num_input_tokens_seen": 6446416, "step": 9795 }, { "epoch": 5.778301886792453, "grad_norm": 2.8521387577056885, "learning_rate": 8.951998370581368e-06, "loss": 0.454, "num_input_tokens_seen": 6449648, "step": 9800 }, { "epoch": 5.78125, "grad_norm": 1.3014496564865112, "learning_rate": 8.950421824681605e-06, "loss": 0.3917, "num_input_tokens_seen": 6453872, "step": 9805 }, { "epoch": 5.784198113207547, "grad_norm": 2.5618879795074463, "learning_rate": 8.948844232890638e-06, "loss": 0.4715, "num_input_tokens_seen": 6456336, "step": 9810 }, { "epoch": 5.787146226415095, "grad_norm": 2.4552369117736816, "learning_rate": 8.947265595626144e-06, "loss": 0.4667, "num_input_tokens_seen": 6458960, "step": 9815 }, { "epoch": 5.790094339622642, "grad_norm": 1.8860009908676147, "learning_rate": 8.945685913306071e-06, "loss": 0.4809, "num_input_tokens_seen": 6461744, "step": 9820 }, { "epoch": 5.793042452830189, "grad_norm": 1.3179007768630981, "learning_rate": 8.944105186348646e-06, "loss": 0.3683, "num_input_tokens_seen": 6464976, "step": 9825 }, { "epoch": 5.7959905660377355, "grad_norm": 4.168513298034668, "learning_rate": 8.942523415172377e-06, "loss": 0.4239, "num_input_tokens_seen": 6468048, "step": 9830 }, { "epoch": 5.798938679245283, "grad_norm": 1.9862051010131836, "learning_rate": 8.94094060019604e-06, "loss": 0.5675, "num_input_tokens_seen": 6471824, "step": 9835 }, { "epoch": 5.80188679245283, "grad_norm": 1.457822561264038, "learning_rate": 8.939356741838696e-06, "loss": 0.4432, "num_input_tokens_seen": 6475536, "step": 9840 }, { "epoch": 5.804834905660377, "grad_norm": 1.6278966665267944, "learning_rate": 8.937771840519677e-06, "loss": 0.3913, "num_input_tokens_seen": 6478480, "step": 9845 }, { "epoch": 5.807783018867925, "grad_norm": 4.442908763885498, "learning_rate": 8.936185896658593e-06, "loss": 0.3697, "num_input_tokens_seen": 6481904, "step": 9850 }, { "epoch": 5.810731132075472, "grad_norm": 1.6128867864608765, "learning_rate": 8.934598910675329e-06, "loss": 0.3191, "num_input_tokens_seen": 6485328, "step": 9855 }, { "epoch": 5.813679245283019, "grad_norm": 1.5722060203552246, "learning_rate": 8.933010882990044e-06, "loss": 0.4184, "num_input_tokens_seen": 6488336, "step": 9860 }, { "epoch": 5.816627358490566, "grad_norm": 1.927058219909668, "learning_rate": 8.93142181402318e-06, "loss": 0.5279, "num_input_tokens_seen": 6491088, "step": 9865 }, { "epoch": 5.819575471698113, "grad_norm": 2.0151145458221436, "learning_rate": 8.929831704195445e-06, "loss": 0.3845, "num_input_tokens_seen": 6494160, "step": 9870 }, { "epoch": 5.82252358490566, "grad_norm": 2.5690455436706543, "learning_rate": 8.928240553927831e-06, "loss": 0.5257, "num_input_tokens_seen": 6496816, "step": 9875 }, { "epoch": 5.8254716981132075, "grad_norm": 2.6642260551452637, "learning_rate": 8.926648363641602e-06, "loss": 0.44, "num_input_tokens_seen": 6500400, "step": 9880 }, { "epoch": 5.828419811320755, "grad_norm": 2.321229934692383, "learning_rate": 8.925055133758294e-06, "loss": 0.4679, "num_input_tokens_seen": 6503984, "step": 9885 }, { "epoch": 5.831367924528302, "grad_norm": 1.5377877950668335, "learning_rate": 8.923460864699723e-06, "loss": 0.4224, "num_input_tokens_seen": 6506768, "step": 9890 }, { "epoch": 5.834316037735849, "grad_norm": 3.149402141571045, "learning_rate": 8.921865556887979e-06, "loss": 0.4871, "num_input_tokens_seen": 6509712, "step": 9895 }, { "epoch": 5.837264150943396, "grad_norm": 2.0257809162139893, "learning_rate": 8.920269210745426e-06, "loss": 0.3052, "num_input_tokens_seen": 6512816, "step": 9900 }, { "epoch": 5.840212264150943, "grad_norm": 1.984734058380127, "learning_rate": 8.918671826694704e-06, "loss": 0.4388, "num_input_tokens_seen": 6516368, "step": 9905 }, { "epoch": 5.84316037735849, "grad_norm": 0.909290075302124, "learning_rate": 8.91707340515873e-06, "loss": 0.3703, "num_input_tokens_seen": 6519216, "step": 9910 }, { "epoch": 5.846108490566038, "grad_norm": 1.735177755355835, "learning_rate": 8.915473946560688e-06, "loss": 0.517, "num_input_tokens_seen": 6521936, "step": 9915 }, { "epoch": 5.849056603773585, "grad_norm": 1.1315135955810547, "learning_rate": 8.913873451324044e-06, "loss": 0.4177, "num_input_tokens_seen": 6524592, "step": 9920 }, { "epoch": 5.852004716981132, "grad_norm": 1.7962119579315186, "learning_rate": 8.912271919872538e-06, "loss": 0.4224, "num_input_tokens_seen": 6527216, "step": 9925 }, { "epoch": 5.8549528301886795, "grad_norm": 1.7840416431427002, "learning_rate": 8.910669352630176e-06, "loss": 0.3523, "num_input_tokens_seen": 6530544, "step": 9930 }, { "epoch": 5.857900943396227, "grad_norm": 1.1599944829940796, "learning_rate": 8.909065750021253e-06, "loss": 0.4497, "num_input_tokens_seen": 6533616, "step": 9935 }, { "epoch": 5.860849056603773, "grad_norm": 2.415081262588501, "learning_rate": 8.907461112470323e-06, "loss": 0.5988, "num_input_tokens_seen": 6536176, "step": 9940 }, { "epoch": 5.8637971698113205, "grad_norm": 1.7095738649368286, "learning_rate": 8.905855440402225e-06, "loss": 0.3865, "num_input_tokens_seen": 6539408, "step": 9945 }, { "epoch": 5.866745283018868, "grad_norm": 2.706624984741211, "learning_rate": 8.904248734242065e-06, "loss": 0.4258, "num_input_tokens_seen": 6541904, "step": 9950 }, { "epoch": 5.869693396226415, "grad_norm": 1.9739692211151123, "learning_rate": 8.902640994415226e-06, "loss": 0.4374, "num_input_tokens_seen": 6544944, "step": 9955 }, { "epoch": 5.872641509433962, "grad_norm": 2.242633104324341, "learning_rate": 8.901032221347364e-06, "loss": 0.4153, "num_input_tokens_seen": 6548496, "step": 9960 }, { "epoch": 5.87558962264151, "grad_norm": 1.4208674430847168, "learning_rate": 8.899422415464409e-06, "loss": 0.4043, "num_input_tokens_seen": 6551504, "step": 9965 }, { "epoch": 5.878537735849057, "grad_norm": 1.5309168100357056, "learning_rate": 8.897811577192565e-06, "loss": 0.5438, "num_input_tokens_seen": 6554672, "step": 9970 }, { "epoch": 5.881485849056604, "grad_norm": 1.6832607984542847, "learning_rate": 8.896199706958306e-06, "loss": 0.4732, "num_input_tokens_seen": 6557232, "step": 9975 }, { "epoch": 5.884433962264151, "grad_norm": 12.158102989196777, "learning_rate": 8.894586805188384e-06, "loss": 0.4913, "num_input_tokens_seen": 6560912, "step": 9980 }, { "epoch": 5.887382075471698, "grad_norm": 1.9066474437713623, "learning_rate": 8.892972872309821e-06, "loss": 0.3999, "num_input_tokens_seen": 6563120, "step": 9985 }, { "epoch": 5.890330188679245, "grad_norm": 2.3502795696258545, "learning_rate": 8.89135790874991e-06, "loss": 0.4611, "num_input_tokens_seen": 6566704, "step": 9990 }, { "epoch": 5.8932783018867925, "grad_norm": 1.8437918424606323, "learning_rate": 8.889741914936224e-06, "loss": 0.4342, "num_input_tokens_seen": 6570064, "step": 9995 }, { "epoch": 5.89622641509434, "grad_norm": 3.1394896507263184, "learning_rate": 8.888124891296602e-06, "loss": 0.408, "num_input_tokens_seen": 6573392, "step": 10000 }, { "epoch": 5.899174528301887, "grad_norm": 2.1808159351348877, "learning_rate": 8.886506838259156e-06, "loss": 0.4795, "num_input_tokens_seen": 6576912, "step": 10005 }, { "epoch": 5.902122641509434, "grad_norm": 1.6592062711715698, "learning_rate": 8.884887756252279e-06, "loss": 0.5038, "num_input_tokens_seen": 6581744, "step": 10010 }, { "epoch": 5.905070754716981, "grad_norm": 1.1634156703948975, "learning_rate": 8.88326764570462e-06, "loss": 0.4871, "num_input_tokens_seen": 6584592, "step": 10015 }, { "epoch": 5.908018867924528, "grad_norm": 2.4874939918518066, "learning_rate": 8.88164650704512e-06, "loss": 0.478, "num_input_tokens_seen": 6588560, "step": 10020 }, { "epoch": 5.910966981132075, "grad_norm": 1.5512479543685913, "learning_rate": 8.880024340702978e-06, "loss": 0.4861, "num_input_tokens_seen": 6591312, "step": 10025 }, { "epoch": 5.913915094339623, "grad_norm": 1.6443313360214233, "learning_rate": 8.878401147107667e-06, "loss": 0.4204, "num_input_tokens_seen": 6594256, "step": 10030 }, { "epoch": 5.91686320754717, "grad_norm": 1.506973385810852, "learning_rate": 8.87677692668894e-06, "loss": 0.4145, "num_input_tokens_seen": 6598096, "step": 10035 }, { "epoch": 5.919811320754717, "grad_norm": 0.8967400193214417, "learning_rate": 8.875151679876813e-06, "loss": 0.4083, "num_input_tokens_seen": 6601520, "step": 10040 }, { "epoch": 5.9227594339622645, "grad_norm": 2.2176225185394287, "learning_rate": 8.873525407101577e-06, "loss": 0.4707, "num_input_tokens_seen": 6604336, "step": 10045 }, { "epoch": 5.925707547169811, "grad_norm": 1.5994669198989868, "learning_rate": 8.871898108793796e-06, "loss": 0.3984, "num_input_tokens_seen": 6607152, "step": 10050 }, { "epoch": 5.928655660377358, "grad_norm": 1.5341591835021973, "learning_rate": 8.870269785384304e-06, "loss": 0.4705, "num_input_tokens_seen": 6609584, "step": 10055 }, { "epoch": 5.931603773584905, "grad_norm": 2.23099946975708, "learning_rate": 8.868640437304206e-06, "loss": 0.4258, "num_input_tokens_seen": 6614064, "step": 10060 }, { "epoch": 5.934551886792453, "grad_norm": 3.300271511077881, "learning_rate": 8.86701006498488e-06, "loss": 0.5643, "num_input_tokens_seen": 6617456, "step": 10065 }, { "epoch": 5.9375, "grad_norm": 1.2419747114181519, "learning_rate": 8.865378668857972e-06, "loss": 0.4239, "num_input_tokens_seen": 6621136, "step": 10070 }, { "epoch": 5.940448113207547, "grad_norm": 2.4581871032714844, "learning_rate": 8.863746249355404e-06, "loss": 0.4971, "num_input_tokens_seen": 6623920, "step": 10075 }, { "epoch": 5.943396226415095, "grad_norm": 1.1315168142318726, "learning_rate": 8.862112806909365e-06, "loss": 0.349, "num_input_tokens_seen": 6626672, "step": 10080 }, { "epoch": 5.946344339622642, "grad_norm": 1.7028762102127075, "learning_rate": 8.860478341952314e-06, "loss": 0.3715, "num_input_tokens_seen": 6629840, "step": 10085 }, { "epoch": 5.949292452830189, "grad_norm": 1.3235236406326294, "learning_rate": 8.858842854916985e-06, "loss": 0.41, "num_input_tokens_seen": 6632784, "step": 10090 }, { "epoch": 5.9522405660377355, "grad_norm": 1.9018012285232544, "learning_rate": 8.85720634623638e-06, "loss": 0.5947, "num_input_tokens_seen": 6635728, "step": 10095 }, { "epoch": 5.955188679245283, "grad_norm": 1.8645563125610352, "learning_rate": 8.855568816343769e-06, "loss": 0.4256, "num_input_tokens_seen": 6639024, "step": 10100 }, { "epoch": 5.95813679245283, "grad_norm": 1.4127731323242188, "learning_rate": 8.8539302656727e-06, "loss": 0.535, "num_input_tokens_seen": 6642672, "step": 10105 }, { "epoch": 5.961084905660377, "grad_norm": 1.7913249731063843, "learning_rate": 8.852290694656983e-06, "loss": 0.5028, "num_input_tokens_seen": 6645296, "step": 10110 }, { "epoch": 5.964033018867925, "grad_norm": 1.1655957698822021, "learning_rate": 8.8506501037307e-06, "loss": 0.4302, "num_input_tokens_seen": 6648912, "step": 10115 }, { "epoch": 5.966981132075472, "grad_norm": 1.8633705377578735, "learning_rate": 8.849008493328209e-06, "loss": 0.5548, "num_input_tokens_seen": 6652240, "step": 10120 }, { "epoch": 5.969929245283019, "grad_norm": 2.389394521713257, "learning_rate": 8.847365863884131e-06, "loss": 0.4833, "num_input_tokens_seen": 6656016, "step": 10125 }, { "epoch": 5.972877358490566, "grad_norm": 1.3999780416488647, "learning_rate": 8.845722215833359e-06, "loss": 0.3186, "num_input_tokens_seen": 6659344, "step": 10130 }, { "epoch": 5.975825471698113, "grad_norm": 0.8770366311073303, "learning_rate": 8.844077549611056e-06, "loss": 0.5393, "num_input_tokens_seen": 6662096, "step": 10135 }, { "epoch": 5.97877358490566, "grad_norm": 1.2846965789794922, "learning_rate": 8.842431865652654e-06, "loss": 0.4786, "num_input_tokens_seen": 6665456, "step": 10140 }, { "epoch": 5.9817216981132075, "grad_norm": 1.565163016319275, "learning_rate": 8.840785164393858e-06, "loss": 0.548, "num_input_tokens_seen": 6668304, "step": 10145 }, { "epoch": 5.984669811320755, "grad_norm": 2.0046629905700684, "learning_rate": 8.839137446270634e-06, "loss": 0.4269, "num_input_tokens_seen": 6670928, "step": 10150 }, { "epoch": 5.987617924528302, "grad_norm": 1.2267123460769653, "learning_rate": 8.837488711719226e-06, "loss": 0.3042, "num_input_tokens_seen": 6673488, "step": 10155 }, { "epoch": 5.990566037735849, "grad_norm": 3.062177896499634, "learning_rate": 8.835838961176143e-06, "loss": 0.4636, "num_input_tokens_seen": 6676112, "step": 10160 }, { "epoch": 5.993514150943396, "grad_norm": 1.836385726928711, "learning_rate": 8.834188195078164e-06, "loss": 0.3474, "num_input_tokens_seen": 6678832, "step": 10165 }, { "epoch": 5.996462264150943, "grad_norm": 1.8881038427352905, "learning_rate": 8.832536413862337e-06, "loss": 0.4661, "num_input_tokens_seen": 6682800, "step": 10170 }, { "epoch": 5.99941037735849, "grad_norm": 3.9764773845672607, "learning_rate": 8.830883617965976e-06, "loss": 0.7937, "num_input_tokens_seen": 6685072, "step": 10175 }, { "epoch": 6.0, "eval_loss": 0.5032580494880676, "eval_runtime": 18.6276, "eval_samples_per_second": 91.048, "eval_steps_per_second": 22.762, "num_input_tokens_seen": 6685232, "step": 10176 }, { "epoch": 6.002358490566038, "grad_norm": 2.5330185890197754, "learning_rate": 8.829229807826665e-06, "loss": 0.4594, "num_input_tokens_seen": 6688176, "step": 10180 }, { "epoch": 6.005306603773585, "grad_norm": 1.9266678094863892, "learning_rate": 8.82757498388226e-06, "loss": 0.3533, "num_input_tokens_seen": 6691856, "step": 10185 }, { "epoch": 6.008254716981132, "grad_norm": 1.4315369129180908, "learning_rate": 8.825919146570884e-06, "loss": 0.4948, "num_input_tokens_seen": 6694896, "step": 10190 }, { "epoch": 6.0112028301886795, "grad_norm": 2.041658401489258, "learning_rate": 8.824262296330925e-06, "loss": 0.4743, "num_input_tokens_seen": 6698128, "step": 10195 }, { "epoch": 6.014150943396227, "grad_norm": 1.6883341073989868, "learning_rate": 8.822604433601041e-06, "loss": 0.413, "num_input_tokens_seen": 6702000, "step": 10200 }, { "epoch": 6.017099056603773, "grad_norm": 3.2472429275512695, "learning_rate": 8.820945558820158e-06, "loss": 0.4228, "num_input_tokens_seen": 6705744, "step": 10205 }, { "epoch": 6.0200471698113205, "grad_norm": 1.6769251823425293, "learning_rate": 8.81928567242747e-06, "loss": 0.4715, "num_input_tokens_seen": 6709872, "step": 10210 }, { "epoch": 6.022995283018868, "grad_norm": 1.3550606966018677, "learning_rate": 8.817624774862443e-06, "loss": 0.3965, "num_input_tokens_seen": 6713040, "step": 10215 }, { "epoch": 6.025943396226415, "grad_norm": 1.4370311498641968, "learning_rate": 8.815962866564803e-06, "loss": 0.4929, "num_input_tokens_seen": 6716144, "step": 10220 }, { "epoch": 6.028891509433962, "grad_norm": 1.7063865661621094, "learning_rate": 8.814299947974547e-06, "loss": 0.3848, "num_input_tokens_seen": 6719344, "step": 10225 }, { "epoch": 6.03183962264151, "grad_norm": 1.4494707584381104, "learning_rate": 8.812636019531942e-06, "loss": 0.4185, "num_input_tokens_seen": 6722160, "step": 10230 }, { "epoch": 6.034787735849057, "grad_norm": 1.3934566974639893, "learning_rate": 8.810971081677517e-06, "loss": 0.4238, "num_input_tokens_seen": 6724880, "step": 10235 }, { "epoch": 6.037735849056604, "grad_norm": 1.125510334968567, "learning_rate": 8.809305134852076e-06, "loss": 0.3811, "num_input_tokens_seen": 6727856, "step": 10240 }, { "epoch": 6.040683962264151, "grad_norm": 3.532008647918701, "learning_rate": 8.807638179496684e-06, "loss": 0.4354, "num_input_tokens_seen": 6730096, "step": 10245 }, { "epoch": 6.043632075471698, "grad_norm": 2.1881535053253174, "learning_rate": 8.805970216052673e-06, "loss": 0.6541, "num_input_tokens_seen": 6732976, "step": 10250 }, { "epoch": 6.046580188679245, "grad_norm": 1.9718451499938965, "learning_rate": 8.804301244961645e-06, "loss": 0.4435, "num_input_tokens_seen": 6737456, "step": 10255 }, { "epoch": 6.0495283018867925, "grad_norm": 1.8388615846633911, "learning_rate": 8.802631266665465e-06, "loss": 0.3999, "num_input_tokens_seen": 6741040, "step": 10260 }, { "epoch": 6.05247641509434, "grad_norm": 2.1618845462799072, "learning_rate": 8.80096028160627e-06, "loss": 0.4608, "num_input_tokens_seen": 6743472, "step": 10265 }, { "epoch": 6.055424528301887, "grad_norm": 1.003592610359192, "learning_rate": 8.799288290226457e-06, "loss": 0.5078, "num_input_tokens_seen": 6748304, "step": 10270 }, { "epoch": 6.058372641509434, "grad_norm": 2.7597968578338623, "learning_rate": 8.797615292968698e-06, "loss": 0.585, "num_input_tokens_seen": 6751312, "step": 10275 }, { "epoch": 6.061320754716981, "grad_norm": 1.260477900505066, "learning_rate": 8.795941290275923e-06, "loss": 0.3448, "num_input_tokens_seen": 6754032, "step": 10280 }, { "epoch": 6.064268867924528, "grad_norm": 1.9686975479125977, "learning_rate": 8.79426628259133e-06, "loss": 0.4615, "num_input_tokens_seen": 6757264, "step": 10285 }, { "epoch": 6.067216981132075, "grad_norm": 2.168390989303589, "learning_rate": 8.792590270358389e-06, "loss": 0.4168, "num_input_tokens_seen": 6760912, "step": 10290 }, { "epoch": 6.070165094339623, "grad_norm": 2.8241865634918213, "learning_rate": 8.790913254020827e-06, "loss": 0.4815, "num_input_tokens_seen": 6765872, "step": 10295 }, { "epoch": 6.07311320754717, "grad_norm": 1.5455724000930786, "learning_rate": 8.789235234022643e-06, "loss": 0.3983, "num_input_tokens_seen": 6768496, "step": 10300 }, { "epoch": 6.076061320754717, "grad_norm": 1.2983731031417847, "learning_rate": 8.787556210808101e-06, "loss": 0.3506, "num_input_tokens_seen": 6772560, "step": 10305 }, { "epoch": 6.0790094339622645, "grad_norm": 1.2396259307861328, "learning_rate": 8.78587618482173e-06, "loss": 0.4169, "num_input_tokens_seen": 6776560, "step": 10310 }, { "epoch": 6.081957547169812, "grad_norm": 2.128328800201416, "learning_rate": 8.78419515650832e-06, "loss": 0.3554, "num_input_tokens_seen": 6779600, "step": 10315 }, { "epoch": 6.084905660377358, "grad_norm": 1.4239790439605713, "learning_rate": 8.782513126312934e-06, "loss": 0.3679, "num_input_tokens_seen": 6782928, "step": 10320 }, { "epoch": 6.087853773584905, "grad_norm": 3.3200738430023193, "learning_rate": 8.780830094680897e-06, "loss": 0.449, "num_input_tokens_seen": 6786928, "step": 10325 }, { "epoch": 6.090801886792453, "grad_norm": 1.9053758382797241, "learning_rate": 8.779146062057797e-06, "loss": 0.4447, "num_input_tokens_seen": 6790768, "step": 10330 }, { "epoch": 6.09375, "grad_norm": 1.8853201866149902, "learning_rate": 8.777461028889492e-06, "loss": 0.3908, "num_input_tokens_seen": 6793744, "step": 10335 }, { "epoch": 6.096698113207547, "grad_norm": 0.9447429180145264, "learning_rate": 8.775774995622097e-06, "loss": 0.281, "num_input_tokens_seen": 6796784, "step": 10340 }, { "epoch": 6.099646226415095, "grad_norm": 4.12408447265625, "learning_rate": 8.774087962702e-06, "loss": 0.3667, "num_input_tokens_seen": 6799184, "step": 10345 }, { "epoch": 6.102594339622642, "grad_norm": 1.9256771802902222, "learning_rate": 8.772399930575849e-06, "loss": 0.3858, "num_input_tokens_seen": 6802192, "step": 10350 }, { "epoch": 6.105542452830188, "grad_norm": 1.8608896732330322, "learning_rate": 8.77071089969056e-06, "loss": 0.4856, "num_input_tokens_seen": 6805200, "step": 10355 }, { "epoch": 6.1084905660377355, "grad_norm": 1.4245561361312866, "learning_rate": 8.769020870493309e-06, "loss": 0.3689, "num_input_tokens_seen": 6809040, "step": 10360 }, { "epoch": 6.111438679245283, "grad_norm": 1.3302894830703735, "learning_rate": 8.767329843431537e-06, "loss": 0.4141, "num_input_tokens_seen": 6812688, "step": 10365 }, { "epoch": 6.11438679245283, "grad_norm": 1.318554162979126, "learning_rate": 8.765637818952954e-06, "loss": 0.428, "num_input_tokens_seen": 6816208, "step": 10370 }, { "epoch": 6.117334905660377, "grad_norm": 1.150294542312622, "learning_rate": 8.76394479750553e-06, "loss": 0.3785, "num_input_tokens_seen": 6819792, "step": 10375 }, { "epoch": 6.120283018867925, "grad_norm": 1.6081420183181763, "learning_rate": 8.762250779537499e-06, "loss": 0.3491, "num_input_tokens_seen": 6822832, "step": 10380 }, { "epoch": 6.123231132075472, "grad_norm": 2.495600938796997, "learning_rate": 8.760555765497358e-06, "loss": 0.4238, "num_input_tokens_seen": 6826864, "step": 10385 }, { "epoch": 6.126179245283019, "grad_norm": 2.2361648082733154, "learning_rate": 8.758859755833873e-06, "loss": 0.4467, "num_input_tokens_seen": 6829872, "step": 10390 }, { "epoch": 6.129127358490566, "grad_norm": 2.1946725845336914, "learning_rate": 8.757162750996066e-06, "loss": 0.5041, "num_input_tokens_seen": 6832528, "step": 10395 }, { "epoch": 6.132075471698113, "grad_norm": 1.3004745244979858, "learning_rate": 8.755464751433229e-06, "loss": 0.4695, "num_input_tokens_seen": 6835824, "step": 10400 }, { "epoch": 6.13502358490566, "grad_norm": 2.0339314937591553, "learning_rate": 8.753765757594915e-06, "loss": 0.3865, "num_input_tokens_seen": 6838960, "step": 10405 }, { "epoch": 6.1379716981132075, "grad_norm": 1.7839187383651733, "learning_rate": 8.752065769930938e-06, "loss": 0.5431, "num_input_tokens_seen": 6841616, "step": 10410 }, { "epoch": 6.140919811320755, "grad_norm": 3.1681067943573, "learning_rate": 8.75036478889138e-06, "loss": 0.6294, "num_input_tokens_seen": 6844624, "step": 10415 }, { "epoch": 6.143867924528302, "grad_norm": 1.3963193893432617, "learning_rate": 8.748662814926576e-06, "loss": 0.3973, "num_input_tokens_seen": 6847856, "step": 10420 }, { "epoch": 6.146816037735849, "grad_norm": 1.220657229423523, "learning_rate": 8.746959848487139e-06, "loss": 0.3355, "num_input_tokens_seen": 6851184, "step": 10425 }, { "epoch": 6.149764150943396, "grad_norm": 3.5202293395996094, "learning_rate": 8.745255890023934e-06, "loss": 0.6188, "num_input_tokens_seen": 6854640, "step": 10430 }, { "epoch": 6.152712264150943, "grad_norm": 1.6679444313049316, "learning_rate": 8.74355093998809e-06, "loss": 0.6044, "num_input_tokens_seen": 6857520, "step": 10435 }, { "epoch": 6.15566037735849, "grad_norm": 1.3219857215881348, "learning_rate": 8.741844998831001e-06, "loss": 0.3866, "num_input_tokens_seen": 6861328, "step": 10440 }, { "epoch": 6.158608490566038, "grad_norm": 1.4512996673583984, "learning_rate": 8.740138067004323e-06, "loss": 0.4129, "num_input_tokens_seen": 6864336, "step": 10445 }, { "epoch": 6.161556603773585, "grad_norm": 7.576154708862305, "learning_rate": 8.738430144959973e-06, "loss": 0.3875, "num_input_tokens_seen": 6867632, "step": 10450 }, { "epoch": 6.164504716981132, "grad_norm": 2.077413320541382, "learning_rate": 8.73672123315013e-06, "loss": 0.5601, "num_input_tokens_seen": 6870576, "step": 10455 }, { "epoch": 6.1674528301886795, "grad_norm": 2.3201358318328857, "learning_rate": 8.735011332027234e-06, "loss": 0.5656, "num_input_tokens_seen": 6873296, "step": 10460 }, { "epoch": 6.170400943396227, "grad_norm": 10.899303436279297, "learning_rate": 8.733300442043993e-06, "loss": 0.46, "num_input_tokens_seen": 6875792, "step": 10465 }, { "epoch": 6.173349056603773, "grad_norm": 1.477938175201416, "learning_rate": 8.73158856365337e-06, "loss": 0.4957, "num_input_tokens_seen": 6879280, "step": 10470 }, { "epoch": 6.1762971698113205, "grad_norm": 1.84242844581604, "learning_rate": 8.729875697308592e-06, "loss": 0.4819, "num_input_tokens_seen": 6882320, "step": 10475 }, { "epoch": 6.179245283018868, "grad_norm": 3.3666746616363525, "learning_rate": 8.728161843463148e-06, "loss": 0.3764, "num_input_tokens_seen": 6885520, "step": 10480 }, { "epoch": 6.182193396226415, "grad_norm": 1.5043052434921265, "learning_rate": 8.726447002570791e-06, "loss": 0.3908, "num_input_tokens_seen": 6888400, "step": 10485 }, { "epoch": 6.185141509433962, "grad_norm": 2.2852513790130615, "learning_rate": 8.724731175085526e-06, "loss": 0.4212, "num_input_tokens_seen": 6891760, "step": 10490 }, { "epoch": 6.18808962264151, "grad_norm": 1.827285647392273, "learning_rate": 8.723014361461633e-06, "loss": 0.4613, "num_input_tokens_seen": 6894448, "step": 10495 }, { "epoch": 6.191037735849057, "grad_norm": 1.067101240158081, "learning_rate": 8.72129656215364e-06, "loss": 0.3781, "num_input_tokens_seen": 6897584, "step": 10500 }, { "epoch": 6.193985849056604, "grad_norm": 1.6596410274505615, "learning_rate": 8.719577777616347e-06, "loss": 0.4121, "num_input_tokens_seen": 6900176, "step": 10505 }, { "epoch": 6.196933962264151, "grad_norm": 1.9201310873031616, "learning_rate": 8.717858008304804e-06, "loss": 0.424, "num_input_tokens_seen": 6903120, "step": 10510 }, { "epoch": 6.199882075471698, "grad_norm": 1.517538070678711, "learning_rate": 8.71613725467433e-06, "loss": 0.5493, "num_input_tokens_seen": 6906512, "step": 10515 }, { "epoch": 6.202830188679245, "grad_norm": 2.4075398445129395, "learning_rate": 8.714415517180506e-06, "loss": 0.4991, "num_input_tokens_seen": 6909200, "step": 10520 }, { "epoch": 6.2057783018867925, "grad_norm": 1.7730536460876465, "learning_rate": 8.712692796279164e-06, "loss": 0.3949, "num_input_tokens_seen": 6911856, "step": 10525 }, { "epoch": 6.20872641509434, "grad_norm": 1.408811092376709, "learning_rate": 8.710969092426401e-06, "loss": 0.4374, "num_input_tokens_seen": 6915408, "step": 10530 }, { "epoch": 6.211674528301887, "grad_norm": 2.7778544425964355, "learning_rate": 8.70924440607858e-06, "loss": 0.4678, "num_input_tokens_seen": 6918672, "step": 10535 }, { "epoch": 6.214622641509434, "grad_norm": 2.0261683464050293, "learning_rate": 8.707518737692315e-06, "loss": 0.4326, "num_input_tokens_seen": 6922736, "step": 10540 }, { "epoch": 6.217570754716981, "grad_norm": 1.250875473022461, "learning_rate": 8.705792087724485e-06, "loss": 0.3601, "num_input_tokens_seen": 6925200, "step": 10545 }, { "epoch": 6.220518867924528, "grad_norm": 2.65476393699646, "learning_rate": 8.704064456632231e-06, "loss": 0.4677, "num_input_tokens_seen": 6927792, "step": 10550 }, { "epoch": 6.223466981132075, "grad_norm": 1.5335382223129272, "learning_rate": 8.702335844872946e-06, "loss": 0.3772, "num_input_tokens_seen": 6931280, "step": 10555 }, { "epoch": 6.226415094339623, "grad_norm": 3.6216518878936768, "learning_rate": 8.700606252904293e-06, "loss": 0.5716, "num_input_tokens_seen": 6935024, "step": 10560 }, { "epoch": 6.22936320754717, "grad_norm": 1.6659592390060425, "learning_rate": 8.698875681184183e-06, "loss": 0.408, "num_input_tokens_seen": 6938832, "step": 10565 }, { "epoch": 6.232311320754717, "grad_norm": 2.238334894180298, "learning_rate": 8.697144130170797e-06, "loss": 0.3774, "num_input_tokens_seen": 6941456, "step": 10570 }, { "epoch": 6.2352594339622645, "grad_norm": 2.238563299179077, "learning_rate": 8.695411600322568e-06, "loss": 0.5051, "num_input_tokens_seen": 6945168, "step": 10575 }, { "epoch": 6.238207547169812, "grad_norm": 2.323867082595825, "learning_rate": 8.693678092098191e-06, "loss": 0.4144, "num_input_tokens_seen": 6947536, "step": 10580 }, { "epoch": 6.241155660377358, "grad_norm": 2.211958646774292, "learning_rate": 8.691943605956621e-06, "loss": 0.5569, "num_input_tokens_seen": 6950480, "step": 10585 }, { "epoch": 6.244103773584905, "grad_norm": 2.6122357845306396, "learning_rate": 8.690208142357069e-06, "loss": 0.3727, "num_input_tokens_seen": 6953424, "step": 10590 }, { "epoch": 6.247051886792453, "grad_norm": 3.0812013149261475, "learning_rate": 8.68847170175901e-06, "loss": 0.3859, "num_input_tokens_seen": 6957168, "step": 10595 }, { "epoch": 6.25, "grad_norm": 2.4677529335021973, "learning_rate": 8.686734284622168e-06, "loss": 0.4061, "num_input_tokens_seen": 6959568, "step": 10600 }, { "epoch": 6.252948113207547, "grad_norm": 1.2205239534378052, "learning_rate": 8.684995891406537e-06, "loss": 0.3697, "num_input_tokens_seen": 6963184, "step": 10605 }, { "epoch": 6.255896226415095, "grad_norm": 2.003561019897461, "learning_rate": 8.683256522572362e-06, "loss": 0.3766, "num_input_tokens_seen": 6965520, "step": 10610 }, { "epoch": 6.258844339622642, "grad_norm": 1.3753488063812256, "learning_rate": 8.68151617858015e-06, "loss": 0.4188, "num_input_tokens_seen": 6969040, "step": 10615 }, { "epoch": 6.261792452830189, "grad_norm": 1.5027543306350708, "learning_rate": 8.67977485989066e-06, "loss": 0.4438, "num_input_tokens_seen": 6973584, "step": 10620 }, { "epoch": 6.2647405660377355, "grad_norm": 1.8263120651245117, "learning_rate": 8.67803256696492e-06, "loss": 0.3363, "num_input_tokens_seen": 6976528, "step": 10625 }, { "epoch": 6.267688679245283, "grad_norm": 1.7682455778121948, "learning_rate": 8.676289300264205e-06, "loss": 0.5148, "num_input_tokens_seen": 6979152, "step": 10630 }, { "epoch": 6.27063679245283, "grad_norm": 2.0034782886505127, "learning_rate": 8.674545060250054e-06, "loss": 0.4151, "num_input_tokens_seen": 6982896, "step": 10635 }, { "epoch": 6.273584905660377, "grad_norm": 2.2437400817871094, "learning_rate": 8.672799847384263e-06, "loss": 0.3515, "num_input_tokens_seen": 6985584, "step": 10640 }, { "epoch": 6.276533018867925, "grad_norm": 1.7890444993972778, "learning_rate": 8.671053662128883e-06, "loss": 0.5007, "num_input_tokens_seen": 6988208, "step": 10645 }, { "epoch": 6.279481132075472, "grad_norm": 1.7513598203659058, "learning_rate": 8.669306504946223e-06, "loss": 0.4738, "num_input_tokens_seen": 6991024, "step": 10650 }, { "epoch": 6.282429245283019, "grad_norm": 2.094367504119873, "learning_rate": 8.667558376298854e-06, "loss": 0.5049, "num_input_tokens_seen": 6995024, "step": 10655 }, { "epoch": 6.285377358490566, "grad_norm": 1.4113168716430664, "learning_rate": 8.665809276649597e-06, "loss": 0.2356, "num_input_tokens_seen": 6997584, "step": 10660 }, { "epoch": 6.288325471698113, "grad_norm": 1.7183871269226074, "learning_rate": 8.664059206461537e-06, "loss": 0.486, "num_input_tokens_seen": 7000784, "step": 10665 }, { "epoch": 6.29127358490566, "grad_norm": 1.4638513326644897, "learning_rate": 8.662308166198009e-06, "loss": 0.3859, "num_input_tokens_seen": 7004496, "step": 10670 }, { "epoch": 6.2942216981132075, "grad_norm": 2.9991557598114014, "learning_rate": 8.660556156322611e-06, "loss": 0.3631, "num_input_tokens_seen": 7008304, "step": 10675 }, { "epoch": 6.297169811320755, "grad_norm": 2.8889825344085693, "learning_rate": 8.658803177299196e-06, "loss": 0.4036, "num_input_tokens_seen": 7011856, "step": 10680 }, { "epoch": 6.300117924528302, "grad_norm": 4.210697174072266, "learning_rate": 8.65704922959187e-06, "loss": 0.5354, "num_input_tokens_seen": 7014608, "step": 10685 }, { "epoch": 6.303066037735849, "grad_norm": 1.2685353755950928, "learning_rate": 8.655294313664998e-06, "loss": 0.501, "num_input_tokens_seen": 7017360, "step": 10690 }, { "epoch": 6.306014150943396, "grad_norm": 1.8445277214050293, "learning_rate": 8.653538429983204e-06, "loss": 0.4385, "num_input_tokens_seen": 7021008, "step": 10695 }, { "epoch": 6.308962264150943, "grad_norm": 2.026364803314209, "learning_rate": 8.651781579011366e-06, "loss": 0.5064, "num_input_tokens_seen": 7023856, "step": 10700 }, { "epoch": 6.31191037735849, "grad_norm": 1.7974780797958374, "learning_rate": 8.650023761214615e-06, "loss": 0.4132, "num_input_tokens_seen": 7026832, "step": 10705 }, { "epoch": 6.314858490566038, "grad_norm": 1.3906562328338623, "learning_rate": 8.648264977058344e-06, "loss": 0.8757, "num_input_tokens_seen": 7031248, "step": 10710 }, { "epoch": 6.317806603773585, "grad_norm": 1.2825157642364502, "learning_rate": 8.646505227008197e-06, "loss": 0.5175, "num_input_tokens_seen": 7034800, "step": 10715 }, { "epoch": 6.320754716981132, "grad_norm": 4.270279407501221, "learning_rate": 8.644744511530074e-06, "loss": 0.508, "num_input_tokens_seen": 7038384, "step": 10720 }, { "epoch": 6.3237028301886795, "grad_norm": 1.7661528587341309, "learning_rate": 8.642982831090135e-06, "loss": 0.48, "num_input_tokens_seen": 7041808, "step": 10725 }, { "epoch": 6.326650943396227, "grad_norm": 1.525442361831665, "learning_rate": 8.64122018615479e-06, "loss": 0.4231, "num_input_tokens_seen": 7044784, "step": 10730 }, { "epoch": 6.329599056603773, "grad_norm": 3.9579505920410156, "learning_rate": 8.63945657719071e-06, "loss": 0.4345, "num_input_tokens_seen": 7047472, "step": 10735 }, { "epoch": 6.3325471698113205, "grad_norm": 2.0413084030151367, "learning_rate": 8.637692004664816e-06, "loss": 0.4403, "num_input_tokens_seen": 7051440, "step": 10740 }, { "epoch": 6.335495283018868, "grad_norm": 2.21733021736145, "learning_rate": 8.635926469044284e-06, "loss": 0.5116, "num_input_tokens_seen": 7054480, "step": 10745 }, { "epoch": 6.338443396226415, "grad_norm": 1.4763288497924805, "learning_rate": 8.63415997079655e-06, "loss": 0.318, "num_input_tokens_seen": 7058032, "step": 10750 }, { "epoch": 6.341391509433962, "grad_norm": 2.850381851196289, "learning_rate": 8.6323925103893e-06, "loss": 0.3696, "num_input_tokens_seen": 7060656, "step": 10755 }, { "epoch": 6.34433962264151, "grad_norm": 1.5218979120254517, "learning_rate": 8.63062408829048e-06, "loss": 0.5407, "num_input_tokens_seen": 7063600, "step": 10760 }, { "epoch": 6.347287735849057, "grad_norm": 1.8859142065048218, "learning_rate": 8.628854704968285e-06, "loss": 0.3737, "num_input_tokens_seen": 7066352, "step": 10765 }, { "epoch": 6.350235849056604, "grad_norm": 2.236943006515503, "learning_rate": 8.62708436089117e-06, "loss": 0.4807, "num_input_tokens_seen": 7068688, "step": 10770 }, { "epoch": 6.353183962264151, "grad_norm": 2.0991342067718506, "learning_rate": 8.625313056527836e-06, "loss": 0.4728, "num_input_tokens_seen": 7071888, "step": 10775 }, { "epoch": 6.356132075471698, "grad_norm": 1.1078697443008423, "learning_rate": 8.623540792347244e-06, "loss": 0.4158, "num_input_tokens_seen": 7074672, "step": 10780 }, { "epoch": 6.359080188679245, "grad_norm": 2.199436664581299, "learning_rate": 8.621767568818614e-06, "loss": 0.4677, "num_input_tokens_seen": 7077712, "step": 10785 }, { "epoch": 6.3620283018867925, "grad_norm": 2.143416404724121, "learning_rate": 8.619993386411409e-06, "loss": 0.3354, "num_input_tokens_seen": 7080944, "step": 10790 }, { "epoch": 6.36497641509434, "grad_norm": 1.4688570499420166, "learning_rate": 8.618218245595356e-06, "loss": 0.3763, "num_input_tokens_seen": 7083920, "step": 10795 }, { "epoch": 6.367924528301887, "grad_norm": 2.108813524246216, "learning_rate": 8.616442146840427e-06, "loss": 0.4094, "num_input_tokens_seen": 7087568, "step": 10800 }, { "epoch": 6.370872641509434, "grad_norm": 3.5692038536071777, "learning_rate": 8.614665090616854e-06, "loss": 0.4297, "num_input_tokens_seen": 7096528, "step": 10805 }, { "epoch": 6.373820754716981, "grad_norm": 3.1392903327941895, "learning_rate": 8.61288707739512e-06, "loss": 0.4745, "num_input_tokens_seen": 7099600, "step": 10810 }, { "epoch": 6.376768867924528, "grad_norm": 3.4131124019622803, "learning_rate": 8.611108107645963e-06, "loss": 0.5105, "num_input_tokens_seen": 7103120, "step": 10815 }, { "epoch": 6.379716981132075, "grad_norm": 3.1709883213043213, "learning_rate": 8.609328181840368e-06, "loss": 0.4171, "num_input_tokens_seen": 7106736, "step": 10820 }, { "epoch": 6.382665094339623, "grad_norm": 1.7631868124008179, "learning_rate": 8.607547300449585e-06, "loss": 0.2998, "num_input_tokens_seen": 7110640, "step": 10825 }, { "epoch": 6.38561320754717, "grad_norm": 1.9281522035598755, "learning_rate": 8.605765463945105e-06, "loss": 0.3618, "num_input_tokens_seen": 7113776, "step": 10830 }, { "epoch": 6.388561320754717, "grad_norm": 2.29105806350708, "learning_rate": 8.603982672798678e-06, "loss": 0.467, "num_input_tokens_seen": 7117168, "step": 10835 }, { "epoch": 6.3915094339622645, "grad_norm": 2.020014524459839, "learning_rate": 8.602198927482309e-06, "loss": 0.4046, "num_input_tokens_seen": 7119696, "step": 10840 }, { "epoch": 6.394457547169811, "grad_norm": 0.9665722250938416, "learning_rate": 8.600414228468245e-06, "loss": 0.4453, "num_input_tokens_seen": 7122960, "step": 10845 }, { "epoch": 6.397405660377358, "grad_norm": 2.3013410568237305, "learning_rate": 8.598628576229e-06, "loss": 0.4185, "num_input_tokens_seen": 7125552, "step": 10850 }, { "epoch": 6.400353773584905, "grad_norm": 3.333354949951172, "learning_rate": 8.596841971237328e-06, "loss": 0.5081, "num_input_tokens_seen": 7128912, "step": 10855 }, { "epoch": 6.403301886792453, "grad_norm": 2.1786446571350098, "learning_rate": 8.595054413966246e-06, "loss": 0.3475, "num_input_tokens_seen": 7132912, "step": 10860 }, { "epoch": 6.40625, "grad_norm": 2.781132221221924, "learning_rate": 8.593265904889011e-06, "loss": 0.4285, "num_input_tokens_seen": 7137712, "step": 10865 }, { "epoch": 6.409198113207547, "grad_norm": 2.2538623809814453, "learning_rate": 8.591476444479141e-06, "loss": 0.3014, "num_input_tokens_seen": 7140944, "step": 10870 }, { "epoch": 6.412146226415095, "grad_norm": 2.893298387527466, "learning_rate": 8.589686033210407e-06, "loss": 0.4017, "num_input_tokens_seen": 7144240, "step": 10875 }, { "epoch": 6.415094339622642, "grad_norm": 1.528727650642395, "learning_rate": 8.587894671556823e-06, "loss": 0.4037, "num_input_tokens_seen": 7146736, "step": 10880 }, { "epoch": 6.418042452830189, "grad_norm": 1.2615236043930054, "learning_rate": 8.586102359992663e-06, "loss": 0.3439, "num_input_tokens_seen": 7149904, "step": 10885 }, { "epoch": 6.4209905660377355, "grad_norm": 1.9388113021850586, "learning_rate": 8.584309098992447e-06, "loss": 0.5831, "num_input_tokens_seen": 7153040, "step": 10890 }, { "epoch": 6.423938679245283, "grad_norm": 1.4228475093841553, "learning_rate": 8.58251488903095e-06, "loss": 0.3276, "num_input_tokens_seen": 7155952, "step": 10895 }, { "epoch": 6.42688679245283, "grad_norm": 1.465314507484436, "learning_rate": 8.580719730583196e-06, "loss": 0.3854, "num_input_tokens_seen": 7158832, "step": 10900 }, { "epoch": 6.429834905660377, "grad_norm": 3.0672082901000977, "learning_rate": 8.578923624124462e-06, "loss": 0.5478, "num_input_tokens_seen": 7161424, "step": 10905 }, { "epoch": 6.432783018867925, "grad_norm": 2.318617582321167, "learning_rate": 8.577126570130273e-06, "loss": 0.4127, "num_input_tokens_seen": 7163984, "step": 10910 }, { "epoch": 6.435731132075472, "grad_norm": 1.926137924194336, "learning_rate": 8.575328569076408e-06, "loss": 0.44, "num_input_tokens_seen": 7166672, "step": 10915 }, { "epoch": 6.438679245283019, "grad_norm": 2.0968899726867676, "learning_rate": 8.573529621438896e-06, "loss": 0.4178, "num_input_tokens_seen": 7169904, "step": 10920 }, { "epoch": 6.441627358490566, "grad_norm": 1.466500163078308, "learning_rate": 8.571729727694015e-06, "loss": 0.3406, "num_input_tokens_seen": 7172688, "step": 10925 }, { "epoch": 6.444575471698113, "grad_norm": 1.8280740976333618, "learning_rate": 8.569928888318298e-06, "loss": 0.5601, "num_input_tokens_seen": 7179376, "step": 10930 }, { "epoch": 6.44752358490566, "grad_norm": 1.3864576816558838, "learning_rate": 8.56812710378852e-06, "loss": 0.3708, "num_input_tokens_seen": 7182832, "step": 10935 }, { "epoch": 6.4504716981132075, "grad_norm": 2.004868745803833, "learning_rate": 8.566324374581714e-06, "loss": 0.4563, "num_input_tokens_seen": 7186608, "step": 10940 }, { "epoch": 6.453419811320755, "grad_norm": 1.6767207384109497, "learning_rate": 8.564520701175158e-06, "loss": 0.5384, "num_input_tokens_seen": 7190192, "step": 10945 }, { "epoch": 6.456367924528302, "grad_norm": 2.2252159118652344, "learning_rate": 8.562716084046387e-06, "loss": 0.4523, "num_input_tokens_seen": 7193104, "step": 10950 }, { "epoch": 6.459316037735849, "grad_norm": 1.8465688228607178, "learning_rate": 8.560910523673177e-06, "loss": 0.5675, "num_input_tokens_seen": 7195984, "step": 10955 }, { "epoch": 6.462264150943396, "grad_norm": 2.1777639389038086, "learning_rate": 8.55910402053356e-06, "loss": 0.4314, "num_input_tokens_seen": 7199312, "step": 10960 }, { "epoch": 6.465212264150943, "grad_norm": 1.6534324884414673, "learning_rate": 8.557296575105814e-06, "loss": 0.5038, "num_input_tokens_seen": 7202320, "step": 10965 }, { "epoch": 6.46816037735849, "grad_norm": 1.4594694375991821, "learning_rate": 8.555488187868469e-06, "loss": 0.4302, "num_input_tokens_seen": 7205168, "step": 10970 }, { "epoch": 6.471108490566038, "grad_norm": 1.660736083984375, "learning_rate": 8.5536788593003e-06, "loss": 0.4161, "num_input_tokens_seen": 7207824, "step": 10975 }, { "epoch": 6.474056603773585, "grad_norm": 1.4801506996154785, "learning_rate": 8.55186858988034e-06, "loss": 0.5419, "num_input_tokens_seen": 7210864, "step": 10980 }, { "epoch": 6.477004716981132, "grad_norm": 2.515639305114746, "learning_rate": 8.550057380087863e-06, "loss": 0.4147, "num_input_tokens_seen": 7213776, "step": 10985 }, { "epoch": 6.4799528301886795, "grad_norm": 1.6673054695129395, "learning_rate": 8.548245230402396e-06, "loss": 0.4357, "num_input_tokens_seen": 7217424, "step": 10990 }, { "epoch": 6.482900943396227, "grad_norm": 1.6656055450439453, "learning_rate": 8.546432141303711e-06, "loss": 0.4764, "num_input_tokens_seen": 7220208, "step": 10995 }, { "epoch": 6.485849056603773, "grad_norm": 1.8693684339523315, "learning_rate": 8.544618113271833e-06, "loss": 0.4632, "num_input_tokens_seen": 7222896, "step": 11000 }, { "epoch": 6.4887971698113205, "grad_norm": 1.8655200004577637, "learning_rate": 8.542803146787032e-06, "loss": 0.402, "num_input_tokens_seen": 7226000, "step": 11005 }, { "epoch": 6.491745283018868, "grad_norm": 5.597531318664551, "learning_rate": 8.54098724232983e-06, "loss": 0.5029, "num_input_tokens_seen": 7228720, "step": 11010 }, { "epoch": 6.494693396226415, "grad_norm": 1.5380442142486572, "learning_rate": 8.539170400380994e-06, "loss": 0.4631, "num_input_tokens_seen": 7232208, "step": 11015 }, { "epoch": 6.497641509433962, "grad_norm": 1.0907167196273804, "learning_rate": 8.537352621421542e-06, "loss": 0.4308, "num_input_tokens_seen": 7235696, "step": 11020 }, { "epoch": 6.50058962264151, "grad_norm": 2.248356342315674, "learning_rate": 8.535533905932739e-06, "loss": 0.6868, "num_input_tokens_seen": 7239504, "step": 11025 }, { "epoch": 6.503537735849057, "grad_norm": 2.406081199645996, "learning_rate": 8.533714254396096e-06, "loss": 0.5586, "num_input_tokens_seen": 7243088, "step": 11030 }, { "epoch": 6.506485849056604, "grad_norm": 1.425073266029358, "learning_rate": 8.531893667293375e-06, "loss": 0.4221, "num_input_tokens_seen": 7246640, "step": 11035 }, { "epoch": 6.509433962264151, "grad_norm": 2.926396369934082, "learning_rate": 8.530072145106585e-06, "loss": 0.514, "num_input_tokens_seen": 7250800, "step": 11040 }, { "epoch": 6.512382075471698, "grad_norm": 2.3526628017425537, "learning_rate": 8.528249688317978e-06, "loss": 0.5119, "num_input_tokens_seen": 7253776, "step": 11045 }, { "epoch": 6.515330188679245, "grad_norm": 1.8358136415481567, "learning_rate": 8.526426297410062e-06, "loss": 0.4382, "num_input_tokens_seen": 7256848, "step": 11050 }, { "epoch": 6.5182783018867925, "grad_norm": 1.6634769439697266, "learning_rate": 8.524601972865586e-06, "loss": 0.5176, "num_input_tokens_seen": 7260048, "step": 11055 }, { "epoch": 6.52122641509434, "grad_norm": 1.4455606937408447, "learning_rate": 8.522776715167548e-06, "loss": 0.4422, "num_input_tokens_seen": 7262928, "step": 11060 }, { "epoch": 6.524174528301887, "grad_norm": 3.0320675373077393, "learning_rate": 8.520950524799192e-06, "loss": 0.4559, "num_input_tokens_seen": 7266128, "step": 11065 }, { "epoch": 6.527122641509434, "grad_norm": 2.026231050491333, "learning_rate": 8.51912340224401e-06, "loss": 0.3113, "num_input_tokens_seen": 7269296, "step": 11070 }, { "epoch": 6.530070754716981, "grad_norm": 1.1862576007843018, "learning_rate": 8.51729534798574e-06, "loss": 0.3219, "num_input_tokens_seen": 7273552, "step": 11075 }, { "epoch": 6.533018867924528, "grad_norm": 1.5939984321594238, "learning_rate": 8.515466362508369e-06, "loss": 0.3841, "num_input_tokens_seen": 7276976, "step": 11080 }, { "epoch": 6.535966981132075, "grad_norm": 1.36613929271698, "learning_rate": 8.513636446296125e-06, "loss": 0.4223, "num_input_tokens_seen": 7280016, "step": 11085 }, { "epoch": 6.538915094339623, "grad_norm": 1.187804937362671, "learning_rate": 8.51180559983349e-06, "loss": 0.3599, "num_input_tokens_seen": 7282992, "step": 11090 }, { "epoch": 6.54186320754717, "grad_norm": 2.3572309017181396, "learning_rate": 8.50997382360519e-06, "loss": 0.4786, "num_input_tokens_seen": 7286384, "step": 11095 }, { "epoch": 6.544811320754717, "grad_norm": 1.443867564201355, "learning_rate": 8.508141118096191e-06, "loss": 0.5104, "num_input_tokens_seen": 7289904, "step": 11100 }, { "epoch": 6.5477594339622645, "grad_norm": 3.8269262313842773, "learning_rate": 8.506307483791712e-06, "loss": 0.4027, "num_input_tokens_seen": 7292784, "step": 11105 }, { "epoch": 6.550707547169811, "grad_norm": 1.9308717250823975, "learning_rate": 8.504472921177215e-06, "loss": 0.4172, "num_input_tokens_seen": 7295760, "step": 11110 }, { "epoch": 6.553655660377358, "grad_norm": 1.519090175628662, "learning_rate": 8.502637430738409e-06, "loss": 0.5165, "num_input_tokens_seen": 7299152, "step": 11115 }, { "epoch": 6.556603773584905, "grad_norm": 1.6217608451843262, "learning_rate": 8.500801012961248e-06, "loss": 0.4026, "num_input_tokens_seen": 7302896, "step": 11120 }, { "epoch": 6.559551886792453, "grad_norm": 2.257913112640381, "learning_rate": 8.49896366833193e-06, "loss": 0.3445, "num_input_tokens_seen": 7306000, "step": 11125 }, { "epoch": 6.5625, "grad_norm": 3.87943434715271, "learning_rate": 8.497125397336903e-06, "loss": 0.3539, "num_input_tokens_seen": 7308528, "step": 11130 }, { "epoch": 6.565448113207547, "grad_norm": 2.2394444942474365, "learning_rate": 8.495286200462854e-06, "loss": 0.497, "num_input_tokens_seen": 7311920, "step": 11135 }, { "epoch": 6.568396226415095, "grad_norm": 2.429954767227173, "learning_rate": 8.49344607819672e-06, "loss": 0.3724, "num_input_tokens_seen": 7314928, "step": 11140 }, { "epoch": 6.571344339622642, "grad_norm": 2.223179817199707, "learning_rate": 8.49160503102568e-06, "loss": 0.5035, "num_input_tokens_seen": 7319248, "step": 11145 }, { "epoch": 6.574292452830189, "grad_norm": 1.867133378982544, "learning_rate": 8.489763059437161e-06, "loss": 0.4355, "num_input_tokens_seen": 7321808, "step": 11150 }, { "epoch": 6.5772405660377355, "grad_norm": 1.8614447116851807, "learning_rate": 8.487920163918833e-06, "loss": 0.4011, "num_input_tokens_seen": 7325168, "step": 11155 }, { "epoch": 6.580188679245283, "grad_norm": 1.6253000497817993, "learning_rate": 8.486076344958607e-06, "loss": 0.4893, "num_input_tokens_seen": 7329168, "step": 11160 }, { "epoch": 6.58313679245283, "grad_norm": 2.1671204566955566, "learning_rate": 8.484231603044647e-06, "loss": 0.653, "num_input_tokens_seen": 7333360, "step": 11165 }, { "epoch": 6.586084905660377, "grad_norm": 2.0236988067626953, "learning_rate": 8.482385938665352e-06, "loss": 0.4878, "num_input_tokens_seen": 7335984, "step": 11170 }, { "epoch": 6.589033018867925, "grad_norm": 1.955255150794983, "learning_rate": 8.480539352309373e-06, "loss": 0.4477, "num_input_tokens_seen": 7338768, "step": 11175 }, { "epoch": 6.591981132075472, "grad_norm": 2.6645772457122803, "learning_rate": 8.478691844465598e-06, "loss": 0.4509, "num_input_tokens_seen": 7341264, "step": 11180 }, { "epoch": 6.594929245283019, "grad_norm": 2.230401039123535, "learning_rate": 8.476843415623168e-06, "loss": 0.3834, "num_input_tokens_seen": 7344304, "step": 11185 }, { "epoch": 6.597877358490566, "grad_norm": 1.9272756576538086, "learning_rate": 8.474994066271458e-06, "loss": 0.461, "num_input_tokens_seen": 7347408, "step": 11190 }, { "epoch": 6.600825471698113, "grad_norm": 1.354148030281067, "learning_rate": 8.473143796900089e-06, "loss": 0.4605, "num_input_tokens_seen": 7352144, "step": 11195 }, { "epoch": 6.60377358490566, "grad_norm": 3.185457468032837, "learning_rate": 8.471292607998936e-06, "loss": 0.4432, "num_input_tokens_seen": 7355280, "step": 11200 }, { "epoch": 6.6067216981132075, "grad_norm": 2.664318561553955, "learning_rate": 8.469440500058104e-06, "loss": 0.4268, "num_input_tokens_seen": 7358768, "step": 11205 }, { "epoch": 6.609669811320755, "grad_norm": 1.6454737186431885, "learning_rate": 8.467587473567945e-06, "loss": 0.4138, "num_input_tokens_seen": 7362064, "step": 11210 }, { "epoch": 6.612617924528302, "grad_norm": 1.6478787660598755, "learning_rate": 8.46573352901906e-06, "loss": 0.3875, "num_input_tokens_seen": 7365072, "step": 11215 }, { "epoch": 6.615566037735849, "grad_norm": 1.7160217761993408, "learning_rate": 8.463878666902286e-06, "loss": 0.4012, "num_input_tokens_seen": 7367600, "step": 11220 }, { "epoch": 6.618514150943396, "grad_norm": 1.258363127708435, "learning_rate": 8.462022887708706e-06, "loss": 0.4284, "num_input_tokens_seen": 7370992, "step": 11225 }, { "epoch": 6.621462264150943, "grad_norm": 1.903702735900879, "learning_rate": 8.460166191929646e-06, "loss": 0.4735, "num_input_tokens_seen": 7373968, "step": 11230 }, { "epoch": 6.62441037735849, "grad_norm": 2.119025230407715, "learning_rate": 8.458308580056675e-06, "loss": 0.4437, "num_input_tokens_seen": 7377712, "step": 11235 }, { "epoch": 6.627358490566038, "grad_norm": 1.918455958366394, "learning_rate": 8.456450052581602e-06, "loss": 0.3872, "num_input_tokens_seen": 7380816, "step": 11240 }, { "epoch": 6.630306603773585, "grad_norm": 1.9621347188949585, "learning_rate": 8.45459060999648e-06, "loss": 0.3692, "num_input_tokens_seen": 7383536, "step": 11245 }, { "epoch": 6.633254716981132, "grad_norm": 1.4190667867660522, "learning_rate": 8.452730252793608e-06, "loss": 0.5152, "num_input_tokens_seen": 7386992, "step": 11250 }, { "epoch": 6.6362028301886795, "grad_norm": 2.291546583175659, "learning_rate": 8.450868981465519e-06, "loss": 0.4973, "num_input_tokens_seen": 7389392, "step": 11255 }, { "epoch": 6.639150943396227, "grad_norm": 0.366848886013031, "learning_rate": 8.449006796504997e-06, "loss": 0.2997, "num_input_tokens_seen": 7395312, "step": 11260 }, { "epoch": 6.642099056603773, "grad_norm": 1.3735781908035278, "learning_rate": 8.44714369840506e-06, "loss": 0.4799, "num_input_tokens_seen": 7397776, "step": 11265 }, { "epoch": 6.6450471698113205, "grad_norm": 1.553718090057373, "learning_rate": 8.445279687658973e-06, "loss": 0.5153, "num_input_tokens_seen": 7402224, "step": 11270 }, { "epoch": 6.647995283018868, "grad_norm": 1.4963597059249878, "learning_rate": 8.44341476476024e-06, "loss": 0.3461, "num_input_tokens_seen": 7405392, "step": 11275 }, { "epoch": 6.650943396226415, "grad_norm": 2.746622085571289, "learning_rate": 8.441548930202608e-06, "loss": 0.4903, "num_input_tokens_seen": 7407760, "step": 11280 }, { "epoch": 6.653891509433962, "grad_norm": 0.8991411924362183, "learning_rate": 8.439682184480065e-06, "loss": 0.3225, "num_input_tokens_seen": 7410576, "step": 11285 }, { "epoch": 6.65683962264151, "grad_norm": 3.0868282318115234, "learning_rate": 8.437814528086837e-06, "loss": 0.615, "num_input_tokens_seen": 7414032, "step": 11290 }, { "epoch": 6.659787735849057, "grad_norm": 1.6003422737121582, "learning_rate": 8.435945961517398e-06, "loss": 0.3833, "num_input_tokens_seen": 7417552, "step": 11295 }, { "epoch": 6.662735849056604, "grad_norm": 2.346701145172119, "learning_rate": 8.434076485266458e-06, "loss": 0.3937, "num_input_tokens_seen": 7420784, "step": 11300 }, { "epoch": 6.665683962264151, "grad_norm": 3.0627927780151367, "learning_rate": 8.432206099828969e-06, "loss": 0.4266, "num_input_tokens_seen": 7424400, "step": 11305 }, { "epoch": 6.668632075471698, "grad_norm": 1.851887583732605, "learning_rate": 8.430334805700122e-06, "loss": 0.3724, "num_input_tokens_seen": 7427248, "step": 11310 }, { "epoch": 6.671580188679245, "grad_norm": 1.4399101734161377, "learning_rate": 8.428462603375351e-06, "loss": 0.4405, "num_input_tokens_seen": 7430864, "step": 11315 }, { "epoch": 6.6745283018867925, "grad_norm": 1.5857211351394653, "learning_rate": 8.426589493350332e-06, "loss": 0.5346, "num_input_tokens_seen": 7434320, "step": 11320 }, { "epoch": 6.67747641509434, "grad_norm": 2.541590929031372, "learning_rate": 8.424715476120976e-06, "loss": 0.4507, "num_input_tokens_seen": 7437584, "step": 11325 }, { "epoch": 6.680424528301887, "grad_norm": 2.995051383972168, "learning_rate": 8.422840552183437e-06, "loss": 0.4607, "num_input_tokens_seen": 7440016, "step": 11330 }, { "epoch": 6.683372641509434, "grad_norm": 1.498258113861084, "learning_rate": 8.420964722034111e-06, "loss": 0.4922, "num_input_tokens_seen": 7444784, "step": 11335 }, { "epoch": 6.686320754716981, "grad_norm": 1.7142179012298584, "learning_rate": 8.41908798616963e-06, "loss": 0.5538, "num_input_tokens_seen": 7448624, "step": 11340 }, { "epoch": 6.689268867924528, "grad_norm": 2.1959855556488037, "learning_rate": 8.41721034508687e-06, "loss": 0.403, "num_input_tokens_seen": 7452144, "step": 11345 }, { "epoch": 6.692216981132075, "grad_norm": 2.629152774810791, "learning_rate": 8.415331799282942e-06, "loss": 0.3376, "num_input_tokens_seen": 7454768, "step": 11350 }, { "epoch": 6.695165094339623, "grad_norm": 6.8917460441589355, "learning_rate": 8.413452349255205e-06, "loss": 0.3938, "num_input_tokens_seen": 7457232, "step": 11355 }, { "epoch": 6.69811320754717, "grad_norm": 4.9069600105285645, "learning_rate": 8.411571995501245e-06, "loss": 0.4979, "num_input_tokens_seen": 7460048, "step": 11360 }, { "epoch": 6.701061320754717, "grad_norm": 1.2454969882965088, "learning_rate": 8.409690738518895e-06, "loss": 0.5086, "num_input_tokens_seen": 7462928, "step": 11365 }, { "epoch": 6.7040094339622645, "grad_norm": 1.4024739265441895, "learning_rate": 8.407808578806229e-06, "loss": 0.5756, "num_input_tokens_seen": 7465392, "step": 11370 }, { "epoch": 6.706957547169811, "grad_norm": 3.6195576190948486, "learning_rate": 8.405925516861555e-06, "loss": 0.5012, "num_input_tokens_seen": 7468496, "step": 11375 }, { "epoch": 6.709905660377358, "grad_norm": 1.5025053024291992, "learning_rate": 8.40404155318342e-06, "loss": 0.3634, "num_input_tokens_seen": 7471344, "step": 11380 }, { "epoch": 6.712853773584905, "grad_norm": 1.415581226348877, "learning_rate": 8.402156688270613e-06, "loss": 0.358, "num_input_tokens_seen": 7475376, "step": 11385 }, { "epoch": 6.715801886792453, "grad_norm": 2.4562671184539795, "learning_rate": 8.400270922622162e-06, "loss": 0.597, "num_input_tokens_seen": 7478768, "step": 11390 }, { "epoch": 6.71875, "grad_norm": 1.5907584428787231, "learning_rate": 8.398384256737328e-06, "loss": 0.4347, "num_input_tokens_seen": 7482352, "step": 11395 }, { "epoch": 6.721698113207547, "grad_norm": 1.9882957935333252, "learning_rate": 8.396496691115619e-06, "loss": 0.508, "num_input_tokens_seen": 7485360, "step": 11400 }, { "epoch": 6.724646226415095, "grad_norm": 2.527129650115967, "learning_rate": 8.39460822625677e-06, "loss": 0.4114, "num_input_tokens_seen": 7489968, "step": 11405 }, { "epoch": 6.727594339622642, "grad_norm": 1.5274602174758911, "learning_rate": 8.392718862660765e-06, "loss": 0.4081, "num_input_tokens_seen": 7492656, "step": 11410 }, { "epoch": 6.730542452830189, "grad_norm": 1.8379415273666382, "learning_rate": 8.390828600827818e-06, "loss": 0.469, "num_input_tokens_seen": 7496240, "step": 11415 }, { "epoch": 6.7334905660377355, "grad_norm": 2.040283203125, "learning_rate": 8.388937441258385e-06, "loss": 0.4896, "num_input_tokens_seen": 7498960, "step": 11420 }, { "epoch": 6.736438679245283, "grad_norm": 1.998300313949585, "learning_rate": 8.387045384453162e-06, "loss": 0.5452, "num_input_tokens_seen": 7501648, "step": 11425 }, { "epoch": 6.73938679245283, "grad_norm": 3.01244854927063, "learning_rate": 8.385152430913073e-06, "loss": 0.4492, "num_input_tokens_seen": 7506192, "step": 11430 }, { "epoch": 6.742334905660377, "grad_norm": 0.9951661825180054, "learning_rate": 8.383258581139288e-06, "loss": 0.3846, "num_input_tokens_seen": 7509712, "step": 11435 }, { "epoch": 6.745283018867925, "grad_norm": 2.267564296722412, "learning_rate": 8.381363835633213e-06, "loss": 0.4649, "num_input_tokens_seen": 7513040, "step": 11440 }, { "epoch": 6.748231132075472, "grad_norm": 2.438011407852173, "learning_rate": 8.379468194896492e-06, "loss": 0.4282, "num_input_tokens_seen": 7515984, "step": 11445 }, { "epoch": 6.751179245283019, "grad_norm": 3.7735676765441895, "learning_rate": 8.377571659431e-06, "loss": 0.385, "num_input_tokens_seen": 7519024, "step": 11450 }, { "epoch": 6.754127358490566, "grad_norm": 2.6648058891296387, "learning_rate": 8.375674229738855e-06, "loss": 0.4956, "num_input_tokens_seen": 7521744, "step": 11455 }, { "epoch": 6.757075471698113, "grad_norm": 2.5871646404266357, "learning_rate": 8.37377590632241e-06, "loss": 0.4757, "num_input_tokens_seen": 7524720, "step": 11460 }, { "epoch": 6.76002358490566, "grad_norm": 1.1515957117080688, "learning_rate": 8.371876689684253e-06, "loss": 0.4383, "num_input_tokens_seen": 7527888, "step": 11465 }, { "epoch": 6.7629716981132075, "grad_norm": 1.4539638757705688, "learning_rate": 8.369976580327211e-06, "loss": 0.4542, "num_input_tokens_seen": 7531728, "step": 11470 }, { "epoch": 6.765919811320755, "grad_norm": 1.147419810295105, "learning_rate": 8.368075578754345e-06, "loss": 0.3841, "num_input_tokens_seen": 7535728, "step": 11475 }, { "epoch": 6.768867924528302, "grad_norm": 2.03171443939209, "learning_rate": 8.366173685468952e-06, "loss": 0.3842, "num_input_tokens_seen": 7538640, "step": 11480 }, { "epoch": 6.771816037735849, "grad_norm": 1.7911927700042725, "learning_rate": 8.364270900974572e-06, "loss": 0.4033, "num_input_tokens_seen": 7541264, "step": 11485 }, { "epoch": 6.774764150943396, "grad_norm": 2.0398805141448975, "learning_rate": 8.362367225774968e-06, "loss": 0.4875, "num_input_tokens_seen": 7544400, "step": 11490 }, { "epoch": 6.777712264150943, "grad_norm": 4.555403232574463, "learning_rate": 8.360462660374153e-06, "loss": 0.7266, "num_input_tokens_seen": 7547952, "step": 11495 }, { "epoch": 6.78066037735849, "grad_norm": 3.55256724357605, "learning_rate": 8.358557205276365e-06, "loss": 0.6, "num_input_tokens_seen": 7551376, "step": 11500 }, { "epoch": 6.783608490566038, "grad_norm": 3.804173469543457, "learning_rate": 8.356650860986083e-06, "loss": 0.3786, "num_input_tokens_seen": 7554672, "step": 11505 }, { "epoch": 6.786556603773585, "grad_norm": 1.13832426071167, "learning_rate": 8.354743628008017e-06, "loss": 0.4192, "num_input_tokens_seen": 7557808, "step": 11510 }, { "epoch": 6.789504716981132, "grad_norm": 2.2475943565368652, "learning_rate": 8.35283550684712e-06, "loss": 0.5177, "num_input_tokens_seen": 7560688, "step": 11515 }, { "epoch": 6.7924528301886795, "grad_norm": 7.196534156799316, "learning_rate": 8.350926498008572e-06, "loss": 0.5062, "num_input_tokens_seen": 7563632, "step": 11520 }, { "epoch": 6.795400943396227, "grad_norm": 1.9636330604553223, "learning_rate": 8.34901660199779e-06, "loss": 0.5307, "num_input_tokens_seen": 7566544, "step": 11525 }, { "epoch": 6.798349056603773, "grad_norm": 4.1634063720703125, "learning_rate": 8.347105819320432e-06, "loss": 0.4815, "num_input_tokens_seen": 7570608, "step": 11530 }, { "epoch": 6.8012971698113205, "grad_norm": 3.26904559135437, "learning_rate": 8.34519415048238e-06, "loss": 0.4578, "num_input_tokens_seen": 7572912, "step": 11535 }, { "epoch": 6.804245283018868, "grad_norm": 2.393944025039673, "learning_rate": 8.343281595989761e-06, "loss": 0.449, "num_input_tokens_seen": 7575568, "step": 11540 }, { "epoch": 6.807193396226415, "grad_norm": 1.3488432168960571, "learning_rate": 8.341368156348933e-06, "loss": 0.3871, "num_input_tokens_seen": 7578992, "step": 11545 }, { "epoch": 6.810141509433962, "grad_norm": 2.5659494400024414, "learning_rate": 8.339453832066482e-06, "loss": 0.4082, "num_input_tokens_seen": 7582384, "step": 11550 }, { "epoch": 6.81308962264151, "grad_norm": 1.7893139123916626, "learning_rate": 8.337538623649237e-06, "loss": 0.4461, "num_input_tokens_seen": 7585616, "step": 11555 }, { "epoch": 6.816037735849057, "grad_norm": 2.168074131011963, "learning_rate": 8.33562253160426e-06, "loss": 0.5013, "num_input_tokens_seen": 7589840, "step": 11560 }, { "epoch": 6.818985849056604, "grad_norm": 1.3223870992660522, "learning_rate": 8.33370555643884e-06, "loss": 0.3014, "num_input_tokens_seen": 7593360, "step": 11565 }, { "epoch": 6.821933962264151, "grad_norm": 5.20681095123291, "learning_rate": 8.331787698660507e-06, "loss": 0.4338, "num_input_tokens_seen": 7597040, "step": 11570 }, { "epoch": 6.824882075471698, "grad_norm": 10.663688659667969, "learning_rate": 8.32986895877702e-06, "loss": 0.4124, "num_input_tokens_seen": 7600752, "step": 11575 }, { "epoch": 6.827830188679245, "grad_norm": 2.2869656085968018, "learning_rate": 8.327949337296378e-06, "loss": 0.38, "num_input_tokens_seen": 7603344, "step": 11580 }, { "epoch": 6.8307783018867925, "grad_norm": 1.0428977012634277, "learning_rate": 8.326028834726803e-06, "loss": 0.4106, "num_input_tokens_seen": 7607920, "step": 11585 }, { "epoch": 6.83372641509434, "grad_norm": 1.7405577898025513, "learning_rate": 8.324107451576762e-06, "loss": 0.4325, "num_input_tokens_seen": 7611120, "step": 11590 }, { "epoch": 6.836674528301887, "grad_norm": 2.5921826362609863, "learning_rate": 8.322185188354947e-06, "loss": 0.4743, "num_input_tokens_seen": 7614480, "step": 11595 }, { "epoch": 6.839622641509434, "grad_norm": 2.7361819744110107, "learning_rate": 8.320262045570284e-06, "loss": 0.3914, "num_input_tokens_seen": 7618224, "step": 11600 }, { "epoch": 6.842570754716981, "grad_norm": 1.983719825744629, "learning_rate": 8.318338023731937e-06, "loss": 0.4074, "num_input_tokens_seen": 7621520, "step": 11605 }, { "epoch": 6.845518867924528, "grad_norm": 1.7998954057693481, "learning_rate": 8.316413123349296e-06, "loss": 0.5061, "num_input_tokens_seen": 7624816, "step": 11610 }, { "epoch": 6.848466981132075, "grad_norm": 2.1564524173736572, "learning_rate": 8.314487344931987e-06, "loss": 0.388, "num_input_tokens_seen": 7628016, "step": 11615 }, { "epoch": 6.851415094339623, "grad_norm": 1.2488651275634766, "learning_rate": 8.31256068898987e-06, "loss": 0.3652, "num_input_tokens_seen": 7631632, "step": 11620 }, { "epoch": 6.85436320754717, "grad_norm": 4.252068519592285, "learning_rate": 8.310633156033032e-06, "loss": 0.474, "num_input_tokens_seen": 7634672, "step": 11625 }, { "epoch": 6.857311320754717, "grad_norm": 1.8225339651107788, "learning_rate": 8.3087047465718e-06, "loss": 0.612, "num_input_tokens_seen": 7637712, "step": 11630 }, { "epoch": 6.8602594339622645, "grad_norm": 5.244790077209473, "learning_rate": 8.306775461116727e-06, "loss": 0.3991, "num_input_tokens_seen": 7640816, "step": 11635 }, { "epoch": 6.863207547169811, "grad_norm": 1.1979581117630005, "learning_rate": 8.304845300178597e-06, "loss": 0.4055, "num_input_tokens_seen": 7643984, "step": 11640 }, { "epoch": 6.866155660377358, "grad_norm": 2.194411516189575, "learning_rate": 8.302914264268433e-06, "loss": 0.412, "num_input_tokens_seen": 7647280, "step": 11645 }, { "epoch": 6.869103773584905, "grad_norm": 2.016065835952759, "learning_rate": 8.300982353897482e-06, "loss": 0.388, "num_input_tokens_seen": 7649968, "step": 11650 }, { "epoch": 6.872051886792453, "grad_norm": 1.3579586744308472, "learning_rate": 8.299049569577226e-06, "loss": 0.4523, "num_input_tokens_seen": 7653456, "step": 11655 }, { "epoch": 6.875, "grad_norm": 3.2433743476867676, "learning_rate": 8.297115911819379e-06, "loss": 0.4513, "num_input_tokens_seen": 7656592, "step": 11660 }, { "epoch": 6.877948113207547, "grad_norm": 2.4334299564361572, "learning_rate": 8.295181381135884e-06, "loss": 0.3691, "num_input_tokens_seen": 7659504, "step": 11665 }, { "epoch": 6.880896226415095, "grad_norm": 1.556731939315796, "learning_rate": 8.293245978038917e-06, "loss": 0.4545, "num_input_tokens_seen": 7664944, "step": 11670 }, { "epoch": 6.883844339622642, "grad_norm": 1.6482861042022705, "learning_rate": 8.291309703040884e-06, "loss": 0.4537, "num_input_tokens_seen": 7668016, "step": 11675 }, { "epoch": 6.886792452830189, "grad_norm": 1.8837474584579468, "learning_rate": 8.289372556654422e-06, "loss": 0.386, "num_input_tokens_seen": 7671344, "step": 11680 }, { "epoch": 6.8897405660377355, "grad_norm": 2.6438755989074707, "learning_rate": 8.287434539392401e-06, "loss": 0.5028, "num_input_tokens_seen": 7674768, "step": 11685 }, { "epoch": 6.892688679245283, "grad_norm": 1.4248392581939697, "learning_rate": 8.285495651767916e-06, "loss": 0.5023, "num_input_tokens_seen": 7678288, "step": 11690 }, { "epoch": 6.89563679245283, "grad_norm": 1.6595683097839355, "learning_rate": 8.283555894294297e-06, "loss": 0.4833, "num_input_tokens_seen": 7682128, "step": 11695 }, { "epoch": 6.898584905660377, "grad_norm": 1.8650459051132202, "learning_rate": 8.281615267485105e-06, "loss": 0.464, "num_input_tokens_seen": 7686384, "step": 11700 }, { "epoch": 6.901533018867925, "grad_norm": 1.4705986976623535, "learning_rate": 8.279673771854127e-06, "loss": 0.4405, "num_input_tokens_seen": 7689200, "step": 11705 }, { "epoch": 6.904481132075472, "grad_norm": 1.9228847026824951, "learning_rate": 8.277731407915386e-06, "loss": 0.3909, "num_input_tokens_seen": 7692080, "step": 11710 }, { "epoch": 6.907429245283019, "grad_norm": 1.8446587324142456, "learning_rate": 8.275788176183126e-06, "loss": 0.4255, "num_input_tokens_seen": 7694896, "step": 11715 }, { "epoch": 6.910377358490566, "grad_norm": 2.2312676906585693, "learning_rate": 8.273844077171827e-06, "loss": 0.3689, "num_input_tokens_seen": 7698992, "step": 11720 }, { "epoch": 6.913325471698113, "grad_norm": 1.8863800764083862, "learning_rate": 8.271899111396202e-06, "loss": 0.5165, "num_input_tokens_seen": 7701648, "step": 11725 }, { "epoch": 6.91627358490566, "grad_norm": 2.0072247982025146, "learning_rate": 8.269953279371185e-06, "loss": 0.4842, "num_input_tokens_seen": 7704080, "step": 11730 }, { "epoch": 6.9192216981132075, "grad_norm": 1.004292368888855, "learning_rate": 8.268006581611945e-06, "loss": 0.3506, "num_input_tokens_seen": 7707536, "step": 11735 }, { "epoch": 6.922169811320755, "grad_norm": 2.1583240032196045, "learning_rate": 8.266059018633878e-06, "loss": 0.3547, "num_input_tokens_seen": 7711536, "step": 11740 }, { "epoch": 6.925117924528302, "grad_norm": 1.6757086515426636, "learning_rate": 8.264110590952609e-06, "loss": 0.4725, "num_input_tokens_seen": 7714288, "step": 11745 }, { "epoch": 6.928066037735849, "grad_norm": 1.5182878971099854, "learning_rate": 8.262161299083993e-06, "loss": 0.4083, "num_input_tokens_seen": 7717456, "step": 11750 }, { "epoch": 6.931014150943396, "grad_norm": 1.552993655204773, "learning_rate": 8.260211143544117e-06, "loss": 0.3985, "num_input_tokens_seen": 7720336, "step": 11755 }, { "epoch": 6.933962264150943, "grad_norm": 1.9180445671081543, "learning_rate": 8.258260124849288e-06, "loss": 0.3985, "num_input_tokens_seen": 7723664, "step": 11760 }, { "epoch": 6.93691037735849, "grad_norm": 3.434887647628784, "learning_rate": 8.256308243516048e-06, "loss": 0.3836, "num_input_tokens_seen": 7726672, "step": 11765 }, { "epoch": 6.939858490566038, "grad_norm": 1.221590280532837, "learning_rate": 8.254355500061168e-06, "loss": 0.367, "num_input_tokens_seen": 7730032, "step": 11770 }, { "epoch": 6.942806603773585, "grad_norm": 2.1012253761291504, "learning_rate": 8.252401895001643e-06, "loss": 0.3895, "num_input_tokens_seen": 7732976, "step": 11775 }, { "epoch": 6.945754716981132, "grad_norm": 1.6964490413665771, "learning_rate": 8.2504474288547e-06, "loss": 0.527, "num_input_tokens_seen": 7736080, "step": 11780 }, { "epoch": 6.9487028301886795, "grad_norm": 2.1091439723968506, "learning_rate": 8.248492102137791e-06, "loss": 0.3611, "num_input_tokens_seen": 7738608, "step": 11785 }, { "epoch": 6.951650943396227, "grad_norm": 1.230379581451416, "learning_rate": 8.246535915368596e-06, "loss": 0.4695, "num_input_tokens_seen": 7741488, "step": 11790 }, { "epoch": 6.954599056603773, "grad_norm": 1.906438946723938, "learning_rate": 8.24457886906503e-06, "loss": 0.4147, "num_input_tokens_seen": 7744272, "step": 11795 }, { "epoch": 6.9575471698113205, "grad_norm": 1.7286823987960815, "learning_rate": 8.242620963745222e-06, "loss": 0.3758, "num_input_tokens_seen": 7747632, "step": 11800 }, { "epoch": 6.960495283018868, "grad_norm": 2.2616677284240723, "learning_rate": 8.240662199927538e-06, "loss": 0.379, "num_input_tokens_seen": 7751056, "step": 11805 }, { "epoch": 6.963443396226415, "grad_norm": 1.477638602256775, "learning_rate": 8.238702578130573e-06, "loss": 0.4208, "num_input_tokens_seen": 7754032, "step": 11810 }, { "epoch": 6.966391509433962, "grad_norm": 3.61598539352417, "learning_rate": 8.23674209887314e-06, "loss": 0.3842, "num_input_tokens_seen": 7757360, "step": 11815 }, { "epoch": 6.96933962264151, "grad_norm": 4.0377702713012695, "learning_rate": 8.234780762674288e-06, "loss": 0.3807, "num_input_tokens_seen": 7759600, "step": 11820 }, { "epoch": 6.972287735849057, "grad_norm": 2.7432961463928223, "learning_rate": 8.232818570053286e-06, "loss": 0.417, "num_input_tokens_seen": 7762512, "step": 11825 }, { "epoch": 6.975235849056604, "grad_norm": 2.7062296867370605, "learning_rate": 8.230855521529637e-06, "loss": 0.5184, "num_input_tokens_seen": 7765904, "step": 11830 }, { "epoch": 6.978183962264151, "grad_norm": 1.6518627405166626, "learning_rate": 8.228891617623064e-06, "loss": 0.4304, "num_input_tokens_seen": 7770192, "step": 11835 }, { "epoch": 6.981132075471698, "grad_norm": 1.57875657081604, "learning_rate": 8.22692685885352e-06, "loss": 0.4797, "num_input_tokens_seen": 7773296, "step": 11840 }, { "epoch": 6.984080188679245, "grad_norm": 2.8679962158203125, "learning_rate": 8.224961245741183e-06, "loss": 0.4687, "num_input_tokens_seen": 7777680, "step": 11845 }, { "epoch": 6.9870283018867925, "grad_norm": 1.492464542388916, "learning_rate": 8.222994778806457e-06, "loss": 0.4845, "num_input_tokens_seen": 7780848, "step": 11850 }, { "epoch": 6.98997641509434, "grad_norm": 2.771681308746338, "learning_rate": 8.221027458569972e-06, "loss": 0.3958, "num_input_tokens_seen": 7783504, "step": 11855 }, { "epoch": 6.992924528301887, "grad_norm": 1.570156216621399, "learning_rate": 8.219059285552586e-06, "loss": 0.4146, "num_input_tokens_seen": 7786672, "step": 11860 }, { "epoch": 6.995872641509434, "grad_norm": 2.564511299133301, "learning_rate": 8.21709026027538e-06, "loss": 0.5341, "num_input_tokens_seen": 7790160, "step": 11865 }, { "epoch": 6.998820754716981, "grad_norm": 1.7155814170837402, "learning_rate": 8.215120383259664e-06, "loss": 0.3652, "num_input_tokens_seen": 7795088, "step": 11870 }, { "epoch": 7.001768867924528, "grad_norm": 1.6016138792037964, "learning_rate": 8.21314965502697e-06, "loss": 0.3401, "num_input_tokens_seen": 7797728, "step": 11875 }, { "epoch": 7.004716981132075, "grad_norm": 1.3065996170043945, "learning_rate": 8.211178076099056e-06, "loss": 0.4401, "num_input_tokens_seen": 7800384, "step": 11880 }, { "epoch": 7.007665094339623, "grad_norm": 2.4406204223632812, "learning_rate": 8.209205646997909e-06, "loss": 0.3522, "num_input_tokens_seen": 7804672, "step": 11885 }, { "epoch": 7.01061320754717, "grad_norm": 1.8363914489746094, "learning_rate": 8.207232368245735e-06, "loss": 0.3831, "num_input_tokens_seen": 7808128, "step": 11890 }, { "epoch": 7.013561320754717, "grad_norm": 1.7590378522872925, "learning_rate": 8.205258240364968e-06, "loss": 0.3935, "num_input_tokens_seen": 7810656, "step": 11895 }, { "epoch": 7.0165094339622645, "grad_norm": 1.259320616722107, "learning_rate": 8.203283263878268e-06, "loss": 0.467, "num_input_tokens_seen": 7814656, "step": 11900 }, { "epoch": 7.019457547169812, "grad_norm": 3.2030887603759766, "learning_rate": 8.201307439308518e-06, "loss": 0.3805, "num_input_tokens_seen": 7817792, "step": 11905 }, { "epoch": 7.022405660377358, "grad_norm": 1.5931191444396973, "learning_rate": 8.199330767178828e-06, "loss": 0.465, "num_input_tokens_seen": 7821376, "step": 11910 }, { "epoch": 7.025353773584905, "grad_norm": 2.036374568939209, "learning_rate": 8.19735324801253e-06, "loss": 0.4596, "num_input_tokens_seen": 7825184, "step": 11915 }, { "epoch": 7.028301886792453, "grad_norm": 1.6385753154754639, "learning_rate": 8.195374882333178e-06, "loss": 0.4848, "num_input_tokens_seen": 7828864, "step": 11920 }, { "epoch": 7.03125, "grad_norm": 1.7353466749191284, "learning_rate": 8.193395670664555e-06, "loss": 0.5123, "num_input_tokens_seen": 7832160, "step": 11925 }, { "epoch": 7.034198113207547, "grad_norm": 1.1664917469024658, "learning_rate": 8.191415613530667e-06, "loss": 0.4055, "num_input_tokens_seen": 7836192, "step": 11930 }, { "epoch": 7.037146226415095, "grad_norm": 1.2115110158920288, "learning_rate": 8.189434711455739e-06, "loss": 0.4218, "num_input_tokens_seen": 7840064, "step": 11935 }, { "epoch": 7.040094339622642, "grad_norm": 1.7713078260421753, "learning_rate": 8.187452964964226e-06, "loss": 0.4617, "num_input_tokens_seen": 7842912, "step": 11940 }, { "epoch": 7.043042452830188, "grad_norm": 2.3579745292663574, "learning_rate": 8.185470374580805e-06, "loss": 0.3302, "num_input_tokens_seen": 7845888, "step": 11945 }, { "epoch": 7.0459905660377355, "grad_norm": 2.8858420848846436, "learning_rate": 8.183486940830371e-06, "loss": 0.4191, "num_input_tokens_seen": 7849536, "step": 11950 }, { "epoch": 7.048938679245283, "grad_norm": 1.4819821119308472, "learning_rate": 8.18150266423805e-06, "loss": 0.4703, "num_input_tokens_seen": 7852832, "step": 11955 }, { "epoch": 7.05188679245283, "grad_norm": 2.6588611602783203, "learning_rate": 8.179517545329188e-06, "loss": 0.4608, "num_input_tokens_seen": 7855808, "step": 11960 }, { "epoch": 7.054834905660377, "grad_norm": 4.185413360595703, "learning_rate": 8.177531584629353e-06, "loss": 0.4026, "num_input_tokens_seen": 7858528, "step": 11965 }, { "epoch": 7.057783018867925, "grad_norm": 1.627937912940979, "learning_rate": 8.175544782664335e-06, "loss": 0.348, "num_input_tokens_seen": 7861312, "step": 11970 }, { "epoch": 7.060731132075472, "grad_norm": 1.9890239238739014, "learning_rate": 8.173557139960151e-06, "loss": 0.347, "num_input_tokens_seen": 7864416, "step": 11975 }, { "epoch": 7.063679245283019, "grad_norm": 3.1697793006896973, "learning_rate": 8.17156865704304e-06, "loss": 0.483, "num_input_tokens_seen": 7867200, "step": 11980 }, { "epoch": 7.066627358490566, "grad_norm": 1.8147103786468506, "learning_rate": 8.169579334439453e-06, "loss": 0.4707, "num_input_tokens_seen": 7870432, "step": 11985 }, { "epoch": 7.069575471698113, "grad_norm": 2.164008855819702, "learning_rate": 8.16758917267608e-06, "loss": 0.3344, "num_input_tokens_seen": 7874464, "step": 11990 }, { "epoch": 7.07252358490566, "grad_norm": 1.8056544065475464, "learning_rate": 8.165598172279822e-06, "loss": 0.4001, "num_input_tokens_seen": 7880512, "step": 11995 }, { "epoch": 7.0754716981132075, "grad_norm": 1.7205203771591187, "learning_rate": 8.163606333777804e-06, "loss": 0.3832, "num_input_tokens_seen": 7884896, "step": 12000 }, { "epoch": 7.078419811320755, "grad_norm": 2.6996588706970215, "learning_rate": 8.161613657697374e-06, "loss": 0.3609, "num_input_tokens_seen": 7888352, "step": 12005 }, { "epoch": 7.081367924528302, "grad_norm": 1.6810325384140015, "learning_rate": 8.159620144566103e-06, "loss": 0.4394, "num_input_tokens_seen": 7891776, "step": 12010 }, { "epoch": 7.084316037735849, "grad_norm": 1.8335344791412354, "learning_rate": 8.157625794911782e-06, "loss": 0.6511, "num_input_tokens_seen": 7894112, "step": 12015 }, { "epoch": 7.087264150943396, "grad_norm": 2.0450565814971924, "learning_rate": 8.155630609262424e-06, "loss": 0.4528, "num_input_tokens_seen": 7896928, "step": 12020 }, { "epoch": 7.090212264150943, "grad_norm": 1.6424673795700073, "learning_rate": 8.153634588146262e-06, "loss": 0.4214, "num_input_tokens_seen": 7899680, "step": 12025 }, { "epoch": 7.09316037735849, "grad_norm": 2.396646499633789, "learning_rate": 8.15163773209175e-06, "loss": 0.347, "num_input_tokens_seen": 7902112, "step": 12030 }, { "epoch": 7.096108490566038, "grad_norm": 1.0955971479415894, "learning_rate": 8.149640041627566e-06, "loss": 0.3758, "num_input_tokens_seen": 7905280, "step": 12035 }, { "epoch": 7.099056603773585, "grad_norm": 2.202089309692383, "learning_rate": 8.147641517282608e-06, "loss": 0.4507, "num_input_tokens_seen": 7907776, "step": 12040 }, { "epoch": 7.102004716981132, "grad_norm": 3.561007261276245, "learning_rate": 8.145642159585992e-06, "loss": 0.4391, "num_input_tokens_seen": 7910080, "step": 12045 }, { "epoch": 7.1049528301886795, "grad_norm": 2.517664909362793, "learning_rate": 8.143641969067057e-06, "loss": 0.3835, "num_input_tokens_seen": 7913568, "step": 12050 }, { "epoch": 7.107900943396227, "grad_norm": 2.1999099254608154, "learning_rate": 8.141640946255362e-06, "loss": 0.3479, "num_input_tokens_seen": 7916448, "step": 12055 }, { "epoch": 7.110849056603773, "grad_norm": 1.7840503454208374, "learning_rate": 8.139639091680687e-06, "loss": 0.3863, "num_input_tokens_seen": 7919232, "step": 12060 }, { "epoch": 7.1137971698113205, "grad_norm": 1.9401828050613403, "learning_rate": 8.137636405873031e-06, "loss": 0.4854, "num_input_tokens_seen": 7923040, "step": 12065 }, { "epoch": 7.116745283018868, "grad_norm": 1.621199607849121, "learning_rate": 8.135632889362614e-06, "loss": 0.4212, "num_input_tokens_seen": 7925952, "step": 12070 }, { "epoch": 7.119693396226415, "grad_norm": 1.313591480255127, "learning_rate": 8.133628542679879e-06, "loss": 0.4586, "num_input_tokens_seen": 7929472, "step": 12075 }, { "epoch": 7.122641509433962, "grad_norm": 2.0602595806121826, "learning_rate": 8.131623366355478e-06, "loss": 0.415, "num_input_tokens_seen": 7932384, "step": 12080 }, { "epoch": 7.12558962264151, "grad_norm": 2.3505077362060547, "learning_rate": 8.129617360920297e-06, "loss": 0.4964, "num_input_tokens_seen": 7936032, "step": 12085 }, { "epoch": 7.128537735849057, "grad_norm": 1.999080777168274, "learning_rate": 8.12761052690543e-06, "loss": 0.3816, "num_input_tokens_seen": 7938528, "step": 12090 }, { "epoch": 7.131485849056604, "grad_norm": 1.1627992391586304, "learning_rate": 8.125602864842197e-06, "loss": 0.6183, "num_input_tokens_seen": 7941472, "step": 12095 }, { "epoch": 7.134433962264151, "grad_norm": 1.5003302097320557, "learning_rate": 8.123594375262135e-06, "loss": 0.3942, "num_input_tokens_seen": 7944832, "step": 12100 }, { "epoch": 7.137382075471698, "grad_norm": 1.607213020324707, "learning_rate": 8.121585058697e-06, "loss": 0.3171, "num_input_tokens_seen": 7950752, "step": 12105 }, { "epoch": 7.140330188679245, "grad_norm": 1.706193208694458, "learning_rate": 8.119574915678767e-06, "loss": 0.3768, "num_input_tokens_seen": 7953696, "step": 12110 }, { "epoch": 7.1432783018867925, "grad_norm": 2.0735576152801514, "learning_rate": 8.117563946739632e-06, "loss": 0.3878, "num_input_tokens_seen": 7956896, "step": 12115 }, { "epoch": 7.14622641509434, "grad_norm": 5.1470255851745605, "learning_rate": 8.115552152412006e-06, "loss": 0.5886, "num_input_tokens_seen": 7959456, "step": 12120 }, { "epoch": 7.149174528301887, "grad_norm": 2.5840346813201904, "learning_rate": 8.11353953322852e-06, "loss": 0.4574, "num_input_tokens_seen": 7962336, "step": 12125 }, { "epoch": 7.152122641509434, "grad_norm": 1.8959447145462036, "learning_rate": 8.111526089722024e-06, "loss": 0.4872, "num_input_tokens_seen": 7964576, "step": 12130 }, { "epoch": 7.155070754716981, "grad_norm": 1.0848027467727661, "learning_rate": 8.109511822425586e-06, "loss": 0.3591, "num_input_tokens_seen": 7967584, "step": 12135 }, { "epoch": 7.158018867924528, "grad_norm": 3.7069387435913086, "learning_rate": 8.107496731872491e-06, "loss": 0.3826, "num_input_tokens_seen": 7971808, "step": 12140 }, { "epoch": 7.160966981132075, "grad_norm": 1.6599422693252563, "learning_rate": 8.105480818596243e-06, "loss": 0.3222, "num_input_tokens_seen": 7975008, "step": 12145 }, { "epoch": 7.163915094339623, "grad_norm": 1.824126958847046, "learning_rate": 8.103464083130566e-06, "loss": 0.418, "num_input_tokens_seen": 7977280, "step": 12150 }, { "epoch": 7.16686320754717, "grad_norm": 1.7617677450180054, "learning_rate": 8.101446526009397e-06, "loss": 0.3292, "num_input_tokens_seen": 7980480, "step": 12155 }, { "epoch": 7.169811320754717, "grad_norm": 2.185945987701416, "learning_rate": 8.099428147766894e-06, "loss": 0.3279, "num_input_tokens_seen": 7983552, "step": 12160 }, { "epoch": 7.1727594339622645, "grad_norm": 1.8317811489105225, "learning_rate": 8.097408948937431e-06, "loss": 0.5883, "num_input_tokens_seen": 7987136, "step": 12165 }, { "epoch": 7.175707547169812, "grad_norm": 1.8953253030776978, "learning_rate": 8.095388930055599e-06, "loss": 0.5065, "num_input_tokens_seen": 7990112, "step": 12170 }, { "epoch": 7.178655660377358, "grad_norm": 3.0639967918395996, "learning_rate": 8.093368091656209e-06, "loss": 0.5208, "num_input_tokens_seen": 7993312, "step": 12175 }, { "epoch": 7.181603773584905, "grad_norm": 1.3017091751098633, "learning_rate": 8.091346434274284e-06, "loss": 0.3945, "num_input_tokens_seen": 7996352, "step": 12180 }, { "epoch": 7.184551886792453, "grad_norm": 0.5470018982887268, "learning_rate": 8.089323958445068e-06, "loss": 0.3246, "num_input_tokens_seen": 8001248, "step": 12185 }, { "epoch": 7.1875, "grad_norm": 2.2512853145599365, "learning_rate": 8.08730066470402e-06, "loss": 0.3908, "num_input_tokens_seen": 8005024, "step": 12190 }, { "epoch": 7.190448113207547, "grad_norm": 1.9046117067337036, "learning_rate": 8.085276553586814e-06, "loss": 0.3799, "num_input_tokens_seen": 8008000, "step": 12195 }, { "epoch": 7.193396226415095, "grad_norm": 1.3622537851333618, "learning_rate": 8.083251625629345e-06, "loss": 0.3088, "num_input_tokens_seen": 8011072, "step": 12200 }, { "epoch": 7.196344339622642, "grad_norm": 5.916733264923096, "learning_rate": 8.08122588136772e-06, "loss": 0.3619, "num_input_tokens_seen": 8014048, "step": 12205 }, { "epoch": 7.199292452830188, "grad_norm": 1.282989740371704, "learning_rate": 8.079199321338262e-06, "loss": 0.2809, "num_input_tokens_seen": 8017248, "step": 12210 }, { "epoch": 7.2022405660377355, "grad_norm": 2.1556763648986816, "learning_rate": 8.077171946077516e-06, "loss": 0.3198, "num_input_tokens_seen": 8019648, "step": 12215 }, { "epoch": 7.205188679245283, "grad_norm": 1.9550988674163818, "learning_rate": 8.075143756122232e-06, "loss": 0.4055, "num_input_tokens_seen": 8022720, "step": 12220 }, { "epoch": 7.20813679245283, "grad_norm": 1.7085944414138794, "learning_rate": 8.073114752009388e-06, "loss": 0.4007, "num_input_tokens_seen": 8025760, "step": 12225 }, { "epoch": 7.211084905660377, "grad_norm": 1.7590181827545166, "learning_rate": 8.071084934276168e-06, "loss": 0.4246, "num_input_tokens_seen": 8028544, "step": 12230 }, { "epoch": 7.214033018867925, "grad_norm": 1.395263910293579, "learning_rate": 8.069054303459976e-06, "loss": 0.4218, "num_input_tokens_seen": 8031488, "step": 12235 }, { "epoch": 7.216981132075472, "grad_norm": 1.2848962545394897, "learning_rate": 8.06702286009843e-06, "loss": 0.5122, "num_input_tokens_seen": 8035648, "step": 12240 }, { "epoch": 7.219929245283019, "grad_norm": 2.3884332180023193, "learning_rate": 8.064990604729363e-06, "loss": 0.4796, "num_input_tokens_seen": 8038208, "step": 12245 }, { "epoch": 7.222877358490566, "grad_norm": 1.525738000869751, "learning_rate": 8.062957537890827e-06, "loss": 0.4411, "num_input_tokens_seen": 8041632, "step": 12250 }, { "epoch": 7.225825471698113, "grad_norm": 1.8145933151245117, "learning_rate": 8.060923660121081e-06, "loss": 0.4661, "num_input_tokens_seen": 8044768, "step": 12255 }, { "epoch": 7.22877358490566, "grad_norm": 5.097062587738037, "learning_rate": 8.058888971958603e-06, "loss": 0.3991, "num_input_tokens_seen": 8047744, "step": 12260 }, { "epoch": 7.2317216981132075, "grad_norm": 1.8016551733016968, "learning_rate": 8.056853473942085e-06, "loss": 0.4845, "num_input_tokens_seen": 8050464, "step": 12265 }, { "epoch": 7.234669811320755, "grad_norm": 1.5872262716293335, "learning_rate": 8.054817166610438e-06, "loss": 0.4454, "num_input_tokens_seen": 8053216, "step": 12270 }, { "epoch": 7.237617924528302, "grad_norm": 1.7079663276672363, "learning_rate": 8.052780050502781e-06, "loss": 0.4097, "num_input_tokens_seen": 8056768, "step": 12275 }, { "epoch": 7.240566037735849, "grad_norm": 2.230175018310547, "learning_rate": 8.050742126158448e-06, "loss": 0.4308, "num_input_tokens_seen": 8060224, "step": 12280 }, { "epoch": 7.243514150943396, "grad_norm": 2.062560796737671, "learning_rate": 8.04870339411699e-06, "loss": 0.5985, "num_input_tokens_seen": 8064832, "step": 12285 }, { "epoch": 7.246462264150943, "grad_norm": 1.0584306716918945, "learning_rate": 8.046663854918166e-06, "loss": 0.3934, "num_input_tokens_seen": 8068320, "step": 12290 }, { "epoch": 7.24941037735849, "grad_norm": 2.064013719558716, "learning_rate": 8.044623509101959e-06, "loss": 0.3619, "num_input_tokens_seen": 8070720, "step": 12295 }, { "epoch": 7.252358490566038, "grad_norm": 1.6640914678573608, "learning_rate": 8.042582357208557e-06, "loss": 0.4335, "num_input_tokens_seen": 8074304, "step": 12300 }, { "epoch": 7.255306603773585, "grad_norm": 1.350948691368103, "learning_rate": 8.04054039977836e-06, "loss": 0.5057, "num_input_tokens_seen": 8077408, "step": 12305 }, { "epoch": 7.258254716981132, "grad_norm": 2.0529353618621826, "learning_rate": 8.038497637351992e-06, "loss": 0.2991, "num_input_tokens_seen": 8081664, "step": 12310 }, { "epoch": 7.2612028301886795, "grad_norm": 3.246032476425171, "learning_rate": 8.036454070470276e-06, "loss": 0.4857, "num_input_tokens_seen": 8086176, "step": 12315 }, { "epoch": 7.264150943396227, "grad_norm": 1.3874900341033936, "learning_rate": 8.03440969967426e-06, "loss": 0.4689, "num_input_tokens_seen": 8088928, "step": 12320 }, { "epoch": 7.267099056603773, "grad_norm": 2.4815354347229004, "learning_rate": 8.032364525505198e-06, "loss": 0.4356, "num_input_tokens_seen": 8091584, "step": 12325 }, { "epoch": 7.2700471698113205, "grad_norm": 1.2948434352874756, "learning_rate": 8.030318548504561e-06, "loss": 0.4236, "num_input_tokens_seen": 8094848, "step": 12330 }, { "epoch": 7.272995283018868, "grad_norm": 1.6751853227615356, "learning_rate": 8.028271769214026e-06, "loss": 0.5136, "num_input_tokens_seen": 8097696, "step": 12335 }, { "epoch": 7.275943396226415, "grad_norm": 2.511627674102783, "learning_rate": 8.02622418817549e-06, "loss": 0.3823, "num_input_tokens_seen": 8100608, "step": 12340 }, { "epoch": 7.278891509433962, "grad_norm": 1.6146007776260376, "learning_rate": 8.024175805931056e-06, "loss": 0.3379, "num_input_tokens_seen": 8103200, "step": 12345 }, { "epoch": 7.28183962264151, "grad_norm": 2.543010950088501, "learning_rate": 8.022126623023045e-06, "loss": 0.4445, "num_input_tokens_seen": 8105792, "step": 12350 }, { "epoch": 7.284787735849057, "grad_norm": 1.7947094440460205, "learning_rate": 8.020076639993987e-06, "loss": 0.3191, "num_input_tokens_seen": 8109088, "step": 12355 }, { "epoch": 7.287735849056604, "grad_norm": 2.204556703567505, "learning_rate": 8.018025857386622e-06, "loss": 0.3615, "num_input_tokens_seen": 8112192, "step": 12360 }, { "epoch": 7.290683962264151, "grad_norm": 1.4949525594711304, "learning_rate": 8.015974275743905e-06, "loss": 0.5029, "num_input_tokens_seen": 8114784, "step": 12365 }, { "epoch": 7.293632075471698, "grad_norm": 3.4516353607177734, "learning_rate": 8.013921895609e-06, "loss": 0.672, "num_input_tokens_seen": 8117376, "step": 12370 }, { "epoch": 7.296580188679245, "grad_norm": 1.9573097229003906, "learning_rate": 8.011868717525283e-06, "loss": 0.397, "num_input_tokens_seen": 8120096, "step": 12375 }, { "epoch": 7.2995283018867925, "grad_norm": 2.2794346809387207, "learning_rate": 8.009814742036343e-06, "loss": 0.3446, "num_input_tokens_seen": 8123808, "step": 12380 }, { "epoch": 7.30247641509434, "grad_norm": 2.2470545768737793, "learning_rate": 8.007759969685979e-06, "loss": 0.41, "num_input_tokens_seen": 8126656, "step": 12385 }, { "epoch": 7.305424528301887, "grad_norm": 1.3391565084457397, "learning_rate": 8.005704401018199e-06, "loss": 0.4363, "num_input_tokens_seen": 8129440, "step": 12390 }, { "epoch": 7.308372641509434, "grad_norm": 2.653414487838745, "learning_rate": 8.003648036577226e-06, "loss": 0.298, "num_input_tokens_seen": 8132288, "step": 12395 }, { "epoch": 7.311320754716981, "grad_norm": 1.4034342765808105, "learning_rate": 8.00159087690749e-06, "loss": 0.3706, "num_input_tokens_seen": 8135232, "step": 12400 }, { "epoch": 7.314268867924528, "grad_norm": 4.030760288238525, "learning_rate": 7.999532922553635e-06, "loss": 0.4204, "num_input_tokens_seen": 8137824, "step": 12405 }, { "epoch": 7.317216981132075, "grad_norm": 2.426483154296875, "learning_rate": 7.997474174060508e-06, "loss": 0.3879, "num_input_tokens_seen": 8141120, "step": 12410 }, { "epoch": 7.320165094339623, "grad_norm": 2.849923849105835, "learning_rate": 7.995414631973179e-06, "loss": 0.3316, "num_input_tokens_seen": 8143616, "step": 12415 }, { "epoch": 7.32311320754717, "grad_norm": 2.1003317832946777, "learning_rate": 7.993354296836914e-06, "loss": 0.4876, "num_input_tokens_seen": 8147904, "step": 12420 }, { "epoch": 7.326061320754717, "grad_norm": 1.7448924779891968, "learning_rate": 7.991293169197198e-06, "loss": 0.3775, "num_input_tokens_seen": 8150816, "step": 12425 }, { "epoch": 7.3290094339622645, "grad_norm": 3.2596328258514404, "learning_rate": 7.989231249599725e-06, "loss": 0.3758, "num_input_tokens_seen": 8153728, "step": 12430 }, { "epoch": 7.331957547169811, "grad_norm": 2.0757553577423096, "learning_rate": 7.987168538590395e-06, "loss": 0.4087, "num_input_tokens_seen": 8156672, "step": 12435 }, { "epoch": 7.334905660377358, "grad_norm": 1.5588963031768799, "learning_rate": 7.985105036715322e-06, "loss": 0.3946, "num_input_tokens_seen": 8159232, "step": 12440 }, { "epoch": 7.337853773584905, "grad_norm": 2.1332321166992188, "learning_rate": 7.983040744520823e-06, "loss": 0.4909, "num_input_tokens_seen": 8162368, "step": 12445 }, { "epoch": 7.340801886792453, "grad_norm": 1.8798694610595703, "learning_rate": 7.980975662553432e-06, "loss": 0.3809, "num_input_tokens_seen": 8164608, "step": 12450 }, { "epoch": 7.34375, "grad_norm": 1.7106914520263672, "learning_rate": 7.978909791359888e-06, "loss": 0.4456, "num_input_tokens_seen": 8168000, "step": 12455 }, { "epoch": 7.346698113207547, "grad_norm": 1.8090864419937134, "learning_rate": 7.976843131487136e-06, "loss": 0.513, "num_input_tokens_seen": 8170656, "step": 12460 }, { "epoch": 7.349646226415095, "grad_norm": 1.3120800256729126, "learning_rate": 7.974775683482337e-06, "loss": 0.4691, "num_input_tokens_seen": 8175680, "step": 12465 }, { "epoch": 7.352594339622642, "grad_norm": 2.5053813457489014, "learning_rate": 7.972707447892855e-06, "loss": 0.5225, "num_input_tokens_seen": 8179136, "step": 12470 }, { "epoch": 7.355542452830189, "grad_norm": 1.4578431844711304, "learning_rate": 7.970638425266264e-06, "loss": 0.4666, "num_input_tokens_seen": 8181952, "step": 12475 }, { "epoch": 7.3584905660377355, "grad_norm": 1.9400789737701416, "learning_rate": 7.968568616150349e-06, "loss": 0.5254, "num_input_tokens_seen": 8185792, "step": 12480 }, { "epoch": 7.361438679245283, "grad_norm": 1.8106579780578613, "learning_rate": 7.966498021093096e-06, "loss": 0.3703, "num_input_tokens_seen": 8189312, "step": 12485 }, { "epoch": 7.36438679245283, "grad_norm": 1.505742073059082, "learning_rate": 7.96442664064271e-06, "loss": 0.37, "num_input_tokens_seen": 8192224, "step": 12490 }, { "epoch": 7.367334905660377, "grad_norm": 2.61379075050354, "learning_rate": 7.962354475347593e-06, "loss": 0.4714, "num_input_tokens_seen": 8194656, "step": 12495 }, { "epoch": 7.370283018867925, "grad_norm": 1.7493418455123901, "learning_rate": 7.960281525756364e-06, "loss": 0.5202, "num_input_tokens_seen": 8198240, "step": 12500 }, { "epoch": 7.373231132075472, "grad_norm": 1.8808696269989014, "learning_rate": 7.95820779241784e-06, "loss": 0.4227, "num_input_tokens_seen": 8201568, "step": 12505 }, { "epoch": 7.376179245283019, "grad_norm": 2.5154802799224854, "learning_rate": 7.956133275881055e-06, "loss": 0.4209, "num_input_tokens_seen": 8204480, "step": 12510 }, { "epoch": 7.379127358490566, "grad_norm": 1.3652325868606567, "learning_rate": 7.954057976695244e-06, "loss": 0.3898, "num_input_tokens_seen": 8207968, "step": 12515 }, { "epoch": 7.382075471698113, "grad_norm": 4.11403226852417, "learning_rate": 7.951981895409854e-06, "loss": 0.3516, "num_input_tokens_seen": 8210432, "step": 12520 }, { "epoch": 7.38502358490566, "grad_norm": 2.625023603439331, "learning_rate": 7.949905032574534e-06, "loss": 0.4433, "num_input_tokens_seen": 8213152, "step": 12525 }, { "epoch": 7.3879716981132075, "grad_norm": 1.3780241012573242, "learning_rate": 7.947827388739145e-06, "loss": 0.2944, "num_input_tokens_seen": 8216544, "step": 12530 }, { "epoch": 7.390919811320755, "grad_norm": 1.7670255899429321, "learning_rate": 7.945748964453747e-06, "loss": 0.3984, "num_input_tokens_seen": 8220320, "step": 12535 }, { "epoch": 7.393867924528302, "grad_norm": 2.123077630996704, "learning_rate": 7.943669760268618e-06, "loss": 0.5388, "num_input_tokens_seen": 8223232, "step": 12540 }, { "epoch": 7.396816037735849, "grad_norm": 3.5414204597473145, "learning_rate": 7.941589776734232e-06, "loss": 0.4054, "num_input_tokens_seen": 8226016, "step": 12545 }, { "epoch": 7.399764150943396, "grad_norm": 1.7777336835861206, "learning_rate": 7.939509014401277e-06, "loss": 0.5444, "num_input_tokens_seen": 8229600, "step": 12550 }, { "epoch": 7.402712264150943, "grad_norm": 2.4427311420440674, "learning_rate": 7.93742747382064e-06, "loss": 0.4027, "num_input_tokens_seen": 8232352, "step": 12555 }, { "epoch": 7.40566037735849, "grad_norm": 1.1192175149917603, "learning_rate": 7.93534515554342e-06, "loss": 0.3794, "num_input_tokens_seen": 8235360, "step": 12560 }, { "epoch": 7.408608490566038, "grad_norm": 2.3893048763275146, "learning_rate": 7.933262060120918e-06, "loss": 0.3033, "num_input_tokens_seen": 8240000, "step": 12565 }, { "epoch": 7.411556603773585, "grad_norm": 1.6233843564987183, "learning_rate": 7.931178188104646e-06, "loss": 0.4626, "num_input_tokens_seen": 8244160, "step": 12570 }, { "epoch": 7.414504716981132, "grad_norm": 1.8031573295593262, "learning_rate": 7.929093540046317e-06, "loss": 0.3791, "num_input_tokens_seen": 8247840, "step": 12575 }, { "epoch": 7.4174528301886795, "grad_norm": 5.013012886047363, "learning_rate": 7.927008116497848e-06, "loss": 0.4474, "num_input_tokens_seen": 8250816, "step": 12580 }, { "epoch": 7.420400943396227, "grad_norm": 1.956166386604309, "learning_rate": 7.924921918011366e-06, "loss": 0.431, "num_input_tokens_seen": 8254944, "step": 12585 }, { "epoch": 7.423349056603773, "grad_norm": 2.3169119358062744, "learning_rate": 7.9228349451392e-06, "loss": 0.3232, "num_input_tokens_seen": 8257888, "step": 12590 }, { "epoch": 7.4262971698113205, "grad_norm": 1.7224292755126953, "learning_rate": 7.920747198433884e-06, "loss": 0.4167, "num_input_tokens_seen": 8260896, "step": 12595 }, { "epoch": 7.429245283018868, "grad_norm": 1.5338996648788452, "learning_rate": 7.91865867844816e-06, "loss": 0.4393, "num_input_tokens_seen": 8264928, "step": 12600 }, { "epoch": 7.432193396226415, "grad_norm": 1.2865477800369263, "learning_rate": 7.916569385734976e-06, "loss": 0.4334, "num_input_tokens_seen": 8267520, "step": 12605 }, { "epoch": 7.435141509433962, "grad_norm": 1.8008134365081787, "learning_rate": 7.914479320847474e-06, "loss": 0.372, "num_input_tokens_seen": 8271392, "step": 12610 }, { "epoch": 7.43808962264151, "grad_norm": 1.5977699756622314, "learning_rate": 7.912388484339012e-06, "loss": 0.4565, "num_input_tokens_seen": 8274752, "step": 12615 }, { "epoch": 7.441037735849057, "grad_norm": 1.9693833589553833, "learning_rate": 7.910296876763147e-06, "loss": 0.4916, "num_input_tokens_seen": 8277184, "step": 12620 }, { "epoch": 7.443985849056604, "grad_norm": 2.148984432220459, "learning_rate": 7.90820449867364e-06, "loss": 0.4166, "num_input_tokens_seen": 8281344, "step": 12625 }, { "epoch": 7.446933962264151, "grad_norm": 1.6694334745407104, "learning_rate": 7.90611135062446e-06, "loss": 0.3896, "num_input_tokens_seen": 8284928, "step": 12630 }, { "epoch": 7.449882075471698, "grad_norm": 1.2679615020751953, "learning_rate": 7.904017433169775e-06, "loss": 0.3832, "num_input_tokens_seen": 8288352, "step": 12635 }, { "epoch": 7.452830188679245, "grad_norm": 2.2382752895355225, "learning_rate": 7.901922746863957e-06, "loss": 0.4248, "num_input_tokens_seen": 8292736, "step": 12640 }, { "epoch": 7.4557783018867925, "grad_norm": 3.032177448272705, "learning_rate": 7.899827292261589e-06, "loss": 0.3948, "num_input_tokens_seen": 8296352, "step": 12645 }, { "epoch": 7.45872641509434, "grad_norm": 2.432804584503174, "learning_rate": 7.897731069917444e-06, "loss": 0.3977, "num_input_tokens_seen": 8298560, "step": 12650 }, { "epoch": 7.461674528301887, "grad_norm": 2.779297351837158, "learning_rate": 7.895634080386512e-06, "loss": 0.4153, "num_input_tokens_seen": 8301600, "step": 12655 }, { "epoch": 7.464622641509434, "grad_norm": 1.6676620244979858, "learning_rate": 7.893536324223977e-06, "loss": 0.4506, "num_input_tokens_seen": 8305280, "step": 12660 }, { "epoch": 7.467570754716981, "grad_norm": 1.4888038635253906, "learning_rate": 7.89143780198523e-06, "loss": 0.4499, "num_input_tokens_seen": 8308768, "step": 12665 }, { "epoch": 7.470518867924528, "grad_norm": 2.160839796066284, "learning_rate": 7.889338514225862e-06, "loss": 0.375, "num_input_tokens_seen": 8311488, "step": 12670 }, { "epoch": 7.473466981132075, "grad_norm": 1.0705455541610718, "learning_rate": 7.887238461501671e-06, "loss": 0.3354, "num_input_tokens_seen": 8314880, "step": 12675 }, { "epoch": 7.476415094339623, "grad_norm": 1.5552070140838623, "learning_rate": 7.885137644368654e-06, "loss": 0.4843, "num_input_tokens_seen": 8318752, "step": 12680 }, { "epoch": 7.47936320754717, "grad_norm": 2.2615702152252197, "learning_rate": 7.883036063383012e-06, "loss": 0.277, "num_input_tokens_seen": 8321760, "step": 12685 }, { "epoch": 7.482311320754717, "grad_norm": 1.7472974061965942, "learning_rate": 7.880933719101148e-06, "loss": 0.4145, "num_input_tokens_seen": 8325344, "step": 12690 }, { "epoch": 7.4852594339622645, "grad_norm": 1.1354089975357056, "learning_rate": 7.878830612079664e-06, "loss": 0.5482, "num_input_tokens_seen": 8328608, "step": 12695 }, { "epoch": 7.488207547169811, "grad_norm": 1.729274034500122, "learning_rate": 7.876726742875369e-06, "loss": 0.436, "num_input_tokens_seen": 8331904, "step": 12700 }, { "epoch": 7.491155660377358, "grad_norm": 8.14317512512207, "learning_rate": 7.874622112045269e-06, "loss": 0.4384, "num_input_tokens_seen": 8335072, "step": 12705 }, { "epoch": 7.494103773584905, "grad_norm": 1.4179096221923828, "learning_rate": 7.872516720146578e-06, "loss": 0.4691, "num_input_tokens_seen": 8339552, "step": 12710 }, { "epoch": 7.497051886792453, "grad_norm": 1.9743411540985107, "learning_rate": 7.870410567736705e-06, "loss": 0.434, "num_input_tokens_seen": 8342592, "step": 12715 }, { "epoch": 7.5, "grad_norm": 1.812266230583191, "learning_rate": 7.868303655373264e-06, "loss": 0.4613, "num_input_tokens_seen": 8345152, "step": 12720 }, { "epoch": 7.502948113207547, "grad_norm": 1.3845603466033936, "learning_rate": 7.866195983614066e-06, "loss": 0.4391, "num_input_tokens_seen": 8348576, "step": 12725 }, { "epoch": 7.505896226415095, "grad_norm": 2.9390366077423096, "learning_rate": 7.864087553017133e-06, "loss": 0.3972, "num_input_tokens_seen": 8351360, "step": 12730 }, { "epoch": 7.508844339622642, "grad_norm": 3.8568084239959717, "learning_rate": 7.861978364140674e-06, "loss": 0.4739, "num_input_tokens_seen": 8354496, "step": 12735 }, { "epoch": 7.511792452830189, "grad_norm": 1.7081845998764038, "learning_rate": 7.859868417543109e-06, "loss": 0.3164, "num_input_tokens_seen": 8357152, "step": 12740 }, { "epoch": 7.5147405660377355, "grad_norm": 2.58125376701355, "learning_rate": 7.857757713783055e-06, "loss": 0.3743, "num_input_tokens_seen": 8361184, "step": 12745 }, { "epoch": 7.517688679245283, "grad_norm": 1.1928293704986572, "learning_rate": 7.855646253419331e-06, "loss": 0.4263, "num_input_tokens_seen": 8364928, "step": 12750 }, { "epoch": 7.52063679245283, "grad_norm": 3.4893486499786377, "learning_rate": 7.853534037010952e-06, "loss": 0.5103, "num_input_tokens_seen": 8367744, "step": 12755 }, { "epoch": 7.523584905660377, "grad_norm": 1.5877420902252197, "learning_rate": 7.851421065117142e-06, "loss": 0.3771, "num_input_tokens_seen": 8371104, "step": 12760 }, { "epoch": 7.526533018867925, "grad_norm": 2.675773859024048, "learning_rate": 7.849307338297314e-06, "loss": 0.4052, "num_input_tokens_seen": 8373888, "step": 12765 }, { "epoch": 7.529481132075472, "grad_norm": 1.877766489982605, "learning_rate": 7.847192857111087e-06, "loss": 0.4242, "num_input_tokens_seen": 8377408, "step": 12770 }, { "epoch": 7.532429245283019, "grad_norm": 4.838123798370361, "learning_rate": 7.845077622118282e-06, "loss": 0.3714, "num_input_tokens_seen": 8379904, "step": 12775 }, { "epoch": 7.535377358490566, "grad_norm": 1.5701254606246948, "learning_rate": 7.842961633878916e-06, "loss": 0.4092, "num_input_tokens_seen": 8383296, "step": 12780 }, { "epoch": 7.538325471698113, "grad_norm": 1.0992152690887451, "learning_rate": 7.840844892953204e-06, "loss": 0.3705, "num_input_tokens_seen": 8387328, "step": 12785 }, { "epoch": 7.54127358490566, "grad_norm": 2.6309165954589844, "learning_rate": 7.838727399901562e-06, "loss": 0.3052, "num_input_tokens_seen": 8391168, "step": 12790 }, { "epoch": 7.5442216981132075, "grad_norm": 2.350400447845459, "learning_rate": 7.836609155284607e-06, "loss": 0.3918, "num_input_tokens_seen": 8394080, "step": 12795 }, { "epoch": 7.547169811320755, "grad_norm": 2.073744535446167, "learning_rate": 7.834490159663154e-06, "loss": 0.5156, "num_input_tokens_seen": 8397920, "step": 12800 }, { "epoch": 7.550117924528302, "grad_norm": 2.1424314975738525, "learning_rate": 7.832370413598215e-06, "loss": 0.5711, "num_input_tokens_seen": 8401120, "step": 12805 }, { "epoch": 7.553066037735849, "grad_norm": 1.510869026184082, "learning_rate": 7.830249917651003e-06, "loss": 0.4277, "num_input_tokens_seen": 8404384, "step": 12810 }, { "epoch": 7.556014150943396, "grad_norm": 2.3509483337402344, "learning_rate": 7.828128672382926e-06, "loss": 0.5811, "num_input_tokens_seen": 8407552, "step": 12815 }, { "epoch": 7.558962264150943, "grad_norm": 3.9559905529022217, "learning_rate": 7.826006678355596e-06, "loss": 0.4526, "num_input_tokens_seen": 8410208, "step": 12820 }, { "epoch": 7.56191037735849, "grad_norm": 1.9079508781433105, "learning_rate": 7.823883936130817e-06, "loss": 0.3453, "num_input_tokens_seen": 8413664, "step": 12825 }, { "epoch": 7.564858490566038, "grad_norm": 1.4982151985168457, "learning_rate": 7.821760446270597e-06, "loss": 0.4166, "num_input_tokens_seen": 8417024, "step": 12830 }, { "epoch": 7.567806603773585, "grad_norm": 1.5626529455184937, "learning_rate": 7.819636209337136e-06, "loss": 0.4115, "num_input_tokens_seen": 8419360, "step": 12835 }, { "epoch": 7.570754716981132, "grad_norm": 1.7151862382888794, "learning_rate": 7.817511225892838e-06, "loss": 0.5011, "num_input_tokens_seen": 8422592, "step": 12840 }, { "epoch": 7.5737028301886795, "grad_norm": 2.1275227069854736, "learning_rate": 7.8153854965003e-06, "loss": 0.3291, "num_input_tokens_seen": 8425408, "step": 12845 }, { "epoch": 7.576650943396227, "grad_norm": 1.8867838382720947, "learning_rate": 7.813259021722319e-06, "loss": 0.3703, "num_input_tokens_seen": 8428352, "step": 12850 }, { "epoch": 7.579599056603773, "grad_norm": 1.6957852840423584, "learning_rate": 7.811131802121885e-06, "loss": 0.5485, "num_input_tokens_seen": 8431232, "step": 12855 }, { "epoch": 7.5825471698113205, "grad_norm": 1.1976268291473389, "learning_rate": 7.809003838262193e-06, "loss": 0.533, "num_input_tokens_seen": 8435040, "step": 12860 }, { "epoch": 7.585495283018868, "grad_norm": 1.7459875345230103, "learning_rate": 7.806875130706628e-06, "loss": 0.4078, "num_input_tokens_seen": 8439840, "step": 12865 }, { "epoch": 7.588443396226415, "grad_norm": 2.4968643188476562, "learning_rate": 7.804745680018775e-06, "loss": 0.3747, "num_input_tokens_seen": 8444256, "step": 12870 }, { "epoch": 7.591391509433962, "grad_norm": 2.5367767810821533, "learning_rate": 7.802615486762418e-06, "loss": 0.4815, "num_input_tokens_seen": 8448224, "step": 12875 }, { "epoch": 7.59433962264151, "grad_norm": 1.7669814825057983, "learning_rate": 7.800484551501528e-06, "loss": 0.4237, "num_input_tokens_seen": 8450816, "step": 12880 }, { "epoch": 7.597287735849057, "grad_norm": 2.0231943130493164, "learning_rate": 7.798352874800285e-06, "loss": 0.4758, "num_input_tokens_seen": 8453600, "step": 12885 }, { "epoch": 7.600235849056604, "grad_norm": 2.8383538722991943, "learning_rate": 7.79622045722306e-06, "loss": 0.4763, "num_input_tokens_seen": 8457120, "step": 12890 }, { "epoch": 7.603183962264151, "grad_norm": 1.9634640216827393, "learning_rate": 7.794087299334416e-06, "loss": 0.4203, "num_input_tokens_seen": 8460000, "step": 12895 }, { "epoch": 7.606132075471698, "grad_norm": 1.8947242498397827, "learning_rate": 7.79195340169912e-06, "loss": 0.4128, "num_input_tokens_seen": 8463392, "step": 12900 }, { "epoch": 7.609080188679245, "grad_norm": 1.8896082639694214, "learning_rate": 7.789818764882127e-06, "loss": 0.5599, "num_input_tokens_seen": 8465728, "step": 12905 }, { "epoch": 7.6120283018867925, "grad_norm": 2.3915791511535645, "learning_rate": 7.78768338944859e-06, "loss": 0.5339, "num_input_tokens_seen": 8467808, "step": 12910 }, { "epoch": 7.61497641509434, "grad_norm": 1.755210518836975, "learning_rate": 7.785547275963865e-06, "loss": 0.3599, "num_input_tokens_seen": 8471008, "step": 12915 }, { "epoch": 7.617924528301887, "grad_norm": 4.405529022216797, "learning_rate": 7.783410424993492e-06, "loss": 0.4343, "num_input_tokens_seen": 8475392, "step": 12920 }, { "epoch": 7.620872641509434, "grad_norm": 1.0493425130844116, "learning_rate": 7.781272837103213e-06, "loss": 0.2904, "num_input_tokens_seen": 8478176, "step": 12925 }, { "epoch": 7.623820754716981, "grad_norm": 1.9952170848846436, "learning_rate": 7.779134512858964e-06, "loss": 0.4234, "num_input_tokens_seen": 8481696, "step": 12930 }, { "epoch": 7.626768867924528, "grad_norm": 1.8787864446640015, "learning_rate": 7.776995452826876e-06, "loss": 0.3797, "num_input_tokens_seen": 8485152, "step": 12935 }, { "epoch": 7.629716981132075, "grad_norm": 2.1021244525909424, "learning_rate": 7.774855657573274e-06, "loss": 0.503, "num_input_tokens_seen": 8488768, "step": 12940 }, { "epoch": 7.632665094339623, "grad_norm": 1.909528374671936, "learning_rate": 7.772715127664676e-06, "loss": 0.3767, "num_input_tokens_seen": 8492160, "step": 12945 }, { "epoch": 7.63561320754717, "grad_norm": 2.075157880783081, "learning_rate": 7.7705738636678e-06, "loss": 0.5139, "num_input_tokens_seen": 8494816, "step": 12950 }, { "epoch": 7.638561320754717, "grad_norm": 1.5140079259872437, "learning_rate": 7.768431866149552e-06, "loss": 0.533, "num_input_tokens_seen": 8497856, "step": 12955 }, { "epoch": 7.6415094339622645, "grad_norm": 1.3973344564437866, "learning_rate": 7.766289135677035e-06, "loss": 0.3371, "num_input_tokens_seen": 8501216, "step": 12960 }, { "epoch": 7.644457547169811, "grad_norm": 2.8034121990203857, "learning_rate": 7.764145672817549e-06, "loss": 0.4034, "num_input_tokens_seen": 8504352, "step": 12965 }, { "epoch": 7.647405660377358, "grad_norm": 2.0743870735168457, "learning_rate": 7.762001478138583e-06, "loss": 0.4851, "num_input_tokens_seen": 8507424, "step": 12970 }, { "epoch": 7.650353773584905, "grad_norm": 1.5804443359375, "learning_rate": 7.759856552207822e-06, "loss": 0.4767, "num_input_tokens_seen": 8510208, "step": 12975 }, { "epoch": 7.653301886792453, "grad_norm": 1.722705602645874, "learning_rate": 7.757710895593144e-06, "loss": 0.4877, "num_input_tokens_seen": 8513280, "step": 12980 }, { "epoch": 7.65625, "grad_norm": 2.085301637649536, "learning_rate": 7.755564508862623e-06, "loss": 0.5114, "num_input_tokens_seen": 8516672, "step": 12985 }, { "epoch": 7.659198113207547, "grad_norm": 1.3383662700653076, "learning_rate": 7.753417392584522e-06, "loss": 0.4411, "num_input_tokens_seen": 8519328, "step": 12990 }, { "epoch": 7.662146226415095, "grad_norm": 1.590824842453003, "learning_rate": 7.751269547327298e-06, "loss": 0.4245, "num_input_tokens_seen": 8522752, "step": 12995 }, { "epoch": 7.665094339622642, "grad_norm": 1.4524153470993042, "learning_rate": 7.749120973659606e-06, "loss": 0.3133, "num_input_tokens_seen": 8525856, "step": 13000 }, { "epoch": 7.668042452830189, "grad_norm": 1.7499901056289673, "learning_rate": 7.746971672150286e-06, "loss": 0.4701, "num_input_tokens_seen": 8530080, "step": 13005 }, { "epoch": 7.6709905660377355, "grad_norm": 3.645381212234497, "learning_rate": 7.74482164336838e-06, "loss": 0.3816, "num_input_tokens_seen": 8534880, "step": 13010 }, { "epoch": 7.673938679245283, "grad_norm": 1.951810359954834, "learning_rate": 7.742670887883111e-06, "loss": 0.3731, "num_input_tokens_seen": 8538400, "step": 13015 }, { "epoch": 7.67688679245283, "grad_norm": 1.7525858879089355, "learning_rate": 7.740519406263905e-06, "loss": 0.504, "num_input_tokens_seen": 8542176, "step": 13020 }, { "epoch": 7.679834905660377, "grad_norm": 2.45721697807312, "learning_rate": 7.738367199080376e-06, "loss": 0.5589, "num_input_tokens_seen": 8545120, "step": 13025 }, { "epoch": 7.682783018867925, "grad_norm": 2.1724302768707275, "learning_rate": 7.73621426690233e-06, "loss": 0.6369, "num_input_tokens_seen": 8549216, "step": 13030 }, { "epoch": 7.685731132075472, "grad_norm": 1.9341850280761719, "learning_rate": 7.734060610299764e-06, "loss": 0.4582, "num_input_tokens_seen": 8552672, "step": 13035 }, { "epoch": 7.688679245283019, "grad_norm": 2.087179660797119, "learning_rate": 7.731906229842869e-06, "loss": 0.3623, "num_input_tokens_seen": 8556064, "step": 13040 }, { "epoch": 7.691627358490566, "grad_norm": 1.5776234865188599, "learning_rate": 7.729751126102023e-06, "loss": 0.4023, "num_input_tokens_seen": 8558656, "step": 13045 }, { "epoch": 7.694575471698113, "grad_norm": 2.7568676471710205, "learning_rate": 7.727595299647805e-06, "loss": 0.597, "num_input_tokens_seen": 8561632, "step": 13050 }, { "epoch": 7.69752358490566, "grad_norm": 3.9526712894439697, "learning_rate": 7.725438751050973e-06, "loss": 0.4721, "num_input_tokens_seen": 8564384, "step": 13055 }, { "epoch": 7.7004716981132075, "grad_norm": 2.497513771057129, "learning_rate": 7.723281480882489e-06, "loss": 0.3975, "num_input_tokens_seen": 8567936, "step": 13060 }, { "epoch": 7.703419811320755, "grad_norm": 1.6688073873519897, "learning_rate": 7.721123489713494e-06, "loss": 0.6146, "num_input_tokens_seen": 8570560, "step": 13065 }, { "epoch": 7.706367924528302, "grad_norm": 1.9114148616790771, "learning_rate": 7.718964778115328e-06, "loss": 0.4916, "num_input_tokens_seen": 8573504, "step": 13070 }, { "epoch": 7.709316037735849, "grad_norm": 1.8979603052139282, "learning_rate": 7.716805346659519e-06, "loss": 0.4649, "num_input_tokens_seen": 8577568, "step": 13075 }, { "epoch": 7.712264150943396, "grad_norm": 2.6062676906585693, "learning_rate": 7.714645195917788e-06, "loss": 0.4666, "num_input_tokens_seen": 8582528, "step": 13080 }, { "epoch": 7.715212264150943, "grad_norm": 2.0960702896118164, "learning_rate": 7.712484326462038e-06, "loss": 0.5069, "num_input_tokens_seen": 8585344, "step": 13085 }, { "epoch": 7.71816037735849, "grad_norm": 1.5436135530471802, "learning_rate": 7.710322738864375e-06, "loss": 0.4303, "num_input_tokens_seen": 8588256, "step": 13090 }, { "epoch": 7.721108490566038, "grad_norm": 1.5239251852035522, "learning_rate": 7.708160433697085e-06, "loss": 0.4241, "num_input_tokens_seen": 8591904, "step": 13095 }, { "epoch": 7.724056603773585, "grad_norm": 1.7148292064666748, "learning_rate": 7.705997411532649e-06, "loss": 0.4013, "num_input_tokens_seen": 8595936, "step": 13100 }, { "epoch": 7.727004716981132, "grad_norm": 1.8077300786972046, "learning_rate": 7.703833672943735e-06, "loss": 0.4072, "num_input_tokens_seen": 8599424, "step": 13105 }, { "epoch": 7.7299528301886795, "grad_norm": 1.813728928565979, "learning_rate": 7.701669218503206e-06, "loss": 0.5108, "num_input_tokens_seen": 8601888, "step": 13110 }, { "epoch": 7.732900943396227, "grad_norm": 1.3793092966079712, "learning_rate": 7.699504048784106e-06, "loss": 0.3671, "num_input_tokens_seen": 8605088, "step": 13115 }, { "epoch": 7.735849056603773, "grad_norm": 1.4083250761032104, "learning_rate": 7.697338164359675e-06, "loss": 0.3959, "num_input_tokens_seen": 8608288, "step": 13120 }, { "epoch": 7.7387971698113205, "grad_norm": 1.3327134847640991, "learning_rate": 7.69517156580334e-06, "loss": 0.3675, "num_input_tokens_seen": 8611712, "step": 13125 }, { "epoch": 7.741745283018868, "grad_norm": 1.794937014579773, "learning_rate": 7.693004253688716e-06, "loss": 0.4509, "num_input_tokens_seen": 8616352, "step": 13130 }, { "epoch": 7.744693396226415, "grad_norm": 1.8602782487869263, "learning_rate": 7.690836228589613e-06, "loss": 0.505, "num_input_tokens_seen": 8619200, "step": 13135 }, { "epoch": 7.747641509433962, "grad_norm": 3.2823522090911865, "learning_rate": 7.688667491080019e-06, "loss": 0.309, "num_input_tokens_seen": 8622080, "step": 13140 }, { "epoch": 7.75058962264151, "grad_norm": 2.315758466720581, "learning_rate": 7.686498041734121e-06, "loss": 0.3728, "num_input_tokens_seen": 8625472, "step": 13145 }, { "epoch": 7.753537735849057, "grad_norm": 3.491800546646118, "learning_rate": 7.684327881126285e-06, "loss": 0.3833, "num_input_tokens_seen": 8628864, "step": 13150 }, { "epoch": 7.756485849056604, "grad_norm": 2.121569871902466, "learning_rate": 7.682157009831078e-06, "loss": 0.4125, "num_input_tokens_seen": 8631808, "step": 13155 }, { "epoch": 7.759433962264151, "grad_norm": 3.5057291984558105, "learning_rate": 7.67998542842324e-06, "loss": 0.4349, "num_input_tokens_seen": 8635392, "step": 13160 }, { "epoch": 7.762382075471698, "grad_norm": 2.414351463317871, "learning_rate": 7.677813137477711e-06, "loss": 0.3779, "num_input_tokens_seen": 8638112, "step": 13165 }, { "epoch": 7.765330188679245, "grad_norm": 4.055723667144775, "learning_rate": 7.675640137569614e-06, "loss": 0.3829, "num_input_tokens_seen": 8641504, "step": 13170 }, { "epoch": 7.7682783018867925, "grad_norm": 1.2950494289398193, "learning_rate": 7.673466429274257e-06, "loss": 0.3546, "num_input_tokens_seen": 8644512, "step": 13175 }, { "epoch": 7.77122641509434, "grad_norm": 2.2266550064086914, "learning_rate": 7.671292013167143e-06, "loss": 0.382, "num_input_tokens_seen": 8647552, "step": 13180 }, { "epoch": 7.774174528301887, "grad_norm": 1.492285966873169, "learning_rate": 7.669116889823955e-06, "loss": 0.5073, "num_input_tokens_seen": 8650912, "step": 13185 }, { "epoch": 7.777122641509434, "grad_norm": 1.7889227867126465, "learning_rate": 7.666941059820567e-06, "loss": 0.4158, "num_input_tokens_seen": 8653920, "step": 13190 }, { "epoch": 7.780070754716981, "grad_norm": 1.8523943424224854, "learning_rate": 7.66476452373304e-06, "loss": 0.5003, "num_input_tokens_seen": 8656800, "step": 13195 }, { "epoch": 7.783018867924528, "grad_norm": 1.6288057565689087, "learning_rate": 7.66258728213762e-06, "loss": 0.4283, "num_input_tokens_seen": 8659968, "step": 13200 }, { "epoch": 7.785966981132075, "grad_norm": 3.099612236022949, "learning_rate": 7.66040933561074e-06, "loss": 0.4278, "num_input_tokens_seen": 8663616, "step": 13205 }, { "epoch": 7.788915094339623, "grad_norm": 2.7693920135498047, "learning_rate": 7.658230684729027e-06, "loss": 0.5065, "num_input_tokens_seen": 8666528, "step": 13210 }, { "epoch": 7.79186320754717, "grad_norm": 2.055595874786377, "learning_rate": 7.656051330069282e-06, "loss": 0.3445, "num_input_tokens_seen": 8670080, "step": 13215 }, { "epoch": 7.794811320754717, "grad_norm": 1.2807207107543945, "learning_rate": 7.6538712722085e-06, "loss": 0.4314, "num_input_tokens_seen": 8673600, "step": 13220 }, { "epoch": 7.7977594339622645, "grad_norm": 1.8916566371917725, "learning_rate": 7.651690511723862e-06, "loss": 0.5602, "num_input_tokens_seen": 8678912, "step": 13225 }, { "epoch": 7.800707547169811, "grad_norm": 6.02538537979126, "learning_rate": 7.64950904919273e-06, "loss": 0.4748, "num_input_tokens_seen": 8682048, "step": 13230 }, { "epoch": 7.803655660377358, "grad_norm": 2.1439404487609863, "learning_rate": 7.647326885192662e-06, "loss": 0.4447, "num_input_tokens_seen": 8685184, "step": 13235 }, { "epoch": 7.806603773584905, "grad_norm": 2.481997013092041, "learning_rate": 7.645144020301392e-06, "loss": 0.4904, "num_input_tokens_seen": 8687488, "step": 13240 }, { "epoch": 7.809551886792453, "grad_norm": 2.1024422645568848, "learning_rate": 7.64296045509684e-06, "loss": 0.4658, "num_input_tokens_seen": 8690432, "step": 13245 }, { "epoch": 7.8125, "grad_norm": 2.8097803592681885, "learning_rate": 7.64077619015712e-06, "loss": 0.468, "num_input_tokens_seen": 8694912, "step": 13250 }, { "epoch": 7.815448113207547, "grad_norm": 1.6926392316818237, "learning_rate": 7.638591226060519e-06, "loss": 0.3749, "num_input_tokens_seen": 8697984, "step": 13255 }, { "epoch": 7.818396226415095, "grad_norm": 1.8357315063476562, "learning_rate": 7.636405563385522e-06, "loss": 0.5052, "num_input_tokens_seen": 8701344, "step": 13260 }, { "epoch": 7.821344339622642, "grad_norm": 2.2154574394226074, "learning_rate": 7.634219202710789e-06, "loss": 0.5782, "num_input_tokens_seen": 8704576, "step": 13265 }, { "epoch": 7.824292452830189, "grad_norm": 1.5475186109542847, "learning_rate": 7.632032144615168e-06, "loss": 0.4299, "num_input_tokens_seen": 8708704, "step": 13270 }, { "epoch": 7.8272405660377355, "grad_norm": 1.7287894487380981, "learning_rate": 7.629844389677695e-06, "loss": 0.3595, "num_input_tokens_seen": 8711648, "step": 13275 }, { "epoch": 7.830188679245283, "grad_norm": 2.462097644805908, "learning_rate": 7.627655938477586e-06, "loss": 0.4799, "num_input_tokens_seen": 8715136, "step": 13280 }, { "epoch": 7.83313679245283, "grad_norm": 2.2011733055114746, "learning_rate": 7.6254667915942415e-06, "loss": 0.4524, "num_input_tokens_seen": 8718624, "step": 13285 }, { "epoch": 7.836084905660377, "grad_norm": 3.451885938644409, "learning_rate": 7.62327694960725e-06, "loss": 0.5063, "num_input_tokens_seen": 8722752, "step": 13290 }, { "epoch": 7.839033018867925, "grad_norm": 1.2270232439041138, "learning_rate": 7.621086413096379e-06, "loss": 0.4283, "num_input_tokens_seen": 8725504, "step": 13295 }, { "epoch": 7.841981132075472, "grad_norm": 1.8514940738677979, "learning_rate": 7.618895182641584e-06, "loss": 0.4519, "num_input_tokens_seen": 8732608, "step": 13300 }, { "epoch": 7.844929245283019, "grad_norm": 2.9876649379730225, "learning_rate": 7.6167032588230035e-06, "loss": 0.4515, "num_input_tokens_seen": 8735744, "step": 13305 }, { "epoch": 7.847877358490566, "grad_norm": 1.3692538738250732, "learning_rate": 7.614510642220958e-06, "loss": 0.2673, "num_input_tokens_seen": 8738240, "step": 13310 }, { "epoch": 7.850825471698113, "grad_norm": 2.2379934787750244, "learning_rate": 7.612317333415951e-06, "loss": 0.3825, "num_input_tokens_seen": 8741568, "step": 13315 }, { "epoch": 7.85377358490566, "grad_norm": 1.9861050844192505, "learning_rate": 7.610123332988673e-06, "loss": 0.4136, "num_input_tokens_seen": 8744672, "step": 13320 }, { "epoch": 7.8567216981132075, "grad_norm": 2.211603879928589, "learning_rate": 7.607928641519992e-06, "loss": 0.4491, "num_input_tokens_seen": 8747744, "step": 13325 }, { "epoch": 7.859669811320755, "grad_norm": 1.1706457138061523, "learning_rate": 7.605733259590964e-06, "loss": 0.5119, "num_input_tokens_seen": 8752032, "step": 13330 }, { "epoch": 7.862617924528302, "grad_norm": 1.3234573602676392, "learning_rate": 7.603537187782826e-06, "loss": 0.3423, "num_input_tokens_seen": 8755264, "step": 13335 }, { "epoch": 7.865566037735849, "grad_norm": 1.5851619243621826, "learning_rate": 7.601340426676996e-06, "loss": 0.5258, "num_input_tokens_seen": 8758304, "step": 13340 }, { "epoch": 7.868514150943396, "grad_norm": 2.4597973823547363, "learning_rate": 7.599142976855077e-06, "loss": 0.3284, "num_input_tokens_seen": 8760896, "step": 13345 }, { "epoch": 7.871462264150943, "grad_norm": 1.9056761264801025, "learning_rate": 7.596944838898854e-06, "loss": 0.5228, "num_input_tokens_seen": 8763904, "step": 13350 }, { "epoch": 7.87441037735849, "grad_norm": 2.1288857460021973, "learning_rate": 7.594746013390293e-06, "loss": 0.5389, "num_input_tokens_seen": 8766880, "step": 13355 }, { "epoch": 7.877358490566038, "grad_norm": 1.9192731380462646, "learning_rate": 7.59254650091154e-06, "loss": 0.3676, "num_input_tokens_seen": 8769504, "step": 13360 }, { "epoch": 7.880306603773585, "grad_norm": 1.9397765398025513, "learning_rate": 7.59034630204493e-06, "loss": 0.3527, "num_input_tokens_seen": 8772384, "step": 13365 }, { "epoch": 7.883254716981132, "grad_norm": 1.5335980653762817, "learning_rate": 7.588145417372972e-06, "loss": 0.4724, "num_input_tokens_seen": 8776704, "step": 13370 }, { "epoch": 7.8862028301886795, "grad_norm": 2.70646071434021, "learning_rate": 7.585943847478361e-06, "loss": 0.4081, "num_input_tokens_seen": 8779296, "step": 13375 }, { "epoch": 7.889150943396227, "grad_norm": 2.332643985748291, "learning_rate": 7.583741592943971e-06, "loss": 0.4425, "num_input_tokens_seen": 8782720, "step": 13380 }, { "epoch": 7.892099056603773, "grad_norm": 2.1041085720062256, "learning_rate": 7.581538654352859e-06, "loss": 0.3508, "num_input_tokens_seen": 8785184, "step": 13385 }, { "epoch": 7.8950471698113205, "grad_norm": 1.8241114616394043, "learning_rate": 7.579335032288262e-06, "loss": 0.4622, "num_input_tokens_seen": 8788832, "step": 13390 }, { "epoch": 7.897995283018868, "grad_norm": 1.1013692617416382, "learning_rate": 7.577130727333598e-06, "loss": 0.4424, "num_input_tokens_seen": 8792384, "step": 13395 }, { "epoch": 7.900943396226415, "grad_norm": 1.6441538333892822, "learning_rate": 7.5749257400724695e-06, "loss": 0.4464, "num_input_tokens_seen": 8796032, "step": 13400 }, { "epoch": 7.903891509433962, "grad_norm": 2.898393154144287, "learning_rate": 7.572720071088653e-06, "loss": 0.4436, "num_input_tokens_seen": 8799232, "step": 13405 }, { "epoch": 7.90683962264151, "grad_norm": 2.275266647338867, "learning_rate": 7.570513720966108e-06, "loss": 0.4872, "num_input_tokens_seen": 8803104, "step": 13410 }, { "epoch": 7.909787735849057, "grad_norm": 1.7455189228057861, "learning_rate": 7.56830669028898e-06, "loss": 0.3747, "num_input_tokens_seen": 8806304, "step": 13415 }, { "epoch": 7.912735849056604, "grad_norm": 4.596806526184082, "learning_rate": 7.566098979641588e-06, "loss": 0.4457, "num_input_tokens_seen": 8810272, "step": 13420 }, { "epoch": 7.915683962264151, "grad_norm": 2.7914445400238037, "learning_rate": 7.563890589608427e-06, "loss": 0.4029, "num_input_tokens_seen": 8813312, "step": 13425 }, { "epoch": 7.918632075471698, "grad_norm": 1.9035816192626953, "learning_rate": 7.561681520774187e-06, "loss": 0.3643, "num_input_tokens_seen": 8816896, "step": 13430 }, { "epoch": 7.921580188679245, "grad_norm": 1.4413453340530396, "learning_rate": 7.559471773723721e-06, "loss": 0.3367, "num_input_tokens_seen": 8820288, "step": 13435 }, { "epoch": 7.9245283018867925, "grad_norm": 1.912588119506836, "learning_rate": 7.557261349042073e-06, "loss": 0.4473, "num_input_tokens_seen": 8823008, "step": 13440 }, { "epoch": 7.92747641509434, "grad_norm": 2.0341808795928955, "learning_rate": 7.555050247314464e-06, "loss": 0.3981, "num_input_tokens_seen": 8826240, "step": 13445 }, { "epoch": 7.930424528301887, "grad_norm": 2.030747175216675, "learning_rate": 7.552838469126289e-06, "loss": 0.5882, "num_input_tokens_seen": 8829088, "step": 13450 }, { "epoch": 7.933372641509434, "grad_norm": 2.3446409702301025, "learning_rate": 7.550626015063125e-06, "loss": 0.3925, "num_input_tokens_seen": 8832224, "step": 13455 }, { "epoch": 7.936320754716981, "grad_norm": 1.67404043674469, "learning_rate": 7.548412885710734e-06, "loss": 0.621, "num_input_tokens_seen": 8835008, "step": 13460 }, { "epoch": 7.939268867924528, "grad_norm": 1.5737817287445068, "learning_rate": 7.546199081655048e-06, "loss": 0.402, "num_input_tokens_seen": 8838304, "step": 13465 }, { "epoch": 7.942216981132075, "grad_norm": 2.9213709831237793, "learning_rate": 7.54398460348218e-06, "loss": 0.4406, "num_input_tokens_seen": 8841216, "step": 13470 }, { "epoch": 7.945165094339623, "grad_norm": 1.5484328269958496, "learning_rate": 7.541769451778425e-06, "loss": 0.3803, "num_input_tokens_seen": 8843872, "step": 13475 }, { "epoch": 7.94811320754717, "grad_norm": 1.8332335948944092, "learning_rate": 7.5395536271302536e-06, "loss": 0.429, "num_input_tokens_seen": 8847040, "step": 13480 }, { "epoch": 7.951061320754717, "grad_norm": 1.9052687883377075, "learning_rate": 7.5373371301243136e-06, "loss": 0.4364, "num_input_tokens_seen": 8851008, "step": 13485 }, { "epoch": 7.9540094339622645, "grad_norm": 1.3721044063568115, "learning_rate": 7.535119961347433e-06, "loss": 0.4082, "num_input_tokens_seen": 8854496, "step": 13490 }, { "epoch": 7.956957547169811, "grad_norm": 1.7673497200012207, "learning_rate": 7.532902121386618e-06, "loss": 0.5978, "num_input_tokens_seen": 8857696, "step": 13495 }, { "epoch": 7.959905660377358, "grad_norm": 2.4823834896087646, "learning_rate": 7.530683610829051e-06, "loss": 0.3937, "num_input_tokens_seen": 8860512, "step": 13500 }, { "epoch": 7.962853773584905, "grad_norm": 1.5627117156982422, "learning_rate": 7.5284644302620906e-06, "loss": 0.3531, "num_input_tokens_seen": 8863744, "step": 13505 }, { "epoch": 7.965801886792453, "grad_norm": 3.0845258235931396, "learning_rate": 7.526244580273274e-06, "loss": 0.4477, "num_input_tokens_seen": 8866816, "step": 13510 }, { "epoch": 7.96875, "grad_norm": 3.554990768432617, "learning_rate": 7.524024061450318e-06, "loss": 0.3751, "num_input_tokens_seen": 8869408, "step": 13515 }, { "epoch": 7.971698113207547, "grad_norm": 1.3999227285385132, "learning_rate": 7.521802874381115e-06, "loss": 0.5236, "num_input_tokens_seen": 8873184, "step": 13520 }, { "epoch": 7.974646226415095, "grad_norm": 2.5408339500427246, "learning_rate": 7.519581019653731e-06, "loss": 0.3771, "num_input_tokens_seen": 8877632, "step": 13525 }, { "epoch": 7.977594339622642, "grad_norm": 3.5411417484283447, "learning_rate": 7.517358497856413e-06, "loss": 0.3735, "num_input_tokens_seen": 8880256, "step": 13530 }, { "epoch": 7.980542452830189, "grad_norm": 3.3937604427337646, "learning_rate": 7.515135309577584e-06, "loss": 0.3933, "num_input_tokens_seen": 8883168, "step": 13535 }, { "epoch": 7.9834905660377355, "grad_norm": 2.5807878971099854, "learning_rate": 7.5129114554058425e-06, "loss": 0.5053, "num_input_tokens_seen": 8889728, "step": 13540 }, { "epoch": 7.986438679245283, "grad_norm": 6.585300922393799, "learning_rate": 7.510686935929963e-06, "loss": 0.4227, "num_input_tokens_seen": 8892960, "step": 13545 }, { "epoch": 7.98938679245283, "grad_norm": 1.6548582315444946, "learning_rate": 7.5084617517388965e-06, "loss": 0.3716, "num_input_tokens_seen": 8895936, "step": 13550 }, { "epoch": 7.992334905660377, "grad_norm": 9.416098594665527, "learning_rate": 7.506235903421771e-06, "loss": 0.3257, "num_input_tokens_seen": 8898912, "step": 13555 }, { "epoch": 7.995283018867925, "grad_norm": 2.5470471382141113, "learning_rate": 7.504009391567889e-06, "loss": 0.2888, "num_input_tokens_seen": 8902624, "step": 13560 }, { "epoch": 7.998231132075472, "grad_norm": 1.782319188117981, "learning_rate": 7.501782216766729e-06, "loss": 0.3851, "num_input_tokens_seen": 8906144, "step": 13565 }, { "epoch": 8.0, "eval_loss": 0.5101737380027771, "eval_runtime": 18.7802, "eval_samples_per_second": 90.308, "eval_steps_per_second": 22.577, "num_input_tokens_seen": 8907352, "step": 13568 }, { "epoch": 8.00117924528302, "grad_norm": 1.9183645248413086, "learning_rate": 7.499554379607944e-06, "loss": 0.3641, "num_input_tokens_seen": 8908312, "step": 13570 }, { "epoch": 8.004127358490566, "grad_norm": 3.714686155319214, "learning_rate": 7.497325880681365e-06, "loss": 0.4757, "num_input_tokens_seen": 8911288, "step": 13575 }, { "epoch": 8.007075471698114, "grad_norm": 3.0114855766296387, "learning_rate": 7.495096720576994e-06, "loss": 0.3812, "num_input_tokens_seen": 8913688, "step": 13580 }, { "epoch": 8.01002358490566, "grad_norm": 1.1283586025238037, "learning_rate": 7.492866899885017e-06, "loss": 0.4689, "num_input_tokens_seen": 8917432, "step": 13585 }, { "epoch": 8.012971698113208, "grad_norm": 3.646439552307129, "learning_rate": 7.490636419195782e-06, "loss": 0.4854, "num_input_tokens_seen": 8919928, "step": 13590 }, { "epoch": 8.015919811320755, "grad_norm": 3.7566728591918945, "learning_rate": 7.488405279099821e-06, "loss": 0.4241, "num_input_tokens_seen": 8923512, "step": 13595 }, { "epoch": 8.018867924528301, "grad_norm": 2.7828922271728516, "learning_rate": 7.48617348018784e-06, "loss": 0.3389, "num_input_tokens_seen": 8926840, "step": 13600 }, { "epoch": 8.02181603773585, "grad_norm": 2.0354220867156982, "learning_rate": 7.4839410230507134e-06, "loss": 0.2752, "num_input_tokens_seen": 8929880, "step": 13605 }, { "epoch": 8.024764150943396, "grad_norm": 2.9697203636169434, "learning_rate": 7.481707908279496e-06, "loss": 0.305, "num_input_tokens_seen": 8933528, "step": 13610 }, { "epoch": 8.027712264150944, "grad_norm": 3.2175703048706055, "learning_rate": 7.4794741364654144e-06, "loss": 0.5117, "num_input_tokens_seen": 8937208, "step": 13615 }, { "epoch": 8.03066037735849, "grad_norm": 2.0966639518737793, "learning_rate": 7.477239708199871e-06, "loss": 0.545, "num_input_tokens_seen": 8942040, "step": 13620 }, { "epoch": 8.033608490566039, "grad_norm": 1.7126930952072144, "learning_rate": 7.475004624074434e-06, "loss": 0.4063, "num_input_tokens_seen": 8945176, "step": 13625 }, { "epoch": 8.036556603773585, "grad_norm": 1.99705970287323, "learning_rate": 7.4727688846808595e-06, "loss": 0.4873, "num_input_tokens_seen": 8948376, "step": 13630 }, { "epoch": 8.039504716981131, "grad_norm": 1.8098605871200562, "learning_rate": 7.4705324906110654e-06, "loss": 0.4406, "num_input_tokens_seen": 8951448, "step": 13635 }, { "epoch": 8.04245283018868, "grad_norm": 2.013715982437134, "learning_rate": 7.4682954424571466e-06, "loss": 0.3436, "num_input_tokens_seen": 8953976, "step": 13640 }, { "epoch": 8.045400943396226, "grad_norm": 1.7205569744110107, "learning_rate": 7.466057740811372e-06, "loss": 0.323, "num_input_tokens_seen": 8957208, "step": 13645 }, { "epoch": 8.048349056603774, "grad_norm": 3.6039533615112305, "learning_rate": 7.463819386266182e-06, "loss": 0.3232, "num_input_tokens_seen": 8959800, "step": 13650 }, { "epoch": 8.05129716981132, "grad_norm": 1.5504131317138672, "learning_rate": 7.461580379414191e-06, "loss": 0.3721, "num_input_tokens_seen": 8963096, "step": 13655 }, { "epoch": 8.054245283018869, "grad_norm": 1.7509262561798096, "learning_rate": 7.459340720848187e-06, "loss": 0.3809, "num_input_tokens_seen": 8966360, "step": 13660 }, { "epoch": 8.057193396226415, "grad_norm": 2.419797420501709, "learning_rate": 7.457100411161128e-06, "loss": 0.3604, "num_input_tokens_seen": 8968920, "step": 13665 }, { "epoch": 8.060141509433961, "grad_norm": 1.9243464469909668, "learning_rate": 7.454859450946144e-06, "loss": 0.5121, "num_input_tokens_seen": 8973304, "step": 13670 }, { "epoch": 8.06308962264151, "grad_norm": 6.73280668258667, "learning_rate": 7.4526178407965396e-06, "loss": 0.3017, "num_input_tokens_seen": 8976664, "step": 13675 }, { "epoch": 8.066037735849056, "grad_norm": 1.8158038854599, "learning_rate": 7.450375581305794e-06, "loss": 0.3472, "num_input_tokens_seen": 8979480, "step": 13680 }, { "epoch": 8.068985849056604, "grad_norm": 1.850595474243164, "learning_rate": 7.448132673067552e-06, "loss": 0.2856, "num_input_tokens_seen": 8982648, "step": 13685 }, { "epoch": 8.07193396226415, "grad_norm": 2.8924882411956787, "learning_rate": 7.445889116675634e-06, "loss": 0.4042, "num_input_tokens_seen": 8985560, "step": 13690 }, { "epoch": 8.074882075471699, "grad_norm": 2.2530126571655273, "learning_rate": 7.443644912724031e-06, "loss": 0.3738, "num_input_tokens_seen": 8988632, "step": 13695 }, { "epoch": 8.077830188679245, "grad_norm": 2.4828577041625977, "learning_rate": 7.441400061806907e-06, "loss": 0.3739, "num_input_tokens_seen": 8991864, "step": 13700 }, { "epoch": 8.080778301886792, "grad_norm": 2.0393240451812744, "learning_rate": 7.439154564518592e-06, "loss": 0.4201, "num_input_tokens_seen": 8994520, "step": 13705 }, { "epoch": 8.08372641509434, "grad_norm": 1.7927478551864624, "learning_rate": 7.436908421453597e-06, "loss": 0.4304, "num_input_tokens_seen": 8998712, "step": 13710 }, { "epoch": 8.086674528301886, "grad_norm": 1.9271326065063477, "learning_rate": 7.434661633206593e-06, "loss": 0.4202, "num_input_tokens_seen": 9001816, "step": 13715 }, { "epoch": 8.089622641509434, "grad_norm": 2.5872721672058105, "learning_rate": 7.4324142003724286e-06, "loss": 0.4122, "num_input_tokens_seen": 9006456, "step": 13720 }, { "epoch": 8.09257075471698, "grad_norm": 1.8936609029769897, "learning_rate": 7.430166123546122e-06, "loss": 0.339, "num_input_tokens_seen": 9008920, "step": 13725 }, { "epoch": 8.095518867924529, "grad_norm": 1.7021554708480835, "learning_rate": 7.427917403322862e-06, "loss": 0.271, "num_input_tokens_seen": 9012536, "step": 13730 }, { "epoch": 8.098466981132075, "grad_norm": 2.6044857501983643, "learning_rate": 7.425668040298003e-06, "loss": 0.3686, "num_input_tokens_seen": 9015736, "step": 13735 }, { "epoch": 8.101415094339623, "grad_norm": 2.7241199016571045, "learning_rate": 7.4234180350670785e-06, "loss": 0.3991, "num_input_tokens_seen": 9018456, "step": 13740 }, { "epoch": 8.10436320754717, "grad_norm": 1.6619213819503784, "learning_rate": 7.421167388225785e-06, "loss": 0.4103, "num_input_tokens_seen": 9021976, "step": 13745 }, { "epoch": 8.107311320754716, "grad_norm": 1.4864208698272705, "learning_rate": 7.41891610036999e-06, "loss": 0.4183, "num_input_tokens_seen": 9025176, "step": 13750 }, { "epoch": 8.110259433962264, "grad_norm": 2.2019479274749756, "learning_rate": 7.416664172095732e-06, "loss": 0.4132, "num_input_tokens_seen": 9027800, "step": 13755 }, { "epoch": 8.11320754716981, "grad_norm": 2.8141753673553467, "learning_rate": 7.414411603999221e-06, "loss": 0.4154, "num_input_tokens_seen": 9031032, "step": 13760 }, { "epoch": 8.116155660377359, "grad_norm": 3.352543592453003, "learning_rate": 7.4121583966768295e-06, "loss": 0.3422, "num_input_tokens_seen": 9034072, "step": 13765 }, { "epoch": 8.119103773584905, "grad_norm": 5.323648929595947, "learning_rate": 7.409904550725109e-06, "loss": 0.4516, "num_input_tokens_seen": 9036888, "step": 13770 }, { "epoch": 8.122051886792454, "grad_norm": 2.1443774700164795, "learning_rate": 7.407650066740771e-06, "loss": 0.4885, "num_input_tokens_seen": 9040888, "step": 13775 }, { "epoch": 8.125, "grad_norm": 3.883267402648926, "learning_rate": 7.405394945320702e-06, "loss": 0.4404, "num_input_tokens_seen": 9043640, "step": 13780 }, { "epoch": 8.127948113207546, "grad_norm": 3.5693283081054688, "learning_rate": 7.403139187061955e-06, "loss": 0.5365, "num_input_tokens_seen": 9047064, "step": 13785 }, { "epoch": 8.130896226415095, "grad_norm": 1.6338123083114624, "learning_rate": 7.400882792561752e-06, "loss": 0.3485, "num_input_tokens_seen": 9049944, "step": 13790 }, { "epoch": 8.133844339622641, "grad_norm": 1.8451004028320312, "learning_rate": 7.39862576241748e-06, "loss": 0.4015, "num_input_tokens_seen": 9054072, "step": 13795 }, { "epoch": 8.13679245283019, "grad_norm": 1.6697138547897339, "learning_rate": 7.396368097226703e-06, "loss": 0.3411, "num_input_tokens_seen": 9057592, "step": 13800 }, { "epoch": 8.139740566037736, "grad_norm": 1.8880037069320679, "learning_rate": 7.394109797587144e-06, "loss": 0.3801, "num_input_tokens_seen": 9060536, "step": 13805 }, { "epoch": 8.142688679245284, "grad_norm": 1.8495157957077026, "learning_rate": 7.3918508640966956e-06, "loss": 0.3251, "num_input_tokens_seen": 9063832, "step": 13810 }, { "epoch": 8.14563679245283, "grad_norm": 3.4600613117218018, "learning_rate": 7.389591297353424e-06, "loss": 0.4327, "num_input_tokens_seen": 9067928, "step": 13815 }, { "epoch": 8.148584905660377, "grad_norm": 2.420914888381958, "learning_rate": 7.3873310979555565e-06, "loss": 0.4833, "num_input_tokens_seen": 9071416, "step": 13820 }, { "epoch": 8.151533018867925, "grad_norm": 8.64047622680664, "learning_rate": 7.385070266501495e-06, "loss": 0.3134, "num_input_tokens_seen": 9073752, "step": 13825 }, { "epoch": 8.154481132075471, "grad_norm": 2.386353015899658, "learning_rate": 7.382808803589798e-06, "loss": 0.421, "num_input_tokens_seen": 9076696, "step": 13830 }, { "epoch": 8.15742924528302, "grad_norm": 1.8833140134811401, "learning_rate": 7.380546709819204e-06, "loss": 0.4381, "num_input_tokens_seen": 9080536, "step": 13835 }, { "epoch": 8.160377358490566, "grad_norm": 3.0205001831054688, "learning_rate": 7.378283985788608e-06, "loss": 0.4836, "num_input_tokens_seen": 9083736, "step": 13840 }, { "epoch": 8.163325471698114, "grad_norm": 2.229940414428711, "learning_rate": 7.376020632097076e-06, "loss": 0.4353, "num_input_tokens_seen": 9087288, "step": 13845 }, { "epoch": 8.16627358490566, "grad_norm": 1.8270822763442993, "learning_rate": 7.373756649343841e-06, "loss": 0.3833, "num_input_tokens_seen": 9089944, "step": 13850 }, { "epoch": 8.169221698113208, "grad_norm": 1.628374695777893, "learning_rate": 7.371492038128305e-06, "loss": 0.3543, "num_input_tokens_seen": 9096088, "step": 13855 }, { "epoch": 8.172169811320755, "grad_norm": 2.646756887435913, "learning_rate": 7.36922679905003e-06, "loss": 0.399, "num_input_tokens_seen": 9099576, "step": 13860 }, { "epoch": 8.175117924528301, "grad_norm": 3.208387851715088, "learning_rate": 7.366960932708749e-06, "loss": 0.3486, "num_input_tokens_seen": 9102328, "step": 13865 }, { "epoch": 8.17806603773585, "grad_norm": 1.896582007408142, "learning_rate": 7.364694439704361e-06, "loss": 0.3766, "num_input_tokens_seen": 9105944, "step": 13870 }, { "epoch": 8.181014150943396, "grad_norm": 2.355004072189331, "learning_rate": 7.3624273206369264e-06, "loss": 0.3797, "num_input_tokens_seen": 9108664, "step": 13875 }, { "epoch": 8.183962264150944, "grad_norm": 3.2104640007019043, "learning_rate": 7.360159576106681e-06, "loss": 0.3883, "num_input_tokens_seen": 9111800, "step": 13880 }, { "epoch": 8.18691037735849, "grad_norm": 2.3619582653045654, "learning_rate": 7.357891206714014e-06, "loss": 0.4671, "num_input_tokens_seen": 9114552, "step": 13885 }, { "epoch": 8.189858490566039, "grad_norm": 3.248217821121216, "learning_rate": 7.355622213059487e-06, "loss": 0.4083, "num_input_tokens_seen": 9117336, "step": 13890 }, { "epoch": 8.192806603773585, "grad_norm": 3.201423406600952, "learning_rate": 7.353352595743829e-06, "loss": 0.3992, "num_input_tokens_seen": 9120152, "step": 13895 }, { "epoch": 8.195754716981131, "grad_norm": 4.227177619934082, "learning_rate": 7.351082355367928e-06, "loss": 0.3917, "num_input_tokens_seen": 9124024, "step": 13900 }, { "epoch": 8.19870283018868, "grad_norm": 1.0032356977462769, "learning_rate": 7.34881149253284e-06, "loss": 0.3644, "num_input_tokens_seen": 9127256, "step": 13905 }, { "epoch": 8.201650943396226, "grad_norm": 4.509525775909424, "learning_rate": 7.346540007839787e-06, "loss": 0.3827, "num_input_tokens_seen": 9130328, "step": 13910 }, { "epoch": 8.204599056603774, "grad_norm": 3.7355566024780273, "learning_rate": 7.344267901890154e-06, "loss": 0.374, "num_input_tokens_seen": 9133816, "step": 13915 }, { "epoch": 8.20754716981132, "grad_norm": 2.401834487915039, "learning_rate": 7.341995175285491e-06, "loss": 0.4059, "num_input_tokens_seen": 9136472, "step": 13920 }, { "epoch": 8.210495283018869, "grad_norm": 1.6789335012435913, "learning_rate": 7.339721828627512e-06, "loss": 0.4239, "num_input_tokens_seen": 9140376, "step": 13925 }, { "epoch": 8.213443396226415, "grad_norm": 2.021962881088257, "learning_rate": 7.337447862518096e-06, "loss": 0.4282, "num_input_tokens_seen": 9143128, "step": 13930 }, { "epoch": 8.216391509433961, "grad_norm": 4.479005813598633, "learning_rate": 7.335173277559282e-06, "loss": 0.6042, "num_input_tokens_seen": 9145624, "step": 13935 }, { "epoch": 8.21933962264151, "grad_norm": 1.7594196796417236, "learning_rate": 7.332898074353281e-06, "loss": 0.4, "num_input_tokens_seen": 9148696, "step": 13940 }, { "epoch": 8.222287735849056, "grad_norm": 3.2007787227630615, "learning_rate": 7.330622253502461e-06, "loss": 0.3963, "num_input_tokens_seen": 9151352, "step": 13945 }, { "epoch": 8.225235849056604, "grad_norm": 2.0016283988952637, "learning_rate": 7.3283458156093534e-06, "loss": 0.3614, "num_input_tokens_seen": 9154552, "step": 13950 }, { "epoch": 8.22818396226415, "grad_norm": 1.6878390312194824, "learning_rate": 7.326068761276657e-06, "loss": 0.3908, "num_input_tokens_seen": 9157688, "step": 13955 }, { "epoch": 8.231132075471699, "grad_norm": 6.612765312194824, "learning_rate": 7.323791091107231e-06, "loss": 0.4526, "num_input_tokens_seen": 9160248, "step": 13960 }, { "epoch": 8.234080188679245, "grad_norm": 3.7263081073760986, "learning_rate": 7.3215128057040986e-06, "loss": 0.4496, "num_input_tokens_seen": 9163800, "step": 13965 }, { "epoch": 8.237028301886792, "grad_norm": 1.5383602380752563, "learning_rate": 7.319233905670447e-06, "loss": 0.6624, "num_input_tokens_seen": 9167960, "step": 13970 }, { "epoch": 8.23997641509434, "grad_norm": 1.420710802078247, "learning_rate": 7.316954391609622e-06, "loss": 0.359, "num_input_tokens_seen": 9170136, "step": 13975 }, { "epoch": 8.242924528301886, "grad_norm": 3.0058977603912354, "learning_rate": 7.314674264125137e-06, "loss": 0.3783, "num_input_tokens_seen": 9173208, "step": 13980 }, { "epoch": 8.245872641509434, "grad_norm": 4.042976379394531, "learning_rate": 7.312393523820665e-06, "loss": 0.5107, "num_input_tokens_seen": 9176920, "step": 13985 }, { "epoch": 8.24882075471698, "grad_norm": 5.1771111488342285, "learning_rate": 7.310112171300041e-06, "loss": 0.5076, "num_input_tokens_seen": 9179960, "step": 13990 }, { "epoch": 8.251768867924529, "grad_norm": 2.843798875808716, "learning_rate": 7.307830207167263e-06, "loss": 0.4446, "num_input_tokens_seen": 9183192, "step": 13995 }, { "epoch": 8.254716981132075, "grad_norm": 3.256155014038086, "learning_rate": 7.305547632026493e-06, "loss": 0.3861, "num_input_tokens_seen": 9186104, "step": 14000 }, { "epoch": 8.257665094339623, "grad_norm": 2.127089262008667, "learning_rate": 7.3032644464820515e-06, "loss": 0.3477, "num_input_tokens_seen": 9189368, "step": 14005 }, { "epoch": 8.26061320754717, "grad_norm": 3.182023525238037, "learning_rate": 7.30098065113842e-06, "loss": 0.3758, "num_input_tokens_seen": 9191608, "step": 14010 }, { "epoch": 8.263561320754716, "grad_norm": 1.6164308786392212, "learning_rate": 7.298696246600244e-06, "loss": 0.5137, "num_input_tokens_seen": 9198872, "step": 14015 }, { "epoch": 8.266509433962264, "grad_norm": 1.818859577178955, "learning_rate": 7.2964112334723315e-06, "loss": 0.4141, "num_input_tokens_seen": 9201688, "step": 14020 }, { "epoch": 8.26945754716981, "grad_norm": 1.687383770942688, "learning_rate": 7.294125612359647e-06, "loss": 0.422, "num_input_tokens_seen": 9204536, "step": 14025 }, { "epoch": 8.272405660377359, "grad_norm": 1.486741065979004, "learning_rate": 7.291839383867318e-06, "loss": 0.3877, "num_input_tokens_seen": 9208632, "step": 14030 }, { "epoch": 8.275353773584905, "grad_norm": 2.843839168548584, "learning_rate": 7.289552548600638e-06, "loss": 0.3534, "num_input_tokens_seen": 9211352, "step": 14035 }, { "epoch": 8.278301886792454, "grad_norm": 2.0993847846984863, "learning_rate": 7.287265107165052e-06, "loss": 0.3552, "num_input_tokens_seen": 9214104, "step": 14040 }, { "epoch": 8.28125, "grad_norm": 1.988822102546692, "learning_rate": 7.284977060166171e-06, "loss": 0.4031, "num_input_tokens_seen": 9217144, "step": 14045 }, { "epoch": 8.284198113207546, "grad_norm": 1.9342114925384521, "learning_rate": 7.282688408209766e-06, "loss": 0.3511, "num_input_tokens_seen": 9219800, "step": 14050 }, { "epoch": 8.287146226415095, "grad_norm": 1.7154837846755981, "learning_rate": 7.2803991519017655e-06, "loss": 0.4139, "num_input_tokens_seen": 9222936, "step": 14055 }, { "epoch": 8.290094339622641, "grad_norm": 2.066704034805298, "learning_rate": 7.2781092918482634e-06, "loss": 0.5029, "num_input_tokens_seen": 9225528, "step": 14060 }, { "epoch": 8.29304245283019, "grad_norm": 2.810373067855835, "learning_rate": 7.275818828655508e-06, "loss": 0.3954, "num_input_tokens_seen": 9228344, "step": 14065 }, { "epoch": 8.295990566037736, "grad_norm": 1.6190686225891113, "learning_rate": 7.27352776292991e-06, "loss": 0.349, "num_input_tokens_seen": 9231416, "step": 14070 }, { "epoch": 8.298938679245284, "grad_norm": 2.7559170722961426, "learning_rate": 7.271236095278036e-06, "loss": 0.3999, "num_input_tokens_seen": 9234488, "step": 14075 }, { "epoch": 8.30188679245283, "grad_norm": 1.5005055665969849, "learning_rate": 7.2689438263066195e-06, "loss": 0.3919, "num_input_tokens_seen": 9237464, "step": 14080 }, { "epoch": 8.304834905660377, "grad_norm": 1.5409646034240723, "learning_rate": 7.266650956622546e-06, "loss": 0.3688, "num_input_tokens_seen": 9240216, "step": 14085 }, { "epoch": 8.307783018867925, "grad_norm": 1.9525551795959473, "learning_rate": 7.2643574868328625e-06, "loss": 0.3621, "num_input_tokens_seen": 9243256, "step": 14090 }, { "epoch": 8.310731132075471, "grad_norm": 1.9255626201629639, "learning_rate": 7.262063417544776e-06, "loss": 0.323, "num_input_tokens_seen": 9246232, "step": 14095 }, { "epoch": 8.31367924528302, "grad_norm": 1.812003254890442, "learning_rate": 7.25976874936565e-06, "loss": 0.4684, "num_input_tokens_seen": 9250104, "step": 14100 }, { "epoch": 8.316627358490566, "grad_norm": 2.5070207118988037, "learning_rate": 7.257473482903009e-06, "loss": 0.4264, "num_input_tokens_seen": 9253912, "step": 14105 }, { "epoch": 8.319575471698114, "grad_norm": 3.055269479751587, "learning_rate": 7.255177618764534e-06, "loss": 0.4547, "num_input_tokens_seen": 9256472, "step": 14110 }, { "epoch": 8.32252358490566, "grad_norm": 2.532064437866211, "learning_rate": 7.252881157558065e-06, "loss": 0.4933, "num_input_tokens_seen": 9259928, "step": 14115 }, { "epoch": 8.325471698113208, "grad_norm": 1.266371726989746, "learning_rate": 7.250584099891602e-06, "loss": 0.5536, "num_input_tokens_seen": 9264280, "step": 14120 }, { "epoch": 8.328419811320755, "grad_norm": 1.5381954908370972, "learning_rate": 7.248286446373296e-06, "loss": 0.3845, "num_input_tokens_seen": 9267544, "step": 14125 }, { "epoch": 8.331367924528301, "grad_norm": 1.6281750202178955, "learning_rate": 7.245988197611466e-06, "loss": 0.3526, "num_input_tokens_seen": 9270456, "step": 14130 }, { "epoch": 8.33431603773585, "grad_norm": 2.317800760269165, "learning_rate": 7.2436893542145805e-06, "loss": 0.3883, "num_input_tokens_seen": 9274264, "step": 14135 }, { "epoch": 8.337264150943396, "grad_norm": 4.5440449714660645, "learning_rate": 7.241389916791269e-06, "loss": 0.4602, "num_input_tokens_seen": 9277816, "step": 14140 }, { "epoch": 8.340212264150944, "grad_norm": 3.0955100059509277, "learning_rate": 7.239089885950317e-06, "loss": 0.4982, "num_input_tokens_seen": 9281048, "step": 14145 }, { "epoch": 8.34316037735849, "grad_norm": 2.320317029953003, "learning_rate": 7.236789262300667e-06, "loss": 0.3805, "num_input_tokens_seen": 9284152, "step": 14150 }, { "epoch": 8.346108490566039, "grad_norm": 2.1793570518493652, "learning_rate": 7.23448804645142e-06, "loss": 0.3767, "num_input_tokens_seen": 9286904, "step": 14155 }, { "epoch": 8.349056603773585, "grad_norm": 3.073859930038452, "learning_rate": 7.232186239011834e-06, "loss": 0.3676, "num_input_tokens_seen": 9289880, "step": 14160 }, { "epoch": 8.352004716981131, "grad_norm": 3.018864393234253, "learning_rate": 7.2298838405913195e-06, "loss": 0.4934, "num_input_tokens_seen": 9293144, "step": 14165 }, { "epoch": 8.35495283018868, "grad_norm": 2.1429386138916016, "learning_rate": 7.227580851799448e-06, "loss": 0.4178, "num_input_tokens_seen": 9296568, "step": 14170 }, { "epoch": 8.357900943396226, "grad_norm": 1.6333343982696533, "learning_rate": 7.2252772732459455e-06, "loss": 0.3992, "num_input_tokens_seen": 9299928, "step": 14175 }, { "epoch": 8.360849056603774, "grad_norm": 3.3485267162323, "learning_rate": 7.222973105540696e-06, "loss": 0.4399, "num_input_tokens_seen": 9303096, "step": 14180 }, { "epoch": 8.36379716981132, "grad_norm": 1.5235471725463867, "learning_rate": 7.2206683492937345e-06, "loss": 0.3134, "num_input_tokens_seen": 9305976, "step": 14185 }, { "epoch": 8.366745283018869, "grad_norm": 1.854101538658142, "learning_rate": 7.218363005115259e-06, "loss": 0.3592, "num_input_tokens_seen": 9308632, "step": 14190 }, { "epoch": 8.369693396226415, "grad_norm": 1.61742103099823, "learning_rate": 7.216057073615617e-06, "loss": 0.3364, "num_input_tokens_seen": 9312024, "step": 14195 }, { "epoch": 8.372641509433961, "grad_norm": 2.034339427947998, "learning_rate": 7.21375055540531e-06, "loss": 0.3823, "num_input_tokens_seen": 9315000, "step": 14200 }, { "epoch": 8.37558962264151, "grad_norm": 1.9843195676803589, "learning_rate": 7.211443451095007e-06, "loss": 0.4549, "num_input_tokens_seen": 9318776, "step": 14205 }, { "epoch": 8.378537735849056, "grad_norm": 1.5132791996002197, "learning_rate": 7.2091357612955185e-06, "loss": 0.3872, "num_input_tokens_seen": 9321976, "step": 14210 }, { "epoch": 8.381485849056604, "grad_norm": 2.46427583694458, "learning_rate": 7.206827486617816e-06, "loss": 0.3122, "num_input_tokens_seen": 9324536, "step": 14215 }, { "epoch": 8.38443396226415, "grad_norm": 3.754460573196411, "learning_rate": 7.204518627673026e-06, "loss": 0.3986, "num_input_tokens_seen": 9327448, "step": 14220 }, { "epoch": 8.387382075471699, "grad_norm": 2.0958211421966553, "learning_rate": 7.202209185072428e-06, "loss": 0.3424, "num_input_tokens_seen": 9330520, "step": 14225 }, { "epoch": 8.390330188679245, "grad_norm": 2.355443000793457, "learning_rate": 7.199899159427457e-06, "loss": 0.3069, "num_input_tokens_seen": 9333688, "step": 14230 }, { "epoch": 8.393278301886792, "grad_norm": 1.4750559329986572, "learning_rate": 7.1975885513497035e-06, "loss": 0.3551, "num_input_tokens_seen": 9337848, "step": 14235 }, { "epoch": 8.39622641509434, "grad_norm": 1.678882122039795, "learning_rate": 7.195277361450909e-06, "loss": 0.4845, "num_input_tokens_seen": 9341208, "step": 14240 }, { "epoch": 8.399174528301886, "grad_norm": 2.593461036682129, "learning_rate": 7.192965590342973e-06, "loss": 0.5752, "num_input_tokens_seen": 9345368, "step": 14245 }, { "epoch": 8.402122641509434, "grad_norm": 3.573441743850708, "learning_rate": 7.190653238637945e-06, "loss": 0.5519, "num_input_tokens_seen": 9347704, "step": 14250 }, { "epoch": 8.40507075471698, "grad_norm": 2.4108004570007324, "learning_rate": 7.18834030694803e-06, "loss": 0.4269, "num_input_tokens_seen": 9350232, "step": 14255 }, { "epoch": 8.408018867924529, "grad_norm": 1.9307973384857178, "learning_rate": 7.186026795885589e-06, "loss": 0.3728, "num_input_tokens_seen": 9353528, "step": 14260 }, { "epoch": 8.410966981132075, "grad_norm": 4.593100547790527, "learning_rate": 7.183712706063133e-06, "loss": 0.4261, "num_input_tokens_seen": 9357112, "step": 14265 }, { "epoch": 8.413915094339623, "grad_norm": 1.772660255432129, "learning_rate": 7.1813980380933255e-06, "loss": 0.3863, "num_input_tokens_seen": 9361560, "step": 14270 }, { "epoch": 8.41686320754717, "grad_norm": 1.3630471229553223, "learning_rate": 7.179082792588986e-06, "loss": 0.4056, "num_input_tokens_seen": 9365016, "step": 14275 }, { "epoch": 8.419811320754716, "grad_norm": 2.4834189414978027, "learning_rate": 7.176766970163087e-06, "loss": 0.4964, "num_input_tokens_seen": 9367736, "step": 14280 }, { "epoch": 8.422759433962264, "grad_norm": 2.8840010166168213, "learning_rate": 7.17445057142875e-06, "loss": 0.342, "num_input_tokens_seen": 9370328, "step": 14285 }, { "epoch": 8.42570754716981, "grad_norm": 2.1319565773010254, "learning_rate": 7.172133596999253e-06, "loss": 0.3413, "num_input_tokens_seen": 9374232, "step": 14290 }, { "epoch": 8.428655660377359, "grad_norm": 2.4046783447265625, "learning_rate": 7.1698160474880255e-06, "loss": 0.4111, "num_input_tokens_seen": 9376408, "step": 14295 }, { "epoch": 8.431603773584905, "grad_norm": 2.9220221042633057, "learning_rate": 7.167497923508648e-06, "loss": 0.3769, "num_input_tokens_seen": 9379704, "step": 14300 }, { "epoch": 8.434551886792454, "grad_norm": 4.241151332855225, "learning_rate": 7.165179225674854e-06, "loss": 0.3984, "num_input_tokens_seen": 9382776, "step": 14305 }, { "epoch": 8.4375, "grad_norm": 1.5804568529129028, "learning_rate": 7.1628599546005276e-06, "loss": 0.4496, "num_input_tokens_seen": 9386008, "step": 14310 }, { "epoch": 8.440448113207546, "grad_norm": 2.1845202445983887, "learning_rate": 7.160540110899708e-06, "loss": 0.4068, "num_input_tokens_seen": 9389048, "step": 14315 }, { "epoch": 8.443396226415095, "grad_norm": 1.7323826551437378, "learning_rate": 7.158219695186582e-06, "loss": 0.4079, "num_input_tokens_seen": 9392504, "step": 14320 }, { "epoch": 8.446344339622641, "grad_norm": 12.078439712524414, "learning_rate": 7.1558987080754905e-06, "loss": 0.4036, "num_input_tokens_seen": 9395768, "step": 14325 }, { "epoch": 8.44929245283019, "grad_norm": 2.9786884784698486, "learning_rate": 7.1535771501809245e-06, "loss": 0.2976, "num_input_tokens_seen": 9398712, "step": 14330 }, { "epoch": 8.452240566037736, "grad_norm": 2.466611623764038, "learning_rate": 7.151255022117527e-06, "loss": 0.2582, "num_input_tokens_seen": 9402424, "step": 14335 }, { "epoch": 8.455188679245284, "grad_norm": 2.375934600830078, "learning_rate": 7.148932324500091e-06, "loss": 0.5292, "num_input_tokens_seen": 9405496, "step": 14340 }, { "epoch": 8.45813679245283, "grad_norm": 1.9712445735931396, "learning_rate": 7.146609057943559e-06, "loss": 0.5146, "num_input_tokens_seen": 9408536, "step": 14345 }, { "epoch": 8.461084905660377, "grad_norm": 1.9552159309387207, "learning_rate": 7.14428522306303e-06, "loss": 0.3517, "num_input_tokens_seen": 9411128, "step": 14350 }, { "epoch": 8.464033018867925, "grad_norm": 1.8944820165634155, "learning_rate": 7.141960820473745e-06, "loss": 0.4852, "num_input_tokens_seen": 9414904, "step": 14355 }, { "epoch": 8.466981132075471, "grad_norm": 2.448129653930664, "learning_rate": 7.139635850791102e-06, "loss": 0.373, "num_input_tokens_seen": 9417880, "step": 14360 }, { "epoch": 8.46992924528302, "grad_norm": 1.7965128421783447, "learning_rate": 7.137310314630647e-06, "loss": 0.3274, "num_input_tokens_seen": 9421816, "step": 14365 }, { "epoch": 8.472877358490566, "grad_norm": 2.529834032058716, "learning_rate": 7.134984212608074e-06, "loss": 0.382, "num_input_tokens_seen": 9424984, "step": 14370 }, { "epoch": 8.475825471698114, "grad_norm": 1.9582459926605225, "learning_rate": 7.13265754533923e-06, "loss": 0.3996, "num_input_tokens_seen": 9428632, "step": 14375 }, { "epoch": 8.47877358490566, "grad_norm": 2.201965093612671, "learning_rate": 7.130330313440109e-06, "loss": 0.3153, "num_input_tokens_seen": 9433080, "step": 14380 }, { "epoch": 8.481721698113208, "grad_norm": 3.0475077629089355, "learning_rate": 7.128002517526856e-06, "loss": 0.3988, "num_input_tokens_seen": 9435192, "step": 14385 }, { "epoch": 8.484669811320755, "grad_norm": 2.4395394325256348, "learning_rate": 7.1256741582157654e-06, "loss": 0.4611, "num_input_tokens_seen": 9438904, "step": 14390 }, { "epoch": 8.487617924528301, "grad_norm": 1.1438593864440918, "learning_rate": 7.123345236123282e-06, "loss": 0.3808, "num_input_tokens_seen": 9442968, "step": 14395 }, { "epoch": 8.49056603773585, "grad_norm": 2.3835511207580566, "learning_rate": 7.121015751865994e-06, "loss": 0.4245, "num_input_tokens_seen": 9446264, "step": 14400 }, { "epoch": 8.493514150943396, "grad_norm": 2.4600462913513184, "learning_rate": 7.118685706060645e-06, "loss": 0.43, "num_input_tokens_seen": 9448760, "step": 14405 }, { "epoch": 8.496462264150944, "grad_norm": 1.4631307125091553, "learning_rate": 7.116355099324126e-06, "loss": 0.4304, "num_input_tokens_seen": 9451576, "step": 14410 }, { "epoch": 8.49941037735849, "grad_norm": 1.0778608322143555, "learning_rate": 7.114023932273471e-06, "loss": 0.3691, "num_input_tokens_seen": 9454360, "step": 14415 }, { "epoch": 8.502358490566039, "grad_norm": 2.2408387660980225, "learning_rate": 7.111692205525871e-06, "loss": 0.3898, "num_input_tokens_seen": 9457368, "step": 14420 }, { "epoch": 8.505306603773585, "grad_norm": 4.424534797668457, "learning_rate": 7.109359919698658e-06, "loss": 0.4467, "num_input_tokens_seen": 9460024, "step": 14425 }, { "epoch": 8.508254716981131, "grad_norm": 2.558814287185669, "learning_rate": 7.107027075409316e-06, "loss": 0.3454, "num_input_tokens_seen": 9463032, "step": 14430 }, { "epoch": 8.51120283018868, "grad_norm": 2.5315804481506348, "learning_rate": 7.104693673275475e-06, "loss": 0.3938, "num_input_tokens_seen": 9466136, "step": 14435 }, { "epoch": 8.514150943396226, "grad_norm": 2.196658134460449, "learning_rate": 7.1023597139149115e-06, "loss": 0.4668, "num_input_tokens_seen": 9469656, "step": 14440 }, { "epoch": 8.517099056603774, "grad_norm": 2.096489191055298, "learning_rate": 7.100025197945555e-06, "loss": 0.368, "num_input_tokens_seen": 9473112, "step": 14445 }, { "epoch": 8.52004716981132, "grad_norm": 3.7898011207580566, "learning_rate": 7.097690125985476e-06, "loss": 0.4942, "num_input_tokens_seen": 9476056, "step": 14450 }, { "epoch": 8.522995283018869, "grad_norm": 4.091800212860107, "learning_rate": 7.095354498652895e-06, "loss": 0.2748, "num_input_tokens_seen": 9482488, "step": 14455 }, { "epoch": 8.525943396226415, "grad_norm": 4.27150821685791, "learning_rate": 7.093018316566182e-06, "loss": 0.5229, "num_input_tokens_seen": 9486040, "step": 14460 }, { "epoch": 8.528891509433961, "grad_norm": 2.58501935005188, "learning_rate": 7.0906815803438465e-06, "loss": 0.4455, "num_input_tokens_seen": 9489016, "step": 14465 }, { "epoch": 8.53183962264151, "grad_norm": 2.0694291591644287, "learning_rate": 7.088344290604554e-06, "loss": 0.4345, "num_input_tokens_seen": 9492568, "step": 14470 }, { "epoch": 8.534787735849056, "grad_norm": 2.397692918777466, "learning_rate": 7.086006447967111e-06, "loss": 0.5414, "num_input_tokens_seen": 9495352, "step": 14475 }, { "epoch": 8.537735849056604, "grad_norm": 1.99761164188385, "learning_rate": 7.08366805305047e-06, "loss": 0.4163, "num_input_tokens_seen": 9498616, "step": 14480 }, { "epoch": 8.54068396226415, "grad_norm": 2.999887704849243, "learning_rate": 7.08132910647373e-06, "loss": 0.3167, "num_input_tokens_seen": 9502232, "step": 14485 }, { "epoch": 8.543632075471699, "grad_norm": 1.5442019701004028, "learning_rate": 7.078989608856142e-06, "loss": 0.474, "num_input_tokens_seen": 9505592, "step": 14490 }, { "epoch": 8.546580188679245, "grad_norm": 4.0928144454956055, "learning_rate": 7.076649560817092e-06, "loss": 0.3898, "num_input_tokens_seen": 9507800, "step": 14495 }, { "epoch": 8.549528301886792, "grad_norm": 2.6833043098449707, "learning_rate": 7.0743089629761245e-06, "loss": 0.3495, "num_input_tokens_seen": 9511288, "step": 14500 }, { "epoch": 8.55247641509434, "grad_norm": 2.190847635269165, "learning_rate": 7.071967815952917e-06, "loss": 0.5016, "num_input_tokens_seen": 9514328, "step": 14505 }, { "epoch": 8.555424528301886, "grad_norm": 2.1796398162841797, "learning_rate": 7.0696261203673e-06, "loss": 0.394, "num_input_tokens_seen": 9517944, "step": 14510 }, { "epoch": 8.558372641509434, "grad_norm": 1.567954182624817, "learning_rate": 7.067283876839249e-06, "loss": 0.3909, "num_input_tokens_seen": 9520248, "step": 14515 }, { "epoch": 8.56132075471698, "grad_norm": 1.3021125793457031, "learning_rate": 7.064941085988884e-06, "loss": 0.4456, "num_input_tokens_seen": 9523288, "step": 14520 }, { "epoch": 8.564268867924529, "grad_norm": 1.9511109590530396, "learning_rate": 7.062597748436464e-06, "loss": 0.3413, "num_input_tokens_seen": 9526296, "step": 14525 }, { "epoch": 8.567216981132075, "grad_norm": 3.731557846069336, "learning_rate": 7.060253864802402e-06, "loss": 0.411, "num_input_tokens_seen": 9529592, "step": 14530 }, { "epoch": 8.570165094339622, "grad_norm": 2.827678680419922, "learning_rate": 7.05790943570725e-06, "loss": 0.3706, "num_input_tokens_seen": 9532600, "step": 14535 }, { "epoch": 8.57311320754717, "grad_norm": 1.8461275100708008, "learning_rate": 7.055564461771704e-06, "loss": 0.4762, "num_input_tokens_seen": 9535128, "step": 14540 }, { "epoch": 8.576061320754716, "grad_norm": 3.15303897857666, "learning_rate": 7.053218943616611e-06, "loss": 0.4956, "num_input_tokens_seen": 9538072, "step": 14545 }, { "epoch": 8.579009433962264, "grad_norm": 1.7208880186080933, "learning_rate": 7.050872881862952e-06, "loss": 0.3587, "num_input_tokens_seen": 9540856, "step": 14550 }, { "epoch": 8.58195754716981, "grad_norm": 4.504153251647949, "learning_rate": 7.04852627713186e-06, "loss": 0.4106, "num_input_tokens_seen": 9543672, "step": 14555 }, { "epoch": 8.584905660377359, "grad_norm": 2.263424873352051, "learning_rate": 7.0461791300446055e-06, "loss": 0.3176, "num_input_tokens_seen": 9546936, "step": 14560 }, { "epoch": 8.587853773584905, "grad_norm": 3.0372602939605713, "learning_rate": 7.043831441222611e-06, "loss": 0.6496, "num_input_tokens_seen": 9550008, "step": 14565 }, { "epoch": 8.590801886792454, "grad_norm": 1.8683973550796509, "learning_rate": 7.04148321128743e-06, "loss": 0.3552, "num_input_tokens_seen": 9553752, "step": 14570 }, { "epoch": 8.59375, "grad_norm": 2.7596206665039062, "learning_rate": 7.039134440860773e-06, "loss": 0.4484, "num_input_tokens_seen": 9556376, "step": 14575 }, { "epoch": 8.596698113207546, "grad_norm": 2.0952091217041016, "learning_rate": 7.036785130564484e-06, "loss": 0.3857, "num_input_tokens_seen": 9559480, "step": 14580 }, { "epoch": 8.599646226415095, "grad_norm": 4.716624736785889, "learning_rate": 7.0344352810205544e-06, "loss": 0.3577, "num_input_tokens_seen": 9562200, "step": 14585 }, { "epoch": 8.602594339622641, "grad_norm": 1.67292058467865, "learning_rate": 7.032084892851115e-06, "loss": 0.3916, "num_input_tokens_seen": 9565144, "step": 14590 }, { "epoch": 8.60554245283019, "grad_norm": 1.7544599771499634, "learning_rate": 7.0297339666784425e-06, "loss": 0.4005, "num_input_tokens_seen": 9567992, "step": 14595 }, { "epoch": 8.608490566037736, "grad_norm": 2.127913475036621, "learning_rate": 7.0273825031249556e-06, "loss": 0.5034, "num_input_tokens_seen": 9572472, "step": 14600 }, { "epoch": 8.611438679245284, "grad_norm": 1.7265125513076782, "learning_rate": 7.025030502813213e-06, "loss": 0.4304, "num_input_tokens_seen": 9575544, "step": 14605 }, { "epoch": 8.61438679245283, "grad_norm": 5.157460689544678, "learning_rate": 7.022677966365917e-06, "loss": 0.3922, "num_input_tokens_seen": 9578232, "step": 14610 }, { "epoch": 8.617334905660378, "grad_norm": 2.4534621238708496, "learning_rate": 7.020324894405913e-06, "loss": 0.3684, "num_input_tokens_seen": 9584696, "step": 14615 }, { "epoch": 8.620283018867925, "grad_norm": 7.824957370758057, "learning_rate": 7.017971287556185e-06, "loss": 0.4858, "num_input_tokens_seen": 9589080, "step": 14620 }, { "epoch": 8.623231132075471, "grad_norm": 1.6975691318511963, "learning_rate": 7.015617146439863e-06, "loss": 0.3658, "num_input_tokens_seen": 9593240, "step": 14625 }, { "epoch": 8.62617924528302, "grad_norm": 2.132927656173706, "learning_rate": 7.0132624716802125e-06, "loss": 0.3195, "num_input_tokens_seen": 9596760, "step": 14630 }, { "epoch": 8.629127358490566, "grad_norm": 2.1246933937072754, "learning_rate": 7.0109072639006474e-06, "loss": 0.3683, "num_input_tokens_seen": 9599544, "step": 14635 }, { "epoch": 8.632075471698114, "grad_norm": 1.2366578578948975, "learning_rate": 7.008551523724717e-06, "loss": 0.4173, "num_input_tokens_seen": 9603000, "step": 14640 }, { "epoch": 8.63502358490566, "grad_norm": 1.7593507766723633, "learning_rate": 7.006195251776116e-06, "loss": 0.4016, "num_input_tokens_seen": 9606712, "step": 14645 }, { "epoch": 8.637971698113208, "grad_norm": 3.082564353942871, "learning_rate": 7.003838448678674e-06, "loss": 0.3827, "num_input_tokens_seen": 9609784, "step": 14650 }, { "epoch": 8.640919811320755, "grad_norm": 1.4339088201522827, "learning_rate": 7.0014811150563675e-06, "loss": 0.4632, "num_input_tokens_seen": 9612600, "step": 14655 }, { "epoch": 8.643867924528301, "grad_norm": 1.7641173601150513, "learning_rate": 6.999123251533311e-06, "loss": 0.5802, "num_input_tokens_seen": 9615640, "step": 14660 }, { "epoch": 8.64681603773585, "grad_norm": 1.9602274894714355, "learning_rate": 6.996764858733756e-06, "loss": 0.4907, "num_input_tokens_seen": 9619512, "step": 14665 }, { "epoch": 8.649764150943396, "grad_norm": 1.7782695293426514, "learning_rate": 6.994405937282099e-06, "loss": 0.3315, "num_input_tokens_seen": 9623640, "step": 14670 }, { "epoch": 8.652712264150944, "grad_norm": 1.336503028869629, "learning_rate": 6.9920464878028745e-06, "loss": 0.3251, "num_input_tokens_seen": 9627128, "step": 14675 }, { "epoch": 8.65566037735849, "grad_norm": 1.8412688970565796, "learning_rate": 6.989686510920758e-06, "loss": 0.3753, "num_input_tokens_seen": 9630584, "step": 14680 }, { "epoch": 8.658608490566039, "grad_norm": 1.6783863306045532, "learning_rate": 6.9873260072605634e-06, "loss": 0.4305, "num_input_tokens_seen": 9633592, "step": 14685 }, { "epoch": 8.661556603773585, "grad_norm": 3.425694704055786, "learning_rate": 6.984964977447243e-06, "loss": 0.3317, "num_input_tokens_seen": 9638392, "step": 14690 }, { "epoch": 8.664504716981131, "grad_norm": 1.3974274396896362, "learning_rate": 6.982603422105889e-06, "loss": 0.3005, "num_input_tokens_seen": 9641912, "step": 14695 }, { "epoch": 8.66745283018868, "grad_norm": 2.676815986633301, "learning_rate": 6.980241341861736e-06, "loss": 0.3896, "num_input_tokens_seen": 9644664, "step": 14700 }, { "epoch": 8.670400943396226, "grad_norm": 2.977435827255249, "learning_rate": 6.977878737340153e-06, "loss": 0.5017, "num_input_tokens_seen": 9647608, "step": 14705 }, { "epoch": 8.673349056603774, "grad_norm": 1.775968074798584, "learning_rate": 6.97551560916665e-06, "loss": 0.3822, "num_input_tokens_seen": 9650616, "step": 14710 }, { "epoch": 8.67629716981132, "grad_norm": 2.669670343399048, "learning_rate": 6.973151957966875e-06, "loss": 0.5947, "num_input_tokens_seen": 9653176, "step": 14715 }, { "epoch": 8.679245283018869, "grad_norm": 1.466066837310791, "learning_rate": 6.970787784366616e-06, "loss": 0.3723, "num_input_tokens_seen": 9656632, "step": 14720 }, { "epoch": 8.682193396226415, "grad_norm": 4.585606575012207, "learning_rate": 6.968423088991797e-06, "loss": 0.4359, "num_input_tokens_seen": 9660664, "step": 14725 }, { "epoch": 8.685141509433961, "grad_norm": 4.358580112457275, "learning_rate": 6.966057872468481e-06, "loss": 0.3445, "num_input_tokens_seen": 9664152, "step": 14730 }, { "epoch": 8.68808962264151, "grad_norm": 1.5063092708587646, "learning_rate": 6.963692135422872e-06, "loss": 0.3174, "num_input_tokens_seen": 9667864, "step": 14735 }, { "epoch": 8.691037735849056, "grad_norm": 1.9654090404510498, "learning_rate": 6.961325878481305e-06, "loss": 0.4047, "num_input_tokens_seen": 9671448, "step": 14740 }, { "epoch": 8.693985849056604, "grad_norm": 1.484573483467102, "learning_rate": 6.958959102270259e-06, "loss": 0.4404, "num_input_tokens_seen": 9676536, "step": 14745 }, { "epoch": 8.69693396226415, "grad_norm": 1.9402559995651245, "learning_rate": 6.95659180741635e-06, "loss": 0.5536, "num_input_tokens_seen": 9679576, "step": 14750 }, { "epoch": 8.699882075471699, "grad_norm": 1.4386368989944458, "learning_rate": 6.954223994546326e-06, "loss": 0.4526, "num_input_tokens_seen": 9682680, "step": 14755 }, { "epoch": 8.702830188679245, "grad_norm": 1.5791798830032349, "learning_rate": 6.951855664287077e-06, "loss": 0.3527, "num_input_tokens_seen": 9685816, "step": 14760 }, { "epoch": 8.705778301886792, "grad_norm": 1.7131348848342896, "learning_rate": 6.9494868172656304e-06, "loss": 0.2874, "num_input_tokens_seen": 9689080, "step": 14765 }, { "epoch": 8.70872641509434, "grad_norm": 1.3199063539505005, "learning_rate": 6.947117454109146e-06, "loss": 0.3647, "num_input_tokens_seen": 9691992, "step": 14770 }, { "epoch": 8.711674528301886, "grad_norm": 2.6262617111206055, "learning_rate": 6.944747575444924e-06, "loss": 0.351, "num_input_tokens_seen": 9695160, "step": 14775 }, { "epoch": 8.714622641509434, "grad_norm": 2.076737880706787, "learning_rate": 6.942377181900399e-06, "loss": 0.35, "num_input_tokens_seen": 9697720, "step": 14780 }, { "epoch": 8.71757075471698, "grad_norm": 3.7002038955688477, "learning_rate": 6.940006274103146e-06, "loss": 0.3287, "num_input_tokens_seen": 9700728, "step": 14785 }, { "epoch": 8.720518867924529, "grad_norm": 2.460127830505371, "learning_rate": 6.93763485268087e-06, "loss": 0.4224, "num_input_tokens_seen": 9704920, "step": 14790 }, { "epoch": 8.723466981132075, "grad_norm": 1.0790541172027588, "learning_rate": 6.935262918261416e-06, "loss": 0.4344, "num_input_tokens_seen": 9708888, "step": 14795 }, { "epoch": 8.726415094339622, "grad_norm": 2.5792524814605713, "learning_rate": 6.932890471472764e-06, "loss": 0.5478, "num_input_tokens_seen": 9712056, "step": 14800 }, { "epoch": 8.72936320754717, "grad_norm": 2.6065733432769775, "learning_rate": 6.930517512943029e-06, "loss": 0.3908, "num_input_tokens_seen": 9716056, "step": 14805 }, { "epoch": 8.732311320754716, "grad_norm": 2.5821385383605957, "learning_rate": 6.928144043300463e-06, "loss": 0.3608, "num_input_tokens_seen": 9719352, "step": 14810 }, { "epoch": 8.735259433962264, "grad_norm": 2.212601661682129, "learning_rate": 6.925770063173451e-06, "loss": 0.3724, "num_input_tokens_seen": 9722872, "step": 14815 }, { "epoch": 8.73820754716981, "grad_norm": 2.5051798820495605, "learning_rate": 6.923395573190514e-06, "loss": 0.435, "num_input_tokens_seen": 9725240, "step": 14820 }, { "epoch": 8.741155660377359, "grad_norm": 2.022491216659546, "learning_rate": 6.921020573980313e-06, "loss": 0.4844, "num_input_tokens_seen": 9729208, "step": 14825 }, { "epoch": 8.744103773584905, "grad_norm": 2.2844693660736084, "learning_rate": 6.918645066171634e-06, "loss": 0.527, "num_input_tokens_seen": 9733240, "step": 14830 }, { "epoch": 8.747051886792454, "grad_norm": 3.681520700454712, "learning_rate": 6.916269050393404e-06, "loss": 0.3634, "num_input_tokens_seen": 9737400, "step": 14835 }, { "epoch": 8.75, "grad_norm": 1.594216227531433, "learning_rate": 6.913892527274686e-06, "loss": 0.4139, "num_input_tokens_seen": 9741016, "step": 14840 }, { "epoch": 8.752948113207546, "grad_norm": 1.7567775249481201, "learning_rate": 6.9115154974446716e-06, "loss": 0.4328, "num_input_tokens_seen": 9743896, "step": 14845 }, { "epoch": 8.755896226415095, "grad_norm": 3.0666115283966064, "learning_rate": 6.909137961532692e-06, "loss": 0.5068, "num_input_tokens_seen": 9747192, "step": 14850 }, { "epoch": 8.758844339622641, "grad_norm": 1.8985918760299683, "learning_rate": 6.906759920168209e-06, "loss": 0.3503, "num_input_tokens_seen": 9750360, "step": 14855 }, { "epoch": 8.76179245283019, "grad_norm": 1.4352837800979614, "learning_rate": 6.90438137398082e-06, "loss": 0.3591, "num_input_tokens_seen": 9754072, "step": 14860 }, { "epoch": 8.764740566037736, "grad_norm": 1.7177258729934692, "learning_rate": 6.902002323600252e-06, "loss": 0.3818, "num_input_tokens_seen": 9758552, "step": 14865 }, { "epoch": 8.767688679245284, "grad_norm": 1.8785511255264282, "learning_rate": 6.899622769656373e-06, "loss": 0.4212, "num_input_tokens_seen": 9761720, "step": 14870 }, { "epoch": 8.77063679245283, "grad_norm": 1.5735493898391724, "learning_rate": 6.897242712779179e-06, "loss": 0.3943, "num_input_tokens_seen": 9766968, "step": 14875 }, { "epoch": 8.773584905660378, "grad_norm": 2.438490629196167, "learning_rate": 6.894862153598802e-06, "loss": 0.477, "num_input_tokens_seen": 9769304, "step": 14880 }, { "epoch": 8.776533018867925, "grad_norm": 2.180138111114502, "learning_rate": 6.892481092745502e-06, "loss": 0.3418, "num_input_tokens_seen": 9774776, "step": 14885 }, { "epoch": 8.779481132075471, "grad_norm": 1.883935570716858, "learning_rate": 6.890099530849677e-06, "loss": 0.3754, "num_input_tokens_seen": 9777560, "step": 14890 }, { "epoch": 8.78242924528302, "grad_norm": 1.4584909677505493, "learning_rate": 6.887717468541855e-06, "loss": 0.3721, "num_input_tokens_seen": 9780088, "step": 14895 }, { "epoch": 8.785377358490566, "grad_norm": 4.600624084472656, "learning_rate": 6.885334906452696e-06, "loss": 0.3813, "num_input_tokens_seen": 9782584, "step": 14900 }, { "epoch": 8.788325471698114, "grad_norm": 1.9549750089645386, "learning_rate": 6.882951845212997e-06, "loss": 0.4676, "num_input_tokens_seen": 9785496, "step": 14905 }, { "epoch": 8.79127358490566, "grad_norm": 1.7457497119903564, "learning_rate": 6.880568285453682e-06, "loss": 0.3716, "num_input_tokens_seen": 9788344, "step": 14910 }, { "epoch": 8.794221698113208, "grad_norm": 1.8453991413116455, "learning_rate": 6.878184227805807e-06, "loss": 0.3831, "num_input_tokens_seen": 9791096, "step": 14915 }, { "epoch": 8.797169811320755, "grad_norm": 3.9710874557495117, "learning_rate": 6.8757996729005645e-06, "loss": 0.5509, "num_input_tokens_seen": 9794936, "step": 14920 }, { "epoch": 8.800117924528301, "grad_norm": 14.138848304748535, "learning_rate": 6.8734146213692756e-06, "loss": 0.5456, "num_input_tokens_seen": 9797784, "step": 14925 }, { "epoch": 8.80306603773585, "grad_norm": 2.2862813472747803, "learning_rate": 6.87102907384339e-06, "loss": 0.401, "num_input_tokens_seen": 9800984, "step": 14930 }, { "epoch": 8.806014150943396, "grad_norm": 1.2904669046401978, "learning_rate": 6.868643030954494e-06, "loss": 0.4124, "num_input_tokens_seen": 9804760, "step": 14935 }, { "epoch": 8.808962264150944, "grad_norm": 1.5964165925979614, "learning_rate": 6.866256493334302e-06, "loss": 0.4177, "num_input_tokens_seen": 9808536, "step": 14940 }, { "epoch": 8.81191037735849, "grad_norm": 2.922182559967041, "learning_rate": 6.863869461614659e-06, "loss": 0.3342, "num_input_tokens_seen": 9811192, "step": 14945 }, { "epoch": 8.814858490566039, "grad_norm": 2.7199597358703613, "learning_rate": 6.861481936427545e-06, "loss": 0.4454, "num_input_tokens_seen": 9814072, "step": 14950 }, { "epoch": 8.817806603773585, "grad_norm": 1.0714527368545532, "learning_rate": 6.859093918405067e-06, "loss": 0.5368, "num_input_tokens_seen": 9816920, "step": 14955 }, { "epoch": 8.820754716981131, "grad_norm": 3.2250304222106934, "learning_rate": 6.856705408179458e-06, "loss": 0.4492, "num_input_tokens_seen": 9820376, "step": 14960 }, { "epoch": 8.82370283018868, "grad_norm": 1.3404574394226074, "learning_rate": 6.854316406383093e-06, "loss": 0.4064, "num_input_tokens_seen": 9823672, "step": 14965 }, { "epoch": 8.826650943396226, "grad_norm": 3.204049587249756, "learning_rate": 6.8519269136484665e-06, "loss": 0.4715, "num_input_tokens_seen": 9827288, "step": 14970 }, { "epoch": 8.829599056603774, "grad_norm": 2.9617061614990234, "learning_rate": 6.849536930608208e-06, "loss": 0.3581, "num_input_tokens_seen": 9830168, "step": 14975 }, { "epoch": 8.83254716981132, "grad_norm": 2.7662243843078613, "learning_rate": 6.847146457895078e-06, "loss": 0.412, "num_input_tokens_seen": 9833240, "step": 14980 }, { "epoch": 8.835495283018869, "grad_norm": 2.896782398223877, "learning_rate": 6.8447554961419615e-06, "loss": 0.538, "num_input_tokens_seen": 9837208, "step": 14985 }, { "epoch": 8.838443396226415, "grad_norm": 1.5471094846725464, "learning_rate": 6.842364045981876e-06, "loss": 0.3019, "num_input_tokens_seen": 9840536, "step": 14990 }, { "epoch": 8.841391509433961, "grad_norm": 1.275814175605774, "learning_rate": 6.83997210804797e-06, "loss": 0.4004, "num_input_tokens_seen": 9843832, "step": 14995 }, { "epoch": 8.84433962264151, "grad_norm": 1.7966731786727905, "learning_rate": 6.837579682973519e-06, "loss": 0.4981, "num_input_tokens_seen": 9846776, "step": 15000 }, { "epoch": 8.847287735849056, "grad_norm": 2.793853759765625, "learning_rate": 6.835186771391926e-06, "loss": 0.3509, "num_input_tokens_seen": 9849464, "step": 15005 }, { "epoch": 8.850235849056604, "grad_norm": 3.18677020072937, "learning_rate": 6.8327933739367266e-06, "loss": 0.5485, "num_input_tokens_seen": 9854264, "step": 15010 }, { "epoch": 8.85318396226415, "grad_norm": 2.6716983318328857, "learning_rate": 6.830399491241584e-06, "loss": 0.4748, "num_input_tokens_seen": 9857336, "step": 15015 }, { "epoch": 8.856132075471699, "grad_norm": 2.165402889251709, "learning_rate": 6.828005123940287e-06, "loss": 0.5031, "num_input_tokens_seen": 9860152, "step": 15020 }, { "epoch": 8.859080188679245, "grad_norm": 1.330311894416809, "learning_rate": 6.825610272666754e-06, "loss": 0.5057, "num_input_tokens_seen": 9864472, "step": 15025 }, { "epoch": 8.862028301886792, "grad_norm": 1.7609962224960327, "learning_rate": 6.823214938055034e-06, "loss": 0.3835, "num_input_tokens_seen": 9867704, "step": 15030 }, { "epoch": 8.86497641509434, "grad_norm": 1.9133124351501465, "learning_rate": 6.8208191207393e-06, "loss": 0.3846, "num_input_tokens_seen": 9871352, "step": 15035 }, { "epoch": 8.867924528301886, "grad_norm": 2.2019009590148926, "learning_rate": 6.818422821353859e-06, "loss": 0.4681, "num_input_tokens_seen": 9875128, "step": 15040 }, { "epoch": 8.870872641509434, "grad_norm": 2.0546188354492188, "learning_rate": 6.816026040533139e-06, "loss": 0.4859, "num_input_tokens_seen": 9877976, "step": 15045 }, { "epoch": 8.87382075471698, "grad_norm": 5.416626453399658, "learning_rate": 6.8136287789116966e-06, "loss": 0.3404, "num_input_tokens_seen": 9880472, "step": 15050 }, { "epoch": 8.876768867924529, "grad_norm": 1.8469674587249756, "learning_rate": 6.81123103712422e-06, "loss": 0.5657, "num_input_tokens_seen": 9884440, "step": 15055 }, { "epoch": 8.879716981132075, "grad_norm": 4.6245927810668945, "learning_rate": 6.808832815805518e-06, "loss": 0.4511, "num_input_tokens_seen": 9887480, "step": 15060 }, { "epoch": 8.882665094339622, "grad_norm": 2.6515862941741943, "learning_rate": 6.806434115590534e-06, "loss": 0.3657, "num_input_tokens_seen": 9890264, "step": 15065 }, { "epoch": 8.88561320754717, "grad_norm": 2.483234167098999, "learning_rate": 6.804034937114332e-06, "loss": 0.4179, "num_input_tokens_seen": 9893912, "step": 15070 }, { "epoch": 8.888561320754716, "grad_norm": 2.220271110534668, "learning_rate": 6.8016352810121064e-06, "loss": 0.5181, "num_input_tokens_seen": 9896856, "step": 15075 }, { "epoch": 8.891509433962264, "grad_norm": 3.665300130844116, "learning_rate": 6.799235147919176e-06, "loss": 0.4019, "num_input_tokens_seen": 9899992, "step": 15080 }, { "epoch": 8.89445754716981, "grad_norm": 1.9727132320404053, "learning_rate": 6.796834538470985e-06, "loss": 0.2527, "num_input_tokens_seen": 9902648, "step": 15085 }, { "epoch": 8.897405660377359, "grad_norm": 3.1375420093536377, "learning_rate": 6.794433453303106e-06, "loss": 0.4711, "num_input_tokens_seen": 9905048, "step": 15090 }, { "epoch": 8.900353773584905, "grad_norm": 3.8160624504089355, "learning_rate": 6.792031893051238e-06, "loss": 0.4361, "num_input_tokens_seen": 9907864, "step": 15095 }, { "epoch": 8.903301886792454, "grad_norm": 3.6184914112091064, "learning_rate": 6.789629858351201e-06, "loss": 0.4163, "num_input_tokens_seen": 9910200, "step": 15100 }, { "epoch": 8.90625, "grad_norm": 1.8119903802871704, "learning_rate": 6.787227349838946e-06, "loss": 0.467, "num_input_tokens_seen": 9914264, "step": 15105 }, { "epoch": 8.909198113207546, "grad_norm": 3.327181100845337, "learning_rate": 6.784824368150548e-06, "loss": 0.3522, "num_input_tokens_seen": 9917912, "step": 15110 }, { "epoch": 8.912146226415095, "grad_norm": 2.615298271179199, "learning_rate": 6.7824209139222076e-06, "loss": 0.3644, "num_input_tokens_seen": 9921368, "step": 15115 }, { "epoch": 8.915094339622641, "grad_norm": 1.9218441247940063, "learning_rate": 6.780016987790248e-06, "loss": 0.5832, "num_input_tokens_seen": 9925432, "step": 15120 }, { "epoch": 8.91804245283019, "grad_norm": 3.4048573970794678, "learning_rate": 6.7776125903911194e-06, "loss": 0.351, "num_input_tokens_seen": 9928216, "step": 15125 }, { "epoch": 8.920990566037736, "grad_norm": 3.5054171085357666, "learning_rate": 6.775207722361396e-06, "loss": 0.5359, "num_input_tokens_seen": 9931864, "step": 15130 }, { "epoch": 8.923938679245284, "grad_norm": 3.3559410572052, "learning_rate": 6.772802384337778e-06, "loss": 0.4634, "num_input_tokens_seen": 9935224, "step": 15135 }, { "epoch": 8.92688679245283, "grad_norm": 3.038119077682495, "learning_rate": 6.770396576957088e-06, "loss": 0.4977, "num_input_tokens_seen": 9938136, "step": 15140 }, { "epoch": 8.929834905660378, "grad_norm": 2.2318174839019775, "learning_rate": 6.767990300856274e-06, "loss": 0.492, "num_input_tokens_seen": 9941304, "step": 15145 }, { "epoch": 8.932783018867925, "grad_norm": 1.9040416479110718, "learning_rate": 6.765583556672408e-06, "loss": 0.5384, "num_input_tokens_seen": 9945176, "step": 15150 }, { "epoch": 8.935731132075471, "grad_norm": 2.529581069946289, "learning_rate": 6.763176345042687e-06, "loss": 0.4301, "num_input_tokens_seen": 9948472, "step": 15155 }, { "epoch": 8.93867924528302, "grad_norm": 2.5209619998931885, "learning_rate": 6.760768666604429e-06, "loss": 0.3367, "num_input_tokens_seen": 9950680, "step": 15160 }, { "epoch": 8.941627358490566, "grad_norm": 2.4521543979644775, "learning_rate": 6.758360521995079e-06, "loss": 0.3901, "num_input_tokens_seen": 9953336, "step": 15165 }, { "epoch": 8.944575471698114, "grad_norm": 1.5494389533996582, "learning_rate": 6.755951911852202e-06, "loss": 0.3784, "num_input_tokens_seen": 9956344, "step": 15170 }, { "epoch": 8.94752358490566, "grad_norm": 2.78596568107605, "learning_rate": 6.7535428368134885e-06, "loss": 0.3944, "num_input_tokens_seen": 9959640, "step": 15175 }, { "epoch": 8.950471698113208, "grad_norm": 1.9254405498504639, "learning_rate": 6.751133297516752e-06, "loss": 0.3187, "num_input_tokens_seen": 9962648, "step": 15180 }, { "epoch": 8.953419811320755, "grad_norm": 3.183145523071289, "learning_rate": 6.748723294599928e-06, "loss": 0.597, "num_input_tokens_seen": 9968856, "step": 15185 }, { "epoch": 8.956367924528301, "grad_norm": 7.104411602020264, "learning_rate": 6.746312828701075e-06, "loss": 0.5019, "num_input_tokens_seen": 9971320, "step": 15190 }, { "epoch": 8.95931603773585, "grad_norm": 3.247711658477783, "learning_rate": 6.743901900458374e-06, "loss": 0.4416, "num_input_tokens_seen": 9974424, "step": 15195 }, { "epoch": 8.962264150943396, "grad_norm": 4.113162040710449, "learning_rate": 6.741490510510129e-06, "loss": 0.4588, "num_input_tokens_seen": 9977144, "step": 15200 }, { "epoch": 8.965212264150944, "grad_norm": 3.1802878379821777, "learning_rate": 6.7390786594947665e-06, "loss": 0.5401, "num_input_tokens_seen": 9980088, "step": 15205 }, { "epoch": 8.96816037735849, "grad_norm": 2.569182872772217, "learning_rate": 6.7366663480508335e-06, "loss": 0.4032, "num_input_tokens_seen": 9983256, "step": 15210 }, { "epoch": 8.971108490566039, "grad_norm": 1.951671838760376, "learning_rate": 6.734253576817002e-06, "loss": 0.4637, "num_input_tokens_seen": 9986136, "step": 15215 }, { "epoch": 8.974056603773585, "grad_norm": 3.4235143661499023, "learning_rate": 6.731840346432061e-06, "loss": 0.3955, "num_input_tokens_seen": 9989560, "step": 15220 }, { "epoch": 8.977004716981131, "grad_norm": 2.1503241062164307, "learning_rate": 6.729426657534922e-06, "loss": 0.4648, "num_input_tokens_seen": 9992536, "step": 15225 }, { "epoch": 8.97995283018868, "grad_norm": 1.7868163585662842, "learning_rate": 6.727012510764624e-06, "loss": 0.3874, "num_input_tokens_seen": 9996376, "step": 15230 }, { "epoch": 8.982900943396226, "grad_norm": 1.972352147102356, "learning_rate": 6.724597906760322e-06, "loss": 0.4152, "num_input_tokens_seen": 9999640, "step": 15235 }, { "epoch": 8.985849056603774, "grad_norm": 1.4098140001296997, "learning_rate": 6.722182846161289e-06, "loss": 0.2549, "num_input_tokens_seen": 10002840, "step": 15240 }, { "epoch": 8.98879716981132, "grad_norm": 1.8524625301361084, "learning_rate": 6.719767329606926e-06, "loss": 0.3958, "num_input_tokens_seen": 10007032, "step": 15245 }, { "epoch": 8.991745283018869, "grad_norm": 2.1714208126068115, "learning_rate": 6.717351357736751e-06, "loss": 0.3985, "num_input_tokens_seen": 10009752, "step": 15250 }, { "epoch": 8.994693396226415, "grad_norm": 1.8546099662780762, "learning_rate": 6.7149349311904025e-06, "loss": 0.44, "num_input_tokens_seen": 10012696, "step": 15255 }, { "epoch": 8.997641509433961, "grad_norm": 2.4000840187072754, "learning_rate": 6.712518050607642e-06, "loss": 0.4463, "num_input_tokens_seen": 10016696, "step": 15260 }, { "epoch": 9.00058962264151, "grad_norm": 1.9924007654190063, "learning_rate": 6.710100716628345e-06, "loss": 0.4234, "num_input_tokens_seen": 10019288, "step": 15265 }, { "epoch": 9.003537735849056, "grad_norm": 1.9655539989471436, "learning_rate": 6.707682929892513e-06, "loss": 0.4974, "num_input_tokens_seen": 10022840, "step": 15270 }, { "epoch": 9.006485849056604, "grad_norm": 1.4429490566253662, "learning_rate": 6.705264691040266e-06, "loss": 0.507, "num_input_tokens_seen": 10026424, "step": 15275 }, { "epoch": 9.00943396226415, "grad_norm": 1.445254921913147, "learning_rate": 6.7028460007118435e-06, "loss": 0.4643, "num_input_tokens_seen": 10030648, "step": 15280 }, { "epoch": 9.012382075471699, "grad_norm": 1.7352896928787231, "learning_rate": 6.700426859547602e-06, "loss": 0.3341, "num_input_tokens_seen": 10033880, "step": 15285 }, { "epoch": 9.015330188679245, "grad_norm": 3.023343801498413, "learning_rate": 6.6980072681880224e-06, "loss": 0.4342, "num_input_tokens_seen": 10036408, "step": 15290 }, { "epoch": 9.018278301886792, "grad_norm": 7.63539457321167, "learning_rate": 6.695587227273699e-06, "loss": 0.311, "num_input_tokens_seen": 10039128, "step": 15295 }, { "epoch": 9.02122641509434, "grad_norm": 2.5380287170410156, "learning_rate": 6.69316673744535e-06, "loss": 0.4218, "num_input_tokens_seen": 10042456, "step": 15300 }, { "epoch": 9.024174528301886, "grad_norm": 2.0090689659118652, "learning_rate": 6.6907457993438115e-06, "loss": 0.3447, "num_input_tokens_seen": 10046456, "step": 15305 }, { "epoch": 9.027122641509434, "grad_norm": 1.248644232749939, "learning_rate": 6.688324413610036e-06, "loss": 0.403, "num_input_tokens_seen": 10050168, "step": 15310 }, { "epoch": 9.03007075471698, "grad_norm": 2.5477538108825684, "learning_rate": 6.685902580885094e-06, "loss": 0.4801, "num_input_tokens_seen": 10052984, "step": 15315 }, { "epoch": 9.033018867924529, "grad_norm": 2.0146408081054688, "learning_rate": 6.6834803018101794e-06, "loss": 0.3493, "num_input_tokens_seen": 10055000, "step": 15320 }, { "epoch": 9.035966981132075, "grad_norm": 3.7359507083892822, "learning_rate": 6.681057577026599e-06, "loss": 0.45, "num_input_tokens_seen": 10058200, "step": 15325 }, { "epoch": 9.038915094339623, "grad_norm": 6.164427757263184, "learning_rate": 6.6786344071757795e-06, "loss": 0.3503, "num_input_tokens_seen": 10061144, "step": 15330 }, { "epoch": 9.04186320754717, "grad_norm": 3.797043561935425, "learning_rate": 6.676210792899267e-06, "loss": 0.392, "num_input_tokens_seen": 10064696, "step": 15335 }, { "epoch": 9.044811320754716, "grad_norm": 2.2971458435058594, "learning_rate": 6.6737867348387235e-06, "loss": 0.341, "num_input_tokens_seen": 10067480, "step": 15340 }, { "epoch": 9.047759433962264, "grad_norm": 2.011563539505005, "learning_rate": 6.671362233635926e-06, "loss": 0.3225, "num_input_tokens_seen": 10070392, "step": 15345 }, { "epoch": 9.05070754716981, "grad_norm": 3.039677381515503, "learning_rate": 6.668937289932775e-06, "loss": 0.4078, "num_input_tokens_seen": 10073464, "step": 15350 }, { "epoch": 9.053655660377359, "grad_norm": 3.5509512424468994, "learning_rate": 6.666511904371285e-06, "loss": 0.426, "num_input_tokens_seen": 10076696, "step": 15355 }, { "epoch": 9.056603773584905, "grad_norm": 4.081659317016602, "learning_rate": 6.664086077593586e-06, "loss": 0.4153, "num_input_tokens_seen": 10079256, "step": 15360 }, { "epoch": 9.059551886792454, "grad_norm": 2.1714026927948, "learning_rate": 6.661659810241924e-06, "loss": 0.4501, "num_input_tokens_seen": 10082392, "step": 15365 }, { "epoch": 9.0625, "grad_norm": 2.489070177078247, "learning_rate": 6.659233102958667e-06, "loss": 0.3802, "num_input_tokens_seen": 10085464, "step": 15370 }, { "epoch": 9.065448113207546, "grad_norm": 1.63927161693573, "learning_rate": 6.6568059563862965e-06, "loss": 0.5375, "num_input_tokens_seen": 10091448, "step": 15375 }, { "epoch": 9.068396226415095, "grad_norm": 1.7979624271392822, "learning_rate": 6.654378371167407e-06, "loss": 0.3732, "num_input_tokens_seen": 10094584, "step": 15380 }, { "epoch": 9.071344339622641, "grad_norm": 1.9027912616729736, "learning_rate": 6.651950347944715e-06, "loss": 0.3311, "num_input_tokens_seen": 10098008, "step": 15385 }, { "epoch": 9.07429245283019, "grad_norm": 1.8891710042953491, "learning_rate": 6.649521887361048e-06, "loss": 0.3336, "num_input_tokens_seen": 10101368, "step": 15390 }, { "epoch": 9.077240566037736, "grad_norm": 3.645746946334839, "learning_rate": 6.647092990059352e-06, "loss": 0.5363, "num_input_tokens_seen": 10104888, "step": 15395 }, { "epoch": 9.080188679245284, "grad_norm": 2.05617618560791, "learning_rate": 6.644663656682689e-06, "loss": 0.4863, "num_input_tokens_seen": 10108120, "step": 15400 }, { "epoch": 9.08313679245283, "grad_norm": 1.409879207611084, "learning_rate": 6.642233887874234e-06, "loss": 0.4687, "num_input_tokens_seen": 10111128, "step": 15405 }, { "epoch": 9.086084905660377, "grad_norm": 2.518214225769043, "learning_rate": 6.639803684277279e-06, "loss": 0.3635, "num_input_tokens_seen": 10114328, "step": 15410 }, { "epoch": 9.089033018867925, "grad_norm": 2.7114901542663574, "learning_rate": 6.637373046535233e-06, "loss": 0.4394, "num_input_tokens_seen": 10116888, "step": 15415 }, { "epoch": 9.091981132075471, "grad_norm": 3.0574214458465576, "learning_rate": 6.634941975291617e-06, "loss": 0.4193, "num_input_tokens_seen": 10120696, "step": 15420 }, { "epoch": 9.09492924528302, "grad_norm": 1.7663036584854126, "learning_rate": 6.632510471190065e-06, "loss": 0.3301, "num_input_tokens_seen": 10123448, "step": 15425 }, { "epoch": 9.097877358490566, "grad_norm": 3.080214262008667, "learning_rate": 6.630078534874332e-06, "loss": 0.3085, "num_input_tokens_seen": 10126936, "step": 15430 }, { "epoch": 9.100825471698114, "grad_norm": 3.742147207260132, "learning_rate": 6.62764616698828e-06, "loss": 0.4091, "num_input_tokens_seen": 10131416, "step": 15435 }, { "epoch": 9.10377358490566, "grad_norm": 2.1446285247802734, "learning_rate": 6.625213368175889e-06, "loss": 0.3098, "num_input_tokens_seen": 10135256, "step": 15440 }, { "epoch": 9.106721698113208, "grad_norm": 2.8686180114746094, "learning_rate": 6.622780139081256e-06, "loss": 0.3653, "num_input_tokens_seen": 10138776, "step": 15445 }, { "epoch": 9.109669811320755, "grad_norm": 1.6570831537246704, "learning_rate": 6.620346480348589e-06, "loss": 0.3144, "num_input_tokens_seen": 10141656, "step": 15450 }, { "epoch": 9.112617924528301, "grad_norm": 2.0905332565307617, "learning_rate": 6.617912392622206e-06, "loss": 0.5101, "num_input_tokens_seen": 10144504, "step": 15455 }, { "epoch": 9.11556603773585, "grad_norm": 1.6373510360717773, "learning_rate": 6.615477876546544e-06, "loss": 0.3419, "num_input_tokens_seen": 10147672, "step": 15460 }, { "epoch": 9.118514150943396, "grad_norm": 2.36488938331604, "learning_rate": 6.6130429327661535e-06, "loss": 0.3766, "num_input_tokens_seen": 10150648, "step": 15465 }, { "epoch": 9.121462264150944, "grad_norm": 7.6742658615112305, "learning_rate": 6.610607561925694e-06, "loss": 0.4805, "num_input_tokens_seen": 10154808, "step": 15470 }, { "epoch": 9.12441037735849, "grad_norm": 6.874012470245361, "learning_rate": 6.608171764669941e-06, "loss": 0.3915, "num_input_tokens_seen": 10158392, "step": 15475 }, { "epoch": 9.127358490566039, "grad_norm": 1.382972002029419, "learning_rate": 6.605735541643783e-06, "loss": 0.3752, "num_input_tokens_seen": 10161752, "step": 15480 }, { "epoch": 9.130306603773585, "grad_norm": 2.011078357696533, "learning_rate": 6.603298893492219e-06, "loss": 0.3664, "num_input_tokens_seen": 10164984, "step": 15485 }, { "epoch": 9.133254716981131, "grad_norm": 2.762141704559326, "learning_rate": 6.6008618208603625e-06, "loss": 0.4241, "num_input_tokens_seen": 10167992, "step": 15490 }, { "epoch": 9.13620283018868, "grad_norm": 3.24261212348938, "learning_rate": 6.598424324393442e-06, "loss": 0.3669, "num_input_tokens_seen": 10170712, "step": 15495 }, { "epoch": 9.139150943396226, "grad_norm": 2.0811104774475098, "learning_rate": 6.595986404736792e-06, "loss": 0.3954, "num_input_tokens_seen": 10174296, "step": 15500 }, { "epoch": 9.142099056603774, "grad_norm": 1.586551547050476, "learning_rate": 6.5935480625358615e-06, "loss": 0.3755, "num_input_tokens_seen": 10177272, "step": 15505 }, { "epoch": 9.14504716981132, "grad_norm": 1.6395344734191895, "learning_rate": 6.591109298436216e-06, "loss": 0.3283, "num_input_tokens_seen": 10182616, "step": 15510 }, { "epoch": 9.147995283018869, "grad_norm": 2.218440532684326, "learning_rate": 6.5886701130835255e-06, "loss": 0.4306, "num_input_tokens_seen": 10186776, "step": 15515 }, { "epoch": 9.150943396226415, "grad_norm": 1.991758942604065, "learning_rate": 6.586230507123574e-06, "loss": 0.3856, "num_input_tokens_seen": 10189784, "step": 15520 }, { "epoch": 9.153891509433961, "grad_norm": 2.8031058311462402, "learning_rate": 6.583790481202261e-06, "loss": 0.4404, "num_input_tokens_seen": 10195960, "step": 15525 }, { "epoch": 9.15683962264151, "grad_norm": 2.838350296020508, "learning_rate": 6.5813500359655925e-06, "loss": 0.363, "num_input_tokens_seen": 10198424, "step": 15530 }, { "epoch": 9.159787735849056, "grad_norm": 2.0274746417999268, "learning_rate": 6.578909172059687e-06, "loss": 0.3126, "num_input_tokens_seen": 10201208, "step": 15535 }, { "epoch": 9.162735849056604, "grad_norm": 1.7407771348953247, "learning_rate": 6.576467890130772e-06, "loss": 0.3608, "num_input_tokens_seen": 10205336, "step": 15540 }, { "epoch": 9.16568396226415, "grad_norm": 2.0612215995788574, "learning_rate": 6.574026190825191e-06, "loss": 0.4396, "num_input_tokens_seen": 10209816, "step": 15545 }, { "epoch": 9.168632075471699, "grad_norm": 1.9204044342041016, "learning_rate": 6.57158407478939e-06, "loss": 0.4021, "num_input_tokens_seen": 10213208, "step": 15550 }, { "epoch": 9.171580188679245, "grad_norm": 3.486077308654785, "learning_rate": 6.569141542669935e-06, "loss": 0.5403, "num_input_tokens_seen": 10215544, "step": 15555 }, { "epoch": 9.174528301886792, "grad_norm": 3.1840081214904785, "learning_rate": 6.566698595113492e-06, "loss": 0.3595, "num_input_tokens_seen": 10218488, "step": 15560 }, { "epoch": 9.17747641509434, "grad_norm": 3.047320604324341, "learning_rate": 6.564255232766843e-06, "loss": 0.4514, "num_input_tokens_seen": 10221496, "step": 15565 }, { "epoch": 9.180424528301886, "grad_norm": 2.142313003540039, "learning_rate": 6.561811456276881e-06, "loss": 0.315, "num_input_tokens_seen": 10224568, "step": 15570 }, { "epoch": 9.183372641509434, "grad_norm": 1.2603410482406616, "learning_rate": 6.559367266290605e-06, "loss": 0.3492, "num_input_tokens_seen": 10228696, "step": 15575 }, { "epoch": 9.18632075471698, "grad_norm": 4.071621417999268, "learning_rate": 6.556922663455123e-06, "loss": 0.537, "num_input_tokens_seen": 10231736, "step": 15580 }, { "epoch": 9.189268867924529, "grad_norm": 2.185105562210083, "learning_rate": 6.554477648417657e-06, "loss": 0.4577, "num_input_tokens_seen": 10235768, "step": 15585 }, { "epoch": 9.192216981132075, "grad_norm": 2.2664732933044434, "learning_rate": 6.552032221825535e-06, "loss": 0.3641, "num_input_tokens_seen": 10239128, "step": 15590 }, { "epoch": 9.195165094339623, "grad_norm": 5.495608329772949, "learning_rate": 6.549586384326192e-06, "loss": 0.3117, "num_input_tokens_seen": 10242616, "step": 15595 }, { "epoch": 9.19811320754717, "grad_norm": 1.404151439666748, "learning_rate": 6.547140136567176e-06, "loss": 0.3322, "num_input_tokens_seen": 10245688, "step": 15600 }, { "epoch": 9.201061320754716, "grad_norm": 1.9770381450653076, "learning_rate": 6.544693479196142e-06, "loss": 0.4353, "num_input_tokens_seen": 10248568, "step": 15605 }, { "epoch": 9.204009433962264, "grad_norm": 3.844179153442383, "learning_rate": 6.542246412860851e-06, "loss": 0.422, "num_input_tokens_seen": 10250904, "step": 15610 }, { "epoch": 9.20695754716981, "grad_norm": 2.0758371353149414, "learning_rate": 6.5397989382091754e-06, "loss": 0.4825, "num_input_tokens_seen": 10254104, "step": 15615 }, { "epoch": 9.209905660377359, "grad_norm": 2.034029960632324, "learning_rate": 6.537351055889096e-06, "loss": 0.3535, "num_input_tokens_seen": 10258488, "step": 15620 }, { "epoch": 9.212853773584905, "grad_norm": 2.0889596939086914, "learning_rate": 6.534902766548698e-06, "loss": 0.3055, "num_input_tokens_seen": 10261496, "step": 15625 }, { "epoch": 9.215801886792454, "grad_norm": 2.296004056930542, "learning_rate": 6.532454070836176e-06, "loss": 0.3555, "num_input_tokens_seen": 10264888, "step": 15630 }, { "epoch": 9.21875, "grad_norm": 2.2352051734924316, "learning_rate": 6.530004969399836e-06, "loss": 0.3693, "num_input_tokens_seen": 10268344, "step": 15635 }, { "epoch": 9.221698113207546, "grad_norm": 1.8403929471969604, "learning_rate": 6.5275554628880865e-06, "loss": 0.4504, "num_input_tokens_seen": 10272472, "step": 15640 }, { "epoch": 9.224646226415095, "grad_norm": 3.039288282394409, "learning_rate": 6.525105551949444e-06, "loss": 0.3867, "num_input_tokens_seen": 10275736, "step": 15645 }, { "epoch": 9.227594339622641, "grad_norm": 3.0742366313934326, "learning_rate": 6.522655237232535e-06, "loss": 0.2901, "num_input_tokens_seen": 10279096, "step": 15650 }, { "epoch": 9.23054245283019, "grad_norm": 2.4602670669555664, "learning_rate": 6.5202045193860885e-06, "loss": 0.4108, "num_input_tokens_seen": 10282232, "step": 15655 }, { "epoch": 9.233490566037736, "grad_norm": 2.206672430038452, "learning_rate": 6.517753399058944e-06, "loss": 0.4205, "num_input_tokens_seen": 10285656, "step": 15660 }, { "epoch": 9.236438679245284, "grad_norm": 2.953235387802124, "learning_rate": 6.515301876900047e-06, "loss": 0.4366, "num_input_tokens_seen": 10288120, "step": 15665 }, { "epoch": 9.23938679245283, "grad_norm": 2.5219743251800537, "learning_rate": 6.512849953558445e-06, "loss": 0.3657, "num_input_tokens_seen": 10290808, "step": 15670 }, { "epoch": 9.242334905660377, "grad_norm": 1.2852953672409058, "learning_rate": 6.510397629683301e-06, "loss": 0.2493, "num_input_tokens_seen": 10294392, "step": 15675 }, { "epoch": 9.245283018867925, "grad_norm": 6.146023750305176, "learning_rate": 6.507944905923872e-06, "loss": 0.5243, "num_input_tokens_seen": 10297432, "step": 15680 }, { "epoch": 9.248231132075471, "grad_norm": 2.6167852878570557, "learning_rate": 6.505491782929531e-06, "loss": 0.4424, "num_input_tokens_seen": 10299928, "step": 15685 }, { "epoch": 9.25117924528302, "grad_norm": 2.3519885540008545, "learning_rate": 6.5030382613497535e-06, "loss": 0.3323, "num_input_tokens_seen": 10303352, "step": 15690 }, { "epoch": 9.254127358490566, "grad_norm": 2.0565314292907715, "learning_rate": 6.500584341834119e-06, "loss": 0.3771, "num_input_tokens_seen": 10307352, "step": 15695 }, { "epoch": 9.257075471698114, "grad_norm": 2.1585395336151123, "learning_rate": 6.498130025032312e-06, "loss": 0.3559, "num_input_tokens_seen": 10310104, "step": 15700 }, { "epoch": 9.26002358490566, "grad_norm": 1.7189851999282837, "learning_rate": 6.495675311594123e-06, "loss": 0.2526, "num_input_tokens_seen": 10312888, "step": 15705 }, { "epoch": 9.262971698113208, "grad_norm": 3.6124353408813477, "learning_rate": 6.493220202169452e-06, "loss": 0.4672, "num_input_tokens_seen": 10315256, "step": 15710 }, { "epoch": 9.265919811320755, "grad_norm": 3.0644752979278564, "learning_rate": 6.490764697408295e-06, "loss": 0.511, "num_input_tokens_seen": 10317880, "step": 15715 }, { "epoch": 9.268867924528301, "grad_norm": 1.7118041515350342, "learning_rate": 6.48830879796076e-06, "loss": 0.4451, "num_input_tokens_seen": 10321656, "step": 15720 }, { "epoch": 9.27181603773585, "grad_norm": 1.3824814558029175, "learning_rate": 6.4858525044770546e-06, "loss": 0.3343, "num_input_tokens_seen": 10324728, "step": 15725 }, { "epoch": 9.274764150943396, "grad_norm": 2.6789159774780273, "learning_rate": 6.483395817607497e-06, "loss": 0.3114, "num_input_tokens_seen": 10327288, "step": 15730 }, { "epoch": 9.277712264150944, "grad_norm": 1.6193599700927734, "learning_rate": 6.480938738002504e-06, "loss": 0.4395, "num_input_tokens_seen": 10330136, "step": 15735 }, { "epoch": 9.28066037735849, "grad_norm": 2.279451847076416, "learning_rate": 6.478481266312597e-06, "loss": 0.367, "num_input_tokens_seen": 10333944, "step": 15740 }, { "epoch": 9.283608490566039, "grad_norm": 2.036045551300049, "learning_rate": 6.476023403188403e-06, "loss": 0.432, "num_input_tokens_seen": 10338168, "step": 15745 }, { "epoch": 9.286556603773585, "grad_norm": 1.920186996459961, "learning_rate": 6.473565149280651e-06, "loss": 0.5271, "num_input_tokens_seen": 10341336, "step": 15750 }, { "epoch": 9.289504716981131, "grad_norm": 2.1124987602233887, "learning_rate": 6.471106505240175e-06, "loss": 0.3458, "num_input_tokens_seen": 10345144, "step": 15755 }, { "epoch": 9.29245283018868, "grad_norm": 2.610301971435547, "learning_rate": 6.468647471717914e-06, "loss": 0.3273, "num_input_tokens_seen": 10347576, "step": 15760 }, { "epoch": 9.295400943396226, "grad_norm": 5.246949672698975, "learning_rate": 6.466188049364902e-06, "loss": 0.6295, "num_input_tokens_seen": 10350648, "step": 15765 }, { "epoch": 9.298349056603774, "grad_norm": 2.027376174926758, "learning_rate": 6.463728238832288e-06, "loss": 0.3064, "num_input_tokens_seen": 10353464, "step": 15770 }, { "epoch": 9.30129716981132, "grad_norm": 2.690380096435547, "learning_rate": 6.461268040771311e-06, "loss": 0.3912, "num_input_tokens_seen": 10356440, "step": 15775 }, { "epoch": 9.304245283018869, "grad_norm": 3.8912899494171143, "learning_rate": 6.458807455833326e-06, "loss": 0.3829, "num_input_tokens_seen": 10361496, "step": 15780 }, { "epoch": 9.307193396226415, "grad_norm": 1.7600922584533691, "learning_rate": 6.456346484669778e-06, "loss": 0.3449, "num_input_tokens_seen": 10365432, "step": 15785 }, { "epoch": 9.310141509433961, "grad_norm": 2.209531784057617, "learning_rate": 6.4538851279322225e-06, "loss": 0.3947, "num_input_tokens_seen": 10368568, "step": 15790 }, { "epoch": 9.31308962264151, "grad_norm": 3.9951884746551514, "learning_rate": 6.451423386272312e-06, "loss": 0.4535, "num_input_tokens_seen": 10371128, "step": 15795 }, { "epoch": 9.316037735849056, "grad_norm": 1.9749239683151245, "learning_rate": 6.448961260341806e-06, "loss": 0.3346, "num_input_tokens_seen": 10373784, "step": 15800 }, { "epoch": 9.318985849056604, "grad_norm": 2.049018621444702, "learning_rate": 6.446498750792563e-06, "loss": 0.4943, "num_input_tokens_seen": 10377208, "step": 15805 }, { "epoch": 9.32193396226415, "grad_norm": 3.109506845474243, "learning_rate": 6.444035858276538e-06, "loss": 0.4087, "num_input_tokens_seen": 10380216, "step": 15810 }, { "epoch": 9.324882075471699, "grad_norm": 2.569887638092041, "learning_rate": 6.441572583445799e-06, "loss": 0.5003, "num_input_tokens_seen": 10384344, "step": 15815 }, { "epoch": 9.327830188679245, "grad_norm": 1.8467037677764893, "learning_rate": 6.439108926952504e-06, "loss": 0.4404, "num_input_tokens_seen": 10387288, "step": 15820 }, { "epoch": 9.330778301886792, "grad_norm": 2.38896107673645, "learning_rate": 6.43664488944892e-06, "loss": 0.4064, "num_input_tokens_seen": 10390136, "step": 15825 }, { "epoch": 9.33372641509434, "grad_norm": 1.780076265335083, "learning_rate": 6.434180471587409e-06, "loss": 0.3349, "num_input_tokens_seen": 10393464, "step": 15830 }, { "epoch": 9.336674528301886, "grad_norm": 1.7832469940185547, "learning_rate": 6.431715674020438e-06, "loss": 0.3741, "num_input_tokens_seen": 10396696, "step": 15835 }, { "epoch": 9.339622641509434, "grad_norm": 1.6145708560943604, "learning_rate": 6.429250497400571e-06, "loss": 0.4212, "num_input_tokens_seen": 10400664, "step": 15840 }, { "epoch": 9.34257075471698, "grad_norm": 2.753108024597168, "learning_rate": 6.426784942380475e-06, "loss": 0.4417, "num_input_tokens_seen": 10403224, "step": 15845 }, { "epoch": 9.345518867924529, "grad_norm": 1.8530153036117554, "learning_rate": 6.424319009612917e-06, "loss": 0.417, "num_input_tokens_seen": 10407608, "step": 15850 }, { "epoch": 9.348466981132075, "grad_norm": 2.1493167877197266, "learning_rate": 6.421852699750763e-06, "loss": 0.4243, "num_input_tokens_seen": 10410392, "step": 15855 }, { "epoch": 9.351415094339623, "grad_norm": 3.3826167583465576, "learning_rate": 6.4193860134469775e-06, "loss": 0.4345, "num_input_tokens_seen": 10412952, "step": 15860 }, { "epoch": 9.35436320754717, "grad_norm": 8.163442611694336, "learning_rate": 6.416918951354629e-06, "loss": 0.4145, "num_input_tokens_seen": 10416664, "step": 15865 }, { "epoch": 9.357311320754716, "grad_norm": 1.2581937313079834, "learning_rate": 6.4144515141268816e-06, "loss": 0.4182, "num_input_tokens_seen": 10419704, "step": 15870 }, { "epoch": 9.360259433962264, "grad_norm": 2.0582234859466553, "learning_rate": 6.411983702416999e-06, "loss": 0.369, "num_input_tokens_seen": 10422584, "step": 15875 }, { "epoch": 9.36320754716981, "grad_norm": 2.533095359802246, "learning_rate": 6.409515516878346e-06, "loss": 0.3558, "num_input_tokens_seen": 10426168, "step": 15880 }, { "epoch": 9.366155660377359, "grad_norm": 3.166553497314453, "learning_rate": 6.407046958164387e-06, "loss": 0.3884, "num_input_tokens_seen": 10429912, "step": 15885 }, { "epoch": 9.369103773584905, "grad_norm": 4.418611526489258, "learning_rate": 6.404578026928679e-06, "loss": 0.2988, "num_input_tokens_seen": 10432984, "step": 15890 }, { "epoch": 9.372051886792454, "grad_norm": 2.784627914428711, "learning_rate": 6.402108723824887e-06, "loss": 0.4846, "num_input_tokens_seen": 10437944, "step": 15895 }, { "epoch": 9.375, "grad_norm": 5.452880382537842, "learning_rate": 6.399639049506767e-06, "loss": 0.381, "num_input_tokens_seen": 10441016, "step": 15900 }, { "epoch": 9.377948113207546, "grad_norm": 3.6164655685424805, "learning_rate": 6.397169004628177e-06, "loss": 0.4659, "num_input_tokens_seen": 10444248, "step": 15905 }, { "epoch": 9.380896226415095, "grad_norm": 2.126168727874756, "learning_rate": 6.39469858984307e-06, "loss": 0.4675, "num_input_tokens_seen": 10448440, "step": 15910 }, { "epoch": 9.383844339622641, "grad_norm": 1.9546359777450562, "learning_rate": 6.3922278058055024e-06, "loss": 0.361, "num_input_tokens_seen": 10451256, "step": 15915 }, { "epoch": 9.38679245283019, "grad_norm": 2.510014772415161, "learning_rate": 6.389756653169622e-06, "loss": 0.376, "num_input_tokens_seen": 10454680, "step": 15920 }, { "epoch": 9.389740566037736, "grad_norm": 4.304040908813477, "learning_rate": 6.387285132589678e-06, "loss": 0.4225, "num_input_tokens_seen": 10458552, "step": 15925 }, { "epoch": 9.392688679245284, "grad_norm": 2.562786340713501, "learning_rate": 6.3848132447200166e-06, "loss": 0.3194, "num_input_tokens_seen": 10461944, "step": 15930 }, { "epoch": 9.39563679245283, "grad_norm": 2.782195568084717, "learning_rate": 6.38234099021508e-06, "loss": 0.4874, "num_input_tokens_seen": 10464824, "step": 15935 }, { "epoch": 9.398584905660377, "grad_norm": 2.720417022705078, "learning_rate": 6.379868369729409e-06, "loss": 0.3527, "num_input_tokens_seen": 10468376, "step": 15940 }, { "epoch": 9.401533018867925, "grad_norm": 1.639888882637024, "learning_rate": 6.377395383917642e-06, "loss": 0.3722, "num_input_tokens_seen": 10471608, "step": 15945 }, { "epoch": 9.404481132075471, "grad_norm": 2.546006202697754, "learning_rate": 6.374922033434507e-06, "loss": 0.3573, "num_input_tokens_seen": 10475384, "step": 15950 }, { "epoch": 9.40742924528302, "grad_norm": 1.3920271396636963, "learning_rate": 6.372448318934842e-06, "loss": 0.4446, "num_input_tokens_seen": 10478712, "step": 15955 }, { "epoch": 9.410377358490566, "grad_norm": 1.7862845659255981, "learning_rate": 6.369974241073569e-06, "loss": 0.3748, "num_input_tokens_seen": 10482168, "step": 15960 }, { "epoch": 9.413325471698114, "grad_norm": 6.0758442878723145, "learning_rate": 6.367499800505709e-06, "loss": 0.4363, "num_input_tokens_seen": 10484888, "step": 15965 }, { "epoch": 9.41627358490566, "grad_norm": 1.9836804866790771, "learning_rate": 6.365024997886384e-06, "loss": 0.3927, "num_input_tokens_seen": 10488120, "step": 15970 }, { "epoch": 9.419221698113208, "grad_norm": 1.988278865814209, "learning_rate": 6.362549833870808e-06, "loss": 0.302, "num_input_tokens_seen": 10490712, "step": 15975 }, { "epoch": 9.422169811320755, "grad_norm": 2.4051077365875244, "learning_rate": 6.360074309114293e-06, "loss": 0.4561, "num_input_tokens_seen": 10494008, "step": 15980 }, { "epoch": 9.425117924528301, "grad_norm": 2.4203944206237793, "learning_rate": 6.357598424272241e-06, "loss": 0.4567, "num_input_tokens_seen": 10497080, "step": 15985 }, { "epoch": 9.42806603773585, "grad_norm": 2.7060303688049316, "learning_rate": 6.355122180000156e-06, "loss": 0.4829, "num_input_tokens_seen": 10500120, "step": 15990 }, { "epoch": 9.431014150943396, "grad_norm": 5.550325870513916, "learning_rate": 6.352645576953635e-06, "loss": 0.5568, "num_input_tokens_seen": 10503256, "step": 15995 }, { "epoch": 9.433962264150944, "grad_norm": 1.7113676071166992, "learning_rate": 6.350168615788366e-06, "loss": 0.4986, "num_input_tokens_seen": 10507064, "step": 16000 }, { "epoch": 9.43691037735849, "grad_norm": 4.155259609222412, "learning_rate": 6.347691297160137e-06, "loss": 0.3848, "num_input_tokens_seen": 10509528, "step": 16005 }, { "epoch": 9.439858490566039, "grad_norm": 1.7613545656204224, "learning_rate": 6.34521362172483e-06, "loss": 0.4129, "num_input_tokens_seen": 10512472, "step": 16010 }, { "epoch": 9.442806603773585, "grad_norm": 3.4020957946777344, "learning_rate": 6.342735590138417e-06, "loss": 0.4323, "num_input_tokens_seen": 10515352, "step": 16015 }, { "epoch": 9.445754716981131, "grad_norm": 6.576329708099365, "learning_rate": 6.340257203056972e-06, "loss": 0.4197, "num_input_tokens_seen": 10518008, "step": 16020 }, { "epoch": 9.44870283018868, "grad_norm": 3.5870308876037598, "learning_rate": 6.3377784611366554e-06, "loss": 0.4547, "num_input_tokens_seen": 10521368, "step": 16025 }, { "epoch": 9.451650943396226, "grad_norm": 1.924638032913208, "learning_rate": 6.335299365033726e-06, "loss": 0.3746, "num_input_tokens_seen": 10524600, "step": 16030 }, { "epoch": 9.454599056603774, "grad_norm": 2.5671942234039307, "learning_rate": 6.3328199154045346e-06, "loss": 0.526, "num_input_tokens_seen": 10527544, "step": 16035 }, { "epoch": 9.45754716981132, "grad_norm": 2.2690460681915283, "learning_rate": 6.3303401129055265e-06, "loss": 0.289, "num_input_tokens_seen": 10530872, "step": 16040 }, { "epoch": 9.460495283018869, "grad_norm": 3.9232518672943115, "learning_rate": 6.32785995819324e-06, "loss": 0.4597, "num_input_tokens_seen": 10534456, "step": 16045 }, { "epoch": 9.463443396226415, "grad_norm": 2.8382863998413086, "learning_rate": 6.3253794519243075e-06, "loss": 0.3639, "num_input_tokens_seen": 10537752, "step": 16050 }, { "epoch": 9.466391509433961, "grad_norm": 3.011012315750122, "learning_rate": 6.322898594755452e-06, "loss": 0.4387, "num_input_tokens_seen": 10540504, "step": 16055 }, { "epoch": 9.46933962264151, "grad_norm": 1.199315071105957, "learning_rate": 6.320417387343492e-06, "loss": 0.333, "num_input_tokens_seen": 10543608, "step": 16060 }, { "epoch": 9.472287735849056, "grad_norm": 2.335817337036133, "learning_rate": 6.3179358303453386e-06, "loss": 0.3341, "num_input_tokens_seen": 10547096, "step": 16065 }, { "epoch": 9.475235849056604, "grad_norm": 2.4042890071868896, "learning_rate": 6.315453924417995e-06, "loss": 0.4412, "num_input_tokens_seen": 10550200, "step": 16070 }, { "epoch": 9.47818396226415, "grad_norm": 2.3623828887939453, "learning_rate": 6.312971670218554e-06, "loss": 0.4505, "num_input_tokens_seen": 10553208, "step": 16075 }, { "epoch": 9.481132075471699, "grad_norm": 1.4005883932113647, "learning_rate": 6.3104890684042055e-06, "loss": 0.336, "num_input_tokens_seen": 10556280, "step": 16080 }, { "epoch": 9.484080188679245, "grad_norm": 2.3039145469665527, "learning_rate": 6.308006119632228e-06, "loss": 0.4216, "num_input_tokens_seen": 10559352, "step": 16085 }, { "epoch": 9.487028301886792, "grad_norm": 3.7823047637939453, "learning_rate": 6.305522824559993e-06, "loss": 0.519, "num_input_tokens_seen": 10562168, "step": 16090 }, { "epoch": 9.48997641509434, "grad_norm": 2.5060477256774902, "learning_rate": 6.303039183844965e-06, "loss": 0.4297, "num_input_tokens_seen": 10565656, "step": 16095 }, { "epoch": 9.492924528301886, "grad_norm": 1.463445782661438, "learning_rate": 6.300555198144697e-06, "loss": 0.3797, "num_input_tokens_seen": 10568376, "step": 16100 }, { "epoch": 9.495872641509434, "grad_norm": 2.8930089473724365, "learning_rate": 6.2980708681168335e-06, "loss": 0.4325, "num_input_tokens_seen": 10571128, "step": 16105 }, { "epoch": 9.49882075471698, "grad_norm": 5.417365550994873, "learning_rate": 6.2955861944191145e-06, "loss": 0.4651, "num_input_tokens_seen": 10574680, "step": 16110 }, { "epoch": 9.501768867924529, "grad_norm": 5.169792652130127, "learning_rate": 6.293101177709367e-06, "loss": 0.5602, "num_input_tokens_seen": 10577304, "step": 16115 }, { "epoch": 9.504716981132075, "grad_norm": 4.017365455627441, "learning_rate": 6.290615818645509e-06, "loss": 0.4326, "num_input_tokens_seen": 10580664, "step": 16120 }, { "epoch": 9.507665094339622, "grad_norm": 1.8234435319900513, "learning_rate": 6.288130117885552e-06, "loss": 0.2805, "num_input_tokens_seen": 10583512, "step": 16125 }, { "epoch": 9.51061320754717, "grad_norm": 2.542971611022949, "learning_rate": 6.285644076087594e-06, "loss": 0.4339, "num_input_tokens_seen": 10586232, "step": 16130 }, { "epoch": 9.513561320754716, "grad_norm": 3.56671142578125, "learning_rate": 6.283157693909826e-06, "loss": 0.4228, "num_input_tokens_seen": 10590424, "step": 16135 }, { "epoch": 9.516509433962264, "grad_norm": 3.805233955383301, "learning_rate": 6.280670972010528e-06, "loss": 0.3856, "num_input_tokens_seen": 10592984, "step": 16140 }, { "epoch": 9.51945754716981, "grad_norm": 2.098464012145996, "learning_rate": 6.278183911048072e-06, "loss": 0.3843, "num_input_tokens_seen": 10595512, "step": 16145 }, { "epoch": 9.522405660377359, "grad_norm": 6.479310035705566, "learning_rate": 6.275696511680915e-06, "loss": 0.3034, "num_input_tokens_seen": 10598168, "step": 16150 }, { "epoch": 9.525353773584905, "grad_norm": 2.636725902557373, "learning_rate": 6.27320877456761e-06, "loss": 0.4755, "num_input_tokens_seen": 10604920, "step": 16155 }, { "epoch": 9.528301886792454, "grad_norm": 2.834294557571411, "learning_rate": 6.270720700366793e-06, "loss": 0.3847, "num_input_tokens_seen": 10609016, "step": 16160 }, { "epoch": 9.53125, "grad_norm": 1.7177224159240723, "learning_rate": 6.2682322897371974e-06, "loss": 0.24, "num_input_tokens_seen": 10611384, "step": 16165 }, { "epoch": 9.534198113207546, "grad_norm": 2.120371103286743, "learning_rate": 6.265743543337634e-06, "loss": 0.3329, "num_input_tokens_seen": 10613720, "step": 16170 }, { "epoch": 9.537146226415095, "grad_norm": 1.4954875707626343, "learning_rate": 6.2632544618270156e-06, "loss": 0.3095, "num_input_tokens_seen": 10617112, "step": 16175 }, { "epoch": 9.540094339622641, "grad_norm": 3.4509458541870117, "learning_rate": 6.260765045864332e-06, "loss": 0.357, "num_input_tokens_seen": 10620568, "step": 16180 }, { "epoch": 9.54304245283019, "grad_norm": 2.065401077270508, "learning_rate": 6.258275296108669e-06, "loss": 0.4359, "num_input_tokens_seen": 10623416, "step": 16185 }, { "epoch": 9.545990566037736, "grad_norm": 2.7469546794891357, "learning_rate": 6.2557852132191985e-06, "loss": 0.3694, "num_input_tokens_seen": 10627096, "step": 16190 }, { "epoch": 9.548938679245284, "grad_norm": 3.7605221271514893, "learning_rate": 6.253294797855182e-06, "loss": 0.5415, "num_input_tokens_seen": 10630104, "step": 16195 }, { "epoch": 9.55188679245283, "grad_norm": 2.371619462966919, "learning_rate": 6.250804050675964e-06, "loss": 0.3926, "num_input_tokens_seen": 10633784, "step": 16200 }, { "epoch": 9.554834905660378, "grad_norm": 3.679405450820923, "learning_rate": 6.248312972340984e-06, "loss": 0.4238, "num_input_tokens_seen": 10636760, "step": 16205 }, { "epoch": 9.557783018867925, "grad_norm": 2.208228349685669, "learning_rate": 6.2458215635097656e-06, "loss": 0.3642, "num_input_tokens_seen": 10639384, "step": 16210 }, { "epoch": 9.560731132075471, "grad_norm": 1.4754571914672852, "learning_rate": 6.2433298248419175e-06, "loss": 0.3515, "num_input_tokens_seen": 10642328, "step": 16215 }, { "epoch": 9.56367924528302, "grad_norm": 2.5138156414031982, "learning_rate": 6.2408377569971405e-06, "loss": 0.3121, "num_input_tokens_seen": 10645976, "step": 16220 }, { "epoch": 9.566627358490566, "grad_norm": 1.4552799463272095, "learning_rate": 6.238345360635221e-06, "loss": 0.3702, "num_input_tokens_seen": 10649368, "step": 16225 }, { "epoch": 9.569575471698114, "grad_norm": 4.18242883682251, "learning_rate": 6.2358526364160274e-06, "loss": 0.5272, "num_input_tokens_seen": 10652216, "step": 16230 }, { "epoch": 9.57252358490566, "grad_norm": 2.405513048171997, "learning_rate": 6.2333595849995245e-06, "loss": 0.413, "num_input_tokens_seen": 10654744, "step": 16235 }, { "epoch": 9.575471698113208, "grad_norm": 6.489028453826904, "learning_rate": 6.230866207045756e-06, "loss": 0.2683, "num_input_tokens_seen": 10657464, "step": 16240 }, { "epoch": 9.578419811320755, "grad_norm": 2.8084652423858643, "learning_rate": 6.228372503214853e-06, "loss": 0.3172, "num_input_tokens_seen": 10660824, "step": 16245 }, { "epoch": 9.581367924528301, "grad_norm": 4.629970550537109, "learning_rate": 6.225878474167035e-06, "loss": 0.5899, "num_input_tokens_seen": 10663768, "step": 16250 }, { "epoch": 9.58431603773585, "grad_norm": 2.0842864513397217, "learning_rate": 6.22338412056261e-06, "loss": 0.4294, "num_input_tokens_seen": 10666168, "step": 16255 }, { "epoch": 9.587264150943396, "grad_norm": 2.576387405395508, "learning_rate": 6.220889443061966e-06, "loss": 0.3173, "num_input_tokens_seen": 10669176, "step": 16260 }, { "epoch": 9.590212264150944, "grad_norm": 2.7385191917419434, "learning_rate": 6.2183944423255796e-06, "loss": 0.3966, "num_input_tokens_seen": 10673080, "step": 16265 }, { "epoch": 9.59316037735849, "grad_norm": 3.7885940074920654, "learning_rate": 6.215899119014015e-06, "loss": 0.2652, "num_input_tokens_seen": 10676120, "step": 16270 }, { "epoch": 9.596108490566039, "grad_norm": 1.8840351104736328, "learning_rate": 6.2134034737879175e-06, "loss": 0.3493, "num_input_tokens_seen": 10679928, "step": 16275 }, { "epoch": 9.599056603773585, "grad_norm": 2.9571125507354736, "learning_rate": 6.2109075073080205e-06, "loss": 0.4558, "num_input_tokens_seen": 10683608, "step": 16280 }, { "epoch": 9.602004716981131, "grad_norm": 3.8392372131347656, "learning_rate": 6.2084112202351425e-06, "loss": 0.4151, "num_input_tokens_seen": 10686104, "step": 16285 }, { "epoch": 9.60495283018868, "grad_norm": 3.595548629760742, "learning_rate": 6.205914613230186e-06, "loss": 0.3263, "num_input_tokens_seen": 10690584, "step": 16290 }, { "epoch": 9.607900943396226, "grad_norm": 2.460397243499756, "learning_rate": 6.203417686954138e-06, "loss": 0.4435, "num_input_tokens_seen": 10694360, "step": 16295 }, { "epoch": 9.610849056603774, "grad_norm": 2.564406633377075, "learning_rate": 6.2009204420680706e-06, "loss": 0.3267, "num_input_tokens_seen": 10697336, "step": 16300 }, { "epoch": 9.61379716981132, "grad_norm": 1.7802597284317017, "learning_rate": 6.198422879233141e-06, "loss": 0.4417, "num_input_tokens_seen": 10700664, "step": 16305 }, { "epoch": 9.616745283018869, "grad_norm": 2.173797607421875, "learning_rate": 6.1959249991105895e-06, "loss": 0.4351, "num_input_tokens_seen": 10703800, "step": 16310 }, { "epoch": 9.619693396226415, "grad_norm": 2.381871461868286, "learning_rate": 6.19342680236174e-06, "loss": 0.4085, "num_input_tokens_seen": 10706648, "step": 16315 }, { "epoch": 9.622641509433961, "grad_norm": 2.1897027492523193, "learning_rate": 6.190928289648003e-06, "loss": 0.4895, "num_input_tokens_seen": 10710072, "step": 16320 }, { "epoch": 9.62558962264151, "grad_norm": 3.7890052795410156, "learning_rate": 6.188429461630866e-06, "loss": 0.3287, "num_input_tokens_seen": 10714456, "step": 16325 }, { "epoch": 9.628537735849056, "grad_norm": 2.459362506866455, "learning_rate": 6.185930318971909e-06, "loss": 0.4242, "num_input_tokens_seen": 10717496, "step": 16330 }, { "epoch": 9.631485849056604, "grad_norm": 2.3548834323883057, "learning_rate": 6.1834308623327885e-06, "loss": 0.3795, "num_input_tokens_seen": 10720216, "step": 16335 }, { "epoch": 9.63443396226415, "grad_norm": 2.5186691284179688, "learning_rate": 6.180931092375247e-06, "loss": 0.3581, "num_input_tokens_seen": 10723000, "step": 16340 }, { "epoch": 9.637382075471699, "grad_norm": 2.8677361011505127, "learning_rate": 6.1784310097611075e-06, "loss": 0.435, "num_input_tokens_seen": 10726808, "step": 16345 }, { "epoch": 9.640330188679245, "grad_norm": 2.878976583480835, "learning_rate": 6.1759306151522815e-06, "loss": 0.4222, "num_input_tokens_seen": 10729816, "step": 16350 }, { "epoch": 9.643278301886792, "grad_norm": 3.897583246231079, "learning_rate": 6.173429909210755e-06, "loss": 0.3912, "num_input_tokens_seen": 10733560, "step": 16355 }, { "epoch": 9.64622641509434, "grad_norm": 2.8735647201538086, "learning_rate": 6.170928892598606e-06, "loss": 0.5155, "num_input_tokens_seen": 10737528, "step": 16360 }, { "epoch": 9.649174528301886, "grad_norm": 3.3323466777801514, "learning_rate": 6.168427565977984e-06, "loss": 0.3361, "num_input_tokens_seen": 10741464, "step": 16365 }, { "epoch": 9.652122641509434, "grad_norm": 1.8531463146209717, "learning_rate": 6.165925930011129e-06, "loss": 0.3558, "num_input_tokens_seen": 10745080, "step": 16370 }, { "epoch": 9.65507075471698, "grad_norm": 2.065155029296875, "learning_rate": 6.163423985360359e-06, "loss": 0.4794, "num_input_tokens_seen": 10747896, "step": 16375 }, { "epoch": 9.658018867924529, "grad_norm": 1.4443846940994263, "learning_rate": 6.160921732688076e-06, "loss": 0.4598, "num_input_tokens_seen": 10751256, "step": 16380 }, { "epoch": 9.660966981132075, "grad_norm": 1.876855492591858, "learning_rate": 6.158419172656759e-06, "loss": 0.3321, "num_input_tokens_seen": 10754808, "step": 16385 }, { "epoch": 9.663915094339622, "grad_norm": 2.748591423034668, "learning_rate": 6.155916305928974e-06, "loss": 0.4197, "num_input_tokens_seen": 10757656, "step": 16390 }, { "epoch": 9.66686320754717, "grad_norm": 2.106283664703369, "learning_rate": 6.153413133167366e-06, "loss": 0.3935, "num_input_tokens_seen": 10760600, "step": 16395 }, { "epoch": 9.669811320754716, "grad_norm": 5.566813945770264, "learning_rate": 6.1509096550346596e-06, "loss": 0.3448, "num_input_tokens_seen": 10763512, "step": 16400 }, { "epoch": 9.672759433962264, "grad_norm": 3.368913412094116, "learning_rate": 6.148405872193661e-06, "loss": 0.5049, "num_input_tokens_seen": 10765976, "step": 16405 }, { "epoch": 9.67570754716981, "grad_norm": 4.172853946685791, "learning_rate": 6.14590178530726e-06, "loss": 0.3992, "num_input_tokens_seen": 10769080, "step": 16410 }, { "epoch": 9.678655660377359, "grad_norm": 2.697779655456543, "learning_rate": 6.143397395038422e-06, "loss": 0.3185, "num_input_tokens_seen": 10772056, "step": 16415 }, { "epoch": 9.681603773584905, "grad_norm": 4.210964679718018, "learning_rate": 6.140892702050196e-06, "loss": 0.5527, "num_input_tokens_seen": 10775544, "step": 16420 }, { "epoch": 9.684551886792454, "grad_norm": 3.6627726554870605, "learning_rate": 6.138387707005711e-06, "loss": 0.3784, "num_input_tokens_seen": 10778552, "step": 16425 }, { "epoch": 9.6875, "grad_norm": 3.466911792755127, "learning_rate": 6.135882410568172e-06, "loss": 0.4323, "num_input_tokens_seen": 10782040, "step": 16430 }, { "epoch": 9.690448113207546, "grad_norm": 4.624415874481201, "learning_rate": 6.133376813400872e-06, "loss": 0.2604, "num_input_tokens_seen": 10784376, "step": 16435 }, { "epoch": 9.693396226415095, "grad_norm": 3.426090955734253, "learning_rate": 6.130870916167175e-06, "loss": 0.4042, "num_input_tokens_seen": 10786744, "step": 16440 }, { "epoch": 9.696344339622641, "grad_norm": 2.2966582775115967, "learning_rate": 6.128364719530528e-06, "loss": 0.4004, "num_input_tokens_seen": 10789496, "step": 16445 }, { "epoch": 9.69929245283019, "grad_norm": 1.8466875553131104, "learning_rate": 6.125858224154459e-06, "loss": 0.3968, "num_input_tokens_seen": 10793208, "step": 16450 }, { "epoch": 9.702240566037736, "grad_norm": 2.6909232139587402, "learning_rate": 6.123351430702576e-06, "loss": 0.3182, "num_input_tokens_seen": 10796440, "step": 16455 }, { "epoch": 9.705188679245284, "grad_norm": 2.0357308387756348, "learning_rate": 6.1208443398385575e-06, "loss": 0.3227, "num_input_tokens_seen": 10800088, "step": 16460 }, { "epoch": 9.70813679245283, "grad_norm": 2.316270112991333, "learning_rate": 6.118336952226169e-06, "loss": 0.3375, "num_input_tokens_seen": 10803288, "step": 16465 }, { "epoch": 9.711084905660378, "grad_norm": 3.0867578983306885, "learning_rate": 6.115829268529254e-06, "loss": 0.4365, "num_input_tokens_seen": 10808184, "step": 16470 }, { "epoch": 9.714033018867925, "grad_norm": 2.1129820346832275, "learning_rate": 6.1133212894117326e-06, "loss": 0.4483, "num_input_tokens_seen": 10810840, "step": 16475 }, { "epoch": 9.716981132075471, "grad_norm": 1.8898190259933472, "learning_rate": 6.1108130155375986e-06, "loss": 0.4097, "num_input_tokens_seen": 10814072, "step": 16480 }, { "epoch": 9.71992924528302, "grad_norm": 3.377685308456421, "learning_rate": 6.108304447570933e-06, "loss": 0.484, "num_input_tokens_seen": 10817464, "step": 16485 }, { "epoch": 9.722877358490566, "grad_norm": 2.3370559215545654, "learning_rate": 6.105795586175888e-06, "loss": 0.3806, "num_input_tokens_seen": 10820504, "step": 16490 }, { "epoch": 9.725825471698114, "grad_norm": 1.7648296356201172, "learning_rate": 6.1032864320166954e-06, "loss": 0.2998, "num_input_tokens_seen": 10824344, "step": 16495 }, { "epoch": 9.72877358490566, "grad_norm": 1.6513816118240356, "learning_rate": 6.100776985757666e-06, "loss": 0.3527, "num_input_tokens_seen": 10827384, "step": 16500 }, { "epoch": 9.731721698113208, "grad_norm": 1.1788866519927979, "learning_rate": 6.098267248063186e-06, "loss": 0.3275, "num_input_tokens_seen": 10831064, "step": 16505 }, { "epoch": 9.734669811320755, "grad_norm": 1.8187040090560913, "learning_rate": 6.0957572195977165e-06, "loss": 0.4074, "num_input_tokens_seen": 10838200, "step": 16510 }, { "epoch": 9.737617924528301, "grad_norm": 3.970810890197754, "learning_rate": 6.0932469010258025e-06, "loss": 0.4914, "num_input_tokens_seen": 10840728, "step": 16515 }, { "epoch": 9.74056603773585, "grad_norm": 2.2221295833587646, "learning_rate": 6.0907362930120594e-06, "loss": 0.4677, "num_input_tokens_seen": 10843896, "step": 16520 }, { "epoch": 9.743514150943396, "grad_norm": 3.7675817012786865, "learning_rate": 6.088225396221181e-06, "loss": 0.4124, "num_input_tokens_seen": 10848088, "step": 16525 }, { "epoch": 9.746462264150944, "grad_norm": 0.9427663683891296, "learning_rate": 6.0857142113179415e-06, "loss": 0.3951, "num_input_tokens_seen": 10851224, "step": 16530 }, { "epoch": 9.74941037735849, "grad_norm": 3.0832290649414062, "learning_rate": 6.083202738967182e-06, "loss": 0.4373, "num_input_tokens_seen": 10854968, "step": 16535 }, { "epoch": 9.752358490566039, "grad_norm": 1.9840511083602905, "learning_rate": 6.0806909798338324e-06, "loss": 0.4242, "num_input_tokens_seen": 10858328, "step": 16540 }, { "epoch": 9.755306603773585, "grad_norm": 2.669060468673706, "learning_rate": 6.0781789345828854e-06, "loss": 0.4919, "num_input_tokens_seen": 10861752, "step": 16545 }, { "epoch": 9.758254716981131, "grad_norm": 2.0320310592651367, "learning_rate": 6.0756666038794195e-06, "loss": 0.457, "num_input_tokens_seen": 10864696, "step": 16550 }, { "epoch": 9.76120283018868, "grad_norm": 2.161770820617676, "learning_rate": 6.073153988388586e-06, "loss": 0.3444, "num_input_tokens_seen": 10867192, "step": 16555 }, { "epoch": 9.764150943396226, "grad_norm": 2.27244234085083, "learning_rate": 6.070641088775608e-06, "loss": 0.3823, "num_input_tokens_seen": 10869656, "step": 16560 }, { "epoch": 9.767099056603774, "grad_norm": 4.62990665435791, "learning_rate": 6.068127905705787e-06, "loss": 0.4243, "num_input_tokens_seen": 10872856, "step": 16565 }, { "epoch": 9.77004716981132, "grad_norm": 1.5436702966690063, "learning_rate": 6.065614439844501e-06, "loss": 0.3824, "num_input_tokens_seen": 10875960, "step": 16570 }, { "epoch": 9.772995283018869, "grad_norm": 2.06426739692688, "learning_rate": 6.063100691857198e-06, "loss": 0.4109, "num_input_tokens_seen": 10879448, "step": 16575 }, { "epoch": 9.775943396226415, "grad_norm": 3.148911714553833, "learning_rate": 6.060586662409407e-06, "loss": 0.3604, "num_input_tokens_seen": 10881752, "step": 16580 }, { "epoch": 9.778891509433961, "grad_norm": 7.292573928833008, "learning_rate": 6.058072352166724e-06, "loss": 0.4108, "num_input_tokens_seen": 10885112, "step": 16585 }, { "epoch": 9.78183962264151, "grad_norm": 6.0959858894348145, "learning_rate": 6.055557761794826e-06, "loss": 0.3435, "num_input_tokens_seen": 10889688, "step": 16590 }, { "epoch": 9.784787735849056, "grad_norm": 1.168031096458435, "learning_rate": 6.053042891959462e-06, "loss": 0.359, "num_input_tokens_seen": 10893368, "step": 16595 }, { "epoch": 9.787735849056604, "grad_norm": 2.6051084995269775, "learning_rate": 6.050527743326455e-06, "loss": 0.5013, "num_input_tokens_seen": 10896472, "step": 16600 }, { "epoch": 9.79068396226415, "grad_norm": 2.0813310146331787, "learning_rate": 6.048012316561699e-06, "loss": 0.4024, "num_input_tokens_seen": 10899640, "step": 16605 }, { "epoch": 9.793632075471699, "grad_norm": 2.56844425201416, "learning_rate": 6.045496612331166e-06, "loss": 0.3963, "num_input_tokens_seen": 10902552, "step": 16610 }, { "epoch": 9.796580188679245, "grad_norm": 2.4539244174957275, "learning_rate": 6.0429806313009e-06, "loss": 0.4441, "num_input_tokens_seen": 10906392, "step": 16615 }, { "epoch": 9.799528301886792, "grad_norm": 4.304088592529297, "learning_rate": 6.040464374137015e-06, "loss": 0.4377, "num_input_tokens_seen": 10909720, "step": 16620 }, { "epoch": 9.80247641509434, "grad_norm": 2.3462440967559814, "learning_rate": 6.0379478415057045e-06, "loss": 0.4805, "num_input_tokens_seen": 10914072, "step": 16625 }, { "epoch": 9.805424528301886, "grad_norm": 2.9798901081085205, "learning_rate": 6.035431034073228e-06, "loss": 0.4034, "num_input_tokens_seen": 10917464, "step": 16630 }, { "epoch": 9.808372641509434, "grad_norm": 2.315579414367676, "learning_rate": 6.0329139525059235e-06, "loss": 0.4825, "num_input_tokens_seen": 10920120, "step": 16635 }, { "epoch": 9.81132075471698, "grad_norm": 2.678272247314453, "learning_rate": 6.030396597470198e-06, "loss": 0.4283, "num_input_tokens_seen": 10922936, "step": 16640 }, { "epoch": 9.814268867924529, "grad_norm": 1.528733253479004, "learning_rate": 6.027878969632534e-06, "loss": 0.4489, "num_input_tokens_seen": 10926840, "step": 16645 }, { "epoch": 9.817216981132075, "grad_norm": 2.041757345199585, "learning_rate": 6.025361069659482e-06, "loss": 0.3963, "num_input_tokens_seen": 10930552, "step": 16650 }, { "epoch": 9.820165094339622, "grad_norm": 1.9807296991348267, "learning_rate": 6.022842898217668e-06, "loss": 0.5663, "num_input_tokens_seen": 10934136, "step": 16655 }, { "epoch": 9.82311320754717, "grad_norm": 3.37778377532959, "learning_rate": 6.020324455973788e-06, "loss": 0.4163, "num_input_tokens_seen": 10937720, "step": 16660 }, { "epoch": 9.826061320754716, "grad_norm": 3.5917885303497314, "learning_rate": 6.017805743594612e-06, "loss": 0.3491, "num_input_tokens_seen": 10940696, "step": 16665 }, { "epoch": 9.829009433962264, "grad_norm": 3.0264666080474854, "learning_rate": 6.0152867617469776e-06, "loss": 0.2857, "num_input_tokens_seen": 10943576, "step": 16670 }, { "epoch": 9.83195754716981, "grad_norm": 3.8330957889556885, "learning_rate": 6.012767511097799e-06, "loss": 0.4404, "num_input_tokens_seen": 10946904, "step": 16675 }, { "epoch": 9.834905660377359, "grad_norm": 3.2365458011627197, "learning_rate": 6.010247992314055e-06, "loss": 0.3122, "num_input_tokens_seen": 10950264, "step": 16680 }, { "epoch": 9.837853773584905, "grad_norm": 1.2328965663909912, "learning_rate": 6.007728206062802e-06, "loss": 0.3098, "num_input_tokens_seen": 10953720, "step": 16685 }, { "epoch": 9.840801886792454, "grad_norm": 1.697411060333252, "learning_rate": 6.005208153011163e-06, "loss": 0.3971, "num_input_tokens_seen": 10956216, "step": 16690 }, { "epoch": 9.84375, "grad_norm": 1.2479360103607178, "learning_rate": 6.0026878338263335e-06, "loss": 0.3976, "num_input_tokens_seen": 10959896, "step": 16695 }, { "epoch": 9.846698113207546, "grad_norm": 2.2523398399353027, "learning_rate": 6.000167249175579e-06, "loss": 0.3464, "num_input_tokens_seen": 10963416, "step": 16700 }, { "epoch": 9.849646226415095, "grad_norm": 2.9545702934265137, "learning_rate": 5.997646399726236e-06, "loss": 0.2479, "num_input_tokens_seen": 10966072, "step": 16705 }, { "epoch": 9.852594339622641, "grad_norm": 2.873659610748291, "learning_rate": 5.995125286145707e-06, "loss": 0.4823, "num_input_tokens_seen": 10970712, "step": 16710 }, { "epoch": 9.85554245283019, "grad_norm": 1.4504408836364746, "learning_rate": 5.99260390910147e-06, "loss": 0.4443, "num_input_tokens_seen": 10975288, "step": 16715 }, { "epoch": 9.858490566037736, "grad_norm": 2.4705958366394043, "learning_rate": 5.990082269261071e-06, "loss": 0.4915, "num_input_tokens_seen": 10977976, "step": 16720 }, { "epoch": 9.861438679245284, "grad_norm": 2.9171502590179443, "learning_rate": 5.987560367292123e-06, "loss": 0.3903, "num_input_tokens_seen": 10980728, "step": 16725 }, { "epoch": 9.86438679245283, "grad_norm": 1.9374436140060425, "learning_rate": 5.985038203862313e-06, "loss": 0.5196, "num_input_tokens_seen": 10984152, "step": 16730 }, { "epoch": 9.867334905660378, "grad_norm": 3.655853748321533, "learning_rate": 5.982515779639393e-06, "loss": 0.484, "num_input_tokens_seen": 10987480, "step": 16735 }, { "epoch": 9.870283018867925, "grad_norm": 6.524825572967529, "learning_rate": 5.979993095291186e-06, "loss": 0.4624, "num_input_tokens_seen": 10990424, "step": 16740 }, { "epoch": 9.873231132075471, "grad_norm": 2.862771511077881, "learning_rate": 5.977470151485582e-06, "loss": 0.3719, "num_input_tokens_seen": 10992536, "step": 16745 }, { "epoch": 9.87617924528302, "grad_norm": 2.3015975952148438, "learning_rate": 5.974946948890544e-06, "loss": 0.393, "num_input_tokens_seen": 10996216, "step": 16750 }, { "epoch": 9.879127358490566, "grad_norm": 3.3873910903930664, "learning_rate": 5.9724234881740994e-06, "loss": 0.3498, "num_input_tokens_seen": 10999256, "step": 16755 }, { "epoch": 9.882075471698114, "grad_norm": 2.8708527088165283, "learning_rate": 5.9698997700043445e-06, "loss": 0.3814, "num_input_tokens_seen": 11002488, "step": 16760 }, { "epoch": 9.88502358490566, "grad_norm": 2.1095101833343506, "learning_rate": 5.9673757950494475e-06, "loss": 0.2805, "num_input_tokens_seen": 11005432, "step": 16765 }, { "epoch": 9.887971698113208, "grad_norm": 3.5808093547821045, "learning_rate": 5.964851563977639e-06, "loss": 0.2197, "num_input_tokens_seen": 11007736, "step": 16770 }, { "epoch": 9.890919811320755, "grad_norm": 10.994311332702637, "learning_rate": 5.962327077457219e-06, "loss": 0.3715, "num_input_tokens_seen": 11011160, "step": 16775 }, { "epoch": 9.893867924528301, "grad_norm": 3.944780111312866, "learning_rate": 5.959802336156558e-06, "loss": 0.3567, "num_input_tokens_seen": 11014264, "step": 16780 }, { "epoch": 9.89681603773585, "grad_norm": 2.841972827911377, "learning_rate": 5.957277340744094e-06, "loss": 0.3254, "num_input_tokens_seen": 11016920, "step": 16785 }, { "epoch": 9.899764150943396, "grad_norm": 1.291869044303894, "learning_rate": 5.954752091888326e-06, "loss": 0.4121, "num_input_tokens_seen": 11020152, "step": 16790 }, { "epoch": 9.902712264150944, "grad_norm": 2.6977412700653076, "learning_rate": 5.952226590257829e-06, "loss": 0.4511, "num_input_tokens_seen": 11022584, "step": 16795 }, { "epoch": 9.90566037735849, "grad_norm": 4.251730918884277, "learning_rate": 5.949700836521239e-06, "loss": 0.3205, "num_input_tokens_seen": 11026648, "step": 16800 }, { "epoch": 9.908608490566039, "grad_norm": 1.9338898658752441, "learning_rate": 5.947174831347257e-06, "loss": 0.359, "num_input_tokens_seen": 11030232, "step": 16805 }, { "epoch": 9.911556603773585, "grad_norm": 1.9591487646102905, "learning_rate": 5.944648575404657e-06, "loss": 0.3928, "num_input_tokens_seen": 11034168, "step": 16810 }, { "epoch": 9.914504716981131, "grad_norm": 3.8000810146331787, "learning_rate": 5.942122069362276e-06, "loss": 0.3636, "num_input_tokens_seen": 11037912, "step": 16815 }, { "epoch": 9.91745283018868, "grad_norm": 1.7359559535980225, "learning_rate": 5.939595313889016e-06, "loss": 0.3798, "num_input_tokens_seen": 11040696, "step": 16820 }, { "epoch": 9.920400943396226, "grad_norm": 3.6204402446746826, "learning_rate": 5.937068309653848e-06, "loss": 0.4414, "num_input_tokens_seen": 11043896, "step": 16825 }, { "epoch": 9.923349056603774, "grad_norm": 3.2673511505126953, "learning_rate": 5.934541057325807e-06, "loss": 0.421, "num_input_tokens_seen": 11046744, "step": 16830 }, { "epoch": 9.92629716981132, "grad_norm": 1.7623263597488403, "learning_rate": 5.932013557573992e-06, "loss": 0.3736, "num_input_tokens_seen": 11050904, "step": 16835 }, { "epoch": 9.929245283018869, "grad_norm": 3.619976043701172, "learning_rate": 5.929485811067572e-06, "loss": 0.3984, "num_input_tokens_seen": 11053304, "step": 16840 }, { "epoch": 9.932193396226415, "grad_norm": 2.8903918266296387, "learning_rate": 5.926957818475778e-06, "loss": 0.3644, "num_input_tokens_seen": 11056568, "step": 16845 }, { "epoch": 9.935141509433961, "grad_norm": 1.9187055826187134, "learning_rate": 5.924429580467905e-06, "loss": 0.3471, "num_input_tokens_seen": 11059640, "step": 16850 }, { "epoch": 9.93808962264151, "grad_norm": 4.224221229553223, "learning_rate": 5.921901097713317e-06, "loss": 0.3146, "num_input_tokens_seen": 11062392, "step": 16855 }, { "epoch": 9.941037735849056, "grad_norm": 2.1697909832000732, "learning_rate": 5.919372370881442e-06, "loss": 0.489, "num_input_tokens_seen": 11065720, "step": 16860 }, { "epoch": 9.943985849056604, "grad_norm": 2.395853281021118, "learning_rate": 5.916843400641768e-06, "loss": 0.4421, "num_input_tokens_seen": 11069528, "step": 16865 }, { "epoch": 9.94693396226415, "grad_norm": 4.0244951248168945, "learning_rate": 5.914314187663851e-06, "loss": 0.4149, "num_input_tokens_seen": 11072056, "step": 16870 }, { "epoch": 9.949882075471699, "grad_norm": 1.8209600448608398, "learning_rate": 5.911784732617314e-06, "loss": 0.3276, "num_input_tokens_seen": 11074904, "step": 16875 }, { "epoch": 9.952830188679245, "grad_norm": 1.5675276517868042, "learning_rate": 5.90925503617184e-06, "loss": 0.3182, "num_input_tokens_seen": 11077912, "step": 16880 }, { "epoch": 9.955778301886792, "grad_norm": 1.6668412685394287, "learning_rate": 5.9067250989971745e-06, "loss": 0.4971, "num_input_tokens_seen": 11082104, "step": 16885 }, { "epoch": 9.95872641509434, "grad_norm": 3.091620683670044, "learning_rate": 5.904194921763133e-06, "loss": 0.4271, "num_input_tokens_seen": 11085304, "step": 16890 }, { "epoch": 9.961674528301886, "grad_norm": 3.3599324226379395, "learning_rate": 5.901664505139589e-06, "loss": 0.5222, "num_input_tokens_seen": 11088088, "step": 16895 }, { "epoch": 9.964622641509434, "grad_norm": 1.8796828985214233, "learning_rate": 5.8991338497964814e-06, "loss": 0.391, "num_input_tokens_seen": 11091416, "step": 16900 }, { "epoch": 9.96757075471698, "grad_norm": 1.7490973472595215, "learning_rate": 5.896602956403812e-06, "loss": 0.3003, "num_input_tokens_seen": 11094040, "step": 16905 }, { "epoch": 9.970518867924529, "grad_norm": 5.845962047576904, "learning_rate": 5.894071825631645e-06, "loss": 0.3656, "num_input_tokens_seen": 11097112, "step": 16910 }, { "epoch": 9.973466981132075, "grad_norm": 3.1568727493286133, "learning_rate": 5.891540458150109e-06, "loss": 0.3938, "num_input_tokens_seen": 11101368, "step": 16915 }, { "epoch": 9.976415094339622, "grad_norm": 2.1694955825805664, "learning_rate": 5.889008854629395e-06, "loss": 0.3435, "num_input_tokens_seen": 11105112, "step": 16920 }, { "epoch": 9.97936320754717, "grad_norm": 1.9519085884094238, "learning_rate": 5.886477015739754e-06, "loss": 0.3492, "num_input_tokens_seen": 11107672, "step": 16925 }, { "epoch": 9.982311320754716, "grad_norm": 1.5684754848480225, "learning_rate": 5.883944942151502e-06, "loss": 0.4233, "num_input_tokens_seen": 11111864, "step": 16930 }, { "epoch": 9.985259433962264, "grad_norm": 1.491429090499878, "learning_rate": 5.88141263453502e-06, "loss": 0.3249, "num_input_tokens_seen": 11115480, "step": 16935 }, { "epoch": 9.98820754716981, "grad_norm": 2.1752326488494873, "learning_rate": 5.878880093560744e-06, "loss": 0.5107, "num_input_tokens_seen": 11119288, "step": 16940 }, { "epoch": 9.991155660377359, "grad_norm": 1.6714144945144653, "learning_rate": 5.876347319899173e-06, "loss": 0.3821, "num_input_tokens_seen": 11122584, "step": 16945 }, { "epoch": 9.994103773584905, "grad_norm": 2.2964963912963867, "learning_rate": 5.873814314220874e-06, "loss": 0.4379, "num_input_tokens_seen": 11126264, "step": 16950 }, { "epoch": 9.997051886792454, "grad_norm": 2.040882110595703, "learning_rate": 5.871281077196469e-06, "loss": 0.4329, "num_input_tokens_seen": 11128728, "step": 16955 }, { "epoch": 10.0, "grad_norm": 4.213760852813721, "learning_rate": 5.868747609496643e-06, "loss": 0.415, "num_input_tokens_seen": 11130848, "step": 16960 }, { "epoch": 10.0, "eval_loss": 0.5237100124359131, "eval_runtime": 18.6287, "eval_samples_per_second": 91.042, "eval_steps_per_second": 22.761, "num_input_tokens_seen": 11130848, "step": 16960 }, { "epoch": 10.002948113207546, "grad_norm": 2.287257671356201, "learning_rate": 5.8662139117921435e-06, "loss": 0.3726, "num_input_tokens_seen": 11133888, "step": 16965 }, { "epoch": 10.005896226415095, "grad_norm": 2.663464069366455, "learning_rate": 5.8636799847537785e-06, "loss": 0.2553, "num_input_tokens_seen": 11139712, "step": 16970 }, { "epoch": 10.008844339622641, "grad_norm": 1.7380332946777344, "learning_rate": 5.861145829052415e-06, "loss": 0.4085, "num_input_tokens_seen": 11143552, "step": 16975 }, { "epoch": 10.01179245283019, "grad_norm": 1.9142603874206543, "learning_rate": 5.858611445358982e-06, "loss": 0.3916, "num_input_tokens_seen": 11146592, "step": 16980 }, { "epoch": 10.014740566037736, "grad_norm": 1.6322453022003174, "learning_rate": 5.856076834344468e-06, "loss": 0.2325, "num_input_tokens_seen": 11149440, "step": 16985 }, { "epoch": 10.017688679245284, "grad_norm": 1.7215757369995117, "learning_rate": 5.853541996679924e-06, "loss": 0.3724, "num_input_tokens_seen": 11152416, "step": 16990 }, { "epoch": 10.02063679245283, "grad_norm": 2.419384479522705, "learning_rate": 5.851006933036456e-06, "loss": 0.5383, "num_input_tokens_seen": 11156640, "step": 16995 }, { "epoch": 10.023584905660377, "grad_norm": 2.7226717472076416, "learning_rate": 5.848471644085236e-06, "loss": 0.4462, "num_input_tokens_seen": 11159296, "step": 17000 }, { "epoch": 10.026533018867925, "grad_norm": 3.595921516418457, "learning_rate": 5.84593613049749e-06, "loss": 0.4381, "num_input_tokens_seen": 11162272, "step": 17005 }, { "epoch": 10.029481132075471, "grad_norm": 5.285074710845947, "learning_rate": 5.843400392944509e-06, "loss": 0.5717, "num_input_tokens_seen": 11164608, "step": 17010 }, { "epoch": 10.03242924528302, "grad_norm": 3.042351245880127, "learning_rate": 5.840864432097639e-06, "loss": 0.5381, "num_input_tokens_seen": 11168352, "step": 17015 }, { "epoch": 10.035377358490566, "grad_norm": 3.1762492656707764, "learning_rate": 5.838328248628286e-06, "loss": 0.3676, "num_input_tokens_seen": 11171584, "step": 17020 }, { "epoch": 10.038325471698114, "grad_norm": 1.904537320137024, "learning_rate": 5.835791843207916e-06, "loss": 0.5234, "num_input_tokens_seen": 11174560, "step": 17025 }, { "epoch": 10.04127358490566, "grad_norm": 6.314684867858887, "learning_rate": 5.833255216508056e-06, "loss": 0.3438, "num_input_tokens_seen": 11177248, "step": 17030 }, { "epoch": 10.044221698113208, "grad_norm": 2.614004373550415, "learning_rate": 5.830718369200284e-06, "loss": 0.4516, "num_input_tokens_seen": 11180608, "step": 17035 }, { "epoch": 10.047169811320755, "grad_norm": 2.184997320175171, "learning_rate": 5.828181301956244e-06, "loss": 0.2947, "num_input_tokens_seen": 11184128, "step": 17040 }, { "epoch": 10.050117924528301, "grad_norm": 2.848257064819336, "learning_rate": 5.825644015447636e-06, "loss": 0.2833, "num_input_tokens_seen": 11187328, "step": 17045 }, { "epoch": 10.05306603773585, "grad_norm": 1.3011598587036133, "learning_rate": 5.823106510346216e-06, "loss": 0.3099, "num_input_tokens_seen": 11190432, "step": 17050 }, { "epoch": 10.056014150943396, "grad_norm": 1.6673575639724731, "learning_rate": 5.820568787323798e-06, "loss": 0.4428, "num_input_tokens_seen": 11194432, "step": 17055 }, { "epoch": 10.058962264150944, "grad_norm": 2.7247769832611084, "learning_rate": 5.818030847052258e-06, "loss": 0.4434, "num_input_tokens_seen": 11197280, "step": 17060 }, { "epoch": 10.06191037735849, "grad_norm": 2.5376574993133545, "learning_rate": 5.815492690203528e-06, "loss": 0.3663, "num_input_tokens_seen": 11199872, "step": 17065 }, { "epoch": 10.064858490566039, "grad_norm": 1.9055509567260742, "learning_rate": 5.812954317449591e-06, "loss": 0.3354, "num_input_tokens_seen": 11203584, "step": 17070 }, { "epoch": 10.067806603773585, "grad_norm": 2.0177454948425293, "learning_rate": 5.810415729462496e-06, "loss": 0.4532, "num_input_tokens_seen": 11207264, "step": 17075 }, { "epoch": 10.070754716981131, "grad_norm": 2.6126959323883057, "learning_rate": 5.807876926914344e-06, "loss": 0.3284, "num_input_tokens_seen": 11210784, "step": 17080 }, { "epoch": 10.07370283018868, "grad_norm": 3.6635117530822754, "learning_rate": 5.8053379104772935e-06, "loss": 0.4056, "num_input_tokens_seen": 11213280, "step": 17085 }, { "epoch": 10.076650943396226, "grad_norm": 3.45194673538208, "learning_rate": 5.802798680823562e-06, "loss": 0.4716, "num_input_tokens_seen": 11216576, "step": 17090 }, { "epoch": 10.079599056603774, "grad_norm": 2.507205009460449, "learning_rate": 5.8002592386254186e-06, "loss": 0.3108, "num_input_tokens_seen": 11219104, "step": 17095 }, { "epoch": 10.08254716981132, "grad_norm": 2.3954031467437744, "learning_rate": 5.797719584555192e-06, "loss": 0.2841, "num_input_tokens_seen": 11223168, "step": 17100 }, { "epoch": 10.085495283018869, "grad_norm": 1.9781039953231812, "learning_rate": 5.795179719285269e-06, "loss": 0.4072, "num_input_tokens_seen": 11227104, "step": 17105 }, { "epoch": 10.088443396226415, "grad_norm": 2.6994082927703857, "learning_rate": 5.792639643488086e-06, "loss": 0.3576, "num_input_tokens_seen": 11231616, "step": 17110 }, { "epoch": 10.091391509433961, "grad_norm": 1.8472552299499512, "learning_rate": 5.7900993578361434e-06, "loss": 0.301, "num_input_tokens_seen": 11234720, "step": 17115 }, { "epoch": 10.09433962264151, "grad_norm": 2.5158045291900635, "learning_rate": 5.7875588630019895e-06, "loss": 0.4063, "num_input_tokens_seen": 11238368, "step": 17120 }, { "epoch": 10.097287735849056, "grad_norm": 2.257936954498291, "learning_rate": 5.7850181596582335e-06, "loss": 0.4262, "num_input_tokens_seen": 11241312, "step": 17125 }, { "epoch": 10.100235849056604, "grad_norm": 3.3489346504211426, "learning_rate": 5.782477248477535e-06, "loss": 0.3631, "num_input_tokens_seen": 11243744, "step": 17130 }, { "epoch": 10.10318396226415, "grad_norm": 2.205156087875366, "learning_rate": 5.779936130132614e-06, "loss": 0.3631, "num_input_tokens_seen": 11247808, "step": 17135 }, { "epoch": 10.106132075471699, "grad_norm": 2.2506673336029053, "learning_rate": 5.777394805296242e-06, "loss": 0.3215, "num_input_tokens_seen": 11250432, "step": 17140 }, { "epoch": 10.109080188679245, "grad_norm": 2.1344430446624756, "learning_rate": 5.774853274641243e-06, "loss": 0.2663, "num_input_tokens_seen": 11253376, "step": 17145 }, { "epoch": 10.112028301886792, "grad_norm": 2.755187749862671, "learning_rate": 5.772311538840501e-06, "loss": 0.4222, "num_input_tokens_seen": 11256448, "step": 17150 }, { "epoch": 10.11497641509434, "grad_norm": 2.9691007137298584, "learning_rate": 5.76976959856695e-06, "loss": 0.3542, "num_input_tokens_seen": 11260096, "step": 17155 }, { "epoch": 10.117924528301886, "grad_norm": 5.33510160446167, "learning_rate": 5.767227454493582e-06, "loss": 0.3285, "num_input_tokens_seen": 11263296, "step": 17160 }, { "epoch": 10.120872641509434, "grad_norm": 3.149852752685547, "learning_rate": 5.764685107293436e-06, "loss": 0.3222, "num_input_tokens_seen": 11266272, "step": 17165 }, { "epoch": 10.12382075471698, "grad_norm": 2.177703857421875, "learning_rate": 5.762142557639614e-06, "loss": 0.2886, "num_input_tokens_seen": 11269824, "step": 17170 }, { "epoch": 10.126768867924529, "grad_norm": 2.6026151180267334, "learning_rate": 5.759599806205266e-06, "loss": 0.4412, "num_input_tokens_seen": 11272992, "step": 17175 }, { "epoch": 10.129716981132075, "grad_norm": 1.6627405881881714, "learning_rate": 5.757056853663594e-06, "loss": 0.3677, "num_input_tokens_seen": 11275840, "step": 17180 }, { "epoch": 10.132665094339623, "grad_norm": 9.84994888305664, "learning_rate": 5.754513700687858e-06, "loss": 0.3378, "num_input_tokens_seen": 11279296, "step": 17185 }, { "epoch": 10.13561320754717, "grad_norm": 3.1363580226898193, "learning_rate": 5.751970347951369e-06, "loss": 0.4047, "num_input_tokens_seen": 11282688, "step": 17190 }, { "epoch": 10.138561320754716, "grad_norm": 3.822051525115967, "learning_rate": 5.7494267961274875e-06, "loss": 0.4373, "num_input_tokens_seen": 11286880, "step": 17195 }, { "epoch": 10.141509433962264, "grad_norm": 3.356849431991577, "learning_rate": 5.746883045889633e-06, "loss": 0.3624, "num_input_tokens_seen": 11292384, "step": 17200 }, { "epoch": 10.14445754716981, "grad_norm": 2.1997807025909424, "learning_rate": 5.744339097911272e-06, "loss": 0.3278, "num_input_tokens_seen": 11296192, "step": 17205 }, { "epoch": 10.147405660377359, "grad_norm": 2.6483094692230225, "learning_rate": 5.741794952865928e-06, "loss": 0.3231, "num_input_tokens_seen": 11299584, "step": 17210 }, { "epoch": 10.150353773584905, "grad_norm": 2.6081998348236084, "learning_rate": 5.739250611427173e-06, "loss": 0.3598, "num_input_tokens_seen": 11302816, "step": 17215 }, { "epoch": 10.153301886792454, "grad_norm": 2.1866347789764404, "learning_rate": 5.736706074268633e-06, "loss": 0.3154, "num_input_tokens_seen": 11305760, "step": 17220 }, { "epoch": 10.15625, "grad_norm": 2.540930986404419, "learning_rate": 5.734161342063984e-06, "loss": 0.4007, "num_input_tokens_seen": 11309568, "step": 17225 }, { "epoch": 10.159198113207546, "grad_norm": 3.363173723220825, "learning_rate": 5.731616415486957e-06, "loss": 0.4269, "num_input_tokens_seen": 11312672, "step": 17230 }, { "epoch": 10.162146226415095, "grad_norm": 1.9613347053527832, "learning_rate": 5.72907129521133e-06, "loss": 0.4018, "num_input_tokens_seen": 11316960, "step": 17235 }, { "epoch": 10.165094339622641, "grad_norm": 1.2766114473342896, "learning_rate": 5.726525981910935e-06, "loss": 0.2702, "num_input_tokens_seen": 11320544, "step": 17240 }, { "epoch": 10.16804245283019, "grad_norm": 1.9110418558120728, "learning_rate": 5.723980476259658e-06, "loss": 0.3552, "num_input_tokens_seen": 11322976, "step": 17245 }, { "epoch": 10.170990566037736, "grad_norm": 1.5758239030838013, "learning_rate": 5.721434778931426e-06, "loss": 0.379, "num_input_tokens_seen": 11326144, "step": 17250 }, { "epoch": 10.173938679245284, "grad_norm": 1.8295018672943115, "learning_rate": 5.7188888906002284e-06, "loss": 0.2769, "num_input_tokens_seen": 11328896, "step": 17255 }, { "epoch": 10.17688679245283, "grad_norm": 3.7223060131073, "learning_rate": 5.716342811940098e-06, "loss": 0.3271, "num_input_tokens_seen": 11331840, "step": 17260 }, { "epoch": 10.179834905660377, "grad_norm": 2.7457339763641357, "learning_rate": 5.713796543625123e-06, "loss": 0.4374, "num_input_tokens_seen": 11334688, "step": 17265 }, { "epoch": 10.182783018867925, "grad_norm": 2.4161126613616943, "learning_rate": 5.711250086329435e-06, "loss": 0.378, "num_input_tokens_seen": 11338112, "step": 17270 }, { "epoch": 10.185731132075471, "grad_norm": 1.6731821298599243, "learning_rate": 5.7087034407272225e-06, "loss": 0.3524, "num_input_tokens_seen": 11340832, "step": 17275 }, { "epoch": 10.18867924528302, "grad_norm": 3.962085723876953, "learning_rate": 5.70615660749272e-06, "loss": 0.4954, "num_input_tokens_seen": 11344192, "step": 17280 }, { "epoch": 10.191627358490566, "grad_norm": 2.268817186355591, "learning_rate": 5.7036095873002106e-06, "loss": 0.2472, "num_input_tokens_seen": 11347392, "step": 17285 }, { "epoch": 10.194575471698114, "grad_norm": 2.951965808868408, "learning_rate": 5.701062380824032e-06, "loss": 0.3811, "num_input_tokens_seen": 11350432, "step": 17290 }, { "epoch": 10.19752358490566, "grad_norm": 1.8252220153808594, "learning_rate": 5.698514988738566e-06, "loss": 0.2759, "num_input_tokens_seen": 11353344, "step": 17295 }, { "epoch": 10.200471698113208, "grad_norm": 3.57769513130188, "learning_rate": 5.6959674117182465e-06, "loss": 0.3499, "num_input_tokens_seen": 11356544, "step": 17300 }, { "epoch": 10.203419811320755, "grad_norm": 2.784310817718506, "learning_rate": 5.693419650437554e-06, "loss": 0.3698, "num_input_tokens_seen": 11359584, "step": 17305 }, { "epoch": 10.206367924528301, "grad_norm": 2.8446292877197266, "learning_rate": 5.690871705571022e-06, "loss": 0.4189, "num_input_tokens_seen": 11363200, "step": 17310 }, { "epoch": 10.20931603773585, "grad_norm": 2.30488920211792, "learning_rate": 5.688323577793229e-06, "loss": 0.3311, "num_input_tokens_seen": 11365824, "step": 17315 }, { "epoch": 10.212264150943396, "grad_norm": 2.251605272293091, "learning_rate": 5.685775267778801e-06, "loss": 0.4515, "num_input_tokens_seen": 11369600, "step": 17320 }, { "epoch": 10.215212264150944, "grad_norm": 4.615743160247803, "learning_rate": 5.683226776202416e-06, "loss": 0.4136, "num_input_tokens_seen": 11372736, "step": 17325 }, { "epoch": 10.21816037735849, "grad_norm": 2.325037717819214, "learning_rate": 5.680678103738798e-06, "loss": 0.5021, "num_input_tokens_seen": 11375904, "step": 17330 }, { "epoch": 10.221108490566039, "grad_norm": 1.6468127965927124, "learning_rate": 5.678129251062717e-06, "loss": 0.438, "num_input_tokens_seen": 11379424, "step": 17335 }, { "epoch": 10.224056603773585, "grad_norm": 3.4688894748687744, "learning_rate": 5.675580218848995e-06, "loss": 0.4392, "num_input_tokens_seen": 11381728, "step": 17340 }, { "epoch": 10.227004716981131, "grad_norm": 1.7730430364608765, "learning_rate": 5.673031007772498e-06, "loss": 0.3403, "num_input_tokens_seen": 11385984, "step": 17345 }, { "epoch": 10.22995283018868, "grad_norm": 2.2969110012054443, "learning_rate": 5.670481618508141e-06, "loss": 0.4201, "num_input_tokens_seen": 11389024, "step": 17350 }, { "epoch": 10.232900943396226, "grad_norm": 2.160532236099243, "learning_rate": 5.667932051730887e-06, "loss": 0.288, "num_input_tokens_seen": 11392256, "step": 17355 }, { "epoch": 10.235849056603774, "grad_norm": 4.310451507568359, "learning_rate": 5.6653823081157434e-06, "loss": 0.4174, "num_input_tokens_seen": 11395328, "step": 17360 }, { "epoch": 10.23879716981132, "grad_norm": 2.7155609130859375, "learning_rate": 5.662832388337766e-06, "loss": 0.4798, "num_input_tokens_seen": 11398784, "step": 17365 }, { "epoch": 10.241745283018869, "grad_norm": 4.254199028015137, "learning_rate": 5.660282293072057e-06, "loss": 0.3053, "num_input_tokens_seen": 11401408, "step": 17370 }, { "epoch": 10.244693396226415, "grad_norm": 2.569195508956909, "learning_rate": 5.657732022993765e-06, "loss": 0.3347, "num_input_tokens_seen": 11404512, "step": 17375 }, { "epoch": 10.247641509433961, "grad_norm": 4.818870544433594, "learning_rate": 5.655181578778085e-06, "loss": 0.4706, "num_input_tokens_seen": 11407232, "step": 17380 }, { "epoch": 10.25058962264151, "grad_norm": 2.855987310409546, "learning_rate": 5.65263096110026e-06, "loss": 0.4424, "num_input_tokens_seen": 11411776, "step": 17385 }, { "epoch": 10.253537735849056, "grad_norm": 2.3390731811523438, "learning_rate": 5.650080170635573e-06, "loss": 0.3259, "num_input_tokens_seen": 11414240, "step": 17390 }, { "epoch": 10.256485849056604, "grad_norm": 1.3935083150863647, "learning_rate": 5.647529208059359e-06, "loss": 0.3557, "num_input_tokens_seen": 11417312, "step": 17395 }, { "epoch": 10.25943396226415, "grad_norm": 3.505833864212036, "learning_rate": 5.6449780740469985e-06, "loss": 0.2939, "num_input_tokens_seen": 11421088, "step": 17400 }, { "epoch": 10.262382075471699, "grad_norm": 1.7600125074386597, "learning_rate": 5.642426769273912e-06, "loss": 0.3733, "num_input_tokens_seen": 11426272, "step": 17405 }, { "epoch": 10.265330188679245, "grad_norm": 2.13626766204834, "learning_rate": 5.63987529441557e-06, "loss": 0.4584, "num_input_tokens_seen": 11429792, "step": 17410 }, { "epoch": 10.268278301886792, "grad_norm": 2.0722861289978027, "learning_rate": 5.637323650147487e-06, "loss": 0.4029, "num_input_tokens_seen": 11432160, "step": 17415 }, { "epoch": 10.27122641509434, "grad_norm": 1.843292236328125, "learning_rate": 5.63477183714522e-06, "loss": 0.2583, "num_input_tokens_seen": 11435136, "step": 17420 }, { "epoch": 10.274174528301886, "grad_norm": 4.5734333992004395, "learning_rate": 5.632219856084373e-06, "loss": 0.3845, "num_input_tokens_seen": 11438304, "step": 17425 }, { "epoch": 10.277122641509434, "grad_norm": 3.3145077228546143, "learning_rate": 5.6296677076405944e-06, "loss": 0.3822, "num_input_tokens_seen": 11442656, "step": 17430 }, { "epoch": 10.28007075471698, "grad_norm": 1.8816351890563965, "learning_rate": 5.627115392489578e-06, "loss": 0.3511, "num_input_tokens_seen": 11445248, "step": 17435 }, { "epoch": 10.283018867924529, "grad_norm": 16.141775131225586, "learning_rate": 5.624562911307058e-06, "loss": 0.4648, "num_input_tokens_seen": 11448288, "step": 17440 }, { "epoch": 10.285966981132075, "grad_norm": 2.2035443782806396, "learning_rate": 5.622010264768813e-06, "loss": 0.3224, "num_input_tokens_seen": 11450816, "step": 17445 }, { "epoch": 10.288915094339623, "grad_norm": 3.730288505554199, "learning_rate": 5.619457453550673e-06, "loss": 0.3642, "num_input_tokens_seen": 11453536, "step": 17450 }, { "epoch": 10.29186320754717, "grad_norm": 4.126858234405518, "learning_rate": 5.616904478328502e-06, "loss": 0.4165, "num_input_tokens_seen": 11456864, "step": 17455 }, { "epoch": 10.294811320754716, "grad_norm": 3.0200350284576416, "learning_rate": 5.6143513397782104e-06, "loss": 0.3426, "num_input_tokens_seen": 11460704, "step": 17460 }, { "epoch": 10.297759433962264, "grad_norm": 2.4012739658355713, "learning_rate": 5.611798038575755e-06, "loss": 0.3201, "num_input_tokens_seen": 11464224, "step": 17465 }, { "epoch": 10.30070754716981, "grad_norm": 2.6266696453094482, "learning_rate": 5.609244575397131e-06, "loss": 0.4536, "num_input_tokens_seen": 11468224, "step": 17470 }, { "epoch": 10.303655660377359, "grad_norm": 3.301006555557251, "learning_rate": 5.606690950918381e-06, "loss": 0.316, "num_input_tokens_seen": 11470912, "step": 17475 }, { "epoch": 10.306603773584905, "grad_norm": 1.9135061502456665, "learning_rate": 5.604137165815586e-06, "loss": 0.462, "num_input_tokens_seen": 11475104, "step": 17480 }, { "epoch": 10.309551886792454, "grad_norm": 3.7008817195892334, "learning_rate": 5.601583220764874e-06, "loss": 0.3441, "num_input_tokens_seen": 11477248, "step": 17485 }, { "epoch": 10.3125, "grad_norm": 2.502904176712036, "learning_rate": 5.599029116442409e-06, "loss": 0.3834, "num_input_tokens_seen": 11480256, "step": 17490 }, { "epoch": 10.315448113207546, "grad_norm": 3.3228962421417236, "learning_rate": 5.596474853524406e-06, "loss": 0.3346, "num_input_tokens_seen": 11482560, "step": 17495 }, { "epoch": 10.318396226415095, "grad_norm": 2.4679818153381348, "learning_rate": 5.593920432687115e-06, "loss": 0.3198, "num_input_tokens_seen": 11486336, "step": 17500 }, { "epoch": 10.321344339622641, "grad_norm": 3.6818981170654297, "learning_rate": 5.591365854606829e-06, "loss": 0.4283, "num_input_tokens_seen": 11488992, "step": 17505 }, { "epoch": 10.32429245283019, "grad_norm": 1.4096975326538086, "learning_rate": 5.588811119959885e-06, "loss": 0.3498, "num_input_tokens_seen": 11492096, "step": 17510 }, { "epoch": 10.327240566037736, "grad_norm": 2.3779914379119873, "learning_rate": 5.58625622942266e-06, "loss": 0.2729, "num_input_tokens_seen": 11495776, "step": 17515 }, { "epoch": 10.330188679245284, "grad_norm": 3.1025075912475586, "learning_rate": 5.58370118367157e-06, "loss": 0.4515, "num_input_tokens_seen": 11499328, "step": 17520 }, { "epoch": 10.33313679245283, "grad_norm": 2.146284818649292, "learning_rate": 5.581145983383077e-06, "loss": 0.4075, "num_input_tokens_seen": 11503072, "step": 17525 }, { "epoch": 10.336084905660377, "grad_norm": 2.5648984909057617, "learning_rate": 5.57859062923368e-06, "loss": 0.3711, "num_input_tokens_seen": 11506432, "step": 17530 }, { "epoch": 10.339033018867925, "grad_norm": 2.071563243865967, "learning_rate": 5.5760351218999194e-06, "loss": 0.3723, "num_input_tokens_seen": 11509088, "step": 17535 }, { "epoch": 10.341981132075471, "grad_norm": 3.902458906173706, "learning_rate": 5.573479462058379e-06, "loss": 0.406, "num_input_tokens_seen": 11512416, "step": 17540 }, { "epoch": 10.34492924528302, "grad_norm": 2.014582633972168, "learning_rate": 5.570923650385679e-06, "loss": 0.3593, "num_input_tokens_seen": 11515264, "step": 17545 }, { "epoch": 10.347877358490566, "grad_norm": 3.9252214431762695, "learning_rate": 5.568367687558481e-06, "loss": 0.5044, "num_input_tokens_seen": 11518624, "step": 17550 }, { "epoch": 10.350825471698114, "grad_norm": 4.615581035614014, "learning_rate": 5.56581157425349e-06, "loss": 0.2831, "num_input_tokens_seen": 11521600, "step": 17555 }, { "epoch": 10.35377358490566, "grad_norm": 1.5007507801055908, "learning_rate": 5.563255311147446e-06, "loss": 0.4454, "num_input_tokens_seen": 11525344, "step": 17560 }, { "epoch": 10.356721698113208, "grad_norm": 2.5592174530029297, "learning_rate": 5.560698898917129e-06, "loss": 0.3994, "num_input_tokens_seen": 11528192, "step": 17565 }, { "epoch": 10.359669811320755, "grad_norm": 3.7084200382232666, "learning_rate": 5.558142338239365e-06, "loss": 0.3304, "num_input_tokens_seen": 11531136, "step": 17570 }, { "epoch": 10.362617924528301, "grad_norm": 1.9414279460906982, "learning_rate": 5.55558562979101e-06, "loss": 0.3593, "num_input_tokens_seen": 11535968, "step": 17575 }, { "epoch": 10.36556603773585, "grad_norm": 3.190951108932495, "learning_rate": 5.553028774248964e-06, "loss": 0.3796, "num_input_tokens_seen": 11538912, "step": 17580 }, { "epoch": 10.368514150943396, "grad_norm": 2.0823934078216553, "learning_rate": 5.5504717722901665e-06, "loss": 0.4634, "num_input_tokens_seen": 11541312, "step": 17585 }, { "epoch": 10.371462264150944, "grad_norm": 2.4560816287994385, "learning_rate": 5.547914624591597e-06, "loss": 0.339, "num_input_tokens_seen": 11544384, "step": 17590 }, { "epoch": 10.37441037735849, "grad_norm": 2.787726402282715, "learning_rate": 5.545357331830269e-06, "loss": 0.3607, "num_input_tokens_seen": 11547968, "step": 17595 }, { "epoch": 10.377358490566039, "grad_norm": 1.8053663969039917, "learning_rate": 5.542799894683235e-06, "loss": 0.4361, "num_input_tokens_seen": 11551616, "step": 17600 }, { "epoch": 10.380306603773585, "grad_norm": 3.1877245903015137, "learning_rate": 5.540242313827591e-06, "loss": 0.4079, "num_input_tokens_seen": 11554816, "step": 17605 }, { "epoch": 10.383254716981131, "grad_norm": 3.308037757873535, "learning_rate": 5.537684589940466e-06, "loss": 0.4986, "num_input_tokens_seen": 11558016, "step": 17610 }, { "epoch": 10.38620283018868, "grad_norm": 2.9293322563171387, "learning_rate": 5.535126723699029e-06, "loss": 0.3363, "num_input_tokens_seen": 11561312, "step": 17615 }, { "epoch": 10.389150943396226, "grad_norm": 2.2650210857391357, "learning_rate": 5.532568715780485e-06, "loss": 0.3622, "num_input_tokens_seen": 11564864, "step": 17620 }, { "epoch": 10.392099056603774, "grad_norm": 1.84156334400177, "learning_rate": 5.530010566862077e-06, "loss": 0.3703, "num_input_tokens_seen": 11567648, "step": 17625 }, { "epoch": 10.39504716981132, "grad_norm": 1.7504929304122925, "learning_rate": 5.527452277621089e-06, "loss": 0.3438, "num_input_tokens_seen": 11570816, "step": 17630 }, { "epoch": 10.397995283018869, "grad_norm": 2.3863565921783447, "learning_rate": 5.524893848734837e-06, "loss": 0.3457, "num_input_tokens_seen": 11573440, "step": 17635 }, { "epoch": 10.400943396226415, "grad_norm": 4.2294816970825195, "learning_rate": 5.522335280880676e-06, "loss": 0.4709, "num_input_tokens_seen": 11577792, "step": 17640 }, { "epoch": 10.403891509433961, "grad_norm": 1.2147821187973022, "learning_rate": 5.519776574735999e-06, "loss": 0.4382, "num_input_tokens_seen": 11581280, "step": 17645 }, { "epoch": 10.40683962264151, "grad_norm": 3.092496395111084, "learning_rate": 5.5172177309782325e-06, "loss": 0.2604, "num_input_tokens_seen": 11585504, "step": 17650 }, { "epoch": 10.409787735849056, "grad_norm": 2.6912453174591064, "learning_rate": 5.514658750284844e-06, "loss": 0.3876, "num_input_tokens_seen": 11588064, "step": 17655 }, { "epoch": 10.412735849056604, "grad_norm": 4.712058067321777, "learning_rate": 5.512099633333332e-06, "loss": 0.4397, "num_input_tokens_seen": 11592000, "step": 17660 }, { "epoch": 10.41568396226415, "grad_norm": 2.5576541423797607, "learning_rate": 5.509540380801236e-06, "loss": 0.342, "num_input_tokens_seen": 11594912, "step": 17665 }, { "epoch": 10.418632075471699, "grad_norm": 3.486462116241455, "learning_rate": 5.506980993366129e-06, "loss": 0.3659, "num_input_tokens_seen": 11598624, "step": 17670 }, { "epoch": 10.421580188679245, "grad_norm": 3.042754650115967, "learning_rate": 5.504421471705616e-06, "loss": 0.4628, "num_input_tokens_seen": 11601888, "step": 17675 }, { "epoch": 10.424528301886792, "grad_norm": 5.829019546508789, "learning_rate": 5.501861816497346e-06, "loss": 0.4849, "num_input_tokens_seen": 11604512, "step": 17680 }, { "epoch": 10.42747641509434, "grad_norm": 2.2586801052093506, "learning_rate": 5.499302028418998e-06, "loss": 0.6647, "num_input_tokens_seen": 11607936, "step": 17685 }, { "epoch": 10.430424528301886, "grad_norm": 1.9694805145263672, "learning_rate": 5.496742108148285e-06, "loss": 0.3611, "num_input_tokens_seen": 11611744, "step": 17690 }, { "epoch": 10.433372641509434, "grad_norm": 5.384252071380615, "learning_rate": 5.494182056362959e-06, "loss": 0.4987, "num_input_tokens_seen": 11615040, "step": 17695 }, { "epoch": 10.43632075471698, "grad_norm": 2.196399450302124, "learning_rate": 5.491621873740804e-06, "loss": 0.373, "num_input_tokens_seen": 11617920, "step": 17700 }, { "epoch": 10.439268867924529, "grad_norm": 2.6987321376800537, "learning_rate": 5.4890615609596375e-06, "loss": 0.5822, "num_input_tokens_seen": 11621024, "step": 17705 }, { "epoch": 10.442216981132075, "grad_norm": 1.7639427185058594, "learning_rate": 5.486501118697317e-06, "loss": 0.3839, "num_input_tokens_seen": 11625248, "step": 17710 }, { "epoch": 10.445165094339623, "grad_norm": 1.9509416818618774, "learning_rate": 5.483940547631727e-06, "loss": 0.4157, "num_input_tokens_seen": 11629152, "step": 17715 }, { "epoch": 10.44811320754717, "grad_norm": 3.505030870437622, "learning_rate": 5.481379848440792e-06, "loss": 0.2907, "num_input_tokens_seen": 11631840, "step": 17720 }, { "epoch": 10.451061320754716, "grad_norm": 3.1687161922454834, "learning_rate": 5.478819021802468e-06, "loss": 0.3364, "num_input_tokens_seen": 11634624, "step": 17725 }, { "epoch": 10.454009433962264, "grad_norm": 1.8249472379684448, "learning_rate": 5.476258068394743e-06, "loss": 0.3934, "num_input_tokens_seen": 11637376, "step": 17730 }, { "epoch": 10.45695754716981, "grad_norm": 2.280449151992798, "learning_rate": 5.473696988895644e-06, "loss": 0.381, "num_input_tokens_seen": 11639872, "step": 17735 }, { "epoch": 10.459905660377359, "grad_norm": 3.2426559925079346, "learning_rate": 5.471135783983224e-06, "loss": 0.3915, "num_input_tokens_seen": 11642656, "step": 17740 }, { "epoch": 10.462853773584905, "grad_norm": 2.4496898651123047, "learning_rate": 5.4685744543355745e-06, "loss": 0.3675, "num_input_tokens_seen": 11650304, "step": 17745 }, { "epoch": 10.465801886792454, "grad_norm": 2.510732412338257, "learning_rate": 5.466013000630819e-06, "loss": 0.2943, "num_input_tokens_seen": 11653152, "step": 17750 }, { "epoch": 10.46875, "grad_norm": 4.571321487426758, "learning_rate": 5.463451423547114e-06, "loss": 0.3575, "num_input_tokens_seen": 11655776, "step": 17755 }, { "epoch": 10.471698113207546, "grad_norm": 2.338484048843384, "learning_rate": 5.460889723762647e-06, "loss": 0.2569, "num_input_tokens_seen": 11658496, "step": 17760 }, { "epoch": 10.474646226415095, "grad_norm": 3.043275833129883, "learning_rate": 5.458327901955639e-06, "loss": 0.337, "num_input_tokens_seen": 11661632, "step": 17765 }, { "epoch": 10.477594339622641, "grad_norm": 2.6607048511505127, "learning_rate": 5.455765958804344e-06, "loss": 0.2631, "num_input_tokens_seen": 11665696, "step": 17770 }, { "epoch": 10.48054245283019, "grad_norm": 5.000323295593262, "learning_rate": 5.4532038949870455e-06, "loss": 0.303, "num_input_tokens_seen": 11669120, "step": 17775 }, { "epoch": 10.483490566037736, "grad_norm": 1.9114824533462524, "learning_rate": 5.450641711182066e-06, "loss": 0.2926, "num_input_tokens_seen": 11673728, "step": 17780 }, { "epoch": 10.486438679245284, "grad_norm": 5.805638313293457, "learning_rate": 5.448079408067748e-06, "loss": 0.5006, "num_input_tokens_seen": 11675744, "step": 17785 }, { "epoch": 10.48938679245283, "grad_norm": 3.6648104190826416, "learning_rate": 5.4455169863224775e-06, "loss": 0.323, "num_input_tokens_seen": 11678688, "step": 17790 }, { "epoch": 10.492334905660377, "grad_norm": 3.534059762954712, "learning_rate": 5.442954446624664e-06, "loss": 0.4218, "num_input_tokens_seen": 11682272, "step": 17795 }, { "epoch": 10.495283018867925, "grad_norm": 3.2291202545166016, "learning_rate": 5.440391789652752e-06, "loss": 0.3678, "num_input_tokens_seen": 11685536, "step": 17800 }, { "epoch": 10.498231132075471, "grad_norm": 3.8044700622558594, "learning_rate": 5.437829016085216e-06, "loss": 0.3054, "num_input_tokens_seen": 11688384, "step": 17805 }, { "epoch": 10.50117924528302, "grad_norm": 1.3955873250961304, "learning_rate": 5.435266126600561e-06, "loss": 0.3681, "num_input_tokens_seen": 11691968, "step": 17810 }, { "epoch": 10.504127358490566, "grad_norm": 1.4557307958602905, "learning_rate": 5.4327031218773215e-06, "loss": 0.3636, "num_input_tokens_seen": 11695008, "step": 17815 }, { "epoch": 10.507075471698114, "grad_norm": 2.916255235671997, "learning_rate": 5.430140002594067e-06, "loss": 0.4505, "num_input_tokens_seen": 11697856, "step": 17820 }, { "epoch": 10.51002358490566, "grad_norm": 4.390948295593262, "learning_rate": 5.4275767694293934e-06, "loss": 0.3478, "num_input_tokens_seen": 11700992, "step": 17825 }, { "epoch": 10.512971698113208, "grad_norm": 2.340970754623413, "learning_rate": 5.425013423061926e-06, "loss": 0.3055, "num_input_tokens_seen": 11703680, "step": 17830 }, { "epoch": 10.515919811320755, "grad_norm": 3.3250718116760254, "learning_rate": 5.422449964170324e-06, "loss": 0.3469, "num_input_tokens_seen": 11706720, "step": 17835 }, { "epoch": 10.518867924528301, "grad_norm": 2.8987677097320557, "learning_rate": 5.419886393433275e-06, "loss": 0.4413, "num_input_tokens_seen": 11710272, "step": 17840 }, { "epoch": 10.52181603773585, "grad_norm": 2.064582347869873, "learning_rate": 5.417322711529491e-06, "loss": 0.2892, "num_input_tokens_seen": 11713344, "step": 17845 }, { "epoch": 10.524764150943396, "grad_norm": 2.4439070224761963, "learning_rate": 5.4147589191377224e-06, "loss": 0.3113, "num_input_tokens_seen": 11716640, "step": 17850 }, { "epoch": 10.527712264150944, "grad_norm": 2.171205759048462, "learning_rate": 5.412195016936742e-06, "loss": 0.3852, "num_input_tokens_seen": 11720544, "step": 17855 }, { "epoch": 10.53066037735849, "grad_norm": 2.8582887649536133, "learning_rate": 5.409631005605354e-06, "loss": 0.3198, "num_input_tokens_seen": 11723328, "step": 17860 }, { "epoch": 10.533608490566039, "grad_norm": 3.4580917358398438, "learning_rate": 5.407066885822391e-06, "loss": 0.4606, "num_input_tokens_seen": 11726464, "step": 17865 }, { "epoch": 10.536556603773585, "grad_norm": 1.6413919925689697, "learning_rate": 5.404502658266717e-06, "loss": 0.3659, "num_input_tokens_seen": 11729952, "step": 17870 }, { "epoch": 10.539504716981131, "grad_norm": 1.1685482263565063, "learning_rate": 5.4019383236172195e-06, "loss": 0.3186, "num_input_tokens_seen": 11734144, "step": 17875 }, { "epoch": 10.54245283018868, "grad_norm": 4.12357234954834, "learning_rate": 5.39937388255282e-06, "loss": 0.3874, "num_input_tokens_seen": 11736576, "step": 17880 }, { "epoch": 10.545400943396226, "grad_norm": 1.7040752172470093, "learning_rate": 5.3968093357524645e-06, "loss": 0.4079, "num_input_tokens_seen": 11740448, "step": 17885 }, { "epoch": 10.548349056603774, "grad_norm": 1.5262701511383057, "learning_rate": 5.3942446838951245e-06, "loss": 0.3688, "num_input_tokens_seen": 11743488, "step": 17890 }, { "epoch": 10.55129716981132, "grad_norm": 2.1222779750823975, "learning_rate": 5.3916799276598074e-06, "loss": 0.3793, "num_input_tokens_seen": 11746848, "step": 17895 }, { "epoch": 10.554245283018869, "grad_norm": 1.7760231494903564, "learning_rate": 5.3891150677255425e-06, "loss": 0.306, "num_input_tokens_seen": 11750080, "step": 17900 }, { "epoch": 10.557193396226415, "grad_norm": 1.344353437423706, "learning_rate": 5.386550104771384e-06, "loss": 0.4013, "num_input_tokens_seen": 11753248, "step": 17905 }, { "epoch": 10.560141509433961, "grad_norm": 2.9781906604766846, "learning_rate": 5.3839850394764205e-06, "loss": 0.4558, "num_input_tokens_seen": 11756544, "step": 17910 }, { "epoch": 10.56308962264151, "grad_norm": 4.477484703063965, "learning_rate": 5.381419872519763e-06, "loss": 0.4732, "num_input_tokens_seen": 11761856, "step": 17915 }, { "epoch": 10.566037735849056, "grad_norm": 2.6330950260162354, "learning_rate": 5.378854604580549e-06, "loss": 0.3914, "num_input_tokens_seen": 11764576, "step": 17920 }, { "epoch": 10.568985849056604, "grad_norm": 2.2008609771728516, "learning_rate": 5.376289236337946e-06, "loss": 0.2675, "num_input_tokens_seen": 11767360, "step": 17925 }, { "epoch": 10.57193396226415, "grad_norm": 2.0539114475250244, "learning_rate": 5.373723768471147e-06, "loss": 0.3896, "num_input_tokens_seen": 11771072, "step": 17930 }, { "epoch": 10.574882075471699, "grad_norm": 6.646390438079834, "learning_rate": 5.37115820165937e-06, "loss": 0.4195, "num_input_tokens_seen": 11773952, "step": 17935 }, { "epoch": 10.577830188679245, "grad_norm": 3.1991701126098633, "learning_rate": 5.368592536581858e-06, "loss": 0.3337, "num_input_tokens_seen": 11776672, "step": 17940 }, { "epoch": 10.580778301886792, "grad_norm": 4.00484037399292, "learning_rate": 5.366026773917885e-06, "loss": 0.3548, "num_input_tokens_seen": 11779520, "step": 17945 }, { "epoch": 10.58372641509434, "grad_norm": 4.27842378616333, "learning_rate": 5.363460914346746e-06, "loss": 0.3078, "num_input_tokens_seen": 11782880, "step": 17950 }, { "epoch": 10.586674528301886, "grad_norm": 2.611039876937866, "learning_rate": 5.360894958547762e-06, "loss": 0.4735, "num_input_tokens_seen": 11787360, "step": 17955 }, { "epoch": 10.589622641509434, "grad_norm": 3.389627456665039, "learning_rate": 5.358328907200284e-06, "loss": 0.4055, "num_input_tokens_seen": 11789824, "step": 17960 }, { "epoch": 10.59257075471698, "grad_norm": 3.3236358165740967, "learning_rate": 5.355762760983682e-06, "loss": 0.3877, "num_input_tokens_seen": 11792896, "step": 17965 }, { "epoch": 10.595518867924529, "grad_norm": 2.507995843887329, "learning_rate": 5.353196520577356e-06, "loss": 0.4056, "num_input_tokens_seen": 11796064, "step": 17970 }, { "epoch": 10.598466981132075, "grad_norm": 2.412907361984253, "learning_rate": 5.35063018666073e-06, "loss": 0.4184, "num_input_tokens_seen": 11799424, "step": 17975 }, { "epoch": 10.601415094339622, "grad_norm": 3.680333375930786, "learning_rate": 5.3480637599132515e-06, "loss": 0.4002, "num_input_tokens_seen": 11802016, "step": 17980 }, { "epoch": 10.60436320754717, "grad_norm": 3.8464457988739014, "learning_rate": 5.34549724101439e-06, "loss": 0.362, "num_input_tokens_seen": 11805248, "step": 17985 }, { "epoch": 10.607311320754716, "grad_norm": 2.6529700756073, "learning_rate": 5.342930630643646e-06, "loss": 0.5419, "num_input_tokens_seen": 11808256, "step": 17990 }, { "epoch": 10.610259433962264, "grad_norm": 2.8876729011535645, "learning_rate": 5.340363929480541e-06, "loss": 0.3264, "num_input_tokens_seen": 11812320, "step": 17995 }, { "epoch": 10.61320754716981, "grad_norm": 3.2595889568328857, "learning_rate": 5.3377971382046164e-06, "loss": 0.3946, "num_input_tokens_seen": 11816256, "step": 18000 }, { "epoch": 10.616155660377359, "grad_norm": 2.8682949542999268, "learning_rate": 5.335230257495446e-06, "loss": 0.4138, "num_input_tokens_seen": 11819712, "step": 18005 }, { "epoch": 10.619103773584905, "grad_norm": 1.81500244140625, "learning_rate": 5.3326632880326205e-06, "loss": 0.4464, "num_input_tokens_seen": 11823040, "step": 18010 }, { "epoch": 10.622051886792454, "grad_norm": 3.238943576812744, "learning_rate": 5.3300962304957515e-06, "loss": 0.4035, "num_input_tokens_seen": 11826496, "step": 18015 }, { "epoch": 10.625, "grad_norm": 1.8997316360473633, "learning_rate": 5.327529085564487e-06, "loss": 0.2682, "num_input_tokens_seen": 11829920, "step": 18020 }, { "epoch": 10.627948113207546, "grad_norm": 2.247060537338257, "learning_rate": 5.324961853918485e-06, "loss": 0.3292, "num_input_tokens_seen": 11833440, "step": 18025 }, { "epoch": 10.630896226415095, "grad_norm": 3.336592197418213, "learning_rate": 5.32239453623743e-06, "loss": 0.4349, "num_input_tokens_seen": 11836192, "step": 18030 }, { "epoch": 10.633844339622641, "grad_norm": 3.3881192207336426, "learning_rate": 5.3198271332010335e-06, "loss": 0.3802, "num_input_tokens_seen": 11839328, "step": 18035 }, { "epoch": 10.63679245283019, "grad_norm": 2.742424488067627, "learning_rate": 5.317259645489024e-06, "loss": 0.4198, "num_input_tokens_seen": 11843008, "step": 18040 }, { "epoch": 10.639740566037736, "grad_norm": 2.711759567260742, "learning_rate": 5.314692073781157e-06, "loss": 0.5907, "num_input_tokens_seen": 11845856, "step": 18045 }, { "epoch": 10.642688679245284, "grad_norm": 8.83246898651123, "learning_rate": 5.312124418757207e-06, "loss": 0.3094, "num_input_tokens_seen": 11848544, "step": 18050 }, { "epoch": 10.64563679245283, "grad_norm": 2.1332294940948486, "learning_rate": 5.309556681096972e-06, "loss": 0.3724, "num_input_tokens_seen": 11852736, "step": 18055 }, { "epoch": 10.648584905660378, "grad_norm": 4.31043815612793, "learning_rate": 5.306988861480271e-06, "loss": 0.475, "num_input_tokens_seen": 11855552, "step": 18060 }, { "epoch": 10.651533018867925, "grad_norm": 1.5277680158615112, "learning_rate": 5.304420960586946e-06, "loss": 0.3013, "num_input_tokens_seen": 11859040, "step": 18065 }, { "epoch": 10.654481132075471, "grad_norm": 3.1668481826782227, "learning_rate": 5.3018529790968606e-06, "loss": 0.3369, "num_input_tokens_seen": 11863392, "step": 18070 }, { "epoch": 10.65742924528302, "grad_norm": 2.0263686180114746, "learning_rate": 5.299284917689898e-06, "loss": 0.3015, "num_input_tokens_seen": 11866528, "step": 18075 }, { "epoch": 10.660377358490566, "grad_norm": 1.8386552333831787, "learning_rate": 5.296716777045962e-06, "loss": 0.3834, "num_input_tokens_seen": 11870240, "step": 18080 }, { "epoch": 10.663325471698114, "grad_norm": 2.7866628170013428, "learning_rate": 5.294148557844983e-06, "loss": 0.4158, "num_input_tokens_seen": 11872640, "step": 18085 }, { "epoch": 10.66627358490566, "grad_norm": 3.5673770904541016, "learning_rate": 5.291580260766904e-06, "loss": 0.3014, "num_input_tokens_seen": 11875648, "step": 18090 }, { "epoch": 10.669221698113208, "grad_norm": 2.309857130050659, "learning_rate": 5.289011886491694e-06, "loss": 0.4155, "num_input_tokens_seen": 11878976, "step": 18095 }, { "epoch": 10.672169811320755, "grad_norm": 2.9957950115203857, "learning_rate": 5.286443435699342e-06, "loss": 0.4066, "num_input_tokens_seen": 11881408, "step": 18100 }, { "epoch": 10.675117924528301, "grad_norm": 11.59062671661377, "learning_rate": 5.283874909069855e-06, "loss": 0.4475, "num_input_tokens_seen": 11884352, "step": 18105 }, { "epoch": 10.67806603773585, "grad_norm": 2.9237773418426514, "learning_rate": 5.281306307283263e-06, "loss": 0.3789, "num_input_tokens_seen": 11887328, "step": 18110 }, { "epoch": 10.681014150943396, "grad_norm": 3.2383716106414795, "learning_rate": 5.2787376310196145e-06, "loss": 0.4199, "num_input_tokens_seen": 11890080, "step": 18115 }, { "epoch": 10.683962264150944, "grad_norm": 1.5078654289245605, "learning_rate": 5.276168880958977e-06, "loss": 0.3685, "num_input_tokens_seen": 11893472, "step": 18120 }, { "epoch": 10.68691037735849, "grad_norm": 3.1274468898773193, "learning_rate": 5.273600057781437e-06, "loss": 0.3587, "num_input_tokens_seen": 11896704, "step": 18125 }, { "epoch": 10.689858490566039, "grad_norm": 1.7222368717193604, "learning_rate": 5.271031162167103e-06, "loss": 0.3778, "num_input_tokens_seen": 11900096, "step": 18130 }, { "epoch": 10.692806603773585, "grad_norm": 2.014012098312378, "learning_rate": 5.268462194796101e-06, "loss": 0.4953, "num_input_tokens_seen": 11903552, "step": 18135 }, { "epoch": 10.695754716981131, "grad_norm": 2.223637819290161, "learning_rate": 5.265893156348576e-06, "loss": 0.3634, "num_input_tokens_seen": 11907040, "step": 18140 }, { "epoch": 10.69870283018868, "grad_norm": 2.625415325164795, "learning_rate": 5.2633240475046925e-06, "loss": 0.4827, "num_input_tokens_seen": 11910048, "step": 18145 }, { "epoch": 10.701650943396226, "grad_norm": 2.8661293983459473, "learning_rate": 5.2607548689446305e-06, "loss": 0.2888, "num_input_tokens_seen": 11913376, "step": 18150 }, { "epoch": 10.704599056603774, "grad_norm": 1.6829744577407837, "learning_rate": 5.258185621348595e-06, "loss": 0.3683, "num_input_tokens_seen": 11917056, "step": 18155 }, { "epoch": 10.70754716981132, "grad_norm": 3.9572575092315674, "learning_rate": 5.255616305396801e-06, "loss": 0.3156, "num_input_tokens_seen": 11921216, "step": 18160 }, { "epoch": 10.710495283018869, "grad_norm": 3.552994728088379, "learning_rate": 5.253046921769491e-06, "loss": 0.5464, "num_input_tokens_seen": 11924320, "step": 18165 }, { "epoch": 10.713443396226415, "grad_norm": 3.095008134841919, "learning_rate": 5.250477471146916e-06, "loss": 0.3555, "num_input_tokens_seen": 11929088, "step": 18170 }, { "epoch": 10.716391509433961, "grad_norm": 2.7182435989379883, "learning_rate": 5.2479079542093535e-06, "loss": 0.406, "num_input_tokens_seen": 11932832, "step": 18175 }, { "epoch": 10.71933962264151, "grad_norm": 2.962261438369751, "learning_rate": 5.245338371637091e-06, "loss": 0.5151, "num_input_tokens_seen": 11935488, "step": 18180 }, { "epoch": 10.722287735849056, "grad_norm": 1.881032943725586, "learning_rate": 5.242768724110437e-06, "loss": 0.4818, "num_input_tokens_seen": 11942400, "step": 18185 }, { "epoch": 10.725235849056604, "grad_norm": 3.124351739883423, "learning_rate": 5.240199012309717e-06, "loss": 0.5526, "num_input_tokens_seen": 11945312, "step": 18190 }, { "epoch": 10.72818396226415, "grad_norm": 2.200315475463867, "learning_rate": 5.237629236915273e-06, "loss": 0.2852, "num_input_tokens_seen": 11949216, "step": 18195 }, { "epoch": 10.731132075471699, "grad_norm": 1.8141921758651733, "learning_rate": 5.235059398607464e-06, "loss": 0.386, "num_input_tokens_seen": 11952608, "step": 18200 }, { "epoch": 10.734080188679245, "grad_norm": 5.213189601898193, "learning_rate": 5.232489498066665e-06, "loss": 0.4155, "num_input_tokens_seen": 11955040, "step": 18205 }, { "epoch": 10.737028301886792, "grad_norm": 2.477965831756592, "learning_rate": 5.229919535973272e-06, "loss": 0.3722, "num_input_tokens_seen": 11958464, "step": 18210 }, { "epoch": 10.73997641509434, "grad_norm": 1.5480906963348389, "learning_rate": 5.2273495130076905e-06, "loss": 0.3257, "num_input_tokens_seen": 11962464, "step": 18215 }, { "epoch": 10.742924528301886, "grad_norm": 3.0602664947509766, "learning_rate": 5.224779429850344e-06, "loss": 0.3003, "num_input_tokens_seen": 11965376, "step": 18220 }, { "epoch": 10.745872641509434, "grad_norm": 2.2576541900634766, "learning_rate": 5.222209287181677e-06, "loss": 0.3824, "num_input_tokens_seen": 11968672, "step": 18225 }, { "epoch": 10.74882075471698, "grad_norm": 2.7666196823120117, "learning_rate": 5.219639085682142e-06, "loss": 0.3551, "num_input_tokens_seen": 11971488, "step": 18230 }, { "epoch": 10.751768867924529, "grad_norm": 2.6421895027160645, "learning_rate": 5.2170688260322124e-06, "loss": 0.3594, "num_input_tokens_seen": 11975168, "step": 18235 }, { "epoch": 10.754716981132075, "grad_norm": 1.8277153968811035, "learning_rate": 5.214498508912376e-06, "loss": 0.3663, "num_input_tokens_seen": 11978368, "step": 18240 }, { "epoch": 10.757665094339622, "grad_norm": 3.1835272312164307, "learning_rate": 5.211928135003135e-06, "loss": 0.3887, "num_input_tokens_seen": 11981984, "step": 18245 }, { "epoch": 10.76061320754717, "grad_norm": 3.052665948867798, "learning_rate": 5.209357704985007e-06, "loss": 0.4546, "num_input_tokens_seen": 11984736, "step": 18250 }, { "epoch": 10.763561320754716, "grad_norm": 1.7890002727508545, "learning_rate": 5.206787219538524e-06, "loss": 0.2582, "num_input_tokens_seen": 11987456, "step": 18255 }, { "epoch": 10.766509433962264, "grad_norm": 2.184941291809082, "learning_rate": 5.204216679344234e-06, "loss": 0.3486, "num_input_tokens_seen": 11991296, "step": 18260 }, { "epoch": 10.76945754716981, "grad_norm": 2.074272394180298, "learning_rate": 5.201646085082696e-06, "loss": 0.3847, "num_input_tokens_seen": 11995104, "step": 18265 }, { "epoch": 10.772405660377359, "grad_norm": 1.851378321647644, "learning_rate": 5.199075437434491e-06, "loss": 0.3801, "num_input_tokens_seen": 11998624, "step": 18270 }, { "epoch": 10.775353773584905, "grad_norm": 2.9253528118133545, "learning_rate": 5.1965047370802046e-06, "loss": 0.3454, "num_input_tokens_seen": 12001280, "step": 18275 }, { "epoch": 10.778301886792454, "grad_norm": 2.6923818588256836, "learning_rate": 5.193933984700441e-06, "loss": 0.3934, "num_input_tokens_seen": 12004384, "step": 18280 }, { "epoch": 10.78125, "grad_norm": 3.3366949558258057, "learning_rate": 5.1913631809758216e-06, "loss": 0.4098, "num_input_tokens_seen": 12007072, "step": 18285 }, { "epoch": 10.784198113207546, "grad_norm": 3.192009449005127, "learning_rate": 5.188792326586973e-06, "loss": 0.4224, "num_input_tokens_seen": 12010976, "step": 18290 }, { "epoch": 10.787146226415095, "grad_norm": 2.0112922191619873, "learning_rate": 5.186221422214544e-06, "loss": 0.4452, "num_input_tokens_seen": 12013824, "step": 18295 }, { "epoch": 10.790094339622641, "grad_norm": 2.225877523422241, "learning_rate": 5.1836504685391885e-06, "loss": 0.3872, "num_input_tokens_seen": 12017440, "step": 18300 }, { "epoch": 10.79304245283019, "grad_norm": 3.5018303394317627, "learning_rate": 5.181079466241582e-06, "loss": 0.3931, "num_input_tokens_seen": 12021568, "step": 18305 }, { "epoch": 10.795990566037736, "grad_norm": 2.626007318496704, "learning_rate": 5.178508416002406e-06, "loss": 0.2608, "num_input_tokens_seen": 12024192, "step": 18310 }, { "epoch": 10.798938679245284, "grad_norm": 1.3511990308761597, "learning_rate": 5.175937318502357e-06, "loss": 0.3207, "num_input_tokens_seen": 12027936, "step": 18315 }, { "epoch": 10.80188679245283, "grad_norm": 2.5027737617492676, "learning_rate": 5.173366174422147e-06, "loss": 0.4611, "num_input_tokens_seen": 12031808, "step": 18320 }, { "epoch": 10.804834905660378, "grad_norm": 3.615351438522339, "learning_rate": 5.170794984442492e-06, "loss": 0.4295, "num_input_tokens_seen": 12034656, "step": 18325 }, { "epoch": 10.807783018867925, "grad_norm": 2.4128968715667725, "learning_rate": 5.16822374924413e-06, "loss": 0.3545, "num_input_tokens_seen": 12038240, "step": 18330 }, { "epoch": 10.810731132075471, "grad_norm": 2.18862247467041, "learning_rate": 5.165652469507806e-06, "loss": 0.3584, "num_input_tokens_seen": 12042272, "step": 18335 }, { "epoch": 10.81367924528302, "grad_norm": 2.0019888877868652, "learning_rate": 5.163081145914276e-06, "loss": 0.3364, "num_input_tokens_seen": 12045248, "step": 18340 }, { "epoch": 10.816627358490566, "grad_norm": 2.2972142696380615, "learning_rate": 5.160509779144311e-06, "loss": 0.3567, "num_input_tokens_seen": 12048096, "step": 18345 }, { "epoch": 10.819575471698114, "grad_norm": 2.0611014366149902, "learning_rate": 5.157938369878688e-06, "loss": 0.3657, "num_input_tokens_seen": 12051392, "step": 18350 }, { "epoch": 10.82252358490566, "grad_norm": 3.5758090019226074, "learning_rate": 5.155366918798203e-06, "loss": 0.4409, "num_input_tokens_seen": 12054112, "step": 18355 }, { "epoch": 10.825471698113208, "grad_norm": 2.83524489402771, "learning_rate": 5.152795426583654e-06, "loss": 0.3697, "num_input_tokens_seen": 12057312, "step": 18360 }, { "epoch": 10.828419811320755, "grad_norm": 2.1920831203460693, "learning_rate": 5.15022389391586e-06, "loss": 0.3362, "num_input_tokens_seen": 12060736, "step": 18365 }, { "epoch": 10.831367924528301, "grad_norm": 3.5017640590667725, "learning_rate": 5.147652321475642e-06, "loss": 0.6282, "num_input_tokens_seen": 12063968, "step": 18370 }, { "epoch": 10.83431603773585, "grad_norm": 4.739803791046143, "learning_rate": 5.145080709943835e-06, "loss": 0.3468, "num_input_tokens_seen": 12066848, "step": 18375 }, { "epoch": 10.837264150943396, "grad_norm": 4.539663791656494, "learning_rate": 5.142509060001285e-06, "loss": 0.5837, "num_input_tokens_seen": 12069632, "step": 18380 }, { "epoch": 10.840212264150944, "grad_norm": 4.6205668449401855, "learning_rate": 5.139937372328847e-06, "loss": 0.4575, "num_input_tokens_seen": 12073088, "step": 18385 }, { "epoch": 10.84316037735849, "grad_norm": 2.9568097591400146, "learning_rate": 5.1373656476073876e-06, "loss": 0.508, "num_input_tokens_seen": 12077600, "step": 18390 }, { "epoch": 10.846108490566039, "grad_norm": 3.4327893257141113, "learning_rate": 5.134793886517779e-06, "loss": 0.4068, "num_input_tokens_seen": 12080320, "step": 18395 }, { "epoch": 10.849056603773585, "grad_norm": 2.380484104156494, "learning_rate": 5.1322220897409105e-06, "loss": 0.4312, "num_input_tokens_seen": 12084160, "step": 18400 }, { "epoch": 10.852004716981131, "grad_norm": 3.007835626602173, "learning_rate": 5.129650257957671e-06, "loss": 0.3252, "num_input_tokens_seen": 12087264, "step": 18405 }, { "epoch": 10.85495283018868, "grad_norm": 1.6766592264175415, "learning_rate": 5.12707839184897e-06, "loss": 0.3701, "num_input_tokens_seen": 12091040, "step": 18410 }, { "epoch": 10.857900943396226, "grad_norm": 1.9222661256790161, "learning_rate": 5.124506492095716e-06, "loss": 0.4672, "num_input_tokens_seen": 12094048, "step": 18415 }, { "epoch": 10.860849056603774, "grad_norm": 2.6545920372009277, "learning_rate": 5.121934559378831e-06, "loss": 0.5455, "num_input_tokens_seen": 12099008, "step": 18420 }, { "epoch": 10.86379716981132, "grad_norm": 3.729748487472534, "learning_rate": 5.1193625943792456e-06, "loss": 0.3792, "num_input_tokens_seen": 12102368, "step": 18425 }, { "epoch": 10.866745283018869, "grad_norm": 1.9776052236557007, "learning_rate": 5.116790597777901e-06, "loss": 0.4225, "num_input_tokens_seen": 12105504, "step": 18430 }, { "epoch": 10.869693396226415, "grad_norm": 2.514369487762451, "learning_rate": 5.11421857025574e-06, "loss": 0.3538, "num_input_tokens_seen": 12109376, "step": 18435 }, { "epoch": 10.872641509433961, "grad_norm": 3.93491792678833, "learning_rate": 5.111646512493721e-06, "loss": 0.4633, "num_input_tokens_seen": 12112256, "step": 18440 }, { "epoch": 10.87558962264151, "grad_norm": 2.3441641330718994, "learning_rate": 5.109074425172806e-06, "loss": 0.3501, "num_input_tokens_seen": 12115072, "step": 18445 }, { "epoch": 10.878537735849056, "grad_norm": 2.3104586601257324, "learning_rate": 5.106502308973967e-06, "loss": 0.462, "num_input_tokens_seen": 12118208, "step": 18450 }, { "epoch": 10.881485849056604, "grad_norm": 2.8562095165252686, "learning_rate": 5.103930164578184e-06, "loss": 0.395, "num_input_tokens_seen": 12121888, "step": 18455 }, { "epoch": 10.88443396226415, "grad_norm": 1.5458022356033325, "learning_rate": 5.101357992666441e-06, "loss": 0.346, "num_input_tokens_seen": 12124416, "step": 18460 }, { "epoch": 10.887382075471699, "grad_norm": 2.7009990215301514, "learning_rate": 5.098785793919733e-06, "loss": 0.3346, "num_input_tokens_seen": 12127168, "step": 18465 }, { "epoch": 10.890330188679245, "grad_norm": 3.270085573196411, "learning_rate": 5.096213569019061e-06, "loss": 0.3256, "num_input_tokens_seen": 12130144, "step": 18470 }, { "epoch": 10.893278301886792, "grad_norm": 1.2378054857254028, "learning_rate": 5.0936413186454315e-06, "loss": 0.3909, "num_input_tokens_seen": 12133600, "step": 18475 }, { "epoch": 10.89622641509434, "grad_norm": 3.6403980255126953, "learning_rate": 5.0910690434798584e-06, "loss": 0.4278, "num_input_tokens_seen": 12136960, "step": 18480 }, { "epoch": 10.899174528301886, "grad_norm": 3.0989866256713867, "learning_rate": 5.088496744203364e-06, "loss": 0.3427, "num_input_tokens_seen": 12139648, "step": 18485 }, { "epoch": 10.902122641509434, "grad_norm": 4.773684501647949, "learning_rate": 5.085924421496976e-06, "loss": 0.337, "num_input_tokens_seen": 12142080, "step": 18490 }, { "epoch": 10.90507075471698, "grad_norm": 0.9465792179107666, "learning_rate": 5.083352076041725e-06, "loss": 0.3707, "num_input_tokens_seen": 12144992, "step": 18495 }, { "epoch": 10.908018867924529, "grad_norm": 2.6024246215820312, "learning_rate": 5.080779708518654e-06, "loss": 0.353, "num_input_tokens_seen": 12148800, "step": 18500 }, { "epoch": 10.910966981132075, "grad_norm": 4.178752899169922, "learning_rate": 5.078207319608807e-06, "loss": 0.4002, "num_input_tokens_seen": 12151264, "step": 18505 }, { "epoch": 10.913915094339622, "grad_norm": 1.591094970703125, "learning_rate": 5.075634909993235e-06, "loss": 0.2857, "num_input_tokens_seen": 12154848, "step": 18510 }, { "epoch": 10.91686320754717, "grad_norm": 2.590350389480591, "learning_rate": 5.073062480352995e-06, "loss": 0.4291, "num_input_tokens_seen": 12157696, "step": 18515 }, { "epoch": 10.919811320754716, "grad_norm": 2.008937358856201, "learning_rate": 5.070490031369149e-06, "loss": 0.4903, "num_input_tokens_seen": 12161856, "step": 18520 }, { "epoch": 10.922759433962264, "grad_norm": 4.2813286781311035, "learning_rate": 5.067917563722762e-06, "loss": 0.5605, "num_input_tokens_seen": 12165376, "step": 18525 }, { "epoch": 10.92570754716981, "grad_norm": 2.58742356300354, "learning_rate": 5.065345078094907e-06, "loss": 0.2865, "num_input_tokens_seen": 12168256, "step": 18530 }, { "epoch": 10.928655660377359, "grad_norm": 2.0577259063720703, "learning_rate": 5.062772575166663e-06, "loss": 0.3737, "num_input_tokens_seen": 12172480, "step": 18535 }, { "epoch": 10.931603773584905, "grad_norm": 4.4149298667907715, "learning_rate": 5.0602000556191075e-06, "loss": 0.4031, "num_input_tokens_seen": 12176512, "step": 18540 }, { "epoch": 10.934551886792454, "grad_norm": 2.477151393890381, "learning_rate": 5.0576275201333284e-06, "loss": 0.4022, "num_input_tokens_seen": 12180320, "step": 18545 }, { "epoch": 10.9375, "grad_norm": 3.5711300373077393, "learning_rate": 5.055054969390415e-06, "loss": 0.3881, "num_input_tokens_seen": 12183392, "step": 18550 }, { "epoch": 10.940448113207546, "grad_norm": 3.5455820560455322, "learning_rate": 5.052482404071461e-06, "loss": 0.3267, "num_input_tokens_seen": 12186656, "step": 18555 }, { "epoch": 10.943396226415095, "grad_norm": 4.423620700836182, "learning_rate": 5.049909824857564e-06, "loss": 0.333, "num_input_tokens_seen": 12190208, "step": 18560 }, { "epoch": 10.946344339622641, "grad_norm": 2.2552552223205566, "learning_rate": 5.047337232429827e-06, "loss": 0.4152, "num_input_tokens_seen": 12193504, "step": 18565 }, { "epoch": 10.94929245283019, "grad_norm": 5.219983100891113, "learning_rate": 5.044764627469354e-06, "loss": 0.5365, "num_input_tokens_seen": 12196096, "step": 18570 }, { "epoch": 10.952240566037736, "grad_norm": 4.654555320739746, "learning_rate": 5.042192010657251e-06, "loss": 0.27, "num_input_tokens_seen": 12201216, "step": 18575 }, { "epoch": 10.955188679245284, "grad_norm": 2.4191222190856934, "learning_rate": 5.039619382674632e-06, "loss": 0.48, "num_input_tokens_seen": 12204576, "step": 18580 }, { "epoch": 10.95813679245283, "grad_norm": 1.807813286781311, "learning_rate": 5.0370467442026115e-06, "loss": 0.3792, "num_input_tokens_seen": 12207904, "step": 18585 }, { "epoch": 10.961084905660378, "grad_norm": 2.2783639430999756, "learning_rate": 5.034474095922304e-06, "loss": 0.3195, "num_input_tokens_seen": 12211008, "step": 18590 }, { "epoch": 10.964033018867925, "grad_norm": 3.575453042984009, "learning_rate": 5.031901438514832e-06, "loss": 0.4471, "num_input_tokens_seen": 12214976, "step": 18595 }, { "epoch": 10.966981132075471, "grad_norm": 2.3048501014709473, "learning_rate": 5.0293287726613185e-06, "loss": 0.3949, "num_input_tokens_seen": 12218368, "step": 18600 }, { "epoch": 10.96992924528302, "grad_norm": 3.479658365249634, "learning_rate": 5.0267560990428836e-06, "loss": 0.496, "num_input_tokens_seen": 12221056, "step": 18605 }, { "epoch": 10.972877358490566, "grad_norm": 5.855332374572754, "learning_rate": 5.024183418340657e-06, "loss": 0.3779, "num_input_tokens_seen": 12224192, "step": 18610 }, { "epoch": 10.975825471698114, "grad_norm": 2.837212324142456, "learning_rate": 5.021610731235766e-06, "loss": 0.3762, "num_input_tokens_seen": 12226656, "step": 18615 }, { "epoch": 10.97877358490566, "grad_norm": 4.907573223114014, "learning_rate": 5.01903803840934e-06, "loss": 0.4024, "num_input_tokens_seen": 12232320, "step": 18620 }, { "epoch": 10.981721698113208, "grad_norm": 2.4162025451660156, "learning_rate": 5.016465340542514e-06, "loss": 0.4118, "num_input_tokens_seen": 12234688, "step": 18625 }, { "epoch": 10.984669811320755, "grad_norm": 2.717884063720703, "learning_rate": 5.013892638316417e-06, "loss": 0.4805, "num_input_tokens_seen": 12238240, "step": 18630 }, { "epoch": 10.987617924528301, "grad_norm": 2.040825128555298, "learning_rate": 5.011319932412182e-06, "loss": 0.394, "num_input_tokens_seen": 12241440, "step": 18635 }, { "epoch": 10.99056603773585, "grad_norm": 2.74782657623291, "learning_rate": 5.008747223510947e-06, "loss": 0.3328, "num_input_tokens_seen": 12243744, "step": 18640 }, { "epoch": 10.993514150943396, "grad_norm": 2.063300609588623, "learning_rate": 5.006174512293849e-06, "loss": 0.4122, "num_input_tokens_seen": 12246656, "step": 18645 }, { "epoch": 10.996462264150944, "grad_norm": 2.9233455657958984, "learning_rate": 5.003601799442019e-06, "loss": 0.4105, "num_input_tokens_seen": 12249792, "step": 18650 }, { "epoch": 10.99941037735849, "grad_norm": 2.047179698944092, "learning_rate": 5.0010290856366e-06, "loss": 0.4058, "num_input_tokens_seen": 12252672, "step": 18655 }, { "epoch": 11.002358490566039, "grad_norm": 1.9444782733917236, "learning_rate": 4.998456371558726e-06, "loss": 0.4965, "num_input_tokens_seen": 12255272, "step": 18660 }, { "epoch": 11.005306603773585, "grad_norm": 5.645694255828857, "learning_rate": 4.995883657889531e-06, "loss": 0.4923, "num_input_tokens_seen": 12258408, "step": 18665 }, { "epoch": 11.008254716981131, "grad_norm": 1.7563081979751587, "learning_rate": 4.993310945310158e-06, "loss": 0.3463, "num_input_tokens_seen": 12261736, "step": 18670 }, { "epoch": 11.01120283018868, "grad_norm": 2.53068208694458, "learning_rate": 4.9907382345017416e-06, "loss": 0.4831, "num_input_tokens_seen": 12265352, "step": 18675 }, { "epoch": 11.014150943396226, "grad_norm": 3.25628399848938, "learning_rate": 4.988165526145416e-06, "loss": 0.4689, "num_input_tokens_seen": 12268264, "step": 18680 }, { "epoch": 11.017099056603774, "grad_norm": 2.593524932861328, "learning_rate": 4.985592820922319e-06, "loss": 0.3708, "num_input_tokens_seen": 12271464, "step": 18685 }, { "epoch": 11.02004716981132, "grad_norm": 1.7127033472061157, "learning_rate": 4.983020119513586e-06, "loss": 0.5537, "num_input_tokens_seen": 12274888, "step": 18690 }, { "epoch": 11.022995283018869, "grad_norm": 2.130173444747925, "learning_rate": 4.9804474226003465e-06, "loss": 0.344, "num_input_tokens_seen": 12277672, "step": 18695 }, { "epoch": 11.025943396226415, "grad_norm": 2.9557504653930664, "learning_rate": 4.977874730863739e-06, "loss": 0.4158, "num_input_tokens_seen": 12280936, "step": 18700 }, { "epoch": 11.028891509433961, "grad_norm": 1.8084354400634766, "learning_rate": 4.975302044984889e-06, "loss": 0.2805, "num_input_tokens_seen": 12284616, "step": 18705 }, { "epoch": 11.03183962264151, "grad_norm": 3.299832820892334, "learning_rate": 4.972729365644931e-06, "loss": 0.3969, "num_input_tokens_seen": 12287816, "step": 18710 }, { "epoch": 11.034787735849056, "grad_norm": 2.7262513637542725, "learning_rate": 4.97015669352499e-06, "loss": 0.2489, "num_input_tokens_seen": 12290760, "step": 18715 }, { "epoch": 11.037735849056604, "grad_norm": 4.685520648956299, "learning_rate": 4.967584029306194e-06, "loss": 0.3695, "num_input_tokens_seen": 12295624, "step": 18720 }, { "epoch": 11.04068396226415, "grad_norm": 2.1779580116271973, "learning_rate": 4.965011373669666e-06, "loss": 0.5377, "num_input_tokens_seen": 12298984, "step": 18725 }, { "epoch": 11.043632075471699, "grad_norm": 2.729032516479492, "learning_rate": 4.962438727296527e-06, "loss": 0.2969, "num_input_tokens_seen": 12302056, "step": 18730 }, { "epoch": 11.046580188679245, "grad_norm": 1.7260695695877075, "learning_rate": 4.959866090867897e-06, "loss": 0.3505, "num_input_tokens_seen": 12305000, "step": 18735 }, { "epoch": 11.049528301886792, "grad_norm": 3.1963629722595215, "learning_rate": 4.957293465064893e-06, "loss": 0.4441, "num_input_tokens_seen": 12308136, "step": 18740 }, { "epoch": 11.05247641509434, "grad_norm": 2.6610023975372314, "learning_rate": 4.954720850568627e-06, "loss": 0.2834, "num_input_tokens_seen": 12311048, "step": 18745 }, { "epoch": 11.055424528301886, "grad_norm": 2.8494436740875244, "learning_rate": 4.952148248060212e-06, "loss": 0.3677, "num_input_tokens_seen": 12313896, "step": 18750 }, { "epoch": 11.058372641509434, "grad_norm": 3.6552326679229736, "learning_rate": 4.949575658220755e-06, "loss": 0.322, "num_input_tokens_seen": 12317192, "step": 18755 }, { "epoch": 11.06132075471698, "grad_norm": 5.750929832458496, "learning_rate": 4.947003081731359e-06, "loss": 0.3773, "num_input_tokens_seen": 12320744, "step": 18760 }, { "epoch": 11.064268867924529, "grad_norm": 1.676643967628479, "learning_rate": 4.944430519273126e-06, "loss": 0.4995, "num_input_tokens_seen": 12324776, "step": 18765 }, { "epoch": 11.067216981132075, "grad_norm": 3.576266288757324, "learning_rate": 4.941857971527152e-06, "loss": 0.3286, "num_input_tokens_seen": 12327432, "step": 18770 }, { "epoch": 11.070165094339623, "grad_norm": 3.207613229751587, "learning_rate": 4.93928543917453e-06, "loss": 0.2865, "num_input_tokens_seen": 12330120, "step": 18775 }, { "epoch": 11.07311320754717, "grad_norm": 2.107090711593628, "learning_rate": 4.93671292289635e-06, "loss": 0.3742, "num_input_tokens_seen": 12332328, "step": 18780 }, { "epoch": 11.076061320754716, "grad_norm": 4.944457054138184, "learning_rate": 4.934140423373698e-06, "loss": 0.4687, "num_input_tokens_seen": 12334792, "step": 18785 }, { "epoch": 11.079009433962264, "grad_norm": 2.2843573093414307, "learning_rate": 4.931567941287651e-06, "loss": 0.5292, "num_input_tokens_seen": 12338952, "step": 18790 }, { "epoch": 11.08195754716981, "grad_norm": 2.6810195446014404, "learning_rate": 4.9289954773192875e-06, "loss": 0.475, "num_input_tokens_seen": 12341768, "step": 18795 }, { "epoch": 11.084905660377359, "grad_norm": 2.79561710357666, "learning_rate": 4.926423032149677e-06, "loss": 0.3183, "num_input_tokens_seen": 12345256, "step": 18800 }, { "epoch": 11.087853773584905, "grad_norm": 1.9615920782089233, "learning_rate": 4.923850606459883e-06, "loss": 0.3161, "num_input_tokens_seen": 12348584, "step": 18805 }, { "epoch": 11.090801886792454, "grad_norm": 2.3519647121429443, "learning_rate": 4.921278200930972e-06, "loss": 0.3759, "num_input_tokens_seen": 12352104, "step": 18810 }, { "epoch": 11.09375, "grad_norm": 3.004254102706909, "learning_rate": 4.918705816243996e-06, "loss": 0.3159, "num_input_tokens_seen": 12354856, "step": 18815 }, { "epoch": 11.096698113207546, "grad_norm": 5.357302188873291, "learning_rate": 4.916133453080007e-06, "loss": 0.2292, "num_input_tokens_seen": 12357928, "step": 18820 }, { "epoch": 11.099646226415095, "grad_norm": 4.39891242980957, "learning_rate": 4.913561112120046e-06, "loss": 0.3657, "num_input_tokens_seen": 12360872, "step": 18825 }, { "epoch": 11.102594339622641, "grad_norm": 3.560786008834839, "learning_rate": 4.910988794045154e-06, "loss": 0.4018, "num_input_tokens_seen": 12363912, "step": 18830 }, { "epoch": 11.10554245283019, "grad_norm": 4.1672773361206055, "learning_rate": 4.9084164995363626e-06, "loss": 0.3688, "num_input_tokens_seen": 12367336, "step": 18835 }, { "epoch": 11.108490566037736, "grad_norm": 2.7688560485839844, "learning_rate": 4.905844229274697e-06, "loss": 0.4977, "num_input_tokens_seen": 12370152, "step": 18840 }, { "epoch": 11.111438679245284, "grad_norm": 0.4507738947868347, "learning_rate": 4.903271983941177e-06, "loss": 0.2787, "num_input_tokens_seen": 12376040, "step": 18845 }, { "epoch": 11.11438679245283, "grad_norm": 2.9538331031799316, "learning_rate": 4.900699764216818e-06, "loss": 0.355, "num_input_tokens_seen": 12378792, "step": 18850 }, { "epoch": 11.117334905660377, "grad_norm": 2.2495343685150146, "learning_rate": 4.898127570782622e-06, "loss": 0.3291, "num_input_tokens_seen": 12381960, "step": 18855 }, { "epoch": 11.120283018867925, "grad_norm": 2.3658058643341064, "learning_rate": 4.895555404319592e-06, "loss": 0.3301, "num_input_tokens_seen": 12385320, "step": 18860 }, { "epoch": 11.123231132075471, "grad_norm": 3.9122941493988037, "learning_rate": 4.89298326550872e-06, "loss": 0.5467, "num_input_tokens_seen": 12388616, "step": 18865 }, { "epoch": 11.12617924528302, "grad_norm": 2.818708658218384, "learning_rate": 4.8904111550309876e-06, "loss": 0.4476, "num_input_tokens_seen": 12391944, "step": 18870 }, { "epoch": 11.129127358490566, "grad_norm": 2.512768030166626, "learning_rate": 4.8878390735673755e-06, "loss": 0.4178, "num_input_tokens_seen": 12394920, "step": 18875 }, { "epoch": 11.132075471698114, "grad_norm": 2.029797077178955, "learning_rate": 4.8852670217988505e-06, "loss": 0.3913, "num_input_tokens_seen": 12398408, "step": 18880 }, { "epoch": 11.13502358490566, "grad_norm": 4.503636360168457, "learning_rate": 4.882695000406377e-06, "loss": 0.3094, "num_input_tokens_seen": 12401640, "step": 18885 }, { "epoch": 11.137971698113208, "grad_norm": 2.0828137397766113, "learning_rate": 4.880123010070909e-06, "loss": 0.2938, "num_input_tokens_seen": 12405192, "step": 18890 }, { "epoch": 11.140919811320755, "grad_norm": 2.870120048522949, "learning_rate": 4.877551051473388e-06, "loss": 0.4629, "num_input_tokens_seen": 12408008, "step": 18895 }, { "epoch": 11.143867924528301, "grad_norm": 1.9965013265609741, "learning_rate": 4.874979125294755e-06, "loss": 0.3071, "num_input_tokens_seen": 12411080, "step": 18900 }, { "epoch": 11.14681603773585, "grad_norm": 3.048051595687866, "learning_rate": 4.872407232215937e-06, "loss": 0.3311, "num_input_tokens_seen": 12414952, "step": 18905 }, { "epoch": 11.149764150943396, "grad_norm": 3.690500020980835, "learning_rate": 4.8698353729178546e-06, "loss": 0.3397, "num_input_tokens_seen": 12420392, "step": 18910 }, { "epoch": 11.152712264150944, "grad_norm": 2.3633649349212646, "learning_rate": 4.867263548081418e-06, "loss": 0.3008, "num_input_tokens_seen": 12422920, "step": 18915 }, { "epoch": 11.15566037735849, "grad_norm": 2.380781412124634, "learning_rate": 4.8646917583875304e-06, "loss": 0.3257, "num_input_tokens_seen": 12426312, "step": 18920 }, { "epoch": 11.158608490566039, "grad_norm": 13.204137802124023, "learning_rate": 4.862120004517082e-06, "loss": 0.2647, "num_input_tokens_seen": 12429608, "step": 18925 }, { "epoch": 11.161556603773585, "grad_norm": 2.1218948364257812, "learning_rate": 4.859548287150956e-06, "loss": 0.3134, "num_input_tokens_seen": 12433768, "step": 18930 }, { "epoch": 11.164504716981131, "grad_norm": 2.466839075088501, "learning_rate": 4.8569766069700275e-06, "loss": 0.3139, "num_input_tokens_seen": 12436936, "step": 18935 }, { "epoch": 11.16745283018868, "grad_norm": 1.8761425018310547, "learning_rate": 4.854404964655158e-06, "loss": 0.4544, "num_input_tokens_seen": 12440520, "step": 18940 }, { "epoch": 11.170400943396226, "grad_norm": 2.608947277069092, "learning_rate": 4.8518333608872015e-06, "loss": 0.4648, "num_input_tokens_seen": 12443336, "step": 18945 }, { "epoch": 11.173349056603774, "grad_norm": 1.7685317993164062, "learning_rate": 4.849261796347002e-06, "loss": 0.3472, "num_input_tokens_seen": 12446824, "step": 18950 }, { "epoch": 11.17629716981132, "grad_norm": 4.355589389801025, "learning_rate": 4.846690271715391e-06, "loss": 0.3291, "num_input_tokens_seen": 12449928, "step": 18955 }, { "epoch": 11.179245283018869, "grad_norm": 2.270428419113159, "learning_rate": 4.844118787673191e-06, "loss": 0.5372, "num_input_tokens_seen": 12452488, "step": 18960 }, { "epoch": 11.182193396226415, "grad_norm": 2.478703260421753, "learning_rate": 4.841547344901214e-06, "loss": 0.3341, "num_input_tokens_seen": 12455496, "step": 18965 }, { "epoch": 11.185141509433961, "grad_norm": 4.277663230895996, "learning_rate": 4.838975944080261e-06, "loss": 0.4815, "num_input_tokens_seen": 12459336, "step": 18970 }, { "epoch": 11.18808962264151, "grad_norm": 2.572075605392456, "learning_rate": 4.83640458589112e-06, "loss": 0.289, "num_input_tokens_seen": 12462472, "step": 18975 }, { "epoch": 11.191037735849056, "grad_norm": 2.3088488578796387, "learning_rate": 4.833833271014571e-06, "loss": 0.4885, "num_input_tokens_seen": 12466024, "step": 18980 }, { "epoch": 11.193985849056604, "grad_norm": 2.606062889099121, "learning_rate": 4.831262000131379e-06, "loss": 0.3357, "num_input_tokens_seen": 12468904, "step": 18985 }, { "epoch": 11.19693396226415, "grad_norm": 2.4099557399749756, "learning_rate": 4.828690773922299e-06, "loss": 0.2746, "num_input_tokens_seen": 12471880, "step": 18990 }, { "epoch": 11.199882075471699, "grad_norm": 3.4610509872436523, "learning_rate": 4.826119593068074e-06, "loss": 0.3665, "num_input_tokens_seen": 12475656, "step": 18995 }, { "epoch": 11.202830188679245, "grad_norm": 3.090590238571167, "learning_rate": 4.8235484582494375e-06, "loss": 0.3537, "num_input_tokens_seen": 12478664, "step": 19000 }, { "epoch": 11.205778301886792, "grad_norm": 2.9219908714294434, "learning_rate": 4.8209773701471076e-06, "loss": 0.2921, "num_input_tokens_seen": 12482088, "step": 19005 }, { "epoch": 11.20872641509434, "grad_norm": 2.7767839431762695, "learning_rate": 4.818406329441789e-06, "loss": 0.4203, "num_input_tokens_seen": 12485288, "step": 19010 }, { "epoch": 11.211674528301886, "grad_norm": 3.148263692855835, "learning_rate": 4.815835336814179e-06, "loss": 0.2676, "num_input_tokens_seen": 12489480, "step": 19015 }, { "epoch": 11.214622641509434, "grad_norm": 3.79921817779541, "learning_rate": 4.813264392944957e-06, "loss": 0.5262, "num_input_tokens_seen": 12493512, "step": 19020 }, { "epoch": 11.21757075471698, "grad_norm": 3.4700801372528076, "learning_rate": 4.8106934985147905e-06, "loss": 0.5162, "num_input_tokens_seen": 12496552, "step": 19025 }, { "epoch": 11.220518867924529, "grad_norm": 7.747455596923828, "learning_rate": 4.808122654204338e-06, "loss": 0.3773, "num_input_tokens_seen": 12499240, "step": 19030 }, { "epoch": 11.223466981132075, "grad_norm": 4.732048034667969, "learning_rate": 4.805551860694239e-06, "loss": 0.394, "num_input_tokens_seen": 12506696, "step": 19035 }, { "epoch": 11.226415094339623, "grad_norm": 2.497349977493286, "learning_rate": 4.802981118665122e-06, "loss": 0.4782, "num_input_tokens_seen": 12510216, "step": 19040 }, { "epoch": 11.22936320754717, "grad_norm": 2.5055248737335205, "learning_rate": 4.800410428797604e-06, "loss": 0.3607, "num_input_tokens_seen": 12513640, "step": 19045 }, { "epoch": 11.232311320754716, "grad_norm": 2.830946683883667, "learning_rate": 4.797839791772286e-06, "loss": 0.3899, "num_input_tokens_seen": 12517288, "step": 19050 }, { "epoch": 11.235259433962264, "grad_norm": 3.8035242557525635, "learning_rate": 4.795269208269752e-06, "loss": 0.3728, "num_input_tokens_seen": 12521288, "step": 19055 }, { "epoch": 11.23820754716981, "grad_norm": 3.7679574489593506, "learning_rate": 4.792698678970579e-06, "loss": 0.3626, "num_input_tokens_seen": 12523528, "step": 19060 }, { "epoch": 11.241155660377359, "grad_norm": 3.314286708831787, "learning_rate": 4.7901282045553245e-06, "loss": 0.3703, "num_input_tokens_seen": 12527400, "step": 19065 }, { "epoch": 11.244103773584905, "grad_norm": 3.0438897609710693, "learning_rate": 4.787557785704531e-06, "loss": 0.3768, "num_input_tokens_seen": 12529992, "step": 19070 }, { "epoch": 11.247051886792454, "grad_norm": 1.392468810081482, "learning_rate": 4.784987423098731e-06, "loss": 0.3306, "num_input_tokens_seen": 12533768, "step": 19075 }, { "epoch": 11.25, "grad_norm": 3.4233100414276123, "learning_rate": 4.7824171174184354e-06, "loss": 0.3688, "num_input_tokens_seen": 12536648, "step": 19080 }, { "epoch": 11.252948113207546, "grad_norm": 1.4635359048843384, "learning_rate": 4.779846869344146e-06, "loss": 0.3788, "num_input_tokens_seen": 12540488, "step": 19085 }, { "epoch": 11.255896226415095, "grad_norm": 2.730463743209839, "learning_rate": 4.777276679556346e-06, "loss": 0.4296, "num_input_tokens_seen": 12543464, "step": 19090 }, { "epoch": 11.258844339622641, "grad_norm": 3.3892979621887207, "learning_rate": 4.774706548735507e-06, "loss": 0.4466, "num_input_tokens_seen": 12546472, "step": 19095 }, { "epoch": 11.26179245283019, "grad_norm": 2.3381237983703613, "learning_rate": 4.77213647756208e-06, "loss": 0.2472, "num_input_tokens_seen": 12550472, "step": 19100 }, { "epoch": 11.264740566037736, "grad_norm": 1.8154503107070923, "learning_rate": 4.769566466716501e-06, "loss": 0.3211, "num_input_tokens_seen": 12554376, "step": 19105 }, { "epoch": 11.267688679245284, "grad_norm": 2.5776889324188232, "learning_rate": 4.766996516879195e-06, "loss": 0.3836, "num_input_tokens_seen": 12557736, "step": 19110 }, { "epoch": 11.27063679245283, "grad_norm": 1.8006149530410767, "learning_rate": 4.764426628730564e-06, "loss": 0.3773, "num_input_tokens_seen": 12560872, "step": 19115 }, { "epoch": 11.273584905660377, "grad_norm": 3.0378339290618896, "learning_rate": 4.761856802950999e-06, "loss": 0.5121, "num_input_tokens_seen": 12564008, "step": 19120 }, { "epoch": 11.276533018867925, "grad_norm": 5.100160598754883, "learning_rate": 4.759287040220872e-06, "loss": 0.518, "num_input_tokens_seen": 12571208, "step": 19125 }, { "epoch": 11.279481132075471, "grad_norm": 3.561790704727173, "learning_rate": 4.756717341220538e-06, "loss": 0.3782, "num_input_tokens_seen": 12573576, "step": 19130 }, { "epoch": 11.28242924528302, "grad_norm": 1.3950576782226562, "learning_rate": 4.7541477066303365e-06, "loss": 0.3099, "num_input_tokens_seen": 12577352, "step": 19135 }, { "epoch": 11.285377358490566, "grad_norm": 4.033588886260986, "learning_rate": 4.75157813713059e-06, "loss": 0.4647, "num_input_tokens_seen": 12580328, "step": 19140 }, { "epoch": 11.288325471698114, "grad_norm": 4.170646667480469, "learning_rate": 4.7490086334016e-06, "loss": 0.448, "num_input_tokens_seen": 12583912, "step": 19145 }, { "epoch": 11.29127358490566, "grad_norm": 3.3430116176605225, "learning_rate": 4.746439196123659e-06, "loss": 0.3359, "num_input_tokens_seen": 12586376, "step": 19150 }, { "epoch": 11.294221698113208, "grad_norm": 2.1393942832946777, "learning_rate": 4.743869825977032e-06, "loss": 0.3505, "num_input_tokens_seen": 12589512, "step": 19155 }, { "epoch": 11.297169811320755, "grad_norm": 4.75051212310791, "learning_rate": 4.741300523641972e-06, "loss": 0.3368, "num_input_tokens_seen": 12592776, "step": 19160 }, { "epoch": 11.300117924528301, "grad_norm": 2.9829564094543457, "learning_rate": 4.738731289798715e-06, "loss": 0.3834, "num_input_tokens_seen": 12595944, "step": 19165 }, { "epoch": 11.30306603773585, "grad_norm": 3.5402302742004395, "learning_rate": 4.7361621251274744e-06, "loss": 0.437, "num_input_tokens_seen": 12598504, "step": 19170 }, { "epoch": 11.306014150943396, "grad_norm": 4.909487724304199, "learning_rate": 4.733593030308446e-06, "loss": 0.381, "num_input_tokens_seen": 12601256, "step": 19175 }, { "epoch": 11.308962264150944, "grad_norm": 2.2352426052093506, "learning_rate": 4.731024006021814e-06, "loss": 0.2922, "num_input_tokens_seen": 12605256, "step": 19180 }, { "epoch": 11.31191037735849, "grad_norm": 2.3312785625457764, "learning_rate": 4.728455052947732e-06, "loss": 0.3507, "num_input_tokens_seen": 12609480, "step": 19185 }, { "epoch": 11.314858490566039, "grad_norm": 2.3933210372924805, "learning_rate": 4.725886171766349e-06, "loss": 0.3489, "num_input_tokens_seen": 12612872, "step": 19190 }, { "epoch": 11.317806603773585, "grad_norm": 1.447776436805725, "learning_rate": 4.723317363157781e-06, "loss": 0.3603, "num_input_tokens_seen": 12616648, "step": 19195 }, { "epoch": 11.320754716981131, "grad_norm": 2.2032551765441895, "learning_rate": 4.720748627802135e-06, "loss": 0.3239, "num_input_tokens_seen": 12619272, "step": 19200 }, { "epoch": 11.32370283018868, "grad_norm": 2.1168248653411865, "learning_rate": 4.718179966379492e-06, "loss": 0.3159, "num_input_tokens_seen": 12623560, "step": 19205 }, { "epoch": 11.326650943396226, "grad_norm": 2.9094398021698, "learning_rate": 4.715611379569919e-06, "loss": 0.3153, "num_input_tokens_seen": 12626248, "step": 19210 }, { "epoch": 11.329599056603774, "grad_norm": 2.1283676624298096, "learning_rate": 4.713042868053458e-06, "loss": 0.417, "num_input_tokens_seen": 12628904, "step": 19215 }, { "epoch": 11.33254716981132, "grad_norm": 2.0884928703308105, "learning_rate": 4.7104744325101345e-06, "loss": 0.3965, "num_input_tokens_seen": 12632264, "step": 19220 }, { "epoch": 11.335495283018869, "grad_norm": 3.3471872806549072, "learning_rate": 4.7079060736199525e-06, "loss": 0.4985, "num_input_tokens_seen": 12634920, "step": 19225 }, { "epoch": 11.338443396226415, "grad_norm": 2.910663604736328, "learning_rate": 4.705337792062897e-06, "loss": 0.3358, "num_input_tokens_seen": 12638216, "step": 19230 }, { "epoch": 11.341391509433961, "grad_norm": 4.250668048858643, "learning_rate": 4.702769588518931e-06, "loss": 0.3693, "num_input_tokens_seen": 12640680, "step": 19235 }, { "epoch": 11.34433962264151, "grad_norm": 2.0099806785583496, "learning_rate": 4.700201463667996e-06, "loss": 0.2954, "num_input_tokens_seen": 12643080, "step": 19240 }, { "epoch": 11.347287735849056, "grad_norm": 3.3049044609069824, "learning_rate": 4.697633418190017e-06, "loss": 0.4007, "num_input_tokens_seen": 12646312, "step": 19245 }, { "epoch": 11.350235849056604, "grad_norm": 2.4251747131347656, "learning_rate": 4.695065452764893e-06, "loss": 0.3988, "num_input_tokens_seen": 12649384, "step": 19250 }, { "epoch": 11.35318396226415, "grad_norm": 1.929849624633789, "learning_rate": 4.692497568072505e-06, "loss": 0.3202, "num_input_tokens_seen": 12652424, "step": 19255 }, { "epoch": 11.356132075471699, "grad_norm": 2.549640417098999, "learning_rate": 4.689929764792711e-06, "loss": 0.2993, "num_input_tokens_seen": 12656168, "step": 19260 }, { "epoch": 11.359080188679245, "grad_norm": 2.7890543937683105, "learning_rate": 4.687362043605349e-06, "loss": 0.5107, "num_input_tokens_seen": 12659496, "step": 19265 }, { "epoch": 11.362028301886792, "grad_norm": 3.5622670650482178, "learning_rate": 4.684794405190231e-06, "loss": 0.3233, "num_input_tokens_seen": 12662984, "step": 19270 }, { "epoch": 11.36497641509434, "grad_norm": 2.15720272064209, "learning_rate": 4.682226850227155e-06, "loss": 0.3232, "num_input_tokens_seen": 12665736, "step": 19275 }, { "epoch": 11.367924528301886, "grad_norm": 3.6624300479888916, "learning_rate": 4.6796593793958875e-06, "loss": 0.4093, "num_input_tokens_seen": 12668456, "step": 19280 }, { "epoch": 11.370872641509434, "grad_norm": 1.897755742073059, "learning_rate": 4.6770919933761815e-06, "loss": 0.366, "num_input_tokens_seen": 12672008, "step": 19285 }, { "epoch": 11.37382075471698, "grad_norm": 2.720365047454834, "learning_rate": 4.6745246928477615e-06, "loss": 0.4188, "num_input_tokens_seen": 12674952, "step": 19290 }, { "epoch": 11.376768867924529, "grad_norm": 3.617016553878784, "learning_rate": 4.671957478490332e-06, "loss": 0.3117, "num_input_tokens_seen": 12678056, "step": 19295 }, { "epoch": 11.379716981132075, "grad_norm": 1.5735013484954834, "learning_rate": 4.669390350983574e-06, "loss": 0.3582, "num_input_tokens_seen": 12681672, "step": 19300 }, { "epoch": 11.382665094339623, "grad_norm": 2.8561925888061523, "learning_rate": 4.666823311007145e-06, "loss": 0.4238, "num_input_tokens_seen": 12685224, "step": 19305 }, { "epoch": 11.38561320754717, "grad_norm": 4.176980018615723, "learning_rate": 4.664256359240681e-06, "loss": 0.324, "num_input_tokens_seen": 12688232, "step": 19310 }, { "epoch": 11.388561320754716, "grad_norm": 2.4019720554351807, "learning_rate": 4.661689496363793e-06, "loss": 0.3022, "num_input_tokens_seen": 12691720, "step": 19315 }, { "epoch": 11.391509433962264, "grad_norm": 3.0540270805358887, "learning_rate": 4.659122723056068e-06, "loss": 0.3517, "num_input_tokens_seen": 12694056, "step": 19320 }, { "epoch": 11.39445754716981, "grad_norm": 3.675100803375244, "learning_rate": 4.656556039997072e-06, "loss": 0.3994, "num_input_tokens_seen": 12696808, "step": 19325 }, { "epoch": 11.397405660377359, "grad_norm": 2.24606990814209, "learning_rate": 4.653989447866345e-06, "loss": 0.3504, "num_input_tokens_seen": 12699496, "step": 19330 }, { "epoch": 11.400353773584905, "grad_norm": 1.7960495948791504, "learning_rate": 4.651422947343401e-06, "loss": 0.3441, "num_input_tokens_seen": 12703944, "step": 19335 }, { "epoch": 11.403301886792454, "grad_norm": 1.935671329498291, "learning_rate": 4.6488565391077355e-06, "loss": 0.4146, "num_input_tokens_seen": 12706664, "step": 19340 }, { "epoch": 11.40625, "grad_norm": 6.9264678955078125, "learning_rate": 4.646290223838815e-06, "loss": 0.3726, "num_input_tokens_seen": 12710152, "step": 19345 }, { "epoch": 11.409198113207546, "grad_norm": 3.199533700942993, "learning_rate": 4.64372400221608e-06, "loss": 0.4126, "num_input_tokens_seen": 12713416, "step": 19350 }, { "epoch": 11.412146226415095, "grad_norm": 3.102489709854126, "learning_rate": 4.641157874918953e-06, "loss": 0.3688, "num_input_tokens_seen": 12716680, "step": 19355 }, { "epoch": 11.415094339622641, "grad_norm": 2.4927666187286377, "learning_rate": 4.6385918426268245e-06, "loss": 0.4362, "num_input_tokens_seen": 12721256, "step": 19360 }, { "epoch": 11.41804245283019, "grad_norm": 2.5266528129577637, "learning_rate": 4.636025906019062e-06, "loss": 0.3964, "num_input_tokens_seen": 12724808, "step": 19365 }, { "epoch": 11.420990566037736, "grad_norm": 2.68241024017334, "learning_rate": 4.6334600657750115e-06, "loss": 0.4221, "num_input_tokens_seen": 12727912, "step": 19370 }, { "epoch": 11.423938679245284, "grad_norm": 2.899287223815918, "learning_rate": 4.6308943225739855e-06, "loss": 0.2973, "num_input_tokens_seen": 12731144, "step": 19375 }, { "epoch": 11.42688679245283, "grad_norm": 3.504629611968994, "learning_rate": 4.628328677095281e-06, "loss": 0.3741, "num_input_tokens_seen": 12734696, "step": 19380 }, { "epoch": 11.429834905660377, "grad_norm": 2.3660695552825928, "learning_rate": 4.625763130018159e-06, "loss": 0.325, "num_input_tokens_seen": 12737576, "step": 19385 }, { "epoch": 11.432783018867925, "grad_norm": 2.1870882511138916, "learning_rate": 4.6231976820218635e-06, "loss": 0.3688, "num_input_tokens_seen": 12740424, "step": 19390 }, { "epoch": 11.435731132075471, "grad_norm": 3.0224623680114746, "learning_rate": 4.620632333785605e-06, "loss": 0.4898, "num_input_tokens_seen": 12743880, "step": 19395 }, { "epoch": 11.43867924528302, "grad_norm": 3.733335018157959, "learning_rate": 4.618067085988569e-06, "loss": 0.3219, "num_input_tokens_seen": 12747304, "step": 19400 }, { "epoch": 11.441627358490566, "grad_norm": 2.0204577445983887, "learning_rate": 4.615501939309921e-06, "loss": 0.5697, "num_input_tokens_seen": 12751336, "step": 19405 }, { "epoch": 11.444575471698114, "grad_norm": 20.096763610839844, "learning_rate": 4.612936894428791e-06, "loss": 0.4285, "num_input_tokens_seen": 12754344, "step": 19410 }, { "epoch": 11.44752358490566, "grad_norm": 2.995230197906494, "learning_rate": 4.610371952024285e-06, "loss": 0.3424, "num_input_tokens_seen": 12757640, "step": 19415 }, { "epoch": 11.450471698113208, "grad_norm": 18.707902908325195, "learning_rate": 4.607807112775485e-06, "loss": 0.459, "num_input_tokens_seen": 12760552, "step": 19420 }, { "epoch": 11.453419811320755, "grad_norm": 4.150442123413086, "learning_rate": 4.605242377361441e-06, "loss": 0.4767, "num_input_tokens_seen": 12763240, "step": 19425 }, { "epoch": 11.456367924528301, "grad_norm": 3.124814033508301, "learning_rate": 4.60267774646118e-06, "loss": 0.2659, "num_input_tokens_seen": 12765928, "step": 19430 }, { "epoch": 11.45931603773585, "grad_norm": 1.8947467803955078, "learning_rate": 4.600113220753698e-06, "loss": 0.2808, "num_input_tokens_seen": 12768360, "step": 19435 }, { "epoch": 11.462264150943396, "grad_norm": 2.3462891578674316, "learning_rate": 4.597548800917964e-06, "loss": 0.3561, "num_input_tokens_seen": 12771272, "step": 19440 }, { "epoch": 11.465212264150944, "grad_norm": 1.9862464666366577, "learning_rate": 4.594984487632919e-06, "loss": 0.3069, "num_input_tokens_seen": 12774472, "step": 19445 }, { "epoch": 11.46816037735849, "grad_norm": 3.0772857666015625, "learning_rate": 4.592420281577478e-06, "loss": 0.4058, "num_input_tokens_seen": 12778280, "step": 19450 }, { "epoch": 11.471108490566039, "grad_norm": 3.8193883895874023, "learning_rate": 4.589856183430521e-06, "loss": 0.3151, "num_input_tokens_seen": 12782664, "step": 19455 }, { "epoch": 11.474056603773585, "grad_norm": 2.4405765533447266, "learning_rate": 4.58729219387091e-06, "loss": 0.3806, "num_input_tokens_seen": 12786312, "step": 19460 }, { "epoch": 11.477004716981131, "grad_norm": 4.583826065063477, "learning_rate": 4.584728313577468e-06, "loss": 0.4429, "num_input_tokens_seen": 12788968, "step": 19465 }, { "epoch": 11.47995283018868, "grad_norm": 1.6143423318862915, "learning_rate": 4.582164543228993e-06, "loss": 0.3289, "num_input_tokens_seen": 12793000, "step": 19470 }, { "epoch": 11.482900943396226, "grad_norm": 1.8579895496368408, "learning_rate": 4.5796008835042574e-06, "loss": 0.292, "num_input_tokens_seen": 12795880, "step": 19475 }, { "epoch": 11.485849056603774, "grad_norm": 2.7451705932617188, "learning_rate": 4.577037335082e-06, "loss": 0.3937, "num_input_tokens_seen": 12799112, "step": 19480 }, { "epoch": 11.48879716981132, "grad_norm": 2.585949659347534, "learning_rate": 4.574473898640933e-06, "loss": 0.3787, "num_input_tokens_seen": 12801864, "step": 19485 }, { "epoch": 11.491745283018869, "grad_norm": 2.977332830429077, "learning_rate": 4.571910574859732e-06, "loss": 0.3473, "num_input_tokens_seen": 12805288, "step": 19490 }, { "epoch": 11.494693396226415, "grad_norm": 3.2595818042755127, "learning_rate": 4.5693473644170535e-06, "loss": 0.4035, "num_input_tokens_seen": 12808904, "step": 19495 }, { "epoch": 11.497641509433961, "grad_norm": 2.30147647857666, "learning_rate": 4.566784267991516e-06, "loss": 0.377, "num_input_tokens_seen": 12811944, "step": 19500 }, { "epoch": 11.50058962264151, "grad_norm": 6.687666416168213, "learning_rate": 4.564221286261709e-06, "loss": 0.3904, "num_input_tokens_seen": 12814248, "step": 19505 }, { "epoch": 11.503537735849056, "grad_norm": 2.2681024074554443, "learning_rate": 4.5616584199061964e-06, "loss": 0.326, "num_input_tokens_seen": 12817768, "step": 19510 }, { "epoch": 11.506485849056604, "grad_norm": 3.7902517318725586, "learning_rate": 4.559095669603506e-06, "loss": 0.3533, "num_input_tokens_seen": 12820616, "step": 19515 }, { "epoch": 11.50943396226415, "grad_norm": 3.0312604904174805, "learning_rate": 4.556533036032136e-06, "loss": 0.4054, "num_input_tokens_seen": 12825352, "step": 19520 }, { "epoch": 11.512382075471699, "grad_norm": 1.7822132110595703, "learning_rate": 4.553970519870557e-06, "loss": 0.3336, "num_input_tokens_seen": 12828168, "step": 19525 }, { "epoch": 11.515330188679245, "grad_norm": 2.341787815093994, "learning_rate": 4.551408121797205e-06, "loss": 0.3082, "num_input_tokens_seen": 12831560, "step": 19530 }, { "epoch": 11.518278301886792, "grad_norm": 3.508589267730713, "learning_rate": 4.548845842490486e-06, "loss": 0.3696, "num_input_tokens_seen": 12835048, "step": 19535 }, { "epoch": 11.52122641509434, "grad_norm": 2.975423812866211, "learning_rate": 4.5462836826287745e-06, "loss": 0.4903, "num_input_tokens_seen": 12838344, "step": 19540 }, { "epoch": 11.524174528301886, "grad_norm": 3.105102062225342, "learning_rate": 4.543721642890414e-06, "loss": 0.3047, "num_input_tokens_seen": 12841416, "step": 19545 }, { "epoch": 11.527122641509434, "grad_norm": 1.9527781009674072, "learning_rate": 4.541159723953714e-06, "loss": 0.2859, "num_input_tokens_seen": 12845192, "step": 19550 }, { "epoch": 11.53007075471698, "grad_norm": 3.6623198986053467, "learning_rate": 4.538597926496955e-06, "loss": 0.3593, "num_input_tokens_seen": 12849800, "step": 19555 }, { "epoch": 11.533018867924529, "grad_norm": 2.209362030029297, "learning_rate": 4.536036251198384e-06, "loss": 0.3975, "num_input_tokens_seen": 12853800, "step": 19560 }, { "epoch": 11.535966981132075, "grad_norm": 1.1631041765213013, "learning_rate": 4.5334746987362124e-06, "loss": 0.2076, "num_input_tokens_seen": 12859432, "step": 19565 }, { "epoch": 11.538915094339622, "grad_norm": 3.577486515045166, "learning_rate": 4.530913269788627e-06, "loss": 0.4404, "num_input_tokens_seen": 12862632, "step": 19570 }, { "epoch": 11.54186320754717, "grad_norm": 2.0869288444519043, "learning_rate": 4.528351965033775e-06, "loss": 0.4061, "num_input_tokens_seen": 12865352, "step": 19575 }, { "epoch": 11.544811320754716, "grad_norm": 4.656004905700684, "learning_rate": 4.525790785149774e-06, "loss": 0.3919, "num_input_tokens_seen": 12867976, "step": 19580 }, { "epoch": 11.547759433962264, "grad_norm": 2.0894527435302734, "learning_rate": 4.523229730814705e-06, "loss": 0.3783, "num_input_tokens_seen": 12871496, "step": 19585 }, { "epoch": 11.55070754716981, "grad_norm": 1.6486048698425293, "learning_rate": 4.52066880270662e-06, "loss": 0.3315, "num_input_tokens_seen": 12874952, "step": 19590 }, { "epoch": 11.553655660377359, "grad_norm": 1.7659733295440674, "learning_rate": 4.518108001503536e-06, "loss": 0.4507, "num_input_tokens_seen": 12878696, "step": 19595 }, { "epoch": 11.556603773584905, "grad_norm": 3.6144211292266846, "learning_rate": 4.515547327883434e-06, "loss": 0.3727, "num_input_tokens_seen": 12881928, "step": 19600 }, { "epoch": 11.559551886792454, "grad_norm": 3.276402235031128, "learning_rate": 4.512986782524266e-06, "loss": 0.3026, "num_input_tokens_seen": 12884456, "step": 19605 }, { "epoch": 11.5625, "grad_norm": 5.39693021774292, "learning_rate": 4.510426366103946e-06, "loss": 0.3139, "num_input_tokens_seen": 12887528, "step": 19610 }, { "epoch": 11.565448113207546, "grad_norm": 2.594026803970337, "learning_rate": 4.5078660793003544e-06, "loss": 0.4962, "num_input_tokens_seen": 12890888, "step": 19615 }, { "epoch": 11.568396226415095, "grad_norm": 2.108659029006958, "learning_rate": 4.505305922791341e-06, "loss": 0.3257, "num_input_tokens_seen": 12893960, "step": 19620 }, { "epoch": 11.571344339622641, "grad_norm": 1.8157026767730713, "learning_rate": 4.502745897254716e-06, "loss": 0.4042, "num_input_tokens_seen": 12897384, "step": 19625 }, { "epoch": 11.57429245283019, "grad_norm": 3.1156363487243652, "learning_rate": 4.500186003368257e-06, "loss": 0.3484, "num_input_tokens_seen": 12901768, "step": 19630 }, { "epoch": 11.577240566037736, "grad_norm": 3.1664140224456787, "learning_rate": 4.497626241809709e-06, "loss": 0.4455, "num_input_tokens_seen": 12905224, "step": 19635 }, { "epoch": 11.580188679245284, "grad_norm": 2.2174625396728516, "learning_rate": 4.4950666132567775e-06, "loss": 0.3382, "num_input_tokens_seen": 12909288, "step": 19640 }, { "epoch": 11.58313679245283, "grad_norm": 2.913154125213623, "learning_rate": 4.492507118387136e-06, "loss": 0.3134, "num_input_tokens_seen": 12912488, "step": 19645 }, { "epoch": 11.586084905660378, "grad_norm": 3.1154258251190186, "learning_rate": 4.4899477578784235e-06, "loss": 0.3698, "num_input_tokens_seen": 12915592, "step": 19650 }, { "epoch": 11.589033018867925, "grad_norm": 2.0730841159820557, "learning_rate": 4.487388532408239e-06, "loss": 0.3697, "num_input_tokens_seen": 12918408, "step": 19655 }, { "epoch": 11.591981132075471, "grad_norm": 2.359560012817383, "learning_rate": 4.48482944265415e-06, "loss": 0.3772, "num_input_tokens_seen": 12921416, "step": 19660 }, { "epoch": 11.59492924528302, "grad_norm": 3.8549764156341553, "learning_rate": 4.482270489293685e-06, "loss": 0.5162, "num_input_tokens_seen": 12925256, "step": 19665 }, { "epoch": 11.597877358490566, "grad_norm": 2.1036593914031982, "learning_rate": 4.479711673004341e-06, "loss": 0.4038, "num_input_tokens_seen": 12928008, "step": 19670 }, { "epoch": 11.600825471698114, "grad_norm": 2.4744420051574707, "learning_rate": 4.477152994463575e-06, "loss": 0.3287, "num_input_tokens_seen": 12931688, "step": 19675 }, { "epoch": 11.60377358490566, "grad_norm": 8.64070987701416, "learning_rate": 4.474594454348805e-06, "loss": 0.265, "num_input_tokens_seen": 12936808, "step": 19680 }, { "epoch": 11.606721698113208, "grad_norm": 9.96491527557373, "learning_rate": 4.472036053337419e-06, "loss": 0.4617, "num_input_tokens_seen": 12939688, "step": 19685 }, { "epoch": 11.609669811320755, "grad_norm": 2.629462718963623, "learning_rate": 4.469477792106761e-06, "loss": 0.3573, "num_input_tokens_seen": 12942984, "step": 19690 }, { "epoch": 11.612617924528301, "grad_norm": 2.929704189300537, "learning_rate": 4.466919671334146e-06, "loss": 0.4307, "num_input_tokens_seen": 12945800, "step": 19695 }, { "epoch": 11.61556603773585, "grad_norm": 2.2183661460876465, "learning_rate": 4.4643616916968456e-06, "loss": 0.314, "num_input_tokens_seen": 12949256, "step": 19700 }, { "epoch": 11.618514150943396, "grad_norm": 3.077728271484375, "learning_rate": 4.461803853872095e-06, "loss": 0.3654, "num_input_tokens_seen": 12952808, "step": 19705 }, { "epoch": 11.621462264150944, "grad_norm": 5.529137134552002, "learning_rate": 4.459246158537094e-06, "loss": 0.4069, "num_input_tokens_seen": 12956168, "step": 19710 }, { "epoch": 11.62441037735849, "grad_norm": 3.2279551029205322, "learning_rate": 4.456688606369003e-06, "loss": 0.3572, "num_input_tokens_seen": 12958792, "step": 19715 }, { "epoch": 11.627358490566039, "grad_norm": 7.231363773345947, "learning_rate": 4.4541311980449446e-06, "loss": 0.4614, "num_input_tokens_seen": 12961768, "step": 19720 }, { "epoch": 11.630306603773585, "grad_norm": 8.963570594787598, "learning_rate": 4.4515739342420045e-06, "loss": 0.4084, "num_input_tokens_seen": 12966056, "step": 19725 }, { "epoch": 11.633254716981131, "grad_norm": 3.899817943572998, "learning_rate": 4.4490168156372294e-06, "loss": 0.3535, "num_input_tokens_seen": 12968584, "step": 19730 }, { "epoch": 11.63620283018868, "grad_norm": 3.3640236854553223, "learning_rate": 4.446459842907626e-06, "loss": 0.424, "num_input_tokens_seen": 12970984, "step": 19735 }, { "epoch": 11.639150943396226, "grad_norm": 3.8250246047973633, "learning_rate": 4.443903016730165e-06, "loss": 0.3685, "num_input_tokens_seen": 12974664, "step": 19740 }, { "epoch": 11.642099056603774, "grad_norm": 3.7282466888427734, "learning_rate": 4.4413463377817775e-06, "loss": 0.415, "num_input_tokens_seen": 12978760, "step": 19745 }, { "epoch": 11.64504716981132, "grad_norm": 2.115724563598633, "learning_rate": 4.438789806739353e-06, "loss": 0.2278, "num_input_tokens_seen": 12982632, "step": 19750 }, { "epoch": 11.647995283018869, "grad_norm": 4.37672758102417, "learning_rate": 4.436233424279746e-06, "loss": 0.4525, "num_input_tokens_seen": 12986216, "step": 19755 }, { "epoch": 11.650943396226415, "grad_norm": 4.145387172698975, "learning_rate": 4.433677191079771e-06, "loss": 0.3358, "num_input_tokens_seen": 12988232, "step": 19760 }, { "epoch": 11.653891509433961, "grad_norm": 3.339895486831665, "learning_rate": 4.431121107816201e-06, "loss": 0.3942, "num_input_tokens_seen": 12990984, "step": 19765 }, { "epoch": 11.65683962264151, "grad_norm": 4.8316473960876465, "learning_rate": 4.4285651751657676e-06, "loss": 0.374, "num_input_tokens_seen": 12994056, "step": 19770 }, { "epoch": 11.659787735849056, "grad_norm": 2.425354480743408, "learning_rate": 4.4260093938051685e-06, "loss": 0.3316, "num_input_tokens_seen": 12997640, "step": 19775 }, { "epoch": 11.662735849056604, "grad_norm": 4.853847503662109, "learning_rate": 4.423453764411056e-06, "loss": 0.3501, "num_input_tokens_seen": 13000040, "step": 19780 }, { "epoch": 11.66568396226415, "grad_norm": 2.1464767456054688, "learning_rate": 4.4208982876600425e-06, "loss": 0.3863, "num_input_tokens_seen": 13002632, "step": 19785 }, { "epoch": 11.668632075471699, "grad_norm": 1.9878268241882324, "learning_rate": 4.418342964228705e-06, "loss": 0.3133, "num_input_tokens_seen": 13006824, "step": 19790 }, { "epoch": 11.671580188679245, "grad_norm": 2.8435134887695312, "learning_rate": 4.415787794793574e-06, "loss": 0.4236, "num_input_tokens_seen": 13009544, "step": 19795 }, { "epoch": 11.674528301886792, "grad_norm": 2.0660250186920166, "learning_rate": 4.4132327800311414e-06, "loss": 0.4065, "num_input_tokens_seen": 13012840, "step": 19800 }, { "epoch": 11.67747641509434, "grad_norm": 6.009430885314941, "learning_rate": 4.41067792061786e-06, "loss": 0.3358, "num_input_tokens_seen": 13015848, "step": 19805 }, { "epoch": 11.680424528301886, "grad_norm": 3.654662847518921, "learning_rate": 4.408123217230139e-06, "loss": 0.3614, "num_input_tokens_seen": 13018600, "step": 19810 }, { "epoch": 11.683372641509434, "grad_norm": 2.106203556060791, "learning_rate": 4.4055686705443464e-06, "loss": 0.2922, "num_input_tokens_seen": 13021128, "step": 19815 }, { "epoch": 11.68632075471698, "grad_norm": 3.0184810161590576, "learning_rate": 4.4030142812368106e-06, "loss": 0.268, "num_input_tokens_seen": 13024744, "step": 19820 }, { "epoch": 11.689268867924529, "grad_norm": 2.7693018913269043, "learning_rate": 4.400460049983817e-06, "loss": 0.3669, "num_input_tokens_seen": 13027944, "step": 19825 }, { "epoch": 11.692216981132075, "grad_norm": 1.5019121170043945, "learning_rate": 4.397905977461608e-06, "loss": 0.3009, "num_input_tokens_seen": 13030280, "step": 19830 }, { "epoch": 11.695165094339622, "grad_norm": 2.387373208999634, "learning_rate": 4.395352064346387e-06, "loss": 0.4628, "num_input_tokens_seen": 13034312, "step": 19835 }, { "epoch": 11.69811320754717, "grad_norm": 6.0773491859436035, "learning_rate": 4.392798311314314e-06, "loss": 0.3951, "num_input_tokens_seen": 13037736, "step": 19840 }, { "epoch": 11.701061320754716, "grad_norm": 2.0773723125457764, "learning_rate": 4.390244719041502e-06, "loss": 0.4251, "num_input_tokens_seen": 13041768, "step": 19845 }, { "epoch": 11.704009433962264, "grad_norm": 2.939948081970215, "learning_rate": 4.387691288204029e-06, "loss": 0.3335, "num_input_tokens_seen": 13044872, "step": 19850 }, { "epoch": 11.70695754716981, "grad_norm": 3.7730298042297363, "learning_rate": 4.385138019477928e-06, "loss": 0.3246, "num_input_tokens_seen": 13047624, "step": 19855 }, { "epoch": 11.709905660377359, "grad_norm": 2.684981346130371, "learning_rate": 4.3825849135391864e-06, "loss": 0.2605, "num_input_tokens_seen": 13050536, "step": 19860 }, { "epoch": 11.712853773584905, "grad_norm": 2.565232038497925, "learning_rate": 4.380031971063748e-06, "loss": 0.3495, "num_input_tokens_seen": 13053384, "step": 19865 }, { "epoch": 11.715801886792454, "grad_norm": 2.9828603267669678, "learning_rate": 4.37747919272752e-06, "loss": 0.3547, "num_input_tokens_seen": 13055880, "step": 19870 }, { "epoch": 11.71875, "grad_norm": 4.305165767669678, "learning_rate": 4.374926579206357e-06, "loss": 0.3758, "num_input_tokens_seen": 13059624, "step": 19875 }, { "epoch": 11.721698113207546, "grad_norm": 2.9595346450805664, "learning_rate": 4.372374131176075e-06, "loss": 0.3204, "num_input_tokens_seen": 13063272, "step": 19880 }, { "epoch": 11.724646226415095, "grad_norm": 4.764659881591797, "learning_rate": 4.369821849312449e-06, "loss": 0.3582, "num_input_tokens_seen": 13065832, "step": 19885 }, { "epoch": 11.727594339622641, "grad_norm": 2.410532236099243, "learning_rate": 4.367269734291203e-06, "loss": 0.3946, "num_input_tokens_seen": 13068712, "step": 19890 }, { "epoch": 11.73054245283019, "grad_norm": 2.54146671295166, "learning_rate": 4.364717786788022e-06, "loss": 0.3212, "num_input_tokens_seen": 13073288, "step": 19895 }, { "epoch": 11.733490566037736, "grad_norm": 2.2002196311950684, "learning_rate": 4.362166007478545e-06, "loss": 0.427, "num_input_tokens_seen": 13076200, "step": 19900 }, { "epoch": 11.736438679245284, "grad_norm": 5.465193748474121, "learning_rate": 4.3596143970383665e-06, "loss": 0.4299, "num_input_tokens_seen": 13079400, "step": 19905 }, { "epoch": 11.73938679245283, "grad_norm": 2.3327832221984863, "learning_rate": 4.357062956143035e-06, "loss": 0.3683, "num_input_tokens_seen": 13082792, "step": 19910 }, { "epoch": 11.742334905660378, "grad_norm": 4.466202259063721, "learning_rate": 4.354511685468059e-06, "loss": 0.2846, "num_input_tokens_seen": 13085416, "step": 19915 }, { "epoch": 11.745283018867925, "grad_norm": 2.6005470752716064, "learning_rate": 4.351960585688894e-06, "loss": 0.4136, "num_input_tokens_seen": 13089672, "step": 19920 }, { "epoch": 11.748231132075471, "grad_norm": 4.345762729644775, "learning_rate": 4.349409657480959e-06, "loss": 0.4688, "num_input_tokens_seen": 13092264, "step": 19925 }, { "epoch": 11.75117924528302, "grad_norm": 9.020211219787598, "learning_rate": 4.34685890151962e-06, "loss": 0.4142, "num_input_tokens_seen": 13095624, "step": 19930 }, { "epoch": 11.754127358490566, "grad_norm": 3.737705707550049, "learning_rate": 4.344308318480201e-06, "loss": 0.4487, "num_input_tokens_seen": 13098664, "step": 19935 }, { "epoch": 11.757075471698114, "grad_norm": 5.499984264373779, "learning_rate": 4.341757909037981e-06, "loss": 0.3857, "num_input_tokens_seen": 13101448, "step": 19940 }, { "epoch": 11.76002358490566, "grad_norm": 2.7718610763549805, "learning_rate": 4.33920767386819e-06, "loss": 0.3562, "num_input_tokens_seen": 13104488, "step": 19945 }, { "epoch": 11.762971698113208, "grad_norm": 2.2171480655670166, "learning_rate": 4.336657613646017e-06, "loss": 0.3972, "num_input_tokens_seen": 13107496, "step": 19950 }, { "epoch": 11.765919811320755, "grad_norm": 3.585073471069336, "learning_rate": 4.3341077290466e-06, "loss": 0.3625, "num_input_tokens_seen": 13110248, "step": 19955 }, { "epoch": 11.768867924528301, "grad_norm": 2.676954507827759, "learning_rate": 4.331558020745031e-06, "loss": 0.3835, "num_input_tokens_seen": 13113256, "step": 19960 }, { "epoch": 11.77181603773585, "grad_norm": 2.1905176639556885, "learning_rate": 4.329008489416357e-06, "loss": 0.3085, "num_input_tokens_seen": 13116392, "step": 19965 }, { "epoch": 11.774764150943396, "grad_norm": 6.962961673736572, "learning_rate": 4.326459135735576e-06, "loss": 0.3173, "num_input_tokens_seen": 13119080, "step": 19970 }, { "epoch": 11.777712264150944, "grad_norm": 3.340482234954834, "learning_rate": 4.323909960377644e-06, "loss": 0.2209, "num_input_tokens_seen": 13122312, "step": 19975 }, { "epoch": 11.78066037735849, "grad_norm": 2.54903507232666, "learning_rate": 4.3213609640174625e-06, "loss": 0.2681, "num_input_tokens_seen": 13125448, "step": 19980 }, { "epoch": 11.783608490566039, "grad_norm": 3.3778066635131836, "learning_rate": 4.318812147329889e-06, "loss": 0.4187, "num_input_tokens_seen": 13128008, "step": 19985 }, { "epoch": 11.786556603773585, "grad_norm": 5.482907295227051, "learning_rate": 4.316263510989737e-06, "loss": 0.3868, "num_input_tokens_seen": 13130344, "step": 19990 }, { "epoch": 11.789504716981131, "grad_norm": 2.0509440898895264, "learning_rate": 4.313715055671768e-06, "loss": 0.3544, "num_input_tokens_seen": 13134024, "step": 19995 }, { "epoch": 11.79245283018868, "grad_norm": 8.0045166015625, "learning_rate": 4.311166782050694e-06, "loss": 0.4324, "num_input_tokens_seen": 13137640, "step": 20000 }, { "epoch": 11.795400943396226, "grad_norm": 3.7448322772979736, "learning_rate": 4.308618690801184e-06, "loss": 0.239, "num_input_tokens_seen": 13140456, "step": 20005 }, { "epoch": 11.798349056603774, "grad_norm": 2.4475631713867188, "learning_rate": 4.3060707825978564e-06, "loss": 0.4204, "num_input_tokens_seen": 13143560, "step": 20010 }, { "epoch": 11.80129716981132, "grad_norm": 2.8511712551116943, "learning_rate": 4.303523058115278e-06, "loss": 0.3098, "num_input_tokens_seen": 13146824, "step": 20015 }, { "epoch": 11.804245283018869, "grad_norm": 2.4474666118621826, "learning_rate": 4.300975518027972e-06, "loss": 0.4095, "num_input_tokens_seen": 13149928, "step": 20020 }, { "epoch": 11.807193396226415, "grad_norm": 2.2655792236328125, "learning_rate": 4.298428163010411e-06, "loss": 0.3549, "num_input_tokens_seen": 13152936, "step": 20025 }, { "epoch": 11.810141509433961, "grad_norm": 2.260230541229248, "learning_rate": 4.295880993737016e-06, "loss": 0.5185, "num_input_tokens_seen": 13155528, "step": 20030 }, { "epoch": 11.81308962264151, "grad_norm": 1.8818002939224243, "learning_rate": 4.293334010882164e-06, "loss": 0.3802, "num_input_tokens_seen": 13158888, "step": 20035 }, { "epoch": 11.816037735849056, "grad_norm": 2.7065351009368896, "learning_rate": 4.290787215120178e-06, "loss": 0.3296, "num_input_tokens_seen": 13162120, "step": 20040 }, { "epoch": 11.818985849056604, "grad_norm": 2.833141565322876, "learning_rate": 4.2882406071253315e-06, "loss": 0.2542, "num_input_tokens_seen": 13165384, "step": 20045 }, { "epoch": 11.82193396226415, "grad_norm": 4.182108402252197, "learning_rate": 4.285694187571852e-06, "loss": 0.3762, "num_input_tokens_seen": 13168584, "step": 20050 }, { "epoch": 11.824882075471699, "grad_norm": 2.8279709815979004, "learning_rate": 4.283147957133917e-06, "loss": 0.2677, "num_input_tokens_seen": 13170920, "step": 20055 }, { "epoch": 11.827830188679245, "grad_norm": 2.5842528343200684, "learning_rate": 4.280601916485648e-06, "loss": 0.3896, "num_input_tokens_seen": 13174440, "step": 20060 }, { "epoch": 11.830778301886792, "grad_norm": 2.8652560710906982, "learning_rate": 4.278056066301123e-06, "loss": 0.3926, "num_input_tokens_seen": 13177864, "step": 20065 }, { "epoch": 11.83372641509434, "grad_norm": 5.206861972808838, "learning_rate": 4.275510407254366e-06, "loss": 0.4363, "num_input_tokens_seen": 13180552, "step": 20070 }, { "epoch": 11.836674528301886, "grad_norm": 3.740891695022583, "learning_rate": 4.27296494001935e-06, "loss": 0.3245, "num_input_tokens_seen": 13184648, "step": 20075 }, { "epoch": 11.839622641509434, "grad_norm": 3.9128570556640625, "learning_rate": 4.2704196652700005e-06, "loss": 0.3606, "num_input_tokens_seen": 13187688, "step": 20080 }, { "epoch": 11.84257075471698, "grad_norm": 2.4606618881225586, "learning_rate": 4.2678745836801895e-06, "loss": 0.3886, "num_input_tokens_seen": 13191976, "step": 20085 }, { "epoch": 11.845518867924529, "grad_norm": 2.606884241104126, "learning_rate": 4.265329695923736e-06, "loss": 0.246, "num_input_tokens_seen": 13195912, "step": 20090 }, { "epoch": 11.848466981132075, "grad_norm": 3.7489874362945557, "learning_rate": 4.262785002674412e-06, "loss": 0.3512, "num_input_tokens_seen": 13199752, "step": 20095 }, { "epoch": 11.851415094339622, "grad_norm": 5.329141139984131, "learning_rate": 4.260240504605938e-06, "loss": 0.4957, "num_input_tokens_seen": 13202792, "step": 20100 }, { "epoch": 11.85436320754717, "grad_norm": 2.190020799636841, "learning_rate": 4.2576962023919775e-06, "loss": 0.3104, "num_input_tokens_seen": 13205640, "step": 20105 }, { "epoch": 11.857311320754716, "grad_norm": 3.871018886566162, "learning_rate": 4.255152096706145e-06, "loss": 0.3121, "num_input_tokens_seen": 13208072, "step": 20110 }, { "epoch": 11.860259433962264, "grad_norm": 2.1526002883911133, "learning_rate": 4.252608188222007e-06, "loss": 0.3973, "num_input_tokens_seen": 13211368, "step": 20115 }, { "epoch": 11.86320754716981, "grad_norm": 2.4768011569976807, "learning_rate": 4.250064477613071e-06, "loss": 0.336, "num_input_tokens_seen": 13214856, "step": 20120 }, { "epoch": 11.866155660377359, "grad_norm": 2.065314531326294, "learning_rate": 4.247520965552797e-06, "loss": 0.3482, "num_input_tokens_seen": 13219720, "step": 20125 }, { "epoch": 11.869103773584905, "grad_norm": 3.748910427093506, "learning_rate": 4.24497765271459e-06, "loss": 0.4218, "num_input_tokens_seen": 13223368, "step": 20130 }, { "epoch": 11.872051886792454, "grad_norm": 2.5462958812713623, "learning_rate": 4.242434539771804e-06, "loss": 0.3472, "num_input_tokens_seen": 13226408, "step": 20135 }, { "epoch": 11.875, "grad_norm": 3.040117025375366, "learning_rate": 4.239891627397737e-06, "loss": 0.3977, "num_input_tokens_seen": 13229224, "step": 20140 }, { "epoch": 11.877948113207546, "grad_norm": 2.8448777198791504, "learning_rate": 4.237348916265638e-06, "loss": 0.3439, "num_input_tokens_seen": 13232360, "step": 20145 }, { "epoch": 11.880896226415095, "grad_norm": 3.7187581062316895, "learning_rate": 4.234806407048702e-06, "loss": 0.448, "num_input_tokens_seen": 13236392, "step": 20150 }, { "epoch": 11.883844339622641, "grad_norm": 1.6924569606781006, "learning_rate": 4.232264100420066e-06, "loss": 0.3103, "num_input_tokens_seen": 13239880, "step": 20155 }, { "epoch": 11.88679245283019, "grad_norm": 4.112235069274902, "learning_rate": 4.229721997052819e-06, "loss": 0.295, "num_input_tokens_seen": 13244104, "step": 20160 }, { "epoch": 11.889740566037736, "grad_norm": 2.8204457759857178, "learning_rate": 4.227180097619993e-06, "loss": 0.2967, "num_input_tokens_seen": 13246280, "step": 20165 }, { "epoch": 11.892688679245284, "grad_norm": 2.508347749710083, "learning_rate": 4.224638402794566e-06, "loss": 0.3557, "num_input_tokens_seen": 13249448, "step": 20170 }, { "epoch": 11.89563679245283, "grad_norm": 2.829827308654785, "learning_rate": 4.222096913249464e-06, "loss": 0.4172, "num_input_tokens_seen": 13252936, "step": 20175 }, { "epoch": 11.898584905660378, "grad_norm": 3.3924288749694824, "learning_rate": 4.219555629657558e-06, "loss": 0.3866, "num_input_tokens_seen": 13255976, "step": 20180 }, { "epoch": 11.901533018867925, "grad_norm": 2.415720224380493, "learning_rate": 4.21701455269166e-06, "loss": 0.2773, "num_input_tokens_seen": 13258504, "step": 20185 }, { "epoch": 11.904481132075471, "grad_norm": 3.0127179622650146, "learning_rate": 4.2144736830245356e-06, "loss": 0.3919, "num_input_tokens_seen": 13261288, "step": 20190 }, { "epoch": 11.90742924528302, "grad_norm": 2.2931745052337646, "learning_rate": 4.211933021328889e-06, "loss": 0.4587, "num_input_tokens_seen": 13264552, "step": 20195 }, { "epoch": 11.910377358490566, "grad_norm": 2.8055953979492188, "learning_rate": 4.20939256827737e-06, "loss": 0.3603, "num_input_tokens_seen": 13268040, "step": 20200 }, { "epoch": 11.913325471698114, "grad_norm": 2.0822532176971436, "learning_rate": 4.206852324542578e-06, "loss": 0.3306, "num_input_tokens_seen": 13271880, "step": 20205 }, { "epoch": 11.91627358490566, "grad_norm": 3.4122154712677, "learning_rate": 4.2043122907970496e-06, "loss": 0.3994, "num_input_tokens_seen": 13275528, "step": 20210 }, { "epoch": 11.919221698113208, "grad_norm": 3.1095638275146484, "learning_rate": 4.2017724677132715e-06, "loss": 0.4005, "num_input_tokens_seen": 13278536, "step": 20215 }, { "epoch": 11.922169811320755, "grad_norm": 2.25229549407959, "learning_rate": 4.1992328559636734e-06, "loss": 0.398, "num_input_tokens_seen": 13281800, "step": 20220 }, { "epoch": 11.925117924528301, "grad_norm": 2.286259174346924, "learning_rate": 4.196693456220628e-06, "loss": 0.5475, "num_input_tokens_seen": 13284776, "step": 20225 }, { "epoch": 11.92806603773585, "grad_norm": 4.1983962059021, "learning_rate": 4.194154269156452e-06, "loss": 0.5131, "num_input_tokens_seen": 13287976, "step": 20230 }, { "epoch": 11.931014150943396, "grad_norm": 1.9969326257705688, "learning_rate": 4.191615295443404e-06, "loss": 0.4627, "num_input_tokens_seen": 13291976, "step": 20235 }, { "epoch": 11.933962264150944, "grad_norm": 1.9645689725875854, "learning_rate": 4.189076535753692e-06, "loss": 0.3533, "num_input_tokens_seen": 13295720, "step": 20240 }, { "epoch": 11.93691037735849, "grad_norm": 4.5950493812561035, "learning_rate": 4.186537990759464e-06, "loss": 0.3481, "num_input_tokens_seen": 13299016, "step": 20245 }, { "epoch": 11.939858490566039, "grad_norm": 3.3556361198425293, "learning_rate": 4.183999661132806e-06, "loss": 0.4797, "num_input_tokens_seen": 13301960, "step": 20250 }, { "epoch": 11.942806603773585, "grad_norm": 5.219093322753906, "learning_rate": 4.181461547545756e-06, "loss": 0.3487, "num_input_tokens_seen": 13305736, "step": 20255 }, { "epoch": 11.945754716981131, "grad_norm": 2.4549100399017334, "learning_rate": 4.178923650670289e-06, "loss": 0.3568, "num_input_tokens_seen": 13308648, "step": 20260 }, { "epoch": 11.94870283018868, "grad_norm": 2.954921007156372, "learning_rate": 4.176385971178324e-06, "loss": 0.4433, "num_input_tokens_seen": 13312040, "step": 20265 }, { "epoch": 11.951650943396226, "grad_norm": 2.739041328430176, "learning_rate": 4.1738485097417225e-06, "loss": 0.5606, "num_input_tokens_seen": 13314152, "step": 20270 }, { "epoch": 11.954599056603774, "grad_norm": 8.716150283813477, "learning_rate": 4.1713112670322886e-06, "loss": 0.3951, "num_input_tokens_seen": 13317320, "step": 20275 }, { "epoch": 11.95754716981132, "grad_norm": 2.3340039253234863, "learning_rate": 4.168774243721768e-06, "loss": 0.2793, "num_input_tokens_seen": 13320552, "step": 20280 }, { "epoch": 11.960495283018869, "grad_norm": 4.3242411613464355, "learning_rate": 4.166237440481849e-06, "loss": 0.4489, "num_input_tokens_seen": 13323752, "step": 20285 }, { "epoch": 11.963443396226415, "grad_norm": 6.113791465759277, "learning_rate": 4.163700857984162e-06, "loss": 0.324, "num_input_tokens_seen": 13327112, "step": 20290 }, { "epoch": 11.966391509433961, "grad_norm": 1.8474559783935547, "learning_rate": 4.161164496900275e-06, "loss": 0.3649, "num_input_tokens_seen": 13330248, "step": 20295 }, { "epoch": 11.96933962264151, "grad_norm": 2.8068525791168213, "learning_rate": 4.1586283579017036e-06, "loss": 0.3002, "num_input_tokens_seen": 13333416, "step": 20300 }, { "epoch": 11.972287735849056, "grad_norm": 1.1672521829605103, "learning_rate": 4.156092441659901e-06, "loss": 0.3812, "num_input_tokens_seen": 13336072, "step": 20305 }, { "epoch": 11.975235849056604, "grad_norm": 2.169060468673706, "learning_rate": 4.153556748846261e-06, "loss": 0.3657, "num_input_tokens_seen": 13338632, "step": 20310 }, { "epoch": 11.97818396226415, "grad_norm": 2.2743380069732666, "learning_rate": 4.15102128013212e-06, "loss": 0.5093, "num_input_tokens_seen": 13341928, "step": 20315 }, { "epoch": 11.981132075471699, "grad_norm": 1.8740943670272827, "learning_rate": 4.1484860361887544e-06, "loss": 0.3415, "num_input_tokens_seen": 13345576, "step": 20320 }, { "epoch": 11.984080188679245, "grad_norm": 5.938906192779541, "learning_rate": 4.145951017687379e-06, "loss": 0.3914, "num_input_tokens_seen": 13348840, "step": 20325 }, { "epoch": 11.987028301886792, "grad_norm": 3.10079026222229, "learning_rate": 4.1434162252991524e-06, "loss": 0.3707, "num_input_tokens_seen": 13352456, "step": 20330 }, { "epoch": 11.98997641509434, "grad_norm": 2.19854474067688, "learning_rate": 4.140881659695173e-06, "loss": 0.3267, "num_input_tokens_seen": 13355752, "step": 20335 }, { "epoch": 11.992924528301886, "grad_norm": 3.471027374267578, "learning_rate": 4.138347321546477e-06, "loss": 0.4147, "num_input_tokens_seen": 13358152, "step": 20340 }, { "epoch": 11.995872641509434, "grad_norm": 5.22161340713501, "learning_rate": 4.13581321152404e-06, "loss": 0.3749, "num_input_tokens_seen": 13362760, "step": 20345 }, { "epoch": 11.99882075471698, "grad_norm": 5.228881359100342, "learning_rate": 4.133279330298781e-06, "loss": 0.3657, "num_input_tokens_seen": 13366280, "step": 20350 }, { "epoch": 12.0, "eval_loss": 0.5405434966087341, "eval_runtime": 18.6301, "eval_samples_per_second": 91.036, "eval_steps_per_second": 22.759, "num_input_tokens_seen": 13367408, "step": 20352 }, { "epoch": 12.001768867924529, "grad_norm": 3.047558307647705, "learning_rate": 4.130745678541555e-06, "loss": 0.216, "num_input_tokens_seen": 13369552, "step": 20355 }, { "epoch": 12.004716981132075, "grad_norm": 4.190121650695801, "learning_rate": 4.128212256923155e-06, "loss": 0.3989, "num_input_tokens_seen": 13372208, "step": 20360 }, { "epoch": 12.007665094339623, "grad_norm": 3.5618228912353516, "learning_rate": 4.125679066114318e-06, "loss": 0.4556, "num_input_tokens_seen": 13374928, "step": 20365 }, { "epoch": 12.01061320754717, "grad_norm": 1.8897720575332642, "learning_rate": 4.123146106785717e-06, "loss": 0.276, "num_input_tokens_seen": 13378992, "step": 20370 }, { "epoch": 12.013561320754716, "grad_norm": 2.480733871459961, "learning_rate": 4.1206133796079625e-06, "loss": 0.3403, "num_input_tokens_seen": 13383376, "step": 20375 }, { "epoch": 12.016509433962264, "grad_norm": 2.4912264347076416, "learning_rate": 4.1180808852516065e-06, "loss": 0.3176, "num_input_tokens_seen": 13386320, "step": 20380 }, { "epoch": 12.01945754716981, "grad_norm": 8.267723083496094, "learning_rate": 4.115548624387136e-06, "loss": 0.3214, "num_input_tokens_seen": 13388624, "step": 20385 }, { "epoch": 12.022405660377359, "grad_norm": 2.560649871826172, "learning_rate": 4.113016597684979e-06, "loss": 0.3518, "num_input_tokens_seen": 13392272, "step": 20390 }, { "epoch": 12.025353773584905, "grad_norm": 1.4669487476348877, "learning_rate": 4.110484805815502e-06, "loss": 0.2766, "num_input_tokens_seen": 13395536, "step": 20395 }, { "epoch": 12.028301886792454, "grad_norm": 3.964613676071167, "learning_rate": 4.107953249449005e-06, "loss": 0.3107, "num_input_tokens_seen": 13399408, "step": 20400 }, { "epoch": 12.03125, "grad_norm": 3.1298370361328125, "learning_rate": 4.105421929255729e-06, "loss": 0.2737, "num_input_tokens_seen": 13401872, "step": 20405 }, { "epoch": 12.034198113207546, "grad_norm": 3.3273797035217285, "learning_rate": 4.102890845905854e-06, "loss": 0.383, "num_input_tokens_seen": 13405072, "step": 20410 }, { "epoch": 12.037146226415095, "grad_norm": 4.149435043334961, "learning_rate": 4.1003600000694935e-06, "loss": 0.3104, "num_input_tokens_seen": 13408176, "step": 20415 }, { "epoch": 12.040094339622641, "grad_norm": 2.1199870109558105, "learning_rate": 4.0978293924167e-06, "loss": 0.4955, "num_input_tokens_seen": 13412624, "step": 20420 }, { "epoch": 12.04304245283019, "grad_norm": 2.0944058895111084, "learning_rate": 4.095299023617461e-06, "loss": 0.4676, "num_input_tokens_seen": 13415184, "step": 20425 }, { "epoch": 12.045990566037736, "grad_norm": 2.4188008308410645, "learning_rate": 4.092768894341707e-06, "loss": 0.295, "num_input_tokens_seen": 13421968, "step": 20430 }, { "epoch": 12.048938679245284, "grad_norm": 3.159278154373169, "learning_rate": 4.090239005259298e-06, "loss": 0.2564, "num_input_tokens_seen": 13425552, "step": 20435 }, { "epoch": 12.05188679245283, "grad_norm": 2.7918882369995117, "learning_rate": 4.087709357040033e-06, "loss": 0.2447, "num_input_tokens_seen": 13428304, "step": 20440 }, { "epoch": 12.054834905660377, "grad_norm": 4.397581100463867, "learning_rate": 4.085179950353648e-06, "loss": 0.6028, "num_input_tokens_seen": 13430928, "step": 20445 }, { "epoch": 12.057783018867925, "grad_norm": 2.625603675842285, "learning_rate": 4.0826507858698135e-06, "loss": 0.4145, "num_input_tokens_seen": 13434032, "step": 20450 }, { "epoch": 12.060731132075471, "grad_norm": 1.1775883436203003, "learning_rate": 4.080121864258136e-06, "loss": 0.3207, "num_input_tokens_seen": 13437456, "step": 20455 }, { "epoch": 12.06367924528302, "grad_norm": 3.1677095890045166, "learning_rate": 4.077593186188161e-06, "loss": 0.4408, "num_input_tokens_seen": 13440944, "step": 20460 }, { "epoch": 12.066627358490566, "grad_norm": 2.2542169094085693, "learning_rate": 4.075064752329364e-06, "loss": 0.2893, "num_input_tokens_seen": 13443856, "step": 20465 }, { "epoch": 12.069575471698114, "grad_norm": 2.689950942993164, "learning_rate": 4.0725365633511605e-06, "loss": 0.3873, "num_input_tokens_seen": 13446960, "step": 20470 }, { "epoch": 12.07252358490566, "grad_norm": 2.9552102088928223, "learning_rate": 4.070008619922899e-06, "loss": 0.4352, "num_input_tokens_seen": 13450352, "step": 20475 }, { "epoch": 12.075471698113208, "grad_norm": 2.8919947147369385, "learning_rate": 4.067480922713864e-06, "loss": 0.3392, "num_input_tokens_seen": 13454672, "step": 20480 }, { "epoch": 12.078419811320755, "grad_norm": 3.217130661010742, "learning_rate": 4.064953472393273e-06, "loss": 0.3606, "num_input_tokens_seen": 13457488, "step": 20485 }, { "epoch": 12.081367924528301, "grad_norm": 2.319880247116089, "learning_rate": 4.06242626963028e-06, "loss": 0.2935, "num_input_tokens_seen": 13460560, "step": 20490 }, { "epoch": 12.08431603773585, "grad_norm": 3.659457206726074, "learning_rate": 4.059899315093972e-06, "loss": 0.3438, "num_input_tokens_seen": 13463312, "step": 20495 }, { "epoch": 12.087264150943396, "grad_norm": 3.2477612495422363, "learning_rate": 4.057372609453374e-06, "loss": 0.4839, "num_input_tokens_seen": 13467184, "step": 20500 }, { "epoch": 12.090212264150944, "grad_norm": 2.8868861198425293, "learning_rate": 4.054846153377439e-06, "loss": 0.3316, "num_input_tokens_seen": 13470128, "step": 20505 }, { "epoch": 12.09316037735849, "grad_norm": 2.5493600368499756, "learning_rate": 4.052319947535058e-06, "loss": 0.6091, "num_input_tokens_seen": 13474000, "step": 20510 }, { "epoch": 12.096108490566039, "grad_norm": 3.03436541557312, "learning_rate": 4.049793992595056e-06, "loss": 0.4072, "num_input_tokens_seen": 13477680, "step": 20515 }, { "epoch": 12.099056603773585, "grad_norm": 2.720736503601074, "learning_rate": 4.047268289226187e-06, "loss": 0.4155, "num_input_tokens_seen": 13482352, "step": 20520 }, { "epoch": 12.102004716981131, "grad_norm": 5.57158088684082, "learning_rate": 4.044742838097147e-06, "loss": 0.4072, "num_input_tokens_seen": 13486736, "step": 20525 }, { "epoch": 12.10495283018868, "grad_norm": 3.6178224086761475, "learning_rate": 4.0422176398765564e-06, "loss": 0.4767, "num_input_tokens_seen": 13492208, "step": 20530 }, { "epoch": 12.107900943396226, "grad_norm": 2.7664551734924316, "learning_rate": 4.039692695232975e-06, "loss": 0.3741, "num_input_tokens_seen": 13495184, "step": 20535 }, { "epoch": 12.110849056603774, "grad_norm": 3.1369552612304688, "learning_rate": 4.03716800483489e-06, "loss": 0.3845, "num_input_tokens_seen": 13498160, "step": 20540 }, { "epoch": 12.11379716981132, "grad_norm": 2.877448320388794, "learning_rate": 4.034643569350726e-06, "loss": 0.4003, "num_input_tokens_seen": 13500976, "step": 20545 }, { "epoch": 12.116745283018869, "grad_norm": 2.4466545581817627, "learning_rate": 4.032119389448837e-06, "loss": 0.3241, "num_input_tokens_seen": 13504272, "step": 20550 }, { "epoch": 12.119693396226415, "grad_norm": 1.969397783279419, "learning_rate": 4.0295954657975115e-06, "loss": 0.3178, "num_input_tokens_seen": 13508944, "step": 20555 }, { "epoch": 12.122641509433961, "grad_norm": 2.730341672897339, "learning_rate": 4.027071799064968e-06, "loss": 0.3748, "num_input_tokens_seen": 13512336, "step": 20560 }, { "epoch": 12.12558962264151, "grad_norm": 3.2492544651031494, "learning_rate": 4.02454838991936e-06, "loss": 0.3398, "num_input_tokens_seen": 13515056, "step": 20565 }, { "epoch": 12.128537735849056, "grad_norm": 2.5765509605407715, "learning_rate": 4.022025239028768e-06, "loss": 0.2788, "num_input_tokens_seen": 13518736, "step": 20570 }, { "epoch": 12.131485849056604, "grad_norm": 2.9761788845062256, "learning_rate": 4.0195023470612095e-06, "loss": 0.3608, "num_input_tokens_seen": 13521744, "step": 20575 }, { "epoch": 12.13443396226415, "grad_norm": 3.339893341064453, "learning_rate": 4.016979714684631e-06, "loss": 0.3928, "num_input_tokens_seen": 13525200, "step": 20580 }, { "epoch": 12.137382075471699, "grad_norm": 3.4719831943511963, "learning_rate": 4.014457342566909e-06, "loss": 0.4983, "num_input_tokens_seen": 13528336, "step": 20585 }, { "epoch": 12.140330188679245, "grad_norm": 3.8152124881744385, "learning_rate": 4.011935231375853e-06, "loss": 0.3394, "num_input_tokens_seen": 13535888, "step": 20590 }, { "epoch": 12.143278301886792, "grad_norm": 5.84313440322876, "learning_rate": 4.009413381779203e-06, "loss": 0.4326, "num_input_tokens_seen": 13538192, "step": 20595 }, { "epoch": 12.14622641509434, "grad_norm": 2.707919120788574, "learning_rate": 4.00689179444463e-06, "loss": 0.3895, "num_input_tokens_seen": 13541008, "step": 20600 }, { "epoch": 12.149174528301886, "grad_norm": 2.927839517593384, "learning_rate": 4.004370470039733e-06, "loss": 0.3823, "num_input_tokens_seen": 13544720, "step": 20605 }, { "epoch": 12.152122641509434, "grad_norm": 4.994647026062012, "learning_rate": 4.001849409232046e-06, "loss": 0.3382, "num_input_tokens_seen": 13547344, "step": 20610 }, { "epoch": 12.15507075471698, "grad_norm": 3.126650333404541, "learning_rate": 3.9993286126890274e-06, "loss": 0.4373, "num_input_tokens_seen": 13550288, "step": 20615 }, { "epoch": 12.158018867924529, "grad_norm": 3.962484359741211, "learning_rate": 3.996808081078074e-06, "loss": 0.3806, "num_input_tokens_seen": 13553776, "step": 20620 }, { "epoch": 12.160966981132075, "grad_norm": 3.9641928672790527, "learning_rate": 3.9942878150665035e-06, "loss": 0.3177, "num_input_tokens_seen": 13557776, "step": 20625 }, { "epoch": 12.163915094339623, "grad_norm": 3.01908278465271, "learning_rate": 3.991767815321569e-06, "loss": 0.3593, "num_input_tokens_seen": 13560880, "step": 20630 }, { "epoch": 12.16686320754717, "grad_norm": 2.9584901332855225, "learning_rate": 3.9892480825104504e-06, "loss": 0.3327, "num_input_tokens_seen": 13563856, "step": 20635 }, { "epoch": 12.169811320754716, "grad_norm": 6.165433883666992, "learning_rate": 3.986728617300257e-06, "loss": 0.3718, "num_input_tokens_seen": 13566960, "step": 20640 }, { "epoch": 12.172759433962264, "grad_norm": 3.1964964866638184, "learning_rate": 3.984209420358031e-06, "loss": 0.3084, "num_input_tokens_seen": 13570960, "step": 20645 }, { "epoch": 12.17570754716981, "grad_norm": 2.6186063289642334, "learning_rate": 3.981690492350738e-06, "loss": 0.3787, "num_input_tokens_seen": 13573872, "step": 20650 }, { "epoch": 12.178655660377359, "grad_norm": 3.930363416671753, "learning_rate": 3.979171833945276e-06, "loss": 0.3289, "num_input_tokens_seen": 13576880, "step": 20655 }, { "epoch": 12.181603773584905, "grad_norm": 2.0664501190185547, "learning_rate": 3.97665344580847e-06, "loss": 0.451, "num_input_tokens_seen": 13579600, "step": 20660 }, { "epoch": 12.184551886792454, "grad_norm": 3.2089884281158447, "learning_rate": 3.974135328607075e-06, "loss": 0.3067, "num_input_tokens_seen": 13582448, "step": 20665 }, { "epoch": 12.1875, "grad_norm": 1.9291728734970093, "learning_rate": 3.971617483007773e-06, "loss": 0.3389, "num_input_tokens_seen": 13585488, "step": 20670 }, { "epoch": 12.190448113207546, "grad_norm": 7.728989601135254, "learning_rate": 3.969099909677174e-06, "loss": 0.4329, "num_input_tokens_seen": 13589584, "step": 20675 }, { "epoch": 12.193396226415095, "grad_norm": 3.2600440979003906, "learning_rate": 3.9665826092818165e-06, "loss": 0.3147, "num_input_tokens_seen": 13592592, "step": 20680 }, { "epoch": 12.196344339622641, "grad_norm": 3.9623990058898926, "learning_rate": 3.964065582488168e-06, "loss": 0.4706, "num_input_tokens_seen": 13598896, "step": 20685 }, { "epoch": 12.19929245283019, "grad_norm": 3.1531331539154053, "learning_rate": 3.961548829962622e-06, "loss": 0.3484, "num_input_tokens_seen": 13603376, "step": 20690 }, { "epoch": 12.202240566037736, "grad_norm": 2.8436834812164307, "learning_rate": 3.959032352371496e-06, "loss": 0.3744, "num_input_tokens_seen": 13605648, "step": 20695 }, { "epoch": 12.205188679245284, "grad_norm": 3.504366874694824, "learning_rate": 3.956516150381043e-06, "loss": 0.3318, "num_input_tokens_seen": 13608208, "step": 20700 }, { "epoch": 12.20813679245283, "grad_norm": 2.595430850982666, "learning_rate": 3.954000224657436e-06, "loss": 0.3872, "num_input_tokens_seen": 13611312, "step": 20705 }, { "epoch": 12.211084905660377, "grad_norm": 3.696253776550293, "learning_rate": 3.951484575866776e-06, "loss": 0.3301, "num_input_tokens_seen": 13614288, "step": 20710 }, { "epoch": 12.214033018867925, "grad_norm": 2.2964136600494385, "learning_rate": 3.948969204675096e-06, "loss": 0.4323, "num_input_tokens_seen": 13617008, "step": 20715 }, { "epoch": 12.216981132075471, "grad_norm": 4.083402633666992, "learning_rate": 3.946454111748346e-06, "loss": 0.3361, "num_input_tokens_seen": 13619696, "step": 20720 }, { "epoch": 12.21992924528302, "grad_norm": 6.239043235778809, "learning_rate": 3.943939297752413e-06, "loss": 0.4831, "num_input_tokens_seen": 13622992, "step": 20725 }, { "epoch": 12.222877358490566, "grad_norm": 4.235449314117432, "learning_rate": 3.9414247633531e-06, "loss": 0.3128, "num_input_tokens_seen": 13625232, "step": 20730 }, { "epoch": 12.225825471698114, "grad_norm": 2.240983247756958, "learning_rate": 3.9389105092161454e-06, "loss": 0.3969, "num_input_tokens_seen": 13628400, "step": 20735 }, { "epoch": 12.22877358490566, "grad_norm": 2.0177788734436035, "learning_rate": 3.936396536007205e-06, "loss": 0.3803, "num_input_tokens_seen": 13632272, "step": 20740 }, { "epoch": 12.231721698113208, "grad_norm": 2.907423496246338, "learning_rate": 3.933882844391866e-06, "loss": 0.447, "num_input_tokens_seen": 13635056, "step": 20745 }, { "epoch": 12.234669811320755, "grad_norm": 2.6729273796081543, "learning_rate": 3.931369435035639e-06, "loss": 0.381, "num_input_tokens_seen": 13638960, "step": 20750 }, { "epoch": 12.237617924528301, "grad_norm": 4.038314342498779, "learning_rate": 3.92885630860396e-06, "loss": 0.2195, "num_input_tokens_seen": 13642320, "step": 20755 }, { "epoch": 12.24056603773585, "grad_norm": 2.2901248931884766, "learning_rate": 3.926343465762189e-06, "loss": 0.3478, "num_input_tokens_seen": 13645424, "step": 20760 }, { "epoch": 12.243514150943396, "grad_norm": 1.8170710802078247, "learning_rate": 3.923830907175613e-06, "loss": 0.4744, "num_input_tokens_seen": 13649584, "step": 20765 }, { "epoch": 12.246462264150944, "grad_norm": 3.9314944744110107, "learning_rate": 3.921318633509442e-06, "loss": 0.3421, "num_input_tokens_seen": 13652592, "step": 20770 }, { "epoch": 12.24941037735849, "grad_norm": 2.9024147987365723, "learning_rate": 3.918806645428811e-06, "loss": 0.247, "num_input_tokens_seen": 13655728, "step": 20775 }, { "epoch": 12.252358490566039, "grad_norm": 2.722501277923584, "learning_rate": 3.916294943598781e-06, "loss": 0.2707, "num_input_tokens_seen": 13658160, "step": 20780 }, { "epoch": 12.255306603773585, "grad_norm": 3.327033042907715, "learning_rate": 3.913783528684336e-06, "loss": 0.4218, "num_input_tokens_seen": 13661328, "step": 20785 }, { "epoch": 12.258254716981131, "grad_norm": 4.903980731964111, "learning_rate": 3.911272401350381e-06, "loss": 0.4223, "num_input_tokens_seen": 13665424, "step": 20790 }, { "epoch": 12.26120283018868, "grad_norm": 2.589827299118042, "learning_rate": 3.9087615622617525e-06, "loss": 0.338, "num_input_tokens_seen": 13668528, "step": 20795 }, { "epoch": 12.264150943396226, "grad_norm": 2.750173807144165, "learning_rate": 3.906251012083202e-06, "loss": 0.2942, "num_input_tokens_seen": 13671920, "step": 20800 }, { "epoch": 12.267099056603774, "grad_norm": 3.2562355995178223, "learning_rate": 3.9037407514794085e-06, "loss": 0.3965, "num_input_tokens_seen": 13675696, "step": 20805 }, { "epoch": 12.27004716981132, "grad_norm": 3.404291868209839, "learning_rate": 3.901230781114976e-06, "loss": 0.383, "num_input_tokens_seen": 13679152, "step": 20810 }, { "epoch": 12.272995283018869, "grad_norm": 2.583686590194702, "learning_rate": 3.898721101654431e-06, "loss": 0.4575, "num_input_tokens_seen": 13682192, "step": 20815 }, { "epoch": 12.275943396226415, "grad_norm": 3.251049518585205, "learning_rate": 3.896211713762221e-06, "loss": 0.4598, "num_input_tokens_seen": 13686096, "step": 20820 }, { "epoch": 12.278891509433961, "grad_norm": 2.659850597381592, "learning_rate": 3.893702618102715e-06, "loss": 0.3639, "num_input_tokens_seen": 13689648, "step": 20825 }, { "epoch": 12.28183962264151, "grad_norm": 2.0594053268432617, "learning_rate": 3.891193815340211e-06, "loss": 0.3369, "num_input_tokens_seen": 13692624, "step": 20830 }, { "epoch": 12.284787735849056, "grad_norm": 1.7678366899490356, "learning_rate": 3.888685306138922e-06, "loss": 0.4058, "num_input_tokens_seen": 13697392, "step": 20835 }, { "epoch": 12.287735849056604, "grad_norm": 2.622631549835205, "learning_rate": 3.886177091162987e-06, "loss": 0.4269, "num_input_tokens_seen": 13700816, "step": 20840 }, { "epoch": 12.29068396226415, "grad_norm": 4.295436382293701, "learning_rate": 3.883669171076468e-06, "loss": 0.3921, "num_input_tokens_seen": 13703600, "step": 20845 }, { "epoch": 12.293632075471699, "grad_norm": 3.67788028717041, "learning_rate": 3.881161546543348e-06, "loss": 0.3875, "num_input_tokens_seen": 13706128, "step": 20850 }, { "epoch": 12.296580188679245, "grad_norm": 3.0230910778045654, "learning_rate": 3.8786542182275295e-06, "loss": 0.3174, "num_input_tokens_seen": 13709872, "step": 20855 }, { "epoch": 12.299528301886792, "grad_norm": 2.7158219814300537, "learning_rate": 3.87614718679284e-06, "loss": 0.3911, "num_input_tokens_seen": 13712752, "step": 20860 }, { "epoch": 12.30247641509434, "grad_norm": 5.059131145477295, "learning_rate": 3.8736404529030255e-06, "loss": 0.3253, "num_input_tokens_seen": 13715472, "step": 20865 }, { "epoch": 12.305424528301886, "grad_norm": 2.115938663482666, "learning_rate": 3.871134017221756e-06, "loss": 0.2241, "num_input_tokens_seen": 13720688, "step": 20870 }, { "epoch": 12.308372641509434, "grad_norm": 3.6568243503570557, "learning_rate": 3.868627880412621e-06, "loss": 0.413, "num_input_tokens_seen": 13723504, "step": 20875 }, { "epoch": 12.31132075471698, "grad_norm": 4.534202575683594, "learning_rate": 3.86612204313913e-06, "loss": 0.363, "num_input_tokens_seen": 13725840, "step": 20880 }, { "epoch": 12.314268867924529, "grad_norm": 2.968675374984741, "learning_rate": 3.863616506064714e-06, "loss": 0.2425, "num_input_tokens_seen": 13728240, "step": 20885 }, { "epoch": 12.317216981132075, "grad_norm": 3.4000906944274902, "learning_rate": 3.861111269852727e-06, "loss": 0.4515, "num_input_tokens_seen": 13731152, "step": 20890 }, { "epoch": 12.320165094339623, "grad_norm": 2.4599649906158447, "learning_rate": 3.858606335166439e-06, "loss": 0.3133, "num_input_tokens_seen": 13734160, "step": 20895 }, { "epoch": 12.32311320754717, "grad_norm": 2.3866491317749023, "learning_rate": 3.8561017026690415e-06, "loss": 0.3429, "num_input_tokens_seen": 13736976, "step": 20900 }, { "epoch": 12.326061320754716, "grad_norm": 5.611135959625244, "learning_rate": 3.8535973730236495e-06, "loss": 0.3503, "num_input_tokens_seen": 13739280, "step": 20905 }, { "epoch": 12.329009433962264, "grad_norm": 3.9649672508239746, "learning_rate": 3.851093346893294e-06, "loss": 0.2135, "num_input_tokens_seen": 13742288, "step": 20910 }, { "epoch": 12.33195754716981, "grad_norm": 3.5205538272857666, "learning_rate": 3.848589624940927e-06, "loss": 0.3316, "num_input_tokens_seen": 13745168, "step": 20915 }, { "epoch": 12.334905660377359, "grad_norm": 8.603401184082031, "learning_rate": 3.846086207829417e-06, "loss": 0.2714, "num_input_tokens_seen": 13747760, "step": 20920 }, { "epoch": 12.337853773584905, "grad_norm": 2.8814942836761475, "learning_rate": 3.843583096221559e-06, "loss": 0.3618, "num_input_tokens_seen": 13751952, "step": 20925 }, { "epoch": 12.340801886792454, "grad_norm": 3.9056174755096436, "learning_rate": 3.8410802907800596e-06, "loss": 0.3908, "num_input_tokens_seen": 13755248, "step": 20930 }, { "epoch": 12.34375, "grad_norm": 2.743077516555786, "learning_rate": 3.838577792167546e-06, "loss": 0.3686, "num_input_tokens_seen": 13759472, "step": 20935 }, { "epoch": 12.346698113207546, "grad_norm": 2.372849702835083, "learning_rate": 3.836075601046569e-06, "loss": 0.3864, "num_input_tokens_seen": 13762768, "step": 20940 }, { "epoch": 12.349646226415095, "grad_norm": 2.6860618591308594, "learning_rate": 3.833573718079594e-06, "loss": 0.4276, "num_input_tokens_seen": 13765968, "step": 20945 }, { "epoch": 12.352594339622641, "grad_norm": 3.1898505687713623, "learning_rate": 3.831072143929002e-06, "loss": 0.2573, "num_input_tokens_seen": 13768400, "step": 20950 }, { "epoch": 12.35554245283019, "grad_norm": 2.770662546157837, "learning_rate": 3.828570879257098e-06, "loss": 0.4397, "num_input_tokens_seen": 13771376, "step": 20955 }, { "epoch": 12.358490566037736, "grad_norm": 2.442225217819214, "learning_rate": 3.826069924726102e-06, "loss": 0.383, "num_input_tokens_seen": 13774960, "step": 20960 }, { "epoch": 12.361438679245284, "grad_norm": 2.054852247238159, "learning_rate": 3.823569280998154e-06, "loss": 0.3303, "num_input_tokens_seen": 13778352, "step": 20965 }, { "epoch": 12.36438679245283, "grad_norm": 2.7256662845611572, "learning_rate": 3.8210689487353095e-06, "loss": 0.3556, "num_input_tokens_seen": 13780592, "step": 20970 }, { "epoch": 12.367334905660377, "grad_norm": 2.909536600112915, "learning_rate": 3.818568928599539e-06, "loss": 0.3439, "num_input_tokens_seen": 13783984, "step": 20975 }, { "epoch": 12.370283018867925, "grad_norm": 2.919609785079956, "learning_rate": 3.816069221252739e-06, "loss": 0.357, "num_input_tokens_seen": 13786448, "step": 20980 }, { "epoch": 12.373231132075471, "grad_norm": 2.3102035522460938, "learning_rate": 3.813569827356715e-06, "loss": 0.3757, "num_input_tokens_seen": 13789648, "step": 20985 }, { "epoch": 12.37617924528302, "grad_norm": 2.457350492477417, "learning_rate": 3.811070747573191e-06, "loss": 0.3564, "num_input_tokens_seen": 13792112, "step": 20990 }, { "epoch": 12.379127358490566, "grad_norm": 2.9253787994384766, "learning_rate": 3.8085719825638098e-06, "loss": 0.4078, "num_input_tokens_seen": 13795184, "step": 20995 }, { "epoch": 12.382075471698114, "grad_norm": 4.033147811889648, "learning_rate": 3.806073532990132e-06, "loss": 0.2551, "num_input_tokens_seen": 13798288, "step": 21000 }, { "epoch": 12.38502358490566, "grad_norm": 5.415529727935791, "learning_rate": 3.8035753995136333e-06, "loss": 0.4665, "num_input_tokens_seen": 13800976, "step": 21005 }, { "epoch": 12.387971698113208, "grad_norm": 2.390009880065918, "learning_rate": 3.801077582795702e-06, "loss": 0.2906, "num_input_tokens_seen": 13803600, "step": 21010 }, { "epoch": 12.390919811320755, "grad_norm": 2.7795472145080566, "learning_rate": 3.7985800834976494e-06, "loss": 0.3664, "num_input_tokens_seen": 13806384, "step": 21015 }, { "epoch": 12.393867924528301, "grad_norm": 3.001187324523926, "learning_rate": 3.7960829022806965e-06, "loss": 0.2411, "num_input_tokens_seen": 13809488, "step": 21020 }, { "epoch": 12.39681603773585, "grad_norm": 3.2996702194213867, "learning_rate": 3.793586039805984e-06, "loss": 0.2722, "num_input_tokens_seen": 13812368, "step": 21025 }, { "epoch": 12.399764150943396, "grad_norm": 1.9129056930541992, "learning_rate": 3.791089496734567e-06, "loss": 0.4172, "num_input_tokens_seen": 13815536, "step": 21030 }, { "epoch": 12.402712264150944, "grad_norm": 2.798980712890625, "learning_rate": 3.7885932737274163e-06, "loss": 0.3769, "num_input_tokens_seen": 13818832, "step": 21035 }, { "epoch": 12.40566037735849, "grad_norm": 3.1407077312469482, "learning_rate": 3.7860973714454156e-06, "loss": 0.477, "num_input_tokens_seen": 13822128, "step": 21040 }, { "epoch": 12.408608490566039, "grad_norm": 2.8872790336608887, "learning_rate": 3.7836017905493695e-06, "loss": 0.3358, "num_input_tokens_seen": 13825840, "step": 21045 }, { "epoch": 12.411556603773585, "grad_norm": 2.33056640625, "learning_rate": 3.7811065316999908e-06, "loss": 0.2826, "num_input_tokens_seen": 13828528, "step": 21050 }, { "epoch": 12.414504716981131, "grad_norm": 3.231480360031128, "learning_rate": 3.7786115955579105e-06, "loss": 0.3353, "num_input_tokens_seen": 13831568, "step": 21055 }, { "epoch": 12.41745283018868, "grad_norm": 3.3072588443756104, "learning_rate": 3.7761169827836746e-06, "loss": 0.296, "num_input_tokens_seen": 13833968, "step": 21060 }, { "epoch": 12.420400943396226, "grad_norm": 2.506629467010498, "learning_rate": 3.773622694037743e-06, "loss": 0.3543, "num_input_tokens_seen": 13837232, "step": 21065 }, { "epoch": 12.423349056603774, "grad_norm": 3.0421481132507324, "learning_rate": 3.7711287299804865e-06, "loss": 0.5336, "num_input_tokens_seen": 13840944, "step": 21070 }, { "epoch": 12.42629716981132, "grad_norm": 2.9167048931121826, "learning_rate": 3.768635091272197e-06, "loss": 0.3152, "num_input_tokens_seen": 13844336, "step": 21075 }, { "epoch": 12.429245283018869, "grad_norm": 6.2789387702941895, "learning_rate": 3.7661417785730732e-06, "loss": 0.4897, "num_input_tokens_seen": 13847728, "step": 21080 }, { "epoch": 12.432193396226415, "grad_norm": 2.3515632152557373, "learning_rate": 3.7636487925432304e-06, "loss": 0.2853, "num_input_tokens_seen": 13850800, "step": 21085 }, { "epoch": 12.435141509433961, "grad_norm": 5.050576210021973, "learning_rate": 3.761156133842697e-06, "loss": 0.3811, "num_input_tokens_seen": 13854192, "step": 21090 }, { "epoch": 12.43808962264151, "grad_norm": 3.111809253692627, "learning_rate": 3.7586638031314182e-06, "loss": 0.3593, "num_input_tokens_seen": 13857168, "step": 21095 }, { "epoch": 12.441037735849056, "grad_norm": 3.205216407775879, "learning_rate": 3.7561718010692477e-06, "loss": 0.3257, "num_input_tokens_seen": 13860240, "step": 21100 }, { "epoch": 12.443985849056604, "grad_norm": 2.072448253631592, "learning_rate": 3.7536801283159523e-06, "loss": 0.4428, "num_input_tokens_seen": 13864688, "step": 21105 }, { "epoch": 12.44693396226415, "grad_norm": 3.6486997604370117, "learning_rate": 3.7511887855312155e-06, "loss": 0.6059, "num_input_tokens_seen": 13869232, "step": 21110 }, { "epoch": 12.449882075471699, "grad_norm": 4.769418716430664, "learning_rate": 3.74869777337463e-06, "loss": 0.504, "num_input_tokens_seen": 13873168, "step": 21115 }, { "epoch": 12.452830188679245, "grad_norm": 8.437764167785645, "learning_rate": 3.7462070925057004e-06, "loss": 0.4391, "num_input_tokens_seen": 13875792, "step": 21120 }, { "epoch": 12.455778301886792, "grad_norm": 3.40521502494812, "learning_rate": 3.7437167435838472e-06, "loss": 0.377, "num_input_tokens_seen": 13878448, "step": 21125 }, { "epoch": 12.45872641509434, "grad_norm": 4.206230640411377, "learning_rate": 3.741226727268401e-06, "loss": 0.4417, "num_input_tokens_seen": 13881648, "step": 21130 }, { "epoch": 12.461674528301886, "grad_norm": 2.9391913414001465, "learning_rate": 3.7387370442186027e-06, "loss": 0.3025, "num_input_tokens_seen": 13885808, "step": 21135 }, { "epoch": 12.464622641509434, "grad_norm": 2.9090332984924316, "learning_rate": 3.736247695093609e-06, "loss": 0.3513, "num_input_tokens_seen": 13889584, "step": 21140 }, { "epoch": 12.46757075471698, "grad_norm": 2.3743278980255127, "learning_rate": 3.7337586805524838e-06, "loss": 0.4749, "num_input_tokens_seen": 13893392, "step": 21145 }, { "epoch": 12.470518867924529, "grad_norm": 5.034541606903076, "learning_rate": 3.731270001254205e-06, "loss": 0.3435, "num_input_tokens_seen": 13896272, "step": 21150 }, { "epoch": 12.473466981132075, "grad_norm": 3.5804200172424316, "learning_rate": 3.728781657857661e-06, "loss": 0.2713, "num_input_tokens_seen": 13898480, "step": 21155 }, { "epoch": 12.476415094339623, "grad_norm": 3.293790102005005, "learning_rate": 3.726293651021653e-06, "loss": 0.346, "num_input_tokens_seen": 13901648, "step": 21160 }, { "epoch": 12.47936320754717, "grad_norm": 1.8772025108337402, "learning_rate": 3.7238059814048888e-06, "loss": 0.2959, "num_input_tokens_seen": 13904880, "step": 21165 }, { "epoch": 12.482311320754716, "grad_norm": 2.598938226699829, "learning_rate": 3.7213186496659916e-06, "loss": 0.3253, "num_input_tokens_seen": 13907920, "step": 21170 }, { "epoch": 12.485259433962264, "grad_norm": 2.386141300201416, "learning_rate": 3.718831656463493e-06, "loss": 0.2937, "num_input_tokens_seen": 13911376, "step": 21175 }, { "epoch": 12.48820754716981, "grad_norm": 3.51926326751709, "learning_rate": 3.7163450024558345e-06, "loss": 0.5178, "num_input_tokens_seen": 13915216, "step": 21180 }, { "epoch": 12.491155660377359, "grad_norm": 4.624641418457031, "learning_rate": 3.7138586883013694e-06, "loss": 0.4453, "num_input_tokens_seen": 13917968, "step": 21185 }, { "epoch": 12.494103773584905, "grad_norm": 2.592355251312256, "learning_rate": 3.7113727146583573e-06, "loss": 0.3878, "num_input_tokens_seen": 13920656, "step": 21190 }, { "epoch": 12.497051886792454, "grad_norm": 2.1472208499908447, "learning_rate": 3.7088870821849748e-06, "loss": 0.3764, "num_input_tokens_seen": 13924432, "step": 21195 }, { "epoch": 12.5, "grad_norm": 4.90548038482666, "learning_rate": 3.706401791539301e-06, "loss": 0.2504, "num_input_tokens_seen": 13927408, "step": 21200 }, { "epoch": 12.502948113207546, "grad_norm": 3.35027813911438, "learning_rate": 3.703916843379328e-06, "loss": 0.4941, "num_input_tokens_seen": 13930448, "step": 21205 }, { "epoch": 12.505896226415095, "grad_norm": 2.327728748321533, "learning_rate": 3.7014322383629575e-06, "loss": 0.358, "num_input_tokens_seen": 13934160, "step": 21210 }, { "epoch": 12.508844339622641, "grad_norm": 4.441257953643799, "learning_rate": 3.6989479771479976e-06, "loss": 0.4785, "num_input_tokens_seen": 13937904, "step": 21215 }, { "epoch": 12.51179245283019, "grad_norm": 2.0684306621551514, "learning_rate": 3.696464060392169e-06, "loss": 0.5306, "num_input_tokens_seen": 13943248, "step": 21220 }, { "epoch": 12.514740566037736, "grad_norm": 2.521986246109009, "learning_rate": 3.6939804887530962e-06, "loss": 0.3119, "num_input_tokens_seen": 13946256, "step": 21225 }, { "epoch": 12.517688679245284, "grad_norm": 3.5539913177490234, "learning_rate": 3.6914972628883196e-06, "loss": 0.3323, "num_input_tokens_seen": 13949328, "step": 21230 }, { "epoch": 12.52063679245283, "grad_norm": 2.8236629962921143, "learning_rate": 3.6890143834552814e-06, "loss": 0.3264, "num_input_tokens_seen": 13952368, "step": 21235 }, { "epoch": 12.523584905660378, "grad_norm": 2.975353479385376, "learning_rate": 3.6865318511113347e-06, "loss": 0.3739, "num_input_tokens_seen": 13955696, "step": 21240 }, { "epoch": 12.526533018867925, "grad_norm": 1.7085002660751343, "learning_rate": 3.684049666513742e-06, "loss": 0.2956, "num_input_tokens_seen": 13959312, "step": 21245 }, { "epoch": 12.529481132075471, "grad_norm": 3.2460124492645264, "learning_rate": 3.6815678303196715e-06, "loss": 0.3081, "num_input_tokens_seen": 13962928, "step": 21250 }, { "epoch": 12.53242924528302, "grad_norm": 4.136741638183594, "learning_rate": 3.6790863431861988e-06, "loss": 0.3079, "num_input_tokens_seen": 13965488, "step": 21255 }, { "epoch": 12.535377358490566, "grad_norm": 1.644304871559143, "learning_rate": 3.676605205770311e-06, "loss": 0.319, "num_input_tokens_seen": 13968912, "step": 21260 }, { "epoch": 12.538325471698114, "grad_norm": 5.903139114379883, "learning_rate": 3.674124418728898e-06, "loss": 0.4081, "num_input_tokens_seen": 13971440, "step": 21265 }, { "epoch": 12.54127358490566, "grad_norm": 3.798128128051758, "learning_rate": 3.671643982718759e-06, "loss": 0.4054, "num_input_tokens_seen": 13974608, "step": 21270 }, { "epoch": 12.544221698113208, "grad_norm": 2.598311185836792, "learning_rate": 3.6691638983966017e-06, "loss": 0.296, "num_input_tokens_seen": 13977456, "step": 21275 }, { "epoch": 12.547169811320755, "grad_norm": 4.988778591156006, "learning_rate": 3.666684166419037e-06, "loss": 0.4204, "num_input_tokens_seen": 13980688, "step": 21280 }, { "epoch": 12.550117924528301, "grad_norm": 5.888576984405518, "learning_rate": 3.6642047874425833e-06, "loss": 0.3326, "num_input_tokens_seen": 13983536, "step": 21285 }, { "epoch": 12.55306603773585, "grad_norm": 1.7901389598846436, "learning_rate": 3.661725762123671e-06, "loss": 0.4727, "num_input_tokens_seen": 13987536, "step": 21290 }, { "epoch": 12.556014150943396, "grad_norm": 2.952404737472534, "learning_rate": 3.659247091118631e-06, "loss": 0.3242, "num_input_tokens_seen": 13990928, "step": 21295 }, { "epoch": 12.558962264150944, "grad_norm": 1.9161968231201172, "learning_rate": 3.6567687750837027e-06, "loss": 0.3587, "num_input_tokens_seen": 13994160, "step": 21300 }, { "epoch": 12.56191037735849, "grad_norm": 5.462453842163086, "learning_rate": 3.6542908146750287e-06, "loss": 0.2914, "num_input_tokens_seen": 13997200, "step": 21305 }, { "epoch": 12.564858490566039, "grad_norm": 3.4031922817230225, "learning_rate": 3.6518132105486624e-06, "loss": 0.3028, "num_input_tokens_seen": 14000144, "step": 21310 }, { "epoch": 12.567806603773585, "grad_norm": 2.7293851375579834, "learning_rate": 3.649335963360559e-06, "loss": 0.4449, "num_input_tokens_seen": 14003120, "step": 21315 }, { "epoch": 12.570754716981131, "grad_norm": 2.338593006134033, "learning_rate": 3.6468590737665795e-06, "loss": 0.3388, "num_input_tokens_seen": 14007248, "step": 21320 }, { "epoch": 12.57370283018868, "grad_norm": 2.988677501678467, "learning_rate": 3.6443825424224926e-06, "loss": 0.355, "num_input_tokens_seen": 14010768, "step": 21325 }, { "epoch": 12.576650943396226, "grad_norm": 2.1959547996520996, "learning_rate": 3.64190636998397e-06, "loss": 0.4149, "num_input_tokens_seen": 14013776, "step": 21330 }, { "epoch": 12.579599056603774, "grad_norm": 1.9597562551498413, "learning_rate": 3.639430557106588e-06, "loss": 0.2989, "num_input_tokens_seen": 14016656, "step": 21335 }, { "epoch": 12.58254716981132, "grad_norm": 1.9886515140533447, "learning_rate": 3.6369551044458314e-06, "loss": 0.3502, "num_input_tokens_seen": 14020144, "step": 21340 }, { "epoch": 12.585495283018869, "grad_norm": 2.7308881282806396, "learning_rate": 3.6344800126570846e-06, "loss": 0.4691, "num_input_tokens_seen": 14022768, "step": 21345 }, { "epoch": 12.588443396226415, "grad_norm": 1.5920990705490112, "learning_rate": 3.6320052823956385e-06, "loss": 0.3026, "num_input_tokens_seen": 14026224, "step": 21350 }, { "epoch": 12.591391509433961, "grad_norm": 2.451422929763794, "learning_rate": 3.6295309143166906e-06, "loss": 0.3757, "num_input_tokens_seen": 14029776, "step": 21355 }, { "epoch": 12.59433962264151, "grad_norm": 4.539200782775879, "learning_rate": 3.62705690907534e-06, "loss": 0.3872, "num_input_tokens_seen": 14033776, "step": 21360 }, { "epoch": 12.597287735849056, "grad_norm": 7.373136043548584, "learning_rate": 3.624583267326588e-06, "loss": 0.3119, "num_input_tokens_seen": 14036272, "step": 21365 }, { "epoch": 12.600235849056604, "grad_norm": 2.522484302520752, "learning_rate": 3.6221099897253454e-06, "loss": 0.3284, "num_input_tokens_seen": 14041040, "step": 21370 }, { "epoch": 12.60318396226415, "grad_norm": 2.8799023628234863, "learning_rate": 3.619637076926421e-06, "loss": 0.2649, "num_input_tokens_seen": 14044304, "step": 21375 }, { "epoch": 12.606132075471699, "grad_norm": 4.266395092010498, "learning_rate": 3.617164529584528e-06, "loss": 0.4763, "num_input_tokens_seen": 14046768, "step": 21380 }, { "epoch": 12.609080188679245, "grad_norm": 2.9551467895507812, "learning_rate": 3.614692348354286e-06, "loss": 0.4024, "num_input_tokens_seen": 14049424, "step": 21385 }, { "epoch": 12.612028301886792, "grad_norm": 3.6736137866973877, "learning_rate": 3.612220533890216e-06, "loss": 0.3782, "num_input_tokens_seen": 14052240, "step": 21390 }, { "epoch": 12.61497641509434, "grad_norm": 2.355088710784912, "learning_rate": 3.609749086846741e-06, "loss": 0.3685, "num_input_tokens_seen": 14056240, "step": 21395 }, { "epoch": 12.617924528301886, "grad_norm": 2.2823781967163086, "learning_rate": 3.607278007878186e-06, "loss": 0.3335, "num_input_tokens_seen": 14059600, "step": 21400 }, { "epoch": 12.620872641509434, "grad_norm": 7.6052680015563965, "learning_rate": 3.6048072976387817e-06, "loss": 0.3733, "num_input_tokens_seen": 14062896, "step": 21405 }, { "epoch": 12.62382075471698, "grad_norm": 2.967176675796509, "learning_rate": 3.6023369567826585e-06, "loss": 0.3771, "num_input_tokens_seen": 14066416, "step": 21410 }, { "epoch": 12.626768867924529, "grad_norm": 3.739410638809204, "learning_rate": 3.599866985963848e-06, "loss": 0.2977, "num_input_tokens_seen": 14069424, "step": 21415 }, { "epoch": 12.629716981132075, "grad_norm": 5.638262748718262, "learning_rate": 3.5973973858362885e-06, "loss": 0.3376, "num_input_tokens_seen": 14072336, "step": 21420 }, { "epoch": 12.632665094339622, "grad_norm": 1.8749562501907349, "learning_rate": 3.594928157053816e-06, "loss": 0.3358, "num_input_tokens_seen": 14075856, "step": 21425 }, { "epoch": 12.63561320754717, "grad_norm": 3.378084897994995, "learning_rate": 3.592459300270168e-06, "loss": 0.3568, "num_input_tokens_seen": 14079792, "step": 21430 }, { "epoch": 12.638561320754716, "grad_norm": 2.9577298164367676, "learning_rate": 3.589990816138988e-06, "loss": 0.3777, "num_input_tokens_seen": 14084624, "step": 21435 }, { "epoch": 12.641509433962264, "grad_norm": 6.8189873695373535, "learning_rate": 3.587522705313816e-06, "loss": 0.3762, "num_input_tokens_seen": 14087248, "step": 21440 }, { "epoch": 12.64445754716981, "grad_norm": 1.2858933210372925, "learning_rate": 3.585054968448094e-06, "loss": 0.2805, "num_input_tokens_seen": 14090480, "step": 21445 }, { "epoch": 12.647405660377359, "grad_norm": 2.7150301933288574, "learning_rate": 3.5825876061951686e-06, "loss": 0.3603, "num_input_tokens_seen": 14093904, "step": 21450 }, { "epoch": 12.650353773584905, "grad_norm": 1.759726643562317, "learning_rate": 3.5801206192082818e-06, "loss": 0.2845, "num_input_tokens_seen": 14097136, "step": 21455 }, { "epoch": 12.653301886792454, "grad_norm": 2.2539100646972656, "learning_rate": 3.577654008140582e-06, "loss": 0.367, "num_input_tokens_seen": 14100208, "step": 21460 }, { "epoch": 12.65625, "grad_norm": 3.7967898845672607, "learning_rate": 3.5751877736451123e-06, "loss": 0.4227, "num_input_tokens_seen": 14103920, "step": 21465 }, { "epoch": 12.659198113207546, "grad_norm": 2.8325154781341553, "learning_rate": 3.5727219163748205e-06, "loss": 0.3946, "num_input_tokens_seen": 14107536, "step": 21470 }, { "epoch": 12.662146226415095, "grad_norm": 2.179591178894043, "learning_rate": 3.570256436982552e-06, "loss": 0.381, "num_input_tokens_seen": 14111536, "step": 21475 }, { "epoch": 12.665094339622641, "grad_norm": 1.7885324954986572, "learning_rate": 3.5677913361210536e-06, "loss": 0.2593, "num_input_tokens_seen": 14114192, "step": 21480 }, { "epoch": 12.66804245283019, "grad_norm": 3.0825130939483643, "learning_rate": 3.565326614442972e-06, "loss": 0.3219, "num_input_tokens_seen": 14117840, "step": 21485 }, { "epoch": 12.670990566037736, "grad_norm": 2.68951416015625, "learning_rate": 3.5628622726008523e-06, "loss": 0.2907, "num_input_tokens_seen": 14120752, "step": 21490 }, { "epoch": 12.673938679245284, "grad_norm": 5.1981892585754395, "learning_rate": 3.56039831124714e-06, "loss": 0.5088, "num_input_tokens_seen": 14123408, "step": 21495 }, { "epoch": 12.67688679245283, "grad_norm": 3.692922353744507, "learning_rate": 3.557934731034179e-06, "loss": 0.416, "num_input_tokens_seen": 14126480, "step": 21500 }, { "epoch": 12.679834905660378, "grad_norm": 2.1209182739257812, "learning_rate": 3.5554715326142126e-06, "loss": 0.3592, "num_input_tokens_seen": 14129872, "step": 21505 }, { "epoch": 12.682783018867925, "grad_norm": 2.1466493606567383, "learning_rate": 3.553008716639384e-06, "loss": 0.2527, "num_input_tokens_seen": 14133008, "step": 21510 }, { "epoch": 12.685731132075471, "grad_norm": 2.7755067348480225, "learning_rate": 3.5505462837617338e-06, "loss": 0.3823, "num_input_tokens_seen": 14136336, "step": 21515 }, { "epoch": 12.68867924528302, "grad_norm": 3.5960283279418945, "learning_rate": 3.5480842346332013e-06, "loss": 0.3094, "num_input_tokens_seen": 14139344, "step": 21520 }, { "epoch": 12.691627358490566, "grad_norm": 2.779279947280884, "learning_rate": 3.5456225699056256e-06, "loss": 0.4299, "num_input_tokens_seen": 14142864, "step": 21525 }, { "epoch": 12.694575471698114, "grad_norm": 2.202871799468994, "learning_rate": 3.5431612902307426e-06, "loss": 0.4182, "num_input_tokens_seen": 14146032, "step": 21530 }, { "epoch": 12.69752358490566, "grad_norm": 4.3859782218933105, "learning_rate": 3.540700396260186e-06, "loss": 0.2671, "num_input_tokens_seen": 14149360, "step": 21535 }, { "epoch": 12.700471698113208, "grad_norm": 2.9432213306427, "learning_rate": 3.538239888645489e-06, "loss": 0.4789, "num_input_tokens_seen": 14152400, "step": 21540 }, { "epoch": 12.703419811320755, "grad_norm": 3.037104368209839, "learning_rate": 3.535779768038082e-06, "loss": 0.4989, "num_input_tokens_seen": 14155344, "step": 21545 }, { "epoch": 12.706367924528301, "grad_norm": 5.2940473556518555, "learning_rate": 3.5333200350892905e-06, "loss": 0.4166, "num_input_tokens_seen": 14158448, "step": 21550 }, { "epoch": 12.70931603773585, "grad_norm": 1.9137450456619263, "learning_rate": 3.530860690450342e-06, "loss": 0.2565, "num_input_tokens_seen": 14161488, "step": 21555 }, { "epoch": 12.712264150943396, "grad_norm": 3.1285526752471924, "learning_rate": 3.528401734772357e-06, "loss": 0.4825, "num_input_tokens_seen": 14164592, "step": 21560 }, { "epoch": 12.715212264150944, "grad_norm": 3.073610305786133, "learning_rate": 3.5259431687063538e-06, "loss": 0.4199, "num_input_tokens_seen": 14168144, "step": 21565 }, { "epoch": 12.71816037735849, "grad_norm": 3.887014150619507, "learning_rate": 3.523484992903249e-06, "loss": 0.3824, "num_input_tokens_seen": 14171248, "step": 21570 }, { "epoch": 12.721108490566039, "grad_norm": 2.6358656883239746, "learning_rate": 3.5210272080138573e-06, "loss": 0.2357, "num_input_tokens_seen": 14174704, "step": 21575 }, { "epoch": 12.724056603773585, "grad_norm": 3.8700549602508545, "learning_rate": 3.518569814688887e-06, "loss": 0.5027, "num_input_tokens_seen": 14177328, "step": 21580 }, { "epoch": 12.727004716981131, "grad_norm": 2.5069735050201416, "learning_rate": 3.5161128135789414e-06, "loss": 0.3278, "num_input_tokens_seen": 14179792, "step": 21585 }, { "epoch": 12.72995283018868, "grad_norm": 4.250805377960205, "learning_rate": 3.513656205334525e-06, "loss": 0.3317, "num_input_tokens_seen": 14183376, "step": 21590 }, { "epoch": 12.732900943396226, "grad_norm": 2.3387510776519775, "learning_rate": 3.5111999906060336e-06, "loss": 0.4805, "num_input_tokens_seen": 14187184, "step": 21595 }, { "epoch": 12.735849056603774, "grad_norm": 2.675712823867798, "learning_rate": 3.50874417004376e-06, "loss": 0.3189, "num_input_tokens_seen": 14190672, "step": 21600 }, { "epoch": 12.73879716981132, "grad_norm": 5.003769397735596, "learning_rate": 3.5062887442978956e-06, "loss": 0.3459, "num_input_tokens_seen": 14193168, "step": 21605 }, { "epoch": 12.741745283018869, "grad_norm": 3.0355396270751953, "learning_rate": 3.503833714018524e-06, "loss": 0.355, "num_input_tokens_seen": 14196528, "step": 21610 }, { "epoch": 12.744693396226415, "grad_norm": 3.7554874420166016, "learning_rate": 3.5013790798556228e-06, "loss": 0.3668, "num_input_tokens_seen": 14200752, "step": 21615 }, { "epoch": 12.747641509433961, "grad_norm": 2.349477529525757, "learning_rate": 3.4989248424590705e-06, "loss": 0.3339, "num_input_tokens_seen": 14204144, "step": 21620 }, { "epoch": 12.75058962264151, "grad_norm": 1.8885842561721802, "learning_rate": 3.4964710024786354e-06, "loss": 0.3831, "num_input_tokens_seen": 14207184, "step": 21625 }, { "epoch": 12.753537735849056, "grad_norm": 2.4917352199554443, "learning_rate": 3.4940175605639813e-06, "loss": 0.2344, "num_input_tokens_seen": 14210224, "step": 21630 }, { "epoch": 12.756485849056604, "grad_norm": 1.9161036014556885, "learning_rate": 3.4915645173646694e-06, "loss": 0.2995, "num_input_tokens_seen": 14214160, "step": 21635 }, { "epoch": 12.75943396226415, "grad_norm": 2.6958229541778564, "learning_rate": 3.489111873530153e-06, "loss": 0.4298, "num_input_tokens_seen": 14216496, "step": 21640 }, { "epoch": 12.762382075471699, "grad_norm": 3.415879011154175, "learning_rate": 3.4866596297097776e-06, "loss": 0.3363, "num_input_tokens_seen": 14219376, "step": 21645 }, { "epoch": 12.765330188679245, "grad_norm": 3.608565330505371, "learning_rate": 3.484207786552789e-06, "loss": 0.693, "num_input_tokens_seen": 14222992, "step": 21650 }, { "epoch": 12.768278301886792, "grad_norm": 2.854328155517578, "learning_rate": 3.4817563447083214e-06, "loss": 0.3505, "num_input_tokens_seen": 14226992, "step": 21655 }, { "epoch": 12.77122641509434, "grad_norm": 4.584987640380859, "learning_rate": 3.4793053048254044e-06, "loss": 0.3688, "num_input_tokens_seen": 14230224, "step": 21660 }, { "epoch": 12.774174528301886, "grad_norm": 2.1705517768859863, "learning_rate": 3.47685466755296e-06, "loss": 0.3855, "num_input_tokens_seen": 14233328, "step": 21665 }, { "epoch": 12.777122641509434, "grad_norm": 3.216411828994751, "learning_rate": 3.474404433539809e-06, "loss": 0.2317, "num_input_tokens_seen": 14236912, "step": 21670 }, { "epoch": 12.78007075471698, "grad_norm": 3.8199634552001953, "learning_rate": 3.4719546034346598e-06, "loss": 0.3348, "num_input_tokens_seen": 14240848, "step": 21675 }, { "epoch": 12.783018867924529, "grad_norm": 2.78244686126709, "learning_rate": 3.4695051778861125e-06, "loss": 0.2537, "num_input_tokens_seen": 14243472, "step": 21680 }, { "epoch": 12.785966981132075, "grad_norm": 3.5587940216064453, "learning_rate": 3.4670561575426677e-06, "loss": 0.4091, "num_input_tokens_seen": 14246192, "step": 21685 }, { "epoch": 12.788915094339622, "grad_norm": 4.127229690551758, "learning_rate": 3.4646075430527115e-06, "loss": 0.3044, "num_input_tokens_seen": 14249264, "step": 21690 }, { "epoch": 12.79186320754717, "grad_norm": 2.781796932220459, "learning_rate": 3.4621593350645236e-06, "loss": 0.3356, "num_input_tokens_seen": 14252048, "step": 21695 }, { "epoch": 12.794811320754716, "grad_norm": 2.3941917419433594, "learning_rate": 3.4597115342262817e-06, "loss": 0.2495, "num_input_tokens_seen": 14254832, "step": 21700 }, { "epoch": 12.797759433962264, "grad_norm": 2.8860390186309814, "learning_rate": 3.4572641411860484e-06, "loss": 0.3436, "num_input_tokens_seen": 14258448, "step": 21705 }, { "epoch": 12.80070754716981, "grad_norm": 3.626854419708252, "learning_rate": 3.454817156591782e-06, "loss": 0.4368, "num_input_tokens_seen": 14261936, "step": 21710 }, { "epoch": 12.803655660377359, "grad_norm": 4.182994842529297, "learning_rate": 3.4523705810913344e-06, "loss": 0.3177, "num_input_tokens_seen": 14264944, "step": 21715 }, { "epoch": 12.806603773584905, "grad_norm": 5.773361682891846, "learning_rate": 3.449924415332443e-06, "loss": 0.4047, "num_input_tokens_seen": 14268176, "step": 21720 }, { "epoch": 12.809551886792454, "grad_norm": 3.531745195388794, "learning_rate": 3.447478659962745e-06, "loss": 0.301, "num_input_tokens_seen": 14271536, "step": 21725 }, { "epoch": 12.8125, "grad_norm": 3.612093448638916, "learning_rate": 3.4450333156297625e-06, "loss": 0.4213, "num_input_tokens_seen": 14275312, "step": 21730 }, { "epoch": 12.815448113207546, "grad_norm": 2.3170018196105957, "learning_rate": 3.44258838298091e-06, "loss": 0.3105, "num_input_tokens_seen": 14278448, "step": 21735 }, { "epoch": 12.818396226415095, "grad_norm": 2.3123130798339844, "learning_rate": 3.440143862663497e-06, "loss": 0.3098, "num_input_tokens_seen": 14281424, "step": 21740 }, { "epoch": 12.821344339622641, "grad_norm": 3.8197519779205322, "learning_rate": 3.4376997553247183e-06, "loss": 0.4009, "num_input_tokens_seen": 14284944, "step": 21745 }, { "epoch": 12.82429245283019, "grad_norm": 2.0211920738220215, "learning_rate": 3.4352560616116617e-06, "loss": 0.3564, "num_input_tokens_seen": 14287312, "step": 21750 }, { "epoch": 12.827240566037736, "grad_norm": 2.7113542556762695, "learning_rate": 3.4328127821713077e-06, "loss": 0.281, "num_input_tokens_seen": 14291408, "step": 21755 }, { "epoch": 12.830188679245284, "grad_norm": 3.5049333572387695, "learning_rate": 3.430369917650521e-06, "loss": 0.2932, "num_input_tokens_seen": 14294512, "step": 21760 }, { "epoch": 12.83313679245283, "grad_norm": 3.838129758834839, "learning_rate": 3.427927468696066e-06, "loss": 0.2799, "num_input_tokens_seen": 14297744, "step": 21765 }, { "epoch": 12.836084905660378, "grad_norm": 3.6884241104125977, "learning_rate": 3.425485435954588e-06, "loss": 0.4314, "num_input_tokens_seen": 14301040, "step": 21770 }, { "epoch": 12.839033018867925, "grad_norm": 1.765413761138916, "learning_rate": 3.4230438200726274e-06, "loss": 0.4438, "num_input_tokens_seen": 14304304, "step": 21775 }, { "epoch": 12.841981132075471, "grad_norm": 2.512037754058838, "learning_rate": 3.4206026216966113e-06, "loss": 0.531, "num_input_tokens_seen": 14307408, "step": 21780 }, { "epoch": 12.84492924528302, "grad_norm": 3.9298622608184814, "learning_rate": 3.418161841472858e-06, "loss": 0.3791, "num_input_tokens_seen": 14310352, "step": 21785 }, { "epoch": 12.847877358490566, "grad_norm": 4.288475513458252, "learning_rate": 3.4157214800475746e-06, "loss": 0.428, "num_input_tokens_seen": 14314928, "step": 21790 }, { "epoch": 12.850825471698114, "grad_norm": 2.452195167541504, "learning_rate": 3.4132815380668577e-06, "loss": 0.4315, "num_input_tokens_seen": 14317616, "step": 21795 }, { "epoch": 12.85377358490566, "grad_norm": 2.3427016735076904, "learning_rate": 3.410842016176691e-06, "loss": 0.3519, "num_input_tokens_seen": 14320720, "step": 21800 }, { "epoch": 12.856721698113208, "grad_norm": 2.2547314167022705, "learning_rate": 3.4084029150229503e-06, "loss": 0.3458, "num_input_tokens_seen": 14323536, "step": 21805 }, { "epoch": 12.859669811320755, "grad_norm": 7.099574089050293, "learning_rate": 3.4059642352513965e-06, "loss": 0.3423, "num_input_tokens_seen": 14326800, "step": 21810 }, { "epoch": 12.862617924528301, "grad_norm": 2.2386021614074707, "learning_rate": 3.4035259775076813e-06, "loss": 0.2849, "num_input_tokens_seen": 14329360, "step": 21815 }, { "epoch": 12.86556603773585, "grad_norm": 4.24003791809082, "learning_rate": 3.401088142437344e-06, "loss": 0.3168, "num_input_tokens_seen": 14332304, "step": 21820 }, { "epoch": 12.868514150943396, "grad_norm": 3.1786389350891113, "learning_rate": 3.398650730685813e-06, "loss": 0.4, "num_input_tokens_seen": 14335024, "step": 21825 }, { "epoch": 12.871462264150944, "grad_norm": 2.043787717819214, "learning_rate": 3.396213742898401e-06, "loss": 0.3097, "num_input_tokens_seen": 14340880, "step": 21830 }, { "epoch": 12.87441037735849, "grad_norm": 2.950995922088623, "learning_rate": 3.3937771797203134e-06, "loss": 0.3242, "num_input_tokens_seen": 14343344, "step": 21835 }, { "epoch": 12.877358490566039, "grad_norm": 2.0895349979400635, "learning_rate": 3.391341041796641e-06, "loss": 0.3086, "num_input_tokens_seen": 14346192, "step": 21840 }, { "epoch": 12.880306603773585, "grad_norm": 3.7014267444610596, "learning_rate": 3.3889053297723585e-06, "loss": 0.2821, "num_input_tokens_seen": 14349712, "step": 21845 }, { "epoch": 12.883254716981131, "grad_norm": 4.505474090576172, "learning_rate": 3.3864700442923342e-06, "loss": 0.4585, "num_input_tokens_seen": 14352496, "step": 21850 }, { "epoch": 12.88620283018868, "grad_norm": 3.2988080978393555, "learning_rate": 3.384035186001318e-06, "loss": 0.3763, "num_input_tokens_seen": 14355376, "step": 21855 }, { "epoch": 12.889150943396226, "grad_norm": 1.9598214626312256, "learning_rate": 3.381600755543953e-06, "loss": 0.3324, "num_input_tokens_seen": 14359760, "step": 21860 }, { "epoch": 12.892099056603774, "grad_norm": 1.599167823791504, "learning_rate": 3.3791667535647615e-06, "loss": 0.3069, "num_input_tokens_seen": 14363440, "step": 21865 }, { "epoch": 12.89504716981132, "grad_norm": 1.9964741468429565, "learning_rate": 3.3767331807081584e-06, "loss": 0.3688, "num_input_tokens_seen": 14366352, "step": 21870 }, { "epoch": 12.897995283018869, "grad_norm": 2.2842860221862793, "learning_rate": 3.374300037618442e-06, "loss": 0.3287, "num_input_tokens_seen": 14369840, "step": 21875 }, { "epoch": 12.900943396226415, "grad_norm": 2.263439416885376, "learning_rate": 3.371867324939796e-06, "loss": 0.3553, "num_input_tokens_seen": 14373616, "step": 21880 }, { "epoch": 12.903891509433961, "grad_norm": 2.798953056335449, "learning_rate": 3.369435043316293e-06, "loss": 0.4206, "num_input_tokens_seen": 14377040, "step": 21885 }, { "epoch": 12.90683962264151, "grad_norm": 4.337742805480957, "learning_rate": 3.36700319339189e-06, "loss": 0.2749, "num_input_tokens_seen": 14380144, "step": 21890 }, { "epoch": 12.909787735849056, "grad_norm": 2.9336705207824707, "learning_rate": 3.3645717758104286e-06, "loss": 0.3085, "num_input_tokens_seen": 14382864, "step": 21895 }, { "epoch": 12.912735849056604, "grad_norm": 2.9670286178588867, "learning_rate": 3.3621407912156383e-06, "loss": 0.2661, "num_input_tokens_seen": 14385808, "step": 21900 }, { "epoch": 12.91568396226415, "grad_norm": 3.2300374507904053, "learning_rate": 3.3597102402511326e-06, "loss": 0.4625, "num_input_tokens_seen": 14389808, "step": 21905 }, { "epoch": 12.918632075471699, "grad_norm": 2.5134804248809814, "learning_rate": 3.3572801235604093e-06, "loss": 0.2417, "num_input_tokens_seen": 14392176, "step": 21910 }, { "epoch": 12.921580188679245, "grad_norm": 3.851245403289795, "learning_rate": 3.3548504417868538e-06, "loss": 0.3004, "num_input_tokens_seen": 14394992, "step": 21915 }, { "epoch": 12.924528301886792, "grad_norm": 2.8740596771240234, "learning_rate": 3.352421195573734e-06, "loss": 0.422, "num_input_tokens_seen": 14397936, "step": 21920 }, { "epoch": 12.92747641509434, "grad_norm": 4.334013938903809, "learning_rate": 3.3499923855642026e-06, "loss": 0.3368, "num_input_tokens_seen": 14401488, "step": 21925 }, { "epoch": 12.930424528301886, "grad_norm": 2.8779194355010986, "learning_rate": 3.3475640124012986e-06, "loss": 0.4084, "num_input_tokens_seen": 14404400, "step": 21930 }, { "epoch": 12.933372641509434, "grad_norm": 2.510408401489258, "learning_rate": 3.345136076727945e-06, "loss": 0.356, "num_input_tokens_seen": 14406640, "step": 21935 }, { "epoch": 12.93632075471698, "grad_norm": 1.8383452892303467, "learning_rate": 3.3427085791869453e-06, "loss": 0.3313, "num_input_tokens_seen": 14409584, "step": 21940 }, { "epoch": 12.939268867924529, "grad_norm": 2.53867244720459, "learning_rate": 3.3402815204209926e-06, "loss": 0.3565, "num_input_tokens_seen": 14415600, "step": 21945 }, { "epoch": 12.942216981132075, "grad_norm": 5.843546390533447, "learning_rate": 3.337854901072659e-06, "loss": 0.3986, "num_input_tokens_seen": 14418544, "step": 21950 }, { "epoch": 12.945165094339622, "grad_norm": 3.746781349182129, "learning_rate": 3.3354287217844056e-06, "loss": 0.3021, "num_input_tokens_seen": 14421584, "step": 21955 }, { "epoch": 12.94811320754717, "grad_norm": 2.529804229736328, "learning_rate": 3.3330029831985712e-06, "loss": 0.3024, "num_input_tokens_seen": 14424400, "step": 21960 }, { "epoch": 12.951061320754716, "grad_norm": 4.491691589355469, "learning_rate": 3.330577685957382e-06, "loss": 0.3254, "num_input_tokens_seen": 14428048, "step": 21965 }, { "epoch": 12.954009433962264, "grad_norm": 3.9130489826202393, "learning_rate": 3.3281528307029454e-06, "loss": 0.3729, "num_input_tokens_seen": 14430960, "step": 21970 }, { "epoch": 12.95695754716981, "grad_norm": 3.6740026473999023, "learning_rate": 3.325728418077251e-06, "loss": 0.4021, "num_input_tokens_seen": 14433840, "step": 21975 }, { "epoch": 12.959905660377359, "grad_norm": 2.877146005630493, "learning_rate": 3.3233044487221744e-06, "loss": 0.436, "num_input_tokens_seen": 14436752, "step": 21980 }, { "epoch": 12.962853773584905, "grad_norm": 4.3798112869262695, "learning_rate": 3.3208809232794715e-06, "loss": 0.341, "num_input_tokens_seen": 14439312, "step": 21985 }, { "epoch": 12.965801886792454, "grad_norm": 2.9188177585601807, "learning_rate": 3.3184578423907797e-06, "loss": 0.3978, "num_input_tokens_seen": 14442000, "step": 21990 }, { "epoch": 12.96875, "grad_norm": 3.5581209659576416, "learning_rate": 3.3160352066976224e-06, "loss": 0.3107, "num_input_tokens_seen": 14445456, "step": 21995 }, { "epoch": 12.971698113207546, "grad_norm": 4.393881797790527, "learning_rate": 3.3136130168414003e-06, "loss": 0.4598, "num_input_tokens_seen": 14448848, "step": 22000 }, { "epoch": 12.974646226415095, "grad_norm": 2.6945858001708984, "learning_rate": 3.311191273463401e-06, "loss": 0.3239, "num_input_tokens_seen": 14452176, "step": 22005 }, { "epoch": 12.977594339622641, "grad_norm": 6.574132919311523, "learning_rate": 3.3087699772047908e-06, "loss": 0.38, "num_input_tokens_seen": 14455696, "step": 22010 }, { "epoch": 12.98054245283019, "grad_norm": 6.1678466796875, "learning_rate": 3.3063491287066164e-06, "loss": 0.3666, "num_input_tokens_seen": 14458832, "step": 22015 }, { "epoch": 12.983490566037736, "grad_norm": 2.803983211517334, "learning_rate": 3.303928728609811e-06, "loss": 0.441, "num_input_tokens_seen": 14461680, "step": 22020 }, { "epoch": 12.986438679245284, "grad_norm": 2.6968190670013428, "learning_rate": 3.3015087775551835e-06, "loss": 0.3593, "num_input_tokens_seen": 14464912, "step": 22025 }, { "epoch": 12.98938679245283, "grad_norm": 2.2933311462402344, "learning_rate": 3.299089276183427e-06, "loss": 0.3625, "num_input_tokens_seen": 14468016, "step": 22030 }, { "epoch": 12.992334905660378, "grad_norm": 2.4128382205963135, "learning_rate": 3.2966702251351157e-06, "loss": 0.2945, "num_input_tokens_seen": 14471408, "step": 22035 }, { "epoch": 12.995283018867925, "grad_norm": 4.7380452156066895, "learning_rate": 3.2942516250507035e-06, "loss": 0.2602, "num_input_tokens_seen": 14474288, "step": 22040 }, { "epoch": 12.998231132075471, "grad_norm": 3.4399330615997314, "learning_rate": 3.2918334765705227e-06, "loss": 0.4541, "num_input_tokens_seen": 14477328, "step": 22045 }, { "epoch": 13.00117924528302, "grad_norm": 2.4032914638519287, "learning_rate": 3.289415780334792e-06, "loss": 0.3834, "num_input_tokens_seen": 14479688, "step": 22050 }, { "epoch": 13.004127358490566, "grad_norm": 2.7762539386749268, "learning_rate": 3.2869985369836067e-06, "loss": 0.376, "num_input_tokens_seen": 14483432, "step": 22055 }, { "epoch": 13.007075471698114, "grad_norm": 2.872123956680298, "learning_rate": 3.2845817471569406e-06, "loss": 0.4088, "num_input_tokens_seen": 14486696, "step": 22060 }, { "epoch": 13.01002358490566, "grad_norm": 2.2066516876220703, "learning_rate": 3.28216541149465e-06, "loss": 0.2748, "num_input_tokens_seen": 14489320, "step": 22065 }, { "epoch": 13.012971698113208, "grad_norm": 3.0379676818847656, "learning_rate": 3.2797495306364707e-06, "loss": 0.387, "num_input_tokens_seen": 14492520, "step": 22070 }, { "epoch": 13.015919811320755, "grad_norm": 4.183723449707031, "learning_rate": 3.2773341052220174e-06, "loss": 0.3297, "num_input_tokens_seen": 14495944, "step": 22075 }, { "epoch": 13.018867924528301, "grad_norm": 5.458145618438721, "learning_rate": 3.274919135890783e-06, "loss": 0.3541, "num_input_tokens_seen": 14498760, "step": 22080 }, { "epoch": 13.02181603773585, "grad_norm": 2.488776445388794, "learning_rate": 3.2725046232821424e-06, "loss": 0.3957, "num_input_tokens_seen": 14501992, "step": 22085 }, { "epoch": 13.024764150943396, "grad_norm": 4.948331356048584, "learning_rate": 3.270090568035348e-06, "loss": 0.3818, "num_input_tokens_seen": 14505192, "step": 22090 }, { "epoch": 13.027712264150944, "grad_norm": 2.3038229942321777, "learning_rate": 3.2676769707895306e-06, "loss": 0.3182, "num_input_tokens_seen": 14508968, "step": 22095 }, { "epoch": 13.03066037735849, "grad_norm": 2.8669888973236084, "learning_rate": 3.2652638321837015e-06, "loss": 0.3081, "num_input_tokens_seen": 14513768, "step": 22100 }, { "epoch": 13.033608490566039, "grad_norm": 2.1250665187835693, "learning_rate": 3.2628511528567497e-06, "loss": 0.3278, "num_input_tokens_seen": 14517352, "step": 22105 }, { "epoch": 13.036556603773585, "grad_norm": 4.02549409866333, "learning_rate": 3.2604389334474407e-06, "loss": 0.2726, "num_input_tokens_seen": 14519528, "step": 22110 }, { "epoch": 13.039504716981131, "grad_norm": 2.347276449203491, "learning_rate": 3.2580271745944224e-06, "loss": 0.3117, "num_input_tokens_seen": 14523176, "step": 22115 }, { "epoch": 13.04245283018868, "grad_norm": 3.0129709243774414, "learning_rate": 3.255615876936217e-06, "loss": 0.3903, "num_input_tokens_seen": 14525800, "step": 22120 }, { "epoch": 13.045400943396226, "grad_norm": 3.156278371810913, "learning_rate": 3.2532050411112248e-06, "loss": 0.3172, "num_input_tokens_seen": 14528648, "step": 22125 }, { "epoch": 13.048349056603774, "grad_norm": 2.816683292388916, "learning_rate": 3.2507946677577274e-06, "loss": 0.358, "num_input_tokens_seen": 14531432, "step": 22130 }, { "epoch": 13.05129716981132, "grad_norm": 3.1873650550842285, "learning_rate": 3.2483847575138807e-06, "loss": 0.4371, "num_input_tokens_seen": 14535240, "step": 22135 }, { "epoch": 13.054245283018869, "grad_norm": 2.903531789779663, "learning_rate": 3.245975311017716e-06, "loss": 0.4108, "num_input_tokens_seen": 14538216, "step": 22140 }, { "epoch": 13.057193396226415, "grad_norm": 1.7930574417114258, "learning_rate": 3.2435663289071486e-06, "loss": 0.2297, "num_input_tokens_seen": 14542120, "step": 22145 }, { "epoch": 13.060141509433961, "grad_norm": 3.465599536895752, "learning_rate": 3.241157811819966e-06, "loss": 0.4162, "num_input_tokens_seen": 14545064, "step": 22150 }, { "epoch": 13.06308962264151, "grad_norm": 5.056347370147705, "learning_rate": 3.2387497603938327e-06, "loss": 0.5611, "num_input_tokens_seen": 14548520, "step": 22155 }, { "epoch": 13.066037735849056, "grad_norm": 2.3430562019348145, "learning_rate": 3.2363421752662903e-06, "loss": 0.34, "num_input_tokens_seen": 14551720, "step": 22160 }, { "epoch": 13.068985849056604, "grad_norm": 5.425186634063721, "learning_rate": 3.233935057074759e-06, "loss": 0.3604, "num_input_tokens_seen": 14555208, "step": 22165 }, { "epoch": 13.07193396226415, "grad_norm": 4.441516876220703, "learning_rate": 3.2315284064565324e-06, "loss": 0.4237, "num_input_tokens_seen": 14558408, "step": 22170 }, { "epoch": 13.074882075471699, "grad_norm": 3.429607391357422, "learning_rate": 3.2291222240487813e-06, "loss": 0.3235, "num_input_tokens_seen": 14562440, "step": 22175 }, { "epoch": 13.077830188679245, "grad_norm": 4.097338676452637, "learning_rate": 3.226716510488554e-06, "loss": 0.2821, "num_input_tokens_seen": 14567208, "step": 22180 }, { "epoch": 13.080778301886792, "grad_norm": 1.786116361618042, "learning_rate": 3.224311266412773e-06, "loss": 0.2857, "num_input_tokens_seen": 14570568, "step": 22185 }, { "epoch": 13.08372641509434, "grad_norm": 3.3701767921447754, "learning_rate": 3.2219064924582366e-06, "loss": 0.3836, "num_input_tokens_seen": 14573416, "step": 22190 }, { "epoch": 13.086674528301886, "grad_norm": 2.124253034591675, "learning_rate": 3.2195021892616197e-06, "loss": 0.2834, "num_input_tokens_seen": 14576744, "step": 22195 }, { "epoch": 13.089622641509434, "grad_norm": 3.2051033973693848, "learning_rate": 3.217098357459472e-06, "loss": 0.497, "num_input_tokens_seen": 14581096, "step": 22200 }, { "epoch": 13.09257075471698, "grad_norm": 2.3806819915771484, "learning_rate": 3.214694997688217e-06, "loss": 0.3562, "num_input_tokens_seen": 14583912, "step": 22205 }, { "epoch": 13.095518867924529, "grad_norm": 3.2602415084838867, "learning_rate": 3.2122921105841572e-06, "loss": 0.3873, "num_input_tokens_seen": 14588104, "step": 22210 }, { "epoch": 13.098466981132075, "grad_norm": 3.6457154750823975, "learning_rate": 3.2098896967834647e-06, "loss": 0.2947, "num_input_tokens_seen": 14591336, "step": 22215 }, { "epoch": 13.101415094339623, "grad_norm": 2.5983402729034424, "learning_rate": 3.2074877569221896e-06, "loss": 0.3857, "num_input_tokens_seen": 14594120, "step": 22220 }, { "epoch": 13.10436320754717, "grad_norm": 2.1097264289855957, "learning_rate": 3.205086291636257e-06, "loss": 0.2746, "num_input_tokens_seen": 14596840, "step": 22225 }, { "epoch": 13.107311320754716, "grad_norm": 3.49792218208313, "learning_rate": 3.202685301561463e-06, "loss": 0.2878, "num_input_tokens_seen": 14600104, "step": 22230 }, { "epoch": 13.110259433962264, "grad_norm": 2.305542469024658, "learning_rate": 3.200284787333482e-06, "loss": 0.2772, "num_input_tokens_seen": 14603176, "step": 22235 }, { "epoch": 13.11320754716981, "grad_norm": 3.233210325241089, "learning_rate": 3.1978847495878595e-06, "loss": 0.3935, "num_input_tokens_seen": 14606504, "step": 22240 }, { "epoch": 13.116155660377359, "grad_norm": 2.590954065322876, "learning_rate": 3.1954851889600176e-06, "loss": 0.3269, "num_input_tokens_seen": 14609576, "step": 22245 }, { "epoch": 13.119103773584905, "grad_norm": 2.2277817726135254, "learning_rate": 3.1930861060852485e-06, "loss": 0.4901, "num_input_tokens_seen": 14613800, "step": 22250 }, { "epoch": 13.122051886792454, "grad_norm": 2.1496267318725586, "learning_rate": 3.1906875015987194e-06, "loss": 0.2166, "num_input_tokens_seen": 14616872, "step": 22255 }, { "epoch": 13.125, "grad_norm": 4.401931285858154, "learning_rate": 3.188289376135473e-06, "loss": 0.4302, "num_input_tokens_seen": 14620488, "step": 22260 }, { "epoch": 13.127948113207546, "grad_norm": 2.140324115753174, "learning_rate": 3.1858917303304213e-06, "loss": 0.4019, "num_input_tokens_seen": 14624072, "step": 22265 }, { "epoch": 13.130896226415095, "grad_norm": 3.55090069770813, "learning_rate": 3.1834945648183535e-06, "loss": 0.4333, "num_input_tokens_seen": 14628040, "step": 22270 }, { "epoch": 13.133844339622641, "grad_norm": 5.214928150177002, "learning_rate": 3.1810978802339283e-06, "loss": 0.39, "num_input_tokens_seen": 14630760, "step": 22275 }, { "epoch": 13.13679245283019, "grad_norm": 2.013343572616577, "learning_rate": 3.1787016772116767e-06, "loss": 0.3927, "num_input_tokens_seen": 14634728, "step": 22280 }, { "epoch": 13.139740566037736, "grad_norm": 2.304565906524658, "learning_rate": 3.1763059563860073e-06, "loss": 0.2323, "num_input_tokens_seen": 14637576, "step": 22285 }, { "epoch": 13.142688679245284, "grad_norm": 4.235412120819092, "learning_rate": 3.1739107183911953e-06, "loss": 0.331, "num_input_tokens_seen": 14640680, "step": 22290 }, { "epoch": 13.14563679245283, "grad_norm": 3.5375795364379883, "learning_rate": 3.1715159638613898e-06, "loss": 0.2491, "num_input_tokens_seen": 14644200, "step": 22295 }, { "epoch": 13.148584905660377, "grad_norm": 2.610440731048584, "learning_rate": 3.1691216934306134e-06, "loss": 0.3396, "num_input_tokens_seen": 14648040, "step": 22300 }, { "epoch": 13.151533018867925, "grad_norm": 2.4899816513061523, "learning_rate": 3.16672790773276e-06, "loss": 0.3189, "num_input_tokens_seen": 14651048, "step": 22305 }, { "epoch": 13.154481132075471, "grad_norm": 4.037322998046875, "learning_rate": 3.164334607401593e-06, "loss": 0.263, "num_input_tokens_seen": 14654248, "step": 22310 }, { "epoch": 13.15742924528302, "grad_norm": 2.5703766345977783, "learning_rate": 3.1619417930707506e-06, "loss": 0.297, "num_input_tokens_seen": 14657128, "step": 22315 }, { "epoch": 13.160377358490566, "grad_norm": 2.3502328395843506, "learning_rate": 3.1595494653737408e-06, "loss": 0.3196, "num_input_tokens_seen": 14660136, "step": 22320 }, { "epoch": 13.163325471698114, "grad_norm": 2.2417426109313965, "learning_rate": 3.1571576249439408e-06, "loss": 0.3785, "num_input_tokens_seen": 14663496, "step": 22325 }, { "epoch": 13.16627358490566, "grad_norm": 3.4074509143829346, "learning_rate": 3.1547662724146e-06, "loss": 0.2629, "num_input_tokens_seen": 14666696, "step": 22330 }, { "epoch": 13.169221698113208, "grad_norm": 6.548097133636475, "learning_rate": 3.1523754084188436e-06, "loss": 0.2808, "num_input_tokens_seen": 14669096, "step": 22335 }, { "epoch": 13.172169811320755, "grad_norm": 4.4342241287231445, "learning_rate": 3.149985033589661e-06, "loss": 0.3373, "num_input_tokens_seen": 14671944, "step": 22340 }, { "epoch": 13.175117924528301, "grad_norm": 4.560307025909424, "learning_rate": 3.147595148559912e-06, "loss": 0.5109, "num_input_tokens_seen": 14675112, "step": 22345 }, { "epoch": 13.17806603773585, "grad_norm": 7.70598030090332, "learning_rate": 3.1452057539623328e-06, "loss": 0.4113, "num_input_tokens_seen": 14678152, "step": 22350 }, { "epoch": 13.181014150943396, "grad_norm": 3.10040545463562, "learning_rate": 3.142816850429523e-06, "loss": 0.5382, "num_input_tokens_seen": 14681544, "step": 22355 }, { "epoch": 13.183962264150944, "grad_norm": 2.0855348110198975, "learning_rate": 3.1404284385939552e-06, "loss": 0.3148, "num_input_tokens_seen": 14684616, "step": 22360 }, { "epoch": 13.18691037735849, "grad_norm": 4.075204372406006, "learning_rate": 3.138040519087975e-06, "loss": 0.3234, "num_input_tokens_seen": 14687976, "step": 22365 }, { "epoch": 13.189858490566039, "grad_norm": 3.951671838760376, "learning_rate": 3.13565309254379e-06, "loss": 0.3832, "num_input_tokens_seen": 14691560, "step": 22370 }, { "epoch": 13.192806603773585, "grad_norm": 2.664435386657715, "learning_rate": 3.1332661595934845e-06, "loss": 0.3445, "num_input_tokens_seen": 14694536, "step": 22375 }, { "epoch": 13.195754716981131, "grad_norm": 2.769216299057007, "learning_rate": 3.130879720869008e-06, "loss": 0.3144, "num_input_tokens_seen": 14697128, "step": 22380 }, { "epoch": 13.19870283018868, "grad_norm": 2.244934320449829, "learning_rate": 3.1284937770021815e-06, "loss": 0.337, "num_input_tokens_seen": 14700104, "step": 22385 }, { "epoch": 13.201650943396226, "grad_norm": 2.254823923110962, "learning_rate": 3.1261083286246916e-06, "loss": 0.3193, "num_input_tokens_seen": 14702888, "step": 22390 }, { "epoch": 13.204599056603774, "grad_norm": 3.008026599884033, "learning_rate": 3.1237233763680997e-06, "loss": 0.4121, "num_input_tokens_seen": 14706472, "step": 22395 }, { "epoch": 13.20754716981132, "grad_norm": 3.8319954872131348, "learning_rate": 3.1213389208638303e-06, "loss": 0.4456, "num_input_tokens_seen": 14709160, "step": 22400 }, { "epoch": 13.210495283018869, "grad_norm": 4.431209564208984, "learning_rate": 3.1189549627431757e-06, "loss": 0.3323, "num_input_tokens_seen": 14711688, "step": 22405 }, { "epoch": 13.213443396226415, "grad_norm": 4.083348274230957, "learning_rate": 3.116571502637304e-06, "loss": 0.3836, "num_input_tokens_seen": 14714568, "step": 22410 }, { "epoch": 13.216391509433961, "grad_norm": 4.798980236053467, "learning_rate": 3.1141885411772434e-06, "loss": 0.3886, "num_input_tokens_seen": 14717032, "step": 22415 }, { "epoch": 13.21933962264151, "grad_norm": 4.309057235717773, "learning_rate": 3.111806078993893e-06, "loss": 0.3085, "num_input_tokens_seen": 14719752, "step": 22420 }, { "epoch": 13.222287735849056, "grad_norm": 4.137636661529541, "learning_rate": 3.1094241167180223e-06, "loss": 0.4134, "num_input_tokens_seen": 14722664, "step": 22425 }, { "epoch": 13.225235849056604, "grad_norm": 2.298349618911743, "learning_rate": 3.1070426549802623e-06, "loss": 0.328, "num_input_tokens_seen": 14725128, "step": 22430 }, { "epoch": 13.22818396226415, "grad_norm": 3.67136287689209, "learning_rate": 3.1046616944111196e-06, "loss": 0.288, "num_input_tokens_seen": 14728712, "step": 22435 }, { "epoch": 13.231132075471699, "grad_norm": 3.260784387588501, "learning_rate": 3.1022812356409606e-06, "loss": 0.2991, "num_input_tokens_seen": 14731368, "step": 22440 }, { "epoch": 13.234080188679245, "grad_norm": 2.9602959156036377, "learning_rate": 3.0999012793000244e-06, "loss": 0.4822, "num_input_tokens_seen": 14733800, "step": 22445 }, { "epoch": 13.237028301886792, "grad_norm": 4.1569671630859375, "learning_rate": 3.097521826018414e-06, "loss": 0.3601, "num_input_tokens_seen": 14736712, "step": 22450 }, { "epoch": 13.23997641509434, "grad_norm": 3.239003896713257, "learning_rate": 3.0951428764260973e-06, "loss": 0.3588, "num_input_tokens_seen": 14739528, "step": 22455 }, { "epoch": 13.242924528301886, "grad_norm": 2.38771915435791, "learning_rate": 3.092764431152915e-06, "loss": 0.2274, "num_input_tokens_seen": 14743208, "step": 22460 }, { "epoch": 13.245872641509434, "grad_norm": 3.771160125732422, "learning_rate": 3.0903864908285693e-06, "loss": 0.4963, "num_input_tokens_seen": 14747688, "step": 22465 }, { "epoch": 13.24882075471698, "grad_norm": 3.0501787662506104, "learning_rate": 3.088009056082629e-06, "loss": 0.3459, "num_input_tokens_seen": 14751208, "step": 22470 }, { "epoch": 13.251768867924529, "grad_norm": 2.729999303817749, "learning_rate": 3.0856321275445324e-06, "loss": 0.2382, "num_input_tokens_seen": 14755560, "step": 22475 }, { "epoch": 13.254716981132075, "grad_norm": 3.4357731342315674, "learning_rate": 3.0832557058435808e-06, "loss": 0.2715, "num_input_tokens_seen": 14758632, "step": 22480 }, { "epoch": 13.257665094339623, "grad_norm": 2.38533091545105, "learning_rate": 3.0808797916089405e-06, "loss": 0.351, "num_input_tokens_seen": 14761576, "step": 22485 }, { "epoch": 13.26061320754717, "grad_norm": 2.8444526195526123, "learning_rate": 3.078504385469647e-06, "loss": 0.3463, "num_input_tokens_seen": 14765384, "step": 22490 }, { "epoch": 13.263561320754716, "grad_norm": 2.1010334491729736, "learning_rate": 3.076129488054599e-06, "loss": 0.2572, "num_input_tokens_seen": 14768808, "step": 22495 }, { "epoch": 13.266509433962264, "grad_norm": 5.500343322753906, "learning_rate": 3.0737550999925604e-06, "loss": 0.2861, "num_input_tokens_seen": 14771720, "step": 22500 }, { "epoch": 13.26945754716981, "grad_norm": 2.4808669090270996, "learning_rate": 3.0713812219121604e-06, "loss": 0.4006, "num_input_tokens_seen": 14774824, "step": 22505 }, { "epoch": 13.272405660377359, "grad_norm": 5.361450672149658, "learning_rate": 3.0690078544418934e-06, "loss": 0.4935, "num_input_tokens_seen": 14777576, "step": 22510 }, { "epoch": 13.275353773584905, "grad_norm": 2.5125045776367188, "learning_rate": 3.0666349982101198e-06, "loss": 0.2482, "num_input_tokens_seen": 14781480, "step": 22515 }, { "epoch": 13.278301886792454, "grad_norm": 2.961906909942627, "learning_rate": 3.0642626538450627e-06, "loss": 0.4102, "num_input_tokens_seen": 14784296, "step": 22520 }, { "epoch": 13.28125, "grad_norm": 4.705463886260986, "learning_rate": 3.061890821974809e-06, "loss": 0.374, "num_input_tokens_seen": 14787240, "step": 22525 }, { "epoch": 13.284198113207546, "grad_norm": 3.211750030517578, "learning_rate": 3.059519503227313e-06, "loss": 0.3056, "num_input_tokens_seen": 14790632, "step": 22530 }, { "epoch": 13.287146226415095, "grad_norm": 3.9440529346466064, "learning_rate": 3.057148698230393e-06, "loss": 0.3175, "num_input_tokens_seen": 14793000, "step": 22535 }, { "epoch": 13.290094339622641, "grad_norm": 4.601212024688721, "learning_rate": 3.0547784076117294e-06, "loss": 0.3541, "num_input_tokens_seen": 14795912, "step": 22540 }, { "epoch": 13.29304245283019, "grad_norm": 2.610868453979492, "learning_rate": 3.0524086319988635e-06, "loss": 0.4102, "num_input_tokens_seen": 14802728, "step": 22545 }, { "epoch": 13.295990566037736, "grad_norm": 3.840474843978882, "learning_rate": 3.0500393720192074e-06, "loss": 0.3922, "num_input_tokens_seen": 14805480, "step": 22550 }, { "epoch": 13.298938679245284, "grad_norm": 3.9872570037841797, "learning_rate": 3.047670628300031e-06, "loss": 0.3195, "num_input_tokens_seen": 14808360, "step": 22555 }, { "epoch": 13.30188679245283, "grad_norm": 3.0667850971221924, "learning_rate": 3.0453024014684694e-06, "loss": 0.3655, "num_input_tokens_seen": 14810984, "step": 22560 }, { "epoch": 13.304834905660377, "grad_norm": 3.57549786567688, "learning_rate": 3.0429346921515225e-06, "loss": 0.322, "num_input_tokens_seen": 14813960, "step": 22565 }, { "epoch": 13.307783018867925, "grad_norm": 3.621351480484009, "learning_rate": 3.04056750097605e-06, "loss": 0.4253, "num_input_tokens_seen": 14818280, "step": 22570 }, { "epoch": 13.310731132075471, "grad_norm": 1.7438747882843018, "learning_rate": 3.0382008285687754e-06, "loss": 0.3727, "num_input_tokens_seen": 14822248, "step": 22575 }, { "epoch": 13.31367924528302, "grad_norm": 2.4748666286468506, "learning_rate": 3.035834675556287e-06, "loss": 0.295, "num_input_tokens_seen": 14825960, "step": 22580 }, { "epoch": 13.316627358490566, "grad_norm": 3.969114065170288, "learning_rate": 3.0334690425650336e-06, "loss": 0.4403, "num_input_tokens_seen": 14829160, "step": 22585 }, { "epoch": 13.319575471698114, "grad_norm": 2.7561588287353516, "learning_rate": 3.031103930221325e-06, "loss": 0.3204, "num_input_tokens_seen": 14832392, "step": 22590 }, { "epoch": 13.32252358490566, "grad_norm": 3.1612014770507812, "learning_rate": 3.028739339151338e-06, "loss": 0.4406, "num_input_tokens_seen": 14835624, "step": 22595 }, { "epoch": 13.325471698113208, "grad_norm": 3.7511119842529297, "learning_rate": 3.0263752699811067e-06, "loss": 0.309, "num_input_tokens_seen": 14837768, "step": 22600 }, { "epoch": 13.328419811320755, "grad_norm": 2.6637184619903564, "learning_rate": 3.0240117233365267e-06, "loss": 0.3286, "num_input_tokens_seen": 14840328, "step": 22605 }, { "epoch": 13.331367924528301, "grad_norm": 5.083560466766357, "learning_rate": 3.0216486998433604e-06, "loss": 0.4143, "num_input_tokens_seen": 14842856, "step": 22610 }, { "epoch": 13.33431603773585, "grad_norm": 8.626235961914062, "learning_rate": 3.0192862001272273e-06, "loss": 0.4183, "num_input_tokens_seen": 14846056, "step": 22615 }, { "epoch": 13.337264150943396, "grad_norm": 5.860737323760986, "learning_rate": 3.0169242248136066e-06, "loss": 0.4117, "num_input_tokens_seen": 14849288, "step": 22620 }, { "epoch": 13.340212264150944, "grad_norm": 2.941923141479492, "learning_rate": 3.0145627745278457e-06, "loss": 0.3846, "num_input_tokens_seen": 14852264, "step": 22625 }, { "epoch": 13.34316037735849, "grad_norm": 2.100914239883423, "learning_rate": 3.0122018498951478e-06, "loss": 0.3538, "num_input_tokens_seen": 14854312, "step": 22630 }, { "epoch": 13.346108490566039, "grad_norm": 3.4114303588867188, "learning_rate": 3.0098414515405765e-06, "loss": 0.5209, "num_input_tokens_seen": 14857224, "step": 22635 }, { "epoch": 13.349056603773585, "grad_norm": 5.562315464019775, "learning_rate": 3.0074815800890576e-06, "loss": 0.4218, "num_input_tokens_seen": 14859560, "step": 22640 }, { "epoch": 13.352004716981131, "grad_norm": 4.658580303192139, "learning_rate": 3.005122236165378e-06, "loss": 0.3817, "num_input_tokens_seen": 14862888, "step": 22645 }, { "epoch": 13.35495283018868, "grad_norm": 2.255444049835205, "learning_rate": 3.0027634203941847e-06, "loss": 0.1862, "num_input_tokens_seen": 14866152, "step": 22650 }, { "epoch": 13.357900943396226, "grad_norm": 3.416393280029297, "learning_rate": 3.0004051333999816e-06, "loss": 0.319, "num_input_tokens_seen": 14868616, "step": 22655 }, { "epoch": 13.360849056603774, "grad_norm": 4.4062933921813965, "learning_rate": 2.998047375807139e-06, "loss": 0.3342, "num_input_tokens_seen": 14871304, "step": 22660 }, { "epoch": 13.36379716981132, "grad_norm": 4.575821399688721, "learning_rate": 2.995690148239881e-06, "loss": 0.4598, "num_input_tokens_seen": 14873768, "step": 22665 }, { "epoch": 13.366745283018869, "grad_norm": 2.2836992740631104, "learning_rate": 2.993333451322293e-06, "loss": 0.3214, "num_input_tokens_seen": 14877064, "step": 22670 }, { "epoch": 13.369693396226415, "grad_norm": 2.32738995552063, "learning_rate": 2.9909772856783242e-06, "loss": 0.3181, "num_input_tokens_seen": 14880808, "step": 22675 }, { "epoch": 13.372641509433961, "grad_norm": 4.284757614135742, "learning_rate": 2.988621651931777e-06, "loss": 0.377, "num_input_tokens_seen": 14883688, "step": 22680 }, { "epoch": 13.37558962264151, "grad_norm": 2.212005376815796, "learning_rate": 2.986266550706315e-06, "loss": 0.3142, "num_input_tokens_seen": 14886888, "step": 22685 }, { "epoch": 13.378537735849056, "grad_norm": 4.528445720672607, "learning_rate": 2.9839119826254627e-06, "loss": 0.33, "num_input_tokens_seen": 14889672, "step": 22690 }, { "epoch": 13.381485849056604, "grad_norm": 3.128779411315918, "learning_rate": 2.981557948312602e-06, "loss": 0.3702, "num_input_tokens_seen": 14893032, "step": 22695 }, { "epoch": 13.38443396226415, "grad_norm": 3.2781243324279785, "learning_rate": 2.9792044483909733e-06, "loss": 0.3147, "num_input_tokens_seen": 14895688, "step": 22700 }, { "epoch": 13.387382075471699, "grad_norm": 3.0737531185150146, "learning_rate": 2.9768514834836767e-06, "loss": 0.2565, "num_input_tokens_seen": 14898600, "step": 22705 }, { "epoch": 13.390330188679245, "grad_norm": 3.4172897338867188, "learning_rate": 2.9744990542136685e-06, "loss": 0.2987, "num_input_tokens_seen": 14901736, "step": 22710 }, { "epoch": 13.393278301886792, "grad_norm": 1.820635199546814, "learning_rate": 2.9721471612037637e-06, "loss": 0.3537, "num_input_tokens_seen": 14904232, "step": 22715 }, { "epoch": 13.39622641509434, "grad_norm": 2.9424421787261963, "learning_rate": 2.9697958050766385e-06, "loss": 0.3489, "num_input_tokens_seen": 14912232, "step": 22720 }, { "epoch": 13.399174528301886, "grad_norm": 4.64182710647583, "learning_rate": 2.967444986454825e-06, "loss": 0.3092, "num_input_tokens_seen": 14915080, "step": 22725 }, { "epoch": 13.402122641509434, "grad_norm": 2.8752429485321045, "learning_rate": 2.9650947059607106e-06, "loss": 0.2987, "num_input_tokens_seen": 14918216, "step": 22730 }, { "epoch": 13.40507075471698, "grad_norm": 2.893691062927246, "learning_rate": 2.962744964216542e-06, "loss": 0.3636, "num_input_tokens_seen": 14920872, "step": 22735 }, { "epoch": 13.408018867924529, "grad_norm": 2.384772777557373, "learning_rate": 2.960395761844425e-06, "loss": 0.2743, "num_input_tokens_seen": 14924008, "step": 22740 }, { "epoch": 13.410966981132075, "grad_norm": 3.5482006072998047, "learning_rate": 2.95804709946632e-06, "loss": 0.3743, "num_input_tokens_seen": 14927496, "step": 22745 }, { "epoch": 13.413915094339623, "grad_norm": 2.8433709144592285, "learning_rate": 2.9556989777040457e-06, "loss": 0.2752, "num_input_tokens_seen": 14930184, "step": 22750 }, { "epoch": 13.41686320754717, "grad_norm": 4.722848415374756, "learning_rate": 2.9533513971792776e-06, "loss": 0.3542, "num_input_tokens_seen": 14933032, "step": 22755 }, { "epoch": 13.419811320754716, "grad_norm": 3.0234522819519043, "learning_rate": 2.9510043585135473e-06, "loss": 0.374, "num_input_tokens_seen": 14936104, "step": 22760 }, { "epoch": 13.422759433962264, "grad_norm": 4.048891544342041, "learning_rate": 2.948657862328244e-06, "loss": 0.353, "num_input_tokens_seen": 14939528, "step": 22765 }, { "epoch": 13.42570754716981, "grad_norm": 2.40956711769104, "learning_rate": 2.946311909244613e-06, "loss": 0.3251, "num_input_tokens_seen": 14942760, "step": 22770 }, { "epoch": 13.428655660377359, "grad_norm": 3.8587534427642822, "learning_rate": 2.9439664998837538e-06, "loss": 0.4225, "num_input_tokens_seen": 14945384, "step": 22775 }, { "epoch": 13.431603773584905, "grad_norm": 2.4043803215026855, "learning_rate": 2.941621634866626e-06, "loss": 0.3524, "num_input_tokens_seen": 14948968, "step": 22780 }, { "epoch": 13.434551886792454, "grad_norm": 2.7656259536743164, "learning_rate": 2.9392773148140406e-06, "loss": 0.3115, "num_input_tokens_seen": 14952392, "step": 22785 }, { "epoch": 13.4375, "grad_norm": 2.960615634918213, "learning_rate": 2.9369335403466676e-06, "loss": 0.2973, "num_input_tokens_seen": 14955016, "step": 22790 }, { "epoch": 13.440448113207546, "grad_norm": 2.2410197257995605, "learning_rate": 2.9345903120850318e-06, "loss": 0.3608, "num_input_tokens_seen": 14959112, "step": 22795 }, { "epoch": 13.443396226415095, "grad_norm": 4.238822937011719, "learning_rate": 2.932247630649512e-06, "loss": 0.3954, "num_input_tokens_seen": 14962248, "step": 22800 }, { "epoch": 13.446344339622641, "grad_norm": 2.7644033432006836, "learning_rate": 2.9299054966603424e-06, "loss": 0.4338, "num_input_tokens_seen": 14965256, "step": 22805 }, { "epoch": 13.44929245283019, "grad_norm": 4.235435962677002, "learning_rate": 2.927563910737613e-06, "loss": 0.4232, "num_input_tokens_seen": 14968168, "step": 22810 }, { "epoch": 13.452240566037736, "grad_norm": 2.991079807281494, "learning_rate": 2.9252228735012722e-06, "loss": 0.3377, "num_input_tokens_seen": 14971464, "step": 22815 }, { "epoch": 13.455188679245284, "grad_norm": 2.400155782699585, "learning_rate": 2.9228823855711174e-06, "loss": 0.3577, "num_input_tokens_seen": 14974600, "step": 22820 }, { "epoch": 13.45813679245283, "grad_norm": 2.0167527198791504, "learning_rate": 2.920542447566802e-06, "loss": 0.3398, "num_input_tokens_seen": 14977160, "step": 22825 }, { "epoch": 13.461084905660377, "grad_norm": 2.9820282459259033, "learning_rate": 2.918203060107837e-06, "loss": 0.3085, "num_input_tokens_seen": 14980168, "step": 22830 }, { "epoch": 13.464033018867925, "grad_norm": 1.9181057214736938, "learning_rate": 2.9158642238135813e-06, "loss": 0.2324, "num_input_tokens_seen": 14983688, "step": 22835 }, { "epoch": 13.466981132075471, "grad_norm": 3.2153797149658203, "learning_rate": 2.913525939303257e-06, "loss": 0.4631, "num_input_tokens_seen": 14986888, "step": 22840 }, { "epoch": 13.46992924528302, "grad_norm": 6.353240489959717, "learning_rate": 2.9111882071959317e-06, "loss": 0.3792, "num_input_tokens_seen": 14990088, "step": 22845 }, { "epoch": 13.472877358490566, "grad_norm": 2.2239274978637695, "learning_rate": 2.908851028110532e-06, "loss": 0.3487, "num_input_tokens_seen": 14993832, "step": 22850 }, { "epoch": 13.475825471698114, "grad_norm": 3.309965133666992, "learning_rate": 2.906514402665834e-06, "loss": 0.3428, "num_input_tokens_seen": 14997800, "step": 22855 }, { "epoch": 13.47877358490566, "grad_norm": 7.1186981201171875, "learning_rate": 2.9041783314804705e-06, "loss": 0.2747, "num_input_tokens_seen": 15001320, "step": 22860 }, { "epoch": 13.481721698113208, "grad_norm": 2.275710105895996, "learning_rate": 2.9018428151729238e-06, "loss": 0.3955, "num_input_tokens_seen": 15005096, "step": 22865 }, { "epoch": 13.484669811320755, "grad_norm": 3.8312160968780518, "learning_rate": 2.899507854361537e-06, "loss": 0.246, "num_input_tokens_seen": 15008040, "step": 22870 }, { "epoch": 13.487617924528301, "grad_norm": 3.2549068927764893, "learning_rate": 2.8971734496644975e-06, "loss": 0.4164, "num_input_tokens_seen": 15010888, "step": 22875 }, { "epoch": 13.49056603773585, "grad_norm": 5.7766337394714355, "learning_rate": 2.894839601699851e-06, "loss": 0.4265, "num_input_tokens_seen": 15014312, "step": 22880 }, { "epoch": 13.493514150943396, "grad_norm": 3.729243040084839, "learning_rate": 2.8925063110854923e-06, "loss": 0.3988, "num_input_tokens_seen": 15019016, "step": 22885 }, { "epoch": 13.496462264150944, "grad_norm": 1.9186397790908813, "learning_rate": 2.8901735784391683e-06, "loss": 0.5801, "num_input_tokens_seen": 15021864, "step": 22890 }, { "epoch": 13.49941037735849, "grad_norm": 2.4799869060516357, "learning_rate": 2.8878414043784844e-06, "loss": 0.3024, "num_input_tokens_seen": 15025512, "step": 22895 }, { "epoch": 13.502358490566039, "grad_norm": 3.399651288986206, "learning_rate": 2.885509789520891e-06, "loss": 0.2798, "num_input_tokens_seen": 15028104, "step": 22900 }, { "epoch": 13.505306603773585, "grad_norm": 2.3681015968322754, "learning_rate": 2.8831787344836926e-06, "loss": 0.3772, "num_input_tokens_seen": 15032168, "step": 22905 }, { "epoch": 13.508254716981131, "grad_norm": 5.766441822052002, "learning_rate": 2.880848239884049e-06, "loss": 0.4144, "num_input_tokens_seen": 15035080, "step": 22910 }, { "epoch": 13.51120283018868, "grad_norm": 4.3512773513793945, "learning_rate": 2.8785183063389667e-06, "loss": 0.4136, "num_input_tokens_seen": 15040392, "step": 22915 }, { "epoch": 13.514150943396226, "grad_norm": 2.659827470779419, "learning_rate": 2.876188934465306e-06, "loss": 0.4059, "num_input_tokens_seen": 15044072, "step": 22920 }, { "epoch": 13.517099056603774, "grad_norm": 2.2953131198883057, "learning_rate": 2.8738601248797758e-06, "loss": 0.2905, "num_input_tokens_seen": 15046888, "step": 22925 }, { "epoch": 13.52004716981132, "grad_norm": 3.018979072570801, "learning_rate": 2.8715318781989432e-06, "loss": 0.3122, "num_input_tokens_seen": 15050536, "step": 22930 }, { "epoch": 13.522995283018869, "grad_norm": 3.367251396179199, "learning_rate": 2.869204195039219e-06, "loss": 0.4261, "num_input_tokens_seen": 15054184, "step": 22935 }, { "epoch": 13.525943396226415, "grad_norm": 4.4344635009765625, "learning_rate": 2.8668770760168673e-06, "loss": 0.3571, "num_input_tokens_seen": 15058056, "step": 22940 }, { "epoch": 13.528891509433961, "grad_norm": 2.3951163291931152, "learning_rate": 2.864550521748003e-06, "loss": 0.2344, "num_input_tokens_seen": 15060680, "step": 22945 }, { "epoch": 13.53183962264151, "grad_norm": 5.834720611572266, "learning_rate": 2.862224532848591e-06, "loss": 0.311, "num_input_tokens_seen": 15063976, "step": 22950 }, { "epoch": 13.534787735849056, "grad_norm": 1.8310600519180298, "learning_rate": 2.8598991099344455e-06, "loss": 0.2783, "num_input_tokens_seen": 15067848, "step": 22955 }, { "epoch": 13.537735849056604, "grad_norm": 2.9392106533050537, "learning_rate": 2.857574253621236e-06, "loss": 0.4598, "num_input_tokens_seen": 15070344, "step": 22960 }, { "epoch": 13.54068396226415, "grad_norm": 4.063082695007324, "learning_rate": 2.855249964524476e-06, "loss": 0.3255, "num_input_tokens_seen": 15073064, "step": 22965 }, { "epoch": 13.543632075471699, "grad_norm": 2.512906551361084, "learning_rate": 2.852926243259531e-06, "loss": 0.3948, "num_input_tokens_seen": 15077064, "step": 22970 }, { "epoch": 13.546580188679245, "grad_norm": 3.4805541038513184, "learning_rate": 2.850603090441617e-06, "loss": 0.3056, "num_input_tokens_seen": 15079944, "step": 22975 }, { "epoch": 13.549528301886792, "grad_norm": 3.2060048580169678, "learning_rate": 2.848280506685798e-06, "loss": 0.359, "num_input_tokens_seen": 15083304, "step": 22980 }, { "epoch": 13.55247641509434, "grad_norm": 3.2011427879333496, "learning_rate": 2.845958492606986e-06, "loss": 0.36, "num_input_tokens_seen": 15086152, "step": 22985 }, { "epoch": 13.555424528301886, "grad_norm": 5.012073993682861, "learning_rate": 2.843637048819949e-06, "loss": 0.2835, "num_input_tokens_seen": 15088392, "step": 22990 }, { "epoch": 13.558372641509434, "grad_norm": 2.975238561630249, "learning_rate": 2.8413161759392966e-06, "loss": 0.2479, "num_input_tokens_seen": 15091016, "step": 22995 }, { "epoch": 13.56132075471698, "grad_norm": 2.8017220497131348, "learning_rate": 2.8389958745794878e-06, "loss": 0.2832, "num_input_tokens_seen": 15094536, "step": 23000 }, { "epoch": 13.564268867924529, "grad_norm": 11.676660537719727, "learning_rate": 2.8366761453548366e-06, "loss": 0.3422, "num_input_tokens_seen": 15096968, "step": 23005 }, { "epoch": 13.567216981132075, "grad_norm": 4.622337818145752, "learning_rate": 2.8343569888795e-06, "loss": 0.3057, "num_input_tokens_seen": 15100584, "step": 23010 }, { "epoch": 13.570165094339622, "grad_norm": 2.1507351398468018, "learning_rate": 2.832038405767483e-06, "loss": 0.2864, "num_input_tokens_seen": 15103752, "step": 23015 }, { "epoch": 13.57311320754717, "grad_norm": 2.6558854579925537, "learning_rate": 2.8297203966326397e-06, "loss": 0.2941, "num_input_tokens_seen": 15106984, "step": 23020 }, { "epoch": 13.576061320754716, "grad_norm": 2.356536388397217, "learning_rate": 2.8274029620886773e-06, "loss": 0.3131, "num_input_tokens_seen": 15109480, "step": 23025 }, { "epoch": 13.579009433962264, "grad_norm": 2.3406312465667725, "learning_rate": 2.825086102749144e-06, "loss": 0.3373, "num_input_tokens_seen": 15113416, "step": 23030 }, { "epoch": 13.58195754716981, "grad_norm": 2.367610454559326, "learning_rate": 2.822769819227438e-06, "loss": 0.2984, "num_input_tokens_seen": 15116808, "step": 23035 }, { "epoch": 13.584905660377359, "grad_norm": 3.956859827041626, "learning_rate": 2.8204541121368055e-06, "loss": 0.3504, "num_input_tokens_seen": 15119528, "step": 23040 }, { "epoch": 13.587853773584905, "grad_norm": 2.5750374794006348, "learning_rate": 2.8181389820903402e-06, "loss": 0.3277, "num_input_tokens_seen": 15123208, "step": 23045 }, { "epoch": 13.590801886792454, "grad_norm": 2.9734761714935303, "learning_rate": 2.8158244297009814e-06, "loss": 0.4339, "num_input_tokens_seen": 15126824, "step": 23050 }, { "epoch": 13.59375, "grad_norm": 3.101776123046875, "learning_rate": 2.8135104555815196e-06, "loss": 0.3398, "num_input_tokens_seen": 15130920, "step": 23055 }, { "epoch": 13.596698113207546, "grad_norm": 2.5838418006896973, "learning_rate": 2.811197060344588e-06, "loss": 0.4002, "num_input_tokens_seen": 15134504, "step": 23060 }, { "epoch": 13.599646226415095, "grad_norm": 2.2848284244537354, "learning_rate": 2.8088842446026677e-06, "loss": 0.3191, "num_input_tokens_seen": 15137896, "step": 23065 }, { "epoch": 13.602594339622641, "grad_norm": 2.501537561416626, "learning_rate": 2.806572008968087e-06, "loss": 0.2626, "num_input_tokens_seen": 15140584, "step": 23070 }, { "epoch": 13.60554245283019, "grad_norm": 3.587226629257202, "learning_rate": 2.80426035405302e-06, "loss": 0.4101, "num_input_tokens_seen": 15143848, "step": 23075 }, { "epoch": 13.608490566037736, "grad_norm": 2.3952860832214355, "learning_rate": 2.8019492804694852e-06, "loss": 0.3362, "num_input_tokens_seen": 15146824, "step": 23080 }, { "epoch": 13.611438679245284, "grad_norm": 2.9485926628112793, "learning_rate": 2.799638788829354e-06, "loss": 0.4079, "num_input_tokens_seen": 15149576, "step": 23085 }, { "epoch": 13.61438679245283, "grad_norm": 4.73024845123291, "learning_rate": 2.7973288797443367e-06, "loss": 0.4101, "num_input_tokens_seen": 15152232, "step": 23090 }, { "epoch": 13.617334905660378, "grad_norm": 3.4013521671295166, "learning_rate": 2.7950195538259884e-06, "loss": 0.4664, "num_input_tokens_seen": 15158664, "step": 23095 }, { "epoch": 13.620283018867925, "grad_norm": 3.8979105949401855, "learning_rate": 2.792710811685719e-06, "loss": 0.4431, "num_input_tokens_seen": 15163304, "step": 23100 }, { "epoch": 13.623231132075471, "grad_norm": 2.535388469696045, "learning_rate": 2.7904026539347743e-06, "loss": 0.4573, "num_input_tokens_seen": 15166824, "step": 23105 }, { "epoch": 13.62617924528302, "grad_norm": 1.8969595432281494, "learning_rate": 2.7880950811842507e-06, "loss": 0.2882, "num_input_tokens_seen": 15169704, "step": 23110 }, { "epoch": 13.629127358490566, "grad_norm": 2.1833178997039795, "learning_rate": 2.785788094045085e-06, "loss": 0.3347, "num_input_tokens_seen": 15173096, "step": 23115 }, { "epoch": 13.632075471698114, "grad_norm": 2.7880125045776367, "learning_rate": 2.7834816931280655e-06, "loss": 0.4568, "num_input_tokens_seen": 15176680, "step": 23120 }, { "epoch": 13.63502358490566, "grad_norm": 4.2647905349731445, "learning_rate": 2.781175879043821e-06, "loss": 0.3503, "num_input_tokens_seen": 15180360, "step": 23125 }, { "epoch": 13.637971698113208, "grad_norm": 2.078787088394165, "learning_rate": 2.778870652402825e-06, "loss": 0.2708, "num_input_tokens_seen": 15184616, "step": 23130 }, { "epoch": 13.640919811320755, "grad_norm": 2.9178037643432617, "learning_rate": 2.776566013815396e-06, "loss": 0.3476, "num_input_tokens_seen": 15188552, "step": 23135 }, { "epoch": 13.643867924528301, "grad_norm": 2.585348129272461, "learning_rate": 2.774261963891698e-06, "loss": 0.3032, "num_input_tokens_seen": 15190888, "step": 23140 }, { "epoch": 13.64681603773585, "grad_norm": 3.840508222579956, "learning_rate": 2.771958503241735e-06, "loss": 0.3397, "num_input_tokens_seen": 15196168, "step": 23145 }, { "epoch": 13.649764150943396, "grad_norm": 2.92923641204834, "learning_rate": 2.769655632475362e-06, "loss": 0.3507, "num_input_tokens_seen": 15199240, "step": 23150 }, { "epoch": 13.652712264150944, "grad_norm": 3.298246383666992, "learning_rate": 2.7673533522022733e-06, "loss": 0.3506, "num_input_tokens_seen": 15202824, "step": 23155 }, { "epoch": 13.65566037735849, "grad_norm": 1.913661003112793, "learning_rate": 2.765051663032007e-06, "loss": 0.2685, "num_input_tokens_seen": 15205736, "step": 23160 }, { "epoch": 13.658608490566039, "grad_norm": 2.52078914642334, "learning_rate": 2.7627505655739446e-06, "loss": 0.31, "num_input_tokens_seen": 15208168, "step": 23165 }, { "epoch": 13.661556603773585, "grad_norm": 3.8814809322357178, "learning_rate": 2.7604500604373097e-06, "loss": 0.2964, "num_input_tokens_seen": 15210920, "step": 23170 }, { "epoch": 13.664504716981131, "grad_norm": 2.170159101486206, "learning_rate": 2.7581501482311757e-06, "loss": 0.3136, "num_input_tokens_seen": 15214376, "step": 23175 }, { "epoch": 13.66745283018868, "grad_norm": 1.9587795734405518, "learning_rate": 2.7558508295644513e-06, "loss": 0.3198, "num_input_tokens_seen": 15218920, "step": 23180 }, { "epoch": 13.670400943396226, "grad_norm": 2.3946847915649414, "learning_rate": 2.7535521050458922e-06, "loss": 0.3503, "num_input_tokens_seen": 15221896, "step": 23185 }, { "epoch": 13.673349056603774, "grad_norm": 4.506974220275879, "learning_rate": 2.7512539752840926e-06, "loss": 0.3575, "num_input_tokens_seen": 15224616, "step": 23190 }, { "epoch": 13.67629716981132, "grad_norm": 3.212613105773926, "learning_rate": 2.748956440887497e-06, "loss": 0.5269, "num_input_tokens_seen": 15228680, "step": 23195 }, { "epoch": 13.679245283018869, "grad_norm": 3.7049543857574463, "learning_rate": 2.7466595024643843e-06, "loss": 0.3971, "num_input_tokens_seen": 15231592, "step": 23200 }, { "epoch": 13.682193396226415, "grad_norm": 4.732707500457764, "learning_rate": 2.744363160622878e-06, "loss": 0.3114, "num_input_tokens_seen": 15234984, "step": 23205 }, { "epoch": 13.685141509433961, "grad_norm": 1.9613673686981201, "learning_rate": 2.742067415970948e-06, "loss": 0.3314, "num_input_tokens_seen": 15238824, "step": 23210 }, { "epoch": 13.68808962264151, "grad_norm": 4.073821067810059, "learning_rate": 2.739772269116402e-06, "loss": 0.372, "num_input_tokens_seen": 15241736, "step": 23215 }, { "epoch": 13.691037735849056, "grad_norm": 3.5856029987335205, "learning_rate": 2.7374777206668874e-06, "loss": 0.3527, "num_input_tokens_seen": 15245224, "step": 23220 }, { "epoch": 13.693985849056604, "grad_norm": 4.132888317108154, "learning_rate": 2.735183771229898e-06, "loss": 0.3755, "num_input_tokens_seen": 15247528, "step": 23225 }, { "epoch": 13.69693396226415, "grad_norm": 3.1307144165039062, "learning_rate": 2.732890421412765e-06, "loss": 0.3027, "num_input_tokens_seen": 15250408, "step": 23230 }, { "epoch": 13.699882075471699, "grad_norm": 2.236030340194702, "learning_rate": 2.7305976718226624e-06, "loss": 0.3635, "num_input_tokens_seen": 15253512, "step": 23235 }, { "epoch": 13.702830188679245, "grad_norm": 3.8586819171905518, "learning_rate": 2.728305523066609e-06, "loss": 0.3789, "num_input_tokens_seen": 15256584, "step": 23240 }, { "epoch": 13.705778301886792, "grad_norm": 2.466792106628418, "learning_rate": 2.726013975751458e-06, "loss": 0.3655, "num_input_tokens_seen": 15259432, "step": 23245 }, { "epoch": 13.70872641509434, "grad_norm": 3.4348294734954834, "learning_rate": 2.723723030483908e-06, "loss": 0.2645, "num_input_tokens_seen": 15262504, "step": 23250 }, { "epoch": 13.711674528301886, "grad_norm": 2.4731369018554688, "learning_rate": 2.7214326878704953e-06, "loss": 0.3342, "num_input_tokens_seen": 15265288, "step": 23255 }, { "epoch": 13.714622641509434, "grad_norm": 3.2552268505096436, "learning_rate": 2.7191429485175993e-06, "loss": 0.5794, "num_input_tokens_seen": 15268360, "step": 23260 }, { "epoch": 13.71757075471698, "grad_norm": 6.500421524047852, "learning_rate": 2.716853813031435e-06, "loss": 0.4481, "num_input_tokens_seen": 15272392, "step": 23265 }, { "epoch": 13.720518867924529, "grad_norm": 2.455156087875366, "learning_rate": 2.714565282018066e-06, "loss": 0.3212, "num_input_tokens_seen": 15275656, "step": 23270 }, { "epoch": 13.723466981132075, "grad_norm": 3.0550012588500977, "learning_rate": 2.7122773560833877e-06, "loss": 0.3178, "num_input_tokens_seen": 15278312, "step": 23275 }, { "epoch": 13.726415094339622, "grad_norm": 2.5457775592803955, "learning_rate": 2.709990035833139e-06, "loss": 0.2919, "num_input_tokens_seen": 15281832, "step": 23280 }, { "epoch": 13.72936320754717, "grad_norm": 2.6261162757873535, "learning_rate": 2.707703321872896e-06, "loss": 0.2527, "num_input_tokens_seen": 15285128, "step": 23285 }, { "epoch": 13.732311320754716, "grad_norm": 3.107254981994629, "learning_rate": 2.705417214808079e-06, "loss": 0.2711, "num_input_tokens_seen": 15288712, "step": 23290 }, { "epoch": 13.735259433962264, "grad_norm": 3.676213026046753, "learning_rate": 2.703131715243945e-06, "loss": 0.3098, "num_input_tokens_seen": 15291688, "step": 23295 }, { "epoch": 13.73820754716981, "grad_norm": 3.9595181941986084, "learning_rate": 2.7008468237855855e-06, "loss": 0.3497, "num_input_tokens_seen": 15294760, "step": 23300 }, { "epoch": 13.741155660377359, "grad_norm": 2.9160945415496826, "learning_rate": 2.69856254103794e-06, "loss": 0.2338, "num_input_tokens_seen": 15297768, "step": 23305 }, { "epoch": 13.744103773584905, "grad_norm": 3.5498416423797607, "learning_rate": 2.6962788676057806e-06, "loss": 0.281, "num_input_tokens_seen": 15301032, "step": 23310 }, { "epoch": 13.747051886792454, "grad_norm": 5.444496154785156, "learning_rate": 2.69399580409372e-06, "loss": 0.4105, "num_input_tokens_seen": 15305800, "step": 23315 }, { "epoch": 13.75, "grad_norm": 3.301224708557129, "learning_rate": 2.6917133511062076e-06, "loss": 0.3468, "num_input_tokens_seen": 15308584, "step": 23320 }, { "epoch": 13.752948113207546, "grad_norm": 3.636467218399048, "learning_rate": 2.6894315092475342e-06, "loss": 0.3119, "num_input_tokens_seen": 15311816, "step": 23325 }, { "epoch": 13.755896226415095, "grad_norm": 2.631821632385254, "learning_rate": 2.6871502791218245e-06, "loss": 0.3223, "num_input_tokens_seen": 15315240, "step": 23330 }, { "epoch": 13.758844339622641, "grad_norm": 3.0172386169433594, "learning_rate": 2.684869661333048e-06, "loss": 0.2845, "num_input_tokens_seen": 15318568, "step": 23335 }, { "epoch": 13.76179245283019, "grad_norm": 7.356691837310791, "learning_rate": 2.6825896564850074e-06, "loss": 0.4225, "num_input_tokens_seen": 15321320, "step": 23340 }, { "epoch": 13.764740566037736, "grad_norm": 1.9219379425048828, "learning_rate": 2.6803102651813416e-06, "loss": 0.351, "num_input_tokens_seen": 15326120, "step": 23345 }, { "epoch": 13.767688679245284, "grad_norm": 9.455195426940918, "learning_rate": 2.6780314880255307e-06, "loss": 0.4002, "num_input_tokens_seen": 15332808, "step": 23350 }, { "epoch": 13.77063679245283, "grad_norm": 3.262770891189575, "learning_rate": 2.675753325620891e-06, "loss": 0.4042, "num_input_tokens_seen": 15335176, "step": 23355 }, { "epoch": 13.773584905660378, "grad_norm": 2.7925331592559814, "learning_rate": 2.6734757785705727e-06, "loss": 0.4143, "num_input_tokens_seen": 15339016, "step": 23360 }, { "epoch": 13.776533018867925, "grad_norm": 2.732698917388916, "learning_rate": 2.6711988474775712e-06, "loss": 0.4183, "num_input_tokens_seen": 15342728, "step": 23365 }, { "epoch": 13.779481132075471, "grad_norm": 3.1652562618255615, "learning_rate": 2.668922532944711e-06, "loss": 0.4122, "num_input_tokens_seen": 15345672, "step": 23370 }, { "epoch": 13.78242924528302, "grad_norm": 2.908952236175537, "learning_rate": 2.6666468355746566e-06, "loss": 0.2914, "num_input_tokens_seen": 15348904, "step": 23375 }, { "epoch": 13.785377358490566, "grad_norm": 1.908881664276123, "learning_rate": 2.6643717559699073e-06, "loss": 0.4608, "num_input_tokens_seen": 15352104, "step": 23380 }, { "epoch": 13.788325471698114, "grad_norm": 3.6340489387512207, "learning_rate": 2.662097294732803e-06, "loss": 0.4448, "num_input_tokens_seen": 15354984, "step": 23385 }, { "epoch": 13.79127358490566, "grad_norm": 3.6219003200531006, "learning_rate": 2.6598234524655165e-06, "loss": 0.3341, "num_input_tokens_seen": 15358824, "step": 23390 }, { "epoch": 13.794221698113208, "grad_norm": 4.104816913604736, "learning_rate": 2.657550229770054e-06, "loss": 0.2772, "num_input_tokens_seen": 15362376, "step": 23395 }, { "epoch": 13.797169811320755, "grad_norm": 2.6682910919189453, "learning_rate": 2.655277627248265e-06, "loss": 0.2951, "num_input_tokens_seen": 15365352, "step": 23400 }, { "epoch": 13.800117924528301, "grad_norm": 2.386286735534668, "learning_rate": 2.65300564550183e-06, "loss": 0.4369, "num_input_tokens_seen": 15367720, "step": 23405 }, { "epoch": 13.80306603773585, "grad_norm": 2.3025617599487305, "learning_rate": 2.6507342851322647e-06, "loss": 0.2939, "num_input_tokens_seen": 15370632, "step": 23410 }, { "epoch": 13.806014150943396, "grad_norm": 4.214304447174072, "learning_rate": 2.6484635467409233e-06, "loss": 0.3643, "num_input_tokens_seen": 15373096, "step": 23415 }, { "epoch": 13.808962264150944, "grad_norm": 1.8736987113952637, "learning_rate": 2.64619343092899e-06, "loss": 0.4168, "num_input_tokens_seen": 15377320, "step": 23420 }, { "epoch": 13.81191037735849, "grad_norm": 3.1527576446533203, "learning_rate": 2.643923938297492e-06, "loss": 0.3035, "num_input_tokens_seen": 15381448, "step": 23425 }, { "epoch": 13.814858490566039, "grad_norm": 4.084475040435791, "learning_rate": 2.6416550694472855e-06, "loss": 0.4062, "num_input_tokens_seen": 15384936, "step": 23430 }, { "epoch": 13.817806603773585, "grad_norm": 2.594305992126465, "learning_rate": 2.639386824979063e-06, "loss": 0.3516, "num_input_tokens_seen": 15388136, "step": 23435 }, { "epoch": 13.820754716981131, "grad_norm": 2.34702467918396, "learning_rate": 2.6371192054933525e-06, "loss": 0.3727, "num_input_tokens_seen": 15391720, "step": 23440 }, { "epoch": 13.82370283018868, "grad_norm": 3.216477155685425, "learning_rate": 2.634852211590516e-06, "loss": 0.4069, "num_input_tokens_seen": 15394888, "step": 23445 }, { "epoch": 13.826650943396226, "grad_norm": 2.335145950317383, "learning_rate": 2.6325858438707473e-06, "loss": 0.3207, "num_input_tokens_seen": 15398024, "step": 23450 }, { "epoch": 13.829599056603774, "grad_norm": 2.955345392227173, "learning_rate": 2.630320102934082e-06, "loss": 0.3851, "num_input_tokens_seen": 15402056, "step": 23455 }, { "epoch": 13.83254716981132, "grad_norm": 4.749246597290039, "learning_rate": 2.628054989380382e-06, "loss": 0.3498, "num_input_tokens_seen": 15406440, "step": 23460 }, { "epoch": 13.835495283018869, "grad_norm": 2.949122428894043, "learning_rate": 2.625790503809346e-06, "loss": 0.3316, "num_input_tokens_seen": 15409320, "step": 23465 }, { "epoch": 13.838443396226415, "grad_norm": 2.3663883209228516, "learning_rate": 2.6235266468205067e-06, "loss": 0.4042, "num_input_tokens_seen": 15412648, "step": 23470 }, { "epoch": 13.841391509433961, "grad_norm": 2.7810027599334717, "learning_rate": 2.621263419013227e-06, "loss": 0.3905, "num_input_tokens_seen": 15415528, "step": 23475 }, { "epoch": 13.84433962264151, "grad_norm": 5.1194305419921875, "learning_rate": 2.619000820986711e-06, "loss": 0.2766, "num_input_tokens_seen": 15419400, "step": 23480 }, { "epoch": 13.847287735849056, "grad_norm": 2.5054807662963867, "learning_rate": 2.616738853339988e-06, "loss": 0.3226, "num_input_tokens_seen": 15423592, "step": 23485 }, { "epoch": 13.850235849056604, "grad_norm": 3.936861991882324, "learning_rate": 2.614477516671926e-06, "loss": 0.3685, "num_input_tokens_seen": 15426792, "step": 23490 }, { "epoch": 13.85318396226415, "grad_norm": 3.9115688800811768, "learning_rate": 2.612216811581223e-06, "loss": 0.3296, "num_input_tokens_seen": 15430088, "step": 23495 }, { "epoch": 13.856132075471699, "grad_norm": 3.7003133296966553, "learning_rate": 2.6099567386664095e-06, "loss": 0.283, "num_input_tokens_seen": 15433288, "step": 23500 }, { "epoch": 13.859080188679245, "grad_norm": 2.629554510116577, "learning_rate": 2.60769729852585e-06, "loss": 0.3125, "num_input_tokens_seen": 15436648, "step": 23505 }, { "epoch": 13.862028301886792, "grad_norm": 4.241819858551025, "learning_rate": 2.6054384917577413e-06, "loss": 0.5278, "num_input_tokens_seen": 15439688, "step": 23510 }, { "epoch": 13.86497641509434, "grad_norm": 1.8935102224349976, "learning_rate": 2.60318031896011e-06, "loss": 0.3457, "num_input_tokens_seen": 15442504, "step": 23515 }, { "epoch": 13.867924528301886, "grad_norm": 2.5282013416290283, "learning_rate": 2.60092278073082e-06, "loss": 0.3929, "num_input_tokens_seen": 15446120, "step": 23520 }, { "epoch": 13.870872641509434, "grad_norm": 2.510122060775757, "learning_rate": 2.5986658776675644e-06, "loss": 0.4347, "num_input_tokens_seen": 15450216, "step": 23525 }, { "epoch": 13.87382075471698, "grad_norm": 5.261634349822998, "learning_rate": 2.5964096103678666e-06, "loss": 0.246, "num_input_tokens_seen": 15453416, "step": 23530 }, { "epoch": 13.876768867924529, "grad_norm": 2.4386098384857178, "learning_rate": 2.5941539794290833e-06, "loss": 0.3676, "num_input_tokens_seen": 15456104, "step": 23535 }, { "epoch": 13.879716981132075, "grad_norm": 2.9384422302246094, "learning_rate": 2.5918989854484024e-06, "loss": 0.27, "num_input_tokens_seen": 15458280, "step": 23540 }, { "epoch": 13.882665094339622, "grad_norm": 2.196805953979492, "learning_rate": 2.5896446290228417e-06, "loss": 0.4511, "num_input_tokens_seen": 15461192, "step": 23545 }, { "epoch": 13.88561320754717, "grad_norm": 2.037470579147339, "learning_rate": 2.5873909107492547e-06, "loss": 0.2653, "num_input_tokens_seen": 15464008, "step": 23550 }, { "epoch": 13.888561320754716, "grad_norm": 2.2036736011505127, "learning_rate": 2.5851378312243224e-06, "loss": 0.4113, "num_input_tokens_seen": 15467976, "step": 23555 }, { "epoch": 13.891509433962264, "grad_norm": 2.09246826171875, "learning_rate": 2.5828853910445572e-06, "loss": 0.383, "num_input_tokens_seen": 15471080, "step": 23560 }, { "epoch": 13.89445754716981, "grad_norm": 3.56240177154541, "learning_rate": 2.5806335908063012e-06, "loss": 0.3548, "num_input_tokens_seen": 15473736, "step": 23565 }, { "epoch": 13.897405660377359, "grad_norm": 9.931920051574707, "learning_rate": 2.5783824311057293e-06, "loss": 0.419, "num_input_tokens_seen": 15477032, "step": 23570 }, { "epoch": 13.900353773584905, "grad_norm": 2.9077236652374268, "learning_rate": 2.5761319125388433e-06, "loss": 0.2706, "num_input_tokens_seen": 15480040, "step": 23575 }, { "epoch": 13.903301886792454, "grad_norm": 3.56777024269104, "learning_rate": 2.57388203570148e-06, "loss": 0.437, "num_input_tokens_seen": 15482856, "step": 23580 }, { "epoch": 13.90625, "grad_norm": 2.438924789428711, "learning_rate": 2.5716328011893055e-06, "loss": 0.3184, "num_input_tokens_seen": 15486088, "step": 23585 }, { "epoch": 13.909198113207546, "grad_norm": 4.839676856994629, "learning_rate": 2.5693842095978127e-06, "loss": 0.2928, "num_input_tokens_seen": 15489608, "step": 23590 }, { "epoch": 13.912146226415095, "grad_norm": 4.828232288360596, "learning_rate": 2.567136261522325e-06, "loss": 0.3647, "num_input_tokens_seen": 15493416, "step": 23595 }, { "epoch": 13.915094339622641, "grad_norm": 2.1982319355010986, "learning_rate": 2.5648889575579985e-06, "loss": 0.4483, "num_input_tokens_seen": 15496552, "step": 23600 }, { "epoch": 13.91804245283019, "grad_norm": 5.982715129852295, "learning_rate": 2.562642298299814e-06, "loss": 0.4073, "num_input_tokens_seen": 15499688, "step": 23605 }, { "epoch": 13.920990566037736, "grad_norm": 4.357577800750732, "learning_rate": 2.560396284342584e-06, "loss": 0.3699, "num_input_tokens_seen": 15503240, "step": 23610 }, { "epoch": 13.923938679245284, "grad_norm": 3.2490029335021973, "learning_rate": 2.558150916280954e-06, "loss": 0.3336, "num_input_tokens_seen": 15507528, "step": 23615 }, { "epoch": 13.92688679245283, "grad_norm": 2.0026254653930664, "learning_rate": 2.555906194709392e-06, "loss": 0.369, "num_input_tokens_seen": 15510504, "step": 23620 }, { "epoch": 13.929834905660378, "grad_norm": 3.0140113830566406, "learning_rate": 2.553662120222199e-06, "loss": 0.2753, "num_input_tokens_seen": 15514440, "step": 23625 }, { "epoch": 13.932783018867925, "grad_norm": 2.7062828540802, "learning_rate": 2.5514186934135026e-06, "loss": 0.3034, "num_input_tokens_seen": 15517736, "step": 23630 }, { "epoch": 13.935731132075471, "grad_norm": 4.5522284507751465, "learning_rate": 2.54917591487726e-06, "loss": 0.2358, "num_input_tokens_seen": 15521192, "step": 23635 }, { "epoch": 13.93867924528302, "grad_norm": 3.269625425338745, "learning_rate": 2.5469337852072547e-06, "loss": 0.2759, "num_input_tokens_seen": 15524264, "step": 23640 }, { "epoch": 13.941627358490566, "grad_norm": 1.9467438459396362, "learning_rate": 2.5446923049971035e-06, "loss": 0.3233, "num_input_tokens_seen": 15527592, "step": 23645 }, { "epoch": 13.944575471698114, "grad_norm": 2.692870616912842, "learning_rate": 2.5424514748402463e-06, "loss": 0.3559, "num_input_tokens_seen": 15531848, "step": 23650 }, { "epoch": 13.94752358490566, "grad_norm": 2.5982275009155273, "learning_rate": 2.540211295329953e-06, "loss": 0.3213, "num_input_tokens_seen": 15535432, "step": 23655 }, { "epoch": 13.950471698113208, "grad_norm": 2.3144266605377197, "learning_rate": 2.5379717670593197e-06, "loss": 0.3102, "num_input_tokens_seen": 15538312, "step": 23660 }, { "epoch": 13.953419811320755, "grad_norm": 3.0332117080688477, "learning_rate": 2.53573289062127e-06, "loss": 0.3071, "num_input_tokens_seen": 15541544, "step": 23665 }, { "epoch": 13.956367924528301, "grad_norm": 3.2685585021972656, "learning_rate": 2.5334946666085605e-06, "loss": 0.2608, "num_input_tokens_seen": 15544360, "step": 23670 }, { "epoch": 13.95931603773585, "grad_norm": 3.4289042949676514, "learning_rate": 2.531257095613766e-06, "loss": 0.3261, "num_input_tokens_seen": 15547752, "step": 23675 }, { "epoch": 13.962264150943396, "grad_norm": 7.076101779937744, "learning_rate": 2.529020178229297e-06, "loss": 0.3359, "num_input_tokens_seen": 15550984, "step": 23680 }, { "epoch": 13.965212264150944, "grad_norm": 4.372530460357666, "learning_rate": 2.5267839150473846e-06, "loss": 0.2479, "num_input_tokens_seen": 15553544, "step": 23685 }, { "epoch": 13.96816037735849, "grad_norm": 5.515628337860107, "learning_rate": 2.5245483066600896e-06, "loss": 0.3238, "num_input_tokens_seen": 15557704, "step": 23690 }, { "epoch": 13.971108490566039, "grad_norm": 4.1123738288879395, "learning_rate": 2.5223133536592996e-06, "loss": 0.4691, "num_input_tokens_seen": 15560744, "step": 23695 }, { "epoch": 13.974056603773585, "grad_norm": 1.9785263538360596, "learning_rate": 2.520079056636725e-06, "loss": 0.3917, "num_input_tokens_seen": 15563464, "step": 23700 }, { "epoch": 13.977004716981131, "grad_norm": 1.7597905397415161, "learning_rate": 2.5178454161839106e-06, "loss": 0.2975, "num_input_tokens_seen": 15566664, "step": 23705 }, { "epoch": 13.97995283018868, "grad_norm": 2.7479426860809326, "learning_rate": 2.5156124328922195e-06, "loss": 0.4251, "num_input_tokens_seen": 15570088, "step": 23710 }, { "epoch": 13.982900943396226, "grad_norm": 2.7167656421661377, "learning_rate": 2.513380107352844e-06, "loss": 0.4008, "num_input_tokens_seen": 15573000, "step": 23715 }, { "epoch": 13.985849056603774, "grad_norm": 2.4053609371185303, "learning_rate": 2.5111484401568014e-06, "loss": 0.2766, "num_input_tokens_seen": 15575336, "step": 23720 }, { "epoch": 13.98879716981132, "grad_norm": 5.146011829376221, "learning_rate": 2.508917431894936e-06, "loss": 0.2881, "num_input_tokens_seen": 15577864, "step": 23725 }, { "epoch": 13.991745283018869, "grad_norm": 3.6907505989074707, "learning_rate": 2.5066870831579144e-06, "loss": 0.3624, "num_input_tokens_seen": 15581064, "step": 23730 }, { "epoch": 13.994693396226415, "grad_norm": 5.027768611907959, "learning_rate": 2.504457394536235e-06, "loss": 0.5137, "num_input_tokens_seen": 15584680, "step": 23735 }, { "epoch": 13.997641509433961, "grad_norm": 2.6475954055786133, "learning_rate": 2.502228366620216e-06, "loss": 0.2972, "num_input_tokens_seen": 15588744, "step": 23740 }, { "epoch": 14.0, "eval_loss": 0.566584587097168, "eval_runtime": 18.6845, "eval_samples_per_second": 90.77, "eval_steps_per_second": 22.693, "num_input_tokens_seen": 15590856, "step": 23744 }, { "epoch": 14.00058962264151, "grad_norm": 3.015045166015625, "learning_rate": 2.5000000000000015e-06, "loss": 0.4359, "num_input_tokens_seen": 15591656, "step": 23745 }, { "epoch": 14.003537735849056, "grad_norm": 2.8435189723968506, "learning_rate": 2.497772295265561e-06, "loss": 0.2632, "num_input_tokens_seen": 15595336, "step": 23750 }, { "epoch": 14.006485849056604, "grad_norm": 2.74514102935791, "learning_rate": 2.4955452530066897e-06, "loss": 0.3398, "num_input_tokens_seen": 15598376, "step": 23755 }, { "epoch": 14.00943396226415, "grad_norm": 2.343170166015625, "learning_rate": 2.4933188738130043e-06, "loss": 0.3586, "num_input_tokens_seen": 15602984, "step": 23760 }, { "epoch": 14.012382075471699, "grad_norm": 2.6953375339508057, "learning_rate": 2.49109315827395e-06, "loss": 0.2274, "num_input_tokens_seen": 15605992, "step": 23765 }, { "epoch": 14.015330188679245, "grad_norm": 3.067012071609497, "learning_rate": 2.4888681069787975e-06, "loss": 0.3798, "num_input_tokens_seen": 15610440, "step": 23770 }, { "epoch": 14.018278301886792, "grad_norm": 4.5910325050354, "learning_rate": 2.4866437205166353e-06, "loss": 0.3732, "num_input_tokens_seen": 15613704, "step": 23775 }, { "epoch": 14.02122641509434, "grad_norm": 1.8203330039978027, "learning_rate": 2.4844199994763803e-06, "loss": 0.3946, "num_input_tokens_seen": 15616296, "step": 23780 }, { "epoch": 14.024174528301886, "grad_norm": 2.648573637008667, "learning_rate": 2.482196944446772e-06, "loss": 0.2832, "num_input_tokens_seen": 15619240, "step": 23785 }, { "epoch": 14.027122641509434, "grad_norm": 2.6871864795684814, "learning_rate": 2.4799745560163736e-06, "loss": 0.2962, "num_input_tokens_seen": 15621896, "step": 23790 }, { "epoch": 14.03007075471698, "grad_norm": 3.544062852859497, "learning_rate": 2.4777528347735707e-06, "loss": 0.3153, "num_input_tokens_seen": 15624424, "step": 23795 }, { "epoch": 14.033018867924529, "grad_norm": 3.0380356311798096, "learning_rate": 2.4755317813065766e-06, "loss": 0.2588, "num_input_tokens_seen": 15627464, "step": 23800 }, { "epoch": 14.035966981132075, "grad_norm": 2.3066718578338623, "learning_rate": 2.4733113962034234e-06, "loss": 0.4569, "num_input_tokens_seen": 15631944, "step": 23805 }, { "epoch": 14.038915094339623, "grad_norm": 4.461876392364502, "learning_rate": 2.4710916800519674e-06, "loss": 0.3693, "num_input_tokens_seen": 15637544, "step": 23810 }, { "epoch": 14.04186320754717, "grad_norm": 2.5970306396484375, "learning_rate": 2.4688726334398883e-06, "loss": 0.2656, "num_input_tokens_seen": 15640872, "step": 23815 }, { "epoch": 14.044811320754716, "grad_norm": 2.922893762588501, "learning_rate": 2.466654256954688e-06, "loss": 0.2999, "num_input_tokens_seen": 15644104, "step": 23820 }, { "epoch": 14.047759433962264, "grad_norm": 3.629657506942749, "learning_rate": 2.4644365511836895e-06, "loss": 0.3518, "num_input_tokens_seen": 15646888, "step": 23825 }, { "epoch": 14.05070754716981, "grad_norm": 2.814878225326538, "learning_rate": 2.4622195167140432e-06, "loss": 0.3511, "num_input_tokens_seen": 15649992, "step": 23830 }, { "epoch": 14.053655660377359, "grad_norm": 2.007297992706299, "learning_rate": 2.4600031541327173e-06, "loss": 0.3473, "num_input_tokens_seen": 15652936, "step": 23835 }, { "epoch": 14.056603773584905, "grad_norm": 4.967955589294434, "learning_rate": 2.457787464026503e-06, "loss": 0.3986, "num_input_tokens_seen": 15655368, "step": 23840 }, { "epoch": 14.059551886792454, "grad_norm": 3.6894373893737793, "learning_rate": 2.455572446982014e-06, "loss": 0.3511, "num_input_tokens_seen": 15659304, "step": 23845 }, { "epoch": 14.0625, "grad_norm": 4.626992702484131, "learning_rate": 2.453358103585686e-06, "loss": 0.3571, "num_input_tokens_seen": 15662280, "step": 23850 }, { "epoch": 14.065448113207546, "grad_norm": 3.048856019973755, "learning_rate": 2.4511444344237733e-06, "loss": 0.2783, "num_input_tokens_seen": 15665608, "step": 23855 }, { "epoch": 14.068396226415095, "grad_norm": 4.836048126220703, "learning_rate": 2.4489314400823567e-06, "loss": 0.4053, "num_input_tokens_seen": 15668904, "step": 23860 }, { "epoch": 14.071344339622641, "grad_norm": 4.5277814865112305, "learning_rate": 2.446719121147337e-06, "loss": 0.5071, "num_input_tokens_seen": 15675560, "step": 23865 }, { "epoch": 14.07429245283019, "grad_norm": 3.047391414642334, "learning_rate": 2.4445074782044347e-06, "loss": 0.4297, "num_input_tokens_seen": 15678760, "step": 23870 }, { "epoch": 14.077240566037736, "grad_norm": 4.169650554656982, "learning_rate": 2.442296511839191e-06, "loss": 0.4225, "num_input_tokens_seen": 15681896, "step": 23875 }, { "epoch": 14.080188679245284, "grad_norm": 6.022006511688232, "learning_rate": 2.4400862226369687e-06, "loss": 0.2888, "num_input_tokens_seen": 15684872, "step": 23880 }, { "epoch": 14.08313679245283, "grad_norm": 3.986882209777832, "learning_rate": 2.4378766111829514e-06, "loss": 0.3054, "num_input_tokens_seen": 15688616, "step": 23885 }, { "epoch": 14.086084905660377, "grad_norm": 2.329045057296753, "learning_rate": 2.435667678062142e-06, "loss": 0.373, "num_input_tokens_seen": 15691880, "step": 23890 }, { "epoch": 14.089033018867925, "grad_norm": 3.6797525882720947, "learning_rate": 2.4334594238593682e-06, "loss": 0.346, "num_input_tokens_seen": 15695208, "step": 23895 }, { "epoch": 14.091981132075471, "grad_norm": 3.21769642829895, "learning_rate": 2.4312518491592727e-06, "loss": 0.4226, "num_input_tokens_seen": 15698056, "step": 23900 }, { "epoch": 14.09492924528302, "grad_norm": 2.8482863903045654, "learning_rate": 2.429044954546322e-06, "loss": 0.2696, "num_input_tokens_seen": 15701096, "step": 23905 }, { "epoch": 14.097877358490566, "grad_norm": 1.8938056230545044, "learning_rate": 2.426838740604799e-06, "loss": 0.3109, "num_input_tokens_seen": 15704584, "step": 23910 }, { "epoch": 14.100825471698114, "grad_norm": 2.8213324546813965, "learning_rate": 2.4246332079188066e-06, "loss": 0.3654, "num_input_tokens_seen": 15709064, "step": 23915 }, { "epoch": 14.10377358490566, "grad_norm": 3.8999288082122803, "learning_rate": 2.4224283570722745e-06, "loss": 0.487, "num_input_tokens_seen": 15712648, "step": 23920 }, { "epoch": 14.106721698113208, "grad_norm": 2.373804807662964, "learning_rate": 2.420224188648943e-06, "loss": 0.3044, "num_input_tokens_seen": 15715784, "step": 23925 }, { "epoch": 14.109669811320755, "grad_norm": 3.157949447631836, "learning_rate": 2.418020703232376e-06, "loss": 0.2874, "num_input_tokens_seen": 15718824, "step": 23930 }, { "epoch": 14.112617924528301, "grad_norm": 2.7895305156707764, "learning_rate": 2.4158179014059556e-06, "loss": 0.3526, "num_input_tokens_seen": 15721768, "step": 23935 }, { "epoch": 14.11556603773585, "grad_norm": 8.417078018188477, "learning_rate": 2.413615783752883e-06, "loss": 0.4151, "num_input_tokens_seen": 15724840, "step": 23940 }, { "epoch": 14.118514150943396, "grad_norm": 2.9882283210754395, "learning_rate": 2.4114143508561767e-06, "loss": 0.254, "num_input_tokens_seen": 15728072, "step": 23945 }, { "epoch": 14.121462264150944, "grad_norm": 3.280252456665039, "learning_rate": 2.4092136032986783e-06, "loss": 0.2925, "num_input_tokens_seen": 15732136, "step": 23950 }, { "epoch": 14.12441037735849, "grad_norm": 4.2337870597839355, "learning_rate": 2.407013541663043e-06, "loss": 0.3156, "num_input_tokens_seen": 15736104, "step": 23955 }, { "epoch": 14.127358490566039, "grad_norm": 3.136699914932251, "learning_rate": 2.4048141665317493e-06, "loss": 0.3002, "num_input_tokens_seen": 15739656, "step": 23960 }, { "epoch": 14.130306603773585, "grad_norm": 3.349000930786133, "learning_rate": 2.40261547848709e-06, "loss": 0.288, "num_input_tokens_seen": 15743144, "step": 23965 }, { "epoch": 14.133254716981131, "grad_norm": 2.1197023391723633, "learning_rate": 2.400417478111176e-06, "loss": 0.2122, "num_input_tokens_seen": 15745800, "step": 23970 }, { "epoch": 14.13620283018868, "grad_norm": 3.0013577938079834, "learning_rate": 2.3982201659859387e-06, "loss": 0.3819, "num_input_tokens_seen": 15749384, "step": 23975 }, { "epoch": 14.139150943396226, "grad_norm": 5.340903282165527, "learning_rate": 2.3960235426931237e-06, "loss": 0.4202, "num_input_tokens_seen": 15752200, "step": 23980 }, { "epoch": 14.142099056603774, "grad_norm": 3.5452046394348145, "learning_rate": 2.3938276088143003e-06, "loss": 0.3569, "num_input_tokens_seen": 15755176, "step": 23985 }, { "epoch": 14.14504716981132, "grad_norm": 8.251919746398926, "learning_rate": 2.391632364930849e-06, "loss": 0.4204, "num_input_tokens_seen": 15758024, "step": 23990 }, { "epoch": 14.147995283018869, "grad_norm": 2.5455310344696045, "learning_rate": 2.3894378116239706e-06, "loss": 0.3008, "num_input_tokens_seen": 15762504, "step": 23995 }, { "epoch": 14.150943396226415, "grad_norm": 3.355330228805542, "learning_rate": 2.387243949474683e-06, "loss": 0.2899, "num_input_tokens_seen": 15766248, "step": 24000 }, { "epoch": 14.153891509433961, "grad_norm": 2.3730857372283936, "learning_rate": 2.38505077906382e-06, "loss": 0.2068, "num_input_tokens_seen": 15769480, "step": 24005 }, { "epoch": 14.15683962264151, "grad_norm": 3.458019256591797, "learning_rate": 2.382858300972031e-06, "loss": 0.3076, "num_input_tokens_seen": 15771848, "step": 24010 }, { "epoch": 14.159787735849056, "grad_norm": 4.476145267486572, "learning_rate": 2.380666515779788e-06, "loss": 0.4083, "num_input_tokens_seen": 15774760, "step": 24015 }, { "epoch": 14.162735849056604, "grad_norm": 2.196066379547119, "learning_rate": 2.3784754240673734e-06, "loss": 0.3774, "num_input_tokens_seen": 15778504, "step": 24020 }, { "epoch": 14.16568396226415, "grad_norm": 2.684271812438965, "learning_rate": 2.3762850264148883e-06, "loss": 0.3457, "num_input_tokens_seen": 15782216, "step": 24025 }, { "epoch": 14.168632075471699, "grad_norm": 3.031079053878784, "learning_rate": 2.374095323402251e-06, "loss": 0.3756, "num_input_tokens_seen": 15784840, "step": 24030 }, { "epoch": 14.171580188679245, "grad_norm": 4.475839138031006, "learning_rate": 2.371906315609193e-06, "loss": 0.3228, "num_input_tokens_seen": 15788136, "step": 24035 }, { "epoch": 14.174528301886792, "grad_norm": 2.7936487197875977, "learning_rate": 2.369718003615263e-06, "loss": 0.3121, "num_input_tokens_seen": 15792072, "step": 24040 }, { "epoch": 14.17747641509434, "grad_norm": 2.1514484882354736, "learning_rate": 2.3675303879998284e-06, "loss": 0.3075, "num_input_tokens_seen": 15796424, "step": 24045 }, { "epoch": 14.180424528301886, "grad_norm": 3.899038553237915, "learning_rate": 2.365343469342068e-06, "loss": 0.3437, "num_input_tokens_seen": 15799112, "step": 24050 }, { "epoch": 14.183372641509434, "grad_norm": 4.551481246948242, "learning_rate": 2.3631572482209803e-06, "loss": 0.3417, "num_input_tokens_seen": 15803272, "step": 24055 }, { "epoch": 14.18632075471698, "grad_norm": 3.3130908012390137, "learning_rate": 2.3609717252153752e-06, "loss": 0.3376, "num_input_tokens_seen": 15806536, "step": 24060 }, { "epoch": 14.189268867924529, "grad_norm": 2.7285304069519043, "learning_rate": 2.35878690090388e-06, "loss": 0.4612, "num_input_tokens_seen": 15809992, "step": 24065 }, { "epoch": 14.192216981132075, "grad_norm": 2.186633586883545, "learning_rate": 2.356602775864935e-06, "loss": 0.4093, "num_input_tokens_seen": 15813320, "step": 24070 }, { "epoch": 14.195165094339623, "grad_norm": 2.7063992023468018, "learning_rate": 2.354419350676796e-06, "loss": 0.36, "num_input_tokens_seen": 15817832, "step": 24075 }, { "epoch": 14.19811320754717, "grad_norm": 3.0751407146453857, "learning_rate": 2.3522366259175377e-06, "loss": 0.3675, "num_input_tokens_seen": 15820840, "step": 24080 }, { "epoch": 14.201061320754716, "grad_norm": 2.2038447856903076, "learning_rate": 2.350054602165044e-06, "loss": 0.3564, "num_input_tokens_seen": 15823112, "step": 24085 }, { "epoch": 14.204009433962264, "grad_norm": 3.6187403202056885, "learning_rate": 2.3478732799970143e-06, "loss": 0.152, "num_input_tokens_seen": 15826568, "step": 24090 }, { "epoch": 14.20695754716981, "grad_norm": 6.097751140594482, "learning_rate": 2.3456926599909646e-06, "loss": 0.3079, "num_input_tokens_seen": 15829544, "step": 24095 }, { "epoch": 14.209905660377359, "grad_norm": 2.905355930328369, "learning_rate": 2.343512742724222e-06, "loss": 0.2345, "num_input_tokens_seen": 15833096, "step": 24100 }, { "epoch": 14.212853773584905, "grad_norm": 3.2548563480377197, "learning_rate": 2.341333528773928e-06, "loss": 0.5848, "num_input_tokens_seen": 15836488, "step": 24105 }, { "epoch": 14.215801886792454, "grad_norm": 3.3227896690368652, "learning_rate": 2.3391550187170427e-06, "loss": 0.2951, "num_input_tokens_seen": 15839400, "step": 24110 }, { "epoch": 14.21875, "grad_norm": 2.7185373306274414, "learning_rate": 2.336977213130333e-06, "loss": 0.4903, "num_input_tokens_seen": 15842376, "step": 24115 }, { "epoch": 14.221698113207546, "grad_norm": 3.667579412460327, "learning_rate": 2.3348001125903837e-06, "loss": 0.2983, "num_input_tokens_seen": 15845384, "step": 24120 }, { "epoch": 14.224646226415095, "grad_norm": 3.919628858566284, "learning_rate": 2.3326237176735905e-06, "loss": 0.4162, "num_input_tokens_seen": 15848872, "step": 24125 }, { "epoch": 14.227594339622641, "grad_norm": 2.8992667198181152, "learning_rate": 2.330448028956164e-06, "loss": 0.2214, "num_input_tokens_seen": 15851816, "step": 24130 }, { "epoch": 14.23054245283019, "grad_norm": 3.198714256286621, "learning_rate": 2.3282730470141255e-06, "loss": 0.2898, "num_input_tokens_seen": 15858728, "step": 24135 }, { "epoch": 14.233490566037736, "grad_norm": 2.8777801990509033, "learning_rate": 2.3260987724233143e-06, "loss": 0.36, "num_input_tokens_seen": 15862216, "step": 24140 }, { "epoch": 14.236438679245284, "grad_norm": 7.448873519897461, "learning_rate": 2.323925205759374e-06, "loss": 0.3814, "num_input_tokens_seen": 15866408, "step": 24145 }, { "epoch": 14.23938679245283, "grad_norm": 4.194307327270508, "learning_rate": 2.3217523475977715e-06, "loss": 0.3577, "num_input_tokens_seen": 15869160, "step": 24150 }, { "epoch": 14.242334905660377, "grad_norm": 2.1203720569610596, "learning_rate": 2.3195801985137773e-06, "loss": 0.3571, "num_input_tokens_seen": 15873128, "step": 24155 }, { "epoch": 14.245283018867925, "grad_norm": 3.607631206512451, "learning_rate": 2.317408759082478e-06, "loss": 0.3341, "num_input_tokens_seen": 15876008, "step": 24160 }, { "epoch": 14.248231132075471, "grad_norm": 3.1893556118011475, "learning_rate": 2.31523802987877e-06, "loss": 0.4424, "num_input_tokens_seen": 15878664, "step": 24165 }, { "epoch": 14.25117924528302, "grad_norm": 2.931027412414551, "learning_rate": 2.3130680114773637e-06, "loss": 0.3406, "num_input_tokens_seen": 15881064, "step": 24170 }, { "epoch": 14.254127358490566, "grad_norm": 6.859006881713867, "learning_rate": 2.310898704452782e-06, "loss": 0.4106, "num_input_tokens_seen": 15883528, "step": 24175 }, { "epoch": 14.257075471698114, "grad_norm": 5.29276704788208, "learning_rate": 2.3087301093793584e-06, "loss": 0.2263, "num_input_tokens_seen": 15885768, "step": 24180 }, { "epoch": 14.26002358490566, "grad_norm": 3.721316337585449, "learning_rate": 2.306562226831237e-06, "loss": 0.2771, "num_input_tokens_seen": 15888584, "step": 24185 }, { "epoch": 14.262971698113208, "grad_norm": 2.77593731880188, "learning_rate": 2.304395057382374e-06, "loss": 0.4258, "num_input_tokens_seen": 15891720, "step": 24190 }, { "epoch": 14.265919811320755, "grad_norm": 2.9346604347229004, "learning_rate": 2.3022286016065354e-06, "loss": 0.4272, "num_input_tokens_seen": 15894632, "step": 24195 }, { "epoch": 14.268867924528301, "grad_norm": 3.0805141925811768, "learning_rate": 2.300062860077303e-06, "loss": 0.3874, "num_input_tokens_seen": 15897416, "step": 24200 }, { "epoch": 14.27181603773585, "grad_norm": 3.5646286010742188, "learning_rate": 2.297897833368064e-06, "loss": 0.4548, "num_input_tokens_seen": 15900872, "step": 24205 }, { "epoch": 14.274764150943396, "grad_norm": 3.4275074005126953, "learning_rate": 2.2957335220520194e-06, "loss": 0.3595, "num_input_tokens_seen": 15904232, "step": 24210 }, { "epoch": 14.277712264150944, "grad_norm": 4.621788024902344, "learning_rate": 2.293569926702179e-06, "loss": 0.3502, "num_input_tokens_seen": 15908264, "step": 24215 }, { "epoch": 14.28066037735849, "grad_norm": 2.9551568031311035, "learning_rate": 2.291407047891366e-06, "loss": 0.5482, "num_input_tokens_seen": 15910824, "step": 24220 }, { "epoch": 14.283608490566039, "grad_norm": 6.268513202667236, "learning_rate": 2.2892448861922075e-06, "loss": 0.2978, "num_input_tokens_seen": 15913544, "step": 24225 }, { "epoch": 14.286556603773585, "grad_norm": 3.243342876434326, "learning_rate": 2.2870834421771505e-06, "loss": 0.3359, "num_input_tokens_seen": 15915976, "step": 24230 }, { "epoch": 14.289504716981131, "grad_norm": 3.4481101036071777, "learning_rate": 2.2849227164184433e-06, "loss": 0.3572, "num_input_tokens_seen": 15919496, "step": 24235 }, { "epoch": 14.29245283018868, "grad_norm": 2.4060888290405273, "learning_rate": 2.2827627094881473e-06, "loss": 0.311, "num_input_tokens_seen": 15922536, "step": 24240 }, { "epoch": 14.295400943396226, "grad_norm": 4.076788902282715, "learning_rate": 2.2806034219581364e-06, "loss": 0.2901, "num_input_tokens_seen": 15925160, "step": 24245 }, { "epoch": 14.298349056603774, "grad_norm": 8.99624252319336, "learning_rate": 2.278444854400089e-06, "loss": 0.3937, "num_input_tokens_seen": 15928424, "step": 24250 }, { "epoch": 14.30129716981132, "grad_norm": 3.1592373847961426, "learning_rate": 2.276287007385496e-06, "loss": 0.3239, "num_input_tokens_seen": 15932360, "step": 24255 }, { "epoch": 14.304245283018869, "grad_norm": 3.7048544883728027, "learning_rate": 2.2741298814856542e-06, "loss": 0.3621, "num_input_tokens_seen": 15935144, "step": 24260 }, { "epoch": 14.307193396226415, "grad_norm": 3.6841163635253906, "learning_rate": 2.2719734772716763e-06, "loss": 0.3411, "num_input_tokens_seen": 15937288, "step": 24265 }, { "epoch": 14.310141509433961, "grad_norm": 5.7026543617248535, "learning_rate": 2.269817795314477e-06, "loss": 0.3894, "num_input_tokens_seen": 15941224, "step": 24270 }, { "epoch": 14.31308962264151, "grad_norm": 3.7525033950805664, "learning_rate": 2.2676628361847834e-06, "loss": 0.4188, "num_input_tokens_seen": 15944264, "step": 24275 }, { "epoch": 14.316037735849056, "grad_norm": 4.183028697967529, "learning_rate": 2.2655086004531296e-06, "loss": 0.4372, "num_input_tokens_seen": 15946952, "step": 24280 }, { "epoch": 14.318985849056604, "grad_norm": 2.461275339126587, "learning_rate": 2.2633550886898583e-06, "loss": 0.3269, "num_input_tokens_seen": 15951144, "step": 24285 }, { "epoch": 14.32193396226415, "grad_norm": 9.285680770874023, "learning_rate": 2.26120230146512e-06, "loss": 0.3893, "num_input_tokens_seen": 15954152, "step": 24290 }, { "epoch": 14.324882075471699, "grad_norm": 2.4888484477996826, "learning_rate": 2.2590502393488777e-06, "loss": 0.3555, "num_input_tokens_seen": 15957032, "step": 24295 }, { "epoch": 14.327830188679245, "grad_norm": 2.4050779342651367, "learning_rate": 2.256898902910898e-06, "loss": 0.3318, "num_input_tokens_seen": 15961032, "step": 24300 }, { "epoch": 14.330778301886792, "grad_norm": 3.048614025115967, "learning_rate": 2.2547482927207548e-06, "loss": 0.321, "num_input_tokens_seen": 15964072, "step": 24305 }, { "epoch": 14.33372641509434, "grad_norm": 2.482595920562744, "learning_rate": 2.252598409347833e-06, "loss": 0.3202, "num_input_tokens_seen": 15967048, "step": 24310 }, { "epoch": 14.336674528301886, "grad_norm": 13.337950706481934, "learning_rate": 2.250449253361323e-06, "loss": 0.3967, "num_input_tokens_seen": 15969768, "step": 24315 }, { "epoch": 14.339622641509434, "grad_norm": 6.053839206695557, "learning_rate": 2.2483008253302214e-06, "loss": 0.2726, "num_input_tokens_seen": 15971912, "step": 24320 }, { "epoch": 14.34257075471698, "grad_norm": 5.154048919677734, "learning_rate": 2.246153125823337e-06, "loss": 0.2831, "num_input_tokens_seen": 15974600, "step": 24325 }, { "epoch": 14.345518867924529, "grad_norm": 3.7169528007507324, "learning_rate": 2.2440061554092813e-06, "loss": 0.2875, "num_input_tokens_seen": 15976968, "step": 24330 }, { "epoch": 14.348466981132075, "grad_norm": 3.246070384979248, "learning_rate": 2.2418599146564714e-06, "loss": 0.2354, "num_input_tokens_seen": 15979432, "step": 24335 }, { "epoch": 14.351415094339623, "grad_norm": 3.84919810295105, "learning_rate": 2.239714404133138e-06, "loss": 0.3383, "num_input_tokens_seen": 15982216, "step": 24340 }, { "epoch": 14.35436320754717, "grad_norm": 2.3941667079925537, "learning_rate": 2.2375696244073126e-06, "loss": 0.3595, "num_input_tokens_seen": 15986248, "step": 24345 }, { "epoch": 14.357311320754716, "grad_norm": 2.3149969577789307, "learning_rate": 2.235425576046834e-06, "loss": 0.2759, "num_input_tokens_seen": 15989928, "step": 24350 }, { "epoch": 14.360259433962264, "grad_norm": 3.8455440998077393, "learning_rate": 2.233282259619347e-06, "loss": 0.3351, "num_input_tokens_seen": 15993672, "step": 24355 }, { "epoch": 14.36320754716981, "grad_norm": 4.790022373199463, "learning_rate": 2.231139675692308e-06, "loss": 0.2986, "num_input_tokens_seen": 15996328, "step": 24360 }, { "epoch": 14.366155660377359, "grad_norm": 2.836423397064209, "learning_rate": 2.228997824832973e-06, "loss": 0.3821, "num_input_tokens_seen": 16001320, "step": 24365 }, { "epoch": 14.369103773584905, "grad_norm": 1.8952630758285522, "learning_rate": 2.226856707608406e-06, "loss": 0.3462, "num_input_tokens_seen": 16005480, "step": 24370 }, { "epoch": 14.372051886792454, "grad_norm": 3.3338842391967773, "learning_rate": 2.2247163245854768e-06, "loss": 0.3175, "num_input_tokens_seen": 16008584, "step": 24375 }, { "epoch": 14.375, "grad_norm": 4.059430122375488, "learning_rate": 2.222576676330862e-06, "loss": 0.3484, "num_input_tokens_seen": 16011048, "step": 24380 }, { "epoch": 14.377948113207546, "grad_norm": 5.59852409362793, "learning_rate": 2.2204377634110403e-06, "loss": 0.3809, "num_input_tokens_seen": 16014344, "step": 24385 }, { "epoch": 14.380896226415095, "grad_norm": 3.6498188972473145, "learning_rate": 2.218299586392301e-06, "loss": 0.347, "num_input_tokens_seen": 16017224, "step": 24390 }, { "epoch": 14.383844339622641, "grad_norm": 5.584791660308838, "learning_rate": 2.2161621458407355e-06, "loss": 0.3297, "num_input_tokens_seen": 16020264, "step": 24395 }, { "epoch": 14.38679245283019, "grad_norm": 3.3129053115844727, "learning_rate": 2.2140254423222398e-06, "loss": 0.4174, "num_input_tokens_seen": 16023720, "step": 24400 }, { "epoch": 14.389740566037736, "grad_norm": 2.209779977798462, "learning_rate": 2.2118894764025146e-06, "loss": 0.3531, "num_input_tokens_seen": 16027304, "step": 24405 }, { "epoch": 14.392688679245284, "grad_norm": 2.971938371658325, "learning_rate": 2.2097542486470667e-06, "loss": 0.3567, "num_input_tokens_seen": 16030344, "step": 24410 }, { "epoch": 14.39563679245283, "grad_norm": 5.21639347076416, "learning_rate": 2.207619759621205e-06, "loss": 0.3738, "num_input_tokens_seen": 16033224, "step": 24415 }, { "epoch": 14.398584905660377, "grad_norm": 1.9810161590576172, "learning_rate": 2.205486009890049e-06, "loss": 0.3332, "num_input_tokens_seen": 16036840, "step": 24420 }, { "epoch": 14.401533018867925, "grad_norm": 3.662766695022583, "learning_rate": 2.2033530000185146e-06, "loss": 0.3081, "num_input_tokens_seen": 16039720, "step": 24425 }, { "epoch": 14.404481132075471, "grad_norm": 6.553592681884766, "learning_rate": 2.2012207305713244e-06, "loss": 0.2633, "num_input_tokens_seen": 16043304, "step": 24430 }, { "epoch": 14.40742924528302, "grad_norm": 3.8046112060546875, "learning_rate": 2.19908920211301e-06, "loss": 0.4128, "num_input_tokens_seen": 16046568, "step": 24435 }, { "epoch": 14.410377358490566, "grad_norm": 3.954754590988159, "learning_rate": 2.196958415207901e-06, "loss": 0.2106, "num_input_tokens_seen": 16051176, "step": 24440 }, { "epoch": 14.413325471698114, "grad_norm": 1.9548498392105103, "learning_rate": 2.1948283704201312e-06, "loss": 0.3982, "num_input_tokens_seen": 16055912, "step": 24445 }, { "epoch": 14.41627358490566, "grad_norm": 4.567651271820068, "learning_rate": 2.1926990683136383e-06, "loss": 0.3846, "num_input_tokens_seen": 16058632, "step": 24450 }, { "epoch": 14.419221698113208, "grad_norm": 2.587756395339966, "learning_rate": 2.1905705094521685e-06, "loss": 0.3499, "num_input_tokens_seen": 16062376, "step": 24455 }, { "epoch": 14.422169811320755, "grad_norm": 4.779268741607666, "learning_rate": 2.1884426943992635e-06, "loss": 0.4439, "num_input_tokens_seen": 16065096, "step": 24460 }, { "epoch": 14.425117924528301, "grad_norm": 2.5860283374786377, "learning_rate": 2.1863156237182727e-06, "loss": 0.3616, "num_input_tokens_seen": 16067912, "step": 24465 }, { "epoch": 14.42806603773585, "grad_norm": 2.493384838104248, "learning_rate": 2.1841892979723466e-06, "loss": 0.3198, "num_input_tokens_seen": 16071144, "step": 24470 }, { "epoch": 14.431014150943396, "grad_norm": 2.1952872276306152, "learning_rate": 2.1820637177244375e-06, "loss": 0.3132, "num_input_tokens_seen": 16073896, "step": 24475 }, { "epoch": 14.433962264150944, "grad_norm": 8.306375503540039, "learning_rate": 2.179938883537306e-06, "loss": 0.3708, "num_input_tokens_seen": 16076456, "step": 24480 }, { "epoch": 14.43691037735849, "grad_norm": 3.5465474128723145, "learning_rate": 2.177814795973508e-06, "loss": 0.3609, "num_input_tokens_seen": 16079688, "step": 24485 }, { "epoch": 14.439858490566039, "grad_norm": 2.9062345027923584, "learning_rate": 2.1756914555954064e-06, "loss": 0.3043, "num_input_tokens_seen": 16083752, "step": 24490 }, { "epoch": 14.442806603773585, "grad_norm": 5.965517044067383, "learning_rate": 2.173568862965164e-06, "loss": 0.3015, "num_input_tokens_seen": 16086248, "step": 24495 }, { "epoch": 14.445754716981131, "grad_norm": 4.07684850692749, "learning_rate": 2.171447018644746e-06, "loss": 0.3935, "num_input_tokens_seen": 16089768, "step": 24500 }, { "epoch": 14.44870283018868, "grad_norm": 3.8119688034057617, "learning_rate": 2.1693259231959186e-06, "loss": 0.5133, "num_input_tokens_seen": 16093512, "step": 24505 }, { "epoch": 14.451650943396226, "grad_norm": 3.721956729888916, "learning_rate": 2.1672055771802545e-06, "loss": 0.341, "num_input_tokens_seen": 16096744, "step": 24510 }, { "epoch": 14.454599056603774, "grad_norm": 4.190140724182129, "learning_rate": 2.1650859811591224e-06, "loss": 0.3745, "num_input_tokens_seen": 16099976, "step": 24515 }, { "epoch": 14.45754716981132, "grad_norm": 4.3208770751953125, "learning_rate": 2.1629671356936943e-06, "loss": 0.4127, "num_input_tokens_seen": 16102504, "step": 24520 }, { "epoch": 14.460495283018869, "grad_norm": 3.822577476501465, "learning_rate": 2.1608490413449428e-06, "loss": 0.3389, "num_input_tokens_seen": 16105128, "step": 24525 }, { "epoch": 14.463443396226415, "grad_norm": 3.199812889099121, "learning_rate": 2.158731698673645e-06, "loss": 0.3229, "num_input_tokens_seen": 16108744, "step": 24530 }, { "epoch": 14.466391509433961, "grad_norm": 3.282534122467041, "learning_rate": 2.1566151082403752e-06, "loss": 0.3828, "num_input_tokens_seen": 16111496, "step": 24535 }, { "epoch": 14.46933962264151, "grad_norm": 2.013637065887451, "learning_rate": 2.154499270605508e-06, "loss": 0.3327, "num_input_tokens_seen": 16115400, "step": 24540 }, { "epoch": 14.472287735849056, "grad_norm": 3.1889097690582275, "learning_rate": 2.1523841863292243e-06, "loss": 0.331, "num_input_tokens_seen": 16118568, "step": 24545 }, { "epoch": 14.475235849056604, "grad_norm": 5.8994951248168945, "learning_rate": 2.1502698559714998e-06, "loss": 0.3647, "num_input_tokens_seen": 16121224, "step": 24550 }, { "epoch": 14.47818396226415, "grad_norm": 3.2728323936462402, "learning_rate": 2.1481562800921125e-06, "loss": 0.3672, "num_input_tokens_seen": 16124584, "step": 24555 }, { "epoch": 14.481132075471699, "grad_norm": 3.1149160861968994, "learning_rate": 2.146043459250641e-06, "loss": 0.2624, "num_input_tokens_seen": 16127368, "step": 24560 }, { "epoch": 14.484080188679245, "grad_norm": 9.27627182006836, "learning_rate": 2.1439313940064634e-06, "loss": 0.3243, "num_input_tokens_seen": 16131528, "step": 24565 }, { "epoch": 14.487028301886792, "grad_norm": 2.3685193061828613, "learning_rate": 2.141820084918756e-06, "loss": 0.2508, "num_input_tokens_seen": 16134344, "step": 24570 }, { "epoch": 14.48997641509434, "grad_norm": 3.4639830589294434, "learning_rate": 2.1397095325465013e-06, "loss": 0.2928, "num_input_tokens_seen": 16138952, "step": 24575 }, { "epoch": 14.492924528301886, "grad_norm": 1.8057721853256226, "learning_rate": 2.1375997374484754e-06, "loss": 0.2772, "num_input_tokens_seen": 16141384, "step": 24580 }, { "epoch": 14.495872641509434, "grad_norm": 5.3271164894104, "learning_rate": 2.1354907001832546e-06, "loss": 0.3295, "num_input_tokens_seen": 16145032, "step": 24585 }, { "epoch": 14.49882075471698, "grad_norm": 2.833660364151001, "learning_rate": 2.133382421309217e-06, "loss": 0.3233, "num_input_tokens_seen": 16147560, "step": 24590 }, { "epoch": 14.501768867924529, "grad_norm": 2.984715223312378, "learning_rate": 2.131274901384537e-06, "loss": 0.4076, "num_input_tokens_seen": 16150696, "step": 24595 }, { "epoch": 14.504716981132075, "grad_norm": 4.448645114898682, "learning_rate": 2.1291681409671896e-06, "loss": 0.307, "num_input_tokens_seen": 16154504, "step": 24600 }, { "epoch": 14.507665094339622, "grad_norm": 5.286225318908691, "learning_rate": 2.12706214061495e-06, "loss": 0.4148, "num_input_tokens_seen": 16157864, "step": 24605 }, { "epoch": 14.51061320754717, "grad_norm": 4.216075420379639, "learning_rate": 2.124956900885391e-06, "loss": 0.4786, "num_input_tokens_seen": 16161096, "step": 24610 }, { "epoch": 14.513561320754716, "grad_norm": 4.126805782318115, "learning_rate": 2.1228524223358833e-06, "loss": 0.3498, "num_input_tokens_seen": 16164328, "step": 24615 }, { "epoch": 14.516509433962264, "grad_norm": 4.162108898162842, "learning_rate": 2.120748705523595e-06, "loss": 0.3745, "num_input_tokens_seen": 16167688, "step": 24620 }, { "epoch": 14.51945754716981, "grad_norm": 6.243841648101807, "learning_rate": 2.1186457510054976e-06, "loss": 0.3449, "num_input_tokens_seen": 16170440, "step": 24625 }, { "epoch": 14.522405660377359, "grad_norm": 5.044878959655762, "learning_rate": 2.116543559338355e-06, "loss": 0.2889, "num_input_tokens_seen": 16172744, "step": 24630 }, { "epoch": 14.525353773584905, "grad_norm": 3.0747759342193604, "learning_rate": 2.1144421310787305e-06, "loss": 0.2777, "num_input_tokens_seen": 16175656, "step": 24635 }, { "epoch": 14.528301886792454, "grad_norm": 2.53835129737854, "learning_rate": 2.11234146678299e-06, "loss": 0.3464, "num_input_tokens_seen": 16179400, "step": 24640 }, { "epoch": 14.53125, "grad_norm": 3.1400556564331055, "learning_rate": 2.1102415670072907e-06, "loss": 0.2817, "num_input_tokens_seen": 16181864, "step": 24645 }, { "epoch": 14.534198113207546, "grad_norm": 3.1983864307403564, "learning_rate": 2.108142432307591e-06, "loss": 0.3169, "num_input_tokens_seen": 16184552, "step": 24650 }, { "epoch": 14.537146226415095, "grad_norm": 2.128373622894287, "learning_rate": 2.1060440632396456e-06, "loss": 0.2846, "num_input_tokens_seen": 16187688, "step": 24655 }, { "epoch": 14.540094339622641, "grad_norm": 2.4882311820983887, "learning_rate": 2.103946460359007e-06, "loss": 0.3945, "num_input_tokens_seen": 16191112, "step": 24660 }, { "epoch": 14.54304245283019, "grad_norm": 2.1004393100738525, "learning_rate": 2.101849624221022e-06, "loss": 0.2284, "num_input_tokens_seen": 16194760, "step": 24665 }, { "epoch": 14.545990566037736, "grad_norm": 5.237072467803955, "learning_rate": 2.0997535553808417e-06, "loss": 0.2267, "num_input_tokens_seen": 16198856, "step": 24670 }, { "epoch": 14.548938679245284, "grad_norm": 4.167510986328125, "learning_rate": 2.0976582543934064e-06, "loss": 0.4412, "num_input_tokens_seen": 16203176, "step": 24675 }, { "epoch": 14.55188679245283, "grad_norm": 18.159154891967773, "learning_rate": 2.0955637218134573e-06, "loss": 0.4001, "num_input_tokens_seen": 16206472, "step": 24680 }, { "epoch": 14.554834905660378, "grad_norm": 8.440449714660645, "learning_rate": 2.09346995819553e-06, "loss": 0.2747, "num_input_tokens_seen": 16209480, "step": 24685 }, { "epoch": 14.557783018867925, "grad_norm": 3.0341074466705322, "learning_rate": 2.0913769640939553e-06, "loss": 0.2996, "num_input_tokens_seen": 16213576, "step": 24690 }, { "epoch": 14.560731132075471, "grad_norm": 2.9372637271881104, "learning_rate": 2.0892847400628674e-06, "loss": 0.3887, "num_input_tokens_seen": 16216424, "step": 24695 }, { "epoch": 14.56367924528302, "grad_norm": 3.6353986263275146, "learning_rate": 2.0871932866561885e-06, "loss": 0.3825, "num_input_tokens_seen": 16219048, "step": 24700 }, { "epoch": 14.566627358490566, "grad_norm": 3.733618974685669, "learning_rate": 2.0851026044276405e-06, "loss": 0.3478, "num_input_tokens_seen": 16221960, "step": 24705 }, { "epoch": 14.569575471698114, "grad_norm": 3.1698782444000244, "learning_rate": 2.083012693930741e-06, "loss": 0.4725, "num_input_tokens_seen": 16225576, "step": 24710 }, { "epoch": 14.57252358490566, "grad_norm": 2.895906448364258, "learning_rate": 2.0809235557188e-06, "loss": 0.3202, "num_input_tokens_seen": 16228680, "step": 24715 }, { "epoch": 14.575471698113208, "grad_norm": 2.6882452964782715, "learning_rate": 2.0788351903449307e-06, "loss": 0.2269, "num_input_tokens_seen": 16231304, "step": 24720 }, { "epoch": 14.578419811320755, "grad_norm": 3.6293067932128906, "learning_rate": 2.0767475983620317e-06, "loss": 0.2351, "num_input_tokens_seen": 16234600, "step": 24725 }, { "epoch": 14.581367924528301, "grad_norm": 5.562458038330078, "learning_rate": 2.074660780322806e-06, "loss": 0.236, "num_input_tokens_seen": 16237576, "step": 24730 }, { "epoch": 14.58431603773585, "grad_norm": 2.9556171894073486, "learning_rate": 2.0725747367797473e-06, "loss": 0.3305, "num_input_tokens_seen": 16240616, "step": 24735 }, { "epoch": 14.587264150943396, "grad_norm": 8.521834373474121, "learning_rate": 2.070489468285143e-06, "loss": 0.3323, "num_input_tokens_seen": 16244488, "step": 24740 }, { "epoch": 14.590212264150944, "grad_norm": 5.581058979034424, "learning_rate": 2.068404975391077e-06, "loss": 0.2992, "num_input_tokens_seen": 16247976, "step": 24745 }, { "epoch": 14.59316037735849, "grad_norm": 2.604302406311035, "learning_rate": 2.0663212586494293e-06, "loss": 0.3692, "num_input_tokens_seen": 16251336, "step": 24750 }, { "epoch": 14.596108490566039, "grad_norm": 1.8392833471298218, "learning_rate": 2.064238318611869e-06, "loss": 0.3847, "num_input_tokens_seen": 16255048, "step": 24755 }, { "epoch": 14.599056603773585, "grad_norm": 3.3094027042388916, "learning_rate": 2.0621561558298693e-06, "loss": 0.3337, "num_input_tokens_seen": 16257768, "step": 24760 }, { "epoch": 14.602004716981131, "grad_norm": 1.4599881172180176, "learning_rate": 2.0600747708546877e-06, "loss": 0.2908, "num_input_tokens_seen": 16261736, "step": 24765 }, { "epoch": 14.60495283018868, "grad_norm": 4.384059906005859, "learning_rate": 2.0579941642373814e-06, "loss": 0.45, "num_input_tokens_seen": 16264200, "step": 24770 }, { "epoch": 14.607900943396226, "grad_norm": 5.087770938873291, "learning_rate": 2.0559143365287993e-06, "loss": 0.3068, "num_input_tokens_seen": 16266632, "step": 24775 }, { "epoch": 14.610849056603774, "grad_norm": 3.2713775634765625, "learning_rate": 2.0538352882795846e-06, "loss": 0.2914, "num_input_tokens_seen": 16269224, "step": 24780 }, { "epoch": 14.61379716981132, "grad_norm": 3.3091399669647217, "learning_rate": 2.051757020040173e-06, "loss": 0.3108, "num_input_tokens_seen": 16272328, "step": 24785 }, { "epoch": 14.616745283018869, "grad_norm": 3.712570905685425, "learning_rate": 2.0496795323607983e-06, "loss": 0.2487, "num_input_tokens_seen": 16275208, "step": 24790 }, { "epoch": 14.619693396226415, "grad_norm": 7.2578206062316895, "learning_rate": 2.0476028257914825e-06, "loss": 0.3411, "num_input_tokens_seen": 16277896, "step": 24795 }, { "epoch": 14.622641509433961, "grad_norm": 2.501924753189087, "learning_rate": 2.0455269008820433e-06, "loss": 0.3587, "num_input_tokens_seen": 16281032, "step": 24800 }, { "epoch": 14.62558962264151, "grad_norm": 2.847658395767212, "learning_rate": 2.0434517581820893e-06, "loss": 0.4325, "num_input_tokens_seen": 16284392, "step": 24805 }, { "epoch": 14.628537735849056, "grad_norm": 3.629201650619507, "learning_rate": 2.041377398241025e-06, "loss": 0.2961, "num_input_tokens_seen": 16290120, "step": 24810 }, { "epoch": 14.631485849056604, "grad_norm": 2.6484787464141846, "learning_rate": 2.0393038216080433e-06, "loss": 0.388, "num_input_tokens_seen": 16293896, "step": 24815 }, { "epoch": 14.63443396226415, "grad_norm": 3.4122228622436523, "learning_rate": 2.037231028832135e-06, "loss": 0.3363, "num_input_tokens_seen": 16297608, "step": 24820 }, { "epoch": 14.637382075471699, "grad_norm": 3.1886301040649414, "learning_rate": 2.0351590204620823e-06, "loss": 0.3994, "num_input_tokens_seen": 16300968, "step": 24825 }, { "epoch": 14.640330188679245, "grad_norm": 5.027430057525635, "learning_rate": 2.033087797046457e-06, "loss": 0.3603, "num_input_tokens_seen": 16305192, "step": 24830 }, { "epoch": 14.643278301886792, "grad_norm": 2.503948450088501, "learning_rate": 2.031017359133624e-06, "loss": 0.3653, "num_input_tokens_seen": 16308424, "step": 24835 }, { "epoch": 14.64622641509434, "grad_norm": 3.419942617416382, "learning_rate": 2.0289477072717406e-06, "loss": 0.4338, "num_input_tokens_seen": 16312520, "step": 24840 }, { "epoch": 14.649174528301886, "grad_norm": 4.597949504852295, "learning_rate": 2.026878842008756e-06, "loss": 0.3722, "num_input_tokens_seen": 16315432, "step": 24845 }, { "epoch": 14.652122641509434, "grad_norm": 3.5508241653442383, "learning_rate": 2.0248107638924105e-06, "loss": 0.313, "num_input_tokens_seen": 16318664, "step": 24850 }, { "epoch": 14.65507075471698, "grad_norm": 2.763918876647949, "learning_rate": 2.0227434734702386e-06, "loss": 0.3901, "num_input_tokens_seen": 16321640, "step": 24855 }, { "epoch": 14.658018867924529, "grad_norm": 2.8822779655456543, "learning_rate": 2.020676971289563e-06, "loss": 0.3321, "num_input_tokens_seen": 16324488, "step": 24860 }, { "epoch": 14.660966981132075, "grad_norm": 1.867860198020935, "learning_rate": 2.0186112578975005e-06, "loss": 0.2701, "num_input_tokens_seen": 16327944, "step": 24865 }, { "epoch": 14.663915094339622, "grad_norm": 6.1324639320373535, "learning_rate": 2.016546333840956e-06, "loss": 0.3844, "num_input_tokens_seen": 16330472, "step": 24870 }, { "epoch": 14.66686320754717, "grad_norm": 2.6301021575927734, "learning_rate": 2.014482199666627e-06, "loss": 0.2706, "num_input_tokens_seen": 16333960, "step": 24875 }, { "epoch": 14.669811320754716, "grad_norm": 1.9802969694137573, "learning_rate": 2.0124188559210017e-06, "loss": 0.2987, "num_input_tokens_seen": 16336968, "step": 24880 }, { "epoch": 14.672759433962264, "grad_norm": 3.1904897689819336, "learning_rate": 2.0103563031503613e-06, "loss": 0.2607, "num_input_tokens_seen": 16339368, "step": 24885 }, { "epoch": 14.67570754716981, "grad_norm": 4.573526859283447, "learning_rate": 2.0082945419007745e-06, "loss": 0.2033, "num_input_tokens_seen": 16342056, "step": 24890 }, { "epoch": 14.678655660377359, "grad_norm": 2.1927220821380615, "learning_rate": 2.0062335727181007e-06, "loss": 0.2484, "num_input_tokens_seen": 16344776, "step": 24895 }, { "epoch": 14.681603773584905, "grad_norm": 4.081036567687988, "learning_rate": 2.004173396147992e-06, "loss": 0.3597, "num_input_tokens_seen": 16347944, "step": 24900 }, { "epoch": 14.684551886792454, "grad_norm": 4.41820764541626, "learning_rate": 2.0021140127358873e-06, "loss": 0.3088, "num_input_tokens_seen": 16352104, "step": 24905 }, { "epoch": 14.6875, "grad_norm": 4.6376447677612305, "learning_rate": 2.0000554230270164e-06, "loss": 0.4204, "num_input_tokens_seen": 16354824, "step": 24910 }, { "epoch": 14.690448113207546, "grad_norm": 2.7834999561309814, "learning_rate": 1.997997627566401e-06, "loss": 0.3552, "num_input_tokens_seen": 16357928, "step": 24915 }, { "epoch": 14.693396226415095, "grad_norm": 3.0203869342803955, "learning_rate": 1.9959406268988536e-06, "loss": 0.3187, "num_input_tokens_seen": 16361576, "step": 24920 }, { "epoch": 14.696344339622641, "grad_norm": 3.0216755867004395, "learning_rate": 1.9938844215689717e-06, "loss": 0.3877, "num_input_tokens_seen": 16364744, "step": 24925 }, { "epoch": 14.69929245283019, "grad_norm": 3.5914957523345947, "learning_rate": 1.991829012121145e-06, "loss": 0.3289, "num_input_tokens_seen": 16367784, "step": 24930 }, { "epoch": 14.702240566037736, "grad_norm": 2.7214019298553467, "learning_rate": 1.989774399099552e-06, "loss": 0.3349, "num_input_tokens_seen": 16371496, "step": 24935 }, { "epoch": 14.705188679245284, "grad_norm": 3.9025142192840576, "learning_rate": 1.98772058304816e-06, "loss": 0.3983, "num_input_tokens_seen": 16374408, "step": 24940 }, { "epoch": 14.70813679245283, "grad_norm": 6.809302806854248, "learning_rate": 1.9856675645107244e-06, "loss": 0.3058, "num_input_tokens_seen": 16378152, "step": 24945 }, { "epoch": 14.711084905660378, "grad_norm": 2.440298080444336, "learning_rate": 1.9836153440307936e-06, "loss": 0.376, "num_input_tokens_seen": 16381640, "step": 24950 }, { "epoch": 14.714033018867925, "grad_norm": 2.5643014907836914, "learning_rate": 1.9815639221517002e-06, "loss": 0.3253, "num_input_tokens_seen": 16384616, "step": 24955 }, { "epoch": 14.716981132075471, "grad_norm": 2.826843023300171, "learning_rate": 1.9795132994165673e-06, "loss": 0.2397, "num_input_tokens_seen": 16388392, "step": 24960 }, { "epoch": 14.71992924528302, "grad_norm": 3.387916088104248, "learning_rate": 1.977463476368306e-06, "loss": 0.4153, "num_input_tokens_seen": 16392392, "step": 24965 }, { "epoch": 14.722877358490566, "grad_norm": 3.211857557296753, "learning_rate": 1.975414453549614e-06, "loss": 0.4287, "num_input_tokens_seen": 16395432, "step": 24970 }, { "epoch": 14.725825471698114, "grad_norm": 2.252070665359497, "learning_rate": 1.9733662315029826e-06, "loss": 0.333, "num_input_tokens_seen": 16398472, "step": 24975 }, { "epoch": 14.72877358490566, "grad_norm": 3.6192173957824707, "learning_rate": 1.9713188107706856e-06, "loss": 0.3171, "num_input_tokens_seen": 16402344, "step": 24980 }, { "epoch": 14.731721698113208, "grad_norm": 4.957299709320068, "learning_rate": 1.969272191894786e-06, "loss": 0.3107, "num_input_tokens_seen": 16405128, "step": 24985 }, { "epoch": 14.734669811320755, "grad_norm": 3.8053746223449707, "learning_rate": 1.967226375417135e-06, "loss": 0.3639, "num_input_tokens_seen": 16407560, "step": 24990 }, { "epoch": 14.737617924528301, "grad_norm": 2.9983530044555664, "learning_rate": 1.965181361879372e-06, "loss": 0.3329, "num_input_tokens_seen": 16412616, "step": 24995 }, { "epoch": 14.74056603773585, "grad_norm": 3.659153938293457, "learning_rate": 1.9631371518229214e-06, "loss": 0.3674, "num_input_tokens_seen": 16415752, "step": 25000 }, { "epoch": 14.743514150943396, "grad_norm": 3.977548599243164, "learning_rate": 1.9610937457889975e-06, "loss": 0.469, "num_input_tokens_seen": 16419272, "step": 25005 }, { "epoch": 14.746462264150944, "grad_norm": 4.400087833404541, "learning_rate": 1.9590511443186032e-06, "loss": 0.4285, "num_input_tokens_seen": 16422344, "step": 25010 }, { "epoch": 14.74941037735849, "grad_norm": 1.94951331615448, "learning_rate": 1.9570093479525243e-06, "loss": 0.3275, "num_input_tokens_seen": 16425672, "step": 25015 }, { "epoch": 14.752358490566039, "grad_norm": 3.5136849880218506, "learning_rate": 1.954968357231335e-06, "loss": 0.3933, "num_input_tokens_seen": 16428904, "step": 25020 }, { "epoch": 14.755306603773585, "grad_norm": 2.3590848445892334, "learning_rate": 1.9529281726953964e-06, "loss": 0.3916, "num_input_tokens_seen": 16431976, "step": 25025 }, { "epoch": 14.758254716981131, "grad_norm": 2.841110944747925, "learning_rate": 1.9508887948848564e-06, "loss": 0.3025, "num_input_tokens_seen": 16434792, "step": 25030 }, { "epoch": 14.76120283018868, "grad_norm": 3.780902147293091, "learning_rate": 1.9488502243396475e-06, "loss": 0.266, "num_input_tokens_seen": 16437288, "step": 25035 }, { "epoch": 14.764150943396226, "grad_norm": 2.708383798599243, "learning_rate": 1.946812461599492e-06, "loss": 0.2272, "num_input_tokens_seen": 16441160, "step": 25040 }, { "epoch": 14.767099056603774, "grad_norm": 3.674901247024536, "learning_rate": 1.944775507203897e-06, "loss": 0.2813, "num_input_tokens_seen": 16444008, "step": 25045 }, { "epoch": 14.77004716981132, "grad_norm": 3.306471824645996, "learning_rate": 1.942739361692153e-06, "loss": 0.3205, "num_input_tokens_seen": 16447656, "step": 25050 }, { "epoch": 14.772995283018869, "grad_norm": 4.022496223449707, "learning_rate": 1.94070402560334e-06, "loss": 0.2205, "num_input_tokens_seen": 16452808, "step": 25055 }, { "epoch": 14.775943396226415, "grad_norm": 3.0671069622039795, "learning_rate": 1.93866949947632e-06, "loss": 0.3291, "num_input_tokens_seen": 16456488, "step": 25060 }, { "epoch": 14.778891509433961, "grad_norm": 6.4838972091674805, "learning_rate": 1.9366357838497423e-06, "loss": 0.3259, "num_input_tokens_seen": 16460936, "step": 25065 }, { "epoch": 14.78183962264151, "grad_norm": 2.4065980911254883, "learning_rate": 1.9346028792620454e-06, "loss": 0.2834, "num_input_tokens_seen": 16463400, "step": 25070 }, { "epoch": 14.784787735849056, "grad_norm": 5.359405040740967, "learning_rate": 1.9325707862514464e-06, "loss": 0.3996, "num_input_tokens_seen": 16466216, "step": 25075 }, { "epoch": 14.787735849056604, "grad_norm": 4.634647369384766, "learning_rate": 1.930539505355952e-06, "loss": 0.4838, "num_input_tokens_seen": 16469416, "step": 25080 }, { "epoch": 14.79068396226415, "grad_norm": 2.4687275886535645, "learning_rate": 1.9285090371133524e-06, "loss": 0.4449, "num_input_tokens_seen": 16472456, "step": 25085 }, { "epoch": 14.793632075471699, "grad_norm": 2.981765031814575, "learning_rate": 1.9264793820612228e-06, "loss": 0.4919, "num_input_tokens_seen": 16475432, "step": 25090 }, { "epoch": 14.796580188679245, "grad_norm": 3.9234464168548584, "learning_rate": 1.924450540736921e-06, "loss": 0.4919, "num_input_tokens_seen": 16478792, "step": 25095 }, { "epoch": 14.799528301886792, "grad_norm": 3.0135157108306885, "learning_rate": 1.922422513677593e-06, "loss": 0.2573, "num_input_tokens_seen": 16481704, "step": 25100 }, { "epoch": 14.80247641509434, "grad_norm": 4.53028678894043, "learning_rate": 1.9203953014201703e-06, "loss": 0.3591, "num_input_tokens_seen": 16484392, "step": 25105 }, { "epoch": 14.805424528301886, "grad_norm": 3.6948068141937256, "learning_rate": 1.918368904501364e-06, "loss": 0.3677, "num_input_tokens_seen": 16486952, "step": 25110 }, { "epoch": 14.808372641509434, "grad_norm": 4.126126766204834, "learning_rate": 1.9163433234576713e-06, "loss": 0.3991, "num_input_tokens_seen": 16490056, "step": 25115 }, { "epoch": 14.81132075471698, "grad_norm": 5.317174911499023, "learning_rate": 1.9143185588253733e-06, "loss": 0.3187, "num_input_tokens_seen": 16493096, "step": 25120 }, { "epoch": 14.814268867924529, "grad_norm": 2.4055514335632324, "learning_rate": 1.9122946111405354e-06, "loss": 0.3774, "num_input_tokens_seen": 16496840, "step": 25125 }, { "epoch": 14.817216981132075, "grad_norm": 3.374483346939087, "learning_rate": 1.910271480939005e-06, "loss": 0.3568, "num_input_tokens_seen": 16499784, "step": 25130 }, { "epoch": 14.820165094339622, "grad_norm": 2.0640745162963867, "learning_rate": 1.9082491687564176e-06, "loss": 0.2593, "num_input_tokens_seen": 16503560, "step": 25135 }, { "epoch": 14.82311320754717, "grad_norm": 3.2683162689208984, "learning_rate": 1.9062276751281872e-06, "loss": 0.2798, "num_input_tokens_seen": 16507080, "step": 25140 }, { "epoch": 14.826061320754716, "grad_norm": 2.996887683868408, "learning_rate": 1.9042070005895136e-06, "loss": 0.3497, "num_input_tokens_seen": 16510312, "step": 25145 }, { "epoch": 14.829009433962264, "grad_norm": 2.7498648166656494, "learning_rate": 1.9021871456753788e-06, "loss": 0.3005, "num_input_tokens_seen": 16513576, "step": 25150 }, { "epoch": 14.83195754716981, "grad_norm": 3.4551515579223633, "learning_rate": 1.9001681109205478e-06, "loss": 0.2646, "num_input_tokens_seen": 16516232, "step": 25155 }, { "epoch": 14.834905660377359, "grad_norm": 3.947782278060913, "learning_rate": 1.898149896859567e-06, "loss": 0.3363, "num_input_tokens_seen": 16519688, "step": 25160 }, { "epoch": 14.837853773584905, "grad_norm": 3.952019214630127, "learning_rate": 1.8961325040267714e-06, "loss": 0.3205, "num_input_tokens_seen": 16522568, "step": 25165 }, { "epoch": 14.840801886792454, "grad_norm": 2.0060391426086426, "learning_rate": 1.894115932956272e-06, "loss": 0.2889, "num_input_tokens_seen": 16525192, "step": 25170 }, { "epoch": 14.84375, "grad_norm": 1.9115650653839111, "learning_rate": 1.8921001841819652e-06, "loss": 0.5021, "num_input_tokens_seen": 16528136, "step": 25175 }, { "epoch": 14.846698113207546, "grad_norm": 3.4501123428344727, "learning_rate": 1.8900852582375284e-06, "loss": 0.4139, "num_input_tokens_seen": 16532488, "step": 25180 }, { "epoch": 14.849646226415095, "grad_norm": 5.914278507232666, "learning_rate": 1.8880711556564214e-06, "loss": 0.3967, "num_input_tokens_seen": 16536936, "step": 25185 }, { "epoch": 14.852594339622641, "grad_norm": 4.556258201599121, "learning_rate": 1.8860578769718891e-06, "loss": 0.4071, "num_input_tokens_seen": 16542568, "step": 25190 }, { "epoch": 14.85554245283019, "grad_norm": 3.2886908054351807, "learning_rate": 1.8840454227169525e-06, "loss": 0.3068, "num_input_tokens_seen": 16546248, "step": 25195 }, { "epoch": 14.858490566037736, "grad_norm": 3.2632572650909424, "learning_rate": 1.882033793424421e-06, "loss": 0.3178, "num_input_tokens_seen": 16549128, "step": 25200 }, { "epoch": 14.861438679245284, "grad_norm": 6.3639373779296875, "learning_rate": 1.88002298962688e-06, "loss": 0.4076, "num_input_tokens_seen": 16551496, "step": 25205 }, { "epoch": 14.86438679245283, "grad_norm": 1.6138988733291626, "learning_rate": 1.8780130118566996e-06, "loss": 0.3716, "num_input_tokens_seen": 16554600, "step": 25210 }, { "epoch": 14.867334905660378, "grad_norm": 3.9709279537200928, "learning_rate": 1.876003860646029e-06, "loss": 0.3513, "num_input_tokens_seen": 16558696, "step": 25215 }, { "epoch": 14.870283018867925, "grad_norm": 4.01861047744751, "learning_rate": 1.8739955365267997e-06, "loss": 0.4108, "num_input_tokens_seen": 16561384, "step": 25220 }, { "epoch": 14.873231132075471, "grad_norm": 7.517172336578369, "learning_rate": 1.8719880400307228e-06, "loss": 0.385, "num_input_tokens_seen": 16564296, "step": 25225 }, { "epoch": 14.87617924528302, "grad_norm": 3.275465488433838, "learning_rate": 1.869981371689295e-06, "loss": 0.2328, "num_input_tokens_seen": 16567112, "step": 25230 }, { "epoch": 14.879127358490566, "grad_norm": 4.754958629608154, "learning_rate": 1.867975532033789e-06, "loss": 0.4037, "num_input_tokens_seen": 16569736, "step": 25235 }, { "epoch": 14.882075471698114, "grad_norm": 2.501829147338867, "learning_rate": 1.8659705215952589e-06, "loss": 0.3086, "num_input_tokens_seen": 16572872, "step": 25240 }, { "epoch": 14.88502358490566, "grad_norm": 3.534601926803589, "learning_rate": 1.8639663409045405e-06, "loss": 0.4829, "num_input_tokens_seen": 16575528, "step": 25245 }, { "epoch": 14.887971698113208, "grad_norm": 3.19101619720459, "learning_rate": 1.8619629904922466e-06, "loss": 0.3509, "num_input_tokens_seen": 16578824, "step": 25250 }, { "epoch": 14.890919811320755, "grad_norm": 5.469621181488037, "learning_rate": 1.859960470888777e-06, "loss": 0.3493, "num_input_tokens_seen": 16581992, "step": 25255 }, { "epoch": 14.893867924528301, "grad_norm": 2.097588062286377, "learning_rate": 1.857958782624306e-06, "loss": 0.4454, "num_input_tokens_seen": 16585608, "step": 25260 }, { "epoch": 14.89681603773585, "grad_norm": 2.6561856269836426, "learning_rate": 1.8559579262287886e-06, "loss": 0.2491, "num_input_tokens_seen": 16592168, "step": 25265 }, { "epoch": 14.899764150943396, "grad_norm": 5.81765079498291, "learning_rate": 1.8539579022319599e-06, "loss": 0.4518, "num_input_tokens_seen": 16595144, "step": 25270 }, { "epoch": 14.902712264150944, "grad_norm": 3.13091778755188, "learning_rate": 1.8519587111633357e-06, "loss": 0.2127, "num_input_tokens_seen": 16599272, "step": 25275 }, { "epoch": 14.90566037735849, "grad_norm": 3.931061029434204, "learning_rate": 1.8499603535522082e-06, "loss": 0.4137, "num_input_tokens_seen": 16601832, "step": 25280 }, { "epoch": 14.908608490566039, "grad_norm": 2.2984509468078613, "learning_rate": 1.8479628299276543e-06, "loss": 0.4321, "num_input_tokens_seen": 16605224, "step": 25285 }, { "epoch": 14.911556603773585, "grad_norm": 2.166402578353882, "learning_rate": 1.8459661408185241e-06, "loss": 0.3148, "num_input_tokens_seen": 16608584, "step": 25290 }, { "epoch": 14.914504716981131, "grad_norm": 4.413754463195801, "learning_rate": 1.8439702867534536e-06, "loss": 0.5345, "num_input_tokens_seen": 16611752, "step": 25295 }, { "epoch": 14.91745283018868, "grad_norm": 2.452721357345581, "learning_rate": 1.841975268260851e-06, "loss": 0.3837, "num_input_tokens_seen": 16614696, "step": 25300 }, { "epoch": 14.920400943396226, "grad_norm": 3.847743272781372, "learning_rate": 1.8399810858689066e-06, "loss": 0.285, "num_input_tokens_seen": 16618600, "step": 25305 }, { "epoch": 14.923349056603774, "grad_norm": 4.471195220947266, "learning_rate": 1.8379877401055884e-06, "loss": 0.2345, "num_input_tokens_seen": 16620904, "step": 25310 }, { "epoch": 14.92629716981132, "grad_norm": 2.5796031951904297, "learning_rate": 1.8359952314986418e-06, "loss": 0.3004, "num_input_tokens_seen": 16623976, "step": 25315 }, { "epoch": 14.929245283018869, "grad_norm": 2.8209757804870605, "learning_rate": 1.8340035605755957e-06, "loss": 0.2855, "num_input_tokens_seen": 16627688, "step": 25320 }, { "epoch": 14.932193396226415, "grad_norm": 4.37742805480957, "learning_rate": 1.8320127278637518e-06, "loss": 0.3722, "num_input_tokens_seen": 16630952, "step": 25325 }, { "epoch": 14.935141509433961, "grad_norm": 3.0644619464874268, "learning_rate": 1.830022733890191e-06, "loss": 0.289, "num_input_tokens_seen": 16634088, "step": 25330 }, { "epoch": 14.93808962264151, "grad_norm": 3.1316120624542236, "learning_rate": 1.8280335791817733e-06, "loss": 0.3879, "num_input_tokens_seen": 16636776, "step": 25335 }, { "epoch": 14.941037735849056, "grad_norm": 4.62498664855957, "learning_rate": 1.826045264265136e-06, "loss": 0.2968, "num_input_tokens_seen": 16640520, "step": 25340 }, { "epoch": 14.943985849056604, "grad_norm": 3.526069164276123, "learning_rate": 1.8240577896666928e-06, "loss": 0.4416, "num_input_tokens_seen": 16643880, "step": 25345 }, { "epoch": 14.94693396226415, "grad_norm": 2.5235745906829834, "learning_rate": 1.8220711559126382e-06, "loss": 0.4094, "num_input_tokens_seen": 16646504, "step": 25350 }, { "epoch": 14.949882075471699, "grad_norm": 3.847862720489502, "learning_rate": 1.8200853635289417e-06, "loss": 0.2622, "num_input_tokens_seen": 16650088, "step": 25355 }, { "epoch": 14.952830188679245, "grad_norm": 1.9829790592193604, "learning_rate": 1.81810041304135e-06, "loss": 0.2773, "num_input_tokens_seen": 16654120, "step": 25360 }, { "epoch": 14.955778301886792, "grad_norm": 2.7630391120910645, "learning_rate": 1.8161163049753865e-06, "loss": 0.3859, "num_input_tokens_seen": 16657224, "step": 25365 }, { "epoch": 14.95872641509434, "grad_norm": 2.508316993713379, "learning_rate": 1.8141330398563533e-06, "loss": 0.2518, "num_input_tokens_seen": 16660840, "step": 25370 }, { "epoch": 14.961674528301886, "grad_norm": 2.3257040977478027, "learning_rate": 1.8121506182093268e-06, "loss": 0.4122, "num_input_tokens_seen": 16664136, "step": 25375 }, { "epoch": 14.964622641509434, "grad_norm": 2.2113001346588135, "learning_rate": 1.8101690405591643e-06, "loss": 0.4102, "num_input_tokens_seen": 16666696, "step": 25380 }, { "epoch": 14.96757075471698, "grad_norm": 4.328030586242676, "learning_rate": 1.8081883074304945e-06, "loss": 0.377, "num_input_tokens_seen": 16669000, "step": 25385 }, { "epoch": 14.970518867924529, "grad_norm": 2.2295026779174805, "learning_rate": 1.8062084193477275e-06, "loss": 0.3425, "num_input_tokens_seen": 16672360, "step": 25390 }, { "epoch": 14.973466981132075, "grad_norm": 2.7323176860809326, "learning_rate": 1.804229376835046e-06, "loss": 0.2598, "num_input_tokens_seen": 16674888, "step": 25395 }, { "epoch": 14.976415094339622, "grad_norm": 1.3441169261932373, "learning_rate": 1.8022511804164105e-06, "loss": 0.3587, "num_input_tokens_seen": 16678568, "step": 25400 }, { "epoch": 14.97936320754717, "grad_norm": 3.0524773597717285, "learning_rate": 1.8002738306155559e-06, "loss": 0.3782, "num_input_tokens_seen": 16682248, "step": 25405 }, { "epoch": 14.982311320754716, "grad_norm": 4.660611152648926, "learning_rate": 1.7982973279559935e-06, "loss": 0.5246, "num_input_tokens_seen": 16685096, "step": 25410 }, { "epoch": 14.985259433962264, "grad_norm": 3.4011318683624268, "learning_rate": 1.7963216729610134e-06, "loss": 0.3283, "num_input_tokens_seen": 16688328, "step": 25415 }, { "epoch": 14.98820754716981, "grad_norm": 4.342550277709961, "learning_rate": 1.7943468661536773e-06, "loss": 0.3349, "num_input_tokens_seen": 16691016, "step": 25420 }, { "epoch": 14.991155660377359, "grad_norm": 4.066812038421631, "learning_rate": 1.7923729080568242e-06, "loss": 0.3798, "num_input_tokens_seen": 16694056, "step": 25425 }, { "epoch": 14.994103773584905, "grad_norm": 3.9511239528656006, "learning_rate": 1.7903997991930683e-06, "loss": 0.2678, "num_input_tokens_seen": 16697000, "step": 25430 }, { "epoch": 14.997051886792454, "grad_norm": 3.2100234031677246, "learning_rate": 1.7884275400847972e-06, "loss": 0.3073, "num_input_tokens_seen": 16701032, "step": 25435 }, { "epoch": 15.0, "grad_norm": 9.040056228637695, "learning_rate": 1.786456131254175e-06, "loss": 0.336, "num_input_tokens_seen": 16703256, "step": 25440 }, { "epoch": 15.002948113207546, "grad_norm": 3.5045883655548096, "learning_rate": 1.784485573223143e-06, "loss": 0.4093, "num_input_tokens_seen": 16705624, "step": 25445 }, { "epoch": 15.005896226415095, "grad_norm": 2.6210122108459473, "learning_rate": 1.782515866513414e-06, "loss": 0.2688, "num_input_tokens_seen": 16708792, "step": 25450 }, { "epoch": 15.008844339622641, "grad_norm": 3.429590940475464, "learning_rate": 1.7805470116464758e-06, "loss": 0.416, "num_input_tokens_seen": 16713016, "step": 25455 }, { "epoch": 15.01179245283019, "grad_norm": 2.941826105117798, "learning_rate": 1.7785790091435911e-06, "loss": 0.312, "num_input_tokens_seen": 16715768, "step": 25460 }, { "epoch": 15.014740566037736, "grad_norm": 2.76576566696167, "learning_rate": 1.776611859525796e-06, "loss": 0.345, "num_input_tokens_seen": 16718840, "step": 25465 }, { "epoch": 15.017688679245284, "grad_norm": 7.322103500366211, "learning_rate": 1.7746455633139042e-06, "loss": 0.2406, "num_input_tokens_seen": 16724344, "step": 25470 }, { "epoch": 15.02063679245283, "grad_norm": 7.545610427856445, "learning_rate": 1.7726801210285005e-06, "loss": 0.3394, "num_input_tokens_seen": 16728248, "step": 25475 }, { "epoch": 15.023584905660377, "grad_norm": 2.60906982421875, "learning_rate": 1.7707155331899418e-06, "loss": 0.3151, "num_input_tokens_seen": 16732440, "step": 25480 }, { "epoch": 15.026533018867925, "grad_norm": 4.527617454528809, "learning_rate": 1.7687518003183645e-06, "loss": 0.3609, "num_input_tokens_seen": 16735288, "step": 25485 }, { "epoch": 15.029481132075471, "grad_norm": 4.07396936416626, "learning_rate": 1.766788922933675e-06, "loss": 0.2977, "num_input_tokens_seen": 16738712, "step": 25490 }, { "epoch": 15.03242924528302, "grad_norm": 1.9758623838424683, "learning_rate": 1.7648269015555514e-06, "loss": 0.3939, "num_input_tokens_seen": 16742136, "step": 25495 }, { "epoch": 15.035377358490566, "grad_norm": 3.542388439178467, "learning_rate": 1.7628657367034474e-06, "loss": 0.3506, "num_input_tokens_seen": 16744824, "step": 25500 }, { "epoch": 15.038325471698114, "grad_norm": 2.8312289714813232, "learning_rate": 1.7609054288965922e-06, "loss": 0.2602, "num_input_tokens_seen": 16747352, "step": 25505 }, { "epoch": 15.04127358490566, "grad_norm": 5.966804027557373, "learning_rate": 1.7589459786539847e-06, "loss": 0.3377, "num_input_tokens_seen": 16751512, "step": 25510 }, { "epoch": 15.044221698113208, "grad_norm": 3.487410545349121, "learning_rate": 1.7569873864943975e-06, "loss": 0.51, "num_input_tokens_seen": 16753592, "step": 25515 }, { "epoch": 15.047169811320755, "grad_norm": 11.090282440185547, "learning_rate": 1.7550296529363764e-06, "loss": 0.3233, "num_input_tokens_seen": 16756088, "step": 25520 }, { "epoch": 15.050117924528301, "grad_norm": 4.345810890197754, "learning_rate": 1.7530727784982393e-06, "loss": 0.4293, "num_input_tokens_seen": 16759000, "step": 25525 }, { "epoch": 15.05306603773585, "grad_norm": 2.7668867111206055, "learning_rate": 1.7511167636980765e-06, "loss": 0.2693, "num_input_tokens_seen": 16762200, "step": 25530 }, { "epoch": 15.056014150943396, "grad_norm": 4.450671195983887, "learning_rate": 1.7491616090537539e-06, "loss": 0.3301, "num_input_tokens_seen": 16766360, "step": 25535 }, { "epoch": 15.058962264150944, "grad_norm": 3.0298879146575928, "learning_rate": 1.7472073150829056e-06, "loss": 0.3366, "num_input_tokens_seen": 16769176, "step": 25540 }, { "epoch": 15.06191037735849, "grad_norm": 3.722209930419922, "learning_rate": 1.745253882302939e-06, "loss": 0.3578, "num_input_tokens_seen": 16772120, "step": 25545 }, { "epoch": 15.064858490566039, "grad_norm": 3.855719566345215, "learning_rate": 1.743301311231035e-06, "loss": 0.3998, "num_input_tokens_seen": 16775768, "step": 25550 }, { "epoch": 15.067806603773585, "grad_norm": 5.510416507720947, "learning_rate": 1.7413496023841437e-06, "loss": 0.259, "num_input_tokens_seen": 16778776, "step": 25555 }, { "epoch": 15.070754716981131, "grad_norm": 4.305747985839844, "learning_rate": 1.7393987562789876e-06, "loss": 0.3571, "num_input_tokens_seen": 16781304, "step": 25560 }, { "epoch": 15.07370283018868, "grad_norm": 3.4675185680389404, "learning_rate": 1.7374487734320655e-06, "loss": 0.3255, "num_input_tokens_seen": 16784152, "step": 25565 }, { "epoch": 15.076650943396226, "grad_norm": 4.394769191741943, "learning_rate": 1.7354996543596408e-06, "loss": 0.2823, "num_input_tokens_seen": 16786552, "step": 25570 }, { "epoch": 15.079599056603774, "grad_norm": 3.1821305751800537, "learning_rate": 1.7335513995777504e-06, "loss": 0.2414, "num_input_tokens_seen": 16790072, "step": 25575 }, { "epoch": 15.08254716981132, "grad_norm": 5.39393949508667, "learning_rate": 1.7316040096022062e-06, "loss": 0.4208, "num_input_tokens_seen": 16794680, "step": 25580 }, { "epoch": 15.085495283018869, "grad_norm": 2.409226179122925, "learning_rate": 1.7296574849485863e-06, "loss": 0.417, "num_input_tokens_seen": 16798264, "step": 25585 }, { "epoch": 15.088443396226415, "grad_norm": 2.965400218963623, "learning_rate": 1.7277118261322423e-06, "loss": 0.3541, "num_input_tokens_seen": 16801752, "step": 25590 }, { "epoch": 15.091391509433961, "grad_norm": 6.046774864196777, "learning_rate": 1.7257670336682925e-06, "loss": 0.4004, "num_input_tokens_seen": 16804248, "step": 25595 }, { "epoch": 15.09433962264151, "grad_norm": 4.45102596282959, "learning_rate": 1.7238231080716339e-06, "loss": 0.3664, "num_input_tokens_seen": 16808216, "step": 25600 }, { "epoch": 15.097287735849056, "grad_norm": 4.044468879699707, "learning_rate": 1.721880049856927e-06, "loss": 0.3819, "num_input_tokens_seen": 16811384, "step": 25605 }, { "epoch": 15.100235849056604, "grad_norm": 3.887524127960205, "learning_rate": 1.7199378595386046e-06, "loss": 0.3344, "num_input_tokens_seen": 16814872, "step": 25610 }, { "epoch": 15.10318396226415, "grad_norm": 7.107062816619873, "learning_rate": 1.7179965376308705e-06, "loss": 0.3272, "num_input_tokens_seen": 16818648, "step": 25615 }, { "epoch": 15.106132075471699, "grad_norm": 5.26723575592041, "learning_rate": 1.7160560846476976e-06, "loss": 0.335, "num_input_tokens_seen": 16822904, "step": 25620 }, { "epoch": 15.109080188679245, "grad_norm": 3.0392677783966064, "learning_rate": 1.7141165011028277e-06, "loss": 0.3343, "num_input_tokens_seen": 16825304, "step": 25625 }, { "epoch": 15.112028301886792, "grad_norm": 4.210397720336914, "learning_rate": 1.7121777875097767e-06, "loss": 0.3617, "num_input_tokens_seen": 16827928, "step": 25630 }, { "epoch": 15.11497641509434, "grad_norm": 2.767460823059082, "learning_rate": 1.7102399443818268e-06, "loss": 0.2835, "num_input_tokens_seen": 16830552, "step": 25635 }, { "epoch": 15.117924528301886, "grad_norm": 5.001343727111816, "learning_rate": 1.7083029722320294e-06, "loss": 0.263, "num_input_tokens_seen": 16833208, "step": 25640 }, { "epoch": 15.120872641509434, "grad_norm": 3.2492523193359375, "learning_rate": 1.7063668715732063e-06, "loss": 0.3325, "num_input_tokens_seen": 16837912, "step": 25645 }, { "epoch": 15.12382075471698, "grad_norm": 4.239966869354248, "learning_rate": 1.7044316429179492e-06, "loss": 0.2865, "num_input_tokens_seen": 16841240, "step": 25650 }, { "epoch": 15.126768867924529, "grad_norm": 2.6065053939819336, "learning_rate": 1.7024972867786155e-06, "loss": 0.3252, "num_input_tokens_seen": 16846168, "step": 25655 }, { "epoch": 15.129716981132075, "grad_norm": 2.400942325592041, "learning_rate": 1.7005638036673389e-06, "loss": 0.2649, "num_input_tokens_seen": 16849048, "step": 25660 }, { "epoch": 15.132665094339623, "grad_norm": 2.3800466060638428, "learning_rate": 1.6986311940960148e-06, "loss": 0.2103, "num_input_tokens_seen": 16851896, "step": 25665 }, { "epoch": 15.13561320754717, "grad_norm": 1.8591388463974, "learning_rate": 1.696699458576308e-06, "loss": 0.3322, "num_input_tokens_seen": 16855480, "step": 25670 }, { "epoch": 15.138561320754716, "grad_norm": 1.9832265377044678, "learning_rate": 1.6947685976196581e-06, "loss": 0.2766, "num_input_tokens_seen": 16859448, "step": 25675 }, { "epoch": 15.141509433962264, "grad_norm": 4.35140323638916, "learning_rate": 1.692838611737267e-06, "loss": 0.2487, "num_input_tokens_seen": 16862712, "step": 25680 }, { "epoch": 15.14445754716981, "grad_norm": 3.8029375076293945, "learning_rate": 1.690909501440106e-06, "loss": 0.2746, "num_input_tokens_seen": 16865912, "step": 25685 }, { "epoch": 15.147405660377359, "grad_norm": 3.837329626083374, "learning_rate": 1.688981267238915e-06, "loss": 0.356, "num_input_tokens_seen": 16868600, "step": 25690 }, { "epoch": 15.150353773584905, "grad_norm": 3.199493169784546, "learning_rate": 1.687053909644204e-06, "loss": 0.2734, "num_input_tokens_seen": 16871064, "step": 25695 }, { "epoch": 15.153301886792454, "grad_norm": 4.369735240936279, "learning_rate": 1.685127429166249e-06, "loss": 0.4107, "num_input_tokens_seen": 16874328, "step": 25700 }, { "epoch": 15.15625, "grad_norm": 3.9761626720428467, "learning_rate": 1.683201826315093e-06, "loss": 0.2207, "num_input_tokens_seen": 16877560, "step": 25705 }, { "epoch": 15.159198113207546, "grad_norm": 6.094707012176514, "learning_rate": 1.681277101600548e-06, "loss": 0.2302, "num_input_tokens_seen": 16880216, "step": 25710 }, { "epoch": 15.162146226415095, "grad_norm": 2.5866572856903076, "learning_rate": 1.6793532555321939e-06, "loss": 0.3836, "num_input_tokens_seen": 16883672, "step": 25715 }, { "epoch": 15.165094339622641, "grad_norm": 6.847254753112793, "learning_rate": 1.6774302886193744e-06, "loss": 0.3319, "num_input_tokens_seen": 16887768, "step": 25720 }, { "epoch": 15.16804245283019, "grad_norm": 2.914491891860962, "learning_rate": 1.6755082013712076e-06, "loss": 0.4893, "num_input_tokens_seen": 16891160, "step": 25725 }, { "epoch": 15.170990566037736, "grad_norm": 4.121789455413818, "learning_rate": 1.6735869942965716e-06, "loss": 0.3308, "num_input_tokens_seen": 16894616, "step": 25730 }, { "epoch": 15.173938679245284, "grad_norm": 4.6612420082092285, "learning_rate": 1.6716666679041155e-06, "loss": 0.2935, "num_input_tokens_seen": 16897976, "step": 25735 }, { "epoch": 15.17688679245283, "grad_norm": 3.894672393798828, "learning_rate": 1.6697472227022533e-06, "loss": 0.2772, "num_input_tokens_seen": 16901528, "step": 25740 }, { "epoch": 15.179834905660377, "grad_norm": 3.0139200687408447, "learning_rate": 1.6678286591991644e-06, "loss": 0.2877, "num_input_tokens_seen": 16904312, "step": 25745 }, { "epoch": 15.182783018867925, "grad_norm": 3.110736846923828, "learning_rate": 1.665910977902801e-06, "loss": 0.4161, "num_input_tokens_seen": 16907352, "step": 25750 }, { "epoch": 15.185731132075471, "grad_norm": 3.6079087257385254, "learning_rate": 1.6639941793208747e-06, "loss": 0.4174, "num_input_tokens_seen": 16910808, "step": 25755 }, { "epoch": 15.18867924528302, "grad_norm": 2.240734338760376, "learning_rate": 1.6620782639608674e-06, "loss": 0.2496, "num_input_tokens_seen": 16913432, "step": 25760 }, { "epoch": 15.191627358490566, "grad_norm": 3.611354112625122, "learning_rate": 1.6601632323300231e-06, "loss": 0.3276, "num_input_tokens_seen": 16916728, "step": 25765 }, { "epoch": 15.194575471698114, "grad_norm": 4.648670196533203, "learning_rate": 1.6582490849353595e-06, "loss": 0.2711, "num_input_tokens_seen": 16919544, "step": 25770 }, { "epoch": 15.19752358490566, "grad_norm": 3.618058919906616, "learning_rate": 1.6563358222836523e-06, "loss": 0.3005, "num_input_tokens_seen": 16923352, "step": 25775 }, { "epoch": 15.200471698113208, "grad_norm": 4.440439224243164, "learning_rate": 1.654423444881445e-06, "loss": 0.3374, "num_input_tokens_seen": 16927096, "step": 25780 }, { "epoch": 15.203419811320755, "grad_norm": 2.9980931282043457, "learning_rate": 1.652511953235051e-06, "loss": 0.3161, "num_input_tokens_seen": 16930168, "step": 25785 }, { "epoch": 15.206367924528301, "grad_norm": 3.9161460399627686, "learning_rate": 1.650601347850544e-06, "loss": 0.3584, "num_input_tokens_seen": 16933336, "step": 25790 }, { "epoch": 15.20931603773585, "grad_norm": 5.610029697418213, "learning_rate": 1.6486916292337652e-06, "loss": 0.4674, "num_input_tokens_seen": 16936504, "step": 25795 }, { "epoch": 15.212264150943396, "grad_norm": 4.206812381744385, "learning_rate": 1.6467827978903212e-06, "loss": 0.2821, "num_input_tokens_seen": 16939096, "step": 25800 }, { "epoch": 15.215212264150944, "grad_norm": 4.656123161315918, "learning_rate": 1.6448748543255827e-06, "loss": 0.3997, "num_input_tokens_seen": 16942872, "step": 25805 }, { "epoch": 15.21816037735849, "grad_norm": 2.0921199321746826, "learning_rate": 1.6429677990446845e-06, "loss": 0.3306, "num_input_tokens_seen": 16945368, "step": 25810 }, { "epoch": 15.221108490566039, "grad_norm": 2.1846442222595215, "learning_rate": 1.6410616325525319e-06, "loss": 0.2896, "num_input_tokens_seen": 16949656, "step": 25815 }, { "epoch": 15.224056603773585, "grad_norm": 3.4011030197143555, "learning_rate": 1.6391563553537875e-06, "loss": 0.364, "num_input_tokens_seen": 16952664, "step": 25820 }, { "epoch": 15.227004716981131, "grad_norm": 2.7858026027679443, "learning_rate": 1.6372519679528832e-06, "loss": 0.2311, "num_input_tokens_seen": 16956152, "step": 25825 }, { "epoch": 15.22995283018868, "grad_norm": 2.432554244995117, "learning_rate": 1.6353484708540124e-06, "loss": 0.3365, "num_input_tokens_seen": 16959352, "step": 25830 }, { "epoch": 15.232900943396226, "grad_norm": 3.507625102996826, "learning_rate": 1.633445864561135e-06, "loss": 0.2892, "num_input_tokens_seen": 16962616, "step": 25835 }, { "epoch": 15.235849056603774, "grad_norm": 2.8583974838256836, "learning_rate": 1.6315441495779726e-06, "loss": 0.3613, "num_input_tokens_seen": 16965304, "step": 25840 }, { "epoch": 15.23879716981132, "grad_norm": 4.385777950286865, "learning_rate": 1.6296433264080152e-06, "loss": 0.2519, "num_input_tokens_seen": 16968024, "step": 25845 }, { "epoch": 15.241745283018869, "grad_norm": 5.593526363372803, "learning_rate": 1.627743395554513e-06, "loss": 0.565, "num_input_tokens_seen": 16971192, "step": 25850 }, { "epoch": 15.244693396226415, "grad_norm": 4.967219829559326, "learning_rate": 1.6258443575204802e-06, "loss": 0.4191, "num_input_tokens_seen": 16974200, "step": 25855 }, { "epoch": 15.247641509433961, "grad_norm": 2.2998905181884766, "learning_rate": 1.6239462128086936e-06, "loss": 0.3921, "num_input_tokens_seen": 16976856, "step": 25860 }, { "epoch": 15.25058962264151, "grad_norm": 3.709352970123291, "learning_rate": 1.6220489619216988e-06, "loss": 0.3834, "num_input_tokens_seen": 16980376, "step": 25865 }, { "epoch": 15.253537735849056, "grad_norm": 2.805677652359009, "learning_rate": 1.6201526053618e-06, "loss": 0.2574, "num_input_tokens_seen": 16983224, "step": 25870 }, { "epoch": 15.256485849056604, "grad_norm": 3.2457685470581055, "learning_rate": 1.6182571436310634e-06, "loss": 0.2777, "num_input_tokens_seen": 16986936, "step": 25875 }, { "epoch": 15.25943396226415, "grad_norm": 2.9603707790374756, "learning_rate": 1.616362577231324e-06, "loss": 0.2956, "num_input_tokens_seen": 16991416, "step": 25880 }, { "epoch": 15.262382075471699, "grad_norm": 3.2866084575653076, "learning_rate": 1.614468906664175e-06, "loss": 0.2795, "num_input_tokens_seen": 16994840, "step": 25885 }, { "epoch": 15.265330188679245, "grad_norm": 4.979950904846191, "learning_rate": 1.612576132430974e-06, "loss": 0.323, "num_input_tokens_seen": 16996984, "step": 25890 }, { "epoch": 15.268278301886792, "grad_norm": 3.381808042526245, "learning_rate": 1.6106842550328406e-06, "loss": 0.3267, "num_input_tokens_seen": 16999704, "step": 25895 }, { "epoch": 15.27122641509434, "grad_norm": 3.0996134281158447, "learning_rate": 1.6087932749706582e-06, "loss": 0.2958, "num_input_tokens_seen": 17002712, "step": 25900 }, { "epoch": 15.274174528301886, "grad_norm": 5.022707462310791, "learning_rate": 1.6069031927450696e-06, "loss": 0.3448, "num_input_tokens_seen": 17006360, "step": 25905 }, { "epoch": 15.277122641509434, "grad_norm": 6.523232936859131, "learning_rate": 1.605014008856486e-06, "loss": 0.2774, "num_input_tokens_seen": 17010360, "step": 25910 }, { "epoch": 15.28007075471698, "grad_norm": 3.442077875137329, "learning_rate": 1.6031257238050745e-06, "loss": 0.351, "num_input_tokens_seen": 17013144, "step": 25915 }, { "epoch": 15.283018867924529, "grad_norm": 4.312236309051514, "learning_rate": 1.601238338090768e-06, "loss": 0.4016, "num_input_tokens_seen": 17015576, "step": 25920 }, { "epoch": 15.285966981132075, "grad_norm": 3.598395824432373, "learning_rate": 1.5993518522132595e-06, "loss": 0.3409, "num_input_tokens_seen": 17018584, "step": 25925 }, { "epoch": 15.288915094339623, "grad_norm": 3.9069621562957764, "learning_rate": 1.5974662666720037e-06, "loss": 0.3297, "num_input_tokens_seen": 17021272, "step": 25930 }, { "epoch": 15.29186320754717, "grad_norm": 3.2253589630126953, "learning_rate": 1.5955815819662162e-06, "loss": 0.2301, "num_input_tokens_seen": 17024120, "step": 25935 }, { "epoch": 15.294811320754716, "grad_norm": 3.5082406997680664, "learning_rate": 1.5936977985948788e-06, "loss": 0.4285, "num_input_tokens_seen": 17026488, "step": 25940 }, { "epoch": 15.297759433962264, "grad_norm": 2.9430794715881348, "learning_rate": 1.5918149170567298e-06, "loss": 0.2705, "num_input_tokens_seen": 17030072, "step": 25945 }, { "epoch": 15.30070754716981, "grad_norm": 2.966576099395752, "learning_rate": 1.5899329378502698e-06, "loss": 0.3084, "num_input_tokens_seen": 17033144, "step": 25950 }, { "epoch": 15.303655660377359, "grad_norm": 2.3899173736572266, "learning_rate": 1.588051861473761e-06, "loss": 0.3347, "num_input_tokens_seen": 17035928, "step": 25955 }, { "epoch": 15.306603773584905, "grad_norm": 4.201458930969238, "learning_rate": 1.5861716884252253e-06, "loss": 0.2824, "num_input_tokens_seen": 17038232, "step": 25960 }, { "epoch": 15.309551886792454, "grad_norm": 4.2662129402160645, "learning_rate": 1.5842924192024489e-06, "loss": 0.3352, "num_input_tokens_seen": 17041688, "step": 25965 }, { "epoch": 15.3125, "grad_norm": 6.189284324645996, "learning_rate": 1.5824140543029742e-06, "loss": 0.3486, "num_input_tokens_seen": 17044888, "step": 25970 }, { "epoch": 15.315448113207546, "grad_norm": 5.393751621246338, "learning_rate": 1.5805365942241092e-06, "loss": 0.3677, "num_input_tokens_seen": 17048280, "step": 25975 }, { "epoch": 15.318396226415095, "grad_norm": 3.068864107131958, "learning_rate": 1.5786600394629181e-06, "loss": 0.3348, "num_input_tokens_seen": 17051320, "step": 25980 }, { "epoch": 15.321344339622641, "grad_norm": 3.2713897228240967, "learning_rate": 1.5767843905162261e-06, "loss": 0.5281, "num_input_tokens_seen": 17055128, "step": 25985 }, { "epoch": 15.32429245283019, "grad_norm": 2.7895596027374268, "learning_rate": 1.5749096478806209e-06, "loss": 0.3719, "num_input_tokens_seen": 17058648, "step": 25990 }, { "epoch": 15.327240566037736, "grad_norm": 3.612484931945801, "learning_rate": 1.5730358120524452e-06, "loss": 0.3308, "num_input_tokens_seen": 17061624, "step": 25995 }, { "epoch": 15.330188679245284, "grad_norm": 3.754026174545288, "learning_rate": 1.5711628835278098e-06, "loss": 0.4748, "num_input_tokens_seen": 17064952, "step": 26000 }, { "epoch": 15.33313679245283, "grad_norm": 4.4249267578125, "learning_rate": 1.5692908628025782e-06, "loss": 0.2017, "num_input_tokens_seen": 17069528, "step": 26005 }, { "epoch": 15.336084905660377, "grad_norm": 5.1811113357543945, "learning_rate": 1.5674197503723765e-06, "loss": 0.3346, "num_input_tokens_seen": 17072568, "step": 26010 }, { "epoch": 15.339033018867925, "grad_norm": 2.253615140914917, "learning_rate": 1.5655495467325893e-06, "loss": 0.2618, "num_input_tokens_seen": 17075896, "step": 26015 }, { "epoch": 15.341981132075471, "grad_norm": 4.817640781402588, "learning_rate": 1.5636802523783613e-06, "loss": 0.3452, "num_input_tokens_seen": 17079192, "step": 26020 }, { "epoch": 15.34492924528302, "grad_norm": 3.307757616043091, "learning_rate": 1.5618118678045947e-06, "loss": 0.2168, "num_input_tokens_seen": 17081784, "step": 26025 }, { "epoch": 15.347877358490566, "grad_norm": 3.6523008346557617, "learning_rate": 1.5599443935059549e-06, "loss": 0.3667, "num_input_tokens_seen": 17085560, "step": 26030 }, { "epoch": 15.350825471698114, "grad_norm": 4.353590965270996, "learning_rate": 1.5580778299768635e-06, "loss": 0.3944, "num_input_tokens_seen": 17088120, "step": 26035 }, { "epoch": 15.35377358490566, "grad_norm": 5.164697170257568, "learning_rate": 1.5562121777114997e-06, "loss": 0.3898, "num_input_tokens_seen": 17091768, "step": 26040 }, { "epoch": 15.356721698113208, "grad_norm": 3.6270790100097656, "learning_rate": 1.5543474372038043e-06, "loss": 0.2632, "num_input_tokens_seen": 17094584, "step": 26045 }, { "epoch": 15.359669811320755, "grad_norm": 2.8039138317108154, "learning_rate": 1.5524836089474748e-06, "loss": 0.2592, "num_input_tokens_seen": 17097592, "step": 26050 }, { "epoch": 15.362617924528301, "grad_norm": 4.47136926651001, "learning_rate": 1.5506206934359664e-06, "loss": 0.4627, "num_input_tokens_seen": 17100088, "step": 26055 }, { "epoch": 15.36556603773585, "grad_norm": 2.880223035812378, "learning_rate": 1.5487586911624947e-06, "loss": 0.339, "num_input_tokens_seen": 17102936, "step": 26060 }, { "epoch": 15.368514150943396, "grad_norm": 2.549013376235962, "learning_rate": 1.5468976026200355e-06, "loss": 0.3561, "num_input_tokens_seen": 17107288, "step": 26065 }, { "epoch": 15.371462264150944, "grad_norm": 2.5711100101470947, "learning_rate": 1.5450374283013187e-06, "loss": 0.4408, "num_input_tokens_seen": 17111864, "step": 26070 }, { "epoch": 15.37441037735849, "grad_norm": 4.103601932525635, "learning_rate": 1.5431781686988317e-06, "loss": 0.2716, "num_input_tokens_seen": 17115096, "step": 26075 }, { "epoch": 15.377358490566039, "grad_norm": 3.997302532196045, "learning_rate": 1.5413198243048233e-06, "loss": 0.2698, "num_input_tokens_seen": 17118328, "step": 26080 }, { "epoch": 15.380306603773585, "grad_norm": 3.312404155731201, "learning_rate": 1.5394623956112974e-06, "loss": 0.3221, "num_input_tokens_seen": 17121208, "step": 26085 }, { "epoch": 15.383254716981131, "grad_norm": 3.289424419403076, "learning_rate": 1.537605883110015e-06, "loss": 0.367, "num_input_tokens_seen": 17125048, "step": 26090 }, { "epoch": 15.38620283018868, "grad_norm": 2.658742904663086, "learning_rate": 1.5357502872924984e-06, "loss": 0.3993, "num_input_tokens_seen": 17128760, "step": 26095 }, { "epoch": 15.389150943396226, "grad_norm": 5.822504997253418, "learning_rate": 1.5338956086500235e-06, "loss": 0.4447, "num_input_tokens_seen": 17133112, "step": 26100 }, { "epoch": 15.392099056603774, "grad_norm": 2.85068941116333, "learning_rate": 1.5320418476736237e-06, "loss": 0.2466, "num_input_tokens_seen": 17136440, "step": 26105 }, { "epoch": 15.39504716981132, "grad_norm": 1.801239013671875, "learning_rate": 1.5301890048540912e-06, "loss": 0.2687, "num_input_tokens_seen": 17139928, "step": 26110 }, { "epoch": 15.397995283018869, "grad_norm": 6.144158840179443, "learning_rate": 1.5283370806819743e-06, "loss": 0.4106, "num_input_tokens_seen": 17142808, "step": 26115 }, { "epoch": 15.400943396226415, "grad_norm": 2.743367910385132, "learning_rate": 1.5264860756475752e-06, "loss": 0.2805, "num_input_tokens_seen": 17145752, "step": 26120 }, { "epoch": 15.403891509433961, "grad_norm": 3.9104769229888916, "learning_rate": 1.5246359902409592e-06, "loss": 0.3467, "num_input_tokens_seen": 17148920, "step": 26125 }, { "epoch": 15.40683962264151, "grad_norm": 3.2527670860290527, "learning_rate": 1.5227868249519423e-06, "loss": 0.2662, "num_input_tokens_seen": 17153720, "step": 26130 }, { "epoch": 15.409787735849056, "grad_norm": 2.643486738204956, "learning_rate": 1.5209385802700999e-06, "loss": 0.3433, "num_input_tokens_seen": 17157240, "step": 26135 }, { "epoch": 15.412735849056604, "grad_norm": 2.7460286617279053, "learning_rate": 1.5190912566847626e-06, "loss": 0.358, "num_input_tokens_seen": 17161208, "step": 26140 }, { "epoch": 15.41568396226415, "grad_norm": 7.471737384796143, "learning_rate": 1.5172448546850166e-06, "loss": 0.3322, "num_input_tokens_seen": 17164568, "step": 26145 }, { "epoch": 15.418632075471699, "grad_norm": 5.681195259094238, "learning_rate": 1.515399374759704e-06, "loss": 0.3166, "num_input_tokens_seen": 17166968, "step": 26150 }, { "epoch": 15.421580188679245, "grad_norm": 2.7004566192626953, "learning_rate": 1.513554817397424e-06, "loss": 0.4063, "num_input_tokens_seen": 17171384, "step": 26155 }, { "epoch": 15.424528301886792, "grad_norm": 4.799001216888428, "learning_rate": 1.5117111830865338e-06, "loss": 0.2175, "num_input_tokens_seen": 17173592, "step": 26160 }, { "epoch": 15.42747641509434, "grad_norm": 2.6254944801330566, "learning_rate": 1.509868472315142e-06, "loss": 0.2923, "num_input_tokens_seen": 17177048, "step": 26165 }, { "epoch": 15.430424528301886, "grad_norm": 2.5709335803985596, "learning_rate": 1.508026685571113e-06, "loss": 0.3055, "num_input_tokens_seen": 17180312, "step": 26170 }, { "epoch": 15.433372641509434, "grad_norm": 9.459994316101074, "learning_rate": 1.506185823342069e-06, "loss": 0.432, "num_input_tokens_seen": 17183576, "step": 26175 }, { "epoch": 15.43632075471698, "grad_norm": 2.982394218444824, "learning_rate": 1.504345886115386e-06, "loss": 0.3187, "num_input_tokens_seen": 17186936, "step": 26180 }, { "epoch": 15.439268867924529, "grad_norm": 3.2225677967071533, "learning_rate": 1.502506874378193e-06, "loss": 0.2629, "num_input_tokens_seen": 17189816, "step": 26185 }, { "epoch": 15.442216981132075, "grad_norm": 2.433189868927002, "learning_rate": 1.5006687886173805e-06, "loss": 0.3129, "num_input_tokens_seen": 17196536, "step": 26190 }, { "epoch": 15.445165094339623, "grad_norm": 3.4699416160583496, "learning_rate": 1.498831629319587e-06, "loss": 0.3773, "num_input_tokens_seen": 17199992, "step": 26195 }, { "epoch": 15.44811320754717, "grad_norm": 3.3743858337402344, "learning_rate": 1.4969953969712087e-06, "loss": 0.3019, "num_input_tokens_seen": 17203160, "step": 26200 }, { "epoch": 15.451061320754716, "grad_norm": 1.5363969802856445, "learning_rate": 1.4951600920583963e-06, "loss": 0.2734, "num_input_tokens_seen": 17206808, "step": 26205 }, { "epoch": 15.454009433962264, "grad_norm": 3.5299994945526123, "learning_rate": 1.493325715067055e-06, "loss": 0.3761, "num_input_tokens_seen": 17209912, "step": 26210 }, { "epoch": 15.45695754716981, "grad_norm": 3.6727888584136963, "learning_rate": 1.4914922664828417e-06, "loss": 0.2813, "num_input_tokens_seen": 17213176, "step": 26215 }, { "epoch": 15.459905660377359, "grad_norm": 2.344977378845215, "learning_rate": 1.4896597467911732e-06, "loss": 0.3515, "num_input_tokens_seen": 17218200, "step": 26220 }, { "epoch": 15.462853773584905, "grad_norm": 4.256892681121826, "learning_rate": 1.4878281564772156e-06, "loss": 0.2456, "num_input_tokens_seen": 17221240, "step": 26225 }, { "epoch": 15.465801886792454, "grad_norm": 3.1647567749023438, "learning_rate": 1.4859974960258898e-06, "loss": 0.3722, "num_input_tokens_seen": 17226296, "step": 26230 }, { "epoch": 15.46875, "grad_norm": 2.545691728591919, "learning_rate": 1.4841677659218723e-06, "loss": 0.2991, "num_input_tokens_seen": 17229272, "step": 26235 }, { "epoch": 15.471698113207546, "grad_norm": 3.1219098567962646, "learning_rate": 1.4823389666495886e-06, "loss": 0.442, "num_input_tokens_seen": 17231864, "step": 26240 }, { "epoch": 15.474646226415095, "grad_norm": 4.278151035308838, "learning_rate": 1.4805110986932258e-06, "loss": 0.3144, "num_input_tokens_seen": 17235576, "step": 26245 }, { "epoch": 15.477594339622641, "grad_norm": 3.398521661758423, "learning_rate": 1.4786841625367166e-06, "loss": 0.3763, "num_input_tokens_seen": 17239032, "step": 26250 }, { "epoch": 15.48054245283019, "grad_norm": 5.6483354568481445, "learning_rate": 1.476858158663752e-06, "loss": 0.3601, "num_input_tokens_seen": 17243224, "step": 26255 }, { "epoch": 15.483490566037736, "grad_norm": 3.0179030895233154, "learning_rate": 1.4750330875577745e-06, "loss": 0.3285, "num_input_tokens_seen": 17246744, "step": 26260 }, { "epoch": 15.486438679245284, "grad_norm": 2.992347002029419, "learning_rate": 1.4732089497019787e-06, "loss": 0.2983, "num_input_tokens_seen": 17249336, "step": 26265 }, { "epoch": 15.48938679245283, "grad_norm": 3.3717896938323975, "learning_rate": 1.471385745579313e-06, "loss": 0.2947, "num_input_tokens_seen": 17252120, "step": 26270 }, { "epoch": 15.492334905660377, "grad_norm": 5.118661880493164, "learning_rate": 1.4695634756724775e-06, "loss": 0.2735, "num_input_tokens_seen": 17255032, "step": 26275 }, { "epoch": 15.495283018867925, "grad_norm": 3.5158638954162598, "learning_rate": 1.4677421404639281e-06, "loss": 0.3231, "num_input_tokens_seen": 17258104, "step": 26280 }, { "epoch": 15.498231132075471, "grad_norm": 3.823340654373169, "learning_rate": 1.4659217404358706e-06, "loss": 0.335, "num_input_tokens_seen": 17260312, "step": 26285 }, { "epoch": 15.50117924528302, "grad_norm": 3.3256843090057373, "learning_rate": 1.4641022760702627e-06, "loss": 0.2749, "num_input_tokens_seen": 17263288, "step": 26290 }, { "epoch": 15.504127358490566, "grad_norm": 3.909599542617798, "learning_rate": 1.4622837478488172e-06, "loss": 0.3676, "num_input_tokens_seen": 17266392, "step": 26295 }, { "epoch": 15.507075471698114, "grad_norm": 4.565539836883545, "learning_rate": 1.4604661562529953e-06, "loss": 0.2905, "num_input_tokens_seen": 17269976, "step": 26300 }, { "epoch": 15.51002358490566, "grad_norm": 2.1890177726745605, "learning_rate": 1.4586495017640119e-06, "loss": 0.3203, "num_input_tokens_seen": 17273208, "step": 26305 }, { "epoch": 15.512971698113208, "grad_norm": 1.5557522773742676, "learning_rate": 1.4568337848628366e-06, "loss": 0.2909, "num_input_tokens_seen": 17276856, "step": 26310 }, { "epoch": 15.515919811320755, "grad_norm": 2.705244779586792, "learning_rate": 1.4550190060301872e-06, "loss": 0.336, "num_input_tokens_seen": 17280696, "step": 26315 }, { "epoch": 15.518867924528301, "grad_norm": 3.5507254600524902, "learning_rate": 1.4532051657465335e-06, "loss": 0.3778, "num_input_tokens_seen": 17283320, "step": 26320 }, { "epoch": 15.52181603773585, "grad_norm": 4.620489597320557, "learning_rate": 1.4513922644920985e-06, "loss": 0.3246, "num_input_tokens_seen": 17286712, "step": 26325 }, { "epoch": 15.524764150943396, "grad_norm": 3.9678072929382324, "learning_rate": 1.4495803027468552e-06, "loss": 0.3245, "num_input_tokens_seen": 17289112, "step": 26330 }, { "epoch": 15.527712264150944, "grad_norm": 2.7010691165924072, "learning_rate": 1.4477692809905263e-06, "loss": 0.3775, "num_input_tokens_seen": 17291608, "step": 26335 }, { "epoch": 15.53066037735849, "grad_norm": 4.505232334136963, "learning_rate": 1.4459591997025896e-06, "loss": 0.2833, "num_input_tokens_seen": 17294840, "step": 26340 }, { "epoch": 15.533608490566039, "grad_norm": 4.191202163696289, "learning_rate": 1.4441500593622737e-06, "loss": 0.348, "num_input_tokens_seen": 17297976, "step": 26345 }, { "epoch": 15.536556603773585, "grad_norm": 3.2427380084991455, "learning_rate": 1.4423418604485539e-06, "loss": 0.3115, "num_input_tokens_seen": 17300536, "step": 26350 }, { "epoch": 15.539504716981131, "grad_norm": 2.9826176166534424, "learning_rate": 1.4405346034401597e-06, "loss": 0.3371, "num_input_tokens_seen": 17302968, "step": 26355 }, { "epoch": 15.54245283018868, "grad_norm": 4.470551013946533, "learning_rate": 1.4387282888155695e-06, "loss": 0.3612, "num_input_tokens_seen": 17306392, "step": 26360 }, { "epoch": 15.545400943396226, "grad_norm": 3.0213968753814697, "learning_rate": 1.436922917053013e-06, "loss": 0.3384, "num_input_tokens_seen": 17309336, "step": 26365 }, { "epoch": 15.548349056603774, "grad_norm": 2.66776704788208, "learning_rate": 1.4351184886304686e-06, "loss": 0.2658, "num_input_tokens_seen": 17312568, "step": 26370 }, { "epoch": 15.55129716981132, "grad_norm": 2.992781639099121, "learning_rate": 1.4333150040256699e-06, "loss": 0.3015, "num_input_tokens_seen": 17315416, "step": 26375 }, { "epoch": 15.554245283018869, "grad_norm": 3.2246501445770264, "learning_rate": 1.4315124637160954e-06, "loss": 0.3408, "num_input_tokens_seen": 17319192, "step": 26380 }, { "epoch": 15.557193396226415, "grad_norm": 4.957902908325195, "learning_rate": 1.4297108681789752e-06, "loss": 0.3664, "num_input_tokens_seen": 17321880, "step": 26385 }, { "epoch": 15.560141509433961, "grad_norm": 5.935503005981445, "learning_rate": 1.4279102178912902e-06, "loss": 0.3305, "num_input_tokens_seen": 17324824, "step": 26390 }, { "epoch": 15.56308962264151, "grad_norm": 3.2440459728240967, "learning_rate": 1.4261105133297693e-06, "loss": 0.3039, "num_input_tokens_seen": 17327576, "step": 26395 }, { "epoch": 15.566037735849056, "grad_norm": 3.817844867706299, "learning_rate": 1.4243117549708913e-06, "loss": 0.3892, "num_input_tokens_seen": 17330552, "step": 26400 }, { "epoch": 15.568985849056604, "grad_norm": 3.626891613006592, "learning_rate": 1.422513943290888e-06, "loss": 0.397, "num_input_tokens_seen": 17333240, "step": 26405 }, { "epoch": 15.57193396226415, "grad_norm": 14.86973762512207, "learning_rate": 1.4207170787657365e-06, "loss": 0.4106, "num_input_tokens_seen": 17336696, "step": 26410 }, { "epoch": 15.574882075471699, "grad_norm": 4.32824182510376, "learning_rate": 1.4189211618711646e-06, "loss": 0.2888, "num_input_tokens_seen": 17339448, "step": 26415 }, { "epoch": 15.577830188679245, "grad_norm": 2.490678310394287, "learning_rate": 1.417126193082648e-06, "loss": 0.3434, "num_input_tokens_seen": 17342648, "step": 26420 }, { "epoch": 15.580778301886792, "grad_norm": 3.4098474979400635, "learning_rate": 1.4153321728754133e-06, "loss": 0.2697, "num_input_tokens_seen": 17344760, "step": 26425 }, { "epoch": 15.58372641509434, "grad_norm": 3.3430137634277344, "learning_rate": 1.4135391017244338e-06, "loss": 0.3093, "num_input_tokens_seen": 17348472, "step": 26430 }, { "epoch": 15.586674528301886, "grad_norm": 1.8430302143096924, "learning_rate": 1.4117469801044332e-06, "loss": 0.3578, "num_input_tokens_seen": 17352280, "step": 26435 }, { "epoch": 15.589622641509434, "grad_norm": 4.7937445640563965, "learning_rate": 1.4099558084898862e-06, "loss": 0.4175, "num_input_tokens_seen": 17355960, "step": 26440 }, { "epoch": 15.59257075471698, "grad_norm": 3.6117372512817383, "learning_rate": 1.408165587355011e-06, "loss": 0.3536, "num_input_tokens_seen": 17358904, "step": 26445 }, { "epoch": 15.595518867924529, "grad_norm": 3.7707254886627197, "learning_rate": 1.4063763171737766e-06, "loss": 0.3988, "num_input_tokens_seen": 17361624, "step": 26450 }, { "epoch": 15.598466981132075, "grad_norm": 3.021270990371704, "learning_rate": 1.4045879984198996e-06, "loss": 0.3484, "num_input_tokens_seen": 17364664, "step": 26455 }, { "epoch": 15.601415094339622, "grad_norm": 3.9046273231506348, "learning_rate": 1.4028006315668457e-06, "loss": 0.3375, "num_input_tokens_seen": 17367896, "step": 26460 }, { "epoch": 15.60436320754717, "grad_norm": 3.505500316619873, "learning_rate": 1.4010142170878261e-06, "loss": 0.3098, "num_input_tokens_seen": 17370968, "step": 26465 }, { "epoch": 15.607311320754716, "grad_norm": 3.4850101470947266, "learning_rate": 1.3992287554558042e-06, "loss": 0.2955, "num_input_tokens_seen": 17374008, "step": 26470 }, { "epoch": 15.610259433962264, "grad_norm": 4.7869553565979, "learning_rate": 1.3974442471434885e-06, "loss": 0.2599, "num_input_tokens_seen": 17377784, "step": 26475 }, { "epoch": 15.61320754716981, "grad_norm": 2.3475334644317627, "learning_rate": 1.395660692623334e-06, "loss": 0.4098, "num_input_tokens_seen": 17380952, "step": 26480 }, { "epoch": 15.616155660377359, "grad_norm": 3.3226253986358643, "learning_rate": 1.3938780923675454e-06, "loss": 0.2894, "num_input_tokens_seen": 17383864, "step": 26485 }, { "epoch": 15.619103773584905, "grad_norm": 5.057884216308594, "learning_rate": 1.3920964468480718e-06, "loss": 0.3921, "num_input_tokens_seen": 17388600, "step": 26490 }, { "epoch": 15.622051886792454, "grad_norm": 3.1410560607910156, "learning_rate": 1.3903157565366143e-06, "loss": 0.3791, "num_input_tokens_seen": 17391736, "step": 26495 }, { "epoch": 15.625, "grad_norm": 1.9424118995666504, "learning_rate": 1.3885360219046172e-06, "loss": 0.2188, "num_input_tokens_seen": 17395928, "step": 26500 }, { "epoch": 15.627948113207546, "grad_norm": 8.50379467010498, "learning_rate": 1.386757243423273e-06, "loss": 0.4018, "num_input_tokens_seen": 17398904, "step": 26505 }, { "epoch": 15.630896226415095, "grad_norm": 3.935452699661255, "learning_rate": 1.384979421563521e-06, "loss": 0.2632, "num_input_tokens_seen": 17403576, "step": 26510 }, { "epoch": 15.633844339622641, "grad_norm": 2.1347835063934326, "learning_rate": 1.3832025567960465e-06, "loss": 0.2331, "num_input_tokens_seen": 17406072, "step": 26515 }, { "epoch": 15.63679245283019, "grad_norm": 4.186219215393066, "learning_rate": 1.3814266495912815e-06, "loss": 0.2185, "num_input_tokens_seen": 17409688, "step": 26520 }, { "epoch": 15.639740566037736, "grad_norm": 6.017496585845947, "learning_rate": 1.3796517004194078e-06, "loss": 0.3276, "num_input_tokens_seen": 17413240, "step": 26525 }, { "epoch": 15.642688679245284, "grad_norm": 2.655961513519287, "learning_rate": 1.3778777097503476e-06, "loss": 0.348, "num_input_tokens_seen": 17416376, "step": 26530 }, { "epoch": 15.64563679245283, "grad_norm": 4.057333469390869, "learning_rate": 1.3761046780537757e-06, "loss": 0.494, "num_input_tokens_seen": 17419416, "step": 26535 }, { "epoch": 15.648584905660378, "grad_norm": 4.3250861167907715, "learning_rate": 1.3743326057991086e-06, "loss": 0.3674, "num_input_tokens_seen": 17422392, "step": 26540 }, { "epoch": 15.651533018867925, "grad_norm": 4.212874889373779, "learning_rate": 1.3725614934555093e-06, "loss": 0.3838, "num_input_tokens_seen": 17425016, "step": 26545 }, { "epoch": 15.654481132075471, "grad_norm": 7.228378772735596, "learning_rate": 1.3707913414918882e-06, "loss": 0.3808, "num_input_tokens_seen": 17428504, "step": 26550 }, { "epoch": 15.65742924528302, "grad_norm": 4.081445217132568, "learning_rate": 1.3690221503768996e-06, "loss": 0.2847, "num_input_tokens_seen": 17431640, "step": 26555 }, { "epoch": 15.660377358490566, "grad_norm": 2.6809439659118652, "learning_rate": 1.3672539205789465e-06, "loss": 0.2792, "num_input_tokens_seen": 17434872, "step": 26560 }, { "epoch": 15.663325471698114, "grad_norm": 3.806696891784668, "learning_rate": 1.3654866525661737e-06, "loss": 0.2239, "num_input_tokens_seen": 17439992, "step": 26565 }, { "epoch": 15.66627358490566, "grad_norm": 2.5632646083831787, "learning_rate": 1.3637203468064741e-06, "loss": 0.3521, "num_input_tokens_seen": 17443864, "step": 26570 }, { "epoch": 15.669221698113208, "grad_norm": 8.522326469421387, "learning_rate": 1.3619550037674838e-06, "loss": 0.3565, "num_input_tokens_seen": 17447000, "step": 26575 }, { "epoch": 15.672169811320755, "grad_norm": 3.544565439224243, "learning_rate": 1.3601906239165857e-06, "loss": 0.4054, "num_input_tokens_seen": 17449592, "step": 26580 }, { "epoch": 15.675117924528301, "grad_norm": 4.025323867797852, "learning_rate": 1.3584272077209048e-06, "loss": 0.2892, "num_input_tokens_seen": 17453016, "step": 26585 }, { "epoch": 15.67806603773585, "grad_norm": 2.6860342025756836, "learning_rate": 1.3566647556473168e-06, "loss": 0.3051, "num_input_tokens_seen": 17455832, "step": 26590 }, { "epoch": 15.681014150943396, "grad_norm": 3.9674694538116455, "learning_rate": 1.3549032681624363e-06, "loss": 0.2414, "num_input_tokens_seen": 17459192, "step": 26595 }, { "epoch": 15.683962264150944, "grad_norm": 5.305192470550537, "learning_rate": 1.3531427457326252e-06, "loss": 0.422, "num_input_tokens_seen": 17463704, "step": 26600 }, { "epoch": 15.68691037735849, "grad_norm": 2.2905664443969727, "learning_rate": 1.3513831888239893e-06, "loss": 0.4217, "num_input_tokens_seen": 17468184, "step": 26605 }, { "epoch": 15.689858490566039, "grad_norm": 3.116577386856079, "learning_rate": 1.3496245979023786e-06, "loss": 0.2571, "num_input_tokens_seen": 17470936, "step": 26610 }, { "epoch": 15.692806603773585, "grad_norm": 2.8455140590667725, "learning_rate": 1.3478669734333865e-06, "loss": 0.3178, "num_input_tokens_seen": 17474488, "step": 26615 }, { "epoch": 15.695754716981131, "grad_norm": 3.274867296218872, "learning_rate": 1.3461103158823546e-06, "loss": 0.3474, "num_input_tokens_seen": 17478072, "step": 26620 }, { "epoch": 15.69870283018868, "grad_norm": 6.016503810882568, "learning_rate": 1.3443546257143624e-06, "loss": 0.2996, "num_input_tokens_seen": 17481048, "step": 26625 }, { "epoch": 15.701650943396226, "grad_norm": 4.651987552642822, "learning_rate": 1.3425999033942395e-06, "loss": 0.3563, "num_input_tokens_seen": 17484312, "step": 26630 }, { "epoch": 15.704599056603774, "grad_norm": 5.377594470977783, "learning_rate": 1.3408461493865549e-06, "loss": 0.3034, "num_input_tokens_seen": 17487032, "step": 26635 }, { "epoch": 15.70754716981132, "grad_norm": 3.3961989879608154, "learning_rate": 1.339093364155622e-06, "loss": 0.3036, "num_input_tokens_seen": 17489752, "step": 26640 }, { "epoch": 15.710495283018869, "grad_norm": 4.377021789550781, "learning_rate": 1.3373415481654988e-06, "loss": 0.3397, "num_input_tokens_seen": 17492312, "step": 26645 }, { "epoch": 15.713443396226415, "grad_norm": 5.121591091156006, "learning_rate": 1.335590701879984e-06, "loss": 0.2643, "num_input_tokens_seen": 17495352, "step": 26650 }, { "epoch": 15.716391509433961, "grad_norm": 4.2189202308654785, "learning_rate": 1.3338408257626257e-06, "loss": 0.3642, "num_input_tokens_seen": 17498296, "step": 26655 }, { "epoch": 15.71933962264151, "grad_norm": 2.7994470596313477, "learning_rate": 1.3320919202767086e-06, "loss": 0.3583, "num_input_tokens_seen": 17501656, "step": 26660 }, { "epoch": 15.722287735849056, "grad_norm": 2.627258062362671, "learning_rate": 1.3303439858852636e-06, "loss": 0.3254, "num_input_tokens_seen": 17504408, "step": 26665 }, { "epoch": 15.725235849056604, "grad_norm": 6.041173934936523, "learning_rate": 1.3285970230510636e-06, "loss": 0.3651, "num_input_tokens_seen": 17508664, "step": 26670 }, { "epoch": 15.72818396226415, "grad_norm": 3.6427416801452637, "learning_rate": 1.3268510322366246e-06, "loss": 0.2838, "num_input_tokens_seen": 17512088, "step": 26675 }, { "epoch": 15.731132075471699, "grad_norm": 2.7252538204193115, "learning_rate": 1.3251060139042038e-06, "loss": 0.2953, "num_input_tokens_seen": 17516216, "step": 26680 }, { "epoch": 15.734080188679245, "grad_norm": 4.122894763946533, "learning_rate": 1.3233619685158056e-06, "loss": 0.3868, "num_input_tokens_seen": 17519960, "step": 26685 }, { "epoch": 15.737028301886792, "grad_norm": 4.7222700119018555, "learning_rate": 1.3216188965331712e-06, "loss": 0.2352, "num_input_tokens_seen": 17523640, "step": 26690 }, { "epoch": 15.73997641509434, "grad_norm": 5.267205238342285, "learning_rate": 1.3198767984177869e-06, "loss": 0.3592, "num_input_tokens_seen": 17526616, "step": 26695 }, { "epoch": 15.742924528301886, "grad_norm": 4.793625354766846, "learning_rate": 1.3181356746308805e-06, "loss": 0.2808, "num_input_tokens_seen": 17528984, "step": 26700 }, { "epoch": 15.745872641509434, "grad_norm": 3.338239908218384, "learning_rate": 1.3163955256334226e-06, "loss": 0.3402, "num_input_tokens_seen": 17532664, "step": 26705 }, { "epoch": 15.74882075471698, "grad_norm": 4.807497024536133, "learning_rate": 1.3146563518861227e-06, "loss": 0.3343, "num_input_tokens_seen": 17536088, "step": 26710 }, { "epoch": 15.751768867924529, "grad_norm": 2.8949062824249268, "learning_rate": 1.3129181538494384e-06, "loss": 0.4125, "num_input_tokens_seen": 17538616, "step": 26715 }, { "epoch": 15.754716981132075, "grad_norm": 3.4606142044067383, "learning_rate": 1.3111809319835622e-06, "loss": 0.2901, "num_input_tokens_seen": 17542552, "step": 26720 }, { "epoch": 15.757665094339622, "grad_norm": 3.742039680480957, "learning_rate": 1.3094446867484335e-06, "loss": 0.3315, "num_input_tokens_seen": 17545592, "step": 26725 }, { "epoch": 15.76061320754717, "grad_norm": 4.031980991363525, "learning_rate": 1.3077094186037287e-06, "loss": 0.391, "num_input_tokens_seen": 17548248, "step": 26730 }, { "epoch": 15.763561320754716, "grad_norm": 4.512321472167969, "learning_rate": 1.305975128008869e-06, "loss": 0.3366, "num_input_tokens_seen": 17551768, "step": 26735 }, { "epoch": 15.766509433962264, "grad_norm": 8.970560073852539, "learning_rate": 1.304241815423014e-06, "loss": 0.3852, "num_input_tokens_seen": 17554968, "step": 26740 }, { "epoch": 15.76945754716981, "grad_norm": 3.7556183338165283, "learning_rate": 1.3025094813050655e-06, "loss": 0.3657, "num_input_tokens_seen": 17559000, "step": 26745 }, { "epoch": 15.772405660377359, "grad_norm": 4.409719467163086, "learning_rate": 1.3007781261136675e-06, "loss": 0.4075, "num_input_tokens_seen": 17562520, "step": 26750 }, { "epoch": 15.775353773584905, "grad_norm": 4.347368240356445, "learning_rate": 1.299047750307204e-06, "loss": 0.3928, "num_input_tokens_seen": 17565208, "step": 26755 }, { "epoch": 15.778301886792454, "grad_norm": 2.255911350250244, "learning_rate": 1.297318354343799e-06, "loss": 0.2918, "num_input_tokens_seen": 17568184, "step": 26760 }, { "epoch": 15.78125, "grad_norm": 3.068441152572632, "learning_rate": 1.295589938681317e-06, "loss": 0.2469, "num_input_tokens_seen": 17572216, "step": 26765 }, { "epoch": 15.784198113207546, "grad_norm": 5.339722633361816, "learning_rate": 1.2938625037773628e-06, "loss": 0.4408, "num_input_tokens_seen": 17575032, "step": 26770 }, { "epoch": 15.787146226415095, "grad_norm": 1.662153720855713, "learning_rate": 1.2921360500892843e-06, "loss": 0.2336, "num_input_tokens_seen": 17578776, "step": 26775 }, { "epoch": 15.790094339622641, "grad_norm": 2.6500344276428223, "learning_rate": 1.290410578074167e-06, "loss": 0.3092, "num_input_tokens_seen": 17581720, "step": 26780 }, { "epoch": 15.79304245283019, "grad_norm": 2.5736756324768066, "learning_rate": 1.2886860881888362e-06, "loss": 0.2736, "num_input_tokens_seen": 17585208, "step": 26785 }, { "epoch": 15.795990566037736, "grad_norm": 5.181730270385742, "learning_rate": 1.2869625808898584e-06, "loss": 0.3653, "num_input_tokens_seen": 17587960, "step": 26790 }, { "epoch": 15.798938679245284, "grad_norm": 5.308647155761719, "learning_rate": 1.2852400566335398e-06, "loss": 0.3612, "num_input_tokens_seen": 17591192, "step": 26795 }, { "epoch": 15.80188679245283, "grad_norm": 3.950026273727417, "learning_rate": 1.2835185158759244e-06, "loss": 0.2999, "num_input_tokens_seen": 17594296, "step": 26800 }, { "epoch": 15.804834905660378, "grad_norm": 3.8745715618133545, "learning_rate": 1.2817979590728009e-06, "loss": 0.416, "num_input_tokens_seen": 17599032, "step": 26805 }, { "epoch": 15.807783018867925, "grad_norm": 2.715277910232544, "learning_rate": 1.2800783866796918e-06, "loss": 0.3792, "num_input_tokens_seen": 17601784, "step": 26810 }, { "epoch": 15.810731132075471, "grad_norm": 4.309456825256348, "learning_rate": 1.2783597991518604e-06, "loss": 0.2799, "num_input_tokens_seen": 17604312, "step": 26815 }, { "epoch": 15.81367924528302, "grad_norm": 2.5418155193328857, "learning_rate": 1.2766421969443131e-06, "loss": 0.41, "num_input_tokens_seen": 17607960, "step": 26820 }, { "epoch": 15.816627358490566, "grad_norm": 14.294421195983887, "learning_rate": 1.274925580511791e-06, "loss": 0.5092, "num_input_tokens_seen": 17611608, "step": 26825 }, { "epoch": 15.819575471698114, "grad_norm": 4.857864856719971, "learning_rate": 1.2732099503087757e-06, "loss": 0.2758, "num_input_tokens_seen": 17614840, "step": 26830 }, { "epoch": 15.82252358490566, "grad_norm": 3.24558424949646, "learning_rate": 1.2714953067894859e-06, "loss": 0.389, "num_input_tokens_seen": 17617816, "step": 26835 }, { "epoch": 15.825471698113208, "grad_norm": 2.693751573562622, "learning_rate": 1.2697816504078847e-06, "loss": 0.3192, "num_input_tokens_seen": 17620824, "step": 26840 }, { "epoch": 15.828419811320755, "grad_norm": 3.0342462062835693, "learning_rate": 1.2680689816176672e-06, "loss": 0.2898, "num_input_tokens_seen": 17623896, "step": 26845 }, { "epoch": 15.831367924528301, "grad_norm": 3.7872226238250732, "learning_rate": 1.2663573008722707e-06, "loss": 0.3497, "num_input_tokens_seen": 17626968, "step": 26850 }, { "epoch": 15.83431603773585, "grad_norm": 4.544588088989258, "learning_rate": 1.2646466086248698e-06, "loss": 0.3141, "num_input_tokens_seen": 17629720, "step": 26855 }, { "epoch": 15.837264150943396, "grad_norm": 4.127427577972412, "learning_rate": 1.2629369053283779e-06, "loss": 0.3301, "num_input_tokens_seen": 17633336, "step": 26860 }, { "epoch": 15.840212264150944, "grad_norm": 3.3453783988952637, "learning_rate": 1.2612281914354452e-06, "loss": 0.3834, "num_input_tokens_seen": 17636088, "step": 26865 }, { "epoch": 15.84316037735849, "grad_norm": 2.2840962409973145, "learning_rate": 1.259520467398463e-06, "loss": 0.4962, "num_input_tokens_seen": 17643288, "step": 26870 }, { "epoch": 15.846108490566039, "grad_norm": 3.671475410461426, "learning_rate": 1.2578137336695573e-06, "loss": 0.3589, "num_input_tokens_seen": 17646104, "step": 26875 }, { "epoch": 15.849056603773585, "grad_norm": 4.270492076873779, "learning_rate": 1.256107990700594e-06, "loss": 0.3389, "num_input_tokens_seen": 17649528, "step": 26880 }, { "epoch": 15.852004716981131, "grad_norm": 4.333379745483398, "learning_rate": 1.2544032389431753e-06, "loss": 0.3512, "num_input_tokens_seen": 17652536, "step": 26885 }, { "epoch": 15.85495283018868, "grad_norm": 3.7302944660186768, "learning_rate": 1.2526994788486418e-06, "loss": 0.3474, "num_input_tokens_seen": 17655768, "step": 26890 }, { "epoch": 15.857900943396226, "grad_norm": 3.131018877029419, "learning_rate": 1.2509967108680697e-06, "loss": 0.4891, "num_input_tokens_seen": 17658392, "step": 26895 }, { "epoch": 15.860849056603774, "grad_norm": 3.471769094467163, "learning_rate": 1.249294935452277e-06, "loss": 0.3007, "num_input_tokens_seen": 17661336, "step": 26900 }, { "epoch": 15.86379716981132, "grad_norm": 2.974522352218628, "learning_rate": 1.247594153051815e-06, "loss": 0.3263, "num_input_tokens_seen": 17665144, "step": 26905 }, { "epoch": 15.866745283018869, "grad_norm": 1.9907454252243042, "learning_rate": 1.2458943641169718e-06, "loss": 0.4486, "num_input_tokens_seen": 17669560, "step": 26910 }, { "epoch": 15.869693396226415, "grad_norm": 2.214097738265991, "learning_rate": 1.2441955690977758e-06, "loss": 0.3126, "num_input_tokens_seen": 17672984, "step": 26915 }, { "epoch": 15.872641509433961, "grad_norm": 2.966783285140991, "learning_rate": 1.2424977684439898e-06, "loss": 0.3761, "num_input_tokens_seen": 17676216, "step": 26920 }, { "epoch": 15.87558962264151, "grad_norm": 4.879164218902588, "learning_rate": 1.2408009626051137e-06, "loss": 0.3375, "num_input_tokens_seen": 17678840, "step": 26925 }, { "epoch": 15.878537735849056, "grad_norm": 3.747241973876953, "learning_rate": 1.2391051520303826e-06, "loss": 0.2913, "num_input_tokens_seen": 17682168, "step": 26930 }, { "epoch": 15.881485849056604, "grad_norm": 3.2364001274108887, "learning_rate": 1.2374103371687723e-06, "loss": 0.3088, "num_input_tokens_seen": 17685848, "step": 26935 }, { "epoch": 15.88443396226415, "grad_norm": 5.6982316970825195, "learning_rate": 1.2357165184689906e-06, "loss": 0.444, "num_input_tokens_seen": 17688568, "step": 26940 }, { "epoch": 15.887382075471699, "grad_norm": 2.758070230484009, "learning_rate": 1.2340236963794845e-06, "loss": 0.3661, "num_input_tokens_seen": 17692920, "step": 26945 }, { "epoch": 15.890330188679245, "grad_norm": 6.051400661468506, "learning_rate": 1.232331871348435e-06, "loss": 0.4912, "num_input_tokens_seen": 17695992, "step": 26950 }, { "epoch": 15.893278301886792, "grad_norm": 5.47653865814209, "learning_rate": 1.2306410438237603e-06, "loss": 0.3967, "num_input_tokens_seen": 17698712, "step": 26955 }, { "epoch": 15.89622641509434, "grad_norm": 2.576843738555908, "learning_rate": 1.228951214253113e-06, "loss": 0.3871, "num_input_tokens_seen": 17701112, "step": 26960 }, { "epoch": 15.899174528301886, "grad_norm": 2.6388909816741943, "learning_rate": 1.2272623830838854e-06, "loss": 0.1971, "num_input_tokens_seen": 17704184, "step": 26965 }, { "epoch": 15.902122641509434, "grad_norm": 3.4307496547698975, "learning_rate": 1.2255745507632016e-06, "loss": 0.3711, "num_input_tokens_seen": 17707320, "step": 26970 }, { "epoch": 15.90507075471698, "grad_norm": 1.6098686456680298, "learning_rate": 1.223887717737922e-06, "loss": 0.2853, "num_input_tokens_seen": 17714744, "step": 26975 }, { "epoch": 15.908018867924529, "grad_norm": 6.802907466888428, "learning_rate": 1.2222018844546434e-06, "loss": 0.366, "num_input_tokens_seen": 17717432, "step": 26980 }, { "epoch": 15.910966981132075, "grad_norm": 3.302823305130005, "learning_rate": 1.2205170513596975e-06, "loss": 0.3585, "num_input_tokens_seen": 17720760, "step": 26985 }, { "epoch": 15.913915094339622, "grad_norm": 2.66294527053833, "learning_rate": 1.2188332188991493e-06, "loss": 0.2654, "num_input_tokens_seen": 17724120, "step": 26990 }, { "epoch": 15.91686320754717, "grad_norm": 3.7573587894439697, "learning_rate": 1.217150387518804e-06, "loss": 0.2666, "num_input_tokens_seen": 17726744, "step": 26995 }, { "epoch": 15.919811320754716, "grad_norm": 3.777942180633545, "learning_rate": 1.2154685576641967e-06, "loss": 0.2964, "num_input_tokens_seen": 17729464, "step": 27000 }, { "epoch": 15.922759433962264, "grad_norm": 5.268105506896973, "learning_rate": 1.2137877297805972e-06, "loss": 0.3298, "num_input_tokens_seen": 17732536, "step": 27005 }, { "epoch": 15.92570754716981, "grad_norm": 3.181459903717041, "learning_rate": 1.2121079043130162e-06, "loss": 0.3322, "num_input_tokens_seen": 17735768, "step": 27010 }, { "epoch": 15.928655660377359, "grad_norm": 3.4578237533569336, "learning_rate": 1.210429081706192e-06, "loss": 0.2266, "num_input_tokens_seen": 17737912, "step": 27015 }, { "epoch": 15.931603773584905, "grad_norm": 3.6956028938293457, "learning_rate": 1.2087512624046005e-06, "loss": 0.4045, "num_input_tokens_seen": 17741528, "step": 27020 }, { "epoch": 15.934551886792454, "grad_norm": 6.089264392852783, "learning_rate": 1.2070744468524503e-06, "loss": 0.3702, "num_input_tokens_seen": 17745400, "step": 27025 }, { "epoch": 15.9375, "grad_norm": 5.460577964782715, "learning_rate": 1.2053986354936887e-06, "loss": 0.3988, "num_input_tokens_seen": 17749336, "step": 27030 }, { "epoch": 15.940448113207546, "grad_norm": 5.213695526123047, "learning_rate": 1.2037238287719916e-06, "loss": 0.2471, "num_input_tokens_seen": 17752344, "step": 27035 }, { "epoch": 15.943396226415095, "grad_norm": 4.0175042152404785, "learning_rate": 1.2020500271307721e-06, "loss": 0.3271, "num_input_tokens_seen": 17755288, "step": 27040 }, { "epoch": 15.946344339622641, "grad_norm": 2.7279775142669678, "learning_rate": 1.200377231013176e-06, "loss": 0.4004, "num_input_tokens_seen": 17758648, "step": 27045 }, { "epoch": 15.94929245283019, "grad_norm": 1.827243447303772, "learning_rate": 1.1987054408620825e-06, "loss": 0.3052, "num_input_tokens_seen": 17762104, "step": 27050 }, { "epoch": 15.952240566037736, "grad_norm": 2.4908857345581055, "learning_rate": 1.197034657120107e-06, "loss": 0.3476, "num_input_tokens_seen": 17765848, "step": 27055 }, { "epoch": 15.955188679245284, "grad_norm": 4.642092227935791, "learning_rate": 1.1953648802295964e-06, "loss": 0.4192, "num_input_tokens_seen": 17768472, "step": 27060 }, { "epoch": 15.95813679245283, "grad_norm": 1.9981095790863037, "learning_rate": 1.1936961106326307e-06, "loss": 0.4212, "num_input_tokens_seen": 17771608, "step": 27065 }, { "epoch": 15.961084905660378, "grad_norm": 3.4245474338531494, "learning_rate": 1.1920283487710237e-06, "loss": 0.1882, "num_input_tokens_seen": 17774488, "step": 27070 }, { "epoch": 15.964033018867925, "grad_norm": 3.8815197944641113, "learning_rate": 1.1903615950863228e-06, "loss": 0.4026, "num_input_tokens_seen": 17776856, "step": 27075 }, { "epoch": 15.966981132075471, "grad_norm": 3.32802152633667, "learning_rate": 1.1886958500198076e-06, "loss": 0.263, "num_input_tokens_seen": 17780920, "step": 27080 }, { "epoch": 15.96992924528302, "grad_norm": 2.808011293411255, "learning_rate": 1.1870311140124923e-06, "loss": 0.2455, "num_input_tokens_seen": 17784152, "step": 27085 }, { "epoch": 15.972877358490566, "grad_norm": 2.3417136669158936, "learning_rate": 1.185367387505123e-06, "loss": 0.3268, "num_input_tokens_seen": 17787544, "step": 27090 }, { "epoch": 15.975825471698114, "grad_norm": 4.2579264640808105, "learning_rate": 1.1837046709381783e-06, "loss": 0.3393, "num_input_tokens_seen": 17791384, "step": 27095 }, { "epoch": 15.97877358490566, "grad_norm": 14.852094650268555, "learning_rate": 1.1820429647518678e-06, "loss": 0.3514, "num_input_tokens_seen": 17794520, "step": 27100 }, { "epoch": 15.981721698113208, "grad_norm": 4.159623146057129, "learning_rate": 1.1803822693861377e-06, "loss": 0.4594, "num_input_tokens_seen": 17797496, "step": 27105 }, { "epoch": 15.984669811320755, "grad_norm": 3.9878251552581787, "learning_rate": 1.1787225852806639e-06, "loss": 0.2767, "num_input_tokens_seen": 17799896, "step": 27110 }, { "epoch": 15.987617924528301, "grad_norm": 2.502535820007324, "learning_rate": 1.177063912874853e-06, "loss": 0.3362, "num_input_tokens_seen": 17802840, "step": 27115 }, { "epoch": 15.99056603773585, "grad_norm": 4.294631004333496, "learning_rate": 1.1754062526078487e-06, "loss": 0.2965, "num_input_tokens_seen": 17805848, "step": 27120 }, { "epoch": 15.993514150943396, "grad_norm": 2.8553929328918457, "learning_rate": 1.1737496049185215e-06, "loss": 0.3593, "num_input_tokens_seen": 17808728, "step": 27125 }, { "epoch": 15.996462264150944, "grad_norm": 4.721065521240234, "learning_rate": 1.172093970245477e-06, "loss": 0.422, "num_input_tokens_seen": 17811832, "step": 27130 }, { "epoch": 15.99941037735849, "grad_norm": 3.422154188156128, "learning_rate": 1.1704393490270516e-06, "loss": 0.3497, "num_input_tokens_seen": 17815096, "step": 27135 }, { "epoch": 16.0, "eval_loss": 0.587344765663147, "eval_runtime": 18.6401, "eval_samples_per_second": 90.987, "eval_steps_per_second": 22.747, "num_input_tokens_seen": 17815160, "step": 27136 }, { "epoch": 16.00235849056604, "grad_norm": 5.597724914550781, "learning_rate": 1.1687857417013126e-06, "loss": 0.2681, "num_input_tokens_seen": 17817816, "step": 27140 }, { "epoch": 16.005306603773583, "grad_norm": 3.58742094039917, "learning_rate": 1.1671331487060583e-06, "loss": 0.3911, "num_input_tokens_seen": 17821592, "step": 27145 }, { "epoch": 16.00825471698113, "grad_norm": 3.2295289039611816, "learning_rate": 1.1654815704788237e-06, "loss": 0.236, "num_input_tokens_seen": 17824952, "step": 27150 }, { "epoch": 16.01120283018868, "grad_norm": 2.443837881088257, "learning_rate": 1.1638310074568687e-06, "loss": 0.2067, "num_input_tokens_seen": 17827448, "step": 27155 }, { "epoch": 16.014150943396228, "grad_norm": 3.471554756164551, "learning_rate": 1.162181460077188e-06, "loss": 0.2938, "num_input_tokens_seen": 17830712, "step": 27160 }, { "epoch": 16.017099056603772, "grad_norm": 4.148375988006592, "learning_rate": 1.1605329287765056e-06, "loss": 0.3538, "num_input_tokens_seen": 17833624, "step": 27165 }, { "epoch": 16.02004716981132, "grad_norm": 3.143738269805908, "learning_rate": 1.1588854139912775e-06, "loss": 0.3172, "num_input_tokens_seen": 17837720, "step": 27170 }, { "epoch": 16.02299528301887, "grad_norm": 2.993138313293457, "learning_rate": 1.1572389161576886e-06, "loss": 0.301, "num_input_tokens_seen": 17840952, "step": 27175 }, { "epoch": 16.025943396226417, "grad_norm": 4.074196815490723, "learning_rate": 1.15559343571166e-06, "loss": 0.4447, "num_input_tokens_seen": 17844216, "step": 27180 }, { "epoch": 16.02889150943396, "grad_norm": 2.8271608352661133, "learning_rate": 1.153948973088837e-06, "loss": 0.3247, "num_input_tokens_seen": 17848216, "step": 27185 }, { "epoch": 16.03183962264151, "grad_norm": 4.965586185455322, "learning_rate": 1.1523055287245993e-06, "loss": 0.3572, "num_input_tokens_seen": 17851640, "step": 27190 }, { "epoch": 16.034787735849058, "grad_norm": 4.617186546325684, "learning_rate": 1.150663103054056e-06, "loss": 0.2676, "num_input_tokens_seen": 17854808, "step": 27195 }, { "epoch": 16.037735849056602, "grad_norm": 3.4842655658721924, "learning_rate": 1.1490216965120438e-06, "loss": 0.2954, "num_input_tokens_seen": 17857656, "step": 27200 }, { "epoch": 16.04068396226415, "grad_norm": 3.112712860107422, "learning_rate": 1.147381309533136e-06, "loss": 0.4362, "num_input_tokens_seen": 17861272, "step": 27205 }, { "epoch": 16.0436320754717, "grad_norm": 2.899782657623291, "learning_rate": 1.1457419425516287e-06, "loss": 0.37, "num_input_tokens_seen": 17865240, "step": 27210 }, { "epoch": 16.046580188679247, "grad_norm": 3.5159847736358643, "learning_rate": 1.1441035960015544e-06, "loss": 0.3556, "num_input_tokens_seen": 17869016, "step": 27215 }, { "epoch": 16.04952830188679, "grad_norm": 3.0064563751220703, "learning_rate": 1.1424662703166716e-06, "loss": 0.3989, "num_input_tokens_seen": 17872088, "step": 27220 }, { "epoch": 16.05247641509434, "grad_norm": 3.285438060760498, "learning_rate": 1.1408299659304684e-06, "loss": 0.4309, "num_input_tokens_seen": 17875768, "step": 27225 }, { "epoch": 16.055424528301888, "grad_norm": 6.277017593383789, "learning_rate": 1.1391946832761642e-06, "loss": 0.3387, "num_input_tokens_seen": 17878488, "step": 27230 }, { "epoch": 16.058372641509433, "grad_norm": 3.5135867595672607, "learning_rate": 1.137560422786706e-06, "loss": 0.3631, "num_input_tokens_seen": 17881592, "step": 27235 }, { "epoch": 16.06132075471698, "grad_norm": 5.272532939910889, "learning_rate": 1.1359271848947712e-06, "loss": 0.1879, "num_input_tokens_seen": 17885368, "step": 27240 }, { "epoch": 16.06426886792453, "grad_norm": 3.6095991134643555, "learning_rate": 1.1342949700327688e-06, "loss": 0.3174, "num_input_tokens_seen": 17888376, "step": 27245 }, { "epoch": 16.067216981132077, "grad_norm": 3.6612179279327393, "learning_rate": 1.1326637786328332e-06, "loss": 0.3241, "num_input_tokens_seen": 17891416, "step": 27250 }, { "epoch": 16.07016509433962, "grad_norm": 6.494958400726318, "learning_rate": 1.1310336111268293e-06, "loss": 0.3388, "num_input_tokens_seen": 17894232, "step": 27255 }, { "epoch": 16.07311320754717, "grad_norm": 2.8199121952056885, "learning_rate": 1.1294044679463517e-06, "loss": 0.372, "num_input_tokens_seen": 17897848, "step": 27260 }, { "epoch": 16.076061320754718, "grad_norm": 3.255025625228882, "learning_rate": 1.1277763495227207e-06, "loss": 0.2888, "num_input_tokens_seen": 17901560, "step": 27265 }, { "epoch": 16.079009433962263, "grad_norm": 3.899280309677124, "learning_rate": 1.1261492562869913e-06, "loss": 0.306, "num_input_tokens_seen": 17904408, "step": 27270 }, { "epoch": 16.08195754716981, "grad_norm": 5.537953853607178, "learning_rate": 1.1245231886699415e-06, "loss": 0.2673, "num_input_tokens_seen": 17907864, "step": 27275 }, { "epoch": 16.08490566037736, "grad_norm": 2.0360891819000244, "learning_rate": 1.12289814710208e-06, "loss": 0.3568, "num_input_tokens_seen": 17911128, "step": 27280 }, { "epoch": 16.087853773584907, "grad_norm": 3.257258892059326, "learning_rate": 1.1212741320136433e-06, "loss": 0.3589, "num_input_tokens_seen": 17914360, "step": 27285 }, { "epoch": 16.090801886792452, "grad_norm": 3.358752489089966, "learning_rate": 1.1196511438345963e-06, "loss": 0.2458, "num_input_tokens_seen": 17918456, "step": 27290 }, { "epoch": 16.09375, "grad_norm": 2.5883824825286865, "learning_rate": 1.118029182994631e-06, "loss": 0.3079, "num_input_tokens_seen": 17921752, "step": 27295 }, { "epoch": 16.096698113207548, "grad_norm": 3.2041165828704834, "learning_rate": 1.1164082499231704e-06, "loss": 0.2647, "num_input_tokens_seen": 17924824, "step": 27300 }, { "epoch": 16.099646226415093, "grad_norm": 4.0037970542907715, "learning_rate": 1.114788345049364e-06, "loss": 0.3063, "num_input_tokens_seen": 17927384, "step": 27305 }, { "epoch": 16.10259433962264, "grad_norm": 3.9838240146636963, "learning_rate": 1.1131694688020872e-06, "loss": 0.3981, "num_input_tokens_seen": 17930392, "step": 27310 }, { "epoch": 16.10554245283019, "grad_norm": 4.091338157653809, "learning_rate": 1.1115516216099453e-06, "loss": 0.2882, "num_input_tokens_seen": 17933528, "step": 27315 }, { "epoch": 16.108490566037737, "grad_norm": 4.737424850463867, "learning_rate": 1.1099348039012698e-06, "loss": 0.3856, "num_input_tokens_seen": 17935960, "step": 27320 }, { "epoch": 16.111438679245282, "grad_norm": 3.1994595527648926, "learning_rate": 1.1083190161041202e-06, "loss": 0.2903, "num_input_tokens_seen": 17939384, "step": 27325 }, { "epoch": 16.11438679245283, "grad_norm": 2.285853147506714, "learning_rate": 1.1067042586462822e-06, "loss": 0.2743, "num_input_tokens_seen": 17942264, "step": 27330 }, { "epoch": 16.11733490566038, "grad_norm": 4.114500999450684, "learning_rate": 1.1050905319552718e-06, "loss": 0.2862, "num_input_tokens_seen": 17944792, "step": 27335 }, { "epoch": 16.120283018867923, "grad_norm": 3.092381238937378, "learning_rate": 1.1034778364583293e-06, "loss": 0.3147, "num_input_tokens_seen": 17948568, "step": 27340 }, { "epoch": 16.12323113207547, "grad_norm": 4.219966888427734, "learning_rate": 1.1018661725824231e-06, "loss": 0.4291, "num_input_tokens_seen": 17951736, "step": 27345 }, { "epoch": 16.12617924528302, "grad_norm": 3.983710289001465, "learning_rate": 1.100255540754247e-06, "loss": 0.3389, "num_input_tokens_seen": 17954616, "step": 27350 }, { "epoch": 16.129127358490567, "grad_norm": 3.3224143981933594, "learning_rate": 1.0986459414002244e-06, "loss": 0.3998, "num_input_tokens_seen": 17958200, "step": 27355 }, { "epoch": 16.132075471698112, "grad_norm": 2.546635627746582, "learning_rate": 1.0970373749465008e-06, "loss": 0.3344, "num_input_tokens_seen": 17961528, "step": 27360 }, { "epoch": 16.13502358490566, "grad_norm": 3.211247205734253, "learning_rate": 1.095429841818954e-06, "loss": 0.3845, "num_input_tokens_seen": 17964216, "step": 27365 }, { "epoch": 16.13797169811321, "grad_norm": 3.2530834674835205, "learning_rate": 1.093823342443185e-06, "loss": 0.2332, "num_input_tokens_seen": 17967448, "step": 27370 }, { "epoch": 16.140919811320753, "grad_norm": 5.781114101409912, "learning_rate": 1.0922178772445203e-06, "loss": 0.3114, "num_input_tokens_seen": 17971448, "step": 27375 }, { "epoch": 16.1438679245283, "grad_norm": 1.8719605207443237, "learning_rate": 1.0906134466480146e-06, "loss": 0.2983, "num_input_tokens_seen": 17974488, "step": 27380 }, { "epoch": 16.14681603773585, "grad_norm": 6.151439666748047, "learning_rate": 1.0890100510784473e-06, "loss": 0.3779, "num_input_tokens_seen": 17977400, "step": 27385 }, { "epoch": 16.149764150943398, "grad_norm": 4.748885154724121, "learning_rate": 1.0874076909603227e-06, "loss": 0.3852, "num_input_tokens_seen": 17979608, "step": 27390 }, { "epoch": 16.152712264150942, "grad_norm": 2.9580190181732178, "learning_rate": 1.0858063667178747e-06, "loss": 0.3241, "num_input_tokens_seen": 17983352, "step": 27395 }, { "epoch": 16.15566037735849, "grad_norm": 4.36998987197876, "learning_rate": 1.0842060787750614e-06, "loss": 0.3411, "num_input_tokens_seen": 17987544, "step": 27400 }, { "epoch": 16.15860849056604, "grad_norm": 3.3182759284973145, "learning_rate": 1.0826068275555652e-06, "loss": 0.3132, "num_input_tokens_seen": 17990328, "step": 27405 }, { "epoch": 16.161556603773583, "grad_norm": 3.3346028327941895, "learning_rate": 1.081008613482794e-06, "loss": 0.3229, "num_input_tokens_seen": 17993528, "step": 27410 }, { "epoch": 16.16450471698113, "grad_norm": 3.8553833961486816, "learning_rate": 1.079411436979883e-06, "loss": 0.3026, "num_input_tokens_seen": 17995704, "step": 27415 }, { "epoch": 16.16745283018868, "grad_norm": 3.7936484813690186, "learning_rate": 1.0778152984696905e-06, "loss": 0.2478, "num_input_tokens_seen": 17999608, "step": 27420 }, { "epoch": 16.170400943396228, "grad_norm": 3.7704906463623047, "learning_rate": 1.0762201983747993e-06, "loss": 0.3289, "num_input_tokens_seen": 18002776, "step": 27425 }, { "epoch": 16.173349056603772, "grad_norm": 4.568115711212158, "learning_rate": 1.0746261371175238e-06, "loss": 0.4367, "num_input_tokens_seen": 18005624, "step": 27430 }, { "epoch": 16.17629716981132, "grad_norm": 5.128986358642578, "learning_rate": 1.0730331151198953e-06, "loss": 0.3106, "num_input_tokens_seen": 18008344, "step": 27435 }, { "epoch": 16.17924528301887, "grad_norm": 3.7109270095825195, "learning_rate": 1.0714411328036733e-06, "loss": 0.3083, "num_input_tokens_seen": 18011416, "step": 27440 }, { "epoch": 16.182193396226417, "grad_norm": 1.9551055431365967, "learning_rate": 1.0698501905903435e-06, "loss": 0.2834, "num_input_tokens_seen": 18014296, "step": 27445 }, { "epoch": 16.18514150943396, "grad_norm": 3.539057970046997, "learning_rate": 1.0682602889011134e-06, "loss": 0.3688, "num_input_tokens_seen": 18017176, "step": 27450 }, { "epoch": 16.18808962264151, "grad_norm": 2.0777506828308105, "learning_rate": 1.0666714281569152e-06, "loss": 0.3588, "num_input_tokens_seen": 18020920, "step": 27455 }, { "epoch": 16.191037735849058, "grad_norm": 4.654979228973389, "learning_rate": 1.0650836087784095e-06, "loss": 0.301, "num_input_tokens_seen": 18023800, "step": 27460 }, { "epoch": 16.193985849056602, "grad_norm": 2.1793789863586426, "learning_rate": 1.0634968311859768e-06, "loss": 0.3563, "num_input_tokens_seen": 18027320, "step": 27465 }, { "epoch": 16.19693396226415, "grad_norm": 3.8635811805725098, "learning_rate": 1.0619110957997237e-06, "loss": 0.3456, "num_input_tokens_seen": 18030232, "step": 27470 }, { "epoch": 16.1998820754717, "grad_norm": 3.877487897872925, "learning_rate": 1.06032640303948e-06, "loss": 0.3707, "num_input_tokens_seen": 18033496, "step": 27475 }, { "epoch": 16.202830188679247, "grad_norm": 6.0411295890808105, "learning_rate": 1.0587427533248002e-06, "loss": 0.3297, "num_input_tokens_seen": 18036248, "step": 27480 }, { "epoch": 16.20577830188679, "grad_norm": 3.3776066303253174, "learning_rate": 1.057160147074961e-06, "loss": 0.2993, "num_input_tokens_seen": 18039160, "step": 27485 }, { "epoch": 16.20872641509434, "grad_norm": 2.3363077640533447, "learning_rate": 1.0555785847089657e-06, "loss": 0.3858, "num_input_tokens_seen": 18042968, "step": 27490 }, { "epoch": 16.211674528301888, "grad_norm": 3.712942123413086, "learning_rate": 1.0539980666455407e-06, "loss": 0.367, "num_input_tokens_seen": 18048376, "step": 27495 }, { "epoch": 16.214622641509433, "grad_norm": 3.3698596954345703, "learning_rate": 1.052418593303134e-06, "loss": 0.3585, "num_input_tokens_seen": 18051512, "step": 27500 }, { "epoch": 16.21757075471698, "grad_norm": 5.068997383117676, "learning_rate": 1.0508401650999178e-06, "loss": 0.2994, "num_input_tokens_seen": 18054328, "step": 27505 }, { "epoch": 16.22051886792453, "grad_norm": 3.375046968460083, "learning_rate": 1.0492627824537877e-06, "loss": 0.2597, "num_input_tokens_seen": 18057048, "step": 27510 }, { "epoch": 16.223466981132077, "grad_norm": 4.8329925537109375, "learning_rate": 1.0476864457823626e-06, "loss": 0.2625, "num_input_tokens_seen": 18060120, "step": 27515 }, { "epoch": 16.22641509433962, "grad_norm": 4.348163604736328, "learning_rate": 1.0461111555029836e-06, "loss": 0.2529, "num_input_tokens_seen": 18062840, "step": 27520 }, { "epoch": 16.22936320754717, "grad_norm": 2.507608652114868, "learning_rate": 1.0445369120327175e-06, "loss": 0.4155, "num_input_tokens_seen": 18067256, "step": 27525 }, { "epoch": 16.232311320754718, "grad_norm": 3.5649120807647705, "learning_rate": 1.0429637157883516e-06, "loss": 0.3563, "num_input_tokens_seen": 18070712, "step": 27530 }, { "epoch": 16.235259433962263, "grad_norm": 5.553269386291504, "learning_rate": 1.041391567186395e-06, "loss": 0.3068, "num_input_tokens_seen": 18073368, "step": 27535 }, { "epoch": 16.23820754716981, "grad_norm": 4.507972717285156, "learning_rate": 1.0398204666430821e-06, "loss": 0.2022, "num_input_tokens_seen": 18075992, "step": 27540 }, { "epoch": 16.24115566037736, "grad_norm": 4.235992908477783, "learning_rate": 1.0382504145743667e-06, "loss": 0.2787, "num_input_tokens_seen": 18078456, "step": 27545 }, { "epoch": 16.244103773584907, "grad_norm": 3.009700298309326, "learning_rate": 1.0366814113959294e-06, "loss": 0.3912, "num_input_tokens_seen": 18081688, "step": 27550 }, { "epoch": 16.247051886792452, "grad_norm": 3.4170737266540527, "learning_rate": 1.0351134575231697e-06, "loss": 0.4062, "num_input_tokens_seen": 18085080, "step": 27555 }, { "epoch": 16.25, "grad_norm": 2.603529930114746, "learning_rate": 1.0335465533712098e-06, "loss": 0.4116, "num_input_tokens_seen": 18089368, "step": 27560 }, { "epoch": 16.252948113207548, "grad_norm": 5.27362585067749, "learning_rate": 1.031980699354894e-06, "loss": 0.3172, "num_input_tokens_seen": 18092504, "step": 27565 }, { "epoch": 16.255896226415093, "grad_norm": 3.0618231296539307, "learning_rate": 1.03041589588879e-06, "loss": 0.4822, "num_input_tokens_seen": 18095640, "step": 27570 }, { "epoch": 16.25884433962264, "grad_norm": 5.394371509552002, "learning_rate": 1.0288521433871834e-06, "loss": 0.2679, "num_input_tokens_seen": 18098488, "step": 27575 }, { "epoch": 16.26179245283019, "grad_norm": 8.016378402709961, "learning_rate": 1.0272894422640866e-06, "loss": 0.3609, "num_input_tokens_seen": 18101848, "step": 27580 }, { "epoch": 16.264740566037737, "grad_norm": 5.287758827209473, "learning_rate": 1.0257277929332332e-06, "loss": 0.3741, "num_input_tokens_seen": 18104824, "step": 27585 }, { "epoch": 16.267688679245282, "grad_norm": 5.405674934387207, "learning_rate": 1.0241671958080745e-06, "loss": 0.3796, "num_input_tokens_seen": 18109496, "step": 27590 }, { "epoch": 16.27063679245283, "grad_norm": 7.12821102142334, "learning_rate": 1.0226076513017858e-06, "loss": 0.4435, "num_input_tokens_seen": 18112760, "step": 27595 }, { "epoch": 16.27358490566038, "grad_norm": 3.9378020763397217, "learning_rate": 1.0210491598272625e-06, "loss": 0.2394, "num_input_tokens_seen": 18114872, "step": 27600 }, { "epoch": 16.276533018867923, "grad_norm": 5.986956596374512, "learning_rate": 1.0194917217971229e-06, "loss": 0.3688, "num_input_tokens_seen": 18118872, "step": 27605 }, { "epoch": 16.27948113207547, "grad_norm": 2.462805986404419, "learning_rate": 1.0179353376237038e-06, "loss": 0.3244, "num_input_tokens_seen": 18123544, "step": 27610 }, { "epoch": 16.28242924528302, "grad_norm": 4.743433952331543, "learning_rate": 1.0163800077190672e-06, "loss": 0.3261, "num_input_tokens_seen": 18126328, "step": 27615 }, { "epoch": 16.285377358490567, "grad_norm": 4.612635135650635, "learning_rate": 1.0148257324949916e-06, "loss": 0.2514, "num_input_tokens_seen": 18129048, "step": 27620 }, { "epoch": 16.288325471698112, "grad_norm": 2.6978964805603027, "learning_rate": 1.0132725123629783e-06, "loss": 0.2992, "num_input_tokens_seen": 18133528, "step": 27625 }, { "epoch": 16.29127358490566, "grad_norm": 2.8011977672576904, "learning_rate": 1.0117203477342497e-06, "loss": 0.2456, "num_input_tokens_seen": 18136216, "step": 27630 }, { "epoch": 16.29422169811321, "grad_norm": 4.1666388511657715, "learning_rate": 1.0101692390197477e-06, "loss": 0.4258, "num_input_tokens_seen": 18139928, "step": 27635 }, { "epoch": 16.297169811320753, "grad_norm": 2.989713430404663, "learning_rate": 1.0086191866301331e-06, "loss": 0.2817, "num_input_tokens_seen": 18143224, "step": 27640 }, { "epoch": 16.3001179245283, "grad_norm": 3.3256216049194336, "learning_rate": 1.0070701909757918e-06, "loss": 0.3272, "num_input_tokens_seen": 18147416, "step": 27645 }, { "epoch": 16.30306603773585, "grad_norm": 3.6470818519592285, "learning_rate": 1.0055222524668267e-06, "loss": 0.3937, "num_input_tokens_seen": 18151608, "step": 27650 }, { "epoch": 16.306014150943398, "grad_norm": 2.731015920639038, "learning_rate": 1.00397537151306e-06, "loss": 0.3297, "num_input_tokens_seen": 18155480, "step": 27655 }, { "epoch": 16.308962264150942, "grad_norm": 1.7300302982330322, "learning_rate": 1.002429548524036e-06, "loss": 0.31, "num_input_tokens_seen": 18159544, "step": 27660 }, { "epoch": 16.31191037735849, "grad_norm": 2.8041160106658936, "learning_rate": 1.0008847839090175e-06, "loss": 0.2381, "num_input_tokens_seen": 18162712, "step": 27665 }, { "epoch": 16.31485849056604, "grad_norm": 4.076910018920898, "learning_rate": 9.993410780769862e-07, "loss": 0.3289, "num_input_tokens_seen": 18166072, "step": 27670 }, { "epoch": 16.317806603773583, "grad_norm": 5.227705001831055, "learning_rate": 9.977984314366463e-07, "loss": 0.4075, "num_input_tokens_seen": 18169112, "step": 27675 }, { "epoch": 16.32075471698113, "grad_norm": 3.399327039718628, "learning_rate": 9.962568443964216e-07, "loss": 0.2147, "num_input_tokens_seen": 18172600, "step": 27680 }, { "epoch": 16.32370283018868, "grad_norm": 2.620345115661621, "learning_rate": 9.947163173644524e-07, "loss": 0.318, "num_input_tokens_seen": 18175512, "step": 27685 }, { "epoch": 16.326650943396228, "grad_norm": 1.619612693786621, "learning_rate": 9.931768507486007e-07, "loss": 0.2509, "num_input_tokens_seen": 18178360, "step": 27690 }, { "epoch": 16.329599056603772, "grad_norm": 7.173616409301758, "learning_rate": 9.916384449564453e-07, "loss": 0.3925, "num_input_tokens_seen": 18182168, "step": 27695 }, { "epoch": 16.33254716981132, "grad_norm": 3.5758907794952393, "learning_rate": 9.90101100395287e-07, "loss": 0.4099, "num_input_tokens_seen": 18184568, "step": 27700 }, { "epoch": 16.33549528301887, "grad_norm": 3.4123687744140625, "learning_rate": 9.885648174721428e-07, "loss": 0.3003, "num_input_tokens_seen": 18187896, "step": 27705 }, { "epoch": 16.338443396226417, "grad_norm": 3.7993698120117188, "learning_rate": 9.870295965937532e-07, "loss": 0.3054, "num_input_tokens_seen": 18190616, "step": 27710 }, { "epoch": 16.34139150943396, "grad_norm": 4.141104221343994, "learning_rate": 9.854954381665727e-07, "loss": 0.3661, "num_input_tokens_seen": 18193592, "step": 27715 }, { "epoch": 16.34433962264151, "grad_norm": 5.583395004272461, "learning_rate": 9.83962342596776e-07, "loss": 0.3392, "num_input_tokens_seen": 18196920, "step": 27720 }, { "epoch": 16.347287735849058, "grad_norm": 3.221306324005127, "learning_rate": 9.824303102902576e-07, "loss": 0.2876, "num_input_tokens_seen": 18200056, "step": 27725 }, { "epoch": 16.350235849056602, "grad_norm": 4.939162731170654, "learning_rate": 9.808993416526292e-07, "loss": 0.3044, "num_input_tokens_seen": 18202936, "step": 27730 }, { "epoch": 16.35318396226415, "grad_norm": 4.423654556274414, "learning_rate": 9.793694370892204e-07, "loss": 0.3171, "num_input_tokens_seen": 18205720, "step": 27735 }, { "epoch": 16.3561320754717, "grad_norm": 2.6864817142486572, "learning_rate": 9.77840597005082e-07, "loss": 0.3051, "num_input_tokens_seen": 18209144, "step": 27740 }, { "epoch": 16.359080188679247, "grad_norm": 4.145637512207031, "learning_rate": 9.763128218049806e-07, "loss": 0.3006, "num_input_tokens_seen": 18211544, "step": 27745 }, { "epoch": 16.36202830188679, "grad_norm": 4.532238960266113, "learning_rate": 9.747861118934005e-07, "loss": 0.2929, "num_input_tokens_seen": 18215608, "step": 27750 }, { "epoch": 16.36497641509434, "grad_norm": 4.272895336151123, "learning_rate": 9.732604676745443e-07, "loss": 0.3686, "num_input_tokens_seen": 18219544, "step": 27755 }, { "epoch": 16.367924528301888, "grad_norm": 4.901117324829102, "learning_rate": 9.717358895523333e-07, "loss": 0.315, "num_input_tokens_seen": 18222488, "step": 27760 }, { "epoch": 16.370872641509433, "grad_norm": 3.711956739425659, "learning_rate": 9.702123779304074e-07, "loss": 0.4175, "num_input_tokens_seen": 18224792, "step": 27765 }, { "epoch": 16.37382075471698, "grad_norm": 2.516512393951416, "learning_rate": 9.686899332121203e-07, "loss": 0.353, "num_input_tokens_seen": 18227512, "step": 27770 }, { "epoch": 16.37676886792453, "grad_norm": 4.144463539123535, "learning_rate": 9.671685558005488e-07, "loss": 0.3421, "num_input_tokens_seen": 18229816, "step": 27775 }, { "epoch": 16.379716981132077, "grad_norm": 4.455991268157959, "learning_rate": 9.656482460984828e-07, "loss": 0.4109, "num_input_tokens_seen": 18232568, "step": 27780 }, { "epoch": 16.38266509433962, "grad_norm": 4.260944843292236, "learning_rate": 9.641290045084307e-07, "loss": 0.3918, "num_input_tokens_seen": 18235800, "step": 27785 }, { "epoch": 16.38561320754717, "grad_norm": 6.984913349151611, "learning_rate": 9.626108314326182e-07, "loss": 0.4051, "num_input_tokens_seen": 18238776, "step": 27790 }, { "epoch": 16.388561320754718, "grad_norm": 4.88116455078125, "learning_rate": 9.610937272729881e-07, "loss": 0.417, "num_input_tokens_seen": 18242392, "step": 27795 }, { "epoch": 16.391509433962263, "grad_norm": 9.309867858886719, "learning_rate": 9.595776924311996e-07, "loss": 0.337, "num_input_tokens_seen": 18245528, "step": 27800 }, { "epoch": 16.39445754716981, "grad_norm": 3.633033514022827, "learning_rate": 9.580627273086313e-07, "loss": 0.3403, "num_input_tokens_seen": 18249208, "step": 27805 }, { "epoch": 16.39740566037736, "grad_norm": 7.01216983795166, "learning_rate": 9.565488323063754e-07, "loss": 0.3212, "num_input_tokens_seen": 18252920, "step": 27810 }, { "epoch": 16.400353773584907, "grad_norm": 3.9389336109161377, "learning_rate": 9.55036007825243e-07, "loss": 0.4524, "num_input_tokens_seen": 18255480, "step": 27815 }, { "epoch": 16.403301886792452, "grad_norm": 3.9376657009124756, "learning_rate": 9.535242542657602e-07, "loss": 0.3268, "num_input_tokens_seen": 18258424, "step": 27820 }, { "epoch": 16.40625, "grad_norm": 2.1861941814422607, "learning_rate": 9.520135720281692e-07, "loss": 0.2883, "num_input_tokens_seen": 18261656, "step": 27825 }, { "epoch": 16.409198113207548, "grad_norm": 4.350306987762451, "learning_rate": 9.505039615124318e-07, "loss": 0.3315, "num_input_tokens_seen": 18265176, "step": 27830 }, { "epoch": 16.412146226415093, "grad_norm": 2.7081196308135986, "learning_rate": 9.489954231182235e-07, "loss": 0.278, "num_input_tokens_seen": 18267896, "step": 27835 }, { "epoch": 16.41509433962264, "grad_norm": 6.6619110107421875, "learning_rate": 9.474879572449352e-07, "loss": 0.3774, "num_input_tokens_seen": 18270872, "step": 27840 }, { "epoch": 16.41804245283019, "grad_norm": 3.069117546081543, "learning_rate": 9.459815642916759e-07, "loss": 0.2796, "num_input_tokens_seen": 18274168, "step": 27845 }, { "epoch": 16.420990566037737, "grad_norm": 2.2698609828948975, "learning_rate": 9.444762446572692e-07, "loss": 0.2663, "num_input_tokens_seen": 18276952, "step": 27850 }, { "epoch": 16.423938679245282, "grad_norm": 3.0263102054595947, "learning_rate": 9.429719987402541e-07, "loss": 0.2908, "num_input_tokens_seen": 18280696, "step": 27855 }, { "epoch": 16.42688679245283, "grad_norm": 4.168011665344238, "learning_rate": 9.414688269388883e-07, "loss": 0.2602, "num_input_tokens_seen": 18284088, "step": 27860 }, { "epoch": 16.42983490566038, "grad_norm": 3.4914779663085938, "learning_rate": 9.3996672965114e-07, "loss": 0.2172, "num_input_tokens_seen": 18286680, "step": 27865 }, { "epoch": 16.432783018867923, "grad_norm": 4.129406929016113, "learning_rate": 9.384657072747e-07, "loss": 0.4217, "num_input_tokens_seen": 18294136, "step": 27870 }, { "epoch": 16.43573113207547, "grad_norm": 4.401743412017822, "learning_rate": 9.369657602069676e-07, "loss": 0.4128, "num_input_tokens_seen": 18296824, "step": 27875 }, { "epoch": 16.43867924528302, "grad_norm": 5.168199062347412, "learning_rate": 9.354668888450608e-07, "loss": 0.4318, "num_input_tokens_seen": 18299288, "step": 27880 }, { "epoch": 16.441627358490567, "grad_norm": 7.160244464874268, "learning_rate": 9.339690935858125e-07, "loss": 0.406, "num_input_tokens_seen": 18302520, "step": 27885 }, { "epoch": 16.444575471698112, "grad_norm": 2.2068240642547607, "learning_rate": 9.324723748257697e-07, "loss": 0.281, "num_input_tokens_seen": 18305432, "step": 27890 }, { "epoch": 16.44752358490566, "grad_norm": 3.827772855758667, "learning_rate": 9.309767329611963e-07, "loss": 0.4372, "num_input_tokens_seen": 18309592, "step": 27895 }, { "epoch": 16.45047169811321, "grad_norm": 3.924755334854126, "learning_rate": 9.294821683880695e-07, "loss": 0.3556, "num_input_tokens_seen": 18313016, "step": 27900 }, { "epoch": 16.453419811320753, "grad_norm": 4.081784248352051, "learning_rate": 9.279886815020816e-07, "loss": 0.2618, "num_input_tokens_seen": 18315928, "step": 27905 }, { "epoch": 16.4563679245283, "grad_norm": 4.35581636428833, "learning_rate": 9.264962726986393e-07, "loss": 0.3465, "num_input_tokens_seen": 18318776, "step": 27910 }, { "epoch": 16.45931603773585, "grad_norm": 1.7629292011260986, "learning_rate": 9.250049423728652e-07, "loss": 0.1967, "num_input_tokens_seen": 18321400, "step": 27915 }, { "epoch": 16.462264150943398, "grad_norm": 2.6757290363311768, "learning_rate": 9.235146909195936e-07, "loss": 0.3422, "num_input_tokens_seen": 18324344, "step": 27920 }, { "epoch": 16.465212264150942, "grad_norm": 3.510850667953491, "learning_rate": 9.220255187333771e-07, "loss": 0.2735, "num_input_tokens_seen": 18327032, "step": 27925 }, { "epoch": 16.46816037735849, "grad_norm": 4.469859600067139, "learning_rate": 9.205374262084798e-07, "loss": 0.3801, "num_input_tokens_seen": 18330200, "step": 27930 }, { "epoch": 16.47110849056604, "grad_norm": 4.743519306182861, "learning_rate": 9.190504137388806e-07, "loss": 0.3246, "num_input_tokens_seen": 18333176, "step": 27935 }, { "epoch": 16.474056603773583, "grad_norm": 3.2612452507019043, "learning_rate": 9.175644817182722e-07, "loss": 0.243, "num_input_tokens_seen": 18335704, "step": 27940 }, { "epoch": 16.47700471698113, "grad_norm": 7.259327411651611, "learning_rate": 9.16079630540061e-07, "loss": 0.3227, "num_input_tokens_seen": 18339096, "step": 27945 }, { "epoch": 16.47995283018868, "grad_norm": 4.563156604766846, "learning_rate": 9.145958605973676e-07, "loss": 0.3128, "num_input_tokens_seen": 18342168, "step": 27950 }, { "epoch": 16.482900943396228, "grad_norm": 4.891501426696777, "learning_rate": 9.131131722830289e-07, "loss": 0.2678, "num_input_tokens_seen": 18344664, "step": 27955 }, { "epoch": 16.485849056603772, "grad_norm": 2.9108083248138428, "learning_rate": 9.116315659895892e-07, "loss": 0.2877, "num_input_tokens_seen": 18348568, "step": 27960 }, { "epoch": 16.48879716981132, "grad_norm": 4.5899434089660645, "learning_rate": 9.10151042109314e-07, "loss": 0.232, "num_input_tokens_seen": 18352152, "step": 27965 }, { "epoch": 16.49174528301887, "grad_norm": 7.729650020599365, "learning_rate": 9.086716010341767e-07, "loss": 0.3297, "num_input_tokens_seen": 18356120, "step": 27970 }, { "epoch": 16.494693396226417, "grad_norm": 2.389268159866333, "learning_rate": 9.071932431558655e-07, "loss": 0.3543, "num_input_tokens_seen": 18359544, "step": 27975 }, { "epoch": 16.49764150943396, "grad_norm": 2.678556203842163, "learning_rate": 9.057159688657824e-07, "loss": 0.4397, "num_input_tokens_seen": 18362648, "step": 27980 }, { "epoch": 16.50058962264151, "grad_norm": 5.498838424682617, "learning_rate": 9.042397785550405e-07, "loss": 0.3455, "num_input_tokens_seen": 18365176, "step": 27985 }, { "epoch": 16.503537735849058, "grad_norm": 3.1144943237304688, "learning_rate": 9.027646726144707e-07, "loss": 0.4462, "num_input_tokens_seen": 18368856, "step": 27990 }, { "epoch": 16.506485849056602, "grad_norm": 4.95502233505249, "learning_rate": 9.012906514346115e-07, "loss": 0.3822, "num_input_tokens_seen": 18372056, "step": 27995 }, { "epoch": 16.50943396226415, "grad_norm": 5.155535697937012, "learning_rate": 8.99817715405717e-07, "loss": 0.2979, "num_input_tokens_seen": 18374840, "step": 28000 }, { "epoch": 16.5123820754717, "grad_norm": 3.440028190612793, "learning_rate": 8.983458649177529e-07, "loss": 0.3116, "num_input_tokens_seen": 18378424, "step": 28005 }, { "epoch": 16.515330188679247, "grad_norm": 1.6762851476669312, "learning_rate": 8.968751003603982e-07, "loss": 0.3851, "num_input_tokens_seen": 18382712, "step": 28010 }, { "epoch": 16.51827830188679, "grad_norm": 3.403991460800171, "learning_rate": 8.95405422123043e-07, "loss": 0.3292, "num_input_tokens_seen": 18384728, "step": 28015 }, { "epoch": 16.52122641509434, "grad_norm": 3.3607165813446045, "learning_rate": 8.939368305947932e-07, "loss": 0.2784, "num_input_tokens_seen": 18387576, "step": 28020 }, { "epoch": 16.524174528301888, "grad_norm": 3.1070828437805176, "learning_rate": 8.92469326164464e-07, "loss": 0.2873, "num_input_tokens_seen": 18390648, "step": 28025 }, { "epoch": 16.527122641509433, "grad_norm": 4.204075813293457, "learning_rate": 8.910029092205829e-07, "loss": 0.2589, "num_input_tokens_seen": 18393528, "step": 28030 }, { "epoch": 16.53007075471698, "grad_norm": 3.2968909740448, "learning_rate": 8.895375801513906e-07, "loss": 0.3792, "num_input_tokens_seen": 18397752, "step": 28035 }, { "epoch": 16.53301886792453, "grad_norm": 2.5317368507385254, "learning_rate": 8.880733393448377e-07, "loss": 0.3677, "num_input_tokens_seen": 18401496, "step": 28040 }, { "epoch": 16.535966981132077, "grad_norm": 6.63867712020874, "learning_rate": 8.866101871885907e-07, "loss": 0.386, "num_input_tokens_seen": 18404728, "step": 28045 }, { "epoch": 16.53891509433962, "grad_norm": 2.887050151824951, "learning_rate": 8.851481240700249e-07, "loss": 0.2609, "num_input_tokens_seen": 18408152, "step": 28050 }, { "epoch": 16.54186320754717, "grad_norm": 5.3327202796936035, "learning_rate": 8.836871503762257e-07, "loss": 0.2476, "num_input_tokens_seen": 18411416, "step": 28055 }, { "epoch": 16.544811320754718, "grad_norm": 4.549817085266113, "learning_rate": 8.822272664939946e-07, "loss": 0.3202, "num_input_tokens_seen": 18414424, "step": 28060 }, { "epoch": 16.547759433962263, "grad_norm": 2.277540922164917, "learning_rate": 8.80768472809842e-07, "loss": 0.3576, "num_input_tokens_seen": 18418424, "step": 28065 }, { "epoch": 16.55070754716981, "grad_norm": 2.3250772953033447, "learning_rate": 8.793107697099884e-07, "loss": 0.3681, "num_input_tokens_seen": 18421688, "step": 28070 }, { "epoch": 16.55365566037736, "grad_norm": 4.620701313018799, "learning_rate": 8.778541575803673e-07, "loss": 0.2475, "num_input_tokens_seen": 18425144, "step": 28075 }, { "epoch": 16.556603773584907, "grad_norm": 4.845494270324707, "learning_rate": 8.763986368066241e-07, "loss": 0.3561, "num_input_tokens_seen": 18427512, "step": 28080 }, { "epoch": 16.559551886792452, "grad_norm": 4.700624942779541, "learning_rate": 8.749442077741138e-07, "loss": 0.2892, "num_input_tokens_seen": 18430264, "step": 28085 }, { "epoch": 16.5625, "grad_norm": 2.9744060039520264, "learning_rate": 8.734908708679024e-07, "loss": 0.3088, "num_input_tokens_seen": 18433112, "step": 28090 }, { "epoch": 16.565448113207548, "grad_norm": 2.8854174613952637, "learning_rate": 8.72038626472767e-07, "loss": 0.3891, "num_input_tokens_seen": 18435736, "step": 28095 }, { "epoch": 16.568396226415093, "grad_norm": 3.7308189868927, "learning_rate": 8.705874749731962e-07, "loss": 0.3817, "num_input_tokens_seen": 18439192, "step": 28100 }, { "epoch": 16.57134433962264, "grad_norm": 4.475008964538574, "learning_rate": 8.691374167533867e-07, "loss": 0.3262, "num_input_tokens_seen": 18442168, "step": 28105 }, { "epoch": 16.57429245283019, "grad_norm": 2.4612083435058594, "learning_rate": 8.6768845219725e-07, "loss": 0.2995, "num_input_tokens_seen": 18444920, "step": 28110 }, { "epoch": 16.577240566037737, "grad_norm": 0.7763966917991638, "learning_rate": 8.662405816884056e-07, "loss": 0.379, "num_input_tokens_seen": 18450744, "step": 28115 }, { "epoch": 16.580188679245282, "grad_norm": 2.4059553146362305, "learning_rate": 8.647938056101824e-07, "loss": 0.2827, "num_input_tokens_seen": 18454424, "step": 28120 }, { "epoch": 16.58313679245283, "grad_norm": 8.509726524353027, "learning_rate": 8.63348124345621e-07, "loss": 0.2852, "num_input_tokens_seen": 18457656, "step": 28125 }, { "epoch": 16.58608490566038, "grad_norm": 3.684553384780884, "learning_rate": 8.619035382774716e-07, "loss": 0.3341, "num_input_tokens_seen": 18460088, "step": 28130 }, { "epoch": 16.589033018867923, "grad_norm": 4.239358425140381, "learning_rate": 8.60460047788193e-07, "loss": 0.2459, "num_input_tokens_seen": 18462936, "step": 28135 }, { "epoch": 16.59198113207547, "grad_norm": 3.856100559234619, "learning_rate": 8.590176532599587e-07, "loss": 0.3144, "num_input_tokens_seen": 18466072, "step": 28140 }, { "epoch": 16.59492924528302, "grad_norm": 2.5705156326293945, "learning_rate": 8.575763550746475e-07, "loss": 0.3036, "num_input_tokens_seen": 18468888, "step": 28145 }, { "epoch": 16.597877358490567, "grad_norm": 3.4004318714141846, "learning_rate": 8.56136153613848e-07, "loss": 0.3839, "num_input_tokens_seen": 18472088, "step": 28150 }, { "epoch": 16.600825471698112, "grad_norm": 6.439693450927734, "learning_rate": 8.546970492588619e-07, "loss": 0.4111, "num_input_tokens_seen": 18474488, "step": 28155 }, { "epoch": 16.60377358490566, "grad_norm": 4.929098129272461, "learning_rate": 8.532590423906973e-07, "loss": 0.3678, "num_input_tokens_seen": 18477624, "step": 28160 }, { "epoch": 16.60672169811321, "grad_norm": 2.9495651721954346, "learning_rate": 8.518221333900728e-07, "loss": 0.2801, "num_input_tokens_seen": 18480760, "step": 28165 }, { "epoch": 16.609669811320753, "grad_norm": 4.1924943923950195, "learning_rate": 8.503863226374148e-07, "loss": 0.2703, "num_input_tokens_seen": 18483544, "step": 28170 }, { "epoch": 16.6126179245283, "grad_norm": 9.546478271484375, "learning_rate": 8.489516105128632e-07, "loss": 0.2916, "num_input_tokens_seen": 18486648, "step": 28175 }, { "epoch": 16.61556603773585, "grad_norm": 4.616825580596924, "learning_rate": 8.475179973962621e-07, "loss": 0.5232, "num_input_tokens_seen": 18490328, "step": 28180 }, { "epoch": 16.618514150943398, "grad_norm": 3.4183542728424072, "learning_rate": 8.460854836671678e-07, "loss": 0.3964, "num_input_tokens_seen": 18494104, "step": 28185 }, { "epoch": 16.621462264150942, "grad_norm": 4.43074369430542, "learning_rate": 8.446540697048445e-07, "loss": 0.2238, "num_input_tokens_seen": 18498520, "step": 28190 }, { "epoch": 16.62441037735849, "grad_norm": 6.467661380767822, "learning_rate": 8.432237558882639e-07, "loss": 0.311, "num_input_tokens_seen": 18500792, "step": 28195 }, { "epoch": 16.62735849056604, "grad_norm": 4.1161723136901855, "learning_rate": 8.417945425961083e-07, "loss": 0.2664, "num_input_tokens_seen": 18503416, "step": 28200 }, { "epoch": 16.630306603773583, "grad_norm": 3.9927473068237305, "learning_rate": 8.403664302067688e-07, "loss": 0.4173, "num_input_tokens_seen": 18506200, "step": 28205 }, { "epoch": 16.63325471698113, "grad_norm": 2.3939709663391113, "learning_rate": 8.389394190983446e-07, "loss": 0.2301, "num_input_tokens_seen": 18509272, "step": 28210 }, { "epoch": 16.63620283018868, "grad_norm": 3.9372775554656982, "learning_rate": 8.37513509648642e-07, "loss": 0.4727, "num_input_tokens_seen": 18512312, "step": 28215 }, { "epoch": 16.639150943396228, "grad_norm": 6.142547607421875, "learning_rate": 8.360887022351771e-07, "loss": 0.3273, "num_input_tokens_seen": 18514744, "step": 28220 }, { "epoch": 16.642099056603772, "grad_norm": 3.6099438667297363, "learning_rate": 8.346649972351739e-07, "loss": 0.3616, "num_input_tokens_seen": 18518136, "step": 28225 }, { "epoch": 16.64504716981132, "grad_norm": 5.010387897491455, "learning_rate": 8.33242395025563e-07, "loss": 0.3158, "num_input_tokens_seen": 18520728, "step": 28230 }, { "epoch": 16.64799528301887, "grad_norm": 3.0426371097564697, "learning_rate": 8.318208959829871e-07, "loss": 0.4195, "num_input_tokens_seen": 18524088, "step": 28235 }, { "epoch": 16.650943396226417, "grad_norm": 4.475261688232422, "learning_rate": 8.304005004837929e-07, "loss": 0.3499, "num_input_tokens_seen": 18526808, "step": 28240 }, { "epoch": 16.65389150943396, "grad_norm": 3.7106356620788574, "learning_rate": 8.289812089040344e-07, "loss": 0.2409, "num_input_tokens_seen": 18529944, "step": 28245 }, { "epoch": 16.65683962264151, "grad_norm": 5.7206573486328125, "learning_rate": 8.275630216194785e-07, "loss": 0.3778, "num_input_tokens_seen": 18533368, "step": 28250 }, { "epoch": 16.659787735849058, "grad_norm": 4.453394412994385, "learning_rate": 8.261459390055948e-07, "loss": 0.2866, "num_input_tokens_seen": 18537240, "step": 28255 }, { "epoch": 16.662735849056602, "grad_norm": 3.0740954875946045, "learning_rate": 8.24729961437562e-07, "loss": 0.3254, "num_input_tokens_seen": 18540408, "step": 28260 }, { "epoch": 16.66568396226415, "grad_norm": 3.3055999279022217, "learning_rate": 8.233150892902653e-07, "loss": 0.3968, "num_input_tokens_seen": 18544056, "step": 28265 }, { "epoch": 16.6686320754717, "grad_norm": 2.1743643283843994, "learning_rate": 8.219013229383005e-07, "loss": 0.2911, "num_input_tokens_seen": 18546936, "step": 28270 }, { "epoch": 16.671580188679247, "grad_norm": 3.1338162422180176, "learning_rate": 8.204886627559666e-07, "loss": 0.3423, "num_input_tokens_seen": 18550040, "step": 28275 }, { "epoch": 16.67452830188679, "grad_norm": 3.337329626083374, "learning_rate": 8.190771091172722e-07, "loss": 0.3482, "num_input_tokens_seen": 18552824, "step": 28280 }, { "epoch": 16.67747641509434, "grad_norm": 10.482561111450195, "learning_rate": 8.176666623959323e-07, "loss": 0.2729, "num_input_tokens_seen": 18555832, "step": 28285 }, { "epoch": 16.680424528301888, "grad_norm": 3.1228933334350586, "learning_rate": 8.162573229653681e-07, "loss": 0.2665, "num_input_tokens_seen": 18558552, "step": 28290 }, { "epoch": 16.683372641509433, "grad_norm": 2.5417709350585938, "learning_rate": 8.148490911987073e-07, "loss": 0.28, "num_input_tokens_seen": 18565752, "step": 28295 }, { "epoch": 16.68632075471698, "grad_norm": 8.018363952636719, "learning_rate": 8.134419674687876e-07, "loss": 0.3527, "num_input_tokens_seen": 18568216, "step": 28300 }, { "epoch": 16.68926886792453, "grad_norm": 3.423430919647217, "learning_rate": 8.120359521481502e-07, "loss": 0.283, "num_input_tokens_seen": 18570616, "step": 28305 }, { "epoch": 16.692216981132077, "grad_norm": 4.461760997772217, "learning_rate": 8.106310456090438e-07, "loss": 0.3392, "num_input_tokens_seen": 18573688, "step": 28310 }, { "epoch": 16.69516509433962, "grad_norm": 6.158578395843506, "learning_rate": 8.092272482234231e-07, "loss": 0.258, "num_input_tokens_seen": 18576600, "step": 28315 }, { "epoch": 16.69811320754717, "grad_norm": 5.566991329193115, "learning_rate": 8.078245603629486e-07, "loss": 0.355, "num_input_tokens_seen": 18580280, "step": 28320 }, { "epoch": 16.701061320754718, "grad_norm": 3.9424166679382324, "learning_rate": 8.0642298239899e-07, "loss": 0.249, "num_input_tokens_seen": 18583352, "step": 28325 }, { "epoch": 16.704009433962263, "grad_norm": 1.9246339797973633, "learning_rate": 8.050225147026202e-07, "loss": 0.2751, "num_input_tokens_seen": 18587288, "step": 28330 }, { "epoch": 16.70695754716981, "grad_norm": 2.1803457736968994, "learning_rate": 8.03623157644619e-07, "loss": 0.2773, "num_input_tokens_seen": 18591256, "step": 28335 }, { "epoch": 16.70990566037736, "grad_norm": 3.1033623218536377, "learning_rate": 8.022249115954728e-07, "loss": 0.339, "num_input_tokens_seen": 18594264, "step": 28340 }, { "epoch": 16.712853773584907, "grad_norm": 2.8313543796539307, "learning_rate": 8.008277769253709e-07, "loss": 0.3134, "num_input_tokens_seen": 18597496, "step": 28345 }, { "epoch": 16.715801886792452, "grad_norm": 2.437797784805298, "learning_rate": 7.994317540042135e-07, "loss": 0.3821, "num_input_tokens_seen": 18601080, "step": 28350 }, { "epoch": 16.71875, "grad_norm": 3.8334717750549316, "learning_rate": 7.980368432016017e-07, "loss": 0.3652, "num_input_tokens_seen": 18603768, "step": 28355 }, { "epoch": 16.721698113207548, "grad_norm": 3.475005865097046, "learning_rate": 7.966430448868461e-07, "loss": 0.3933, "num_input_tokens_seen": 18607096, "step": 28360 }, { "epoch": 16.724646226415093, "grad_norm": 2.615126132965088, "learning_rate": 7.952503594289601e-07, "loss": 0.2466, "num_input_tokens_seen": 18609848, "step": 28365 }, { "epoch": 16.72759433962264, "grad_norm": 3.151883602142334, "learning_rate": 7.93858787196663e-07, "loss": 0.2995, "num_input_tokens_seen": 18612504, "step": 28370 }, { "epoch": 16.73054245283019, "grad_norm": 3.3585000038146973, "learning_rate": 7.92468328558379e-07, "loss": 0.2993, "num_input_tokens_seen": 18615896, "step": 28375 }, { "epoch": 16.733490566037737, "grad_norm": 2.4752583503723145, "learning_rate": 7.910789838822386e-07, "loss": 0.4094, "num_input_tokens_seen": 18618968, "step": 28380 }, { "epoch": 16.736438679245282, "grad_norm": 7.1505303382873535, "learning_rate": 7.89690753536076e-07, "loss": 0.2911, "num_input_tokens_seen": 18622296, "step": 28385 }, { "epoch": 16.73938679245283, "grad_norm": 3.2126760482788086, "learning_rate": 7.883036378874326e-07, "loss": 0.3674, "num_input_tokens_seen": 18626840, "step": 28390 }, { "epoch": 16.74233490566038, "grad_norm": 3.2762861251831055, "learning_rate": 7.86917637303552e-07, "loss": 0.3758, "num_input_tokens_seen": 18629336, "step": 28395 }, { "epoch": 16.745283018867923, "grad_norm": 6.470501899719238, "learning_rate": 7.855327521513851e-07, "loss": 0.2627, "num_input_tokens_seen": 18632536, "step": 28400 }, { "epoch": 16.74823113207547, "grad_norm": 4.857864856719971, "learning_rate": 7.841489827975851e-07, "loss": 0.4364, "num_input_tokens_seen": 18636600, "step": 28405 }, { "epoch": 16.75117924528302, "grad_norm": 2.534759998321533, "learning_rate": 7.827663296085109e-07, "loss": 0.5275, "num_input_tokens_seen": 18641080, "step": 28410 }, { "epoch": 16.754127358490567, "grad_norm": 12.336221694946289, "learning_rate": 7.813847929502255e-07, "loss": 0.2339, "num_input_tokens_seen": 18643960, "step": 28415 }, { "epoch": 16.757075471698112, "grad_norm": 3.1567559242248535, "learning_rate": 7.800043731884982e-07, "loss": 0.2907, "num_input_tokens_seen": 18647480, "step": 28420 }, { "epoch": 16.76002358490566, "grad_norm": 2.9181084632873535, "learning_rate": 7.786250706888005e-07, "loss": 0.3285, "num_input_tokens_seen": 18650168, "step": 28425 }, { "epoch": 16.76297169811321, "grad_norm": 4.2917094230651855, "learning_rate": 7.772468858163085e-07, "loss": 0.4084, "num_input_tokens_seen": 18652472, "step": 28430 }, { "epoch": 16.765919811320753, "grad_norm": 3.5879015922546387, "learning_rate": 7.758698189359026e-07, "loss": 0.2967, "num_input_tokens_seen": 18655128, "step": 28435 }, { "epoch": 16.7688679245283, "grad_norm": 4.422234058380127, "learning_rate": 7.744938704121658e-07, "loss": 0.3835, "num_input_tokens_seen": 18659096, "step": 28440 }, { "epoch": 16.77181603773585, "grad_norm": 5.546584129333496, "learning_rate": 7.731190406093892e-07, "loss": 0.2643, "num_input_tokens_seen": 18662552, "step": 28445 }, { "epoch": 16.774764150943398, "grad_norm": 4.768032073974609, "learning_rate": 7.717453298915617e-07, "loss": 0.3643, "num_input_tokens_seen": 18665144, "step": 28450 }, { "epoch": 16.777712264150942, "grad_norm": 4.747335910797119, "learning_rate": 7.703727386223825e-07, "loss": 0.264, "num_input_tokens_seen": 18668024, "step": 28455 }, { "epoch": 16.78066037735849, "grad_norm": 4.021056652069092, "learning_rate": 7.690012671652491e-07, "loss": 0.343, "num_input_tokens_seen": 18670872, "step": 28460 }, { "epoch": 16.78360849056604, "grad_norm": 3.063950777053833, "learning_rate": 7.676309158832651e-07, "loss": 0.4046, "num_input_tokens_seen": 18674840, "step": 28465 }, { "epoch": 16.786556603773583, "grad_norm": 3.747563600540161, "learning_rate": 7.662616851392362e-07, "loss": 0.4197, "num_input_tokens_seen": 18677656, "step": 28470 }, { "epoch": 16.78950471698113, "grad_norm": 3.5824244022369385, "learning_rate": 7.648935752956732e-07, "loss": 0.3857, "num_input_tokens_seen": 18681176, "step": 28475 }, { "epoch": 16.79245283018868, "grad_norm": 2.9878032207489014, "learning_rate": 7.635265867147867e-07, "loss": 0.3214, "num_input_tokens_seen": 18684984, "step": 28480 }, { "epoch": 16.795400943396228, "grad_norm": 4.018722057342529, "learning_rate": 7.621607197584963e-07, "loss": 0.3315, "num_input_tokens_seen": 18687800, "step": 28485 }, { "epoch": 16.798349056603772, "grad_norm": 3.4111075401306152, "learning_rate": 7.607959747884186e-07, "loss": 0.3415, "num_input_tokens_seen": 18691384, "step": 28490 }, { "epoch": 16.80129716981132, "grad_norm": 2.871192216873169, "learning_rate": 7.594323521658769e-07, "loss": 0.4674, "num_input_tokens_seen": 18695192, "step": 28495 }, { "epoch": 16.80424528301887, "grad_norm": 3.4413809776306152, "learning_rate": 7.580698522518958e-07, "loss": 0.2703, "num_input_tokens_seen": 18699448, "step": 28500 }, { "epoch": 16.807193396226417, "grad_norm": 3.036454439163208, "learning_rate": 7.567084754072035e-07, "loss": 0.3208, "num_input_tokens_seen": 18702456, "step": 28505 }, { "epoch": 16.81014150943396, "grad_norm": 6.435492515563965, "learning_rate": 7.553482219922282e-07, "loss": 0.5562, "num_input_tokens_seen": 18706360, "step": 28510 }, { "epoch": 16.81308962264151, "grad_norm": 3.040978193283081, "learning_rate": 7.539890923671061e-07, "loss": 0.3062, "num_input_tokens_seen": 18709848, "step": 28515 }, { "epoch": 16.816037735849058, "grad_norm": 2.6537420749664307, "learning_rate": 7.526310868916708e-07, "loss": 0.3358, "num_input_tokens_seen": 18712728, "step": 28520 }, { "epoch": 16.818985849056602, "grad_norm": 2.6693613529205322, "learning_rate": 7.512742059254602e-07, "loss": 0.2812, "num_input_tokens_seen": 18715832, "step": 28525 }, { "epoch": 16.82193396226415, "grad_norm": 2.6472089290618896, "learning_rate": 7.499184498277151e-07, "loss": 0.3514, "num_input_tokens_seen": 18719160, "step": 28530 }, { "epoch": 16.8248820754717, "grad_norm": 4.258396148681641, "learning_rate": 7.485638189573758e-07, "loss": 0.2549, "num_input_tokens_seen": 18722392, "step": 28535 }, { "epoch": 16.827830188679247, "grad_norm": 3.3167002201080322, "learning_rate": 7.472103136730891e-07, "loss": 0.2975, "num_input_tokens_seen": 18725944, "step": 28540 }, { "epoch": 16.83077830188679, "grad_norm": 6.771842956542969, "learning_rate": 7.458579343331996e-07, "loss": 0.2553, "num_input_tokens_seen": 18729304, "step": 28545 }, { "epoch": 16.83372641509434, "grad_norm": 2.3176276683807373, "learning_rate": 7.445066812957569e-07, "loss": 0.3972, "num_input_tokens_seen": 18732728, "step": 28550 }, { "epoch": 16.836674528301888, "grad_norm": 2.70751953125, "learning_rate": 7.43156554918511e-07, "loss": 0.3613, "num_input_tokens_seen": 18736280, "step": 28555 }, { "epoch": 16.839622641509433, "grad_norm": 6.768215656280518, "learning_rate": 7.418075555589132e-07, "loss": 0.2582, "num_input_tokens_seen": 18739608, "step": 28560 }, { "epoch": 16.84257075471698, "grad_norm": 2.9533047676086426, "learning_rate": 7.404596835741168e-07, "loss": 0.395, "num_input_tokens_seen": 18742136, "step": 28565 }, { "epoch": 16.84551886792453, "grad_norm": 3.6123502254486084, "learning_rate": 7.391129393209751e-07, "loss": 0.298, "num_input_tokens_seen": 18746296, "step": 28570 }, { "epoch": 16.848466981132077, "grad_norm": 5.162701606750488, "learning_rate": 7.377673231560478e-07, "loss": 0.3192, "num_input_tokens_seen": 18748984, "step": 28575 }, { "epoch": 16.85141509433962, "grad_norm": 4.296681880950928, "learning_rate": 7.364228354355907e-07, "loss": 0.2614, "num_input_tokens_seen": 18751768, "step": 28580 }, { "epoch": 16.85436320754717, "grad_norm": 3.0825932025909424, "learning_rate": 7.350794765155627e-07, "loss": 0.3221, "num_input_tokens_seen": 18755160, "step": 28585 }, { "epoch": 16.857311320754718, "grad_norm": 11.087625503540039, "learning_rate": 7.337372467516246e-07, "loss": 0.3257, "num_input_tokens_seen": 18757752, "step": 28590 }, { "epoch": 16.860259433962263, "grad_norm": 8.609670639038086, "learning_rate": 7.323961464991369e-07, "loss": 0.3767, "num_input_tokens_seen": 18761080, "step": 28595 }, { "epoch": 16.86320754716981, "grad_norm": 3.7152347564697266, "learning_rate": 7.310561761131601e-07, "loss": 0.2283, "num_input_tokens_seen": 18770744, "step": 28600 }, { "epoch": 16.86615566037736, "grad_norm": 1.9624944925308228, "learning_rate": 7.297173359484605e-07, "loss": 0.3031, "num_input_tokens_seen": 18774744, "step": 28605 }, { "epoch": 16.869103773584907, "grad_norm": 2.836780309677124, "learning_rate": 7.283796263595e-07, "loss": 0.2821, "num_input_tokens_seen": 18777528, "step": 28610 }, { "epoch": 16.872051886792452, "grad_norm": 2.6511237621307373, "learning_rate": 7.270430477004431e-07, "loss": 0.2602, "num_input_tokens_seen": 18781176, "step": 28615 }, { "epoch": 16.875, "grad_norm": 3.3335351943969727, "learning_rate": 7.257076003251545e-07, "loss": 0.2094, "num_input_tokens_seen": 18785784, "step": 28620 }, { "epoch": 16.877948113207548, "grad_norm": 3.553370475769043, "learning_rate": 7.243732845871998e-07, "loss": 0.2516, "num_input_tokens_seen": 18788792, "step": 28625 }, { "epoch": 16.880896226415093, "grad_norm": 3.6926980018615723, "learning_rate": 7.230401008398441e-07, "loss": 0.3801, "num_input_tokens_seen": 18792536, "step": 28630 }, { "epoch": 16.88384433962264, "grad_norm": 3.012319803237915, "learning_rate": 7.217080494360546e-07, "loss": 0.3495, "num_input_tokens_seen": 18795384, "step": 28635 }, { "epoch": 16.88679245283019, "grad_norm": 4.9488701820373535, "learning_rate": 7.20377130728498e-07, "loss": 0.2656, "num_input_tokens_seen": 18797976, "step": 28640 }, { "epoch": 16.889740566037737, "grad_norm": 2.931410551071167, "learning_rate": 7.190473450695407e-07, "loss": 0.399, "num_input_tokens_seen": 18801944, "step": 28645 }, { "epoch": 16.892688679245282, "grad_norm": 3.014059066772461, "learning_rate": 7.177186928112484e-07, "loss": 0.2834, "num_input_tokens_seen": 18805496, "step": 28650 }, { "epoch": 16.89563679245283, "grad_norm": 3.6469056606292725, "learning_rate": 7.163911743053876e-07, "loss": 0.3634, "num_input_tokens_seen": 18808632, "step": 28655 }, { "epoch": 16.89858490566038, "grad_norm": 2.963620901107788, "learning_rate": 7.150647899034252e-07, "loss": 0.3274, "num_input_tokens_seen": 18811640, "step": 28660 }, { "epoch": 16.901533018867923, "grad_norm": 5.758010387420654, "learning_rate": 7.13739539956525e-07, "loss": 0.3886, "num_input_tokens_seen": 18814456, "step": 28665 }, { "epoch": 16.90448113207547, "grad_norm": 3.7026660442352295, "learning_rate": 7.124154248155562e-07, "loss": 0.3616, "num_input_tokens_seen": 18817912, "step": 28670 }, { "epoch": 16.90742924528302, "grad_norm": 2.8317723274230957, "learning_rate": 7.110924448310813e-07, "loss": 0.3485, "num_input_tokens_seen": 18820888, "step": 28675 }, { "epoch": 16.910377358490567, "grad_norm": 3.582503318786621, "learning_rate": 7.097706003533666e-07, "loss": 0.2896, "num_input_tokens_seen": 18824184, "step": 28680 }, { "epoch": 16.913325471698112, "grad_norm": 2.8434200286865234, "learning_rate": 7.084498917323751e-07, "loss": 0.3301, "num_input_tokens_seen": 18827288, "step": 28685 }, { "epoch": 16.91627358490566, "grad_norm": 3.217040777206421, "learning_rate": 7.071303193177698e-07, "loss": 0.5198, "num_input_tokens_seen": 18830168, "step": 28690 }, { "epoch": 16.91922169811321, "grad_norm": 3.2115061283111572, "learning_rate": 7.058118834589133e-07, "loss": 0.3574, "num_input_tokens_seen": 18833144, "step": 28695 }, { "epoch": 16.922169811320753, "grad_norm": 4.379009246826172, "learning_rate": 7.044945845048684e-07, "loss": 0.2705, "num_input_tokens_seen": 18836408, "step": 28700 }, { "epoch": 16.9251179245283, "grad_norm": 2.359614849090576, "learning_rate": 7.031784228043948e-07, "loss": 0.4605, "num_input_tokens_seen": 18839736, "step": 28705 }, { "epoch": 16.92806603773585, "grad_norm": 5.300109386444092, "learning_rate": 7.01863398705952e-07, "loss": 0.3703, "num_input_tokens_seen": 18842424, "step": 28710 }, { "epoch": 16.931014150943398, "grad_norm": 6.405318260192871, "learning_rate": 7.005495125576983e-07, "loss": 0.2816, "num_input_tokens_seen": 18844664, "step": 28715 }, { "epoch": 16.933962264150942, "grad_norm": 3.8221585750579834, "learning_rate": 6.99236764707491e-07, "loss": 0.2314, "num_input_tokens_seen": 18848344, "step": 28720 }, { "epoch": 16.93691037735849, "grad_norm": 5.649932384490967, "learning_rate": 6.979251555028843e-07, "loss": 0.2954, "num_input_tokens_seen": 18851128, "step": 28725 }, { "epoch": 16.93985849056604, "grad_norm": 2.1031851768493652, "learning_rate": 6.966146852911332e-07, "loss": 0.3007, "num_input_tokens_seen": 18855384, "step": 28730 }, { "epoch": 16.942806603773583, "grad_norm": 2.3273308277130127, "learning_rate": 6.953053544191923e-07, "loss": 0.3629, "num_input_tokens_seen": 18858136, "step": 28735 }, { "epoch": 16.94575471698113, "grad_norm": 2.317497968673706, "learning_rate": 6.939971632337111e-07, "loss": 0.3022, "num_input_tokens_seen": 18862776, "step": 28740 }, { "epoch": 16.94870283018868, "grad_norm": 3.8923799991607666, "learning_rate": 6.926901120810387e-07, "loss": 0.3335, "num_input_tokens_seen": 18865432, "step": 28745 }, { "epoch": 16.951650943396228, "grad_norm": 5.079744338989258, "learning_rate": 6.91384201307222e-07, "loss": 0.2604, "num_input_tokens_seen": 18868248, "step": 28750 }, { "epoch": 16.954599056603772, "grad_norm": 4.622825622558594, "learning_rate": 6.900794312580078e-07, "loss": 0.3837, "num_input_tokens_seen": 18872056, "step": 28755 }, { "epoch": 16.95754716981132, "grad_norm": 2.7267491817474365, "learning_rate": 6.887758022788377e-07, "loss": 0.332, "num_input_tokens_seen": 18875544, "step": 28760 }, { "epoch": 16.96049528301887, "grad_norm": 5.135005950927734, "learning_rate": 6.874733147148549e-07, "loss": 0.2384, "num_input_tokens_seen": 18880184, "step": 28765 }, { "epoch": 16.963443396226417, "grad_norm": 3.9961678981781006, "learning_rate": 6.861719689108987e-07, "loss": 0.3613, "num_input_tokens_seen": 18883928, "step": 28770 }, { "epoch": 16.96639150943396, "grad_norm": 5.509896278381348, "learning_rate": 6.84871765211505e-07, "loss": 0.2921, "num_input_tokens_seen": 18886584, "step": 28775 }, { "epoch": 16.96933962264151, "grad_norm": 3.214423418045044, "learning_rate": 6.835727039609086e-07, "loss": 0.2649, "num_input_tokens_seen": 18888952, "step": 28780 }, { "epoch": 16.972287735849058, "grad_norm": 6.575146675109863, "learning_rate": 6.822747855030415e-07, "loss": 0.3164, "num_input_tokens_seen": 18892056, "step": 28785 }, { "epoch": 16.975235849056602, "grad_norm": 3.448870897293091, "learning_rate": 6.809780101815322e-07, "loss": 0.2324, "num_input_tokens_seen": 18895544, "step": 28790 }, { "epoch": 16.97818396226415, "grad_norm": 4.115417003631592, "learning_rate": 6.796823783397099e-07, "loss": 0.2477, "num_input_tokens_seen": 18898520, "step": 28795 }, { "epoch": 16.9811320754717, "grad_norm": 2.891979217529297, "learning_rate": 6.783878903205976e-07, "loss": 0.3888, "num_input_tokens_seen": 18902968, "step": 28800 }, { "epoch": 16.984080188679247, "grad_norm": 3.6382412910461426, "learning_rate": 6.77094546466916e-07, "loss": 0.2836, "num_input_tokens_seen": 18906328, "step": 28805 }, { "epoch": 16.98702830188679, "grad_norm": 5.161401748657227, "learning_rate": 6.758023471210845e-07, "loss": 0.4585, "num_input_tokens_seen": 18909240, "step": 28810 }, { "epoch": 16.98997641509434, "grad_norm": 6.842303276062012, "learning_rate": 6.745112926252162e-07, "loss": 0.3217, "num_input_tokens_seen": 18913368, "step": 28815 }, { "epoch": 16.992924528301888, "grad_norm": 2.5500802993774414, "learning_rate": 6.732213833211265e-07, "loss": 0.3426, "num_input_tokens_seen": 18917688, "step": 28820 }, { "epoch": 16.995872641509433, "grad_norm": 3.589447021484375, "learning_rate": 6.719326195503218e-07, "loss": 0.3641, "num_input_tokens_seen": 18921112, "step": 28825 }, { "epoch": 16.99882075471698, "grad_norm": 3.0993971824645996, "learning_rate": 6.706450016540094e-07, "loss": 0.263, "num_input_tokens_seen": 18924728, "step": 28830 }, { "epoch": 17.00176886792453, "grad_norm": 2.8493762016296387, "learning_rate": 6.69358529973092e-07, "loss": 0.3084, "num_input_tokens_seen": 18926984, "step": 28835 }, { "epoch": 17.004716981132077, "grad_norm": 3.220813035964966, "learning_rate": 6.680732048481681e-07, "loss": 0.2964, "num_input_tokens_seen": 18929448, "step": 28840 }, { "epoch": 17.00766509433962, "grad_norm": 2.5201141834259033, "learning_rate": 6.667890266195321e-07, "loss": 0.3566, "num_input_tokens_seen": 18933096, "step": 28845 }, { "epoch": 17.01061320754717, "grad_norm": 2.598842144012451, "learning_rate": 6.655059956271759e-07, "loss": 0.2757, "num_input_tokens_seen": 18936872, "step": 28850 }, { "epoch": 17.013561320754718, "grad_norm": 2.9264302253723145, "learning_rate": 6.642241122107884e-07, "loss": 0.3397, "num_input_tokens_seen": 18940968, "step": 28855 }, { "epoch": 17.016509433962263, "grad_norm": 3.2100918292999268, "learning_rate": 6.629433767097537e-07, "loss": 0.2063, "num_input_tokens_seen": 18943304, "step": 28860 }, { "epoch": 17.01945754716981, "grad_norm": 3.553795099258423, "learning_rate": 6.616637894631517e-07, "loss": 0.3094, "num_input_tokens_seen": 18947080, "step": 28865 }, { "epoch": 17.02240566037736, "grad_norm": 3.0495049953460693, "learning_rate": 6.603853508097591e-07, "loss": 0.2202, "num_input_tokens_seen": 18950056, "step": 28870 }, { "epoch": 17.025353773584907, "grad_norm": 3.3062894344329834, "learning_rate": 6.591080610880468e-07, "loss": 0.2552, "num_input_tokens_seen": 18952904, "step": 28875 }, { "epoch": 17.028301886792452, "grad_norm": 3.2175745964050293, "learning_rate": 6.578319206361828e-07, "loss": 0.3544, "num_input_tokens_seen": 18956776, "step": 28880 }, { "epoch": 17.03125, "grad_norm": 3.3457741737365723, "learning_rate": 6.565569297920327e-07, "loss": 0.2676, "num_input_tokens_seen": 18960296, "step": 28885 }, { "epoch": 17.034198113207548, "grad_norm": 3.344038963317871, "learning_rate": 6.552830888931544e-07, "loss": 0.3541, "num_input_tokens_seen": 18962728, "step": 28890 }, { "epoch": 17.037146226415093, "grad_norm": 2.565610647201538, "learning_rate": 6.540103982768031e-07, "loss": 0.2994, "num_input_tokens_seen": 18966184, "step": 28895 }, { "epoch": 17.04009433962264, "grad_norm": 3.8383328914642334, "learning_rate": 6.527388582799293e-07, "loss": 0.4516, "num_input_tokens_seen": 18969672, "step": 28900 }, { "epoch": 17.04304245283019, "grad_norm": 3.5906169414520264, "learning_rate": 6.514684692391782e-07, "loss": 0.2716, "num_input_tokens_seen": 18972104, "step": 28905 }, { "epoch": 17.045990566037737, "grad_norm": 3.6715471744537354, "learning_rate": 6.501992314908895e-07, "loss": 0.3133, "num_input_tokens_seen": 18974600, "step": 28910 }, { "epoch": 17.048938679245282, "grad_norm": 3.4319303035736084, "learning_rate": 6.489311453711017e-07, "loss": 0.2973, "num_input_tokens_seen": 18977256, "step": 28915 }, { "epoch": 17.05188679245283, "grad_norm": 2.81569504737854, "learning_rate": 6.476642112155457e-07, "loss": 0.2562, "num_input_tokens_seen": 18980520, "step": 28920 }, { "epoch": 17.05483490566038, "grad_norm": 3.517092227935791, "learning_rate": 6.463984293596476e-07, "loss": 0.2419, "num_input_tokens_seen": 18984392, "step": 28925 }, { "epoch": 17.057783018867923, "grad_norm": 4.227409362792969, "learning_rate": 6.451338001385282e-07, "loss": 0.3698, "num_input_tokens_seen": 18987368, "step": 28930 }, { "epoch": 17.06073113207547, "grad_norm": 4.422043323516846, "learning_rate": 6.438703238870037e-07, "loss": 0.2608, "num_input_tokens_seen": 18989928, "step": 28935 }, { "epoch": 17.06367924528302, "grad_norm": 4.808384895324707, "learning_rate": 6.426080009395846e-07, "loss": 0.2235, "num_input_tokens_seen": 18992744, "step": 28940 }, { "epoch": 17.066627358490567, "grad_norm": 9.739033699035645, "learning_rate": 6.413468316304755e-07, "loss": 0.3004, "num_input_tokens_seen": 18995560, "step": 28945 }, { "epoch": 17.069575471698112, "grad_norm": 4.848001956939697, "learning_rate": 6.400868162935786e-07, "loss": 0.361, "num_input_tokens_seen": 18998600, "step": 28950 }, { "epoch": 17.07252358490566, "grad_norm": 3.145320177078247, "learning_rate": 6.388279552624877e-07, "loss": 0.2797, "num_input_tokens_seen": 19001160, "step": 28955 }, { "epoch": 17.07547169811321, "grad_norm": 4.928226947784424, "learning_rate": 6.37570248870491e-07, "loss": 0.4262, "num_input_tokens_seen": 19006376, "step": 28960 }, { "epoch": 17.078419811320753, "grad_norm": 2.9970428943634033, "learning_rate": 6.363136974505718e-07, "loss": 0.2226, "num_input_tokens_seen": 19009768, "step": 28965 }, { "epoch": 17.0813679245283, "grad_norm": 2.466669797897339, "learning_rate": 6.350583013354078e-07, "loss": 0.2805, "num_input_tokens_seen": 19013064, "step": 28970 }, { "epoch": 17.08431603773585, "grad_norm": 3.7884206771850586, "learning_rate": 6.338040608573693e-07, "loss": 0.2971, "num_input_tokens_seen": 19016424, "step": 28975 }, { "epoch": 17.087264150943398, "grad_norm": 8.298959732055664, "learning_rate": 6.325509763485238e-07, "loss": 0.3358, "num_input_tokens_seen": 19019944, "step": 28980 }, { "epoch": 17.090212264150942, "grad_norm": 4.928779602050781, "learning_rate": 6.312990481406301e-07, "loss": 0.241, "num_input_tokens_seen": 19023144, "step": 28985 }, { "epoch": 17.09316037735849, "grad_norm": 3.55210280418396, "learning_rate": 6.300482765651411e-07, "loss": 0.2877, "num_input_tokens_seen": 19025864, "step": 28990 }, { "epoch": 17.09610849056604, "grad_norm": 4.486354351043701, "learning_rate": 6.28798661953205e-07, "loss": 0.3518, "num_input_tokens_seen": 19028776, "step": 28995 }, { "epoch": 17.099056603773583, "grad_norm": 3.1636600494384766, "learning_rate": 6.275502046356618e-07, "loss": 0.3348, "num_input_tokens_seen": 19031848, "step": 29000 }, { "epoch": 17.10200471698113, "grad_norm": 7.13303279876709, "learning_rate": 6.263029049430447e-07, "loss": 0.4716, "num_input_tokens_seen": 19035112, "step": 29005 }, { "epoch": 17.10495283018868, "grad_norm": 3.6804656982421875, "learning_rate": 6.250567632055832e-07, "loss": 0.3747, "num_input_tokens_seen": 19037768, "step": 29010 }, { "epoch": 17.107900943396228, "grad_norm": 4.1229634284973145, "learning_rate": 6.238117797532e-07, "loss": 0.3282, "num_input_tokens_seen": 19041416, "step": 29015 }, { "epoch": 17.110849056603772, "grad_norm": 4.748226642608643, "learning_rate": 6.225679549155083e-07, "loss": 0.4275, "num_input_tokens_seen": 19043848, "step": 29020 }, { "epoch": 17.11379716981132, "grad_norm": 5.604130268096924, "learning_rate": 6.213252890218163e-07, "loss": 0.3371, "num_input_tokens_seen": 19046856, "step": 29025 }, { "epoch": 17.11674528301887, "grad_norm": 3.085228204727173, "learning_rate": 6.200837824011247e-07, "loss": 0.3721, "num_input_tokens_seen": 19049768, "step": 29030 }, { "epoch": 17.119693396226417, "grad_norm": 2.52500581741333, "learning_rate": 6.188434353821282e-07, "loss": 0.2594, "num_input_tokens_seen": 19053096, "step": 29035 }, { "epoch": 17.12264150943396, "grad_norm": 7.569760322570801, "learning_rate": 6.176042482932132e-07, "loss": 0.3622, "num_input_tokens_seen": 19055880, "step": 29040 }, { "epoch": 17.12558962264151, "grad_norm": 2.847083568572998, "learning_rate": 6.163662214624616e-07, "loss": 0.2855, "num_input_tokens_seen": 19058792, "step": 29045 }, { "epoch": 17.128537735849058, "grad_norm": 3.553361177444458, "learning_rate": 6.151293552176451e-07, "loss": 0.4042, "num_input_tokens_seen": 19062824, "step": 29050 }, { "epoch": 17.131485849056602, "grad_norm": 3.875861167907715, "learning_rate": 6.138936498862291e-07, "loss": 0.2448, "num_input_tokens_seen": 19066920, "step": 29055 }, { "epoch": 17.13443396226415, "grad_norm": 2.924435615539551, "learning_rate": 6.126591057953729e-07, "loss": 0.483, "num_input_tokens_seen": 19071112, "step": 29060 }, { "epoch": 17.1373820754717, "grad_norm": 3.395110845565796, "learning_rate": 6.114257232719267e-07, "loss": 0.3745, "num_input_tokens_seen": 19074088, "step": 29065 }, { "epoch": 17.140330188679247, "grad_norm": 3.4223852157592773, "learning_rate": 6.101935026424332e-07, "loss": 0.3167, "num_input_tokens_seen": 19076680, "step": 29070 }, { "epoch": 17.14327830188679, "grad_norm": 3.12040376663208, "learning_rate": 6.089624442331293e-07, "loss": 0.2716, "num_input_tokens_seen": 19080104, "step": 29075 }, { "epoch": 17.14622641509434, "grad_norm": 3.5841164588928223, "learning_rate": 6.077325483699432e-07, "loss": 0.3802, "num_input_tokens_seen": 19083112, "step": 29080 }, { "epoch": 17.149174528301888, "grad_norm": 4.024878025054932, "learning_rate": 6.065038153784947e-07, "loss": 0.3204, "num_input_tokens_seen": 19086120, "step": 29085 }, { "epoch": 17.152122641509433, "grad_norm": 6.292355537414551, "learning_rate": 6.052762455840955e-07, "loss": 0.3102, "num_input_tokens_seen": 19089832, "step": 29090 }, { "epoch": 17.15507075471698, "grad_norm": 3.6146793365478516, "learning_rate": 6.040498393117494e-07, "loss": 0.4567, "num_input_tokens_seen": 19092680, "step": 29095 }, { "epoch": 17.15801886792453, "grad_norm": 2.7120521068573, "learning_rate": 6.028245968861551e-07, "loss": 0.2666, "num_input_tokens_seen": 19095624, "step": 29100 }, { "epoch": 17.160966981132077, "grad_norm": 3.4833450317382812, "learning_rate": 6.016005186316987e-07, "loss": 0.3323, "num_input_tokens_seen": 19098216, "step": 29105 }, { "epoch": 17.16391509433962, "grad_norm": 3.1611790657043457, "learning_rate": 6.003776048724614e-07, "loss": 0.3846, "num_input_tokens_seen": 19101544, "step": 29110 }, { "epoch": 17.16686320754717, "grad_norm": 3.644649028778076, "learning_rate": 5.991558559322152e-07, "loss": 0.3282, "num_input_tokens_seen": 19104264, "step": 29115 }, { "epoch": 17.169811320754718, "grad_norm": 7.052829265594482, "learning_rate": 5.979352721344223e-07, "loss": 0.3213, "num_input_tokens_seen": 19107080, "step": 29120 }, { "epoch": 17.172759433962263, "grad_norm": 4.725963592529297, "learning_rate": 5.967158538022383e-07, "loss": 0.2506, "num_input_tokens_seen": 19109320, "step": 29125 }, { "epoch": 17.17570754716981, "grad_norm": 3.9929187297821045, "learning_rate": 5.954976012585078e-07, "loss": 0.336, "num_input_tokens_seen": 19112200, "step": 29130 }, { "epoch": 17.17865566037736, "grad_norm": 2.625795602798462, "learning_rate": 5.942805148257713e-07, "loss": 0.3476, "num_input_tokens_seen": 19115432, "step": 29135 }, { "epoch": 17.181603773584907, "grad_norm": 3.414363145828247, "learning_rate": 5.930645948262553e-07, "loss": 0.2998, "num_input_tokens_seen": 19120072, "step": 29140 }, { "epoch": 17.184551886792452, "grad_norm": 3.410490036010742, "learning_rate": 5.918498415818813e-07, "loss": 0.2731, "num_input_tokens_seen": 19122792, "step": 29145 }, { "epoch": 17.1875, "grad_norm": 4.06822395324707, "learning_rate": 5.906362554142592e-07, "loss": 0.3382, "num_input_tokens_seen": 19126120, "step": 29150 }, { "epoch": 17.190448113207548, "grad_norm": 3.1118881702423096, "learning_rate": 5.894238366446925e-07, "loss": 0.2584, "num_input_tokens_seen": 19129160, "step": 29155 }, { "epoch": 17.193396226415093, "grad_norm": 4.051974773406982, "learning_rate": 5.882125855941723e-07, "loss": 0.3095, "num_input_tokens_seen": 19131432, "step": 29160 }, { "epoch": 17.19634433962264, "grad_norm": 6.523778438568115, "learning_rate": 5.870025025833842e-07, "loss": 0.3345, "num_input_tokens_seen": 19135080, "step": 29165 }, { "epoch": 17.19929245283019, "grad_norm": 2.254842519760132, "learning_rate": 5.857935879327031e-07, "loss": 0.2662, "num_input_tokens_seen": 19138952, "step": 29170 }, { "epoch": 17.202240566037737, "grad_norm": 2.7776684761047363, "learning_rate": 5.845858419621936e-07, "loss": 0.4094, "num_input_tokens_seen": 19141544, "step": 29175 }, { "epoch": 17.205188679245282, "grad_norm": 6.308425426483154, "learning_rate": 5.83379264991612e-07, "loss": 0.3688, "num_input_tokens_seen": 19144648, "step": 29180 }, { "epoch": 17.20813679245283, "grad_norm": 3.955644130706787, "learning_rate": 5.821738573404046e-07, "loss": 0.4037, "num_input_tokens_seen": 19147560, "step": 29185 }, { "epoch": 17.21108490566038, "grad_norm": 3.456576108932495, "learning_rate": 5.80969619327707e-07, "loss": 0.2679, "num_input_tokens_seen": 19150152, "step": 29190 }, { "epoch": 17.214033018867923, "grad_norm": 2.481722116470337, "learning_rate": 5.797665512723488e-07, "loss": 0.3209, "num_input_tokens_seen": 19153512, "step": 29195 }, { "epoch": 17.21698113207547, "grad_norm": 5.754764556884766, "learning_rate": 5.785646534928452e-07, "loss": 0.2316, "num_input_tokens_seen": 19157064, "step": 29200 }, { "epoch": 17.21992924528302, "grad_norm": 2.3730812072753906, "learning_rate": 5.77363926307406e-07, "loss": 0.3053, "num_input_tokens_seen": 19159912, "step": 29205 }, { "epoch": 17.222877358490567, "grad_norm": 4.905813217163086, "learning_rate": 5.761643700339281e-07, "loss": 0.2941, "num_input_tokens_seen": 19162568, "step": 29210 }, { "epoch": 17.225825471698112, "grad_norm": 3.708552837371826, "learning_rate": 5.749659849899985e-07, "loss": 0.3255, "num_input_tokens_seen": 19165704, "step": 29215 }, { "epoch": 17.22877358490566, "grad_norm": 2.817892074584961, "learning_rate": 5.737687714928953e-07, "loss": 0.2717, "num_input_tokens_seen": 19169544, "step": 29220 }, { "epoch": 17.23172169811321, "grad_norm": 4.341379165649414, "learning_rate": 5.725727298595846e-07, "loss": 0.3568, "num_input_tokens_seen": 19172904, "step": 29225 }, { "epoch": 17.234669811320753, "grad_norm": 3.8354787826538086, "learning_rate": 5.71377860406726e-07, "loss": 0.3669, "num_input_tokens_seen": 19176328, "step": 29230 }, { "epoch": 17.2376179245283, "grad_norm": 3.520921230316162, "learning_rate": 5.701841634506655e-07, "loss": 0.346, "num_input_tokens_seen": 19180168, "step": 29235 }, { "epoch": 17.24056603773585, "grad_norm": 2.925405263900757, "learning_rate": 5.689916393074391e-07, "loss": 0.3665, "num_input_tokens_seen": 19183784, "step": 29240 }, { "epoch": 17.243514150943398, "grad_norm": 2.6695523262023926, "learning_rate": 5.678002882927725e-07, "loss": 0.3605, "num_input_tokens_seen": 19187688, "step": 29245 }, { "epoch": 17.246462264150942, "grad_norm": 5.032520771026611, "learning_rate": 5.666101107220811e-07, "loss": 0.3197, "num_input_tokens_seen": 19191144, "step": 29250 }, { "epoch": 17.24941037735849, "grad_norm": 3.4703099727630615, "learning_rate": 5.654211069104693e-07, "loss": 0.3267, "num_input_tokens_seen": 19194056, "step": 29255 }, { "epoch": 17.25235849056604, "grad_norm": 3.058375120162964, "learning_rate": 5.642332771727321e-07, "loss": 0.3407, "num_input_tokens_seen": 19196776, "step": 29260 }, { "epoch": 17.255306603773583, "grad_norm": 3.224025249481201, "learning_rate": 5.630466218233521e-07, "loss": 0.2967, "num_input_tokens_seen": 19199816, "step": 29265 }, { "epoch": 17.25825471698113, "grad_norm": 4.777052402496338, "learning_rate": 5.618611411765007e-07, "loss": 0.2453, "num_input_tokens_seen": 19203016, "step": 29270 }, { "epoch": 17.26120283018868, "grad_norm": 5.834474563598633, "learning_rate": 5.606768355460401e-07, "loss": 0.3778, "num_input_tokens_seen": 19206792, "step": 29275 }, { "epoch": 17.264150943396228, "grad_norm": 3.6066672801971436, "learning_rate": 5.594937052455191e-07, "loss": 0.3659, "num_input_tokens_seen": 19210280, "step": 29280 }, { "epoch": 17.267099056603772, "grad_norm": 3.154449462890625, "learning_rate": 5.583117505881764e-07, "loss": 0.3554, "num_input_tokens_seen": 19214472, "step": 29285 }, { "epoch": 17.27004716981132, "grad_norm": 4.76646089553833, "learning_rate": 5.571309718869417e-07, "loss": 0.3399, "num_input_tokens_seen": 19218792, "step": 29290 }, { "epoch": 17.27299528301887, "grad_norm": 4.589078903198242, "learning_rate": 5.559513694544282e-07, "loss": 0.3118, "num_input_tokens_seen": 19221992, "step": 29295 }, { "epoch": 17.275943396226417, "grad_norm": 2.3752174377441406, "learning_rate": 5.547729436029442e-07, "loss": 0.3641, "num_input_tokens_seen": 19225064, "step": 29300 }, { "epoch": 17.27889150943396, "grad_norm": 3.057640790939331, "learning_rate": 5.535956946444809e-07, "loss": 0.4223, "num_input_tokens_seen": 19229288, "step": 29305 }, { "epoch": 17.28183962264151, "grad_norm": 2.7878377437591553, "learning_rate": 5.524196228907203e-07, "loss": 0.248, "num_input_tokens_seen": 19231592, "step": 29310 }, { "epoch": 17.284787735849058, "grad_norm": 3.1303746700286865, "learning_rate": 5.512447286530326e-07, "loss": 0.2709, "num_input_tokens_seen": 19233992, "step": 29315 }, { "epoch": 17.287735849056602, "grad_norm": 1.831650972366333, "learning_rate": 5.500710122424746e-07, "loss": 0.2857, "num_input_tokens_seen": 19237096, "step": 29320 }, { "epoch": 17.29068396226415, "grad_norm": 4.88261604309082, "learning_rate": 5.488984739697961e-07, "loss": 0.2964, "num_input_tokens_seen": 19240904, "step": 29325 }, { "epoch": 17.2936320754717, "grad_norm": 3.182245969772339, "learning_rate": 5.477271141454294e-07, "loss": 0.3678, "num_input_tokens_seen": 19243784, "step": 29330 }, { "epoch": 17.296580188679247, "grad_norm": 4.308318138122559, "learning_rate": 5.465569330794974e-07, "loss": 0.3038, "num_input_tokens_seen": 19246888, "step": 29335 }, { "epoch": 17.29952830188679, "grad_norm": 5.208832263946533, "learning_rate": 5.453879310818105e-07, "loss": 0.2703, "num_input_tokens_seen": 19249512, "step": 29340 }, { "epoch": 17.30247641509434, "grad_norm": 3.77803111076355, "learning_rate": 5.442201084618664e-07, "loss": 0.2917, "num_input_tokens_seen": 19252680, "step": 29345 }, { "epoch": 17.305424528301888, "grad_norm": 4.893446922302246, "learning_rate": 5.430534655288528e-07, "loss": 0.3355, "num_input_tokens_seen": 19256808, "step": 29350 }, { "epoch": 17.308372641509433, "grad_norm": 3.942440986633301, "learning_rate": 5.418880025916428e-07, "loss": 0.3012, "num_input_tokens_seen": 19259912, "step": 29355 }, { "epoch": 17.31132075471698, "grad_norm": 2.930877447128296, "learning_rate": 5.407237199587973e-07, "loss": 0.3546, "num_input_tokens_seen": 19263784, "step": 29360 }, { "epoch": 17.31426886792453, "grad_norm": 3.339630365371704, "learning_rate": 5.395606179385654e-07, "loss": 0.2403, "num_input_tokens_seen": 19266472, "step": 29365 }, { "epoch": 17.317216981132077, "grad_norm": 4.380157470703125, "learning_rate": 5.383986968388833e-07, "loss": 0.2709, "num_input_tokens_seen": 19269128, "step": 29370 }, { "epoch": 17.32016509433962, "grad_norm": 2.5789504051208496, "learning_rate": 5.372379569673736e-07, "loss": 0.2844, "num_input_tokens_seen": 19277320, "step": 29375 }, { "epoch": 17.32311320754717, "grad_norm": 4.165277004241943, "learning_rate": 5.360783986313495e-07, "loss": 0.2983, "num_input_tokens_seen": 19281608, "step": 29380 }, { "epoch": 17.326061320754718, "grad_norm": 2.9870800971984863, "learning_rate": 5.349200221378076e-07, "loss": 0.3481, "num_input_tokens_seen": 19284808, "step": 29385 }, { "epoch": 17.329009433962263, "grad_norm": 3.9824841022491455, "learning_rate": 5.33762827793432e-07, "loss": 0.3354, "num_input_tokens_seen": 19287688, "step": 29390 }, { "epoch": 17.33195754716981, "grad_norm": 5.479090213775635, "learning_rate": 5.326068159045978e-07, "loss": 0.328, "num_input_tokens_seen": 19291080, "step": 29395 }, { "epoch": 17.33490566037736, "grad_norm": 2.4579203128814697, "learning_rate": 5.314519867773621e-07, "loss": 0.2525, "num_input_tokens_seen": 19294664, "step": 29400 }, { "epoch": 17.337853773584907, "grad_norm": 3.1371774673461914, "learning_rate": 5.302983407174711e-07, "loss": 0.3231, "num_input_tokens_seen": 19298952, "step": 29405 }, { "epoch": 17.340801886792452, "grad_norm": 2.917617082595825, "learning_rate": 5.291458780303572e-07, "loss": 0.3339, "num_input_tokens_seen": 19302216, "step": 29410 }, { "epoch": 17.34375, "grad_norm": 3.3990488052368164, "learning_rate": 5.279945990211411e-07, "loss": 0.38, "num_input_tokens_seen": 19304872, "step": 29415 }, { "epoch": 17.346698113207548, "grad_norm": 4.141565799713135, "learning_rate": 5.26844503994628e-07, "loss": 0.3709, "num_input_tokens_seen": 19308904, "step": 29420 }, { "epoch": 17.349646226415093, "grad_norm": 5.691437244415283, "learning_rate": 5.25695593255311e-07, "loss": 0.3534, "num_input_tokens_seen": 19311976, "step": 29425 }, { "epoch": 17.35259433962264, "grad_norm": 3.896350860595703, "learning_rate": 5.24547867107369e-07, "loss": 0.4191, "num_input_tokens_seen": 19315688, "step": 29430 }, { "epoch": 17.35554245283019, "grad_norm": 2.742183208465576, "learning_rate": 5.234013258546672e-07, "loss": 0.3151, "num_input_tokens_seen": 19319144, "step": 29435 }, { "epoch": 17.358490566037737, "grad_norm": 4.143150806427002, "learning_rate": 5.222559698007563e-07, "loss": 0.1952, "num_input_tokens_seen": 19322856, "step": 29440 }, { "epoch": 17.361438679245282, "grad_norm": 3.3160417079925537, "learning_rate": 5.211117992488763e-07, "loss": 0.3902, "num_input_tokens_seen": 19325736, "step": 29445 }, { "epoch": 17.36438679245283, "grad_norm": 4.854966163635254, "learning_rate": 5.199688145019505e-07, "loss": 0.4309, "num_input_tokens_seen": 19328776, "step": 29450 }, { "epoch": 17.36733490566038, "grad_norm": 2.822237968444824, "learning_rate": 5.188270158625891e-07, "loss": 0.2417, "num_input_tokens_seen": 19331464, "step": 29455 }, { "epoch": 17.370283018867923, "grad_norm": 3.780109167098999, "learning_rate": 5.176864036330875e-07, "loss": 0.2811, "num_input_tokens_seen": 19334184, "step": 29460 }, { "epoch": 17.37323113207547, "grad_norm": 4.310973167419434, "learning_rate": 5.165469781154287e-07, "loss": 0.3798, "num_input_tokens_seen": 19337032, "step": 29465 }, { "epoch": 17.37617924528302, "grad_norm": 4.559661865234375, "learning_rate": 5.154087396112789e-07, "loss": 0.3292, "num_input_tokens_seen": 19340200, "step": 29470 }, { "epoch": 17.379127358490567, "grad_norm": 5.818599700927734, "learning_rate": 5.142716884219939e-07, "loss": 0.3401, "num_input_tokens_seen": 19343048, "step": 29475 }, { "epoch": 17.382075471698112, "grad_norm": 6.56919002532959, "learning_rate": 5.131358248486118e-07, "loss": 0.2809, "num_input_tokens_seen": 19345896, "step": 29480 }, { "epoch": 17.38502358490566, "grad_norm": 3.1370582580566406, "learning_rate": 5.120011491918564e-07, "loss": 0.2689, "num_input_tokens_seen": 19348808, "step": 29485 }, { "epoch": 17.38797169811321, "grad_norm": 4.97844123840332, "learning_rate": 5.108676617521402e-07, "loss": 0.3287, "num_input_tokens_seen": 19351656, "step": 29490 }, { "epoch": 17.390919811320753, "grad_norm": 3.1310806274414062, "learning_rate": 5.097353628295571e-07, "loss": 0.3508, "num_input_tokens_seen": 19355208, "step": 29495 }, { "epoch": 17.3938679245283, "grad_norm": 3.681748151779175, "learning_rate": 5.086042527238893e-07, "loss": 0.2994, "num_input_tokens_seen": 19358088, "step": 29500 }, { "epoch": 17.39681603773585, "grad_norm": 6.031062126159668, "learning_rate": 5.074743317346009e-07, "loss": 0.2849, "num_input_tokens_seen": 19361576, "step": 29505 }, { "epoch": 17.399764150943398, "grad_norm": 3.537694215774536, "learning_rate": 5.063456001608458e-07, "loss": 0.297, "num_input_tokens_seen": 19366600, "step": 29510 }, { "epoch": 17.402712264150942, "grad_norm": 3.0628039836883545, "learning_rate": 5.052180583014599e-07, "loss": 0.3231, "num_input_tokens_seen": 19369672, "step": 29515 }, { "epoch": 17.40566037735849, "grad_norm": 2.462053060531616, "learning_rate": 5.04091706454965e-07, "loss": 0.2562, "num_input_tokens_seen": 19372520, "step": 29520 }, { "epoch": 17.40860849056604, "grad_norm": 5.08489990234375, "learning_rate": 5.029665449195665e-07, "loss": 0.3625, "num_input_tokens_seen": 19375496, "step": 29525 }, { "epoch": 17.411556603773583, "grad_norm": 7.412511348724365, "learning_rate": 5.018425739931559e-07, "loss": 0.2409, "num_input_tokens_seen": 19378056, "step": 29530 }, { "epoch": 17.41450471698113, "grad_norm": 3.318896770477295, "learning_rate": 5.007197939733099e-07, "loss": 0.3336, "num_input_tokens_seen": 19381064, "step": 29535 }, { "epoch": 17.41745283018868, "grad_norm": 4.678496360778809, "learning_rate": 4.995982051572895e-07, "loss": 0.4124, "num_input_tokens_seen": 19384232, "step": 29540 }, { "epoch": 17.420400943396228, "grad_norm": 4.122057914733887, "learning_rate": 4.984778078420405e-07, "loss": 0.2859, "num_input_tokens_seen": 19387048, "step": 29545 }, { "epoch": 17.423349056603772, "grad_norm": 3.0006651878356934, "learning_rate": 4.973586023241917e-07, "loss": 0.3428, "num_input_tokens_seen": 19390408, "step": 29550 }, { "epoch": 17.42629716981132, "grad_norm": 2.1786420345306396, "learning_rate": 4.962405889000588e-07, "loss": 0.3725, "num_input_tokens_seen": 19393032, "step": 29555 }, { "epoch": 17.42924528301887, "grad_norm": 4.282044887542725, "learning_rate": 4.951237678656396e-07, "loss": 0.3086, "num_input_tokens_seen": 19396552, "step": 29560 }, { "epoch": 17.432193396226417, "grad_norm": 3.56268572807312, "learning_rate": 4.940081395166174e-07, "loss": 0.2972, "num_input_tokens_seen": 19399240, "step": 29565 }, { "epoch": 17.43514150943396, "grad_norm": 5.111908912658691, "learning_rate": 4.928937041483606e-07, "loss": 0.2894, "num_input_tokens_seen": 19402056, "step": 29570 }, { "epoch": 17.43808962264151, "grad_norm": 4.095756530761719, "learning_rate": 4.917804620559202e-07, "loss": 0.2072, "num_input_tokens_seen": 19405032, "step": 29575 }, { "epoch": 17.441037735849058, "grad_norm": 3.5949459075927734, "learning_rate": 4.906684135340317e-07, "loss": 0.3852, "num_input_tokens_seen": 19409608, "step": 29580 }, { "epoch": 17.443985849056602, "grad_norm": 3.5306754112243652, "learning_rate": 4.89557558877114e-07, "loss": 0.2437, "num_input_tokens_seen": 19412872, "step": 29585 }, { "epoch": 17.44693396226415, "grad_norm": 3.5506091117858887, "learning_rate": 4.884478983792728e-07, "loss": 0.2557, "num_input_tokens_seen": 19415400, "step": 29590 }, { "epoch": 17.4498820754717, "grad_norm": 5.179652690887451, "learning_rate": 4.873394323342939e-07, "loss": 0.3269, "num_input_tokens_seen": 19418088, "step": 29595 }, { "epoch": 17.452830188679247, "grad_norm": 4.249876976013184, "learning_rate": 4.86232161035648e-07, "loss": 0.3321, "num_input_tokens_seen": 19421736, "step": 29600 }, { "epoch": 17.45577830188679, "grad_norm": 5.374099254608154, "learning_rate": 4.851260847764916e-07, "loss": 0.3001, "num_input_tokens_seen": 19424552, "step": 29605 }, { "epoch": 17.45872641509434, "grad_norm": 4.311479568481445, "learning_rate": 4.840212038496622e-07, "loss": 0.2399, "num_input_tokens_seen": 19426792, "step": 29610 }, { "epoch": 17.461674528301888, "grad_norm": 6.845385551452637, "learning_rate": 4.82917518547682e-07, "loss": 0.3822, "num_input_tokens_seen": 19431112, "step": 29615 }, { "epoch": 17.464622641509433, "grad_norm": 2.131422519683838, "learning_rate": 4.81815029162756e-07, "loss": 0.2634, "num_input_tokens_seen": 19435048, "step": 29620 }, { "epoch": 17.46757075471698, "grad_norm": 3.3411953449249268, "learning_rate": 4.807137359867725e-07, "loss": 0.3069, "num_input_tokens_seen": 19439016, "step": 29625 }, { "epoch": 17.47051886792453, "grad_norm": 4.113058090209961, "learning_rate": 4.79613639311306e-07, "loss": 0.7762, "num_input_tokens_seen": 19441416, "step": 29630 }, { "epoch": 17.473466981132077, "grad_norm": 2.2407712936401367, "learning_rate": 4.785147394276096e-07, "loss": 0.2884, "num_input_tokens_seen": 19444904, "step": 29635 }, { "epoch": 17.47641509433962, "grad_norm": 4.289929389953613, "learning_rate": 4.774170366266223e-07, "loss": 0.3243, "num_input_tokens_seen": 19449224, "step": 29640 }, { "epoch": 17.47936320754717, "grad_norm": 3.9832842350006104, "learning_rate": 4.763205311989666e-07, "loss": 0.2957, "num_input_tokens_seen": 19452040, "step": 29645 }, { "epoch": 17.482311320754718, "grad_norm": 6.323486328125, "learning_rate": 4.752252234349458e-07, "loss": 0.268, "num_input_tokens_seen": 19455528, "step": 29650 }, { "epoch": 17.485259433962263, "grad_norm": 3.764775037765503, "learning_rate": 4.7413111362454634e-07, "loss": 0.3665, "num_input_tokens_seen": 19458472, "step": 29655 }, { "epoch": 17.48820754716981, "grad_norm": 6.375917434692383, "learning_rate": 4.7303820205744143e-07, "loss": 0.3035, "num_input_tokens_seen": 19461032, "step": 29660 }, { "epoch": 17.49115566037736, "grad_norm": 3.2134628295898438, "learning_rate": 4.7194648902298303e-07, "loss": 0.271, "num_input_tokens_seen": 19463688, "step": 29665 }, { "epoch": 17.494103773584907, "grad_norm": 2.3608615398406982, "learning_rate": 4.7085597481020594e-07, "loss": 0.3686, "num_input_tokens_seen": 19466760, "step": 29670 }, { "epoch": 17.497051886792452, "grad_norm": 2.6023104190826416, "learning_rate": 4.697666597078293e-07, "loss": 0.2568, "num_input_tokens_seen": 19469736, "step": 29675 }, { "epoch": 17.5, "grad_norm": 4.500664234161377, "learning_rate": 4.6867854400425237e-07, "loss": 0.2723, "num_input_tokens_seen": 19472616, "step": 29680 }, { "epoch": 17.502948113207548, "grad_norm": 2.545865774154663, "learning_rate": 4.6759162798756084e-07, "loss": 0.2504, "num_input_tokens_seen": 19476424, "step": 29685 }, { "epoch": 17.505896226415093, "grad_norm": 3.2858521938323975, "learning_rate": 4.6650591194551895e-07, "loss": 0.3689, "num_input_tokens_seen": 19479496, "step": 29690 }, { "epoch": 17.50884433962264, "grad_norm": 2.7884714603424072, "learning_rate": 4.654213961655757e-07, "loss": 0.199, "num_input_tokens_seen": 19482856, "step": 29695 }, { "epoch": 17.51179245283019, "grad_norm": 4.08107328414917, "learning_rate": 4.6433808093486075e-07, "loss": 0.3373, "num_input_tokens_seen": 19486600, "step": 29700 }, { "epoch": 17.514740566037737, "grad_norm": 8.637051582336426, "learning_rate": 4.63255966540187e-07, "loss": 0.3869, "num_input_tokens_seen": 19489416, "step": 29705 }, { "epoch": 17.517688679245282, "grad_norm": 3.8505306243896484, "learning_rate": 4.62175053268048e-07, "loss": 0.3256, "num_input_tokens_seen": 19492360, "step": 29710 }, { "epoch": 17.52063679245283, "grad_norm": 3.2301440238952637, "learning_rate": 4.6109534140462045e-07, "loss": 0.2689, "num_input_tokens_seen": 19495048, "step": 29715 }, { "epoch": 17.52358490566038, "grad_norm": 4.873189449310303, "learning_rate": 4.6001683123576226e-07, "loss": 0.2467, "num_input_tokens_seen": 19497608, "step": 29720 }, { "epoch": 17.526533018867923, "grad_norm": 4.470226287841797, "learning_rate": 4.589395230470145e-07, "loss": 0.2918, "num_input_tokens_seen": 19501800, "step": 29725 }, { "epoch": 17.52948113207547, "grad_norm": 5.198459625244141, "learning_rate": 4.578634171235996e-07, "loss": 0.3816, "num_input_tokens_seen": 19504520, "step": 29730 }, { "epoch": 17.53242924528302, "grad_norm": 4.9548139572143555, "learning_rate": 4.567885137504202e-07, "loss": 0.3333, "num_input_tokens_seen": 19507432, "step": 29735 }, { "epoch": 17.535377358490567, "grad_norm": 2.4765734672546387, "learning_rate": 4.55714813212062e-07, "loss": 0.3846, "num_input_tokens_seen": 19510856, "step": 29740 }, { "epoch": 17.538325471698112, "grad_norm": 3.7761480808258057, "learning_rate": 4.5464231579279206e-07, "loss": 0.3336, "num_input_tokens_seen": 19514568, "step": 29745 }, { "epoch": 17.54127358490566, "grad_norm": 3.486419677734375, "learning_rate": 4.535710217765571e-07, "loss": 0.3116, "num_input_tokens_seen": 19518888, "step": 29750 }, { "epoch": 17.54422169811321, "grad_norm": 3.302319049835205, "learning_rate": 4.5250093144698913e-07, "loss": 0.359, "num_input_tokens_seen": 19521736, "step": 29755 }, { "epoch": 17.547169811320753, "grad_norm": 4.745415210723877, "learning_rate": 4.514320450873988e-07, "loss": 0.3655, "num_input_tokens_seen": 19524392, "step": 29760 }, { "epoch": 17.5501179245283, "grad_norm": 5.265478610992432, "learning_rate": 4.503643629807769e-07, "loss": 0.2739, "num_input_tokens_seen": 19526984, "step": 29765 }, { "epoch": 17.55306603773585, "grad_norm": 3.692864179611206, "learning_rate": 4.4929788540979844e-07, "loss": 0.3556, "num_input_tokens_seen": 19530216, "step": 29770 }, { "epoch": 17.556014150943398, "grad_norm": 4.530064582824707, "learning_rate": 4.4823261265681596e-07, "loss": 0.4103, "num_input_tokens_seen": 19533384, "step": 29775 }, { "epoch": 17.558962264150942, "grad_norm": 8.030803680419922, "learning_rate": 4.471685450038671e-07, "loss": 0.4348, "num_input_tokens_seen": 19536232, "step": 29780 }, { "epoch": 17.56191037735849, "grad_norm": 3.7678098678588867, "learning_rate": 4.4610568273266706e-07, "loss": 0.356, "num_input_tokens_seen": 19539848, "step": 29785 }, { "epoch": 17.56485849056604, "grad_norm": 4.619472980499268, "learning_rate": 4.450440261246142e-07, "loss": 0.3136, "num_input_tokens_seen": 19542568, "step": 29790 }, { "epoch": 17.567806603773583, "grad_norm": 3.4950063228607178, "learning_rate": 4.439835754607863e-07, "loss": 0.3145, "num_input_tokens_seen": 19546600, "step": 29795 }, { "epoch": 17.57075471698113, "grad_norm": 3.8304474353790283, "learning_rate": 4.429243310219422e-07, "loss": 0.2754, "num_input_tokens_seen": 19549800, "step": 29800 }, { "epoch": 17.57370283018868, "grad_norm": 4.187685966491699, "learning_rate": 4.418662930885215e-07, "loss": 0.3122, "num_input_tokens_seen": 19553096, "step": 29805 }, { "epoch": 17.576650943396228, "grad_norm": 2.3086740970611572, "learning_rate": 4.408094619406439e-07, "loss": 0.2894, "num_input_tokens_seen": 19556040, "step": 29810 }, { "epoch": 17.579599056603772, "grad_norm": 7.590880870819092, "learning_rate": 4.3975383785810954e-07, "loss": 0.456, "num_input_tokens_seen": 19559944, "step": 29815 }, { "epoch": 17.58254716981132, "grad_norm": 5.278389930725098, "learning_rate": 4.3869942112040096e-07, "loss": 0.4226, "num_input_tokens_seen": 19566280, "step": 29820 }, { "epoch": 17.58549528301887, "grad_norm": 3.6655566692352295, "learning_rate": 4.3764621200667936e-07, "loss": 0.3564, "num_input_tokens_seen": 19569992, "step": 29825 }, { "epoch": 17.588443396226417, "grad_norm": 3.7083778381347656, "learning_rate": 4.365942107957849e-07, "loss": 0.3488, "num_input_tokens_seen": 19572744, "step": 29830 }, { "epoch": 17.59139150943396, "grad_norm": 6.2063069343566895, "learning_rate": 4.35543417766241e-07, "loss": 0.2799, "num_input_tokens_seen": 19576424, "step": 29835 }, { "epoch": 17.59433962264151, "grad_norm": 3.699772357940674, "learning_rate": 4.3449383319624785e-07, "loss": 0.3257, "num_input_tokens_seen": 19578952, "step": 29840 }, { "epoch": 17.597287735849058, "grad_norm": 7.938838958740234, "learning_rate": 4.3344545736368926e-07, "loss": 0.3074, "num_input_tokens_seen": 19582184, "step": 29845 }, { "epoch": 17.600235849056602, "grad_norm": 6.128177642822266, "learning_rate": 4.323982905461266e-07, "loss": 0.4153, "num_input_tokens_seen": 19586056, "step": 29850 }, { "epoch": 17.60318396226415, "grad_norm": 4.197917461395264, "learning_rate": 4.313523330208019e-07, "loss": 0.3857, "num_input_tokens_seen": 19589448, "step": 29855 }, { "epoch": 17.6061320754717, "grad_norm": 3.810572624206543, "learning_rate": 4.303075850646371e-07, "loss": 0.2854, "num_input_tokens_seen": 19593736, "step": 29860 }, { "epoch": 17.609080188679247, "grad_norm": 8.597007751464844, "learning_rate": 4.2926404695423305e-07, "loss": 0.2553, "num_input_tokens_seen": 19597000, "step": 29865 }, { "epoch": 17.61202830188679, "grad_norm": 7.298404216766357, "learning_rate": 4.282217189658705e-07, "loss": 0.3366, "num_input_tokens_seen": 19600200, "step": 29870 }, { "epoch": 17.61497641509434, "grad_norm": 3.3758997917175293, "learning_rate": 4.27180601375512e-07, "loss": 0.3711, "num_input_tokens_seen": 19603048, "step": 29875 }, { "epoch": 17.617924528301888, "grad_norm": 5.835490703582764, "learning_rate": 4.2614069445879646e-07, "loss": 0.4878, "num_input_tokens_seen": 19605960, "step": 29880 }, { "epoch": 17.620872641509433, "grad_norm": 3.5462043285369873, "learning_rate": 4.251019984910448e-07, "loss": 0.3758, "num_input_tokens_seen": 19609448, "step": 29885 }, { "epoch": 17.62382075471698, "grad_norm": 4.367044448852539, "learning_rate": 4.2406451374725597e-07, "loss": 0.4048, "num_input_tokens_seen": 19613256, "step": 29890 }, { "epoch": 17.62676886792453, "grad_norm": 2.684479236602783, "learning_rate": 4.2302824050210855e-07, "loss": 0.364, "num_input_tokens_seen": 19616680, "step": 29895 }, { "epoch": 17.629716981132077, "grad_norm": 2.7027223110198975, "learning_rate": 4.2199317902995974e-07, "loss": 0.2791, "num_input_tokens_seen": 19619944, "step": 29900 }, { "epoch": 17.63266509433962, "grad_norm": 2.0559022426605225, "learning_rate": 4.209593296048459e-07, "loss": 0.249, "num_input_tokens_seen": 19623016, "step": 29905 }, { "epoch": 17.63561320754717, "grad_norm": 2.624694347381592, "learning_rate": 4.1992669250048524e-07, "loss": 0.3136, "num_input_tokens_seen": 19625704, "step": 29910 }, { "epoch": 17.638561320754718, "grad_norm": 2.6954686641693115, "learning_rate": 4.188952679902719e-07, "loss": 0.421, "num_input_tokens_seen": 19629224, "step": 29915 }, { "epoch": 17.641509433962263, "grad_norm": 4.55791711807251, "learning_rate": 4.178650563472797e-07, "loss": 0.3877, "num_input_tokens_seen": 19633448, "step": 29920 }, { "epoch": 17.64445754716981, "grad_norm": 3.818033456802368, "learning_rate": 4.168360578442615e-07, "loss": 0.3272, "num_input_tokens_seen": 19636616, "step": 29925 }, { "epoch": 17.64740566037736, "grad_norm": 2.5958707332611084, "learning_rate": 4.1580827275365e-07, "loss": 0.3456, "num_input_tokens_seen": 19640840, "step": 29930 }, { "epoch": 17.650353773584907, "grad_norm": 5.355216979980469, "learning_rate": 4.147817013475536e-07, "loss": 0.2124, "num_input_tokens_seen": 19643240, "step": 29935 }, { "epoch": 17.653301886792452, "grad_norm": 5.83351469039917, "learning_rate": 4.1375634389776375e-07, "loss": 0.2578, "num_input_tokens_seen": 19646024, "step": 29940 }, { "epoch": 17.65625, "grad_norm": 3.986323356628418, "learning_rate": 4.127322006757478e-07, "loss": 0.3031, "num_input_tokens_seen": 19649416, "step": 29945 }, { "epoch": 17.659198113207548, "grad_norm": 2.8367278575897217, "learning_rate": 4.1170927195265163e-07, "loss": 0.2317, "num_input_tokens_seen": 19653000, "step": 29950 }, { "epoch": 17.662146226415093, "grad_norm": 5.164255142211914, "learning_rate": 4.1068755799930026e-07, "loss": 0.4126, "num_input_tokens_seen": 19656648, "step": 29955 }, { "epoch": 17.66509433962264, "grad_norm": 2.4700393676757812, "learning_rate": 4.096670590861962e-07, "loss": 0.2634, "num_input_tokens_seen": 19659944, "step": 29960 }, { "epoch": 17.66804245283019, "grad_norm": 2.381352424621582, "learning_rate": 4.086477754835211e-07, "loss": 0.3282, "num_input_tokens_seen": 19664296, "step": 29965 }, { "epoch": 17.670990566037737, "grad_norm": 5.311634540557861, "learning_rate": 4.0762970746113517e-07, "loss": 0.2773, "num_input_tokens_seen": 19667144, "step": 29970 }, { "epoch": 17.673938679245282, "grad_norm": 2.255930185317993, "learning_rate": 4.0661285528857676e-07, "loss": 0.34, "num_input_tokens_seen": 19670696, "step": 29975 }, { "epoch": 17.67688679245283, "grad_norm": 2.925170660018921, "learning_rate": 4.0559721923506155e-07, "loss": 0.2917, "num_input_tokens_seen": 19673832, "step": 29980 }, { "epoch": 17.67983490566038, "grad_norm": 6.344004154205322, "learning_rate": 4.045827995694834e-07, "loss": 0.3174, "num_input_tokens_seen": 19677096, "step": 29985 }, { "epoch": 17.682783018867923, "grad_norm": 3.95340895652771, "learning_rate": 4.035695965604142e-07, "loss": 0.2941, "num_input_tokens_seen": 19680200, "step": 29990 }, { "epoch": 17.68573113207547, "grad_norm": 3.183182716369629, "learning_rate": 4.0255761047610365e-07, "loss": 0.3493, "num_input_tokens_seen": 19684392, "step": 29995 }, { "epoch": 17.68867924528302, "grad_norm": 3.6182615756988525, "learning_rate": 4.0154684158447864e-07, "loss": 0.5106, "num_input_tokens_seen": 19686792, "step": 30000 }, { "epoch": 17.691627358490567, "grad_norm": 4.004990100860596, "learning_rate": 4.0053729015314623e-07, "loss": 0.3677, "num_input_tokens_seen": 19689736, "step": 30005 }, { "epoch": 17.694575471698112, "grad_norm": 2.7860474586486816, "learning_rate": 3.9952895644938926e-07, "loss": 0.3173, "num_input_tokens_seen": 19693352, "step": 30010 }, { "epoch": 17.69752358490566, "grad_norm": 4.96690034866333, "learning_rate": 3.985218407401681e-07, "loss": 0.2951, "num_input_tokens_seen": 19696776, "step": 30015 }, { "epoch": 17.70047169811321, "grad_norm": 5.045187473297119, "learning_rate": 3.975159432921205e-07, "loss": 0.4994, "num_input_tokens_seen": 19699784, "step": 30020 }, { "epoch": 17.703419811320753, "grad_norm": 3.874377727508545, "learning_rate": 3.9651126437156294e-07, "loss": 0.2964, "num_input_tokens_seen": 19703432, "step": 30025 }, { "epoch": 17.7063679245283, "grad_norm": 3.113098621368408, "learning_rate": 3.9550780424448653e-07, "loss": 0.3349, "num_input_tokens_seen": 19706280, "step": 30030 }, { "epoch": 17.70931603773585, "grad_norm": 6.261507511138916, "learning_rate": 3.9450556317656487e-07, "loss": 0.3232, "num_input_tokens_seen": 19709480, "step": 30035 }, { "epoch": 17.712264150943398, "grad_norm": 7.009562969207764, "learning_rate": 3.935045414331434e-07, "loss": 0.2611, "num_input_tokens_seen": 19712680, "step": 30040 }, { "epoch": 17.715212264150942, "grad_norm": 3.021461248397827, "learning_rate": 3.925047392792475e-07, "loss": 0.3538, "num_input_tokens_seen": 19715752, "step": 30045 }, { "epoch": 17.71816037735849, "grad_norm": 3.5753226280212402, "learning_rate": 3.9150615697957917e-07, "loss": 0.3106, "num_input_tokens_seen": 19718344, "step": 30050 }, { "epoch": 17.72110849056604, "grad_norm": 6.162144184112549, "learning_rate": 3.9050879479851753e-07, "loss": 0.3965, "num_input_tokens_seen": 19720904, "step": 30055 }, { "epoch": 17.724056603773583, "grad_norm": 2.8470003604888916, "learning_rate": 3.89512653000117e-07, "loss": 0.36, "num_input_tokens_seen": 19728104, "step": 30060 }, { "epoch": 17.72700471698113, "grad_norm": 6.061378479003906, "learning_rate": 3.8851773184811203e-07, "loss": 0.4082, "num_input_tokens_seen": 19731048, "step": 30065 }, { "epoch": 17.72995283018868, "grad_norm": 4.385757923126221, "learning_rate": 3.8752403160591255e-07, "loss": 0.1869, "num_input_tokens_seen": 19734312, "step": 30070 }, { "epoch": 17.732900943396228, "grad_norm": 2.7674829959869385, "learning_rate": 3.8653155253660477e-07, "loss": 0.3312, "num_input_tokens_seen": 19737608, "step": 30075 }, { "epoch": 17.735849056603772, "grad_norm": 2.4463181495666504, "learning_rate": 3.8554029490295073e-07, "loss": 0.3561, "num_input_tokens_seen": 19741768, "step": 30080 }, { "epoch": 17.73879716981132, "grad_norm": 5.646268844604492, "learning_rate": 3.8455025896739164e-07, "loss": 0.4199, "num_input_tokens_seen": 19744456, "step": 30085 }, { "epoch": 17.74174528301887, "grad_norm": 2.430732011795044, "learning_rate": 3.8356144499204215e-07, "loss": 0.335, "num_input_tokens_seen": 19747816, "step": 30090 }, { "epoch": 17.744693396226417, "grad_norm": 10.114243507385254, "learning_rate": 3.8257385323869576e-07, "loss": 0.277, "num_input_tokens_seen": 19750856, "step": 30095 }, { "epoch": 17.74764150943396, "grad_norm": 2.108164072036743, "learning_rate": 3.815874839688222e-07, "loss": 0.3099, "num_input_tokens_seen": 19753992, "step": 30100 }, { "epoch": 17.75058962264151, "grad_norm": 3.657869338989258, "learning_rate": 3.8060233744356634e-07, "loss": 0.4067, "num_input_tokens_seen": 19757896, "step": 30105 }, { "epoch": 17.753537735849058, "grad_norm": 2.6765058040618896, "learning_rate": 3.796184139237502e-07, "loss": 0.433, "num_input_tokens_seen": 19760520, "step": 30110 }, { "epoch": 17.756485849056602, "grad_norm": 5.54719877243042, "learning_rate": 3.7863571366987206e-07, "loss": 0.3034, "num_input_tokens_seen": 19763016, "step": 30115 }, { "epoch": 17.75943396226415, "grad_norm": 4.3164286613464355, "learning_rate": 3.776542369421049e-07, "loss": 0.3374, "num_input_tokens_seen": 19766184, "step": 30120 }, { "epoch": 17.7623820754717, "grad_norm": 7.191664218902588, "learning_rate": 3.766739840003003e-07, "loss": 0.2453, "num_input_tokens_seen": 19769416, "step": 30125 }, { "epoch": 17.765330188679247, "grad_norm": 2.204716682434082, "learning_rate": 3.756949551039835e-07, "loss": 0.3231, "num_input_tokens_seen": 19774024, "step": 30130 }, { "epoch": 17.76827830188679, "grad_norm": 3.598282814025879, "learning_rate": 3.7471715051235757e-07, "loss": 0.3106, "num_input_tokens_seen": 19776872, "step": 30135 }, { "epoch": 17.77122641509434, "grad_norm": 4.430799961090088, "learning_rate": 3.7374057048429947e-07, "loss": 0.2205, "num_input_tokens_seen": 19780520, "step": 30140 }, { "epoch": 17.774174528301888, "grad_norm": 2.8732118606567383, "learning_rate": 3.7276521527836396e-07, "loss": 0.4052, "num_input_tokens_seen": 19785032, "step": 30145 }, { "epoch": 17.777122641509433, "grad_norm": 2.567589521408081, "learning_rate": 3.717910851527784e-07, "loss": 0.3441, "num_input_tokens_seen": 19788584, "step": 30150 }, { "epoch": 17.78007075471698, "grad_norm": 2.7687063217163086, "learning_rate": 3.708181803654498e-07, "loss": 0.2994, "num_input_tokens_seen": 19791368, "step": 30155 }, { "epoch": 17.78301886792453, "grad_norm": 3.3695194721221924, "learning_rate": 3.6984650117395993e-07, "loss": 0.4049, "num_input_tokens_seen": 19795304, "step": 30160 }, { "epoch": 17.785966981132077, "grad_norm": 3.1376848220825195, "learning_rate": 3.688760478355635e-07, "loss": 0.3123, "num_input_tokens_seen": 19799048, "step": 30165 }, { "epoch": 17.78891509433962, "grad_norm": 3.5615522861480713, "learning_rate": 3.679068206071923e-07, "loss": 0.3427, "num_input_tokens_seen": 19801288, "step": 30170 }, { "epoch": 17.79186320754717, "grad_norm": 4.759487628936768, "learning_rate": 3.669388197454532e-07, "loss": 0.3391, "num_input_tokens_seen": 19804168, "step": 30175 }, { "epoch": 17.794811320754718, "grad_norm": 2.360820770263672, "learning_rate": 3.6597204550662956e-07, "loss": 0.2421, "num_input_tokens_seen": 19807688, "step": 30180 }, { "epoch": 17.797759433962263, "grad_norm": 7.8036274909973145, "learning_rate": 3.650064981466772e-07, "loss": 0.391, "num_input_tokens_seen": 19810376, "step": 30185 }, { "epoch": 17.80070754716981, "grad_norm": 2.3514466285705566, "learning_rate": 3.640421779212311e-07, "loss": 0.2775, "num_input_tokens_seen": 19813512, "step": 30190 }, { "epoch": 17.80365566037736, "grad_norm": 3.173494577407837, "learning_rate": 3.630790850855986e-07, "loss": 0.2962, "num_input_tokens_seen": 19817064, "step": 30195 }, { "epoch": 17.806603773584907, "grad_norm": 1.893020510673523, "learning_rate": 3.62117219894762e-07, "loss": 0.2753, "num_input_tokens_seen": 19821320, "step": 30200 }, { "epoch": 17.809551886792452, "grad_norm": 6.428494453430176, "learning_rate": 3.611565826033797e-07, "loss": 0.2957, "num_input_tokens_seen": 19824360, "step": 30205 }, { "epoch": 17.8125, "grad_norm": 4.030634880065918, "learning_rate": 3.6019717346578445e-07, "loss": 0.253, "num_input_tokens_seen": 19827016, "step": 30210 }, { "epoch": 17.815448113207548, "grad_norm": 5.172286033630371, "learning_rate": 3.5923899273598293e-07, "loss": 0.3847, "num_input_tokens_seen": 19830824, "step": 30215 }, { "epoch": 17.818396226415093, "grad_norm": 3.319737434387207, "learning_rate": 3.582820406676596e-07, "loss": 0.2131, "num_input_tokens_seen": 19833800, "step": 30220 }, { "epoch": 17.82134433962264, "grad_norm": 4.967245578765869, "learning_rate": 3.5732631751417056e-07, "loss": 0.3095, "num_input_tokens_seen": 19836776, "step": 30225 }, { "epoch": 17.82429245283019, "grad_norm": 4.054728031158447, "learning_rate": 3.563718235285485e-07, "loss": 0.3553, "num_input_tokens_seen": 19840040, "step": 30230 }, { "epoch": 17.827240566037737, "grad_norm": 3.6017558574676514, "learning_rate": 3.5541855896349844e-07, "loss": 0.2658, "num_input_tokens_seen": 19843336, "step": 30235 }, { "epoch": 17.830188679245282, "grad_norm": 3.684494972229004, "learning_rate": 3.544665240714018e-07, "loss": 0.2751, "num_input_tokens_seen": 19846120, "step": 30240 }, { "epoch": 17.83313679245283, "grad_norm": 8.940827369689941, "learning_rate": 3.535157191043137e-07, "loss": 0.314, "num_input_tokens_seen": 19849544, "step": 30245 }, { "epoch": 17.83608490566038, "grad_norm": 3.4570200443267822, "learning_rate": 3.5256614431396385e-07, "loss": 0.4347, "num_input_tokens_seen": 19853096, "step": 30250 }, { "epoch": 17.839033018867923, "grad_norm": 4.571300506591797, "learning_rate": 3.516177999517578e-07, "loss": 0.3392, "num_input_tokens_seen": 19855848, "step": 30255 }, { "epoch": 17.84198113207547, "grad_norm": 3.250882148742676, "learning_rate": 3.50670686268772e-07, "loss": 0.3226, "num_input_tokens_seen": 19859752, "step": 30260 }, { "epoch": 17.84492924528302, "grad_norm": 3.1567108631134033, "learning_rate": 3.497248035157602e-07, "loss": 0.3212, "num_input_tokens_seen": 19862472, "step": 30265 }, { "epoch": 17.847877358490567, "grad_norm": 2.6210131645202637, "learning_rate": 3.4878015194314773e-07, "loss": 0.3762, "num_input_tokens_seen": 19865832, "step": 30270 }, { "epoch": 17.850825471698112, "grad_norm": 3.9561991691589355, "learning_rate": 3.4783673180103617e-07, "loss": 0.3479, "num_input_tokens_seen": 19869256, "step": 30275 }, { "epoch": 17.85377358490566, "grad_norm": 3.155005693435669, "learning_rate": 3.468945433391985e-07, "loss": 0.3642, "num_input_tokens_seen": 19872200, "step": 30280 }, { "epoch": 17.85672169811321, "grad_norm": 3.0886781215667725, "learning_rate": 3.459535868070851e-07, "loss": 0.3986, "num_input_tokens_seen": 19875368, "step": 30285 }, { "epoch": 17.859669811320753, "grad_norm": 5.478484153747559, "learning_rate": 3.450138624538174e-07, "loss": 0.2705, "num_input_tokens_seen": 19878248, "step": 30290 }, { "epoch": 17.8626179245283, "grad_norm": 3.0976693630218506, "learning_rate": 3.440753705281913e-07, "loss": 0.3378, "num_input_tokens_seen": 19880968, "step": 30295 }, { "epoch": 17.86556603773585, "grad_norm": 3.1771154403686523, "learning_rate": 3.4313811127867693e-07, "loss": 0.3165, "num_input_tokens_seen": 19887240, "step": 30300 }, { "epoch": 17.868514150943398, "grad_norm": 4.872092247009277, "learning_rate": 3.4220208495341745e-07, "loss": 0.3932, "num_input_tokens_seen": 19890568, "step": 30305 }, { "epoch": 17.871462264150942, "grad_norm": 2.794956684112549, "learning_rate": 3.412672918002291e-07, "loss": 0.2875, "num_input_tokens_seen": 19893608, "step": 30310 }, { "epoch": 17.87441037735849, "grad_norm": 3.7632126808166504, "learning_rate": 3.403337320666045e-07, "loss": 0.5391, "num_input_tokens_seen": 19896360, "step": 30315 }, { "epoch": 17.87735849056604, "grad_norm": 5.2011003494262695, "learning_rate": 3.394014059997064e-07, "loss": 0.265, "num_input_tokens_seen": 19899720, "step": 30320 }, { "epoch": 17.880306603773583, "grad_norm": 5.309354305267334, "learning_rate": 3.3847031384637185e-07, "loss": 0.4119, "num_input_tokens_seen": 19902376, "step": 30325 }, { "epoch": 17.88325471698113, "grad_norm": 2.9873743057250977, "learning_rate": 3.3754045585311147e-07, "loss": 0.3044, "num_input_tokens_seen": 19905576, "step": 30330 }, { "epoch": 17.88620283018868, "grad_norm": 4.769732475280762, "learning_rate": 3.366118322661094e-07, "loss": 0.2809, "num_input_tokens_seen": 19907976, "step": 30335 }, { "epoch": 17.889150943396228, "grad_norm": 5.779354095458984, "learning_rate": 3.3568444333122283e-07, "loss": 0.4398, "num_input_tokens_seen": 19911656, "step": 30340 }, { "epoch": 17.892099056603772, "grad_norm": 3.9615063667297363, "learning_rate": 3.347582892939816e-07, "loss": 0.3397, "num_input_tokens_seen": 19914760, "step": 30345 }, { "epoch": 17.89504716981132, "grad_norm": 5.206798076629639, "learning_rate": 3.338333703995905e-07, "loss": 0.3458, "num_input_tokens_seen": 19918184, "step": 30350 }, { "epoch": 17.89799528301887, "grad_norm": 4.286725997924805, "learning_rate": 3.329096868929238e-07, "loss": 0.3219, "num_input_tokens_seen": 19921736, "step": 30355 }, { "epoch": 17.900943396226417, "grad_norm": 3.989588499069214, "learning_rate": 3.319872390185325e-07, "loss": 0.3732, "num_input_tokens_seen": 19924456, "step": 30360 }, { "epoch": 17.90389150943396, "grad_norm": 3.6226704120635986, "learning_rate": 3.3106602702063727e-07, "loss": 0.2783, "num_input_tokens_seen": 19927112, "step": 30365 }, { "epoch": 17.90683962264151, "grad_norm": 3.375239133834839, "learning_rate": 3.3014605114313316e-07, "loss": 0.4876, "num_input_tokens_seen": 19929896, "step": 30370 }, { "epoch": 17.909787735849058, "grad_norm": 2.705922842025757, "learning_rate": 3.2922731162958744e-07, "loss": 0.3819, "num_input_tokens_seen": 19934536, "step": 30375 }, { "epoch": 17.912735849056602, "grad_norm": 3.6318893432617188, "learning_rate": 3.2830980872324114e-07, "loss": 0.2944, "num_input_tokens_seen": 19937192, "step": 30380 }, { "epoch": 17.91568396226415, "grad_norm": 4.040850639343262, "learning_rate": 3.2739354266700775e-07, "loss": 0.2447, "num_input_tokens_seen": 19940392, "step": 30385 }, { "epoch": 17.9186320754717, "grad_norm": 5.677233695983887, "learning_rate": 3.264785137034709e-07, "loss": 0.452, "num_input_tokens_seen": 19943624, "step": 30390 }, { "epoch": 17.921580188679247, "grad_norm": 2.9594972133636475, "learning_rate": 3.2556472207488977e-07, "loss": 0.2874, "num_input_tokens_seen": 19946856, "step": 30395 }, { "epoch": 17.92452830188679, "grad_norm": 2.9966344833374023, "learning_rate": 3.246521680231934e-07, "loss": 0.3887, "num_input_tokens_seen": 19949992, "step": 30400 }, { "epoch": 17.92747641509434, "grad_norm": 3.562363862991333, "learning_rate": 3.2374085178998594e-07, "loss": 0.3135, "num_input_tokens_seen": 19953960, "step": 30405 }, { "epoch": 17.930424528301888, "grad_norm": 4.214653968811035, "learning_rate": 3.2283077361654145e-07, "loss": 0.3499, "num_input_tokens_seen": 19957160, "step": 30410 }, { "epoch": 17.933372641509433, "grad_norm": 7.402681350708008, "learning_rate": 3.2192193374380677e-07, "loss": 0.2746, "num_input_tokens_seen": 19960168, "step": 30415 }, { "epoch": 17.93632075471698, "grad_norm": 11.917881965637207, "learning_rate": 3.210143324124021e-07, "loss": 0.3793, "num_input_tokens_seen": 19963368, "step": 30420 }, { "epoch": 17.93926886792453, "grad_norm": 3.4628703594207764, "learning_rate": 3.2010796986261805e-07, "loss": 0.2637, "num_input_tokens_seen": 19966568, "step": 30425 }, { "epoch": 17.942216981132077, "grad_norm": 4.460865497589111, "learning_rate": 3.1920284633441713e-07, "loss": 0.2981, "num_input_tokens_seen": 19970152, "step": 30430 }, { "epoch": 17.94516509433962, "grad_norm": 2.8955180644989014, "learning_rate": 3.1829896206743704e-07, "loss": 0.2951, "num_input_tokens_seen": 19973576, "step": 30435 }, { "epoch": 17.94811320754717, "grad_norm": 2.7796413898468018, "learning_rate": 3.173963173009825e-07, "loss": 0.3464, "num_input_tokens_seen": 19977640, "step": 30440 }, { "epoch": 17.951061320754718, "grad_norm": 4.6051249504089355, "learning_rate": 3.164949122740352e-07, "loss": 0.3599, "num_input_tokens_seen": 19980232, "step": 30445 }, { "epoch": 17.954009433962263, "grad_norm": 2.660403251647949, "learning_rate": 3.1559474722524406e-07, "loss": 0.3253, "num_input_tokens_seen": 19983496, "step": 30450 }, { "epoch": 17.95695754716981, "grad_norm": 2.758788824081421, "learning_rate": 3.146958223929325e-07, "loss": 0.4055, "num_input_tokens_seen": 19986664, "step": 30455 }, { "epoch": 17.95990566037736, "grad_norm": 11.006932258605957, "learning_rate": 3.1379813801509454e-07, "loss": 0.2311, "num_input_tokens_seen": 19989672, "step": 30460 }, { "epoch": 17.962853773584907, "grad_norm": 4.309544563293457, "learning_rate": 3.1290169432939556e-07, "loss": 0.2698, "num_input_tokens_seen": 19992680, "step": 30465 }, { "epoch": 17.965801886792452, "grad_norm": 4.727543830871582, "learning_rate": 3.120064915731735e-07, "loss": 0.4832, "num_input_tokens_seen": 19999688, "step": 30470 }, { "epoch": 17.96875, "grad_norm": 2.2988455295562744, "learning_rate": 3.1111252998343723e-07, "loss": 0.3805, "num_input_tokens_seen": 20004520, "step": 30475 }, { "epoch": 17.971698113207548, "grad_norm": 2.804508924484253, "learning_rate": 3.102198097968662e-07, "loss": 0.2685, "num_input_tokens_seen": 20007240, "step": 30480 }, { "epoch": 17.974646226415093, "grad_norm": 2.0426509380340576, "learning_rate": 3.093283312498124e-07, "loss": 0.2558, "num_input_tokens_seen": 20011272, "step": 30485 }, { "epoch": 17.97759433962264, "grad_norm": 4.182236194610596, "learning_rate": 3.084380945782989e-07, "loss": 0.3287, "num_input_tokens_seen": 20014376, "step": 30490 }, { "epoch": 17.98054245283019, "grad_norm": 5.031620979309082, "learning_rate": 3.0754910001801866e-07, "loss": 0.3196, "num_input_tokens_seen": 20017768, "step": 30495 }, { "epoch": 17.983490566037737, "grad_norm": 4.622066020965576, "learning_rate": 3.0666134780433786e-07, "loss": 0.2615, "num_input_tokens_seen": 20020392, "step": 30500 }, { "epoch": 17.986438679245282, "grad_norm": 5.089591026306152, "learning_rate": 3.0577483817229306e-07, "loss": 0.28, "num_input_tokens_seen": 20024104, "step": 30505 }, { "epoch": 17.98938679245283, "grad_norm": 2.645587921142578, "learning_rate": 3.0488957135659023e-07, "loss": 0.2722, "num_input_tokens_seen": 20027240, "step": 30510 }, { "epoch": 17.99233490566038, "grad_norm": 4.619197845458984, "learning_rate": 3.040055475916087e-07, "loss": 0.3564, "num_input_tokens_seen": 20031816, "step": 30515 }, { "epoch": 17.995283018867923, "grad_norm": 4.085579872131348, "learning_rate": 3.0312276711139675e-07, "loss": 0.2959, "num_input_tokens_seen": 20034696, "step": 30520 }, { "epoch": 17.99823113207547, "grad_norm": 3.7990238666534424, "learning_rate": 3.0224123014967353e-07, "loss": 0.3135, "num_input_tokens_seen": 20036744, "step": 30525 }, { "epoch": 18.0, "eval_loss": 0.6031064987182617, "eval_runtime": 18.7405, "eval_samples_per_second": 90.499, "eval_steps_per_second": 22.625, "num_input_tokens_seen": 20038104, "step": 30528 }, { "epoch": 18.00117924528302, "grad_norm": 9.578548431396484, "learning_rate": 3.013609369398324e-07, "loss": 0.3579, "num_input_tokens_seen": 20039000, "step": 30530 }, { "epoch": 18.004127358490567, "grad_norm": 5.766124248504639, "learning_rate": 3.004818877149318e-07, "loss": 0.2752, "num_input_tokens_seen": 20041400, "step": 30535 }, { "epoch": 18.007075471698112, "grad_norm": 4.621849536895752, "learning_rate": 2.9960408270770624e-07, "loss": 0.429, "num_input_tokens_seen": 20044280, "step": 30540 }, { "epoch": 18.01002358490566, "grad_norm": 2.7784578800201416, "learning_rate": 2.9872752215055755e-07, "loss": 0.3284, "num_input_tokens_seen": 20047544, "step": 30545 }, { "epoch": 18.01297169811321, "grad_norm": 3.5186078548431396, "learning_rate": 2.9785220627555844e-07, "loss": 0.3035, "num_input_tokens_seen": 20051064, "step": 30550 }, { "epoch": 18.015919811320753, "grad_norm": 2.1591858863830566, "learning_rate": 2.9697813531445295e-07, "loss": 0.3102, "num_input_tokens_seen": 20054040, "step": 30555 }, { "epoch": 18.0188679245283, "grad_norm": 2.997922420501709, "learning_rate": 2.9610530949865433e-07, "loss": 0.3237, "num_input_tokens_seen": 20058104, "step": 30560 }, { "epoch": 18.02181603773585, "grad_norm": 4.417153358459473, "learning_rate": 2.952337290592483e-07, "loss": 0.3736, "num_input_tokens_seen": 20061336, "step": 30565 }, { "epoch": 18.024764150943398, "grad_norm": 5.385011672973633, "learning_rate": 2.9436339422698913e-07, "loss": 0.405, "num_input_tokens_seen": 20064632, "step": 30570 }, { "epoch": 18.027712264150942, "grad_norm": 4.848145961761475, "learning_rate": 2.934943052323008e-07, "loss": 0.1941, "num_input_tokens_seen": 20067544, "step": 30575 }, { "epoch": 18.03066037735849, "grad_norm": 4.3622870445251465, "learning_rate": 2.926264623052799e-07, "loss": 0.3228, "num_input_tokens_seen": 20071096, "step": 30580 }, { "epoch": 18.03360849056604, "grad_norm": 3.52626633644104, "learning_rate": 2.9175986567569036e-07, "loss": 0.323, "num_input_tokens_seen": 20073688, "step": 30585 }, { "epoch": 18.036556603773583, "grad_norm": 3.539945125579834, "learning_rate": 2.9089451557296755e-07, "loss": 0.2871, "num_input_tokens_seen": 20077496, "step": 30590 }, { "epoch": 18.03950471698113, "grad_norm": 3.7377991676330566, "learning_rate": 2.9003041222621706e-07, "loss": 0.3414, "num_input_tokens_seen": 20081176, "step": 30595 }, { "epoch": 18.04245283018868, "grad_norm": 3.6061484813690186, "learning_rate": 2.8916755586421375e-07, "loss": 0.3289, "num_input_tokens_seen": 20084056, "step": 30600 }, { "epoch": 18.045400943396228, "grad_norm": 10.159502983093262, "learning_rate": 2.883059467154031e-07, "loss": 0.3818, "num_input_tokens_seen": 20087128, "step": 30605 }, { "epoch": 18.048349056603772, "grad_norm": 4.915844917297363, "learning_rate": 2.8744558500789887e-07, "loss": 0.3061, "num_input_tokens_seen": 20090264, "step": 30610 }, { "epoch": 18.05129716981132, "grad_norm": 2.8435046672821045, "learning_rate": 2.8658647096948546e-07, "loss": 0.2669, "num_input_tokens_seen": 20092760, "step": 30615 }, { "epoch": 18.05424528301887, "grad_norm": 3.7232418060302734, "learning_rate": 2.8572860482761813e-07, "loss": 0.4352, "num_input_tokens_seen": 20096216, "step": 30620 }, { "epoch": 18.057193396226417, "grad_norm": 2.626399517059326, "learning_rate": 2.8487198680942017e-07, "loss": 0.2961, "num_input_tokens_seen": 20099576, "step": 30625 }, { "epoch": 18.06014150943396, "grad_norm": 3.3478357791900635, "learning_rate": 2.840166171416836e-07, "loss": 0.384, "num_input_tokens_seen": 20103672, "step": 30630 }, { "epoch": 18.06308962264151, "grad_norm": 3.8845889568328857, "learning_rate": 2.8316249605087386e-07, "loss": 0.3566, "num_input_tokens_seen": 20106840, "step": 30635 }, { "epoch": 18.066037735849058, "grad_norm": 4.354495048522949, "learning_rate": 2.823096237631212e-07, "loss": 0.3176, "num_input_tokens_seen": 20110072, "step": 30640 }, { "epoch": 18.068985849056602, "grad_norm": 6.10596227645874, "learning_rate": 2.814580005042283e-07, "loss": 0.4581, "num_input_tokens_seen": 20112920, "step": 30645 }, { "epoch": 18.07193396226415, "grad_norm": 3.7606239318847656, "learning_rate": 2.8060762649966435e-07, "loss": 0.3166, "num_input_tokens_seen": 20116280, "step": 30650 }, { "epoch": 18.0748820754717, "grad_norm": 3.0134544372558594, "learning_rate": 2.797585019745713e-07, "loss": 0.3059, "num_input_tokens_seen": 20119256, "step": 30655 }, { "epoch": 18.077830188679247, "grad_norm": 3.387517213821411, "learning_rate": 2.789106271537584e-07, "loss": 0.2605, "num_input_tokens_seen": 20122424, "step": 30660 }, { "epoch": 18.08077830188679, "grad_norm": 4.419379234313965, "learning_rate": 2.780640022617037e-07, "loss": 0.244, "num_input_tokens_seen": 20125464, "step": 30665 }, { "epoch": 18.08372641509434, "grad_norm": 4.969230651855469, "learning_rate": 2.772186275225547e-07, "loss": 0.3259, "num_input_tokens_seen": 20129912, "step": 30670 }, { "epoch": 18.086674528301888, "grad_norm": 4.377511024475098, "learning_rate": 2.7637450316012836e-07, "loss": 0.3323, "num_input_tokens_seen": 20133048, "step": 30675 }, { "epoch": 18.089622641509433, "grad_norm": 3.2251484394073486, "learning_rate": 2.755316293979088e-07, "loss": 0.2797, "num_input_tokens_seen": 20137368, "step": 30680 }, { "epoch": 18.09257075471698, "grad_norm": 3.40616512298584, "learning_rate": 2.7469000645905295e-07, "loss": 0.2357, "num_input_tokens_seen": 20140056, "step": 30685 }, { "epoch": 18.09551886792453, "grad_norm": 3.111687421798706, "learning_rate": 2.738496345663827e-07, "loss": 0.3672, "num_input_tokens_seen": 20142968, "step": 30690 }, { "epoch": 18.098466981132077, "grad_norm": 3.086103916168213, "learning_rate": 2.7301051394239e-07, "loss": 0.2831, "num_input_tokens_seen": 20145464, "step": 30695 }, { "epoch": 18.10141509433962, "grad_norm": 2.8618979454040527, "learning_rate": 2.72172644809236e-07, "loss": 0.3526, "num_input_tokens_seen": 20149048, "step": 30700 }, { "epoch": 18.10436320754717, "grad_norm": 2.9042890071868896, "learning_rate": 2.7133602738875e-07, "loss": 0.3105, "num_input_tokens_seen": 20151416, "step": 30705 }, { "epoch": 18.107311320754718, "grad_norm": 5.067267894744873, "learning_rate": 2.7050066190242976e-07, "loss": 0.2893, "num_input_tokens_seen": 20154136, "step": 30710 }, { "epoch": 18.110259433962263, "grad_norm": 2.9179344177246094, "learning_rate": 2.696665485714428e-07, "loss": 0.2278, "num_input_tokens_seen": 20156824, "step": 30715 }, { "epoch": 18.11320754716981, "grad_norm": 4.4393391609191895, "learning_rate": 2.6883368761662367e-07, "loss": 0.323, "num_input_tokens_seen": 20159320, "step": 30720 }, { "epoch": 18.11615566037736, "grad_norm": 2.7913875579833984, "learning_rate": 2.680020792584759e-07, "loss": 0.316, "num_input_tokens_seen": 20162424, "step": 30725 }, { "epoch": 18.119103773584907, "grad_norm": 3.9745659828186035, "learning_rate": 2.6717172371717113e-07, "loss": 0.2804, "num_input_tokens_seen": 20165080, "step": 30730 }, { "epoch": 18.122051886792452, "grad_norm": 2.2311503887176514, "learning_rate": 2.663426212125503e-07, "loss": 0.3884, "num_input_tokens_seen": 20169336, "step": 30735 }, { "epoch": 18.125, "grad_norm": 2.9139766693115234, "learning_rate": 2.655147719641216e-07, "loss": 0.4187, "num_input_tokens_seen": 20171960, "step": 30740 }, { "epoch": 18.127948113207548, "grad_norm": 4.5965046882629395, "learning_rate": 2.646881761910602e-07, "loss": 0.2258, "num_input_tokens_seen": 20174968, "step": 30745 }, { "epoch": 18.130896226415093, "grad_norm": 2.497976779937744, "learning_rate": 2.638628341122135e-07, "loss": 0.3748, "num_input_tokens_seen": 20177912, "step": 30750 }, { "epoch": 18.13384433962264, "grad_norm": 3.4366419315338135, "learning_rate": 2.6303874594609314e-07, "loss": 0.2457, "num_input_tokens_seen": 20181048, "step": 30755 }, { "epoch": 18.13679245283019, "grad_norm": 2.106492042541504, "learning_rate": 2.622159119108797e-07, "loss": 0.3524, "num_input_tokens_seen": 20184664, "step": 30760 }, { "epoch": 18.139740566037737, "grad_norm": 3.361116886138916, "learning_rate": 2.6139433222442226e-07, "loss": 0.365, "num_input_tokens_seen": 20188056, "step": 30765 }, { "epoch": 18.142688679245282, "grad_norm": 3.550609827041626, "learning_rate": 2.6057400710423787e-07, "loss": 0.3194, "num_input_tokens_seen": 20191416, "step": 30770 }, { "epoch": 18.14563679245283, "grad_norm": 3.862424373626709, "learning_rate": 2.5975493676751004e-07, "loss": 0.2344, "num_input_tokens_seen": 20194680, "step": 30775 }, { "epoch": 18.14858490566038, "grad_norm": 5.312565326690674, "learning_rate": 2.589371214310926e-07, "loss": 0.2321, "num_input_tokens_seen": 20197048, "step": 30780 }, { "epoch": 18.151533018867923, "grad_norm": 3.818697214126587, "learning_rate": 2.581205613115051e-07, "loss": 0.2856, "num_input_tokens_seen": 20203320, "step": 30785 }, { "epoch": 18.15448113207547, "grad_norm": 3.4766879081726074, "learning_rate": 2.573052566249357e-07, "loss": 0.2898, "num_input_tokens_seen": 20207096, "step": 30790 }, { "epoch": 18.15742924528302, "grad_norm": 3.8996269702911377, "learning_rate": 2.5649120758723945e-07, "loss": 0.2865, "num_input_tokens_seen": 20209816, "step": 30795 }, { "epoch": 18.160377358490567, "grad_norm": 4.073709487915039, "learning_rate": 2.5567841441393906e-07, "loss": 0.3333, "num_input_tokens_seen": 20213368, "step": 30800 }, { "epoch": 18.163325471698112, "grad_norm": 4.164287567138672, "learning_rate": 2.548668773202245e-07, "loss": 0.2547, "num_input_tokens_seen": 20217592, "step": 30805 }, { "epoch": 18.16627358490566, "grad_norm": 4.675111293792725, "learning_rate": 2.5405659652095573e-07, "loss": 0.347, "num_input_tokens_seen": 20220376, "step": 30810 }, { "epoch": 18.16922169811321, "grad_norm": 2.328660249710083, "learning_rate": 2.5324757223065655e-07, "loss": 0.3059, "num_input_tokens_seen": 20223192, "step": 30815 }, { "epoch": 18.172169811320753, "grad_norm": 3.7299728393554688, "learning_rate": 2.524398046635207e-07, "loss": 0.355, "num_input_tokens_seen": 20226232, "step": 30820 }, { "epoch": 18.1751179245283, "grad_norm": 3.7154784202575684, "learning_rate": 2.51633294033406e-07, "loss": 0.336, "num_input_tokens_seen": 20229464, "step": 30825 }, { "epoch": 18.17806603773585, "grad_norm": 2.965959310531616, "learning_rate": 2.5082804055384214e-07, "loss": 0.3637, "num_input_tokens_seen": 20232856, "step": 30830 }, { "epoch": 18.181014150943398, "grad_norm": 3.2134757041931152, "learning_rate": 2.50024044438022e-07, "loss": 0.4086, "num_input_tokens_seen": 20236920, "step": 30835 }, { "epoch": 18.183962264150942, "grad_norm": 2.9329447746276855, "learning_rate": 2.492213058988069e-07, "loss": 0.3528, "num_input_tokens_seen": 20240248, "step": 30840 }, { "epoch": 18.18691037735849, "grad_norm": 3.2297394275665283, "learning_rate": 2.4841982514872633e-07, "loss": 0.2433, "num_input_tokens_seen": 20243352, "step": 30845 }, { "epoch": 18.18985849056604, "grad_norm": 2.660773277282715, "learning_rate": 2.4761960239997497e-07, "loss": 0.2945, "num_input_tokens_seen": 20246936, "step": 30850 }, { "epoch": 18.192806603773583, "grad_norm": 2.1894328594207764, "learning_rate": 2.4682063786441556e-07, "loss": 0.2581, "num_input_tokens_seen": 20250104, "step": 30855 }, { "epoch": 18.19575471698113, "grad_norm": 2.440031051635742, "learning_rate": 2.460229317535778e-07, "loss": 0.3458, "num_input_tokens_seen": 20253624, "step": 30860 }, { "epoch": 18.19870283018868, "grad_norm": 3.6991117000579834, "learning_rate": 2.4522648427865725e-07, "loss": 0.2934, "num_input_tokens_seen": 20256504, "step": 30865 }, { "epoch": 18.201650943396228, "grad_norm": 5.972622871398926, "learning_rate": 2.444312956505163e-07, "loss": 0.3429, "num_input_tokens_seen": 20259928, "step": 30870 }, { "epoch": 18.204599056603772, "grad_norm": 5.4500346183776855, "learning_rate": 2.4363736607968537e-07, "loss": 0.4013, "num_input_tokens_seen": 20262968, "step": 30875 }, { "epoch": 18.20754716981132, "grad_norm": 3.068817377090454, "learning_rate": 2.428446957763608e-07, "loss": 0.2699, "num_input_tokens_seen": 20266264, "step": 30880 }, { "epoch": 18.21049528301887, "grad_norm": 3.2428267002105713, "learning_rate": 2.4205328495040535e-07, "loss": 0.3173, "num_input_tokens_seen": 20268984, "step": 30885 }, { "epoch": 18.213443396226417, "grad_norm": 3.2289199829101562, "learning_rate": 2.412631338113486e-07, "loss": 0.2558, "num_input_tokens_seen": 20271992, "step": 30890 }, { "epoch": 18.21639150943396, "grad_norm": 9.333342552185059, "learning_rate": 2.404742425683848e-07, "loss": 0.3293, "num_input_tokens_seen": 20275864, "step": 30895 }, { "epoch": 18.21933962264151, "grad_norm": 3.302374839782715, "learning_rate": 2.3968661143037864e-07, "loss": 0.2668, "num_input_tokens_seen": 20279224, "step": 30900 }, { "epoch": 18.222287735849058, "grad_norm": 4.582475185394287, "learning_rate": 2.3890024060585823e-07, "loss": 0.3363, "num_input_tokens_seen": 20283000, "step": 30905 }, { "epoch": 18.225235849056602, "grad_norm": 3.264417886734009, "learning_rate": 2.3811513030301826e-07, "loss": 0.2881, "num_input_tokens_seen": 20285592, "step": 30910 }, { "epoch": 18.22818396226415, "grad_norm": 3.9075872898101807, "learning_rate": 2.373312807297201e-07, "loss": 0.3081, "num_input_tokens_seen": 20288632, "step": 30915 }, { "epoch": 18.2311320754717, "grad_norm": 6.793896198272705, "learning_rate": 2.3654869209349007e-07, "loss": 0.5219, "num_input_tokens_seen": 20292024, "step": 30920 }, { "epoch": 18.234080188679247, "grad_norm": 3.166482448577881, "learning_rate": 2.357673646015246e-07, "loss": 0.3114, "num_input_tokens_seen": 20294680, "step": 30925 }, { "epoch": 18.23702830188679, "grad_norm": 2.282170534133911, "learning_rate": 2.3498729846068103e-07, "loss": 0.3499, "num_input_tokens_seen": 20298808, "step": 30930 }, { "epoch": 18.23997641509434, "grad_norm": 6.671809196472168, "learning_rate": 2.342084938774869e-07, "loss": 0.2269, "num_input_tokens_seen": 20303320, "step": 30935 }, { "epoch": 18.242924528301888, "grad_norm": 6.688050270080566, "learning_rate": 2.334309510581334e-07, "loss": 0.3361, "num_input_tokens_seen": 20306744, "step": 30940 }, { "epoch": 18.245872641509433, "grad_norm": 7.051071643829346, "learning_rate": 2.3265467020847864e-07, "loss": 0.2916, "num_input_tokens_seen": 20310296, "step": 30945 }, { "epoch": 18.24882075471698, "grad_norm": 3.26595401763916, "learning_rate": 2.31879651534046e-07, "loss": 0.2115, "num_input_tokens_seen": 20313496, "step": 30950 }, { "epoch": 18.25176886792453, "grad_norm": 3.753068447113037, "learning_rate": 2.311058952400247e-07, "loss": 0.3475, "num_input_tokens_seen": 20317656, "step": 30955 }, { "epoch": 18.254716981132077, "grad_norm": 3.0445077419281006, "learning_rate": 2.3033340153127026e-07, "loss": 0.3094, "num_input_tokens_seen": 20321048, "step": 30960 }, { "epoch": 18.25766509433962, "grad_norm": 4.6903486251831055, "learning_rate": 2.295621706123041e-07, "loss": 0.332, "num_input_tokens_seen": 20323800, "step": 30965 }, { "epoch": 18.26061320754717, "grad_norm": 3.8130438327789307, "learning_rate": 2.287922026873135e-07, "loss": 0.293, "num_input_tokens_seen": 20326040, "step": 30970 }, { "epoch": 18.263561320754718, "grad_norm": 3.2005016803741455, "learning_rate": 2.2802349796014923e-07, "loss": 0.303, "num_input_tokens_seen": 20329016, "step": 30975 }, { "epoch": 18.266509433962263, "grad_norm": 3.4477391242980957, "learning_rate": 2.2725605663433013e-07, "loss": 0.355, "num_input_tokens_seen": 20331352, "step": 30980 }, { "epoch": 18.26945754716981, "grad_norm": 2.871917486190796, "learning_rate": 2.264898789130393e-07, "loss": 0.5069, "num_input_tokens_seen": 20334392, "step": 30985 }, { "epoch": 18.27240566037736, "grad_norm": 10.676265716552734, "learning_rate": 2.2572496499912554e-07, "loss": 0.4874, "num_input_tokens_seen": 20337720, "step": 30990 }, { "epoch": 18.275353773584907, "grad_norm": 2.9488632678985596, "learning_rate": 2.2496131509510354e-07, "loss": 0.3107, "num_input_tokens_seen": 20340568, "step": 30995 }, { "epoch": 18.278301886792452, "grad_norm": 2.679182529449463, "learning_rate": 2.2419892940315268e-07, "loss": 0.2941, "num_input_tokens_seen": 20344344, "step": 31000 }, { "epoch": 18.28125, "grad_norm": 6.690992832183838, "learning_rate": 2.2343780812511819e-07, "loss": 0.3141, "num_input_tokens_seen": 20347320, "step": 31005 }, { "epoch": 18.284198113207548, "grad_norm": 5.172159194946289, "learning_rate": 2.2267795146250936e-07, "loss": 0.2579, "num_input_tokens_seen": 20351160, "step": 31010 }, { "epoch": 18.287146226415093, "grad_norm": 3.2337253093719482, "learning_rate": 2.2191935961650146e-07, "loss": 0.2893, "num_input_tokens_seen": 20353912, "step": 31015 }, { "epoch": 18.29009433962264, "grad_norm": 6.4315056800842285, "learning_rate": 2.2116203278793603e-07, "loss": 0.3887, "num_input_tokens_seen": 20357080, "step": 31020 }, { "epoch": 18.29304245283019, "grad_norm": 3.5322749614715576, "learning_rate": 2.2040597117731766e-07, "loss": 0.2883, "num_input_tokens_seen": 20359384, "step": 31025 }, { "epoch": 18.295990566037737, "grad_norm": 4.6669416427612305, "learning_rate": 2.1965117498481793e-07, "loss": 0.253, "num_input_tokens_seen": 20363128, "step": 31030 }, { "epoch": 18.298938679245282, "grad_norm": 4.731997489929199, "learning_rate": 2.188976444102714e-07, "loss": 0.3827, "num_input_tokens_seen": 20366648, "step": 31035 }, { "epoch": 18.30188679245283, "grad_norm": 2.7553911209106445, "learning_rate": 2.181453796531796e-07, "loss": 0.3758, "num_input_tokens_seen": 20370552, "step": 31040 }, { "epoch": 18.30483490566038, "grad_norm": 2.9746344089508057, "learning_rate": 2.1739438091270658e-07, "loss": 0.3357, "num_input_tokens_seen": 20373880, "step": 31045 }, { "epoch": 18.307783018867923, "grad_norm": 4.784595966339111, "learning_rate": 2.1664464838768329e-07, "loss": 0.2936, "num_input_tokens_seen": 20377112, "step": 31050 }, { "epoch": 18.31073113207547, "grad_norm": 4.768202781677246, "learning_rate": 2.1589618227660426e-07, "loss": 0.3465, "num_input_tokens_seen": 20379704, "step": 31055 }, { "epoch": 18.31367924528302, "grad_norm": 2.616786003112793, "learning_rate": 2.151489827776293e-07, "loss": 0.4509, "num_input_tokens_seen": 20382808, "step": 31060 }, { "epoch": 18.316627358490567, "grad_norm": 4.187252044677734, "learning_rate": 2.1440305008858298e-07, "loss": 0.3323, "num_input_tokens_seen": 20386264, "step": 31065 }, { "epoch": 18.319575471698112, "grad_norm": 4.019891738891602, "learning_rate": 2.1365838440695397e-07, "loss": 0.3469, "num_input_tokens_seen": 20390008, "step": 31070 }, { "epoch": 18.32252358490566, "grad_norm": 2.1569669246673584, "learning_rate": 2.129149859298957e-07, "loss": 0.2877, "num_input_tokens_seen": 20393176, "step": 31075 }, { "epoch": 18.32547169811321, "grad_norm": 4.719166278839111, "learning_rate": 2.1217285485422622e-07, "loss": 0.4782, "num_input_tokens_seen": 20396024, "step": 31080 }, { "epoch": 18.328419811320753, "grad_norm": 4.369931697845459, "learning_rate": 2.114319913764268e-07, "loss": 0.3159, "num_input_tokens_seen": 20398712, "step": 31085 }, { "epoch": 18.3313679245283, "grad_norm": 2.314028263092041, "learning_rate": 2.10692395692646e-07, "loss": 0.2012, "num_input_tokens_seen": 20401336, "step": 31090 }, { "epoch": 18.33431603773585, "grad_norm": 3.613093614578247, "learning_rate": 2.0995406799869444e-07, "loss": 0.3107, "num_input_tokens_seen": 20404312, "step": 31095 }, { "epoch": 18.337264150943398, "grad_norm": 4.393362045288086, "learning_rate": 2.0921700849004743e-07, "loss": 0.2598, "num_input_tokens_seen": 20408088, "step": 31100 }, { "epoch": 18.340212264150942, "grad_norm": 2.7265560626983643, "learning_rate": 2.084812173618439e-07, "loss": 0.3302, "num_input_tokens_seen": 20410872, "step": 31105 }, { "epoch": 18.34316037735849, "grad_norm": 3.519169807434082, "learning_rate": 2.0774669480888853e-07, "loss": 0.2499, "num_input_tokens_seen": 20413752, "step": 31110 }, { "epoch": 18.34610849056604, "grad_norm": 5.125545978546143, "learning_rate": 2.0701344102564912e-07, "loss": 0.2222, "num_input_tokens_seen": 20416952, "step": 31115 }, { "epoch": 18.349056603773583, "grad_norm": 3.755362033843994, "learning_rate": 2.062814562062576e-07, "loss": 0.2602, "num_input_tokens_seen": 20420472, "step": 31120 }, { "epoch": 18.35200471698113, "grad_norm": 3.4256207942962646, "learning_rate": 2.0555074054451063e-07, "loss": 0.2978, "num_input_tokens_seen": 20424632, "step": 31125 }, { "epoch": 18.35495283018868, "grad_norm": 4.0963664054870605, "learning_rate": 2.0482129423386843e-07, "loss": 0.4084, "num_input_tokens_seen": 20427992, "step": 31130 }, { "epoch": 18.357900943396228, "grad_norm": 3.427919864654541, "learning_rate": 2.040931174674543e-07, "loss": 0.2987, "num_input_tokens_seen": 20431032, "step": 31135 }, { "epoch": 18.360849056603772, "grad_norm": 4.513065814971924, "learning_rate": 2.0336621043805682e-07, "loss": 0.462, "num_input_tokens_seen": 20434584, "step": 31140 }, { "epoch": 18.36379716981132, "grad_norm": 3.121596574783325, "learning_rate": 2.0264057333812704e-07, "loss": 0.2702, "num_input_tokens_seen": 20437784, "step": 31145 }, { "epoch": 18.36674528301887, "grad_norm": 5.3565497398376465, "learning_rate": 2.0191620635978127e-07, "loss": 0.2781, "num_input_tokens_seen": 20440248, "step": 31150 }, { "epoch": 18.369693396226417, "grad_norm": 2.816514730453491, "learning_rate": 2.0119310969479833e-07, "loss": 0.3716, "num_input_tokens_seen": 20442936, "step": 31155 }, { "epoch": 18.37264150943396, "grad_norm": 4.210086345672607, "learning_rate": 2.004712835346212e-07, "loss": 0.2614, "num_input_tokens_seen": 20446392, "step": 31160 }, { "epoch": 18.37558962264151, "grad_norm": 8.4136381149292, "learning_rate": 1.99750728070357e-07, "loss": 0.2635, "num_input_tokens_seen": 20449048, "step": 31165 }, { "epoch": 18.378537735849058, "grad_norm": 3.738391160964966, "learning_rate": 1.9903144349277536e-07, "loss": 0.43, "num_input_tokens_seen": 20452888, "step": 31170 }, { "epoch": 18.381485849056602, "grad_norm": 12.177677154541016, "learning_rate": 1.983134299923095e-07, "loss": 0.4782, "num_input_tokens_seen": 20458008, "step": 31175 }, { "epoch": 18.38443396226415, "grad_norm": 3.0526421070098877, "learning_rate": 1.9759668775905737e-07, "loss": 0.2945, "num_input_tokens_seen": 20461496, "step": 31180 }, { "epoch": 18.3873820754717, "grad_norm": 3.1576173305511475, "learning_rate": 1.9688121698277995e-07, "loss": 0.2935, "num_input_tokens_seen": 20464600, "step": 31185 }, { "epoch": 18.390330188679247, "grad_norm": 3.5299627780914307, "learning_rate": 1.9616701785290015e-07, "loss": 0.3288, "num_input_tokens_seen": 20467480, "step": 31190 }, { "epoch": 18.39327830188679, "grad_norm": 4.059203624725342, "learning_rate": 1.954540905585056e-07, "loss": 0.537, "num_input_tokens_seen": 20470744, "step": 31195 }, { "epoch": 18.39622641509434, "grad_norm": 2.9347121715545654, "learning_rate": 1.9474243528834757e-07, "loss": 0.4263, "num_input_tokens_seen": 20474584, "step": 31200 }, { "epoch": 18.399174528301888, "grad_norm": 2.954437017440796, "learning_rate": 1.9403205223083866e-07, "loss": 0.4038, "num_input_tokens_seen": 20477240, "step": 31205 }, { "epoch": 18.402122641509433, "grad_norm": 2.584096670150757, "learning_rate": 1.9332294157405619e-07, "loss": 0.3398, "num_input_tokens_seen": 20480632, "step": 31210 }, { "epoch": 18.40507075471698, "grad_norm": 5.013119220733643, "learning_rate": 1.926151035057411e-07, "loss": 0.3525, "num_input_tokens_seen": 20483448, "step": 31215 }, { "epoch": 18.40801886792453, "grad_norm": 2.974757432937622, "learning_rate": 1.9190853821329626e-07, "loss": 0.3032, "num_input_tokens_seen": 20488056, "step": 31220 }, { "epoch": 18.410966981132077, "grad_norm": 8.193241119384766, "learning_rate": 1.9120324588378757e-07, "loss": 0.3452, "num_input_tokens_seen": 20491768, "step": 31225 }, { "epoch": 18.41391509433962, "grad_norm": 5.425990581512451, "learning_rate": 1.9049922670394461e-07, "loss": 0.2895, "num_input_tokens_seen": 20494488, "step": 31230 }, { "epoch": 18.41686320754717, "grad_norm": 6.527312755584717, "learning_rate": 1.897964808601588e-07, "loss": 0.389, "num_input_tokens_seen": 20497336, "step": 31235 }, { "epoch": 18.419811320754718, "grad_norm": 2.3552169799804688, "learning_rate": 1.8909500853848517e-07, "loss": 0.3064, "num_input_tokens_seen": 20500856, "step": 31240 }, { "epoch": 18.422759433962263, "grad_norm": 2.2908132076263428, "learning_rate": 1.8839480992464243e-07, "loss": 0.3944, "num_input_tokens_seen": 20503576, "step": 31245 }, { "epoch": 18.42570754716981, "grad_norm": 7.449656009674072, "learning_rate": 1.8769588520401005e-07, "loss": 0.2317, "num_input_tokens_seen": 20506296, "step": 31250 }, { "epoch": 18.42865566037736, "grad_norm": 4.253210544586182, "learning_rate": 1.8699823456163279e-07, "loss": 0.2193, "num_input_tokens_seen": 20510680, "step": 31255 }, { "epoch": 18.431603773584907, "grad_norm": 4.59022855758667, "learning_rate": 1.8630185818221514e-07, "loss": 0.3128, "num_input_tokens_seen": 20512920, "step": 31260 }, { "epoch": 18.434551886792452, "grad_norm": 3.8318538665771484, "learning_rate": 1.856067562501268e-07, "loss": 0.4246, "num_input_tokens_seen": 20517816, "step": 31265 }, { "epoch": 18.4375, "grad_norm": 4.464165687561035, "learning_rate": 1.8491292894939837e-07, "loss": 0.272, "num_input_tokens_seen": 20521016, "step": 31270 }, { "epoch": 18.440448113207548, "grad_norm": 2.8426120281219482, "learning_rate": 1.8422037646372405e-07, "loss": 0.3209, "num_input_tokens_seen": 20523832, "step": 31275 }, { "epoch": 18.443396226415093, "grad_norm": 3.2962281703948975, "learning_rate": 1.8352909897645989e-07, "loss": 0.2843, "num_input_tokens_seen": 20526648, "step": 31280 }, { "epoch": 18.44634433962264, "grad_norm": 3.097761631011963, "learning_rate": 1.8283909667062448e-07, "loss": 0.2706, "num_input_tokens_seen": 20529048, "step": 31285 }, { "epoch": 18.44929245283019, "grad_norm": 3.4493138790130615, "learning_rate": 1.82150369728899e-07, "loss": 0.3382, "num_input_tokens_seen": 20531800, "step": 31290 }, { "epoch": 18.452240566037737, "grad_norm": 2.952893018722534, "learning_rate": 1.814629183336275e-07, "loss": 0.3009, "num_input_tokens_seen": 20535864, "step": 31295 }, { "epoch": 18.455188679245282, "grad_norm": 4.700063228607178, "learning_rate": 1.807767426668139e-07, "loss": 0.3318, "num_input_tokens_seen": 20538936, "step": 31300 }, { "epoch": 18.45813679245283, "grad_norm": 2.5729267597198486, "learning_rate": 1.8009184291012783e-07, "loss": 0.2577, "num_input_tokens_seen": 20541784, "step": 31305 }, { "epoch": 18.46108490566038, "grad_norm": 2.9156315326690674, "learning_rate": 1.7940821924489926e-07, "loss": 0.3872, "num_input_tokens_seen": 20544984, "step": 31310 }, { "epoch": 18.464033018867923, "grad_norm": 6.569728374481201, "learning_rate": 1.7872587185212009e-07, "loss": 0.3086, "num_input_tokens_seen": 20547448, "step": 31315 }, { "epoch": 18.46698113207547, "grad_norm": 5.048493385314941, "learning_rate": 1.7804480091244524e-07, "loss": 0.2976, "num_input_tokens_seen": 20550552, "step": 31320 }, { "epoch": 18.46992924528302, "grad_norm": 4.995121002197266, "learning_rate": 1.7736500660619104e-07, "loss": 0.4666, "num_input_tokens_seen": 20557144, "step": 31325 }, { "epoch": 18.472877358490567, "grad_norm": 4.634515285491943, "learning_rate": 1.766864891133352e-07, "loss": 0.2395, "num_input_tokens_seen": 20560088, "step": 31330 }, { "epoch": 18.475825471698112, "grad_norm": 5.615715980529785, "learning_rate": 1.7600924861351843e-07, "loss": 0.3341, "num_input_tokens_seen": 20562840, "step": 31335 }, { "epoch": 18.47877358490566, "grad_norm": 4.749587059020996, "learning_rate": 1.7533328528604398e-07, "loss": 0.2635, "num_input_tokens_seen": 20565816, "step": 31340 }, { "epoch": 18.48172169811321, "grad_norm": 2.838334560394287, "learning_rate": 1.746585993098754e-07, "loss": 0.3293, "num_input_tokens_seen": 20569016, "step": 31345 }, { "epoch": 18.484669811320753, "grad_norm": 2.3257219791412354, "learning_rate": 1.7398519086363864e-07, "loss": 0.3126, "num_input_tokens_seen": 20573496, "step": 31350 }, { "epoch": 18.4876179245283, "grad_norm": 2.8385307788848877, "learning_rate": 1.733130601256211e-07, "loss": 0.3277, "num_input_tokens_seen": 20577656, "step": 31355 }, { "epoch": 18.49056603773585, "grad_norm": 5.04785680770874, "learning_rate": 1.7264220727377323e-07, "loss": 0.4107, "num_input_tokens_seen": 20581464, "step": 31360 }, { "epoch": 18.493514150943398, "grad_norm": 6.172461032867432, "learning_rate": 1.7197263248570517e-07, "loss": 0.2563, "num_input_tokens_seen": 20584024, "step": 31365 }, { "epoch": 18.496462264150942, "grad_norm": 2.4732413291931152, "learning_rate": 1.7130433593869124e-07, "loss": 0.3034, "num_input_tokens_seen": 20587224, "step": 31370 }, { "epoch": 18.49941037735849, "grad_norm": 5.015571594238281, "learning_rate": 1.706373178096643e-07, "loss": 0.3952, "num_input_tokens_seen": 20590296, "step": 31375 }, { "epoch": 18.50235849056604, "grad_norm": 4.596898555755615, "learning_rate": 1.6997157827522092e-07, "loss": 0.317, "num_input_tokens_seen": 20594552, "step": 31380 }, { "epoch": 18.505306603773583, "grad_norm": 5.175604820251465, "learning_rate": 1.6930711751161843e-07, "loss": 0.3373, "num_input_tokens_seen": 20597496, "step": 31385 }, { "epoch": 18.50825471698113, "grad_norm": 3.819269895553589, "learning_rate": 1.6864393569477556e-07, "loss": 0.33, "num_input_tokens_seen": 20600792, "step": 31390 }, { "epoch": 18.51120283018868, "grad_norm": 4.998197078704834, "learning_rate": 1.6798203300027295e-07, "loss": 0.2721, "num_input_tokens_seen": 20603480, "step": 31395 }, { "epoch": 18.514150943396228, "grad_norm": 2.7222981452941895, "learning_rate": 1.6732140960335152e-07, "loss": 0.4433, "num_input_tokens_seen": 20606424, "step": 31400 }, { "epoch": 18.517099056603772, "grad_norm": 2.5885696411132812, "learning_rate": 1.666620656789153e-07, "loss": 0.2895, "num_input_tokens_seen": 20609368, "step": 31405 }, { "epoch": 18.52004716981132, "grad_norm": 4.355928421020508, "learning_rate": 1.660040014015274e-07, "loss": 0.2265, "num_input_tokens_seen": 20612152, "step": 31410 }, { "epoch": 18.52299528301887, "grad_norm": 7.701449394226074, "learning_rate": 1.6534721694541344e-07, "loss": 0.2292, "num_input_tokens_seen": 20615224, "step": 31415 }, { "epoch": 18.525943396226417, "grad_norm": 5.658308982849121, "learning_rate": 1.6469171248445993e-07, "loss": 0.3605, "num_input_tokens_seen": 20619096, "step": 31420 }, { "epoch": 18.52889150943396, "grad_norm": 2.5633015632629395, "learning_rate": 1.6403748819221464e-07, "loss": 0.2849, "num_input_tokens_seen": 20622584, "step": 31425 }, { "epoch": 18.53183962264151, "grad_norm": 3.8886990547180176, "learning_rate": 1.6338454424188632e-07, "loss": 0.3266, "num_input_tokens_seen": 20625720, "step": 31430 }, { "epoch": 18.534787735849058, "grad_norm": 1.8646730184555054, "learning_rate": 1.6273288080634442e-07, "loss": 0.3146, "num_input_tokens_seen": 20629496, "step": 31435 }, { "epoch": 18.537735849056602, "grad_norm": 7.307372570037842, "learning_rate": 1.6208249805811982e-07, "loss": 0.3579, "num_input_tokens_seen": 20633112, "step": 31440 }, { "epoch": 18.54068396226415, "grad_norm": 3.0670418739318848, "learning_rate": 1.6143339616940423e-07, "loss": 0.2792, "num_input_tokens_seen": 20636664, "step": 31445 }, { "epoch": 18.5436320754717, "grad_norm": 2.7095487117767334, "learning_rate": 1.6078557531205018e-07, "loss": 0.3204, "num_input_tokens_seen": 20639448, "step": 31450 }, { "epoch": 18.546580188679247, "grad_norm": 3.0444815158843994, "learning_rate": 1.601390356575705e-07, "loss": 0.3522, "num_input_tokens_seen": 20642552, "step": 31455 }, { "epoch": 18.54952830188679, "grad_norm": 3.6967384815216064, "learning_rate": 1.5949377737713988e-07, "loss": 0.304, "num_input_tokens_seen": 20645048, "step": 31460 }, { "epoch": 18.55247641509434, "grad_norm": 3.7687265872955322, "learning_rate": 1.5884980064159338e-07, "loss": 0.4316, "num_input_tokens_seen": 20647992, "step": 31465 }, { "epoch": 18.555424528301888, "grad_norm": 3.6180388927459717, "learning_rate": 1.5820710562142627e-07, "loss": 0.2463, "num_input_tokens_seen": 20651288, "step": 31470 }, { "epoch": 18.558372641509433, "grad_norm": 3.3070473670959473, "learning_rate": 1.575656924867952e-07, "loss": 0.3566, "num_input_tokens_seen": 20654808, "step": 31475 }, { "epoch": 18.56132075471698, "grad_norm": 5.046578884124756, "learning_rate": 1.5692556140751658e-07, "loss": 0.3668, "num_input_tokens_seen": 20658456, "step": 31480 }, { "epoch": 18.56426886792453, "grad_norm": 3.2417638301849365, "learning_rate": 1.5628671255306706e-07, "loss": 0.2838, "num_input_tokens_seen": 20661368, "step": 31485 }, { "epoch": 18.567216981132077, "grad_norm": 2.580996036529541, "learning_rate": 1.556491460925863e-07, "loss": 0.2344, "num_input_tokens_seen": 20664408, "step": 31490 }, { "epoch": 18.57016509433962, "grad_norm": 2.6844542026519775, "learning_rate": 1.550128621948721e-07, "loss": 0.2972, "num_input_tokens_seen": 20667576, "step": 31495 }, { "epoch": 18.57311320754717, "grad_norm": 3.5275754928588867, "learning_rate": 1.5437786102838413e-07, "loss": 0.2337, "num_input_tokens_seen": 20670488, "step": 31500 }, { "epoch": 18.576061320754718, "grad_norm": 4.732057571411133, "learning_rate": 1.5374414276124017e-07, "loss": 0.4612, "num_input_tokens_seen": 20673336, "step": 31505 }, { "epoch": 18.579009433962263, "grad_norm": 3.3838539123535156, "learning_rate": 1.5311170756122095e-07, "loss": 0.2341, "num_input_tokens_seen": 20677528, "step": 31510 }, { "epoch": 18.58195754716981, "grad_norm": 7.523408889770508, "learning_rate": 1.5248055559576647e-07, "loss": 0.2597, "num_input_tokens_seen": 20679704, "step": 31515 }, { "epoch": 18.58490566037736, "grad_norm": 4.364363670349121, "learning_rate": 1.5185068703197526e-07, "loss": 0.4385, "num_input_tokens_seen": 20682808, "step": 31520 }, { "epoch": 18.587853773584907, "grad_norm": 3.0016653537750244, "learning_rate": 1.5122210203661004e-07, "loss": 0.3538, "num_input_tokens_seen": 20686616, "step": 31525 }, { "epoch": 18.590801886792452, "grad_norm": 5.49711275100708, "learning_rate": 1.505948007760899e-07, "loss": 0.3281, "num_input_tokens_seen": 20692664, "step": 31530 }, { "epoch": 18.59375, "grad_norm": 3.3256657123565674, "learning_rate": 1.4996878341649647e-07, "loss": 0.3015, "num_input_tokens_seen": 20696440, "step": 31535 }, { "epoch": 18.596698113207548, "grad_norm": 2.796574115753174, "learning_rate": 1.493440501235699e-07, "loss": 0.3017, "num_input_tokens_seen": 20699736, "step": 31540 }, { "epoch": 18.599646226415093, "grad_norm": 3.425859212875366, "learning_rate": 1.487206010627118e-07, "loss": 0.2651, "num_input_tokens_seen": 20703416, "step": 31545 }, { "epoch": 18.60259433962264, "grad_norm": 2.6110188961029053, "learning_rate": 1.4809843639898124e-07, "loss": 0.2863, "num_input_tokens_seen": 20706968, "step": 31550 }, { "epoch": 18.60554245283019, "grad_norm": 3.3318066596984863, "learning_rate": 1.4747755629710093e-07, "loss": 0.2621, "num_input_tokens_seen": 20709720, "step": 31555 }, { "epoch": 18.608490566037737, "grad_norm": 4.372989177703857, "learning_rate": 1.4685796092145045e-07, "loss": 0.2685, "num_input_tokens_seen": 20712536, "step": 31560 }, { "epoch": 18.611438679245282, "grad_norm": 3.0544168949127197, "learning_rate": 1.4623965043607135e-07, "loss": 0.2524, "num_input_tokens_seen": 20715928, "step": 31565 }, { "epoch": 18.61438679245283, "grad_norm": 7.62545919418335, "learning_rate": 1.4562262500466273e-07, "loss": 0.3507, "num_input_tokens_seen": 20718680, "step": 31570 }, { "epoch": 18.61733490566038, "grad_norm": 3.4410581588745117, "learning_rate": 1.4500688479058556e-07, "loss": 0.2732, "num_input_tokens_seen": 20721144, "step": 31575 }, { "epoch": 18.620283018867923, "grad_norm": 2.439364194869995, "learning_rate": 1.4439242995685943e-07, "loss": 0.3356, "num_input_tokens_seen": 20724728, "step": 31580 }, { "epoch": 18.62323113207547, "grad_norm": 4.0596747398376465, "learning_rate": 1.4377926066616364e-07, "loss": 0.2704, "num_input_tokens_seen": 20727800, "step": 31585 }, { "epoch": 18.62617924528302, "grad_norm": 3.7012827396392822, "learning_rate": 1.4316737708083783e-07, "loss": 0.4012, "num_input_tokens_seen": 20730360, "step": 31590 }, { "epoch": 18.629127358490567, "grad_norm": 4.458847999572754, "learning_rate": 1.4255677936288127e-07, "loss": 0.2743, "num_input_tokens_seen": 20733240, "step": 31595 }, { "epoch": 18.632075471698112, "grad_norm": 6.175071716308594, "learning_rate": 1.4194746767395184e-07, "loss": 0.3517, "num_input_tokens_seen": 20736728, "step": 31600 }, { "epoch": 18.63502358490566, "grad_norm": 2.5431156158447266, "learning_rate": 1.4133944217536722e-07, "loss": 0.2995, "num_input_tokens_seen": 20739704, "step": 31605 }, { "epoch": 18.63797169811321, "grad_norm": 2.8353729248046875, "learning_rate": 1.4073270302810471e-07, "loss": 0.2768, "num_input_tokens_seen": 20744600, "step": 31610 }, { "epoch": 18.640919811320753, "grad_norm": 3.1900107860565186, "learning_rate": 1.4012725039280084e-07, "loss": 0.3365, "num_input_tokens_seen": 20747416, "step": 31615 }, { "epoch": 18.6438679245283, "grad_norm": 5.103302955627441, "learning_rate": 1.3952308442975292e-07, "loss": 0.3993, "num_input_tokens_seen": 20749848, "step": 31620 }, { "epoch": 18.64681603773585, "grad_norm": 3.770167827606201, "learning_rate": 1.3892020529891637e-07, "loss": 0.3511, "num_input_tokens_seen": 20753144, "step": 31625 }, { "epoch": 18.649764150943398, "grad_norm": 4.093818664550781, "learning_rate": 1.3831861315990514e-07, "loss": 0.3811, "num_input_tokens_seen": 20756152, "step": 31630 }, { "epoch": 18.652712264150942, "grad_norm": 2.413487672805786, "learning_rate": 1.377183081719935e-07, "loss": 0.3669, "num_input_tokens_seen": 20759384, "step": 31635 }, { "epoch": 18.65566037735849, "grad_norm": 3.344602584838867, "learning_rate": 1.3711929049411544e-07, "loss": 0.3188, "num_input_tokens_seen": 20762008, "step": 31640 }, { "epoch": 18.65860849056604, "grad_norm": 3.412999153137207, "learning_rate": 1.365215602848624e-07, "loss": 0.3149, "num_input_tokens_seen": 20765016, "step": 31645 }, { "epoch": 18.661556603773583, "grad_norm": 3.199430465698242, "learning_rate": 1.3592511770248727e-07, "loss": 0.3725, "num_input_tokens_seen": 20768024, "step": 31650 }, { "epoch": 18.66450471698113, "grad_norm": 4.593005657196045, "learning_rate": 1.3532996290490041e-07, "loss": 0.3812, "num_input_tokens_seen": 20772600, "step": 31655 }, { "epoch": 18.66745283018868, "grad_norm": 3.111488103866577, "learning_rate": 1.347360960496713e-07, "loss": 0.2977, "num_input_tokens_seen": 20776184, "step": 31660 }, { "epoch": 18.670400943396228, "grad_norm": 3.6489484310150146, "learning_rate": 1.3414351729402862e-07, "loss": 0.3159, "num_input_tokens_seen": 20779576, "step": 31665 }, { "epoch": 18.673349056603772, "grad_norm": 3.1500236988067627, "learning_rate": 1.3355222679486025e-07, "loss": 0.2965, "num_input_tokens_seen": 20782520, "step": 31670 }, { "epoch": 18.67629716981132, "grad_norm": 6.310462951660156, "learning_rate": 1.3296222470871367e-07, "loss": 0.2769, "num_input_tokens_seen": 20786616, "step": 31675 }, { "epoch": 18.67924528301887, "grad_norm": 2.0988576412200928, "learning_rate": 1.3237351119179287e-07, "loss": 0.2714, "num_input_tokens_seen": 20791640, "step": 31680 }, { "epoch": 18.682193396226417, "grad_norm": 3.343992233276367, "learning_rate": 1.3178608639996425e-07, "loss": 0.2982, "num_input_tokens_seen": 20794488, "step": 31685 }, { "epoch": 18.68514150943396, "grad_norm": 3.638942003250122, "learning_rate": 1.3119995048874957e-07, "loss": 0.573, "num_input_tokens_seen": 20797656, "step": 31690 }, { "epoch": 18.68808962264151, "grad_norm": 4.459532260894775, "learning_rate": 1.3061510361333186e-07, "loss": 0.3981, "num_input_tokens_seen": 20801080, "step": 31695 }, { "epoch": 18.691037735849058, "grad_norm": 5.254451751708984, "learning_rate": 1.3003154592855116e-07, "loss": 0.3159, "num_input_tokens_seen": 20804408, "step": 31700 }, { "epoch": 18.693985849056602, "grad_norm": 2.7353131771087646, "learning_rate": 1.2944927758890668e-07, "loss": 0.3413, "num_input_tokens_seen": 20807864, "step": 31705 }, { "epoch": 18.69693396226415, "grad_norm": 5.080729961395264, "learning_rate": 1.2886829874855733e-07, "loss": 0.2632, "num_input_tokens_seen": 20810424, "step": 31710 }, { "epoch": 18.6998820754717, "grad_norm": 3.558445453643799, "learning_rate": 1.2828860956131894e-07, "loss": 0.3029, "num_input_tokens_seen": 20813880, "step": 31715 }, { "epoch": 18.702830188679247, "grad_norm": 3.503539800643921, "learning_rate": 1.2771021018066765e-07, "loss": 0.3444, "num_input_tokens_seen": 20817432, "step": 31720 }, { "epoch": 18.70577830188679, "grad_norm": 5.026356220245361, "learning_rate": 1.271331007597365e-07, "loss": 0.3817, "num_input_tokens_seen": 20820664, "step": 31725 }, { "epoch": 18.70872641509434, "grad_norm": 3.893691062927246, "learning_rate": 1.2655728145131774e-07, "loss": 0.3164, "num_input_tokens_seen": 20824024, "step": 31730 }, { "epoch": 18.711674528301888, "grad_norm": 3.999563217163086, "learning_rate": 1.2598275240786105e-07, "loss": 0.3675, "num_input_tokens_seen": 20827128, "step": 31735 }, { "epoch": 18.714622641509433, "grad_norm": 2.9170448780059814, "learning_rate": 1.254095137814776e-07, "loss": 0.2537, "num_input_tokens_seen": 20830424, "step": 31740 }, { "epoch": 18.71757075471698, "grad_norm": 3.517791271209717, "learning_rate": 1.2483756572393368e-07, "loss": 0.2486, "num_input_tokens_seen": 20836856, "step": 31745 }, { "epoch": 18.72051886792453, "grad_norm": 4.597444534301758, "learning_rate": 1.242669083866549e-07, "loss": 0.3197, "num_input_tokens_seen": 20839928, "step": 31750 }, { "epoch": 18.723466981132077, "grad_norm": 2.18725848197937, "learning_rate": 1.2369754192072537e-07, "loss": 0.2837, "num_input_tokens_seen": 20843480, "step": 31755 }, { "epoch": 18.72641509433962, "grad_norm": 6.269420623779297, "learning_rate": 1.231294664768873e-07, "loss": 0.529, "num_input_tokens_seen": 20846680, "step": 31760 }, { "epoch": 18.72936320754717, "grad_norm": 3.892496109008789, "learning_rate": 1.225626822055409e-07, "loss": 0.2211, "num_input_tokens_seen": 20849592, "step": 31765 }, { "epoch": 18.732311320754718, "grad_norm": 5.038003921508789, "learning_rate": 1.2199718925674508e-07, "loss": 0.3587, "num_input_tokens_seen": 20853112, "step": 31770 }, { "epoch": 18.735259433962263, "grad_norm": 4.222679138183594, "learning_rate": 1.2143298778021616e-07, "loss": 0.4134, "num_input_tokens_seen": 20856504, "step": 31775 }, { "epoch": 18.73820754716981, "grad_norm": 3.8558907508850098, "learning_rate": 1.2087007792532967e-07, "loss": 0.3416, "num_input_tokens_seen": 20858776, "step": 31780 }, { "epoch": 18.74115566037736, "grad_norm": 2.8027243614196777, "learning_rate": 1.203084598411175e-07, "loss": 0.2291, "num_input_tokens_seen": 20862168, "step": 31785 }, { "epoch": 18.744103773584907, "grad_norm": 5.895125865936279, "learning_rate": 1.1974813367627124e-07, "loss": 0.2869, "num_input_tokens_seen": 20865208, "step": 31790 }, { "epoch": 18.747051886792452, "grad_norm": 2.646333694458008, "learning_rate": 1.1918909957913949e-07, "loss": 0.2803, "num_input_tokens_seen": 20868440, "step": 31795 }, { "epoch": 18.75, "grad_norm": 5.393239974975586, "learning_rate": 1.1863135769772827e-07, "loss": 0.3196, "num_input_tokens_seen": 20871352, "step": 31800 }, { "epoch": 18.752948113207548, "grad_norm": 4.769639015197754, "learning_rate": 1.1807490817970279e-07, "loss": 0.2862, "num_input_tokens_seen": 20873560, "step": 31805 }, { "epoch": 18.755896226415093, "grad_norm": 3.851017713546753, "learning_rate": 1.1751975117238578e-07, "loss": 0.3731, "num_input_tokens_seen": 20876472, "step": 31810 }, { "epoch": 18.75884433962264, "grad_norm": 2.147902011871338, "learning_rate": 1.1696588682275633e-07, "loss": 0.4163, "num_input_tokens_seen": 20880408, "step": 31815 }, { "epoch": 18.76179245283019, "grad_norm": 3.35772442817688, "learning_rate": 1.1641331527745325e-07, "loss": 0.2867, "num_input_tokens_seen": 20883512, "step": 31820 }, { "epoch": 18.764740566037737, "grad_norm": 3.63387393951416, "learning_rate": 1.1586203668277229e-07, "loss": 0.3233, "num_input_tokens_seen": 20887160, "step": 31825 }, { "epoch": 18.767688679245282, "grad_norm": 5.606722354888916, "learning_rate": 1.1531205118466615e-07, "loss": 0.2464, "num_input_tokens_seen": 20890776, "step": 31830 }, { "epoch": 18.77063679245283, "grad_norm": 9.179000854492188, "learning_rate": 1.1476335892874669e-07, "loss": 0.3572, "num_input_tokens_seen": 20894200, "step": 31835 }, { "epoch": 18.77358490566038, "grad_norm": 4.07706356048584, "learning_rate": 1.1421596006028157e-07, "loss": 0.3201, "num_input_tokens_seen": 20897656, "step": 31840 }, { "epoch": 18.776533018867923, "grad_norm": 2.7614035606384277, "learning_rate": 1.1366985472419823e-07, "loss": 0.3388, "num_input_tokens_seen": 20901464, "step": 31845 }, { "epoch": 18.77948113207547, "grad_norm": 3.067110061645508, "learning_rate": 1.1312504306507987e-07, "loss": 0.2973, "num_input_tokens_seen": 20904088, "step": 31850 }, { "epoch": 18.78242924528302, "grad_norm": 2.416315793991089, "learning_rate": 1.1258152522716725e-07, "loss": 0.2594, "num_input_tokens_seen": 20906680, "step": 31855 }, { "epoch": 18.785377358490567, "grad_norm": 5.913733959197998, "learning_rate": 1.1203930135435914e-07, "loss": 0.4256, "num_input_tokens_seen": 20910936, "step": 31860 }, { "epoch": 18.788325471698112, "grad_norm": 2.0822951793670654, "learning_rate": 1.1149837159021238e-07, "loss": 0.23, "num_input_tokens_seen": 20914840, "step": 31865 }, { "epoch": 18.79127358490566, "grad_norm": 5.5424394607543945, "learning_rate": 1.1095873607793961e-07, "loss": 0.2099, "num_input_tokens_seen": 20917496, "step": 31870 }, { "epoch": 18.79422169811321, "grad_norm": 2.3658769130706787, "learning_rate": 1.1042039496041212e-07, "loss": 0.2761, "num_input_tokens_seen": 20920856, "step": 31875 }, { "epoch": 18.797169811320753, "grad_norm": 3.021974563598633, "learning_rate": 1.0988334838015812e-07, "loss": 0.3949, "num_input_tokens_seen": 20923736, "step": 31880 }, { "epoch": 18.8001179245283, "grad_norm": 2.5438196659088135, "learning_rate": 1.0934759647936333e-07, "loss": 0.224, "num_input_tokens_seen": 20927832, "step": 31885 }, { "epoch": 18.80306603773585, "grad_norm": 3.899817943572998, "learning_rate": 1.0881313939986926e-07, "loss": 0.4279, "num_input_tokens_seen": 20930744, "step": 31890 }, { "epoch": 18.806014150943398, "grad_norm": 4.3550615310668945, "learning_rate": 1.0827997728317662e-07, "loss": 0.4258, "num_input_tokens_seen": 20934168, "step": 31895 }, { "epoch": 18.808962264150942, "grad_norm": 3.973264694213867, "learning_rate": 1.0774811027044196e-07, "loss": 0.3294, "num_input_tokens_seen": 20937816, "step": 31900 }, { "epoch": 18.81191037735849, "grad_norm": 3.0676727294921875, "learning_rate": 1.0721753850247984e-07, "loss": 0.3101, "num_input_tokens_seen": 20941624, "step": 31905 }, { "epoch": 18.81485849056604, "grad_norm": 2.9374053478240967, "learning_rate": 1.0668826211976124e-07, "loss": 0.4557, "num_input_tokens_seen": 20946168, "step": 31910 }, { "epoch": 18.817806603773583, "grad_norm": 4.368290901184082, "learning_rate": 1.0616028126241407e-07, "loss": 0.2488, "num_input_tokens_seen": 20948568, "step": 31915 }, { "epoch": 18.82075471698113, "grad_norm": 3.776893138885498, "learning_rate": 1.0563359607022372e-07, "loss": 0.3066, "num_input_tokens_seen": 20952440, "step": 31920 }, { "epoch": 18.82370283018868, "grad_norm": 3.658106565475464, "learning_rate": 1.05108206682632e-07, "loss": 0.2689, "num_input_tokens_seen": 20956280, "step": 31925 }, { "epoch": 18.826650943396228, "grad_norm": 5.080296039581299, "learning_rate": 1.0458411323873874e-07, "loss": 0.4822, "num_input_tokens_seen": 20959672, "step": 31930 }, { "epoch": 18.829599056603772, "grad_norm": 6.338685512542725, "learning_rate": 1.0406131587729962e-07, "loss": 0.3223, "num_input_tokens_seen": 20962456, "step": 31935 }, { "epoch": 18.83254716981132, "grad_norm": 3.168614387512207, "learning_rate": 1.035398147367278e-07, "loss": 0.2587, "num_input_tokens_seen": 20966744, "step": 31940 }, { "epoch": 18.83549528301887, "grad_norm": 2.3946356773376465, "learning_rate": 1.030196099550923e-07, "loss": 0.2886, "num_input_tokens_seen": 20969848, "step": 31945 }, { "epoch": 18.838443396226417, "grad_norm": 4.079901218414307, "learning_rate": 1.0250070167011905e-07, "loss": 0.3589, "num_input_tokens_seen": 20977176, "step": 31950 }, { "epoch": 18.84139150943396, "grad_norm": 2.9759256839752197, "learning_rate": 1.0198309001919315e-07, "loss": 0.2517, "num_input_tokens_seen": 20980536, "step": 31955 }, { "epoch": 18.84433962264151, "grad_norm": 3.525834083557129, "learning_rate": 1.0146677513935277e-07, "loss": 0.3138, "num_input_tokens_seen": 20983608, "step": 31960 }, { "epoch": 18.847287735849058, "grad_norm": 5.389620304107666, "learning_rate": 1.0095175716729578e-07, "loss": 0.2682, "num_input_tokens_seen": 20986232, "step": 31965 }, { "epoch": 18.850235849056602, "grad_norm": 4.5196003913879395, "learning_rate": 1.004380362393742e-07, "loss": 0.3208, "num_input_tokens_seen": 20988888, "step": 31970 }, { "epoch": 18.85318396226415, "grad_norm": 4.486591339111328, "learning_rate": 9.99256124915987e-08, "loss": 0.3166, "num_input_tokens_seen": 20993048, "step": 31975 }, { "epoch": 18.8561320754717, "grad_norm": 3.689250946044922, "learning_rate": 9.941448605963577e-08, "loss": 0.2814, "num_input_tokens_seen": 20996088, "step": 31980 }, { "epoch": 18.859080188679247, "grad_norm": 3.5612683296203613, "learning_rate": 9.890465707880715e-08, "loss": 0.2688, "num_input_tokens_seen": 20999576, "step": 31985 }, { "epoch": 18.86202830188679, "grad_norm": 6.514934062957764, "learning_rate": 9.839612568409374e-08, "loss": 0.3263, "num_input_tokens_seen": 21003128, "step": 31990 }, { "epoch": 18.86497641509434, "grad_norm": 5.5278449058532715, "learning_rate": 9.788889201013119e-08, "loss": 0.3799, "num_input_tokens_seen": 21006360, "step": 31995 }, { "epoch": 18.867924528301888, "grad_norm": 3.942974328994751, "learning_rate": 9.738295619121097e-08, "loss": 0.3312, "num_input_tokens_seen": 21009112, "step": 32000 }, { "epoch": 18.870872641509433, "grad_norm": 3.324528217315674, "learning_rate": 9.687831836128203e-08, "loss": 0.272, "num_input_tokens_seen": 21012632, "step": 32005 }, { "epoch": 18.87382075471698, "grad_norm": 4.66732120513916, "learning_rate": 9.637497865395029e-08, "loss": 0.3037, "num_input_tokens_seen": 21015320, "step": 32010 }, { "epoch": 18.87676886792453, "grad_norm": 3.113525390625, "learning_rate": 9.587293720247526e-08, "loss": 0.212, "num_input_tokens_seen": 21018776, "step": 32015 }, { "epoch": 18.879716981132077, "grad_norm": 4.721179485321045, "learning_rate": 9.537219413977672e-08, "loss": 0.3111, "num_input_tokens_seen": 21021688, "step": 32020 }, { "epoch": 18.88266509433962, "grad_norm": 4.876535415649414, "learning_rate": 9.487274959842696e-08, "loss": 0.2744, "num_input_tokens_seen": 21024440, "step": 32025 }, { "epoch": 18.88561320754717, "grad_norm": 3.8091623783111572, "learning_rate": 9.437460371065687e-08, "loss": 0.2628, "num_input_tokens_seen": 21027576, "step": 32030 }, { "epoch": 18.888561320754718, "grad_norm": 3.3275017738342285, "learning_rate": 9.387775660835263e-08, "loss": 0.341, "num_input_tokens_seen": 21031288, "step": 32035 }, { "epoch": 18.891509433962263, "grad_norm": 2.492278814315796, "learning_rate": 9.338220842305678e-08, "loss": 0.3873, "num_input_tokens_seen": 21034200, "step": 32040 }, { "epoch": 18.89445754716981, "grad_norm": 3.665848970413208, "learning_rate": 9.288795928596661e-08, "loss": 0.3289, "num_input_tokens_seen": 21037528, "step": 32045 }, { "epoch": 18.89740566037736, "grad_norm": 4.28730583190918, "learning_rate": 9.239500932793854e-08, "loss": 0.3618, "num_input_tokens_seen": 21040312, "step": 32050 }, { "epoch": 18.900353773584907, "grad_norm": 4.818135738372803, "learning_rate": 9.190335867948263e-08, "loss": 0.4087, "num_input_tokens_seen": 21043064, "step": 32055 }, { "epoch": 18.903301886792452, "grad_norm": 2.5100021362304688, "learning_rate": 9.141300747076476e-08, "loss": 0.2684, "num_input_tokens_seen": 21046072, "step": 32060 }, { "epoch": 18.90625, "grad_norm": 3.992295503616333, "learning_rate": 9.092395583160773e-08, "loss": 0.459, "num_input_tokens_seen": 21049688, "step": 32065 }, { "epoch": 18.909198113207548, "grad_norm": 3.2542216777801514, "learning_rate": 9.043620389149021e-08, "loss": 0.2962, "num_input_tokens_seen": 21052664, "step": 32070 }, { "epoch": 18.912146226415093, "grad_norm": 4.981858253479004, "learning_rate": 8.994975177954723e-08, "loss": 0.3485, "num_input_tokens_seen": 21055480, "step": 32075 }, { "epoch": 18.91509433962264, "grad_norm": 2.664935350418091, "learning_rate": 8.946459962456855e-08, "loss": 0.2372, "num_input_tokens_seen": 21059960, "step": 32080 }, { "epoch": 18.91804245283019, "grad_norm": 3.945145845413208, "learning_rate": 8.89807475550003e-08, "loss": 0.3458, "num_input_tokens_seen": 21062744, "step": 32085 }, { "epoch": 18.920990566037737, "grad_norm": 3.776557207107544, "learning_rate": 8.849819569894447e-08, "loss": 0.3032, "num_input_tokens_seen": 21066328, "step": 32090 }, { "epoch": 18.923938679245282, "grad_norm": 5.2415771484375, "learning_rate": 8.801694418415884e-08, "loss": 0.2694, "num_input_tokens_seen": 21069496, "step": 32095 }, { "epoch": 18.92688679245283, "grad_norm": 3.2540366649627686, "learning_rate": 8.753699313805708e-08, "loss": 0.2795, "num_input_tokens_seen": 21074200, "step": 32100 }, { "epoch": 18.92983490566038, "grad_norm": 2.553692102432251, "learning_rate": 8.705834268770753e-08, "loss": 0.2786, "num_input_tokens_seen": 21076856, "step": 32105 }, { "epoch": 18.932783018867923, "grad_norm": 6.243492603302002, "learning_rate": 8.65809929598349e-08, "loss": 0.457, "num_input_tokens_seen": 21079448, "step": 32110 }, { "epoch": 18.93573113207547, "grad_norm": 2.9390060901641846, "learning_rate": 8.610494408082037e-08, "loss": 0.2913, "num_input_tokens_seen": 21082072, "step": 32115 }, { "epoch": 18.93867924528302, "grad_norm": 3.5361013412475586, "learning_rate": 8.563019617669977e-08, "loss": 0.325, "num_input_tokens_seen": 21085112, "step": 32120 }, { "epoch": 18.941627358490567, "grad_norm": 4.876676559448242, "learning_rate": 8.51567493731642e-08, "loss": 0.4867, "num_input_tokens_seen": 21087800, "step": 32125 }, { "epoch": 18.944575471698112, "grad_norm": 6.583503246307373, "learning_rate": 8.468460379556176e-08, "loss": 0.2641, "num_input_tokens_seen": 21090936, "step": 32130 }, { "epoch": 18.94752358490566, "grad_norm": 4.344171524047852, "learning_rate": 8.421375956889355e-08, "loss": 0.2233, "num_input_tokens_seen": 21094360, "step": 32135 }, { "epoch": 18.95047169811321, "grad_norm": 5.045650005340576, "learning_rate": 8.374421681781819e-08, "loss": 0.3267, "num_input_tokens_seen": 21097432, "step": 32140 }, { "epoch": 18.953419811320753, "grad_norm": 2.7603282928466797, "learning_rate": 8.327597566665013e-08, "loss": 0.2224, "num_input_tokens_seen": 21099800, "step": 32145 }, { "epoch": 18.9563679245283, "grad_norm": 3.334500312805176, "learning_rate": 8.280903623935688e-08, "loss": 0.3119, "num_input_tokens_seen": 21103448, "step": 32150 }, { "epoch": 18.95931603773585, "grad_norm": 3.4896984100341797, "learning_rate": 8.234339865956342e-08, "loss": 0.295, "num_input_tokens_seen": 21106648, "step": 32155 }, { "epoch": 18.962264150943398, "grad_norm": 3.8874351978302, "learning_rate": 8.187906305054838e-08, "loss": 0.4191, "num_input_tokens_seen": 21109432, "step": 32160 }, { "epoch": 18.965212264150942, "grad_norm": 5.17053747177124, "learning_rate": 8.141602953524841e-08, "loss": 0.3591, "num_input_tokens_seen": 21112120, "step": 32165 }, { "epoch": 18.96816037735849, "grad_norm": 4.387740612030029, "learning_rate": 8.095429823625212e-08, "loss": 0.3085, "num_input_tokens_seen": 21115768, "step": 32170 }, { "epoch": 18.97110849056604, "grad_norm": 5.386492729187012, "learning_rate": 8.04938692758045e-08, "loss": 0.3974, "num_input_tokens_seen": 21119192, "step": 32175 }, { "epoch": 18.974056603773583, "grad_norm": 3.040576696395874, "learning_rate": 8.003474277580803e-08, "loss": 0.2387, "num_input_tokens_seen": 21121912, "step": 32180 }, { "epoch": 18.97700471698113, "grad_norm": 2.1517746448516846, "learning_rate": 7.95769188578166e-08, "loss": 0.3636, "num_input_tokens_seen": 21124920, "step": 32185 }, { "epoch": 18.97995283018868, "grad_norm": 2.4734086990356445, "learning_rate": 7.912039764304213e-08, "loss": 0.3295, "num_input_tokens_seen": 21129176, "step": 32190 }, { "epoch": 18.982900943396228, "grad_norm": 4.73417329788208, "learning_rate": 7.866517925235017e-08, "loss": 0.4329, "num_input_tokens_seen": 21132472, "step": 32195 }, { "epoch": 18.985849056603772, "grad_norm": 3.2540297508239746, "learning_rate": 7.821126380626154e-08, "loss": 0.2752, "num_input_tokens_seen": 21136312, "step": 32200 }, { "epoch": 18.98879716981132, "grad_norm": 5.501195907592773, "learning_rate": 7.775865142495286e-08, "loss": 0.4375, "num_input_tokens_seen": 21139224, "step": 32205 }, { "epoch": 18.99174528301887, "grad_norm": 3.3205087184906006, "learning_rate": 7.730734222825442e-08, "loss": 0.3989, "num_input_tokens_seen": 21142712, "step": 32210 }, { "epoch": 18.994693396226417, "grad_norm": 3.389864444732666, "learning_rate": 7.68573363356534e-08, "loss": 0.3215, "num_input_tokens_seen": 21146520, "step": 32215 }, { "epoch": 18.99764150943396, "grad_norm": 2.7845895290374756, "learning_rate": 7.640863386629005e-08, "loss": 0.2783, "num_input_tokens_seen": 21149048, "step": 32220 }, { "epoch": 19.00058962264151, "grad_norm": 4.374135494232178, "learning_rate": 7.59612349389599e-08, "loss": 0.3422, "num_input_tokens_seen": 21151624, "step": 32225 }, { "epoch": 19.003537735849058, "grad_norm": 4.439969062805176, "learning_rate": 7.551513967211433e-08, "loss": 0.3719, "num_input_tokens_seen": 21155240, "step": 32230 }, { "epoch": 19.006485849056602, "grad_norm": 3.2258360385894775, "learning_rate": 7.507034818385883e-08, "loss": 0.2997, "num_input_tokens_seen": 21158152, "step": 32235 }, { "epoch": 19.00943396226415, "grad_norm": 4.449168682098389, "learning_rate": 7.462686059195423e-08, "loss": 0.2424, "num_input_tokens_seen": 21161512, "step": 32240 }, { "epoch": 19.0123820754717, "grad_norm": 2.7838642597198486, "learning_rate": 7.418467701381548e-08, "loss": 0.384, "num_input_tokens_seen": 21165640, "step": 32245 }, { "epoch": 19.015330188679247, "grad_norm": 2.5062248706817627, "learning_rate": 7.374379756651285e-08, "loss": 0.2989, "num_input_tokens_seen": 21171528, "step": 32250 }, { "epoch": 19.01827830188679, "grad_norm": 2.633643388748169, "learning_rate": 7.330422236677015e-08, "loss": 0.2168, "num_input_tokens_seen": 21174056, "step": 32255 }, { "epoch": 19.02122641509434, "grad_norm": 4.493930816650391, "learning_rate": 7.286595153096765e-08, "loss": 0.4787, "num_input_tokens_seen": 21177000, "step": 32260 }, { "epoch": 19.024174528301888, "grad_norm": 4.705043792724609, "learning_rate": 7.242898517513864e-08, "loss": 0.3935, "num_input_tokens_seen": 21180872, "step": 32265 }, { "epoch": 19.027122641509433, "grad_norm": 2.940704822540283, "learning_rate": 7.199332341497333e-08, "loss": 0.2265, "num_input_tokens_seen": 21183624, "step": 32270 }, { "epoch": 19.03007075471698, "grad_norm": 5.823301792144775, "learning_rate": 7.155896636581394e-08, "loss": 0.1687, "num_input_tokens_seen": 21185864, "step": 32275 }, { "epoch": 19.03301886792453, "grad_norm": 4.154581546783447, "learning_rate": 7.112591414265901e-08, "loss": 0.3199, "num_input_tokens_seen": 21188648, "step": 32280 }, { "epoch": 19.035966981132077, "grad_norm": 3.632584571838379, "learning_rate": 7.069416686016018e-08, "loss": 0.3549, "num_input_tokens_seen": 21192072, "step": 32285 }, { "epoch": 19.03891509433962, "grad_norm": 3.1595187187194824, "learning_rate": 7.026372463262488e-08, "loss": 0.3875, "num_input_tokens_seen": 21195912, "step": 32290 }, { "epoch": 19.04186320754717, "grad_norm": 7.322221279144287, "learning_rate": 6.983458757401418e-08, "loss": 0.3136, "num_input_tokens_seen": 21199336, "step": 32295 }, { "epoch": 19.044811320754718, "grad_norm": 3.113441228866577, "learning_rate": 6.940675579794443e-08, "loss": 0.2864, "num_input_tokens_seen": 21202984, "step": 32300 }, { "epoch": 19.047759433962263, "grad_norm": 4.883604526519775, "learning_rate": 6.898022941768612e-08, "loss": 0.3591, "num_input_tokens_seen": 21206120, "step": 32305 }, { "epoch": 19.05070754716981, "grad_norm": 1.608805775642395, "learning_rate": 6.855500854616337e-08, "loss": 0.2191, "num_input_tokens_seen": 21210696, "step": 32310 }, { "epoch": 19.05365566037736, "grad_norm": 2.623196601867676, "learning_rate": 6.813109329595557e-08, "loss": 0.2669, "num_input_tokens_seen": 21215560, "step": 32315 }, { "epoch": 19.056603773584907, "grad_norm": 2.8250393867492676, "learning_rate": 6.770848377929573e-08, "loss": 0.2773, "num_input_tokens_seen": 21219240, "step": 32320 }, { "epoch": 19.059551886792452, "grad_norm": 2.959465265274048, "learning_rate": 6.728718010807156e-08, "loss": 0.4175, "num_input_tokens_seen": 21223016, "step": 32325 }, { "epoch": 19.0625, "grad_norm": 2.7041170597076416, "learning_rate": 6.68671823938255e-08, "loss": 0.2488, "num_input_tokens_seen": 21226824, "step": 32330 }, { "epoch": 19.065448113207548, "grad_norm": 5.7009100914001465, "learning_rate": 6.644849074775361e-08, "loss": 0.2954, "num_input_tokens_seen": 21230344, "step": 32335 }, { "epoch": 19.068396226415093, "grad_norm": 2.9571454524993896, "learning_rate": 6.603110528070667e-08, "loss": 0.3621, "num_input_tokens_seen": 21233672, "step": 32340 }, { "epoch": 19.07134433962264, "grad_norm": 8.681792259216309, "learning_rate": 6.561502610318849e-08, "loss": 0.3444, "num_input_tokens_seen": 21237416, "step": 32345 }, { "epoch": 19.07429245283019, "grad_norm": 5.104862689971924, "learning_rate": 6.520025332535762e-08, "loss": 0.4166, "num_input_tokens_seen": 21240456, "step": 32350 }, { "epoch": 19.077240566037737, "grad_norm": 3.8823328018188477, "learning_rate": 6.47867870570279e-08, "loss": 0.3531, "num_input_tokens_seen": 21243848, "step": 32355 }, { "epoch": 19.080188679245282, "grad_norm": 3.227219820022583, "learning_rate": 6.437462740766564e-08, "loss": 0.4322, "num_input_tokens_seen": 21246696, "step": 32360 }, { "epoch": 19.08313679245283, "grad_norm": 4.166708946228027, "learning_rate": 6.396377448639246e-08, "loss": 0.259, "num_input_tokens_seen": 21250280, "step": 32365 }, { "epoch": 19.08608490566038, "grad_norm": 3.780578374862671, "learning_rate": 6.3554228401983e-08, "loss": 0.2916, "num_input_tokens_seen": 21254440, "step": 32370 }, { "epoch": 19.089033018867923, "grad_norm": 4.621339797973633, "learning_rate": 6.314598926286663e-08, "loss": 0.2681, "num_input_tokens_seen": 21257256, "step": 32375 }, { "epoch": 19.09198113207547, "grad_norm": 3.8629376888275146, "learning_rate": 6.273905717712637e-08, "loss": 0.3517, "num_input_tokens_seen": 21260616, "step": 32380 }, { "epoch": 19.09492924528302, "grad_norm": 3.7364394664764404, "learning_rate": 6.233343225249933e-08, "loss": 0.3407, "num_input_tokens_seen": 21263752, "step": 32385 }, { "epoch": 19.097877358490567, "grad_norm": 2.954512119293213, "learning_rate": 6.192911459637519e-08, "loss": 0.2939, "num_input_tokens_seen": 21267304, "step": 32390 }, { "epoch": 19.100825471698112, "grad_norm": 6.296505928039551, "learning_rate": 6.152610431580052e-08, "loss": 0.2896, "num_input_tokens_seen": 21269928, "step": 32395 }, { "epoch": 19.10377358490566, "grad_norm": 2.968703508377075, "learning_rate": 6.112440151747389e-08, "loss": 0.331, "num_input_tokens_seen": 21273224, "step": 32400 }, { "epoch": 19.10672169811321, "grad_norm": 3.4624440670013428, "learning_rate": 6.072400630774689e-08, "loss": 0.4431, "num_input_tokens_seen": 21276040, "step": 32405 }, { "epoch": 19.109669811320753, "grad_norm": 4.9036478996276855, "learning_rate": 6.032491879262637e-08, "loss": 0.4719, "num_input_tokens_seen": 21279144, "step": 32410 }, { "epoch": 19.1126179245283, "grad_norm": 3.0820469856262207, "learning_rate": 5.99271390777717e-08, "loss": 0.2675, "num_input_tokens_seen": 21282184, "step": 32415 }, { "epoch": 19.11556603773585, "grad_norm": 5.621577262878418, "learning_rate": 5.953066726849865e-08, "loss": 0.3507, "num_input_tokens_seen": 21285864, "step": 32420 }, { "epoch": 19.118514150943398, "grad_norm": 3.904658079147339, "learning_rate": 5.913550346977326e-08, "loss": 0.3761, "num_input_tokens_seen": 21288840, "step": 32425 }, { "epoch": 19.121462264150942, "grad_norm": 4.834802150726318, "learning_rate": 5.874164778621683e-08, "loss": 0.3324, "num_input_tokens_seen": 21291688, "step": 32430 }, { "epoch": 19.12441037735849, "grad_norm": 3.577867269515991, "learning_rate": 5.834910032210539e-08, "loss": 0.4738, "num_input_tokens_seen": 21295272, "step": 32435 }, { "epoch": 19.12735849056604, "grad_norm": 3.1119890213012695, "learning_rate": 5.795786118136693e-08, "loss": 0.2919, "num_input_tokens_seen": 21298696, "step": 32440 }, { "epoch": 19.130306603773583, "grad_norm": 4.912132740020752, "learning_rate": 5.756793046758302e-08, "loss": 0.3451, "num_input_tokens_seen": 21301320, "step": 32445 }, { "epoch": 19.13325471698113, "grad_norm": 3.479008197784424, "learning_rate": 5.7179308283990544e-08, "loss": 0.4686, "num_input_tokens_seen": 21304200, "step": 32450 }, { "epoch": 19.13620283018868, "grad_norm": 2.2334558963775635, "learning_rate": 5.679199473347885e-08, "loss": 0.3628, "num_input_tokens_seen": 21308328, "step": 32455 }, { "epoch": 19.139150943396228, "grad_norm": 4.768280029296875, "learning_rate": 5.6405989918590366e-08, "loss": 0.2508, "num_input_tokens_seen": 21311336, "step": 32460 }, { "epoch": 19.142099056603772, "grad_norm": 3.698974609375, "learning_rate": 5.6021293941522225e-08, "loss": 0.3402, "num_input_tokens_seen": 21314600, "step": 32465 }, { "epoch": 19.14504716981132, "grad_norm": 3.111863851547241, "learning_rate": 5.563790690412352e-08, "loss": 0.2323, "num_input_tokens_seen": 21317800, "step": 32470 }, { "epoch": 19.14799528301887, "grad_norm": 6.835299491882324, "learning_rate": 5.525582890789805e-08, "loss": 0.2752, "num_input_tokens_seen": 21320936, "step": 32475 }, { "epoch": 19.150943396226417, "grad_norm": 4.998710632324219, "learning_rate": 5.4875060054002115e-08, "loss": 0.3061, "num_input_tokens_seen": 21323944, "step": 32480 }, { "epoch": 19.15389150943396, "grad_norm": 3.088934898376465, "learning_rate": 5.4495600443246755e-08, "loss": 0.3069, "num_input_tokens_seen": 21327208, "step": 32485 }, { "epoch": 19.15683962264151, "grad_norm": 3.8236732482910156, "learning_rate": 5.411745017609493e-08, "loss": 0.3647, "num_input_tokens_seen": 21331496, "step": 32490 }, { "epoch": 19.159787735849058, "grad_norm": 3.571381092071533, "learning_rate": 5.374060935266434e-08, "loss": 0.2946, "num_input_tokens_seen": 21334344, "step": 32495 }, { "epoch": 19.162735849056602, "grad_norm": 4.756137847900391, "learning_rate": 5.3365078072724065e-08, "loss": 0.363, "num_input_tokens_seen": 21337128, "step": 32500 }, { "epoch": 19.16568396226415, "grad_norm": 2.998162031173706, "learning_rate": 5.299085643569846e-08, "loss": 0.2969, "num_input_tokens_seen": 21340520, "step": 32505 }, { "epoch": 19.1686320754717, "grad_norm": 3.872178792953491, "learning_rate": 5.261794454066327e-08, "loss": 0.3877, "num_input_tokens_seen": 21344168, "step": 32510 }, { "epoch": 19.171580188679247, "grad_norm": 4.16678524017334, "learning_rate": 5.224634248635008e-08, "loss": 0.3697, "num_input_tokens_seen": 21347368, "step": 32515 }, { "epoch": 19.17452830188679, "grad_norm": 4.02678108215332, "learning_rate": 5.187605037114129e-08, "loss": 0.253, "num_input_tokens_seen": 21350728, "step": 32520 }, { "epoch": 19.17747641509434, "grad_norm": 3.2296416759490967, "learning_rate": 5.15070682930735e-08, "loss": 0.2986, "num_input_tokens_seen": 21353800, "step": 32525 }, { "epoch": 19.180424528301888, "grad_norm": 3.7112722396850586, "learning_rate": 5.113939634983578e-08, "loss": 0.2999, "num_input_tokens_seen": 21357128, "step": 32530 }, { "epoch": 19.183372641509433, "grad_norm": 3.076540946960449, "learning_rate": 5.077303463877192e-08, "loss": 0.339, "num_input_tokens_seen": 21360488, "step": 32535 }, { "epoch": 19.18632075471698, "grad_norm": 5.428586006164551, "learning_rate": 5.040798325687601e-08, "loss": 0.2963, "num_input_tokens_seen": 21364328, "step": 32540 }, { "epoch": 19.18926886792453, "grad_norm": 5.111176013946533, "learning_rate": 5.004424230079852e-08, "loss": 0.343, "num_input_tokens_seen": 21367432, "step": 32545 }, { "epoch": 19.192216981132077, "grad_norm": 4.3155999183654785, "learning_rate": 4.968181186684129e-08, "loss": 0.2961, "num_input_tokens_seen": 21369832, "step": 32550 }, { "epoch": 19.19516509433962, "grad_norm": 3.491464614868164, "learning_rate": 4.932069205095924e-08, "loss": 0.3667, "num_input_tokens_seen": 21373224, "step": 32555 }, { "epoch": 19.19811320754717, "grad_norm": 5.916723728179932, "learning_rate": 4.896088294875978e-08, "loss": 0.3726, "num_input_tokens_seen": 21375656, "step": 32560 }, { "epoch": 19.201061320754718, "grad_norm": 5.592641353607178, "learning_rate": 4.8602384655505044e-08, "loss": 0.2982, "num_input_tokens_seen": 21378056, "step": 32565 }, { "epoch": 19.204009433962263, "grad_norm": 4.35010290145874, "learning_rate": 4.824519726610744e-08, "loss": 0.353, "num_input_tokens_seen": 21381096, "step": 32570 }, { "epoch": 19.20695754716981, "grad_norm": 4.315145969390869, "learning_rate": 4.7889320875135206e-08, "loss": 0.2246, "num_input_tokens_seen": 21384136, "step": 32575 }, { "epoch": 19.20990566037736, "grad_norm": 6.974118709564209, "learning_rate": 4.753475557680742e-08, "loss": 0.178, "num_input_tokens_seen": 21386600, "step": 32580 }, { "epoch": 19.212853773584907, "grad_norm": 4.218348979949951, "learning_rate": 4.718150146499734e-08, "loss": 0.3356, "num_input_tokens_seen": 21390600, "step": 32585 }, { "epoch": 19.215801886792452, "grad_norm": 3.2819619178771973, "learning_rate": 4.682955863323013e-08, "loss": 0.2811, "num_input_tokens_seen": 21393544, "step": 32590 }, { "epoch": 19.21875, "grad_norm": 5.004150867462158, "learning_rate": 4.6478927174684606e-08, "loss": 0.4506, "num_input_tokens_seen": 21396424, "step": 32595 }, { "epoch": 19.221698113207548, "grad_norm": 6.629669666290283, "learning_rate": 4.612960718219095e-08, "loss": 0.3399, "num_input_tokens_seen": 21400360, "step": 32600 }, { "epoch": 19.224646226415093, "grad_norm": 3.1561810970306396, "learning_rate": 4.578159874823407e-08, "loss": 0.3412, "num_input_tokens_seen": 21403976, "step": 32605 }, { "epoch": 19.22759433962264, "grad_norm": 5.264781475067139, "learning_rate": 4.5434901964950264e-08, "loss": 0.3594, "num_input_tokens_seen": 21407400, "step": 32610 }, { "epoch": 19.23054245283019, "grad_norm": 4.452872276306152, "learning_rate": 4.508951692412944e-08, "loss": 0.3105, "num_input_tokens_seen": 21410664, "step": 32615 }, { "epoch": 19.233490566037737, "grad_norm": 2.1512725353240967, "learning_rate": 4.4745443717213455e-08, "loss": 0.2606, "num_input_tokens_seen": 21413608, "step": 32620 }, { "epoch": 19.236438679245282, "grad_norm": 2.984400749206543, "learning_rate": 4.4402682435296666e-08, "loss": 0.3078, "num_input_tokens_seen": 21416136, "step": 32625 }, { "epoch": 19.23938679245283, "grad_norm": 3.1178271770477295, "learning_rate": 4.406123316912758e-08, "loss": 0.4396, "num_input_tokens_seen": 21420264, "step": 32630 }, { "epoch": 19.24233490566038, "grad_norm": 2.915034532546997, "learning_rate": 4.372109600910612e-08, "loss": 0.184, "num_input_tokens_seen": 21423432, "step": 32635 }, { "epoch": 19.245283018867923, "grad_norm": 3.127793073654175, "learning_rate": 4.338227104528414e-08, "loss": 0.3517, "num_input_tokens_seen": 21426504, "step": 32640 }, { "epoch": 19.24823113207547, "grad_norm": 3.316561698913574, "learning_rate": 4.304475836736821e-08, "loss": 0.3193, "num_input_tokens_seen": 21428744, "step": 32645 }, { "epoch": 19.25117924528302, "grad_norm": 3.134742259979248, "learning_rate": 4.27085580647163e-08, "loss": 0.3676, "num_input_tokens_seen": 21431528, "step": 32650 }, { "epoch": 19.254127358490567, "grad_norm": 9.052151679992676, "learning_rate": 4.237367022633776e-08, "loss": 0.416, "num_input_tokens_seen": 21435688, "step": 32655 }, { "epoch": 19.257075471698112, "grad_norm": 3.252088785171509, "learning_rate": 4.204009494089612e-08, "loss": 0.3294, "num_input_tokens_seen": 21439400, "step": 32660 }, { "epoch": 19.26002358490566, "grad_norm": 5.862829208374023, "learning_rate": 4.170783229670739e-08, "loss": 0.3, "num_input_tokens_seen": 21443208, "step": 32665 }, { "epoch": 19.26297169811321, "grad_norm": 4.514962673187256, "learning_rate": 4.137688238173898e-08, "loss": 0.3019, "num_input_tokens_seen": 21446344, "step": 32670 }, { "epoch": 19.265919811320753, "grad_norm": 5.952615737915039, "learning_rate": 4.104724528361137e-08, "loss": 0.4402, "num_input_tokens_seen": 21450760, "step": 32675 }, { "epoch": 19.2688679245283, "grad_norm": 3.3912675380706787, "learning_rate": 4.071892108959752e-08, "loss": 0.2112, "num_input_tokens_seen": 21454408, "step": 32680 }, { "epoch": 19.27181603773585, "grad_norm": 4.002889156341553, "learning_rate": 4.039190988662234e-08, "loss": 0.2715, "num_input_tokens_seen": 21458376, "step": 32685 }, { "epoch": 19.274764150943398, "grad_norm": 3.299323081970215, "learning_rate": 4.006621176126435e-08, "loss": 0.3035, "num_input_tokens_seen": 21461864, "step": 32690 }, { "epoch": 19.277712264150942, "grad_norm": 2.87632417678833, "learning_rate": 3.974182679975236e-08, "loss": 0.4039, "num_input_tokens_seen": 21465064, "step": 32695 }, { "epoch": 19.28066037735849, "grad_norm": 3.2904512882232666, "learning_rate": 3.941875508796933e-08, "loss": 0.3072, "num_input_tokens_seen": 21467784, "step": 32700 }, { "epoch": 19.28360849056604, "grad_norm": 4.994115352630615, "learning_rate": 3.909699671145017e-08, "loss": 0.3044, "num_input_tokens_seen": 21471496, "step": 32705 }, { "epoch": 19.286556603773583, "grad_norm": 4.666810989379883, "learning_rate": 3.87765517553812e-08, "loss": 0.4788, "num_input_tokens_seen": 21474696, "step": 32710 }, { "epoch": 19.28950471698113, "grad_norm": 2.4772963523864746, "learning_rate": 3.8457420304601756e-08, "loss": 0.2864, "num_input_tokens_seen": 21478408, "step": 32715 }, { "epoch": 19.29245283018868, "grad_norm": 4.777077674865723, "learning_rate": 3.813960244360371e-08, "loss": 0.3189, "num_input_tokens_seen": 21481768, "step": 32720 }, { "epoch": 19.295400943396228, "grad_norm": 4.021971225738525, "learning_rate": 3.7823098256529744e-08, "loss": 0.405, "num_input_tokens_seen": 21485800, "step": 32725 }, { "epoch": 19.298349056603772, "grad_norm": 3.104027271270752, "learning_rate": 3.750790782717673e-08, "loss": 0.3084, "num_input_tokens_seen": 21489576, "step": 32730 }, { "epoch": 19.30129716981132, "grad_norm": 4.4660563468933105, "learning_rate": 3.719403123899179e-08, "loss": 0.3366, "num_input_tokens_seen": 21492744, "step": 32735 }, { "epoch": 19.30424528301887, "grad_norm": 3.3197176456451416, "learning_rate": 3.688146857507624e-08, "loss": 0.3886, "num_input_tokens_seen": 21495944, "step": 32740 }, { "epoch": 19.307193396226417, "grad_norm": 3.372048854827881, "learning_rate": 3.657021991818166e-08, "loss": 0.3286, "num_input_tokens_seen": 21499336, "step": 32745 }, { "epoch": 19.31014150943396, "grad_norm": 3.0466196537017822, "learning_rate": 3.626028535071213e-08, "loss": 0.2687, "num_input_tokens_seen": 21502152, "step": 32750 }, { "epoch": 19.31308962264151, "grad_norm": 3.743074417114258, "learning_rate": 3.59516649547248e-08, "loss": 0.3194, "num_input_tokens_seen": 21505800, "step": 32755 }, { "epoch": 19.316037735849058, "grad_norm": 4.6738409996032715, "learning_rate": 3.564435881192818e-08, "loss": 0.3407, "num_input_tokens_seen": 21509192, "step": 32760 }, { "epoch": 19.318985849056602, "grad_norm": 4.75137186050415, "learning_rate": 3.5338367003682763e-08, "loss": 0.4416, "num_input_tokens_seen": 21512360, "step": 32765 }, { "epoch": 19.32193396226415, "grad_norm": 4.395416259765625, "learning_rate": 3.5033689611000954e-08, "loss": 0.3387, "num_input_tokens_seen": 21515912, "step": 32770 }, { "epoch": 19.3248820754717, "grad_norm": 2.6544551849365234, "learning_rate": 3.473032671454768e-08, "loss": 0.3029, "num_input_tokens_seen": 21519400, "step": 32775 }, { "epoch": 19.327830188679247, "grad_norm": 7.6113128662109375, "learning_rate": 3.44282783946398e-08, "loss": 0.3538, "num_input_tokens_seen": 21522056, "step": 32780 }, { "epoch": 19.33077830188679, "grad_norm": 3.6528160572052, "learning_rate": 3.4127544731245575e-08, "loss": 0.3846, "num_input_tokens_seen": 21524840, "step": 32785 }, { "epoch": 19.33372641509434, "grad_norm": 4.468834400177002, "learning_rate": 3.38281258039852e-08, "loss": 0.3255, "num_input_tokens_seen": 21528424, "step": 32790 }, { "epoch": 19.336674528301888, "grad_norm": 4.076722621917725, "learning_rate": 3.353002169213193e-08, "loss": 0.3433, "num_input_tokens_seen": 21535016, "step": 32795 }, { "epoch": 19.339622641509433, "grad_norm": 3.2967846393585205, "learning_rate": 3.32332324746093e-08, "loss": 0.3262, "num_input_tokens_seen": 21537960, "step": 32800 }, { "epoch": 19.34257075471698, "grad_norm": 2.814846992492676, "learning_rate": 3.2937758229994455e-08, "loss": 0.3587, "num_input_tokens_seen": 21541384, "step": 32805 }, { "epoch": 19.34551886792453, "grad_norm": 3.5950825214385986, "learning_rate": 3.2643599036514815e-08, "loss": 0.3198, "num_input_tokens_seen": 21543816, "step": 32810 }, { "epoch": 19.348466981132077, "grad_norm": 5.959361553192139, "learning_rate": 3.2350754972050316e-08, "loss": 0.3419, "num_input_tokens_seen": 21547336, "step": 32815 }, { "epoch": 19.35141509433962, "grad_norm": 6.700908184051514, "learning_rate": 3.2059226114132815e-08, "loss": 0.2994, "num_input_tokens_seen": 21550120, "step": 32820 }, { "epoch": 19.35436320754717, "grad_norm": 3.477038860321045, "learning_rate": 3.1769012539945575e-08, "loss": 0.4356, "num_input_tokens_seen": 21553640, "step": 32825 }, { "epoch": 19.357311320754718, "grad_norm": 2.7023372650146484, "learning_rate": 3.1480114326324364e-08, "loss": 0.3121, "num_input_tokens_seen": 21557384, "step": 32830 }, { "epoch": 19.360259433962263, "grad_norm": 3.24471378326416, "learning_rate": 3.1192531549756325e-08, "loss": 0.3037, "num_input_tokens_seen": 21560040, "step": 32835 }, { "epoch": 19.36320754716981, "grad_norm": 5.131308078765869, "learning_rate": 3.090626428638e-08, "loss": 0.2944, "num_input_tokens_seen": 21562888, "step": 32840 }, { "epoch": 19.36615566037736, "grad_norm": 3.8714561462402344, "learning_rate": 3.062131261198531e-08, "loss": 0.333, "num_input_tokens_seen": 21565608, "step": 32845 }, { "epoch": 19.369103773584907, "grad_norm": 6.259768486022949, "learning_rate": 3.033767660201525e-08, "loss": 0.287, "num_input_tokens_seen": 21568072, "step": 32850 }, { "epoch": 19.372051886792452, "grad_norm": 3.4825551509857178, "learning_rate": 3.005535633156309e-08, "loss": 0.2344, "num_input_tokens_seen": 21573832, "step": 32855 }, { "epoch": 19.375, "grad_norm": 3.2975707054138184, "learning_rate": 2.977435187537514e-08, "loss": 0.2919, "num_input_tokens_seen": 21576584, "step": 32860 }, { "epoch": 19.377948113207548, "grad_norm": 4.16794490814209, "learning_rate": 2.949466330784745e-08, "loss": 0.4552, "num_input_tokens_seen": 21579720, "step": 32865 }, { "epoch": 19.380896226415093, "grad_norm": 3.8494374752044678, "learning_rate": 2.921629070302967e-08, "loss": 0.4118, "num_input_tokens_seen": 21582600, "step": 32870 }, { "epoch": 19.38384433962264, "grad_norm": 4.632498741149902, "learning_rate": 2.893923413462174e-08, "loss": 0.3804, "num_input_tokens_seen": 21585512, "step": 32875 }, { "epoch": 19.38679245283019, "grad_norm": 3.388049602508545, "learning_rate": 2.866349367597554e-08, "loss": 0.3468, "num_input_tokens_seen": 21588840, "step": 32880 }, { "epoch": 19.389740566037737, "grad_norm": 4.461574554443359, "learning_rate": 2.8389069400094893e-08, "loss": 0.2133, "num_input_tokens_seen": 21591080, "step": 32885 }, { "epoch": 19.392688679245282, "grad_norm": 3.4388105869293213, "learning_rate": 2.811596137963446e-08, "loss": 0.4341, "num_input_tokens_seen": 21594408, "step": 32890 }, { "epoch": 19.39563679245283, "grad_norm": 5.650057315826416, "learning_rate": 2.7844169686900844e-08, "loss": 0.2496, "num_input_tokens_seen": 21598248, "step": 32895 }, { "epoch": 19.39858490566038, "grad_norm": 4.353342533111572, "learning_rate": 2.7573694393852047e-08, "loss": 0.2318, "num_input_tokens_seen": 21601544, "step": 32900 }, { "epoch": 19.401533018867923, "grad_norm": 3.0688366889953613, "learning_rate": 2.7304535572098e-08, "loss": 0.3219, "num_input_tokens_seen": 21604360, "step": 32905 }, { "epoch": 19.40448113207547, "grad_norm": 4.178114891052246, "learning_rate": 2.703669329289893e-08, "loss": 0.2648, "num_input_tokens_seen": 21607880, "step": 32910 }, { "epoch": 19.40742924528302, "grad_norm": 9.126350402832031, "learning_rate": 2.6770167627167554e-08, "loss": 0.3146, "num_input_tokens_seen": 21610664, "step": 32915 }, { "epoch": 19.410377358490567, "grad_norm": 7.142443656921387, "learning_rate": 2.6504958645467426e-08, "loss": 0.3364, "num_input_tokens_seen": 21614536, "step": 32920 }, { "epoch": 19.413325471698112, "grad_norm": 4.203126430511475, "learning_rate": 2.6241066418014605e-08, "loss": 0.3877, "num_input_tokens_seen": 21617160, "step": 32925 }, { "epoch": 19.41627358490566, "grad_norm": 3.0384292602539062, "learning_rate": 2.5978491014674866e-08, "loss": 0.2796, "num_input_tokens_seen": 21620264, "step": 32930 }, { "epoch": 19.41922169811321, "grad_norm": 3.2507667541503906, "learning_rate": 2.571723250496705e-08, "loss": 0.3994, "num_input_tokens_seen": 21623336, "step": 32935 }, { "epoch": 19.422169811320753, "grad_norm": 4.702920913696289, "learning_rate": 2.5457290958059155e-08, "loss": 0.3632, "num_input_tokens_seen": 21626152, "step": 32940 }, { "epoch": 19.4251179245283, "grad_norm": 4.190332889556885, "learning_rate": 2.51986664427728e-08, "loss": 0.2409, "num_input_tokens_seen": 21628776, "step": 32945 }, { "epoch": 19.42806603773585, "grad_norm": 2.4774692058563232, "learning_rate": 2.4941359027579883e-08, "loss": 0.2823, "num_input_tokens_seen": 21632552, "step": 32950 }, { "epoch": 19.431014150943398, "grad_norm": 2.26975417137146, "learning_rate": 2.468536878060368e-08, "loss": 0.3359, "num_input_tokens_seen": 21634920, "step": 32955 }, { "epoch": 19.433962264150942, "grad_norm": 4.402746200561523, "learning_rate": 2.443069576961832e-08, "loss": 0.3548, "num_input_tokens_seen": 21637256, "step": 32960 }, { "epoch": 19.43691037735849, "grad_norm": 2.656428337097168, "learning_rate": 2.4177340062049304e-08, "loss": 0.2696, "num_input_tokens_seen": 21641256, "step": 32965 }, { "epoch": 19.43985849056604, "grad_norm": 5.129738807678223, "learning_rate": 2.3925301724974647e-08, "loss": 0.3706, "num_input_tokens_seen": 21645608, "step": 32970 }, { "epoch": 19.442806603773583, "grad_norm": 5.338975429534912, "learning_rate": 2.367458082512153e-08, "loss": 0.2476, "num_input_tokens_seen": 21648840, "step": 32975 }, { "epoch": 19.44575471698113, "grad_norm": 2.058675765991211, "learning_rate": 2.3425177428870737e-08, "loss": 0.2468, "num_input_tokens_seen": 21651976, "step": 32980 }, { "epoch": 19.44870283018868, "grad_norm": 4.473964691162109, "learning_rate": 2.3177091602251677e-08, "loss": 0.3264, "num_input_tokens_seen": 21654600, "step": 32985 }, { "epoch": 19.451650943396228, "grad_norm": 4.656971454620361, "learning_rate": 2.2930323410946254e-08, "loss": 0.3678, "num_input_tokens_seen": 21657320, "step": 32990 }, { "epoch": 19.454599056603772, "grad_norm": 5.3582444190979, "learning_rate": 2.2684872920287758e-08, "loss": 0.2574, "num_input_tokens_seen": 21660232, "step": 32995 }, { "epoch": 19.45754716981132, "grad_norm": 4.506361961364746, "learning_rate": 2.2440740195260323e-08, "loss": 0.2804, "num_input_tokens_seen": 21663272, "step": 33000 }, { "epoch": 19.46049528301887, "grad_norm": 5.362870216369629, "learning_rate": 2.219792530049891e-08, "loss": 0.3068, "num_input_tokens_seen": 21666152, "step": 33005 }, { "epoch": 19.463443396226417, "grad_norm": 2.1794512271881104, "learning_rate": 2.1956428300290434e-08, "loss": 0.3541, "num_input_tokens_seen": 21669480, "step": 33010 }, { "epoch": 19.46639150943396, "grad_norm": 6.9840545654296875, "learning_rate": 2.1716249258570966e-08, "loss": 0.3795, "num_input_tokens_seen": 21673512, "step": 33015 }, { "epoch": 19.46933962264151, "grad_norm": 4.7646050453186035, "learning_rate": 2.1477388238930196e-08, "loss": 0.371, "num_input_tokens_seen": 21676456, "step": 33020 }, { "epoch": 19.472287735849058, "grad_norm": 5.635921955108643, "learning_rate": 2.1239845304606988e-08, "loss": 0.3352, "num_input_tokens_seen": 21679592, "step": 33025 }, { "epoch": 19.475235849056602, "grad_norm": 5.876254558563232, "learning_rate": 2.100362051849214e-08, "loss": 0.3378, "num_input_tokens_seen": 21682440, "step": 33030 }, { "epoch": 19.47818396226415, "grad_norm": 4.375807285308838, "learning_rate": 2.076871394312674e-08, "loss": 0.3247, "num_input_tokens_seen": 21686216, "step": 33035 }, { "epoch": 19.4811320754717, "grad_norm": 4.57375431060791, "learning_rate": 2.0535125640703813e-08, "loss": 0.2874, "num_input_tokens_seen": 21690408, "step": 33040 }, { "epoch": 19.484080188679247, "grad_norm": 2.631700038909912, "learning_rate": 2.0302855673066667e-08, "loss": 0.2806, "num_input_tokens_seen": 21693320, "step": 33045 }, { "epoch": 19.48702830188679, "grad_norm": 6.3515424728393555, "learning_rate": 2.0071904101710004e-08, "loss": 0.2439, "num_input_tokens_seen": 21696392, "step": 33050 }, { "epoch": 19.48997641509434, "grad_norm": 3.775470018386841, "learning_rate": 1.98422709877788e-08, "loss": 0.3142, "num_input_tokens_seen": 21700232, "step": 33055 }, { "epoch": 19.492924528301888, "grad_norm": 4.391932487487793, "learning_rate": 1.961395639206942e-08, "loss": 0.3358, "num_input_tokens_seen": 21703368, "step": 33060 }, { "epoch": 19.495872641509433, "grad_norm": 4.238579273223877, "learning_rate": 1.9386960375029628e-08, "loss": 0.2903, "num_input_tokens_seen": 21706376, "step": 33065 }, { "epoch": 19.49882075471698, "grad_norm": 4.145432949066162, "learning_rate": 1.9161282996757458e-08, "loss": 0.2829, "num_input_tokens_seen": 21709576, "step": 33070 }, { "epoch": 19.50176886792453, "grad_norm": 4.542132377624512, "learning_rate": 1.8936924317001225e-08, "loss": 0.2596, "num_input_tokens_seen": 21713480, "step": 33075 }, { "epoch": 19.504716981132077, "grad_norm": 7.937570095062256, "learning_rate": 1.8713884395162308e-08, "loss": 0.2198, "num_input_tokens_seen": 21716360, "step": 33080 }, { "epoch": 19.50766509433962, "grad_norm": 3.5890636444091797, "learning_rate": 1.8492163290290132e-08, "loss": 0.2886, "num_input_tokens_seen": 21719976, "step": 33085 }, { "epoch": 19.51061320754717, "grad_norm": 3.018160104751587, "learning_rate": 1.827176106108719e-08, "loss": 0.3005, "num_input_tokens_seen": 21723080, "step": 33090 }, { "epoch": 19.513561320754718, "grad_norm": 2.6946113109588623, "learning_rate": 1.8052677765905137e-08, "loss": 0.2514, "num_input_tokens_seen": 21725480, "step": 33095 }, { "epoch": 19.516509433962263, "grad_norm": 2.309978485107422, "learning_rate": 1.783491346274757e-08, "loss": 0.2713, "num_input_tokens_seen": 21728872, "step": 33100 }, { "epoch": 19.51945754716981, "grad_norm": 4.239889621734619, "learning_rate": 1.7618468209268936e-08, "loss": 0.4062, "num_input_tokens_seen": 21732520, "step": 33105 }, { "epoch": 19.52240566037736, "grad_norm": 3.861268997192383, "learning_rate": 1.7403342062773943e-08, "loss": 0.3686, "num_input_tokens_seen": 21736328, "step": 33110 }, { "epoch": 19.525353773584907, "grad_norm": 3.1952619552612305, "learning_rate": 1.718953508021759e-08, "loss": 0.2416, "num_input_tokens_seen": 21739336, "step": 33115 }, { "epoch": 19.528301886792452, "grad_norm": 2.951690435409546, "learning_rate": 1.6977047318206262e-08, "loss": 0.2496, "num_input_tokens_seen": 21742600, "step": 33120 }, { "epoch": 19.53125, "grad_norm": 3.8892438411712646, "learning_rate": 1.676587883299774e-08, "loss": 0.2672, "num_input_tokens_seen": 21745704, "step": 33125 }, { "epoch": 19.534198113207548, "grad_norm": 2.564662218093872, "learning_rate": 1.655602968049952e-08, "loss": 0.4406, "num_input_tokens_seen": 21748808, "step": 33130 }, { "epoch": 19.537146226415093, "grad_norm": 3.6241374015808105, "learning_rate": 1.634749991626938e-08, "loss": 0.2141, "num_input_tokens_seen": 21752232, "step": 33135 }, { "epoch": 19.54009433962264, "grad_norm": 4.1000447273254395, "learning_rate": 1.6140289595517056e-08, "loss": 0.3902, "num_input_tokens_seen": 21756264, "step": 33140 }, { "epoch": 19.54304245283019, "grad_norm": 3.5851922035217285, "learning_rate": 1.5934398773102545e-08, "loss": 0.3049, "num_input_tokens_seen": 21759784, "step": 33145 }, { "epoch": 19.545990566037737, "grad_norm": 4.849981784820557, "learning_rate": 1.5729827503536133e-08, "loss": 0.2638, "num_input_tokens_seen": 21763496, "step": 33150 }, { "epoch": 19.548938679245282, "grad_norm": 4.897857189178467, "learning_rate": 1.5526575840978942e-08, "loss": 0.3362, "num_input_tokens_seen": 21766984, "step": 33155 }, { "epoch": 19.55188679245283, "grad_norm": 2.6421396732330322, "learning_rate": 1.532464383924237e-08, "loss": 0.5042, "num_input_tokens_seen": 21770216, "step": 33160 }, { "epoch": 19.55483490566038, "grad_norm": 5.477406024932861, "learning_rate": 1.5124031551789208e-08, "loss": 0.3162, "num_input_tokens_seen": 21772744, "step": 33165 }, { "epoch": 19.557783018867923, "grad_norm": 3.1904256343841553, "learning_rate": 1.4924739031732527e-08, "loss": 0.3863, "num_input_tokens_seen": 21776232, "step": 33170 }, { "epoch": 19.56073113207547, "grad_norm": 3.130096197128296, "learning_rate": 1.4726766331835118e-08, "loss": 0.2565, "num_input_tokens_seen": 21779176, "step": 33175 }, { "epoch": 19.56367924528302, "grad_norm": 2.917417287826538, "learning_rate": 1.4530113504512278e-08, "loss": 0.282, "num_input_tokens_seen": 21783112, "step": 33180 }, { "epoch": 19.566627358490567, "grad_norm": 3.6074025630950928, "learning_rate": 1.4334780601827914e-08, "loss": 0.2748, "num_input_tokens_seen": 21786376, "step": 33185 }, { "epoch": 19.569575471698112, "grad_norm": 4.533635139465332, "learning_rate": 1.4140767675497325e-08, "loss": 0.3154, "num_input_tokens_seen": 21789128, "step": 33190 }, { "epoch": 19.57252358490566, "grad_norm": 3.555637836456299, "learning_rate": 1.394807477688609e-08, "loss": 0.2834, "num_input_tokens_seen": 21791464, "step": 33195 }, { "epoch": 19.57547169811321, "grad_norm": 6.472837924957275, "learning_rate": 1.3756701957011177e-08, "loss": 0.2097, "num_input_tokens_seen": 21795240, "step": 33200 }, { "epoch": 19.578419811320753, "grad_norm": 4.505040645599365, "learning_rate": 1.3566649266538723e-08, "loss": 0.3444, "num_input_tokens_seen": 21797800, "step": 33205 }, { "epoch": 19.5813679245283, "grad_norm": 3.160975694656372, "learning_rate": 1.3377916755786257e-08, "loss": 0.2141, "num_input_tokens_seen": 21802728, "step": 33210 }, { "epoch": 19.58431603773585, "grad_norm": 3.0343966484069824, "learning_rate": 1.3190504474721588e-08, "loss": 0.2863, "num_input_tokens_seen": 21805640, "step": 33215 }, { "epoch": 19.587264150943398, "grad_norm": 4.419856548309326, "learning_rate": 1.3004412472962802e-08, "loss": 0.3381, "num_input_tokens_seen": 21809480, "step": 33220 }, { "epoch": 19.590212264150942, "grad_norm": 5.474216938018799, "learning_rate": 1.2819640799778266e-08, "loss": 0.2684, "num_input_tokens_seen": 21812232, "step": 33225 }, { "epoch": 19.59316037735849, "grad_norm": 2.8511857986450195, "learning_rate": 1.2636189504087737e-08, "loss": 0.2932, "num_input_tokens_seen": 21814856, "step": 33230 }, { "epoch": 19.59610849056604, "grad_norm": 3.3703503608703613, "learning_rate": 1.2454058634460142e-08, "loss": 0.268, "num_input_tokens_seen": 21817512, "step": 33235 }, { "epoch": 19.599056603773583, "grad_norm": 3.343593120574951, "learning_rate": 1.2273248239115798e-08, "loss": 0.2952, "num_input_tokens_seen": 21820392, "step": 33240 }, { "epoch": 19.60200471698113, "grad_norm": 3.696733236312866, "learning_rate": 1.2093758365924746e-08, "loss": 0.2987, "num_input_tokens_seen": 21823688, "step": 33245 }, { "epoch": 19.60495283018868, "grad_norm": 1.9375697374343872, "learning_rate": 1.1915589062408417e-08, "loss": 0.2214, "num_input_tokens_seen": 21827240, "step": 33250 }, { "epoch": 19.607900943396228, "grad_norm": 3.067296266555786, "learning_rate": 1.1738740375736301e-08, "loss": 0.315, "num_input_tokens_seen": 21829800, "step": 33255 }, { "epoch": 19.610849056603772, "grad_norm": 5.685369968414307, "learning_rate": 1.15632123527315e-08, "loss": 0.2046, "num_input_tokens_seen": 21831976, "step": 33260 }, { "epoch": 19.61379716981132, "grad_norm": 3.439405679702759, "learning_rate": 1.1389005039865176e-08, "loss": 0.4619, "num_input_tokens_seen": 21835208, "step": 33265 }, { "epoch": 19.61674528301887, "grad_norm": 3.84596586227417, "learning_rate": 1.1216118483259875e-08, "loss": 0.3354, "num_input_tokens_seen": 21839432, "step": 33270 }, { "epoch": 19.619693396226417, "grad_norm": 2.9982426166534424, "learning_rate": 1.1044552728687319e-08, "loss": 0.397, "num_input_tokens_seen": 21843144, "step": 33275 }, { "epoch": 19.62264150943396, "grad_norm": 4.225620746612549, "learning_rate": 1.0874307821570618e-08, "loss": 0.2769, "num_input_tokens_seen": 21846504, "step": 33280 }, { "epoch": 19.62558962264151, "grad_norm": 6.297050476074219, "learning_rate": 1.0705383806982606e-08, "loss": 0.4139, "num_input_tokens_seen": 21848968, "step": 33285 }, { "epoch": 19.628537735849058, "grad_norm": 3.751392364501953, "learning_rate": 1.0537780729646952e-08, "loss": 0.3176, "num_input_tokens_seen": 21852520, "step": 33290 }, { "epoch": 19.631485849056602, "grad_norm": 3.0809335708618164, "learning_rate": 1.0371498633937605e-08, "loss": 0.2398, "num_input_tokens_seen": 21856008, "step": 33295 }, { "epoch": 19.63443396226415, "grad_norm": 4.048020839691162, "learning_rate": 1.0206537563877683e-08, "loss": 0.2832, "num_input_tokens_seen": 21858888, "step": 33300 }, { "epoch": 19.6373820754717, "grad_norm": 3.513007640838623, "learning_rate": 1.0042897563141695e-08, "loss": 0.3042, "num_input_tokens_seen": 21861512, "step": 33305 }, { "epoch": 19.640330188679247, "grad_norm": 2.0452911853790283, "learning_rate": 9.88057867505443e-09, "loss": 0.3379, "num_input_tokens_seen": 21865864, "step": 33310 }, { "epoch": 19.64327830188679, "grad_norm": 2.7330479621887207, "learning_rate": 9.7195809425904e-09, "loss": 0.3621, "num_input_tokens_seen": 21868840, "step": 33315 }, { "epoch": 19.64622641509434, "grad_norm": 2.568960189819336, "learning_rate": 9.559904408373844e-09, "loss": 0.3764, "num_input_tokens_seen": 21871816, "step": 33320 }, { "epoch": 19.649174528301888, "grad_norm": 2.5925774574279785, "learning_rate": 9.401549114680387e-09, "loss": 0.3132, "num_input_tokens_seen": 21874632, "step": 33325 }, { "epoch": 19.652122641509433, "grad_norm": 3.543515920639038, "learning_rate": 9.244515103434826e-09, "loss": 0.3511, "num_input_tokens_seen": 21877416, "step": 33330 }, { "epoch": 19.65507075471698, "grad_norm": 2.8799631595611572, "learning_rate": 9.08880241621335e-09, "loss": 0.341, "num_input_tokens_seen": 21880136, "step": 33335 }, { "epoch": 19.65801886792453, "grad_norm": 5.26106595993042, "learning_rate": 8.934411094240758e-09, "loss": 0.4832, "num_input_tokens_seen": 21884168, "step": 33340 }, { "epoch": 19.660966981132077, "grad_norm": 2.515576124191284, "learning_rate": 8.781341178393244e-09, "loss": 0.3376, "num_input_tokens_seen": 21887592, "step": 33345 }, { "epoch": 19.66391509433962, "grad_norm": 4.146634101867676, "learning_rate": 8.629592709196167e-09, "loss": 0.3358, "num_input_tokens_seen": 21891016, "step": 33350 }, { "epoch": 19.66686320754717, "grad_norm": 2.837592601776123, "learning_rate": 8.479165726826277e-09, "loss": 0.3523, "num_input_tokens_seen": 21893928, "step": 33355 }, { "epoch": 19.669811320754718, "grad_norm": 3.6670539379119873, "learning_rate": 8.330060271109496e-09, "loss": 0.2256, "num_input_tokens_seen": 21897320, "step": 33360 }, { "epoch": 19.672759433962263, "grad_norm": 4.35264778137207, "learning_rate": 8.18227638152258e-09, "loss": 0.4062, "num_input_tokens_seen": 21900872, "step": 33365 }, { "epoch": 19.67570754716981, "grad_norm": 4.713831901550293, "learning_rate": 8.035814097191452e-09, "loss": 0.4882, "num_input_tokens_seen": 21904168, "step": 33370 }, { "epoch": 19.67865566037736, "grad_norm": 2.430180549621582, "learning_rate": 7.890673456892317e-09, "loss": 0.2872, "num_input_tokens_seen": 21908424, "step": 33375 }, { "epoch": 19.681603773584907, "grad_norm": 3.576939582824707, "learning_rate": 7.746854499052215e-09, "loss": 0.2732, "num_input_tokens_seen": 21911048, "step": 33380 }, { "epoch": 19.684551886792452, "grad_norm": 3.797013998031616, "learning_rate": 7.604357261747907e-09, "loss": 0.36, "num_input_tokens_seen": 21914312, "step": 33385 }, { "epoch": 19.6875, "grad_norm": 7.692054748535156, "learning_rate": 7.463181782705886e-09, "loss": 0.3061, "num_input_tokens_seen": 21917224, "step": 33390 }, { "epoch": 19.690448113207548, "grad_norm": 3.102510690689087, "learning_rate": 7.3233280993034726e-09, "loss": 0.4194, "num_input_tokens_seen": 21920488, "step": 33395 }, { "epoch": 19.693396226415093, "grad_norm": 5.9583306312561035, "learning_rate": 7.184796248567161e-09, "loss": 0.33, "num_input_tokens_seen": 21923112, "step": 33400 }, { "epoch": 19.69634433962264, "grad_norm": 2.3439974784851074, "learning_rate": 7.047586267173723e-09, "loss": 0.2672, "num_input_tokens_seen": 21926408, "step": 33405 }, { "epoch": 19.69929245283019, "grad_norm": 3.5302767753601074, "learning_rate": 6.9116981914502114e-09, "loss": 0.2928, "num_input_tokens_seen": 21929224, "step": 33410 }, { "epoch": 19.702240566037737, "grad_norm": 2.7350025177001953, "learning_rate": 6.7771320573734036e-09, "loss": 0.6448, "num_input_tokens_seen": 21932264, "step": 33415 }, { "epoch": 19.705188679245282, "grad_norm": 3.4986226558685303, "learning_rate": 6.6438879005709114e-09, "loss": 0.3176, "num_input_tokens_seen": 21935496, "step": 33420 }, { "epoch": 19.70813679245283, "grad_norm": 3.134547472000122, "learning_rate": 6.511965756318961e-09, "loss": 0.2919, "num_input_tokens_seen": 21938696, "step": 33425 }, { "epoch": 19.71108490566038, "grad_norm": 10.980745315551758, "learning_rate": 6.381365659545169e-09, "loss": 0.3762, "num_input_tokens_seen": 21941544, "step": 33430 }, { "epoch": 19.714033018867923, "grad_norm": 4.404969692230225, "learning_rate": 6.252087644825766e-09, "loss": 0.2594, "num_input_tokens_seen": 21944200, "step": 33435 }, { "epoch": 19.71698113207547, "grad_norm": 4.041815757751465, "learning_rate": 6.124131746388373e-09, "loss": 0.2582, "num_input_tokens_seen": 21947304, "step": 33440 }, { "epoch": 19.71992924528302, "grad_norm": 1.5937291383743286, "learning_rate": 5.997497998109225e-09, "loss": 0.328, "num_input_tokens_seen": 21950728, "step": 33445 }, { "epoch": 19.722877358490567, "grad_norm": 2.785818576812744, "learning_rate": 5.8721864335153925e-09, "loss": 0.2883, "num_input_tokens_seen": 21954280, "step": 33450 }, { "epoch": 19.725825471698112, "grad_norm": 3.9782259464263916, "learning_rate": 5.748197085784224e-09, "loss": 0.2876, "num_input_tokens_seen": 21956968, "step": 33455 }, { "epoch": 19.72877358490566, "grad_norm": 5.6468329429626465, "learning_rate": 5.62552998774113e-09, "loss": 0.3204, "num_input_tokens_seen": 21959400, "step": 33460 }, { "epoch": 19.73172169811321, "grad_norm": 4.879594326019287, "learning_rate": 5.504185171864018e-09, "loss": 0.4159, "num_input_tokens_seen": 21962952, "step": 33465 }, { "epoch": 19.734669811320753, "grad_norm": 3.9689979553222656, "learning_rate": 5.384162670278858e-09, "loss": 0.3801, "num_input_tokens_seen": 21966248, "step": 33470 }, { "epoch": 19.7376179245283, "grad_norm": 3.1544175148010254, "learning_rate": 5.265462514762454e-09, "loss": 0.3325, "num_input_tokens_seen": 21969608, "step": 33475 }, { "epoch": 19.74056603773585, "grad_norm": 4.0891337394714355, "learning_rate": 5.148084736740777e-09, "loss": 0.3521, "num_input_tokens_seen": 21973736, "step": 33480 }, { "epoch": 19.743514150943398, "grad_norm": 2.98569393157959, "learning_rate": 5.032029367290081e-09, "loss": 0.2669, "num_input_tokens_seen": 21976680, "step": 33485 }, { "epoch": 19.746462264150942, "grad_norm": 3.6015985012054443, "learning_rate": 4.917296437136898e-09, "loss": 0.2692, "num_input_tokens_seen": 21979432, "step": 33490 }, { "epoch": 19.74941037735849, "grad_norm": 2.7548458576202393, "learning_rate": 4.8038859766569305e-09, "loss": 0.2487, "num_input_tokens_seen": 21982696, "step": 33495 }, { "epoch": 19.75235849056604, "grad_norm": 3.2171318531036377, "learning_rate": 4.691798015876714e-09, "loss": 0.3339, "num_input_tokens_seen": 21986760, "step": 33500 }, { "epoch": 19.755306603773583, "grad_norm": 5.5918869972229, "learning_rate": 4.581032584470846e-09, "loss": 0.2586, "num_input_tokens_seen": 21990440, "step": 33505 }, { "epoch": 19.75825471698113, "grad_norm": 4.120641708374023, "learning_rate": 4.471589711766422e-09, "loss": 0.1989, "num_input_tokens_seen": 21994152, "step": 33510 }, { "epoch": 19.76120283018868, "grad_norm": 2.4740803241729736, "learning_rate": 4.363469426737487e-09, "loss": 0.2732, "num_input_tokens_seen": 21997736, "step": 33515 }, { "epoch": 19.764150943396228, "grad_norm": 9.618244171142578, "learning_rate": 4.256671758010588e-09, "loss": 0.2735, "num_input_tokens_seen": 22000968, "step": 33520 }, { "epoch": 19.767099056603772, "grad_norm": 2.507578134536743, "learning_rate": 4.151196733859775e-09, "loss": 0.3318, "num_input_tokens_seen": 22004040, "step": 33525 }, { "epoch": 19.77004716981132, "grad_norm": 4.238846302032471, "learning_rate": 4.047044382211041e-09, "loss": 0.2394, "num_input_tokens_seen": 22006792, "step": 33530 }, { "epoch": 19.77299528301887, "grad_norm": 4.130313396453857, "learning_rate": 3.94421473063844e-09, "loss": 0.2777, "num_input_tokens_seen": 22009832, "step": 33535 }, { "epoch": 19.775943396226417, "grad_norm": 3.5623831748962402, "learning_rate": 3.842707806366863e-09, "loss": 0.3239, "num_input_tokens_seen": 22012712, "step": 33540 }, { "epoch": 19.77889150943396, "grad_norm": 2.3634026050567627, "learning_rate": 3.742523636270368e-09, "loss": 0.3933, "num_input_tokens_seen": 22016136, "step": 33545 }, { "epoch": 19.78183962264151, "grad_norm": 3.6151745319366455, "learning_rate": 3.6436622468738468e-09, "loss": 0.329, "num_input_tokens_seen": 22019464, "step": 33550 }, { "epoch": 19.784787735849058, "grad_norm": 6.024074077606201, "learning_rate": 3.546123664350254e-09, "loss": 0.2304, "num_input_tokens_seen": 22021768, "step": 33555 }, { "epoch": 19.787735849056602, "grad_norm": 5.989695072174072, "learning_rate": 3.449907914524486e-09, "loss": 0.3705, "num_input_tokens_seen": 22024776, "step": 33560 }, { "epoch": 19.79068396226415, "grad_norm": 3.0368106365203857, "learning_rate": 3.355015022869501e-09, "loss": 0.2532, "num_input_tokens_seen": 22027432, "step": 33565 }, { "epoch": 19.7936320754717, "grad_norm": 4.301450729370117, "learning_rate": 3.261445014508535e-09, "loss": 0.3641, "num_input_tokens_seen": 22030728, "step": 33570 }, { "epoch": 19.796580188679247, "grad_norm": 2.6790072917938232, "learning_rate": 3.1691979142145503e-09, "loss": 0.2282, "num_input_tokens_seen": 22033960, "step": 33575 }, { "epoch": 19.79952830188679, "grad_norm": 2.669393539428711, "learning_rate": 3.078273746410787e-09, "loss": 0.3339, "num_input_tokens_seen": 22038344, "step": 33580 }, { "epoch": 19.80247641509434, "grad_norm": 7.125037670135498, "learning_rate": 2.988672535169657e-09, "loss": 0.3647, "num_input_tokens_seen": 22043240, "step": 33585 }, { "epoch": 19.805424528301888, "grad_norm": 3.46358585357666, "learning_rate": 2.9003943042127393e-09, "loss": 0.2963, "num_input_tokens_seen": 22046952, "step": 33590 }, { "epoch": 19.808372641509433, "grad_norm": 2.4503448009490967, "learning_rate": 2.8134390769135598e-09, "loss": 0.2915, "num_input_tokens_seen": 22049928, "step": 33595 }, { "epoch": 19.81132075471698, "grad_norm": 3.562318801879883, "learning_rate": 2.7278068762925935e-09, "loss": 0.2329, "num_input_tokens_seen": 22052456, "step": 33600 }, { "epoch": 19.81426886792453, "grad_norm": 5.904191970825195, "learning_rate": 2.6434977250217043e-09, "loss": 0.3623, "num_input_tokens_seen": 22055848, "step": 33605 }, { "epoch": 19.817216981132077, "grad_norm": 2.4953160285949707, "learning_rate": 2.560511645422481e-09, "loss": 0.2423, "num_input_tokens_seen": 22059240, "step": 33610 }, { "epoch": 19.82016509433962, "grad_norm": 2.38295578956604, "learning_rate": 2.4788486594656825e-09, "loss": 0.4027, "num_input_tokens_seen": 22064360, "step": 33615 }, { "epoch": 19.82311320754717, "grad_norm": 4.165903568267822, "learning_rate": 2.398508788771792e-09, "loss": 0.336, "num_input_tokens_seen": 22067208, "step": 33620 }, { "epoch": 19.826061320754718, "grad_norm": 3.835305690765381, "learning_rate": 2.3194920546110166e-09, "loss": 0.3816, "num_input_tokens_seen": 22070440, "step": 33625 }, { "epoch": 19.829009433962263, "grad_norm": 3.1170239448547363, "learning_rate": 2.2417984779032896e-09, "loss": 0.2933, "num_input_tokens_seen": 22073384, "step": 33630 }, { "epoch": 19.83195754716981, "grad_norm": 2.006960153579712, "learning_rate": 2.1654280792193782e-09, "loss": 0.2198, "num_input_tokens_seen": 22076392, "step": 33635 }, { "epoch": 19.83490566037736, "grad_norm": 4.571083068847656, "learning_rate": 2.0903808787769987e-09, "loss": 0.2045, "num_input_tokens_seen": 22079624, "step": 33640 }, { "epoch": 19.837853773584907, "grad_norm": 3.6024372577667236, "learning_rate": 2.0166568964463673e-09, "loss": 0.3187, "num_input_tokens_seen": 22082504, "step": 33645 }, { "epoch": 19.840801886792452, "grad_norm": 4.256567001342773, "learning_rate": 1.9442561517463153e-09, "loss": 0.3294, "num_input_tokens_seen": 22085960, "step": 33650 }, { "epoch": 19.84375, "grad_norm": 8.009331703186035, "learning_rate": 1.8731786638442886e-09, "loss": 0.281, "num_input_tokens_seen": 22089128, "step": 33655 }, { "epoch": 19.846698113207548, "grad_norm": 6.521792888641357, "learning_rate": 1.8034244515591214e-09, "loss": 0.4514, "num_input_tokens_seen": 22092808, "step": 33660 }, { "epoch": 19.849646226415093, "grad_norm": 3.7842459678649902, "learning_rate": 1.7349935333582646e-09, "loss": 0.3594, "num_input_tokens_seen": 22095848, "step": 33665 }, { "epoch": 19.85259433962264, "grad_norm": 5.114498615264893, "learning_rate": 1.6678859273594471e-09, "loss": 0.2195, "num_input_tokens_seen": 22098216, "step": 33670 }, { "epoch": 19.85554245283019, "grad_norm": 5.898948669433594, "learning_rate": 1.6021016513295683e-09, "loss": 0.3642, "num_input_tokens_seen": 22100648, "step": 33675 }, { "epoch": 19.858490566037737, "grad_norm": 5.434137344360352, "learning_rate": 1.5376407226846968e-09, "loss": 0.3118, "num_input_tokens_seen": 22103592, "step": 33680 }, { "epoch": 19.861438679245282, "grad_norm": 3.1042845249176025, "learning_rate": 1.4745031584917357e-09, "loss": 0.3676, "num_input_tokens_seen": 22106920, "step": 33685 }, { "epoch": 19.86438679245283, "grad_norm": 2.862250328063965, "learning_rate": 1.4126889754667583e-09, "loss": 0.2154, "num_input_tokens_seen": 22109864, "step": 33690 }, { "epoch": 19.86733490566038, "grad_norm": 2.5424675941467285, "learning_rate": 1.3521981899750069e-09, "loss": 0.2362, "num_input_tokens_seen": 22113384, "step": 33695 }, { "epoch": 19.870283018867923, "grad_norm": 5.9063310623168945, "learning_rate": 1.293030818032004e-09, "loss": 0.3047, "num_input_tokens_seen": 22116264, "step": 33700 }, { "epoch": 19.87323113207547, "grad_norm": 3.800877809524536, "learning_rate": 1.2351868753018858e-09, "loss": 0.3334, "num_input_tokens_seen": 22119528, "step": 33705 }, { "epoch": 19.87617924528302, "grad_norm": 2.0096731185913086, "learning_rate": 1.1786663770996242e-09, "loss": 0.2979, "num_input_tokens_seen": 22123304, "step": 33710 }, { "epoch": 19.879127358490567, "grad_norm": 6.8044891357421875, "learning_rate": 1.1234693383893602e-09, "loss": 0.2968, "num_input_tokens_seen": 22126312, "step": 33715 }, { "epoch": 19.882075471698112, "grad_norm": 2.969419002532959, "learning_rate": 1.0695957737844043e-09, "loss": 0.3666, "num_input_tokens_seen": 22129640, "step": 33720 }, { "epoch": 19.88502358490566, "grad_norm": 4.026296138763428, "learning_rate": 1.0170456975483467e-09, "loss": 0.3655, "num_input_tokens_seen": 22132648, "step": 33725 }, { "epoch": 19.88797169811321, "grad_norm": 3.6552133560180664, "learning_rate": 9.658191235933922e-10, "loss": 0.3487, "num_input_tokens_seen": 22136104, "step": 33730 }, { "epoch": 19.890919811320753, "grad_norm": 3.0181994438171387, "learning_rate": 9.159160654825805e-10, "loss": 0.3566, "num_input_tokens_seen": 22139848, "step": 33735 }, { "epoch": 19.8938679245283, "grad_norm": 4.775576114654541, "learning_rate": 8.673365364281205e-10, "loss": 0.3501, "num_input_tokens_seen": 22142376, "step": 33740 }, { "epoch": 19.89681603773585, "grad_norm": 2.180443048477173, "learning_rate": 8.200805492913911e-10, "loss": 0.2691, "num_input_tokens_seen": 22146120, "step": 33745 }, { "epoch": 19.899764150943398, "grad_norm": 8.808688163757324, "learning_rate": 7.741481165834952e-10, "loss": 0.2869, "num_input_tokens_seen": 22148840, "step": 33750 }, { "epoch": 19.902712264150942, "grad_norm": 4.1864705085754395, "learning_rate": 7.29539250465261e-10, "loss": 0.2985, "num_input_tokens_seen": 22152904, "step": 33755 }, { "epoch": 19.90566037735849, "grad_norm": 4.844406604766846, "learning_rate": 6.862539627472409e-10, "loss": 0.3478, "num_input_tokens_seen": 22155944, "step": 33760 }, { "epoch": 19.90860849056604, "grad_norm": 3.364931106567383, "learning_rate": 6.442922648897121e-10, "loss": 0.2058, "num_input_tokens_seen": 22158696, "step": 33765 }, { "epoch": 19.911556603773583, "grad_norm": 2.5939183235168457, "learning_rate": 6.036541680015662e-10, "loss": 0.2749, "num_input_tokens_seen": 22162856, "step": 33770 }, { "epoch": 19.91450471698113, "grad_norm": 2.648616313934326, "learning_rate": 5.643396828419745e-10, "loss": 0.2408, "num_input_tokens_seen": 22166184, "step": 33775 }, { "epoch": 19.91745283018868, "grad_norm": 3.7995612621307373, "learning_rate": 5.26348819819833e-10, "loss": 0.3318, "num_input_tokens_seen": 22169864, "step": 33780 }, { "epoch": 19.920400943396228, "grad_norm": 4.102115631103516, "learning_rate": 4.896815889937622e-10, "loss": 0.2939, "num_input_tokens_seen": 22172872, "step": 33785 }, { "epoch": 19.923349056603772, "grad_norm": 2.8301150798797607, "learning_rate": 4.543380000704423e-10, "loss": 0.2555, "num_input_tokens_seen": 22180584, "step": 33790 }, { "epoch": 19.92629716981132, "grad_norm": 2.5516343116760254, "learning_rate": 4.203180624084979e-10, "loss": 0.2493, "num_input_tokens_seen": 22185416, "step": 33795 }, { "epoch": 19.92924528301887, "grad_norm": 5.679764270782471, "learning_rate": 3.876217850146136e-10, "loss": 0.2418, "num_input_tokens_seen": 22188488, "step": 33800 }, { "epoch": 19.932193396226417, "grad_norm": 5.611368656158447, "learning_rate": 3.56249176544643e-10, "loss": 0.2709, "num_input_tokens_seen": 22192136, "step": 33805 }, { "epoch": 19.93514150943396, "grad_norm": 4.732576370239258, "learning_rate": 3.262002453047197e-10, "loss": 0.2726, "num_input_tokens_seen": 22194888, "step": 33810 }, { "epoch": 19.93808962264151, "grad_norm": 3.969005584716797, "learning_rate": 2.974749992512571e-10, "loss": 0.4702, "num_input_tokens_seen": 22200296, "step": 33815 }, { "epoch": 19.941037735849058, "grad_norm": 3.472781181335449, "learning_rate": 2.700734459881726e-10, "loss": 0.2811, "num_input_tokens_seen": 22203496, "step": 33820 }, { "epoch": 19.943985849056602, "grad_norm": 3.6114039421081543, "learning_rate": 2.439955927713289e-10, "loss": 0.2696, "num_input_tokens_seen": 22206440, "step": 33825 }, { "epoch": 19.94693396226415, "grad_norm": 2.1662330627441406, "learning_rate": 2.1924144650409263e-10, "loss": 0.4401, "num_input_tokens_seen": 22210440, "step": 33830 }, { "epoch": 19.9498820754717, "grad_norm": 5.177118301391602, "learning_rate": 1.9581101374066546e-10, "loss": 0.3732, "num_input_tokens_seen": 22213448, "step": 33835 }, { "epoch": 19.952830188679247, "grad_norm": 3.3303768634796143, "learning_rate": 1.7370430068441858e-10, "loss": 0.2899, "num_input_tokens_seen": 22216072, "step": 33840 }, { "epoch": 19.95577830188679, "grad_norm": 4.157655715942383, "learning_rate": 1.529213131878926e-10, "loss": 0.301, "num_input_tokens_seen": 22219080, "step": 33845 }, { "epoch": 19.95872641509434, "grad_norm": 5.0223236083984375, "learning_rate": 1.3346205675335288e-10, "loss": 0.4291, "num_input_tokens_seen": 22223848, "step": 33850 }, { "epoch": 19.961674528301888, "grad_norm": 4.475122451782227, "learning_rate": 1.1532653653334447e-10, "loss": 0.3624, "num_input_tokens_seen": 22227368, "step": 33855 }, { "epoch": 19.964622641509433, "grad_norm": 4.9691596031188965, "learning_rate": 9.85147573284717e-11, "loss": 0.3401, "num_input_tokens_seen": 22229928, "step": 33860 }, { "epoch": 19.96757075471698, "grad_norm": 3.8071768283843994, "learning_rate": 8.302672359072894e-11, "loss": 0.3796, "num_input_tokens_seen": 22233416, "step": 33865 }, { "epoch": 19.97051886792453, "grad_norm": 2.8690426349639893, "learning_rate": 6.886243941961468e-11, "loss": 0.3194, "num_input_tokens_seen": 22236200, "step": 33870 }, { "epoch": 19.973466981132077, "grad_norm": 3.188943862915039, "learning_rate": 5.602190856601741e-11, "loss": 0.3417, "num_input_tokens_seen": 22238952, "step": 33875 }, { "epoch": 19.97641509433962, "grad_norm": 3.8367841243743896, "learning_rate": 4.450513442888493e-11, "loss": 0.4159, "num_input_tokens_seen": 22242056, "step": 33880 }, { "epoch": 19.97936320754717, "grad_norm": 3.563401937484741, "learning_rate": 3.4312120057999886e-11, "loss": 0.3453, "num_input_tokens_seen": 22245576, "step": 33885 }, { "epoch": 19.982311320754718, "grad_norm": 3.908334255218506, "learning_rate": 2.5442868151204224e-11, "loss": 0.3803, "num_input_tokens_seen": 22248424, "step": 33890 }, { "epoch": 19.985259433962263, "grad_norm": 6.026727199554443, "learning_rate": 1.7897381057729867e-11, "loss": 0.3286, "num_input_tokens_seen": 22250888, "step": 33895 }, { "epoch": 19.98820754716981, "grad_norm": 4.07478141784668, "learning_rate": 1.1675660773757813e-11, "loss": 0.3245, "num_input_tokens_seen": 22254824, "step": 33900 }, { "epoch": 19.99115566037736, "grad_norm": 3.9052696228027344, "learning_rate": 6.777708947969253e-12, "loss": 0.3288, "num_input_tokens_seen": 22257512, "step": 33905 }, { "epoch": 19.994103773584907, "grad_norm": 4.661067008972168, "learning_rate": 3.2035268765495674e-12, "loss": 0.358, "num_input_tokens_seen": 22260488, "step": 33910 }, { "epoch": 19.997051886792452, "grad_norm": 8.17127799987793, "learning_rate": 9.531155059638863e-13, "loss": 0.2852, "num_input_tokens_seen": 22263528, "step": 33915 }, { "epoch": 20.0, "grad_norm": 16.215375900268555, "learning_rate": 2.6475431291750342e-14, "loss": 0.3684, "num_input_tokens_seen": 22265472, "step": 33920 }, { "epoch": 20.0, "eval_loss": 0.6077574491500854, "eval_runtime": 18.8066, "eval_samples_per_second": 90.181, "eval_steps_per_second": 22.545, "num_input_tokens_seen": 22265472, "step": 33920 }, { "epoch": 20.0, "num_input_tokens_seen": 22265472, "step": 33920, "total_flos": 1.0026041408791511e+18, "train_loss": 0.4615844900104797, "train_runtime": 3911.661, "train_samples_per_second": 34.671, "train_steps_per_second": 8.672 } ], "logging_steps": 5, "max_steps": 33920, "num_input_tokens_seen": 22265472, "num_train_epochs": 20, "save_steps": 3392, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0026041408791511e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }