{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9992755373098285, "eval_steps": 500, "global_step": 4140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007244626901714562, "grad_norm": 5.692458629608154, "learning_rate": 2.4154589371980678e-08, "loss": 0.8083, "step": 1 }, { "epoch": 0.0014489253803429123, "grad_norm": 5.845697402954102, "learning_rate": 4.8309178743961356e-08, "loss": 0.8498, "step": 2 }, { "epoch": 0.0021733880705143687, "grad_norm": 5.869663715362549, "learning_rate": 7.246376811594204e-08, "loss": 0.8313, "step": 3 }, { "epoch": 0.0028978507606858247, "grad_norm": 6.032198905944824, "learning_rate": 9.661835748792271e-08, "loss": 0.8467, "step": 4 }, { "epoch": 0.003622313450857281, "grad_norm": 5.566505432128906, "learning_rate": 1.2077294685990338e-07, "loss": 0.8322, "step": 5 }, { "epoch": 0.004346776141028737, "grad_norm": 5.945607662200928, "learning_rate": 1.4492753623188408e-07, "loss": 0.8402, "step": 6 }, { "epoch": 0.005071238831200193, "grad_norm": 5.585754871368408, "learning_rate": 1.6908212560386473e-07, "loss": 0.8299, "step": 7 }, { "epoch": 0.005795701521371649, "grad_norm": 5.544276714324951, "learning_rate": 1.9323671497584542e-07, "loss": 0.837, "step": 8 }, { "epoch": 0.006520164211543105, "grad_norm": 5.9799675941467285, "learning_rate": 2.173913043478261e-07, "loss": 0.8843, "step": 9 }, { "epoch": 0.007244626901714562, "grad_norm": 5.666192531585693, "learning_rate": 2.4154589371980677e-07, "loss": 0.8261, "step": 10 }, { "epoch": 0.007969089591886018, "grad_norm": 5.554163932800293, "learning_rate": 2.6570048309178746e-07, "loss": 0.8072, "step": 11 }, { "epoch": 0.008693552282057475, "grad_norm": 5.869314670562744, "learning_rate": 2.8985507246376816e-07, "loss": 0.8757, "step": 12 }, { "epoch": 0.00941801497222893, "grad_norm": 5.950514793395996, "learning_rate": 3.140096618357488e-07, "loss": 0.8512, "step": 13 }, { "epoch": 0.010142477662400387, "grad_norm": 5.789525508880615, "learning_rate": 3.3816425120772945e-07, "loss": 0.8287, "step": 14 }, { "epoch": 0.010866940352571842, "grad_norm": 5.3574910163879395, "learning_rate": 3.623188405797102e-07, "loss": 0.8266, "step": 15 }, { "epoch": 0.011591403042743299, "grad_norm": 5.460178852081299, "learning_rate": 3.8647342995169085e-07, "loss": 0.8201, "step": 16 }, { "epoch": 0.012315865732914755, "grad_norm": 5.59174919128418, "learning_rate": 4.1062801932367154e-07, "loss": 0.8514, "step": 17 }, { "epoch": 0.01304032842308621, "grad_norm": 5.195479393005371, "learning_rate": 4.347826086956522e-07, "loss": 0.8209, "step": 18 }, { "epoch": 0.013764791113257667, "grad_norm": 5.461888313293457, "learning_rate": 4.5893719806763294e-07, "loss": 0.8369, "step": 19 }, { "epoch": 0.014489253803429124, "grad_norm": 4.342181205749512, "learning_rate": 4.830917874396135e-07, "loss": 0.7961, "step": 20 }, { "epoch": 0.01521371649360058, "grad_norm": 4.303071975708008, "learning_rate": 5.072463768115942e-07, "loss": 0.7702, "step": 21 }, { "epoch": 0.015938179183772036, "grad_norm": 4.3797221183776855, "learning_rate": 5.314009661835749e-07, "loss": 0.8027, "step": 22 }, { "epoch": 0.016662641873943493, "grad_norm": 4.395879745483398, "learning_rate": 5.555555555555555e-07, "loss": 0.7944, "step": 23 }, { "epoch": 0.01738710456411495, "grad_norm": 3.946931838989258, "learning_rate": 5.797101449275363e-07, "loss": 0.7384, "step": 24 }, { "epoch": 0.018111567254286403, "grad_norm": 3.9909114837646484, "learning_rate": 6.038647342995169e-07, "loss": 0.7701, "step": 25 }, { "epoch": 0.01883602994445786, "grad_norm": 4.019972324371338, "learning_rate": 6.280193236714976e-07, "loss": 0.7928, "step": 26 }, { "epoch": 0.019560492634629317, "grad_norm": 2.6854958534240723, "learning_rate": 6.521739130434783e-07, "loss": 0.7276, "step": 27 }, { "epoch": 0.020284955324800773, "grad_norm": 2.2493503093719482, "learning_rate": 6.763285024154589e-07, "loss": 0.7428, "step": 28 }, { "epoch": 0.02100941801497223, "grad_norm": 2.329251289367676, "learning_rate": 7.004830917874397e-07, "loss": 0.7628, "step": 29 }, { "epoch": 0.021733880705143684, "grad_norm": 2.3079590797424316, "learning_rate": 7.246376811594204e-07, "loss": 0.7401, "step": 30 }, { "epoch": 0.02245834339531514, "grad_norm": 2.215883731842041, "learning_rate": 7.48792270531401e-07, "loss": 0.7616, "step": 31 }, { "epoch": 0.023182806085486597, "grad_norm": 2.0352723598480225, "learning_rate": 7.729468599033817e-07, "loss": 0.7005, "step": 32 }, { "epoch": 0.023907268775658054, "grad_norm": 1.9396977424621582, "learning_rate": 7.971014492753623e-07, "loss": 0.7518, "step": 33 }, { "epoch": 0.02463173146582951, "grad_norm": 1.9599989652633667, "learning_rate": 8.212560386473431e-07, "loss": 0.7288, "step": 34 }, { "epoch": 0.025356194156000968, "grad_norm": 1.69314706325531, "learning_rate": 8.454106280193238e-07, "loss": 0.7271, "step": 35 }, { "epoch": 0.02608065684617242, "grad_norm": 1.4534565210342407, "learning_rate": 8.695652173913044e-07, "loss": 0.6832, "step": 36 }, { "epoch": 0.026805119536343878, "grad_norm": 1.5373289585113525, "learning_rate": 8.937198067632851e-07, "loss": 0.7283, "step": 37 }, { "epoch": 0.027529582226515335, "grad_norm": 1.8246307373046875, "learning_rate": 9.178743961352659e-07, "loss": 0.72, "step": 38 }, { "epoch": 0.02825404491668679, "grad_norm": 1.9413942098617554, "learning_rate": 9.420289855072465e-07, "loss": 0.6713, "step": 39 }, { "epoch": 0.028978507606858248, "grad_norm": 2.1538703441619873, "learning_rate": 9.66183574879227e-07, "loss": 0.6823, "step": 40 }, { "epoch": 0.0297029702970297, "grad_norm": 2.146657705307007, "learning_rate": 9.903381642512078e-07, "loss": 0.6982, "step": 41 }, { "epoch": 0.03042743298720116, "grad_norm": 1.977217197418213, "learning_rate": 1.0144927536231885e-06, "loss": 0.6852, "step": 42 }, { "epoch": 0.031151895677372615, "grad_norm": 1.8195909261703491, "learning_rate": 1.0386473429951692e-06, "loss": 0.6671, "step": 43 }, { "epoch": 0.03187635836754407, "grad_norm": 1.5773699283599854, "learning_rate": 1.0628019323671499e-06, "loss": 0.6651, "step": 44 }, { "epoch": 0.03260082105771553, "grad_norm": 1.4141956567764282, "learning_rate": 1.0869565217391306e-06, "loss": 0.6738, "step": 45 }, { "epoch": 0.033325283747886986, "grad_norm": 1.3008631467819214, "learning_rate": 1.111111111111111e-06, "loss": 0.6926, "step": 46 }, { "epoch": 0.03404974643805844, "grad_norm": 1.1756030321121216, "learning_rate": 1.135265700483092e-06, "loss": 0.6764, "step": 47 }, { "epoch": 0.0347742091282299, "grad_norm": 0.9896304607391357, "learning_rate": 1.1594202898550726e-06, "loss": 0.6837, "step": 48 }, { "epoch": 0.03549867181840135, "grad_norm": 0.930390477180481, "learning_rate": 1.1835748792270531e-06, "loss": 0.6823, "step": 49 }, { "epoch": 0.036223134508572806, "grad_norm": 0.989176869392395, "learning_rate": 1.2077294685990338e-06, "loss": 0.6716, "step": 50 }, { "epoch": 0.03694759719874426, "grad_norm": 1.0605345964431763, "learning_rate": 1.2318840579710147e-06, "loss": 0.6111, "step": 51 }, { "epoch": 0.03767205988891572, "grad_norm": 1.1027417182922363, "learning_rate": 1.2560386473429952e-06, "loss": 0.6372, "step": 52 }, { "epoch": 0.038396522579087176, "grad_norm": 0.9564452767372131, "learning_rate": 1.2801932367149761e-06, "loss": 0.6459, "step": 53 }, { "epoch": 0.03912098526925863, "grad_norm": 0.9055147171020508, "learning_rate": 1.3043478260869566e-06, "loss": 0.6538, "step": 54 }, { "epoch": 0.03984544795943009, "grad_norm": 0.7458699345588684, "learning_rate": 1.3285024154589373e-06, "loss": 0.635, "step": 55 }, { "epoch": 0.04056991064960155, "grad_norm": 0.7433354258537292, "learning_rate": 1.3526570048309178e-06, "loss": 0.6306, "step": 56 }, { "epoch": 0.041294373339773004, "grad_norm": 0.7450302243232727, "learning_rate": 1.3768115942028987e-06, "loss": 0.6536, "step": 57 }, { "epoch": 0.04201883602994446, "grad_norm": 0.6905714273452759, "learning_rate": 1.4009661835748794e-06, "loss": 0.6256, "step": 58 }, { "epoch": 0.04274329872011592, "grad_norm": 0.6706605553627014, "learning_rate": 1.42512077294686e-06, "loss": 0.6012, "step": 59 }, { "epoch": 0.04346776141028737, "grad_norm": 0.6252108216285706, "learning_rate": 1.4492753623188408e-06, "loss": 0.6489, "step": 60 }, { "epoch": 0.044192224100458824, "grad_norm": 0.6710149645805359, "learning_rate": 1.4734299516908213e-06, "loss": 0.6249, "step": 61 }, { "epoch": 0.04491668679063028, "grad_norm": 0.5865916609764099, "learning_rate": 1.497584541062802e-06, "loss": 0.6061, "step": 62 }, { "epoch": 0.04564114948080174, "grad_norm": 0.6767906546592712, "learning_rate": 1.521739130434783e-06, "loss": 0.6156, "step": 63 }, { "epoch": 0.046365612170973194, "grad_norm": 0.7680872678756714, "learning_rate": 1.5458937198067634e-06, "loss": 0.6632, "step": 64 }, { "epoch": 0.04709007486114465, "grad_norm": 0.6387614011764526, "learning_rate": 1.570048309178744e-06, "loss": 0.6074, "step": 65 }, { "epoch": 0.04781453755131611, "grad_norm": 0.5589495301246643, "learning_rate": 1.5942028985507246e-06, "loss": 0.5738, "step": 66 }, { "epoch": 0.048539000241487565, "grad_norm": 0.5194693803787231, "learning_rate": 1.6183574879227055e-06, "loss": 0.5877, "step": 67 }, { "epoch": 0.04926346293165902, "grad_norm": 0.48751363158226013, "learning_rate": 1.6425120772946862e-06, "loss": 0.5956, "step": 68 }, { "epoch": 0.04998792562183048, "grad_norm": 0.49916476011276245, "learning_rate": 1.6666666666666667e-06, "loss": 0.6116, "step": 69 }, { "epoch": 0.050712388312001935, "grad_norm": 0.5422055125236511, "learning_rate": 1.6908212560386476e-06, "loss": 0.5992, "step": 70 }, { "epoch": 0.051436851002173385, "grad_norm": 0.4752846360206604, "learning_rate": 1.7149758454106283e-06, "loss": 0.5876, "step": 71 }, { "epoch": 0.05216131369234484, "grad_norm": 0.546368420124054, "learning_rate": 1.7391304347826088e-06, "loss": 0.5913, "step": 72 }, { "epoch": 0.0528857763825163, "grad_norm": 0.5277104377746582, "learning_rate": 1.7632850241545897e-06, "loss": 0.5654, "step": 73 }, { "epoch": 0.053610239072687756, "grad_norm": 0.5583764314651489, "learning_rate": 1.7874396135265702e-06, "loss": 0.6104, "step": 74 }, { "epoch": 0.05433470176285921, "grad_norm": 0.5480570197105408, "learning_rate": 1.8115942028985508e-06, "loss": 0.5719, "step": 75 }, { "epoch": 0.05505916445303067, "grad_norm": 0.4735434949398041, "learning_rate": 1.8357487922705318e-06, "loss": 0.601, "step": 76 }, { "epoch": 0.055783627143202126, "grad_norm": 0.4870864450931549, "learning_rate": 1.8599033816425122e-06, "loss": 0.5864, "step": 77 }, { "epoch": 0.05650808983337358, "grad_norm": 0.4513680040836334, "learning_rate": 1.884057971014493e-06, "loss": 0.555, "step": 78 }, { "epoch": 0.05723255252354504, "grad_norm": 0.4437417984008789, "learning_rate": 1.9082125603864736e-06, "loss": 0.5871, "step": 79 }, { "epoch": 0.057957015213716497, "grad_norm": 0.4588111340999603, "learning_rate": 1.932367149758454e-06, "loss": 0.5775, "step": 80 }, { "epoch": 0.058681477903887946, "grad_norm": 0.44121602177619934, "learning_rate": 1.956521739130435e-06, "loss": 0.5901, "step": 81 }, { "epoch": 0.0594059405940594, "grad_norm": 0.45062318444252014, "learning_rate": 1.9806763285024155e-06, "loss": 0.5875, "step": 82 }, { "epoch": 0.06013040328423086, "grad_norm": 0.46856632828712463, "learning_rate": 2.0048309178743964e-06, "loss": 0.5803, "step": 83 }, { "epoch": 0.06085486597440232, "grad_norm": 0.4999275207519531, "learning_rate": 2.028985507246377e-06, "loss": 0.5915, "step": 84 }, { "epoch": 0.061579328664573774, "grad_norm": 0.3979341983795166, "learning_rate": 2.053140096618358e-06, "loss": 0.5715, "step": 85 }, { "epoch": 0.06230379135474523, "grad_norm": 0.443066269159317, "learning_rate": 2.0772946859903383e-06, "loss": 0.5524, "step": 86 }, { "epoch": 0.06302825404491669, "grad_norm": 0.4662742614746094, "learning_rate": 2.101449275362319e-06, "loss": 0.5717, "step": 87 }, { "epoch": 0.06375271673508814, "grad_norm": 0.4507353603839874, "learning_rate": 2.1256038647342997e-06, "loss": 0.5609, "step": 88 }, { "epoch": 0.0644771794252596, "grad_norm": 0.41581836342811584, "learning_rate": 2.1497584541062806e-06, "loss": 0.5504, "step": 89 }, { "epoch": 0.06520164211543106, "grad_norm": 0.5001596212387085, "learning_rate": 2.173913043478261e-06, "loss": 0.576, "step": 90 }, { "epoch": 0.06592610480560251, "grad_norm": 0.43446943163871765, "learning_rate": 2.1980676328502416e-06, "loss": 0.5669, "step": 91 }, { "epoch": 0.06665056749577397, "grad_norm": 0.46627894043922424, "learning_rate": 2.222222222222222e-06, "loss": 0.5427, "step": 92 }, { "epoch": 0.06737503018594543, "grad_norm": 0.3846312463283539, "learning_rate": 2.246376811594203e-06, "loss": 0.5615, "step": 93 }, { "epoch": 0.06809949287611688, "grad_norm": 0.40571990609169006, "learning_rate": 2.270531400966184e-06, "loss": 0.5367, "step": 94 }, { "epoch": 0.06882395556628834, "grad_norm": 0.43824541568756104, "learning_rate": 2.2946859903381644e-06, "loss": 0.5772, "step": 95 }, { "epoch": 0.0695484182564598, "grad_norm": 0.48324185609817505, "learning_rate": 2.3188405797101453e-06, "loss": 0.5854, "step": 96 }, { "epoch": 0.07027288094663126, "grad_norm": 0.42265331745147705, "learning_rate": 2.3429951690821258e-06, "loss": 0.5442, "step": 97 }, { "epoch": 0.0709973436368027, "grad_norm": 0.38139575719833374, "learning_rate": 2.3671497584541063e-06, "loss": 0.5388, "step": 98 }, { "epoch": 0.07172180632697416, "grad_norm": 0.41075894236564636, "learning_rate": 2.391304347826087e-06, "loss": 0.5382, "step": 99 }, { "epoch": 0.07244626901714561, "grad_norm": 0.4165528416633606, "learning_rate": 2.4154589371980677e-06, "loss": 0.5504, "step": 100 }, { "epoch": 0.07317073170731707, "grad_norm": 0.43313759565353394, "learning_rate": 2.4396135265700486e-06, "loss": 0.5361, "step": 101 }, { "epoch": 0.07389519439748853, "grad_norm": 0.4495377540588379, "learning_rate": 2.4637681159420295e-06, "loss": 0.5403, "step": 102 }, { "epoch": 0.07461965708765998, "grad_norm": 0.4305095672607422, "learning_rate": 2.48792270531401e-06, "loss": 0.5667, "step": 103 }, { "epoch": 0.07534411977783144, "grad_norm": 0.4191048741340637, "learning_rate": 2.5120772946859904e-06, "loss": 0.5201, "step": 104 }, { "epoch": 0.0760685824680029, "grad_norm": 0.4199225902557373, "learning_rate": 2.5362318840579714e-06, "loss": 0.5376, "step": 105 }, { "epoch": 0.07679304515817435, "grad_norm": 0.3827517628669739, "learning_rate": 2.5603864734299523e-06, "loss": 0.5076, "step": 106 }, { "epoch": 0.07751750784834581, "grad_norm": 0.4294317960739136, "learning_rate": 2.5845410628019323e-06, "loss": 0.5272, "step": 107 }, { "epoch": 0.07824197053851727, "grad_norm": 0.3756121098995209, "learning_rate": 2.6086956521739132e-06, "loss": 0.5478, "step": 108 }, { "epoch": 0.07896643322868872, "grad_norm": 0.38546082377433777, "learning_rate": 2.632850241545894e-06, "loss": 0.563, "step": 109 }, { "epoch": 0.07969089591886018, "grad_norm": 0.42863184213638306, "learning_rate": 2.6570048309178746e-06, "loss": 0.5486, "step": 110 }, { "epoch": 0.08041535860903164, "grad_norm": 0.41263723373413086, "learning_rate": 2.6811594202898555e-06, "loss": 0.5523, "step": 111 }, { "epoch": 0.0811398212992031, "grad_norm": 0.3808525800704956, "learning_rate": 2.7053140096618356e-06, "loss": 0.5341, "step": 112 }, { "epoch": 0.08186428398937455, "grad_norm": 0.4495500326156616, "learning_rate": 2.7294685990338165e-06, "loss": 0.554, "step": 113 }, { "epoch": 0.08258874667954601, "grad_norm": 0.43805092573165894, "learning_rate": 2.7536231884057974e-06, "loss": 0.5749, "step": 114 }, { "epoch": 0.08331320936971746, "grad_norm": 0.41404420137405396, "learning_rate": 2.7777777777777783e-06, "loss": 0.5525, "step": 115 }, { "epoch": 0.08403767205988892, "grad_norm": 0.3961395025253296, "learning_rate": 2.801932367149759e-06, "loss": 0.5546, "step": 116 }, { "epoch": 0.08476213475006038, "grad_norm": 0.42186239361763, "learning_rate": 2.8260869565217393e-06, "loss": 0.5742, "step": 117 }, { "epoch": 0.08548659744023183, "grad_norm": 0.4301183819770813, "learning_rate": 2.85024154589372e-06, "loss": 0.5402, "step": 118 }, { "epoch": 0.08621106013040328, "grad_norm": 0.40048015117645264, "learning_rate": 2.8743961352657007e-06, "loss": 0.5207, "step": 119 }, { "epoch": 0.08693552282057473, "grad_norm": 0.4188607633113861, "learning_rate": 2.8985507246376816e-06, "loss": 0.5499, "step": 120 }, { "epoch": 0.08765998551074619, "grad_norm": 0.4856567680835724, "learning_rate": 2.922705314009662e-06, "loss": 0.5679, "step": 121 }, { "epoch": 0.08838444820091765, "grad_norm": 0.40552598237991333, "learning_rate": 2.9468599033816426e-06, "loss": 0.5159, "step": 122 }, { "epoch": 0.0891089108910891, "grad_norm": 0.4144699275493622, "learning_rate": 2.9710144927536235e-06, "loss": 0.5133, "step": 123 }, { "epoch": 0.08983337358126056, "grad_norm": 0.4032340347766876, "learning_rate": 2.995169082125604e-06, "loss": 0.562, "step": 124 }, { "epoch": 0.09055783627143202, "grad_norm": 0.4288605749607086, "learning_rate": 3.019323671497585e-06, "loss": 0.5354, "step": 125 }, { "epoch": 0.09128229896160348, "grad_norm": 0.4288913607597351, "learning_rate": 3.043478260869566e-06, "loss": 0.5421, "step": 126 }, { "epoch": 0.09200676165177493, "grad_norm": 0.37692990899086, "learning_rate": 3.067632850241546e-06, "loss": 0.5264, "step": 127 }, { "epoch": 0.09273122434194639, "grad_norm": 0.4359961450099945, "learning_rate": 3.0917874396135268e-06, "loss": 0.4992, "step": 128 }, { "epoch": 0.09345568703211785, "grad_norm": 0.4114662706851959, "learning_rate": 3.1159420289855073e-06, "loss": 0.5568, "step": 129 }, { "epoch": 0.0941801497222893, "grad_norm": 0.41098445653915405, "learning_rate": 3.140096618357488e-06, "loss": 0.5391, "step": 130 }, { "epoch": 0.09490461241246076, "grad_norm": 0.4031665027141571, "learning_rate": 3.164251207729469e-06, "loss": 0.5157, "step": 131 }, { "epoch": 0.09562907510263222, "grad_norm": 0.39645469188690186, "learning_rate": 3.188405797101449e-06, "loss": 0.4978, "step": 132 }, { "epoch": 0.09635353779280367, "grad_norm": 0.3772999942302704, "learning_rate": 3.21256038647343e-06, "loss": 0.5008, "step": 133 }, { "epoch": 0.09707800048297513, "grad_norm": 0.37768325209617615, "learning_rate": 3.236714975845411e-06, "loss": 0.5229, "step": 134 }, { "epoch": 0.09780246317314659, "grad_norm": 0.42896610498428345, "learning_rate": 3.2608695652173914e-06, "loss": 0.5052, "step": 135 }, { "epoch": 0.09852692586331804, "grad_norm": 0.4112709164619446, "learning_rate": 3.2850241545893724e-06, "loss": 0.5586, "step": 136 }, { "epoch": 0.0992513885534895, "grad_norm": 0.42361101508140564, "learning_rate": 3.3091787439613533e-06, "loss": 0.5826, "step": 137 }, { "epoch": 0.09997585124366096, "grad_norm": 0.432756245136261, "learning_rate": 3.3333333333333333e-06, "loss": 0.5352, "step": 138 }, { "epoch": 0.10070031393383241, "grad_norm": 0.4182632863521576, "learning_rate": 3.3574879227053142e-06, "loss": 0.5613, "step": 139 }, { "epoch": 0.10142477662400387, "grad_norm": 0.4321322441101074, "learning_rate": 3.381642512077295e-06, "loss": 0.5441, "step": 140 }, { "epoch": 0.10214923931417531, "grad_norm": 0.4440820515155792, "learning_rate": 3.4057971014492756e-06, "loss": 0.4999, "step": 141 }, { "epoch": 0.10287370200434677, "grad_norm": 0.39243948459625244, "learning_rate": 3.4299516908212565e-06, "loss": 0.513, "step": 142 }, { "epoch": 0.10359816469451823, "grad_norm": 0.43331000208854675, "learning_rate": 3.4541062801932366e-06, "loss": 0.5496, "step": 143 }, { "epoch": 0.10432262738468968, "grad_norm": 0.45473888516426086, "learning_rate": 3.4782608695652175e-06, "loss": 0.4969, "step": 144 }, { "epoch": 0.10504709007486114, "grad_norm": 0.3917466402053833, "learning_rate": 3.5024154589371984e-06, "loss": 0.5062, "step": 145 }, { "epoch": 0.1057715527650326, "grad_norm": 0.3921971321105957, "learning_rate": 3.5265700483091793e-06, "loss": 0.5302, "step": 146 }, { "epoch": 0.10649601545520405, "grad_norm": 0.42358043789863586, "learning_rate": 3.55072463768116e-06, "loss": 0.4686, "step": 147 }, { "epoch": 0.10722047814537551, "grad_norm": 0.4172501266002655, "learning_rate": 3.5748792270531403e-06, "loss": 0.5366, "step": 148 }, { "epoch": 0.10794494083554697, "grad_norm": 0.4090701937675476, "learning_rate": 3.5990338164251208e-06, "loss": 0.5262, "step": 149 }, { "epoch": 0.10866940352571842, "grad_norm": 0.4050650894641876, "learning_rate": 3.6231884057971017e-06, "loss": 0.5367, "step": 150 }, { "epoch": 0.10939386621588988, "grad_norm": 0.42217814922332764, "learning_rate": 3.6473429951690826e-06, "loss": 0.5285, "step": 151 }, { "epoch": 0.11011832890606134, "grad_norm": 0.4367026686668396, "learning_rate": 3.6714975845410635e-06, "loss": 0.5227, "step": 152 }, { "epoch": 0.1108427915962328, "grad_norm": 0.36070993542671204, "learning_rate": 3.6956521739130436e-06, "loss": 0.4903, "step": 153 }, { "epoch": 0.11156725428640425, "grad_norm": 0.4787660241127014, "learning_rate": 3.7198067632850245e-06, "loss": 0.544, "step": 154 }, { "epoch": 0.11229171697657571, "grad_norm": 0.47188517451286316, "learning_rate": 3.743961352657005e-06, "loss": 0.5644, "step": 155 }, { "epoch": 0.11301617966674717, "grad_norm": 0.3707191050052643, "learning_rate": 3.768115942028986e-06, "loss": 0.4911, "step": 156 }, { "epoch": 0.11374064235691862, "grad_norm": 0.4298829436302185, "learning_rate": 3.792270531400967e-06, "loss": 0.4907, "step": 157 }, { "epoch": 0.11446510504709008, "grad_norm": 0.4169030785560608, "learning_rate": 3.816425120772947e-06, "loss": 0.5336, "step": 158 }, { "epoch": 0.11518956773726154, "grad_norm": 0.4100486934185028, "learning_rate": 3.840579710144928e-06, "loss": 0.5356, "step": 159 }, { "epoch": 0.11591403042743299, "grad_norm": 0.42012345790863037, "learning_rate": 3.864734299516908e-06, "loss": 0.5145, "step": 160 }, { "epoch": 0.11663849311760445, "grad_norm": 0.48124903440475464, "learning_rate": 3.88888888888889e-06, "loss": 0.5171, "step": 161 }, { "epoch": 0.11736295580777589, "grad_norm": 0.4163072407245636, "learning_rate": 3.91304347826087e-06, "loss": 0.4749, "step": 162 }, { "epoch": 0.11808741849794735, "grad_norm": 0.41929641366004944, "learning_rate": 3.9371980676328506e-06, "loss": 0.5165, "step": 163 }, { "epoch": 0.1188118811881188, "grad_norm": 0.41240638494491577, "learning_rate": 3.961352657004831e-06, "loss": 0.5131, "step": 164 }, { "epoch": 0.11953634387829026, "grad_norm": 0.44815269112586975, "learning_rate": 3.9855072463768115e-06, "loss": 0.5278, "step": 165 }, { "epoch": 0.12026080656846172, "grad_norm": 0.468707412481308, "learning_rate": 4.009661835748793e-06, "loss": 0.5349, "step": 166 }, { "epoch": 0.12098526925863318, "grad_norm": 0.5336907505989075, "learning_rate": 4.033816425120773e-06, "loss": 0.5158, "step": 167 }, { "epoch": 0.12170973194880463, "grad_norm": 0.4308851659297943, "learning_rate": 4.057971014492754e-06, "loss": 0.4723, "step": 168 }, { "epoch": 0.12243419463897609, "grad_norm": 0.3987612724304199, "learning_rate": 4.082125603864734e-06, "loss": 0.501, "step": 169 }, { "epoch": 0.12315865732914755, "grad_norm": 0.38950878381729126, "learning_rate": 4.106280193236716e-06, "loss": 0.5364, "step": 170 }, { "epoch": 0.123883120019319, "grad_norm": 0.4049890339374542, "learning_rate": 4.130434782608696e-06, "loss": 0.469, "step": 171 }, { "epoch": 0.12460758270949046, "grad_norm": 0.45008766651153564, "learning_rate": 4.154589371980677e-06, "loss": 0.5176, "step": 172 }, { "epoch": 0.12533204539966192, "grad_norm": 0.4008510410785675, "learning_rate": 4.178743961352658e-06, "loss": 0.4959, "step": 173 }, { "epoch": 0.12605650808983337, "grad_norm": 0.43297165632247925, "learning_rate": 4.202898550724638e-06, "loss": 0.5115, "step": 174 }, { "epoch": 0.12678097078000483, "grad_norm": 0.3948287069797516, "learning_rate": 4.227053140096619e-06, "loss": 0.5251, "step": 175 }, { "epoch": 0.1275054334701763, "grad_norm": 0.44706931710243225, "learning_rate": 4.251207729468599e-06, "loss": 0.4745, "step": 176 }, { "epoch": 0.12822989616034774, "grad_norm": 0.4285999834537506, "learning_rate": 4.27536231884058e-06, "loss": 0.5053, "step": 177 }, { "epoch": 0.1289543588505192, "grad_norm": 0.4209946393966675, "learning_rate": 4.299516908212561e-06, "loss": 0.5014, "step": 178 }, { "epoch": 0.12967882154069066, "grad_norm": 0.4710477292537689, "learning_rate": 4.323671497584541e-06, "loss": 0.5267, "step": 179 }, { "epoch": 0.13040328423086212, "grad_norm": 0.4033375084400177, "learning_rate": 4.347826086956522e-06, "loss": 0.527, "step": 180 }, { "epoch": 0.13112774692103357, "grad_norm": 0.42441633343696594, "learning_rate": 4.371980676328503e-06, "loss": 0.5031, "step": 181 }, { "epoch": 0.13185220961120503, "grad_norm": 0.41694405674934387, "learning_rate": 4.396135265700483e-06, "loss": 0.5094, "step": 182 }, { "epoch": 0.13257667230137649, "grad_norm": 0.38231971859931946, "learning_rate": 4.4202898550724645e-06, "loss": 0.5019, "step": 183 }, { "epoch": 0.13330113499154794, "grad_norm": 0.43780502676963806, "learning_rate": 4.444444444444444e-06, "loss": 0.5234, "step": 184 }, { "epoch": 0.1340255976817194, "grad_norm": 0.3888809382915497, "learning_rate": 4.4685990338164255e-06, "loss": 0.4798, "step": 185 }, { "epoch": 0.13475006037189086, "grad_norm": 0.4522818326950073, "learning_rate": 4.492753623188406e-06, "loss": 0.5069, "step": 186 }, { "epoch": 0.1354745230620623, "grad_norm": 0.4520219564437866, "learning_rate": 4.516908212560387e-06, "loss": 0.5081, "step": 187 }, { "epoch": 0.13619898575223377, "grad_norm": 0.4292866587638855, "learning_rate": 4.541062801932368e-06, "loss": 0.5273, "step": 188 }, { "epoch": 0.13692344844240523, "grad_norm": 0.4525240957736969, "learning_rate": 4.565217391304348e-06, "loss": 0.5241, "step": 189 }, { "epoch": 0.13764791113257668, "grad_norm": 0.45626434683799744, "learning_rate": 4.589371980676329e-06, "loss": 0.5248, "step": 190 }, { "epoch": 0.13837237382274814, "grad_norm": 0.4096200466156006, "learning_rate": 4.613526570048309e-06, "loss": 0.497, "step": 191 }, { "epoch": 0.1390968365129196, "grad_norm": 0.3867015540599823, "learning_rate": 4.637681159420291e-06, "loss": 0.5038, "step": 192 }, { "epoch": 0.13982129920309105, "grad_norm": 0.4090193212032318, "learning_rate": 4.661835748792271e-06, "loss": 0.5182, "step": 193 }, { "epoch": 0.1405457618932625, "grad_norm": 0.44483503699302673, "learning_rate": 4.6859903381642516e-06, "loss": 0.5102, "step": 194 }, { "epoch": 0.14127022458343394, "grad_norm": 0.41988465189933777, "learning_rate": 4.710144927536232e-06, "loss": 0.4863, "step": 195 }, { "epoch": 0.1419946872736054, "grad_norm": 0.3977968692779541, "learning_rate": 4.7342995169082125e-06, "loss": 0.5004, "step": 196 }, { "epoch": 0.14271914996377685, "grad_norm": 0.38155117630958557, "learning_rate": 4.758454106280194e-06, "loss": 0.5048, "step": 197 }, { "epoch": 0.1434436126539483, "grad_norm": 0.3712746202945709, "learning_rate": 4.782608695652174e-06, "loss": 0.4482, "step": 198 }, { "epoch": 0.14416807534411977, "grad_norm": 0.4256414473056793, "learning_rate": 4.806763285024155e-06, "loss": 0.5121, "step": 199 }, { "epoch": 0.14489253803429122, "grad_norm": 0.38659465312957764, "learning_rate": 4.830917874396135e-06, "loss": 0.4992, "step": 200 }, { "epoch": 0.14561700072446268, "grad_norm": 0.4110007584095001, "learning_rate": 4.855072463768117e-06, "loss": 0.5102, "step": 201 }, { "epoch": 0.14634146341463414, "grad_norm": 0.40480491518974304, "learning_rate": 4.879227053140097e-06, "loss": 0.4892, "step": 202 }, { "epoch": 0.1470659261048056, "grad_norm": 0.3959154784679413, "learning_rate": 4.903381642512078e-06, "loss": 0.484, "step": 203 }, { "epoch": 0.14779038879497705, "grad_norm": 0.39229801297187805, "learning_rate": 4.927536231884059e-06, "loss": 0.5174, "step": 204 }, { "epoch": 0.1485148514851485, "grad_norm": 0.40920794010162354, "learning_rate": 4.951690821256039e-06, "loss": 0.4761, "step": 205 }, { "epoch": 0.14923931417531996, "grad_norm": 0.40159422159194946, "learning_rate": 4.97584541062802e-06, "loss": 0.492, "step": 206 }, { "epoch": 0.14996377686549142, "grad_norm": 0.4200117886066437, "learning_rate": 5e-06, "loss": 0.5066, "step": 207 }, { "epoch": 0.15068823955566288, "grad_norm": 0.47102367877960205, "learning_rate": 5.024154589371981e-06, "loss": 0.5121, "step": 208 }, { "epoch": 0.15141270224583434, "grad_norm": 0.4274490475654602, "learning_rate": 5.048309178743962e-06, "loss": 0.5016, "step": 209 }, { "epoch": 0.1521371649360058, "grad_norm": 0.4668126404285431, "learning_rate": 5.072463768115943e-06, "loss": 0.5033, "step": 210 }, { "epoch": 0.15286162762617725, "grad_norm": 0.44378378987312317, "learning_rate": 5.096618357487923e-06, "loss": 0.4759, "step": 211 }, { "epoch": 0.1535860903163487, "grad_norm": 0.45746910572052, "learning_rate": 5.1207729468599045e-06, "loss": 0.4905, "step": 212 }, { "epoch": 0.15431055300652016, "grad_norm": 0.4142937958240509, "learning_rate": 5.144927536231884e-06, "loss": 0.4866, "step": 213 }, { "epoch": 0.15503501569669162, "grad_norm": 0.47374993562698364, "learning_rate": 5.169082125603865e-06, "loss": 0.5084, "step": 214 }, { "epoch": 0.15575947838686308, "grad_norm": 0.46000611782073975, "learning_rate": 5.193236714975846e-06, "loss": 0.4894, "step": 215 }, { "epoch": 0.15648394107703453, "grad_norm": 0.485257625579834, "learning_rate": 5.2173913043478265e-06, "loss": 0.4701, "step": 216 }, { "epoch": 0.157208403767206, "grad_norm": 0.47231489419937134, "learning_rate": 5.241545893719807e-06, "loss": 0.4992, "step": 217 }, { "epoch": 0.15793286645737745, "grad_norm": 0.41061025857925415, "learning_rate": 5.265700483091788e-06, "loss": 0.4722, "step": 218 }, { "epoch": 0.1586573291475489, "grad_norm": 0.440318763256073, "learning_rate": 5.289855072463769e-06, "loss": 0.4961, "step": 219 }, { "epoch": 0.15938179183772036, "grad_norm": 0.44463497400283813, "learning_rate": 5.314009661835749e-06, "loss": 0.498, "step": 220 }, { "epoch": 0.16010625452789182, "grad_norm": 0.41229796409606934, "learning_rate": 5.338164251207731e-06, "loss": 0.503, "step": 221 }, { "epoch": 0.16083071721806327, "grad_norm": 0.44692209362983704, "learning_rate": 5.362318840579711e-06, "loss": 0.5147, "step": 222 }, { "epoch": 0.16155517990823473, "grad_norm": 0.43659982085227966, "learning_rate": 5.386473429951691e-06, "loss": 0.5169, "step": 223 }, { "epoch": 0.1622796425984062, "grad_norm": 0.3590286672115326, "learning_rate": 5.410628019323671e-06, "loss": 0.504, "step": 224 }, { "epoch": 0.16300410528857764, "grad_norm": 0.4451122283935547, "learning_rate": 5.4347826086956525e-06, "loss": 0.5021, "step": 225 }, { "epoch": 0.1637285679787491, "grad_norm": 0.4056592583656311, "learning_rate": 5.458937198067633e-06, "loss": 0.4856, "step": 226 }, { "epoch": 0.16445303066892056, "grad_norm": 0.4128817021846771, "learning_rate": 5.4830917874396135e-06, "loss": 0.4907, "step": 227 }, { "epoch": 0.16517749335909201, "grad_norm": 0.47903913259506226, "learning_rate": 5.507246376811595e-06, "loss": 0.4786, "step": 228 }, { "epoch": 0.16590195604926347, "grad_norm": 0.4102441966533661, "learning_rate": 5.531400966183575e-06, "loss": 0.4976, "step": 229 }, { "epoch": 0.16662641873943493, "grad_norm": 0.4630212187767029, "learning_rate": 5.555555555555557e-06, "loss": 0.4946, "step": 230 }, { "epoch": 0.16735088142960639, "grad_norm": 0.43774908781051636, "learning_rate": 5.579710144927537e-06, "loss": 0.4666, "step": 231 }, { "epoch": 0.16807534411977784, "grad_norm": 0.3963059186935425, "learning_rate": 5.603864734299518e-06, "loss": 0.4765, "step": 232 }, { "epoch": 0.1687998068099493, "grad_norm": 0.41839757561683655, "learning_rate": 5.628019323671497e-06, "loss": 0.4996, "step": 233 }, { "epoch": 0.16952426950012076, "grad_norm": 0.4075770080089569, "learning_rate": 5.652173913043479e-06, "loss": 0.4849, "step": 234 }, { "epoch": 0.1702487321902922, "grad_norm": 0.4220485985279083, "learning_rate": 5.676328502415459e-06, "loss": 0.5049, "step": 235 }, { "epoch": 0.17097319488046367, "grad_norm": 0.406438410282135, "learning_rate": 5.70048309178744e-06, "loss": 0.5013, "step": 236 }, { "epoch": 0.17169765757063513, "grad_norm": 0.42556020617485046, "learning_rate": 5.724637681159421e-06, "loss": 0.4775, "step": 237 }, { "epoch": 0.17242212026080656, "grad_norm": 0.3902406096458435, "learning_rate": 5.748792270531401e-06, "loss": 0.4859, "step": 238 }, { "epoch": 0.173146582950978, "grad_norm": 0.39743271470069885, "learning_rate": 5.772946859903382e-06, "loss": 0.4586, "step": 239 }, { "epoch": 0.17387104564114947, "grad_norm": 0.38171324133872986, "learning_rate": 5.797101449275363e-06, "loss": 0.4788, "step": 240 }, { "epoch": 0.17459550833132093, "grad_norm": 0.39145541191101074, "learning_rate": 5.821256038647344e-06, "loss": 0.4886, "step": 241 }, { "epoch": 0.17531997102149238, "grad_norm": 0.3821873068809509, "learning_rate": 5.845410628019324e-06, "loss": 0.4822, "step": 242 }, { "epoch": 0.17604443371166384, "grad_norm": 0.4348975419998169, "learning_rate": 5.8695652173913055e-06, "loss": 0.4844, "step": 243 }, { "epoch": 0.1767688964018353, "grad_norm": 0.4400613307952881, "learning_rate": 5.893719806763285e-06, "loss": 0.4383, "step": 244 }, { "epoch": 0.17749335909200675, "grad_norm": 0.4425283670425415, "learning_rate": 5.917874396135266e-06, "loss": 0.4907, "step": 245 }, { "epoch": 0.1782178217821782, "grad_norm": 0.41467225551605225, "learning_rate": 5.942028985507247e-06, "loss": 0.4684, "step": 246 }, { "epoch": 0.17894228447234967, "grad_norm": 0.4427250325679779, "learning_rate": 5.9661835748792275e-06, "loss": 0.4919, "step": 247 }, { "epoch": 0.17966674716252112, "grad_norm": 0.4602275788784027, "learning_rate": 5.990338164251208e-06, "loss": 0.5259, "step": 248 }, { "epoch": 0.18039120985269258, "grad_norm": 0.5041419863700867, "learning_rate": 6.014492753623189e-06, "loss": 0.4887, "step": 249 }, { "epoch": 0.18111567254286404, "grad_norm": 0.41622963547706604, "learning_rate": 6.03864734299517e-06, "loss": 0.4991, "step": 250 }, { "epoch": 0.1818401352330355, "grad_norm": 0.40862202644348145, "learning_rate": 6.06280193236715e-06, "loss": 0.4564, "step": 251 }, { "epoch": 0.18256459792320695, "grad_norm": 0.4989408254623413, "learning_rate": 6.086956521739132e-06, "loss": 0.5359, "step": 252 }, { "epoch": 0.1832890606133784, "grad_norm": 0.4402110278606415, "learning_rate": 6.111111111111112e-06, "loss": 0.5072, "step": 253 }, { "epoch": 0.18401352330354986, "grad_norm": 0.4638153314590454, "learning_rate": 6.135265700483092e-06, "loss": 0.5042, "step": 254 }, { "epoch": 0.18473798599372132, "grad_norm": 0.4432891011238098, "learning_rate": 6.159420289855072e-06, "loss": 0.5066, "step": 255 }, { "epoch": 0.18546244868389278, "grad_norm": 0.4346922039985657, "learning_rate": 6.1835748792270535e-06, "loss": 0.4921, "step": 256 }, { "epoch": 0.18618691137406423, "grad_norm": 0.4072340130805969, "learning_rate": 6.207729468599034e-06, "loss": 0.513, "step": 257 }, { "epoch": 0.1869113740642357, "grad_norm": 0.4198291599750519, "learning_rate": 6.2318840579710145e-06, "loss": 0.4829, "step": 258 }, { "epoch": 0.18763583675440715, "grad_norm": 0.40139076113700867, "learning_rate": 6.256038647342996e-06, "loss": 0.5022, "step": 259 }, { "epoch": 0.1883602994445786, "grad_norm": 0.40026766061782837, "learning_rate": 6.280193236714976e-06, "loss": 0.4924, "step": 260 }, { "epoch": 0.18908476213475006, "grad_norm": 0.5138009190559387, "learning_rate": 6.304347826086958e-06, "loss": 0.5108, "step": 261 }, { "epoch": 0.18980922482492152, "grad_norm": 0.41296377778053284, "learning_rate": 6.328502415458938e-06, "loss": 0.5178, "step": 262 }, { "epoch": 0.19053368751509298, "grad_norm": 0.4560945928096771, "learning_rate": 6.352657004830919e-06, "loss": 0.4856, "step": 263 }, { "epoch": 0.19125815020526443, "grad_norm": 0.3961561620235443, "learning_rate": 6.376811594202898e-06, "loss": 0.4636, "step": 264 }, { "epoch": 0.1919826128954359, "grad_norm": 0.45498764514923096, "learning_rate": 6.40096618357488e-06, "loss": 0.4916, "step": 265 }, { "epoch": 0.19270707558560735, "grad_norm": 0.47145557403564453, "learning_rate": 6.42512077294686e-06, "loss": 0.4636, "step": 266 }, { "epoch": 0.1934315382757788, "grad_norm": 0.46885019540786743, "learning_rate": 6.449275362318841e-06, "loss": 0.5036, "step": 267 }, { "epoch": 0.19415600096595026, "grad_norm": 0.4135659635066986, "learning_rate": 6.473429951690822e-06, "loss": 0.4594, "step": 268 }, { "epoch": 0.19488046365612172, "grad_norm": 0.4431118667125702, "learning_rate": 6.497584541062802e-06, "loss": 0.4481, "step": 269 }, { "epoch": 0.19560492634629317, "grad_norm": 0.431918203830719, "learning_rate": 6.521739130434783e-06, "loss": 0.4927, "step": 270 }, { "epoch": 0.19632938903646463, "grad_norm": 0.49355778098106384, "learning_rate": 6.545893719806764e-06, "loss": 0.4668, "step": 271 }, { "epoch": 0.1970538517266361, "grad_norm": 0.4541790783405304, "learning_rate": 6.570048309178745e-06, "loss": 0.4919, "step": 272 }, { "epoch": 0.19777831441680754, "grad_norm": 0.40902096033096313, "learning_rate": 6.594202898550725e-06, "loss": 0.4725, "step": 273 }, { "epoch": 0.198502777106979, "grad_norm": 0.5413607954978943, "learning_rate": 6.6183574879227065e-06, "loss": 0.5038, "step": 274 }, { "epoch": 0.19922723979715046, "grad_norm": 0.44243916869163513, "learning_rate": 6.642512077294686e-06, "loss": 0.5015, "step": 275 }, { "epoch": 0.19995170248732191, "grad_norm": 0.44736677408218384, "learning_rate": 6.666666666666667e-06, "loss": 0.4534, "step": 276 }, { "epoch": 0.20067616517749337, "grad_norm": 0.6430760622024536, "learning_rate": 6.690821256038648e-06, "loss": 0.4972, "step": 277 }, { "epoch": 0.20140062786766483, "grad_norm": 0.46760493516921997, "learning_rate": 6.7149758454106285e-06, "loss": 0.5018, "step": 278 }, { "epoch": 0.20212509055783628, "grad_norm": 0.4279399812221527, "learning_rate": 6.739130434782609e-06, "loss": 0.5043, "step": 279 }, { "epoch": 0.20284955324800774, "grad_norm": 0.3839455842971802, "learning_rate": 6.76328502415459e-06, "loss": 0.4734, "step": 280 }, { "epoch": 0.20357401593817917, "grad_norm": 0.43676233291625977, "learning_rate": 6.787439613526571e-06, "loss": 0.4969, "step": 281 }, { "epoch": 0.20429847862835063, "grad_norm": 0.5046594142913818, "learning_rate": 6.811594202898551e-06, "loss": 0.4882, "step": 282 }, { "epoch": 0.20502294131852208, "grad_norm": 0.4669559895992279, "learning_rate": 6.835748792270533e-06, "loss": 0.5123, "step": 283 }, { "epoch": 0.20574740400869354, "grad_norm": 0.5318796634674072, "learning_rate": 6.859903381642513e-06, "loss": 0.5098, "step": 284 }, { "epoch": 0.206471866698865, "grad_norm": 0.42241495847702026, "learning_rate": 6.884057971014493e-06, "loss": 0.5049, "step": 285 }, { "epoch": 0.20719632938903645, "grad_norm": 0.5354446172714233, "learning_rate": 6.908212560386473e-06, "loss": 0.5202, "step": 286 }, { "epoch": 0.2079207920792079, "grad_norm": 0.4448866546154022, "learning_rate": 6.9323671497584545e-06, "loss": 0.4523, "step": 287 }, { "epoch": 0.20864525476937937, "grad_norm": 0.5115771889686584, "learning_rate": 6.956521739130435e-06, "loss": 0.4798, "step": 288 }, { "epoch": 0.20936971745955082, "grad_norm": 0.5244592428207397, "learning_rate": 6.980676328502416e-06, "loss": 0.5094, "step": 289 }, { "epoch": 0.21009418014972228, "grad_norm": 0.4308057725429535, "learning_rate": 7.004830917874397e-06, "loss": 0.4846, "step": 290 }, { "epoch": 0.21081864283989374, "grad_norm": 0.5552890300750732, "learning_rate": 7.028985507246377e-06, "loss": 0.4917, "step": 291 }, { "epoch": 0.2115431055300652, "grad_norm": 0.43694615364074707, "learning_rate": 7.053140096618359e-06, "loss": 0.4752, "step": 292 }, { "epoch": 0.21226756822023665, "grad_norm": 0.5093947649002075, "learning_rate": 7.077294685990339e-06, "loss": 0.4963, "step": 293 }, { "epoch": 0.2129920309104081, "grad_norm": 0.459568589925766, "learning_rate": 7.10144927536232e-06, "loss": 0.4703, "step": 294 }, { "epoch": 0.21371649360057957, "grad_norm": 0.5896654725074768, "learning_rate": 7.125603864734299e-06, "loss": 0.5133, "step": 295 }, { "epoch": 0.21444095629075102, "grad_norm": 0.46485352516174316, "learning_rate": 7.149758454106281e-06, "loss": 0.469, "step": 296 }, { "epoch": 0.21516541898092248, "grad_norm": 0.5410766005516052, "learning_rate": 7.173913043478261e-06, "loss": 0.4762, "step": 297 }, { "epoch": 0.21588988167109394, "grad_norm": 0.5405240654945374, "learning_rate": 7.1980676328502416e-06, "loss": 0.4353, "step": 298 }, { "epoch": 0.2166143443612654, "grad_norm": 0.441251665353775, "learning_rate": 7.222222222222223e-06, "loss": 0.4651, "step": 299 }, { "epoch": 0.21733880705143685, "grad_norm": 0.45267894864082336, "learning_rate": 7.246376811594203e-06, "loss": 0.4748, "step": 300 }, { "epoch": 0.2180632697416083, "grad_norm": 0.5049658417701721, "learning_rate": 7.270531400966184e-06, "loss": 0.4926, "step": 301 }, { "epoch": 0.21878773243177976, "grad_norm": 0.455903559923172, "learning_rate": 7.294685990338165e-06, "loss": 0.5011, "step": 302 }, { "epoch": 0.21951219512195122, "grad_norm": 0.46524566411972046, "learning_rate": 7.318840579710146e-06, "loss": 0.4657, "step": 303 }, { "epoch": 0.22023665781212268, "grad_norm": 0.5215276479721069, "learning_rate": 7.342995169082127e-06, "loss": 0.483, "step": 304 }, { "epoch": 0.22096112050229413, "grad_norm": 0.4886139929294586, "learning_rate": 7.3671497584541075e-06, "loss": 0.5004, "step": 305 }, { "epoch": 0.2216855831924656, "grad_norm": 0.4763126075267792, "learning_rate": 7.391304347826087e-06, "loss": 0.4725, "step": 306 }, { "epoch": 0.22241004588263705, "grad_norm": 0.4535231590270996, "learning_rate": 7.415458937198068e-06, "loss": 0.4545, "step": 307 }, { "epoch": 0.2231345085728085, "grad_norm": 0.4779628813266754, "learning_rate": 7.439613526570049e-06, "loss": 0.476, "step": 308 }, { "epoch": 0.22385897126297996, "grad_norm": 0.46387356519699097, "learning_rate": 7.4637681159420295e-06, "loss": 0.498, "step": 309 }, { "epoch": 0.22458343395315142, "grad_norm": 0.5264805555343628, "learning_rate": 7.48792270531401e-06, "loss": 0.5041, "step": 310 }, { "epoch": 0.22530789664332287, "grad_norm": 0.3861474096775055, "learning_rate": 7.512077294685991e-06, "loss": 0.4819, "step": 311 }, { "epoch": 0.22603235933349433, "grad_norm": 0.5053765177726746, "learning_rate": 7.536231884057972e-06, "loss": 0.4853, "step": 312 }, { "epoch": 0.2267568220236658, "grad_norm": 0.47047531604766846, "learning_rate": 7.560386473429952e-06, "loss": 0.4763, "step": 313 }, { "epoch": 0.22748128471383725, "grad_norm": 0.42195528745651245, "learning_rate": 7.584541062801934e-06, "loss": 0.458, "step": 314 }, { "epoch": 0.2282057474040087, "grad_norm": 0.44757458567619324, "learning_rate": 7.608695652173914e-06, "loss": 0.4842, "step": 315 }, { "epoch": 0.22893021009418016, "grad_norm": 0.41291913390159607, "learning_rate": 7.632850241545895e-06, "loss": 0.4615, "step": 316 }, { "epoch": 0.22965467278435162, "grad_norm": 0.4658533036708832, "learning_rate": 7.657004830917875e-06, "loss": 0.4654, "step": 317 }, { "epoch": 0.23037913547452307, "grad_norm": 0.4508609473705292, "learning_rate": 7.681159420289856e-06, "loss": 0.4761, "step": 318 }, { "epoch": 0.23110359816469453, "grad_norm": 0.49906444549560547, "learning_rate": 7.705314009661836e-06, "loss": 0.4688, "step": 319 }, { "epoch": 0.23182806085486599, "grad_norm": 0.38128015398979187, "learning_rate": 7.729468599033817e-06, "loss": 0.4166, "step": 320 }, { "epoch": 0.23255252354503744, "grad_norm": 0.5338290929794312, "learning_rate": 7.753623188405797e-06, "loss": 0.4598, "step": 321 }, { "epoch": 0.2332769862352089, "grad_norm": 0.4767688810825348, "learning_rate": 7.77777777777778e-06, "loss": 0.4722, "step": 322 }, { "epoch": 0.23400144892538036, "grad_norm": 0.5483136177062988, "learning_rate": 7.80193236714976e-06, "loss": 0.4886, "step": 323 }, { "epoch": 0.23472591161555179, "grad_norm": 0.44236475229263306, "learning_rate": 7.82608695652174e-06, "loss": 0.485, "step": 324 }, { "epoch": 0.23545037430572324, "grad_norm": 0.4928440749645233, "learning_rate": 7.85024154589372e-06, "loss": 0.498, "step": 325 }, { "epoch": 0.2361748369958947, "grad_norm": 0.4956628084182739, "learning_rate": 7.874396135265701e-06, "loss": 0.4545, "step": 326 }, { "epoch": 0.23689929968606616, "grad_norm": 0.47739672660827637, "learning_rate": 7.898550724637682e-06, "loss": 0.4652, "step": 327 }, { "epoch": 0.2376237623762376, "grad_norm": 0.43885424733161926, "learning_rate": 7.922705314009662e-06, "loss": 0.4753, "step": 328 }, { "epoch": 0.23834822506640907, "grad_norm": 0.49466121196746826, "learning_rate": 7.946859903381643e-06, "loss": 0.4235, "step": 329 }, { "epoch": 0.23907268775658053, "grad_norm": 0.4735454320907593, "learning_rate": 7.971014492753623e-06, "loss": 0.4382, "step": 330 }, { "epoch": 0.23979715044675198, "grad_norm": 0.438824325799942, "learning_rate": 7.995169082125605e-06, "loss": 0.48, "step": 331 }, { "epoch": 0.24052161313692344, "grad_norm": 0.45137885212898254, "learning_rate": 8.019323671497586e-06, "loss": 0.4772, "step": 332 }, { "epoch": 0.2412460758270949, "grad_norm": 0.4755925238132477, "learning_rate": 8.043478260869566e-06, "loss": 0.4948, "step": 333 }, { "epoch": 0.24197053851726635, "grad_norm": 0.4265425205230713, "learning_rate": 8.067632850241547e-06, "loss": 0.468, "step": 334 }, { "epoch": 0.2426950012074378, "grad_norm": 0.5346320271492004, "learning_rate": 8.091787439613527e-06, "loss": 0.4904, "step": 335 }, { "epoch": 0.24341946389760927, "grad_norm": 0.4254898130893707, "learning_rate": 8.115942028985508e-06, "loss": 0.4528, "step": 336 }, { "epoch": 0.24414392658778072, "grad_norm": 0.5375081300735474, "learning_rate": 8.140096618357488e-06, "loss": 0.4765, "step": 337 }, { "epoch": 0.24486838927795218, "grad_norm": 0.3890259861946106, "learning_rate": 8.164251207729469e-06, "loss": 0.4742, "step": 338 }, { "epoch": 0.24559285196812364, "grad_norm": 0.5136224627494812, "learning_rate": 8.188405797101449e-06, "loss": 0.478, "step": 339 }, { "epoch": 0.2463173146582951, "grad_norm": 0.425131618976593, "learning_rate": 8.212560386473431e-06, "loss": 0.4407, "step": 340 }, { "epoch": 0.24704177734846655, "grad_norm": 0.43751218914985657, "learning_rate": 8.236714975845412e-06, "loss": 0.4887, "step": 341 }, { "epoch": 0.247766240038638, "grad_norm": 0.4704265296459198, "learning_rate": 8.260869565217392e-06, "loss": 0.4481, "step": 342 }, { "epoch": 0.24849070272880946, "grad_norm": 0.47131267189979553, "learning_rate": 8.285024154589373e-06, "loss": 0.4572, "step": 343 }, { "epoch": 0.24921516541898092, "grad_norm": 0.4858401119709015, "learning_rate": 8.309178743961353e-06, "loss": 0.5025, "step": 344 }, { "epoch": 0.24993962810915238, "grad_norm": 0.5079872012138367, "learning_rate": 8.333333333333334e-06, "loss": 0.5038, "step": 345 }, { "epoch": 0.25066409079932384, "grad_norm": 0.4360049068927765, "learning_rate": 8.357487922705316e-06, "loss": 0.4735, "step": 346 }, { "epoch": 0.2513885534894953, "grad_norm": 0.4847256541252136, "learning_rate": 8.381642512077295e-06, "loss": 0.5145, "step": 347 }, { "epoch": 0.25211301617966675, "grad_norm": 0.5322009325027466, "learning_rate": 8.405797101449275e-06, "loss": 0.4702, "step": 348 }, { "epoch": 0.2528374788698382, "grad_norm": 0.44590967893600464, "learning_rate": 8.429951690821256e-06, "loss": 0.461, "step": 349 }, { "epoch": 0.25356194156000966, "grad_norm": 0.5072506666183472, "learning_rate": 8.454106280193238e-06, "loss": 0.4653, "step": 350 }, { "epoch": 0.2542864042501811, "grad_norm": 0.5689263343811035, "learning_rate": 8.478260869565218e-06, "loss": 0.4693, "step": 351 }, { "epoch": 0.2550108669403526, "grad_norm": 0.489543616771698, "learning_rate": 8.502415458937199e-06, "loss": 0.4809, "step": 352 }, { "epoch": 0.25573532963052403, "grad_norm": 0.5199886560440063, "learning_rate": 8.52657004830918e-06, "loss": 0.4403, "step": 353 }, { "epoch": 0.2564597923206955, "grad_norm": 0.44071903824806213, "learning_rate": 8.55072463768116e-06, "loss": 0.4648, "step": 354 }, { "epoch": 0.25718425501086695, "grad_norm": 0.46398693323135376, "learning_rate": 8.57487922705314e-06, "loss": 0.4868, "step": 355 }, { "epoch": 0.2579087177010384, "grad_norm": 0.42420724034309387, "learning_rate": 8.599033816425122e-06, "loss": 0.442, "step": 356 }, { "epoch": 0.25863318039120986, "grad_norm": 0.42813143134117126, "learning_rate": 8.623188405797103e-06, "loss": 0.4358, "step": 357 }, { "epoch": 0.2593576430813813, "grad_norm": 0.41775259375572205, "learning_rate": 8.647342995169082e-06, "loss": 0.4774, "step": 358 }, { "epoch": 0.2600821057715528, "grad_norm": 0.4707051217556, "learning_rate": 8.671497584541064e-06, "loss": 0.4725, "step": 359 }, { "epoch": 0.26080656846172423, "grad_norm": 0.43866753578186035, "learning_rate": 8.695652173913044e-06, "loss": 0.4665, "step": 360 }, { "epoch": 0.2615310311518957, "grad_norm": 0.48229527473449707, "learning_rate": 8.719806763285025e-06, "loss": 0.4718, "step": 361 }, { "epoch": 0.26225549384206714, "grad_norm": 0.4431239664554596, "learning_rate": 8.743961352657005e-06, "loss": 0.4595, "step": 362 }, { "epoch": 0.2629799565322386, "grad_norm": 0.43484610319137573, "learning_rate": 8.768115942028986e-06, "loss": 0.459, "step": 363 }, { "epoch": 0.26370441922241006, "grad_norm": 0.4636439085006714, "learning_rate": 8.792270531400966e-06, "loss": 0.4895, "step": 364 }, { "epoch": 0.2644288819125815, "grad_norm": 0.4152275323867798, "learning_rate": 8.816425120772949e-06, "loss": 0.4662, "step": 365 }, { "epoch": 0.26515334460275297, "grad_norm": 0.4849233627319336, "learning_rate": 8.840579710144929e-06, "loss": 0.4774, "step": 366 }, { "epoch": 0.26587780729292443, "grad_norm": 0.4804450273513794, "learning_rate": 8.86473429951691e-06, "loss": 0.4676, "step": 367 }, { "epoch": 0.2666022699830959, "grad_norm": 0.5160784125328064, "learning_rate": 8.888888888888888e-06, "loss": 0.4739, "step": 368 }, { "epoch": 0.26732673267326734, "grad_norm": 0.48937562108039856, "learning_rate": 8.91304347826087e-06, "loss": 0.4611, "step": 369 }, { "epoch": 0.2680511953634388, "grad_norm": 0.5368051528930664, "learning_rate": 8.937198067632851e-06, "loss": 0.4635, "step": 370 }, { "epoch": 0.26877565805361026, "grad_norm": 0.5454927086830139, "learning_rate": 8.961352657004831e-06, "loss": 0.49, "step": 371 }, { "epoch": 0.2695001207437817, "grad_norm": 0.40306130051612854, "learning_rate": 8.985507246376812e-06, "loss": 0.4481, "step": 372 }, { "epoch": 0.27022458343395317, "grad_norm": 0.5315220952033997, "learning_rate": 9.009661835748792e-06, "loss": 0.4707, "step": 373 }, { "epoch": 0.2709490461241246, "grad_norm": 0.498853862285614, "learning_rate": 9.033816425120775e-06, "loss": 0.4834, "step": 374 }, { "epoch": 0.2716735088142961, "grad_norm": 0.44266822934150696, "learning_rate": 9.057971014492755e-06, "loss": 0.456, "step": 375 }, { "epoch": 0.27239797150446754, "grad_norm": 0.46477702260017395, "learning_rate": 9.082125603864736e-06, "loss": 0.4607, "step": 376 }, { "epoch": 0.273122434194639, "grad_norm": 0.4754306972026825, "learning_rate": 9.106280193236716e-06, "loss": 0.4925, "step": 377 }, { "epoch": 0.27384689688481045, "grad_norm": 0.4665883183479309, "learning_rate": 9.130434782608697e-06, "loss": 0.481, "step": 378 }, { "epoch": 0.2745713595749819, "grad_norm": 0.4519490599632263, "learning_rate": 9.154589371980677e-06, "loss": 0.4683, "step": 379 }, { "epoch": 0.27529582226515337, "grad_norm": 0.48609182238578796, "learning_rate": 9.178743961352658e-06, "loss": 0.4579, "step": 380 }, { "epoch": 0.2760202849553248, "grad_norm": 0.4338051974773407, "learning_rate": 9.202898550724638e-06, "loss": 0.455, "step": 381 }, { "epoch": 0.2767447476454963, "grad_norm": 0.4972583055496216, "learning_rate": 9.227053140096618e-06, "loss": 0.4477, "step": 382 }, { "epoch": 0.27746921033566774, "grad_norm": 0.4159846603870392, "learning_rate": 9.251207729468599e-06, "loss": 0.4861, "step": 383 }, { "epoch": 0.2781936730258392, "grad_norm": 0.5551496148109436, "learning_rate": 9.275362318840581e-06, "loss": 0.4794, "step": 384 }, { "epoch": 0.27891813571601065, "grad_norm": 0.4342496693134308, "learning_rate": 9.299516908212562e-06, "loss": 0.4632, "step": 385 }, { "epoch": 0.2796425984061821, "grad_norm": 0.4937846064567566, "learning_rate": 9.323671497584542e-06, "loss": 0.4628, "step": 386 }, { "epoch": 0.28036706109635356, "grad_norm": 0.419484406709671, "learning_rate": 9.347826086956523e-06, "loss": 0.503, "step": 387 }, { "epoch": 0.281091523786525, "grad_norm": 0.4651278853416443, "learning_rate": 9.371980676328503e-06, "loss": 0.4495, "step": 388 }, { "epoch": 0.2818159864766964, "grad_norm": 0.4884265661239624, "learning_rate": 9.396135265700484e-06, "loss": 0.4641, "step": 389 }, { "epoch": 0.2825404491668679, "grad_norm": 0.48073625564575195, "learning_rate": 9.420289855072464e-06, "loss": 0.5079, "step": 390 }, { "epoch": 0.28326491185703934, "grad_norm": 0.5242260098457336, "learning_rate": 9.444444444444445e-06, "loss": 0.4644, "step": 391 }, { "epoch": 0.2839893745472108, "grad_norm": 0.6078798770904541, "learning_rate": 9.468599033816425e-06, "loss": 0.4824, "step": 392 }, { "epoch": 0.28471383723738225, "grad_norm": 0.4673522114753723, "learning_rate": 9.492753623188407e-06, "loss": 0.4872, "step": 393 }, { "epoch": 0.2854382999275537, "grad_norm": 0.6530593037605286, "learning_rate": 9.516908212560388e-06, "loss": 0.4894, "step": 394 }, { "epoch": 0.28616276261772516, "grad_norm": 0.5249728560447693, "learning_rate": 9.541062801932368e-06, "loss": 0.481, "step": 395 }, { "epoch": 0.2868872253078966, "grad_norm": 0.5032094120979309, "learning_rate": 9.565217391304349e-06, "loss": 0.4576, "step": 396 }, { "epoch": 0.2876116879980681, "grad_norm": 0.4852934777736664, "learning_rate": 9.58937198067633e-06, "loss": 0.4556, "step": 397 }, { "epoch": 0.28833615068823953, "grad_norm": 0.5186448693275452, "learning_rate": 9.61352657004831e-06, "loss": 0.4681, "step": 398 }, { "epoch": 0.289060613378411, "grad_norm": 0.4420957863330841, "learning_rate": 9.63768115942029e-06, "loss": 0.4652, "step": 399 }, { "epoch": 0.28978507606858245, "grad_norm": 0.6125519871711731, "learning_rate": 9.66183574879227e-06, "loss": 0.5153, "step": 400 }, { "epoch": 0.2905095387587539, "grad_norm": 0.4779365658760071, "learning_rate": 9.685990338164251e-06, "loss": 0.4722, "step": 401 }, { "epoch": 0.29123400144892536, "grad_norm": 0.5430857539176941, "learning_rate": 9.710144927536233e-06, "loss": 0.5023, "step": 402 }, { "epoch": 0.2919584641390968, "grad_norm": 0.4831739068031311, "learning_rate": 9.734299516908214e-06, "loss": 0.4679, "step": 403 }, { "epoch": 0.2926829268292683, "grad_norm": 0.5713402628898621, "learning_rate": 9.758454106280194e-06, "loss": 0.4906, "step": 404 }, { "epoch": 0.29340738951943973, "grad_norm": 0.5172116160392761, "learning_rate": 9.782608695652175e-06, "loss": 0.4488, "step": 405 }, { "epoch": 0.2941318522096112, "grad_norm": 0.5380436182022095, "learning_rate": 9.806763285024155e-06, "loss": 0.4968, "step": 406 }, { "epoch": 0.29485631489978265, "grad_norm": 0.4612531363964081, "learning_rate": 9.830917874396136e-06, "loss": 0.4642, "step": 407 }, { "epoch": 0.2955807775899541, "grad_norm": 0.4972025454044342, "learning_rate": 9.855072463768118e-06, "loss": 0.4626, "step": 408 }, { "epoch": 0.29630524028012556, "grad_norm": 0.4081306755542755, "learning_rate": 9.879227053140097e-06, "loss": 0.4479, "step": 409 }, { "epoch": 0.297029702970297, "grad_norm": 0.44093507528305054, "learning_rate": 9.903381642512077e-06, "loss": 0.4777, "step": 410 }, { "epoch": 0.2977541656604685, "grad_norm": 0.4226702153682709, "learning_rate": 9.927536231884058e-06, "loss": 0.4596, "step": 411 }, { "epoch": 0.29847862835063993, "grad_norm": 0.4151028096675873, "learning_rate": 9.95169082125604e-06, "loss": 0.466, "step": 412 }, { "epoch": 0.2992030910408114, "grad_norm": 0.45368343591690063, "learning_rate": 9.97584541062802e-06, "loss": 0.4498, "step": 413 }, { "epoch": 0.29992755373098284, "grad_norm": 0.5476334095001221, "learning_rate": 1e-05, "loss": 0.4773, "step": 414 }, { "epoch": 0.3006520164211543, "grad_norm": 0.4321163594722748, "learning_rate": 9.999998222727476e-06, "loss": 0.4646, "step": 415 }, { "epoch": 0.30137647911132576, "grad_norm": 0.49836766719818115, "learning_rate": 9.999992890911164e-06, "loss": 0.475, "step": 416 }, { "epoch": 0.3021009418014972, "grad_norm": 0.4489550292491913, "learning_rate": 9.999984004554854e-06, "loss": 0.4817, "step": 417 }, { "epoch": 0.30282540449166867, "grad_norm": 0.47034379839897156, "learning_rate": 9.999971563664866e-06, "loss": 0.4701, "step": 418 }, { "epoch": 0.3035498671818401, "grad_norm": 0.5300540924072266, "learning_rate": 9.999955568250043e-06, "loss": 0.4884, "step": 419 }, { "epoch": 0.3042743298720116, "grad_norm": 0.4331187605857849, "learning_rate": 9.999936018321757e-06, "loss": 0.4552, "step": 420 }, { "epoch": 0.30499879256218304, "grad_norm": 0.4506802260875702, "learning_rate": 9.999912913893905e-06, "loss": 0.4586, "step": 421 }, { "epoch": 0.3057232552523545, "grad_norm": 0.4782383441925049, "learning_rate": 9.999886254982912e-06, "loss": 0.5024, "step": 422 }, { "epoch": 0.30644771794252595, "grad_norm": 0.45143023133277893, "learning_rate": 9.999856041607732e-06, "loss": 0.479, "step": 423 }, { "epoch": 0.3071721806326974, "grad_norm": 0.4515882730484009, "learning_rate": 9.999822273789841e-06, "loss": 0.461, "step": 424 }, { "epoch": 0.30789664332286887, "grad_norm": 0.4706707298755646, "learning_rate": 9.999784951553249e-06, "loss": 0.4744, "step": 425 }, { "epoch": 0.3086211060130403, "grad_norm": 0.5119355320930481, "learning_rate": 9.999744074924486e-06, "loss": 0.4696, "step": 426 }, { "epoch": 0.3093455687032118, "grad_norm": 0.46742451190948486, "learning_rate": 9.99969964393261e-06, "loss": 0.4644, "step": 427 }, { "epoch": 0.31007003139338324, "grad_norm": 0.48932087421417236, "learning_rate": 9.99965165860921e-06, "loss": 0.4786, "step": 428 }, { "epoch": 0.3107944940835547, "grad_norm": 0.4061794877052307, "learning_rate": 9.9996001189884e-06, "loss": 0.4601, "step": 429 }, { "epoch": 0.31151895677372615, "grad_norm": 0.49403083324432373, "learning_rate": 9.999545025106818e-06, "loss": 0.476, "step": 430 }, { "epoch": 0.3122434194638976, "grad_norm": 0.5048626661300659, "learning_rate": 9.999486377003631e-06, "loss": 0.467, "step": 431 }, { "epoch": 0.31296788215406907, "grad_norm": 0.47468698024749756, "learning_rate": 9.99942417472053e-06, "loss": 0.466, "step": 432 }, { "epoch": 0.3136923448442405, "grad_norm": 0.4752046763896942, "learning_rate": 9.999358418301742e-06, "loss": 0.4954, "step": 433 }, { "epoch": 0.314416807534412, "grad_norm": 0.5110712051391602, "learning_rate": 9.999289107794008e-06, "loss": 0.4725, "step": 434 }, { "epoch": 0.31514127022458344, "grad_norm": 0.5258490443229675, "learning_rate": 9.999216243246603e-06, "loss": 0.4766, "step": 435 }, { "epoch": 0.3158657329147549, "grad_norm": 0.5120612382888794, "learning_rate": 9.999139824711327e-06, "loss": 0.4884, "step": 436 }, { "epoch": 0.31659019560492635, "grad_norm": 0.4687125086784363, "learning_rate": 9.999059852242508e-06, "loss": 0.4431, "step": 437 }, { "epoch": 0.3173146582950978, "grad_norm": 0.45731329917907715, "learning_rate": 9.998976325896997e-06, "loss": 0.4607, "step": 438 }, { "epoch": 0.31803912098526926, "grad_norm": 0.49087706208229065, "learning_rate": 9.998889245734175e-06, "loss": 0.4449, "step": 439 }, { "epoch": 0.3187635836754407, "grad_norm": 0.46834835410118103, "learning_rate": 9.998798611815948e-06, "loss": 0.4611, "step": 440 }, { "epoch": 0.3194880463656122, "grad_norm": 0.4398033022880554, "learning_rate": 9.998704424206747e-06, "loss": 0.4435, "step": 441 }, { "epoch": 0.32021250905578363, "grad_norm": 0.5014259219169617, "learning_rate": 9.998606682973534e-06, "loss": 0.4429, "step": 442 }, { "epoch": 0.3209369717459551, "grad_norm": 0.4380209147930145, "learning_rate": 9.998505388185789e-06, "loss": 0.4424, "step": 443 }, { "epoch": 0.32166143443612655, "grad_norm": 0.44836360216140747, "learning_rate": 9.998400539915528e-06, "loss": 0.495, "step": 444 }, { "epoch": 0.322385897126298, "grad_norm": 0.4748617112636566, "learning_rate": 9.998292138237287e-06, "loss": 0.4524, "step": 445 }, { "epoch": 0.32311035981646946, "grad_norm": 0.6980361342430115, "learning_rate": 9.99818018322813e-06, "loss": 0.4924, "step": 446 }, { "epoch": 0.3238348225066409, "grad_norm": 0.47119221091270447, "learning_rate": 9.998064674967647e-06, "loss": 0.4469, "step": 447 }, { "epoch": 0.3245592851968124, "grad_norm": 0.49431899189949036, "learning_rate": 9.997945613537952e-06, "loss": 0.4649, "step": 448 }, { "epoch": 0.32528374788698383, "grad_norm": 0.5146427750587463, "learning_rate": 9.99782299902369e-06, "loss": 0.4668, "step": 449 }, { "epoch": 0.3260082105771553, "grad_norm": 0.4476524889469147, "learning_rate": 9.997696831512027e-06, "loss": 0.4639, "step": 450 }, { "epoch": 0.32673267326732675, "grad_norm": 0.5598997473716736, "learning_rate": 9.997567111092656e-06, "loss": 0.4667, "step": 451 }, { "epoch": 0.3274571359574982, "grad_norm": 0.5124192237854004, "learning_rate": 9.997433837857797e-06, "loss": 0.4819, "step": 452 }, { "epoch": 0.32818159864766966, "grad_norm": 0.40552741289138794, "learning_rate": 9.997297011902195e-06, "loss": 0.4675, "step": 453 }, { "epoch": 0.3289060613378411, "grad_norm": 0.4075053930282593, "learning_rate": 9.997156633323122e-06, "loss": 0.4598, "step": 454 }, { "epoch": 0.3296305240280126, "grad_norm": 0.4553438723087311, "learning_rate": 9.997012702220372e-06, "loss": 0.4386, "step": 455 }, { "epoch": 0.33035498671818403, "grad_norm": 0.4611193835735321, "learning_rate": 9.99686521869627e-06, "loss": 0.4431, "step": 456 }, { "epoch": 0.3310794494083555, "grad_norm": 0.4557623863220215, "learning_rate": 9.996714182855662e-06, "loss": 0.4873, "step": 457 }, { "epoch": 0.33180391209852694, "grad_norm": 0.513141930103302, "learning_rate": 9.996559594805919e-06, "loss": 0.4498, "step": 458 }, { "epoch": 0.3325283747886984, "grad_norm": 0.45477429032325745, "learning_rate": 9.996401454656941e-06, "loss": 0.4497, "step": 459 }, { "epoch": 0.33325283747886986, "grad_norm": 0.44748541712760925, "learning_rate": 9.996239762521152e-06, "loss": 0.4981, "step": 460 }, { "epoch": 0.3339773001690413, "grad_norm": 0.5522701740264893, "learning_rate": 9.996074518513497e-06, "loss": 0.458, "step": 461 }, { "epoch": 0.33470176285921277, "grad_norm": 0.5148823857307434, "learning_rate": 9.995905722751453e-06, "loss": 0.46, "step": 462 }, { "epoch": 0.3354262255493842, "grad_norm": 0.49311473965644836, "learning_rate": 9.995733375355017e-06, "loss": 0.4459, "step": 463 }, { "epoch": 0.3361506882395557, "grad_norm": 0.4949207901954651, "learning_rate": 9.99555747644671e-06, "loss": 0.4658, "step": 464 }, { "epoch": 0.33687515092972714, "grad_norm": 0.5328883528709412, "learning_rate": 9.995378026151587e-06, "loss": 0.454, "step": 465 }, { "epoch": 0.3375996136198986, "grad_norm": 0.520133376121521, "learning_rate": 9.995195024597213e-06, "loss": 0.5048, "step": 466 }, { "epoch": 0.33832407631007005, "grad_norm": 0.46551960706710815, "learning_rate": 9.99500847191369e-06, "loss": 0.4786, "step": 467 }, { "epoch": 0.3390485390002415, "grad_norm": 0.5300512313842773, "learning_rate": 9.994818368233639e-06, "loss": 0.4718, "step": 468 }, { "epoch": 0.33977300169041297, "grad_norm": 0.5185633897781372, "learning_rate": 9.994624713692204e-06, "loss": 0.4803, "step": 469 }, { "epoch": 0.3404974643805844, "grad_norm": 0.42136240005493164, "learning_rate": 9.99442750842706e-06, "loss": 0.4267, "step": 470 }, { "epoch": 0.3412219270707559, "grad_norm": 0.6034565567970276, "learning_rate": 9.9942267525784e-06, "loss": 0.4928, "step": 471 }, { "epoch": 0.34194638976092734, "grad_norm": 0.41539210081100464, "learning_rate": 9.994022446288942e-06, "loss": 0.4436, "step": 472 }, { "epoch": 0.3426708524510988, "grad_norm": 0.5923753380775452, "learning_rate": 9.993814589703932e-06, "loss": 0.4907, "step": 473 }, { "epoch": 0.34339531514127025, "grad_norm": 0.5407740473747253, "learning_rate": 9.993603182971135e-06, "loss": 0.4709, "step": 474 }, { "epoch": 0.34411977783144165, "grad_norm": 0.4783056974411011, "learning_rate": 9.99338822624084e-06, "loss": 0.4641, "step": 475 }, { "epoch": 0.3448442405216131, "grad_norm": 0.5048810243606567, "learning_rate": 9.993169719665866e-06, "loss": 0.4473, "step": 476 }, { "epoch": 0.34556870321178457, "grad_norm": 0.577242910861969, "learning_rate": 9.992947663401548e-06, "loss": 0.498, "step": 477 }, { "epoch": 0.346293165901956, "grad_norm": 0.4544142186641693, "learning_rate": 9.992722057605752e-06, "loss": 0.4529, "step": 478 }, { "epoch": 0.3470176285921275, "grad_norm": 0.6631633043289185, "learning_rate": 9.992492902438857e-06, "loss": 0.4786, "step": 479 }, { "epoch": 0.34774209128229894, "grad_norm": 0.5786163210868835, "learning_rate": 9.992260198063777e-06, "loss": 0.5003, "step": 480 }, { "epoch": 0.3484665539724704, "grad_norm": 0.5595309138298035, "learning_rate": 9.99202394464594e-06, "loss": 0.4411, "step": 481 }, { "epoch": 0.34919101666264185, "grad_norm": 0.6512249708175659, "learning_rate": 9.991784142353302e-06, "loss": 0.4287, "step": 482 }, { "epoch": 0.3499154793528133, "grad_norm": 0.5120485424995422, "learning_rate": 9.991540791356342e-06, "loss": 0.4526, "step": 483 }, { "epoch": 0.35063994204298476, "grad_norm": 0.5165001153945923, "learning_rate": 9.991293891828057e-06, "loss": 0.4859, "step": 484 }, { "epoch": 0.3513644047331562, "grad_norm": 0.6688116788864136, "learning_rate": 9.991043443943975e-06, "loss": 0.464, "step": 485 }, { "epoch": 0.3520888674233277, "grad_norm": 0.4694933295249939, "learning_rate": 9.990789447882136e-06, "loss": 0.4604, "step": 486 }, { "epoch": 0.35281333011349914, "grad_norm": 0.5416237115859985, "learning_rate": 9.990531903823113e-06, "loss": 0.446, "step": 487 }, { "epoch": 0.3535377928036706, "grad_norm": 0.49977123737335205, "learning_rate": 9.990270811949994e-06, "loss": 0.4649, "step": 488 }, { "epoch": 0.35426225549384205, "grad_norm": 0.49416929483413696, "learning_rate": 9.990006172448392e-06, "loss": 0.4561, "step": 489 }, { "epoch": 0.3549867181840135, "grad_norm": 0.4852323830127716, "learning_rate": 9.989737985506442e-06, "loss": 0.4288, "step": 490 }, { "epoch": 0.35571118087418496, "grad_norm": 0.46902114152908325, "learning_rate": 9.9894662513148e-06, "loss": 0.4603, "step": 491 }, { "epoch": 0.3564356435643564, "grad_norm": 0.4465765357017517, "learning_rate": 9.989190970066643e-06, "loss": 0.466, "step": 492 }, { "epoch": 0.3571601062545279, "grad_norm": 0.4408620595932007, "learning_rate": 9.988912141957672e-06, "loss": 0.484, "step": 493 }, { "epoch": 0.35788456894469933, "grad_norm": 0.4933319389820099, "learning_rate": 9.988629767186108e-06, "loss": 0.4392, "step": 494 }, { "epoch": 0.3586090316348708, "grad_norm": 0.48803839087486267, "learning_rate": 9.988343845952697e-06, "loss": 0.4513, "step": 495 }, { "epoch": 0.35933349432504225, "grad_norm": 0.4870426058769226, "learning_rate": 9.988054378460698e-06, "loss": 0.4388, "step": 496 }, { "epoch": 0.3600579570152137, "grad_norm": 0.5236425399780273, "learning_rate": 9.9877613649159e-06, "loss": 0.4543, "step": 497 }, { "epoch": 0.36078241970538516, "grad_norm": 0.46239733695983887, "learning_rate": 9.987464805526607e-06, "loss": 0.446, "step": 498 }, { "epoch": 0.3615068823955566, "grad_norm": 0.50190669298172, "learning_rate": 9.987164700503647e-06, "loss": 0.4648, "step": 499 }, { "epoch": 0.3622313450857281, "grad_norm": 0.4814044237136841, "learning_rate": 9.986861050060366e-06, "loss": 0.4814, "step": 500 }, { "epoch": 0.36295580777589953, "grad_norm": 0.5225426554679871, "learning_rate": 9.986553854412632e-06, "loss": 0.4844, "step": 501 }, { "epoch": 0.363680270466071, "grad_norm": 0.5655185580253601, "learning_rate": 9.986243113778834e-06, "loss": 0.4758, "step": 502 }, { "epoch": 0.36440473315624244, "grad_norm": 0.5002275109291077, "learning_rate": 9.98592882837988e-06, "loss": 0.4641, "step": 503 }, { "epoch": 0.3651291958464139, "grad_norm": 0.5486688017845154, "learning_rate": 9.985610998439198e-06, "loss": 0.4789, "step": 504 }, { "epoch": 0.36585365853658536, "grad_norm": 0.43717509508132935, "learning_rate": 9.985289624182735e-06, "loss": 0.4868, "step": 505 }, { "epoch": 0.3665781212267568, "grad_norm": 0.47220003604888916, "learning_rate": 9.98496470583896e-06, "loss": 0.422, "step": 506 }, { "epoch": 0.36730258391692827, "grad_norm": 0.46194684505462646, "learning_rate": 9.984636243638864e-06, "loss": 0.4619, "step": 507 }, { "epoch": 0.36802704660709973, "grad_norm": 0.43008434772491455, "learning_rate": 9.984304237815948e-06, "loss": 0.4755, "step": 508 }, { "epoch": 0.3687515092972712, "grad_norm": 0.42834579944610596, "learning_rate": 9.98396868860624e-06, "loss": 0.432, "step": 509 }, { "epoch": 0.36947597198744264, "grad_norm": 0.4184827506542206, "learning_rate": 9.983629596248285e-06, "loss": 0.4819, "step": 510 }, { "epoch": 0.3702004346776141, "grad_norm": 0.47196221351623535, "learning_rate": 9.983286960983148e-06, "loss": 0.4348, "step": 511 }, { "epoch": 0.37092489736778556, "grad_norm": 0.4906253218650818, "learning_rate": 9.982940783054409e-06, "loss": 0.4464, "step": 512 }, { "epoch": 0.371649360057957, "grad_norm": 0.47642022371292114, "learning_rate": 9.982591062708172e-06, "loss": 0.469, "step": 513 }, { "epoch": 0.37237382274812847, "grad_norm": 0.4311465620994568, "learning_rate": 9.982237800193054e-06, "loss": 0.4628, "step": 514 }, { "epoch": 0.3730982854382999, "grad_norm": 0.5105434060096741, "learning_rate": 9.981880995760193e-06, "loss": 0.4872, "step": 515 }, { "epoch": 0.3738227481284714, "grad_norm": 0.47950297594070435, "learning_rate": 9.981520649663246e-06, "loss": 0.4655, "step": 516 }, { "epoch": 0.37454721081864284, "grad_norm": 0.38091161847114563, "learning_rate": 9.981156762158384e-06, "loss": 0.4416, "step": 517 }, { "epoch": 0.3752716735088143, "grad_norm": 0.4927051067352295, "learning_rate": 9.9807893335043e-06, "loss": 0.4829, "step": 518 }, { "epoch": 0.37599613619898575, "grad_norm": 0.4365162253379822, "learning_rate": 9.980418363962201e-06, "loss": 0.4785, "step": 519 }, { "epoch": 0.3767205988891572, "grad_norm": 0.4912382960319519, "learning_rate": 9.980043853795813e-06, "loss": 0.4709, "step": 520 }, { "epoch": 0.37744506157932867, "grad_norm": 0.43698182702064514, "learning_rate": 9.97966580327138e-06, "loss": 0.459, "step": 521 }, { "epoch": 0.3781695242695001, "grad_norm": 0.47832539677619934, "learning_rate": 9.979284212657658e-06, "loss": 0.4321, "step": 522 }, { "epoch": 0.3788939869596716, "grad_norm": 0.4751511216163635, "learning_rate": 9.978899082225926e-06, "loss": 0.4393, "step": 523 }, { "epoch": 0.37961844964984304, "grad_norm": 0.47826260328292847, "learning_rate": 9.978510412249975e-06, "loss": 0.4506, "step": 524 }, { "epoch": 0.3803429123400145, "grad_norm": 0.4306967556476593, "learning_rate": 9.978118203006116e-06, "loss": 0.4492, "step": 525 }, { "epoch": 0.38106737503018595, "grad_norm": 0.4484672248363495, "learning_rate": 9.977722454773173e-06, "loss": 0.4825, "step": 526 }, { "epoch": 0.3817918377203574, "grad_norm": 0.4419330358505249, "learning_rate": 9.977323167832489e-06, "loss": 0.4886, "step": 527 }, { "epoch": 0.38251630041052886, "grad_norm": 0.43038150668144226, "learning_rate": 9.976920342467915e-06, "loss": 0.4389, "step": 528 }, { "epoch": 0.3832407631007003, "grad_norm": 0.4721599221229553, "learning_rate": 9.976513978965829e-06, "loss": 0.465, "step": 529 }, { "epoch": 0.3839652257908718, "grad_norm": 0.450601190328598, "learning_rate": 9.976104077615118e-06, "loss": 0.4486, "step": 530 }, { "epoch": 0.38468968848104323, "grad_norm": 0.4250977039337158, "learning_rate": 9.97569063870718e-06, "loss": 0.445, "step": 531 }, { "epoch": 0.3854141511712147, "grad_norm": 0.49607059359550476, "learning_rate": 9.975273662535938e-06, "loss": 0.488, "step": 532 }, { "epoch": 0.38613861386138615, "grad_norm": 0.452585905790329, "learning_rate": 9.974853149397821e-06, "loss": 0.4454, "step": 533 }, { "epoch": 0.3868630765515576, "grad_norm": 0.43256068229675293, "learning_rate": 9.974429099591774e-06, "loss": 0.4327, "step": 534 }, { "epoch": 0.38758753924172906, "grad_norm": 0.6103281378746033, "learning_rate": 9.974001513419262e-06, "loss": 0.4591, "step": 535 }, { "epoch": 0.3883120019319005, "grad_norm": 0.4628312587738037, "learning_rate": 9.973570391184257e-06, "loss": 0.4627, "step": 536 }, { "epoch": 0.389036464622072, "grad_norm": 0.5072430372238159, "learning_rate": 9.973135733193249e-06, "loss": 0.4368, "step": 537 }, { "epoch": 0.38976092731224343, "grad_norm": 0.43841552734375, "learning_rate": 9.97269753975524e-06, "loss": 0.4647, "step": 538 }, { "epoch": 0.3904853900024149, "grad_norm": 0.520649790763855, "learning_rate": 9.972255811181745e-06, "loss": 0.4532, "step": 539 }, { "epoch": 0.39120985269258635, "grad_norm": 0.403367817401886, "learning_rate": 9.971810547786794e-06, "loss": 0.458, "step": 540 }, { "epoch": 0.3919343153827578, "grad_norm": 0.4685801863670349, "learning_rate": 9.971361749886927e-06, "loss": 0.4553, "step": 541 }, { "epoch": 0.39265877807292926, "grad_norm": 0.41353264451026917, "learning_rate": 9.9709094178012e-06, "loss": 0.4707, "step": 542 }, { "epoch": 0.3933832407631007, "grad_norm": 0.40614718198776245, "learning_rate": 9.97045355185118e-06, "loss": 0.4448, "step": 543 }, { "epoch": 0.3941077034532722, "grad_norm": 0.3832119107246399, "learning_rate": 9.969994152360944e-06, "loss": 0.4649, "step": 544 }, { "epoch": 0.39483216614344363, "grad_norm": 0.435237318277359, "learning_rate": 9.969531219657087e-06, "loss": 0.4763, "step": 545 }, { "epoch": 0.3955566288336151, "grad_norm": 0.3863712549209595, "learning_rate": 9.969064754068709e-06, "loss": 0.4593, "step": 546 }, { "epoch": 0.39628109152378654, "grad_norm": 0.42440566420555115, "learning_rate": 9.968594755927425e-06, "loss": 0.4712, "step": 547 }, { "epoch": 0.397005554213958, "grad_norm": 0.4077382981777191, "learning_rate": 9.968121225567362e-06, "loss": 0.4487, "step": 548 }, { "epoch": 0.39773001690412946, "grad_norm": 0.3968351483345032, "learning_rate": 9.967644163325157e-06, "loss": 0.4462, "step": 549 }, { "epoch": 0.3984544795943009, "grad_norm": 0.45068275928497314, "learning_rate": 9.967163569539957e-06, "loss": 0.4803, "step": 550 }, { "epoch": 0.39917894228447237, "grad_norm": 0.43489205837249756, "learning_rate": 9.96667944455342e-06, "loss": 0.47, "step": 551 }, { "epoch": 0.39990340497464383, "grad_norm": 0.4121951460838318, "learning_rate": 9.966191788709716e-06, "loss": 0.4638, "step": 552 }, { "epoch": 0.4006278676648153, "grad_norm": 0.3960815668106079, "learning_rate": 9.965700602355524e-06, "loss": 0.4759, "step": 553 }, { "epoch": 0.40135233035498674, "grad_norm": 0.4166441559791565, "learning_rate": 9.96520588584003e-06, "loss": 0.4608, "step": 554 }, { "epoch": 0.4020767930451582, "grad_norm": 0.36035579442977905, "learning_rate": 9.964707639514938e-06, "loss": 0.4573, "step": 555 }, { "epoch": 0.40280125573532966, "grad_norm": 0.36098194122314453, "learning_rate": 9.96420586373445e-06, "loss": 0.4359, "step": 556 }, { "epoch": 0.4035257184255011, "grad_norm": 0.38172176480293274, "learning_rate": 9.963700558855287e-06, "loss": 0.4705, "step": 557 }, { "epoch": 0.40425018111567257, "grad_norm": 0.4015679955482483, "learning_rate": 9.963191725236672e-06, "loss": 0.4526, "step": 558 }, { "epoch": 0.404974643805844, "grad_norm": 0.37299028038978577, "learning_rate": 9.962679363240341e-06, "loss": 0.4721, "step": 559 }, { "epoch": 0.4056991064960155, "grad_norm": 0.4105387330055237, "learning_rate": 9.962163473230538e-06, "loss": 0.4661, "step": 560 }, { "epoch": 0.4064235691861869, "grad_norm": 0.4289017617702484, "learning_rate": 9.961644055574011e-06, "loss": 0.4594, "step": 561 }, { "epoch": 0.40714803187635834, "grad_norm": 0.42483291029930115, "learning_rate": 9.961121110640019e-06, "loss": 0.4578, "step": 562 }, { "epoch": 0.4078724945665298, "grad_norm": 0.4873124957084656, "learning_rate": 9.960594638800332e-06, "loss": 0.4676, "step": 563 }, { "epoch": 0.40859695725670125, "grad_norm": 0.4355083107948303, "learning_rate": 9.960064640429217e-06, "loss": 0.4843, "step": 564 }, { "epoch": 0.4093214199468727, "grad_norm": 0.44071879982948303, "learning_rate": 9.959531115903462e-06, "loss": 0.4466, "step": 565 }, { "epoch": 0.41004588263704417, "grad_norm": 0.42548680305480957, "learning_rate": 9.958994065602347e-06, "loss": 0.4776, "step": 566 }, { "epoch": 0.4107703453272156, "grad_norm": 0.423258900642395, "learning_rate": 9.958453489907673e-06, "loss": 0.4532, "step": 567 }, { "epoch": 0.4114948080173871, "grad_norm": 0.4124598801136017, "learning_rate": 9.957909389203736e-06, "loss": 0.4324, "step": 568 }, { "epoch": 0.41221927070755854, "grad_norm": 0.4418940544128418, "learning_rate": 9.957361763877342e-06, "loss": 0.4608, "step": 569 }, { "epoch": 0.41294373339773, "grad_norm": 0.41272538900375366, "learning_rate": 9.956810614317804e-06, "loss": 0.4599, "step": 570 }, { "epoch": 0.41366819608790145, "grad_norm": 0.3864220678806305, "learning_rate": 9.95625594091694e-06, "loss": 0.4324, "step": 571 }, { "epoch": 0.4143926587780729, "grad_norm": 0.4384666085243225, "learning_rate": 9.955697744069071e-06, "loss": 0.466, "step": 572 }, { "epoch": 0.41511712146824437, "grad_norm": 0.43370336294174194, "learning_rate": 9.955136024171024e-06, "loss": 0.4588, "step": 573 }, { "epoch": 0.4158415841584158, "grad_norm": 0.42362645268440247, "learning_rate": 9.95457078162213e-06, "loss": 0.4783, "step": 574 }, { "epoch": 0.4165660468485873, "grad_norm": 0.44760653376579285, "learning_rate": 9.954002016824226e-06, "loss": 0.4658, "step": 575 }, { "epoch": 0.41729050953875874, "grad_norm": 0.4415430724620819, "learning_rate": 9.953429730181653e-06, "loss": 0.4824, "step": 576 }, { "epoch": 0.4180149722289302, "grad_norm": 0.4385088086128235, "learning_rate": 9.952853922101255e-06, "loss": 0.4583, "step": 577 }, { "epoch": 0.41873943491910165, "grad_norm": 0.5072717666625977, "learning_rate": 9.952274592992378e-06, "loss": 0.4814, "step": 578 }, { "epoch": 0.4194638976092731, "grad_norm": 0.44841185212135315, "learning_rate": 9.951691743266871e-06, "loss": 0.4514, "step": 579 }, { "epoch": 0.42018836029944456, "grad_norm": 0.45247048139572144, "learning_rate": 9.95110537333909e-06, "loss": 0.438, "step": 580 }, { "epoch": 0.420912822989616, "grad_norm": 0.4501187801361084, "learning_rate": 9.950515483625887e-06, "loss": 0.4486, "step": 581 }, { "epoch": 0.4216372856797875, "grad_norm": 0.49788281321525574, "learning_rate": 9.949922074546622e-06, "loss": 0.4616, "step": 582 }, { "epoch": 0.42236174836995893, "grad_norm": 0.5162307620048523, "learning_rate": 9.949325146523157e-06, "loss": 0.4728, "step": 583 }, { "epoch": 0.4230862110601304, "grad_norm": 0.42813029885292053, "learning_rate": 9.948724699979851e-06, "loss": 0.4898, "step": 584 }, { "epoch": 0.42381067375030185, "grad_norm": 0.5070464611053467, "learning_rate": 9.948120735343566e-06, "loss": 0.4044, "step": 585 }, { "epoch": 0.4245351364404733, "grad_norm": 0.4804460108280182, "learning_rate": 9.947513253043668e-06, "loss": 0.4348, "step": 586 }, { "epoch": 0.42525959913064476, "grad_norm": 0.4384527802467346, "learning_rate": 9.946902253512021e-06, "loss": 0.5136, "step": 587 }, { "epoch": 0.4259840618208162, "grad_norm": 0.45669132471084595, "learning_rate": 9.946287737182989e-06, "loss": 0.4401, "step": 588 }, { "epoch": 0.4267085245109877, "grad_norm": 0.45920196175575256, "learning_rate": 9.945669704493439e-06, "loss": 0.4571, "step": 589 }, { "epoch": 0.42743298720115913, "grad_norm": 0.41123947501182556, "learning_rate": 9.945048155882733e-06, "loss": 0.4343, "step": 590 }, { "epoch": 0.4281574498913306, "grad_norm": 0.44743677973747253, "learning_rate": 9.944423091792739e-06, "loss": 0.4535, "step": 591 }, { "epoch": 0.42888191258150204, "grad_norm": 0.41893619298934937, "learning_rate": 9.94379451266782e-06, "loss": 0.4536, "step": 592 }, { "epoch": 0.4296063752716735, "grad_norm": 0.40037983655929565, "learning_rate": 9.943162418954836e-06, "loss": 0.4711, "step": 593 }, { "epoch": 0.43033083796184496, "grad_norm": 0.44491368532180786, "learning_rate": 9.942526811103153e-06, "loss": 0.4794, "step": 594 }, { "epoch": 0.4310553006520164, "grad_norm": 0.43832817673683167, "learning_rate": 9.941887689564625e-06, "loss": 0.4559, "step": 595 }, { "epoch": 0.43177976334218787, "grad_norm": 0.3946869671344757, "learning_rate": 9.941245054793611e-06, "loss": 0.4257, "step": 596 }, { "epoch": 0.43250422603235933, "grad_norm": 0.408139705657959, "learning_rate": 9.940598907246968e-06, "loss": 0.4576, "step": 597 }, { "epoch": 0.4332286887225308, "grad_norm": 0.4142443835735321, "learning_rate": 9.939949247384046e-06, "loss": 0.4562, "step": 598 }, { "epoch": 0.43395315141270224, "grad_norm": 0.43098291754722595, "learning_rate": 9.939296075666694e-06, "loss": 0.4439, "step": 599 }, { "epoch": 0.4346776141028737, "grad_norm": 0.3990369439125061, "learning_rate": 9.93863939255926e-06, "loss": 0.4465, "step": 600 }, { "epoch": 0.43540207679304516, "grad_norm": 0.45796331763267517, "learning_rate": 9.937979198528583e-06, "loss": 0.4779, "step": 601 }, { "epoch": 0.4361265394832166, "grad_norm": 0.4965221881866455, "learning_rate": 9.937315494044e-06, "loss": 0.4563, "step": 602 }, { "epoch": 0.43685100217338807, "grad_norm": 0.41640257835388184, "learning_rate": 9.93664827957735e-06, "loss": 0.4624, "step": 603 }, { "epoch": 0.4375754648635595, "grad_norm": 0.44301506876945496, "learning_rate": 9.935977555602956e-06, "loss": 0.468, "step": 604 }, { "epoch": 0.438299927553731, "grad_norm": 0.5203280448913574, "learning_rate": 9.935303322597644e-06, "loss": 0.4793, "step": 605 }, { "epoch": 0.43902439024390244, "grad_norm": 0.40682870149612427, "learning_rate": 9.934625581040734e-06, "loss": 0.4484, "step": 606 }, { "epoch": 0.4397488529340739, "grad_norm": 0.4865570366382599, "learning_rate": 9.933944331414036e-06, "loss": 0.4463, "step": 607 }, { "epoch": 0.44047331562424535, "grad_norm": 0.42007672786712646, "learning_rate": 9.933259574201856e-06, "loss": 0.4346, "step": 608 }, { "epoch": 0.4411977783144168, "grad_norm": 0.4733751714229584, "learning_rate": 9.932571309890998e-06, "loss": 0.4646, "step": 609 }, { "epoch": 0.44192224100458827, "grad_norm": 0.41557809710502625, "learning_rate": 9.931879538970752e-06, "loss": 0.462, "step": 610 }, { "epoch": 0.4426467036947597, "grad_norm": 0.43916055560112, "learning_rate": 9.931184261932905e-06, "loss": 0.433, "step": 611 }, { "epoch": 0.4433711663849312, "grad_norm": 0.47538962960243225, "learning_rate": 9.930485479271735e-06, "loss": 0.4539, "step": 612 }, { "epoch": 0.44409562907510264, "grad_norm": 0.41562268137931824, "learning_rate": 9.929783191484015e-06, "loss": 0.4361, "step": 613 }, { "epoch": 0.4448200917652741, "grad_norm": 0.4190317988395691, "learning_rate": 9.929077399069005e-06, "loss": 0.4487, "step": 614 }, { "epoch": 0.44554455445544555, "grad_norm": 0.5041528940200806, "learning_rate": 9.92836810252846e-06, "loss": 0.4547, "step": 615 }, { "epoch": 0.446269017145617, "grad_norm": 0.4328729212284088, "learning_rate": 9.927655302366629e-06, "loss": 0.4554, "step": 616 }, { "epoch": 0.44699347983578847, "grad_norm": 0.3926216661930084, "learning_rate": 9.92693899909024e-06, "loss": 0.4641, "step": 617 }, { "epoch": 0.4477179425259599, "grad_norm": 0.48375794291496277, "learning_rate": 9.926219193208529e-06, "loss": 0.4641, "step": 618 }, { "epoch": 0.4484424052161314, "grad_norm": 0.414238840341568, "learning_rate": 9.925495885233205e-06, "loss": 0.424, "step": 619 }, { "epoch": 0.44916686790630284, "grad_norm": 0.443496435880661, "learning_rate": 9.924769075678478e-06, "loss": 0.4795, "step": 620 }, { "epoch": 0.4498913305964743, "grad_norm": 0.4146901071071625, "learning_rate": 9.924038765061042e-06, "loss": 0.4377, "step": 621 }, { "epoch": 0.45061579328664575, "grad_norm": 0.42079097032546997, "learning_rate": 9.923304953900082e-06, "loss": 0.4756, "step": 622 }, { "epoch": 0.4513402559768172, "grad_norm": 0.4590493142604828, "learning_rate": 9.922567642717269e-06, "loss": 0.4492, "step": 623 }, { "epoch": 0.45206471866698866, "grad_norm": 0.4142650067806244, "learning_rate": 9.921826832036768e-06, "loss": 0.4444, "step": 624 }, { "epoch": 0.4527891813571601, "grad_norm": 0.44503408670425415, "learning_rate": 9.921082522385225e-06, "loss": 0.4464, "step": 625 }, { "epoch": 0.4535136440473316, "grad_norm": 0.4530354142189026, "learning_rate": 9.920334714291778e-06, "loss": 0.437, "step": 626 }, { "epoch": 0.45423810673750303, "grad_norm": 0.4991097152233124, "learning_rate": 9.919583408288049e-06, "loss": 0.4818, "step": 627 }, { "epoch": 0.4549625694276745, "grad_norm": 0.4410784840583801, "learning_rate": 9.918828604908151e-06, "loss": 0.4652, "step": 628 }, { "epoch": 0.45568703211784595, "grad_norm": 0.46671822667121887, "learning_rate": 9.918070304688677e-06, "loss": 0.4403, "step": 629 }, { "epoch": 0.4564114948080174, "grad_norm": 0.45424506068229675, "learning_rate": 9.917308508168712e-06, "loss": 0.4691, "step": 630 }, { "epoch": 0.45713595749818886, "grad_norm": 0.4109485149383545, "learning_rate": 9.916543215889823e-06, "loss": 0.4411, "step": 631 }, { "epoch": 0.4578604201883603, "grad_norm": 0.47180068492889404, "learning_rate": 9.91577442839606e-06, "loss": 0.4722, "step": 632 }, { "epoch": 0.4585848828785318, "grad_norm": 0.46452754735946655, "learning_rate": 9.915002146233968e-06, "loss": 0.4888, "step": 633 }, { "epoch": 0.45930934556870323, "grad_norm": 0.44455766677856445, "learning_rate": 9.914226369952565e-06, "loss": 0.4411, "step": 634 }, { "epoch": 0.4600338082588747, "grad_norm": 0.4441909193992615, "learning_rate": 9.913447100103357e-06, "loss": 0.4905, "step": 635 }, { "epoch": 0.46075827094904614, "grad_norm": 0.40963178873062134, "learning_rate": 9.912664337240336e-06, "loss": 0.4642, "step": 636 }, { "epoch": 0.4614827336392176, "grad_norm": 0.45063817501068115, "learning_rate": 9.911878081919972e-06, "loss": 0.4618, "step": 637 }, { "epoch": 0.46220719632938906, "grad_norm": 0.4826534390449524, "learning_rate": 9.911088334701225e-06, "loss": 0.4713, "step": 638 }, { "epoch": 0.4629316590195605, "grad_norm": 0.4807960093021393, "learning_rate": 9.91029509614553e-06, "loss": 0.4466, "step": 639 }, { "epoch": 0.46365612170973197, "grad_norm": 0.5856636762619019, "learning_rate": 9.90949836681681e-06, "loss": 0.4771, "step": 640 }, { "epoch": 0.46438058439990343, "grad_norm": 0.39163026213645935, "learning_rate": 9.908698147281465e-06, "loss": 0.4278, "step": 641 }, { "epoch": 0.4651050470900749, "grad_norm": 0.4717934727668762, "learning_rate": 9.90789443810838e-06, "loss": 0.4305, "step": 642 }, { "epoch": 0.46582950978024634, "grad_norm": 0.496134489774704, "learning_rate": 9.907087239868917e-06, "loss": 0.4582, "step": 643 }, { "epoch": 0.4665539724704178, "grad_norm": 0.5147307515144348, "learning_rate": 9.906276553136924e-06, "loss": 0.446, "step": 644 }, { "epoch": 0.46727843516058926, "grad_norm": 0.4465230703353882, "learning_rate": 9.905462378488722e-06, "loss": 0.4531, "step": 645 }, { "epoch": 0.4680028978507607, "grad_norm": 0.4523470997810364, "learning_rate": 9.904644716503117e-06, "loss": 0.4325, "step": 646 }, { "epoch": 0.46872736054093217, "grad_norm": 0.4478057622909546, "learning_rate": 9.90382356776139e-06, "loss": 0.453, "step": 647 }, { "epoch": 0.46945182323110357, "grad_norm": 0.4444551169872284, "learning_rate": 9.902998932847308e-06, "loss": 0.449, "step": 648 }, { "epoch": 0.47017628592127503, "grad_norm": 0.5088189840316772, "learning_rate": 9.902170812347105e-06, "loss": 0.4798, "step": 649 }, { "epoch": 0.4709007486114465, "grad_norm": 0.48004212975502014, "learning_rate": 9.901339206849503e-06, "loss": 0.4637, "step": 650 }, { "epoch": 0.47162521130161794, "grad_norm": 0.4509207308292389, "learning_rate": 9.900504116945697e-06, "loss": 0.4604, "step": 651 }, { "epoch": 0.4723496739917894, "grad_norm": 0.4302298426628113, "learning_rate": 9.899665543229362e-06, "loss": 0.4661, "step": 652 }, { "epoch": 0.47307413668196086, "grad_norm": 0.4501067101955414, "learning_rate": 9.898823486296645e-06, "loss": 0.4609, "step": 653 }, { "epoch": 0.4737985993721323, "grad_norm": 0.4375602602958679, "learning_rate": 9.897977946746172e-06, "loss": 0.452, "step": 654 }, { "epoch": 0.47452306206230377, "grad_norm": 0.5028941035270691, "learning_rate": 9.897128925179045e-06, "loss": 0.4359, "step": 655 }, { "epoch": 0.4752475247524752, "grad_norm": 0.4502822160720825, "learning_rate": 9.896276422198843e-06, "loss": 0.4399, "step": 656 }, { "epoch": 0.4759719874426467, "grad_norm": 0.46370020508766174, "learning_rate": 9.895420438411616e-06, "loss": 0.468, "step": 657 }, { "epoch": 0.47669645013281814, "grad_norm": 0.37307173013687134, "learning_rate": 9.89456097442589e-06, "loss": 0.4312, "step": 658 }, { "epoch": 0.4774209128229896, "grad_norm": 0.4366321861743927, "learning_rate": 9.893698030852668e-06, "loss": 0.4145, "step": 659 }, { "epoch": 0.47814537551316105, "grad_norm": 0.4132115840911865, "learning_rate": 9.892831608305421e-06, "loss": 0.446, "step": 660 }, { "epoch": 0.4788698382033325, "grad_norm": 0.39365869760513306, "learning_rate": 9.891961707400102e-06, "loss": 0.4498, "step": 661 }, { "epoch": 0.47959430089350397, "grad_norm": 0.4604562520980835, "learning_rate": 9.891088328755125e-06, "loss": 0.4353, "step": 662 }, { "epoch": 0.4803187635836754, "grad_norm": 0.42794501781463623, "learning_rate": 9.890211472991388e-06, "loss": 0.429, "step": 663 }, { "epoch": 0.4810432262738469, "grad_norm": 0.4235381484031677, "learning_rate": 9.889331140732253e-06, "loss": 0.4243, "step": 664 }, { "epoch": 0.48176768896401834, "grad_norm": 0.4937763810157776, "learning_rate": 9.888447332603557e-06, "loss": 0.4696, "step": 665 }, { "epoch": 0.4824921516541898, "grad_norm": 0.45525142550468445, "learning_rate": 9.887560049233606e-06, "loss": 0.4427, "step": 666 }, { "epoch": 0.48321661434436125, "grad_norm": 0.4384925961494446, "learning_rate": 9.886669291253178e-06, "loss": 0.4462, "step": 667 }, { "epoch": 0.4839410770345327, "grad_norm": 0.47481632232666016, "learning_rate": 9.885775059295523e-06, "loss": 0.4494, "step": 668 }, { "epoch": 0.48466553972470416, "grad_norm": 0.4067806601524353, "learning_rate": 9.884877353996356e-06, "loss": 0.4528, "step": 669 }, { "epoch": 0.4853900024148756, "grad_norm": 0.44881516695022583, "learning_rate": 9.883976175993866e-06, "loss": 0.4439, "step": 670 }, { "epoch": 0.4861144651050471, "grad_norm": 0.3961077034473419, "learning_rate": 9.883071525928706e-06, "loss": 0.4666, "step": 671 }, { "epoch": 0.48683892779521853, "grad_norm": 0.4370024502277374, "learning_rate": 9.882163404444001e-06, "loss": 0.4395, "step": 672 }, { "epoch": 0.48756339048539, "grad_norm": 0.44325459003448486, "learning_rate": 9.881251812185343e-06, "loss": 0.4354, "step": 673 }, { "epoch": 0.48828785317556145, "grad_norm": 0.4414970278739929, "learning_rate": 9.880336749800791e-06, "loss": 0.4404, "step": 674 }, { "epoch": 0.4890123158657329, "grad_norm": 0.41670626401901245, "learning_rate": 9.879418217940872e-06, "loss": 0.4564, "step": 675 }, { "epoch": 0.48973677855590436, "grad_norm": 0.42541149258613586, "learning_rate": 9.87849621725858e-06, "loss": 0.455, "step": 676 }, { "epoch": 0.4904612412460758, "grad_norm": 0.4223549962043762, "learning_rate": 9.877570748409369e-06, "loss": 0.4497, "step": 677 }, { "epoch": 0.4911857039362473, "grad_norm": 0.370145708322525, "learning_rate": 9.876641812051164e-06, "loss": 0.4344, "step": 678 }, { "epoch": 0.49191016662641873, "grad_norm": 0.475358247756958, "learning_rate": 9.875709408844358e-06, "loss": 0.4464, "step": 679 }, { "epoch": 0.4926346293165902, "grad_norm": 0.4357815980911255, "learning_rate": 9.874773539451803e-06, "loss": 0.4306, "step": 680 }, { "epoch": 0.49335909200676165, "grad_norm": 0.4669366776943207, "learning_rate": 9.873834204538814e-06, "loss": 0.4438, "step": 681 }, { "epoch": 0.4940835546969331, "grad_norm": 0.38863319158554077, "learning_rate": 9.872891404773176e-06, "loss": 0.4288, "step": 682 }, { "epoch": 0.49480801738710456, "grad_norm": 0.45222634077072144, "learning_rate": 9.871945140825136e-06, "loss": 0.4588, "step": 683 }, { "epoch": 0.495532480077276, "grad_norm": 0.37746289372444153, "learning_rate": 9.870995413367397e-06, "loss": 0.4456, "step": 684 }, { "epoch": 0.4962569427674475, "grad_norm": 0.4669919013977051, "learning_rate": 9.87004222307513e-06, "loss": 0.4659, "step": 685 }, { "epoch": 0.49698140545761893, "grad_norm": 0.3890366554260254, "learning_rate": 9.869085570625965e-06, "loss": 0.4636, "step": 686 }, { "epoch": 0.4977058681477904, "grad_norm": 0.3929554224014282, "learning_rate": 9.8681254567e-06, "loss": 0.4318, "step": 687 }, { "epoch": 0.49843033083796184, "grad_norm": 0.3849615454673767, "learning_rate": 9.867161881979784e-06, "loss": 0.4538, "step": 688 }, { "epoch": 0.4991547935281333, "grad_norm": 0.37892112135887146, "learning_rate": 9.866194847150333e-06, "loss": 0.4454, "step": 689 }, { "epoch": 0.49987925621830476, "grad_norm": 0.4191863238811493, "learning_rate": 9.86522435289912e-06, "loss": 0.4559, "step": 690 }, { "epoch": 0.5006037189084762, "grad_norm": 0.38774755597114563, "learning_rate": 9.864250399916077e-06, "loss": 0.4257, "step": 691 }, { "epoch": 0.5013281815986477, "grad_norm": 0.40207087993621826, "learning_rate": 9.8632729888936e-06, "loss": 0.4709, "step": 692 }, { "epoch": 0.5020526442888191, "grad_norm": 0.48027193546295166, "learning_rate": 9.862292120526536e-06, "loss": 0.4821, "step": 693 }, { "epoch": 0.5027771069789906, "grad_norm": 0.39370107650756836, "learning_rate": 9.861307795512191e-06, "loss": 0.4304, "step": 694 }, { "epoch": 0.503501569669162, "grad_norm": 0.45005884766578674, "learning_rate": 9.860320014550336e-06, "loss": 0.4502, "step": 695 }, { "epoch": 0.5042260323593335, "grad_norm": 0.40108031034469604, "learning_rate": 9.85932877834319e-06, "loss": 0.4353, "step": 696 }, { "epoch": 0.504950495049505, "grad_norm": 0.38449621200561523, "learning_rate": 9.858334087595433e-06, "loss": 0.4771, "step": 697 }, { "epoch": 0.5056749577396764, "grad_norm": 0.4403113126754761, "learning_rate": 9.857335943014198e-06, "loss": 0.424, "step": 698 }, { "epoch": 0.5063994204298479, "grad_norm": 0.41029787063598633, "learning_rate": 9.856334345309077e-06, "loss": 0.4015, "step": 699 }, { "epoch": 0.5071238831200193, "grad_norm": 0.3977220058441162, "learning_rate": 9.855329295192113e-06, "loss": 0.4555, "step": 700 }, { "epoch": 0.5078483458101908, "grad_norm": 0.4440165162086487, "learning_rate": 9.854320793377806e-06, "loss": 0.4669, "step": 701 }, { "epoch": 0.5085728085003622, "grad_norm": 0.40660715103149414, "learning_rate": 9.85330884058311e-06, "loss": 0.4405, "step": 702 }, { "epoch": 0.5092972711905337, "grad_norm": 0.4466342628002167, "learning_rate": 9.85229343752743e-06, "loss": 0.4527, "step": 703 }, { "epoch": 0.5100217338807052, "grad_norm": 0.4455570876598358, "learning_rate": 9.851274584932624e-06, "loss": 0.443, "step": 704 }, { "epoch": 0.5107461965708766, "grad_norm": 0.45712926983833313, "learning_rate": 9.850252283523007e-06, "loss": 0.4447, "step": 705 }, { "epoch": 0.5114706592610481, "grad_norm": 0.4576941132545471, "learning_rate": 9.849226534025339e-06, "loss": 0.4684, "step": 706 }, { "epoch": 0.5121951219512195, "grad_norm": 0.4103553891181946, "learning_rate": 9.848197337168837e-06, "loss": 0.4457, "step": 707 }, { "epoch": 0.512919584641391, "grad_norm": 0.5092628002166748, "learning_rate": 9.847164693685163e-06, "loss": 0.4612, "step": 708 }, { "epoch": 0.5136440473315624, "grad_norm": 0.40524059534072876, "learning_rate": 9.846128604308438e-06, "loss": 0.4563, "step": 709 }, { "epoch": 0.5143685100217339, "grad_norm": 0.4180395007133484, "learning_rate": 9.845089069775222e-06, "loss": 0.4253, "step": 710 }, { "epoch": 0.5150929727119054, "grad_norm": 0.4480024576187134, "learning_rate": 9.844046090824533e-06, "loss": 0.4513, "step": 711 }, { "epoch": 0.5158174354020768, "grad_norm": 0.41059330105781555, "learning_rate": 9.842999668197832e-06, "loss": 0.4282, "step": 712 }, { "epoch": 0.5165418980922483, "grad_norm": 0.44888338446617126, "learning_rate": 9.841949802639031e-06, "loss": 0.4472, "step": 713 }, { "epoch": 0.5172663607824197, "grad_norm": 0.4189525246620178, "learning_rate": 9.840896494894487e-06, "loss": 0.4661, "step": 714 }, { "epoch": 0.5179908234725912, "grad_norm": 0.39217957854270935, "learning_rate": 9.83983974571301e-06, "loss": 0.4566, "step": 715 }, { "epoch": 0.5187152861627626, "grad_norm": 0.4739202558994293, "learning_rate": 9.83877955584585e-06, "loss": 0.4508, "step": 716 }, { "epoch": 0.5194397488529341, "grad_norm": 0.36941325664520264, "learning_rate": 9.837715926046705e-06, "loss": 0.4382, "step": 717 }, { "epoch": 0.5201642115431055, "grad_norm": 0.48434242606163025, "learning_rate": 9.83664885707172e-06, "loss": 0.4601, "step": 718 }, { "epoch": 0.520888674233277, "grad_norm": 0.444691002368927, "learning_rate": 9.835578349679484e-06, "loss": 0.4327, "step": 719 }, { "epoch": 0.5216131369234485, "grad_norm": 0.379691481590271, "learning_rate": 9.834504404631032e-06, "loss": 0.4232, "step": 720 }, { "epoch": 0.5223375996136199, "grad_norm": 0.4229881465435028, "learning_rate": 9.833427022689836e-06, "loss": 0.4409, "step": 721 }, { "epoch": 0.5230620623037914, "grad_norm": 0.45967358350753784, "learning_rate": 9.832346204621821e-06, "loss": 0.453, "step": 722 }, { "epoch": 0.5237865249939628, "grad_norm": 0.42511439323425293, "learning_rate": 9.83126195119535e-06, "loss": 0.4362, "step": 723 }, { "epoch": 0.5245109876841343, "grad_norm": 0.41015493869781494, "learning_rate": 9.83017426318123e-06, "loss": 0.4402, "step": 724 }, { "epoch": 0.5252354503743057, "grad_norm": 0.4768367409706116, "learning_rate": 9.829083141352701e-06, "loss": 0.4611, "step": 725 }, { "epoch": 0.5259599130644772, "grad_norm": 0.39045923948287964, "learning_rate": 9.827988586485459e-06, "loss": 0.4253, "step": 726 }, { "epoch": 0.5266843757546487, "grad_norm": 0.5170559883117676, "learning_rate": 9.82689059935763e-06, "loss": 0.4798, "step": 727 }, { "epoch": 0.5274088384448201, "grad_norm": 0.37973567843437195, "learning_rate": 9.825789180749784e-06, "loss": 0.4391, "step": 728 }, { "epoch": 0.5281333011349916, "grad_norm": 0.44890260696411133, "learning_rate": 9.824684331444926e-06, "loss": 0.4406, "step": 729 }, { "epoch": 0.528857763825163, "grad_norm": 0.4329308867454529, "learning_rate": 9.823576052228507e-06, "loss": 0.4479, "step": 730 }, { "epoch": 0.5295822265153345, "grad_norm": 0.44991302490234375, "learning_rate": 9.822464343888413e-06, "loss": 0.4287, "step": 731 }, { "epoch": 0.5303066892055059, "grad_norm": 0.4146920144557953, "learning_rate": 9.821349207214965e-06, "loss": 0.4857, "step": 732 }, { "epoch": 0.5310311518956774, "grad_norm": 0.5035730600357056, "learning_rate": 9.820230643000923e-06, "loss": 0.4548, "step": 733 }, { "epoch": 0.5317556145858489, "grad_norm": 0.43640565872192383, "learning_rate": 9.81910865204149e-06, "loss": 0.4495, "step": 734 }, { "epoch": 0.5324800772760203, "grad_norm": 0.42724135518074036, "learning_rate": 9.817983235134291e-06, "loss": 0.4516, "step": 735 }, { "epoch": 0.5332045399661918, "grad_norm": 0.4640941917896271, "learning_rate": 9.816854393079402e-06, "loss": 0.4454, "step": 736 }, { "epoch": 0.5339290026563632, "grad_norm": 0.42841964960098267, "learning_rate": 9.815722126679325e-06, "loss": 0.4823, "step": 737 }, { "epoch": 0.5346534653465347, "grad_norm": 0.5154430866241455, "learning_rate": 9.814586436738998e-06, "loss": 0.4675, "step": 738 }, { "epoch": 0.5353779280367061, "grad_norm": 0.3994486927986145, "learning_rate": 9.813447324065792e-06, "loss": 0.4312, "step": 739 }, { "epoch": 0.5361023907268776, "grad_norm": 0.46764811873435974, "learning_rate": 9.812304789469513e-06, "loss": 0.4526, "step": 740 }, { "epoch": 0.536826853417049, "grad_norm": 0.4396473467350006, "learning_rate": 9.811158833762403e-06, "loss": 0.4363, "step": 741 }, { "epoch": 0.5375513161072205, "grad_norm": 0.44800499081611633, "learning_rate": 9.810009457759126e-06, "loss": 0.4704, "step": 742 }, { "epoch": 0.538275778797392, "grad_norm": 0.46728041768074036, "learning_rate": 9.808856662276787e-06, "loss": 0.427, "step": 743 }, { "epoch": 0.5390002414875634, "grad_norm": 0.47170427441596985, "learning_rate": 9.80770044813492e-06, "loss": 0.4402, "step": 744 }, { "epoch": 0.5397247041777349, "grad_norm": 0.4292517304420471, "learning_rate": 9.806540816155485e-06, "loss": 0.4531, "step": 745 }, { "epoch": 0.5404491668679063, "grad_norm": 0.5027399659156799, "learning_rate": 9.805377767162878e-06, "loss": 0.4516, "step": 746 }, { "epoch": 0.5411736295580778, "grad_norm": 0.40487751364707947, "learning_rate": 9.804211301983919e-06, "loss": 0.4682, "step": 747 }, { "epoch": 0.5418980922482493, "grad_norm": 0.42648985981941223, "learning_rate": 9.803041421447858e-06, "loss": 0.4239, "step": 748 }, { "epoch": 0.5426225549384207, "grad_norm": 0.4861374497413635, "learning_rate": 9.801868126386377e-06, "loss": 0.4627, "step": 749 }, { "epoch": 0.5433470176285922, "grad_norm": 0.46737977862358093, "learning_rate": 9.800691417633577e-06, "loss": 0.4587, "step": 750 }, { "epoch": 0.5440714803187636, "grad_norm": 0.40296533703804016, "learning_rate": 9.799511296025992e-06, "loss": 0.4585, "step": 751 }, { "epoch": 0.5447959430089351, "grad_norm": 0.5428235530853271, "learning_rate": 9.798327762402587e-06, "loss": 0.4664, "step": 752 }, { "epoch": 0.5455204056991065, "grad_norm": 0.43055322766304016, "learning_rate": 9.797140817604738e-06, "loss": 0.4549, "step": 753 }, { "epoch": 0.546244868389278, "grad_norm": 0.4537781774997711, "learning_rate": 9.795950462476263e-06, "loss": 0.4572, "step": 754 }, { "epoch": 0.5469693310794495, "grad_norm": 0.45599424839019775, "learning_rate": 9.794756697863389e-06, "loss": 0.4634, "step": 755 }, { "epoch": 0.5476937937696209, "grad_norm": 0.39356401562690735, "learning_rate": 9.793559524614779e-06, "loss": 0.4323, "step": 756 }, { "epoch": 0.5484182564597924, "grad_norm": 0.46141114830970764, "learning_rate": 9.792358943581511e-06, "loss": 0.455, "step": 757 }, { "epoch": 0.5491427191499638, "grad_norm": 0.4181160032749176, "learning_rate": 9.791154955617092e-06, "loss": 0.4514, "step": 758 }, { "epoch": 0.5498671818401353, "grad_norm": 0.4888444244861603, "learning_rate": 9.789947561577445e-06, "loss": 0.4574, "step": 759 }, { "epoch": 0.5505916445303067, "grad_norm": 0.35830244421958923, "learning_rate": 9.78873676232092e-06, "loss": 0.4302, "step": 760 }, { "epoch": 0.5513161072204782, "grad_norm": 0.47038939595222473, "learning_rate": 9.787522558708283e-06, "loss": 0.4662, "step": 761 }, { "epoch": 0.5520405699106496, "grad_norm": 0.4246884286403656, "learning_rate": 9.786304951602721e-06, "loss": 0.4351, "step": 762 }, { "epoch": 0.5527650326008211, "grad_norm": 0.4181270897388458, "learning_rate": 9.785083941869847e-06, "loss": 0.459, "step": 763 }, { "epoch": 0.5534894952909926, "grad_norm": 0.3941545784473419, "learning_rate": 9.783859530377682e-06, "loss": 0.4453, "step": 764 }, { "epoch": 0.554213957981164, "grad_norm": 0.4007667303085327, "learning_rate": 9.782631717996675e-06, "loss": 0.452, "step": 765 }, { "epoch": 0.5549384206713355, "grad_norm": 0.41164064407348633, "learning_rate": 9.781400505599688e-06, "loss": 0.4542, "step": 766 }, { "epoch": 0.5556628833615069, "grad_norm": 0.39008012413978577, "learning_rate": 9.780165894062e-06, "loss": 0.4383, "step": 767 }, { "epoch": 0.5563873460516784, "grad_norm": 0.43341490626335144, "learning_rate": 9.778927884261307e-06, "loss": 0.4788, "step": 768 }, { "epoch": 0.5571118087418498, "grad_norm": 0.3900536298751831, "learning_rate": 9.777686477077724e-06, "loss": 0.4418, "step": 769 }, { "epoch": 0.5578362714320213, "grad_norm": 0.3951365351676941, "learning_rate": 9.776441673393775e-06, "loss": 0.4283, "step": 770 }, { "epoch": 0.5585607341221928, "grad_norm": 0.4546695947647095, "learning_rate": 9.775193474094407e-06, "loss": 0.4573, "step": 771 }, { "epoch": 0.5592851968123642, "grad_norm": 0.44359201192855835, "learning_rate": 9.773941880066972e-06, "loss": 0.4454, "step": 772 }, { "epoch": 0.5600096595025357, "grad_norm": 0.4103962481021881, "learning_rate": 9.77268689220124e-06, "loss": 0.4781, "step": 773 }, { "epoch": 0.5607341221927071, "grad_norm": 0.49590107798576355, "learning_rate": 9.771428511389395e-06, "loss": 0.4567, "step": 774 }, { "epoch": 0.5614585848828786, "grad_norm": 0.42519500851631165, "learning_rate": 9.77016673852603e-06, "loss": 0.4568, "step": 775 }, { "epoch": 0.56218304757305, "grad_norm": 0.482571542263031, "learning_rate": 9.76890157450815e-06, "loss": 0.4427, "step": 776 }, { "epoch": 0.5629075102632214, "grad_norm": 0.4468637704849243, "learning_rate": 9.767633020235175e-06, "loss": 0.4343, "step": 777 }, { "epoch": 0.5636319729533928, "grad_norm": 0.43378594517707825, "learning_rate": 9.766361076608926e-06, "loss": 0.4558, "step": 778 }, { "epoch": 0.5643564356435643, "grad_norm": 0.45917025208473206, "learning_rate": 9.765085744533644e-06, "loss": 0.4417, "step": 779 }, { "epoch": 0.5650808983337358, "grad_norm": 0.40235164761543274, "learning_rate": 9.76380702491597e-06, "loss": 0.4666, "step": 780 }, { "epoch": 0.5658053610239072, "grad_norm": 0.4648854434490204, "learning_rate": 9.762524918664962e-06, "loss": 0.421, "step": 781 }, { "epoch": 0.5665298237140787, "grad_norm": 0.4063076078891754, "learning_rate": 9.761239426692077e-06, "loss": 0.4537, "step": 782 }, { "epoch": 0.5672542864042501, "grad_norm": 0.511119544506073, "learning_rate": 9.759950549911185e-06, "loss": 0.4493, "step": 783 }, { "epoch": 0.5679787490944216, "grad_norm": 0.4039326310157776, "learning_rate": 9.75865828923856e-06, "loss": 0.4621, "step": 784 }, { "epoch": 0.568703211784593, "grad_norm": 0.44238749146461487, "learning_rate": 9.75736264559288e-06, "loss": 0.4199, "step": 785 }, { "epoch": 0.5694276744747645, "grad_norm": 0.40109845995903015, "learning_rate": 9.756063619895232e-06, "loss": 0.4515, "step": 786 }, { "epoch": 0.570152137164936, "grad_norm": 0.46571552753448486, "learning_rate": 9.754761213069103e-06, "loss": 0.4354, "step": 787 }, { "epoch": 0.5708765998551074, "grad_norm": 0.4667969048023224, "learning_rate": 9.753455426040387e-06, "loss": 0.4268, "step": 788 }, { "epoch": 0.5716010625452789, "grad_norm": 0.4182460904121399, "learning_rate": 9.75214625973738e-06, "loss": 0.446, "step": 789 }, { "epoch": 0.5723255252354503, "grad_norm": 0.4628804624080658, "learning_rate": 9.75083371509078e-06, "loss": 0.4454, "step": 790 }, { "epoch": 0.5730499879256218, "grad_norm": 0.5156680345535278, "learning_rate": 9.749517793033684e-06, "loss": 0.4775, "step": 791 }, { "epoch": 0.5737744506157932, "grad_norm": 0.437677800655365, "learning_rate": 9.748198494501598e-06, "loss": 0.4435, "step": 792 }, { "epoch": 0.5744989133059647, "grad_norm": 0.511820912361145, "learning_rate": 9.74687582043242e-06, "loss": 0.4377, "step": 793 }, { "epoch": 0.5752233759961362, "grad_norm": 0.44114065170288086, "learning_rate": 9.745549771766449e-06, "loss": 0.4423, "step": 794 }, { "epoch": 0.5759478386863076, "grad_norm": 0.4830992817878723, "learning_rate": 9.744220349446389e-06, "loss": 0.4543, "step": 795 }, { "epoch": 0.5766723013764791, "grad_norm": 0.4047682583332062, "learning_rate": 9.742887554417337e-06, "loss": 0.4318, "step": 796 }, { "epoch": 0.5773967640666505, "grad_norm": 0.46158096194267273, "learning_rate": 9.741551387626789e-06, "loss": 0.4629, "step": 797 }, { "epoch": 0.578121226756822, "grad_norm": 0.4261923134326935, "learning_rate": 9.740211850024638e-06, "loss": 0.4191, "step": 798 }, { "epoch": 0.5788456894469934, "grad_norm": 0.4204781651496887, "learning_rate": 9.738868942563171e-06, "loss": 0.4638, "step": 799 }, { "epoch": 0.5795701521371649, "grad_norm": 0.4380224943161011, "learning_rate": 9.737522666197076e-06, "loss": 0.4263, "step": 800 }, { "epoch": 0.5802946148273364, "grad_norm": 0.37235602736473083, "learning_rate": 9.736173021883433e-06, "loss": 0.4275, "step": 801 }, { "epoch": 0.5810190775175078, "grad_norm": 0.45496678352355957, "learning_rate": 9.734820010581713e-06, "loss": 0.4464, "step": 802 }, { "epoch": 0.5817435402076793, "grad_norm": 0.4161072075366974, "learning_rate": 9.733463633253788e-06, "loss": 0.4358, "step": 803 }, { "epoch": 0.5824680028978507, "grad_norm": 0.43562832474708557, "learning_rate": 9.732103890863918e-06, "loss": 0.4434, "step": 804 }, { "epoch": 0.5831924655880222, "grad_norm": 0.41102197766304016, "learning_rate": 9.730740784378755e-06, "loss": 0.4152, "step": 805 }, { "epoch": 0.5839169282781936, "grad_norm": 0.3994993269443512, "learning_rate": 9.729374314767341e-06, "loss": 0.4457, "step": 806 }, { "epoch": 0.5846413909683651, "grad_norm": 0.3824525773525238, "learning_rate": 9.728004483001117e-06, "loss": 0.3777, "step": 807 }, { "epoch": 0.5853658536585366, "grad_norm": 0.4596159756183624, "learning_rate": 9.726631290053904e-06, "loss": 0.4751, "step": 808 }, { "epoch": 0.586090316348708, "grad_norm": 0.3918156325817108, "learning_rate": 9.72525473690192e-06, "loss": 0.4222, "step": 809 }, { "epoch": 0.5868147790388795, "grad_norm": 0.40133535861968994, "learning_rate": 9.72387482452377e-06, "loss": 0.448, "step": 810 }, { "epoch": 0.5875392417290509, "grad_norm": 0.36849772930145264, "learning_rate": 9.722491553900446e-06, "loss": 0.4518, "step": 811 }, { "epoch": 0.5882637044192224, "grad_norm": 0.39057162404060364, "learning_rate": 9.721104926015324e-06, "loss": 0.4465, "step": 812 }, { "epoch": 0.5889881671093938, "grad_norm": 0.3953041732311249, "learning_rate": 9.719714941854172e-06, "loss": 0.4388, "step": 813 }, { "epoch": 0.5897126297995653, "grad_norm": 0.41085726022720337, "learning_rate": 9.718321602405143e-06, "loss": 0.4498, "step": 814 }, { "epoch": 0.5904370924897367, "grad_norm": 0.3906296491622925, "learning_rate": 9.716924908658776e-06, "loss": 0.4239, "step": 815 }, { "epoch": 0.5911615551799082, "grad_norm": 0.415164053440094, "learning_rate": 9.71552486160799e-06, "loss": 0.4365, "step": 816 }, { "epoch": 0.5918860178700797, "grad_norm": 0.4262652099132538, "learning_rate": 9.714121462248093e-06, "loss": 0.432, "step": 817 }, { "epoch": 0.5926104805602511, "grad_norm": 0.4076358675956726, "learning_rate": 9.712714711576774e-06, "loss": 0.418, "step": 818 }, { "epoch": 0.5933349432504226, "grad_norm": 0.44822070002555847, "learning_rate": 9.711304610594104e-06, "loss": 0.4743, "step": 819 }, { "epoch": 0.594059405940594, "grad_norm": 0.4126460552215576, "learning_rate": 9.709891160302536e-06, "loss": 0.4191, "step": 820 }, { "epoch": 0.5947838686307655, "grad_norm": 0.39143335819244385, "learning_rate": 9.708474361706907e-06, "loss": 0.4639, "step": 821 }, { "epoch": 0.595508331320937, "grad_norm": 0.40678736567497253, "learning_rate": 9.707054215814428e-06, "loss": 0.4358, "step": 822 }, { "epoch": 0.5962327940111084, "grad_norm": 0.4545709788799286, "learning_rate": 9.705630723634698e-06, "loss": 0.4441, "step": 823 }, { "epoch": 0.5969572567012799, "grad_norm": 0.39318764209747314, "learning_rate": 9.704203886179689e-06, "loss": 0.4451, "step": 824 }, { "epoch": 0.5976817193914513, "grad_norm": 0.40893492102622986, "learning_rate": 9.702773704463752e-06, "loss": 0.4293, "step": 825 }, { "epoch": 0.5984061820816228, "grad_norm": 0.4317995607852936, "learning_rate": 9.701340179503614e-06, "loss": 0.4597, "step": 826 }, { "epoch": 0.5991306447717942, "grad_norm": 0.3837151527404785, "learning_rate": 9.699903312318385e-06, "loss": 0.4566, "step": 827 }, { "epoch": 0.5998551074619657, "grad_norm": 0.42366448044776917, "learning_rate": 9.698463103929542e-06, "loss": 0.4407, "step": 828 }, { "epoch": 0.6005795701521371, "grad_norm": 0.39618051052093506, "learning_rate": 9.697019555360947e-06, "loss": 0.4635, "step": 829 }, { "epoch": 0.6013040328423086, "grad_norm": 0.3645772933959961, "learning_rate": 9.695572667638829e-06, "loss": 0.4325, "step": 830 }, { "epoch": 0.6020284955324801, "grad_norm": 0.4138987362384796, "learning_rate": 9.694122441791793e-06, "loss": 0.466, "step": 831 }, { "epoch": 0.6027529582226515, "grad_norm": 0.4093734323978424, "learning_rate": 9.692668878850819e-06, "loss": 0.4463, "step": 832 }, { "epoch": 0.603477420912823, "grad_norm": 0.37769845128059387, "learning_rate": 9.691211979849258e-06, "loss": 0.4897, "step": 833 }, { "epoch": 0.6042018836029944, "grad_norm": 0.4384388327598572, "learning_rate": 9.689751745822833e-06, "loss": 0.4429, "step": 834 }, { "epoch": 0.6049263462931659, "grad_norm": 0.3932948708534241, "learning_rate": 9.688288177809635e-06, "loss": 0.4318, "step": 835 }, { "epoch": 0.6056508089833373, "grad_norm": 0.3901706039905548, "learning_rate": 9.686821276850131e-06, "loss": 0.4818, "step": 836 }, { "epoch": 0.6063752716735088, "grad_norm": 0.40918266773223877, "learning_rate": 9.685351043987151e-06, "loss": 0.4267, "step": 837 }, { "epoch": 0.6070997343636803, "grad_norm": 0.4044211506843567, "learning_rate": 9.6838774802659e-06, "loss": 0.4483, "step": 838 }, { "epoch": 0.6078241970538517, "grad_norm": 0.40156805515289307, "learning_rate": 9.682400586733945e-06, "loss": 0.4553, "step": 839 }, { "epoch": 0.6085486597440232, "grad_norm": 0.48017120361328125, "learning_rate": 9.680920364441223e-06, "loss": 0.4722, "step": 840 }, { "epoch": 0.6092731224341946, "grad_norm": 0.4191622734069824, "learning_rate": 9.67943681444004e-06, "loss": 0.4454, "step": 841 }, { "epoch": 0.6099975851243661, "grad_norm": 0.3827061653137207, "learning_rate": 9.677949937785063e-06, "loss": 0.4512, "step": 842 }, { "epoch": 0.6107220478145375, "grad_norm": 0.4357450306415558, "learning_rate": 9.676459735533326e-06, "loss": 0.4281, "step": 843 }, { "epoch": 0.611446510504709, "grad_norm": 0.40700921416282654, "learning_rate": 9.674966208744228e-06, "loss": 0.4372, "step": 844 }, { "epoch": 0.6121709731948805, "grad_norm": 0.37690702080726624, "learning_rate": 9.67346935847953e-06, "loss": 0.4543, "step": 845 }, { "epoch": 0.6128954358850519, "grad_norm": 0.3800661265850067, "learning_rate": 9.671969185803357e-06, "loss": 0.4076, "step": 846 }, { "epoch": 0.6136198985752234, "grad_norm": 0.40739554166793823, "learning_rate": 9.670465691782194e-06, "loss": 0.4219, "step": 847 }, { "epoch": 0.6143443612653948, "grad_norm": 0.3928028643131256, "learning_rate": 9.66895887748489e-06, "loss": 0.4331, "step": 848 }, { "epoch": 0.6150688239555663, "grad_norm": 0.3968651294708252, "learning_rate": 9.66744874398265e-06, "loss": 0.4404, "step": 849 }, { "epoch": 0.6157932866457377, "grad_norm": 0.36869797110557556, "learning_rate": 9.665935292349047e-06, "loss": 0.4467, "step": 850 }, { "epoch": 0.6165177493359092, "grad_norm": 0.42009544372558594, "learning_rate": 9.664418523660004e-06, "loss": 0.4631, "step": 851 }, { "epoch": 0.6172422120260806, "grad_norm": 0.3592831790447235, "learning_rate": 9.662898438993803e-06, "loss": 0.4483, "step": 852 }, { "epoch": 0.6179666747162521, "grad_norm": 0.38012638688087463, "learning_rate": 9.661375039431092e-06, "loss": 0.4477, "step": 853 }, { "epoch": 0.6186911374064236, "grad_norm": 0.4183570444583893, "learning_rate": 9.659848326054862e-06, "loss": 0.4304, "step": 854 }, { "epoch": 0.619415600096595, "grad_norm": 0.41112974286079407, "learning_rate": 9.658318299950473e-06, "loss": 0.472, "step": 855 }, { "epoch": 0.6201400627867665, "grad_norm": 0.43574005365371704, "learning_rate": 9.656784962205635e-06, "loss": 0.4351, "step": 856 }, { "epoch": 0.6208645254769379, "grad_norm": 0.3860520124435425, "learning_rate": 9.655248313910405e-06, "loss": 0.4675, "step": 857 }, { "epoch": 0.6215889881671094, "grad_norm": 0.4183955788612366, "learning_rate": 9.653708356157207e-06, "loss": 0.4392, "step": 858 }, { "epoch": 0.6223134508572808, "grad_norm": 0.36697492003440857, "learning_rate": 9.652165090040808e-06, "loss": 0.461, "step": 859 }, { "epoch": 0.6230379135474523, "grad_norm": 0.37670600414276123, "learning_rate": 9.65061851665833e-06, "loss": 0.4308, "step": 860 }, { "epoch": 0.6237623762376238, "grad_norm": 0.39000314474105835, "learning_rate": 9.649068637109245e-06, "loss": 0.4241, "step": 861 }, { "epoch": 0.6244868389277952, "grad_norm": 0.4611751437187195, "learning_rate": 9.647515452495378e-06, "loss": 0.4199, "step": 862 }, { "epoch": 0.6252113016179667, "grad_norm": 0.3849788308143616, "learning_rate": 9.645958963920901e-06, "loss": 0.4512, "step": 863 }, { "epoch": 0.6259357643081381, "grad_norm": 0.4624251127243042, "learning_rate": 9.644399172492337e-06, "loss": 0.4514, "step": 864 }, { "epoch": 0.6266602269983096, "grad_norm": 0.39507052302360535, "learning_rate": 9.642836079318553e-06, "loss": 0.4492, "step": 865 }, { "epoch": 0.627384689688481, "grad_norm": 0.4278791844844818, "learning_rate": 9.641269685510768e-06, "loss": 0.4232, "step": 866 }, { "epoch": 0.6281091523786525, "grad_norm": 0.4310159683227539, "learning_rate": 9.639699992182547e-06, "loss": 0.4287, "step": 867 }, { "epoch": 0.628833615068824, "grad_norm": 0.4481368362903595, "learning_rate": 9.638127000449795e-06, "loss": 0.4383, "step": 868 }, { "epoch": 0.6295580777589954, "grad_norm": 0.45911234617233276, "learning_rate": 9.63655071143077e-06, "loss": 0.4501, "step": 869 }, { "epoch": 0.6302825404491669, "grad_norm": 0.4214564859867096, "learning_rate": 9.634971126246067e-06, "loss": 0.4175, "step": 870 }, { "epoch": 0.6310070031393383, "grad_norm": 0.4525710642337799, "learning_rate": 9.63338824601863e-06, "loss": 0.4403, "step": 871 }, { "epoch": 0.6317314658295098, "grad_norm": 0.436284601688385, "learning_rate": 9.631802071873741e-06, "loss": 0.4486, "step": 872 }, { "epoch": 0.6324559285196812, "grad_norm": 0.4672541916370392, "learning_rate": 9.630212604939026e-06, "loss": 0.4521, "step": 873 }, { "epoch": 0.6331803912098527, "grad_norm": 0.4406264126300812, "learning_rate": 9.628619846344453e-06, "loss": 0.4045, "step": 874 }, { "epoch": 0.6339048539000242, "grad_norm": 0.5719868540763855, "learning_rate": 9.627023797222325e-06, "loss": 0.4866, "step": 875 }, { "epoch": 0.6346293165901956, "grad_norm": 0.43884405493736267, "learning_rate": 9.625424458707291e-06, "loss": 0.4501, "step": 876 }, { "epoch": 0.6353537792803671, "grad_norm": 0.4829519987106323, "learning_rate": 9.623821831936333e-06, "loss": 0.4442, "step": 877 }, { "epoch": 0.6360782419705385, "grad_norm": 0.4972541928291321, "learning_rate": 9.622215918048774e-06, "loss": 0.4224, "step": 878 }, { "epoch": 0.63680270466071, "grad_norm": 0.44246381521224976, "learning_rate": 9.620606718186272e-06, "loss": 0.427, "step": 879 }, { "epoch": 0.6375271673508814, "grad_norm": 0.4771749675273895, "learning_rate": 9.61899423349282e-06, "loss": 0.4414, "step": 880 }, { "epoch": 0.6382516300410529, "grad_norm": 0.5113094449043274, "learning_rate": 9.617378465114753e-06, "loss": 0.4452, "step": 881 }, { "epoch": 0.6389760927312244, "grad_norm": 0.3977792263031006, "learning_rate": 9.615759414200729e-06, "loss": 0.4343, "step": 882 }, { "epoch": 0.6397005554213958, "grad_norm": 0.5014514327049255, "learning_rate": 9.614137081901751e-06, "loss": 0.4642, "step": 883 }, { "epoch": 0.6404250181115673, "grad_norm": 0.46444767713546753, "learning_rate": 9.612511469371146e-06, "loss": 0.4412, "step": 884 }, { "epoch": 0.6411494808017387, "grad_norm": 0.42733290791511536, "learning_rate": 9.61088257776458e-06, "loss": 0.4668, "step": 885 }, { "epoch": 0.6418739434919102, "grad_norm": 0.3827154338359833, "learning_rate": 9.609250408240046e-06, "loss": 0.4276, "step": 886 }, { "epoch": 0.6425984061820816, "grad_norm": 0.4315594434738159, "learning_rate": 9.607614961957863e-06, "loss": 0.4436, "step": 887 }, { "epoch": 0.6433228688722531, "grad_norm": 0.43633192777633667, "learning_rate": 9.605976240080691e-06, "loss": 0.5051, "step": 888 }, { "epoch": 0.6440473315624246, "grad_norm": 0.4368920624256134, "learning_rate": 9.604334243773509e-06, "loss": 0.4637, "step": 889 }, { "epoch": 0.644771794252596, "grad_norm": 0.4452131390571594, "learning_rate": 9.602688974203629e-06, "loss": 0.4443, "step": 890 }, { "epoch": 0.6454962569427675, "grad_norm": 0.3883676826953888, "learning_rate": 9.601040432540684e-06, "loss": 0.4404, "step": 891 }, { "epoch": 0.6462207196329389, "grad_norm": 0.4019961953163147, "learning_rate": 9.599388619956642e-06, "loss": 0.4436, "step": 892 }, { "epoch": 0.6469451823231104, "grad_norm": 0.46847617626190186, "learning_rate": 9.597733537625788e-06, "loss": 0.446, "step": 893 }, { "epoch": 0.6476696450132818, "grad_norm": 0.41111230850219727, "learning_rate": 9.596075186724737e-06, "loss": 0.4513, "step": 894 }, { "epoch": 0.6483941077034533, "grad_norm": 0.423624187707901, "learning_rate": 9.594413568432421e-06, "loss": 0.4555, "step": 895 }, { "epoch": 0.6491185703936247, "grad_norm": 0.44857993721961975, "learning_rate": 9.592748683930106e-06, "loss": 0.4591, "step": 896 }, { "epoch": 0.6498430330837962, "grad_norm": 0.4101342558860779, "learning_rate": 9.591080534401371e-06, "loss": 0.4278, "step": 897 }, { "epoch": 0.6505674957739677, "grad_norm": 0.36504730582237244, "learning_rate": 9.589409121032117e-06, "loss": 0.4165, "step": 898 }, { "epoch": 0.6512919584641391, "grad_norm": 0.419657826423645, "learning_rate": 9.587734445010568e-06, "loss": 0.4328, "step": 899 }, { "epoch": 0.6520164211543106, "grad_norm": 0.43852537870407104, "learning_rate": 9.586056507527266e-06, "loss": 0.4283, "step": 900 }, { "epoch": 0.652740883844482, "grad_norm": 0.3770526051521301, "learning_rate": 9.584375309775071e-06, "loss": 0.4426, "step": 901 }, { "epoch": 0.6534653465346535, "grad_norm": 0.46526017785072327, "learning_rate": 9.582690852949164e-06, "loss": 0.4692, "step": 902 }, { "epoch": 0.654189809224825, "grad_norm": 0.3587336540222168, "learning_rate": 9.58100313824704e-06, "loss": 0.4133, "step": 903 }, { "epoch": 0.6549142719149964, "grad_norm": 0.46992042660713196, "learning_rate": 9.579312166868507e-06, "loss": 0.4514, "step": 904 }, { "epoch": 0.6556387346051679, "grad_norm": 0.3875126540660858, "learning_rate": 9.577617940015695e-06, "loss": 0.442, "step": 905 }, { "epoch": 0.6563631972953393, "grad_norm": 0.40051212906837463, "learning_rate": 9.575920458893046e-06, "loss": 0.4295, "step": 906 }, { "epoch": 0.6570876599855108, "grad_norm": 0.39937758445739746, "learning_rate": 9.574219724707313e-06, "loss": 0.3938, "step": 907 }, { "epoch": 0.6578121226756822, "grad_norm": 0.40078675746917725, "learning_rate": 9.572515738667563e-06, "loss": 0.4395, "step": 908 }, { "epoch": 0.6585365853658537, "grad_norm": 0.4561125636100769, "learning_rate": 9.570808501985176e-06, "loss": 0.4641, "step": 909 }, { "epoch": 0.6592610480560251, "grad_norm": 0.4386140704154968, "learning_rate": 9.56909801587384e-06, "loss": 0.4468, "step": 910 }, { "epoch": 0.6599855107461966, "grad_norm": 0.44128313660621643, "learning_rate": 9.56738428154956e-06, "loss": 0.4522, "step": 911 }, { "epoch": 0.6607099734363681, "grad_norm": 0.3870251178741455, "learning_rate": 9.565667300230637e-06, "loss": 0.4256, "step": 912 }, { "epoch": 0.6614344361265395, "grad_norm": 0.4524024724960327, "learning_rate": 9.563947073137695e-06, "loss": 0.4313, "step": 913 }, { "epoch": 0.662158898816711, "grad_norm": 0.3524264693260193, "learning_rate": 9.562223601493658e-06, "loss": 0.4322, "step": 914 }, { "epoch": 0.6628833615068824, "grad_norm": 0.4363982081413269, "learning_rate": 9.560496886523756e-06, "loss": 0.4433, "step": 915 }, { "epoch": 0.6636078241970539, "grad_norm": 0.41511109471321106, "learning_rate": 9.558766929455527e-06, "loss": 0.4606, "step": 916 }, { "epoch": 0.6643322868872253, "grad_norm": 0.37013405561447144, "learning_rate": 9.557033731518813e-06, "loss": 0.4376, "step": 917 }, { "epoch": 0.6650567495773968, "grad_norm": 0.42168766260147095, "learning_rate": 9.55529729394576e-06, "loss": 0.4649, "step": 918 }, { "epoch": 0.6657812122675683, "grad_norm": 0.3986402153968811, "learning_rate": 9.553557617970817e-06, "loss": 0.4565, "step": 919 }, { "epoch": 0.6665056749577397, "grad_norm": 0.3860379159450531, "learning_rate": 9.551814704830734e-06, "loss": 0.4069, "step": 920 }, { "epoch": 0.6672301376479112, "grad_norm": 0.3931432366371155, "learning_rate": 9.550068555764567e-06, "loss": 0.4612, "step": 921 }, { "epoch": 0.6679546003380826, "grad_norm": 0.3967553973197937, "learning_rate": 9.548319172013665e-06, "loss": 0.4511, "step": 922 }, { "epoch": 0.6686790630282541, "grad_norm": 0.36526596546173096, "learning_rate": 9.546566554821683e-06, "loss": 0.4356, "step": 923 }, { "epoch": 0.6694035257184255, "grad_norm": 0.4314006567001343, "learning_rate": 9.544810705434574e-06, "loss": 0.446, "step": 924 }, { "epoch": 0.670127988408597, "grad_norm": 0.43257901072502136, "learning_rate": 9.543051625100584e-06, "loss": 0.4318, "step": 925 }, { "epoch": 0.6708524510987685, "grad_norm": 0.4155063033103943, "learning_rate": 9.54128931507026e-06, "loss": 0.4307, "step": 926 }, { "epoch": 0.6715769137889399, "grad_norm": 0.44219550490379333, "learning_rate": 9.539523776596446e-06, "loss": 0.4149, "step": 927 }, { "epoch": 0.6723013764791114, "grad_norm": 0.40312889218330383, "learning_rate": 9.537755010934276e-06, "loss": 0.3991, "step": 928 }, { "epoch": 0.6730258391692828, "grad_norm": 0.4250657260417938, "learning_rate": 9.535983019341184e-06, "loss": 0.4462, "step": 929 }, { "epoch": 0.6737503018594543, "grad_norm": 0.4725401997566223, "learning_rate": 9.534207803076894e-06, "loss": 0.4734, "step": 930 }, { "epoch": 0.6744747645496257, "grad_norm": 0.3754332363605499, "learning_rate": 9.532429363403421e-06, "loss": 0.4373, "step": 931 }, { "epoch": 0.6751992272397972, "grad_norm": 0.43413710594177246, "learning_rate": 9.530647701585078e-06, "loss": 0.4026, "step": 932 }, { "epoch": 0.6759236899299687, "grad_norm": 0.4020750820636749, "learning_rate": 9.528862818888462e-06, "loss": 0.4512, "step": 933 }, { "epoch": 0.6766481526201401, "grad_norm": 0.3839271366596222, "learning_rate": 9.527074716582463e-06, "loss": 0.437, "step": 934 }, { "epoch": 0.6773726153103116, "grad_norm": 0.39749541878700256, "learning_rate": 9.525283395938259e-06, "loss": 0.417, "step": 935 }, { "epoch": 0.678097078000483, "grad_norm": 0.43483495712280273, "learning_rate": 9.523488858229313e-06, "loss": 0.4567, "step": 936 }, { "epoch": 0.6788215406906545, "grad_norm": 0.4033527970314026, "learning_rate": 9.521691104731381e-06, "loss": 0.4226, "step": 937 }, { "epoch": 0.6795460033808259, "grad_norm": 0.3772134780883789, "learning_rate": 9.519890136722505e-06, "loss": 0.4334, "step": 938 }, { "epoch": 0.6802704660709974, "grad_norm": 0.37122389674186707, "learning_rate": 9.518085955483002e-06, "loss": 0.4225, "step": 939 }, { "epoch": 0.6809949287611688, "grad_norm": 0.4325699508190155, "learning_rate": 9.516278562295487e-06, "loss": 0.4152, "step": 940 }, { "epoch": 0.6817193914513403, "grad_norm": 0.37359827756881714, "learning_rate": 9.51446795844485e-06, "loss": 0.4366, "step": 941 }, { "epoch": 0.6824438541415118, "grad_norm": 0.39326733350753784, "learning_rate": 9.512654145218265e-06, "loss": 0.4503, "step": 942 }, { "epoch": 0.6831683168316832, "grad_norm": 0.3589102625846863, "learning_rate": 9.51083712390519e-06, "loss": 0.4368, "step": 943 }, { "epoch": 0.6838927795218547, "grad_norm": 0.3660793900489807, "learning_rate": 9.509016895797358e-06, "loss": 0.4264, "step": 944 }, { "epoch": 0.6846172422120261, "grad_norm": 0.3941572904586792, "learning_rate": 9.507193462188791e-06, "loss": 0.4565, "step": 945 }, { "epoch": 0.6853417049021976, "grad_norm": 0.3688904941082001, "learning_rate": 9.505366824375779e-06, "loss": 0.4373, "step": 946 }, { "epoch": 0.686066167592369, "grad_norm": 0.39947018027305603, "learning_rate": 9.503536983656898e-06, "loss": 0.4189, "step": 947 }, { "epoch": 0.6867906302825405, "grad_norm": 0.37034836411476135, "learning_rate": 9.501703941332999e-06, "loss": 0.4459, "step": 948 }, { "epoch": 0.6875150929727118, "grad_norm": 0.38267090916633606, "learning_rate": 9.499867698707205e-06, "loss": 0.419, "step": 949 }, { "epoch": 0.6882395556628833, "grad_norm": 0.39816734194755554, "learning_rate": 9.498028257084923e-06, "loss": 0.4452, "step": 950 }, { "epoch": 0.6889640183530548, "grad_norm": 0.4213787317276001, "learning_rate": 9.496185617773824e-06, "loss": 0.471, "step": 951 }, { "epoch": 0.6896884810432262, "grad_norm": 0.34985843300819397, "learning_rate": 9.494339782083857e-06, "loss": 0.4268, "step": 952 }, { "epoch": 0.6904129437333977, "grad_norm": 0.3768939971923828, "learning_rate": 9.492490751327245e-06, "loss": 0.4496, "step": 953 }, { "epoch": 0.6911374064235691, "grad_norm": 0.3784196972846985, "learning_rate": 9.490638526818482e-06, "loss": 0.4474, "step": 954 }, { "epoch": 0.6918618691137406, "grad_norm": 0.3469781279563904, "learning_rate": 9.488783109874327e-06, "loss": 0.4084, "step": 955 }, { "epoch": 0.692586331803912, "grad_norm": 0.3851598799228668, "learning_rate": 9.486924501813815e-06, "loss": 0.4225, "step": 956 }, { "epoch": 0.6933107944940835, "grad_norm": 0.3312722444534302, "learning_rate": 9.485062703958248e-06, "loss": 0.4123, "step": 957 }, { "epoch": 0.694035257184255, "grad_norm": 0.36504507064819336, "learning_rate": 9.483197717631193e-06, "loss": 0.4208, "step": 958 }, { "epoch": 0.6947597198744264, "grad_norm": 0.3897438943386078, "learning_rate": 9.481329544158486e-06, "loss": 0.4451, "step": 959 }, { "epoch": 0.6954841825645979, "grad_norm": 0.38637563586235046, "learning_rate": 9.47945818486823e-06, "loss": 0.4432, "step": 960 }, { "epoch": 0.6962086452547693, "grad_norm": 0.4060679078102112, "learning_rate": 9.47758364109079e-06, "loss": 0.436, "step": 961 }, { "epoch": 0.6969331079449408, "grad_norm": 0.36793801188468933, "learning_rate": 9.475705914158796e-06, "loss": 0.4189, "step": 962 }, { "epoch": 0.6976575706351122, "grad_norm": 0.3810827434062958, "learning_rate": 9.47382500540714e-06, "loss": 0.4234, "step": 963 }, { "epoch": 0.6983820333252837, "grad_norm": 0.36961886286735535, "learning_rate": 9.47194091617298e-06, "loss": 0.4262, "step": 964 }, { "epoch": 0.6991064960154552, "grad_norm": 0.3853397071361542, "learning_rate": 9.470053647795727e-06, "loss": 0.4497, "step": 965 }, { "epoch": 0.6998309587056266, "grad_norm": 0.3898027241230011, "learning_rate": 9.468163201617063e-06, "loss": 0.4458, "step": 966 }, { "epoch": 0.7005554213957981, "grad_norm": 0.38989728689193726, "learning_rate": 9.466269578980917e-06, "loss": 0.4321, "step": 967 }, { "epoch": 0.7012798840859695, "grad_norm": 0.44447991251945496, "learning_rate": 9.464372781233489e-06, "loss": 0.4801, "step": 968 }, { "epoch": 0.702004346776141, "grad_norm": 0.40722593665122986, "learning_rate": 9.462472809723226e-06, "loss": 0.4538, "step": 969 }, { "epoch": 0.7027288094663124, "grad_norm": 0.3770104646682739, "learning_rate": 9.460569665800833e-06, "loss": 0.4517, "step": 970 }, { "epoch": 0.7034532721564839, "grad_norm": 0.38395723700523376, "learning_rate": 9.458663350819277e-06, "loss": 0.4394, "step": 971 }, { "epoch": 0.7041777348466554, "grad_norm": 0.3628930151462555, "learning_rate": 9.45675386613377e-06, "loss": 0.4199, "step": 972 }, { "epoch": 0.7049021975368268, "grad_norm": 0.3590884506702423, "learning_rate": 9.454841213101784e-06, "loss": 0.4508, "step": 973 }, { "epoch": 0.7056266602269983, "grad_norm": 0.4284939467906952, "learning_rate": 9.452925393083041e-06, "loss": 0.4293, "step": 974 }, { "epoch": 0.7063511229171697, "grad_norm": 0.4312379062175751, "learning_rate": 9.451006407439515e-06, "loss": 0.436, "step": 975 }, { "epoch": 0.7070755856073412, "grad_norm": 0.359935998916626, "learning_rate": 9.44908425753543e-06, "loss": 0.3901, "step": 976 }, { "epoch": 0.7078000482975126, "grad_norm": 0.4188145101070404, "learning_rate": 9.447158944737261e-06, "loss": 0.4309, "step": 977 }, { "epoch": 0.7085245109876841, "grad_norm": 0.4267701506614685, "learning_rate": 9.445230470413726e-06, "loss": 0.4203, "step": 978 }, { "epoch": 0.7092489736778556, "grad_norm": 0.43181249499320984, "learning_rate": 9.4432988359358e-06, "loss": 0.4717, "step": 979 }, { "epoch": 0.709973436368027, "grad_norm": 0.3534151017665863, "learning_rate": 9.441364042676697e-06, "loss": 0.4068, "step": 980 }, { "epoch": 0.7106978990581985, "grad_norm": 0.39549124240875244, "learning_rate": 9.439426092011877e-06, "loss": 0.4206, "step": 981 }, { "epoch": 0.7114223617483699, "grad_norm": 0.39119014143943787, "learning_rate": 9.43748498531905e-06, "loss": 0.4125, "step": 982 }, { "epoch": 0.7121468244385414, "grad_norm": 0.38045740127563477, "learning_rate": 9.435540723978163e-06, "loss": 0.4437, "step": 983 }, { "epoch": 0.7128712871287128, "grad_norm": 0.34724679589271545, "learning_rate": 9.433593309371411e-06, "loss": 0.4362, "step": 984 }, { "epoch": 0.7135957498188843, "grad_norm": 0.4044446647167206, "learning_rate": 9.43164274288323e-06, "loss": 0.4481, "step": 985 }, { "epoch": 0.7143202125090558, "grad_norm": 0.460126668214798, "learning_rate": 9.429689025900293e-06, "loss": 0.468, "step": 986 }, { "epoch": 0.7150446751992272, "grad_norm": 0.44857120513916016, "learning_rate": 9.427732159811514e-06, "loss": 0.4525, "step": 987 }, { "epoch": 0.7157691378893987, "grad_norm": 0.44323647022247314, "learning_rate": 9.42577214600805e-06, "loss": 0.4468, "step": 988 }, { "epoch": 0.7164936005795701, "grad_norm": 0.4160037934780121, "learning_rate": 9.423808985883289e-06, "loss": 0.4256, "step": 989 }, { "epoch": 0.7172180632697416, "grad_norm": 0.42461177706718445, "learning_rate": 9.421842680832862e-06, "loss": 0.4202, "step": 990 }, { "epoch": 0.717942525959913, "grad_norm": 0.4257369935512543, "learning_rate": 9.419873232254632e-06, "loss": 0.4269, "step": 991 }, { "epoch": 0.7186669886500845, "grad_norm": 0.4304386377334595, "learning_rate": 9.417900641548699e-06, "loss": 0.4417, "step": 992 }, { "epoch": 0.719391451340256, "grad_norm": 0.41846027970314026, "learning_rate": 9.415924910117391e-06, "loss": 0.3967, "step": 993 }, { "epoch": 0.7201159140304274, "grad_norm": 0.4379346966743469, "learning_rate": 9.413946039365277e-06, "loss": 0.428, "step": 994 }, { "epoch": 0.7208403767205989, "grad_norm": 0.3850870430469513, "learning_rate": 9.411964030699156e-06, "loss": 0.4255, "step": 995 }, { "epoch": 0.7215648394107703, "grad_norm": 0.434347927570343, "learning_rate": 9.409978885528053e-06, "loss": 0.4545, "step": 996 }, { "epoch": 0.7222893021009418, "grad_norm": 0.41526368260383606, "learning_rate": 9.407990605263225e-06, "loss": 0.4526, "step": 997 }, { "epoch": 0.7230137647911132, "grad_norm": 0.3917652368545532, "learning_rate": 9.405999191318158e-06, "loss": 0.4414, "step": 998 }, { "epoch": 0.7237382274812847, "grad_norm": 0.42446932196617126, "learning_rate": 9.40400464510857e-06, "loss": 0.4384, "step": 999 }, { "epoch": 0.7244626901714561, "grad_norm": 0.40269631147384644, "learning_rate": 9.402006968052396e-06, "loss": 0.4366, "step": 1000 }, { "epoch": 0.7251871528616276, "grad_norm": 0.4052945077419281, "learning_rate": 9.400006161569808e-06, "loss": 0.4335, "step": 1001 }, { "epoch": 0.7259116155517991, "grad_norm": 0.4494095742702484, "learning_rate": 9.398002227083193e-06, "loss": 0.4295, "step": 1002 }, { "epoch": 0.7266360782419705, "grad_norm": 0.4094015955924988, "learning_rate": 9.395995166017171e-06, "loss": 0.4193, "step": 1003 }, { "epoch": 0.727360540932142, "grad_norm": 0.41275161504745483, "learning_rate": 9.393984979798577e-06, "loss": 0.4378, "step": 1004 }, { "epoch": 0.7280850036223134, "grad_norm": 0.41076353192329407, "learning_rate": 9.391971669856469e-06, "loss": 0.4564, "step": 1005 }, { "epoch": 0.7288094663124849, "grad_norm": 0.41270843148231506, "learning_rate": 9.38995523762213e-06, "loss": 0.4096, "step": 1006 }, { "epoch": 0.7295339290026563, "grad_norm": 0.4057427942752838, "learning_rate": 9.387935684529057e-06, "loss": 0.4606, "step": 1007 }, { "epoch": 0.7302583916928278, "grad_norm": 0.4032459557056427, "learning_rate": 9.385913012012972e-06, "loss": 0.428, "step": 1008 }, { "epoch": 0.7309828543829993, "grad_norm": 0.365999311208725, "learning_rate": 9.383887221511809e-06, "loss": 0.4449, "step": 1009 }, { "epoch": 0.7317073170731707, "grad_norm": 0.37868577241897583, "learning_rate": 9.381858314465719e-06, "loss": 0.4499, "step": 1010 }, { "epoch": 0.7324317797633422, "grad_norm": 0.39182478189468384, "learning_rate": 9.379826292317075e-06, "loss": 0.4287, "step": 1011 }, { "epoch": 0.7331562424535136, "grad_norm": 0.41855061054229736, "learning_rate": 9.377791156510456e-06, "loss": 0.4182, "step": 1012 }, { "epoch": 0.7338807051436851, "grad_norm": 0.4000391960144043, "learning_rate": 9.37575290849266e-06, "loss": 0.4442, "step": 1013 }, { "epoch": 0.7346051678338565, "grad_norm": 0.3930225670337677, "learning_rate": 9.373711549712694e-06, "loss": 0.4555, "step": 1014 }, { "epoch": 0.735329630524028, "grad_norm": 0.42249712347984314, "learning_rate": 9.371667081621779e-06, "loss": 0.4154, "step": 1015 }, { "epoch": 0.7360540932141995, "grad_norm": 0.4157273471355438, "learning_rate": 9.369619505673348e-06, "loss": 0.4501, "step": 1016 }, { "epoch": 0.7367785559043709, "grad_norm": 0.39603620767593384, "learning_rate": 9.367568823323039e-06, "loss": 0.4389, "step": 1017 }, { "epoch": 0.7375030185945424, "grad_norm": 0.3827815055847168, "learning_rate": 9.365515036028702e-06, "loss": 0.4284, "step": 1018 }, { "epoch": 0.7382274812847138, "grad_norm": 0.4068104028701782, "learning_rate": 9.363458145250392e-06, "loss": 0.4359, "step": 1019 }, { "epoch": 0.7389519439748853, "grad_norm": 0.436969131231308, "learning_rate": 9.36139815245037e-06, "loss": 0.4455, "step": 1020 }, { "epoch": 0.7396764066650567, "grad_norm": 0.37201741337776184, "learning_rate": 9.359335059093107e-06, "loss": 0.4369, "step": 1021 }, { "epoch": 0.7404008693552282, "grad_norm": 0.4652364253997803, "learning_rate": 9.35726886664527e-06, "loss": 0.4297, "step": 1022 }, { "epoch": 0.7411253320453997, "grad_norm": 0.41318705677986145, "learning_rate": 9.355199576575738e-06, "loss": 0.469, "step": 1023 }, { "epoch": 0.7418497947355711, "grad_norm": 0.46660298109054565, "learning_rate": 9.353127190355587e-06, "loss": 0.4475, "step": 1024 }, { "epoch": 0.7425742574257426, "grad_norm": 0.48308318853378296, "learning_rate": 9.351051709458092e-06, "loss": 0.444, "step": 1025 }, { "epoch": 0.743298720115914, "grad_norm": 0.49536195397377014, "learning_rate": 9.348973135358734e-06, "loss": 0.4323, "step": 1026 }, { "epoch": 0.7440231828060855, "grad_norm": 0.48932647705078125, "learning_rate": 9.34689146953519e-06, "loss": 0.4193, "step": 1027 }, { "epoch": 0.7447476454962569, "grad_norm": 0.4195725619792938, "learning_rate": 9.344806713467334e-06, "loss": 0.4299, "step": 1028 }, { "epoch": 0.7454721081864284, "grad_norm": 0.4397088587284088, "learning_rate": 9.342718868637238e-06, "loss": 0.4713, "step": 1029 }, { "epoch": 0.7461965708765999, "grad_norm": 0.4477749466896057, "learning_rate": 9.34062793652917e-06, "loss": 0.4158, "step": 1030 }, { "epoch": 0.7469210335667713, "grad_norm": 0.44897550344467163, "learning_rate": 9.338533918629594e-06, "loss": 0.4243, "step": 1031 }, { "epoch": 0.7476454962569428, "grad_norm": 0.4267942011356354, "learning_rate": 9.336436816427161e-06, "loss": 0.4259, "step": 1032 }, { "epoch": 0.7483699589471142, "grad_norm": 0.500425398349762, "learning_rate": 9.334336631412726e-06, "loss": 0.4204, "step": 1033 }, { "epoch": 0.7490944216372857, "grad_norm": 0.4137190282344818, "learning_rate": 9.332233365079325e-06, "loss": 0.4393, "step": 1034 }, { "epoch": 0.7498188843274571, "grad_norm": 0.44953417778015137, "learning_rate": 9.330127018922195e-06, "loss": 0.4542, "step": 1035 }, { "epoch": 0.7505433470176286, "grad_norm": 0.48390910029411316, "learning_rate": 9.328017594438748e-06, "loss": 0.4369, "step": 1036 }, { "epoch": 0.7512678097078, "grad_norm": 0.416665643453598, "learning_rate": 9.3259050931286e-06, "loss": 0.4313, "step": 1037 }, { "epoch": 0.7519922723979715, "grad_norm": 0.4601927399635315, "learning_rate": 9.323789516493542e-06, "loss": 0.4504, "step": 1038 }, { "epoch": 0.752716735088143, "grad_norm": 0.4730970859527588, "learning_rate": 9.321670866037562e-06, "loss": 0.4381, "step": 1039 }, { "epoch": 0.7534411977783144, "grad_norm": 0.3944840133190155, "learning_rate": 9.319549143266823e-06, "loss": 0.4393, "step": 1040 }, { "epoch": 0.7541656604684859, "grad_norm": 0.4419754147529602, "learning_rate": 9.317424349689677e-06, "loss": 0.4479, "step": 1041 }, { "epoch": 0.7548901231586573, "grad_norm": 0.41033580899238586, "learning_rate": 9.315296486816663e-06, "loss": 0.4524, "step": 1042 }, { "epoch": 0.7556145858488288, "grad_norm": 0.390825480222702, "learning_rate": 9.313165556160493e-06, "loss": 0.4603, "step": 1043 }, { "epoch": 0.7563390485390002, "grad_norm": 0.40725159645080566, "learning_rate": 9.311031559236067e-06, "loss": 0.4291, "step": 1044 }, { "epoch": 0.7570635112291717, "grad_norm": 0.412993460893631, "learning_rate": 9.308894497560465e-06, "loss": 0.4807, "step": 1045 }, { "epoch": 0.7577879739193432, "grad_norm": 0.4204166829586029, "learning_rate": 9.30675437265294e-06, "loss": 0.4396, "step": 1046 }, { "epoch": 0.7585124366095146, "grad_norm": 0.37054407596588135, "learning_rate": 9.304611186034926e-06, "loss": 0.4448, "step": 1047 }, { "epoch": 0.7592368992996861, "grad_norm": 0.45943933725357056, "learning_rate": 9.302464939230037e-06, "loss": 0.4338, "step": 1048 }, { "epoch": 0.7599613619898575, "grad_norm": 0.46317145228385925, "learning_rate": 9.300315633764055e-06, "loss": 0.4568, "step": 1049 }, { "epoch": 0.760685824680029, "grad_norm": 0.42357516288757324, "learning_rate": 9.298163271164944e-06, "loss": 0.4579, "step": 1050 }, { "epoch": 0.7614102873702004, "grad_norm": 0.44015824794769287, "learning_rate": 9.296007852962835e-06, "loss": 0.4484, "step": 1051 }, { "epoch": 0.7621347500603719, "grad_norm": 0.4057539403438568, "learning_rate": 9.293849380690037e-06, "loss": 0.4467, "step": 1052 }, { "epoch": 0.7628592127505434, "grad_norm": 0.3817419111728668, "learning_rate": 9.291687855881027e-06, "loss": 0.4268, "step": 1053 }, { "epoch": 0.7635836754407148, "grad_norm": 0.411005437374115, "learning_rate": 9.28952328007245e-06, "loss": 0.4637, "step": 1054 }, { "epoch": 0.7643081381308863, "grad_norm": 0.4017236530780792, "learning_rate": 9.287355654803125e-06, "loss": 0.4453, "step": 1055 }, { "epoch": 0.7650326008210577, "grad_norm": 0.40975144505500793, "learning_rate": 9.285184981614035e-06, "loss": 0.4243, "step": 1056 }, { "epoch": 0.7657570635112292, "grad_norm": 0.3413551449775696, "learning_rate": 9.283011262048334e-06, "loss": 0.4264, "step": 1057 }, { "epoch": 0.7664815262014006, "grad_norm": 0.40049874782562256, "learning_rate": 9.280834497651334e-06, "loss": 0.4201, "step": 1058 }, { "epoch": 0.7672059888915721, "grad_norm": 0.3825286328792572, "learning_rate": 9.278654689970519e-06, "loss": 0.449, "step": 1059 }, { "epoch": 0.7679304515817436, "grad_norm": 0.4081519842147827, "learning_rate": 9.276471840555535e-06, "loss": 0.4333, "step": 1060 }, { "epoch": 0.768654914271915, "grad_norm": 0.4335751533508301, "learning_rate": 9.274285950958187e-06, "loss": 0.4172, "step": 1061 }, { "epoch": 0.7693793769620865, "grad_norm": 0.34927207231521606, "learning_rate": 9.272097022732444e-06, "loss": 0.4186, "step": 1062 }, { "epoch": 0.7701038396522579, "grad_norm": 0.46532440185546875, "learning_rate": 9.269905057434436e-06, "loss": 0.4458, "step": 1063 }, { "epoch": 0.7708283023424294, "grad_norm": 0.4283447563648224, "learning_rate": 9.267710056622452e-06, "loss": 0.4295, "step": 1064 }, { "epoch": 0.7715527650326008, "grad_norm": 0.40760543942451477, "learning_rate": 9.265512021856934e-06, "loss": 0.4621, "step": 1065 }, { "epoch": 0.7722772277227723, "grad_norm": 0.4961356818675995, "learning_rate": 9.263310954700487e-06, "loss": 0.434, "step": 1066 }, { "epoch": 0.7730016904129438, "grad_norm": 0.33915263414382935, "learning_rate": 9.261106856717871e-06, "loss": 0.4522, "step": 1067 }, { "epoch": 0.7737261531031152, "grad_norm": 0.4364902079105377, "learning_rate": 9.258899729475997e-06, "loss": 0.4591, "step": 1068 }, { "epoch": 0.7744506157932867, "grad_norm": 0.3842344880104065, "learning_rate": 9.256689574543931e-06, "loss": 0.4131, "step": 1069 }, { "epoch": 0.7751750784834581, "grad_norm": 0.36406752467155457, "learning_rate": 9.254476393492895e-06, "loss": 0.4424, "step": 1070 }, { "epoch": 0.7758995411736296, "grad_norm": 0.4229189157485962, "learning_rate": 9.252260187896257e-06, "loss": 0.4377, "step": 1071 }, { "epoch": 0.776624003863801, "grad_norm": 0.4069315791130066, "learning_rate": 9.250040959329539e-06, "loss": 0.4322, "step": 1072 }, { "epoch": 0.7773484665539725, "grad_norm": 0.38519904017448425, "learning_rate": 9.247818709370406e-06, "loss": 0.4279, "step": 1073 }, { "epoch": 0.778072929244144, "grad_norm": 0.34695881605148315, "learning_rate": 9.245593439598684e-06, "loss": 0.4382, "step": 1074 }, { "epoch": 0.7787973919343154, "grad_norm": 0.3752036988735199, "learning_rate": 9.24336515159633e-06, "loss": 0.4275, "step": 1075 }, { "epoch": 0.7795218546244869, "grad_norm": 0.3589307367801666, "learning_rate": 9.241133846947457e-06, "loss": 0.4377, "step": 1076 }, { "epoch": 0.7802463173146583, "grad_norm": 0.4055996239185333, "learning_rate": 9.238899527238319e-06, "loss": 0.4142, "step": 1077 }, { "epoch": 0.7809707800048298, "grad_norm": 0.4022076725959778, "learning_rate": 9.236662194057314e-06, "loss": 0.4407, "step": 1078 }, { "epoch": 0.7816952426950012, "grad_norm": 0.36757490038871765, "learning_rate": 9.234421848994982e-06, "loss": 0.4552, "step": 1079 }, { "epoch": 0.7824197053851727, "grad_norm": 0.3741929531097412, "learning_rate": 9.232178493644006e-06, "loss": 0.4326, "step": 1080 }, { "epoch": 0.7831441680753441, "grad_norm": 0.40012165904045105, "learning_rate": 9.229932129599206e-06, "loss": 0.4339, "step": 1081 }, { "epoch": 0.7838686307655156, "grad_norm": 0.40874114632606506, "learning_rate": 9.227682758457544e-06, "loss": 0.4403, "step": 1082 }, { "epoch": 0.7845930934556871, "grad_norm": 0.3631582260131836, "learning_rate": 9.225430381818116e-06, "loss": 0.4287, "step": 1083 }, { "epoch": 0.7853175561458585, "grad_norm": 0.40000277757644653, "learning_rate": 9.223175001282157e-06, "loss": 0.4075, "step": 1084 }, { "epoch": 0.78604201883603, "grad_norm": 0.37886300683021545, "learning_rate": 9.22091661845304e-06, "loss": 0.4626, "step": 1085 }, { "epoch": 0.7867664815262014, "grad_norm": 0.4256695508956909, "learning_rate": 9.218655234936267e-06, "loss": 0.4176, "step": 1086 }, { "epoch": 0.7874909442163729, "grad_norm": 0.48891156911849976, "learning_rate": 9.216390852339478e-06, "loss": 0.4849, "step": 1087 }, { "epoch": 0.7882154069065443, "grad_norm": 0.38585418462753296, "learning_rate": 9.214123472272442e-06, "loss": 0.4434, "step": 1088 }, { "epoch": 0.7889398695967158, "grad_norm": 0.3846301734447479, "learning_rate": 9.211853096347059e-06, "loss": 0.4241, "step": 1089 }, { "epoch": 0.7896643322868873, "grad_norm": 0.38906145095825195, "learning_rate": 9.20957972617736e-06, "loss": 0.4355, "step": 1090 }, { "epoch": 0.7903887949770587, "grad_norm": 0.41960757970809937, "learning_rate": 9.207303363379507e-06, "loss": 0.4452, "step": 1091 }, { "epoch": 0.7911132576672302, "grad_norm": 0.40459001064300537, "learning_rate": 9.205024009571784e-06, "loss": 0.4361, "step": 1092 }, { "epoch": 0.7918377203574016, "grad_norm": 0.380317747592926, "learning_rate": 9.202741666374604e-06, "loss": 0.405, "step": 1093 }, { "epoch": 0.7925621830475731, "grad_norm": 0.38362371921539307, "learning_rate": 9.200456335410506e-06, "loss": 0.4162, "step": 1094 }, { "epoch": 0.7932866457377445, "grad_norm": 0.41437092423439026, "learning_rate": 9.198168018304154e-06, "loss": 0.4397, "step": 1095 }, { "epoch": 0.794011108427916, "grad_norm": 0.35920220613479614, "learning_rate": 9.195876716682331e-06, "loss": 0.4209, "step": 1096 }, { "epoch": 0.7947355711180875, "grad_norm": 0.3815721869468689, "learning_rate": 9.193582432173943e-06, "loss": 0.4153, "step": 1097 }, { "epoch": 0.7954600338082589, "grad_norm": 0.37686556577682495, "learning_rate": 9.191285166410023e-06, "loss": 0.4513, "step": 1098 }, { "epoch": 0.7961844964984304, "grad_norm": 0.35489991307258606, "learning_rate": 9.188984921023712e-06, "loss": 0.4033, "step": 1099 }, { "epoch": 0.7969089591886018, "grad_norm": 0.4480908513069153, "learning_rate": 9.186681697650277e-06, "loss": 0.4652, "step": 1100 }, { "epoch": 0.7976334218787733, "grad_norm": 0.36178743839263916, "learning_rate": 9.184375497927103e-06, "loss": 0.4257, "step": 1101 }, { "epoch": 0.7983578845689447, "grad_norm": 0.3902286887168884, "learning_rate": 9.182066323493683e-06, "loss": 0.4525, "step": 1102 }, { "epoch": 0.7990823472591162, "grad_norm": 0.36827757954597473, "learning_rate": 9.179754175991636e-06, "loss": 0.443, "step": 1103 }, { "epoch": 0.7998068099492877, "grad_norm": 0.4072573482990265, "learning_rate": 9.177439057064684e-06, "loss": 0.4297, "step": 1104 }, { "epoch": 0.8005312726394591, "grad_norm": 0.3875163793563843, "learning_rate": 9.175120968358666e-06, "loss": 0.3965, "step": 1105 }, { "epoch": 0.8012557353296306, "grad_norm": 0.37679600715637207, "learning_rate": 9.172799911521533e-06, "loss": 0.4324, "step": 1106 }, { "epoch": 0.801980198019802, "grad_norm": 0.3907124996185303, "learning_rate": 9.170475888203348e-06, "loss": 0.4334, "step": 1107 }, { "epoch": 0.8027046607099735, "grad_norm": 0.35453158617019653, "learning_rate": 9.168148900056275e-06, "loss": 0.416, "step": 1108 }, { "epoch": 0.8034291234001449, "grad_norm": 0.37232735753059387, "learning_rate": 9.165818948734595e-06, "loss": 0.4614, "step": 1109 }, { "epoch": 0.8041535860903164, "grad_norm": 0.35956913232803345, "learning_rate": 9.16348603589469e-06, "loss": 0.4378, "step": 1110 }, { "epoch": 0.8048780487804879, "grad_norm": 0.3539981245994568, "learning_rate": 9.161150163195047e-06, "loss": 0.4593, "step": 1111 }, { "epoch": 0.8056025114706593, "grad_norm": 0.37305349111557007, "learning_rate": 9.158811332296263e-06, "loss": 0.4346, "step": 1112 }, { "epoch": 0.8063269741608308, "grad_norm": 0.3700515925884247, "learning_rate": 9.15646954486103e-06, "loss": 0.4329, "step": 1113 }, { "epoch": 0.8070514368510022, "grad_norm": 0.33366405963897705, "learning_rate": 9.154124802554148e-06, "loss": 0.4463, "step": 1114 }, { "epoch": 0.8077758995411737, "grad_norm": 0.40708401799201965, "learning_rate": 9.151777107042515e-06, "loss": 0.4245, "step": 1115 }, { "epoch": 0.8085003622313451, "grad_norm": 0.4417791962623596, "learning_rate": 9.149426459995127e-06, "loss": 0.4455, "step": 1116 }, { "epoch": 0.8092248249215166, "grad_norm": 0.36388248205184937, "learning_rate": 9.147072863083082e-06, "loss": 0.4141, "step": 1117 }, { "epoch": 0.809949287611688, "grad_norm": 0.36573684215545654, "learning_rate": 9.144716317979575e-06, "loss": 0.4388, "step": 1118 }, { "epoch": 0.8106737503018595, "grad_norm": 0.37397056818008423, "learning_rate": 9.14235682635989e-06, "loss": 0.4284, "step": 1119 }, { "epoch": 0.811398212992031, "grad_norm": 0.41755741834640503, "learning_rate": 9.139994389901416e-06, "loss": 0.4098, "step": 1120 }, { "epoch": 0.8121226756822024, "grad_norm": 0.3893563151359558, "learning_rate": 9.13762901028363e-06, "loss": 0.4462, "step": 1121 }, { "epoch": 0.8128471383723738, "grad_norm": 0.42045333981513977, "learning_rate": 9.1352606891881e-06, "loss": 0.4397, "step": 1122 }, { "epoch": 0.8135716010625452, "grad_norm": 0.4058089256286621, "learning_rate": 9.132889428298483e-06, "loss": 0.4076, "step": 1123 }, { "epoch": 0.8142960637527167, "grad_norm": 0.40118736028671265, "learning_rate": 9.130515229300538e-06, "loss": 0.4431, "step": 1124 }, { "epoch": 0.8150205264428881, "grad_norm": 0.3891606628894806, "learning_rate": 9.128138093882098e-06, "loss": 0.4294, "step": 1125 }, { "epoch": 0.8157449891330596, "grad_norm": 0.4459226429462433, "learning_rate": 9.125758023733094e-06, "loss": 0.4262, "step": 1126 }, { "epoch": 0.816469451823231, "grad_norm": 0.4369942843914032, "learning_rate": 9.123375020545534e-06, "loss": 0.4384, "step": 1127 }, { "epoch": 0.8171939145134025, "grad_norm": 0.4051201343536377, "learning_rate": 9.120989086013522e-06, "loss": 0.4615, "step": 1128 }, { "epoch": 0.817918377203574, "grad_norm": 0.417635440826416, "learning_rate": 9.118600221833237e-06, "loss": 0.4583, "step": 1129 }, { "epoch": 0.8186428398937454, "grad_norm": 0.4460812211036682, "learning_rate": 9.116208429702948e-06, "loss": 0.4377, "step": 1130 }, { "epoch": 0.8193673025839169, "grad_norm": 0.3786093294620514, "learning_rate": 9.113813711322997e-06, "loss": 0.4303, "step": 1131 }, { "epoch": 0.8200917652740883, "grad_norm": 0.3732582628726959, "learning_rate": 9.111416068395813e-06, "loss": 0.4291, "step": 1132 }, { "epoch": 0.8208162279642598, "grad_norm": 0.402224063873291, "learning_rate": 9.109015502625902e-06, "loss": 0.43, "step": 1133 }, { "epoch": 0.8215406906544312, "grad_norm": 0.3673078119754791, "learning_rate": 9.106612015719845e-06, "loss": 0.4238, "step": 1134 }, { "epoch": 0.8222651533446027, "grad_norm": 0.358090341091156, "learning_rate": 9.104205609386308e-06, "loss": 0.4249, "step": 1135 }, { "epoch": 0.8229896160347742, "grad_norm": 0.43211016058921814, "learning_rate": 9.101796285336022e-06, "loss": 0.4659, "step": 1136 }, { "epoch": 0.8237140787249456, "grad_norm": 0.3589254915714264, "learning_rate": 9.099384045281801e-06, "loss": 0.4527, "step": 1137 }, { "epoch": 0.8244385414151171, "grad_norm": 0.3850201666355133, "learning_rate": 9.096968890938524e-06, "loss": 0.45, "step": 1138 }, { "epoch": 0.8251630041052885, "grad_norm": 0.41675737500190735, "learning_rate": 9.09455082402315e-06, "loss": 0.4627, "step": 1139 }, { "epoch": 0.82588746679546, "grad_norm": 0.3731227517127991, "learning_rate": 9.092129846254703e-06, "loss": 0.4332, "step": 1140 }, { "epoch": 0.8266119294856314, "grad_norm": 0.36045631766319275, "learning_rate": 9.089705959354279e-06, "loss": 0.4476, "step": 1141 }, { "epoch": 0.8273363921758029, "grad_norm": 0.44686636328697205, "learning_rate": 9.087279165045038e-06, "loss": 0.4312, "step": 1142 }, { "epoch": 0.8280608548659744, "grad_norm": 0.36906927824020386, "learning_rate": 9.08484946505221e-06, "loss": 0.4338, "step": 1143 }, { "epoch": 0.8287853175561458, "grad_norm": 0.3876591622829437, "learning_rate": 9.082416861103095e-06, "loss": 0.41, "step": 1144 }, { "epoch": 0.8295097802463173, "grad_norm": 0.3522427976131439, "learning_rate": 9.07998135492705e-06, "loss": 0.4366, "step": 1145 }, { "epoch": 0.8302342429364887, "grad_norm": 0.38508865237236023, "learning_rate": 9.077542948255498e-06, "loss": 0.4288, "step": 1146 }, { "epoch": 0.8309587056266602, "grad_norm": 0.38007476925849915, "learning_rate": 9.075101642821926e-06, "loss": 0.3997, "step": 1147 }, { "epoch": 0.8316831683168316, "grad_norm": 0.3887992799282074, "learning_rate": 9.072657440361878e-06, "loss": 0.4345, "step": 1148 }, { "epoch": 0.8324076310070031, "grad_norm": 0.3822677731513977, "learning_rate": 9.070210342612961e-06, "loss": 0.4387, "step": 1149 }, { "epoch": 0.8331320936971746, "grad_norm": 0.3775729835033417, "learning_rate": 9.067760351314838e-06, "loss": 0.4263, "step": 1150 }, { "epoch": 0.833856556387346, "grad_norm": 0.42536813020706177, "learning_rate": 9.06530746820923e-06, "loss": 0.4391, "step": 1151 }, { "epoch": 0.8345810190775175, "grad_norm": 0.3927188217639923, "learning_rate": 9.062851695039915e-06, "loss": 0.4446, "step": 1152 }, { "epoch": 0.8353054817676889, "grad_norm": 0.3907926082611084, "learning_rate": 9.060393033552723e-06, "loss": 0.4478, "step": 1153 }, { "epoch": 0.8360299444578604, "grad_norm": 0.40085843205451965, "learning_rate": 9.05793148549554e-06, "loss": 0.4253, "step": 1154 }, { "epoch": 0.8367544071480318, "grad_norm": 0.3549349904060364, "learning_rate": 9.055467052618301e-06, "loss": 0.4127, "step": 1155 }, { "epoch": 0.8374788698382033, "grad_norm": 0.415708988904953, "learning_rate": 9.052999736672994e-06, "loss": 0.4361, "step": 1156 }, { "epoch": 0.8382033325283748, "grad_norm": 0.33825936913490295, "learning_rate": 9.050529539413654e-06, "loss": 0.4508, "step": 1157 }, { "epoch": 0.8389277952185462, "grad_norm": 0.37880566716194153, "learning_rate": 9.04805646259637e-06, "loss": 0.4709, "step": 1158 }, { "epoch": 0.8396522579087177, "grad_norm": 0.3746563792228699, "learning_rate": 9.045580507979274e-06, "loss": 0.4077, "step": 1159 }, { "epoch": 0.8403767205988891, "grad_norm": 0.4093169867992401, "learning_rate": 9.043101677322545e-06, "loss": 0.4263, "step": 1160 }, { "epoch": 0.8411011832890606, "grad_norm": 0.3626551330089569, "learning_rate": 9.040619972388402e-06, "loss": 0.4295, "step": 1161 }, { "epoch": 0.841825645979232, "grad_norm": 0.467331200838089, "learning_rate": 9.038135394941117e-06, "loss": 0.4156, "step": 1162 }, { "epoch": 0.8425501086694035, "grad_norm": 0.44259244203567505, "learning_rate": 9.035647946746995e-06, "loss": 0.4472, "step": 1163 }, { "epoch": 0.843274571359575, "grad_norm": 0.3543887436389923, "learning_rate": 9.033157629574385e-06, "loss": 0.4121, "step": 1164 }, { "epoch": 0.8439990340497464, "grad_norm": 0.4613070785999298, "learning_rate": 9.030664445193676e-06, "loss": 0.4428, "step": 1165 }, { "epoch": 0.8447234967399179, "grad_norm": 0.4112345576286316, "learning_rate": 9.028168395377297e-06, "loss": 0.4506, "step": 1166 }, { "epoch": 0.8454479594300893, "grad_norm": 0.416177898645401, "learning_rate": 9.025669481899713e-06, "loss": 0.4329, "step": 1167 }, { "epoch": 0.8461724221202608, "grad_norm": 0.4517548382282257, "learning_rate": 9.023167706537422e-06, "loss": 0.4264, "step": 1168 }, { "epoch": 0.8468968848104322, "grad_norm": 0.35384514927864075, "learning_rate": 9.020663071068958e-06, "loss": 0.4114, "step": 1169 }, { "epoch": 0.8476213475006037, "grad_norm": 0.3764117360115051, "learning_rate": 9.018155577274891e-06, "loss": 0.4381, "step": 1170 }, { "epoch": 0.8483458101907752, "grad_norm": 0.38221117854118347, "learning_rate": 9.015645226937821e-06, "loss": 0.4197, "step": 1171 }, { "epoch": 0.8490702728809466, "grad_norm": 0.4375673234462738, "learning_rate": 9.013132021842377e-06, "loss": 0.4437, "step": 1172 }, { "epoch": 0.8497947355711181, "grad_norm": 0.40190356969833374, "learning_rate": 9.01061596377522e-06, "loss": 0.4632, "step": 1173 }, { "epoch": 0.8505191982612895, "grad_norm": 0.3758179545402527, "learning_rate": 9.008097054525038e-06, "loss": 0.4283, "step": 1174 }, { "epoch": 0.851243660951461, "grad_norm": 0.40486329793930054, "learning_rate": 9.005575295882548e-06, "loss": 0.4495, "step": 1175 }, { "epoch": 0.8519681236416324, "grad_norm": 0.44034072756767273, "learning_rate": 9.003050689640487e-06, "loss": 0.4331, "step": 1176 }, { "epoch": 0.8526925863318039, "grad_norm": 0.39043477177619934, "learning_rate": 9.000523237593624e-06, "loss": 0.4073, "step": 1177 }, { "epoch": 0.8534170490219753, "grad_norm": 0.5083754062652588, "learning_rate": 8.997992941538747e-06, "loss": 0.4435, "step": 1178 }, { "epoch": 0.8541415117121468, "grad_norm": 0.39611995220184326, "learning_rate": 8.995459803274664e-06, "loss": 0.413, "step": 1179 }, { "epoch": 0.8548659744023183, "grad_norm": 0.41626864671707153, "learning_rate": 8.992923824602208e-06, "loss": 0.4284, "step": 1180 }, { "epoch": 0.8555904370924897, "grad_norm": 0.40626421570777893, "learning_rate": 8.990385007324228e-06, "loss": 0.4345, "step": 1181 }, { "epoch": 0.8563148997826612, "grad_norm": 0.38452404737472534, "learning_rate": 8.987843353245594e-06, "loss": 0.4462, "step": 1182 }, { "epoch": 0.8570393624728326, "grad_norm": 0.40148282051086426, "learning_rate": 8.985298864173189e-06, "loss": 0.4319, "step": 1183 }, { "epoch": 0.8577638251630041, "grad_norm": 0.43007805943489075, "learning_rate": 8.982751541915912e-06, "loss": 0.4454, "step": 1184 }, { "epoch": 0.8584882878531755, "grad_norm": 0.450321763753891, "learning_rate": 8.980201388284679e-06, "loss": 0.4524, "step": 1185 }, { "epoch": 0.859212750543347, "grad_norm": 0.4127412438392639, "learning_rate": 8.977648405092418e-06, "loss": 0.4111, "step": 1186 }, { "epoch": 0.8599372132335185, "grad_norm": 0.5283059477806091, "learning_rate": 8.975092594154068e-06, "loss": 0.4506, "step": 1187 }, { "epoch": 0.8606616759236899, "grad_norm": 0.3911735713481903, "learning_rate": 8.972533957286574e-06, "loss": 0.402, "step": 1188 }, { "epoch": 0.8613861386138614, "grad_norm": 0.3382461369037628, "learning_rate": 8.969972496308898e-06, "loss": 0.3957, "step": 1189 }, { "epoch": 0.8621106013040328, "grad_norm": 0.44050973653793335, "learning_rate": 8.967408213042005e-06, "loss": 0.4334, "step": 1190 }, { "epoch": 0.8628350639942043, "grad_norm": 0.39053159952163696, "learning_rate": 8.964841109308866e-06, "loss": 0.3959, "step": 1191 }, { "epoch": 0.8635595266843757, "grad_norm": 0.38239380717277527, "learning_rate": 8.962271186934459e-06, "loss": 0.4251, "step": 1192 }, { "epoch": 0.8642839893745472, "grad_norm": 0.36377817392349243, "learning_rate": 8.959698447745764e-06, "loss": 0.4137, "step": 1193 }, { "epoch": 0.8650084520647187, "grad_norm": 0.3955075442790985, "learning_rate": 8.957122893571766e-06, "loss": 0.4502, "step": 1194 }, { "epoch": 0.8657329147548901, "grad_norm": 0.3771941065788269, "learning_rate": 8.95454452624345e-06, "loss": 0.4293, "step": 1195 }, { "epoch": 0.8664573774450616, "grad_norm": 0.33146560192108154, "learning_rate": 8.951963347593797e-06, "loss": 0.4112, "step": 1196 }, { "epoch": 0.867181840135233, "grad_norm": 0.35591354966163635, "learning_rate": 8.949379359457795e-06, "loss": 0.4264, "step": 1197 }, { "epoch": 0.8679063028254045, "grad_norm": 0.3861212730407715, "learning_rate": 8.94679256367242e-06, "loss": 0.4354, "step": 1198 }, { "epoch": 0.8686307655155759, "grad_norm": 0.36173397302627563, "learning_rate": 8.94420296207665e-06, "loss": 0.4413, "step": 1199 }, { "epoch": 0.8693552282057474, "grad_norm": 0.40658220648765564, "learning_rate": 8.941610556511458e-06, "loss": 0.4349, "step": 1200 }, { "epoch": 0.8700796908959189, "grad_norm": 0.37256452441215515, "learning_rate": 8.939015348819809e-06, "loss": 0.4321, "step": 1201 }, { "epoch": 0.8708041535860903, "grad_norm": 0.35852882266044617, "learning_rate": 8.936417340846655e-06, "loss": 0.4294, "step": 1202 }, { "epoch": 0.8715286162762618, "grad_norm": 0.3688584864139557, "learning_rate": 8.933816534438946e-06, "loss": 0.4349, "step": 1203 }, { "epoch": 0.8722530789664332, "grad_norm": 0.3805645704269409, "learning_rate": 8.931212931445618e-06, "loss": 0.3872, "step": 1204 }, { "epoch": 0.8729775416566047, "grad_norm": 0.3627467453479767, "learning_rate": 8.928606533717597e-06, "loss": 0.4382, "step": 1205 }, { "epoch": 0.8737020043467761, "grad_norm": 0.3667280972003937, "learning_rate": 8.925997343107796e-06, "loss": 0.4285, "step": 1206 }, { "epoch": 0.8744264670369476, "grad_norm": 0.41676947474479675, "learning_rate": 8.923385361471106e-06, "loss": 0.4649, "step": 1207 }, { "epoch": 0.875150929727119, "grad_norm": 0.3603455722332001, "learning_rate": 8.920770590664416e-06, "loss": 0.4187, "step": 1208 }, { "epoch": 0.8758753924172905, "grad_norm": 0.38049617409706116, "learning_rate": 8.918153032546586e-06, "loss": 0.4234, "step": 1209 }, { "epoch": 0.876599855107462, "grad_norm": 0.373248428106308, "learning_rate": 8.915532688978462e-06, "loss": 0.4192, "step": 1210 }, { "epoch": 0.8773243177976334, "grad_norm": 0.37912118434906006, "learning_rate": 8.91290956182287e-06, "loss": 0.4182, "step": 1211 }, { "epoch": 0.8780487804878049, "grad_norm": 0.44167736172676086, "learning_rate": 8.910283652944614e-06, "loss": 0.4543, "step": 1212 }, { "epoch": 0.8787732431779763, "grad_norm": 0.3972494602203369, "learning_rate": 8.907654964210476e-06, "loss": 0.4471, "step": 1213 }, { "epoch": 0.8794977058681478, "grad_norm": 0.3778550326824188, "learning_rate": 8.905023497489218e-06, "loss": 0.4432, "step": 1214 }, { "epoch": 0.8802221685583193, "grad_norm": 0.36695626378059387, "learning_rate": 8.902389254651568e-06, "loss": 0.4483, "step": 1215 }, { "epoch": 0.8809466312484907, "grad_norm": 0.40104225277900696, "learning_rate": 8.899752237570239e-06, "loss": 0.4221, "step": 1216 }, { "epoch": 0.8816710939386622, "grad_norm": 0.3297346234321594, "learning_rate": 8.897112448119905e-06, "loss": 0.44, "step": 1217 }, { "epoch": 0.8823955566288336, "grad_norm": 0.4006214439868927, "learning_rate": 8.89446988817722e-06, "loss": 0.4441, "step": 1218 }, { "epoch": 0.8831200193190051, "grad_norm": 0.3811209499835968, "learning_rate": 8.891824559620801e-06, "loss": 0.4429, "step": 1219 }, { "epoch": 0.8838444820091765, "grad_norm": 0.3507653772830963, "learning_rate": 8.889176464331238e-06, "loss": 0.4331, "step": 1220 }, { "epoch": 0.884568944699348, "grad_norm": 0.4101492464542389, "learning_rate": 8.886525604191084e-06, "loss": 0.4043, "step": 1221 }, { "epoch": 0.8852934073895194, "grad_norm": 0.4239862859249115, "learning_rate": 8.883871981084858e-06, "loss": 0.4481, "step": 1222 }, { "epoch": 0.8860178700796909, "grad_norm": 0.3806942403316498, "learning_rate": 8.881215596899048e-06, "loss": 0.4317, "step": 1223 }, { "epoch": 0.8867423327698624, "grad_norm": 0.41835492849349976, "learning_rate": 8.8785564535221e-06, "loss": 0.4216, "step": 1224 }, { "epoch": 0.8874667954600338, "grad_norm": 0.3809407353401184, "learning_rate": 8.875894552844423e-06, "loss": 0.4236, "step": 1225 }, { "epoch": 0.8881912581502053, "grad_norm": 0.39243876934051514, "learning_rate": 8.873229896758387e-06, "loss": 0.4411, "step": 1226 }, { "epoch": 0.8889157208403767, "grad_norm": 0.39241188764572144, "learning_rate": 8.870562487158317e-06, "loss": 0.4402, "step": 1227 }, { "epoch": 0.8896401835305482, "grad_norm": 0.325573205947876, "learning_rate": 8.867892325940502e-06, "loss": 0.4163, "step": 1228 }, { "epoch": 0.8903646462207196, "grad_norm": 0.3867679536342621, "learning_rate": 8.865219415003184e-06, "loss": 0.4109, "step": 1229 }, { "epoch": 0.8910891089108911, "grad_norm": 0.3837641775608063, "learning_rate": 8.862543756246556e-06, "loss": 0.4182, "step": 1230 }, { "epoch": 0.8918135716010626, "grad_norm": 0.3772681653499603, "learning_rate": 8.85986535157277e-06, "loss": 0.442, "step": 1231 }, { "epoch": 0.892538034291234, "grad_norm": 0.4043648838996887, "learning_rate": 8.857184202885926e-06, "loss": 0.4356, "step": 1232 }, { "epoch": 0.8932624969814055, "grad_norm": 0.41124922037124634, "learning_rate": 8.854500312092081e-06, "loss": 0.4297, "step": 1233 }, { "epoch": 0.8939869596715769, "grad_norm": 0.36563578248023987, "learning_rate": 8.851813681099235e-06, "loss": 0.4533, "step": 1234 }, { "epoch": 0.8947114223617484, "grad_norm": 0.41038835048675537, "learning_rate": 8.849124311817336e-06, "loss": 0.4425, "step": 1235 }, { "epoch": 0.8954358850519198, "grad_norm": 0.4046548902988434, "learning_rate": 8.846432206158285e-06, "loss": 0.4418, "step": 1236 }, { "epoch": 0.8961603477420913, "grad_norm": 0.38206344842910767, "learning_rate": 8.843737366035918e-06, "loss": 0.4346, "step": 1237 }, { "epoch": 0.8968848104322628, "grad_norm": 0.4169843792915344, "learning_rate": 8.841039793366028e-06, "loss": 0.4231, "step": 1238 }, { "epoch": 0.8976092731224342, "grad_norm": 0.37566038966178894, "learning_rate": 8.838339490066341e-06, "loss": 0.4409, "step": 1239 }, { "epoch": 0.8983337358126057, "grad_norm": 0.3802827000617981, "learning_rate": 8.835636458056526e-06, "loss": 0.4146, "step": 1240 }, { "epoch": 0.8990581985027771, "grad_norm": 0.416149377822876, "learning_rate": 8.832930699258194e-06, "loss": 0.4233, "step": 1241 }, { "epoch": 0.8997826611929486, "grad_norm": 0.3738349974155426, "learning_rate": 8.83022221559489e-06, "loss": 0.4111, "step": 1242 }, { "epoch": 0.90050712388312, "grad_norm": 0.4088447690010071, "learning_rate": 8.827511008992105e-06, "loss": 0.4413, "step": 1243 }, { "epoch": 0.9012315865732915, "grad_norm": 0.40232372283935547, "learning_rate": 8.824797081377256e-06, "loss": 0.4424, "step": 1244 }, { "epoch": 0.901956049263463, "grad_norm": 0.3383753299713135, "learning_rate": 8.822080434679702e-06, "loss": 0.438, "step": 1245 }, { "epoch": 0.9026805119536344, "grad_norm": 0.42672550678253174, "learning_rate": 8.819361070830727e-06, "loss": 0.4525, "step": 1246 }, { "epoch": 0.9034049746438059, "grad_norm": 0.3872290551662445, "learning_rate": 8.816638991763558e-06, "loss": 0.4506, "step": 1247 }, { "epoch": 0.9041294373339773, "grad_norm": 0.3589775860309601, "learning_rate": 8.813914199413338e-06, "loss": 0.443, "step": 1248 }, { "epoch": 0.9048539000241488, "grad_norm": 0.37122416496276855, "learning_rate": 8.81118669571715e-06, "loss": 0.4231, "step": 1249 }, { "epoch": 0.9055783627143202, "grad_norm": 0.3604592978954315, "learning_rate": 8.808456482614001e-06, "loss": 0.4297, "step": 1250 }, { "epoch": 0.9063028254044917, "grad_norm": 0.3449357748031616, "learning_rate": 8.805723562044825e-06, "loss": 0.4281, "step": 1251 }, { "epoch": 0.9070272880946632, "grad_norm": 0.3671077787876129, "learning_rate": 8.802987935952479e-06, "loss": 0.4256, "step": 1252 }, { "epoch": 0.9077517507848346, "grad_norm": 0.3688286542892456, "learning_rate": 8.800249606281742e-06, "loss": 0.3992, "step": 1253 }, { "epoch": 0.9084762134750061, "grad_norm": 0.39646458625793457, "learning_rate": 8.79750857497932e-06, "loss": 0.4537, "step": 1254 }, { "epoch": 0.9092006761651775, "grad_norm": 0.3557892143726349, "learning_rate": 8.794764843993838e-06, "loss": 0.4222, "step": 1255 }, { "epoch": 0.909925138855349, "grad_norm": 0.3874111473560333, "learning_rate": 8.792018415275833e-06, "loss": 0.4194, "step": 1256 }, { "epoch": 0.9106496015455204, "grad_norm": 0.39340001344680786, "learning_rate": 8.789269290777774e-06, "loss": 0.4456, "step": 1257 }, { "epoch": 0.9113740642356919, "grad_norm": 0.39244401454925537, "learning_rate": 8.786517472454032e-06, "loss": 0.4405, "step": 1258 }, { "epoch": 0.9120985269258634, "grad_norm": 0.4325498342514038, "learning_rate": 8.783762962260904e-06, "loss": 0.4496, "step": 1259 }, { "epoch": 0.9128229896160348, "grad_norm": 0.4167627990245819, "learning_rate": 8.781005762156593e-06, "loss": 0.4545, "step": 1260 }, { "epoch": 0.9135474523062063, "grad_norm": 0.42066407203674316, "learning_rate": 8.778245874101218e-06, "loss": 0.424, "step": 1261 }, { "epoch": 0.9142719149963777, "grad_norm": 0.41910719871520996, "learning_rate": 8.77548330005681e-06, "loss": 0.443, "step": 1262 }, { "epoch": 0.9149963776865492, "grad_norm": 0.4005095660686493, "learning_rate": 8.772718041987305e-06, "loss": 0.4637, "step": 1263 }, { "epoch": 0.9157208403767206, "grad_norm": 0.35994309186935425, "learning_rate": 8.769950101858553e-06, "loss": 0.4081, "step": 1264 }, { "epoch": 0.9164453030668921, "grad_norm": 0.36667314171791077, "learning_rate": 8.767179481638303e-06, "loss": 0.4251, "step": 1265 }, { "epoch": 0.9171697657570635, "grad_norm": 0.43970558047294617, "learning_rate": 8.76440618329622e-06, "loss": 0.4023, "step": 1266 }, { "epoch": 0.917894228447235, "grad_norm": 0.39412036538124084, "learning_rate": 8.761630208803863e-06, "loss": 0.4316, "step": 1267 }, { "epoch": 0.9186186911374065, "grad_norm": 0.38118448853492737, "learning_rate": 8.758851560134697e-06, "loss": 0.4404, "step": 1268 }, { "epoch": 0.9193431538275779, "grad_norm": 0.39033666253089905, "learning_rate": 8.756070239264089e-06, "loss": 0.4355, "step": 1269 }, { "epoch": 0.9200676165177494, "grad_norm": 0.4023302495479584, "learning_rate": 8.753286248169306e-06, "loss": 0.4431, "step": 1270 }, { "epoch": 0.9207920792079208, "grad_norm": 0.40738043189048767, "learning_rate": 8.750499588829511e-06, "loss": 0.4412, "step": 1271 }, { "epoch": 0.9215165418980923, "grad_norm": 0.359467089176178, "learning_rate": 8.747710263225768e-06, "loss": 0.4319, "step": 1272 }, { "epoch": 0.9222410045882637, "grad_norm": 0.35331687331199646, "learning_rate": 8.74491827334103e-06, "loss": 0.4067, "step": 1273 }, { "epoch": 0.9229654672784352, "grad_norm": 0.4354839622974396, "learning_rate": 8.742123621160148e-06, "loss": 0.4534, "step": 1274 }, { "epoch": 0.9236899299686067, "grad_norm": 0.3943859338760376, "learning_rate": 8.739326308669869e-06, "loss": 0.4111, "step": 1275 }, { "epoch": 0.9244143926587781, "grad_norm": 0.34512433409690857, "learning_rate": 8.736526337858823e-06, "loss": 0.3978, "step": 1276 }, { "epoch": 0.9251388553489496, "grad_norm": 0.4457188844680786, "learning_rate": 8.733723710717542e-06, "loss": 0.427, "step": 1277 }, { "epoch": 0.925863318039121, "grad_norm": 0.4003567099571228, "learning_rate": 8.730918429238429e-06, "loss": 0.4498, "step": 1278 }, { "epoch": 0.9265877807292925, "grad_norm": 0.3628998398780823, "learning_rate": 8.728110495415789e-06, "loss": 0.4547, "step": 1279 }, { "epoch": 0.9273122434194639, "grad_norm": 0.3582828938961029, "learning_rate": 8.725299911245808e-06, "loss": 0.4344, "step": 1280 }, { "epoch": 0.9280367061096354, "grad_norm": 0.3738276958465576, "learning_rate": 8.722486678726555e-06, "loss": 0.4432, "step": 1281 }, { "epoch": 0.9287611687998069, "grad_norm": 0.3691905736923218, "learning_rate": 8.719670799857983e-06, "loss": 0.431, "step": 1282 }, { "epoch": 0.9294856314899783, "grad_norm": 0.3500174582004547, "learning_rate": 8.716852276641923e-06, "loss": 0.4166, "step": 1283 }, { "epoch": 0.9302100941801498, "grad_norm": 0.4153335392475128, "learning_rate": 8.71403111108209e-06, "loss": 0.4353, "step": 1284 }, { "epoch": 0.9309345568703212, "grad_norm": 0.38806235790252686, "learning_rate": 8.711207305184078e-06, "loss": 0.4359, "step": 1285 }, { "epoch": 0.9316590195604927, "grad_norm": 0.4090997874736786, "learning_rate": 8.708380860955355e-06, "loss": 0.4311, "step": 1286 }, { "epoch": 0.9323834822506641, "grad_norm": 0.4335585832595825, "learning_rate": 8.705551780405264e-06, "loss": 0.4125, "step": 1287 }, { "epoch": 0.9331079449408356, "grad_norm": 0.3740835189819336, "learning_rate": 8.702720065545024e-06, "loss": 0.4019, "step": 1288 }, { "epoch": 0.933832407631007, "grad_norm": 0.41969966888427734, "learning_rate": 8.699885718387728e-06, "loss": 0.4496, "step": 1289 }, { "epoch": 0.9345568703211785, "grad_norm": 0.39320340752601624, "learning_rate": 8.697048740948338e-06, "loss": 0.4403, "step": 1290 }, { "epoch": 0.93528133301135, "grad_norm": 0.364958256483078, "learning_rate": 8.69420913524369e-06, "loss": 0.4402, "step": 1291 }, { "epoch": 0.9360057957015214, "grad_norm": 0.39708906412124634, "learning_rate": 8.691366903292479e-06, "loss": 0.429, "step": 1292 }, { "epoch": 0.9367302583916929, "grad_norm": 0.3983142077922821, "learning_rate": 8.688522047115278e-06, "loss": 0.4094, "step": 1293 }, { "epoch": 0.9374547210818643, "grad_norm": 0.39288681745529175, "learning_rate": 8.68567456873452e-06, "loss": 0.4176, "step": 1294 }, { "epoch": 0.9381791837720357, "grad_norm": 0.34443679451942444, "learning_rate": 8.682824470174502e-06, "loss": 0.411, "step": 1295 }, { "epoch": 0.9389036464622071, "grad_norm": 0.32373788952827454, "learning_rate": 8.679971753461388e-06, "loss": 0.4242, "step": 1296 }, { "epoch": 0.9396281091523786, "grad_norm": 0.39674094319343567, "learning_rate": 8.677116420623194e-06, "loss": 0.4108, "step": 1297 }, { "epoch": 0.9403525718425501, "grad_norm": 0.44780972599983215, "learning_rate": 8.674258473689807e-06, "loss": 0.4191, "step": 1298 }, { "epoch": 0.9410770345327215, "grad_norm": 0.3581278622150421, "learning_rate": 8.671397914692966e-06, "loss": 0.4295, "step": 1299 }, { "epoch": 0.941801497222893, "grad_norm": 0.40562325716018677, "learning_rate": 8.668534745666268e-06, "loss": 0.4365, "step": 1300 }, { "epoch": 0.9425259599130644, "grad_norm": 0.39394158124923706, "learning_rate": 8.665668968645164e-06, "loss": 0.421, "step": 1301 }, { "epoch": 0.9432504226032359, "grad_norm": 0.34343916177749634, "learning_rate": 8.662800585666963e-06, "loss": 0.4109, "step": 1302 }, { "epoch": 0.9439748852934073, "grad_norm": 0.3787355422973633, "learning_rate": 8.659929598770824e-06, "loss": 0.4276, "step": 1303 }, { "epoch": 0.9446993479835788, "grad_norm": 0.37198910117149353, "learning_rate": 8.657056009997757e-06, "loss": 0.4482, "step": 1304 }, { "epoch": 0.9454238106737503, "grad_norm": 0.39556264877319336, "learning_rate": 8.65417982139062e-06, "loss": 0.4431, "step": 1305 }, { "epoch": 0.9461482733639217, "grad_norm": 0.3512415289878845, "learning_rate": 8.651301034994127e-06, "loss": 0.4073, "step": 1306 }, { "epoch": 0.9468727360540932, "grad_norm": 0.3586278557777405, "learning_rate": 8.648419652854827e-06, "loss": 0.4533, "step": 1307 }, { "epoch": 0.9475971987442646, "grad_norm": 0.37773627042770386, "learning_rate": 8.645535677021124e-06, "loss": 0.4359, "step": 1308 }, { "epoch": 0.9483216614344361, "grad_norm": 0.3669639229774475, "learning_rate": 8.64264910954326e-06, "loss": 0.4369, "step": 1309 }, { "epoch": 0.9490461241246075, "grad_norm": 0.37393152713775635, "learning_rate": 8.639759952473324e-06, "loss": 0.4543, "step": 1310 }, { "epoch": 0.949770586814779, "grad_norm": 0.43638938665390015, "learning_rate": 8.636868207865244e-06, "loss": 0.4446, "step": 1311 }, { "epoch": 0.9504950495049505, "grad_norm": 0.3574516475200653, "learning_rate": 8.633973877774787e-06, "loss": 0.4481, "step": 1312 }, { "epoch": 0.9512195121951219, "grad_norm": 0.3822166621685028, "learning_rate": 8.631076964259556e-06, "loss": 0.4217, "step": 1313 }, { "epoch": 0.9519439748852934, "grad_norm": 0.38383784890174866, "learning_rate": 8.628177469378995e-06, "loss": 0.4228, "step": 1314 }, { "epoch": 0.9526684375754648, "grad_norm": 0.3422839641571045, "learning_rate": 8.625275395194382e-06, "loss": 0.4284, "step": 1315 }, { "epoch": 0.9533929002656363, "grad_norm": 0.3870636224746704, "learning_rate": 8.622370743768826e-06, "loss": 0.4414, "step": 1316 }, { "epoch": 0.9541173629558077, "grad_norm": 0.38176271319389343, "learning_rate": 8.61946351716727e-06, "loss": 0.4172, "step": 1317 }, { "epoch": 0.9548418256459792, "grad_norm": 0.4657913148403168, "learning_rate": 8.616553717456488e-06, "loss": 0.4289, "step": 1318 }, { "epoch": 0.9555662883361506, "grad_norm": 0.4177466928958893, "learning_rate": 8.613641346705084e-06, "loss": 0.4376, "step": 1319 }, { "epoch": 0.9562907510263221, "grad_norm": 0.38769084215164185, "learning_rate": 8.610726406983485e-06, "loss": 0.3993, "step": 1320 }, { "epoch": 0.9570152137164936, "grad_norm": 0.4868411421775818, "learning_rate": 8.607808900363952e-06, "loss": 0.4317, "step": 1321 }, { "epoch": 0.957739676406665, "grad_norm": 0.39994314312934875, "learning_rate": 8.604888828920564e-06, "loss": 0.4202, "step": 1322 }, { "epoch": 0.9584641390968365, "grad_norm": 0.388860285282135, "learning_rate": 8.601966194729228e-06, "loss": 0.3992, "step": 1323 }, { "epoch": 0.9591886017870079, "grad_norm": 0.4033240079879761, "learning_rate": 8.599040999867669e-06, "loss": 0.4118, "step": 1324 }, { "epoch": 0.9599130644771794, "grad_norm": 0.42254436016082764, "learning_rate": 8.596113246415435e-06, "loss": 0.4267, "step": 1325 }, { "epoch": 0.9606375271673508, "grad_norm": 0.3677334487438202, "learning_rate": 8.593182936453894e-06, "loss": 0.4177, "step": 1326 }, { "epoch": 0.9613619898575223, "grad_norm": 0.4043729305267334, "learning_rate": 8.590250072066229e-06, "loss": 0.4203, "step": 1327 }, { "epoch": 0.9620864525476938, "grad_norm": 0.4147138297557831, "learning_rate": 8.587314655337438e-06, "loss": 0.4324, "step": 1328 }, { "epoch": 0.9628109152378652, "grad_norm": 0.5020073056221008, "learning_rate": 8.584376688354336e-06, "loss": 0.4573, "step": 1329 }, { "epoch": 0.9635353779280367, "grad_norm": 0.39574483036994934, "learning_rate": 8.581436173205552e-06, "loss": 0.4315, "step": 1330 }, { "epoch": 0.9642598406182081, "grad_norm": 0.389879435300827, "learning_rate": 8.578493111981523e-06, "loss": 0.4141, "step": 1331 }, { "epoch": 0.9649843033083796, "grad_norm": 0.40143486857414246, "learning_rate": 8.575547506774498e-06, "loss": 0.4299, "step": 1332 }, { "epoch": 0.965708765998551, "grad_norm": 0.4404750466346741, "learning_rate": 8.572599359678533e-06, "loss": 0.4308, "step": 1333 }, { "epoch": 0.9664332286887225, "grad_norm": 0.4074755012989044, "learning_rate": 8.569648672789496e-06, "loss": 0.408, "step": 1334 }, { "epoch": 0.967157691378894, "grad_norm": 0.5102840662002563, "learning_rate": 8.566695448205053e-06, "loss": 0.4439, "step": 1335 }, { "epoch": 0.9678821540690654, "grad_norm": 0.36906397342681885, "learning_rate": 8.56373968802468e-06, "loss": 0.416, "step": 1336 }, { "epoch": 0.9686066167592369, "grad_norm": 0.5405845046043396, "learning_rate": 8.560781394349654e-06, "loss": 0.4467, "step": 1337 }, { "epoch": 0.9693310794494083, "grad_norm": 0.3900741636753082, "learning_rate": 8.55782056928305e-06, "loss": 0.4335, "step": 1338 }, { "epoch": 0.9700555421395798, "grad_norm": 0.39347872138023376, "learning_rate": 8.554857214929749e-06, "loss": 0.4048, "step": 1339 }, { "epoch": 0.9707800048297512, "grad_norm": 0.4399970769882202, "learning_rate": 8.551891333396422e-06, "loss": 0.4409, "step": 1340 }, { "epoch": 0.9715044675199227, "grad_norm": 0.3565783202648163, "learning_rate": 8.548922926791545e-06, "loss": 0.4303, "step": 1341 }, { "epoch": 0.9722289302100942, "grad_norm": 0.38588187098503113, "learning_rate": 8.545951997225384e-06, "loss": 0.4369, "step": 1342 }, { "epoch": 0.9729533929002656, "grad_norm": 0.36795949935913086, "learning_rate": 8.542978546809997e-06, "loss": 0.4224, "step": 1343 }, { "epoch": 0.9736778555904371, "grad_norm": 0.3548993170261383, "learning_rate": 8.540002577659238e-06, "loss": 0.4284, "step": 1344 }, { "epoch": 0.9744023182806085, "grad_norm": 0.40933847427368164, "learning_rate": 8.537024091888752e-06, "loss": 0.4284, "step": 1345 }, { "epoch": 0.97512678097078, "grad_norm": 0.33137449622154236, "learning_rate": 8.53404309161597e-06, "loss": 0.4078, "step": 1346 }, { "epoch": 0.9758512436609514, "grad_norm": 0.3717988133430481, "learning_rate": 8.531059578960111e-06, "loss": 0.4089, "step": 1347 }, { "epoch": 0.9765757063511229, "grad_norm": 0.41303932666778564, "learning_rate": 8.528073556042183e-06, "loss": 0.4275, "step": 1348 }, { "epoch": 0.9773001690412944, "grad_norm": 0.3549620509147644, "learning_rate": 8.525085024984976e-06, "loss": 0.4388, "step": 1349 }, { "epoch": 0.9780246317314658, "grad_norm": 0.4022923409938812, "learning_rate": 8.522093987913063e-06, "loss": 0.4418, "step": 1350 }, { "epoch": 0.9787490944216373, "grad_norm": 0.4086378514766693, "learning_rate": 8.519100446952799e-06, "loss": 0.4162, "step": 1351 }, { "epoch": 0.9794735571118087, "grad_norm": 0.38655248284339905, "learning_rate": 8.516104404232319e-06, "loss": 0.4436, "step": 1352 }, { "epoch": 0.9801980198019802, "grad_norm": 0.423133909702301, "learning_rate": 8.51310586188154e-06, "loss": 0.4463, "step": 1353 }, { "epoch": 0.9809224824921516, "grad_norm": 0.377841979265213, "learning_rate": 8.510104822032149e-06, "loss": 0.4413, "step": 1354 }, { "epoch": 0.9816469451823231, "grad_norm": 0.38987618684768677, "learning_rate": 8.507101286817614e-06, "loss": 0.4336, "step": 1355 }, { "epoch": 0.9823714078724946, "grad_norm": 0.4263487458229065, "learning_rate": 8.504095258373174e-06, "loss": 0.4532, "step": 1356 }, { "epoch": 0.983095870562666, "grad_norm": 0.42380285263061523, "learning_rate": 8.501086738835843e-06, "loss": 0.4419, "step": 1357 }, { "epoch": 0.9838203332528375, "grad_norm": 0.3457951247692108, "learning_rate": 8.498075730344403e-06, "loss": 0.4225, "step": 1358 }, { "epoch": 0.9845447959430089, "grad_norm": 0.3673158884048462, "learning_rate": 8.49506223503941e-06, "loss": 0.4099, "step": 1359 }, { "epoch": 0.9852692586331804, "grad_norm": 0.3521679937839508, "learning_rate": 8.492046255063184e-06, "loss": 0.4011, "step": 1360 }, { "epoch": 0.9859937213233518, "grad_norm": 0.35046789050102234, "learning_rate": 8.48902779255981e-06, "loss": 0.4007, "step": 1361 }, { "epoch": 0.9867181840135233, "grad_norm": 0.3312092423439026, "learning_rate": 8.486006849675142e-06, "loss": 0.4157, "step": 1362 }, { "epoch": 0.9874426467036947, "grad_norm": 0.348123162984848, "learning_rate": 8.482983428556794e-06, "loss": 0.4347, "step": 1363 }, { "epoch": 0.9881671093938662, "grad_norm": 0.35120341181755066, "learning_rate": 8.479957531354148e-06, "loss": 0.4453, "step": 1364 }, { "epoch": 0.9888915720840377, "grad_norm": 0.3326391279697418, "learning_rate": 8.476929160218336e-06, "loss": 0.4071, "step": 1365 }, { "epoch": 0.9896160347742091, "grad_norm": 0.4143807590007782, "learning_rate": 8.473898317302255e-06, "loss": 0.4155, "step": 1366 }, { "epoch": 0.9903404974643806, "grad_norm": 0.39165422320365906, "learning_rate": 8.470865004760562e-06, "loss": 0.4327, "step": 1367 }, { "epoch": 0.991064960154552, "grad_norm": 0.38387271761894226, "learning_rate": 8.467829224749665e-06, "loss": 0.4245, "step": 1368 }, { "epoch": 0.9917894228447235, "grad_norm": 0.3663344383239746, "learning_rate": 8.464790979427726e-06, "loss": 0.4105, "step": 1369 }, { "epoch": 0.992513885534895, "grad_norm": 0.3891139030456543, "learning_rate": 8.461750270954661e-06, "loss": 0.4553, "step": 1370 }, { "epoch": 0.9932383482250664, "grad_norm": 0.36450666189193726, "learning_rate": 8.458707101492138e-06, "loss": 0.4264, "step": 1371 }, { "epoch": 0.9939628109152379, "grad_norm": 0.37403005361557007, "learning_rate": 8.455661473203573e-06, "loss": 0.4149, "step": 1372 }, { "epoch": 0.9946872736054093, "grad_norm": 0.38800257444381714, "learning_rate": 8.452613388254131e-06, "loss": 0.4459, "step": 1373 }, { "epoch": 0.9954117362955808, "grad_norm": 0.36400744318962097, "learning_rate": 8.449562848810721e-06, "loss": 0.4178, "step": 1374 }, { "epoch": 0.9961361989857522, "grad_norm": 0.39685407280921936, "learning_rate": 8.446509857042004e-06, "loss": 0.3937, "step": 1375 }, { "epoch": 0.9968606616759237, "grad_norm": 0.37514859437942505, "learning_rate": 8.443454415118375e-06, "loss": 0.4114, "step": 1376 }, { "epoch": 0.9975851243660951, "grad_norm": 0.37398236989974976, "learning_rate": 8.440396525211976e-06, "loss": 0.428, "step": 1377 }, { "epoch": 0.9983095870562666, "grad_norm": 0.42171934247016907, "learning_rate": 8.43733618949669e-06, "loss": 0.4219, "step": 1378 }, { "epoch": 0.9990340497464381, "grad_norm": 0.3772500157356262, "learning_rate": 8.434273410148133e-06, "loss": 0.4445, "step": 1379 }, { "epoch": 0.9997585124366095, "grad_norm": 0.37559425830841064, "learning_rate": 8.43120818934367e-06, "loss": 0.4131, "step": 1380 }, { "epoch": 1.000482975126781, "grad_norm": 0.6928520202636719, "learning_rate": 8.428140529262387e-06, "loss": 0.6401, "step": 1381 }, { "epoch": 1.0012074378169524, "grad_norm": 0.4297943115234375, "learning_rate": 8.425070432085114e-06, "loss": 0.3849, "step": 1382 }, { "epoch": 1.0019319005071239, "grad_norm": 0.45530402660369873, "learning_rate": 8.42199789999441e-06, "loss": 0.4054, "step": 1383 }, { "epoch": 1.0026563631972953, "grad_norm": 0.4143877327442169, "learning_rate": 8.418922935174567e-06, "loss": 0.4364, "step": 1384 }, { "epoch": 1.0033808258874668, "grad_norm": 0.40395596623420715, "learning_rate": 8.415845539811605e-06, "loss": 0.4011, "step": 1385 }, { "epoch": 1.0041052885776383, "grad_norm": 0.38684433698654175, "learning_rate": 8.412765716093273e-06, "loss": 0.4307, "step": 1386 }, { "epoch": 1.0048297512678097, "grad_norm": 0.3802293837070465, "learning_rate": 8.40968346620904e-06, "loss": 0.3758, "step": 1387 }, { "epoch": 1.0055542139579812, "grad_norm": 0.3749120533466339, "learning_rate": 8.406598792350113e-06, "loss": 0.4308, "step": 1388 }, { "epoch": 1.0062786766481526, "grad_norm": 0.36624032258987427, "learning_rate": 8.403511696709407e-06, "loss": 0.4059, "step": 1389 }, { "epoch": 1.007003139338324, "grad_norm": 0.3908820152282715, "learning_rate": 8.400422181481573e-06, "loss": 0.4011, "step": 1390 }, { "epoch": 1.0077276020284955, "grad_norm": 0.37490344047546387, "learning_rate": 8.397330248862971e-06, "loss": 0.4032, "step": 1391 }, { "epoch": 1.008452064718667, "grad_norm": 0.37148094177246094, "learning_rate": 8.394235901051687e-06, "loss": 0.4307, "step": 1392 }, { "epoch": 1.0091765274088385, "grad_norm": 0.39189112186431885, "learning_rate": 8.391139140247516e-06, "loss": 0.3921, "step": 1393 }, { "epoch": 1.00990099009901, "grad_norm": 0.3802763521671295, "learning_rate": 8.388039968651976e-06, "loss": 0.3995, "step": 1394 }, { "epoch": 1.0106254527891814, "grad_norm": 0.4046536087989807, "learning_rate": 8.384938388468296e-06, "loss": 0.3917, "step": 1395 }, { "epoch": 1.0113499154793528, "grad_norm": 0.37834370136260986, "learning_rate": 8.381834401901416e-06, "loss": 0.3682, "step": 1396 }, { "epoch": 1.0120743781695243, "grad_norm": 0.41191565990448, "learning_rate": 8.378728011157989e-06, "loss": 0.4224, "step": 1397 }, { "epoch": 1.0127988408596957, "grad_norm": 0.39186108112335205, "learning_rate": 8.375619218446377e-06, "loss": 0.404, "step": 1398 }, { "epoch": 1.0135233035498672, "grad_norm": 0.3898712396621704, "learning_rate": 8.372508025976649e-06, "loss": 0.4249, "step": 1399 }, { "epoch": 1.0142477662400387, "grad_norm": 0.422443687915802, "learning_rate": 8.369394435960575e-06, "loss": 0.4287, "step": 1400 }, { "epoch": 1.01497222893021, "grad_norm": 0.350200891494751, "learning_rate": 8.366278450611643e-06, "loss": 0.3502, "step": 1401 }, { "epoch": 1.0156966916203816, "grad_norm": 0.48364749550819397, "learning_rate": 8.363160072145025e-06, "loss": 0.4211, "step": 1402 }, { "epoch": 1.016421154310553, "grad_norm": 0.3576699197292328, "learning_rate": 8.360039302777614e-06, "loss": 0.4114, "step": 1403 }, { "epoch": 1.0171456170007245, "grad_norm": 0.3922165334224701, "learning_rate": 8.356916144727985e-06, "loss": 0.3954, "step": 1404 }, { "epoch": 1.017870079690896, "grad_norm": 0.37130671739578247, "learning_rate": 8.353790600216422e-06, "loss": 0.3912, "step": 1405 }, { "epoch": 1.0185945423810674, "grad_norm": 0.4276247024536133, "learning_rate": 8.350662671464907e-06, "loss": 0.4464, "step": 1406 }, { "epoch": 1.0193190050712388, "grad_norm": 0.3577728569507599, "learning_rate": 8.347532360697105e-06, "loss": 0.3869, "step": 1407 }, { "epoch": 1.0200434677614103, "grad_norm": 0.3654215931892395, "learning_rate": 8.344399670138388e-06, "loss": 0.3942, "step": 1408 }, { "epoch": 1.0207679304515818, "grad_norm": 0.4067346453666687, "learning_rate": 8.341264602015813e-06, "loss": 0.422, "step": 1409 }, { "epoch": 1.0214923931417532, "grad_norm": 0.3676408529281616, "learning_rate": 8.338127158558126e-06, "loss": 0.404, "step": 1410 }, { "epoch": 1.0222168558319247, "grad_norm": 0.34398534893989563, "learning_rate": 8.334987341995766e-06, "loss": 0.3787, "step": 1411 }, { "epoch": 1.0229413185220961, "grad_norm": 0.4011276066303253, "learning_rate": 8.331845154560855e-06, "loss": 0.3887, "step": 1412 }, { "epoch": 1.0236657812122676, "grad_norm": 0.4064714014530182, "learning_rate": 8.328700598487203e-06, "loss": 0.417, "step": 1413 }, { "epoch": 1.024390243902439, "grad_norm": 0.38404765725135803, "learning_rate": 8.325553676010305e-06, "loss": 0.3797, "step": 1414 }, { "epoch": 1.0251147065926105, "grad_norm": 0.3914385437965393, "learning_rate": 8.322404389367335e-06, "loss": 0.4378, "step": 1415 }, { "epoch": 1.025839169282782, "grad_norm": 0.3627963066101074, "learning_rate": 8.319252740797149e-06, "loss": 0.4031, "step": 1416 }, { "epoch": 1.0265636319729534, "grad_norm": 0.4491138458251953, "learning_rate": 8.316098732540282e-06, "loss": 0.3749, "step": 1417 }, { "epoch": 1.0272880946631249, "grad_norm": 0.39106854796409607, "learning_rate": 8.312942366838949e-06, "loss": 0.4119, "step": 1418 }, { "epoch": 1.0280125573532963, "grad_norm": 0.3640814423561096, "learning_rate": 8.309783645937037e-06, "loss": 0.4091, "step": 1419 }, { "epoch": 1.0287370200434678, "grad_norm": 0.39894425868988037, "learning_rate": 8.30662257208011e-06, "loss": 0.4069, "step": 1420 }, { "epoch": 1.0294614827336392, "grad_norm": 0.3706302344799042, "learning_rate": 8.303459147515403e-06, "loss": 0.413, "step": 1421 }, { "epoch": 1.0301859454238107, "grad_norm": 0.33460697531700134, "learning_rate": 8.300293374491821e-06, "loss": 0.3534, "step": 1422 }, { "epoch": 1.0309104081139822, "grad_norm": 0.39398276805877686, "learning_rate": 8.297125255259947e-06, "loss": 0.4227, "step": 1423 }, { "epoch": 1.0316348708041536, "grad_norm": 0.3733583092689514, "learning_rate": 8.293954792072018e-06, "loss": 0.4309, "step": 1424 }, { "epoch": 1.032359333494325, "grad_norm": 0.3762594163417816, "learning_rate": 8.290781987181951e-06, "loss": 0.4228, "step": 1425 }, { "epoch": 1.0330837961844965, "grad_norm": 0.3468662202358246, "learning_rate": 8.28760684284532e-06, "loss": 0.3857, "step": 1426 }, { "epoch": 1.033808258874668, "grad_norm": 0.3649839460849762, "learning_rate": 8.28442936131936e-06, "loss": 0.3647, "step": 1427 }, { "epoch": 1.0345327215648394, "grad_norm": 0.3683072626590729, "learning_rate": 8.281249544862975e-06, "loss": 0.4163, "step": 1428 }, { "epoch": 1.035257184255011, "grad_norm": 0.38672909140586853, "learning_rate": 8.278067395736725e-06, "loss": 0.4205, "step": 1429 }, { "epoch": 1.0359816469451824, "grad_norm": 0.36870360374450684, "learning_rate": 8.274882916202827e-06, "loss": 0.4142, "step": 1430 }, { "epoch": 1.0367061096353538, "grad_norm": 0.37603163719177246, "learning_rate": 8.271696108525156e-06, "loss": 0.3835, "step": 1431 }, { "epoch": 1.0374305723255253, "grad_norm": 0.36450210213661194, "learning_rate": 8.268506974969244e-06, "loss": 0.3202, "step": 1432 }, { "epoch": 1.0381550350156967, "grad_norm": 0.4050193727016449, "learning_rate": 8.265315517802275e-06, "loss": 0.4313, "step": 1433 }, { "epoch": 1.0388794977058682, "grad_norm": 0.4194083511829376, "learning_rate": 8.262121739293081e-06, "loss": 0.4298, "step": 1434 }, { "epoch": 1.0396039603960396, "grad_norm": 0.3836210370063782, "learning_rate": 8.258925641712152e-06, "loss": 0.4005, "step": 1435 }, { "epoch": 1.040328423086211, "grad_norm": 0.3729094862937927, "learning_rate": 8.25572722733162e-06, "loss": 0.4249, "step": 1436 }, { "epoch": 1.0410528857763826, "grad_norm": 0.3709816336631775, "learning_rate": 8.252526498425271e-06, "loss": 0.3707, "step": 1437 }, { "epoch": 1.041777348466554, "grad_norm": 0.3546146750450134, "learning_rate": 8.249323457268526e-06, "loss": 0.3933, "step": 1438 }, { "epoch": 1.0425018111567255, "grad_norm": 0.40446388721466064, "learning_rate": 8.246118106138458e-06, "loss": 0.4132, "step": 1439 }, { "epoch": 1.043226273846897, "grad_norm": 0.4471367597579956, "learning_rate": 8.24291044731378e-06, "loss": 0.4052, "step": 1440 }, { "epoch": 1.0439507365370684, "grad_norm": 0.3650663197040558, "learning_rate": 8.239700483074848e-06, "loss": 0.385, "step": 1441 }, { "epoch": 1.0446751992272398, "grad_norm": 0.42406901717185974, "learning_rate": 8.23648821570365e-06, "loss": 0.4024, "step": 1442 }, { "epoch": 1.0453996619174113, "grad_norm": 0.38081294298171997, "learning_rate": 8.233273647483822e-06, "loss": 0.3774, "step": 1443 }, { "epoch": 1.0461241246075828, "grad_norm": 0.38282862305641174, "learning_rate": 8.230056780700624e-06, "loss": 0.3987, "step": 1444 }, { "epoch": 1.0468485872977542, "grad_norm": 0.35862892866134644, "learning_rate": 8.226837617640957e-06, "loss": 0.4036, "step": 1445 }, { "epoch": 1.0475730499879257, "grad_norm": 0.4138716757297516, "learning_rate": 8.223616160593353e-06, "loss": 0.4068, "step": 1446 }, { "epoch": 1.0482975126780971, "grad_norm": 0.3680810034275055, "learning_rate": 8.220392411847976e-06, "loss": 0.411, "step": 1447 }, { "epoch": 1.0490219753682686, "grad_norm": 0.4657863974571228, "learning_rate": 8.217166373696618e-06, "loss": 0.4314, "step": 1448 }, { "epoch": 1.04974643805844, "grad_norm": 0.37673744559288025, "learning_rate": 8.213938048432697e-06, "loss": 0.4208, "step": 1449 }, { "epoch": 1.0504709007486115, "grad_norm": 0.3997384011745453, "learning_rate": 8.21070743835126e-06, "loss": 0.4077, "step": 1450 }, { "epoch": 1.051195363438783, "grad_norm": 0.45633262395858765, "learning_rate": 8.207474545748977e-06, "loss": 0.3962, "step": 1451 }, { "epoch": 1.0519198261289544, "grad_norm": 0.33542051911354065, "learning_rate": 8.20423937292414e-06, "loss": 0.3713, "step": 1452 }, { "epoch": 1.0526442888191259, "grad_norm": 0.4026559591293335, "learning_rate": 8.20100192217666e-06, "loss": 0.3953, "step": 1453 }, { "epoch": 1.0533687515092973, "grad_norm": 0.5009774565696716, "learning_rate": 8.197762195808072e-06, "loss": 0.4248, "step": 1454 }, { "epoch": 1.0540932141994688, "grad_norm": 0.36366006731987, "learning_rate": 8.194520196121528e-06, "loss": 0.3834, "step": 1455 }, { "epoch": 1.0548176768896402, "grad_norm": 0.3958222568035126, "learning_rate": 8.191275925421795e-06, "loss": 0.4228, "step": 1456 }, { "epoch": 1.0555421395798117, "grad_norm": 0.4137931764125824, "learning_rate": 8.18802938601525e-06, "loss": 0.3949, "step": 1457 }, { "epoch": 1.0562666022699831, "grad_norm": 0.47269612550735474, "learning_rate": 8.184780580209892e-06, "loss": 0.4152, "step": 1458 }, { "epoch": 1.0569910649601546, "grad_norm": 0.39468979835510254, "learning_rate": 8.181529510315323e-06, "loss": 0.4232, "step": 1459 }, { "epoch": 1.057715527650326, "grad_norm": 0.38648465275764465, "learning_rate": 8.17827617864276e-06, "loss": 0.427, "step": 1460 }, { "epoch": 1.0584399903404975, "grad_norm": 0.38991132378578186, "learning_rate": 8.175020587505026e-06, "loss": 0.3885, "step": 1461 }, { "epoch": 1.059164453030669, "grad_norm": 0.41049543023109436, "learning_rate": 8.171762739216549e-06, "loss": 0.4513, "step": 1462 }, { "epoch": 1.0598889157208404, "grad_norm": 0.3597794771194458, "learning_rate": 8.168502636093361e-06, "loss": 0.3886, "step": 1463 }, { "epoch": 1.0606133784110119, "grad_norm": 0.37402036786079407, "learning_rate": 8.165240280453102e-06, "loss": 0.3803, "step": 1464 }, { "epoch": 1.0613378411011833, "grad_norm": 0.34467101097106934, "learning_rate": 8.161975674615007e-06, "loss": 0.4237, "step": 1465 }, { "epoch": 1.0620623037913548, "grad_norm": 0.4159948229789734, "learning_rate": 8.158708820899916e-06, "loss": 0.388, "step": 1466 }, { "epoch": 1.0627867664815263, "grad_norm": 0.3789779841899872, "learning_rate": 8.155439721630265e-06, "loss": 0.4549, "step": 1467 }, { "epoch": 1.0635112291716977, "grad_norm": 0.33253929018974304, "learning_rate": 8.152168379130082e-06, "loss": 0.3733, "step": 1468 }, { "epoch": 1.0642356918618692, "grad_norm": 0.4742108881473541, "learning_rate": 8.148894795725002e-06, "loss": 0.4393, "step": 1469 }, { "epoch": 1.0649601545520406, "grad_norm": 0.3483884036540985, "learning_rate": 8.145618973742235e-06, "loss": 0.3987, "step": 1470 }, { "epoch": 1.065684617242212, "grad_norm": 0.41775432229042053, "learning_rate": 8.142340915510598e-06, "loss": 0.453, "step": 1471 }, { "epoch": 1.0664090799323835, "grad_norm": 0.3599341809749603, "learning_rate": 8.139060623360494e-06, "loss": 0.3651, "step": 1472 }, { "epoch": 1.067133542622555, "grad_norm": 0.42934584617614746, "learning_rate": 8.135778099623908e-06, "loss": 0.4372, "step": 1473 }, { "epoch": 1.0678580053127265, "grad_norm": 0.3972543478012085, "learning_rate": 8.132493346634417e-06, "loss": 0.4194, "step": 1474 }, { "epoch": 1.068582468002898, "grad_norm": 0.3613380789756775, "learning_rate": 8.129206366727182e-06, "loss": 0.3951, "step": 1475 }, { "epoch": 1.0693069306930694, "grad_norm": 0.33956143260002136, "learning_rate": 8.125917162238945e-06, "loss": 0.3896, "step": 1476 }, { "epoch": 1.0700313933832408, "grad_norm": 0.3630477488040924, "learning_rate": 8.122625735508035e-06, "loss": 0.4247, "step": 1477 }, { "epoch": 1.0707558560734123, "grad_norm": 0.45788103342056274, "learning_rate": 8.119332088874354e-06, "loss": 0.3765, "step": 1478 }, { "epoch": 1.0714803187635837, "grad_norm": 0.33767059445381165, "learning_rate": 8.116036224679384e-06, "loss": 0.378, "step": 1479 }, { "epoch": 1.0722047814537552, "grad_norm": 0.4117564260959625, "learning_rate": 8.112738145266186e-06, "loss": 0.3986, "step": 1480 }, { "epoch": 1.0729292441439267, "grad_norm": 0.3832103908061981, "learning_rate": 8.109437852979397e-06, "loss": 0.3956, "step": 1481 }, { "epoch": 1.073653706834098, "grad_norm": 0.39287522435188293, "learning_rate": 8.106135350165219e-06, "loss": 0.4075, "step": 1482 }, { "epoch": 1.0743781695242696, "grad_norm": 0.39186564087867737, "learning_rate": 8.102830639171435e-06, "loss": 0.4167, "step": 1483 }, { "epoch": 1.075102632214441, "grad_norm": 0.3865715265274048, "learning_rate": 8.099523722347394e-06, "loss": 0.4311, "step": 1484 }, { "epoch": 1.0758270949046125, "grad_norm": 0.36034971475601196, "learning_rate": 8.096214602044011e-06, "loss": 0.3905, "step": 1485 }, { "epoch": 1.076551557594784, "grad_norm": 0.3636012375354767, "learning_rate": 8.092903280613769e-06, "loss": 0.3987, "step": 1486 }, { "epoch": 1.0772760202849554, "grad_norm": 0.3449145555496216, "learning_rate": 8.08958976041072e-06, "loss": 0.4127, "step": 1487 }, { "epoch": 1.0780004829751269, "grad_norm": 0.3932015597820282, "learning_rate": 8.08627404379047e-06, "loss": 0.4256, "step": 1488 }, { "epoch": 1.0787249456652983, "grad_norm": 0.38834038376808167, "learning_rate": 8.082956133110196e-06, "loss": 0.4123, "step": 1489 }, { "epoch": 1.0794494083554698, "grad_norm": 0.34351420402526855, "learning_rate": 8.079636030728629e-06, "loss": 0.3908, "step": 1490 }, { "epoch": 1.0801738710456412, "grad_norm": 0.39909544587135315, "learning_rate": 8.076313739006058e-06, "loss": 0.4286, "step": 1491 }, { "epoch": 1.0808983337358127, "grad_norm": 0.36706382036209106, "learning_rate": 8.072989260304335e-06, "loss": 0.3907, "step": 1492 }, { "epoch": 1.0816227964259841, "grad_norm": 0.3963608741760254, "learning_rate": 8.069662596986859e-06, "loss": 0.399, "step": 1493 }, { "epoch": 1.0823472591161556, "grad_norm": 0.4249162971973419, "learning_rate": 8.066333751418582e-06, "loss": 0.425, "step": 1494 }, { "epoch": 1.083071721806327, "grad_norm": 0.35047653317451477, "learning_rate": 8.063002725966014e-06, "loss": 0.394, "step": 1495 }, { "epoch": 1.0837961844964985, "grad_norm": 0.404999703168869, "learning_rate": 8.05966952299721e-06, "loss": 0.4356, "step": 1496 }, { "epoch": 1.08452064718667, "grad_norm": 0.3913189470767975, "learning_rate": 8.056334144881776e-06, "loss": 0.3884, "step": 1497 }, { "epoch": 1.0852451098768414, "grad_norm": 0.4008939266204834, "learning_rate": 8.052996593990861e-06, "loss": 0.4227, "step": 1498 }, { "epoch": 1.0859695725670129, "grad_norm": 0.3996501564979553, "learning_rate": 8.049656872697158e-06, "loss": 0.4224, "step": 1499 }, { "epoch": 1.0866940352571843, "grad_norm": 0.39700019359588623, "learning_rate": 8.046314983374905e-06, "loss": 0.413, "step": 1500 }, { "epoch": 1.0874184979473558, "grad_norm": 0.3972935378551483, "learning_rate": 8.042970928399885e-06, "loss": 0.4145, "step": 1501 }, { "epoch": 1.0881429606375272, "grad_norm": 0.3841893672943115, "learning_rate": 8.039624710149413e-06, "loss": 0.4273, "step": 1502 }, { "epoch": 1.0888674233276987, "grad_norm": 0.34196680784225464, "learning_rate": 8.036276331002348e-06, "loss": 0.3559, "step": 1503 }, { "epoch": 1.0895918860178702, "grad_norm": 0.3655821979045868, "learning_rate": 8.032925793339082e-06, "loss": 0.4281, "step": 1504 }, { "epoch": 1.0903163487080416, "grad_norm": 0.34008386731147766, "learning_rate": 8.029573099541543e-06, "loss": 0.3867, "step": 1505 }, { "epoch": 1.091040811398213, "grad_norm": 0.3685896098613739, "learning_rate": 8.02621825199319e-06, "loss": 0.4173, "step": 1506 }, { "epoch": 1.0917652740883845, "grad_norm": 0.36212819814682007, "learning_rate": 8.022861253079015e-06, "loss": 0.4128, "step": 1507 }, { "epoch": 1.092489736778556, "grad_norm": 0.3603215515613556, "learning_rate": 8.01950210518554e-06, "loss": 0.4021, "step": 1508 }, { "epoch": 1.0932141994687274, "grad_norm": 0.39289727807044983, "learning_rate": 8.016140810700813e-06, "loss": 0.4051, "step": 1509 }, { "epoch": 1.093938662158899, "grad_norm": 0.3726145029067993, "learning_rate": 8.012777372014405e-06, "loss": 0.3804, "step": 1510 }, { "epoch": 1.0946631248490704, "grad_norm": 0.3663536012172699, "learning_rate": 8.00941179151742e-06, "loss": 0.4282, "step": 1511 }, { "epoch": 1.0953875875392418, "grad_norm": 0.4418599009513855, "learning_rate": 8.006044071602476e-06, "loss": 0.3941, "step": 1512 }, { "epoch": 1.0961120502294133, "grad_norm": 0.378684937953949, "learning_rate": 8.002674214663718e-06, "loss": 0.3922, "step": 1513 }, { "epoch": 1.0968365129195847, "grad_norm": 0.3812818229198456, "learning_rate": 7.999302223096806e-06, "loss": 0.439, "step": 1514 }, { "epoch": 1.0975609756097562, "grad_norm": 0.3553319573402405, "learning_rate": 7.99592809929892e-06, "loss": 0.3577, "step": 1515 }, { "epoch": 1.0982854382999276, "grad_norm": 0.4075811803340912, "learning_rate": 7.992551845668753e-06, "loss": 0.4464, "step": 1516 }, { "epoch": 1.099009900990099, "grad_norm": 0.3536193072795868, "learning_rate": 7.989173464606517e-06, "loss": 0.395, "step": 1517 }, { "epoch": 1.0997343636802706, "grad_norm": 0.38142848014831543, "learning_rate": 7.985792958513932e-06, "loss": 0.4182, "step": 1518 }, { "epoch": 1.100458826370442, "grad_norm": 0.37532854080200195, "learning_rate": 7.98241032979423e-06, "loss": 0.3962, "step": 1519 }, { "epoch": 1.1011832890606135, "grad_norm": 0.36446303129196167, "learning_rate": 7.979025580852153e-06, "loss": 0.4516, "step": 1520 }, { "epoch": 1.101907751750785, "grad_norm": 0.34382522106170654, "learning_rate": 7.97563871409395e-06, "loss": 0.3873, "step": 1521 }, { "epoch": 1.1026322144409564, "grad_norm": 0.35633739829063416, "learning_rate": 7.972249731927372e-06, "loss": 0.4188, "step": 1522 }, { "epoch": 1.1033566771311278, "grad_norm": 0.30252739787101746, "learning_rate": 7.968858636761683e-06, "loss": 0.3883, "step": 1523 }, { "epoch": 1.1040811398212993, "grad_norm": 0.37019383907318115, "learning_rate": 7.965465431007636e-06, "loss": 0.4248, "step": 1524 }, { "epoch": 1.1048056025114708, "grad_norm": 0.3857470154762268, "learning_rate": 7.962070117077496e-06, "loss": 0.4136, "step": 1525 }, { "epoch": 1.1055300652016422, "grad_norm": 0.34702184796333313, "learning_rate": 7.958672697385021e-06, "loss": 0.4083, "step": 1526 }, { "epoch": 1.1062545278918137, "grad_norm": 0.34549760818481445, "learning_rate": 7.955273174345469e-06, "loss": 0.4106, "step": 1527 }, { "epoch": 1.1069789905819851, "grad_norm": 0.3418892025947571, "learning_rate": 7.95187155037559e-06, "loss": 0.3801, "step": 1528 }, { "epoch": 1.1077034532721566, "grad_norm": 0.3711289167404175, "learning_rate": 7.948467827893626e-06, "loss": 0.394, "step": 1529 }, { "epoch": 1.108427915962328, "grad_norm": 0.3855430483818054, "learning_rate": 7.94506200931932e-06, "loss": 0.4428, "step": 1530 }, { "epoch": 1.1091523786524995, "grad_norm": 0.363099604845047, "learning_rate": 7.941654097073897e-06, "loss": 0.4003, "step": 1531 }, { "epoch": 1.109876841342671, "grad_norm": 0.42610087990760803, "learning_rate": 7.93824409358007e-06, "loss": 0.4489, "step": 1532 }, { "epoch": 1.1106013040328424, "grad_norm": 0.3328514099121094, "learning_rate": 7.934832001262043e-06, "loss": 0.3514, "step": 1533 }, { "epoch": 1.1113257667230139, "grad_norm": 0.3609577417373657, "learning_rate": 7.931417822545502e-06, "loss": 0.4006, "step": 1534 }, { "epoch": 1.1120502294131853, "grad_norm": 0.344690203666687, "learning_rate": 7.928001559857621e-06, "loss": 0.3995, "step": 1535 }, { "epoch": 1.1127746921033568, "grad_norm": 0.35158464312553406, "learning_rate": 7.924583215627048e-06, "loss": 0.444, "step": 1536 }, { "epoch": 1.1134991547935282, "grad_norm": 0.3413187861442566, "learning_rate": 7.921162792283918e-06, "loss": 0.3745, "step": 1537 }, { "epoch": 1.1142236174836997, "grad_norm": 0.32190313935279846, "learning_rate": 7.917740292259836e-06, "loss": 0.4076, "step": 1538 }, { "epoch": 1.1149480801738711, "grad_norm": 0.3444294333457947, "learning_rate": 7.914315717987892e-06, "loss": 0.3643, "step": 1539 }, { "epoch": 1.1156725428640426, "grad_norm": 0.3750390112400055, "learning_rate": 7.910889071902647e-06, "loss": 0.4559, "step": 1540 }, { "epoch": 1.116397005554214, "grad_norm": 0.3232389986515045, "learning_rate": 7.907460356440133e-06, "loss": 0.318, "step": 1541 }, { "epoch": 1.1171214682443855, "grad_norm": 0.4188596308231354, "learning_rate": 7.904029574037856e-06, "loss": 0.432, "step": 1542 }, { "epoch": 1.117845930934557, "grad_norm": 0.33879876136779785, "learning_rate": 7.900596727134789e-06, "loss": 0.3871, "step": 1543 }, { "epoch": 1.1185703936247284, "grad_norm": 0.41095590591430664, "learning_rate": 7.897161818171373e-06, "loss": 0.4514, "step": 1544 }, { "epoch": 1.1192948563148999, "grad_norm": 0.3499133288860321, "learning_rate": 7.893724849589517e-06, "loss": 0.3721, "step": 1545 }, { "epoch": 1.1200193190050713, "grad_norm": 0.35060766339302063, "learning_rate": 7.890285823832591e-06, "loss": 0.4032, "step": 1546 }, { "epoch": 1.1207437816952428, "grad_norm": 0.3708325922489166, "learning_rate": 7.886844743345434e-06, "loss": 0.3625, "step": 1547 }, { "epoch": 1.1214682443854143, "grad_norm": 0.36332476139068604, "learning_rate": 7.883401610574338e-06, "loss": 0.3903, "step": 1548 }, { "epoch": 1.1221927070755857, "grad_norm": 0.3788968026638031, "learning_rate": 7.879956427967055e-06, "loss": 0.4174, "step": 1549 }, { "epoch": 1.1229171697657572, "grad_norm": 0.40052083134651184, "learning_rate": 7.876509197972799e-06, "loss": 0.4191, "step": 1550 }, { "epoch": 1.1236416324559286, "grad_norm": 0.34152644872665405, "learning_rate": 7.873059923042237e-06, "loss": 0.3962, "step": 1551 }, { "epoch": 1.1243660951461, "grad_norm": 0.3820720314979553, "learning_rate": 7.86960860562749e-06, "loss": 0.4238, "step": 1552 }, { "epoch": 1.1250905578362715, "grad_norm": 0.42130035161972046, "learning_rate": 7.866155248182129e-06, "loss": 0.4033, "step": 1553 }, { "epoch": 1.125815020526443, "grad_norm": 0.36376193165779114, "learning_rate": 7.862699853161177e-06, "loss": 0.3887, "step": 1554 }, { "epoch": 1.1265394832166145, "grad_norm": 0.45998579263687134, "learning_rate": 7.859242423021108e-06, "loss": 0.4213, "step": 1555 }, { "epoch": 1.127263945906786, "grad_norm": 0.38748589158058167, "learning_rate": 7.855782960219838e-06, "loss": 0.4046, "step": 1556 }, { "epoch": 1.1279884085969574, "grad_norm": 0.4169427752494812, "learning_rate": 7.85232146721673e-06, "loss": 0.4436, "step": 1557 }, { "epoch": 1.1287128712871288, "grad_norm": 0.7593168020248413, "learning_rate": 7.848857946472592e-06, "loss": 0.3664, "step": 1558 }, { "epoch": 1.1294373339773003, "grad_norm": 0.44358664751052856, "learning_rate": 7.84539240044967e-06, "loss": 0.4331, "step": 1559 }, { "epoch": 1.1301617966674717, "grad_norm": 0.38565176725387573, "learning_rate": 7.841924831611653e-06, "loss": 0.4077, "step": 1560 }, { "epoch": 1.1308862593576432, "grad_norm": 0.34713250398635864, "learning_rate": 7.838455242423669e-06, "loss": 0.3841, "step": 1561 }, { "epoch": 1.1316107220478147, "grad_norm": 0.364738404750824, "learning_rate": 7.834983635352277e-06, "loss": 0.4277, "step": 1562 }, { "epoch": 1.1323351847379861, "grad_norm": 0.3894791901111603, "learning_rate": 7.831510012865475e-06, "loss": 0.4005, "step": 1563 }, { "epoch": 1.1330596474281576, "grad_norm": 0.36206570267677307, "learning_rate": 7.828034377432694e-06, "loss": 0.4038, "step": 1564 }, { "epoch": 1.133784110118329, "grad_norm": 0.36219966411590576, "learning_rate": 7.824556731524793e-06, "loss": 0.4278, "step": 1565 }, { "epoch": 1.1345085728085005, "grad_norm": 0.31835147738456726, "learning_rate": 7.821077077614062e-06, "loss": 0.3673, "step": 1566 }, { "epoch": 1.135233035498672, "grad_norm": 0.38734665513038635, "learning_rate": 7.817595418174217e-06, "loss": 0.4583, "step": 1567 }, { "epoch": 1.1359574981888434, "grad_norm": 0.3554036617279053, "learning_rate": 7.814111755680402e-06, "loss": 0.3761, "step": 1568 }, { "epoch": 1.1366819608790149, "grad_norm": 0.35998445749282837, "learning_rate": 7.810626092609186e-06, "loss": 0.39, "step": 1569 }, { "epoch": 1.1374064235691863, "grad_norm": 0.340089350938797, "learning_rate": 7.807138431438557e-06, "loss": 0.4147, "step": 1570 }, { "epoch": 1.1381308862593578, "grad_norm": 0.3223473131656647, "learning_rate": 7.803648774647923e-06, "loss": 0.3829, "step": 1571 }, { "epoch": 1.1388553489495292, "grad_norm": 0.3716624081134796, "learning_rate": 7.800157124718115e-06, "loss": 0.4338, "step": 1572 }, { "epoch": 1.1395798116397007, "grad_norm": 0.330165833234787, "learning_rate": 7.796663484131378e-06, "loss": 0.4005, "step": 1573 }, { "epoch": 1.1403042743298721, "grad_norm": 0.36313503980636597, "learning_rate": 7.79316785537137e-06, "loss": 0.3882, "step": 1574 }, { "epoch": 1.1410287370200434, "grad_norm": 0.36245474219322205, "learning_rate": 7.789670240923169e-06, "loss": 0.4387, "step": 1575 }, { "epoch": 1.1417531997102148, "grad_norm": 0.38193294405937195, "learning_rate": 7.786170643273256e-06, "loss": 0.4295, "step": 1576 }, { "epoch": 1.1424776624003863, "grad_norm": 0.3587680757045746, "learning_rate": 7.78266906490953e-06, "loss": 0.437, "step": 1577 }, { "epoch": 1.1432021250905577, "grad_norm": 0.3317493200302124, "learning_rate": 7.779165508321293e-06, "loss": 0.3849, "step": 1578 }, { "epoch": 1.1439265877807292, "grad_norm": 0.3858107030391693, "learning_rate": 7.775659975999257e-06, "loss": 0.4057, "step": 1579 }, { "epoch": 1.1446510504709007, "grad_norm": 0.34380778670310974, "learning_rate": 7.772152470435535e-06, "loss": 0.3907, "step": 1580 }, { "epoch": 1.1453755131610721, "grad_norm": 0.3467852771282196, "learning_rate": 7.768642994123643e-06, "loss": 0.4165, "step": 1581 }, { "epoch": 1.1460999758512436, "grad_norm": 0.33887556195259094, "learning_rate": 7.765131549558503e-06, "loss": 0.4067, "step": 1582 }, { "epoch": 1.146824438541415, "grad_norm": 0.3402368724346161, "learning_rate": 7.761618139236429e-06, "loss": 0.4065, "step": 1583 }, { "epoch": 1.1475489012315865, "grad_norm": 0.3579900860786438, "learning_rate": 7.758102765655136e-06, "loss": 0.4533, "step": 1584 }, { "epoch": 1.148273363921758, "grad_norm": 0.3368096947669983, "learning_rate": 7.754585431313738e-06, "loss": 0.3816, "step": 1585 }, { "epoch": 1.1489978266119294, "grad_norm": 0.34446173906326294, "learning_rate": 7.751066138712738e-06, "loss": 0.3842, "step": 1586 }, { "epoch": 1.1497222893021009, "grad_norm": 0.35595953464508057, "learning_rate": 7.747544890354031e-06, "loss": 0.4044, "step": 1587 }, { "epoch": 1.1504467519922723, "grad_norm": 0.3451770842075348, "learning_rate": 7.744021688740905e-06, "loss": 0.3803, "step": 1588 }, { "epoch": 1.1511712146824438, "grad_norm": 0.4358142614364624, "learning_rate": 7.74049653637804e-06, "loss": 0.4347, "step": 1589 }, { "epoch": 1.1518956773726152, "grad_norm": 0.33285999298095703, "learning_rate": 7.736969435771493e-06, "loss": 0.3794, "step": 1590 }, { "epoch": 1.1526201400627867, "grad_norm": 0.38750723004341125, "learning_rate": 7.733440389428714e-06, "loss": 0.3878, "step": 1591 }, { "epoch": 1.1533446027529581, "grad_norm": 0.39403268694877625, "learning_rate": 7.729909399858533e-06, "loss": 0.4065, "step": 1592 }, { "epoch": 1.1540690654431296, "grad_norm": 0.38967403769493103, "learning_rate": 7.726376469571165e-06, "loss": 0.4229, "step": 1593 }, { "epoch": 1.154793528133301, "grad_norm": 0.4069700539112091, "learning_rate": 7.722841601078197e-06, "loss": 0.4086, "step": 1594 }, { "epoch": 1.1555179908234725, "grad_norm": 0.3582433760166168, "learning_rate": 7.719304796892602e-06, "loss": 0.35, "step": 1595 }, { "epoch": 1.156242453513644, "grad_norm": 0.34599021077156067, "learning_rate": 7.715766059528727e-06, "loss": 0.4128, "step": 1596 }, { "epoch": 1.1569669162038154, "grad_norm": 0.4059074819087982, "learning_rate": 7.71222539150229e-06, "loss": 0.4307, "step": 1597 }, { "epoch": 1.1576913788939869, "grad_norm": 0.4193807542324066, "learning_rate": 7.708682795330385e-06, "loss": 0.3707, "step": 1598 }, { "epoch": 1.1584158415841583, "grad_norm": 0.4335813522338867, "learning_rate": 7.705138273531476e-06, "loss": 0.4281, "step": 1599 }, { "epoch": 1.1591403042743298, "grad_norm": 0.3562397360801697, "learning_rate": 7.701591828625395e-06, "loss": 0.3959, "step": 1600 }, { "epoch": 1.1598647669645012, "grad_norm": 0.42271822690963745, "learning_rate": 7.69804346313334e-06, "loss": 0.4018, "step": 1601 }, { "epoch": 1.1605892296546727, "grad_norm": 0.45580610632896423, "learning_rate": 7.69449317957788e-06, "loss": 0.4497, "step": 1602 }, { "epoch": 1.1613136923448442, "grad_norm": 0.3703674077987671, "learning_rate": 7.690940980482938e-06, "loss": 0.3862, "step": 1603 }, { "epoch": 1.1620381550350156, "grad_norm": 0.37878236174583435, "learning_rate": 7.687386868373807e-06, "loss": 0.4037, "step": 1604 }, { "epoch": 1.162762617725187, "grad_norm": 0.4318549633026123, "learning_rate": 7.683830845777141e-06, "loss": 0.3921, "step": 1605 }, { "epoch": 1.1634870804153585, "grad_norm": 0.380543053150177, "learning_rate": 7.680272915220941e-06, "loss": 0.376, "step": 1606 }, { "epoch": 1.16421154310553, "grad_norm": 0.3799059987068176, "learning_rate": 7.676713079234578e-06, "loss": 0.408, "step": 1607 }, { "epoch": 1.1649360057957014, "grad_norm": 0.38699576258659363, "learning_rate": 7.67315134034877e-06, "loss": 0.447, "step": 1608 }, { "epoch": 1.165660468485873, "grad_norm": 0.35602912306785583, "learning_rate": 7.669587701095589e-06, "loss": 0.4217, "step": 1609 }, { "epoch": 1.1663849311760444, "grad_norm": 0.35262995958328247, "learning_rate": 7.666022164008458e-06, "loss": 0.3865, "step": 1610 }, { "epoch": 1.1671093938662158, "grad_norm": 0.35893377661705017, "learning_rate": 7.66245473162215e-06, "loss": 0.3903, "step": 1611 }, { "epoch": 1.1678338565563873, "grad_norm": 0.3518680930137634, "learning_rate": 7.658885406472781e-06, "loss": 0.4097, "step": 1612 }, { "epoch": 1.1685583192465587, "grad_norm": 0.37274786829948425, "learning_rate": 7.655314191097822e-06, "loss": 0.3829, "step": 1613 }, { "epoch": 1.1692827819367302, "grad_norm": 0.4532231390476227, "learning_rate": 7.65174108803608e-06, "loss": 0.4199, "step": 1614 }, { "epoch": 1.1700072446269016, "grad_norm": 0.34309032559394836, "learning_rate": 7.648166099827706e-06, "loss": 0.4278, "step": 1615 }, { "epoch": 1.170731707317073, "grad_norm": 0.36645376682281494, "learning_rate": 7.644589229014191e-06, "loss": 0.4204, "step": 1616 }, { "epoch": 1.1714561700072446, "grad_norm": 0.36509013175964355, "learning_rate": 7.641010478138365e-06, "loss": 0.4306, "step": 1617 }, { "epoch": 1.172180632697416, "grad_norm": 0.36138054728507996, "learning_rate": 7.637429849744395e-06, "loss": 0.4077, "step": 1618 }, { "epoch": 1.1729050953875875, "grad_norm": 0.3370508849620819, "learning_rate": 7.633847346377781e-06, "loss": 0.4126, "step": 1619 }, { "epoch": 1.173629558077759, "grad_norm": 0.33481791615486145, "learning_rate": 7.630262970585355e-06, "loss": 0.3542, "step": 1620 }, { "epoch": 1.1743540207679304, "grad_norm": 0.41411057114601135, "learning_rate": 7.626676724915288e-06, "loss": 0.4253, "step": 1621 }, { "epoch": 1.1750784834581018, "grad_norm": 0.3634917438030243, "learning_rate": 7.623088611917069e-06, "loss": 0.4525, "step": 1622 }, { "epoch": 1.1758029461482733, "grad_norm": 0.3690543472766876, "learning_rate": 7.619498634141521e-06, "loss": 0.3851, "step": 1623 }, { "epoch": 1.1765274088384448, "grad_norm": 0.3340941369533539, "learning_rate": 7.615906794140793e-06, "loss": 0.4035, "step": 1624 }, { "epoch": 1.1772518715286162, "grad_norm": 0.3646768629550934, "learning_rate": 7.6123130944683535e-06, "loss": 0.4316, "step": 1625 }, { "epoch": 1.1779763342187877, "grad_norm": 0.33864498138427734, "learning_rate": 7.608717537679e-06, "loss": 0.373, "step": 1626 }, { "epoch": 1.1787007969089591, "grad_norm": 0.35463687777519226, "learning_rate": 7.605120126328844e-06, "loss": 0.4251, "step": 1627 }, { "epoch": 1.1794252595991306, "grad_norm": 0.32441791892051697, "learning_rate": 7.6015208629753166e-06, "loss": 0.389, "step": 1628 }, { "epoch": 1.180149722289302, "grad_norm": 0.3634214997291565, "learning_rate": 7.597919750177168e-06, "loss": 0.3931, "step": 1629 }, { "epoch": 1.1808741849794735, "grad_norm": 0.3587419390678406, "learning_rate": 7.594316790494463e-06, "loss": 0.4377, "step": 1630 }, { "epoch": 1.181598647669645, "grad_norm": 0.33682599663734436, "learning_rate": 7.590711986488575e-06, "loss": 0.3864, "step": 1631 }, { "epoch": 1.1823231103598164, "grad_norm": 0.3634129762649536, "learning_rate": 7.587105340722194e-06, "loss": 0.414, "step": 1632 }, { "epoch": 1.1830475730499879, "grad_norm": 0.33260011672973633, "learning_rate": 7.5834968557593155e-06, "loss": 0.4132, "step": 1633 }, { "epoch": 1.1837720357401593, "grad_norm": 0.39765095710754395, "learning_rate": 7.579886534165244e-06, "loss": 0.3736, "step": 1634 }, { "epoch": 1.1844964984303308, "grad_norm": 0.3524782955646515, "learning_rate": 7.5762743785065916e-06, "loss": 0.4282, "step": 1635 }, { "epoch": 1.1852209611205022, "grad_norm": 0.35177063941955566, "learning_rate": 7.572660391351271e-06, "loss": 0.3898, "step": 1636 }, { "epoch": 1.1859454238106737, "grad_norm": 0.4276060461997986, "learning_rate": 7.569044575268498e-06, "loss": 0.379, "step": 1637 }, { "epoch": 1.1866698865008452, "grad_norm": 0.3506036400794983, "learning_rate": 7.56542693282879e-06, "loss": 0.3976, "step": 1638 }, { "epoch": 1.1873943491910166, "grad_norm": 0.45333075523376465, "learning_rate": 7.561807466603959e-06, "loss": 0.4423, "step": 1639 }, { "epoch": 1.188118811881188, "grad_norm": 0.33689653873443604, "learning_rate": 7.558186179167118e-06, "loss": 0.3784, "step": 1640 }, { "epoch": 1.1888432745713595, "grad_norm": 0.34541380405426025, "learning_rate": 7.554563073092674e-06, "loss": 0.3637, "step": 1641 }, { "epoch": 1.189567737261531, "grad_norm": 0.42089274525642395, "learning_rate": 7.550938150956323e-06, "loss": 0.4081, "step": 1642 }, { "epoch": 1.1902921999517024, "grad_norm": 0.31626227498054504, "learning_rate": 7.547311415335057e-06, "loss": 0.3991, "step": 1643 }, { "epoch": 1.191016662641874, "grad_norm": 0.3666383922100067, "learning_rate": 7.543682868807154e-06, "loss": 0.4271, "step": 1644 }, { "epoch": 1.1917411253320453, "grad_norm": 0.3597119152545929, "learning_rate": 7.540052513952181e-06, "loss": 0.3825, "step": 1645 }, { "epoch": 1.1924655880222168, "grad_norm": 0.34300053119659424, "learning_rate": 7.53642035335099e-06, "loss": 0.3781, "step": 1646 }, { "epoch": 1.1931900507123883, "grad_norm": 0.364274799823761, "learning_rate": 7.532786389585715e-06, "loss": 0.4156, "step": 1647 }, { "epoch": 1.1939145134025597, "grad_norm": 0.3367619514465332, "learning_rate": 7.529150625239776e-06, "loss": 0.4256, "step": 1648 }, { "epoch": 1.1946389760927312, "grad_norm": 0.3503808081150055, "learning_rate": 7.525513062897871e-06, "loss": 0.3946, "step": 1649 }, { "epoch": 1.1953634387829026, "grad_norm": 0.3694020211696625, "learning_rate": 7.521873705145972e-06, "loss": 0.4214, "step": 1650 }, { "epoch": 1.196087901473074, "grad_norm": 0.3719983398914337, "learning_rate": 7.518232554571336e-06, "loss": 0.4285, "step": 1651 }, { "epoch": 1.1968123641632455, "grad_norm": 0.35522234439849854, "learning_rate": 7.514589613762487e-06, "loss": 0.3805, "step": 1652 }, { "epoch": 1.197536826853417, "grad_norm": 0.35049933195114136, "learning_rate": 7.5109448853092225e-06, "loss": 0.3964, "step": 1653 }, { "epoch": 1.1982612895435885, "grad_norm": 0.3597317337989807, "learning_rate": 7.507298371802617e-06, "loss": 0.3736, "step": 1654 }, { "epoch": 1.19898575223376, "grad_norm": 0.39392027258872986, "learning_rate": 7.503650075835007e-06, "loss": 0.4439, "step": 1655 }, { "epoch": 1.1997102149239314, "grad_norm": 0.3320519030094147, "learning_rate": 7.500000000000001e-06, "loss": 0.3752, "step": 1656 }, { "epoch": 1.2004346776141028, "grad_norm": 0.3573497533798218, "learning_rate": 7.496348146892469e-06, "loss": 0.4077, "step": 1657 }, { "epoch": 1.2011591403042743, "grad_norm": 0.33668404817581177, "learning_rate": 7.492694519108548e-06, "loss": 0.3808, "step": 1658 }, { "epoch": 1.2018836029944457, "grad_norm": 0.3581962585449219, "learning_rate": 7.4890391192456335e-06, "loss": 0.3992, "step": 1659 }, { "epoch": 1.2026080656846172, "grad_norm": 0.30809929966926575, "learning_rate": 7.4853819499023815e-06, "loss": 0.363, "step": 1660 }, { "epoch": 1.2033325283747887, "grad_norm": 0.38527464866638184, "learning_rate": 7.481723013678707e-06, "loss": 0.44, "step": 1661 }, { "epoch": 1.2040569910649601, "grad_norm": 0.4044356644153595, "learning_rate": 7.478062313175784e-06, "loss": 0.3751, "step": 1662 }, { "epoch": 1.2047814537551316, "grad_norm": 0.39288926124572754, "learning_rate": 7.4743998509960346e-06, "loss": 0.4159, "step": 1663 }, { "epoch": 1.205505916445303, "grad_norm": 0.36967727541923523, "learning_rate": 7.470735629743136e-06, "loss": 0.377, "step": 1664 }, { "epoch": 1.2062303791354745, "grad_norm": 0.3640151619911194, "learning_rate": 7.467069652022017e-06, "loss": 0.3789, "step": 1665 }, { "epoch": 1.206954841825646, "grad_norm": 0.3985443413257599, "learning_rate": 7.4634019204388534e-06, "loss": 0.4163, "step": 1666 }, { "epoch": 1.2076793045158174, "grad_norm": 0.37213510274887085, "learning_rate": 7.459732437601068e-06, "loss": 0.4241, "step": 1667 }, { "epoch": 1.2084037672059889, "grad_norm": 0.34115949273109436, "learning_rate": 7.4560612061173324e-06, "loss": 0.3535, "step": 1668 }, { "epoch": 1.2091282298961603, "grad_norm": 0.383075088262558, "learning_rate": 7.452388228597554e-06, "loss": 0.407, "step": 1669 }, { "epoch": 1.2098526925863318, "grad_norm": 0.3732651174068451, "learning_rate": 7.448713507652889e-06, "loss": 0.417, "step": 1670 }, { "epoch": 1.2105771552765032, "grad_norm": 0.35721665620803833, "learning_rate": 7.445037045895727e-06, "loss": 0.3986, "step": 1671 }, { "epoch": 1.2113016179666747, "grad_norm": 0.372534841299057, "learning_rate": 7.4413588459397e-06, "loss": 0.3827, "step": 1672 }, { "epoch": 1.2120260806568461, "grad_norm": 0.3685985803604126, "learning_rate": 7.437678910399672e-06, "loss": 0.4048, "step": 1673 }, { "epoch": 1.2127505433470176, "grad_norm": 0.41066089272499084, "learning_rate": 7.433997241891743e-06, "loss": 0.4303, "step": 1674 }, { "epoch": 1.213475006037189, "grad_norm": 0.3548670709133148, "learning_rate": 7.4303138430332445e-06, "loss": 0.383, "step": 1675 }, { "epoch": 1.2141994687273605, "grad_norm": 0.37299004197120667, "learning_rate": 7.426628716442739e-06, "loss": 0.4189, "step": 1676 }, { "epoch": 1.214923931417532, "grad_norm": 0.32896509766578674, "learning_rate": 7.422941864740012e-06, "loss": 0.3409, "step": 1677 }, { "epoch": 1.2156483941077034, "grad_norm": 0.40639543533325195, "learning_rate": 7.419253290546084e-06, "loss": 0.4362, "step": 1678 }, { "epoch": 1.2163728567978749, "grad_norm": 0.34627580642700195, "learning_rate": 7.415562996483193e-06, "loss": 0.4009, "step": 1679 }, { "epoch": 1.2170973194880463, "grad_norm": 0.3775731325149536, "learning_rate": 7.411870985174804e-06, "loss": 0.416, "step": 1680 }, { "epoch": 1.2178217821782178, "grad_norm": 0.3481098413467407, "learning_rate": 7.4081772592456e-06, "loss": 0.4232, "step": 1681 }, { "epoch": 1.2185462448683893, "grad_norm": 0.3428622782230377, "learning_rate": 7.4044818213214854e-06, "loss": 0.3925, "step": 1682 }, { "epoch": 1.2192707075585607, "grad_norm": 0.377108633518219, "learning_rate": 7.400784674029579e-06, "loss": 0.4285, "step": 1683 }, { "epoch": 1.2199951702487322, "grad_norm": 0.3979058861732483, "learning_rate": 7.397085819998217e-06, "loss": 0.3772, "step": 1684 }, { "epoch": 1.2207196329389036, "grad_norm": 0.3464503884315491, "learning_rate": 7.393385261856946e-06, "loss": 0.4012, "step": 1685 }, { "epoch": 1.221444095629075, "grad_norm": 0.3662325143814087, "learning_rate": 7.3896830022365286e-06, "loss": 0.3903, "step": 1686 }, { "epoch": 1.2221685583192465, "grad_norm": 0.34895655512809753, "learning_rate": 7.3859790437689335e-06, "loss": 0.4171, "step": 1687 }, { "epoch": 1.222893021009418, "grad_norm": 0.3998792767524719, "learning_rate": 7.382273389087338e-06, "loss": 0.4057, "step": 1688 }, { "epoch": 1.2236174836995894, "grad_norm": 0.35972341895103455, "learning_rate": 7.378566040826126e-06, "loss": 0.3782, "step": 1689 }, { "epoch": 1.224341946389761, "grad_norm": 0.36243900656700134, "learning_rate": 7.374857001620885e-06, "loss": 0.409, "step": 1690 }, { "epoch": 1.2250664090799324, "grad_norm": 0.404752641916275, "learning_rate": 7.371146274108403e-06, "loss": 0.3925, "step": 1691 }, { "epoch": 1.2257908717701038, "grad_norm": 0.37822216749191284, "learning_rate": 7.3674338609266705e-06, "loss": 0.4, "step": 1692 }, { "epoch": 1.2265153344602753, "grad_norm": 0.4117165505886078, "learning_rate": 7.363719764714875e-06, "loss": 0.3859, "step": 1693 }, { "epoch": 1.2272397971504467, "grad_norm": 0.4307750165462494, "learning_rate": 7.360003988113402e-06, "loss": 0.3983, "step": 1694 }, { "epoch": 1.2279642598406182, "grad_norm": 0.3748008608818054, "learning_rate": 7.356286533763833e-06, "loss": 0.3959, "step": 1695 }, { "epoch": 1.2286887225307896, "grad_norm": 0.4690828323364258, "learning_rate": 7.352567404308932e-06, "loss": 0.4572, "step": 1696 }, { "epoch": 1.229413185220961, "grad_norm": 0.34172171354293823, "learning_rate": 7.3488466023926685e-06, "loss": 0.4159, "step": 1697 }, { "epoch": 1.2301376479111326, "grad_norm": 0.39245522022247314, "learning_rate": 7.345124130660191e-06, "loss": 0.3811, "step": 1698 }, { "epoch": 1.230862110601304, "grad_norm": 0.3977771997451782, "learning_rate": 7.341399991757841e-06, "loss": 0.3933, "step": 1699 }, { "epoch": 1.2315865732914755, "grad_norm": 0.349505215883255, "learning_rate": 7.337674188333139e-06, "loss": 0.4147, "step": 1700 }, { "epoch": 1.232311035981647, "grad_norm": 0.4053078591823578, "learning_rate": 7.333946723034794e-06, "loss": 0.3954, "step": 1701 }, { "epoch": 1.2330354986718184, "grad_norm": 0.36326760053634644, "learning_rate": 7.330217598512696e-06, "loss": 0.4026, "step": 1702 }, { "epoch": 1.2337599613619898, "grad_norm": 0.33517134189605713, "learning_rate": 7.326486817417911e-06, "loss": 0.4201, "step": 1703 }, { "epoch": 1.2344844240521613, "grad_norm": 0.3933173716068268, "learning_rate": 7.322754382402684e-06, "loss": 0.3692, "step": 1704 }, { "epoch": 1.2352088867423328, "grad_norm": 0.38562554121017456, "learning_rate": 7.319020296120439e-06, "loss": 0.3879, "step": 1705 }, { "epoch": 1.2359333494325042, "grad_norm": 0.356254518032074, "learning_rate": 7.315284561225772e-06, "loss": 0.3875, "step": 1706 }, { "epoch": 1.2366578121226757, "grad_norm": 0.41129112243652344, "learning_rate": 7.311547180374448e-06, "loss": 0.3987, "step": 1707 }, { "epoch": 1.2373822748128471, "grad_norm": 0.3384135067462921, "learning_rate": 7.307808156223407e-06, "loss": 0.3964, "step": 1708 }, { "epoch": 1.2381067375030186, "grad_norm": 0.3978455066680908, "learning_rate": 7.304067491430753e-06, "loss": 0.4012, "step": 1709 }, { "epoch": 1.23883120019319, "grad_norm": 0.3722447156906128, "learning_rate": 7.300325188655762e-06, "loss": 0.3686, "step": 1710 }, { "epoch": 1.2395556628833615, "grad_norm": 0.4027857780456543, "learning_rate": 7.2965812505588665e-06, "loss": 0.4274, "step": 1711 }, { "epoch": 1.240280125573533, "grad_norm": 0.3926340639591217, "learning_rate": 7.292835679801668e-06, "loss": 0.4042, "step": 1712 }, { "epoch": 1.2410045882637044, "grad_norm": 0.3905361592769623, "learning_rate": 7.289088479046925e-06, "loss": 0.3842, "step": 1713 }, { "epoch": 1.2417290509538759, "grad_norm": 0.31983792781829834, "learning_rate": 7.285339650958558e-06, "loss": 0.3818, "step": 1714 }, { "epoch": 1.2424535136440473, "grad_norm": 0.35816118121147156, "learning_rate": 7.281589198201642e-06, "loss": 0.397, "step": 1715 }, { "epoch": 1.2431779763342188, "grad_norm": 0.3511388599872589, "learning_rate": 7.277837123442408e-06, "loss": 0.4329, "step": 1716 }, { "epoch": 1.2439024390243902, "grad_norm": 0.345885694026947, "learning_rate": 7.27408342934824e-06, "loss": 0.3825, "step": 1717 }, { "epoch": 1.2446269017145617, "grad_norm": 0.38150081038475037, "learning_rate": 7.270328118587671e-06, "loss": 0.4352, "step": 1718 }, { "epoch": 1.2453513644047332, "grad_norm": 0.3469431698322296, "learning_rate": 7.266571193830387e-06, "loss": 0.3823, "step": 1719 }, { "epoch": 1.2460758270949046, "grad_norm": 0.33331814408302307, "learning_rate": 7.262812657747218e-06, "loss": 0.4245, "step": 1720 }, { "epoch": 1.246800289785076, "grad_norm": 0.3329892158508301, "learning_rate": 7.2590525130101454e-06, "loss": 0.4009, "step": 1721 }, { "epoch": 1.2475247524752475, "grad_norm": 0.3421848714351654, "learning_rate": 7.255290762292286e-06, "loss": 0.4078, "step": 1722 }, { "epoch": 1.248249215165419, "grad_norm": 0.3946518003940582, "learning_rate": 7.251527408267901e-06, "loss": 0.4107, "step": 1723 }, { "epoch": 1.2489736778555904, "grad_norm": 0.34401705861091614, "learning_rate": 7.247762453612397e-06, "loss": 0.4227, "step": 1724 }, { "epoch": 1.249698140545762, "grad_norm": 0.36019793152809143, "learning_rate": 7.243995901002312e-06, "loss": 0.379, "step": 1725 }, { "epoch": 1.2504226032359333, "grad_norm": 0.32511022686958313, "learning_rate": 7.240227753115321e-06, "loss": 0.384, "step": 1726 }, { "epoch": 1.2511470659261048, "grad_norm": 0.34583526849746704, "learning_rate": 7.236458012630237e-06, "loss": 0.376, "step": 1727 }, { "epoch": 1.2518715286162763, "grad_norm": 0.37585729360580444, "learning_rate": 7.232686682227001e-06, "loss": 0.4359, "step": 1728 }, { "epoch": 1.2525959913064477, "grad_norm": 0.35362881422042847, "learning_rate": 7.228913764586685e-06, "loss": 0.3666, "step": 1729 }, { "epoch": 1.2533204539966192, "grad_norm": 0.3630039095878601, "learning_rate": 7.225139262391493e-06, "loss": 0.3595, "step": 1730 }, { "epoch": 1.2540449166867906, "grad_norm": 0.36929652094841003, "learning_rate": 7.221363178324748e-06, "loss": 0.4248, "step": 1731 }, { "epoch": 1.254769379376962, "grad_norm": 0.34872713685035706, "learning_rate": 7.217585515070906e-06, "loss": 0.3539, "step": 1732 }, { "epoch": 1.2554938420671335, "grad_norm": 0.3870997726917267, "learning_rate": 7.213806275315541e-06, "loss": 0.4068, "step": 1733 }, { "epoch": 1.256218304757305, "grad_norm": 0.37144216895103455, "learning_rate": 7.210025461745348e-06, "loss": 0.4029, "step": 1734 }, { "epoch": 1.2569427674474765, "grad_norm": 0.36817893385887146, "learning_rate": 7.206243077048142e-06, "loss": 0.4518, "step": 1735 }, { "epoch": 1.257667230137648, "grad_norm": 0.33840394020080566, "learning_rate": 7.2024591239128535e-06, "loss": 0.4109, "step": 1736 }, { "epoch": 1.2583916928278194, "grad_norm": 0.3906187117099762, "learning_rate": 7.198673605029529e-06, "loss": 0.4238, "step": 1737 }, { "epoch": 1.2591161555179908, "grad_norm": 0.3902265727519989, "learning_rate": 7.19488652308933e-06, "loss": 0.4309, "step": 1738 }, { "epoch": 1.2598406182081623, "grad_norm": 0.32534727454185486, "learning_rate": 7.191097880784523e-06, "loss": 0.3683, "step": 1739 }, { "epoch": 1.2605650808983337, "grad_norm": 0.36915186047554016, "learning_rate": 7.187307680808493e-06, "loss": 0.4549, "step": 1740 }, { "epoch": 1.2612895435885052, "grad_norm": 0.37926775217056274, "learning_rate": 7.183515925855724e-06, "loss": 0.4061, "step": 1741 }, { "epoch": 1.2620140062786767, "grad_norm": 0.3291555643081665, "learning_rate": 7.179722618621811e-06, "loss": 0.385, "step": 1742 }, { "epoch": 1.2627384689688481, "grad_norm": 0.32689204812049866, "learning_rate": 7.175927761803448e-06, "loss": 0.4065, "step": 1743 }, { "epoch": 1.2634629316590196, "grad_norm": 0.2898622453212738, "learning_rate": 7.172131358098432e-06, "loss": 0.3615, "step": 1744 }, { "epoch": 1.264187394349191, "grad_norm": 0.4025760591030121, "learning_rate": 7.168333410205663e-06, "loss": 0.4722, "step": 1745 }, { "epoch": 1.2649118570393625, "grad_norm": 0.3480073809623718, "learning_rate": 7.164533920825137e-06, "loss": 0.3751, "step": 1746 }, { "epoch": 1.265636319729534, "grad_norm": 0.40079009532928467, "learning_rate": 7.160732892657943e-06, "loss": 0.3995, "step": 1747 }, { "epoch": 1.2663607824197054, "grad_norm": 0.3732667863368988, "learning_rate": 7.156930328406268e-06, "loss": 0.4303, "step": 1748 }, { "epoch": 1.2670852451098769, "grad_norm": 0.33641576766967773, "learning_rate": 7.153126230773388e-06, "loss": 0.3961, "step": 1749 }, { "epoch": 1.2678097078000483, "grad_norm": 0.3963044285774231, "learning_rate": 7.14932060246367e-06, "loss": 0.3906, "step": 1750 }, { "epoch": 1.2685341704902198, "grad_norm": 0.4544864594936371, "learning_rate": 7.1455134461825715e-06, "loss": 0.4504, "step": 1751 }, { "epoch": 1.2692586331803912, "grad_norm": 0.31675705313682556, "learning_rate": 7.141704764636632e-06, "loss": 0.3706, "step": 1752 }, { "epoch": 1.2699830958705627, "grad_norm": 0.4497453272342682, "learning_rate": 7.137894560533478e-06, "loss": 0.4074, "step": 1753 }, { "epoch": 1.2707075585607341, "grad_norm": 0.44499677419662476, "learning_rate": 7.134082836581819e-06, "loss": 0.409, "step": 1754 }, { "epoch": 1.2714320212509056, "grad_norm": 0.35697701573371887, "learning_rate": 7.130269595491443e-06, "loss": 0.4013, "step": 1755 }, { "epoch": 1.272156483941077, "grad_norm": 0.3761662244796753, "learning_rate": 7.126454839973217e-06, "loss": 0.3999, "step": 1756 }, { "epoch": 1.2728809466312485, "grad_norm": 0.3575904071331024, "learning_rate": 7.122638572739087e-06, "loss": 0.3811, "step": 1757 }, { "epoch": 1.27360540932142, "grad_norm": 0.36404016613960266, "learning_rate": 7.118820796502069e-06, "loss": 0.3941, "step": 1758 }, { "epoch": 1.2743298720115914, "grad_norm": 0.39243432879447937, "learning_rate": 7.115001513976257e-06, "loss": 0.4409, "step": 1759 }, { "epoch": 1.2750543347017629, "grad_norm": 0.3564545810222626, "learning_rate": 7.111180727876811e-06, "loss": 0.3774, "step": 1760 }, { "epoch": 1.2757787973919343, "grad_norm": 0.3618677854537964, "learning_rate": 7.107358440919963e-06, "loss": 0.4013, "step": 1761 }, { "epoch": 1.2765032600821058, "grad_norm": 0.36468759179115295, "learning_rate": 7.103534655823014e-06, "loss": 0.3948, "step": 1762 }, { "epoch": 1.2772277227722773, "grad_norm": 0.3564842641353607, "learning_rate": 7.099709375304324e-06, "loss": 0.3955, "step": 1763 }, { "epoch": 1.2779521854624487, "grad_norm": 0.3903474509716034, "learning_rate": 7.095882602083321e-06, "loss": 0.4105, "step": 1764 }, { "epoch": 1.2786766481526202, "grad_norm": 0.3556039035320282, "learning_rate": 7.092054338880492e-06, "loss": 0.4137, "step": 1765 }, { "epoch": 1.2794011108427916, "grad_norm": 0.3118101954460144, "learning_rate": 7.088224588417383e-06, "loss": 0.3538, "step": 1766 }, { "epoch": 1.280125573532963, "grad_norm": 0.37923744320869446, "learning_rate": 7.084393353416601e-06, "loss": 0.4056, "step": 1767 }, { "epoch": 1.2808500362231345, "grad_norm": 0.34142154455184937, "learning_rate": 7.080560636601802e-06, "loss": 0.4158, "step": 1768 }, { "epoch": 1.281574498913306, "grad_norm": 0.3327343463897705, "learning_rate": 7.0767264406977e-06, "loss": 0.3647, "step": 1769 }, { "epoch": 1.2822989616034774, "grad_norm": 0.36297768354415894, "learning_rate": 7.072890768430061e-06, "loss": 0.3883, "step": 1770 }, { "epoch": 1.283023424293649, "grad_norm": 0.3613491654396057, "learning_rate": 7.069053622525697e-06, "loss": 0.394, "step": 1771 }, { "epoch": 1.2837478869838204, "grad_norm": 0.3829686939716339, "learning_rate": 7.065215005712469e-06, "loss": 0.4131, "step": 1772 }, { "epoch": 1.2844723496739918, "grad_norm": 0.35084885358810425, "learning_rate": 7.061374920719288e-06, "loss": 0.3884, "step": 1773 }, { "epoch": 1.2851968123641633, "grad_norm": 0.3395313620567322, "learning_rate": 7.057533370276102e-06, "loss": 0.4114, "step": 1774 }, { "epoch": 1.2859212750543347, "grad_norm": 0.3776375353336334, "learning_rate": 7.0536903571139035e-06, "loss": 0.3881, "step": 1775 }, { "epoch": 1.2866457377445062, "grad_norm": 0.32561448216438293, "learning_rate": 7.049845883964728e-06, "loss": 0.3778, "step": 1776 }, { "epoch": 1.2873702004346776, "grad_norm": 0.33245283365249634, "learning_rate": 7.045999953561642e-06, "loss": 0.4032, "step": 1777 }, { "epoch": 1.288094663124849, "grad_norm": 0.34954318404197693, "learning_rate": 7.042152568638755e-06, "loss": 0.4048, "step": 1778 }, { "epoch": 1.2888191258150206, "grad_norm": 0.349913626909256, "learning_rate": 7.038303731931209e-06, "loss": 0.4111, "step": 1779 }, { "epoch": 1.289543588505192, "grad_norm": 0.3523792624473572, "learning_rate": 7.0344534461751716e-06, "loss": 0.3733, "step": 1780 }, { "epoch": 1.2902680511953635, "grad_norm": 0.3550451695919037, "learning_rate": 7.0306017141078495e-06, "loss": 0.4174, "step": 1781 }, { "epoch": 1.290992513885535, "grad_norm": 0.3627639710903168, "learning_rate": 7.026748538467474e-06, "loss": 0.4344, "step": 1782 }, { "epoch": 1.2917169765757064, "grad_norm": 0.34394145011901855, "learning_rate": 7.0228939219933e-06, "loss": 0.3889, "step": 1783 }, { "epoch": 1.2924414392658778, "grad_norm": 0.3804479241371155, "learning_rate": 7.019037867425612e-06, "loss": 0.4013, "step": 1784 }, { "epoch": 1.2931659019560493, "grad_norm": 0.3675382435321808, "learning_rate": 7.015180377505711e-06, "loss": 0.3779, "step": 1785 }, { "epoch": 1.2938903646462208, "grad_norm": 0.3292711675167084, "learning_rate": 7.011321454975924e-06, "loss": 0.3891, "step": 1786 }, { "epoch": 1.2946148273363922, "grad_norm": 0.327301561832428, "learning_rate": 7.0074611025795916e-06, "loss": 0.3903, "step": 1787 }, { "epoch": 1.2953392900265637, "grad_norm": 0.35343942046165466, "learning_rate": 7.003599323061073e-06, "loss": 0.4111, "step": 1788 }, { "epoch": 1.2960637527167351, "grad_norm": 0.3505232334136963, "learning_rate": 6.999736119165745e-06, "loss": 0.4208, "step": 1789 }, { "epoch": 1.2967882154069066, "grad_norm": 0.35083651542663574, "learning_rate": 6.995871493639989e-06, "loss": 0.3901, "step": 1790 }, { "epoch": 1.297512678097078, "grad_norm": 0.4116804301738739, "learning_rate": 6.9920054492312086e-06, "loss": 0.443, "step": 1791 }, { "epoch": 1.2982371407872495, "grad_norm": 0.3751932382583618, "learning_rate": 6.9881379886878045e-06, "loss": 0.3901, "step": 1792 }, { "epoch": 1.298961603477421, "grad_norm": 0.35733211040496826, "learning_rate": 6.98426911475919e-06, "loss": 0.3901, "step": 1793 }, { "epoch": 1.2996860661675924, "grad_norm": 0.3729836046695709, "learning_rate": 6.980398830195785e-06, "loss": 0.3939, "step": 1794 }, { "epoch": 1.3004105288577639, "grad_norm": 0.3776799440383911, "learning_rate": 6.976527137749007e-06, "loss": 0.4036, "step": 1795 }, { "epoch": 1.3011349915479353, "grad_norm": 0.3563235402107239, "learning_rate": 6.972654040171278e-06, "loss": 0.4002, "step": 1796 }, { "epoch": 1.3018594542381068, "grad_norm": 0.43606358766555786, "learning_rate": 6.96877954021602e-06, "loss": 0.4302, "step": 1797 }, { "epoch": 1.3025839169282782, "grad_norm": 0.32870590686798096, "learning_rate": 6.964903640637646e-06, "loss": 0.3629, "step": 1798 }, { "epoch": 1.3033083796184497, "grad_norm": 0.39073967933654785, "learning_rate": 6.961026344191569e-06, "loss": 0.422, "step": 1799 }, { "epoch": 1.3040328423086212, "grad_norm": 0.3995259404182434, "learning_rate": 6.957147653634198e-06, "loss": 0.4253, "step": 1800 }, { "epoch": 1.3047573049987926, "grad_norm": 0.3472627103328705, "learning_rate": 6.9532675717229225e-06, "loss": 0.363, "step": 1801 }, { "epoch": 1.305481767688964, "grad_norm": 0.3915518522262573, "learning_rate": 6.949386101216133e-06, "loss": 0.4276, "step": 1802 }, { "epoch": 1.3062062303791355, "grad_norm": 0.3753298819065094, "learning_rate": 6.9455032448732e-06, "loss": 0.3972, "step": 1803 }, { "epoch": 1.306930693069307, "grad_norm": 0.32419970631599426, "learning_rate": 6.94161900545448e-06, "loss": 0.3632, "step": 1804 }, { "epoch": 1.3076551557594784, "grad_norm": 0.33590278029441833, "learning_rate": 6.9377333857213154e-06, "loss": 0.3543, "step": 1805 }, { "epoch": 1.30837961844965, "grad_norm": 0.4465208351612091, "learning_rate": 6.9338463884360286e-06, "loss": 0.4237, "step": 1806 }, { "epoch": 1.3091040811398214, "grad_norm": 0.3980885148048401, "learning_rate": 6.9299580163619184e-06, "loss": 0.3892, "step": 1807 }, { "epoch": 1.3098285438299928, "grad_norm": 0.4113253057003021, "learning_rate": 6.926068272263267e-06, "loss": 0.3978, "step": 1808 }, { "epoch": 1.3105530065201643, "grad_norm": 0.3607718348503113, "learning_rate": 6.922177158905326e-06, "loss": 0.3864, "step": 1809 }, { "epoch": 1.3112774692103357, "grad_norm": 0.37946319580078125, "learning_rate": 6.918284679054323e-06, "loss": 0.4103, "step": 1810 }, { "epoch": 1.3120019319005072, "grad_norm": 0.38259533047676086, "learning_rate": 6.914390835477458e-06, "loss": 0.3867, "step": 1811 }, { "epoch": 1.3127263945906786, "grad_norm": 0.397155225276947, "learning_rate": 6.910495630942899e-06, "loss": 0.394, "step": 1812 }, { "epoch": 1.31345085728085, "grad_norm": 0.4313705861568451, "learning_rate": 6.9065990682197835e-06, "loss": 0.4217, "step": 1813 }, { "epoch": 1.3141753199710215, "grad_norm": 0.34366458654403687, "learning_rate": 6.902701150078212e-06, "loss": 0.416, "step": 1814 }, { "epoch": 1.314899782661193, "grad_norm": 0.4003240466117859, "learning_rate": 6.898801879289248e-06, "loss": 0.4449, "step": 1815 }, { "epoch": 1.3156242453513645, "grad_norm": 0.38687050342559814, "learning_rate": 6.89490125862492e-06, "loss": 0.3901, "step": 1816 }, { "epoch": 1.316348708041536, "grad_norm": 0.31964725255966187, "learning_rate": 6.890999290858213e-06, "loss": 0.3846, "step": 1817 }, { "epoch": 1.3170731707317074, "grad_norm": 0.3854660093784332, "learning_rate": 6.887095978763072e-06, "loss": 0.4572, "step": 1818 }, { "epoch": 1.3177976334218788, "grad_norm": 0.35480383038520813, "learning_rate": 6.883191325114399e-06, "loss": 0.4089, "step": 1819 }, { "epoch": 1.3185220961120503, "grad_norm": 0.34717145562171936, "learning_rate": 6.879285332688044e-06, "loss": 0.3696, "step": 1820 }, { "epoch": 1.3192465588022217, "grad_norm": 0.33839693665504456, "learning_rate": 6.875378004260813e-06, "loss": 0.4318, "step": 1821 }, { "epoch": 1.3199710214923932, "grad_norm": 0.32896706461906433, "learning_rate": 6.871469342610462e-06, "loss": 0.4142, "step": 1822 }, { "epoch": 1.3206954841825647, "grad_norm": 0.35834094882011414, "learning_rate": 6.8675593505156925e-06, "loss": 0.4057, "step": 1823 }, { "epoch": 1.3214199468727361, "grad_norm": 0.31520041823387146, "learning_rate": 6.863648030756154e-06, "loss": 0.3352, "step": 1824 }, { "epoch": 1.3221444095629076, "grad_norm": 0.411811888217926, "learning_rate": 6.859735386112439e-06, "loss": 0.4413, "step": 1825 }, { "epoch": 1.322868872253079, "grad_norm": 0.3716681897640228, "learning_rate": 6.855821419366082e-06, "loss": 0.4079, "step": 1826 }, { "epoch": 1.3235933349432505, "grad_norm": 0.37269532680511475, "learning_rate": 6.851906133299556e-06, "loss": 0.393, "step": 1827 }, { "epoch": 1.324317797633422, "grad_norm": 0.40876877307891846, "learning_rate": 6.847989530696275e-06, "loss": 0.4242, "step": 1828 }, { "epoch": 1.3250422603235934, "grad_norm": 0.31705015897750854, "learning_rate": 6.844071614340585e-06, "loss": 0.3843, "step": 1829 }, { "epoch": 1.3257667230137649, "grad_norm": 0.35695794224739075, "learning_rate": 6.84015238701777e-06, "loss": 0.4105, "step": 1830 }, { "epoch": 1.3264911857039363, "grad_norm": 0.3605729043483734, "learning_rate": 6.8362318515140434e-06, "loss": 0.3937, "step": 1831 }, { "epoch": 1.3272156483941078, "grad_norm": 0.34637466073036194, "learning_rate": 6.832310010616547e-06, "loss": 0.3877, "step": 1832 }, { "epoch": 1.3279401110842792, "grad_norm": 0.36229610443115234, "learning_rate": 6.828386867113357e-06, "loss": 0.4167, "step": 1833 }, { "epoch": 1.3286645737744507, "grad_norm": 0.4011974036693573, "learning_rate": 6.824462423793467e-06, "loss": 0.3908, "step": 1834 }, { "epoch": 1.3293890364646221, "grad_norm": 0.36489036679267883, "learning_rate": 6.8205366834468035e-06, "loss": 0.4172, "step": 1835 }, { "epoch": 1.3301134991547936, "grad_norm": 0.36478742957115173, "learning_rate": 6.816609648864208e-06, "loss": 0.431, "step": 1836 }, { "epoch": 1.330837961844965, "grad_norm": 0.3743828237056732, "learning_rate": 6.812681322837444e-06, "loss": 0.366, "step": 1837 }, { "epoch": 1.3315624245351365, "grad_norm": 0.38665053248405457, "learning_rate": 6.808751708159196e-06, "loss": 0.3893, "step": 1838 }, { "epoch": 1.332286887225308, "grad_norm": 0.326610803604126, "learning_rate": 6.804820807623061e-06, "loss": 0.3878, "step": 1839 }, { "epoch": 1.3330113499154794, "grad_norm": 0.3377801477909088, "learning_rate": 6.800888624023552e-06, "loss": 0.3786, "step": 1840 }, { "epoch": 1.3337358126056509, "grad_norm": 0.4888227581977844, "learning_rate": 6.796955160156096e-06, "loss": 0.4255, "step": 1841 }, { "epoch": 1.3344602752958223, "grad_norm": 0.3686349093914032, "learning_rate": 6.7930204188170244e-06, "loss": 0.3965, "step": 1842 }, { "epoch": 1.3351847379859938, "grad_norm": 0.356297105550766, "learning_rate": 6.789084402803582e-06, "loss": 0.3631, "step": 1843 }, { "epoch": 1.3359092006761653, "grad_norm": 0.37233585119247437, "learning_rate": 6.785147114913918e-06, "loss": 0.3973, "step": 1844 }, { "epoch": 1.3366336633663367, "grad_norm": 0.43861451745033264, "learning_rate": 6.781208557947085e-06, "loss": 0.4048, "step": 1845 }, { "epoch": 1.3373581260565082, "grad_norm": 0.3038392663002014, "learning_rate": 6.77726873470304e-06, "loss": 0.3784, "step": 1846 }, { "epoch": 1.3380825887466796, "grad_norm": 0.37306591868400574, "learning_rate": 6.773327647982638e-06, "loss": 0.4645, "step": 1847 }, { "epoch": 1.338807051436851, "grad_norm": 0.3516532778739929, "learning_rate": 6.769385300587632e-06, "loss": 0.3591, "step": 1848 }, { "epoch": 1.3395315141270225, "grad_norm": 0.32636797428131104, "learning_rate": 6.7654416953206755e-06, "loss": 0.3533, "step": 1849 }, { "epoch": 1.340255976817194, "grad_norm": 0.32887259125709534, "learning_rate": 6.761496834985309e-06, "loss": 0.4007, "step": 1850 }, { "epoch": 1.3409804395073655, "grad_norm": 0.28791284561157227, "learning_rate": 6.757550722385973e-06, "loss": 0.3357, "step": 1851 }, { "epoch": 1.341704902197537, "grad_norm": 0.39926621317863464, "learning_rate": 6.753603360327992e-06, "loss": 0.4508, "step": 1852 }, { "epoch": 1.3424293648877084, "grad_norm": 0.34025344252586365, "learning_rate": 6.749654751617582e-06, "loss": 0.3786, "step": 1853 }, { "epoch": 1.3431538275778798, "grad_norm": 0.349732369184494, "learning_rate": 6.745704899061843e-06, "loss": 0.4035, "step": 1854 }, { "epoch": 1.343878290268051, "grad_norm": 0.38043779134750366, "learning_rate": 6.7417538054687646e-06, "loss": 0.4069, "step": 1855 }, { "epoch": 1.3446027529582225, "grad_norm": 0.33141767978668213, "learning_rate": 6.737801473647211e-06, "loss": 0.3858, "step": 1856 }, { "epoch": 1.345327215648394, "grad_norm": 0.39217615127563477, "learning_rate": 6.733847906406933e-06, "loss": 0.3943, "step": 1857 }, { "epoch": 1.3460516783385654, "grad_norm": 0.3594192564487457, "learning_rate": 6.729893106558556e-06, "loss": 0.4169, "step": 1858 }, { "epoch": 1.3467761410287369, "grad_norm": 0.326779305934906, "learning_rate": 6.725937076913582e-06, "loss": 0.3898, "step": 1859 }, { "epoch": 1.3475006037189083, "grad_norm": 0.3959048390388489, "learning_rate": 6.72197982028439e-06, "loss": 0.4422, "step": 1860 }, { "epoch": 1.3482250664090798, "grad_norm": 0.34770461916923523, "learning_rate": 6.718021339484229e-06, "loss": 0.3975, "step": 1861 }, { "epoch": 1.3489495290992513, "grad_norm": 0.3263068199157715, "learning_rate": 6.714061637327217e-06, "loss": 0.364, "step": 1862 }, { "epoch": 1.3496739917894227, "grad_norm": 0.41291284561157227, "learning_rate": 6.710100716628345e-06, "loss": 0.4103, "step": 1863 }, { "epoch": 1.3503984544795942, "grad_norm": 0.37005823850631714, "learning_rate": 6.706138580203463e-06, "loss": 0.43, "step": 1864 }, { "epoch": 1.3511229171697656, "grad_norm": 0.3380392789840698, "learning_rate": 6.702175230869293e-06, "loss": 0.3925, "step": 1865 }, { "epoch": 1.351847379859937, "grad_norm": 0.3377431631088257, "learning_rate": 6.698210671443416e-06, "loss": 0.3781, "step": 1866 }, { "epoch": 1.3525718425501085, "grad_norm": 0.3435581922531128, "learning_rate": 6.694244904744269e-06, "loss": 0.3999, "step": 1867 }, { "epoch": 1.35329630524028, "grad_norm": 0.31992095708847046, "learning_rate": 6.690277933591156e-06, "loss": 0.409, "step": 1868 }, { "epoch": 1.3540207679304515, "grad_norm": 0.3660760521888733, "learning_rate": 6.6863097608042295e-06, "loss": 0.3972, "step": 1869 }, { "epoch": 1.354745230620623, "grad_norm": 0.3640010952949524, "learning_rate": 6.6823403892045e-06, "loss": 0.4235, "step": 1870 }, { "epoch": 1.3554696933107944, "grad_norm": 0.32384663820266724, "learning_rate": 6.67836982161383e-06, "loss": 0.3516, "step": 1871 }, { "epoch": 1.3561941560009658, "grad_norm": 0.3906276822090149, "learning_rate": 6.674398060854931e-06, "loss": 0.4123, "step": 1872 }, { "epoch": 1.3569186186911373, "grad_norm": 0.34979870915412903, "learning_rate": 6.670425109751365e-06, "loss": 0.3582, "step": 1873 }, { "epoch": 1.3576430813813087, "grad_norm": 0.3692592978477478, "learning_rate": 6.6664509711275375e-06, "loss": 0.4231, "step": 1874 }, { "epoch": 1.3583675440714802, "grad_norm": 0.34032323956489563, "learning_rate": 6.662475647808699e-06, "loss": 0.375, "step": 1875 }, { "epoch": 1.3590920067616516, "grad_norm": 0.40236467123031616, "learning_rate": 6.658499142620944e-06, "loss": 0.3872, "step": 1876 }, { "epoch": 1.359816469451823, "grad_norm": 0.36616480350494385, "learning_rate": 6.654521458391206e-06, "loss": 0.391, "step": 1877 }, { "epoch": 1.3605409321419946, "grad_norm": 0.3356474041938782, "learning_rate": 6.650542597947253e-06, "loss": 0.4115, "step": 1878 }, { "epoch": 1.361265394832166, "grad_norm": 0.3745135962963104, "learning_rate": 6.646562564117699e-06, "loss": 0.418, "step": 1879 }, { "epoch": 1.3619898575223375, "grad_norm": 0.3773886561393738, "learning_rate": 6.642581359731981e-06, "loss": 0.4057, "step": 1880 }, { "epoch": 1.362714320212509, "grad_norm": 0.3248441517353058, "learning_rate": 6.638598987620375e-06, "loss": 0.3471, "step": 1881 }, { "epoch": 1.3634387829026804, "grad_norm": 0.3664219379425049, "learning_rate": 6.6346154506139844e-06, "loss": 0.4198, "step": 1882 }, { "epoch": 1.3641632455928518, "grad_norm": 0.36061933636665344, "learning_rate": 6.630630751544742e-06, "loss": 0.373, "step": 1883 }, { "epoch": 1.3648877082830233, "grad_norm": 0.32997649908065796, "learning_rate": 6.626644893245406e-06, "loss": 0.4065, "step": 1884 }, { "epoch": 1.3656121709731948, "grad_norm": 0.3817669451236725, "learning_rate": 6.622657878549558e-06, "loss": 0.4315, "step": 1885 }, { "epoch": 1.3663366336633662, "grad_norm": 0.3010866343975067, "learning_rate": 6.618669710291607e-06, "loss": 0.3592, "step": 1886 }, { "epoch": 1.3670610963535377, "grad_norm": 0.3789379894733429, "learning_rate": 6.614680391306772e-06, "loss": 0.3948, "step": 1887 }, { "epoch": 1.3677855590437091, "grad_norm": 0.34030625224113464, "learning_rate": 6.610689924431099e-06, "loss": 0.4266, "step": 1888 }, { "epoch": 1.3685100217338806, "grad_norm": 0.3343445658683777, "learning_rate": 6.6066983125014465e-06, "loss": 0.3974, "step": 1889 }, { "epoch": 1.369234484424052, "grad_norm": 0.400250107049942, "learning_rate": 6.6027055583554865e-06, "loss": 0.3942, "step": 1890 }, { "epoch": 1.3699589471142235, "grad_norm": 0.3380352258682251, "learning_rate": 6.598711664831704e-06, "loss": 0.4198, "step": 1891 }, { "epoch": 1.370683409804395, "grad_norm": 0.3147457242012024, "learning_rate": 6.594716634769396e-06, "loss": 0.3798, "step": 1892 }, { "epoch": 1.3714078724945664, "grad_norm": 0.3697604238986969, "learning_rate": 6.590720471008662e-06, "loss": 0.3981, "step": 1893 }, { "epoch": 1.3721323351847379, "grad_norm": 0.3231216073036194, "learning_rate": 6.586723176390414e-06, "loss": 0.3811, "step": 1894 }, { "epoch": 1.3728567978749093, "grad_norm": 0.3590335547924042, "learning_rate": 6.582724753756363e-06, "loss": 0.4157, "step": 1895 }, { "epoch": 1.3735812605650808, "grad_norm": 0.3670969605445862, "learning_rate": 6.578725205949023e-06, "loss": 0.3786, "step": 1896 }, { "epoch": 1.3743057232552522, "grad_norm": 0.3704526424407959, "learning_rate": 6.574724535811712e-06, "loss": 0.4286, "step": 1897 }, { "epoch": 1.3750301859454237, "grad_norm": 0.36687150597572327, "learning_rate": 6.570722746188538e-06, "loss": 0.3849, "step": 1898 }, { "epoch": 1.3757546486355952, "grad_norm": 0.3851530849933624, "learning_rate": 6.566719839924412e-06, "loss": 0.3901, "step": 1899 }, { "epoch": 1.3764791113257666, "grad_norm": 0.3520437479019165, "learning_rate": 6.5627158198650355e-06, "loss": 0.4302, "step": 1900 }, { "epoch": 1.377203574015938, "grad_norm": 0.34302979707717896, "learning_rate": 6.558710688856903e-06, "loss": 0.3801, "step": 1901 }, { "epoch": 1.3779280367061095, "grad_norm": 0.3551795780658722, "learning_rate": 6.554704449747295e-06, "loss": 0.3918, "step": 1902 }, { "epoch": 1.378652499396281, "grad_norm": 0.3456220328807831, "learning_rate": 6.550697105384289e-06, "loss": 0.3887, "step": 1903 }, { "epoch": 1.3793769620864524, "grad_norm": 0.36341285705566406, "learning_rate": 6.546688658616738e-06, "loss": 0.4127, "step": 1904 }, { "epoch": 1.380101424776624, "grad_norm": 0.35196152329444885, "learning_rate": 6.542679112294283e-06, "loss": 0.4425, "step": 1905 }, { "epoch": 1.3808258874667954, "grad_norm": 0.3322828710079193, "learning_rate": 6.538668469267348e-06, "loss": 0.375, "step": 1906 }, { "epoch": 1.3815503501569668, "grad_norm": 0.446066677570343, "learning_rate": 6.5346567323871335e-06, "loss": 0.4271, "step": 1907 }, { "epoch": 1.3822748128471383, "grad_norm": 0.32966524362564087, "learning_rate": 6.530643904505622e-06, "loss": 0.3757, "step": 1908 }, { "epoch": 1.3829992755373097, "grad_norm": 0.34854528307914734, "learning_rate": 6.526629988475567e-06, "loss": 0.3886, "step": 1909 }, { "epoch": 1.3837237382274812, "grad_norm": 0.3637988269329071, "learning_rate": 6.522614987150498e-06, "loss": 0.4185, "step": 1910 }, { "epoch": 1.3844482009176526, "grad_norm": 0.34850528836250305, "learning_rate": 6.518598903384714e-06, "loss": 0.4133, "step": 1911 }, { "epoch": 1.385172663607824, "grad_norm": 0.36712315678596497, "learning_rate": 6.51458174003329e-06, "loss": 0.4005, "step": 1912 }, { "epoch": 1.3858971262979956, "grad_norm": 0.3629966676235199, "learning_rate": 6.510563499952058e-06, "loss": 0.4445, "step": 1913 }, { "epoch": 1.386621588988167, "grad_norm": 0.3132498562335968, "learning_rate": 6.506544185997625e-06, "loss": 0.3623, "step": 1914 }, { "epoch": 1.3873460516783385, "grad_norm": 0.35782158374786377, "learning_rate": 6.502523801027356e-06, "loss": 0.3935, "step": 1915 }, { "epoch": 1.38807051436851, "grad_norm": 0.3238376975059509, "learning_rate": 6.498502347899377e-06, "loss": 0.3722, "step": 1916 }, { "epoch": 1.3887949770586814, "grad_norm": 0.2926456928253174, "learning_rate": 6.49447982947258e-06, "loss": 0.3706, "step": 1917 }, { "epoch": 1.3895194397488528, "grad_norm": 0.35916775465011597, "learning_rate": 6.490456248606605e-06, "loss": 0.4291, "step": 1918 }, { "epoch": 1.3902439024390243, "grad_norm": 0.32045552134513855, "learning_rate": 6.486431608161854e-06, "loss": 0.3744, "step": 1919 }, { "epoch": 1.3909683651291957, "grad_norm": 0.35243120789527893, "learning_rate": 6.482405910999481e-06, "loss": 0.4043, "step": 1920 }, { "epoch": 1.3916928278193672, "grad_norm": 0.35597696900367737, "learning_rate": 6.478379159981388e-06, "loss": 0.4038, "step": 1921 }, { "epoch": 1.3924172905095387, "grad_norm": 0.3161196708679199, "learning_rate": 6.474351357970229e-06, "loss": 0.3569, "step": 1922 }, { "epoch": 1.3931417531997101, "grad_norm": 0.37421539425849915, "learning_rate": 6.470322507829407e-06, "loss": 0.3956, "step": 1923 }, { "epoch": 1.3938662158898816, "grad_norm": 0.34072592854499817, "learning_rate": 6.466292612423066e-06, "loss": 0.377, "step": 1924 }, { "epoch": 1.394590678580053, "grad_norm": 0.368962824344635, "learning_rate": 6.462261674616095e-06, "loss": 0.4442, "step": 1925 }, { "epoch": 1.3953151412702245, "grad_norm": 0.35559535026550293, "learning_rate": 6.458229697274125e-06, "loss": 0.3618, "step": 1926 }, { "epoch": 1.396039603960396, "grad_norm": 0.3748978078365326, "learning_rate": 6.454196683263524e-06, "loss": 0.4028, "step": 1927 }, { "epoch": 1.3967640666505674, "grad_norm": 0.34395116567611694, "learning_rate": 6.4501626354514006e-06, "loss": 0.4287, "step": 1928 }, { "epoch": 1.3974885293407389, "grad_norm": 0.3501701354980469, "learning_rate": 6.446127556705591e-06, "loss": 0.3851, "step": 1929 }, { "epoch": 1.3982129920309103, "grad_norm": 0.3390663266181946, "learning_rate": 6.442091449894672e-06, "loss": 0.3919, "step": 1930 }, { "epoch": 1.3989374547210818, "grad_norm": 0.4124451279640198, "learning_rate": 6.438054317887948e-06, "loss": 0.3993, "step": 1931 }, { "epoch": 1.3996619174112532, "grad_norm": 0.3701508939266205, "learning_rate": 6.434016163555452e-06, "loss": 0.4369, "step": 1932 }, { "epoch": 1.4003863801014247, "grad_norm": 0.32033681869506836, "learning_rate": 6.429976989767945e-06, "loss": 0.3646, "step": 1933 }, { "epoch": 1.4011108427915961, "grad_norm": 0.3507528305053711, "learning_rate": 6.425936799396913e-06, "loss": 0.4077, "step": 1934 }, { "epoch": 1.4018353054817676, "grad_norm": 0.37821856141090393, "learning_rate": 6.42189559531456e-06, "loss": 0.401, "step": 1935 }, { "epoch": 1.402559768171939, "grad_norm": 0.3713609576225281, "learning_rate": 6.417853380393819e-06, "loss": 0.3904, "step": 1936 }, { "epoch": 1.4032842308621105, "grad_norm": 0.32174789905548096, "learning_rate": 6.413810157508333e-06, "loss": 0.376, "step": 1937 }, { "epoch": 1.404008693552282, "grad_norm": 0.35810860991477966, "learning_rate": 6.40976592953247e-06, "loss": 0.4648, "step": 1938 }, { "epoch": 1.4047331562424534, "grad_norm": 0.32594063878059387, "learning_rate": 6.405720699341304e-06, "loss": 0.3565, "step": 1939 }, { "epoch": 1.4054576189326249, "grad_norm": 0.3331037163734436, "learning_rate": 6.401674469810626e-06, "loss": 0.3776, "step": 1940 }, { "epoch": 1.4061820816227963, "grad_norm": 0.3421030044555664, "learning_rate": 6.39762724381694e-06, "loss": 0.3953, "step": 1941 }, { "epoch": 1.4069065443129678, "grad_norm": 0.3331269919872284, "learning_rate": 6.3935790242374515e-06, "loss": 0.4068, "step": 1942 }, { "epoch": 1.4076310070031393, "grad_norm": 0.3209146559238434, "learning_rate": 6.3895298139500805e-06, "loss": 0.3772, "step": 1943 }, { "epoch": 1.4083554696933107, "grad_norm": 0.33271217346191406, "learning_rate": 6.385479615833445e-06, "loss": 0.4079, "step": 1944 }, { "epoch": 1.4090799323834822, "grad_norm": 0.33668678998947144, "learning_rate": 6.381428432766865e-06, "loss": 0.3949, "step": 1945 }, { "epoch": 1.4098043950736536, "grad_norm": 0.330890953540802, "learning_rate": 6.377376267630368e-06, "loss": 0.4192, "step": 1946 }, { "epoch": 1.410528857763825, "grad_norm": 0.3558255732059479, "learning_rate": 6.373323123304671e-06, "loss": 0.4024, "step": 1947 }, { "epoch": 1.4112533204539965, "grad_norm": 0.3190561830997467, "learning_rate": 6.369269002671192e-06, "loss": 0.3649, "step": 1948 }, { "epoch": 1.411977783144168, "grad_norm": 0.34717023372650146, "learning_rate": 6.365213908612041e-06, "loss": 0.4519, "step": 1949 }, { "epoch": 1.4127022458343395, "grad_norm": 0.3380286991596222, "learning_rate": 6.361157844010023e-06, "loss": 0.3726, "step": 1950 }, { "epoch": 1.413426708524511, "grad_norm": 0.3630762994289398, "learning_rate": 6.357100811748627e-06, "loss": 0.4019, "step": 1951 }, { "epoch": 1.4141511712146824, "grad_norm": 0.397933691740036, "learning_rate": 6.353042814712039e-06, "loss": 0.4172, "step": 1952 }, { "epoch": 1.4148756339048538, "grad_norm": 0.36725619435310364, "learning_rate": 6.348983855785122e-06, "loss": 0.4236, "step": 1953 }, { "epoch": 1.4156000965950253, "grad_norm": 0.35106343030929565, "learning_rate": 6.344923937853426e-06, "loss": 0.3794, "step": 1954 }, { "epoch": 1.4163245592851967, "grad_norm": 0.3885941207408905, "learning_rate": 6.340863063803187e-06, "loss": 0.4787, "step": 1955 }, { "epoch": 1.4170490219753682, "grad_norm": 0.3698010742664337, "learning_rate": 6.336801236521314e-06, "loss": 0.3714, "step": 1956 }, { "epoch": 1.4177734846655397, "grad_norm": 0.37478843331336975, "learning_rate": 6.332738458895398e-06, "loss": 0.3881, "step": 1957 }, { "epoch": 1.418497947355711, "grad_norm": 0.3621782064437866, "learning_rate": 6.328674733813702e-06, "loss": 0.4048, "step": 1958 }, { "epoch": 1.4192224100458826, "grad_norm": 0.4411458969116211, "learning_rate": 6.324610064165165e-06, "loss": 0.4295, "step": 1959 }, { "epoch": 1.419946872736054, "grad_norm": 0.3847411572933197, "learning_rate": 6.320544452839401e-06, "loss": 0.3918, "step": 1960 }, { "epoch": 1.4206713354262255, "grad_norm": 0.35082775354385376, "learning_rate": 6.3164779027266875e-06, "loss": 0.3938, "step": 1961 }, { "epoch": 1.421395798116397, "grad_norm": 0.3675915598869324, "learning_rate": 6.312410416717969e-06, "loss": 0.3844, "step": 1962 }, { "epoch": 1.4221202608065684, "grad_norm": 0.4240942895412445, "learning_rate": 6.308341997704862e-06, "loss": 0.3918, "step": 1963 }, { "epoch": 1.4228447234967398, "grad_norm": 0.3729610741138458, "learning_rate": 6.304272648579639e-06, "loss": 0.3996, "step": 1964 }, { "epoch": 1.4235691861869113, "grad_norm": 0.33049827814102173, "learning_rate": 6.3002023722352405e-06, "loss": 0.3703, "step": 1965 }, { "epoch": 1.4242936488770828, "grad_norm": 0.3981064558029175, "learning_rate": 6.296131171565259e-06, "loss": 0.4068, "step": 1966 }, { "epoch": 1.4250181115672542, "grad_norm": 0.37246641516685486, "learning_rate": 6.29205904946395e-06, "loss": 0.4057, "step": 1967 }, { "epoch": 1.4257425742574257, "grad_norm": 0.3399188816547394, "learning_rate": 6.287986008826221e-06, "loss": 0.382, "step": 1968 }, { "epoch": 1.4264670369475971, "grad_norm": 0.3175429701805115, "learning_rate": 6.283912052547634e-06, "loss": 0.3686, "step": 1969 }, { "epoch": 1.4271914996377686, "grad_norm": 0.3361561596393585, "learning_rate": 6.2798371835244e-06, "loss": 0.3798, "step": 1970 }, { "epoch": 1.42791596232794, "grad_norm": 0.32722821831703186, "learning_rate": 6.275761404653381e-06, "loss": 0.3909, "step": 1971 }, { "epoch": 1.4286404250181115, "grad_norm": 0.3783937394618988, "learning_rate": 6.271684718832085e-06, "loss": 0.4099, "step": 1972 }, { "epoch": 1.429364887708283, "grad_norm": 0.31288695335388184, "learning_rate": 6.267607128958664e-06, "loss": 0.4319, "step": 1973 }, { "epoch": 1.4300893503984544, "grad_norm": 0.32740938663482666, "learning_rate": 6.263528637931914e-06, "loss": 0.3802, "step": 1974 }, { "epoch": 1.4308138130886259, "grad_norm": 0.347461998462677, "learning_rate": 6.259449248651271e-06, "loss": 0.4104, "step": 1975 }, { "epoch": 1.4315382757787973, "grad_norm": 0.3281102478504181, "learning_rate": 6.255368964016809e-06, "loss": 0.4054, "step": 1976 }, { "epoch": 1.4322627384689688, "grad_norm": 0.32870015501976013, "learning_rate": 6.25128778692924e-06, "loss": 0.3476, "step": 1977 }, { "epoch": 1.4329872011591402, "grad_norm": 0.3125225007534027, "learning_rate": 6.247205720289907e-06, "loss": 0.3929, "step": 1978 }, { "epoch": 1.4337116638493117, "grad_norm": 0.34737899899482727, "learning_rate": 6.2431227670007925e-06, "loss": 0.3961, "step": 1979 }, { "epoch": 1.4344361265394832, "grad_norm": 0.3705986440181732, "learning_rate": 6.2390389299645e-06, "loss": 0.4103, "step": 1980 }, { "epoch": 1.4351605892296546, "grad_norm": 0.3473495841026306, "learning_rate": 6.2349542120842696e-06, "loss": 0.3938, "step": 1981 }, { "epoch": 1.435885051919826, "grad_norm": 0.3476700186729431, "learning_rate": 6.230868616263963e-06, "loss": 0.3948, "step": 1982 }, { "epoch": 1.4366095146099975, "grad_norm": 0.31773924827575684, "learning_rate": 6.226782145408066e-06, "loss": 0.3625, "step": 1983 }, { "epoch": 1.437333977300169, "grad_norm": 0.35239243507385254, "learning_rate": 6.2226948024216896e-06, "loss": 0.4353, "step": 1984 }, { "epoch": 1.4380584399903404, "grad_norm": 0.3079184889793396, "learning_rate": 6.218606590210561e-06, "loss": 0.3773, "step": 1985 }, { "epoch": 1.438782902680512, "grad_norm": 0.360155314207077, "learning_rate": 6.214517511681027e-06, "loss": 0.372, "step": 1986 }, { "epoch": 1.4395073653706834, "grad_norm": 0.3300393223762512, "learning_rate": 6.210427569740052e-06, "loss": 0.4144, "step": 1987 }, { "epoch": 1.4402318280608548, "grad_norm": 0.3184061646461487, "learning_rate": 6.2063367672952116e-06, "loss": 0.3815, "step": 1988 }, { "epoch": 1.4409562907510263, "grad_norm": 0.35880953073501587, "learning_rate": 6.2022451072546926e-06, "loss": 0.4272, "step": 1989 }, { "epoch": 1.4416807534411977, "grad_norm": 0.35892951488494873, "learning_rate": 6.198152592527296e-06, "loss": 0.3773, "step": 1990 }, { "epoch": 1.4424052161313692, "grad_norm": 0.35593217611312866, "learning_rate": 6.194059226022424e-06, "loss": 0.4247, "step": 1991 }, { "epoch": 1.4431296788215406, "grad_norm": 0.35565048456192017, "learning_rate": 6.189965010650092e-06, "loss": 0.3776, "step": 1992 }, { "epoch": 1.443854141511712, "grad_norm": 0.379966139793396, "learning_rate": 6.185869949320912e-06, "loss": 0.416, "step": 1993 }, { "epoch": 1.4445786042018836, "grad_norm": 0.3446526527404785, "learning_rate": 6.181774044946099e-06, "loss": 0.4111, "step": 1994 }, { "epoch": 1.445303066892055, "grad_norm": 0.3548848032951355, "learning_rate": 6.17767730043747e-06, "loss": 0.4183, "step": 1995 }, { "epoch": 1.4460275295822265, "grad_norm": 0.32013964653015137, "learning_rate": 6.173579718707438e-06, "loss": 0.3615, "step": 1996 }, { "epoch": 1.446751992272398, "grad_norm": 0.3765902519226074, "learning_rate": 6.169481302669007e-06, "loss": 0.4281, "step": 1997 }, { "epoch": 1.4474764549625694, "grad_norm": 0.38630038499832153, "learning_rate": 6.165382055235784e-06, "loss": 0.441, "step": 1998 }, { "epoch": 1.4482009176527408, "grad_norm": 0.30369439721107483, "learning_rate": 6.161281979321957e-06, "loss": 0.3658, "step": 1999 }, { "epoch": 1.4489253803429123, "grad_norm": 0.3297085165977478, "learning_rate": 6.157181077842306e-06, "loss": 0.3916, "step": 2000 }, { "epoch": 1.4496498430330838, "grad_norm": 0.318837434053421, "learning_rate": 6.153079353712201e-06, "loss": 0.3624, "step": 2001 }, { "epoch": 1.4503743057232552, "grad_norm": 0.3249877691268921, "learning_rate": 6.1489768098475945e-06, "loss": 0.3781, "step": 2002 }, { "epoch": 1.4510987684134267, "grad_norm": 0.36900946497917175, "learning_rate": 6.144873449165022e-06, "loss": 0.4108, "step": 2003 }, { "epoch": 1.4518232311035981, "grad_norm": 0.3326786458492279, "learning_rate": 6.1407692745816e-06, "loss": 0.4013, "step": 2004 }, { "epoch": 1.4525476937937696, "grad_norm": 0.3416687548160553, "learning_rate": 6.13666428901502e-06, "loss": 0.3886, "step": 2005 }, { "epoch": 1.453272156483941, "grad_norm": 0.37604692578315735, "learning_rate": 6.132558495383556e-06, "loss": 0.4028, "step": 2006 }, { "epoch": 1.4539966191741125, "grad_norm": 0.36056238412857056, "learning_rate": 6.128451896606054e-06, "loss": 0.375, "step": 2007 }, { "epoch": 1.454721081864284, "grad_norm": 0.3870079219341278, "learning_rate": 6.1243444956019305e-06, "loss": 0.4021, "step": 2008 }, { "epoch": 1.4554455445544554, "grad_norm": 0.36670684814453125, "learning_rate": 6.1202362952911755e-06, "loss": 0.3867, "step": 2009 }, { "epoch": 1.4561700072446269, "grad_norm": 0.3518906831741333, "learning_rate": 6.116127298594345e-06, "loss": 0.4026, "step": 2010 }, { "epoch": 1.4568944699347983, "grad_norm": 0.3791253864765167, "learning_rate": 6.11201750843256e-06, "loss": 0.3784, "step": 2011 }, { "epoch": 1.4576189326249698, "grad_norm": 0.3299693167209625, "learning_rate": 6.10790692772751e-06, "loss": 0.3795, "step": 2012 }, { "epoch": 1.4583433953151412, "grad_norm": 0.4249092638492584, "learning_rate": 6.10379555940144e-06, "loss": 0.4331, "step": 2013 }, { "epoch": 1.4590678580053127, "grad_norm": 0.3420979380607605, "learning_rate": 6.099683406377165e-06, "loss": 0.3499, "step": 2014 }, { "epoch": 1.4597923206954841, "grad_norm": 0.3378066420555115, "learning_rate": 6.0955704715780465e-06, "loss": 0.3746, "step": 2015 }, { "epoch": 1.4605167833856556, "grad_norm": 0.3587631285190582, "learning_rate": 6.091456757928008e-06, "loss": 0.4064, "step": 2016 }, { "epoch": 1.461241246075827, "grad_norm": 0.36664527654647827, "learning_rate": 6.0873422683515286e-06, "loss": 0.3998, "step": 2017 }, { "epoch": 1.4619657087659985, "grad_norm": 0.35142782330513, "learning_rate": 6.083227005773631e-06, "loss": 0.3843, "step": 2018 }, { "epoch": 1.46269017145617, "grad_norm": 0.3492193818092346, "learning_rate": 6.079110973119896e-06, "loss": 0.4436, "step": 2019 }, { "epoch": 1.4634146341463414, "grad_norm": 0.36464571952819824, "learning_rate": 6.0749941733164475e-06, "loss": 0.4104, "step": 2020 }, { "epoch": 1.4641390968365129, "grad_norm": 0.32284310460090637, "learning_rate": 6.070876609289955e-06, "loss": 0.3818, "step": 2021 }, { "epoch": 1.4648635595266843, "grad_norm": 0.3667973279953003, "learning_rate": 6.066758283967633e-06, "loss": 0.4116, "step": 2022 }, { "epoch": 1.4655880222168558, "grad_norm": 0.33755791187286377, "learning_rate": 6.062639200277236e-06, "loss": 0.4196, "step": 2023 }, { "epoch": 1.4663124849070273, "grad_norm": 0.35973161458969116, "learning_rate": 6.058519361147055e-06, "loss": 0.3675, "step": 2024 }, { "epoch": 1.4670369475971987, "grad_norm": 0.3758852481842041, "learning_rate": 6.0543987695059236e-06, "loss": 0.3989, "step": 2025 }, { "epoch": 1.4677614102873702, "grad_norm": 0.359470009803772, "learning_rate": 6.050277428283206e-06, "loss": 0.4684, "step": 2026 }, { "epoch": 1.4684858729775416, "grad_norm": 0.3339592516422272, "learning_rate": 6.046155340408803e-06, "loss": 0.3971, "step": 2027 }, { "epoch": 1.469210335667713, "grad_norm": 0.3258877992630005, "learning_rate": 6.042032508813141e-06, "loss": 0.3646, "step": 2028 }, { "epoch": 1.4699347983578845, "grad_norm": 0.36449119448661804, "learning_rate": 6.03790893642718e-06, "loss": 0.3965, "step": 2029 }, { "epoch": 1.470659261048056, "grad_norm": 0.36979466676712036, "learning_rate": 6.033784626182405e-06, "loss": 0.3881, "step": 2030 }, { "epoch": 1.4713837237382275, "grad_norm": 0.3864899277687073, "learning_rate": 6.029659581010825e-06, "loss": 0.4051, "step": 2031 }, { "epoch": 1.472108186428399, "grad_norm": 0.3869205415248871, "learning_rate": 6.0255338038449705e-06, "loss": 0.4049, "step": 2032 }, { "epoch": 1.4728326491185704, "grad_norm": 0.3664189577102661, "learning_rate": 6.021407297617896e-06, "loss": 0.3985, "step": 2033 }, { "epoch": 1.4735571118087418, "grad_norm": 0.34855198860168457, "learning_rate": 6.0172800652631706e-06, "loss": 0.4211, "step": 2034 }, { "epoch": 1.4742815744989133, "grad_norm": 0.4057926833629608, "learning_rate": 6.013152109714879e-06, "loss": 0.4542, "step": 2035 }, { "epoch": 1.4750060371890847, "grad_norm": 0.314674973487854, "learning_rate": 6.009023433907626e-06, "loss": 0.3406, "step": 2036 }, { "epoch": 1.4757304998792562, "grad_norm": 0.32821449637413025, "learning_rate": 6.004894040776521e-06, "loss": 0.4144, "step": 2037 }, { "epoch": 1.4764549625694277, "grad_norm": 0.3036889135837555, "learning_rate": 6.00076393325719e-06, "loss": 0.3805, "step": 2038 }, { "epoch": 1.477179425259599, "grad_norm": 0.3395094573497772, "learning_rate": 5.99663311428576e-06, "loss": 0.4288, "step": 2039 }, { "epoch": 1.4779038879497706, "grad_norm": 0.3192773461341858, "learning_rate": 5.99250158679887e-06, "loss": 0.3701, "step": 2040 }, { "epoch": 1.478628350639942, "grad_norm": 0.3609676957130432, "learning_rate": 5.9883693537336595e-06, "loss": 0.405, "step": 2041 }, { "epoch": 1.4793528133301135, "grad_norm": 0.339385062456131, "learning_rate": 5.984236418027771e-06, "loss": 0.4165, "step": 2042 }, { "epoch": 1.480077276020285, "grad_norm": 0.27958613634109497, "learning_rate": 5.980102782619343e-06, "loss": 0.3583, "step": 2043 }, { "epoch": 1.4808017387104564, "grad_norm": 0.34067651629447937, "learning_rate": 5.975968450447017e-06, "loss": 0.4108, "step": 2044 }, { "epoch": 1.4815262014006279, "grad_norm": 0.3373166024684906, "learning_rate": 5.971833424449926e-06, "loss": 0.3815, "step": 2045 }, { "epoch": 1.4822506640907993, "grad_norm": 0.35221824049949646, "learning_rate": 5.967697707567697e-06, "loss": 0.4009, "step": 2046 }, { "epoch": 1.4829751267809708, "grad_norm": 0.33213719725608826, "learning_rate": 5.9635613027404495e-06, "loss": 0.3925, "step": 2047 }, { "epoch": 1.4836995894711422, "grad_norm": 0.3101835250854492, "learning_rate": 5.959424212908789e-06, "loss": 0.3527, "step": 2048 }, { "epoch": 1.4844240521613137, "grad_norm": 0.3716333508491516, "learning_rate": 5.9552864410138115e-06, "loss": 0.4421, "step": 2049 }, { "epoch": 1.4851485148514851, "grad_norm": 0.35122671723365784, "learning_rate": 5.951147989997096e-06, "loss": 0.401, "step": 2050 }, { "epoch": 1.4858729775416566, "grad_norm": 0.321189820766449, "learning_rate": 5.947008862800704e-06, "loss": 0.3816, "step": 2051 }, { "epoch": 1.486597440231828, "grad_norm": 0.34836629033088684, "learning_rate": 5.9428690623671796e-06, "loss": 0.3994, "step": 2052 }, { "epoch": 1.4873219029219995, "grad_norm": 0.34572458267211914, "learning_rate": 5.938728591639542e-06, "loss": 0.3844, "step": 2053 }, { "epoch": 1.488046365612171, "grad_norm": 0.30845949053764343, "learning_rate": 5.9345874535612915e-06, "loss": 0.3614, "step": 2054 }, { "epoch": 1.4887708283023424, "grad_norm": 0.3755147159099579, "learning_rate": 5.9304456510763995e-06, "loss": 0.4161, "step": 2055 }, { "epoch": 1.4894952909925139, "grad_norm": 0.32574763894081116, "learning_rate": 5.926303187129311e-06, "loss": 0.4035, "step": 2056 }, { "epoch": 1.4902197536826853, "grad_norm": 0.3368590772151947, "learning_rate": 5.92216006466494e-06, "loss": 0.4142, "step": 2057 }, { "epoch": 1.4909442163728568, "grad_norm": 0.3249984681606293, "learning_rate": 5.91801628662867e-06, "loss": 0.4048, "step": 2058 }, { "epoch": 1.4916686790630282, "grad_norm": 0.3533872961997986, "learning_rate": 5.913871855966351e-06, "loss": 0.3964, "step": 2059 }, { "epoch": 1.4923931417531997, "grad_norm": 0.3537514805793762, "learning_rate": 5.909726775624296e-06, "loss": 0.4423, "step": 2060 }, { "epoch": 1.4931176044433712, "grad_norm": 0.322289377450943, "learning_rate": 5.905581048549279e-06, "loss": 0.3816, "step": 2061 }, { "epoch": 1.4938420671335426, "grad_norm": 0.3201662302017212, "learning_rate": 5.9014346776885356e-06, "loss": 0.377, "step": 2062 }, { "epoch": 1.494566529823714, "grad_norm": 0.35688838362693787, "learning_rate": 5.897287665989757e-06, "loss": 0.3991, "step": 2063 }, { "epoch": 1.4952909925138855, "grad_norm": 0.359997421503067, "learning_rate": 5.893140016401093e-06, "loss": 0.4109, "step": 2064 }, { "epoch": 1.496015455204057, "grad_norm": 0.3537271320819855, "learning_rate": 5.888991731871143e-06, "loss": 0.3926, "step": 2065 }, { "epoch": 1.4967399178942284, "grad_norm": 0.3403804898262024, "learning_rate": 5.8848428153489614e-06, "loss": 0.4012, "step": 2066 }, { "epoch": 1.4974643805844, "grad_norm": 0.37346285581588745, "learning_rate": 5.8806932697840505e-06, "loss": 0.3889, "step": 2067 }, { "epoch": 1.4981888432745714, "grad_norm": 0.31064942479133606, "learning_rate": 5.8765430981263585e-06, "loss": 0.3807, "step": 2068 }, { "epoch": 1.4989133059647428, "grad_norm": 0.3464277684688568, "learning_rate": 5.872392303326281e-06, "loss": 0.3663, "step": 2069 }, { "epoch": 1.4996377686549143, "grad_norm": 0.43480467796325684, "learning_rate": 5.8682408883346535e-06, "loss": 0.4136, "step": 2070 }, { "epoch": 1.5003622313450857, "grad_norm": 0.34051793813705444, "learning_rate": 5.864088856102755e-06, "loss": 0.3879, "step": 2071 }, { "epoch": 1.5010866940352572, "grad_norm": 0.3151017725467682, "learning_rate": 5.859936209582305e-06, "loss": 0.3648, "step": 2072 }, { "epoch": 1.5018111567254286, "grad_norm": 0.3785874843597412, "learning_rate": 5.855782951725454e-06, "loss": 0.4216, "step": 2073 }, { "epoch": 1.5025356194156, "grad_norm": 0.36756470799446106, "learning_rate": 5.851629085484792e-06, "loss": 0.4151, "step": 2074 }, { "epoch": 1.5032600821057716, "grad_norm": 0.33964085578918457, "learning_rate": 5.84747461381334e-06, "loss": 0.4288, "step": 2075 }, { "epoch": 1.503984544795943, "grad_norm": 0.31690865755081177, "learning_rate": 5.843319539664549e-06, "loss": 0.3496, "step": 2076 }, { "epoch": 1.5047090074861145, "grad_norm": 0.36600393056869507, "learning_rate": 5.839163865992301e-06, "loss": 0.4261, "step": 2077 }, { "epoch": 1.505433470176286, "grad_norm": 0.3520257771015167, "learning_rate": 5.835007595750897e-06, "loss": 0.436, "step": 2078 }, { "epoch": 1.5061579328664574, "grad_norm": 0.321289986371994, "learning_rate": 5.830850731895071e-06, "loss": 0.36, "step": 2079 }, { "epoch": 1.5068823955566288, "grad_norm": 0.33578309416770935, "learning_rate": 5.826693277379974e-06, "loss": 0.4054, "step": 2080 }, { "epoch": 1.5076068582468003, "grad_norm": 0.3225679397583008, "learning_rate": 5.8225352351611766e-06, "loss": 0.4111, "step": 2081 }, { "epoch": 1.5083313209369718, "grad_norm": 0.335292249917984, "learning_rate": 5.818376608194669e-06, "loss": 0.369, "step": 2082 }, { "epoch": 1.5090557836271432, "grad_norm": 0.3512920141220093, "learning_rate": 5.814217399436859e-06, "loss": 0.404, "step": 2083 }, { "epoch": 1.5097802463173147, "grad_norm": 0.34883570671081543, "learning_rate": 5.810057611844561e-06, "loss": 0.3864, "step": 2084 }, { "epoch": 1.5105047090074861, "grad_norm": 0.32163259387016296, "learning_rate": 5.805897248375009e-06, "loss": 0.3952, "step": 2085 }, { "epoch": 1.5112291716976576, "grad_norm": 0.3455567955970764, "learning_rate": 5.801736311985841e-06, "loss": 0.394, "step": 2086 }, { "epoch": 1.511953634387829, "grad_norm": 0.3493753969669342, "learning_rate": 5.797574805635108e-06, "loss": 0.4315, "step": 2087 }, { "epoch": 1.5126780970780005, "grad_norm": 0.4064215421676636, "learning_rate": 5.793412732281258e-06, "loss": 0.4262, "step": 2088 }, { "epoch": 1.513402559768172, "grad_norm": 0.3460928201675415, "learning_rate": 5.789250094883146e-06, "loss": 0.3997, "step": 2089 }, { "epoch": 1.5141270224583434, "grad_norm": 0.3428976237773895, "learning_rate": 5.785086896400033e-06, "loss": 0.3936, "step": 2090 }, { "epoch": 1.5148514851485149, "grad_norm": 0.3349997103214264, "learning_rate": 5.780923139791571e-06, "loss": 0.387, "step": 2091 }, { "epoch": 1.5155759478386863, "grad_norm": 0.3361458480358124, "learning_rate": 5.776758828017811e-06, "loss": 0.3893, "step": 2092 }, { "epoch": 1.5163004105288578, "grad_norm": 0.31781283020973206, "learning_rate": 5.772593964039203e-06, "loss": 0.35, "step": 2093 }, { "epoch": 1.5170248732190292, "grad_norm": 0.3452989161014557, "learning_rate": 5.768428550816584e-06, "loss": 0.3906, "step": 2094 }, { "epoch": 1.5177493359092007, "grad_norm": 0.3348972797393799, "learning_rate": 5.764262591311186e-06, "loss": 0.3976, "step": 2095 }, { "epoch": 1.5184737985993721, "grad_norm": 0.3392990231513977, "learning_rate": 5.760096088484624e-06, "loss": 0.3826, "step": 2096 }, { "epoch": 1.5191982612895436, "grad_norm": 0.33330821990966797, "learning_rate": 5.755929045298905e-06, "loss": 0.3709, "step": 2097 }, { "epoch": 1.519922723979715, "grad_norm": 0.4246078431606293, "learning_rate": 5.751761464716418e-06, "loss": 0.379, "step": 2098 }, { "epoch": 1.5206471866698865, "grad_norm": 0.33480456471443176, "learning_rate": 5.747593349699931e-06, "loss": 0.3955, "step": 2099 }, { "epoch": 1.521371649360058, "grad_norm": 0.34983426332473755, "learning_rate": 5.7434247032125945e-06, "loss": 0.3992, "step": 2100 }, { "epoch": 1.5220961120502294, "grad_norm": 0.3481118381023407, "learning_rate": 5.73925552821794e-06, "loss": 0.399, "step": 2101 }, { "epoch": 1.522820574740401, "grad_norm": 0.3767218291759491, "learning_rate": 5.735085827679869e-06, "loss": 0.4025, "step": 2102 }, { "epoch": 1.5235450374305723, "grad_norm": 0.3320414125919342, "learning_rate": 5.73091560456266e-06, "loss": 0.4151, "step": 2103 }, { "epoch": 1.5242695001207438, "grad_norm": 0.32655832171440125, "learning_rate": 5.726744861830963e-06, "loss": 0.3693, "step": 2104 }, { "epoch": 1.5249939628109153, "grad_norm": 0.38517051935195923, "learning_rate": 5.722573602449794e-06, "loss": 0.3776, "step": 2105 }, { "epoch": 1.5257184255010867, "grad_norm": 0.3362641930580139, "learning_rate": 5.718401829384541e-06, "loss": 0.3965, "step": 2106 }, { "epoch": 1.5264428881912582, "grad_norm": 0.33933722972869873, "learning_rate": 5.714229545600956e-06, "loss": 0.4126, "step": 2107 }, { "epoch": 1.5271673508814296, "grad_norm": 0.3392072021961212, "learning_rate": 5.71005675406515e-06, "loss": 0.3579, "step": 2108 }, { "epoch": 1.527891813571601, "grad_norm": 0.36148932576179504, "learning_rate": 5.7058834577436e-06, "loss": 0.439, "step": 2109 }, { "epoch": 1.5286162762617725, "grad_norm": 0.3263086676597595, "learning_rate": 5.701709659603142e-06, "loss": 0.3784, "step": 2110 }, { "epoch": 1.529340738951944, "grad_norm": 0.38286709785461426, "learning_rate": 5.697535362610962e-06, "loss": 0.4265, "step": 2111 }, { "epoch": 1.5300652016421155, "grad_norm": 0.3570502698421478, "learning_rate": 5.69336056973461e-06, "loss": 0.4064, "step": 2112 }, { "epoch": 1.530789664332287, "grad_norm": 0.35320523381233215, "learning_rate": 5.68918528394198e-06, "loss": 0.4206, "step": 2113 }, { "epoch": 1.5315141270224584, "grad_norm": 0.36840516328811646, "learning_rate": 5.685009508201324e-06, "loss": 0.4107, "step": 2114 }, { "epoch": 1.5322385897126298, "grad_norm": 0.33105605840682983, "learning_rate": 5.680833245481234e-06, "loss": 0.3601, "step": 2115 }, { "epoch": 1.5329630524028013, "grad_norm": 0.328506737947464, "learning_rate": 5.6766564987506564e-06, "loss": 0.4021, "step": 2116 }, { "epoch": 1.5336875150929727, "grad_norm": 0.37145501375198364, "learning_rate": 5.672479270978878e-06, "loss": 0.4337, "step": 2117 }, { "epoch": 1.5344119777831442, "grad_norm": 0.38219133019447327, "learning_rate": 5.668301565135525e-06, "loss": 0.4047, "step": 2118 }, { "epoch": 1.5351364404733157, "grad_norm": 0.3380630612373352, "learning_rate": 5.6641233841905695e-06, "loss": 0.3809, "step": 2119 }, { "epoch": 1.5358609031634871, "grad_norm": 0.3454959988594055, "learning_rate": 5.659944731114315e-06, "loss": 0.4064, "step": 2120 }, { "epoch": 1.5365853658536586, "grad_norm": 0.3442600965499878, "learning_rate": 5.655765608877406e-06, "loss": 0.4087, "step": 2121 }, { "epoch": 1.53730982854383, "grad_norm": 0.361488401889801, "learning_rate": 5.651586020450816e-06, "loss": 0.4129, "step": 2122 }, { "epoch": 1.5380342912340015, "grad_norm": 0.34541159868240356, "learning_rate": 5.647405968805854e-06, "loss": 0.4238, "step": 2123 }, { "epoch": 1.538758753924173, "grad_norm": 0.32343602180480957, "learning_rate": 5.6432254569141565e-06, "loss": 0.3929, "step": 2124 }, { "epoch": 1.5394832166143444, "grad_norm": 0.31487226486206055, "learning_rate": 5.639044487747684e-06, "loss": 0.3548, "step": 2125 }, { "epoch": 1.5402076793045159, "grad_norm": 0.37510645389556885, "learning_rate": 5.6348630642787305e-06, "loss": 0.3767, "step": 2126 }, { "epoch": 1.5409321419946873, "grad_norm": 0.3341904580593109, "learning_rate": 5.630681189479902e-06, "loss": 0.3807, "step": 2127 }, { "epoch": 1.5416566046848588, "grad_norm": 0.3586965501308441, "learning_rate": 5.626498866324133e-06, "loss": 0.4008, "step": 2128 }, { "epoch": 1.5423810673750302, "grad_norm": 0.3913462162017822, "learning_rate": 5.622316097784677e-06, "loss": 0.4735, "step": 2129 }, { "epoch": 1.5431055300652017, "grad_norm": 0.31637242436408997, "learning_rate": 5.618132886835098e-06, "loss": 0.3804, "step": 2130 }, { "epoch": 1.5438299927553731, "grad_norm": 0.3347223997116089, "learning_rate": 5.613949236449282e-06, "loss": 0.4065, "step": 2131 }, { "epoch": 1.5445544554455446, "grad_norm": 0.33775821328163147, "learning_rate": 5.60976514960142e-06, "loss": 0.3749, "step": 2132 }, { "epoch": 1.545278918135716, "grad_norm": 0.35365527868270874, "learning_rate": 5.605580629266021e-06, "loss": 0.4385, "step": 2133 }, { "epoch": 1.5460033808258875, "grad_norm": 0.37041500210762024, "learning_rate": 5.601395678417896e-06, "loss": 0.4451, "step": 2134 }, { "epoch": 1.546727843516059, "grad_norm": 0.3585076630115509, "learning_rate": 5.5972103000321644e-06, "loss": 0.3503, "step": 2135 }, { "epoch": 1.5474523062062304, "grad_norm": 0.3652200698852539, "learning_rate": 5.593024497084249e-06, "loss": 0.4466, "step": 2136 }, { "epoch": 1.5481767688964019, "grad_norm": 0.3074163496494293, "learning_rate": 5.5888382725498765e-06, "loss": 0.3723, "step": 2137 }, { "epoch": 1.5489012315865733, "grad_norm": 0.36498355865478516, "learning_rate": 5.584651629405069e-06, "loss": 0.433, "step": 2138 }, { "epoch": 1.5496256942767448, "grad_norm": 0.3800821304321289, "learning_rate": 5.5804645706261515e-06, "loss": 0.4341, "step": 2139 }, { "epoch": 1.5503501569669162, "grad_norm": 0.31838423013687134, "learning_rate": 5.576277099189742e-06, "loss": 0.3755, "step": 2140 }, { "epoch": 1.5510746196570877, "grad_norm": 0.42105433344841003, "learning_rate": 5.5720892180727475e-06, "loss": 0.4302, "step": 2141 }, { "epoch": 1.5517990823472592, "grad_norm": 0.30727002024650574, "learning_rate": 5.567900930252375e-06, "loss": 0.3574, "step": 2142 }, { "epoch": 1.5525235450374306, "grad_norm": 0.32964277267456055, "learning_rate": 5.563712238706114e-06, "loss": 0.3959, "step": 2143 }, { "epoch": 1.553248007727602, "grad_norm": 0.3735048472881317, "learning_rate": 5.5595231464117425e-06, "loss": 0.4039, "step": 2144 }, { "epoch": 1.5539724704177735, "grad_norm": 0.2984336316585541, "learning_rate": 5.555333656347327e-06, "loss": 0.3854, "step": 2145 }, { "epoch": 1.554696933107945, "grad_norm": 0.3274257481098175, "learning_rate": 5.551143771491209e-06, "loss": 0.4207, "step": 2146 }, { "epoch": 1.5554213957981164, "grad_norm": 0.3033275306224823, "learning_rate": 5.5469534948220206e-06, "loss": 0.3911, "step": 2147 }, { "epoch": 1.556145858488288, "grad_norm": 0.35199010372161865, "learning_rate": 5.542762829318663e-06, "loss": 0.4005, "step": 2148 }, { "epoch": 1.5568703211784594, "grad_norm": 0.33731359243392944, "learning_rate": 5.53857177796032e-06, "loss": 0.4014, "step": 2149 }, { "epoch": 1.5575947838686308, "grad_norm": 0.36242276430130005, "learning_rate": 5.534380343726448e-06, "loss": 0.4366, "step": 2150 }, { "epoch": 1.5583192465588023, "grad_norm": 0.35001298785209656, "learning_rate": 5.530188529596774e-06, "loss": 0.405, "step": 2151 }, { "epoch": 1.5590437092489737, "grad_norm": 0.3472488820552826, "learning_rate": 5.5259963385512975e-06, "loss": 0.3999, "step": 2152 }, { "epoch": 1.5597681719391452, "grad_norm": 0.31333744525909424, "learning_rate": 5.521803773570285e-06, "loss": 0.3709, "step": 2153 }, { "epoch": 1.5604926346293166, "grad_norm": 0.33393406867980957, "learning_rate": 5.5176108376342665e-06, "loss": 0.3863, "step": 2154 }, { "epoch": 1.561217097319488, "grad_norm": 0.31442373991012573, "learning_rate": 5.513417533724042e-06, "loss": 0.3953, "step": 2155 }, { "epoch": 1.5619415600096596, "grad_norm": 0.3662436604499817, "learning_rate": 5.509223864820666e-06, "loss": 0.4012, "step": 2156 }, { "epoch": 1.562666022699831, "grad_norm": 0.3694762885570526, "learning_rate": 5.505029833905456e-06, "loss": 0.3923, "step": 2157 }, { "epoch": 1.5633904853900025, "grad_norm": 0.34033632278442383, "learning_rate": 5.500835443959987e-06, "loss": 0.3723, "step": 2158 }, { "epoch": 1.564114948080174, "grad_norm": 0.3469395935535431, "learning_rate": 5.49664069796609e-06, "loss": 0.3945, "step": 2159 }, { "epoch": 1.5648394107703454, "grad_norm": 0.3505686819553375, "learning_rate": 5.492445598905843e-06, "loss": 0.3811, "step": 2160 }, { "epoch": 1.5655638734605168, "grad_norm": 0.39006203413009644, "learning_rate": 5.488250149761586e-06, "loss": 0.4253, "step": 2161 }, { "epoch": 1.5662883361506883, "grad_norm": 0.35688820481300354, "learning_rate": 5.484054353515896e-06, "loss": 0.3939, "step": 2162 }, { "epoch": 1.5670127988408598, "grad_norm": 0.3333371579647064, "learning_rate": 5.479858213151607e-06, "loss": 0.3895, "step": 2163 }, { "epoch": 1.5677372615310312, "grad_norm": 0.32442229986190796, "learning_rate": 5.4756617316517894e-06, "loss": 0.3552, "step": 2164 }, { "epoch": 1.5684617242212027, "grad_norm": 0.3677930533885956, "learning_rate": 5.471464911999758e-06, "loss": 0.4036, "step": 2165 }, { "epoch": 1.5691861869113741, "grad_norm": 0.32218027114868164, "learning_rate": 5.467267757179076e-06, "loss": 0.3743, "step": 2166 }, { "epoch": 1.5699106496015456, "grad_norm": 0.30803677439689636, "learning_rate": 5.463070270173535e-06, "loss": 0.3843, "step": 2167 }, { "epoch": 1.570635112291717, "grad_norm": 0.326459139585495, "learning_rate": 5.458872453967167e-06, "loss": 0.3571, "step": 2168 }, { "epoch": 1.5713595749818885, "grad_norm": 0.34540605545043945, "learning_rate": 5.454674311544236e-06, "loss": 0.3807, "step": 2169 }, { "epoch": 1.57208403767206, "grad_norm": 0.3383195698261261, "learning_rate": 5.450475845889241e-06, "loss": 0.3909, "step": 2170 }, { "epoch": 1.5728085003622314, "grad_norm": 0.33320653438568115, "learning_rate": 5.4462770599869075e-06, "loss": 0.4048, "step": 2171 }, { "epoch": 1.5735329630524029, "grad_norm": 0.34092190861701965, "learning_rate": 5.442077956822191e-06, "loss": 0.3856, "step": 2172 }, { "epoch": 1.5742574257425743, "grad_norm": 0.36617252230644226, "learning_rate": 5.437878539380272e-06, "loss": 0.4028, "step": 2173 }, { "epoch": 1.5749818884327458, "grad_norm": 0.35633182525634766, "learning_rate": 5.433678810646554e-06, "loss": 0.4101, "step": 2174 }, { "epoch": 1.5757063511229172, "grad_norm": 0.3498481810092926, "learning_rate": 5.429478773606663e-06, "loss": 0.4139, "step": 2175 }, { "epoch": 1.5764308138130887, "grad_norm": 0.33737632632255554, "learning_rate": 5.4252784312464414e-06, "loss": 0.3905, "step": 2176 }, { "epoch": 1.5771552765032602, "grad_norm": 0.3486364483833313, "learning_rate": 5.421077786551951e-06, "loss": 0.3766, "step": 2177 }, { "epoch": 1.5778797391934316, "grad_norm": 0.3269813060760498, "learning_rate": 5.416876842509468e-06, "loss": 0.3899, "step": 2178 }, { "epoch": 1.578604201883603, "grad_norm": 0.3326578736305237, "learning_rate": 5.412675602105483e-06, "loss": 0.4052, "step": 2179 }, { "epoch": 1.5793286645737745, "grad_norm": 0.35159313678741455, "learning_rate": 5.408474068326693e-06, "loss": 0.3935, "step": 2180 }, { "epoch": 1.580053127263946, "grad_norm": 0.31710851192474365, "learning_rate": 5.404272244160007e-06, "loss": 0.3633, "step": 2181 }, { "epoch": 1.5807775899541174, "grad_norm": 0.3482740819454193, "learning_rate": 5.400070132592541e-06, "loss": 0.3739, "step": 2182 }, { "epoch": 1.581502052644289, "grad_norm": 0.3242197632789612, "learning_rate": 5.395867736611613e-06, "loss": 0.3793, "step": 2183 }, { "epoch": 1.5822265153344603, "grad_norm": 0.3531443476676941, "learning_rate": 5.391665059204744e-06, "loss": 0.3742, "step": 2184 }, { "epoch": 1.5829509780246318, "grad_norm": 0.34838101267814636, "learning_rate": 5.387462103359655e-06, "loss": 0.3923, "step": 2185 }, { "epoch": 1.5836754407148033, "grad_norm": 0.336479514837265, "learning_rate": 5.3832588720642674e-06, "loss": 0.3907, "step": 2186 }, { "epoch": 1.5843999034049747, "grad_norm": 0.340014785528183, "learning_rate": 5.379055368306693e-06, "loss": 0.3981, "step": 2187 }, { "epoch": 1.5851243660951462, "grad_norm": 0.3859887421131134, "learning_rate": 5.374851595075243e-06, "loss": 0.4446, "step": 2188 }, { "epoch": 1.5858488287853176, "grad_norm": 0.34789329767227173, "learning_rate": 5.370647555358414e-06, "loss": 0.372, "step": 2189 }, { "epoch": 1.586573291475489, "grad_norm": 0.3355407118797302, "learning_rate": 5.366443252144901e-06, "loss": 0.3872, "step": 2190 }, { "epoch": 1.5872977541656605, "grad_norm": 0.3318612575531006, "learning_rate": 5.3622386884235764e-06, "loss": 0.4007, "step": 2191 }, { "epoch": 1.588022216855832, "grad_norm": 0.3726203739643097, "learning_rate": 5.358033867183505e-06, "loss": 0.4348, "step": 2192 }, { "epoch": 1.5887466795460035, "grad_norm": 0.33758896589279175, "learning_rate": 5.353828791413933e-06, "loss": 0.3996, "step": 2193 }, { "epoch": 1.589471142236175, "grad_norm": 0.35137349367141724, "learning_rate": 5.349623464104284e-06, "loss": 0.3861, "step": 2194 }, { "epoch": 1.5901956049263464, "grad_norm": 0.3554786443710327, "learning_rate": 5.345417888244164e-06, "loss": 0.3918, "step": 2195 }, { "epoch": 1.5909200676165178, "grad_norm": 0.35209134221076965, "learning_rate": 5.341212066823356e-06, "loss": 0.3835, "step": 2196 }, { "epoch": 1.5916445303066893, "grad_norm": 0.3564736247062683, "learning_rate": 5.337006002831815e-06, "loss": 0.3644, "step": 2197 }, { "epoch": 1.5923689929968607, "grad_norm": 0.3621894121170044, "learning_rate": 5.332799699259669e-06, "loss": 0.4098, "step": 2198 }, { "epoch": 1.5930934556870322, "grad_norm": 0.3316822946071625, "learning_rate": 5.328593159097219e-06, "loss": 0.3996, "step": 2199 }, { "epoch": 1.5938179183772037, "grad_norm": 0.3464525043964386, "learning_rate": 5.324386385334931e-06, "loss": 0.3984, "step": 2200 }, { "epoch": 1.5945423810673751, "grad_norm": 0.3275332748889923, "learning_rate": 5.3201793809634395e-06, "loss": 0.4031, "step": 2201 }, { "epoch": 1.5952668437575466, "grad_norm": 0.37016820907592773, "learning_rate": 5.315972148973541e-06, "loss": 0.3675, "step": 2202 }, { "epoch": 1.595991306447718, "grad_norm": 0.3346472382545471, "learning_rate": 5.311764692356195e-06, "loss": 0.405, "step": 2203 }, { "epoch": 1.5967157691378895, "grad_norm": 0.3079183101654053, "learning_rate": 5.30755701410252e-06, "loss": 0.3558, "step": 2204 }, { "epoch": 1.597440231828061, "grad_norm": 0.3567577600479126, "learning_rate": 5.3033491172037935e-06, "loss": 0.4335, "step": 2205 }, { "epoch": 1.5981646945182324, "grad_norm": 0.32676494121551514, "learning_rate": 5.2991410046514445e-06, "loss": 0.3706, "step": 2206 }, { "epoch": 1.5988891572084039, "grad_norm": 0.35970890522003174, "learning_rate": 5.294932679437063e-06, "loss": 0.4339, "step": 2207 }, { "epoch": 1.5996136198985753, "grad_norm": 0.33996137976646423, "learning_rate": 5.290724144552379e-06, "loss": 0.3922, "step": 2208 }, { "epoch": 1.6003380825887468, "grad_norm": 0.3117385804653168, "learning_rate": 5.286515402989284e-06, "loss": 0.3785, "step": 2209 }, { "epoch": 1.6010625452789182, "grad_norm": 0.317130982875824, "learning_rate": 5.2823064577398065e-06, "loss": 0.402, "step": 2210 }, { "epoch": 1.6017870079690897, "grad_norm": 0.3610402047634125, "learning_rate": 5.278097311796123e-06, "loss": 0.3903, "step": 2211 }, { "epoch": 1.6025114706592611, "grad_norm": 0.3427317440509796, "learning_rate": 5.2738879681505576e-06, "loss": 0.392, "step": 2212 }, { "epoch": 1.6032359333494326, "grad_norm": 0.3415745794773102, "learning_rate": 5.2696784297955665e-06, "loss": 0.4167, "step": 2213 }, { "epoch": 1.603960396039604, "grad_norm": 0.34441882371902466, "learning_rate": 5.265468699723748e-06, "loss": 0.4067, "step": 2214 }, { "epoch": 1.6046848587297755, "grad_norm": 0.3778005838394165, "learning_rate": 5.2612587809278415e-06, "loss": 0.4168, "step": 2215 }, { "epoch": 1.605409321419947, "grad_norm": 0.346310019493103, "learning_rate": 5.2570486764007124e-06, "loss": 0.3781, "step": 2216 }, { "epoch": 1.6061337841101184, "grad_norm": 0.30152440071105957, "learning_rate": 5.252838389135364e-06, "loss": 0.3909, "step": 2217 }, { "epoch": 1.6068582468002899, "grad_norm": 0.33657726645469666, "learning_rate": 5.248627922124926e-06, "loss": 0.3882, "step": 2218 }, { "epoch": 1.6075827094904613, "grad_norm": 0.3557579517364502, "learning_rate": 5.2444172783626575e-06, "loss": 0.434, "step": 2219 }, { "epoch": 1.6083071721806328, "grad_norm": 0.31058406829833984, "learning_rate": 5.240206460841944e-06, "loss": 0.3567, "step": 2220 }, { "epoch": 1.6090316348708043, "grad_norm": 0.3389318585395813, "learning_rate": 5.235995472556295e-06, "loss": 0.4205, "step": 2221 }, { "epoch": 1.6097560975609757, "grad_norm": 0.31861913204193115, "learning_rate": 5.231784316499335e-06, "loss": 0.4037, "step": 2222 }, { "epoch": 1.6104805602511472, "grad_norm": 0.3060288727283478, "learning_rate": 5.227572995664819e-06, "loss": 0.3523, "step": 2223 }, { "epoch": 1.6112050229413186, "grad_norm": 0.34855377674102783, "learning_rate": 5.223361513046608e-06, "loss": 0.3735, "step": 2224 }, { "epoch": 1.61192948563149, "grad_norm": 0.3654906749725342, "learning_rate": 5.219149871638686e-06, "loss": 0.4818, "step": 2225 }, { "epoch": 1.6126539483216615, "grad_norm": 0.3011781573295593, "learning_rate": 5.214938074435145e-06, "loss": 0.354, "step": 2226 }, { "epoch": 1.613378411011833, "grad_norm": 0.2894231379032135, "learning_rate": 5.21072612443019e-06, "loss": 0.3556, "step": 2227 }, { "epoch": 1.6141028737020044, "grad_norm": 0.35926270484924316, "learning_rate": 5.206514024618136e-06, "loss": 0.3749, "step": 2228 }, { "epoch": 1.614827336392176, "grad_norm": 0.3446187973022461, "learning_rate": 5.2023017779934e-06, "loss": 0.4102, "step": 2229 }, { "epoch": 1.6155517990823474, "grad_norm": 0.3184847831726074, "learning_rate": 5.198089387550507e-06, "loss": 0.3793, "step": 2230 }, { "epoch": 1.6162762617725188, "grad_norm": 0.35416558384895325, "learning_rate": 5.193876856284085e-06, "loss": 0.4041, "step": 2231 }, { "epoch": 1.6170007244626903, "grad_norm": 0.3399524688720703, "learning_rate": 5.189664187188857e-06, "loss": 0.4318, "step": 2232 }, { "epoch": 1.6177251871528617, "grad_norm": 0.3255431056022644, "learning_rate": 5.185451383259651e-06, "loss": 0.4065, "step": 2233 }, { "epoch": 1.6184496498430332, "grad_norm": 0.34359219670295715, "learning_rate": 5.181238447491385e-06, "loss": 0.4054, "step": 2234 }, { "epoch": 1.6191741125332046, "grad_norm": 0.3408445417881012, "learning_rate": 5.177025382879074e-06, "loss": 0.3692, "step": 2235 }, { "epoch": 1.619898575223376, "grad_norm": 0.3391171097755432, "learning_rate": 5.172812192417824e-06, "loss": 0.3894, "step": 2236 }, { "epoch": 1.6206230379135476, "grad_norm": 0.3266442120075226, "learning_rate": 5.168598879102827e-06, "loss": 0.3608, "step": 2237 }, { "epoch": 1.621347500603719, "grad_norm": 0.3564135730266571, "learning_rate": 5.164385445929369e-06, "loss": 0.424, "step": 2238 }, { "epoch": 1.6220719632938905, "grad_norm": 0.32613059878349304, "learning_rate": 5.160171895892817e-06, "loss": 0.4142, "step": 2239 }, { "epoch": 1.622796425984062, "grad_norm": 0.30813130736351013, "learning_rate": 5.155958231988621e-06, "loss": 0.3867, "step": 2240 }, { "epoch": 1.6235208886742334, "grad_norm": 0.3743106424808502, "learning_rate": 5.151744457212312e-06, "loss": 0.4102, "step": 2241 }, { "epoch": 1.6242453513644048, "grad_norm": 0.3446752429008484, "learning_rate": 5.147530574559501e-06, "loss": 0.4154, "step": 2242 }, { "epoch": 1.6249698140545763, "grad_norm": 0.3361722528934479, "learning_rate": 5.143316587025877e-06, "loss": 0.4299, "step": 2243 }, { "epoch": 1.6256942767447478, "grad_norm": 0.33715763688087463, "learning_rate": 5.139102497607198e-06, "loss": 0.3793, "step": 2244 }, { "epoch": 1.6264187394349192, "grad_norm": 0.3477323651313782, "learning_rate": 5.1348883092993e-06, "loss": 0.4089, "step": 2245 }, { "epoch": 1.6271432021250907, "grad_norm": 0.31379133462905884, "learning_rate": 5.130674025098091e-06, "loss": 0.3698, "step": 2246 }, { "epoch": 1.6278676648152621, "grad_norm": 0.33117011189460754, "learning_rate": 5.126459647999535e-06, "loss": 0.4274, "step": 2247 }, { "epoch": 1.6285921275054336, "grad_norm": 0.31358152627944946, "learning_rate": 5.12224518099968e-06, "loss": 0.3588, "step": 2248 }, { "epoch": 1.629316590195605, "grad_norm": 0.3666573762893677, "learning_rate": 5.118030627094622e-06, "loss": 0.4195, "step": 2249 }, { "epoch": 1.6300410528857765, "grad_norm": 0.3003334403038025, "learning_rate": 5.113815989280528e-06, "loss": 0.3806, "step": 2250 }, { "epoch": 1.630765515575948, "grad_norm": 0.3035551607608795, "learning_rate": 5.109601270553623e-06, "loss": 0.3727, "step": 2251 }, { "epoch": 1.6314899782661194, "grad_norm": 0.3707110285758972, "learning_rate": 5.105386473910187e-06, "loss": 0.4487, "step": 2252 }, { "epoch": 1.6322144409562909, "grad_norm": 0.29726311564445496, "learning_rate": 5.101171602346556e-06, "loss": 0.345, "step": 2253 }, { "epoch": 1.6329389036464623, "grad_norm": 0.28911906480789185, "learning_rate": 5.096956658859122e-06, "loss": 0.3709, "step": 2254 }, { "epoch": 1.6336633663366338, "grad_norm": 0.33778074383735657, "learning_rate": 5.092741646444327e-06, "loss": 0.4104, "step": 2255 }, { "epoch": 1.6343878290268052, "grad_norm": 0.353344589471817, "learning_rate": 5.0885265680986585e-06, "loss": 0.4684, "step": 2256 }, { "epoch": 1.6351122917169767, "grad_norm": 0.30552104115486145, "learning_rate": 5.084311426818654e-06, "loss": 0.37, "step": 2257 }, { "epoch": 1.6358367544071482, "grad_norm": 0.3565255105495453, "learning_rate": 5.080096225600899e-06, "loss": 0.4096, "step": 2258 }, { "epoch": 1.6365612170973196, "grad_norm": 0.32954028248786926, "learning_rate": 5.075880967442014e-06, "loss": 0.4263, "step": 2259 }, { "epoch": 1.637285679787491, "grad_norm": 0.35339614748954773, "learning_rate": 5.0716656553386655e-06, "loss": 0.3999, "step": 2260 }, { "epoch": 1.6380101424776625, "grad_norm": 0.3506143391132355, "learning_rate": 5.0674502922875565e-06, "loss": 0.3568, "step": 2261 }, { "epoch": 1.638734605167834, "grad_norm": 0.39779132604599, "learning_rate": 5.063234881285428e-06, "loss": 0.4315, "step": 2262 }, { "epoch": 1.6394590678580054, "grad_norm": 0.319578617811203, "learning_rate": 5.059019425329053e-06, "loss": 0.399, "step": 2263 }, { "epoch": 1.640183530548177, "grad_norm": 0.34968623518943787, "learning_rate": 5.054803927415236e-06, "loss": 0.407, "step": 2264 }, { "epoch": 1.6409079932383484, "grad_norm": 0.3404877185821533, "learning_rate": 5.050588390540813e-06, "loss": 0.3806, "step": 2265 }, { "epoch": 1.6416324559285198, "grad_norm": 0.34939640760421753, "learning_rate": 5.046372817702647e-06, "loss": 0.4104, "step": 2266 }, { "epoch": 1.6423569186186913, "grad_norm": 0.358859658241272, "learning_rate": 5.0421572118976284e-06, "loss": 0.4042, "step": 2267 }, { "epoch": 1.6430813813088627, "grad_norm": 0.3390056788921356, "learning_rate": 5.037941576122667e-06, "loss": 0.4047, "step": 2268 }, { "epoch": 1.6438058439990342, "grad_norm": 0.32021886110305786, "learning_rate": 5.033725913374698e-06, "loss": 0.3549, "step": 2269 }, { "epoch": 1.6445303066892056, "grad_norm": 0.3668937385082245, "learning_rate": 5.029510226650672e-06, "loss": 0.3949, "step": 2270 }, { "epoch": 1.645254769379377, "grad_norm": 0.3351605534553528, "learning_rate": 5.025294518947561e-06, "loss": 0.3665, "step": 2271 }, { "epoch": 1.6459792320695485, "grad_norm": 0.33336660265922546, "learning_rate": 5.021078793262349e-06, "loss": 0.4264, "step": 2272 }, { "epoch": 1.64670369475972, "grad_norm": 0.30642950534820557, "learning_rate": 5.0168630525920316e-06, "loss": 0.3452, "step": 2273 }, { "epoch": 1.6474281574498915, "grad_norm": 0.36838603019714355, "learning_rate": 5.01264729993362e-06, "loss": 0.411, "step": 2274 }, { "epoch": 1.648152620140063, "grad_norm": 0.35179978609085083, "learning_rate": 5.008431538284128e-06, "loss": 0.4265, "step": 2275 }, { "epoch": 1.6488770828302344, "grad_norm": 0.33314767479896545, "learning_rate": 5.004215770640578e-06, "loss": 0.3679, "step": 2276 }, { "epoch": 1.6496015455204058, "grad_norm": 0.3559325337409973, "learning_rate": 5e-06, "loss": 0.4296, "step": 2277 }, { "epoch": 1.6503260082105773, "grad_norm": 0.34511420130729675, "learning_rate": 4.995784229359423e-06, "loss": 0.3525, "step": 2278 }, { "epoch": 1.6510504709007487, "grad_norm": 0.3573168218135834, "learning_rate": 4.991568461715874e-06, "loss": 0.4018, "step": 2279 }, { "epoch": 1.6517749335909202, "grad_norm": 0.352881520986557, "learning_rate": 4.987352700066382e-06, "loss": 0.3904, "step": 2280 }, { "epoch": 1.6524993962810917, "grad_norm": 0.33958137035369873, "learning_rate": 4.98313694740797e-06, "loss": 0.3817, "step": 2281 }, { "epoch": 1.6532238589712631, "grad_norm": 0.3334532380104065, "learning_rate": 4.978921206737652e-06, "loss": 0.3812, "step": 2282 }, { "epoch": 1.6539483216614346, "grad_norm": 0.34684666991233826, "learning_rate": 4.9747054810524405e-06, "loss": 0.3884, "step": 2283 }, { "epoch": 1.654672784351606, "grad_norm": 0.3507898449897766, "learning_rate": 4.970489773349328e-06, "loss": 0.377, "step": 2284 }, { "epoch": 1.6553972470417775, "grad_norm": 0.3593534529209137, "learning_rate": 4.966274086625304e-06, "loss": 0.4212, "step": 2285 }, { "epoch": 1.656121709731949, "grad_norm": 0.3508662283420563, "learning_rate": 4.962058423877335e-06, "loss": 0.3895, "step": 2286 }, { "epoch": 1.6568461724221204, "grad_norm": 0.3170710802078247, "learning_rate": 4.957842788102372e-06, "loss": 0.4261, "step": 2287 }, { "epoch": 1.6575706351122919, "grad_norm": 0.3023587465286255, "learning_rate": 4.953627182297354e-06, "loss": 0.3732, "step": 2288 }, { "epoch": 1.6582950978024633, "grad_norm": 0.33022773265838623, "learning_rate": 4.949411609459189e-06, "loss": 0.3781, "step": 2289 }, { "epoch": 1.6590195604926348, "grad_norm": 0.3913925886154175, "learning_rate": 4.945196072584766e-06, "loss": 0.4517, "step": 2290 }, { "epoch": 1.6597440231828062, "grad_norm": 0.3427768647670746, "learning_rate": 4.940980574670949e-06, "loss": 0.3776, "step": 2291 }, { "epoch": 1.6604684858729777, "grad_norm": 0.30939722061157227, "learning_rate": 4.936765118714574e-06, "loss": 0.353, "step": 2292 }, { "epoch": 1.6611929485631491, "grad_norm": 0.390769898891449, "learning_rate": 4.932549707712444e-06, "loss": 0.4422, "step": 2293 }, { "epoch": 1.6619174112533206, "grad_norm": 0.3117462396621704, "learning_rate": 4.928334344661336e-06, "loss": 0.3624, "step": 2294 }, { "epoch": 1.662641873943492, "grad_norm": 0.3374945819377899, "learning_rate": 4.924119032557988e-06, "loss": 0.4027, "step": 2295 }, { "epoch": 1.6633663366336635, "grad_norm": 0.3313553035259247, "learning_rate": 4.919903774399103e-06, "loss": 0.4071, "step": 2296 }, { "epoch": 1.664090799323835, "grad_norm": 0.37559130787849426, "learning_rate": 4.915688573181347e-06, "loss": 0.4082, "step": 2297 }, { "epoch": 1.6648152620140064, "grad_norm": 0.3164505064487457, "learning_rate": 4.911473431901343e-06, "loss": 0.3928, "step": 2298 }, { "epoch": 1.6655397247041779, "grad_norm": 0.34802892804145813, "learning_rate": 4.907258353555675e-06, "loss": 0.3773, "step": 2299 }, { "epoch": 1.6662641873943493, "grad_norm": 0.30784982442855835, "learning_rate": 4.903043341140879e-06, "loss": 0.3664, "step": 2300 }, { "epoch": 1.6669886500845208, "grad_norm": 0.33725371956825256, "learning_rate": 4.8988283976534436e-06, "loss": 0.3818, "step": 2301 }, { "epoch": 1.6677131127746923, "grad_norm": 0.36124852299690247, "learning_rate": 4.894613526089814e-06, "loss": 0.431, "step": 2302 }, { "epoch": 1.6684375754648637, "grad_norm": 0.3056713044643402, "learning_rate": 4.890398729446378e-06, "loss": 0.3622, "step": 2303 }, { "epoch": 1.6691620381550352, "grad_norm": 0.3543775677680969, "learning_rate": 4.886184010719472e-06, "loss": 0.4043, "step": 2304 }, { "epoch": 1.6698865008452066, "grad_norm": 0.3752552568912506, "learning_rate": 4.8819693729053786e-06, "loss": 0.403, "step": 2305 }, { "epoch": 1.670610963535378, "grad_norm": 0.3479154109954834, "learning_rate": 4.877754819000321e-06, "loss": 0.3807, "step": 2306 }, { "epoch": 1.6713354262255495, "grad_norm": 0.349687397480011, "learning_rate": 4.873540352000464e-06, "loss": 0.4197, "step": 2307 }, { "epoch": 1.672059888915721, "grad_norm": 0.3229091167449951, "learning_rate": 4.869325974901911e-06, "loss": 0.3968, "step": 2308 }, { "epoch": 1.6727843516058925, "grad_norm": 0.3474825620651245, "learning_rate": 4.865111690700699e-06, "loss": 0.3811, "step": 2309 }, { "epoch": 1.673508814296064, "grad_norm": 0.3859727680683136, "learning_rate": 4.860897502392802e-06, "loss": 0.4066, "step": 2310 }, { "epoch": 1.6742332769862354, "grad_norm": 0.3316902220249176, "learning_rate": 4.856683412974124e-06, "loss": 0.4142, "step": 2311 }, { "epoch": 1.6749577396764068, "grad_norm": 0.33436882495880127, "learning_rate": 4.852469425440499e-06, "loss": 0.382, "step": 2312 }, { "epoch": 1.6756822023665783, "grad_norm": 0.3281739056110382, "learning_rate": 4.848255542787689e-06, "loss": 0.4327, "step": 2313 }, { "epoch": 1.6764066650567497, "grad_norm": 0.30339813232421875, "learning_rate": 4.84404176801138e-06, "loss": 0.4009, "step": 2314 }, { "epoch": 1.6771311277469212, "grad_norm": 0.3623583912849426, "learning_rate": 4.839828104107183e-06, "loss": 0.4133, "step": 2315 }, { "epoch": 1.6778555904370926, "grad_norm": 0.35020968317985535, "learning_rate": 4.8356145540706315e-06, "loss": 0.4139, "step": 2316 }, { "epoch": 1.678580053127264, "grad_norm": 0.3108100891113281, "learning_rate": 4.831401120897173e-06, "loss": 0.3657, "step": 2317 }, { "epoch": 1.6793045158174356, "grad_norm": 0.3467102646827698, "learning_rate": 4.827187807582177e-06, "loss": 0.398, "step": 2318 }, { "epoch": 1.680028978507607, "grad_norm": 0.3673262298107147, "learning_rate": 4.8229746171209275e-06, "loss": 0.4344, "step": 2319 }, { "epoch": 1.6807534411977785, "grad_norm": 0.3280233144760132, "learning_rate": 4.818761552508615e-06, "loss": 0.3937, "step": 2320 }, { "epoch": 1.68147790388795, "grad_norm": 0.3050577938556671, "learning_rate": 4.8145486167403496e-06, "loss": 0.3522, "step": 2321 }, { "epoch": 1.6822023665781214, "grad_norm": 0.3335897922515869, "learning_rate": 4.8103358128111435e-06, "loss": 0.4115, "step": 2322 }, { "epoch": 1.6829268292682928, "grad_norm": 0.32369232177734375, "learning_rate": 4.806123143715916e-06, "loss": 0.3915, "step": 2323 }, { "epoch": 1.6836512919584643, "grad_norm": 0.3144063949584961, "learning_rate": 4.801910612449495e-06, "loss": 0.3831, "step": 2324 }, { "epoch": 1.6843757546486358, "grad_norm": 0.3494751453399658, "learning_rate": 4.797698222006603e-06, "loss": 0.4168, "step": 2325 }, { "epoch": 1.6851002173388072, "grad_norm": 0.3173535466194153, "learning_rate": 4.7934859753818666e-06, "loss": 0.3602, "step": 2326 }, { "epoch": 1.6858246800289787, "grad_norm": 0.325476735830307, "learning_rate": 4.789273875569811e-06, "loss": 0.3843, "step": 2327 }, { "epoch": 1.6865491427191501, "grad_norm": 0.34631186723709106, "learning_rate": 4.7850619255648575e-06, "loss": 0.3889, "step": 2328 }, { "epoch": 1.6872736054093216, "grad_norm": 0.33495140075683594, "learning_rate": 4.780850128361317e-06, "loss": 0.3868, "step": 2329 }, { "epoch": 1.6879980680994928, "grad_norm": 0.30795779824256897, "learning_rate": 4.776638486953393e-06, "loss": 0.4011, "step": 2330 }, { "epoch": 1.6887225307896643, "grad_norm": 0.33200764656066895, "learning_rate": 4.772427004335183e-06, "loss": 0.3808, "step": 2331 }, { "epoch": 1.6894469934798357, "grad_norm": 0.32425665855407715, "learning_rate": 4.7682156835006665e-06, "loss": 0.4003, "step": 2332 }, { "epoch": 1.6901714561700072, "grad_norm": 0.32554543018341064, "learning_rate": 4.764004527443708e-06, "loss": 0.4198, "step": 2333 }, { "epoch": 1.6908959188601786, "grad_norm": 0.3496548533439636, "learning_rate": 4.759793539158057e-06, "loss": 0.3934, "step": 2334 }, { "epoch": 1.69162038155035, "grad_norm": 0.30950412154197693, "learning_rate": 4.755582721637344e-06, "loss": 0.3562, "step": 2335 }, { "epoch": 1.6923448442405216, "grad_norm": 0.36736756563186646, "learning_rate": 4.751372077875077e-06, "loss": 0.4053, "step": 2336 }, { "epoch": 1.693069306930693, "grad_norm": 0.36911463737487793, "learning_rate": 4.747161610864639e-06, "loss": 0.4147, "step": 2337 }, { "epoch": 1.6937937696208645, "grad_norm": 0.3060970604419708, "learning_rate": 4.74295132359929e-06, "loss": 0.3794, "step": 2338 }, { "epoch": 1.694518232311036, "grad_norm": 0.33386915922164917, "learning_rate": 4.738741219072161e-06, "loss": 0.4025, "step": 2339 }, { "epoch": 1.6952426950012074, "grad_norm": 0.3247238099575043, "learning_rate": 4.7345313002762545e-06, "loss": 0.4186, "step": 2340 }, { "epoch": 1.6959671576913788, "grad_norm": 0.3210209310054779, "learning_rate": 4.730321570204436e-06, "loss": 0.4169, "step": 2341 }, { "epoch": 1.6966916203815503, "grad_norm": 0.31688159704208374, "learning_rate": 4.726112031849446e-06, "loss": 0.3703, "step": 2342 }, { "epoch": 1.6974160830717218, "grad_norm": 0.32272112369537354, "learning_rate": 4.721902688203879e-06, "loss": 0.3898, "step": 2343 }, { "epoch": 1.6981405457618932, "grad_norm": 0.3066641092300415, "learning_rate": 4.717693542260196e-06, "loss": 0.3867, "step": 2344 }, { "epoch": 1.6988650084520647, "grad_norm": 0.3034276068210602, "learning_rate": 4.713484597010718e-06, "loss": 0.3779, "step": 2345 }, { "epoch": 1.6995894711422361, "grad_norm": 0.32722848653793335, "learning_rate": 4.7092758554476215e-06, "loss": 0.4223, "step": 2346 }, { "epoch": 1.7003139338324076, "grad_norm": 0.31734853982925415, "learning_rate": 4.705067320562939e-06, "loss": 0.4007, "step": 2347 }, { "epoch": 1.701038396522579, "grad_norm": 0.3259417712688446, "learning_rate": 4.700858995348556e-06, "loss": 0.3737, "step": 2348 }, { "epoch": 1.7017628592127505, "grad_norm": 0.33916088938713074, "learning_rate": 4.696650882796207e-06, "loss": 0.3801, "step": 2349 }, { "epoch": 1.702487321902922, "grad_norm": 0.3260381519794464, "learning_rate": 4.692442985897481e-06, "loss": 0.3896, "step": 2350 }, { "epoch": 1.7032117845930934, "grad_norm": 0.2915399968624115, "learning_rate": 4.688235307643807e-06, "loss": 0.3538, "step": 2351 }, { "epoch": 1.7039362472832649, "grad_norm": 0.3365890085697174, "learning_rate": 4.6840278510264595e-06, "loss": 0.4336, "step": 2352 }, { "epoch": 1.7046607099734363, "grad_norm": 0.34256502985954285, "learning_rate": 4.679820619036562e-06, "loss": 0.3847, "step": 2353 }, { "epoch": 1.7053851726636078, "grad_norm": 0.3412240147590637, "learning_rate": 4.67561361466507e-06, "loss": 0.3925, "step": 2354 }, { "epoch": 1.7061096353537792, "grad_norm": 0.33031997084617615, "learning_rate": 4.671406840902782e-06, "loss": 0.4392, "step": 2355 }, { "epoch": 1.7068340980439507, "grad_norm": 0.3392753005027771, "learning_rate": 4.667200300740333e-06, "loss": 0.3659, "step": 2356 }, { "epoch": 1.7075585607341222, "grad_norm": 0.35722601413726807, "learning_rate": 4.662993997168187e-06, "loss": 0.4032, "step": 2357 }, { "epoch": 1.7082830234242936, "grad_norm": 0.3063857853412628, "learning_rate": 4.6587879331766465e-06, "loss": 0.3781, "step": 2358 }, { "epoch": 1.709007486114465, "grad_norm": 0.3209799826145172, "learning_rate": 4.6545821117558375e-06, "loss": 0.3867, "step": 2359 }, { "epoch": 1.7097319488046365, "grad_norm": 0.3584285080432892, "learning_rate": 4.650376535895717e-06, "loss": 0.4026, "step": 2360 }, { "epoch": 1.710456411494808, "grad_norm": 0.37693846225738525, "learning_rate": 4.646171208586069e-06, "loss": 0.4138, "step": 2361 }, { "epoch": 1.7111808741849794, "grad_norm": 0.3283816874027252, "learning_rate": 4.641966132816496e-06, "loss": 0.378, "step": 2362 }, { "epoch": 1.711905336875151, "grad_norm": 0.3781837522983551, "learning_rate": 4.637761311576424e-06, "loss": 0.4121, "step": 2363 }, { "epoch": 1.7126297995653224, "grad_norm": 0.3802495002746582, "learning_rate": 4.633556747855102e-06, "loss": 0.4259, "step": 2364 }, { "epoch": 1.7133542622554938, "grad_norm": 0.33253249526023865, "learning_rate": 4.629352444641588e-06, "loss": 0.3736, "step": 2365 }, { "epoch": 1.7140787249456653, "grad_norm": 0.325011283159256, "learning_rate": 4.62514840492476e-06, "loss": 0.3921, "step": 2366 }, { "epoch": 1.7148031876358367, "grad_norm": 0.3923077881336212, "learning_rate": 4.620944631693309e-06, "loss": 0.4218, "step": 2367 }, { "epoch": 1.7155276503260082, "grad_norm": 0.367877334356308, "learning_rate": 4.616741127935734e-06, "loss": 0.3516, "step": 2368 }, { "epoch": 1.7162521130161796, "grad_norm": 0.394596666097641, "learning_rate": 4.6125378966403465e-06, "loss": 0.47, "step": 2369 }, { "epoch": 1.716976575706351, "grad_norm": 0.32668572664260864, "learning_rate": 4.608334940795258e-06, "loss": 0.383, "step": 2370 }, { "epoch": 1.7177010383965226, "grad_norm": 0.38756120204925537, "learning_rate": 4.604132263388389e-06, "loss": 0.387, "step": 2371 }, { "epoch": 1.718425501086694, "grad_norm": 0.36521628499031067, "learning_rate": 4.599929867407461e-06, "loss": 0.4123, "step": 2372 }, { "epoch": 1.7191499637768655, "grad_norm": 0.3077073097229004, "learning_rate": 4.595727755839994e-06, "loss": 0.3556, "step": 2373 }, { "epoch": 1.719874426467037, "grad_norm": 0.30571871995925903, "learning_rate": 4.591525931673308e-06, "loss": 0.3816, "step": 2374 }, { "epoch": 1.7205988891572084, "grad_norm": 0.32010406255722046, "learning_rate": 4.587324397894519e-06, "loss": 0.3761, "step": 2375 }, { "epoch": 1.7213233518473798, "grad_norm": 0.33599019050598145, "learning_rate": 4.583123157490533e-06, "loss": 0.4231, "step": 2376 }, { "epoch": 1.7220478145375513, "grad_norm": 0.31629428267478943, "learning_rate": 4.57892221344805e-06, "loss": 0.3788, "step": 2377 }, { "epoch": 1.7227722772277227, "grad_norm": 0.31623411178588867, "learning_rate": 4.574721568753561e-06, "loss": 0.3951, "step": 2378 }, { "epoch": 1.7234967399178942, "grad_norm": 0.3203562796115875, "learning_rate": 4.570521226393339e-06, "loss": 0.3867, "step": 2379 }, { "epoch": 1.7242212026080657, "grad_norm": 0.3300575613975525, "learning_rate": 4.566321189353447e-06, "loss": 0.3789, "step": 2380 }, { "epoch": 1.7249456652982371, "grad_norm": 0.3490508496761322, "learning_rate": 4.562121460619729e-06, "loss": 0.4352, "step": 2381 }, { "epoch": 1.7256701279884086, "grad_norm": 0.3462974727153778, "learning_rate": 4.55792204317781e-06, "loss": 0.4166, "step": 2382 }, { "epoch": 1.72639459067858, "grad_norm": 0.32870250940322876, "learning_rate": 4.553722940013094e-06, "loss": 0.4104, "step": 2383 }, { "epoch": 1.7271190533687515, "grad_norm": 0.29451242089271545, "learning_rate": 4.549524154110762e-06, "loss": 0.3695, "step": 2384 }, { "epoch": 1.727843516058923, "grad_norm": 0.31492504477500916, "learning_rate": 4.545325688455766e-06, "loss": 0.4022, "step": 2385 }, { "epoch": 1.7285679787490944, "grad_norm": 0.30916357040405273, "learning_rate": 4.5411275460328354e-06, "loss": 0.4074, "step": 2386 }, { "epoch": 1.7292924414392659, "grad_norm": 0.29156169295310974, "learning_rate": 4.536929729826467e-06, "loss": 0.3278, "step": 2387 }, { "epoch": 1.7300169041294373, "grad_norm": 0.3226005434989929, "learning_rate": 4.5327322428209245e-06, "loss": 0.419, "step": 2388 }, { "epoch": 1.7307413668196088, "grad_norm": 0.2870633006095886, "learning_rate": 4.528535088000242e-06, "loss": 0.3777, "step": 2389 }, { "epoch": 1.7314658295097802, "grad_norm": 0.3297814130783081, "learning_rate": 4.524338268348213e-06, "loss": 0.439, "step": 2390 }, { "epoch": 1.7321902921999517, "grad_norm": 0.3346737325191498, "learning_rate": 4.520141786848396e-06, "loss": 0.443, "step": 2391 }, { "epoch": 1.7329147548901231, "grad_norm": 0.32241198420524597, "learning_rate": 4.515945646484105e-06, "loss": 0.3743, "step": 2392 }, { "epoch": 1.7336392175802946, "grad_norm": 0.3195786476135254, "learning_rate": 4.511749850238415e-06, "loss": 0.3783, "step": 2393 }, { "epoch": 1.734363680270466, "grad_norm": 0.30316510796546936, "learning_rate": 4.507554401094157e-06, "loss": 0.3859, "step": 2394 }, { "epoch": 1.7350881429606375, "grad_norm": 0.3466293513774872, "learning_rate": 4.503359302033912e-06, "loss": 0.4122, "step": 2395 }, { "epoch": 1.735812605650809, "grad_norm": 0.32934442162513733, "learning_rate": 4.499164556040013e-06, "loss": 0.3856, "step": 2396 }, { "epoch": 1.7365370683409804, "grad_norm": 0.3311391770839691, "learning_rate": 4.494970166094545e-06, "loss": 0.3924, "step": 2397 }, { "epoch": 1.7372615310311519, "grad_norm": 0.3087494969367981, "learning_rate": 4.490776135179335e-06, "loss": 0.3598, "step": 2398 }, { "epoch": 1.7379859937213233, "grad_norm": 0.33907613158226013, "learning_rate": 4.486582466275958e-06, "loss": 0.4284, "step": 2399 }, { "epoch": 1.7387104564114948, "grad_norm": 0.3223370313644409, "learning_rate": 4.4823891623657335e-06, "loss": 0.3616, "step": 2400 }, { "epoch": 1.7394349191016663, "grad_norm": 0.31962859630584717, "learning_rate": 4.478196226429716e-06, "loss": 0.3852, "step": 2401 }, { "epoch": 1.7401593817918377, "grad_norm": 0.34590834379196167, "learning_rate": 4.474003661448703e-06, "loss": 0.4377, "step": 2402 }, { "epoch": 1.7408838444820092, "grad_norm": 0.3512279987335205, "learning_rate": 4.469811470403228e-06, "loss": 0.3836, "step": 2403 }, { "epoch": 1.7416083071721806, "grad_norm": 0.33013856410980225, "learning_rate": 4.465619656273553e-06, "loss": 0.3841, "step": 2404 }, { "epoch": 1.742332769862352, "grad_norm": 0.31845635175704956, "learning_rate": 4.461428222039681e-06, "loss": 0.3714, "step": 2405 }, { "epoch": 1.7430572325525235, "grad_norm": 0.3103337287902832, "learning_rate": 4.457237170681338e-06, "loss": 0.4098, "step": 2406 }, { "epoch": 1.743781695242695, "grad_norm": 0.36141639947891235, "learning_rate": 4.45304650517798e-06, "loss": 0.4177, "step": 2407 }, { "epoch": 1.7445061579328665, "grad_norm": 0.30481991171836853, "learning_rate": 4.4488562285087914e-06, "loss": 0.3432, "step": 2408 }, { "epoch": 1.745230620623038, "grad_norm": 0.348529189825058, "learning_rate": 4.444666343652675e-06, "loss": 0.4206, "step": 2409 }, { "epoch": 1.7459550833132094, "grad_norm": 0.34889253973960876, "learning_rate": 4.440476853588257e-06, "loss": 0.4145, "step": 2410 }, { "epoch": 1.7466795460033808, "grad_norm": 0.32829952239990234, "learning_rate": 4.436287761293887e-06, "loss": 0.4156, "step": 2411 }, { "epoch": 1.7474040086935523, "grad_norm": 0.32565319538116455, "learning_rate": 4.432099069747625e-06, "loss": 0.3641, "step": 2412 }, { "epoch": 1.7481284713837237, "grad_norm": 0.3057096600532532, "learning_rate": 4.4279107819272525e-06, "loss": 0.393, "step": 2413 }, { "epoch": 1.7488529340738952, "grad_norm": 0.29947906732559204, "learning_rate": 4.42372290081026e-06, "loss": 0.3745, "step": 2414 }, { "epoch": 1.7495773967640667, "grad_norm": 0.34195610880851746, "learning_rate": 4.4195354293738484e-06, "loss": 0.4117, "step": 2415 }, { "epoch": 1.750301859454238, "grad_norm": 0.2950175404548645, "learning_rate": 4.415348370594931e-06, "loss": 0.3839, "step": 2416 }, { "epoch": 1.7510263221444096, "grad_norm": 0.3165358901023865, "learning_rate": 4.411161727450126e-06, "loss": 0.4313, "step": 2417 }, { "epoch": 1.751750784834581, "grad_norm": 0.3132931888103485, "learning_rate": 4.406975502915753e-06, "loss": 0.3709, "step": 2418 }, { "epoch": 1.7524752475247525, "grad_norm": 0.3081265687942505, "learning_rate": 4.402789699967838e-06, "loss": 0.3778, "step": 2419 }, { "epoch": 1.753199710214924, "grad_norm": 0.3153415620326996, "learning_rate": 4.398604321582106e-06, "loss": 0.3944, "step": 2420 }, { "epoch": 1.7539241729050954, "grad_norm": 0.33156657218933105, "learning_rate": 4.394419370733981e-06, "loss": 0.411, "step": 2421 }, { "epoch": 1.7546486355952668, "grad_norm": 0.3202776312828064, "learning_rate": 4.390234850398581e-06, "loss": 0.4228, "step": 2422 }, { "epoch": 1.7553730982854383, "grad_norm": 0.31378868222236633, "learning_rate": 4.38605076355072e-06, "loss": 0.3661, "step": 2423 }, { "epoch": 1.7560975609756098, "grad_norm": 0.3148961663246155, "learning_rate": 4.3818671131649046e-06, "loss": 0.3884, "step": 2424 }, { "epoch": 1.7568220236657812, "grad_norm": 0.3286360800266266, "learning_rate": 4.377683902215326e-06, "loss": 0.3823, "step": 2425 }, { "epoch": 1.7575464863559527, "grad_norm": 0.32553762197494507, "learning_rate": 4.373501133675869e-06, "loss": 0.4048, "step": 2426 }, { "epoch": 1.7582709490461241, "grad_norm": 0.32776960730552673, "learning_rate": 4.369318810520101e-06, "loss": 0.369, "step": 2427 }, { "epoch": 1.7589954117362956, "grad_norm": 0.3417339622974396, "learning_rate": 4.365136935721273e-06, "loss": 0.3951, "step": 2428 }, { "epoch": 1.759719874426467, "grad_norm": 0.30308979749679565, "learning_rate": 4.360955512252317e-06, "loss": 0.3834, "step": 2429 }, { "epoch": 1.7604443371166385, "grad_norm": 0.348335325717926, "learning_rate": 4.356774543085845e-06, "loss": 0.432, "step": 2430 }, { "epoch": 1.76116879980681, "grad_norm": 0.31420180201530457, "learning_rate": 4.352594031194147e-06, "loss": 0.3924, "step": 2431 }, { "epoch": 1.7618932624969814, "grad_norm": 0.3068479597568512, "learning_rate": 4.348413979549186e-06, "loss": 0.3445, "step": 2432 }, { "epoch": 1.7626177251871529, "grad_norm": 0.36851969361305237, "learning_rate": 4.344234391122596e-06, "loss": 0.4379, "step": 2433 }, { "epoch": 1.7633421878773243, "grad_norm": 0.3639955222606659, "learning_rate": 4.3400552688856866e-06, "loss": 0.4343, "step": 2434 }, { "epoch": 1.7640666505674958, "grad_norm": 0.31505608558654785, "learning_rate": 4.335876615809434e-06, "loss": 0.3752, "step": 2435 }, { "epoch": 1.7647911132576672, "grad_norm": 0.37496721744537354, "learning_rate": 4.331698434864476e-06, "loss": 0.4251, "step": 2436 }, { "epoch": 1.7655155759478387, "grad_norm": 0.3312278687953949, "learning_rate": 4.327520729021125e-06, "loss": 0.3898, "step": 2437 }, { "epoch": 1.7662400386380102, "grad_norm": 0.2922884225845337, "learning_rate": 4.323343501249346e-06, "loss": 0.36, "step": 2438 }, { "epoch": 1.7669645013281816, "grad_norm": 0.36977240443229675, "learning_rate": 4.319166754518768e-06, "loss": 0.4288, "step": 2439 }, { "epoch": 1.767688964018353, "grad_norm": 0.33515849709510803, "learning_rate": 4.31499049179868e-06, "loss": 0.3793, "step": 2440 }, { "epoch": 1.7684134267085245, "grad_norm": 0.3101891279220581, "learning_rate": 4.310814716058022e-06, "loss": 0.3784, "step": 2441 }, { "epoch": 1.769137889398696, "grad_norm": 0.3293270170688629, "learning_rate": 4.306639430265392e-06, "loss": 0.3991, "step": 2442 }, { "epoch": 1.7698623520888674, "grad_norm": 0.3535301685333252, "learning_rate": 4.302464637389039e-06, "loss": 0.4162, "step": 2443 }, { "epoch": 1.770586814779039, "grad_norm": 0.32309553027153015, "learning_rate": 4.29829034039686e-06, "loss": 0.3675, "step": 2444 }, { "epoch": 1.7713112774692104, "grad_norm": 0.3285590410232544, "learning_rate": 4.2941165422564004e-06, "loss": 0.414, "step": 2445 }, { "epoch": 1.7720357401593818, "grad_norm": 0.3214424252510071, "learning_rate": 4.289943245934851e-06, "loss": 0.3832, "step": 2446 }, { "epoch": 1.7727602028495533, "grad_norm": 0.3389798402786255, "learning_rate": 4.285770454399045e-06, "loss": 0.4155, "step": 2447 }, { "epoch": 1.7734846655397247, "grad_norm": 0.2874317169189453, "learning_rate": 4.28159817061546e-06, "loss": 0.362, "step": 2448 }, { "epoch": 1.7742091282298962, "grad_norm": 0.3093124032020569, "learning_rate": 4.277426397550208e-06, "loss": 0.3897, "step": 2449 }, { "epoch": 1.7749335909200676, "grad_norm": 0.3466958999633789, "learning_rate": 4.273255138169039e-06, "loss": 0.4506, "step": 2450 }, { "epoch": 1.775658053610239, "grad_norm": 0.3144570589065552, "learning_rate": 4.2690843954373415e-06, "loss": 0.3765, "step": 2451 }, { "epoch": 1.7763825163004106, "grad_norm": 0.2989591658115387, "learning_rate": 4.264914172320132e-06, "loss": 0.3786, "step": 2452 }, { "epoch": 1.777106978990582, "grad_norm": 0.35673820972442627, "learning_rate": 4.260744471782061e-06, "loss": 0.3911, "step": 2453 }, { "epoch": 1.7778314416807535, "grad_norm": 0.31848907470703125, "learning_rate": 4.256575296787406e-06, "loss": 0.4139, "step": 2454 }, { "epoch": 1.778555904370925, "grad_norm": 0.28793367743492126, "learning_rate": 4.25240665030007e-06, "loss": 0.3524, "step": 2455 }, { "epoch": 1.7792803670610964, "grad_norm": 0.36275744438171387, "learning_rate": 4.248238535283584e-06, "loss": 0.413, "step": 2456 }, { "epoch": 1.7800048297512678, "grad_norm": 0.35471218824386597, "learning_rate": 4.244070954701096e-06, "loss": 0.4042, "step": 2457 }, { "epoch": 1.7807292924414393, "grad_norm": 0.28470897674560547, "learning_rate": 4.2399039115153765e-06, "loss": 0.3728, "step": 2458 }, { "epoch": 1.7814537551316107, "grad_norm": 0.31690385937690735, "learning_rate": 4.235737408688816e-06, "loss": 0.4274, "step": 2459 }, { "epoch": 1.7821782178217822, "grad_norm": 0.33197805285453796, "learning_rate": 4.231571449183417e-06, "loss": 0.415, "step": 2460 }, { "epoch": 1.7829026805119537, "grad_norm": 0.3311344087123871, "learning_rate": 4.227406035960798e-06, "loss": 0.4204, "step": 2461 }, { "epoch": 1.7836271432021251, "grad_norm": 0.3302258253097534, "learning_rate": 4.223241171982191e-06, "loss": 0.4169, "step": 2462 }, { "epoch": 1.7843516058922966, "grad_norm": 0.27956563234329224, "learning_rate": 4.2190768602084306e-06, "loss": 0.3694, "step": 2463 }, { "epoch": 1.785076068582468, "grad_norm": 0.3153659999370575, "learning_rate": 4.2149131035999685e-06, "loss": 0.4002, "step": 2464 }, { "epoch": 1.7858005312726395, "grad_norm": 0.31823721528053284, "learning_rate": 4.210749905116855e-06, "loss": 0.3686, "step": 2465 }, { "epoch": 1.786524993962811, "grad_norm": 0.36916086077690125, "learning_rate": 4.206587267718743e-06, "loss": 0.4432, "step": 2466 }, { "epoch": 1.7872494566529824, "grad_norm": 0.29649025201797485, "learning_rate": 4.2024251943648945e-06, "loss": 0.3815, "step": 2467 }, { "epoch": 1.7879739193431539, "grad_norm": 0.33526042103767395, "learning_rate": 4.1982636880141595e-06, "loss": 0.4325, "step": 2468 }, { "epoch": 1.7886983820333253, "grad_norm": 0.2957707345485687, "learning_rate": 4.194102751624992e-06, "loss": 0.3297, "step": 2469 }, { "epoch": 1.7894228447234968, "grad_norm": 0.36815062165260315, "learning_rate": 4.18994238815544e-06, "loss": 0.4577, "step": 2470 }, { "epoch": 1.7901473074136682, "grad_norm": 0.32459428906440735, "learning_rate": 4.185782600563143e-06, "loss": 0.4133, "step": 2471 }, { "epoch": 1.7908717701038397, "grad_norm": 0.324133962392807, "learning_rate": 4.1816233918053316e-06, "loss": 0.4158, "step": 2472 }, { "epoch": 1.7915962327940111, "grad_norm": 0.3185576796531677, "learning_rate": 4.177464764838826e-06, "loss": 0.3545, "step": 2473 }, { "epoch": 1.7923206954841826, "grad_norm": 0.36184534430503845, "learning_rate": 4.173306722620027e-06, "loss": 0.4353, "step": 2474 }, { "epoch": 1.793045158174354, "grad_norm": 0.305524617433548, "learning_rate": 4.1691492681049305e-06, "loss": 0.3676, "step": 2475 }, { "epoch": 1.7937696208645255, "grad_norm": 0.3127298951148987, "learning_rate": 4.164992404249105e-06, "loss": 0.3804, "step": 2476 }, { "epoch": 1.794494083554697, "grad_norm": 0.37205204367637634, "learning_rate": 4.1608361340077e-06, "loss": 0.4342, "step": 2477 }, { "epoch": 1.7952185462448684, "grad_norm": 0.2820008099079132, "learning_rate": 4.1566804603354515e-06, "loss": 0.302, "step": 2478 }, { "epoch": 1.7959430089350399, "grad_norm": 0.34038805961608887, "learning_rate": 4.1525253861866615e-06, "loss": 0.4094, "step": 2479 }, { "epoch": 1.7966674716252113, "grad_norm": 0.316672682762146, "learning_rate": 4.148370914515209e-06, "loss": 0.3844, "step": 2480 }, { "epoch": 1.7973919343153828, "grad_norm": 0.2943013608455658, "learning_rate": 4.144217048274548e-06, "loss": 0.3715, "step": 2481 }, { "epoch": 1.7981163970055543, "grad_norm": 0.34323546290397644, "learning_rate": 4.140063790417696e-06, "loss": 0.4292, "step": 2482 }, { "epoch": 1.7988408596957257, "grad_norm": 0.3251972496509552, "learning_rate": 4.135911143897246e-06, "loss": 0.3691, "step": 2483 }, { "epoch": 1.7995653223858972, "grad_norm": 0.3113522529602051, "learning_rate": 4.131759111665349e-06, "loss": 0.4026, "step": 2484 }, { "epoch": 1.8002897850760686, "grad_norm": 0.3037515878677368, "learning_rate": 4.127607696673721e-06, "loss": 0.4025, "step": 2485 }, { "epoch": 1.80101424776624, "grad_norm": 0.30430904030799866, "learning_rate": 4.123456901873642e-06, "loss": 0.3314, "step": 2486 }, { "epoch": 1.8017387104564115, "grad_norm": 0.3342970609664917, "learning_rate": 4.119306730215951e-06, "loss": 0.4159, "step": 2487 }, { "epoch": 1.802463173146583, "grad_norm": 0.34202149510383606, "learning_rate": 4.1151571846510385e-06, "loss": 0.408, "step": 2488 }, { "epoch": 1.8031876358367545, "grad_norm": 0.304066002368927, "learning_rate": 4.1110082681288575e-06, "loss": 0.3846, "step": 2489 }, { "epoch": 1.803912098526926, "grad_norm": 0.32395708560943604, "learning_rate": 4.106859983598909e-06, "loss": 0.3677, "step": 2490 }, { "epoch": 1.8046365612170974, "grad_norm": 0.30423903465270996, "learning_rate": 4.102712334010243e-06, "loss": 0.3739, "step": 2491 }, { "epoch": 1.8053610239072688, "grad_norm": 0.34012216329574585, "learning_rate": 4.098565322311465e-06, "loss": 0.394, "step": 2492 }, { "epoch": 1.8060854865974403, "grad_norm": 0.31681060791015625, "learning_rate": 4.094418951450721e-06, "loss": 0.3777, "step": 2493 }, { "epoch": 1.8068099492876117, "grad_norm": 0.31646066904067993, "learning_rate": 4.090273224375704e-06, "loss": 0.4176, "step": 2494 }, { "epoch": 1.8075344119777832, "grad_norm": 0.2965797185897827, "learning_rate": 4.086128144033649e-06, "loss": 0.3822, "step": 2495 }, { "epoch": 1.8082588746679547, "grad_norm": 0.33592602610588074, "learning_rate": 4.08198371337133e-06, "loss": 0.405, "step": 2496 }, { "epoch": 1.808983337358126, "grad_norm": 0.30193573236465454, "learning_rate": 4.077839935335061e-06, "loss": 0.4001, "step": 2497 }, { "epoch": 1.8097078000482976, "grad_norm": 0.3025062084197998, "learning_rate": 4.07369681287069e-06, "loss": 0.3991, "step": 2498 }, { "epoch": 1.810432262738469, "grad_norm": 0.29935869574546814, "learning_rate": 4.0695543489236004e-06, "loss": 0.3625, "step": 2499 }, { "epoch": 1.8111567254286405, "grad_norm": 0.3634847402572632, "learning_rate": 4.065412546438709e-06, "loss": 0.4184, "step": 2500 }, { "epoch": 1.811881188118812, "grad_norm": 0.317024290561676, "learning_rate": 4.061271408360459e-06, "loss": 0.3702, "step": 2501 }, { "epoch": 1.8126056508089832, "grad_norm": 0.3606925308704376, "learning_rate": 4.057130937632821e-06, "loss": 0.4134, "step": 2502 }, { "epoch": 1.8133301134991546, "grad_norm": 0.2921443581581116, "learning_rate": 4.052991137199297e-06, "loss": 0.3807, "step": 2503 }, { "epoch": 1.814054576189326, "grad_norm": 0.3099372684955597, "learning_rate": 4.048852010002904e-06, "loss": 0.3644, "step": 2504 }, { "epoch": 1.8147790388794975, "grad_norm": 0.3619023561477661, "learning_rate": 4.044713558986189e-06, "loss": 0.4608, "step": 2505 }, { "epoch": 1.815503501569669, "grad_norm": 0.303991436958313, "learning_rate": 4.040575787091212e-06, "loss": 0.3911, "step": 2506 }, { "epoch": 1.8162279642598405, "grad_norm": 0.29247355461120605, "learning_rate": 4.036438697259551e-06, "loss": 0.3993, "step": 2507 }, { "epoch": 1.816952426950012, "grad_norm": 0.3084902763366699, "learning_rate": 4.032302292432303e-06, "loss": 0.4064, "step": 2508 }, { "epoch": 1.8176768896401834, "grad_norm": 0.3287025988101959, "learning_rate": 4.0281665755500744e-06, "loss": 0.414, "step": 2509 }, { "epoch": 1.8184013523303548, "grad_norm": 0.31984007358551025, "learning_rate": 4.024031549552985e-06, "loss": 0.4014, "step": 2510 }, { "epoch": 1.8191258150205263, "grad_norm": 0.3324487805366516, "learning_rate": 4.01989721738066e-06, "loss": 0.3701, "step": 2511 }, { "epoch": 1.8198502777106977, "grad_norm": 0.3306279182434082, "learning_rate": 4.015763581972232e-06, "loss": 0.395, "step": 2512 }, { "epoch": 1.8205747404008692, "grad_norm": 0.3341876268386841, "learning_rate": 4.011630646266343e-06, "loss": 0.3865, "step": 2513 }, { "epoch": 1.8212992030910407, "grad_norm": 0.33625146746635437, "learning_rate": 4.007498413201132e-06, "loss": 0.4096, "step": 2514 }, { "epoch": 1.822023665781212, "grad_norm": 0.3339204490184784, "learning_rate": 4.0033668857142424e-06, "loss": 0.4052, "step": 2515 }, { "epoch": 1.8227481284713836, "grad_norm": 0.2877708673477173, "learning_rate": 3.999236066742813e-06, "loss": 0.3855, "step": 2516 }, { "epoch": 1.823472591161555, "grad_norm": 0.35168197751045227, "learning_rate": 3.9951059592234805e-06, "loss": 0.3806, "step": 2517 }, { "epoch": 1.8241970538517265, "grad_norm": 0.32686737179756165, "learning_rate": 3.990976566092377e-06, "loss": 0.389, "step": 2518 }, { "epoch": 1.824921516541898, "grad_norm": 0.33376309275627136, "learning_rate": 3.9868478902851235e-06, "loss": 0.4261, "step": 2519 }, { "epoch": 1.8256459792320694, "grad_norm": 0.3070868253707886, "learning_rate": 3.982719934736832e-06, "loss": 0.3837, "step": 2520 }, { "epoch": 1.8263704419222409, "grad_norm": 0.31073880195617676, "learning_rate": 3.978592702382106e-06, "loss": 0.3908, "step": 2521 }, { "epoch": 1.8270949046124123, "grad_norm": 0.3291935324668884, "learning_rate": 3.97446619615503e-06, "loss": 0.3628, "step": 2522 }, { "epoch": 1.8278193673025838, "grad_norm": 0.34498631954193115, "learning_rate": 3.970340418989177e-06, "loss": 0.3624, "step": 2523 }, { "epoch": 1.8285438299927552, "grad_norm": 0.3466966152191162, "learning_rate": 3.966215373817597e-06, "loss": 0.4152, "step": 2524 }, { "epoch": 1.8292682926829267, "grad_norm": 0.30525270104408264, "learning_rate": 3.962091063572821e-06, "loss": 0.3871, "step": 2525 }, { "epoch": 1.8299927553730981, "grad_norm": 0.28252360224723816, "learning_rate": 3.957967491186861e-06, "loss": 0.3485, "step": 2526 }, { "epoch": 1.8307172180632696, "grad_norm": 0.32016849517822266, "learning_rate": 3.9538446595912e-06, "loss": 0.4026, "step": 2527 }, { "epoch": 1.831441680753441, "grad_norm": 0.35020163655281067, "learning_rate": 3.949722571716795e-06, "loss": 0.4007, "step": 2528 }, { "epoch": 1.8321661434436125, "grad_norm": 0.2934373617172241, "learning_rate": 3.945601230494079e-06, "loss": 0.3956, "step": 2529 }, { "epoch": 1.832890606133784, "grad_norm": 0.2916053533554077, "learning_rate": 3.941480638852948e-06, "loss": 0.3723, "step": 2530 }, { "epoch": 1.8336150688239554, "grad_norm": 0.31129151582717896, "learning_rate": 3.937360799722767e-06, "loss": 0.3933, "step": 2531 }, { "epoch": 1.8343395315141269, "grad_norm": 0.343945175409317, "learning_rate": 3.9332417160323685e-06, "loss": 0.3927, "step": 2532 }, { "epoch": 1.8350639942042983, "grad_norm": 0.28367412090301514, "learning_rate": 3.929123390710045e-06, "loss": 0.3537, "step": 2533 }, { "epoch": 1.8357884568944698, "grad_norm": 0.3096967339515686, "learning_rate": 3.925005826683554e-06, "loss": 0.4107, "step": 2534 }, { "epoch": 1.8365129195846412, "grad_norm": 0.31752708554267883, "learning_rate": 3.920889026880106e-06, "loss": 0.4086, "step": 2535 }, { "epoch": 1.8372373822748127, "grad_norm": 0.33114853501319885, "learning_rate": 3.91677299422637e-06, "loss": 0.404, "step": 2536 }, { "epoch": 1.8379618449649842, "grad_norm": 0.3246402442455292, "learning_rate": 3.912657731648474e-06, "loss": 0.3809, "step": 2537 }, { "epoch": 1.8386863076551556, "grad_norm": 0.34434548020362854, "learning_rate": 3.9085432420719934e-06, "loss": 0.4333, "step": 2538 }, { "epoch": 1.839410770345327, "grad_norm": 0.29455846548080444, "learning_rate": 3.904429528421954e-06, "loss": 0.3559, "step": 2539 }, { "epoch": 1.8401352330354985, "grad_norm": 0.33347827196121216, "learning_rate": 3.900316593622837e-06, "loss": 0.4199, "step": 2540 }, { "epoch": 1.84085969572567, "grad_norm": 0.31510254740715027, "learning_rate": 3.896204440598561e-06, "loss": 0.3942, "step": 2541 }, { "epoch": 1.8415841584158414, "grad_norm": 0.310346782207489, "learning_rate": 3.892093072272493e-06, "loss": 0.4052, "step": 2542 }, { "epoch": 1.842308621106013, "grad_norm": 0.3230580985546112, "learning_rate": 3.887982491567443e-06, "loss": 0.382, "step": 2543 }, { "epoch": 1.8430330837961844, "grad_norm": 0.3596835434436798, "learning_rate": 3.883872701405658e-06, "loss": 0.3871, "step": 2544 }, { "epoch": 1.8437575464863558, "grad_norm": 0.39894187450408936, "learning_rate": 3.879763704708827e-06, "loss": 0.4112, "step": 2545 }, { "epoch": 1.8444820091765273, "grad_norm": 0.3760286271572113, "learning_rate": 3.875655504398071e-06, "loss": 0.433, "step": 2546 }, { "epoch": 1.8452064718666987, "grad_norm": 0.2909357249736786, "learning_rate": 3.871548103393947e-06, "loss": 0.346, "step": 2547 }, { "epoch": 1.8459309345568702, "grad_norm": 0.34009644389152527, "learning_rate": 3.8674415046164445e-06, "loss": 0.3733, "step": 2548 }, { "epoch": 1.8466553972470416, "grad_norm": 0.34754571318626404, "learning_rate": 3.8633357109849825e-06, "loss": 0.36, "step": 2549 }, { "epoch": 1.847379859937213, "grad_norm": 0.34383609890937805, "learning_rate": 3.859230725418403e-06, "loss": 0.4296, "step": 2550 }, { "epoch": 1.8481043226273846, "grad_norm": 0.3323667049407959, "learning_rate": 3.8551265508349785e-06, "loss": 0.3864, "step": 2551 }, { "epoch": 1.848828785317556, "grad_norm": 0.34040260314941406, "learning_rate": 3.851023190152406e-06, "loss": 0.4046, "step": 2552 }, { "epoch": 1.8495532480077275, "grad_norm": 0.32725152373313904, "learning_rate": 3.8469206462878e-06, "loss": 0.3814, "step": 2553 }, { "epoch": 1.850277710697899, "grad_norm": 0.31183984875679016, "learning_rate": 3.842818922157695e-06, "loss": 0.3995, "step": 2554 }, { "epoch": 1.8510021733880704, "grad_norm": 0.3272894322872162, "learning_rate": 3.838718020678045e-06, "loss": 0.4078, "step": 2555 }, { "epoch": 1.8517266360782418, "grad_norm": 0.33241987228393555, "learning_rate": 3.834617944764218e-06, "loss": 0.3849, "step": 2556 }, { "epoch": 1.8524510987684133, "grad_norm": 0.34762170910835266, "learning_rate": 3.830518697330994e-06, "loss": 0.4132, "step": 2557 }, { "epoch": 1.8531755614585848, "grad_norm": 0.36000311374664307, "learning_rate": 3.826420281292564e-06, "loss": 0.4504, "step": 2558 }, { "epoch": 1.8539000241487562, "grad_norm": 0.321247398853302, "learning_rate": 3.822322699562532e-06, "loss": 0.3868, "step": 2559 }, { "epoch": 1.8546244868389277, "grad_norm": 0.3324410319328308, "learning_rate": 3.818225955053903e-06, "loss": 0.3861, "step": 2560 }, { "epoch": 1.8553489495290991, "grad_norm": 0.3717401921749115, "learning_rate": 3.81413005067909e-06, "loss": 0.3836, "step": 2561 }, { "epoch": 1.8560734122192706, "grad_norm": 0.33221590518951416, "learning_rate": 3.8100349893499094e-06, "loss": 0.3872, "step": 2562 }, { "epoch": 1.856797874909442, "grad_norm": 0.33603984117507935, "learning_rate": 3.8059407739775767e-06, "loss": 0.3667, "step": 2563 }, { "epoch": 1.8575223375996135, "grad_norm": 0.34769198298454285, "learning_rate": 3.8018474074727052e-06, "loss": 0.3874, "step": 2564 }, { "epoch": 1.858246800289785, "grad_norm": 0.31434863805770874, "learning_rate": 3.797754892745309e-06, "loss": 0.3932, "step": 2565 }, { "epoch": 1.8589712629799564, "grad_norm": 0.3368460237979889, "learning_rate": 3.79366323270479e-06, "loss": 0.411, "step": 2566 }, { "epoch": 1.8596957256701279, "grad_norm": 0.3279944360256195, "learning_rate": 3.7895724302599496e-06, "loss": 0.376, "step": 2567 }, { "epoch": 1.8604201883602993, "grad_norm": 0.30102670192718506, "learning_rate": 3.7854824883189744e-06, "loss": 0.3519, "step": 2568 }, { "epoch": 1.8611446510504708, "grad_norm": 0.3417000472545624, "learning_rate": 3.78139340978944e-06, "loss": 0.4195, "step": 2569 }, { "epoch": 1.8618691137406422, "grad_norm": 0.3297842741012573, "learning_rate": 3.777305197578312e-06, "loss": 0.3621, "step": 2570 }, { "epoch": 1.8625935764308137, "grad_norm": 0.33751633763313293, "learning_rate": 3.7732178545919353e-06, "loss": 0.3908, "step": 2571 }, { "epoch": 1.8633180391209851, "grad_norm": 0.3620988726615906, "learning_rate": 3.769131383736038e-06, "loss": 0.4082, "step": 2572 }, { "epoch": 1.8640425018111566, "grad_norm": 0.3247729539871216, "learning_rate": 3.7650457879157317e-06, "loss": 0.374, "step": 2573 }, { "epoch": 1.864766964501328, "grad_norm": 0.2923401892185211, "learning_rate": 3.7609610700355014e-06, "loss": 0.3675, "step": 2574 }, { "epoch": 1.8654914271914995, "grad_norm": 0.3193333148956299, "learning_rate": 3.756877232999209e-06, "loss": 0.4153, "step": 2575 }, { "epoch": 1.866215889881671, "grad_norm": 0.33937421441078186, "learning_rate": 3.752794279710094e-06, "loss": 0.3906, "step": 2576 }, { "epoch": 1.8669403525718424, "grad_norm": 0.3161516487598419, "learning_rate": 3.748712213070762e-06, "loss": 0.3555, "step": 2577 }, { "epoch": 1.8676648152620139, "grad_norm": 0.32917025685310364, "learning_rate": 3.744631035983193e-06, "loss": 0.3977, "step": 2578 }, { "epoch": 1.8683892779521853, "grad_norm": 0.32567039132118225, "learning_rate": 3.7405507513487307e-06, "loss": 0.4218, "step": 2579 }, { "epoch": 1.8691137406423568, "grad_norm": 0.31177181005477905, "learning_rate": 3.736471362068087e-06, "loss": 0.3969, "step": 2580 }, { "epoch": 1.8698382033325283, "grad_norm": 0.3367050290107727, "learning_rate": 3.732392871041337e-06, "loss": 0.3931, "step": 2581 }, { "epoch": 1.8705626660226997, "grad_norm": 0.2991842031478882, "learning_rate": 3.7283152811679165e-06, "loss": 0.3396, "step": 2582 }, { "epoch": 1.8712871287128712, "grad_norm": 0.3000541925430298, "learning_rate": 3.724238595346619e-06, "loss": 0.3554, "step": 2583 }, { "epoch": 1.8720115914030426, "grad_norm": 0.3330483138561249, "learning_rate": 3.720162816475601e-06, "loss": 0.4659, "step": 2584 }, { "epoch": 1.872736054093214, "grad_norm": 0.3269869387149811, "learning_rate": 3.716087947452367e-06, "loss": 0.4261, "step": 2585 }, { "epoch": 1.8734605167833855, "grad_norm": 0.30642077326774597, "learning_rate": 3.7120139911737792e-06, "loss": 0.3999, "step": 2586 }, { "epoch": 1.874184979473557, "grad_norm": 0.32908299565315247, "learning_rate": 3.707940950536051e-06, "loss": 0.3826, "step": 2587 }, { "epoch": 1.8749094421637285, "grad_norm": 0.29113489389419556, "learning_rate": 3.7038688284347407e-06, "loss": 0.3607, "step": 2588 }, { "epoch": 1.8756339048539, "grad_norm": 0.3119388222694397, "learning_rate": 3.6997976277647607e-06, "loss": 0.4272, "step": 2589 }, { "epoch": 1.8763583675440714, "grad_norm": 0.3445422947406769, "learning_rate": 3.695727351420361e-06, "loss": 0.4047, "step": 2590 }, { "epoch": 1.8770828302342428, "grad_norm": 0.3322848379611969, "learning_rate": 3.6916580022951385e-06, "loss": 0.3828, "step": 2591 }, { "epoch": 1.8778072929244143, "grad_norm": 0.3181737959384918, "learning_rate": 3.687589583282031e-06, "loss": 0.3978, "step": 2592 }, { "epoch": 1.8785317556145857, "grad_norm": 0.3022726774215698, "learning_rate": 3.6835220972733142e-06, "loss": 0.4004, "step": 2593 }, { "epoch": 1.8792562183047572, "grad_norm": 0.31538400053977966, "learning_rate": 3.679455547160599e-06, "loss": 0.3582, "step": 2594 }, { "epoch": 1.8799806809949287, "grad_norm": 0.33900386095046997, "learning_rate": 3.6753899358348344e-06, "loss": 0.3925, "step": 2595 }, { "epoch": 1.8807051436851001, "grad_norm": 0.3085217773914337, "learning_rate": 3.6713252661862997e-06, "loss": 0.3601, "step": 2596 }, { "epoch": 1.8814296063752716, "grad_norm": 0.329691618680954, "learning_rate": 3.6672615411046032e-06, "loss": 0.4093, "step": 2597 }, { "epoch": 1.882154069065443, "grad_norm": 0.29870209097862244, "learning_rate": 3.663198763478687e-06, "loss": 0.3919, "step": 2598 }, { "epoch": 1.8828785317556145, "grad_norm": 0.33096104860305786, "learning_rate": 3.6591369361968127e-06, "loss": 0.4504, "step": 2599 }, { "epoch": 1.883602994445786, "grad_norm": 0.29970309138298035, "learning_rate": 3.6550760621465733e-06, "loss": 0.3533, "step": 2600 }, { "epoch": 1.8843274571359574, "grad_norm": 0.3171185851097107, "learning_rate": 3.6510161442148783e-06, "loss": 0.3838, "step": 2601 }, { "epoch": 1.8850519198261289, "grad_norm": 0.30565112829208374, "learning_rate": 3.646957185287961e-06, "loss": 0.3854, "step": 2602 }, { "epoch": 1.8857763825163003, "grad_norm": 0.3313187062740326, "learning_rate": 3.642899188251372e-06, "loss": 0.4339, "step": 2603 }, { "epoch": 1.8865008452064718, "grad_norm": 0.28714123368263245, "learning_rate": 3.6388421559899794e-06, "loss": 0.3587, "step": 2604 }, { "epoch": 1.8872253078966432, "grad_norm": 0.35154253244400024, "learning_rate": 3.6347860913879605e-06, "loss": 0.4511, "step": 2605 }, { "epoch": 1.8879497705868147, "grad_norm": 0.29472634196281433, "learning_rate": 3.6307309973288096e-06, "loss": 0.3579, "step": 2606 }, { "epoch": 1.8886742332769861, "grad_norm": 0.35062575340270996, "learning_rate": 3.626676876695331e-06, "loss": 0.4127, "step": 2607 }, { "epoch": 1.8893986959671576, "grad_norm": 0.2921110987663269, "learning_rate": 3.6226237323696345e-06, "loss": 0.3782, "step": 2608 }, { "epoch": 1.890123158657329, "grad_norm": 0.33105379343032837, "learning_rate": 3.6185715672331355e-06, "loss": 0.3936, "step": 2609 }, { "epoch": 1.8908476213475005, "grad_norm": 0.30315330624580383, "learning_rate": 3.6145203841665577e-06, "loss": 0.3842, "step": 2610 }, { "epoch": 1.891572084037672, "grad_norm": 0.3216678500175476, "learning_rate": 3.610470186049921e-06, "loss": 0.4137, "step": 2611 }, { "epoch": 1.8922965467278434, "grad_norm": 0.327807754278183, "learning_rate": 3.606420975762549e-06, "loss": 0.3912, "step": 2612 }, { "epoch": 1.8930210094180149, "grad_norm": 0.31451210379600525, "learning_rate": 3.6023727561830623e-06, "loss": 0.4321, "step": 2613 }, { "epoch": 1.8937454721081863, "grad_norm": 0.30380043387413025, "learning_rate": 3.5983255301893762e-06, "loss": 0.4159, "step": 2614 }, { "epoch": 1.8944699347983578, "grad_norm": 0.30130138993263245, "learning_rate": 3.5942793006586985e-06, "loss": 0.3493, "step": 2615 }, { "epoch": 1.8951943974885292, "grad_norm": 0.3139849603176117, "learning_rate": 3.590234070467533e-06, "loss": 0.3806, "step": 2616 }, { "epoch": 1.8959188601787007, "grad_norm": 0.37083718180656433, "learning_rate": 3.586189842491668e-06, "loss": 0.4077, "step": 2617 }, { "epoch": 1.8966433228688722, "grad_norm": 0.303484171628952, "learning_rate": 3.582146619606184e-06, "loss": 0.379, "step": 2618 }, { "epoch": 1.8973677855590436, "grad_norm": 0.35708025097846985, "learning_rate": 3.578104404685442e-06, "loss": 0.4167, "step": 2619 }, { "epoch": 1.898092248249215, "grad_norm": 0.30566057562828064, "learning_rate": 3.5740632006030897e-06, "loss": 0.3624, "step": 2620 }, { "epoch": 1.8988167109393865, "grad_norm": 0.318827748298645, "learning_rate": 3.5700230102320565e-06, "loss": 0.4049, "step": 2621 }, { "epoch": 1.899541173629558, "grad_norm": 0.33695492148399353, "learning_rate": 3.5659838364445505e-06, "loss": 0.4104, "step": 2622 }, { "epoch": 1.9002656363197294, "grad_norm": 0.31925874948501587, "learning_rate": 3.561945682112054e-06, "loss": 0.3952, "step": 2623 }, { "epoch": 1.900990099009901, "grad_norm": 0.34101906418800354, "learning_rate": 3.55790855010533e-06, "loss": 0.4137, "step": 2624 }, { "epoch": 1.9017145617000724, "grad_norm": 0.28169217705726624, "learning_rate": 3.5538724432944118e-06, "loss": 0.352, "step": 2625 }, { "epoch": 1.9024390243902438, "grad_norm": 0.29794761538505554, "learning_rate": 3.5498373645486024e-06, "loss": 0.403, "step": 2626 }, { "epoch": 1.9031634870804153, "grad_norm": 0.33532536029815674, "learning_rate": 3.545803316736477e-06, "loss": 0.3841, "step": 2627 }, { "epoch": 1.9038879497705867, "grad_norm": 0.33239221572875977, "learning_rate": 3.5417703027258752e-06, "loss": 0.4059, "step": 2628 }, { "epoch": 1.9046124124607582, "grad_norm": 0.329262375831604, "learning_rate": 3.537738325383906e-06, "loss": 0.384, "step": 2629 }, { "epoch": 1.9053368751509296, "grad_norm": 0.31434616446495056, "learning_rate": 3.5337073875769357e-06, "loss": 0.3693, "step": 2630 }, { "epoch": 1.906061337841101, "grad_norm": 0.3240516185760498, "learning_rate": 3.5296774921705945e-06, "loss": 0.4095, "step": 2631 }, { "epoch": 1.9067858005312726, "grad_norm": 0.3313315808773041, "learning_rate": 3.525648642029772e-06, "loss": 0.3785, "step": 2632 }, { "epoch": 1.907510263221444, "grad_norm": 0.33397069573402405, "learning_rate": 3.521620840018615e-06, "loss": 0.3939, "step": 2633 }, { "epoch": 1.9082347259116155, "grad_norm": 0.3165329396724701, "learning_rate": 3.517594089000521e-06, "loss": 0.3881, "step": 2634 }, { "epoch": 1.908959188601787, "grad_norm": 0.3076345920562744, "learning_rate": 3.513568391838148e-06, "loss": 0.3924, "step": 2635 }, { "epoch": 1.9096836512919584, "grad_norm": 0.304934024810791, "learning_rate": 3.5095437513933973e-06, "loss": 0.3859, "step": 2636 }, { "epoch": 1.9104081139821298, "grad_norm": 0.3279597759246826, "learning_rate": 3.5055201705274223e-06, "loss": 0.3858, "step": 2637 }, { "epoch": 1.9111325766723013, "grad_norm": 0.33920207619667053, "learning_rate": 3.501497652100624e-06, "loss": 0.409, "step": 2638 }, { "epoch": 1.9118570393624728, "grad_norm": 0.30498650670051575, "learning_rate": 3.4974761989726456e-06, "loss": 0.3724, "step": 2639 }, { "epoch": 1.9125815020526442, "grad_norm": 0.3440050780773163, "learning_rate": 3.4934558140023765e-06, "loss": 0.3969, "step": 2640 }, { "epoch": 1.9133059647428157, "grad_norm": 0.31516459584236145, "learning_rate": 3.4894365000479436e-06, "loss": 0.3941, "step": 2641 }, { "epoch": 1.9140304274329871, "grad_norm": 0.2961702048778534, "learning_rate": 3.485418259966712e-06, "loss": 0.3522, "step": 2642 }, { "epoch": 1.9147548901231586, "grad_norm": 0.30971789360046387, "learning_rate": 3.481401096615287e-06, "loss": 0.414, "step": 2643 }, { "epoch": 1.91547935281333, "grad_norm": 0.308754026889801, "learning_rate": 3.4773850128495044e-06, "loss": 0.4211, "step": 2644 }, { "epoch": 1.9162038155035015, "grad_norm": 0.3132416009902954, "learning_rate": 3.473370011524435e-06, "loss": 0.3754, "step": 2645 }, { "epoch": 1.916928278193673, "grad_norm": 0.30660751461982727, "learning_rate": 3.46935609549438e-06, "loss": 0.382, "step": 2646 }, { "epoch": 1.9176527408838444, "grad_norm": 0.316356360912323, "learning_rate": 3.4653432676128686e-06, "loss": 0.4187, "step": 2647 }, { "epoch": 1.9183772035740159, "grad_norm": 0.279665470123291, "learning_rate": 3.4613315307326535e-06, "loss": 0.3675, "step": 2648 }, { "epoch": 1.9191016662641873, "grad_norm": 0.3164890706539154, "learning_rate": 3.4573208877057186e-06, "loss": 0.4521, "step": 2649 }, { "epoch": 1.9198261289543588, "grad_norm": 0.31564927101135254, "learning_rate": 3.453311341383263e-06, "loss": 0.3742, "step": 2650 }, { "epoch": 1.9205505916445302, "grad_norm": 0.3078301250934601, "learning_rate": 3.4493028946157123e-06, "loss": 0.3899, "step": 2651 }, { "epoch": 1.9212750543347017, "grad_norm": 0.3036668300628662, "learning_rate": 3.445295550252705e-06, "loss": 0.3767, "step": 2652 }, { "epoch": 1.9219995170248731, "grad_norm": 0.31020331382751465, "learning_rate": 3.441289311143099e-06, "loss": 0.3784, "step": 2653 }, { "epoch": 1.9227239797150446, "grad_norm": 0.305279940366745, "learning_rate": 3.437284180134966e-06, "loss": 0.3942, "step": 2654 }, { "epoch": 1.923448442405216, "grad_norm": 0.3050245940685272, "learning_rate": 3.4332801600755895e-06, "loss": 0.3647, "step": 2655 }, { "epoch": 1.9241729050953875, "grad_norm": 0.3166002929210663, "learning_rate": 3.4292772538114625e-06, "loss": 0.4024, "step": 2656 }, { "epoch": 1.924897367785559, "grad_norm": 0.33247441053390503, "learning_rate": 3.4252754641882903e-06, "loss": 0.3938, "step": 2657 }, { "epoch": 1.9256218304757304, "grad_norm": 0.335060179233551, "learning_rate": 3.4212747940509783e-06, "loss": 0.391, "step": 2658 }, { "epoch": 1.926346293165902, "grad_norm": 0.3366158902645111, "learning_rate": 3.417275246243638e-06, "loss": 0.402, "step": 2659 }, { "epoch": 1.9270707558560733, "grad_norm": 0.3354906141757965, "learning_rate": 3.4132768236095875e-06, "loss": 0.4586, "step": 2660 }, { "epoch": 1.9277952185462448, "grad_norm": 0.33138608932495117, "learning_rate": 3.409279528991338e-06, "loss": 0.3713, "step": 2661 }, { "epoch": 1.9285196812364163, "grad_norm": 0.3260311484336853, "learning_rate": 3.405283365230606e-06, "loss": 0.3864, "step": 2662 }, { "epoch": 1.9292441439265877, "grad_norm": 0.3395462930202484, "learning_rate": 3.4012883351682975e-06, "loss": 0.4255, "step": 2663 }, { "epoch": 1.9299686066167592, "grad_norm": 0.31118646264076233, "learning_rate": 3.397294441644515e-06, "loss": 0.3733, "step": 2664 }, { "epoch": 1.9306930693069306, "grad_norm": 0.3076493740081787, "learning_rate": 3.3933016874985556e-06, "loss": 0.3991, "step": 2665 }, { "epoch": 1.931417531997102, "grad_norm": 0.3038483262062073, "learning_rate": 3.3893100755689034e-06, "loss": 0.3499, "step": 2666 }, { "epoch": 1.9321419946872735, "grad_norm": 0.36161568760871887, "learning_rate": 3.3853196086932295e-06, "loss": 0.3987, "step": 2667 }, { "epoch": 1.932866457377445, "grad_norm": 0.3021342158317566, "learning_rate": 3.3813302897083955e-06, "loss": 0.3698, "step": 2668 }, { "epoch": 1.9335909200676165, "grad_norm": 0.33283731341362, "learning_rate": 3.3773421214504424e-06, "loss": 0.4195, "step": 2669 }, { "epoch": 1.934315382757788, "grad_norm": 0.3254534602165222, "learning_rate": 3.373355106754595e-06, "loss": 0.3894, "step": 2670 }, { "epoch": 1.9350398454479594, "grad_norm": 0.3032558262348175, "learning_rate": 3.36936924845526e-06, "loss": 0.3747, "step": 2671 }, { "epoch": 1.9357643081381308, "grad_norm": 0.39056456089019775, "learning_rate": 3.365384549386016e-06, "loss": 0.425, "step": 2672 }, { "epoch": 1.9364887708283023, "grad_norm": 0.2851409316062927, "learning_rate": 3.3614010123796257e-06, "loss": 0.3543, "step": 2673 }, { "epoch": 1.9372132335184737, "grad_norm": 0.2861112058162689, "learning_rate": 3.35741864026802e-06, "loss": 0.3628, "step": 2674 }, { "epoch": 1.9379376962086452, "grad_norm": 0.3547042906284332, "learning_rate": 3.353437435882301e-06, "loss": 0.4521, "step": 2675 }, { "epoch": 1.9386621588988167, "grad_norm": 0.3434026539325714, "learning_rate": 3.349457402052746e-06, "loss": 0.4044, "step": 2676 }, { "epoch": 1.9393866215889881, "grad_norm": 0.29710039496421814, "learning_rate": 3.3454785416087956e-06, "loss": 0.3634, "step": 2677 }, { "epoch": 1.9401110842791596, "grad_norm": 0.35346269607543945, "learning_rate": 3.3415008573790563e-06, "loss": 0.4333, "step": 2678 }, { "epoch": 1.940835546969331, "grad_norm": 0.3004611134529114, "learning_rate": 3.3375243521913016e-06, "loss": 0.3644, "step": 2679 }, { "epoch": 1.9415600096595025, "grad_norm": 0.32908594608306885, "learning_rate": 3.3335490288724637e-06, "loss": 0.3628, "step": 2680 }, { "epoch": 1.942284472349674, "grad_norm": 0.3201019763946533, "learning_rate": 3.329574890248636e-06, "loss": 0.3806, "step": 2681 }, { "epoch": 1.9430089350398454, "grad_norm": 0.3407033383846283, "learning_rate": 3.3256019391450696e-06, "loss": 0.4291, "step": 2682 }, { "epoch": 1.9437333977300169, "grad_norm": 0.3292374908924103, "learning_rate": 3.3216301783861705e-06, "loss": 0.4186, "step": 2683 }, { "epoch": 1.9444578604201883, "grad_norm": 0.32029443979263306, "learning_rate": 3.3176596107955006e-06, "loss": 0.3856, "step": 2684 }, { "epoch": 1.9451823231103598, "grad_norm": 0.3198530972003937, "learning_rate": 3.3136902391957713e-06, "loss": 0.3892, "step": 2685 }, { "epoch": 1.9459067858005312, "grad_norm": 0.30874866247177124, "learning_rate": 3.3097220664088446e-06, "loss": 0.3551, "step": 2686 }, { "epoch": 1.9466312484907027, "grad_norm": 0.31815972924232483, "learning_rate": 3.3057550952557316e-06, "loss": 0.4157, "step": 2687 }, { "epoch": 1.9473557111808741, "grad_norm": 0.3403359353542328, "learning_rate": 3.301789328556586e-06, "loss": 0.405, "step": 2688 }, { "epoch": 1.9480801738710456, "grad_norm": 0.32335472106933594, "learning_rate": 3.2978247691307065e-06, "loss": 0.4041, "step": 2689 }, { "epoch": 1.948804636561217, "grad_norm": 0.30931925773620605, "learning_rate": 3.293861419796537e-06, "loss": 0.3383, "step": 2690 }, { "epoch": 1.9495290992513885, "grad_norm": 0.33387255668640137, "learning_rate": 3.289899283371657e-06, "loss": 0.4367, "step": 2691 }, { "epoch": 1.95025356194156, "grad_norm": 0.29541435837745667, "learning_rate": 3.2859383626727824e-06, "loss": 0.3469, "step": 2692 }, { "epoch": 1.9509780246317314, "grad_norm": 0.3108336329460144, "learning_rate": 3.281978660515772e-06, "loss": 0.3846, "step": 2693 }, { "epoch": 1.9517024873219029, "grad_norm": 0.29408156871795654, "learning_rate": 3.2780201797156097e-06, "loss": 0.3643, "step": 2694 }, { "epoch": 1.9524269500120743, "grad_norm": 0.3179401755332947, "learning_rate": 3.274062923086418e-06, "loss": 0.3871, "step": 2695 }, { "epoch": 1.9531514127022458, "grad_norm": 0.3241508901119232, "learning_rate": 3.270106893441445e-06, "loss": 0.3746, "step": 2696 }, { "epoch": 1.9538758753924172, "grad_norm": 0.32593411207199097, "learning_rate": 3.266152093593069e-06, "loss": 0.4033, "step": 2697 }, { "epoch": 1.9546003380825887, "grad_norm": 0.3419671952724457, "learning_rate": 3.262198526352791e-06, "loss": 0.3831, "step": 2698 }, { "epoch": 1.9553248007727602, "grad_norm": 0.33371788263320923, "learning_rate": 3.2582461945312375e-06, "loss": 0.3777, "step": 2699 }, { "epoch": 1.9560492634629316, "grad_norm": 0.3311613202095032, "learning_rate": 3.2542951009381584e-06, "loss": 0.3959, "step": 2700 }, { "epoch": 1.956773726153103, "grad_norm": 0.31067049503326416, "learning_rate": 3.2503452483824204e-06, "loss": 0.3576, "step": 2701 }, { "epoch": 1.9574981888432745, "grad_norm": 0.3549036383628845, "learning_rate": 3.2463966396720103e-06, "loss": 0.4163, "step": 2702 }, { "epoch": 1.958222651533446, "grad_norm": 0.3137205243110657, "learning_rate": 3.24244927761403e-06, "loss": 0.3736, "step": 2703 }, { "epoch": 1.9589471142236174, "grad_norm": 0.34351855516433716, "learning_rate": 3.2385031650146926e-06, "loss": 0.3778, "step": 2704 }, { "epoch": 1.959671576913789, "grad_norm": 0.35983794927597046, "learning_rate": 3.234558304679327e-06, "loss": 0.373, "step": 2705 }, { "epoch": 1.9603960396039604, "grad_norm": 0.3339568078517914, "learning_rate": 3.23061469941237e-06, "loss": 0.3944, "step": 2706 }, { "epoch": 1.9611205022941318, "grad_norm": 0.35484594106674194, "learning_rate": 3.226672352017364e-06, "loss": 0.4088, "step": 2707 }, { "epoch": 1.9618449649843033, "grad_norm": 0.3216332793235779, "learning_rate": 3.2227312652969624e-06, "loss": 0.4007, "step": 2708 }, { "epoch": 1.9625694276744747, "grad_norm": 0.36426985263824463, "learning_rate": 3.2187914420529176e-06, "loss": 0.4133, "step": 2709 }, { "epoch": 1.9632938903646462, "grad_norm": 0.3414016664028168, "learning_rate": 3.214852885086084e-06, "loss": 0.3787, "step": 2710 }, { "epoch": 1.9640183530548176, "grad_norm": 0.3333376348018646, "learning_rate": 3.21091559719642e-06, "loss": 0.3919, "step": 2711 }, { "epoch": 1.964742815744989, "grad_norm": 0.3249971270561218, "learning_rate": 3.2069795811829772e-06, "loss": 0.3921, "step": 2712 }, { "epoch": 1.9654672784351606, "grad_norm": 0.30317679047584534, "learning_rate": 3.2030448398439063e-06, "loss": 0.3886, "step": 2713 }, { "epoch": 1.966191741125332, "grad_norm": 0.32020360231399536, "learning_rate": 3.1991113759764493e-06, "loss": 0.4053, "step": 2714 }, { "epoch": 1.9669162038155035, "grad_norm": 0.30379796028137207, "learning_rate": 3.1951791923769405e-06, "loss": 0.4055, "step": 2715 }, { "epoch": 1.967640666505675, "grad_norm": 0.3212195336818695, "learning_rate": 3.191248291840806e-06, "loss": 0.3614, "step": 2716 }, { "epoch": 1.9683651291958464, "grad_norm": 0.319909006357193, "learning_rate": 3.1873186771625586e-06, "loss": 0.3887, "step": 2717 }, { "epoch": 1.9690895918860178, "grad_norm": 0.3135465979576111, "learning_rate": 3.1833903511357943e-06, "loss": 0.363, "step": 2718 }, { "epoch": 1.9698140545761893, "grad_norm": 0.3065800666809082, "learning_rate": 3.1794633165531986e-06, "loss": 0.3905, "step": 2719 }, { "epoch": 1.9705385172663608, "grad_norm": 0.2869381904602051, "learning_rate": 3.1755375762065344e-06, "loss": 0.3675, "step": 2720 }, { "epoch": 1.9712629799565322, "grad_norm": 0.29706576466560364, "learning_rate": 3.171613132886645e-06, "loss": 0.3822, "step": 2721 }, { "epoch": 1.9719874426467037, "grad_norm": 0.2917563021183014, "learning_rate": 3.1676899893834545e-06, "loss": 0.3794, "step": 2722 }, { "epoch": 1.9727119053368751, "grad_norm": 0.29259005188941956, "learning_rate": 3.1637681484859582e-06, "loss": 0.3464, "step": 2723 }, { "epoch": 1.9734363680270466, "grad_norm": 0.28821343183517456, "learning_rate": 3.1598476129822314e-06, "loss": 0.3788, "step": 2724 }, { "epoch": 1.974160830717218, "grad_norm": 0.29924556612968445, "learning_rate": 3.1559283856594165e-06, "loss": 0.396, "step": 2725 }, { "epoch": 1.9748852934073895, "grad_norm": 0.30413416028022766, "learning_rate": 3.1520104693037263e-06, "loss": 0.3823, "step": 2726 }, { "epoch": 1.975609756097561, "grad_norm": 0.2992077171802521, "learning_rate": 3.148093866700445e-06, "loss": 0.3856, "step": 2727 }, { "epoch": 1.9763342187877324, "grad_norm": 0.28336745500564575, "learning_rate": 3.1441785806339196e-06, "loss": 0.4095, "step": 2728 }, { "epoch": 1.9770586814779039, "grad_norm": 0.3013747036457062, "learning_rate": 3.1402646138875615e-06, "loss": 0.4053, "step": 2729 }, { "epoch": 1.9777831441680753, "grad_norm": 0.31316709518432617, "learning_rate": 3.136351969243847e-06, "loss": 0.377, "step": 2730 }, { "epoch": 1.9785076068582468, "grad_norm": 0.31666168570518494, "learning_rate": 3.1324406494843095e-06, "loss": 0.4057, "step": 2731 }, { "epoch": 1.9792320695484182, "grad_norm": 0.3090651333332062, "learning_rate": 3.1285306573895398e-06, "loss": 0.3887, "step": 2732 }, { "epoch": 1.9799565322385897, "grad_norm": 0.29284149408340454, "learning_rate": 3.1246219957391887e-06, "loss": 0.3847, "step": 2733 }, { "epoch": 1.9806809949287612, "grad_norm": 0.3075655698776245, "learning_rate": 3.120714667311957e-06, "loss": 0.4278, "step": 2734 }, { "epoch": 1.9814054576189326, "grad_norm": 0.2901515066623688, "learning_rate": 3.116808674885603e-06, "loss": 0.3939, "step": 2735 }, { "epoch": 1.982129920309104, "grad_norm": 0.28223010897636414, "learning_rate": 3.1129040212369286e-06, "loss": 0.3807, "step": 2736 }, { "epoch": 1.9828543829992755, "grad_norm": 0.3215063214302063, "learning_rate": 3.1090007091417884e-06, "loss": 0.4221, "step": 2737 }, { "epoch": 1.983578845689447, "grad_norm": 0.2961074113845825, "learning_rate": 3.105098741375082e-06, "loss": 0.3691, "step": 2738 }, { "epoch": 1.9843033083796184, "grad_norm": 0.2840355336666107, "learning_rate": 3.1011981207107543e-06, "loss": 0.3807, "step": 2739 }, { "epoch": 1.98502777106979, "grad_norm": 0.3085200786590576, "learning_rate": 3.0972988499217903e-06, "loss": 0.3972, "step": 2740 }, { "epoch": 1.9857522337599613, "grad_norm": 0.3078588843345642, "learning_rate": 3.093400931780218e-06, "loss": 0.3844, "step": 2741 }, { "epoch": 1.9864766964501328, "grad_norm": 0.3302699029445648, "learning_rate": 3.0895043690571018e-06, "loss": 0.4559, "step": 2742 }, { "epoch": 1.9872011591403043, "grad_norm": 0.2889290153980255, "learning_rate": 3.0856091645225426e-06, "loss": 0.3642, "step": 2743 }, { "epoch": 1.9879256218304757, "grad_norm": 0.299554705619812, "learning_rate": 3.0817153209456786e-06, "loss": 0.3938, "step": 2744 }, { "epoch": 1.9886500845206472, "grad_norm": 0.29922357201576233, "learning_rate": 3.077822841094675e-06, "loss": 0.3984, "step": 2745 }, { "epoch": 1.9893745472108186, "grad_norm": 0.2906413972377777, "learning_rate": 3.0739317277367348e-06, "loss": 0.4228, "step": 2746 }, { "epoch": 1.99009900990099, "grad_norm": 0.299888014793396, "learning_rate": 3.070041983638083e-06, "loss": 0.3752, "step": 2747 }, { "epoch": 1.9908234725911615, "grad_norm": 0.31589099764823914, "learning_rate": 3.0661536115639727e-06, "loss": 0.3761, "step": 2748 }, { "epoch": 1.991547935281333, "grad_norm": 0.29152554273605347, "learning_rate": 3.0622666142786854e-06, "loss": 0.3406, "step": 2749 }, { "epoch": 1.9922723979715045, "grad_norm": 0.3315778970718384, "learning_rate": 3.058380994545521e-06, "loss": 0.4248, "step": 2750 }, { "epoch": 1.992996860661676, "grad_norm": 0.33971360325813293, "learning_rate": 3.054496755126801e-06, "loss": 0.3784, "step": 2751 }, { "epoch": 1.9937213233518474, "grad_norm": 0.3090791702270508, "learning_rate": 3.0506138987838683e-06, "loss": 0.4022, "step": 2752 }, { "epoch": 1.9944457860420188, "grad_norm": 0.323154091835022, "learning_rate": 3.0467324282770788e-06, "loss": 0.3901, "step": 2753 }, { "epoch": 1.9951702487321903, "grad_norm": 0.3101798892021179, "learning_rate": 3.0428523463658046e-06, "loss": 0.3522, "step": 2754 }, { "epoch": 1.9958947114223617, "grad_norm": 0.33997729420661926, "learning_rate": 3.0389736558084314e-06, "loss": 0.4409, "step": 2755 }, { "epoch": 1.9966191741125332, "grad_norm": 0.28854337334632874, "learning_rate": 3.035096359362355e-06, "loss": 0.342, "step": 2756 }, { "epoch": 1.9973436368027047, "grad_norm": 0.3312452435493469, "learning_rate": 3.0312204597839824e-06, "loss": 0.4101, "step": 2757 }, { "epoch": 1.9980680994928761, "grad_norm": 0.3240601718425751, "learning_rate": 3.0273459598287224e-06, "loss": 0.4084, "step": 2758 }, { "epoch": 1.9987925621830476, "grad_norm": 0.3721505105495453, "learning_rate": 3.0234728622509936e-06, "loss": 0.4332, "step": 2759 }, { "epoch": 1.999517024873219, "grad_norm": 0.31233182549476624, "learning_rate": 3.019601169804216e-06, "loss": 0.388, "step": 2760 }, { "epoch": 2.0002414875633905, "grad_norm": 0.5953847169876099, "learning_rate": 3.0157308852408105e-06, "loss": 0.6104, "step": 2761 }, { "epoch": 2.000965950253562, "grad_norm": 0.30753177404403687, "learning_rate": 3.0118620113121967e-06, "loss": 0.3546, "step": 2762 }, { "epoch": 2.0016904129437334, "grad_norm": 0.3058798611164093, "learning_rate": 3.007994550768793e-06, "loss": 0.3703, "step": 2763 }, { "epoch": 2.002414875633905, "grad_norm": 0.35165053606033325, "learning_rate": 3.004128506360011e-06, "loss": 0.4179, "step": 2764 }, { "epoch": 2.0031393383240763, "grad_norm": 0.30149659514427185, "learning_rate": 3.0002638808342565e-06, "loss": 0.3194, "step": 2765 }, { "epoch": 2.0038638010142478, "grad_norm": 0.30792176723480225, "learning_rate": 2.9964006769389277e-06, "loss": 0.3712, "step": 2766 }, { "epoch": 2.0045882637044192, "grad_norm": 0.3239831328392029, "learning_rate": 2.9925388974204097e-06, "loss": 0.3879, "step": 2767 }, { "epoch": 2.0053127263945907, "grad_norm": 0.3348630666732788, "learning_rate": 2.988678545024077e-06, "loss": 0.3774, "step": 2768 }, { "epoch": 2.006037189084762, "grad_norm": 0.2962486743927002, "learning_rate": 2.9848196224942895e-06, "loss": 0.358, "step": 2769 }, { "epoch": 2.0067616517749336, "grad_norm": 0.2949208915233612, "learning_rate": 2.980962132574389e-06, "loss": 0.3341, "step": 2770 }, { "epoch": 2.007486114465105, "grad_norm": 0.31907588243484497, "learning_rate": 2.9771060780066997e-06, "loss": 0.3658, "step": 2771 }, { "epoch": 2.0082105771552765, "grad_norm": 0.3289080560207367, "learning_rate": 2.973251461532527e-06, "loss": 0.3944, "step": 2772 }, { "epoch": 2.008935039845448, "grad_norm": 0.34461066126823425, "learning_rate": 2.96939828589215e-06, "loss": 0.392, "step": 2773 }, { "epoch": 2.0096595025356194, "grad_norm": 0.31556159257888794, "learning_rate": 2.965546553824829e-06, "loss": 0.3789, "step": 2774 }, { "epoch": 2.010383965225791, "grad_norm": 0.32769811153411865, "learning_rate": 2.9616962680687934e-06, "loss": 0.3829, "step": 2775 }, { "epoch": 2.0111084279159623, "grad_norm": 0.2961372137069702, "learning_rate": 2.957847431361245e-06, "loss": 0.3577, "step": 2776 }, { "epoch": 2.011832890606134, "grad_norm": 0.3235292434692383, "learning_rate": 2.954000046438359e-06, "loss": 0.3611, "step": 2777 }, { "epoch": 2.0125573532963053, "grad_norm": 0.3361559212207794, "learning_rate": 2.950154116035273e-06, "loss": 0.3708, "step": 2778 }, { "epoch": 2.0132818159864767, "grad_norm": 0.32306960225105286, "learning_rate": 2.946309642886097e-06, "loss": 0.3512, "step": 2779 }, { "epoch": 2.014006278676648, "grad_norm": 0.3191300630569458, "learning_rate": 2.942466629723899e-06, "loss": 0.4196, "step": 2780 }, { "epoch": 2.0147307413668196, "grad_norm": 0.3109561800956726, "learning_rate": 2.9386250792807124e-06, "loss": 0.3444, "step": 2781 }, { "epoch": 2.015455204056991, "grad_norm": 0.3405488133430481, "learning_rate": 2.9347849942875307e-06, "loss": 0.3815, "step": 2782 }, { "epoch": 2.0161796667471625, "grad_norm": 0.30972111225128174, "learning_rate": 2.9309463774743047e-06, "loss": 0.3725, "step": 2783 }, { "epoch": 2.016904129437334, "grad_norm": 0.31712624430656433, "learning_rate": 2.92710923156994e-06, "loss": 0.3593, "step": 2784 }, { "epoch": 2.0176285921275054, "grad_norm": 0.3189895451068878, "learning_rate": 2.923273559302301e-06, "loss": 0.3527, "step": 2785 }, { "epoch": 2.018353054817677, "grad_norm": 0.2885994017124176, "learning_rate": 2.919439363398199e-06, "loss": 0.344, "step": 2786 }, { "epoch": 2.0190775175078484, "grad_norm": 0.3162406086921692, "learning_rate": 2.9156066465833997e-06, "loss": 0.3716, "step": 2787 }, { "epoch": 2.01980198019802, "grad_norm": 0.33050769567489624, "learning_rate": 2.9117754115826156e-06, "loss": 0.379, "step": 2788 }, { "epoch": 2.0205264428881913, "grad_norm": 0.32729384303092957, "learning_rate": 2.907945661119509e-06, "loss": 0.3586, "step": 2789 }, { "epoch": 2.0212509055783627, "grad_norm": 0.3000594675540924, "learning_rate": 2.9041173979166813e-06, "loss": 0.3733, "step": 2790 }, { "epoch": 2.021975368268534, "grad_norm": 0.3136564791202545, "learning_rate": 2.9002906246956773e-06, "loss": 0.3819, "step": 2791 }, { "epoch": 2.0226998309587056, "grad_norm": 0.28469252586364746, "learning_rate": 2.896465344176987e-06, "loss": 0.3323, "step": 2792 }, { "epoch": 2.023424293648877, "grad_norm": 0.2991248071193695, "learning_rate": 2.8926415590800384e-06, "loss": 0.3738, "step": 2793 }, { "epoch": 2.0241487563390486, "grad_norm": 0.3112556040287018, "learning_rate": 2.888819272123191e-06, "loss": 0.3739, "step": 2794 }, { "epoch": 2.02487321902922, "grad_norm": 0.3409173786640167, "learning_rate": 2.884998486023746e-06, "loss": 0.4103, "step": 2795 }, { "epoch": 2.0255976817193915, "grad_norm": 0.27598410844802856, "learning_rate": 2.881179203497934e-06, "loss": 0.3137, "step": 2796 }, { "epoch": 2.026322144409563, "grad_norm": 0.350209504365921, "learning_rate": 2.8773614272609163e-06, "loss": 0.3926, "step": 2797 }, { "epoch": 2.0270466070997344, "grad_norm": 0.3045232892036438, "learning_rate": 2.8735451600267845e-06, "loss": 0.3261, "step": 2798 }, { "epoch": 2.027771069789906, "grad_norm": 0.3223300576210022, "learning_rate": 2.86973040450856e-06, "loss": 0.3897, "step": 2799 }, { "epoch": 2.0284955324800773, "grad_norm": 0.3174144923686981, "learning_rate": 2.865917163418183e-06, "loss": 0.3556, "step": 2800 }, { "epoch": 2.0292199951702488, "grad_norm": 0.32789120078086853, "learning_rate": 2.8621054394665233e-06, "loss": 0.4063, "step": 2801 }, { "epoch": 2.02994445786042, "grad_norm": 0.3329305350780487, "learning_rate": 2.8582952353633708e-06, "loss": 0.3663, "step": 2802 }, { "epoch": 2.0306689205505917, "grad_norm": 0.3433016836643219, "learning_rate": 2.8544865538174306e-06, "loss": 0.3933, "step": 2803 }, { "epoch": 2.031393383240763, "grad_norm": 0.3413502275943756, "learning_rate": 2.8506793975363313e-06, "loss": 0.3778, "step": 2804 }, { "epoch": 2.0321178459309346, "grad_norm": 0.32692089676856995, "learning_rate": 2.8468737692266146e-06, "loss": 0.3623, "step": 2805 }, { "epoch": 2.032842308621106, "grad_norm": 0.3519879877567291, "learning_rate": 2.843069671593734e-06, "loss": 0.3524, "step": 2806 }, { "epoch": 2.0335667713112775, "grad_norm": 0.3095693588256836, "learning_rate": 2.839267107342058e-06, "loss": 0.3515, "step": 2807 }, { "epoch": 2.034291234001449, "grad_norm": 0.3109806180000305, "learning_rate": 2.835466079174866e-06, "loss": 0.357, "step": 2808 }, { "epoch": 2.0350156966916204, "grad_norm": 0.3312370777130127, "learning_rate": 2.8316665897943384e-06, "loss": 0.3603, "step": 2809 }, { "epoch": 2.035740159381792, "grad_norm": 0.34863173961639404, "learning_rate": 2.8278686419015693e-06, "loss": 0.39, "step": 2810 }, { "epoch": 2.0364646220719633, "grad_norm": 0.3099724054336548, "learning_rate": 2.824072238196554e-06, "loss": 0.3684, "step": 2811 }, { "epoch": 2.037189084762135, "grad_norm": 0.3083511292934418, "learning_rate": 2.820277381378192e-06, "loss": 0.3728, "step": 2812 }, { "epoch": 2.0379135474523062, "grad_norm": 0.30026087164878845, "learning_rate": 2.816484074144277e-06, "loss": 0.3691, "step": 2813 }, { "epoch": 2.0386380101424777, "grad_norm": 0.2963140606880188, "learning_rate": 2.812692319191507e-06, "loss": 0.3368, "step": 2814 }, { "epoch": 2.039362472832649, "grad_norm": 0.30000829696655273, "learning_rate": 2.8089021192154776e-06, "loss": 0.3502, "step": 2815 }, { "epoch": 2.0400869355228206, "grad_norm": 0.32534992694854736, "learning_rate": 2.805113476910671e-06, "loss": 0.3621, "step": 2816 }, { "epoch": 2.040811398212992, "grad_norm": 0.30074000358581543, "learning_rate": 2.8013263949704706e-06, "loss": 0.3753, "step": 2817 }, { "epoch": 2.0415358609031635, "grad_norm": 0.2956092655658722, "learning_rate": 2.797540876087148e-06, "loss": 0.4027, "step": 2818 }, { "epoch": 2.042260323593335, "grad_norm": 0.3119695782661438, "learning_rate": 2.7937569229518593e-06, "loss": 0.3972, "step": 2819 }, { "epoch": 2.0429847862835064, "grad_norm": 0.27603304386138916, "learning_rate": 2.789974538254653e-06, "loss": 0.3348, "step": 2820 }, { "epoch": 2.043709248973678, "grad_norm": 0.3034762740135193, "learning_rate": 2.7861937246844604e-06, "loss": 0.3823, "step": 2821 }, { "epoch": 2.0444337116638494, "grad_norm": 0.2800074517726898, "learning_rate": 2.7824144849290946e-06, "loss": 0.3515, "step": 2822 }, { "epoch": 2.045158174354021, "grad_norm": 0.3173096776008606, "learning_rate": 2.7786368216752526e-06, "loss": 0.3593, "step": 2823 }, { "epoch": 2.0458826370441923, "grad_norm": 0.3434549570083618, "learning_rate": 2.7748607376085102e-06, "loss": 0.3932, "step": 2824 }, { "epoch": 2.0466070997343637, "grad_norm": 0.2969778776168823, "learning_rate": 2.7710862354133163e-06, "loss": 0.3479, "step": 2825 }, { "epoch": 2.047331562424535, "grad_norm": 0.2953125834465027, "learning_rate": 2.767313317773e-06, "loss": 0.374, "step": 2826 }, { "epoch": 2.0480560251147066, "grad_norm": 0.30173808336257935, "learning_rate": 2.763541987369765e-06, "loss": 0.3574, "step": 2827 }, { "epoch": 2.048780487804878, "grad_norm": 0.33338844776153564, "learning_rate": 2.75977224688468e-06, "loss": 0.3923, "step": 2828 }, { "epoch": 2.0495049504950495, "grad_norm": 0.3099971413612366, "learning_rate": 2.7560040989976894e-06, "loss": 0.3619, "step": 2829 }, { "epoch": 2.050229413185221, "grad_norm": 0.29550203680992126, "learning_rate": 2.7522375463876054e-06, "loss": 0.3812, "step": 2830 }, { "epoch": 2.0509538758753925, "grad_norm": 0.3245016634464264, "learning_rate": 2.7484725917321006e-06, "loss": 0.3618, "step": 2831 }, { "epoch": 2.051678338565564, "grad_norm": 0.33054473996162415, "learning_rate": 2.744709237707717e-06, "loss": 0.3828, "step": 2832 }, { "epoch": 2.0524028012557354, "grad_norm": 0.2807053029537201, "learning_rate": 2.7409474869898554e-06, "loss": 0.3413, "step": 2833 }, { "epoch": 2.053127263945907, "grad_norm": 0.30330103635787964, "learning_rate": 2.737187342252783e-06, "loss": 0.369, "step": 2834 }, { "epoch": 2.0538517266360783, "grad_norm": 0.31755226850509644, "learning_rate": 2.7334288061696146e-06, "loss": 0.3886, "step": 2835 }, { "epoch": 2.0545761893262497, "grad_norm": 0.3153161406517029, "learning_rate": 2.7296718814123304e-06, "loss": 0.3915, "step": 2836 }, { "epoch": 2.055300652016421, "grad_norm": 0.3345911204814911, "learning_rate": 2.725916570651763e-06, "loss": 0.3844, "step": 2837 }, { "epoch": 2.0560251147065927, "grad_norm": 0.32126009464263916, "learning_rate": 2.722162876557593e-06, "loss": 0.3489, "step": 2838 }, { "epoch": 2.056749577396764, "grad_norm": 0.3148820698261261, "learning_rate": 2.718410801798358e-06, "loss": 0.3651, "step": 2839 }, { "epoch": 2.0574740400869356, "grad_norm": 0.325172483921051, "learning_rate": 2.7146603490414437e-06, "loss": 0.3468, "step": 2840 }, { "epoch": 2.058198502777107, "grad_norm": 0.3191031813621521, "learning_rate": 2.7109115209530756e-06, "loss": 0.3833, "step": 2841 }, { "epoch": 2.0589229654672785, "grad_norm": 0.3262450098991394, "learning_rate": 2.7071643201983335e-06, "loss": 0.3857, "step": 2842 }, { "epoch": 2.05964742815745, "grad_norm": 0.29687532782554626, "learning_rate": 2.7034187494411356e-06, "loss": 0.3883, "step": 2843 }, { "epoch": 2.0603718908476214, "grad_norm": 0.29451778531074524, "learning_rate": 2.6996748113442397e-06, "loss": 0.3432, "step": 2844 }, { "epoch": 2.061096353537793, "grad_norm": 0.2904651165008545, "learning_rate": 2.695932508569247e-06, "loss": 0.3542, "step": 2845 }, { "epoch": 2.0618208162279643, "grad_norm": 0.31765857338905334, "learning_rate": 2.692191843776595e-06, "loss": 0.3818, "step": 2846 }, { "epoch": 2.0625452789181358, "grad_norm": 0.297224760055542, "learning_rate": 2.688452819625553e-06, "loss": 0.3406, "step": 2847 }, { "epoch": 2.0632697416083072, "grad_norm": 0.33963003754615784, "learning_rate": 2.684715438774229e-06, "loss": 0.3778, "step": 2848 }, { "epoch": 2.0639942042984787, "grad_norm": 0.30061647295951843, "learning_rate": 2.6809797038795632e-06, "loss": 0.3238, "step": 2849 }, { "epoch": 2.06471866698865, "grad_norm": 0.3216242492198944, "learning_rate": 2.6772456175973176e-06, "loss": 0.3929, "step": 2850 }, { "epoch": 2.0654431296788216, "grad_norm": 0.322430282831192, "learning_rate": 2.6735131825820913e-06, "loss": 0.3565, "step": 2851 }, { "epoch": 2.066167592368993, "grad_norm": 0.3166784942150116, "learning_rate": 2.6697824014873076e-06, "loss": 0.379, "step": 2852 }, { "epoch": 2.0668920550591645, "grad_norm": 0.3209136128425598, "learning_rate": 2.666053276965207e-06, "loss": 0.3778, "step": 2853 }, { "epoch": 2.067616517749336, "grad_norm": 0.3216211795806885, "learning_rate": 2.6623258116668615e-06, "loss": 0.3672, "step": 2854 }, { "epoch": 2.0683409804395074, "grad_norm": 0.29915717244148254, "learning_rate": 2.65860000824216e-06, "loss": 0.3572, "step": 2855 }, { "epoch": 2.069065443129679, "grad_norm": 0.2786274552345276, "learning_rate": 2.6548758693398104e-06, "loss": 0.339, "step": 2856 }, { "epoch": 2.0697899058198503, "grad_norm": 0.3453792333602905, "learning_rate": 2.6511533976073323e-06, "loss": 0.3989, "step": 2857 }, { "epoch": 2.070514368510022, "grad_norm": 0.32666802406311035, "learning_rate": 2.6474325956910684e-06, "loss": 0.3653, "step": 2858 }, { "epoch": 2.0712388312001933, "grad_norm": 0.29204899072647095, "learning_rate": 2.6437134662361703e-06, "loss": 0.3505, "step": 2859 }, { "epoch": 2.0719632938903647, "grad_norm": 0.2871588468551636, "learning_rate": 2.6399960118865976e-06, "loss": 0.3927, "step": 2860 }, { "epoch": 2.072687756580536, "grad_norm": 0.28390273451805115, "learning_rate": 2.636280235285124e-06, "loss": 0.3477, "step": 2861 }, { "epoch": 2.0734122192707076, "grad_norm": 0.2994069755077362, "learning_rate": 2.6325661390733303e-06, "loss": 0.3455, "step": 2862 }, { "epoch": 2.074136681960879, "grad_norm": 0.3204430937767029, "learning_rate": 2.628853725891598e-06, "loss": 0.3743, "step": 2863 }, { "epoch": 2.0748611446510505, "grad_norm": 0.32414543628692627, "learning_rate": 2.6251429983791154e-06, "loss": 0.3446, "step": 2864 }, { "epoch": 2.075585607341222, "grad_norm": 0.30812957882881165, "learning_rate": 2.621433959173875e-06, "loss": 0.3952, "step": 2865 }, { "epoch": 2.0763100700313935, "grad_norm": 0.3002729117870331, "learning_rate": 2.6177266109126624e-06, "loss": 0.3398, "step": 2866 }, { "epoch": 2.077034532721565, "grad_norm": 0.34261319041252136, "learning_rate": 2.614020956231067e-06, "loss": 0.383, "step": 2867 }, { "epoch": 2.0777589954117364, "grad_norm": 0.30261415243148804, "learning_rate": 2.610316997763473e-06, "loss": 0.3384, "step": 2868 }, { "epoch": 2.078483458101908, "grad_norm": 0.30949506163597107, "learning_rate": 2.606614738143055e-06, "loss": 0.3702, "step": 2869 }, { "epoch": 2.0792079207920793, "grad_norm": 0.3099467158317566, "learning_rate": 2.602914180001784e-06, "loss": 0.3868, "step": 2870 }, { "epoch": 2.0799323834822507, "grad_norm": 0.3012160658836365, "learning_rate": 2.599215325970423e-06, "loss": 0.3506, "step": 2871 }, { "epoch": 2.080656846172422, "grad_norm": 0.29052215814590454, "learning_rate": 2.5955181786785154e-06, "loss": 0.3396, "step": 2872 }, { "epoch": 2.0813813088625936, "grad_norm": 0.2684364318847656, "learning_rate": 2.5918227407544e-06, "loss": 0.3486, "step": 2873 }, { "epoch": 2.082105771552765, "grad_norm": 0.32451072335243225, "learning_rate": 2.5881290148251975e-06, "loss": 0.412, "step": 2874 }, { "epoch": 2.0828302342429366, "grad_norm": 0.27052292227745056, "learning_rate": 2.5844370035168077e-06, "loss": 0.3295, "step": 2875 }, { "epoch": 2.083554696933108, "grad_norm": 0.29936233162879944, "learning_rate": 2.580746709453917e-06, "loss": 0.3604, "step": 2876 }, { "epoch": 2.0842791596232795, "grad_norm": 0.340827614068985, "learning_rate": 2.5770581352599876e-06, "loss": 0.3868, "step": 2877 }, { "epoch": 2.085003622313451, "grad_norm": 0.28503531217575073, "learning_rate": 2.5733712835572633e-06, "loss": 0.3235, "step": 2878 }, { "epoch": 2.0857280850036224, "grad_norm": 0.3122889995574951, "learning_rate": 2.5696861569667555e-06, "loss": 0.3782, "step": 2879 }, { "epoch": 2.086452547693794, "grad_norm": 0.2916828393936157, "learning_rate": 2.566002758108256e-06, "loss": 0.3683, "step": 2880 }, { "epoch": 2.0871770103839653, "grad_norm": 0.31204086542129517, "learning_rate": 2.562321089600329e-06, "loss": 0.3793, "step": 2881 }, { "epoch": 2.0879014730741368, "grad_norm": 0.3004184365272522, "learning_rate": 2.5586411540603006e-06, "loss": 0.3471, "step": 2882 }, { "epoch": 2.088625935764308, "grad_norm": 0.31142526865005493, "learning_rate": 2.554962954104274e-06, "loss": 0.3457, "step": 2883 }, { "epoch": 2.0893503984544797, "grad_norm": 0.3242337703704834, "learning_rate": 2.5512864923471126e-06, "loss": 0.3814, "step": 2884 }, { "epoch": 2.090074861144651, "grad_norm": 0.30382683873176575, "learning_rate": 2.5476117714024482e-06, "loss": 0.3719, "step": 2885 }, { "epoch": 2.0907993238348226, "grad_norm": 0.2818298637866974, "learning_rate": 2.5439387938826697e-06, "loss": 0.3466, "step": 2886 }, { "epoch": 2.091523786524994, "grad_norm": 0.31889086961746216, "learning_rate": 2.5402675623989325e-06, "loss": 0.3999, "step": 2887 }, { "epoch": 2.0922482492151655, "grad_norm": 0.3086029589176178, "learning_rate": 2.53659807956115e-06, "loss": 0.3928, "step": 2888 }, { "epoch": 2.092972711905337, "grad_norm": 0.2993220090866089, "learning_rate": 2.5329303479779855e-06, "loss": 0.3429, "step": 2889 }, { "epoch": 2.0936971745955084, "grad_norm": 0.3101990520954132, "learning_rate": 2.5292643702568655e-06, "loss": 0.395, "step": 2890 }, { "epoch": 2.09442163728568, "grad_norm": 0.2824648916721344, "learning_rate": 2.5256001490039684e-06, "loss": 0.3394, "step": 2891 }, { "epoch": 2.0951460999758513, "grad_norm": 0.30687087774276733, "learning_rate": 2.5219376868242173e-06, "loss": 0.3833, "step": 2892 }, { "epoch": 2.095870562666023, "grad_norm": 0.31170904636383057, "learning_rate": 2.518276986321293e-06, "loss": 0.3659, "step": 2893 }, { "epoch": 2.0965950253561942, "grad_norm": 0.30572420358657837, "learning_rate": 2.5146180500976215e-06, "loss": 0.3877, "step": 2894 }, { "epoch": 2.0973194880463657, "grad_norm": 0.3091205656528473, "learning_rate": 2.5109608807543694e-06, "loss": 0.3429, "step": 2895 }, { "epoch": 2.098043950736537, "grad_norm": 0.3104066252708435, "learning_rate": 2.5073054808914544e-06, "loss": 0.3675, "step": 2896 }, { "epoch": 2.0987684134267086, "grad_norm": 0.37715768814086914, "learning_rate": 2.503651853107534e-06, "loss": 0.374, "step": 2897 }, { "epoch": 2.09949287611688, "grad_norm": 0.2922247648239136, "learning_rate": 2.5000000000000015e-06, "loss": 0.3877, "step": 2898 }, { "epoch": 2.1002173388070515, "grad_norm": 0.2937591075897217, "learning_rate": 2.496349924164994e-06, "loss": 0.3449, "step": 2899 }, { "epoch": 2.100941801497223, "grad_norm": 0.3067688047885895, "learning_rate": 2.492701628197386e-06, "loss": 0.3981, "step": 2900 }, { "epoch": 2.1016662641873944, "grad_norm": 0.29500797390937805, "learning_rate": 2.489055114690779e-06, "loss": 0.3518, "step": 2901 }, { "epoch": 2.102390726877566, "grad_norm": 0.30321773886680603, "learning_rate": 2.485410386237516e-06, "loss": 0.3521, "step": 2902 }, { "epoch": 2.1031151895677374, "grad_norm": 0.33582162857055664, "learning_rate": 2.4817674454286674e-06, "loss": 0.3643, "step": 2903 }, { "epoch": 2.103839652257909, "grad_norm": 0.2987511456012726, "learning_rate": 2.478126294854029e-06, "loss": 0.3423, "step": 2904 }, { "epoch": 2.1045641149480803, "grad_norm": 0.31952083110809326, "learning_rate": 2.4744869371021307e-06, "loss": 0.3848, "step": 2905 }, { "epoch": 2.1052885776382517, "grad_norm": 0.31220635771751404, "learning_rate": 2.4708493747602234e-06, "loss": 0.3704, "step": 2906 }, { "epoch": 2.106013040328423, "grad_norm": 0.3043956756591797, "learning_rate": 2.467213610414286e-06, "loss": 0.3514, "step": 2907 }, { "epoch": 2.1067375030185946, "grad_norm": 0.31058838963508606, "learning_rate": 2.4635796466490115e-06, "loss": 0.3733, "step": 2908 }, { "epoch": 2.107461965708766, "grad_norm": 0.28317731618881226, "learning_rate": 2.4599474860478197e-06, "loss": 0.3561, "step": 2909 }, { "epoch": 2.1081864283989376, "grad_norm": 0.28580960631370544, "learning_rate": 2.4563171311928475e-06, "loss": 0.3365, "step": 2910 }, { "epoch": 2.108910891089109, "grad_norm": 0.3261757493019104, "learning_rate": 2.452688584664944e-06, "loss": 0.4041, "step": 2911 }, { "epoch": 2.1096353537792805, "grad_norm": 0.29852938652038574, "learning_rate": 2.4490618490436772e-06, "loss": 0.354, "step": 2912 }, { "epoch": 2.110359816469452, "grad_norm": 0.30671951174736023, "learning_rate": 2.4454369269073284e-06, "loss": 0.3562, "step": 2913 }, { "epoch": 2.1110842791596234, "grad_norm": 0.2766207456588745, "learning_rate": 2.4418138208328835e-06, "loss": 0.3327, "step": 2914 }, { "epoch": 2.111808741849795, "grad_norm": 0.32184135913848877, "learning_rate": 2.4381925333960422e-06, "loss": 0.3802, "step": 2915 }, { "epoch": 2.1125332045399663, "grad_norm": 0.3133653402328491, "learning_rate": 2.434573067171213e-06, "loss": 0.3584, "step": 2916 }, { "epoch": 2.1132576672301377, "grad_norm": 0.26544156670570374, "learning_rate": 2.4309554247315038e-06, "loss": 0.326, "step": 2917 }, { "epoch": 2.113982129920309, "grad_norm": 0.3317614793777466, "learning_rate": 2.42733960864873e-06, "loss": 0.3693, "step": 2918 }, { "epoch": 2.1147065926104807, "grad_norm": 0.31857356429100037, "learning_rate": 2.42372562149341e-06, "loss": 0.3564, "step": 2919 }, { "epoch": 2.115431055300652, "grad_norm": 0.3116730749607086, "learning_rate": 2.420113465834757e-06, "loss": 0.3844, "step": 2920 }, { "epoch": 2.1161555179908236, "grad_norm": 0.2936806082725525, "learning_rate": 2.4165031442406857e-06, "loss": 0.3858, "step": 2921 }, { "epoch": 2.116879980680995, "grad_norm": 0.27390334010124207, "learning_rate": 2.4128946592778086e-06, "loss": 0.3752, "step": 2922 }, { "epoch": 2.1176044433711665, "grad_norm": 0.2929794490337372, "learning_rate": 2.4092880135114266e-06, "loss": 0.3392, "step": 2923 }, { "epoch": 2.118328906061338, "grad_norm": 0.32412660121917725, "learning_rate": 2.405683209505539e-06, "loss": 0.3991, "step": 2924 }, { "epoch": 2.1190533687515094, "grad_norm": 0.3419969379901886, "learning_rate": 2.4020802498228333e-06, "loss": 0.3572, "step": 2925 }, { "epoch": 2.119777831441681, "grad_norm": 0.2936384975910187, "learning_rate": 2.3984791370246847e-06, "loss": 0.3328, "step": 2926 }, { "epoch": 2.1205022941318523, "grad_norm": 0.30432891845703125, "learning_rate": 2.3948798736711574e-06, "loss": 0.3553, "step": 2927 }, { "epoch": 2.1212267568220238, "grad_norm": 0.3286393880844116, "learning_rate": 2.391282462321e-06, "loss": 0.3791, "step": 2928 }, { "epoch": 2.1219512195121952, "grad_norm": 0.3211567997932434, "learning_rate": 2.3876869055316477e-06, "loss": 0.383, "step": 2929 }, { "epoch": 2.1226756822023667, "grad_norm": 0.30572885274887085, "learning_rate": 2.3840932058592086e-06, "loss": 0.3719, "step": 2930 }, { "epoch": 2.123400144892538, "grad_norm": 0.30269643664360046, "learning_rate": 2.38050136585848e-06, "loss": 0.3617, "step": 2931 }, { "epoch": 2.1241246075827096, "grad_norm": 0.29604241251945496, "learning_rate": 2.3769113880829333e-06, "loss": 0.3345, "step": 2932 }, { "epoch": 2.124849070272881, "grad_norm": 0.3149932622909546, "learning_rate": 2.3733232750847135e-06, "loss": 0.3851, "step": 2933 }, { "epoch": 2.1255735329630525, "grad_norm": 0.3051965534687042, "learning_rate": 2.369737029414644e-06, "loss": 0.3812, "step": 2934 }, { "epoch": 2.126297995653224, "grad_norm": 0.2879173755645752, "learning_rate": 2.366152653622222e-06, "loss": 0.36, "step": 2935 }, { "epoch": 2.1270224583433954, "grad_norm": 0.3086816668510437, "learning_rate": 2.362570150255607e-06, "loss": 0.3495, "step": 2936 }, { "epoch": 2.127746921033567, "grad_norm": 0.3598249852657318, "learning_rate": 2.3589895218616356e-06, "loss": 0.4227, "step": 2937 }, { "epoch": 2.1284713837237383, "grad_norm": 0.31394731998443604, "learning_rate": 2.355410770985811e-06, "loss": 0.2994, "step": 2938 }, { "epoch": 2.12919584641391, "grad_norm": 0.3262869715690613, "learning_rate": 2.3518339001722956e-06, "loss": 0.3596, "step": 2939 }, { "epoch": 2.1299203091040813, "grad_norm": 0.3459727466106415, "learning_rate": 2.348258911963921e-06, "loss": 0.3608, "step": 2940 }, { "epoch": 2.1306447717942527, "grad_norm": 0.3281113803386688, "learning_rate": 2.3446858089021802e-06, "loss": 0.3798, "step": 2941 }, { "epoch": 2.131369234484424, "grad_norm": 0.2925070524215698, "learning_rate": 2.3411145935272202e-06, "loss": 0.3339, "step": 2942 }, { "epoch": 2.1320936971745956, "grad_norm": 0.3369428217411041, "learning_rate": 2.337545268377853e-06, "loss": 0.3709, "step": 2943 }, { "epoch": 2.132818159864767, "grad_norm": 0.3304680585861206, "learning_rate": 2.333977835991545e-06, "loss": 0.3544, "step": 2944 }, { "epoch": 2.1335426225549385, "grad_norm": 0.3314804434776306, "learning_rate": 2.3304122989044128e-06, "loss": 0.3909, "step": 2945 }, { "epoch": 2.13426708524511, "grad_norm": 0.3085714280605316, "learning_rate": 2.3268486596512306e-06, "loss": 0.3237, "step": 2946 }, { "epoch": 2.1349915479352815, "grad_norm": 0.31130436062812805, "learning_rate": 2.3232869207654237e-06, "loss": 0.426, "step": 2947 }, { "epoch": 2.135716010625453, "grad_norm": 0.2984030842781067, "learning_rate": 2.31972708477906e-06, "loss": 0.3959, "step": 2948 }, { "epoch": 2.1364404733156244, "grad_norm": 0.32033130526542664, "learning_rate": 2.3161691542228616e-06, "loss": 0.3866, "step": 2949 }, { "epoch": 2.137164936005796, "grad_norm": 0.27633172273635864, "learning_rate": 2.312613131626193e-06, "loss": 0.3244, "step": 2950 }, { "epoch": 2.1378893986959673, "grad_norm": 0.2877872586250305, "learning_rate": 2.3090590195170642e-06, "loss": 0.328, "step": 2951 }, { "epoch": 2.1386138613861387, "grad_norm": 0.3509773313999176, "learning_rate": 2.3055068204221226e-06, "loss": 0.4035, "step": 2952 }, { "epoch": 2.13933832407631, "grad_norm": 0.2931835353374481, "learning_rate": 2.30195653686666e-06, "loss": 0.3346, "step": 2953 }, { "epoch": 2.1400627867664817, "grad_norm": 0.28894099593162537, "learning_rate": 2.298408171374607e-06, "loss": 0.3535, "step": 2954 }, { "epoch": 2.140787249456653, "grad_norm": 0.2950001657009125, "learning_rate": 2.2948617264685245e-06, "loss": 0.3869, "step": 2955 }, { "epoch": 2.1415117121468246, "grad_norm": 0.32912445068359375, "learning_rate": 2.291317204669615e-06, "loss": 0.392, "step": 2956 }, { "epoch": 2.142236174836996, "grad_norm": 0.3022708594799042, "learning_rate": 2.2877746084977108e-06, "loss": 0.3565, "step": 2957 }, { "epoch": 2.1429606375271675, "grad_norm": 0.31089359521865845, "learning_rate": 2.284233940471274e-06, "loss": 0.3283, "step": 2958 }, { "epoch": 2.143685100217339, "grad_norm": 0.3252945840358734, "learning_rate": 2.280695203107398e-06, "loss": 0.3902, "step": 2959 }, { "epoch": 2.1444095629075104, "grad_norm": 0.30023327469825745, "learning_rate": 2.277158398921805e-06, "loss": 0.3517, "step": 2960 }, { "epoch": 2.145134025597682, "grad_norm": 0.3227042257785797, "learning_rate": 2.2736235304288373e-06, "loss": 0.3967, "step": 2961 }, { "epoch": 2.1458584882878533, "grad_norm": 0.2974092364311218, "learning_rate": 2.2700906001414675e-06, "loss": 0.3609, "step": 2962 }, { "epoch": 2.1465829509780248, "grad_norm": 0.3061903119087219, "learning_rate": 2.266559610571288e-06, "loss": 0.3637, "step": 2963 }, { "epoch": 2.147307413668196, "grad_norm": 0.29309380054473877, "learning_rate": 2.2630305642285084e-06, "loss": 0.3579, "step": 2964 }, { "epoch": 2.1480318763583677, "grad_norm": 0.2938739061355591, "learning_rate": 2.259503463621961e-06, "loss": 0.3754, "step": 2965 }, { "epoch": 2.148756339048539, "grad_norm": 0.33336129784584045, "learning_rate": 2.255978311259095e-06, "loss": 0.3817, "step": 2966 }, { "epoch": 2.1494808017387106, "grad_norm": 0.3214505612850189, "learning_rate": 2.2524551096459703e-06, "loss": 0.3504, "step": 2967 }, { "epoch": 2.150205264428882, "grad_norm": 0.3080897927284241, "learning_rate": 2.248933861287263e-06, "loss": 0.3907, "step": 2968 }, { "epoch": 2.1509297271190535, "grad_norm": 0.2864167094230652, "learning_rate": 2.245414568686263e-06, "loss": 0.3487, "step": 2969 }, { "epoch": 2.151654189809225, "grad_norm": 0.345795601606369, "learning_rate": 2.241897234344864e-06, "loss": 0.3973, "step": 2970 }, { "epoch": 2.1523786524993964, "grad_norm": 0.2863444983959198, "learning_rate": 2.238381860763572e-06, "loss": 0.37, "step": 2971 }, { "epoch": 2.153103115189568, "grad_norm": 0.2987995147705078, "learning_rate": 2.2348684504414974e-06, "loss": 0.4129, "step": 2972 }, { "epoch": 2.1538275778797393, "grad_norm": 0.297908753156662, "learning_rate": 2.2313570058763574e-06, "loss": 0.3526, "step": 2973 }, { "epoch": 2.154552040569911, "grad_norm": 0.2990860939025879, "learning_rate": 2.2278475295644653e-06, "loss": 0.3542, "step": 2974 }, { "epoch": 2.1552765032600822, "grad_norm": 0.3078773617744446, "learning_rate": 2.2243400240007424e-06, "loss": 0.3614, "step": 2975 }, { "epoch": 2.1560009659502537, "grad_norm": 0.31168028712272644, "learning_rate": 2.2208344916787087e-06, "loss": 0.3623, "step": 2976 }, { "epoch": 2.156725428640425, "grad_norm": 0.310314804315567, "learning_rate": 2.217330935090472e-06, "loss": 0.3979, "step": 2977 }, { "epoch": 2.1574498913305966, "grad_norm": 0.314750611782074, "learning_rate": 2.213829356726746e-06, "loss": 0.3998, "step": 2978 }, { "epoch": 2.158174354020768, "grad_norm": 0.3110838532447815, "learning_rate": 2.2103297590768334e-06, "loss": 0.3469, "step": 2979 }, { "epoch": 2.1588988167109395, "grad_norm": 0.31356409192085266, "learning_rate": 2.2068321446286326e-06, "loss": 0.3907, "step": 2980 }, { "epoch": 2.159623279401111, "grad_norm": 0.3176238238811493, "learning_rate": 2.203336515868625e-06, "loss": 0.3746, "step": 2981 }, { "epoch": 2.1603477420912824, "grad_norm": 0.30143633484840393, "learning_rate": 2.199842875281886e-06, "loss": 0.3389, "step": 2982 }, { "epoch": 2.161072204781454, "grad_norm": 0.283805251121521, "learning_rate": 2.1963512253520792e-06, "loss": 0.3375, "step": 2983 }, { "epoch": 2.1617966674716254, "grad_norm": 0.28169575333595276, "learning_rate": 2.1928615685614453e-06, "loss": 0.3428, "step": 2984 }, { "epoch": 2.162521130161797, "grad_norm": 0.3061242401599884, "learning_rate": 2.1893739073908147e-06, "loss": 0.3911, "step": 2985 }, { "epoch": 2.1632455928519683, "grad_norm": 0.2689173221588135, "learning_rate": 2.1858882443196e-06, "loss": 0.3505, "step": 2986 }, { "epoch": 2.1639700555421397, "grad_norm": 0.3715078830718994, "learning_rate": 2.1824045818257856e-06, "loss": 0.3899, "step": 2987 }, { "epoch": 2.164694518232311, "grad_norm": 0.2952030301094055, "learning_rate": 2.1789229223859403e-06, "loss": 0.3841, "step": 2988 }, { "epoch": 2.1654189809224826, "grad_norm": 0.3292694091796875, "learning_rate": 2.1754432684752103e-06, "loss": 0.4001, "step": 2989 }, { "epoch": 2.166143443612654, "grad_norm": 0.2957063913345337, "learning_rate": 2.171965622567308e-06, "loss": 0.3381, "step": 2990 }, { "epoch": 2.1668679063028256, "grad_norm": 0.2976095378398895, "learning_rate": 2.1684899871345256e-06, "loss": 0.3907, "step": 2991 }, { "epoch": 2.167592368992997, "grad_norm": 0.27925533056259155, "learning_rate": 2.1650163646477258e-06, "loss": 0.3571, "step": 2992 }, { "epoch": 2.1683168316831685, "grad_norm": 0.3156002163887024, "learning_rate": 2.161544757576333e-06, "loss": 0.3793, "step": 2993 }, { "epoch": 2.16904129437334, "grad_norm": 0.30415305495262146, "learning_rate": 2.158075168388348e-06, "loss": 0.3732, "step": 2994 }, { "epoch": 2.1697657570635114, "grad_norm": 0.31385481357574463, "learning_rate": 2.1546075995503328e-06, "loss": 0.3822, "step": 2995 }, { "epoch": 2.170490219753683, "grad_norm": 0.31137263774871826, "learning_rate": 2.1511420535274106e-06, "loss": 0.3761, "step": 2996 }, { "epoch": 2.1712146824438543, "grad_norm": 0.2745570242404938, "learning_rate": 2.1476785327832715e-06, "loss": 0.3437, "step": 2997 }, { "epoch": 2.1719391451340258, "grad_norm": 0.31477105617523193, "learning_rate": 2.144217039780165e-06, "loss": 0.4096, "step": 2998 }, { "epoch": 2.172663607824197, "grad_norm": 0.27900198101997375, "learning_rate": 2.140757576978894e-06, "loss": 0.3197, "step": 2999 }, { "epoch": 2.1733880705143687, "grad_norm": 0.3048737347126007, "learning_rate": 2.1373001468388232e-06, "loss": 0.351, "step": 3000 }, { "epoch": 2.17411253320454, "grad_norm": 0.3124304711818695, "learning_rate": 2.1338447518178718e-06, "loss": 0.3759, "step": 3001 }, { "epoch": 2.1748369958947116, "grad_norm": 0.2942749559879303, "learning_rate": 2.130391394372512e-06, "loss": 0.3616, "step": 3002 }, { "epoch": 2.175561458584883, "grad_norm": 0.33199211955070496, "learning_rate": 2.1269400769577635e-06, "loss": 0.3834, "step": 3003 }, { "epoch": 2.1762859212750545, "grad_norm": 0.3113398849964142, "learning_rate": 2.1234908020272016e-06, "loss": 0.3813, "step": 3004 }, { "epoch": 2.177010383965226, "grad_norm": 0.28962522745132446, "learning_rate": 2.1200435720329476e-06, "loss": 0.3514, "step": 3005 }, { "epoch": 2.1777348466553974, "grad_norm": 0.29236167669296265, "learning_rate": 2.1165983894256647e-06, "loss": 0.3678, "step": 3006 }, { "epoch": 2.178459309345569, "grad_norm": 0.3324524760246277, "learning_rate": 2.113155256654567e-06, "loss": 0.3904, "step": 3007 }, { "epoch": 2.1791837720357403, "grad_norm": 0.3042861521244049, "learning_rate": 2.10971417616741e-06, "loss": 0.3543, "step": 3008 }, { "epoch": 2.1799082347259118, "grad_norm": 0.31141921877861023, "learning_rate": 2.106275150410485e-06, "loss": 0.3551, "step": 3009 }, { "epoch": 2.1806326974160832, "grad_norm": 0.31016165018081665, "learning_rate": 2.1028381818286285e-06, "loss": 0.373, "step": 3010 }, { "epoch": 2.1813571601062547, "grad_norm": 0.3185249865055084, "learning_rate": 2.0994032728652134e-06, "loss": 0.3451, "step": 3011 }, { "epoch": 2.182081622796426, "grad_norm": 0.2951759099960327, "learning_rate": 2.0959704259621457e-06, "loss": 0.3755, "step": 3012 }, { "epoch": 2.1828060854865976, "grad_norm": 0.29487141966819763, "learning_rate": 2.0925396435598665e-06, "loss": 0.3288, "step": 3013 }, { "epoch": 2.183530548176769, "grad_norm": 0.3109551668167114, "learning_rate": 2.0891109280973536e-06, "loss": 0.3582, "step": 3014 }, { "epoch": 2.1842550108669405, "grad_norm": 0.2840130627155304, "learning_rate": 2.085684282012108e-06, "loss": 0.3332, "step": 3015 }, { "epoch": 2.184979473557112, "grad_norm": 0.30533289909362793, "learning_rate": 2.082259707740164e-06, "loss": 0.4258, "step": 3016 }, { "epoch": 2.1857039362472834, "grad_norm": 0.2917375862598419, "learning_rate": 2.078837207716085e-06, "loss": 0.3508, "step": 3017 }, { "epoch": 2.186428398937455, "grad_norm": 0.3163018822669983, "learning_rate": 2.0754167843729532e-06, "loss": 0.3898, "step": 3018 }, { "epoch": 2.1871528616276263, "grad_norm": 0.31761136651039124, "learning_rate": 2.07199844014238e-06, "loss": 0.3501, "step": 3019 }, { "epoch": 2.187877324317798, "grad_norm": 0.2792647182941437, "learning_rate": 2.0685821774544997e-06, "loss": 0.3232, "step": 3020 }, { "epoch": 2.1886017870079693, "grad_norm": 0.3092782497406006, "learning_rate": 2.065167998737959e-06, "loss": 0.3913, "step": 3021 }, { "epoch": 2.1893262496981407, "grad_norm": 0.29403600096702576, "learning_rate": 2.0617559064199317e-06, "loss": 0.3787, "step": 3022 }, { "epoch": 2.190050712388312, "grad_norm": 0.32909688353538513, "learning_rate": 2.0583459029261044e-06, "loss": 0.3729, "step": 3023 }, { "epoch": 2.1907751750784836, "grad_norm": 0.29323557019233704, "learning_rate": 2.0549379906806816e-06, "loss": 0.3721, "step": 3024 }, { "epoch": 2.191499637768655, "grad_norm": 0.2923307418823242, "learning_rate": 2.051532172106374e-06, "loss": 0.3698, "step": 3025 }, { "epoch": 2.1922241004588265, "grad_norm": 0.2945021688938141, "learning_rate": 2.048128449624411e-06, "loss": 0.3491, "step": 3026 }, { "epoch": 2.192948563148998, "grad_norm": 0.2985961139202118, "learning_rate": 2.0447268256545327e-06, "loss": 0.3921, "step": 3027 }, { "epoch": 2.1936730258391695, "grad_norm": 0.28623315691947937, "learning_rate": 2.0413273026149794e-06, "loss": 0.3906, "step": 3028 }, { "epoch": 2.194397488529341, "grad_norm": 0.295999139547348, "learning_rate": 2.037929882922504e-06, "loss": 0.375, "step": 3029 }, { "epoch": 2.1951219512195124, "grad_norm": 0.30401742458343506, "learning_rate": 2.034534568992366e-06, "loss": 0.3708, "step": 3030 }, { "epoch": 2.195846413909684, "grad_norm": 0.30116477608680725, "learning_rate": 2.0311413632383192e-06, "loss": 0.3496, "step": 3031 }, { "epoch": 2.1965708765998553, "grad_norm": 0.29509237408638, "learning_rate": 2.0277502680726275e-06, "loss": 0.3403, "step": 3032 }, { "epoch": 2.1972953392900267, "grad_norm": 0.30280226469039917, "learning_rate": 2.0243612859060526e-06, "loss": 0.3655, "step": 3033 }, { "epoch": 2.198019801980198, "grad_norm": 0.3041680157184601, "learning_rate": 2.020974419147848e-06, "loss": 0.3751, "step": 3034 }, { "epoch": 2.1987442646703697, "grad_norm": 0.2808971703052521, "learning_rate": 2.0175896702057705e-06, "loss": 0.3538, "step": 3035 }, { "epoch": 2.199468727360541, "grad_norm": 0.3066999912261963, "learning_rate": 2.0142070414860704e-06, "loss": 0.3634, "step": 3036 }, { "epoch": 2.2001931900507126, "grad_norm": 0.30236679315567017, "learning_rate": 2.0108265353934846e-06, "loss": 0.3602, "step": 3037 }, { "epoch": 2.200917652740884, "grad_norm": 0.3124159276485443, "learning_rate": 2.0074481543312475e-06, "loss": 0.3792, "step": 3038 }, { "epoch": 2.2016421154310555, "grad_norm": 0.2925941050052643, "learning_rate": 2.004071900701083e-06, "loss": 0.3545, "step": 3039 }, { "epoch": 2.202366578121227, "grad_norm": 0.2919071912765503, "learning_rate": 2.0006977769031956e-06, "loss": 0.3808, "step": 3040 }, { "epoch": 2.2030910408113984, "grad_norm": 0.30252355337142944, "learning_rate": 1.9973257853362824e-06, "loss": 0.3708, "step": 3041 }, { "epoch": 2.20381550350157, "grad_norm": 0.3085349202156067, "learning_rate": 1.9939559283975237e-06, "loss": 0.3647, "step": 3042 }, { "epoch": 2.2045399661917413, "grad_norm": 0.31603339314460754, "learning_rate": 1.990588208482582e-06, "loss": 0.3583, "step": 3043 }, { "epoch": 2.2052644288819128, "grad_norm": 0.3222765624523163, "learning_rate": 1.9872226279855955e-06, "loss": 0.3316, "step": 3044 }, { "epoch": 2.205988891572084, "grad_norm": 0.30396267771720886, "learning_rate": 1.9838591892991884e-06, "loss": 0.3473, "step": 3045 }, { "epoch": 2.2067133542622557, "grad_norm": 0.2817571759223938, "learning_rate": 1.980497894814461e-06, "loss": 0.3754, "step": 3046 }, { "epoch": 2.207437816952427, "grad_norm": 0.3009531795978546, "learning_rate": 1.977138746920985e-06, "loss": 0.353, "step": 3047 }, { "epoch": 2.2081622796425986, "grad_norm": 0.31193050742149353, "learning_rate": 1.97378174800681e-06, "loss": 0.4245, "step": 3048 }, { "epoch": 2.20888674233277, "grad_norm": 0.2938419282436371, "learning_rate": 1.9704269004584583e-06, "loss": 0.3466, "step": 3049 }, { "epoch": 2.2096112050229415, "grad_norm": 0.3152781128883362, "learning_rate": 1.9670742066609182e-06, "loss": 0.3733, "step": 3050 }, { "epoch": 2.210335667713113, "grad_norm": 0.27357375621795654, "learning_rate": 1.9637236689976517e-06, "loss": 0.3479, "step": 3051 }, { "epoch": 2.2110601304032844, "grad_norm": 0.2972021996974945, "learning_rate": 1.960375289850588e-06, "loss": 0.3334, "step": 3052 }, { "epoch": 2.211784593093456, "grad_norm": 0.30838215351104736, "learning_rate": 1.9570290716001162e-06, "loss": 0.3685, "step": 3053 }, { "epoch": 2.2125090557836273, "grad_norm": 0.3080647587776184, "learning_rate": 1.953685016625094e-06, "loss": 0.3613, "step": 3054 }, { "epoch": 2.213233518473799, "grad_norm": 0.30834391713142395, "learning_rate": 1.950343127302844e-06, "loss": 0.3618, "step": 3055 }, { "epoch": 2.2139579811639702, "grad_norm": 0.31624096632003784, "learning_rate": 1.9470034060091408e-06, "loss": 0.3664, "step": 3056 }, { "epoch": 2.2146824438541417, "grad_norm": 0.30415987968444824, "learning_rate": 1.9436658551182234e-06, "loss": 0.3537, "step": 3057 }, { "epoch": 2.215406906544313, "grad_norm": 0.29883167147636414, "learning_rate": 1.94033047700279e-06, "loss": 0.349, "step": 3058 }, { "epoch": 2.2161313692344846, "grad_norm": 0.292086124420166, "learning_rate": 1.936997274033986e-06, "loss": 0.3555, "step": 3059 }, { "epoch": 2.216855831924656, "grad_norm": 0.3059002757072449, "learning_rate": 1.933666248581418e-06, "loss": 0.3969, "step": 3060 }, { "epoch": 2.2175802946148275, "grad_norm": 0.30404970049858093, "learning_rate": 1.930337403013144e-06, "loss": 0.3664, "step": 3061 }, { "epoch": 2.218304757304999, "grad_norm": 0.30966705083847046, "learning_rate": 1.927010739695666e-06, "loss": 0.3475, "step": 3062 }, { "epoch": 2.2190292199951704, "grad_norm": 0.3122783601284027, "learning_rate": 1.9236862609939415e-06, "loss": 0.4026, "step": 3063 }, { "epoch": 2.219753682685342, "grad_norm": 0.26814088225364685, "learning_rate": 1.9203639692713715e-06, "loss": 0.3394, "step": 3064 }, { "epoch": 2.2204781453755134, "grad_norm": 0.2940293848514557, "learning_rate": 1.9170438668898057e-06, "loss": 0.3799, "step": 3065 }, { "epoch": 2.221202608065685, "grad_norm": 0.3025403618812561, "learning_rate": 1.913725956209531e-06, "loss": 0.3752, "step": 3066 }, { "epoch": 2.2219270707558563, "grad_norm": 0.29614871740341187, "learning_rate": 1.9104102395892816e-06, "loss": 0.3654, "step": 3067 }, { "epoch": 2.2226515334460277, "grad_norm": 0.317518949508667, "learning_rate": 1.9070967193862322e-06, "loss": 0.3686, "step": 3068 }, { "epoch": 2.223375996136199, "grad_norm": 0.32410475611686707, "learning_rate": 1.9037853979559923e-06, "loss": 0.3715, "step": 3069 }, { "epoch": 2.2241004588263706, "grad_norm": 0.28332313895225525, "learning_rate": 1.900476277652608e-06, "loss": 0.3539, "step": 3070 }, { "epoch": 2.224824921516542, "grad_norm": 0.26751479506492615, "learning_rate": 1.8971693608285675e-06, "loss": 0.3675, "step": 3071 }, { "epoch": 2.2255493842067136, "grad_norm": 0.308602899312973, "learning_rate": 1.893864649834783e-06, "loss": 0.3697, "step": 3072 }, { "epoch": 2.226273846896885, "grad_norm": 0.2920379936695099, "learning_rate": 1.890562147020606e-06, "loss": 0.3848, "step": 3073 }, { "epoch": 2.2269983095870565, "grad_norm": 0.27065661549568176, "learning_rate": 1.8872618547338145e-06, "loss": 0.3513, "step": 3074 }, { "epoch": 2.227722772277228, "grad_norm": 0.3341638445854187, "learning_rate": 1.8839637753206186e-06, "loss": 0.4199, "step": 3075 }, { "epoch": 2.2284472349673994, "grad_norm": 0.3050549030303955, "learning_rate": 1.8806679111256487e-06, "loss": 0.3749, "step": 3076 }, { "epoch": 2.229171697657571, "grad_norm": 0.30848821997642517, "learning_rate": 1.8773742644919663e-06, "loss": 0.3848, "step": 3077 }, { "epoch": 2.2298961603477423, "grad_norm": 0.2721932530403137, "learning_rate": 1.8740828377610564e-06, "loss": 0.3283, "step": 3078 }, { "epoch": 2.2306206230379138, "grad_norm": 0.28558552265167236, "learning_rate": 1.87079363327282e-06, "loss": 0.36, "step": 3079 }, { "epoch": 2.231345085728085, "grad_norm": 0.29816773533821106, "learning_rate": 1.8675066533655845e-06, "loss": 0.3442, "step": 3080 }, { "epoch": 2.2320695484182567, "grad_norm": 0.3286582827568054, "learning_rate": 1.8642219003760948e-06, "loss": 0.3775, "step": 3081 }, { "epoch": 2.232794011108428, "grad_norm": 0.2895018756389618, "learning_rate": 1.8609393766395083e-06, "loss": 0.3704, "step": 3082 }, { "epoch": 2.2335184737985996, "grad_norm": 0.2887575924396515, "learning_rate": 1.8576590844894021e-06, "loss": 0.3574, "step": 3083 }, { "epoch": 2.234242936488771, "grad_norm": 0.29874274134635925, "learning_rate": 1.8543810262577671e-06, "loss": 0.3715, "step": 3084 }, { "epoch": 2.2349673991789425, "grad_norm": 0.3352900743484497, "learning_rate": 1.851105204275001e-06, "loss": 0.3952, "step": 3085 }, { "epoch": 2.235691861869114, "grad_norm": 0.2868969440460205, "learning_rate": 1.8478316208699177e-06, "loss": 0.3493, "step": 3086 }, { "epoch": 2.2364163245592854, "grad_norm": 0.2962270677089691, "learning_rate": 1.8445602783697375e-06, "loss": 0.3542, "step": 3087 }, { "epoch": 2.237140787249457, "grad_norm": 0.3039577603340149, "learning_rate": 1.841291179100085e-06, "loss": 0.3745, "step": 3088 }, { "epoch": 2.2378652499396283, "grad_norm": 0.3033204972743988, "learning_rate": 1.8380243253849933e-06, "loss": 0.3651, "step": 3089 }, { "epoch": 2.2385897126297998, "grad_norm": 0.2985183000564575, "learning_rate": 1.8347597195469009e-06, "loss": 0.3181, "step": 3090 }, { "epoch": 2.2393141753199712, "grad_norm": 0.33268508315086365, "learning_rate": 1.8314973639066402e-06, "loss": 0.3787, "step": 3091 }, { "epoch": 2.2400386380101427, "grad_norm": 0.3149172365665436, "learning_rate": 1.8282372607834525e-06, "loss": 0.3859, "step": 3092 }, { "epoch": 2.240763100700314, "grad_norm": 0.29439425468444824, "learning_rate": 1.8249794124949742e-06, "loss": 0.3421, "step": 3093 }, { "epoch": 2.2414875633904856, "grad_norm": 0.298734575510025, "learning_rate": 1.8217238213572401e-06, "loss": 0.3949, "step": 3094 }, { "epoch": 2.242212026080657, "grad_norm": 0.31094783544540405, "learning_rate": 1.8184704896846773e-06, "loss": 0.3932, "step": 3095 }, { "epoch": 2.2429364887708285, "grad_norm": 0.2698996067047119, "learning_rate": 1.8152194197901086e-06, "loss": 0.3229, "step": 3096 }, { "epoch": 2.243660951461, "grad_norm": 0.2865182161331177, "learning_rate": 1.811970613984751e-06, "loss": 0.3758, "step": 3097 }, { "epoch": 2.2443854141511714, "grad_norm": 0.31377288699150085, "learning_rate": 1.808724074578207e-06, "loss": 0.3636, "step": 3098 }, { "epoch": 2.245109876841343, "grad_norm": 0.3186250329017639, "learning_rate": 1.805479803878472e-06, "loss": 0.4157, "step": 3099 }, { "epoch": 2.2458343395315143, "grad_norm": 0.3023534119129181, "learning_rate": 1.8022378041919292e-06, "loss": 0.3869, "step": 3100 }, { "epoch": 2.246558802221686, "grad_norm": 0.2906970977783203, "learning_rate": 1.7989980778233422e-06, "loss": 0.3474, "step": 3101 }, { "epoch": 2.2472832649118573, "grad_norm": 0.3013107180595398, "learning_rate": 1.7957606270758626e-06, "loss": 0.383, "step": 3102 }, { "epoch": 2.2480077276020287, "grad_norm": 0.29594454169273376, "learning_rate": 1.7925254542510256e-06, "loss": 0.3713, "step": 3103 }, { "epoch": 2.2487321902922, "grad_norm": 0.2861643433570862, "learning_rate": 1.789292561648741e-06, "loss": 0.3272, "step": 3104 }, { "epoch": 2.2494566529823716, "grad_norm": 0.3167389929294586, "learning_rate": 1.7860619515673034e-06, "loss": 0.3717, "step": 3105 }, { "epoch": 2.250181115672543, "grad_norm": 0.2936827838420868, "learning_rate": 1.782833626303384e-06, "loss": 0.334, "step": 3106 }, { "epoch": 2.2509055783627145, "grad_norm": 0.3343866169452667, "learning_rate": 1.7796075881520247e-06, "loss": 0.4612, "step": 3107 }, { "epoch": 2.251630041052886, "grad_norm": 0.3002091646194458, "learning_rate": 1.7763838394066474e-06, "loss": 0.3641, "step": 3108 }, { "epoch": 2.2523545037430575, "grad_norm": 0.30802783370018005, "learning_rate": 1.7731623823590456e-06, "loss": 0.361, "step": 3109 }, { "epoch": 2.253078966433229, "grad_norm": 0.3114280700683594, "learning_rate": 1.7699432192993782e-06, "loss": 0.387, "step": 3110 }, { "epoch": 2.2538034291234004, "grad_norm": 0.30692800879478455, "learning_rate": 1.7667263525161798e-06, "loss": 0.3831, "step": 3111 }, { "epoch": 2.254527891813572, "grad_norm": 0.2940272390842438, "learning_rate": 1.7635117842963507e-06, "loss": 0.3516, "step": 3112 }, { "epoch": 2.2552523545037433, "grad_norm": 0.3014373183250427, "learning_rate": 1.7602995169251542e-06, "loss": 0.3553, "step": 3113 }, { "epoch": 2.2559768171939147, "grad_norm": 0.32484275102615356, "learning_rate": 1.7570895526862202e-06, "loss": 0.4012, "step": 3114 }, { "epoch": 2.256701279884086, "grad_norm": 0.32116106152534485, "learning_rate": 1.7538818938615432e-06, "loss": 0.3621, "step": 3115 }, { "epoch": 2.2574257425742577, "grad_norm": 0.2909351885318756, "learning_rate": 1.750676542731477e-06, "loss": 0.3751, "step": 3116 }, { "epoch": 2.258150205264429, "grad_norm": 0.2795540392398834, "learning_rate": 1.7474735015747312e-06, "loss": 0.3424, "step": 3117 }, { "epoch": 2.2588746679546006, "grad_norm": 0.30501312017440796, "learning_rate": 1.7442727726683794e-06, "loss": 0.3743, "step": 3118 }, { "epoch": 2.259599130644772, "grad_norm": 0.31474050879478455, "learning_rate": 1.7410743582878492e-06, "loss": 0.3895, "step": 3119 }, { "epoch": 2.2603235933349435, "grad_norm": 0.2734764516353607, "learning_rate": 1.7378782607069195e-06, "loss": 0.3247, "step": 3120 }, { "epoch": 2.261048056025115, "grad_norm": 0.27877312898635864, "learning_rate": 1.7346844821977266e-06, "loss": 0.3816, "step": 3121 }, { "epoch": 2.2617725187152864, "grad_norm": 0.290865421295166, "learning_rate": 1.7314930250307571e-06, "loss": 0.3594, "step": 3122 }, { "epoch": 2.262496981405458, "grad_norm": 0.30389365553855896, "learning_rate": 1.7283038914748446e-06, "loss": 0.371, "step": 3123 }, { "epoch": 2.2632214440956293, "grad_norm": 0.2814875543117523, "learning_rate": 1.7251170837971737e-06, "loss": 0.346, "step": 3124 }, { "epoch": 2.2639459067858008, "grad_norm": 0.3136861026287079, "learning_rate": 1.7219326042632772e-06, "loss": 0.3861, "step": 3125 }, { "epoch": 2.2646703694759722, "grad_norm": 0.2891913950443268, "learning_rate": 1.718750455137026e-06, "loss": 0.3264, "step": 3126 }, { "epoch": 2.2653948321661437, "grad_norm": 0.30964362621307373, "learning_rate": 1.7155706386806402e-06, "loss": 0.3711, "step": 3127 }, { "epoch": 2.266119294856315, "grad_norm": 0.29771310091018677, "learning_rate": 1.7123931571546826e-06, "loss": 0.3429, "step": 3128 }, { "epoch": 2.2668437575464866, "grad_norm": 0.2846716642379761, "learning_rate": 1.7092180128180496e-06, "loss": 0.3939, "step": 3129 }, { "epoch": 2.267568220236658, "grad_norm": 0.28461095690727234, "learning_rate": 1.7060452079279816e-06, "loss": 0.3526, "step": 3130 }, { "epoch": 2.2682926829268295, "grad_norm": 0.28865930438041687, "learning_rate": 1.7028747447400555e-06, "loss": 0.3684, "step": 3131 }, { "epoch": 2.269017145617001, "grad_norm": 0.32490432262420654, "learning_rate": 1.6997066255081795e-06, "loss": 0.4062, "step": 3132 }, { "epoch": 2.2697416083071724, "grad_norm": 0.284918874502182, "learning_rate": 1.6965408524845993e-06, "loss": 0.3359, "step": 3133 }, { "epoch": 2.270466070997344, "grad_norm": 0.2819148302078247, "learning_rate": 1.6933774279198934e-06, "loss": 0.3342, "step": 3134 }, { "epoch": 2.2711905336875153, "grad_norm": 0.2697708308696747, "learning_rate": 1.6902163540629652e-06, "loss": 0.3322, "step": 3135 }, { "epoch": 2.271914996377687, "grad_norm": 0.2877367436885834, "learning_rate": 1.6870576331610522e-06, "loss": 0.3407, "step": 3136 }, { "epoch": 2.2726394590678582, "grad_norm": 0.31140923500061035, "learning_rate": 1.6839012674597177e-06, "loss": 0.3879, "step": 3137 }, { "epoch": 2.2733639217580297, "grad_norm": 0.2948157489299774, "learning_rate": 1.6807472592028524e-06, "loss": 0.3451, "step": 3138 }, { "epoch": 2.274088384448201, "grad_norm": 0.30077534914016724, "learning_rate": 1.6775956106326656e-06, "loss": 0.3614, "step": 3139 }, { "epoch": 2.2748128471383726, "grad_norm": 0.3220531940460205, "learning_rate": 1.6744463239896947e-06, "loss": 0.4089, "step": 3140 }, { "epoch": 2.275537309828544, "grad_norm": 0.3139605224132538, "learning_rate": 1.6712994015127976e-06, "loss": 0.3913, "step": 3141 }, { "epoch": 2.2762617725187155, "grad_norm": 0.2801085412502289, "learning_rate": 1.6681548454391455e-06, "loss": 0.3386, "step": 3142 }, { "epoch": 2.276986235208887, "grad_norm": 0.28564831614494324, "learning_rate": 1.6650126580042346e-06, "loss": 0.3828, "step": 3143 }, { "epoch": 2.2777106978990584, "grad_norm": 0.28885242342948914, "learning_rate": 1.661872841441875e-06, "loss": 0.3564, "step": 3144 }, { "epoch": 2.27843516058923, "grad_norm": 0.33258718252182007, "learning_rate": 1.658735397984188e-06, "loss": 0.3788, "step": 3145 }, { "epoch": 2.2791596232794014, "grad_norm": 0.29735973477363586, "learning_rate": 1.6556003298616113e-06, "loss": 0.3543, "step": 3146 }, { "epoch": 2.279884085969573, "grad_norm": 0.308655709028244, "learning_rate": 1.6524676393028955e-06, "loss": 0.3738, "step": 3147 }, { "epoch": 2.2806085486597443, "grad_norm": 0.3021217882633209, "learning_rate": 1.649337328535095e-06, "loss": 0.3848, "step": 3148 }, { "epoch": 2.2813330113499153, "grad_norm": 0.2846744954586029, "learning_rate": 1.6462093997835772e-06, "loss": 0.3475, "step": 3149 }, { "epoch": 2.2820574740400867, "grad_norm": 0.3351779580116272, "learning_rate": 1.6430838552720168e-06, "loss": 0.3993, "step": 3150 }, { "epoch": 2.282781936730258, "grad_norm": 0.31777629256248474, "learning_rate": 1.639960697222388e-06, "loss": 0.3225, "step": 3151 }, { "epoch": 2.2835063994204297, "grad_norm": 0.3146757483482361, "learning_rate": 1.6368399278549741e-06, "loss": 0.3722, "step": 3152 }, { "epoch": 2.284230862110601, "grad_norm": 0.3103538155555725, "learning_rate": 1.6337215493883596e-06, "loss": 0.3731, "step": 3153 }, { "epoch": 2.2849553248007726, "grad_norm": 0.33493590354919434, "learning_rate": 1.6306055640394243e-06, "loss": 0.3949, "step": 3154 }, { "epoch": 2.285679787490944, "grad_norm": 0.31188443303108215, "learning_rate": 1.6274919740233525e-06, "loss": 0.3826, "step": 3155 }, { "epoch": 2.2864042501811155, "grad_norm": 0.27727317810058594, "learning_rate": 1.6243807815536245e-06, "loss": 0.3369, "step": 3156 }, { "epoch": 2.287128712871287, "grad_norm": 0.28985723853111267, "learning_rate": 1.6212719888420114e-06, "loss": 0.3328, "step": 3157 }, { "epoch": 2.2878531755614584, "grad_norm": 0.3156263828277588, "learning_rate": 1.618165598098585e-06, "loss": 0.3785, "step": 3158 }, { "epoch": 2.28857763825163, "grad_norm": 0.305492103099823, "learning_rate": 1.6150616115317052e-06, "loss": 0.3724, "step": 3159 }, { "epoch": 2.2893021009418013, "grad_norm": 0.309566468000412, "learning_rate": 1.611960031348026e-06, "loss": 0.3508, "step": 3160 }, { "epoch": 2.2900265636319728, "grad_norm": 0.33643561601638794, "learning_rate": 1.6088608597524858e-06, "loss": 0.3873, "step": 3161 }, { "epoch": 2.2907510263221442, "grad_norm": 0.32146385312080383, "learning_rate": 1.6057640989483158e-06, "loss": 0.3347, "step": 3162 }, { "epoch": 2.2914754890123157, "grad_norm": 0.3133610785007477, "learning_rate": 1.6026697511370309e-06, "loss": 0.371, "step": 3163 }, { "epoch": 2.292199951702487, "grad_norm": 0.34018823504447937, "learning_rate": 1.5995778185184285e-06, "loss": 0.3795, "step": 3164 }, { "epoch": 2.2929244143926586, "grad_norm": 0.30551233887672424, "learning_rate": 1.5964883032905932e-06, "loss": 0.3581, "step": 3165 }, { "epoch": 2.29364887708283, "grad_norm": 0.2656961977481842, "learning_rate": 1.5934012076498889e-06, "loss": 0.3006, "step": 3166 }, { "epoch": 2.2943733397730015, "grad_norm": 0.31382694840431213, "learning_rate": 1.5903165337909615e-06, "loss": 0.3655, "step": 3167 }, { "epoch": 2.295097802463173, "grad_norm": 0.33020147681236267, "learning_rate": 1.5872342839067305e-06, "loss": 0.4397, "step": 3168 }, { "epoch": 2.2958222651533444, "grad_norm": 0.32466912269592285, "learning_rate": 1.5841544601883962e-06, "loss": 0.3742, "step": 3169 }, { "epoch": 2.296546727843516, "grad_norm": 0.31333836913108826, "learning_rate": 1.5810770648254347e-06, "loss": 0.3695, "step": 3170 }, { "epoch": 2.2972711905336873, "grad_norm": 0.29637956619262695, "learning_rate": 1.5780021000055917e-06, "loss": 0.3641, "step": 3171 }, { "epoch": 2.297995653223859, "grad_norm": 0.2904829978942871, "learning_rate": 1.5749295679148879e-06, "loss": 0.3253, "step": 3172 }, { "epoch": 2.2987201159140302, "grad_norm": 0.33180204033851624, "learning_rate": 1.571859470737616e-06, "loss": 0.3878, "step": 3173 }, { "epoch": 2.2994445786042017, "grad_norm": 0.347089946269989, "learning_rate": 1.5687918106563326e-06, "loss": 0.38, "step": 3174 }, { "epoch": 2.300169041294373, "grad_norm": 0.31642353534698486, "learning_rate": 1.565726589851867e-06, "loss": 0.3731, "step": 3175 }, { "epoch": 2.3008935039845446, "grad_norm": 0.33082038164138794, "learning_rate": 1.5626638105033132e-06, "loss": 0.4269, "step": 3176 }, { "epoch": 2.301617966674716, "grad_norm": 0.29630494117736816, "learning_rate": 1.5596034747880263e-06, "loss": 0.3477, "step": 3177 }, { "epoch": 2.3023424293648875, "grad_norm": 0.3556058704853058, "learning_rate": 1.5565455848816269e-06, "loss": 0.3508, "step": 3178 }, { "epoch": 2.303066892055059, "grad_norm": 0.3080747127532959, "learning_rate": 1.5534901429579986e-06, "loss": 0.3744, "step": 3179 }, { "epoch": 2.3037913547452304, "grad_norm": 0.31256407499313354, "learning_rate": 1.5504371511892797e-06, "loss": 0.3843, "step": 3180 }, { "epoch": 2.304515817435402, "grad_norm": 0.2878004312515259, "learning_rate": 1.5473866117458708e-06, "loss": 0.3349, "step": 3181 }, { "epoch": 2.3052402801255734, "grad_norm": 0.2960623502731323, "learning_rate": 1.5443385267964295e-06, "loss": 0.3557, "step": 3182 }, { "epoch": 2.305964742815745, "grad_norm": 0.2870892286300659, "learning_rate": 1.541292898507864e-06, "loss": 0.3404, "step": 3183 }, { "epoch": 2.3066892055059163, "grad_norm": 0.31699037551879883, "learning_rate": 1.5382497290453402e-06, "loss": 0.3856, "step": 3184 }, { "epoch": 2.3074136681960877, "grad_norm": 0.2700352966785431, "learning_rate": 1.5352090205722765e-06, "loss": 0.3214, "step": 3185 }, { "epoch": 2.308138130886259, "grad_norm": 0.3197454810142517, "learning_rate": 1.5321707752503367e-06, "loss": 0.392, "step": 3186 }, { "epoch": 2.3088625935764306, "grad_norm": 0.2770592272281647, "learning_rate": 1.5291349952394379e-06, "loss": 0.3451, "step": 3187 }, { "epoch": 2.309587056266602, "grad_norm": 0.2832491397857666, "learning_rate": 1.5261016826977443e-06, "loss": 0.3669, "step": 3188 }, { "epoch": 2.3103115189567736, "grad_norm": 0.30498114228248596, "learning_rate": 1.5230708397816658e-06, "loss": 0.3772, "step": 3189 }, { "epoch": 2.311035981646945, "grad_norm": 0.2850368022918701, "learning_rate": 1.5200424686458537e-06, "loss": 0.329, "step": 3190 }, { "epoch": 2.3117604443371165, "grad_norm": 0.3200870752334595, "learning_rate": 1.517016571443205e-06, "loss": 0.3781, "step": 3191 }, { "epoch": 2.312484907027288, "grad_norm": 0.26906704902648926, "learning_rate": 1.5139931503248595e-06, "loss": 0.3453, "step": 3192 }, { "epoch": 2.3132093697174594, "grad_norm": 0.3027033507823944, "learning_rate": 1.510972207440191e-06, "loss": 0.3839, "step": 3193 }, { "epoch": 2.313933832407631, "grad_norm": 0.28783008456230164, "learning_rate": 1.507953744936817e-06, "loss": 0.3441, "step": 3194 }, { "epoch": 2.3146582950978023, "grad_norm": 0.2997998297214508, "learning_rate": 1.5049377649605906e-06, "loss": 0.4115, "step": 3195 }, { "epoch": 2.3153827577879738, "grad_norm": 0.2893494665622711, "learning_rate": 1.501924269655597e-06, "loss": 0.3574, "step": 3196 }, { "epoch": 2.316107220478145, "grad_norm": 0.30500081181526184, "learning_rate": 1.4989132611641576e-06, "loss": 0.3931, "step": 3197 }, { "epoch": 2.3168316831683167, "grad_norm": 0.3054518401622772, "learning_rate": 1.4959047416268273e-06, "loss": 0.3771, "step": 3198 }, { "epoch": 2.317556145858488, "grad_norm": 0.2741226553916931, "learning_rate": 1.4928987131823874e-06, "loss": 0.3506, "step": 3199 }, { "epoch": 2.3182806085486596, "grad_norm": 0.3043588697910309, "learning_rate": 1.4898951779678517e-06, "loss": 0.373, "step": 3200 }, { "epoch": 2.319005071238831, "grad_norm": 0.2897460460662842, "learning_rate": 1.486894138118462e-06, "loss": 0.3451, "step": 3201 }, { "epoch": 2.3197295339290025, "grad_norm": 0.30427980422973633, "learning_rate": 1.4838955957676815e-06, "loss": 0.3486, "step": 3202 }, { "epoch": 2.320453996619174, "grad_norm": 0.32078394293785095, "learning_rate": 1.480899553047202e-06, "loss": 0.3728, "step": 3203 }, { "epoch": 2.3211784593093454, "grad_norm": 0.3206414580345154, "learning_rate": 1.4779060120869393e-06, "loss": 0.3995, "step": 3204 }, { "epoch": 2.321902921999517, "grad_norm": 0.29610294103622437, "learning_rate": 1.474914975015026e-06, "loss": 0.3461, "step": 3205 }, { "epoch": 2.3226273846896883, "grad_norm": 0.3031742572784424, "learning_rate": 1.471926443957818e-06, "loss": 0.3791, "step": 3206 }, { "epoch": 2.32335184737986, "grad_norm": 0.310345321893692, "learning_rate": 1.4689404210398906e-06, "loss": 0.3564, "step": 3207 }, { "epoch": 2.3240763100700312, "grad_norm": 0.30873802304267883, "learning_rate": 1.465956908384032e-06, "loss": 0.3902, "step": 3208 }, { "epoch": 2.3248007727602027, "grad_norm": 0.31484100222587585, "learning_rate": 1.4629759081112487e-06, "loss": 0.3613, "step": 3209 }, { "epoch": 2.325525235450374, "grad_norm": 0.2933141589164734, "learning_rate": 1.459997422340762e-06, "loss": 0.3974, "step": 3210 }, { "epoch": 2.3262496981405456, "grad_norm": 0.2898856997489929, "learning_rate": 1.4570214531900051e-06, "loss": 0.3859, "step": 3211 }, { "epoch": 2.326974160830717, "grad_norm": 0.2883496582508087, "learning_rate": 1.4540480027746178e-06, "loss": 0.3505, "step": 3212 }, { "epoch": 2.3276986235208885, "grad_norm": 0.3155589997768402, "learning_rate": 1.451077073208455e-06, "loss": 0.3768, "step": 3213 }, { "epoch": 2.32842308621106, "grad_norm": 0.27901092171669006, "learning_rate": 1.4481086666035781e-06, "loss": 0.3533, "step": 3214 }, { "epoch": 2.3291475489012314, "grad_norm": 0.31813353300094604, "learning_rate": 1.445142785070252e-06, "loss": 0.3804, "step": 3215 }, { "epoch": 2.329872011591403, "grad_norm": 0.2975279986858368, "learning_rate": 1.4421794307169495e-06, "loss": 0.372, "step": 3216 }, { "epoch": 2.3305964742815743, "grad_norm": 0.3086436092853546, "learning_rate": 1.4392186056503476e-06, "loss": 0.3575, "step": 3217 }, { "epoch": 2.331320936971746, "grad_norm": 0.31291893124580383, "learning_rate": 1.4362603119753204e-06, "loss": 0.3424, "step": 3218 }, { "epoch": 2.3320453996619173, "grad_norm": 0.3338637351989746, "learning_rate": 1.4333045517949468e-06, "loss": 0.3876, "step": 3219 }, { "epoch": 2.3327698623520887, "grad_norm": 0.3193533718585968, "learning_rate": 1.4303513272105057e-06, "loss": 0.3754, "step": 3220 }, { "epoch": 2.33349432504226, "grad_norm": 0.2744387686252594, "learning_rate": 1.4274006403214669e-06, "loss": 0.3567, "step": 3221 }, { "epoch": 2.3342187877324316, "grad_norm": 0.30775654315948486, "learning_rate": 1.4244524932255026e-06, "loss": 0.3894, "step": 3222 }, { "epoch": 2.334943250422603, "grad_norm": 0.3043455183506012, "learning_rate": 1.421506888018479e-06, "loss": 0.3557, "step": 3223 }, { "epoch": 2.3356677131127745, "grad_norm": 0.284729927778244, "learning_rate": 1.418563826794449e-06, "loss": 0.3849, "step": 3224 }, { "epoch": 2.336392175802946, "grad_norm": 0.28194546699523926, "learning_rate": 1.4156233116456642e-06, "loss": 0.3483, "step": 3225 }, { "epoch": 2.3371166384931175, "grad_norm": 0.2863452434539795, "learning_rate": 1.4126853446625644e-06, "loss": 0.3284, "step": 3226 }, { "epoch": 2.337841101183289, "grad_norm": 0.3280470073223114, "learning_rate": 1.409749927933773e-06, "loss": 0.3885, "step": 3227 }, { "epoch": 2.3385655638734604, "grad_norm": 0.30901038646698, "learning_rate": 1.4068170635461065e-06, "loss": 0.3693, "step": 3228 }, { "epoch": 2.339290026563632, "grad_norm": 0.29390987753868103, "learning_rate": 1.4038867535845664e-06, "loss": 0.3491, "step": 3229 }, { "epoch": 2.3400144892538033, "grad_norm": 0.29974114894866943, "learning_rate": 1.4009590001323325e-06, "loss": 0.3556, "step": 3230 }, { "epoch": 2.3407389519439747, "grad_norm": 0.29232504963874817, "learning_rate": 1.3980338052707737e-06, "loss": 0.361, "step": 3231 }, { "epoch": 2.341463414634146, "grad_norm": 0.2898787260055542, "learning_rate": 1.3951111710794368e-06, "loss": 0.3778, "step": 3232 }, { "epoch": 2.3421878773243177, "grad_norm": 0.29779574275016785, "learning_rate": 1.39219109963605e-06, "loss": 0.3462, "step": 3233 }, { "epoch": 2.342912340014489, "grad_norm": 0.29963093996047974, "learning_rate": 1.3892735930165157e-06, "loss": 0.3929, "step": 3234 }, { "epoch": 2.3436368027046606, "grad_norm": 0.29696446657180786, "learning_rate": 1.3863586532949174e-06, "loss": 0.3852, "step": 3235 }, { "epoch": 2.344361265394832, "grad_norm": 0.31137773394584656, "learning_rate": 1.383446282543513e-06, "loss": 0.4026, "step": 3236 }, { "epoch": 2.3450857280850035, "grad_norm": 0.29839515686035156, "learning_rate": 1.3805364828327305e-06, "loss": 0.351, "step": 3237 }, { "epoch": 2.345810190775175, "grad_norm": 0.3154705762863159, "learning_rate": 1.3776292562311743e-06, "loss": 0.373, "step": 3238 }, { "epoch": 2.3465346534653464, "grad_norm": 0.2797548770904541, "learning_rate": 1.374724604805619e-06, "loss": 0.3552, "step": 3239 }, { "epoch": 2.347259116155518, "grad_norm": 0.28867003321647644, "learning_rate": 1.3718225306210049e-06, "loss": 0.3395, "step": 3240 }, { "epoch": 2.3479835788456893, "grad_norm": 0.31087586283683777, "learning_rate": 1.3689230357404442e-06, "loss": 0.4096, "step": 3241 }, { "epoch": 2.3487080415358608, "grad_norm": 0.29616016149520874, "learning_rate": 1.3660261222252147e-06, "loss": 0.3702, "step": 3242 }, { "epoch": 2.3494325042260322, "grad_norm": 0.28643715381622314, "learning_rate": 1.3631317921347564e-06, "loss": 0.3893, "step": 3243 }, { "epoch": 2.3501569669162037, "grad_norm": 0.25597038865089417, "learning_rate": 1.3602400475266758e-06, "loss": 0.3134, "step": 3244 }, { "epoch": 2.350881429606375, "grad_norm": 0.2990086078643799, "learning_rate": 1.3573508904567412e-06, "loss": 0.3984, "step": 3245 }, { "epoch": 2.3516058922965466, "grad_norm": 0.3028738796710968, "learning_rate": 1.354464322978878e-06, "loss": 0.3599, "step": 3246 }, { "epoch": 2.352330354986718, "grad_norm": 0.3032677173614502, "learning_rate": 1.351580347145174e-06, "loss": 0.3751, "step": 3247 }, { "epoch": 2.3530548176768895, "grad_norm": 0.27845072746276855, "learning_rate": 1.348698965005875e-06, "loss": 0.3512, "step": 3248 }, { "epoch": 2.353779280367061, "grad_norm": 0.29052552580833435, "learning_rate": 1.3458201786093795e-06, "loss": 0.4046, "step": 3249 }, { "epoch": 2.3545037430572324, "grad_norm": 0.2845311462879181, "learning_rate": 1.3429439900022434e-06, "loss": 0.3421, "step": 3250 }, { "epoch": 2.355228205747404, "grad_norm": 0.30509042739868164, "learning_rate": 1.340070401229177e-06, "loss": 0.3776, "step": 3251 }, { "epoch": 2.3559526684375753, "grad_norm": 0.2993945777416229, "learning_rate": 1.337199414333037e-06, "loss": 0.3735, "step": 3252 }, { "epoch": 2.356677131127747, "grad_norm": 0.30217862129211426, "learning_rate": 1.3343310313548351e-06, "loss": 0.3214, "step": 3253 }, { "epoch": 2.3574015938179183, "grad_norm": 0.2825793921947479, "learning_rate": 1.3314652543337319e-06, "loss": 0.3482, "step": 3254 }, { "epoch": 2.3581260565080897, "grad_norm": 0.28752779960632324, "learning_rate": 1.3286020853070341e-06, "loss": 0.3599, "step": 3255 }, { "epoch": 2.358850519198261, "grad_norm": 0.27992162108421326, "learning_rate": 1.3257415263101942e-06, "loss": 0.3691, "step": 3256 }, { "epoch": 2.3595749818884326, "grad_norm": 0.294805645942688, "learning_rate": 1.322883579376807e-06, "loss": 0.3835, "step": 3257 }, { "epoch": 2.360299444578604, "grad_norm": 0.29452231526374817, "learning_rate": 1.3200282465386156e-06, "loss": 0.3699, "step": 3258 }, { "epoch": 2.3610239072687755, "grad_norm": 0.30416205525398254, "learning_rate": 1.3171755298254996e-06, "loss": 0.3856, "step": 3259 }, { "epoch": 2.361748369958947, "grad_norm": 0.2944527864456177, "learning_rate": 1.3143254312654814e-06, "loss": 0.3567, "step": 3260 }, { "epoch": 2.3624728326491184, "grad_norm": 0.3022892475128174, "learning_rate": 1.3114779528847233e-06, "loss": 0.3748, "step": 3261 }, { "epoch": 2.36319729533929, "grad_norm": 0.2949846386909485, "learning_rate": 1.3086330967075233e-06, "loss": 0.3531, "step": 3262 }, { "epoch": 2.3639217580294614, "grad_norm": 0.30715012550354004, "learning_rate": 1.3057908647563133e-06, "loss": 0.3538, "step": 3263 }, { "epoch": 2.364646220719633, "grad_norm": 0.2998347282409668, "learning_rate": 1.3029512590516624e-06, "loss": 0.4029, "step": 3264 }, { "epoch": 2.3653706834098043, "grad_norm": 0.2817631661891937, "learning_rate": 1.3001142816122742e-06, "loss": 0.3704, "step": 3265 }, { "epoch": 2.3660951460999757, "grad_norm": 0.28140994906425476, "learning_rate": 1.297279934454978e-06, "loss": 0.3742, "step": 3266 }, { "epoch": 2.366819608790147, "grad_norm": 0.3053601086139679, "learning_rate": 1.2944482195947384e-06, "loss": 0.3845, "step": 3267 }, { "epoch": 2.3675440714803186, "grad_norm": 0.2647744417190552, "learning_rate": 1.291619139044648e-06, "loss": 0.325, "step": 3268 }, { "epoch": 2.36826853417049, "grad_norm": 0.27679067850112915, "learning_rate": 1.288792694815923e-06, "loss": 0.3746, "step": 3269 }, { "epoch": 2.3689929968606616, "grad_norm": 0.29909345507621765, "learning_rate": 1.28596888891791e-06, "loss": 0.4034, "step": 3270 }, { "epoch": 2.369717459550833, "grad_norm": 0.27254611253738403, "learning_rate": 1.2831477233580792e-06, "loss": 0.3583, "step": 3271 }, { "epoch": 2.3704419222410045, "grad_norm": 0.29536721110343933, "learning_rate": 1.2803292001420191e-06, "loss": 0.3703, "step": 3272 }, { "epoch": 2.371166384931176, "grad_norm": 0.28030315041542053, "learning_rate": 1.277513321273446e-06, "loss": 0.3592, "step": 3273 }, { "epoch": 2.3718908476213474, "grad_norm": 0.3221222162246704, "learning_rate": 1.2747000887541938e-06, "loss": 0.3891, "step": 3274 }, { "epoch": 2.372615310311519, "grad_norm": 0.29005104303359985, "learning_rate": 1.2718895045842123e-06, "loss": 0.3668, "step": 3275 }, { "epoch": 2.3733397730016903, "grad_norm": 0.3031136393547058, "learning_rate": 1.2690815707615727e-06, "loss": 0.4107, "step": 3276 }, { "epoch": 2.3740642356918618, "grad_norm": 0.28114843368530273, "learning_rate": 1.2662762892824621e-06, "loss": 0.343, "step": 3277 }, { "epoch": 2.374788698382033, "grad_norm": 0.29788738489151, "learning_rate": 1.263473662141177e-06, "loss": 0.385, "step": 3278 }, { "epoch": 2.3755131610722047, "grad_norm": 0.3035779893398285, "learning_rate": 1.2606736913301322e-06, "loss": 0.3908, "step": 3279 }, { "epoch": 2.376237623762376, "grad_norm": 0.31409773230552673, "learning_rate": 1.2578763788398534e-06, "loss": 0.3998, "step": 3280 }, { "epoch": 2.3769620864525476, "grad_norm": 0.3042820394039154, "learning_rate": 1.2550817266589726e-06, "loss": 0.3691, "step": 3281 }, { "epoch": 2.377686549142719, "grad_norm": 0.29068177938461304, "learning_rate": 1.2522897367742337e-06, "loss": 0.3305, "step": 3282 }, { "epoch": 2.3784110118328905, "grad_norm": 0.3154565393924713, "learning_rate": 1.2495004111704889e-06, "loss": 0.3922, "step": 3283 }, { "epoch": 2.379135474523062, "grad_norm": 0.2832374572753906, "learning_rate": 1.2467137518306955e-06, "loss": 0.3378, "step": 3284 }, { "epoch": 2.3798599372132334, "grad_norm": 0.2905344069004059, "learning_rate": 1.2439297607359118e-06, "loss": 0.3542, "step": 3285 }, { "epoch": 2.380584399903405, "grad_norm": 0.309023380279541, "learning_rate": 1.241148439865304e-06, "loss": 0.3661, "step": 3286 }, { "epoch": 2.3813088625935763, "grad_norm": 0.3199666440486908, "learning_rate": 1.238369791196139e-06, "loss": 0.3508, "step": 3287 }, { "epoch": 2.382033325283748, "grad_norm": 0.33571797609329224, "learning_rate": 1.2355938167037811e-06, "loss": 0.3988, "step": 3288 }, { "epoch": 2.3827577879739192, "grad_norm": 0.28604820370674133, "learning_rate": 1.2328205183616964e-06, "loss": 0.3367, "step": 3289 }, { "epoch": 2.3834822506640907, "grad_norm": 0.30539658665657043, "learning_rate": 1.2300498981414495e-06, "loss": 0.3705, "step": 3290 }, { "epoch": 2.384206713354262, "grad_norm": 0.29140686988830566, "learning_rate": 1.227281958012696e-06, "loss": 0.3787, "step": 3291 }, { "epoch": 2.3849311760444336, "grad_norm": 0.2700766623020172, "learning_rate": 1.2245166999431913e-06, "loss": 0.3305, "step": 3292 }, { "epoch": 2.385655638734605, "grad_norm": 0.33261531591415405, "learning_rate": 1.221754125898783e-06, "loss": 0.3735, "step": 3293 }, { "epoch": 2.3863801014247765, "grad_norm": 0.310781866312027, "learning_rate": 1.2189942378434083e-06, "loss": 0.346, "step": 3294 }, { "epoch": 2.387104564114948, "grad_norm": 0.3099566400051117, "learning_rate": 1.216237037739097e-06, "loss": 0.4142, "step": 3295 }, { "epoch": 2.3878290268051194, "grad_norm": 0.31747767329216003, "learning_rate": 1.2134825275459688e-06, "loss": 0.3831, "step": 3296 }, { "epoch": 2.388553489495291, "grad_norm": 0.2875758409500122, "learning_rate": 1.2107307092222276e-06, "loss": 0.3426, "step": 3297 }, { "epoch": 2.3892779521854624, "grad_norm": 0.29223325848579407, "learning_rate": 1.2079815847241666e-06, "loss": 0.3424, "step": 3298 }, { "epoch": 2.390002414875634, "grad_norm": 0.29394739866256714, "learning_rate": 1.2052351560061653e-06, "loss": 0.3497, "step": 3299 }, { "epoch": 2.3907268775658053, "grad_norm": 0.31756502389907837, "learning_rate": 1.2024914250206814e-06, "loss": 0.3825, "step": 3300 }, { "epoch": 2.3914513402559767, "grad_norm": 0.3187260031700134, "learning_rate": 1.1997503937182592e-06, "loss": 0.4052, "step": 3301 }, { "epoch": 2.392175802946148, "grad_norm": 0.30793526768684387, "learning_rate": 1.1970120640475241e-06, "loss": 0.3481, "step": 3302 }, { "epoch": 2.3929002656363196, "grad_norm": 0.3037872016429901, "learning_rate": 1.194276437955177e-06, "loss": 0.3614, "step": 3303 }, { "epoch": 2.393624728326491, "grad_norm": 0.2991044521331787, "learning_rate": 1.1915435173859996e-06, "loss": 0.3539, "step": 3304 }, { "epoch": 2.3943491910166625, "grad_norm": 0.3017299175262451, "learning_rate": 1.188813304282851e-06, "loss": 0.3874, "step": 3305 }, { "epoch": 2.395073653706834, "grad_norm": 0.319206565618515, "learning_rate": 1.1860858005866642e-06, "loss": 0.407, "step": 3306 }, { "epoch": 2.3957981163970055, "grad_norm": 0.2983119487762451, "learning_rate": 1.1833610082364444e-06, "loss": 0.3521, "step": 3307 }, { "epoch": 2.396522579087177, "grad_norm": 0.2967032492160797, "learning_rate": 1.1806389291692722e-06, "loss": 0.3732, "step": 3308 }, { "epoch": 2.3972470417773484, "grad_norm": 0.27948880195617676, "learning_rate": 1.1779195653202997e-06, "loss": 0.3478, "step": 3309 }, { "epoch": 2.39797150446752, "grad_norm": 0.2921161949634552, "learning_rate": 1.1752029186227443e-06, "loss": 0.3685, "step": 3310 }, { "epoch": 2.3986959671576913, "grad_norm": 0.2820241153240204, "learning_rate": 1.1724889910078952e-06, "loss": 0.3273, "step": 3311 }, { "epoch": 2.3994204298478627, "grad_norm": 0.32392311096191406, "learning_rate": 1.1697777844051105e-06, "loss": 0.3927, "step": 3312 }, { "epoch": 2.400144892538034, "grad_norm": 0.29271793365478516, "learning_rate": 1.1670693007418082e-06, "loss": 0.3401, "step": 3313 }, { "epoch": 2.4008693552282057, "grad_norm": 0.3043597340583801, "learning_rate": 1.164363541943475e-06, "loss": 0.42, "step": 3314 }, { "epoch": 2.401593817918377, "grad_norm": 0.29617804288864136, "learning_rate": 1.161660509933661e-06, "loss": 0.3824, "step": 3315 }, { "epoch": 2.4023182806085486, "grad_norm": 0.2966325283050537, "learning_rate": 1.1589602066339723e-06, "loss": 0.3672, "step": 3316 }, { "epoch": 2.40304274329872, "grad_norm": 0.32221323251724243, "learning_rate": 1.1562626339640814e-06, "loss": 0.3825, "step": 3317 }, { "epoch": 2.4037672059888915, "grad_norm": 0.28352823853492737, "learning_rate": 1.1535677938417178e-06, "loss": 0.3465, "step": 3318 }, { "epoch": 2.404491668679063, "grad_norm": 0.31630992889404297, "learning_rate": 1.150875688182665e-06, "loss": 0.39, "step": 3319 }, { "epoch": 2.4052161313692344, "grad_norm": 0.29360324144363403, "learning_rate": 1.1481863189007664e-06, "loss": 0.368, "step": 3320 }, { "epoch": 2.405940594059406, "grad_norm": 0.2824745178222656, "learning_rate": 1.1454996879079205e-06, "loss": 0.3381, "step": 3321 }, { "epoch": 2.4066650567495773, "grad_norm": 0.3035656213760376, "learning_rate": 1.142815797114074e-06, "loss": 0.3427, "step": 3322 }, { "epoch": 2.4073895194397488, "grad_norm": 0.31392258405685425, "learning_rate": 1.1401346484272314e-06, "loss": 0.3963, "step": 3323 }, { "epoch": 2.4081139821299202, "grad_norm": 0.32229167222976685, "learning_rate": 1.1374562437534465e-06, "loss": 0.3733, "step": 3324 }, { "epoch": 2.4088384448200917, "grad_norm": 0.31112992763519287, "learning_rate": 1.1347805849968184e-06, "loss": 0.356, "step": 3325 }, { "epoch": 2.409562907510263, "grad_norm": 0.320508748292923, "learning_rate": 1.1321076740594984e-06, "loss": 0.3775, "step": 3326 }, { "epoch": 2.4102873702004346, "grad_norm": 0.3117820918560028, "learning_rate": 1.129437512841683e-06, "loss": 0.3811, "step": 3327 }, { "epoch": 2.411011832890606, "grad_norm": 0.3124529719352722, "learning_rate": 1.1267701032416151e-06, "loss": 0.3737, "step": 3328 }, { "epoch": 2.4117362955807775, "grad_norm": 0.2852548658847809, "learning_rate": 1.1241054471555779e-06, "loss": 0.3842, "step": 3329 }, { "epoch": 2.412460758270949, "grad_norm": 0.2902745008468628, "learning_rate": 1.1214435464779006e-06, "loss": 0.3714, "step": 3330 }, { "epoch": 2.4131852209611204, "grad_norm": 0.3041691482067108, "learning_rate": 1.1187844031009533e-06, "loss": 0.3826, "step": 3331 }, { "epoch": 2.413909683651292, "grad_norm": 0.28609368205070496, "learning_rate": 1.116128018915143e-06, "loss": 0.355, "step": 3332 }, { "epoch": 2.4146341463414633, "grad_norm": 0.28930920362472534, "learning_rate": 1.1134743958089177e-06, "loss": 0.3506, "step": 3333 }, { "epoch": 2.415358609031635, "grad_norm": 0.2961498200893402, "learning_rate": 1.1108235356687646e-06, "loss": 0.3744, "step": 3334 }, { "epoch": 2.4160830717218063, "grad_norm": 0.29633253812789917, "learning_rate": 1.1081754403792e-06, "loss": 0.3914, "step": 3335 }, { "epoch": 2.4168075344119777, "grad_norm": 0.2876109182834625, "learning_rate": 1.1055301118227806e-06, "loss": 0.3728, "step": 3336 }, { "epoch": 2.417531997102149, "grad_norm": 0.2929481863975525, "learning_rate": 1.1028875518800957e-06, "loss": 0.3613, "step": 3337 }, { "epoch": 2.4182564597923206, "grad_norm": 0.3018573820590973, "learning_rate": 1.100247762429762e-06, "loss": 0.3488, "step": 3338 }, { "epoch": 2.418980922482492, "grad_norm": 0.2665429413318634, "learning_rate": 1.0976107453484314e-06, "loss": 0.3279, "step": 3339 }, { "epoch": 2.4197053851726635, "grad_norm": 0.31613990664482117, "learning_rate": 1.0949765025107844e-06, "loss": 0.4025, "step": 3340 }, { "epoch": 2.420429847862835, "grad_norm": 0.2760581374168396, "learning_rate": 1.0923450357895248e-06, "loss": 0.3426, "step": 3341 }, { "epoch": 2.4211543105530065, "grad_norm": 0.2916056215763092, "learning_rate": 1.089716347055388e-06, "loss": 0.3994, "step": 3342 }, { "epoch": 2.421878773243178, "grad_norm": 0.29035359621047974, "learning_rate": 1.087090438177133e-06, "loss": 0.3895, "step": 3343 }, { "epoch": 2.4226032359333494, "grad_norm": 0.2912515699863434, "learning_rate": 1.0844673110215403e-06, "loss": 0.3488, "step": 3344 }, { "epoch": 2.423327698623521, "grad_norm": 0.2838703989982605, "learning_rate": 1.0818469674534154e-06, "loss": 0.368, "step": 3345 }, { "epoch": 2.4240521613136923, "grad_norm": 0.2859722375869751, "learning_rate": 1.0792294093355843e-06, "loss": 0.3812, "step": 3346 }, { "epoch": 2.4247766240038637, "grad_norm": 0.2788096070289612, "learning_rate": 1.0766146385288939e-06, "loss": 0.3467, "step": 3347 }, { "epoch": 2.425501086694035, "grad_norm": 0.28026482462882996, "learning_rate": 1.0740026568922058e-06, "loss": 0.3652, "step": 3348 }, { "epoch": 2.4262255493842066, "grad_norm": 0.29308342933654785, "learning_rate": 1.0713934662824038e-06, "loss": 0.3467, "step": 3349 }, { "epoch": 2.426950012074378, "grad_norm": 0.3069414794445038, "learning_rate": 1.0687870685543838e-06, "loss": 0.3571, "step": 3350 }, { "epoch": 2.4276744747645496, "grad_norm": 0.26987555623054504, "learning_rate": 1.0661834655610565e-06, "loss": 0.391, "step": 3351 }, { "epoch": 2.428398937454721, "grad_norm": 0.29768452048301697, "learning_rate": 1.0635826591533472e-06, "loss": 0.373, "step": 3352 }, { "epoch": 2.4291234001448925, "grad_norm": 0.2939329445362091, "learning_rate": 1.0609846511801946e-06, "loss": 0.3307, "step": 3353 }, { "epoch": 2.429847862835064, "grad_norm": 0.2661828100681305, "learning_rate": 1.0583894434885434e-06, "loss": 0.3469, "step": 3354 }, { "epoch": 2.4305723255252354, "grad_norm": 0.27996954321861267, "learning_rate": 1.0557970379233507e-06, "loss": 0.3861, "step": 3355 }, { "epoch": 2.431296788215407, "grad_norm": 0.312032014131546, "learning_rate": 1.0532074363275813e-06, "loss": 0.4039, "step": 3356 }, { "epoch": 2.4320212509055783, "grad_norm": 0.2963099479675293, "learning_rate": 1.050620640542208e-06, "loss": 0.3567, "step": 3357 }, { "epoch": 2.4327457135957498, "grad_norm": 0.3104957938194275, "learning_rate": 1.0480366524062041e-06, "loss": 0.3549, "step": 3358 }, { "epoch": 2.433470176285921, "grad_norm": 0.27946048974990845, "learning_rate": 1.045455473756552e-06, "loss": 0.3171, "step": 3359 }, { "epoch": 2.4341946389760927, "grad_norm": 0.2773331105709076, "learning_rate": 1.0428771064282357e-06, "loss": 0.3381, "step": 3360 }, { "epoch": 2.434919101666264, "grad_norm": 0.29945865273475647, "learning_rate": 1.040301552254237e-06, "loss": 0.3712, "step": 3361 }, { "epoch": 2.4356435643564356, "grad_norm": 0.313921183347702, "learning_rate": 1.0377288130655422e-06, "loss": 0.4209, "step": 3362 }, { "epoch": 2.436368027046607, "grad_norm": 0.29777559638023376, "learning_rate": 1.035158890691136e-06, "loss": 0.3803, "step": 3363 }, { "epoch": 2.4370924897367785, "grad_norm": 0.291789710521698, "learning_rate": 1.0325917869579965e-06, "loss": 0.3359, "step": 3364 }, { "epoch": 2.43781695242695, "grad_norm": 0.2811073958873749, "learning_rate": 1.0300275036911029e-06, "loss": 0.3492, "step": 3365 }, { "epoch": 2.4385414151171214, "grad_norm": 0.28287532925605774, "learning_rate": 1.027466042713428e-06, "loss": 0.334, "step": 3366 }, { "epoch": 2.439265877807293, "grad_norm": 0.3587968647480011, "learning_rate": 1.0249074058459346e-06, "loss": 0.3522, "step": 3367 }, { "epoch": 2.4399903404974643, "grad_norm": 0.3049441874027252, "learning_rate": 1.022351594907583e-06, "loss": 0.3411, "step": 3368 }, { "epoch": 2.440714803187636, "grad_norm": 0.3130398392677307, "learning_rate": 1.0197986117153224e-06, "loss": 0.3956, "step": 3369 }, { "epoch": 2.4414392658778072, "grad_norm": 0.31091946363449097, "learning_rate": 1.0172484580840896e-06, "loss": 0.3856, "step": 3370 }, { "epoch": 2.4421637285679787, "grad_norm": 0.31107738614082336, "learning_rate": 1.0147011358268132e-06, "loss": 0.3787, "step": 3371 }, { "epoch": 2.44288819125815, "grad_norm": 0.2845331132411957, "learning_rate": 1.0121566467544085e-06, "loss": 0.3421, "step": 3372 }, { "epoch": 2.4436126539483216, "grad_norm": 0.2968955934047699, "learning_rate": 1.009614992675773e-06, "loss": 0.3932, "step": 3373 }, { "epoch": 2.444337116638493, "grad_norm": 0.2856278121471405, "learning_rate": 1.007076175397793e-06, "loss": 0.3711, "step": 3374 }, { "epoch": 2.4450615793286645, "grad_norm": 0.26430216431617737, "learning_rate": 1.0045401967253382e-06, "loss": 0.3139, "step": 3375 }, { "epoch": 2.445786042018836, "grad_norm": 0.2801792025566101, "learning_rate": 1.0020070584612557e-06, "loss": 0.3785, "step": 3376 }, { "epoch": 2.4465105047090074, "grad_norm": 0.28522899746894836, "learning_rate": 9.994767624063773e-07, "loss": 0.3402, "step": 3377 }, { "epoch": 2.447234967399179, "grad_norm": 0.27791887521743774, "learning_rate": 9.969493103595141e-07, "loss": 0.3848, "step": 3378 }, { "epoch": 2.4479594300893504, "grad_norm": 0.27929866313934326, "learning_rate": 9.944247041174549e-07, "loss": 0.346, "step": 3379 }, { "epoch": 2.448683892779522, "grad_norm": 0.31923648715019226, "learning_rate": 9.919029454749635e-07, "loss": 0.362, "step": 3380 }, { "epoch": 2.4494083554696933, "grad_norm": 0.2800285518169403, "learning_rate": 9.893840362247809e-07, "loss": 0.3747, "step": 3381 }, { "epoch": 2.4501328181598647, "grad_norm": 0.2903733253479004, "learning_rate": 9.868679781576245e-07, "loss": 0.3394, "step": 3382 }, { "epoch": 2.450857280850036, "grad_norm": 0.3213966190814972, "learning_rate": 9.843547730621805e-07, "loss": 0.3557, "step": 3383 }, { "epoch": 2.4515817435402076, "grad_norm": 0.29545387625694275, "learning_rate": 9.81844422725109e-07, "loss": 0.3957, "step": 3384 }, { "epoch": 2.452306206230379, "grad_norm": 0.28733837604522705, "learning_rate": 9.793369289310427e-07, "loss": 0.3624, "step": 3385 }, { "epoch": 2.4530306689205505, "grad_norm": 0.28439661860466003, "learning_rate": 9.768322934625796e-07, "loss": 0.3539, "step": 3386 }, { "epoch": 2.453755131610722, "grad_norm": 0.3201664388179779, "learning_rate": 9.743305181002877e-07, "loss": 0.3981, "step": 3387 }, { "epoch": 2.4544795943008935, "grad_norm": 0.3002682626247406, "learning_rate": 9.718316046227033e-07, "loss": 0.3789, "step": 3388 }, { "epoch": 2.455204056991065, "grad_norm": 0.29446837306022644, "learning_rate": 9.69335554806325e-07, "loss": 0.3529, "step": 3389 }, { "epoch": 2.4559285196812364, "grad_norm": 0.2951369881629944, "learning_rate": 9.668423704256164e-07, "loss": 0.3778, "step": 3390 }, { "epoch": 2.456652982371408, "grad_norm": 0.28269490599632263, "learning_rate": 9.643520532530075e-07, "loss": 0.3461, "step": 3391 }, { "epoch": 2.4573774450615793, "grad_norm": 0.2875156104564667, "learning_rate": 9.618646050588848e-07, "loss": 0.3906, "step": 3392 }, { "epoch": 2.4581019077517507, "grad_norm": 0.28663545846939087, "learning_rate": 9.593800276115978e-07, "loss": 0.3421, "step": 3393 }, { "epoch": 2.458826370441922, "grad_norm": 0.30554816126823425, "learning_rate": 9.56898322677457e-07, "loss": 0.3566, "step": 3394 }, { "epoch": 2.4595508331320937, "grad_norm": 0.29329580068588257, "learning_rate": 9.544194920207262e-07, "loss": 0.3706, "step": 3395 }, { "epoch": 2.460275295822265, "grad_norm": 0.2910032868385315, "learning_rate": 9.519435374036295e-07, "loss": 0.3821, "step": 3396 }, { "epoch": 2.4609997585124366, "grad_norm": 0.28988662362098694, "learning_rate": 9.494704605863458e-07, "loss": 0.331, "step": 3397 }, { "epoch": 2.461724221202608, "grad_norm": 0.31466954946517944, "learning_rate": 9.470002633270087e-07, "loss": 0.3695, "step": 3398 }, { "epoch": 2.4624486838927795, "grad_norm": 0.30237072706222534, "learning_rate": 9.445329473817006e-07, "loss": 0.3605, "step": 3399 }, { "epoch": 2.463173146582951, "grad_norm": 0.27700209617614746, "learning_rate": 9.420685145044606e-07, "loss": 0.3385, "step": 3400 }, { "epoch": 2.4638976092731224, "grad_norm": 0.29325786232948303, "learning_rate": 9.396069664472774e-07, "loss": 0.3884, "step": 3401 }, { "epoch": 2.464622071963294, "grad_norm": 0.2837594151496887, "learning_rate": 9.371483049600849e-07, "loss": 0.3721, "step": 3402 }, { "epoch": 2.4653465346534653, "grad_norm": 0.30143335461616516, "learning_rate": 9.34692531790769e-07, "loss": 0.3551, "step": 3403 }, { "epoch": 2.4660709973436368, "grad_norm": 0.3037964403629303, "learning_rate": 9.322396486851626e-07, "loss": 0.3823, "step": 3404 }, { "epoch": 2.4667954600338082, "grad_norm": 0.3021060526371002, "learning_rate": 9.297896573870397e-07, "loss": 0.3384, "step": 3405 }, { "epoch": 2.4675199227239797, "grad_norm": 0.28995281457901, "learning_rate": 9.273425596381224e-07, "loss": 0.356, "step": 3406 }, { "epoch": 2.468244385414151, "grad_norm": 0.2692264914512634, "learning_rate": 9.248983571780757e-07, "loss": 0.3712, "step": 3407 }, { "epoch": 2.4689688481043226, "grad_norm": 0.2966023087501526, "learning_rate": 9.224570517445025e-07, "loss": 0.3804, "step": 3408 }, { "epoch": 2.469693310794494, "grad_norm": 0.2960933744907379, "learning_rate": 9.200186450729509e-07, "loss": 0.4016, "step": 3409 }, { "epoch": 2.4704177734846655, "grad_norm": 0.29851382970809937, "learning_rate": 9.175831388969064e-07, "loss": 0.4053, "step": 3410 }, { "epoch": 2.471142236174837, "grad_norm": 0.2757416069507599, "learning_rate": 9.151505349477901e-07, "loss": 0.3168, "step": 3411 }, { "epoch": 2.4718666988650084, "grad_norm": 0.31360068917274475, "learning_rate": 9.127208349549637e-07, "loss": 0.3541, "step": 3412 }, { "epoch": 2.47259116155518, "grad_norm": 0.302419513463974, "learning_rate": 9.102940406457239e-07, "loss": 0.3735, "step": 3413 }, { "epoch": 2.4733156242453513, "grad_norm": 0.29200810194015503, "learning_rate": 9.078701537452978e-07, "loss": 0.3893, "step": 3414 }, { "epoch": 2.474040086935523, "grad_norm": 0.2730301320552826, "learning_rate": 9.054491759768497e-07, "loss": 0.3539, "step": 3415 }, { "epoch": 2.4747645496256943, "grad_norm": 0.28803661465644836, "learning_rate": 9.030311090614763e-07, "loss": 0.3682, "step": 3416 }, { "epoch": 2.4754890123158657, "grad_norm": 0.2842257618904114, "learning_rate": 9.006159547182003e-07, "loss": 0.3456, "step": 3417 }, { "epoch": 2.476213475006037, "grad_norm": 0.2939983904361725, "learning_rate": 8.982037146639783e-07, "loss": 0.3532, "step": 3418 }, { "epoch": 2.4769379376962086, "grad_norm": 0.29272669553756714, "learning_rate": 8.957943906136924e-07, "loss": 0.3737, "step": 3419 }, { "epoch": 2.47766240038638, "grad_norm": 0.3172145485877991, "learning_rate": 8.933879842801558e-07, "loss": 0.3905, "step": 3420 }, { "epoch": 2.4783868630765515, "grad_norm": 0.26388922333717346, "learning_rate": 8.909844973740999e-07, "loss": 0.3431, "step": 3421 }, { "epoch": 2.479111325766723, "grad_norm": 0.30871644616127014, "learning_rate": 8.885839316041883e-07, "loss": 0.4054, "step": 3422 }, { "epoch": 2.4798357884568945, "grad_norm": 0.28339773416519165, "learning_rate": 8.861862886770045e-07, "loss": 0.3609, "step": 3423 }, { "epoch": 2.480560251147066, "grad_norm": 0.28404831886291504, "learning_rate": 8.837915702970534e-07, "loss": 0.3847, "step": 3424 }, { "epoch": 2.4812847138372374, "grad_norm": 0.2528369724750519, "learning_rate": 8.813997781667621e-07, "loss": 0.345, "step": 3425 }, { "epoch": 2.482009176527409, "grad_norm": 0.28055939078330994, "learning_rate": 8.790109139864789e-07, "loss": 0.3818, "step": 3426 }, { "epoch": 2.4827336392175803, "grad_norm": 0.28202369809150696, "learning_rate": 8.766249794544662e-07, "loss": 0.3309, "step": 3427 }, { "epoch": 2.4834581019077517, "grad_norm": 0.32646507024765015, "learning_rate": 8.74241976266908e-07, "loss": 0.4158, "step": 3428 }, { "epoch": 2.484182564597923, "grad_norm": 0.2951858341693878, "learning_rate": 8.718619061179029e-07, "loss": 0.3482, "step": 3429 }, { "epoch": 2.4849070272880946, "grad_norm": 0.29261431097984314, "learning_rate": 8.69484770699463e-07, "loss": 0.3757, "step": 3430 }, { "epoch": 2.485631489978266, "grad_norm": 0.28942009806632996, "learning_rate": 8.67110571701516e-07, "loss": 0.3401, "step": 3431 }, { "epoch": 2.4863559526684376, "grad_norm": 0.2776653468608856, "learning_rate": 8.647393108119023e-07, "loss": 0.3487, "step": 3432 }, { "epoch": 2.487080415358609, "grad_norm": 0.30603858828544617, "learning_rate": 8.623709897163707e-07, "loss": 0.3834, "step": 3433 }, { "epoch": 2.4878048780487805, "grad_norm": 0.31046709418296814, "learning_rate": 8.600056100985831e-07, "loss": 0.3581, "step": 3434 }, { "epoch": 2.488529340738952, "grad_norm": 0.28140559792518616, "learning_rate": 8.576431736401098e-07, "loss": 0.3366, "step": 3435 }, { "epoch": 2.4892538034291234, "grad_norm": 0.2971031367778778, "learning_rate": 8.552836820204269e-07, "loss": 0.3647, "step": 3436 }, { "epoch": 2.489978266119295, "grad_norm": 0.2907017171382904, "learning_rate": 8.529271369169178e-07, "loss": 0.3583, "step": 3437 }, { "epoch": 2.4907027288094663, "grad_norm": 0.26926878094673157, "learning_rate": 8.505735400048748e-07, "loss": 0.3344, "step": 3438 }, { "epoch": 2.4914271914996378, "grad_norm": 0.3089801073074341, "learning_rate": 8.482228929574876e-07, "loss": 0.3841, "step": 3439 }, { "epoch": 2.492151654189809, "grad_norm": 0.2763574421405792, "learning_rate": 8.458751974458534e-07, "loss": 0.3822, "step": 3440 }, { "epoch": 2.4928761168799807, "grad_norm": 0.29954373836517334, "learning_rate": 8.435304551389706e-07, "loss": 0.3575, "step": 3441 }, { "epoch": 2.493600579570152, "grad_norm": 0.2962022125720978, "learning_rate": 8.411886677037401e-07, "loss": 0.3501, "step": 3442 }, { "epoch": 2.4943250422603236, "grad_norm": 0.29330331087112427, "learning_rate": 8.388498368049541e-07, "loss": 0.3671, "step": 3443 }, { "epoch": 2.495049504950495, "grad_norm": 0.2885951101779938, "learning_rate": 8.365139641053121e-07, "loss": 0.3764, "step": 3444 }, { "epoch": 2.4957739676406665, "grad_norm": 0.28565865755081177, "learning_rate": 8.341810512654075e-07, "loss": 0.3631, "step": 3445 }, { "epoch": 2.496498430330838, "grad_norm": 0.27491509914398193, "learning_rate": 8.318510999437263e-07, "loss": 0.3225, "step": 3446 }, { "epoch": 2.4972228930210094, "grad_norm": 0.29099661111831665, "learning_rate": 8.29524111796654e-07, "loss": 0.3657, "step": 3447 }, { "epoch": 2.497947355711181, "grad_norm": 0.28548887372016907, "learning_rate": 8.272000884784675e-07, "loss": 0.3528, "step": 3448 }, { "epoch": 2.4986718184013523, "grad_norm": 0.28425678610801697, "learning_rate": 8.248790316413363e-07, "loss": 0.3639, "step": 3449 }, { "epoch": 2.499396281091524, "grad_norm": 0.2949208915233612, "learning_rate": 8.225609429353187e-07, "loss": 0.3695, "step": 3450 }, { "epoch": 2.5001207437816952, "grad_norm": 0.2972923815250397, "learning_rate": 8.202458240083655e-07, "loss": 0.3795, "step": 3451 }, { "epoch": 2.5008452064718667, "grad_norm": 0.30386316776275635, "learning_rate": 8.179336765063179e-07, "loss": 0.343, "step": 3452 }, { "epoch": 2.501569669162038, "grad_norm": 0.30714988708496094, "learning_rate": 8.156245020728986e-07, "loss": 0.3779, "step": 3453 }, { "epoch": 2.5022941318522096, "grad_norm": 0.28088104724884033, "learning_rate": 8.133183023497232e-07, "loss": 0.3727, "step": 3454 }, { "epoch": 2.503018594542381, "grad_norm": 0.2845723032951355, "learning_rate": 8.1101507897629e-07, "loss": 0.351, "step": 3455 }, { "epoch": 2.5037430572325525, "grad_norm": 0.2908911406993866, "learning_rate": 8.087148335899786e-07, "loss": 0.3698, "step": 3456 }, { "epoch": 2.504467519922724, "grad_norm": 0.27606722712516785, "learning_rate": 8.064175678260566e-07, "loss": 0.3723, "step": 3457 }, { "epoch": 2.5051919826128954, "grad_norm": 0.3368840217590332, "learning_rate": 8.041232833176716e-07, "loss": 0.4025, "step": 3458 }, { "epoch": 2.505916445303067, "grad_norm": 0.2882290780544281, "learning_rate": 8.018319816958481e-07, "loss": 0.3613, "step": 3459 }, { "epoch": 2.5066409079932384, "grad_norm": 0.2886197566986084, "learning_rate": 7.995436645894949e-07, "loss": 0.3789, "step": 3460 }, { "epoch": 2.50736537068341, "grad_norm": 0.2879616916179657, "learning_rate": 7.97258333625398e-07, "loss": 0.3566, "step": 3461 }, { "epoch": 2.5080898333735813, "grad_norm": 0.29505425691604614, "learning_rate": 7.949759904282184e-07, "loss": 0.3751, "step": 3462 }, { "epoch": 2.5088142960637527, "grad_norm": 0.2765631079673767, "learning_rate": 7.926966366204946e-07, "loss": 0.3495, "step": 3463 }, { "epoch": 2.509538758753924, "grad_norm": 0.2895795702934265, "learning_rate": 7.904202738226408e-07, "loss": 0.3962, "step": 3464 }, { "epoch": 2.5102632214440956, "grad_norm": 0.2715228796005249, "learning_rate": 7.881469036529427e-07, "loss": 0.3698, "step": 3465 }, { "epoch": 2.510987684134267, "grad_norm": 0.30324167013168335, "learning_rate": 7.858765277275593e-07, "loss": 0.35, "step": 3466 }, { "epoch": 2.5117121468244386, "grad_norm": 0.32641661167144775, "learning_rate": 7.836091476605234e-07, "loss": 0.3803, "step": 3467 }, { "epoch": 2.51243660951461, "grad_norm": 0.2930084764957428, "learning_rate": 7.81344765063734e-07, "loss": 0.3659, "step": 3468 }, { "epoch": 2.5131610722047815, "grad_norm": 0.3056747615337372, "learning_rate": 7.790833815469612e-07, "loss": 0.3807, "step": 3469 }, { "epoch": 2.513885534894953, "grad_norm": 0.2834494709968567, "learning_rate": 7.768249987178433e-07, "loss": 0.3761, "step": 3470 }, { "epoch": 2.5146099975851244, "grad_norm": 0.30275416374206543, "learning_rate": 7.745696181818862e-07, "loss": 0.3845, "step": 3471 }, { "epoch": 2.515334460275296, "grad_norm": 0.3517991602420807, "learning_rate": 7.723172415424579e-07, "loss": 0.3441, "step": 3472 }, { "epoch": 2.5160589229654673, "grad_norm": 0.31282782554626465, "learning_rate": 7.700678704007947e-07, "loss": 0.3783, "step": 3473 }, { "epoch": 2.5167833856556387, "grad_norm": 0.3039097487926483, "learning_rate": 7.678215063559957e-07, "loss": 0.3498, "step": 3474 }, { "epoch": 2.51750784834581, "grad_norm": 0.2684783935546875, "learning_rate": 7.655781510050186e-07, "loss": 0.3667, "step": 3475 }, { "epoch": 2.5182323110359817, "grad_norm": 0.27138060331344604, "learning_rate": 7.633378059426871e-07, "loss": 0.3582, "step": 3476 }, { "epoch": 2.518956773726153, "grad_norm": 0.28184014558792114, "learning_rate": 7.611004727616833e-07, "loss": 0.3714, "step": 3477 }, { "epoch": 2.5196812364163246, "grad_norm": 0.29267585277557373, "learning_rate": 7.588661530525448e-07, "loss": 0.3429, "step": 3478 }, { "epoch": 2.520405699106496, "grad_norm": 0.29910796880722046, "learning_rate": 7.566348484036712e-07, "loss": 0.3768, "step": 3479 }, { "epoch": 2.5211301617966675, "grad_norm": 0.3063623905181885, "learning_rate": 7.544065604013184e-07, "loss": 0.402, "step": 3480 }, { "epoch": 2.521854624486839, "grad_norm": 0.2842179238796234, "learning_rate": 7.521812906295938e-07, "loss": 0.3325, "step": 3481 }, { "epoch": 2.5225790871770104, "grad_norm": 0.2941224277019501, "learning_rate": 7.499590406704632e-07, "loss": 0.3322, "step": 3482 }, { "epoch": 2.523303549867182, "grad_norm": 0.31096890568733215, "learning_rate": 7.477398121037449e-07, "loss": 0.3916, "step": 3483 }, { "epoch": 2.5240280125573533, "grad_norm": 0.28993088006973267, "learning_rate": 7.455236065071064e-07, "loss": 0.3902, "step": 3484 }, { "epoch": 2.5247524752475248, "grad_norm": 0.28452277183532715, "learning_rate": 7.433104254560692e-07, "loss": 0.3596, "step": 3485 }, { "epoch": 2.5254769379376962, "grad_norm": 0.2681162357330322, "learning_rate": 7.41100270524005e-07, "loss": 0.3111, "step": 3486 }, { "epoch": 2.5262014006278677, "grad_norm": 0.3120240867137909, "learning_rate": 7.388931432821306e-07, "loss": 0.3934, "step": 3487 }, { "epoch": 2.526925863318039, "grad_norm": 0.30118033289909363, "learning_rate": 7.366890452995134e-07, "loss": 0.3328, "step": 3488 }, { "epoch": 2.5276503260082106, "grad_norm": 0.29532748460769653, "learning_rate": 7.34487978143068e-07, "loss": 0.3764, "step": 3489 }, { "epoch": 2.528374788698382, "grad_norm": 0.3086693584918976, "learning_rate": 7.322899433775504e-07, "loss": 0.4243, "step": 3490 }, { "epoch": 2.5290992513885535, "grad_norm": 0.2798154950141907, "learning_rate": 7.300949425655651e-07, "loss": 0.3062, "step": 3491 }, { "epoch": 2.529823714078725, "grad_norm": 0.3087565302848816, "learning_rate": 7.279029772675572e-07, "loss": 0.3939, "step": 3492 }, { "epoch": 2.5305481767688964, "grad_norm": 0.27346161007881165, "learning_rate": 7.257140490418157e-07, "loss": 0.3288, "step": 3493 }, { "epoch": 2.531272639459068, "grad_norm": 0.2760918438434601, "learning_rate": 7.235281594444671e-07, "loss": 0.3476, "step": 3494 }, { "epoch": 2.5319971021492393, "grad_norm": 0.30938875675201416, "learning_rate": 7.213453100294821e-07, "loss": 0.3763, "step": 3495 }, { "epoch": 2.532721564839411, "grad_norm": 0.28903600573539734, "learning_rate": 7.191655023486682e-07, "loss": 0.3465, "step": 3496 }, { "epoch": 2.5334460275295823, "grad_norm": 0.27733442187309265, "learning_rate": 7.169887379516682e-07, "loss": 0.361, "step": 3497 }, { "epoch": 2.5341704902197537, "grad_norm": 0.29333463311195374, "learning_rate": 7.148150183859648e-07, "loss": 0.3805, "step": 3498 }, { "epoch": 2.534894952909925, "grad_norm": 0.30383560061454773, "learning_rate": 7.126443451968762e-07, "loss": 0.3951, "step": 3499 }, { "epoch": 2.5356194156000966, "grad_norm": 0.30972516536712646, "learning_rate": 7.104767199275509e-07, "loss": 0.3838, "step": 3500 }, { "epoch": 2.536343878290268, "grad_norm": 0.30994054675102234, "learning_rate": 7.083121441189739e-07, "loss": 0.4014, "step": 3501 }, { "epoch": 2.5370683409804395, "grad_norm": 0.29993534088134766, "learning_rate": 7.061506193099643e-07, "loss": 0.3768, "step": 3502 }, { "epoch": 2.537792803670611, "grad_norm": 0.2877250909805298, "learning_rate": 7.039921470371657e-07, "loss": 0.3698, "step": 3503 }, { "epoch": 2.5385172663607825, "grad_norm": 0.2811706066131592, "learning_rate": 7.018367288350575e-07, "loss": 0.3389, "step": 3504 }, { "epoch": 2.539241729050954, "grad_norm": 0.2843785881996155, "learning_rate": 6.996843662359465e-07, "loss": 0.355, "step": 3505 }, { "epoch": 2.5399661917411254, "grad_norm": 0.2817438840866089, "learning_rate": 6.975350607699649e-07, "loss": 0.363, "step": 3506 }, { "epoch": 2.540690654431297, "grad_norm": 0.27770328521728516, "learning_rate": 6.953888139650744e-07, "loss": 0.3281, "step": 3507 }, { "epoch": 2.5414151171214683, "grad_norm": 0.289455771446228, "learning_rate": 6.932456273470622e-07, "loss": 0.4022, "step": 3508 }, { "epoch": 2.5421395798116397, "grad_norm": 0.2868770360946655, "learning_rate": 6.911055024395363e-07, "loss": 0.3632, "step": 3509 }, { "epoch": 2.542864042501811, "grad_norm": 0.28977784514427185, "learning_rate": 6.889684407639324e-07, "loss": 0.4142, "step": 3510 }, { "epoch": 2.5435885051919827, "grad_norm": 0.2786150276660919, "learning_rate": 6.868344438395086e-07, "loss": 0.3536, "step": 3511 }, { "epoch": 2.544312967882154, "grad_norm": 0.27349141240119934, "learning_rate": 6.84703513183339e-07, "loss": 0.3631, "step": 3512 }, { "epoch": 2.5450374305723256, "grad_norm": 0.2633858323097229, "learning_rate": 6.825756503103231e-07, "loss": 0.3132, "step": 3513 }, { "epoch": 2.545761893262497, "grad_norm": 0.3174283504486084, "learning_rate": 6.80450856733178e-07, "loss": 0.3843, "step": 3514 }, { "epoch": 2.5464863559526685, "grad_norm": 0.2964191436767578, "learning_rate": 6.783291339624399e-07, "loss": 0.3627, "step": 3515 }, { "epoch": 2.54721081864284, "grad_norm": 0.2849510908126831, "learning_rate": 6.762104835064576e-07, "loss": 0.3571, "step": 3516 }, { "epoch": 2.5479352813330114, "grad_norm": 0.2991173267364502, "learning_rate": 6.740949068714009e-07, "loss": 0.412, "step": 3517 }, { "epoch": 2.548659744023183, "grad_norm": 0.28679949045181274, "learning_rate": 6.719824055612523e-07, "loss": 0.3735, "step": 3518 }, { "epoch": 2.5493842067133543, "grad_norm": 0.2897375524044037, "learning_rate": 6.698729810778065e-07, "loss": 0.3588, "step": 3519 }, { "epoch": 2.5501086694035258, "grad_norm": 0.28985628485679626, "learning_rate": 6.677666349206735e-07, "loss": 0.384, "step": 3520 }, { "epoch": 2.550833132093697, "grad_norm": 0.304585337638855, "learning_rate": 6.656633685872749e-07, "loss": 0.3831, "step": 3521 }, { "epoch": 2.5515575947838687, "grad_norm": 0.29445523023605347, "learning_rate": 6.63563183572839e-07, "loss": 0.3592, "step": 3522 }, { "epoch": 2.55228205747404, "grad_norm": 0.29494708776474, "learning_rate": 6.614660813704076e-07, "loss": 0.4199, "step": 3523 }, { "epoch": 2.5530065201642116, "grad_norm": 0.2841244041919708, "learning_rate": 6.593720634708312e-07, "loss": 0.3495, "step": 3524 }, { "epoch": 2.553730982854383, "grad_norm": 0.28597238659858704, "learning_rate": 6.57281131362763e-07, "loss": 0.346, "step": 3525 }, { "epoch": 2.5544554455445545, "grad_norm": 0.2979830503463745, "learning_rate": 6.551932865326666e-07, "loss": 0.3773, "step": 3526 }, { "epoch": 2.555179908234726, "grad_norm": 0.28922808170318604, "learning_rate": 6.531085304648116e-07, "loss": 0.3914, "step": 3527 }, { "epoch": 2.5559043709248974, "grad_norm": 0.2972557544708252, "learning_rate": 6.510268646412665e-07, "loss": 0.3739, "step": 3528 }, { "epoch": 2.556628833615069, "grad_norm": 0.2713003158569336, "learning_rate": 6.489482905419087e-07, "loss": 0.3767, "step": 3529 }, { "epoch": 2.5573532963052403, "grad_norm": 0.28829383850097656, "learning_rate": 6.468728096444154e-07, "loss": 0.345, "step": 3530 }, { "epoch": 2.558077758995412, "grad_norm": 0.2974450886249542, "learning_rate": 6.448004234242627e-07, "loss": 0.3514, "step": 3531 }, { "epoch": 2.5588022216855832, "grad_norm": 0.2936441898345947, "learning_rate": 6.427311333547298e-07, "loss": 0.3345, "step": 3532 }, { "epoch": 2.5595266843757547, "grad_norm": 0.3029550313949585, "learning_rate": 6.406649409068949e-07, "loss": 0.3571, "step": 3533 }, { "epoch": 2.560251147065926, "grad_norm": 0.3135346472263336, "learning_rate": 6.386018475496303e-07, "loss": 0.3578, "step": 3534 }, { "epoch": 2.5609756097560976, "grad_norm": 0.3105078637599945, "learning_rate": 6.365418547496099e-07, "loss": 0.3656, "step": 3535 }, { "epoch": 2.561700072446269, "grad_norm": 0.32025811076164246, "learning_rate": 6.344849639712997e-07, "loss": 0.4487, "step": 3536 }, { "epoch": 2.5624245351364405, "grad_norm": 0.25519663095474243, "learning_rate": 6.324311766769631e-07, "loss": 0.3152, "step": 3537 }, { "epoch": 2.563148997826612, "grad_norm": 0.27203303575515747, "learning_rate": 6.303804943266539e-07, "loss": 0.3507, "step": 3538 }, { "epoch": 2.5638734605167834, "grad_norm": 0.2985607087612152, "learning_rate": 6.283329183782222e-07, "loss": 0.3962, "step": 3539 }, { "epoch": 2.564597923206955, "grad_norm": 0.2864999771118164, "learning_rate": 6.26288450287309e-07, "loss": 0.3725, "step": 3540 }, { "epoch": 2.5653223858971264, "grad_norm": 0.2924043834209442, "learning_rate": 6.242470915073434e-07, "loss": 0.3897, "step": 3541 }, { "epoch": 2.566046848587298, "grad_norm": 0.2874194085597992, "learning_rate": 6.222088434895462e-07, "loss": 0.3831, "step": 3542 }, { "epoch": 2.5667713112774693, "grad_norm": 0.30988410115242004, "learning_rate": 6.201737076829267e-07, "loss": 0.3716, "step": 3543 }, { "epoch": 2.5674957739676407, "grad_norm": 0.286399781703949, "learning_rate": 6.18141685534282e-07, "loss": 0.3683, "step": 3544 }, { "epoch": 2.568220236657812, "grad_norm": 0.2699064314365387, "learning_rate": 6.161127784881926e-07, "loss": 0.3486, "step": 3545 }, { "epoch": 2.5689446993479836, "grad_norm": 0.2621704638004303, "learning_rate": 6.140869879870287e-07, "loss": 0.3067, "step": 3546 }, { "epoch": 2.569669162038155, "grad_norm": 0.3219786286354065, "learning_rate": 6.120643154709438e-07, "loss": 0.4074, "step": 3547 }, { "epoch": 2.5703936247283266, "grad_norm": 0.29883134365081787, "learning_rate": 6.100447623778716e-07, "loss": 0.3652, "step": 3548 }, { "epoch": 2.571118087418498, "grad_norm": 0.2791089117527008, "learning_rate": 6.08028330143532e-07, "loss": 0.3565, "step": 3549 }, { "epoch": 2.5718425501086695, "grad_norm": 0.27891531586647034, "learning_rate": 6.060150202014253e-07, "loss": 0.3514, "step": 3550 }, { "epoch": 2.572567012798841, "grad_norm": 0.2752542793750763, "learning_rate": 6.040048339828302e-07, "loss": 0.3396, "step": 3551 }, { "epoch": 2.5732914754890124, "grad_norm": 0.30373436212539673, "learning_rate": 6.01997772916807e-07, "loss": 0.4083, "step": 3552 }, { "epoch": 2.574015938179184, "grad_norm": 0.3037305474281311, "learning_rate": 5.99993838430194e-07, "loss": 0.421, "step": 3553 }, { "epoch": 2.5747404008693553, "grad_norm": 0.277713418006897, "learning_rate": 5.979930319476051e-07, "loss": 0.3407, "step": 3554 }, { "epoch": 2.5754648635595268, "grad_norm": 0.29404085874557495, "learning_rate": 5.959953548914327e-07, "loss": 0.3501, "step": 3555 }, { "epoch": 2.576189326249698, "grad_norm": 0.3017187714576721, "learning_rate": 5.940008086818439e-07, "loss": 0.3836, "step": 3556 }, { "epoch": 2.5769137889398697, "grad_norm": 0.261210560798645, "learning_rate": 5.920093947367777e-07, "loss": 0.3447, "step": 3557 }, { "epoch": 2.577638251630041, "grad_norm": 0.26735714077949524, "learning_rate": 5.900211144719492e-07, "loss": 0.3534, "step": 3558 }, { "epoch": 2.5783627143202126, "grad_norm": 0.3191494643688202, "learning_rate": 5.880359693008453e-07, "loss": 0.3864, "step": 3559 }, { "epoch": 2.579087177010384, "grad_norm": 0.28970304131507874, "learning_rate": 5.860539606347226e-07, "loss": 0.3562, "step": 3560 }, { "epoch": 2.5798116397005555, "grad_norm": 0.2906232476234436, "learning_rate": 5.840750898826097e-07, "loss": 0.3672, "step": 3561 }, { "epoch": 2.580536102390727, "grad_norm": 0.28489363193511963, "learning_rate": 5.82099358451304e-07, "loss": 0.3662, "step": 3562 }, { "epoch": 2.5812605650808984, "grad_norm": 0.3263349235057831, "learning_rate": 5.801267677453693e-07, "loss": 0.3727, "step": 3563 }, { "epoch": 2.58198502777107, "grad_norm": 0.2992188632488251, "learning_rate": 5.781573191671386e-07, "loss": 0.3749, "step": 3564 }, { "epoch": 2.5827094904612413, "grad_norm": 0.275596022605896, "learning_rate": 5.76191014116711e-07, "loss": 0.3569, "step": 3565 }, { "epoch": 2.5834339531514128, "grad_norm": 0.27684497833251953, "learning_rate": 5.742278539919516e-07, "loss": 0.332, "step": 3566 }, { "epoch": 2.5841584158415842, "grad_norm": 0.27754533290863037, "learning_rate": 5.722678401884868e-07, "loss": 0.3596, "step": 3567 }, { "epoch": 2.5848828785317557, "grad_norm": 0.29945242404937744, "learning_rate": 5.703109740997081e-07, "loss": 0.3679, "step": 3568 }, { "epoch": 2.585607341221927, "grad_norm": 0.29707658290863037, "learning_rate": 5.683572571167711e-07, "loss": 0.375, "step": 3569 }, { "epoch": 2.5863318039120986, "grad_norm": 0.27551326155662537, "learning_rate": 5.664066906285892e-07, "loss": 0.3498, "step": 3570 }, { "epoch": 2.58705626660227, "grad_norm": 0.28085213899612427, "learning_rate": 5.644592760218376e-07, "loss": 0.3697, "step": 3571 }, { "epoch": 2.5877807292924415, "grad_norm": 0.27821779251098633, "learning_rate": 5.625150146809522e-07, "loss": 0.3526, "step": 3572 }, { "epoch": 2.588505191982613, "grad_norm": 0.2820848524570465, "learning_rate": 5.60573907988124e-07, "loss": 0.3692, "step": 3573 }, { "epoch": 2.5892296546727844, "grad_norm": 0.2795170545578003, "learning_rate": 5.586359573233047e-07, "loss": 0.3733, "step": 3574 }, { "epoch": 2.589954117362956, "grad_norm": 0.29591649770736694, "learning_rate": 5.567011640642012e-07, "loss": 0.3908, "step": 3575 }, { "epoch": 2.5906785800531273, "grad_norm": 0.25783392786979675, "learning_rate": 5.547695295862742e-07, "loss": 0.3603, "step": 3576 }, { "epoch": 2.591403042743299, "grad_norm": 0.28710246086120605, "learning_rate": 5.5284105526274e-07, "loss": 0.4082, "step": 3577 }, { "epoch": 2.5921275054334703, "grad_norm": 0.2887401878833771, "learning_rate": 5.509157424645706e-07, "loss": 0.3456, "step": 3578 }, { "epoch": 2.5928519681236417, "grad_norm": 0.3128547668457031, "learning_rate": 5.48993592560485e-07, "loss": 0.4115, "step": 3579 }, { "epoch": 2.593576430813813, "grad_norm": 0.2798478305339813, "learning_rate": 5.470746069169591e-07, "loss": 0.3505, "step": 3580 }, { "epoch": 2.5943008935039846, "grad_norm": 0.294280081987381, "learning_rate": 5.451587868982172e-07, "loss": 0.3631, "step": 3581 }, { "epoch": 2.595025356194156, "grad_norm": 0.29253309965133667, "learning_rate": 5.43246133866231e-07, "loss": 0.3916, "step": 3582 }, { "epoch": 2.5957498188843275, "grad_norm": 0.27518585324287415, "learning_rate": 5.413366491807243e-07, "loss": 0.3472, "step": 3583 }, { "epoch": 2.596474281574499, "grad_norm": 0.3103296458721161, "learning_rate": 5.39430334199168e-07, "loss": 0.3722, "step": 3584 }, { "epoch": 2.5971987442646705, "grad_norm": 0.28795045614242554, "learning_rate": 5.37527190276776e-07, "loss": 0.3383, "step": 3585 }, { "epoch": 2.597923206954842, "grad_norm": 0.31203344464302063, "learning_rate": 5.356272187665118e-07, "loss": 0.4213, "step": 3586 }, { "epoch": 2.5986476696450134, "grad_norm": 0.2828258275985718, "learning_rate": 5.337304210190825e-07, "loss": 0.3406, "step": 3587 }, { "epoch": 2.599372132335185, "grad_norm": 0.29297396540641785, "learning_rate": 5.318367983829393e-07, "loss": 0.3545, "step": 3588 }, { "epoch": 2.6000965950253563, "grad_norm": 0.3029537498950958, "learning_rate": 5.299463522042736e-07, "loss": 0.3512, "step": 3589 }, { "epoch": 2.6008210577155277, "grad_norm": 0.28495660424232483, "learning_rate": 5.280590838270222e-07, "loss": 0.3784, "step": 3590 }, { "epoch": 2.601545520405699, "grad_norm": 0.27691933512687683, "learning_rate": 5.261749945928613e-07, "loss": 0.363, "step": 3591 }, { "epoch": 2.6022699830958707, "grad_norm": 0.3037094473838806, "learning_rate": 5.242940858412054e-07, "loss": 0.3615, "step": 3592 }, { "epoch": 2.602994445786042, "grad_norm": 0.27523931860923767, "learning_rate": 5.224163589092107e-07, "loss": 0.3615, "step": 3593 }, { "epoch": 2.6037189084762136, "grad_norm": 0.27699053287506104, "learning_rate": 5.205418151317709e-07, "loss": 0.371, "step": 3594 }, { "epoch": 2.604443371166385, "grad_norm": 0.27698901295661926, "learning_rate": 5.18670455841514e-07, "loss": 0.3604, "step": 3595 }, { "epoch": 2.6051678338565565, "grad_norm": 0.2686791718006134, "learning_rate": 5.168022823688073e-07, "loss": 0.3616, "step": 3596 }, { "epoch": 2.605892296546728, "grad_norm": 0.29746517539024353, "learning_rate": 5.149372960417537e-07, "loss": 0.3871, "step": 3597 }, { "epoch": 2.6066167592368994, "grad_norm": 0.26310038566589355, "learning_rate": 5.130754981861857e-07, "loss": 0.3441, "step": 3598 }, { "epoch": 2.607341221927071, "grad_norm": 0.27949628233909607, "learning_rate": 5.112168901256737e-07, "loss": 0.3489, "step": 3599 }, { "epoch": 2.6080656846172423, "grad_norm": 0.29512953758239746, "learning_rate": 5.0936147318152e-07, "loss": 0.3872, "step": 3600 }, { "epoch": 2.6087901473074138, "grad_norm": 0.2824815511703491, "learning_rate": 5.075092486727551e-07, "loss": 0.3419, "step": 3601 }, { "epoch": 2.609514609997585, "grad_norm": 0.2600335478782654, "learning_rate": 5.05660217916143e-07, "loss": 0.3128, "step": 3602 }, { "epoch": 2.6102390726877567, "grad_norm": 0.28589653968811035, "learning_rate": 5.038143822261776e-07, "loss": 0.4012, "step": 3603 }, { "epoch": 2.610963535377928, "grad_norm": 0.28860846161842346, "learning_rate": 5.019717429150783e-07, "loss": 0.3778, "step": 3604 }, { "epoch": 2.6116879980680996, "grad_norm": 0.2815190553665161, "learning_rate": 5.001323012927945e-07, "loss": 0.3567, "step": 3605 }, { "epoch": 2.612412460758271, "grad_norm": 0.27341583371162415, "learning_rate": 4.982960586670027e-07, "loss": 0.3139, "step": 3606 }, { "epoch": 2.6131369234484425, "grad_norm": 0.3132252097129822, "learning_rate": 4.964630163431028e-07, "loss": 0.3913, "step": 3607 }, { "epoch": 2.613861386138614, "grad_norm": 0.2753840684890747, "learning_rate": 4.946331756242223e-07, "loss": 0.3661, "step": 3608 }, { "epoch": 2.6145858488287854, "grad_norm": 0.27151933312416077, "learning_rate": 4.928065378112107e-07, "loss": 0.3531, "step": 3609 }, { "epoch": 2.615310311518957, "grad_norm": 0.28102433681488037, "learning_rate": 4.909831042026425e-07, "loss": 0.3754, "step": 3610 }, { "epoch": 2.6160347742091283, "grad_norm": 0.26311078667640686, "learning_rate": 4.891628760948114e-07, "loss": 0.3598, "step": 3611 }, { "epoch": 2.6167592368993, "grad_norm": 0.3003917932510376, "learning_rate": 4.87345854781735e-07, "loss": 0.4098, "step": 3612 }, { "epoch": 2.6174836995894712, "grad_norm": 0.28238654136657715, "learning_rate": 4.855320415551506e-07, "loss": 0.3664, "step": 3613 }, { "epoch": 2.6182081622796427, "grad_norm": 0.27002575993537903, "learning_rate": 4.83721437704513e-07, "loss": 0.3935, "step": 3614 }, { "epoch": 2.618932624969814, "grad_norm": 0.26407715678215027, "learning_rate": 4.819140445169973e-07, "loss": 0.3532, "step": 3615 }, { "epoch": 2.6196570876599856, "grad_norm": 0.2959456145763397, "learning_rate": 4.80109863277497e-07, "loss": 0.3945, "step": 3616 }, { "epoch": 2.620381550350157, "grad_norm": 0.2876134216785431, "learning_rate": 4.783088952686182e-07, "loss": 0.3729, "step": 3617 }, { "epoch": 2.6211060130403285, "grad_norm": 0.27990031242370605, "learning_rate": 4.7651114177068694e-07, "loss": 0.3924, "step": 3618 }, { "epoch": 2.6218304757305, "grad_norm": 0.26414015889167786, "learning_rate": 4.747166040617435e-07, "loss": 0.3377, "step": 3619 }, { "epoch": 2.6225549384206714, "grad_norm": 0.2859136760234833, "learning_rate": 4.72925283417538e-07, "loss": 0.3856, "step": 3620 }, { "epoch": 2.623279401110843, "grad_norm": 0.2893695831298828, "learning_rate": 4.7113718111153784e-07, "loss": 0.3682, "step": 3621 }, { "epoch": 2.6240038638010144, "grad_norm": 0.2866642475128174, "learning_rate": 4.6935229841492235e-07, "loss": 0.3654, "step": 3622 }, { "epoch": 2.624728326491186, "grad_norm": 0.29316943883895874, "learning_rate": 4.675706365965788e-07, "loss": 0.3894, "step": 3623 }, { "epoch": 2.6254527891813573, "grad_norm": 0.2579948604106903, "learning_rate": 4.6579219692310716e-07, "loss": 0.3189, "step": 3624 }, { "epoch": 2.6261772518715287, "grad_norm": 0.2865001857280731, "learning_rate": 4.640169806588174e-07, "loss": 0.3754, "step": 3625 }, { "epoch": 2.6269017145617, "grad_norm": 0.290314644575119, "learning_rate": 4.6224498906572434e-07, "loss": 0.3881, "step": 3626 }, { "epoch": 2.6276261772518716, "grad_norm": 0.27788206934928894, "learning_rate": 4.604762234035548e-07, "loss": 0.3487, "step": 3627 }, { "epoch": 2.628350639942043, "grad_norm": 0.29498425126075745, "learning_rate": 4.587106849297407e-07, "loss": 0.3807, "step": 3628 }, { "epoch": 2.6290751026322146, "grad_norm": 0.2800765335559845, "learning_rate": 4.5694837489941744e-07, "loss": 0.3684, "step": 3629 }, { "epoch": 2.629799565322386, "grad_norm": 0.2835700511932373, "learning_rate": 4.551892945654274e-07, "loss": 0.3594, "step": 3630 }, { "epoch": 2.6305240280125575, "grad_norm": 0.3037457764148712, "learning_rate": 4.5343344517831733e-07, "loss": 0.3831, "step": 3631 }, { "epoch": 2.631248490702729, "grad_norm": 0.2959515154361725, "learning_rate": 4.5168082798633664e-07, "loss": 0.3721, "step": 3632 }, { "epoch": 2.6319729533929004, "grad_norm": 0.26656731963157654, "learning_rate": 4.4993144423543525e-07, "loss": 0.3653, "step": 3633 }, { "epoch": 2.632697416083072, "grad_norm": 0.2781929671764374, "learning_rate": 4.481852951692672e-07, "loss": 0.3591, "step": 3634 }, { "epoch": 2.6334218787732433, "grad_norm": 0.2654513716697693, "learning_rate": 4.4644238202918577e-07, "loss": 0.3845, "step": 3635 }, { "epoch": 2.6341463414634148, "grad_norm": 0.2666546106338501, "learning_rate": 4.4470270605424195e-07, "loss": 0.3895, "step": 3636 }, { "epoch": 2.634870804153586, "grad_norm": 0.24951772391796112, "learning_rate": 4.4296626848118805e-07, "loss": 0.3363, "step": 3637 }, { "epoch": 2.6355952668437577, "grad_norm": 0.29104897379875183, "learning_rate": 4.412330705444734e-07, "loss": 0.4228, "step": 3638 }, { "epoch": 2.636319729533929, "grad_norm": 0.2656427323818207, "learning_rate": 4.3950311347624507e-07, "loss": 0.3406, "step": 3639 }, { "epoch": 2.6370441922241006, "grad_norm": 0.3300534784793854, "learning_rate": 4.377763985063427e-07, "loss": 0.4471, "step": 3640 }, { "epoch": 2.637768654914272, "grad_norm": 0.2829388380050659, "learning_rate": 4.360529268623048e-07, "loss": 0.3525, "step": 3641 }, { "epoch": 2.6384931176044435, "grad_norm": 0.2750054597854614, "learning_rate": 4.343326997693637e-07, "loss": 0.3592, "step": 3642 }, { "epoch": 2.639217580294615, "grad_norm": 0.2659370005130768, "learning_rate": 4.326157184504426e-07, "loss": 0.3702, "step": 3643 }, { "epoch": 2.6399420429847864, "grad_norm": 0.30319535732269287, "learning_rate": 4.309019841261597e-07, "loss": 0.3892, "step": 3644 }, { "epoch": 2.640666505674958, "grad_norm": 0.2785472571849823, "learning_rate": 4.2919149801482596e-07, "loss": 0.3905, "step": 3645 }, { "epoch": 2.6413909683651293, "grad_norm": 0.28239163756370544, "learning_rate": 4.2748426133243816e-07, "loss": 0.4001, "step": 3646 }, { "epoch": 2.642115431055301, "grad_norm": 0.2899135947227478, "learning_rate": 4.257802752926882e-07, "loss": 0.3206, "step": 3647 }, { "epoch": 2.6428398937454722, "grad_norm": 0.2975526750087738, "learning_rate": 4.240795411069554e-07, "loss": 0.3626, "step": 3648 }, { "epoch": 2.6435643564356437, "grad_norm": 0.2971833050251007, "learning_rate": 4.223820599843054e-07, "loss": 0.3779, "step": 3649 }, { "epoch": 2.644288819125815, "grad_norm": 0.2885863184928894, "learning_rate": 4.20687833131494e-07, "loss": 0.3624, "step": 3650 }, { "epoch": 2.6450132818159866, "grad_norm": 0.29493895173072815, "learning_rate": 4.189968617529627e-07, "loss": 0.3832, "step": 3651 }, { "epoch": 2.645737744506158, "grad_norm": 0.2978026568889618, "learning_rate": 4.173091470508367e-07, "loss": 0.4494, "step": 3652 }, { "epoch": 2.6464622071963295, "grad_norm": 0.28705495595932007, "learning_rate": 4.156246902249289e-07, "loss": 0.3327, "step": 3653 }, { "epoch": 2.647186669886501, "grad_norm": 0.28519535064697266, "learning_rate": 4.139434924727359e-07, "loss": 0.3687, "step": 3654 }, { "epoch": 2.6479111325766724, "grad_norm": 0.29114460945129395, "learning_rate": 4.122655549894333e-07, "loss": 0.3701, "step": 3655 }, { "epoch": 2.648635595266844, "grad_norm": 0.29143646359443665, "learning_rate": 4.105908789678842e-07, "loss": 0.3578, "step": 3656 }, { "epoch": 2.6493600579570153, "grad_norm": 0.2742461860179901, "learning_rate": 4.089194655986306e-07, "loss": 0.3458, "step": 3657 }, { "epoch": 2.650084520647187, "grad_norm": 0.29519328474998474, "learning_rate": 4.072513160698943e-07, "loss": 0.3815, "step": 3658 }, { "epoch": 2.6508089833373583, "grad_norm": 0.2845940887928009, "learning_rate": 4.0558643156757893e-07, "loss": 0.3634, "step": 3659 }, { "epoch": 2.6515334460275297, "grad_norm": 0.300129771232605, "learning_rate": 4.0392481327526545e-07, "loss": 0.3818, "step": 3660 }, { "epoch": 2.652257908717701, "grad_norm": 0.29794546961784363, "learning_rate": 4.0226646237421396e-07, "loss": 0.3457, "step": 3661 }, { "epoch": 2.6529823714078726, "grad_norm": 0.27046912908554077, "learning_rate": 4.0061138004335974e-07, "loss": 0.3255, "step": 3662 }, { "epoch": 2.653706834098044, "grad_norm": 0.29378944635391235, "learning_rate": 3.989595674593161e-07, "loss": 0.373, "step": 3663 }, { "epoch": 2.6544312967882155, "grad_norm": 0.2837303578853607, "learning_rate": 3.9731102579637303e-07, "loss": 0.3835, "step": 3664 }, { "epoch": 2.655155759478387, "grad_norm": 0.27111566066741943, "learning_rate": 3.956657562264915e-07, "loss": 0.3607, "step": 3665 }, { "epoch": 2.6558802221685585, "grad_norm": 0.26686909794807434, "learning_rate": 3.9402375991930975e-07, "loss": 0.3751, "step": 3666 }, { "epoch": 2.65660468485873, "grad_norm": 0.28772979974746704, "learning_rate": 3.923850380421379e-07, "loss": 0.3822, "step": 3667 }, { "epoch": 2.6573291475489014, "grad_norm": 0.2905943989753723, "learning_rate": 3.907495917599563e-07, "loss": 0.3409, "step": 3668 }, { "epoch": 2.658053610239073, "grad_norm": 0.28905045986175537, "learning_rate": 3.8911742223542047e-07, "loss": 0.3436, "step": 3669 }, { "epoch": 2.6587780729292443, "grad_norm": 0.2870100736618042, "learning_rate": 3.874885306288545e-07, "loss": 0.3605, "step": 3670 }, { "epoch": 2.6595025356194157, "grad_norm": 0.3106895387172699, "learning_rate": 3.858629180982504e-07, "loss": 0.3763, "step": 3671 }, { "epoch": 2.660226998309587, "grad_norm": 0.3263731598854065, "learning_rate": 3.8424058579927147e-07, "loss": 0.3749, "step": 3672 }, { "epoch": 2.6609514609997587, "grad_norm": 0.3003147840499878, "learning_rate": 3.826215348852491e-07, "loss": 0.3818, "step": 3673 }, { "epoch": 2.66167592368993, "grad_norm": 0.29168975353240967, "learning_rate": 3.8100576650718034e-07, "loss": 0.3719, "step": 3674 }, { "epoch": 2.6624003863801016, "grad_norm": 0.2896346151828766, "learning_rate": 3.793932818137297e-07, "loss": 0.3795, "step": 3675 }, { "epoch": 2.663124849070273, "grad_norm": 0.2575375437736511, "learning_rate": 3.7778408195122797e-07, "loss": 0.3538, "step": 3676 }, { "epoch": 2.6638493117604445, "grad_norm": 0.2807663083076477, "learning_rate": 3.7617816806366834e-07, "loss": 0.4124, "step": 3677 }, { "epoch": 2.664573774450616, "grad_norm": 0.2606498897075653, "learning_rate": 3.745755412927099e-07, "loss": 0.3361, "step": 3678 }, { "epoch": 2.6652982371407874, "grad_norm": 0.298868864774704, "learning_rate": 3.7297620277767623e-07, "loss": 0.3724, "step": 3679 }, { "epoch": 2.666022699830959, "grad_norm": 0.3190399408340454, "learning_rate": 3.7138015365554834e-07, "loss": 0.3919, "step": 3680 }, { "epoch": 2.6667471625211303, "grad_norm": 0.2787415087223053, "learning_rate": 3.697873950609737e-07, "loss": 0.358, "step": 3681 }, { "epoch": 2.6674716252113018, "grad_norm": 0.2853206396102905, "learning_rate": 3.6819792812625876e-07, "loss": 0.395, "step": 3682 }, { "epoch": 2.6681960879014732, "grad_norm": 0.26239070296287537, "learning_rate": 3.666117539813707e-07, "loss": 0.3492, "step": 3683 }, { "epoch": 2.6689205505916447, "grad_norm": 0.27691859006881714, "learning_rate": 3.6502887375393314e-07, "loss": 0.3741, "step": 3684 }, { "epoch": 2.669645013281816, "grad_norm": 0.2627592384815216, "learning_rate": 3.634492885692309e-07, "loss": 0.3453, "step": 3685 }, { "epoch": 2.6703694759719876, "grad_norm": 0.28872257471084595, "learning_rate": 3.6187299955020573e-07, "loss": 0.3622, "step": 3686 }, { "epoch": 2.671093938662159, "grad_norm": 0.3132060170173645, "learning_rate": 3.6030000781745513e-07, "loss": 0.3952, "step": 3687 }, { "epoch": 2.6718184013523305, "grad_norm": 0.28944891691207886, "learning_rate": 3.5873031448923237e-07, "loss": 0.3607, "step": 3688 }, { "epoch": 2.672542864042502, "grad_norm": 0.2963632345199585, "learning_rate": 3.571639206814487e-07, "loss": 0.368, "step": 3689 }, { "epoch": 2.6732673267326734, "grad_norm": 0.2936262786388397, "learning_rate": 3.55600827507665e-07, "loss": 0.3684, "step": 3690 }, { "epoch": 2.673991789422845, "grad_norm": 0.2824256420135498, "learning_rate": 3.5404103607909965e-07, "loss": 0.3689, "step": 3691 }, { "epoch": 2.6747162521130163, "grad_norm": 0.26589497923851013, "learning_rate": 3.5248454750462345e-07, "loss": 0.3464, "step": 3692 }, { "epoch": 2.675440714803188, "grad_norm": 0.30442366003990173, "learning_rate": 3.5093136289075567e-07, "loss": 0.3824, "step": 3693 }, { "epoch": 2.6761651774933592, "grad_norm": 0.267736554145813, "learning_rate": 3.4938148334167144e-07, "loss": 0.3591, "step": 3694 }, { "epoch": 2.6768896401835307, "grad_norm": 0.26626303791999817, "learning_rate": 3.478349099591932e-07, "loss": 0.3711, "step": 3695 }, { "epoch": 2.677614102873702, "grad_norm": 0.295052707195282, "learning_rate": 3.462916438427938e-07, "loss": 0.3781, "step": 3696 }, { "epoch": 2.6783385655638736, "grad_norm": 0.2699654698371887, "learning_rate": 3.4475168608959495e-07, "loss": 0.3501, "step": 3697 }, { "epoch": 2.679063028254045, "grad_norm": 0.2695721983909607, "learning_rate": 3.432150377943677e-07, "loss": 0.3443, "step": 3698 }, { "epoch": 2.6797874909442165, "grad_norm": 0.2966639995574951, "learning_rate": 3.416817000495271e-07, "loss": 0.3722, "step": 3699 }, { "epoch": 2.680511953634388, "grad_norm": 0.2773069441318512, "learning_rate": 3.401516739451383e-07, "loss": 0.345, "step": 3700 }, { "epoch": 2.6812364163245594, "grad_norm": 0.2948189675807953, "learning_rate": 3.386249605689107e-07, "loss": 0.3705, "step": 3701 }, { "epoch": 2.681960879014731, "grad_norm": 0.29304739832878113, "learning_rate": 3.371015610061973e-07, "loss": 0.3821, "step": 3702 }, { "epoch": 2.6826853417049024, "grad_norm": 0.27547454833984375, "learning_rate": 3.355814763399973e-07, "loss": 0.3521, "step": 3703 }, { "epoch": 2.683409804395074, "grad_norm": 0.28758421540260315, "learning_rate": 3.340647076509529e-07, "loss": 0.3655, "step": 3704 }, { "epoch": 2.6841342670852453, "grad_norm": 0.2591378390789032, "learning_rate": 3.325512560173494e-07, "loss": 0.3482, "step": 3705 }, { "epoch": 2.6848587297754167, "grad_norm": 0.29933595657348633, "learning_rate": 3.310411225151111e-07, "loss": 0.4001, "step": 3706 }, { "epoch": 2.685583192465588, "grad_norm": 0.32196545600891113, "learning_rate": 3.295343082178065e-07, "loss": 0.4053, "step": 3707 }, { "epoch": 2.6863076551557596, "grad_norm": 0.29638779163360596, "learning_rate": 3.2803081419664483e-07, "loss": 0.37, "step": 3708 }, { "epoch": 2.687032117845931, "grad_norm": 0.2682141959667206, "learning_rate": 3.265306415204711e-07, "loss": 0.3495, "step": 3709 }, { "epoch": 2.687756580536102, "grad_norm": 0.29221951961517334, "learning_rate": 3.2503379125577275e-07, "loss": 0.3809, "step": 3710 }, { "epoch": 2.6884810432262736, "grad_norm": 0.2713361978530884, "learning_rate": 3.235402644666752e-07, "loss": 0.3323, "step": 3711 }, { "epoch": 2.689205505916445, "grad_norm": 0.29719316959381104, "learning_rate": 3.2205006221493807e-07, "loss": 0.4218, "step": 3712 }, { "epoch": 2.6899299686066165, "grad_norm": 0.27486422657966614, "learning_rate": 3.205631855599606e-07, "loss": 0.3405, "step": 3713 }, { "epoch": 2.690654431296788, "grad_norm": 0.28899988532066345, "learning_rate": 3.1907963555877776e-07, "loss": 0.3922, "step": 3714 }, { "epoch": 2.6913788939869594, "grad_norm": 0.2650248408317566, "learning_rate": 3.1759941326605646e-07, "loss": 0.3642, "step": 3715 }, { "epoch": 2.692103356677131, "grad_norm": 0.2770833969116211, "learning_rate": 3.1612251973410115e-07, "loss": 0.3647, "step": 3716 }, { "epoch": 2.6928278193673023, "grad_norm": 0.28128254413604736, "learning_rate": 3.146489560128496e-07, "loss": 0.3954, "step": 3717 }, { "epoch": 2.6935522820574738, "grad_norm": 0.2683310806751251, "learning_rate": 3.1317872314987007e-07, "loss": 0.3404, "step": 3718 }, { "epoch": 2.6942767447476452, "grad_norm": 0.2699487805366516, "learning_rate": 3.1171182219036533e-07, "loss": 0.3457, "step": 3719 }, { "epoch": 2.6950012074378167, "grad_norm": 0.293244868516922, "learning_rate": 3.102482541771684e-07, "loss": 0.3998, "step": 3720 }, { "epoch": 2.695725670127988, "grad_norm": 0.2927028238773346, "learning_rate": 3.0878802015074314e-07, "loss": 0.3902, "step": 3721 }, { "epoch": 2.6964501328181596, "grad_norm": 0.27658429741859436, "learning_rate": 3.0733112114918196e-07, "loss": 0.3683, "step": 3722 }, { "epoch": 2.697174595508331, "grad_norm": 0.26502615213394165, "learning_rate": 3.0587755820820797e-07, "loss": 0.3509, "step": 3723 }, { "epoch": 2.6978990581985025, "grad_norm": 0.2729715406894684, "learning_rate": 3.0442733236117293e-07, "loss": 0.3489, "step": 3724 }, { "epoch": 2.698623520888674, "grad_norm": 0.2819748520851135, "learning_rate": 3.029804446390544e-07, "loss": 0.3634, "step": 3725 }, { "epoch": 2.6993479835788454, "grad_norm": 0.30151090025901794, "learning_rate": 3.015368960704584e-07, "loss": 0.3798, "step": 3726 }, { "epoch": 2.700072446269017, "grad_norm": 0.30034637451171875, "learning_rate": 3.0009668768161746e-07, "loss": 0.364, "step": 3727 }, { "epoch": 2.7007969089591883, "grad_norm": 0.31196561455726624, "learning_rate": 2.9865982049638697e-07, "loss": 0.3724, "step": 3728 }, { "epoch": 2.70152137164936, "grad_norm": 0.28815674781799316, "learning_rate": 2.972262955362498e-07, "loss": 0.358, "step": 3729 }, { "epoch": 2.7022458343395312, "grad_norm": 0.2960498631000519, "learning_rate": 2.9579611382031247e-07, "loss": 0.3224, "step": 3730 }, { "epoch": 2.7029702970297027, "grad_norm": 0.2914646863937378, "learning_rate": 2.943692763653022e-07, "loss": 0.3642, "step": 3731 }, { "epoch": 2.703694759719874, "grad_norm": 0.30173635482788086, "learning_rate": 2.929457841855715e-07, "loss": 0.4018, "step": 3732 }, { "epoch": 2.7044192224100456, "grad_norm": 0.2670757472515106, "learning_rate": 2.9152563829309423e-07, "loss": 0.327, "step": 3733 }, { "epoch": 2.705143685100217, "grad_norm": 0.2822189927101135, "learning_rate": 2.9010883969746495e-07, "loss": 0.366, "step": 3734 }, { "epoch": 2.7058681477903885, "grad_norm": 0.2859998941421509, "learning_rate": 2.88695389405898e-07, "loss": 0.3993, "step": 3735 }, { "epoch": 2.70659261048056, "grad_norm": 0.2663450241088867, "learning_rate": 2.8728528842322746e-07, "loss": 0.2984, "step": 3736 }, { "epoch": 2.7073170731707314, "grad_norm": 0.296045184135437, "learning_rate": 2.8587853775190856e-07, "loss": 0.4015, "step": 3737 }, { "epoch": 2.708041535860903, "grad_norm": 0.28452950716018677, "learning_rate": 2.844751383920108e-07, "loss": 0.3183, "step": 3738 }, { "epoch": 2.7087659985510744, "grad_norm": 0.2670045793056488, "learning_rate": 2.83075091341225e-07, "loss": 0.3251, "step": 3739 }, { "epoch": 2.709490461241246, "grad_norm": 0.2941342294216156, "learning_rate": 2.816783975948573e-07, "loss": 0.4321, "step": 3740 }, { "epoch": 2.7102149239314173, "grad_norm": 0.2593350410461426, "learning_rate": 2.802850581458283e-07, "loss": 0.3436, "step": 3741 }, { "epoch": 2.7109393866215887, "grad_norm": 0.2769438922405243, "learning_rate": 2.7889507398467687e-07, "loss": 0.3721, "step": 3742 }, { "epoch": 2.71166384931176, "grad_norm": 0.30830347537994385, "learning_rate": 2.7750844609955583e-07, "loss": 0.4053, "step": 3743 }, { "epoch": 2.7123883120019316, "grad_norm": 0.2816218435764313, "learning_rate": 2.7612517547622955e-07, "loss": 0.3392, "step": 3744 }, { "epoch": 2.713112774692103, "grad_norm": 0.27333179116249084, "learning_rate": 2.74745263098079e-07, "loss": 0.3496, "step": 3745 }, { "epoch": 2.7138372373822746, "grad_norm": 0.2782779037952423, "learning_rate": 2.733687099460963e-07, "loss": 0.3991, "step": 3746 }, { "epoch": 2.714561700072446, "grad_norm": 0.29374614357948303, "learning_rate": 2.719955169988847e-07, "loss": 0.3738, "step": 3747 }, { "epoch": 2.7152861627626175, "grad_norm": 0.28975263237953186, "learning_rate": 2.7062568523266e-07, "loss": 0.3472, "step": 3748 }, { "epoch": 2.716010625452789, "grad_norm": 0.2831323742866516, "learning_rate": 2.6925921562124867e-07, "loss": 0.3711, "step": 3749 }, { "epoch": 2.7167350881429604, "grad_norm": 0.2874748408794403, "learning_rate": 2.6789610913608434e-07, "loss": 0.3614, "step": 3750 }, { "epoch": 2.717459550833132, "grad_norm": 0.279470294713974, "learning_rate": 2.6653636674621284e-07, "loss": 0.4027, "step": 3751 }, { "epoch": 2.7181840135233033, "grad_norm": 0.2850702106952667, "learning_rate": 2.651799894182877e-07, "loss": 0.3639, "step": 3752 }, { "epoch": 2.7189084762134748, "grad_norm": 0.29366302490234375, "learning_rate": 2.638269781165692e-07, "loss": 0.3627, "step": 3753 }, { "epoch": 2.719632938903646, "grad_norm": 0.2997506558895111, "learning_rate": 2.6247733380292515e-07, "loss": 0.379, "step": 3754 }, { "epoch": 2.7203574015938177, "grad_norm": 0.2877674400806427, "learning_rate": 2.6113105743682975e-07, "loss": 0.343, "step": 3755 }, { "epoch": 2.721081864283989, "grad_norm": 0.2882018983364105, "learning_rate": 2.597881499753646e-07, "loss": 0.3501, "step": 3756 }, { "epoch": 2.7218063269741606, "grad_norm": 0.289125919342041, "learning_rate": 2.584486123732122e-07, "loss": 0.3734, "step": 3757 }, { "epoch": 2.722530789664332, "grad_norm": 0.2948627173900604, "learning_rate": 2.5711244558266346e-07, "loss": 0.3793, "step": 3758 }, { "epoch": 2.7232552523545035, "grad_norm": 0.27591460943222046, "learning_rate": 2.5577965055361145e-07, "loss": 0.38, "step": 3759 }, { "epoch": 2.723979715044675, "grad_norm": 0.25741028785705566, "learning_rate": 2.544502282335515e-07, "loss": 0.351, "step": 3760 }, { "epoch": 2.7247041777348464, "grad_norm": 0.29105502367019653, "learning_rate": 2.5312417956758206e-07, "loss": 0.3748, "step": 3761 }, { "epoch": 2.725428640425018, "grad_norm": 0.2663559913635254, "learning_rate": 2.518015054984041e-07, "loss": 0.3306, "step": 3762 }, { "epoch": 2.7261531031151893, "grad_norm": 0.27904391288757324, "learning_rate": 2.5048220696631653e-07, "loss": 0.3886, "step": 3763 }, { "epoch": 2.726877565805361, "grad_norm": 0.27982258796691895, "learning_rate": 2.491662849092219e-07, "loss": 0.3551, "step": 3764 }, { "epoch": 2.7276020284955322, "grad_norm": 0.28173840045928955, "learning_rate": 2.478537402626219e-07, "loss": 0.3811, "step": 3765 }, { "epoch": 2.7283264911857037, "grad_norm": 0.28405922651290894, "learning_rate": 2.4654457395961395e-07, "loss": 0.3587, "step": 3766 }, { "epoch": 2.729050953875875, "grad_norm": 0.27722615003585815, "learning_rate": 2.452387869308981e-07, "loss": 0.3579, "step": 3767 }, { "epoch": 2.7297754165660466, "grad_norm": 0.28453952074050903, "learning_rate": 2.439363801047695e-07, "loss": 0.3933, "step": 3768 }, { "epoch": 2.730499879256218, "grad_norm": 0.28607380390167236, "learning_rate": 2.4263735440712087e-07, "loss": 0.3641, "step": 3769 }, { "epoch": 2.7312243419463895, "grad_norm": 0.2873665690422058, "learning_rate": 2.413417107614413e-07, "loss": 0.3696, "step": 3770 }, { "epoch": 2.731948804636561, "grad_norm": 0.28245192766189575, "learning_rate": 2.4004945008881617e-07, "loss": 0.357, "step": 3771 }, { "epoch": 2.7326732673267324, "grad_norm": 0.28510546684265137, "learning_rate": 2.3876057330792344e-07, "loss": 0.3558, "step": 3772 }, { "epoch": 2.733397730016904, "grad_norm": 0.300131231546402, "learning_rate": 2.3747508133503904e-07, "loss": 0.3932, "step": 3773 }, { "epoch": 2.7341221927070753, "grad_norm": 0.2723574638366699, "learning_rate": 2.361929750840297e-07, "loss": 0.3577, "step": 3774 }, { "epoch": 2.734846655397247, "grad_norm": 0.27498117089271545, "learning_rate": 2.3491425546635803e-07, "loss": 0.3476, "step": 3775 }, { "epoch": 2.7355711180874183, "grad_norm": 0.2729450762271881, "learning_rate": 2.336389233910752e-07, "loss": 0.3709, "step": 3776 }, { "epoch": 2.7362955807775897, "grad_norm": 0.2795395851135254, "learning_rate": 2.3236697976482703e-07, "loss": 0.4038, "step": 3777 }, { "epoch": 2.737020043467761, "grad_norm": 0.270018070936203, "learning_rate": 2.3109842549185025e-07, "loss": 0.3813, "step": 3778 }, { "epoch": 2.7377445061579326, "grad_norm": 0.28440919518470764, "learning_rate": 2.2983326147397066e-07, "loss": 0.381, "step": 3779 }, { "epoch": 2.738468968848104, "grad_norm": 0.2776559889316559, "learning_rate": 2.2857148861060552e-07, "loss": 0.3649, "step": 3780 }, { "epoch": 2.7391934315382755, "grad_norm": 0.29364508390426636, "learning_rate": 2.273131077987606e-07, "loss": 0.4039, "step": 3781 }, { "epoch": 2.739917894228447, "grad_norm": 0.27005085349082947, "learning_rate": 2.2605811993302916e-07, "loss": 0.3295, "step": 3782 }, { "epoch": 2.7406423569186185, "grad_norm": 0.28604358434677124, "learning_rate": 2.2480652590559371e-07, "loss": 0.3664, "step": 3783 }, { "epoch": 2.74136681960879, "grad_norm": 0.27795499563217163, "learning_rate": 2.2355832660622467e-07, "loss": 0.3971, "step": 3784 }, { "epoch": 2.7420912822989614, "grad_norm": 0.27806326746940613, "learning_rate": 2.2231352292227727e-07, "loss": 0.3414, "step": 3785 }, { "epoch": 2.742815744989133, "grad_norm": 0.29037514328956604, "learning_rate": 2.2107211573869357e-07, "loss": 0.3877, "step": 3786 }, { "epoch": 2.7435402076793043, "grad_norm": 0.28941628336906433, "learning_rate": 2.1983410593800213e-07, "loss": 0.3892, "step": 3787 }, { "epoch": 2.7442646703694757, "grad_norm": 0.27034792304039, "learning_rate": 2.1859949440031391e-07, "loss": 0.3384, "step": 3788 }, { "epoch": 2.744989133059647, "grad_norm": 0.28939899802207947, "learning_rate": 2.1736828200332628e-07, "loss": 0.3844, "step": 3789 }, { "epoch": 2.7457135957498187, "grad_norm": 0.26972144842147827, "learning_rate": 2.1614046962231962e-07, "loss": 0.3269, "step": 3790 }, { "epoch": 2.74643805843999, "grad_norm": 0.29902294278144836, "learning_rate": 2.1491605813015515e-07, "loss": 0.3921, "step": 3791 }, { "epoch": 2.7471625211301616, "grad_norm": 0.2785424590110779, "learning_rate": 2.136950483972794e-07, "loss": 0.3425, "step": 3792 }, { "epoch": 2.747886983820333, "grad_norm": 0.29011696577072144, "learning_rate": 2.124774412917191e-07, "loss": 0.4125, "step": 3793 }, { "epoch": 2.7486114465105045, "grad_norm": 0.29142725467681885, "learning_rate": 2.1126323767908187e-07, "loss": 0.36, "step": 3794 }, { "epoch": 2.749335909200676, "grad_norm": 0.28902289271354675, "learning_rate": 2.1005243842255552e-07, "loss": 0.3893, "step": 3795 }, { "epoch": 2.7500603718908474, "grad_norm": 0.2676951289176941, "learning_rate": 2.0884504438290875e-07, "loss": 0.3457, "step": 3796 }, { "epoch": 2.750784834581019, "grad_norm": 0.2805117964744568, "learning_rate": 2.0764105641848943e-07, "loss": 0.3682, "step": 3797 }, { "epoch": 2.7515092972711903, "grad_norm": 0.28168168663978577, "learning_rate": 2.0644047538522226e-07, "loss": 0.356, "step": 3798 }, { "epoch": 2.7522337599613618, "grad_norm": 0.3077790439128876, "learning_rate": 2.0524330213661126e-07, "loss": 0.3732, "step": 3799 }, { "epoch": 2.7529582226515332, "grad_norm": 0.2597523629665375, "learning_rate": 2.040495375237389e-07, "loss": 0.3361, "step": 3800 }, { "epoch": 2.7536826853417047, "grad_norm": 0.2703828811645508, "learning_rate": 2.0285918239526192e-07, "loss": 0.3641, "step": 3801 }, { "epoch": 2.754407148031876, "grad_norm": 0.27140915393829346, "learning_rate": 2.016722375974145e-07, "loss": 0.3331, "step": 3802 }, { "epoch": 2.7551316107220476, "grad_norm": 0.2985759377479553, "learning_rate": 2.0048870397400776e-07, "loss": 0.3778, "step": 3803 }, { "epoch": 2.755856073412219, "grad_norm": 0.27404239773750305, "learning_rate": 1.993085823664248e-07, "loss": 0.3513, "step": 3804 }, { "epoch": 2.7565805361023905, "grad_norm": 0.27265557646751404, "learning_rate": 1.9813187361362506e-07, "loss": 0.3639, "step": 3805 }, { "epoch": 2.757304998792562, "grad_norm": 0.27807897329330444, "learning_rate": 1.9695857855214273e-07, "loss": 0.3275, "step": 3806 }, { "epoch": 2.7580294614827334, "grad_norm": 0.29044443368911743, "learning_rate": 1.9578869801608168e-07, "loss": 0.3985, "step": 3807 }, { "epoch": 2.758753924172905, "grad_norm": 0.28575143218040466, "learning_rate": 1.9462223283712224e-07, "loss": 0.3679, "step": 3808 }, { "epoch": 2.7594783868630763, "grad_norm": 0.2778254449367523, "learning_rate": 1.934591838445149e-07, "loss": 0.3606, "step": 3809 }, { "epoch": 2.760202849553248, "grad_norm": 0.28172940015792847, "learning_rate": 1.9229955186508053e-07, "loss": 0.3596, "step": 3810 }, { "epoch": 2.7609273122434193, "grad_norm": 0.2872879207134247, "learning_rate": 1.9114333772321246e-07, "loss": 0.3688, "step": 3811 }, { "epoch": 2.7616517749335907, "grad_norm": 0.2738395929336548, "learning_rate": 1.8999054224087487e-07, "loss": 0.3497, "step": 3812 }, { "epoch": 2.762376237623762, "grad_norm": 0.29987213015556335, "learning_rate": 1.8884116623759884e-07, "loss": 0.3719, "step": 3813 }, { "epoch": 2.7631007003139336, "grad_norm": 0.271128386259079, "learning_rate": 1.876952105304869e-07, "loss": 0.3548, "step": 3814 }, { "epoch": 2.763825163004105, "grad_norm": 0.27806299924850464, "learning_rate": 1.86552675934209e-07, "loss": 0.3807, "step": 3815 }, { "epoch": 2.7645496256942765, "grad_norm": 0.28486087918281555, "learning_rate": 1.8541356326100436e-07, "loss": 0.3521, "step": 3816 }, { "epoch": 2.765274088384448, "grad_norm": 0.2880067527294159, "learning_rate": 1.8427787332067626e-07, "loss": 0.3848, "step": 3817 }, { "epoch": 2.7659985510746194, "grad_norm": 0.26736903190612793, "learning_rate": 1.8314560692059836e-07, "loss": 0.3747, "step": 3818 }, { "epoch": 2.766723013764791, "grad_norm": 0.2886166572570801, "learning_rate": 1.8201676486570952e-07, "loss": 0.3659, "step": 3819 }, { "epoch": 2.7674474764549624, "grad_norm": 0.2847660481929779, "learning_rate": 1.8089134795851282e-07, "loss": 0.3845, "step": 3820 }, { "epoch": 2.768171939145134, "grad_norm": 0.26540276408195496, "learning_rate": 1.7976935699907716e-07, "loss": 0.3264, "step": 3821 }, { "epoch": 2.7688964018353053, "grad_norm": 0.2774020731449127, "learning_rate": 1.7865079278503727e-07, "loss": 0.3669, "step": 3822 }, { "epoch": 2.7696208645254767, "grad_norm": 0.2767746150493622, "learning_rate": 1.7753565611158873e-07, "loss": 0.3526, "step": 3823 }, { "epoch": 2.770345327215648, "grad_norm": 0.27658194303512573, "learning_rate": 1.7642394777149353e-07, "loss": 0.3619, "step": 3824 }, { "epoch": 2.7710697899058196, "grad_norm": 0.28127598762512207, "learning_rate": 1.7531566855507442e-07, "loss": 0.36, "step": 3825 }, { "epoch": 2.771794252595991, "grad_norm": 0.2692055404186249, "learning_rate": 1.7421081925021843e-07, "loss": 0.3706, "step": 3826 }, { "epoch": 2.7725187152861626, "grad_norm": 0.29715919494628906, "learning_rate": 1.7310940064237114e-07, "loss": 0.3786, "step": 3827 }, { "epoch": 2.773243177976334, "grad_norm": 0.2881264388561249, "learning_rate": 1.7201141351454176e-07, "loss": 0.3574, "step": 3828 }, { "epoch": 2.7739676406665055, "grad_norm": 0.29261142015457153, "learning_rate": 1.7091685864729978e-07, "loss": 0.369, "step": 3829 }, { "epoch": 2.774692103356677, "grad_norm": 0.2799926698207855, "learning_rate": 1.698257368187728e-07, "loss": 0.3413, "step": 3830 }, { "epoch": 2.7754165660468484, "grad_norm": 0.2922535240650177, "learning_rate": 1.6873804880464972e-07, "loss": 0.3929, "step": 3831 }, { "epoch": 2.77614102873702, "grad_norm": 0.26956701278686523, "learning_rate": 1.6765379537817872e-07, "loss": 0.3486, "step": 3832 }, { "epoch": 2.7768654914271913, "grad_norm": 0.2907513678073883, "learning_rate": 1.6657297731016432e-07, "loss": 0.3487, "step": 3833 }, { "epoch": 2.7775899541173628, "grad_norm": 0.27224841713905334, "learning_rate": 1.6549559536896964e-07, "loss": 0.3752, "step": 3834 }, { "epoch": 2.778314416807534, "grad_norm": 0.25791871547698975, "learning_rate": 1.644216503205165e-07, "loss": 0.331, "step": 3835 }, { "epoch": 2.7790388794977057, "grad_norm": 0.2999523878097534, "learning_rate": 1.6335114292828025e-07, "loss": 0.3639, "step": 3836 }, { "epoch": 2.779763342187877, "grad_norm": 0.2943907380104065, "learning_rate": 1.6228407395329548e-07, "loss": 0.4096, "step": 3837 }, { "epoch": 2.7804878048780486, "grad_norm": 0.27451470494270325, "learning_rate": 1.612204441541515e-07, "loss": 0.3483, "step": 3838 }, { "epoch": 2.78121226756822, "grad_norm": 0.27289533615112305, "learning_rate": 1.6016025428699067e-07, "loss": 0.3369, "step": 3839 }, { "epoch": 2.7819367302583915, "grad_norm": 0.30736950039863586, "learning_rate": 1.5910350510551288e-07, "loss": 0.4003, "step": 3840 }, { "epoch": 2.782661192948563, "grad_norm": 0.26099127531051636, "learning_rate": 1.5805019736097105e-07, "loss": 0.3164, "step": 3841 }, { "epoch": 2.7833856556387344, "grad_norm": 0.2956882417201996, "learning_rate": 1.5700033180216956e-07, "loss": 0.4144, "step": 3842 }, { "epoch": 2.784110118328906, "grad_norm": 0.28580552339553833, "learning_rate": 1.559539091754686e-07, "loss": 0.3616, "step": 3843 }, { "epoch": 2.7848345810190773, "grad_norm": 0.2934645712375641, "learning_rate": 1.5491093022477866e-07, "loss": 0.3912, "step": 3844 }, { "epoch": 2.785559043709249, "grad_norm": 0.2866480350494385, "learning_rate": 1.538713956915633e-07, "loss": 0.364, "step": 3845 }, { "epoch": 2.7862835063994202, "grad_norm": 0.2810584306716919, "learning_rate": 1.5283530631483688e-07, "loss": 0.3936, "step": 3846 }, { "epoch": 2.7870079690895917, "grad_norm": 0.27872902154922485, "learning_rate": 1.5180266283116417e-07, "loss": 0.3387, "step": 3847 }, { "epoch": 2.787732431779763, "grad_norm": 0.272849977016449, "learning_rate": 1.5077346597466235e-07, "loss": 0.3581, "step": 3848 }, { "epoch": 2.7884568944699346, "grad_norm": 0.3083185851573944, "learning_rate": 1.4974771647699448e-07, "loss": 0.401, "step": 3849 }, { "epoch": 2.789181357160106, "grad_norm": 0.28317639231681824, "learning_rate": 1.487254150673767e-07, "loss": 0.3444, "step": 3850 }, { "epoch": 2.7899058198502775, "grad_norm": 0.2990070879459381, "learning_rate": 1.4770656247257264e-07, "loss": 0.3694, "step": 3851 }, { "epoch": 2.790630282540449, "grad_norm": 0.2833901643753052, "learning_rate": 1.4669115941689182e-07, "loss": 0.3343, "step": 3852 }, { "epoch": 2.7913547452306204, "grad_norm": 0.2628764510154724, "learning_rate": 1.4567920662219514e-07, "loss": 0.3452, "step": 3853 }, { "epoch": 2.792079207920792, "grad_norm": 0.2675885856151581, "learning_rate": 1.4467070480788824e-07, "loss": 0.3758, "step": 3854 }, { "epoch": 2.7928036706109634, "grad_norm": 0.27674204111099243, "learning_rate": 1.436656546909243e-07, "loss": 0.4031, "step": 3855 }, { "epoch": 2.793528133301135, "grad_norm": 0.29972121119499207, "learning_rate": 1.426640569858029e-07, "loss": 0.3965, "step": 3856 }, { "epoch": 2.7942525959913063, "grad_norm": 0.2849942445755005, "learning_rate": 1.4166591240456828e-07, "loss": 0.3701, "step": 3857 }, { "epoch": 2.7949770586814777, "grad_norm": 0.27336546778678894, "learning_rate": 1.406712216568107e-07, "loss": 0.3208, "step": 3858 }, { "epoch": 2.795701521371649, "grad_norm": 0.2838139235973358, "learning_rate": 1.3967998544966442e-07, "loss": 0.3793, "step": 3859 }, { "epoch": 2.7964259840618206, "grad_norm": 0.27628442645072937, "learning_rate": 1.386922044878086e-07, "loss": 0.4079, "step": 3860 }, { "epoch": 2.797150446751992, "grad_norm": 0.2781168222427368, "learning_rate": 1.3770787947346597e-07, "loss": 0.3436, "step": 3861 }, { "epoch": 2.7978749094421635, "grad_norm": 0.2691403329372406, "learning_rate": 1.3672701110640064e-07, "loss": 0.3747, "step": 3862 }, { "epoch": 2.798599372132335, "grad_norm": 0.28094005584716797, "learning_rate": 1.3574960008392258e-07, "loss": 0.339, "step": 3863 }, { "epoch": 2.7993238348225065, "grad_norm": 0.2982158660888672, "learning_rate": 1.3477564710088097e-07, "loss": 0.3614, "step": 3864 }, { "epoch": 2.800048297512678, "grad_norm": 0.30119508504867554, "learning_rate": 1.3380515284966811e-07, "loss": 0.3795, "step": 3865 }, { "epoch": 2.8007727602028494, "grad_norm": 0.27612215280532837, "learning_rate": 1.3283811802021705e-07, "loss": 0.3703, "step": 3866 }, { "epoch": 2.801497222893021, "grad_norm": 0.2803041934967041, "learning_rate": 1.3187454330000127e-07, "loss": 0.3835, "step": 3867 }, { "epoch": 2.8022216855831923, "grad_norm": 0.27699148654937744, "learning_rate": 1.3091442937403498e-07, "loss": 0.3763, "step": 3868 }, { "epoch": 2.8029461482733637, "grad_norm": 0.29443028569221497, "learning_rate": 1.2995777692487166e-07, "loss": 0.3681, "step": 3869 }, { "epoch": 2.803670610963535, "grad_norm": 0.2891677916049957, "learning_rate": 1.2900458663260506e-07, "loss": 0.3792, "step": 3870 }, { "epoch": 2.8043950736537067, "grad_norm": 0.2676737308502197, "learning_rate": 1.2805485917486538e-07, "loss": 0.347, "step": 3871 }, { "epoch": 2.805119536343878, "grad_norm": 0.295315682888031, "learning_rate": 1.2710859522682305e-07, "loss": 0.3655, "step": 3872 }, { "epoch": 2.8058439990340496, "grad_norm": 0.3251308500766754, "learning_rate": 1.2616579546118614e-07, "loss": 0.4119, "step": 3873 }, { "epoch": 2.806568461724221, "grad_norm": 0.2699500620365143, "learning_rate": 1.2522646054819841e-07, "loss": 0.3227, "step": 3874 }, { "epoch": 2.8072929244143925, "grad_norm": 0.30520108342170715, "learning_rate": 1.2429059115564247e-07, "loss": 0.4075, "step": 3875 }, { "epoch": 2.808017387104564, "grad_norm": 0.25896742939949036, "learning_rate": 1.2335818794883604e-07, "loss": 0.3361, "step": 3876 }, { "epoch": 2.8087418497947354, "grad_norm": 0.2834792137145996, "learning_rate": 1.224292515906328e-07, "loss": 0.3615, "step": 3877 }, { "epoch": 2.809466312484907, "grad_norm": 0.27928534150123596, "learning_rate": 1.215037827414217e-07, "loss": 0.3533, "step": 3878 }, { "epoch": 2.8101907751750783, "grad_norm": 0.27776283025741577, "learning_rate": 1.2058178205912763e-07, "loss": 0.3616, "step": 3879 }, { "epoch": 2.8109152378652498, "grad_norm": 0.26674947142601013, "learning_rate": 1.1966325019920845e-07, "loss": 0.3755, "step": 3880 }, { "epoch": 2.8116397005554212, "grad_norm": 0.2913094162940979, "learning_rate": 1.187481878146568e-07, "loss": 0.3893, "step": 3881 }, { "epoch": 2.8123641632455927, "grad_norm": 0.2489907294511795, "learning_rate": 1.1783659555600013e-07, "loss": 0.3276, "step": 3882 }, { "epoch": 2.813088625935764, "grad_norm": 0.2932935953140259, "learning_rate": 1.169284740712956e-07, "loss": 0.3836, "step": 3883 }, { "epoch": 2.8138130886259356, "grad_norm": 0.287579208612442, "learning_rate": 1.1602382400613566e-07, "loss": 0.3552, "step": 3884 }, { "epoch": 2.814537551316107, "grad_norm": 0.281368613243103, "learning_rate": 1.1512264600364476e-07, "loss": 0.3648, "step": 3885 }, { "epoch": 2.8152620140062785, "grad_norm": 0.29483702778816223, "learning_rate": 1.142249407044782e-07, "loss": 0.3976, "step": 3886 }, { "epoch": 2.81598647669645, "grad_norm": 0.29679441452026367, "learning_rate": 1.1333070874682217e-07, "loss": 0.35, "step": 3887 }, { "epoch": 2.8167109393866214, "grad_norm": 0.2835509181022644, "learning_rate": 1.1243995076639535e-07, "loss": 0.353, "step": 3888 }, { "epoch": 2.817435402076793, "grad_norm": 0.27114227414131165, "learning_rate": 1.1155266739644455e-07, "loss": 0.3711, "step": 3889 }, { "epoch": 2.8181598647669643, "grad_norm": 0.29177045822143555, "learning_rate": 1.1066885926774795e-07, "loss": 0.3701, "step": 3890 }, { "epoch": 2.818884327457136, "grad_norm": 0.27923476696014404, "learning_rate": 1.0978852700861243e-07, "loss": 0.3779, "step": 3891 }, { "epoch": 2.8196087901473073, "grad_norm": 0.2667888104915619, "learning_rate": 1.0891167124487512e-07, "loss": 0.3767, "step": 3892 }, { "epoch": 2.8203332528374787, "grad_norm": 0.2770543396472931, "learning_rate": 1.0803829259989962e-07, "loss": 0.3755, "step": 3893 }, { "epoch": 2.82105771552765, "grad_norm": 0.2816324532032013, "learning_rate": 1.0716839169457872e-07, "loss": 0.3652, "step": 3894 }, { "epoch": 2.8217821782178216, "grad_norm": 0.2936844229698181, "learning_rate": 1.0630196914733381e-07, "loss": 0.3844, "step": 3895 }, { "epoch": 2.822506640907993, "grad_norm": 0.2880173623561859, "learning_rate": 1.0543902557411112e-07, "loss": 0.3499, "step": 3896 }, { "epoch": 2.8232311035981645, "grad_norm": 0.28740009665489197, "learning_rate": 1.0457956158838545e-07, "loss": 0.3562, "step": 3897 }, { "epoch": 2.823955566288336, "grad_norm": 0.2766217291355133, "learning_rate": 1.037235778011586e-07, "loss": 0.3366, "step": 3898 }, { "epoch": 2.8246800289785075, "grad_norm": 0.2824343740940094, "learning_rate": 1.0287107482095549e-07, "loss": 0.3812, "step": 3899 }, { "epoch": 2.825404491668679, "grad_norm": 0.28766122460365295, "learning_rate": 1.0202205325382909e-07, "loss": 0.365, "step": 3900 }, { "epoch": 2.8261289543588504, "grad_norm": 0.2699967920780182, "learning_rate": 1.0117651370335657e-07, "loss": 0.3497, "step": 3901 }, { "epoch": 2.826853417049022, "grad_norm": 0.27633219957351685, "learning_rate": 1.003344567706388e-07, "loss": 0.3374, "step": 3902 }, { "epoch": 2.8275778797391933, "grad_norm": 0.28372082114219666, "learning_rate": 9.949588305430302e-08, "loss": 0.3982, "step": 3903 }, { "epoch": 2.8283023424293647, "grad_norm": 0.2713795602321625, "learning_rate": 9.866079315049792e-08, "loss": 0.3706, "step": 3904 }, { "epoch": 2.829026805119536, "grad_norm": 0.28915998339653015, "learning_rate": 9.782918765289584e-08, "loss": 0.3416, "step": 3905 }, { "epoch": 2.8297512678097076, "grad_norm": 0.2804672420024872, "learning_rate": 9.700106715269386e-08, "loss": 0.3628, "step": 3906 }, { "epoch": 2.830475730499879, "grad_norm": 0.27428948879241943, "learning_rate": 9.617643223860995e-08, "loss": 0.3599, "step": 3907 }, { "epoch": 2.8312001931900506, "grad_norm": 0.2709624767303467, "learning_rate": 9.53552834968846e-08, "loss": 0.3743, "step": 3908 }, { "epoch": 2.831924655880222, "grad_norm": 0.2893640398979187, "learning_rate": 9.453762151127865e-08, "loss": 0.3757, "step": 3909 }, { "epoch": 2.8326491185703935, "grad_norm": 0.2548883855342865, "learning_rate": 9.372344686307655e-08, "loss": 0.3389, "step": 3910 }, { "epoch": 2.833373581260565, "grad_norm": 0.25662896037101746, "learning_rate": 9.291276013108308e-08, "loss": 0.3633, "step": 3911 }, { "epoch": 2.8340980439507364, "grad_norm": 0.288316547870636, "learning_rate": 9.210556189162056e-08, "loss": 0.3605, "step": 3912 }, { "epoch": 2.834822506640908, "grad_norm": 0.28692349791526794, "learning_rate": 9.130185271853553e-08, "loss": 0.444, "step": 3913 }, { "epoch": 2.8355469693310793, "grad_norm": 0.2844719886779785, "learning_rate": 9.050163318319094e-08, "loss": 0.3784, "step": 3914 }, { "epoch": 2.8362714320212508, "grad_norm": 0.28568315505981445, "learning_rate": 8.970490385447061e-08, "loss": 0.3551, "step": 3915 }, { "epoch": 2.836995894711422, "grad_norm": 0.2708108425140381, "learning_rate": 8.891166529877593e-08, "loss": 0.3777, "step": 3916 }, { "epoch": 2.8377203574015937, "grad_norm": 0.28744837641716003, "learning_rate": 8.812191808002857e-08, "loss": 0.3629, "step": 3917 }, { "epoch": 2.838444820091765, "grad_norm": 0.26871103048324585, "learning_rate": 8.733566275966554e-08, "loss": 0.3355, "step": 3918 }, { "epoch": 2.8391692827819366, "grad_norm": 0.28512147068977356, "learning_rate": 8.655289989664361e-08, "loss": 0.3786, "step": 3919 }, { "epoch": 2.839893745472108, "grad_norm": 0.2846902906894684, "learning_rate": 8.5773630047436e-08, "loss": 0.3469, "step": 3920 }, { "epoch": 2.8406182081622795, "grad_norm": 0.30740705132484436, "learning_rate": 8.499785376603287e-08, "loss": 0.3989, "step": 3921 }, { "epoch": 2.841342670852451, "grad_norm": 0.2844851315021515, "learning_rate": 8.422557160393974e-08, "loss": 0.3946, "step": 3922 }, { "epoch": 2.8420671335426224, "grad_norm": 0.2821013927459717, "learning_rate": 8.34567841101791e-08, "loss": 0.3725, "step": 3923 }, { "epoch": 2.842791596232794, "grad_norm": 0.2808919847011566, "learning_rate": 8.269149183128988e-08, "loss": 0.3748, "step": 3924 }, { "epoch": 2.8435160589229653, "grad_norm": 0.2905890643596649, "learning_rate": 8.192969531132411e-08, "loss": 0.3638, "step": 3925 }, { "epoch": 2.844240521613137, "grad_norm": 0.27492284774780273, "learning_rate": 8.117139509185024e-08, "loss": 0.3737, "step": 3926 }, { "epoch": 2.8449649843033082, "grad_norm": 0.28944581747055054, "learning_rate": 8.041659171195149e-08, "loss": 0.3748, "step": 3927 }, { "epoch": 2.8456894469934797, "grad_norm": 0.26743370294570923, "learning_rate": 7.966528570822307e-08, "loss": 0.3116, "step": 3928 }, { "epoch": 2.846413909683651, "grad_norm": 0.2878575325012207, "learning_rate": 7.891747761477553e-08, "loss": 0.3525, "step": 3929 }, { "epoch": 2.8471383723738226, "grad_norm": 0.2684059739112854, "learning_rate": 7.817316796323305e-08, "loss": 0.3493, "step": 3930 }, { "epoch": 2.847862835063994, "grad_norm": 0.2783373296260834, "learning_rate": 7.74323572827318e-08, "loss": 0.3874, "step": 3931 }, { "epoch": 2.8485872977541655, "grad_norm": 0.290679931640625, "learning_rate": 7.669504609991996e-08, "loss": 0.3778, "step": 3932 }, { "epoch": 2.849311760444337, "grad_norm": 0.2789824903011322, "learning_rate": 7.59612349389599e-08, "loss": 0.3405, "step": 3933 }, { "epoch": 2.8500362231345084, "grad_norm": 0.28498178720474243, "learning_rate": 7.523092432152379e-08, "loss": 0.3746, "step": 3934 }, { "epoch": 2.85076068582468, "grad_norm": 0.30835679173469543, "learning_rate": 7.450411476679632e-08, "loss": 0.3964, "step": 3935 }, { "epoch": 2.8514851485148514, "grad_norm": 0.26104962825775146, "learning_rate": 7.378080679147304e-08, "loss": 0.3483, "step": 3936 }, { "epoch": 2.852209611205023, "grad_norm": 0.2758910059928894, "learning_rate": 7.306100090975987e-08, "loss": 0.3695, "step": 3937 }, { "epoch": 2.8529340738951943, "grad_norm": 0.29563212394714355, "learning_rate": 7.23446976333736e-08, "loss": 0.389, "step": 3938 }, { "epoch": 2.8536585365853657, "grad_norm": 0.27247682213783264, "learning_rate": 7.16318974715402e-08, "loss": 0.3718, "step": 3939 }, { "epoch": 2.854382999275537, "grad_norm": 0.2728745639324188, "learning_rate": 7.092260093099601e-08, "loss": 0.3674, "step": 3940 }, { "epoch": 2.8551074619657086, "grad_norm": 0.2716105580329895, "learning_rate": 7.021680851598656e-08, "loss": 0.3416, "step": 3941 }, { "epoch": 2.85583192465588, "grad_norm": 0.2787238359451294, "learning_rate": 6.951452072826547e-08, "loss": 0.3929, "step": 3942 }, { "epoch": 2.8565563873460516, "grad_norm": 0.2891733646392822, "learning_rate": 6.881573806709618e-08, "loss": 0.393, "step": 3943 }, { "epoch": 2.857280850036223, "grad_norm": 0.27603965997695923, "learning_rate": 6.812046102924908e-08, "loss": 0.3381, "step": 3944 }, { "epoch": 2.8580053127263945, "grad_norm": 0.2897740602493286, "learning_rate": 6.742869010900266e-08, "loss": 0.3468, "step": 3945 }, { "epoch": 2.858729775416566, "grad_norm": 0.26368018984794617, "learning_rate": 6.674042579814355e-08, "loss": 0.3461, "step": 3946 }, { "epoch": 2.8594542381067374, "grad_norm": 0.28718751668930054, "learning_rate": 6.605566858596536e-08, "loss": 0.3875, "step": 3947 }, { "epoch": 2.860178700796909, "grad_norm": 0.2798607051372528, "learning_rate": 6.537441895926699e-08, "loss": 0.3763, "step": 3948 }, { "epoch": 2.8609031634870803, "grad_norm": 0.28102874755859375, "learning_rate": 6.469667740235663e-08, "loss": 0.4063, "step": 3949 }, { "epoch": 2.8616276261772517, "grad_norm": 0.2764836251735687, "learning_rate": 6.402244439704497e-08, "loss": 0.3725, "step": 3950 }, { "epoch": 2.862352088867423, "grad_norm": 0.3150946795940399, "learning_rate": 6.335172042265192e-08, "loss": 0.38, "step": 3951 }, { "epoch": 2.8630765515575947, "grad_norm": 0.2598618268966675, "learning_rate": 6.26845059560005e-08, "loss": 0.3204, "step": 3952 }, { "epoch": 2.863801014247766, "grad_norm": 0.2851560413837433, "learning_rate": 6.202080147141909e-08, "loss": 0.3789, "step": 3953 }, { "epoch": 2.8645254769379376, "grad_norm": 0.28345081210136414, "learning_rate": 6.13606074407419e-08, "loss": 0.3693, "step": 3954 }, { "epoch": 2.865249939628109, "grad_norm": 0.30172497034072876, "learning_rate": 6.070392433330686e-08, "loss": 0.3787, "step": 3955 }, { "epoch": 2.8659744023182805, "grad_norm": 0.2671976089477539, "learning_rate": 6.005075261595495e-08, "loss": 0.4012, "step": 3956 }, { "epoch": 2.866698865008452, "grad_norm": 0.2589917778968811, "learning_rate": 5.9401092753033075e-08, "loss": 0.3427, "step": 3957 }, { "epoch": 2.8674233276986234, "grad_norm": 0.2888273000717163, "learning_rate": 5.875494520638958e-08, "loss": 0.415, "step": 3958 }, { "epoch": 2.868147790388795, "grad_norm": 0.2609724998474121, "learning_rate": 5.811231043537646e-08, "loss": 0.3292, "step": 3959 }, { "epoch": 2.8688722530789663, "grad_norm": 0.30220216512680054, "learning_rate": 5.747318889684883e-08, "loss": 0.3731, "step": 3960 }, { "epoch": 2.8695967157691378, "grad_norm": 0.29629749059677124, "learning_rate": 5.6837581045163795e-08, "loss": 0.3468, "step": 3961 }, { "epoch": 2.8703211784593092, "grad_norm": 0.3133241832256317, "learning_rate": 5.620548733218101e-08, "loss": 0.385, "step": 3962 }, { "epoch": 2.8710456411494807, "grad_norm": 0.3033459186553955, "learning_rate": 5.557690820726103e-08, "loss": 0.3358, "step": 3963 }, { "epoch": 2.871770103839652, "grad_norm": 0.3035110831260681, "learning_rate": 5.495184411726751e-08, "loss": 0.4177, "step": 3964 }, { "epoch": 2.8724945665298236, "grad_norm": 0.28143632411956787, "learning_rate": 5.433029550656277e-08, "loss": 0.3413, "step": 3965 }, { "epoch": 2.873219029219995, "grad_norm": 0.2995283305644989, "learning_rate": 5.371226281701225e-08, "loss": 0.3615, "step": 3966 }, { "epoch": 2.8739434919101665, "grad_norm": 0.29148584604263306, "learning_rate": 5.3097746487980606e-08, "loss": 0.3694, "step": 3967 }, { "epoch": 2.874667954600338, "grad_norm": 0.2459227442741394, "learning_rate": 5.248674695633338e-08, "loss": 0.3604, "step": 3968 }, { "epoch": 2.8753924172905094, "grad_norm": 0.2897578477859497, "learning_rate": 5.187926465643478e-08, "loss": 0.3861, "step": 3969 }, { "epoch": 2.876116879980681, "grad_norm": 0.2903725802898407, "learning_rate": 5.127530002015047e-08, "loss": 0.3477, "step": 3970 }, { "epoch": 2.8768413426708523, "grad_norm": 0.29529470205307007, "learning_rate": 5.067485347684364e-08, "loss": 0.3802, "step": 3971 }, { "epoch": 2.877565805361024, "grad_norm": 0.2762308418750763, "learning_rate": 5.0077925453377865e-08, "loss": 0.3622, "step": 3972 }, { "epoch": 2.8782902680511953, "grad_norm": 0.2710151970386505, "learning_rate": 4.948451637411367e-08, "loss": 0.3925, "step": 3973 }, { "epoch": 2.8790147307413667, "grad_norm": 0.26902881264686584, "learning_rate": 4.889462666091194e-08, "loss": 0.3401, "step": 3974 }, { "epoch": 2.879739193431538, "grad_norm": 0.29036855697631836, "learning_rate": 4.830825673312944e-08, "loss": 0.3605, "step": 3975 }, { "epoch": 2.8804636561217096, "grad_norm": 0.2783884108066559, "learning_rate": 4.772540700762274e-08, "loss": 0.3721, "step": 3976 }, { "epoch": 2.881188118811881, "grad_norm": 0.2853941321372986, "learning_rate": 4.7146077898745366e-08, "loss": 0.3786, "step": 3977 }, { "epoch": 2.8819125815020525, "grad_norm": 0.2738112211227417, "learning_rate": 4.657026981834623e-08, "loss": 0.3802, "step": 3978 }, { "epoch": 2.882637044192224, "grad_norm": 0.2583811581134796, "learning_rate": 4.599798317577342e-08, "loss": 0.3162, "step": 3979 }, { "epoch": 2.8833615068823955, "grad_norm": 0.26347148418426514, "learning_rate": 4.54292183778704e-08, "loss": 0.3769, "step": 3980 }, { "epoch": 2.884085969572567, "grad_norm": 0.2771751880645752, "learning_rate": 4.486397582897761e-08, "loss": 0.3748, "step": 3981 }, { "epoch": 2.8848104322627384, "grad_norm": 0.28683316707611084, "learning_rate": 4.430225593093029e-08, "loss": 0.377, "step": 3982 }, { "epoch": 2.88553489495291, "grad_norm": 0.289766401052475, "learning_rate": 4.374405908306067e-08, "loss": 0.3707, "step": 3983 }, { "epoch": 2.8862593576430813, "grad_norm": 0.27606743574142456, "learning_rate": 4.3189385682195774e-08, "loss": 0.349, "step": 3984 }, { "epoch": 2.8869838203332527, "grad_norm": 0.2924911379814148, "learning_rate": 4.2638236122658494e-08, "loss": 0.3845, "step": 3985 }, { "epoch": 2.887708283023424, "grad_norm": 0.27677443623542786, "learning_rate": 4.209061079626486e-08, "loss": 0.3456, "step": 3986 }, { "epoch": 2.8884327457135957, "grad_norm": 0.2879009544849396, "learning_rate": 4.1546510092327906e-08, "loss": 0.384, "step": 3987 }, { "epoch": 2.889157208403767, "grad_norm": 0.2650260925292969, "learning_rate": 4.1005934397652656e-08, "loss": 0.3308, "step": 3988 }, { "epoch": 2.8898816710939386, "grad_norm": 0.28029805421829224, "learning_rate": 4.046888409653949e-08, "loss": 0.3572, "step": 3989 }, { "epoch": 2.89060613378411, "grad_norm": 0.2961486577987671, "learning_rate": 3.9935359570782984e-08, "loss": 0.3709, "step": 3990 }, { "epoch": 2.8913305964742815, "grad_norm": 0.2700715959072113, "learning_rate": 3.9405361199669754e-08, "loss": 0.39, "step": 3991 }, { "epoch": 2.892055059164453, "grad_norm": 0.28643396496772766, "learning_rate": 3.887888935998119e-08, "loss": 0.3916, "step": 3992 }, { "epoch": 2.8927795218546244, "grad_norm": 0.2841840386390686, "learning_rate": 3.835594442599011e-08, "loss": 0.369, "step": 3993 }, { "epoch": 2.893503984544796, "grad_norm": 0.28521493077278137, "learning_rate": 3.783652676946303e-08, "loss": 0.3559, "step": 3994 }, { "epoch": 2.8942284472349673, "grad_norm": 0.3034553527832031, "learning_rate": 3.732063675965902e-08, "loss": 0.3981, "step": 3995 }, { "epoch": 2.8949529099251388, "grad_norm": 0.28887733817100525, "learning_rate": 3.680827476332804e-08, "loss": 0.355, "step": 3996 }, { "epoch": 2.89567737261531, "grad_norm": 0.2715854346752167, "learning_rate": 3.629944114471373e-08, "loss": 0.3657, "step": 3997 }, { "epoch": 2.8964018353054817, "grad_norm": 0.2611428201198578, "learning_rate": 3.579413626555006e-08, "loss": 0.3686, "step": 3998 }, { "epoch": 2.897126297995653, "grad_norm": 0.27714911103248596, "learning_rate": 3.529236048506357e-08, "loss": 0.3585, "step": 3999 }, { "epoch": 2.8978507606858246, "grad_norm": 0.26317623257637024, "learning_rate": 3.4794114159970024e-08, "loss": 0.3646, "step": 4000 }, { "epoch": 2.898575223375996, "grad_norm": 0.298836886882782, "learning_rate": 3.429939764447776e-08, "loss": 0.4247, "step": 4001 }, { "epoch": 2.8992996860661675, "grad_norm": 0.28203219175338745, "learning_rate": 3.3808211290284886e-08, "loss": 0.3347, "step": 4002 }, { "epoch": 2.900024148756339, "grad_norm": 0.26564550399780273, "learning_rate": 3.332055544658152e-08, "loss": 0.3495, "step": 4003 }, { "epoch": 2.9007486114465104, "grad_norm": 0.27218666672706604, "learning_rate": 3.283643046004481e-08, "loss": 0.3879, "step": 4004 }, { "epoch": 2.901473074136682, "grad_norm": 0.26349884271621704, "learning_rate": 3.235583667484443e-08, "loss": 0.3379, "step": 4005 }, { "epoch": 2.9021975368268533, "grad_norm": 0.26512470841407776, "learning_rate": 3.187877443263876e-08, "loss": 0.3548, "step": 4006 }, { "epoch": 2.902921999517025, "grad_norm": 0.3101538419723511, "learning_rate": 3.140524407257539e-08, "loss": 0.3782, "step": 4007 }, { "epoch": 2.9036464622071962, "grad_norm": 0.3095655143260956, "learning_rate": 3.0935245931292265e-08, "loss": 0.4258, "step": 4008 }, { "epoch": 2.9043709248973677, "grad_norm": 0.2775860130786896, "learning_rate": 3.0468780342914336e-08, "loss": 0.3298, "step": 4009 }, { "epoch": 2.905095387587539, "grad_norm": 0.29825085401535034, "learning_rate": 3.000584763905634e-08, "loss": 0.3561, "step": 4010 }, { "epoch": 2.9058198502777106, "grad_norm": 0.29400935769081116, "learning_rate": 2.9546448148821127e-08, "loss": 0.4039, "step": 4011 }, { "epoch": 2.906544312967882, "grad_norm": 0.2898826599121094, "learning_rate": 2.9090582198800788e-08, "loss": 0.3456, "step": 4012 }, { "epoch": 2.9072687756580535, "grad_norm": 0.27706432342529297, "learning_rate": 2.863825011307386e-08, "loss": 0.3754, "step": 4013 }, { "epoch": 2.907993238348225, "grad_norm": 0.27164602279663086, "learning_rate": 2.8189452213207014e-08, "loss": 0.3513, "step": 4014 }, { "epoch": 2.9087177010383964, "grad_norm": 0.2891388535499573, "learning_rate": 2.774418881825558e-08, "loss": 0.3968, "step": 4015 }, { "epoch": 2.909442163728568, "grad_norm": 0.2752481997013092, "learning_rate": 2.7302460244760797e-08, "loss": 0.3612, "step": 4016 }, { "epoch": 2.9101666264187394, "grad_norm": 0.26326096057891846, "learning_rate": 2.6864266806751472e-08, "loss": 0.3467, "step": 4017 }, { "epoch": 2.910891089108911, "grad_norm": 0.26474082469940186, "learning_rate": 2.642960881574286e-08, "loss": 0.35, "step": 4018 }, { "epoch": 2.9116155517990823, "grad_norm": 0.27665209770202637, "learning_rate": 2.599848658073889e-08, "loss": 0.3667, "step": 4019 }, { "epoch": 2.9123400144892537, "grad_norm": 0.2919926643371582, "learning_rate": 2.5570900408226072e-08, "loss": 0.3436, "step": 4020 }, { "epoch": 2.913064477179425, "grad_norm": 0.3066198527812958, "learning_rate": 2.5146850602180695e-08, "loss": 0.4156, "step": 4021 }, { "epoch": 2.9137889398695966, "grad_norm": 0.274679034948349, "learning_rate": 2.4726337464063277e-08, "loss": 0.3297, "step": 4022 }, { "epoch": 2.914513402559768, "grad_norm": 0.28614646196365356, "learning_rate": 2.4309361292820245e-08, "loss": 0.4056, "step": 4023 }, { "epoch": 2.9152378652499396, "grad_norm": 0.2933044731616974, "learning_rate": 2.3895922384883363e-08, "loss": 0.3947, "step": 4024 }, { "epoch": 2.915962327940111, "grad_norm": 0.3258002698421478, "learning_rate": 2.3486021034170857e-08, "loss": 0.3777, "step": 4025 }, { "epoch": 2.9166867906302825, "grad_norm": 0.2897891402244568, "learning_rate": 2.307965753208519e-08, "loss": 0.3453, "step": 4026 }, { "epoch": 2.917411253320454, "grad_norm": 0.2779790163040161, "learning_rate": 2.2676832167513042e-08, "loss": 0.3428, "step": 4027 }, { "epoch": 2.9181357160106254, "grad_norm": 0.303017795085907, "learning_rate": 2.2277545226827568e-08, "loss": 0.4143, "step": 4028 }, { "epoch": 2.918860178700797, "grad_norm": 0.2866486608982086, "learning_rate": 2.1881796993884487e-08, "loss": 0.3939, "step": 4029 }, { "epoch": 2.9195846413909683, "grad_norm": 0.301435649394989, "learning_rate": 2.148958775002541e-08, "loss": 0.3853, "step": 4030 }, { "epoch": 2.9203091040811398, "grad_norm": 0.27546167373657227, "learning_rate": 2.1100917774075635e-08, "loss": 0.3584, "step": 4031 }, { "epoch": 2.921033566771311, "grad_norm": 0.2818518877029419, "learning_rate": 2.0715787342343586e-08, "loss": 0.3839, "step": 4032 }, { "epoch": 2.9217580294614827, "grad_norm": 0.273394912481308, "learning_rate": 2.0334196728622468e-08, "loss": 0.3674, "step": 4033 }, { "epoch": 2.922482492151654, "grad_norm": 0.27478617429733276, "learning_rate": 1.9956146204188066e-08, "loss": 0.3583, "step": 4034 }, { "epoch": 2.9232069548418256, "grad_norm": 0.2750408947467804, "learning_rate": 1.9581636037799834e-08, "loss": 0.3837, "step": 4035 }, { "epoch": 2.923931417531997, "grad_norm": 0.27992895245552063, "learning_rate": 1.9210666495700914e-08, "loss": 0.3482, "step": 4036 }, { "epoch": 2.9246558802221685, "grad_norm": 0.2915441393852234, "learning_rate": 1.8843237841616458e-08, "loss": 0.352, "step": 4037 }, { "epoch": 2.92538034291234, "grad_norm": 0.27039408683776855, "learning_rate": 1.8479350336755298e-08, "loss": 0.3546, "step": 4038 }, { "epoch": 2.9261048056025114, "grad_norm": 0.29616355895996094, "learning_rate": 1.8119004239807726e-08, "loss": 0.3911, "step": 4039 }, { "epoch": 2.926829268292683, "grad_norm": 0.29640406370162964, "learning_rate": 1.7762199806947157e-08, "loss": 0.3997, "step": 4040 }, { "epoch": 2.9275537309828543, "grad_norm": 0.2678855359554291, "learning_rate": 1.7408937291829575e-08, "loss": 0.3358, "step": 4041 }, { "epoch": 2.9282781936730258, "grad_norm": 0.30745676159858704, "learning_rate": 1.705921694559187e-08, "loss": 0.3806, "step": 4042 }, { "epoch": 2.9290026563631972, "grad_norm": 0.2753886580467224, "learning_rate": 1.6713039016853505e-08, "loss": 0.3535, "step": 4043 }, { "epoch": 2.9297271190533687, "grad_norm": 0.28634709119796753, "learning_rate": 1.63704037517165e-08, "loss": 0.4172, "step": 4044 }, { "epoch": 2.93045158174354, "grad_norm": 0.27271002531051636, "learning_rate": 1.6031311393761572e-08, "loss": 0.3564, "step": 4045 }, { "epoch": 2.9311760444337116, "grad_norm": 0.27430829405784607, "learning_rate": 1.5695762184053666e-08, "loss": 0.3837, "step": 4046 }, { "epoch": 2.931900507123883, "grad_norm": 0.2650650441646576, "learning_rate": 1.5363756361138072e-08, "loss": 0.3714, "step": 4047 }, { "epoch": 2.9326249698140545, "grad_norm": 0.31847670674324036, "learning_rate": 1.5035294161039882e-08, "loss": 0.4126, "step": 4048 }, { "epoch": 2.933349432504226, "grad_norm": 0.2642671763896942, "learning_rate": 1.4710375817265643e-08, "loss": 0.353, "step": 4049 }, { "epoch": 2.9340738951943974, "grad_norm": 0.2699483335018158, "learning_rate": 1.4389001560803917e-08, "loss": 0.3789, "step": 4050 }, { "epoch": 2.934798357884569, "grad_norm": 0.2983326017856598, "learning_rate": 1.4071171620121393e-08, "loss": 0.3611, "step": 4051 }, { "epoch": 2.9355228205747403, "grad_norm": 0.28241604566574097, "learning_rate": 1.3756886221166776e-08, "loss": 0.3967, "step": 4052 }, { "epoch": 2.936247283264912, "grad_norm": 0.2586151957511902, "learning_rate": 1.3446145587368563e-08, "loss": 0.3355, "step": 4053 }, { "epoch": 2.9369717459550833, "grad_norm": 0.29851895570755005, "learning_rate": 1.3138949939635049e-08, "loss": 0.4049, "step": 4054 }, { "epoch": 2.9376962086452547, "grad_norm": 0.26841723918914795, "learning_rate": 1.283529949635376e-08, "loss": 0.3506, "step": 4055 }, { "epoch": 2.938420671335426, "grad_norm": 0.28538578748703003, "learning_rate": 1.253519447339313e-08, "loss": 0.3827, "step": 4056 }, { "epoch": 2.9391451340255976, "grad_norm": 0.27938780188560486, "learning_rate": 1.223863508410028e-08, "loss": 0.3811, "step": 4057 }, { "epoch": 2.939869596715769, "grad_norm": 0.2819328010082245, "learning_rate": 1.1945621539301566e-08, "loss": 0.3551, "step": 4058 }, { "epoch": 2.9405940594059405, "grad_norm": 0.28163284063339233, "learning_rate": 1.1656154047303691e-08, "loss": 0.353, "step": 4059 }, { "epoch": 2.941318522096112, "grad_norm": 0.2998712658882141, "learning_rate": 1.1370232813892045e-08, "loss": 0.407, "step": 4060 }, { "epoch": 2.9420429847862835, "grad_norm": 0.2733823359012604, "learning_rate": 1.1087858042329036e-08, "loss": 0.3732, "step": 4061 }, { "epoch": 2.942767447476455, "grad_norm": 0.2657240927219391, "learning_rate": 1.0809029933359083e-08, "loss": 0.3122, "step": 4062 }, { "epoch": 2.9434919101666264, "grad_norm": 0.2789859473705292, "learning_rate": 1.0533748685202516e-08, "loss": 0.3738, "step": 4063 }, { "epoch": 2.944216372856798, "grad_norm": 0.2841784656047821, "learning_rate": 1.0262014493559458e-08, "loss": 0.3732, "step": 4064 }, { "epoch": 2.9449408355469693, "grad_norm": 0.2746102809906006, "learning_rate": 9.993827551608715e-09, "loss": 0.3611, "step": 4065 }, { "epoch": 2.9456652982371407, "grad_norm": 0.27584996819496155, "learning_rate": 9.729188050006666e-09, "loss": 0.3535, "step": 4066 }, { "epoch": 2.946389760927312, "grad_norm": 0.2730507552623749, "learning_rate": 9.468096176887265e-09, "loss": 0.3626, "step": 4067 }, { "epoch": 2.9471142236174837, "grad_norm": 0.2981140613555908, "learning_rate": 9.210552117863703e-09, "loss": 0.4176, "step": 4068 }, { "epoch": 2.947838686307655, "grad_norm": 0.276419997215271, "learning_rate": 8.956556056026188e-09, "loss": 0.3705, "step": 4069 }, { "epoch": 2.9485631489978266, "grad_norm": 0.2707394063472748, "learning_rate": 8.706108171942507e-09, "loss": 0.3759, "step": 4070 }, { "epoch": 2.949287611687998, "grad_norm": 0.29356303811073303, "learning_rate": 8.459208643659122e-09, "loss": 0.3932, "step": 4071 }, { "epoch": 2.9500120743781695, "grad_norm": 0.2673918604850769, "learning_rate": 8.21585764669841e-09, "loss": 0.3346, "step": 4072 }, { "epoch": 2.950736537068341, "grad_norm": 0.29154303669929504, "learning_rate": 7.976055354060874e-09, "loss": 0.3741, "step": 4073 }, { "epoch": 2.9514609997585124, "grad_norm": 0.2829691469669342, "learning_rate": 7.739801936224034e-09, "loss": 0.3893, "step": 4074 }, { "epoch": 2.952185462448684, "grad_norm": 0.2679212689399719, "learning_rate": 7.507097561143538e-09, "loss": 0.3556, "step": 4075 }, { "epoch": 2.9529099251388553, "grad_norm": 0.2919843792915344, "learning_rate": 7.277942394249282e-09, "loss": 0.3298, "step": 4076 }, { "epoch": 2.9536343878290268, "grad_norm": 0.2853247821331024, "learning_rate": 7.052336598451504e-09, "loss": 0.3722, "step": 4077 }, { "epoch": 2.954358850519198, "grad_norm": 0.27992886304855347, "learning_rate": 6.8302803341346864e-09, "loss": 0.3538, "step": 4078 }, { "epoch": 2.9550833132093697, "grad_norm": 0.26797547936439514, "learning_rate": 6.6117737591597785e-09, "loss": 0.402, "step": 4079 }, { "epoch": 2.955807775899541, "grad_norm": 0.2667589783668518, "learning_rate": 6.3968170288664086e-09, "loss": 0.3721, "step": 4080 }, { "epoch": 2.9565322385897126, "grad_norm": 0.25521034002304077, "learning_rate": 6.185410296069006e-09, "loss": 0.3467, "step": 4081 }, { "epoch": 2.957256701279884, "grad_norm": 0.27678585052490234, "learning_rate": 5.977553711057904e-09, "loss": 0.3754, "step": 4082 }, { "epoch": 2.9579811639700555, "grad_norm": 0.26573485136032104, "learning_rate": 5.773247421600459e-09, "loss": 0.3547, "step": 4083 }, { "epoch": 2.958705626660227, "grad_norm": 0.28422388434410095, "learning_rate": 5.572491572939931e-09, "loss": 0.3804, "step": 4084 }, { "epoch": 2.9594300893503984, "grad_norm": 0.2978706955909729, "learning_rate": 5.375286307795491e-09, "loss": 0.3958, "step": 4085 }, { "epoch": 2.96015455204057, "grad_norm": 0.25702351331710815, "learning_rate": 5.181631766362216e-09, "loss": 0.3311, "step": 4086 }, { "epoch": 2.9608790147307413, "grad_norm": 0.27617666125297546, "learning_rate": 4.9915280863105375e-09, "loss": 0.3937, "step": 4087 }, { "epoch": 2.961603477420913, "grad_norm": 0.287548303604126, "learning_rate": 4.804975402787348e-09, "loss": 0.3639, "step": 4088 }, { "epoch": 2.9623279401110842, "grad_norm": 0.30876433849334717, "learning_rate": 4.621973848414341e-09, "loss": 0.4211, "step": 4089 }, { "epoch": 2.9630524028012557, "grad_norm": 0.2721802294254303, "learning_rate": 4.442523553289113e-09, "loss": 0.3379, "step": 4090 }, { "epoch": 2.963776865491427, "grad_norm": 0.2889803647994995, "learning_rate": 4.266624644984063e-09, "loss": 0.3685, "step": 4091 }, { "epoch": 2.9645013281815986, "grad_norm": 0.2903919816017151, "learning_rate": 4.094277248548051e-09, "loss": 0.3665, "step": 4092 }, { "epoch": 2.96522579087177, "grad_norm": 0.27301403880119324, "learning_rate": 3.9254814865036236e-09, "loss": 0.3622, "step": 4093 }, { "epoch": 2.9659502535619415, "grad_norm": 0.27502575516700745, "learning_rate": 3.760237478849793e-09, "loss": 0.3572, "step": 4094 }, { "epoch": 2.966674716252113, "grad_norm": 0.26436156034469604, "learning_rate": 3.5985453430598115e-09, "loss": 0.3688, "step": 4095 }, { "epoch": 2.9673991789422844, "grad_norm": 0.2628489136695862, "learning_rate": 3.4404051940817306e-09, "loss": 0.3814, "step": 4096 }, { "epoch": 2.968123641632456, "grad_norm": 0.2711799740791321, "learning_rate": 3.2858171443395092e-09, "loss": 0.3761, "step": 4097 }, { "epoch": 2.9688481043226274, "grad_norm": 0.280161589384079, "learning_rate": 3.134781303730794e-09, "loss": 0.38, "step": 4098 }, { "epoch": 2.969572567012799, "grad_norm": 0.29012399911880493, "learning_rate": 2.9872977796280287e-09, "loss": 0.3787, "step": 4099 }, { "epoch": 2.9702970297029703, "grad_norm": 0.2718244194984436, "learning_rate": 2.8433666768790115e-09, "loss": 0.3302, "step": 4100 }, { "epoch": 2.9710214923931417, "grad_norm": 0.2980569303035736, "learning_rate": 2.702988097805781e-09, "loss": 0.3713, "step": 4101 }, { "epoch": 2.971745955083313, "grad_norm": 0.2667255997657776, "learning_rate": 2.566162142204065e-09, "loss": 0.3622, "step": 4102 }, { "epoch": 2.9724704177734846, "grad_norm": 0.2724962532520294, "learning_rate": 2.4328889073449436e-09, "loss": 0.3703, "step": 4103 }, { "epoch": 2.973194880463656, "grad_norm": 0.2792902886867523, "learning_rate": 2.3031684879742944e-09, "loss": 0.379, "step": 4104 }, { "epoch": 2.9739193431538276, "grad_norm": 0.2913864552974701, "learning_rate": 2.1770009763105727e-09, "loss": 0.4, "step": 4105 }, { "epoch": 2.974643805843999, "grad_norm": 0.288669615983963, "learning_rate": 2.0543864620481413e-09, "loss": 0.3904, "step": 4106 }, { "epoch": 2.9753682685341705, "grad_norm": 0.2775460183620453, "learning_rate": 1.9353250323539407e-09, "loss": 0.3253, "step": 4107 }, { "epoch": 2.976092731224342, "grad_norm": 0.2943876385688782, "learning_rate": 1.8198167718708192e-09, "loss": 0.3863, "step": 4108 }, { "epoch": 2.9768171939145134, "grad_norm": 0.2772209942340851, "learning_rate": 1.7078617627136474e-09, "loss": 0.3678, "step": 4109 }, { "epoch": 2.977541656604685, "grad_norm": 0.26036664843559265, "learning_rate": 1.5994600844726482e-09, "loss": 0.3527, "step": 4110 }, { "epoch": 2.9782661192948563, "grad_norm": 0.2958499789237976, "learning_rate": 1.4946118142117326e-09, "loss": 0.3892, "step": 4111 }, { "epoch": 2.9789905819850278, "grad_norm": 0.291458398103714, "learning_rate": 1.393317026468499e-09, "loss": 0.3752, "step": 4112 }, { "epoch": 2.979715044675199, "grad_norm": 0.3132496178150177, "learning_rate": 1.2955757932542334e-09, "loss": 0.4002, "step": 4113 }, { "epoch": 2.9804395073653707, "grad_norm": 0.27965471148490906, "learning_rate": 1.2013881840539089e-09, "loss": 0.3466, "step": 4114 }, { "epoch": 2.981163970055542, "grad_norm": 0.25562646985054016, "learning_rate": 1.1107542658261861e-09, "loss": 0.3627, "step": 4115 }, { "epoch": 2.9818884327457136, "grad_norm": 0.265747994184494, "learning_rate": 1.0236741030039687e-09, "loss": 0.3576, "step": 4116 }, { "epoch": 2.982612895435885, "grad_norm": 0.27731767296791077, "learning_rate": 9.401477574932927e-10, "loss": 0.3921, "step": 4117 }, { "epoch": 2.9833373581260565, "grad_norm": 0.2845531702041626, "learning_rate": 8.601752886733261e-10, "loss": 0.3762, "step": 4118 }, { "epoch": 2.984061820816228, "grad_norm": 0.28089457750320435, "learning_rate": 7.8375675339748e-10, "loss": 0.3249, "step": 4119 }, { "epoch": 2.9847862835063994, "grad_norm": 0.3056414723396301, "learning_rate": 7.108922059928525e-10, "loss": 0.3909, "step": 4120 }, { "epoch": 2.985510746196571, "grad_norm": 0.31060314178466797, "learning_rate": 6.415816982585644e-10, "loss": 0.3886, "step": 4121 }, { "epoch": 2.9862352088867423, "grad_norm": 0.2695300877094269, "learning_rate": 5.758252794690888e-10, "loss": 0.3526, "step": 4122 }, { "epoch": 2.9869596715769138, "grad_norm": 0.29083436727523804, "learning_rate": 5.136229963703665e-10, "loss": 0.4076, "step": 4123 }, { "epoch": 2.9876841342670852, "grad_norm": 0.26678046584129333, "learning_rate": 4.549748931831355e-10, "loss": 0.3266, "step": 4124 }, { "epoch": 2.9884085969572567, "grad_norm": 0.2985779941082001, "learning_rate": 3.9988101160071124e-10, "loss": 0.3866, "step": 4125 }, { "epoch": 2.989133059647428, "grad_norm": 0.28071969747543335, "learning_rate": 3.4834139078954167e-10, "loss": 0.3534, "step": 4126 }, { "epoch": 2.9898575223375996, "grad_norm": 0.27365851402282715, "learning_rate": 3.003560673903172e-10, "loss": 0.3648, "step": 4127 }, { "epoch": 2.990581985027771, "grad_norm": 0.2751316428184509, "learning_rate": 2.5592507551519543e-10, "loss": 0.3567, "step": 4128 }, { "epoch": 2.9913064477179425, "grad_norm": 0.2695724070072174, "learning_rate": 2.1504844675168667e-10, "loss": 0.3566, "step": 4129 }, { "epoch": 2.992030910408114, "grad_norm": 0.27674078941345215, "learning_rate": 1.7772621015876846e-10, "loss": 0.3705, "step": 4130 }, { "epoch": 2.9927553730982854, "grad_norm": 0.2815358340740204, "learning_rate": 1.4395839226910568e-10, "loss": 0.3561, "step": 4131 }, { "epoch": 2.993479835788457, "grad_norm": 0.3067064881324768, "learning_rate": 1.1374501708849572e-10, "loss": 0.3854, "step": 4132 }, { "epoch": 2.9942042984786283, "grad_norm": 0.2695311903953552, "learning_rate": 8.708610609642343e-11, "loss": 0.3418, "step": 4133 }, { "epoch": 2.9949287611688, "grad_norm": 0.30188092589378357, "learning_rate": 6.398167824439583e-11, "loss": 0.3993, "step": 4134 }, { "epoch": 2.9956532238589713, "grad_norm": 0.29380542039871216, "learning_rate": 4.443174995760746e-11, "loss": 0.4086, "step": 4135 }, { "epoch": 2.9963776865491427, "grad_norm": 0.2877809405326843, "learning_rate": 2.8436335134940372e-11, "loss": 0.3413, "step": 4136 }, { "epoch": 2.997102149239314, "grad_norm": 0.2900252938270569, "learning_rate": 1.599544514674367e-11, "loss": 0.394, "step": 4137 }, { "epoch": 2.9978266119294856, "grad_norm": 0.2824209928512573, "learning_rate": 7.109088837609079e-12, "loss": 0.3666, "step": 4138 }, { "epoch": 2.998551074619657, "grad_norm": 0.2741779386997223, "learning_rate": 1.7772725252607204e-12, "loss": 0.3459, "step": 4139 }, { "epoch": 2.9992755373098285, "grad_norm": 0.28835082054138184, "learning_rate": 0.0, "loss": 0.3451, "step": 4140 }, { "epoch": 2.9992755373098285, "step": 4140, "total_flos": 5393877495447552.0, "train_loss": 0.4123690740713751, "train_runtime": 83797.5514, "train_samples_per_second": 4.743, "train_steps_per_second": 0.049 } ], "logging_steps": 1.0, "max_steps": 4140, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5393877495447552.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }