{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1626, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024622960911049553, "grad_norm": 1.5234375, "learning_rate": 4.0816326530612243e-07, "loss": 1.3865270614624023, "step": 2 }, { "epoch": 0.0049245921822099106, "grad_norm": 36.75, "learning_rate": 1.2244897959183673e-06, "loss": 1.8756635189056396, "step": 4 }, { "epoch": 0.007386888273314866, "grad_norm": 3.625, "learning_rate": 2.0408163265306125e-06, "loss": 1.1310276985168457, "step": 6 }, { "epoch": 0.009849184364419821, "grad_norm": 6.09375, "learning_rate": 2.8571428571428573e-06, "loss": 1.8238341808319092, "step": 8 }, { "epoch": 0.012311480455524777, "grad_norm": 12.0, "learning_rate": 3.6734693877551024e-06, "loss": 2.2014291286468506, "step": 10 }, { "epoch": 0.014773776546629732, "grad_norm": 19.5, "learning_rate": 4.489795918367348e-06, "loss": 2.4339303970336914, "step": 12 }, { "epoch": 0.017236072637734686, "grad_norm": 9.1875, "learning_rate": 5.306122448979593e-06, "loss": 1.3835787773132324, "step": 14 }, { "epoch": 0.019698368728839642, "grad_norm": 3.453125, "learning_rate": 6.122448979591837e-06, "loss": 1.1793060302734375, "step": 16 }, { "epoch": 0.0221606648199446, "grad_norm": 2.21875, "learning_rate": 6.938775510204082e-06, "loss": 1.173147439956665, "step": 18 }, { "epoch": 0.024622960911049555, "grad_norm": 12.875, "learning_rate": 7.755102040816327e-06, "loss": 2.2560791969299316, "step": 20 }, { "epoch": 0.02708525700215451, "grad_norm": 3.859375, "learning_rate": 8.571428571428571e-06, "loss": 1.753507137298584, "step": 22 }, { "epoch": 0.029547553093259463, "grad_norm": 11.125, "learning_rate": 9.387755102040818e-06, "loss": 2.109658718109131, "step": 24 }, { "epoch": 0.03200984918436442, "grad_norm": 4.375, "learning_rate": 1.0204081632653063e-05, "loss": 1.7001088857650757, "step": 26 }, { "epoch": 0.03447214527546937, "grad_norm": 7.53125, "learning_rate": 1.1020408163265306e-05, "loss": 2.2228636741638184, "step": 28 }, { "epoch": 0.03693444136657433, "grad_norm": 2.09375, "learning_rate": 1.1836734693877552e-05, "loss": 1.233575463294983, "step": 30 }, { "epoch": 0.039396737457679284, "grad_norm": 5.03125, "learning_rate": 1.2653061224489798e-05, "loss": 1.834639549255371, "step": 32 }, { "epoch": 0.041859033548784244, "grad_norm": 3.796875, "learning_rate": 1.3469387755102042e-05, "loss": 1.8060579299926758, "step": 34 }, { "epoch": 0.0443213296398892, "grad_norm": 2.625, "learning_rate": 1.4285714285714287e-05, "loss": 1.4287090301513672, "step": 36 }, { "epoch": 0.04678362573099415, "grad_norm": 11.375, "learning_rate": 1.510204081632653e-05, "loss": 2.1703319549560547, "step": 38 }, { "epoch": 0.04924592182209911, "grad_norm": 40.75, "learning_rate": 1.5918367346938776e-05, "loss": 2.1797375679016113, "step": 40 }, { "epoch": 0.05170821791320406, "grad_norm": 15.8125, "learning_rate": 1.673469387755102e-05, "loss": 1.9881037473678589, "step": 42 }, { "epoch": 0.05417051400430902, "grad_norm": 9.375, "learning_rate": 1.7551020408163266e-05, "loss": 1.735787034034729, "step": 44 }, { "epoch": 0.056632810095413974, "grad_norm": 8.125, "learning_rate": 1.836734693877551e-05, "loss": 1.9953798055648804, "step": 46 }, { "epoch": 0.05909510618651893, "grad_norm": 4.40625, "learning_rate": 1.9183673469387756e-05, "loss": 1.1727348566055298, "step": 48 }, { "epoch": 0.061557402277623886, "grad_norm": 3.4375, "learning_rate": 2e-05, "loss": 1.6915946006774902, "step": 50 }, { "epoch": 0.06401969836872884, "grad_norm": 5.96875, "learning_rate": 1.9999936502625722e-05, "loss": 2.3282856941223145, "step": 52 }, { "epoch": 0.0664819944598338, "grad_norm": 5.90625, "learning_rate": 1.9999746011510863e-05, "loss": 1.9712034463882446, "step": 54 }, { "epoch": 0.06894429055093874, "grad_norm": 18.375, "learning_rate": 1.9999428529679345e-05, "loss": 1.5145387649536133, "step": 56 }, { "epoch": 0.0714065866420437, "grad_norm": 3.84375, "learning_rate": 1.9998984062170987e-05, "loss": 1.1939287185668945, "step": 58 }, { "epoch": 0.07386888273314866, "grad_norm": 5.34375, "learning_rate": 1.9998412616041416e-05, "loss": 1.7602123022079468, "step": 60 }, { "epoch": 0.07633117882425362, "grad_norm": 4.71875, "learning_rate": 1.9997714200361962e-05, "loss": 1.446789264678955, "step": 62 }, { "epoch": 0.07879347491535857, "grad_norm": 3.296875, "learning_rate": 1.999688882621952e-05, "loss": 1.6264426708221436, "step": 64 }, { "epoch": 0.08125577100646353, "grad_norm": 2.84375, "learning_rate": 1.9995936506716357e-05, "loss": 1.63454008102417, "step": 66 }, { "epoch": 0.08371806709756849, "grad_norm": 4.0625, "learning_rate": 1.9994857256969928e-05, "loss": 1.8928616046905518, "step": 68 }, { "epoch": 0.08618036318867343, "grad_norm": 4.15625, "learning_rate": 1.999365109411261e-05, "loss": 1.7232308387756348, "step": 70 }, { "epoch": 0.0886426592797784, "grad_norm": 3.5, "learning_rate": 1.9992318037291443e-05, "loss": 1.5345882177352905, "step": 72 }, { "epoch": 0.09110495537088335, "grad_norm": 2.390625, "learning_rate": 1.9990858107667836e-05, "loss": 1.5957210063934326, "step": 74 }, { "epoch": 0.0935672514619883, "grad_norm": 4.9375, "learning_rate": 1.9989271328417207e-05, "loss": 1.5378596782684326, "step": 76 }, { "epoch": 0.09602954755309326, "grad_norm": 4.625, "learning_rate": 1.998755772472864e-05, "loss": 1.7094926834106445, "step": 78 }, { "epoch": 0.09849184364419822, "grad_norm": 4.78125, "learning_rate": 1.9985717323804467e-05, "loss": 1.6278411149978638, "step": 80 }, { "epoch": 0.10095413973530316, "grad_norm": 2.859375, "learning_rate": 1.998375015485984e-05, "loss": 1.1961259841918945, "step": 82 }, { "epoch": 0.10341643582640812, "grad_norm": 6.15625, "learning_rate": 1.9981656249122285e-05, "loss": 1.1318538188934326, "step": 84 }, { "epoch": 0.10587873191751308, "grad_norm": 7.0625, "learning_rate": 1.997943563983117e-05, "loss": 1.7807825803756714, "step": 86 }, { "epoch": 0.10834102800861804, "grad_norm": 5.1875, "learning_rate": 1.9977088362237217e-05, "loss": 1.4653401374816895, "step": 88 }, { "epoch": 0.11080332409972299, "grad_norm": 1.8671875, "learning_rate": 1.9974614453601913e-05, "loss": 1.245106816291809, "step": 90 }, { "epoch": 0.11326562019082795, "grad_norm": 13.625, "learning_rate": 1.997201395319694e-05, "loss": 1.646073818206787, "step": 92 }, { "epoch": 0.11572791628193291, "grad_norm": 4.28125, "learning_rate": 1.996928690230353e-05, "loss": 1.5019184350967407, "step": 94 }, { "epoch": 0.11819021237303785, "grad_norm": 4.375, "learning_rate": 1.996643334421182e-05, "loss": 1.4860734939575195, "step": 96 }, { "epoch": 0.12065250846414281, "grad_norm": 2.8125, "learning_rate": 1.9963453324220185e-05, "loss": 1.1848664283752441, "step": 98 }, { "epoch": 0.12311480455524777, "grad_norm": 3.015625, "learning_rate": 1.9960346889634478e-05, "loss": 1.2456748485565186, "step": 100 }, { "epoch": 0.12557710064635272, "grad_norm": 1.765625, "learning_rate": 1.9957114089767306e-05, "loss": 1.163445234298706, "step": 102 }, { "epoch": 0.12803939673745768, "grad_norm": 2.546875, "learning_rate": 1.9953754975937246e-05, "loss": 1.5070371627807617, "step": 104 }, { "epoch": 0.13050169282856264, "grad_norm": 2.171875, "learning_rate": 1.9950269601468033e-05, "loss": 1.0462160110473633, "step": 106 }, { "epoch": 0.1329639889196676, "grad_norm": 2.75, "learning_rate": 1.9946658021687694e-05, "loss": 1.46537184715271, "step": 108 }, { "epoch": 0.13542628501077256, "grad_norm": 3.109375, "learning_rate": 1.994292029392768e-05, "loss": 1.5482763051986694, "step": 110 }, { "epoch": 0.1378885811018775, "grad_norm": 1.4921875, "learning_rate": 1.993905647752198e-05, "loss": 1.0207593441009521, "step": 112 }, { "epoch": 0.14035087719298245, "grad_norm": 6.46875, "learning_rate": 1.9935066633806133e-05, "loss": 1.77092444896698, "step": 114 }, { "epoch": 0.1428131732840874, "grad_norm": 4.21875, "learning_rate": 1.9930950826116288e-05, "loss": 1.4896173477172852, "step": 116 }, { "epoch": 0.14527546937519237, "grad_norm": 1.6796875, "learning_rate": 1.9926709119788197e-05, "loss": 1.1458995342254639, "step": 118 }, { "epoch": 0.14773776546629733, "grad_norm": 1.8203125, "learning_rate": 1.9922341582156156e-05, "loss": 1.0295559167861938, "step": 120 }, { "epoch": 0.1502000615574023, "grad_norm": 3.984375, "learning_rate": 1.9917848282551965e-05, "loss": 1.4944086074829102, "step": 122 }, { "epoch": 0.15266235764850725, "grad_norm": 2.78125, "learning_rate": 1.9913229292303806e-05, "loss": 1.5551412105560303, "step": 124 }, { "epoch": 0.15512465373961218, "grad_norm": 23.5, "learning_rate": 1.990848468473511e-05, "loss": 1.9140477180480957, "step": 126 }, { "epoch": 0.15758694983071714, "grad_norm": 10.4375, "learning_rate": 1.9903614535163417e-05, "loss": 1.4774185419082642, "step": 128 }, { "epoch": 0.1600492459218221, "grad_norm": 1.6484375, "learning_rate": 1.989861892089914e-05, "loss": 1.1932008266448975, "step": 130 }, { "epoch": 0.16251154201292706, "grad_norm": 1.0625, "learning_rate": 1.9893497921244394e-05, "loss": 1.253312349319458, "step": 132 }, { "epoch": 0.16497383810403202, "grad_norm": 2.8125, "learning_rate": 1.9888251617491674e-05, "loss": 1.0982537269592285, "step": 134 }, { "epoch": 0.16743613419513698, "grad_norm": 3.5, "learning_rate": 1.9882880092922612e-05, "loss": 1.5139843225479126, "step": 136 }, { "epoch": 0.1698984302862419, "grad_norm": 3.109375, "learning_rate": 1.9877383432806633e-05, "loss": 1.542289137840271, "step": 138 }, { "epoch": 0.17236072637734687, "grad_norm": 3.234375, "learning_rate": 1.9871761724399617e-05, "loss": 1.432151436805725, "step": 140 }, { "epoch": 0.17482302246845183, "grad_norm": 2.84375, "learning_rate": 1.986601505694248e-05, "loss": 1.500737190246582, "step": 142 }, { "epoch": 0.1772853185595568, "grad_norm": 4.65625, "learning_rate": 1.986014352165981e-05, "loss": 1.461523413658142, "step": 144 }, { "epoch": 0.17974761465066175, "grad_norm": 2.859375, "learning_rate": 1.985414721175837e-05, "loss": 1.5014877319335938, "step": 146 }, { "epoch": 0.1822099107417667, "grad_norm": 3.3125, "learning_rate": 1.9848026222425636e-05, "loss": 1.4726862907409668, "step": 148 }, { "epoch": 0.18467220683287167, "grad_norm": 4.625, "learning_rate": 1.9841780650828308e-05, "loss": 1.543365716934204, "step": 150 }, { "epoch": 0.1871345029239766, "grad_norm": 16.5, "learning_rate": 1.9835410596110723e-05, "loss": 0.5347945094108582, "step": 152 }, { "epoch": 0.18959679901508156, "grad_norm": 4.71875, "learning_rate": 1.982891615939333e-05, "loss": 1.6506417989730835, "step": 154 }, { "epoch": 0.19205909510618652, "grad_norm": 20.875, "learning_rate": 1.982229744377104e-05, "loss": 0.903097927570343, "step": 156 }, { "epoch": 0.19452139119729148, "grad_norm": 4.71875, "learning_rate": 1.9815554554311623e-05, "loss": 1.4461334943771362, "step": 158 }, { "epoch": 0.19698368728839644, "grad_norm": 7.78125, "learning_rate": 1.9808687598054023e-05, "loss": 1.1890747547149658, "step": 160 }, { "epoch": 0.1994459833795014, "grad_norm": 5.15625, "learning_rate": 1.980169668400666e-05, "loss": 1.4282548427581787, "step": 162 }, { "epoch": 0.20190827947060633, "grad_norm": 3.359375, "learning_rate": 1.9794581923145708e-05, "loss": 1.2562037706375122, "step": 164 }, { "epoch": 0.2043705755617113, "grad_norm": 3.296875, "learning_rate": 1.9787343428413327e-05, "loss": 1.4614920616149902, "step": 166 }, { "epoch": 0.20683287165281625, "grad_norm": 3.484375, "learning_rate": 1.9779981314715866e-05, "loss": 1.3043287992477417, "step": 168 }, { "epoch": 0.2092951677439212, "grad_norm": 3.203125, "learning_rate": 1.9772495698922047e-05, "loss": 1.17995285987854, "step": 170 }, { "epoch": 0.21175746383502617, "grad_norm": 6.03125, "learning_rate": 1.9764886699861104e-05, "loss": 2.0112454891204834, "step": 172 }, { "epoch": 0.21421975992613113, "grad_norm": 5.6875, "learning_rate": 1.9757154438320914e-05, "loss": 1.485538363456726, "step": 174 }, { "epoch": 0.21668205601723609, "grad_norm": 3.78125, "learning_rate": 1.974929903704604e-05, "loss": 1.445993423461914, "step": 176 }, { "epoch": 0.21914435210834102, "grad_norm": 3.59375, "learning_rate": 1.9741320620735832e-05, "loss": 1.4375782012939453, "step": 178 }, { "epoch": 0.22160664819944598, "grad_norm": 5.875, "learning_rate": 1.9733219316042404e-05, "loss": 1.8119451999664307, "step": 180 }, { "epoch": 0.22406894429055094, "grad_norm": 6.4375, "learning_rate": 1.9724995251568648e-05, "loss": 1.9366390705108643, "step": 182 }, { "epoch": 0.2265312403816559, "grad_norm": 5.0, "learning_rate": 1.97166485578662e-05, "loss": 1.4353549480438232, "step": 184 }, { "epoch": 0.22899353647276086, "grad_norm": 3.859375, "learning_rate": 1.9708179367433333e-05, "loss": 1.4814636707305908, "step": 186 }, { "epoch": 0.23145583256386582, "grad_norm": 2.8125, "learning_rate": 1.969958781471289e-05, "loss": 1.3983485698699951, "step": 188 }, { "epoch": 0.23391812865497075, "grad_norm": 6.4375, "learning_rate": 1.9690874036090126e-05, "loss": 1.8465726375579834, "step": 190 }, { "epoch": 0.2363804247460757, "grad_norm": 2.875, "learning_rate": 1.9682038169890563e-05, "loss": 1.4366203546524048, "step": 192 }, { "epoch": 0.23884272083718067, "grad_norm": 4.96875, "learning_rate": 1.9673080356377778e-05, "loss": 1.397793173789978, "step": 194 }, { "epoch": 0.24130501692828563, "grad_norm": 9.1875, "learning_rate": 1.9664000737751176e-05, "loss": 0.40697720646858215, "step": 196 }, { "epoch": 0.24376731301939059, "grad_norm": 9.9375, "learning_rate": 1.9654799458143744e-05, "loss": 0.7866343259811401, "step": 198 }, { "epoch": 0.24622960911049555, "grad_norm": 3.34375, "learning_rate": 1.9645476663619748e-05, "loss": 1.4268109798431396, "step": 200 }, { "epoch": 0.2486919052016005, "grad_norm": 9.3125, "learning_rate": 1.9636032502172445e-05, "loss": 1.2419297695159912, "step": 202 }, { "epoch": 0.25115420129270544, "grad_norm": 5.65625, "learning_rate": 1.962646712372169e-05, "loss": 1.7364747524261475, "step": 204 }, { "epoch": 0.2536164973838104, "grad_norm": 4.28125, "learning_rate": 1.9616780680111587e-05, "loss": 1.3980765342712402, "step": 206 }, { "epoch": 0.25607879347491536, "grad_norm": 12.1875, "learning_rate": 1.9606973325108077e-05, "loss": 1.4629418849945068, "step": 208 }, { "epoch": 0.2585410895660203, "grad_norm": 5.8125, "learning_rate": 1.9597045214396472e-05, "loss": 1.361374855041504, "step": 210 }, { "epoch": 0.2610033856571253, "grad_norm": 3.90625, "learning_rate": 1.958699650557902e-05, "loss": 1.4552102088928223, "step": 212 }, { "epoch": 0.2634656817482302, "grad_norm": 3.703125, "learning_rate": 1.9576827358172377e-05, "loss": 1.4295791387557983, "step": 214 }, { "epoch": 0.2659279778393352, "grad_norm": 7.21875, "learning_rate": 1.956653793360508e-05, "loss": 1.4938560724258423, "step": 216 }, { "epoch": 0.2683902739304401, "grad_norm": 10.875, "learning_rate": 1.955612839521499e-05, "loss": 1.405943512916565, "step": 218 }, { "epoch": 0.2708525700215451, "grad_norm": 7.09375, "learning_rate": 1.95455989082467e-05, "loss": 1.8168143033981323, "step": 220 }, { "epoch": 0.27331486611265005, "grad_norm": 4.8125, "learning_rate": 1.9534949639848894e-05, "loss": 1.880413293838501, "step": 222 }, { "epoch": 0.275777162203755, "grad_norm": 6.3125, "learning_rate": 1.9524180759071724e-05, "loss": 1.4368586540222168, "step": 224 }, { "epoch": 0.27823945829485996, "grad_norm": 7.59375, "learning_rate": 1.9513292436864107e-05, "loss": 1.4332315921783447, "step": 226 }, { "epoch": 0.2807017543859649, "grad_norm": 4.375, "learning_rate": 1.9502284846071003e-05, "loss": 1.4779151678085327, "step": 228 }, { "epoch": 0.2831640504770699, "grad_norm": 8.75, "learning_rate": 1.9491158161430703e-05, "loss": 0.5792175531387329, "step": 230 }, { "epoch": 0.2856263465681748, "grad_norm": 1.3828125, "learning_rate": 1.9479912559572e-05, "loss": 1.0462322235107422, "step": 232 }, { "epoch": 0.2880886426592798, "grad_norm": 7.375, "learning_rate": 1.946854821901146e-05, "loss": 1.3507080078125, "step": 234 }, { "epoch": 0.29055093875038474, "grad_norm": 4.1875, "learning_rate": 1.945706532015052e-05, "loss": 1.4383283853530884, "step": 236 }, { "epoch": 0.29301323484148967, "grad_norm": 5.96875, "learning_rate": 1.9445464045272668e-05, "loss": 0.7620460987091064, "step": 238 }, { "epoch": 0.29547553093259465, "grad_norm": 5.03125, "learning_rate": 1.9433744578540525e-05, "loss": 1.3795279264450073, "step": 240 }, { "epoch": 0.2979378270236996, "grad_norm": 3.1875, "learning_rate": 1.942190710599293e-05, "loss": 1.4460288286209106, "step": 242 }, { "epoch": 0.3004001231148046, "grad_norm": 3.359375, "learning_rate": 1.940995181554199e-05, "loss": 1.4355218410491943, "step": 244 }, { "epoch": 0.3028624192059095, "grad_norm": 5.34375, "learning_rate": 1.93978788969701e-05, "loss": 1.339043140411377, "step": 246 }, { "epoch": 0.3053247152970145, "grad_norm": 3.03125, "learning_rate": 1.9385688541926903e-05, "loss": 1.4305763244628906, "step": 248 }, { "epoch": 0.3077870113881194, "grad_norm": 3.6875, "learning_rate": 1.9373380943926295e-05, "loss": 1.7878942489624023, "step": 250 }, { "epoch": 0.31024930747922436, "grad_norm": 5.96875, "learning_rate": 1.9360956298343313e-05, "loss": 1.680354356765747, "step": 252 }, { "epoch": 0.31271160357032934, "grad_norm": 5.90625, "learning_rate": 1.934841480241105e-05, "loss": 1.5553169250488281, "step": 254 }, { "epoch": 0.3151738996614343, "grad_norm": 3.640625, "learning_rate": 1.9335756655217513e-05, "loss": 1.4183763265609741, "step": 256 }, { "epoch": 0.31763619575253926, "grad_norm": 2.890625, "learning_rate": 1.9322982057702492e-05, "loss": 1.391609787940979, "step": 258 }, { "epoch": 0.3200984918436442, "grad_norm": 3.59375, "learning_rate": 1.931009121265433e-05, "loss": 1.4094479084014893, "step": 260 }, { "epoch": 0.3225607879347491, "grad_norm": 3.21875, "learning_rate": 1.9297084324706734e-05, "loss": 1.4225077629089355, "step": 262 }, { "epoch": 0.3250230840258541, "grad_norm": 6.40625, "learning_rate": 1.9283961600335503e-05, "loss": 1.468010663986206, "step": 264 }, { "epoch": 0.32748538011695905, "grad_norm": 5.3125, "learning_rate": 1.927072324785529e-05, "loss": 1.7119166851043701, "step": 266 }, { "epoch": 0.32994767620806403, "grad_norm": 1.609375, "learning_rate": 1.9257369477416224e-05, "loss": 1.0271199941635132, "step": 268 }, { "epoch": 0.33240997229916897, "grad_norm": 8.5, "learning_rate": 1.9243900501000666e-05, "loss": 1.992653727531433, "step": 270 }, { "epoch": 0.33487226839027395, "grad_norm": 2.46875, "learning_rate": 1.9230316532419776e-05, "loss": 1.1357910633087158, "step": 272 }, { "epoch": 0.3373345644813789, "grad_norm": 4.875, "learning_rate": 1.9216617787310126e-05, "loss": 1.4825578927993774, "step": 274 }, { "epoch": 0.3397968605724838, "grad_norm": 1.6328125, "learning_rate": 1.920280448313031e-05, "loss": 1.0347270965576172, "step": 276 }, { "epoch": 0.3422591566635888, "grad_norm": 12.625, "learning_rate": 1.918887683915746e-05, "loss": 1.3586125373840332, "step": 278 }, { "epoch": 0.34472145275469374, "grad_norm": 4.15625, "learning_rate": 1.9174835076483786e-05, "loss": 1.4484443664550781, "step": 280 }, { "epoch": 0.3471837488457987, "grad_norm": 21.0, "learning_rate": 1.916067941801305e-05, "loss": 1.623072624206543, "step": 282 }, { "epoch": 0.34964604493690365, "grad_norm": 2.25, "learning_rate": 1.914641008845704e-05, "loss": 1.2479501962661743, "step": 284 }, { "epoch": 0.35210834102800864, "grad_norm": 2.1875, "learning_rate": 1.9132027314331992e-05, "loss": 1.23157799243927, "step": 286 }, { "epoch": 0.3545706371191136, "grad_norm": 3.9375, "learning_rate": 1.9117531323955004e-05, "loss": 1.4075965881347656, "step": 288 }, { "epoch": 0.3570329332102185, "grad_norm": 4.59375, "learning_rate": 1.910292234744042e-05, "loss": 1.6323527097702026, "step": 290 }, { "epoch": 0.3594952293013235, "grad_norm": 6.15625, "learning_rate": 1.9088200616696135e-05, "loss": 1.7271039485931396, "step": 292 }, { "epoch": 0.3619575253924284, "grad_norm": 9.625, "learning_rate": 1.9073366365419974e-05, "loss": 1.7908841371536255, "step": 294 }, { "epoch": 0.3644198214835334, "grad_norm": 5.75, "learning_rate": 1.9058419829095926e-05, "loss": 1.6885616779327393, "step": 296 }, { "epoch": 0.36688211757463834, "grad_norm": 4.34375, "learning_rate": 1.9043361244990458e-05, "loss": 1.6981712579727173, "step": 298 }, { "epoch": 0.36934441366574333, "grad_norm": 9.1875, "learning_rate": 1.9028190852148695e-05, "loss": 1.8226585388183594, "step": 300 }, { "epoch": 0.37180670975684826, "grad_norm": 4.5, "learning_rate": 1.9012908891390674e-05, "loss": 1.448561191558838, "step": 302 }, { "epoch": 0.3742690058479532, "grad_norm": 1.8671875, "learning_rate": 1.8997515605307484e-05, "loss": 1.1009801626205444, "step": 304 }, { "epoch": 0.3767313019390582, "grad_norm": 3.53125, "learning_rate": 1.898201123825744e-05, "loss": 1.4048492908477783, "step": 306 }, { "epoch": 0.3791935980301631, "grad_norm": 3.0625, "learning_rate": 1.8966396036362197e-05, "loss": 1.317664384841919, "step": 308 }, { "epoch": 0.3816558941212681, "grad_norm": 7.125, "learning_rate": 1.8950670247502823e-05, "loss": 1.1580454111099243, "step": 310 }, { "epoch": 0.38411819021237303, "grad_norm": 2.71875, "learning_rate": 1.8934834121315904e-05, "loss": 1.486496925354004, "step": 312 }, { "epoch": 0.38658048630347797, "grad_norm": 2.578125, "learning_rate": 1.8918887909189537e-05, "loss": 1.1772874593734741, "step": 314 }, { "epoch": 0.38904278239458295, "grad_norm": 1.75, "learning_rate": 1.8902831864259384e-05, "loss": 1.039048671722412, "step": 316 }, { "epoch": 0.3915050784856879, "grad_norm": 2.921875, "learning_rate": 1.8886666241404614e-05, "loss": 1.3585329055786133, "step": 318 }, { "epoch": 0.3939673745767929, "grad_norm": 2.46875, "learning_rate": 1.887039129724387e-05, "loss": 1.1052215099334717, "step": 320 }, { "epoch": 0.3964296706678978, "grad_norm": 3.296875, "learning_rate": 1.8854007290131223e-05, "loss": 1.4763174057006836, "step": 322 }, { "epoch": 0.3988919667590028, "grad_norm": 6.34375, "learning_rate": 1.8837514480152016e-05, "loss": 1.395377278327942, "step": 324 }, { "epoch": 0.4013542628501077, "grad_norm": 1.90625, "learning_rate": 1.882091312911879e-05, "loss": 1.043440580368042, "step": 326 }, { "epoch": 0.40381655894121266, "grad_norm": 4.5625, "learning_rate": 1.880420350056709e-05, "loss": 1.8225022554397583, "step": 328 }, { "epoch": 0.40627885503231764, "grad_norm": 1.3828125, "learning_rate": 1.87873858597513e-05, "loss": 1.035279393196106, "step": 330 }, { "epoch": 0.4087411511234226, "grad_norm": 4.46875, "learning_rate": 1.877046047364044e-05, "loss": 1.4025003910064697, "step": 332 }, { "epoch": 0.41120344721452756, "grad_norm": 5.3125, "learning_rate": 1.875342761091389e-05, "loss": 1.5152015686035156, "step": 334 }, { "epoch": 0.4136657433056325, "grad_norm": 4.4375, "learning_rate": 1.8736287541957172e-05, "loss": 1.0334498882293701, "step": 336 }, { "epoch": 0.4161280393967375, "grad_norm": 9.25, "learning_rate": 1.8719040538857625e-05, "loss": 1.5699793100357056, "step": 338 }, { "epoch": 0.4185903354878424, "grad_norm": 1.1875, "learning_rate": 1.8701686875400104e-05, "loss": 1.0974748134613037, "step": 340 }, { "epoch": 0.42105263157894735, "grad_norm": 4.25, "learning_rate": 1.8684226827062632e-05, "loss": 1.4441235065460205, "step": 342 }, { "epoch": 0.42351492767005233, "grad_norm": 7.5, "learning_rate": 1.8666660671012002e-05, "loss": 1.0178951025009155, "step": 344 }, { "epoch": 0.42597722376115726, "grad_norm": 6.96875, "learning_rate": 1.8648988686099416e-05, "loss": 1.7429275512695312, "step": 346 }, { "epoch": 0.42843951985226225, "grad_norm": 5.5625, "learning_rate": 1.863121115285604e-05, "loss": 1.3890095949172974, "step": 348 }, { "epoch": 0.4309018159433672, "grad_norm": 107.0, "learning_rate": 1.8613328353488533e-05, "loss": 1.671781301498413, "step": 350 }, { "epoch": 0.43336411203447217, "grad_norm": 6.4375, "learning_rate": 1.8595340571874607e-05, "loss": 0.9639192223548889, "step": 352 }, { "epoch": 0.4358264081255771, "grad_norm": 4.34375, "learning_rate": 1.8577248093558486e-05, "loss": 1.3523774147033691, "step": 354 }, { "epoch": 0.43828870421668203, "grad_norm": 6.53125, "learning_rate": 1.855905120574638e-05, "loss": 1.4467836618423462, "step": 356 }, { "epoch": 0.440751000307787, "grad_norm": 6.125, "learning_rate": 1.854075019730194e-05, "loss": 1.521872878074646, "step": 358 }, { "epoch": 0.44321329639889195, "grad_norm": 3.890625, "learning_rate": 1.8522345358741662e-05, "loss": 0.7035669088363647, "step": 360 }, { "epoch": 0.44567559248999694, "grad_norm": 6.1875, "learning_rate": 1.8503836982230284e-05, "loss": 1.9208122491836548, "step": 362 }, { "epoch": 0.4481378885811019, "grad_norm": 3.328125, "learning_rate": 1.848522536157612e-05, "loss": 1.4902818202972412, "step": 364 }, { "epoch": 0.45060018467220686, "grad_norm": 5.625, "learning_rate": 1.8466510792226447e-05, "loss": 1.7599055767059326, "step": 366 }, { "epoch": 0.4530624807633118, "grad_norm": 9.875, "learning_rate": 1.8447693571262757e-05, "loss": 1.6332001686096191, "step": 368 }, { "epoch": 0.4555247768544167, "grad_norm": 2.953125, "learning_rate": 1.842877399739608e-05, "loss": 1.3132367134094238, "step": 370 }, { "epoch": 0.4579870729455217, "grad_norm": 3.09375, "learning_rate": 1.840975237096224e-05, "loss": 1.3803317546844482, "step": 372 }, { "epoch": 0.46044936903662664, "grad_norm": 4.15625, "learning_rate": 1.8390628993917062e-05, "loss": 1.3456385135650635, "step": 374 }, { "epoch": 0.46291166512773163, "grad_norm": 7.3125, "learning_rate": 1.8371404169831613e-05, "loss": 0.39371660351753235, "step": 376 }, { "epoch": 0.46537396121883656, "grad_norm": 3.453125, "learning_rate": 1.8352078203887346e-05, "loss": 1.3137223720550537, "step": 378 }, { "epoch": 0.4678362573099415, "grad_norm": 6.5, "learning_rate": 1.8332651402871286e-05, "loss": 0.324982613325119, "step": 380 }, { "epoch": 0.4702985534010465, "grad_norm": 7.8125, "learning_rate": 1.8313124075171153e-05, "loss": 1.7339143753051758, "step": 382 }, { "epoch": 0.4727608494921514, "grad_norm": 3.546875, "learning_rate": 1.8293496530770448e-05, "loss": 1.3264766931533813, "step": 384 }, { "epoch": 0.4752231455832564, "grad_norm": 5.28125, "learning_rate": 1.827376908124356e-05, "loss": 1.732757568359375, "step": 386 }, { "epoch": 0.47768544167436133, "grad_norm": 17.625, "learning_rate": 1.8253942039750795e-05, "loss": 1.7728583812713623, "step": 388 }, { "epoch": 0.4801477377654663, "grad_norm": 2.046875, "learning_rate": 1.8234015721033428e-05, "loss": 1.1088775396347046, "step": 390 }, { "epoch": 0.48261003385657125, "grad_norm": 7.75, "learning_rate": 1.8213990441408687e-05, "loss": 1.7161972522735596, "step": 392 }, { "epoch": 0.4850723299476762, "grad_norm": 2.71875, "learning_rate": 1.819386651876474e-05, "loss": 1.3242639303207397, "step": 394 }, { "epoch": 0.48753462603878117, "grad_norm": 3.4375, "learning_rate": 1.8173644272555645e-05, "loss": 1.387306571006775, "step": 396 }, { "epoch": 0.4899969221298861, "grad_norm": 11.875, "learning_rate": 1.815332402379629e-05, "loss": 0.28826314210891724, "step": 398 }, { "epoch": 0.4924592182209911, "grad_norm": 2.953125, "learning_rate": 1.8132906095057287e-05, "loss": 1.4168837070465088, "step": 400 }, { "epoch": 0.494921514312096, "grad_norm": 7.15625, "learning_rate": 1.8112390810459842e-05, "loss": 1.8249226808547974, "step": 402 }, { "epoch": 0.497383810403201, "grad_norm": 3.328125, "learning_rate": 1.8091778495670645e-05, "loss": 1.3672676086425781, "step": 404 }, { "epoch": 0.49984610649430594, "grad_norm": 1.8359375, "learning_rate": 1.8071069477896655e-05, "loss": 1.1166040897369385, "step": 406 }, { "epoch": 0.5023084025854109, "grad_norm": 3.609375, "learning_rate": 1.805026408587994e-05, "loss": 1.401571273803711, "step": 408 }, { "epoch": 0.5047706986765158, "grad_norm": 1.03125, "learning_rate": 1.8029362649892436e-05, "loss": 1.0254689455032349, "step": 410 }, { "epoch": 0.5072329947676208, "grad_norm": 8.3125, "learning_rate": 1.8008365501730716e-05, "loss": 1.4256839752197266, "step": 412 }, { "epoch": 0.5096952908587258, "grad_norm": 2.234375, "learning_rate": 1.7987272974710733e-05, "loss": 1.2576653957366943, "step": 414 }, { "epoch": 0.5121575869498307, "grad_norm": 6.65625, "learning_rate": 1.7966085403662502e-05, "loss": 1.847425937652588, "step": 416 }, { "epoch": 0.5146198830409356, "grad_norm": 1.9609375, "learning_rate": 1.79448031249248e-05, "loss": 1.2791142463684082, "step": 418 }, { "epoch": 0.5170821791320406, "grad_norm": 3.453125, "learning_rate": 1.7923426476339843e-05, "loss": 1.4304306507110596, "step": 420 }, { "epoch": 0.5195444752231456, "grad_norm": 2.765625, "learning_rate": 1.7901955797247894e-05, "loss": 1.354073405265808, "step": 422 }, { "epoch": 0.5220067713142506, "grad_norm": 5.46875, "learning_rate": 1.7880391428481877e-05, "loss": 1.1258585453033447, "step": 424 }, { "epoch": 0.5244690674053555, "grad_norm": 7.96875, "learning_rate": 1.7858733712362006e-05, "loss": 1.2407653331756592, "step": 426 }, { "epoch": 0.5269313634964604, "grad_norm": 3.953125, "learning_rate": 1.7836982992690298e-05, "loss": 1.3420263528823853, "step": 428 }, { "epoch": 0.5293936595875655, "grad_norm": 1.6640625, "learning_rate": 1.781513961474515e-05, "loss": 1.070509672164917, "step": 430 }, { "epoch": 0.5318559556786704, "grad_norm": 5.34375, "learning_rate": 1.7793203925275857e-05, "loss": 1.4249287843704224, "step": 432 }, { "epoch": 0.5343182517697753, "grad_norm": 3.453125, "learning_rate": 1.777117627249708e-05, "loss": 1.3717284202575684, "step": 434 }, { "epoch": 0.5367805478608803, "grad_norm": 3.28125, "learning_rate": 1.774905700608335e-05, "loss": 1.177480697631836, "step": 436 }, { "epoch": 0.5392428439519852, "grad_norm": 1.6015625, "learning_rate": 1.7726846477163506e-05, "loss": 1.0270402431488037, "step": 438 }, { "epoch": 0.5417051400430902, "grad_norm": 3.859375, "learning_rate": 1.7704545038315108e-05, "loss": 1.0033745765686035, "step": 440 }, { "epoch": 0.5441674361341952, "grad_norm": 51.25, "learning_rate": 1.7682153043558865e-05, "loss": 1.7934285402297974, "step": 442 }, { "epoch": 0.5466297322253001, "grad_norm": 4.6875, "learning_rate": 1.765967084835299e-05, "loss": 1.5169916152954102, "step": 444 }, { "epoch": 0.549092028316405, "grad_norm": 2.15625, "learning_rate": 1.763709880958758e-05, "loss": 1.102067470550537, "step": 446 }, { "epoch": 0.55155432440751, "grad_norm": 5.9375, "learning_rate": 1.7614437285578927e-05, "loss": 1.742466926574707, "step": 448 }, { "epoch": 0.554016620498615, "grad_norm": 2.484375, "learning_rate": 1.7591686636063855e-05, "loss": 0.9622822403907776, "step": 450 }, { "epoch": 0.5564789165897199, "grad_norm": 3.671875, "learning_rate": 1.756884722219398e-05, "loss": 1.3980923891067505, "step": 452 }, { "epoch": 0.5589412126808249, "grad_norm": 3.4375, "learning_rate": 1.754591940653002e-05, "loss": 1.2967207431793213, "step": 454 }, { "epoch": 0.5614035087719298, "grad_norm": 2.0625, "learning_rate": 1.7522903553035983e-05, "loss": 1.026415228843689, "step": 456 }, { "epoch": 0.5638658048630347, "grad_norm": 8.4375, "learning_rate": 1.749980002707344e-05, "loss": 1.6526079177856445, "step": 458 }, { "epoch": 0.5663281009541398, "grad_norm": 1.453125, "learning_rate": 1.747660919539571e-05, "loss": 1.0682464838027954, "step": 460 }, { "epoch": 0.5687903970452447, "grad_norm": 1.4296875, "learning_rate": 1.745333142614201e-05, "loss": 1.2323286533355713, "step": 462 }, { "epoch": 0.5712526931363496, "grad_norm": 8.6875, "learning_rate": 1.742996708883165e-05, "loss": 1.657741665840149, "step": 464 }, { "epoch": 0.5737149892274546, "grad_norm": 5.6875, "learning_rate": 1.740651655435815e-05, "loss": 1.5120787620544434, "step": 466 }, { "epoch": 0.5761772853185596, "grad_norm": 3.421875, "learning_rate": 1.7382980194983354e-05, "loss": 1.3939659595489502, "step": 468 }, { "epoch": 0.5786395814096645, "grad_norm": 5.75, "learning_rate": 1.735935838433151e-05, "loss": 1.6433215141296387, "step": 470 }, { "epoch": 0.5811018775007695, "grad_norm": 2.21875, "learning_rate": 1.7335651497383357e-05, "loss": 1.078176498413086, "step": 472 }, { "epoch": 0.5835641735918744, "grad_norm": 3.03125, "learning_rate": 1.731185991047017e-05, "loss": 1.3398302793502808, "step": 474 }, { "epoch": 0.5860264696829793, "grad_norm": 3.015625, "learning_rate": 1.7287984001267765e-05, "loss": 1.344508171081543, "step": 476 }, { "epoch": 0.5884887657740844, "grad_norm": 3.921875, "learning_rate": 1.7264024148790538e-05, "loss": 1.453425407409668, "step": 478 }, { "epoch": 0.5909510618651893, "grad_norm": 1.234375, "learning_rate": 1.7239980733385408e-05, "loss": 0.9735173583030701, "step": 480 }, { "epoch": 0.5934133579562942, "grad_norm": 3.03125, "learning_rate": 1.721585413672582e-05, "loss": 1.3980371952056885, "step": 482 }, { "epoch": 0.5958756540473992, "grad_norm": 3.6875, "learning_rate": 1.7191644741805648e-05, "loss": 1.3482059240341187, "step": 484 }, { "epoch": 0.5983379501385041, "grad_norm": 3.203125, "learning_rate": 1.716735293293316e-05, "loss": 1.404923439025879, "step": 486 }, { "epoch": 0.6008002462296091, "grad_norm": 5.59375, "learning_rate": 1.7142979095724865e-05, "loss": 1.5890945196151733, "step": 488 }, { "epoch": 0.6032625423207141, "grad_norm": 10.25, "learning_rate": 1.7118523617099435e-05, "loss": 1.7281887531280518, "step": 490 }, { "epoch": 0.605724838411819, "grad_norm": 3.3125, "learning_rate": 1.7093986885271532e-05, "loss": 1.4024686813354492, "step": 492 }, { "epoch": 0.6081871345029239, "grad_norm": 1.171875, "learning_rate": 1.7069369289745673e-05, "loss": 1.1231578588485718, "step": 494 }, { "epoch": 0.610649430594029, "grad_norm": 5.40625, "learning_rate": 1.704467122131003e-05, "loss": 1.6918822526931763, "step": 496 }, { "epoch": 0.6131117266851339, "grad_norm": 5.6875, "learning_rate": 1.7019893072030222e-05, "loss": 1.7565666437149048, "step": 498 }, { "epoch": 0.6155740227762388, "grad_norm": 6.0, "learning_rate": 1.6995035235243098e-05, "loss": 1.582336187362671, "step": 500 }, { "epoch": 0.6180363188673438, "grad_norm": 2.515625, "learning_rate": 1.6970098105550514e-05, "loss": 1.2266004085540771, "step": 502 }, { "epoch": 0.6204986149584487, "grad_norm": 3.140625, "learning_rate": 1.694508207881302e-05, "loss": 1.3281134366989136, "step": 504 }, { "epoch": 0.6229609110495538, "grad_norm": 2.6875, "learning_rate": 1.691998755214363e-05, "loss": 1.2356681823730469, "step": 506 }, { "epoch": 0.6254232071406587, "grad_norm": 3.40625, "learning_rate": 1.689481492390148e-05, "loss": 1.0685112476348877, "step": 508 }, { "epoch": 0.6278855032317636, "grad_norm": 3.203125, "learning_rate": 1.686956459368551e-05, "loss": 1.0986112356185913, "step": 510 }, { "epoch": 0.6303477993228686, "grad_norm": 1.8671875, "learning_rate": 1.6844236962328154e-05, "loss": 1.1448196172714233, "step": 512 }, { "epoch": 0.6328100954139735, "grad_norm": 4.0625, "learning_rate": 1.681883243188892e-05, "loss": 1.5838472843170166, "step": 514 }, { "epoch": 0.6352723915050785, "grad_norm": 5.0, "learning_rate": 1.6793351405648053e-05, "loss": 1.0939499139785767, "step": 516 }, { "epoch": 0.6377346875961835, "grad_norm": 2.078125, "learning_rate": 1.6767794288100123e-05, "loss": 0.9746682643890381, "step": 518 }, { "epoch": 0.6401969836872884, "grad_norm": 7.53125, "learning_rate": 1.6742161484947596e-05, "loss": 0.9929898977279663, "step": 520 }, { "epoch": 0.6426592797783933, "grad_norm": 7.28125, "learning_rate": 1.6716453403094394e-05, "loss": 1.6372830867767334, "step": 522 }, { "epoch": 0.6451215758694983, "grad_norm": 18.875, "learning_rate": 1.6690670450639435e-05, "loss": 0.2726695239543915, "step": 524 }, { "epoch": 0.6475838719606033, "grad_norm": 4.1875, "learning_rate": 1.6664813036870174e-05, "loss": 1.3791524171829224, "step": 526 }, { "epoch": 0.6500461680517082, "grad_norm": 20.5, "learning_rate": 1.6638881572256078e-05, "loss": 1.7047182321548462, "step": 528 }, { "epoch": 0.6525084641428132, "grad_norm": 5.5, "learning_rate": 1.6612876468442118e-05, "loss": 1.8910508155822754, "step": 530 }, { "epoch": 0.6549707602339181, "grad_norm": 6.0, "learning_rate": 1.6586798138242258e-05, "loss": 1.3536272048950195, "step": 532 }, { "epoch": 0.6574330563250231, "grad_norm": 5.9375, "learning_rate": 1.6560646995632865e-05, "loss": 1.404782772064209, "step": 534 }, { "epoch": 0.6598953524161281, "grad_norm": 3.296875, "learning_rate": 1.6534423455746157e-05, "loss": 1.3882639408111572, "step": 536 }, { "epoch": 0.662357648507233, "grad_norm": 4.65625, "learning_rate": 1.6508127934863633e-05, "loss": 1.3433642387390137, "step": 538 }, { "epoch": 0.6648199445983379, "grad_norm": 7.5625, "learning_rate": 1.6481760850409406e-05, "loss": 1.7808656692504883, "step": 540 }, { "epoch": 0.6672822406894429, "grad_norm": 5.1875, "learning_rate": 1.645532262094364e-05, "loss": 1.405790090560913, "step": 542 }, { "epoch": 0.6697445367805479, "grad_norm": 8.0625, "learning_rate": 1.6428813666155878e-05, "loss": 1.3506624698638916, "step": 544 }, { "epoch": 0.6722068328716528, "grad_norm": 4.125, "learning_rate": 1.6402234406858375e-05, "loss": 1.3872720003128052, "step": 546 }, { "epoch": 0.6746691289627578, "grad_norm": 2.234375, "learning_rate": 1.6375585264979423e-05, "loss": 1.1865075826644897, "step": 548 }, { "epoch": 0.6771314250538627, "grad_norm": 4.625, "learning_rate": 1.6348866663556645e-05, "loss": 1.4540220499038696, "step": 550 }, { "epoch": 0.6795937211449676, "grad_norm": 8.3125, "learning_rate": 1.6322079026730317e-05, "loss": 1.0791795253753662, "step": 552 }, { "epoch": 0.6820560172360727, "grad_norm": 2.359375, "learning_rate": 1.6295222779736586e-05, "loss": 1.1618213653564453, "step": 554 }, { "epoch": 0.6845183133271776, "grad_norm": 5.78125, "learning_rate": 1.626829834890074e-05, "loss": 1.6633763313293457, "step": 556 }, { "epoch": 0.6869806094182825, "grad_norm": 8.375, "learning_rate": 1.624130616163044e-05, "loss": 1.7596007585525513, "step": 558 }, { "epoch": 0.6894429055093875, "grad_norm": 1.5625, "learning_rate": 1.6214246646408946e-05, "loss": 1.0530022382736206, "step": 560 }, { "epoch": 0.6919052016004925, "grad_norm": 3.375, "learning_rate": 1.61871202327883e-05, "loss": 1.3792953491210938, "step": 562 }, { "epoch": 0.6943674976915974, "grad_norm": 3.640625, "learning_rate": 1.6159927351382512e-05, "loss": 1.3962174654006958, "step": 564 }, { "epoch": 0.6968297937827024, "grad_norm": 3.59375, "learning_rate": 1.6132668433860723e-05, "loss": 1.3606011867523193, "step": 566 }, { "epoch": 0.6992920898738073, "grad_norm": 3.859375, "learning_rate": 1.6105343912940355e-05, "loss": 1.3807508945465088, "step": 568 }, { "epoch": 0.7017543859649122, "grad_norm": 2.78125, "learning_rate": 1.6077954222380235e-05, "loss": 1.3539392948150635, "step": 570 }, { "epoch": 0.7042166820560173, "grad_norm": 5.125, "learning_rate": 1.6050499796973733e-05, "loss": 1.3989124298095703, "step": 572 }, { "epoch": 0.7066789781471222, "grad_norm": 3.875, "learning_rate": 1.6022981072541823e-05, "loss": 1.3723649978637695, "step": 574 }, { "epoch": 0.7091412742382271, "grad_norm": 7.65625, "learning_rate": 1.599539848592619e-05, "loss": 1.3160829544067383, "step": 576 }, { "epoch": 0.7116035703293321, "grad_norm": 3.171875, "learning_rate": 1.5967752474982296e-05, "loss": 1.1242200136184692, "step": 578 }, { "epoch": 0.714065866420437, "grad_norm": 2.4375, "learning_rate": 1.5940043478572413e-05, "loss": 1.0219156742095947, "step": 580 }, { "epoch": 0.716528162511542, "grad_norm": 1.7578125, "learning_rate": 1.591227193655867e-05, "loss": 0.9959127902984619, "step": 582 }, { "epoch": 0.718990458602647, "grad_norm": 5.15625, "learning_rate": 1.5884438289796067e-05, "loss": 1.828487753868103, "step": 584 }, { "epoch": 0.7214527546937519, "grad_norm": 3.828125, "learning_rate": 1.5856542980125477e-05, "loss": 1.4034947156906128, "step": 586 }, { "epoch": 0.7239150507848569, "grad_norm": 13.75, "learning_rate": 1.5828586450366626e-05, "loss": 1.3598823547363281, "step": 588 }, { "epoch": 0.7263773468759618, "grad_norm": 3.453125, "learning_rate": 1.5800569144311078e-05, "loss": 1.3686673641204834, "step": 590 }, { "epoch": 0.7288396429670668, "grad_norm": 4.21875, "learning_rate": 1.5772491506715174e-05, "loss": 1.3769757747650146, "step": 592 }, { "epoch": 0.7313019390581718, "grad_norm": 9.5625, "learning_rate": 1.5744353983292975e-05, "loss": 0.6412605047225952, "step": 594 }, { "epoch": 0.7337642351492767, "grad_norm": 3.921875, "learning_rate": 1.5716157020709196e-05, "loss": 1.3520535230636597, "step": 596 }, { "epoch": 0.7362265312403816, "grad_norm": 3.03125, "learning_rate": 1.5687901066572116e-05, "loss": 1.0531518459320068, "step": 598 }, { "epoch": 0.7386888273314867, "grad_norm": 3.375, "learning_rate": 1.565958656942645e-05, "loss": 1.3364739418029785, "step": 600 }, { "epoch": 0.7411511234225916, "grad_norm": 1.4140625, "learning_rate": 1.563121397874626e-05, "loss": 1.0583405494689941, "step": 602 }, { "epoch": 0.7436134195136965, "grad_norm": 6.875, "learning_rate": 1.5602783744927794e-05, "loss": 1.699558138847351, "step": 604 }, { "epoch": 0.7460757156048015, "grad_norm": 5.5625, "learning_rate": 1.557429631928235e-05, "loss": 1.6496608257293701, "step": 606 }, { "epoch": 0.7485380116959064, "grad_norm": 4.6875, "learning_rate": 1.5545752154029118e-05, "loss": 1.3926259279251099, "step": 608 }, { "epoch": 0.7510003077870114, "grad_norm": 1.796875, "learning_rate": 1.5517151702287977e-05, "loss": 1.0908641815185547, "step": 610 }, { "epoch": 0.7534626038781164, "grad_norm": 2.28125, "learning_rate": 1.548849541807233e-05, "loss": 1.1665232181549072, "step": 612 }, { "epoch": 0.7559248999692213, "grad_norm": 4.375, "learning_rate": 1.5459783756281872e-05, "loss": 1.5498981475830078, "step": 614 }, { "epoch": 0.7583871960603262, "grad_norm": 3.796875, "learning_rate": 1.543101717269539e-05, "loss": 1.3930026292800903, "step": 616 }, { "epoch": 0.7608494921514312, "grad_norm": 18.75, "learning_rate": 1.5402196123963514e-05, "loss": 1.3921393156051636, "step": 618 }, { "epoch": 0.7633117882425362, "grad_norm": 4.4375, "learning_rate": 1.537332106760147e-05, "loss": 1.5707228183746338, "step": 620 }, { "epoch": 0.7657740843336411, "grad_norm": 5.125, "learning_rate": 1.5344392461981835e-05, "loss": 1.1423331499099731, "step": 622 }, { "epoch": 0.7682363804247461, "grad_norm": 3.328125, "learning_rate": 1.5315410766327224e-05, "loss": 1.3473308086395264, "step": 624 }, { "epoch": 0.770698676515851, "grad_norm": 6.65625, "learning_rate": 1.5286376440703034e-05, "loss": 1.5814166069030762, "step": 626 }, { "epoch": 0.7731609726069559, "grad_norm": 6.625, "learning_rate": 1.5257289946010123e-05, "loss": 1.664976954460144, "step": 628 }, { "epoch": 0.775623268698061, "grad_norm": 3.984375, "learning_rate": 1.5228151743977502e-05, "loss": 1.0675089359283447, "step": 630 }, { "epoch": 0.7780855647891659, "grad_norm": 6.34375, "learning_rate": 1.5198962297155002e-05, "loss": 1.5473486185073853, "step": 632 }, { "epoch": 0.7805478608802708, "grad_norm": 3.296875, "learning_rate": 1.5169722068905927e-05, "loss": 1.4237251281738281, "step": 634 }, { "epoch": 0.7830101569713758, "grad_norm": 1.5859375, "learning_rate": 1.514043152339971e-05, "loss": 1.1319770812988281, "step": 636 }, { "epoch": 0.7854724530624808, "grad_norm": 5.90625, "learning_rate": 1.5111091125604538e-05, "loss": 1.7654370069503784, "step": 638 }, { "epoch": 0.7879347491535857, "grad_norm": 4.46875, "learning_rate": 1.5081701341279957e-05, "loss": 1.407934546470642, "step": 640 }, { "epoch": 0.7903970452446907, "grad_norm": 4.09375, "learning_rate": 1.5052262636969506e-05, "loss": 1.3491337299346924, "step": 642 }, { "epoch": 0.7928593413357956, "grad_norm": 1.796875, "learning_rate": 1.502277547999329e-05, "loss": 1.125083565711975, "step": 644 }, { "epoch": 0.7953216374269005, "grad_norm": 6.0, "learning_rate": 1.4993240338440571e-05, "loss": 1.3817883729934692, "step": 646 }, { "epoch": 0.7977839335180056, "grad_norm": 3.46875, "learning_rate": 1.4963657681162328e-05, "loss": 1.3694324493408203, "step": 648 }, { "epoch": 0.8002462296091105, "grad_norm": 7.71875, "learning_rate": 1.4934027977763838e-05, "loss": 1.451867699623108, "step": 650 }, { "epoch": 0.8027085257002154, "grad_norm": 3.59375, "learning_rate": 1.4904351698597181e-05, "loss": 1.386351466178894, "step": 652 }, { "epoch": 0.8051708217913204, "grad_norm": 1.6796875, "learning_rate": 1.4874629314753812e-05, "loss": 1.0673191547393799, "step": 654 }, { "epoch": 0.8076331178824253, "grad_norm": 4.28125, "learning_rate": 1.4844861298057068e-05, "loss": 1.4586551189422607, "step": 656 }, { "epoch": 0.8100954139735304, "grad_norm": 1.6015625, "learning_rate": 1.4815048121054667e-05, "loss": 1.042107105255127, "step": 658 }, { "epoch": 0.8125577100646353, "grad_norm": 4.1875, "learning_rate": 1.4785190257011231e-05, "loss": 1.6682562828063965, "step": 660 }, { "epoch": 0.8150200061557402, "grad_norm": 7.21875, "learning_rate": 1.4755288179900741e-05, "loss": 1.720628261566162, "step": 662 }, { "epoch": 0.8174823022468451, "grad_norm": 7.15625, "learning_rate": 1.4725342364399055e-05, "loss": 1.3896342515945435, "step": 664 }, { "epoch": 0.8199445983379502, "grad_norm": 4.40625, "learning_rate": 1.4695353285876328e-05, "loss": 1.3969242572784424, "step": 666 }, { "epoch": 0.8224068944290551, "grad_norm": 4.4375, "learning_rate": 1.46653214203895e-05, "loss": 1.686731219291687, "step": 668 }, { "epoch": 0.8248691905201601, "grad_norm": 6.28125, "learning_rate": 1.463524724467472e-05, "loss": 1.7890194654464722, "step": 670 }, { "epoch": 0.827331486611265, "grad_norm": 3.0625, "learning_rate": 1.4605131236139789e-05, "loss": 1.3969485759735107, "step": 672 }, { "epoch": 0.8297937827023699, "grad_norm": 1.9765625, "learning_rate": 1.4574973872856566e-05, "loss": 1.009456992149353, "step": 674 }, { "epoch": 0.832256078793475, "grad_norm": 5.75, "learning_rate": 1.4544775633553409e-05, "loss": 1.0795286893844604, "step": 676 }, { "epoch": 0.8347183748845799, "grad_norm": 3.765625, "learning_rate": 1.4514536997607533e-05, "loss": 1.291078805923462, "step": 678 }, { "epoch": 0.8371806709756848, "grad_norm": 3.640625, "learning_rate": 1.4484258445037437e-05, "loss": 1.2912898063659668, "step": 680 }, { "epoch": 0.8396429670667898, "grad_norm": 5.3125, "learning_rate": 1.4453940456495268e-05, "loss": 1.5154544115066528, "step": 682 }, { "epoch": 0.8421052631578947, "grad_norm": 11.6875, "learning_rate": 1.4423583513259196e-05, "loss": 1.7723913192749023, "step": 684 }, { "epoch": 0.8445675592489997, "grad_norm": 3.890625, "learning_rate": 1.4393188097225764e-05, "loss": 1.4048473834991455, "step": 686 }, { "epoch": 0.8470298553401047, "grad_norm": 4.65625, "learning_rate": 1.4362754690902242e-05, "loss": 1.736893653869629, "step": 688 }, { "epoch": 0.8494921514312096, "grad_norm": 2.625, "learning_rate": 1.4332283777398992e-05, "loss": 1.4180538654327393, "step": 690 }, { "epoch": 0.8519544475223145, "grad_norm": 4.875, "learning_rate": 1.4301775840421756e-05, "loss": 1.700308084487915, "step": 692 }, { "epoch": 0.8544167436134195, "grad_norm": 2.859375, "learning_rate": 1.4271231364264008e-05, "loss": 1.2139472961425781, "step": 694 }, { "epoch": 0.8568790397045245, "grad_norm": 8.25, "learning_rate": 1.424065083379926e-05, "loss": 1.690704584121704, "step": 696 }, { "epoch": 0.8593413357956294, "grad_norm": 5.625, "learning_rate": 1.421003473447335e-05, "loss": 1.757250189781189, "step": 698 }, { "epoch": 0.8618036318867344, "grad_norm": 9.25, "learning_rate": 1.4179383552296768e-05, "loss": 1.7566320896148682, "step": 700 }, { "epoch": 0.8642659279778393, "grad_norm": 6.0625, "learning_rate": 1.4148697773836908e-05, "loss": 1.804456353187561, "step": 702 }, { "epoch": 0.8667282240689443, "grad_norm": 3.609375, "learning_rate": 1.4117977886210352e-05, "loss": 1.6510390043258667, "step": 704 }, { "epoch": 0.8691905201600493, "grad_norm": 5.40625, "learning_rate": 1.4087224377075162e-05, "loss": 1.194544792175293, "step": 706 }, { "epoch": 0.8716528162511542, "grad_norm": 5.75, "learning_rate": 1.4056437734623103e-05, "loss": 1.3318874835968018, "step": 708 }, { "epoch": 0.8741151123422591, "grad_norm": 4.78125, "learning_rate": 1.4025618447571914e-05, "loss": 1.4258933067321777, "step": 710 }, { "epoch": 0.8765774084333641, "grad_norm": 1.8671875, "learning_rate": 1.3994767005157543e-05, "loss": 1.1039819717407227, "step": 712 }, { "epoch": 0.8790397045244691, "grad_norm": 3.71875, "learning_rate": 1.3963883897126395e-05, "loss": 1.3149468898773193, "step": 714 }, { "epoch": 0.881502000615574, "grad_norm": 7.15625, "learning_rate": 1.393296961372753e-05, "loss": 1.3563876152038574, "step": 716 }, { "epoch": 0.883964296706679, "grad_norm": 2.578125, "learning_rate": 1.390202464570491e-05, "loss": 1.226351022720337, "step": 718 }, { "epoch": 0.8864265927977839, "grad_norm": 4.40625, "learning_rate": 1.3871049484289586e-05, "loss": 1.6103639602661133, "step": 720 }, { "epoch": 0.8888888888888888, "grad_norm": 3.671875, "learning_rate": 1.3840044621191907e-05, "loss": 1.40117347240448, "step": 722 }, { "epoch": 0.8913511849799939, "grad_norm": 3.3125, "learning_rate": 1.380901054859373e-05, "loss": 1.0493632555007935, "step": 724 }, { "epoch": 0.8938134810710988, "grad_norm": 4.78125, "learning_rate": 1.3777947759140581e-05, "loss": 1.497347354888916, "step": 726 }, { "epoch": 0.8962757771622037, "grad_norm": 1.546875, "learning_rate": 1.3746856745933861e-05, "loss": 1.1111018657684326, "step": 728 }, { "epoch": 0.8987380732533087, "grad_norm": 1.7734375, "learning_rate": 1.3715738002522983e-05, "loss": 1.1223242282867432, "step": 730 }, { "epoch": 0.9012003693444137, "grad_norm": 7.78125, "learning_rate": 1.3684592022897577e-05, "loss": 1.526750087738037, "step": 732 }, { "epoch": 0.9036626654355187, "grad_norm": 1.3203125, "learning_rate": 1.3653419301479625e-05, "loss": 1.1531429290771484, "step": 734 }, { "epoch": 0.9061249615266236, "grad_norm": 6.84375, "learning_rate": 1.3622220333115618e-05, "loss": 1.627996563911438, "step": 736 }, { "epoch": 0.9085872576177285, "grad_norm": 5.0, "learning_rate": 1.3590995613068695e-05, "loss": 1.3804816007614136, "step": 738 }, { "epoch": 0.9110495537088334, "grad_norm": 3.890625, "learning_rate": 1.3559745637010796e-05, "loss": 1.3431119918823242, "step": 740 }, { "epoch": 0.9135118497999385, "grad_norm": 7.21875, "learning_rate": 1.3528470901014768e-05, "loss": 1.7569446563720703, "step": 742 }, { "epoch": 0.9159741458910434, "grad_norm": 3.453125, "learning_rate": 1.3497171901546527e-05, "loss": 1.4046237468719482, "step": 744 }, { "epoch": 0.9184364419821484, "grad_norm": 2.65625, "learning_rate": 1.3465849135457133e-05, "loss": 1.1801738739013672, "step": 746 }, { "epoch": 0.9208987380732533, "grad_norm": 3.625, "learning_rate": 1.3434503099974943e-05, "loss": 1.414689540863037, "step": 748 }, { "epoch": 0.9233610341643582, "grad_norm": 3.40625, "learning_rate": 1.3403134292697688e-05, "loss": 1.3589739799499512, "step": 750 }, { "epoch": 0.9258233302554633, "grad_norm": 3.390625, "learning_rate": 1.3371743211584602e-05, "loss": 1.2147026062011719, "step": 752 }, { "epoch": 0.9282856263465682, "grad_norm": 14.0, "learning_rate": 1.3340330354948483e-05, "loss": 0.6764575242996216, "step": 754 }, { "epoch": 0.9307479224376731, "grad_norm": 2.515625, "learning_rate": 1.330889622144781e-05, "loss": 1.1622259616851807, "step": 756 }, { "epoch": 0.9332102185287781, "grad_norm": 3.53125, "learning_rate": 1.3277441310078824e-05, "loss": 1.3609400987625122, "step": 758 }, { "epoch": 0.935672514619883, "grad_norm": 10.8125, "learning_rate": 1.3245966120167592e-05, "loss": 1.149078130722046, "step": 760 }, { "epoch": 0.938134810710988, "grad_norm": 2.1875, "learning_rate": 1.3214471151362092e-05, "loss": 1.119340419769287, "step": 762 }, { "epoch": 0.940597106802093, "grad_norm": 3.546875, "learning_rate": 1.3182956903624278e-05, "loss": 1.0370396375656128, "step": 764 }, { "epoch": 0.9430594028931979, "grad_norm": 2.453125, "learning_rate": 1.3151423877222147e-05, "loss": 1.1257320642471313, "step": 766 }, { "epoch": 0.9455216989843028, "grad_norm": 3.34375, "learning_rate": 1.3119872572721794e-05, "loss": 1.3441581726074219, "step": 768 }, { "epoch": 0.9479839950754079, "grad_norm": 5.75, "learning_rate": 1.3088303490979471e-05, "loss": 1.3604907989501953, "step": 770 }, { "epoch": 0.9504462911665128, "grad_norm": 4.6875, "learning_rate": 1.3056717133133621e-05, "loss": 1.6805719137191772, "step": 772 }, { "epoch": 0.9529085872576177, "grad_norm": 5.625, "learning_rate": 1.3025114000596943e-05, "loss": 1.780057430267334, "step": 774 }, { "epoch": 0.9553708833487227, "grad_norm": 3.28125, "learning_rate": 1.2993494595048422e-05, "loss": 1.401186466217041, "step": 776 }, { "epoch": 0.9578331794398276, "grad_norm": 8.875, "learning_rate": 1.2961859418425365e-05, "loss": 1.7668989896774292, "step": 778 }, { "epoch": 0.9602954755309326, "grad_norm": 5.6875, "learning_rate": 1.2930208972915437e-05, "loss": 1.4184396266937256, "step": 780 }, { "epoch": 0.9627577716220376, "grad_norm": 3.75, "learning_rate": 1.2898543760948673e-05, "loss": 1.4058780670166016, "step": 782 }, { "epoch": 0.9652200677131425, "grad_norm": 1.0859375, "learning_rate": 1.2866864285189543e-05, "loss": 1.0642720460891724, "step": 784 }, { "epoch": 0.9676823638042474, "grad_norm": 5.65625, "learning_rate": 1.2835171048528916e-05, "loss": 1.7296231985092163, "step": 786 }, { "epoch": 0.9701446598953524, "grad_norm": 4.96875, "learning_rate": 1.2803464554076128e-05, "loss": 1.4836857318878174, "step": 788 }, { "epoch": 0.9726069559864574, "grad_norm": 5.1875, "learning_rate": 1.2771745305150965e-05, "loss": 1.7830345630645752, "step": 790 }, { "epoch": 0.9750692520775623, "grad_norm": 3.421875, "learning_rate": 1.2740013805275672e-05, "loss": 1.3922364711761475, "step": 792 }, { "epoch": 0.9775315481686673, "grad_norm": 7.34375, "learning_rate": 1.2708270558166995e-05, "loss": 1.0389618873596191, "step": 794 }, { "epoch": 0.9799938442597722, "grad_norm": 5.1875, "learning_rate": 1.2676516067728135e-05, "loss": 1.5342938899993896, "step": 796 }, { "epoch": 0.9824561403508771, "grad_norm": 5.03125, "learning_rate": 1.264475083804078e-05, "loss": 1.7565385103225708, "step": 798 }, { "epoch": 0.9849184364419822, "grad_norm": 4.34375, "learning_rate": 1.2612975373357113e-05, "loss": 1.398611068725586, "step": 800 }, { "epoch": 0.9873807325330871, "grad_norm": 5.78125, "learning_rate": 1.2581190178091764e-05, "loss": 1.4105567932128906, "step": 802 }, { "epoch": 0.989843028624192, "grad_norm": 4.59375, "learning_rate": 1.2549395756813852e-05, "loss": 1.1484860181808472, "step": 804 }, { "epoch": 0.992305324715297, "grad_norm": 6.875, "learning_rate": 1.251759261423894e-05, "loss": 0.9945257902145386, "step": 806 }, { "epoch": 0.994767620806402, "grad_norm": 12.5625, "learning_rate": 1.2485781255221037e-05, "loss": 1.5860981941223145, "step": 808 }, { "epoch": 0.997229916897507, "grad_norm": 13.4375, "learning_rate": 1.2453962184744595e-05, "loss": 1.3061414957046509, "step": 810 }, { "epoch": 0.9996922129886119, "grad_norm": 2.109375, "learning_rate": 1.2422135907916459e-05, "loss": 1.0748600959777832, "step": 812 }, { "epoch": 1.0012311480455525, "grad_norm": 1.0078125, "learning_rate": 1.239030292995789e-05, "loss": 1.1813337802886963, "step": 814 }, { "epoch": 1.0036934441366574, "grad_norm": 3.109375, "learning_rate": 1.2358463756196515e-05, "loss": 1.3365702629089355, "step": 816 }, { "epoch": 1.0061557402277623, "grad_norm": 3.484375, "learning_rate": 1.2326618892058316e-05, "loss": 1.269797444343567, "step": 818 }, { "epoch": 1.0086180363188673, "grad_norm": 2.734375, "learning_rate": 1.2294768843059611e-05, "loss": 1.130170226097107, "step": 820 }, { "epoch": 1.0110803324099722, "grad_norm": 4.1875, "learning_rate": 1.2262914114799011e-05, "loss": 1.5535081624984741, "step": 822 }, { "epoch": 1.0135426285010773, "grad_norm": 9.0625, "learning_rate": 1.2231055212949427e-05, "loss": 1.7664412260055542, "step": 824 }, { "epoch": 1.0160049245921823, "grad_norm": 2.109375, "learning_rate": 1.219919264325001e-05, "loss": 1.4970834255218506, "step": 826 }, { "epoch": 1.0184672206832872, "grad_norm": 1.7734375, "learning_rate": 1.2167326911498137e-05, "loss": 1.1276826858520508, "step": 828 }, { "epoch": 1.0209295167743921, "grad_norm": 2.328125, "learning_rate": 1.2135458523541384e-05, "loss": 1.0198701620101929, "step": 830 }, { "epoch": 1.023391812865497, "grad_norm": 6.0, "learning_rate": 1.2103587985269483e-05, "loss": 1.1860932111740112, "step": 832 }, { "epoch": 1.025854108956602, "grad_norm": 2.703125, "learning_rate": 1.207171580260632e-05, "loss": 1.4768877029418945, "step": 834 }, { "epoch": 1.028316405047707, "grad_norm": 8.625, "learning_rate": 1.2039842481501865e-05, "loss": 1.481208086013794, "step": 836 }, { "epoch": 1.0307787011388119, "grad_norm": 2.90625, "learning_rate": 1.200796852792417e-05, "loss": 1.473567008972168, "step": 838 }, { "epoch": 1.0332409972299168, "grad_norm": 10.5625, "learning_rate": 1.1976094447851323e-05, "loss": 1.5777289867401123, "step": 840 }, { "epoch": 1.035703293321022, "grad_norm": 4.03125, "learning_rate": 1.1944220747263425e-05, "loss": 1.3818743228912354, "step": 842 }, { "epoch": 1.0381655894121269, "grad_norm": 3.625, "learning_rate": 1.1912347932134552e-05, "loss": 1.2724220752716064, "step": 844 }, { "epoch": 1.0406278855032318, "grad_norm": 2.6875, "learning_rate": 1.1880476508424717e-05, "loss": 1.3566083908081055, "step": 846 }, { "epoch": 1.0430901815943368, "grad_norm": 1.3515625, "learning_rate": 1.1848606982071851e-05, "loss": 1.2785669565200806, "step": 848 }, { "epoch": 1.0455524776854417, "grad_norm": 5.375, "learning_rate": 1.1816739858983767e-05, "loss": 1.5428179502487183, "step": 850 }, { "epoch": 1.0480147737765466, "grad_norm": 5.53125, "learning_rate": 1.178487564503012e-05, "loss": 1.7369728088378906, "step": 852 }, { "epoch": 1.0504770698676515, "grad_norm": 5.15625, "learning_rate": 1.1753014846034398e-05, "loss": 1.6508008241653442, "step": 854 }, { "epoch": 1.0529393659587565, "grad_norm": 5.125, "learning_rate": 1.1721157967765869e-05, "loss": 1.4951319694519043, "step": 856 }, { "epoch": 1.0554016620498614, "grad_norm": 6.3125, "learning_rate": 1.1689305515931556e-05, "loss": 1.5579488277435303, "step": 858 }, { "epoch": 1.0578639581409663, "grad_norm": 9.0625, "learning_rate": 1.1657457996168233e-05, "loss": 1.2465214729309082, "step": 860 }, { "epoch": 1.0603262542320715, "grad_norm": 2.796875, "learning_rate": 1.1625615914034363e-05, "loss": 1.1531850099563599, "step": 862 }, { "epoch": 1.0627885503231764, "grad_norm": 7.625, "learning_rate": 1.1593779775002104e-05, "loss": 1.6242802143096924, "step": 864 }, { "epoch": 1.0652508464142814, "grad_norm": 6.8125, "learning_rate": 1.1561950084449258e-05, "loss": 1.7797261476516724, "step": 866 }, { "epoch": 1.0677131425053863, "grad_norm": 7.625, "learning_rate": 1.153012734765127e-05, "loss": 1.0688107013702393, "step": 868 }, { "epoch": 1.0701754385964912, "grad_norm": 1.640625, "learning_rate": 1.1498312069773205e-05, "loss": 0.603493332862854, "step": 870 }, { "epoch": 1.0726377346875962, "grad_norm": 7.59375, "learning_rate": 1.1466504755861708e-05, "loss": 1.2946546077728271, "step": 872 }, { "epoch": 1.075100030778701, "grad_norm": 4.0, "learning_rate": 1.143470591083701e-05, "loss": 1.3011809587478638, "step": 874 }, { "epoch": 1.077562326869806, "grad_norm": 3.1875, "learning_rate": 1.1402916039484898e-05, "loss": 1.3322241306304932, "step": 876 }, { "epoch": 1.080024622960911, "grad_norm": 2.953125, "learning_rate": 1.1371135646448716e-05, "loss": 1.3409028053283691, "step": 878 }, { "epoch": 1.082486919052016, "grad_norm": 4.9375, "learning_rate": 1.1339365236221344e-05, "loss": 1.5541951656341553, "step": 880 }, { "epoch": 1.084949215143121, "grad_norm": 4.1875, "learning_rate": 1.1307605313137185e-05, "loss": 1.6270629167556763, "step": 882 }, { "epoch": 1.087411511234226, "grad_norm": 3.515625, "learning_rate": 1.127585638136417e-05, "loss": 1.40193510055542, "step": 884 }, { "epoch": 1.089873807325331, "grad_norm": 4.90625, "learning_rate": 1.1244118944895751e-05, "loss": 1.3631030321121216, "step": 886 }, { "epoch": 1.0923361034164358, "grad_norm": 11.4375, "learning_rate": 1.1212393507542898e-05, "loss": 1.293651819229126, "step": 888 }, { "epoch": 1.0947983995075408, "grad_norm": 4.6875, "learning_rate": 1.1180680572926107e-05, "loss": 1.4282387495040894, "step": 890 }, { "epoch": 1.0972606955986457, "grad_norm": 4.09375, "learning_rate": 1.1148980644467393e-05, "loss": 1.5414776802062988, "step": 892 }, { "epoch": 1.0997229916897506, "grad_norm": 2.0625, "learning_rate": 1.1117294225382316e-05, "loss": 1.2819738388061523, "step": 894 }, { "epoch": 1.1021852877808556, "grad_norm": 3.625, "learning_rate": 1.1085621818671974e-05, "loss": 1.116639256477356, "step": 896 }, { "epoch": 1.1046475838719605, "grad_norm": 5.15625, "learning_rate": 1.1053963927115037e-05, "loss": 1.3504618406295776, "step": 898 }, { "epoch": 1.1071098799630656, "grad_norm": 4.375, "learning_rate": 1.102232105325975e-05, "loss": 1.4307514429092407, "step": 900 }, { "epoch": 1.1095721760541706, "grad_norm": 2.5, "learning_rate": 1.0990693699415962e-05, "loss": 1.2542567253112793, "step": 902 }, { "epoch": 1.1120344721452755, "grad_norm": 7.71875, "learning_rate": 1.0959082367647155e-05, "loss": 1.3272080421447754, "step": 904 }, { "epoch": 1.1144967682363804, "grad_norm": 5.3125, "learning_rate": 1.0927487559762478e-05, "loss": 1.344172477722168, "step": 906 }, { "epoch": 1.1169590643274854, "grad_norm": 7.59375, "learning_rate": 1.0895909777308757e-05, "loss": 1.2731947898864746, "step": 908 }, { "epoch": 1.1194213604185903, "grad_norm": 1.5234375, "learning_rate": 1.0864349521562563e-05, "loss": 1.2336888313293457, "step": 910 }, { "epoch": 1.1218836565096952, "grad_norm": 2.21875, "learning_rate": 1.0832807293522239e-05, "loss": 1.125575304031372, "step": 912 }, { "epoch": 1.1243459526008002, "grad_norm": 2.609375, "learning_rate": 1.080128359389995e-05, "loss": 1.1796314716339111, "step": 914 }, { "epoch": 1.1268082486919053, "grad_norm": 4.28125, "learning_rate": 1.0769778923113736e-05, "loss": 1.1832040548324585, "step": 916 }, { "epoch": 1.1292705447830103, "grad_norm": 2.15625, "learning_rate": 1.0738293781279561e-05, "loss": 1.1113415956497192, "step": 918 }, { "epoch": 1.1317328408741152, "grad_norm": 2.734375, "learning_rate": 1.0706828668203384e-05, "loss": 1.1446493864059448, "step": 920 }, { "epoch": 1.1341951369652201, "grad_norm": 4.09375, "learning_rate": 1.067538408337323e-05, "loss": 1.3466662168502808, "step": 922 }, { "epoch": 1.136657433056325, "grad_norm": 2.953125, "learning_rate": 1.064396052595123e-05, "loss": 1.1979475021362305, "step": 924 }, { "epoch": 1.13911972914743, "grad_norm": 6.0, "learning_rate": 1.0612558494765735e-05, "loss": 1.2253812551498413, "step": 926 }, { "epoch": 1.141582025238535, "grad_norm": 5.59375, "learning_rate": 1.0581178488303379e-05, "loss": 1.512798547744751, "step": 928 }, { "epoch": 1.1440443213296398, "grad_norm": 1.5859375, "learning_rate": 1.0549821004701163e-05, "loss": 1.214385986328125, "step": 930 }, { "epoch": 1.1465066174207448, "grad_norm": 1.640625, "learning_rate": 1.0518486541738552e-05, "loss": 1.0102102756500244, "step": 932 }, { "epoch": 1.1489689135118497, "grad_norm": 8.4375, "learning_rate": 1.0487175596829584e-05, "loss": 1.2178149223327637, "step": 934 }, { "epoch": 1.1514312096029546, "grad_norm": 5.59375, "learning_rate": 1.0455888667014956e-05, "loss": 1.3471554517745972, "step": 936 }, { "epoch": 1.1538935056940598, "grad_norm": 32.5, "learning_rate": 1.0424626248954135e-05, "loss": 1.5330407619476318, "step": 938 }, { "epoch": 1.1563558017851647, "grad_norm": 3.359375, "learning_rate": 1.0393388838917489e-05, "loss": 1.6406910419464111, "step": 940 }, { "epoch": 1.1588180978762697, "grad_norm": 1.703125, "learning_rate": 1.0362176932778399e-05, "loss": 1.2105987071990967, "step": 942 }, { "epoch": 1.1612803939673746, "grad_norm": 1.59375, "learning_rate": 1.0330991026005384e-05, "loss": 1.194588303565979, "step": 944 }, { "epoch": 1.1637426900584795, "grad_norm": 2.65625, "learning_rate": 1.0299831613654243e-05, "loss": 1.1566952466964722, "step": 946 }, { "epoch": 1.1662049861495845, "grad_norm": 3.71875, "learning_rate": 1.026869919036019e-05, "loss": 1.2074699401855469, "step": 948 }, { "epoch": 1.1686672822406894, "grad_norm": 11.375, "learning_rate": 1.0237594250330013e-05, "loss": 1.3596782684326172, "step": 950 }, { "epoch": 1.1711295783317943, "grad_norm": 5.84375, "learning_rate": 1.020651728733422e-05, "loss": 1.3205690383911133, "step": 952 }, { "epoch": 1.1735918744228995, "grad_norm": 3.734375, "learning_rate": 1.0175468794699193e-05, "loss": 1.337862253189087, "step": 954 }, { "epoch": 1.1760541705140044, "grad_norm": 3.875, "learning_rate": 1.014444926529937e-05, "loss": 1.3420543670654297, "step": 956 }, { "epoch": 1.1785164666051093, "grad_norm": 3.265625, "learning_rate": 1.0113459191549423e-05, "loss": 1.3313000202178955, "step": 958 }, { "epoch": 1.1809787626962143, "grad_norm": 5.03125, "learning_rate": 1.008249906539643e-05, "loss": 1.4042177200317383, "step": 960 }, { "epoch": 1.1834410587873192, "grad_norm": 3.9375, "learning_rate": 1.0051569378312066e-05, "loss": 1.3378522396087646, "step": 962 }, { "epoch": 1.1859033548784241, "grad_norm": 10.9375, "learning_rate": 1.0020670621284814e-05, "loss": 0.8419127464294434, "step": 964 }, { "epoch": 1.188365650969529, "grad_norm": 3.796875, "learning_rate": 9.989803284812156e-06, "loss": 0.8327467441558838, "step": 966 }, { "epoch": 1.190827947060634, "grad_norm": 18.0, "learning_rate": 9.958967858892796e-06, "loss": 1.1072711944580078, "step": 968 }, { "epoch": 1.193290243151739, "grad_norm": 11.375, "learning_rate": 9.928164833018884e-06, "loss": 1.109494686126709, "step": 970 }, { "epoch": 1.1957525392428439, "grad_norm": 7.15625, "learning_rate": 9.897394696168232e-06, "loss": 1.2777066230773926, "step": 972 }, { "epoch": 1.1982148353339488, "grad_norm": 4.03125, "learning_rate": 9.866657936796567e-06, "loss": 1.089713454246521, "step": 974 }, { "epoch": 1.200677131425054, "grad_norm": 4.875, "learning_rate": 9.835955042829762e-06, "loss": 1.1587715148925781, "step": 976 }, { "epoch": 1.2031394275161589, "grad_norm": 5.3125, "learning_rate": 9.805286501656111e-06, "loss": 1.300113558769226, "step": 978 }, { "epoch": 1.2056017236072638, "grad_norm": 7.34375, "learning_rate": 9.774652800118567e-06, "loss": 1.2401779890060425, "step": 980 }, { "epoch": 1.2080640196983687, "grad_norm": 1.3046875, "learning_rate": 9.74405442450704e-06, "loss": 1.2466282844543457, "step": 982 }, { "epoch": 1.2105263157894737, "grad_norm": 9.9375, "learning_rate": 9.713491860550646e-06, "loss": 1.485695242881775, "step": 984 }, { "epoch": 1.2129886118805786, "grad_norm": 4.9375, "learning_rate": 9.682965593410037e-06, "loss": 1.6573221683502197, "step": 986 }, { "epoch": 1.2154509079716835, "grad_norm": 3.734375, "learning_rate": 9.652476107669662e-06, "loss": 1.3761565685272217, "step": 988 }, { "epoch": 1.2179132040627885, "grad_norm": 4.09375, "learning_rate": 9.622023887330094e-06, "loss": 1.3099732398986816, "step": 990 }, { "epoch": 1.2203755001538936, "grad_norm": 10.625, "learning_rate": 9.591609415800338e-06, "loss": 1.5944232940673828, "step": 992 }, { "epoch": 1.2228377962449986, "grad_norm": 7.375, "learning_rate": 9.561233175890165e-06, "loss": 1.7219964265823364, "step": 994 }, { "epoch": 1.2253000923361035, "grad_norm": 2.78125, "learning_rate": 9.530895649802445e-06, "loss": 1.623438835144043, "step": 996 }, { "epoch": 1.2277623884272084, "grad_norm": 3.359375, "learning_rate": 9.50059731912549e-06, "loss": 1.3701614141464233, "step": 998 }, { "epoch": 1.2302246845183133, "grad_norm": 2.8125, "learning_rate": 9.470338664825408e-06, "loss": 1.2980146408081055, "step": 1000 }, { "epoch": 1.2326869806094183, "grad_norm": 5.71875, "learning_rate": 9.44012016723848e-06, "loss": 1.5235289335250854, "step": 1002 }, { "epoch": 1.2351492767005232, "grad_norm": 8.3125, "learning_rate": 9.409942306063513e-06, "loss": 1.6062097549438477, "step": 1004 }, { "epoch": 1.2376115727916281, "grad_norm": 2.65625, "learning_rate": 9.379805560354246e-06, "loss": 1.3337829113006592, "step": 1006 }, { "epoch": 1.240073868882733, "grad_norm": 4.78125, "learning_rate": 9.349710408511734e-06, "loss": 0.7538601160049438, "step": 1008 }, { "epoch": 1.242536164973838, "grad_norm": 27.25, "learning_rate": 9.319657328276757e-06, "loss": 0.47900092601776123, "step": 1010 }, { "epoch": 1.244998461064943, "grad_norm": 4.75, "learning_rate": 9.289646796722234e-06, "loss": 1.0039315223693848, "step": 1012 }, { "epoch": 1.247460757156048, "grad_norm": 6.375, "learning_rate": 9.259679290245658e-06, "loss": 1.2915596961975098, "step": 1014 }, { "epoch": 1.249923053247153, "grad_norm": 4.0, "learning_rate": 9.229755284561518e-06, "loss": 1.336082935333252, "step": 1016 }, { "epoch": 1.252385349338258, "grad_norm": 4.90625, "learning_rate": 9.19987525469376e-06, "loss": 1.416182279586792, "step": 1018 }, { "epoch": 1.254847645429363, "grad_norm": 5.0625, "learning_rate": 9.170039674968254e-06, "loss": 1.378662109375, "step": 1020 }, { "epoch": 1.2573099415204678, "grad_norm": 2.484375, "learning_rate": 9.140249019005236e-06, "loss": 1.3030860424041748, "step": 1022 }, { "epoch": 1.2597722376115728, "grad_norm": 4.0625, "learning_rate": 9.110503759711811e-06, "loss": 1.3451809883117676, "step": 1024 }, { "epoch": 1.2622345337026777, "grad_norm": 5.75, "learning_rate": 9.080804369274451e-06, "loss": 1.3729634284973145, "step": 1026 }, { "epoch": 1.2646968297937828, "grad_norm": 3.65625, "learning_rate": 9.051151319151479e-06, "loss": 1.3505221605300903, "step": 1028 }, { "epoch": 1.2671591258848878, "grad_norm": 5.1875, "learning_rate": 9.021545080065603e-06, "loss": 1.3553135395050049, "step": 1030 }, { "epoch": 1.2696214219759927, "grad_norm": 6.09375, "learning_rate": 8.991986121996432e-06, "loss": 1.4693278074264526, "step": 1032 }, { "epoch": 1.2720837180670976, "grad_norm": 7.09375, "learning_rate": 8.962474914173022e-06, "loss": 1.8386784791946411, "step": 1034 }, { "epoch": 1.2745460141582026, "grad_norm": 5.3125, "learning_rate": 8.933011925066431e-06, "loss": 1.5184224843978882, "step": 1036 }, { "epoch": 1.2770083102493075, "grad_norm": 14.0, "learning_rate": 8.903597622382263e-06, "loss": 1.3686227798461914, "step": 1038 }, { "epoch": 1.2794706063404124, "grad_norm": 5.15625, "learning_rate": 8.87423247305327e-06, "loss": 1.3770601749420166, "step": 1040 }, { "epoch": 1.2819329024315174, "grad_norm": 8.75, "learning_rate": 8.84491694323192e-06, "loss": 0.8821253776550293, "step": 1042 }, { "epoch": 1.2843951985226223, "grad_norm": 2.84375, "learning_rate": 8.815651498283002e-06, "loss": 0.7115093469619751, "step": 1044 }, { "epoch": 1.2868574946137272, "grad_norm": 3.0625, "learning_rate": 8.786436602776248e-06, "loss": 1.1449503898620605, "step": 1046 }, { "epoch": 1.2893197907048322, "grad_norm": 3.359375, "learning_rate": 8.757272720478942e-06, "loss": 1.3050785064697266, "step": 1048 }, { "epoch": 1.291782086795937, "grad_norm": 4.78125, "learning_rate": 8.728160314348575e-06, "loss": 1.0610979795455933, "step": 1050 }, { "epoch": 1.2942443828870422, "grad_norm": 2.96875, "learning_rate": 8.699099846525486e-06, "loss": 0.9030791521072388, "step": 1052 }, { "epoch": 1.2967066789781472, "grad_norm": 4.15625, "learning_rate": 8.670091778325521e-06, "loss": 1.3431543111801147, "step": 1054 }, { "epoch": 1.299168975069252, "grad_norm": 2.90625, "learning_rate": 8.641136570232724e-06, "loss": 1.3691339492797852, "step": 1056 }, { "epoch": 1.301631271160357, "grad_norm": 2.78125, "learning_rate": 8.612234681892017e-06, "loss": 1.3442999124526978, "step": 1058 }, { "epoch": 1.304093567251462, "grad_norm": 4.0625, "learning_rate": 8.583386572101902e-06, "loss": 1.3384771347045898, "step": 1060 }, { "epoch": 1.306555863342567, "grad_norm": 6.6875, "learning_rate": 8.554592698807185e-06, "loss": 1.4566752910614014, "step": 1062 }, { "epoch": 1.3090181594336718, "grad_norm": 6.09375, "learning_rate": 8.525853519091708e-06, "loss": 1.7774509191513062, "step": 1064 }, { "epoch": 1.311480455524777, "grad_norm": 5.5625, "learning_rate": 8.497169489171077e-06, "loss": 1.4398928880691528, "step": 1066 }, { "epoch": 1.313942751615882, "grad_norm": 4.5, "learning_rate": 8.468541064385447e-06, "loss": 1.4056460857391357, "step": 1068 }, { "epoch": 1.3164050477069869, "grad_norm": 4.78125, "learning_rate": 8.439968699192262e-06, "loss": 1.2474167346954346, "step": 1070 }, { "epoch": 1.3188673437980918, "grad_norm": 2.65625, "learning_rate": 8.411452847159063e-06, "loss": 1.4466845989227295, "step": 1072 }, { "epoch": 1.3213296398891967, "grad_norm": 2.875, "learning_rate": 8.382993960956287e-06, "loss": 1.3356812000274658, "step": 1074 }, { "epoch": 1.3237919359803016, "grad_norm": 3.578125, "learning_rate": 8.35459249235007e-06, "loss": 1.3684732913970947, "step": 1076 }, { "epoch": 1.3262542320714066, "grad_norm": 13.3125, "learning_rate": 8.32624889219508e-06, "loss": 1.5551846027374268, "step": 1078 }, { "epoch": 1.3287165281625115, "grad_norm": 1.7734375, "learning_rate": 8.297963610427366e-06, "loss": 1.287471055984497, "step": 1080 }, { "epoch": 1.3311788242536164, "grad_norm": 8.375, "learning_rate": 8.269737096057207e-06, "loss": 1.3594995737075806, "step": 1082 }, { "epoch": 1.3336411203447214, "grad_norm": 4.125, "learning_rate": 8.24156979716199e-06, "loss": 1.451033592224121, "step": 1084 }, { "epoch": 1.3361034164358263, "grad_norm": 4.625, "learning_rate": 8.213462160879098e-06, "loss": 1.272244930267334, "step": 1086 }, { "epoch": 1.3385657125269312, "grad_norm": 2.0625, "learning_rate": 8.185414633398805e-06, "loss": 1.1681973934173584, "step": 1088 }, { "epoch": 1.3410280086180364, "grad_norm": 4.0625, "learning_rate": 8.157427659957198e-06, "loss": 1.1624126434326172, "step": 1090 }, { "epoch": 1.3434903047091413, "grad_norm": 9.9375, "learning_rate": 8.12950168482911e-06, "loss": 1.3475921154022217, "step": 1092 }, { "epoch": 1.3459526008002463, "grad_norm": 13.6875, "learning_rate": 8.101637151321057e-06, "loss": 1.4795109033584595, "step": 1094 }, { "epoch": 1.3484148968913512, "grad_norm": 2.078125, "learning_rate": 8.07383450176423e-06, "loss": 1.3539352416992188, "step": 1096 }, { "epoch": 1.3508771929824561, "grad_norm": 2.984375, "learning_rate": 8.046094177507436e-06, "loss": 1.0916264057159424, "step": 1098 }, { "epoch": 1.353339489073561, "grad_norm": 3.078125, "learning_rate": 8.018416618910105e-06, "loss": 1.337206482887268, "step": 1100 }, { "epoch": 1.355801785164666, "grad_norm": 6.78125, "learning_rate": 7.99080226533532e-06, "loss": 1.5372506380081177, "step": 1102 }, { "epoch": 1.3582640812557711, "grad_norm": 8.875, "learning_rate": 7.963251555142813e-06, "loss": 1.4474639892578125, "step": 1104 }, { "epoch": 1.360726377346876, "grad_norm": 6.90625, "learning_rate": 7.935764925682028e-06, "loss": 1.782578468322754, "step": 1106 }, { "epoch": 1.363188673437981, "grad_norm": 10.25, "learning_rate": 7.908342813285159e-06, "loss": 1.6106759309768677, "step": 1108 }, { "epoch": 1.365650969529086, "grad_norm": 10.125, "learning_rate": 7.880985653260244e-06, "loss": 1.5926954746246338, "step": 1110 }, { "epoch": 1.3681132656201909, "grad_norm": 7.375, "learning_rate": 7.853693879884239e-06, "loss": 1.7612438201904297, "step": 1112 }, { "epoch": 1.3705755617112958, "grad_norm": 3.3125, "learning_rate": 7.826467926396125e-06, "loss": 1.5579084157943726, "step": 1114 }, { "epoch": 1.3730378578024007, "grad_norm": 1.5703125, "learning_rate": 7.799308224990049e-06, "loss": 1.1745721101760864, "step": 1116 }, { "epoch": 1.3755001538935057, "grad_norm": 3.59375, "learning_rate": 7.772215206808441e-06, "loss": 1.1942408084869385, "step": 1118 }, { "epoch": 1.3779624499846106, "grad_norm": 3.890625, "learning_rate": 7.745189301935184e-06, "loss": 1.2781388759613037, "step": 1120 }, { "epoch": 1.3804247460757155, "grad_norm": 10.5, "learning_rate": 7.71823093938877e-06, "loss": 1.2326617240905762, "step": 1122 }, { "epoch": 1.3828870421668205, "grad_norm": 3.4375, "learning_rate": 7.691340547115508e-06, "loss": 1.1817359924316406, "step": 1124 }, { "epoch": 1.3853493382579254, "grad_norm": 1.65625, "learning_rate": 7.664518551982729e-06, "loss": 1.280542016029358, "step": 1126 }, { "epoch": 1.3878116343490305, "grad_norm": 1.4140625, "learning_rate": 7.637765379771997e-06, "loss": 1.0744314193725586, "step": 1128 }, { "epoch": 1.3902739304401355, "grad_norm": 3.484375, "learning_rate": 7.61108145517236e-06, "loss": 1.1780340671539307, "step": 1130 }, { "epoch": 1.3927362265312404, "grad_norm": 2.65625, "learning_rate": 7.5844672017736e-06, "loss": 1.1386570930480957, "step": 1132 }, { "epoch": 1.3951985226223453, "grad_norm": 4.21875, "learning_rate": 7.557923042059525e-06, "loss": 1.2564072608947754, "step": 1134 }, { "epoch": 1.3976608187134503, "grad_norm": 8.5625, "learning_rate": 7.531449397401243e-06, "loss": 1.358655333518982, "step": 1136 }, { "epoch": 1.4001231148045552, "grad_norm": 12.125, "learning_rate": 7.505046688050486e-06, "loss": 1.1821155548095703, "step": 1138 }, { "epoch": 1.4025854108956601, "grad_norm": 4.65625, "learning_rate": 7.4787153331329356e-06, "loss": 1.3920905590057373, "step": 1140 }, { "epoch": 1.4050477069867653, "grad_norm": 2.703125, "learning_rate": 7.452455750641563e-06, "loss": 1.3678568601608276, "step": 1142 }, { "epoch": 1.4075100030778702, "grad_norm": 3.71875, "learning_rate": 7.4262683574300046e-06, "loss": 1.2067809104919434, "step": 1144 }, { "epoch": 1.4099722991689752, "grad_norm": 10.25, "learning_rate": 7.4001535692059335e-06, "loss": 1.400128722190857, "step": 1146 }, { "epoch": 1.41243459526008, "grad_norm": 3.140625, "learning_rate": 7.374111800524476e-06, "loss": 1.1754021644592285, "step": 1148 }, { "epoch": 1.414896891351185, "grad_norm": 5.3125, "learning_rate": 7.34814346478161e-06, "loss": 1.3996424674987793, "step": 1150 }, { "epoch": 1.41735918744229, "grad_norm": 1.78125, "learning_rate": 7.322248974207624e-06, "loss": 1.1624915599822998, "step": 1152 }, { "epoch": 1.4198214835333949, "grad_norm": 7.1875, "learning_rate": 7.296428739860557e-06, "loss": 1.2524189949035645, "step": 1154 }, { "epoch": 1.4222837796244998, "grad_norm": 2.03125, "learning_rate": 7.270683171619675e-06, "loss": 1.1983616352081299, "step": 1156 }, { "epoch": 1.4247460757156047, "grad_norm": 5.15625, "learning_rate": 7.2450126781789795e-06, "loss": 1.263120412826538, "step": 1158 }, { "epoch": 1.4272083718067097, "grad_norm": 2.8125, "learning_rate": 7.219417667040702e-06, "loss": 1.5528199672698975, "step": 1160 }, { "epoch": 1.4296706678978146, "grad_norm": 6.375, "learning_rate": 7.193898544508842e-06, "loss": 1.5049046277999878, "step": 1162 }, { "epoch": 1.4321329639889195, "grad_norm": 2.03125, "learning_rate": 7.168455715682716e-06, "loss": 1.2450196743011475, "step": 1164 }, { "epoch": 1.4345952600800247, "grad_norm": 5.75, "learning_rate": 7.143089584450531e-06, "loss": 1.0869059562683105, "step": 1166 }, { "epoch": 1.4370575561711296, "grad_norm": 3.734375, "learning_rate": 7.117800553482971e-06, "loss": 1.3680589199066162, "step": 1168 }, { "epoch": 1.4395198522622346, "grad_norm": 13.8125, "learning_rate": 7.092589024226804e-06, "loss": 1.4548523426055908, "step": 1170 }, { "epoch": 1.4419821483533395, "grad_norm": 6.15625, "learning_rate": 7.067455396898504e-06, "loss": 1.0294753313064575, "step": 1172 }, { "epoch": 1.4444444444444444, "grad_norm": 14.375, "learning_rate": 7.042400070477908e-06, "loss": 1.1527860164642334, "step": 1174 }, { "epoch": 1.4469067405355494, "grad_norm": 5.0625, "learning_rate": 7.0174234427018736e-06, "loss": 1.667987585067749, "step": 1176 }, { "epoch": 1.4493690366266543, "grad_norm": 9.75, "learning_rate": 6.992525910057972e-06, "loss": 1.6407973766326904, "step": 1178 }, { "epoch": 1.4518313327177594, "grad_norm": 9.125, "learning_rate": 6.967707867778193e-06, "loss": 1.551527500152588, "step": 1180 }, { "epoch": 1.4542936288088644, "grad_norm": 3.21875, "learning_rate": 6.9429697098326634e-06, "loss": 1.400420069694519, "step": 1182 }, { "epoch": 1.4567559248999693, "grad_norm": 3.28125, "learning_rate": 6.918311828923403e-06, "loss": 1.3203402757644653, "step": 1184 }, { "epoch": 1.4592182209910742, "grad_norm": 3.4375, "learning_rate": 6.893734616478087e-06, "loss": 1.2934377193450928, "step": 1186 }, { "epoch": 1.4616805170821792, "grad_norm": 7.84375, "learning_rate": 6.869238462643825e-06, "loss": 0.8468174934387207, "step": 1188 }, { "epoch": 1.464142813173284, "grad_norm": 3.78125, "learning_rate": 6.844823756280985e-06, "loss": 0.7017765641212463, "step": 1190 }, { "epoch": 1.466605109264389, "grad_norm": 6.75, "learning_rate": 6.8204908849569996e-06, "loss": 0.8379335999488831, "step": 1192 }, { "epoch": 1.469067405355494, "grad_norm": 4.78125, "learning_rate": 6.79624023494023e-06, "loss": 0.8475155234336853, "step": 1194 }, { "epoch": 1.471529701446599, "grad_norm": 3.625, "learning_rate": 6.772072191193826e-06, "loss": 1.5360143184661865, "step": 1196 }, { "epoch": 1.4739919975377038, "grad_norm": 12.375, "learning_rate": 6.747987137369616e-06, "loss": 1.451025366783142, "step": 1198 }, { "epoch": 1.4764542936288088, "grad_norm": 6.125, "learning_rate": 6.72398545580202e-06, "loss": 1.6992993354797363, "step": 1200 }, { "epoch": 1.4789165897199137, "grad_norm": 2.859375, "learning_rate": 6.700067527501979e-06, "loss": 1.4374724626541138, "step": 1202 }, { "epoch": 1.4813788858110188, "grad_norm": 4.34375, "learning_rate": 6.676233732150905e-06, "loss": 1.423210859298706, "step": 1204 }, { "epoch": 1.4838411819021238, "grad_norm": 6.375, "learning_rate": 6.652484448094654e-06, "loss": 1.3673293590545654, "step": 1206 }, { "epoch": 1.4863034779932287, "grad_norm": 3.171875, "learning_rate": 6.628820052337515e-06, "loss": 1.3383548259735107, "step": 1208 }, { "epoch": 1.4887657740843336, "grad_norm": 2.71875, "learning_rate": 6.605240920536241e-06, "loss": 0.7290570139884949, "step": 1210 }, { "epoch": 1.4912280701754386, "grad_norm": 3.828125, "learning_rate": 6.581747426994074e-06, "loss": 0.8285163044929504, "step": 1212 }, { "epoch": 1.4936903662665435, "grad_norm": 4.8125, "learning_rate": 6.558339944654797e-06, "loss": 1.524817705154419, "step": 1214 }, { "epoch": 1.4961526623576484, "grad_norm": 4.09375, "learning_rate": 6.5350188450968275e-06, "loss": 1.5156073570251465, "step": 1216 }, { "epoch": 1.4986149584487536, "grad_norm": 1.96875, "learning_rate": 6.511784498527316e-06, "loss": 1.266753911972046, "step": 1218 }, { "epoch": 1.5010772545398585, "grad_norm": 4.28125, "learning_rate": 6.488637273776258e-06, "loss": 1.234669804573059, "step": 1220 }, { "epoch": 1.5035395506309635, "grad_norm": 3.296875, "learning_rate": 6.465577538290656e-06, "loss": 1.1362870931625366, "step": 1222 }, { "epoch": 1.5060018467220684, "grad_norm": 5.78125, "learning_rate": 6.4426056581286736e-06, "loss": 1.2194573879241943, "step": 1224 }, { "epoch": 1.5084641428131733, "grad_norm": 2.484375, "learning_rate": 6.419721997953825e-06, "loss": 1.3203624486923218, "step": 1226 }, { "epoch": 1.5109264389042782, "grad_norm": 8.875, "learning_rate": 6.396926921029197e-06, "loss": 1.4041712284088135, "step": 1228 }, { "epoch": 1.5133887349953832, "grad_norm": 3.015625, "learning_rate": 6.374220789211669e-06, "loss": 1.6859148740768433, "step": 1230 }, { "epoch": 1.515851031086488, "grad_norm": 3.03125, "learning_rate": 6.351603962946182e-06, "loss": 1.2609457969665527, "step": 1232 }, { "epoch": 1.518313327177593, "grad_norm": 4.0625, "learning_rate": 6.329076801260007e-06, "loss": 1.3652920722961426, "step": 1234 }, { "epoch": 1.520775623268698, "grad_norm": 0.9921875, "learning_rate": 6.306639661757047e-06, "loss": 1.1765468120574951, "step": 1236 }, { "epoch": 1.523237919359803, "grad_norm": 12.0625, "learning_rate": 6.2842929006121645e-06, "loss": 1.2304123640060425, "step": 1238 }, { "epoch": 1.5257002154509078, "grad_norm": 4.03125, "learning_rate": 6.262036872565519e-06, "loss": 1.1622458696365356, "step": 1240 }, { "epoch": 1.5281625115420128, "grad_norm": 1.765625, "learning_rate": 6.239871930916952e-06, "loss": 1.1903202533721924, "step": 1242 }, { "epoch": 1.530624807633118, "grad_norm": 4.59375, "learning_rate": 6.21779842752036e-06, "loss": 1.1756622791290283, "step": 1244 }, { "epoch": 1.5330871037242229, "grad_norm": 3.09375, "learning_rate": 6.195816712778119e-06, "loss": 1.361944556236267, "step": 1246 }, { "epoch": 1.5355493998153278, "grad_norm": 2.015625, "learning_rate": 6.1739271356355205e-06, "loss": 1.207919955253601, "step": 1248 }, { "epoch": 1.5380116959064327, "grad_norm": 2.96875, "learning_rate": 6.152130043575235e-06, "loss": 1.128209114074707, "step": 1250 }, { "epoch": 1.5404739919975377, "grad_norm": 4.9375, "learning_rate": 6.130425782611788e-06, "loss": 0.9894086122512817, "step": 1252 }, { "epoch": 1.5429362880886428, "grad_norm": 5.4375, "learning_rate": 6.1088146972860796e-06, "loss": 1.4114530086517334, "step": 1254 }, { "epoch": 1.5453985841797477, "grad_norm": 3.28125, "learning_rate": 6.0872971306598985e-06, "loss": 1.6339147090911865, "step": 1256 }, { "epoch": 1.5478608802708527, "grad_norm": 1.34375, "learning_rate": 6.065873424310493e-06, "loss": 1.2093985080718994, "step": 1258 }, { "epoch": 1.5503231763619576, "grad_norm": 4.1875, "learning_rate": 6.044543918325134e-06, "loss": 1.422555923461914, "step": 1260 }, { "epoch": 1.5527854724530625, "grad_norm": 1.6015625, "learning_rate": 6.0233089512957335e-06, "loss": 1.3422693014144897, "step": 1262 }, { "epoch": 1.5552477685441675, "grad_norm": 6.75, "learning_rate": 6.002168860313449e-06, "loss": 1.1010103225708008, "step": 1264 }, { "epoch": 1.5577100646352724, "grad_norm": 4.28125, "learning_rate": 5.9811239809633504e-06, "loss": 1.3068557977676392, "step": 1266 }, { "epoch": 1.5601723607263773, "grad_norm": 3.875, "learning_rate": 5.960174647319083e-06, "loss": 1.1887340545654297, "step": 1268 }, { "epoch": 1.5626346568174823, "grad_norm": 6.21875, "learning_rate": 5.939321191937567e-06, "loss": 1.1840931177139282, "step": 1270 }, { "epoch": 1.5650969529085872, "grad_norm": 2.609375, "learning_rate": 5.918563945853714e-06, "loss": 1.3886611461639404, "step": 1272 }, { "epoch": 1.5675592489996921, "grad_norm": 1.703125, "learning_rate": 5.8979032385751845e-06, "loss": 1.1980421543121338, "step": 1274 }, { "epoch": 1.570021545090797, "grad_norm": 5.4375, "learning_rate": 5.877339398077142e-06, "loss": 1.4251586198806763, "step": 1276 }, { "epoch": 1.572483841181902, "grad_norm": 8.9375, "learning_rate": 5.8568727507970566e-06, "loss": 1.4789252281188965, "step": 1278 }, { "epoch": 1.574946137273007, "grad_norm": 3.453125, "learning_rate": 5.836503621629518e-06, "loss": 1.3751678466796875, "step": 1280 }, { "epoch": 1.577408433364112, "grad_norm": 5.09375, "learning_rate": 5.8162323339210795e-06, "loss": 1.5434916019439697, "step": 1282 }, { "epoch": 1.579870729455217, "grad_norm": 1.0546875, "learning_rate": 5.796059209465128e-06, "loss": 1.2941160202026367, "step": 1284 }, { "epoch": 1.582333025546322, "grad_norm": 3.40625, "learning_rate": 5.775984568496774e-06, "loss": 1.2361758947372437, "step": 1286 }, { "epoch": 1.5847953216374269, "grad_norm": 4.125, "learning_rate": 5.756008729687764e-06, "loss": 1.2213199138641357, "step": 1288 }, { "epoch": 1.587257617728532, "grad_norm": 5.15625, "learning_rate": 5.7361320101414264e-06, "loss": 1.370686411857605, "step": 1290 }, { "epoch": 1.589719913819637, "grad_norm": 2.328125, "learning_rate": 5.716354725387634e-06, "loss": 1.160779595375061, "step": 1292 }, { "epoch": 1.5921822099107419, "grad_norm": 3.625, "learning_rate": 5.696677189377804e-06, "loss": 1.149789810180664, "step": 1294 }, { "epoch": 1.5946445060018468, "grad_norm": 3.96875, "learning_rate": 5.677099714479901e-06, "loss": 1.3322994709014893, "step": 1296 }, { "epoch": 1.5971068020929517, "grad_norm": 3.640625, "learning_rate": 5.657622611473487e-06, "loss": 1.3151819705963135, "step": 1298 }, { "epoch": 1.5995690981840567, "grad_norm": 3.78125, "learning_rate": 5.638246189544789e-06, "loss": 1.4213796854019165, "step": 1300 }, { "epoch": 1.6020313942751616, "grad_norm": 4.75, "learning_rate": 5.618970756281786e-06, "loss": 1.6766854524612427, "step": 1302 }, { "epoch": 1.6044936903662665, "grad_norm": 8.0625, "learning_rate": 5.5997966176693255e-06, "loss": 1.551700472831726, "step": 1304 }, { "epoch": 1.6069559864573715, "grad_norm": 2.46875, "learning_rate": 5.580724078084273e-06, "loss": 1.2433726787567139, "step": 1306 }, { "epoch": 1.6094182825484764, "grad_norm": 10.25, "learning_rate": 5.561753440290676e-06, "loss": 1.3765232563018799, "step": 1308 }, { "epoch": 1.6118805786395813, "grad_norm": 4.3125, "learning_rate": 5.542885005434956e-06, "loss": 1.6626167297363281, "step": 1310 }, { "epoch": 1.6143428747306863, "grad_norm": 11.0625, "learning_rate": 5.524119073041125e-06, "loss": 1.5003547668457031, "step": 1312 }, { "epoch": 1.6168051708217912, "grad_norm": 4.125, "learning_rate": 5.505455941006048e-06, "loss": 1.4539849758148193, "step": 1314 }, { "epoch": 1.6192674669128961, "grad_norm": 3.234375, "learning_rate": 5.486895905594696e-06, "loss": 1.255268931388855, "step": 1316 }, { "epoch": 1.621729763004001, "grad_norm": 3.234375, "learning_rate": 5.468439261435443e-06, "loss": 1.2248173952102661, "step": 1318 }, { "epoch": 1.6241920590951062, "grad_norm": 1.5625, "learning_rate": 5.450086301515402e-06, "loss": 1.1668376922607422, "step": 1320 }, { "epoch": 1.6266543551862112, "grad_norm": 1.3125, "learning_rate": 5.4318373171757635e-06, "loss": 0.9886284470558167, "step": 1322 }, { "epoch": 1.629116651277316, "grad_norm": 5.0, "learning_rate": 5.413692598107173e-06, "loss": 1.1245368719100952, "step": 1324 }, { "epoch": 1.631578947368421, "grad_norm": 4.65625, "learning_rate": 5.395652432345137e-06, "loss": 1.3283562660217285, "step": 1326 }, { "epoch": 1.6340412434595262, "grad_norm": 3.625, "learning_rate": 5.377717106265447e-06, "loss": 1.361234426498413, "step": 1328 }, { "epoch": 1.636503539550631, "grad_norm": 1.625, "learning_rate": 5.3598869045796256e-06, "loss": 1.0329114198684692, "step": 1330 }, { "epoch": 1.638965835641736, "grad_norm": 4.28125, "learning_rate": 5.342162110330427e-06, "loss": 0.9817519187927246, "step": 1332 }, { "epoch": 1.641428131732841, "grad_norm": 7.1875, "learning_rate": 5.3245430048873205e-06, "loss": 1.1899058818817139, "step": 1334 }, { "epoch": 1.643890427823946, "grad_norm": 18.125, "learning_rate": 5.307029867942037e-06, "loss": 0.9700236320495605, "step": 1336 }, { "epoch": 1.6463527239150508, "grad_norm": 3.28125, "learning_rate": 5.289622977504136e-06, "loss": 0.7763628959655762, "step": 1338 }, { "epoch": 1.6488150200061558, "grad_norm": 7.28125, "learning_rate": 5.272322609896572e-06, "loss": 1.5835676193237305, "step": 1340 }, { "epoch": 1.6512773160972607, "grad_norm": 6.625, "learning_rate": 5.2551290397513266e-06, "loss": 1.6835378408432007, "step": 1342 }, { "epoch": 1.6537396121883656, "grad_norm": 10.875, "learning_rate": 5.2380425400050375e-06, "loss": 1.568629503250122, "step": 1344 }, { "epoch": 1.6562019082794706, "grad_norm": 3.21875, "learning_rate": 5.221063381894673e-06, "loss": 1.3448878526687622, "step": 1346 }, { "epoch": 1.6586642043705755, "grad_norm": 4.75, "learning_rate": 5.204191834953222e-06, "loss": 1.3649985790252686, "step": 1348 }, { "epoch": 1.6611265004616804, "grad_norm": 5.125, "learning_rate": 5.187428167005419e-06, "loss": 1.326650619506836, "step": 1350 }, { "epoch": 1.6635887965527854, "grad_norm": 7.0, "learning_rate": 5.1707726441634875e-06, "loss": 1.4459569454193115, "step": 1352 }, { "epoch": 1.6660510926438903, "grad_norm": 7.65625, "learning_rate": 5.1542255308229185e-06, "loss": 1.614980936050415, "step": 1354 }, { "epoch": 1.6685133887349952, "grad_norm": 5.4375, "learning_rate": 5.137787089658273e-06, "loss": 1.3426003456115723, "step": 1356 }, { "epoch": 1.6709756848261004, "grad_norm": 5.03125, "learning_rate": 5.121457581619018e-06, "loss": 1.3568965196609497, "step": 1358 }, { "epoch": 1.6734379809172053, "grad_norm": 2.859375, "learning_rate": 5.105237265925373e-06, "loss": 1.208372712135315, "step": 1360 }, { "epoch": 1.6759002770083102, "grad_norm": 4.59375, "learning_rate": 5.089126400064199e-06, "loss": 1.2874377965927124, "step": 1362 }, { "epoch": 1.6783625730994152, "grad_norm": 1.796875, "learning_rate": 5.0731252397849195e-06, "loss": 1.2037644386291504, "step": 1364 }, { "epoch": 1.6808248691905203, "grad_norm": 2.09375, "learning_rate": 5.057234039095447e-06, "loss": 1.1050446033477783, "step": 1366 }, { "epoch": 1.6832871652816253, "grad_norm": 4.0, "learning_rate": 5.041453050258165e-06, "loss": 1.3572784662246704, "step": 1368 }, { "epoch": 1.6857494613727302, "grad_norm": 6.5625, "learning_rate": 5.025782523785911e-06, "loss": 1.7393821477890015, "step": 1370 }, { "epoch": 1.6882117574638351, "grad_norm": 7.9375, "learning_rate": 5.010222708438004e-06, "loss": 1.312801480293274, "step": 1372 }, { "epoch": 1.69067405355494, "grad_norm": 4.03125, "learning_rate": 4.9947738512163e-06, "loss": 1.1735351085662842, "step": 1374 }, { "epoch": 1.693136349646045, "grad_norm": 4.28125, "learning_rate": 4.979436197361265e-06, "loss": 1.368802547454834, "step": 1376 }, { "epoch": 1.69559864573715, "grad_norm": 3.953125, "learning_rate": 4.964209990348089e-06, "loss": 1.3448070287704468, "step": 1378 }, { "epoch": 1.6980609418282548, "grad_norm": 4.0, "learning_rate": 4.94909547188281e-06, "loss": 1.2951633930206299, "step": 1380 }, { "epoch": 1.7005232379193598, "grad_norm": 2.96875, "learning_rate": 4.934092881898489e-06, "loss": 1.3092372417449951, "step": 1382 }, { "epoch": 1.7029855340104647, "grad_norm": 5.875, "learning_rate": 4.919202458551394e-06, "loss": 1.4099408388137817, "step": 1384 }, { "epoch": 1.7054478301015696, "grad_norm": 4.34375, "learning_rate": 4.9044244382172215e-06, "loss": 1.3373868465423584, "step": 1386 }, { "epoch": 1.7079101261926746, "grad_norm": 4.625, "learning_rate": 4.88975905548734e-06, "loss": 1.3168833255767822, "step": 1388 }, { "epoch": 1.7103724222837795, "grad_norm": 1.96875, "learning_rate": 4.8752065431650775e-06, "loss": 1.1487715244293213, "step": 1390 }, { "epoch": 1.7128347183748844, "grad_norm": 1.984375, "learning_rate": 4.8607671322620134e-06, "loss": 1.083390712738037, "step": 1392 }, { "epoch": 1.7152970144659896, "grad_norm": 2.015625, "learning_rate": 4.846441051994317e-06, "loss": 0.9462494850158691, "step": 1394 }, { "epoch": 1.7177593105570945, "grad_norm": 4.5625, "learning_rate": 4.832228529779107e-06, "loss": 1.4706915616989136, "step": 1396 }, { "epoch": 1.7202216066481995, "grad_norm": 3.828125, "learning_rate": 4.818129791230845e-06, "loss": 1.5781259536743164, "step": 1398 }, { "epoch": 1.7226839027393044, "grad_norm": 3.421875, "learning_rate": 4.804145060157752e-06, "loss": 1.3088247776031494, "step": 1400 }, { "epoch": 1.7251461988304093, "grad_norm": 3.359375, "learning_rate": 4.790274558558255e-06, "loss": 1.305666446685791, "step": 1402 }, { "epoch": 1.7276084949215145, "grad_norm": 3.125, "learning_rate": 4.776518506617457e-06, "loss": 1.3846698999404907, "step": 1404 }, { "epoch": 1.7300707910126194, "grad_norm": 8.0, "learning_rate": 4.762877122703658e-06, "loss": 0.9111043214797974, "step": 1406 }, { "epoch": 1.7325330871037243, "grad_norm": 4.0625, "learning_rate": 4.749350623364867e-06, "loss": 0.9622360467910767, "step": 1408 }, { "epoch": 1.7349953831948293, "grad_norm": 1.6640625, "learning_rate": 4.735939223325387e-06, "loss": 1.1692111492156982, "step": 1410 }, { "epoch": 1.7374576792859342, "grad_norm": 3.109375, "learning_rate": 4.722643135482389e-06, "loss": 1.1715750694274902, "step": 1412 }, { "epoch": 1.7399199753770391, "grad_norm": 1.53125, "learning_rate": 4.709462570902536e-06, "loss": 1.1869937181472778, "step": 1414 }, { "epoch": 1.742382271468144, "grad_norm": 6.3125, "learning_rate": 4.696397738818644e-06, "loss": 1.3076727390289307, "step": 1416 }, { "epoch": 1.744844567559249, "grad_norm": 6.46875, "learning_rate": 4.683448846626342e-06, "loss": 1.78236722946167, "step": 1418 }, { "epoch": 1.747306863650354, "grad_norm": 5.84375, "learning_rate": 4.670616099880796e-06, "loss": 1.399848222732544, "step": 1420 }, { "epoch": 1.7497691597414589, "grad_norm": 1.5234375, "learning_rate": 4.657899702293436e-06, "loss": 1.1672091484069824, "step": 1422 }, { "epoch": 1.7522314558325638, "grad_norm": 1.9921875, "learning_rate": 4.645299855728726e-06, "loss": 1.084723949432373, "step": 1424 }, { "epoch": 1.7546937519236687, "grad_norm": 4.4375, "learning_rate": 4.63281676020096e-06, "loss": 1.277264952659607, "step": 1426 }, { "epoch": 1.7571560480147737, "grad_norm": 3.15625, "learning_rate": 4.620450613871082e-06, "loss": 1.5163521766662598, "step": 1428 }, { "epoch": 1.7596183441058786, "grad_norm": 4.1875, "learning_rate": 4.608201613043551e-06, "loss": 1.3597209453582764, "step": 1430 }, { "epoch": 1.7620806401969837, "grad_norm": 4.71875, "learning_rate": 4.596069952163215e-06, "loss": 1.3845343589782715, "step": 1432 }, { "epoch": 1.7645429362880887, "grad_norm": 8.3125, "learning_rate": 4.584055823812224e-06, "loss": 1.3936517238616943, "step": 1434 }, { "epoch": 1.7670052323791936, "grad_norm": 3.109375, "learning_rate": 4.572159418706983e-06, "loss": 1.2084264755249023, "step": 1436 }, { "epoch": 1.7694675284702985, "grad_norm": 4.25, "learning_rate": 4.560380925695109e-06, "loss": 1.3428120613098145, "step": 1438 }, { "epoch": 1.7719298245614035, "grad_norm": 6.8125, "learning_rate": 4.54872053175245e-06, "loss": 1.7809275388717651, "step": 1440 }, { "epoch": 1.7743921206525086, "grad_norm": 3.078125, "learning_rate": 4.537178421980104e-06, "loss": 1.2580034732818604, "step": 1442 }, { "epoch": 1.7768544167436136, "grad_norm": 14.4375, "learning_rate": 4.52575477960149e-06, "loss": 1.1773604154586792, "step": 1444 }, { "epoch": 1.7793167128347185, "grad_norm": 4.59375, "learning_rate": 4.514449785959429e-06, "loss": 1.5239715576171875, "step": 1446 }, { "epoch": 1.7817790089258234, "grad_norm": 3.125, "learning_rate": 4.503263620513274e-06, "loss": 1.2753288745880127, "step": 1448 }, { "epoch": 1.7842413050169283, "grad_norm": 4.71875, "learning_rate": 4.49219646083606e-06, "loss": 1.2915542125701904, "step": 1450 }, { "epoch": 1.7867036011080333, "grad_norm": 5.0625, "learning_rate": 4.481248482611682e-06, "loss": 1.6656956672668457, "step": 1452 }, { "epoch": 1.7891658971991382, "grad_norm": 4.65625, "learning_rate": 4.470419859632109e-06, "loss": 1.3530993461608887, "step": 1454 }, { "epoch": 1.7916281932902431, "grad_norm": 3.125, "learning_rate": 4.459710763794619e-06, "loss": 1.230569839477539, "step": 1456 }, { "epoch": 1.794090489381348, "grad_norm": 8.5625, "learning_rate": 4.449121365099082e-06, "loss": 1.2140610218048096, "step": 1458 }, { "epoch": 1.796552785472453, "grad_norm": 3.8125, "learning_rate": 4.4386518316452475e-06, "loss": 1.3054462671279907, "step": 1460 }, { "epoch": 1.799015081563558, "grad_norm": 6.59375, "learning_rate": 4.428302329630089e-06, "loss": 1.515989065170288, "step": 1462 }, { "epoch": 1.8014773776546629, "grad_norm": 3.796875, "learning_rate": 4.418073023345158e-06, "loss": 1.2513904571533203, "step": 1464 }, { "epoch": 1.8039396737457678, "grad_norm": 8.9375, "learning_rate": 4.407964075173976e-06, "loss": 1.2142295837402344, "step": 1466 }, { "epoch": 1.8064019698368727, "grad_norm": 4.6875, "learning_rate": 4.397975645589459e-06, "loss": 1.1632449626922607, "step": 1468 }, { "epoch": 1.8088642659279779, "grad_norm": 2.28125, "learning_rate": 4.38810789315137e-06, "loss": 1.2213386297225952, "step": 1470 }, { "epoch": 1.8113265620190828, "grad_norm": 5.5625, "learning_rate": 4.378360974503803e-06, "loss": 1.3299362659454346, "step": 1472 }, { "epoch": 1.8137888581101878, "grad_norm": 19.125, "learning_rate": 4.368735044372691e-06, "loss": 1.8193198442459106, "step": 1474 }, { "epoch": 1.8162511542012927, "grad_norm": 4.03125, "learning_rate": 4.359230255563357e-06, "loss": 1.4013632535934448, "step": 1476 }, { "epoch": 1.8187134502923976, "grad_norm": 3.859375, "learning_rate": 4.349846758958085e-06, "loss": 1.3816094398498535, "step": 1478 }, { "epoch": 1.8211757463835028, "grad_norm": 5.40625, "learning_rate": 4.340584703513722e-06, "loss": 1.48891019821167, "step": 1480 }, { "epoch": 1.8236380424746077, "grad_norm": 6.875, "learning_rate": 4.33144423625932e-06, "loss": 1.8420138359069824, "step": 1482 }, { "epoch": 1.8261003385657126, "grad_norm": 5.625, "learning_rate": 4.322425502293797e-06, "loss": 1.484515905380249, "step": 1484 }, { "epoch": 1.8285626346568176, "grad_norm": 4.0, "learning_rate": 4.313528644783633e-06, "loss": 1.1373395919799805, "step": 1486 }, { "epoch": 1.8310249307479225, "grad_norm": 2.5625, "learning_rate": 4.304753804960603e-06, "loss": 1.0549803972244263, "step": 1488 }, { "epoch": 1.8334872268390274, "grad_norm": 3.125, "learning_rate": 4.2961011221195255e-06, "loss": 1.1374645233154297, "step": 1490 }, { "epoch": 1.8359495229301324, "grad_norm": 3.15625, "learning_rate": 4.287570733616063e-06, "loss": 1.2891483306884766, "step": 1492 }, { "epoch": 1.8384118190212373, "grad_norm": 6.15625, "learning_rate": 4.279162774864535e-06, "loss": 1.3952784538269043, "step": 1494 }, { "epoch": 1.8408741151123422, "grad_norm": 11.4375, "learning_rate": 4.270877379335764e-06, "loss": 1.6006450653076172, "step": 1496 }, { "epoch": 1.8433364112034472, "grad_norm": 3.375, "learning_rate": 4.2627146785549675e-06, "loss": 1.6013039350509644, "step": 1498 }, { "epoch": 1.845798707294552, "grad_norm": 5.65625, "learning_rate": 4.254674802099661e-06, "loss": 1.509192943572998, "step": 1500 }, { "epoch": 1.848261003385657, "grad_norm": 5.1875, "learning_rate": 4.2467578775976064e-06, "loss": 1.611980676651001, "step": 1502 }, { "epoch": 1.850723299476762, "grad_norm": 4.125, "learning_rate": 4.238964030724785e-06, "loss": 1.4414465427398682, "step": 1504 }, { "epoch": 1.8531855955678669, "grad_norm": 3.109375, "learning_rate": 4.231293385203395e-06, "loss": 1.5326135158538818, "step": 1506 }, { "epoch": 1.855647891658972, "grad_norm": 7.09375, "learning_rate": 4.2237460627999035e-06, "loss": 1.3705086708068848, "step": 1508 }, { "epoch": 1.858110187750077, "grad_norm": 5.5, "learning_rate": 4.216322183323097e-06, "loss": 1.7913298606872559, "step": 1510 }, { "epoch": 1.860572483841182, "grad_norm": 6.28125, "learning_rate": 4.2090218646221884e-06, "loss": 1.5537046194076538, "step": 1512 }, { "epoch": 1.8630347799322868, "grad_norm": 6.03125, "learning_rate": 4.201845222584946e-06, "loss": 1.7360601425170898, "step": 1514 }, { "epoch": 1.8654970760233918, "grad_norm": 6.34375, "learning_rate": 4.194792371135853e-06, "loss": 1.8205009698867798, "step": 1516 }, { "epoch": 1.867959372114497, "grad_norm": 3.375, "learning_rate": 4.187863422234293e-06, "loss": 1.408042073249817, "step": 1518 }, { "epoch": 1.8704216682056019, "grad_norm": 5.625, "learning_rate": 4.181058485872784e-06, "loss": 1.096937656402588, "step": 1520 }, { "epoch": 1.8728839642967068, "grad_norm": 6.09375, "learning_rate": 4.174377670075222e-06, "loss": 1.3984037637710571, "step": 1522 }, { "epoch": 1.8753462603878117, "grad_norm": 2.15625, "learning_rate": 4.167821080895174e-06, "loss": 1.3008735179901123, "step": 1524 }, { "epoch": 1.8778085564789166, "grad_norm": 3.296875, "learning_rate": 4.161388822414189e-06, "loss": 1.1213737726211548, "step": 1526 }, { "epoch": 1.8802708525700216, "grad_norm": 6.40625, "learning_rate": 4.155080996740145e-06, "loss": 1.3446485996246338, "step": 1528 }, { "epoch": 1.8827331486611265, "grad_norm": 3.546875, "learning_rate": 4.148897704005638e-06, "loss": 1.3206844329833984, "step": 1530 }, { "epoch": 1.8851954447522314, "grad_norm": 48.0, "learning_rate": 4.14283904236638e-06, "loss": 1.3920776844024658, "step": 1532 }, { "epoch": 1.8876577408433364, "grad_norm": 3.3125, "learning_rate": 4.136905107999645e-06, "loss": 1.4610090255737305, "step": 1534 }, { "epoch": 1.8901200369344413, "grad_norm": 2.0625, "learning_rate": 4.13109599510275e-06, "loss": 1.2146025896072388, "step": 1536 }, { "epoch": 1.8925823330255462, "grad_norm": 3.296875, "learning_rate": 4.125411795891547e-06, "loss": 1.1985912322998047, "step": 1538 }, { "epoch": 1.8950446291166512, "grad_norm": 2.4375, "learning_rate": 4.119852600598966e-06, "loss": 1.32261323928833, "step": 1540 }, { "epoch": 1.897506925207756, "grad_norm": 1.4453125, "learning_rate": 4.114418497473584e-06, "loss": 1.1342700719833374, "step": 1542 }, { "epoch": 1.899969221298861, "grad_norm": 5.71875, "learning_rate": 4.109109572778222e-06, "loss": 1.235834002494812, "step": 1544 }, { "epoch": 1.9024315173899662, "grad_norm": 2.859375, "learning_rate": 4.103925910788572e-06, "loss": 1.3796794414520264, "step": 1546 }, { "epoch": 1.9048938134810711, "grad_norm": 3.3125, "learning_rate": 4.0988675937918686e-06, "loss": 1.2857390642166138, "step": 1548 }, { "epoch": 1.907356109572176, "grad_norm": 4.5, "learning_rate": 4.093934702085574e-06, "loss": 1.4970194101333618, "step": 1550 }, { "epoch": 1.909818405663281, "grad_norm": 6.25, "learning_rate": 4.089127313976101e-06, "loss": 1.31523597240448, "step": 1552 }, { "epoch": 1.912280701754386, "grad_norm": 6.5, "learning_rate": 4.084445505777584e-06, "loss": 1.5725702047348022, "step": 1554 }, { "epoch": 1.914742997845491, "grad_norm": 4.75, "learning_rate": 4.079889351810655e-06, "loss": 1.5622414350509644, "step": 1556 }, { "epoch": 1.917205293936596, "grad_norm": 9.125, "learning_rate": 4.0754589244012665e-06, "loss": 1.2499128580093384, "step": 1558 }, { "epoch": 1.919667590027701, "grad_norm": 5.34375, "learning_rate": 4.071154293879545e-06, "loss": 1.224461555480957, "step": 1560 }, { "epoch": 1.9221298861188059, "grad_norm": 7.53125, "learning_rate": 4.066975528578675e-06, "loss": 1.4134670495986938, "step": 1562 }, { "epoch": 1.9245921822099108, "grad_norm": 3.546875, "learning_rate": 4.062922694833813e-06, "loss": 1.2926013469696045, "step": 1564 }, { "epoch": 1.9270544783010157, "grad_norm": 4.6875, "learning_rate": 4.058995856981032e-06, "loss": 0.9741660356521606, "step": 1566 }, { "epoch": 1.9295167743921207, "grad_norm": 4.46875, "learning_rate": 4.055195077356308e-06, "loss": 0.7483295798301697, "step": 1568 }, { "epoch": 1.9319790704832256, "grad_norm": 4.46875, "learning_rate": 4.051520416294521e-06, "loss": 1.2966933250427246, "step": 1570 }, { "epoch": 1.9344413665743305, "grad_norm": 9.4375, "learning_rate": 4.0479719321285045e-06, "loss": 1.2867720127105713, "step": 1572 }, { "epoch": 1.9369036626654355, "grad_norm": 2.4375, "learning_rate": 4.044549681188113e-06, "loss": 1.154860258102417, "step": 1574 }, { "epoch": 1.9393659587565404, "grad_norm": 2.09375, "learning_rate": 4.041253717799337e-06, "loss": 1.0206176042556763, "step": 1576 }, { "epoch": 1.9418282548476453, "grad_norm": 1.3984375, "learning_rate": 4.038084094283428e-06, "loss": 1.0539655685424805, "step": 1578 }, { "epoch": 1.9442905509387503, "grad_norm": 5.78125, "learning_rate": 4.035040860956082e-06, "loss": 1.2525365352630615, "step": 1580 }, { "epoch": 1.9467528470298552, "grad_norm": 6.0625, "learning_rate": 4.032124066126629e-06, "loss": 1.2998080253601074, "step": 1582 }, { "epoch": 1.9492151431209603, "grad_norm": 3.453125, "learning_rate": 4.029333756097271e-06, "loss": 1.5448267459869385, "step": 1584 }, { "epoch": 1.9516774392120653, "grad_norm": 4.40625, "learning_rate": 4.026669975162351e-06, "loss": 1.6457065343856812, "step": 1586 }, { "epoch": 1.9541397353031702, "grad_norm": 6.4375, "learning_rate": 4.02413276560764e-06, "loss": 1.6280792951583862, "step": 1588 }, { "epoch": 1.9566020313942751, "grad_norm": 4.4375, "learning_rate": 4.021722167709676e-06, "loss": 1.5384184122085571, "step": 1590 }, { "epoch": 1.95906432748538, "grad_norm": 4.5, "learning_rate": 4.019438219735116e-06, "loss": 1.6012859344482422, "step": 1592 }, { "epoch": 1.9615266235764852, "grad_norm": 6.4375, "learning_rate": 4.017280957940137e-06, "loss": 1.3362534046173096, "step": 1594 }, { "epoch": 1.9639889196675901, "grad_norm": 2.15625, "learning_rate": 4.015250416569853e-06, "loss": 1.2762130498886108, "step": 1596 }, { "epoch": 1.966451215758695, "grad_norm": 5.25, "learning_rate": 4.013346627857777e-06, "loss": 1.3821439743041992, "step": 1598 }, { "epoch": 1.9689135118498, "grad_norm": 3.984375, "learning_rate": 4.0115696220253025e-06, "loss": 1.5566853284835815, "step": 1600 }, { "epoch": 1.971375807940905, "grad_norm": 11.1875, "learning_rate": 4.009919427281232e-06, "loss": 1.609104037284851, "step": 1602 }, { "epoch": 1.9738381040320099, "grad_norm": 7.1875, "learning_rate": 4.0083960698213234e-06, "loss": 1.6049237251281738, "step": 1604 }, { "epoch": 1.9763004001231148, "grad_norm": 2.359375, "learning_rate": 4.006999573827876e-06, "loss": 1.2179689407348633, "step": 1606 }, { "epoch": 1.9787626962142197, "grad_norm": 6.4375, "learning_rate": 4.005729961469349e-06, "loss": 1.2181804180145264, "step": 1608 }, { "epoch": 1.9812249923053247, "grad_norm": 9.375, "learning_rate": 4.0045872529000035e-06, "loss": 1.6380505561828613, "step": 1610 }, { "epoch": 1.9836872883964296, "grad_norm": 6.9375, "learning_rate": 4.003571466259587e-06, "loss": 1.5696303844451904, "step": 1612 }, { "epoch": 1.9861495844875345, "grad_norm": 9.4375, "learning_rate": 4.002682617673048e-06, "loss": 1.3733805418014526, "step": 1614 }, { "epoch": 1.9886118805786395, "grad_norm": 2.90625, "learning_rate": 4.001920721250273e-06, "loss": 1.317124843597412, "step": 1616 }, { "epoch": 1.9910741766697444, "grad_norm": 11.1875, "learning_rate": 4.001285789085867e-06, "loss": 1.012315034866333, "step": 1618 }, { "epoch": 1.9935364727608493, "grad_norm": 4.6875, "learning_rate": 4.000777831258963e-06, "loss": 1.1209490299224854, "step": 1620 }, { "epoch": 1.9959987688519545, "grad_norm": 4.1875, "learning_rate": 4.000396855833057e-06, "loss": 1.491336464881897, "step": 1622 }, { "epoch": 1.9984610649430594, "grad_norm": 1.546875, "learning_rate": 4.000142868855884e-06, "loss": 1.23062264919281, "step": 1624 }, { "epoch": 2.0, "grad_norm": 4.5625, "learning_rate": 4.0000158743593194e-06, "loss": 1.0397253036499023, "step": 1626 }, { "epoch": 2.0, "step": 1626, "total_flos": 2.5753569883429274e+18, "train_loss": 1.3656025735654513, "train_runtime": 15098.7141, "train_samples_per_second": 1.721, "train_steps_per_second": 0.108 } ], "logging_steps": 2, "max_steps": 1626, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5753569883429274e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }