{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.997894145725116, "eval_steps": 500, "global_step": 4450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011231222799382282, "grad_norm": 6.350573723690268, "learning_rate": 4.494382022471911e-08, "loss": 0.7656, "step": 1 }, { "epoch": 0.0022462445598764565, "grad_norm": 6.396429921824654, "learning_rate": 8.988764044943822e-08, "loss": 0.8046, "step": 2 }, { "epoch": 0.0033693668398146847, "grad_norm": 6.515479866330877, "learning_rate": 1.348314606741573e-07, "loss": 0.7993, "step": 3 }, { "epoch": 0.004492489119752913, "grad_norm": 6.264174843252429, "learning_rate": 1.7977528089887644e-07, "loss": 0.8049, "step": 4 }, { "epoch": 0.005615611399691142, "grad_norm": 6.57185846446011, "learning_rate": 2.247191011235955e-07, "loss": 0.826, "step": 5 }, { "epoch": 0.0067387336796293695, "grad_norm": 6.517583190885418, "learning_rate": 2.696629213483146e-07, "loss": 0.803, "step": 6 }, { "epoch": 0.007861855959567598, "grad_norm": 6.364802286536463, "learning_rate": 3.1460674157303374e-07, "loss": 0.7837, "step": 7 }, { "epoch": 0.008984978239505826, "grad_norm": 6.365458774711909, "learning_rate": 3.5955056179775287e-07, "loss": 0.7702, "step": 8 }, { "epoch": 0.010108100519444054, "grad_norm": 6.254849998413217, "learning_rate": 4.044943820224719e-07, "loss": 0.7543, "step": 9 }, { "epoch": 0.011231222799382283, "grad_norm": 6.296540752960891, "learning_rate": 4.49438202247191e-07, "loss": 0.7835, "step": 10 }, { "epoch": 0.012354345079320511, "grad_norm": 5.708911549789346, "learning_rate": 4.943820224719102e-07, "loss": 0.764, "step": 11 }, { "epoch": 0.013477467359258739, "grad_norm": 5.900819796376942, "learning_rate": 5.393258426966292e-07, "loss": 0.76, "step": 12 }, { "epoch": 0.014600589639196967, "grad_norm": 5.807726787883733, "learning_rate": 5.842696629213484e-07, "loss": 0.7686, "step": 13 }, { "epoch": 0.015723711919135196, "grad_norm": 5.526670510027518, "learning_rate": 6.292134831460675e-07, "loss": 0.7631, "step": 14 }, { "epoch": 0.016846834199073422, "grad_norm": 4.559877051017468, "learning_rate": 6.741573033707865e-07, "loss": 0.684, "step": 15 }, { "epoch": 0.017969956479011652, "grad_norm": 4.475977243208917, "learning_rate": 7.191011235955057e-07, "loss": 0.7226, "step": 16 }, { "epoch": 0.01909307875894988, "grad_norm": 4.295823217269364, "learning_rate": 7.640449438202248e-07, "loss": 0.7131, "step": 17 }, { "epoch": 0.020216201038888108, "grad_norm": 4.129352981067178, "learning_rate": 8.089887640449438e-07, "loss": 0.6764, "step": 18 }, { "epoch": 0.021339323318826337, "grad_norm": 4.136041569248625, "learning_rate": 8.53932584269663e-07, "loss": 0.6996, "step": 19 }, { "epoch": 0.022462445598764567, "grad_norm": 3.127126410742847, "learning_rate": 8.98876404494382e-07, "loss": 0.6585, "step": 20 }, { "epoch": 0.023585567878702793, "grad_norm": 2.7983535971516136, "learning_rate": 9.438202247191013e-07, "loss": 0.6476, "step": 21 }, { "epoch": 0.024708690158641022, "grad_norm": 2.8307801771170498, "learning_rate": 9.887640449438204e-07, "loss": 0.6081, "step": 22 }, { "epoch": 0.025831812438579252, "grad_norm": 2.6693325996833703, "learning_rate": 1.0337078651685394e-06, "loss": 0.6015, "step": 23 }, { "epoch": 0.026954934718517478, "grad_norm": 2.5249433529837058, "learning_rate": 1.0786516853932585e-06, "loss": 0.5871, "step": 24 }, { "epoch": 0.028078056998455708, "grad_norm": 2.7833384535564893, "learning_rate": 1.1235955056179777e-06, "loss": 0.644, "step": 25 }, { "epoch": 0.029201179278393934, "grad_norm": 2.5636212715816846, "learning_rate": 1.1685393258426967e-06, "loss": 0.6212, "step": 26 }, { "epoch": 0.030324301558332163, "grad_norm": 2.13400734054636, "learning_rate": 1.2134831460674157e-06, "loss": 0.6061, "step": 27 }, { "epoch": 0.03144742383827039, "grad_norm": 1.8020788979606925, "learning_rate": 1.258426966292135e-06, "loss": 0.5786, "step": 28 }, { "epoch": 0.03257054611820862, "grad_norm": 1.7251113036312054, "learning_rate": 1.303370786516854e-06, "loss": 0.519, "step": 29 }, { "epoch": 0.033693668398146845, "grad_norm": 1.7845108367392837, "learning_rate": 1.348314606741573e-06, "loss": 0.5688, "step": 30 }, { "epoch": 0.034816790678085074, "grad_norm": 1.5519488204108933, "learning_rate": 1.3932584269662923e-06, "loss": 0.5282, "step": 31 }, { "epoch": 0.035939912958023304, "grad_norm": 1.4564929135509108, "learning_rate": 1.4382022471910115e-06, "loss": 0.5378, "step": 32 }, { "epoch": 0.037063035237961534, "grad_norm": 1.5083545600301442, "learning_rate": 1.4831460674157305e-06, "loss": 0.5256, "step": 33 }, { "epoch": 0.03818615751789976, "grad_norm": 1.2973997209272992, "learning_rate": 1.5280898876404495e-06, "loss": 0.5157, "step": 34 }, { "epoch": 0.03930927979783799, "grad_norm": 1.2442177307671032, "learning_rate": 1.5730337078651686e-06, "loss": 0.5334, "step": 35 }, { "epoch": 0.040432402077776215, "grad_norm": 1.1214494431535786, "learning_rate": 1.6179775280898876e-06, "loss": 0.5457, "step": 36 }, { "epoch": 0.041555524357714445, "grad_norm": 1.0380116256523142, "learning_rate": 1.662921348314607e-06, "loss": 0.5252, "step": 37 }, { "epoch": 0.042678646637652674, "grad_norm": 0.9512314468973523, "learning_rate": 1.707865168539326e-06, "loss": 0.4822, "step": 38 }, { "epoch": 0.043801768917590904, "grad_norm": 1.1749256602840057, "learning_rate": 1.752808988764045e-06, "loss": 0.4876, "step": 39 }, { "epoch": 0.04492489119752913, "grad_norm": 1.1996241804899646, "learning_rate": 1.797752808988764e-06, "loss": 0.5222, "step": 40 }, { "epoch": 0.046048013477467356, "grad_norm": 1.0789479072322812, "learning_rate": 1.8426966292134831e-06, "loss": 0.5101, "step": 41 }, { "epoch": 0.047171135757405586, "grad_norm": 1.0512996176581257, "learning_rate": 1.8876404494382026e-06, "loss": 0.5055, "step": 42 }, { "epoch": 0.048294258037343815, "grad_norm": 0.9746980947816224, "learning_rate": 1.9325842696629214e-06, "loss": 0.504, "step": 43 }, { "epoch": 0.049417380317282045, "grad_norm": 1.0580329624783904, "learning_rate": 1.977528089887641e-06, "loss": 0.5067, "step": 44 }, { "epoch": 0.050540502597220274, "grad_norm": 0.9449814871215534, "learning_rate": 2.02247191011236e-06, "loss": 0.4902, "step": 45 }, { "epoch": 0.051663624877158504, "grad_norm": 0.8168500336470494, "learning_rate": 2.067415730337079e-06, "loss": 0.4573, "step": 46 }, { "epoch": 0.052786747157096726, "grad_norm": 0.8804112383078366, "learning_rate": 2.112359550561798e-06, "loss": 0.506, "step": 47 }, { "epoch": 0.053909869437034956, "grad_norm": 0.7359559115946035, "learning_rate": 2.157303370786517e-06, "loss": 0.4451, "step": 48 }, { "epoch": 0.055032991716973186, "grad_norm": 0.902331110525257, "learning_rate": 2.202247191011236e-06, "loss": 0.4927, "step": 49 }, { "epoch": 0.056156113996911415, "grad_norm": 0.7375774424153347, "learning_rate": 2.2471910112359554e-06, "loss": 0.4777, "step": 50 }, { "epoch": 0.057279236276849645, "grad_norm": 0.7302558001904716, "learning_rate": 2.2921348314606744e-06, "loss": 0.4696, "step": 51 }, { "epoch": 0.05840235855678787, "grad_norm": 0.7169095239110965, "learning_rate": 2.3370786516853934e-06, "loss": 0.4555, "step": 52 }, { "epoch": 0.0595254808367261, "grad_norm": 0.6897677482032446, "learning_rate": 2.3820224719101125e-06, "loss": 0.4401, "step": 53 }, { "epoch": 0.060648603116664326, "grad_norm": 0.661989923280511, "learning_rate": 2.4269662921348315e-06, "loss": 0.4453, "step": 54 }, { "epoch": 0.061771725396602556, "grad_norm": 0.6769101351556205, "learning_rate": 2.4719101123595505e-06, "loss": 0.4402, "step": 55 }, { "epoch": 0.06289484767654079, "grad_norm": 0.6812241363735265, "learning_rate": 2.51685393258427e-06, "loss": 0.4645, "step": 56 }, { "epoch": 0.06401796995647902, "grad_norm": 0.6757451847169096, "learning_rate": 2.561797752808989e-06, "loss": 0.4535, "step": 57 }, { "epoch": 0.06514109223641724, "grad_norm": 0.6326256743817255, "learning_rate": 2.606741573033708e-06, "loss": 0.4194, "step": 58 }, { "epoch": 0.06626421451635547, "grad_norm": 0.6800176400190888, "learning_rate": 2.6516853932584274e-06, "loss": 0.4541, "step": 59 }, { "epoch": 0.06738733679629369, "grad_norm": 0.7164423610987447, "learning_rate": 2.696629213483146e-06, "loss": 0.4497, "step": 60 }, { "epoch": 0.06851045907623192, "grad_norm": 0.7413520702346671, "learning_rate": 2.7415730337078655e-06, "loss": 0.4753, "step": 61 }, { "epoch": 0.06963358135617015, "grad_norm": 0.7016085047019666, "learning_rate": 2.7865168539325845e-06, "loss": 0.4436, "step": 62 }, { "epoch": 0.07075670363610838, "grad_norm": 0.6265784239939073, "learning_rate": 2.8314606741573035e-06, "loss": 0.4552, "step": 63 }, { "epoch": 0.07187982591604661, "grad_norm": 0.674311288104449, "learning_rate": 2.876404494382023e-06, "loss": 0.458, "step": 64 }, { "epoch": 0.07300294819598484, "grad_norm": 0.6446161293508706, "learning_rate": 2.9213483146067416e-06, "loss": 0.4525, "step": 65 }, { "epoch": 0.07412607047592307, "grad_norm": 0.6444476566168919, "learning_rate": 2.966292134831461e-06, "loss": 0.4184, "step": 66 }, { "epoch": 0.0752491927558613, "grad_norm": 0.6233096607639449, "learning_rate": 3.0112359550561796e-06, "loss": 0.457, "step": 67 }, { "epoch": 0.07637231503579953, "grad_norm": 0.6648165469874694, "learning_rate": 3.056179775280899e-06, "loss": 0.4466, "step": 68 }, { "epoch": 0.07749543731573776, "grad_norm": 0.6486022510349203, "learning_rate": 3.1011235955056185e-06, "loss": 0.4209, "step": 69 }, { "epoch": 0.07861855959567599, "grad_norm": 0.6229828143529706, "learning_rate": 3.146067415730337e-06, "loss": 0.432, "step": 70 }, { "epoch": 0.0797416818756142, "grad_norm": 0.5873293721571403, "learning_rate": 3.1910112359550566e-06, "loss": 0.4333, "step": 71 }, { "epoch": 0.08086480415555243, "grad_norm": 0.6550686084928332, "learning_rate": 3.235955056179775e-06, "loss": 0.4342, "step": 72 }, { "epoch": 0.08198792643549066, "grad_norm": 0.687632103391066, "learning_rate": 3.2808988764044946e-06, "loss": 0.4524, "step": 73 }, { "epoch": 0.08311104871542889, "grad_norm": 0.5997642028480324, "learning_rate": 3.325842696629214e-06, "loss": 0.4151, "step": 74 }, { "epoch": 0.08423417099536712, "grad_norm": 0.6141443910145349, "learning_rate": 3.3707865168539327e-06, "loss": 0.4107, "step": 75 }, { "epoch": 0.08535729327530535, "grad_norm": 0.6086934688843036, "learning_rate": 3.415730337078652e-06, "loss": 0.4464, "step": 76 }, { "epoch": 0.08648041555524358, "grad_norm": 0.6547734824010637, "learning_rate": 3.4606741573033707e-06, "loss": 0.4325, "step": 77 }, { "epoch": 0.08760353783518181, "grad_norm": 0.6816992889881991, "learning_rate": 3.50561797752809e-06, "loss": 0.4632, "step": 78 }, { "epoch": 0.08872666011512004, "grad_norm": 0.6858188783737146, "learning_rate": 3.5505617977528096e-06, "loss": 0.4527, "step": 79 }, { "epoch": 0.08984978239505827, "grad_norm": 0.709230605422352, "learning_rate": 3.595505617977528e-06, "loss": 0.4385, "step": 80 }, { "epoch": 0.0909729046749965, "grad_norm": 0.6185609555383293, "learning_rate": 3.6404494382022476e-06, "loss": 0.4295, "step": 81 }, { "epoch": 0.09209602695493471, "grad_norm": 0.5593705773746459, "learning_rate": 3.6853932584269662e-06, "loss": 0.4107, "step": 82 }, { "epoch": 0.09321914923487294, "grad_norm": 0.6449522343921041, "learning_rate": 3.7303370786516857e-06, "loss": 0.4297, "step": 83 }, { "epoch": 0.09434227151481117, "grad_norm": 0.6162415569253238, "learning_rate": 3.775280898876405e-06, "loss": 0.3956, "step": 84 }, { "epoch": 0.0954653937947494, "grad_norm": 0.6285861854737811, "learning_rate": 3.820224719101124e-06, "loss": 0.4209, "step": 85 }, { "epoch": 0.09658851607468763, "grad_norm": 0.6005391526643882, "learning_rate": 3.865168539325843e-06, "loss": 0.4344, "step": 86 }, { "epoch": 0.09771163835462586, "grad_norm": 0.6037551054743073, "learning_rate": 3.910112359550562e-06, "loss": 0.4238, "step": 87 }, { "epoch": 0.09883476063456409, "grad_norm": 0.6317561271143405, "learning_rate": 3.955056179775282e-06, "loss": 0.4313, "step": 88 }, { "epoch": 0.09995788291450232, "grad_norm": 0.6740920615829088, "learning_rate": 4.000000000000001e-06, "loss": 0.4228, "step": 89 }, { "epoch": 0.10108100519444055, "grad_norm": 0.6163555222810396, "learning_rate": 4.04494382022472e-06, "loss": 0.4297, "step": 90 }, { "epoch": 0.10220412747437878, "grad_norm": 0.5924398711122163, "learning_rate": 4.089887640449439e-06, "loss": 0.413, "step": 91 }, { "epoch": 0.10332724975431701, "grad_norm": 0.6450622353301865, "learning_rate": 4.134831460674158e-06, "loss": 0.4055, "step": 92 }, { "epoch": 0.10445037203425522, "grad_norm": 0.5951767882437019, "learning_rate": 4.179775280898877e-06, "loss": 0.4151, "step": 93 }, { "epoch": 0.10557349431419345, "grad_norm": 0.5644268389534823, "learning_rate": 4.224719101123596e-06, "loss": 0.3983, "step": 94 }, { "epoch": 0.10669661659413168, "grad_norm": 0.6348084088963297, "learning_rate": 4.269662921348315e-06, "loss": 0.4071, "step": 95 }, { "epoch": 0.10781973887406991, "grad_norm": 0.5862325727673693, "learning_rate": 4.314606741573034e-06, "loss": 0.43, "step": 96 }, { "epoch": 0.10894286115400814, "grad_norm": 0.6202803723418159, "learning_rate": 4.359550561797753e-06, "loss": 0.4189, "step": 97 }, { "epoch": 0.11006598343394637, "grad_norm": 0.5944376188372148, "learning_rate": 4.404494382022472e-06, "loss": 0.4106, "step": 98 }, { "epoch": 0.1111891057138846, "grad_norm": 0.6071370637839878, "learning_rate": 4.449438202247192e-06, "loss": 0.4168, "step": 99 }, { "epoch": 0.11231222799382283, "grad_norm": 0.6135945175323315, "learning_rate": 4.494382022471911e-06, "loss": 0.4011, "step": 100 }, { "epoch": 0.11343535027376106, "grad_norm": 0.5901989509886602, "learning_rate": 4.53932584269663e-06, "loss": 0.4226, "step": 101 }, { "epoch": 0.11455847255369929, "grad_norm": 0.5719385399517765, "learning_rate": 4.584269662921349e-06, "loss": 0.4207, "step": 102 }, { "epoch": 0.11568159483363752, "grad_norm": 0.6073755395198991, "learning_rate": 4.629213483146068e-06, "loss": 0.4263, "step": 103 }, { "epoch": 0.11680471711357573, "grad_norm": 0.6019653421562294, "learning_rate": 4.674157303370787e-06, "loss": 0.4149, "step": 104 }, { "epoch": 0.11792783939351396, "grad_norm": 0.6153507547879732, "learning_rate": 4.719101123595506e-06, "loss": 0.4089, "step": 105 }, { "epoch": 0.1190509616734522, "grad_norm": 0.5829914630184178, "learning_rate": 4.764044943820225e-06, "loss": 0.4172, "step": 106 }, { "epoch": 0.12017408395339042, "grad_norm": 0.6272470387655589, "learning_rate": 4.808988764044944e-06, "loss": 0.4265, "step": 107 }, { "epoch": 0.12129720623332865, "grad_norm": 0.6284052319460708, "learning_rate": 4.853932584269663e-06, "loss": 0.4422, "step": 108 }, { "epoch": 0.12242032851326688, "grad_norm": 0.6197638566033578, "learning_rate": 4.898876404494383e-06, "loss": 0.4058, "step": 109 }, { "epoch": 0.12354345079320511, "grad_norm": 0.6033230010642877, "learning_rate": 4.943820224719101e-06, "loss": 0.4245, "step": 110 }, { "epoch": 0.12466657307314334, "grad_norm": 0.630498552024084, "learning_rate": 4.988764044943821e-06, "loss": 0.3942, "step": 111 }, { "epoch": 0.12578969535308157, "grad_norm": 0.6493542531086268, "learning_rate": 5.03370786516854e-06, "loss": 0.4363, "step": 112 }, { "epoch": 0.1269128176330198, "grad_norm": 0.6237798245035485, "learning_rate": 5.078651685393259e-06, "loss": 0.4159, "step": 113 }, { "epoch": 0.12803593991295803, "grad_norm": 0.6597150507909345, "learning_rate": 5.123595505617978e-06, "loss": 0.4351, "step": 114 }, { "epoch": 0.12915906219289625, "grad_norm": 0.5980066757551353, "learning_rate": 5.168539325842698e-06, "loss": 0.389, "step": 115 }, { "epoch": 0.1302821844728345, "grad_norm": 0.6404924520081225, "learning_rate": 5.213483146067416e-06, "loss": 0.4229, "step": 116 }, { "epoch": 0.1314053067527727, "grad_norm": 0.6144448252232805, "learning_rate": 5.258426966292135e-06, "loss": 0.4096, "step": 117 }, { "epoch": 0.13252842903271095, "grad_norm": 0.638697930023902, "learning_rate": 5.303370786516855e-06, "loss": 0.4296, "step": 118 }, { "epoch": 0.13365155131264916, "grad_norm": 0.6304024554599722, "learning_rate": 5.348314606741574e-06, "loss": 0.4136, "step": 119 }, { "epoch": 0.13477467359258738, "grad_norm": 0.6253982419034039, "learning_rate": 5.393258426966292e-06, "loss": 0.4171, "step": 120 }, { "epoch": 0.13589779587252562, "grad_norm": 0.5881906390342778, "learning_rate": 5.438202247191011e-06, "loss": 0.4079, "step": 121 }, { "epoch": 0.13702091815246384, "grad_norm": 0.6940061067312768, "learning_rate": 5.483146067415731e-06, "loss": 0.42, "step": 122 }, { "epoch": 0.13814404043240208, "grad_norm": 0.620303869723247, "learning_rate": 5.52808988764045e-06, "loss": 0.4058, "step": 123 }, { "epoch": 0.1392671627123403, "grad_norm": 0.5917642817410118, "learning_rate": 5.573033707865169e-06, "loss": 0.4048, "step": 124 }, { "epoch": 0.14039028499227854, "grad_norm": 0.578694471042719, "learning_rate": 5.617977528089889e-06, "loss": 0.3975, "step": 125 }, { "epoch": 0.14151340727221676, "grad_norm": 0.6356057703453721, "learning_rate": 5.662921348314607e-06, "loss": 0.4232, "step": 126 }, { "epoch": 0.142636529552155, "grad_norm": 0.63035695832545, "learning_rate": 5.707865168539326e-06, "loss": 0.4061, "step": 127 }, { "epoch": 0.14375965183209322, "grad_norm": 0.5901449106195371, "learning_rate": 5.752808988764046e-06, "loss": 0.394, "step": 128 }, { "epoch": 0.14488277411203146, "grad_norm": 0.6552219033314495, "learning_rate": 5.797752808988765e-06, "loss": 0.4228, "step": 129 }, { "epoch": 0.14600589639196968, "grad_norm": 0.5897358290667347, "learning_rate": 5.842696629213483e-06, "loss": 0.4097, "step": 130 }, { "epoch": 0.1471290186719079, "grad_norm": 0.6506458224613755, "learning_rate": 5.887640449438202e-06, "loss": 0.4193, "step": 131 }, { "epoch": 0.14825214095184613, "grad_norm": 0.5884872839199834, "learning_rate": 5.932584269662922e-06, "loss": 0.3958, "step": 132 }, { "epoch": 0.14937526323178435, "grad_norm": 0.6383694194701096, "learning_rate": 5.977528089887641e-06, "loss": 0.4166, "step": 133 }, { "epoch": 0.1504983855117226, "grad_norm": 0.5988685305190284, "learning_rate": 6.022471910112359e-06, "loss": 0.3987, "step": 134 }, { "epoch": 0.1516215077916608, "grad_norm": 0.6008483348458779, "learning_rate": 6.06741573033708e-06, "loss": 0.4054, "step": 135 }, { "epoch": 0.15274463007159905, "grad_norm": 0.6371268436916242, "learning_rate": 6.112359550561798e-06, "loss": 0.415, "step": 136 }, { "epoch": 0.15386775235153727, "grad_norm": 0.5873165979042162, "learning_rate": 6.157303370786517e-06, "loss": 0.3788, "step": 137 }, { "epoch": 0.1549908746314755, "grad_norm": 0.6321649406199696, "learning_rate": 6.202247191011237e-06, "loss": 0.3874, "step": 138 }, { "epoch": 0.15611399691141373, "grad_norm": 0.5905156513112094, "learning_rate": 6.247191011235956e-06, "loss": 0.3865, "step": 139 }, { "epoch": 0.15723711919135197, "grad_norm": 0.6605429195870949, "learning_rate": 6.292134831460674e-06, "loss": 0.4218, "step": 140 }, { "epoch": 0.1583602414712902, "grad_norm": 0.6197445457794568, "learning_rate": 6.337078651685393e-06, "loss": 0.3986, "step": 141 }, { "epoch": 0.1594833637512284, "grad_norm": 0.6510054129028581, "learning_rate": 6.382022471910113e-06, "loss": 0.4126, "step": 142 }, { "epoch": 0.16060648603116665, "grad_norm": 0.6637714327455877, "learning_rate": 6.426966292134832e-06, "loss": 0.4105, "step": 143 }, { "epoch": 0.16172960831110486, "grad_norm": 0.6320653980077073, "learning_rate": 6.47191011235955e-06, "loss": 0.3876, "step": 144 }, { "epoch": 0.1628527305910431, "grad_norm": 0.6224333021729421, "learning_rate": 6.51685393258427e-06, "loss": 0.3975, "step": 145 }, { "epoch": 0.16397585287098132, "grad_norm": 0.6523728785937435, "learning_rate": 6.561797752808989e-06, "loss": 0.4088, "step": 146 }, { "epoch": 0.16509897515091956, "grad_norm": 0.6399877537131693, "learning_rate": 6.606741573033708e-06, "loss": 0.4243, "step": 147 }, { "epoch": 0.16622209743085778, "grad_norm": 0.6785871204181669, "learning_rate": 6.651685393258428e-06, "loss": 0.4118, "step": 148 }, { "epoch": 0.16734521971079602, "grad_norm": 0.6793168925384406, "learning_rate": 6.696629213483147e-06, "loss": 0.3843, "step": 149 }, { "epoch": 0.16846834199073424, "grad_norm": 0.6765524645653702, "learning_rate": 6.741573033707865e-06, "loss": 0.3929, "step": 150 }, { "epoch": 0.16959146427067248, "grad_norm": 0.6500442496922796, "learning_rate": 6.786516853932584e-06, "loss": 0.4157, "step": 151 }, { "epoch": 0.1707145865506107, "grad_norm": 0.6278514376034972, "learning_rate": 6.831460674157304e-06, "loss": 0.384, "step": 152 }, { "epoch": 0.1718377088305489, "grad_norm": 0.6786012740240396, "learning_rate": 6.876404494382023e-06, "loss": 0.4213, "step": 153 }, { "epoch": 0.17296083111048716, "grad_norm": 0.6103427664078407, "learning_rate": 6.921348314606741e-06, "loss": 0.3827, "step": 154 }, { "epoch": 0.17408395339042537, "grad_norm": 0.6373236421700184, "learning_rate": 6.966292134831461e-06, "loss": 0.4021, "step": 155 }, { "epoch": 0.17520707567036362, "grad_norm": 0.6527172283097773, "learning_rate": 7.01123595505618e-06, "loss": 0.415, "step": 156 }, { "epoch": 0.17633019795030183, "grad_norm": 0.6299621366906155, "learning_rate": 7.056179775280899e-06, "loss": 0.3922, "step": 157 }, { "epoch": 0.17745332023024007, "grad_norm": 0.6205260967234852, "learning_rate": 7.101123595505619e-06, "loss": 0.404, "step": 158 }, { "epoch": 0.1785764425101783, "grad_norm": 0.6314521969215471, "learning_rate": 7.146067415730338e-06, "loss": 0.4, "step": 159 }, { "epoch": 0.17969956479011653, "grad_norm": 0.5826299033616584, "learning_rate": 7.191011235955056e-06, "loss": 0.3936, "step": 160 }, { "epoch": 0.18082268707005475, "grad_norm": 0.6340335724127424, "learning_rate": 7.235955056179775e-06, "loss": 0.4234, "step": 161 }, { "epoch": 0.181945809349993, "grad_norm": 0.632801507097347, "learning_rate": 7.280898876404495e-06, "loss": 0.4098, "step": 162 }, { "epoch": 0.1830689316299312, "grad_norm": 0.6446077682121513, "learning_rate": 7.325842696629214e-06, "loss": 0.3846, "step": 163 }, { "epoch": 0.18419205390986942, "grad_norm": 0.5774497489736088, "learning_rate": 7.3707865168539325e-06, "loss": 0.3763, "step": 164 }, { "epoch": 0.18531517618980767, "grad_norm": 0.6295990680542162, "learning_rate": 7.415730337078652e-06, "loss": 0.4073, "step": 165 }, { "epoch": 0.18643829846974588, "grad_norm": 0.6085108426525031, "learning_rate": 7.460674157303371e-06, "loss": 0.4139, "step": 166 }, { "epoch": 0.18756142074968413, "grad_norm": 0.62992988248332, "learning_rate": 7.50561797752809e-06, "loss": 0.4138, "step": 167 }, { "epoch": 0.18868454302962234, "grad_norm": 0.5836300572336676, "learning_rate": 7.55056179775281e-06, "loss": 0.3736, "step": 168 }, { "epoch": 0.18980766530956059, "grad_norm": 0.6265603677231923, "learning_rate": 7.5955056179775284e-06, "loss": 0.375, "step": 169 }, { "epoch": 0.1909307875894988, "grad_norm": 0.5870206417930635, "learning_rate": 7.640449438202247e-06, "loss": 0.4102, "step": 170 }, { "epoch": 0.19205390986943705, "grad_norm": 0.59819579045645, "learning_rate": 7.685393258426966e-06, "loss": 0.4121, "step": 171 }, { "epoch": 0.19317703214937526, "grad_norm": 0.6122535374224991, "learning_rate": 7.730337078651686e-06, "loss": 0.3967, "step": 172 }, { "epoch": 0.1943001544293135, "grad_norm": 0.5721725402345272, "learning_rate": 7.775280898876405e-06, "loss": 0.4097, "step": 173 }, { "epoch": 0.19542327670925172, "grad_norm": 0.5792461141723496, "learning_rate": 7.820224719101124e-06, "loss": 0.396, "step": 174 }, { "epoch": 0.19654639898918994, "grad_norm": 0.5566347774031598, "learning_rate": 7.865168539325843e-06, "loss": 0.3748, "step": 175 }, { "epoch": 0.19766952126912818, "grad_norm": 0.5947888148668254, "learning_rate": 7.910112359550563e-06, "loss": 0.3984, "step": 176 }, { "epoch": 0.1987926435490664, "grad_norm": 0.5729701631007845, "learning_rate": 7.955056179775281e-06, "loss": 0.3732, "step": 177 }, { "epoch": 0.19991576582900464, "grad_norm": 0.6308612644303433, "learning_rate": 8.000000000000001e-06, "loss": 0.3779, "step": 178 }, { "epoch": 0.20103888810894285, "grad_norm": 0.602666895171434, "learning_rate": 8.04494382022472e-06, "loss": 0.4168, "step": 179 }, { "epoch": 0.2021620103888811, "grad_norm": 0.594878749394712, "learning_rate": 8.08988764044944e-06, "loss": 0.3801, "step": 180 }, { "epoch": 0.2032851326688193, "grad_norm": 0.5819103337590406, "learning_rate": 8.13483146067416e-06, "loss": 0.355, "step": 181 }, { "epoch": 0.20440825494875756, "grad_norm": 0.6795698806664618, "learning_rate": 8.179775280898877e-06, "loss": 0.4105, "step": 182 }, { "epoch": 0.20553137722869577, "grad_norm": 0.6176253376211935, "learning_rate": 8.224719101123596e-06, "loss": 0.4052, "step": 183 }, { "epoch": 0.20665449950863402, "grad_norm": 0.5647204982647233, "learning_rate": 8.269662921348315e-06, "loss": 0.4002, "step": 184 }, { "epoch": 0.20777762178857223, "grad_norm": 0.630533784994395, "learning_rate": 8.314606741573035e-06, "loss": 0.3836, "step": 185 }, { "epoch": 0.20890074406851045, "grad_norm": 0.6619715293898263, "learning_rate": 8.359550561797754e-06, "loss": 0.4138, "step": 186 }, { "epoch": 0.2100238663484487, "grad_norm": 0.5966032860325108, "learning_rate": 8.404494382022472e-06, "loss": 0.3896, "step": 187 }, { "epoch": 0.2111469886283869, "grad_norm": 0.5681040377904722, "learning_rate": 8.449438202247192e-06, "loss": 0.3812, "step": 188 }, { "epoch": 0.21227011090832515, "grad_norm": 0.6236783202259868, "learning_rate": 8.494382022471911e-06, "loss": 0.3927, "step": 189 }, { "epoch": 0.21339323318826336, "grad_norm": 0.7424858758861099, "learning_rate": 8.53932584269663e-06, "loss": 0.4452, "step": 190 }, { "epoch": 0.2145163554682016, "grad_norm": 0.7125468158261177, "learning_rate": 8.58426966292135e-06, "loss": 0.3609, "step": 191 }, { "epoch": 0.21563947774813982, "grad_norm": 0.6157709477352911, "learning_rate": 8.629213483146068e-06, "loss": 0.4078, "step": 192 }, { "epoch": 0.21676260002807807, "grad_norm": 0.6307743638624984, "learning_rate": 8.674157303370788e-06, "loss": 0.3683, "step": 193 }, { "epoch": 0.21788572230801628, "grad_norm": 0.6873653868391115, "learning_rate": 8.719101123595506e-06, "loss": 0.4076, "step": 194 }, { "epoch": 0.21900884458795453, "grad_norm": 0.6311262794185428, "learning_rate": 8.764044943820226e-06, "loss": 0.3853, "step": 195 }, { "epoch": 0.22013196686789274, "grad_norm": 0.6680702186561743, "learning_rate": 8.808988764044944e-06, "loss": 0.4135, "step": 196 }, { "epoch": 0.22125508914783096, "grad_norm": 0.6058308110971806, "learning_rate": 8.853932584269664e-06, "loss": 0.4112, "step": 197 }, { "epoch": 0.2223782114277692, "grad_norm": 0.607906340987603, "learning_rate": 8.898876404494383e-06, "loss": 0.3979, "step": 198 }, { "epoch": 0.22350133370770742, "grad_norm": 0.5987320863883703, "learning_rate": 8.943820224719102e-06, "loss": 0.4017, "step": 199 }, { "epoch": 0.22462445598764566, "grad_norm": 0.6240143951994302, "learning_rate": 8.988764044943822e-06, "loss": 0.3869, "step": 200 }, { "epoch": 0.22574757826758388, "grad_norm": 0.6484774713204989, "learning_rate": 9.033707865168541e-06, "loss": 0.3682, "step": 201 }, { "epoch": 0.22687070054752212, "grad_norm": 0.6398454864125076, "learning_rate": 9.07865168539326e-06, "loss": 0.4031, "step": 202 }, { "epoch": 0.22799382282746034, "grad_norm": 0.6071698475923116, "learning_rate": 9.123595505617978e-06, "loss": 0.4081, "step": 203 }, { "epoch": 0.22911694510739858, "grad_norm": 0.5841395417042217, "learning_rate": 9.168539325842698e-06, "loss": 0.3842, "step": 204 }, { "epoch": 0.2302400673873368, "grad_norm": 0.6674576527002625, "learning_rate": 9.213483146067417e-06, "loss": 0.4123, "step": 205 }, { "epoch": 0.23136318966727504, "grad_norm": 0.6640606414419046, "learning_rate": 9.258426966292136e-06, "loss": 0.3939, "step": 206 }, { "epoch": 0.23248631194721325, "grad_norm": 0.7214937941577327, "learning_rate": 9.303370786516854e-06, "loss": 0.371, "step": 207 }, { "epoch": 0.23360943422715147, "grad_norm": 0.6793273791859282, "learning_rate": 9.348314606741574e-06, "loss": 0.3836, "step": 208 }, { "epoch": 0.2347325565070897, "grad_norm": 0.6871956158863509, "learning_rate": 9.393258426966294e-06, "loss": 0.3932, "step": 209 }, { "epoch": 0.23585567878702793, "grad_norm": 0.7258816009143834, "learning_rate": 9.438202247191012e-06, "loss": 0.414, "step": 210 }, { "epoch": 0.23697880106696617, "grad_norm": 0.6492267333090342, "learning_rate": 9.483146067415732e-06, "loss": 0.416, "step": 211 }, { "epoch": 0.2381019233469044, "grad_norm": 0.6041513090496686, "learning_rate": 9.52808988764045e-06, "loss": 0.3934, "step": 212 }, { "epoch": 0.23922504562684263, "grad_norm": 0.6474076777760845, "learning_rate": 9.57303370786517e-06, "loss": 0.3975, "step": 213 }, { "epoch": 0.24034816790678085, "grad_norm": 0.7092379598633823, "learning_rate": 9.617977528089888e-06, "loss": 0.3781, "step": 214 }, { "epoch": 0.2414712901867191, "grad_norm": 0.6438002116207439, "learning_rate": 9.662921348314608e-06, "loss": 0.3996, "step": 215 }, { "epoch": 0.2425944124666573, "grad_norm": 0.6622015252030728, "learning_rate": 9.707865168539326e-06, "loss": 0.3832, "step": 216 }, { "epoch": 0.24371753474659555, "grad_norm": 0.6078817480395128, "learning_rate": 9.752808988764046e-06, "loss": 0.3952, "step": 217 }, { "epoch": 0.24484065702653376, "grad_norm": 0.6551657494425536, "learning_rate": 9.797752808988766e-06, "loss": 0.3791, "step": 218 }, { "epoch": 0.24596377930647198, "grad_norm": 0.6433323266927976, "learning_rate": 9.842696629213484e-06, "loss": 0.3774, "step": 219 }, { "epoch": 0.24708690158641022, "grad_norm": 0.6188804166540856, "learning_rate": 9.887640449438202e-06, "loss": 0.3712, "step": 220 }, { "epoch": 0.24821002386634844, "grad_norm": 0.671966975214175, "learning_rate": 9.932584269662922e-06, "loss": 0.4239, "step": 221 }, { "epoch": 0.24933314614628668, "grad_norm": 0.5947753032122628, "learning_rate": 9.977528089887642e-06, "loss": 0.3964, "step": 222 }, { "epoch": 0.2504562684262249, "grad_norm": 0.6607682301911262, "learning_rate": 1.0022471910112362e-05, "loss": 0.3903, "step": 223 }, { "epoch": 0.25157939070616314, "grad_norm": 0.6356386896335997, "learning_rate": 1.006741573033708e-05, "loss": 0.4251, "step": 224 }, { "epoch": 0.25270251298610136, "grad_norm": 0.6559033997420501, "learning_rate": 1.01123595505618e-05, "loss": 0.4137, "step": 225 }, { "epoch": 0.2538256352660396, "grad_norm": 0.614989946033004, "learning_rate": 1.0157303370786518e-05, "loss": 0.3601, "step": 226 }, { "epoch": 0.25494875754597784, "grad_norm": 0.6410991676909698, "learning_rate": 1.0202247191011236e-05, "loss": 0.402, "step": 227 }, { "epoch": 0.25607187982591606, "grad_norm": 0.642417180353421, "learning_rate": 1.0247191011235956e-05, "loss": 0.3877, "step": 228 }, { "epoch": 0.2571950021058543, "grad_norm": 0.6125173108674232, "learning_rate": 1.0292134831460674e-05, "loss": 0.3669, "step": 229 }, { "epoch": 0.2583181243857925, "grad_norm": 0.5685847570038455, "learning_rate": 1.0337078651685396e-05, "loss": 0.374, "step": 230 }, { "epoch": 0.2594412466657307, "grad_norm": 0.6544675947013324, "learning_rate": 1.0382022471910114e-05, "loss": 0.3834, "step": 231 }, { "epoch": 0.260564368945669, "grad_norm": 0.6068343322603438, "learning_rate": 1.0426966292134832e-05, "loss": 0.3693, "step": 232 }, { "epoch": 0.2616874912256072, "grad_norm": 0.6453583897346943, "learning_rate": 1.0471910112359552e-05, "loss": 0.4048, "step": 233 }, { "epoch": 0.2628106135055454, "grad_norm": 0.7033942331352795, "learning_rate": 1.051685393258427e-05, "loss": 0.3875, "step": 234 }, { "epoch": 0.2639337357854836, "grad_norm": 0.5715008472023303, "learning_rate": 1.0561797752808988e-05, "loss": 0.4069, "step": 235 }, { "epoch": 0.2650568580654219, "grad_norm": 0.6494208885445116, "learning_rate": 1.060674157303371e-05, "loss": 0.4093, "step": 236 }, { "epoch": 0.2661799803453601, "grad_norm": 0.6812214256281384, "learning_rate": 1.0651685393258428e-05, "loss": 0.4425, "step": 237 }, { "epoch": 0.26730310262529833, "grad_norm": 0.5924772804766913, "learning_rate": 1.0696629213483148e-05, "loss": 0.387, "step": 238 }, { "epoch": 0.26842622490523654, "grad_norm": 0.5764162625896104, "learning_rate": 1.0741573033707866e-05, "loss": 0.3827, "step": 239 }, { "epoch": 0.26954934718517476, "grad_norm": 0.5965307495222452, "learning_rate": 1.0786516853932584e-05, "loss": 0.4129, "step": 240 }, { "epoch": 0.27067246946511303, "grad_norm": 0.6003603754111506, "learning_rate": 1.0831460674157304e-05, "loss": 0.3775, "step": 241 }, { "epoch": 0.27179559174505125, "grad_norm": 0.5922972797219954, "learning_rate": 1.0876404494382022e-05, "loss": 0.3877, "step": 242 }, { "epoch": 0.27291871402498946, "grad_norm": 0.5685410001707504, "learning_rate": 1.0921348314606744e-05, "loss": 0.377, "step": 243 }, { "epoch": 0.2740418363049277, "grad_norm": 0.576628928037405, "learning_rate": 1.0966292134831462e-05, "loss": 0.3971, "step": 244 }, { "epoch": 0.27516495858486595, "grad_norm": 0.5579453193582883, "learning_rate": 1.101123595505618e-05, "loss": 0.378, "step": 245 }, { "epoch": 0.27628808086480416, "grad_norm": 0.6011765922681929, "learning_rate": 1.10561797752809e-05, "loss": 0.4117, "step": 246 }, { "epoch": 0.2774112031447424, "grad_norm": 0.6091646970674486, "learning_rate": 1.1101123595505618e-05, "loss": 0.4426, "step": 247 }, { "epoch": 0.2785343254246806, "grad_norm": 0.5775758129728008, "learning_rate": 1.1146067415730338e-05, "loss": 0.3972, "step": 248 }, { "epoch": 0.27965744770461887, "grad_norm": 0.6604411713439484, "learning_rate": 1.1191011235955056e-05, "loss": 0.3965, "step": 249 }, { "epoch": 0.2807805699845571, "grad_norm": 0.6156558831358852, "learning_rate": 1.1235955056179778e-05, "loss": 0.405, "step": 250 }, { "epoch": 0.2819036922644953, "grad_norm": 0.5623612665541853, "learning_rate": 1.1280898876404496e-05, "loss": 0.3901, "step": 251 }, { "epoch": 0.2830268145444335, "grad_norm": 0.6746239022438368, "learning_rate": 1.1325842696629214e-05, "loss": 0.3876, "step": 252 }, { "epoch": 0.28414993682437173, "grad_norm": 0.5964994154377549, "learning_rate": 1.1370786516853934e-05, "loss": 0.4163, "step": 253 }, { "epoch": 0.28527305910431, "grad_norm": 0.6082500353591788, "learning_rate": 1.1415730337078652e-05, "loss": 0.4286, "step": 254 }, { "epoch": 0.2863961813842482, "grad_norm": 0.6157506350996006, "learning_rate": 1.146067415730337e-05, "loss": 0.3872, "step": 255 }, { "epoch": 0.28751930366418643, "grad_norm": 0.567260212959717, "learning_rate": 1.1505617977528092e-05, "loss": 0.4341, "step": 256 }, { "epoch": 0.28864242594412465, "grad_norm": 0.5791317852426999, "learning_rate": 1.155056179775281e-05, "loss": 0.3499, "step": 257 }, { "epoch": 0.2897655482240629, "grad_norm": 0.5741440071104256, "learning_rate": 1.159550561797753e-05, "loss": 0.3744, "step": 258 }, { "epoch": 0.29088867050400113, "grad_norm": 0.577442355673411, "learning_rate": 1.1640449438202248e-05, "loss": 0.3959, "step": 259 }, { "epoch": 0.29201179278393935, "grad_norm": 0.5701366764697986, "learning_rate": 1.1685393258426966e-05, "loss": 0.3661, "step": 260 }, { "epoch": 0.29313491506387757, "grad_norm": 0.5866338939450678, "learning_rate": 1.1730337078651686e-05, "loss": 0.3817, "step": 261 }, { "epoch": 0.2942580373438158, "grad_norm": 0.5869871652126405, "learning_rate": 1.1775280898876404e-05, "loss": 0.4031, "step": 262 }, { "epoch": 0.29538115962375405, "grad_norm": 0.6113639662622179, "learning_rate": 1.1820224719101126e-05, "loss": 0.3665, "step": 263 }, { "epoch": 0.29650428190369227, "grad_norm": 0.5815782372208732, "learning_rate": 1.1865168539325844e-05, "loss": 0.3718, "step": 264 }, { "epoch": 0.2976274041836305, "grad_norm": 0.5770971481710677, "learning_rate": 1.1910112359550562e-05, "loss": 0.3887, "step": 265 }, { "epoch": 0.2987505264635687, "grad_norm": 0.6378543358730397, "learning_rate": 1.1955056179775282e-05, "loss": 0.4026, "step": 266 }, { "epoch": 0.29987364874350697, "grad_norm": 0.6366625582188497, "learning_rate": 1.2e-05, "loss": 0.3996, "step": 267 }, { "epoch": 0.3009967710234452, "grad_norm": 0.5644318306665338, "learning_rate": 1.2044943820224718e-05, "loss": 0.3736, "step": 268 }, { "epoch": 0.3021198933033834, "grad_norm": 0.5760984664326271, "learning_rate": 1.208988764044944e-05, "loss": 0.3831, "step": 269 }, { "epoch": 0.3032430155833216, "grad_norm": 0.6073186836248484, "learning_rate": 1.213483146067416e-05, "loss": 0.3767, "step": 270 }, { "epoch": 0.3043661378632599, "grad_norm": 0.658052587041447, "learning_rate": 1.2179775280898878e-05, "loss": 0.376, "step": 271 }, { "epoch": 0.3054892601431981, "grad_norm": 0.5705277159249025, "learning_rate": 1.2224719101123596e-05, "loss": 0.4047, "step": 272 }, { "epoch": 0.3066123824231363, "grad_norm": 0.5943437710250703, "learning_rate": 1.2269662921348316e-05, "loss": 0.3938, "step": 273 }, { "epoch": 0.30773550470307454, "grad_norm": 0.6594875627018434, "learning_rate": 1.2314606741573034e-05, "loss": 0.3932, "step": 274 }, { "epoch": 0.30885862698301275, "grad_norm": 0.6227401289864777, "learning_rate": 1.2359550561797752e-05, "loss": 0.368, "step": 275 }, { "epoch": 0.309981749262951, "grad_norm": 0.5836285370383634, "learning_rate": 1.2404494382022474e-05, "loss": 0.3896, "step": 276 }, { "epoch": 0.31110487154288924, "grad_norm": 0.6356736965747966, "learning_rate": 1.2449438202247192e-05, "loss": 0.4001, "step": 277 }, { "epoch": 0.31222799382282745, "grad_norm": 0.5961019104865444, "learning_rate": 1.2494382022471912e-05, "loss": 0.3898, "step": 278 }, { "epoch": 0.31335111610276567, "grad_norm": 0.5894192052755339, "learning_rate": 1.253932584269663e-05, "loss": 0.3821, "step": 279 }, { "epoch": 0.31447423838270394, "grad_norm": 0.6139515035640439, "learning_rate": 1.2584269662921348e-05, "loss": 0.3869, "step": 280 }, { "epoch": 0.31559736066264216, "grad_norm": 0.5710811085875507, "learning_rate": 1.2629213483146068e-05, "loss": 0.3715, "step": 281 }, { "epoch": 0.3167204829425804, "grad_norm": 0.6174376536441986, "learning_rate": 1.2674157303370786e-05, "loss": 0.3887, "step": 282 }, { "epoch": 0.3178436052225186, "grad_norm": 0.5409663843795987, "learning_rate": 1.2719101123595508e-05, "loss": 0.3585, "step": 283 }, { "epoch": 0.3189667275024568, "grad_norm": 0.5842025638662178, "learning_rate": 1.2764044943820226e-05, "loss": 0.3648, "step": 284 }, { "epoch": 0.3200898497823951, "grad_norm": 0.6288491942213608, "learning_rate": 1.2808988764044944e-05, "loss": 0.3943, "step": 285 }, { "epoch": 0.3212129720623333, "grad_norm": 0.6339928168107889, "learning_rate": 1.2853932584269664e-05, "loss": 0.412, "step": 286 }, { "epoch": 0.3223360943422715, "grad_norm": 0.5805070016232142, "learning_rate": 1.2898876404494382e-05, "loss": 0.3936, "step": 287 }, { "epoch": 0.3234592166222097, "grad_norm": 0.6031249399955438, "learning_rate": 1.29438202247191e-05, "loss": 0.3849, "step": 288 }, { "epoch": 0.324582338902148, "grad_norm": 0.6515523772005117, "learning_rate": 1.2988764044943822e-05, "loss": 0.4003, "step": 289 }, { "epoch": 0.3257054611820862, "grad_norm": 0.6020531221934884, "learning_rate": 1.303370786516854e-05, "loss": 0.4059, "step": 290 }, { "epoch": 0.3268285834620244, "grad_norm": 0.5911957722542618, "learning_rate": 1.307865168539326e-05, "loss": 0.3936, "step": 291 }, { "epoch": 0.32795170574196264, "grad_norm": 0.6296525120353835, "learning_rate": 1.3123595505617978e-05, "loss": 0.3638, "step": 292 }, { "epoch": 0.3290748280219009, "grad_norm": 0.6035340356032662, "learning_rate": 1.3168539325842698e-05, "loss": 0.3633, "step": 293 }, { "epoch": 0.3301979503018391, "grad_norm": 0.5640220762566707, "learning_rate": 1.3213483146067416e-05, "loss": 0.3936, "step": 294 }, { "epoch": 0.33132107258177734, "grad_norm": 0.659950946658227, "learning_rate": 1.3258426966292135e-05, "loss": 0.3939, "step": 295 }, { "epoch": 0.33244419486171556, "grad_norm": 0.6547964598471208, "learning_rate": 1.3303370786516856e-05, "loss": 0.3797, "step": 296 }, { "epoch": 0.3335673171416538, "grad_norm": 0.5628971266321644, "learning_rate": 1.3348314606741574e-05, "loss": 0.3878, "step": 297 }, { "epoch": 0.33469043942159205, "grad_norm": 0.5916043311443739, "learning_rate": 1.3393258426966294e-05, "loss": 0.3705, "step": 298 }, { "epoch": 0.33581356170153026, "grad_norm": 0.642531175947998, "learning_rate": 1.3438202247191012e-05, "loss": 0.3841, "step": 299 }, { "epoch": 0.3369366839814685, "grad_norm": 0.5942467702427874, "learning_rate": 1.348314606741573e-05, "loss": 0.3864, "step": 300 }, { "epoch": 0.3380598062614067, "grad_norm": 0.5576069981166258, "learning_rate": 1.352808988764045e-05, "loss": 0.3803, "step": 301 }, { "epoch": 0.33918292854134496, "grad_norm": 0.637796095078525, "learning_rate": 1.3573033707865169e-05, "loss": 0.3845, "step": 302 }, { "epoch": 0.3403060508212832, "grad_norm": 0.5424364787066615, "learning_rate": 1.361797752808989e-05, "loss": 0.3885, "step": 303 }, { "epoch": 0.3414291731012214, "grad_norm": 0.666509048066953, "learning_rate": 1.3662921348314608e-05, "loss": 0.3941, "step": 304 }, { "epoch": 0.3425522953811596, "grad_norm": 0.5667187793072223, "learning_rate": 1.3707865168539327e-05, "loss": 0.3568, "step": 305 }, { "epoch": 0.3436754176610978, "grad_norm": 0.6364904468384852, "learning_rate": 1.3752808988764046e-05, "loss": 0.3807, "step": 306 }, { "epoch": 0.3447985399410361, "grad_norm": 0.5971120227080189, "learning_rate": 1.3797752808988765e-05, "loss": 0.3848, "step": 307 }, { "epoch": 0.3459216622209743, "grad_norm": 0.5555370524108522, "learning_rate": 1.3842696629213483e-05, "loss": 0.3849, "step": 308 }, { "epoch": 0.34704478450091253, "grad_norm": 0.6236731385363299, "learning_rate": 1.3887640449438204e-05, "loss": 0.384, "step": 309 }, { "epoch": 0.34816790678085074, "grad_norm": 0.6149787882648424, "learning_rate": 1.3932584269662923e-05, "loss": 0.3959, "step": 310 }, { "epoch": 0.349291029060789, "grad_norm": 0.6215560633514793, "learning_rate": 1.3977528089887642e-05, "loss": 0.3918, "step": 311 }, { "epoch": 0.35041415134072723, "grad_norm": 0.6209310433594782, "learning_rate": 1.402247191011236e-05, "loss": 0.3855, "step": 312 }, { "epoch": 0.35153727362066545, "grad_norm": 0.5994586964661478, "learning_rate": 1.4067415730337079e-05, "loss": 0.373, "step": 313 }, { "epoch": 0.35266039590060366, "grad_norm": 0.6188656847491276, "learning_rate": 1.4112359550561799e-05, "loss": 0.3892, "step": 314 }, { "epoch": 0.35378351818054193, "grad_norm": 0.5701426572944555, "learning_rate": 1.4157303370786517e-05, "loss": 0.3587, "step": 315 }, { "epoch": 0.35490664046048015, "grad_norm": 0.5735315497169365, "learning_rate": 1.4202247191011238e-05, "loss": 0.3834, "step": 316 }, { "epoch": 0.35602976274041837, "grad_norm": 0.6041588815641994, "learning_rate": 1.4247191011235957e-05, "loss": 0.4063, "step": 317 }, { "epoch": 0.3571528850203566, "grad_norm": 0.6413512842290581, "learning_rate": 1.4292134831460676e-05, "loss": 0.3643, "step": 318 }, { "epoch": 0.3582760073002948, "grad_norm": 0.5633456389709988, "learning_rate": 1.4337078651685395e-05, "loss": 0.3782, "step": 319 }, { "epoch": 0.35939912958023307, "grad_norm": 0.6268598664045065, "learning_rate": 1.4382022471910113e-05, "loss": 0.3959, "step": 320 }, { "epoch": 0.3605222518601713, "grad_norm": 0.5800501810819474, "learning_rate": 1.4426966292134833e-05, "loss": 0.4013, "step": 321 }, { "epoch": 0.3616453741401095, "grad_norm": 0.5769816751771281, "learning_rate": 1.447191011235955e-05, "loss": 0.3828, "step": 322 }, { "epoch": 0.3627684964200477, "grad_norm": 0.5949290584396976, "learning_rate": 1.4516853932584272e-05, "loss": 0.4089, "step": 323 }, { "epoch": 0.363891618699986, "grad_norm": 0.5852043429988183, "learning_rate": 1.456179775280899e-05, "loss": 0.3652, "step": 324 }, { "epoch": 0.3650147409799242, "grad_norm": 0.572491832081323, "learning_rate": 1.4606741573033709e-05, "loss": 0.3984, "step": 325 }, { "epoch": 0.3661378632598624, "grad_norm": 0.5972392800723866, "learning_rate": 1.4651685393258429e-05, "loss": 0.39, "step": 326 }, { "epoch": 0.36726098553980063, "grad_norm": 0.6363257306206275, "learning_rate": 1.4696629213483147e-05, "loss": 0.3562, "step": 327 }, { "epoch": 0.36838410781973885, "grad_norm": 0.5907940254264683, "learning_rate": 1.4741573033707865e-05, "loss": 0.3911, "step": 328 }, { "epoch": 0.3695072300996771, "grad_norm": 0.5871546010212381, "learning_rate": 1.4786516853932587e-05, "loss": 0.379, "step": 329 }, { "epoch": 0.37063035237961534, "grad_norm": 0.6618710636475701, "learning_rate": 1.4831460674157305e-05, "loss": 0.3946, "step": 330 }, { "epoch": 0.37175347465955355, "grad_norm": 0.5734074870533208, "learning_rate": 1.4876404494382025e-05, "loss": 0.4272, "step": 331 }, { "epoch": 0.37287659693949177, "grad_norm": 0.6643286523550941, "learning_rate": 1.4921348314606743e-05, "loss": 0.3963, "step": 332 }, { "epoch": 0.37399971921943004, "grad_norm": 0.6096880457625593, "learning_rate": 1.4966292134831461e-05, "loss": 0.4006, "step": 333 }, { "epoch": 0.37512284149936825, "grad_norm": 0.5812451405437918, "learning_rate": 1.501123595505618e-05, "loss": 0.3975, "step": 334 }, { "epoch": 0.37624596377930647, "grad_norm": 0.5757226170172344, "learning_rate": 1.5056179775280899e-05, "loss": 0.3851, "step": 335 }, { "epoch": 0.3773690860592447, "grad_norm": 0.5954813467325601, "learning_rate": 1.510112359550562e-05, "loss": 0.3817, "step": 336 }, { "epoch": 0.37849220833918296, "grad_norm": 0.5726297825723002, "learning_rate": 1.5146067415730339e-05, "loss": 0.3882, "step": 337 }, { "epoch": 0.37961533061912117, "grad_norm": 0.575616838800246, "learning_rate": 1.5191011235955057e-05, "loss": 0.4013, "step": 338 }, { "epoch": 0.3807384528990594, "grad_norm": 0.5656104756542751, "learning_rate": 1.5235955056179777e-05, "loss": 0.3715, "step": 339 }, { "epoch": 0.3818615751789976, "grad_norm": 0.5700630782329689, "learning_rate": 1.5280898876404495e-05, "loss": 0.382, "step": 340 }, { "epoch": 0.3829846974589358, "grad_norm": 0.5901652073128528, "learning_rate": 1.5325842696629213e-05, "loss": 0.3922, "step": 341 }, { "epoch": 0.3841078197388741, "grad_norm": 0.5680443357781206, "learning_rate": 1.537078651685393e-05, "loss": 0.3956, "step": 342 }, { "epoch": 0.3852309420188123, "grad_norm": 0.6094240246912197, "learning_rate": 1.5415730337078653e-05, "loss": 0.4001, "step": 343 }, { "epoch": 0.3863540642987505, "grad_norm": 0.6025863182389569, "learning_rate": 1.546067415730337e-05, "loss": 0.3991, "step": 344 }, { "epoch": 0.38747718657868874, "grad_norm": 0.5849785735196502, "learning_rate": 1.5505617977528093e-05, "loss": 0.3561, "step": 345 }, { "epoch": 0.388600308858627, "grad_norm": 0.6155288868533991, "learning_rate": 1.555056179775281e-05, "loss": 0.3885, "step": 346 }, { "epoch": 0.3897234311385652, "grad_norm": 0.528624344003196, "learning_rate": 1.559550561797753e-05, "loss": 0.3368, "step": 347 }, { "epoch": 0.39084655341850344, "grad_norm": 0.5742784751968288, "learning_rate": 1.5640449438202247e-05, "loss": 0.3908, "step": 348 }, { "epoch": 0.39196967569844166, "grad_norm": 0.5790550121976005, "learning_rate": 1.568539325842697e-05, "loss": 0.3935, "step": 349 }, { "epoch": 0.39309279797837987, "grad_norm": 0.5837147473027602, "learning_rate": 1.5730337078651687e-05, "loss": 0.3882, "step": 350 }, { "epoch": 0.39421592025831814, "grad_norm": 0.59876417374738, "learning_rate": 1.5775280898876405e-05, "loss": 0.3687, "step": 351 }, { "epoch": 0.39533904253825636, "grad_norm": 0.5577383109842206, "learning_rate": 1.5820224719101127e-05, "loss": 0.3838, "step": 352 }, { "epoch": 0.3964621648181946, "grad_norm": 0.6253260037735091, "learning_rate": 1.5865168539325845e-05, "loss": 0.3988, "step": 353 }, { "epoch": 0.3975852870981328, "grad_norm": 0.5709668147622707, "learning_rate": 1.5910112359550563e-05, "loss": 0.3797, "step": 354 }, { "epoch": 0.39870840937807106, "grad_norm": 0.5766274168346207, "learning_rate": 1.595505617977528e-05, "loss": 0.4017, "step": 355 }, { "epoch": 0.3998315316580093, "grad_norm": 0.5960841144559348, "learning_rate": 1.6000000000000003e-05, "loss": 0.3784, "step": 356 }, { "epoch": 0.4009546539379475, "grad_norm": 0.5637097286513578, "learning_rate": 1.604494382022472e-05, "loss": 0.3835, "step": 357 }, { "epoch": 0.4020777762178857, "grad_norm": 0.621385534061573, "learning_rate": 1.608988764044944e-05, "loss": 0.379, "step": 358 }, { "epoch": 0.4032008984978239, "grad_norm": 0.5804215759491298, "learning_rate": 1.6134831460674157e-05, "loss": 0.3717, "step": 359 }, { "epoch": 0.4043240207777622, "grad_norm": 0.6850149117508036, "learning_rate": 1.617977528089888e-05, "loss": 0.3929, "step": 360 }, { "epoch": 0.4054471430577004, "grad_norm": 0.5678267887232396, "learning_rate": 1.6224719101123597e-05, "loss": 0.3708, "step": 361 }, { "epoch": 0.4065702653376386, "grad_norm": 0.5846240027794225, "learning_rate": 1.626966292134832e-05, "loss": 0.3806, "step": 362 }, { "epoch": 0.40769338761757684, "grad_norm": 0.641861479794963, "learning_rate": 1.6314606741573037e-05, "loss": 0.3751, "step": 363 }, { "epoch": 0.4088165098975151, "grad_norm": 0.551258860522498, "learning_rate": 1.6359550561797755e-05, "loss": 0.3847, "step": 364 }, { "epoch": 0.40993963217745333, "grad_norm": 0.5436695988486194, "learning_rate": 1.6404494382022473e-05, "loss": 0.3588, "step": 365 }, { "epoch": 0.41106275445739154, "grad_norm": 0.6273802594115025, "learning_rate": 1.644943820224719e-05, "loss": 0.3893, "step": 366 }, { "epoch": 0.41218587673732976, "grad_norm": 0.5829276428140747, "learning_rate": 1.649438202247191e-05, "loss": 0.3761, "step": 367 }, { "epoch": 0.41330899901726803, "grad_norm": 0.6510879644948344, "learning_rate": 1.653932584269663e-05, "loss": 0.3875, "step": 368 }, { "epoch": 0.41443212129720625, "grad_norm": 0.5895996214987, "learning_rate": 1.658426966292135e-05, "loss": 0.3861, "step": 369 }, { "epoch": 0.41555524357714446, "grad_norm": 0.5696237361887272, "learning_rate": 1.662921348314607e-05, "loss": 0.3733, "step": 370 }, { "epoch": 0.4166783658570827, "grad_norm": 0.6452128769900847, "learning_rate": 1.667415730337079e-05, "loss": 0.377, "step": 371 }, { "epoch": 0.4178014881370209, "grad_norm": 0.5933187468692044, "learning_rate": 1.6719101123595507e-05, "loss": 0.3727, "step": 372 }, { "epoch": 0.41892461041695916, "grad_norm": 0.5956891237289303, "learning_rate": 1.6764044943820225e-05, "loss": 0.409, "step": 373 }, { "epoch": 0.4200477326968974, "grad_norm": 0.6421959612844145, "learning_rate": 1.6808988764044943e-05, "loss": 0.4176, "step": 374 }, { "epoch": 0.4211708549768356, "grad_norm": 0.5877612385670545, "learning_rate": 1.6853932584269665e-05, "loss": 0.3813, "step": 375 }, { "epoch": 0.4222939772567738, "grad_norm": 0.5522655569800495, "learning_rate": 1.6898876404494383e-05, "loss": 0.3635, "step": 376 }, { "epoch": 0.4234170995367121, "grad_norm": 0.570615445713445, "learning_rate": 1.6943820224719105e-05, "loss": 0.3464, "step": 377 }, { "epoch": 0.4245402218166503, "grad_norm": 0.5335527636986674, "learning_rate": 1.6988764044943823e-05, "loss": 0.3519, "step": 378 }, { "epoch": 0.4256633440965885, "grad_norm": 0.5633249413573972, "learning_rate": 1.703370786516854e-05, "loss": 0.3671, "step": 379 }, { "epoch": 0.42678646637652673, "grad_norm": 0.5962150121982345, "learning_rate": 1.707865168539326e-05, "loss": 0.3638, "step": 380 }, { "epoch": 0.42790958865646495, "grad_norm": 0.533130081409531, "learning_rate": 1.7123595505617977e-05, "loss": 0.3723, "step": 381 }, { "epoch": 0.4290327109364032, "grad_norm": 0.5232079977615192, "learning_rate": 1.71685393258427e-05, "loss": 0.3377, "step": 382 }, { "epoch": 0.43015583321634143, "grad_norm": 0.6669436739174833, "learning_rate": 1.7213483146067417e-05, "loss": 0.3649, "step": 383 }, { "epoch": 0.43127895549627965, "grad_norm": 0.58553539025849, "learning_rate": 1.7258426966292135e-05, "loss": 0.3902, "step": 384 }, { "epoch": 0.43240207777621786, "grad_norm": 0.5805175926211644, "learning_rate": 1.7303370786516857e-05, "loss": 0.3988, "step": 385 }, { "epoch": 0.43352520005615613, "grad_norm": 0.622462053904498, "learning_rate": 1.7348314606741575e-05, "loss": 0.3726, "step": 386 }, { "epoch": 0.43464832233609435, "grad_norm": 0.6586460393239919, "learning_rate": 1.7393258426966293e-05, "loss": 0.382, "step": 387 }, { "epoch": 0.43577144461603257, "grad_norm": 0.503597593779572, "learning_rate": 1.743820224719101e-05, "loss": 0.3548, "step": 388 }, { "epoch": 0.4368945668959708, "grad_norm": 0.6269680293566149, "learning_rate": 1.7483146067415733e-05, "loss": 0.3894, "step": 389 }, { "epoch": 0.43801768917590905, "grad_norm": 0.5826375120126366, "learning_rate": 1.752808988764045e-05, "loss": 0.3814, "step": 390 }, { "epoch": 0.43914081145584727, "grad_norm": 0.5730693678129642, "learning_rate": 1.757303370786517e-05, "loss": 0.4214, "step": 391 }, { "epoch": 0.4402639337357855, "grad_norm": 0.5808702128043581, "learning_rate": 1.7617977528089887e-05, "loss": 0.354, "step": 392 }, { "epoch": 0.4413870560157237, "grad_norm": 0.5974314280889683, "learning_rate": 1.766292134831461e-05, "loss": 0.3776, "step": 393 }, { "epoch": 0.4425101782956619, "grad_norm": 0.5707766947857629, "learning_rate": 1.7707865168539327e-05, "loss": 0.3818, "step": 394 }, { "epoch": 0.4436333005756002, "grad_norm": 0.5675159319972002, "learning_rate": 1.7752808988764045e-05, "loss": 0.4048, "step": 395 }, { "epoch": 0.4447564228555384, "grad_norm": 0.5855000642824909, "learning_rate": 1.7797752808988767e-05, "loss": 0.3669, "step": 396 }, { "epoch": 0.4458795451354766, "grad_norm": 0.5890500820148369, "learning_rate": 1.7842696629213485e-05, "loss": 0.3529, "step": 397 }, { "epoch": 0.44700266741541483, "grad_norm": 0.5765910274102177, "learning_rate": 1.7887640449438203e-05, "loss": 0.4004, "step": 398 }, { "epoch": 0.4481257896953531, "grad_norm": 0.5850026850384389, "learning_rate": 1.793258426966292e-05, "loss": 0.4024, "step": 399 }, { "epoch": 0.4492489119752913, "grad_norm": 0.60842577325865, "learning_rate": 1.7977528089887643e-05, "loss": 0.362, "step": 400 }, { "epoch": 0.45037203425522954, "grad_norm": 0.598714080261855, "learning_rate": 1.802247191011236e-05, "loss": 0.4056, "step": 401 }, { "epoch": 0.45149515653516775, "grad_norm": 0.5460494368263945, "learning_rate": 1.8067415730337083e-05, "loss": 0.3563, "step": 402 }, { "epoch": 0.45261827881510597, "grad_norm": 0.5338120861959073, "learning_rate": 1.81123595505618e-05, "loss": 0.3629, "step": 403 }, { "epoch": 0.45374140109504424, "grad_norm": 0.5715606361300019, "learning_rate": 1.815730337078652e-05, "loss": 0.3717, "step": 404 }, { "epoch": 0.45486452337498245, "grad_norm": 0.541068995106628, "learning_rate": 1.8202247191011237e-05, "loss": 0.3779, "step": 405 }, { "epoch": 0.45598764565492067, "grad_norm": 0.5723001753094253, "learning_rate": 1.8247191011235956e-05, "loss": 0.3882, "step": 406 }, { "epoch": 0.4571107679348589, "grad_norm": 0.5405763992724918, "learning_rate": 1.8292134831460674e-05, "loss": 0.3762, "step": 407 }, { "epoch": 0.45823389021479716, "grad_norm": 0.5159698191034968, "learning_rate": 1.8337078651685395e-05, "loss": 0.377, "step": 408 }, { "epoch": 0.4593570124947354, "grad_norm": 0.5354052943635401, "learning_rate": 1.8382022471910113e-05, "loss": 0.3735, "step": 409 }, { "epoch": 0.4604801347746736, "grad_norm": 0.5507330621386544, "learning_rate": 1.8426966292134835e-05, "loss": 0.3941, "step": 410 }, { "epoch": 0.4616032570546118, "grad_norm": 0.5470411737981918, "learning_rate": 1.8471910112359553e-05, "loss": 0.3788, "step": 411 }, { "epoch": 0.4627263793345501, "grad_norm": 0.6279126208387347, "learning_rate": 1.851685393258427e-05, "loss": 0.3766, "step": 412 }, { "epoch": 0.4638495016144883, "grad_norm": 0.5581331082608008, "learning_rate": 1.856179775280899e-05, "loss": 0.3548, "step": 413 }, { "epoch": 0.4649726238944265, "grad_norm": 0.6339597817219623, "learning_rate": 1.8606741573033708e-05, "loss": 0.3928, "step": 414 }, { "epoch": 0.4660957461743647, "grad_norm": 0.5885950731841739, "learning_rate": 1.8651685393258426e-05, "loss": 0.3834, "step": 415 }, { "epoch": 0.46721886845430294, "grad_norm": 0.5517474243216776, "learning_rate": 1.8696629213483147e-05, "loss": 0.372, "step": 416 }, { "epoch": 0.4683419907342412, "grad_norm": 0.5371558479053717, "learning_rate": 1.8741573033707866e-05, "loss": 0.3781, "step": 417 }, { "epoch": 0.4694651130141794, "grad_norm": 0.5972062873696606, "learning_rate": 1.8786516853932587e-05, "loss": 0.4045, "step": 418 }, { "epoch": 0.47058823529411764, "grad_norm": 0.5289075828695805, "learning_rate": 1.8831460674157305e-05, "loss": 0.3604, "step": 419 }, { "epoch": 0.47171135757405586, "grad_norm": 0.5595781567840159, "learning_rate": 1.8876404494382024e-05, "loss": 0.3733, "step": 420 }, { "epoch": 0.4728344798539941, "grad_norm": 0.5283542090574247, "learning_rate": 1.8921348314606742e-05, "loss": 0.3817, "step": 421 }, { "epoch": 0.47395760213393234, "grad_norm": 0.5224861754856025, "learning_rate": 1.8966292134831463e-05, "loss": 0.3846, "step": 422 }, { "epoch": 0.47508072441387056, "grad_norm": 0.5693567792868641, "learning_rate": 1.901123595505618e-05, "loss": 0.3784, "step": 423 }, { "epoch": 0.4762038466938088, "grad_norm": 0.586099489925582, "learning_rate": 1.90561797752809e-05, "loss": 0.3762, "step": 424 }, { "epoch": 0.477326968973747, "grad_norm": 0.5172936141285605, "learning_rate": 1.910112359550562e-05, "loss": 0.3833, "step": 425 }, { "epoch": 0.47845009125368526, "grad_norm": 0.5666644649128297, "learning_rate": 1.914606741573034e-05, "loss": 0.3839, "step": 426 }, { "epoch": 0.4795732135336235, "grad_norm": 0.5860548355924229, "learning_rate": 1.9191011235955058e-05, "loss": 0.3716, "step": 427 }, { "epoch": 0.4806963358135617, "grad_norm": 0.5520174477409845, "learning_rate": 1.9235955056179776e-05, "loss": 0.3696, "step": 428 }, { "epoch": 0.4818194580934999, "grad_norm": 0.5402453132910056, "learning_rate": 1.9280898876404497e-05, "loss": 0.3769, "step": 429 }, { "epoch": 0.4829425803734382, "grad_norm": 0.5670157993103069, "learning_rate": 1.9325842696629215e-05, "loss": 0.3546, "step": 430 }, { "epoch": 0.4840657026533764, "grad_norm": 0.5378630874757251, "learning_rate": 1.9370786516853934e-05, "loss": 0.3831, "step": 431 }, { "epoch": 0.4851888249333146, "grad_norm": 0.5178321480341342, "learning_rate": 1.9415730337078652e-05, "loss": 0.3796, "step": 432 }, { "epoch": 0.4863119472132528, "grad_norm": 0.5500034972413501, "learning_rate": 1.9460674157303373e-05, "loss": 0.4023, "step": 433 }, { "epoch": 0.4874350694931911, "grad_norm": 0.4790912474912375, "learning_rate": 1.950561797752809e-05, "loss": 0.3695, "step": 434 }, { "epoch": 0.4885581917731293, "grad_norm": 0.520106712842702, "learning_rate": 1.955056179775281e-05, "loss": 0.3907, "step": 435 }, { "epoch": 0.48968131405306753, "grad_norm": 0.5270381224070033, "learning_rate": 1.959550561797753e-05, "loss": 0.3746, "step": 436 }, { "epoch": 0.49080443633300574, "grad_norm": 0.5585640386718159, "learning_rate": 1.964044943820225e-05, "loss": 0.3968, "step": 437 }, { "epoch": 0.49192755861294396, "grad_norm": 0.5212838454555632, "learning_rate": 1.9685393258426968e-05, "loss": 0.3848, "step": 438 }, { "epoch": 0.49305068089288223, "grad_norm": 0.5223182787955911, "learning_rate": 1.9730337078651686e-05, "loss": 0.3871, "step": 439 }, { "epoch": 0.49417380317282045, "grad_norm": 0.5014928363223362, "learning_rate": 1.9775280898876404e-05, "loss": 0.3749, "step": 440 }, { "epoch": 0.49529692545275866, "grad_norm": 0.5448751880848072, "learning_rate": 1.9820224719101126e-05, "loss": 0.4037, "step": 441 }, { "epoch": 0.4964200477326969, "grad_norm": 0.5115396988679667, "learning_rate": 1.9865168539325844e-05, "loss": 0.3578, "step": 442 }, { "epoch": 0.49754317001263515, "grad_norm": 0.5226126289636163, "learning_rate": 1.9910112359550565e-05, "loss": 0.3813, "step": 443 }, { "epoch": 0.49866629229257337, "grad_norm": 0.5651234942385593, "learning_rate": 1.9955056179775283e-05, "loss": 0.4008, "step": 444 }, { "epoch": 0.4997894145725116, "grad_norm": 0.5414290026509725, "learning_rate": 2e-05, "loss": 0.3634, "step": 445 }, { "epoch": 0.5009125368524499, "grad_norm": 0.5062706466010933, "learning_rate": 1.9999996923444976e-05, "loss": 0.3679, "step": 446 }, { "epoch": 0.502035659132388, "grad_norm": 0.6284172666370126, "learning_rate": 1.9999987693781806e-05, "loss": 0.39, "step": 447 }, { "epoch": 0.5031587814123263, "grad_norm": 0.523476357886049, "learning_rate": 1.9999972311016158e-05, "loss": 0.3851, "step": 448 }, { "epoch": 0.5042819036922644, "grad_norm": 0.5198224598673091, "learning_rate": 1.99999507751575e-05, "loss": 0.3722, "step": 449 }, { "epoch": 0.5054050259722027, "grad_norm": 0.5990656789702519, "learning_rate": 1.999992308621909e-05, "loss": 0.3961, "step": 450 }, { "epoch": 0.506528148252141, "grad_norm": 0.5337336925829737, "learning_rate": 1.9999889244217957e-05, "loss": 0.3661, "step": 451 }, { "epoch": 0.5076512705320791, "grad_norm": 0.5265236086660914, "learning_rate": 1.9999849249174933e-05, "loss": 0.3763, "step": 452 }, { "epoch": 0.5087743928120174, "grad_norm": 0.6072251444919325, "learning_rate": 1.9999803101114622e-05, "loss": 0.3702, "step": 453 }, { "epoch": 0.5098975150919557, "grad_norm": 0.6242928399695532, "learning_rate": 1.9999750800065415e-05, "loss": 0.3953, "step": 454 }, { "epoch": 0.5110206373718938, "grad_norm": 0.5179538772395289, "learning_rate": 1.9999692346059504e-05, "loss": 0.3873, "step": 455 }, { "epoch": 0.5121437596518321, "grad_norm": 0.5639243942734614, "learning_rate": 1.9999627739132847e-05, "loss": 0.3975, "step": 456 }, { "epoch": 0.5132668819317703, "grad_norm": 0.6222398095764557, "learning_rate": 1.9999556979325203e-05, "loss": 0.3826, "step": 457 }, { "epoch": 0.5143900042117086, "grad_norm": 0.5381548600095061, "learning_rate": 1.999948006668011e-05, "loss": 0.4006, "step": 458 }, { "epoch": 0.5155131264916468, "grad_norm": 0.5460529769570657, "learning_rate": 1.999939700124489e-05, "loss": 0.3779, "step": 459 }, { "epoch": 0.516636248771585, "grad_norm": 0.5285009639724508, "learning_rate": 1.999930778307066e-05, "loss": 0.4117, "step": 460 }, { "epoch": 0.5177593710515233, "grad_norm": 0.588582651422445, "learning_rate": 1.9999212412212313e-05, "loss": 0.4135, "step": 461 }, { "epoch": 0.5188824933314614, "grad_norm": 0.5262946007341981, "learning_rate": 1.9999110888728527e-05, "loss": 0.3836, "step": 462 }, { "epoch": 0.5200056156113997, "grad_norm": 0.5074018080150626, "learning_rate": 1.999900321268178e-05, "loss": 0.3879, "step": 463 }, { "epoch": 0.521128737891338, "grad_norm": 0.5699895099551994, "learning_rate": 1.9998889384138324e-05, "loss": 0.3994, "step": 464 }, { "epoch": 0.5222518601712761, "grad_norm": 0.5488889184993216, "learning_rate": 1.9998769403168195e-05, "loss": 0.3787, "step": 465 }, { "epoch": 0.5233749824512144, "grad_norm": 0.587601256959231, "learning_rate": 1.9998643269845225e-05, "loss": 0.4028, "step": 466 }, { "epoch": 0.5244981047311527, "grad_norm": 0.5470076750391482, "learning_rate": 1.9998510984247015e-05, "loss": 0.3869, "step": 467 }, { "epoch": 0.5256212270110908, "grad_norm": 0.5490698566612296, "learning_rate": 1.9998372546454973e-05, "loss": 0.4069, "step": 468 }, { "epoch": 0.5267443492910291, "grad_norm": 0.5377412654267874, "learning_rate": 1.999822795655427e-05, "loss": 0.3622, "step": 469 }, { "epoch": 0.5278674715709673, "grad_norm": 0.5398403722418982, "learning_rate": 1.9998077214633884e-05, "loss": 0.3785, "step": 470 }, { "epoch": 0.5289905938509055, "grad_norm": 0.49636950695396015, "learning_rate": 1.999792032078656e-05, "loss": 0.3526, "step": 471 }, { "epoch": 0.5301137161308438, "grad_norm": 0.6168098343297038, "learning_rate": 1.9997757275108847e-05, "loss": 0.3947, "step": 472 }, { "epoch": 0.531236838410782, "grad_norm": 0.5815523034498505, "learning_rate": 1.9997588077701057e-05, "loss": 0.3776, "step": 473 }, { "epoch": 0.5323599606907202, "grad_norm": 0.5214146087899991, "learning_rate": 1.999741272866731e-05, "loss": 0.3685, "step": 474 }, { "epoch": 0.5334830829706584, "grad_norm": 0.554167072491176, "learning_rate": 1.9997231228115487e-05, "loss": 0.3987, "step": 475 }, { "epoch": 0.5346062052505967, "grad_norm": 0.5442738139350299, "learning_rate": 1.999704357615728e-05, "loss": 0.3824, "step": 476 }, { "epoch": 0.5357293275305349, "grad_norm": 0.5157448841129373, "learning_rate": 1.9996849772908144e-05, "loss": 0.3728, "step": 477 }, { "epoch": 0.5368524498104731, "grad_norm": 0.5376140918675025, "learning_rate": 1.9996649818487336e-05, "loss": 0.409, "step": 478 }, { "epoch": 0.5379755720904114, "grad_norm": 0.549248833785917, "learning_rate": 1.9996443713017883e-05, "loss": 0.3852, "step": 479 }, { "epoch": 0.5390986943703495, "grad_norm": 0.5236716683541965, "learning_rate": 1.999623145662661e-05, "loss": 0.3798, "step": 480 }, { "epoch": 0.5402218166502878, "grad_norm": 0.609473300182195, "learning_rate": 1.9996013049444117e-05, "loss": 0.3764, "step": 481 }, { "epoch": 0.5413449389302261, "grad_norm": 0.5355644461303796, "learning_rate": 1.9995788491604797e-05, "loss": 0.3574, "step": 482 }, { "epoch": 0.5424680612101642, "grad_norm": 0.5580759406548179, "learning_rate": 1.9995557783246814e-05, "loss": 0.3704, "step": 483 }, { "epoch": 0.5435911834901025, "grad_norm": 0.5531806767952361, "learning_rate": 1.9995320924512135e-05, "loss": 0.3939, "step": 484 }, { "epoch": 0.5447143057700408, "grad_norm": 0.5834026851989711, "learning_rate": 1.9995077915546496e-05, "loss": 0.3931, "step": 485 }, { "epoch": 0.5458374280499789, "grad_norm": 0.5591794205487529, "learning_rate": 1.9994828756499423e-05, "loss": 0.4071, "step": 486 }, { "epoch": 0.5469605503299172, "grad_norm": 0.5322788134249687, "learning_rate": 1.9994573447524233e-05, "loss": 0.3704, "step": 487 }, { "epoch": 0.5480836726098554, "grad_norm": 0.6160509443550839, "learning_rate": 1.999431198877801e-05, "loss": 0.3668, "step": 488 }, { "epoch": 0.5492067948897936, "grad_norm": 0.4808621346087435, "learning_rate": 1.9994044380421642e-05, "loss": 0.359, "step": 489 }, { "epoch": 0.5503299171697319, "grad_norm": 0.5940190325126405, "learning_rate": 1.9993770622619784e-05, "loss": 0.3975, "step": 490 }, { "epoch": 0.5514530394496701, "grad_norm": 0.5314215977553695, "learning_rate": 1.9993490715540888e-05, "loss": 0.3619, "step": 491 }, { "epoch": 0.5525761617296083, "grad_norm": 0.5286779855947581, "learning_rate": 1.9993204659357176e-05, "loss": 0.3624, "step": 492 }, { "epoch": 0.5536992840095465, "grad_norm": 0.6157841936932821, "learning_rate": 1.9992912454244677e-05, "loss": 0.3856, "step": 493 }, { "epoch": 0.5548224062894848, "grad_norm": 0.5146925583224419, "learning_rate": 1.999261410038317e-05, "loss": 0.3889, "step": 494 }, { "epoch": 0.555945528569423, "grad_norm": 0.5899681817511568, "learning_rate": 1.9992309597956244e-05, "loss": 0.3872, "step": 495 }, { "epoch": 0.5570686508493612, "grad_norm": 0.5514658907626057, "learning_rate": 1.999199894715126e-05, "loss": 0.367, "step": 496 }, { "epoch": 0.5581917731292995, "grad_norm": 0.5727284572822289, "learning_rate": 1.9991682148159372e-05, "loss": 0.3855, "step": 497 }, { "epoch": 0.5593148954092377, "grad_norm": 0.6630678363436415, "learning_rate": 1.9991359201175503e-05, "loss": 0.3747, "step": 498 }, { "epoch": 0.5604380176891759, "grad_norm": 0.5168218883882756, "learning_rate": 1.9991030106398367e-05, "loss": 0.3665, "step": 499 }, { "epoch": 0.5615611399691142, "grad_norm": 0.5285711442774218, "learning_rate": 1.999069486403046e-05, "loss": 0.3661, "step": 500 }, { "epoch": 0.5626842622490523, "grad_norm": 0.612117315690109, "learning_rate": 1.999035347427806e-05, "loss": 0.4039, "step": 501 }, { "epoch": 0.5638073845289906, "grad_norm": 0.5866977447253838, "learning_rate": 1.999000593735123e-05, "loss": 0.4256, "step": 502 }, { "epoch": 0.5649305068089289, "grad_norm": 0.5670878177199548, "learning_rate": 1.998965225346381e-05, "loss": 0.4193, "step": 503 }, { "epoch": 0.566053629088867, "grad_norm": 0.5585226536761726, "learning_rate": 1.998929242283343e-05, "loss": 0.3909, "step": 504 }, { "epoch": 0.5671767513688053, "grad_norm": 0.5483834024577466, "learning_rate": 1.9988926445681495e-05, "loss": 0.3914, "step": 505 }, { "epoch": 0.5682998736487435, "grad_norm": 0.48877542079307956, "learning_rate": 1.9988554322233188e-05, "loss": 0.3718, "step": 506 }, { "epoch": 0.5694229959286817, "grad_norm": 0.5685952858138967, "learning_rate": 1.9988176052717495e-05, "loss": 0.3989, "step": 507 }, { "epoch": 0.57054611820862, "grad_norm": 0.5234278520668304, "learning_rate": 1.9987791637367157e-05, "loss": 0.3742, "step": 508 }, { "epoch": 0.5716692404885582, "grad_norm": 0.5460853796115502, "learning_rate": 1.9987401076418717e-05, "loss": 0.402, "step": 509 }, { "epoch": 0.5727923627684964, "grad_norm": 0.5455839187416055, "learning_rate": 1.9987004370112487e-05, "loss": 0.3641, "step": 510 }, { "epoch": 0.5739154850484347, "grad_norm": 0.5056945010695153, "learning_rate": 1.9986601518692567e-05, "loss": 0.375, "step": 511 }, { "epoch": 0.5750386073283729, "grad_norm": 0.5208870388950549, "learning_rate": 1.9986192522406835e-05, "loss": 0.3714, "step": 512 }, { "epoch": 0.5761617296083111, "grad_norm": 0.5358983435756814, "learning_rate": 1.998577738150695e-05, "loss": 0.3822, "step": 513 }, { "epoch": 0.5772848518882493, "grad_norm": 0.5399478578334925, "learning_rate": 1.9985356096248357e-05, "loss": 0.3978, "step": 514 }, { "epoch": 0.5784079741681876, "grad_norm": 0.5276672395265529, "learning_rate": 1.998492866689027e-05, "loss": 0.351, "step": 515 }, { "epoch": 0.5795310964481258, "grad_norm": 0.5308333056433986, "learning_rate": 1.9984495093695696e-05, "loss": 0.4038, "step": 516 }, { "epoch": 0.580654218728064, "grad_norm": 0.4895548996663825, "learning_rate": 1.9984055376931414e-05, "loss": 0.349, "step": 517 }, { "epoch": 0.5817773410080023, "grad_norm": 0.5250328463562076, "learning_rate": 1.9983609516867993e-05, "loss": 0.3766, "step": 518 }, { "epoch": 0.5829004632879404, "grad_norm": 0.5371115914639484, "learning_rate": 1.9983157513779768e-05, "loss": 0.3576, "step": 519 }, { "epoch": 0.5840235855678787, "grad_norm": 0.5127425095139194, "learning_rate": 1.998269936794487e-05, "loss": 0.3771, "step": 520 }, { "epoch": 0.585146707847817, "grad_norm": 0.5327318689065801, "learning_rate": 1.9982235079645192e-05, "loss": 0.3629, "step": 521 }, { "epoch": 0.5862698301277551, "grad_norm": 0.5158425879035294, "learning_rate": 1.998176464916642e-05, "loss": 0.3513, "step": 522 }, { "epoch": 0.5873929524076934, "grad_norm": 0.5261415497468315, "learning_rate": 1.998128807679802e-05, "loss": 0.3895, "step": 523 }, { "epoch": 0.5885160746876316, "grad_norm": 0.5330293259802309, "learning_rate": 1.998080536283322e-05, "loss": 0.3801, "step": 524 }, { "epoch": 0.5896391969675698, "grad_norm": 0.5328558914321503, "learning_rate": 1.998031650756905e-05, "loss": 0.3746, "step": 525 }, { "epoch": 0.5907623192475081, "grad_norm": 0.5390463848256605, "learning_rate": 1.9979821511306308e-05, "loss": 0.3817, "step": 526 }, { "epoch": 0.5918854415274463, "grad_norm": 0.5431335082497138, "learning_rate": 1.9979320374349564e-05, "loss": 0.3908, "step": 527 }, { "epoch": 0.5930085638073845, "grad_norm": 0.5101979024125878, "learning_rate": 1.997881309700717e-05, "loss": 0.3777, "step": 528 }, { "epoch": 0.5941316860873228, "grad_norm": 0.5209616551352008, "learning_rate": 1.9978299679591276e-05, "loss": 0.385, "step": 529 }, { "epoch": 0.595254808367261, "grad_norm": 0.518301945363611, "learning_rate": 1.9977780122417778e-05, "loss": 0.3927, "step": 530 }, { "epoch": 0.5963779306471992, "grad_norm": 0.5064656297120799, "learning_rate": 1.997725442580637e-05, "loss": 0.366, "step": 531 }, { "epoch": 0.5975010529271374, "grad_norm": 0.560763233769782, "learning_rate": 1.997672259008052e-05, "loss": 0.3839, "step": 532 }, { "epoch": 0.5986241752070757, "grad_norm": 0.5185441179220639, "learning_rate": 1.997618461556747e-05, "loss": 0.3649, "step": 533 }, { "epoch": 0.5997472974870139, "grad_norm": 0.5116157155704527, "learning_rate": 1.9975640502598243e-05, "loss": 0.4155, "step": 534 }, { "epoch": 0.6008704197669521, "grad_norm": 0.5484375904037471, "learning_rate": 1.9975090251507637e-05, "loss": 0.3893, "step": 535 }, { "epoch": 0.6019935420468904, "grad_norm": 0.5194411036484662, "learning_rate": 1.9974533862634234e-05, "loss": 0.3708, "step": 536 }, { "epoch": 0.6031166643268285, "grad_norm": 0.5084933191055989, "learning_rate": 1.9973971336320374e-05, "loss": 0.3618, "step": 537 }, { "epoch": 0.6042397866067668, "grad_norm": 0.500283964355109, "learning_rate": 1.9973402672912196e-05, "loss": 0.3773, "step": 538 }, { "epoch": 0.6053629088867051, "grad_norm": 0.5043471751080929, "learning_rate": 1.9972827872759598e-05, "loss": 0.3706, "step": 539 }, { "epoch": 0.6064860311666432, "grad_norm": 0.48051148948387623, "learning_rate": 1.9972246936216268e-05, "loss": 0.369, "step": 540 }, { "epoch": 0.6076091534465815, "grad_norm": 0.5037545077388655, "learning_rate": 1.9971659863639657e-05, "loss": 0.3777, "step": 541 }, { "epoch": 0.6087322757265198, "grad_norm": 0.5202846079968864, "learning_rate": 1.9971066655390997e-05, "loss": 0.3878, "step": 542 }, { "epoch": 0.6098553980064579, "grad_norm": 0.5196509635056363, "learning_rate": 1.99704673118353e-05, "loss": 0.3786, "step": 543 }, { "epoch": 0.6109785202863962, "grad_norm": 0.5100880924606735, "learning_rate": 1.996986183334134e-05, "loss": 0.3923, "step": 544 }, { "epoch": 0.6121016425663344, "grad_norm": 0.5374754446994799, "learning_rate": 1.9969250220281687e-05, "loss": 0.4019, "step": 545 }, { "epoch": 0.6132247648462726, "grad_norm": 0.5405136047123903, "learning_rate": 1.9968632473032663e-05, "loss": 0.3903, "step": 546 }, { "epoch": 0.6143478871262109, "grad_norm": 0.5377137297729571, "learning_rate": 1.996800859197438e-05, "loss": 0.3847, "step": 547 }, { "epoch": 0.6154710094061491, "grad_norm": 0.5280413079303611, "learning_rate": 1.996737857749072e-05, "loss": 0.3846, "step": 548 }, { "epoch": 0.6165941316860873, "grad_norm": 0.4698879134163189, "learning_rate": 1.996674242996933e-05, "loss": 0.3624, "step": 549 }, { "epoch": 0.6177172539660255, "grad_norm": 0.5693615931340346, "learning_rate": 1.9966100149801648e-05, "loss": 0.3809, "step": 550 }, { "epoch": 0.6188403762459638, "grad_norm": 0.5390167928723439, "learning_rate": 1.9965451737382872e-05, "loss": 0.3944, "step": 551 }, { "epoch": 0.619963498525902, "grad_norm": 0.4687981254942727, "learning_rate": 1.9964797193111973e-05, "loss": 0.3486, "step": 552 }, { "epoch": 0.6210866208058402, "grad_norm": 0.6002959050862824, "learning_rate": 1.9964136517391708e-05, "loss": 0.427, "step": 553 }, { "epoch": 0.6222097430857785, "grad_norm": 0.5424155203819168, "learning_rate": 1.9963469710628592e-05, "loss": 0.3864, "step": 554 }, { "epoch": 0.6233328653657166, "grad_norm": 0.4805110497258522, "learning_rate": 1.996279677323292e-05, "loss": 0.3431, "step": 555 }, { "epoch": 0.6244559876456549, "grad_norm": 0.5468592773183889, "learning_rate": 1.9962117705618757e-05, "loss": 0.3916, "step": 556 }, { "epoch": 0.6255791099255932, "grad_norm": 0.5974551409729048, "learning_rate": 1.9961432508203938e-05, "loss": 0.3703, "step": 557 }, { "epoch": 0.6267022322055313, "grad_norm": 0.46694957713893454, "learning_rate": 1.996074118141008e-05, "loss": 0.36, "step": 558 }, { "epoch": 0.6278253544854696, "grad_norm": 0.5203301666877311, "learning_rate": 1.9960043725662558e-05, "loss": 0.3661, "step": 559 }, { "epoch": 0.6289484767654079, "grad_norm": 0.5315196144666886, "learning_rate": 1.995934014139053e-05, "loss": 0.3878, "step": 560 }, { "epoch": 0.630071599045346, "grad_norm": 0.4726474224175523, "learning_rate": 1.9958630429026912e-05, "loss": 0.3549, "step": 561 }, { "epoch": 0.6311947213252843, "grad_norm": 0.5265877427590347, "learning_rate": 1.9957914589008405e-05, "loss": 0.372, "step": 562 }, { "epoch": 0.6323178436052225, "grad_norm": 0.5108540554457581, "learning_rate": 1.9957192621775466e-05, "loss": 0.3709, "step": 563 }, { "epoch": 0.6334409658851607, "grad_norm": 0.5290487085943824, "learning_rate": 1.9956464527772334e-05, "loss": 0.3884, "step": 564 }, { "epoch": 0.634564088165099, "grad_norm": 0.5281270652784177, "learning_rate": 1.9955730307447015e-05, "loss": 0.3834, "step": 565 }, { "epoch": 0.6356872104450372, "grad_norm": 0.5085482495756416, "learning_rate": 1.9954989961251276e-05, "loss": 0.3732, "step": 566 }, { "epoch": 0.6368103327249754, "grad_norm": 0.4783417140693344, "learning_rate": 1.9954243489640667e-05, "loss": 0.3696, "step": 567 }, { "epoch": 0.6379334550049136, "grad_norm": 0.5374355693068502, "learning_rate": 1.995349089307449e-05, "loss": 0.3755, "step": 568 }, { "epoch": 0.6390565772848519, "grad_norm": 0.5020426636675567, "learning_rate": 1.995273217201584e-05, "loss": 0.3667, "step": 569 }, { "epoch": 0.6401796995647902, "grad_norm": 0.48221702122344806, "learning_rate": 1.995196732693156e-05, "loss": 0.3669, "step": 570 }, { "epoch": 0.6413028218447283, "grad_norm": 0.4948631937332312, "learning_rate": 1.9951196358292266e-05, "loss": 0.3815, "step": 571 }, { "epoch": 0.6424259441246666, "grad_norm": 0.5149576535985988, "learning_rate": 1.9950419266572344e-05, "loss": 0.3748, "step": 572 }, { "epoch": 0.6435490664046049, "grad_norm": 0.5077161706598761, "learning_rate": 1.994963605224995e-05, "loss": 0.3736, "step": 573 }, { "epoch": 0.644672188684543, "grad_norm": 0.4930605863816736, "learning_rate": 1.9948846715807e-05, "loss": 0.3698, "step": 574 }, { "epoch": 0.6457953109644813, "grad_norm": 0.514614667391616, "learning_rate": 1.9948051257729184e-05, "loss": 0.3736, "step": 575 }, { "epoch": 0.6469184332444194, "grad_norm": 0.5176250628419765, "learning_rate": 1.9947249678505956e-05, "loss": 0.3693, "step": 576 }, { "epoch": 0.6480415555243577, "grad_norm": 0.45913174362011855, "learning_rate": 1.994644197863054e-05, "loss": 0.3499, "step": 577 }, { "epoch": 0.649164677804296, "grad_norm": 0.49883541686787625, "learning_rate": 1.994562815859991e-05, "loss": 0.362, "step": 578 }, { "epoch": 0.6502878000842341, "grad_norm": 0.5146434352456531, "learning_rate": 1.9944808218914833e-05, "loss": 0.3906, "step": 579 }, { "epoch": 0.6514109223641724, "grad_norm": 0.5066808040702572, "learning_rate": 1.9943982160079823e-05, "loss": 0.3853, "step": 580 }, { "epoch": 0.6525340446441106, "grad_norm": 0.49848710491518, "learning_rate": 1.9943149982603155e-05, "loss": 0.3985, "step": 581 }, { "epoch": 0.6536571669240488, "grad_norm": 0.4878251052204487, "learning_rate": 1.994231168699689e-05, "loss": 0.3714, "step": 582 }, { "epoch": 0.6547802892039871, "grad_norm": 0.4758786623666332, "learning_rate": 1.9941467273776832e-05, "loss": 0.3946, "step": 583 }, { "epoch": 0.6559034114839253, "grad_norm": 0.4983367896874346, "learning_rate": 1.9940616743462557e-05, "loss": 0.3722, "step": 584 }, { "epoch": 0.6570265337638636, "grad_norm": 0.5075829148442156, "learning_rate": 1.9939760096577408e-05, "loss": 0.3843, "step": 585 }, { "epoch": 0.6581496560438018, "grad_norm": 0.5301054416844904, "learning_rate": 1.9938897333648492e-05, "loss": 0.3997, "step": 586 }, { "epoch": 0.65927277832374, "grad_norm": 0.5148031126628784, "learning_rate": 1.9938028455206674e-05, "loss": 0.3737, "step": 587 }, { "epoch": 0.6603959006036783, "grad_norm": 0.46714827561279637, "learning_rate": 1.9937153461786585e-05, "loss": 0.3431, "step": 588 }, { "epoch": 0.6615190228836164, "grad_norm": 0.5177398408210456, "learning_rate": 1.9936272353926616e-05, "loss": 0.3666, "step": 589 }, { "epoch": 0.6626421451635547, "grad_norm": 0.47039452637574947, "learning_rate": 1.993538513216892e-05, "loss": 0.3635, "step": 590 }, { "epoch": 0.663765267443493, "grad_norm": 0.5093614259546847, "learning_rate": 1.9934491797059425e-05, "loss": 0.3915, "step": 591 }, { "epoch": 0.6648883897234311, "grad_norm": 0.5366679460711811, "learning_rate": 1.99335923491478e-05, "loss": 0.398, "step": 592 }, { "epoch": 0.6660115120033694, "grad_norm": 0.5082350993851118, "learning_rate": 1.993268678898749e-05, "loss": 0.3938, "step": 593 }, { "epoch": 0.6671346342833075, "grad_norm": 0.5182267099894416, "learning_rate": 1.993177511713569e-05, "loss": 0.4121, "step": 594 }, { "epoch": 0.6682577565632458, "grad_norm": 0.49114322650286973, "learning_rate": 1.9930857334153374e-05, "loss": 0.3738, "step": 595 }, { "epoch": 0.6693808788431841, "grad_norm": 0.47709228497356176, "learning_rate": 1.992993344060525e-05, "loss": 0.3767, "step": 596 }, { "epoch": 0.6705040011231223, "grad_norm": 0.5108226641321751, "learning_rate": 1.992900343705981e-05, "loss": 0.3901, "step": 597 }, { "epoch": 0.6716271234030605, "grad_norm": 0.5041216553802877, "learning_rate": 1.9928067324089286e-05, "loss": 0.3699, "step": 598 }, { "epoch": 0.6727502456829987, "grad_norm": 0.4923581285784332, "learning_rate": 1.9927125102269687e-05, "loss": 0.3764, "step": 599 }, { "epoch": 0.673873367962937, "grad_norm": 0.5387456019298923, "learning_rate": 1.992617677218077e-05, "loss": 0.3994, "step": 600 }, { "epoch": 0.6749964902428752, "grad_norm": 0.5115844130946426, "learning_rate": 1.992522233440605e-05, "loss": 0.4016, "step": 601 }, { "epoch": 0.6761196125228134, "grad_norm": 0.489549840063431, "learning_rate": 1.992426178953281e-05, "loss": 0.3861, "step": 602 }, { "epoch": 0.6772427348027517, "grad_norm": 0.48352200149034974, "learning_rate": 1.9923295138152076e-05, "loss": 0.3501, "step": 603 }, { "epoch": 0.6783658570826899, "grad_norm": 0.5003180468080372, "learning_rate": 1.992232238085864e-05, "loss": 0.3927, "step": 604 }, { "epoch": 0.6794889793626281, "grad_norm": 0.46529290603039347, "learning_rate": 1.9921343518251057e-05, "loss": 0.3613, "step": 605 }, { "epoch": 0.6806121016425664, "grad_norm": 0.48009299538190453, "learning_rate": 1.9920358550931627e-05, "loss": 0.3692, "step": 606 }, { "epoch": 0.6817352239225045, "grad_norm": 0.4965554167186214, "learning_rate": 1.9919367479506413e-05, "loss": 0.3629, "step": 607 }, { "epoch": 0.6828583462024428, "grad_norm": 0.48758593884923196, "learning_rate": 1.9918370304585228e-05, "loss": 0.3583, "step": 608 }, { "epoch": 0.6839814684823811, "grad_norm": 0.5358513727384278, "learning_rate": 1.991736702678165e-05, "loss": 0.3916, "step": 609 }, { "epoch": 0.6851045907623192, "grad_norm": 0.4867798988879104, "learning_rate": 1.9916357646713006e-05, "loss": 0.3452, "step": 610 }, { "epoch": 0.6862277130422575, "grad_norm": 0.5325347519332022, "learning_rate": 1.9915342165000375e-05, "loss": 0.403, "step": 611 }, { "epoch": 0.6873508353221957, "grad_norm": 0.48855001583167373, "learning_rate": 1.99143205822686e-05, "loss": 0.3791, "step": 612 }, { "epoch": 0.6884739576021339, "grad_norm": 0.47712559355490186, "learning_rate": 1.9913292899146262e-05, "loss": 0.3626, "step": 613 }, { "epoch": 0.6895970798820722, "grad_norm": 0.537623122561854, "learning_rate": 1.9912259116265718e-05, "loss": 0.3769, "step": 614 }, { "epoch": 0.6907202021620104, "grad_norm": 0.4613452971894331, "learning_rate": 1.991121923426306e-05, "loss": 0.3562, "step": 615 }, { "epoch": 0.6918433244419486, "grad_norm": 0.5176397056990323, "learning_rate": 1.9910173253778136e-05, "loss": 0.3937, "step": 616 }, { "epoch": 0.6929664467218869, "grad_norm": 0.502108110386917, "learning_rate": 1.9909121175454553e-05, "loss": 0.3852, "step": 617 }, { "epoch": 0.6940895690018251, "grad_norm": 0.46871949558914766, "learning_rate": 1.9908062999939666e-05, "loss": 0.3615, "step": 618 }, { "epoch": 0.6952126912817633, "grad_norm": 0.49710099802582, "learning_rate": 1.9906998727884582e-05, "loss": 0.3624, "step": 619 }, { "epoch": 0.6963358135617015, "grad_norm": 0.5132520818715471, "learning_rate": 1.990592835994416e-05, "loss": 0.381, "step": 620 }, { "epoch": 0.6974589358416398, "grad_norm": 0.47566380028639976, "learning_rate": 1.990485189677701e-05, "loss": 0.391, "step": 621 }, { "epoch": 0.698582058121578, "grad_norm": 0.47811544132679007, "learning_rate": 1.990376933904549e-05, "loss": 0.3662, "step": 622 }, { "epoch": 0.6997051804015162, "grad_norm": 0.5120337787106785, "learning_rate": 1.9902680687415704e-05, "loss": 0.3731, "step": 623 }, { "epoch": 0.7008283026814545, "grad_norm": 0.5300963176142912, "learning_rate": 1.990158594255752e-05, "loss": 0.3679, "step": 624 }, { "epoch": 0.7019514249613926, "grad_norm": 0.47924641278798885, "learning_rate": 1.9900485105144544e-05, "loss": 0.3589, "step": 625 }, { "epoch": 0.7030745472413309, "grad_norm": 0.5270278691730991, "learning_rate": 1.9899378175854134e-05, "loss": 0.3921, "step": 626 }, { "epoch": 0.7041976695212692, "grad_norm": 0.5179981618980336, "learning_rate": 1.9898265155367394e-05, "loss": 0.4171, "step": 627 }, { "epoch": 0.7053207918012073, "grad_norm": 0.5055637943014714, "learning_rate": 1.9897146044369177e-05, "loss": 0.3733, "step": 628 }, { "epoch": 0.7064439140811456, "grad_norm": 0.5349952912388607, "learning_rate": 1.9896020843548086e-05, "loss": 0.3912, "step": 629 }, { "epoch": 0.7075670363610839, "grad_norm": 0.47936450848582013, "learning_rate": 1.989488955359647e-05, "loss": 0.3461, "step": 630 }, { "epoch": 0.708690158641022, "grad_norm": 0.5229256593874981, "learning_rate": 1.9893752175210423e-05, "loss": 0.3835, "step": 631 }, { "epoch": 0.7098132809209603, "grad_norm": 0.5517790504856993, "learning_rate": 1.9892608709089788e-05, "loss": 0.379, "step": 632 }, { "epoch": 0.7109364032008985, "grad_norm": 0.5135986709269499, "learning_rate": 1.9891459155938148e-05, "loss": 0.3938, "step": 633 }, { "epoch": 0.7120595254808367, "grad_norm": 0.5235231286661839, "learning_rate": 1.9890303516462842e-05, "loss": 0.4047, "step": 634 }, { "epoch": 0.713182647760775, "grad_norm": 0.5230019091855481, "learning_rate": 1.9889141791374942e-05, "loss": 0.3772, "step": 635 }, { "epoch": 0.7143057700407132, "grad_norm": 0.5086902065079811, "learning_rate": 1.9887973981389276e-05, "loss": 0.3789, "step": 636 }, { "epoch": 0.7154288923206514, "grad_norm": 0.45834248146994216, "learning_rate": 1.9886800087224404e-05, "loss": 0.3511, "step": 637 }, { "epoch": 0.7165520146005896, "grad_norm": 0.5193295159932503, "learning_rate": 1.9885620109602637e-05, "loss": 0.3777, "step": 638 }, { "epoch": 0.7176751368805279, "grad_norm": 0.4751465804458114, "learning_rate": 1.9884434049250038e-05, "loss": 0.3747, "step": 639 }, { "epoch": 0.7187982591604661, "grad_norm": 0.47889259165907166, "learning_rate": 1.988324190689639e-05, "loss": 0.3564, "step": 640 }, { "epoch": 0.7199213814404043, "grad_norm": 0.4504694191651383, "learning_rate": 1.9882043683275235e-05, "loss": 0.3569, "step": 641 }, { "epoch": 0.7210445037203426, "grad_norm": 0.48131939806970797, "learning_rate": 1.9880839379123854e-05, "loss": 0.3775, "step": 642 }, { "epoch": 0.7221676260002807, "grad_norm": 0.5054764749704692, "learning_rate": 1.9879628995183274e-05, "loss": 0.3804, "step": 643 }, { "epoch": 0.723290748280219, "grad_norm": 0.49166280486529856, "learning_rate": 1.987841253219825e-05, "loss": 0.3846, "step": 644 }, { "epoch": 0.7244138705601573, "grad_norm": 0.4729734631200996, "learning_rate": 1.987718999091729e-05, "loss": 0.3721, "step": 645 }, { "epoch": 0.7255369928400954, "grad_norm": 0.5129501870416402, "learning_rate": 1.987596137209263e-05, "loss": 0.3656, "step": 646 }, { "epoch": 0.7266601151200337, "grad_norm": 0.5182252271285124, "learning_rate": 1.9874726676480264e-05, "loss": 0.4007, "step": 647 }, { "epoch": 0.727783237399972, "grad_norm": 0.5183328972199854, "learning_rate": 1.98734859048399e-05, "loss": 0.3853, "step": 648 }, { "epoch": 0.7289063596799101, "grad_norm": 0.48679212429312807, "learning_rate": 1.9872239057935013e-05, "loss": 0.3615, "step": 649 }, { "epoch": 0.7300294819598484, "grad_norm": 0.4599807442751718, "learning_rate": 1.9870986136532792e-05, "loss": 0.3447, "step": 650 }, { "epoch": 0.7311526042397866, "grad_norm": 0.48761341111368395, "learning_rate": 1.986972714140418e-05, "loss": 0.3487, "step": 651 }, { "epoch": 0.7322757265197248, "grad_norm": 0.5019293541443021, "learning_rate": 1.986846207332384e-05, "loss": 0.378, "step": 652 }, { "epoch": 0.7333988487996631, "grad_norm": 0.507765982179853, "learning_rate": 1.986719093307019e-05, "loss": 0.3653, "step": 653 }, { "epoch": 0.7345219710796013, "grad_norm": 0.48272271090692315, "learning_rate": 1.9865913721425376e-05, "loss": 0.3542, "step": 654 }, { "epoch": 0.7356450933595395, "grad_norm": 0.5175822423440689, "learning_rate": 1.9864630439175282e-05, "loss": 0.366, "step": 655 }, { "epoch": 0.7367682156394777, "grad_norm": 0.5289074782303135, "learning_rate": 1.986334108710952e-05, "loss": 0.3817, "step": 656 }, { "epoch": 0.737891337919416, "grad_norm": 0.4861617055976239, "learning_rate": 1.9862045666021448e-05, "loss": 0.3752, "step": 657 }, { "epoch": 0.7390144601993542, "grad_norm": 0.4957975445225569, "learning_rate": 1.986074417670815e-05, "loss": 0.3588, "step": 658 }, { "epoch": 0.7401375824792924, "grad_norm": 0.4948274783820183, "learning_rate": 1.9859436619970448e-05, "loss": 0.3854, "step": 659 }, { "epoch": 0.7412607047592307, "grad_norm": 0.4948086029632615, "learning_rate": 1.9858122996612897e-05, "loss": 0.3778, "step": 660 }, { "epoch": 0.7423838270391689, "grad_norm": 0.5091232713262371, "learning_rate": 1.9856803307443782e-05, "loss": 0.3943, "step": 661 }, { "epoch": 0.7435069493191071, "grad_norm": 0.47932797892770085, "learning_rate": 1.985547755327512e-05, "loss": 0.3608, "step": 662 }, { "epoch": 0.7446300715990454, "grad_norm": 0.48396806021455513, "learning_rate": 1.9854145734922668e-05, "loss": 0.3691, "step": 663 }, { "epoch": 0.7457531938789835, "grad_norm": 0.48291497456525306, "learning_rate": 1.9852807853205903e-05, "loss": 0.3544, "step": 664 }, { "epoch": 0.7468763161589218, "grad_norm": 0.527750629448579, "learning_rate": 1.985146390894804e-05, "loss": 0.3795, "step": 665 }, { "epoch": 0.7479994384388601, "grad_norm": 0.49379054725765187, "learning_rate": 1.9850113902976025e-05, "loss": 0.3595, "step": 666 }, { "epoch": 0.7491225607187982, "grad_norm": 0.5043416333342748, "learning_rate": 1.9848757836120528e-05, "loss": 0.3627, "step": 667 }, { "epoch": 0.7502456829987365, "grad_norm": 0.49989404508934815, "learning_rate": 1.984739570921596e-05, "loss": 0.3515, "step": 668 }, { "epoch": 0.7513688052786747, "grad_norm": 0.508418215461715, "learning_rate": 1.984602752310044e-05, "loss": 0.3601, "step": 669 }, { "epoch": 0.7524919275586129, "grad_norm": 0.5189115847179923, "learning_rate": 1.9844653278615836e-05, "loss": 0.3865, "step": 670 }, { "epoch": 0.7536150498385512, "grad_norm": 0.49519160344351704, "learning_rate": 1.9843272976607735e-05, "loss": 0.3789, "step": 671 }, { "epoch": 0.7547381721184894, "grad_norm": 0.5125455405404472, "learning_rate": 1.984188661792545e-05, "loss": 0.3902, "step": 672 }, { "epoch": 0.7558612943984276, "grad_norm": 0.46902819064744006, "learning_rate": 1.9840494203422024e-05, "loss": 0.3562, "step": 673 }, { "epoch": 0.7569844166783659, "grad_norm": 0.4712682187918327, "learning_rate": 1.9839095733954226e-05, "loss": 0.3724, "step": 674 }, { "epoch": 0.7581075389583041, "grad_norm": 0.5083096985128949, "learning_rate": 1.9837691210382547e-05, "loss": 0.3945, "step": 675 }, { "epoch": 0.7592306612382423, "grad_norm": 0.4756841491353359, "learning_rate": 1.9836280633571206e-05, "loss": 0.3964, "step": 676 }, { "epoch": 0.7603537835181805, "grad_norm": 0.5229455215501784, "learning_rate": 1.9834864004388153e-05, "loss": 0.3925, "step": 677 }, { "epoch": 0.7614769057981188, "grad_norm": 0.490426080752418, "learning_rate": 1.9833441323705046e-05, "loss": 0.3608, "step": 678 }, { "epoch": 0.762600028078057, "grad_norm": 0.46495132814690193, "learning_rate": 1.983201259239728e-05, "loss": 0.3733, "step": 679 }, { "epoch": 0.7637231503579952, "grad_norm": 0.5047763095575648, "learning_rate": 1.9830577811343973e-05, "loss": 0.3618, "step": 680 }, { "epoch": 0.7648462726379335, "grad_norm": 0.5216773678215171, "learning_rate": 1.9829136981427953e-05, "loss": 0.4084, "step": 681 }, { "epoch": 0.7659693949178716, "grad_norm": 0.48638773567106564, "learning_rate": 1.9827690103535787e-05, "loss": 0.3539, "step": 682 }, { "epoch": 0.7670925171978099, "grad_norm": 0.44873074839468674, "learning_rate": 1.9826237178557752e-05, "loss": 0.3502, "step": 683 }, { "epoch": 0.7682156394777482, "grad_norm": 0.48611782864710706, "learning_rate": 1.982477820738785e-05, "loss": 0.3755, "step": 684 }, { "epoch": 0.7693387617576863, "grad_norm": 0.5308709710691715, "learning_rate": 1.9823313190923797e-05, "loss": 0.3836, "step": 685 }, { "epoch": 0.7704618840376246, "grad_norm": 0.49106310483861326, "learning_rate": 1.9821842130067034e-05, "loss": 0.3525, "step": 686 }, { "epoch": 0.7715850063175628, "grad_norm": 0.49027171214223086, "learning_rate": 1.982036502572273e-05, "loss": 0.3929, "step": 687 }, { "epoch": 0.772708128597501, "grad_norm": 0.5040180030072051, "learning_rate": 1.9818881878799755e-05, "loss": 0.3671, "step": 688 }, { "epoch": 0.7738312508774393, "grad_norm": 0.4673216659840785, "learning_rate": 1.9817392690210705e-05, "loss": 0.3687, "step": 689 }, { "epoch": 0.7749543731573775, "grad_norm": 0.5310710286528179, "learning_rate": 1.98158974608719e-05, "loss": 0.3694, "step": 690 }, { "epoch": 0.7760774954373157, "grad_norm": 0.47994947051344194, "learning_rate": 1.981439619170337e-05, "loss": 0.3556, "step": 691 }, { "epoch": 0.777200617717254, "grad_norm": 0.5463444744890577, "learning_rate": 1.981288888362886e-05, "loss": 0.3775, "step": 692 }, { "epoch": 0.7783237399971922, "grad_norm": 0.4724849513305692, "learning_rate": 1.981137553757583e-05, "loss": 0.373, "step": 693 }, { "epoch": 0.7794468622771304, "grad_norm": 0.4833836898450605, "learning_rate": 1.9809856154475463e-05, "loss": 0.3602, "step": 694 }, { "epoch": 0.7805699845570686, "grad_norm": 0.44065803236999307, "learning_rate": 1.9808330735262657e-05, "loss": 0.3323, "step": 695 }, { "epoch": 0.7816931068370069, "grad_norm": 0.5062777136832733, "learning_rate": 1.980679928087601e-05, "loss": 0.3889, "step": 696 }, { "epoch": 0.7828162291169452, "grad_norm": 0.4390743377796556, "learning_rate": 1.980526179225785e-05, "loss": 0.3619, "step": 697 }, { "epoch": 0.7839393513968833, "grad_norm": 0.5039788856880819, "learning_rate": 1.98037182703542e-05, "loss": 0.3932, "step": 698 }, { "epoch": 0.7850624736768216, "grad_norm": 0.4645613871607041, "learning_rate": 1.9802168716114817e-05, "loss": 0.3466, "step": 699 }, { "epoch": 0.7861855959567597, "grad_norm": 0.48869710859846344, "learning_rate": 1.9800613130493158e-05, "loss": 0.3882, "step": 700 }, { "epoch": 0.787308718236698, "grad_norm": 0.46252538633492307, "learning_rate": 1.9799051514446383e-05, "loss": 0.3731, "step": 701 }, { "epoch": 0.7884318405166363, "grad_norm": 0.475702182268912, "learning_rate": 1.9797483868935385e-05, "loss": 0.3535, "step": 702 }, { "epoch": 0.7895549627965744, "grad_norm": 0.4469783800160413, "learning_rate": 1.979591019492474e-05, "loss": 0.3448, "step": 703 }, { "epoch": 0.7906780850765127, "grad_norm": 0.4513038149897409, "learning_rate": 1.9794330493382753e-05, "loss": 0.3749, "step": 704 }, { "epoch": 0.791801207356451, "grad_norm": 0.48022563808130214, "learning_rate": 1.9792744765281435e-05, "loss": 0.373, "step": 705 }, { "epoch": 0.7929243296363891, "grad_norm": 0.4482413723061531, "learning_rate": 1.9791153011596497e-05, "loss": 0.3506, "step": 706 }, { "epoch": 0.7940474519163274, "grad_norm": 0.4520233506526563, "learning_rate": 1.9789555233307363e-05, "loss": 0.359, "step": 707 }, { "epoch": 0.7951705741962656, "grad_norm": 0.4674282940663872, "learning_rate": 1.9787951431397167e-05, "loss": 0.3851, "step": 708 }, { "epoch": 0.7962936964762038, "grad_norm": 0.4584028901339973, "learning_rate": 1.9786341606852743e-05, "loss": 0.3501, "step": 709 }, { "epoch": 0.7974168187561421, "grad_norm": 0.4473101801633832, "learning_rate": 1.9784725760664632e-05, "loss": 0.368, "step": 710 }, { "epoch": 0.7985399410360803, "grad_norm": 0.4657336852380225, "learning_rate": 1.9783103893827088e-05, "loss": 0.3558, "step": 711 }, { "epoch": 0.7996630633160186, "grad_norm": 0.45906042695952914, "learning_rate": 1.9781476007338058e-05, "loss": 0.3599, "step": 712 }, { "epoch": 0.8007861855959567, "grad_norm": 0.46033366175628637, "learning_rate": 1.97798421021992e-05, "loss": 0.3679, "step": 713 }, { "epoch": 0.801909307875895, "grad_norm": 0.4911730826448443, "learning_rate": 1.9778202179415875e-05, "loss": 0.3872, "step": 714 }, { "epoch": 0.8030324301558333, "grad_norm": 0.463483660266614, "learning_rate": 1.9776556239997146e-05, "loss": 0.3758, "step": 715 }, { "epoch": 0.8041555524357714, "grad_norm": 0.5063628630954315, "learning_rate": 1.9774904284955775e-05, "loss": 0.3824, "step": 716 }, { "epoch": 0.8052786747157097, "grad_norm": 0.5154742655615026, "learning_rate": 1.9773246315308226e-05, "loss": 0.3527, "step": 717 }, { "epoch": 0.8064017969956478, "grad_norm": 0.4630501893780488, "learning_rate": 1.9771582332074676e-05, "loss": 0.3875, "step": 718 }, { "epoch": 0.8075249192755861, "grad_norm": 0.49263104968044724, "learning_rate": 1.9769912336278983e-05, "loss": 0.3934, "step": 719 }, { "epoch": 0.8086480415555244, "grad_norm": 0.4672454867734837, "learning_rate": 1.9768236328948718e-05, "loss": 0.3647, "step": 720 }, { "epoch": 0.8097711638354625, "grad_norm": 0.4920760592750608, "learning_rate": 1.9766554311115143e-05, "loss": 0.365, "step": 721 }, { "epoch": 0.8108942861154008, "grad_norm": 0.4647800245178615, "learning_rate": 1.9764866283813224e-05, "loss": 0.3753, "step": 722 }, { "epoch": 0.8120174083953391, "grad_norm": 0.4411909985899985, "learning_rate": 1.9763172248081623e-05, "loss": 0.3604, "step": 723 }, { "epoch": 0.8131405306752773, "grad_norm": 0.4577218647324814, "learning_rate": 1.97614722049627e-05, "loss": 0.3489, "step": 724 }, { "epoch": 0.8142636529552155, "grad_norm": 0.48915076154791487, "learning_rate": 1.975976615550251e-05, "loss": 0.3687, "step": 725 }, { "epoch": 0.8153867752351537, "grad_norm": 0.47499359925278556, "learning_rate": 1.9758054100750802e-05, "loss": 0.3724, "step": 726 }, { "epoch": 0.816509897515092, "grad_norm": 0.4869359292315476, "learning_rate": 1.9756336041761023e-05, "loss": 0.3708, "step": 727 }, { "epoch": 0.8176330197950302, "grad_norm": 0.5178982061022716, "learning_rate": 1.9754611979590313e-05, "loss": 0.3755, "step": 728 }, { "epoch": 0.8187561420749684, "grad_norm": 0.5664792802714854, "learning_rate": 1.9752881915299507e-05, "loss": 0.3925, "step": 729 }, { "epoch": 0.8198792643549067, "grad_norm": 0.5059697408626879, "learning_rate": 1.9751145849953135e-05, "loss": 0.3806, "step": 730 }, { "epoch": 0.8210023866348448, "grad_norm": 0.5546342735387513, "learning_rate": 1.9749403784619415e-05, "loss": 0.3911, "step": 731 }, { "epoch": 0.8221255089147831, "grad_norm": 0.5151210704909763, "learning_rate": 1.9747655720370257e-05, "loss": 0.383, "step": 732 }, { "epoch": 0.8232486311947214, "grad_norm": 0.5218588159099736, "learning_rate": 1.9745901658281267e-05, "loss": 0.3831, "step": 733 }, { "epoch": 0.8243717534746595, "grad_norm": 0.5275710652132647, "learning_rate": 1.9744141599431737e-05, "loss": 0.3626, "step": 734 }, { "epoch": 0.8254948757545978, "grad_norm": 0.5597253504535993, "learning_rate": 1.9742375544904654e-05, "loss": 0.372, "step": 735 }, { "epoch": 0.8266179980345361, "grad_norm": 0.5603047467724314, "learning_rate": 1.9740603495786687e-05, "loss": 0.3842, "step": 736 }, { "epoch": 0.8277411203144742, "grad_norm": 0.4804454455092537, "learning_rate": 1.97388254531682e-05, "loss": 0.371, "step": 737 }, { "epoch": 0.8288642425944125, "grad_norm": 0.49610624738284403, "learning_rate": 1.9737041418143235e-05, "loss": 0.3692, "step": 738 }, { "epoch": 0.8299873648743507, "grad_norm": 0.5248533454817134, "learning_rate": 1.973525139180954e-05, "loss": 0.4152, "step": 739 }, { "epoch": 0.8311104871542889, "grad_norm": 0.470530311658482, "learning_rate": 1.9733455375268532e-05, "loss": 0.3856, "step": 740 }, { "epoch": 0.8322336094342272, "grad_norm": 0.46958211569658714, "learning_rate": 1.9731653369625318e-05, "loss": 0.3817, "step": 741 }, { "epoch": 0.8333567317141654, "grad_norm": 0.5020551519019959, "learning_rate": 1.9729845375988694e-05, "loss": 0.393, "step": 742 }, { "epoch": 0.8344798539941036, "grad_norm": 0.4740449984336523, "learning_rate": 1.9728031395471138e-05, "loss": 0.3808, "step": 743 }, { "epoch": 0.8356029762740418, "grad_norm": 0.49152487880677487, "learning_rate": 1.972621142918881e-05, "loss": 0.3827, "step": 744 }, { "epoch": 0.8367260985539801, "grad_norm": 0.5170747001231172, "learning_rate": 1.972438547826156e-05, "loss": 0.3909, "step": 745 }, { "epoch": 0.8378492208339183, "grad_norm": 0.48953966689610007, "learning_rate": 1.972255354381291e-05, "loss": 0.3816, "step": 746 }, { "epoch": 0.8389723431138565, "grad_norm": 0.45475300416733155, "learning_rate": 1.9720715626970078e-05, "loss": 0.3572, "step": 747 }, { "epoch": 0.8400954653937948, "grad_norm": 0.526527914204446, "learning_rate": 1.9718871728863946e-05, "loss": 0.3707, "step": 748 }, { "epoch": 0.841218587673733, "grad_norm": 0.47359041114522704, "learning_rate": 1.971702185062909e-05, "loss": 0.3832, "step": 749 }, { "epoch": 0.8423417099536712, "grad_norm": 0.5066230520321908, "learning_rate": 1.9715165993403756e-05, "loss": 0.3689, "step": 750 }, { "epoch": 0.8434648322336095, "grad_norm": 0.5222356669126356, "learning_rate": 1.9713304158329873e-05, "loss": 0.3761, "step": 751 }, { "epoch": 0.8445879545135476, "grad_norm": 0.47850315807515664, "learning_rate": 1.9711436346553055e-05, "loss": 0.3765, "step": 752 }, { "epoch": 0.8457110767934859, "grad_norm": 0.5204791826622412, "learning_rate": 1.9709562559222585e-05, "loss": 0.4, "step": 753 }, { "epoch": 0.8468341990734242, "grad_norm": 0.5207156559730746, "learning_rate": 1.970768279749142e-05, "loss": 0.3838, "step": 754 }, { "epoch": 0.8479573213533623, "grad_norm": 0.48686440349858606, "learning_rate": 1.9705797062516204e-05, "loss": 0.3806, "step": 755 }, { "epoch": 0.8490804436333006, "grad_norm": 0.5009212386994344, "learning_rate": 1.9703905355457244e-05, "loss": 0.3766, "step": 756 }, { "epoch": 0.8502035659132388, "grad_norm": 0.4823384548317993, "learning_rate": 1.9702007677478535e-05, "loss": 0.3598, "step": 757 }, { "epoch": 0.851326688193177, "grad_norm": 0.5038461477537077, "learning_rate": 1.9700104029747735e-05, "loss": 0.3706, "step": 758 }, { "epoch": 0.8524498104731153, "grad_norm": 0.499367218377909, "learning_rate": 1.9698194413436186e-05, "loss": 0.3931, "step": 759 }, { "epoch": 0.8535729327530535, "grad_norm": 0.4558139088156976, "learning_rate": 1.9696278829718882e-05, "loss": 0.3661, "step": 760 }, { "epoch": 0.8546960550329917, "grad_norm": 0.49796966194709463, "learning_rate": 1.9694357279774516e-05, "loss": 0.3762, "step": 761 }, { "epoch": 0.8558191773129299, "grad_norm": 0.47539394574011445, "learning_rate": 1.9692429764785436e-05, "loss": 0.3686, "step": 762 }, { "epoch": 0.8569422995928682, "grad_norm": 0.4631634576367207, "learning_rate": 1.969049628593766e-05, "loss": 0.3765, "step": 763 }, { "epoch": 0.8580654218728064, "grad_norm": 0.4622717685068893, "learning_rate": 1.9688556844420877e-05, "loss": 0.3819, "step": 764 }, { "epoch": 0.8591885441527446, "grad_norm": 0.46303379442069725, "learning_rate": 1.9686611441428455e-05, "loss": 0.3417, "step": 765 }, { "epoch": 0.8603116664326829, "grad_norm": 0.48471273461639347, "learning_rate": 1.968466007815741e-05, "loss": 0.3832, "step": 766 }, { "epoch": 0.8614347887126211, "grad_norm": 0.4955079045703151, "learning_rate": 1.968270275580845e-05, "loss": 0.3973, "step": 767 }, { "epoch": 0.8625579109925593, "grad_norm": 0.46897950183267123, "learning_rate": 1.968073947558593e-05, "loss": 0.3577, "step": 768 }, { "epoch": 0.8636810332724976, "grad_norm": 0.49932189730734644, "learning_rate": 1.9678770238697876e-05, "loss": 0.3828, "step": 769 }, { "epoch": 0.8648041555524357, "grad_norm": 0.4385490040536742, "learning_rate": 1.9676795046355986e-05, "loss": 0.3424, "step": 770 }, { "epoch": 0.865927277832374, "grad_norm": 0.44172765516598467, "learning_rate": 1.9674813899775614e-05, "loss": 0.3504, "step": 771 }, { "epoch": 0.8670504001123123, "grad_norm": 0.4925607529971466, "learning_rate": 1.9672826800175786e-05, "loss": 0.3712, "step": 772 }, { "epoch": 0.8681735223922504, "grad_norm": 0.4703364651049239, "learning_rate": 1.967083374877918e-05, "loss": 0.3953, "step": 773 }, { "epoch": 0.8692966446721887, "grad_norm": 0.4744439366917524, "learning_rate": 1.9668834746812148e-05, "loss": 0.3789, "step": 774 }, { "epoch": 0.8704197669521269, "grad_norm": 0.4821724265365719, "learning_rate": 1.9666829795504693e-05, "loss": 0.3801, "step": 775 }, { "epoch": 0.8715428892320651, "grad_norm": 0.48264988540719894, "learning_rate": 1.966481889609049e-05, "loss": 0.3774, "step": 776 }, { "epoch": 0.8726660115120034, "grad_norm": 0.4694424121246419, "learning_rate": 1.9662802049806862e-05, "loss": 0.3574, "step": 777 }, { "epoch": 0.8737891337919416, "grad_norm": 0.47288161574228044, "learning_rate": 1.96607792578948e-05, "loss": 0.3654, "step": 778 }, { "epoch": 0.8749122560718798, "grad_norm": 0.4556516097283803, "learning_rate": 1.9658750521598944e-05, "loss": 0.3557, "step": 779 }, { "epoch": 0.8760353783518181, "grad_norm": 0.42490236766792466, "learning_rate": 1.9656715842167606e-05, "loss": 0.3504, "step": 780 }, { "epoch": 0.8771585006317563, "grad_norm": 0.493236263957415, "learning_rate": 1.965467522085274e-05, "loss": 0.377, "step": 781 }, { "epoch": 0.8782816229116945, "grad_norm": 0.4836659716972491, "learning_rate": 1.9652628658909968e-05, "loss": 0.3731, "step": 782 }, { "epoch": 0.8794047451916327, "grad_norm": 0.46192080514474665, "learning_rate": 1.965057615759856e-05, "loss": 0.3961, "step": 783 }, { "epoch": 0.880527867471571, "grad_norm": 0.48934298480858224, "learning_rate": 1.964851771818144e-05, "loss": 0.35, "step": 784 }, { "epoch": 0.8816509897515092, "grad_norm": 0.46465029854890294, "learning_rate": 1.9646453341925195e-05, "loss": 0.372, "step": 785 }, { "epoch": 0.8827741120314474, "grad_norm": 0.4560535944639423, "learning_rate": 1.9644383030100052e-05, "loss": 0.3499, "step": 786 }, { "epoch": 0.8838972343113857, "grad_norm": 0.4472553278120461, "learning_rate": 1.9642306783979902e-05, "loss": 0.3563, "step": 787 }, { "epoch": 0.8850203565913238, "grad_norm": 0.4728183298345325, "learning_rate": 1.964022460484227e-05, "loss": 0.3915, "step": 788 }, { "epoch": 0.8861434788712621, "grad_norm": 0.49042138391881734, "learning_rate": 1.963813649396836e-05, "loss": 0.4013, "step": 789 }, { "epoch": 0.8872666011512004, "grad_norm": 0.44584377009424664, "learning_rate": 1.9636042452643004e-05, "loss": 0.3408, "step": 790 }, { "epoch": 0.8883897234311385, "grad_norm": 0.4725306653731623, "learning_rate": 1.9633942482154684e-05, "loss": 0.3786, "step": 791 }, { "epoch": 0.8895128457110768, "grad_norm": 0.4614739050205394, "learning_rate": 1.9631836583795537e-05, "loss": 0.3546, "step": 792 }, { "epoch": 0.8906359679910151, "grad_norm": 0.5062498553591016, "learning_rate": 1.962972475886135e-05, "loss": 0.379, "step": 793 }, { "epoch": 0.8917590902709532, "grad_norm": 0.44137610647370507, "learning_rate": 1.9627607008651544e-05, "loss": 0.3719, "step": 794 }, { "epoch": 0.8928822125508915, "grad_norm": 0.4748874875427979, "learning_rate": 1.9625483334469198e-05, "loss": 0.3944, "step": 795 }, { "epoch": 0.8940053348308297, "grad_norm": 0.4105278757488109, "learning_rate": 1.9623353737621035e-05, "loss": 0.3282, "step": 796 }, { "epoch": 0.8951284571107679, "grad_norm": 0.533115865046379, "learning_rate": 1.962121821941742e-05, "loss": 0.3806, "step": 797 }, { "epoch": 0.8962515793907062, "grad_norm": 0.4641263305725229, "learning_rate": 1.9619076781172355e-05, "loss": 0.3488, "step": 798 }, { "epoch": 0.8973747016706444, "grad_norm": 0.4604841991723784, "learning_rate": 1.9616929424203493e-05, "loss": 0.3667, "step": 799 }, { "epoch": 0.8984978239505826, "grad_norm": 0.5045896764129607, "learning_rate": 1.9614776149832127e-05, "loss": 0.3634, "step": 800 }, { "epoch": 0.8996209462305208, "grad_norm": 0.5218566397092465, "learning_rate": 1.961261695938319e-05, "loss": 0.351, "step": 801 }, { "epoch": 0.9007440685104591, "grad_norm": 0.4569027978245014, "learning_rate": 1.9610451854185253e-05, "loss": 0.3656, "step": 802 }, { "epoch": 0.9018671907903973, "grad_norm": 0.49667871542822645, "learning_rate": 1.9608280835570537e-05, "loss": 0.3625, "step": 803 }, { "epoch": 0.9029903130703355, "grad_norm": 0.4709946091327286, "learning_rate": 1.9606103904874886e-05, "loss": 0.3638, "step": 804 }, { "epoch": 0.9041134353502738, "grad_norm": 0.4727139263711212, "learning_rate": 1.9603921063437795e-05, "loss": 0.3594, "step": 805 }, { "epoch": 0.9052365576302119, "grad_norm": 0.5294771408585577, "learning_rate": 1.9601732312602385e-05, "loss": 0.3752, "step": 806 }, { "epoch": 0.9063596799101502, "grad_norm": 0.46959657369103186, "learning_rate": 1.959953765371542e-05, "loss": 0.3625, "step": 807 }, { "epoch": 0.9074828021900885, "grad_norm": 0.4594446586146184, "learning_rate": 1.95973370881273e-05, "loss": 0.3655, "step": 808 }, { "epoch": 0.9086059244700266, "grad_norm": 0.4750310076759929, "learning_rate": 1.959513061719205e-05, "loss": 0.3653, "step": 809 }, { "epoch": 0.9097290467499649, "grad_norm": 0.47678540184859336, "learning_rate": 1.959291824226735e-05, "loss": 0.3454, "step": 810 }, { "epoch": 0.9108521690299032, "grad_norm": 0.4700082470689754, "learning_rate": 1.9590699964714486e-05, "loss": 0.3786, "step": 811 }, { "epoch": 0.9119752913098413, "grad_norm": 0.4620496295067997, "learning_rate": 1.9588475785898394e-05, "loss": 0.3681, "step": 812 }, { "epoch": 0.9130984135897796, "grad_norm": 0.4642629784834156, "learning_rate": 1.9586245707187634e-05, "loss": 0.353, "step": 813 }, { "epoch": 0.9142215358697178, "grad_norm": 0.466408448319715, "learning_rate": 1.9584009729954395e-05, "loss": 0.3603, "step": 814 }, { "epoch": 0.915344658149656, "grad_norm": 0.47105990854376034, "learning_rate": 1.9581767855574508e-05, "loss": 0.3636, "step": 815 }, { "epoch": 0.9164677804295943, "grad_norm": 0.46413275069783316, "learning_rate": 1.9579520085427416e-05, "loss": 0.3951, "step": 816 }, { "epoch": 0.9175909027095325, "grad_norm": 0.471808108910633, "learning_rate": 1.9577266420896194e-05, "loss": 0.3544, "step": 817 }, { "epoch": 0.9187140249894707, "grad_norm": 0.46312200003026205, "learning_rate": 1.9575006863367552e-05, "loss": 0.3747, "step": 818 }, { "epoch": 0.9198371472694089, "grad_norm": 0.4833523027027275, "learning_rate": 1.957274141423182e-05, "loss": 0.3751, "step": 819 }, { "epoch": 0.9209602695493472, "grad_norm": 0.484335661168015, "learning_rate": 1.9570470074882947e-05, "loss": 0.4085, "step": 820 }, { "epoch": 0.9220833918292854, "grad_norm": 0.47205123389289727, "learning_rate": 1.9568192846718523e-05, "loss": 0.3759, "step": 821 }, { "epoch": 0.9232065141092236, "grad_norm": 0.4799100077083178, "learning_rate": 1.956590973113975e-05, "loss": 0.3755, "step": 822 }, { "epoch": 0.9243296363891619, "grad_norm": 0.48768754470310466, "learning_rate": 1.9563620729551448e-05, "loss": 0.3579, "step": 823 }, { "epoch": 0.9254527586691002, "grad_norm": 0.45688489471479504, "learning_rate": 1.956132584336207e-05, "loss": 0.351, "step": 824 }, { "epoch": 0.9265758809490383, "grad_norm": 0.47538544133352595, "learning_rate": 1.9559025073983678e-05, "loss": 0.3808, "step": 825 }, { "epoch": 0.9276990032289766, "grad_norm": 0.4879902084963068, "learning_rate": 1.955671842283197e-05, "loss": 0.3722, "step": 826 }, { "epoch": 0.9288221255089147, "grad_norm": 0.4636609831884478, "learning_rate": 1.955440589132625e-05, "loss": 0.3591, "step": 827 }, { "epoch": 0.929945247788853, "grad_norm": 0.46923991530374826, "learning_rate": 1.955208748088944e-05, "loss": 0.3825, "step": 828 }, { "epoch": 0.9310683700687913, "grad_norm": 0.4811858751733392, "learning_rate": 1.954976319294809e-05, "loss": 0.3867, "step": 829 }, { "epoch": 0.9321914923487294, "grad_norm": 0.44788406222610444, "learning_rate": 1.9547433028932357e-05, "loss": 0.3581, "step": 830 }, { "epoch": 0.9333146146286677, "grad_norm": 0.4780988778135954, "learning_rate": 1.9545096990276016e-05, "loss": 0.3896, "step": 831 }, { "epoch": 0.9344377369086059, "grad_norm": 0.47068420296897273, "learning_rate": 1.954275507841646e-05, "loss": 0.3844, "step": 832 }, { "epoch": 0.9355608591885441, "grad_norm": 0.4884936275913018, "learning_rate": 1.9540407294794685e-05, "loss": 0.4031, "step": 833 }, { "epoch": 0.9366839814684824, "grad_norm": 0.44700134287470683, "learning_rate": 1.9538053640855316e-05, "loss": 0.3436, "step": 834 }, { "epoch": 0.9378071037484206, "grad_norm": 0.45061145673357333, "learning_rate": 1.9535694118046584e-05, "loss": 0.3457, "step": 835 }, { "epoch": 0.9389302260283588, "grad_norm": 0.47148123153216354, "learning_rate": 1.9533328727820322e-05, "loss": 0.368, "step": 836 }, { "epoch": 0.9400533483082971, "grad_norm": 0.4456166431044062, "learning_rate": 1.953095747163199e-05, "loss": 0.3456, "step": 837 }, { "epoch": 0.9411764705882353, "grad_norm": 0.44236685307564755, "learning_rate": 1.9528580350940634e-05, "loss": 0.3431, "step": 838 }, { "epoch": 0.9422995928681736, "grad_norm": 0.48561300894599185, "learning_rate": 1.9526197367208937e-05, "loss": 0.3652, "step": 839 }, { "epoch": 0.9434227151481117, "grad_norm": 0.4720489415303487, "learning_rate": 1.9523808521903165e-05, "loss": 0.3572, "step": 840 }, { "epoch": 0.94454583742805, "grad_norm": 0.44376759133578964, "learning_rate": 1.9521413816493206e-05, "loss": 0.3568, "step": 841 }, { "epoch": 0.9456689597079883, "grad_norm": 0.47211026031113734, "learning_rate": 1.951901325245255e-05, "loss": 0.3589, "step": 842 }, { "epoch": 0.9467920819879264, "grad_norm": 0.487190131673575, "learning_rate": 1.9516606831258282e-05, "loss": 0.3781, "step": 843 }, { "epoch": 0.9479152042678647, "grad_norm": 0.5011825293136943, "learning_rate": 1.951419455439111e-05, "loss": 0.3773, "step": 844 }, { "epoch": 0.9490383265478028, "grad_norm": 0.49776220503772134, "learning_rate": 1.9511776423335327e-05, "loss": 0.4034, "step": 845 }, { "epoch": 0.9501614488277411, "grad_norm": 0.48886020802858143, "learning_rate": 1.9509352439578843e-05, "loss": 0.354, "step": 846 }, { "epoch": 0.9512845711076794, "grad_norm": 0.48370344677831884, "learning_rate": 1.950692260461315e-05, "loss": 0.3524, "step": 847 }, { "epoch": 0.9524076933876175, "grad_norm": 0.49770690316202637, "learning_rate": 1.9504486919933364e-05, "loss": 0.382, "step": 848 }, { "epoch": 0.9535308156675558, "grad_norm": 0.4947415886475524, "learning_rate": 1.9502045387038184e-05, "loss": 0.3573, "step": 849 }, { "epoch": 0.954653937947494, "grad_norm": 0.5081571339909152, "learning_rate": 1.949959800742991e-05, "loss": 0.393, "step": 850 }, { "epoch": 0.9557770602274323, "grad_norm": 0.42096325046895394, "learning_rate": 1.9497144782614445e-05, "loss": 0.319, "step": 851 }, { "epoch": 0.9569001825073705, "grad_norm": 0.506234866583935, "learning_rate": 1.949468571410128e-05, "loss": 0.3887, "step": 852 }, { "epoch": 0.9580233047873087, "grad_norm": 0.5146043757907258, "learning_rate": 1.9492220803403518e-05, "loss": 0.3949, "step": 853 }, { "epoch": 0.959146427067247, "grad_norm": 0.4788729384940881, "learning_rate": 1.9489750052037836e-05, "loss": 0.3748, "step": 854 }, { "epoch": 0.9602695493471852, "grad_norm": 0.44234608280074666, "learning_rate": 1.9487273461524517e-05, "loss": 0.3599, "step": 855 }, { "epoch": 0.9613926716271234, "grad_norm": 0.491710752366928, "learning_rate": 1.9484791033387432e-05, "loss": 0.3717, "step": 856 }, { "epoch": 0.9625157939070617, "grad_norm": 0.4698138574686433, "learning_rate": 1.948230276915405e-05, "loss": 0.379, "step": 857 }, { "epoch": 0.9636389161869998, "grad_norm": 0.4584810154708352, "learning_rate": 1.9479808670355425e-05, "loss": 0.3686, "step": 858 }, { "epoch": 0.9647620384669381, "grad_norm": 0.46535287144204673, "learning_rate": 1.9477308738526207e-05, "loss": 0.3489, "step": 859 }, { "epoch": 0.9658851607468764, "grad_norm": 0.49539431571257847, "learning_rate": 1.947480297520463e-05, "loss": 0.3899, "step": 860 }, { "epoch": 0.9670082830268145, "grad_norm": 0.4870684988697284, "learning_rate": 1.9472291381932515e-05, "loss": 0.3608, "step": 861 }, { "epoch": 0.9681314053067528, "grad_norm": 0.476057375325281, "learning_rate": 1.9469773960255273e-05, "loss": 0.3693, "step": 862 }, { "epoch": 0.969254527586691, "grad_norm": 0.4933511086989605, "learning_rate": 1.9467250711721903e-05, "loss": 0.3673, "step": 863 }, { "epoch": 0.9703776498666292, "grad_norm": 0.486703140404838, "learning_rate": 1.946472163788499e-05, "loss": 0.3623, "step": 864 }, { "epoch": 0.9715007721465675, "grad_norm": 0.46311662873138065, "learning_rate": 1.9462186740300697e-05, "loss": 0.3751, "step": 865 }, { "epoch": 0.9726238944265057, "grad_norm": 0.4756703180857592, "learning_rate": 1.9459646020528777e-05, "loss": 0.4065, "step": 866 }, { "epoch": 0.9737470167064439, "grad_norm": 0.45742793507262375, "learning_rate": 1.9457099480132563e-05, "loss": 0.3783, "step": 867 }, { "epoch": 0.9748701389863822, "grad_norm": 0.47697882778530937, "learning_rate": 1.9454547120678966e-05, "loss": 0.3907, "step": 868 }, { "epoch": 0.9759932612663204, "grad_norm": 0.4664102425023445, "learning_rate": 1.9451988943738485e-05, "loss": 0.374, "step": 869 }, { "epoch": 0.9771163835462586, "grad_norm": 0.41449005991424276, "learning_rate": 1.9449424950885193e-05, "loss": 0.3473, "step": 870 }, { "epoch": 0.9782395058261968, "grad_norm": 0.4344496368295649, "learning_rate": 1.944685514369674e-05, "loss": 0.3481, "step": 871 }, { "epoch": 0.9793626281061351, "grad_norm": 0.47436273506860405, "learning_rate": 1.9444279523754358e-05, "loss": 0.3799, "step": 872 }, { "epoch": 0.9804857503860733, "grad_norm": 0.4905734178799078, "learning_rate": 1.944169809264286e-05, "loss": 0.3655, "step": 873 }, { "epoch": 0.9816088726660115, "grad_norm": 0.4643421651738806, "learning_rate": 1.9439110851950623e-05, "loss": 0.3705, "step": 874 }, { "epoch": 0.9827319949459498, "grad_norm": 0.4860647845700408, "learning_rate": 1.9436517803269603e-05, "loss": 0.3556, "step": 875 }, { "epoch": 0.9838551172258879, "grad_norm": 0.473335344847467, "learning_rate": 1.9433918948195335e-05, "loss": 0.354, "step": 876 }, { "epoch": 0.9849782395058262, "grad_norm": 0.46317370647334366, "learning_rate": 1.9431314288326925e-05, "loss": 0.3564, "step": 877 }, { "epoch": 0.9861013617857645, "grad_norm": 0.48357335447638583, "learning_rate": 1.9428703825267047e-05, "loss": 0.3799, "step": 878 }, { "epoch": 0.9872244840657026, "grad_norm": 0.4575106291945355, "learning_rate": 1.9426087560621944e-05, "loss": 0.3681, "step": 879 }, { "epoch": 0.9883476063456409, "grad_norm": 0.46101892774024467, "learning_rate": 1.942346549600144e-05, "loss": 0.3683, "step": 880 }, { "epoch": 0.989470728625579, "grad_norm": 0.47630591998449023, "learning_rate": 1.942083763301891e-05, "loss": 0.3642, "step": 881 }, { "epoch": 0.9905938509055173, "grad_norm": 0.44576916696684105, "learning_rate": 1.9418203973291317e-05, "loss": 0.3525, "step": 882 }, { "epoch": 0.9917169731854556, "grad_norm": 0.43611774214843185, "learning_rate": 1.9415564518439173e-05, "loss": 0.3376, "step": 883 }, { "epoch": 0.9928400954653938, "grad_norm": 0.44908126004356413, "learning_rate": 1.9412919270086573e-05, "loss": 0.3574, "step": 884 }, { "epoch": 0.993963217745332, "grad_norm": 0.48679555991927803, "learning_rate": 1.941026822986116e-05, "loss": 0.394, "step": 885 }, { "epoch": 0.9950863400252703, "grad_norm": 0.45873412100669514, "learning_rate": 1.9407611399394145e-05, "loss": 0.3587, "step": 886 }, { "epoch": 0.9962094623052085, "grad_norm": 0.45788041705605353, "learning_rate": 1.9404948780320313e-05, "loss": 0.3629, "step": 887 }, { "epoch": 0.9973325845851467, "grad_norm": 0.47526745180684987, "learning_rate": 1.9402280374278e-05, "loss": 0.342, "step": 888 }, { "epoch": 0.9984557068650849, "grad_norm": 0.4320159068252172, "learning_rate": 1.9399606182909104e-05, "loss": 0.3578, "step": 889 }, { "epoch": 0.9995788291450232, "grad_norm": 0.43875671365459706, "learning_rate": 1.9396926207859085e-05, "loss": 0.3598, "step": 890 }, { "epoch": 1.0007019514249613, "grad_norm": 0.8018480221556775, "learning_rate": 1.9394240450776963e-05, "loss": 0.5421, "step": 891 }, { "epoch": 1.0018250737048997, "grad_norm": 0.4696264632124667, "learning_rate": 1.9391548913315312e-05, "loss": 0.3137, "step": 892 }, { "epoch": 1.0029481959848379, "grad_norm": 0.4583140254324182, "learning_rate": 1.9388851597130268e-05, "loss": 0.3087, "step": 893 }, { "epoch": 1.004071318264776, "grad_norm": 0.4690268898240805, "learning_rate": 1.9386148503881515e-05, "loss": 0.2877, "step": 894 }, { "epoch": 1.0051944405447144, "grad_norm": 0.5122329490697226, "learning_rate": 1.9383439635232296e-05, "loss": 0.2904, "step": 895 }, { "epoch": 1.0063175628246526, "grad_norm": 0.5371156002795432, "learning_rate": 1.938072499284941e-05, "loss": 0.3008, "step": 896 }, { "epoch": 1.0074406851045907, "grad_norm": 0.4902755957708443, "learning_rate": 1.9378004578403208e-05, "loss": 0.2819, "step": 897 }, { "epoch": 1.0085638073845289, "grad_norm": 0.4741699815198155, "learning_rate": 1.9375278393567588e-05, "loss": 0.2627, "step": 898 }, { "epoch": 1.0096869296644673, "grad_norm": 0.48986713359299766, "learning_rate": 1.9372546440020003e-05, "loss": 0.2914, "step": 899 }, { "epoch": 1.0108100519444054, "grad_norm": 0.5014813747394071, "learning_rate": 1.9369808719441448e-05, "loss": 0.2912, "step": 900 }, { "epoch": 1.0119331742243436, "grad_norm": 0.5393899996183346, "learning_rate": 1.936706523351648e-05, "loss": 0.3034, "step": 901 }, { "epoch": 1.013056296504282, "grad_norm": 0.48629973546736344, "learning_rate": 1.93643159839332e-05, "loss": 0.315, "step": 902 }, { "epoch": 1.0141794187842201, "grad_norm": 0.47646907262990806, "learning_rate": 1.9361560972383237e-05, "loss": 0.2829, "step": 903 }, { "epoch": 1.0153025410641583, "grad_norm": 0.4784634253911548, "learning_rate": 1.935880020056179e-05, "loss": 0.2634, "step": 904 }, { "epoch": 1.0164256633440967, "grad_norm": 0.4991566494669612, "learning_rate": 1.9356033670167595e-05, "loss": 0.2836, "step": 905 }, { "epoch": 1.0175487856240348, "grad_norm": 0.47357930015207117, "learning_rate": 1.935326138290292e-05, "loss": 0.2845, "step": 906 }, { "epoch": 1.018671907903973, "grad_norm": 0.4650462704839138, "learning_rate": 1.9350483340473595e-05, "loss": 0.2772, "step": 907 }, { "epoch": 1.0197950301839114, "grad_norm": 0.5050485921674276, "learning_rate": 1.9347699544588966e-05, "loss": 0.2964, "step": 908 }, { "epoch": 1.0209181524638495, "grad_norm": 0.441877998501999, "learning_rate": 1.9344909996961943e-05, "loss": 0.2783, "step": 909 }, { "epoch": 1.0220412747437877, "grad_norm": 0.4689605191500745, "learning_rate": 1.9342114699308962e-05, "loss": 0.2759, "step": 910 }, { "epoch": 1.0231643970237259, "grad_norm": 0.4806645271199599, "learning_rate": 1.933931365335e-05, "loss": 0.2724, "step": 911 }, { "epoch": 1.0242875193036642, "grad_norm": 0.48206510864685853, "learning_rate": 1.9336506860808576e-05, "loss": 0.2845, "step": 912 }, { "epoch": 1.0254106415836024, "grad_norm": 0.473977653318953, "learning_rate": 1.9333694323411732e-05, "loss": 0.2691, "step": 913 }, { "epoch": 1.0265337638635406, "grad_norm": 0.4959577608373437, "learning_rate": 1.9330876042890065e-05, "loss": 0.2744, "step": 914 }, { "epoch": 1.027656886143479, "grad_norm": 0.4962252725676315, "learning_rate": 1.932805202097768e-05, "loss": 0.2855, "step": 915 }, { "epoch": 1.028780008423417, "grad_norm": 0.5128348503516724, "learning_rate": 1.9325222259412242e-05, "loss": 0.2795, "step": 916 }, { "epoch": 1.0299031307033553, "grad_norm": 0.4632243182786546, "learning_rate": 1.9322386759934923e-05, "loss": 0.2643, "step": 917 }, { "epoch": 1.0310262529832936, "grad_norm": 0.4730264051539773, "learning_rate": 1.9319545524290447e-05, "loss": 0.2654, "step": 918 }, { "epoch": 1.0321493752632318, "grad_norm": 0.5126131024471591, "learning_rate": 1.931669855422705e-05, "loss": 0.2902, "step": 919 }, { "epoch": 1.03327249754317, "grad_norm": 0.4743950568309829, "learning_rate": 1.9313845851496507e-05, "loss": 0.2665, "step": 920 }, { "epoch": 1.0343956198231083, "grad_norm": 0.5013671474523442, "learning_rate": 1.931098741785412e-05, "loss": 0.2954, "step": 921 }, { "epoch": 1.0355187421030465, "grad_norm": 0.5096143882338355, "learning_rate": 1.930812325505871e-05, "loss": 0.2737, "step": 922 }, { "epoch": 1.0366418643829847, "grad_norm": 0.48645730923494584, "learning_rate": 1.930525336487263e-05, "loss": 0.2871, "step": 923 }, { "epoch": 1.0377649866629228, "grad_norm": 0.48648949738495667, "learning_rate": 1.9302377749061753e-05, "loss": 0.2871, "step": 924 }, { "epoch": 1.0388881089428612, "grad_norm": 0.5445641193684013, "learning_rate": 1.9299496409395482e-05, "loss": 0.2788, "step": 925 }, { "epoch": 1.0400112312227994, "grad_norm": 0.4832715223137077, "learning_rate": 1.9296609347646732e-05, "loss": 0.2705, "step": 926 }, { "epoch": 1.0411343535027375, "grad_norm": 0.4610567782289876, "learning_rate": 1.9293716565591948e-05, "loss": 0.266, "step": 927 }, { "epoch": 1.042257475782676, "grad_norm": 0.476608249508321, "learning_rate": 1.9290818065011084e-05, "loss": 0.2806, "step": 928 }, { "epoch": 1.043380598062614, "grad_norm": 0.4820198451645742, "learning_rate": 1.9287913847687627e-05, "loss": 0.2745, "step": 929 }, { "epoch": 1.0445037203425522, "grad_norm": 0.5092363784952347, "learning_rate": 1.9285003915408575e-05, "loss": 0.29, "step": 930 }, { "epoch": 1.0456268426224906, "grad_norm": 0.4745997969575985, "learning_rate": 1.928208826996443e-05, "loss": 0.2679, "step": 931 }, { "epoch": 1.0467499649024288, "grad_norm": 0.48451499822152283, "learning_rate": 1.927916691314923e-05, "loss": 0.2979, "step": 932 }, { "epoch": 1.047873087182367, "grad_norm": 0.4559532527795186, "learning_rate": 1.9276239846760515e-05, "loss": 0.2745, "step": 933 }, { "epoch": 1.048996209462305, "grad_norm": 0.4942046444197993, "learning_rate": 1.9273307072599343e-05, "loss": 0.2733, "step": 934 }, { "epoch": 1.0501193317422435, "grad_norm": 0.4716142833431438, "learning_rate": 1.927036859247028e-05, "loss": 0.2848, "step": 935 }, { "epoch": 1.0512424540221816, "grad_norm": 0.6276556942610505, "learning_rate": 1.9267424408181406e-05, "loss": 0.2928, "step": 936 }, { "epoch": 1.0523655763021198, "grad_norm": 0.4840738368552477, "learning_rate": 1.9264474521544315e-05, "loss": 0.2831, "step": 937 }, { "epoch": 1.0534886985820582, "grad_norm": 0.513332959686835, "learning_rate": 1.9261518934374093e-05, "loss": 0.3035, "step": 938 }, { "epoch": 1.0546118208619963, "grad_norm": 0.4556522892250145, "learning_rate": 1.9258557648489357e-05, "loss": 0.27, "step": 939 }, { "epoch": 1.0557349431419345, "grad_norm": 0.5074845207806349, "learning_rate": 1.9255590665712214e-05, "loss": 0.283, "step": 940 }, { "epoch": 1.0568580654218729, "grad_norm": 0.499752040349772, "learning_rate": 1.9252617987868278e-05, "loss": 0.2857, "step": 941 }, { "epoch": 1.057981187701811, "grad_norm": 0.5396255130756957, "learning_rate": 1.9249639616786674e-05, "loss": 0.3088, "step": 942 }, { "epoch": 1.0591043099817492, "grad_norm": 0.49212073826967684, "learning_rate": 1.9246655554300028e-05, "loss": 0.2748, "step": 943 }, { "epoch": 1.0602274322616876, "grad_norm": 0.4968561055146951, "learning_rate": 1.9243665802244465e-05, "loss": 0.2749, "step": 944 }, { "epoch": 1.0613505545416257, "grad_norm": 0.49427378612848244, "learning_rate": 1.924067036245961e-05, "loss": 0.2953, "step": 945 }, { "epoch": 1.062473676821564, "grad_norm": 0.48543259955055773, "learning_rate": 1.9237669236788595e-05, "loss": 0.282, "step": 946 }, { "epoch": 1.0635967991015023, "grad_norm": 0.5219319492699731, "learning_rate": 1.923466242707804e-05, "loss": 0.2971, "step": 947 }, { "epoch": 1.0647199213814404, "grad_norm": 0.4692640333578925, "learning_rate": 1.923164993517807e-05, "loss": 0.2753, "step": 948 }, { "epoch": 1.0658430436613786, "grad_norm": 0.47927624646872385, "learning_rate": 1.9228631762942307e-05, "loss": 0.2738, "step": 949 }, { "epoch": 1.0669661659413168, "grad_norm": 0.5057928867460986, "learning_rate": 1.9225607912227864e-05, "loss": 0.2734, "step": 950 }, { "epoch": 1.0680892882212552, "grad_norm": 0.46267709140380153, "learning_rate": 1.922257838489535e-05, "loss": 0.2751, "step": 951 }, { "epoch": 1.0692124105011933, "grad_norm": 0.5332480394868323, "learning_rate": 1.9219543182808862e-05, "loss": 0.2973, "step": 952 }, { "epoch": 1.0703355327811315, "grad_norm": 0.4839164977782545, "learning_rate": 1.9216502307836002e-05, "loss": 0.2835, "step": 953 }, { "epoch": 1.0714586550610699, "grad_norm": 0.4999219278635034, "learning_rate": 1.9213455761847845e-05, "loss": 0.2927, "step": 954 }, { "epoch": 1.072581777341008, "grad_norm": 0.5232385009052505, "learning_rate": 1.921040354671897e-05, "loss": 0.3031, "step": 955 }, { "epoch": 1.0737048996209462, "grad_norm": 0.4541280942778829, "learning_rate": 1.9207345664327434e-05, "loss": 0.2697, "step": 956 }, { "epoch": 1.0748280219008846, "grad_norm": 0.5031936868920343, "learning_rate": 1.9204282116554792e-05, "loss": 0.2867, "step": 957 }, { "epoch": 1.0759511441808227, "grad_norm": 0.4713104216421202, "learning_rate": 1.9201212905286074e-05, "loss": 0.2788, "step": 958 }, { "epoch": 1.0770742664607609, "grad_norm": 0.45894312520199465, "learning_rate": 1.91981380324098e-05, "loss": 0.2848, "step": 959 }, { "epoch": 1.078197388740699, "grad_norm": 0.5223017626157912, "learning_rate": 1.919505749981798e-05, "loss": 0.293, "step": 960 }, { "epoch": 1.0793205110206374, "grad_norm": 0.47157901979531547, "learning_rate": 1.9191971309406085e-05, "loss": 0.2774, "step": 961 }, { "epoch": 1.0804436333005756, "grad_norm": 0.4988305143165187, "learning_rate": 1.9188879463073093e-05, "loss": 0.2967, "step": 962 }, { "epoch": 1.0815667555805137, "grad_norm": 0.47998629427503187, "learning_rate": 1.918578196272145e-05, "loss": 0.2739, "step": 963 }, { "epoch": 1.0826898778604521, "grad_norm": 0.46801323046606513, "learning_rate": 1.918267881025708e-05, "loss": 0.2689, "step": 964 }, { "epoch": 1.0838130001403903, "grad_norm": 0.42494477612916176, "learning_rate": 1.9179570007589384e-05, "loss": 0.2707, "step": 965 }, { "epoch": 1.0849361224203284, "grad_norm": 0.44609307554632605, "learning_rate": 1.9176455556631247e-05, "loss": 0.2626, "step": 966 }, { "epoch": 1.0860592447002668, "grad_norm": 0.5054138308738797, "learning_rate": 1.9173335459299025e-05, "loss": 0.2902, "step": 967 }, { "epoch": 1.087182366980205, "grad_norm": 0.44616997151039844, "learning_rate": 1.9170209717512546e-05, "loss": 0.2687, "step": 968 }, { "epoch": 1.0883054892601431, "grad_norm": 0.497359281811381, "learning_rate": 1.9167078333195116e-05, "loss": 0.3009, "step": 969 }, { "epoch": 1.0894286115400815, "grad_norm": 0.4643079791688917, "learning_rate": 1.9163941308273504e-05, "loss": 0.2792, "step": 970 }, { "epoch": 1.0905517338200197, "grad_norm": 0.47811323587663396, "learning_rate": 1.916079864467796e-05, "loss": 0.2976, "step": 971 }, { "epoch": 1.0916748560999578, "grad_norm": 0.4360155405235776, "learning_rate": 1.9157650344342205e-05, "loss": 0.2621, "step": 972 }, { "epoch": 1.0927979783798962, "grad_norm": 0.45820803802250853, "learning_rate": 1.9154496409203416e-05, "loss": 0.2901, "step": 973 }, { "epoch": 1.0939211006598344, "grad_norm": 0.4370878083723664, "learning_rate": 1.9151336841202246e-05, "loss": 0.2744, "step": 974 }, { "epoch": 1.0950442229397725, "grad_norm": 0.450814956559242, "learning_rate": 1.9148171642282812e-05, "loss": 0.2691, "step": 975 }, { "epoch": 1.0961673452197107, "grad_norm": 0.4621925080101941, "learning_rate": 1.9145000814392696e-05, "loss": 0.2737, "step": 976 }, { "epoch": 1.097290467499649, "grad_norm": 0.4737134489184813, "learning_rate": 1.914182435948294e-05, "loss": 0.2937, "step": 977 }, { "epoch": 1.0984135897795873, "grad_norm": 0.4510174026564208, "learning_rate": 1.9138642279508054e-05, "loss": 0.2743, "step": 978 }, { "epoch": 1.0995367120595254, "grad_norm": 0.48310076248938627, "learning_rate": 1.913545457642601e-05, "loss": 0.2821, "step": 979 }, { "epoch": 1.1006598343394638, "grad_norm": 0.45975463127730243, "learning_rate": 1.9132261252198236e-05, "loss": 0.2743, "step": 980 }, { "epoch": 1.101782956619402, "grad_norm": 0.47566940351291187, "learning_rate": 1.912906230878961e-05, "loss": 0.2901, "step": 981 }, { "epoch": 1.1029060788993401, "grad_norm": 0.49639829721895823, "learning_rate": 1.912585774816849e-05, "loss": 0.3062, "step": 982 }, { "epoch": 1.1040292011792785, "grad_norm": 0.47301791959429074, "learning_rate": 1.912264757230667e-05, "loss": 0.2869, "step": 983 }, { "epoch": 1.1051523234592167, "grad_norm": 0.43846903068715937, "learning_rate": 1.9119431783179413e-05, "loss": 0.2585, "step": 984 }, { "epoch": 1.1062754457391548, "grad_norm": 0.5192008280625632, "learning_rate": 1.911621038276542e-05, "loss": 0.3036, "step": 985 }, { "epoch": 1.107398568019093, "grad_norm": 0.5011419662554821, "learning_rate": 1.911298337304686e-05, "loss": 0.2813, "step": 986 }, { "epoch": 1.1085216902990314, "grad_norm": 0.48085556410924124, "learning_rate": 1.9109750756009348e-05, "loss": 0.2857, "step": 987 }, { "epoch": 1.1096448125789695, "grad_norm": 0.5116521172198443, "learning_rate": 1.9106512533641948e-05, "loss": 0.2867, "step": 988 }, { "epoch": 1.1107679348589077, "grad_norm": 0.4753703314504095, "learning_rate": 1.9103268707937174e-05, "loss": 0.2723, "step": 989 }, { "epoch": 1.111891057138846, "grad_norm": 0.48728987587657996, "learning_rate": 1.9100019280890984e-05, "loss": 0.2938, "step": 990 }, { "epoch": 1.1130141794187842, "grad_norm": 0.46500967954346223, "learning_rate": 1.909676425450279e-05, "loss": 0.277, "step": 991 }, { "epoch": 1.1141373016987224, "grad_norm": 0.4311368054737046, "learning_rate": 1.9093503630775445e-05, "loss": 0.2589, "step": 992 }, { "epoch": 1.1152604239786608, "grad_norm": 0.4806685121172132, "learning_rate": 1.9090237411715248e-05, "loss": 0.2828, "step": 993 }, { "epoch": 1.116383546258599, "grad_norm": 0.47413677573033675, "learning_rate": 1.9086965599331938e-05, "loss": 0.2922, "step": 994 }, { "epoch": 1.117506668538537, "grad_norm": 0.463329362201204, "learning_rate": 1.9083688195638694e-05, "loss": 0.2991, "step": 995 }, { "epoch": 1.1186297908184755, "grad_norm": 0.424765333204886, "learning_rate": 1.9080405202652143e-05, "loss": 0.2589, "step": 996 }, { "epoch": 1.1197529130984136, "grad_norm": 0.4664315339207049, "learning_rate": 1.9077116622392347e-05, "loss": 0.278, "step": 997 }, { "epoch": 1.1208760353783518, "grad_norm": 0.4580593579853824, "learning_rate": 1.9073822456882806e-05, "loss": 0.2781, "step": 998 }, { "epoch": 1.12199915765829, "grad_norm": 0.4428162338559992, "learning_rate": 1.907052270815045e-05, "loss": 0.2788, "step": 999 }, { "epoch": 1.1231222799382283, "grad_norm": 0.4317189895111999, "learning_rate": 1.9067217378225655e-05, "loss": 0.273, "step": 1000 }, { "epoch": 1.1242454022181665, "grad_norm": 0.46512373441877797, "learning_rate": 1.906390646914223e-05, "loss": 0.2987, "step": 1001 }, { "epoch": 1.1253685244981046, "grad_norm": 0.48722030698963165, "learning_rate": 1.906058998293741e-05, "loss": 0.3105, "step": 1002 }, { "epoch": 1.126491646778043, "grad_norm": 0.4758813346060563, "learning_rate": 1.9057267921651865e-05, "loss": 0.3104, "step": 1003 }, { "epoch": 1.1276147690579812, "grad_norm": 0.4603226186566387, "learning_rate": 1.9053940287329696e-05, "loss": 0.2714, "step": 1004 }, { "epoch": 1.1287378913379194, "grad_norm": 0.5016859736501262, "learning_rate": 1.9050607082018437e-05, "loss": 0.3054, "step": 1005 }, { "epoch": 1.1298610136178577, "grad_norm": 0.45587589839460446, "learning_rate": 1.9047268307769044e-05, "loss": 0.295, "step": 1006 }, { "epoch": 1.130984135897796, "grad_norm": 0.4681404958373164, "learning_rate": 1.90439239666359e-05, "loss": 0.2888, "step": 1007 }, { "epoch": 1.132107258177734, "grad_norm": 0.472944576189367, "learning_rate": 1.9040574060676813e-05, "loss": 0.2875, "step": 1008 }, { "epoch": 1.1332303804576722, "grad_norm": 0.4496881501850784, "learning_rate": 1.903721859195302e-05, "loss": 0.28, "step": 1009 }, { "epoch": 1.1343535027376106, "grad_norm": 0.469620980155489, "learning_rate": 1.9033857562529176e-05, "loss": 0.2731, "step": 1010 }, { "epoch": 1.1354766250175488, "grad_norm": 0.45917004853908955, "learning_rate": 1.9030490974473363e-05, "loss": 0.283, "step": 1011 }, { "epoch": 1.136599747297487, "grad_norm": 0.47268979034926534, "learning_rate": 1.902711882985708e-05, "loss": 0.2866, "step": 1012 }, { "epoch": 1.1377228695774253, "grad_norm": 0.4383225568741319, "learning_rate": 1.9023741130755237e-05, "loss": 0.2756, "step": 1013 }, { "epoch": 1.1388459918573635, "grad_norm": 0.46958663656332716, "learning_rate": 1.9020357879246173e-05, "loss": 0.2826, "step": 1014 }, { "epoch": 1.1399691141373016, "grad_norm": 0.45079995860583716, "learning_rate": 1.9016969077411645e-05, "loss": 0.2872, "step": 1015 }, { "epoch": 1.14109223641724, "grad_norm": 0.47973270092319154, "learning_rate": 1.9013574727336817e-05, "loss": 0.3096, "step": 1016 }, { "epoch": 1.1422153586971782, "grad_norm": 0.501462362289904, "learning_rate": 1.9010174831110268e-05, "loss": 0.3117, "step": 1017 }, { "epoch": 1.1433384809771163, "grad_norm": 0.4513154018538136, "learning_rate": 1.9006769390823994e-05, "loss": 0.252, "step": 1018 }, { "epoch": 1.1444616032570547, "grad_norm": 0.4500736376781029, "learning_rate": 1.9003358408573396e-05, "loss": 0.2735, "step": 1019 }, { "epoch": 1.1455847255369929, "grad_norm": 0.5023208553675916, "learning_rate": 1.8999941886457292e-05, "loss": 0.2987, "step": 1020 }, { "epoch": 1.146707847816931, "grad_norm": 0.45282147717459975, "learning_rate": 1.8996519826577907e-05, "loss": 0.2651, "step": 1021 }, { "epoch": 1.1478309700968694, "grad_norm": 0.4648324226943947, "learning_rate": 1.899309223104087e-05, "loss": 0.2985, "step": 1022 }, { "epoch": 1.1489540923768076, "grad_norm": 0.4849118937452435, "learning_rate": 1.898965910195522e-05, "loss": 0.2966, "step": 1023 }, { "epoch": 1.1500772146567457, "grad_norm": 0.4845634318497336, "learning_rate": 1.89862204414334e-05, "loss": 0.2997, "step": 1024 }, { "epoch": 1.1512003369366839, "grad_norm": 0.4849781830896908, "learning_rate": 1.8982776251591247e-05, "loss": 0.301, "step": 1025 }, { "epoch": 1.1523234592166223, "grad_norm": 0.46264538140977657, "learning_rate": 1.8979326534548023e-05, "loss": 0.2892, "step": 1026 }, { "epoch": 1.1534465814965604, "grad_norm": 0.4751680003078539, "learning_rate": 1.8975871292426365e-05, "loss": 0.2943, "step": 1027 }, { "epoch": 1.1545697037764986, "grad_norm": 0.47000462603028687, "learning_rate": 1.8972410527352324e-05, "loss": 0.2944, "step": 1028 }, { "epoch": 1.155692826056437, "grad_norm": 0.46720471177522077, "learning_rate": 1.8968944241455352e-05, "loss": 0.2754, "step": 1029 }, { "epoch": 1.1568159483363751, "grad_norm": 0.4682045577743917, "learning_rate": 1.8965472436868288e-05, "loss": 0.2945, "step": 1030 }, { "epoch": 1.1579390706163133, "grad_norm": 0.4681264807044535, "learning_rate": 1.8961995115727373e-05, "loss": 0.2855, "step": 1031 }, { "epoch": 1.1590621928962517, "grad_norm": 0.46711223820824194, "learning_rate": 1.895851228017224e-05, "loss": 0.2828, "step": 1032 }, { "epoch": 1.1601853151761898, "grad_norm": 0.4305801770746027, "learning_rate": 1.8955023932345916e-05, "loss": 0.2572, "step": 1033 }, { "epoch": 1.161308437456128, "grad_norm": 0.4783787183648787, "learning_rate": 1.8951530074394828e-05, "loss": 0.3005, "step": 1034 }, { "epoch": 1.1624315597360662, "grad_norm": 0.4580608030880634, "learning_rate": 1.894803070846877e-05, "loss": 0.292, "step": 1035 }, { "epoch": 1.1635546820160045, "grad_norm": 0.44482733683564024, "learning_rate": 1.894452583672095e-05, "loss": 0.2814, "step": 1036 }, { "epoch": 1.1646778042959427, "grad_norm": 0.46029498474111563, "learning_rate": 1.8941015461307955e-05, "loss": 0.2893, "step": 1037 }, { "epoch": 1.1658009265758809, "grad_norm": 0.4615488233008817, "learning_rate": 1.8937499584389755e-05, "loss": 0.2932, "step": 1038 }, { "epoch": 1.1669240488558192, "grad_norm": 0.42956178577189436, "learning_rate": 1.8933978208129705e-05, "loss": 0.2739, "step": 1039 }, { "epoch": 1.1680471711357574, "grad_norm": 0.4802415097521688, "learning_rate": 1.893045133469455e-05, "loss": 0.2883, "step": 1040 }, { "epoch": 1.1691702934156956, "grad_norm": 0.4480867146233576, "learning_rate": 1.8926918966254416e-05, "loss": 0.2729, "step": 1041 }, { "epoch": 1.170293415695634, "grad_norm": 0.49140396781805756, "learning_rate": 1.8923381104982806e-05, "loss": 0.28, "step": 1042 }, { "epoch": 1.171416537975572, "grad_norm": 0.46490086726476276, "learning_rate": 1.8919837753056606e-05, "loss": 0.2657, "step": 1043 }, { "epoch": 1.1725396602555103, "grad_norm": 0.4657704225340458, "learning_rate": 1.8916288912656077e-05, "loss": 0.3002, "step": 1044 }, { "epoch": 1.1736627825354486, "grad_norm": 0.45315444310300473, "learning_rate": 1.891273458596486e-05, "loss": 0.2671, "step": 1045 }, { "epoch": 1.1747859048153868, "grad_norm": 0.5014981783064109, "learning_rate": 1.8909174775169968e-05, "loss": 0.2791, "step": 1046 }, { "epoch": 1.175909027095325, "grad_norm": 0.5179938024360514, "learning_rate": 1.89056094824618e-05, "loss": 0.3352, "step": 1047 }, { "epoch": 1.1770321493752633, "grad_norm": 0.4232454478701444, "learning_rate": 1.8902038710034113e-05, "loss": 0.2557, "step": 1048 }, { "epoch": 1.1781552716552015, "grad_norm": 0.5139754763017151, "learning_rate": 1.889846246008405e-05, "loss": 0.2959, "step": 1049 }, { "epoch": 1.1792783939351397, "grad_norm": 0.48647478582657605, "learning_rate": 1.8894880734812106e-05, "loss": 0.3023, "step": 1050 }, { "epoch": 1.1804015162150778, "grad_norm": 0.46616236114663656, "learning_rate": 1.8891293536422165e-05, "loss": 0.2845, "step": 1051 }, { "epoch": 1.1815246384950162, "grad_norm": 0.4287393409593001, "learning_rate": 1.888770086712147e-05, "loss": 0.2696, "step": 1052 }, { "epoch": 1.1826477607749544, "grad_norm": 0.4949520898537966, "learning_rate": 1.8884102729120624e-05, "loss": 0.3007, "step": 1053 }, { "epoch": 1.1837708830548925, "grad_norm": 0.446833141730547, "learning_rate": 1.88804991246336e-05, "loss": 0.2692, "step": 1054 }, { "epoch": 1.184894005334831, "grad_norm": 0.48608855458145783, "learning_rate": 1.8876890055877745e-05, "loss": 0.318, "step": 1055 }, { "epoch": 1.186017127614769, "grad_norm": 0.457882561215241, "learning_rate": 1.887327552507375e-05, "loss": 0.2802, "step": 1056 }, { "epoch": 1.1871402498947072, "grad_norm": 0.45242524194849565, "learning_rate": 1.886965553444568e-05, "loss": 0.264, "step": 1057 }, { "epoch": 1.1882633721746454, "grad_norm": 0.48868541054842407, "learning_rate": 1.886603008622095e-05, "loss": 0.3076, "step": 1058 }, { "epoch": 1.1893864944545838, "grad_norm": 0.44249560468632987, "learning_rate": 1.8862399182630347e-05, "loss": 0.2661, "step": 1059 }, { "epoch": 1.190509616734522, "grad_norm": 0.4588142894528934, "learning_rate": 1.8858762825908e-05, "loss": 0.2865, "step": 1060 }, { "epoch": 1.19163273901446, "grad_norm": 0.46946406735526547, "learning_rate": 1.8855121018291394e-05, "loss": 0.3046, "step": 1061 }, { "epoch": 1.1927558612943985, "grad_norm": 0.45375091401248635, "learning_rate": 1.8851473762021384e-05, "loss": 0.2732, "step": 1062 }, { "epoch": 1.1938789835743366, "grad_norm": 0.4676425499504646, "learning_rate": 1.8847821059342163e-05, "loss": 0.2932, "step": 1063 }, { "epoch": 1.1950021058542748, "grad_norm": 0.46249196792362884, "learning_rate": 1.8844162912501277e-05, "loss": 0.302, "step": 1064 }, { "epoch": 1.1961252281342132, "grad_norm": 0.4812578734002899, "learning_rate": 1.8840499323749624e-05, "loss": 0.303, "step": 1065 }, { "epoch": 1.1972483504141513, "grad_norm": 0.48405098483559544, "learning_rate": 1.883683029534145e-05, "loss": 0.2887, "step": 1066 }, { "epoch": 1.1983714726940895, "grad_norm": 0.4608928377281792, "learning_rate": 1.8833155829534356e-05, "loss": 0.2958, "step": 1067 }, { "epoch": 1.1994945949740279, "grad_norm": 0.43401836542504524, "learning_rate": 1.8829475928589272e-05, "loss": 0.2603, "step": 1068 }, { "epoch": 1.200617717253966, "grad_norm": 0.4780260992243674, "learning_rate": 1.8825790594770487e-05, "loss": 0.2809, "step": 1069 }, { "epoch": 1.2017408395339042, "grad_norm": 0.44275206432188047, "learning_rate": 1.882209983034562e-05, "loss": 0.2773, "step": 1070 }, { "epoch": 1.2028639618138426, "grad_norm": 0.47460553242353476, "learning_rate": 1.881840363758565e-05, "loss": 0.3128, "step": 1071 }, { "epoch": 1.2039870840937807, "grad_norm": 0.46237921124263054, "learning_rate": 1.881470201876488e-05, "loss": 0.2952, "step": 1072 }, { "epoch": 1.205110206373719, "grad_norm": 0.48654497979875483, "learning_rate": 1.8810994976160955e-05, "loss": 0.3147, "step": 1073 }, { "epoch": 1.2062333286536573, "grad_norm": 0.4921863140740264, "learning_rate": 1.880728251205486e-05, "loss": 0.3093, "step": 1074 }, { "epoch": 1.2073564509335954, "grad_norm": 0.471423938361344, "learning_rate": 1.8803564628730916e-05, "loss": 0.2903, "step": 1075 }, { "epoch": 1.2084795732135336, "grad_norm": 0.46874454339863675, "learning_rate": 1.8799841328476776e-05, "loss": 0.2889, "step": 1076 }, { "epoch": 1.2096026954934718, "grad_norm": 0.45281746346669155, "learning_rate": 1.8796112613583427e-05, "loss": 0.2895, "step": 1077 }, { "epoch": 1.2107258177734102, "grad_norm": 0.45353972532299486, "learning_rate": 1.8792378486345196e-05, "loss": 0.2789, "step": 1078 }, { "epoch": 1.2118489400533483, "grad_norm": 0.4725888875307044, "learning_rate": 1.878863894905972e-05, "loss": 0.2847, "step": 1079 }, { "epoch": 1.2129720623332865, "grad_norm": 0.442084650431914, "learning_rate": 1.878489400402799e-05, "loss": 0.274, "step": 1080 }, { "epoch": 1.2140951846132249, "grad_norm": 0.49254544284536805, "learning_rate": 1.8781143653554305e-05, "loss": 0.3088, "step": 1081 }, { "epoch": 1.215218306893163, "grad_norm": 0.4617957372006431, "learning_rate": 1.8777387899946294e-05, "loss": 0.2879, "step": 1082 }, { "epoch": 1.2163414291731012, "grad_norm": 0.4646226561763834, "learning_rate": 1.877362674551492e-05, "loss": 0.2929, "step": 1083 }, { "epoch": 1.2174645514530393, "grad_norm": 0.5009688641472497, "learning_rate": 1.876986019257446e-05, "loss": 0.314, "step": 1084 }, { "epoch": 1.2185876737329777, "grad_norm": 0.49964534711190756, "learning_rate": 1.8766088243442514e-05, "loss": 0.32, "step": 1085 }, { "epoch": 1.2197107960129159, "grad_norm": 0.47363249608950986, "learning_rate": 1.8762310900440007e-05, "loss": 0.2895, "step": 1086 }, { "epoch": 1.220833918292854, "grad_norm": 0.4781168700712531, "learning_rate": 1.875852816589118e-05, "loss": 0.2826, "step": 1087 }, { "epoch": 1.2219570405727924, "grad_norm": 0.45871143064600073, "learning_rate": 1.8754740042123583e-05, "loss": 0.2762, "step": 1088 }, { "epoch": 1.2230801628527306, "grad_norm": 0.5147809750511391, "learning_rate": 1.8750946531468098e-05, "loss": 0.3179, "step": 1089 }, { "epoch": 1.2242032851326687, "grad_norm": 0.45406534949791066, "learning_rate": 1.874714763625892e-05, "loss": 0.2656, "step": 1090 }, { "epoch": 1.2253264074126071, "grad_norm": 0.46488456668844064, "learning_rate": 1.8743343358833536e-05, "loss": 0.2867, "step": 1091 }, { "epoch": 1.2264495296925453, "grad_norm": 0.46081222446789055, "learning_rate": 1.8739533701532768e-05, "loss": 0.2891, "step": 1092 }, { "epoch": 1.2275726519724834, "grad_norm": 0.47759019663041424, "learning_rate": 1.873571866670074e-05, "loss": 0.2884, "step": 1093 }, { "epoch": 1.2286957742524218, "grad_norm": 0.4825230030891384, "learning_rate": 1.8731898256684885e-05, "loss": 0.2886, "step": 1094 }, { "epoch": 1.22981889653236, "grad_norm": 0.4568440667981614, "learning_rate": 1.8728072473835944e-05, "loss": 0.2801, "step": 1095 }, { "epoch": 1.2309420188122981, "grad_norm": 0.4517593846376601, "learning_rate": 1.8724241320507958e-05, "loss": 0.2687, "step": 1096 }, { "epoch": 1.2320651410922365, "grad_norm": 0.473458572983134, "learning_rate": 1.8720404799058284e-05, "loss": 0.2811, "step": 1097 }, { "epoch": 1.2331882633721747, "grad_norm": 0.47853878049484405, "learning_rate": 1.8716562911847572e-05, "loss": 0.3026, "step": 1098 }, { "epoch": 1.2343113856521128, "grad_norm": 0.4732508123997025, "learning_rate": 1.8712715661239783e-05, "loss": 0.2858, "step": 1099 }, { "epoch": 1.235434507932051, "grad_norm": 0.4943536151259743, "learning_rate": 1.8708863049602163e-05, "loss": 0.3155, "step": 1100 }, { "epoch": 1.2365576302119894, "grad_norm": 0.45750032528758056, "learning_rate": 1.8705005079305274e-05, "loss": 0.2851, "step": 1101 }, { "epoch": 1.2376807524919275, "grad_norm": 0.4521267906219757, "learning_rate": 1.8701141752722966e-05, "loss": 0.2747, "step": 1102 }, { "epoch": 1.2388038747718657, "grad_norm": 0.46594419380408797, "learning_rate": 1.8697273072232385e-05, "loss": 0.3064, "step": 1103 }, { "epoch": 1.239926997051804, "grad_norm": 0.43383827589550866, "learning_rate": 1.8693399040213974e-05, "loss": 0.2676, "step": 1104 }, { "epoch": 1.2410501193317423, "grad_norm": 0.4474050960888805, "learning_rate": 1.8689519659051467e-05, "loss": 0.2741, "step": 1105 }, { "epoch": 1.2421732416116804, "grad_norm": 0.45523790295881006, "learning_rate": 1.868563493113189e-05, "loss": 0.2881, "step": 1106 }, { "epoch": 1.2432963638916188, "grad_norm": 0.4584723123279694, "learning_rate": 1.8681744858845555e-05, "loss": 0.284, "step": 1107 }, { "epoch": 1.244419486171557, "grad_norm": 0.46457228328541583, "learning_rate": 1.8677849444586073e-05, "loss": 0.2799, "step": 1108 }, { "epoch": 1.2455426084514951, "grad_norm": 0.45539666514036803, "learning_rate": 1.8673948690750333e-05, "loss": 0.2887, "step": 1109 }, { "epoch": 1.2466657307314333, "grad_norm": 0.4677915933786495, "learning_rate": 1.867004259973851e-05, "loss": 0.292, "step": 1110 }, { "epoch": 1.2477888530113717, "grad_norm": 0.4667047750396251, "learning_rate": 1.866613117395407e-05, "loss": 0.2889, "step": 1111 }, { "epoch": 1.2489119752913098, "grad_norm": 0.48997059381677216, "learning_rate": 1.8662214415803748e-05, "loss": 0.2968, "step": 1112 }, { "epoch": 1.250035097571248, "grad_norm": 0.4384615139812607, "learning_rate": 1.8658292327697574e-05, "loss": 0.2715, "step": 1113 }, { "epoch": 1.2511582198511864, "grad_norm": 0.4547012939945187, "learning_rate": 1.865436491204885e-05, "loss": 0.2747, "step": 1114 }, { "epoch": 1.2522813421311245, "grad_norm": 0.5098417390678678, "learning_rate": 1.865043217127416e-05, "loss": 0.3327, "step": 1115 }, { "epoch": 1.2534044644110627, "grad_norm": 0.47158047623028704, "learning_rate": 1.864649410779336e-05, "loss": 0.2977, "step": 1116 }, { "epoch": 1.254527586691001, "grad_norm": 0.47071696073688374, "learning_rate": 1.8642550724029584e-05, "loss": 0.3032, "step": 1117 }, { "epoch": 1.2556507089709392, "grad_norm": 0.4756046517133648, "learning_rate": 1.863860202240924e-05, "loss": 0.3077, "step": 1118 }, { "epoch": 1.2567738312508774, "grad_norm": 0.47511377917679826, "learning_rate": 1.863464800536201e-05, "loss": 0.2969, "step": 1119 }, { "epoch": 1.2578969535308158, "grad_norm": 0.47570563192173576, "learning_rate": 1.8630688675320844e-05, "loss": 0.3081, "step": 1120 }, { "epoch": 1.259020075810754, "grad_norm": 0.4905178570778228, "learning_rate": 1.8626724034721955e-05, "loss": 0.2967, "step": 1121 }, { "epoch": 1.260143198090692, "grad_norm": 0.4603739199538433, "learning_rate": 1.8622754086004837e-05, "loss": 0.2852, "step": 1122 }, { "epoch": 1.2612663203706305, "grad_norm": 0.48083734404310896, "learning_rate": 1.8618778831612243e-05, "loss": 0.2836, "step": 1123 }, { "epoch": 1.2623894426505686, "grad_norm": 0.46026086440947167, "learning_rate": 1.8614798273990186e-05, "loss": 0.2785, "step": 1124 }, { "epoch": 1.2635125649305068, "grad_norm": 0.49105292345069845, "learning_rate": 1.8610812415587948e-05, "loss": 0.2841, "step": 1125 }, { "epoch": 1.2646356872104452, "grad_norm": 0.45003283326379967, "learning_rate": 1.860682125885808e-05, "loss": 0.2817, "step": 1126 }, { "epoch": 1.2657588094903833, "grad_norm": 0.49522134976029414, "learning_rate": 1.860282480625637e-05, "loss": 0.3027, "step": 1127 }, { "epoch": 1.2668819317703215, "grad_norm": 0.504015437884683, "learning_rate": 1.859882306024189e-05, "loss": 0.3062, "step": 1128 }, { "epoch": 1.2680050540502597, "grad_norm": 0.4935114092813406, "learning_rate": 1.8594816023276954e-05, "loss": 0.3145, "step": 1129 }, { "epoch": 1.269128176330198, "grad_norm": 0.45349496634172853, "learning_rate": 1.8590803697827138e-05, "loss": 0.2756, "step": 1130 }, { "epoch": 1.2702512986101362, "grad_norm": 0.4923320856662289, "learning_rate": 1.8586786086361268e-05, "loss": 0.293, "step": 1131 }, { "epoch": 1.2713744208900744, "grad_norm": 0.48349777935390137, "learning_rate": 1.8582763191351427e-05, "loss": 0.2881, "step": 1132 }, { "epoch": 1.2724975431700125, "grad_norm": 0.45227425022413825, "learning_rate": 1.8578735015272947e-05, "loss": 0.2738, "step": 1133 }, { "epoch": 1.273620665449951, "grad_norm": 0.4796542700286378, "learning_rate": 1.8574701560604405e-05, "loss": 0.2974, "step": 1134 }, { "epoch": 1.274743787729889, "grad_norm": 0.45414376951278596, "learning_rate": 1.8570662829827632e-05, "loss": 0.3021, "step": 1135 }, { "epoch": 1.2758669100098272, "grad_norm": 0.4329561559626584, "learning_rate": 1.8566618825427704e-05, "loss": 0.2636, "step": 1136 }, { "epoch": 1.2769900322897656, "grad_norm": 0.49933859807478426, "learning_rate": 1.8562569549892945e-05, "loss": 0.3034, "step": 1137 }, { "epoch": 1.2781131545697038, "grad_norm": 0.4531437341854577, "learning_rate": 1.855851500571491e-05, "loss": 0.281, "step": 1138 }, { "epoch": 1.279236276849642, "grad_norm": 0.4567172782028271, "learning_rate": 1.8554455195388414e-05, "loss": 0.2751, "step": 1139 }, { "epoch": 1.2803593991295803, "grad_norm": 0.5083738307195607, "learning_rate": 1.8550390121411497e-05, "loss": 0.2964, "step": 1140 }, { "epoch": 1.2814825214095185, "grad_norm": 0.4599915752167777, "learning_rate": 1.8546319786285443e-05, "loss": 0.2804, "step": 1141 }, { "epoch": 1.2826056436894566, "grad_norm": 0.47423327374698915, "learning_rate": 1.854224419251478e-05, "loss": 0.3044, "step": 1142 }, { "epoch": 1.283728765969395, "grad_norm": 0.47354584223797824, "learning_rate": 1.853816334260726e-05, "loss": 0.3022, "step": 1143 }, { "epoch": 1.2848518882493332, "grad_norm": 0.48012669773887895, "learning_rate": 1.8534077239073877e-05, "loss": 0.291, "step": 1144 }, { "epoch": 1.2859750105292713, "grad_norm": 0.4835082015975579, "learning_rate": 1.8529985884428855e-05, "loss": 0.2957, "step": 1145 }, { "epoch": 1.2870981328092097, "grad_norm": 0.44167167547950387, "learning_rate": 1.8525889281189654e-05, "loss": 0.2927, "step": 1146 }, { "epoch": 1.2882212550891479, "grad_norm": 0.4526647766195691, "learning_rate": 1.8521787431876954e-05, "loss": 0.2801, "step": 1147 }, { "epoch": 1.289344377369086, "grad_norm": 0.4568245323185659, "learning_rate": 1.8517680339014667e-05, "loss": 0.2648, "step": 1148 }, { "epoch": 1.2904674996490244, "grad_norm": 0.4883019800865997, "learning_rate": 1.8513568005129937e-05, "loss": 0.2865, "step": 1149 }, { "epoch": 1.2915906219289626, "grad_norm": 0.47025667774471347, "learning_rate": 1.8509450432753123e-05, "loss": 0.2938, "step": 1150 }, { "epoch": 1.2927137442089007, "grad_norm": 0.4428247847588197, "learning_rate": 1.8505327624417816e-05, "loss": 0.2732, "step": 1151 }, { "epoch": 1.293836866488839, "grad_norm": 0.4774238303086379, "learning_rate": 1.8501199582660824e-05, "loss": 0.3058, "step": 1152 }, { "epoch": 1.2949599887687773, "grad_norm": 0.4948410577106829, "learning_rate": 1.849706631002218e-05, "loss": 0.3044, "step": 1153 }, { "epoch": 1.2960831110487154, "grad_norm": 0.48963021776942295, "learning_rate": 1.849292780904513e-05, "loss": 0.3007, "step": 1154 }, { "epoch": 1.2972062333286536, "grad_norm": 0.4741585592952024, "learning_rate": 1.8488784082276137e-05, "loss": 0.2818, "step": 1155 }, { "epoch": 1.2983293556085918, "grad_norm": 0.4604185684567528, "learning_rate": 1.848463513226488e-05, "loss": 0.2471, "step": 1156 }, { "epoch": 1.2994524778885301, "grad_norm": 0.4877301769198903, "learning_rate": 1.848048096156426e-05, "loss": 0.2926, "step": 1157 }, { "epoch": 1.3005756001684683, "grad_norm": 0.4742673297417078, "learning_rate": 1.8476321572730382e-05, "loss": 0.2907, "step": 1158 }, { "epoch": 1.3016987224484065, "grad_norm": 0.4748985003791319, "learning_rate": 1.847215696832256e-05, "loss": 0.2857, "step": 1159 }, { "epoch": 1.3028218447283448, "grad_norm": 0.4785822244074492, "learning_rate": 1.8467987150903325e-05, "loss": 0.2896, "step": 1160 }, { "epoch": 1.303944967008283, "grad_norm": 0.4852477184928872, "learning_rate": 1.846381212303841e-05, "loss": 0.3077, "step": 1161 }, { "epoch": 1.3050680892882212, "grad_norm": 0.46752327714434433, "learning_rate": 1.8459631887296757e-05, "loss": 0.3059, "step": 1162 }, { "epoch": 1.3061912115681595, "grad_norm": 0.491665240404064, "learning_rate": 1.8455446446250508e-05, "loss": 0.2982, "step": 1163 }, { "epoch": 1.3073143338480977, "grad_norm": 0.4481191351234828, "learning_rate": 1.8451255802475014e-05, "loss": 0.2835, "step": 1164 }, { "epoch": 1.3084374561280359, "grad_norm": 0.4419919772694303, "learning_rate": 1.8447059958548822e-05, "loss": 0.2744, "step": 1165 }, { "epoch": 1.3095605784079742, "grad_norm": 0.46574040350278373, "learning_rate": 1.8442858917053682e-05, "loss": 0.291, "step": 1166 }, { "epoch": 1.3106837006879124, "grad_norm": 0.43091837390173826, "learning_rate": 1.843865268057454e-05, "loss": 0.2657, "step": 1167 }, { "epoch": 1.3118068229678506, "grad_norm": 0.4642199123609139, "learning_rate": 1.843444125169954e-05, "loss": 0.2876, "step": 1168 }, { "epoch": 1.312929945247789, "grad_norm": 0.457887008950542, "learning_rate": 1.843022463302002e-05, "loss": 0.2727, "step": 1169 }, { "epoch": 1.314053067527727, "grad_norm": 0.44022972541172534, "learning_rate": 1.8426002827130517e-05, "loss": 0.2892, "step": 1170 }, { "epoch": 1.3151761898076653, "grad_norm": 0.4621964331966014, "learning_rate": 1.842177583662875e-05, "loss": 0.3, "step": 1171 }, { "epoch": 1.3162993120876036, "grad_norm": 0.44527517838102726, "learning_rate": 1.8417543664115632e-05, "loss": 0.286, "step": 1172 }, { "epoch": 1.3174224343675418, "grad_norm": 0.455981271490681, "learning_rate": 1.8413306312195265e-05, "loss": 0.2785, "step": 1173 }, { "epoch": 1.31854555664748, "grad_norm": 0.47535325172610315, "learning_rate": 1.840906378347494e-05, "loss": 0.3106, "step": 1174 }, { "epoch": 1.3196686789274183, "grad_norm": 0.4559106843629929, "learning_rate": 1.8404816080565133e-05, "loss": 0.2955, "step": 1175 }, { "epoch": 1.3207918012073565, "grad_norm": 0.4772186174990515, "learning_rate": 1.84005632060795e-05, "loss": 0.2859, "step": 1176 }, { "epoch": 1.3219149234872947, "grad_norm": 0.5126527615592691, "learning_rate": 1.8396305162634885e-05, "loss": 0.3171, "step": 1177 }, { "epoch": 1.3230380457672328, "grad_norm": 0.4863596644425538, "learning_rate": 1.83920419528513e-05, "loss": 0.3076, "step": 1178 }, { "epoch": 1.3241611680471712, "grad_norm": 0.4329722715025753, "learning_rate": 1.838777357935196e-05, "loss": 0.2594, "step": 1179 }, { "epoch": 1.3252842903271094, "grad_norm": 0.4735307567938337, "learning_rate": 1.8383500044763226e-05, "loss": 0.2752, "step": 1180 }, { "epoch": 1.3264074126070475, "grad_norm": 0.4708270647203179, "learning_rate": 1.837922135171466e-05, "loss": 0.3007, "step": 1181 }, { "epoch": 1.3275305348869857, "grad_norm": 0.4328720336288202, "learning_rate": 1.837493750283899e-05, "loss": 0.2743, "step": 1182 }, { "epoch": 1.328653657166924, "grad_norm": 0.45831796385684703, "learning_rate": 1.8370648500772107e-05, "loss": 0.2961, "step": 1183 }, { "epoch": 1.3297767794468622, "grad_norm": 0.4507244345775207, "learning_rate": 1.836635434815309e-05, "loss": 0.2854, "step": 1184 }, { "epoch": 1.3308999017268004, "grad_norm": 0.454897865730844, "learning_rate": 1.8362055047624175e-05, "loss": 0.2685, "step": 1185 }, { "epoch": 1.3320230240067388, "grad_norm": 0.46791986097149424, "learning_rate": 1.835775060183077e-05, "loss": 0.2824, "step": 1186 }, { "epoch": 1.333146146286677, "grad_norm": 0.47734391363275325, "learning_rate": 1.8353441013421445e-05, "loss": 0.2919, "step": 1187 }, { "epoch": 1.334269268566615, "grad_norm": 0.4525116496155753, "learning_rate": 1.8349126285047937e-05, "loss": 0.2897, "step": 1188 }, { "epoch": 1.3353923908465535, "grad_norm": 0.46057547898872325, "learning_rate": 1.8344806419365152e-05, "loss": 0.2959, "step": 1189 }, { "epoch": 1.3365155131264916, "grad_norm": 0.43961943911433504, "learning_rate": 1.8340481419031146e-05, "loss": 0.2717, "step": 1190 }, { "epoch": 1.3376386354064298, "grad_norm": 0.46908924741585806, "learning_rate": 1.833615128670714e-05, "loss": 0.2921, "step": 1191 }, { "epoch": 1.3387617576863682, "grad_norm": 0.4619903044568609, "learning_rate": 1.8331816025057508e-05, "loss": 0.2898, "step": 1192 }, { "epoch": 1.3398848799663063, "grad_norm": 0.44532555047844774, "learning_rate": 1.8327475636749793e-05, "loss": 0.2824, "step": 1193 }, { "epoch": 1.3410080022462445, "grad_norm": 0.4770456698496007, "learning_rate": 1.8323130124454676e-05, "loss": 0.2934, "step": 1194 }, { "epoch": 1.3421311245261829, "grad_norm": 0.5013999364371049, "learning_rate": 1.8318779490846005e-05, "loss": 0.3106, "step": 1195 }, { "epoch": 1.343254246806121, "grad_norm": 0.4462481861250692, "learning_rate": 1.8314423738600765e-05, "loss": 0.2916, "step": 1196 }, { "epoch": 1.3443773690860592, "grad_norm": 0.4544363504817102, "learning_rate": 1.8310062870399105e-05, "loss": 0.2768, "step": 1197 }, { "epoch": 1.3455004913659976, "grad_norm": 0.4739036937639808, "learning_rate": 1.8305696888924312e-05, "loss": 0.2818, "step": 1198 }, { "epoch": 1.3466236136459357, "grad_norm": 0.4251780111412686, "learning_rate": 1.8301325796862825e-05, "loss": 0.2618, "step": 1199 }, { "epoch": 1.347746735925874, "grad_norm": 0.4905216280674908, "learning_rate": 1.829694959690422e-05, "loss": 0.2881, "step": 1200 }, { "epoch": 1.3488698582058123, "grad_norm": 0.47365961695255354, "learning_rate": 1.8292568291741228e-05, "loss": 0.2954, "step": 1201 }, { "epoch": 1.3499929804857504, "grad_norm": 0.46299602153786523, "learning_rate": 1.8288181884069707e-05, "loss": 0.284, "step": 1202 }, { "epoch": 1.3511161027656886, "grad_norm": 0.44021839710368676, "learning_rate": 1.828379037658867e-05, "loss": 0.2685, "step": 1203 }, { "epoch": 1.3522392250456268, "grad_norm": 0.5131025179704897, "learning_rate": 1.827939377200025e-05, "loss": 0.3053, "step": 1204 }, { "epoch": 1.3533623473255652, "grad_norm": 0.44547118449274636, "learning_rate": 1.8274992073009736e-05, "loss": 0.289, "step": 1205 }, { "epoch": 1.3544854696055033, "grad_norm": 0.44524428838589575, "learning_rate": 1.827058528232553e-05, "loss": 0.2849, "step": 1206 }, { "epoch": 1.3556085918854415, "grad_norm": 0.4649075563603362, "learning_rate": 1.8266173402659193e-05, "loss": 0.2807, "step": 1207 }, { "epoch": 1.3567317141653796, "grad_norm": 0.48613198722949613, "learning_rate": 1.826175643672539e-05, "loss": 0.2949, "step": 1208 }, { "epoch": 1.357854836445318, "grad_norm": 0.43266914602095147, "learning_rate": 1.8257334387241944e-05, "loss": 0.2712, "step": 1209 }, { "epoch": 1.3589779587252562, "grad_norm": 0.4612767862308836, "learning_rate": 1.8252907256929777e-05, "loss": 0.2802, "step": 1210 }, { "epoch": 1.3601010810051943, "grad_norm": 0.48733069495425535, "learning_rate": 1.8248475048512956e-05, "loss": 0.2995, "step": 1211 }, { "epoch": 1.3612242032851327, "grad_norm": 0.4633919892138498, "learning_rate": 1.8244037764718666e-05, "loss": 0.2945, "step": 1212 }, { "epoch": 1.3623473255650709, "grad_norm": 0.48010248333965416, "learning_rate": 1.8239595408277216e-05, "loss": 0.2702, "step": 1213 }, { "epoch": 1.363470447845009, "grad_norm": 0.44254247447818046, "learning_rate": 1.8235147981922042e-05, "loss": 0.2797, "step": 1214 }, { "epoch": 1.3645935701249474, "grad_norm": 0.4685652855739763, "learning_rate": 1.8230695488389688e-05, "loss": 0.2994, "step": 1215 }, { "epoch": 1.3657166924048856, "grad_norm": 0.46157477459451246, "learning_rate": 1.822623793041983e-05, "loss": 0.2945, "step": 1216 }, { "epoch": 1.3668398146848237, "grad_norm": 0.4509218965365652, "learning_rate": 1.8221775310755247e-05, "loss": 0.286, "step": 1217 }, { "epoch": 1.3679629369647621, "grad_norm": 0.4584479264400474, "learning_rate": 1.8217307632141835e-05, "loss": 0.2933, "step": 1218 }, { "epoch": 1.3690860592447003, "grad_norm": 0.4985130465087838, "learning_rate": 1.8212834897328614e-05, "loss": 0.3045, "step": 1219 }, { "epoch": 1.3702091815246384, "grad_norm": 0.47337122837553836, "learning_rate": 1.82083571090677e-05, "loss": 0.2836, "step": 1220 }, { "epoch": 1.3713323038045768, "grad_norm": 0.45448456689959993, "learning_rate": 1.8203874270114327e-05, "loss": 0.2708, "step": 1221 }, { "epoch": 1.372455426084515, "grad_norm": 0.4893145984532352, "learning_rate": 1.8199386383226835e-05, "loss": 0.2826, "step": 1222 }, { "epoch": 1.3735785483644531, "grad_norm": 0.44417991047867217, "learning_rate": 1.8194893451166673e-05, "loss": 0.2821, "step": 1223 }, { "epoch": 1.3747016706443915, "grad_norm": 0.44598812540160476, "learning_rate": 1.819039547669839e-05, "loss": 0.2774, "step": 1224 }, { "epoch": 1.3758247929243297, "grad_norm": 0.4776707667120244, "learning_rate": 1.818589246258964e-05, "loss": 0.3122, "step": 1225 }, { "epoch": 1.3769479152042678, "grad_norm": 0.5082949999474378, "learning_rate": 1.8181384411611173e-05, "loss": 0.2948, "step": 1226 }, { "epoch": 1.3780710374842062, "grad_norm": 0.4424043135503394, "learning_rate": 1.817687132653685e-05, "loss": 0.2743, "step": 1227 }, { "epoch": 1.3791941597641444, "grad_norm": 0.48246812847596726, "learning_rate": 1.8172353210143613e-05, "loss": 0.3154, "step": 1228 }, { "epoch": 1.3803172820440825, "grad_norm": 0.4637624706766306, "learning_rate": 1.8167830065211513e-05, "loss": 0.3047, "step": 1229 }, { "epoch": 1.3814404043240207, "grad_norm": 0.47224752107030954, "learning_rate": 1.8163301894523695e-05, "loss": 0.3023, "step": 1230 }, { "epoch": 1.3825635266039589, "grad_norm": 0.4698090761389382, "learning_rate": 1.8158768700866386e-05, "loss": 0.2635, "step": 1231 }, { "epoch": 1.3836866488838973, "grad_norm": 0.4957909790614304, "learning_rate": 1.8154230487028913e-05, "loss": 0.2928, "step": 1232 }, { "epoch": 1.3848097711638354, "grad_norm": 0.44768649725073345, "learning_rate": 1.8149687255803687e-05, "loss": 0.2874, "step": 1233 }, { "epoch": 1.3859328934437736, "grad_norm": 0.4758485891594923, "learning_rate": 1.814513900998621e-05, "loss": 0.2996, "step": 1234 }, { "epoch": 1.387056015723712, "grad_norm": 0.47272282269055016, "learning_rate": 1.8140585752375063e-05, "loss": 0.2884, "step": 1235 }, { "epoch": 1.3881791380036501, "grad_norm": 0.45230047857816796, "learning_rate": 1.8136027485771926e-05, "loss": 0.2767, "step": 1236 }, { "epoch": 1.3893022602835883, "grad_norm": 0.4507150818799199, "learning_rate": 1.813146421298154e-05, "loss": 0.2884, "step": 1237 }, { "epoch": 1.3904253825635267, "grad_norm": 0.45879614204982117, "learning_rate": 1.8126895936811745e-05, "loss": 0.2708, "step": 1238 }, { "epoch": 1.3915485048434648, "grad_norm": 0.45661246563867913, "learning_rate": 1.812232266007344e-05, "loss": 0.2737, "step": 1239 }, { "epoch": 1.392671627123403, "grad_norm": 0.4601276790260715, "learning_rate": 1.8117744385580627e-05, "loss": 0.2938, "step": 1240 }, { "epoch": 1.3937947494033414, "grad_norm": 0.48293515299373707, "learning_rate": 1.8113161116150356e-05, "loss": 0.2847, "step": 1241 }, { "epoch": 1.3949178716832795, "grad_norm": 0.5135873286633734, "learning_rate": 1.8108572854602774e-05, "loss": 0.3047, "step": 1242 }, { "epoch": 1.3960409939632177, "grad_norm": 0.49511291405284924, "learning_rate": 1.8103979603761084e-05, "loss": 0.296, "step": 1243 }, { "epoch": 1.397164116243156, "grad_norm": 0.4915844024660979, "learning_rate": 1.8099381366451562e-05, "loss": 0.3025, "step": 1244 }, { "epoch": 1.3982872385230942, "grad_norm": 0.457360486981577, "learning_rate": 1.8094778145503555e-05, "loss": 0.2809, "step": 1245 }, { "epoch": 1.3994103608030324, "grad_norm": 0.45194447137874866, "learning_rate": 1.8090169943749477e-05, "loss": 0.2815, "step": 1246 }, { "epoch": 1.4005334830829708, "grad_norm": 0.469717199113566, "learning_rate": 1.8085556764024804e-05, "loss": 0.2844, "step": 1247 }, { "epoch": 1.401656605362909, "grad_norm": 0.461335245541827, "learning_rate": 1.8080938609168073e-05, "loss": 0.2825, "step": 1248 }, { "epoch": 1.402779727642847, "grad_norm": 0.4710994100251086, "learning_rate": 1.8076315482020893e-05, "loss": 0.2925, "step": 1249 }, { "epoch": 1.4039028499227855, "grad_norm": 0.46856919190627966, "learning_rate": 1.8071687385427922e-05, "loss": 0.2925, "step": 1250 }, { "epoch": 1.4050259722027236, "grad_norm": 0.463686961189385, "learning_rate": 1.8067054322236876e-05, "loss": 0.2982, "step": 1251 }, { "epoch": 1.4061490944826618, "grad_norm": 0.4233668907762543, "learning_rate": 1.806241629529853e-05, "loss": 0.279, "step": 1252 }, { "epoch": 1.4072722167626, "grad_norm": 0.4446703109276029, "learning_rate": 1.8057773307466717e-05, "loss": 0.2688, "step": 1253 }, { "epoch": 1.4083953390425383, "grad_norm": 0.4485358287975487, "learning_rate": 1.8053125361598314e-05, "loss": 0.2947, "step": 1254 }, { "epoch": 1.4095184613224765, "grad_norm": 0.4452525960821962, "learning_rate": 1.804847246055326e-05, "loss": 0.2916, "step": 1255 }, { "epoch": 1.4106415836024147, "grad_norm": 0.44434990069489066, "learning_rate": 1.8043814607194528e-05, "loss": 0.2957, "step": 1256 }, { "epoch": 1.4117647058823528, "grad_norm": 0.45782505420166125, "learning_rate": 1.803915180438815e-05, "loss": 0.3081, "step": 1257 }, { "epoch": 1.4128878281622912, "grad_norm": 0.44812612511388755, "learning_rate": 1.80344840550032e-05, "loss": 0.2925, "step": 1258 }, { "epoch": 1.4140109504422294, "grad_norm": 0.4653220459251436, "learning_rate": 1.8029811361911796e-05, "loss": 0.284, "step": 1259 }, { "epoch": 1.4151340727221675, "grad_norm": 0.4294806313427792, "learning_rate": 1.8025133727989095e-05, "loss": 0.2615, "step": 1260 }, { "epoch": 1.416257195002106, "grad_norm": 0.4722245406377723, "learning_rate": 1.8020451156113302e-05, "loss": 0.2932, "step": 1261 }, { "epoch": 1.417380317282044, "grad_norm": 0.4407259836445125, "learning_rate": 1.801576364916565e-05, "loss": 0.2783, "step": 1262 }, { "epoch": 1.4185034395619822, "grad_norm": 0.4554354849309488, "learning_rate": 1.8011071210030417e-05, "loss": 0.2934, "step": 1263 }, { "epoch": 1.4196265618419206, "grad_norm": 0.4797437708219925, "learning_rate": 1.8006373841594905e-05, "loss": 0.2914, "step": 1264 }, { "epoch": 1.4207496841218588, "grad_norm": 0.4453109721657388, "learning_rate": 1.8001671546749466e-05, "loss": 0.273, "step": 1265 }, { "epoch": 1.421872806401797, "grad_norm": 0.455616539511479, "learning_rate": 1.7996964328387473e-05, "loss": 0.2787, "step": 1266 }, { "epoch": 1.4229959286817353, "grad_norm": 0.4660177496804102, "learning_rate": 1.7992252189405318e-05, "loss": 0.288, "step": 1267 }, { "epoch": 1.4241190509616735, "grad_norm": 0.4868499349359341, "learning_rate": 1.798753513270245e-05, "loss": 0.3011, "step": 1268 }, { "epoch": 1.4252421732416116, "grad_norm": 0.42379910255068853, "learning_rate": 1.798281316118131e-05, "loss": 0.2688, "step": 1269 }, { "epoch": 1.42636529552155, "grad_norm": 0.44137517215745187, "learning_rate": 1.797808627774738e-05, "loss": 0.2779, "step": 1270 }, { "epoch": 1.4274884178014882, "grad_norm": 0.4454397017855999, "learning_rate": 1.7973354485309178e-05, "loss": 0.2885, "step": 1271 }, { "epoch": 1.4286115400814263, "grad_norm": 0.41513228116253365, "learning_rate": 1.7968617786778214e-05, "loss": 0.258, "step": 1272 }, { "epoch": 1.4297346623613647, "grad_norm": 0.45824072542709143, "learning_rate": 1.7963876185069032e-05, "loss": 0.2982, "step": 1273 }, { "epoch": 1.4308577846413029, "grad_norm": 0.45152285012254323, "learning_rate": 1.7959129683099202e-05, "loss": 0.2926, "step": 1274 }, { "epoch": 1.431980906921241, "grad_norm": 0.4382850255611188, "learning_rate": 1.795437828378929e-05, "loss": 0.2748, "step": 1275 }, { "epoch": 1.4331040292011794, "grad_norm": 0.45046526339966464, "learning_rate": 1.7949621990062882e-05, "loss": 0.2834, "step": 1276 }, { "epoch": 1.4342271514811176, "grad_norm": 0.45318087991408496, "learning_rate": 1.7944860804846585e-05, "loss": 0.2921, "step": 1277 }, { "epoch": 1.4353502737610557, "grad_norm": 0.4389584828107915, "learning_rate": 1.7940094731070005e-05, "loss": 0.2711, "step": 1278 }, { "epoch": 1.4364733960409939, "grad_norm": 0.4551598275028435, "learning_rate": 1.793532377166576e-05, "loss": 0.2788, "step": 1279 }, { "epoch": 1.4375965183209323, "grad_norm": 0.46450024527966405, "learning_rate": 1.793054792956947e-05, "loss": 0.2937, "step": 1280 }, { "epoch": 1.4387196406008704, "grad_norm": 0.4631044556099893, "learning_rate": 1.7925767207719774e-05, "loss": 0.2961, "step": 1281 }, { "epoch": 1.4398427628808086, "grad_norm": 0.4292656254224395, "learning_rate": 1.792098160905829e-05, "loss": 0.2646, "step": 1282 }, { "epoch": 1.4409658851607468, "grad_norm": 0.47873898790679503, "learning_rate": 1.791619113652966e-05, "loss": 0.2891, "step": 1283 }, { "epoch": 1.4420890074406851, "grad_norm": 0.46176874055866274, "learning_rate": 1.7911395793081508e-05, "loss": 0.3053, "step": 1284 }, { "epoch": 1.4432121297206233, "grad_norm": 0.43168104217807524, "learning_rate": 1.7906595581664462e-05, "loss": 0.2757, "step": 1285 }, { "epoch": 1.4443352520005615, "grad_norm": 0.4526156341532352, "learning_rate": 1.790179050523215e-05, "loss": 0.2906, "step": 1286 }, { "epoch": 1.4454583742804998, "grad_norm": 0.4576287167509249, "learning_rate": 1.7896980566741183e-05, "loss": 0.3055, "step": 1287 }, { "epoch": 1.446581496560438, "grad_norm": 0.4291349675446234, "learning_rate": 1.7892165769151174e-05, "loss": 0.2797, "step": 1288 }, { "epoch": 1.4477046188403762, "grad_norm": 0.4330178308786235, "learning_rate": 1.7887346115424712e-05, "loss": 0.2879, "step": 1289 }, { "epoch": 1.4488277411203145, "grad_norm": 0.4409857238545325, "learning_rate": 1.7882521608527393e-05, "loss": 0.277, "step": 1290 }, { "epoch": 1.4499508634002527, "grad_norm": 0.46396778078552625, "learning_rate": 1.7877692251427783e-05, "loss": 0.292, "step": 1291 }, { "epoch": 1.4510739856801909, "grad_norm": 0.4583722733176541, "learning_rate": 1.7872858047097442e-05, "loss": 0.287, "step": 1292 }, { "epoch": 1.4521971079601292, "grad_norm": 0.4617901289674729, "learning_rate": 1.7868018998510907e-05, "loss": 0.2931, "step": 1293 }, { "epoch": 1.4533202302400674, "grad_norm": 0.4688919751696024, "learning_rate": 1.7863175108645698e-05, "loss": 0.2795, "step": 1294 }, { "epoch": 1.4544433525200056, "grad_norm": 0.473513507634666, "learning_rate": 1.7858326380482313e-05, "loss": 0.3136, "step": 1295 }, { "epoch": 1.455566474799944, "grad_norm": 0.46308815674849557, "learning_rate": 1.7853472817004235e-05, "loss": 0.2871, "step": 1296 }, { "epoch": 1.456689597079882, "grad_norm": 0.5179959894108842, "learning_rate": 1.7848614421197903e-05, "loss": 0.3007, "step": 1297 }, { "epoch": 1.4578127193598203, "grad_norm": 0.4531340269397126, "learning_rate": 1.784375119605275e-05, "loss": 0.2959, "step": 1298 }, { "epoch": 1.4589358416397586, "grad_norm": 0.4582919369256917, "learning_rate": 1.783888314456117e-05, "loss": 0.2894, "step": 1299 }, { "epoch": 1.4600589639196968, "grad_norm": 0.4469673584516157, "learning_rate": 1.7834010269718526e-05, "loss": 0.28, "step": 1300 }, { "epoch": 1.461182086199635, "grad_norm": 0.46123285741882053, "learning_rate": 1.7829132574523155e-05, "loss": 0.2832, "step": 1301 }, { "epoch": 1.4623052084795733, "grad_norm": 0.45607801195534514, "learning_rate": 1.7824250061976355e-05, "loss": 0.287, "step": 1302 }, { "epoch": 1.4634283307595115, "grad_norm": 0.49530717522953166, "learning_rate": 1.7819362735082392e-05, "loss": 0.2993, "step": 1303 }, { "epoch": 1.4645514530394497, "grad_norm": 0.4809186057767541, "learning_rate": 1.7814470596848486e-05, "loss": 0.3111, "step": 1304 }, { "epoch": 1.4656745753193878, "grad_norm": 0.4609939280068489, "learning_rate": 1.780957365028483e-05, "loss": 0.3016, "step": 1305 }, { "epoch": 1.466797697599326, "grad_norm": 0.5159740391114742, "learning_rate": 1.7804671898404567e-05, "loss": 0.303, "step": 1306 }, { "epoch": 1.4679208198792644, "grad_norm": 0.45316092784328266, "learning_rate": 1.7799765344223798e-05, "loss": 0.2806, "step": 1307 }, { "epoch": 1.4690439421592025, "grad_norm": 0.4929344112281057, "learning_rate": 1.7794853990761576e-05, "loss": 0.3244, "step": 1308 }, { "epoch": 1.4701670644391407, "grad_norm": 0.4240215914027117, "learning_rate": 1.778993784103992e-05, "loss": 0.2649, "step": 1309 }, { "epoch": 1.471290186719079, "grad_norm": 0.45336019964413365, "learning_rate": 1.7785016898083786e-05, "loss": 0.281, "step": 1310 }, { "epoch": 1.4724133089990172, "grad_norm": 0.4709195975090023, "learning_rate": 1.778009116492108e-05, "loss": 0.3032, "step": 1311 }, { "epoch": 1.4735364312789554, "grad_norm": 0.4378426156461344, "learning_rate": 1.7775160644582667e-05, "loss": 0.2773, "step": 1312 }, { "epoch": 1.4746595535588938, "grad_norm": 0.4413617070461194, "learning_rate": 1.777022534010235e-05, "loss": 0.295, "step": 1313 }, { "epoch": 1.475782675838832, "grad_norm": 0.45700233268388313, "learning_rate": 1.776528525451687e-05, "loss": 0.2909, "step": 1314 }, { "epoch": 1.47690579811877, "grad_norm": 0.45427285254943417, "learning_rate": 1.776034039086592e-05, "loss": 0.3058, "step": 1315 }, { "epoch": 1.4780289203987085, "grad_norm": 0.45339655669768186, "learning_rate": 1.775539075219213e-05, "loss": 0.2792, "step": 1316 }, { "epoch": 1.4791520426786466, "grad_norm": 0.47160522784360365, "learning_rate": 1.7750436341541066e-05, "loss": 0.2998, "step": 1317 }, { "epoch": 1.4802751649585848, "grad_norm": 0.45954229553978265, "learning_rate": 1.774547716196123e-05, "loss": 0.2871, "step": 1318 }, { "epoch": 1.4813982872385232, "grad_norm": 0.46825968988664846, "learning_rate": 1.7740513216504064e-05, "loss": 0.3125, "step": 1319 }, { "epoch": 1.4825214095184613, "grad_norm": 0.4514724212464635, "learning_rate": 1.7735544508223933e-05, "loss": 0.2851, "step": 1320 }, { "epoch": 1.4836445317983995, "grad_norm": 0.45647208589810134, "learning_rate": 1.773057104017814e-05, "loss": 0.2871, "step": 1321 }, { "epoch": 1.4847676540783379, "grad_norm": 0.45834495190642427, "learning_rate": 1.772559281542692e-05, "loss": 0.2929, "step": 1322 }, { "epoch": 1.485890776358276, "grad_norm": 0.4614837916910236, "learning_rate": 1.7720609837033417e-05, "loss": 0.2978, "step": 1323 }, { "epoch": 1.4870138986382142, "grad_norm": 0.44184030726899937, "learning_rate": 1.7715622108063725e-05, "loss": 0.2663, "step": 1324 }, { "epoch": 1.4881370209181526, "grad_norm": 0.44711212739545164, "learning_rate": 1.771062963158684e-05, "loss": 0.2968, "step": 1325 }, { "epoch": 1.4892601431980907, "grad_norm": 0.44221975823221676, "learning_rate": 1.770563241067469e-05, "loss": 0.2952, "step": 1326 }, { "epoch": 1.490383265478029, "grad_norm": 0.5126809383713152, "learning_rate": 1.7700630448402125e-05, "loss": 0.3161, "step": 1327 }, { "epoch": 1.491506387757967, "grad_norm": 0.4410073711767202, "learning_rate": 1.76956237478469e-05, "loss": 0.2738, "step": 1328 }, { "epoch": 1.4926295100379054, "grad_norm": 0.45627309597281607, "learning_rate": 1.7690612312089702e-05, "loss": 0.2853, "step": 1329 }, { "epoch": 1.4937526323178436, "grad_norm": 0.5028458766464338, "learning_rate": 1.768559614421411e-05, "loss": 0.3025, "step": 1330 }, { "epoch": 1.4948757545977818, "grad_norm": 0.45766426852181796, "learning_rate": 1.768057524730664e-05, "loss": 0.293, "step": 1331 }, { "epoch": 1.49599887687772, "grad_norm": 0.45597657632110705, "learning_rate": 1.7675549624456695e-05, "loss": 0.3038, "step": 1332 }, { "epoch": 1.4971219991576583, "grad_norm": 0.4499374531210488, "learning_rate": 1.7670519278756603e-05, "loss": 0.2815, "step": 1333 }, { "epoch": 1.4982451214375965, "grad_norm": 0.4707335152594058, "learning_rate": 1.7665484213301587e-05, "loss": 0.303, "step": 1334 }, { "epoch": 1.4993682437175346, "grad_norm": 0.4781982752780198, "learning_rate": 1.766044443118978e-05, "loss": 0.2965, "step": 1335 }, { "epoch": 1.500491365997473, "grad_norm": 0.47356503106211406, "learning_rate": 1.7655399935522216e-05, "loss": 0.2957, "step": 1336 }, { "epoch": 1.5016144882774112, "grad_norm": 0.46119467696838445, "learning_rate": 1.765035072940283e-05, "loss": 0.2902, "step": 1337 }, { "epoch": 1.5027376105573493, "grad_norm": 0.44367007330882113, "learning_rate": 1.764529681593845e-05, "loss": 0.2745, "step": 1338 }, { "epoch": 1.5038607328372877, "grad_norm": 0.46264595602282504, "learning_rate": 1.7640238198238803e-05, "loss": 0.2712, "step": 1339 }, { "epoch": 1.5049838551172259, "grad_norm": 0.44037392967410266, "learning_rate": 1.763517487941652e-05, "loss": 0.2778, "step": 1340 }, { "epoch": 1.506106977397164, "grad_norm": 0.4361452264395535, "learning_rate": 1.763010686258711e-05, "loss": 0.2773, "step": 1341 }, { "epoch": 1.5072300996771024, "grad_norm": 0.5842833735748904, "learning_rate": 1.7625034150868983e-05, "loss": 0.3086, "step": 1342 }, { "epoch": 1.5083532219570406, "grad_norm": 0.43427680840538785, "learning_rate": 1.7619956747383435e-05, "loss": 0.2765, "step": 1343 }, { "epoch": 1.5094763442369787, "grad_norm": 0.4179475986040606, "learning_rate": 1.7614874655254644e-05, "loss": 0.2706, "step": 1344 }, { "epoch": 1.5105994665169171, "grad_norm": 0.44696851307851265, "learning_rate": 1.7609787877609678e-05, "loss": 0.2932, "step": 1345 }, { "epoch": 1.5117225887968553, "grad_norm": 0.43430140397443595, "learning_rate": 1.760469641757849e-05, "loss": 0.2748, "step": 1346 }, { "epoch": 1.5128457110767934, "grad_norm": 0.4164245825608252, "learning_rate": 1.7599600278293915e-05, "loss": 0.2765, "step": 1347 }, { "epoch": 1.5139688333567318, "grad_norm": 0.4411217535356976, "learning_rate": 1.7594499462891654e-05, "loss": 0.2671, "step": 1348 }, { "epoch": 1.51509195563667, "grad_norm": 0.47168354723847505, "learning_rate": 1.7589393974510304e-05, "loss": 0.2663, "step": 1349 }, { "epoch": 1.5162150779166081, "grad_norm": 0.45298191218773637, "learning_rate": 1.758428381629132e-05, "loss": 0.2977, "step": 1350 }, { "epoch": 1.5173382001965465, "grad_norm": 0.48015367678036425, "learning_rate": 1.7579168991379042e-05, "loss": 0.3073, "step": 1351 }, { "epoch": 1.5184613224764845, "grad_norm": 0.450921374315479, "learning_rate": 1.757404950292068e-05, "loss": 0.2981, "step": 1352 }, { "epoch": 1.5195844447564228, "grad_norm": 0.4511950623917902, "learning_rate": 1.7568925354066313e-05, "loss": 0.2715, "step": 1353 }, { "epoch": 1.5207075670363612, "grad_norm": 0.4645619454896311, "learning_rate": 1.756379654796888e-05, "loss": 0.2906, "step": 1354 }, { "epoch": 1.5218306893162992, "grad_norm": 0.4597487903047916, "learning_rate": 1.7558663087784195e-05, "loss": 0.2923, "step": 1355 }, { "epoch": 1.5229538115962375, "grad_norm": 0.46125985328422053, "learning_rate": 1.7553524976670936e-05, "loss": 0.3157, "step": 1356 }, { "epoch": 1.524076933876176, "grad_norm": 0.4627880390771236, "learning_rate": 1.7548382217790633e-05, "loss": 0.2844, "step": 1357 }, { "epoch": 1.5252000561561139, "grad_norm": 0.48404911382660587, "learning_rate": 1.7543234814307685e-05, "loss": 0.2848, "step": 1358 }, { "epoch": 1.5263231784360523, "grad_norm": 0.4732923841586989, "learning_rate": 1.753808276938935e-05, "loss": 0.3009, "step": 1359 }, { "epoch": 1.5274463007159904, "grad_norm": 0.44748155272300266, "learning_rate": 1.753292608620573e-05, "loss": 0.2806, "step": 1360 }, { "epoch": 1.5285694229959286, "grad_norm": 0.4671484752305363, "learning_rate": 1.7527764767929794e-05, "loss": 0.2746, "step": 1361 }, { "epoch": 1.529692545275867, "grad_norm": 0.45312930393716716, "learning_rate": 1.7522598817737356e-05, "loss": 0.2817, "step": 1362 }, { "epoch": 1.5308156675558051, "grad_norm": 0.48439112821445, "learning_rate": 1.7517428238807085e-05, "loss": 0.2988, "step": 1363 }, { "epoch": 1.5319387898357433, "grad_norm": 0.4258685474786322, "learning_rate": 1.751225303432049e-05, "loss": 0.2713, "step": 1364 }, { "epoch": 1.5330619121156817, "grad_norm": 0.4404628717066417, "learning_rate": 1.750707320746194e-05, "loss": 0.2817, "step": 1365 }, { "epoch": 1.5341850343956198, "grad_norm": 0.4477719006580123, "learning_rate": 1.750188876141863e-05, "loss": 0.2856, "step": 1366 }, { "epoch": 1.535308156675558, "grad_norm": 0.4482867400976784, "learning_rate": 1.7496699699380612e-05, "loss": 0.2715, "step": 1367 }, { "epoch": 1.5364312789554964, "grad_norm": 0.43210614417809273, "learning_rate": 1.749150602454077e-05, "loss": 0.2858, "step": 1368 }, { "epoch": 1.5375544012354345, "grad_norm": 0.4851645221119642, "learning_rate": 1.7486307740094832e-05, "loss": 0.3126, "step": 1369 }, { "epoch": 1.5386775235153727, "grad_norm": 0.43465394038491617, "learning_rate": 1.7481104849241357e-05, "loss": 0.2834, "step": 1370 }, { "epoch": 1.539800645795311, "grad_norm": 0.4343631643379705, "learning_rate": 1.7475897355181747e-05, "loss": 0.29, "step": 1371 }, { "epoch": 1.5409237680752492, "grad_norm": 0.43113460770410267, "learning_rate": 1.747068526112022e-05, "loss": 0.2712, "step": 1372 }, { "epoch": 1.5420468903551874, "grad_norm": 0.47375551490134643, "learning_rate": 1.7465468570263844e-05, "loss": 0.2852, "step": 1373 }, { "epoch": 1.5431700126351258, "grad_norm": 0.4567963890276265, "learning_rate": 1.7460247285822504e-05, "loss": 0.2882, "step": 1374 }, { "epoch": 1.544293134915064, "grad_norm": 0.4689696386236284, "learning_rate": 1.7455021411008906e-05, "loss": 0.294, "step": 1375 }, { "epoch": 1.545416257195002, "grad_norm": 0.4468934735798268, "learning_rate": 1.7449790949038604e-05, "loss": 0.2934, "step": 1376 }, { "epoch": 1.5465393794749405, "grad_norm": 0.4590404219399577, "learning_rate": 1.7444555903129943e-05, "loss": 0.2996, "step": 1377 }, { "epoch": 1.5476625017548784, "grad_norm": 0.4320176095607792, "learning_rate": 1.7439316276504112e-05, "loss": 0.2958, "step": 1378 }, { "epoch": 1.5487856240348168, "grad_norm": 0.4724597686393324, "learning_rate": 1.7434072072385115e-05, "loss": 0.299, "step": 1379 }, { "epoch": 1.5499087463147552, "grad_norm": 0.4551694704885351, "learning_rate": 1.7428823293999757e-05, "loss": 0.2898, "step": 1380 }, { "epoch": 1.551031868594693, "grad_norm": 0.4655624233086197, "learning_rate": 1.7423569944577677e-05, "loss": 0.3064, "step": 1381 }, { "epoch": 1.5521549908746315, "grad_norm": 0.4495621812588542, "learning_rate": 1.7418312027351322e-05, "loss": 0.277, "step": 1382 }, { "epoch": 1.5532781131545697, "grad_norm": 0.46734262431819734, "learning_rate": 1.741304954555594e-05, "loss": 0.2723, "step": 1383 }, { "epoch": 1.5544012354345078, "grad_norm": 0.4530212621436398, "learning_rate": 1.7407782502429594e-05, "loss": 0.2966, "step": 1384 }, { "epoch": 1.5555243577144462, "grad_norm": 0.48906035382170715, "learning_rate": 1.7402510901213158e-05, "loss": 0.3166, "step": 1385 }, { "epoch": 1.5566474799943844, "grad_norm": 0.4588087448330385, "learning_rate": 1.73972347451503e-05, "loss": 0.2911, "step": 1386 }, { "epoch": 1.5577706022743225, "grad_norm": 0.45890894077015215, "learning_rate": 1.7391954037487503e-05, "loss": 0.2792, "step": 1387 }, { "epoch": 1.558893724554261, "grad_norm": 0.43100684891614244, "learning_rate": 1.738666878147404e-05, "loss": 0.2705, "step": 1388 }, { "epoch": 1.560016846834199, "grad_norm": 0.4533496037726458, "learning_rate": 1.738137898036199e-05, "loss": 0.2984, "step": 1389 }, { "epoch": 1.5611399691141372, "grad_norm": 0.44398777502713194, "learning_rate": 1.7376084637406222e-05, "loss": 0.284, "step": 1390 }, { "epoch": 1.5622630913940756, "grad_norm": 0.4398899733954843, "learning_rate": 1.737078575586441e-05, "loss": 0.2851, "step": 1391 }, { "epoch": 1.5633862136740138, "grad_norm": 0.4743848381771007, "learning_rate": 1.736548233899701e-05, "loss": 0.2954, "step": 1392 }, { "epoch": 1.564509335953952, "grad_norm": 0.4645458661781724, "learning_rate": 1.7360174390067274e-05, "loss": 0.2912, "step": 1393 }, { "epoch": 1.5656324582338903, "grad_norm": 0.44409894402600525, "learning_rate": 1.735486191234124e-05, "loss": 0.2816, "step": 1394 }, { "epoch": 1.5667555805138285, "grad_norm": 0.43988326731406074, "learning_rate": 1.7349544909087737e-05, "loss": 0.2777, "step": 1395 }, { "epoch": 1.5678787027937666, "grad_norm": 0.461134202797876, "learning_rate": 1.734422338357837e-05, "loss": 0.2963, "step": 1396 }, { "epoch": 1.569001825073705, "grad_norm": 0.43219608525830144, "learning_rate": 1.7338897339087536e-05, "loss": 0.2743, "step": 1397 }, { "epoch": 1.5701249473536432, "grad_norm": 0.4522334985486791, "learning_rate": 1.733356677889241e-05, "loss": 0.293, "step": 1398 }, { "epoch": 1.5712480696335813, "grad_norm": 0.4697839824886229, "learning_rate": 1.732823170627294e-05, "loss": 0.3225, "step": 1399 }, { "epoch": 1.5723711919135197, "grad_norm": 0.4379752450384441, "learning_rate": 1.7322892124511862e-05, "loss": 0.2934, "step": 1400 }, { "epoch": 1.5734943141934579, "grad_norm": 0.4403768513512486, "learning_rate": 1.731754803689467e-05, "loss": 0.2808, "step": 1401 }, { "epoch": 1.574617436473396, "grad_norm": 0.4526960513869645, "learning_rate": 1.731219944670965e-05, "loss": 0.3075, "step": 1402 }, { "epoch": 1.5757405587533344, "grad_norm": 0.4391813185175165, "learning_rate": 1.730684635724784e-05, "loss": 0.2818, "step": 1403 }, { "epoch": 1.5768636810332723, "grad_norm": 0.45969828645408684, "learning_rate": 1.7301488771803056e-05, "loss": 0.3036, "step": 1404 }, { "epoch": 1.5779868033132107, "grad_norm": 0.43457708703765435, "learning_rate": 1.7296126693671886e-05, "loss": 0.2746, "step": 1405 }, { "epoch": 1.579109925593149, "grad_norm": 0.4331075300392934, "learning_rate": 1.7290760126153666e-05, "loss": 0.2821, "step": 1406 }, { "epoch": 1.580233047873087, "grad_norm": 0.4208492939766656, "learning_rate": 1.7285389072550515e-05, "loss": 0.2795, "step": 1407 }, { "epoch": 1.5813561701530254, "grad_norm": 0.4634956044262763, "learning_rate": 1.728001353616729e-05, "loss": 0.3025, "step": 1408 }, { "epoch": 1.5824792924329636, "grad_norm": 0.46128626849790916, "learning_rate": 1.727463352031163e-05, "loss": 0.302, "step": 1409 }, { "epoch": 1.5836024147129018, "grad_norm": 0.4275363771455048, "learning_rate": 1.7269249028293907e-05, "loss": 0.2809, "step": 1410 }, { "epoch": 1.5847255369928401, "grad_norm": 0.4508186393663234, "learning_rate": 1.7263860063427263e-05, "loss": 0.2928, "step": 1411 }, { "epoch": 1.5858486592727783, "grad_norm": 0.4641518740610698, "learning_rate": 1.7258466629027586e-05, "loss": 0.2966, "step": 1412 }, { "epoch": 1.5869717815527165, "grad_norm": 0.4350653148056457, "learning_rate": 1.7253068728413517e-05, "loss": 0.2871, "step": 1413 }, { "epoch": 1.5880949038326548, "grad_norm": 0.4574921332811425, "learning_rate": 1.7247666364906443e-05, "loss": 0.305, "step": 1414 }, { "epoch": 1.589218026112593, "grad_norm": 0.4423661730830507, "learning_rate": 1.7242259541830497e-05, "loss": 0.2835, "step": 1415 }, { "epoch": 1.5903411483925312, "grad_norm": 0.4415125572569024, "learning_rate": 1.723684826251256e-05, "loss": 0.287, "step": 1416 }, { "epoch": 1.5914642706724695, "grad_norm": 0.45878372415584956, "learning_rate": 1.7231432530282246e-05, "loss": 0.291, "step": 1417 }, { "epoch": 1.5925873929524077, "grad_norm": 0.4536230530206705, "learning_rate": 1.722601234847192e-05, "loss": 0.2676, "step": 1418 }, { "epoch": 1.5937105152323459, "grad_norm": 0.42510000179340557, "learning_rate": 1.7220587720416677e-05, "loss": 0.2625, "step": 1419 }, { "epoch": 1.5948336375122842, "grad_norm": 0.4617059875851874, "learning_rate": 1.721515864945435e-05, "loss": 0.3081, "step": 1420 }, { "epoch": 1.5959567597922224, "grad_norm": 0.4325597054894661, "learning_rate": 1.7209725138925506e-05, "loss": 0.2713, "step": 1421 }, { "epoch": 1.5970798820721606, "grad_norm": 0.43609135158582557, "learning_rate": 1.7204287192173444e-05, "loss": 0.2722, "step": 1422 }, { "epoch": 1.598203004352099, "grad_norm": 0.43997208529177845, "learning_rate": 1.7198844812544194e-05, "loss": 0.2737, "step": 1423 }, { "epoch": 1.599326126632037, "grad_norm": 0.45811922865343463, "learning_rate": 1.7193398003386514e-05, "loss": 0.3069, "step": 1424 }, { "epoch": 1.6004492489119753, "grad_norm": 0.441765765575073, "learning_rate": 1.718794676805188e-05, "loss": 0.2878, "step": 1425 }, { "epoch": 1.6015723711919136, "grad_norm": 0.44938030928999945, "learning_rate": 1.71824911098945e-05, "loss": 0.2803, "step": 1426 }, { "epoch": 1.6026954934718516, "grad_norm": 0.4711803532185502, "learning_rate": 1.7177031032271298e-05, "loss": 0.2898, "step": 1427 }, { "epoch": 1.60381861575179, "grad_norm": 0.48742752034450476, "learning_rate": 1.7171566538541925e-05, "loss": 0.2931, "step": 1428 }, { "epoch": 1.6049417380317283, "grad_norm": 0.455468095932038, "learning_rate": 1.7166097632068745e-05, "loss": 0.3055, "step": 1429 }, { "epoch": 1.6060648603116663, "grad_norm": 0.4372160767962161, "learning_rate": 1.7160624316216825e-05, "loss": 0.2829, "step": 1430 }, { "epoch": 1.6071879825916047, "grad_norm": 0.47650809154388873, "learning_rate": 1.715514659435397e-05, "loss": 0.283, "step": 1431 }, { "epoch": 1.608311104871543, "grad_norm": 0.430482793568945, "learning_rate": 1.7149664469850674e-05, "loss": 0.277, "step": 1432 }, { "epoch": 1.609434227151481, "grad_norm": 0.4516995855142114, "learning_rate": 1.714417794608015e-05, "loss": 0.2934, "step": 1433 }, { "epoch": 1.6105573494314194, "grad_norm": 0.4110806797442354, "learning_rate": 1.713868702641832e-05, "loss": 0.2593, "step": 1434 }, { "epoch": 1.6116804717113575, "grad_norm": 0.4616915231229448, "learning_rate": 1.7133191714243805e-05, "loss": 0.284, "step": 1435 }, { "epoch": 1.6128035939912957, "grad_norm": 0.41123495094639895, "learning_rate": 1.712769201293793e-05, "loss": 0.258, "step": 1436 }, { "epoch": 1.613926716271234, "grad_norm": 0.4770233615190129, "learning_rate": 1.7122187925884723e-05, "loss": 0.3075, "step": 1437 }, { "epoch": 1.6150498385511722, "grad_norm": 0.45932911711426877, "learning_rate": 1.7116679456470908e-05, "loss": 0.3073, "step": 1438 }, { "epoch": 1.6161729608311104, "grad_norm": 0.43310100468943585, "learning_rate": 1.711116660808591e-05, "loss": 0.2803, "step": 1439 }, { "epoch": 1.6172960831110488, "grad_norm": 0.4053611622253437, "learning_rate": 1.710564938412184e-05, "loss": 0.2552, "step": 1440 }, { "epoch": 1.618419205390987, "grad_norm": 0.4776776817414243, "learning_rate": 1.710012778797351e-05, "loss": 0.3047, "step": 1441 }, { "epoch": 1.619542327670925, "grad_norm": 0.46553831799548406, "learning_rate": 1.7094601823038425e-05, "loss": 0.2946, "step": 1442 }, { "epoch": 1.6206654499508635, "grad_norm": 0.466347563259381, "learning_rate": 1.7089071492716758e-05, "loss": 0.2886, "step": 1443 }, { "epoch": 1.6217885722308016, "grad_norm": 0.44584944747972316, "learning_rate": 1.7083536800411392e-05, "loss": 0.2731, "step": 1444 }, { "epoch": 1.6229116945107398, "grad_norm": 0.49761550260791193, "learning_rate": 1.7077997749527884e-05, "loss": 0.3217, "step": 1445 }, { "epoch": 1.6240348167906782, "grad_norm": 0.42865534739277844, "learning_rate": 1.707245434347447e-05, "loss": 0.2829, "step": 1446 }, { "epoch": 1.6251579390706163, "grad_norm": 0.4159767822652411, "learning_rate": 1.706690658566207e-05, "loss": 0.2645, "step": 1447 }, { "epoch": 1.6262810613505545, "grad_norm": 0.4766099212680193, "learning_rate": 1.7061354479504277e-05, "loss": 0.2893, "step": 1448 }, { "epoch": 1.6274041836304929, "grad_norm": 0.4532911553458812, "learning_rate": 1.705579802841737e-05, "loss": 0.2789, "step": 1449 }, { "epoch": 1.628527305910431, "grad_norm": 0.45741701789824535, "learning_rate": 1.7050237235820287e-05, "loss": 0.289, "step": 1450 }, { "epoch": 1.6296504281903692, "grad_norm": 0.4846189139759324, "learning_rate": 1.704467210513465e-05, "loss": 0.3282, "step": 1451 }, { "epoch": 1.6307735504703076, "grad_norm": 0.4261188062643428, "learning_rate": 1.7039102639784747e-05, "loss": 0.2712, "step": 1452 }, { "epoch": 1.6318966727502455, "grad_norm": 0.4409212737827275, "learning_rate": 1.7033528843197523e-05, "loss": 0.2847, "step": 1453 }, { "epoch": 1.633019795030184, "grad_norm": 0.4458915183695067, "learning_rate": 1.7027950718802605e-05, "loss": 0.2778, "step": 1454 }, { "epoch": 1.6341429173101223, "grad_norm": 0.47098011281228286, "learning_rate": 1.7022368270032268e-05, "loss": 0.2913, "step": 1455 }, { "epoch": 1.6352660395900602, "grad_norm": 0.4436299643362799, "learning_rate": 1.7016781500321458e-05, "loss": 0.2686, "step": 1456 }, { "epoch": 1.6363891618699986, "grad_norm": 0.508079183867471, "learning_rate": 1.7011190413107774e-05, "loss": 0.3568, "step": 1457 }, { "epoch": 1.6375122841499368, "grad_norm": 0.4447742608378096, "learning_rate": 1.7005595011831473e-05, "loss": 0.3054, "step": 1458 }, { "epoch": 1.638635406429875, "grad_norm": 0.45902598871228406, "learning_rate": 1.699999529993547e-05, "loss": 0.2874, "step": 1459 }, { "epoch": 1.6397585287098133, "grad_norm": 0.43790349463956285, "learning_rate": 1.6994391280865327e-05, "loss": 0.2753, "step": 1460 }, { "epoch": 1.6408816509897515, "grad_norm": 0.43987143870781154, "learning_rate": 1.698878295806926e-05, "loss": 0.2812, "step": 1461 }, { "epoch": 1.6420047732696896, "grad_norm": 0.45636973748880527, "learning_rate": 1.698317033499813e-05, "loss": 0.294, "step": 1462 }, { "epoch": 1.643127895549628, "grad_norm": 0.4473056852255817, "learning_rate": 1.6977553415105446e-05, "loss": 0.2922, "step": 1463 }, { "epoch": 1.6442510178295662, "grad_norm": 0.44711605854202396, "learning_rate": 1.6971932201847362e-05, "loss": 0.3037, "step": 1464 }, { "epoch": 1.6453741401095043, "grad_norm": 0.47322347350994115, "learning_rate": 1.6966306698682672e-05, "loss": 0.3091, "step": 1465 }, { "epoch": 1.6464972623894427, "grad_norm": 0.47589867797246205, "learning_rate": 1.6960676909072808e-05, "loss": 0.3044, "step": 1466 }, { "epoch": 1.6476203846693809, "grad_norm": 0.45462083845828744, "learning_rate": 1.6955042836481842e-05, "loss": 0.2901, "step": 1467 }, { "epoch": 1.648743506949319, "grad_norm": 0.43342182003648877, "learning_rate": 1.6949404484376484e-05, "loss": 0.2826, "step": 1468 }, { "epoch": 1.6498666292292574, "grad_norm": 0.4420101471539028, "learning_rate": 1.6943761856226072e-05, "loss": 0.2823, "step": 1469 }, { "epoch": 1.6509897515091956, "grad_norm": 0.45834897544206743, "learning_rate": 1.693811495550258e-05, "loss": 0.2998, "step": 1470 }, { "epoch": 1.6521128737891337, "grad_norm": 0.4376763092414877, "learning_rate": 1.69324637856806e-05, "loss": 0.2906, "step": 1471 }, { "epoch": 1.6532359960690721, "grad_norm": 0.4546259417086349, "learning_rate": 1.6926808350237367e-05, "loss": 0.293, "step": 1472 }, { "epoch": 1.6543591183490103, "grad_norm": 0.4372010366201924, "learning_rate": 1.692114865265273e-05, "loss": 0.2712, "step": 1473 }, { "epoch": 1.6554822406289484, "grad_norm": 0.4778527561849003, "learning_rate": 1.691548469640916e-05, "loss": 0.3141, "step": 1474 }, { "epoch": 1.6566053629088868, "grad_norm": 0.4373422181074543, "learning_rate": 1.690981648499176e-05, "loss": 0.2776, "step": 1475 }, { "epoch": 1.657728485188825, "grad_norm": 0.45910849316443425, "learning_rate": 1.6904144021888236e-05, "loss": 0.2914, "step": 1476 }, { "epoch": 1.6588516074687631, "grad_norm": 0.4377368116005779, "learning_rate": 1.6898467310588917e-05, "loss": 0.2832, "step": 1477 }, { "epoch": 1.6599747297487015, "grad_norm": 0.4343991004179873, "learning_rate": 1.689278635458675e-05, "loss": 0.2697, "step": 1478 }, { "epoch": 1.6610978520286395, "grad_norm": 0.4584326718106985, "learning_rate": 1.6887101157377284e-05, "loss": 0.3061, "step": 1479 }, { "epoch": 1.6622209743085778, "grad_norm": 0.48367472603690564, "learning_rate": 1.6881411722458688e-05, "loss": 0.2898, "step": 1480 }, { "epoch": 1.6633440965885162, "grad_norm": 0.4259604088571098, "learning_rate": 1.6875718053331736e-05, "loss": 0.2642, "step": 1481 }, { "epoch": 1.6644672188684542, "grad_norm": 0.4427453550152015, "learning_rate": 1.6870020153499796e-05, "loss": 0.2742, "step": 1482 }, { "epoch": 1.6655903411483926, "grad_norm": 0.4640041414969516, "learning_rate": 1.686431802646886e-05, "loss": 0.2913, "step": 1483 }, { "epoch": 1.6667134634283307, "grad_norm": 0.46659553606141885, "learning_rate": 1.68586116757475e-05, "loss": 0.2979, "step": 1484 }, { "epoch": 1.6678365857082689, "grad_norm": 0.44118444234300963, "learning_rate": 1.6852901104846902e-05, "loss": 0.2798, "step": 1485 }, { "epoch": 1.6689597079882073, "grad_norm": 0.4232934978922537, "learning_rate": 1.6847186317280844e-05, "loss": 0.2664, "step": 1486 }, { "epoch": 1.6700828302681454, "grad_norm": 0.4549678955070991, "learning_rate": 1.684146731656569e-05, "loss": 0.2691, "step": 1487 }, { "epoch": 1.6712059525480836, "grad_norm": 0.4462793748347985, "learning_rate": 1.683574410622041e-05, "loss": 0.2756, "step": 1488 }, { "epoch": 1.672329074828022, "grad_norm": 0.46943909006116125, "learning_rate": 1.683001668976656e-05, "loss": 0.3007, "step": 1489 }, { "epoch": 1.6734521971079601, "grad_norm": 0.47419859082322885, "learning_rate": 1.6824285070728278e-05, "loss": 0.2873, "step": 1490 }, { "epoch": 1.6745753193878983, "grad_norm": 0.47147067796892533, "learning_rate": 1.6818549252632295e-05, "loss": 0.2839, "step": 1491 }, { "epoch": 1.6756984416678367, "grad_norm": 0.456348027524806, "learning_rate": 1.6812809239007924e-05, "loss": 0.3, "step": 1492 }, { "epoch": 1.6768215639477748, "grad_norm": 0.40777119029954567, "learning_rate": 1.6807065033387052e-05, "loss": 0.2596, "step": 1493 }, { "epoch": 1.677944686227713, "grad_norm": 0.40000140473687745, "learning_rate": 1.6801316639304163e-05, "loss": 0.2658, "step": 1494 }, { "epoch": 1.6790678085076514, "grad_norm": 0.45210384075210974, "learning_rate": 1.6795564060296295e-05, "loss": 0.2966, "step": 1495 }, { "epoch": 1.6801909307875895, "grad_norm": 0.40822664899597394, "learning_rate": 1.678980729990308e-05, "loss": 0.2535, "step": 1496 }, { "epoch": 1.6813140530675277, "grad_norm": 0.4783878749550028, "learning_rate": 1.6784046361666714e-05, "loss": 0.3179, "step": 1497 }, { "epoch": 1.682437175347466, "grad_norm": 0.4205086583335779, "learning_rate": 1.6778281249131973e-05, "loss": 0.2624, "step": 1498 }, { "epoch": 1.6835602976274042, "grad_norm": 0.4448670745068746, "learning_rate": 1.677251196584618e-05, "loss": 0.2726, "step": 1499 }, { "epoch": 1.6846834199073424, "grad_norm": 0.4612087079191544, "learning_rate": 1.676673851535925e-05, "loss": 0.2863, "step": 1500 }, { "epoch": 1.6858065421872808, "grad_norm": 0.44897350104308953, "learning_rate": 1.6760960901223647e-05, "loss": 0.2777, "step": 1501 }, { "epoch": 1.6869296644672187, "grad_norm": 0.4650716938100982, "learning_rate": 1.6755179126994397e-05, "loss": 0.3114, "step": 1502 }, { "epoch": 1.688052786747157, "grad_norm": 0.4408040310278629, "learning_rate": 1.6749393196229097e-05, "loss": 0.2802, "step": 1503 }, { "epoch": 1.6891759090270955, "grad_norm": 0.4323634729178194, "learning_rate": 1.6743603112487888e-05, "loss": 0.272, "step": 1504 }, { "epoch": 1.6902990313070334, "grad_norm": 0.4532517237446688, "learning_rate": 1.6737808879333477e-05, "loss": 0.2994, "step": 1505 }, { "epoch": 1.6914221535869718, "grad_norm": 0.41707219861308187, "learning_rate": 1.6732010500331112e-05, "loss": 0.2786, "step": 1506 }, { "epoch": 1.6925452758669102, "grad_norm": 0.4790732992448698, "learning_rate": 1.6726207979048604e-05, "loss": 0.3217, "step": 1507 }, { "epoch": 1.693668398146848, "grad_norm": 0.4426207341478685, "learning_rate": 1.672040131905631e-05, "loss": 0.2957, "step": 1508 }, { "epoch": 1.6947915204267865, "grad_norm": 0.4491550141407061, "learning_rate": 1.6714590523927127e-05, "loss": 0.2963, "step": 1509 }, { "epoch": 1.6959146427067247, "grad_norm": 0.44425919435030004, "learning_rate": 1.6708775597236507e-05, "loss": 0.2888, "step": 1510 }, { "epoch": 1.6970377649866628, "grad_norm": 0.4428624792544438, "learning_rate": 1.6702956542562433e-05, "loss": 0.2919, "step": 1511 }, { "epoch": 1.6981608872666012, "grad_norm": 0.4645304954497035, "learning_rate": 1.669713336348544e-05, "loss": 0.3044, "step": 1512 }, { "epoch": 1.6992840095465394, "grad_norm": 0.442526153745366, "learning_rate": 1.6691306063588583e-05, "loss": 0.2733, "step": 1513 }, { "epoch": 1.7004071318264775, "grad_norm": 0.44682174416662607, "learning_rate": 1.6685474646457477e-05, "loss": 0.288, "step": 1514 }, { "epoch": 1.701530254106416, "grad_norm": 0.4460477682537591, "learning_rate": 1.6679639115680247e-05, "loss": 0.2892, "step": 1515 }, { "epoch": 1.702653376386354, "grad_norm": 0.42381417287482237, "learning_rate": 1.667379947484756e-05, "loss": 0.2753, "step": 1516 }, { "epoch": 1.7037764986662922, "grad_norm": 0.45442357297793295, "learning_rate": 1.666795572755262e-05, "loss": 0.2963, "step": 1517 }, { "epoch": 1.7048996209462306, "grad_norm": 0.4467561695261649, "learning_rate": 1.666210787739114e-05, "loss": 0.294, "step": 1518 }, { "epoch": 1.7060227432261688, "grad_norm": 0.4514819485972295, "learning_rate": 1.665625592796137e-05, "loss": 0.2916, "step": 1519 }, { "epoch": 1.707145865506107, "grad_norm": 0.47046147688947726, "learning_rate": 1.665039988286408e-05, "loss": 0.3059, "step": 1520 }, { "epoch": 1.7082689877860453, "grad_norm": 0.4664118838351173, "learning_rate": 1.6644539745702558e-05, "loss": 0.2977, "step": 1521 }, { "epoch": 1.7093921100659835, "grad_norm": 0.4388011550954156, "learning_rate": 1.6638675520082613e-05, "loss": 0.2998, "step": 1522 }, { "epoch": 1.7105152323459216, "grad_norm": 0.45082022297345586, "learning_rate": 1.663280720961256e-05, "loss": 0.2966, "step": 1523 }, { "epoch": 1.71163835462586, "grad_norm": 0.4497822398613249, "learning_rate": 1.662693481790324e-05, "loss": 0.2827, "step": 1524 }, { "epoch": 1.7127614769057982, "grad_norm": 0.46875113264717555, "learning_rate": 1.6621058348568008e-05, "loss": 0.3053, "step": 1525 }, { "epoch": 1.7138845991857363, "grad_norm": 0.4547313413961331, "learning_rate": 1.6615177805222703e-05, "loss": 0.2875, "step": 1526 }, { "epoch": 1.7150077214656747, "grad_norm": 0.44523115177994116, "learning_rate": 1.6609293191485704e-05, "loss": 0.2924, "step": 1527 }, { "epoch": 1.7161308437456126, "grad_norm": 0.4348408552400898, "learning_rate": 1.660340451097787e-05, "loss": 0.2842, "step": 1528 }, { "epoch": 1.717253966025551, "grad_norm": 0.4274690190508088, "learning_rate": 1.6597511767322575e-05, "loss": 0.2754, "step": 1529 }, { "epoch": 1.7183770883054894, "grad_norm": 0.45654747366119725, "learning_rate": 1.6591614964145685e-05, "loss": 0.3052, "step": 1530 }, { "epoch": 1.7195002105854273, "grad_norm": 0.4633059182553845, "learning_rate": 1.6585714105075573e-05, "loss": 0.3026, "step": 1531 }, { "epoch": 1.7206233328653657, "grad_norm": 0.45324442029714246, "learning_rate": 1.6579809193743094e-05, "loss": 0.2861, "step": 1532 }, { "epoch": 1.7217464551453041, "grad_norm": 0.4599466934645262, "learning_rate": 1.6573900233781616e-05, "loss": 0.2858, "step": 1533 }, { "epoch": 1.722869577425242, "grad_norm": 0.45346376009896494, "learning_rate": 1.656798722882698e-05, "loss": 0.2902, "step": 1534 }, { "epoch": 1.7239926997051804, "grad_norm": 0.4406609173508741, "learning_rate": 1.6562070182517524e-05, "loss": 0.2978, "step": 1535 }, { "epoch": 1.7251158219851186, "grad_norm": 0.46672133569112967, "learning_rate": 1.6556149098494075e-05, "loss": 0.2761, "step": 1536 }, { "epoch": 1.7262389442650568, "grad_norm": 0.48979063606476814, "learning_rate": 1.6550223980399934e-05, "loss": 0.2984, "step": 1537 }, { "epoch": 1.7273620665449951, "grad_norm": 0.4511837735588295, "learning_rate": 1.65442948318809e-05, "loss": 0.285, "step": 1538 }, { "epoch": 1.7284851888249333, "grad_norm": 0.45689758535181885, "learning_rate": 1.6538361656585237e-05, "loss": 0.3051, "step": 1539 }, { "epoch": 1.7296083111048715, "grad_norm": 0.42911681373204696, "learning_rate": 1.6532424458163692e-05, "loss": 0.2855, "step": 1540 }, { "epoch": 1.7307314333848098, "grad_norm": 0.4599079422199036, "learning_rate": 1.6526483240269497e-05, "loss": 0.2929, "step": 1541 }, { "epoch": 1.731854555664748, "grad_norm": 0.4304194735001978, "learning_rate": 1.6520538006558345e-05, "loss": 0.2708, "step": 1542 }, { "epoch": 1.7329776779446862, "grad_norm": 0.44178440467725394, "learning_rate": 1.6514588760688397e-05, "loss": 0.3007, "step": 1543 }, { "epoch": 1.7341008002246245, "grad_norm": 0.4423330619472906, "learning_rate": 1.65086355063203e-05, "loss": 0.2845, "step": 1544 }, { "epoch": 1.7352239225045627, "grad_norm": 0.433601638401388, "learning_rate": 1.6502678247117146e-05, "loss": 0.2796, "step": 1545 }, { "epoch": 1.7363470447845009, "grad_norm": 0.44925607846317117, "learning_rate": 1.649671698674451e-05, "loss": 0.2786, "step": 1546 }, { "epoch": 1.7374701670644392, "grad_norm": 0.4146082239786263, "learning_rate": 1.6490751728870422e-05, "loss": 0.2596, "step": 1547 }, { "epoch": 1.7385932893443774, "grad_norm": 0.4338894018059736, "learning_rate": 1.6484782477165365e-05, "loss": 0.276, "step": 1548 }, { "epoch": 1.7397164116243156, "grad_norm": 0.4650911120148975, "learning_rate": 1.6478809235302287e-05, "loss": 0.2873, "step": 1549 }, { "epoch": 1.740839533904254, "grad_norm": 0.42522285905199153, "learning_rate": 1.647283200695659e-05, "loss": 0.2689, "step": 1550 }, { "epoch": 1.741962656184192, "grad_norm": 0.44208521414529733, "learning_rate": 1.6466850795806136e-05, "loss": 0.3013, "step": 1551 }, { "epoch": 1.7430857784641303, "grad_norm": 0.4517383587426911, "learning_rate": 1.6460865605531214e-05, "loss": 0.2905, "step": 1552 }, { "epoch": 1.7442089007440686, "grad_norm": 0.42667270058823686, "learning_rate": 1.6454876439814592e-05, "loss": 0.2778, "step": 1553 }, { "epoch": 1.7453320230240066, "grad_norm": 0.4596306665470625, "learning_rate": 1.644888330234146e-05, "loss": 0.2989, "step": 1554 }, { "epoch": 1.746455145303945, "grad_norm": 0.43733172758764655, "learning_rate": 1.6442886196799465e-05, "loss": 0.2934, "step": 1555 }, { "epoch": 1.7475782675838833, "grad_norm": 0.44887986338826835, "learning_rate": 1.6436885126878696e-05, "loss": 0.3157, "step": 1556 }, { "epoch": 1.7487013898638213, "grad_norm": 0.451897402012675, "learning_rate": 1.6430880096271672e-05, "loss": 0.2903, "step": 1557 }, { "epoch": 1.7498245121437597, "grad_norm": 0.41101933804424956, "learning_rate": 1.6424871108673355e-05, "loss": 0.2747, "step": 1558 }, { "epoch": 1.7509476344236978, "grad_norm": 0.49671129631427824, "learning_rate": 1.6418858167781145e-05, "loss": 0.3052, "step": 1559 }, { "epoch": 1.752070756703636, "grad_norm": 0.4181368637873254, "learning_rate": 1.6412841277294865e-05, "loss": 0.2834, "step": 1560 }, { "epoch": 1.7531938789835744, "grad_norm": 0.4215454138779954, "learning_rate": 1.6406820440916778e-05, "loss": 0.267, "step": 1561 }, { "epoch": 1.7543170012635125, "grad_norm": 0.4712786813972758, "learning_rate": 1.6400795662351572e-05, "loss": 0.3015, "step": 1562 }, { "epoch": 1.7554401235434507, "grad_norm": 0.456622799753585, "learning_rate": 1.639476694530635e-05, "loss": 0.2769, "step": 1563 }, { "epoch": 1.756563245823389, "grad_norm": 0.4518709261822103, "learning_rate": 1.6388734293490666e-05, "loss": 0.2828, "step": 1564 }, { "epoch": 1.7576863681033272, "grad_norm": 0.4190624104153518, "learning_rate": 1.6382697710616458e-05, "loss": 0.2702, "step": 1565 }, { "epoch": 1.7588094903832654, "grad_norm": 0.49660564296640625, "learning_rate": 1.6376657200398117e-05, "loss": 0.3019, "step": 1566 }, { "epoch": 1.7599326126632038, "grad_norm": 0.43644278562274014, "learning_rate": 1.6370612766552422e-05, "loss": 0.2656, "step": 1567 }, { "epoch": 1.761055734943142, "grad_norm": 0.45767759479301445, "learning_rate": 1.636456441279859e-05, "loss": 0.2947, "step": 1568 }, { "epoch": 1.76217885722308, "grad_norm": 0.4594875195235337, "learning_rate": 1.6358512142858234e-05, "loss": 0.2993, "step": 1569 }, { "epoch": 1.7633019795030185, "grad_norm": 0.48736759713026184, "learning_rate": 1.6352455960455385e-05, "loss": 0.289, "step": 1570 }, { "epoch": 1.7644251017829566, "grad_norm": 0.4544717098398387, "learning_rate": 1.634639586931648e-05, "loss": 0.2781, "step": 1571 }, { "epoch": 1.7655482240628948, "grad_norm": 0.4587177062509317, "learning_rate": 1.6340331873170356e-05, "loss": 0.2834, "step": 1572 }, { "epoch": 1.7666713463428332, "grad_norm": 0.48310847080548047, "learning_rate": 1.6334263975748263e-05, "loss": 0.2921, "step": 1573 }, { "epoch": 1.7677944686227713, "grad_norm": 0.46119532443782013, "learning_rate": 1.632819218078383e-05, "loss": 0.29, "step": 1574 }, { "epoch": 1.7689175909027095, "grad_norm": 0.46068898687141085, "learning_rate": 1.6322116492013116e-05, "loss": 0.299, "step": 1575 }, { "epoch": 1.7700407131826479, "grad_norm": 0.4439847667371739, "learning_rate": 1.6316036913174555e-05, "loss": 0.2855, "step": 1576 }, { "epoch": 1.7711638354625858, "grad_norm": 0.4441579095125552, "learning_rate": 1.630995344800897e-05, "loss": 0.3003, "step": 1577 }, { "epoch": 1.7722869577425242, "grad_norm": 0.4527313840660522, "learning_rate": 1.6303866100259595e-05, "loss": 0.2885, "step": 1578 }, { "epoch": 1.7734100800224626, "grad_norm": 0.4295498490008525, "learning_rate": 1.6297774873672036e-05, "loss": 0.2913, "step": 1579 }, { "epoch": 1.7745332023024005, "grad_norm": 0.4506045912150568, "learning_rate": 1.6291679771994293e-05, "loss": 0.2919, "step": 1580 }, { "epoch": 1.775656324582339, "grad_norm": 0.43120215422811353, "learning_rate": 1.6285580798976754e-05, "loss": 0.288, "step": 1581 }, { "epoch": 1.7767794468622773, "grad_norm": 0.45220082498818925, "learning_rate": 1.6279477958372175e-05, "loss": 0.2902, "step": 1582 }, { "epoch": 1.7779025691422152, "grad_norm": 0.4292832161883602, "learning_rate": 1.6273371253935707e-05, "loss": 0.2807, "step": 1583 }, { "epoch": 1.7790256914221536, "grad_norm": 0.43823143313617324, "learning_rate": 1.626726068942487e-05, "loss": 0.2899, "step": 1584 }, { "epoch": 1.7801488137020918, "grad_norm": 0.43017131250609614, "learning_rate": 1.6261146268599564e-05, "loss": 0.2717, "step": 1585 }, { "epoch": 1.78127193598203, "grad_norm": 0.45356581546068614, "learning_rate": 1.6255027995222056e-05, "loss": 0.2887, "step": 1586 }, { "epoch": 1.7823950582619683, "grad_norm": 0.44928986844226365, "learning_rate": 1.624890587305699e-05, "loss": 0.2861, "step": 1587 }, { "epoch": 1.7835181805419065, "grad_norm": 0.4524148407584021, "learning_rate": 1.6242779905871375e-05, "loss": 0.2751, "step": 1588 }, { "epoch": 1.7846413028218446, "grad_norm": 0.43075367830451183, "learning_rate": 1.6236650097434586e-05, "loss": 0.2807, "step": 1589 }, { "epoch": 1.785764425101783, "grad_norm": 0.46285463495616647, "learning_rate": 1.623051645151836e-05, "loss": 0.3121, "step": 1590 }, { "epoch": 1.7868875473817212, "grad_norm": 0.43530135150806504, "learning_rate": 1.6224378971896798e-05, "loss": 0.2819, "step": 1591 }, { "epoch": 1.7880106696616593, "grad_norm": 0.490996011158021, "learning_rate": 1.6218237662346356e-05, "loss": 0.2893, "step": 1592 }, { "epoch": 1.7891337919415977, "grad_norm": 0.4647817119466746, "learning_rate": 1.6212092526645854e-05, "loss": 0.2843, "step": 1593 }, { "epoch": 1.7902569142215359, "grad_norm": 0.4413295898097072, "learning_rate": 1.6205943568576457e-05, "loss": 0.2799, "step": 1594 }, { "epoch": 1.791380036501474, "grad_norm": 0.43007143575326345, "learning_rate": 1.6199790791921693e-05, "loss": 0.2593, "step": 1595 }, { "epoch": 1.7925031587814124, "grad_norm": 0.4881130643190996, "learning_rate": 1.6193634200467426e-05, "loss": 0.2961, "step": 1596 }, { "epoch": 1.7936262810613506, "grad_norm": 0.4528464937764476, "learning_rate": 1.618747379800188e-05, "loss": 0.2854, "step": 1597 }, { "epoch": 1.7947494033412887, "grad_norm": 0.45653808613442953, "learning_rate": 1.6181309588315616e-05, "loss": 0.2919, "step": 1598 }, { "epoch": 1.7958725256212271, "grad_norm": 0.4327325119499413, "learning_rate": 1.6175141575201537e-05, "loss": 0.2709, "step": 1599 }, { "epoch": 1.7969956479011653, "grad_norm": 0.4377952749193301, "learning_rate": 1.6168969762454897e-05, "loss": 0.2761, "step": 1600 }, { "epoch": 1.7981187701811034, "grad_norm": 0.4878789221305916, "learning_rate": 1.616279415387327e-05, "loss": 0.312, "step": 1601 }, { "epoch": 1.7992418924610418, "grad_norm": 0.4072202956012937, "learning_rate": 1.6156614753256583e-05, "loss": 0.2731, "step": 1602 }, { "epoch": 1.8003650147409798, "grad_norm": 0.425387058556243, "learning_rate": 1.615043156440709e-05, "loss": 0.2647, "step": 1603 }, { "epoch": 1.8014881370209181, "grad_norm": 0.44570656361437866, "learning_rate": 1.6144244591129373e-05, "loss": 0.2856, "step": 1604 }, { "epoch": 1.8026112593008565, "grad_norm": 0.5020676279175268, "learning_rate": 1.6138053837230345e-05, "loss": 0.3087, "step": 1605 }, { "epoch": 1.8037343815807945, "grad_norm": 0.43825994146175534, "learning_rate": 1.6131859306519243e-05, "loss": 0.2899, "step": 1606 }, { "epoch": 1.8048575038607328, "grad_norm": 0.4446328374851358, "learning_rate": 1.612566100280763e-05, "loss": 0.28, "step": 1607 }, { "epoch": 1.8059806261406712, "grad_norm": 0.4774225518830962, "learning_rate": 1.6119458929909394e-05, "loss": 0.3031, "step": 1608 }, { "epoch": 1.8071037484206092, "grad_norm": 0.426328321425137, "learning_rate": 1.611325309164074e-05, "loss": 0.2766, "step": 1609 }, { "epoch": 1.8082268707005476, "grad_norm": 0.43586684491416294, "learning_rate": 1.610704349182018e-05, "loss": 0.2928, "step": 1610 }, { "epoch": 1.8093499929804857, "grad_norm": 0.4567133760129506, "learning_rate": 1.6100830134268558e-05, "loss": 0.2722, "step": 1611 }, { "epoch": 1.8104731152604239, "grad_norm": 0.45026824414792077, "learning_rate": 1.6094613022809017e-05, "loss": 0.2876, "step": 1612 }, { "epoch": 1.8115962375403623, "grad_norm": 0.45757626469045953, "learning_rate": 1.6088392161267018e-05, "loss": 0.2961, "step": 1613 }, { "epoch": 1.8127193598203004, "grad_norm": 0.4502992493729303, "learning_rate": 1.6082167553470318e-05, "loss": 0.3049, "step": 1614 }, { "epoch": 1.8138424821002386, "grad_norm": 0.4454769781082224, "learning_rate": 1.607593920324899e-05, "loss": 0.2817, "step": 1615 }, { "epoch": 1.814965604380177, "grad_norm": 0.4435355751540588, "learning_rate": 1.606970711443541e-05, "loss": 0.2704, "step": 1616 }, { "epoch": 1.8160887266601151, "grad_norm": 0.4434669970265584, "learning_rate": 1.606347129086425e-05, "loss": 0.2893, "step": 1617 }, { "epoch": 1.8172118489400533, "grad_norm": 0.4472789556626577, "learning_rate": 1.6057231736372478e-05, "loss": 0.2784, "step": 1618 }, { "epoch": 1.8183349712199917, "grad_norm": 0.45291111323420824, "learning_rate": 1.605098845479936e-05, "loss": 0.2933, "step": 1619 }, { "epoch": 1.8194580934999298, "grad_norm": 0.4599287948047513, "learning_rate": 1.6044741449986458e-05, "loss": 0.2984, "step": 1620 }, { "epoch": 1.820581215779868, "grad_norm": 0.4584871742864635, "learning_rate": 1.6038490725777624e-05, "loss": 0.3051, "step": 1621 }, { "epoch": 1.8217043380598064, "grad_norm": 0.4696965226478849, "learning_rate": 1.6032236286018995e-05, "loss": 0.3184, "step": 1622 }, { "epoch": 1.8228274603397445, "grad_norm": 0.4456532072721999, "learning_rate": 1.6025978134559e-05, "loss": 0.2814, "step": 1623 }, { "epoch": 1.8239505826196827, "grad_norm": 0.45088143754678195, "learning_rate": 1.6019716275248342e-05, "loss": 0.3047, "step": 1624 }, { "epoch": 1.825073704899621, "grad_norm": 0.43175491411132033, "learning_rate": 1.6013450711940017e-05, "loss": 0.274, "step": 1625 }, { "epoch": 1.8261968271795592, "grad_norm": 0.46280692541168383, "learning_rate": 1.600718144848929e-05, "loss": 0.2808, "step": 1626 }, { "epoch": 1.8273199494594974, "grad_norm": 0.45335668599330486, "learning_rate": 1.600090848875372e-05, "loss": 0.2749, "step": 1627 }, { "epoch": 1.8284430717394358, "grad_norm": 0.4903202775753055, "learning_rate": 1.5994631836593116e-05, "loss": 0.3107, "step": 1628 }, { "epoch": 1.8295661940193737, "grad_norm": 0.4715442977840691, "learning_rate": 1.5988351495869574e-05, "loss": 0.3007, "step": 1629 }, { "epoch": 1.830689316299312, "grad_norm": 0.4535990104723046, "learning_rate": 1.598206747044746e-05, "loss": 0.2845, "step": 1630 }, { "epoch": 1.8318124385792505, "grad_norm": 0.45369244026805666, "learning_rate": 1.59757797641934e-05, "loss": 0.2893, "step": 1631 }, { "epoch": 1.8329355608591884, "grad_norm": 0.4406335157305204, "learning_rate": 1.596948838097629e-05, "loss": 0.2882, "step": 1632 }, { "epoch": 1.8340586831391268, "grad_norm": 0.4471966455869977, "learning_rate": 1.596319332466729e-05, "loss": 0.275, "step": 1633 }, { "epoch": 1.835181805419065, "grad_norm": 0.43360372640049477, "learning_rate": 1.5956894599139814e-05, "loss": 0.2819, "step": 1634 }, { "epoch": 1.836304927699003, "grad_norm": 0.43343847003699065, "learning_rate": 1.5950592208269536e-05, "loss": 0.2875, "step": 1635 }, { "epoch": 1.8374280499789415, "grad_norm": 0.4165129497318286, "learning_rate": 1.5944286155934396e-05, "loss": 0.2636, "step": 1636 }, { "epoch": 1.8385511722588797, "grad_norm": 0.45238285001736483, "learning_rate": 1.5937976446014563e-05, "loss": 0.263, "step": 1637 }, { "epoch": 1.8396742945388178, "grad_norm": 0.4466843595442531, "learning_rate": 1.593166308239248e-05, "loss": 0.276, "step": 1638 }, { "epoch": 1.8407974168187562, "grad_norm": 0.4358078236250983, "learning_rate": 1.5925346068952833e-05, "loss": 0.2951, "step": 1639 }, { "epoch": 1.8419205390986944, "grad_norm": 0.44824760657818036, "learning_rate": 1.5919025409582537e-05, "loss": 0.3078, "step": 1640 }, { "epoch": 1.8430436613786325, "grad_norm": 0.43735777546518884, "learning_rate": 1.5912701108170777e-05, "loss": 0.2877, "step": 1641 }, { "epoch": 1.844166783658571, "grad_norm": 0.4578607904759152, "learning_rate": 1.5906373168608952e-05, "loss": 0.2865, "step": 1642 }, { "epoch": 1.845289905938509, "grad_norm": 0.41061673520301367, "learning_rate": 1.5900041594790722e-05, "loss": 0.2744, "step": 1643 }, { "epoch": 1.8464130282184472, "grad_norm": 0.4765113577474518, "learning_rate": 1.5893706390611978e-05, "loss": 0.3107, "step": 1644 }, { "epoch": 1.8475361504983856, "grad_norm": 0.435712372199893, "learning_rate": 1.5887367559970825e-05, "loss": 0.2664, "step": 1645 }, { "epoch": 1.8486592727783238, "grad_norm": 0.4753377958035371, "learning_rate": 1.588102510676763e-05, "loss": 0.2915, "step": 1646 }, { "epoch": 1.849782395058262, "grad_norm": 0.45798588409720853, "learning_rate": 1.5874679034904966e-05, "loss": 0.2794, "step": 1647 }, { "epoch": 1.8509055173382003, "grad_norm": 0.42891102168652906, "learning_rate": 1.5868329348287647e-05, "loss": 0.2712, "step": 1648 }, { "epoch": 1.8520286396181385, "grad_norm": 0.45380196793915967, "learning_rate": 1.58619760508227e-05, "loss": 0.2872, "step": 1649 }, { "epoch": 1.8531517618980766, "grad_norm": 0.4721744608194615, "learning_rate": 1.5855619146419382e-05, "loss": 0.3042, "step": 1650 }, { "epoch": 1.854274884178015, "grad_norm": 0.4426415279566046, "learning_rate": 1.5849258638989166e-05, "loss": 0.2886, "step": 1651 }, { "epoch": 1.8553980064579532, "grad_norm": 0.4403624280134429, "learning_rate": 1.5842894532445738e-05, "loss": 0.29, "step": 1652 }, { "epoch": 1.8565211287378913, "grad_norm": 0.44346698610518404, "learning_rate": 1.583652683070501e-05, "loss": 0.286, "step": 1653 }, { "epoch": 1.8576442510178297, "grad_norm": 0.43077248160418, "learning_rate": 1.5830155537685093e-05, "loss": 0.2822, "step": 1654 }, { "epoch": 1.8587673732977676, "grad_norm": 0.4347083314214242, "learning_rate": 1.5823780657306313e-05, "loss": 0.2749, "step": 1655 }, { "epoch": 1.859890495577706, "grad_norm": 0.4278133742132193, "learning_rate": 1.581740219349121e-05, "loss": 0.2656, "step": 1656 }, { "epoch": 1.8610136178576444, "grad_norm": 0.418571552787694, "learning_rate": 1.5811020150164518e-05, "loss": 0.2563, "step": 1657 }, { "epoch": 1.8621367401375823, "grad_norm": 0.4692428605964284, "learning_rate": 1.5804634531253184e-05, "loss": 0.2982, "step": 1658 }, { "epoch": 1.8632598624175207, "grad_norm": 0.4763812793557251, "learning_rate": 1.5798245340686342e-05, "loss": 0.2839, "step": 1659 }, { "epoch": 1.864382984697459, "grad_norm": 0.45442752765196487, "learning_rate": 1.5791852582395334e-05, "loss": 0.3055, "step": 1660 }, { "epoch": 1.865506106977397, "grad_norm": 0.4210856174208563, "learning_rate": 1.5785456260313702e-05, "loss": 0.2787, "step": 1661 }, { "epoch": 1.8666292292573354, "grad_norm": 0.4701122518568725, "learning_rate": 1.577905637837716e-05, "loss": 0.297, "step": 1662 }, { "epoch": 1.8677523515372736, "grad_norm": 0.45300215556011086, "learning_rate": 1.5772652940523637e-05, "loss": 0.287, "step": 1663 }, { "epoch": 1.8688754738172118, "grad_norm": 0.4483737642832065, "learning_rate": 1.576624595069323e-05, "loss": 0.2889, "step": 1664 }, { "epoch": 1.8699985960971501, "grad_norm": 0.44444518527475235, "learning_rate": 1.575983541282824e-05, "loss": 0.278, "step": 1665 }, { "epoch": 1.8711217183770883, "grad_norm": 0.4736568759863554, "learning_rate": 1.5753421330873134e-05, "loss": 0.2886, "step": 1666 }, { "epoch": 1.8722448406570265, "grad_norm": 0.47085233017222466, "learning_rate": 1.574700370877457e-05, "loss": 0.2988, "step": 1667 }, { "epoch": 1.8733679629369648, "grad_norm": 0.4503283744113668, "learning_rate": 1.574058255048138e-05, "loss": 0.2842, "step": 1668 }, { "epoch": 1.874491085216903, "grad_norm": 0.45383270715255947, "learning_rate": 1.5734157859944574e-05, "loss": 0.2956, "step": 1669 }, { "epoch": 1.8756142074968412, "grad_norm": 0.458441923267404, "learning_rate": 1.5727729641117338e-05, "loss": 0.3, "step": 1670 }, { "epoch": 1.8767373297767795, "grad_norm": 0.45050041121130363, "learning_rate": 1.5721297897955023e-05, "loss": 0.2843, "step": 1671 }, { "epoch": 1.8778604520567177, "grad_norm": 0.42269479225207923, "learning_rate": 1.5714862634415145e-05, "loss": 0.2755, "step": 1672 }, { "epoch": 1.8789835743366559, "grad_norm": 0.4118289888578974, "learning_rate": 1.5708423854457408e-05, "loss": 0.2683, "step": 1673 }, { "epoch": 1.8801066966165942, "grad_norm": 0.4519087067087661, "learning_rate": 1.5701981562043648e-05, "loss": 0.2941, "step": 1674 }, { "epoch": 1.8812298188965324, "grad_norm": 0.45169859494837117, "learning_rate": 1.569553576113789e-05, "loss": 0.3081, "step": 1675 }, { "epoch": 1.8823529411764706, "grad_norm": 0.45818548968025397, "learning_rate": 1.56890864557063e-05, "loss": 0.2906, "step": 1676 }, { "epoch": 1.883476063456409, "grad_norm": 0.41903925729828695, "learning_rate": 1.5682633649717206e-05, "loss": 0.256, "step": 1677 }, { "epoch": 1.8845991857363469, "grad_norm": 0.4494067572971225, "learning_rate": 1.5676177347141096e-05, "loss": 0.2999, "step": 1678 }, { "epoch": 1.8857223080162853, "grad_norm": 0.430053388913964, "learning_rate": 1.5669717551950595e-05, "loss": 0.2778, "step": 1679 }, { "epoch": 1.8868454302962236, "grad_norm": 0.45114702164180737, "learning_rate": 1.5663254268120497e-05, "loss": 0.2782, "step": 1680 }, { "epoch": 1.8879685525761616, "grad_norm": 0.4767521278617856, "learning_rate": 1.5656787499627727e-05, "loss": 0.3075, "step": 1681 }, { "epoch": 1.8890916748561, "grad_norm": 0.43432528267843223, "learning_rate": 1.5650317250451357e-05, "loss": 0.2925, "step": 1682 }, { "epoch": 1.8902147971360383, "grad_norm": 0.43280983326124206, "learning_rate": 1.5643843524572605e-05, "loss": 0.2833, "step": 1683 }, { "epoch": 1.8913379194159763, "grad_norm": 0.4460244169687265, "learning_rate": 1.5637366325974823e-05, "loss": 0.3001, "step": 1684 }, { "epoch": 1.8924610416959147, "grad_norm": 0.4748835442825777, "learning_rate": 1.5630885658643508e-05, "loss": 0.305, "step": 1685 }, { "epoch": 1.8935841639758528, "grad_norm": 0.4145939481842465, "learning_rate": 1.5624401526566277e-05, "loss": 0.2629, "step": 1686 }, { "epoch": 1.894707286255791, "grad_norm": 0.452591025755552, "learning_rate": 1.5617913933732892e-05, "loss": 0.3203, "step": 1687 }, { "epoch": 1.8958304085357294, "grad_norm": 0.42763483263940355, "learning_rate": 1.5611422884135245e-05, "loss": 0.2815, "step": 1688 }, { "epoch": 1.8969535308156675, "grad_norm": 0.43158239697123274, "learning_rate": 1.5604928381767345e-05, "loss": 0.2793, "step": 1689 }, { "epoch": 1.8980766530956057, "grad_norm": 0.4293853340809833, "learning_rate": 1.5598430430625335e-05, "loss": 0.2809, "step": 1690 }, { "epoch": 1.899199775375544, "grad_norm": 0.46535273456706855, "learning_rate": 1.5591929034707468e-05, "loss": 0.3093, "step": 1691 }, { "epoch": 1.9003228976554822, "grad_norm": 0.43293986112214783, "learning_rate": 1.5585424198014135e-05, "loss": 0.2704, "step": 1692 }, { "epoch": 1.9014460199354204, "grad_norm": 0.47308351663116804, "learning_rate": 1.5578915924547824e-05, "loss": 0.3087, "step": 1693 }, { "epoch": 1.9025691422153588, "grad_norm": 0.45499586114643786, "learning_rate": 1.557240421831315e-05, "loss": 0.2979, "step": 1694 }, { "epoch": 1.903692264495297, "grad_norm": 0.43179402389716814, "learning_rate": 1.5565889083316847e-05, "loss": 0.2795, "step": 1695 }, { "epoch": 1.904815386775235, "grad_norm": 0.4337080321597782, "learning_rate": 1.5559370523567734e-05, "loss": 0.2884, "step": 1696 }, { "epoch": 1.9059385090551735, "grad_norm": 0.4416737410605399, "learning_rate": 1.5552848543076762e-05, "loss": 0.2916, "step": 1697 }, { "epoch": 1.9070616313351116, "grad_norm": 0.45327771862806987, "learning_rate": 1.5546323145856976e-05, "loss": 0.283, "step": 1698 }, { "epoch": 1.9081847536150498, "grad_norm": 0.43897115726266495, "learning_rate": 1.5539794335923523e-05, "loss": 0.2757, "step": 1699 }, { "epoch": 1.9093078758949882, "grad_norm": 0.45535826828398407, "learning_rate": 1.553326211729365e-05, "loss": 0.2846, "step": 1700 }, { "epoch": 1.9104309981749263, "grad_norm": 0.44632754329033314, "learning_rate": 1.5526726493986707e-05, "loss": 0.2854, "step": 1701 }, { "epoch": 1.9115541204548645, "grad_norm": 0.44264267367794924, "learning_rate": 1.5520187470024138e-05, "loss": 0.291, "step": 1702 }, { "epoch": 1.9126772427348029, "grad_norm": 0.4261191699485715, "learning_rate": 1.5513645049429468e-05, "loss": 0.2744, "step": 1703 }, { "epoch": 1.9138003650147408, "grad_norm": 0.4718910711066927, "learning_rate": 1.550709923622832e-05, "loss": 0.3094, "step": 1704 }, { "epoch": 1.9149234872946792, "grad_norm": 0.443144418899658, "learning_rate": 1.5500550034448415e-05, "loss": 0.265, "step": 1705 }, { "epoch": 1.9160466095746176, "grad_norm": 0.4680353665068334, "learning_rate": 1.549399744811954e-05, "loss": 0.2984, "step": 1706 }, { "epoch": 1.9171697318545555, "grad_norm": 0.451871229122914, "learning_rate": 1.5487441481273576e-05, "loss": 0.2856, "step": 1707 }, { "epoch": 1.918292854134494, "grad_norm": 0.45525795558547877, "learning_rate": 1.5480882137944483e-05, "loss": 0.2972, "step": 1708 }, { "epoch": 1.919415976414432, "grad_norm": 0.457501569499871, "learning_rate": 1.54743194221683e-05, "loss": 0.2842, "step": 1709 }, { "epoch": 1.9205390986943702, "grad_norm": 0.40429630388545335, "learning_rate": 1.546775333798313e-05, "loss": 0.2635, "step": 1710 }, { "epoch": 1.9216622209743086, "grad_norm": 0.43949600121406834, "learning_rate": 1.5461183889429163e-05, "loss": 0.287, "step": 1711 }, { "epoch": 1.9227853432542468, "grad_norm": 0.44623475635576615, "learning_rate": 1.545461108054865e-05, "loss": 0.295, "step": 1712 }, { "epoch": 1.923908465534185, "grad_norm": 0.4332274620156582, "learning_rate": 1.5448034915385912e-05, "loss": 0.2764, "step": 1713 }, { "epoch": 1.9250315878141233, "grad_norm": 0.43976301663859074, "learning_rate": 1.5441455397987342e-05, "loss": 0.2796, "step": 1714 }, { "epoch": 1.9261547100940615, "grad_norm": 0.48004372769154424, "learning_rate": 1.543487253240138e-05, "loss": 0.3315, "step": 1715 }, { "epoch": 1.9272778323739996, "grad_norm": 0.44346770478396347, "learning_rate": 1.5428286322678544e-05, "loss": 0.3029, "step": 1716 }, { "epoch": 1.928400954653938, "grad_norm": 0.452216885627314, "learning_rate": 1.54216967728714e-05, "loss": 0.2889, "step": 1717 }, { "epoch": 1.9295240769338762, "grad_norm": 0.43338802482350897, "learning_rate": 1.5415103887034565e-05, "loss": 0.2847, "step": 1718 }, { "epoch": 1.9306471992138143, "grad_norm": 0.42192888016892804, "learning_rate": 1.540850766922472e-05, "loss": 0.283, "step": 1719 }, { "epoch": 1.9317703214937527, "grad_norm": 0.45160195890660065, "learning_rate": 1.540190812350059e-05, "loss": 0.2745, "step": 1720 }, { "epoch": 1.9328934437736909, "grad_norm": 0.45447959773978414, "learning_rate": 1.539530525392294e-05, "loss": 0.3008, "step": 1721 }, { "epoch": 1.934016566053629, "grad_norm": 0.4371017493232612, "learning_rate": 1.53886990645546e-05, "loss": 0.2948, "step": 1722 }, { "epoch": 1.9351396883335674, "grad_norm": 0.4334080460875823, "learning_rate": 1.5382089559460423e-05, "loss": 0.2818, "step": 1723 }, { "epoch": 1.9362628106135056, "grad_norm": 0.4731746980122572, "learning_rate": 1.5375476742707314e-05, "loss": 0.3038, "step": 1724 }, { "epoch": 1.9373859328934437, "grad_norm": 0.454174378429582, "learning_rate": 1.536886061836421e-05, "loss": 0.2948, "step": 1725 }, { "epoch": 1.9385090551733821, "grad_norm": 0.4432630231842709, "learning_rate": 1.5362241190502086e-05, "loss": 0.2852, "step": 1726 }, { "epoch": 1.9396321774533203, "grad_norm": 0.4598840425874724, "learning_rate": 1.5355618463193945e-05, "loss": 0.2815, "step": 1727 }, { "epoch": 1.9407552997332584, "grad_norm": 0.451831626436298, "learning_rate": 1.5348992440514832e-05, "loss": 0.2729, "step": 1728 }, { "epoch": 1.9418784220131968, "grad_norm": 0.46544809796158676, "learning_rate": 1.534236312654181e-05, "loss": 0.3169, "step": 1729 }, { "epoch": 1.9430015442931348, "grad_norm": 0.44645885927284146, "learning_rate": 1.5335730525353962e-05, "loss": 0.2785, "step": 1730 }, { "epoch": 1.9441246665730731, "grad_norm": 0.469136480578403, "learning_rate": 1.5329094641032406e-05, "loss": 0.3056, "step": 1731 }, { "epoch": 1.9452477888530115, "grad_norm": 0.43983149032385477, "learning_rate": 1.5322455477660274e-05, "loss": 0.2836, "step": 1732 }, { "epoch": 1.9463709111329495, "grad_norm": 0.44719810124182485, "learning_rate": 1.5315813039322714e-05, "loss": 0.2801, "step": 1733 }, { "epoch": 1.9474940334128878, "grad_norm": 0.4290827977526451, "learning_rate": 1.5309167330106895e-05, "loss": 0.2704, "step": 1734 }, { "epoch": 1.948617155692826, "grad_norm": 0.4497567848165034, "learning_rate": 1.5302518354101992e-05, "loss": 0.2967, "step": 1735 }, { "epoch": 1.9497402779727642, "grad_norm": 0.468862605371959, "learning_rate": 1.5295866115399193e-05, "loss": 0.3041, "step": 1736 }, { "epoch": 1.9508634002527026, "grad_norm": 0.4888366595716739, "learning_rate": 1.5289210618091695e-05, "loss": 0.3024, "step": 1737 }, { "epoch": 1.9519865225326407, "grad_norm": 0.476112693226597, "learning_rate": 1.52825518662747e-05, "loss": 0.2621, "step": 1738 }, { "epoch": 1.9531096448125789, "grad_norm": 0.44772753456589154, "learning_rate": 1.527588986404541e-05, "loss": 0.2796, "step": 1739 }, { "epoch": 1.9542327670925173, "grad_norm": 0.5107778899997112, "learning_rate": 1.5269224615503025e-05, "loss": 0.3308, "step": 1740 }, { "epoch": 1.9553558893724554, "grad_norm": 0.4674299014245734, "learning_rate": 1.5262556124748754e-05, "loss": 0.2826, "step": 1741 }, { "epoch": 1.9564790116523936, "grad_norm": 0.46960805580213283, "learning_rate": 1.5255884395885785e-05, "loss": 0.2939, "step": 1742 }, { "epoch": 1.957602133932332, "grad_norm": 0.4209722750164603, "learning_rate": 1.5249209433019307e-05, "loss": 0.2722, "step": 1743 }, { "epoch": 1.9587252562122701, "grad_norm": 0.4568155973367222, "learning_rate": 1.5242531240256501e-05, "loss": 0.2811, "step": 1744 }, { "epoch": 1.9598483784922083, "grad_norm": 0.4307082739584054, "learning_rate": 1.5235849821706531e-05, "loss": 0.28, "step": 1745 }, { "epoch": 1.9609715007721467, "grad_norm": 0.4234189998569159, "learning_rate": 1.5229165181480552e-05, "loss": 0.2671, "step": 1746 }, { "epoch": 1.9620946230520848, "grad_norm": 0.4739925558985982, "learning_rate": 1.5222477323691687e-05, "loss": 0.307, "step": 1747 }, { "epoch": 1.963217745332023, "grad_norm": 0.44810995148794674, "learning_rate": 1.5215786252455056e-05, "loss": 0.2925, "step": 1748 }, { "epoch": 1.9643408676119614, "grad_norm": 0.43430736966909755, "learning_rate": 1.5209091971887747e-05, "loss": 0.281, "step": 1749 }, { "epoch": 1.9654639898918995, "grad_norm": 0.44148058197118256, "learning_rate": 1.5202394486108823e-05, "loss": 0.2981, "step": 1750 }, { "epoch": 1.9665871121718377, "grad_norm": 0.4108291724424686, "learning_rate": 1.5195693799239322e-05, "loss": 0.2675, "step": 1751 }, { "epoch": 1.967710234451776, "grad_norm": 0.4412806106090446, "learning_rate": 1.5188989915402253e-05, "loss": 0.2827, "step": 1752 }, { "epoch": 1.968833356731714, "grad_norm": 0.4529168112870875, "learning_rate": 1.5182282838722584e-05, "loss": 0.2939, "step": 1753 }, { "epoch": 1.9699564790116524, "grad_norm": 0.41285809272736673, "learning_rate": 1.5175572573327257e-05, "loss": 0.2731, "step": 1754 }, { "epoch": 1.9710796012915908, "grad_norm": 0.43190618430843086, "learning_rate": 1.5168859123345172e-05, "loss": 0.2774, "step": 1755 }, { "epoch": 1.9722027235715287, "grad_norm": 0.4552856993615373, "learning_rate": 1.5162142492907186e-05, "loss": 0.2862, "step": 1756 }, { "epoch": 1.973325845851467, "grad_norm": 0.4514244233597325, "learning_rate": 1.5155422686146118e-05, "loss": 0.2727, "step": 1757 }, { "epoch": 1.9744489681314055, "grad_norm": 0.43333285646026426, "learning_rate": 1.5148699707196739e-05, "loss": 0.2774, "step": 1758 }, { "epoch": 1.9755720904113434, "grad_norm": 0.4508656102546517, "learning_rate": 1.5141973560195768e-05, "loss": 0.2944, "step": 1759 }, { "epoch": 1.9766952126912818, "grad_norm": 0.4743163650426743, "learning_rate": 1.5135244249281884e-05, "loss": 0.323, "step": 1760 }, { "epoch": 1.97781833497122, "grad_norm": 0.4564248851686335, "learning_rate": 1.5128511778595703e-05, "loss": 0.2717, "step": 1761 }, { "epoch": 1.978941457251158, "grad_norm": 0.470303991029109, "learning_rate": 1.5121776152279786e-05, "loss": 0.3205, "step": 1762 }, { "epoch": 1.9800645795310965, "grad_norm": 0.4224772001099396, "learning_rate": 1.5115037374478641e-05, "loss": 0.2678, "step": 1763 }, { "epoch": 1.9811877018110347, "grad_norm": 0.4457682890054829, "learning_rate": 1.510829544933871e-05, "loss": 0.297, "step": 1764 }, { "epoch": 1.9823108240909728, "grad_norm": 0.4371918445392385, "learning_rate": 1.5101550381008377e-05, "loss": 0.272, "step": 1765 }, { "epoch": 1.9834339463709112, "grad_norm": 0.4565606365981388, "learning_rate": 1.5094802173637953e-05, "loss": 0.3024, "step": 1766 }, { "epoch": 1.9845570686508494, "grad_norm": 0.4712494126543963, "learning_rate": 1.5088050831379684e-05, "loss": 0.2958, "step": 1767 }, { "epoch": 1.9856801909307875, "grad_norm": 0.4813432876456791, "learning_rate": 1.508129635838775e-05, "loss": 0.2996, "step": 1768 }, { "epoch": 1.986803313210726, "grad_norm": 0.42456961820085154, "learning_rate": 1.5074538758818247e-05, "loss": 0.2701, "step": 1769 }, { "epoch": 1.987926435490664, "grad_norm": 0.4466264321919789, "learning_rate": 1.5067778036829204e-05, "loss": 0.2844, "step": 1770 }, { "epoch": 1.9890495577706022, "grad_norm": 0.45361513546811316, "learning_rate": 1.5061014196580565e-05, "loss": 0.279, "step": 1771 }, { "epoch": 1.9901726800505406, "grad_norm": 0.4418593232740048, "learning_rate": 1.5054247242234197e-05, "loss": 0.2736, "step": 1772 }, { "epoch": 1.9912958023304788, "grad_norm": 0.4860660602595377, "learning_rate": 1.5047477177953887e-05, "loss": 0.313, "step": 1773 }, { "epoch": 1.992418924610417, "grad_norm": 0.43959539799018116, "learning_rate": 1.5040704007905319e-05, "loss": 0.2737, "step": 1774 }, { "epoch": 1.9935420468903553, "grad_norm": 0.49294497786554403, "learning_rate": 1.5033927736256107e-05, "loss": 0.3041, "step": 1775 }, { "epoch": 1.9946651691702935, "grad_norm": 0.42461221519723574, "learning_rate": 1.5027148367175759e-05, "loss": 0.2616, "step": 1776 }, { "epoch": 1.9957882914502316, "grad_norm": 0.4402615651724432, "learning_rate": 1.50203659048357e-05, "loss": 0.2697, "step": 1777 }, { "epoch": 1.99691141373017, "grad_norm": 0.46310993944775436, "learning_rate": 1.5013580353409259e-05, "loss": 0.2917, "step": 1778 }, { "epoch": 1.998034536010108, "grad_norm": 0.46147694875445594, "learning_rate": 1.500679171707165e-05, "loss": 0.2791, "step": 1779 }, { "epoch": 1.9991576582900463, "grad_norm": 0.4680035095821885, "learning_rate": 1.5000000000000002e-05, "loss": 0.2826, "step": 1780 }, { "epoch": 2.0002807805699847, "grad_norm": 0.8085198551259174, "learning_rate": 1.499320520637333e-05, "loss": 0.4237, "step": 1781 }, { "epoch": 2.0014039028499226, "grad_norm": 0.541462070507787, "learning_rate": 1.4986407340372546e-05, "loss": 0.177, "step": 1782 }, { "epoch": 2.002527025129861, "grad_norm": 0.514819434140705, "learning_rate": 1.4979606406180456e-05, "loss": 0.1881, "step": 1783 }, { "epoch": 2.0036501474097994, "grad_norm": 0.45400673841814165, "learning_rate": 1.4972802407981744e-05, "loss": 0.174, "step": 1784 }, { "epoch": 2.0047732696897373, "grad_norm": 0.5894341185038876, "learning_rate": 1.4965995349962987e-05, "loss": 0.1669, "step": 1785 }, { "epoch": 2.0058963919696757, "grad_norm": 0.780454484731567, "learning_rate": 1.4959185236312642e-05, "loss": 0.1779, "step": 1786 }, { "epoch": 2.007019514249614, "grad_norm": 0.5115911523467761, "learning_rate": 1.495237207122105e-05, "loss": 0.1719, "step": 1787 }, { "epoch": 2.008142636529552, "grad_norm": 0.47746167967424985, "learning_rate": 1.4945555858880422e-05, "loss": 0.1658, "step": 1788 }, { "epoch": 2.0092657588094904, "grad_norm": 0.49235675543550705, "learning_rate": 1.493873660348485e-05, "loss": 0.1862, "step": 1789 }, { "epoch": 2.010388881089429, "grad_norm": 0.4566009363822769, "learning_rate": 1.49319143092303e-05, "loss": 0.1768, "step": 1790 }, { "epoch": 2.0115120033693668, "grad_norm": 0.4648626440058988, "learning_rate": 1.4925088980314604e-05, "loss": 0.1751, "step": 1791 }, { "epoch": 2.012635125649305, "grad_norm": 0.45375792194533593, "learning_rate": 1.4918260620937458e-05, "loss": 0.1647, "step": 1792 }, { "epoch": 2.0137582479292435, "grad_norm": 0.42584485799570987, "learning_rate": 1.4911429235300425e-05, "loss": 0.156, "step": 1793 }, { "epoch": 2.0148813702091815, "grad_norm": 0.460137628374996, "learning_rate": 1.490459482760694e-05, "loss": 0.1708, "step": 1794 }, { "epoch": 2.01600449248912, "grad_norm": 0.45344627339365856, "learning_rate": 1.4897757402062285e-05, "loss": 0.1567, "step": 1795 }, { "epoch": 2.0171276147690578, "grad_norm": 0.4953474815052228, "learning_rate": 1.48909169628736e-05, "loss": 0.1707, "step": 1796 }, { "epoch": 2.018250737048996, "grad_norm": 0.4753046189955386, "learning_rate": 1.488407351424989e-05, "loss": 0.1641, "step": 1797 }, { "epoch": 2.0193738593289345, "grad_norm": 0.4519314500216831, "learning_rate": 1.4877227060401997e-05, "loss": 0.1668, "step": 1798 }, { "epoch": 2.0204969816088725, "grad_norm": 0.45671522761655353, "learning_rate": 1.4870377605542624e-05, "loss": 0.1601, "step": 1799 }, { "epoch": 2.021620103888811, "grad_norm": 0.456319516767008, "learning_rate": 1.4863525153886314e-05, "loss": 0.1641, "step": 1800 }, { "epoch": 2.0227432261687492, "grad_norm": 0.45491434648308354, "learning_rate": 1.4856669709649455e-05, "loss": 0.1707, "step": 1801 }, { "epoch": 2.023866348448687, "grad_norm": 0.4492812134103008, "learning_rate": 1.4849811277050279e-05, "loss": 0.1667, "step": 1802 }, { "epoch": 2.0249894707286256, "grad_norm": 0.47837793383271804, "learning_rate": 1.4842949860308854e-05, "loss": 0.1762, "step": 1803 }, { "epoch": 2.026112593008564, "grad_norm": 0.4366394160857708, "learning_rate": 1.4836085463647088e-05, "loss": 0.1627, "step": 1804 }, { "epoch": 2.027235715288502, "grad_norm": 0.4819217078029733, "learning_rate": 1.4829218091288713e-05, "loss": 0.1833, "step": 1805 }, { "epoch": 2.0283588375684403, "grad_norm": 0.4592333930190147, "learning_rate": 1.4822347747459307e-05, "loss": 0.1668, "step": 1806 }, { "epoch": 2.0294819598483786, "grad_norm": 0.45422769074810193, "learning_rate": 1.4815474436386263e-05, "loss": 0.1683, "step": 1807 }, { "epoch": 2.0306050821283166, "grad_norm": 0.4600329925419827, "learning_rate": 1.4808598162298806e-05, "loss": 0.1747, "step": 1808 }, { "epoch": 2.031728204408255, "grad_norm": 0.4583134137959598, "learning_rate": 1.4801718929427986e-05, "loss": 0.1649, "step": 1809 }, { "epoch": 2.0328513266881933, "grad_norm": 0.49051047861023445, "learning_rate": 1.4794836742006667e-05, "loss": 0.1714, "step": 1810 }, { "epoch": 2.0339744489681313, "grad_norm": 0.45752788677919076, "learning_rate": 1.4787951604269533e-05, "loss": 0.17, "step": 1811 }, { "epoch": 2.0350975712480697, "grad_norm": 0.46474367178946024, "learning_rate": 1.478106352045309e-05, "loss": 0.1784, "step": 1812 }, { "epoch": 2.036220693528008, "grad_norm": 0.44888182231043505, "learning_rate": 1.4774172494795651e-05, "loss": 0.1655, "step": 1813 }, { "epoch": 2.037343815807946, "grad_norm": 0.4874435144426795, "learning_rate": 1.4767278531537335e-05, "loss": 0.1746, "step": 1814 }, { "epoch": 2.0384669380878844, "grad_norm": 0.45309578620813684, "learning_rate": 1.476038163492008e-05, "loss": 0.1598, "step": 1815 }, { "epoch": 2.0395900603678228, "grad_norm": 0.47320504356622956, "learning_rate": 1.4753481809187617e-05, "loss": 0.1834, "step": 1816 }, { "epoch": 2.0407131826477607, "grad_norm": 0.4720649193376181, "learning_rate": 1.474657905858549e-05, "loss": 0.1739, "step": 1817 }, { "epoch": 2.041836304927699, "grad_norm": 0.4728143038805621, "learning_rate": 1.4739673387361033e-05, "loss": 0.1764, "step": 1818 }, { "epoch": 2.042959427207637, "grad_norm": 0.44374918226300764, "learning_rate": 1.4732764799763383e-05, "loss": 0.1656, "step": 1819 }, { "epoch": 2.0440825494875754, "grad_norm": 0.4923042826192989, "learning_rate": 1.4725853300043472e-05, "loss": 0.1987, "step": 1820 }, { "epoch": 2.0452056717675138, "grad_norm": 0.46751435491816906, "learning_rate": 1.4718938892454018e-05, "loss": 0.1572, "step": 1821 }, { "epoch": 2.0463287940474517, "grad_norm": 0.47339333449870347, "learning_rate": 1.4712021581249534e-05, "loss": 0.1776, "step": 1822 }, { "epoch": 2.04745191632739, "grad_norm": 0.46020101583349715, "learning_rate": 1.4705101370686316e-05, "loss": 0.1604, "step": 1823 }, { "epoch": 2.0485750386073285, "grad_norm": 0.49263307013323115, "learning_rate": 1.469817826502245e-05, "loss": 0.1668, "step": 1824 }, { "epoch": 2.0496981608872664, "grad_norm": 0.4710764801250717, "learning_rate": 1.4691252268517794e-05, "loss": 0.1775, "step": 1825 }, { "epoch": 2.050821283167205, "grad_norm": 0.49880730266155526, "learning_rate": 1.4684323385433997e-05, "loss": 0.1822, "step": 1826 }, { "epoch": 2.051944405447143, "grad_norm": 0.46625550510782054, "learning_rate": 1.4677391620034467e-05, "loss": 0.174, "step": 1827 }, { "epoch": 2.053067527727081, "grad_norm": 0.4872551100589393, "learning_rate": 1.4670456976584401e-05, "loss": 0.1656, "step": 1828 }, { "epoch": 2.0541906500070195, "grad_norm": 0.43451500585845937, "learning_rate": 1.4663519459350763e-05, "loss": 0.1589, "step": 1829 }, { "epoch": 2.055313772286958, "grad_norm": 0.428654270229592, "learning_rate": 1.4656579072602281e-05, "loss": 0.1596, "step": 1830 }, { "epoch": 2.056436894566896, "grad_norm": 0.48053898356788094, "learning_rate": 1.4649635820609457e-05, "loss": 0.1831, "step": 1831 }, { "epoch": 2.057560016846834, "grad_norm": 0.4472515086562773, "learning_rate": 1.464268970764454e-05, "loss": 0.1618, "step": 1832 }, { "epoch": 2.0586831391267726, "grad_norm": 0.49366586224960823, "learning_rate": 1.4635740737981557e-05, "loss": 0.1771, "step": 1833 }, { "epoch": 2.0598062614067105, "grad_norm": 0.455074161806074, "learning_rate": 1.4628788915896282e-05, "loss": 0.1661, "step": 1834 }, { "epoch": 2.060929383686649, "grad_norm": 0.44819907357299266, "learning_rate": 1.4621834245666254e-05, "loss": 0.1563, "step": 1835 }, { "epoch": 2.0620525059665873, "grad_norm": 0.4826290613150395, "learning_rate": 1.4614876731570751e-05, "loss": 0.1823, "step": 1836 }, { "epoch": 2.0631756282465252, "grad_norm": 0.4657177556230326, "learning_rate": 1.4607916377890807e-05, "loss": 0.1703, "step": 1837 }, { "epoch": 2.0642987505264636, "grad_norm": 0.47786499964299944, "learning_rate": 1.4600953188909214e-05, "loss": 0.1727, "step": 1838 }, { "epoch": 2.065421872806402, "grad_norm": 0.4642252898495588, "learning_rate": 1.4593987168910491e-05, "loss": 0.1743, "step": 1839 }, { "epoch": 2.06654499508634, "grad_norm": 0.4695641601793918, "learning_rate": 1.4587018322180906e-05, "loss": 0.1747, "step": 1840 }, { "epoch": 2.0676681173662783, "grad_norm": 0.4662822283785743, "learning_rate": 1.4580046653008474e-05, "loss": 0.1692, "step": 1841 }, { "epoch": 2.0687912396462167, "grad_norm": 0.4576878116997234, "learning_rate": 1.457307216568293e-05, "loss": 0.1643, "step": 1842 }, { "epoch": 2.0699143619261546, "grad_norm": 0.44945289175219166, "learning_rate": 1.4566094864495761e-05, "loss": 0.1669, "step": 1843 }, { "epoch": 2.071037484206093, "grad_norm": 0.4812629802641515, "learning_rate": 1.4559114753740174e-05, "loss": 0.1838, "step": 1844 }, { "epoch": 2.072160606486031, "grad_norm": 0.4255497433471811, "learning_rate": 1.4552131837711108e-05, "loss": 0.1515, "step": 1845 }, { "epoch": 2.0732837287659693, "grad_norm": 0.4489807287129697, "learning_rate": 1.4545146120705229e-05, "loss": 0.1668, "step": 1846 }, { "epoch": 2.0744068510459077, "grad_norm": 0.4451775619304231, "learning_rate": 1.4538157607020923e-05, "loss": 0.1671, "step": 1847 }, { "epoch": 2.0755299733258457, "grad_norm": 0.4736713116971495, "learning_rate": 1.4531166300958303e-05, "loss": 0.1715, "step": 1848 }, { "epoch": 2.076653095605784, "grad_norm": 0.44599651030040116, "learning_rate": 1.4524172206819195e-05, "loss": 0.1695, "step": 1849 }, { "epoch": 2.0777762178857224, "grad_norm": 0.4925862342950297, "learning_rate": 1.4517175328907141e-05, "loss": 0.177, "step": 1850 }, { "epoch": 2.0788993401656604, "grad_norm": 0.4918233012105012, "learning_rate": 1.4510175671527397e-05, "loss": 0.1756, "step": 1851 }, { "epoch": 2.0800224624455987, "grad_norm": 0.46620816928480197, "learning_rate": 1.4503173238986932e-05, "loss": 0.1762, "step": 1852 }, { "epoch": 2.081145584725537, "grad_norm": 0.45469289458689577, "learning_rate": 1.4496168035594418e-05, "loss": 0.1696, "step": 1853 }, { "epoch": 2.082268707005475, "grad_norm": 0.5073180847345866, "learning_rate": 1.4489160065660231e-05, "loss": 0.1748, "step": 1854 }, { "epoch": 2.0833918292854134, "grad_norm": 0.49912895873161844, "learning_rate": 1.4482149333496455e-05, "loss": 0.1711, "step": 1855 }, { "epoch": 2.084514951565352, "grad_norm": 0.4713637049357059, "learning_rate": 1.4475135843416873e-05, "loss": 0.1696, "step": 1856 }, { "epoch": 2.0856380738452898, "grad_norm": 0.4886436819964026, "learning_rate": 1.4468119599736957e-05, "loss": 0.1719, "step": 1857 }, { "epoch": 2.086761196125228, "grad_norm": 0.47437394171967345, "learning_rate": 1.4461100606773884e-05, "loss": 0.1583, "step": 1858 }, { "epoch": 2.0878843184051665, "grad_norm": 0.4738862023625834, "learning_rate": 1.4454078868846513e-05, "loss": 0.1758, "step": 1859 }, { "epoch": 2.0890074406851045, "grad_norm": 0.47210633789898465, "learning_rate": 1.4447054390275401e-05, "loss": 0.1707, "step": 1860 }, { "epoch": 2.090130562965043, "grad_norm": 0.47219676032402647, "learning_rate": 1.4440027175382784e-05, "loss": 0.1538, "step": 1861 }, { "epoch": 2.0912536852449812, "grad_norm": 0.45850818588081294, "learning_rate": 1.4432997228492586e-05, "loss": 0.1704, "step": 1862 }, { "epoch": 2.092376807524919, "grad_norm": 0.445147848398028, "learning_rate": 1.4425964553930412e-05, "loss": 0.1633, "step": 1863 }, { "epoch": 2.0934999298048576, "grad_norm": 0.462368673003725, "learning_rate": 1.4418929156023543e-05, "loss": 0.164, "step": 1864 }, { "epoch": 2.094623052084796, "grad_norm": 0.477089375485279, "learning_rate": 1.4411891039100934e-05, "loss": 0.1747, "step": 1865 }, { "epoch": 2.095746174364734, "grad_norm": 0.4633909335409037, "learning_rate": 1.4404850207493217e-05, "loss": 0.1706, "step": 1866 }, { "epoch": 2.0968692966446723, "grad_norm": 0.4913608818406286, "learning_rate": 1.4397806665532693e-05, "loss": 0.1802, "step": 1867 }, { "epoch": 2.09799241892461, "grad_norm": 0.4843264536525825, "learning_rate": 1.4390760417553338e-05, "loss": 0.1752, "step": 1868 }, { "epoch": 2.0991155412045486, "grad_norm": 0.4786559227982498, "learning_rate": 1.4383711467890776e-05, "loss": 0.1733, "step": 1869 }, { "epoch": 2.100238663484487, "grad_norm": 0.47724859281601445, "learning_rate": 1.4376659820882308e-05, "loss": 0.1706, "step": 1870 }, { "epoch": 2.101361785764425, "grad_norm": 0.4781288580611537, "learning_rate": 1.4369605480866888e-05, "loss": 0.1682, "step": 1871 }, { "epoch": 2.1024849080443633, "grad_norm": 0.47404027812905025, "learning_rate": 1.436254845218513e-05, "loss": 0.1605, "step": 1872 }, { "epoch": 2.1036080303243017, "grad_norm": 0.4693877600646867, "learning_rate": 1.4355488739179304e-05, "loss": 0.1713, "step": 1873 }, { "epoch": 2.1047311526042396, "grad_norm": 0.4465220690930916, "learning_rate": 1.4348426346193325e-05, "loss": 0.1616, "step": 1874 }, { "epoch": 2.105854274884178, "grad_norm": 0.4533942365528121, "learning_rate": 1.4341361277572766e-05, "loss": 0.1597, "step": 1875 }, { "epoch": 2.1069773971641164, "grad_norm": 0.4752746801317046, "learning_rate": 1.4334293537664836e-05, "loss": 0.1898, "step": 1876 }, { "epoch": 2.1081005194440543, "grad_norm": 0.44524813953865283, "learning_rate": 1.4327223130818393e-05, "loss": 0.1629, "step": 1877 }, { "epoch": 2.1092236417239927, "grad_norm": 0.4740454300839204, "learning_rate": 1.4320150061383941e-05, "loss": 0.1712, "step": 1878 }, { "epoch": 2.110346764003931, "grad_norm": 0.46263291757398317, "learning_rate": 1.4313074333713614e-05, "loss": 0.172, "step": 1879 }, { "epoch": 2.111469886283869, "grad_norm": 0.4605970497488254, "learning_rate": 1.4305995952161189e-05, "loss": 0.1718, "step": 1880 }, { "epoch": 2.1125930085638074, "grad_norm": 0.47336811725490663, "learning_rate": 1.4298914921082068e-05, "loss": 0.1689, "step": 1881 }, { "epoch": 2.1137161308437458, "grad_norm": 0.4671378312674368, "learning_rate": 1.429183124483329e-05, "loss": 0.1637, "step": 1882 }, { "epoch": 2.1148392531236837, "grad_norm": 0.5100884705433952, "learning_rate": 1.4284744927773515e-05, "loss": 0.1888, "step": 1883 }, { "epoch": 2.115962375403622, "grad_norm": 0.48122328157905797, "learning_rate": 1.4277655974263035e-05, "loss": 0.1717, "step": 1884 }, { "epoch": 2.1170854976835605, "grad_norm": 0.46648639584107954, "learning_rate": 1.4270564388663761e-05, "loss": 0.1791, "step": 1885 }, { "epoch": 2.1182086199634984, "grad_norm": 0.4897115936255588, "learning_rate": 1.4263470175339223e-05, "loss": 0.1784, "step": 1886 }, { "epoch": 2.119331742243437, "grad_norm": 0.4695573343126349, "learning_rate": 1.425637333865457e-05, "loss": 0.179, "step": 1887 }, { "epoch": 2.120454864523375, "grad_norm": 0.4472253157982482, "learning_rate": 1.424927388297656e-05, "loss": 0.1593, "step": 1888 }, { "epoch": 2.121577986803313, "grad_norm": 0.4699369871519553, "learning_rate": 1.4242171812673569e-05, "loss": 0.1685, "step": 1889 }, { "epoch": 2.1227011090832515, "grad_norm": 0.4762283130843815, "learning_rate": 1.4235067132115581e-05, "loss": 0.1714, "step": 1890 }, { "epoch": 2.12382423136319, "grad_norm": 0.4835690093328314, "learning_rate": 1.4227959845674182e-05, "loss": 0.1901, "step": 1891 }, { "epoch": 2.124947353643128, "grad_norm": 0.4787055121638117, "learning_rate": 1.4220849957722562e-05, "loss": 0.1807, "step": 1892 }, { "epoch": 2.126070475923066, "grad_norm": 0.4737650279943973, "learning_rate": 1.4213737472635513e-05, "loss": 0.1776, "step": 1893 }, { "epoch": 2.1271935982030046, "grad_norm": 0.4686012980137878, "learning_rate": 1.4206622394789432e-05, "loss": 0.1816, "step": 1894 }, { "epoch": 2.1283167204829425, "grad_norm": 0.43340996309070035, "learning_rate": 1.4199504728562294e-05, "loss": 0.1585, "step": 1895 }, { "epoch": 2.129439842762881, "grad_norm": 0.45042187104895964, "learning_rate": 1.4192384478333686e-05, "loss": 0.1626, "step": 1896 }, { "epoch": 2.130562965042819, "grad_norm": 0.46788318772366466, "learning_rate": 1.4185261648484772e-05, "loss": 0.1641, "step": 1897 }, { "epoch": 2.131686087322757, "grad_norm": 0.45501111518763515, "learning_rate": 1.4178136243398308e-05, "loss": 0.1704, "step": 1898 }, { "epoch": 2.1328092096026956, "grad_norm": 0.458503838473636, "learning_rate": 1.4171008267458636e-05, "loss": 0.1649, "step": 1899 }, { "epoch": 2.1339323318826335, "grad_norm": 0.4467967887329821, "learning_rate": 1.4163877725051677e-05, "loss": 0.1692, "step": 1900 }, { "epoch": 2.135055454162572, "grad_norm": 0.46421473127298934, "learning_rate": 1.4156744620564933e-05, "loss": 0.1764, "step": 1901 }, { "epoch": 2.1361785764425103, "grad_norm": 0.4604460398633165, "learning_rate": 1.4149608958387484e-05, "loss": 0.1638, "step": 1902 }, { "epoch": 2.1373016987224482, "grad_norm": 0.4634460358657593, "learning_rate": 1.4142470742909976e-05, "loss": 0.1757, "step": 1903 }, { "epoch": 2.1384248210023866, "grad_norm": 0.470255942011479, "learning_rate": 1.4135329978524634e-05, "loss": 0.1709, "step": 1904 }, { "epoch": 2.139547943282325, "grad_norm": 0.4547022434396764, "learning_rate": 1.4128186669625247e-05, "loss": 0.1753, "step": 1905 }, { "epoch": 2.140671065562263, "grad_norm": 0.4739510061930601, "learning_rate": 1.4121040820607175e-05, "loss": 0.1746, "step": 1906 }, { "epoch": 2.1417941878422013, "grad_norm": 0.4708878833969285, "learning_rate": 1.4113892435867337e-05, "loss": 0.1739, "step": 1907 }, { "epoch": 2.1429173101221397, "grad_norm": 0.44335049166860324, "learning_rate": 1.410674151980421e-05, "loss": 0.1633, "step": 1908 }, { "epoch": 2.1440404324020776, "grad_norm": 0.45141039205299494, "learning_rate": 1.4099588076817837e-05, "loss": 0.1638, "step": 1909 }, { "epoch": 2.145163554682016, "grad_norm": 0.4443166702543141, "learning_rate": 1.4092432111309804e-05, "loss": 0.1636, "step": 1910 }, { "epoch": 2.1462866769619544, "grad_norm": 0.44643125071661977, "learning_rate": 1.4085273627683257e-05, "loss": 0.1651, "step": 1911 }, { "epoch": 2.1474097992418923, "grad_norm": 0.48330445218374524, "learning_rate": 1.4078112630342891e-05, "loss": 0.1741, "step": 1912 }, { "epoch": 2.1485329215218307, "grad_norm": 0.48560309798250084, "learning_rate": 1.4070949123694945e-05, "loss": 0.1882, "step": 1913 }, { "epoch": 2.149656043801769, "grad_norm": 0.46382488718871623, "learning_rate": 1.4063783112147207e-05, "loss": 0.1794, "step": 1914 }, { "epoch": 2.150779166081707, "grad_norm": 0.46801002995391355, "learning_rate": 1.4056614600108998e-05, "loss": 0.1691, "step": 1915 }, { "epoch": 2.1519022883616454, "grad_norm": 0.4753045332485001, "learning_rate": 1.4049443591991185e-05, "loss": 0.1746, "step": 1916 }, { "epoch": 2.1530254106415834, "grad_norm": 0.4526989009991009, "learning_rate": 1.404227009220617e-05, "loss": 0.173, "step": 1917 }, { "epoch": 2.1541485329215218, "grad_norm": 0.46338849838297363, "learning_rate": 1.403509410516788e-05, "loss": 0.1613, "step": 1918 }, { "epoch": 2.15527165520146, "grad_norm": 0.45937563487016014, "learning_rate": 1.4027915635291786e-05, "loss": 0.1774, "step": 1919 }, { "epoch": 2.156394777481398, "grad_norm": 0.4603844415907502, "learning_rate": 1.4020734686994875e-05, "loss": 0.1709, "step": 1920 }, { "epoch": 2.1575178997613365, "grad_norm": 0.47586340827626805, "learning_rate": 1.4013551264695663e-05, "loss": 0.1739, "step": 1921 }, { "epoch": 2.158641022041275, "grad_norm": 0.4698577828683483, "learning_rate": 1.4006365372814192e-05, "loss": 0.1729, "step": 1922 }, { "epoch": 2.1597641443212128, "grad_norm": 0.44550557016391734, "learning_rate": 1.3999177015772021e-05, "loss": 0.1602, "step": 1923 }, { "epoch": 2.160887266601151, "grad_norm": 0.47621681660889276, "learning_rate": 1.3991986197992223e-05, "loss": 0.1775, "step": 1924 }, { "epoch": 2.1620103888810895, "grad_norm": 0.45703981645014513, "learning_rate": 1.3984792923899387e-05, "loss": 0.1654, "step": 1925 }, { "epoch": 2.1631335111610275, "grad_norm": 0.5176298847902593, "learning_rate": 1.3977597197919614e-05, "loss": 0.1895, "step": 1926 }, { "epoch": 2.164256633440966, "grad_norm": 0.45890734681710893, "learning_rate": 1.3970399024480512e-05, "loss": 0.1733, "step": 1927 }, { "epoch": 2.1653797557209042, "grad_norm": 0.4531704744476037, "learning_rate": 1.39631984080112e-05, "loss": 0.1692, "step": 1928 }, { "epoch": 2.166502878000842, "grad_norm": 0.4631173822015671, "learning_rate": 1.3955995352942296e-05, "loss": 0.1727, "step": 1929 }, { "epoch": 2.1676260002807806, "grad_norm": 0.4510321349529347, "learning_rate": 1.3948789863705914e-05, "loss": 0.1671, "step": 1930 }, { "epoch": 2.168749122560719, "grad_norm": 0.449699047258487, "learning_rate": 1.3941581944735675e-05, "loss": 0.173, "step": 1931 }, { "epoch": 2.169872244840657, "grad_norm": 0.4602286914189392, "learning_rate": 1.3934371600466692e-05, "loss": 0.1777, "step": 1932 }, { "epoch": 2.1709953671205953, "grad_norm": 0.45736864355992796, "learning_rate": 1.3927158835335567e-05, "loss": 0.1796, "step": 1933 }, { "epoch": 2.1721184894005336, "grad_norm": 0.4811354606513004, "learning_rate": 1.3919943653780395e-05, "loss": 0.1726, "step": 1934 }, { "epoch": 2.1732416116804716, "grad_norm": 0.4670363828225833, "learning_rate": 1.3912726060240754e-05, "loss": 0.179, "step": 1935 }, { "epoch": 2.17436473396041, "grad_norm": 0.48423978492534575, "learning_rate": 1.3905506059157712e-05, "loss": 0.1633, "step": 1936 }, { "epoch": 2.1754878562403483, "grad_norm": 0.48037570474597213, "learning_rate": 1.3898283654973812e-05, "loss": 0.1854, "step": 1937 }, { "epoch": 2.1766109785202863, "grad_norm": 0.458965202094496, "learning_rate": 1.3891058852133083e-05, "loss": 0.162, "step": 1938 }, { "epoch": 2.1777341008002247, "grad_norm": 0.48209417167054247, "learning_rate": 1.388383165508102e-05, "loss": 0.1829, "step": 1939 }, { "epoch": 2.178857223080163, "grad_norm": 0.4561793614583002, "learning_rate": 1.38766020682646e-05, "loss": 0.1699, "step": 1940 }, { "epoch": 2.179980345360101, "grad_norm": 0.47026614355709756, "learning_rate": 1.3869370096132269e-05, "loss": 0.1817, "step": 1941 }, { "epoch": 2.1811034676400394, "grad_norm": 0.4724639589057093, "learning_rate": 1.3862135743133937e-05, "loss": 0.1668, "step": 1942 }, { "epoch": 2.1822265899199778, "grad_norm": 0.4717566607972279, "learning_rate": 1.3854899013720982e-05, "loss": 0.1874, "step": 1943 }, { "epoch": 2.1833497121999157, "grad_norm": 0.47264122089233784, "learning_rate": 1.384765991234624e-05, "loss": 0.1785, "step": 1944 }, { "epoch": 2.184472834479854, "grad_norm": 0.4458357059113397, "learning_rate": 1.3840418443464015e-05, "loss": 0.1662, "step": 1945 }, { "epoch": 2.1855959567597925, "grad_norm": 0.4670183468434894, "learning_rate": 1.383317461153006e-05, "loss": 0.1689, "step": 1946 }, { "epoch": 2.1867190790397304, "grad_norm": 0.4802944051246216, "learning_rate": 1.3825928421001583e-05, "loss": 0.1751, "step": 1947 }, { "epoch": 2.1878422013196688, "grad_norm": 0.48140552348423143, "learning_rate": 1.381867987633725e-05, "loss": 0.1789, "step": 1948 }, { "epoch": 2.1889653235996067, "grad_norm": 0.47327163615370027, "learning_rate": 1.3811428981997159e-05, "loss": 0.1742, "step": 1949 }, { "epoch": 2.190088445879545, "grad_norm": 0.4580890578922541, "learning_rate": 1.3804175742442878e-05, "loss": 0.1655, "step": 1950 }, { "epoch": 2.1912115681594835, "grad_norm": 0.47350507091879795, "learning_rate": 1.3796920162137396e-05, "loss": 0.1705, "step": 1951 }, { "epoch": 2.1923346904394214, "grad_norm": 0.46014818205305075, "learning_rate": 1.3789662245545158e-05, "loss": 0.1685, "step": 1952 }, { "epoch": 2.19345781271936, "grad_norm": 0.47613768985331506, "learning_rate": 1.3782401997132037e-05, "loss": 0.1769, "step": 1953 }, { "epoch": 2.194580934999298, "grad_norm": 0.46754997862422304, "learning_rate": 1.3775139421365342e-05, "loss": 0.1766, "step": 1954 }, { "epoch": 2.195704057279236, "grad_norm": 0.4517613096656541, "learning_rate": 1.376787452271382e-05, "loss": 0.1689, "step": 1955 }, { "epoch": 2.1968271795591745, "grad_norm": 0.4808075598405213, "learning_rate": 1.3760607305647637e-05, "loss": 0.1859, "step": 1956 }, { "epoch": 2.197950301839113, "grad_norm": 0.4640306895626649, "learning_rate": 1.3753337774638397e-05, "loss": 0.1692, "step": 1957 }, { "epoch": 2.199073424119051, "grad_norm": 0.48015518700390997, "learning_rate": 1.3746065934159123e-05, "loss": 0.1809, "step": 1958 }, { "epoch": 2.200196546398989, "grad_norm": 0.45119869946429864, "learning_rate": 1.3738791788684254e-05, "loss": 0.1646, "step": 1959 }, { "epoch": 2.2013196686789276, "grad_norm": 0.46414622545090645, "learning_rate": 1.3731515342689654e-05, "loss": 0.1773, "step": 1960 }, { "epoch": 2.2024427909588655, "grad_norm": 0.4431677408444893, "learning_rate": 1.3724236600652598e-05, "loss": 0.1594, "step": 1961 }, { "epoch": 2.203565913238804, "grad_norm": 0.4780229496799192, "learning_rate": 1.371695556705178e-05, "loss": 0.1644, "step": 1962 }, { "epoch": 2.2046890355187423, "grad_norm": 0.49218595733042086, "learning_rate": 1.3709672246367299e-05, "loss": 0.1762, "step": 1963 }, { "epoch": 2.2058121577986802, "grad_norm": 0.47496532240919137, "learning_rate": 1.370238664308066e-05, "loss": 0.168, "step": 1964 }, { "epoch": 2.2069352800786186, "grad_norm": 0.48178589082299556, "learning_rate": 1.3695098761674779e-05, "loss": 0.1811, "step": 1965 }, { "epoch": 2.208058402358557, "grad_norm": 0.46520249661248453, "learning_rate": 1.3687808606633965e-05, "loss": 0.1762, "step": 1966 }, { "epoch": 2.209181524638495, "grad_norm": 0.470425727912398, "learning_rate": 1.3680516182443935e-05, "loss": 0.1723, "step": 1967 }, { "epoch": 2.2103046469184333, "grad_norm": 0.4840533890242917, "learning_rate": 1.3673221493591795e-05, "loss": 0.1861, "step": 1968 }, { "epoch": 2.2114277691983713, "grad_norm": 0.4675135965762372, "learning_rate": 1.3665924544566047e-05, "loss": 0.1717, "step": 1969 }, { "epoch": 2.2125508914783096, "grad_norm": 0.4556393300346666, "learning_rate": 1.3658625339856586e-05, "loss": 0.1725, "step": 1970 }, { "epoch": 2.213674013758248, "grad_norm": 0.4703624670439313, "learning_rate": 1.365132388395469e-05, "loss": 0.1659, "step": 1971 }, { "epoch": 2.214797136038186, "grad_norm": 0.4657783144698897, "learning_rate": 1.364402018135303e-05, "loss": 0.1808, "step": 1972 }, { "epoch": 2.2159202583181243, "grad_norm": 0.462982700227875, "learning_rate": 1.3636714236545649e-05, "loss": 0.1728, "step": 1973 }, { "epoch": 2.2170433805980627, "grad_norm": 0.46495072208063093, "learning_rate": 1.362940605402798e-05, "loss": 0.1684, "step": 1974 }, { "epoch": 2.2181665028780007, "grad_norm": 0.4676792084934364, "learning_rate": 1.3622095638296827e-05, "loss": 0.1675, "step": 1975 }, { "epoch": 2.219289625157939, "grad_norm": 0.498799056127904, "learning_rate": 1.3614782993850367e-05, "loss": 0.1753, "step": 1976 }, { "epoch": 2.2204127474378774, "grad_norm": 0.47991466108630665, "learning_rate": 1.3607468125188153e-05, "loss": 0.1722, "step": 1977 }, { "epoch": 2.2215358697178154, "grad_norm": 0.48676775722149834, "learning_rate": 1.3600151036811101e-05, "loss": 0.1686, "step": 1978 }, { "epoch": 2.2226589919977537, "grad_norm": 0.46122036954736967, "learning_rate": 1.3592831733221499e-05, "loss": 0.1666, "step": 1979 }, { "epoch": 2.223782114277692, "grad_norm": 0.4622532176427233, "learning_rate": 1.3585510218922997e-05, "loss": 0.1744, "step": 1980 }, { "epoch": 2.22490523655763, "grad_norm": 0.4404281659461841, "learning_rate": 1.3578186498420598e-05, "loss": 0.162, "step": 1981 }, { "epoch": 2.2260283588375684, "grad_norm": 0.4647477463673942, "learning_rate": 1.357086057622067e-05, "loss": 0.1687, "step": 1982 }, { "epoch": 2.227151481117507, "grad_norm": 0.45022268319323117, "learning_rate": 1.3563532456830934e-05, "loss": 0.1633, "step": 1983 }, { "epoch": 2.2282746033974448, "grad_norm": 0.4900592975701646, "learning_rate": 1.3556202144760461e-05, "loss": 0.1939, "step": 1984 }, { "epoch": 2.229397725677383, "grad_norm": 0.4450602192202563, "learning_rate": 1.3548869644519677e-05, "loss": 0.1622, "step": 1985 }, { "epoch": 2.2305208479573215, "grad_norm": 0.45292148908157803, "learning_rate": 1.3541534960620349e-05, "loss": 0.1772, "step": 1986 }, { "epoch": 2.2316439702372595, "grad_norm": 0.4495070935381503, "learning_rate": 1.3534198097575581e-05, "loss": 0.1793, "step": 1987 }, { "epoch": 2.232767092517198, "grad_norm": 0.46669693575362725, "learning_rate": 1.3526859059899834e-05, "loss": 0.1724, "step": 1988 }, { "epoch": 2.2338902147971362, "grad_norm": 0.4485177100335402, "learning_rate": 1.3519517852108899e-05, "loss": 0.1679, "step": 1989 }, { "epoch": 2.235013337077074, "grad_norm": 0.4765654300275507, "learning_rate": 1.3512174478719896e-05, "loss": 0.1741, "step": 1990 }, { "epoch": 2.2361364593570126, "grad_norm": 0.46917420600760523, "learning_rate": 1.3504828944251287e-05, "loss": 0.1852, "step": 1991 }, { "epoch": 2.237259581636951, "grad_norm": 0.4599691780330107, "learning_rate": 1.349748125322286e-05, "loss": 0.1678, "step": 1992 }, { "epoch": 2.238382703916889, "grad_norm": 0.47810945057013843, "learning_rate": 1.349013141015573e-05, "loss": 0.1642, "step": 1993 }, { "epoch": 2.2395058261968273, "grad_norm": 0.462210138404077, "learning_rate": 1.3482779419572336e-05, "loss": 0.1722, "step": 1994 }, { "epoch": 2.2406289484767656, "grad_norm": 0.4877043211428916, "learning_rate": 1.3475425285996438e-05, "loss": 0.1763, "step": 1995 }, { "epoch": 2.2417520707567036, "grad_norm": 0.4713028550897873, "learning_rate": 1.3468069013953115e-05, "loss": 0.1672, "step": 1996 }, { "epoch": 2.242875193036642, "grad_norm": 0.4543701026881802, "learning_rate": 1.3460710607968767e-05, "loss": 0.1761, "step": 1997 }, { "epoch": 2.24399831531658, "grad_norm": 0.46559021955454394, "learning_rate": 1.3453350072571097e-05, "loss": 0.1729, "step": 1998 }, { "epoch": 2.2451214375965183, "grad_norm": 0.45021396422109927, "learning_rate": 1.3445987412289126e-05, "loss": 0.1757, "step": 1999 }, { "epoch": 2.2462445598764567, "grad_norm": 0.4335318154822215, "learning_rate": 1.3438622631653178e-05, "loss": 0.1572, "step": 2000 }, { "epoch": 2.2473676821563946, "grad_norm": 0.49286805287821256, "learning_rate": 1.343125573519488e-05, "loss": 0.1827, "step": 2001 }, { "epoch": 2.248490804436333, "grad_norm": 0.48479831173676324, "learning_rate": 1.3423886727447176e-05, "loss": 0.1793, "step": 2002 }, { "epoch": 2.2496139267162714, "grad_norm": 0.46128111314066944, "learning_rate": 1.3416515612944288e-05, "loss": 0.1713, "step": 2003 }, { "epoch": 2.2507370489962093, "grad_norm": 0.47588445133183477, "learning_rate": 1.3409142396221747e-05, "loss": 0.1838, "step": 2004 }, { "epoch": 2.2518601712761477, "grad_norm": 0.4630902693099745, "learning_rate": 1.340176708181637e-05, "loss": 0.1734, "step": 2005 }, { "epoch": 2.252983293556086, "grad_norm": 0.4733521097328874, "learning_rate": 1.3394389674266275e-05, "loss": 0.1702, "step": 2006 }, { "epoch": 2.254106415836024, "grad_norm": 0.4638411258433599, "learning_rate": 1.3387010178110859e-05, "loss": 0.1825, "step": 2007 }, { "epoch": 2.2552295381159624, "grad_norm": 0.4636059997009433, "learning_rate": 1.3379628597890808e-05, "loss": 0.1723, "step": 2008 }, { "epoch": 2.2563526603959008, "grad_norm": 0.4634064303623194, "learning_rate": 1.337224493814809e-05, "loss": 0.1668, "step": 2009 }, { "epoch": 2.2574757826758387, "grad_norm": 0.4582783319228768, "learning_rate": 1.3364859203425953e-05, "loss": 0.1731, "step": 2010 }, { "epoch": 2.258598904955777, "grad_norm": 0.5057417740276605, "learning_rate": 1.335747139826892e-05, "loss": 0.1802, "step": 2011 }, { "epoch": 2.2597220272357155, "grad_norm": 0.4966593306531945, "learning_rate": 1.3350081527222787e-05, "loss": 0.1856, "step": 2012 }, { "epoch": 2.2608451495156534, "grad_norm": 0.47721378699943995, "learning_rate": 1.3342689594834623e-05, "loss": 0.1778, "step": 2013 }, { "epoch": 2.261968271795592, "grad_norm": 0.499924567330496, "learning_rate": 1.333529560565277e-05, "loss": 0.1782, "step": 2014 }, { "epoch": 2.2630913940755297, "grad_norm": 0.4761749575886551, "learning_rate": 1.3327899564226826e-05, "loss": 0.1776, "step": 2015 }, { "epoch": 2.264214516355468, "grad_norm": 0.47595190829952716, "learning_rate": 1.332050147510766e-05, "loss": 0.1799, "step": 2016 }, { "epoch": 2.2653376386354065, "grad_norm": 0.4819227222937039, "learning_rate": 1.3313101342847393e-05, "loss": 0.1777, "step": 2017 }, { "epoch": 2.2664607609153444, "grad_norm": 0.47028267836611976, "learning_rate": 1.3305699171999409e-05, "loss": 0.1783, "step": 2018 }, { "epoch": 2.267583883195283, "grad_norm": 0.4694525543038002, "learning_rate": 1.3298294967118351e-05, "loss": 0.1744, "step": 2019 }, { "epoch": 2.268707005475221, "grad_norm": 0.5046329050414854, "learning_rate": 1.32908887327601e-05, "loss": 0.1857, "step": 2020 }, { "epoch": 2.269830127755159, "grad_norm": 0.46102708624099675, "learning_rate": 1.3283480473481803e-05, "loss": 0.1789, "step": 2021 }, { "epoch": 2.2709532500350975, "grad_norm": 0.47172572667433565, "learning_rate": 1.3276070193841833e-05, "loss": 0.1731, "step": 2022 }, { "epoch": 2.272076372315036, "grad_norm": 0.49124489515157277, "learning_rate": 1.3268657898399822e-05, "loss": 0.1819, "step": 2023 }, { "epoch": 2.273199494594974, "grad_norm": 0.4720309863370387, "learning_rate": 1.3261243591716634e-05, "loss": 0.1686, "step": 2024 }, { "epoch": 2.274322616874912, "grad_norm": 0.47523894040627884, "learning_rate": 1.3253827278354378e-05, "loss": 0.1744, "step": 2025 }, { "epoch": 2.2754457391548506, "grad_norm": 0.4923383132196056, "learning_rate": 1.3246408962876391e-05, "loss": 0.1847, "step": 2026 }, { "epoch": 2.2765688614347885, "grad_norm": 0.4588643937702198, "learning_rate": 1.3238988649847243e-05, "loss": 0.1699, "step": 2027 }, { "epoch": 2.277691983714727, "grad_norm": 0.4754365804448629, "learning_rate": 1.3231566343832736e-05, "loss": 0.1763, "step": 2028 }, { "epoch": 2.2788151059946653, "grad_norm": 0.4708694394595169, "learning_rate": 1.3224142049399896e-05, "loss": 0.1806, "step": 2029 }, { "epoch": 2.2799382282746032, "grad_norm": 0.4532504916608009, "learning_rate": 1.321671577111697e-05, "loss": 0.1726, "step": 2030 }, { "epoch": 2.2810613505545416, "grad_norm": 0.45695642633903705, "learning_rate": 1.3209287513553437e-05, "loss": 0.1709, "step": 2031 }, { "epoch": 2.28218447283448, "grad_norm": 0.4436999178216967, "learning_rate": 1.3201857281279978e-05, "loss": 0.1662, "step": 2032 }, { "epoch": 2.283307595114418, "grad_norm": 0.4361709759275738, "learning_rate": 1.3194425078868498e-05, "loss": 0.171, "step": 2033 }, { "epoch": 2.2844307173943563, "grad_norm": 0.4685768299567099, "learning_rate": 1.3186990910892115e-05, "loss": 0.1845, "step": 2034 }, { "epoch": 2.2855538396742947, "grad_norm": 0.4716954023929674, "learning_rate": 1.317955478192515e-05, "loss": 0.171, "step": 2035 }, { "epoch": 2.2866769619542326, "grad_norm": 0.4508227597245992, "learning_rate": 1.3172116696543142e-05, "loss": 0.1686, "step": 2036 }, { "epoch": 2.287800084234171, "grad_norm": 0.4768566069887853, "learning_rate": 1.3164676659322823e-05, "loss": 0.1771, "step": 2037 }, { "epoch": 2.2889232065141094, "grad_norm": 0.4935990854907148, "learning_rate": 1.315723467484213e-05, "loss": 0.1806, "step": 2038 }, { "epoch": 2.2900463287940473, "grad_norm": 0.46467316460978925, "learning_rate": 1.3149790747680196e-05, "loss": 0.1721, "step": 2039 }, { "epoch": 2.2911694510739857, "grad_norm": 0.4496552803361268, "learning_rate": 1.3142344882417355e-05, "loss": 0.1681, "step": 2040 }, { "epoch": 2.292292573353924, "grad_norm": 0.46285417434016424, "learning_rate": 1.3134897083635126e-05, "loss": 0.177, "step": 2041 }, { "epoch": 2.293415695633862, "grad_norm": 0.45431363080930554, "learning_rate": 1.3127447355916223e-05, "loss": 0.182, "step": 2042 }, { "epoch": 2.2945388179138004, "grad_norm": 0.4507422204052235, "learning_rate": 1.3119995703844551e-05, "loss": 0.174, "step": 2043 }, { "epoch": 2.295661940193739, "grad_norm": 0.45097987664427835, "learning_rate": 1.3112542132005182e-05, "loss": 0.1726, "step": 2044 }, { "epoch": 2.2967850624736768, "grad_norm": 0.4644989156686283, "learning_rate": 1.310508664498439e-05, "loss": 0.1855, "step": 2045 }, { "epoch": 2.297908184753615, "grad_norm": 0.4417912412022252, "learning_rate": 1.3097629247369613e-05, "loss": 0.1663, "step": 2046 }, { "epoch": 2.2990313070335535, "grad_norm": 0.4635140434651788, "learning_rate": 1.3090169943749475e-05, "loss": 0.1789, "step": 2047 }, { "epoch": 2.3001544293134915, "grad_norm": 0.4734702085500251, "learning_rate": 1.3082708738713765e-05, "loss": 0.1699, "step": 2048 }, { "epoch": 2.30127755159343, "grad_norm": 0.47850190337364856, "learning_rate": 1.3075245636853444e-05, "loss": 0.1715, "step": 2049 }, { "epoch": 2.3024006738733678, "grad_norm": 0.466268874412243, "learning_rate": 1.306778064276064e-05, "loss": 0.1735, "step": 2050 }, { "epoch": 2.303523796153306, "grad_norm": 0.4479379317186277, "learning_rate": 1.3060313761028647e-05, "loss": 0.1727, "step": 2051 }, { "epoch": 2.3046469184332445, "grad_norm": 0.44726243257194265, "learning_rate": 1.305284499625192e-05, "loss": 0.1632, "step": 2052 }, { "epoch": 2.3057700407131825, "grad_norm": 0.49557743765194856, "learning_rate": 1.3045374353026073e-05, "loss": 0.1819, "step": 2053 }, { "epoch": 2.306893162993121, "grad_norm": 0.46796170967007167, "learning_rate": 1.3037901835947873e-05, "loss": 0.1758, "step": 2054 }, { "epoch": 2.3080162852730592, "grad_norm": 0.4805637353304701, "learning_rate": 1.3030427449615241e-05, "loss": 0.1809, "step": 2055 }, { "epoch": 2.309139407552997, "grad_norm": 0.4846347936669786, "learning_rate": 1.3022951198627254e-05, "loss": 0.1808, "step": 2056 }, { "epoch": 2.3102625298329356, "grad_norm": 0.4567235403598188, "learning_rate": 1.3015473087584127e-05, "loss": 0.1717, "step": 2057 }, { "epoch": 2.311385652112874, "grad_norm": 0.46361930650452676, "learning_rate": 1.3007993121087226e-05, "loss": 0.1724, "step": 2058 }, { "epoch": 2.312508774392812, "grad_norm": 0.4757333926502388, "learning_rate": 1.3000511303739054e-05, "loss": 0.1764, "step": 2059 }, { "epoch": 2.3136318966727503, "grad_norm": 0.4447398100189297, "learning_rate": 1.299302764014326e-05, "loss": 0.1643, "step": 2060 }, { "epoch": 2.3147550189526886, "grad_norm": 0.45522814478101575, "learning_rate": 1.2985542134904621e-05, "loss": 0.1725, "step": 2061 }, { "epoch": 2.3158781412326266, "grad_norm": 0.47650848146899405, "learning_rate": 1.2978054792629054e-05, "loss": 0.1765, "step": 2062 }, { "epoch": 2.317001263512565, "grad_norm": 0.4501821499764709, "learning_rate": 1.2970565617923598e-05, "loss": 0.1751, "step": 2063 }, { "epoch": 2.3181243857925034, "grad_norm": 0.45950502464250575, "learning_rate": 1.2963074615396428e-05, "loss": 0.1801, "step": 2064 }, { "epoch": 2.3192475080724413, "grad_norm": 0.4613132581972528, "learning_rate": 1.2955581789656844e-05, "loss": 0.1822, "step": 2065 }, { "epoch": 2.3203706303523797, "grad_norm": 0.4619765649138744, "learning_rate": 1.2948087145315256e-05, "loss": 0.1801, "step": 2066 }, { "epoch": 2.3214937526323176, "grad_norm": 0.45361708539848494, "learning_rate": 1.2940590686983208e-05, "loss": 0.1696, "step": 2067 }, { "epoch": 2.322616874912256, "grad_norm": 0.47042240961247933, "learning_rate": 1.2933092419273348e-05, "loss": 0.1822, "step": 2068 }, { "epoch": 2.3237399971921944, "grad_norm": 0.44936652373893554, "learning_rate": 1.2925592346799444e-05, "loss": 0.1715, "step": 2069 }, { "epoch": 2.3248631194721323, "grad_norm": 0.45944253764029824, "learning_rate": 1.2918090474176378e-05, "loss": 0.1728, "step": 2070 }, { "epoch": 2.3259862417520707, "grad_norm": 0.4670499870780179, "learning_rate": 1.2910586806020128e-05, "loss": 0.1778, "step": 2071 }, { "epoch": 2.327109364032009, "grad_norm": 0.47065143463451475, "learning_rate": 1.2903081346947788e-05, "loss": 0.1781, "step": 2072 }, { "epoch": 2.328232486311947, "grad_norm": 0.484564209356825, "learning_rate": 1.2895574101577548e-05, "loss": 0.1867, "step": 2073 }, { "epoch": 2.3293556085918854, "grad_norm": 0.4461711655494198, "learning_rate": 1.28880650745287e-05, "loss": 0.1749, "step": 2074 }, { "epoch": 2.330478730871824, "grad_norm": 0.4523170048395053, "learning_rate": 1.288055427042163e-05, "loss": 0.1712, "step": 2075 }, { "epoch": 2.3316018531517617, "grad_norm": 0.45907136438347945, "learning_rate": 1.2873041693877817e-05, "loss": 0.169, "step": 2076 }, { "epoch": 2.3327249754317, "grad_norm": 0.4596979480607732, "learning_rate": 1.2865527349519836e-05, "loss": 0.1757, "step": 2077 }, { "epoch": 2.3338480977116385, "grad_norm": 0.46814268818136334, "learning_rate": 1.285801124197134e-05, "loss": 0.1747, "step": 2078 }, { "epoch": 2.3349712199915764, "grad_norm": 0.45641847730925356, "learning_rate": 1.2850493375857078e-05, "loss": 0.1662, "step": 2079 }, { "epoch": 2.336094342271515, "grad_norm": 0.4573287262646522, "learning_rate": 1.2842973755802872e-05, "loss": 0.1666, "step": 2080 }, { "epoch": 2.337217464551453, "grad_norm": 0.5012334972800347, "learning_rate": 1.2835452386435629e-05, "loss": 0.1849, "step": 2081 }, { "epoch": 2.338340586831391, "grad_norm": 0.4826799580775581, "learning_rate": 1.282792927238333e-05, "loss": 0.168, "step": 2082 }, { "epoch": 2.3394637091113295, "grad_norm": 0.4765836277272539, "learning_rate": 1.282040441827503e-05, "loss": 0.1885, "step": 2083 }, { "epoch": 2.340586831391268, "grad_norm": 0.45652229568485014, "learning_rate": 1.2812877828740855e-05, "loss": 0.1623, "step": 2084 }, { "epoch": 2.341709953671206, "grad_norm": 0.512094793634767, "learning_rate": 1.2805349508411996e-05, "loss": 0.196, "step": 2085 }, { "epoch": 2.342833075951144, "grad_norm": 0.4764092713379784, "learning_rate": 1.2797819461920714e-05, "loss": 0.1817, "step": 2086 }, { "epoch": 2.3439561982310826, "grad_norm": 0.4704742691821975, "learning_rate": 1.279028769390033e-05, "loss": 0.1687, "step": 2087 }, { "epoch": 2.3450793205110205, "grad_norm": 0.4951306244097187, "learning_rate": 1.2782754208985217e-05, "loss": 0.1728, "step": 2088 }, { "epoch": 2.346202442790959, "grad_norm": 0.47760873675544463, "learning_rate": 1.2775219011810822e-05, "loss": 0.1913, "step": 2089 }, { "epoch": 2.3473255650708973, "grad_norm": 0.4832646096522936, "learning_rate": 1.2767682107013626e-05, "loss": 0.1859, "step": 2090 }, { "epoch": 2.3484486873508352, "grad_norm": 0.4426474698768579, "learning_rate": 1.2760143499231173e-05, "loss": 0.162, "step": 2091 }, { "epoch": 2.3495718096307736, "grad_norm": 0.47174290073665726, "learning_rate": 1.275260319310205e-05, "loss": 0.182, "step": 2092 }, { "epoch": 2.350694931910712, "grad_norm": 0.46639059398090704, "learning_rate": 1.2745061193265896e-05, "loss": 0.1681, "step": 2093 }, { "epoch": 2.35181805419065, "grad_norm": 0.4790429406794705, "learning_rate": 1.2737517504363378e-05, "loss": 0.1779, "step": 2094 }, { "epoch": 2.3529411764705883, "grad_norm": 0.47495388207027717, "learning_rate": 1.2729972131036212e-05, "loss": 0.1805, "step": 2095 }, { "epoch": 2.3540642987505267, "grad_norm": 0.4730593538483904, "learning_rate": 1.2722425077927157e-05, "loss": 0.1753, "step": 2096 }, { "epoch": 2.3551874210304646, "grad_norm": 0.44971031148328955, "learning_rate": 1.271487634967999e-05, "loss": 0.1576, "step": 2097 }, { "epoch": 2.356310543310403, "grad_norm": 0.45993224590682336, "learning_rate": 1.2707325950939529e-05, "loss": 0.1705, "step": 2098 }, { "epoch": 2.357433665590341, "grad_norm": 0.4502585923789524, "learning_rate": 1.2699773886351618e-05, "loss": 0.1716, "step": 2099 }, { "epoch": 2.3585567878702793, "grad_norm": 0.4639139646897428, "learning_rate": 1.2692220160563125e-05, "loss": 0.1761, "step": 2100 }, { "epoch": 2.3596799101502177, "grad_norm": 0.4619505035347143, "learning_rate": 1.2684664778221943e-05, "loss": 0.1817, "step": 2101 }, { "epoch": 2.3608030324301557, "grad_norm": 0.4465823236801084, "learning_rate": 1.2677107743976975e-05, "loss": 0.1622, "step": 2102 }, { "epoch": 2.361926154710094, "grad_norm": 0.4679911842787963, "learning_rate": 1.2669549062478155e-05, "loss": 0.1893, "step": 2103 }, { "epoch": 2.3630492769900324, "grad_norm": 0.4801272473499729, "learning_rate": 1.266198873837642e-05, "loss": 0.1756, "step": 2104 }, { "epoch": 2.3641723992699704, "grad_norm": 0.4649417211528937, "learning_rate": 1.2654426776323719e-05, "loss": 0.1742, "step": 2105 }, { "epoch": 2.3652955215499087, "grad_norm": 0.4612836290072487, "learning_rate": 1.2646863180973012e-05, "loss": 0.1726, "step": 2106 }, { "epoch": 2.366418643829847, "grad_norm": 0.46999430959875044, "learning_rate": 1.2639297956978262e-05, "loss": 0.1722, "step": 2107 }, { "epoch": 2.367541766109785, "grad_norm": 0.482099500823595, "learning_rate": 1.2631731108994436e-05, "loss": 0.1782, "step": 2108 }, { "epoch": 2.3686648883897234, "grad_norm": 0.4723654096317358, "learning_rate": 1.2624162641677498e-05, "loss": 0.1757, "step": 2109 }, { "epoch": 2.369788010669662, "grad_norm": 0.47531394992362225, "learning_rate": 1.2616592559684408e-05, "loss": 0.1862, "step": 2110 }, { "epoch": 2.3709111329495998, "grad_norm": 0.45674297876291126, "learning_rate": 1.2609020867673123e-05, "loss": 0.1663, "step": 2111 }, { "epoch": 2.372034255229538, "grad_norm": 0.458215236610029, "learning_rate": 1.2601447570302585e-05, "loss": 0.181, "step": 2112 }, { "epoch": 2.3731573775094765, "grad_norm": 0.43796983940777867, "learning_rate": 1.259387267223273e-05, "loss": 0.1655, "step": 2113 }, { "epoch": 2.3742804997894145, "grad_norm": 0.4363126086769737, "learning_rate": 1.2586296178124475e-05, "loss": 0.1621, "step": 2114 }, { "epoch": 2.375403622069353, "grad_norm": 0.44964027723907635, "learning_rate": 1.2578718092639724e-05, "loss": 0.1595, "step": 2115 }, { "epoch": 2.376526744349291, "grad_norm": 0.4599641014987599, "learning_rate": 1.2571138420441349e-05, "loss": 0.1626, "step": 2116 }, { "epoch": 2.377649866629229, "grad_norm": 0.4675544983326374, "learning_rate": 1.2563557166193213e-05, "loss": 0.1774, "step": 2117 }, { "epoch": 2.3787729889091676, "grad_norm": 0.4717489195434664, "learning_rate": 1.2555974334560142e-05, "loss": 0.1802, "step": 2118 }, { "epoch": 2.3798961111891055, "grad_norm": 0.4739492033223087, "learning_rate": 1.2548389930207932e-05, "loss": 0.176, "step": 2119 }, { "epoch": 2.381019233469044, "grad_norm": 0.46312844512973367, "learning_rate": 1.2540803957803356e-05, "loss": 0.1677, "step": 2120 }, { "epoch": 2.3821423557489823, "grad_norm": 0.49417837944594656, "learning_rate": 1.2533216422014145e-05, "loss": 0.184, "step": 2121 }, { "epoch": 2.38326547802892, "grad_norm": 0.484024720454384, "learning_rate": 1.2525627327508994e-05, "loss": 0.1734, "step": 2122 }, { "epoch": 2.3843886003088586, "grad_norm": 0.48942672284743194, "learning_rate": 1.2518036678957554e-05, "loss": 0.1844, "step": 2123 }, { "epoch": 2.385511722588797, "grad_norm": 0.46895877443085426, "learning_rate": 1.2510444481030434e-05, "loss": 0.1728, "step": 2124 }, { "epoch": 2.386634844868735, "grad_norm": 0.45324010534175113, "learning_rate": 1.25028507383992e-05, "loss": 0.1735, "step": 2125 }, { "epoch": 2.3877579671486733, "grad_norm": 0.4500103538917005, "learning_rate": 1.2495255455736366e-05, "loss": 0.1663, "step": 2126 }, { "epoch": 2.3888810894286117, "grad_norm": 0.46447167465312617, "learning_rate": 1.2487658637715388e-05, "loss": 0.1851, "step": 2127 }, { "epoch": 2.3900042117085496, "grad_norm": 0.4689612937423477, "learning_rate": 1.2480060289010677e-05, "loss": 0.1812, "step": 2128 }, { "epoch": 2.391127333988488, "grad_norm": 0.44830442220436945, "learning_rate": 1.2472460414297576e-05, "loss": 0.1693, "step": 2129 }, { "epoch": 2.3922504562684264, "grad_norm": 0.47648002812678625, "learning_rate": 1.2464859018252377e-05, "loss": 0.1897, "step": 2130 }, { "epoch": 2.3933735785483643, "grad_norm": 0.4776674804606051, "learning_rate": 1.2457256105552297e-05, "loss": 0.1719, "step": 2131 }, { "epoch": 2.3944967008283027, "grad_norm": 0.45626152225385935, "learning_rate": 1.2449651680875495e-05, "loss": 0.174, "step": 2132 }, { "epoch": 2.395619823108241, "grad_norm": 0.4641074642628347, "learning_rate": 1.2442045748901057e-05, "loss": 0.1765, "step": 2133 }, { "epoch": 2.396742945388179, "grad_norm": 0.45963165730817895, "learning_rate": 1.2434438314308997e-05, "loss": 0.175, "step": 2134 }, { "epoch": 2.3978660676681174, "grad_norm": 0.4559745351834007, "learning_rate": 1.242682938178025e-05, "loss": 0.1607, "step": 2135 }, { "epoch": 2.3989891899480558, "grad_norm": 0.4592128892747727, "learning_rate": 1.2419218955996677e-05, "loss": 0.1752, "step": 2136 }, { "epoch": 2.4001123122279937, "grad_norm": 0.47314594000118837, "learning_rate": 1.2411607041641062e-05, "loss": 0.1715, "step": 2137 }, { "epoch": 2.401235434507932, "grad_norm": 0.46568965161641296, "learning_rate": 1.2403993643397095e-05, "loss": 0.1847, "step": 2138 }, { "epoch": 2.4023585567878705, "grad_norm": 0.45802226050759454, "learning_rate": 1.2396378765949382e-05, "loss": 0.1732, "step": 2139 }, { "epoch": 2.4034816790678084, "grad_norm": 0.4657203849841537, "learning_rate": 1.2388762413983447e-05, "loss": 0.1756, "step": 2140 }, { "epoch": 2.404604801347747, "grad_norm": 0.44869629379918413, "learning_rate": 1.238114459218571e-05, "loss": 0.1697, "step": 2141 }, { "epoch": 2.405727923627685, "grad_norm": 0.46164059934933155, "learning_rate": 1.2373525305243499e-05, "loss": 0.1774, "step": 2142 }, { "epoch": 2.406851045907623, "grad_norm": 0.47884498767707456, "learning_rate": 1.2365904557845054e-05, "loss": 0.1859, "step": 2143 }, { "epoch": 2.4079741681875615, "grad_norm": 0.4576461996017312, "learning_rate": 1.2358282354679494e-05, "loss": 0.1684, "step": 2144 }, { "epoch": 2.4090972904675, "grad_norm": 0.4679384692538624, "learning_rate": 1.2350658700436852e-05, "loss": 0.1738, "step": 2145 }, { "epoch": 2.410220412747438, "grad_norm": 0.46313097703304307, "learning_rate": 1.2343033599808044e-05, "loss": 0.1772, "step": 2146 }, { "epoch": 2.411343535027376, "grad_norm": 0.4625911631962477, "learning_rate": 1.2335407057484877e-05, "loss": 0.1811, "step": 2147 }, { "epoch": 2.4124666573073146, "grad_norm": 0.44513248630919333, "learning_rate": 1.232777907816005e-05, "loss": 0.1749, "step": 2148 }, { "epoch": 2.4135897795872525, "grad_norm": 0.467659286026053, "learning_rate": 1.2320149666527134e-05, "loss": 0.182, "step": 2149 }, { "epoch": 2.414712901867191, "grad_norm": 0.4596330741642717, "learning_rate": 1.2312518827280603e-05, "loss": 0.1725, "step": 2150 }, { "epoch": 2.415836024147129, "grad_norm": 0.48688351873304575, "learning_rate": 1.2304886565115786e-05, "loss": 0.1859, "step": 2151 }, { "epoch": 2.416959146427067, "grad_norm": 0.46822427826794677, "learning_rate": 1.2297252884728904e-05, "loss": 0.1832, "step": 2152 }, { "epoch": 2.4180822687070056, "grad_norm": 0.4729830338617269, "learning_rate": 1.2289617790817039e-05, "loss": 0.1682, "step": 2153 }, { "epoch": 2.4192053909869435, "grad_norm": 0.46719333183624456, "learning_rate": 1.228198128807815e-05, "loss": 0.1741, "step": 2154 }, { "epoch": 2.420328513266882, "grad_norm": 0.4773732399141985, "learning_rate": 1.2274343381211067e-05, "loss": 0.1737, "step": 2155 }, { "epoch": 2.4214516355468203, "grad_norm": 0.48795163089354604, "learning_rate": 1.226670407491547e-05, "loss": 0.1875, "step": 2156 }, { "epoch": 2.4225747578267582, "grad_norm": 0.4579527625487055, "learning_rate": 1.2259063373891911e-05, "loss": 0.1772, "step": 2157 }, { "epoch": 2.4236978801066966, "grad_norm": 0.4533126876310765, "learning_rate": 1.22514212828418e-05, "loss": 0.1705, "step": 2158 }, { "epoch": 2.424821002386635, "grad_norm": 0.43848733476677665, "learning_rate": 1.2243777806467396e-05, "loss": 0.1676, "step": 2159 }, { "epoch": 2.425944124666573, "grad_norm": 0.4719399363071484, "learning_rate": 1.223613294947182e-05, "loss": 0.1811, "step": 2160 }, { "epoch": 2.4270672469465113, "grad_norm": 0.453477191494824, "learning_rate": 1.222848671655903e-05, "loss": 0.1685, "step": 2161 }, { "epoch": 2.4281903692264497, "grad_norm": 0.4658487988412452, "learning_rate": 1.222083911243384e-05, "loss": 0.1773, "step": 2162 }, { "epoch": 2.4293134915063876, "grad_norm": 0.459422798554054, "learning_rate": 1.2213190141801906e-05, "loss": 0.1733, "step": 2163 }, { "epoch": 2.430436613786326, "grad_norm": 0.4887095631557401, "learning_rate": 1.2205539809369719e-05, "loss": 0.191, "step": 2164 }, { "epoch": 2.431559736066264, "grad_norm": 0.48937602650227535, "learning_rate": 1.2197888119844623e-05, "loss": 0.1846, "step": 2165 }, { "epoch": 2.4326828583462023, "grad_norm": 0.46406522024065605, "learning_rate": 1.2190235077934776e-05, "loss": 0.1747, "step": 2166 }, { "epoch": 2.4338059806261407, "grad_norm": 0.4621944971046579, "learning_rate": 1.2182580688349185e-05, "loss": 0.1685, "step": 2167 }, { "epoch": 2.4349291029060787, "grad_norm": 0.4862208277682074, "learning_rate": 1.2174924955797676e-05, "loss": 0.1838, "step": 2168 }, { "epoch": 2.436052225186017, "grad_norm": 0.4488446985907642, "learning_rate": 1.216726788499091e-05, "loss": 0.1735, "step": 2169 }, { "epoch": 2.4371753474659554, "grad_norm": 0.45799186763818456, "learning_rate": 1.2159609480640361e-05, "loss": 0.1615, "step": 2170 }, { "epoch": 2.4382984697458934, "grad_norm": 0.4700112723112886, "learning_rate": 1.2151949747458336e-05, "loss": 0.1729, "step": 2171 }, { "epoch": 2.4394215920258318, "grad_norm": 0.4613922790363946, "learning_rate": 1.214428869015795e-05, "loss": 0.1612, "step": 2172 }, { "epoch": 2.44054471430577, "grad_norm": 0.4730141720396465, "learning_rate": 1.2136626313453136e-05, "loss": 0.1717, "step": 2173 }, { "epoch": 2.441667836585708, "grad_norm": 0.48115765163733054, "learning_rate": 1.212896262205864e-05, "loss": 0.1759, "step": 2174 }, { "epoch": 2.4427909588656465, "grad_norm": 0.4773927024472624, "learning_rate": 1.2121297620690011e-05, "loss": 0.1703, "step": 2175 }, { "epoch": 2.443914081145585, "grad_norm": 0.42953251161909245, "learning_rate": 1.2113631314063615e-05, "loss": 0.1625, "step": 2176 }, { "epoch": 2.4450372034255228, "grad_norm": 0.45411136690423287, "learning_rate": 1.210596370689661e-05, "loss": 0.1695, "step": 2177 }, { "epoch": 2.446160325705461, "grad_norm": 0.48313926964400616, "learning_rate": 1.2098294803906962e-05, "loss": 0.1831, "step": 2178 }, { "epoch": 2.4472834479853995, "grad_norm": 0.4816531131405757, "learning_rate": 1.209062460981343e-05, "loss": 0.1758, "step": 2179 }, { "epoch": 2.4484065702653375, "grad_norm": 0.4399375257912147, "learning_rate": 1.208295312933557e-05, "loss": 0.1685, "step": 2180 }, { "epoch": 2.449529692545276, "grad_norm": 0.467400595438487, "learning_rate": 1.2075280367193727e-05, "loss": 0.1851, "step": 2181 }, { "epoch": 2.4506528148252142, "grad_norm": 0.4658665632807449, "learning_rate": 1.2067606328109038e-05, "loss": 0.1645, "step": 2182 }, { "epoch": 2.451775937105152, "grad_norm": 0.46990058745256763, "learning_rate": 1.2059931016803422e-05, "loss": 0.168, "step": 2183 }, { "epoch": 2.4528990593850906, "grad_norm": 0.4614529742174627, "learning_rate": 1.2052254437999582e-05, "loss": 0.1686, "step": 2184 }, { "epoch": 2.454022181665029, "grad_norm": 0.46087967841279076, "learning_rate": 1.2044576596421003e-05, "loss": 0.182, "step": 2185 }, { "epoch": 2.455145303944967, "grad_norm": 0.4686611799132139, "learning_rate": 1.2036897496791945e-05, "loss": 0.1705, "step": 2186 }, { "epoch": 2.4562684262249053, "grad_norm": 0.45135028559868196, "learning_rate": 1.2029217143837441e-05, "loss": 0.1749, "step": 2187 }, { "epoch": 2.4573915485048436, "grad_norm": 0.45774486543179027, "learning_rate": 1.2021535542283297e-05, "loss": 0.1786, "step": 2188 }, { "epoch": 2.4585146707847816, "grad_norm": 0.4370453888107022, "learning_rate": 1.2013852696856092e-05, "loss": 0.1616, "step": 2189 }, { "epoch": 2.45963779306472, "grad_norm": 0.4941697271173176, "learning_rate": 1.2006168612283158e-05, "loss": 0.1888, "step": 2190 }, { "epoch": 2.4607609153446584, "grad_norm": 0.45220984927599905, "learning_rate": 1.1998483293292602e-05, "loss": 0.1719, "step": 2191 }, { "epoch": 2.4618840376245963, "grad_norm": 0.45516650401879916, "learning_rate": 1.199079674461328e-05, "loss": 0.1718, "step": 2192 }, { "epoch": 2.4630071599045347, "grad_norm": 0.47742118331301076, "learning_rate": 1.1983108970974815e-05, "loss": 0.1848, "step": 2193 }, { "epoch": 2.464130282184473, "grad_norm": 0.46090370293756416, "learning_rate": 1.1975419977107578e-05, "loss": 0.1672, "step": 2194 }, { "epoch": 2.465253404464411, "grad_norm": 0.46359443330017996, "learning_rate": 1.1967729767742688e-05, "loss": 0.1769, "step": 2195 }, { "epoch": 2.4663765267443494, "grad_norm": 0.5022583283184264, "learning_rate": 1.1960038347612021e-05, "loss": 0.181, "step": 2196 }, { "epoch": 2.4674996490242878, "grad_norm": 0.46125009756094304, "learning_rate": 1.1952345721448189e-05, "loss": 0.1784, "step": 2197 }, { "epoch": 2.4686227713042257, "grad_norm": 0.4588215867147784, "learning_rate": 1.1944651893984546e-05, "loss": 0.1676, "step": 2198 }, { "epoch": 2.469745893584164, "grad_norm": 0.44150474530630224, "learning_rate": 1.1936956869955198e-05, "loss": 0.1581, "step": 2199 }, { "epoch": 2.470869015864102, "grad_norm": 0.4648006055553846, "learning_rate": 1.192926065409497e-05, "loss": 0.1756, "step": 2200 }, { "epoch": 2.4719921381440404, "grad_norm": 0.4783515973358098, "learning_rate": 1.1921563251139433e-05, "loss": 0.177, "step": 2201 }, { "epoch": 2.473115260423979, "grad_norm": 0.46380023099280016, "learning_rate": 1.1913864665824878e-05, "loss": 0.1738, "step": 2202 }, { "epoch": 2.4742383827039167, "grad_norm": 0.4494157069182162, "learning_rate": 1.1906164902888336e-05, "loss": 0.1669, "step": 2203 }, { "epoch": 2.475361504983855, "grad_norm": 0.45612118971443005, "learning_rate": 1.189846396706755e-05, "loss": 0.178, "step": 2204 }, { "epoch": 2.4764846272637935, "grad_norm": 0.45627124504496713, "learning_rate": 1.1890761863100994e-05, "loss": 0.1838, "step": 2205 }, { "epoch": 2.4776077495437314, "grad_norm": 0.448762160212113, "learning_rate": 1.1883058595727862e-05, "loss": 0.1715, "step": 2206 }, { "epoch": 2.47873087182367, "grad_norm": 0.4478139255344506, "learning_rate": 1.1875354169688049e-05, "loss": 0.178, "step": 2207 }, { "epoch": 2.479853994103608, "grad_norm": 0.45613528585444935, "learning_rate": 1.186764858972218e-05, "loss": 0.17, "step": 2208 }, { "epoch": 2.480977116383546, "grad_norm": 0.4788049854189734, "learning_rate": 1.185994186057158e-05, "loss": 0.1785, "step": 2209 }, { "epoch": 2.4821002386634845, "grad_norm": 0.4449516828341209, "learning_rate": 1.1852233986978286e-05, "loss": 0.1638, "step": 2210 }, { "epoch": 2.483223360943423, "grad_norm": 0.4533109754634573, "learning_rate": 1.1844524973685036e-05, "loss": 0.17, "step": 2211 }, { "epoch": 2.484346483223361, "grad_norm": 0.4615156080814565, "learning_rate": 1.1836814825435272e-05, "loss": 0.1753, "step": 2212 }, { "epoch": 2.485469605503299, "grad_norm": 0.4617037737156908, "learning_rate": 1.1829103546973135e-05, "loss": 0.1752, "step": 2213 }, { "epoch": 2.4865927277832376, "grad_norm": 0.4800696915193034, "learning_rate": 1.1821391143043455e-05, "loss": 0.1821, "step": 2214 }, { "epoch": 2.4877158500631755, "grad_norm": 0.48034446713417756, "learning_rate": 1.1813677618391759e-05, "loss": 0.177, "step": 2215 }, { "epoch": 2.488838972343114, "grad_norm": 0.46921558168312416, "learning_rate": 1.1805962977764271e-05, "loss": 0.1676, "step": 2216 }, { "epoch": 2.489962094623052, "grad_norm": 0.45754664779814874, "learning_rate": 1.1798247225907883e-05, "loss": 0.1659, "step": 2217 }, { "epoch": 2.4910852169029902, "grad_norm": 0.4445261764071967, "learning_rate": 1.1790530367570194e-05, "loss": 0.1697, "step": 2218 }, { "epoch": 2.4922083391829286, "grad_norm": 0.44402068318245963, "learning_rate": 1.1782812407499461e-05, "loss": 0.1706, "step": 2219 }, { "epoch": 2.4933314614628665, "grad_norm": 0.4853740651601103, "learning_rate": 1.1775093350444638e-05, "loss": 0.1787, "step": 2220 }, { "epoch": 2.494454583742805, "grad_norm": 0.47514363972154094, "learning_rate": 1.1767373201155344e-05, "loss": 0.1767, "step": 2221 }, { "epoch": 2.4955777060227433, "grad_norm": 0.4573804651093508, "learning_rate": 1.1759651964381864e-05, "loss": 0.1643, "step": 2222 }, { "epoch": 2.4967008283026813, "grad_norm": 0.46823456746959147, "learning_rate": 1.1751929644875171e-05, "loss": 0.1801, "step": 2223 }, { "epoch": 2.4978239505826196, "grad_norm": 0.4702347722306607, "learning_rate": 1.1744206247386885e-05, "loss": 0.1756, "step": 2224 }, { "epoch": 2.498947072862558, "grad_norm": 0.4779388858348045, "learning_rate": 1.1736481776669307e-05, "loss": 0.1854, "step": 2225 }, { "epoch": 2.500070195142496, "grad_norm": 0.45908097833882, "learning_rate": 1.1728756237475377e-05, "loss": 0.1724, "step": 2226 }, { "epoch": 2.5011933174224343, "grad_norm": 0.47347497107975, "learning_rate": 1.172102963455871e-05, "loss": 0.1856, "step": 2227 }, { "epoch": 2.5023164397023727, "grad_norm": 0.440317578484131, "learning_rate": 1.1713301972673574e-05, "loss": 0.1677, "step": 2228 }, { "epoch": 2.5034395619823107, "grad_norm": 0.4698698238973603, "learning_rate": 1.1705573256574875e-05, "loss": 0.172, "step": 2229 }, { "epoch": 2.504562684262249, "grad_norm": 0.4666802074158573, "learning_rate": 1.1697843491018189e-05, "loss": 0.1778, "step": 2230 }, { "epoch": 2.5056858065421874, "grad_norm": 0.4675479550389423, "learning_rate": 1.1690112680759714e-05, "loss": 0.1765, "step": 2231 }, { "epoch": 2.5068089288221254, "grad_norm": 0.470321624448753, "learning_rate": 1.1682380830556305e-05, "loss": 0.1741, "step": 2232 }, { "epoch": 2.5079320511020637, "grad_norm": 0.4621451552720319, "learning_rate": 1.1674647945165463e-05, "loss": 0.1725, "step": 2233 }, { "epoch": 2.509055173382002, "grad_norm": 0.47835096191087073, "learning_rate": 1.1666914029345309e-05, "loss": 0.1735, "step": 2234 }, { "epoch": 2.51017829566194, "grad_norm": 0.44571066889134053, "learning_rate": 1.165917908785461e-05, "loss": 0.1658, "step": 2235 }, { "epoch": 2.5113014179418784, "grad_norm": 0.48065746817643185, "learning_rate": 1.165144312545276e-05, "loss": 0.1789, "step": 2236 }, { "epoch": 2.512424540221817, "grad_norm": 0.4865730345149686, "learning_rate": 1.164370614689978e-05, "loss": 0.19, "step": 2237 }, { "epoch": 2.5135476625017548, "grad_norm": 0.46563389532044447, "learning_rate": 1.1635968156956322e-05, "loss": 0.1788, "step": 2238 }, { "epoch": 2.514670784781693, "grad_norm": 0.44428883561842164, "learning_rate": 1.1628229160383653e-05, "loss": 0.1645, "step": 2239 }, { "epoch": 2.5157939070616315, "grad_norm": 0.4699429571414019, "learning_rate": 1.1620489161943665e-05, "loss": 0.1734, "step": 2240 }, { "epoch": 2.5169170293415695, "grad_norm": 0.4645753328504612, "learning_rate": 1.161274816639886e-05, "loss": 0.1695, "step": 2241 }, { "epoch": 2.518040151621508, "grad_norm": 0.4715174058816328, "learning_rate": 1.1605006178512361e-05, "loss": 0.1767, "step": 2242 }, { "epoch": 2.5191632739014462, "grad_norm": 0.4739964569319054, "learning_rate": 1.15972632030479e-05, "loss": 0.1778, "step": 2243 }, { "epoch": 2.520286396181384, "grad_norm": 0.4864994655693669, "learning_rate": 1.1589519244769813e-05, "loss": 0.1907, "step": 2244 }, { "epoch": 2.5214095184613226, "grad_norm": 0.4554291488338845, "learning_rate": 1.1581774308443042e-05, "loss": 0.1726, "step": 2245 }, { "epoch": 2.522532640741261, "grad_norm": 0.4490036340445551, "learning_rate": 1.157402839883313e-05, "loss": 0.1737, "step": 2246 }, { "epoch": 2.523655763021199, "grad_norm": 0.4560497521668537, "learning_rate": 1.1566281520706228e-05, "loss": 0.1698, "step": 2247 }, { "epoch": 2.5247788853011373, "grad_norm": 0.4848150687683439, "learning_rate": 1.1558533678829066e-05, "loss": 0.1779, "step": 2248 }, { "epoch": 2.5259020075810756, "grad_norm": 0.4557588727056414, "learning_rate": 1.1550784877968982e-05, "loss": 0.1792, "step": 2249 }, { "epoch": 2.5270251298610136, "grad_norm": 0.46640445243584955, "learning_rate": 1.1543035122893898e-05, "loss": 0.1821, "step": 2250 }, { "epoch": 2.528148252140952, "grad_norm": 0.46062983958648196, "learning_rate": 1.1535284418372321e-05, "loss": 0.1731, "step": 2251 }, { "epoch": 2.5292713744208903, "grad_norm": 0.4533115380906245, "learning_rate": 1.1527532769173349e-05, "loss": 0.1852, "step": 2252 }, { "epoch": 2.5303944967008283, "grad_norm": 0.45093605161163985, "learning_rate": 1.1519780180066651e-05, "loss": 0.1725, "step": 2253 }, { "epoch": 2.5315176189807667, "grad_norm": 0.4739553972885986, "learning_rate": 1.1512026655822483e-05, "loss": 0.185, "step": 2254 }, { "epoch": 2.5326407412607046, "grad_norm": 0.455497428773056, "learning_rate": 1.150427220121168e-05, "loss": 0.1667, "step": 2255 }, { "epoch": 2.533763863540643, "grad_norm": 0.46427594398131145, "learning_rate": 1.1496516821005632e-05, "loss": 0.1707, "step": 2256 }, { "epoch": 2.5348869858205814, "grad_norm": 0.4726499893846905, "learning_rate": 1.1488760519976321e-05, "loss": 0.1693, "step": 2257 }, { "epoch": 2.5360101081005193, "grad_norm": 0.4770180942704747, "learning_rate": 1.1481003302896274e-05, "loss": 0.1791, "step": 2258 }, { "epoch": 2.5371332303804577, "grad_norm": 0.4824404668774312, "learning_rate": 1.1473245174538601e-05, "loss": 0.1632, "step": 2259 }, { "epoch": 2.538256352660396, "grad_norm": 0.46093177693875936, "learning_rate": 1.1465486139676955e-05, "loss": 0.171, "step": 2260 }, { "epoch": 2.539379474940334, "grad_norm": 0.47811684922648134, "learning_rate": 1.1457726203085565e-05, "loss": 0.1795, "step": 2261 }, { "epoch": 2.5405025972202724, "grad_norm": 0.4605303787580689, "learning_rate": 1.14499653695392e-05, "loss": 0.1717, "step": 2262 }, { "epoch": 2.5416257195002103, "grad_norm": 0.45780123414736634, "learning_rate": 1.1442203643813184e-05, "loss": 0.165, "step": 2263 }, { "epoch": 2.5427488417801487, "grad_norm": 0.4484627901652153, "learning_rate": 1.1434441030683396e-05, "loss": 0.1723, "step": 2264 }, { "epoch": 2.543871964060087, "grad_norm": 0.45899357848479316, "learning_rate": 1.1426677534926259e-05, "loss": 0.178, "step": 2265 }, { "epoch": 2.544995086340025, "grad_norm": 0.4520762026871274, "learning_rate": 1.1418913161318735e-05, "loss": 0.1718, "step": 2266 }, { "epoch": 2.5461182086199634, "grad_norm": 0.48593149453254475, "learning_rate": 1.1411147914638323e-05, "loss": 0.1858, "step": 2267 }, { "epoch": 2.547241330899902, "grad_norm": 0.4699419541357399, "learning_rate": 1.1403381799663073e-05, "loss": 0.1868, "step": 2268 }, { "epoch": 2.5483644531798397, "grad_norm": 0.43072277665816994, "learning_rate": 1.139561482117156e-05, "loss": 0.1635, "step": 2269 }, { "epoch": 2.549487575459778, "grad_norm": 0.48228875953990946, "learning_rate": 1.138784698394289e-05, "loss": 0.1838, "step": 2270 }, { "epoch": 2.5506106977397165, "grad_norm": 0.4589878064473334, "learning_rate": 1.1380078292756695e-05, "loss": 0.1779, "step": 2271 }, { "epoch": 2.5517338200196544, "grad_norm": 0.47422067140567364, "learning_rate": 1.1372308752393144e-05, "loss": 0.1871, "step": 2272 }, { "epoch": 2.552856942299593, "grad_norm": 0.4841535195365073, "learning_rate": 1.136453836763291e-05, "loss": 0.1839, "step": 2273 }, { "epoch": 2.553980064579531, "grad_norm": 0.46931683097362253, "learning_rate": 1.1356767143257208e-05, "loss": 0.1774, "step": 2274 }, { "epoch": 2.555103186859469, "grad_norm": 0.45852223438824385, "learning_rate": 1.134899508404775e-05, "loss": 0.1668, "step": 2275 }, { "epoch": 2.5562263091394075, "grad_norm": 0.4747264084612995, "learning_rate": 1.1341222194786772e-05, "loss": 0.1817, "step": 2276 }, { "epoch": 2.557349431419346, "grad_norm": 0.5309294296915062, "learning_rate": 1.1333448480257019e-05, "loss": 0.1774, "step": 2277 }, { "epoch": 2.558472553699284, "grad_norm": 0.44494999424664955, "learning_rate": 1.132567394524174e-05, "loss": 0.1652, "step": 2278 }, { "epoch": 2.559595675979222, "grad_norm": 0.47565188072125314, "learning_rate": 1.1317898594524694e-05, "loss": 0.1739, "step": 2279 }, { "epoch": 2.5607187982591606, "grad_norm": 0.4724833833160546, "learning_rate": 1.131012243289014e-05, "loss": 0.1756, "step": 2280 }, { "epoch": 2.5618419205390985, "grad_norm": 0.4961383451401611, "learning_rate": 1.1302345465122839e-05, "loss": 0.1751, "step": 2281 }, { "epoch": 2.562965042819037, "grad_norm": 0.45743388018733416, "learning_rate": 1.1294567696008038e-05, "loss": 0.1682, "step": 2282 }, { "epoch": 2.5640881650989753, "grad_norm": 0.46320800217506247, "learning_rate": 1.1286789130331487e-05, "loss": 0.1765, "step": 2283 }, { "epoch": 2.5652112873789132, "grad_norm": 0.46728239868320987, "learning_rate": 1.1279009772879427e-05, "loss": 0.1715, "step": 2284 }, { "epoch": 2.5663344096588516, "grad_norm": 0.455957997479803, "learning_rate": 1.1271229628438578e-05, "loss": 0.174, "step": 2285 }, { "epoch": 2.56745753193879, "grad_norm": 0.46937629962755417, "learning_rate": 1.1263448701796149e-05, "loss": 0.1785, "step": 2286 }, { "epoch": 2.568580654218728, "grad_norm": 0.46560086475544704, "learning_rate": 1.125566699773983e-05, "loss": 0.1782, "step": 2287 }, { "epoch": 2.5697037764986663, "grad_norm": 0.4611166282806535, "learning_rate": 1.1247884521057788e-05, "loss": 0.1604, "step": 2288 }, { "epoch": 2.5708268987786047, "grad_norm": 0.4800102109761718, "learning_rate": 1.1240101276538668e-05, "loss": 0.1798, "step": 2289 }, { "epoch": 2.5719500210585426, "grad_norm": 0.44962924941537197, "learning_rate": 1.1232317268971586e-05, "loss": 0.1645, "step": 2290 }, { "epoch": 2.573073143338481, "grad_norm": 0.45604502325176743, "learning_rate": 1.122453250314613e-05, "loss": 0.1762, "step": 2291 }, { "epoch": 2.5741962656184194, "grad_norm": 0.44941318778717404, "learning_rate": 1.121674698385235e-05, "loss": 0.1723, "step": 2292 }, { "epoch": 2.5753193878983573, "grad_norm": 0.4570440838697352, "learning_rate": 1.1208960715880759e-05, "loss": 0.1692, "step": 2293 }, { "epoch": 2.5764425101782957, "grad_norm": 0.4458264791275645, "learning_rate": 1.1201173704022335e-05, "loss": 0.1714, "step": 2294 }, { "epoch": 2.577565632458234, "grad_norm": 0.46409333426984856, "learning_rate": 1.1193385953068512e-05, "loss": 0.188, "step": 2295 }, { "epoch": 2.578688754738172, "grad_norm": 0.42425996455837706, "learning_rate": 1.118559746781118e-05, "loss": 0.1591, "step": 2296 }, { "epoch": 2.5798118770181104, "grad_norm": 0.447955738326864, "learning_rate": 1.1177808253042679e-05, "loss": 0.1712, "step": 2297 }, { "epoch": 2.580934999298049, "grad_norm": 0.4650895841202455, "learning_rate": 1.1170018313555802e-05, "loss": 0.1808, "step": 2298 }, { "epoch": 2.5820581215779868, "grad_norm": 0.4701322334085429, "learning_rate": 1.1162227654143777e-05, "loss": 0.1893, "step": 2299 }, { "epoch": 2.583181243857925, "grad_norm": 0.4531359125712725, "learning_rate": 1.1154436279600287e-05, "loss": 0.169, "step": 2300 }, { "epoch": 2.5843043661378635, "grad_norm": 0.47496785745939757, "learning_rate": 1.1146644194719454e-05, "loss": 0.171, "step": 2301 }, { "epoch": 2.5854274884178015, "grad_norm": 0.4612778970229022, "learning_rate": 1.1138851404295826e-05, "loss": 0.1671, "step": 2302 }, { "epoch": 2.58655061069774, "grad_norm": 0.4963115959404571, "learning_rate": 1.1131057913124399e-05, "loss": 0.1823, "step": 2303 }, { "epoch": 2.587673732977678, "grad_norm": 0.4776956369356712, "learning_rate": 1.1123263726000588e-05, "loss": 0.1766, "step": 2304 }, { "epoch": 2.588796855257616, "grad_norm": 0.48648831354650407, "learning_rate": 1.1115468847720245e-05, "loss": 0.1757, "step": 2305 }, { "epoch": 2.5899199775375545, "grad_norm": 0.4686638661781645, "learning_rate": 1.110767328307965e-05, "loss": 0.1821, "step": 2306 }, { "epoch": 2.5910430998174925, "grad_norm": 0.44047248695541774, "learning_rate": 1.109987703687549e-05, "loss": 0.1657, "step": 2307 }, { "epoch": 2.592166222097431, "grad_norm": 0.43543299715008577, "learning_rate": 1.1092080113904886e-05, "loss": 0.1599, "step": 2308 }, { "epoch": 2.5932893443773692, "grad_norm": 0.47199067367590014, "learning_rate": 1.1084282518965373e-05, "loss": 0.1791, "step": 2309 }, { "epoch": 2.594412466657307, "grad_norm": 0.4532641018961933, "learning_rate": 1.1076484256854889e-05, "loss": 0.1678, "step": 2310 }, { "epoch": 2.5955355889372456, "grad_norm": 0.45735825060841906, "learning_rate": 1.1068685332371802e-05, "loss": 0.1838, "step": 2311 }, { "epoch": 2.5966587112171835, "grad_norm": 0.4591137341726451, "learning_rate": 1.1060885750314865e-05, "loss": 0.1765, "step": 2312 }, { "epoch": 2.597781833497122, "grad_norm": 0.4679142193330497, "learning_rate": 1.1053085515483255e-05, "loss": 0.179, "step": 2313 }, { "epoch": 2.5989049557770603, "grad_norm": 0.47518636171561446, "learning_rate": 1.1045284632676535e-05, "loss": 0.1698, "step": 2314 }, { "epoch": 2.600028078056998, "grad_norm": 0.4565346667945619, "learning_rate": 1.1037483106694681e-05, "loss": 0.1706, "step": 2315 }, { "epoch": 2.6011512003369366, "grad_norm": 0.47006822600452486, "learning_rate": 1.1029680942338053e-05, "loss": 0.1694, "step": 2316 }, { "epoch": 2.602274322616875, "grad_norm": 0.4563277326138092, "learning_rate": 1.1021878144407408e-05, "loss": 0.1672, "step": 2317 }, { "epoch": 2.603397444896813, "grad_norm": 0.47107089066828634, "learning_rate": 1.1014074717703897e-05, "loss": 0.1749, "step": 2318 }, { "epoch": 2.6045205671767513, "grad_norm": 0.4467643283567714, "learning_rate": 1.1006270667029054e-05, "loss": 0.1679, "step": 2319 }, { "epoch": 2.6056436894566897, "grad_norm": 0.4587123434597116, "learning_rate": 1.0998465997184798e-05, "loss": 0.1777, "step": 2320 }, { "epoch": 2.6067668117366276, "grad_norm": 0.4983006072884289, "learning_rate": 1.099066071297342e-05, "loss": 0.1874, "step": 2321 }, { "epoch": 2.607889934016566, "grad_norm": 0.476575665366494, "learning_rate": 1.0982854819197609e-05, "loss": 0.1747, "step": 2322 }, { "epoch": 2.6090130562965044, "grad_norm": 0.45400404446255577, "learning_rate": 1.0975048320660408e-05, "loss": 0.1801, "step": 2323 }, { "epoch": 2.6101361785764423, "grad_norm": 0.4332841064106607, "learning_rate": 1.0967241222165247e-05, "loss": 0.1632, "step": 2324 }, { "epoch": 2.6112593008563807, "grad_norm": 0.4510328447149446, "learning_rate": 1.0959433528515922e-05, "loss": 0.1741, "step": 2325 }, { "epoch": 2.612382423136319, "grad_norm": 0.454925383272742, "learning_rate": 1.0951625244516584e-05, "loss": 0.1727, "step": 2326 }, { "epoch": 2.613505545416257, "grad_norm": 0.4371439911786275, "learning_rate": 1.094381637497176e-05, "loss": 0.1673, "step": 2327 }, { "epoch": 2.6146286676961954, "grad_norm": 0.47013253731307947, "learning_rate": 1.0936006924686337e-05, "loss": 0.1708, "step": 2328 }, { "epoch": 2.615751789976134, "grad_norm": 0.46539714593918086, "learning_rate": 1.0928196898465552e-05, "loss": 0.1801, "step": 2329 }, { "epoch": 2.6168749122560717, "grad_norm": 0.45190767645037216, "learning_rate": 1.0920386301115e-05, "loss": 0.1744, "step": 2330 }, { "epoch": 2.61799803453601, "grad_norm": 0.4538448950274808, "learning_rate": 1.091257513744063e-05, "loss": 0.1681, "step": 2331 }, { "epoch": 2.6191211568159485, "grad_norm": 0.488178449583916, "learning_rate": 1.0904763412248736e-05, "loss": 0.1845, "step": 2332 }, { "epoch": 2.6202442790958864, "grad_norm": 0.46938216478064526, "learning_rate": 1.0896951130345957e-05, "loss": 0.1767, "step": 2333 }, { "epoch": 2.621367401375825, "grad_norm": 0.44361139483028894, "learning_rate": 1.0889138296539277e-05, "loss": 0.1719, "step": 2334 }, { "epoch": 2.622490523655763, "grad_norm": 0.4571118704858418, "learning_rate": 1.088132491563602e-05, "loss": 0.1693, "step": 2335 }, { "epoch": 2.623613645935701, "grad_norm": 0.4600308129329723, "learning_rate": 1.0873510992443841e-05, "loss": 0.1666, "step": 2336 }, { "epoch": 2.6247367682156395, "grad_norm": 0.4609852591114243, "learning_rate": 1.086569653177074e-05, "loss": 0.1714, "step": 2337 }, { "epoch": 2.625859890495578, "grad_norm": 0.47717459396403794, "learning_rate": 1.0857881538425032e-05, "loss": 0.1872, "step": 2338 }, { "epoch": 2.626983012775516, "grad_norm": 0.4617597071752641, "learning_rate": 1.0850066017215375e-05, "loss": 0.1745, "step": 2339 }, { "epoch": 2.628106135055454, "grad_norm": 0.4654760240823605, "learning_rate": 1.0842249972950743e-05, "loss": 0.1713, "step": 2340 }, { "epoch": 2.6292292573353926, "grad_norm": 0.4698177808893506, "learning_rate": 1.0834433410440432e-05, "loss": 0.1716, "step": 2341 }, { "epoch": 2.6303523796153305, "grad_norm": 0.47185791755913564, "learning_rate": 1.0826616334494068e-05, "loss": 0.177, "step": 2342 }, { "epoch": 2.631475501895269, "grad_norm": 0.46380423420344924, "learning_rate": 1.0818798749921569e-05, "loss": 0.172, "step": 2343 }, { "epoch": 2.6325986241752073, "grad_norm": 0.4790054584186303, "learning_rate": 1.081098066153319e-05, "loss": 0.1726, "step": 2344 }, { "epoch": 2.6337217464551452, "grad_norm": 0.46882773567793207, "learning_rate": 1.0803162074139489e-05, "loss": 0.1699, "step": 2345 }, { "epoch": 2.6348448687350836, "grad_norm": 0.45878850668533416, "learning_rate": 1.0795342992551323e-05, "loss": 0.1749, "step": 2346 }, { "epoch": 2.635967991015022, "grad_norm": 0.46305430397540676, "learning_rate": 1.0787523421579862e-05, "loss": 0.1797, "step": 2347 }, { "epoch": 2.63709111329496, "grad_norm": 0.4663636966697608, "learning_rate": 1.0779703366036573e-05, "loss": 0.1733, "step": 2348 }, { "epoch": 2.6382142355748983, "grad_norm": 0.48294894032218394, "learning_rate": 1.0771882830733223e-05, "loss": 0.1887, "step": 2349 }, { "epoch": 2.6393373578548367, "grad_norm": 0.45323268388669674, "learning_rate": 1.0764061820481872e-05, "loss": 0.1717, "step": 2350 }, { "epoch": 2.6404604801347746, "grad_norm": 0.4532472744181654, "learning_rate": 1.0756240340094877e-05, "loss": 0.1732, "step": 2351 }, { "epoch": 2.641583602414713, "grad_norm": 0.4383917984038164, "learning_rate": 1.0748418394384876e-05, "loss": 0.1658, "step": 2352 }, { "epoch": 2.6427067246946514, "grad_norm": 0.4661107283107099, "learning_rate": 1.07405959881648e-05, "loss": 0.1863, "step": 2353 }, { "epoch": 2.6438298469745893, "grad_norm": 0.4737303429465663, "learning_rate": 1.0732773126247867e-05, "loss": 0.1771, "step": 2354 }, { "epoch": 2.6449529692545277, "grad_norm": 0.47017332443037857, "learning_rate": 1.0724949813447563e-05, "loss": 0.1724, "step": 2355 }, { "epoch": 2.6460760915344657, "grad_norm": 0.49757478615437767, "learning_rate": 1.071712605457766e-05, "loss": 0.1824, "step": 2356 }, { "epoch": 2.647199213814404, "grad_norm": 0.47195996519974315, "learning_rate": 1.0709301854452207e-05, "loss": 0.1778, "step": 2357 }, { "epoch": 2.6483223360943424, "grad_norm": 0.48072524454750337, "learning_rate": 1.0701477217885517e-05, "loss": 0.1854, "step": 2358 }, { "epoch": 2.6494454583742804, "grad_norm": 0.48656148421443146, "learning_rate": 1.0693652149692175e-05, "loss": 0.1838, "step": 2359 }, { "epoch": 2.6505685806542187, "grad_norm": 0.4565743939469196, "learning_rate": 1.068582665468703e-05, "loss": 0.1767, "step": 2360 }, { "epoch": 2.6516917029341567, "grad_norm": 0.4785810820290836, "learning_rate": 1.0678000737685197e-05, "loss": 0.1738, "step": 2361 }, { "epoch": 2.652814825214095, "grad_norm": 0.4574908421572849, "learning_rate": 1.0670174403502051e-05, "loss": 0.1765, "step": 2362 }, { "epoch": 2.6539379474940334, "grad_norm": 0.465593556602867, "learning_rate": 1.0662347656953221e-05, "loss": 0.1679, "step": 2363 }, { "epoch": 2.6550610697739714, "grad_norm": 0.46131939499983016, "learning_rate": 1.0654520502854588e-05, "loss": 0.1777, "step": 2364 }, { "epoch": 2.6561841920539098, "grad_norm": 0.48218612435761826, "learning_rate": 1.0646692946022285e-05, "loss": 0.1927, "step": 2365 }, { "epoch": 2.657307314333848, "grad_norm": 0.44549729019844936, "learning_rate": 1.0638864991272698e-05, "loss": 0.1647, "step": 2366 }, { "epoch": 2.658430436613786, "grad_norm": 0.4723488402722565, "learning_rate": 1.063103664342245e-05, "loss": 0.1757, "step": 2367 }, { "epoch": 2.6595535588937245, "grad_norm": 0.46272844102717686, "learning_rate": 1.0623207907288409e-05, "loss": 0.1719, "step": 2368 }, { "epoch": 2.660676681173663, "grad_norm": 0.45549848321049613, "learning_rate": 1.061537878768769e-05, "loss": 0.1697, "step": 2369 }, { "epoch": 2.661799803453601, "grad_norm": 0.45675590525963283, "learning_rate": 1.0607549289437626e-05, "loss": 0.174, "step": 2370 }, { "epoch": 2.662922925733539, "grad_norm": 0.47841033221623763, "learning_rate": 1.0599719417355801e-05, "loss": 0.1778, "step": 2371 }, { "epoch": 2.6640460480134776, "grad_norm": 0.4536400418140574, "learning_rate": 1.0591889176260017e-05, "loss": 0.1737, "step": 2372 }, { "epoch": 2.6651691702934155, "grad_norm": 0.4630804441617724, "learning_rate": 1.0584058570968312e-05, "loss": 0.1792, "step": 2373 }, { "epoch": 2.666292292573354, "grad_norm": 0.4382451498975878, "learning_rate": 1.0576227606298937e-05, "loss": 0.1567, "step": 2374 }, { "epoch": 2.6674154148532923, "grad_norm": 0.4584632279254452, "learning_rate": 1.0568396287070377e-05, "loss": 0.1751, "step": 2375 }, { "epoch": 2.66853853713323, "grad_norm": 0.46530159309615415, "learning_rate": 1.0560564618101328e-05, "loss": 0.1796, "step": 2376 }, { "epoch": 2.6696616594131686, "grad_norm": 0.4635165893290825, "learning_rate": 1.0552732604210701e-05, "loss": 0.1737, "step": 2377 }, { "epoch": 2.670784781693107, "grad_norm": 0.4395153429761878, "learning_rate": 1.0544900250217615e-05, "loss": 0.1599, "step": 2378 }, { "epoch": 2.671907903973045, "grad_norm": 0.45345637640530895, "learning_rate": 1.0537067560941416e-05, "loss": 0.1746, "step": 2379 }, { "epoch": 2.6730310262529833, "grad_norm": 0.45176101895188636, "learning_rate": 1.0529234541201631e-05, "loss": 0.1724, "step": 2380 }, { "epoch": 2.6741541485329217, "grad_norm": 0.46146241332136667, "learning_rate": 1.0521401195818014e-05, "loss": 0.1748, "step": 2381 }, { "epoch": 2.6752772708128596, "grad_norm": 0.4644112669184593, "learning_rate": 1.0513567529610498e-05, "loss": 0.1792, "step": 2382 }, { "epoch": 2.676400393092798, "grad_norm": 0.474578955286889, "learning_rate": 1.050573354739923e-05, "loss": 0.1785, "step": 2383 }, { "epoch": 2.6775235153727364, "grad_norm": 0.4361002502010669, "learning_rate": 1.049789925400455e-05, "loss": 0.1625, "step": 2384 }, { "epoch": 2.6786466376526743, "grad_norm": 0.45846134672913397, "learning_rate": 1.0490064654246976e-05, "loss": 0.18, "step": 2385 }, { "epoch": 2.6797697599326127, "grad_norm": 0.46526875476056234, "learning_rate": 1.0482229752947228e-05, "loss": 0.1712, "step": 2386 }, { "epoch": 2.680892882212551, "grad_norm": 0.4661257112898557, "learning_rate": 1.0474394554926206e-05, "loss": 0.1815, "step": 2387 }, { "epoch": 2.682016004492489, "grad_norm": 0.4648307273335071, "learning_rate": 1.0466559065004995e-05, "loss": 0.17, "step": 2388 }, { "epoch": 2.6831391267724274, "grad_norm": 0.4673176281630868, "learning_rate": 1.0458723288004858e-05, "loss": 0.1689, "step": 2389 }, { "epoch": 2.6842622490523658, "grad_norm": 0.4580586103480894, "learning_rate": 1.0450887228747229e-05, "loss": 0.1608, "step": 2390 }, { "epoch": 2.6853853713323037, "grad_norm": 0.46591951479828025, "learning_rate": 1.0443050892053733e-05, "loss": 0.1752, "step": 2391 }, { "epoch": 2.686508493612242, "grad_norm": 0.4515547388684425, "learning_rate": 1.0435214282746142e-05, "loss": 0.1598, "step": 2392 }, { "epoch": 2.6876316158921805, "grad_norm": 0.4951158425551178, "learning_rate": 1.0427377405646414e-05, "loss": 0.1933, "step": 2393 }, { "epoch": 2.6887547381721184, "grad_norm": 0.47544695446998514, "learning_rate": 1.0419540265576666e-05, "loss": 0.1797, "step": 2394 }, { "epoch": 2.689877860452057, "grad_norm": 0.45123909591416184, "learning_rate": 1.041170286735918e-05, "loss": 0.1724, "step": 2395 }, { "epoch": 2.691000982731995, "grad_norm": 0.462230580491664, "learning_rate": 1.0403865215816382e-05, "loss": 0.1846, "step": 2396 }, { "epoch": 2.692124105011933, "grad_norm": 0.452353253692668, "learning_rate": 1.0396027315770876e-05, "loss": 0.1757, "step": 2397 }, { "epoch": 2.6932472272918715, "grad_norm": 0.45256519362751757, "learning_rate": 1.0388189172045407e-05, "loss": 0.1815, "step": 2398 }, { "epoch": 2.69437034957181, "grad_norm": 0.47210582761980324, "learning_rate": 1.0380350789462865e-05, "loss": 0.178, "step": 2399 }, { "epoch": 2.695493471851748, "grad_norm": 0.4544816107564523, "learning_rate": 1.0372512172846296e-05, "loss": 0.1659, "step": 2400 }, { "epoch": 2.696616594131686, "grad_norm": 0.47543248227366125, "learning_rate": 1.0364673327018891e-05, "loss": 0.1823, "step": 2401 }, { "epoch": 2.6977397164116246, "grad_norm": 0.4645680632421113, "learning_rate": 1.0356834256803974e-05, "loss": 0.1817, "step": 2402 }, { "epoch": 2.6988628386915625, "grad_norm": 0.47229910569982986, "learning_rate": 1.0348994967025012e-05, "loss": 0.1705, "step": 2403 }, { "epoch": 2.699985960971501, "grad_norm": 0.4709976896475921, "learning_rate": 1.0341155462505606e-05, "loss": 0.1816, "step": 2404 }, { "epoch": 2.701109083251439, "grad_norm": 0.43745184102209106, "learning_rate": 1.033331574806949e-05, "loss": 0.1542, "step": 2405 }, { "epoch": 2.702232205531377, "grad_norm": 0.47528735474280753, "learning_rate": 1.0325475828540524e-05, "loss": 0.1746, "step": 2406 }, { "epoch": 2.7033553278113156, "grad_norm": 0.4864779824250153, "learning_rate": 1.03176357087427e-05, "loss": 0.1861, "step": 2407 }, { "epoch": 2.7044784500912535, "grad_norm": 0.44098989955704543, "learning_rate": 1.030979539350013e-05, "loss": 0.1665, "step": 2408 }, { "epoch": 2.705601572371192, "grad_norm": 0.45665136007590384, "learning_rate": 1.0301954887637045e-05, "loss": 0.1701, "step": 2409 }, { "epoch": 2.7067246946511303, "grad_norm": 0.457919080022194, "learning_rate": 1.0294114195977796e-05, "loss": 0.1647, "step": 2410 }, { "epoch": 2.7078478169310682, "grad_norm": 0.47134438824824476, "learning_rate": 1.0286273323346843e-05, "loss": 0.1811, "step": 2411 }, { "epoch": 2.7089709392110066, "grad_norm": 0.48672871636486603, "learning_rate": 1.0278432274568765e-05, "loss": 0.1743, "step": 2412 }, { "epoch": 2.7100940614909446, "grad_norm": 0.4619105109658203, "learning_rate": 1.0270591054468244e-05, "loss": 0.1684, "step": 2413 }, { "epoch": 2.711217183770883, "grad_norm": 0.4693014599142233, "learning_rate": 1.0262749667870071e-05, "loss": 0.1735, "step": 2414 }, { "epoch": 2.7123403060508213, "grad_norm": 0.4536596675091747, "learning_rate": 1.0254908119599134e-05, "loss": 0.1663, "step": 2415 }, { "epoch": 2.7134634283307593, "grad_norm": 0.4762674495287221, "learning_rate": 1.0247066414480424e-05, "loss": 0.1733, "step": 2416 }, { "epoch": 2.7145865506106976, "grad_norm": 0.4689655559574979, "learning_rate": 1.0239224557339035e-05, "loss": 0.1784, "step": 2417 }, { "epoch": 2.715709672890636, "grad_norm": 0.4673344907187584, "learning_rate": 1.0231382553000143e-05, "loss": 0.1837, "step": 2418 }, { "epoch": 2.716832795170574, "grad_norm": 0.46231195313403367, "learning_rate": 1.0223540406289017e-05, "loss": 0.1691, "step": 2419 }, { "epoch": 2.7179559174505123, "grad_norm": 0.4722466169309128, "learning_rate": 1.0215698122031021e-05, "loss": 0.1765, "step": 2420 }, { "epoch": 2.7190790397304507, "grad_norm": 0.4464937713270927, "learning_rate": 1.0207855705051595e-05, "loss": 0.1692, "step": 2421 }, { "epoch": 2.7202021620103887, "grad_norm": 0.43287348633124384, "learning_rate": 1.020001316017627e-05, "loss": 0.161, "step": 2422 }, { "epoch": 2.721325284290327, "grad_norm": 0.46989012910634004, "learning_rate": 1.0192170492230643e-05, "loss": 0.1809, "step": 2423 }, { "epoch": 2.7224484065702654, "grad_norm": 0.4705286884689803, "learning_rate": 1.0184327706040397e-05, "loss": 0.174, "step": 2424 }, { "epoch": 2.7235715288502034, "grad_norm": 0.43611447014588406, "learning_rate": 1.0176484806431288e-05, "loss": 0.1604, "step": 2425 }, { "epoch": 2.7246946511301418, "grad_norm": 0.4794627114691746, "learning_rate": 1.0168641798229133e-05, "loss": 0.1802, "step": 2426 }, { "epoch": 2.72581777341008, "grad_norm": 0.4381810224958243, "learning_rate": 1.0160798686259825e-05, "loss": 0.1609, "step": 2427 }, { "epoch": 2.726940895690018, "grad_norm": 0.48565778276163146, "learning_rate": 1.0152955475349316e-05, "loss": 0.1878, "step": 2428 }, { "epoch": 2.7280640179699565, "grad_norm": 0.46551401986940266, "learning_rate": 1.014511217032362e-05, "loss": 0.1744, "step": 2429 }, { "epoch": 2.729187140249895, "grad_norm": 0.45724136025751433, "learning_rate": 1.0137268776008809e-05, "loss": 0.17, "step": 2430 }, { "epoch": 2.7303102625298328, "grad_norm": 0.45946275216502946, "learning_rate": 1.0129425297231005e-05, "loss": 0.1766, "step": 2431 }, { "epoch": 2.731433384809771, "grad_norm": 0.46397417169666205, "learning_rate": 1.0121581738816397e-05, "loss": 0.174, "step": 2432 }, { "epoch": 2.7325565070897095, "grad_norm": 0.46245651199187193, "learning_rate": 1.0113738105591203e-05, "loss": 0.1792, "step": 2433 }, { "epoch": 2.7336796293696475, "grad_norm": 0.4524764262786222, "learning_rate": 1.0105894402381703e-05, "loss": 0.1694, "step": 2434 }, { "epoch": 2.734802751649586, "grad_norm": 0.4529695543319877, "learning_rate": 1.0098050634014216e-05, "loss": 0.1753, "step": 2435 }, { "epoch": 2.7359258739295242, "grad_norm": 0.45224833550454835, "learning_rate": 1.0090206805315087e-05, "loss": 0.1688, "step": 2436 }, { "epoch": 2.737048996209462, "grad_norm": 0.4559944373716082, "learning_rate": 1.0082362921110721e-05, "loss": 0.1715, "step": 2437 }, { "epoch": 2.7381721184894006, "grad_norm": 0.45958405120251117, "learning_rate": 1.0074518986227546e-05, "loss": 0.17, "step": 2438 }, { "epoch": 2.739295240769339, "grad_norm": 0.4490783585500566, "learning_rate": 1.0066675005492017e-05, "loss": 0.1715, "step": 2439 }, { "epoch": 2.740418363049277, "grad_norm": 0.448796044847132, "learning_rate": 1.0058830983730622e-05, "loss": 0.178, "step": 2440 }, { "epoch": 2.7415414853292153, "grad_norm": 0.45884028647372394, "learning_rate": 1.0050986925769877e-05, "loss": 0.1731, "step": 2441 }, { "epoch": 2.7426646076091536, "grad_norm": 0.4426418856703806, "learning_rate": 1.0043142836436316e-05, "loss": 0.16, "step": 2442 }, { "epoch": 2.7437877298890916, "grad_norm": 0.4678363877463838, "learning_rate": 1.0035298720556493e-05, "loss": 0.1859, "step": 2443 }, { "epoch": 2.74491085216903, "grad_norm": 0.4676021495394005, "learning_rate": 1.002745458295698e-05, "loss": 0.1706, "step": 2444 }, { "epoch": 2.7460339744489684, "grad_norm": 0.4623563788627261, "learning_rate": 1.0019610428464354e-05, "loss": 0.1755, "step": 2445 }, { "epoch": 2.7471570967289063, "grad_norm": 0.4567998634977174, "learning_rate": 1.001176626190522e-05, "loss": 0.1749, "step": 2446 }, { "epoch": 2.7482802190088447, "grad_norm": 0.47580917906104647, "learning_rate": 1.0003922088106178e-05, "loss": 0.1795, "step": 2447 }, { "epoch": 2.749403341288783, "grad_norm": 0.4595058756812876, "learning_rate": 9.996077911893829e-06, "loss": 0.1758, "step": 2448 }, { "epoch": 2.750526463568721, "grad_norm": 0.45863685177045854, "learning_rate": 9.988233738094782e-06, "loss": 0.177, "step": 2449 }, { "epoch": 2.7516495858486594, "grad_norm": 0.4547228686150152, "learning_rate": 9.980389571535647e-06, "loss": 0.1738, "step": 2450 }, { "epoch": 2.7527727081285978, "grad_norm": 0.4425914309842023, "learning_rate": 9.972545417043024e-06, "loss": 0.1591, "step": 2451 }, { "epoch": 2.7538958304085357, "grad_norm": 0.4739331962858264, "learning_rate": 9.964701279443509e-06, "loss": 0.1862, "step": 2452 }, { "epoch": 2.755018952688474, "grad_norm": 0.44719582544834424, "learning_rate": 9.956857163563689e-06, "loss": 0.1621, "step": 2453 }, { "epoch": 2.7561420749684125, "grad_norm": 0.4493034917391583, "learning_rate": 9.949013074230127e-06, "loss": 0.1678, "step": 2454 }, { "epoch": 2.7572651972483504, "grad_norm": 0.4341676937820874, "learning_rate": 9.94116901626938e-06, "loss": 0.1635, "step": 2455 }, { "epoch": 2.758388319528289, "grad_norm": 0.44123424149038765, "learning_rate": 9.933324994507984e-06, "loss": 0.1617, "step": 2456 }, { "epoch": 2.7595114418082267, "grad_norm": 0.4520594389268704, "learning_rate": 9.925481013772456e-06, "loss": 0.166, "step": 2457 }, { "epoch": 2.760634564088165, "grad_norm": 0.47723497756567185, "learning_rate": 9.91763707888928e-06, "loss": 0.1865, "step": 2458 }, { "epoch": 2.7617576863681035, "grad_norm": 0.4555339710437883, "learning_rate": 9.909793194684914e-06, "loss": 0.1806, "step": 2459 }, { "epoch": 2.7628808086480414, "grad_norm": 0.4636447483901148, "learning_rate": 9.901949365985787e-06, "loss": 0.1672, "step": 2460 }, { "epoch": 2.76400393092798, "grad_norm": 0.468526515916128, "learning_rate": 9.894105597618297e-06, "loss": 0.1739, "step": 2461 }, { "epoch": 2.7651270532079177, "grad_norm": 0.4436410518618047, "learning_rate": 9.886261894408798e-06, "loss": 0.1604, "step": 2462 }, { "epoch": 2.766250175487856, "grad_norm": 0.4584305461946224, "learning_rate": 9.878418261183606e-06, "loss": 0.1645, "step": 2463 }, { "epoch": 2.7673732977677945, "grad_norm": 0.48434226395422153, "learning_rate": 9.870574702768997e-06, "loss": 0.1757, "step": 2464 }, { "epoch": 2.7684964200477324, "grad_norm": 0.48156835406806114, "learning_rate": 9.862731223991196e-06, "loss": 0.1845, "step": 2465 }, { "epoch": 2.769619542327671, "grad_norm": 0.44955147765140463, "learning_rate": 9.854887829676382e-06, "loss": 0.1654, "step": 2466 }, { "epoch": 2.770742664607609, "grad_norm": 0.46769768486111885, "learning_rate": 9.847044524650689e-06, "loss": 0.1839, "step": 2467 }, { "epoch": 2.771865786887547, "grad_norm": 0.4802433910241491, "learning_rate": 9.839201313740179e-06, "loss": 0.1848, "step": 2468 }, { "epoch": 2.7729889091674855, "grad_norm": 0.4730216994614374, "learning_rate": 9.83135820177087e-06, "loss": 0.1784, "step": 2469 }, { "epoch": 2.774112031447424, "grad_norm": 0.49525261555778016, "learning_rate": 9.823515193568715e-06, "loss": 0.1855, "step": 2470 }, { "epoch": 2.775235153727362, "grad_norm": 0.44336826641873517, "learning_rate": 9.815672293959605e-06, "loss": 0.1696, "step": 2471 }, { "epoch": 2.7763582760073002, "grad_norm": 0.4157328847494749, "learning_rate": 9.807829507769362e-06, "loss": 0.1611, "step": 2472 }, { "epoch": 2.7774813982872386, "grad_norm": 0.42886202625170655, "learning_rate": 9.799986839823736e-06, "loss": 0.1618, "step": 2473 }, { "epoch": 2.7786045205671766, "grad_norm": 0.4677846081299561, "learning_rate": 9.792144294948408e-06, "loss": 0.1748, "step": 2474 }, { "epoch": 2.779727642847115, "grad_norm": 0.4583869711242806, "learning_rate": 9.784301877968982e-06, "loss": 0.1778, "step": 2475 }, { "epoch": 2.7808507651270533, "grad_norm": 0.4495391147428708, "learning_rate": 9.776459593710985e-06, "loss": 0.1697, "step": 2476 }, { "epoch": 2.7819738874069913, "grad_norm": 0.46075629132037005, "learning_rate": 9.768617446999862e-06, "loss": 0.1666, "step": 2477 }, { "epoch": 2.7830970096869296, "grad_norm": 0.4673513087773124, "learning_rate": 9.760775442660966e-06, "loss": 0.1815, "step": 2478 }, { "epoch": 2.784220131966868, "grad_norm": 0.4723634910358823, "learning_rate": 9.752933585519578e-06, "loss": 0.1796, "step": 2479 }, { "epoch": 2.785343254246806, "grad_norm": 0.467537758740658, "learning_rate": 9.74509188040087e-06, "loss": 0.1734, "step": 2480 }, { "epoch": 2.7864663765267443, "grad_norm": 0.4600429623850505, "learning_rate": 9.737250332129932e-06, "loss": 0.1653, "step": 2481 }, { "epoch": 2.7875894988066827, "grad_norm": 0.43060840156496566, "learning_rate": 9.72940894553176e-06, "loss": 0.1608, "step": 2482 }, { "epoch": 2.7887126210866207, "grad_norm": 0.4950921987658919, "learning_rate": 9.721567725431239e-06, "loss": 0.1915, "step": 2483 }, { "epoch": 2.789835743366559, "grad_norm": 0.44612774116221526, "learning_rate": 9.71372667665316e-06, "loss": 0.1669, "step": 2484 }, { "epoch": 2.7909588656464974, "grad_norm": 0.4585740919900049, "learning_rate": 9.705885804022207e-06, "loss": 0.1733, "step": 2485 }, { "epoch": 2.7920819879264354, "grad_norm": 0.47309965981508717, "learning_rate": 9.698045112362956e-06, "loss": 0.1752, "step": 2486 }, { "epoch": 2.7932051102063737, "grad_norm": 0.4541221114294698, "learning_rate": 9.690204606499875e-06, "loss": 0.1665, "step": 2487 }, { "epoch": 2.794328232486312, "grad_norm": 0.4816033103876313, "learning_rate": 9.682364291257304e-06, "loss": 0.1854, "step": 2488 }, { "epoch": 2.79545135476625, "grad_norm": 0.46392038210393416, "learning_rate": 9.674524171459478e-06, "loss": 0.1736, "step": 2489 }, { "epoch": 2.7965744770461884, "grad_norm": 0.49408949325331897, "learning_rate": 9.666684251930514e-06, "loss": 0.187, "step": 2490 }, { "epoch": 2.797697599326127, "grad_norm": 0.47463312580832867, "learning_rate": 9.658844537494396e-06, "loss": 0.1789, "step": 2491 }, { "epoch": 2.7988207216060648, "grad_norm": 0.48036454400289674, "learning_rate": 9.651005032974994e-06, "loss": 0.178, "step": 2492 }, { "epoch": 2.799943843886003, "grad_norm": 0.4386654951501184, "learning_rate": 9.64316574319603e-06, "loss": 0.1611, "step": 2493 }, { "epoch": 2.8010669661659415, "grad_norm": 0.46695275500508854, "learning_rate": 9.63532667298111e-06, "loss": 0.1771, "step": 2494 }, { "epoch": 2.8021900884458795, "grad_norm": 0.45981170005783556, "learning_rate": 9.627487827153704e-06, "loss": 0.1818, "step": 2495 }, { "epoch": 2.803313210725818, "grad_norm": 0.44791412818213827, "learning_rate": 9.619649210537136e-06, "loss": 0.1686, "step": 2496 }, { "epoch": 2.8044363330057562, "grad_norm": 0.4694605987355865, "learning_rate": 9.6118108279546e-06, "loss": 0.1804, "step": 2497 }, { "epoch": 2.805559455285694, "grad_norm": 0.46461819232187124, "learning_rate": 9.603972684229127e-06, "loss": 0.1718, "step": 2498 }, { "epoch": 2.8066825775656326, "grad_norm": 0.4640887745511831, "learning_rate": 9.59613478418362e-06, "loss": 0.1729, "step": 2499 }, { "epoch": 2.807805699845571, "grad_norm": 0.46473184965851605, "learning_rate": 9.588297132640824e-06, "loss": 0.1799, "step": 2500 }, { "epoch": 2.808928822125509, "grad_norm": 0.44859390204733884, "learning_rate": 9.580459734423334e-06, "loss": 0.1692, "step": 2501 }, { "epoch": 2.8100519444054473, "grad_norm": 0.48966480162143305, "learning_rate": 9.572622594353589e-06, "loss": 0.1863, "step": 2502 }, { "epoch": 2.8111750666853856, "grad_norm": 0.4707076422301409, "learning_rate": 9.564785717253862e-06, "loss": 0.1737, "step": 2503 }, { "epoch": 2.8122981889653236, "grad_norm": 0.4798544695330848, "learning_rate": 9.556949107946272e-06, "loss": 0.1762, "step": 2504 }, { "epoch": 2.813421311245262, "grad_norm": 0.4460329229033776, "learning_rate": 9.549112771252771e-06, "loss": 0.1645, "step": 2505 }, { "epoch": 2.8145444335252, "grad_norm": 0.46898014918292236, "learning_rate": 9.541276711995149e-06, "loss": 0.1773, "step": 2506 }, { "epoch": 2.8156675558051383, "grad_norm": 0.45833210070182195, "learning_rate": 9.53344093499501e-06, "loss": 0.1764, "step": 2507 }, { "epoch": 2.8167906780850767, "grad_norm": 0.4479886546084605, "learning_rate": 9.525605445073797e-06, "loss": 0.1648, "step": 2508 }, { "epoch": 2.8179138003650146, "grad_norm": 0.4701076235299984, "learning_rate": 9.517770247052775e-06, "loss": 0.1798, "step": 2509 }, { "epoch": 2.819036922644953, "grad_norm": 0.44716178784872795, "learning_rate": 9.509935345753026e-06, "loss": 0.1668, "step": 2510 }, { "epoch": 2.8201600449248914, "grad_norm": 0.4575598504302553, "learning_rate": 9.502100745995456e-06, "loss": 0.1781, "step": 2511 }, { "epoch": 2.8212831672048293, "grad_norm": 0.46438759653279443, "learning_rate": 9.494266452600771e-06, "loss": 0.1759, "step": 2512 }, { "epoch": 2.8224062894847677, "grad_norm": 0.4463213888843625, "learning_rate": 9.486432470389505e-06, "loss": 0.1739, "step": 2513 }, { "epoch": 2.8235294117647056, "grad_norm": 0.44363386556020745, "learning_rate": 9.47859880418199e-06, "loss": 0.1677, "step": 2514 }, { "epoch": 2.824652534044644, "grad_norm": 0.47574851732895207, "learning_rate": 9.470765458798369e-06, "loss": 0.1851, "step": 2515 }, { "epoch": 2.8257756563245824, "grad_norm": 0.47371348230350735, "learning_rate": 9.46293243905859e-06, "loss": 0.1728, "step": 2516 }, { "epoch": 2.8268987786045203, "grad_norm": 0.4529641523386059, "learning_rate": 9.455099749782387e-06, "loss": 0.171, "step": 2517 }, { "epoch": 2.8280219008844587, "grad_norm": 0.46948443592441114, "learning_rate": 9.447267395789304e-06, "loss": 0.1727, "step": 2518 }, { "epoch": 2.829145023164397, "grad_norm": 0.46792399231713533, "learning_rate": 9.439435381898674e-06, "loss": 0.1732, "step": 2519 }, { "epoch": 2.830268145444335, "grad_norm": 0.46471285429284986, "learning_rate": 9.431603712929623e-06, "loss": 0.1792, "step": 2520 }, { "epoch": 2.8313912677242734, "grad_norm": 0.4710096469718015, "learning_rate": 9.423772393701064e-06, "loss": 0.1761, "step": 2521 }, { "epoch": 2.832514390004212, "grad_norm": 0.45657758770291, "learning_rate": 9.415941429031693e-06, "loss": 0.1747, "step": 2522 }, { "epoch": 2.8336375122841497, "grad_norm": 0.4380363624369588, "learning_rate": 9.408110823739985e-06, "loss": 0.167, "step": 2523 }, { "epoch": 2.834760634564088, "grad_norm": 0.4654221971780646, "learning_rate": 9.400280582644204e-06, "loss": 0.1704, "step": 2524 }, { "epoch": 2.8358837568440265, "grad_norm": 0.4630864998764016, "learning_rate": 9.392450710562377e-06, "loss": 0.1747, "step": 2525 }, { "epoch": 2.8370068791239644, "grad_norm": 0.47819793126694177, "learning_rate": 9.384621212312316e-06, "loss": 0.1806, "step": 2526 }, { "epoch": 2.838130001403903, "grad_norm": 0.44989059188440433, "learning_rate": 9.376792092711593e-06, "loss": 0.1719, "step": 2527 }, { "epoch": 2.839253123683841, "grad_norm": 0.4493159352390759, "learning_rate": 9.368963356577554e-06, "loss": 0.1627, "step": 2528 }, { "epoch": 2.840376245963779, "grad_norm": 0.462068887667578, "learning_rate": 9.361135008727304e-06, "loss": 0.1744, "step": 2529 }, { "epoch": 2.8414993682437175, "grad_norm": 0.46328463138927967, "learning_rate": 9.353307053977717e-06, "loss": 0.1698, "step": 2530 }, { "epoch": 2.842622490523656, "grad_norm": 0.4535786753854389, "learning_rate": 9.345479497145417e-06, "loss": 0.1598, "step": 2531 }, { "epoch": 2.843745612803594, "grad_norm": 0.4659782442541839, "learning_rate": 9.337652343046782e-06, "loss": 0.1829, "step": 2532 }, { "epoch": 2.844868735083532, "grad_norm": 0.43264542173054715, "learning_rate": 9.32982559649795e-06, "loss": 0.1633, "step": 2533 }, { "epoch": 2.8459918573634706, "grad_norm": 0.4467290239010384, "learning_rate": 9.321999262314803e-06, "loss": 0.1668, "step": 2534 }, { "epoch": 2.8471149796434085, "grad_norm": 0.4546808256348493, "learning_rate": 9.314173345312972e-06, "loss": 0.1729, "step": 2535 }, { "epoch": 2.848238101923347, "grad_norm": 0.46681169539201894, "learning_rate": 9.30634785030783e-06, "loss": 0.1768, "step": 2536 }, { "epoch": 2.8493612242032853, "grad_norm": 0.48967009130593026, "learning_rate": 9.298522782114488e-06, "loss": 0.1876, "step": 2537 }, { "epoch": 2.8504843464832232, "grad_norm": 0.4476752114365759, "learning_rate": 9.290698145547796e-06, "loss": 0.1624, "step": 2538 }, { "epoch": 2.8516074687631616, "grad_norm": 0.478382573772851, "learning_rate": 9.282873945422341e-06, "loss": 0.1843, "step": 2539 }, { "epoch": 2.8527305910431, "grad_norm": 0.45741308655522045, "learning_rate": 9.27505018655244e-06, "loss": 0.1733, "step": 2540 }, { "epoch": 2.853853713323038, "grad_norm": 0.4683894832083663, "learning_rate": 9.267226873752137e-06, "loss": 0.1729, "step": 2541 }, { "epoch": 2.8549768356029763, "grad_norm": 0.4621198618194314, "learning_rate": 9.259404011835203e-06, "loss": 0.1683, "step": 2542 }, { "epoch": 2.8560999578829147, "grad_norm": 0.47870289959867185, "learning_rate": 9.251581605615128e-06, "loss": 0.1751, "step": 2543 }, { "epoch": 2.8572230801628526, "grad_norm": 0.48939618598564266, "learning_rate": 9.243759659905126e-06, "loss": 0.1851, "step": 2544 }, { "epoch": 2.858346202442791, "grad_norm": 0.48615941707120763, "learning_rate": 9.235938179518131e-06, "loss": 0.1825, "step": 2545 }, { "epoch": 2.8594693247227294, "grad_norm": 0.46996292093877523, "learning_rate": 9.228117169266782e-06, "loss": 0.1706, "step": 2546 }, { "epoch": 2.8605924470026673, "grad_norm": 0.4577345045610456, "learning_rate": 9.22029663396343e-06, "loss": 0.1731, "step": 2547 }, { "epoch": 2.8617155692826057, "grad_norm": 0.4593857700548628, "learning_rate": 9.21247657842014e-06, "loss": 0.1711, "step": 2548 }, { "epoch": 2.862838691562544, "grad_norm": 0.48560219199017984, "learning_rate": 9.204657007448678e-06, "loss": 0.1932, "step": 2549 }, { "epoch": 2.863961813842482, "grad_norm": 0.4747666905904359, "learning_rate": 9.196837925860516e-06, "loss": 0.1844, "step": 2550 }, { "epoch": 2.8650849361224204, "grad_norm": 0.4818639171590592, "learning_rate": 9.189019338466812e-06, "loss": 0.1791, "step": 2551 }, { "epoch": 2.866208058402359, "grad_norm": 0.4643395199305748, "learning_rate": 9.181201250078435e-06, "loss": 0.1704, "step": 2552 }, { "epoch": 2.8673311806822968, "grad_norm": 0.4611993802683832, "learning_rate": 9.173383665505937e-06, "loss": 0.1705, "step": 2553 }, { "epoch": 2.868454302962235, "grad_norm": 0.47676505658180535, "learning_rate": 9.165566589559568e-06, "loss": 0.1831, "step": 2554 }, { "epoch": 2.8695774252421735, "grad_norm": 0.4674787023765812, "learning_rate": 9.15775002704926e-06, "loss": 0.1632, "step": 2555 }, { "epoch": 2.8707005475221115, "grad_norm": 0.4905453400165015, "learning_rate": 9.14993398278463e-06, "loss": 0.1819, "step": 2556 }, { "epoch": 2.87182366980205, "grad_norm": 0.47733632103578205, "learning_rate": 9.142118461574971e-06, "loss": 0.1792, "step": 2557 }, { "epoch": 2.8729467920819878, "grad_norm": 0.4558817946417188, "learning_rate": 9.134303468229264e-06, "loss": 0.1792, "step": 2558 }, { "epoch": 2.874069914361926, "grad_norm": 0.47700983129939084, "learning_rate": 9.12648900755616e-06, "loss": 0.182, "step": 2559 }, { "epoch": 2.8751930366418645, "grad_norm": 0.4553018015995702, "learning_rate": 9.118675084363986e-06, "loss": 0.1667, "step": 2560 }, { "epoch": 2.8763161589218025, "grad_norm": 0.4593598102751128, "learning_rate": 9.110861703460727e-06, "loss": 0.1728, "step": 2561 }, { "epoch": 2.877439281201741, "grad_norm": 0.4648771153530072, "learning_rate": 9.103048869654047e-06, "loss": 0.1728, "step": 2562 }, { "epoch": 2.878562403481679, "grad_norm": 0.436076180749989, "learning_rate": 9.095236587751267e-06, "loss": 0.1615, "step": 2563 }, { "epoch": 2.879685525761617, "grad_norm": 0.4734747981831071, "learning_rate": 9.08742486255937e-06, "loss": 0.1802, "step": 2564 }, { "epoch": 2.8808086480415556, "grad_norm": 0.4877652162687277, "learning_rate": 9.079613698885002e-06, "loss": 0.1869, "step": 2565 }, { "epoch": 2.8819317703214935, "grad_norm": 0.45765721191013503, "learning_rate": 9.071803101534451e-06, "loss": 0.1755, "step": 2566 }, { "epoch": 2.883054892601432, "grad_norm": 0.45021196688829096, "learning_rate": 9.063993075313666e-06, "loss": 0.1727, "step": 2567 }, { "epoch": 2.8841780148813703, "grad_norm": 0.4522674069841106, "learning_rate": 9.056183625028243e-06, "loss": 0.166, "step": 2568 }, { "epoch": 2.885301137161308, "grad_norm": 0.46231761385594666, "learning_rate": 9.04837475548342e-06, "loss": 0.1808, "step": 2569 }, { "epoch": 2.8864242594412466, "grad_norm": 0.47474127509623437, "learning_rate": 9.040566471484085e-06, "loss": 0.183, "step": 2570 }, { "epoch": 2.887547381721185, "grad_norm": 0.45900525335603964, "learning_rate": 9.032758777834754e-06, "loss": 0.1737, "step": 2571 }, { "epoch": 2.888670504001123, "grad_norm": 0.45812880138007356, "learning_rate": 9.024951679339594e-06, "loss": 0.1677, "step": 2572 }, { "epoch": 2.8897936262810613, "grad_norm": 0.47586663132708334, "learning_rate": 9.017145180802393e-06, "loss": 0.1834, "step": 2573 }, { "epoch": 2.8909167485609997, "grad_norm": 0.4116685169052402, "learning_rate": 9.00933928702658e-06, "loss": 0.1496, "step": 2574 }, { "epoch": 2.8920398708409376, "grad_norm": 0.46861344574621616, "learning_rate": 9.001534002815209e-06, "loss": 0.1789, "step": 2575 }, { "epoch": 2.893162993120876, "grad_norm": 0.46679304982614994, "learning_rate": 8.993729332970948e-06, "loss": 0.1742, "step": 2576 }, { "epoch": 2.8942861154008144, "grad_norm": 0.4580429287886535, "learning_rate": 8.985925282296105e-06, "loss": 0.1838, "step": 2577 }, { "epoch": 2.8954092376807523, "grad_norm": 0.4569743853099299, "learning_rate": 8.978121855592593e-06, "loss": 0.1761, "step": 2578 }, { "epoch": 2.8965323599606907, "grad_norm": 0.4743176355550856, "learning_rate": 8.970319057661954e-06, "loss": 0.1831, "step": 2579 }, { "epoch": 2.897655482240629, "grad_norm": 0.4553016611576038, "learning_rate": 8.962516893305324e-06, "loss": 0.1692, "step": 2580 }, { "epoch": 2.898778604520567, "grad_norm": 0.45473502109690267, "learning_rate": 8.954715367323468e-06, "loss": 0.1678, "step": 2581 }, { "epoch": 2.8999017268005054, "grad_norm": 0.4807942668241952, "learning_rate": 8.946914484516748e-06, "loss": 0.1807, "step": 2582 }, { "epoch": 2.901024849080444, "grad_norm": 0.429941902019654, "learning_rate": 8.939114249685135e-06, "loss": 0.1516, "step": 2583 }, { "epoch": 2.9021479713603817, "grad_norm": 0.4653157438770528, "learning_rate": 8.931314667628201e-06, "loss": 0.1767, "step": 2584 }, { "epoch": 2.90327109364032, "grad_norm": 0.44312625971069464, "learning_rate": 8.923515743145113e-06, "loss": 0.1697, "step": 2585 }, { "epoch": 2.9043942159202585, "grad_norm": 0.43811033758732193, "learning_rate": 8.915717481034632e-06, "loss": 0.1604, "step": 2586 }, { "epoch": 2.9055173382001964, "grad_norm": 0.47216339814248476, "learning_rate": 8.907919886095115e-06, "loss": 0.1731, "step": 2587 }, { "epoch": 2.906640460480135, "grad_norm": 0.4621783530580539, "learning_rate": 8.900122963124513e-06, "loss": 0.1759, "step": 2588 }, { "epoch": 2.907763582760073, "grad_norm": 0.45497514323875493, "learning_rate": 8.892326716920356e-06, "loss": 0.1651, "step": 2589 }, { "epoch": 2.908886705040011, "grad_norm": 0.457851254832388, "learning_rate": 8.884531152279757e-06, "loss": 0.1729, "step": 2590 }, { "epoch": 2.9100098273199495, "grad_norm": 0.45822650406750187, "learning_rate": 8.876736273999415e-06, "loss": 0.1693, "step": 2591 }, { "epoch": 2.911132949599888, "grad_norm": 0.46856109234379667, "learning_rate": 8.868942086875605e-06, "loss": 0.1848, "step": 2592 }, { "epoch": 2.912256071879826, "grad_norm": 0.4576565366829723, "learning_rate": 8.861148595704176e-06, "loss": 0.1731, "step": 2593 }, { "epoch": 2.913379194159764, "grad_norm": 0.4523806979663706, "learning_rate": 8.853355805280553e-06, "loss": 0.1744, "step": 2594 }, { "epoch": 2.9145023164397026, "grad_norm": 0.45200684428131077, "learning_rate": 8.845563720399715e-06, "loss": 0.173, "step": 2595 }, { "epoch": 2.9156254387196405, "grad_norm": 0.47579624922930075, "learning_rate": 8.837772345856226e-06, "loss": 0.1917, "step": 2596 }, { "epoch": 2.916748560999579, "grad_norm": 0.45572922351885753, "learning_rate": 8.829981686444201e-06, "loss": 0.174, "step": 2597 }, { "epoch": 2.9178716832795173, "grad_norm": 0.4638260376523015, "learning_rate": 8.822191746957321e-06, "loss": 0.1868, "step": 2598 }, { "epoch": 2.9189948055594552, "grad_norm": 0.45148558403348366, "learning_rate": 8.814402532188824e-06, "loss": 0.1717, "step": 2599 }, { "epoch": 2.9201179278393936, "grad_norm": 0.4573379490292585, "learning_rate": 8.806614046931491e-06, "loss": 0.1713, "step": 2600 }, { "epoch": 2.921241050119332, "grad_norm": 0.4767926454940618, "learning_rate": 8.79882629597767e-06, "loss": 0.1763, "step": 2601 }, { "epoch": 2.92236417239927, "grad_norm": 0.48398291984858927, "learning_rate": 8.791039284119244e-06, "loss": 0.1898, "step": 2602 }, { "epoch": 2.9234872946792083, "grad_norm": 0.4645986763409629, "learning_rate": 8.783253016147652e-06, "loss": 0.1704, "step": 2603 }, { "epoch": 2.9246104169591467, "grad_norm": 0.5308394707341152, "learning_rate": 8.775467496853873e-06, "loss": 0.191, "step": 2604 }, { "epoch": 2.9257335392390846, "grad_norm": 0.4670451326042578, "learning_rate": 8.767682731028415e-06, "loss": 0.1711, "step": 2605 }, { "epoch": 2.926856661519023, "grad_norm": 0.438698398612858, "learning_rate": 8.759898723461333e-06, "loss": 0.1644, "step": 2606 }, { "epoch": 2.927979783798961, "grad_norm": 0.44189027854801244, "learning_rate": 8.752115478942213e-06, "loss": 0.1649, "step": 2607 }, { "epoch": 2.9291029060788993, "grad_norm": 0.4581859363210317, "learning_rate": 8.744333002260172e-06, "loss": 0.1708, "step": 2608 }, { "epoch": 2.9302260283588377, "grad_norm": 0.45676884931636114, "learning_rate": 8.736551298203854e-06, "loss": 0.1742, "step": 2609 }, { "epoch": 2.9313491506387757, "grad_norm": 0.4497025780421319, "learning_rate": 8.728770371561424e-06, "loss": 0.1651, "step": 2610 }, { "epoch": 2.932472272918714, "grad_norm": 0.4781813885457078, "learning_rate": 8.720990227120575e-06, "loss": 0.1815, "step": 2611 }, { "epoch": 2.933595395198652, "grad_norm": 0.46022287061474587, "learning_rate": 8.71321086966851e-06, "loss": 0.1807, "step": 2612 }, { "epoch": 2.9347185174785904, "grad_norm": 0.48446443471996553, "learning_rate": 8.705432303991967e-06, "loss": 0.1915, "step": 2613 }, { "epoch": 2.9358416397585287, "grad_norm": 0.46344897666541895, "learning_rate": 8.697654534877166e-06, "loss": 0.1857, "step": 2614 }, { "epoch": 2.9369647620384667, "grad_norm": 0.4425948152166924, "learning_rate": 8.689877567109861e-06, "loss": 0.1724, "step": 2615 }, { "epoch": 2.938087884318405, "grad_norm": 0.44998509556224764, "learning_rate": 8.682101405475308e-06, "loss": 0.1718, "step": 2616 }, { "epoch": 2.9392110065983434, "grad_norm": 0.4675922727964663, "learning_rate": 8.674326054758261e-06, "loss": 0.1771, "step": 2617 }, { "epoch": 2.9403341288782814, "grad_norm": 0.47264154820508586, "learning_rate": 8.666551519742988e-06, "loss": 0.1816, "step": 2618 }, { "epoch": 2.9414572511582198, "grad_norm": 0.46063885171682406, "learning_rate": 8.658777805213233e-06, "loss": 0.1744, "step": 2619 }, { "epoch": 2.942580373438158, "grad_norm": 0.4568310611912791, "learning_rate": 8.651004915952252e-06, "loss": 0.1779, "step": 2620 }, { "epoch": 2.943703495718096, "grad_norm": 0.48101099147671783, "learning_rate": 8.643232856742794e-06, "loss": 0.1912, "step": 2621 }, { "epoch": 2.9448266179980345, "grad_norm": 0.46646815088946425, "learning_rate": 8.635461632367087e-06, "loss": 0.1859, "step": 2622 }, { "epoch": 2.945949740277973, "grad_norm": 0.4601186273096568, "learning_rate": 8.627691247606862e-06, "loss": 0.1649, "step": 2623 }, { "epoch": 2.947072862557911, "grad_norm": 0.6344634698370505, "learning_rate": 8.619921707243308e-06, "loss": 0.1859, "step": 2624 }, { "epoch": 2.948195984837849, "grad_norm": 0.4461101591994598, "learning_rate": 8.612153016057114e-06, "loss": 0.1586, "step": 2625 }, { "epoch": 2.9493191071177876, "grad_norm": 0.43938035234144796, "learning_rate": 8.604385178828441e-06, "loss": 0.165, "step": 2626 }, { "epoch": 2.9504422293977255, "grad_norm": 0.44943044279325145, "learning_rate": 8.596618200336925e-06, "loss": 0.1764, "step": 2627 }, { "epoch": 2.951565351677664, "grad_norm": 0.4746096642980948, "learning_rate": 8.588852085361679e-06, "loss": 0.1748, "step": 2628 }, { "epoch": 2.9526884739576023, "grad_norm": 0.46112492600966043, "learning_rate": 8.58108683868127e-06, "loss": 0.1707, "step": 2629 }, { "epoch": 2.95381159623754, "grad_norm": 0.45177789162071297, "learning_rate": 8.573322465073746e-06, "loss": 0.1726, "step": 2630 }, { "epoch": 2.9549347185174786, "grad_norm": 0.450023349792469, "learning_rate": 8.565558969316607e-06, "loss": 0.1728, "step": 2631 }, { "epoch": 2.956057840797417, "grad_norm": 0.4770351172578664, "learning_rate": 8.557796356186818e-06, "loss": 0.1877, "step": 2632 }, { "epoch": 2.957180963077355, "grad_norm": 0.4513830973776379, "learning_rate": 8.550034630460806e-06, "loss": 0.1685, "step": 2633 }, { "epoch": 2.9583040853572933, "grad_norm": 0.4857445983384114, "learning_rate": 8.542273796914439e-06, "loss": 0.1718, "step": 2634 }, { "epoch": 2.9594272076372317, "grad_norm": 0.47068053305099244, "learning_rate": 8.534513860323047e-06, "loss": 0.1702, "step": 2635 }, { "epoch": 2.9605503299171696, "grad_norm": 0.47923791043815955, "learning_rate": 8.526754825461402e-06, "loss": 0.1811, "step": 2636 }, { "epoch": 2.961673452197108, "grad_norm": 0.4548065001435798, "learning_rate": 8.518996697103726e-06, "loss": 0.1756, "step": 2637 }, { "epoch": 2.9627965744770464, "grad_norm": 0.4460233301614257, "learning_rate": 8.511239480023686e-06, "loss": 0.1625, "step": 2638 }, { "epoch": 2.9639196967569843, "grad_norm": 0.4548163435033796, "learning_rate": 8.50348317899437e-06, "loss": 0.1658, "step": 2639 }, { "epoch": 2.9650428190369227, "grad_norm": 0.4655678091479219, "learning_rate": 8.495727798788323e-06, "loss": 0.1683, "step": 2640 }, { "epoch": 2.966165941316861, "grad_norm": 0.4680379062633164, "learning_rate": 8.487973344177517e-06, "loss": 0.1869, "step": 2641 }, { "epoch": 2.967289063596799, "grad_norm": 0.44275214343881863, "learning_rate": 8.48021981993335e-06, "loss": 0.1735, "step": 2642 }, { "epoch": 2.9684121858767374, "grad_norm": 0.4393359117417517, "learning_rate": 8.472467230826656e-06, "loss": 0.1596, "step": 2643 }, { "epoch": 2.9695353081566758, "grad_norm": 0.4738828812631789, "learning_rate": 8.464715581627682e-06, "loss": 0.176, "step": 2644 }, { "epoch": 2.9706584304366137, "grad_norm": 0.4573620252964248, "learning_rate": 8.456964877106104e-06, "loss": 0.1727, "step": 2645 }, { "epoch": 2.971781552716552, "grad_norm": 0.46158734380298694, "learning_rate": 8.44921512203102e-06, "loss": 0.1759, "step": 2646 }, { "epoch": 2.9729046749964905, "grad_norm": 0.4627319651256285, "learning_rate": 8.441466321170935e-06, "loss": 0.1755, "step": 2647 }, { "epoch": 2.9740277972764284, "grad_norm": 0.45445850820324185, "learning_rate": 8.433718479293777e-06, "loss": 0.1807, "step": 2648 }, { "epoch": 2.975150919556367, "grad_norm": 0.4428450885299262, "learning_rate": 8.425971601166872e-06, "loss": 0.1684, "step": 2649 }, { "epoch": 2.976274041836305, "grad_norm": 0.4541319747937619, "learning_rate": 8.418225691556962e-06, "loss": 0.1805, "step": 2650 }, { "epoch": 2.977397164116243, "grad_norm": 0.4573856493833637, "learning_rate": 8.41048075523019e-06, "loss": 0.1742, "step": 2651 }, { "epoch": 2.9785202863961815, "grad_norm": 0.4574620336290775, "learning_rate": 8.402736796952104e-06, "loss": 0.1684, "step": 2652 }, { "epoch": 2.97964340867612, "grad_norm": 0.4514989212744961, "learning_rate": 8.39499382148764e-06, "loss": 0.1741, "step": 2653 }, { "epoch": 2.980766530956058, "grad_norm": 0.4559456519447681, "learning_rate": 8.387251833601142e-06, "loss": 0.1761, "step": 2654 }, { "epoch": 2.981889653235996, "grad_norm": 0.45131114670300065, "learning_rate": 8.379510838056338e-06, "loss": 0.1718, "step": 2655 }, { "epoch": 2.983012775515934, "grad_norm": 0.44966905961207143, "learning_rate": 8.371770839616348e-06, "loss": 0.1717, "step": 2656 }, { "epoch": 2.9841358977958725, "grad_norm": 0.46120801970633646, "learning_rate": 8.364031843043683e-06, "loss": 0.1678, "step": 2657 }, { "epoch": 2.985259020075811, "grad_norm": 0.4421962165877643, "learning_rate": 8.356293853100223e-06, "loss": 0.1606, "step": 2658 }, { "epoch": 2.986382142355749, "grad_norm": 0.46999315292829197, "learning_rate": 8.348556874547242e-06, "loss": 0.181, "step": 2659 }, { "epoch": 2.987505264635687, "grad_norm": 0.4506186892281384, "learning_rate": 8.340820912145391e-06, "loss": 0.1712, "step": 2660 }, { "epoch": 2.9886283869156256, "grad_norm": 0.4626508396110205, "learning_rate": 8.333085970654691e-06, "loss": 0.1716, "step": 2661 }, { "epoch": 2.9897515091955635, "grad_norm": 0.45107479921301236, "learning_rate": 8.325352054834542e-06, "loss": 0.1762, "step": 2662 }, { "epoch": 2.990874631475502, "grad_norm": 0.4595705443642629, "learning_rate": 8.317619169443696e-06, "loss": 0.1746, "step": 2663 }, { "epoch": 2.99199775375544, "grad_norm": 0.4694949935187253, "learning_rate": 8.309887319240291e-06, "loss": 0.1747, "step": 2664 }, { "epoch": 2.9931208760353782, "grad_norm": 0.4822701328030329, "learning_rate": 8.302156508981816e-06, "loss": 0.177, "step": 2665 }, { "epoch": 2.9942439983153166, "grad_norm": 0.4623869423886324, "learning_rate": 8.294426743425125e-06, "loss": 0.1653, "step": 2666 }, { "epoch": 2.9953671205952546, "grad_norm": 0.48059683012598114, "learning_rate": 8.286698027326432e-06, "loss": 0.1763, "step": 2667 }, { "epoch": 2.996490242875193, "grad_norm": 0.4600386216074317, "learning_rate": 8.278970365441292e-06, "loss": 0.1671, "step": 2668 }, { "epoch": 2.9976133651551313, "grad_norm": 0.48001415671174696, "learning_rate": 8.271243762524627e-06, "loss": 0.1731, "step": 2669 }, { "epoch": 2.9987364874350693, "grad_norm": 0.4566979760209381, "learning_rate": 8.263518223330698e-06, "loss": 0.1827, "step": 2670 }, { "epoch": 2.9998596097150076, "grad_norm": 0.7596103631516063, "learning_rate": 8.255793752613115e-06, "loss": 0.2575, "step": 2671 }, { "epoch": 3.000982731994946, "grad_norm": 0.6259154790515346, "learning_rate": 8.248070355124832e-06, "loss": 0.1158, "step": 2672 }, { "epoch": 3.002105854274884, "grad_norm": 0.49503499354650105, "learning_rate": 8.240348035618138e-06, "loss": 0.0951, "step": 2673 }, { "epoch": 3.0032289765548223, "grad_norm": 0.4242302786603668, "learning_rate": 8.232626798844661e-06, "loss": 0.0806, "step": 2674 }, { "epoch": 3.0043520988347607, "grad_norm": 0.4269038054152872, "learning_rate": 8.224906649555365e-06, "loss": 0.0878, "step": 2675 }, { "epoch": 3.0054752211146987, "grad_norm": 0.4530862959158299, "learning_rate": 8.21718759250054e-06, "loss": 0.0855, "step": 2676 }, { "epoch": 3.006598343394637, "grad_norm": 0.4886548006885315, "learning_rate": 8.209469632429811e-06, "loss": 0.086, "step": 2677 }, { "epoch": 3.0077214656745754, "grad_norm": 0.554545304546421, "learning_rate": 8.201752774092118e-06, "loss": 0.0994, "step": 2678 }, { "epoch": 3.0088445879545134, "grad_norm": 0.5854424397857068, "learning_rate": 8.194037022235732e-06, "loss": 0.0839, "step": 2679 }, { "epoch": 3.0099677102344518, "grad_norm": 0.5920483383090562, "learning_rate": 8.18632238160824e-06, "loss": 0.0873, "step": 2680 }, { "epoch": 3.01109083251439, "grad_norm": 0.5324726059263992, "learning_rate": 8.178608856956547e-06, "loss": 0.0843, "step": 2681 }, { "epoch": 3.012213954794328, "grad_norm": 0.4991909599622551, "learning_rate": 8.17089645302687e-06, "loss": 0.087, "step": 2682 }, { "epoch": 3.0133370770742665, "grad_norm": 0.4832948640805594, "learning_rate": 8.163185174564731e-06, "loss": 0.0857, "step": 2683 }, { "epoch": 3.014460199354205, "grad_norm": 0.4362830353220936, "learning_rate": 8.155475026314966e-06, "loss": 0.0814, "step": 2684 }, { "epoch": 3.0155833216341428, "grad_norm": 0.424359667514001, "learning_rate": 8.147766013021716e-06, "loss": 0.0829, "step": 2685 }, { "epoch": 3.016706443914081, "grad_norm": 0.41618233800829574, "learning_rate": 8.140058139428425e-06, "loss": 0.0832, "step": 2686 }, { "epoch": 3.0178295661940195, "grad_norm": 0.40606151522127315, "learning_rate": 8.132351410277824e-06, "loss": 0.0858, "step": 2687 }, { "epoch": 3.0189526884739575, "grad_norm": 0.42085251727822387, "learning_rate": 8.124645830311954e-06, "loss": 0.0868, "step": 2688 }, { "epoch": 3.020075810753896, "grad_norm": 0.43662806253277353, "learning_rate": 8.116941404272142e-06, "loss": 0.0884, "step": 2689 }, { "epoch": 3.0211989330338342, "grad_norm": 0.4174983822990382, "learning_rate": 8.109238136899004e-06, "loss": 0.0849, "step": 2690 }, { "epoch": 3.022322055313772, "grad_norm": 0.41649346119332137, "learning_rate": 8.101536032932452e-06, "loss": 0.0791, "step": 2691 }, { "epoch": 3.0234451775937106, "grad_norm": 0.4363115097166906, "learning_rate": 8.093835097111668e-06, "loss": 0.0842, "step": 2692 }, { "epoch": 3.024568299873649, "grad_norm": 0.4507250434052767, "learning_rate": 8.086135334175126e-06, "loss": 0.0809, "step": 2693 }, { "epoch": 3.025691422153587, "grad_norm": 0.46412219030100466, "learning_rate": 8.078436748860572e-06, "loss": 0.0808, "step": 2694 }, { "epoch": 3.0268145444335253, "grad_norm": 0.46817713622059637, "learning_rate": 8.070739345905032e-06, "loss": 0.082, "step": 2695 }, { "epoch": 3.0279376667134636, "grad_norm": 0.4359811445564712, "learning_rate": 8.063043130044806e-06, "loss": 0.0816, "step": 2696 }, { "epoch": 3.0290607889934016, "grad_norm": 0.44282045477705045, "learning_rate": 8.055348106015455e-06, "loss": 0.0738, "step": 2697 }, { "epoch": 3.03018391127334, "grad_norm": 0.4687677335027077, "learning_rate": 8.047654278551814e-06, "loss": 0.083, "step": 2698 }, { "epoch": 3.031307033553278, "grad_norm": 0.45388563511970487, "learning_rate": 8.03996165238798e-06, "loss": 0.0811, "step": 2699 }, { "epoch": 3.0324301558332163, "grad_norm": 0.4511628549306528, "learning_rate": 8.032270232257312e-06, "loss": 0.0811, "step": 2700 }, { "epoch": 3.0335532781131547, "grad_norm": 0.4397177997054855, "learning_rate": 8.024580022892427e-06, "loss": 0.0836, "step": 2701 }, { "epoch": 3.0346764003930926, "grad_norm": 0.43549667655174495, "learning_rate": 8.01689102902519e-06, "loss": 0.0772, "step": 2702 }, { "epoch": 3.035799522673031, "grad_norm": 0.4197204578796909, "learning_rate": 8.009203255386723e-06, "loss": 0.0783, "step": 2703 }, { "epoch": 3.0369226449529694, "grad_norm": 0.42838286805922127, "learning_rate": 8.001516706707401e-06, "loss": 0.0821, "step": 2704 }, { "epoch": 3.0380457672329073, "grad_norm": 0.404870980751977, "learning_rate": 7.993831387716844e-06, "loss": 0.077, "step": 2705 }, { "epoch": 3.0391688895128457, "grad_norm": 0.4167100368716409, "learning_rate": 7.986147303143913e-06, "loss": 0.0761, "step": 2706 }, { "epoch": 3.040292011792784, "grad_norm": 0.40996511641383154, "learning_rate": 7.978464457716704e-06, "loss": 0.0802, "step": 2707 }, { "epoch": 3.041415134072722, "grad_norm": 0.44035804475350526, "learning_rate": 7.97078285616256e-06, "loss": 0.0787, "step": 2708 }, { "epoch": 3.0425382563526604, "grad_norm": 0.43789822457853483, "learning_rate": 7.963102503208058e-06, "loss": 0.0827, "step": 2709 }, { "epoch": 3.043661378632599, "grad_norm": 0.4463729892875018, "learning_rate": 7.955423403578998e-06, "loss": 0.081, "step": 2710 }, { "epoch": 3.0447845009125367, "grad_norm": 0.4593657665400022, "learning_rate": 7.947745562000421e-06, "loss": 0.0839, "step": 2711 }, { "epoch": 3.045907623192475, "grad_norm": 0.4426263669678776, "learning_rate": 7.940068983196581e-06, "loss": 0.0793, "step": 2712 }, { "epoch": 3.0470307454724135, "grad_norm": 0.4546022445193033, "learning_rate": 7.932393671890965e-06, "loss": 0.0823, "step": 2713 }, { "epoch": 3.0481538677523514, "grad_norm": 0.46071246386796366, "learning_rate": 7.924719632806274e-06, "loss": 0.0843, "step": 2714 }, { "epoch": 3.04927699003229, "grad_norm": 0.43222793934163595, "learning_rate": 7.917046870664431e-06, "loss": 0.0818, "step": 2715 }, { "epoch": 3.050400112312228, "grad_norm": 0.4468470825780685, "learning_rate": 7.909375390186572e-06, "loss": 0.0812, "step": 2716 }, { "epoch": 3.051523234592166, "grad_norm": 0.4288047135096686, "learning_rate": 7.90170519609304e-06, "loss": 0.0816, "step": 2717 }, { "epoch": 3.0526463568721045, "grad_norm": 0.445145856226943, "learning_rate": 7.894036293103393e-06, "loss": 0.0844, "step": 2718 }, { "epoch": 3.053769479152043, "grad_norm": 0.4565081459753225, "learning_rate": 7.88636868593639e-06, "loss": 0.0864, "step": 2719 }, { "epoch": 3.054892601431981, "grad_norm": 0.4515132111689418, "learning_rate": 7.878702379309992e-06, "loss": 0.0874, "step": 2720 }, { "epoch": 3.056015723711919, "grad_norm": 0.44344069780688466, "learning_rate": 7.871037377941367e-06, "loss": 0.0808, "step": 2721 }, { "epoch": 3.057138845991857, "grad_norm": 0.448178193139511, "learning_rate": 7.863373686546868e-06, "loss": 0.0855, "step": 2722 }, { "epoch": 3.0582619682717955, "grad_norm": 0.3940384087377325, "learning_rate": 7.855711309842054e-06, "loss": 0.0763, "step": 2723 }, { "epoch": 3.059385090551734, "grad_norm": 0.45681289631612565, "learning_rate": 7.848050252541666e-06, "loss": 0.0845, "step": 2724 }, { "epoch": 3.060508212831672, "grad_norm": 0.4362986797057327, "learning_rate": 7.840390519359644e-06, "loss": 0.0772, "step": 2725 }, { "epoch": 3.0616313351116102, "grad_norm": 0.43138388625168317, "learning_rate": 7.832732115009096e-06, "loss": 0.0817, "step": 2726 }, { "epoch": 3.0627544573915486, "grad_norm": 0.4242754319993129, "learning_rate": 7.825075044202329e-06, "loss": 0.079, "step": 2727 }, { "epoch": 3.0638775796714866, "grad_norm": 0.4371286146302931, "learning_rate": 7.817419311650819e-06, "loss": 0.0835, "step": 2728 }, { "epoch": 3.065000701951425, "grad_norm": 0.42440135053295946, "learning_rate": 7.809764922065226e-06, "loss": 0.0774, "step": 2729 }, { "epoch": 3.0661238242313633, "grad_norm": 0.43591296320616574, "learning_rate": 7.802111880155382e-06, "loss": 0.0812, "step": 2730 }, { "epoch": 3.0672469465113013, "grad_norm": 0.4264395948747793, "learning_rate": 7.794460190630283e-06, "loss": 0.0785, "step": 2731 }, { "epoch": 3.0683700687912396, "grad_norm": 0.47067358685640387, "learning_rate": 7.786809858198096e-06, "loss": 0.0835, "step": 2732 }, { "epoch": 3.069493191071178, "grad_norm": 0.45020780231345264, "learning_rate": 7.779160887566161e-06, "loss": 0.0842, "step": 2733 }, { "epoch": 3.070616313351116, "grad_norm": 0.4598985349986955, "learning_rate": 7.77151328344097e-06, "loss": 0.0818, "step": 2734 }, { "epoch": 3.0717394356310543, "grad_norm": 0.44764895115040065, "learning_rate": 7.763867050528184e-06, "loss": 0.0845, "step": 2735 }, { "epoch": 3.0728625579109927, "grad_norm": 0.45919564184903605, "learning_rate": 7.756222193532606e-06, "loss": 0.078, "step": 2736 }, { "epoch": 3.0739856801909307, "grad_norm": 0.43399382955613974, "learning_rate": 7.748578717158204e-06, "loss": 0.0796, "step": 2737 }, { "epoch": 3.075108802470869, "grad_norm": 0.4415661509626846, "learning_rate": 7.74093662610809e-06, "loss": 0.081, "step": 2738 }, { "epoch": 3.0762319247508074, "grad_norm": 0.4623493322073683, "learning_rate": 7.733295925084534e-06, "loss": 0.0787, "step": 2739 }, { "epoch": 3.0773550470307454, "grad_norm": 0.42863153572886326, "learning_rate": 7.725656618788938e-06, "loss": 0.0822, "step": 2740 }, { "epoch": 3.0784781693106837, "grad_norm": 0.4471791832208811, "learning_rate": 7.718018711921852e-06, "loss": 0.0832, "step": 2741 }, { "epoch": 3.079601291590622, "grad_norm": 0.4478220162373831, "learning_rate": 7.710382209182964e-06, "loss": 0.0874, "step": 2742 }, { "epoch": 3.08072441387056, "grad_norm": 0.4214162588600572, "learning_rate": 7.702747115271098e-06, "loss": 0.0826, "step": 2743 }, { "epoch": 3.0818475361504984, "grad_norm": 0.4469938818454545, "learning_rate": 7.695113434884214e-06, "loss": 0.0804, "step": 2744 }, { "epoch": 3.082970658430437, "grad_norm": 0.46167710102896964, "learning_rate": 7.687481172719402e-06, "loss": 0.0819, "step": 2745 }, { "epoch": 3.0840937807103748, "grad_norm": 0.4605465891709998, "learning_rate": 7.679850333472867e-06, "loss": 0.0855, "step": 2746 }, { "epoch": 3.085216902990313, "grad_norm": 0.43521315545514894, "learning_rate": 7.672220921839955e-06, "loss": 0.0736, "step": 2747 }, { "epoch": 3.086340025270251, "grad_norm": 0.4604041931459674, "learning_rate": 7.664592942515125e-06, "loss": 0.0819, "step": 2748 }, { "epoch": 3.0874631475501895, "grad_norm": 0.4352802112893712, "learning_rate": 7.656966400191956e-06, "loss": 0.0767, "step": 2749 }, { "epoch": 3.088586269830128, "grad_norm": 0.43183285295275914, "learning_rate": 7.649341299563151e-06, "loss": 0.0815, "step": 2750 }, { "epoch": 3.089709392110066, "grad_norm": 0.4398374191229837, "learning_rate": 7.641717645320508e-06, "loss": 0.0826, "step": 2751 }, { "epoch": 3.090832514390004, "grad_norm": 0.44375949908128276, "learning_rate": 7.634095442154949e-06, "loss": 0.0796, "step": 2752 }, { "epoch": 3.0919556366699426, "grad_norm": 0.4341552574208931, "learning_rate": 7.626474694756501e-06, "loss": 0.0798, "step": 2753 }, { "epoch": 3.0930787589498805, "grad_norm": 0.45510308680572736, "learning_rate": 7.6188554078142915e-06, "loss": 0.0818, "step": 2754 }, { "epoch": 3.094201881229819, "grad_norm": 0.4375905218338109, "learning_rate": 7.611237586016558e-06, "loss": 0.0811, "step": 2755 }, { "epoch": 3.0953250035097573, "grad_norm": 0.4441137124518761, "learning_rate": 7.60362123405062e-06, "loss": 0.079, "step": 2756 }, { "epoch": 3.096448125789695, "grad_norm": 0.4502413713509533, "learning_rate": 7.596006356602908e-06, "loss": 0.0797, "step": 2757 }, { "epoch": 3.0975712480696336, "grad_norm": 0.42048708566816295, "learning_rate": 7.58839295835894e-06, "loss": 0.0769, "step": 2758 }, { "epoch": 3.098694370349572, "grad_norm": 0.4573125387642322, "learning_rate": 7.580781044003324e-06, "loss": 0.0799, "step": 2759 }, { "epoch": 3.09981749262951, "grad_norm": 0.506317107529294, "learning_rate": 7.573170618219754e-06, "loss": 0.0858, "step": 2760 }, { "epoch": 3.1009406149094483, "grad_norm": 0.43658684401047776, "learning_rate": 7.565561685691008e-06, "loss": 0.0797, "step": 2761 }, { "epoch": 3.1020637371893867, "grad_norm": 0.42157837097318585, "learning_rate": 7.557954251098946e-06, "loss": 0.0785, "step": 2762 }, { "epoch": 3.1031868594693246, "grad_norm": 0.43891360296769083, "learning_rate": 7.550348319124506e-06, "loss": 0.0801, "step": 2763 }, { "epoch": 3.104309981749263, "grad_norm": 0.4616891997168113, "learning_rate": 7.5427438944477086e-06, "loss": 0.0794, "step": 2764 }, { "epoch": 3.1054331040292014, "grad_norm": 0.43768658044133474, "learning_rate": 7.535140981747627e-06, "loss": 0.0789, "step": 2765 }, { "epoch": 3.1065562263091393, "grad_norm": 0.43116002218960375, "learning_rate": 7.527539585702426e-06, "loss": 0.0766, "step": 2766 }, { "epoch": 3.1076793485890777, "grad_norm": 0.46226193562781304, "learning_rate": 7.519939710989326e-06, "loss": 0.0861, "step": 2767 }, { "epoch": 3.108802470869016, "grad_norm": 0.4584385515521631, "learning_rate": 7.512341362284612e-06, "loss": 0.0824, "step": 2768 }, { "epoch": 3.109925593148954, "grad_norm": 0.42541505578878597, "learning_rate": 7.504744544263639e-06, "loss": 0.0793, "step": 2769 }, { "epoch": 3.1110487154288924, "grad_norm": 0.44149954225470445, "learning_rate": 7.497149261600803e-06, "loss": 0.0761, "step": 2770 }, { "epoch": 3.1121718377088303, "grad_norm": 0.4367339709220696, "learning_rate": 7.489555518969568e-06, "loss": 0.0768, "step": 2771 }, { "epoch": 3.1132949599887687, "grad_norm": 0.44918411334128344, "learning_rate": 7.481963321042449e-06, "loss": 0.0792, "step": 2772 }, { "epoch": 3.114418082268707, "grad_norm": 0.44367325496090004, "learning_rate": 7.474372672491008e-06, "loss": 0.084, "step": 2773 }, { "epoch": 3.115541204548645, "grad_norm": 0.4398077189907013, "learning_rate": 7.4667835779858585e-06, "loss": 0.0823, "step": 2774 }, { "epoch": 3.1166643268285834, "grad_norm": 0.4690551629213953, "learning_rate": 7.459196042196647e-06, "loss": 0.08, "step": 2775 }, { "epoch": 3.117787449108522, "grad_norm": 0.4473577302083683, "learning_rate": 7.45161006979207e-06, "loss": 0.0799, "step": 2776 }, { "epoch": 3.1189105713884597, "grad_norm": 0.4388023068250219, "learning_rate": 7.444025665439862e-06, "loss": 0.0795, "step": 2777 }, { "epoch": 3.120033693668398, "grad_norm": 0.4491722860474616, "learning_rate": 7.43644283380679e-06, "loss": 0.0777, "step": 2778 }, { "epoch": 3.1211568159483365, "grad_norm": 0.45449915733821294, "learning_rate": 7.428861579558653e-06, "loss": 0.0792, "step": 2779 }, { "epoch": 3.1222799382282744, "grad_norm": 0.44290763583095166, "learning_rate": 7.42128190736028e-06, "loss": 0.0767, "step": 2780 }, { "epoch": 3.123403060508213, "grad_norm": 0.46051935717803083, "learning_rate": 7.413703821875526e-06, "loss": 0.0818, "step": 2781 }, { "epoch": 3.124526182788151, "grad_norm": 0.4400112926339292, "learning_rate": 7.406127327767272e-06, "loss": 0.0804, "step": 2782 }, { "epoch": 3.125649305068089, "grad_norm": 0.4589606315894283, "learning_rate": 7.398552429697416e-06, "loss": 0.0829, "step": 2783 }, { "epoch": 3.1267724273480275, "grad_norm": 0.43320506570345907, "learning_rate": 7.390979132326881e-06, "loss": 0.0803, "step": 2784 }, { "epoch": 3.127895549627966, "grad_norm": 0.43767705767402443, "learning_rate": 7.383407440315595e-06, "loss": 0.0797, "step": 2785 }, { "epoch": 3.129018671907904, "grad_norm": 0.4551489448501023, "learning_rate": 7.375837358322504e-06, "loss": 0.0803, "step": 2786 }, { "epoch": 3.130141794187842, "grad_norm": 0.4483555435289511, "learning_rate": 7.368268891005565e-06, "loss": 0.0851, "step": 2787 }, { "epoch": 3.1312649164677806, "grad_norm": 0.44696608573573154, "learning_rate": 7.360702043021738e-06, "loss": 0.0824, "step": 2788 }, { "epoch": 3.1323880387477185, "grad_norm": 0.4175887163998629, "learning_rate": 7.353136819026991e-06, "loss": 0.081, "step": 2789 }, { "epoch": 3.133511161027657, "grad_norm": 0.42858708958351066, "learning_rate": 7.345573223676284e-06, "loss": 0.0793, "step": 2790 }, { "epoch": 3.1346342833075953, "grad_norm": 0.4315573518809542, "learning_rate": 7.338011261623583e-06, "loss": 0.0749, "step": 2791 }, { "epoch": 3.1357574055875332, "grad_norm": 0.46061705300883715, "learning_rate": 7.330450937521847e-06, "loss": 0.0802, "step": 2792 }, { "epoch": 3.1368805278674716, "grad_norm": 0.45499828672626436, "learning_rate": 7.322892256023025e-06, "loss": 0.082, "step": 2793 }, { "epoch": 3.13800365014741, "grad_norm": 0.45703139441971474, "learning_rate": 7.315335221778064e-06, "loss": 0.0837, "step": 2794 }, { "epoch": 3.139126772427348, "grad_norm": 0.44422721534446546, "learning_rate": 7.307779839436878e-06, "loss": 0.0789, "step": 2795 }, { "epoch": 3.1402498947072863, "grad_norm": 0.432747277056686, "learning_rate": 7.300226113648384e-06, "loss": 0.0814, "step": 2796 }, { "epoch": 3.1413730169872247, "grad_norm": 0.4354763866952526, "learning_rate": 7.292674049060473e-06, "loss": 0.0809, "step": 2797 }, { "epoch": 3.1424961392671626, "grad_norm": 0.42129390641539777, "learning_rate": 7.285123650320017e-06, "loss": 0.0759, "step": 2798 }, { "epoch": 3.143619261547101, "grad_norm": 0.4299331690685501, "learning_rate": 7.277574922072847e-06, "loss": 0.0811, "step": 2799 }, { "epoch": 3.144742383827039, "grad_norm": 0.4224270029033599, "learning_rate": 7.27002786896379e-06, "loss": 0.076, "step": 2800 }, { "epoch": 3.1458655061069773, "grad_norm": 0.42177062287547257, "learning_rate": 7.262482495636627e-06, "loss": 0.08, "step": 2801 }, { "epoch": 3.1469886283869157, "grad_norm": 0.44438476148551986, "learning_rate": 7.254938806734108e-06, "loss": 0.0826, "step": 2802 }, { "epoch": 3.1481117506668537, "grad_norm": 0.44860619869928176, "learning_rate": 7.247396806897953e-06, "loss": 0.0795, "step": 2803 }, { "epoch": 3.149234872946792, "grad_norm": 0.45127852568778315, "learning_rate": 7.239856500768829e-06, "loss": 0.084, "step": 2804 }, { "epoch": 3.1503579952267304, "grad_norm": 0.4516223814508983, "learning_rate": 7.232317892986376e-06, "loss": 0.0781, "step": 2805 }, { "epoch": 3.1514811175066684, "grad_norm": 0.45355410301989446, "learning_rate": 7.2247809881891805e-06, "loss": 0.0839, "step": 2806 }, { "epoch": 3.1526042397866068, "grad_norm": 0.4772117453912696, "learning_rate": 7.217245791014782e-06, "loss": 0.0804, "step": 2807 }, { "epoch": 3.153727362066545, "grad_norm": 0.4208304719348036, "learning_rate": 7.2097123060996764e-06, "loss": 0.0701, "step": 2808 }, { "epoch": 3.154850484346483, "grad_norm": 0.44934404159731467, "learning_rate": 7.20218053807929e-06, "loss": 0.0809, "step": 2809 }, { "epoch": 3.1559736066264215, "grad_norm": 0.4313912104136727, "learning_rate": 7.194650491588007e-06, "loss": 0.0771, "step": 2810 }, { "epoch": 3.15709672890636, "grad_norm": 0.4758022567163495, "learning_rate": 7.1871221712591474e-06, "loss": 0.087, "step": 2811 }, { "epoch": 3.1582198511862978, "grad_norm": 0.4145445019750945, "learning_rate": 7.179595581724971e-06, "loss": 0.0699, "step": 2812 }, { "epoch": 3.159342973466236, "grad_norm": 0.44264907459716, "learning_rate": 7.1720707276166736e-06, "loss": 0.0758, "step": 2813 }, { "epoch": 3.1604660957461745, "grad_norm": 0.4482812080110177, "learning_rate": 7.164547613564374e-06, "loss": 0.0803, "step": 2814 }, { "epoch": 3.1615892180261125, "grad_norm": 0.43483597301420784, "learning_rate": 7.157026244197132e-06, "loss": 0.0806, "step": 2815 }, { "epoch": 3.162712340306051, "grad_norm": 0.4958849116746515, "learning_rate": 7.149506624142924e-06, "loss": 0.0844, "step": 2816 }, { "epoch": 3.1638354625859892, "grad_norm": 0.4628156948836129, "learning_rate": 7.1419887580286615e-06, "loss": 0.0792, "step": 2817 }, { "epoch": 3.164958584865927, "grad_norm": 0.44851043959036796, "learning_rate": 7.13447265048017e-06, "loss": 0.0815, "step": 2818 }, { "epoch": 3.1660817071458656, "grad_norm": 0.4600530891320242, "learning_rate": 7.126958306122186e-06, "loss": 0.0847, "step": 2819 }, { "epoch": 3.1672048294258035, "grad_norm": 0.43714011172496053, "learning_rate": 7.119445729578374e-06, "loss": 0.0779, "step": 2820 }, { "epoch": 3.168327951705742, "grad_norm": 0.4453922999565407, "learning_rate": 7.111934925471302e-06, "loss": 0.0812, "step": 2821 }, { "epoch": 3.1694510739856803, "grad_norm": 0.45599177872389823, "learning_rate": 7.1044258984224524e-06, "loss": 0.081, "step": 2822 }, { "epoch": 3.170574196265618, "grad_norm": 0.44089126347694846, "learning_rate": 7.096918653052214e-06, "loss": 0.0771, "step": 2823 }, { "epoch": 3.1716973185455566, "grad_norm": 0.43082375817441887, "learning_rate": 7.089413193979874e-06, "loss": 0.0778, "step": 2824 }, { "epoch": 3.172820440825495, "grad_norm": 0.4425588712821237, "learning_rate": 7.081909525823625e-06, "loss": 0.0821, "step": 2825 }, { "epoch": 3.173943563105433, "grad_norm": 0.4382912863146195, "learning_rate": 7.074407653200559e-06, "loss": 0.0807, "step": 2826 }, { "epoch": 3.1750666853853713, "grad_norm": 0.45424971866156333, "learning_rate": 7.066907580726656e-06, "loss": 0.082, "step": 2827 }, { "epoch": 3.1761898076653097, "grad_norm": 0.41419138678049927, "learning_rate": 7.059409313016798e-06, "loss": 0.072, "step": 2828 }, { "epoch": 3.1773129299452476, "grad_norm": 0.44000690960252986, "learning_rate": 7.051912854684748e-06, "loss": 0.0808, "step": 2829 }, { "epoch": 3.178436052225186, "grad_norm": 0.44970359244435276, "learning_rate": 7.044418210343161e-06, "loss": 0.086, "step": 2830 }, { "epoch": 3.1795591745051244, "grad_norm": 0.4443948815798674, "learning_rate": 7.036925384603572e-06, "loss": 0.079, "step": 2831 }, { "epoch": 3.1806822967850623, "grad_norm": 0.4554250454666911, "learning_rate": 7.029434382076408e-06, "loss": 0.0836, "step": 2832 }, { "epoch": 3.1818054190650007, "grad_norm": 0.4354644467232646, "learning_rate": 7.021945207370951e-06, "loss": 0.0763, "step": 2833 }, { "epoch": 3.182928541344939, "grad_norm": 0.44780451430095414, "learning_rate": 7.014457865095382e-06, "loss": 0.0798, "step": 2834 }, { "epoch": 3.184051663624877, "grad_norm": 0.44114833638296336, "learning_rate": 7.006972359856743e-06, "loss": 0.0837, "step": 2835 }, { "epoch": 3.1851747859048154, "grad_norm": 0.4571651151911437, "learning_rate": 6.999488696260947e-06, "loss": 0.0843, "step": 2836 }, { "epoch": 3.186297908184754, "grad_norm": 0.45247999536502137, "learning_rate": 6.99200687891278e-06, "loss": 0.0864, "step": 2837 }, { "epoch": 3.1874210304646917, "grad_norm": 0.4439747981411603, "learning_rate": 6.984526912415878e-06, "loss": 0.0815, "step": 2838 }, { "epoch": 3.18854415274463, "grad_norm": 0.43498072881051186, "learning_rate": 6.97704880137275e-06, "loss": 0.0822, "step": 2839 }, { "epoch": 3.1896672750245685, "grad_norm": 0.44156927868282614, "learning_rate": 6.96957255038476e-06, "loss": 0.0736, "step": 2840 }, { "epoch": 3.1907903973045064, "grad_norm": 0.4575306361920231, "learning_rate": 6.962098164052129e-06, "loss": 0.0792, "step": 2841 }, { "epoch": 3.191913519584445, "grad_norm": 0.4503884973223931, "learning_rate": 6.954625646973931e-06, "loss": 0.0811, "step": 2842 }, { "epoch": 3.193036641864383, "grad_norm": 0.4419481555121357, "learning_rate": 6.947155003748083e-06, "loss": 0.0812, "step": 2843 }, { "epoch": 3.194159764144321, "grad_norm": 0.4517228406602635, "learning_rate": 6.939686238971356e-06, "loss": 0.0784, "step": 2844 }, { "epoch": 3.1952828864242595, "grad_norm": 0.44544948534956574, "learning_rate": 6.932219357239362e-06, "loss": 0.0816, "step": 2845 }, { "epoch": 3.196406008704198, "grad_norm": 0.445448430389794, "learning_rate": 6.924754363146559e-06, "loss": 0.0897, "step": 2846 }, { "epoch": 3.197529130984136, "grad_norm": 0.4576732225943825, "learning_rate": 6.917291261286239e-06, "loss": 0.0908, "step": 2847 }, { "epoch": 3.198652253264074, "grad_norm": 0.4479613547612505, "learning_rate": 6.909830056250527e-06, "loss": 0.0784, "step": 2848 }, { "epoch": 3.1997753755440126, "grad_norm": 0.4335107505865122, "learning_rate": 6.902370752630387e-06, "loss": 0.0815, "step": 2849 }, { "epoch": 3.2008984978239505, "grad_norm": 0.42987674122594727, "learning_rate": 6.894913355015611e-06, "loss": 0.0803, "step": 2850 }, { "epoch": 3.202021620103889, "grad_norm": 0.4432200727897701, "learning_rate": 6.887457867994819e-06, "loss": 0.078, "step": 2851 }, { "epoch": 3.203144742383827, "grad_norm": 0.4229390999012076, "learning_rate": 6.880004296155456e-06, "loss": 0.077, "step": 2852 }, { "epoch": 3.2042678646637652, "grad_norm": 0.4424366303313029, "learning_rate": 6.872552644083779e-06, "loss": 0.0838, "step": 2853 }, { "epoch": 3.2053909869437036, "grad_norm": 0.4483537337256006, "learning_rate": 6.865102916364876e-06, "loss": 0.0813, "step": 2854 }, { "epoch": 3.2065141092236416, "grad_norm": 0.4459517380181533, "learning_rate": 6.857655117582647e-06, "loss": 0.0856, "step": 2855 }, { "epoch": 3.20763723150358, "grad_norm": 0.45366324323089674, "learning_rate": 6.850209252319804e-06, "loss": 0.0814, "step": 2856 }, { "epoch": 3.2087603537835183, "grad_norm": 0.46685596344158564, "learning_rate": 6.842765325157874e-06, "loss": 0.0829, "step": 2857 }, { "epoch": 3.2098834760634563, "grad_norm": 0.45685853191410364, "learning_rate": 6.83532334067718e-06, "loss": 0.0819, "step": 2858 }, { "epoch": 3.2110065983433946, "grad_norm": 0.450175843547502, "learning_rate": 6.8278833034568595e-06, "loss": 0.082, "step": 2859 }, { "epoch": 3.212129720623333, "grad_norm": 0.45393030619312513, "learning_rate": 6.820445218074849e-06, "loss": 0.0838, "step": 2860 }, { "epoch": 3.213252842903271, "grad_norm": 0.4349893777321262, "learning_rate": 6.813009089107887e-06, "loss": 0.0756, "step": 2861 }, { "epoch": 3.2143759651832093, "grad_norm": 0.42814105018993454, "learning_rate": 6.805574921131506e-06, "loss": 0.0789, "step": 2862 }, { "epoch": 3.2154990874631477, "grad_norm": 0.4538034830964177, "learning_rate": 6.798142718720027e-06, "loss": 0.0844, "step": 2863 }, { "epoch": 3.2166222097430857, "grad_norm": 0.45315166207455276, "learning_rate": 6.790712486446567e-06, "loss": 0.0855, "step": 2864 }, { "epoch": 3.217745332023024, "grad_norm": 0.4527713015472594, "learning_rate": 6.783284228883029e-06, "loss": 0.0794, "step": 2865 }, { "epoch": 3.2188684543029624, "grad_norm": 0.4292889184279728, "learning_rate": 6.775857950600107e-06, "loss": 0.0723, "step": 2866 }, { "epoch": 3.2199915765829004, "grad_norm": 0.4384777852700717, "learning_rate": 6.768433656167267e-06, "loss": 0.0801, "step": 2867 }, { "epoch": 3.2211146988628387, "grad_norm": 0.4749472746328939, "learning_rate": 6.76101135015276e-06, "loss": 0.0853, "step": 2868 }, { "epoch": 3.222237821142777, "grad_norm": 0.447676493402192, "learning_rate": 6.7535910371236105e-06, "loss": 0.085, "step": 2869 }, { "epoch": 3.223360943422715, "grad_norm": 0.4448626847467645, "learning_rate": 6.746172721645625e-06, "loss": 0.0845, "step": 2870 }, { "epoch": 3.2244840657026534, "grad_norm": 0.4369101325966336, "learning_rate": 6.73875640828337e-06, "loss": 0.0789, "step": 2871 }, { "epoch": 3.2256071879825914, "grad_norm": 0.4650100486596405, "learning_rate": 6.731342101600183e-06, "loss": 0.0846, "step": 2872 }, { "epoch": 3.2267303102625298, "grad_norm": 0.45213980309912594, "learning_rate": 6.7239298061581716e-06, "loss": 0.0821, "step": 2873 }, { "epoch": 3.227853432542468, "grad_norm": 0.4660318930820804, "learning_rate": 6.716519526518201e-06, "loss": 0.0803, "step": 2874 }, { "epoch": 3.228976554822406, "grad_norm": 0.43454024938308855, "learning_rate": 6.7091112672399e-06, "loss": 0.0792, "step": 2875 }, { "epoch": 3.2300996771023445, "grad_norm": 0.44078038350930815, "learning_rate": 6.701705032881654e-06, "loss": 0.0798, "step": 2876 }, { "epoch": 3.231222799382283, "grad_norm": 0.452985339480439, "learning_rate": 6.694300828000594e-06, "loss": 0.0811, "step": 2877 }, { "epoch": 3.232345921662221, "grad_norm": 0.4432797452038033, "learning_rate": 6.686898657152612e-06, "loss": 0.0768, "step": 2878 }, { "epoch": 3.233469043942159, "grad_norm": 0.43857516156481385, "learning_rate": 6.679498524892345e-06, "loss": 0.0776, "step": 2879 }, { "epoch": 3.2345921662220976, "grad_norm": 0.46446080341677276, "learning_rate": 6.672100435773176e-06, "loss": 0.0847, "step": 2880 }, { "epoch": 3.2357152885020355, "grad_norm": 0.45625351695476635, "learning_rate": 6.664704394347235e-06, "loss": 0.0803, "step": 2881 }, { "epoch": 3.236838410781974, "grad_norm": 0.45358260601300854, "learning_rate": 6.657310405165379e-06, "loss": 0.0794, "step": 2882 }, { "epoch": 3.2379615330619123, "grad_norm": 0.430765669099815, "learning_rate": 6.649918472777216e-06, "loss": 0.078, "step": 2883 }, { "epoch": 3.23908465534185, "grad_norm": 0.4417554716567722, "learning_rate": 6.642528601731082e-06, "loss": 0.0799, "step": 2884 }, { "epoch": 3.2402077776217886, "grad_norm": 0.4426439861111362, "learning_rate": 6.6351407965740465e-06, "loss": 0.0736, "step": 2885 }, { "epoch": 3.241330899901727, "grad_norm": 0.4461344386327623, "learning_rate": 6.627755061851911e-06, "loss": 0.0826, "step": 2886 }, { "epoch": 3.242454022181665, "grad_norm": 0.4428801843373427, "learning_rate": 6.620371402109195e-06, "loss": 0.083, "step": 2887 }, { "epoch": 3.2435771444616033, "grad_norm": 0.45311188824038895, "learning_rate": 6.612989821889144e-06, "loss": 0.0877, "step": 2888 }, { "epoch": 3.2447002667415417, "grad_norm": 0.43522073849830734, "learning_rate": 6.605610325733728e-06, "loss": 0.0761, "step": 2889 }, { "epoch": 3.2458233890214796, "grad_norm": 0.43937974707906036, "learning_rate": 6.5982329181836325e-06, "loss": 0.078, "step": 2890 }, { "epoch": 3.246946511301418, "grad_norm": 0.44838131831874334, "learning_rate": 6.590857603778259e-06, "loss": 0.0765, "step": 2891 }, { "epoch": 3.2480696335813564, "grad_norm": 0.4487567179771303, "learning_rate": 6.583484387055716e-06, "loss": 0.0829, "step": 2892 }, { "epoch": 3.2491927558612943, "grad_norm": 0.43973073043498445, "learning_rate": 6.5761132725528265e-06, "loss": 0.0775, "step": 2893 }, { "epoch": 3.2503158781412327, "grad_norm": 0.44761002978377296, "learning_rate": 6.568744264805118e-06, "loss": 0.0888, "step": 2894 }, { "epoch": 3.251439000421171, "grad_norm": 0.4314843009567722, "learning_rate": 6.561377368346824e-06, "loss": 0.0752, "step": 2895 }, { "epoch": 3.252562122701109, "grad_norm": 0.4484597216270316, "learning_rate": 6.554012587710879e-06, "loss": 0.0786, "step": 2896 }, { "epoch": 3.2536852449810474, "grad_norm": 0.4533967440227061, "learning_rate": 6.546649927428905e-06, "loss": 0.084, "step": 2897 }, { "epoch": 3.2548083672609858, "grad_norm": 0.43537375245345705, "learning_rate": 6.539289392031234e-06, "loss": 0.0737, "step": 2898 }, { "epoch": 3.2559314895409237, "grad_norm": 0.44392098653435624, "learning_rate": 6.531930986046884e-06, "loss": 0.0798, "step": 2899 }, { "epoch": 3.257054611820862, "grad_norm": 0.43078694542955465, "learning_rate": 6.524574714003562e-06, "loss": 0.0787, "step": 2900 }, { "epoch": 3.2581777341008005, "grad_norm": 0.4525304342259154, "learning_rate": 6.517220580427669e-06, "loss": 0.0852, "step": 2901 }, { "epoch": 3.2593008563807384, "grad_norm": 0.44680111766540903, "learning_rate": 6.509868589844274e-06, "loss": 0.0732, "step": 2902 }, { "epoch": 3.260423978660677, "grad_norm": 0.44095965893761685, "learning_rate": 6.502518746777143e-06, "loss": 0.085, "step": 2903 }, { "epoch": 3.2615471009406147, "grad_norm": 0.45173240369816303, "learning_rate": 6.495171055748714e-06, "loss": 0.0794, "step": 2904 }, { "epoch": 3.262670223220553, "grad_norm": 0.4439061819150142, "learning_rate": 6.487825521280109e-06, "loss": 0.0826, "step": 2905 }, { "epoch": 3.2637933455004915, "grad_norm": 0.4334545536696232, "learning_rate": 6.480482147891106e-06, "loss": 0.0801, "step": 2906 }, { "epoch": 3.2649164677804294, "grad_norm": 0.4329477851970838, "learning_rate": 6.473140940100169e-06, "loss": 0.0822, "step": 2907 }, { "epoch": 3.266039590060368, "grad_norm": 0.44476467976568695, "learning_rate": 6.4658019024244214e-06, "loss": 0.0816, "step": 2908 }, { "epoch": 3.267162712340306, "grad_norm": 0.4373699550356578, "learning_rate": 6.458465039379655e-06, "loss": 0.0749, "step": 2909 }, { "epoch": 3.268285834620244, "grad_norm": 0.4289188369981385, "learning_rate": 6.451130355480326e-06, "loss": 0.0787, "step": 2910 }, { "epoch": 3.2694089569001825, "grad_norm": 0.46145266949341135, "learning_rate": 6.44379785523954e-06, "loss": 0.0805, "step": 2911 }, { "epoch": 3.270532079180121, "grad_norm": 0.4319621203979465, "learning_rate": 6.4364675431690684e-06, "loss": 0.0735, "step": 2912 }, { "epoch": 3.271655201460059, "grad_norm": 0.4507565266528019, "learning_rate": 6.429139423779332e-06, "loss": 0.0809, "step": 2913 }, { "epoch": 3.272778323739997, "grad_norm": 0.44904183335465914, "learning_rate": 6.421813501579403e-06, "loss": 0.0818, "step": 2914 }, { "epoch": 3.2739014460199356, "grad_norm": 0.44237241058884846, "learning_rate": 6.414489781077009e-06, "loss": 0.076, "step": 2915 }, { "epoch": 3.2750245682998735, "grad_norm": 0.45079073834235045, "learning_rate": 6.407168266778503e-06, "loss": 0.0807, "step": 2916 }, { "epoch": 3.276147690579812, "grad_norm": 0.4325942699296943, "learning_rate": 6.399848963188902e-06, "loss": 0.0752, "step": 2917 }, { "epoch": 3.27727081285975, "grad_norm": 0.4436273348713645, "learning_rate": 6.392531874811849e-06, "loss": 0.0759, "step": 2918 }, { "epoch": 3.2783939351396882, "grad_norm": 0.4399049256998244, "learning_rate": 6.385217006149633e-06, "loss": 0.0768, "step": 2919 }, { "epoch": 3.2795170574196266, "grad_norm": 0.45422810539143077, "learning_rate": 6.3779043617031775e-06, "loss": 0.081, "step": 2920 }, { "epoch": 3.2806401796995646, "grad_norm": 0.4643323939153561, "learning_rate": 6.370593945972022e-06, "loss": 0.0824, "step": 2921 }, { "epoch": 3.281763301979503, "grad_norm": 0.4384243413962703, "learning_rate": 6.363285763454352e-06, "loss": 0.0777, "step": 2922 }, { "epoch": 3.2828864242594413, "grad_norm": 0.44988902779488643, "learning_rate": 6.355979818646972e-06, "loss": 0.0797, "step": 2923 }, { "epoch": 3.2840095465393793, "grad_norm": 0.4533162798690284, "learning_rate": 6.34867611604531e-06, "loss": 0.0842, "step": 2924 }, { "epoch": 3.2851326688193176, "grad_norm": 0.4470213268981242, "learning_rate": 6.341374660143419e-06, "loss": 0.0846, "step": 2925 }, { "epoch": 3.286255791099256, "grad_norm": 0.44475456824817117, "learning_rate": 6.334075455433957e-06, "loss": 0.0848, "step": 2926 }, { "epoch": 3.287378913379194, "grad_norm": 0.4345189413360516, "learning_rate": 6.326778506408209e-06, "loss": 0.0763, "step": 2927 }, { "epoch": 3.2885020356591323, "grad_norm": 0.4443083844205599, "learning_rate": 6.319483817556067e-06, "loss": 0.0836, "step": 2928 }, { "epoch": 3.2896251579390707, "grad_norm": 0.44868349523571427, "learning_rate": 6.312191393366036e-06, "loss": 0.0838, "step": 2929 }, { "epoch": 3.2907482802190087, "grad_norm": 0.43446703024226885, "learning_rate": 6.304901238325224e-06, "loss": 0.0802, "step": 2930 }, { "epoch": 3.291871402498947, "grad_norm": 0.4482045860075501, "learning_rate": 6.297613356919341e-06, "loss": 0.0808, "step": 2931 }, { "epoch": 3.2929945247788854, "grad_norm": 0.4427022564744176, "learning_rate": 6.290327753632705e-06, "loss": 0.08, "step": 2932 }, { "epoch": 3.2941176470588234, "grad_norm": 0.45857038552731477, "learning_rate": 6.283044432948222e-06, "loss": 0.0791, "step": 2933 }, { "epoch": 3.2952407693387618, "grad_norm": 0.4495217764887851, "learning_rate": 6.275763399347403e-06, "loss": 0.0809, "step": 2934 }, { "epoch": 3.2963638916187, "grad_norm": 0.4377261844495198, "learning_rate": 6.268484657310351e-06, "loss": 0.0771, "step": 2935 }, { "epoch": 3.297487013898638, "grad_norm": 0.4693294460359561, "learning_rate": 6.26120821131575e-06, "loss": 0.0896, "step": 2936 }, { "epoch": 3.2986101361785765, "grad_norm": 0.4548392670428416, "learning_rate": 6.25393406584088e-06, "loss": 0.0804, "step": 2937 }, { "epoch": 3.299733258458515, "grad_norm": 0.4338443235373701, "learning_rate": 6.246662225361603e-06, "loss": 0.0758, "step": 2938 }, { "epoch": 3.3008563807384528, "grad_norm": 0.4094192153958026, "learning_rate": 6.239392694352362e-06, "loss": 0.0681, "step": 2939 }, { "epoch": 3.301979503018391, "grad_norm": 0.46354022190108407, "learning_rate": 6.232125477286184e-06, "loss": 0.0901, "step": 2940 }, { "epoch": 3.3031026252983295, "grad_norm": 0.43420174617897006, "learning_rate": 6.224860578634659e-06, "loss": 0.0784, "step": 2941 }, { "epoch": 3.3042257475782675, "grad_norm": 0.4497917258205283, "learning_rate": 6.217598002867965e-06, "loss": 0.0847, "step": 2942 }, { "epoch": 3.305348869858206, "grad_norm": 0.44585339258016715, "learning_rate": 6.210337754454842e-06, "loss": 0.0809, "step": 2943 }, { "epoch": 3.3064719921381442, "grad_norm": 0.4134829693581107, "learning_rate": 6.203079837862607e-06, "loss": 0.0766, "step": 2944 }, { "epoch": 3.307595114418082, "grad_norm": 0.4509752340647484, "learning_rate": 6.195824257557126e-06, "loss": 0.0861, "step": 2945 }, { "epoch": 3.3087182366980206, "grad_norm": 0.4664138175037376, "learning_rate": 6.188571018002843e-06, "loss": 0.0853, "step": 2946 }, { "epoch": 3.309841358977959, "grad_norm": 0.4426772347810475, "learning_rate": 6.181320123662755e-06, "loss": 0.0755, "step": 2947 }, { "epoch": 3.310964481257897, "grad_norm": 0.4456457712399848, "learning_rate": 6.174071578998419e-06, "loss": 0.0787, "step": 2948 }, { "epoch": 3.3120876035378353, "grad_norm": 0.4375785593556192, "learning_rate": 6.166825388469946e-06, "loss": 0.0796, "step": 2949 }, { "epoch": 3.3132107258177736, "grad_norm": 0.45268483396403963, "learning_rate": 6.159581556535989e-06, "loss": 0.084, "step": 2950 }, { "epoch": 3.3143338480977116, "grad_norm": 0.42407864679075813, "learning_rate": 6.152340087653762e-06, "loss": 0.0794, "step": 2951 }, { "epoch": 3.31545697037765, "grad_norm": 0.4509299352401977, "learning_rate": 6.145100986279021e-06, "loss": 0.0861, "step": 2952 }, { "epoch": 3.316580092657588, "grad_norm": 0.43447301128730714, "learning_rate": 6.137864256866065e-06, "loss": 0.084, "step": 2953 }, { "epoch": 3.3177032149375263, "grad_norm": 0.4325020912385428, "learning_rate": 6.130629903867734e-06, "loss": 0.0799, "step": 2954 }, { "epoch": 3.3188263372174647, "grad_norm": 0.43598451645884106, "learning_rate": 6.123397931735402e-06, "loss": 0.0776, "step": 2955 }, { "epoch": 3.3199494594974026, "grad_norm": 0.4469736893147317, "learning_rate": 6.116168344918982e-06, "loss": 0.0831, "step": 2956 }, { "epoch": 3.321072581777341, "grad_norm": 0.4626495409312179, "learning_rate": 6.10894114786692e-06, "loss": 0.085, "step": 2957 }, { "epoch": 3.3221957040572794, "grad_norm": 0.4474334248967847, "learning_rate": 6.101716345026189e-06, "loss": 0.0818, "step": 2958 }, { "epoch": 3.3233188263372173, "grad_norm": 0.4512283659979245, "learning_rate": 6.094493940842293e-06, "loss": 0.0859, "step": 2959 }, { "epoch": 3.3244419486171557, "grad_norm": 0.4505970094609106, "learning_rate": 6.08727393975925e-06, "loss": 0.0783, "step": 2960 }, { "epoch": 3.325565070897094, "grad_norm": 0.4250274459193349, "learning_rate": 6.080056346219608e-06, "loss": 0.0816, "step": 2961 }, { "epoch": 3.326688193177032, "grad_norm": 0.47814033258276745, "learning_rate": 6.072841164664433e-06, "loss": 0.0821, "step": 2962 }, { "epoch": 3.3278113154569704, "grad_norm": 0.45580838414860525, "learning_rate": 6.065628399533307e-06, "loss": 0.0825, "step": 2963 }, { "epoch": 3.328934437736909, "grad_norm": 0.4281087882366829, "learning_rate": 6.058418055264328e-06, "loss": 0.0767, "step": 2964 }, { "epoch": 3.3300575600168467, "grad_norm": 0.459821896563383, "learning_rate": 6.051210136294089e-06, "loss": 0.0909, "step": 2965 }, { "epoch": 3.331180682296785, "grad_norm": 0.43302194673000116, "learning_rate": 6.044004647057709e-06, "loss": 0.0772, "step": 2966 }, { "epoch": 3.3323038045767235, "grad_norm": 0.44404264162176665, "learning_rate": 6.036801591988802e-06, "loss": 0.0793, "step": 2967 }, { "epoch": 3.3334269268566614, "grad_norm": 0.4387838908116998, "learning_rate": 6.0296009755194875e-06, "loss": 0.0801, "step": 2968 }, { "epoch": 3.3345500491366, "grad_norm": 0.4532205199732577, "learning_rate": 6.022402802080392e-06, "loss": 0.0835, "step": 2969 }, { "epoch": 3.3356731714165377, "grad_norm": 0.4445884667652232, "learning_rate": 6.015207076100618e-06, "loss": 0.0841, "step": 2970 }, { "epoch": 3.336796293696476, "grad_norm": 0.4396390636961917, "learning_rate": 6.00801380200778e-06, "loss": 0.0764, "step": 2971 }, { "epoch": 3.3379194159764145, "grad_norm": 0.4140514704401163, "learning_rate": 6.000822984227981e-06, "loss": 0.0726, "step": 2972 }, { "epoch": 3.3390425382563524, "grad_norm": 0.4482407427874633, "learning_rate": 5.993634627185807e-06, "loss": 0.0846, "step": 2973 }, { "epoch": 3.340165660536291, "grad_norm": 0.4736302287268389, "learning_rate": 5.986448735304339e-06, "loss": 0.08, "step": 2974 }, { "epoch": 3.341288782816229, "grad_norm": 0.45919790518048875, "learning_rate": 5.979265313005128e-06, "loss": 0.0821, "step": 2975 }, { "epoch": 3.342411905096167, "grad_norm": 0.4370562219096713, "learning_rate": 5.972084364708217e-06, "loss": 0.082, "step": 2976 }, { "epoch": 3.3435350273761055, "grad_norm": 0.4497513684415123, "learning_rate": 5.9649058948321225e-06, "loss": 0.0794, "step": 2977 }, { "epoch": 3.344658149656044, "grad_norm": 0.45727145539332165, "learning_rate": 5.957729907793837e-06, "loss": 0.0749, "step": 2978 }, { "epoch": 3.345781271935982, "grad_norm": 0.4601952594639126, "learning_rate": 5.950556408008818e-06, "loss": 0.0856, "step": 2979 }, { "epoch": 3.3469043942159202, "grad_norm": 0.43940530937369787, "learning_rate": 5.943385399891004e-06, "loss": 0.077, "step": 2980 }, { "epoch": 3.3480275164958586, "grad_norm": 0.463228542311302, "learning_rate": 5.9362168878527944e-06, "loss": 0.0894, "step": 2981 }, { "epoch": 3.3491506387757966, "grad_norm": 0.44397605607111024, "learning_rate": 5.929050876305056e-06, "loss": 0.075, "step": 2982 }, { "epoch": 3.350273761055735, "grad_norm": 0.4474755147706091, "learning_rate": 5.921887369657113e-06, "loss": 0.0823, "step": 2983 }, { "epoch": 3.3513968833356733, "grad_norm": 0.43075761139504265, "learning_rate": 5.914726372316747e-06, "loss": 0.0797, "step": 2984 }, { "epoch": 3.3525200056156113, "grad_norm": 0.4551763638672436, "learning_rate": 5.9075678886902e-06, "loss": 0.0875, "step": 2985 }, { "epoch": 3.3536431278955496, "grad_norm": 0.4372584770340282, "learning_rate": 5.900411923182166e-06, "loss": 0.0797, "step": 2986 }, { "epoch": 3.354766250175488, "grad_norm": 0.42796756138216163, "learning_rate": 5.893258480195789e-06, "loss": 0.0859, "step": 2987 }, { "epoch": 3.355889372455426, "grad_norm": 0.4621345317165231, "learning_rate": 5.886107564132667e-06, "loss": 0.0861, "step": 2988 }, { "epoch": 3.3570124947353643, "grad_norm": 0.4416529130015374, "learning_rate": 5.878959179392828e-06, "loss": 0.0821, "step": 2989 }, { "epoch": 3.3581356170153027, "grad_norm": 0.44707394897677344, "learning_rate": 5.871813330374756e-06, "loss": 0.0764, "step": 2990 }, { "epoch": 3.3592587392952407, "grad_norm": 0.4674292247994994, "learning_rate": 5.86467002147537e-06, "loss": 0.1027, "step": 2991 }, { "epoch": 3.360381861575179, "grad_norm": 0.4473398857345525, "learning_rate": 5.857529257090027e-06, "loss": 0.0808, "step": 2992 }, { "epoch": 3.3615049838551174, "grad_norm": 0.4474406567869844, "learning_rate": 5.85039104161252e-06, "loss": 0.0791, "step": 2993 }, { "epoch": 3.3626281061350554, "grad_norm": 0.4284252832694052, "learning_rate": 5.843255379435069e-06, "loss": 0.0805, "step": 2994 }, { "epoch": 3.3637512284149937, "grad_norm": 0.4470373964616598, "learning_rate": 5.8361222749483246e-06, "loss": 0.0811, "step": 2995 }, { "epoch": 3.364874350694932, "grad_norm": 0.45169365813301077, "learning_rate": 5.8289917325413655e-06, "loss": 0.0809, "step": 2996 }, { "epoch": 3.36599747297487, "grad_norm": 0.45414154287290753, "learning_rate": 5.821863756601694e-06, "loss": 0.0827, "step": 2997 }, { "epoch": 3.3671205952548084, "grad_norm": 0.4542864047987009, "learning_rate": 5.814738351515234e-06, "loss": 0.0901, "step": 2998 }, { "epoch": 3.368243717534747, "grad_norm": 0.4403191156268337, "learning_rate": 5.807615521666321e-06, "loss": 0.0737, "step": 2999 }, { "epoch": 3.3693668398146848, "grad_norm": 0.47140768833660335, "learning_rate": 5.800495271437712e-06, "loss": 0.0857, "step": 3000 }, { "epoch": 3.370489962094623, "grad_norm": 0.44449931199515863, "learning_rate": 5.793377605210575e-06, "loss": 0.0763, "step": 3001 }, { "epoch": 3.371613084374561, "grad_norm": 0.43375063637654476, "learning_rate": 5.786262527364489e-06, "loss": 0.0768, "step": 3002 }, { "epoch": 3.3727362066544995, "grad_norm": 0.4634645012272231, "learning_rate": 5.779150042277445e-06, "loss": 0.0832, "step": 3003 }, { "epoch": 3.373859328934438, "grad_norm": 0.46021930500381747, "learning_rate": 5.7720401543258245e-06, "loss": 0.0864, "step": 3004 }, { "epoch": 3.374982451214376, "grad_norm": 0.45355421975815485, "learning_rate": 5.764932867884423e-06, "loss": 0.0819, "step": 3005 }, { "epoch": 3.376105573494314, "grad_norm": 0.4399739477662287, "learning_rate": 5.757828187326433e-06, "loss": 0.0783, "step": 3006 }, { "epoch": 3.3772286957742526, "grad_norm": 0.43934576191385377, "learning_rate": 5.750726117023442e-06, "loss": 0.0769, "step": 3007 }, { "epoch": 3.3783518180541905, "grad_norm": 0.4567949271653509, "learning_rate": 5.743626661345433e-06, "loss": 0.0824, "step": 3008 }, { "epoch": 3.379474940334129, "grad_norm": 0.4424452473412179, "learning_rate": 5.736529824660778e-06, "loss": 0.0812, "step": 3009 }, { "epoch": 3.3805980626140673, "grad_norm": 0.44503663152596046, "learning_rate": 5.729435611336239e-06, "loss": 0.0794, "step": 3010 }, { "epoch": 3.381721184894005, "grad_norm": 0.44895576795697817, "learning_rate": 5.722344025736965e-06, "loss": 0.0821, "step": 3011 }, { "epoch": 3.3828443071739436, "grad_norm": 0.4744276589880692, "learning_rate": 5.715255072226489e-06, "loss": 0.0914, "step": 3012 }, { "epoch": 3.383967429453882, "grad_norm": 0.45473296934213203, "learning_rate": 5.708168755166714e-06, "loss": 0.0762, "step": 3013 }, { "epoch": 3.38509055173382, "grad_norm": 0.4669408291943065, "learning_rate": 5.701085078917934e-06, "loss": 0.0812, "step": 3014 }, { "epoch": 3.3862136740137583, "grad_norm": 0.4985094979807337, "learning_rate": 5.694004047838812e-06, "loss": 0.081, "step": 3015 }, { "epoch": 3.3873367962936967, "grad_norm": 0.4664501659830471, "learning_rate": 5.686925666286385e-06, "loss": 0.0782, "step": 3016 }, { "epoch": 3.3884599185736346, "grad_norm": 0.4814941024166765, "learning_rate": 5.679849938616062e-06, "loss": 0.0907, "step": 3017 }, { "epoch": 3.389583040853573, "grad_norm": 0.4481377810964817, "learning_rate": 5.672776869181609e-06, "loss": 0.079, "step": 3018 }, { "epoch": 3.390706163133511, "grad_norm": 0.4104819286802798, "learning_rate": 5.665706462335167e-06, "loss": 0.0704, "step": 3019 }, { "epoch": 3.3918292854134493, "grad_norm": 0.4308997552132428, "learning_rate": 5.658638722427237e-06, "loss": 0.08, "step": 3020 }, { "epoch": 3.3929524076933877, "grad_norm": 0.44189383865557846, "learning_rate": 5.651573653806675e-06, "loss": 0.0785, "step": 3021 }, { "epoch": 3.3940755299733256, "grad_norm": 0.45421284153104713, "learning_rate": 5.6445112608207e-06, "loss": 0.078, "step": 3022 }, { "epoch": 3.395198652253264, "grad_norm": 0.4813721347164326, "learning_rate": 5.6374515478148714e-06, "loss": 0.0932, "step": 3023 }, { "epoch": 3.3963217745332024, "grad_norm": 0.44235907345540504, "learning_rate": 5.630394519133114e-06, "loss": 0.0824, "step": 3024 }, { "epoch": 3.3974448968131403, "grad_norm": 0.4338332189073094, "learning_rate": 5.6233401791176946e-06, "loss": 0.0786, "step": 3025 }, { "epoch": 3.3985680190930787, "grad_norm": 0.4394631553406978, "learning_rate": 5.616288532109225e-06, "loss": 0.0794, "step": 3026 }, { "epoch": 3.399691141373017, "grad_norm": 0.44813371027964505, "learning_rate": 5.609239582446666e-06, "loss": 0.0836, "step": 3027 }, { "epoch": 3.400814263652955, "grad_norm": 0.45706449566282054, "learning_rate": 5.602193334467307e-06, "loss": 0.0845, "step": 3028 }, { "epoch": 3.4019373859328934, "grad_norm": 0.4739118872159749, "learning_rate": 5.595149792506785e-06, "loss": 0.0884, "step": 3029 }, { "epoch": 3.403060508212832, "grad_norm": 0.4456277523613716, "learning_rate": 5.588108960899069e-06, "loss": 0.0781, "step": 3030 }, { "epoch": 3.4041836304927697, "grad_norm": 0.45013136112068963, "learning_rate": 5.58107084397646e-06, "loss": 0.0792, "step": 3031 }, { "epoch": 3.405306752772708, "grad_norm": 0.46808037494992766, "learning_rate": 5.574035446069593e-06, "loss": 0.0827, "step": 3032 }, { "epoch": 3.4064298750526465, "grad_norm": 0.44644484340636836, "learning_rate": 5.567002771507416e-06, "loss": 0.0778, "step": 3033 }, { "epoch": 3.4075529973325844, "grad_norm": 0.42953138306404914, "learning_rate": 5.559972824617217e-06, "loss": 0.0798, "step": 3034 }, { "epoch": 3.408676119612523, "grad_norm": 0.4336870120406191, "learning_rate": 5.552945609724601e-06, "loss": 0.0834, "step": 3035 }, { "epoch": 3.409799241892461, "grad_norm": 0.4533973433553709, "learning_rate": 5.545921131153487e-06, "loss": 0.0876, "step": 3036 }, { "epoch": 3.410922364172399, "grad_norm": 0.5136103261731987, "learning_rate": 5.538899393226122e-06, "loss": 0.0853, "step": 3037 }, { "epoch": 3.4120454864523375, "grad_norm": 0.4320180339485597, "learning_rate": 5.5318804002630465e-06, "loss": 0.0818, "step": 3038 }, { "epoch": 3.413168608732276, "grad_norm": 0.4378199677787273, "learning_rate": 5.524864156583132e-06, "loss": 0.0809, "step": 3039 }, { "epoch": 3.414291731012214, "grad_norm": 0.4428589037894112, "learning_rate": 5.517850666503547e-06, "loss": 0.0773, "step": 3040 }, { "epoch": 3.415414853292152, "grad_norm": 0.43356282741829844, "learning_rate": 5.510839934339771e-06, "loss": 0.0788, "step": 3041 }, { "epoch": 3.4165379755720906, "grad_norm": 0.4421358222104903, "learning_rate": 5.503831964405588e-06, "loss": 0.0813, "step": 3042 }, { "epoch": 3.4176610978520285, "grad_norm": 0.45263566082349915, "learning_rate": 5.4968267610130736e-06, "loss": 0.0877, "step": 3043 }, { "epoch": 3.418784220131967, "grad_norm": 0.42884381173143404, "learning_rate": 5.489824328472606e-06, "loss": 0.0783, "step": 3044 }, { "epoch": 3.4199073424119053, "grad_norm": 0.45905652234442995, "learning_rate": 5.482824671092862e-06, "loss": 0.0829, "step": 3045 }, { "epoch": 3.4210304646918432, "grad_norm": 0.43427893805487594, "learning_rate": 5.475827793180808e-06, "loss": 0.0762, "step": 3046 }, { "epoch": 3.4221535869717816, "grad_norm": 0.4383974688917182, "learning_rate": 5.468833699041702e-06, "loss": 0.0787, "step": 3047 }, { "epoch": 3.42327670925172, "grad_norm": 0.42636151758099317, "learning_rate": 5.461842392979081e-06, "loss": 0.0801, "step": 3048 }, { "epoch": 3.424399831531658, "grad_norm": 0.43538975548705666, "learning_rate": 5.454853879294776e-06, "loss": 0.0764, "step": 3049 }, { "epoch": 3.4255229538115963, "grad_norm": 0.43470614632530913, "learning_rate": 5.447868162288895e-06, "loss": 0.0821, "step": 3050 }, { "epoch": 3.4266460760915347, "grad_norm": 0.4258384808643859, "learning_rate": 5.440885246259828e-06, "loss": 0.0738, "step": 3051 }, { "epoch": 3.4277691983714726, "grad_norm": 0.47091027809265673, "learning_rate": 5.433905135504241e-06, "loss": 0.0875, "step": 3052 }, { "epoch": 3.428892320651411, "grad_norm": 0.42250636073037695, "learning_rate": 5.42692783431707e-06, "loss": 0.0776, "step": 3053 }, { "epoch": 3.430015442931349, "grad_norm": 0.4418674818368583, "learning_rate": 5.419953346991529e-06, "loss": 0.0753, "step": 3054 }, { "epoch": 3.4311385652112874, "grad_norm": 0.44643900506814016, "learning_rate": 5.412981677819094e-06, "loss": 0.0839, "step": 3055 }, { "epoch": 3.4322616874912257, "grad_norm": 0.4509416128414816, "learning_rate": 5.406012831089514e-06, "loss": 0.0786, "step": 3056 }, { "epoch": 3.4333848097711637, "grad_norm": 0.4526654447129122, "learning_rate": 5.399046811090789e-06, "loss": 0.0825, "step": 3057 }, { "epoch": 3.434507932051102, "grad_norm": 0.43008105275608804, "learning_rate": 5.392083622109192e-06, "loss": 0.0754, "step": 3058 }, { "epoch": 3.4356310543310404, "grad_norm": 0.44425939125974356, "learning_rate": 5.385123268429251e-06, "loss": 0.0796, "step": 3059 }, { "epoch": 3.4367541766109784, "grad_norm": 0.4459296964868783, "learning_rate": 5.3781657543337484e-06, "loss": 0.0806, "step": 3060 }, { "epoch": 3.4378772988909168, "grad_norm": 0.44459091799716943, "learning_rate": 5.37121108410372e-06, "loss": 0.0811, "step": 3061 }, { "epoch": 3.439000421170855, "grad_norm": 0.4570923904403929, "learning_rate": 5.364259262018448e-06, "loss": 0.082, "step": 3062 }, { "epoch": 3.440123543450793, "grad_norm": 0.44006880702846607, "learning_rate": 5.357310292355463e-06, "loss": 0.0791, "step": 3063 }, { "epoch": 3.4412466657307315, "grad_norm": 0.4606550393242539, "learning_rate": 5.3503641793905485e-06, "loss": 0.0825, "step": 3064 }, { "epoch": 3.44236978801067, "grad_norm": 0.4254705701825656, "learning_rate": 5.343420927397718e-06, "loss": 0.0859, "step": 3065 }, { "epoch": 3.443492910290608, "grad_norm": 0.425582782498819, "learning_rate": 5.33648054064924e-06, "loss": 0.0803, "step": 3066 }, { "epoch": 3.444616032570546, "grad_norm": 0.4400458710522822, "learning_rate": 5.329543023415602e-06, "loss": 0.0805, "step": 3067 }, { "epoch": 3.445739154850484, "grad_norm": 0.4176064311503563, "learning_rate": 5.322608379965537e-06, "loss": 0.075, "step": 3068 }, { "epoch": 3.4468622771304225, "grad_norm": 0.45054223519188685, "learning_rate": 5.315676614566008e-06, "loss": 0.0799, "step": 3069 }, { "epoch": 3.447985399410361, "grad_norm": 0.4423125763067608, "learning_rate": 5.308747731482207e-06, "loss": 0.0787, "step": 3070 }, { "epoch": 3.449108521690299, "grad_norm": 0.4216181686130632, "learning_rate": 5.301821734977555e-06, "loss": 0.0755, "step": 3071 }, { "epoch": 3.450231643970237, "grad_norm": 0.4484194253723383, "learning_rate": 5.2948986293136876e-06, "loss": 0.0805, "step": 3072 }, { "epoch": 3.4513547662501756, "grad_norm": 0.46771531925041326, "learning_rate": 5.28797841875047e-06, "loss": 0.0813, "step": 3073 }, { "epoch": 3.4524778885301135, "grad_norm": 0.4340387002286317, "learning_rate": 5.281061107545985e-06, "loss": 0.0784, "step": 3074 }, { "epoch": 3.453601010810052, "grad_norm": 0.4312273423989433, "learning_rate": 5.274146699956531e-06, "loss": 0.0784, "step": 3075 }, { "epoch": 3.4547241330899903, "grad_norm": 0.45797427049587475, "learning_rate": 5.26723520023662e-06, "loss": 0.0841, "step": 3076 }, { "epoch": 3.455847255369928, "grad_norm": 0.4478121802861111, "learning_rate": 5.260326612638971e-06, "loss": 0.0779, "step": 3077 }, { "epoch": 3.4569703776498666, "grad_norm": 0.4352907201834758, "learning_rate": 5.253420941414513e-06, "loss": 0.0757, "step": 3078 }, { "epoch": 3.458093499929805, "grad_norm": 0.438323892417082, "learning_rate": 5.246518190812384e-06, "loss": 0.077, "step": 3079 }, { "epoch": 3.459216622209743, "grad_norm": 0.45721881289918465, "learning_rate": 5.239618365079921e-06, "loss": 0.086, "step": 3080 }, { "epoch": 3.4603397444896813, "grad_norm": 0.45134533377379477, "learning_rate": 5.232721468462669e-06, "loss": 0.0837, "step": 3081 }, { "epoch": 3.4614628667696197, "grad_norm": 0.4427173473895729, "learning_rate": 5.225827505204355e-06, "loss": 0.0743, "step": 3082 }, { "epoch": 3.4625859890495576, "grad_norm": 0.4591916370110219, "learning_rate": 5.218936479546913e-06, "loss": 0.0832, "step": 3083 }, { "epoch": 3.463709111329496, "grad_norm": 0.45693111857381774, "learning_rate": 5.212048395730469e-06, "loss": 0.0768, "step": 3084 }, { "epoch": 3.4648322336094344, "grad_norm": 0.4522432872093845, "learning_rate": 5.205163257993341e-06, "loss": 0.0822, "step": 3085 }, { "epoch": 3.4659553558893723, "grad_norm": 0.45600705538504716, "learning_rate": 5.19828107057202e-06, "loss": 0.0798, "step": 3086 }, { "epoch": 3.4670784781693107, "grad_norm": 0.45123885398109936, "learning_rate": 5.191401837701197e-06, "loss": 0.0823, "step": 3087 }, { "epoch": 3.468201600449249, "grad_norm": 0.4480694693802906, "learning_rate": 5.18452556361374e-06, "loss": 0.0768, "step": 3088 }, { "epoch": 3.469324722729187, "grad_norm": 0.443774248075364, "learning_rate": 5.177652252540697e-06, "loss": 0.078, "step": 3089 }, { "epoch": 3.4704478450091254, "grad_norm": 0.4651048676199529, "learning_rate": 5.170781908711289e-06, "loss": 0.0814, "step": 3090 }, { "epoch": 3.471570967289064, "grad_norm": 0.4858930041394996, "learning_rate": 5.163914536352919e-06, "loss": 0.0831, "step": 3091 }, { "epoch": 3.4726940895690017, "grad_norm": 0.45944862905297834, "learning_rate": 5.157050139691151e-06, "loss": 0.0847, "step": 3092 }, { "epoch": 3.47381721184894, "grad_norm": 0.43905597445283906, "learning_rate": 5.150188722949725e-06, "loss": 0.0791, "step": 3093 }, { "epoch": 3.4749403341288785, "grad_norm": 0.43904792852959645, "learning_rate": 5.143330290350548e-06, "loss": 0.0767, "step": 3094 }, { "epoch": 3.4760634564088164, "grad_norm": 0.4487339919699267, "learning_rate": 5.136474846113688e-06, "loss": 0.0811, "step": 3095 }, { "epoch": 3.477186578688755, "grad_norm": 0.4280955715492865, "learning_rate": 5.129622394457377e-06, "loss": 0.074, "step": 3096 }, { "epoch": 3.478309700968693, "grad_norm": 0.44114191944550274, "learning_rate": 5.122772939598003e-06, "loss": 0.0811, "step": 3097 }, { "epoch": 3.479432823248631, "grad_norm": 0.4381981300103117, "learning_rate": 5.11592648575011e-06, "loss": 0.0769, "step": 3098 }, { "epoch": 3.4805559455285695, "grad_norm": 0.41185510445743156, "learning_rate": 5.109083037126397e-06, "loss": 0.0731, "step": 3099 }, { "epoch": 3.481679067808508, "grad_norm": 0.4438897661024262, "learning_rate": 5.1022425979377174e-06, "loss": 0.0809, "step": 3100 }, { "epoch": 3.482802190088446, "grad_norm": 0.43375334791452175, "learning_rate": 5.095405172393062e-06, "loss": 0.0828, "step": 3101 }, { "epoch": 3.483925312368384, "grad_norm": 0.4326713003014848, "learning_rate": 5.088570764699574e-06, "loss": 0.0741, "step": 3102 }, { "epoch": 3.485048434648322, "grad_norm": 0.4553368231780253, "learning_rate": 5.081739379062545e-06, "loss": 0.0797, "step": 3103 }, { "epoch": 3.4861715569282605, "grad_norm": 0.45948155021628384, "learning_rate": 5.074911019685398e-06, "loss": 0.079, "step": 3104 }, { "epoch": 3.487294679208199, "grad_norm": 0.4435769006877875, "learning_rate": 5.068085690769702e-06, "loss": 0.0828, "step": 3105 }, { "epoch": 3.488417801488137, "grad_norm": 0.4676805551105927, "learning_rate": 5.06126339651515e-06, "loss": 0.0876, "step": 3106 }, { "epoch": 3.4895409237680752, "grad_norm": 0.44708470564753106, "learning_rate": 5.054444141119579e-06, "loss": 0.0789, "step": 3107 }, { "epoch": 3.4906640460480136, "grad_norm": 0.44694690179062085, "learning_rate": 5.047627928778951e-06, "loss": 0.0797, "step": 3108 }, { "epoch": 3.4917871683279516, "grad_norm": 0.45987872193849977, "learning_rate": 5.040814763687358e-06, "loss": 0.0804, "step": 3109 }, { "epoch": 3.49291029060789, "grad_norm": 0.46368555836155595, "learning_rate": 5.034004650037016e-06, "loss": 0.0838, "step": 3110 }, { "epoch": 3.4940334128878283, "grad_norm": 0.4670280413811003, "learning_rate": 5.02719759201826e-06, "loss": 0.0854, "step": 3111 }, { "epoch": 3.4951565351677663, "grad_norm": 0.4655508082978832, "learning_rate": 5.020393593819547e-06, "loss": 0.085, "step": 3112 }, { "epoch": 3.4962796574477046, "grad_norm": 0.4506202886596756, "learning_rate": 5.013592659627454e-06, "loss": 0.0796, "step": 3113 }, { "epoch": 3.497402779727643, "grad_norm": 0.44760914223329473, "learning_rate": 5.006794793626671e-06, "loss": 0.0773, "step": 3114 }, { "epoch": 3.498525902007581, "grad_norm": 0.44369165321868126, "learning_rate": 5.000000000000003e-06, "loss": 0.0787, "step": 3115 }, { "epoch": 3.4996490242875193, "grad_norm": 0.42826531752580976, "learning_rate": 4.9932082829283524e-06, "loss": 0.0812, "step": 3116 }, { "epoch": 3.5007721465674573, "grad_norm": 0.4370975011654157, "learning_rate": 4.986419646590744e-06, "loss": 0.0811, "step": 3117 }, { "epoch": 3.5018952688473957, "grad_norm": 0.4431924357682965, "learning_rate": 4.979634095164298e-06, "loss": 0.0811, "step": 3118 }, { "epoch": 3.503018391127334, "grad_norm": 0.43224211139362045, "learning_rate": 4.972851632824241e-06, "loss": 0.0797, "step": 3119 }, { "epoch": 3.504141513407272, "grad_norm": 0.44424653895974464, "learning_rate": 4.966072263743899e-06, "loss": 0.0828, "step": 3120 }, { "epoch": 3.5052646356872104, "grad_norm": 0.4572424329181564, "learning_rate": 4.959295992094685e-06, "loss": 0.0801, "step": 3121 }, { "epoch": 3.5063877579671487, "grad_norm": 0.460448950085812, "learning_rate": 4.952522822046117e-06, "loss": 0.0764, "step": 3122 }, { "epoch": 3.5075108802470867, "grad_norm": 0.4249602220115872, "learning_rate": 4.945752757765802e-06, "loss": 0.0771, "step": 3123 }, { "epoch": 3.508634002527025, "grad_norm": 0.4413605494574981, "learning_rate": 4.93898580341944e-06, "loss": 0.0822, "step": 3124 }, { "epoch": 3.5097571248069634, "grad_norm": 0.4291289570379504, "learning_rate": 4.932221963170801e-06, "loss": 0.0797, "step": 3125 }, { "epoch": 3.5108802470869014, "grad_norm": 0.4721851004921407, "learning_rate": 4.925461241181757e-06, "loss": 0.0843, "step": 3126 }, { "epoch": 3.5120033693668398, "grad_norm": 0.4392592949377926, "learning_rate": 4.918703641612255e-06, "loss": 0.0786, "step": 3127 }, { "epoch": 3.513126491646778, "grad_norm": 0.43903462957651435, "learning_rate": 4.9119491686203195e-06, "loss": 0.0804, "step": 3128 }, { "epoch": 3.514249613926716, "grad_norm": 0.4430529984909637, "learning_rate": 4.9051978263620545e-06, "loss": 0.0766, "step": 3129 }, { "epoch": 3.5153727362066545, "grad_norm": 0.4394580449820179, "learning_rate": 4.89844961899163e-06, "loss": 0.0762, "step": 3130 }, { "epoch": 3.516495858486593, "grad_norm": 0.4686215670552109, "learning_rate": 4.891704550661294e-06, "loss": 0.0847, "step": 3131 }, { "epoch": 3.517618980766531, "grad_norm": 0.41231062902627313, "learning_rate": 4.884962625521363e-06, "loss": 0.0793, "step": 3132 }, { "epoch": 3.518742103046469, "grad_norm": 0.4569282712376777, "learning_rate": 4.878223847720217e-06, "loss": 0.0876, "step": 3133 }, { "epoch": 3.5198652253264076, "grad_norm": 0.4208575740257057, "learning_rate": 4.8714882214043e-06, "loss": 0.078, "step": 3134 }, { "epoch": 3.5209883476063455, "grad_norm": 0.43267410608868484, "learning_rate": 4.8647557507181164e-06, "loss": 0.0814, "step": 3135 }, { "epoch": 3.522111469886284, "grad_norm": 0.42634413807778826, "learning_rate": 4.8580264398042355e-06, "loss": 0.0745, "step": 3136 }, { "epoch": 3.5232345921662223, "grad_norm": 0.45182552642987417, "learning_rate": 4.851300292803266e-06, "loss": 0.081, "step": 3137 }, { "epoch": 3.52435771444616, "grad_norm": 0.43109856616030523, "learning_rate": 4.844577313853886e-06, "loss": 0.0814, "step": 3138 }, { "epoch": 3.5254808367260986, "grad_norm": 0.4470250084652514, "learning_rate": 4.837857507092817e-06, "loss": 0.0822, "step": 3139 }, { "epoch": 3.526603959006037, "grad_norm": 0.44862752921716353, "learning_rate": 4.831140876654831e-06, "loss": 0.0802, "step": 3140 }, { "epoch": 3.527727081285975, "grad_norm": 0.4769651304639455, "learning_rate": 4.824427426672743e-06, "loss": 0.083, "step": 3141 }, { "epoch": 3.5288502035659133, "grad_norm": 0.4709080102440193, "learning_rate": 4.8177171612774155e-06, "loss": 0.0833, "step": 3142 }, { "epoch": 3.5299733258458517, "grad_norm": 0.4393104624450094, "learning_rate": 4.811010084597747e-06, "loss": 0.0729, "step": 3143 }, { "epoch": 3.5310964481257896, "grad_norm": 0.4600533673879964, "learning_rate": 4.80430620076068e-06, "loss": 0.0807, "step": 3144 }, { "epoch": 3.532219570405728, "grad_norm": 0.47092645697860963, "learning_rate": 4.797605513891179e-06, "loss": 0.0786, "step": 3145 }, { "epoch": 3.5333426926856664, "grad_norm": 0.44424095531161306, "learning_rate": 4.790908028112256e-06, "loss": 0.0817, "step": 3146 }, { "epoch": 3.5344658149656043, "grad_norm": 0.4536227579660774, "learning_rate": 4.7842137475449444e-06, "loss": 0.0782, "step": 3147 }, { "epoch": 3.5355889372455427, "grad_norm": 0.45564989250792665, "learning_rate": 4.777522676308314e-06, "loss": 0.0823, "step": 3148 }, { "epoch": 3.536712059525481, "grad_norm": 0.42912926619367286, "learning_rate": 4.770834818519454e-06, "loss": 0.0815, "step": 3149 }, { "epoch": 3.537835181805419, "grad_norm": 0.44561480322449887, "learning_rate": 4.764150178293471e-06, "loss": 0.0783, "step": 3150 }, { "epoch": 3.5389583040853574, "grad_norm": 0.4553565188197437, "learning_rate": 4.757468759743501e-06, "loss": 0.0837, "step": 3151 }, { "epoch": 3.5400814263652958, "grad_norm": 0.46776113922754914, "learning_rate": 4.750790566980694e-06, "loss": 0.0809, "step": 3152 }, { "epoch": 3.5412045486452337, "grad_norm": 0.43159302458196586, "learning_rate": 4.744115604114218e-06, "loss": 0.0775, "step": 3153 }, { "epoch": 3.542327670925172, "grad_norm": 0.451930193919335, "learning_rate": 4.737443875251251e-06, "loss": 0.076, "step": 3154 }, { "epoch": 3.5434507932051105, "grad_norm": 0.46172685056937696, "learning_rate": 4.730775384496976e-06, "loss": 0.0796, "step": 3155 }, { "epoch": 3.5445739154850484, "grad_norm": 0.4262475873889323, "learning_rate": 4.724110135954593e-06, "loss": 0.076, "step": 3156 }, { "epoch": 3.545697037764987, "grad_norm": 0.4533319005981512, "learning_rate": 4.717448133725302e-06, "loss": 0.0864, "step": 3157 }, { "epoch": 3.5468201600449247, "grad_norm": 0.44873241402246483, "learning_rate": 4.710789381908308e-06, "loss": 0.0827, "step": 3158 }, { "epoch": 3.547943282324863, "grad_norm": 0.4438311385849649, "learning_rate": 4.704133884600811e-06, "loss": 0.0807, "step": 3159 }, { "epoch": 3.5490664046048015, "grad_norm": 0.42639669279917847, "learning_rate": 4.697481645898012e-06, "loss": 0.0789, "step": 3160 }, { "epoch": 3.5501895268847394, "grad_norm": 0.410746116785441, "learning_rate": 4.690832669893108e-06, "loss": 0.0805, "step": 3161 }, { "epoch": 3.551312649164678, "grad_norm": 0.45365920946155613, "learning_rate": 4.684186960677287e-06, "loss": 0.0866, "step": 3162 }, { "epoch": 3.552435771444616, "grad_norm": 0.46568428236414383, "learning_rate": 4.6775445223397306e-06, "loss": 0.084, "step": 3163 }, { "epoch": 3.553558893724554, "grad_norm": 0.45425121145084363, "learning_rate": 4.670905358967598e-06, "loss": 0.0856, "step": 3164 }, { "epoch": 3.5546820160044925, "grad_norm": 0.44337140062483155, "learning_rate": 4.66426947464604e-06, "loss": 0.0823, "step": 3165 }, { "epoch": 3.5558051382844305, "grad_norm": 0.45052653263910536, "learning_rate": 4.6576368734581935e-06, "loss": 0.0829, "step": 3166 }, { "epoch": 3.556928260564369, "grad_norm": 0.4421176966358397, "learning_rate": 4.651007559485168e-06, "loss": 0.081, "step": 3167 }, { "epoch": 3.558051382844307, "grad_norm": 0.4417611457609658, "learning_rate": 4.644381536806058e-06, "loss": 0.0798, "step": 3168 }, { "epoch": 3.559174505124245, "grad_norm": 0.4362310044034491, "learning_rate": 4.637758809497919e-06, "loss": 0.0789, "step": 3169 }, { "epoch": 3.5602976274041835, "grad_norm": 0.48187030579374374, "learning_rate": 4.631139381635795e-06, "loss": 0.0826, "step": 3170 }, { "epoch": 3.561420749684122, "grad_norm": 0.4338280376779694, "learning_rate": 4.62452325729269e-06, "loss": 0.0782, "step": 3171 }, { "epoch": 3.56254387196406, "grad_norm": 0.44241538181060147, "learning_rate": 4.61791044053958e-06, "loss": 0.0813, "step": 3172 }, { "epoch": 3.5636669942439982, "grad_norm": 0.4572788174432963, "learning_rate": 4.611300935445407e-06, "loss": 0.0822, "step": 3173 }, { "epoch": 3.5647901165239366, "grad_norm": 0.4441694387134764, "learning_rate": 4.604694746077064e-06, "loss": 0.0838, "step": 3174 }, { "epoch": 3.5659132388038746, "grad_norm": 0.45759098365330836, "learning_rate": 4.598091876499417e-06, "loss": 0.0799, "step": 3175 }, { "epoch": 3.567036361083813, "grad_norm": 0.4343009539002791, "learning_rate": 4.591492330775283e-06, "loss": 0.0756, "step": 3176 }, { "epoch": 3.5681594833637513, "grad_norm": 0.4473461995617762, "learning_rate": 4.5848961129654365e-06, "loss": 0.0812, "step": 3177 }, { "epoch": 3.5692826056436893, "grad_norm": 0.4546004165413151, "learning_rate": 4.578303227128603e-06, "loss": 0.0833, "step": 3178 }, { "epoch": 3.5704057279236276, "grad_norm": 0.46424501547792263, "learning_rate": 4.571713677321455e-06, "loss": 0.0812, "step": 3179 }, { "epoch": 3.571528850203566, "grad_norm": 0.4361206138265064, "learning_rate": 4.565127467598619e-06, "loss": 0.0774, "step": 3180 }, { "epoch": 3.572651972483504, "grad_norm": 0.4286236286675256, "learning_rate": 4.5585446020126634e-06, "loss": 0.0733, "step": 3181 }, { "epoch": 3.5737750947634424, "grad_norm": 0.4617841876606261, "learning_rate": 4.551965084614089e-06, "loss": 0.083, "step": 3182 }, { "epoch": 3.5748982170433807, "grad_norm": 0.46431636230368917, "learning_rate": 4.545388919451353e-06, "loss": 0.0772, "step": 3183 }, { "epoch": 3.5760213393233187, "grad_norm": 0.42398130900439096, "learning_rate": 4.538816110570841e-06, "loss": 0.0787, "step": 3184 }, { "epoch": 3.577144461603257, "grad_norm": 0.4340576396237869, "learning_rate": 4.532246662016872e-06, "loss": 0.0792, "step": 3185 }, { "epoch": 3.5782675838831954, "grad_norm": 0.4706268135249468, "learning_rate": 4.5256805778317015e-06, "loss": 0.0879, "step": 3186 }, { "epoch": 3.5793907061631334, "grad_norm": 0.43976864195858995, "learning_rate": 4.519117862055514e-06, "loss": 0.0811, "step": 3187 }, { "epoch": 3.5805138284430718, "grad_norm": 0.45376420671959067, "learning_rate": 4.512558518726425e-06, "loss": 0.0806, "step": 3188 }, { "epoch": 3.58163695072301, "grad_norm": 0.468528511091486, "learning_rate": 4.506002551880462e-06, "loss": 0.0832, "step": 3189 }, { "epoch": 3.582760073002948, "grad_norm": 0.43294967832559633, "learning_rate": 4.4994499655515865e-06, "loss": 0.074, "step": 3190 }, { "epoch": 3.5838831952828865, "grad_norm": 0.42827553987381783, "learning_rate": 4.492900763771679e-06, "loss": 0.0746, "step": 3191 }, { "epoch": 3.585006317562825, "grad_norm": 0.4389937609121276, "learning_rate": 4.486354950570534e-06, "loss": 0.0768, "step": 3192 }, { "epoch": 3.586129439842763, "grad_norm": 0.44454906671504224, "learning_rate": 4.4798125299758666e-06, "loss": 0.0741, "step": 3193 }, { "epoch": 3.587252562122701, "grad_norm": 0.4423414531424126, "learning_rate": 4.473273506013294e-06, "loss": 0.0785, "step": 3194 }, { "epoch": 3.5883756844026395, "grad_norm": 0.45927341409394545, "learning_rate": 4.46673788270635e-06, "loss": 0.088, "step": 3195 }, { "epoch": 3.5894988066825775, "grad_norm": 0.4420335156563938, "learning_rate": 4.460205664076479e-06, "loss": 0.0767, "step": 3196 }, { "epoch": 3.590621928962516, "grad_norm": 0.46538751797421757, "learning_rate": 4.453676854143029e-06, "loss": 0.0848, "step": 3197 }, { "epoch": 3.5917450512424542, "grad_norm": 0.45668682965397933, "learning_rate": 4.447151456923241e-06, "loss": 0.0815, "step": 3198 }, { "epoch": 3.592868173522392, "grad_norm": 0.45937139886413936, "learning_rate": 4.440629476432268e-06, "loss": 0.079, "step": 3199 }, { "epoch": 3.5939912958023306, "grad_norm": 0.4309123966029538, "learning_rate": 4.4341109166831565e-06, "loss": 0.0752, "step": 3200 }, { "epoch": 3.595114418082269, "grad_norm": 0.45123899659954364, "learning_rate": 4.427595781686848e-06, "loss": 0.0819, "step": 3201 }, { "epoch": 3.596237540362207, "grad_norm": 0.4301342537854829, "learning_rate": 4.42108407545218e-06, "loss": 0.0792, "step": 3202 }, { "epoch": 3.5973606626421453, "grad_norm": 0.4234732951722343, "learning_rate": 4.41457580198587e-06, "loss": 0.0759, "step": 3203 }, { "epoch": 3.5984837849220837, "grad_norm": 0.4267600293298272, "learning_rate": 4.408070965292534e-06, "loss": 0.08, "step": 3204 }, { "epoch": 3.5996069072020216, "grad_norm": 0.43857035097482017, "learning_rate": 4.4015695693746685e-06, "loss": 0.0782, "step": 3205 }, { "epoch": 3.60073002948196, "grad_norm": 0.4662120376628465, "learning_rate": 4.395071618232656e-06, "loss": 0.0829, "step": 3206 }, { "epoch": 3.6018531517618984, "grad_norm": 0.4353076779897321, "learning_rate": 4.3885771158647595e-06, "loss": 0.0766, "step": 3207 }, { "epoch": 3.6029762740418363, "grad_norm": 0.420745107025468, "learning_rate": 4.38208606626711e-06, "loss": 0.0788, "step": 3208 }, { "epoch": 3.6040993963217747, "grad_norm": 0.43420825398499724, "learning_rate": 4.375598473433727e-06, "loss": 0.082, "step": 3209 }, { "epoch": 3.6052225186017126, "grad_norm": 0.43624668378544307, "learning_rate": 4.369114341356497e-06, "loss": 0.0782, "step": 3210 }, { "epoch": 3.606345640881651, "grad_norm": 0.4286224207837259, "learning_rate": 4.362633674025178e-06, "loss": 0.0752, "step": 3211 }, { "epoch": 3.6074687631615894, "grad_norm": 0.45618447258162387, "learning_rate": 4.3561564754274e-06, "loss": 0.0821, "step": 3212 }, { "epoch": 3.6085918854415273, "grad_norm": 0.4245976031185963, "learning_rate": 4.349682749548647e-06, "loss": 0.0782, "step": 3213 }, { "epoch": 3.6097150077214657, "grad_norm": 0.4576618374401797, "learning_rate": 4.3432125003722754e-06, "loss": 0.0843, "step": 3214 }, { "epoch": 3.6108381300014036, "grad_norm": 0.48128056616977466, "learning_rate": 4.3367457318795034e-06, "loss": 0.0855, "step": 3215 }, { "epoch": 3.611961252281342, "grad_norm": 0.4573589667687167, "learning_rate": 4.330282448049405e-06, "loss": 0.0869, "step": 3216 }, { "epoch": 3.6130843745612804, "grad_norm": 0.41682956812824573, "learning_rate": 4.323822652858911e-06, "loss": 0.079, "step": 3217 }, { "epoch": 3.6142074968412183, "grad_norm": 0.44417693491582083, "learning_rate": 4.3173663502827985e-06, "loss": 0.0795, "step": 3218 }, { "epoch": 3.6153306191211567, "grad_norm": 0.45615707711819087, "learning_rate": 4.310913544293706e-06, "loss": 0.0842, "step": 3219 }, { "epoch": 3.616453741401095, "grad_norm": 0.46322398924470976, "learning_rate": 4.304464238862115e-06, "loss": 0.0813, "step": 3220 }, { "epoch": 3.617576863681033, "grad_norm": 0.43509351671563573, "learning_rate": 4.298018437956354e-06, "loss": 0.0788, "step": 3221 }, { "epoch": 3.6186999859609714, "grad_norm": 0.43286932500602177, "learning_rate": 4.2915761455425965e-06, "loss": 0.0739, "step": 3222 }, { "epoch": 3.61982310824091, "grad_norm": 0.4596133947427022, "learning_rate": 4.285137365584854e-06, "loss": 0.0875, "step": 3223 }, { "epoch": 3.6209462305208477, "grad_norm": 0.4481272875661536, "learning_rate": 4.27870210204498e-06, "loss": 0.0832, "step": 3224 }, { "epoch": 3.622069352800786, "grad_norm": 0.43075573098301184, "learning_rate": 4.272270358882667e-06, "loss": 0.0785, "step": 3225 }, { "epoch": 3.6231924750807245, "grad_norm": 0.44064062012364946, "learning_rate": 4.265842140055428e-06, "loss": 0.0743, "step": 3226 }, { "epoch": 3.6243155973606624, "grad_norm": 0.44599199599462813, "learning_rate": 4.2594174495186225e-06, "loss": 0.0768, "step": 3227 }, { "epoch": 3.625438719640601, "grad_norm": 0.4756212129886129, "learning_rate": 4.252996291225433e-06, "loss": 0.0832, "step": 3228 }, { "epoch": 3.626561841920539, "grad_norm": 0.44272449482968135, "learning_rate": 4.2465786691268675e-06, "loss": 0.0798, "step": 3229 }, { "epoch": 3.627684964200477, "grad_norm": 0.44797258468509793, "learning_rate": 4.240164587171761e-06, "loss": 0.0745, "step": 3230 }, { "epoch": 3.6288080864804155, "grad_norm": 0.44058249923913984, "learning_rate": 4.233754049306772e-06, "loss": 0.076, "step": 3231 }, { "epoch": 3.629931208760354, "grad_norm": 0.43397062915249096, "learning_rate": 4.227347059476367e-06, "loss": 0.078, "step": 3232 }, { "epoch": 3.631054331040292, "grad_norm": 0.43034776436202615, "learning_rate": 4.220943621622841e-06, "loss": 0.0777, "step": 3233 }, { "epoch": 3.6321774533202302, "grad_norm": 0.46052076524797797, "learning_rate": 4.214543739686302e-06, "loss": 0.0769, "step": 3234 }, { "epoch": 3.6333005756001686, "grad_norm": 0.4761059654293087, "learning_rate": 4.208147417604665e-06, "loss": 0.0866, "step": 3235 }, { "epoch": 3.6344236978801066, "grad_norm": 0.4582874720771719, "learning_rate": 4.2017546593136615e-06, "loss": 0.0816, "step": 3236 }, { "epoch": 3.635546820160045, "grad_norm": 0.4596850731102858, "learning_rate": 4.195365468746821e-06, "loss": 0.0836, "step": 3237 }, { "epoch": 3.6366699424399833, "grad_norm": 0.4451422703440366, "learning_rate": 4.188979849835483e-06, "loss": 0.084, "step": 3238 }, { "epoch": 3.6377930647199213, "grad_norm": 0.45215038572014205, "learning_rate": 4.182597806508791e-06, "loss": 0.0803, "step": 3239 }, { "epoch": 3.6389161869998596, "grad_norm": 0.4545896148997309, "learning_rate": 4.176219342693687e-06, "loss": 0.0817, "step": 3240 }, { "epoch": 3.640039309279798, "grad_norm": 0.4415219136001395, "learning_rate": 4.1698444623149125e-06, "loss": 0.0777, "step": 3241 }, { "epoch": 3.641162431559736, "grad_norm": 0.4290457562106187, "learning_rate": 4.163473169294995e-06, "loss": 0.0756, "step": 3242 }, { "epoch": 3.6422855538396743, "grad_norm": 0.4362701926456306, "learning_rate": 4.1571054675542645e-06, "loss": 0.0749, "step": 3243 }, { "epoch": 3.6434086761196127, "grad_norm": 0.4439307907074262, "learning_rate": 4.150741361010837e-06, "loss": 0.0795, "step": 3244 }, { "epoch": 3.6445317983995507, "grad_norm": 0.4603100158815367, "learning_rate": 4.1443808535806195e-06, "loss": 0.0877, "step": 3245 }, { "epoch": 3.645654920679489, "grad_norm": 0.42378544801862755, "learning_rate": 4.138023949177303e-06, "loss": 0.073, "step": 3246 }, { "epoch": 3.6467780429594274, "grad_norm": 0.4225346697676541, "learning_rate": 4.131670651712357e-06, "loss": 0.0748, "step": 3247 }, { "epoch": 3.6479011652393654, "grad_norm": 0.4518554843426758, "learning_rate": 4.125320965095037e-06, "loss": 0.0785, "step": 3248 }, { "epoch": 3.6490242875193037, "grad_norm": 0.470687474826635, "learning_rate": 4.1189748932323735e-06, "loss": 0.0827, "step": 3249 }, { "epoch": 3.650147409799242, "grad_norm": 0.4506569877630462, "learning_rate": 4.112632440029176e-06, "loss": 0.078, "step": 3250 }, { "epoch": 3.65127053207918, "grad_norm": 0.44578298786080667, "learning_rate": 4.10629360938803e-06, "loss": 0.0823, "step": 3251 }, { "epoch": 3.6523936543591184, "grad_norm": 0.44292577788828275, "learning_rate": 4.099958405209281e-06, "loss": 0.0796, "step": 3252 }, { "epoch": 3.653516776639057, "grad_norm": 0.4464121497898923, "learning_rate": 4.093626831391051e-06, "loss": 0.0729, "step": 3253 }, { "epoch": 3.6546398989189948, "grad_norm": 0.44330661042864056, "learning_rate": 4.087298891829229e-06, "loss": 0.0753, "step": 3254 }, { "epoch": 3.655763021198933, "grad_norm": 0.45305772566991315, "learning_rate": 4.080974590417464e-06, "loss": 0.0766, "step": 3255 }, { "epoch": 3.6568861434788715, "grad_norm": 0.45177947052718287, "learning_rate": 4.074653931047175e-06, "loss": 0.0755, "step": 3256 }, { "epoch": 3.6580092657588095, "grad_norm": 0.45801600980210305, "learning_rate": 4.068336917607521e-06, "loss": 0.0765, "step": 3257 }, { "epoch": 3.659132388038748, "grad_norm": 0.4554130421647156, "learning_rate": 4.0620235539854394e-06, "loss": 0.0773, "step": 3258 }, { "epoch": 3.660255510318686, "grad_norm": 0.4463096526208407, "learning_rate": 4.0557138440656084e-06, "loss": 0.0831, "step": 3259 }, { "epoch": 3.661378632598624, "grad_norm": 0.45167705223108046, "learning_rate": 4.049407791730464e-06, "loss": 0.0774, "step": 3260 }, { "epoch": 3.6625017548785626, "grad_norm": 0.45315081343108904, "learning_rate": 4.043105400860191e-06, "loss": 0.0804, "step": 3261 }, { "epoch": 3.6636248771585005, "grad_norm": 0.43944579518546123, "learning_rate": 4.036806675332715e-06, "loss": 0.0771, "step": 3262 }, { "epoch": 3.664747999438439, "grad_norm": 0.4617022800989824, "learning_rate": 4.030511619023714e-06, "loss": 0.0838, "step": 3263 }, { "epoch": 3.665871121718377, "grad_norm": 0.8198470447722894, "learning_rate": 4.0242202358066026e-06, "loss": 0.089, "step": 3264 }, { "epoch": 3.666994243998315, "grad_norm": 0.4638960398681334, "learning_rate": 4.017932529552543e-06, "loss": 0.0725, "step": 3265 }, { "epoch": 3.6681173662782536, "grad_norm": 0.4669132218710685, "learning_rate": 4.011648504130427e-06, "loss": 0.0883, "step": 3266 }, { "epoch": 3.6692404885581915, "grad_norm": 0.4316332260721309, "learning_rate": 4.0053681634068854e-06, "loss": 0.0817, "step": 3267 }, { "epoch": 3.67036361083813, "grad_norm": 0.4386821038556611, "learning_rate": 3.9990915112462805e-06, "loss": 0.0784, "step": 3268 }, { "epoch": 3.6714867331180683, "grad_norm": 0.43782337955114087, "learning_rate": 3.992818551510705e-06, "loss": 0.0799, "step": 3269 }, { "epoch": 3.672609855398006, "grad_norm": 0.46001973641130106, "learning_rate": 3.986549288059985e-06, "loss": 0.0814, "step": 3270 }, { "epoch": 3.6737329776779446, "grad_norm": 0.45245647722083326, "learning_rate": 3.98028372475166e-06, "loss": 0.0813, "step": 3271 }, { "epoch": 3.674856099957883, "grad_norm": 0.43678629728053836, "learning_rate": 3.974021865441003e-06, "loss": 0.0799, "step": 3272 }, { "epoch": 3.675979222237821, "grad_norm": 0.4176324355614767, "learning_rate": 3.967763713981005e-06, "loss": 0.0796, "step": 3273 }, { "epoch": 3.6771023445177593, "grad_norm": 0.43709690178412436, "learning_rate": 3.961509274222376e-06, "loss": 0.0835, "step": 3274 }, { "epoch": 3.6782254667976977, "grad_norm": 0.47658335386852513, "learning_rate": 3.955258550013544e-06, "loss": 0.0859, "step": 3275 }, { "epoch": 3.6793485890776356, "grad_norm": 0.441801085296526, "learning_rate": 3.949011545200643e-06, "loss": 0.0792, "step": 3276 }, { "epoch": 3.680471711357574, "grad_norm": 0.42725668162684416, "learning_rate": 3.9427682636275256e-06, "loss": 0.0757, "step": 3277 }, { "epoch": 3.6815948336375124, "grad_norm": 0.4409037890558187, "learning_rate": 3.936528709135752e-06, "loss": 0.0764, "step": 3278 }, { "epoch": 3.6827179559174503, "grad_norm": 0.4215867066752958, "learning_rate": 3.93029288556459e-06, "loss": 0.0821, "step": 3279 }, { "epoch": 3.6838410781973887, "grad_norm": 0.4602625173462566, "learning_rate": 3.924060796751012e-06, "loss": 0.08, "step": 3280 }, { "epoch": 3.684964200477327, "grad_norm": 0.4689840196628956, "learning_rate": 3.9178324465296854e-06, "loss": 0.079, "step": 3281 }, { "epoch": 3.686087322757265, "grad_norm": 0.45249515865265794, "learning_rate": 3.911607838732986e-06, "loss": 0.0861, "step": 3282 }, { "epoch": 3.6872104450372034, "grad_norm": 0.4505909397106748, "learning_rate": 3.9053869771909835e-06, "loss": 0.0816, "step": 3283 }, { "epoch": 3.688333567317142, "grad_norm": 0.4357114391116514, "learning_rate": 3.899169865731441e-06, "loss": 0.0793, "step": 3284 }, { "epoch": 3.6894566895970797, "grad_norm": 0.45612069756713314, "learning_rate": 3.892956508179822e-06, "loss": 0.0817, "step": 3285 }, { "epoch": 3.690579811877018, "grad_norm": 0.4559467390418769, "learning_rate": 3.886746908359264e-06, "loss": 0.0832, "step": 3286 }, { "epoch": 3.6917029341569565, "grad_norm": 0.43075052200607683, "learning_rate": 3.880541070090607e-06, "loss": 0.0753, "step": 3287 }, { "epoch": 3.6928260564368944, "grad_norm": 0.4635796785903508, "learning_rate": 3.874338997192371e-06, "loss": 0.084, "step": 3288 }, { "epoch": 3.693949178716833, "grad_norm": 0.4313194092470317, "learning_rate": 3.8681406934807585e-06, "loss": 0.0743, "step": 3289 }, { "epoch": 3.695072300996771, "grad_norm": 0.42259202051656924, "learning_rate": 3.8619461627696605e-06, "loss": 0.0755, "step": 3290 }, { "epoch": 3.696195423276709, "grad_norm": 0.4529194564392549, "learning_rate": 3.855755408870631e-06, "loss": 0.0792, "step": 3291 }, { "epoch": 3.6973185455566475, "grad_norm": 0.46556671089771, "learning_rate": 3.8495684355929115e-06, "loss": 0.0853, "step": 3292 }, { "epoch": 3.698441667836586, "grad_norm": 0.4448844328456529, "learning_rate": 3.8433852467434175e-06, "loss": 0.0786, "step": 3293 }, { "epoch": 3.699564790116524, "grad_norm": 0.4432713263856399, "learning_rate": 3.837205846126731e-06, "loss": 0.0782, "step": 3294 }, { "epoch": 3.700687912396462, "grad_norm": 0.4211879529566886, "learning_rate": 3.83103023754511e-06, "loss": 0.0798, "step": 3295 }, { "epoch": 3.7018110346764006, "grad_norm": 0.4743647041115318, "learning_rate": 3.824858424798467e-06, "loss": 0.0883, "step": 3296 }, { "epoch": 3.7029341569563385, "grad_norm": 0.4248995644146303, "learning_rate": 3.8186904116843895e-06, "loss": 0.0728, "step": 3297 }, { "epoch": 3.704057279236277, "grad_norm": 0.4378423244099125, "learning_rate": 3.8125262019981224e-06, "loss": 0.0767, "step": 3298 }, { "epoch": 3.7051804015162153, "grad_norm": 0.4518707599907417, "learning_rate": 3.8063657995325743e-06, "loss": 0.0856, "step": 3299 }, { "epoch": 3.7063035237961532, "grad_norm": 0.45215273429838926, "learning_rate": 3.8002092080783116e-06, "loss": 0.0765, "step": 3300 }, { "epoch": 3.7074266460760916, "grad_norm": 0.4447025106277428, "learning_rate": 3.794056431423545e-06, "loss": 0.0786, "step": 3301 }, { "epoch": 3.70854976835603, "grad_norm": 0.4459028238128283, "learning_rate": 3.787907473354149e-06, "loss": 0.082, "step": 3302 }, { "epoch": 3.709672890635968, "grad_norm": 0.4404022767102967, "learning_rate": 3.781762337653646e-06, "loss": 0.0785, "step": 3303 }, { "epoch": 3.7107960129159063, "grad_norm": 0.4450961184830428, "learning_rate": 3.7756210281032092e-06, "loss": 0.0828, "step": 3304 }, { "epoch": 3.7119191351958447, "grad_norm": 0.423474128079963, "learning_rate": 3.769483548481646e-06, "loss": 0.0727, "step": 3305 }, { "epoch": 3.7130422574757826, "grad_norm": 0.4556402049278725, "learning_rate": 3.7633499025654186e-06, "loss": 0.0801, "step": 3306 }, { "epoch": 3.714165379755721, "grad_norm": 0.45401931123707706, "learning_rate": 3.7572200941286284e-06, "loss": 0.0828, "step": 3307 }, { "epoch": 3.715288502035659, "grad_norm": 0.4549636700971652, "learning_rate": 3.7510941269430124e-06, "loss": 0.0837, "step": 3308 }, { "epoch": 3.7164116243155974, "grad_norm": 0.45729117494453114, "learning_rate": 3.744972004777947e-06, "loss": 0.0787, "step": 3309 }, { "epoch": 3.7175347465955357, "grad_norm": 0.45161195568530854, "learning_rate": 3.7388537314004394e-06, "loss": 0.0768, "step": 3310 }, { "epoch": 3.7186578688754737, "grad_norm": 0.4400234197426347, "learning_rate": 3.732739310575132e-06, "loss": 0.0745, "step": 3311 }, { "epoch": 3.719780991155412, "grad_norm": 0.4343707184077051, "learning_rate": 3.7266287460642956e-06, "loss": 0.0817, "step": 3312 }, { "epoch": 3.7209041134353504, "grad_norm": 0.4394923766740628, "learning_rate": 3.7205220416278263e-06, "loss": 0.0747, "step": 3313 }, { "epoch": 3.7220272357152884, "grad_norm": 0.43402411147419445, "learning_rate": 3.7144192010232515e-06, "loss": 0.0816, "step": 3314 }, { "epoch": 3.7231503579952268, "grad_norm": 0.4560774435649313, "learning_rate": 3.7083202280057084e-06, "loss": 0.0788, "step": 3315 }, { "epoch": 3.7242734802751647, "grad_norm": 0.4414913360932274, "learning_rate": 3.702225126327965e-06, "loss": 0.0742, "step": 3316 }, { "epoch": 3.725396602555103, "grad_norm": 0.43653829484825973, "learning_rate": 3.6961338997404062e-06, "loss": 0.0766, "step": 3317 }, { "epoch": 3.7265197248350415, "grad_norm": 0.4597215590802913, "learning_rate": 3.6900465519910288e-06, "loss": 0.086, "step": 3318 }, { "epoch": 3.7276428471149794, "grad_norm": 0.45688319712825454, "learning_rate": 3.6839630868254505e-06, "loss": 0.086, "step": 3319 }, { "epoch": 3.728765969394918, "grad_norm": 0.4613315368292363, "learning_rate": 3.6778835079868857e-06, "loss": 0.0794, "step": 3320 }, { "epoch": 3.729889091674856, "grad_norm": 0.45526897285581736, "learning_rate": 3.67180781921617e-06, "loss": 0.0816, "step": 3321 }, { "epoch": 3.731012213954794, "grad_norm": 0.4467162169946555, "learning_rate": 3.6657360242517413e-06, "loss": 0.0795, "step": 3322 }, { "epoch": 3.7321353362347325, "grad_norm": 0.46971814383232896, "learning_rate": 3.6596681268296443e-06, "loss": 0.0985, "step": 3323 }, { "epoch": 3.733258458514671, "grad_norm": 0.4354505523936359, "learning_rate": 3.6536041306835226e-06, "loss": 0.0784, "step": 3324 }, { "epoch": 3.734381580794609, "grad_norm": 0.42196736134846446, "learning_rate": 3.647544039544615e-06, "loss": 0.077, "step": 3325 }, { "epoch": 3.735504703074547, "grad_norm": 0.4436527706104773, "learning_rate": 3.6414878571417667e-06, "loss": 0.0851, "step": 3326 }, { "epoch": 3.7366278253544856, "grad_norm": 0.429806201884575, "learning_rate": 3.6354355872014113e-06, "loss": 0.0726, "step": 3327 }, { "epoch": 3.7377509476344235, "grad_norm": 0.4336025736673827, "learning_rate": 3.629387233447578e-06, "loss": 0.0821, "step": 3328 }, { "epoch": 3.738874069914362, "grad_norm": 0.4602608383711589, "learning_rate": 3.623342799601889e-06, "loss": 0.0804, "step": 3329 }, { "epoch": 3.7399971921943003, "grad_norm": 0.4114914340076706, "learning_rate": 3.617302289383543e-06, "loss": 0.074, "step": 3330 }, { "epoch": 3.741120314474238, "grad_norm": 0.4429172740958033, "learning_rate": 3.6112657065093382e-06, "loss": 0.0788, "step": 3331 }, { "epoch": 3.7422434367541766, "grad_norm": 0.4504323974328796, "learning_rate": 3.6052330546936476e-06, "loss": 0.0791, "step": 3332 }, { "epoch": 3.743366559034115, "grad_norm": 0.4112295106510093, "learning_rate": 3.599204337648431e-06, "loss": 0.0743, "step": 3333 }, { "epoch": 3.744489681314053, "grad_norm": 0.44159456652322426, "learning_rate": 3.593179559083225e-06, "loss": 0.0735, "step": 3334 }, { "epoch": 3.7456128035939913, "grad_norm": 0.4148152260642522, "learning_rate": 3.5871587227051385e-06, "loss": 0.0749, "step": 3335 }, { "epoch": 3.7467359258739297, "grad_norm": 0.47005121017013696, "learning_rate": 3.5811418322188585e-06, "loss": 0.0823, "step": 3336 }, { "epoch": 3.7478590481538676, "grad_norm": 0.4606593735634737, "learning_rate": 3.575128891326647e-06, "loss": 0.0849, "step": 3337 }, { "epoch": 3.748982170433806, "grad_norm": 0.5019905326101969, "learning_rate": 3.56911990372833e-06, "loss": 0.0792, "step": 3338 }, { "epoch": 3.7501052927137444, "grad_norm": 0.4497014234992369, "learning_rate": 3.563114873121308e-06, "loss": 0.0768, "step": 3339 }, { "epoch": 3.7512284149936823, "grad_norm": 0.4280427061355485, "learning_rate": 3.557113803200537e-06, "loss": 0.0757, "step": 3340 }, { "epoch": 3.7523515372736207, "grad_norm": 0.4591535352490078, "learning_rate": 3.5511166976585432e-06, "loss": 0.0836, "step": 3341 }, { "epoch": 3.753474659553559, "grad_norm": 0.4659249443970378, "learning_rate": 3.5451235601854118e-06, "loss": 0.0847, "step": 3342 }, { "epoch": 3.754597781833497, "grad_norm": 0.4465299372843637, "learning_rate": 3.5391343944687906e-06, "loss": 0.075, "step": 3343 }, { "epoch": 3.7557209041134354, "grad_norm": 0.46314659971337047, "learning_rate": 3.533149204193871e-06, "loss": 0.0813, "step": 3344 }, { "epoch": 3.756844026393374, "grad_norm": 0.454679241847546, "learning_rate": 3.527167993043411e-06, "loss": 0.0779, "step": 3345 }, { "epoch": 3.7579671486733117, "grad_norm": 0.432912317003138, "learning_rate": 3.5211907646977152e-06, "loss": 0.0772, "step": 3346 }, { "epoch": 3.75909027095325, "grad_norm": 0.43149778700891545, "learning_rate": 3.5152175228346375e-06, "loss": 0.0754, "step": 3347 }, { "epoch": 3.7602133932331885, "grad_norm": 0.4296614839842937, "learning_rate": 3.5092482711295805e-06, "loss": 0.0777, "step": 3348 }, { "epoch": 3.7613365155131264, "grad_norm": 0.4210158808787179, "learning_rate": 3.5032830132554928e-06, "loss": 0.0722, "step": 3349 }, { "epoch": 3.762459637793065, "grad_norm": 0.4390732400163971, "learning_rate": 3.497321752882856e-06, "loss": 0.0716, "step": 3350 }, { "epoch": 3.763582760073003, "grad_norm": 0.44295131957694994, "learning_rate": 3.4913644936797054e-06, "loss": 0.0751, "step": 3351 }, { "epoch": 3.764705882352941, "grad_norm": 0.4534307190117399, "learning_rate": 3.4854112393116047e-06, "loss": 0.0782, "step": 3352 }, { "epoch": 3.7658290046328795, "grad_norm": 0.4385591150069758, "learning_rate": 3.4794619934416586e-06, "loss": 0.0738, "step": 3353 }, { "epoch": 3.766952126912818, "grad_norm": 0.43649347579395936, "learning_rate": 3.473516759730503e-06, "loss": 0.0768, "step": 3354 }, { "epoch": 3.768075249192756, "grad_norm": 0.437118874941968, "learning_rate": 3.4675755418363054e-06, "loss": 0.0769, "step": 3355 }, { "epoch": 3.769198371472694, "grad_norm": 0.4299095911687841, "learning_rate": 3.461638343414764e-06, "loss": 0.0786, "step": 3356 }, { "epoch": 3.7703214937526326, "grad_norm": 0.4697008287502685, "learning_rate": 3.455705168119101e-06, "loss": 0.0885, "step": 3357 }, { "epoch": 3.7714446160325705, "grad_norm": 0.46789742839025383, "learning_rate": 3.4497760196000686e-06, "loss": 0.0846, "step": 3358 }, { "epoch": 3.772567738312509, "grad_norm": 0.42160537923493707, "learning_rate": 3.4438509015059284e-06, "loss": 0.0742, "step": 3359 }, { "epoch": 3.773690860592447, "grad_norm": 0.42954624698813265, "learning_rate": 3.437929817482477e-06, "loss": 0.078, "step": 3360 }, { "epoch": 3.7748139828723852, "grad_norm": 0.436902871143199, "learning_rate": 3.432012771173021e-06, "loss": 0.0786, "step": 3361 }, { "epoch": 3.7759371051523236, "grad_norm": 0.4617151544078479, "learning_rate": 3.4260997662183836e-06, "loss": 0.0817, "step": 3362 }, { "epoch": 3.7770602274322616, "grad_norm": 0.4527611616144744, "learning_rate": 3.4201908062569066e-06, "loss": 0.0751, "step": 3363 }, { "epoch": 3.7781833497122, "grad_norm": 0.4238491787758921, "learning_rate": 3.4142858949244305e-06, "loss": 0.0733, "step": 3364 }, { "epoch": 3.779306471992138, "grad_norm": 0.4466064502716907, "learning_rate": 3.408385035854317e-06, "loss": 0.0743, "step": 3365 }, { "epoch": 3.7804295942720763, "grad_norm": 0.4426815137139256, "learning_rate": 3.4024882326774266e-06, "loss": 0.0714, "step": 3366 }, { "epoch": 3.7815527165520146, "grad_norm": 0.4699960233638405, "learning_rate": 3.39659548902213e-06, "loss": 0.0803, "step": 3367 }, { "epoch": 3.7826758388319526, "grad_norm": 0.4482404399293802, "learning_rate": 3.390706808514299e-06, "loss": 0.0819, "step": 3368 }, { "epoch": 3.783798961111891, "grad_norm": 0.45212437238597697, "learning_rate": 3.3848221947772976e-06, "loss": 0.0761, "step": 3369 }, { "epoch": 3.7849220833918293, "grad_norm": 0.4533539920505649, "learning_rate": 3.378941651431996e-06, "loss": 0.0814, "step": 3370 }, { "epoch": 3.7860452056717673, "grad_norm": 0.448203569611718, "learning_rate": 3.3730651820967588e-06, "loss": 0.076, "step": 3371 }, { "epoch": 3.7871683279517057, "grad_norm": 0.42790870357703314, "learning_rate": 3.36719279038744e-06, "loss": 0.0789, "step": 3372 }, { "epoch": 3.788291450231644, "grad_norm": 0.42797461979429224, "learning_rate": 3.361324479917393e-06, "loss": 0.0734, "step": 3373 }, { "epoch": 3.789414572511582, "grad_norm": 0.44621043727284554, "learning_rate": 3.3554602542974436e-06, "loss": 0.086, "step": 3374 }, { "epoch": 3.7905376947915204, "grad_norm": 0.42240375254920737, "learning_rate": 3.3496001171359204e-06, "loss": 0.0768, "step": 3375 }, { "epoch": 3.7916608170714587, "grad_norm": 0.4386956304607235, "learning_rate": 3.3437440720386294e-06, "loss": 0.076, "step": 3376 }, { "epoch": 3.7927839393513967, "grad_norm": 0.45986967497222725, "learning_rate": 3.3378921226088632e-06, "loss": 0.0827, "step": 3377 }, { "epoch": 3.793907061631335, "grad_norm": 0.44325952459332413, "learning_rate": 3.3320442724473843e-06, "loss": 0.0842, "step": 3378 }, { "epoch": 3.7950301839112734, "grad_norm": 0.43529759847083, "learning_rate": 3.326200525152441e-06, "loss": 0.0812, "step": 3379 }, { "epoch": 3.7961533061912114, "grad_norm": 0.4637261053857433, "learning_rate": 3.3203608843197575e-06, "loss": 0.0801, "step": 3380 }, { "epoch": 3.7972764284711498, "grad_norm": 0.44756524271819015, "learning_rate": 3.314525353542527e-06, "loss": 0.0811, "step": 3381 }, { "epoch": 3.798399550751088, "grad_norm": 0.4205094044217173, "learning_rate": 3.308693936411421e-06, "loss": 0.0826, "step": 3382 }, { "epoch": 3.799522673031026, "grad_norm": 0.43291470049655334, "learning_rate": 3.302866636514567e-06, "loss": 0.0833, "step": 3383 }, { "epoch": 3.8006457953109645, "grad_norm": 0.42670722217149304, "learning_rate": 3.29704345743757e-06, "loss": 0.073, "step": 3384 }, { "epoch": 3.801768917590903, "grad_norm": 0.4343627394675364, "learning_rate": 3.2912244027634953e-06, "loss": 0.0799, "step": 3385 }, { "epoch": 3.802892039870841, "grad_norm": 0.4488560245180038, "learning_rate": 3.285409476072874e-06, "loss": 0.0779, "step": 3386 }, { "epoch": 3.804015162150779, "grad_norm": 0.4503612616723187, "learning_rate": 3.2795986809436953e-06, "loss": 0.0808, "step": 3387 }, { "epoch": 3.8051382844307176, "grad_norm": 0.4407667707573012, "learning_rate": 3.2737920209513994e-06, "loss": 0.0746, "step": 3388 }, { "epoch": 3.8062614067106555, "grad_norm": 0.4491010129063716, "learning_rate": 3.267989499668892e-06, "loss": 0.0722, "step": 3389 }, { "epoch": 3.807384528990594, "grad_norm": 0.4329898689532073, "learning_rate": 3.262191120666528e-06, "loss": 0.0774, "step": 3390 }, { "epoch": 3.8085076512705323, "grad_norm": 0.4383306886957197, "learning_rate": 3.2563968875121133e-06, "loss": 0.0766, "step": 3391 }, { "epoch": 3.80963077355047, "grad_norm": 0.4401170472954754, "learning_rate": 3.250606803770904e-06, "loss": 0.0791, "step": 3392 }, { "epoch": 3.8107538958304086, "grad_norm": 0.4532511703655005, "learning_rate": 3.2448208730056053e-06, "loss": 0.0799, "step": 3393 }, { "epoch": 3.811877018110347, "grad_norm": 0.42610017647616477, "learning_rate": 3.2390390987763578e-06, "loss": 0.0785, "step": 3394 }, { "epoch": 3.813000140390285, "grad_norm": 0.4230591818086143, "learning_rate": 3.233261484640753e-06, "loss": 0.0667, "step": 3395 }, { "epoch": 3.8141232626702233, "grad_norm": 0.4625893901658147, "learning_rate": 3.2274880341538216e-06, "loss": 0.0769, "step": 3396 }, { "epoch": 3.8152463849501617, "grad_norm": 0.4400378121499265, "learning_rate": 3.2217187508680314e-06, "loss": 0.0779, "step": 3397 }, { "epoch": 3.8163695072300996, "grad_norm": 0.4490227215560117, "learning_rate": 3.2159536383332846e-06, "loss": 0.0779, "step": 3398 }, { "epoch": 3.817492629510038, "grad_norm": 0.41925773594382665, "learning_rate": 3.21019270009692e-06, "loss": 0.0741, "step": 3399 }, { "epoch": 3.8186157517899764, "grad_norm": 0.4415558039457624, "learning_rate": 3.204435939703705e-06, "loss": 0.0759, "step": 3400 }, { "epoch": 3.8197388740699143, "grad_norm": 0.4451411473808772, "learning_rate": 3.198683360695839e-06, "loss": 0.0753, "step": 3401 }, { "epoch": 3.8208619963498527, "grad_norm": 0.42552871943963727, "learning_rate": 3.192934966612948e-06, "loss": 0.075, "step": 3402 }, { "epoch": 3.821985118629791, "grad_norm": 0.4700312235719863, "learning_rate": 3.187190760992078e-06, "loss": 0.0828, "step": 3403 }, { "epoch": 3.823108240909729, "grad_norm": 0.45339677778386067, "learning_rate": 3.1814507473677047e-06, "loss": 0.0789, "step": 3404 }, { "epoch": 3.8242313631896674, "grad_norm": 0.4331334874484187, "learning_rate": 3.1757149292717216e-06, "loss": 0.0754, "step": 3405 }, { "epoch": 3.8253544854696058, "grad_norm": 0.44163707435590865, "learning_rate": 3.1699833102334397e-06, "loss": 0.079, "step": 3406 }, { "epoch": 3.8264776077495437, "grad_norm": 0.42901426073349475, "learning_rate": 3.164255893779591e-06, "loss": 0.0748, "step": 3407 }, { "epoch": 3.827600730029482, "grad_norm": 0.4246549205961034, "learning_rate": 3.1585326834343123e-06, "loss": 0.0758, "step": 3408 }, { "epoch": 3.82872385230942, "grad_norm": 0.43824663282905246, "learning_rate": 3.152813682719159e-06, "loss": 0.0817, "step": 3409 }, { "epoch": 3.8298469745893584, "grad_norm": 0.4487556162344414, "learning_rate": 3.147098895153098e-06, "loss": 0.0808, "step": 3410 }, { "epoch": 3.830970096869297, "grad_norm": 0.41728849401494356, "learning_rate": 3.141388324252499e-06, "loss": 0.0734, "step": 3411 }, { "epoch": 3.8320932191492347, "grad_norm": 0.42655605860029716, "learning_rate": 3.135681973531144e-06, "loss": 0.081, "step": 3412 }, { "epoch": 3.833216341429173, "grad_norm": 0.4241664691861822, "learning_rate": 3.129979846500205e-06, "loss": 0.0779, "step": 3413 }, { "epoch": 3.8343394637091115, "grad_norm": 0.4153186762640536, "learning_rate": 3.1242819466682673e-06, "loss": 0.0771, "step": 3414 }, { "epoch": 3.8354625859890494, "grad_norm": 0.4267033688189136, "learning_rate": 3.1185882775413123e-06, "loss": 0.073, "step": 3415 }, { "epoch": 3.836585708268988, "grad_norm": 0.4457824337627224, "learning_rate": 3.1128988426227193e-06, "loss": 0.0805, "step": 3416 }, { "epoch": 3.8377088305489258, "grad_norm": 0.4281819990435502, "learning_rate": 3.107213645413254e-06, "loss": 0.0783, "step": 3417 }, { "epoch": 3.838831952828864, "grad_norm": 0.42960944794714556, "learning_rate": 3.101532689411085e-06, "loss": 0.0758, "step": 3418 }, { "epoch": 3.8399550751088025, "grad_norm": 0.4270593518801406, "learning_rate": 3.095855978111767e-06, "loss": 0.0763, "step": 3419 }, { "epoch": 3.8410781973887405, "grad_norm": 0.445387965169147, "learning_rate": 3.0901835150082417e-06, "loss": 0.0755, "step": 3420 }, { "epoch": 3.842201319668679, "grad_norm": 0.44342244606556647, "learning_rate": 3.0845153035908415e-06, "loss": 0.0711, "step": 3421 }, { "epoch": 3.843324441948617, "grad_norm": 0.4222486589242842, "learning_rate": 3.0788513473472747e-06, "loss": 0.0778, "step": 3422 }, { "epoch": 3.844447564228555, "grad_norm": 0.43608936121729586, "learning_rate": 3.0731916497626356e-06, "loss": 0.0798, "step": 3423 }, { "epoch": 3.8455706865084935, "grad_norm": 0.4601934819466419, "learning_rate": 3.067536214319402e-06, "loss": 0.0834, "step": 3424 }, { "epoch": 3.846693808788432, "grad_norm": 0.41891016230414746, "learning_rate": 3.0618850444974237e-06, "loss": 0.0703, "step": 3425 }, { "epoch": 3.84781693106837, "grad_norm": 0.4572426188664163, "learning_rate": 3.056238143773932e-06, "loss": 0.0792, "step": 3426 }, { "epoch": 3.8489400533483082, "grad_norm": 0.4518709283094167, "learning_rate": 3.050595515623519e-06, "loss": 0.0793, "step": 3427 }, { "epoch": 3.8500631756282466, "grad_norm": 0.8261186535001159, "learning_rate": 3.0449571635181595e-06, "loss": 0.0906, "step": 3428 }, { "epoch": 3.8511862979081846, "grad_norm": 0.43746333274368837, "learning_rate": 3.0393230909271953e-06, "loss": 0.0806, "step": 3429 }, { "epoch": 3.852309420188123, "grad_norm": 0.4328585063719159, "learning_rate": 3.0336933013173307e-06, "loss": 0.0777, "step": 3430 }, { "epoch": 3.8534325424680613, "grad_norm": 0.43314046732664285, "learning_rate": 3.028067798152643e-06, "loss": 0.0809, "step": 3431 }, { "epoch": 3.8545556647479993, "grad_norm": 0.4373077254389021, "learning_rate": 3.022446584894557e-06, "loss": 0.0788, "step": 3432 }, { "epoch": 3.8556787870279376, "grad_norm": 0.43228751269770593, "learning_rate": 3.0168296650018736e-06, "loss": 0.0777, "step": 3433 }, { "epoch": 3.856801909307876, "grad_norm": 0.43151574407618987, "learning_rate": 3.011217041930743e-06, "loss": 0.0733, "step": 3434 }, { "epoch": 3.857925031587814, "grad_norm": 0.4424834168473701, "learning_rate": 3.005608719134674e-06, "loss": 0.0756, "step": 3435 }, { "epoch": 3.8590481538677524, "grad_norm": 0.4685629925803647, "learning_rate": 3.0000047000645305e-06, "loss": 0.0827, "step": 3436 }, { "epoch": 3.8601712761476907, "grad_norm": 0.44357998574627705, "learning_rate": 2.9944049881685265e-06, "loss": 0.0787, "step": 3437 }, { "epoch": 3.8612943984276287, "grad_norm": 0.4253461038308456, "learning_rate": 2.9888095868922297e-06, "loss": 0.0726, "step": 3438 }, { "epoch": 3.862417520707567, "grad_norm": 0.4260652279185222, "learning_rate": 2.9832184996785453e-06, "loss": 0.077, "step": 3439 }, { "epoch": 3.8635406429875054, "grad_norm": 0.471295717516851, "learning_rate": 2.977631729967735e-06, "loss": 0.0836, "step": 3440 }, { "epoch": 3.8646637652674434, "grad_norm": 0.43840929560138525, "learning_rate": 2.9720492811973977e-06, "loss": 0.076, "step": 3441 }, { "epoch": 3.8657868875473818, "grad_norm": 0.453205839637241, "learning_rate": 2.966471156802477e-06, "loss": 0.075, "step": 3442 }, { "epoch": 3.86691000982732, "grad_norm": 0.4288959454696245, "learning_rate": 2.960897360215255e-06, "loss": 0.0803, "step": 3443 }, { "epoch": 3.868033132107258, "grad_norm": 0.43608110599439925, "learning_rate": 2.9553278948653484e-06, "loss": 0.0804, "step": 3444 }, { "epoch": 3.8691562543871965, "grad_norm": 0.4411598218041702, "learning_rate": 2.949762764179711e-06, "loss": 0.0791, "step": 3445 }, { "epoch": 3.870279376667135, "grad_norm": 0.45413949768533496, "learning_rate": 2.9442019715826318e-06, "loss": 0.0777, "step": 3446 }, { "epoch": 3.871402498947073, "grad_norm": 0.4448584359461084, "learning_rate": 2.938645520495723e-06, "loss": 0.0769, "step": 3447 }, { "epoch": 3.872525621227011, "grad_norm": 0.4551946661458706, "learning_rate": 2.933093414337932e-06, "loss": 0.0783, "step": 3448 }, { "epoch": 3.8736487435069495, "grad_norm": 0.4458905959273584, "learning_rate": 2.927545656525531e-06, "loss": 0.0765, "step": 3449 }, { "epoch": 3.8747718657868875, "grad_norm": 0.4281315602250711, "learning_rate": 2.922002250472119e-06, "loss": 0.0749, "step": 3450 }, { "epoch": 3.875894988066826, "grad_norm": 0.43195709338842564, "learning_rate": 2.9164631995886095e-06, "loss": 0.0784, "step": 3451 }, { "epoch": 3.8770181103467642, "grad_norm": 0.415873215959344, "learning_rate": 2.9109285072832437e-06, "loss": 0.0783, "step": 3452 }, { "epoch": 3.878141232626702, "grad_norm": 0.450371175353607, "learning_rate": 2.9053981769615792e-06, "loss": 0.0779, "step": 3453 }, { "epoch": 3.8792643549066406, "grad_norm": 0.47287548262861223, "learning_rate": 2.899872212026489e-06, "loss": 0.089, "step": 3454 }, { "epoch": 3.880387477186579, "grad_norm": 0.44671822443338133, "learning_rate": 2.894350615878163e-06, "loss": 0.0771, "step": 3455 }, { "epoch": 3.881510599466517, "grad_norm": 0.4287761263930842, "learning_rate": 2.8888333919140954e-06, "loss": 0.0745, "step": 3456 }, { "epoch": 3.8826337217464553, "grad_norm": 0.42522941098200495, "learning_rate": 2.883320543529096e-06, "loss": 0.0733, "step": 3457 }, { "epoch": 3.8837568440263937, "grad_norm": 0.46005795014191836, "learning_rate": 2.8778120741152805e-06, "loss": 0.0813, "step": 3458 }, { "epoch": 3.8848799663063316, "grad_norm": 0.43782850129874495, "learning_rate": 2.872307987062073e-06, "loss": 0.0809, "step": 3459 }, { "epoch": 3.88600308858627, "grad_norm": 0.42888285045593805, "learning_rate": 2.8668082857562006e-06, "loss": 0.0759, "step": 3460 }, { "epoch": 3.887126210866208, "grad_norm": 0.4536689216993471, "learning_rate": 2.8613129735816838e-06, "loss": 0.0819, "step": 3461 }, { "epoch": 3.8882493331461463, "grad_norm": 0.43211539049911346, "learning_rate": 2.855822053919852e-06, "loss": 0.0855, "step": 3462 }, { "epoch": 3.8893724554260847, "grad_norm": 0.42986836248333826, "learning_rate": 2.8503355301493298e-06, "loss": 0.0747, "step": 3463 }, { "epoch": 3.8904955777060226, "grad_norm": 0.441832461225257, "learning_rate": 2.8448534056460332e-06, "loss": 0.0767, "step": 3464 }, { "epoch": 3.891618699985961, "grad_norm": 0.4620375105987588, "learning_rate": 2.839375683783179e-06, "loss": 0.0832, "step": 3465 }, { "epoch": 3.892741822265899, "grad_norm": 0.43964940953553355, "learning_rate": 2.833902367931262e-06, "loss": 0.0746, "step": 3466 }, { "epoch": 3.8938649445458373, "grad_norm": 0.464163071227373, "learning_rate": 2.8284334614580767e-06, "loss": 0.082, "step": 3467 }, { "epoch": 3.8949880668257757, "grad_norm": 0.4290775892676479, "learning_rate": 2.822968967728703e-06, "loss": 0.0742, "step": 3468 }, { "epoch": 3.8961111891057136, "grad_norm": 0.4208068309684712, "learning_rate": 2.8175088901055026e-06, "loss": 0.0751, "step": 3469 }, { "epoch": 3.897234311385652, "grad_norm": 0.4313279478607281, "learning_rate": 2.8120532319481255e-06, "loss": 0.0784, "step": 3470 }, { "epoch": 3.8983574336655904, "grad_norm": 0.4576364929353952, "learning_rate": 2.8066019966134907e-06, "loss": 0.0752, "step": 3471 }, { "epoch": 3.8994805559455283, "grad_norm": 0.45124729742636177, "learning_rate": 2.801155187455807e-06, "loss": 0.0812, "step": 3472 }, { "epoch": 3.9006036782254667, "grad_norm": 0.4274779842231408, "learning_rate": 2.7957128078265574e-06, "loss": 0.076, "step": 3473 }, { "epoch": 3.901726800505405, "grad_norm": 0.4378140528997512, "learning_rate": 2.790274861074497e-06, "loss": 0.0736, "step": 3474 }, { "epoch": 3.902849922785343, "grad_norm": 0.4343770792886928, "learning_rate": 2.7848413505456564e-06, "loss": 0.0743, "step": 3475 }, { "epoch": 3.9039730450652814, "grad_norm": 0.4269837178553136, "learning_rate": 2.7794122795833276e-06, "loss": 0.0785, "step": 3476 }, { "epoch": 3.90509616734522, "grad_norm": 0.44957322685953266, "learning_rate": 2.7739876515280838e-06, "loss": 0.0744, "step": 3477 }, { "epoch": 3.9062192896251577, "grad_norm": 0.4336825511389323, "learning_rate": 2.7685674697177568e-06, "loss": 0.0721, "step": 3478 }, { "epoch": 3.907342411905096, "grad_norm": 0.44723299851333226, "learning_rate": 2.7631517374874427e-06, "loss": 0.0744, "step": 3479 }, { "epoch": 3.9084655341850345, "grad_norm": 0.44066946939094515, "learning_rate": 2.7577404581695035e-06, "loss": 0.0768, "step": 3480 }, { "epoch": 3.9095886564649724, "grad_norm": 0.4500736384140416, "learning_rate": 2.752333635093558e-06, "loss": 0.0754, "step": 3481 }, { "epoch": 3.910711778744911, "grad_norm": 0.4594645625800169, "learning_rate": 2.746931271586484e-06, "loss": 0.0803, "step": 3482 }, { "epoch": 3.911834901024849, "grad_norm": 0.42877398862098437, "learning_rate": 2.7415333709724168e-06, "loss": 0.0736, "step": 3483 }, { "epoch": 3.912958023304787, "grad_norm": 0.42734145184760597, "learning_rate": 2.7361399365727404e-06, "loss": 0.0791, "step": 3484 }, { "epoch": 3.9140811455847255, "grad_norm": 0.4340445049242047, "learning_rate": 2.7307509717060954e-06, "loss": 0.0819, "step": 3485 }, { "epoch": 3.915204267864664, "grad_norm": 0.4341246214018538, "learning_rate": 2.725366479688373e-06, "loss": 0.0756, "step": 3486 }, { "epoch": 3.916327390144602, "grad_norm": 0.4236324146454293, "learning_rate": 2.719986463832708e-06, "loss": 0.0757, "step": 3487 }, { "epoch": 3.9174505124245402, "grad_norm": 0.46626173343907984, "learning_rate": 2.714610927449486e-06, "loss": 0.0853, "step": 3488 }, { "epoch": 3.9185736347044786, "grad_norm": 0.4377237420668139, "learning_rate": 2.7092398738463345e-06, "loss": 0.0767, "step": 3489 }, { "epoch": 3.9196967569844166, "grad_norm": 0.44860358330503586, "learning_rate": 2.7038733063281177e-06, "loss": 0.073, "step": 3490 }, { "epoch": 3.920819879264355, "grad_norm": 0.43604700182859185, "learning_rate": 2.698511228196945e-06, "loss": 0.0763, "step": 3491 }, { "epoch": 3.9219430015442933, "grad_norm": 0.4401389309270918, "learning_rate": 2.6931536427521632e-06, "loss": 0.0846, "step": 3492 }, { "epoch": 3.9230661238242313, "grad_norm": 0.4331666233322056, "learning_rate": 2.687800553290353e-06, "loss": 0.0713, "step": 3493 }, { "epoch": 3.9241892461041696, "grad_norm": 0.44560495275667983, "learning_rate": 2.6824519631053324e-06, "loss": 0.0756, "step": 3494 }, { "epoch": 3.925312368384108, "grad_norm": 0.4209276031297463, "learning_rate": 2.6771078754881417e-06, "loss": 0.0712, "step": 3495 }, { "epoch": 3.926435490664046, "grad_norm": 0.4531187078058862, "learning_rate": 2.6717682937270605e-06, "loss": 0.0865, "step": 3496 }, { "epoch": 3.9275586129439843, "grad_norm": 0.42791348822861297, "learning_rate": 2.6664332211075915e-06, "loss": 0.0734, "step": 3497 }, { "epoch": 3.9286817352239227, "grad_norm": 0.42910458284895564, "learning_rate": 2.6611026609124647e-06, "loss": 0.0767, "step": 3498 }, { "epoch": 3.9298048575038607, "grad_norm": 0.4360003667897708, "learning_rate": 2.6557766164216334e-06, "loss": 0.0821, "step": 3499 }, { "epoch": 3.930927979783799, "grad_norm": 0.4473465009354422, "learning_rate": 2.6504550909122674e-06, "loss": 0.0824, "step": 3500 }, { "epoch": 3.9320511020637374, "grad_norm": 0.4559995591973532, "learning_rate": 2.6451380876587617e-06, "loss": 0.0773, "step": 3501 }, { "epoch": 3.9331742243436754, "grad_norm": 0.44013072714573725, "learning_rate": 2.639825609932727e-06, "loss": 0.0773, "step": 3502 }, { "epoch": 3.9342973466236137, "grad_norm": 0.4385601175313549, "learning_rate": 2.63451766100299e-06, "loss": 0.0766, "step": 3503 }, { "epoch": 3.935420468903552, "grad_norm": 0.46222160717340305, "learning_rate": 2.6292142441355915e-06, "loss": 0.0834, "step": 3504 }, { "epoch": 3.93654359118349, "grad_norm": 0.44847101380558047, "learning_rate": 2.6239153625937786e-06, "loss": 0.0777, "step": 3505 }, { "epoch": 3.9376667134634284, "grad_norm": 0.4335638035588458, "learning_rate": 2.6186210196380135e-06, "loss": 0.0745, "step": 3506 }, { "epoch": 3.938789835743367, "grad_norm": 0.45155174333318204, "learning_rate": 2.613331218525963e-06, "loss": 0.0813, "step": 3507 }, { "epoch": 3.9399129580233048, "grad_norm": 0.44021250781644283, "learning_rate": 2.6080459625124997e-06, "loss": 0.0824, "step": 3508 }, { "epoch": 3.941036080303243, "grad_norm": 0.42626782736040425, "learning_rate": 2.602765254849704e-06, "loss": 0.0759, "step": 3509 }, { "epoch": 3.942159202583181, "grad_norm": 0.4502768942497821, "learning_rate": 2.597489098786847e-06, "loss": 0.0808, "step": 3510 }, { "epoch": 3.9432823248631195, "grad_norm": 0.46342459388260226, "learning_rate": 2.5922174975704083e-06, "loss": 0.0765, "step": 3511 }, { "epoch": 3.944405447143058, "grad_norm": 0.43079292525854407, "learning_rate": 2.5869504544440625e-06, "loss": 0.0804, "step": 3512 }, { "epoch": 3.945528569422996, "grad_norm": 0.43315675855851205, "learning_rate": 2.58168797264868e-06, "loss": 0.0726, "step": 3513 }, { "epoch": 3.946651691702934, "grad_norm": 0.44735874470841014, "learning_rate": 2.576430055422324e-06, "loss": 0.0743, "step": 3514 }, { "epoch": 3.947774813982872, "grad_norm": 0.4564757971460192, "learning_rate": 2.5711767060002457e-06, "loss": 0.0756, "step": 3515 }, { "epoch": 3.9488979362628105, "grad_norm": 0.44223205696360607, "learning_rate": 2.5659279276148896e-06, "loss": 0.0718, "step": 3516 }, { "epoch": 3.950021058542749, "grad_norm": 0.452807039724236, "learning_rate": 2.5606837234958893e-06, "loss": 0.0807, "step": 3517 }, { "epoch": 3.951144180822687, "grad_norm": 0.44686811295375434, "learning_rate": 2.5554440968700587e-06, "loss": 0.0763, "step": 3518 }, { "epoch": 3.952267303102625, "grad_norm": 0.43505801163376684, "learning_rate": 2.550209050961403e-06, "loss": 0.0706, "step": 3519 }, { "epoch": 3.9533904253825636, "grad_norm": 0.4247883961735843, "learning_rate": 2.544978588991096e-06, "loss": 0.0744, "step": 3520 }, { "epoch": 3.9545135476625015, "grad_norm": 0.45479121928721894, "learning_rate": 2.5397527141775025e-06, "loss": 0.0843, "step": 3521 }, { "epoch": 3.95563666994244, "grad_norm": 0.4374273400030285, "learning_rate": 2.534531429736159e-06, "loss": 0.074, "step": 3522 }, { "epoch": 3.9567597922223783, "grad_norm": 0.4149773313963219, "learning_rate": 2.5293147388797813e-06, "loss": 0.0708, "step": 3523 }, { "epoch": 3.957882914502316, "grad_norm": 0.4295010740541775, "learning_rate": 2.524102644818256e-06, "loss": 0.0758, "step": 3524 }, { "epoch": 3.9590060367822546, "grad_norm": 0.4486488019169518, "learning_rate": 2.5188951507586422e-06, "loss": 0.0808, "step": 3525 }, { "epoch": 3.960129159062193, "grad_norm": 0.4462521068537958, "learning_rate": 2.5136922599051684e-06, "loss": 0.0733, "step": 3526 }, { "epoch": 3.961252281342131, "grad_norm": 0.44838324716484473, "learning_rate": 2.508493975459232e-06, "loss": 0.0813, "step": 3527 }, { "epoch": 3.9623754036220693, "grad_norm": 0.4313406939987042, "learning_rate": 2.50330030061939e-06, "loss": 0.0764, "step": 3528 }, { "epoch": 3.9634985259020077, "grad_norm": 0.429785435524192, "learning_rate": 2.498111238581371e-06, "loss": 0.0758, "step": 3529 }, { "epoch": 3.9646216481819456, "grad_norm": 0.4417273682496011, "learning_rate": 2.492926792538061e-06, "loss": 0.0812, "step": 3530 }, { "epoch": 3.965744770461884, "grad_norm": 0.42584217521194967, "learning_rate": 2.487746965679507e-06, "loss": 0.0844, "step": 3531 }, { "epoch": 3.9668678927418224, "grad_norm": 0.4512431805814872, "learning_rate": 2.4825717611929144e-06, "loss": 0.0844, "step": 3532 }, { "epoch": 3.9679910150217603, "grad_norm": 0.44924315703937545, "learning_rate": 2.4774011822626455e-06, "loss": 0.0827, "step": 3533 }, { "epoch": 3.9691141373016987, "grad_norm": 0.45652249852342425, "learning_rate": 2.472235232070208e-06, "loss": 0.085, "step": 3534 }, { "epoch": 3.970237259581637, "grad_norm": 0.4313367900118166, "learning_rate": 2.4670739137942723e-06, "loss": 0.0784, "step": 3535 }, { "epoch": 3.971360381861575, "grad_norm": 0.43828415959844774, "learning_rate": 2.4619172306106533e-06, "loss": 0.0759, "step": 3536 }, { "epoch": 3.9724835041415134, "grad_norm": 0.4205420662994824, "learning_rate": 2.456765185692315e-06, "loss": 0.0765, "step": 3537 }, { "epoch": 3.973606626421452, "grad_norm": 0.4438761928014099, "learning_rate": 2.451617782209371e-06, "loss": 0.077, "step": 3538 }, { "epoch": 3.9747297487013897, "grad_norm": 0.4164454101954422, "learning_rate": 2.446475023329068e-06, "loss": 0.0764, "step": 3539 }, { "epoch": 3.975852870981328, "grad_norm": 0.4441939122586264, "learning_rate": 2.441336912215807e-06, "loss": 0.0797, "step": 3540 }, { "epoch": 3.9769759932612665, "grad_norm": 0.42960608918880705, "learning_rate": 2.4362034520311216e-06, "loss": 0.0729, "step": 3541 }, { "epoch": 3.9780991155412044, "grad_norm": 0.44957178328850944, "learning_rate": 2.4310746459336896e-06, "loss": 0.0802, "step": 3542 }, { "epoch": 3.979222237821143, "grad_norm": 0.4329168884237763, "learning_rate": 2.4259504970793226e-06, "loss": 0.0727, "step": 3543 }, { "epoch": 3.980345360101081, "grad_norm": 0.45775710829454436, "learning_rate": 2.4208310086209607e-06, "loss": 0.0765, "step": 3544 }, { "epoch": 3.981468482381019, "grad_norm": 0.4377294306740636, "learning_rate": 2.415716183708684e-06, "loss": 0.0752, "step": 3545 }, { "epoch": 3.9825916046609575, "grad_norm": 0.443470429696325, "learning_rate": 2.4106060254897002e-06, "loss": 0.0803, "step": 3546 }, { "epoch": 3.983714726940896, "grad_norm": 0.45578066038549137, "learning_rate": 2.405500537108347e-06, "loss": 0.0805, "step": 3547 }, { "epoch": 3.984837849220834, "grad_norm": 0.45327149698238034, "learning_rate": 2.4003997217060893e-06, "loss": 0.0744, "step": 3548 }, { "epoch": 3.985960971500772, "grad_norm": 0.4679219665710609, "learning_rate": 2.395303582421511e-06, "loss": 0.0817, "step": 3549 }, { "epoch": 3.9870840937807106, "grad_norm": 0.4461073700564401, "learning_rate": 2.390212122390323e-06, "loss": 0.0795, "step": 3550 }, { "epoch": 3.9882072160606485, "grad_norm": 0.44258640272166866, "learning_rate": 2.385125344745359e-06, "loss": 0.0764, "step": 3551 }, { "epoch": 3.989330338340587, "grad_norm": 0.4287951915573227, "learning_rate": 2.3800432526165683e-06, "loss": 0.0819, "step": 3552 }, { "epoch": 3.9904534606205253, "grad_norm": 0.4196233297951179, "learning_rate": 2.37496584913102e-06, "loss": 0.0768, "step": 3553 }, { "epoch": 3.9915765829004632, "grad_norm": 0.43264036028578096, "learning_rate": 2.369893137412893e-06, "loss": 0.0827, "step": 3554 }, { "epoch": 3.9926997051804016, "grad_norm": 0.44435177476439064, "learning_rate": 2.3648251205834827e-06, "loss": 0.0747, "step": 3555 }, { "epoch": 3.99382282746034, "grad_norm": 0.4207067931671747, "learning_rate": 2.3597618017611977e-06, "loss": 0.0734, "step": 3556 }, { "epoch": 3.994945949740278, "grad_norm": 0.42953451358834877, "learning_rate": 2.3547031840615532e-06, "loss": 0.0737, "step": 3557 }, { "epoch": 3.9960690720202163, "grad_norm": 0.45552658779976596, "learning_rate": 2.3496492705971753e-06, "loss": 0.0766, "step": 3558 }, { "epoch": 3.9971921943001543, "grad_norm": 0.4389923110529137, "learning_rate": 2.3446000644777856e-06, "loss": 0.0765, "step": 3559 }, { "epoch": 3.9983153165800926, "grad_norm": 0.44857662285405525, "learning_rate": 2.339555568810221e-06, "loss": 0.0774, "step": 3560 }, { "epoch": 3.999438438860031, "grad_norm": 0.4393713835259846, "learning_rate": 2.334515786698415e-06, "loss": 0.0767, "step": 3561 }, { "epoch": 4.000561561139969, "grad_norm": 0.7414061671456206, "learning_rate": 2.329480721243401e-06, "loss": 0.0959, "step": 3562 }, { "epoch": 4.001684683419907, "grad_norm": 0.35974986114637436, "learning_rate": 2.3244503755433077e-06, "loss": 0.0433, "step": 3563 }, { "epoch": 4.002807805699845, "grad_norm": 0.35204376720792546, "learning_rate": 2.3194247526933644e-06, "loss": 0.0427, "step": 3564 }, { "epoch": 4.003930927979784, "grad_norm": 0.30547691019965295, "learning_rate": 2.3144038557858915e-06, "loss": 0.037, "step": 3565 }, { "epoch": 4.005054050259722, "grad_norm": 0.3503089261671818, "learning_rate": 2.3093876879103027e-06, "loss": 0.045, "step": 3566 }, { "epoch": 4.00617717253966, "grad_norm": 0.28985834180869824, "learning_rate": 2.3043762521531e-06, "loss": 0.0356, "step": 3567 }, { "epoch": 4.007300294819599, "grad_norm": 0.3078892585372151, "learning_rate": 2.2993695515978767e-06, "loss": 0.041, "step": 3568 }, { "epoch": 4.008423417099537, "grad_norm": 0.2793695957969276, "learning_rate": 2.2943675893253094e-06, "loss": 0.035, "step": 3569 }, { "epoch": 4.009546539379475, "grad_norm": 0.3126139214392695, "learning_rate": 2.2893703684131608e-06, "loss": 0.0409, "step": 3570 }, { "epoch": 4.0106696616594135, "grad_norm": 0.32235788779278907, "learning_rate": 2.284377891936277e-06, "loss": 0.039, "step": 3571 }, { "epoch": 4.0117927839393515, "grad_norm": 0.3386774994355639, "learning_rate": 2.2793901629665847e-06, "loss": 0.0394, "step": 3572 }, { "epoch": 4.012915906219289, "grad_norm": 0.33966921939041494, "learning_rate": 2.2744071845730843e-06, "loss": 0.0378, "step": 3573 }, { "epoch": 4.014039028499228, "grad_norm": 0.3650876310831837, "learning_rate": 2.26942895982186e-06, "loss": 0.0352, "step": 3574 }, { "epoch": 4.015162150779166, "grad_norm": 0.39322271754253874, "learning_rate": 2.2644554917760674e-06, "loss": 0.0388, "step": 3575 }, { "epoch": 4.016285273059104, "grad_norm": 0.3603318333009823, "learning_rate": 2.2594867834959367e-06, "loss": 0.0349, "step": 3576 }, { "epoch": 4.017408395339043, "grad_norm": 0.39766487225833574, "learning_rate": 2.2545228380387706e-06, "loss": 0.0392, "step": 3577 }, { "epoch": 4.018531517618981, "grad_norm": 0.3879795346602286, "learning_rate": 2.2495636584589353e-06, "loss": 0.0344, "step": 3578 }, { "epoch": 4.019654639898919, "grad_norm": 0.38693724167691756, "learning_rate": 2.2446092478078706e-06, "loss": 0.0391, "step": 3579 }, { "epoch": 4.020777762178858, "grad_norm": 0.3794613149433384, "learning_rate": 2.2396596091340805e-06, "loss": 0.0354, "step": 3580 }, { "epoch": 4.021900884458796, "grad_norm": 0.3992223472216564, "learning_rate": 2.2347147454831306e-06, "loss": 0.0418, "step": 3581 }, { "epoch": 4.0230240067387335, "grad_norm": 0.3661442590112595, "learning_rate": 2.2297746598976545e-06, "loss": 0.0352, "step": 3582 }, { "epoch": 4.024147129018672, "grad_norm": 0.37467677700540847, "learning_rate": 2.2248393554173344e-06, "loss": 0.039, "step": 3583 }, { "epoch": 4.02527025129861, "grad_norm": 0.3601512962433673, "learning_rate": 2.219908835078921e-06, "loss": 0.0358, "step": 3584 }, { "epoch": 4.026393373578548, "grad_norm": 0.37668176536008646, "learning_rate": 2.2149831019162173e-06, "loss": 0.0418, "step": 3585 }, { "epoch": 4.027516495858487, "grad_norm": 0.3258768222612785, "learning_rate": 2.2100621589600813e-06, "loss": 0.0334, "step": 3586 }, { "epoch": 4.028639618138425, "grad_norm": 0.3370222447649643, "learning_rate": 2.205146009238426e-06, "loss": 0.0373, "step": 3587 }, { "epoch": 4.029762740418363, "grad_norm": 0.32311730754858564, "learning_rate": 2.2002346557762068e-06, "loss": 0.0344, "step": 3588 }, { "epoch": 4.030885862698301, "grad_norm": 0.3123818705115929, "learning_rate": 2.1953281015954364e-06, "loss": 0.0326, "step": 3589 }, { "epoch": 4.03200898497824, "grad_norm": 0.3241220437041912, "learning_rate": 2.190426349715171e-06, "loss": 0.0341, "step": 3590 }, { "epoch": 4.033132107258178, "grad_norm": 0.33250107457893724, "learning_rate": 2.185529403151514e-06, "loss": 0.0373, "step": 3591 }, { "epoch": 4.0342552295381156, "grad_norm": 0.3170426069163533, "learning_rate": 2.1806372649176124e-06, "loss": 0.0327, "step": 3592 }, { "epoch": 4.035378351818054, "grad_norm": 0.32095635332227473, "learning_rate": 2.175749938023647e-06, "loss": 0.036, "step": 3593 }, { "epoch": 4.036501474097992, "grad_norm": 0.3296174400793843, "learning_rate": 2.170867425476847e-06, "loss": 0.0345, "step": 3594 }, { "epoch": 4.03762459637793, "grad_norm": 0.32420977388665184, "learning_rate": 2.165989730281475e-06, "loss": 0.0358, "step": 3595 }, { "epoch": 4.038747718657869, "grad_norm": 0.32486539485385396, "learning_rate": 2.1611168554388353e-06, "loss": 0.0375, "step": 3596 }, { "epoch": 4.039870840937807, "grad_norm": 0.33856386057317117, "learning_rate": 2.156248803947254e-06, "loss": 0.0357, "step": 3597 }, { "epoch": 4.040993963217745, "grad_norm": 0.32586631725378973, "learning_rate": 2.1513855788021e-06, "loss": 0.0354, "step": 3598 }, { "epoch": 4.042117085497684, "grad_norm": 0.3308607327766132, "learning_rate": 2.14652718299577e-06, "loss": 0.0347, "step": 3599 }, { "epoch": 4.043240207777622, "grad_norm": 0.33048976251932244, "learning_rate": 2.141673619517687e-06, "loss": 0.0361, "step": 3600 }, { "epoch": 4.04436333005756, "grad_norm": 0.32515150803710624, "learning_rate": 2.1368248913543065e-06, "loss": 0.035, "step": 3601 }, { "epoch": 4.0454864523374985, "grad_norm": 0.3027989162513895, "learning_rate": 2.1319810014890972e-06, "loss": 0.0331, "step": 3602 }, { "epoch": 4.046609574617436, "grad_norm": 0.32565678045910307, "learning_rate": 2.127141952902563e-06, "loss": 0.0371, "step": 3603 }, { "epoch": 4.047732696897374, "grad_norm": 0.30745483238302934, "learning_rate": 2.12230774857222e-06, "loss": 0.0322, "step": 3604 }, { "epoch": 4.048855819177313, "grad_norm": 0.3418209827856129, "learning_rate": 2.1174783914726106e-06, "loss": 0.0357, "step": 3605 }, { "epoch": 4.049978941457251, "grad_norm": 0.34150940597932833, "learning_rate": 2.1126538845752918e-06, "loss": 0.0361, "step": 3606 }, { "epoch": 4.051102063737189, "grad_norm": 0.3319716006832906, "learning_rate": 2.107834230848833e-06, "loss": 0.0355, "step": 3607 }, { "epoch": 4.052225186017128, "grad_norm": 0.3256155221300326, "learning_rate": 2.1030194332588203e-06, "loss": 0.0341, "step": 3608 }, { "epoch": 4.053348308297066, "grad_norm": 0.3511915328594416, "learning_rate": 2.098209494767853e-06, "loss": 0.0357, "step": 3609 }, { "epoch": 4.054471430577004, "grad_norm": 0.3389534143051318, "learning_rate": 2.0934044183355384e-06, "loss": 0.0331, "step": 3610 }, { "epoch": 4.055594552856943, "grad_norm": 0.3311633004682248, "learning_rate": 2.088604206918494e-06, "loss": 0.0297, "step": 3611 }, { "epoch": 4.0567176751368805, "grad_norm": 0.37079247885517713, "learning_rate": 2.0838088634703412e-06, "loss": 0.0495, "step": 3612 }, { "epoch": 4.0578407974168185, "grad_norm": 0.3436254547868691, "learning_rate": 2.0790183909417096e-06, "loss": 0.0354, "step": 3613 }, { "epoch": 4.058963919696757, "grad_norm": 0.3537084660888286, "learning_rate": 2.0742327922802285e-06, "loss": 0.0364, "step": 3614 }, { "epoch": 4.060087041976695, "grad_norm": 0.33720406412003606, "learning_rate": 2.069452070430529e-06, "loss": 0.0357, "step": 3615 }, { "epoch": 4.061210164256633, "grad_norm": 0.361772020239582, "learning_rate": 2.0646762283342448e-06, "loss": 0.0406, "step": 3616 }, { "epoch": 4.062333286536572, "grad_norm": 0.3305905026607228, "learning_rate": 2.059905268929999e-06, "loss": 0.0291, "step": 3617 }, { "epoch": 4.06345640881651, "grad_norm": 0.35558319079029205, "learning_rate": 2.055139195153417e-06, "loss": 0.0314, "step": 3618 }, { "epoch": 4.064579531096448, "grad_norm": 0.347660893552983, "learning_rate": 2.0503780099371196e-06, "loss": 0.0357, "step": 3619 }, { "epoch": 4.065702653376387, "grad_norm": 0.35259174178160463, "learning_rate": 2.045621716210713e-06, "loss": 0.0314, "step": 3620 }, { "epoch": 4.066825775656325, "grad_norm": 0.32281631904843283, "learning_rate": 2.0408703169008015e-06, "loss": 0.033, "step": 3621 }, { "epoch": 4.067948897936263, "grad_norm": 0.34725153445115975, "learning_rate": 2.036123814930967e-06, "loss": 0.0375, "step": 3622 }, { "epoch": 4.069072020216201, "grad_norm": 0.35654253964781996, "learning_rate": 2.0313822132217887e-06, "loss": 0.0328, "step": 3623 }, { "epoch": 4.070195142496139, "grad_norm": 0.33133915489525906, "learning_rate": 2.0266455146908248e-06, "loss": 0.0345, "step": 3624 }, { "epoch": 4.071318264776077, "grad_norm": 0.36110178776774343, "learning_rate": 2.0219137222526188e-06, "loss": 0.0396, "step": 3625 }, { "epoch": 4.072441387056016, "grad_norm": 0.32077710176540647, "learning_rate": 2.0171868388186953e-06, "loss": 0.0358, "step": 3626 }, { "epoch": 4.073564509335954, "grad_norm": 0.3444116573250157, "learning_rate": 2.0124648672975567e-06, "loss": 0.0355, "step": 3627 }, { "epoch": 4.074687631615892, "grad_norm": 0.3515163459356766, "learning_rate": 2.007747810594682e-06, "loss": 0.0367, "step": 3628 }, { "epoch": 4.075810753895831, "grad_norm": 0.3450125011830339, "learning_rate": 2.003035671612532e-06, "loss": 0.036, "step": 3629 }, { "epoch": 4.076933876175769, "grad_norm": 0.3547038161654613, "learning_rate": 1.9983284532505343e-06, "loss": 0.0372, "step": 3630 }, { "epoch": 4.078056998455707, "grad_norm": 0.3417279392843428, "learning_rate": 1.9936261584050974e-06, "loss": 0.0374, "step": 3631 }, { "epoch": 4.0791801207356455, "grad_norm": 0.34281518207305883, "learning_rate": 1.9889287899695887e-06, "loss": 0.0371, "step": 3632 }, { "epoch": 4.0803032430155834, "grad_norm": 0.3272532226131082, "learning_rate": 1.9842363508343532e-06, "loss": 0.0344, "step": 3633 }, { "epoch": 4.081426365295521, "grad_norm": 0.3362181119467591, "learning_rate": 1.9795488438867005e-06, "loss": 0.0325, "step": 3634 }, { "epoch": 4.08254948757546, "grad_norm": 0.34910761973320087, "learning_rate": 1.974866272010908e-06, "loss": 0.0335, "step": 3635 }, { "epoch": 4.083672609855398, "grad_norm": 0.3730488558950869, "learning_rate": 1.9701886380882073e-06, "loss": 0.0402, "step": 3636 }, { "epoch": 4.084795732135336, "grad_norm": 0.3413923732565504, "learning_rate": 1.965515944996803e-06, "loss": 0.0361, "step": 3637 }, { "epoch": 4.085918854415274, "grad_norm": 0.33096031668167075, "learning_rate": 1.960848195611853e-06, "loss": 0.0329, "step": 3638 }, { "epoch": 4.087041976695213, "grad_norm": 0.34388985312179227, "learning_rate": 1.9561853928054753e-06, "loss": 0.0377, "step": 3639 }, { "epoch": 4.088165098975151, "grad_norm": 0.32146386739388727, "learning_rate": 1.9515275394467446e-06, "loss": 0.0349, "step": 3640 }, { "epoch": 4.089288221255089, "grad_norm": 0.33807086229307787, "learning_rate": 1.946874638401688e-06, "loss": 0.0336, "step": 3641 }, { "epoch": 4.0904113435350276, "grad_norm": 0.34620877675756956, "learning_rate": 1.9422266925332857e-06, "loss": 0.0383, "step": 3642 }, { "epoch": 4.0915344658149655, "grad_norm": 0.36100090203902546, "learning_rate": 1.9375837047014712e-06, "loss": 0.0393, "step": 3643 }, { "epoch": 4.092657588094903, "grad_norm": 0.34073891008207474, "learning_rate": 1.9329456777631273e-06, "loss": 0.0342, "step": 3644 }, { "epoch": 4.093780710374842, "grad_norm": 0.343909859895942, "learning_rate": 1.928312614572083e-06, "loss": 0.0346, "step": 3645 }, { "epoch": 4.09490383265478, "grad_norm": 0.3405997491217741, "learning_rate": 1.92368451797911e-06, "loss": 0.0333, "step": 3646 }, { "epoch": 4.096026954934718, "grad_norm": 0.3532535917803961, "learning_rate": 1.919061390831929e-06, "loss": 0.0371, "step": 3647 }, { "epoch": 4.097150077214657, "grad_norm": 0.3402279776506247, "learning_rate": 1.914443235975201e-06, "loss": 0.0345, "step": 3648 }, { "epoch": 4.098273199494595, "grad_norm": 0.33955904857288616, "learning_rate": 1.9098300562505266e-06, "loss": 0.0354, "step": 3649 }, { "epoch": 4.099396321774533, "grad_norm": 0.3548495987201884, "learning_rate": 1.9052218544964473e-06, "loss": 0.0378, "step": 3650 }, { "epoch": 4.100519444054472, "grad_norm": 0.3565039554766224, "learning_rate": 1.9006186335484422e-06, "loss": 0.0389, "step": 3651 }, { "epoch": 4.10164256633441, "grad_norm": 0.3176237453159487, "learning_rate": 1.89602039623892e-06, "loss": 0.0329, "step": 3652 }, { "epoch": 4.1027656886143475, "grad_norm": 0.34171584057174337, "learning_rate": 1.8914271453972277e-06, "loss": 0.0374, "step": 3653 }, { "epoch": 4.103888810894286, "grad_norm": 0.3573392279296198, "learning_rate": 1.8868388838496433e-06, "loss": 0.0341, "step": 3654 }, { "epoch": 4.105011933174224, "grad_norm": 0.3319986984776665, "learning_rate": 1.882255614419376e-06, "loss": 0.0343, "step": 3655 }, { "epoch": 4.106135055454162, "grad_norm": 0.33666673936394453, "learning_rate": 1.8776773399265601e-06, "loss": 0.0354, "step": 3656 }, { "epoch": 4.107258177734101, "grad_norm": 0.36043615873431406, "learning_rate": 1.8731040631882591e-06, "loss": 0.0354, "step": 3657 }, { "epoch": 4.108381300014039, "grad_norm": 0.3719854481200747, "learning_rate": 1.8685357870184605e-06, "loss": 0.0384, "step": 3658 }, { "epoch": 4.109504422293977, "grad_norm": 0.3351525535966926, "learning_rate": 1.8639725142280752e-06, "loss": 0.0336, "step": 3659 }, { "epoch": 4.110627544573916, "grad_norm": 0.33593296146741825, "learning_rate": 1.8594142476249365e-06, "loss": 0.0318, "step": 3660 }, { "epoch": 4.111750666853854, "grad_norm": 0.34070125655000394, "learning_rate": 1.8548609900137926e-06, "loss": 0.0381, "step": 3661 }, { "epoch": 4.112873789133792, "grad_norm": 0.3438466556073073, "learning_rate": 1.8503127441963153e-06, "loss": 0.0317, "step": 3662 }, { "epoch": 4.1139969114137305, "grad_norm": 0.3422040619381074, "learning_rate": 1.8457695129710885e-06, "loss": 0.0341, "step": 3663 }, { "epoch": 4.115120033693668, "grad_norm": 0.3336224223024909, "learning_rate": 1.8412312991336146e-06, "loss": 0.0335, "step": 3664 }, { "epoch": 4.116243155973606, "grad_norm": 0.3243595593137732, "learning_rate": 1.8366981054763077e-06, "loss": 0.034, "step": 3665 }, { "epoch": 4.117366278253545, "grad_norm": 0.3408130322069507, "learning_rate": 1.8321699347884869e-06, "loss": 0.0359, "step": 3666 }, { "epoch": 4.118489400533483, "grad_norm": 0.33585943316941114, "learning_rate": 1.8276467898563887e-06, "loss": 0.0369, "step": 3667 }, { "epoch": 4.119612522813421, "grad_norm": 0.3440889810814874, "learning_rate": 1.8231286734631526e-06, "loss": 0.0323, "step": 3668 }, { "epoch": 4.12073564509336, "grad_norm": 0.34877435967999315, "learning_rate": 1.818615588388829e-06, "loss": 0.0391, "step": 3669 }, { "epoch": 4.121858767373298, "grad_norm": 0.33501771510850326, "learning_rate": 1.8141075374103634e-06, "loss": 0.0334, "step": 3670 }, { "epoch": 4.122981889653236, "grad_norm": 0.3477079472833074, "learning_rate": 1.8096045233016123e-06, "loss": 0.0372, "step": 3671 }, { "epoch": 4.124105011933175, "grad_norm": 0.352773874107671, "learning_rate": 1.8051065488333285e-06, "loss": 0.039, "step": 3672 }, { "epoch": 4.1252281342131125, "grad_norm": 0.3249660660258163, "learning_rate": 1.8006136167731658e-06, "loss": 0.0325, "step": 3673 }, { "epoch": 4.1263512564930505, "grad_norm": 0.352952838092871, "learning_rate": 1.7961257298856783e-06, "loss": 0.0406, "step": 3674 }, { "epoch": 4.127474378772989, "grad_norm": 0.36490521962442574, "learning_rate": 1.7916428909323057e-06, "loss": 0.0374, "step": 3675 }, { "epoch": 4.128597501052927, "grad_norm": 0.33698323334336555, "learning_rate": 1.787165102671391e-06, "loss": 0.0336, "step": 3676 }, { "epoch": 4.129720623332865, "grad_norm": 0.32813513335813027, "learning_rate": 1.7826923678581664e-06, "loss": 0.034, "step": 3677 }, { "epoch": 4.130843745612804, "grad_norm": 0.3358051137115379, "learning_rate": 1.7782246892447564e-06, "loss": 0.0323, "step": 3678 }, { "epoch": 4.131966867892742, "grad_norm": 0.31172849246423406, "learning_rate": 1.7737620695801737e-06, "loss": 0.0306, "step": 3679 }, { "epoch": 4.13308999017268, "grad_norm": 0.3726210328067485, "learning_rate": 1.7693045116103125e-06, "loss": 0.0382, "step": 3680 }, { "epoch": 4.134213112452619, "grad_norm": 0.33518709615258124, "learning_rate": 1.7648520180779605e-06, "loss": 0.0319, "step": 3681 }, { "epoch": 4.135336234732557, "grad_norm": 0.37299298008128623, "learning_rate": 1.7604045917227852e-06, "loss": 0.0364, "step": 3682 }, { "epoch": 4.136459357012495, "grad_norm": 0.3471679342038857, "learning_rate": 1.7559622352813366e-06, "loss": 0.0346, "step": 3683 }, { "epoch": 4.137582479292433, "grad_norm": 0.36575324426037753, "learning_rate": 1.7515249514870504e-06, "loss": 0.036, "step": 3684 }, { "epoch": 4.138705601572371, "grad_norm": 0.33169330081699266, "learning_rate": 1.7470927430702277e-06, "loss": 0.0359, "step": 3685 }, { "epoch": 4.139828723852309, "grad_norm": 0.34483958264370257, "learning_rate": 1.7426656127580598e-06, "loss": 0.0341, "step": 3686 }, { "epoch": 4.140951846132248, "grad_norm": 0.3562439460847369, "learning_rate": 1.7382435632746086e-06, "loss": 0.0379, "step": 3687 }, { "epoch": 4.142074968412186, "grad_norm": 0.31582095689600725, "learning_rate": 1.7338265973408097e-06, "loss": 0.0314, "step": 3688 }, { "epoch": 4.143198090692124, "grad_norm": 0.3433619286521538, "learning_rate": 1.7294147176744725e-06, "loss": 0.0356, "step": 3689 }, { "epoch": 4.144321212972062, "grad_norm": 0.3489147400059873, "learning_rate": 1.7250079269902708e-06, "loss": 0.0367, "step": 3690 }, { "epoch": 4.145444335252001, "grad_norm": 0.3352194034975239, "learning_rate": 1.7206062279997538e-06, "loss": 0.0326, "step": 3691 }, { "epoch": 4.146567457531939, "grad_norm": 0.355514581203735, "learning_rate": 1.7162096234113358e-06, "loss": 0.0372, "step": 3692 }, { "epoch": 4.147690579811877, "grad_norm": 0.3359309769139185, "learning_rate": 1.7118181159302948e-06, "loss": 0.0353, "step": 3693 }, { "epoch": 4.148813702091815, "grad_norm": 0.3380299997227094, "learning_rate": 1.7074317082587755e-06, "loss": 0.0375, "step": 3694 }, { "epoch": 4.149936824371753, "grad_norm": 0.3179117693436562, "learning_rate": 1.703050403095783e-06, "loss": 0.0319, "step": 3695 }, { "epoch": 4.151059946651691, "grad_norm": 0.34267472556917217, "learning_rate": 1.6986742031371794e-06, "loss": 0.0375, "step": 3696 }, { "epoch": 4.15218306893163, "grad_norm": 0.3550491112940439, "learning_rate": 1.6943031110756902e-06, "loss": 0.0384, "step": 3697 }, { "epoch": 4.153306191211568, "grad_norm": 0.3441009447815663, "learning_rate": 1.689937129600897e-06, "loss": 0.036, "step": 3698 }, { "epoch": 4.154429313491506, "grad_norm": 0.35632790653268437, "learning_rate": 1.6855762613992367e-06, "loss": 0.0383, "step": 3699 }, { "epoch": 4.155552435771445, "grad_norm": 0.3380140910478162, "learning_rate": 1.6812205091539979e-06, "loss": 0.0366, "step": 3700 }, { "epoch": 4.156675558051383, "grad_norm": 0.36642723566904717, "learning_rate": 1.676869875545324e-06, "loss": 0.0425, "step": 3701 }, { "epoch": 4.157798680331321, "grad_norm": 0.3486015385918872, "learning_rate": 1.6725243632502074e-06, "loss": 0.0365, "step": 3702 }, { "epoch": 4.1589218026112595, "grad_norm": 0.34087715875690777, "learning_rate": 1.668183974942491e-06, "loss": 0.0354, "step": 3703 }, { "epoch": 4.1600449248911975, "grad_norm": 0.3301015274275597, "learning_rate": 1.6638487132928638e-06, "loss": 0.0337, "step": 3704 }, { "epoch": 4.161168047171135, "grad_norm": 0.32631689981447143, "learning_rate": 1.6595185809688564e-06, "loss": 0.0359, "step": 3705 }, { "epoch": 4.162291169451074, "grad_norm": 0.33677899223356966, "learning_rate": 1.6551935806348485e-06, "loss": 0.0352, "step": 3706 }, { "epoch": 4.163414291731012, "grad_norm": 0.344706832968584, "learning_rate": 1.6508737149520615e-06, "loss": 0.0344, "step": 3707 }, { "epoch": 4.16453741401095, "grad_norm": 0.3319801271011935, "learning_rate": 1.6465589865785581e-06, "loss": 0.0311, "step": 3708 }, { "epoch": 4.165660536290889, "grad_norm": 0.3469710548961725, "learning_rate": 1.6422493981692333e-06, "loss": 0.0343, "step": 3709 }, { "epoch": 4.166783658570827, "grad_norm": 0.3459491276034882, "learning_rate": 1.6379449523758262e-06, "loss": 0.0341, "step": 3710 }, { "epoch": 4.167906780850765, "grad_norm": 0.3664777228112351, "learning_rate": 1.6336456518469112e-06, "loss": 0.0398, "step": 3711 }, { "epoch": 4.169029903130704, "grad_norm": 0.3404376801657948, "learning_rate": 1.6293514992278935e-06, "loss": 0.0326, "step": 3712 }, { "epoch": 4.170153025410642, "grad_norm": 0.3568606518647232, "learning_rate": 1.6250624971610152e-06, "loss": 0.0331, "step": 3713 }, { "epoch": 4.1712761476905795, "grad_norm": 0.34901751847554274, "learning_rate": 1.6207786482853428e-06, "loss": 0.0376, "step": 3714 }, { "epoch": 4.172399269970518, "grad_norm": 0.3399558991517041, "learning_rate": 1.6164999552367767e-06, "loss": 0.0314, "step": 3715 }, { "epoch": 4.173522392250456, "grad_norm": 0.33061099011882433, "learning_rate": 1.6122264206480443e-06, "loss": 0.0329, "step": 3716 }, { "epoch": 4.174645514530394, "grad_norm": 0.3710949614187633, "learning_rate": 1.6079580471486988e-06, "loss": 0.033, "step": 3717 }, { "epoch": 4.175768636810333, "grad_norm": 0.3769411467365772, "learning_rate": 1.6036948373651195e-06, "loss": 0.037, "step": 3718 }, { "epoch": 4.176891759090271, "grad_norm": 0.3349294639560482, "learning_rate": 1.5994367939205012e-06, "loss": 0.0308, "step": 3719 }, { "epoch": 4.178014881370209, "grad_norm": 0.322230527234023, "learning_rate": 1.5951839194348684e-06, "loss": 0.033, "step": 3720 }, { "epoch": 4.179138003650148, "grad_norm": 0.3423969186367366, "learning_rate": 1.5909362165250609e-06, "loss": 0.0333, "step": 3721 }, { "epoch": 4.180261125930086, "grad_norm": 0.34016184747977096, "learning_rate": 1.5866936878047368e-06, "loss": 0.0329, "step": 3722 }, { "epoch": 4.181384248210024, "grad_norm": 0.3606283637266968, "learning_rate": 1.5824563358843725e-06, "loss": 0.0385, "step": 3723 }, { "epoch": 4.1825073704899625, "grad_norm": 0.36699375787006455, "learning_rate": 1.5782241633712536e-06, "loss": 0.0323, "step": 3724 }, { "epoch": 4.1836304927699, "grad_norm": 0.34062675166929457, "learning_rate": 1.5739971728694848e-06, "loss": 0.0323, "step": 3725 }, { "epoch": 4.184753615049838, "grad_norm": 0.3585907476688318, "learning_rate": 1.5697753669799788e-06, "loss": 0.0352, "step": 3726 }, { "epoch": 4.185876737329777, "grad_norm": 0.3627326102547834, "learning_rate": 1.5655587483004608e-06, "loss": 0.0352, "step": 3727 }, { "epoch": 4.186999859609715, "grad_norm": 0.3202203931611738, "learning_rate": 1.5613473194254636e-06, "loss": 0.0323, "step": 3728 }, { "epoch": 4.188122981889653, "grad_norm": 0.35906124847404014, "learning_rate": 1.5571410829463218e-06, "loss": 0.033, "step": 3729 }, { "epoch": 4.189246104169592, "grad_norm": 0.3407268596851762, "learning_rate": 1.5529400414511809e-06, "loss": 0.0363, "step": 3730 }, { "epoch": 4.19036922644953, "grad_norm": 0.3301264157569526, "learning_rate": 1.5487441975249885e-06, "loss": 0.0308, "step": 3731 }, { "epoch": 4.191492348729468, "grad_norm": 0.35754773559343533, "learning_rate": 1.5445535537494926e-06, "loss": 0.0341, "step": 3732 }, { "epoch": 4.192615471009407, "grad_norm": 0.3546319476031264, "learning_rate": 1.5403681127032466e-06, "loss": 0.0354, "step": 3733 }, { "epoch": 4.1937385932893445, "grad_norm": 0.3334688214498469, "learning_rate": 1.5361878769615913e-06, "loss": 0.0363, "step": 3734 }, { "epoch": 4.1948617155692824, "grad_norm": 0.359559039318054, "learning_rate": 1.5320128490966768e-06, "loss": 0.0402, "step": 3735 }, { "epoch": 4.19598483784922, "grad_norm": 0.3559528249651165, "learning_rate": 1.5278430316774406e-06, "loss": 0.0336, "step": 3736 }, { "epoch": 4.197107960129159, "grad_norm": 0.3389181856407088, "learning_rate": 1.5236784272696204e-06, "loss": 0.0363, "step": 3737 }, { "epoch": 4.198231082409097, "grad_norm": 0.33514425271731724, "learning_rate": 1.5195190384357405e-06, "loss": 0.0315, "step": 3738 }, { "epoch": 4.199354204689035, "grad_norm": 0.34519876807313443, "learning_rate": 1.5153648677351196e-06, "loss": 0.0355, "step": 3739 }, { "epoch": 4.200477326968974, "grad_norm": 0.32555795899970486, "learning_rate": 1.5112159177238683e-06, "loss": 0.0322, "step": 3740 }, { "epoch": 4.201600449248912, "grad_norm": 0.3499784570691095, "learning_rate": 1.5070721909548747e-06, "loss": 0.036, "step": 3741 }, { "epoch": 4.20272357152885, "grad_norm": 0.3454531769390773, "learning_rate": 1.5029336899778224e-06, "loss": 0.0351, "step": 3742 }, { "epoch": 4.203846693808789, "grad_norm": 0.33874193071072906, "learning_rate": 1.4988004173391769e-06, "loss": 0.0342, "step": 3743 }, { "epoch": 4.2049698160887266, "grad_norm": 0.3427966066704258, "learning_rate": 1.4946723755821858e-06, "loss": 0.0361, "step": 3744 }, { "epoch": 4.2060929383686645, "grad_norm": 0.33371125691338493, "learning_rate": 1.4905495672468784e-06, "loss": 0.0335, "step": 3745 }, { "epoch": 4.207216060648603, "grad_norm": 0.348963362736691, "learning_rate": 1.4864319948700656e-06, "loss": 0.0355, "step": 3746 }, { "epoch": 4.208339182928541, "grad_norm": 0.3440752736809589, "learning_rate": 1.4823196609853362e-06, "loss": 0.0306, "step": 3747 }, { "epoch": 4.209462305208479, "grad_norm": 0.3531490219546731, "learning_rate": 1.4782125681230497e-06, "loss": 0.0315, "step": 3748 }, { "epoch": 4.210585427488418, "grad_norm": 0.3507377236203947, "learning_rate": 1.4741107188103477e-06, "loss": 0.0337, "step": 3749 }, { "epoch": 4.211708549768356, "grad_norm": 0.33761169130704055, "learning_rate": 1.4700141155711433e-06, "loss": 0.0344, "step": 3750 }, { "epoch": 4.212831672048294, "grad_norm": 0.5661341702283312, "learning_rate": 1.465922760926123e-06, "loss": 0.0341, "step": 3751 }, { "epoch": 4.213954794328233, "grad_norm": 0.3575202836894078, "learning_rate": 1.4618366573927423e-06, "loss": 0.0367, "step": 3752 }, { "epoch": 4.215077916608171, "grad_norm": 0.3513549935655655, "learning_rate": 1.4577558074852228e-06, "loss": 0.0374, "step": 3753 }, { "epoch": 4.216201038888109, "grad_norm": 0.3315282047974937, "learning_rate": 1.453680213714559e-06, "loss": 0.0319, "step": 3754 }, { "epoch": 4.217324161168047, "grad_norm": 0.3718187160987202, "learning_rate": 1.449609878588506e-06, "loss": 0.0375, "step": 3755 }, { "epoch": 4.218447283447985, "grad_norm": 0.3375869813485876, "learning_rate": 1.4455448046115884e-06, "loss": 0.0336, "step": 3756 }, { "epoch": 4.219570405727923, "grad_norm": 0.34005174509994746, "learning_rate": 1.4414849942850927e-06, "loss": 0.0341, "step": 3757 }, { "epoch": 4.220693528007862, "grad_norm": 0.33034188119181235, "learning_rate": 1.4374304501070592e-06, "loss": 0.0329, "step": 3758 }, { "epoch": 4.2218166502878, "grad_norm": 0.3356259815303574, "learning_rate": 1.433381174572297e-06, "loss": 0.0342, "step": 3759 }, { "epoch": 4.222939772567738, "grad_norm": 0.3410011093554527, "learning_rate": 1.4293371701723701e-06, "loss": 0.0335, "step": 3760 }, { "epoch": 4.224062894847677, "grad_norm": 0.34434008829412244, "learning_rate": 1.425298439395597e-06, "loss": 0.0342, "step": 3761 }, { "epoch": 4.225186017127615, "grad_norm": 0.32274850049291476, "learning_rate": 1.4212649847270576e-06, "loss": 0.0344, "step": 3762 }, { "epoch": 4.226309139407553, "grad_norm": 0.36333470622076525, "learning_rate": 1.4172368086485755e-06, "loss": 0.0363, "step": 3763 }, { "epoch": 4.2274322616874915, "grad_norm": 0.3479702931501896, "learning_rate": 1.4132139136387334e-06, "loss": 0.0359, "step": 3764 }, { "epoch": 4.2285553839674295, "grad_norm": 0.33884738891643645, "learning_rate": 1.4091963021728639e-06, "loss": 0.0331, "step": 3765 }, { "epoch": 4.229678506247367, "grad_norm": 0.35343625411719487, "learning_rate": 1.4051839767230479e-06, "loss": 0.0352, "step": 3766 }, { "epoch": 4.230801628527306, "grad_norm": 0.3370621206104113, "learning_rate": 1.4011769397581143e-06, "loss": 0.035, "step": 3767 }, { "epoch": 4.231924750807244, "grad_norm": 0.3693570022771069, "learning_rate": 1.397175193743633e-06, "loss": 0.0379, "step": 3768 }, { "epoch": 4.233047873087182, "grad_norm": 0.34977327665429203, "learning_rate": 1.3931787411419252e-06, "loss": 0.0335, "step": 3769 }, { "epoch": 4.234170995367121, "grad_norm": 0.35935908738327177, "learning_rate": 1.3891875844120517e-06, "loss": 0.0354, "step": 3770 }, { "epoch": 4.235294117647059, "grad_norm": 0.33995180258642155, "learning_rate": 1.385201726009815e-06, "loss": 0.0319, "step": 3771 }, { "epoch": 4.236417239926997, "grad_norm": 0.33986459891753673, "learning_rate": 1.3812211683877608e-06, "loss": 0.0322, "step": 3772 }, { "epoch": 4.237540362206936, "grad_norm": 0.3418058114878745, "learning_rate": 1.3772459139951643e-06, "loss": 0.0351, "step": 3773 }, { "epoch": 4.238663484486874, "grad_norm": 0.3556017630987595, "learning_rate": 1.3732759652780458e-06, "loss": 0.0384, "step": 3774 }, { "epoch": 4.2397866067668115, "grad_norm": 0.3293972750031755, "learning_rate": 1.369311324679159e-06, "loss": 0.0337, "step": 3775 }, { "epoch": 4.24090972904675, "grad_norm": 0.3212509101961188, "learning_rate": 1.3653519946379912e-06, "loss": 0.0345, "step": 3776 }, { "epoch": 4.242032851326688, "grad_norm": 0.34182476674452744, "learning_rate": 1.3613979775907627e-06, "loss": 0.0366, "step": 3777 }, { "epoch": 4.243155973606626, "grad_norm": 0.3369835165724027, "learning_rate": 1.3574492759704194e-06, "loss": 0.0336, "step": 3778 }, { "epoch": 4.244279095886565, "grad_norm": 0.3377925126914695, "learning_rate": 1.3535058922066447e-06, "loss": 0.035, "step": 3779 }, { "epoch": 4.245402218166503, "grad_norm": 0.33513094535575855, "learning_rate": 1.349567828725844e-06, "loss": 0.033, "step": 3780 }, { "epoch": 4.246525340446441, "grad_norm": 0.3396136926844531, "learning_rate": 1.3456350879511526e-06, "loss": 0.0343, "step": 3781 }, { "epoch": 4.24764846272638, "grad_norm": 0.34445639039011566, "learning_rate": 1.3417076723024281e-06, "loss": 0.0337, "step": 3782 }, { "epoch": 4.248771585006318, "grad_norm": 0.3451407077304778, "learning_rate": 1.3377855841962528e-06, "loss": 0.0321, "step": 3783 }, { "epoch": 4.249894707286256, "grad_norm": 0.35973782888492206, "learning_rate": 1.333868826045932e-06, "loss": 0.0384, "step": 3784 }, { "epoch": 4.2510178295661945, "grad_norm": 0.351803523339287, "learning_rate": 1.3299574002614901e-06, "loss": 0.0355, "step": 3785 }, { "epoch": 4.252140951846132, "grad_norm": 0.34276230294757215, "learning_rate": 1.3260513092496674e-06, "loss": 0.0334, "step": 3786 }, { "epoch": 4.25326407412607, "grad_norm": 0.3690545859860941, "learning_rate": 1.322150555413927e-06, "loss": 0.0347, "step": 3787 }, { "epoch": 4.254387196406009, "grad_norm": 0.3630986947686924, "learning_rate": 1.3182551411544454e-06, "loss": 0.0359, "step": 3788 }, { "epoch": 4.255510318685947, "grad_norm": 0.3479212681472171, "learning_rate": 1.314365068868113e-06, "loss": 0.0313, "step": 3789 }, { "epoch": 4.256633440965885, "grad_norm": 0.34790711153559306, "learning_rate": 1.3104803409485357e-06, "loss": 0.0364, "step": 3790 }, { "epoch": 4.257756563245824, "grad_norm": 0.3306603188465356, "learning_rate": 1.3066009597860295e-06, "loss": 0.0356, "step": 3791 }, { "epoch": 4.258879685525762, "grad_norm": 0.34564641948043884, "learning_rate": 1.302726927767618e-06, "loss": 0.0357, "step": 3792 }, { "epoch": 4.2600028078057, "grad_norm": 0.3585307062453091, "learning_rate": 1.2988582472770372e-06, "loss": 0.0376, "step": 3793 }, { "epoch": 4.261125930085638, "grad_norm": 0.3297599087847977, "learning_rate": 1.2949949206947276e-06, "loss": 0.035, "step": 3794 }, { "epoch": 4.2622490523655765, "grad_norm": 0.3581042327920125, "learning_rate": 1.2911369503978389e-06, "loss": 0.0427, "step": 3795 }, { "epoch": 4.263372174645514, "grad_norm": 0.34205869714374937, "learning_rate": 1.287284338760222e-06, "loss": 0.0333, "step": 3796 }, { "epoch": 4.264495296925452, "grad_norm": 0.37851885751818304, "learning_rate": 1.2834370881524294e-06, "loss": 0.0344, "step": 3797 }, { "epoch": 4.265618419205391, "grad_norm": 0.3658512028956833, "learning_rate": 1.2795952009417178e-06, "loss": 0.0357, "step": 3798 }, { "epoch": 4.266741541485329, "grad_norm": 0.35155118073032193, "learning_rate": 1.275758679492043e-06, "loss": 0.0323, "step": 3799 }, { "epoch": 4.267864663765267, "grad_norm": 0.3634666090752805, "learning_rate": 1.2719275261640584e-06, "loss": 0.0368, "step": 3800 }, { "epoch": 4.268987786045206, "grad_norm": 0.3434528301477464, "learning_rate": 1.2681017433151166e-06, "loss": 0.0332, "step": 3801 }, { "epoch": 4.270110908325144, "grad_norm": 0.3551781416800868, "learning_rate": 1.264281333299261e-06, "loss": 0.0358, "step": 3802 }, { "epoch": 4.271234030605082, "grad_norm": 0.35818119334156057, "learning_rate": 1.2604662984672333e-06, "loss": 0.0358, "step": 3803 }, { "epoch": 4.272357152885021, "grad_norm": 0.37688093949994683, "learning_rate": 1.256656641166466e-06, "loss": 0.0389, "step": 3804 }, { "epoch": 4.2734802751649585, "grad_norm": 0.33982096154376207, "learning_rate": 1.252852363741084e-06, "loss": 0.0328, "step": 3805 }, { "epoch": 4.2746033974448965, "grad_norm": 0.33879902680647445, "learning_rate": 1.2490534685319022e-06, "loss": 0.034, "step": 3806 }, { "epoch": 4.275726519724835, "grad_norm": 0.341933822830432, "learning_rate": 1.2452599578764191e-06, "loss": 0.0359, "step": 3807 }, { "epoch": 4.276849642004773, "grad_norm": 0.35541574499799117, "learning_rate": 1.241471834108825e-06, "loss": 0.0375, "step": 3808 }, { "epoch": 4.277972764284711, "grad_norm": 0.33786723192624557, "learning_rate": 1.2376890995599955e-06, "loss": 0.0334, "step": 3809 }, { "epoch": 4.27909588656465, "grad_norm": 0.3586483875205426, "learning_rate": 1.2339117565574877e-06, "loss": 0.0344, "step": 3810 }, { "epoch": 4.280219008844588, "grad_norm": 0.33925970086487844, "learning_rate": 1.2301398074255444e-06, "loss": 0.0317, "step": 3811 }, { "epoch": 4.281342131124526, "grad_norm": 0.34451634085780747, "learning_rate": 1.2263732544850826e-06, "loss": 0.0324, "step": 3812 }, { "epoch": 4.282465253404465, "grad_norm": 0.3534467917065109, "learning_rate": 1.2226121000537082e-06, "loss": 0.0353, "step": 3813 }, { "epoch": 4.283588375684403, "grad_norm": 0.34295732835300846, "learning_rate": 1.2188563464456993e-06, "loss": 0.0344, "step": 3814 }, { "epoch": 4.284711497964341, "grad_norm": 0.3467884753627846, "learning_rate": 1.2151059959720136e-06, "loss": 0.0353, "step": 3815 }, { "epoch": 4.285834620244279, "grad_norm": 0.36125586893387135, "learning_rate": 1.2113610509402806e-06, "loss": 0.0352, "step": 3816 }, { "epoch": 4.286957742524217, "grad_norm": 0.3550225603303854, "learning_rate": 1.2076215136548076e-06, "loss": 0.0332, "step": 3817 }, { "epoch": 4.288080864804155, "grad_norm": 0.31427786748412406, "learning_rate": 1.2038873864165734e-06, "loss": 0.0297, "step": 3818 }, { "epoch": 4.289203987084094, "grad_norm": 0.3748675811463692, "learning_rate": 1.200158671523226e-06, "loss": 0.0397, "step": 3819 }, { "epoch": 4.290327109364032, "grad_norm": 0.33911019219908817, "learning_rate": 1.196435371269089e-06, "loss": 0.0335, "step": 3820 }, { "epoch": 4.29145023164397, "grad_norm": 0.3578468474443971, "learning_rate": 1.1927174879451442e-06, "loss": 0.0385, "step": 3821 }, { "epoch": 4.292573353923909, "grad_norm": 0.325444432765532, "learning_rate": 1.1890050238390493e-06, "loss": 0.033, "step": 3822 }, { "epoch": 4.293696476203847, "grad_norm": 0.38444358306981036, "learning_rate": 1.185297981235124e-06, "loss": 0.0414, "step": 3823 }, { "epoch": 4.294819598483785, "grad_norm": 0.31738949785516596, "learning_rate": 1.1815963624143522e-06, "loss": 0.0309, "step": 3824 }, { "epoch": 4.2959427207637235, "grad_norm": 0.3407919871654333, "learning_rate": 1.1779001696543802e-06, "loss": 0.0374, "step": 3825 }, { "epoch": 4.2970658430436615, "grad_norm": 0.33587739133134814, "learning_rate": 1.1742094052295172e-06, "loss": 0.031, "step": 3826 }, { "epoch": 4.298188965323599, "grad_norm": 0.36769806503943114, "learning_rate": 1.1705240714107301e-06, "loss": 0.0355, "step": 3827 }, { "epoch": 4.299312087603538, "grad_norm": 0.35570550048123933, "learning_rate": 1.1668441704656463e-06, "loss": 0.033, "step": 3828 }, { "epoch": 4.300435209883476, "grad_norm": 0.3535248321200532, "learning_rate": 1.1631697046585511e-06, "loss": 0.0345, "step": 3829 }, { "epoch": 4.301558332163414, "grad_norm": 0.34749247467018113, "learning_rate": 1.1595006762503791e-06, "loss": 0.0346, "step": 3830 }, { "epoch": 4.302681454443353, "grad_norm": 0.338129590666788, "learning_rate": 1.1558370874987268e-06, "loss": 0.0326, "step": 3831 }, { "epoch": 4.303804576723291, "grad_norm": 0.3494915631570644, "learning_rate": 1.1521789406578399e-06, "loss": 0.0351, "step": 3832 }, { "epoch": 4.304927699003229, "grad_norm": 0.3378744608121184, "learning_rate": 1.1485262379786166e-06, "loss": 0.0341, "step": 3833 }, { "epoch": 4.306050821283167, "grad_norm": 0.3419544672603244, "learning_rate": 1.1448789817086048e-06, "loss": 0.0344, "step": 3834 }, { "epoch": 4.307173943563106, "grad_norm": 0.3291924046121765, "learning_rate": 1.1412371740920036e-06, "loss": 0.0316, "step": 3835 }, { "epoch": 4.3082970658430435, "grad_norm": 0.35966027874793216, "learning_rate": 1.137600817369654e-06, "loss": 0.0362, "step": 3836 }, { "epoch": 4.309420188122981, "grad_norm": 0.3557579953353349, "learning_rate": 1.1339699137790483e-06, "loss": 0.0348, "step": 3837 }, { "epoch": 4.31054331040292, "grad_norm": 0.3437370526418855, "learning_rate": 1.1303444655543206e-06, "loss": 0.0351, "step": 3838 }, { "epoch": 4.311666432682858, "grad_norm": 0.3322992260093866, "learning_rate": 1.1267244749262496e-06, "loss": 0.0361, "step": 3839 }, { "epoch": 4.312789554962796, "grad_norm": 0.3506274769092776, "learning_rate": 1.123109944122256e-06, "loss": 0.0346, "step": 3840 }, { "epoch": 4.313912677242735, "grad_norm": 0.3470389405075643, "learning_rate": 1.1195008753663982e-06, "loss": 0.0328, "step": 3841 }, { "epoch": 4.315035799522673, "grad_norm": 0.3428900746970874, "learning_rate": 1.115897270879378e-06, "loss": 0.0358, "step": 3842 }, { "epoch": 4.316158921802611, "grad_norm": 0.3381168228309907, "learning_rate": 1.1122991328785315e-06, "loss": 0.0318, "step": 3843 }, { "epoch": 4.31728204408255, "grad_norm": 0.35370094895864085, "learning_rate": 1.1087064635778333e-06, "loss": 0.0364, "step": 3844 }, { "epoch": 4.318405166362488, "grad_norm": 0.34626426769086593, "learning_rate": 1.1051192651878938e-06, "loss": 0.0348, "step": 3845 }, { "epoch": 4.3195282886424256, "grad_norm": 0.34679291214864055, "learning_rate": 1.1015375399159533e-06, "loss": 0.0347, "step": 3846 }, { "epoch": 4.320651410922364, "grad_norm": 0.3267291332902231, "learning_rate": 1.0979612899658875e-06, "loss": 0.0312, "step": 3847 }, { "epoch": 4.321774533202302, "grad_norm": 0.33086846753238264, "learning_rate": 1.0943905175382018e-06, "loss": 0.0368, "step": 3848 }, { "epoch": 4.32289765548224, "grad_norm": 0.34911582216245135, "learning_rate": 1.0908252248300332e-06, "loss": 0.0357, "step": 3849 }, { "epoch": 4.324020777762179, "grad_norm": 0.330549307989673, "learning_rate": 1.0872654140351458e-06, "loss": 0.0328, "step": 3850 }, { "epoch": 4.325143900042117, "grad_norm": 0.3377978034641037, "learning_rate": 1.0837110873439282e-06, "loss": 0.0323, "step": 3851 }, { "epoch": 4.326267022322055, "grad_norm": 0.3641220302640594, "learning_rate": 1.080162246943398e-06, "loss": 0.0386, "step": 3852 }, { "epoch": 4.327390144601994, "grad_norm": 0.3597585412481393, "learning_rate": 1.0766188950171952e-06, "loss": 0.034, "step": 3853 }, { "epoch": 4.328513266881932, "grad_norm": 0.34322666886228304, "learning_rate": 1.0730810337455856e-06, "loss": 0.0321, "step": 3854 }, { "epoch": 4.32963638916187, "grad_norm": 0.3210457881854714, "learning_rate": 1.069548665305451e-06, "loss": 0.0295, "step": 3855 }, { "epoch": 4.3307595114418085, "grad_norm": 0.36948984897642523, "learning_rate": 1.0660217918702965e-06, "loss": 0.0361, "step": 3856 }, { "epoch": 4.331882633721746, "grad_norm": 0.4271058260932982, "learning_rate": 1.0625004156102492e-06, "loss": 0.0354, "step": 3857 }, { "epoch": 4.333005756001684, "grad_norm": 0.3549117455827815, "learning_rate": 1.0589845386920473e-06, "loss": 0.0387, "step": 3858 }, { "epoch": 4.334128878281623, "grad_norm": 0.35534169762226714, "learning_rate": 1.0554741632790532e-06, "loss": 0.0361, "step": 3859 }, { "epoch": 4.335252000561561, "grad_norm": 0.3399213392541908, "learning_rate": 1.051969291531234e-06, "loss": 0.0346, "step": 3860 }, { "epoch": 4.336375122841499, "grad_norm": 0.3706100498973919, "learning_rate": 1.0484699256051788e-06, "loss": 0.0353, "step": 3861 }, { "epoch": 4.337498245121438, "grad_norm": 0.3578491650225699, "learning_rate": 1.0449760676540844e-06, "loss": 0.0342, "step": 3862 }, { "epoch": 4.338621367401376, "grad_norm": 0.32948715354128266, "learning_rate": 1.0414877198277629e-06, "loss": 0.0329, "step": 3863 }, { "epoch": 4.339744489681314, "grad_norm": 0.3651556904940354, "learning_rate": 1.038004884272632e-06, "loss": 0.0348, "step": 3864 }, { "epoch": 4.340867611961253, "grad_norm": 0.3745318904885555, "learning_rate": 1.0345275631317165e-06, "loss": 0.0371, "step": 3865 }, { "epoch": 4.3419907342411905, "grad_norm": 0.3408489479974244, "learning_rate": 1.0310557585446523e-06, "loss": 0.0343, "step": 3866 }, { "epoch": 4.3431138565211285, "grad_norm": 0.3616137231183904, "learning_rate": 1.0275894726476787e-06, "loss": 0.0382, "step": 3867 }, { "epoch": 4.344236978801067, "grad_norm": 0.35337736664475866, "learning_rate": 1.0241287075736384e-06, "loss": 0.0327, "step": 3868 }, { "epoch": 4.345360101081005, "grad_norm": 0.34065649357736527, "learning_rate": 1.0206734654519802e-06, "loss": 0.0362, "step": 3869 }, { "epoch": 4.346483223360943, "grad_norm": 0.3677984704496751, "learning_rate": 1.0172237484087522e-06, "loss": 0.0383, "step": 3870 }, { "epoch": 4.347606345640882, "grad_norm": 0.3504661828261147, "learning_rate": 1.0137795585666023e-06, "loss": 0.0355, "step": 3871 }, { "epoch": 4.34872946792082, "grad_norm": 0.3547684524121742, "learning_rate": 1.0103408980447793e-06, "loss": 0.0358, "step": 3872 }, { "epoch": 4.349852590200758, "grad_norm": 0.33899684366263133, "learning_rate": 1.0069077689591279e-06, "loss": 0.0309, "step": 3873 }, { "epoch": 4.350975712480697, "grad_norm": 0.3498349592633247, "learning_rate": 1.0034801734220922e-06, "loss": 0.035, "step": 3874 }, { "epoch": 4.352098834760635, "grad_norm": 0.34975151345498384, "learning_rate": 1.0000581135427067e-06, "loss": 0.0365, "step": 3875 }, { "epoch": 4.353221957040573, "grad_norm": 0.3369442861209052, "learning_rate": 9.966415914266049e-07, "loss": 0.0349, "step": 3876 }, { "epoch": 4.354345079320511, "grad_norm": 0.36960971150127886, "learning_rate": 9.93230609176008e-07, "loss": 0.0359, "step": 3877 }, { "epoch": 4.355468201600449, "grad_norm": 0.3499527159771044, "learning_rate": 9.898251688897332e-07, "loss": 0.0357, "step": 3878 }, { "epoch": 4.356591323880387, "grad_norm": 0.3676928856512341, "learning_rate": 9.86425272663185e-07, "loss": 0.035, "step": 3879 }, { "epoch": 4.357714446160326, "grad_norm": 0.35146864143042605, "learning_rate": 9.830309225883562e-07, "loss": 0.0348, "step": 3880 }, { "epoch": 4.358837568440264, "grad_norm": 0.3506185390903599, "learning_rate": 9.796421207538265e-07, "loss": 0.0343, "step": 3881 }, { "epoch": 4.359960690720202, "grad_norm": 0.35319022540725176, "learning_rate": 9.762588692447661e-07, "loss": 0.0341, "step": 3882 }, { "epoch": 4.361083813000141, "grad_norm": 0.335262563237058, "learning_rate": 9.728811701429242e-07, "loss": 0.0345, "step": 3883 }, { "epoch": 4.362206935280079, "grad_norm": 0.35087168008448233, "learning_rate": 9.695090255266394e-07, "loss": 0.0356, "step": 3884 }, { "epoch": 4.363330057560017, "grad_norm": 0.32755003839475955, "learning_rate": 9.66142437470825e-07, "loss": 0.0306, "step": 3885 }, { "epoch": 4.3644531798399555, "grad_norm": 0.36618851591535595, "learning_rate": 9.627814080469822e-07, "loss": 0.0382, "step": 3886 }, { "epoch": 4.3655763021198934, "grad_norm": 0.3411679485731454, "learning_rate": 9.594259393231897e-07, "loss": 0.031, "step": 3887 }, { "epoch": 4.366699424399831, "grad_norm": 0.3488732598923187, "learning_rate": 9.56076033364105e-07, "loss": 0.0348, "step": 3888 }, { "epoch": 4.36782254667977, "grad_norm": 0.351431363191658, "learning_rate": 9.527316922309593e-07, "loss": 0.0358, "step": 3889 }, { "epoch": 4.368945668959708, "grad_norm": 0.36029895486382335, "learning_rate": 9.493929179815631e-07, "loss": 0.0417, "step": 3890 }, { "epoch": 4.370068791239646, "grad_norm": 0.33022530950278417, "learning_rate": 9.46059712670303e-07, "loss": 0.0335, "step": 3891 }, { "epoch": 4.371191913519585, "grad_norm": 0.31602307458227624, "learning_rate": 9.427320783481353e-07, "loss": 0.0296, "step": 3892 }, { "epoch": 4.372315035799523, "grad_norm": 0.357480507018708, "learning_rate": 9.394100170625931e-07, "loss": 0.0383, "step": 3893 }, { "epoch": 4.373438158079461, "grad_norm": 0.35949006197745925, "learning_rate": 9.360935308577723e-07, "loss": 0.0391, "step": 3894 }, { "epoch": 4.374561280359399, "grad_norm": 0.33189753767969865, "learning_rate": 9.327826217743452e-07, "loss": 0.0329, "step": 3895 }, { "epoch": 4.3756844026393376, "grad_norm": 0.3508226977683308, "learning_rate": 9.294772918495521e-07, "loss": 0.0414, "step": 3896 }, { "epoch": 4.3768075249192755, "grad_norm": 0.3643064354558861, "learning_rate": 9.26177543117197e-07, "loss": 0.0336, "step": 3897 }, { "epoch": 4.377930647199213, "grad_norm": 0.3413054142419292, "learning_rate": 9.228833776076551e-07, "loss": 0.0329, "step": 3898 }, { "epoch": 4.379053769479152, "grad_norm": 0.3346550439678259, "learning_rate": 9.195947973478592e-07, "loss": 0.033, "step": 3899 }, { "epoch": 4.38017689175909, "grad_norm": 0.3401382520992714, "learning_rate": 9.163118043613084e-07, "loss": 0.0339, "step": 3900 }, { "epoch": 4.381300014039028, "grad_norm": 0.37105306794446297, "learning_rate": 9.130344006680658e-07, "loss": 0.035, "step": 3901 }, { "epoch": 4.382423136318967, "grad_norm": 0.35559365927896963, "learning_rate": 9.09762588284755e-07, "loss": 0.0308, "step": 3902 }, { "epoch": 4.383546258598905, "grad_norm": 0.34778900056099654, "learning_rate": 9.064963692245588e-07, "loss": 0.0367, "step": 3903 }, { "epoch": 4.384669380878843, "grad_norm": 0.3431968229053822, "learning_rate": 9.03235745497213e-07, "loss": 0.0327, "step": 3904 }, { "epoch": 4.385792503158782, "grad_norm": 0.32014372957978515, "learning_rate": 8.999807191090193e-07, "loss": 0.0309, "step": 3905 }, { "epoch": 4.38691562543872, "grad_norm": 0.3299346644592865, "learning_rate": 8.967312920628312e-07, "loss": 0.0351, "step": 3906 }, { "epoch": 4.3880387477186575, "grad_norm": 0.34173628201013206, "learning_rate": 8.934874663580551e-07, "loss": 0.0337, "step": 3907 }, { "epoch": 4.389161869998596, "grad_norm": 0.34178452730956427, "learning_rate": 8.902492439906552e-07, "loss": 0.031, "step": 3908 }, { "epoch": 4.390284992278534, "grad_norm": 0.35561085437557965, "learning_rate": 8.870166269531421e-07, "loss": 0.0366, "step": 3909 }, { "epoch": 4.391408114558472, "grad_norm": 0.3607348657261128, "learning_rate": 8.837896172345827e-07, "loss": 0.0338, "step": 3910 }, { "epoch": 4.392531236838411, "grad_norm": 0.3535133481860961, "learning_rate": 8.805682168205909e-07, "loss": 0.0336, "step": 3911 }, { "epoch": 4.393654359118349, "grad_norm": 0.33450525221475136, "learning_rate": 8.773524276933299e-07, "loss": 0.0321, "step": 3912 }, { "epoch": 4.394777481398287, "grad_norm": 0.33740549152276617, "learning_rate": 8.741422518315113e-07, "loss": 0.0322, "step": 3913 }, { "epoch": 4.395900603678226, "grad_norm": 0.34668595275206976, "learning_rate": 8.709376912103895e-07, "loss": 0.0343, "step": 3914 }, { "epoch": 4.397023725958164, "grad_norm": 0.3720699327652378, "learning_rate": 8.677387478017673e-07, "loss": 0.0383, "step": 3915 }, { "epoch": 4.398146848238102, "grad_norm": 0.3310372019137898, "learning_rate": 8.645454235739903e-07, "loss": 0.0309, "step": 3916 }, { "epoch": 4.3992699705180405, "grad_norm": 0.3446218941497761, "learning_rate": 8.613577204919455e-07, "loss": 0.0336, "step": 3917 }, { "epoch": 4.400393092797978, "grad_norm": 0.3321270678180654, "learning_rate": 8.581756405170627e-07, "loss": 0.0326, "step": 3918 }, { "epoch": 4.401516215077916, "grad_norm": 0.3429813545514485, "learning_rate": 8.54999185607307e-07, "loss": 0.0337, "step": 3919 }, { "epoch": 4.402639337357855, "grad_norm": 0.329083241283035, "learning_rate": 8.518283577171894e-07, "loss": 0.032, "step": 3920 }, { "epoch": 4.403762459637793, "grad_norm": 0.35028547291655737, "learning_rate": 8.486631587977545e-07, "loss": 0.0331, "step": 3921 }, { "epoch": 4.404885581917731, "grad_norm": 0.36812246717749864, "learning_rate": 8.455035907965837e-07, "loss": 0.0322, "step": 3922 }, { "epoch": 4.40600870419767, "grad_norm": 0.3354095575220094, "learning_rate": 8.423496556577959e-07, "loss": 0.0315, "step": 3923 }, { "epoch": 4.407131826477608, "grad_norm": 0.36423209726760586, "learning_rate": 8.392013553220391e-07, "loss": 0.0363, "step": 3924 }, { "epoch": 4.408254948757546, "grad_norm": 0.33578769827695937, "learning_rate": 8.360586917264979e-07, "loss": 0.0327, "step": 3925 }, { "epoch": 4.409378071037485, "grad_norm": 0.35083436656255, "learning_rate": 8.329216668048878e-07, "loss": 0.0337, "step": 3926 }, { "epoch": 4.4105011933174225, "grad_norm": 0.352145989291215, "learning_rate": 8.297902824874582e-07, "loss": 0.0362, "step": 3927 }, { "epoch": 4.4116243155973605, "grad_norm": 0.35269237221340394, "learning_rate": 8.266645407009788e-07, "loss": 0.0326, "step": 3928 }, { "epoch": 4.412747437877299, "grad_norm": 0.34801427609667296, "learning_rate": 8.235444433687556e-07, "loss": 0.0366, "step": 3929 }, { "epoch": 4.413870560157237, "grad_norm": 0.3399050508413452, "learning_rate": 8.204299924106196e-07, "loss": 0.0347, "step": 3930 }, { "epoch": 4.414993682437175, "grad_norm": 0.3487557779248968, "learning_rate": 8.173211897429245e-07, "loss": 0.0342, "step": 3931 }, { "epoch": 4.416116804717114, "grad_norm": 0.3667385663949515, "learning_rate": 8.142180372785547e-07, "loss": 0.0359, "step": 3932 }, { "epoch": 4.417239926997052, "grad_norm": 0.3333295175583706, "learning_rate": 8.111205369269104e-07, "loss": 0.0308, "step": 3933 }, { "epoch": 4.41836304927699, "grad_norm": 0.33057100673502793, "learning_rate": 8.080286905939172e-07, "loss": 0.0322, "step": 3934 }, { "epoch": 4.419486171556928, "grad_norm": 0.3420534039639189, "learning_rate": 8.049425001820255e-07, "loss": 0.0374, "step": 3935 }, { "epoch": 4.420609293836867, "grad_norm": 0.33462359777080614, "learning_rate": 8.018619675901995e-07, "loss": 0.034, "step": 3936 }, { "epoch": 4.421732416116805, "grad_norm": 0.3634343403035784, "learning_rate": 7.987870947139276e-07, "loss": 0.032, "step": 3937 }, { "epoch": 4.4228555383967425, "grad_norm": 0.3732288458675281, "learning_rate": 7.957178834452095e-07, "loss": 0.036, "step": 3938 }, { "epoch": 4.423978660676681, "grad_norm": 0.3481104484204922, "learning_rate": 7.926543356725658e-07, "loss": 0.0315, "step": 3939 }, { "epoch": 4.425101782956619, "grad_norm": 0.3400656410063691, "learning_rate": 7.895964532810318e-07, "loss": 0.0331, "step": 3940 }, { "epoch": 4.426224905236557, "grad_norm": 0.32595754035332963, "learning_rate": 7.86544238152157e-07, "loss": 0.0327, "step": 3941 }, { "epoch": 4.427348027516496, "grad_norm": 0.35250908793584956, "learning_rate": 7.834976921640025e-07, "loss": 0.0357, "step": 3942 }, { "epoch": 4.428471149796434, "grad_norm": 0.3520988496330288, "learning_rate": 7.804568171911398e-07, "loss": 0.0379, "step": 3943 }, { "epoch": 4.429594272076372, "grad_norm": 0.36252580809370655, "learning_rate": 7.774216151046543e-07, "loss": 0.034, "step": 3944 }, { "epoch": 4.430717394356311, "grad_norm": 0.34372024882244934, "learning_rate": 7.743920877721378e-07, "loss": 0.0337, "step": 3945 }, { "epoch": 4.431840516636249, "grad_norm": 0.3573305329011033, "learning_rate": 7.713682370576947e-07, "loss": 0.034, "step": 3946 }, { "epoch": 4.432963638916187, "grad_norm": 0.35271907669417196, "learning_rate": 7.683500648219322e-07, "loss": 0.0366, "step": 3947 }, { "epoch": 4.434086761196125, "grad_norm": 0.3387988216872993, "learning_rate": 7.653375729219636e-07, "loss": 0.0327, "step": 3948 }, { "epoch": 4.435209883476063, "grad_norm": 0.34034343693680486, "learning_rate": 7.623307632114085e-07, "loss": 0.0345, "step": 3949 }, { "epoch": 4.436333005756001, "grad_norm": 0.3391577418176705, "learning_rate": 7.593296375403914e-07, "loss": 0.034, "step": 3950 }, { "epoch": 4.43745612803594, "grad_norm": 0.34486562074073207, "learning_rate": 7.563341977555372e-07, "loss": 0.0345, "step": 3951 }, { "epoch": 4.438579250315878, "grad_norm": 0.3412224433899792, "learning_rate": 7.533444456999728e-07, "loss": 0.0337, "step": 3952 }, { "epoch": 4.439702372595816, "grad_norm": 0.3572984260577703, "learning_rate": 7.503603832133277e-07, "loss": 0.0362, "step": 3953 }, { "epoch": 4.440825494875755, "grad_norm": 0.3378929389056555, "learning_rate": 7.473820121317243e-07, "loss": 0.0335, "step": 3954 }, { "epoch": 4.441948617155693, "grad_norm": 0.3586567601428513, "learning_rate": 7.4440933428779e-07, "loss": 0.0357, "step": 3955 }, { "epoch": 4.443071739435631, "grad_norm": 0.35329443808804784, "learning_rate": 7.41442351510645e-07, "loss": 0.0317, "step": 3956 }, { "epoch": 4.4441948617155695, "grad_norm": 0.3554120702375195, "learning_rate": 7.384810656259078e-07, "loss": 0.0351, "step": 3957 }, { "epoch": 4.4453179839955075, "grad_norm": 0.32801148615792985, "learning_rate": 7.355254784556887e-07, "loss": 0.0323, "step": 3958 }, { "epoch": 4.446441106275445, "grad_norm": 0.35260937314651863, "learning_rate": 7.325755918185928e-07, "loss": 0.0361, "step": 3959 }, { "epoch": 4.447564228555384, "grad_norm": 0.35121127734872754, "learning_rate": 7.296314075297196e-07, "loss": 0.0338, "step": 3960 }, { "epoch": 4.448687350835322, "grad_norm": 0.3538367483299584, "learning_rate": 7.266929274006595e-07, "loss": 0.0363, "step": 3961 }, { "epoch": 4.44981047311526, "grad_norm": 0.3518231420814252, "learning_rate": 7.237601532394866e-07, "loss": 0.0393, "step": 3962 }, { "epoch": 4.450933595395199, "grad_norm": 0.35867753461888296, "learning_rate": 7.208330868507718e-07, "loss": 0.0355, "step": 3963 }, { "epoch": 4.452056717675137, "grad_norm": 0.3399901393948421, "learning_rate": 7.17911730035572e-07, "loss": 0.0323, "step": 3964 }, { "epoch": 4.453179839955075, "grad_norm": 0.36423866949072375, "learning_rate": 7.149960845914294e-07, "loss": 0.0374, "step": 3965 }, { "epoch": 4.454302962235014, "grad_norm": 0.3566191322953024, "learning_rate": 7.120861523123735e-07, "loss": 0.0362, "step": 3966 }, { "epoch": 4.455426084514952, "grad_norm": 0.35784219880826335, "learning_rate": 7.091819349889162e-07, "loss": 0.0362, "step": 3967 }, { "epoch": 4.4565492067948895, "grad_norm": 0.3382634880819678, "learning_rate": 7.062834344080549e-07, "loss": 0.0315, "step": 3968 }, { "epoch": 4.457672329074828, "grad_norm": 0.3569943519463488, "learning_rate": 7.033906523532697e-07, "loss": 0.0349, "step": 3969 }, { "epoch": 4.458795451354766, "grad_norm": 0.3650433682446708, "learning_rate": 7.005035906045199e-07, "loss": 0.0346, "step": 3970 }, { "epoch": 4.459918573634704, "grad_norm": 0.33813321468776913, "learning_rate": 6.976222509382491e-07, "loss": 0.0317, "step": 3971 }, { "epoch": 4.461041695914643, "grad_norm": 0.3619159838292503, "learning_rate": 6.947466351273735e-07, "loss": 0.0359, "step": 3972 }, { "epoch": 4.462164818194581, "grad_norm": 0.3763835667376468, "learning_rate": 6.918767449412933e-07, "loss": 0.0361, "step": 3973 }, { "epoch": 4.463287940474519, "grad_norm": 0.3436876392723535, "learning_rate": 6.890125821458826e-07, "loss": 0.0336, "step": 3974 }, { "epoch": 4.464411062754458, "grad_norm": 0.32509347109822845, "learning_rate": 6.86154148503494e-07, "loss": 0.0308, "step": 3975 }, { "epoch": 4.465534185034396, "grad_norm": 0.3525907199308526, "learning_rate": 6.833014457729525e-07, "loss": 0.0356, "step": 3976 }, { "epoch": 4.466657307314334, "grad_norm": 0.35208517679189183, "learning_rate": 6.804544757095566e-07, "loss": 0.0328, "step": 3977 }, { "epoch": 4.4677804295942725, "grad_norm": 0.3599858916473201, "learning_rate": 6.776132400650781e-07, "loss": 0.0352, "step": 3978 }, { "epoch": 4.46890355187421, "grad_norm": 0.3641931626113949, "learning_rate": 6.747777405877609e-07, "loss": 0.0393, "step": 3979 }, { "epoch": 4.470026674154148, "grad_norm": 0.34746423533313237, "learning_rate": 6.719479790223204e-07, "loss": 0.0364, "step": 3980 }, { "epoch": 4.471149796434087, "grad_norm": 0.3564297638222409, "learning_rate": 6.691239571099395e-07, "loss": 0.0365, "step": 3981 }, { "epoch": 4.472272918714025, "grad_norm": 0.34573485421602407, "learning_rate": 6.663056765882692e-07, "loss": 0.0329, "step": 3982 }, { "epoch": 4.473396040993963, "grad_norm": 0.3597984417319236, "learning_rate": 6.634931391914278e-07, "loss": 0.037, "step": 3983 }, { "epoch": 4.474519163273902, "grad_norm": 0.3378924186798682, "learning_rate": 6.606863466500013e-07, "loss": 0.0343, "step": 3984 }, { "epoch": 4.47564228555384, "grad_norm": 0.34863970957372714, "learning_rate": 6.578853006910402e-07, "loss": 0.0343, "step": 3985 }, { "epoch": 4.476765407833778, "grad_norm": 0.33950262649507096, "learning_rate": 6.550900030380614e-07, "loss": 0.0378, "step": 3986 }, { "epoch": 4.477888530113717, "grad_norm": 0.35574459663793684, "learning_rate": 6.523004554110379e-07, "loss": 0.0351, "step": 3987 }, { "epoch": 4.4790116523936545, "grad_norm": 0.32764803743478366, "learning_rate": 6.495166595264102e-07, "loss": 0.0318, "step": 3988 }, { "epoch": 4.4801347746735924, "grad_norm": 0.3648664364754083, "learning_rate": 6.467386170970802e-07, "loss": 0.0359, "step": 3989 }, { "epoch": 4.481257896953531, "grad_norm": 0.33735699056875545, "learning_rate": 6.439663298324061e-07, "loss": 0.0307, "step": 3990 }, { "epoch": 4.482381019233469, "grad_norm": 0.36038648251848654, "learning_rate": 6.411997994382102e-07, "loss": 0.0366, "step": 3991 }, { "epoch": 4.483504141513407, "grad_norm": 0.3582401185410582, "learning_rate": 6.384390276167651e-07, "loss": 0.0373, "step": 3992 }, { "epoch": 4.484627263793345, "grad_norm": 0.3514649132145566, "learning_rate": 6.356840160668054e-07, "loss": 0.0351, "step": 3993 }, { "epoch": 4.485750386073284, "grad_norm": 0.3316545137948871, "learning_rate": 6.329347664835206e-07, "loss": 0.0335, "step": 3994 }, { "epoch": 4.486873508353222, "grad_norm": 0.38390972496268505, "learning_rate": 6.30191280558553e-07, "loss": 0.033, "step": 3995 }, { "epoch": 4.48799663063316, "grad_norm": 0.34745468335455126, "learning_rate": 6.274535599800014e-07, "loss": 0.0339, "step": 3996 }, { "epoch": 4.489119752913099, "grad_norm": 0.3469881466439082, "learning_rate": 6.247216064324158e-07, "loss": 0.0348, "step": 3997 }, { "epoch": 4.4902428751930366, "grad_norm": 0.33480434506463175, "learning_rate": 6.219954215967949e-07, "loss": 0.0324, "step": 3998 }, { "epoch": 4.4913659974729745, "grad_norm": 0.33430612710636604, "learning_rate": 6.192750071505904e-07, "loss": 0.0327, "step": 3999 }, { "epoch": 4.492489119752913, "grad_norm": 0.37230150478039675, "learning_rate": 6.165603647677054e-07, "loss": 0.0343, "step": 4000 }, { "epoch": 4.493612242032851, "grad_norm": 0.3715774781533714, "learning_rate": 6.138514961184872e-07, "loss": 0.0388, "step": 4001 }, { "epoch": 4.494735364312789, "grad_norm": 0.34588801151813375, "learning_rate": 6.111484028697334e-07, "loss": 0.0357, "step": 4002 }, { "epoch": 4.495858486592728, "grad_norm": 0.3473978219479411, "learning_rate": 6.084510866846882e-07, "loss": 0.0337, "step": 4003 }, { "epoch": 4.496981608872666, "grad_norm": 0.3558220793979365, "learning_rate": 6.057595492230372e-07, "loss": 0.035, "step": 4004 }, { "epoch": 4.498104731152604, "grad_norm": 0.3617212667424299, "learning_rate": 6.030737921409169e-07, "loss": 0.0366, "step": 4005 }, { "epoch": 4.499227853432543, "grad_norm": 0.372375829726114, "learning_rate": 6.003938170908985e-07, "loss": 0.0372, "step": 4006 }, { "epoch": 4.500350975712481, "grad_norm": 0.33813041058043286, "learning_rate": 5.97719625722003e-07, "loss": 0.0339, "step": 4007 }, { "epoch": 4.501474097992419, "grad_norm": 0.3336235673204082, "learning_rate": 5.950512196796898e-07, "loss": 0.0337, "step": 4008 }, { "epoch": 4.502597220272357, "grad_norm": 0.3582259509306251, "learning_rate": 5.923886006058566e-07, "loss": 0.035, "step": 4009 }, { "epoch": 4.503720342552295, "grad_norm": 0.34292021535584205, "learning_rate": 5.897317701388461e-07, "loss": 0.0325, "step": 4010 }, { "epoch": 4.504843464832233, "grad_norm": 0.3345731631163473, "learning_rate": 5.870807299134307e-07, "loss": 0.0337, "step": 4011 }, { "epoch": 4.505966587112172, "grad_norm": 0.32035494944471327, "learning_rate": 5.844354815608267e-07, "loss": 0.0304, "step": 4012 }, { "epoch": 4.50708970939211, "grad_norm": 0.3447654015600138, "learning_rate": 5.817960267086853e-07, "loss": 0.0329, "step": 4013 }, { "epoch": 4.508212831672048, "grad_norm": 0.34544101773876196, "learning_rate": 5.791623669810908e-07, "loss": 0.0339, "step": 4014 }, { "epoch": 4.509335953951987, "grad_norm": 0.3728532295897952, "learning_rate": 5.765345039985648e-07, "loss": 0.0343, "step": 4015 }, { "epoch": 4.510459076231925, "grad_norm": 0.35317009146829786, "learning_rate": 5.739124393780571e-07, "loss": 0.0335, "step": 4016 }, { "epoch": 4.511582198511863, "grad_norm": 0.3545765005968472, "learning_rate": 5.71296174732956e-07, "loss": 0.0345, "step": 4017 }, { "epoch": 4.5127053207918015, "grad_norm": 0.342843964056811, "learning_rate": 5.68685711673076e-07, "loss": 0.0323, "step": 4018 }, { "epoch": 4.5138284430717395, "grad_norm": 0.3639774486483642, "learning_rate": 5.660810518046644e-07, "loss": 0.0355, "step": 4019 }, { "epoch": 4.514951565351677, "grad_norm": 0.3332083193281611, "learning_rate": 5.634821967303994e-07, "loss": 0.0337, "step": 4020 }, { "epoch": 4.516074687631616, "grad_norm": 0.34776766829793077, "learning_rate": 5.608891480493816e-07, "loss": 0.0351, "step": 4021 }, { "epoch": 4.517197809911554, "grad_norm": 0.3483906727553061, "learning_rate": 5.583019073571427e-07, "loss": 0.034, "step": 4022 }, { "epoch": 4.518320932191492, "grad_norm": 0.35109229875700276, "learning_rate": 5.557204762456425e-07, "loss": 0.0326, "step": 4023 }, { "epoch": 4.519444054471431, "grad_norm": 0.32870715662554173, "learning_rate": 5.531448563032626e-07, "loss": 0.035, "step": 4024 }, { "epoch": 4.520567176751369, "grad_norm": 0.35831485527315554, "learning_rate": 5.505750491148121e-07, "loss": 0.0371, "step": 4025 }, { "epoch": 4.521690299031307, "grad_norm": 0.3717078413941628, "learning_rate": 5.480110562615182e-07, "loss": 0.0401, "step": 4026 }, { "epoch": 4.522813421311246, "grad_norm": 0.34683971898871724, "learning_rate": 5.454528793210356e-07, "loss": 0.0308, "step": 4027 }, { "epoch": 4.523936543591184, "grad_norm": 0.33865977054207247, "learning_rate": 5.429005198674398e-07, "loss": 0.0334, "step": 4028 }, { "epoch": 4.5250596658711215, "grad_norm": 0.35261816775158816, "learning_rate": 5.403539794712243e-07, "loss": 0.036, "step": 4029 }, { "epoch": 4.5261827881510595, "grad_norm": 0.34517842856104286, "learning_rate": 5.378132596993047e-07, "loss": 0.0375, "step": 4030 }, { "epoch": 4.527305910430998, "grad_norm": 0.35750348534406, "learning_rate": 5.352783621150126e-07, "loss": 0.0355, "step": 4031 }, { "epoch": 4.528429032710936, "grad_norm": 0.3454480091845214, "learning_rate": 5.327492882780993e-07, "loss": 0.0356, "step": 4032 }, { "epoch": 4.529552154990874, "grad_norm": 0.32573268708480096, "learning_rate": 5.3022603974473e-07, "loss": 0.0308, "step": 4033 }, { "epoch": 4.530675277270813, "grad_norm": 0.34576124173105804, "learning_rate": 5.277086180674906e-07, "loss": 0.0336, "step": 4034 }, { "epoch": 4.531798399550751, "grad_norm": 0.34623159718083096, "learning_rate": 5.251970247953752e-07, "loss": 0.036, "step": 4035 }, { "epoch": 4.532921521830689, "grad_norm": 0.35168976903132443, "learning_rate": 5.226912614737956e-07, "loss": 0.0326, "step": 4036 }, { "epoch": 4.534044644110628, "grad_norm": 0.5382472283002754, "learning_rate": 5.20191329644577e-07, "loss": 0.0353, "step": 4037 }, { "epoch": 4.535167766390566, "grad_norm": 0.37754077895496924, "learning_rate": 5.176972308459527e-07, "loss": 0.0365, "step": 4038 }, { "epoch": 4.536290888670504, "grad_norm": 0.37416484246623505, "learning_rate": 5.152089666125704e-07, "loss": 0.0394, "step": 4039 }, { "epoch": 4.537414010950442, "grad_norm": 0.3590048250258893, "learning_rate": 5.127265384754865e-07, "loss": 0.0366, "step": 4040 }, { "epoch": 4.53853713323038, "grad_norm": 0.3342358936092815, "learning_rate": 5.102499479621658e-07, "loss": 0.0335, "step": 4041 }, { "epoch": 4.539660255510318, "grad_norm": 0.3493925300990296, "learning_rate": 5.07779196596484e-07, "loss": 0.0339, "step": 4042 }, { "epoch": 4.540783377790257, "grad_norm": 0.33636877105880414, "learning_rate": 5.053142858987192e-07, "loss": 0.0317, "step": 4043 }, { "epoch": 4.541906500070195, "grad_norm": 0.34263176501366066, "learning_rate": 5.028552173855572e-07, "loss": 0.0336, "step": 4044 }, { "epoch": 4.543029622350133, "grad_norm": 0.37257701817237154, "learning_rate": 5.004019925700921e-07, "loss": 0.0392, "step": 4045 }, { "epoch": 4.544152744630072, "grad_norm": 0.3369340532849318, "learning_rate": 4.979546129618184e-07, "loss": 0.0325, "step": 4046 }, { "epoch": 4.54527586691001, "grad_norm": 0.33726138907008607, "learning_rate": 4.955130800666374e-07, "loss": 0.0329, "step": 4047 }, { "epoch": 4.546398989189948, "grad_norm": 0.38713309676317126, "learning_rate": 4.930773953868506e-07, "loss": 0.0378, "step": 4048 }, { "epoch": 4.5475221114698865, "grad_norm": 0.36438821244482394, "learning_rate": 4.906475604211624e-07, "loss": 0.0341, "step": 4049 }, { "epoch": 4.548645233749824, "grad_norm": 0.33904817801576353, "learning_rate": 4.882235766646748e-07, "loss": 0.0356, "step": 4050 }, { "epoch": 4.549768356029762, "grad_norm": 0.35475025219773854, "learning_rate": 4.858054456088923e-07, "loss": 0.0338, "step": 4051 }, { "epoch": 4.550891478309701, "grad_norm": 0.3458201973740821, "learning_rate": 4.833931687417182e-07, "loss": 0.0345, "step": 4052 }, { "epoch": 4.552014600589639, "grad_norm": 0.36111440231472997, "learning_rate": 4.809867475474539e-07, "loss": 0.0355, "step": 4053 }, { "epoch": 4.553137722869577, "grad_norm": 0.3535234629211531, "learning_rate": 4.785861835067962e-07, "loss": 0.0343, "step": 4054 }, { "epoch": 4.554260845149516, "grad_norm": 0.3528696097512272, "learning_rate": 4.761914780968369e-07, "loss": 0.0333, "step": 4055 }, { "epoch": 4.555383967429454, "grad_norm": 0.34838927834260514, "learning_rate": 4.738026327910661e-07, "loss": 0.0358, "step": 4056 }, { "epoch": 4.556507089709392, "grad_norm": 0.3431964796657279, "learning_rate": 4.7141964905936697e-07, "loss": 0.0338, "step": 4057 }, { "epoch": 4.557630211989331, "grad_norm": 0.349604850461702, "learning_rate": 4.6904252836801446e-07, "loss": 0.0351, "step": 4058 }, { "epoch": 4.5587533342692685, "grad_norm": 0.3284644735862563, "learning_rate": 4.6667127217967844e-07, "loss": 0.0312, "step": 4059 }, { "epoch": 4.5598764565492065, "grad_norm": 0.3381032631698376, "learning_rate": 4.6430588195341853e-07, "loss": 0.0347, "step": 4060 }, { "epoch": 4.560999578829145, "grad_norm": 0.3259962293243492, "learning_rate": 4.6194635914468377e-07, "loss": 0.0321, "step": 4061 }, { "epoch": 4.562122701109083, "grad_norm": 0.34016741280577595, "learning_rate": 4.595927052053162e-07, "loss": 0.0317, "step": 4062 }, { "epoch": 4.563245823389021, "grad_norm": 0.3783286672968007, "learning_rate": 4.5724492158354397e-07, "loss": 0.0346, "step": 4063 }, { "epoch": 4.56436894566896, "grad_norm": 0.3627087896517975, "learning_rate": 4.5490300972398705e-07, "loss": 0.0337, "step": 4064 }, { "epoch": 4.565492067948898, "grad_norm": 0.362771085697257, "learning_rate": 4.52566971067645e-07, "loss": 0.0353, "step": 4065 }, { "epoch": 4.566615190228836, "grad_norm": 0.3102609073574198, "learning_rate": 4.502368070519114e-07, "loss": 0.0274, "step": 4066 }, { "epoch": 4.567738312508775, "grad_norm": 0.33733629821182154, "learning_rate": 4.4791251911056043e-07, "loss": 0.0344, "step": 4067 }, { "epoch": 4.568861434788713, "grad_norm": 0.34048282314449596, "learning_rate": 4.4559410867375365e-07, "loss": 0.0378, "step": 4068 }, { "epoch": 4.569984557068651, "grad_norm": 0.34656193172912014, "learning_rate": 4.432815771680321e-07, "loss": 0.0346, "step": 4069 }, { "epoch": 4.571107679348589, "grad_norm": 0.32758773549200243, "learning_rate": 4.409749260163232e-07, "loss": 0.0345, "step": 4070 }, { "epoch": 4.572230801628527, "grad_norm": 0.3654433415452595, "learning_rate": 4.386741566379338e-07, "loss": 0.0387, "step": 4071 }, { "epoch": 4.573353923908465, "grad_norm": 0.3367092688064399, "learning_rate": 4.3637927044855476e-07, "loss": 0.0324, "step": 4072 }, { "epoch": 4.574477046188404, "grad_norm": 0.3640805465047991, "learning_rate": 4.340902688602544e-07, "loss": 0.0339, "step": 4073 }, { "epoch": 4.575600168468342, "grad_norm": 0.34452856807576676, "learning_rate": 4.3180715328147826e-07, "loss": 0.034, "step": 4074 }, { "epoch": 4.57672329074828, "grad_norm": 0.34328210378409857, "learning_rate": 4.295299251170537e-07, "loss": 0.0327, "step": 4075 }, { "epoch": 4.577846413028219, "grad_norm": 0.35304122276887184, "learning_rate": 4.272585857681844e-07, "loss": 0.0344, "step": 4076 }, { "epoch": 4.578969535308157, "grad_norm": 0.3253336789166046, "learning_rate": 4.249931366324511e-07, "loss": 0.0331, "step": 4077 }, { "epoch": 4.580092657588095, "grad_norm": 0.3388987083083182, "learning_rate": 4.2273357910380896e-07, "loss": 0.0343, "step": 4078 }, { "epoch": 4.5812157798680335, "grad_norm": 0.3543772419693709, "learning_rate": 4.2047991457258905e-07, "loss": 0.0374, "step": 4079 }, { "epoch": 4.5823389021479715, "grad_norm": 0.3765630360863373, "learning_rate": 4.182321444254944e-07, "loss": 0.0354, "step": 4080 }, { "epoch": 4.583462024427909, "grad_norm": 0.3540818181461558, "learning_rate": 4.1599027004560535e-07, "loss": 0.0428, "step": 4081 }, { "epoch": 4.584585146707848, "grad_norm": 0.34719828160408894, "learning_rate": 4.1375429281236946e-07, "loss": 0.0366, "step": 4082 }, { "epoch": 4.585708268987786, "grad_norm": 0.3264230696168353, "learning_rate": 4.115242141016085e-07, "loss": 0.0316, "step": 4083 }, { "epoch": 4.586831391267724, "grad_norm": 0.34719908923103476, "learning_rate": 4.0930003528551587e-07, "loss": 0.0359, "step": 4084 }, { "epoch": 4.587954513547663, "grad_norm": 0.35625052906788557, "learning_rate": 4.0708175773265246e-07, "loss": 0.0348, "step": 4085 }, { "epoch": 4.589077635827601, "grad_norm": 0.3650173507417219, "learning_rate": 4.0486938280794754e-07, "loss": 0.038, "step": 4086 }, { "epoch": 4.590200758107539, "grad_norm": 0.33970468745861326, "learning_rate": 4.0266291187270435e-07, "loss": 0.033, "step": 4087 }, { "epoch": 4.591323880387478, "grad_norm": 0.3841339639866425, "learning_rate": 4.004623462845825e-07, "loss": 0.0344, "step": 4088 }, { "epoch": 4.592447002667416, "grad_norm": 0.3247187954531285, "learning_rate": 3.9826768739761765e-07, "loss": 0.0345, "step": 4089 }, { "epoch": 4.5935701249473535, "grad_norm": 0.3431276872602053, "learning_rate": 3.960789365622075e-07, "loss": 0.0311, "step": 4090 }, { "epoch": 4.594693247227292, "grad_norm": 0.34267005186851196, "learning_rate": 3.938960951251136e-07, "loss": 0.0376, "step": 4091 }, { "epoch": 4.59581636950723, "grad_norm": 0.34136270274596336, "learning_rate": 3.917191644294627e-07, "loss": 0.0316, "step": 4092 }, { "epoch": 4.596939491787168, "grad_norm": 0.34219281794077305, "learning_rate": 3.895481458147454e-07, "loss": 0.0339, "step": 4093 }, { "epoch": 4.598062614067107, "grad_norm": 0.3695903719561843, "learning_rate": 3.8738304061681107e-07, "loss": 0.0391, "step": 4094 }, { "epoch": 4.599185736347045, "grad_norm": 0.3426126714468979, "learning_rate": 3.852238501678751e-07, "loss": 0.0339, "step": 4095 }, { "epoch": 4.600308858626983, "grad_norm": 0.3600465378066275, "learning_rate": 3.830705757965081e-07, "loss": 0.0345, "step": 4096 }, { "epoch": 4.601431980906922, "grad_norm": 0.4555888651642329, "learning_rate": 3.809232188276468e-07, "loss": 0.0375, "step": 4097 }, { "epoch": 4.60255510318686, "grad_norm": 0.3441834185775792, "learning_rate": 3.7878178058258217e-07, "loss": 0.0376, "step": 4098 }, { "epoch": 4.603678225466798, "grad_norm": 0.34364984838035073, "learning_rate": 3.766462623789646e-07, "loss": 0.0334, "step": 4099 }, { "epoch": 4.6048013477467356, "grad_norm": 0.36747528881747177, "learning_rate": 3.745166655308019e-07, "loss": 0.0404, "step": 4100 }, { "epoch": 4.605924470026674, "grad_norm": 0.3341683012479436, "learning_rate": 3.723929913484581e-07, "loss": 0.0301, "step": 4101 }, { "epoch": 4.607047592306612, "grad_norm": 0.3505021440089559, "learning_rate": 3.702752411386534e-07, "loss": 0.0367, "step": 4102 }, { "epoch": 4.60817071458655, "grad_norm": 0.3474780616469266, "learning_rate": 3.681634162044645e-07, "loss": 0.0358, "step": 4103 }, { "epoch": 4.609293836866489, "grad_norm": 0.33887657295088386, "learning_rate": 3.6605751784531853e-07, "loss": 0.0362, "step": 4104 }, { "epoch": 4.610416959146427, "grad_norm": 0.34961137504173107, "learning_rate": 3.6395754735699896e-07, "loss": 0.0379, "step": 4105 }, { "epoch": 4.611540081426365, "grad_norm": 0.3302529939325971, "learning_rate": 3.6186350603164e-07, "loss": 0.0334, "step": 4106 }, { "epoch": 4.612663203706304, "grad_norm": 0.33644579720443657, "learning_rate": 3.5977539515772874e-07, "loss": 0.0345, "step": 4107 }, { "epoch": 4.613786325986242, "grad_norm": 0.3278842607379112, "learning_rate": 3.57693216020103e-07, "loss": 0.0332, "step": 4108 }, { "epoch": 4.61490944826618, "grad_norm": 0.36040005767567596, "learning_rate": 3.556169698999501e-07, "loss": 0.035, "step": 4109 }, { "epoch": 4.6160325705461185, "grad_norm": 0.35158408144549563, "learning_rate": 3.535466580748059e-07, "loss": 0.0345, "step": 4110 }, { "epoch": 4.617155692826056, "grad_norm": 0.36439910096515155, "learning_rate": 3.5148228181855923e-07, "loss": 0.0373, "step": 4111 }, { "epoch": 4.618278815105994, "grad_norm": 0.33513522124762823, "learning_rate": 3.4942384240144176e-07, "loss": 0.0347, "step": 4112 }, { "epoch": 4.619401937385933, "grad_norm": 0.34260874749603937, "learning_rate": 3.473713410900326e-07, "loss": 0.0334, "step": 4113 }, { "epoch": 4.620525059665871, "grad_norm": 0.3352507044783814, "learning_rate": 3.453247791472603e-07, "loss": 0.0357, "step": 4114 }, { "epoch": 4.621648181945809, "grad_norm": 0.34750467435944776, "learning_rate": 3.4328415783239646e-07, "loss": 0.033, "step": 4115 }, { "epoch": 4.622771304225748, "grad_norm": 0.3428620974144906, "learning_rate": 3.4124947840105673e-07, "loss": 0.0314, "step": 4116 }, { "epoch": 4.623894426505686, "grad_norm": 0.36245758828541697, "learning_rate": 3.3922074210520407e-07, "loss": 0.0349, "step": 4117 }, { "epoch": 4.625017548785624, "grad_norm": 0.35741628717665924, "learning_rate": 3.3719795019313993e-07, "loss": 0.0319, "step": 4118 }, { "epoch": 4.626140671065563, "grad_norm": 0.3571691112754241, "learning_rate": 3.351811039095121e-07, "loss": 0.0341, "step": 4119 }, { "epoch": 4.6272637933455005, "grad_norm": 0.3361597736689054, "learning_rate": 3.3317020449530666e-07, "loss": 0.0337, "step": 4120 }, { "epoch": 4.6283869156254385, "grad_norm": 0.352268450177105, "learning_rate": 3.3116525318785286e-07, "loss": 0.0315, "step": 4121 }, { "epoch": 4.629510037905377, "grad_norm": 0.3496979817415821, "learning_rate": 3.291662512208216e-07, "loss": 0.0377, "step": 4122 }, { "epoch": 4.630633160185315, "grad_norm": 0.3415382592263034, "learning_rate": 3.271731998242167e-07, "loss": 0.0349, "step": 4123 }, { "epoch": 4.631756282465253, "grad_norm": 0.3329970180899988, "learning_rate": 3.2518610022438724e-07, "loss": 0.0308, "step": 4124 }, { "epoch": 4.632879404745192, "grad_norm": 0.3554638729226304, "learning_rate": 3.2320495364401625e-07, "loss": 0.0396, "step": 4125 }, { "epoch": 4.63400252702513, "grad_norm": 0.3467526836273791, "learning_rate": 3.2122976130212644e-07, "loss": 0.0334, "step": 4126 }, { "epoch": 4.635125649305068, "grad_norm": 0.34256948945547966, "learning_rate": 3.192605244140745e-07, "loss": 0.0329, "step": 4127 }, { "epoch": 4.636248771585007, "grad_norm": 0.35204121377326897, "learning_rate": 3.172972441915523e-07, "loss": 0.0332, "step": 4128 }, { "epoch": 4.637371893864945, "grad_norm": 0.37489357665496786, "learning_rate": 3.153399218425901e-07, "loss": 0.0365, "step": 4129 }, { "epoch": 4.638495016144883, "grad_norm": 0.32498835574013707, "learning_rate": 3.13388558571549e-07, "loss": 0.0307, "step": 4130 }, { "epoch": 4.6396181384248205, "grad_norm": 0.33414718587616266, "learning_rate": 3.114431555791253e-07, "loss": 0.037, "step": 4131 }, { "epoch": 4.640741260704759, "grad_norm": 0.3384428063498482, "learning_rate": 3.0950371406234357e-07, "loss": 0.0323, "step": 4132 }, { "epoch": 4.641864382984697, "grad_norm": 0.3493656495995995, "learning_rate": 3.075702352145671e-07, "loss": 0.0337, "step": 4133 }, { "epoch": 4.642987505264635, "grad_norm": 0.34456307900275657, "learning_rate": 3.0564272022548414e-07, "loss": 0.0321, "step": 4134 }, { "epoch": 4.644110627544574, "grad_norm": 0.35749872404688643, "learning_rate": 3.0372117028111825e-07, "loss": 0.0353, "step": 4135 }, { "epoch": 4.645233749824512, "grad_norm": 0.3702263663076714, "learning_rate": 3.0180558656381806e-07, "loss": 0.0365, "step": 4136 }, { "epoch": 4.64635687210445, "grad_norm": 0.32586875886680355, "learning_rate": 2.9989597025226523e-07, "loss": 0.032, "step": 4137 }, { "epoch": 4.647479994384389, "grad_norm": 0.33538912472499605, "learning_rate": 2.979923225214665e-07, "loss": 0.0313, "step": 4138 }, { "epoch": 4.648603116664327, "grad_norm": 0.37507751939830075, "learning_rate": 2.9609464454275707e-07, "loss": 0.0375, "step": 4139 }, { "epoch": 4.649726238944265, "grad_norm": 0.3481121639949164, "learning_rate": 2.942029374837996e-07, "loss": 0.0314, "step": 4140 }, { "epoch": 4.6508493612242034, "grad_norm": 0.3423593643914144, "learning_rate": 2.9231720250858296e-07, "loss": 0.0343, "step": 4141 }, { "epoch": 4.651972483504141, "grad_norm": 0.35719566136893616, "learning_rate": 2.904374407774191e-07, "loss": 0.0366, "step": 4142 }, { "epoch": 4.653095605784079, "grad_norm": 0.3658780078221181, "learning_rate": 2.8856365344694604e-07, "loss": 0.0348, "step": 4143 }, { "epoch": 4.654218728064018, "grad_norm": 0.34290940357613336, "learning_rate": 2.866958416701271e-07, "loss": 0.0335, "step": 4144 }, { "epoch": 4.655341850343956, "grad_norm": 0.33735481225106245, "learning_rate": 2.8483400659624737e-07, "loss": 0.0354, "step": 4145 }, { "epoch": 4.656464972623894, "grad_norm": 0.35331730640988807, "learning_rate": 2.8297814937091495e-07, "loss": 0.0363, "step": 4146 }, { "epoch": 4.657588094903833, "grad_norm": 0.3582352972813522, "learning_rate": 2.8112827113605637e-07, "loss": 0.0321, "step": 4147 }, { "epoch": 4.658711217183771, "grad_norm": 0.369439070620047, "learning_rate": 2.792843730299244e-07, "loss": 0.0409, "step": 4148 }, { "epoch": 4.659834339463709, "grad_norm": 0.36467256732278813, "learning_rate": 2.774464561870893e-07, "loss": 0.0331, "step": 4149 }, { "epoch": 4.660957461743648, "grad_norm": 0.3578494257105483, "learning_rate": 2.7561452173844206e-07, "loss": 0.0348, "step": 4150 }, { "epoch": 4.6620805840235855, "grad_norm": 0.33043652998644596, "learning_rate": 2.7378857081119204e-07, "loss": 0.0291, "step": 4151 }, { "epoch": 4.663203706303523, "grad_norm": 0.32682915997898, "learning_rate": 2.7196860452886496e-07, "loss": 0.0295, "step": 4152 }, { "epoch": 4.664326828583462, "grad_norm": 0.35842979933675306, "learning_rate": 2.7015462401130843e-07, "loss": 0.0347, "step": 4153 }, { "epoch": 4.6654499508634, "grad_norm": 0.3608036317898208, "learning_rate": 2.68346630374684e-07, "loss": 0.0351, "step": 4154 }, { "epoch": 4.666573073143338, "grad_norm": 0.3445352154282416, "learning_rate": 2.665446247314696e-07, "loss": 0.0368, "step": 4155 }, { "epoch": 4.667696195423277, "grad_norm": 0.36637403299593924, "learning_rate": 2.6474860819046157e-07, "loss": 0.0384, "step": 4156 }, { "epoch": 4.668819317703215, "grad_norm": 0.3402971768771307, "learning_rate": 2.629585818567637e-07, "loss": 0.0336, "step": 4157 }, { "epoch": 4.669942439983153, "grad_norm": 0.32819300144957086, "learning_rate": 2.6117454683180274e-07, "loss": 0.0312, "step": 4158 }, { "epoch": 4.671065562263092, "grad_norm": 0.3449374419407223, "learning_rate": 2.5939650421331395e-07, "loss": 0.0334, "step": 4159 }, { "epoch": 4.67218868454303, "grad_norm": 0.3596838423275439, "learning_rate": 2.576244550953466e-07, "loss": 0.0375, "step": 4160 }, { "epoch": 4.6733118068229675, "grad_norm": 0.3685410673567921, "learning_rate": 2.5585840056826295e-07, "loss": 0.0381, "step": 4161 }, { "epoch": 4.674434929102906, "grad_norm": 0.352525835209393, "learning_rate": 2.540983417187348e-07, "loss": 0.033, "step": 4162 }, { "epoch": 4.675558051382844, "grad_norm": 0.3406022335981378, "learning_rate": 2.5234427962974486e-07, "loss": 0.03, "step": 4163 }, { "epoch": 4.676681173662782, "grad_norm": 0.3732219938497303, "learning_rate": 2.5059621538058743e-07, "loss": 0.0327, "step": 4164 }, { "epoch": 4.677804295942721, "grad_norm": 0.3454388239084282, "learning_rate": 2.488541500468666e-07, "loss": 0.0321, "step": 4165 }, { "epoch": 4.678927418222659, "grad_norm": 0.3482806759890393, "learning_rate": 2.47118084700495e-07, "loss": 0.0334, "step": 4166 }, { "epoch": 4.680050540502597, "grad_norm": 0.3423367434752234, "learning_rate": 2.453880204096892e-07, "loss": 0.0325, "step": 4167 }, { "epoch": 4.681173662782536, "grad_norm": 0.35270956062249814, "learning_rate": 2.4366395823898104e-07, "loss": 0.0339, "step": 4168 }, { "epoch": 4.682296785062474, "grad_norm": 0.3296367877167982, "learning_rate": 2.419458992492019e-07, "loss": 0.0329, "step": 4169 }, { "epoch": 4.683419907342412, "grad_norm": 0.33676841186669126, "learning_rate": 2.402338444974928e-07, "loss": 0.0304, "step": 4170 }, { "epoch": 4.6845430296223505, "grad_norm": 0.3432291676295886, "learning_rate": 2.3852779503730217e-07, "loss": 0.0357, "step": 4171 }, { "epoch": 4.685666151902288, "grad_norm": 0.3415736112710834, "learning_rate": 2.3682775191837814e-07, "loss": 0.0331, "step": 4172 }, { "epoch": 4.686789274182226, "grad_norm": 0.3718654445263105, "learning_rate": 2.3513371618677838e-07, "loss": 0.036, "step": 4173 }, { "epoch": 4.687912396462165, "grad_norm": 0.3659974123825445, "learning_rate": 2.3344568888485907e-07, "loss": 0.0361, "step": 4174 }, { "epoch": 4.689035518742103, "grad_norm": 0.34842971301416137, "learning_rate": 2.3176367105128494e-07, "loss": 0.0353, "step": 4175 }, { "epoch": 4.690158641022041, "grad_norm": 0.35992964433081653, "learning_rate": 2.300876637210181e-07, "loss": 0.0352, "step": 4176 }, { "epoch": 4.69128176330198, "grad_norm": 0.35358447232276063, "learning_rate": 2.2841766792532472e-07, "loss": 0.0367, "step": 4177 }, { "epoch": 4.692404885581918, "grad_norm": 0.3458896703817227, "learning_rate": 2.2675368469177171e-07, "loss": 0.0319, "step": 4178 }, { "epoch": 4.693528007861856, "grad_norm": 0.33681153882758513, "learning_rate": 2.2509571504422678e-07, "loss": 0.035, "step": 4179 }, { "epoch": 4.694651130141795, "grad_norm": 0.34571417197318716, "learning_rate": 2.2344376000285606e-07, "loss": 0.035, "step": 4180 }, { "epoch": 4.6957742524217325, "grad_norm": 0.33932945848040214, "learning_rate": 2.2179782058412646e-07, "loss": 0.0327, "step": 4181 }, { "epoch": 4.6968973747016705, "grad_norm": 0.34489622098163436, "learning_rate": 2.201578978008012e-07, "loss": 0.0337, "step": 4182 }, { "epoch": 4.698020496981609, "grad_norm": 0.3480940006392897, "learning_rate": 2.1852399266194312e-07, "loss": 0.0318, "step": 4183 }, { "epoch": 4.699143619261547, "grad_norm": 0.33290094072681525, "learning_rate": 2.1689610617291357e-07, "loss": 0.0332, "step": 4184 }, { "epoch": 4.700266741541485, "grad_norm": 0.3599868420281495, "learning_rate": 2.1527423933536906e-07, "loss": 0.0385, "step": 4185 }, { "epoch": 4.701389863821424, "grad_norm": 0.3337751304343682, "learning_rate": 2.1365839314726021e-07, "loss": 0.03, "step": 4186 }, { "epoch": 4.702512986101362, "grad_norm": 0.33760321339006033, "learning_rate": 2.1204856860283506e-07, "loss": 0.0353, "step": 4187 }, { "epoch": 4.7036361083813, "grad_norm": 0.3357458834520037, "learning_rate": 2.1044476669263793e-07, "loss": 0.0316, "step": 4188 }, { "epoch": 4.704759230661239, "grad_norm": 0.3494253152613926, "learning_rate": 2.0884698840350492e-07, "loss": 0.0352, "step": 4189 }, { "epoch": 4.705882352941177, "grad_norm": 0.3313642700196441, "learning_rate": 2.0725523471856744e-07, "loss": 0.0344, "step": 4190 }, { "epoch": 4.707005475221115, "grad_norm": 0.3515014279713326, "learning_rate": 2.056695066172476e-07, "loss": 0.0375, "step": 4191 }, { "epoch": 4.708128597501053, "grad_norm": 0.3393284033675587, "learning_rate": 2.0408980507526267e-07, "loss": 0.0324, "step": 4192 }, { "epoch": 4.709251719780991, "grad_norm": 0.3433377101697511, "learning_rate": 2.0251613106461955e-07, "loss": 0.0351, "step": 4193 }, { "epoch": 4.710374842060929, "grad_norm": 0.3609914609514239, "learning_rate": 2.0094848555361702e-07, "loss": 0.0343, "step": 4194 }, { "epoch": 4.711497964340868, "grad_norm": 0.33674994452184703, "learning_rate": 1.993868695068457e-07, "loss": 0.0332, "step": 4195 }, { "epoch": 4.712621086620806, "grad_norm": 0.3405640938227505, "learning_rate": 1.978312838851837e-07, "loss": 0.0338, "step": 4196 }, { "epoch": 4.713744208900744, "grad_norm": 0.3518234996906055, "learning_rate": 1.9628172964580082e-07, "loss": 0.0323, "step": 4197 }, { "epoch": 4.714867331180682, "grad_norm": 0.37220183549793007, "learning_rate": 1.9473820774215557e-07, "loss": 0.0342, "step": 4198 }, { "epoch": 4.715990453460621, "grad_norm": 0.32965656846842434, "learning_rate": 1.932007191239915e-07, "loss": 0.0295, "step": 4199 }, { "epoch": 4.717113575740559, "grad_norm": 0.3521785347247238, "learning_rate": 1.9166926473734636e-07, "loss": 0.033, "step": 4200 }, { "epoch": 4.718236698020497, "grad_norm": 0.35522561854409634, "learning_rate": 1.9014384552453635e-07, "loss": 0.0338, "step": 4201 }, { "epoch": 4.719359820300435, "grad_norm": 0.3310871063889209, "learning_rate": 1.8862446242417175e-07, "loss": 0.0327, "step": 4202 }, { "epoch": 4.720482942580373, "grad_norm": 0.3239117817488521, "learning_rate": 1.8711111637114364e-07, "loss": 0.0334, "step": 4203 }, { "epoch": 4.721606064860311, "grad_norm": 0.33473520840189047, "learning_rate": 1.856038082966327e-07, "loss": 0.0315, "step": 4204 }, { "epoch": 4.72272918714025, "grad_norm": 0.3546234500423055, "learning_rate": 1.841025391281015e-07, "loss": 0.0347, "step": 4205 }, { "epoch": 4.723852309420188, "grad_norm": 0.35300752775260824, "learning_rate": 1.8260730978929664e-07, "loss": 0.0373, "step": 4206 }, { "epoch": 4.724975431700126, "grad_norm": 0.31962674294469384, "learning_rate": 1.8111812120024884e-07, "loss": 0.0304, "step": 4207 }, { "epoch": 4.726098553980065, "grad_norm": 0.3606715730669359, "learning_rate": 1.7963497427727294e-07, "loss": 0.0337, "step": 4208 }, { "epoch": 4.727221676260003, "grad_norm": 0.32723932407883416, "learning_rate": 1.781578699329667e-07, "loss": 0.032, "step": 4209 }, { "epoch": 4.728344798539941, "grad_norm": 0.32660649785881374, "learning_rate": 1.766868090762075e-07, "loss": 0.0304, "step": 4210 }, { "epoch": 4.7294679208198795, "grad_norm": 0.3834637625549436, "learning_rate": 1.7522179261215467e-07, "loss": 0.043, "step": 4211 }, { "epoch": 4.7305910430998175, "grad_norm": 0.31299817068786057, "learning_rate": 1.7376282144224933e-07, "loss": 0.0302, "step": 4212 }, { "epoch": 4.731714165379755, "grad_norm": 0.34864103394231777, "learning_rate": 1.7230989646421337e-07, "loss": 0.0329, "step": 4213 }, { "epoch": 4.732837287659694, "grad_norm": 0.35855522330101297, "learning_rate": 1.7086301857204725e-07, "loss": 0.0327, "step": 4214 }, { "epoch": 4.733960409939632, "grad_norm": 0.35553712042165425, "learning_rate": 1.694221886560299e-07, "loss": 0.037, "step": 4215 }, { "epoch": 4.73508353221957, "grad_norm": 0.3348937621754762, "learning_rate": 1.6798740760272104e-07, "loss": 0.0329, "step": 4216 }, { "epoch": 4.736206654499509, "grad_norm": 0.3809215375029254, "learning_rate": 1.665586762949567e-07, "loss": 0.0366, "step": 4217 }, { "epoch": 4.737329776779447, "grad_norm": 0.34204433938299095, "learning_rate": 1.6513599561185034e-07, "loss": 0.0328, "step": 4218 }, { "epoch": 4.738452899059385, "grad_norm": 0.3550432910580413, "learning_rate": 1.6371936642879504e-07, "loss": 0.0321, "step": 4219 }, { "epoch": 4.739576021339324, "grad_norm": 0.3318127050500399, "learning_rate": 1.623087896174558e-07, "loss": 0.0365, "step": 4220 }, { "epoch": 4.740699143619262, "grad_norm": 0.36204671764832996, "learning_rate": 1.6090426604577714e-07, "loss": 0.0346, "step": 4221 }, { "epoch": 4.7418222658991995, "grad_norm": 0.3387248418153391, "learning_rate": 1.5950579657797894e-07, "loss": 0.0306, "step": 4222 }, { "epoch": 4.742945388179138, "grad_norm": 0.3427148305083139, "learning_rate": 1.5811338207455284e-07, "loss": 0.0345, "step": 4223 }, { "epoch": 4.744068510459076, "grad_norm": 0.35411961450361146, "learning_rate": 1.5672702339226909e-07, "loss": 0.0371, "step": 4224 }, { "epoch": 4.745191632739014, "grad_norm": 0.3721507850238725, "learning_rate": 1.553467213841664e-07, "loss": 0.0368, "step": 4225 }, { "epoch": 4.746314755018953, "grad_norm": 0.3466898365608997, "learning_rate": 1.5397247689956318e-07, "loss": 0.0308, "step": 4226 }, { "epoch": 4.747437877298891, "grad_norm": 0.3574266108394466, "learning_rate": 1.5260429078404416e-07, "loss": 0.0335, "step": 4227 }, { "epoch": 4.748560999578829, "grad_norm": 0.3400286952615133, "learning_rate": 1.5124216387947143e-07, "loss": 0.0307, "step": 4228 }, { "epoch": 4.749684121858768, "grad_norm": 0.36309544829463203, "learning_rate": 1.4988609702397683e-07, "loss": 0.04, "step": 4229 }, { "epoch": 4.750807244138706, "grad_norm": 0.3417693922064799, "learning_rate": 1.4853609105196175e-07, "loss": 0.0367, "step": 4230 }, { "epoch": 4.751930366418644, "grad_norm": 0.3602988292077408, "learning_rate": 1.4719214679409954e-07, "loss": 0.0358, "step": 4231 }, { "epoch": 4.753053488698582, "grad_norm": 0.3735538962893692, "learning_rate": 1.4585426507733536e-07, "loss": 0.0311, "step": 4232 }, { "epoch": 4.75417661097852, "grad_norm": 0.39448460405940855, "learning_rate": 1.445224467248818e-07, "loss": 0.0385, "step": 4233 }, { "epoch": 4.755299733258458, "grad_norm": 0.36862373252888875, "learning_rate": 1.4319669255622115e-07, "loss": 0.0378, "step": 4234 }, { "epoch": 4.756422855538396, "grad_norm": 0.3606558317899239, "learning_rate": 1.418770033871053e-07, "loss": 0.0351, "step": 4235 }, { "epoch": 4.757545977818335, "grad_norm": 0.357546164191823, "learning_rate": 1.405633800295525e-07, "loss": 0.0377, "step": 4236 }, { "epoch": 4.758669100098273, "grad_norm": 0.33523607494626556, "learning_rate": 1.392558232918506e-07, "loss": 0.0313, "step": 4237 }, { "epoch": 4.759792222378211, "grad_norm": 0.3554788435214293, "learning_rate": 1.3795433397855274e-07, "loss": 0.0363, "step": 4238 }, { "epoch": 4.76091534465815, "grad_norm": 0.334541232594485, "learning_rate": 1.366589128904805e-07, "loss": 0.0331, "step": 4239 }, { "epoch": 4.762038466938088, "grad_norm": 0.36173611814235396, "learning_rate": 1.3536956082472074e-07, "loss": 0.0369, "step": 4240 }, { "epoch": 4.763161589218026, "grad_norm": 0.3250258742570545, "learning_rate": 1.3408627857462443e-07, "loss": 0.0318, "step": 4241 }, { "epoch": 4.7642847114979645, "grad_norm": 0.3565828415127001, "learning_rate": 1.32809066929811e-07, "loss": 0.0387, "step": 4242 }, { "epoch": 4.7654078337779024, "grad_norm": 0.3383956134948221, "learning_rate": 1.3153792667616183e-07, "loss": 0.0332, "step": 4243 }, { "epoch": 4.76653095605784, "grad_norm": 0.3476033007253469, "learning_rate": 1.302728585958246e-07, "loss": 0.0365, "step": 4244 }, { "epoch": 4.767654078337779, "grad_norm": 0.3516963013877261, "learning_rate": 1.290138634672089e-07, "loss": 0.0386, "step": 4245 }, { "epoch": 4.768777200617717, "grad_norm": 0.34481467608140326, "learning_rate": 1.2776094206498834e-07, "loss": 0.0339, "step": 4246 }, { "epoch": 4.769900322897655, "grad_norm": 0.3925266775395916, "learning_rate": 1.2651409516009848e-07, "loss": 0.0345, "step": 4247 }, { "epoch": 4.771023445177594, "grad_norm": 0.356302010857859, "learning_rate": 1.2527332351973899e-07, "loss": 0.0342, "step": 4248 }, { "epoch": 4.772146567457532, "grad_norm": 0.3385658467723331, "learning_rate": 1.2403862790737021e-07, "loss": 0.0297, "step": 4249 }, { "epoch": 4.77326968973747, "grad_norm": 0.34274297140098986, "learning_rate": 1.2281000908271336e-07, "loss": 0.0335, "step": 4250 }, { "epoch": 4.774392812017409, "grad_norm": 0.32904157582350113, "learning_rate": 1.2158746780175257e-07, "loss": 0.0324, "step": 4251 }, { "epoch": 4.7755159342973466, "grad_norm": 0.3583962692152138, "learning_rate": 1.2037100481672836e-07, "loss": 0.0349, "step": 4252 }, { "epoch": 4.7766390565772845, "grad_norm": 0.3411729792940599, "learning_rate": 1.1916062087614644e-07, "loss": 0.0344, "step": 4253 }, { "epoch": 4.777762178857223, "grad_norm": 0.34168029879821443, "learning_rate": 1.1795631672476771e-07, "loss": 0.0364, "step": 4254 }, { "epoch": 4.778885301137161, "grad_norm": 0.35060334074323224, "learning_rate": 1.1675809310361497e-07, "loss": 0.0466, "step": 4255 }, { "epoch": 4.780008423417099, "grad_norm": 0.34336606647480034, "learning_rate": 1.1556595074996624e-07, "loss": 0.0331, "step": 4256 }, { "epoch": 4.781131545697038, "grad_norm": 0.32382578010358776, "learning_rate": 1.1437989039736253e-07, "loss": 0.0325, "step": 4257 }, { "epoch": 4.782254667976976, "grad_norm": 0.34851470783798044, "learning_rate": 1.1319991277559783e-07, "loss": 0.036, "step": 4258 }, { "epoch": 4.783377790256914, "grad_norm": 0.35953669765583024, "learning_rate": 1.1202601861072693e-07, "loss": 0.0364, "step": 4259 }, { "epoch": 4.784500912536853, "grad_norm": 0.3420834182439802, "learning_rate": 1.108582086250587e-07, "loss": 0.0304, "step": 4260 }, { "epoch": 4.785624034816791, "grad_norm": 0.3474035202936612, "learning_rate": 1.0969648353715945e-07, "loss": 0.0353, "step": 4261 }, { "epoch": 4.786747157096729, "grad_norm": 0.33774829169251014, "learning_rate": 1.0854084406185184e-07, "loss": 0.0309, "step": 4262 }, { "epoch": 4.787870279376667, "grad_norm": 0.3323004162031955, "learning_rate": 1.0739129091021372e-07, "loss": 0.0301, "step": 4263 }, { "epoch": 4.788993401656605, "grad_norm": 0.35491038528232177, "learning_rate": 1.0624782478957818e-07, "loss": 0.0362, "step": 4264 }, { "epoch": 4.790116523936543, "grad_norm": 0.3148060898080571, "learning_rate": 1.051104464035313e-07, "loss": 0.03, "step": 4265 }, { "epoch": 4.791239646216482, "grad_norm": 0.3454633600900965, "learning_rate": 1.0397915645191437e-07, "loss": 0.0325, "step": 4266 }, { "epoch": 4.79236276849642, "grad_norm": 0.3387657241758695, "learning_rate": 1.028539556308239e-07, "loss": 0.0366, "step": 4267 }, { "epoch": 4.793485890776358, "grad_norm": 0.343688514356464, "learning_rate": 1.017348446326083e-07, "loss": 0.0355, "step": 4268 }, { "epoch": 4.794609013056297, "grad_norm": 0.33804452702394205, "learning_rate": 1.0062182414586786e-07, "loss": 0.0317, "step": 4269 }, { "epoch": 4.795732135336235, "grad_norm": 0.34775190604371375, "learning_rate": 9.951489485545696e-08, "loss": 0.0357, "step": 4270 }, { "epoch": 4.796855257616173, "grad_norm": 0.34403813226107627, "learning_rate": 9.841405744248078e-08, "loss": 0.036, "step": 4271 }, { "epoch": 4.7979783798961115, "grad_norm": 0.337690442201065, "learning_rate": 9.731931258429638e-08, "loss": 0.0336, "step": 4272 }, { "epoch": 4.7991015021760495, "grad_norm": 0.36856828496724703, "learning_rate": 9.623066095451494e-08, "loss": 0.04, "step": 4273 }, { "epoch": 4.800224624455987, "grad_norm": 0.3454697388417179, "learning_rate": 9.514810322299283e-08, "loss": 0.0327, "step": 4274 }, { "epoch": 4.801347746735926, "grad_norm": 0.3462964343590165, "learning_rate": 9.407164005584057e-08, "loss": 0.0333, "step": 4275 }, { "epoch": 4.802470869015864, "grad_norm": 0.3431968345054756, "learning_rate": 9.300127211541832e-08, "loss": 0.0314, "step": 4276 }, { "epoch": 4.803593991295802, "grad_norm": 0.3651225919722223, "learning_rate": 9.193700006033368e-08, "loss": 0.0412, "step": 4277 }, { "epoch": 4.804717113575741, "grad_norm": 0.33522720006840734, "learning_rate": 9.087882454544839e-08, "loss": 0.0298, "step": 4278 }, { "epoch": 4.805840235855679, "grad_norm": 0.3320171412387023, "learning_rate": 8.982674622186605e-08, "loss": 0.0309, "step": 4279 }, { "epoch": 4.806963358135617, "grad_norm": 0.3506412235588877, "learning_rate": 8.878076573694328e-08, "loss": 0.0344, "step": 4280 }, { "epoch": 4.808086480415556, "grad_norm": 0.36747870462605653, "learning_rate": 8.774088373428413e-08, "loss": 0.0401, "step": 4281 }, { "epoch": 4.809209602695494, "grad_norm": 0.3537987285313262, "learning_rate": 8.67071008537379e-08, "loss": 0.0342, "step": 4282 }, { "epoch": 4.8103327249754315, "grad_norm": 0.33629875351700084, "learning_rate": 8.567941773140465e-08, "loss": 0.0332, "step": 4283 }, { "epoch": 4.81145584725537, "grad_norm": 0.3534627818646823, "learning_rate": 8.465783499962633e-08, "loss": 0.0354, "step": 4284 }, { "epoch": 4.812578969535308, "grad_norm": 0.3630589407739355, "learning_rate": 8.364235328699566e-08, "loss": 0.0352, "step": 4285 }, { "epoch": 4.813702091815246, "grad_norm": 0.33985452597445087, "learning_rate": 8.263297321835062e-08, "loss": 0.032, "step": 4286 }, { "epoch": 4.814825214095185, "grad_norm": 0.355931019590897, "learning_rate": 8.162969541477217e-08, "loss": 0.0323, "step": 4287 }, { "epoch": 4.815948336375123, "grad_norm": 0.36125431699041477, "learning_rate": 8.063252049358983e-08, "loss": 0.04, "step": 4288 }, { "epoch": 4.817071458655061, "grad_norm": 0.3354243066230309, "learning_rate": 7.96414490683739e-08, "loss": 0.0334, "step": 4289 }, { "epoch": 4.818194580935, "grad_norm": 0.35478986196267864, "learning_rate": 7.865648174894325e-08, "loss": 0.0338, "step": 4290 }, { "epoch": 4.819317703214938, "grad_norm": 0.34998123448801854, "learning_rate": 7.767761914135974e-08, "loss": 0.0339, "step": 4291 }, { "epoch": 4.820440825494876, "grad_norm": 0.35949343783541043, "learning_rate": 7.670486184792713e-08, "loss": 0.0333, "step": 4292 }, { "epoch": 4.8215639477748145, "grad_norm": 0.3651218793634851, "learning_rate": 7.573821046719332e-08, "loss": 0.0394, "step": 4293 }, { "epoch": 4.822687070054752, "grad_norm": 0.34634749486357275, "learning_rate": 7.477766559395139e-08, "loss": 0.0327, "step": 4294 }, { "epoch": 4.82381019233469, "grad_norm": 0.3413706560361767, "learning_rate": 7.382322781923301e-08, "loss": 0.0367, "step": 4295 }, { "epoch": 4.824933314614629, "grad_norm": 0.3475100318642269, "learning_rate": 7.287489773031508e-08, "loss": 0.0344, "step": 4296 }, { "epoch": 4.826056436894567, "grad_norm": 0.35140910720099416, "learning_rate": 7.193267591071529e-08, "loss": 0.0313, "step": 4297 }, { "epoch": 4.827179559174505, "grad_norm": 0.30755190051409365, "learning_rate": 7.09965629401943e-08, "loss": 0.0299, "step": 4298 }, { "epoch": 4.828302681454443, "grad_norm": 0.3208778262419729, "learning_rate": 7.006655939475248e-08, "loss": 0.0304, "step": 4299 }, { "epoch": 4.829425803734382, "grad_norm": 0.33024767193628707, "learning_rate": 6.914266584662988e-08, "loss": 0.0317, "step": 4300 }, { "epoch": 4.83054892601432, "grad_norm": 0.3600374450371469, "learning_rate": 6.82248828643084e-08, "loss": 0.0358, "step": 4301 }, { "epoch": 4.831672048294258, "grad_norm": 0.343893665355897, "learning_rate": 6.731321101251187e-08, "loss": 0.0348, "step": 4302 }, { "epoch": 4.8327951705741965, "grad_norm": 0.3401993628458368, "learning_rate": 6.640765085220047e-08, "loss": 0.0334, "step": 4303 }, { "epoch": 4.833918292854134, "grad_norm": 0.37117274391604826, "learning_rate": 6.550820294057625e-08, "loss": 0.038, "step": 4304 }, { "epoch": 4.835041415134072, "grad_norm": 0.3270123504885918, "learning_rate": 6.461486783107762e-08, "loss": 0.0318, "step": 4305 }, { "epoch": 4.836164537414011, "grad_norm": 0.35563694733212037, "learning_rate": 6.3727646073386e-08, "loss": 0.0354, "step": 4306 }, { "epoch": 4.837287659693949, "grad_norm": 0.3312605443855957, "learning_rate": 6.284653821341691e-08, "loss": 0.0341, "step": 4307 }, { "epoch": 4.838410781973887, "grad_norm": 0.34887322226150547, "learning_rate": 6.197154479332667e-08, "loss": 0.0356, "step": 4308 }, { "epoch": 4.839533904253826, "grad_norm": 0.34169094625118857, "learning_rate": 6.110266635150796e-08, "loss": 0.0332, "step": 4309 }, { "epoch": 4.840657026533764, "grad_norm": 0.3460642146322915, "learning_rate": 6.02399034225909e-08, "loss": 0.0356, "step": 4310 }, { "epoch": 4.841780148813702, "grad_norm": 0.33286836691758975, "learning_rate": 5.9383256537444144e-08, "loss": 0.0297, "step": 4311 }, { "epoch": 4.842903271093641, "grad_norm": 0.3402965072293783, "learning_rate": 5.853272622317052e-08, "loss": 0.0349, "step": 4312 }, { "epoch": 4.8440263933735785, "grad_norm": 0.35934649357906984, "learning_rate": 5.7688313003112506e-08, "loss": 0.0374, "step": 4313 }, { "epoch": 4.8451495156535165, "grad_norm": 0.3536391767479085, "learning_rate": 5.685001739684448e-08, "loss": 0.037, "step": 4314 }, { "epoch": 4.846272637933455, "grad_norm": 0.41892157440986844, "learning_rate": 5.6017839920180506e-08, "loss": 0.0332, "step": 4315 }, { "epoch": 4.847395760213393, "grad_norm": 0.362207049616821, "learning_rate": 5.519178108516765e-08, "loss": 0.0358, "step": 4316 }, { "epoch": 4.848518882493331, "grad_norm": 0.339492102312975, "learning_rate": 5.437184140009044e-08, "loss": 0.0309, "step": 4317 }, { "epoch": 4.84964200477327, "grad_norm": 0.3487122546387408, "learning_rate": 5.355802136946531e-08, "loss": 0.0345, "step": 4318 }, { "epoch": 4.850765127053208, "grad_norm": 0.3793793262602845, "learning_rate": 5.2750321494046133e-08, "loss": 0.0381, "step": 4319 }, { "epoch": 4.851888249333146, "grad_norm": 0.36623706813318585, "learning_rate": 5.1948742270817584e-08, "loss": 0.0346, "step": 4320 }, { "epoch": 4.853011371613085, "grad_norm": 0.3730438215325291, "learning_rate": 5.1153284193001803e-08, "loss": 0.0391, "step": 4321 }, { "epoch": 4.854134493893023, "grad_norm": 0.31906053615524266, "learning_rate": 5.036394775005282e-08, "loss": 0.0305, "step": 4322 }, { "epoch": 4.855257616172961, "grad_norm": 0.3256027247837537, "learning_rate": 4.958073342765768e-08, "loss": 0.0334, "step": 4323 }, { "epoch": 4.856380738452899, "grad_norm": 0.3507274265615372, "learning_rate": 4.880364170773533e-08, "loss": 0.0349, "step": 4324 }, { "epoch": 4.857503860732837, "grad_norm": 0.3429380538021881, "learning_rate": 4.803267306844106e-08, "loss": 0.0323, "step": 4325 }, { "epoch": 4.858626983012775, "grad_norm": 0.3537057379449312, "learning_rate": 4.726782798415985e-08, "loss": 0.0374, "step": 4326 }, { "epoch": 4.859750105292714, "grad_norm": 0.31967178022973664, "learning_rate": 4.650910692550858e-08, "loss": 0.0312, "step": 4327 }, { "epoch": 4.860873227572652, "grad_norm": 0.3465360661382547, "learning_rate": 4.5756510359337145e-08, "loss": 0.0383, "step": 4328 }, { "epoch": 4.86199634985259, "grad_norm": 0.3480958307636944, "learning_rate": 4.501003874872623e-08, "loss": 0.0361, "step": 4329 }, { "epoch": 4.863119472132528, "grad_norm": 0.351546101719163, "learning_rate": 4.426969255298841e-08, "loss": 0.0365, "step": 4330 }, { "epoch": 4.864242594412467, "grad_norm": 0.33709962673459676, "learning_rate": 4.3535472227667075e-08, "loss": 0.031, "step": 4331 }, { "epoch": 4.865365716692405, "grad_norm": 0.3592936899620035, "learning_rate": 4.280737822453529e-08, "loss": 0.0402, "step": 4332 }, { "epoch": 4.866488838972343, "grad_norm": 0.3400368919719095, "learning_rate": 4.208541099159691e-08, "loss": 0.0324, "step": 4333 }, { "epoch": 4.8676119612522815, "grad_norm": 0.3289295108551006, "learning_rate": 4.136957097308769e-08, "loss": 0.0316, "step": 4334 }, { "epoch": 4.868735083532219, "grad_norm": 0.3564469735283429, "learning_rate": 4.065985860947086e-08, "loss": 0.0383, "step": 4335 }, { "epoch": 4.869858205812157, "grad_norm": 0.3456958920658182, "learning_rate": 3.9956274337441533e-08, "loss": 0.0373, "step": 4336 }, { "epoch": 4.870981328092096, "grad_norm": 0.3546463192740376, "learning_rate": 3.92588185899212e-08, "loss": 0.0351, "step": 4337 }, { "epoch": 4.872104450372034, "grad_norm": 0.3634352198807142, "learning_rate": 3.856749179606323e-08, "loss": 0.0351, "step": 4338 }, { "epoch": 4.873227572651972, "grad_norm": 0.34143099459117354, "learning_rate": 3.7882294381247355e-08, "loss": 0.0325, "step": 4339 }, { "epoch": 4.874350694931911, "grad_norm": 0.3445137921102463, "learning_rate": 3.72032267670841e-08, "loss": 0.0342, "step": 4340 }, { "epoch": 4.875473817211849, "grad_norm": 0.34957356832593245, "learning_rate": 3.6530289371411453e-08, "loss": 0.0335, "step": 4341 }, { "epoch": 4.876596939491787, "grad_norm": 0.3559100988101287, "learning_rate": 3.586348260829486e-08, "loss": 0.0341, "step": 4342 }, { "epoch": 4.877720061771726, "grad_norm": 0.3582325578381939, "learning_rate": 3.520280688802724e-08, "loss": 0.0389, "step": 4343 }, { "epoch": 4.8788431840516635, "grad_norm": 0.3434812907253102, "learning_rate": 3.4548262617131176e-08, "loss": 0.0313, "step": 4344 }, { "epoch": 4.8799663063316014, "grad_norm": 0.32607848224810887, "learning_rate": 3.38998501983534e-08, "loss": 0.0306, "step": 4345 }, { "epoch": 4.88108942861154, "grad_norm": 0.3701550768586464, "learning_rate": 3.3257570030670316e-08, "loss": 0.0361, "step": 4346 }, { "epoch": 4.882212550891478, "grad_norm": 0.36030791267577544, "learning_rate": 3.2621422509282464e-08, "loss": 0.0366, "step": 4347 }, { "epoch": 4.883335673171416, "grad_norm": 0.3451337361364224, "learning_rate": 3.199140802562006e-08, "loss": 0.0314, "step": 4348 }, { "epoch": 4.884458795451355, "grad_norm": 0.3452767709477446, "learning_rate": 3.1367526967336356e-08, "loss": 0.0354, "step": 4349 }, { "epoch": 4.885581917731293, "grad_norm": 0.3547638758596286, "learning_rate": 3.0749779718314273e-08, "loss": 0.0353, "step": 4350 }, { "epoch": 4.886705040011231, "grad_norm": 0.3506982695342505, "learning_rate": 3.013816665865976e-08, "loss": 0.0344, "step": 4351 }, { "epoch": 4.88782816229117, "grad_norm": 0.3737683890517672, "learning_rate": 2.9532688164704005e-08, "loss": 0.0378, "step": 4352 }, { "epoch": 4.888951284571108, "grad_norm": 0.35014209109341093, "learning_rate": 2.8933344609004545e-08, "loss": 0.0367, "step": 4353 }, { "epoch": 4.8900744068510456, "grad_norm": 0.3264875513879221, "learning_rate": 2.834013636034527e-08, "loss": 0.0328, "step": 4354 }, { "epoch": 4.891197529130984, "grad_norm": 0.32367274530213846, "learning_rate": 2.7753063783734212e-08, "loss": 0.0308, "step": 4355 }, { "epoch": 4.892320651410922, "grad_norm": 0.32601616170265874, "learning_rate": 2.7172127240401302e-08, "loss": 0.0306, "step": 4356 }, { "epoch": 4.89344377369086, "grad_norm": 0.3514262599500851, "learning_rate": 2.6597327087805048e-08, "loss": 0.034, "step": 4357 }, { "epoch": 4.894566895970799, "grad_norm": 0.34413690056428353, "learning_rate": 2.6028663679625865e-08, "loss": 0.0358, "step": 4358 }, { "epoch": 4.895690018250737, "grad_norm": 0.33388792579998283, "learning_rate": 2.5466137365768307e-08, "loss": 0.0331, "step": 4359 }, { "epoch": 4.896813140530675, "grad_norm": 0.3558627099214127, "learning_rate": 2.4909748492362162e-08, "loss": 0.0335, "step": 4360 }, { "epoch": 4.897936262810614, "grad_norm": 0.3568173099350905, "learning_rate": 2.4359497401758026e-08, "loss": 0.0335, "step": 4361 }, { "epoch": 4.899059385090552, "grad_norm": 0.3576774734027632, "learning_rate": 2.3815384432531728e-08, "loss": 0.0352, "step": 4362 }, { "epoch": 4.90018250737049, "grad_norm": 0.35519782847619524, "learning_rate": 2.327740991948213e-08, "loss": 0.0346, "step": 4363 }, { "epoch": 4.9013056296504285, "grad_norm": 0.3450872223409437, "learning_rate": 2.2745574193632215e-08, "loss": 0.035, "step": 4364 }, { "epoch": 4.902428751930366, "grad_norm": 0.32911920269693273, "learning_rate": 2.2219877582224657e-08, "loss": 0.0307, "step": 4365 }, { "epoch": 4.903551874210304, "grad_norm": 0.33655879907579667, "learning_rate": 2.170032040872627e-08, "loss": 0.0317, "step": 4366 }, { "epoch": 4.904674996490243, "grad_norm": 0.34894129051496664, "learning_rate": 2.1186902992827995e-08, "loss": 0.0356, "step": 4367 }, { "epoch": 4.905798118770181, "grad_norm": 0.3720141793260485, "learning_rate": 2.067962565043935e-08, "loss": 0.0343, "step": 4368 }, { "epoch": 4.906921241050119, "grad_norm": 0.34874029206243984, "learning_rate": 2.0178488693695096e-08, "loss": 0.0361, "step": 4369 }, { "epoch": 4.908044363330058, "grad_norm": 0.3474003843287452, "learning_rate": 1.968349243094969e-08, "loss": 0.0334, "step": 4370 }, { "epoch": 4.909167485609996, "grad_norm": 0.3423160120197686, "learning_rate": 1.9194637166780606e-08, "loss": 0.0337, "step": 4371 }, { "epoch": 4.910290607889934, "grad_norm": 0.33591485159670115, "learning_rate": 1.8711923201983895e-08, "loss": 0.0343, "step": 4372 }, { "epoch": 4.911413730169873, "grad_norm": 0.33191680893555436, "learning_rate": 1.8235350833579745e-08, "loss": 0.0321, "step": 4373 }, { "epoch": 4.9125368524498105, "grad_norm": 0.3316935624017572, "learning_rate": 1.7764920354809146e-08, "loss": 0.0337, "step": 4374 }, { "epoch": 4.9136599747297485, "grad_norm": 0.35967697736265597, "learning_rate": 1.730063205513277e-08, "loss": 0.0365, "step": 4375 }, { "epoch": 4.914783097009687, "grad_norm": 0.359094588149611, "learning_rate": 1.6842486220232102e-08, "loss": 0.0321, "step": 4376 }, { "epoch": 4.915906219289625, "grad_norm": 0.3604389646619046, "learning_rate": 1.6390483132009415e-08, "loss": 0.0369, "step": 4377 }, { "epoch": 4.917029341569563, "grad_norm": 0.3457113375936627, "learning_rate": 1.5944623068586683e-08, "loss": 0.0339, "step": 4378 }, { "epoch": 4.918152463849502, "grad_norm": 0.31938400735132355, "learning_rate": 1.5504906304306677e-08, "loss": 0.0317, "step": 4379 }, { "epoch": 4.91927558612944, "grad_norm": 0.340077310318995, "learning_rate": 1.5071333109732966e-08, "loss": 0.0361, "step": 4380 }, { "epoch": 4.920398708409378, "grad_norm": 0.35535503166517346, "learning_rate": 1.4643903751647703e-08, "loss": 0.0361, "step": 4381 }, { "epoch": 4.921521830689317, "grad_norm": 0.3537974935188405, "learning_rate": 1.422261849305162e-08, "loss": 0.0342, "step": 4382 }, { "epoch": 4.922644952969255, "grad_norm": 0.32843491322571805, "learning_rate": 1.3807477593166252e-08, "loss": 0.0319, "step": 4383 }, { "epoch": 4.923768075249193, "grad_norm": 0.3642690465675649, "learning_rate": 1.339848130743393e-08, "loss": 0.0363, "step": 4384 }, { "epoch": 4.924891197529131, "grad_norm": 0.3431037322105767, "learning_rate": 1.299562988751335e-08, "loss": 0.0342, "step": 4385 }, { "epoch": 4.926014319809069, "grad_norm": 0.37231643890134525, "learning_rate": 1.2598923581284006e-08, "loss": 0.0341, "step": 4386 }, { "epoch": 4.927137442089007, "grad_norm": 0.3500700062527457, "learning_rate": 1.2208362632842863e-08, "loss": 0.0363, "step": 4387 }, { "epoch": 4.928260564368946, "grad_norm": 0.33425871580503613, "learning_rate": 1.1823947282506576e-08, "loss": 0.0312, "step": 4388 }, { "epoch": 4.929383686648884, "grad_norm": 0.3633980633957219, "learning_rate": 1.144567776681149e-08, "loss": 0.0376, "step": 4389 }, { "epoch": 4.930506808928822, "grad_norm": 0.3349073890426962, "learning_rate": 1.1073554318509206e-08, "loss": 0.0326, "step": 4390 }, { "epoch": 4.931629931208761, "grad_norm": 0.34098481332204433, "learning_rate": 1.0707577166572114e-08, "loss": 0.0341, "step": 4391 }, { "epoch": 4.932753053488699, "grad_norm": 0.3285118951852116, "learning_rate": 1.0347746536191195e-08, "loss": 0.0329, "step": 4392 }, { "epoch": 4.933876175768637, "grad_norm": 0.3451360635835923, "learning_rate": 9.994062648771563e-09, "loss": 0.0345, "step": 4393 }, { "epoch": 4.9349992980485755, "grad_norm": 0.3451649817059978, "learning_rate": 9.646525721940247e-09, "loss": 0.0339, "step": 4394 }, { "epoch": 4.9361224203285134, "grad_norm": 0.3329314247830526, "learning_rate": 9.305135969541746e-09, "loss": 0.0331, "step": 4395 }, { "epoch": 4.937245542608451, "grad_norm": 0.3703275176605861, "learning_rate": 8.969893601634694e-09, "loss": 0.0321, "step": 4396 }, { "epoch": 4.93836866488839, "grad_norm": 0.38004031161852586, "learning_rate": 8.64079882449853e-09, "loss": 0.0343, "step": 4397 }, { "epoch": 4.939491787168328, "grad_norm": 0.36158113782294066, "learning_rate": 8.317851840629055e-09, "loss": 0.0324, "step": 4398 }, { "epoch": 4.940614909448266, "grad_norm": 0.35802038640100103, "learning_rate": 8.001052848739532e-09, "loss": 0.0355, "step": 4399 }, { "epoch": 4.941738031728204, "grad_norm": 0.3534681414083675, "learning_rate": 7.690402043758482e-09, "loss": 0.0342, "step": 4400 }, { "epoch": 4.942861154008143, "grad_norm": 0.3409750506973528, "learning_rate": 7.385899616833003e-09, "loss": 0.0322, "step": 4401 }, { "epoch": 4.943984276288081, "grad_norm": 0.35696111455006185, "learning_rate": 7.087545755327663e-09, "loss": 0.0343, "step": 4402 }, { "epoch": 4.945107398568019, "grad_norm": 0.3628732180685351, "learning_rate": 6.795340642823389e-09, "loss": 0.033, "step": 4403 }, { "epoch": 4.946230520847958, "grad_norm": 0.3561531152392209, "learning_rate": 6.50928445911525e-09, "loss": 0.0347, "step": 4404 }, { "epoch": 4.9473536431278955, "grad_norm": 0.3604397825246946, "learning_rate": 6.229377380218005e-09, "loss": 0.0349, "step": 4405 }, { "epoch": 4.948476765407833, "grad_norm": 0.3515361148095995, "learning_rate": 5.95561957836055e-09, "loss": 0.0349, "step": 4406 }, { "epoch": 4.949599887687772, "grad_norm": 0.34835719474984594, "learning_rate": 5.688011221991474e-09, "loss": 0.034, "step": 4407 }, { "epoch": 4.95072300996771, "grad_norm": 0.36776092832286844, "learning_rate": 5.426552475770175e-09, "loss": 0.0405, "step": 4408 }, { "epoch": 4.951846132247648, "grad_norm": 0.33964356120947575, "learning_rate": 5.1712435005768504e-09, "loss": 0.0314, "step": 4409 }, { "epoch": 4.952969254527587, "grad_norm": 0.3576271748655273, "learning_rate": 4.922084453505838e-09, "loss": 0.0348, "step": 4410 }, { "epoch": 4.954092376807525, "grad_norm": 0.3429879169889465, "learning_rate": 4.679075487866725e-09, "loss": 0.0337, "step": 4411 }, { "epoch": 4.955215499087463, "grad_norm": 0.3745978618879177, "learning_rate": 4.4422167531865675e-09, "loss": 0.0368, "step": 4412 }, { "epoch": 4.956338621367402, "grad_norm": 0.32986700838548433, "learning_rate": 4.211508395206565e-09, "loss": 0.0327, "step": 4413 }, { "epoch": 4.95746174364734, "grad_norm": 0.33311799847039564, "learning_rate": 3.986950555883162e-09, "loss": 0.0348, "step": 4414 }, { "epoch": 4.9585848659272775, "grad_norm": 0.3851607304135312, "learning_rate": 3.768543373391387e-09, "loss": 0.0424, "step": 4415 }, { "epoch": 4.959707988207216, "grad_norm": 0.31347433372100486, "learning_rate": 3.5562869821181843e-09, "loss": 0.0315, "step": 4416 }, { "epoch": 4.960831110487154, "grad_norm": 0.34107319972689115, "learning_rate": 3.3501815126668613e-09, "loss": 0.0339, "step": 4417 }, { "epoch": 4.961954232767092, "grad_norm": 0.33193990363553355, "learning_rate": 3.150227091857083e-09, "loss": 0.0329, "step": 4418 }, { "epoch": 4.963077355047031, "grad_norm": 0.3715468555033886, "learning_rate": 2.9564238427237657e-09, "loss": 0.038, "step": 4419 }, { "epoch": 4.964200477326969, "grad_norm": 0.3372262605486213, "learning_rate": 2.7687718845148538e-09, "loss": 0.0314, "step": 4420 }, { "epoch": 4.965323599606907, "grad_norm": 0.350019697957, "learning_rate": 2.587271332694652e-09, "loss": 0.0347, "step": 4421 }, { "epoch": 4.966446721886846, "grad_norm": 0.3446161586717599, "learning_rate": 2.411922298943825e-09, "loss": 0.0344, "step": 4422 }, { "epoch": 4.967569844166784, "grad_norm": 0.319098731802588, "learning_rate": 2.242724891156067e-09, "loss": 0.0323, "step": 4423 }, { "epoch": 4.968692966446722, "grad_norm": 0.3801202120943219, "learning_rate": 2.079679213439212e-09, "loss": 0.0372, "step": 4424 }, { "epoch": 4.9698160887266605, "grad_norm": 0.33750070670739035, "learning_rate": 1.9227853661174524e-09, "loss": 0.0352, "step": 4425 }, { "epoch": 4.970939211006598, "grad_norm": 0.3519190781169123, "learning_rate": 1.7720434457302315e-09, "loss": 0.0373, "step": 4426 }, { "epoch": 4.972062333286536, "grad_norm": 0.3437633504102528, "learning_rate": 1.6274535450311324e-09, "loss": 0.0325, "step": 4427 }, { "epoch": 4.973185455566475, "grad_norm": 0.3677752690761468, "learning_rate": 1.4890157529856563e-09, "loss": 0.0402, "step": 4428 }, { "epoch": 4.974308577846413, "grad_norm": 0.3683625506073574, "learning_rate": 1.3567301547778856e-09, "loss": 0.037, "step": 4429 }, { "epoch": 4.975431700126351, "grad_norm": 0.3560600240146814, "learning_rate": 1.230596831804931e-09, "loss": 0.0358, "step": 4430 }, { "epoch": 4.976554822406289, "grad_norm": 0.3625253632661801, "learning_rate": 1.1106158616758235e-09, "loss": 0.0366, "step": 4431 }, { "epoch": 4.977677944686228, "grad_norm": 0.3444846000256938, "learning_rate": 9.96787318218173e-10, "loss": 0.0324, "step": 4432 }, { "epoch": 4.978801066966166, "grad_norm": 0.35833357272901073, "learning_rate": 8.891112714726203e-10, "loss": 0.035, "step": 4433 }, { "epoch": 4.979924189246104, "grad_norm": 0.3425329647959898, "learning_rate": 7.875877876906135e-10, "loss": 0.0331, "step": 4434 }, { "epoch": 4.9810473115260425, "grad_norm": 0.35679200669540573, "learning_rate": 6.922169293421821e-10, "loss": 0.0364, "step": 4435 }, { "epoch": 4.9821704338059805, "grad_norm": 0.3259794017034792, "learning_rate": 6.029987551103844e-10, "loss": 0.0319, "step": 4436 }, { "epoch": 4.983293556085918, "grad_norm": 0.3480641727719168, "learning_rate": 5.199333198924183e-10, "loss": 0.0336, "step": 4437 }, { "epoch": 4.984416678365857, "grad_norm": 0.348847347957358, "learning_rate": 4.4302067479851105e-10, "loss": 0.0321, "step": 4438 }, { "epoch": 4.985539800645795, "grad_norm": 0.3783464710272088, "learning_rate": 3.7226086715413945e-10, "loss": 0.0381, "step": 4439 }, { "epoch": 4.986662922925733, "grad_norm": 0.35344733158741803, "learning_rate": 3.076539404989198e-10, "loss": 0.0309, "step": 4440 }, { "epoch": 4.987786045205672, "grad_norm": 0.33442902082308484, "learning_rate": 2.4919993458549783e-10, "loss": 0.0343, "step": 4441 }, { "epoch": 4.98890916748561, "grad_norm": 0.35360484233382744, "learning_rate": 1.968988853806586e-10, "loss": 0.036, "step": 4442 }, { "epoch": 4.990032289765548, "grad_norm": 0.3421628593665695, "learning_rate": 1.5075082506865734e-10, "loss": 0.0356, "step": 4443 }, { "epoch": 4.991155412045487, "grad_norm": 0.33672336609006215, "learning_rate": 1.1075578204233772e-10, "loss": 0.0337, "step": 4444 }, { "epoch": 4.992278534325425, "grad_norm": 0.3641808930391885, "learning_rate": 7.691378091090329e-11, "loss": 0.0328, "step": 4445 }, { "epoch": 4.9934016566053625, "grad_norm": 0.32873267052415456, "learning_rate": 4.9224842499917546e-11, "loss": 0.0315, "step": 4446 }, { "epoch": 4.994524778885301, "grad_norm": 0.36446284662411277, "learning_rate": 2.768898384464258e-11, "loss": 0.035, "step": 4447 }, { "epoch": 4.995647901165239, "grad_norm": 0.33359437371199696, "learning_rate": 1.2306218196700414e-11, "loss": 0.031, "step": 4448 }, { "epoch": 4.996771023445177, "grad_norm": 0.3510829003384089, "learning_rate": 3.0765550229627793e-12, "loss": 0.0348, "step": 4449 }, { "epoch": 4.997894145725116, "grad_norm": 0.3451047076893161, "learning_rate": 0.0, "loss": 0.0326, "step": 4450 }, { "epoch": 4.997894145725116, "step": 4450, "total_flos": 2.0283773111783916e+18, "train_loss": 0.1949592823937033, "train_runtime": 25560.7624, "train_samples_per_second": 22.291, "train_steps_per_second": 0.174 } ], "logging_steps": 1.0, "max_steps": 4450, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0283773111783916e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }