diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31192 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.997894145725116, + "eval_steps": 500, + "global_step": 4450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011231222799382282, + "grad_norm": 6.350573723690268, + "learning_rate": 4.494382022471911e-08, + "loss": 0.7656, + "step": 1 + }, + { + "epoch": 0.0022462445598764565, + "grad_norm": 6.396429921824654, + "learning_rate": 8.988764044943822e-08, + "loss": 0.8046, + "step": 2 + }, + { + "epoch": 0.0033693668398146847, + "grad_norm": 6.515479866330877, + "learning_rate": 1.348314606741573e-07, + "loss": 0.7993, + "step": 3 + }, + { + "epoch": 0.004492489119752913, + "grad_norm": 6.264174843252429, + "learning_rate": 1.7977528089887644e-07, + "loss": 0.8049, + "step": 4 + }, + { + "epoch": 0.005615611399691142, + "grad_norm": 6.57185846446011, + "learning_rate": 2.247191011235955e-07, + "loss": 0.826, + "step": 5 + }, + { + "epoch": 0.0067387336796293695, + "grad_norm": 6.517583190885418, + "learning_rate": 2.696629213483146e-07, + "loss": 0.803, + "step": 6 + }, + { + "epoch": 0.007861855959567598, + "grad_norm": 6.364802286536463, + "learning_rate": 3.1460674157303374e-07, + "loss": 0.7837, + "step": 7 + }, + { + "epoch": 0.008984978239505826, + "grad_norm": 6.365458774711909, + "learning_rate": 3.5955056179775287e-07, + "loss": 0.7702, + "step": 8 + }, + { + "epoch": 0.010108100519444054, + "grad_norm": 6.254849998413217, + "learning_rate": 4.044943820224719e-07, + "loss": 0.7543, + "step": 9 + }, + { + "epoch": 0.011231222799382283, + "grad_norm": 6.296540752960891, + "learning_rate": 4.49438202247191e-07, + "loss": 0.7835, + "step": 10 + }, + { + "epoch": 0.012354345079320511, + "grad_norm": 5.708911549789346, + "learning_rate": 4.943820224719102e-07, + "loss": 0.764, + "step": 11 + }, + { + "epoch": 0.013477467359258739, + "grad_norm": 5.900819796376942, + "learning_rate": 5.393258426966292e-07, + "loss": 0.76, + "step": 12 + }, + { + "epoch": 0.014600589639196967, + "grad_norm": 5.807726787883733, + "learning_rate": 5.842696629213484e-07, + "loss": 0.7686, + "step": 13 + }, + { + "epoch": 0.015723711919135196, + "grad_norm": 5.526670510027518, + "learning_rate": 6.292134831460675e-07, + "loss": 0.7631, + "step": 14 + }, + { + "epoch": 0.016846834199073422, + "grad_norm": 4.559877051017468, + "learning_rate": 6.741573033707865e-07, + "loss": 0.684, + "step": 15 + }, + { + "epoch": 0.017969956479011652, + "grad_norm": 4.475977243208917, + "learning_rate": 7.191011235955057e-07, + "loss": 0.7226, + "step": 16 + }, + { + "epoch": 0.01909307875894988, + "grad_norm": 4.295823217269364, + "learning_rate": 7.640449438202248e-07, + "loss": 0.7131, + "step": 17 + }, + { + "epoch": 0.020216201038888108, + "grad_norm": 4.129352981067178, + "learning_rate": 8.089887640449438e-07, + "loss": 0.6764, + "step": 18 + }, + { + "epoch": 0.021339323318826337, + "grad_norm": 4.136041569248625, + "learning_rate": 8.53932584269663e-07, + "loss": 0.6996, + "step": 19 + }, + { + "epoch": 0.022462445598764567, + "grad_norm": 3.127126410742847, + "learning_rate": 8.98876404494382e-07, + "loss": 0.6585, + "step": 20 + }, + { + "epoch": 0.023585567878702793, + "grad_norm": 2.7983535971516136, + "learning_rate": 9.438202247191013e-07, + "loss": 0.6476, + "step": 21 + }, + { + "epoch": 0.024708690158641022, + "grad_norm": 2.8307801771170498, + "learning_rate": 9.887640449438204e-07, + "loss": 0.6081, + "step": 22 + }, + { + "epoch": 0.025831812438579252, + "grad_norm": 2.6693325996833703, + "learning_rate": 1.0337078651685394e-06, + "loss": 0.6015, + "step": 23 + }, + { + "epoch": 0.026954934718517478, + "grad_norm": 2.5249433529837058, + "learning_rate": 1.0786516853932585e-06, + "loss": 0.5871, + "step": 24 + }, + { + "epoch": 0.028078056998455708, + "grad_norm": 2.7833384535564893, + "learning_rate": 1.1235955056179777e-06, + "loss": 0.644, + "step": 25 + }, + { + "epoch": 0.029201179278393934, + "grad_norm": 2.5636212715816846, + "learning_rate": 1.1685393258426967e-06, + "loss": 0.6212, + "step": 26 + }, + { + "epoch": 0.030324301558332163, + "grad_norm": 2.13400734054636, + "learning_rate": 1.2134831460674157e-06, + "loss": 0.6061, + "step": 27 + }, + { + "epoch": 0.03144742383827039, + "grad_norm": 1.8020788979606925, + "learning_rate": 1.258426966292135e-06, + "loss": 0.5786, + "step": 28 + }, + { + "epoch": 0.03257054611820862, + "grad_norm": 1.7251113036312054, + "learning_rate": 1.303370786516854e-06, + "loss": 0.519, + "step": 29 + }, + { + "epoch": 0.033693668398146845, + "grad_norm": 1.7845108367392837, + "learning_rate": 1.348314606741573e-06, + "loss": 0.5688, + "step": 30 + }, + { + "epoch": 0.034816790678085074, + "grad_norm": 1.5519488204108933, + "learning_rate": 1.3932584269662923e-06, + "loss": 0.5282, + "step": 31 + }, + { + "epoch": 0.035939912958023304, + "grad_norm": 1.4564929135509108, + "learning_rate": 1.4382022471910115e-06, + "loss": 0.5378, + "step": 32 + }, + { + "epoch": 0.037063035237961534, + "grad_norm": 1.5083545600301442, + "learning_rate": 1.4831460674157305e-06, + "loss": 0.5256, + "step": 33 + }, + { + "epoch": 0.03818615751789976, + "grad_norm": 1.2973997209272992, + "learning_rate": 1.5280898876404495e-06, + "loss": 0.5157, + "step": 34 + }, + { + "epoch": 0.03930927979783799, + "grad_norm": 1.2442177307671032, + "learning_rate": 1.5730337078651686e-06, + "loss": 0.5334, + "step": 35 + }, + { + "epoch": 0.040432402077776215, + "grad_norm": 1.1214494431535786, + "learning_rate": 1.6179775280898876e-06, + "loss": 0.5457, + "step": 36 + }, + { + "epoch": 0.041555524357714445, + "grad_norm": 1.0380116256523142, + "learning_rate": 1.662921348314607e-06, + "loss": 0.5252, + "step": 37 + }, + { + "epoch": 0.042678646637652674, + "grad_norm": 0.9512314468973523, + "learning_rate": 1.707865168539326e-06, + "loss": 0.4822, + "step": 38 + }, + { + "epoch": 0.043801768917590904, + "grad_norm": 1.1749256602840057, + "learning_rate": 1.752808988764045e-06, + "loss": 0.4876, + "step": 39 + }, + { + "epoch": 0.04492489119752913, + "grad_norm": 1.1996241804899646, + "learning_rate": 1.797752808988764e-06, + "loss": 0.5222, + "step": 40 + }, + { + "epoch": 0.046048013477467356, + "grad_norm": 1.0789479072322812, + "learning_rate": 1.8426966292134831e-06, + "loss": 0.5101, + "step": 41 + }, + { + "epoch": 0.047171135757405586, + "grad_norm": 1.0512996176581257, + "learning_rate": 1.8876404494382026e-06, + "loss": 0.5055, + "step": 42 + }, + { + "epoch": 0.048294258037343815, + "grad_norm": 0.9746980947816224, + "learning_rate": 1.9325842696629214e-06, + "loss": 0.504, + "step": 43 + }, + { + "epoch": 0.049417380317282045, + "grad_norm": 1.0580329624783904, + "learning_rate": 1.977528089887641e-06, + "loss": 0.5067, + "step": 44 + }, + { + "epoch": 0.050540502597220274, + "grad_norm": 0.9449814871215534, + "learning_rate": 2.02247191011236e-06, + "loss": 0.4902, + "step": 45 + }, + { + "epoch": 0.051663624877158504, + "grad_norm": 0.8168500336470494, + "learning_rate": 2.067415730337079e-06, + "loss": 0.4573, + "step": 46 + }, + { + "epoch": 0.052786747157096726, + "grad_norm": 0.8804112383078366, + "learning_rate": 2.112359550561798e-06, + "loss": 0.506, + "step": 47 + }, + { + "epoch": 0.053909869437034956, + "grad_norm": 0.7359559115946035, + "learning_rate": 2.157303370786517e-06, + "loss": 0.4451, + "step": 48 + }, + { + "epoch": 0.055032991716973186, + "grad_norm": 0.902331110525257, + "learning_rate": 2.202247191011236e-06, + "loss": 0.4927, + "step": 49 + }, + { + "epoch": 0.056156113996911415, + "grad_norm": 0.7375774424153347, + "learning_rate": 2.2471910112359554e-06, + "loss": 0.4777, + "step": 50 + }, + { + "epoch": 0.057279236276849645, + "grad_norm": 0.7302558001904716, + "learning_rate": 2.2921348314606744e-06, + "loss": 0.4696, + "step": 51 + }, + { + "epoch": 0.05840235855678787, + "grad_norm": 0.7169095239110965, + "learning_rate": 2.3370786516853934e-06, + "loss": 0.4555, + "step": 52 + }, + { + "epoch": 0.0595254808367261, + "grad_norm": 0.6897677482032446, + "learning_rate": 2.3820224719101125e-06, + "loss": 0.4401, + "step": 53 + }, + { + "epoch": 0.060648603116664326, + "grad_norm": 0.661989923280511, + "learning_rate": 2.4269662921348315e-06, + "loss": 0.4453, + "step": 54 + }, + { + "epoch": 0.061771725396602556, + "grad_norm": 0.6769101351556205, + "learning_rate": 2.4719101123595505e-06, + "loss": 0.4402, + "step": 55 + }, + { + "epoch": 0.06289484767654079, + "grad_norm": 0.6812241363735265, + "learning_rate": 2.51685393258427e-06, + "loss": 0.4645, + "step": 56 + }, + { + "epoch": 0.06401796995647902, + "grad_norm": 0.6757451847169096, + "learning_rate": 2.561797752808989e-06, + "loss": 0.4535, + "step": 57 + }, + { + "epoch": 0.06514109223641724, + "grad_norm": 0.6326256743817255, + "learning_rate": 2.606741573033708e-06, + "loss": 0.4194, + "step": 58 + }, + { + "epoch": 0.06626421451635547, + "grad_norm": 0.6800176400190888, + "learning_rate": 2.6516853932584274e-06, + "loss": 0.4541, + "step": 59 + }, + { + "epoch": 0.06738733679629369, + "grad_norm": 0.7164423610987447, + "learning_rate": 2.696629213483146e-06, + "loss": 0.4497, + "step": 60 + }, + { + "epoch": 0.06851045907623192, + "grad_norm": 0.7413520702346671, + "learning_rate": 2.7415730337078655e-06, + "loss": 0.4753, + "step": 61 + }, + { + "epoch": 0.06963358135617015, + "grad_norm": 0.7016085047019666, + "learning_rate": 2.7865168539325845e-06, + "loss": 0.4436, + "step": 62 + }, + { + "epoch": 0.07075670363610838, + "grad_norm": 0.6265784239939073, + "learning_rate": 2.8314606741573035e-06, + "loss": 0.4552, + "step": 63 + }, + { + "epoch": 0.07187982591604661, + "grad_norm": 0.674311288104449, + "learning_rate": 2.876404494382023e-06, + "loss": 0.458, + "step": 64 + }, + { + "epoch": 0.07300294819598484, + "grad_norm": 0.6446161293508706, + "learning_rate": 2.9213483146067416e-06, + "loss": 0.4525, + "step": 65 + }, + { + "epoch": 0.07412607047592307, + "grad_norm": 0.6444476566168919, + "learning_rate": 2.966292134831461e-06, + "loss": 0.4184, + "step": 66 + }, + { + "epoch": 0.0752491927558613, + "grad_norm": 0.6233096607639449, + "learning_rate": 3.0112359550561796e-06, + "loss": 0.457, + "step": 67 + }, + { + "epoch": 0.07637231503579953, + "grad_norm": 0.6648165469874694, + "learning_rate": 3.056179775280899e-06, + "loss": 0.4466, + "step": 68 + }, + { + "epoch": 0.07749543731573776, + "grad_norm": 0.6486022510349203, + "learning_rate": 3.1011235955056185e-06, + "loss": 0.4209, + "step": 69 + }, + { + "epoch": 0.07861855959567599, + "grad_norm": 0.6229828143529706, + "learning_rate": 3.146067415730337e-06, + "loss": 0.432, + "step": 70 + }, + { + "epoch": 0.0797416818756142, + "grad_norm": 0.5873293721571403, + "learning_rate": 3.1910112359550566e-06, + "loss": 0.4333, + "step": 71 + }, + { + "epoch": 0.08086480415555243, + "grad_norm": 0.6550686084928332, + "learning_rate": 3.235955056179775e-06, + "loss": 0.4342, + "step": 72 + }, + { + "epoch": 0.08198792643549066, + "grad_norm": 0.687632103391066, + "learning_rate": 3.2808988764044946e-06, + "loss": 0.4524, + "step": 73 + }, + { + "epoch": 0.08311104871542889, + "grad_norm": 0.5997642028480324, + "learning_rate": 3.325842696629214e-06, + "loss": 0.4151, + "step": 74 + }, + { + "epoch": 0.08423417099536712, + "grad_norm": 0.6141443910145349, + "learning_rate": 3.3707865168539327e-06, + "loss": 0.4107, + "step": 75 + }, + { + "epoch": 0.08535729327530535, + "grad_norm": 0.6086934688843036, + "learning_rate": 3.415730337078652e-06, + "loss": 0.4464, + "step": 76 + }, + { + "epoch": 0.08648041555524358, + "grad_norm": 0.6547734824010637, + "learning_rate": 3.4606741573033707e-06, + "loss": 0.4325, + "step": 77 + }, + { + "epoch": 0.08760353783518181, + "grad_norm": 0.6816992889881991, + "learning_rate": 3.50561797752809e-06, + "loss": 0.4632, + "step": 78 + }, + { + "epoch": 0.08872666011512004, + "grad_norm": 0.6858188783737146, + "learning_rate": 3.5505617977528096e-06, + "loss": 0.4527, + "step": 79 + }, + { + "epoch": 0.08984978239505827, + "grad_norm": 0.709230605422352, + "learning_rate": 3.595505617977528e-06, + "loss": 0.4385, + "step": 80 + }, + { + "epoch": 0.0909729046749965, + "grad_norm": 0.6185609555383293, + "learning_rate": 3.6404494382022476e-06, + "loss": 0.4295, + "step": 81 + }, + { + "epoch": 0.09209602695493471, + "grad_norm": 0.5593705773746459, + "learning_rate": 3.6853932584269662e-06, + "loss": 0.4107, + "step": 82 + }, + { + "epoch": 0.09321914923487294, + "grad_norm": 0.6449522343921041, + "learning_rate": 3.7303370786516857e-06, + "loss": 0.4297, + "step": 83 + }, + { + "epoch": 0.09434227151481117, + "grad_norm": 0.6162415569253238, + "learning_rate": 3.775280898876405e-06, + "loss": 0.3956, + "step": 84 + }, + { + "epoch": 0.0954653937947494, + "grad_norm": 0.6285861854737811, + "learning_rate": 3.820224719101124e-06, + "loss": 0.4209, + "step": 85 + }, + { + "epoch": 0.09658851607468763, + "grad_norm": 0.6005391526643882, + "learning_rate": 3.865168539325843e-06, + "loss": 0.4344, + "step": 86 + }, + { + "epoch": 0.09771163835462586, + "grad_norm": 0.6037551054743073, + "learning_rate": 3.910112359550562e-06, + "loss": 0.4238, + "step": 87 + }, + { + "epoch": 0.09883476063456409, + "grad_norm": 0.6317561271143405, + "learning_rate": 3.955056179775282e-06, + "loss": 0.4313, + "step": 88 + }, + { + "epoch": 0.09995788291450232, + "grad_norm": 0.6740920615829088, + "learning_rate": 4.000000000000001e-06, + "loss": 0.4228, + "step": 89 + }, + { + "epoch": 0.10108100519444055, + "grad_norm": 0.6163555222810396, + "learning_rate": 4.04494382022472e-06, + "loss": 0.4297, + "step": 90 + }, + { + "epoch": 0.10220412747437878, + "grad_norm": 0.5924398711122163, + "learning_rate": 4.089887640449439e-06, + "loss": 0.413, + "step": 91 + }, + { + "epoch": 0.10332724975431701, + "grad_norm": 0.6450622353301865, + "learning_rate": 4.134831460674158e-06, + "loss": 0.4055, + "step": 92 + }, + { + "epoch": 0.10445037203425522, + "grad_norm": 0.5951767882437019, + "learning_rate": 4.179775280898877e-06, + "loss": 0.4151, + "step": 93 + }, + { + "epoch": 0.10557349431419345, + "grad_norm": 0.5644268389534823, + "learning_rate": 4.224719101123596e-06, + "loss": 0.3983, + "step": 94 + }, + { + "epoch": 0.10669661659413168, + "grad_norm": 0.6348084088963297, + "learning_rate": 4.269662921348315e-06, + "loss": 0.4071, + "step": 95 + }, + { + "epoch": 0.10781973887406991, + "grad_norm": 0.5862325727673693, + "learning_rate": 4.314606741573034e-06, + "loss": 0.43, + "step": 96 + }, + { + "epoch": 0.10894286115400814, + "grad_norm": 0.6202803723418159, + "learning_rate": 4.359550561797753e-06, + "loss": 0.4189, + "step": 97 + }, + { + "epoch": 0.11006598343394637, + "grad_norm": 0.5944376188372148, + "learning_rate": 4.404494382022472e-06, + "loss": 0.4106, + "step": 98 + }, + { + "epoch": 0.1111891057138846, + "grad_norm": 0.6071370637839878, + "learning_rate": 4.449438202247192e-06, + "loss": 0.4168, + "step": 99 + }, + { + "epoch": 0.11231222799382283, + "grad_norm": 0.6135945175323315, + "learning_rate": 4.494382022471911e-06, + "loss": 0.4011, + "step": 100 + }, + { + "epoch": 0.11343535027376106, + "grad_norm": 0.5901989509886602, + "learning_rate": 4.53932584269663e-06, + "loss": 0.4226, + "step": 101 + }, + { + "epoch": 0.11455847255369929, + "grad_norm": 0.5719385399517765, + "learning_rate": 4.584269662921349e-06, + "loss": 0.4207, + "step": 102 + }, + { + "epoch": 0.11568159483363752, + "grad_norm": 0.6073755395198991, + "learning_rate": 4.629213483146068e-06, + "loss": 0.4263, + "step": 103 + }, + { + "epoch": 0.11680471711357573, + "grad_norm": 0.6019653421562294, + "learning_rate": 4.674157303370787e-06, + "loss": 0.4149, + "step": 104 + }, + { + "epoch": 0.11792783939351396, + "grad_norm": 0.6153507547879732, + "learning_rate": 4.719101123595506e-06, + "loss": 0.4089, + "step": 105 + }, + { + "epoch": 0.1190509616734522, + "grad_norm": 0.5829914630184178, + "learning_rate": 4.764044943820225e-06, + "loss": 0.4172, + "step": 106 + }, + { + "epoch": 0.12017408395339042, + "grad_norm": 0.6272470387655589, + "learning_rate": 4.808988764044944e-06, + "loss": 0.4265, + "step": 107 + }, + { + "epoch": 0.12129720623332865, + "grad_norm": 0.6284052319460708, + "learning_rate": 4.853932584269663e-06, + "loss": 0.4422, + "step": 108 + }, + { + "epoch": 0.12242032851326688, + "grad_norm": 0.6197638566033578, + "learning_rate": 4.898876404494383e-06, + "loss": 0.4058, + "step": 109 + }, + { + "epoch": 0.12354345079320511, + "grad_norm": 0.6033230010642877, + "learning_rate": 4.943820224719101e-06, + "loss": 0.4245, + "step": 110 + }, + { + "epoch": 0.12466657307314334, + "grad_norm": 0.630498552024084, + "learning_rate": 4.988764044943821e-06, + "loss": 0.3942, + "step": 111 + }, + { + "epoch": 0.12578969535308157, + "grad_norm": 0.6493542531086268, + "learning_rate": 5.03370786516854e-06, + "loss": 0.4363, + "step": 112 + }, + { + "epoch": 0.1269128176330198, + "grad_norm": 0.6237798245035485, + "learning_rate": 5.078651685393259e-06, + "loss": 0.4159, + "step": 113 + }, + { + "epoch": 0.12803593991295803, + "grad_norm": 0.6597150507909345, + "learning_rate": 5.123595505617978e-06, + "loss": 0.4351, + "step": 114 + }, + { + "epoch": 0.12915906219289625, + "grad_norm": 0.5980066757551353, + "learning_rate": 5.168539325842698e-06, + "loss": 0.389, + "step": 115 + }, + { + "epoch": 0.1302821844728345, + "grad_norm": 0.6404924520081225, + "learning_rate": 5.213483146067416e-06, + "loss": 0.4229, + "step": 116 + }, + { + "epoch": 0.1314053067527727, + "grad_norm": 0.6144448252232805, + "learning_rate": 5.258426966292135e-06, + "loss": 0.4096, + "step": 117 + }, + { + "epoch": 0.13252842903271095, + "grad_norm": 0.638697930023902, + "learning_rate": 5.303370786516855e-06, + "loss": 0.4296, + "step": 118 + }, + { + "epoch": 0.13365155131264916, + "grad_norm": 0.6304024554599722, + "learning_rate": 5.348314606741574e-06, + "loss": 0.4136, + "step": 119 + }, + { + "epoch": 0.13477467359258738, + "grad_norm": 0.6253982419034039, + "learning_rate": 5.393258426966292e-06, + "loss": 0.4171, + "step": 120 + }, + { + "epoch": 0.13589779587252562, + "grad_norm": 0.5881906390342778, + "learning_rate": 5.438202247191011e-06, + "loss": 0.4079, + "step": 121 + }, + { + "epoch": 0.13702091815246384, + "grad_norm": 0.6940061067312768, + "learning_rate": 5.483146067415731e-06, + "loss": 0.42, + "step": 122 + }, + { + "epoch": 0.13814404043240208, + "grad_norm": 0.620303869723247, + "learning_rate": 5.52808988764045e-06, + "loss": 0.4058, + "step": 123 + }, + { + "epoch": 0.1392671627123403, + "grad_norm": 0.5917642817410118, + "learning_rate": 5.573033707865169e-06, + "loss": 0.4048, + "step": 124 + }, + { + "epoch": 0.14039028499227854, + "grad_norm": 0.578694471042719, + "learning_rate": 5.617977528089889e-06, + "loss": 0.3975, + "step": 125 + }, + { + "epoch": 0.14151340727221676, + "grad_norm": 0.6356057703453721, + "learning_rate": 5.662921348314607e-06, + "loss": 0.4232, + "step": 126 + }, + { + "epoch": 0.142636529552155, + "grad_norm": 0.63035695832545, + "learning_rate": 5.707865168539326e-06, + "loss": 0.4061, + "step": 127 + }, + { + "epoch": 0.14375965183209322, + "grad_norm": 0.5901449106195371, + "learning_rate": 5.752808988764046e-06, + "loss": 0.394, + "step": 128 + }, + { + "epoch": 0.14488277411203146, + "grad_norm": 0.6552219033314495, + "learning_rate": 5.797752808988765e-06, + "loss": 0.4228, + "step": 129 + }, + { + "epoch": 0.14600589639196968, + "grad_norm": 0.5897358290667347, + "learning_rate": 5.842696629213483e-06, + "loss": 0.4097, + "step": 130 + }, + { + "epoch": 0.1471290186719079, + "grad_norm": 0.6506458224613755, + "learning_rate": 5.887640449438202e-06, + "loss": 0.4193, + "step": 131 + }, + { + "epoch": 0.14825214095184613, + "grad_norm": 0.5884872839199834, + "learning_rate": 5.932584269662922e-06, + "loss": 0.3958, + "step": 132 + }, + { + "epoch": 0.14937526323178435, + "grad_norm": 0.6383694194701096, + "learning_rate": 5.977528089887641e-06, + "loss": 0.4166, + "step": 133 + }, + { + "epoch": 0.1504983855117226, + "grad_norm": 0.5988685305190284, + "learning_rate": 6.022471910112359e-06, + "loss": 0.3987, + "step": 134 + }, + { + "epoch": 0.1516215077916608, + "grad_norm": 0.6008483348458779, + "learning_rate": 6.06741573033708e-06, + "loss": 0.4054, + "step": 135 + }, + { + "epoch": 0.15274463007159905, + "grad_norm": 0.6371268436916242, + "learning_rate": 6.112359550561798e-06, + "loss": 0.415, + "step": 136 + }, + { + "epoch": 0.15386775235153727, + "grad_norm": 0.5873165979042162, + "learning_rate": 6.157303370786517e-06, + "loss": 0.3788, + "step": 137 + }, + { + "epoch": 0.1549908746314755, + "grad_norm": 0.6321649406199696, + "learning_rate": 6.202247191011237e-06, + "loss": 0.3874, + "step": 138 + }, + { + "epoch": 0.15611399691141373, + "grad_norm": 0.5905156513112094, + "learning_rate": 6.247191011235956e-06, + "loss": 0.3865, + "step": 139 + }, + { + "epoch": 0.15723711919135197, + "grad_norm": 0.6605429195870949, + "learning_rate": 6.292134831460674e-06, + "loss": 0.4218, + "step": 140 + }, + { + "epoch": 0.1583602414712902, + "grad_norm": 0.6197445457794568, + "learning_rate": 6.337078651685393e-06, + "loss": 0.3986, + "step": 141 + }, + { + "epoch": 0.1594833637512284, + "grad_norm": 0.6510054129028581, + "learning_rate": 6.382022471910113e-06, + "loss": 0.4126, + "step": 142 + }, + { + "epoch": 0.16060648603116665, + "grad_norm": 0.6637714327455877, + "learning_rate": 6.426966292134832e-06, + "loss": 0.4105, + "step": 143 + }, + { + "epoch": 0.16172960831110486, + "grad_norm": 0.6320653980077073, + "learning_rate": 6.47191011235955e-06, + "loss": 0.3876, + "step": 144 + }, + { + "epoch": 0.1628527305910431, + "grad_norm": 0.6224333021729421, + "learning_rate": 6.51685393258427e-06, + "loss": 0.3975, + "step": 145 + }, + { + "epoch": 0.16397585287098132, + "grad_norm": 0.6523728785937435, + "learning_rate": 6.561797752808989e-06, + "loss": 0.4088, + "step": 146 + }, + { + "epoch": 0.16509897515091956, + "grad_norm": 0.6399877537131693, + "learning_rate": 6.606741573033708e-06, + "loss": 0.4243, + "step": 147 + }, + { + "epoch": 0.16622209743085778, + "grad_norm": 0.6785871204181669, + "learning_rate": 6.651685393258428e-06, + "loss": 0.4118, + "step": 148 + }, + { + "epoch": 0.16734521971079602, + "grad_norm": 0.6793168925384406, + "learning_rate": 6.696629213483147e-06, + "loss": 0.3843, + "step": 149 + }, + { + "epoch": 0.16846834199073424, + "grad_norm": 0.6765524645653702, + "learning_rate": 6.741573033707865e-06, + "loss": 0.3929, + "step": 150 + }, + { + "epoch": 0.16959146427067248, + "grad_norm": 0.6500442496922796, + "learning_rate": 6.786516853932584e-06, + "loss": 0.4157, + "step": 151 + }, + { + "epoch": 0.1707145865506107, + "grad_norm": 0.6278514376034972, + "learning_rate": 6.831460674157304e-06, + "loss": 0.384, + "step": 152 + }, + { + "epoch": 0.1718377088305489, + "grad_norm": 0.6786012740240396, + "learning_rate": 6.876404494382023e-06, + "loss": 0.4213, + "step": 153 + }, + { + "epoch": 0.17296083111048716, + "grad_norm": 0.6103427664078407, + "learning_rate": 6.921348314606741e-06, + "loss": 0.3827, + "step": 154 + }, + { + "epoch": 0.17408395339042537, + "grad_norm": 0.6373236421700184, + "learning_rate": 6.966292134831461e-06, + "loss": 0.4021, + "step": 155 + }, + { + "epoch": 0.17520707567036362, + "grad_norm": 0.6527172283097773, + "learning_rate": 7.01123595505618e-06, + "loss": 0.415, + "step": 156 + }, + { + "epoch": 0.17633019795030183, + "grad_norm": 0.6299621366906155, + "learning_rate": 7.056179775280899e-06, + "loss": 0.3922, + "step": 157 + }, + { + "epoch": 0.17745332023024007, + "grad_norm": 0.6205260967234852, + "learning_rate": 7.101123595505619e-06, + "loss": 0.404, + "step": 158 + }, + { + "epoch": 0.1785764425101783, + "grad_norm": 0.6314521969215471, + "learning_rate": 7.146067415730338e-06, + "loss": 0.4, + "step": 159 + }, + { + "epoch": 0.17969956479011653, + "grad_norm": 0.5826299033616584, + "learning_rate": 7.191011235955056e-06, + "loss": 0.3936, + "step": 160 + }, + { + "epoch": 0.18082268707005475, + "grad_norm": 0.6340335724127424, + "learning_rate": 7.235955056179775e-06, + "loss": 0.4234, + "step": 161 + }, + { + "epoch": 0.181945809349993, + "grad_norm": 0.632801507097347, + "learning_rate": 7.280898876404495e-06, + "loss": 0.4098, + "step": 162 + }, + { + "epoch": 0.1830689316299312, + "grad_norm": 0.6446077682121513, + "learning_rate": 7.325842696629214e-06, + "loss": 0.3846, + "step": 163 + }, + { + "epoch": 0.18419205390986942, + "grad_norm": 0.5774497489736088, + "learning_rate": 7.3707865168539325e-06, + "loss": 0.3763, + "step": 164 + }, + { + "epoch": 0.18531517618980767, + "grad_norm": 0.6295990680542162, + "learning_rate": 7.415730337078652e-06, + "loss": 0.4073, + "step": 165 + }, + { + "epoch": 0.18643829846974588, + "grad_norm": 0.6085108426525031, + "learning_rate": 7.460674157303371e-06, + "loss": 0.4139, + "step": 166 + }, + { + "epoch": 0.18756142074968413, + "grad_norm": 0.62992988248332, + "learning_rate": 7.50561797752809e-06, + "loss": 0.4138, + "step": 167 + }, + { + "epoch": 0.18868454302962234, + "grad_norm": 0.5836300572336676, + "learning_rate": 7.55056179775281e-06, + "loss": 0.3736, + "step": 168 + }, + { + "epoch": 0.18980766530956059, + "grad_norm": 0.6265603677231923, + "learning_rate": 7.5955056179775284e-06, + "loss": 0.375, + "step": 169 + }, + { + "epoch": 0.1909307875894988, + "grad_norm": 0.5870206417930635, + "learning_rate": 7.640449438202247e-06, + "loss": 0.4102, + "step": 170 + }, + { + "epoch": 0.19205390986943705, + "grad_norm": 0.59819579045645, + "learning_rate": 7.685393258426966e-06, + "loss": 0.4121, + "step": 171 + }, + { + "epoch": 0.19317703214937526, + "grad_norm": 0.6122535374224991, + "learning_rate": 7.730337078651686e-06, + "loss": 0.3967, + "step": 172 + }, + { + "epoch": 0.1943001544293135, + "grad_norm": 0.5721725402345272, + "learning_rate": 7.775280898876405e-06, + "loss": 0.4097, + "step": 173 + }, + { + "epoch": 0.19542327670925172, + "grad_norm": 0.5792461141723496, + "learning_rate": 7.820224719101124e-06, + "loss": 0.396, + "step": 174 + }, + { + "epoch": 0.19654639898918994, + "grad_norm": 0.5566347774031598, + "learning_rate": 7.865168539325843e-06, + "loss": 0.3748, + "step": 175 + }, + { + "epoch": 0.19766952126912818, + "grad_norm": 0.5947888148668254, + "learning_rate": 7.910112359550563e-06, + "loss": 0.3984, + "step": 176 + }, + { + "epoch": 0.1987926435490664, + "grad_norm": 0.5729701631007845, + "learning_rate": 7.955056179775281e-06, + "loss": 0.3732, + "step": 177 + }, + { + "epoch": 0.19991576582900464, + "grad_norm": 0.6308612644303433, + "learning_rate": 8.000000000000001e-06, + "loss": 0.3779, + "step": 178 + }, + { + "epoch": 0.20103888810894285, + "grad_norm": 0.602666895171434, + "learning_rate": 8.04494382022472e-06, + "loss": 0.4168, + "step": 179 + }, + { + "epoch": 0.2021620103888811, + "grad_norm": 0.594878749394712, + "learning_rate": 8.08988764044944e-06, + "loss": 0.3801, + "step": 180 + }, + { + "epoch": 0.2032851326688193, + "grad_norm": 0.5819103337590406, + "learning_rate": 8.13483146067416e-06, + "loss": 0.355, + "step": 181 + }, + { + "epoch": 0.20440825494875756, + "grad_norm": 0.6795698806664618, + "learning_rate": 8.179775280898877e-06, + "loss": 0.4105, + "step": 182 + }, + { + "epoch": 0.20553137722869577, + "grad_norm": 0.6176253376211935, + "learning_rate": 8.224719101123596e-06, + "loss": 0.4052, + "step": 183 + }, + { + "epoch": 0.20665449950863402, + "grad_norm": 0.5647204982647233, + "learning_rate": 8.269662921348315e-06, + "loss": 0.4002, + "step": 184 + }, + { + "epoch": 0.20777762178857223, + "grad_norm": 0.630533784994395, + "learning_rate": 8.314606741573035e-06, + "loss": 0.3836, + "step": 185 + }, + { + "epoch": 0.20890074406851045, + "grad_norm": 0.6619715293898263, + "learning_rate": 8.359550561797754e-06, + "loss": 0.4138, + "step": 186 + }, + { + "epoch": 0.2100238663484487, + "grad_norm": 0.5966032860325108, + "learning_rate": 8.404494382022472e-06, + "loss": 0.3896, + "step": 187 + }, + { + "epoch": 0.2111469886283869, + "grad_norm": 0.5681040377904722, + "learning_rate": 8.449438202247192e-06, + "loss": 0.3812, + "step": 188 + }, + { + "epoch": 0.21227011090832515, + "grad_norm": 0.6236783202259868, + "learning_rate": 8.494382022471911e-06, + "loss": 0.3927, + "step": 189 + }, + { + "epoch": 0.21339323318826336, + "grad_norm": 0.7424858758861099, + "learning_rate": 8.53932584269663e-06, + "loss": 0.4452, + "step": 190 + }, + { + "epoch": 0.2145163554682016, + "grad_norm": 0.7125468158261177, + "learning_rate": 8.58426966292135e-06, + "loss": 0.3609, + "step": 191 + }, + { + "epoch": 0.21563947774813982, + "grad_norm": 0.6157709477352911, + "learning_rate": 8.629213483146068e-06, + "loss": 0.4078, + "step": 192 + }, + { + "epoch": 0.21676260002807807, + "grad_norm": 0.6307743638624984, + "learning_rate": 8.674157303370788e-06, + "loss": 0.3683, + "step": 193 + }, + { + "epoch": 0.21788572230801628, + "grad_norm": 0.6873653868391115, + "learning_rate": 8.719101123595506e-06, + "loss": 0.4076, + "step": 194 + }, + { + "epoch": 0.21900884458795453, + "grad_norm": 0.6311262794185428, + "learning_rate": 8.764044943820226e-06, + "loss": 0.3853, + "step": 195 + }, + { + "epoch": 0.22013196686789274, + "grad_norm": 0.6680702186561743, + "learning_rate": 8.808988764044944e-06, + "loss": 0.4135, + "step": 196 + }, + { + "epoch": 0.22125508914783096, + "grad_norm": 0.6058308110971806, + "learning_rate": 8.853932584269664e-06, + "loss": 0.4112, + "step": 197 + }, + { + "epoch": 0.2223782114277692, + "grad_norm": 0.607906340987603, + "learning_rate": 8.898876404494383e-06, + "loss": 0.3979, + "step": 198 + }, + { + "epoch": 0.22350133370770742, + "grad_norm": 0.5987320863883703, + "learning_rate": 8.943820224719102e-06, + "loss": 0.4017, + "step": 199 + }, + { + "epoch": 0.22462445598764566, + "grad_norm": 0.6240143951994302, + "learning_rate": 8.988764044943822e-06, + "loss": 0.3869, + "step": 200 + }, + { + "epoch": 0.22574757826758388, + "grad_norm": 0.6484774713204989, + "learning_rate": 9.033707865168541e-06, + "loss": 0.3682, + "step": 201 + }, + { + "epoch": 0.22687070054752212, + "grad_norm": 0.6398454864125076, + "learning_rate": 9.07865168539326e-06, + "loss": 0.4031, + "step": 202 + }, + { + "epoch": 0.22799382282746034, + "grad_norm": 0.6071698475923116, + "learning_rate": 9.123595505617978e-06, + "loss": 0.4081, + "step": 203 + }, + { + "epoch": 0.22911694510739858, + "grad_norm": 0.5841395417042217, + "learning_rate": 9.168539325842698e-06, + "loss": 0.3842, + "step": 204 + }, + { + "epoch": 0.2302400673873368, + "grad_norm": 0.6674576527002625, + "learning_rate": 9.213483146067417e-06, + "loss": 0.4123, + "step": 205 + }, + { + "epoch": 0.23136318966727504, + "grad_norm": 0.6640606414419046, + "learning_rate": 9.258426966292136e-06, + "loss": 0.3939, + "step": 206 + }, + { + "epoch": 0.23248631194721325, + "grad_norm": 0.7214937941577327, + "learning_rate": 9.303370786516854e-06, + "loss": 0.371, + "step": 207 + }, + { + "epoch": 0.23360943422715147, + "grad_norm": 0.6793273791859282, + "learning_rate": 9.348314606741574e-06, + "loss": 0.3836, + "step": 208 + }, + { + "epoch": 0.2347325565070897, + "grad_norm": 0.6871956158863509, + "learning_rate": 9.393258426966294e-06, + "loss": 0.3932, + "step": 209 + }, + { + "epoch": 0.23585567878702793, + "grad_norm": 0.7258816009143834, + "learning_rate": 9.438202247191012e-06, + "loss": 0.414, + "step": 210 + }, + { + "epoch": 0.23697880106696617, + "grad_norm": 0.6492267333090342, + "learning_rate": 9.483146067415732e-06, + "loss": 0.416, + "step": 211 + }, + { + "epoch": 0.2381019233469044, + "grad_norm": 0.6041513090496686, + "learning_rate": 9.52808988764045e-06, + "loss": 0.3934, + "step": 212 + }, + { + "epoch": 0.23922504562684263, + "grad_norm": 0.6474076777760845, + "learning_rate": 9.57303370786517e-06, + "loss": 0.3975, + "step": 213 + }, + { + "epoch": 0.24034816790678085, + "grad_norm": 0.7092379598633823, + "learning_rate": 9.617977528089888e-06, + "loss": 0.3781, + "step": 214 + }, + { + "epoch": 0.2414712901867191, + "grad_norm": 0.6438002116207439, + "learning_rate": 9.662921348314608e-06, + "loss": 0.3996, + "step": 215 + }, + { + "epoch": 0.2425944124666573, + "grad_norm": 0.6622015252030728, + "learning_rate": 9.707865168539326e-06, + "loss": 0.3832, + "step": 216 + }, + { + "epoch": 0.24371753474659555, + "grad_norm": 0.6078817480395128, + "learning_rate": 9.752808988764046e-06, + "loss": 0.3952, + "step": 217 + }, + { + "epoch": 0.24484065702653376, + "grad_norm": 0.6551657494425536, + "learning_rate": 9.797752808988766e-06, + "loss": 0.3791, + "step": 218 + }, + { + "epoch": 0.24596377930647198, + "grad_norm": 0.6433323266927976, + "learning_rate": 9.842696629213484e-06, + "loss": 0.3774, + "step": 219 + }, + { + "epoch": 0.24708690158641022, + "grad_norm": 0.6188804166540856, + "learning_rate": 9.887640449438202e-06, + "loss": 0.3712, + "step": 220 + }, + { + "epoch": 0.24821002386634844, + "grad_norm": 0.671966975214175, + "learning_rate": 9.932584269662922e-06, + "loss": 0.4239, + "step": 221 + }, + { + "epoch": 0.24933314614628668, + "grad_norm": 0.5947753032122628, + "learning_rate": 9.977528089887642e-06, + "loss": 0.3964, + "step": 222 + }, + { + "epoch": 0.2504562684262249, + "grad_norm": 0.6607682301911262, + "learning_rate": 1.0022471910112362e-05, + "loss": 0.3903, + "step": 223 + }, + { + "epoch": 0.25157939070616314, + "grad_norm": 0.6356386896335997, + "learning_rate": 1.006741573033708e-05, + "loss": 0.4251, + "step": 224 + }, + { + "epoch": 0.25270251298610136, + "grad_norm": 0.6559033997420501, + "learning_rate": 1.01123595505618e-05, + "loss": 0.4137, + "step": 225 + }, + { + "epoch": 0.2538256352660396, + "grad_norm": 0.614989946033004, + "learning_rate": 1.0157303370786518e-05, + "loss": 0.3601, + "step": 226 + }, + { + "epoch": 0.25494875754597784, + "grad_norm": 0.6410991676909698, + "learning_rate": 1.0202247191011236e-05, + "loss": 0.402, + "step": 227 + }, + { + "epoch": 0.25607187982591606, + "grad_norm": 0.642417180353421, + "learning_rate": 1.0247191011235956e-05, + "loss": 0.3877, + "step": 228 + }, + { + "epoch": 0.2571950021058543, + "grad_norm": 0.6125173108674232, + "learning_rate": 1.0292134831460674e-05, + "loss": 0.3669, + "step": 229 + }, + { + "epoch": 0.2583181243857925, + "grad_norm": 0.5685847570038455, + "learning_rate": 1.0337078651685396e-05, + "loss": 0.374, + "step": 230 + }, + { + "epoch": 0.2594412466657307, + "grad_norm": 0.6544675947013324, + "learning_rate": 1.0382022471910114e-05, + "loss": 0.3834, + "step": 231 + }, + { + "epoch": 0.260564368945669, + "grad_norm": 0.6068343322603438, + "learning_rate": 1.0426966292134832e-05, + "loss": 0.3693, + "step": 232 + }, + { + "epoch": 0.2616874912256072, + "grad_norm": 0.6453583897346943, + "learning_rate": 1.0471910112359552e-05, + "loss": 0.4048, + "step": 233 + }, + { + "epoch": 0.2628106135055454, + "grad_norm": 0.7033942331352795, + "learning_rate": 1.051685393258427e-05, + "loss": 0.3875, + "step": 234 + }, + { + "epoch": 0.2639337357854836, + "grad_norm": 0.5715008472023303, + "learning_rate": 1.0561797752808988e-05, + "loss": 0.4069, + "step": 235 + }, + { + "epoch": 0.2650568580654219, + "grad_norm": 0.6494208885445116, + "learning_rate": 1.060674157303371e-05, + "loss": 0.4093, + "step": 236 + }, + { + "epoch": 0.2661799803453601, + "grad_norm": 0.6812214256281384, + "learning_rate": 1.0651685393258428e-05, + "loss": 0.4425, + "step": 237 + }, + { + "epoch": 0.26730310262529833, + "grad_norm": 0.5924772804766913, + "learning_rate": 1.0696629213483148e-05, + "loss": 0.387, + "step": 238 + }, + { + "epoch": 0.26842622490523654, + "grad_norm": 0.5764162625896104, + "learning_rate": 1.0741573033707866e-05, + "loss": 0.3827, + "step": 239 + }, + { + "epoch": 0.26954934718517476, + "grad_norm": 0.5965307495222452, + "learning_rate": 1.0786516853932584e-05, + "loss": 0.4129, + "step": 240 + }, + { + "epoch": 0.27067246946511303, + "grad_norm": 0.6003603754111506, + "learning_rate": 1.0831460674157304e-05, + "loss": 0.3775, + "step": 241 + }, + { + "epoch": 0.27179559174505125, + "grad_norm": 0.5922972797219954, + "learning_rate": 1.0876404494382022e-05, + "loss": 0.3877, + "step": 242 + }, + { + "epoch": 0.27291871402498946, + "grad_norm": 0.5685410001707504, + "learning_rate": 1.0921348314606744e-05, + "loss": 0.377, + "step": 243 + }, + { + "epoch": 0.2740418363049277, + "grad_norm": 0.576628928037405, + "learning_rate": 1.0966292134831462e-05, + "loss": 0.3971, + "step": 244 + }, + { + "epoch": 0.27516495858486595, + "grad_norm": 0.5579453193582883, + "learning_rate": 1.101123595505618e-05, + "loss": 0.378, + "step": 245 + }, + { + "epoch": 0.27628808086480416, + "grad_norm": 0.6011765922681929, + "learning_rate": 1.10561797752809e-05, + "loss": 0.4117, + "step": 246 + }, + { + "epoch": 0.2774112031447424, + "grad_norm": 0.6091646970674486, + "learning_rate": 1.1101123595505618e-05, + "loss": 0.4426, + "step": 247 + }, + { + "epoch": 0.2785343254246806, + "grad_norm": 0.5775758129728008, + "learning_rate": 1.1146067415730338e-05, + "loss": 0.3972, + "step": 248 + }, + { + "epoch": 0.27965744770461887, + "grad_norm": 0.6604411713439484, + "learning_rate": 1.1191011235955056e-05, + "loss": 0.3965, + "step": 249 + }, + { + "epoch": 0.2807805699845571, + "grad_norm": 0.6156558831358852, + "learning_rate": 1.1235955056179778e-05, + "loss": 0.405, + "step": 250 + }, + { + "epoch": 0.2819036922644953, + "grad_norm": 0.5623612665541853, + "learning_rate": 1.1280898876404496e-05, + "loss": 0.3901, + "step": 251 + }, + { + "epoch": 0.2830268145444335, + "grad_norm": 0.6746239022438368, + "learning_rate": 1.1325842696629214e-05, + "loss": 0.3876, + "step": 252 + }, + { + "epoch": 0.28414993682437173, + "grad_norm": 0.5964994154377549, + "learning_rate": 1.1370786516853934e-05, + "loss": 0.4163, + "step": 253 + }, + { + "epoch": 0.28527305910431, + "grad_norm": 0.6082500353591788, + "learning_rate": 1.1415730337078652e-05, + "loss": 0.4286, + "step": 254 + }, + { + "epoch": 0.2863961813842482, + "grad_norm": 0.6157506350996006, + "learning_rate": 1.146067415730337e-05, + "loss": 0.3872, + "step": 255 + }, + { + "epoch": 0.28751930366418643, + "grad_norm": 0.567260212959717, + "learning_rate": 1.1505617977528092e-05, + "loss": 0.4341, + "step": 256 + }, + { + "epoch": 0.28864242594412465, + "grad_norm": 0.5791317852426999, + "learning_rate": 1.155056179775281e-05, + "loss": 0.3499, + "step": 257 + }, + { + "epoch": 0.2897655482240629, + "grad_norm": 0.5741440071104256, + "learning_rate": 1.159550561797753e-05, + "loss": 0.3744, + "step": 258 + }, + { + "epoch": 0.29088867050400113, + "grad_norm": 0.577442355673411, + "learning_rate": 1.1640449438202248e-05, + "loss": 0.3959, + "step": 259 + }, + { + "epoch": 0.29201179278393935, + "grad_norm": 0.5701366764697986, + "learning_rate": 1.1685393258426966e-05, + "loss": 0.3661, + "step": 260 + }, + { + "epoch": 0.29313491506387757, + "grad_norm": 0.5866338939450678, + "learning_rate": 1.1730337078651686e-05, + "loss": 0.3817, + "step": 261 + }, + { + "epoch": 0.2942580373438158, + "grad_norm": 0.5869871652126405, + "learning_rate": 1.1775280898876404e-05, + "loss": 0.4031, + "step": 262 + }, + { + "epoch": 0.29538115962375405, + "grad_norm": 0.6113639662622179, + "learning_rate": 1.1820224719101126e-05, + "loss": 0.3665, + "step": 263 + }, + { + "epoch": 0.29650428190369227, + "grad_norm": 0.5815782372208732, + "learning_rate": 1.1865168539325844e-05, + "loss": 0.3718, + "step": 264 + }, + { + "epoch": 0.2976274041836305, + "grad_norm": 0.5770971481710677, + "learning_rate": 1.1910112359550562e-05, + "loss": 0.3887, + "step": 265 + }, + { + "epoch": 0.2987505264635687, + "grad_norm": 0.6378543358730397, + "learning_rate": 1.1955056179775282e-05, + "loss": 0.4026, + "step": 266 + }, + { + "epoch": 0.29987364874350697, + "grad_norm": 0.6366625582188497, + "learning_rate": 1.2e-05, + "loss": 0.3996, + "step": 267 + }, + { + "epoch": 0.3009967710234452, + "grad_norm": 0.5644318306665338, + "learning_rate": 1.2044943820224718e-05, + "loss": 0.3736, + "step": 268 + }, + { + "epoch": 0.3021198933033834, + "grad_norm": 0.5760984664326271, + "learning_rate": 1.208988764044944e-05, + "loss": 0.3831, + "step": 269 + }, + { + "epoch": 0.3032430155833216, + "grad_norm": 0.6073186836248484, + "learning_rate": 1.213483146067416e-05, + "loss": 0.3767, + "step": 270 + }, + { + "epoch": 0.3043661378632599, + "grad_norm": 0.658052587041447, + "learning_rate": 1.2179775280898878e-05, + "loss": 0.376, + "step": 271 + }, + { + "epoch": 0.3054892601431981, + "grad_norm": 0.5705277159249025, + "learning_rate": 1.2224719101123596e-05, + "loss": 0.4047, + "step": 272 + }, + { + "epoch": 0.3066123824231363, + "grad_norm": 0.5943437710250703, + "learning_rate": 1.2269662921348316e-05, + "loss": 0.3938, + "step": 273 + }, + { + "epoch": 0.30773550470307454, + "grad_norm": 0.6594875627018434, + "learning_rate": 1.2314606741573034e-05, + "loss": 0.3932, + "step": 274 + }, + { + "epoch": 0.30885862698301275, + "grad_norm": 0.6227401289864777, + "learning_rate": 1.2359550561797752e-05, + "loss": 0.368, + "step": 275 + }, + { + "epoch": 0.309981749262951, + "grad_norm": 0.5836285370383634, + "learning_rate": 1.2404494382022474e-05, + "loss": 0.3896, + "step": 276 + }, + { + "epoch": 0.31110487154288924, + "grad_norm": 0.6356736965747966, + "learning_rate": 1.2449438202247192e-05, + "loss": 0.4001, + "step": 277 + }, + { + "epoch": 0.31222799382282745, + "grad_norm": 0.5961019104865444, + "learning_rate": 1.2494382022471912e-05, + "loss": 0.3898, + "step": 278 + }, + { + "epoch": 0.31335111610276567, + "grad_norm": 0.5894192052755339, + "learning_rate": 1.253932584269663e-05, + "loss": 0.3821, + "step": 279 + }, + { + "epoch": 0.31447423838270394, + "grad_norm": 0.6139515035640439, + "learning_rate": 1.2584269662921348e-05, + "loss": 0.3869, + "step": 280 + }, + { + "epoch": 0.31559736066264216, + "grad_norm": 0.5710811085875507, + "learning_rate": 1.2629213483146068e-05, + "loss": 0.3715, + "step": 281 + }, + { + "epoch": 0.3167204829425804, + "grad_norm": 0.6174376536441986, + "learning_rate": 1.2674157303370786e-05, + "loss": 0.3887, + "step": 282 + }, + { + "epoch": 0.3178436052225186, + "grad_norm": 0.5409663843795987, + "learning_rate": 1.2719101123595508e-05, + "loss": 0.3585, + "step": 283 + }, + { + "epoch": 0.3189667275024568, + "grad_norm": 0.5842025638662178, + "learning_rate": 1.2764044943820226e-05, + "loss": 0.3648, + "step": 284 + }, + { + "epoch": 0.3200898497823951, + "grad_norm": 0.6288491942213608, + "learning_rate": 1.2808988764044944e-05, + "loss": 0.3943, + "step": 285 + }, + { + "epoch": 0.3212129720623333, + "grad_norm": 0.6339928168107889, + "learning_rate": 1.2853932584269664e-05, + "loss": 0.412, + "step": 286 + }, + { + "epoch": 0.3223360943422715, + "grad_norm": 0.5805070016232142, + "learning_rate": 1.2898876404494382e-05, + "loss": 0.3936, + "step": 287 + }, + { + "epoch": 0.3234592166222097, + "grad_norm": 0.6031249399955438, + "learning_rate": 1.29438202247191e-05, + "loss": 0.3849, + "step": 288 + }, + { + "epoch": 0.324582338902148, + "grad_norm": 0.6515523772005117, + "learning_rate": 1.2988764044943822e-05, + "loss": 0.4003, + "step": 289 + }, + { + "epoch": 0.3257054611820862, + "grad_norm": 0.6020531221934884, + "learning_rate": 1.303370786516854e-05, + "loss": 0.4059, + "step": 290 + }, + { + "epoch": 0.3268285834620244, + "grad_norm": 0.5911957722542618, + "learning_rate": 1.307865168539326e-05, + "loss": 0.3936, + "step": 291 + }, + { + "epoch": 0.32795170574196264, + "grad_norm": 0.6296525120353835, + "learning_rate": 1.3123595505617978e-05, + "loss": 0.3638, + "step": 292 + }, + { + "epoch": 0.3290748280219009, + "grad_norm": 0.6035340356032662, + "learning_rate": 1.3168539325842698e-05, + "loss": 0.3633, + "step": 293 + }, + { + "epoch": 0.3301979503018391, + "grad_norm": 0.5640220762566707, + "learning_rate": 1.3213483146067416e-05, + "loss": 0.3936, + "step": 294 + }, + { + "epoch": 0.33132107258177734, + "grad_norm": 0.659950946658227, + "learning_rate": 1.3258426966292135e-05, + "loss": 0.3939, + "step": 295 + }, + { + "epoch": 0.33244419486171556, + "grad_norm": 0.6547964598471208, + "learning_rate": 1.3303370786516856e-05, + "loss": 0.3797, + "step": 296 + }, + { + "epoch": 0.3335673171416538, + "grad_norm": 0.5628971266321644, + "learning_rate": 1.3348314606741574e-05, + "loss": 0.3878, + "step": 297 + }, + { + "epoch": 0.33469043942159205, + "grad_norm": 0.5916043311443739, + "learning_rate": 1.3393258426966294e-05, + "loss": 0.3705, + "step": 298 + }, + { + "epoch": 0.33581356170153026, + "grad_norm": 0.642531175947998, + "learning_rate": 1.3438202247191012e-05, + "loss": 0.3841, + "step": 299 + }, + { + "epoch": 0.3369366839814685, + "grad_norm": 0.5942467702427874, + "learning_rate": 1.348314606741573e-05, + "loss": 0.3864, + "step": 300 + }, + { + "epoch": 0.3380598062614067, + "grad_norm": 0.5576069981166258, + "learning_rate": 1.352808988764045e-05, + "loss": 0.3803, + "step": 301 + }, + { + "epoch": 0.33918292854134496, + "grad_norm": 0.637796095078525, + "learning_rate": 1.3573033707865169e-05, + "loss": 0.3845, + "step": 302 + }, + { + "epoch": 0.3403060508212832, + "grad_norm": 0.5424364787066615, + "learning_rate": 1.361797752808989e-05, + "loss": 0.3885, + "step": 303 + }, + { + "epoch": 0.3414291731012214, + "grad_norm": 0.666509048066953, + "learning_rate": 1.3662921348314608e-05, + "loss": 0.3941, + "step": 304 + }, + { + "epoch": 0.3425522953811596, + "grad_norm": 0.5667187793072223, + "learning_rate": 1.3707865168539327e-05, + "loss": 0.3568, + "step": 305 + }, + { + "epoch": 0.3436754176610978, + "grad_norm": 0.6364904468384852, + "learning_rate": 1.3752808988764046e-05, + "loss": 0.3807, + "step": 306 + }, + { + "epoch": 0.3447985399410361, + "grad_norm": 0.5971120227080189, + "learning_rate": 1.3797752808988765e-05, + "loss": 0.3848, + "step": 307 + }, + { + "epoch": 0.3459216622209743, + "grad_norm": 0.5555370524108522, + "learning_rate": 1.3842696629213483e-05, + "loss": 0.3849, + "step": 308 + }, + { + "epoch": 0.34704478450091253, + "grad_norm": 0.6236731385363299, + "learning_rate": 1.3887640449438204e-05, + "loss": 0.384, + "step": 309 + }, + { + "epoch": 0.34816790678085074, + "grad_norm": 0.6149787882648424, + "learning_rate": 1.3932584269662923e-05, + "loss": 0.3959, + "step": 310 + }, + { + "epoch": 0.349291029060789, + "grad_norm": 0.6215560633514793, + "learning_rate": 1.3977528089887642e-05, + "loss": 0.3918, + "step": 311 + }, + { + "epoch": 0.35041415134072723, + "grad_norm": 0.6209310433594782, + "learning_rate": 1.402247191011236e-05, + "loss": 0.3855, + "step": 312 + }, + { + "epoch": 0.35153727362066545, + "grad_norm": 0.5994586964661478, + "learning_rate": 1.4067415730337079e-05, + "loss": 0.373, + "step": 313 + }, + { + "epoch": 0.35266039590060366, + "grad_norm": 0.6188656847491276, + "learning_rate": 1.4112359550561799e-05, + "loss": 0.3892, + "step": 314 + }, + { + "epoch": 0.35378351818054193, + "grad_norm": 0.5701426572944555, + "learning_rate": 1.4157303370786517e-05, + "loss": 0.3587, + "step": 315 + }, + { + "epoch": 0.35490664046048015, + "grad_norm": 0.5735315497169365, + "learning_rate": 1.4202247191011238e-05, + "loss": 0.3834, + "step": 316 + }, + { + "epoch": 0.35602976274041837, + "grad_norm": 0.6041588815641994, + "learning_rate": 1.4247191011235957e-05, + "loss": 0.4063, + "step": 317 + }, + { + "epoch": 0.3571528850203566, + "grad_norm": 0.6413512842290581, + "learning_rate": 1.4292134831460676e-05, + "loss": 0.3643, + "step": 318 + }, + { + "epoch": 0.3582760073002948, + "grad_norm": 0.5633456389709988, + "learning_rate": 1.4337078651685395e-05, + "loss": 0.3782, + "step": 319 + }, + { + "epoch": 0.35939912958023307, + "grad_norm": 0.6268598664045065, + "learning_rate": 1.4382022471910113e-05, + "loss": 0.3959, + "step": 320 + }, + { + "epoch": 0.3605222518601713, + "grad_norm": 0.5800501810819474, + "learning_rate": 1.4426966292134833e-05, + "loss": 0.4013, + "step": 321 + }, + { + "epoch": 0.3616453741401095, + "grad_norm": 0.5769816751771281, + "learning_rate": 1.447191011235955e-05, + "loss": 0.3828, + "step": 322 + }, + { + "epoch": 0.3627684964200477, + "grad_norm": 0.5949290584396976, + "learning_rate": 1.4516853932584272e-05, + "loss": 0.4089, + "step": 323 + }, + { + "epoch": 0.363891618699986, + "grad_norm": 0.5852043429988183, + "learning_rate": 1.456179775280899e-05, + "loss": 0.3652, + "step": 324 + }, + { + "epoch": 0.3650147409799242, + "grad_norm": 0.572491832081323, + "learning_rate": 1.4606741573033709e-05, + "loss": 0.3984, + "step": 325 + }, + { + "epoch": 0.3661378632598624, + "grad_norm": 0.5972392800723866, + "learning_rate": 1.4651685393258429e-05, + "loss": 0.39, + "step": 326 + }, + { + "epoch": 0.36726098553980063, + "grad_norm": 0.6363257306206275, + "learning_rate": 1.4696629213483147e-05, + "loss": 0.3562, + "step": 327 + }, + { + "epoch": 0.36838410781973885, + "grad_norm": 0.5907940254264683, + "learning_rate": 1.4741573033707865e-05, + "loss": 0.3911, + "step": 328 + }, + { + "epoch": 0.3695072300996771, + "grad_norm": 0.5871546010212381, + "learning_rate": 1.4786516853932587e-05, + "loss": 0.379, + "step": 329 + }, + { + "epoch": 0.37063035237961534, + "grad_norm": 0.6618710636475701, + "learning_rate": 1.4831460674157305e-05, + "loss": 0.3946, + "step": 330 + }, + { + "epoch": 0.37175347465955355, + "grad_norm": 0.5734074870533208, + "learning_rate": 1.4876404494382025e-05, + "loss": 0.4272, + "step": 331 + }, + { + "epoch": 0.37287659693949177, + "grad_norm": 0.6643286523550941, + "learning_rate": 1.4921348314606743e-05, + "loss": 0.3963, + "step": 332 + }, + { + "epoch": 0.37399971921943004, + "grad_norm": 0.6096880457625593, + "learning_rate": 1.4966292134831461e-05, + "loss": 0.4006, + "step": 333 + }, + { + "epoch": 0.37512284149936825, + "grad_norm": 0.5812451405437918, + "learning_rate": 1.501123595505618e-05, + "loss": 0.3975, + "step": 334 + }, + { + "epoch": 0.37624596377930647, + "grad_norm": 0.5757226170172344, + "learning_rate": 1.5056179775280899e-05, + "loss": 0.3851, + "step": 335 + }, + { + "epoch": 0.3773690860592447, + "grad_norm": 0.5954813467325601, + "learning_rate": 1.510112359550562e-05, + "loss": 0.3817, + "step": 336 + }, + { + "epoch": 0.37849220833918296, + "grad_norm": 0.5726297825723002, + "learning_rate": 1.5146067415730339e-05, + "loss": 0.3882, + "step": 337 + }, + { + "epoch": 0.37961533061912117, + "grad_norm": 0.575616838800246, + "learning_rate": 1.5191011235955057e-05, + "loss": 0.4013, + "step": 338 + }, + { + "epoch": 0.3807384528990594, + "grad_norm": 0.5656104756542751, + "learning_rate": 1.5235955056179777e-05, + "loss": 0.3715, + "step": 339 + }, + { + "epoch": 0.3818615751789976, + "grad_norm": 0.5700630782329689, + "learning_rate": 1.5280898876404495e-05, + "loss": 0.382, + "step": 340 + }, + { + "epoch": 0.3829846974589358, + "grad_norm": 0.5901652073128528, + "learning_rate": 1.5325842696629213e-05, + "loss": 0.3922, + "step": 341 + }, + { + "epoch": 0.3841078197388741, + "grad_norm": 0.5680443357781206, + "learning_rate": 1.537078651685393e-05, + "loss": 0.3956, + "step": 342 + }, + { + "epoch": 0.3852309420188123, + "grad_norm": 0.6094240246912197, + "learning_rate": 1.5415730337078653e-05, + "loss": 0.4001, + "step": 343 + }, + { + "epoch": 0.3863540642987505, + "grad_norm": 0.6025863182389569, + "learning_rate": 1.546067415730337e-05, + "loss": 0.3991, + "step": 344 + }, + { + "epoch": 0.38747718657868874, + "grad_norm": 0.5849785735196502, + "learning_rate": 1.5505617977528093e-05, + "loss": 0.3561, + "step": 345 + }, + { + "epoch": 0.388600308858627, + "grad_norm": 0.6155288868533991, + "learning_rate": 1.555056179775281e-05, + "loss": 0.3885, + "step": 346 + }, + { + "epoch": 0.3897234311385652, + "grad_norm": 0.528624344003196, + "learning_rate": 1.559550561797753e-05, + "loss": 0.3368, + "step": 347 + }, + { + "epoch": 0.39084655341850344, + "grad_norm": 0.5742784751968288, + "learning_rate": 1.5640449438202247e-05, + "loss": 0.3908, + "step": 348 + }, + { + "epoch": 0.39196967569844166, + "grad_norm": 0.5790550121976005, + "learning_rate": 1.568539325842697e-05, + "loss": 0.3935, + "step": 349 + }, + { + "epoch": 0.39309279797837987, + "grad_norm": 0.5837147473027602, + "learning_rate": 1.5730337078651687e-05, + "loss": 0.3882, + "step": 350 + }, + { + "epoch": 0.39421592025831814, + "grad_norm": 0.59876417374738, + "learning_rate": 1.5775280898876405e-05, + "loss": 0.3687, + "step": 351 + }, + { + "epoch": 0.39533904253825636, + "grad_norm": 0.5577383109842206, + "learning_rate": 1.5820224719101127e-05, + "loss": 0.3838, + "step": 352 + }, + { + "epoch": 0.3964621648181946, + "grad_norm": 0.6253260037735091, + "learning_rate": 1.5865168539325845e-05, + "loss": 0.3988, + "step": 353 + }, + { + "epoch": 0.3975852870981328, + "grad_norm": 0.5709668147622707, + "learning_rate": 1.5910112359550563e-05, + "loss": 0.3797, + "step": 354 + }, + { + "epoch": 0.39870840937807106, + "grad_norm": 0.5766274168346207, + "learning_rate": 1.595505617977528e-05, + "loss": 0.4017, + "step": 355 + }, + { + "epoch": 0.3998315316580093, + "grad_norm": 0.5960841144559348, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.3784, + "step": 356 + }, + { + "epoch": 0.4009546539379475, + "grad_norm": 0.5637097286513578, + "learning_rate": 1.604494382022472e-05, + "loss": 0.3835, + "step": 357 + }, + { + "epoch": 0.4020777762178857, + "grad_norm": 0.621385534061573, + "learning_rate": 1.608988764044944e-05, + "loss": 0.379, + "step": 358 + }, + { + "epoch": 0.4032008984978239, + "grad_norm": 0.5804215759491298, + "learning_rate": 1.6134831460674157e-05, + "loss": 0.3717, + "step": 359 + }, + { + "epoch": 0.4043240207777622, + "grad_norm": 0.6850149117508036, + "learning_rate": 1.617977528089888e-05, + "loss": 0.3929, + "step": 360 + }, + { + "epoch": 0.4054471430577004, + "grad_norm": 0.5678267887232396, + "learning_rate": 1.6224719101123597e-05, + "loss": 0.3708, + "step": 361 + }, + { + "epoch": 0.4065702653376386, + "grad_norm": 0.5846240027794225, + "learning_rate": 1.626966292134832e-05, + "loss": 0.3806, + "step": 362 + }, + { + "epoch": 0.40769338761757684, + "grad_norm": 0.641861479794963, + "learning_rate": 1.6314606741573037e-05, + "loss": 0.3751, + "step": 363 + }, + { + "epoch": 0.4088165098975151, + "grad_norm": 0.551258860522498, + "learning_rate": 1.6359550561797755e-05, + "loss": 0.3847, + "step": 364 + }, + { + "epoch": 0.40993963217745333, + "grad_norm": 0.5436695988486194, + "learning_rate": 1.6404494382022473e-05, + "loss": 0.3588, + "step": 365 + }, + { + "epoch": 0.41106275445739154, + "grad_norm": 0.6273802594115025, + "learning_rate": 1.644943820224719e-05, + "loss": 0.3893, + "step": 366 + }, + { + "epoch": 0.41218587673732976, + "grad_norm": 0.5829276428140747, + "learning_rate": 1.649438202247191e-05, + "loss": 0.3761, + "step": 367 + }, + { + "epoch": 0.41330899901726803, + "grad_norm": 0.6510879644948344, + "learning_rate": 1.653932584269663e-05, + "loss": 0.3875, + "step": 368 + }, + { + "epoch": 0.41443212129720625, + "grad_norm": 0.5895996214987, + "learning_rate": 1.658426966292135e-05, + "loss": 0.3861, + "step": 369 + }, + { + "epoch": 0.41555524357714446, + "grad_norm": 0.5696237361887272, + "learning_rate": 1.662921348314607e-05, + "loss": 0.3733, + "step": 370 + }, + { + "epoch": 0.4166783658570827, + "grad_norm": 0.6452128769900847, + "learning_rate": 1.667415730337079e-05, + "loss": 0.377, + "step": 371 + }, + { + "epoch": 0.4178014881370209, + "grad_norm": 0.5933187468692044, + "learning_rate": 1.6719101123595507e-05, + "loss": 0.3727, + "step": 372 + }, + { + "epoch": 0.41892461041695916, + "grad_norm": 0.5956891237289303, + "learning_rate": 1.6764044943820225e-05, + "loss": 0.409, + "step": 373 + }, + { + "epoch": 0.4200477326968974, + "grad_norm": 0.6421959612844145, + "learning_rate": 1.6808988764044943e-05, + "loss": 0.4176, + "step": 374 + }, + { + "epoch": 0.4211708549768356, + "grad_norm": 0.5877612385670545, + "learning_rate": 1.6853932584269665e-05, + "loss": 0.3813, + "step": 375 + }, + { + "epoch": 0.4222939772567738, + "grad_norm": 0.5522655569800495, + "learning_rate": 1.6898876404494383e-05, + "loss": 0.3635, + "step": 376 + }, + { + "epoch": 0.4234170995367121, + "grad_norm": 0.570615445713445, + "learning_rate": 1.6943820224719105e-05, + "loss": 0.3464, + "step": 377 + }, + { + "epoch": 0.4245402218166503, + "grad_norm": 0.5335527636986674, + "learning_rate": 1.6988764044943823e-05, + "loss": 0.3519, + "step": 378 + }, + { + "epoch": 0.4256633440965885, + "grad_norm": 0.5633249413573972, + "learning_rate": 1.703370786516854e-05, + "loss": 0.3671, + "step": 379 + }, + { + "epoch": 0.42678646637652673, + "grad_norm": 0.5962150121982345, + "learning_rate": 1.707865168539326e-05, + "loss": 0.3638, + "step": 380 + }, + { + "epoch": 0.42790958865646495, + "grad_norm": 0.533130081409531, + "learning_rate": 1.7123595505617977e-05, + "loss": 0.3723, + "step": 381 + }, + { + "epoch": 0.4290327109364032, + "grad_norm": 0.5232079977615192, + "learning_rate": 1.71685393258427e-05, + "loss": 0.3377, + "step": 382 + }, + { + "epoch": 0.43015583321634143, + "grad_norm": 0.6669436739174833, + "learning_rate": 1.7213483146067417e-05, + "loss": 0.3649, + "step": 383 + }, + { + "epoch": 0.43127895549627965, + "grad_norm": 0.58553539025849, + "learning_rate": 1.7258426966292135e-05, + "loss": 0.3902, + "step": 384 + }, + { + "epoch": 0.43240207777621786, + "grad_norm": 0.5805175926211644, + "learning_rate": 1.7303370786516857e-05, + "loss": 0.3988, + "step": 385 + }, + { + "epoch": 0.43352520005615613, + "grad_norm": 0.622462053904498, + "learning_rate": 1.7348314606741575e-05, + "loss": 0.3726, + "step": 386 + }, + { + "epoch": 0.43464832233609435, + "grad_norm": 0.6586460393239919, + "learning_rate": 1.7393258426966293e-05, + "loss": 0.382, + "step": 387 + }, + { + "epoch": 0.43577144461603257, + "grad_norm": 0.503597593779572, + "learning_rate": 1.743820224719101e-05, + "loss": 0.3548, + "step": 388 + }, + { + "epoch": 0.4368945668959708, + "grad_norm": 0.6269680293566149, + "learning_rate": 1.7483146067415733e-05, + "loss": 0.3894, + "step": 389 + }, + { + "epoch": 0.43801768917590905, + "grad_norm": 0.5826375120126366, + "learning_rate": 1.752808988764045e-05, + "loss": 0.3814, + "step": 390 + }, + { + "epoch": 0.43914081145584727, + "grad_norm": 0.5730693678129642, + "learning_rate": 1.757303370786517e-05, + "loss": 0.4214, + "step": 391 + }, + { + "epoch": 0.4402639337357855, + "grad_norm": 0.5808702128043581, + "learning_rate": 1.7617977528089887e-05, + "loss": 0.354, + "step": 392 + }, + { + "epoch": 0.4413870560157237, + "grad_norm": 0.5974314280889683, + "learning_rate": 1.766292134831461e-05, + "loss": 0.3776, + "step": 393 + }, + { + "epoch": 0.4425101782956619, + "grad_norm": 0.5707766947857629, + "learning_rate": 1.7707865168539327e-05, + "loss": 0.3818, + "step": 394 + }, + { + "epoch": 0.4436333005756002, + "grad_norm": 0.5675159319972002, + "learning_rate": 1.7752808988764045e-05, + "loss": 0.4048, + "step": 395 + }, + { + "epoch": 0.4447564228555384, + "grad_norm": 0.5855000642824909, + "learning_rate": 1.7797752808988767e-05, + "loss": 0.3669, + "step": 396 + }, + { + "epoch": 0.4458795451354766, + "grad_norm": 0.5890500820148369, + "learning_rate": 1.7842696629213485e-05, + "loss": 0.3529, + "step": 397 + }, + { + "epoch": 0.44700266741541483, + "grad_norm": 0.5765910274102177, + "learning_rate": 1.7887640449438203e-05, + "loss": 0.4004, + "step": 398 + }, + { + "epoch": 0.4481257896953531, + "grad_norm": 0.5850026850384389, + "learning_rate": 1.793258426966292e-05, + "loss": 0.4024, + "step": 399 + }, + { + "epoch": 0.4492489119752913, + "grad_norm": 0.60842577325865, + "learning_rate": 1.7977528089887643e-05, + "loss": 0.362, + "step": 400 + }, + { + "epoch": 0.45037203425522954, + "grad_norm": 0.598714080261855, + "learning_rate": 1.802247191011236e-05, + "loss": 0.4056, + "step": 401 + }, + { + "epoch": 0.45149515653516775, + "grad_norm": 0.5460494368263945, + "learning_rate": 1.8067415730337083e-05, + "loss": 0.3563, + "step": 402 + }, + { + "epoch": 0.45261827881510597, + "grad_norm": 0.5338120861959073, + "learning_rate": 1.81123595505618e-05, + "loss": 0.3629, + "step": 403 + }, + { + "epoch": 0.45374140109504424, + "grad_norm": 0.5715606361300019, + "learning_rate": 1.815730337078652e-05, + "loss": 0.3717, + "step": 404 + }, + { + "epoch": 0.45486452337498245, + "grad_norm": 0.541068995106628, + "learning_rate": 1.8202247191011237e-05, + "loss": 0.3779, + "step": 405 + }, + { + "epoch": 0.45598764565492067, + "grad_norm": 0.5723001753094253, + "learning_rate": 1.8247191011235956e-05, + "loss": 0.3882, + "step": 406 + }, + { + "epoch": 0.4571107679348589, + "grad_norm": 0.5405763992724918, + "learning_rate": 1.8292134831460674e-05, + "loss": 0.3762, + "step": 407 + }, + { + "epoch": 0.45823389021479716, + "grad_norm": 0.5159698191034968, + "learning_rate": 1.8337078651685395e-05, + "loss": 0.377, + "step": 408 + }, + { + "epoch": 0.4593570124947354, + "grad_norm": 0.5354052943635401, + "learning_rate": 1.8382022471910113e-05, + "loss": 0.3735, + "step": 409 + }, + { + "epoch": 0.4604801347746736, + "grad_norm": 0.5507330621386544, + "learning_rate": 1.8426966292134835e-05, + "loss": 0.3941, + "step": 410 + }, + { + "epoch": 0.4616032570546118, + "grad_norm": 0.5470411737981918, + "learning_rate": 1.8471910112359553e-05, + "loss": 0.3788, + "step": 411 + }, + { + "epoch": 0.4627263793345501, + "grad_norm": 0.6279126208387347, + "learning_rate": 1.851685393258427e-05, + "loss": 0.3766, + "step": 412 + }, + { + "epoch": 0.4638495016144883, + "grad_norm": 0.5581331082608008, + "learning_rate": 1.856179775280899e-05, + "loss": 0.3548, + "step": 413 + }, + { + "epoch": 0.4649726238944265, + "grad_norm": 0.6339597817219623, + "learning_rate": 1.8606741573033708e-05, + "loss": 0.3928, + "step": 414 + }, + { + "epoch": 0.4660957461743647, + "grad_norm": 0.5885950731841739, + "learning_rate": 1.8651685393258426e-05, + "loss": 0.3834, + "step": 415 + }, + { + "epoch": 0.46721886845430294, + "grad_norm": 0.5517474243216776, + "learning_rate": 1.8696629213483147e-05, + "loss": 0.372, + "step": 416 + }, + { + "epoch": 0.4683419907342412, + "grad_norm": 0.5371558479053717, + "learning_rate": 1.8741573033707866e-05, + "loss": 0.3781, + "step": 417 + }, + { + "epoch": 0.4694651130141794, + "grad_norm": 0.5972062873696606, + "learning_rate": 1.8786516853932587e-05, + "loss": 0.4045, + "step": 418 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.5289075828695805, + "learning_rate": 1.8831460674157305e-05, + "loss": 0.3604, + "step": 419 + }, + { + "epoch": 0.47171135757405586, + "grad_norm": 0.5595781567840159, + "learning_rate": 1.8876404494382024e-05, + "loss": 0.3733, + "step": 420 + }, + { + "epoch": 0.4728344798539941, + "grad_norm": 0.5283542090574247, + "learning_rate": 1.8921348314606742e-05, + "loss": 0.3817, + "step": 421 + }, + { + "epoch": 0.47395760213393234, + "grad_norm": 0.5224861754856025, + "learning_rate": 1.8966292134831463e-05, + "loss": 0.3846, + "step": 422 + }, + { + "epoch": 0.47508072441387056, + "grad_norm": 0.5693567792868641, + "learning_rate": 1.901123595505618e-05, + "loss": 0.3784, + "step": 423 + }, + { + "epoch": 0.4762038466938088, + "grad_norm": 0.586099489925582, + "learning_rate": 1.90561797752809e-05, + "loss": 0.3762, + "step": 424 + }, + { + "epoch": 0.477326968973747, + "grad_norm": 0.5172936141285605, + "learning_rate": 1.910112359550562e-05, + "loss": 0.3833, + "step": 425 + }, + { + "epoch": 0.47845009125368526, + "grad_norm": 0.5666644649128297, + "learning_rate": 1.914606741573034e-05, + "loss": 0.3839, + "step": 426 + }, + { + "epoch": 0.4795732135336235, + "grad_norm": 0.5860548355924229, + "learning_rate": 1.9191011235955058e-05, + "loss": 0.3716, + "step": 427 + }, + { + "epoch": 0.4806963358135617, + "grad_norm": 0.5520174477409845, + "learning_rate": 1.9235955056179776e-05, + "loss": 0.3696, + "step": 428 + }, + { + "epoch": 0.4818194580934999, + "grad_norm": 0.5402453132910056, + "learning_rate": 1.9280898876404497e-05, + "loss": 0.3769, + "step": 429 + }, + { + "epoch": 0.4829425803734382, + "grad_norm": 0.5670157993103069, + "learning_rate": 1.9325842696629215e-05, + "loss": 0.3546, + "step": 430 + }, + { + "epoch": 0.4840657026533764, + "grad_norm": 0.5378630874757251, + "learning_rate": 1.9370786516853934e-05, + "loss": 0.3831, + "step": 431 + }, + { + "epoch": 0.4851888249333146, + "grad_norm": 0.5178321480341342, + "learning_rate": 1.9415730337078652e-05, + "loss": 0.3796, + "step": 432 + }, + { + "epoch": 0.4863119472132528, + "grad_norm": 0.5500034972413501, + "learning_rate": 1.9460674157303373e-05, + "loss": 0.4023, + "step": 433 + }, + { + "epoch": 0.4874350694931911, + "grad_norm": 0.4790912474912375, + "learning_rate": 1.950561797752809e-05, + "loss": 0.3695, + "step": 434 + }, + { + "epoch": 0.4885581917731293, + "grad_norm": 0.520106712842702, + "learning_rate": 1.955056179775281e-05, + "loss": 0.3907, + "step": 435 + }, + { + "epoch": 0.48968131405306753, + "grad_norm": 0.5270381224070033, + "learning_rate": 1.959550561797753e-05, + "loss": 0.3746, + "step": 436 + }, + { + "epoch": 0.49080443633300574, + "grad_norm": 0.5585640386718159, + "learning_rate": 1.964044943820225e-05, + "loss": 0.3968, + "step": 437 + }, + { + "epoch": 0.49192755861294396, + "grad_norm": 0.5212838454555632, + "learning_rate": 1.9685393258426968e-05, + "loss": 0.3848, + "step": 438 + }, + { + "epoch": 0.49305068089288223, + "grad_norm": 0.5223182787955911, + "learning_rate": 1.9730337078651686e-05, + "loss": 0.3871, + "step": 439 + }, + { + "epoch": 0.49417380317282045, + "grad_norm": 0.5014928363223362, + "learning_rate": 1.9775280898876404e-05, + "loss": 0.3749, + "step": 440 + }, + { + "epoch": 0.49529692545275866, + "grad_norm": 0.5448751880848072, + "learning_rate": 1.9820224719101126e-05, + "loss": 0.4037, + "step": 441 + }, + { + "epoch": 0.4964200477326969, + "grad_norm": 0.5115396988679667, + "learning_rate": 1.9865168539325844e-05, + "loss": 0.3578, + "step": 442 + }, + { + "epoch": 0.49754317001263515, + "grad_norm": 0.5226126289636163, + "learning_rate": 1.9910112359550565e-05, + "loss": 0.3813, + "step": 443 + }, + { + "epoch": 0.49866629229257337, + "grad_norm": 0.5651234942385593, + "learning_rate": 1.9955056179775283e-05, + "loss": 0.4008, + "step": 444 + }, + { + "epoch": 0.4997894145725116, + "grad_norm": 0.5414290026509725, + "learning_rate": 2e-05, + "loss": 0.3634, + "step": 445 + }, + { + "epoch": 0.5009125368524499, + "grad_norm": 0.5062706466010933, + "learning_rate": 1.9999996923444976e-05, + "loss": 0.3679, + "step": 446 + }, + { + "epoch": 0.502035659132388, + "grad_norm": 0.6284172666370126, + "learning_rate": 1.9999987693781806e-05, + "loss": 0.39, + "step": 447 + }, + { + "epoch": 0.5031587814123263, + "grad_norm": 0.523476357886049, + "learning_rate": 1.9999972311016158e-05, + "loss": 0.3851, + "step": 448 + }, + { + "epoch": 0.5042819036922644, + "grad_norm": 0.5198224598673091, + "learning_rate": 1.99999507751575e-05, + "loss": 0.3722, + "step": 449 + }, + { + "epoch": 0.5054050259722027, + "grad_norm": 0.5990656789702519, + "learning_rate": 1.999992308621909e-05, + "loss": 0.3961, + "step": 450 + }, + { + "epoch": 0.506528148252141, + "grad_norm": 0.5337336925829737, + "learning_rate": 1.9999889244217957e-05, + "loss": 0.3661, + "step": 451 + }, + { + "epoch": 0.5076512705320791, + "grad_norm": 0.5265236086660914, + "learning_rate": 1.9999849249174933e-05, + "loss": 0.3763, + "step": 452 + }, + { + "epoch": 0.5087743928120174, + "grad_norm": 0.6072251444919325, + "learning_rate": 1.9999803101114622e-05, + "loss": 0.3702, + "step": 453 + }, + { + "epoch": 0.5098975150919557, + "grad_norm": 0.6242928399695532, + "learning_rate": 1.9999750800065415e-05, + "loss": 0.3953, + "step": 454 + }, + { + "epoch": 0.5110206373718938, + "grad_norm": 0.5179538772395289, + "learning_rate": 1.9999692346059504e-05, + "loss": 0.3873, + "step": 455 + }, + { + "epoch": 0.5121437596518321, + "grad_norm": 0.5639243942734614, + "learning_rate": 1.9999627739132847e-05, + "loss": 0.3975, + "step": 456 + }, + { + "epoch": 0.5132668819317703, + "grad_norm": 0.6222398095764557, + "learning_rate": 1.9999556979325203e-05, + "loss": 0.3826, + "step": 457 + }, + { + "epoch": 0.5143900042117086, + "grad_norm": 0.5381548600095061, + "learning_rate": 1.999948006668011e-05, + "loss": 0.4006, + "step": 458 + }, + { + "epoch": 0.5155131264916468, + "grad_norm": 0.5460529769570657, + "learning_rate": 1.999939700124489e-05, + "loss": 0.3779, + "step": 459 + }, + { + "epoch": 0.516636248771585, + "grad_norm": 0.5285009639724508, + "learning_rate": 1.999930778307066e-05, + "loss": 0.4117, + "step": 460 + }, + { + "epoch": 0.5177593710515233, + "grad_norm": 0.588582651422445, + "learning_rate": 1.9999212412212313e-05, + "loss": 0.4135, + "step": 461 + }, + { + "epoch": 0.5188824933314614, + "grad_norm": 0.5262946007341981, + "learning_rate": 1.9999110888728527e-05, + "loss": 0.3836, + "step": 462 + }, + { + "epoch": 0.5200056156113997, + "grad_norm": 0.5074018080150626, + "learning_rate": 1.999900321268178e-05, + "loss": 0.3879, + "step": 463 + }, + { + "epoch": 0.521128737891338, + "grad_norm": 0.5699895099551994, + "learning_rate": 1.9998889384138324e-05, + "loss": 0.3994, + "step": 464 + }, + { + "epoch": 0.5222518601712761, + "grad_norm": 0.5488889184993216, + "learning_rate": 1.9998769403168195e-05, + "loss": 0.3787, + "step": 465 + }, + { + "epoch": 0.5233749824512144, + "grad_norm": 0.587601256959231, + "learning_rate": 1.9998643269845225e-05, + "loss": 0.4028, + "step": 466 + }, + { + "epoch": 0.5244981047311527, + "grad_norm": 0.5470076750391482, + "learning_rate": 1.9998510984247015e-05, + "loss": 0.3869, + "step": 467 + }, + { + "epoch": 0.5256212270110908, + "grad_norm": 0.5490698566612296, + "learning_rate": 1.9998372546454973e-05, + "loss": 0.4069, + "step": 468 + }, + { + "epoch": 0.5267443492910291, + "grad_norm": 0.5377412654267874, + "learning_rate": 1.999822795655427e-05, + "loss": 0.3622, + "step": 469 + }, + { + "epoch": 0.5278674715709673, + "grad_norm": 0.5398403722418982, + "learning_rate": 1.9998077214633884e-05, + "loss": 0.3785, + "step": 470 + }, + { + "epoch": 0.5289905938509055, + "grad_norm": 0.49636950695396015, + "learning_rate": 1.999792032078656e-05, + "loss": 0.3526, + "step": 471 + }, + { + "epoch": 0.5301137161308438, + "grad_norm": 0.6168098343297038, + "learning_rate": 1.9997757275108847e-05, + "loss": 0.3947, + "step": 472 + }, + { + "epoch": 0.531236838410782, + "grad_norm": 0.5815523034498505, + "learning_rate": 1.9997588077701057e-05, + "loss": 0.3776, + "step": 473 + }, + { + "epoch": 0.5323599606907202, + "grad_norm": 0.5214146087899991, + "learning_rate": 1.999741272866731e-05, + "loss": 0.3685, + "step": 474 + }, + { + "epoch": 0.5334830829706584, + "grad_norm": 0.554167072491176, + "learning_rate": 1.9997231228115487e-05, + "loss": 0.3987, + "step": 475 + }, + { + "epoch": 0.5346062052505967, + "grad_norm": 0.5442738139350299, + "learning_rate": 1.999704357615728e-05, + "loss": 0.3824, + "step": 476 + }, + { + "epoch": 0.5357293275305349, + "grad_norm": 0.5157448841129373, + "learning_rate": 1.9996849772908144e-05, + "loss": 0.3728, + "step": 477 + }, + { + "epoch": 0.5368524498104731, + "grad_norm": 0.5376140918675025, + "learning_rate": 1.9996649818487336e-05, + "loss": 0.409, + "step": 478 + }, + { + "epoch": 0.5379755720904114, + "grad_norm": 0.549248833785917, + "learning_rate": 1.9996443713017883e-05, + "loss": 0.3852, + "step": 479 + }, + { + "epoch": 0.5390986943703495, + "grad_norm": 0.5236716683541965, + "learning_rate": 1.999623145662661e-05, + "loss": 0.3798, + "step": 480 + }, + { + "epoch": 0.5402218166502878, + "grad_norm": 0.609473300182195, + "learning_rate": 1.9996013049444117e-05, + "loss": 0.3764, + "step": 481 + }, + { + "epoch": 0.5413449389302261, + "grad_norm": 0.5355644461303796, + "learning_rate": 1.9995788491604797e-05, + "loss": 0.3574, + "step": 482 + }, + { + "epoch": 0.5424680612101642, + "grad_norm": 0.5580759406548179, + "learning_rate": 1.9995557783246814e-05, + "loss": 0.3704, + "step": 483 + }, + { + "epoch": 0.5435911834901025, + "grad_norm": 0.5531806767952361, + "learning_rate": 1.9995320924512135e-05, + "loss": 0.3939, + "step": 484 + }, + { + "epoch": 0.5447143057700408, + "grad_norm": 0.5834026851989711, + "learning_rate": 1.9995077915546496e-05, + "loss": 0.3931, + "step": 485 + }, + { + "epoch": 0.5458374280499789, + "grad_norm": 0.5591794205487529, + "learning_rate": 1.9994828756499423e-05, + "loss": 0.4071, + "step": 486 + }, + { + "epoch": 0.5469605503299172, + "grad_norm": 0.5322788134249687, + "learning_rate": 1.9994573447524233e-05, + "loss": 0.3704, + "step": 487 + }, + { + "epoch": 0.5480836726098554, + "grad_norm": 0.6160509443550839, + "learning_rate": 1.999431198877801e-05, + "loss": 0.3668, + "step": 488 + }, + { + "epoch": 0.5492067948897936, + "grad_norm": 0.4808621346087435, + "learning_rate": 1.9994044380421642e-05, + "loss": 0.359, + "step": 489 + }, + { + "epoch": 0.5503299171697319, + "grad_norm": 0.5940190325126405, + "learning_rate": 1.9993770622619784e-05, + "loss": 0.3975, + "step": 490 + }, + { + "epoch": 0.5514530394496701, + "grad_norm": 0.5314215977553695, + "learning_rate": 1.9993490715540888e-05, + "loss": 0.3619, + "step": 491 + }, + { + "epoch": 0.5525761617296083, + "grad_norm": 0.5286779855947581, + "learning_rate": 1.9993204659357176e-05, + "loss": 0.3624, + "step": 492 + }, + { + "epoch": 0.5536992840095465, + "grad_norm": 0.6157841936932821, + "learning_rate": 1.9992912454244677e-05, + "loss": 0.3856, + "step": 493 + }, + { + "epoch": 0.5548224062894848, + "grad_norm": 0.5146925583224419, + "learning_rate": 1.999261410038317e-05, + "loss": 0.3889, + "step": 494 + }, + { + "epoch": 0.555945528569423, + "grad_norm": 0.5899681817511568, + "learning_rate": 1.9992309597956244e-05, + "loss": 0.3872, + "step": 495 + }, + { + "epoch": 0.5570686508493612, + "grad_norm": 0.5514658907626057, + "learning_rate": 1.999199894715126e-05, + "loss": 0.367, + "step": 496 + }, + { + "epoch": 0.5581917731292995, + "grad_norm": 0.5727284572822289, + "learning_rate": 1.9991682148159372e-05, + "loss": 0.3855, + "step": 497 + }, + { + "epoch": 0.5593148954092377, + "grad_norm": 0.6630678363436415, + "learning_rate": 1.9991359201175503e-05, + "loss": 0.3747, + "step": 498 + }, + { + "epoch": 0.5604380176891759, + "grad_norm": 0.5168218883882756, + "learning_rate": 1.9991030106398367e-05, + "loss": 0.3665, + "step": 499 + }, + { + "epoch": 0.5615611399691142, + "grad_norm": 0.5285711442774218, + "learning_rate": 1.999069486403046e-05, + "loss": 0.3661, + "step": 500 + }, + { + "epoch": 0.5626842622490523, + "grad_norm": 0.612117315690109, + "learning_rate": 1.999035347427806e-05, + "loss": 0.4039, + "step": 501 + }, + { + "epoch": 0.5638073845289906, + "grad_norm": 0.5866977447253838, + "learning_rate": 1.999000593735123e-05, + "loss": 0.4256, + "step": 502 + }, + { + "epoch": 0.5649305068089289, + "grad_norm": 0.5670878177199548, + "learning_rate": 1.998965225346381e-05, + "loss": 0.4193, + "step": 503 + }, + { + "epoch": 0.566053629088867, + "grad_norm": 0.5585226536761726, + "learning_rate": 1.998929242283343e-05, + "loss": 0.3909, + "step": 504 + }, + { + "epoch": 0.5671767513688053, + "grad_norm": 0.5483834024577466, + "learning_rate": 1.9988926445681495e-05, + "loss": 0.3914, + "step": 505 + }, + { + "epoch": 0.5682998736487435, + "grad_norm": 0.48877542079307956, + "learning_rate": 1.9988554322233188e-05, + "loss": 0.3718, + "step": 506 + }, + { + "epoch": 0.5694229959286817, + "grad_norm": 0.5685952858138967, + "learning_rate": 1.9988176052717495e-05, + "loss": 0.3989, + "step": 507 + }, + { + "epoch": 0.57054611820862, + "grad_norm": 0.5234278520668304, + "learning_rate": 1.9987791637367157e-05, + "loss": 0.3742, + "step": 508 + }, + { + "epoch": 0.5716692404885582, + "grad_norm": 0.5460853796115502, + "learning_rate": 1.9987401076418717e-05, + "loss": 0.402, + "step": 509 + }, + { + "epoch": 0.5727923627684964, + "grad_norm": 0.5455839187416055, + "learning_rate": 1.9987004370112487e-05, + "loss": 0.3641, + "step": 510 + }, + { + "epoch": 0.5739154850484347, + "grad_norm": 0.5056945010695153, + "learning_rate": 1.9986601518692567e-05, + "loss": 0.375, + "step": 511 + }, + { + "epoch": 0.5750386073283729, + "grad_norm": 0.5208870388950549, + "learning_rate": 1.9986192522406835e-05, + "loss": 0.3714, + "step": 512 + }, + { + "epoch": 0.5761617296083111, + "grad_norm": 0.5358983435756814, + "learning_rate": 1.998577738150695e-05, + "loss": 0.3822, + "step": 513 + }, + { + "epoch": 0.5772848518882493, + "grad_norm": 0.5399478578334925, + "learning_rate": 1.9985356096248357e-05, + "loss": 0.3978, + "step": 514 + }, + { + "epoch": 0.5784079741681876, + "grad_norm": 0.5276672395265529, + "learning_rate": 1.998492866689027e-05, + "loss": 0.351, + "step": 515 + }, + { + "epoch": 0.5795310964481258, + "grad_norm": 0.5308333056433986, + "learning_rate": 1.9984495093695696e-05, + "loss": 0.4038, + "step": 516 + }, + { + "epoch": 0.580654218728064, + "grad_norm": 0.4895548996663825, + "learning_rate": 1.9984055376931414e-05, + "loss": 0.349, + "step": 517 + }, + { + "epoch": 0.5817773410080023, + "grad_norm": 0.5250328463562076, + "learning_rate": 1.9983609516867993e-05, + "loss": 0.3766, + "step": 518 + }, + { + "epoch": 0.5829004632879404, + "grad_norm": 0.5371115914639484, + "learning_rate": 1.9983157513779768e-05, + "loss": 0.3576, + "step": 519 + }, + { + "epoch": 0.5840235855678787, + "grad_norm": 0.5127425095139194, + "learning_rate": 1.998269936794487e-05, + "loss": 0.3771, + "step": 520 + }, + { + "epoch": 0.585146707847817, + "grad_norm": 0.5327318689065801, + "learning_rate": 1.9982235079645192e-05, + "loss": 0.3629, + "step": 521 + }, + { + "epoch": 0.5862698301277551, + "grad_norm": 0.5158425879035294, + "learning_rate": 1.998176464916642e-05, + "loss": 0.3513, + "step": 522 + }, + { + "epoch": 0.5873929524076934, + "grad_norm": 0.5261415497468315, + "learning_rate": 1.998128807679802e-05, + "loss": 0.3895, + "step": 523 + }, + { + "epoch": 0.5885160746876316, + "grad_norm": 0.5330293259802309, + "learning_rate": 1.998080536283322e-05, + "loss": 0.3801, + "step": 524 + }, + { + "epoch": 0.5896391969675698, + "grad_norm": 0.5328558914321503, + "learning_rate": 1.998031650756905e-05, + "loss": 0.3746, + "step": 525 + }, + { + "epoch": 0.5907623192475081, + "grad_norm": 0.5390463848256605, + "learning_rate": 1.9979821511306308e-05, + "loss": 0.3817, + "step": 526 + }, + { + "epoch": 0.5918854415274463, + "grad_norm": 0.5431335082497138, + "learning_rate": 1.9979320374349564e-05, + "loss": 0.3908, + "step": 527 + }, + { + "epoch": 0.5930085638073845, + "grad_norm": 0.5101979024125878, + "learning_rate": 1.997881309700717e-05, + "loss": 0.3777, + "step": 528 + }, + { + "epoch": 0.5941316860873228, + "grad_norm": 0.5209616551352008, + "learning_rate": 1.9978299679591276e-05, + "loss": 0.385, + "step": 529 + }, + { + "epoch": 0.595254808367261, + "grad_norm": 0.518301945363611, + "learning_rate": 1.9977780122417778e-05, + "loss": 0.3927, + "step": 530 + }, + { + "epoch": 0.5963779306471992, + "grad_norm": 0.5064656297120799, + "learning_rate": 1.997725442580637e-05, + "loss": 0.366, + "step": 531 + }, + { + "epoch": 0.5975010529271374, + "grad_norm": 0.560763233769782, + "learning_rate": 1.997672259008052e-05, + "loss": 0.3839, + "step": 532 + }, + { + "epoch": 0.5986241752070757, + "grad_norm": 0.5185441179220639, + "learning_rate": 1.997618461556747e-05, + "loss": 0.3649, + "step": 533 + }, + { + "epoch": 0.5997472974870139, + "grad_norm": 0.5116157155704527, + "learning_rate": 1.9975640502598243e-05, + "loss": 0.4155, + "step": 534 + }, + { + "epoch": 0.6008704197669521, + "grad_norm": 0.5484375904037471, + "learning_rate": 1.9975090251507637e-05, + "loss": 0.3893, + "step": 535 + }, + { + "epoch": 0.6019935420468904, + "grad_norm": 0.5194411036484662, + "learning_rate": 1.9974533862634234e-05, + "loss": 0.3708, + "step": 536 + }, + { + "epoch": 0.6031166643268285, + "grad_norm": 0.5084933191055989, + "learning_rate": 1.9973971336320374e-05, + "loss": 0.3618, + "step": 537 + }, + { + "epoch": 0.6042397866067668, + "grad_norm": 0.500283964355109, + "learning_rate": 1.9973402672912196e-05, + "loss": 0.3773, + "step": 538 + }, + { + "epoch": 0.6053629088867051, + "grad_norm": 0.5043471751080929, + "learning_rate": 1.9972827872759598e-05, + "loss": 0.3706, + "step": 539 + }, + { + "epoch": 0.6064860311666432, + "grad_norm": 0.48051148948387623, + "learning_rate": 1.9972246936216268e-05, + "loss": 0.369, + "step": 540 + }, + { + "epoch": 0.6076091534465815, + "grad_norm": 0.5037545077388655, + "learning_rate": 1.9971659863639657e-05, + "loss": 0.3777, + "step": 541 + }, + { + "epoch": 0.6087322757265198, + "grad_norm": 0.5202846079968864, + "learning_rate": 1.9971066655390997e-05, + "loss": 0.3878, + "step": 542 + }, + { + "epoch": 0.6098553980064579, + "grad_norm": 0.5196509635056363, + "learning_rate": 1.99704673118353e-05, + "loss": 0.3786, + "step": 543 + }, + { + "epoch": 0.6109785202863962, + "grad_norm": 0.5100880924606735, + "learning_rate": 1.996986183334134e-05, + "loss": 0.3923, + "step": 544 + }, + { + "epoch": 0.6121016425663344, + "grad_norm": 0.5374754446994799, + "learning_rate": 1.9969250220281687e-05, + "loss": 0.4019, + "step": 545 + }, + { + "epoch": 0.6132247648462726, + "grad_norm": 0.5405136047123903, + "learning_rate": 1.9968632473032663e-05, + "loss": 0.3903, + "step": 546 + }, + { + "epoch": 0.6143478871262109, + "grad_norm": 0.5377137297729571, + "learning_rate": 1.996800859197438e-05, + "loss": 0.3847, + "step": 547 + }, + { + "epoch": 0.6154710094061491, + "grad_norm": 0.5280413079303611, + "learning_rate": 1.996737857749072e-05, + "loss": 0.3846, + "step": 548 + }, + { + "epoch": 0.6165941316860873, + "grad_norm": 0.4698879134163189, + "learning_rate": 1.996674242996933e-05, + "loss": 0.3624, + "step": 549 + }, + { + "epoch": 0.6177172539660255, + "grad_norm": 0.5693615931340346, + "learning_rate": 1.9966100149801648e-05, + "loss": 0.3809, + "step": 550 + }, + { + "epoch": 0.6188403762459638, + "grad_norm": 0.5390167928723439, + "learning_rate": 1.9965451737382872e-05, + "loss": 0.3944, + "step": 551 + }, + { + "epoch": 0.619963498525902, + "grad_norm": 0.4687981254942727, + "learning_rate": 1.9964797193111973e-05, + "loss": 0.3486, + "step": 552 + }, + { + "epoch": 0.6210866208058402, + "grad_norm": 0.6002959050862824, + "learning_rate": 1.9964136517391708e-05, + "loss": 0.427, + "step": 553 + }, + { + "epoch": 0.6222097430857785, + "grad_norm": 0.5424155203819168, + "learning_rate": 1.9963469710628592e-05, + "loss": 0.3864, + "step": 554 + }, + { + "epoch": 0.6233328653657166, + "grad_norm": 0.4805110497258522, + "learning_rate": 1.996279677323292e-05, + "loss": 0.3431, + "step": 555 + }, + { + "epoch": 0.6244559876456549, + "grad_norm": 0.5468592773183889, + "learning_rate": 1.9962117705618757e-05, + "loss": 0.3916, + "step": 556 + }, + { + "epoch": 0.6255791099255932, + "grad_norm": 0.5974551409729048, + "learning_rate": 1.9961432508203938e-05, + "loss": 0.3703, + "step": 557 + }, + { + "epoch": 0.6267022322055313, + "grad_norm": 0.46694957713893454, + "learning_rate": 1.996074118141008e-05, + "loss": 0.36, + "step": 558 + }, + { + "epoch": 0.6278253544854696, + "grad_norm": 0.5203301666877311, + "learning_rate": 1.9960043725662558e-05, + "loss": 0.3661, + "step": 559 + }, + { + "epoch": 0.6289484767654079, + "grad_norm": 0.5315196144666886, + "learning_rate": 1.995934014139053e-05, + "loss": 0.3878, + "step": 560 + }, + { + "epoch": 0.630071599045346, + "grad_norm": 0.4726474224175523, + "learning_rate": 1.9958630429026912e-05, + "loss": 0.3549, + "step": 561 + }, + { + "epoch": 0.6311947213252843, + "grad_norm": 0.5265877427590347, + "learning_rate": 1.9957914589008405e-05, + "loss": 0.372, + "step": 562 + }, + { + "epoch": 0.6323178436052225, + "grad_norm": 0.5108540554457581, + "learning_rate": 1.9957192621775466e-05, + "loss": 0.3709, + "step": 563 + }, + { + "epoch": 0.6334409658851607, + "grad_norm": 0.5290487085943824, + "learning_rate": 1.9956464527772334e-05, + "loss": 0.3884, + "step": 564 + }, + { + "epoch": 0.634564088165099, + "grad_norm": 0.5281270652784177, + "learning_rate": 1.9955730307447015e-05, + "loss": 0.3834, + "step": 565 + }, + { + "epoch": 0.6356872104450372, + "grad_norm": 0.5085482495756416, + "learning_rate": 1.9954989961251276e-05, + "loss": 0.3732, + "step": 566 + }, + { + "epoch": 0.6368103327249754, + "grad_norm": 0.4783417140693344, + "learning_rate": 1.9954243489640667e-05, + "loss": 0.3696, + "step": 567 + }, + { + "epoch": 0.6379334550049136, + "grad_norm": 0.5374355693068502, + "learning_rate": 1.995349089307449e-05, + "loss": 0.3755, + "step": 568 + }, + { + "epoch": 0.6390565772848519, + "grad_norm": 0.5020426636675567, + "learning_rate": 1.995273217201584e-05, + "loss": 0.3667, + "step": 569 + }, + { + "epoch": 0.6401796995647902, + "grad_norm": 0.48221702122344806, + "learning_rate": 1.995196732693156e-05, + "loss": 0.3669, + "step": 570 + }, + { + "epoch": 0.6413028218447283, + "grad_norm": 0.4948631937332312, + "learning_rate": 1.9951196358292266e-05, + "loss": 0.3815, + "step": 571 + }, + { + "epoch": 0.6424259441246666, + "grad_norm": 0.5149576535985988, + "learning_rate": 1.9950419266572344e-05, + "loss": 0.3748, + "step": 572 + }, + { + "epoch": 0.6435490664046049, + "grad_norm": 0.5077161706598761, + "learning_rate": 1.994963605224995e-05, + "loss": 0.3736, + "step": 573 + }, + { + "epoch": 0.644672188684543, + "grad_norm": 0.4930605863816736, + "learning_rate": 1.9948846715807e-05, + "loss": 0.3698, + "step": 574 + }, + { + "epoch": 0.6457953109644813, + "grad_norm": 0.514614667391616, + "learning_rate": 1.9948051257729184e-05, + "loss": 0.3736, + "step": 575 + }, + { + "epoch": 0.6469184332444194, + "grad_norm": 0.5176250628419765, + "learning_rate": 1.9947249678505956e-05, + "loss": 0.3693, + "step": 576 + }, + { + "epoch": 0.6480415555243577, + "grad_norm": 0.45913174362011855, + "learning_rate": 1.994644197863054e-05, + "loss": 0.3499, + "step": 577 + }, + { + "epoch": 0.649164677804296, + "grad_norm": 0.49883541686787625, + "learning_rate": 1.994562815859991e-05, + "loss": 0.362, + "step": 578 + }, + { + "epoch": 0.6502878000842341, + "grad_norm": 0.5146434352456531, + "learning_rate": 1.9944808218914833e-05, + "loss": 0.3906, + "step": 579 + }, + { + "epoch": 0.6514109223641724, + "grad_norm": 0.5066808040702572, + "learning_rate": 1.9943982160079823e-05, + "loss": 0.3853, + "step": 580 + }, + { + "epoch": 0.6525340446441106, + "grad_norm": 0.49848710491518, + "learning_rate": 1.9943149982603155e-05, + "loss": 0.3985, + "step": 581 + }, + { + "epoch": 0.6536571669240488, + "grad_norm": 0.4878251052204487, + "learning_rate": 1.994231168699689e-05, + "loss": 0.3714, + "step": 582 + }, + { + "epoch": 0.6547802892039871, + "grad_norm": 0.4758786623666332, + "learning_rate": 1.9941467273776832e-05, + "loss": 0.3946, + "step": 583 + }, + { + "epoch": 0.6559034114839253, + "grad_norm": 0.4983367896874346, + "learning_rate": 1.9940616743462557e-05, + "loss": 0.3722, + "step": 584 + }, + { + "epoch": 0.6570265337638636, + "grad_norm": 0.5075829148442156, + "learning_rate": 1.9939760096577408e-05, + "loss": 0.3843, + "step": 585 + }, + { + "epoch": 0.6581496560438018, + "grad_norm": 0.5301054416844904, + "learning_rate": 1.9938897333648492e-05, + "loss": 0.3997, + "step": 586 + }, + { + "epoch": 0.65927277832374, + "grad_norm": 0.5148031126628784, + "learning_rate": 1.9938028455206674e-05, + "loss": 0.3737, + "step": 587 + }, + { + "epoch": 0.6603959006036783, + "grad_norm": 0.46714827561279637, + "learning_rate": 1.9937153461786585e-05, + "loss": 0.3431, + "step": 588 + }, + { + "epoch": 0.6615190228836164, + "grad_norm": 0.5177398408210456, + "learning_rate": 1.9936272353926616e-05, + "loss": 0.3666, + "step": 589 + }, + { + "epoch": 0.6626421451635547, + "grad_norm": 0.47039452637574947, + "learning_rate": 1.993538513216892e-05, + "loss": 0.3635, + "step": 590 + }, + { + "epoch": 0.663765267443493, + "grad_norm": 0.5093614259546847, + "learning_rate": 1.9934491797059425e-05, + "loss": 0.3915, + "step": 591 + }, + { + "epoch": 0.6648883897234311, + "grad_norm": 0.5366679460711811, + "learning_rate": 1.99335923491478e-05, + "loss": 0.398, + "step": 592 + }, + { + "epoch": 0.6660115120033694, + "grad_norm": 0.5082350993851118, + "learning_rate": 1.993268678898749e-05, + "loss": 0.3938, + "step": 593 + }, + { + "epoch": 0.6671346342833075, + "grad_norm": 0.5182267099894416, + "learning_rate": 1.993177511713569e-05, + "loss": 0.4121, + "step": 594 + }, + { + "epoch": 0.6682577565632458, + "grad_norm": 0.49114322650286973, + "learning_rate": 1.9930857334153374e-05, + "loss": 0.3738, + "step": 595 + }, + { + "epoch": 0.6693808788431841, + "grad_norm": 0.47709228497356176, + "learning_rate": 1.992993344060525e-05, + "loss": 0.3767, + "step": 596 + }, + { + "epoch": 0.6705040011231223, + "grad_norm": 0.5108226641321751, + "learning_rate": 1.992900343705981e-05, + "loss": 0.3901, + "step": 597 + }, + { + "epoch": 0.6716271234030605, + "grad_norm": 0.5041216553802877, + "learning_rate": 1.9928067324089286e-05, + "loss": 0.3699, + "step": 598 + }, + { + "epoch": 0.6727502456829987, + "grad_norm": 0.4923581285784332, + "learning_rate": 1.9927125102269687e-05, + "loss": 0.3764, + "step": 599 + }, + { + "epoch": 0.673873367962937, + "grad_norm": 0.5387456019298923, + "learning_rate": 1.992617677218077e-05, + "loss": 0.3994, + "step": 600 + }, + { + "epoch": 0.6749964902428752, + "grad_norm": 0.5115844130946426, + "learning_rate": 1.992522233440605e-05, + "loss": 0.4016, + "step": 601 + }, + { + "epoch": 0.6761196125228134, + "grad_norm": 0.489549840063431, + "learning_rate": 1.992426178953281e-05, + "loss": 0.3861, + "step": 602 + }, + { + "epoch": 0.6772427348027517, + "grad_norm": 0.48352200149034974, + "learning_rate": 1.9923295138152076e-05, + "loss": 0.3501, + "step": 603 + }, + { + "epoch": 0.6783658570826899, + "grad_norm": 0.5003180468080372, + "learning_rate": 1.992232238085864e-05, + "loss": 0.3927, + "step": 604 + }, + { + "epoch": 0.6794889793626281, + "grad_norm": 0.46529290603039347, + "learning_rate": 1.9921343518251057e-05, + "loss": 0.3613, + "step": 605 + }, + { + "epoch": 0.6806121016425664, + "grad_norm": 0.48009299538190453, + "learning_rate": 1.9920358550931627e-05, + "loss": 0.3692, + "step": 606 + }, + { + "epoch": 0.6817352239225045, + "grad_norm": 0.4965554167186214, + "learning_rate": 1.9919367479506413e-05, + "loss": 0.3629, + "step": 607 + }, + { + "epoch": 0.6828583462024428, + "grad_norm": 0.48758593884923196, + "learning_rate": 1.9918370304585228e-05, + "loss": 0.3583, + "step": 608 + }, + { + "epoch": 0.6839814684823811, + "grad_norm": 0.5358513727384278, + "learning_rate": 1.991736702678165e-05, + "loss": 0.3916, + "step": 609 + }, + { + "epoch": 0.6851045907623192, + "grad_norm": 0.4867798988879104, + "learning_rate": 1.9916357646713006e-05, + "loss": 0.3452, + "step": 610 + }, + { + "epoch": 0.6862277130422575, + "grad_norm": 0.5325347519332022, + "learning_rate": 1.9915342165000375e-05, + "loss": 0.403, + "step": 611 + }, + { + "epoch": 0.6873508353221957, + "grad_norm": 0.48855001583167373, + "learning_rate": 1.99143205822686e-05, + "loss": 0.3791, + "step": 612 + }, + { + "epoch": 0.6884739576021339, + "grad_norm": 0.47712559355490186, + "learning_rate": 1.9913292899146262e-05, + "loss": 0.3626, + "step": 613 + }, + { + "epoch": 0.6895970798820722, + "grad_norm": 0.537623122561854, + "learning_rate": 1.9912259116265718e-05, + "loss": 0.3769, + "step": 614 + }, + { + "epoch": 0.6907202021620104, + "grad_norm": 0.4613452971894331, + "learning_rate": 1.991121923426306e-05, + "loss": 0.3562, + "step": 615 + }, + { + "epoch": 0.6918433244419486, + "grad_norm": 0.5176397056990323, + "learning_rate": 1.9910173253778136e-05, + "loss": 0.3937, + "step": 616 + }, + { + "epoch": 0.6929664467218869, + "grad_norm": 0.502108110386917, + "learning_rate": 1.9909121175454553e-05, + "loss": 0.3852, + "step": 617 + }, + { + "epoch": 0.6940895690018251, + "grad_norm": 0.46871949558914766, + "learning_rate": 1.9908062999939666e-05, + "loss": 0.3615, + "step": 618 + }, + { + "epoch": 0.6952126912817633, + "grad_norm": 0.49710099802582, + "learning_rate": 1.9906998727884582e-05, + "loss": 0.3624, + "step": 619 + }, + { + "epoch": 0.6963358135617015, + "grad_norm": 0.5132520818715471, + "learning_rate": 1.990592835994416e-05, + "loss": 0.381, + "step": 620 + }, + { + "epoch": 0.6974589358416398, + "grad_norm": 0.47566380028639976, + "learning_rate": 1.990485189677701e-05, + "loss": 0.391, + "step": 621 + }, + { + "epoch": 0.698582058121578, + "grad_norm": 0.47811544132679007, + "learning_rate": 1.990376933904549e-05, + "loss": 0.3662, + "step": 622 + }, + { + "epoch": 0.6997051804015162, + "grad_norm": 0.5120337787106785, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.3731, + "step": 623 + }, + { + "epoch": 0.7008283026814545, + "grad_norm": 0.5300963176142912, + "learning_rate": 1.990158594255752e-05, + "loss": 0.3679, + "step": 624 + }, + { + "epoch": 0.7019514249613926, + "grad_norm": 0.47924641278798885, + "learning_rate": 1.9900485105144544e-05, + "loss": 0.3589, + "step": 625 + }, + { + "epoch": 0.7030745472413309, + "grad_norm": 0.5270278691730991, + "learning_rate": 1.9899378175854134e-05, + "loss": 0.3921, + "step": 626 + }, + { + "epoch": 0.7041976695212692, + "grad_norm": 0.5179981618980336, + "learning_rate": 1.9898265155367394e-05, + "loss": 0.4171, + "step": 627 + }, + { + "epoch": 0.7053207918012073, + "grad_norm": 0.5055637943014714, + "learning_rate": 1.9897146044369177e-05, + "loss": 0.3733, + "step": 628 + }, + { + "epoch": 0.7064439140811456, + "grad_norm": 0.5349952912388607, + "learning_rate": 1.9896020843548086e-05, + "loss": 0.3912, + "step": 629 + }, + { + "epoch": 0.7075670363610839, + "grad_norm": 0.47936450848582013, + "learning_rate": 1.989488955359647e-05, + "loss": 0.3461, + "step": 630 + }, + { + "epoch": 0.708690158641022, + "grad_norm": 0.5229256593874981, + "learning_rate": 1.9893752175210423e-05, + "loss": 0.3835, + "step": 631 + }, + { + "epoch": 0.7098132809209603, + "grad_norm": 0.5517790504856993, + "learning_rate": 1.9892608709089788e-05, + "loss": 0.379, + "step": 632 + }, + { + "epoch": 0.7109364032008985, + "grad_norm": 0.5135986709269499, + "learning_rate": 1.9891459155938148e-05, + "loss": 0.3938, + "step": 633 + }, + { + "epoch": 0.7120595254808367, + "grad_norm": 0.5235231286661839, + "learning_rate": 1.9890303516462842e-05, + "loss": 0.4047, + "step": 634 + }, + { + "epoch": 0.713182647760775, + "grad_norm": 0.5230019091855481, + "learning_rate": 1.9889141791374942e-05, + "loss": 0.3772, + "step": 635 + }, + { + "epoch": 0.7143057700407132, + "grad_norm": 0.5086902065079811, + "learning_rate": 1.9887973981389276e-05, + "loss": 0.3789, + "step": 636 + }, + { + "epoch": 0.7154288923206514, + "grad_norm": 0.45834248146994216, + "learning_rate": 1.9886800087224404e-05, + "loss": 0.3511, + "step": 637 + }, + { + "epoch": 0.7165520146005896, + "grad_norm": 0.5193295159932503, + "learning_rate": 1.9885620109602637e-05, + "loss": 0.3777, + "step": 638 + }, + { + "epoch": 0.7176751368805279, + "grad_norm": 0.4751465804458114, + "learning_rate": 1.9884434049250038e-05, + "loss": 0.3747, + "step": 639 + }, + { + "epoch": 0.7187982591604661, + "grad_norm": 0.47889259165907166, + "learning_rate": 1.988324190689639e-05, + "loss": 0.3564, + "step": 640 + }, + { + "epoch": 0.7199213814404043, + "grad_norm": 0.4504694191651383, + "learning_rate": 1.9882043683275235e-05, + "loss": 0.3569, + "step": 641 + }, + { + "epoch": 0.7210445037203426, + "grad_norm": 0.48131939806970797, + "learning_rate": 1.9880839379123854e-05, + "loss": 0.3775, + "step": 642 + }, + { + "epoch": 0.7221676260002807, + "grad_norm": 0.5054764749704692, + "learning_rate": 1.9879628995183274e-05, + "loss": 0.3804, + "step": 643 + }, + { + "epoch": 0.723290748280219, + "grad_norm": 0.49166280486529856, + "learning_rate": 1.987841253219825e-05, + "loss": 0.3846, + "step": 644 + }, + { + "epoch": 0.7244138705601573, + "grad_norm": 0.4729734631200996, + "learning_rate": 1.987718999091729e-05, + "loss": 0.3721, + "step": 645 + }, + { + "epoch": 0.7255369928400954, + "grad_norm": 0.5129501870416402, + "learning_rate": 1.987596137209263e-05, + "loss": 0.3656, + "step": 646 + }, + { + "epoch": 0.7266601151200337, + "grad_norm": 0.5182252271285124, + "learning_rate": 1.9874726676480264e-05, + "loss": 0.4007, + "step": 647 + }, + { + "epoch": 0.727783237399972, + "grad_norm": 0.5183328972199854, + "learning_rate": 1.98734859048399e-05, + "loss": 0.3853, + "step": 648 + }, + { + "epoch": 0.7289063596799101, + "grad_norm": 0.48679212429312807, + "learning_rate": 1.9872239057935013e-05, + "loss": 0.3615, + "step": 649 + }, + { + "epoch": 0.7300294819598484, + "grad_norm": 0.4599807442751718, + "learning_rate": 1.9870986136532792e-05, + "loss": 0.3447, + "step": 650 + }, + { + "epoch": 0.7311526042397866, + "grad_norm": 0.48761341111368395, + "learning_rate": 1.986972714140418e-05, + "loss": 0.3487, + "step": 651 + }, + { + "epoch": 0.7322757265197248, + "grad_norm": 0.5019293541443021, + "learning_rate": 1.986846207332384e-05, + "loss": 0.378, + "step": 652 + }, + { + "epoch": 0.7333988487996631, + "grad_norm": 0.507765982179853, + "learning_rate": 1.986719093307019e-05, + "loss": 0.3653, + "step": 653 + }, + { + "epoch": 0.7345219710796013, + "grad_norm": 0.48272271090692315, + "learning_rate": 1.9865913721425376e-05, + "loss": 0.3542, + "step": 654 + }, + { + "epoch": 0.7356450933595395, + "grad_norm": 0.5175822423440689, + "learning_rate": 1.9864630439175282e-05, + "loss": 0.366, + "step": 655 + }, + { + "epoch": 0.7367682156394777, + "grad_norm": 0.5289074782303135, + "learning_rate": 1.986334108710952e-05, + "loss": 0.3817, + "step": 656 + }, + { + "epoch": 0.737891337919416, + "grad_norm": 0.4861617055976239, + "learning_rate": 1.9862045666021448e-05, + "loss": 0.3752, + "step": 657 + }, + { + "epoch": 0.7390144601993542, + "grad_norm": 0.4957975445225569, + "learning_rate": 1.986074417670815e-05, + "loss": 0.3588, + "step": 658 + }, + { + "epoch": 0.7401375824792924, + "grad_norm": 0.4948274783820183, + "learning_rate": 1.9859436619970448e-05, + "loss": 0.3854, + "step": 659 + }, + { + "epoch": 0.7412607047592307, + "grad_norm": 0.4948086029632615, + "learning_rate": 1.9858122996612897e-05, + "loss": 0.3778, + "step": 660 + }, + { + "epoch": 0.7423838270391689, + "grad_norm": 0.5091232713262371, + "learning_rate": 1.9856803307443782e-05, + "loss": 0.3943, + "step": 661 + }, + { + "epoch": 0.7435069493191071, + "grad_norm": 0.47932797892770085, + "learning_rate": 1.985547755327512e-05, + "loss": 0.3608, + "step": 662 + }, + { + "epoch": 0.7446300715990454, + "grad_norm": 0.48396806021455513, + "learning_rate": 1.9854145734922668e-05, + "loss": 0.3691, + "step": 663 + }, + { + "epoch": 0.7457531938789835, + "grad_norm": 0.48291497456525306, + "learning_rate": 1.9852807853205903e-05, + "loss": 0.3544, + "step": 664 + }, + { + "epoch": 0.7468763161589218, + "grad_norm": 0.527750629448579, + "learning_rate": 1.985146390894804e-05, + "loss": 0.3795, + "step": 665 + }, + { + "epoch": 0.7479994384388601, + "grad_norm": 0.49379054725765187, + "learning_rate": 1.9850113902976025e-05, + "loss": 0.3595, + "step": 666 + }, + { + "epoch": 0.7491225607187982, + "grad_norm": 0.5043416333342748, + "learning_rate": 1.9848757836120528e-05, + "loss": 0.3627, + "step": 667 + }, + { + "epoch": 0.7502456829987365, + "grad_norm": 0.49989404508934815, + "learning_rate": 1.984739570921596e-05, + "loss": 0.3515, + "step": 668 + }, + { + "epoch": 0.7513688052786747, + "grad_norm": 0.508418215461715, + "learning_rate": 1.984602752310044e-05, + "loss": 0.3601, + "step": 669 + }, + { + "epoch": 0.7524919275586129, + "grad_norm": 0.5189115847179923, + "learning_rate": 1.9844653278615836e-05, + "loss": 0.3865, + "step": 670 + }, + { + "epoch": 0.7536150498385512, + "grad_norm": 0.49519160344351704, + "learning_rate": 1.9843272976607735e-05, + "loss": 0.3789, + "step": 671 + }, + { + "epoch": 0.7547381721184894, + "grad_norm": 0.5125455405404472, + "learning_rate": 1.984188661792545e-05, + "loss": 0.3902, + "step": 672 + }, + { + "epoch": 0.7558612943984276, + "grad_norm": 0.46902819064744006, + "learning_rate": 1.9840494203422024e-05, + "loss": 0.3562, + "step": 673 + }, + { + "epoch": 0.7569844166783659, + "grad_norm": 0.4712682187918327, + "learning_rate": 1.9839095733954226e-05, + "loss": 0.3724, + "step": 674 + }, + { + "epoch": 0.7581075389583041, + "grad_norm": 0.5083096985128949, + "learning_rate": 1.9837691210382547e-05, + "loss": 0.3945, + "step": 675 + }, + { + "epoch": 0.7592306612382423, + "grad_norm": 0.4756841491353359, + "learning_rate": 1.9836280633571206e-05, + "loss": 0.3964, + "step": 676 + }, + { + "epoch": 0.7603537835181805, + "grad_norm": 0.5229455215501784, + "learning_rate": 1.9834864004388153e-05, + "loss": 0.3925, + "step": 677 + }, + { + "epoch": 0.7614769057981188, + "grad_norm": 0.490426080752418, + "learning_rate": 1.9833441323705046e-05, + "loss": 0.3608, + "step": 678 + }, + { + "epoch": 0.762600028078057, + "grad_norm": 0.46495132814690193, + "learning_rate": 1.983201259239728e-05, + "loss": 0.3733, + "step": 679 + }, + { + "epoch": 0.7637231503579952, + "grad_norm": 0.5047763095575648, + "learning_rate": 1.9830577811343973e-05, + "loss": 0.3618, + "step": 680 + }, + { + "epoch": 0.7648462726379335, + "grad_norm": 0.5216773678215171, + "learning_rate": 1.9829136981427953e-05, + "loss": 0.4084, + "step": 681 + }, + { + "epoch": 0.7659693949178716, + "grad_norm": 0.48638773567106564, + "learning_rate": 1.9827690103535787e-05, + "loss": 0.3539, + "step": 682 + }, + { + "epoch": 0.7670925171978099, + "grad_norm": 0.44873074839468674, + "learning_rate": 1.9826237178557752e-05, + "loss": 0.3502, + "step": 683 + }, + { + "epoch": 0.7682156394777482, + "grad_norm": 0.48611782864710706, + "learning_rate": 1.982477820738785e-05, + "loss": 0.3755, + "step": 684 + }, + { + "epoch": 0.7693387617576863, + "grad_norm": 0.5308709710691715, + "learning_rate": 1.9823313190923797e-05, + "loss": 0.3836, + "step": 685 + }, + { + "epoch": 0.7704618840376246, + "grad_norm": 0.49106310483861326, + "learning_rate": 1.9821842130067034e-05, + "loss": 0.3525, + "step": 686 + }, + { + "epoch": 0.7715850063175628, + "grad_norm": 0.49027171214223086, + "learning_rate": 1.982036502572273e-05, + "loss": 0.3929, + "step": 687 + }, + { + "epoch": 0.772708128597501, + "grad_norm": 0.5040180030072051, + "learning_rate": 1.9818881878799755e-05, + "loss": 0.3671, + "step": 688 + }, + { + "epoch": 0.7738312508774393, + "grad_norm": 0.4673216659840785, + "learning_rate": 1.9817392690210705e-05, + "loss": 0.3687, + "step": 689 + }, + { + "epoch": 0.7749543731573775, + "grad_norm": 0.5310710286528179, + "learning_rate": 1.98158974608719e-05, + "loss": 0.3694, + "step": 690 + }, + { + "epoch": 0.7760774954373157, + "grad_norm": 0.47994947051344194, + "learning_rate": 1.981439619170337e-05, + "loss": 0.3556, + "step": 691 + }, + { + "epoch": 0.777200617717254, + "grad_norm": 0.5463444744890577, + "learning_rate": 1.981288888362886e-05, + "loss": 0.3775, + "step": 692 + }, + { + "epoch": 0.7783237399971922, + "grad_norm": 0.4724849513305692, + "learning_rate": 1.981137553757583e-05, + "loss": 0.373, + "step": 693 + }, + { + "epoch": 0.7794468622771304, + "grad_norm": 0.4833836898450605, + "learning_rate": 1.9809856154475463e-05, + "loss": 0.3602, + "step": 694 + }, + { + "epoch": 0.7805699845570686, + "grad_norm": 0.44065803236999307, + "learning_rate": 1.9808330735262657e-05, + "loss": 0.3323, + "step": 695 + }, + { + "epoch": 0.7816931068370069, + "grad_norm": 0.5062777136832733, + "learning_rate": 1.980679928087601e-05, + "loss": 0.3889, + "step": 696 + }, + { + "epoch": 0.7828162291169452, + "grad_norm": 0.4390743377796556, + "learning_rate": 1.980526179225785e-05, + "loss": 0.3619, + "step": 697 + }, + { + "epoch": 0.7839393513968833, + "grad_norm": 0.5039788856880819, + "learning_rate": 1.98037182703542e-05, + "loss": 0.3932, + "step": 698 + }, + { + "epoch": 0.7850624736768216, + "grad_norm": 0.4645613871607041, + "learning_rate": 1.9802168716114817e-05, + "loss": 0.3466, + "step": 699 + }, + { + "epoch": 0.7861855959567597, + "grad_norm": 0.48869710859846344, + "learning_rate": 1.9800613130493158e-05, + "loss": 0.3882, + "step": 700 + }, + { + "epoch": 0.787308718236698, + "grad_norm": 0.46252538633492307, + "learning_rate": 1.9799051514446383e-05, + "loss": 0.3731, + "step": 701 + }, + { + "epoch": 0.7884318405166363, + "grad_norm": 0.475702182268912, + "learning_rate": 1.9797483868935385e-05, + "loss": 0.3535, + "step": 702 + }, + { + "epoch": 0.7895549627965744, + "grad_norm": 0.4469783800160413, + "learning_rate": 1.979591019492474e-05, + "loss": 0.3448, + "step": 703 + }, + { + "epoch": 0.7906780850765127, + "grad_norm": 0.4513038149897409, + "learning_rate": 1.9794330493382753e-05, + "loss": 0.3749, + "step": 704 + }, + { + "epoch": 0.791801207356451, + "grad_norm": 0.48022563808130214, + "learning_rate": 1.9792744765281435e-05, + "loss": 0.373, + "step": 705 + }, + { + "epoch": 0.7929243296363891, + "grad_norm": 0.4482413723061531, + "learning_rate": 1.9791153011596497e-05, + "loss": 0.3506, + "step": 706 + }, + { + "epoch": 0.7940474519163274, + "grad_norm": 0.4520233506526563, + "learning_rate": 1.9789555233307363e-05, + "loss": 0.359, + "step": 707 + }, + { + "epoch": 0.7951705741962656, + "grad_norm": 0.4674282940663872, + "learning_rate": 1.9787951431397167e-05, + "loss": 0.3851, + "step": 708 + }, + { + "epoch": 0.7962936964762038, + "grad_norm": 0.4584028901339973, + "learning_rate": 1.9786341606852743e-05, + "loss": 0.3501, + "step": 709 + }, + { + "epoch": 0.7974168187561421, + "grad_norm": 0.4473101801633832, + "learning_rate": 1.9784725760664632e-05, + "loss": 0.368, + "step": 710 + }, + { + "epoch": 0.7985399410360803, + "grad_norm": 0.4657336852380225, + "learning_rate": 1.9783103893827088e-05, + "loss": 0.3558, + "step": 711 + }, + { + "epoch": 0.7996630633160186, + "grad_norm": 0.45906042695952914, + "learning_rate": 1.9781476007338058e-05, + "loss": 0.3599, + "step": 712 + }, + { + "epoch": 0.8007861855959567, + "grad_norm": 0.46033366175628637, + "learning_rate": 1.97798421021992e-05, + "loss": 0.3679, + "step": 713 + }, + { + "epoch": 0.801909307875895, + "grad_norm": 0.4911730826448443, + "learning_rate": 1.9778202179415875e-05, + "loss": 0.3872, + "step": 714 + }, + { + "epoch": 0.8030324301558333, + "grad_norm": 0.463483660266614, + "learning_rate": 1.9776556239997146e-05, + "loss": 0.3758, + "step": 715 + }, + { + "epoch": 0.8041555524357714, + "grad_norm": 0.5063628630954315, + "learning_rate": 1.9774904284955775e-05, + "loss": 0.3824, + "step": 716 + }, + { + "epoch": 0.8052786747157097, + "grad_norm": 0.5154742655615026, + "learning_rate": 1.9773246315308226e-05, + "loss": 0.3527, + "step": 717 + }, + { + "epoch": 0.8064017969956478, + "grad_norm": 0.4630501893780488, + "learning_rate": 1.9771582332074676e-05, + "loss": 0.3875, + "step": 718 + }, + { + "epoch": 0.8075249192755861, + "grad_norm": 0.49263104968044724, + "learning_rate": 1.9769912336278983e-05, + "loss": 0.3934, + "step": 719 + }, + { + "epoch": 0.8086480415555244, + "grad_norm": 0.4672454867734837, + "learning_rate": 1.9768236328948718e-05, + "loss": 0.3647, + "step": 720 + }, + { + "epoch": 0.8097711638354625, + "grad_norm": 0.4920760592750608, + "learning_rate": 1.9766554311115143e-05, + "loss": 0.365, + "step": 721 + }, + { + "epoch": 0.8108942861154008, + "grad_norm": 0.4647800245178615, + "learning_rate": 1.9764866283813224e-05, + "loss": 0.3753, + "step": 722 + }, + { + "epoch": 0.8120174083953391, + "grad_norm": 0.4411909985899985, + "learning_rate": 1.9763172248081623e-05, + "loss": 0.3604, + "step": 723 + }, + { + "epoch": 0.8131405306752773, + "grad_norm": 0.4577218647324814, + "learning_rate": 1.97614722049627e-05, + "loss": 0.3489, + "step": 724 + }, + { + "epoch": 0.8142636529552155, + "grad_norm": 0.48915076154791487, + "learning_rate": 1.975976615550251e-05, + "loss": 0.3687, + "step": 725 + }, + { + "epoch": 0.8153867752351537, + "grad_norm": 0.47499359925278556, + "learning_rate": 1.9758054100750802e-05, + "loss": 0.3724, + "step": 726 + }, + { + "epoch": 0.816509897515092, + "grad_norm": 0.4869359292315476, + "learning_rate": 1.9756336041761023e-05, + "loss": 0.3708, + "step": 727 + }, + { + "epoch": 0.8176330197950302, + "grad_norm": 0.5178982061022716, + "learning_rate": 1.9754611979590313e-05, + "loss": 0.3755, + "step": 728 + }, + { + "epoch": 0.8187561420749684, + "grad_norm": 0.5664792802714854, + "learning_rate": 1.9752881915299507e-05, + "loss": 0.3925, + "step": 729 + }, + { + "epoch": 0.8198792643549067, + "grad_norm": 0.5059697408626879, + "learning_rate": 1.9751145849953135e-05, + "loss": 0.3806, + "step": 730 + }, + { + "epoch": 0.8210023866348448, + "grad_norm": 0.5546342735387513, + "learning_rate": 1.9749403784619415e-05, + "loss": 0.3911, + "step": 731 + }, + { + "epoch": 0.8221255089147831, + "grad_norm": 0.5151210704909763, + "learning_rate": 1.9747655720370257e-05, + "loss": 0.383, + "step": 732 + }, + { + "epoch": 0.8232486311947214, + "grad_norm": 0.5218588159099736, + "learning_rate": 1.9745901658281267e-05, + "loss": 0.3831, + "step": 733 + }, + { + "epoch": 0.8243717534746595, + "grad_norm": 0.5275710652132647, + "learning_rate": 1.9744141599431737e-05, + "loss": 0.3626, + "step": 734 + }, + { + "epoch": 0.8254948757545978, + "grad_norm": 0.5597253504535993, + "learning_rate": 1.9742375544904654e-05, + "loss": 0.372, + "step": 735 + }, + { + "epoch": 0.8266179980345361, + "grad_norm": 0.5603047467724314, + "learning_rate": 1.9740603495786687e-05, + "loss": 0.3842, + "step": 736 + }, + { + "epoch": 0.8277411203144742, + "grad_norm": 0.4804454455092537, + "learning_rate": 1.97388254531682e-05, + "loss": 0.371, + "step": 737 + }, + { + "epoch": 0.8288642425944125, + "grad_norm": 0.49610624738284403, + "learning_rate": 1.9737041418143235e-05, + "loss": 0.3692, + "step": 738 + }, + { + "epoch": 0.8299873648743507, + "grad_norm": 0.5248533454817134, + "learning_rate": 1.973525139180954e-05, + "loss": 0.4152, + "step": 739 + }, + { + "epoch": 0.8311104871542889, + "grad_norm": 0.470530311658482, + "learning_rate": 1.9733455375268532e-05, + "loss": 0.3856, + "step": 740 + }, + { + "epoch": 0.8322336094342272, + "grad_norm": 0.46958211569658714, + "learning_rate": 1.9731653369625318e-05, + "loss": 0.3817, + "step": 741 + }, + { + "epoch": 0.8333567317141654, + "grad_norm": 0.5020551519019959, + "learning_rate": 1.9729845375988694e-05, + "loss": 0.393, + "step": 742 + }, + { + "epoch": 0.8344798539941036, + "grad_norm": 0.4740449984336523, + "learning_rate": 1.9728031395471138e-05, + "loss": 0.3808, + "step": 743 + }, + { + "epoch": 0.8356029762740418, + "grad_norm": 0.49152487880677487, + "learning_rate": 1.972621142918881e-05, + "loss": 0.3827, + "step": 744 + }, + { + "epoch": 0.8367260985539801, + "grad_norm": 0.5170747001231172, + "learning_rate": 1.972438547826156e-05, + "loss": 0.3909, + "step": 745 + }, + { + "epoch": 0.8378492208339183, + "grad_norm": 0.48953966689610007, + "learning_rate": 1.972255354381291e-05, + "loss": 0.3816, + "step": 746 + }, + { + "epoch": 0.8389723431138565, + "grad_norm": 0.45475300416733155, + "learning_rate": 1.9720715626970078e-05, + "loss": 0.3572, + "step": 747 + }, + { + "epoch": 0.8400954653937948, + "grad_norm": 0.526527914204446, + "learning_rate": 1.9718871728863946e-05, + "loss": 0.3707, + "step": 748 + }, + { + "epoch": 0.841218587673733, + "grad_norm": 0.47359041114522704, + "learning_rate": 1.971702185062909e-05, + "loss": 0.3832, + "step": 749 + }, + { + "epoch": 0.8423417099536712, + "grad_norm": 0.5066230520321908, + "learning_rate": 1.9715165993403756e-05, + "loss": 0.3689, + "step": 750 + }, + { + "epoch": 0.8434648322336095, + "grad_norm": 0.5222356669126356, + "learning_rate": 1.9713304158329873e-05, + "loss": 0.3761, + "step": 751 + }, + { + "epoch": 0.8445879545135476, + "grad_norm": 0.47850315807515664, + "learning_rate": 1.9711436346553055e-05, + "loss": 0.3765, + "step": 752 + }, + { + "epoch": 0.8457110767934859, + "grad_norm": 0.5204791826622412, + "learning_rate": 1.9709562559222585e-05, + "loss": 0.4, + "step": 753 + }, + { + "epoch": 0.8468341990734242, + "grad_norm": 0.5207156559730746, + "learning_rate": 1.970768279749142e-05, + "loss": 0.3838, + "step": 754 + }, + { + "epoch": 0.8479573213533623, + "grad_norm": 0.48686440349858606, + "learning_rate": 1.9705797062516204e-05, + "loss": 0.3806, + "step": 755 + }, + { + "epoch": 0.8490804436333006, + "grad_norm": 0.5009212386994344, + "learning_rate": 1.9703905355457244e-05, + "loss": 0.3766, + "step": 756 + }, + { + "epoch": 0.8502035659132388, + "grad_norm": 0.4823384548317993, + "learning_rate": 1.9702007677478535e-05, + "loss": 0.3598, + "step": 757 + }, + { + "epoch": 0.851326688193177, + "grad_norm": 0.5038461477537077, + "learning_rate": 1.9700104029747735e-05, + "loss": 0.3706, + "step": 758 + }, + { + "epoch": 0.8524498104731153, + "grad_norm": 0.499367218377909, + "learning_rate": 1.9698194413436186e-05, + "loss": 0.3931, + "step": 759 + }, + { + "epoch": 0.8535729327530535, + "grad_norm": 0.4558139088156976, + "learning_rate": 1.9696278829718882e-05, + "loss": 0.3661, + "step": 760 + }, + { + "epoch": 0.8546960550329917, + "grad_norm": 0.49796966194709463, + "learning_rate": 1.9694357279774516e-05, + "loss": 0.3762, + "step": 761 + }, + { + "epoch": 0.8558191773129299, + "grad_norm": 0.47539394574011445, + "learning_rate": 1.9692429764785436e-05, + "loss": 0.3686, + "step": 762 + }, + { + "epoch": 0.8569422995928682, + "grad_norm": 0.4631634576367207, + "learning_rate": 1.969049628593766e-05, + "loss": 0.3765, + "step": 763 + }, + { + "epoch": 0.8580654218728064, + "grad_norm": 0.4622717685068893, + "learning_rate": 1.9688556844420877e-05, + "loss": 0.3819, + "step": 764 + }, + { + "epoch": 0.8591885441527446, + "grad_norm": 0.46303379442069725, + "learning_rate": 1.9686611441428455e-05, + "loss": 0.3417, + "step": 765 + }, + { + "epoch": 0.8603116664326829, + "grad_norm": 0.48471273461639347, + "learning_rate": 1.968466007815741e-05, + "loss": 0.3832, + "step": 766 + }, + { + "epoch": 0.8614347887126211, + "grad_norm": 0.4955079045703151, + "learning_rate": 1.968270275580845e-05, + "loss": 0.3973, + "step": 767 + }, + { + "epoch": 0.8625579109925593, + "grad_norm": 0.46897950183267123, + "learning_rate": 1.968073947558593e-05, + "loss": 0.3577, + "step": 768 + }, + { + "epoch": 0.8636810332724976, + "grad_norm": 0.49932189730734644, + "learning_rate": 1.9678770238697876e-05, + "loss": 0.3828, + "step": 769 + }, + { + "epoch": 0.8648041555524357, + "grad_norm": 0.4385490040536742, + "learning_rate": 1.9676795046355986e-05, + "loss": 0.3424, + "step": 770 + }, + { + "epoch": 0.865927277832374, + "grad_norm": 0.44172765516598467, + "learning_rate": 1.9674813899775614e-05, + "loss": 0.3504, + "step": 771 + }, + { + "epoch": 0.8670504001123123, + "grad_norm": 0.4925607529971466, + "learning_rate": 1.9672826800175786e-05, + "loss": 0.3712, + "step": 772 + }, + { + "epoch": 0.8681735223922504, + "grad_norm": 0.4703364651049239, + "learning_rate": 1.967083374877918e-05, + "loss": 0.3953, + "step": 773 + }, + { + "epoch": 0.8692966446721887, + "grad_norm": 0.4744439366917524, + "learning_rate": 1.9668834746812148e-05, + "loss": 0.3789, + "step": 774 + }, + { + "epoch": 0.8704197669521269, + "grad_norm": 0.4821724265365719, + "learning_rate": 1.9666829795504693e-05, + "loss": 0.3801, + "step": 775 + }, + { + "epoch": 0.8715428892320651, + "grad_norm": 0.48264988540719894, + "learning_rate": 1.966481889609049e-05, + "loss": 0.3774, + "step": 776 + }, + { + "epoch": 0.8726660115120034, + "grad_norm": 0.4694424121246419, + "learning_rate": 1.9662802049806862e-05, + "loss": 0.3574, + "step": 777 + }, + { + "epoch": 0.8737891337919416, + "grad_norm": 0.47288161574228044, + "learning_rate": 1.96607792578948e-05, + "loss": 0.3654, + "step": 778 + }, + { + "epoch": 0.8749122560718798, + "grad_norm": 0.4556516097283803, + "learning_rate": 1.9658750521598944e-05, + "loss": 0.3557, + "step": 779 + }, + { + "epoch": 0.8760353783518181, + "grad_norm": 0.42490236766792466, + "learning_rate": 1.9656715842167606e-05, + "loss": 0.3504, + "step": 780 + }, + { + "epoch": 0.8771585006317563, + "grad_norm": 0.493236263957415, + "learning_rate": 1.965467522085274e-05, + "loss": 0.377, + "step": 781 + }, + { + "epoch": 0.8782816229116945, + "grad_norm": 0.4836659716972491, + "learning_rate": 1.9652628658909968e-05, + "loss": 0.3731, + "step": 782 + }, + { + "epoch": 0.8794047451916327, + "grad_norm": 0.46192080514474665, + "learning_rate": 1.965057615759856e-05, + "loss": 0.3961, + "step": 783 + }, + { + "epoch": 0.880527867471571, + "grad_norm": 0.48934298480858224, + "learning_rate": 1.964851771818144e-05, + "loss": 0.35, + "step": 784 + }, + { + "epoch": 0.8816509897515092, + "grad_norm": 0.46465029854890294, + "learning_rate": 1.9646453341925195e-05, + "loss": 0.372, + "step": 785 + }, + { + "epoch": 0.8827741120314474, + "grad_norm": 0.4560535944639423, + "learning_rate": 1.9644383030100052e-05, + "loss": 0.3499, + "step": 786 + }, + { + "epoch": 0.8838972343113857, + "grad_norm": 0.4472553278120461, + "learning_rate": 1.9642306783979902e-05, + "loss": 0.3563, + "step": 787 + }, + { + "epoch": 0.8850203565913238, + "grad_norm": 0.4728183298345325, + "learning_rate": 1.964022460484227e-05, + "loss": 0.3915, + "step": 788 + }, + { + "epoch": 0.8861434788712621, + "grad_norm": 0.49042138391881734, + "learning_rate": 1.963813649396836e-05, + "loss": 0.4013, + "step": 789 + }, + { + "epoch": 0.8872666011512004, + "grad_norm": 0.44584377009424664, + "learning_rate": 1.9636042452643004e-05, + "loss": 0.3408, + "step": 790 + }, + { + "epoch": 0.8883897234311385, + "grad_norm": 0.4725306653731623, + "learning_rate": 1.9633942482154684e-05, + "loss": 0.3786, + "step": 791 + }, + { + "epoch": 0.8895128457110768, + "grad_norm": 0.4614739050205394, + "learning_rate": 1.9631836583795537e-05, + "loss": 0.3546, + "step": 792 + }, + { + "epoch": 0.8906359679910151, + "grad_norm": 0.5062498553591016, + "learning_rate": 1.962972475886135e-05, + "loss": 0.379, + "step": 793 + }, + { + "epoch": 0.8917590902709532, + "grad_norm": 0.44137610647370507, + "learning_rate": 1.9627607008651544e-05, + "loss": 0.3719, + "step": 794 + }, + { + "epoch": 0.8928822125508915, + "grad_norm": 0.4748874875427979, + "learning_rate": 1.9625483334469198e-05, + "loss": 0.3944, + "step": 795 + }, + { + "epoch": 0.8940053348308297, + "grad_norm": 0.4105278757488109, + "learning_rate": 1.9623353737621035e-05, + "loss": 0.3282, + "step": 796 + }, + { + "epoch": 0.8951284571107679, + "grad_norm": 0.533115865046379, + "learning_rate": 1.962121821941742e-05, + "loss": 0.3806, + "step": 797 + }, + { + "epoch": 0.8962515793907062, + "grad_norm": 0.4641263305725229, + "learning_rate": 1.9619076781172355e-05, + "loss": 0.3488, + "step": 798 + }, + { + "epoch": 0.8973747016706444, + "grad_norm": 0.4604841991723784, + "learning_rate": 1.9616929424203493e-05, + "loss": 0.3667, + "step": 799 + }, + { + "epoch": 0.8984978239505826, + "grad_norm": 0.5045896764129607, + "learning_rate": 1.9614776149832127e-05, + "loss": 0.3634, + "step": 800 + }, + { + "epoch": 0.8996209462305208, + "grad_norm": 0.5218566397092465, + "learning_rate": 1.961261695938319e-05, + "loss": 0.351, + "step": 801 + }, + { + "epoch": 0.9007440685104591, + "grad_norm": 0.4569027978245014, + "learning_rate": 1.9610451854185253e-05, + "loss": 0.3656, + "step": 802 + }, + { + "epoch": 0.9018671907903973, + "grad_norm": 0.49667871542822645, + "learning_rate": 1.9608280835570537e-05, + "loss": 0.3625, + "step": 803 + }, + { + "epoch": 0.9029903130703355, + "grad_norm": 0.4709946091327286, + "learning_rate": 1.9606103904874886e-05, + "loss": 0.3638, + "step": 804 + }, + { + "epoch": 0.9041134353502738, + "grad_norm": 0.4727139263711212, + "learning_rate": 1.9603921063437795e-05, + "loss": 0.3594, + "step": 805 + }, + { + "epoch": 0.9052365576302119, + "grad_norm": 0.5294771408585577, + "learning_rate": 1.9601732312602385e-05, + "loss": 0.3752, + "step": 806 + }, + { + "epoch": 0.9063596799101502, + "grad_norm": 0.46959657369103186, + "learning_rate": 1.959953765371542e-05, + "loss": 0.3625, + "step": 807 + }, + { + "epoch": 0.9074828021900885, + "grad_norm": 0.4594446586146184, + "learning_rate": 1.95973370881273e-05, + "loss": 0.3655, + "step": 808 + }, + { + "epoch": 0.9086059244700266, + "grad_norm": 0.4750310076759929, + "learning_rate": 1.959513061719205e-05, + "loss": 0.3653, + "step": 809 + }, + { + "epoch": 0.9097290467499649, + "grad_norm": 0.47678540184859336, + "learning_rate": 1.959291824226735e-05, + "loss": 0.3454, + "step": 810 + }, + { + "epoch": 0.9108521690299032, + "grad_norm": 0.4700082470689754, + "learning_rate": 1.9590699964714486e-05, + "loss": 0.3786, + "step": 811 + }, + { + "epoch": 0.9119752913098413, + "grad_norm": 0.4620496295067997, + "learning_rate": 1.9588475785898394e-05, + "loss": 0.3681, + "step": 812 + }, + { + "epoch": 0.9130984135897796, + "grad_norm": 0.4642629784834156, + "learning_rate": 1.9586245707187634e-05, + "loss": 0.353, + "step": 813 + }, + { + "epoch": 0.9142215358697178, + "grad_norm": 0.466408448319715, + "learning_rate": 1.9584009729954395e-05, + "loss": 0.3603, + "step": 814 + }, + { + "epoch": 0.915344658149656, + "grad_norm": 0.47105990854376034, + "learning_rate": 1.9581767855574508e-05, + "loss": 0.3636, + "step": 815 + }, + { + "epoch": 0.9164677804295943, + "grad_norm": 0.46413275069783316, + "learning_rate": 1.9579520085427416e-05, + "loss": 0.3951, + "step": 816 + }, + { + "epoch": 0.9175909027095325, + "grad_norm": 0.471808108910633, + "learning_rate": 1.9577266420896194e-05, + "loss": 0.3544, + "step": 817 + }, + { + "epoch": 0.9187140249894707, + "grad_norm": 0.46312200003026205, + "learning_rate": 1.9575006863367552e-05, + "loss": 0.3747, + "step": 818 + }, + { + "epoch": 0.9198371472694089, + "grad_norm": 0.4833523027027275, + "learning_rate": 1.957274141423182e-05, + "loss": 0.3751, + "step": 819 + }, + { + "epoch": 0.9209602695493472, + "grad_norm": 0.484335661168015, + "learning_rate": 1.9570470074882947e-05, + "loss": 0.4085, + "step": 820 + }, + { + "epoch": 0.9220833918292854, + "grad_norm": 0.47205123389289727, + "learning_rate": 1.9568192846718523e-05, + "loss": 0.3759, + "step": 821 + }, + { + "epoch": 0.9232065141092236, + "grad_norm": 0.4799100077083178, + "learning_rate": 1.956590973113975e-05, + "loss": 0.3755, + "step": 822 + }, + { + "epoch": 0.9243296363891619, + "grad_norm": 0.48768754470310466, + "learning_rate": 1.9563620729551448e-05, + "loss": 0.3579, + "step": 823 + }, + { + "epoch": 0.9254527586691002, + "grad_norm": 0.45688489471479504, + "learning_rate": 1.956132584336207e-05, + "loss": 0.351, + "step": 824 + }, + { + "epoch": 0.9265758809490383, + "grad_norm": 0.47538544133352595, + "learning_rate": 1.9559025073983678e-05, + "loss": 0.3808, + "step": 825 + }, + { + "epoch": 0.9276990032289766, + "grad_norm": 0.4879902084963068, + "learning_rate": 1.955671842283197e-05, + "loss": 0.3722, + "step": 826 + }, + { + "epoch": 0.9288221255089147, + "grad_norm": 0.4636609831884478, + "learning_rate": 1.955440589132625e-05, + "loss": 0.3591, + "step": 827 + }, + { + "epoch": 0.929945247788853, + "grad_norm": 0.46923991530374826, + "learning_rate": 1.955208748088944e-05, + "loss": 0.3825, + "step": 828 + }, + { + "epoch": 0.9310683700687913, + "grad_norm": 0.4811858751733392, + "learning_rate": 1.954976319294809e-05, + "loss": 0.3867, + "step": 829 + }, + { + "epoch": 0.9321914923487294, + "grad_norm": 0.44788406222610444, + "learning_rate": 1.9547433028932357e-05, + "loss": 0.3581, + "step": 830 + }, + { + "epoch": 0.9333146146286677, + "grad_norm": 0.4780988778135954, + "learning_rate": 1.9545096990276016e-05, + "loss": 0.3896, + "step": 831 + }, + { + "epoch": 0.9344377369086059, + "grad_norm": 0.47068420296897273, + "learning_rate": 1.954275507841646e-05, + "loss": 0.3844, + "step": 832 + }, + { + "epoch": 0.9355608591885441, + "grad_norm": 0.4884936275913018, + "learning_rate": 1.9540407294794685e-05, + "loss": 0.4031, + "step": 833 + }, + { + "epoch": 0.9366839814684824, + "grad_norm": 0.44700134287470683, + "learning_rate": 1.9538053640855316e-05, + "loss": 0.3436, + "step": 834 + }, + { + "epoch": 0.9378071037484206, + "grad_norm": 0.45061145673357333, + "learning_rate": 1.9535694118046584e-05, + "loss": 0.3457, + "step": 835 + }, + { + "epoch": 0.9389302260283588, + "grad_norm": 0.47148123153216354, + "learning_rate": 1.9533328727820322e-05, + "loss": 0.368, + "step": 836 + }, + { + "epoch": 0.9400533483082971, + "grad_norm": 0.4456166431044062, + "learning_rate": 1.953095747163199e-05, + "loss": 0.3456, + "step": 837 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.44236685307564755, + "learning_rate": 1.9528580350940634e-05, + "loss": 0.3431, + "step": 838 + }, + { + "epoch": 0.9422995928681736, + "grad_norm": 0.48561300894599185, + "learning_rate": 1.9526197367208937e-05, + "loss": 0.3652, + "step": 839 + }, + { + "epoch": 0.9434227151481117, + "grad_norm": 0.4720489415303487, + "learning_rate": 1.9523808521903165e-05, + "loss": 0.3572, + "step": 840 + }, + { + "epoch": 0.94454583742805, + "grad_norm": 0.44376759133578964, + "learning_rate": 1.9521413816493206e-05, + "loss": 0.3568, + "step": 841 + }, + { + "epoch": 0.9456689597079883, + "grad_norm": 0.47211026031113734, + "learning_rate": 1.951901325245255e-05, + "loss": 0.3589, + "step": 842 + }, + { + "epoch": 0.9467920819879264, + "grad_norm": 0.487190131673575, + "learning_rate": 1.9516606831258282e-05, + "loss": 0.3781, + "step": 843 + }, + { + "epoch": 0.9479152042678647, + "grad_norm": 0.5011825293136943, + "learning_rate": 1.951419455439111e-05, + "loss": 0.3773, + "step": 844 + }, + { + "epoch": 0.9490383265478028, + "grad_norm": 0.49776220503772134, + "learning_rate": 1.9511776423335327e-05, + "loss": 0.4034, + "step": 845 + }, + { + "epoch": 0.9501614488277411, + "grad_norm": 0.48886020802858143, + "learning_rate": 1.9509352439578843e-05, + "loss": 0.354, + "step": 846 + }, + { + "epoch": 0.9512845711076794, + "grad_norm": 0.48370344677831884, + "learning_rate": 1.950692260461315e-05, + "loss": 0.3524, + "step": 847 + }, + { + "epoch": 0.9524076933876175, + "grad_norm": 0.49770690316202637, + "learning_rate": 1.9504486919933364e-05, + "loss": 0.382, + "step": 848 + }, + { + "epoch": 0.9535308156675558, + "grad_norm": 0.4947415886475524, + "learning_rate": 1.9502045387038184e-05, + "loss": 0.3573, + "step": 849 + }, + { + "epoch": 0.954653937947494, + "grad_norm": 0.5081571339909152, + "learning_rate": 1.949959800742991e-05, + "loss": 0.393, + "step": 850 + }, + { + "epoch": 0.9557770602274323, + "grad_norm": 0.42096325046895394, + "learning_rate": 1.9497144782614445e-05, + "loss": 0.319, + "step": 851 + }, + { + "epoch": 0.9569001825073705, + "grad_norm": 0.506234866583935, + "learning_rate": 1.949468571410128e-05, + "loss": 0.3887, + "step": 852 + }, + { + "epoch": 0.9580233047873087, + "grad_norm": 0.5146043757907258, + "learning_rate": 1.9492220803403518e-05, + "loss": 0.3949, + "step": 853 + }, + { + "epoch": 0.959146427067247, + "grad_norm": 0.4788729384940881, + "learning_rate": 1.9489750052037836e-05, + "loss": 0.3748, + "step": 854 + }, + { + "epoch": 0.9602695493471852, + "grad_norm": 0.44234608280074666, + "learning_rate": 1.9487273461524517e-05, + "loss": 0.3599, + "step": 855 + }, + { + "epoch": 0.9613926716271234, + "grad_norm": 0.491710752366928, + "learning_rate": 1.9484791033387432e-05, + "loss": 0.3717, + "step": 856 + }, + { + "epoch": 0.9625157939070617, + "grad_norm": 0.4698138574686433, + "learning_rate": 1.948230276915405e-05, + "loss": 0.379, + "step": 857 + }, + { + "epoch": 0.9636389161869998, + "grad_norm": 0.4584810154708352, + "learning_rate": 1.9479808670355425e-05, + "loss": 0.3686, + "step": 858 + }, + { + "epoch": 0.9647620384669381, + "grad_norm": 0.46535287144204673, + "learning_rate": 1.9477308738526207e-05, + "loss": 0.3489, + "step": 859 + }, + { + "epoch": 0.9658851607468764, + "grad_norm": 0.49539431571257847, + "learning_rate": 1.947480297520463e-05, + "loss": 0.3899, + "step": 860 + }, + { + "epoch": 0.9670082830268145, + "grad_norm": 0.4870684988697284, + "learning_rate": 1.9472291381932515e-05, + "loss": 0.3608, + "step": 861 + }, + { + "epoch": 0.9681314053067528, + "grad_norm": 0.476057375325281, + "learning_rate": 1.9469773960255273e-05, + "loss": 0.3693, + "step": 862 + }, + { + "epoch": 0.969254527586691, + "grad_norm": 0.4933511086989605, + "learning_rate": 1.9467250711721903e-05, + "loss": 0.3673, + "step": 863 + }, + { + "epoch": 0.9703776498666292, + "grad_norm": 0.486703140404838, + "learning_rate": 1.946472163788499e-05, + "loss": 0.3623, + "step": 864 + }, + { + "epoch": 0.9715007721465675, + "grad_norm": 0.46311662873138065, + "learning_rate": 1.9462186740300697e-05, + "loss": 0.3751, + "step": 865 + }, + { + "epoch": 0.9726238944265057, + "grad_norm": 0.4756703180857592, + "learning_rate": 1.9459646020528777e-05, + "loss": 0.4065, + "step": 866 + }, + { + "epoch": 0.9737470167064439, + "grad_norm": 0.45742793507262375, + "learning_rate": 1.9457099480132563e-05, + "loss": 0.3783, + "step": 867 + }, + { + "epoch": 0.9748701389863822, + "grad_norm": 0.47697882778530937, + "learning_rate": 1.9454547120678966e-05, + "loss": 0.3907, + "step": 868 + }, + { + "epoch": 0.9759932612663204, + "grad_norm": 0.4664102425023445, + "learning_rate": 1.9451988943738485e-05, + "loss": 0.374, + "step": 869 + }, + { + "epoch": 0.9771163835462586, + "grad_norm": 0.41449005991424276, + "learning_rate": 1.9449424950885193e-05, + "loss": 0.3473, + "step": 870 + }, + { + "epoch": 0.9782395058261968, + "grad_norm": 0.4344496368295649, + "learning_rate": 1.944685514369674e-05, + "loss": 0.3481, + "step": 871 + }, + { + "epoch": 0.9793626281061351, + "grad_norm": 0.47436273506860405, + "learning_rate": 1.9444279523754358e-05, + "loss": 0.3799, + "step": 872 + }, + { + "epoch": 0.9804857503860733, + "grad_norm": 0.4905734178799078, + "learning_rate": 1.944169809264286e-05, + "loss": 0.3655, + "step": 873 + }, + { + "epoch": 0.9816088726660115, + "grad_norm": 0.4643421651738806, + "learning_rate": 1.9439110851950623e-05, + "loss": 0.3705, + "step": 874 + }, + { + "epoch": 0.9827319949459498, + "grad_norm": 0.4860647845700408, + "learning_rate": 1.9436517803269603e-05, + "loss": 0.3556, + "step": 875 + }, + { + "epoch": 0.9838551172258879, + "grad_norm": 0.473335344847467, + "learning_rate": 1.9433918948195335e-05, + "loss": 0.354, + "step": 876 + }, + { + "epoch": 0.9849782395058262, + "grad_norm": 0.46317370647334366, + "learning_rate": 1.9431314288326925e-05, + "loss": 0.3564, + "step": 877 + }, + { + "epoch": 0.9861013617857645, + "grad_norm": 0.48357335447638583, + "learning_rate": 1.9428703825267047e-05, + "loss": 0.3799, + "step": 878 + }, + { + "epoch": 0.9872244840657026, + "grad_norm": 0.4575106291945355, + "learning_rate": 1.9426087560621944e-05, + "loss": 0.3681, + "step": 879 + }, + { + "epoch": 0.9883476063456409, + "grad_norm": 0.46101892774024467, + "learning_rate": 1.942346549600144e-05, + "loss": 0.3683, + "step": 880 + }, + { + "epoch": 0.989470728625579, + "grad_norm": 0.47630591998449023, + "learning_rate": 1.942083763301891e-05, + "loss": 0.3642, + "step": 881 + }, + { + "epoch": 0.9905938509055173, + "grad_norm": 0.44576916696684105, + "learning_rate": 1.9418203973291317e-05, + "loss": 0.3525, + "step": 882 + }, + { + "epoch": 0.9917169731854556, + "grad_norm": 0.43611774214843185, + "learning_rate": 1.9415564518439173e-05, + "loss": 0.3376, + "step": 883 + }, + { + "epoch": 0.9928400954653938, + "grad_norm": 0.44908126004356413, + "learning_rate": 1.9412919270086573e-05, + "loss": 0.3574, + "step": 884 + }, + { + "epoch": 0.993963217745332, + "grad_norm": 0.48679555991927803, + "learning_rate": 1.941026822986116e-05, + "loss": 0.394, + "step": 885 + }, + { + "epoch": 0.9950863400252703, + "grad_norm": 0.45873412100669514, + "learning_rate": 1.9407611399394145e-05, + "loss": 0.3587, + "step": 886 + }, + { + "epoch": 0.9962094623052085, + "grad_norm": 0.45788041705605353, + "learning_rate": 1.9404948780320313e-05, + "loss": 0.3629, + "step": 887 + }, + { + "epoch": 0.9973325845851467, + "grad_norm": 0.47526745180684987, + "learning_rate": 1.9402280374278e-05, + "loss": 0.342, + "step": 888 + }, + { + "epoch": 0.9984557068650849, + "grad_norm": 0.4320159068252172, + "learning_rate": 1.9399606182909104e-05, + "loss": 0.3578, + "step": 889 + }, + { + "epoch": 0.9995788291450232, + "grad_norm": 0.43875671365459706, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.3598, + "step": 890 + }, + { + "epoch": 1.0007019514249613, + "grad_norm": 0.8018480221556775, + "learning_rate": 1.9394240450776963e-05, + "loss": 0.5421, + "step": 891 + }, + { + "epoch": 1.0018250737048997, + "grad_norm": 0.4696264632124667, + "learning_rate": 1.9391548913315312e-05, + "loss": 0.3137, + "step": 892 + }, + { + "epoch": 1.0029481959848379, + "grad_norm": 0.4583140254324182, + "learning_rate": 1.9388851597130268e-05, + "loss": 0.3087, + "step": 893 + }, + { + "epoch": 1.004071318264776, + "grad_norm": 0.4690268898240805, + "learning_rate": 1.9386148503881515e-05, + "loss": 0.2877, + "step": 894 + }, + { + "epoch": 1.0051944405447144, + "grad_norm": 0.5122329490697226, + "learning_rate": 1.9383439635232296e-05, + "loss": 0.2904, + "step": 895 + }, + { + "epoch": 1.0063175628246526, + "grad_norm": 0.5371156002795432, + "learning_rate": 1.938072499284941e-05, + "loss": 0.3008, + "step": 896 + }, + { + "epoch": 1.0074406851045907, + "grad_norm": 0.4902755957708443, + "learning_rate": 1.9378004578403208e-05, + "loss": 0.2819, + "step": 897 + }, + { + "epoch": 1.0085638073845289, + "grad_norm": 0.4741699815198155, + "learning_rate": 1.9375278393567588e-05, + "loss": 0.2627, + "step": 898 + }, + { + "epoch": 1.0096869296644673, + "grad_norm": 0.48986713359299766, + "learning_rate": 1.9372546440020003e-05, + "loss": 0.2914, + "step": 899 + }, + { + "epoch": 1.0108100519444054, + "grad_norm": 0.5014813747394071, + "learning_rate": 1.9369808719441448e-05, + "loss": 0.2912, + "step": 900 + }, + { + "epoch": 1.0119331742243436, + "grad_norm": 0.5393899996183346, + "learning_rate": 1.936706523351648e-05, + "loss": 0.3034, + "step": 901 + }, + { + "epoch": 1.013056296504282, + "grad_norm": 0.48629973546736344, + "learning_rate": 1.93643159839332e-05, + "loss": 0.315, + "step": 902 + }, + { + "epoch": 1.0141794187842201, + "grad_norm": 0.47646907262990806, + "learning_rate": 1.9361560972383237e-05, + "loss": 0.2829, + "step": 903 + }, + { + "epoch": 1.0153025410641583, + "grad_norm": 0.4784634253911548, + "learning_rate": 1.935880020056179e-05, + "loss": 0.2634, + "step": 904 + }, + { + "epoch": 1.0164256633440967, + "grad_norm": 0.4991566494669612, + "learning_rate": 1.9356033670167595e-05, + "loss": 0.2836, + "step": 905 + }, + { + "epoch": 1.0175487856240348, + "grad_norm": 0.47357930015207117, + "learning_rate": 1.935326138290292e-05, + "loss": 0.2845, + "step": 906 + }, + { + "epoch": 1.018671907903973, + "grad_norm": 0.4650462704839138, + "learning_rate": 1.9350483340473595e-05, + "loss": 0.2772, + "step": 907 + }, + { + "epoch": 1.0197950301839114, + "grad_norm": 0.5050485921674276, + "learning_rate": 1.9347699544588966e-05, + "loss": 0.2964, + "step": 908 + }, + { + "epoch": 1.0209181524638495, + "grad_norm": 0.441877998501999, + "learning_rate": 1.9344909996961943e-05, + "loss": 0.2783, + "step": 909 + }, + { + "epoch": 1.0220412747437877, + "grad_norm": 0.4689605191500745, + "learning_rate": 1.9342114699308962e-05, + "loss": 0.2759, + "step": 910 + }, + { + "epoch": 1.0231643970237259, + "grad_norm": 0.4806645271199599, + "learning_rate": 1.933931365335e-05, + "loss": 0.2724, + "step": 911 + }, + { + "epoch": 1.0242875193036642, + "grad_norm": 0.48206510864685853, + "learning_rate": 1.9336506860808576e-05, + "loss": 0.2845, + "step": 912 + }, + { + "epoch": 1.0254106415836024, + "grad_norm": 0.473977653318953, + "learning_rate": 1.9333694323411732e-05, + "loss": 0.2691, + "step": 913 + }, + { + "epoch": 1.0265337638635406, + "grad_norm": 0.4959577608373437, + "learning_rate": 1.9330876042890065e-05, + "loss": 0.2744, + "step": 914 + }, + { + "epoch": 1.027656886143479, + "grad_norm": 0.4962252725676315, + "learning_rate": 1.932805202097768e-05, + "loss": 0.2855, + "step": 915 + }, + { + "epoch": 1.028780008423417, + "grad_norm": 0.5128348503516724, + "learning_rate": 1.9325222259412242e-05, + "loss": 0.2795, + "step": 916 + }, + { + "epoch": 1.0299031307033553, + "grad_norm": 0.4632243182786546, + "learning_rate": 1.9322386759934923e-05, + "loss": 0.2643, + "step": 917 + }, + { + "epoch": 1.0310262529832936, + "grad_norm": 0.4730264051539773, + "learning_rate": 1.9319545524290447e-05, + "loss": 0.2654, + "step": 918 + }, + { + "epoch": 1.0321493752632318, + "grad_norm": 0.5126131024471591, + "learning_rate": 1.931669855422705e-05, + "loss": 0.2902, + "step": 919 + }, + { + "epoch": 1.03327249754317, + "grad_norm": 0.4743950568309829, + "learning_rate": 1.9313845851496507e-05, + "loss": 0.2665, + "step": 920 + }, + { + "epoch": 1.0343956198231083, + "grad_norm": 0.5013671474523442, + "learning_rate": 1.931098741785412e-05, + "loss": 0.2954, + "step": 921 + }, + { + "epoch": 1.0355187421030465, + "grad_norm": 0.5096143882338355, + "learning_rate": 1.930812325505871e-05, + "loss": 0.2737, + "step": 922 + }, + { + "epoch": 1.0366418643829847, + "grad_norm": 0.48645730923494584, + "learning_rate": 1.930525336487263e-05, + "loss": 0.2871, + "step": 923 + }, + { + "epoch": 1.0377649866629228, + "grad_norm": 0.48648949738495667, + "learning_rate": 1.9302377749061753e-05, + "loss": 0.2871, + "step": 924 + }, + { + "epoch": 1.0388881089428612, + "grad_norm": 0.5445641193684013, + "learning_rate": 1.9299496409395482e-05, + "loss": 0.2788, + "step": 925 + }, + { + "epoch": 1.0400112312227994, + "grad_norm": 0.4832715223137077, + "learning_rate": 1.9296609347646732e-05, + "loss": 0.2705, + "step": 926 + }, + { + "epoch": 1.0411343535027375, + "grad_norm": 0.4610567782289876, + "learning_rate": 1.9293716565591948e-05, + "loss": 0.266, + "step": 927 + }, + { + "epoch": 1.042257475782676, + "grad_norm": 0.476608249508321, + "learning_rate": 1.9290818065011084e-05, + "loss": 0.2806, + "step": 928 + }, + { + "epoch": 1.043380598062614, + "grad_norm": 0.4820198451645742, + "learning_rate": 1.9287913847687627e-05, + "loss": 0.2745, + "step": 929 + }, + { + "epoch": 1.0445037203425522, + "grad_norm": 0.5092363784952347, + "learning_rate": 1.9285003915408575e-05, + "loss": 0.29, + "step": 930 + }, + { + "epoch": 1.0456268426224906, + "grad_norm": 0.4745997969575985, + "learning_rate": 1.928208826996443e-05, + "loss": 0.2679, + "step": 931 + }, + { + "epoch": 1.0467499649024288, + "grad_norm": 0.48451499822152283, + "learning_rate": 1.927916691314923e-05, + "loss": 0.2979, + "step": 932 + }, + { + "epoch": 1.047873087182367, + "grad_norm": 0.4559532527795186, + "learning_rate": 1.9276239846760515e-05, + "loss": 0.2745, + "step": 933 + }, + { + "epoch": 1.048996209462305, + "grad_norm": 0.4942046444197993, + "learning_rate": 1.9273307072599343e-05, + "loss": 0.2733, + "step": 934 + }, + { + "epoch": 1.0501193317422435, + "grad_norm": 0.4716142833431438, + "learning_rate": 1.927036859247028e-05, + "loss": 0.2848, + "step": 935 + }, + { + "epoch": 1.0512424540221816, + "grad_norm": 0.6276556942610505, + "learning_rate": 1.9267424408181406e-05, + "loss": 0.2928, + "step": 936 + }, + { + "epoch": 1.0523655763021198, + "grad_norm": 0.4840738368552477, + "learning_rate": 1.9264474521544315e-05, + "loss": 0.2831, + "step": 937 + }, + { + "epoch": 1.0534886985820582, + "grad_norm": 0.513332959686835, + "learning_rate": 1.9261518934374093e-05, + "loss": 0.3035, + "step": 938 + }, + { + "epoch": 1.0546118208619963, + "grad_norm": 0.4556522892250145, + "learning_rate": 1.9258557648489357e-05, + "loss": 0.27, + "step": 939 + }, + { + "epoch": 1.0557349431419345, + "grad_norm": 0.5074845207806349, + "learning_rate": 1.9255590665712214e-05, + "loss": 0.283, + "step": 940 + }, + { + "epoch": 1.0568580654218729, + "grad_norm": 0.499752040349772, + "learning_rate": 1.9252617987868278e-05, + "loss": 0.2857, + "step": 941 + }, + { + "epoch": 1.057981187701811, + "grad_norm": 0.5396255130756957, + "learning_rate": 1.9249639616786674e-05, + "loss": 0.3088, + "step": 942 + }, + { + "epoch": 1.0591043099817492, + "grad_norm": 0.49212073826967684, + "learning_rate": 1.9246655554300028e-05, + "loss": 0.2748, + "step": 943 + }, + { + "epoch": 1.0602274322616876, + "grad_norm": 0.4968561055146951, + "learning_rate": 1.9243665802244465e-05, + "loss": 0.2749, + "step": 944 + }, + { + "epoch": 1.0613505545416257, + "grad_norm": 0.49427378612848244, + "learning_rate": 1.924067036245961e-05, + "loss": 0.2953, + "step": 945 + }, + { + "epoch": 1.062473676821564, + "grad_norm": 0.48543259955055773, + "learning_rate": 1.9237669236788595e-05, + "loss": 0.282, + "step": 946 + }, + { + "epoch": 1.0635967991015023, + "grad_norm": 0.5219319492699731, + "learning_rate": 1.923466242707804e-05, + "loss": 0.2971, + "step": 947 + }, + { + "epoch": 1.0647199213814404, + "grad_norm": 0.4692640333578925, + "learning_rate": 1.923164993517807e-05, + "loss": 0.2753, + "step": 948 + }, + { + "epoch": 1.0658430436613786, + "grad_norm": 0.47927624646872385, + "learning_rate": 1.9228631762942307e-05, + "loss": 0.2738, + "step": 949 + }, + { + "epoch": 1.0669661659413168, + "grad_norm": 0.5057928867460986, + "learning_rate": 1.9225607912227864e-05, + "loss": 0.2734, + "step": 950 + }, + { + "epoch": 1.0680892882212552, + "grad_norm": 0.46267709140380153, + "learning_rate": 1.922257838489535e-05, + "loss": 0.2751, + "step": 951 + }, + { + "epoch": 1.0692124105011933, + "grad_norm": 0.5332480394868323, + "learning_rate": 1.9219543182808862e-05, + "loss": 0.2973, + "step": 952 + }, + { + "epoch": 1.0703355327811315, + "grad_norm": 0.4839164977782545, + "learning_rate": 1.9216502307836002e-05, + "loss": 0.2835, + "step": 953 + }, + { + "epoch": 1.0714586550610699, + "grad_norm": 0.4999219278635034, + "learning_rate": 1.9213455761847845e-05, + "loss": 0.2927, + "step": 954 + }, + { + "epoch": 1.072581777341008, + "grad_norm": 0.5232385009052505, + "learning_rate": 1.921040354671897e-05, + "loss": 0.3031, + "step": 955 + }, + { + "epoch": 1.0737048996209462, + "grad_norm": 0.4541280942778829, + "learning_rate": 1.9207345664327434e-05, + "loss": 0.2697, + "step": 956 + }, + { + "epoch": 1.0748280219008846, + "grad_norm": 0.5031936868920343, + "learning_rate": 1.9204282116554792e-05, + "loss": 0.2867, + "step": 957 + }, + { + "epoch": 1.0759511441808227, + "grad_norm": 0.4713104216421202, + "learning_rate": 1.9201212905286074e-05, + "loss": 0.2788, + "step": 958 + }, + { + "epoch": 1.0770742664607609, + "grad_norm": 0.45894312520199465, + "learning_rate": 1.91981380324098e-05, + "loss": 0.2848, + "step": 959 + }, + { + "epoch": 1.078197388740699, + "grad_norm": 0.5223017626157912, + "learning_rate": 1.919505749981798e-05, + "loss": 0.293, + "step": 960 + }, + { + "epoch": 1.0793205110206374, + "grad_norm": 0.47157901979531547, + "learning_rate": 1.9191971309406085e-05, + "loss": 0.2774, + "step": 961 + }, + { + "epoch": 1.0804436333005756, + "grad_norm": 0.4988305143165187, + "learning_rate": 1.9188879463073093e-05, + "loss": 0.2967, + "step": 962 + }, + { + "epoch": 1.0815667555805137, + "grad_norm": 0.47998629427503187, + "learning_rate": 1.918578196272145e-05, + "loss": 0.2739, + "step": 963 + }, + { + "epoch": 1.0826898778604521, + "grad_norm": 0.46801323046606513, + "learning_rate": 1.918267881025708e-05, + "loss": 0.2689, + "step": 964 + }, + { + "epoch": 1.0838130001403903, + "grad_norm": 0.42494477612916176, + "learning_rate": 1.9179570007589384e-05, + "loss": 0.2707, + "step": 965 + }, + { + "epoch": 1.0849361224203284, + "grad_norm": 0.44609307554632605, + "learning_rate": 1.9176455556631247e-05, + "loss": 0.2626, + "step": 966 + }, + { + "epoch": 1.0860592447002668, + "grad_norm": 0.5054138308738797, + "learning_rate": 1.9173335459299025e-05, + "loss": 0.2902, + "step": 967 + }, + { + "epoch": 1.087182366980205, + "grad_norm": 0.44616997151039844, + "learning_rate": 1.9170209717512546e-05, + "loss": 0.2687, + "step": 968 + }, + { + "epoch": 1.0883054892601431, + "grad_norm": 0.497359281811381, + "learning_rate": 1.9167078333195116e-05, + "loss": 0.3009, + "step": 969 + }, + { + "epoch": 1.0894286115400815, + "grad_norm": 0.4643079791688917, + "learning_rate": 1.9163941308273504e-05, + "loss": 0.2792, + "step": 970 + }, + { + "epoch": 1.0905517338200197, + "grad_norm": 0.47811323587663396, + "learning_rate": 1.916079864467796e-05, + "loss": 0.2976, + "step": 971 + }, + { + "epoch": 1.0916748560999578, + "grad_norm": 0.4360155405235776, + "learning_rate": 1.9157650344342205e-05, + "loss": 0.2621, + "step": 972 + }, + { + "epoch": 1.0927979783798962, + "grad_norm": 0.45820803802250853, + "learning_rate": 1.9154496409203416e-05, + "loss": 0.2901, + "step": 973 + }, + { + "epoch": 1.0939211006598344, + "grad_norm": 0.4370878083723664, + "learning_rate": 1.9151336841202246e-05, + "loss": 0.2744, + "step": 974 + }, + { + "epoch": 1.0950442229397725, + "grad_norm": 0.450814956559242, + "learning_rate": 1.9148171642282812e-05, + "loss": 0.2691, + "step": 975 + }, + { + "epoch": 1.0961673452197107, + "grad_norm": 0.4621925080101941, + "learning_rate": 1.9145000814392696e-05, + "loss": 0.2737, + "step": 976 + }, + { + "epoch": 1.097290467499649, + "grad_norm": 0.4737134489184813, + "learning_rate": 1.914182435948294e-05, + "loss": 0.2937, + "step": 977 + }, + { + "epoch": 1.0984135897795873, + "grad_norm": 0.4510174026564208, + "learning_rate": 1.9138642279508054e-05, + "loss": 0.2743, + "step": 978 + }, + { + "epoch": 1.0995367120595254, + "grad_norm": 0.48310076248938627, + "learning_rate": 1.913545457642601e-05, + "loss": 0.2821, + "step": 979 + }, + { + "epoch": 1.1006598343394638, + "grad_norm": 0.45975463127730243, + "learning_rate": 1.9132261252198236e-05, + "loss": 0.2743, + "step": 980 + }, + { + "epoch": 1.101782956619402, + "grad_norm": 0.47566940351291187, + "learning_rate": 1.912906230878961e-05, + "loss": 0.2901, + "step": 981 + }, + { + "epoch": 1.1029060788993401, + "grad_norm": 0.49639829721895823, + "learning_rate": 1.912585774816849e-05, + "loss": 0.3062, + "step": 982 + }, + { + "epoch": 1.1040292011792785, + "grad_norm": 0.47301791959429074, + "learning_rate": 1.912264757230667e-05, + "loss": 0.2869, + "step": 983 + }, + { + "epoch": 1.1051523234592167, + "grad_norm": 0.43846903068715937, + "learning_rate": 1.9119431783179413e-05, + "loss": 0.2585, + "step": 984 + }, + { + "epoch": 1.1062754457391548, + "grad_norm": 0.5192008280625632, + "learning_rate": 1.911621038276542e-05, + "loss": 0.3036, + "step": 985 + }, + { + "epoch": 1.107398568019093, + "grad_norm": 0.5011419662554821, + "learning_rate": 1.911298337304686e-05, + "loss": 0.2813, + "step": 986 + }, + { + "epoch": 1.1085216902990314, + "grad_norm": 0.48085556410924124, + "learning_rate": 1.9109750756009348e-05, + "loss": 0.2857, + "step": 987 + }, + { + "epoch": 1.1096448125789695, + "grad_norm": 0.5116521172198443, + "learning_rate": 1.9106512533641948e-05, + "loss": 0.2867, + "step": 988 + }, + { + "epoch": 1.1107679348589077, + "grad_norm": 0.4753703314504095, + "learning_rate": 1.9103268707937174e-05, + "loss": 0.2723, + "step": 989 + }, + { + "epoch": 1.111891057138846, + "grad_norm": 0.48728987587657996, + "learning_rate": 1.9100019280890984e-05, + "loss": 0.2938, + "step": 990 + }, + { + "epoch": 1.1130141794187842, + "grad_norm": 0.46500967954346223, + "learning_rate": 1.909676425450279e-05, + "loss": 0.277, + "step": 991 + }, + { + "epoch": 1.1141373016987224, + "grad_norm": 0.4311368054737046, + "learning_rate": 1.9093503630775445e-05, + "loss": 0.2589, + "step": 992 + }, + { + "epoch": 1.1152604239786608, + "grad_norm": 0.4806685121172132, + "learning_rate": 1.9090237411715248e-05, + "loss": 0.2828, + "step": 993 + }, + { + "epoch": 1.116383546258599, + "grad_norm": 0.47413677573033675, + "learning_rate": 1.9086965599331938e-05, + "loss": 0.2922, + "step": 994 + }, + { + "epoch": 1.117506668538537, + "grad_norm": 0.463329362201204, + "learning_rate": 1.9083688195638694e-05, + "loss": 0.2991, + "step": 995 + }, + { + "epoch": 1.1186297908184755, + "grad_norm": 0.424765333204886, + "learning_rate": 1.9080405202652143e-05, + "loss": 0.2589, + "step": 996 + }, + { + "epoch": 1.1197529130984136, + "grad_norm": 0.4664315339207049, + "learning_rate": 1.9077116622392347e-05, + "loss": 0.278, + "step": 997 + }, + { + "epoch": 1.1208760353783518, + "grad_norm": 0.4580593579853824, + "learning_rate": 1.9073822456882806e-05, + "loss": 0.2781, + "step": 998 + }, + { + "epoch": 1.12199915765829, + "grad_norm": 0.4428162338559992, + "learning_rate": 1.907052270815045e-05, + "loss": 0.2788, + "step": 999 + }, + { + "epoch": 1.1231222799382283, + "grad_norm": 0.4317189895111999, + "learning_rate": 1.9067217378225655e-05, + "loss": 0.273, + "step": 1000 + }, + { + "epoch": 1.1242454022181665, + "grad_norm": 0.46512373441877797, + "learning_rate": 1.906390646914223e-05, + "loss": 0.2987, + "step": 1001 + }, + { + "epoch": 1.1253685244981046, + "grad_norm": 0.48722030698963165, + "learning_rate": 1.906058998293741e-05, + "loss": 0.3105, + "step": 1002 + }, + { + "epoch": 1.126491646778043, + "grad_norm": 0.4758813346060563, + "learning_rate": 1.9057267921651865e-05, + "loss": 0.3104, + "step": 1003 + }, + { + "epoch": 1.1276147690579812, + "grad_norm": 0.4603226186566387, + "learning_rate": 1.9053940287329696e-05, + "loss": 0.2714, + "step": 1004 + }, + { + "epoch": 1.1287378913379194, + "grad_norm": 0.5016859736501262, + "learning_rate": 1.9050607082018437e-05, + "loss": 0.3054, + "step": 1005 + }, + { + "epoch": 1.1298610136178577, + "grad_norm": 0.45587589839460446, + "learning_rate": 1.9047268307769044e-05, + "loss": 0.295, + "step": 1006 + }, + { + "epoch": 1.130984135897796, + "grad_norm": 0.4681404958373164, + "learning_rate": 1.90439239666359e-05, + "loss": 0.2888, + "step": 1007 + }, + { + "epoch": 1.132107258177734, + "grad_norm": 0.472944576189367, + "learning_rate": 1.9040574060676813e-05, + "loss": 0.2875, + "step": 1008 + }, + { + "epoch": 1.1332303804576722, + "grad_norm": 0.4496881501850784, + "learning_rate": 1.903721859195302e-05, + "loss": 0.28, + "step": 1009 + }, + { + "epoch": 1.1343535027376106, + "grad_norm": 0.469620980155489, + "learning_rate": 1.9033857562529176e-05, + "loss": 0.2731, + "step": 1010 + }, + { + "epoch": 1.1354766250175488, + "grad_norm": 0.45917004853908955, + "learning_rate": 1.9030490974473363e-05, + "loss": 0.283, + "step": 1011 + }, + { + "epoch": 1.136599747297487, + "grad_norm": 0.47268979034926534, + "learning_rate": 1.902711882985708e-05, + "loss": 0.2866, + "step": 1012 + }, + { + "epoch": 1.1377228695774253, + "grad_norm": 0.4383225568741319, + "learning_rate": 1.9023741130755237e-05, + "loss": 0.2756, + "step": 1013 + }, + { + "epoch": 1.1388459918573635, + "grad_norm": 0.46958663656332716, + "learning_rate": 1.9020357879246173e-05, + "loss": 0.2826, + "step": 1014 + }, + { + "epoch": 1.1399691141373016, + "grad_norm": 0.45079995860583716, + "learning_rate": 1.9016969077411645e-05, + "loss": 0.2872, + "step": 1015 + }, + { + "epoch": 1.14109223641724, + "grad_norm": 0.47973270092319154, + "learning_rate": 1.9013574727336817e-05, + "loss": 0.3096, + "step": 1016 + }, + { + "epoch": 1.1422153586971782, + "grad_norm": 0.501462362289904, + "learning_rate": 1.9010174831110268e-05, + "loss": 0.3117, + "step": 1017 + }, + { + "epoch": 1.1433384809771163, + "grad_norm": 0.4513154018538136, + "learning_rate": 1.9006769390823994e-05, + "loss": 0.252, + "step": 1018 + }, + { + "epoch": 1.1444616032570547, + "grad_norm": 0.4500736376781029, + "learning_rate": 1.9003358408573396e-05, + "loss": 0.2735, + "step": 1019 + }, + { + "epoch": 1.1455847255369929, + "grad_norm": 0.5023208553675916, + "learning_rate": 1.8999941886457292e-05, + "loss": 0.2987, + "step": 1020 + }, + { + "epoch": 1.146707847816931, + "grad_norm": 0.45282147717459975, + "learning_rate": 1.8996519826577907e-05, + "loss": 0.2651, + "step": 1021 + }, + { + "epoch": 1.1478309700968694, + "grad_norm": 0.4648324226943947, + "learning_rate": 1.899309223104087e-05, + "loss": 0.2985, + "step": 1022 + }, + { + "epoch": 1.1489540923768076, + "grad_norm": 0.4849118937452435, + "learning_rate": 1.898965910195522e-05, + "loss": 0.2966, + "step": 1023 + }, + { + "epoch": 1.1500772146567457, + "grad_norm": 0.4845634318497336, + "learning_rate": 1.89862204414334e-05, + "loss": 0.2997, + "step": 1024 + }, + { + "epoch": 1.1512003369366839, + "grad_norm": 0.4849781830896908, + "learning_rate": 1.8982776251591247e-05, + "loss": 0.301, + "step": 1025 + }, + { + "epoch": 1.1523234592166223, + "grad_norm": 0.46264538140977657, + "learning_rate": 1.8979326534548023e-05, + "loss": 0.2892, + "step": 1026 + }, + { + "epoch": 1.1534465814965604, + "grad_norm": 0.4751680003078539, + "learning_rate": 1.8975871292426365e-05, + "loss": 0.2943, + "step": 1027 + }, + { + "epoch": 1.1545697037764986, + "grad_norm": 0.47000462603028687, + "learning_rate": 1.8972410527352324e-05, + "loss": 0.2944, + "step": 1028 + }, + { + "epoch": 1.155692826056437, + "grad_norm": 0.46720471177522077, + "learning_rate": 1.8968944241455352e-05, + "loss": 0.2754, + "step": 1029 + }, + { + "epoch": 1.1568159483363751, + "grad_norm": 0.4682045577743917, + "learning_rate": 1.8965472436868288e-05, + "loss": 0.2945, + "step": 1030 + }, + { + "epoch": 1.1579390706163133, + "grad_norm": 0.4681264807044535, + "learning_rate": 1.8961995115727373e-05, + "loss": 0.2855, + "step": 1031 + }, + { + "epoch": 1.1590621928962517, + "grad_norm": 0.46711223820824194, + "learning_rate": 1.895851228017224e-05, + "loss": 0.2828, + "step": 1032 + }, + { + "epoch": 1.1601853151761898, + "grad_norm": 0.4305801770746027, + "learning_rate": 1.8955023932345916e-05, + "loss": 0.2572, + "step": 1033 + }, + { + "epoch": 1.161308437456128, + "grad_norm": 0.4783787183648787, + "learning_rate": 1.8951530074394828e-05, + "loss": 0.3005, + "step": 1034 + }, + { + "epoch": 1.1624315597360662, + "grad_norm": 0.4580608030880634, + "learning_rate": 1.894803070846877e-05, + "loss": 0.292, + "step": 1035 + }, + { + "epoch": 1.1635546820160045, + "grad_norm": 0.44482733683564024, + "learning_rate": 1.894452583672095e-05, + "loss": 0.2814, + "step": 1036 + }, + { + "epoch": 1.1646778042959427, + "grad_norm": 0.46029498474111563, + "learning_rate": 1.8941015461307955e-05, + "loss": 0.2893, + "step": 1037 + }, + { + "epoch": 1.1658009265758809, + "grad_norm": 0.4615488233008817, + "learning_rate": 1.8937499584389755e-05, + "loss": 0.2932, + "step": 1038 + }, + { + "epoch": 1.1669240488558192, + "grad_norm": 0.42956178577189436, + "learning_rate": 1.8933978208129705e-05, + "loss": 0.2739, + "step": 1039 + }, + { + "epoch": 1.1680471711357574, + "grad_norm": 0.4802415097521688, + "learning_rate": 1.893045133469455e-05, + "loss": 0.2883, + "step": 1040 + }, + { + "epoch": 1.1691702934156956, + "grad_norm": 0.4480867146233576, + "learning_rate": 1.8926918966254416e-05, + "loss": 0.2729, + "step": 1041 + }, + { + "epoch": 1.170293415695634, + "grad_norm": 0.49140396781805756, + "learning_rate": 1.8923381104982806e-05, + "loss": 0.28, + "step": 1042 + }, + { + "epoch": 1.171416537975572, + "grad_norm": 0.46490086726476276, + "learning_rate": 1.8919837753056606e-05, + "loss": 0.2657, + "step": 1043 + }, + { + "epoch": 1.1725396602555103, + "grad_norm": 0.4657704225340458, + "learning_rate": 1.8916288912656077e-05, + "loss": 0.3002, + "step": 1044 + }, + { + "epoch": 1.1736627825354486, + "grad_norm": 0.45315444310300473, + "learning_rate": 1.891273458596486e-05, + "loss": 0.2671, + "step": 1045 + }, + { + "epoch": 1.1747859048153868, + "grad_norm": 0.5014981783064109, + "learning_rate": 1.8909174775169968e-05, + "loss": 0.2791, + "step": 1046 + }, + { + "epoch": 1.175909027095325, + "grad_norm": 0.5179938024360514, + "learning_rate": 1.89056094824618e-05, + "loss": 0.3352, + "step": 1047 + }, + { + "epoch": 1.1770321493752633, + "grad_norm": 0.4232454478701444, + "learning_rate": 1.8902038710034113e-05, + "loss": 0.2557, + "step": 1048 + }, + { + "epoch": 1.1781552716552015, + "grad_norm": 0.5139754763017151, + "learning_rate": 1.889846246008405e-05, + "loss": 0.2959, + "step": 1049 + }, + { + "epoch": 1.1792783939351397, + "grad_norm": 0.48647478582657605, + "learning_rate": 1.8894880734812106e-05, + "loss": 0.3023, + "step": 1050 + }, + { + "epoch": 1.1804015162150778, + "grad_norm": 0.46616236114663656, + "learning_rate": 1.8891293536422165e-05, + "loss": 0.2845, + "step": 1051 + }, + { + "epoch": 1.1815246384950162, + "grad_norm": 0.4287393409593001, + "learning_rate": 1.888770086712147e-05, + "loss": 0.2696, + "step": 1052 + }, + { + "epoch": 1.1826477607749544, + "grad_norm": 0.4949520898537966, + "learning_rate": 1.8884102729120624e-05, + "loss": 0.3007, + "step": 1053 + }, + { + "epoch": 1.1837708830548925, + "grad_norm": 0.446833141730547, + "learning_rate": 1.88804991246336e-05, + "loss": 0.2692, + "step": 1054 + }, + { + "epoch": 1.184894005334831, + "grad_norm": 0.48608855458145783, + "learning_rate": 1.8876890055877745e-05, + "loss": 0.318, + "step": 1055 + }, + { + "epoch": 1.186017127614769, + "grad_norm": 0.457882561215241, + "learning_rate": 1.887327552507375e-05, + "loss": 0.2802, + "step": 1056 + }, + { + "epoch": 1.1871402498947072, + "grad_norm": 0.45242524194849565, + "learning_rate": 1.886965553444568e-05, + "loss": 0.264, + "step": 1057 + }, + { + "epoch": 1.1882633721746454, + "grad_norm": 0.48868541054842407, + "learning_rate": 1.886603008622095e-05, + "loss": 0.3076, + "step": 1058 + }, + { + "epoch": 1.1893864944545838, + "grad_norm": 0.44249560468632987, + "learning_rate": 1.8862399182630347e-05, + "loss": 0.2661, + "step": 1059 + }, + { + "epoch": 1.190509616734522, + "grad_norm": 0.4588142894528934, + "learning_rate": 1.8858762825908e-05, + "loss": 0.2865, + "step": 1060 + }, + { + "epoch": 1.19163273901446, + "grad_norm": 0.46946406735526547, + "learning_rate": 1.8855121018291394e-05, + "loss": 0.3046, + "step": 1061 + }, + { + "epoch": 1.1927558612943985, + "grad_norm": 0.45375091401248635, + "learning_rate": 1.8851473762021384e-05, + "loss": 0.2732, + "step": 1062 + }, + { + "epoch": 1.1938789835743366, + "grad_norm": 0.4676425499504646, + "learning_rate": 1.8847821059342163e-05, + "loss": 0.2932, + "step": 1063 + }, + { + "epoch": 1.1950021058542748, + "grad_norm": 0.46249196792362884, + "learning_rate": 1.8844162912501277e-05, + "loss": 0.302, + "step": 1064 + }, + { + "epoch": 1.1961252281342132, + "grad_norm": 0.4812578734002899, + "learning_rate": 1.8840499323749624e-05, + "loss": 0.303, + "step": 1065 + }, + { + "epoch": 1.1972483504141513, + "grad_norm": 0.48405098483559544, + "learning_rate": 1.883683029534145e-05, + "loss": 0.2887, + "step": 1066 + }, + { + "epoch": 1.1983714726940895, + "grad_norm": 0.4608928377281792, + "learning_rate": 1.8833155829534356e-05, + "loss": 0.2958, + "step": 1067 + }, + { + "epoch": 1.1994945949740279, + "grad_norm": 0.43401836542504524, + "learning_rate": 1.8829475928589272e-05, + "loss": 0.2603, + "step": 1068 + }, + { + "epoch": 1.200617717253966, + "grad_norm": 0.4780260992243674, + "learning_rate": 1.8825790594770487e-05, + "loss": 0.2809, + "step": 1069 + }, + { + "epoch": 1.2017408395339042, + "grad_norm": 0.44275206432188047, + "learning_rate": 1.882209983034562e-05, + "loss": 0.2773, + "step": 1070 + }, + { + "epoch": 1.2028639618138426, + "grad_norm": 0.47460553242353476, + "learning_rate": 1.881840363758565e-05, + "loss": 0.3128, + "step": 1071 + }, + { + "epoch": 1.2039870840937807, + "grad_norm": 0.46237921124263054, + "learning_rate": 1.881470201876488e-05, + "loss": 0.2952, + "step": 1072 + }, + { + "epoch": 1.205110206373719, + "grad_norm": 0.48654497979875483, + "learning_rate": 1.8810994976160955e-05, + "loss": 0.3147, + "step": 1073 + }, + { + "epoch": 1.2062333286536573, + "grad_norm": 0.4921863140740264, + "learning_rate": 1.880728251205486e-05, + "loss": 0.3093, + "step": 1074 + }, + { + "epoch": 1.2073564509335954, + "grad_norm": 0.471423938361344, + "learning_rate": 1.8803564628730916e-05, + "loss": 0.2903, + "step": 1075 + }, + { + "epoch": 1.2084795732135336, + "grad_norm": 0.46874454339863675, + "learning_rate": 1.8799841328476776e-05, + "loss": 0.2889, + "step": 1076 + }, + { + "epoch": 1.2096026954934718, + "grad_norm": 0.45281746346669155, + "learning_rate": 1.8796112613583427e-05, + "loss": 0.2895, + "step": 1077 + }, + { + "epoch": 1.2107258177734102, + "grad_norm": 0.45353972532299486, + "learning_rate": 1.8792378486345196e-05, + "loss": 0.2789, + "step": 1078 + }, + { + "epoch": 1.2118489400533483, + "grad_norm": 0.4725888875307044, + "learning_rate": 1.878863894905972e-05, + "loss": 0.2847, + "step": 1079 + }, + { + "epoch": 1.2129720623332865, + "grad_norm": 0.442084650431914, + "learning_rate": 1.878489400402799e-05, + "loss": 0.274, + "step": 1080 + }, + { + "epoch": 1.2140951846132249, + "grad_norm": 0.49254544284536805, + "learning_rate": 1.8781143653554305e-05, + "loss": 0.3088, + "step": 1081 + }, + { + "epoch": 1.215218306893163, + "grad_norm": 0.4617957372006431, + "learning_rate": 1.8777387899946294e-05, + "loss": 0.2879, + "step": 1082 + }, + { + "epoch": 1.2163414291731012, + "grad_norm": 0.4646226561763834, + "learning_rate": 1.877362674551492e-05, + "loss": 0.2929, + "step": 1083 + }, + { + "epoch": 1.2174645514530393, + "grad_norm": 0.5009688641472497, + "learning_rate": 1.876986019257446e-05, + "loss": 0.314, + "step": 1084 + }, + { + "epoch": 1.2185876737329777, + "grad_norm": 0.49964534711190756, + "learning_rate": 1.8766088243442514e-05, + "loss": 0.32, + "step": 1085 + }, + { + "epoch": 1.2197107960129159, + "grad_norm": 0.47363249608950986, + "learning_rate": 1.8762310900440007e-05, + "loss": 0.2895, + "step": 1086 + }, + { + "epoch": 1.220833918292854, + "grad_norm": 0.4781168700712531, + "learning_rate": 1.875852816589118e-05, + "loss": 0.2826, + "step": 1087 + }, + { + "epoch": 1.2219570405727924, + "grad_norm": 0.45871143064600073, + "learning_rate": 1.8754740042123583e-05, + "loss": 0.2762, + "step": 1088 + }, + { + "epoch": 1.2230801628527306, + "grad_norm": 0.5147809750511391, + "learning_rate": 1.8750946531468098e-05, + "loss": 0.3179, + "step": 1089 + }, + { + "epoch": 1.2242032851326687, + "grad_norm": 0.45406534949791066, + "learning_rate": 1.874714763625892e-05, + "loss": 0.2656, + "step": 1090 + }, + { + "epoch": 1.2253264074126071, + "grad_norm": 0.46488456668844064, + "learning_rate": 1.8743343358833536e-05, + "loss": 0.2867, + "step": 1091 + }, + { + "epoch": 1.2264495296925453, + "grad_norm": 0.46081222446789055, + "learning_rate": 1.8739533701532768e-05, + "loss": 0.2891, + "step": 1092 + }, + { + "epoch": 1.2275726519724834, + "grad_norm": 0.47759019663041424, + "learning_rate": 1.873571866670074e-05, + "loss": 0.2884, + "step": 1093 + }, + { + "epoch": 1.2286957742524218, + "grad_norm": 0.4825230030891384, + "learning_rate": 1.8731898256684885e-05, + "loss": 0.2886, + "step": 1094 + }, + { + "epoch": 1.22981889653236, + "grad_norm": 0.4568440667981614, + "learning_rate": 1.8728072473835944e-05, + "loss": 0.2801, + "step": 1095 + }, + { + "epoch": 1.2309420188122981, + "grad_norm": 0.4517593846376601, + "learning_rate": 1.8724241320507958e-05, + "loss": 0.2687, + "step": 1096 + }, + { + "epoch": 1.2320651410922365, + "grad_norm": 0.473458572983134, + "learning_rate": 1.8720404799058284e-05, + "loss": 0.2811, + "step": 1097 + }, + { + "epoch": 1.2331882633721747, + "grad_norm": 0.47853878049484405, + "learning_rate": 1.8716562911847572e-05, + "loss": 0.3026, + "step": 1098 + }, + { + "epoch": 1.2343113856521128, + "grad_norm": 0.4732508123997025, + "learning_rate": 1.8712715661239783e-05, + "loss": 0.2858, + "step": 1099 + }, + { + "epoch": 1.235434507932051, + "grad_norm": 0.4943536151259743, + "learning_rate": 1.8708863049602163e-05, + "loss": 0.3155, + "step": 1100 + }, + { + "epoch": 1.2365576302119894, + "grad_norm": 0.45750032528758056, + "learning_rate": 1.8705005079305274e-05, + "loss": 0.2851, + "step": 1101 + }, + { + "epoch": 1.2376807524919275, + "grad_norm": 0.4521267906219757, + "learning_rate": 1.8701141752722966e-05, + "loss": 0.2747, + "step": 1102 + }, + { + "epoch": 1.2388038747718657, + "grad_norm": 0.46594419380408797, + "learning_rate": 1.8697273072232385e-05, + "loss": 0.3064, + "step": 1103 + }, + { + "epoch": 1.239926997051804, + "grad_norm": 0.43383827589550866, + "learning_rate": 1.8693399040213974e-05, + "loss": 0.2676, + "step": 1104 + }, + { + "epoch": 1.2410501193317423, + "grad_norm": 0.4474050960888805, + "learning_rate": 1.8689519659051467e-05, + "loss": 0.2741, + "step": 1105 + }, + { + "epoch": 1.2421732416116804, + "grad_norm": 0.45523790295881006, + "learning_rate": 1.868563493113189e-05, + "loss": 0.2881, + "step": 1106 + }, + { + "epoch": 1.2432963638916188, + "grad_norm": 0.4584723123279694, + "learning_rate": 1.8681744858845555e-05, + "loss": 0.284, + "step": 1107 + }, + { + "epoch": 1.244419486171557, + "grad_norm": 0.46457228328541583, + "learning_rate": 1.8677849444586073e-05, + "loss": 0.2799, + "step": 1108 + }, + { + "epoch": 1.2455426084514951, + "grad_norm": 0.45539666514036803, + "learning_rate": 1.8673948690750333e-05, + "loss": 0.2887, + "step": 1109 + }, + { + "epoch": 1.2466657307314333, + "grad_norm": 0.4677915933786495, + "learning_rate": 1.867004259973851e-05, + "loss": 0.292, + "step": 1110 + }, + { + "epoch": 1.2477888530113717, + "grad_norm": 0.4667047750396251, + "learning_rate": 1.866613117395407e-05, + "loss": 0.2889, + "step": 1111 + }, + { + "epoch": 1.2489119752913098, + "grad_norm": 0.48997059381677216, + "learning_rate": 1.8662214415803748e-05, + "loss": 0.2968, + "step": 1112 + }, + { + "epoch": 1.250035097571248, + "grad_norm": 0.4384615139812607, + "learning_rate": 1.8658292327697574e-05, + "loss": 0.2715, + "step": 1113 + }, + { + "epoch": 1.2511582198511864, + "grad_norm": 0.4547012939945187, + "learning_rate": 1.865436491204885e-05, + "loss": 0.2747, + "step": 1114 + }, + { + "epoch": 1.2522813421311245, + "grad_norm": 0.5098417390678678, + "learning_rate": 1.865043217127416e-05, + "loss": 0.3327, + "step": 1115 + }, + { + "epoch": 1.2534044644110627, + "grad_norm": 0.47158047623028704, + "learning_rate": 1.864649410779336e-05, + "loss": 0.2977, + "step": 1116 + }, + { + "epoch": 1.254527586691001, + "grad_norm": 0.47071696073688374, + "learning_rate": 1.8642550724029584e-05, + "loss": 0.3032, + "step": 1117 + }, + { + "epoch": 1.2556507089709392, + "grad_norm": 0.4756046517133648, + "learning_rate": 1.863860202240924e-05, + "loss": 0.3077, + "step": 1118 + }, + { + "epoch": 1.2567738312508774, + "grad_norm": 0.47511377917679826, + "learning_rate": 1.863464800536201e-05, + "loss": 0.2969, + "step": 1119 + }, + { + "epoch": 1.2578969535308158, + "grad_norm": 0.47570563192173576, + "learning_rate": 1.8630688675320844e-05, + "loss": 0.3081, + "step": 1120 + }, + { + "epoch": 1.259020075810754, + "grad_norm": 0.4905178570778228, + "learning_rate": 1.8626724034721955e-05, + "loss": 0.2967, + "step": 1121 + }, + { + "epoch": 1.260143198090692, + "grad_norm": 0.4603739199538433, + "learning_rate": 1.8622754086004837e-05, + "loss": 0.2852, + "step": 1122 + }, + { + "epoch": 1.2612663203706305, + "grad_norm": 0.48083734404310896, + "learning_rate": 1.8618778831612243e-05, + "loss": 0.2836, + "step": 1123 + }, + { + "epoch": 1.2623894426505686, + "grad_norm": 0.46026086440947167, + "learning_rate": 1.8614798273990186e-05, + "loss": 0.2785, + "step": 1124 + }, + { + "epoch": 1.2635125649305068, + "grad_norm": 0.49105292345069845, + "learning_rate": 1.8610812415587948e-05, + "loss": 0.2841, + "step": 1125 + }, + { + "epoch": 1.2646356872104452, + "grad_norm": 0.45003283326379967, + "learning_rate": 1.860682125885808e-05, + "loss": 0.2817, + "step": 1126 + }, + { + "epoch": 1.2657588094903833, + "grad_norm": 0.49522134976029414, + "learning_rate": 1.860282480625637e-05, + "loss": 0.3027, + "step": 1127 + }, + { + "epoch": 1.2668819317703215, + "grad_norm": 0.504015437884683, + "learning_rate": 1.859882306024189e-05, + "loss": 0.3062, + "step": 1128 + }, + { + "epoch": 1.2680050540502597, + "grad_norm": 0.4935114092813406, + "learning_rate": 1.8594816023276954e-05, + "loss": 0.3145, + "step": 1129 + }, + { + "epoch": 1.269128176330198, + "grad_norm": 0.45349496634172853, + "learning_rate": 1.8590803697827138e-05, + "loss": 0.2756, + "step": 1130 + }, + { + "epoch": 1.2702512986101362, + "grad_norm": 0.4923320856662289, + "learning_rate": 1.8586786086361268e-05, + "loss": 0.293, + "step": 1131 + }, + { + "epoch": 1.2713744208900744, + "grad_norm": 0.48349777935390137, + "learning_rate": 1.8582763191351427e-05, + "loss": 0.2881, + "step": 1132 + }, + { + "epoch": 1.2724975431700125, + "grad_norm": 0.45227425022413825, + "learning_rate": 1.8578735015272947e-05, + "loss": 0.2738, + "step": 1133 + }, + { + "epoch": 1.273620665449951, + "grad_norm": 0.4796542700286378, + "learning_rate": 1.8574701560604405e-05, + "loss": 0.2974, + "step": 1134 + }, + { + "epoch": 1.274743787729889, + "grad_norm": 0.45414376951278596, + "learning_rate": 1.8570662829827632e-05, + "loss": 0.3021, + "step": 1135 + }, + { + "epoch": 1.2758669100098272, + "grad_norm": 0.4329561559626584, + "learning_rate": 1.8566618825427704e-05, + "loss": 0.2636, + "step": 1136 + }, + { + "epoch": 1.2769900322897656, + "grad_norm": 0.49933859807478426, + "learning_rate": 1.8562569549892945e-05, + "loss": 0.3034, + "step": 1137 + }, + { + "epoch": 1.2781131545697038, + "grad_norm": 0.4531437341854577, + "learning_rate": 1.855851500571491e-05, + "loss": 0.281, + "step": 1138 + }, + { + "epoch": 1.279236276849642, + "grad_norm": 0.4567172782028271, + "learning_rate": 1.8554455195388414e-05, + "loss": 0.2751, + "step": 1139 + }, + { + "epoch": 1.2803593991295803, + "grad_norm": 0.5083738307195607, + "learning_rate": 1.8550390121411497e-05, + "loss": 0.2964, + "step": 1140 + }, + { + "epoch": 1.2814825214095185, + "grad_norm": 0.4599915752167777, + "learning_rate": 1.8546319786285443e-05, + "loss": 0.2804, + "step": 1141 + }, + { + "epoch": 1.2826056436894566, + "grad_norm": 0.47423327374698915, + "learning_rate": 1.854224419251478e-05, + "loss": 0.3044, + "step": 1142 + }, + { + "epoch": 1.283728765969395, + "grad_norm": 0.47354584223797824, + "learning_rate": 1.853816334260726e-05, + "loss": 0.3022, + "step": 1143 + }, + { + "epoch": 1.2848518882493332, + "grad_norm": 0.48012669773887895, + "learning_rate": 1.8534077239073877e-05, + "loss": 0.291, + "step": 1144 + }, + { + "epoch": 1.2859750105292713, + "grad_norm": 0.4835082015975579, + "learning_rate": 1.8529985884428855e-05, + "loss": 0.2957, + "step": 1145 + }, + { + "epoch": 1.2870981328092097, + "grad_norm": 0.44167167547950387, + "learning_rate": 1.8525889281189654e-05, + "loss": 0.2927, + "step": 1146 + }, + { + "epoch": 1.2882212550891479, + "grad_norm": 0.4526647766195691, + "learning_rate": 1.8521787431876954e-05, + "loss": 0.2801, + "step": 1147 + }, + { + "epoch": 1.289344377369086, + "grad_norm": 0.4568245323185659, + "learning_rate": 1.8517680339014667e-05, + "loss": 0.2648, + "step": 1148 + }, + { + "epoch": 1.2904674996490244, + "grad_norm": 0.4883019800865997, + "learning_rate": 1.8513568005129937e-05, + "loss": 0.2865, + "step": 1149 + }, + { + "epoch": 1.2915906219289626, + "grad_norm": 0.47025667774471347, + "learning_rate": 1.8509450432753123e-05, + "loss": 0.2938, + "step": 1150 + }, + { + "epoch": 1.2927137442089007, + "grad_norm": 0.4428247847588197, + "learning_rate": 1.8505327624417816e-05, + "loss": 0.2732, + "step": 1151 + }, + { + "epoch": 1.293836866488839, + "grad_norm": 0.4774238303086379, + "learning_rate": 1.8501199582660824e-05, + "loss": 0.3058, + "step": 1152 + }, + { + "epoch": 1.2949599887687773, + "grad_norm": 0.4948410577106829, + "learning_rate": 1.849706631002218e-05, + "loss": 0.3044, + "step": 1153 + }, + { + "epoch": 1.2960831110487154, + "grad_norm": 0.48963021776942295, + "learning_rate": 1.849292780904513e-05, + "loss": 0.3007, + "step": 1154 + }, + { + "epoch": 1.2972062333286536, + "grad_norm": 0.4741585592952024, + "learning_rate": 1.8488784082276137e-05, + "loss": 0.2818, + "step": 1155 + }, + { + "epoch": 1.2983293556085918, + "grad_norm": 0.4604185684567528, + "learning_rate": 1.848463513226488e-05, + "loss": 0.2471, + "step": 1156 + }, + { + "epoch": 1.2994524778885301, + "grad_norm": 0.4877301769198903, + "learning_rate": 1.848048096156426e-05, + "loss": 0.2926, + "step": 1157 + }, + { + "epoch": 1.3005756001684683, + "grad_norm": 0.4742673297417078, + "learning_rate": 1.8476321572730382e-05, + "loss": 0.2907, + "step": 1158 + }, + { + "epoch": 1.3016987224484065, + "grad_norm": 0.4748985003791319, + "learning_rate": 1.847215696832256e-05, + "loss": 0.2857, + "step": 1159 + }, + { + "epoch": 1.3028218447283448, + "grad_norm": 0.4785822244074492, + "learning_rate": 1.8467987150903325e-05, + "loss": 0.2896, + "step": 1160 + }, + { + "epoch": 1.303944967008283, + "grad_norm": 0.4852477184928872, + "learning_rate": 1.846381212303841e-05, + "loss": 0.3077, + "step": 1161 + }, + { + "epoch": 1.3050680892882212, + "grad_norm": 0.46752327714434433, + "learning_rate": 1.8459631887296757e-05, + "loss": 0.3059, + "step": 1162 + }, + { + "epoch": 1.3061912115681595, + "grad_norm": 0.491665240404064, + "learning_rate": 1.8455446446250508e-05, + "loss": 0.2982, + "step": 1163 + }, + { + "epoch": 1.3073143338480977, + "grad_norm": 0.4481191351234828, + "learning_rate": 1.8451255802475014e-05, + "loss": 0.2835, + "step": 1164 + }, + { + "epoch": 1.3084374561280359, + "grad_norm": 0.4419919772694303, + "learning_rate": 1.8447059958548822e-05, + "loss": 0.2744, + "step": 1165 + }, + { + "epoch": 1.3095605784079742, + "grad_norm": 0.46574040350278373, + "learning_rate": 1.8442858917053682e-05, + "loss": 0.291, + "step": 1166 + }, + { + "epoch": 1.3106837006879124, + "grad_norm": 0.43091837390173826, + "learning_rate": 1.843865268057454e-05, + "loss": 0.2657, + "step": 1167 + }, + { + "epoch": 1.3118068229678506, + "grad_norm": 0.4642199123609139, + "learning_rate": 1.843444125169954e-05, + "loss": 0.2876, + "step": 1168 + }, + { + "epoch": 1.312929945247789, + "grad_norm": 0.457887008950542, + "learning_rate": 1.843022463302002e-05, + "loss": 0.2727, + "step": 1169 + }, + { + "epoch": 1.314053067527727, + "grad_norm": 0.44022972541172534, + "learning_rate": 1.8426002827130517e-05, + "loss": 0.2892, + "step": 1170 + }, + { + "epoch": 1.3151761898076653, + "grad_norm": 0.4621964331966014, + "learning_rate": 1.842177583662875e-05, + "loss": 0.3, + "step": 1171 + }, + { + "epoch": 1.3162993120876036, + "grad_norm": 0.44527517838102726, + "learning_rate": 1.8417543664115632e-05, + "loss": 0.286, + "step": 1172 + }, + { + "epoch": 1.3174224343675418, + "grad_norm": 0.455981271490681, + "learning_rate": 1.8413306312195265e-05, + "loss": 0.2785, + "step": 1173 + }, + { + "epoch": 1.31854555664748, + "grad_norm": 0.47535325172610315, + "learning_rate": 1.840906378347494e-05, + "loss": 0.3106, + "step": 1174 + }, + { + "epoch": 1.3196686789274183, + "grad_norm": 0.4559106843629929, + "learning_rate": 1.8404816080565133e-05, + "loss": 0.2955, + "step": 1175 + }, + { + "epoch": 1.3207918012073565, + "grad_norm": 0.4772186174990515, + "learning_rate": 1.84005632060795e-05, + "loss": 0.2859, + "step": 1176 + }, + { + "epoch": 1.3219149234872947, + "grad_norm": 0.5126527615592691, + "learning_rate": 1.8396305162634885e-05, + "loss": 0.3171, + "step": 1177 + }, + { + "epoch": 1.3230380457672328, + "grad_norm": 0.4863596644425538, + "learning_rate": 1.83920419528513e-05, + "loss": 0.3076, + "step": 1178 + }, + { + "epoch": 1.3241611680471712, + "grad_norm": 0.4329722715025753, + "learning_rate": 1.838777357935196e-05, + "loss": 0.2594, + "step": 1179 + }, + { + "epoch": 1.3252842903271094, + "grad_norm": 0.4735307567938337, + "learning_rate": 1.8383500044763226e-05, + "loss": 0.2752, + "step": 1180 + }, + { + "epoch": 1.3264074126070475, + "grad_norm": 0.4708270647203179, + "learning_rate": 1.837922135171466e-05, + "loss": 0.3007, + "step": 1181 + }, + { + "epoch": 1.3275305348869857, + "grad_norm": 0.4328720336288202, + "learning_rate": 1.837493750283899e-05, + "loss": 0.2743, + "step": 1182 + }, + { + "epoch": 1.328653657166924, + "grad_norm": 0.45831796385684703, + "learning_rate": 1.8370648500772107e-05, + "loss": 0.2961, + "step": 1183 + }, + { + "epoch": 1.3297767794468622, + "grad_norm": 0.4507244345775207, + "learning_rate": 1.836635434815309e-05, + "loss": 0.2854, + "step": 1184 + }, + { + "epoch": 1.3308999017268004, + "grad_norm": 0.454897865730844, + "learning_rate": 1.8362055047624175e-05, + "loss": 0.2685, + "step": 1185 + }, + { + "epoch": 1.3320230240067388, + "grad_norm": 0.46791986097149424, + "learning_rate": 1.835775060183077e-05, + "loss": 0.2824, + "step": 1186 + }, + { + "epoch": 1.333146146286677, + "grad_norm": 0.47734391363275325, + "learning_rate": 1.8353441013421445e-05, + "loss": 0.2919, + "step": 1187 + }, + { + "epoch": 1.334269268566615, + "grad_norm": 0.4525116496155753, + "learning_rate": 1.8349126285047937e-05, + "loss": 0.2897, + "step": 1188 + }, + { + "epoch": 1.3353923908465535, + "grad_norm": 0.46057547898872325, + "learning_rate": 1.8344806419365152e-05, + "loss": 0.2959, + "step": 1189 + }, + { + "epoch": 1.3365155131264916, + "grad_norm": 0.43961943911433504, + "learning_rate": 1.8340481419031146e-05, + "loss": 0.2717, + "step": 1190 + }, + { + "epoch": 1.3376386354064298, + "grad_norm": 0.46908924741585806, + "learning_rate": 1.833615128670714e-05, + "loss": 0.2921, + "step": 1191 + }, + { + "epoch": 1.3387617576863682, + "grad_norm": 0.4619903044568609, + "learning_rate": 1.8331816025057508e-05, + "loss": 0.2898, + "step": 1192 + }, + { + "epoch": 1.3398848799663063, + "grad_norm": 0.44532555047844774, + "learning_rate": 1.8327475636749793e-05, + "loss": 0.2824, + "step": 1193 + }, + { + "epoch": 1.3410080022462445, + "grad_norm": 0.4770456698496007, + "learning_rate": 1.8323130124454676e-05, + "loss": 0.2934, + "step": 1194 + }, + { + "epoch": 1.3421311245261829, + "grad_norm": 0.5013999364371049, + "learning_rate": 1.8318779490846005e-05, + "loss": 0.3106, + "step": 1195 + }, + { + "epoch": 1.343254246806121, + "grad_norm": 0.4462481861250692, + "learning_rate": 1.8314423738600765e-05, + "loss": 0.2916, + "step": 1196 + }, + { + "epoch": 1.3443773690860592, + "grad_norm": 0.4544363504817102, + "learning_rate": 1.8310062870399105e-05, + "loss": 0.2768, + "step": 1197 + }, + { + "epoch": 1.3455004913659976, + "grad_norm": 0.4739036937639808, + "learning_rate": 1.8305696888924312e-05, + "loss": 0.2818, + "step": 1198 + }, + { + "epoch": 1.3466236136459357, + "grad_norm": 0.4251780111412686, + "learning_rate": 1.8301325796862825e-05, + "loss": 0.2618, + "step": 1199 + }, + { + "epoch": 1.347746735925874, + "grad_norm": 0.4905216280674908, + "learning_rate": 1.829694959690422e-05, + "loss": 0.2881, + "step": 1200 + }, + { + "epoch": 1.3488698582058123, + "grad_norm": 0.47365961695255354, + "learning_rate": 1.8292568291741228e-05, + "loss": 0.2954, + "step": 1201 + }, + { + "epoch": 1.3499929804857504, + "grad_norm": 0.46299602153786523, + "learning_rate": 1.8288181884069707e-05, + "loss": 0.284, + "step": 1202 + }, + { + "epoch": 1.3511161027656886, + "grad_norm": 0.44021839710368676, + "learning_rate": 1.828379037658867e-05, + "loss": 0.2685, + "step": 1203 + }, + { + "epoch": 1.3522392250456268, + "grad_norm": 0.5131025179704897, + "learning_rate": 1.827939377200025e-05, + "loss": 0.3053, + "step": 1204 + }, + { + "epoch": 1.3533623473255652, + "grad_norm": 0.44547118449274636, + "learning_rate": 1.8274992073009736e-05, + "loss": 0.289, + "step": 1205 + }, + { + "epoch": 1.3544854696055033, + "grad_norm": 0.44524428838589575, + "learning_rate": 1.827058528232553e-05, + "loss": 0.2849, + "step": 1206 + }, + { + "epoch": 1.3556085918854415, + "grad_norm": 0.4649075563603362, + "learning_rate": 1.8266173402659193e-05, + "loss": 0.2807, + "step": 1207 + }, + { + "epoch": 1.3567317141653796, + "grad_norm": 0.48613198722949613, + "learning_rate": 1.826175643672539e-05, + "loss": 0.2949, + "step": 1208 + }, + { + "epoch": 1.357854836445318, + "grad_norm": 0.43266914602095147, + "learning_rate": 1.8257334387241944e-05, + "loss": 0.2712, + "step": 1209 + }, + { + "epoch": 1.3589779587252562, + "grad_norm": 0.4612767862308836, + "learning_rate": 1.8252907256929777e-05, + "loss": 0.2802, + "step": 1210 + }, + { + "epoch": 1.3601010810051943, + "grad_norm": 0.48733069495425535, + "learning_rate": 1.8248475048512956e-05, + "loss": 0.2995, + "step": 1211 + }, + { + "epoch": 1.3612242032851327, + "grad_norm": 0.4633919892138498, + "learning_rate": 1.8244037764718666e-05, + "loss": 0.2945, + "step": 1212 + }, + { + "epoch": 1.3623473255650709, + "grad_norm": 0.48010248333965416, + "learning_rate": 1.8239595408277216e-05, + "loss": 0.2702, + "step": 1213 + }, + { + "epoch": 1.363470447845009, + "grad_norm": 0.44254247447818046, + "learning_rate": 1.8235147981922042e-05, + "loss": 0.2797, + "step": 1214 + }, + { + "epoch": 1.3645935701249474, + "grad_norm": 0.4685652855739763, + "learning_rate": 1.8230695488389688e-05, + "loss": 0.2994, + "step": 1215 + }, + { + "epoch": 1.3657166924048856, + "grad_norm": 0.46157477459451246, + "learning_rate": 1.822623793041983e-05, + "loss": 0.2945, + "step": 1216 + }, + { + "epoch": 1.3668398146848237, + "grad_norm": 0.4509218965365652, + "learning_rate": 1.8221775310755247e-05, + "loss": 0.286, + "step": 1217 + }, + { + "epoch": 1.3679629369647621, + "grad_norm": 0.4584479264400474, + "learning_rate": 1.8217307632141835e-05, + "loss": 0.2933, + "step": 1218 + }, + { + "epoch": 1.3690860592447003, + "grad_norm": 0.4985130465087838, + "learning_rate": 1.8212834897328614e-05, + "loss": 0.3045, + "step": 1219 + }, + { + "epoch": 1.3702091815246384, + "grad_norm": 0.47337122837553836, + "learning_rate": 1.82083571090677e-05, + "loss": 0.2836, + "step": 1220 + }, + { + "epoch": 1.3713323038045768, + "grad_norm": 0.45448456689959993, + "learning_rate": 1.8203874270114327e-05, + "loss": 0.2708, + "step": 1221 + }, + { + "epoch": 1.372455426084515, + "grad_norm": 0.4893145984532352, + "learning_rate": 1.8199386383226835e-05, + "loss": 0.2826, + "step": 1222 + }, + { + "epoch": 1.3735785483644531, + "grad_norm": 0.44417991047867217, + "learning_rate": 1.8194893451166673e-05, + "loss": 0.2821, + "step": 1223 + }, + { + "epoch": 1.3747016706443915, + "grad_norm": 0.44598812540160476, + "learning_rate": 1.819039547669839e-05, + "loss": 0.2774, + "step": 1224 + }, + { + "epoch": 1.3758247929243297, + "grad_norm": 0.4776707667120244, + "learning_rate": 1.818589246258964e-05, + "loss": 0.3122, + "step": 1225 + }, + { + "epoch": 1.3769479152042678, + "grad_norm": 0.5082949999474378, + "learning_rate": 1.8181384411611173e-05, + "loss": 0.2948, + "step": 1226 + }, + { + "epoch": 1.3780710374842062, + "grad_norm": 0.4424043135503394, + "learning_rate": 1.817687132653685e-05, + "loss": 0.2743, + "step": 1227 + }, + { + "epoch": 1.3791941597641444, + "grad_norm": 0.48246812847596726, + "learning_rate": 1.8172353210143613e-05, + "loss": 0.3154, + "step": 1228 + }, + { + "epoch": 1.3803172820440825, + "grad_norm": 0.4637624706766306, + "learning_rate": 1.8167830065211513e-05, + "loss": 0.3047, + "step": 1229 + }, + { + "epoch": 1.3814404043240207, + "grad_norm": 0.47224752107030954, + "learning_rate": 1.8163301894523695e-05, + "loss": 0.3023, + "step": 1230 + }, + { + "epoch": 1.3825635266039589, + "grad_norm": 0.4698090761389382, + "learning_rate": 1.8158768700866386e-05, + "loss": 0.2635, + "step": 1231 + }, + { + "epoch": 1.3836866488838973, + "grad_norm": 0.4957909790614304, + "learning_rate": 1.8154230487028913e-05, + "loss": 0.2928, + "step": 1232 + }, + { + "epoch": 1.3848097711638354, + "grad_norm": 0.44768649725073345, + "learning_rate": 1.8149687255803687e-05, + "loss": 0.2874, + "step": 1233 + }, + { + "epoch": 1.3859328934437736, + "grad_norm": 0.4758485891594923, + "learning_rate": 1.814513900998621e-05, + "loss": 0.2996, + "step": 1234 + }, + { + "epoch": 1.387056015723712, + "grad_norm": 0.47272282269055016, + "learning_rate": 1.8140585752375063e-05, + "loss": 0.2884, + "step": 1235 + }, + { + "epoch": 1.3881791380036501, + "grad_norm": 0.45230047857816796, + "learning_rate": 1.8136027485771926e-05, + "loss": 0.2767, + "step": 1236 + }, + { + "epoch": 1.3893022602835883, + "grad_norm": 0.4507150818799199, + "learning_rate": 1.813146421298154e-05, + "loss": 0.2884, + "step": 1237 + }, + { + "epoch": 1.3904253825635267, + "grad_norm": 0.45879614204982117, + "learning_rate": 1.8126895936811745e-05, + "loss": 0.2708, + "step": 1238 + }, + { + "epoch": 1.3915485048434648, + "grad_norm": 0.45661246563867913, + "learning_rate": 1.812232266007344e-05, + "loss": 0.2737, + "step": 1239 + }, + { + "epoch": 1.392671627123403, + "grad_norm": 0.4601276790260715, + "learning_rate": 1.8117744385580627e-05, + "loss": 0.2938, + "step": 1240 + }, + { + "epoch": 1.3937947494033414, + "grad_norm": 0.48293515299373707, + "learning_rate": 1.8113161116150356e-05, + "loss": 0.2847, + "step": 1241 + }, + { + "epoch": 1.3949178716832795, + "grad_norm": 0.5135873286633734, + "learning_rate": 1.8108572854602774e-05, + "loss": 0.3047, + "step": 1242 + }, + { + "epoch": 1.3960409939632177, + "grad_norm": 0.49511291405284924, + "learning_rate": 1.8103979603761084e-05, + "loss": 0.296, + "step": 1243 + }, + { + "epoch": 1.397164116243156, + "grad_norm": 0.4915844024660979, + "learning_rate": 1.8099381366451562e-05, + "loss": 0.3025, + "step": 1244 + }, + { + "epoch": 1.3982872385230942, + "grad_norm": 0.457360486981577, + "learning_rate": 1.8094778145503555e-05, + "loss": 0.2809, + "step": 1245 + }, + { + "epoch": 1.3994103608030324, + "grad_norm": 0.45194447137874866, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.2815, + "step": 1246 + }, + { + "epoch": 1.4005334830829708, + "grad_norm": 0.469717199113566, + "learning_rate": 1.8085556764024804e-05, + "loss": 0.2844, + "step": 1247 + }, + { + "epoch": 1.401656605362909, + "grad_norm": 0.461335245541827, + "learning_rate": 1.8080938609168073e-05, + "loss": 0.2825, + "step": 1248 + }, + { + "epoch": 1.402779727642847, + "grad_norm": 0.4710994100251086, + "learning_rate": 1.8076315482020893e-05, + "loss": 0.2925, + "step": 1249 + }, + { + "epoch": 1.4039028499227855, + "grad_norm": 0.46856919190627966, + "learning_rate": 1.8071687385427922e-05, + "loss": 0.2925, + "step": 1250 + }, + { + "epoch": 1.4050259722027236, + "grad_norm": 0.463686961189385, + "learning_rate": 1.8067054322236876e-05, + "loss": 0.2982, + "step": 1251 + }, + { + "epoch": 1.4061490944826618, + "grad_norm": 0.4233668907762543, + "learning_rate": 1.806241629529853e-05, + "loss": 0.279, + "step": 1252 + }, + { + "epoch": 1.4072722167626, + "grad_norm": 0.4446703109276029, + "learning_rate": 1.8057773307466717e-05, + "loss": 0.2688, + "step": 1253 + }, + { + "epoch": 1.4083953390425383, + "grad_norm": 0.4485358287975487, + "learning_rate": 1.8053125361598314e-05, + "loss": 0.2947, + "step": 1254 + }, + { + "epoch": 1.4095184613224765, + "grad_norm": 0.4452525960821962, + "learning_rate": 1.804847246055326e-05, + "loss": 0.2916, + "step": 1255 + }, + { + "epoch": 1.4106415836024147, + "grad_norm": 0.44434990069489066, + "learning_rate": 1.8043814607194528e-05, + "loss": 0.2957, + "step": 1256 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.45782505420166125, + "learning_rate": 1.803915180438815e-05, + "loss": 0.3081, + "step": 1257 + }, + { + "epoch": 1.4128878281622912, + "grad_norm": 0.44812612511388755, + "learning_rate": 1.80344840550032e-05, + "loss": 0.2925, + "step": 1258 + }, + { + "epoch": 1.4140109504422294, + "grad_norm": 0.4653220459251436, + "learning_rate": 1.8029811361911796e-05, + "loss": 0.284, + "step": 1259 + }, + { + "epoch": 1.4151340727221675, + "grad_norm": 0.4294806313427792, + "learning_rate": 1.8025133727989095e-05, + "loss": 0.2615, + "step": 1260 + }, + { + "epoch": 1.416257195002106, + "grad_norm": 0.4722245406377723, + "learning_rate": 1.8020451156113302e-05, + "loss": 0.2932, + "step": 1261 + }, + { + "epoch": 1.417380317282044, + "grad_norm": 0.4407259836445125, + "learning_rate": 1.801576364916565e-05, + "loss": 0.2783, + "step": 1262 + }, + { + "epoch": 1.4185034395619822, + "grad_norm": 0.4554354849309488, + "learning_rate": 1.8011071210030417e-05, + "loss": 0.2934, + "step": 1263 + }, + { + "epoch": 1.4196265618419206, + "grad_norm": 0.4797437708219925, + "learning_rate": 1.8006373841594905e-05, + "loss": 0.2914, + "step": 1264 + }, + { + "epoch": 1.4207496841218588, + "grad_norm": 0.4453109721657388, + "learning_rate": 1.8001671546749466e-05, + "loss": 0.273, + "step": 1265 + }, + { + "epoch": 1.421872806401797, + "grad_norm": 0.455616539511479, + "learning_rate": 1.7996964328387473e-05, + "loss": 0.2787, + "step": 1266 + }, + { + "epoch": 1.4229959286817353, + "grad_norm": 0.4660177496804102, + "learning_rate": 1.7992252189405318e-05, + "loss": 0.288, + "step": 1267 + }, + { + "epoch": 1.4241190509616735, + "grad_norm": 0.4868499349359341, + "learning_rate": 1.798753513270245e-05, + "loss": 0.3011, + "step": 1268 + }, + { + "epoch": 1.4252421732416116, + "grad_norm": 0.42379910255068853, + "learning_rate": 1.798281316118131e-05, + "loss": 0.2688, + "step": 1269 + }, + { + "epoch": 1.42636529552155, + "grad_norm": 0.44137517215745187, + "learning_rate": 1.797808627774738e-05, + "loss": 0.2779, + "step": 1270 + }, + { + "epoch": 1.4274884178014882, + "grad_norm": 0.4454397017855999, + "learning_rate": 1.7973354485309178e-05, + "loss": 0.2885, + "step": 1271 + }, + { + "epoch": 1.4286115400814263, + "grad_norm": 0.41513228116253365, + "learning_rate": 1.7968617786778214e-05, + "loss": 0.258, + "step": 1272 + }, + { + "epoch": 1.4297346623613647, + "grad_norm": 0.45824072542709143, + "learning_rate": 1.7963876185069032e-05, + "loss": 0.2982, + "step": 1273 + }, + { + "epoch": 1.4308577846413029, + "grad_norm": 0.45152285012254323, + "learning_rate": 1.7959129683099202e-05, + "loss": 0.2926, + "step": 1274 + }, + { + "epoch": 1.431980906921241, + "grad_norm": 0.4382850255611188, + "learning_rate": 1.795437828378929e-05, + "loss": 0.2748, + "step": 1275 + }, + { + "epoch": 1.4331040292011794, + "grad_norm": 0.45046526339966464, + "learning_rate": 1.7949621990062882e-05, + "loss": 0.2834, + "step": 1276 + }, + { + "epoch": 1.4342271514811176, + "grad_norm": 0.45318087991408496, + "learning_rate": 1.7944860804846585e-05, + "loss": 0.2921, + "step": 1277 + }, + { + "epoch": 1.4353502737610557, + "grad_norm": 0.4389584828107915, + "learning_rate": 1.7940094731070005e-05, + "loss": 0.2711, + "step": 1278 + }, + { + "epoch": 1.4364733960409939, + "grad_norm": 0.4551598275028435, + "learning_rate": 1.793532377166576e-05, + "loss": 0.2788, + "step": 1279 + }, + { + "epoch": 1.4375965183209323, + "grad_norm": 0.46450024527966405, + "learning_rate": 1.793054792956947e-05, + "loss": 0.2937, + "step": 1280 + }, + { + "epoch": 1.4387196406008704, + "grad_norm": 0.4631044556099893, + "learning_rate": 1.7925767207719774e-05, + "loss": 0.2961, + "step": 1281 + }, + { + "epoch": 1.4398427628808086, + "grad_norm": 0.4292656254224395, + "learning_rate": 1.792098160905829e-05, + "loss": 0.2646, + "step": 1282 + }, + { + "epoch": 1.4409658851607468, + "grad_norm": 0.47873898790679503, + "learning_rate": 1.791619113652966e-05, + "loss": 0.2891, + "step": 1283 + }, + { + "epoch": 1.4420890074406851, + "grad_norm": 0.46176874055866274, + "learning_rate": 1.7911395793081508e-05, + "loss": 0.3053, + "step": 1284 + }, + { + "epoch": 1.4432121297206233, + "grad_norm": 0.43168104217807524, + "learning_rate": 1.7906595581664462e-05, + "loss": 0.2757, + "step": 1285 + }, + { + "epoch": 1.4443352520005615, + "grad_norm": 0.4526156341532352, + "learning_rate": 1.790179050523215e-05, + "loss": 0.2906, + "step": 1286 + }, + { + "epoch": 1.4454583742804998, + "grad_norm": 0.4576287167509249, + "learning_rate": 1.7896980566741183e-05, + "loss": 0.3055, + "step": 1287 + }, + { + "epoch": 1.446581496560438, + "grad_norm": 0.4291349675446234, + "learning_rate": 1.7892165769151174e-05, + "loss": 0.2797, + "step": 1288 + }, + { + "epoch": 1.4477046188403762, + "grad_norm": 0.4330178308786235, + "learning_rate": 1.7887346115424712e-05, + "loss": 0.2879, + "step": 1289 + }, + { + "epoch": 1.4488277411203145, + "grad_norm": 0.4409857238545325, + "learning_rate": 1.7882521608527393e-05, + "loss": 0.277, + "step": 1290 + }, + { + "epoch": 1.4499508634002527, + "grad_norm": 0.46396778078552625, + "learning_rate": 1.7877692251427783e-05, + "loss": 0.292, + "step": 1291 + }, + { + "epoch": 1.4510739856801909, + "grad_norm": 0.4583722733176541, + "learning_rate": 1.7872858047097442e-05, + "loss": 0.287, + "step": 1292 + }, + { + "epoch": 1.4521971079601292, + "grad_norm": 0.4617901289674729, + "learning_rate": 1.7868018998510907e-05, + "loss": 0.2931, + "step": 1293 + }, + { + "epoch": 1.4533202302400674, + "grad_norm": 0.4688919751696024, + "learning_rate": 1.7863175108645698e-05, + "loss": 0.2795, + "step": 1294 + }, + { + "epoch": 1.4544433525200056, + "grad_norm": 0.473513507634666, + "learning_rate": 1.7858326380482313e-05, + "loss": 0.3136, + "step": 1295 + }, + { + "epoch": 1.455566474799944, + "grad_norm": 0.46308815674849557, + "learning_rate": 1.7853472817004235e-05, + "loss": 0.2871, + "step": 1296 + }, + { + "epoch": 1.456689597079882, + "grad_norm": 0.5179959894108842, + "learning_rate": 1.7848614421197903e-05, + "loss": 0.3007, + "step": 1297 + }, + { + "epoch": 1.4578127193598203, + "grad_norm": 0.4531340269397126, + "learning_rate": 1.784375119605275e-05, + "loss": 0.2959, + "step": 1298 + }, + { + "epoch": 1.4589358416397586, + "grad_norm": 0.4582919369256917, + "learning_rate": 1.783888314456117e-05, + "loss": 0.2894, + "step": 1299 + }, + { + "epoch": 1.4600589639196968, + "grad_norm": 0.4469673584516157, + "learning_rate": 1.7834010269718526e-05, + "loss": 0.28, + "step": 1300 + }, + { + "epoch": 1.461182086199635, + "grad_norm": 0.46123285741882053, + "learning_rate": 1.7829132574523155e-05, + "loss": 0.2832, + "step": 1301 + }, + { + "epoch": 1.4623052084795733, + "grad_norm": 0.45607801195534514, + "learning_rate": 1.7824250061976355e-05, + "loss": 0.287, + "step": 1302 + }, + { + "epoch": 1.4634283307595115, + "grad_norm": 0.49530717522953166, + "learning_rate": 1.7819362735082392e-05, + "loss": 0.2993, + "step": 1303 + }, + { + "epoch": 1.4645514530394497, + "grad_norm": 0.4809186057767541, + "learning_rate": 1.7814470596848486e-05, + "loss": 0.3111, + "step": 1304 + }, + { + "epoch": 1.4656745753193878, + "grad_norm": 0.4609939280068489, + "learning_rate": 1.780957365028483e-05, + "loss": 0.3016, + "step": 1305 + }, + { + "epoch": 1.466797697599326, + "grad_norm": 0.5159740391114742, + "learning_rate": 1.7804671898404567e-05, + "loss": 0.303, + "step": 1306 + }, + { + "epoch": 1.4679208198792644, + "grad_norm": 0.45316092784328266, + "learning_rate": 1.7799765344223798e-05, + "loss": 0.2806, + "step": 1307 + }, + { + "epoch": 1.4690439421592025, + "grad_norm": 0.4929344112281057, + "learning_rate": 1.7794853990761576e-05, + "loss": 0.3244, + "step": 1308 + }, + { + "epoch": 1.4701670644391407, + "grad_norm": 0.4240215914027117, + "learning_rate": 1.778993784103992e-05, + "loss": 0.2649, + "step": 1309 + }, + { + "epoch": 1.471290186719079, + "grad_norm": 0.45336019964413365, + "learning_rate": 1.7785016898083786e-05, + "loss": 0.281, + "step": 1310 + }, + { + "epoch": 1.4724133089990172, + "grad_norm": 0.4709195975090023, + "learning_rate": 1.778009116492108e-05, + "loss": 0.3032, + "step": 1311 + }, + { + "epoch": 1.4735364312789554, + "grad_norm": 0.4378426156461344, + "learning_rate": 1.7775160644582667e-05, + "loss": 0.2773, + "step": 1312 + }, + { + "epoch": 1.4746595535588938, + "grad_norm": 0.4413617070461194, + "learning_rate": 1.777022534010235e-05, + "loss": 0.295, + "step": 1313 + }, + { + "epoch": 1.475782675838832, + "grad_norm": 0.45700233268388313, + "learning_rate": 1.776528525451687e-05, + "loss": 0.2909, + "step": 1314 + }, + { + "epoch": 1.47690579811877, + "grad_norm": 0.45427285254943417, + "learning_rate": 1.776034039086592e-05, + "loss": 0.3058, + "step": 1315 + }, + { + "epoch": 1.4780289203987085, + "grad_norm": 0.45339655669768186, + "learning_rate": 1.775539075219213e-05, + "loss": 0.2792, + "step": 1316 + }, + { + "epoch": 1.4791520426786466, + "grad_norm": 0.47160522784360365, + "learning_rate": 1.7750436341541066e-05, + "loss": 0.2998, + "step": 1317 + }, + { + "epoch": 1.4802751649585848, + "grad_norm": 0.45954229553978265, + "learning_rate": 1.774547716196123e-05, + "loss": 0.2871, + "step": 1318 + }, + { + "epoch": 1.4813982872385232, + "grad_norm": 0.46825968988664846, + "learning_rate": 1.7740513216504064e-05, + "loss": 0.3125, + "step": 1319 + }, + { + "epoch": 1.4825214095184613, + "grad_norm": 0.4514724212464635, + "learning_rate": 1.7735544508223933e-05, + "loss": 0.2851, + "step": 1320 + }, + { + "epoch": 1.4836445317983995, + "grad_norm": 0.45647208589810134, + "learning_rate": 1.773057104017814e-05, + "loss": 0.2871, + "step": 1321 + }, + { + "epoch": 1.4847676540783379, + "grad_norm": 0.45834495190642427, + "learning_rate": 1.772559281542692e-05, + "loss": 0.2929, + "step": 1322 + }, + { + "epoch": 1.485890776358276, + "grad_norm": 0.4614837916910236, + "learning_rate": 1.7720609837033417e-05, + "loss": 0.2978, + "step": 1323 + }, + { + "epoch": 1.4870138986382142, + "grad_norm": 0.44184030726899937, + "learning_rate": 1.7715622108063725e-05, + "loss": 0.2663, + "step": 1324 + }, + { + "epoch": 1.4881370209181526, + "grad_norm": 0.44711212739545164, + "learning_rate": 1.771062963158684e-05, + "loss": 0.2968, + "step": 1325 + }, + { + "epoch": 1.4892601431980907, + "grad_norm": 0.44221975823221676, + "learning_rate": 1.770563241067469e-05, + "loss": 0.2952, + "step": 1326 + }, + { + "epoch": 1.490383265478029, + "grad_norm": 0.5126809383713152, + "learning_rate": 1.7700630448402125e-05, + "loss": 0.3161, + "step": 1327 + }, + { + "epoch": 1.491506387757967, + "grad_norm": 0.4410073711767202, + "learning_rate": 1.76956237478469e-05, + "loss": 0.2738, + "step": 1328 + }, + { + "epoch": 1.4926295100379054, + "grad_norm": 0.45627309597281607, + "learning_rate": 1.7690612312089702e-05, + "loss": 0.2853, + "step": 1329 + }, + { + "epoch": 1.4937526323178436, + "grad_norm": 0.5028458766464338, + "learning_rate": 1.768559614421411e-05, + "loss": 0.3025, + "step": 1330 + }, + { + "epoch": 1.4948757545977818, + "grad_norm": 0.45766426852181796, + "learning_rate": 1.768057524730664e-05, + "loss": 0.293, + "step": 1331 + }, + { + "epoch": 1.49599887687772, + "grad_norm": 0.45597657632110705, + "learning_rate": 1.7675549624456695e-05, + "loss": 0.3038, + "step": 1332 + }, + { + "epoch": 1.4971219991576583, + "grad_norm": 0.4499374531210488, + "learning_rate": 1.7670519278756603e-05, + "loss": 0.2815, + "step": 1333 + }, + { + "epoch": 1.4982451214375965, + "grad_norm": 0.4707335152594058, + "learning_rate": 1.7665484213301587e-05, + "loss": 0.303, + "step": 1334 + }, + { + "epoch": 1.4993682437175346, + "grad_norm": 0.4781982752780198, + "learning_rate": 1.766044443118978e-05, + "loss": 0.2965, + "step": 1335 + }, + { + "epoch": 1.500491365997473, + "grad_norm": 0.47356503106211406, + "learning_rate": 1.7655399935522216e-05, + "loss": 0.2957, + "step": 1336 + }, + { + "epoch": 1.5016144882774112, + "grad_norm": 0.46119467696838445, + "learning_rate": 1.765035072940283e-05, + "loss": 0.2902, + "step": 1337 + }, + { + "epoch": 1.5027376105573493, + "grad_norm": 0.44367007330882113, + "learning_rate": 1.764529681593845e-05, + "loss": 0.2745, + "step": 1338 + }, + { + "epoch": 1.5038607328372877, + "grad_norm": 0.46264595602282504, + "learning_rate": 1.7640238198238803e-05, + "loss": 0.2712, + "step": 1339 + }, + { + "epoch": 1.5049838551172259, + "grad_norm": 0.44037392967410266, + "learning_rate": 1.763517487941652e-05, + "loss": 0.2778, + "step": 1340 + }, + { + "epoch": 1.506106977397164, + "grad_norm": 0.4361452264395535, + "learning_rate": 1.763010686258711e-05, + "loss": 0.2773, + "step": 1341 + }, + { + "epoch": 1.5072300996771024, + "grad_norm": 0.5842833735748904, + "learning_rate": 1.7625034150868983e-05, + "loss": 0.3086, + "step": 1342 + }, + { + "epoch": 1.5083532219570406, + "grad_norm": 0.43427680840538785, + "learning_rate": 1.7619956747383435e-05, + "loss": 0.2765, + "step": 1343 + }, + { + "epoch": 1.5094763442369787, + "grad_norm": 0.4179475986040606, + "learning_rate": 1.7614874655254644e-05, + "loss": 0.2706, + "step": 1344 + }, + { + "epoch": 1.5105994665169171, + "grad_norm": 0.44696851307851265, + "learning_rate": 1.7609787877609678e-05, + "loss": 0.2932, + "step": 1345 + }, + { + "epoch": 1.5117225887968553, + "grad_norm": 0.43430140397443595, + "learning_rate": 1.760469641757849e-05, + "loss": 0.2748, + "step": 1346 + }, + { + "epoch": 1.5128457110767934, + "grad_norm": 0.4164245825608252, + "learning_rate": 1.7599600278293915e-05, + "loss": 0.2765, + "step": 1347 + }, + { + "epoch": 1.5139688333567318, + "grad_norm": 0.4411217535356976, + "learning_rate": 1.7594499462891654e-05, + "loss": 0.2671, + "step": 1348 + }, + { + "epoch": 1.51509195563667, + "grad_norm": 0.47168354723847505, + "learning_rate": 1.7589393974510304e-05, + "loss": 0.2663, + "step": 1349 + }, + { + "epoch": 1.5162150779166081, + "grad_norm": 0.45298191218773637, + "learning_rate": 1.758428381629132e-05, + "loss": 0.2977, + "step": 1350 + }, + { + "epoch": 1.5173382001965465, + "grad_norm": 0.48015367678036425, + "learning_rate": 1.7579168991379042e-05, + "loss": 0.3073, + "step": 1351 + }, + { + "epoch": 1.5184613224764845, + "grad_norm": 0.450921374315479, + "learning_rate": 1.757404950292068e-05, + "loss": 0.2981, + "step": 1352 + }, + { + "epoch": 1.5195844447564228, + "grad_norm": 0.4511950623917902, + "learning_rate": 1.7568925354066313e-05, + "loss": 0.2715, + "step": 1353 + }, + { + "epoch": 1.5207075670363612, + "grad_norm": 0.4645619454896311, + "learning_rate": 1.756379654796888e-05, + "loss": 0.2906, + "step": 1354 + }, + { + "epoch": 1.5218306893162992, + "grad_norm": 0.4597487903047916, + "learning_rate": 1.7558663087784195e-05, + "loss": 0.2923, + "step": 1355 + }, + { + "epoch": 1.5229538115962375, + "grad_norm": 0.46125985328422053, + "learning_rate": 1.7553524976670936e-05, + "loss": 0.3157, + "step": 1356 + }, + { + "epoch": 1.524076933876176, + "grad_norm": 0.4627880390771236, + "learning_rate": 1.7548382217790633e-05, + "loss": 0.2844, + "step": 1357 + }, + { + "epoch": 1.5252000561561139, + "grad_norm": 0.48404911382660587, + "learning_rate": 1.7543234814307685e-05, + "loss": 0.2848, + "step": 1358 + }, + { + "epoch": 1.5263231784360523, + "grad_norm": 0.4732923841586989, + "learning_rate": 1.753808276938935e-05, + "loss": 0.3009, + "step": 1359 + }, + { + "epoch": 1.5274463007159904, + "grad_norm": 0.44748155272300266, + "learning_rate": 1.753292608620573e-05, + "loss": 0.2806, + "step": 1360 + }, + { + "epoch": 1.5285694229959286, + "grad_norm": 0.4671484752305363, + "learning_rate": 1.7527764767929794e-05, + "loss": 0.2746, + "step": 1361 + }, + { + "epoch": 1.529692545275867, + "grad_norm": 0.45312930393716716, + "learning_rate": 1.7522598817737356e-05, + "loss": 0.2817, + "step": 1362 + }, + { + "epoch": 1.5308156675558051, + "grad_norm": 0.48439112821445, + "learning_rate": 1.7517428238807085e-05, + "loss": 0.2988, + "step": 1363 + }, + { + "epoch": 1.5319387898357433, + "grad_norm": 0.4258685474786322, + "learning_rate": 1.751225303432049e-05, + "loss": 0.2713, + "step": 1364 + }, + { + "epoch": 1.5330619121156817, + "grad_norm": 0.4404628717066417, + "learning_rate": 1.750707320746194e-05, + "loss": 0.2817, + "step": 1365 + }, + { + "epoch": 1.5341850343956198, + "grad_norm": 0.4477719006580123, + "learning_rate": 1.750188876141863e-05, + "loss": 0.2856, + "step": 1366 + }, + { + "epoch": 1.535308156675558, + "grad_norm": 0.4482867400976784, + "learning_rate": 1.7496699699380612e-05, + "loss": 0.2715, + "step": 1367 + }, + { + "epoch": 1.5364312789554964, + "grad_norm": 0.43210614417809273, + "learning_rate": 1.749150602454077e-05, + "loss": 0.2858, + "step": 1368 + }, + { + "epoch": 1.5375544012354345, + "grad_norm": 0.4851645221119642, + "learning_rate": 1.7486307740094832e-05, + "loss": 0.3126, + "step": 1369 + }, + { + "epoch": 1.5386775235153727, + "grad_norm": 0.43465394038491617, + "learning_rate": 1.7481104849241357e-05, + "loss": 0.2834, + "step": 1370 + }, + { + "epoch": 1.539800645795311, + "grad_norm": 0.4343631643379705, + "learning_rate": 1.7475897355181747e-05, + "loss": 0.29, + "step": 1371 + }, + { + "epoch": 1.5409237680752492, + "grad_norm": 0.43113460770410267, + "learning_rate": 1.747068526112022e-05, + "loss": 0.2712, + "step": 1372 + }, + { + "epoch": 1.5420468903551874, + "grad_norm": 0.47375551490134643, + "learning_rate": 1.7465468570263844e-05, + "loss": 0.2852, + "step": 1373 + }, + { + "epoch": 1.5431700126351258, + "grad_norm": 0.4567963890276265, + "learning_rate": 1.7460247285822504e-05, + "loss": 0.2882, + "step": 1374 + }, + { + "epoch": 1.544293134915064, + "grad_norm": 0.4689696386236284, + "learning_rate": 1.7455021411008906e-05, + "loss": 0.294, + "step": 1375 + }, + { + "epoch": 1.545416257195002, + "grad_norm": 0.4468934735798268, + "learning_rate": 1.7449790949038604e-05, + "loss": 0.2934, + "step": 1376 + }, + { + "epoch": 1.5465393794749405, + "grad_norm": 0.4590404219399577, + "learning_rate": 1.7444555903129943e-05, + "loss": 0.2996, + "step": 1377 + }, + { + "epoch": 1.5476625017548784, + "grad_norm": 0.4320176095607792, + "learning_rate": 1.7439316276504112e-05, + "loss": 0.2958, + "step": 1378 + }, + { + "epoch": 1.5487856240348168, + "grad_norm": 0.4724597686393324, + "learning_rate": 1.7434072072385115e-05, + "loss": 0.299, + "step": 1379 + }, + { + "epoch": 1.5499087463147552, + "grad_norm": 0.4551694704885351, + "learning_rate": 1.7428823293999757e-05, + "loss": 0.2898, + "step": 1380 + }, + { + "epoch": 1.551031868594693, + "grad_norm": 0.4655624233086197, + "learning_rate": 1.7423569944577677e-05, + "loss": 0.3064, + "step": 1381 + }, + { + "epoch": 1.5521549908746315, + "grad_norm": 0.4495621812588542, + "learning_rate": 1.7418312027351322e-05, + "loss": 0.277, + "step": 1382 + }, + { + "epoch": 1.5532781131545697, + "grad_norm": 0.46734262431819734, + "learning_rate": 1.741304954555594e-05, + "loss": 0.2723, + "step": 1383 + }, + { + "epoch": 1.5544012354345078, + "grad_norm": 0.4530212621436398, + "learning_rate": 1.7407782502429594e-05, + "loss": 0.2966, + "step": 1384 + }, + { + "epoch": 1.5555243577144462, + "grad_norm": 0.48906035382170715, + "learning_rate": 1.7402510901213158e-05, + "loss": 0.3166, + "step": 1385 + }, + { + "epoch": 1.5566474799943844, + "grad_norm": 0.4588087448330385, + "learning_rate": 1.73972347451503e-05, + "loss": 0.2911, + "step": 1386 + }, + { + "epoch": 1.5577706022743225, + "grad_norm": 0.45890894077015215, + "learning_rate": 1.7391954037487503e-05, + "loss": 0.2792, + "step": 1387 + }, + { + "epoch": 1.558893724554261, + "grad_norm": 0.43100684891614244, + "learning_rate": 1.738666878147404e-05, + "loss": 0.2705, + "step": 1388 + }, + { + "epoch": 1.560016846834199, + "grad_norm": 0.4533496037726458, + "learning_rate": 1.738137898036199e-05, + "loss": 0.2984, + "step": 1389 + }, + { + "epoch": 1.5611399691141372, + "grad_norm": 0.44398777502713194, + "learning_rate": 1.7376084637406222e-05, + "loss": 0.284, + "step": 1390 + }, + { + "epoch": 1.5622630913940756, + "grad_norm": 0.4398899733954843, + "learning_rate": 1.737078575586441e-05, + "loss": 0.2851, + "step": 1391 + }, + { + "epoch": 1.5633862136740138, + "grad_norm": 0.4743848381771007, + "learning_rate": 1.736548233899701e-05, + "loss": 0.2954, + "step": 1392 + }, + { + "epoch": 1.564509335953952, + "grad_norm": 0.4645458661781724, + "learning_rate": 1.7360174390067274e-05, + "loss": 0.2912, + "step": 1393 + }, + { + "epoch": 1.5656324582338903, + "grad_norm": 0.44409894402600525, + "learning_rate": 1.735486191234124e-05, + "loss": 0.2816, + "step": 1394 + }, + { + "epoch": 1.5667555805138285, + "grad_norm": 0.43988326731406074, + "learning_rate": 1.7349544909087737e-05, + "loss": 0.2777, + "step": 1395 + }, + { + "epoch": 1.5678787027937666, + "grad_norm": 0.461134202797876, + "learning_rate": 1.734422338357837e-05, + "loss": 0.2963, + "step": 1396 + }, + { + "epoch": 1.569001825073705, + "grad_norm": 0.43219608525830144, + "learning_rate": 1.7338897339087536e-05, + "loss": 0.2743, + "step": 1397 + }, + { + "epoch": 1.5701249473536432, + "grad_norm": 0.4522334985486791, + "learning_rate": 1.733356677889241e-05, + "loss": 0.293, + "step": 1398 + }, + { + "epoch": 1.5712480696335813, + "grad_norm": 0.4697839824886229, + "learning_rate": 1.732823170627294e-05, + "loss": 0.3225, + "step": 1399 + }, + { + "epoch": 1.5723711919135197, + "grad_norm": 0.4379752450384441, + "learning_rate": 1.7322892124511862e-05, + "loss": 0.2934, + "step": 1400 + }, + { + "epoch": 1.5734943141934579, + "grad_norm": 0.4403768513512486, + "learning_rate": 1.731754803689467e-05, + "loss": 0.2808, + "step": 1401 + }, + { + "epoch": 1.574617436473396, + "grad_norm": 0.4526960513869645, + "learning_rate": 1.731219944670965e-05, + "loss": 0.3075, + "step": 1402 + }, + { + "epoch": 1.5757405587533344, + "grad_norm": 0.4391813185175165, + "learning_rate": 1.730684635724784e-05, + "loss": 0.2818, + "step": 1403 + }, + { + "epoch": 1.5768636810332723, + "grad_norm": 0.45969828645408684, + "learning_rate": 1.7301488771803056e-05, + "loss": 0.3036, + "step": 1404 + }, + { + "epoch": 1.5779868033132107, + "grad_norm": 0.43457708703765435, + "learning_rate": 1.7296126693671886e-05, + "loss": 0.2746, + "step": 1405 + }, + { + "epoch": 1.579109925593149, + "grad_norm": 0.4331075300392934, + "learning_rate": 1.7290760126153666e-05, + "loss": 0.2821, + "step": 1406 + }, + { + "epoch": 1.580233047873087, + "grad_norm": 0.4208492939766656, + "learning_rate": 1.7285389072550515e-05, + "loss": 0.2795, + "step": 1407 + }, + { + "epoch": 1.5813561701530254, + "grad_norm": 0.4634956044262763, + "learning_rate": 1.728001353616729e-05, + "loss": 0.3025, + "step": 1408 + }, + { + "epoch": 1.5824792924329636, + "grad_norm": 0.46128626849790916, + "learning_rate": 1.727463352031163e-05, + "loss": 0.302, + "step": 1409 + }, + { + "epoch": 1.5836024147129018, + "grad_norm": 0.4275363771455048, + "learning_rate": 1.7269249028293907e-05, + "loss": 0.2809, + "step": 1410 + }, + { + "epoch": 1.5847255369928401, + "grad_norm": 0.4508186393663234, + "learning_rate": 1.7263860063427263e-05, + "loss": 0.2928, + "step": 1411 + }, + { + "epoch": 1.5858486592727783, + "grad_norm": 0.4641518740610698, + "learning_rate": 1.7258466629027586e-05, + "loss": 0.2966, + "step": 1412 + }, + { + "epoch": 1.5869717815527165, + "grad_norm": 0.4350653148056457, + "learning_rate": 1.7253068728413517e-05, + "loss": 0.2871, + "step": 1413 + }, + { + "epoch": 1.5880949038326548, + "grad_norm": 0.4574921332811425, + "learning_rate": 1.7247666364906443e-05, + "loss": 0.305, + "step": 1414 + }, + { + "epoch": 1.589218026112593, + "grad_norm": 0.4423661730830507, + "learning_rate": 1.7242259541830497e-05, + "loss": 0.2835, + "step": 1415 + }, + { + "epoch": 1.5903411483925312, + "grad_norm": 0.4415125572569024, + "learning_rate": 1.723684826251256e-05, + "loss": 0.287, + "step": 1416 + }, + { + "epoch": 1.5914642706724695, + "grad_norm": 0.45878372415584956, + "learning_rate": 1.7231432530282246e-05, + "loss": 0.291, + "step": 1417 + }, + { + "epoch": 1.5925873929524077, + "grad_norm": 0.4536230530206705, + "learning_rate": 1.722601234847192e-05, + "loss": 0.2676, + "step": 1418 + }, + { + "epoch": 1.5937105152323459, + "grad_norm": 0.42510000179340557, + "learning_rate": 1.7220587720416677e-05, + "loss": 0.2625, + "step": 1419 + }, + { + "epoch": 1.5948336375122842, + "grad_norm": 0.4617059875851874, + "learning_rate": 1.721515864945435e-05, + "loss": 0.3081, + "step": 1420 + }, + { + "epoch": 1.5959567597922224, + "grad_norm": 0.4325597054894661, + "learning_rate": 1.7209725138925506e-05, + "loss": 0.2713, + "step": 1421 + }, + { + "epoch": 1.5970798820721606, + "grad_norm": 0.43609135158582557, + "learning_rate": 1.7204287192173444e-05, + "loss": 0.2722, + "step": 1422 + }, + { + "epoch": 1.598203004352099, + "grad_norm": 0.43997208529177845, + "learning_rate": 1.7198844812544194e-05, + "loss": 0.2737, + "step": 1423 + }, + { + "epoch": 1.599326126632037, + "grad_norm": 0.45811922865343463, + "learning_rate": 1.7193398003386514e-05, + "loss": 0.3069, + "step": 1424 + }, + { + "epoch": 1.6004492489119753, + "grad_norm": 0.441765765575073, + "learning_rate": 1.718794676805188e-05, + "loss": 0.2878, + "step": 1425 + }, + { + "epoch": 1.6015723711919136, + "grad_norm": 0.44938030928999945, + "learning_rate": 1.71824911098945e-05, + "loss": 0.2803, + "step": 1426 + }, + { + "epoch": 1.6026954934718516, + "grad_norm": 0.4711803532185502, + "learning_rate": 1.7177031032271298e-05, + "loss": 0.2898, + "step": 1427 + }, + { + "epoch": 1.60381861575179, + "grad_norm": 0.48742752034450476, + "learning_rate": 1.7171566538541925e-05, + "loss": 0.2931, + "step": 1428 + }, + { + "epoch": 1.6049417380317283, + "grad_norm": 0.455468095932038, + "learning_rate": 1.7166097632068745e-05, + "loss": 0.3055, + "step": 1429 + }, + { + "epoch": 1.6060648603116663, + "grad_norm": 0.4372160767962161, + "learning_rate": 1.7160624316216825e-05, + "loss": 0.2829, + "step": 1430 + }, + { + "epoch": 1.6071879825916047, + "grad_norm": 0.47650809154388873, + "learning_rate": 1.715514659435397e-05, + "loss": 0.283, + "step": 1431 + }, + { + "epoch": 1.608311104871543, + "grad_norm": 0.430482793568945, + "learning_rate": 1.7149664469850674e-05, + "loss": 0.277, + "step": 1432 + }, + { + "epoch": 1.609434227151481, + "grad_norm": 0.4516995855142114, + "learning_rate": 1.714417794608015e-05, + "loss": 0.2934, + "step": 1433 + }, + { + "epoch": 1.6105573494314194, + "grad_norm": 0.4110806797442354, + "learning_rate": 1.713868702641832e-05, + "loss": 0.2593, + "step": 1434 + }, + { + "epoch": 1.6116804717113575, + "grad_norm": 0.4616915231229448, + "learning_rate": 1.7133191714243805e-05, + "loss": 0.284, + "step": 1435 + }, + { + "epoch": 1.6128035939912957, + "grad_norm": 0.41123495094639895, + "learning_rate": 1.712769201293793e-05, + "loss": 0.258, + "step": 1436 + }, + { + "epoch": 1.613926716271234, + "grad_norm": 0.4770233615190129, + "learning_rate": 1.7122187925884723e-05, + "loss": 0.3075, + "step": 1437 + }, + { + "epoch": 1.6150498385511722, + "grad_norm": 0.45932911711426877, + "learning_rate": 1.7116679456470908e-05, + "loss": 0.3073, + "step": 1438 + }, + { + "epoch": 1.6161729608311104, + "grad_norm": 0.43310100468943585, + "learning_rate": 1.711116660808591e-05, + "loss": 0.2803, + "step": 1439 + }, + { + "epoch": 1.6172960831110488, + "grad_norm": 0.4053611622253437, + "learning_rate": 1.710564938412184e-05, + "loss": 0.2552, + "step": 1440 + }, + { + "epoch": 1.618419205390987, + "grad_norm": 0.4776776817414243, + "learning_rate": 1.710012778797351e-05, + "loss": 0.3047, + "step": 1441 + }, + { + "epoch": 1.619542327670925, + "grad_norm": 0.46553831799548406, + "learning_rate": 1.7094601823038425e-05, + "loss": 0.2946, + "step": 1442 + }, + { + "epoch": 1.6206654499508635, + "grad_norm": 0.466347563259381, + "learning_rate": 1.7089071492716758e-05, + "loss": 0.2886, + "step": 1443 + }, + { + "epoch": 1.6217885722308016, + "grad_norm": 0.44584944747972316, + "learning_rate": 1.7083536800411392e-05, + "loss": 0.2731, + "step": 1444 + }, + { + "epoch": 1.6229116945107398, + "grad_norm": 0.49761550260791193, + "learning_rate": 1.7077997749527884e-05, + "loss": 0.3217, + "step": 1445 + }, + { + "epoch": 1.6240348167906782, + "grad_norm": 0.42865534739277844, + "learning_rate": 1.707245434347447e-05, + "loss": 0.2829, + "step": 1446 + }, + { + "epoch": 1.6251579390706163, + "grad_norm": 0.4159767822652411, + "learning_rate": 1.706690658566207e-05, + "loss": 0.2645, + "step": 1447 + }, + { + "epoch": 1.6262810613505545, + "grad_norm": 0.4766099212680193, + "learning_rate": 1.7061354479504277e-05, + "loss": 0.2893, + "step": 1448 + }, + { + "epoch": 1.6274041836304929, + "grad_norm": 0.4532911553458812, + "learning_rate": 1.705579802841737e-05, + "loss": 0.2789, + "step": 1449 + }, + { + "epoch": 1.628527305910431, + "grad_norm": 0.45741701789824535, + "learning_rate": 1.7050237235820287e-05, + "loss": 0.289, + "step": 1450 + }, + { + "epoch": 1.6296504281903692, + "grad_norm": 0.4846189139759324, + "learning_rate": 1.704467210513465e-05, + "loss": 0.3282, + "step": 1451 + }, + { + "epoch": 1.6307735504703076, + "grad_norm": 0.4261188062643428, + "learning_rate": 1.7039102639784747e-05, + "loss": 0.2712, + "step": 1452 + }, + { + "epoch": 1.6318966727502455, + "grad_norm": 0.4409212737827275, + "learning_rate": 1.7033528843197523e-05, + "loss": 0.2847, + "step": 1453 + }, + { + "epoch": 1.633019795030184, + "grad_norm": 0.4458915183695067, + "learning_rate": 1.7027950718802605e-05, + "loss": 0.2778, + "step": 1454 + }, + { + "epoch": 1.6341429173101223, + "grad_norm": 0.47098011281228286, + "learning_rate": 1.7022368270032268e-05, + "loss": 0.2913, + "step": 1455 + }, + { + "epoch": 1.6352660395900602, + "grad_norm": 0.4436299643362799, + "learning_rate": 1.7016781500321458e-05, + "loss": 0.2686, + "step": 1456 + }, + { + "epoch": 1.6363891618699986, + "grad_norm": 0.508079183867471, + "learning_rate": 1.7011190413107774e-05, + "loss": 0.3568, + "step": 1457 + }, + { + "epoch": 1.6375122841499368, + "grad_norm": 0.4447742608378096, + "learning_rate": 1.7005595011831473e-05, + "loss": 0.3054, + "step": 1458 + }, + { + "epoch": 1.638635406429875, + "grad_norm": 0.45902598871228406, + "learning_rate": 1.699999529993547e-05, + "loss": 0.2874, + "step": 1459 + }, + { + "epoch": 1.6397585287098133, + "grad_norm": 0.43790349463956285, + "learning_rate": 1.6994391280865327e-05, + "loss": 0.2753, + "step": 1460 + }, + { + "epoch": 1.6408816509897515, + "grad_norm": 0.43987143870781154, + "learning_rate": 1.698878295806926e-05, + "loss": 0.2812, + "step": 1461 + }, + { + "epoch": 1.6420047732696896, + "grad_norm": 0.45636973748880527, + "learning_rate": 1.698317033499813e-05, + "loss": 0.294, + "step": 1462 + }, + { + "epoch": 1.643127895549628, + "grad_norm": 0.4473056852255817, + "learning_rate": 1.6977553415105446e-05, + "loss": 0.2922, + "step": 1463 + }, + { + "epoch": 1.6442510178295662, + "grad_norm": 0.44711605854202396, + "learning_rate": 1.6971932201847362e-05, + "loss": 0.3037, + "step": 1464 + }, + { + "epoch": 1.6453741401095043, + "grad_norm": 0.47322347350994115, + "learning_rate": 1.6966306698682672e-05, + "loss": 0.3091, + "step": 1465 + }, + { + "epoch": 1.6464972623894427, + "grad_norm": 0.47589867797246205, + "learning_rate": 1.6960676909072808e-05, + "loss": 0.3044, + "step": 1466 + }, + { + "epoch": 1.6476203846693809, + "grad_norm": 0.45462083845828744, + "learning_rate": 1.6955042836481842e-05, + "loss": 0.2901, + "step": 1467 + }, + { + "epoch": 1.648743506949319, + "grad_norm": 0.43342182003648877, + "learning_rate": 1.6949404484376484e-05, + "loss": 0.2826, + "step": 1468 + }, + { + "epoch": 1.6498666292292574, + "grad_norm": 0.4420101471539028, + "learning_rate": 1.6943761856226072e-05, + "loss": 0.2823, + "step": 1469 + }, + { + "epoch": 1.6509897515091956, + "grad_norm": 0.45834897544206743, + "learning_rate": 1.693811495550258e-05, + "loss": 0.2998, + "step": 1470 + }, + { + "epoch": 1.6521128737891337, + "grad_norm": 0.4376763092414877, + "learning_rate": 1.69324637856806e-05, + "loss": 0.2906, + "step": 1471 + }, + { + "epoch": 1.6532359960690721, + "grad_norm": 0.4546259417086349, + "learning_rate": 1.6926808350237367e-05, + "loss": 0.293, + "step": 1472 + }, + { + "epoch": 1.6543591183490103, + "grad_norm": 0.4372010366201924, + "learning_rate": 1.692114865265273e-05, + "loss": 0.2712, + "step": 1473 + }, + { + "epoch": 1.6554822406289484, + "grad_norm": 0.4778527561849003, + "learning_rate": 1.691548469640916e-05, + "loss": 0.3141, + "step": 1474 + }, + { + "epoch": 1.6566053629088868, + "grad_norm": 0.4373422181074543, + "learning_rate": 1.690981648499176e-05, + "loss": 0.2776, + "step": 1475 + }, + { + "epoch": 1.657728485188825, + "grad_norm": 0.45910849316443425, + "learning_rate": 1.6904144021888236e-05, + "loss": 0.2914, + "step": 1476 + }, + { + "epoch": 1.6588516074687631, + "grad_norm": 0.4377368116005779, + "learning_rate": 1.6898467310588917e-05, + "loss": 0.2832, + "step": 1477 + }, + { + "epoch": 1.6599747297487015, + "grad_norm": 0.4343991004179873, + "learning_rate": 1.689278635458675e-05, + "loss": 0.2697, + "step": 1478 + }, + { + "epoch": 1.6610978520286395, + "grad_norm": 0.4584326718106985, + "learning_rate": 1.6887101157377284e-05, + "loss": 0.3061, + "step": 1479 + }, + { + "epoch": 1.6622209743085778, + "grad_norm": 0.48367472603690564, + "learning_rate": 1.6881411722458688e-05, + "loss": 0.2898, + "step": 1480 + }, + { + "epoch": 1.6633440965885162, + "grad_norm": 0.4259604088571098, + "learning_rate": 1.6875718053331736e-05, + "loss": 0.2642, + "step": 1481 + }, + { + "epoch": 1.6644672188684542, + "grad_norm": 0.4427453550152015, + "learning_rate": 1.6870020153499796e-05, + "loss": 0.2742, + "step": 1482 + }, + { + "epoch": 1.6655903411483926, + "grad_norm": 0.4640041414969516, + "learning_rate": 1.686431802646886e-05, + "loss": 0.2913, + "step": 1483 + }, + { + "epoch": 1.6667134634283307, + "grad_norm": 0.46659553606141885, + "learning_rate": 1.68586116757475e-05, + "loss": 0.2979, + "step": 1484 + }, + { + "epoch": 1.6678365857082689, + "grad_norm": 0.44118444234300963, + "learning_rate": 1.6852901104846902e-05, + "loss": 0.2798, + "step": 1485 + }, + { + "epoch": 1.6689597079882073, + "grad_norm": 0.4232934978922537, + "learning_rate": 1.6847186317280844e-05, + "loss": 0.2664, + "step": 1486 + }, + { + "epoch": 1.6700828302681454, + "grad_norm": 0.4549678955070991, + "learning_rate": 1.684146731656569e-05, + "loss": 0.2691, + "step": 1487 + }, + { + "epoch": 1.6712059525480836, + "grad_norm": 0.4462793748347985, + "learning_rate": 1.683574410622041e-05, + "loss": 0.2756, + "step": 1488 + }, + { + "epoch": 1.672329074828022, + "grad_norm": 0.46943909006116125, + "learning_rate": 1.683001668976656e-05, + "loss": 0.3007, + "step": 1489 + }, + { + "epoch": 1.6734521971079601, + "grad_norm": 0.47419859082322885, + "learning_rate": 1.6824285070728278e-05, + "loss": 0.2873, + "step": 1490 + }, + { + "epoch": 1.6745753193878983, + "grad_norm": 0.47147067796892533, + "learning_rate": 1.6818549252632295e-05, + "loss": 0.2839, + "step": 1491 + }, + { + "epoch": 1.6756984416678367, + "grad_norm": 0.456348027524806, + "learning_rate": 1.6812809239007924e-05, + "loss": 0.3, + "step": 1492 + }, + { + "epoch": 1.6768215639477748, + "grad_norm": 0.40777119029954567, + "learning_rate": 1.6807065033387052e-05, + "loss": 0.2596, + "step": 1493 + }, + { + "epoch": 1.677944686227713, + "grad_norm": 0.40000140473687745, + "learning_rate": 1.6801316639304163e-05, + "loss": 0.2658, + "step": 1494 + }, + { + "epoch": 1.6790678085076514, + "grad_norm": 0.45210384075210974, + "learning_rate": 1.6795564060296295e-05, + "loss": 0.2966, + "step": 1495 + }, + { + "epoch": 1.6801909307875895, + "grad_norm": 0.40822664899597394, + "learning_rate": 1.678980729990308e-05, + "loss": 0.2535, + "step": 1496 + }, + { + "epoch": 1.6813140530675277, + "grad_norm": 0.4783878749550028, + "learning_rate": 1.6784046361666714e-05, + "loss": 0.3179, + "step": 1497 + }, + { + "epoch": 1.682437175347466, + "grad_norm": 0.4205086583335779, + "learning_rate": 1.6778281249131973e-05, + "loss": 0.2624, + "step": 1498 + }, + { + "epoch": 1.6835602976274042, + "grad_norm": 0.4448670745068746, + "learning_rate": 1.677251196584618e-05, + "loss": 0.2726, + "step": 1499 + }, + { + "epoch": 1.6846834199073424, + "grad_norm": 0.4612087079191544, + "learning_rate": 1.676673851535925e-05, + "loss": 0.2863, + "step": 1500 + }, + { + "epoch": 1.6858065421872808, + "grad_norm": 0.44897350104308953, + "learning_rate": 1.6760960901223647e-05, + "loss": 0.2777, + "step": 1501 + }, + { + "epoch": 1.6869296644672187, + "grad_norm": 0.4650716938100982, + "learning_rate": 1.6755179126994397e-05, + "loss": 0.3114, + "step": 1502 + }, + { + "epoch": 1.688052786747157, + "grad_norm": 0.4408040310278629, + "learning_rate": 1.6749393196229097e-05, + "loss": 0.2802, + "step": 1503 + }, + { + "epoch": 1.6891759090270955, + "grad_norm": 0.4323634729178194, + "learning_rate": 1.6743603112487888e-05, + "loss": 0.272, + "step": 1504 + }, + { + "epoch": 1.6902990313070334, + "grad_norm": 0.4532517237446688, + "learning_rate": 1.6737808879333477e-05, + "loss": 0.2994, + "step": 1505 + }, + { + "epoch": 1.6914221535869718, + "grad_norm": 0.41707219861308187, + "learning_rate": 1.6732010500331112e-05, + "loss": 0.2786, + "step": 1506 + }, + { + "epoch": 1.6925452758669102, + "grad_norm": 0.4790732992448698, + "learning_rate": 1.6726207979048604e-05, + "loss": 0.3217, + "step": 1507 + }, + { + "epoch": 1.693668398146848, + "grad_norm": 0.4426207341478685, + "learning_rate": 1.672040131905631e-05, + "loss": 0.2957, + "step": 1508 + }, + { + "epoch": 1.6947915204267865, + "grad_norm": 0.4491550141407061, + "learning_rate": 1.6714590523927127e-05, + "loss": 0.2963, + "step": 1509 + }, + { + "epoch": 1.6959146427067247, + "grad_norm": 0.44425919435030004, + "learning_rate": 1.6708775597236507e-05, + "loss": 0.2888, + "step": 1510 + }, + { + "epoch": 1.6970377649866628, + "grad_norm": 0.4428624792544438, + "learning_rate": 1.6702956542562433e-05, + "loss": 0.2919, + "step": 1511 + }, + { + "epoch": 1.6981608872666012, + "grad_norm": 0.4645304954497035, + "learning_rate": 1.669713336348544e-05, + "loss": 0.3044, + "step": 1512 + }, + { + "epoch": 1.6992840095465394, + "grad_norm": 0.442526153745366, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.2733, + "step": 1513 + }, + { + "epoch": 1.7004071318264775, + "grad_norm": 0.44682174416662607, + "learning_rate": 1.6685474646457477e-05, + "loss": 0.288, + "step": 1514 + }, + { + "epoch": 1.701530254106416, + "grad_norm": 0.4460477682537591, + "learning_rate": 1.6679639115680247e-05, + "loss": 0.2892, + "step": 1515 + }, + { + "epoch": 1.702653376386354, + "grad_norm": 0.42381417287482237, + "learning_rate": 1.667379947484756e-05, + "loss": 0.2753, + "step": 1516 + }, + { + "epoch": 1.7037764986662922, + "grad_norm": 0.45442357297793295, + "learning_rate": 1.666795572755262e-05, + "loss": 0.2963, + "step": 1517 + }, + { + "epoch": 1.7048996209462306, + "grad_norm": 0.4467561695261649, + "learning_rate": 1.666210787739114e-05, + "loss": 0.294, + "step": 1518 + }, + { + "epoch": 1.7060227432261688, + "grad_norm": 0.4514819485972295, + "learning_rate": 1.665625592796137e-05, + "loss": 0.2916, + "step": 1519 + }, + { + "epoch": 1.707145865506107, + "grad_norm": 0.47046147688947726, + "learning_rate": 1.665039988286408e-05, + "loss": 0.3059, + "step": 1520 + }, + { + "epoch": 1.7082689877860453, + "grad_norm": 0.4664118838351173, + "learning_rate": 1.6644539745702558e-05, + "loss": 0.2977, + "step": 1521 + }, + { + "epoch": 1.7093921100659835, + "grad_norm": 0.4388011550954156, + "learning_rate": 1.6638675520082613e-05, + "loss": 0.2998, + "step": 1522 + }, + { + "epoch": 1.7105152323459216, + "grad_norm": 0.45082022297345586, + "learning_rate": 1.663280720961256e-05, + "loss": 0.2966, + "step": 1523 + }, + { + "epoch": 1.71163835462586, + "grad_norm": 0.4497822398613249, + "learning_rate": 1.662693481790324e-05, + "loss": 0.2827, + "step": 1524 + }, + { + "epoch": 1.7127614769057982, + "grad_norm": 0.46875113264717555, + "learning_rate": 1.6621058348568008e-05, + "loss": 0.3053, + "step": 1525 + }, + { + "epoch": 1.7138845991857363, + "grad_norm": 0.4547313413961331, + "learning_rate": 1.6615177805222703e-05, + "loss": 0.2875, + "step": 1526 + }, + { + "epoch": 1.7150077214656747, + "grad_norm": 0.44523115177994116, + "learning_rate": 1.6609293191485704e-05, + "loss": 0.2924, + "step": 1527 + }, + { + "epoch": 1.7161308437456126, + "grad_norm": 0.4348408552400898, + "learning_rate": 1.660340451097787e-05, + "loss": 0.2842, + "step": 1528 + }, + { + "epoch": 1.717253966025551, + "grad_norm": 0.4274690190508088, + "learning_rate": 1.6597511767322575e-05, + "loss": 0.2754, + "step": 1529 + }, + { + "epoch": 1.7183770883054894, + "grad_norm": 0.45654747366119725, + "learning_rate": 1.6591614964145685e-05, + "loss": 0.3052, + "step": 1530 + }, + { + "epoch": 1.7195002105854273, + "grad_norm": 0.4633059182553845, + "learning_rate": 1.6585714105075573e-05, + "loss": 0.3026, + "step": 1531 + }, + { + "epoch": 1.7206233328653657, + "grad_norm": 0.45324442029714246, + "learning_rate": 1.6579809193743094e-05, + "loss": 0.2861, + "step": 1532 + }, + { + "epoch": 1.7217464551453041, + "grad_norm": 0.4599466934645262, + "learning_rate": 1.6573900233781616e-05, + "loss": 0.2858, + "step": 1533 + }, + { + "epoch": 1.722869577425242, + "grad_norm": 0.45346376009896494, + "learning_rate": 1.656798722882698e-05, + "loss": 0.2902, + "step": 1534 + }, + { + "epoch": 1.7239926997051804, + "grad_norm": 0.4406609173508741, + "learning_rate": 1.6562070182517524e-05, + "loss": 0.2978, + "step": 1535 + }, + { + "epoch": 1.7251158219851186, + "grad_norm": 0.46672133569112967, + "learning_rate": 1.6556149098494075e-05, + "loss": 0.2761, + "step": 1536 + }, + { + "epoch": 1.7262389442650568, + "grad_norm": 0.48979063606476814, + "learning_rate": 1.6550223980399934e-05, + "loss": 0.2984, + "step": 1537 + }, + { + "epoch": 1.7273620665449951, + "grad_norm": 0.4511837735588295, + "learning_rate": 1.65442948318809e-05, + "loss": 0.285, + "step": 1538 + }, + { + "epoch": 1.7284851888249333, + "grad_norm": 0.45689758535181885, + "learning_rate": 1.6538361656585237e-05, + "loss": 0.3051, + "step": 1539 + }, + { + "epoch": 1.7296083111048715, + "grad_norm": 0.42911681373204696, + "learning_rate": 1.6532424458163692e-05, + "loss": 0.2855, + "step": 1540 + }, + { + "epoch": 1.7307314333848098, + "grad_norm": 0.4599079422199036, + "learning_rate": 1.6526483240269497e-05, + "loss": 0.2929, + "step": 1541 + }, + { + "epoch": 1.731854555664748, + "grad_norm": 0.4304194735001978, + "learning_rate": 1.6520538006558345e-05, + "loss": 0.2708, + "step": 1542 + }, + { + "epoch": 1.7329776779446862, + "grad_norm": 0.44178440467725394, + "learning_rate": 1.6514588760688397e-05, + "loss": 0.3007, + "step": 1543 + }, + { + "epoch": 1.7341008002246245, + "grad_norm": 0.4423330619472906, + "learning_rate": 1.65086355063203e-05, + "loss": 0.2845, + "step": 1544 + }, + { + "epoch": 1.7352239225045627, + "grad_norm": 0.433601638401388, + "learning_rate": 1.6502678247117146e-05, + "loss": 0.2796, + "step": 1545 + }, + { + "epoch": 1.7363470447845009, + "grad_norm": 0.44925607846317117, + "learning_rate": 1.649671698674451e-05, + "loss": 0.2786, + "step": 1546 + }, + { + "epoch": 1.7374701670644392, + "grad_norm": 0.4146082239786263, + "learning_rate": 1.6490751728870422e-05, + "loss": 0.2596, + "step": 1547 + }, + { + "epoch": 1.7385932893443774, + "grad_norm": 0.4338894018059736, + "learning_rate": 1.6484782477165365e-05, + "loss": 0.276, + "step": 1548 + }, + { + "epoch": 1.7397164116243156, + "grad_norm": 0.4650911120148975, + "learning_rate": 1.6478809235302287e-05, + "loss": 0.2873, + "step": 1549 + }, + { + "epoch": 1.740839533904254, + "grad_norm": 0.42522285905199153, + "learning_rate": 1.647283200695659e-05, + "loss": 0.2689, + "step": 1550 + }, + { + "epoch": 1.741962656184192, + "grad_norm": 0.44208521414529733, + "learning_rate": 1.6466850795806136e-05, + "loss": 0.3013, + "step": 1551 + }, + { + "epoch": 1.7430857784641303, + "grad_norm": 0.4517383587426911, + "learning_rate": 1.6460865605531214e-05, + "loss": 0.2905, + "step": 1552 + }, + { + "epoch": 1.7442089007440686, + "grad_norm": 0.42667270058823686, + "learning_rate": 1.6454876439814592e-05, + "loss": 0.2778, + "step": 1553 + }, + { + "epoch": 1.7453320230240066, + "grad_norm": 0.4596306665470625, + "learning_rate": 1.644888330234146e-05, + "loss": 0.2989, + "step": 1554 + }, + { + "epoch": 1.746455145303945, + "grad_norm": 0.43733172758764655, + "learning_rate": 1.6442886196799465e-05, + "loss": 0.2934, + "step": 1555 + }, + { + "epoch": 1.7475782675838833, + "grad_norm": 0.44887986338826835, + "learning_rate": 1.6436885126878696e-05, + "loss": 0.3157, + "step": 1556 + }, + { + "epoch": 1.7487013898638213, + "grad_norm": 0.451897402012675, + "learning_rate": 1.6430880096271672e-05, + "loss": 0.2903, + "step": 1557 + }, + { + "epoch": 1.7498245121437597, + "grad_norm": 0.41101933804424956, + "learning_rate": 1.6424871108673355e-05, + "loss": 0.2747, + "step": 1558 + }, + { + "epoch": 1.7509476344236978, + "grad_norm": 0.49671129631427824, + "learning_rate": 1.6418858167781145e-05, + "loss": 0.3052, + "step": 1559 + }, + { + "epoch": 1.752070756703636, + "grad_norm": 0.4181368637873254, + "learning_rate": 1.6412841277294865e-05, + "loss": 0.2834, + "step": 1560 + }, + { + "epoch": 1.7531938789835744, + "grad_norm": 0.4215454138779954, + "learning_rate": 1.6406820440916778e-05, + "loss": 0.267, + "step": 1561 + }, + { + "epoch": 1.7543170012635125, + "grad_norm": 0.4712786813972758, + "learning_rate": 1.6400795662351572e-05, + "loss": 0.3015, + "step": 1562 + }, + { + "epoch": 1.7554401235434507, + "grad_norm": 0.456622799753585, + "learning_rate": 1.639476694530635e-05, + "loss": 0.2769, + "step": 1563 + }, + { + "epoch": 1.756563245823389, + "grad_norm": 0.4518709261822103, + "learning_rate": 1.6388734293490666e-05, + "loss": 0.2828, + "step": 1564 + }, + { + "epoch": 1.7576863681033272, + "grad_norm": 0.4190624104153518, + "learning_rate": 1.6382697710616458e-05, + "loss": 0.2702, + "step": 1565 + }, + { + "epoch": 1.7588094903832654, + "grad_norm": 0.49660564296640625, + "learning_rate": 1.6376657200398117e-05, + "loss": 0.3019, + "step": 1566 + }, + { + "epoch": 1.7599326126632038, + "grad_norm": 0.43644278562274014, + "learning_rate": 1.6370612766552422e-05, + "loss": 0.2656, + "step": 1567 + }, + { + "epoch": 1.761055734943142, + "grad_norm": 0.45767759479301445, + "learning_rate": 1.636456441279859e-05, + "loss": 0.2947, + "step": 1568 + }, + { + "epoch": 1.76217885722308, + "grad_norm": 0.4594875195235337, + "learning_rate": 1.6358512142858234e-05, + "loss": 0.2993, + "step": 1569 + }, + { + "epoch": 1.7633019795030185, + "grad_norm": 0.48736759713026184, + "learning_rate": 1.6352455960455385e-05, + "loss": 0.289, + "step": 1570 + }, + { + "epoch": 1.7644251017829566, + "grad_norm": 0.4544717098398387, + "learning_rate": 1.634639586931648e-05, + "loss": 0.2781, + "step": 1571 + }, + { + "epoch": 1.7655482240628948, + "grad_norm": 0.4587177062509317, + "learning_rate": 1.6340331873170356e-05, + "loss": 0.2834, + "step": 1572 + }, + { + "epoch": 1.7666713463428332, + "grad_norm": 0.48310847080548047, + "learning_rate": 1.6334263975748263e-05, + "loss": 0.2921, + "step": 1573 + }, + { + "epoch": 1.7677944686227713, + "grad_norm": 0.46119532443782013, + "learning_rate": 1.632819218078383e-05, + "loss": 0.29, + "step": 1574 + }, + { + "epoch": 1.7689175909027095, + "grad_norm": 0.46068898687141085, + "learning_rate": 1.6322116492013116e-05, + "loss": 0.299, + "step": 1575 + }, + { + "epoch": 1.7700407131826479, + "grad_norm": 0.4439847667371739, + "learning_rate": 1.6316036913174555e-05, + "loss": 0.2855, + "step": 1576 + }, + { + "epoch": 1.7711638354625858, + "grad_norm": 0.4441579095125552, + "learning_rate": 1.630995344800897e-05, + "loss": 0.3003, + "step": 1577 + }, + { + "epoch": 1.7722869577425242, + "grad_norm": 0.4527313840660522, + "learning_rate": 1.6303866100259595e-05, + "loss": 0.2885, + "step": 1578 + }, + { + "epoch": 1.7734100800224626, + "grad_norm": 0.4295498490008525, + "learning_rate": 1.6297774873672036e-05, + "loss": 0.2913, + "step": 1579 + }, + { + "epoch": 1.7745332023024005, + "grad_norm": 0.4506045912150568, + "learning_rate": 1.6291679771994293e-05, + "loss": 0.2919, + "step": 1580 + }, + { + "epoch": 1.775656324582339, + "grad_norm": 0.43120215422811353, + "learning_rate": 1.6285580798976754e-05, + "loss": 0.288, + "step": 1581 + }, + { + "epoch": 1.7767794468622773, + "grad_norm": 0.45220082498818925, + "learning_rate": 1.6279477958372175e-05, + "loss": 0.2902, + "step": 1582 + }, + { + "epoch": 1.7779025691422152, + "grad_norm": 0.4292832161883602, + "learning_rate": 1.6273371253935707e-05, + "loss": 0.2807, + "step": 1583 + }, + { + "epoch": 1.7790256914221536, + "grad_norm": 0.43823143313617324, + "learning_rate": 1.626726068942487e-05, + "loss": 0.2899, + "step": 1584 + }, + { + "epoch": 1.7801488137020918, + "grad_norm": 0.43017131250609614, + "learning_rate": 1.6261146268599564e-05, + "loss": 0.2717, + "step": 1585 + }, + { + "epoch": 1.78127193598203, + "grad_norm": 0.45356581546068614, + "learning_rate": 1.6255027995222056e-05, + "loss": 0.2887, + "step": 1586 + }, + { + "epoch": 1.7823950582619683, + "grad_norm": 0.44928986844226365, + "learning_rate": 1.624890587305699e-05, + "loss": 0.2861, + "step": 1587 + }, + { + "epoch": 1.7835181805419065, + "grad_norm": 0.4524148407584021, + "learning_rate": 1.6242779905871375e-05, + "loss": 0.2751, + "step": 1588 + }, + { + "epoch": 1.7846413028218446, + "grad_norm": 0.43075367830451183, + "learning_rate": 1.6236650097434586e-05, + "loss": 0.2807, + "step": 1589 + }, + { + "epoch": 1.785764425101783, + "grad_norm": 0.46285463495616647, + "learning_rate": 1.623051645151836e-05, + "loss": 0.3121, + "step": 1590 + }, + { + "epoch": 1.7868875473817212, + "grad_norm": 0.43530135150806504, + "learning_rate": 1.6224378971896798e-05, + "loss": 0.2819, + "step": 1591 + }, + { + "epoch": 1.7880106696616593, + "grad_norm": 0.490996011158021, + "learning_rate": 1.6218237662346356e-05, + "loss": 0.2893, + "step": 1592 + }, + { + "epoch": 1.7891337919415977, + "grad_norm": 0.4647817119466746, + "learning_rate": 1.6212092526645854e-05, + "loss": 0.2843, + "step": 1593 + }, + { + "epoch": 1.7902569142215359, + "grad_norm": 0.4413295898097072, + "learning_rate": 1.6205943568576457e-05, + "loss": 0.2799, + "step": 1594 + }, + { + "epoch": 1.791380036501474, + "grad_norm": 0.43007143575326345, + "learning_rate": 1.6199790791921693e-05, + "loss": 0.2593, + "step": 1595 + }, + { + "epoch": 1.7925031587814124, + "grad_norm": 0.4881130643190996, + "learning_rate": 1.6193634200467426e-05, + "loss": 0.2961, + "step": 1596 + }, + { + "epoch": 1.7936262810613506, + "grad_norm": 0.4528464937764476, + "learning_rate": 1.618747379800188e-05, + "loss": 0.2854, + "step": 1597 + }, + { + "epoch": 1.7947494033412887, + "grad_norm": 0.45653808613442953, + "learning_rate": 1.6181309588315616e-05, + "loss": 0.2919, + "step": 1598 + }, + { + "epoch": 1.7958725256212271, + "grad_norm": 0.4327325119499413, + "learning_rate": 1.6175141575201537e-05, + "loss": 0.2709, + "step": 1599 + }, + { + "epoch": 1.7969956479011653, + "grad_norm": 0.4377952749193301, + "learning_rate": 1.6168969762454897e-05, + "loss": 0.2761, + "step": 1600 + }, + { + "epoch": 1.7981187701811034, + "grad_norm": 0.4878789221305916, + "learning_rate": 1.616279415387327e-05, + "loss": 0.312, + "step": 1601 + }, + { + "epoch": 1.7992418924610418, + "grad_norm": 0.4072202956012937, + "learning_rate": 1.6156614753256583e-05, + "loss": 0.2731, + "step": 1602 + }, + { + "epoch": 1.8003650147409798, + "grad_norm": 0.425387058556243, + "learning_rate": 1.615043156440709e-05, + "loss": 0.2647, + "step": 1603 + }, + { + "epoch": 1.8014881370209181, + "grad_norm": 0.44570656361437866, + "learning_rate": 1.6144244591129373e-05, + "loss": 0.2856, + "step": 1604 + }, + { + "epoch": 1.8026112593008565, + "grad_norm": 0.5020676279175268, + "learning_rate": 1.6138053837230345e-05, + "loss": 0.3087, + "step": 1605 + }, + { + "epoch": 1.8037343815807945, + "grad_norm": 0.43825994146175534, + "learning_rate": 1.6131859306519243e-05, + "loss": 0.2899, + "step": 1606 + }, + { + "epoch": 1.8048575038607328, + "grad_norm": 0.4446328374851358, + "learning_rate": 1.612566100280763e-05, + "loss": 0.28, + "step": 1607 + }, + { + "epoch": 1.8059806261406712, + "grad_norm": 0.4774225518830962, + "learning_rate": 1.6119458929909394e-05, + "loss": 0.3031, + "step": 1608 + }, + { + "epoch": 1.8071037484206092, + "grad_norm": 0.426328321425137, + "learning_rate": 1.611325309164074e-05, + "loss": 0.2766, + "step": 1609 + }, + { + "epoch": 1.8082268707005476, + "grad_norm": 0.43586684491416294, + "learning_rate": 1.610704349182018e-05, + "loss": 0.2928, + "step": 1610 + }, + { + "epoch": 1.8093499929804857, + "grad_norm": 0.4567133760129506, + "learning_rate": 1.6100830134268558e-05, + "loss": 0.2722, + "step": 1611 + }, + { + "epoch": 1.8104731152604239, + "grad_norm": 0.45026824414792077, + "learning_rate": 1.6094613022809017e-05, + "loss": 0.2876, + "step": 1612 + }, + { + "epoch": 1.8115962375403623, + "grad_norm": 0.45757626469045953, + "learning_rate": 1.6088392161267018e-05, + "loss": 0.2961, + "step": 1613 + }, + { + "epoch": 1.8127193598203004, + "grad_norm": 0.4502992493729303, + "learning_rate": 1.6082167553470318e-05, + "loss": 0.3049, + "step": 1614 + }, + { + "epoch": 1.8138424821002386, + "grad_norm": 0.4454769781082224, + "learning_rate": 1.607593920324899e-05, + "loss": 0.2817, + "step": 1615 + }, + { + "epoch": 1.814965604380177, + "grad_norm": 0.4435355751540588, + "learning_rate": 1.606970711443541e-05, + "loss": 0.2704, + "step": 1616 + }, + { + "epoch": 1.8160887266601151, + "grad_norm": 0.4434669970265584, + "learning_rate": 1.606347129086425e-05, + "loss": 0.2893, + "step": 1617 + }, + { + "epoch": 1.8172118489400533, + "grad_norm": 0.4472789556626577, + "learning_rate": 1.6057231736372478e-05, + "loss": 0.2784, + "step": 1618 + }, + { + "epoch": 1.8183349712199917, + "grad_norm": 0.45291111323420824, + "learning_rate": 1.605098845479936e-05, + "loss": 0.2933, + "step": 1619 + }, + { + "epoch": 1.8194580934999298, + "grad_norm": 0.4599287948047513, + "learning_rate": 1.6044741449986458e-05, + "loss": 0.2984, + "step": 1620 + }, + { + "epoch": 1.820581215779868, + "grad_norm": 0.4584871742864635, + "learning_rate": 1.6038490725777624e-05, + "loss": 0.3051, + "step": 1621 + }, + { + "epoch": 1.8217043380598064, + "grad_norm": 0.4696965226478849, + "learning_rate": 1.6032236286018995e-05, + "loss": 0.3184, + "step": 1622 + }, + { + "epoch": 1.8228274603397445, + "grad_norm": 0.4456532072721999, + "learning_rate": 1.6025978134559e-05, + "loss": 0.2814, + "step": 1623 + }, + { + "epoch": 1.8239505826196827, + "grad_norm": 0.45088143754678195, + "learning_rate": 1.6019716275248342e-05, + "loss": 0.3047, + "step": 1624 + }, + { + "epoch": 1.825073704899621, + "grad_norm": 0.43175491411132033, + "learning_rate": 1.6013450711940017e-05, + "loss": 0.274, + "step": 1625 + }, + { + "epoch": 1.8261968271795592, + "grad_norm": 0.46280692541168383, + "learning_rate": 1.600718144848929e-05, + "loss": 0.2808, + "step": 1626 + }, + { + "epoch": 1.8273199494594974, + "grad_norm": 0.45335668599330486, + "learning_rate": 1.600090848875372e-05, + "loss": 0.2749, + "step": 1627 + }, + { + "epoch": 1.8284430717394358, + "grad_norm": 0.4903202775753055, + "learning_rate": 1.5994631836593116e-05, + "loss": 0.3107, + "step": 1628 + }, + { + "epoch": 1.8295661940193737, + "grad_norm": 0.4715442977840691, + "learning_rate": 1.5988351495869574e-05, + "loss": 0.3007, + "step": 1629 + }, + { + "epoch": 1.830689316299312, + "grad_norm": 0.4535990104723046, + "learning_rate": 1.598206747044746e-05, + "loss": 0.2845, + "step": 1630 + }, + { + "epoch": 1.8318124385792505, + "grad_norm": 0.45369244026805666, + "learning_rate": 1.59757797641934e-05, + "loss": 0.2893, + "step": 1631 + }, + { + "epoch": 1.8329355608591884, + "grad_norm": 0.4406335157305204, + "learning_rate": 1.596948838097629e-05, + "loss": 0.2882, + "step": 1632 + }, + { + "epoch": 1.8340586831391268, + "grad_norm": 0.4471966455869977, + "learning_rate": 1.596319332466729e-05, + "loss": 0.275, + "step": 1633 + }, + { + "epoch": 1.835181805419065, + "grad_norm": 0.43360372640049477, + "learning_rate": 1.5956894599139814e-05, + "loss": 0.2819, + "step": 1634 + }, + { + "epoch": 1.836304927699003, + "grad_norm": 0.43343847003699065, + "learning_rate": 1.5950592208269536e-05, + "loss": 0.2875, + "step": 1635 + }, + { + "epoch": 1.8374280499789415, + "grad_norm": 0.4165129497318286, + "learning_rate": 1.5944286155934396e-05, + "loss": 0.2636, + "step": 1636 + }, + { + "epoch": 1.8385511722588797, + "grad_norm": 0.45238285001736483, + "learning_rate": 1.5937976446014563e-05, + "loss": 0.263, + "step": 1637 + }, + { + "epoch": 1.8396742945388178, + "grad_norm": 0.4466843595442531, + "learning_rate": 1.593166308239248e-05, + "loss": 0.276, + "step": 1638 + }, + { + "epoch": 1.8407974168187562, + "grad_norm": 0.4358078236250983, + "learning_rate": 1.5925346068952833e-05, + "loss": 0.2951, + "step": 1639 + }, + { + "epoch": 1.8419205390986944, + "grad_norm": 0.44824760657818036, + "learning_rate": 1.5919025409582537e-05, + "loss": 0.3078, + "step": 1640 + }, + { + "epoch": 1.8430436613786325, + "grad_norm": 0.43735777546518884, + "learning_rate": 1.5912701108170777e-05, + "loss": 0.2877, + "step": 1641 + }, + { + "epoch": 1.844166783658571, + "grad_norm": 0.4578607904759152, + "learning_rate": 1.5906373168608952e-05, + "loss": 0.2865, + "step": 1642 + }, + { + "epoch": 1.845289905938509, + "grad_norm": 0.41061673520301367, + "learning_rate": 1.5900041594790722e-05, + "loss": 0.2744, + "step": 1643 + }, + { + "epoch": 1.8464130282184472, + "grad_norm": 0.4765113577474518, + "learning_rate": 1.5893706390611978e-05, + "loss": 0.3107, + "step": 1644 + }, + { + "epoch": 1.8475361504983856, + "grad_norm": 0.435712372199893, + "learning_rate": 1.5887367559970825e-05, + "loss": 0.2664, + "step": 1645 + }, + { + "epoch": 1.8486592727783238, + "grad_norm": 0.4753377958035371, + "learning_rate": 1.588102510676763e-05, + "loss": 0.2915, + "step": 1646 + }, + { + "epoch": 1.849782395058262, + "grad_norm": 0.45798588409720853, + "learning_rate": 1.5874679034904966e-05, + "loss": 0.2794, + "step": 1647 + }, + { + "epoch": 1.8509055173382003, + "grad_norm": 0.42891102168652906, + "learning_rate": 1.5868329348287647e-05, + "loss": 0.2712, + "step": 1648 + }, + { + "epoch": 1.8520286396181385, + "grad_norm": 0.45380196793915967, + "learning_rate": 1.58619760508227e-05, + "loss": 0.2872, + "step": 1649 + }, + { + "epoch": 1.8531517618980766, + "grad_norm": 0.4721744608194615, + "learning_rate": 1.5855619146419382e-05, + "loss": 0.3042, + "step": 1650 + }, + { + "epoch": 1.854274884178015, + "grad_norm": 0.4426415279566046, + "learning_rate": 1.5849258638989166e-05, + "loss": 0.2886, + "step": 1651 + }, + { + "epoch": 1.8553980064579532, + "grad_norm": 0.4403624280134429, + "learning_rate": 1.5842894532445738e-05, + "loss": 0.29, + "step": 1652 + }, + { + "epoch": 1.8565211287378913, + "grad_norm": 0.44346698610518404, + "learning_rate": 1.583652683070501e-05, + "loss": 0.286, + "step": 1653 + }, + { + "epoch": 1.8576442510178297, + "grad_norm": 0.43077248160418, + "learning_rate": 1.5830155537685093e-05, + "loss": 0.2822, + "step": 1654 + }, + { + "epoch": 1.8587673732977676, + "grad_norm": 0.4347083314214242, + "learning_rate": 1.5823780657306313e-05, + "loss": 0.2749, + "step": 1655 + }, + { + "epoch": 1.859890495577706, + "grad_norm": 0.4278133742132193, + "learning_rate": 1.581740219349121e-05, + "loss": 0.2656, + "step": 1656 + }, + { + "epoch": 1.8610136178576444, + "grad_norm": 0.418571552787694, + "learning_rate": 1.5811020150164518e-05, + "loss": 0.2563, + "step": 1657 + }, + { + "epoch": 1.8621367401375823, + "grad_norm": 0.4692428605964284, + "learning_rate": 1.5804634531253184e-05, + "loss": 0.2982, + "step": 1658 + }, + { + "epoch": 1.8632598624175207, + "grad_norm": 0.4763812793557251, + "learning_rate": 1.5798245340686342e-05, + "loss": 0.2839, + "step": 1659 + }, + { + "epoch": 1.864382984697459, + "grad_norm": 0.45442752765196487, + "learning_rate": 1.5791852582395334e-05, + "loss": 0.3055, + "step": 1660 + }, + { + "epoch": 1.865506106977397, + "grad_norm": 0.4210856174208563, + "learning_rate": 1.5785456260313702e-05, + "loss": 0.2787, + "step": 1661 + }, + { + "epoch": 1.8666292292573354, + "grad_norm": 0.4701122518568725, + "learning_rate": 1.577905637837716e-05, + "loss": 0.297, + "step": 1662 + }, + { + "epoch": 1.8677523515372736, + "grad_norm": 0.45300215556011086, + "learning_rate": 1.5772652940523637e-05, + "loss": 0.287, + "step": 1663 + }, + { + "epoch": 1.8688754738172118, + "grad_norm": 0.4483737642832065, + "learning_rate": 1.576624595069323e-05, + "loss": 0.2889, + "step": 1664 + }, + { + "epoch": 1.8699985960971501, + "grad_norm": 0.44444518527475235, + "learning_rate": 1.575983541282824e-05, + "loss": 0.278, + "step": 1665 + }, + { + "epoch": 1.8711217183770883, + "grad_norm": 0.4736568759863554, + "learning_rate": 1.5753421330873134e-05, + "loss": 0.2886, + "step": 1666 + }, + { + "epoch": 1.8722448406570265, + "grad_norm": 0.47085233017222466, + "learning_rate": 1.574700370877457e-05, + "loss": 0.2988, + "step": 1667 + }, + { + "epoch": 1.8733679629369648, + "grad_norm": 0.4503283744113668, + "learning_rate": 1.574058255048138e-05, + "loss": 0.2842, + "step": 1668 + }, + { + "epoch": 1.874491085216903, + "grad_norm": 0.45383270715255947, + "learning_rate": 1.5734157859944574e-05, + "loss": 0.2956, + "step": 1669 + }, + { + "epoch": 1.8756142074968412, + "grad_norm": 0.458441923267404, + "learning_rate": 1.5727729641117338e-05, + "loss": 0.3, + "step": 1670 + }, + { + "epoch": 1.8767373297767795, + "grad_norm": 0.45050041121130363, + "learning_rate": 1.5721297897955023e-05, + "loss": 0.2843, + "step": 1671 + }, + { + "epoch": 1.8778604520567177, + "grad_norm": 0.42269479225207923, + "learning_rate": 1.5714862634415145e-05, + "loss": 0.2755, + "step": 1672 + }, + { + "epoch": 1.8789835743366559, + "grad_norm": 0.4118289888578974, + "learning_rate": 1.5708423854457408e-05, + "loss": 0.2683, + "step": 1673 + }, + { + "epoch": 1.8801066966165942, + "grad_norm": 0.4519087067087661, + "learning_rate": 1.5701981562043648e-05, + "loss": 0.2941, + "step": 1674 + }, + { + "epoch": 1.8812298188965324, + "grad_norm": 0.45169859494837117, + "learning_rate": 1.569553576113789e-05, + "loss": 0.3081, + "step": 1675 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.45818548968025397, + "learning_rate": 1.56890864557063e-05, + "loss": 0.2906, + "step": 1676 + }, + { + "epoch": 1.883476063456409, + "grad_norm": 0.41903925729828695, + "learning_rate": 1.5682633649717206e-05, + "loss": 0.256, + "step": 1677 + }, + { + "epoch": 1.8845991857363469, + "grad_norm": 0.4494067572971225, + "learning_rate": 1.5676177347141096e-05, + "loss": 0.2999, + "step": 1678 + }, + { + "epoch": 1.8857223080162853, + "grad_norm": 0.430053388913964, + "learning_rate": 1.5669717551950595e-05, + "loss": 0.2778, + "step": 1679 + }, + { + "epoch": 1.8868454302962236, + "grad_norm": 0.45114702164180737, + "learning_rate": 1.5663254268120497e-05, + "loss": 0.2782, + "step": 1680 + }, + { + "epoch": 1.8879685525761616, + "grad_norm": 0.4767521278617856, + "learning_rate": 1.5656787499627727e-05, + "loss": 0.3075, + "step": 1681 + }, + { + "epoch": 1.8890916748561, + "grad_norm": 0.43432528267843223, + "learning_rate": 1.5650317250451357e-05, + "loss": 0.2925, + "step": 1682 + }, + { + "epoch": 1.8902147971360383, + "grad_norm": 0.43280983326124206, + "learning_rate": 1.5643843524572605e-05, + "loss": 0.2833, + "step": 1683 + }, + { + "epoch": 1.8913379194159763, + "grad_norm": 0.4460244169687265, + "learning_rate": 1.5637366325974823e-05, + "loss": 0.3001, + "step": 1684 + }, + { + "epoch": 1.8924610416959147, + "grad_norm": 0.4748835442825777, + "learning_rate": 1.5630885658643508e-05, + "loss": 0.305, + "step": 1685 + }, + { + "epoch": 1.8935841639758528, + "grad_norm": 0.4145939481842465, + "learning_rate": 1.5624401526566277e-05, + "loss": 0.2629, + "step": 1686 + }, + { + "epoch": 1.894707286255791, + "grad_norm": 0.452591025755552, + "learning_rate": 1.5617913933732892e-05, + "loss": 0.3203, + "step": 1687 + }, + { + "epoch": 1.8958304085357294, + "grad_norm": 0.42763483263940355, + "learning_rate": 1.5611422884135245e-05, + "loss": 0.2815, + "step": 1688 + }, + { + "epoch": 1.8969535308156675, + "grad_norm": 0.43158239697123274, + "learning_rate": 1.5604928381767345e-05, + "loss": 0.2793, + "step": 1689 + }, + { + "epoch": 1.8980766530956057, + "grad_norm": 0.4293853340809833, + "learning_rate": 1.5598430430625335e-05, + "loss": 0.2809, + "step": 1690 + }, + { + "epoch": 1.899199775375544, + "grad_norm": 0.46535273456706855, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.3093, + "step": 1691 + }, + { + "epoch": 1.9003228976554822, + "grad_norm": 0.43293986112214783, + "learning_rate": 1.5585424198014135e-05, + "loss": 0.2704, + "step": 1692 + }, + { + "epoch": 1.9014460199354204, + "grad_norm": 0.47308351663116804, + "learning_rate": 1.5578915924547824e-05, + "loss": 0.3087, + "step": 1693 + }, + { + "epoch": 1.9025691422153588, + "grad_norm": 0.45499586114643786, + "learning_rate": 1.557240421831315e-05, + "loss": 0.2979, + "step": 1694 + }, + { + "epoch": 1.903692264495297, + "grad_norm": 0.43179402389716814, + "learning_rate": 1.5565889083316847e-05, + "loss": 0.2795, + "step": 1695 + }, + { + "epoch": 1.904815386775235, + "grad_norm": 0.4337080321597782, + "learning_rate": 1.5559370523567734e-05, + "loss": 0.2884, + "step": 1696 + }, + { + "epoch": 1.9059385090551735, + "grad_norm": 0.4416737410605399, + "learning_rate": 1.5552848543076762e-05, + "loss": 0.2916, + "step": 1697 + }, + { + "epoch": 1.9070616313351116, + "grad_norm": 0.45327771862806987, + "learning_rate": 1.5546323145856976e-05, + "loss": 0.283, + "step": 1698 + }, + { + "epoch": 1.9081847536150498, + "grad_norm": 0.43897115726266495, + "learning_rate": 1.5539794335923523e-05, + "loss": 0.2757, + "step": 1699 + }, + { + "epoch": 1.9093078758949882, + "grad_norm": 0.45535826828398407, + "learning_rate": 1.553326211729365e-05, + "loss": 0.2846, + "step": 1700 + }, + { + "epoch": 1.9104309981749263, + "grad_norm": 0.44632754329033314, + "learning_rate": 1.5526726493986707e-05, + "loss": 0.2854, + "step": 1701 + }, + { + "epoch": 1.9115541204548645, + "grad_norm": 0.44264267367794924, + "learning_rate": 1.5520187470024138e-05, + "loss": 0.291, + "step": 1702 + }, + { + "epoch": 1.9126772427348029, + "grad_norm": 0.4261191699485715, + "learning_rate": 1.5513645049429468e-05, + "loss": 0.2744, + "step": 1703 + }, + { + "epoch": 1.9138003650147408, + "grad_norm": 0.4718910711066927, + "learning_rate": 1.550709923622832e-05, + "loss": 0.3094, + "step": 1704 + }, + { + "epoch": 1.9149234872946792, + "grad_norm": 0.443144418899658, + "learning_rate": 1.5500550034448415e-05, + "loss": 0.265, + "step": 1705 + }, + { + "epoch": 1.9160466095746176, + "grad_norm": 0.4680353665068334, + "learning_rate": 1.549399744811954e-05, + "loss": 0.2984, + "step": 1706 + }, + { + "epoch": 1.9171697318545555, + "grad_norm": 0.451871229122914, + "learning_rate": 1.5487441481273576e-05, + "loss": 0.2856, + "step": 1707 + }, + { + "epoch": 1.918292854134494, + "grad_norm": 0.45525795558547877, + "learning_rate": 1.5480882137944483e-05, + "loss": 0.2972, + "step": 1708 + }, + { + "epoch": 1.919415976414432, + "grad_norm": 0.457501569499871, + "learning_rate": 1.54743194221683e-05, + "loss": 0.2842, + "step": 1709 + }, + { + "epoch": 1.9205390986943702, + "grad_norm": 0.40429630388545335, + "learning_rate": 1.546775333798313e-05, + "loss": 0.2635, + "step": 1710 + }, + { + "epoch": 1.9216622209743086, + "grad_norm": 0.43949600121406834, + "learning_rate": 1.5461183889429163e-05, + "loss": 0.287, + "step": 1711 + }, + { + "epoch": 1.9227853432542468, + "grad_norm": 0.44623475635576615, + "learning_rate": 1.545461108054865e-05, + "loss": 0.295, + "step": 1712 + }, + { + "epoch": 1.923908465534185, + "grad_norm": 0.4332274620156582, + "learning_rate": 1.5448034915385912e-05, + "loss": 0.2764, + "step": 1713 + }, + { + "epoch": 1.9250315878141233, + "grad_norm": 0.43976301663859074, + "learning_rate": 1.5441455397987342e-05, + "loss": 0.2796, + "step": 1714 + }, + { + "epoch": 1.9261547100940615, + "grad_norm": 0.48004372769154424, + "learning_rate": 1.543487253240138e-05, + "loss": 0.3315, + "step": 1715 + }, + { + "epoch": 1.9272778323739996, + "grad_norm": 0.44346770478396347, + "learning_rate": 1.5428286322678544e-05, + "loss": 0.3029, + "step": 1716 + }, + { + "epoch": 1.928400954653938, + "grad_norm": 0.452216885627314, + "learning_rate": 1.54216967728714e-05, + "loss": 0.2889, + "step": 1717 + }, + { + "epoch": 1.9295240769338762, + "grad_norm": 0.43338802482350897, + "learning_rate": 1.5415103887034565e-05, + "loss": 0.2847, + "step": 1718 + }, + { + "epoch": 1.9306471992138143, + "grad_norm": 0.42192888016892804, + "learning_rate": 1.540850766922472e-05, + "loss": 0.283, + "step": 1719 + }, + { + "epoch": 1.9317703214937527, + "grad_norm": 0.45160195890660065, + "learning_rate": 1.540190812350059e-05, + "loss": 0.2745, + "step": 1720 + }, + { + "epoch": 1.9328934437736909, + "grad_norm": 0.45447959773978414, + "learning_rate": 1.539530525392294e-05, + "loss": 0.3008, + "step": 1721 + }, + { + "epoch": 1.934016566053629, + "grad_norm": 0.4371017493232612, + "learning_rate": 1.53886990645546e-05, + "loss": 0.2948, + "step": 1722 + }, + { + "epoch": 1.9351396883335674, + "grad_norm": 0.4334080460875823, + "learning_rate": 1.5382089559460423e-05, + "loss": 0.2818, + "step": 1723 + }, + { + "epoch": 1.9362628106135056, + "grad_norm": 0.4731746980122572, + "learning_rate": 1.5375476742707314e-05, + "loss": 0.3038, + "step": 1724 + }, + { + "epoch": 1.9373859328934437, + "grad_norm": 0.454174378429582, + "learning_rate": 1.536886061836421e-05, + "loss": 0.2948, + "step": 1725 + }, + { + "epoch": 1.9385090551733821, + "grad_norm": 0.4432630231842709, + "learning_rate": 1.5362241190502086e-05, + "loss": 0.2852, + "step": 1726 + }, + { + "epoch": 1.9396321774533203, + "grad_norm": 0.4598840425874724, + "learning_rate": 1.5355618463193945e-05, + "loss": 0.2815, + "step": 1727 + }, + { + "epoch": 1.9407552997332584, + "grad_norm": 0.451831626436298, + "learning_rate": 1.5348992440514832e-05, + "loss": 0.2729, + "step": 1728 + }, + { + "epoch": 1.9418784220131968, + "grad_norm": 0.46544809796158676, + "learning_rate": 1.534236312654181e-05, + "loss": 0.3169, + "step": 1729 + }, + { + "epoch": 1.9430015442931348, + "grad_norm": 0.44645885927284146, + "learning_rate": 1.5335730525353962e-05, + "loss": 0.2785, + "step": 1730 + }, + { + "epoch": 1.9441246665730731, + "grad_norm": 0.469136480578403, + "learning_rate": 1.5329094641032406e-05, + "loss": 0.3056, + "step": 1731 + }, + { + "epoch": 1.9452477888530115, + "grad_norm": 0.43983149032385477, + "learning_rate": 1.5322455477660274e-05, + "loss": 0.2836, + "step": 1732 + }, + { + "epoch": 1.9463709111329495, + "grad_norm": 0.44719810124182485, + "learning_rate": 1.5315813039322714e-05, + "loss": 0.2801, + "step": 1733 + }, + { + "epoch": 1.9474940334128878, + "grad_norm": 0.4290827977526451, + "learning_rate": 1.5309167330106895e-05, + "loss": 0.2704, + "step": 1734 + }, + { + "epoch": 1.948617155692826, + "grad_norm": 0.4497567848165034, + "learning_rate": 1.5302518354101992e-05, + "loss": 0.2967, + "step": 1735 + }, + { + "epoch": 1.9497402779727642, + "grad_norm": 0.468862605371959, + "learning_rate": 1.5295866115399193e-05, + "loss": 0.3041, + "step": 1736 + }, + { + "epoch": 1.9508634002527026, + "grad_norm": 0.4888366595716739, + "learning_rate": 1.5289210618091695e-05, + "loss": 0.3024, + "step": 1737 + }, + { + "epoch": 1.9519865225326407, + "grad_norm": 0.476112693226597, + "learning_rate": 1.52825518662747e-05, + "loss": 0.2621, + "step": 1738 + }, + { + "epoch": 1.9531096448125789, + "grad_norm": 0.44772753456589154, + "learning_rate": 1.527588986404541e-05, + "loss": 0.2796, + "step": 1739 + }, + { + "epoch": 1.9542327670925173, + "grad_norm": 0.5107778899997112, + "learning_rate": 1.5269224615503025e-05, + "loss": 0.3308, + "step": 1740 + }, + { + "epoch": 1.9553558893724554, + "grad_norm": 0.4674299014245734, + "learning_rate": 1.5262556124748754e-05, + "loss": 0.2826, + "step": 1741 + }, + { + "epoch": 1.9564790116523936, + "grad_norm": 0.46960805580213283, + "learning_rate": 1.5255884395885785e-05, + "loss": 0.2939, + "step": 1742 + }, + { + "epoch": 1.957602133932332, + "grad_norm": 0.4209722750164603, + "learning_rate": 1.5249209433019307e-05, + "loss": 0.2722, + "step": 1743 + }, + { + "epoch": 1.9587252562122701, + "grad_norm": 0.4568155973367222, + "learning_rate": 1.5242531240256501e-05, + "loss": 0.2811, + "step": 1744 + }, + { + "epoch": 1.9598483784922083, + "grad_norm": 0.4307082739584054, + "learning_rate": 1.5235849821706531e-05, + "loss": 0.28, + "step": 1745 + }, + { + "epoch": 1.9609715007721467, + "grad_norm": 0.4234189998569159, + "learning_rate": 1.5229165181480552e-05, + "loss": 0.2671, + "step": 1746 + }, + { + "epoch": 1.9620946230520848, + "grad_norm": 0.4739925558985982, + "learning_rate": 1.5222477323691687e-05, + "loss": 0.307, + "step": 1747 + }, + { + "epoch": 1.963217745332023, + "grad_norm": 0.44810995148794674, + "learning_rate": 1.5215786252455056e-05, + "loss": 0.2925, + "step": 1748 + }, + { + "epoch": 1.9643408676119614, + "grad_norm": 0.43430736966909755, + "learning_rate": 1.5209091971887747e-05, + "loss": 0.281, + "step": 1749 + }, + { + "epoch": 1.9654639898918995, + "grad_norm": 0.44148058197118256, + "learning_rate": 1.5202394486108823e-05, + "loss": 0.2981, + "step": 1750 + }, + { + "epoch": 1.9665871121718377, + "grad_norm": 0.4108291724424686, + "learning_rate": 1.5195693799239322e-05, + "loss": 0.2675, + "step": 1751 + }, + { + "epoch": 1.967710234451776, + "grad_norm": 0.4412806106090446, + "learning_rate": 1.5188989915402253e-05, + "loss": 0.2827, + "step": 1752 + }, + { + "epoch": 1.968833356731714, + "grad_norm": 0.4529168112870875, + "learning_rate": 1.5182282838722584e-05, + "loss": 0.2939, + "step": 1753 + }, + { + "epoch": 1.9699564790116524, + "grad_norm": 0.41285809272736673, + "learning_rate": 1.5175572573327257e-05, + "loss": 0.2731, + "step": 1754 + }, + { + "epoch": 1.9710796012915908, + "grad_norm": 0.43190618430843086, + "learning_rate": 1.5168859123345172e-05, + "loss": 0.2774, + "step": 1755 + }, + { + "epoch": 1.9722027235715287, + "grad_norm": 0.4552856993615373, + "learning_rate": 1.5162142492907186e-05, + "loss": 0.2862, + "step": 1756 + }, + { + "epoch": 1.973325845851467, + "grad_norm": 0.4514244233597325, + "learning_rate": 1.5155422686146118e-05, + "loss": 0.2727, + "step": 1757 + }, + { + "epoch": 1.9744489681314055, + "grad_norm": 0.43333285646026426, + "learning_rate": 1.5148699707196739e-05, + "loss": 0.2774, + "step": 1758 + }, + { + "epoch": 1.9755720904113434, + "grad_norm": 0.4508656102546517, + "learning_rate": 1.5141973560195768e-05, + "loss": 0.2944, + "step": 1759 + }, + { + "epoch": 1.9766952126912818, + "grad_norm": 0.4743163650426743, + "learning_rate": 1.5135244249281884e-05, + "loss": 0.323, + "step": 1760 + }, + { + "epoch": 1.97781833497122, + "grad_norm": 0.4564248851686335, + "learning_rate": 1.5128511778595703e-05, + "loss": 0.2717, + "step": 1761 + }, + { + "epoch": 1.978941457251158, + "grad_norm": 0.470303991029109, + "learning_rate": 1.5121776152279786e-05, + "loss": 0.3205, + "step": 1762 + }, + { + "epoch": 1.9800645795310965, + "grad_norm": 0.4224772001099396, + "learning_rate": 1.5115037374478641e-05, + "loss": 0.2678, + "step": 1763 + }, + { + "epoch": 1.9811877018110347, + "grad_norm": 0.4457682890054829, + "learning_rate": 1.510829544933871e-05, + "loss": 0.297, + "step": 1764 + }, + { + "epoch": 1.9823108240909728, + "grad_norm": 0.4371918445392385, + "learning_rate": 1.5101550381008377e-05, + "loss": 0.272, + "step": 1765 + }, + { + "epoch": 1.9834339463709112, + "grad_norm": 0.4565606365981388, + "learning_rate": 1.5094802173637953e-05, + "loss": 0.3024, + "step": 1766 + }, + { + "epoch": 1.9845570686508494, + "grad_norm": 0.4712494126543963, + "learning_rate": 1.5088050831379684e-05, + "loss": 0.2958, + "step": 1767 + }, + { + "epoch": 1.9856801909307875, + "grad_norm": 0.4813432876456791, + "learning_rate": 1.508129635838775e-05, + "loss": 0.2996, + "step": 1768 + }, + { + "epoch": 1.986803313210726, + "grad_norm": 0.42456961820085154, + "learning_rate": 1.5074538758818247e-05, + "loss": 0.2701, + "step": 1769 + }, + { + "epoch": 1.987926435490664, + "grad_norm": 0.4466264321919789, + "learning_rate": 1.5067778036829204e-05, + "loss": 0.2844, + "step": 1770 + }, + { + "epoch": 1.9890495577706022, + "grad_norm": 0.45361513546811316, + "learning_rate": 1.5061014196580565e-05, + "loss": 0.279, + "step": 1771 + }, + { + "epoch": 1.9901726800505406, + "grad_norm": 0.4418593232740048, + "learning_rate": 1.5054247242234197e-05, + "loss": 0.2736, + "step": 1772 + }, + { + "epoch": 1.9912958023304788, + "grad_norm": 0.4860660602595377, + "learning_rate": 1.5047477177953887e-05, + "loss": 0.313, + "step": 1773 + }, + { + "epoch": 1.992418924610417, + "grad_norm": 0.43959539799018116, + "learning_rate": 1.5040704007905319e-05, + "loss": 0.2737, + "step": 1774 + }, + { + "epoch": 1.9935420468903553, + "grad_norm": 0.49294497786554403, + "learning_rate": 1.5033927736256107e-05, + "loss": 0.3041, + "step": 1775 + }, + { + "epoch": 1.9946651691702935, + "grad_norm": 0.42461221519723574, + "learning_rate": 1.5027148367175759e-05, + "loss": 0.2616, + "step": 1776 + }, + { + "epoch": 1.9957882914502316, + "grad_norm": 0.4402615651724432, + "learning_rate": 1.50203659048357e-05, + "loss": 0.2697, + "step": 1777 + }, + { + "epoch": 1.99691141373017, + "grad_norm": 0.46310993944775436, + "learning_rate": 1.5013580353409259e-05, + "loss": 0.2917, + "step": 1778 + }, + { + "epoch": 1.998034536010108, + "grad_norm": 0.46147694875445594, + "learning_rate": 1.500679171707165e-05, + "loss": 0.2791, + "step": 1779 + }, + { + "epoch": 1.9991576582900463, + "grad_norm": 0.4680035095821885, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.2826, + "step": 1780 + }, + { + "epoch": 2.0002807805699847, + "grad_norm": 0.8085198551259174, + "learning_rate": 1.499320520637333e-05, + "loss": 0.4237, + "step": 1781 + }, + { + "epoch": 2.0014039028499226, + "grad_norm": 0.541462070507787, + "learning_rate": 1.4986407340372546e-05, + "loss": 0.177, + "step": 1782 + }, + { + "epoch": 2.002527025129861, + "grad_norm": 0.514819434140705, + "learning_rate": 1.4979606406180456e-05, + "loss": 0.1881, + "step": 1783 + }, + { + "epoch": 2.0036501474097994, + "grad_norm": 0.45400673841814165, + "learning_rate": 1.4972802407981744e-05, + "loss": 0.174, + "step": 1784 + }, + { + "epoch": 2.0047732696897373, + "grad_norm": 0.5894341185038876, + "learning_rate": 1.4965995349962987e-05, + "loss": 0.1669, + "step": 1785 + }, + { + "epoch": 2.0058963919696757, + "grad_norm": 0.780454484731567, + "learning_rate": 1.4959185236312642e-05, + "loss": 0.1779, + "step": 1786 + }, + { + "epoch": 2.007019514249614, + "grad_norm": 0.5115911523467761, + "learning_rate": 1.495237207122105e-05, + "loss": 0.1719, + "step": 1787 + }, + { + "epoch": 2.008142636529552, + "grad_norm": 0.47746167967424985, + "learning_rate": 1.4945555858880422e-05, + "loss": 0.1658, + "step": 1788 + }, + { + "epoch": 2.0092657588094904, + "grad_norm": 0.49235675543550705, + "learning_rate": 1.493873660348485e-05, + "loss": 0.1862, + "step": 1789 + }, + { + "epoch": 2.010388881089429, + "grad_norm": 0.4566009363822769, + "learning_rate": 1.49319143092303e-05, + "loss": 0.1768, + "step": 1790 + }, + { + "epoch": 2.0115120033693668, + "grad_norm": 0.4648626440058988, + "learning_rate": 1.4925088980314604e-05, + "loss": 0.1751, + "step": 1791 + }, + { + "epoch": 2.012635125649305, + "grad_norm": 0.45375792194533593, + "learning_rate": 1.4918260620937458e-05, + "loss": 0.1647, + "step": 1792 + }, + { + "epoch": 2.0137582479292435, + "grad_norm": 0.42584485799570987, + "learning_rate": 1.4911429235300425e-05, + "loss": 0.156, + "step": 1793 + }, + { + "epoch": 2.0148813702091815, + "grad_norm": 0.460137628374996, + "learning_rate": 1.490459482760694e-05, + "loss": 0.1708, + "step": 1794 + }, + { + "epoch": 2.01600449248912, + "grad_norm": 0.45344627339365856, + "learning_rate": 1.4897757402062285e-05, + "loss": 0.1567, + "step": 1795 + }, + { + "epoch": 2.0171276147690578, + "grad_norm": 0.4953474815052228, + "learning_rate": 1.48909169628736e-05, + "loss": 0.1707, + "step": 1796 + }, + { + "epoch": 2.018250737048996, + "grad_norm": 0.4753046189955386, + "learning_rate": 1.488407351424989e-05, + "loss": 0.1641, + "step": 1797 + }, + { + "epoch": 2.0193738593289345, + "grad_norm": 0.4519314500216831, + "learning_rate": 1.4877227060401997e-05, + "loss": 0.1668, + "step": 1798 + }, + { + "epoch": 2.0204969816088725, + "grad_norm": 0.45671522761655353, + "learning_rate": 1.4870377605542624e-05, + "loss": 0.1601, + "step": 1799 + }, + { + "epoch": 2.021620103888811, + "grad_norm": 0.456319516767008, + "learning_rate": 1.4863525153886314e-05, + "loss": 0.1641, + "step": 1800 + }, + { + "epoch": 2.0227432261687492, + "grad_norm": 0.45491434648308354, + "learning_rate": 1.4856669709649455e-05, + "loss": 0.1707, + "step": 1801 + }, + { + "epoch": 2.023866348448687, + "grad_norm": 0.4492812134103008, + "learning_rate": 1.4849811277050279e-05, + "loss": 0.1667, + "step": 1802 + }, + { + "epoch": 2.0249894707286256, + "grad_norm": 0.47837793383271804, + "learning_rate": 1.4842949860308854e-05, + "loss": 0.1762, + "step": 1803 + }, + { + "epoch": 2.026112593008564, + "grad_norm": 0.4366394160857708, + "learning_rate": 1.4836085463647088e-05, + "loss": 0.1627, + "step": 1804 + }, + { + "epoch": 2.027235715288502, + "grad_norm": 0.4819217078029733, + "learning_rate": 1.4829218091288713e-05, + "loss": 0.1833, + "step": 1805 + }, + { + "epoch": 2.0283588375684403, + "grad_norm": 0.4592333930190147, + "learning_rate": 1.4822347747459307e-05, + "loss": 0.1668, + "step": 1806 + }, + { + "epoch": 2.0294819598483786, + "grad_norm": 0.45422769074810193, + "learning_rate": 1.4815474436386263e-05, + "loss": 0.1683, + "step": 1807 + }, + { + "epoch": 2.0306050821283166, + "grad_norm": 0.4600329925419827, + "learning_rate": 1.4808598162298806e-05, + "loss": 0.1747, + "step": 1808 + }, + { + "epoch": 2.031728204408255, + "grad_norm": 0.4583134137959598, + "learning_rate": 1.4801718929427986e-05, + "loss": 0.1649, + "step": 1809 + }, + { + "epoch": 2.0328513266881933, + "grad_norm": 0.49051047861023445, + "learning_rate": 1.4794836742006667e-05, + "loss": 0.1714, + "step": 1810 + }, + { + "epoch": 2.0339744489681313, + "grad_norm": 0.45752788677919076, + "learning_rate": 1.4787951604269533e-05, + "loss": 0.17, + "step": 1811 + }, + { + "epoch": 2.0350975712480697, + "grad_norm": 0.46474367178946024, + "learning_rate": 1.478106352045309e-05, + "loss": 0.1784, + "step": 1812 + }, + { + "epoch": 2.036220693528008, + "grad_norm": 0.44888182231043505, + "learning_rate": 1.4774172494795651e-05, + "loss": 0.1655, + "step": 1813 + }, + { + "epoch": 2.037343815807946, + "grad_norm": 0.4874435144426795, + "learning_rate": 1.4767278531537335e-05, + "loss": 0.1746, + "step": 1814 + }, + { + "epoch": 2.0384669380878844, + "grad_norm": 0.45309578620813684, + "learning_rate": 1.476038163492008e-05, + "loss": 0.1598, + "step": 1815 + }, + { + "epoch": 2.0395900603678228, + "grad_norm": 0.47320504356622956, + "learning_rate": 1.4753481809187617e-05, + "loss": 0.1834, + "step": 1816 + }, + { + "epoch": 2.0407131826477607, + "grad_norm": 0.4720649193376181, + "learning_rate": 1.474657905858549e-05, + "loss": 0.1739, + "step": 1817 + }, + { + "epoch": 2.041836304927699, + "grad_norm": 0.4728143038805621, + "learning_rate": 1.4739673387361033e-05, + "loss": 0.1764, + "step": 1818 + }, + { + "epoch": 2.042959427207637, + "grad_norm": 0.44374918226300764, + "learning_rate": 1.4732764799763383e-05, + "loss": 0.1656, + "step": 1819 + }, + { + "epoch": 2.0440825494875754, + "grad_norm": 0.4923042826192989, + "learning_rate": 1.4725853300043472e-05, + "loss": 0.1987, + "step": 1820 + }, + { + "epoch": 2.0452056717675138, + "grad_norm": 0.46751435491816906, + "learning_rate": 1.4718938892454018e-05, + "loss": 0.1572, + "step": 1821 + }, + { + "epoch": 2.0463287940474517, + "grad_norm": 0.47339333449870347, + "learning_rate": 1.4712021581249534e-05, + "loss": 0.1776, + "step": 1822 + }, + { + "epoch": 2.04745191632739, + "grad_norm": 0.46020101583349715, + "learning_rate": 1.4705101370686316e-05, + "loss": 0.1604, + "step": 1823 + }, + { + "epoch": 2.0485750386073285, + "grad_norm": 0.49263307013323115, + "learning_rate": 1.469817826502245e-05, + "loss": 0.1668, + "step": 1824 + }, + { + "epoch": 2.0496981608872664, + "grad_norm": 0.4710764801250717, + "learning_rate": 1.4691252268517794e-05, + "loss": 0.1775, + "step": 1825 + }, + { + "epoch": 2.050821283167205, + "grad_norm": 0.49880730266155526, + "learning_rate": 1.4684323385433997e-05, + "loss": 0.1822, + "step": 1826 + }, + { + "epoch": 2.051944405447143, + "grad_norm": 0.46625550510782054, + "learning_rate": 1.4677391620034467e-05, + "loss": 0.174, + "step": 1827 + }, + { + "epoch": 2.053067527727081, + "grad_norm": 0.4872551100589393, + "learning_rate": 1.4670456976584401e-05, + "loss": 0.1656, + "step": 1828 + }, + { + "epoch": 2.0541906500070195, + "grad_norm": 0.43451500585845937, + "learning_rate": 1.4663519459350763e-05, + "loss": 0.1589, + "step": 1829 + }, + { + "epoch": 2.055313772286958, + "grad_norm": 0.428654270229592, + "learning_rate": 1.4656579072602281e-05, + "loss": 0.1596, + "step": 1830 + }, + { + "epoch": 2.056436894566896, + "grad_norm": 0.48053898356788094, + "learning_rate": 1.4649635820609457e-05, + "loss": 0.1831, + "step": 1831 + }, + { + "epoch": 2.057560016846834, + "grad_norm": 0.4472515086562773, + "learning_rate": 1.464268970764454e-05, + "loss": 0.1618, + "step": 1832 + }, + { + "epoch": 2.0586831391267726, + "grad_norm": 0.49366586224960823, + "learning_rate": 1.4635740737981557e-05, + "loss": 0.1771, + "step": 1833 + }, + { + "epoch": 2.0598062614067105, + "grad_norm": 0.455074161806074, + "learning_rate": 1.4628788915896282e-05, + "loss": 0.1661, + "step": 1834 + }, + { + "epoch": 2.060929383686649, + "grad_norm": 0.44819907357299266, + "learning_rate": 1.4621834245666254e-05, + "loss": 0.1563, + "step": 1835 + }, + { + "epoch": 2.0620525059665873, + "grad_norm": 0.4826290613150395, + "learning_rate": 1.4614876731570751e-05, + "loss": 0.1823, + "step": 1836 + }, + { + "epoch": 2.0631756282465252, + "grad_norm": 0.4657177556230326, + "learning_rate": 1.4607916377890807e-05, + "loss": 0.1703, + "step": 1837 + }, + { + "epoch": 2.0642987505264636, + "grad_norm": 0.47786499964299944, + "learning_rate": 1.4600953188909214e-05, + "loss": 0.1727, + "step": 1838 + }, + { + "epoch": 2.065421872806402, + "grad_norm": 0.4642252898495588, + "learning_rate": 1.4593987168910491e-05, + "loss": 0.1743, + "step": 1839 + }, + { + "epoch": 2.06654499508634, + "grad_norm": 0.4695641601793918, + "learning_rate": 1.4587018322180906e-05, + "loss": 0.1747, + "step": 1840 + }, + { + "epoch": 2.0676681173662783, + "grad_norm": 0.4662822283785743, + "learning_rate": 1.4580046653008474e-05, + "loss": 0.1692, + "step": 1841 + }, + { + "epoch": 2.0687912396462167, + "grad_norm": 0.4576878116997234, + "learning_rate": 1.457307216568293e-05, + "loss": 0.1643, + "step": 1842 + }, + { + "epoch": 2.0699143619261546, + "grad_norm": 0.44945289175219166, + "learning_rate": 1.4566094864495761e-05, + "loss": 0.1669, + "step": 1843 + }, + { + "epoch": 2.071037484206093, + "grad_norm": 0.4812629802641515, + "learning_rate": 1.4559114753740174e-05, + "loss": 0.1838, + "step": 1844 + }, + { + "epoch": 2.072160606486031, + "grad_norm": 0.4255497433471811, + "learning_rate": 1.4552131837711108e-05, + "loss": 0.1515, + "step": 1845 + }, + { + "epoch": 2.0732837287659693, + "grad_norm": 0.4489807287129697, + "learning_rate": 1.4545146120705229e-05, + "loss": 0.1668, + "step": 1846 + }, + { + "epoch": 2.0744068510459077, + "grad_norm": 0.4451775619304231, + "learning_rate": 1.4538157607020923e-05, + "loss": 0.1671, + "step": 1847 + }, + { + "epoch": 2.0755299733258457, + "grad_norm": 0.4736713116971495, + "learning_rate": 1.4531166300958303e-05, + "loss": 0.1715, + "step": 1848 + }, + { + "epoch": 2.076653095605784, + "grad_norm": 0.44599651030040116, + "learning_rate": 1.4524172206819195e-05, + "loss": 0.1695, + "step": 1849 + }, + { + "epoch": 2.0777762178857224, + "grad_norm": 0.4925862342950297, + "learning_rate": 1.4517175328907141e-05, + "loss": 0.177, + "step": 1850 + }, + { + "epoch": 2.0788993401656604, + "grad_norm": 0.4918233012105012, + "learning_rate": 1.4510175671527397e-05, + "loss": 0.1756, + "step": 1851 + }, + { + "epoch": 2.0800224624455987, + "grad_norm": 0.46620816928480197, + "learning_rate": 1.4503173238986932e-05, + "loss": 0.1762, + "step": 1852 + }, + { + "epoch": 2.081145584725537, + "grad_norm": 0.45469289458689577, + "learning_rate": 1.4496168035594418e-05, + "loss": 0.1696, + "step": 1853 + }, + { + "epoch": 2.082268707005475, + "grad_norm": 0.5073180847345866, + "learning_rate": 1.4489160065660231e-05, + "loss": 0.1748, + "step": 1854 + }, + { + "epoch": 2.0833918292854134, + "grad_norm": 0.49912895873161844, + "learning_rate": 1.4482149333496455e-05, + "loss": 0.1711, + "step": 1855 + }, + { + "epoch": 2.084514951565352, + "grad_norm": 0.4713637049357059, + "learning_rate": 1.4475135843416873e-05, + "loss": 0.1696, + "step": 1856 + }, + { + "epoch": 2.0856380738452898, + "grad_norm": 0.4886436819964026, + "learning_rate": 1.4468119599736957e-05, + "loss": 0.1719, + "step": 1857 + }, + { + "epoch": 2.086761196125228, + "grad_norm": 0.47437394171967345, + "learning_rate": 1.4461100606773884e-05, + "loss": 0.1583, + "step": 1858 + }, + { + "epoch": 2.0878843184051665, + "grad_norm": 0.4738862023625834, + "learning_rate": 1.4454078868846513e-05, + "loss": 0.1758, + "step": 1859 + }, + { + "epoch": 2.0890074406851045, + "grad_norm": 0.47210633789898465, + "learning_rate": 1.4447054390275401e-05, + "loss": 0.1707, + "step": 1860 + }, + { + "epoch": 2.090130562965043, + "grad_norm": 0.47219676032402647, + "learning_rate": 1.4440027175382784e-05, + "loss": 0.1538, + "step": 1861 + }, + { + "epoch": 2.0912536852449812, + "grad_norm": 0.45850818588081294, + "learning_rate": 1.4432997228492586e-05, + "loss": 0.1704, + "step": 1862 + }, + { + "epoch": 2.092376807524919, + "grad_norm": 0.445147848398028, + "learning_rate": 1.4425964553930412e-05, + "loss": 0.1633, + "step": 1863 + }, + { + "epoch": 2.0934999298048576, + "grad_norm": 0.462368673003725, + "learning_rate": 1.4418929156023543e-05, + "loss": 0.164, + "step": 1864 + }, + { + "epoch": 2.094623052084796, + "grad_norm": 0.477089375485279, + "learning_rate": 1.4411891039100934e-05, + "loss": 0.1747, + "step": 1865 + }, + { + "epoch": 2.095746174364734, + "grad_norm": 0.4633909335409037, + "learning_rate": 1.4404850207493217e-05, + "loss": 0.1706, + "step": 1866 + }, + { + "epoch": 2.0968692966446723, + "grad_norm": 0.4913608818406286, + "learning_rate": 1.4397806665532693e-05, + "loss": 0.1802, + "step": 1867 + }, + { + "epoch": 2.09799241892461, + "grad_norm": 0.4843264536525825, + "learning_rate": 1.4390760417553338e-05, + "loss": 0.1752, + "step": 1868 + }, + { + "epoch": 2.0991155412045486, + "grad_norm": 0.4786559227982498, + "learning_rate": 1.4383711467890776e-05, + "loss": 0.1733, + "step": 1869 + }, + { + "epoch": 2.100238663484487, + "grad_norm": 0.47724859281601445, + "learning_rate": 1.4376659820882308e-05, + "loss": 0.1706, + "step": 1870 + }, + { + "epoch": 2.101361785764425, + "grad_norm": 0.4781288580611537, + "learning_rate": 1.4369605480866888e-05, + "loss": 0.1682, + "step": 1871 + }, + { + "epoch": 2.1024849080443633, + "grad_norm": 0.47404027812905025, + "learning_rate": 1.436254845218513e-05, + "loss": 0.1605, + "step": 1872 + }, + { + "epoch": 2.1036080303243017, + "grad_norm": 0.4693877600646867, + "learning_rate": 1.4355488739179304e-05, + "loss": 0.1713, + "step": 1873 + }, + { + "epoch": 2.1047311526042396, + "grad_norm": 0.4465220690930916, + "learning_rate": 1.4348426346193325e-05, + "loss": 0.1616, + "step": 1874 + }, + { + "epoch": 2.105854274884178, + "grad_norm": 0.4533942365528121, + "learning_rate": 1.4341361277572766e-05, + "loss": 0.1597, + "step": 1875 + }, + { + "epoch": 2.1069773971641164, + "grad_norm": 0.4752746801317046, + "learning_rate": 1.4334293537664836e-05, + "loss": 0.1898, + "step": 1876 + }, + { + "epoch": 2.1081005194440543, + "grad_norm": 0.44524813953865283, + "learning_rate": 1.4327223130818393e-05, + "loss": 0.1629, + "step": 1877 + }, + { + "epoch": 2.1092236417239927, + "grad_norm": 0.4740454300839204, + "learning_rate": 1.4320150061383941e-05, + "loss": 0.1712, + "step": 1878 + }, + { + "epoch": 2.110346764003931, + "grad_norm": 0.46263291757398317, + "learning_rate": 1.4313074333713614e-05, + "loss": 0.172, + "step": 1879 + }, + { + "epoch": 2.111469886283869, + "grad_norm": 0.4605970497488254, + "learning_rate": 1.4305995952161189e-05, + "loss": 0.1718, + "step": 1880 + }, + { + "epoch": 2.1125930085638074, + "grad_norm": 0.47336811725490663, + "learning_rate": 1.4298914921082068e-05, + "loss": 0.1689, + "step": 1881 + }, + { + "epoch": 2.1137161308437458, + "grad_norm": 0.4671378312674368, + "learning_rate": 1.429183124483329e-05, + "loss": 0.1637, + "step": 1882 + }, + { + "epoch": 2.1148392531236837, + "grad_norm": 0.5100884705433952, + "learning_rate": 1.4284744927773515e-05, + "loss": 0.1888, + "step": 1883 + }, + { + "epoch": 2.115962375403622, + "grad_norm": 0.48122328157905797, + "learning_rate": 1.4277655974263035e-05, + "loss": 0.1717, + "step": 1884 + }, + { + "epoch": 2.1170854976835605, + "grad_norm": 0.46648639584107954, + "learning_rate": 1.4270564388663761e-05, + "loss": 0.1791, + "step": 1885 + }, + { + "epoch": 2.1182086199634984, + "grad_norm": 0.4897115936255588, + "learning_rate": 1.4263470175339223e-05, + "loss": 0.1784, + "step": 1886 + }, + { + "epoch": 2.119331742243437, + "grad_norm": 0.4695573343126349, + "learning_rate": 1.425637333865457e-05, + "loss": 0.179, + "step": 1887 + }, + { + "epoch": 2.120454864523375, + "grad_norm": 0.4472253157982482, + "learning_rate": 1.424927388297656e-05, + "loss": 0.1593, + "step": 1888 + }, + { + "epoch": 2.121577986803313, + "grad_norm": 0.4699369871519553, + "learning_rate": 1.4242171812673569e-05, + "loss": 0.1685, + "step": 1889 + }, + { + "epoch": 2.1227011090832515, + "grad_norm": 0.4762283130843815, + "learning_rate": 1.4235067132115581e-05, + "loss": 0.1714, + "step": 1890 + }, + { + "epoch": 2.12382423136319, + "grad_norm": 0.4835690093328314, + "learning_rate": 1.4227959845674182e-05, + "loss": 0.1901, + "step": 1891 + }, + { + "epoch": 2.124947353643128, + "grad_norm": 0.4787055121638117, + "learning_rate": 1.4220849957722562e-05, + "loss": 0.1807, + "step": 1892 + }, + { + "epoch": 2.126070475923066, + "grad_norm": 0.4737650279943973, + "learning_rate": 1.4213737472635513e-05, + "loss": 0.1776, + "step": 1893 + }, + { + "epoch": 2.1271935982030046, + "grad_norm": 0.4686012980137878, + "learning_rate": 1.4206622394789432e-05, + "loss": 0.1816, + "step": 1894 + }, + { + "epoch": 2.1283167204829425, + "grad_norm": 0.43340996309070035, + "learning_rate": 1.4199504728562294e-05, + "loss": 0.1585, + "step": 1895 + }, + { + "epoch": 2.129439842762881, + "grad_norm": 0.45042187104895964, + "learning_rate": 1.4192384478333686e-05, + "loss": 0.1626, + "step": 1896 + }, + { + "epoch": 2.130562965042819, + "grad_norm": 0.46788318772366466, + "learning_rate": 1.4185261648484772e-05, + "loss": 0.1641, + "step": 1897 + }, + { + "epoch": 2.131686087322757, + "grad_norm": 0.45501111518763515, + "learning_rate": 1.4178136243398308e-05, + "loss": 0.1704, + "step": 1898 + }, + { + "epoch": 2.1328092096026956, + "grad_norm": 0.458503838473636, + "learning_rate": 1.4171008267458636e-05, + "loss": 0.1649, + "step": 1899 + }, + { + "epoch": 2.1339323318826335, + "grad_norm": 0.4467967887329821, + "learning_rate": 1.4163877725051677e-05, + "loss": 0.1692, + "step": 1900 + }, + { + "epoch": 2.135055454162572, + "grad_norm": 0.46421473127298934, + "learning_rate": 1.4156744620564933e-05, + "loss": 0.1764, + "step": 1901 + }, + { + "epoch": 2.1361785764425103, + "grad_norm": 0.4604460398633165, + "learning_rate": 1.4149608958387484e-05, + "loss": 0.1638, + "step": 1902 + }, + { + "epoch": 2.1373016987224482, + "grad_norm": 0.4634460358657593, + "learning_rate": 1.4142470742909976e-05, + "loss": 0.1757, + "step": 1903 + }, + { + "epoch": 2.1384248210023866, + "grad_norm": 0.470255942011479, + "learning_rate": 1.4135329978524634e-05, + "loss": 0.1709, + "step": 1904 + }, + { + "epoch": 2.139547943282325, + "grad_norm": 0.4547022434396764, + "learning_rate": 1.4128186669625247e-05, + "loss": 0.1753, + "step": 1905 + }, + { + "epoch": 2.140671065562263, + "grad_norm": 0.4739510061930601, + "learning_rate": 1.4121040820607175e-05, + "loss": 0.1746, + "step": 1906 + }, + { + "epoch": 2.1417941878422013, + "grad_norm": 0.4708878833969285, + "learning_rate": 1.4113892435867337e-05, + "loss": 0.1739, + "step": 1907 + }, + { + "epoch": 2.1429173101221397, + "grad_norm": 0.44335049166860324, + "learning_rate": 1.410674151980421e-05, + "loss": 0.1633, + "step": 1908 + }, + { + "epoch": 2.1440404324020776, + "grad_norm": 0.45141039205299494, + "learning_rate": 1.4099588076817837e-05, + "loss": 0.1638, + "step": 1909 + }, + { + "epoch": 2.145163554682016, + "grad_norm": 0.4443166702543141, + "learning_rate": 1.4092432111309804e-05, + "loss": 0.1636, + "step": 1910 + }, + { + "epoch": 2.1462866769619544, + "grad_norm": 0.44643125071661977, + "learning_rate": 1.4085273627683257e-05, + "loss": 0.1651, + "step": 1911 + }, + { + "epoch": 2.1474097992418923, + "grad_norm": 0.48330445218374524, + "learning_rate": 1.4078112630342891e-05, + "loss": 0.1741, + "step": 1912 + }, + { + "epoch": 2.1485329215218307, + "grad_norm": 0.48560309798250084, + "learning_rate": 1.4070949123694945e-05, + "loss": 0.1882, + "step": 1913 + }, + { + "epoch": 2.149656043801769, + "grad_norm": 0.46382488718871623, + "learning_rate": 1.4063783112147207e-05, + "loss": 0.1794, + "step": 1914 + }, + { + "epoch": 2.150779166081707, + "grad_norm": 0.46801002995391355, + "learning_rate": 1.4056614600108998e-05, + "loss": 0.1691, + "step": 1915 + }, + { + "epoch": 2.1519022883616454, + "grad_norm": 0.4753045332485001, + "learning_rate": 1.4049443591991185e-05, + "loss": 0.1746, + "step": 1916 + }, + { + "epoch": 2.1530254106415834, + "grad_norm": 0.4526989009991009, + "learning_rate": 1.404227009220617e-05, + "loss": 0.173, + "step": 1917 + }, + { + "epoch": 2.1541485329215218, + "grad_norm": 0.46338849838297363, + "learning_rate": 1.403509410516788e-05, + "loss": 0.1613, + "step": 1918 + }, + { + "epoch": 2.15527165520146, + "grad_norm": 0.45937563487016014, + "learning_rate": 1.4027915635291786e-05, + "loss": 0.1774, + "step": 1919 + }, + { + "epoch": 2.156394777481398, + "grad_norm": 0.4603844415907502, + "learning_rate": 1.4020734686994875e-05, + "loss": 0.1709, + "step": 1920 + }, + { + "epoch": 2.1575178997613365, + "grad_norm": 0.47586340827626805, + "learning_rate": 1.4013551264695663e-05, + "loss": 0.1739, + "step": 1921 + }, + { + "epoch": 2.158641022041275, + "grad_norm": 0.4698577828683483, + "learning_rate": 1.4006365372814192e-05, + "loss": 0.1729, + "step": 1922 + }, + { + "epoch": 2.1597641443212128, + "grad_norm": 0.44550557016391734, + "learning_rate": 1.3999177015772021e-05, + "loss": 0.1602, + "step": 1923 + }, + { + "epoch": 2.160887266601151, + "grad_norm": 0.47621681660889276, + "learning_rate": 1.3991986197992223e-05, + "loss": 0.1775, + "step": 1924 + }, + { + "epoch": 2.1620103888810895, + "grad_norm": 0.45703981645014513, + "learning_rate": 1.3984792923899387e-05, + "loss": 0.1654, + "step": 1925 + }, + { + "epoch": 2.1631335111610275, + "grad_norm": 0.5176298847902593, + "learning_rate": 1.3977597197919614e-05, + "loss": 0.1895, + "step": 1926 + }, + { + "epoch": 2.164256633440966, + "grad_norm": 0.45890734681710893, + "learning_rate": 1.3970399024480512e-05, + "loss": 0.1733, + "step": 1927 + }, + { + "epoch": 2.1653797557209042, + "grad_norm": 0.4531704744476037, + "learning_rate": 1.39631984080112e-05, + "loss": 0.1692, + "step": 1928 + }, + { + "epoch": 2.166502878000842, + "grad_norm": 0.4631173822015671, + "learning_rate": 1.3955995352942296e-05, + "loss": 0.1727, + "step": 1929 + }, + { + "epoch": 2.1676260002807806, + "grad_norm": 0.4510321349529347, + "learning_rate": 1.3948789863705914e-05, + "loss": 0.1671, + "step": 1930 + }, + { + "epoch": 2.168749122560719, + "grad_norm": 0.449699047258487, + "learning_rate": 1.3941581944735675e-05, + "loss": 0.173, + "step": 1931 + }, + { + "epoch": 2.169872244840657, + "grad_norm": 0.4602286914189392, + "learning_rate": 1.3934371600466692e-05, + "loss": 0.1777, + "step": 1932 + }, + { + "epoch": 2.1709953671205953, + "grad_norm": 0.45736864355992796, + "learning_rate": 1.3927158835335567e-05, + "loss": 0.1796, + "step": 1933 + }, + { + "epoch": 2.1721184894005336, + "grad_norm": 0.4811354606513004, + "learning_rate": 1.3919943653780395e-05, + "loss": 0.1726, + "step": 1934 + }, + { + "epoch": 2.1732416116804716, + "grad_norm": 0.4670363828225833, + "learning_rate": 1.3912726060240754e-05, + "loss": 0.179, + "step": 1935 + }, + { + "epoch": 2.17436473396041, + "grad_norm": 0.48423978492534575, + "learning_rate": 1.3905506059157712e-05, + "loss": 0.1633, + "step": 1936 + }, + { + "epoch": 2.1754878562403483, + "grad_norm": 0.48037570474597213, + "learning_rate": 1.3898283654973812e-05, + "loss": 0.1854, + "step": 1937 + }, + { + "epoch": 2.1766109785202863, + "grad_norm": 0.458965202094496, + "learning_rate": 1.3891058852133083e-05, + "loss": 0.162, + "step": 1938 + }, + { + "epoch": 2.1777341008002247, + "grad_norm": 0.48209417167054247, + "learning_rate": 1.388383165508102e-05, + "loss": 0.1829, + "step": 1939 + }, + { + "epoch": 2.178857223080163, + "grad_norm": 0.4561793614583002, + "learning_rate": 1.38766020682646e-05, + "loss": 0.1699, + "step": 1940 + }, + { + "epoch": 2.179980345360101, + "grad_norm": 0.47026614355709756, + "learning_rate": 1.3869370096132269e-05, + "loss": 0.1817, + "step": 1941 + }, + { + "epoch": 2.1811034676400394, + "grad_norm": 0.4724639589057093, + "learning_rate": 1.3862135743133937e-05, + "loss": 0.1668, + "step": 1942 + }, + { + "epoch": 2.1822265899199778, + "grad_norm": 0.4717566607972279, + "learning_rate": 1.3854899013720982e-05, + "loss": 0.1874, + "step": 1943 + }, + { + "epoch": 2.1833497121999157, + "grad_norm": 0.47264122089233784, + "learning_rate": 1.384765991234624e-05, + "loss": 0.1785, + "step": 1944 + }, + { + "epoch": 2.184472834479854, + "grad_norm": 0.4458357059113397, + "learning_rate": 1.3840418443464015e-05, + "loss": 0.1662, + "step": 1945 + }, + { + "epoch": 2.1855959567597925, + "grad_norm": 0.4670183468434894, + "learning_rate": 1.383317461153006e-05, + "loss": 0.1689, + "step": 1946 + }, + { + "epoch": 2.1867190790397304, + "grad_norm": 0.4802944051246216, + "learning_rate": 1.3825928421001583e-05, + "loss": 0.1751, + "step": 1947 + }, + { + "epoch": 2.1878422013196688, + "grad_norm": 0.48140552348423143, + "learning_rate": 1.381867987633725e-05, + "loss": 0.1789, + "step": 1948 + }, + { + "epoch": 2.1889653235996067, + "grad_norm": 0.47327163615370027, + "learning_rate": 1.3811428981997159e-05, + "loss": 0.1742, + "step": 1949 + }, + { + "epoch": 2.190088445879545, + "grad_norm": 0.4580890578922541, + "learning_rate": 1.3804175742442878e-05, + "loss": 0.1655, + "step": 1950 + }, + { + "epoch": 2.1912115681594835, + "grad_norm": 0.47350507091879795, + "learning_rate": 1.3796920162137396e-05, + "loss": 0.1705, + "step": 1951 + }, + { + "epoch": 2.1923346904394214, + "grad_norm": 0.46014818205305075, + "learning_rate": 1.3789662245545158e-05, + "loss": 0.1685, + "step": 1952 + }, + { + "epoch": 2.19345781271936, + "grad_norm": 0.47613768985331506, + "learning_rate": 1.3782401997132037e-05, + "loss": 0.1769, + "step": 1953 + }, + { + "epoch": 2.194580934999298, + "grad_norm": 0.46754997862422304, + "learning_rate": 1.3775139421365342e-05, + "loss": 0.1766, + "step": 1954 + }, + { + "epoch": 2.195704057279236, + "grad_norm": 0.4517613096656541, + "learning_rate": 1.376787452271382e-05, + "loss": 0.1689, + "step": 1955 + }, + { + "epoch": 2.1968271795591745, + "grad_norm": 0.4808075598405213, + "learning_rate": 1.3760607305647637e-05, + "loss": 0.1859, + "step": 1956 + }, + { + "epoch": 2.197950301839113, + "grad_norm": 0.4640306895626649, + "learning_rate": 1.3753337774638397e-05, + "loss": 0.1692, + "step": 1957 + }, + { + "epoch": 2.199073424119051, + "grad_norm": 0.48015518700390997, + "learning_rate": 1.3746065934159123e-05, + "loss": 0.1809, + "step": 1958 + }, + { + "epoch": 2.200196546398989, + "grad_norm": 0.45119869946429864, + "learning_rate": 1.3738791788684254e-05, + "loss": 0.1646, + "step": 1959 + }, + { + "epoch": 2.2013196686789276, + "grad_norm": 0.46414622545090645, + "learning_rate": 1.3731515342689654e-05, + "loss": 0.1773, + "step": 1960 + }, + { + "epoch": 2.2024427909588655, + "grad_norm": 0.4431677408444893, + "learning_rate": 1.3724236600652598e-05, + "loss": 0.1594, + "step": 1961 + }, + { + "epoch": 2.203565913238804, + "grad_norm": 0.4780229496799192, + "learning_rate": 1.371695556705178e-05, + "loss": 0.1644, + "step": 1962 + }, + { + "epoch": 2.2046890355187423, + "grad_norm": 0.49218595733042086, + "learning_rate": 1.3709672246367299e-05, + "loss": 0.1762, + "step": 1963 + }, + { + "epoch": 2.2058121577986802, + "grad_norm": 0.47496532240919137, + "learning_rate": 1.370238664308066e-05, + "loss": 0.168, + "step": 1964 + }, + { + "epoch": 2.2069352800786186, + "grad_norm": 0.48178589082299556, + "learning_rate": 1.3695098761674779e-05, + "loss": 0.1811, + "step": 1965 + }, + { + "epoch": 2.208058402358557, + "grad_norm": 0.46520249661248453, + "learning_rate": 1.3687808606633965e-05, + "loss": 0.1762, + "step": 1966 + }, + { + "epoch": 2.209181524638495, + "grad_norm": 0.470425727912398, + "learning_rate": 1.3680516182443935e-05, + "loss": 0.1723, + "step": 1967 + }, + { + "epoch": 2.2103046469184333, + "grad_norm": 0.4840533890242917, + "learning_rate": 1.3673221493591795e-05, + "loss": 0.1861, + "step": 1968 + }, + { + "epoch": 2.2114277691983713, + "grad_norm": 0.4675135965762372, + "learning_rate": 1.3665924544566047e-05, + "loss": 0.1717, + "step": 1969 + }, + { + "epoch": 2.2125508914783096, + "grad_norm": 0.4556393300346666, + "learning_rate": 1.3658625339856586e-05, + "loss": 0.1725, + "step": 1970 + }, + { + "epoch": 2.213674013758248, + "grad_norm": 0.4703624670439313, + "learning_rate": 1.365132388395469e-05, + "loss": 0.1659, + "step": 1971 + }, + { + "epoch": 2.214797136038186, + "grad_norm": 0.4657783144698897, + "learning_rate": 1.364402018135303e-05, + "loss": 0.1808, + "step": 1972 + }, + { + "epoch": 2.2159202583181243, + "grad_norm": 0.462982700227875, + "learning_rate": 1.3636714236545649e-05, + "loss": 0.1728, + "step": 1973 + }, + { + "epoch": 2.2170433805980627, + "grad_norm": 0.46495072208063093, + "learning_rate": 1.362940605402798e-05, + "loss": 0.1684, + "step": 1974 + }, + { + "epoch": 2.2181665028780007, + "grad_norm": 0.4676792084934364, + "learning_rate": 1.3622095638296827e-05, + "loss": 0.1675, + "step": 1975 + }, + { + "epoch": 2.219289625157939, + "grad_norm": 0.498799056127904, + "learning_rate": 1.3614782993850367e-05, + "loss": 0.1753, + "step": 1976 + }, + { + "epoch": 2.2204127474378774, + "grad_norm": 0.47991466108630665, + "learning_rate": 1.3607468125188153e-05, + "loss": 0.1722, + "step": 1977 + }, + { + "epoch": 2.2215358697178154, + "grad_norm": 0.48676775722149834, + "learning_rate": 1.3600151036811101e-05, + "loss": 0.1686, + "step": 1978 + }, + { + "epoch": 2.2226589919977537, + "grad_norm": 0.46122036954736967, + "learning_rate": 1.3592831733221499e-05, + "loss": 0.1666, + "step": 1979 + }, + { + "epoch": 2.223782114277692, + "grad_norm": 0.4622532176427233, + "learning_rate": 1.3585510218922997e-05, + "loss": 0.1744, + "step": 1980 + }, + { + "epoch": 2.22490523655763, + "grad_norm": 0.4404281659461841, + "learning_rate": 1.3578186498420598e-05, + "loss": 0.162, + "step": 1981 + }, + { + "epoch": 2.2260283588375684, + "grad_norm": 0.4647477463673942, + "learning_rate": 1.357086057622067e-05, + "loss": 0.1687, + "step": 1982 + }, + { + "epoch": 2.227151481117507, + "grad_norm": 0.45022268319323117, + "learning_rate": 1.3563532456830934e-05, + "loss": 0.1633, + "step": 1983 + }, + { + "epoch": 2.2282746033974448, + "grad_norm": 0.4900592975701646, + "learning_rate": 1.3556202144760461e-05, + "loss": 0.1939, + "step": 1984 + }, + { + "epoch": 2.229397725677383, + "grad_norm": 0.4450602192202563, + "learning_rate": 1.3548869644519677e-05, + "loss": 0.1622, + "step": 1985 + }, + { + "epoch": 2.2305208479573215, + "grad_norm": 0.45292148908157803, + "learning_rate": 1.3541534960620349e-05, + "loss": 0.1772, + "step": 1986 + }, + { + "epoch": 2.2316439702372595, + "grad_norm": 0.4495070935381503, + "learning_rate": 1.3534198097575581e-05, + "loss": 0.1793, + "step": 1987 + }, + { + "epoch": 2.232767092517198, + "grad_norm": 0.46669693575362725, + "learning_rate": 1.3526859059899834e-05, + "loss": 0.1724, + "step": 1988 + }, + { + "epoch": 2.2338902147971362, + "grad_norm": 0.4485177100335402, + "learning_rate": 1.3519517852108899e-05, + "loss": 0.1679, + "step": 1989 + }, + { + "epoch": 2.235013337077074, + "grad_norm": 0.4765654300275507, + "learning_rate": 1.3512174478719896e-05, + "loss": 0.1741, + "step": 1990 + }, + { + "epoch": 2.2361364593570126, + "grad_norm": 0.46917420600760523, + "learning_rate": 1.3504828944251287e-05, + "loss": 0.1852, + "step": 1991 + }, + { + "epoch": 2.237259581636951, + "grad_norm": 0.4599691780330107, + "learning_rate": 1.349748125322286e-05, + "loss": 0.1678, + "step": 1992 + }, + { + "epoch": 2.238382703916889, + "grad_norm": 0.47810945057013843, + "learning_rate": 1.349013141015573e-05, + "loss": 0.1642, + "step": 1993 + }, + { + "epoch": 2.2395058261968273, + "grad_norm": 0.462210138404077, + "learning_rate": 1.3482779419572336e-05, + "loss": 0.1722, + "step": 1994 + }, + { + "epoch": 2.2406289484767656, + "grad_norm": 0.4877043211428916, + "learning_rate": 1.3475425285996438e-05, + "loss": 0.1763, + "step": 1995 + }, + { + "epoch": 2.2417520707567036, + "grad_norm": 0.4713028550897873, + "learning_rate": 1.3468069013953115e-05, + "loss": 0.1672, + "step": 1996 + }, + { + "epoch": 2.242875193036642, + "grad_norm": 0.4543701026881802, + "learning_rate": 1.3460710607968767e-05, + "loss": 0.1761, + "step": 1997 + }, + { + "epoch": 2.24399831531658, + "grad_norm": 0.46559021955454394, + "learning_rate": 1.3453350072571097e-05, + "loss": 0.1729, + "step": 1998 + }, + { + "epoch": 2.2451214375965183, + "grad_norm": 0.45021396422109927, + "learning_rate": 1.3445987412289126e-05, + "loss": 0.1757, + "step": 1999 + }, + { + "epoch": 2.2462445598764567, + "grad_norm": 0.4335318154822215, + "learning_rate": 1.3438622631653178e-05, + "loss": 0.1572, + "step": 2000 + }, + { + "epoch": 2.2473676821563946, + "grad_norm": 0.49286805287821256, + "learning_rate": 1.343125573519488e-05, + "loss": 0.1827, + "step": 2001 + }, + { + "epoch": 2.248490804436333, + "grad_norm": 0.48479831173676324, + "learning_rate": 1.3423886727447176e-05, + "loss": 0.1793, + "step": 2002 + }, + { + "epoch": 2.2496139267162714, + "grad_norm": 0.46128111314066944, + "learning_rate": 1.3416515612944288e-05, + "loss": 0.1713, + "step": 2003 + }, + { + "epoch": 2.2507370489962093, + "grad_norm": 0.47588445133183477, + "learning_rate": 1.3409142396221747e-05, + "loss": 0.1838, + "step": 2004 + }, + { + "epoch": 2.2518601712761477, + "grad_norm": 0.4630902693099745, + "learning_rate": 1.340176708181637e-05, + "loss": 0.1734, + "step": 2005 + }, + { + "epoch": 2.252983293556086, + "grad_norm": 0.4733521097328874, + "learning_rate": 1.3394389674266275e-05, + "loss": 0.1702, + "step": 2006 + }, + { + "epoch": 2.254106415836024, + "grad_norm": 0.4638411258433599, + "learning_rate": 1.3387010178110859e-05, + "loss": 0.1825, + "step": 2007 + }, + { + "epoch": 2.2552295381159624, + "grad_norm": 0.4636059997009433, + "learning_rate": 1.3379628597890808e-05, + "loss": 0.1723, + "step": 2008 + }, + { + "epoch": 2.2563526603959008, + "grad_norm": 0.4634064303623194, + "learning_rate": 1.337224493814809e-05, + "loss": 0.1668, + "step": 2009 + }, + { + "epoch": 2.2574757826758387, + "grad_norm": 0.4582783319228768, + "learning_rate": 1.3364859203425953e-05, + "loss": 0.1731, + "step": 2010 + }, + { + "epoch": 2.258598904955777, + "grad_norm": 0.5057417740276605, + "learning_rate": 1.335747139826892e-05, + "loss": 0.1802, + "step": 2011 + }, + { + "epoch": 2.2597220272357155, + "grad_norm": 0.4966593306531945, + "learning_rate": 1.3350081527222787e-05, + "loss": 0.1856, + "step": 2012 + }, + { + "epoch": 2.2608451495156534, + "grad_norm": 0.47721378699943995, + "learning_rate": 1.3342689594834623e-05, + "loss": 0.1778, + "step": 2013 + }, + { + "epoch": 2.261968271795592, + "grad_norm": 0.499924567330496, + "learning_rate": 1.333529560565277e-05, + "loss": 0.1782, + "step": 2014 + }, + { + "epoch": 2.2630913940755297, + "grad_norm": 0.4761749575886551, + "learning_rate": 1.3327899564226826e-05, + "loss": 0.1776, + "step": 2015 + }, + { + "epoch": 2.264214516355468, + "grad_norm": 0.47595190829952716, + "learning_rate": 1.332050147510766e-05, + "loss": 0.1799, + "step": 2016 + }, + { + "epoch": 2.2653376386354065, + "grad_norm": 0.4819227222937039, + "learning_rate": 1.3313101342847393e-05, + "loss": 0.1777, + "step": 2017 + }, + { + "epoch": 2.2664607609153444, + "grad_norm": 0.47028267836611976, + "learning_rate": 1.3305699171999409e-05, + "loss": 0.1783, + "step": 2018 + }, + { + "epoch": 2.267583883195283, + "grad_norm": 0.4694525543038002, + "learning_rate": 1.3298294967118351e-05, + "loss": 0.1744, + "step": 2019 + }, + { + "epoch": 2.268707005475221, + "grad_norm": 0.5046329050414854, + "learning_rate": 1.32908887327601e-05, + "loss": 0.1857, + "step": 2020 + }, + { + "epoch": 2.269830127755159, + "grad_norm": 0.46102708624099675, + "learning_rate": 1.3283480473481803e-05, + "loss": 0.1789, + "step": 2021 + }, + { + "epoch": 2.2709532500350975, + "grad_norm": 0.47172572667433565, + "learning_rate": 1.3276070193841833e-05, + "loss": 0.1731, + "step": 2022 + }, + { + "epoch": 2.272076372315036, + "grad_norm": 0.49124489515157277, + "learning_rate": 1.3268657898399822e-05, + "loss": 0.1819, + "step": 2023 + }, + { + "epoch": 2.273199494594974, + "grad_norm": 0.4720309863370387, + "learning_rate": 1.3261243591716634e-05, + "loss": 0.1686, + "step": 2024 + }, + { + "epoch": 2.274322616874912, + "grad_norm": 0.47523894040627884, + "learning_rate": 1.3253827278354378e-05, + "loss": 0.1744, + "step": 2025 + }, + { + "epoch": 2.2754457391548506, + "grad_norm": 0.4923383132196056, + "learning_rate": 1.3246408962876391e-05, + "loss": 0.1847, + "step": 2026 + }, + { + "epoch": 2.2765688614347885, + "grad_norm": 0.4588643937702198, + "learning_rate": 1.3238988649847243e-05, + "loss": 0.1699, + "step": 2027 + }, + { + "epoch": 2.277691983714727, + "grad_norm": 0.4754365804448629, + "learning_rate": 1.3231566343832736e-05, + "loss": 0.1763, + "step": 2028 + }, + { + "epoch": 2.2788151059946653, + "grad_norm": 0.4708694394595169, + "learning_rate": 1.3224142049399896e-05, + "loss": 0.1806, + "step": 2029 + }, + { + "epoch": 2.2799382282746032, + "grad_norm": 0.4532504916608009, + "learning_rate": 1.321671577111697e-05, + "loss": 0.1726, + "step": 2030 + }, + { + "epoch": 2.2810613505545416, + "grad_norm": 0.45695642633903705, + "learning_rate": 1.3209287513553437e-05, + "loss": 0.1709, + "step": 2031 + }, + { + "epoch": 2.28218447283448, + "grad_norm": 0.4436999178216967, + "learning_rate": 1.3201857281279978e-05, + "loss": 0.1662, + "step": 2032 + }, + { + "epoch": 2.283307595114418, + "grad_norm": 0.4361709759275738, + "learning_rate": 1.3194425078868498e-05, + "loss": 0.171, + "step": 2033 + }, + { + "epoch": 2.2844307173943563, + "grad_norm": 0.4685768299567099, + "learning_rate": 1.3186990910892115e-05, + "loss": 0.1845, + "step": 2034 + }, + { + "epoch": 2.2855538396742947, + "grad_norm": 0.4716954023929674, + "learning_rate": 1.317955478192515e-05, + "loss": 0.171, + "step": 2035 + }, + { + "epoch": 2.2866769619542326, + "grad_norm": 0.4508227597245992, + "learning_rate": 1.3172116696543142e-05, + "loss": 0.1686, + "step": 2036 + }, + { + "epoch": 2.287800084234171, + "grad_norm": 0.4768566069887853, + "learning_rate": 1.3164676659322823e-05, + "loss": 0.1771, + "step": 2037 + }, + { + "epoch": 2.2889232065141094, + "grad_norm": 0.4935990854907148, + "learning_rate": 1.315723467484213e-05, + "loss": 0.1806, + "step": 2038 + }, + { + "epoch": 2.2900463287940473, + "grad_norm": 0.46467316460978925, + "learning_rate": 1.3149790747680196e-05, + "loss": 0.1721, + "step": 2039 + }, + { + "epoch": 2.2911694510739857, + "grad_norm": 0.4496552803361268, + "learning_rate": 1.3142344882417355e-05, + "loss": 0.1681, + "step": 2040 + }, + { + "epoch": 2.292292573353924, + "grad_norm": 0.46285417434016424, + "learning_rate": 1.3134897083635126e-05, + "loss": 0.177, + "step": 2041 + }, + { + "epoch": 2.293415695633862, + "grad_norm": 0.45431363080930554, + "learning_rate": 1.3127447355916223e-05, + "loss": 0.182, + "step": 2042 + }, + { + "epoch": 2.2945388179138004, + "grad_norm": 0.4507422204052235, + "learning_rate": 1.3119995703844551e-05, + "loss": 0.174, + "step": 2043 + }, + { + "epoch": 2.295661940193739, + "grad_norm": 0.45097987664427835, + "learning_rate": 1.3112542132005182e-05, + "loss": 0.1726, + "step": 2044 + }, + { + "epoch": 2.2967850624736768, + "grad_norm": 0.4644989156686283, + "learning_rate": 1.310508664498439e-05, + "loss": 0.1855, + "step": 2045 + }, + { + "epoch": 2.297908184753615, + "grad_norm": 0.4417912412022252, + "learning_rate": 1.3097629247369613e-05, + "loss": 0.1663, + "step": 2046 + }, + { + "epoch": 2.2990313070335535, + "grad_norm": 0.4635140434651788, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.1789, + "step": 2047 + }, + { + "epoch": 2.3001544293134915, + "grad_norm": 0.4734702085500251, + "learning_rate": 1.3082708738713765e-05, + "loss": 0.1699, + "step": 2048 + }, + { + "epoch": 2.30127755159343, + "grad_norm": 0.47850190337364856, + "learning_rate": 1.3075245636853444e-05, + "loss": 0.1715, + "step": 2049 + }, + { + "epoch": 2.3024006738733678, + "grad_norm": 0.466268874412243, + "learning_rate": 1.306778064276064e-05, + "loss": 0.1735, + "step": 2050 + }, + { + "epoch": 2.303523796153306, + "grad_norm": 0.4479379317186277, + "learning_rate": 1.3060313761028647e-05, + "loss": 0.1727, + "step": 2051 + }, + { + "epoch": 2.3046469184332445, + "grad_norm": 0.44726243257194265, + "learning_rate": 1.305284499625192e-05, + "loss": 0.1632, + "step": 2052 + }, + { + "epoch": 2.3057700407131825, + "grad_norm": 0.49557743765194856, + "learning_rate": 1.3045374353026073e-05, + "loss": 0.1819, + "step": 2053 + }, + { + "epoch": 2.306893162993121, + "grad_norm": 0.46796170967007167, + "learning_rate": 1.3037901835947873e-05, + "loss": 0.1758, + "step": 2054 + }, + { + "epoch": 2.3080162852730592, + "grad_norm": 0.4805637353304701, + "learning_rate": 1.3030427449615241e-05, + "loss": 0.1809, + "step": 2055 + }, + { + "epoch": 2.309139407552997, + "grad_norm": 0.4846347936669786, + "learning_rate": 1.3022951198627254e-05, + "loss": 0.1808, + "step": 2056 + }, + { + "epoch": 2.3102625298329356, + "grad_norm": 0.4567235403598188, + "learning_rate": 1.3015473087584127e-05, + "loss": 0.1717, + "step": 2057 + }, + { + "epoch": 2.311385652112874, + "grad_norm": 0.46361930650452676, + "learning_rate": 1.3007993121087226e-05, + "loss": 0.1724, + "step": 2058 + }, + { + "epoch": 2.312508774392812, + "grad_norm": 0.4757333926502388, + "learning_rate": 1.3000511303739054e-05, + "loss": 0.1764, + "step": 2059 + }, + { + "epoch": 2.3136318966727503, + "grad_norm": 0.4447398100189297, + "learning_rate": 1.299302764014326e-05, + "loss": 0.1643, + "step": 2060 + }, + { + "epoch": 2.3147550189526886, + "grad_norm": 0.45522814478101575, + "learning_rate": 1.2985542134904621e-05, + "loss": 0.1725, + "step": 2061 + }, + { + "epoch": 2.3158781412326266, + "grad_norm": 0.47650848146899405, + "learning_rate": 1.2978054792629054e-05, + "loss": 0.1765, + "step": 2062 + }, + { + "epoch": 2.317001263512565, + "grad_norm": 0.4501821499764709, + "learning_rate": 1.2970565617923598e-05, + "loss": 0.1751, + "step": 2063 + }, + { + "epoch": 2.3181243857925034, + "grad_norm": 0.45950502464250575, + "learning_rate": 1.2963074615396428e-05, + "loss": 0.1801, + "step": 2064 + }, + { + "epoch": 2.3192475080724413, + "grad_norm": 0.4613132581972528, + "learning_rate": 1.2955581789656844e-05, + "loss": 0.1822, + "step": 2065 + }, + { + "epoch": 2.3203706303523797, + "grad_norm": 0.4619765649138744, + "learning_rate": 1.2948087145315256e-05, + "loss": 0.1801, + "step": 2066 + }, + { + "epoch": 2.3214937526323176, + "grad_norm": 0.45361708539848494, + "learning_rate": 1.2940590686983208e-05, + "loss": 0.1696, + "step": 2067 + }, + { + "epoch": 2.322616874912256, + "grad_norm": 0.47042240961247933, + "learning_rate": 1.2933092419273348e-05, + "loss": 0.1822, + "step": 2068 + }, + { + "epoch": 2.3237399971921944, + "grad_norm": 0.44936652373893554, + "learning_rate": 1.2925592346799444e-05, + "loss": 0.1715, + "step": 2069 + }, + { + "epoch": 2.3248631194721323, + "grad_norm": 0.45944253764029824, + "learning_rate": 1.2918090474176378e-05, + "loss": 0.1728, + "step": 2070 + }, + { + "epoch": 2.3259862417520707, + "grad_norm": 0.4670499870780179, + "learning_rate": 1.2910586806020128e-05, + "loss": 0.1778, + "step": 2071 + }, + { + "epoch": 2.327109364032009, + "grad_norm": 0.47065143463451475, + "learning_rate": 1.2903081346947788e-05, + "loss": 0.1781, + "step": 2072 + }, + { + "epoch": 2.328232486311947, + "grad_norm": 0.484564209356825, + "learning_rate": 1.2895574101577548e-05, + "loss": 0.1867, + "step": 2073 + }, + { + "epoch": 2.3293556085918854, + "grad_norm": 0.4461711655494198, + "learning_rate": 1.28880650745287e-05, + "loss": 0.1749, + "step": 2074 + }, + { + "epoch": 2.330478730871824, + "grad_norm": 0.4523170048395053, + "learning_rate": 1.288055427042163e-05, + "loss": 0.1712, + "step": 2075 + }, + { + "epoch": 2.3316018531517617, + "grad_norm": 0.45907136438347945, + "learning_rate": 1.2873041693877817e-05, + "loss": 0.169, + "step": 2076 + }, + { + "epoch": 2.3327249754317, + "grad_norm": 0.4596979480607732, + "learning_rate": 1.2865527349519836e-05, + "loss": 0.1757, + "step": 2077 + }, + { + "epoch": 2.3338480977116385, + "grad_norm": 0.46814268818136334, + "learning_rate": 1.285801124197134e-05, + "loss": 0.1747, + "step": 2078 + }, + { + "epoch": 2.3349712199915764, + "grad_norm": 0.45641847730925356, + "learning_rate": 1.2850493375857078e-05, + "loss": 0.1662, + "step": 2079 + }, + { + "epoch": 2.336094342271515, + "grad_norm": 0.4573287262646522, + "learning_rate": 1.2842973755802872e-05, + "loss": 0.1666, + "step": 2080 + }, + { + "epoch": 2.337217464551453, + "grad_norm": 0.5012334972800347, + "learning_rate": 1.2835452386435629e-05, + "loss": 0.1849, + "step": 2081 + }, + { + "epoch": 2.338340586831391, + "grad_norm": 0.4826799580775581, + "learning_rate": 1.282792927238333e-05, + "loss": 0.168, + "step": 2082 + }, + { + "epoch": 2.3394637091113295, + "grad_norm": 0.4765836277272539, + "learning_rate": 1.282040441827503e-05, + "loss": 0.1885, + "step": 2083 + }, + { + "epoch": 2.340586831391268, + "grad_norm": 0.45652229568485014, + "learning_rate": 1.2812877828740855e-05, + "loss": 0.1623, + "step": 2084 + }, + { + "epoch": 2.341709953671206, + "grad_norm": 0.512094793634767, + "learning_rate": 1.2805349508411996e-05, + "loss": 0.196, + "step": 2085 + }, + { + "epoch": 2.342833075951144, + "grad_norm": 0.4764092713379784, + "learning_rate": 1.2797819461920714e-05, + "loss": 0.1817, + "step": 2086 + }, + { + "epoch": 2.3439561982310826, + "grad_norm": 0.4704742691821975, + "learning_rate": 1.279028769390033e-05, + "loss": 0.1687, + "step": 2087 + }, + { + "epoch": 2.3450793205110205, + "grad_norm": 0.4951306244097187, + "learning_rate": 1.2782754208985217e-05, + "loss": 0.1728, + "step": 2088 + }, + { + "epoch": 2.346202442790959, + "grad_norm": 0.47760873675544463, + "learning_rate": 1.2775219011810822e-05, + "loss": 0.1913, + "step": 2089 + }, + { + "epoch": 2.3473255650708973, + "grad_norm": 0.4832646096522936, + "learning_rate": 1.2767682107013626e-05, + "loss": 0.1859, + "step": 2090 + }, + { + "epoch": 2.3484486873508352, + "grad_norm": 0.4426474698768579, + "learning_rate": 1.2760143499231173e-05, + "loss": 0.162, + "step": 2091 + }, + { + "epoch": 2.3495718096307736, + "grad_norm": 0.47174290073665726, + "learning_rate": 1.275260319310205e-05, + "loss": 0.182, + "step": 2092 + }, + { + "epoch": 2.350694931910712, + "grad_norm": 0.46639059398090704, + "learning_rate": 1.2745061193265896e-05, + "loss": 0.1681, + "step": 2093 + }, + { + "epoch": 2.35181805419065, + "grad_norm": 0.4790429406794705, + "learning_rate": 1.2737517504363378e-05, + "loss": 0.1779, + "step": 2094 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.47495388207027717, + "learning_rate": 1.2729972131036212e-05, + "loss": 0.1805, + "step": 2095 + }, + { + "epoch": 2.3540642987505267, + "grad_norm": 0.4730593538483904, + "learning_rate": 1.2722425077927157e-05, + "loss": 0.1753, + "step": 2096 + }, + { + "epoch": 2.3551874210304646, + "grad_norm": 0.44971031148328955, + "learning_rate": 1.271487634967999e-05, + "loss": 0.1576, + "step": 2097 + }, + { + "epoch": 2.356310543310403, + "grad_norm": 0.45993224590682336, + "learning_rate": 1.2707325950939529e-05, + "loss": 0.1705, + "step": 2098 + }, + { + "epoch": 2.357433665590341, + "grad_norm": 0.4502585923789524, + "learning_rate": 1.2699773886351618e-05, + "loss": 0.1716, + "step": 2099 + }, + { + "epoch": 2.3585567878702793, + "grad_norm": 0.4639139646897428, + "learning_rate": 1.2692220160563125e-05, + "loss": 0.1761, + "step": 2100 + }, + { + "epoch": 2.3596799101502177, + "grad_norm": 0.4619505035347143, + "learning_rate": 1.2684664778221943e-05, + "loss": 0.1817, + "step": 2101 + }, + { + "epoch": 2.3608030324301557, + "grad_norm": 0.4465823236801084, + "learning_rate": 1.2677107743976975e-05, + "loss": 0.1622, + "step": 2102 + }, + { + "epoch": 2.361926154710094, + "grad_norm": 0.4679911842787963, + "learning_rate": 1.2669549062478155e-05, + "loss": 0.1893, + "step": 2103 + }, + { + "epoch": 2.3630492769900324, + "grad_norm": 0.4801272473499729, + "learning_rate": 1.266198873837642e-05, + "loss": 0.1756, + "step": 2104 + }, + { + "epoch": 2.3641723992699704, + "grad_norm": 0.4649417211528937, + "learning_rate": 1.2654426776323719e-05, + "loss": 0.1742, + "step": 2105 + }, + { + "epoch": 2.3652955215499087, + "grad_norm": 0.4612836290072487, + "learning_rate": 1.2646863180973012e-05, + "loss": 0.1726, + "step": 2106 + }, + { + "epoch": 2.366418643829847, + "grad_norm": 0.46999430959875044, + "learning_rate": 1.2639297956978262e-05, + "loss": 0.1722, + "step": 2107 + }, + { + "epoch": 2.367541766109785, + "grad_norm": 0.482099500823595, + "learning_rate": 1.2631731108994436e-05, + "loss": 0.1782, + "step": 2108 + }, + { + "epoch": 2.3686648883897234, + "grad_norm": 0.4723654096317358, + "learning_rate": 1.2624162641677498e-05, + "loss": 0.1757, + "step": 2109 + }, + { + "epoch": 2.369788010669662, + "grad_norm": 0.47531394992362225, + "learning_rate": 1.2616592559684408e-05, + "loss": 0.1862, + "step": 2110 + }, + { + "epoch": 2.3709111329495998, + "grad_norm": 0.45674297876291126, + "learning_rate": 1.2609020867673123e-05, + "loss": 0.1663, + "step": 2111 + }, + { + "epoch": 2.372034255229538, + "grad_norm": 0.458215236610029, + "learning_rate": 1.2601447570302585e-05, + "loss": 0.181, + "step": 2112 + }, + { + "epoch": 2.3731573775094765, + "grad_norm": 0.43796983940777867, + "learning_rate": 1.259387267223273e-05, + "loss": 0.1655, + "step": 2113 + }, + { + "epoch": 2.3742804997894145, + "grad_norm": 0.4363126086769737, + "learning_rate": 1.2586296178124475e-05, + "loss": 0.1621, + "step": 2114 + }, + { + "epoch": 2.375403622069353, + "grad_norm": 0.44964027723907635, + "learning_rate": 1.2578718092639724e-05, + "loss": 0.1595, + "step": 2115 + }, + { + "epoch": 2.376526744349291, + "grad_norm": 0.4599641014987599, + "learning_rate": 1.2571138420441349e-05, + "loss": 0.1626, + "step": 2116 + }, + { + "epoch": 2.377649866629229, + "grad_norm": 0.4675544983326374, + "learning_rate": 1.2563557166193213e-05, + "loss": 0.1774, + "step": 2117 + }, + { + "epoch": 2.3787729889091676, + "grad_norm": 0.4717489195434664, + "learning_rate": 1.2555974334560142e-05, + "loss": 0.1802, + "step": 2118 + }, + { + "epoch": 2.3798961111891055, + "grad_norm": 0.4739492033223087, + "learning_rate": 1.2548389930207932e-05, + "loss": 0.176, + "step": 2119 + }, + { + "epoch": 2.381019233469044, + "grad_norm": 0.46312844512973367, + "learning_rate": 1.2540803957803356e-05, + "loss": 0.1677, + "step": 2120 + }, + { + "epoch": 2.3821423557489823, + "grad_norm": 0.49417837944594656, + "learning_rate": 1.2533216422014145e-05, + "loss": 0.184, + "step": 2121 + }, + { + "epoch": 2.38326547802892, + "grad_norm": 0.484024720454384, + "learning_rate": 1.2525627327508994e-05, + "loss": 0.1734, + "step": 2122 + }, + { + "epoch": 2.3843886003088586, + "grad_norm": 0.48942672284743194, + "learning_rate": 1.2518036678957554e-05, + "loss": 0.1844, + "step": 2123 + }, + { + "epoch": 2.385511722588797, + "grad_norm": 0.46895877443085426, + "learning_rate": 1.2510444481030434e-05, + "loss": 0.1728, + "step": 2124 + }, + { + "epoch": 2.386634844868735, + "grad_norm": 0.45324010534175113, + "learning_rate": 1.25028507383992e-05, + "loss": 0.1735, + "step": 2125 + }, + { + "epoch": 2.3877579671486733, + "grad_norm": 0.4500103538917005, + "learning_rate": 1.2495255455736366e-05, + "loss": 0.1663, + "step": 2126 + }, + { + "epoch": 2.3888810894286117, + "grad_norm": 0.46447167465312617, + "learning_rate": 1.2487658637715388e-05, + "loss": 0.1851, + "step": 2127 + }, + { + "epoch": 2.3900042117085496, + "grad_norm": 0.4689612937423477, + "learning_rate": 1.2480060289010677e-05, + "loss": 0.1812, + "step": 2128 + }, + { + "epoch": 2.391127333988488, + "grad_norm": 0.44830442220436945, + "learning_rate": 1.2472460414297576e-05, + "loss": 0.1693, + "step": 2129 + }, + { + "epoch": 2.3922504562684264, + "grad_norm": 0.47648002812678625, + "learning_rate": 1.2464859018252377e-05, + "loss": 0.1897, + "step": 2130 + }, + { + "epoch": 2.3933735785483643, + "grad_norm": 0.4776674804606051, + "learning_rate": 1.2457256105552297e-05, + "loss": 0.1719, + "step": 2131 + }, + { + "epoch": 2.3944967008283027, + "grad_norm": 0.45626152225385935, + "learning_rate": 1.2449651680875495e-05, + "loss": 0.174, + "step": 2132 + }, + { + "epoch": 2.395619823108241, + "grad_norm": 0.4641074642628347, + "learning_rate": 1.2442045748901057e-05, + "loss": 0.1765, + "step": 2133 + }, + { + "epoch": 2.396742945388179, + "grad_norm": 0.45963165730817895, + "learning_rate": 1.2434438314308997e-05, + "loss": 0.175, + "step": 2134 + }, + { + "epoch": 2.3978660676681174, + "grad_norm": 0.4559745351834007, + "learning_rate": 1.242682938178025e-05, + "loss": 0.1607, + "step": 2135 + }, + { + "epoch": 2.3989891899480558, + "grad_norm": 0.4592128892747727, + "learning_rate": 1.2419218955996677e-05, + "loss": 0.1752, + "step": 2136 + }, + { + "epoch": 2.4001123122279937, + "grad_norm": 0.47314594000118837, + "learning_rate": 1.2411607041641062e-05, + "loss": 0.1715, + "step": 2137 + }, + { + "epoch": 2.401235434507932, + "grad_norm": 0.46568965161641296, + "learning_rate": 1.2403993643397095e-05, + "loss": 0.1847, + "step": 2138 + }, + { + "epoch": 2.4023585567878705, + "grad_norm": 0.45802226050759454, + "learning_rate": 1.2396378765949382e-05, + "loss": 0.1732, + "step": 2139 + }, + { + "epoch": 2.4034816790678084, + "grad_norm": 0.4657203849841537, + "learning_rate": 1.2388762413983447e-05, + "loss": 0.1756, + "step": 2140 + }, + { + "epoch": 2.404604801347747, + "grad_norm": 0.44869629379918413, + "learning_rate": 1.238114459218571e-05, + "loss": 0.1697, + "step": 2141 + }, + { + "epoch": 2.405727923627685, + "grad_norm": 0.46164059934933155, + "learning_rate": 1.2373525305243499e-05, + "loss": 0.1774, + "step": 2142 + }, + { + "epoch": 2.406851045907623, + "grad_norm": 0.47884498767707456, + "learning_rate": 1.2365904557845054e-05, + "loss": 0.1859, + "step": 2143 + }, + { + "epoch": 2.4079741681875615, + "grad_norm": 0.4576461996017312, + "learning_rate": 1.2358282354679494e-05, + "loss": 0.1684, + "step": 2144 + }, + { + "epoch": 2.4090972904675, + "grad_norm": 0.4679384692538624, + "learning_rate": 1.2350658700436852e-05, + "loss": 0.1738, + "step": 2145 + }, + { + "epoch": 2.410220412747438, + "grad_norm": 0.46313097703304307, + "learning_rate": 1.2343033599808044e-05, + "loss": 0.1772, + "step": 2146 + }, + { + "epoch": 2.411343535027376, + "grad_norm": 0.4625911631962477, + "learning_rate": 1.2335407057484877e-05, + "loss": 0.1811, + "step": 2147 + }, + { + "epoch": 2.4124666573073146, + "grad_norm": 0.44513248630919333, + "learning_rate": 1.232777907816005e-05, + "loss": 0.1749, + "step": 2148 + }, + { + "epoch": 2.4135897795872525, + "grad_norm": 0.467659286026053, + "learning_rate": 1.2320149666527134e-05, + "loss": 0.182, + "step": 2149 + }, + { + "epoch": 2.414712901867191, + "grad_norm": 0.4596330741642717, + "learning_rate": 1.2312518827280603e-05, + "loss": 0.1725, + "step": 2150 + }, + { + "epoch": 2.415836024147129, + "grad_norm": 0.48688351873304575, + "learning_rate": 1.2304886565115786e-05, + "loss": 0.1859, + "step": 2151 + }, + { + "epoch": 2.416959146427067, + "grad_norm": 0.46822427826794677, + "learning_rate": 1.2297252884728904e-05, + "loss": 0.1832, + "step": 2152 + }, + { + "epoch": 2.4180822687070056, + "grad_norm": 0.4729830338617269, + "learning_rate": 1.2289617790817039e-05, + "loss": 0.1682, + "step": 2153 + }, + { + "epoch": 2.4192053909869435, + "grad_norm": 0.46719333183624456, + "learning_rate": 1.228198128807815e-05, + "loss": 0.1741, + "step": 2154 + }, + { + "epoch": 2.420328513266882, + "grad_norm": 0.4773732399141985, + "learning_rate": 1.2274343381211067e-05, + "loss": 0.1737, + "step": 2155 + }, + { + "epoch": 2.4214516355468203, + "grad_norm": 0.48795163089354604, + "learning_rate": 1.226670407491547e-05, + "loss": 0.1875, + "step": 2156 + }, + { + "epoch": 2.4225747578267582, + "grad_norm": 0.4579527625487055, + "learning_rate": 1.2259063373891911e-05, + "loss": 0.1772, + "step": 2157 + }, + { + "epoch": 2.4236978801066966, + "grad_norm": 0.4533126876310765, + "learning_rate": 1.22514212828418e-05, + "loss": 0.1705, + "step": 2158 + }, + { + "epoch": 2.424821002386635, + "grad_norm": 0.43848733476677665, + "learning_rate": 1.2243777806467396e-05, + "loss": 0.1676, + "step": 2159 + }, + { + "epoch": 2.425944124666573, + "grad_norm": 0.4719399363071484, + "learning_rate": 1.223613294947182e-05, + "loss": 0.1811, + "step": 2160 + }, + { + "epoch": 2.4270672469465113, + "grad_norm": 0.453477191494824, + "learning_rate": 1.222848671655903e-05, + "loss": 0.1685, + "step": 2161 + }, + { + "epoch": 2.4281903692264497, + "grad_norm": 0.4658487988412452, + "learning_rate": 1.222083911243384e-05, + "loss": 0.1773, + "step": 2162 + }, + { + "epoch": 2.4293134915063876, + "grad_norm": 0.459422798554054, + "learning_rate": 1.2213190141801906e-05, + "loss": 0.1733, + "step": 2163 + }, + { + "epoch": 2.430436613786326, + "grad_norm": 0.4887095631557401, + "learning_rate": 1.2205539809369719e-05, + "loss": 0.191, + "step": 2164 + }, + { + "epoch": 2.431559736066264, + "grad_norm": 0.48937602650227535, + "learning_rate": 1.2197888119844623e-05, + "loss": 0.1846, + "step": 2165 + }, + { + "epoch": 2.4326828583462023, + "grad_norm": 0.46406522024065605, + "learning_rate": 1.2190235077934776e-05, + "loss": 0.1747, + "step": 2166 + }, + { + "epoch": 2.4338059806261407, + "grad_norm": 0.4621944971046579, + "learning_rate": 1.2182580688349185e-05, + "loss": 0.1685, + "step": 2167 + }, + { + "epoch": 2.4349291029060787, + "grad_norm": 0.4862208277682074, + "learning_rate": 1.2174924955797676e-05, + "loss": 0.1838, + "step": 2168 + }, + { + "epoch": 2.436052225186017, + "grad_norm": 0.4488446985907642, + "learning_rate": 1.216726788499091e-05, + "loss": 0.1735, + "step": 2169 + }, + { + "epoch": 2.4371753474659554, + "grad_norm": 0.45799186763818456, + "learning_rate": 1.2159609480640361e-05, + "loss": 0.1615, + "step": 2170 + }, + { + "epoch": 2.4382984697458934, + "grad_norm": 0.4700112723112886, + "learning_rate": 1.2151949747458336e-05, + "loss": 0.1729, + "step": 2171 + }, + { + "epoch": 2.4394215920258318, + "grad_norm": 0.4613922790363946, + "learning_rate": 1.214428869015795e-05, + "loss": 0.1612, + "step": 2172 + }, + { + "epoch": 2.44054471430577, + "grad_norm": 0.4730141720396465, + "learning_rate": 1.2136626313453136e-05, + "loss": 0.1717, + "step": 2173 + }, + { + "epoch": 2.441667836585708, + "grad_norm": 0.48115765163733054, + "learning_rate": 1.212896262205864e-05, + "loss": 0.1759, + "step": 2174 + }, + { + "epoch": 2.4427909588656465, + "grad_norm": 0.4773927024472624, + "learning_rate": 1.2121297620690011e-05, + "loss": 0.1703, + "step": 2175 + }, + { + "epoch": 2.443914081145585, + "grad_norm": 0.42953251161909245, + "learning_rate": 1.2113631314063615e-05, + "loss": 0.1625, + "step": 2176 + }, + { + "epoch": 2.4450372034255228, + "grad_norm": 0.45411136690423287, + "learning_rate": 1.210596370689661e-05, + "loss": 0.1695, + "step": 2177 + }, + { + "epoch": 2.446160325705461, + "grad_norm": 0.48313926964400616, + "learning_rate": 1.2098294803906962e-05, + "loss": 0.1831, + "step": 2178 + }, + { + "epoch": 2.4472834479853995, + "grad_norm": 0.4816531131405757, + "learning_rate": 1.209062460981343e-05, + "loss": 0.1758, + "step": 2179 + }, + { + "epoch": 2.4484065702653375, + "grad_norm": 0.4399375257912147, + "learning_rate": 1.208295312933557e-05, + "loss": 0.1685, + "step": 2180 + }, + { + "epoch": 2.449529692545276, + "grad_norm": 0.467400595438487, + "learning_rate": 1.2075280367193727e-05, + "loss": 0.1851, + "step": 2181 + }, + { + "epoch": 2.4506528148252142, + "grad_norm": 0.4658665632807449, + "learning_rate": 1.2067606328109038e-05, + "loss": 0.1645, + "step": 2182 + }, + { + "epoch": 2.451775937105152, + "grad_norm": 0.46990058745256763, + "learning_rate": 1.2059931016803422e-05, + "loss": 0.168, + "step": 2183 + }, + { + "epoch": 2.4528990593850906, + "grad_norm": 0.4614529742174627, + "learning_rate": 1.2052254437999582e-05, + "loss": 0.1686, + "step": 2184 + }, + { + "epoch": 2.454022181665029, + "grad_norm": 0.46087967841279076, + "learning_rate": 1.2044576596421003e-05, + "loss": 0.182, + "step": 2185 + }, + { + "epoch": 2.455145303944967, + "grad_norm": 0.4686611799132139, + "learning_rate": 1.2036897496791945e-05, + "loss": 0.1705, + "step": 2186 + }, + { + "epoch": 2.4562684262249053, + "grad_norm": 0.45135028559868196, + "learning_rate": 1.2029217143837441e-05, + "loss": 0.1749, + "step": 2187 + }, + { + "epoch": 2.4573915485048436, + "grad_norm": 0.45774486543179027, + "learning_rate": 1.2021535542283297e-05, + "loss": 0.1786, + "step": 2188 + }, + { + "epoch": 2.4585146707847816, + "grad_norm": 0.4370453888107022, + "learning_rate": 1.2013852696856092e-05, + "loss": 0.1616, + "step": 2189 + }, + { + "epoch": 2.45963779306472, + "grad_norm": 0.4941697271173176, + "learning_rate": 1.2006168612283158e-05, + "loss": 0.1888, + "step": 2190 + }, + { + "epoch": 2.4607609153446584, + "grad_norm": 0.45220984927599905, + "learning_rate": 1.1998483293292602e-05, + "loss": 0.1719, + "step": 2191 + }, + { + "epoch": 2.4618840376245963, + "grad_norm": 0.45516650401879916, + "learning_rate": 1.199079674461328e-05, + "loss": 0.1718, + "step": 2192 + }, + { + "epoch": 2.4630071599045347, + "grad_norm": 0.47742118331301076, + "learning_rate": 1.1983108970974815e-05, + "loss": 0.1848, + "step": 2193 + }, + { + "epoch": 2.464130282184473, + "grad_norm": 0.46090370293756416, + "learning_rate": 1.1975419977107578e-05, + "loss": 0.1672, + "step": 2194 + }, + { + "epoch": 2.465253404464411, + "grad_norm": 0.46359443330017996, + "learning_rate": 1.1967729767742688e-05, + "loss": 0.1769, + "step": 2195 + }, + { + "epoch": 2.4663765267443494, + "grad_norm": 0.5022583283184264, + "learning_rate": 1.1960038347612021e-05, + "loss": 0.181, + "step": 2196 + }, + { + "epoch": 2.4674996490242878, + "grad_norm": 0.46125009756094304, + "learning_rate": 1.1952345721448189e-05, + "loss": 0.1784, + "step": 2197 + }, + { + "epoch": 2.4686227713042257, + "grad_norm": 0.4588215867147784, + "learning_rate": 1.1944651893984546e-05, + "loss": 0.1676, + "step": 2198 + }, + { + "epoch": 2.469745893584164, + "grad_norm": 0.44150474530630224, + "learning_rate": 1.1936956869955198e-05, + "loss": 0.1581, + "step": 2199 + }, + { + "epoch": 2.470869015864102, + "grad_norm": 0.4648006055553846, + "learning_rate": 1.192926065409497e-05, + "loss": 0.1756, + "step": 2200 + }, + { + "epoch": 2.4719921381440404, + "grad_norm": 0.4783515973358098, + "learning_rate": 1.1921563251139433e-05, + "loss": 0.177, + "step": 2201 + }, + { + "epoch": 2.473115260423979, + "grad_norm": 0.46380023099280016, + "learning_rate": 1.1913864665824878e-05, + "loss": 0.1738, + "step": 2202 + }, + { + "epoch": 2.4742383827039167, + "grad_norm": 0.4494157069182162, + "learning_rate": 1.1906164902888336e-05, + "loss": 0.1669, + "step": 2203 + }, + { + "epoch": 2.475361504983855, + "grad_norm": 0.45612118971443005, + "learning_rate": 1.189846396706755e-05, + "loss": 0.178, + "step": 2204 + }, + { + "epoch": 2.4764846272637935, + "grad_norm": 0.45627124504496713, + "learning_rate": 1.1890761863100994e-05, + "loss": 0.1838, + "step": 2205 + }, + { + "epoch": 2.4776077495437314, + "grad_norm": 0.448762160212113, + "learning_rate": 1.1883058595727862e-05, + "loss": 0.1715, + "step": 2206 + }, + { + "epoch": 2.47873087182367, + "grad_norm": 0.4478139255344506, + "learning_rate": 1.1875354169688049e-05, + "loss": 0.178, + "step": 2207 + }, + { + "epoch": 2.479853994103608, + "grad_norm": 0.45613528585444935, + "learning_rate": 1.186764858972218e-05, + "loss": 0.17, + "step": 2208 + }, + { + "epoch": 2.480977116383546, + "grad_norm": 0.4788049854189734, + "learning_rate": 1.185994186057158e-05, + "loss": 0.1785, + "step": 2209 + }, + { + "epoch": 2.4821002386634845, + "grad_norm": 0.4449516828341209, + "learning_rate": 1.1852233986978286e-05, + "loss": 0.1638, + "step": 2210 + }, + { + "epoch": 2.483223360943423, + "grad_norm": 0.4533109754634573, + "learning_rate": 1.1844524973685036e-05, + "loss": 0.17, + "step": 2211 + }, + { + "epoch": 2.484346483223361, + "grad_norm": 0.4615156080814565, + "learning_rate": 1.1836814825435272e-05, + "loss": 0.1753, + "step": 2212 + }, + { + "epoch": 2.485469605503299, + "grad_norm": 0.4617037737156908, + "learning_rate": 1.1829103546973135e-05, + "loss": 0.1752, + "step": 2213 + }, + { + "epoch": 2.4865927277832376, + "grad_norm": 0.4800696915193034, + "learning_rate": 1.1821391143043455e-05, + "loss": 0.1821, + "step": 2214 + }, + { + "epoch": 2.4877158500631755, + "grad_norm": 0.48034446713417756, + "learning_rate": 1.1813677618391759e-05, + "loss": 0.177, + "step": 2215 + }, + { + "epoch": 2.488838972343114, + "grad_norm": 0.46921558168312416, + "learning_rate": 1.1805962977764271e-05, + "loss": 0.1676, + "step": 2216 + }, + { + "epoch": 2.489962094623052, + "grad_norm": 0.45754664779814874, + "learning_rate": 1.1798247225907883e-05, + "loss": 0.1659, + "step": 2217 + }, + { + "epoch": 2.4910852169029902, + "grad_norm": 0.4445261764071967, + "learning_rate": 1.1790530367570194e-05, + "loss": 0.1697, + "step": 2218 + }, + { + "epoch": 2.4922083391829286, + "grad_norm": 0.44402068318245963, + "learning_rate": 1.1782812407499461e-05, + "loss": 0.1706, + "step": 2219 + }, + { + "epoch": 2.4933314614628665, + "grad_norm": 0.4853740651601103, + "learning_rate": 1.1775093350444638e-05, + "loss": 0.1787, + "step": 2220 + }, + { + "epoch": 2.494454583742805, + "grad_norm": 0.47514363972154094, + "learning_rate": 1.1767373201155344e-05, + "loss": 0.1767, + "step": 2221 + }, + { + "epoch": 2.4955777060227433, + "grad_norm": 0.4573804651093508, + "learning_rate": 1.1759651964381864e-05, + "loss": 0.1643, + "step": 2222 + }, + { + "epoch": 2.4967008283026813, + "grad_norm": 0.46823456746959147, + "learning_rate": 1.1751929644875171e-05, + "loss": 0.1801, + "step": 2223 + }, + { + "epoch": 2.4978239505826196, + "grad_norm": 0.4702347722306607, + "learning_rate": 1.1744206247386885e-05, + "loss": 0.1756, + "step": 2224 + }, + { + "epoch": 2.498947072862558, + "grad_norm": 0.4779388858348045, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.1854, + "step": 2225 + }, + { + "epoch": 2.500070195142496, + "grad_norm": 0.45908097833882, + "learning_rate": 1.1728756237475377e-05, + "loss": 0.1724, + "step": 2226 + }, + { + "epoch": 2.5011933174224343, + "grad_norm": 0.47347497107975, + "learning_rate": 1.172102963455871e-05, + "loss": 0.1856, + "step": 2227 + }, + { + "epoch": 2.5023164397023727, + "grad_norm": 0.440317578484131, + "learning_rate": 1.1713301972673574e-05, + "loss": 0.1677, + "step": 2228 + }, + { + "epoch": 2.5034395619823107, + "grad_norm": 0.4698698238973603, + "learning_rate": 1.1705573256574875e-05, + "loss": 0.172, + "step": 2229 + }, + { + "epoch": 2.504562684262249, + "grad_norm": 0.4666802074158573, + "learning_rate": 1.1697843491018189e-05, + "loss": 0.1778, + "step": 2230 + }, + { + "epoch": 2.5056858065421874, + "grad_norm": 0.4675479550389423, + "learning_rate": 1.1690112680759714e-05, + "loss": 0.1765, + "step": 2231 + }, + { + "epoch": 2.5068089288221254, + "grad_norm": 0.470321624448753, + "learning_rate": 1.1682380830556305e-05, + "loss": 0.1741, + "step": 2232 + }, + { + "epoch": 2.5079320511020637, + "grad_norm": 0.4621451552720319, + "learning_rate": 1.1674647945165463e-05, + "loss": 0.1725, + "step": 2233 + }, + { + "epoch": 2.509055173382002, + "grad_norm": 0.47835096191087073, + "learning_rate": 1.1666914029345309e-05, + "loss": 0.1735, + "step": 2234 + }, + { + "epoch": 2.51017829566194, + "grad_norm": 0.44571066889134053, + "learning_rate": 1.165917908785461e-05, + "loss": 0.1658, + "step": 2235 + }, + { + "epoch": 2.5113014179418784, + "grad_norm": 0.48065746817643185, + "learning_rate": 1.165144312545276e-05, + "loss": 0.1789, + "step": 2236 + }, + { + "epoch": 2.512424540221817, + "grad_norm": 0.4865730345149686, + "learning_rate": 1.164370614689978e-05, + "loss": 0.19, + "step": 2237 + }, + { + "epoch": 2.5135476625017548, + "grad_norm": 0.46563389532044447, + "learning_rate": 1.1635968156956322e-05, + "loss": 0.1788, + "step": 2238 + }, + { + "epoch": 2.514670784781693, + "grad_norm": 0.44428883561842164, + "learning_rate": 1.1628229160383653e-05, + "loss": 0.1645, + "step": 2239 + }, + { + "epoch": 2.5157939070616315, + "grad_norm": 0.4699429571414019, + "learning_rate": 1.1620489161943665e-05, + "loss": 0.1734, + "step": 2240 + }, + { + "epoch": 2.5169170293415695, + "grad_norm": 0.4645753328504612, + "learning_rate": 1.161274816639886e-05, + "loss": 0.1695, + "step": 2241 + }, + { + "epoch": 2.518040151621508, + "grad_norm": 0.4715174058816328, + "learning_rate": 1.1605006178512361e-05, + "loss": 0.1767, + "step": 2242 + }, + { + "epoch": 2.5191632739014462, + "grad_norm": 0.4739964569319054, + "learning_rate": 1.15972632030479e-05, + "loss": 0.1778, + "step": 2243 + }, + { + "epoch": 2.520286396181384, + "grad_norm": 0.4864994655693669, + "learning_rate": 1.1589519244769813e-05, + "loss": 0.1907, + "step": 2244 + }, + { + "epoch": 2.5214095184613226, + "grad_norm": 0.4554291488338845, + "learning_rate": 1.1581774308443042e-05, + "loss": 0.1726, + "step": 2245 + }, + { + "epoch": 2.522532640741261, + "grad_norm": 0.4490036340445551, + "learning_rate": 1.157402839883313e-05, + "loss": 0.1737, + "step": 2246 + }, + { + "epoch": 2.523655763021199, + "grad_norm": 0.4560497521668537, + "learning_rate": 1.1566281520706228e-05, + "loss": 0.1698, + "step": 2247 + }, + { + "epoch": 2.5247788853011373, + "grad_norm": 0.4848150687683439, + "learning_rate": 1.1558533678829066e-05, + "loss": 0.1779, + "step": 2248 + }, + { + "epoch": 2.5259020075810756, + "grad_norm": 0.4557588727056414, + "learning_rate": 1.1550784877968982e-05, + "loss": 0.1792, + "step": 2249 + }, + { + "epoch": 2.5270251298610136, + "grad_norm": 0.46640445243584955, + "learning_rate": 1.1543035122893898e-05, + "loss": 0.1821, + "step": 2250 + }, + { + "epoch": 2.528148252140952, + "grad_norm": 0.46062983958648196, + "learning_rate": 1.1535284418372321e-05, + "loss": 0.1731, + "step": 2251 + }, + { + "epoch": 2.5292713744208903, + "grad_norm": 0.4533115380906245, + "learning_rate": 1.1527532769173349e-05, + "loss": 0.1852, + "step": 2252 + }, + { + "epoch": 2.5303944967008283, + "grad_norm": 0.45093605161163985, + "learning_rate": 1.1519780180066651e-05, + "loss": 0.1725, + "step": 2253 + }, + { + "epoch": 2.5315176189807667, + "grad_norm": 0.4739553972885986, + "learning_rate": 1.1512026655822483e-05, + "loss": 0.185, + "step": 2254 + }, + { + "epoch": 2.5326407412607046, + "grad_norm": 0.455497428773056, + "learning_rate": 1.150427220121168e-05, + "loss": 0.1667, + "step": 2255 + }, + { + "epoch": 2.533763863540643, + "grad_norm": 0.46427594398131145, + "learning_rate": 1.1496516821005632e-05, + "loss": 0.1707, + "step": 2256 + }, + { + "epoch": 2.5348869858205814, + "grad_norm": 0.4726499893846905, + "learning_rate": 1.1488760519976321e-05, + "loss": 0.1693, + "step": 2257 + }, + { + "epoch": 2.5360101081005193, + "grad_norm": 0.4770180942704747, + "learning_rate": 1.1481003302896274e-05, + "loss": 0.1791, + "step": 2258 + }, + { + "epoch": 2.5371332303804577, + "grad_norm": 0.4824404668774312, + "learning_rate": 1.1473245174538601e-05, + "loss": 0.1632, + "step": 2259 + }, + { + "epoch": 2.538256352660396, + "grad_norm": 0.46093177693875936, + "learning_rate": 1.1465486139676955e-05, + "loss": 0.171, + "step": 2260 + }, + { + "epoch": 2.539379474940334, + "grad_norm": 0.47811684922648134, + "learning_rate": 1.1457726203085565e-05, + "loss": 0.1795, + "step": 2261 + }, + { + "epoch": 2.5405025972202724, + "grad_norm": 0.4605303787580689, + "learning_rate": 1.14499653695392e-05, + "loss": 0.1717, + "step": 2262 + }, + { + "epoch": 2.5416257195002103, + "grad_norm": 0.45780123414736634, + "learning_rate": 1.1442203643813184e-05, + "loss": 0.165, + "step": 2263 + }, + { + "epoch": 2.5427488417801487, + "grad_norm": 0.4484627901652153, + "learning_rate": 1.1434441030683396e-05, + "loss": 0.1723, + "step": 2264 + }, + { + "epoch": 2.543871964060087, + "grad_norm": 0.45899357848479316, + "learning_rate": 1.1426677534926259e-05, + "loss": 0.178, + "step": 2265 + }, + { + "epoch": 2.544995086340025, + "grad_norm": 0.4520762026871274, + "learning_rate": 1.1418913161318735e-05, + "loss": 0.1718, + "step": 2266 + }, + { + "epoch": 2.5461182086199634, + "grad_norm": 0.48593149453254475, + "learning_rate": 1.1411147914638323e-05, + "loss": 0.1858, + "step": 2267 + }, + { + "epoch": 2.547241330899902, + "grad_norm": 0.4699419541357399, + "learning_rate": 1.1403381799663073e-05, + "loss": 0.1868, + "step": 2268 + }, + { + "epoch": 2.5483644531798397, + "grad_norm": 0.43072277665816994, + "learning_rate": 1.139561482117156e-05, + "loss": 0.1635, + "step": 2269 + }, + { + "epoch": 2.549487575459778, + "grad_norm": 0.48228875953990946, + "learning_rate": 1.138784698394289e-05, + "loss": 0.1838, + "step": 2270 + }, + { + "epoch": 2.5506106977397165, + "grad_norm": 0.4589878064473334, + "learning_rate": 1.1380078292756695e-05, + "loss": 0.1779, + "step": 2271 + }, + { + "epoch": 2.5517338200196544, + "grad_norm": 0.47422067140567364, + "learning_rate": 1.1372308752393144e-05, + "loss": 0.1871, + "step": 2272 + }, + { + "epoch": 2.552856942299593, + "grad_norm": 0.4841535195365073, + "learning_rate": 1.136453836763291e-05, + "loss": 0.1839, + "step": 2273 + }, + { + "epoch": 2.553980064579531, + "grad_norm": 0.46931683097362253, + "learning_rate": 1.1356767143257208e-05, + "loss": 0.1774, + "step": 2274 + }, + { + "epoch": 2.555103186859469, + "grad_norm": 0.45852223438824385, + "learning_rate": 1.134899508404775e-05, + "loss": 0.1668, + "step": 2275 + }, + { + "epoch": 2.5562263091394075, + "grad_norm": 0.4747264084612995, + "learning_rate": 1.1341222194786772e-05, + "loss": 0.1817, + "step": 2276 + }, + { + "epoch": 2.557349431419346, + "grad_norm": 0.5309294296915062, + "learning_rate": 1.1333448480257019e-05, + "loss": 0.1774, + "step": 2277 + }, + { + "epoch": 2.558472553699284, + "grad_norm": 0.44494999424664955, + "learning_rate": 1.132567394524174e-05, + "loss": 0.1652, + "step": 2278 + }, + { + "epoch": 2.559595675979222, + "grad_norm": 0.47565188072125314, + "learning_rate": 1.1317898594524694e-05, + "loss": 0.1739, + "step": 2279 + }, + { + "epoch": 2.5607187982591606, + "grad_norm": 0.4724833833160546, + "learning_rate": 1.131012243289014e-05, + "loss": 0.1756, + "step": 2280 + }, + { + "epoch": 2.5618419205390985, + "grad_norm": 0.4961383451401611, + "learning_rate": 1.1302345465122839e-05, + "loss": 0.1751, + "step": 2281 + }, + { + "epoch": 2.562965042819037, + "grad_norm": 0.45743388018733416, + "learning_rate": 1.1294567696008038e-05, + "loss": 0.1682, + "step": 2282 + }, + { + "epoch": 2.5640881650989753, + "grad_norm": 0.46320800217506247, + "learning_rate": 1.1286789130331487e-05, + "loss": 0.1765, + "step": 2283 + }, + { + "epoch": 2.5652112873789132, + "grad_norm": 0.46728239868320987, + "learning_rate": 1.1279009772879427e-05, + "loss": 0.1715, + "step": 2284 + }, + { + "epoch": 2.5663344096588516, + "grad_norm": 0.455957997479803, + "learning_rate": 1.1271229628438578e-05, + "loss": 0.174, + "step": 2285 + }, + { + "epoch": 2.56745753193879, + "grad_norm": 0.46937629962755417, + "learning_rate": 1.1263448701796149e-05, + "loss": 0.1785, + "step": 2286 + }, + { + "epoch": 2.568580654218728, + "grad_norm": 0.46560086475544704, + "learning_rate": 1.125566699773983e-05, + "loss": 0.1782, + "step": 2287 + }, + { + "epoch": 2.5697037764986663, + "grad_norm": 0.4611166282806535, + "learning_rate": 1.1247884521057788e-05, + "loss": 0.1604, + "step": 2288 + }, + { + "epoch": 2.5708268987786047, + "grad_norm": 0.4800102109761718, + "learning_rate": 1.1240101276538668e-05, + "loss": 0.1798, + "step": 2289 + }, + { + "epoch": 2.5719500210585426, + "grad_norm": 0.44962924941537197, + "learning_rate": 1.1232317268971586e-05, + "loss": 0.1645, + "step": 2290 + }, + { + "epoch": 2.573073143338481, + "grad_norm": 0.45604502325176743, + "learning_rate": 1.122453250314613e-05, + "loss": 0.1762, + "step": 2291 + }, + { + "epoch": 2.5741962656184194, + "grad_norm": 0.44941318778717404, + "learning_rate": 1.121674698385235e-05, + "loss": 0.1723, + "step": 2292 + }, + { + "epoch": 2.5753193878983573, + "grad_norm": 0.4570440838697352, + "learning_rate": 1.1208960715880759e-05, + "loss": 0.1692, + "step": 2293 + }, + { + "epoch": 2.5764425101782957, + "grad_norm": 0.4458264791275645, + "learning_rate": 1.1201173704022335e-05, + "loss": 0.1714, + "step": 2294 + }, + { + "epoch": 2.577565632458234, + "grad_norm": 0.46409333426984856, + "learning_rate": 1.1193385953068512e-05, + "loss": 0.188, + "step": 2295 + }, + { + "epoch": 2.578688754738172, + "grad_norm": 0.42425996455837706, + "learning_rate": 1.118559746781118e-05, + "loss": 0.1591, + "step": 2296 + }, + { + "epoch": 2.5798118770181104, + "grad_norm": 0.447955738326864, + "learning_rate": 1.1177808253042679e-05, + "loss": 0.1712, + "step": 2297 + }, + { + "epoch": 2.580934999298049, + "grad_norm": 0.4650895841202455, + "learning_rate": 1.1170018313555802e-05, + "loss": 0.1808, + "step": 2298 + }, + { + "epoch": 2.5820581215779868, + "grad_norm": 0.4701322334085429, + "learning_rate": 1.1162227654143777e-05, + "loss": 0.1893, + "step": 2299 + }, + { + "epoch": 2.583181243857925, + "grad_norm": 0.4531359125712725, + "learning_rate": 1.1154436279600287e-05, + "loss": 0.169, + "step": 2300 + }, + { + "epoch": 2.5843043661378635, + "grad_norm": 0.47496785745939757, + "learning_rate": 1.1146644194719454e-05, + "loss": 0.171, + "step": 2301 + }, + { + "epoch": 2.5854274884178015, + "grad_norm": 0.4612778970229022, + "learning_rate": 1.1138851404295826e-05, + "loss": 0.1671, + "step": 2302 + }, + { + "epoch": 2.58655061069774, + "grad_norm": 0.4963115959404571, + "learning_rate": 1.1131057913124399e-05, + "loss": 0.1823, + "step": 2303 + }, + { + "epoch": 2.587673732977678, + "grad_norm": 0.4776956369356712, + "learning_rate": 1.1123263726000588e-05, + "loss": 0.1766, + "step": 2304 + }, + { + "epoch": 2.588796855257616, + "grad_norm": 0.48648831354650407, + "learning_rate": 1.1115468847720245e-05, + "loss": 0.1757, + "step": 2305 + }, + { + "epoch": 2.5899199775375545, + "grad_norm": 0.4686638661781645, + "learning_rate": 1.110767328307965e-05, + "loss": 0.1821, + "step": 2306 + }, + { + "epoch": 2.5910430998174925, + "grad_norm": 0.44047248695541774, + "learning_rate": 1.109987703687549e-05, + "loss": 0.1657, + "step": 2307 + }, + { + "epoch": 2.592166222097431, + "grad_norm": 0.43543299715008577, + "learning_rate": 1.1092080113904886e-05, + "loss": 0.1599, + "step": 2308 + }, + { + "epoch": 2.5932893443773692, + "grad_norm": 0.47199067367590014, + "learning_rate": 1.1084282518965373e-05, + "loss": 0.1791, + "step": 2309 + }, + { + "epoch": 2.594412466657307, + "grad_norm": 0.4532641018961933, + "learning_rate": 1.1076484256854889e-05, + "loss": 0.1678, + "step": 2310 + }, + { + "epoch": 2.5955355889372456, + "grad_norm": 0.45735825060841906, + "learning_rate": 1.1068685332371802e-05, + "loss": 0.1838, + "step": 2311 + }, + { + "epoch": 2.5966587112171835, + "grad_norm": 0.4591137341726451, + "learning_rate": 1.1060885750314865e-05, + "loss": 0.1765, + "step": 2312 + }, + { + "epoch": 2.597781833497122, + "grad_norm": 0.4679142193330497, + "learning_rate": 1.1053085515483255e-05, + "loss": 0.179, + "step": 2313 + }, + { + "epoch": 2.5989049557770603, + "grad_norm": 0.47518636171561446, + "learning_rate": 1.1045284632676535e-05, + "loss": 0.1698, + "step": 2314 + }, + { + "epoch": 2.600028078056998, + "grad_norm": 0.4565346667945619, + "learning_rate": 1.1037483106694681e-05, + "loss": 0.1706, + "step": 2315 + }, + { + "epoch": 2.6011512003369366, + "grad_norm": 0.47006822600452486, + "learning_rate": 1.1029680942338053e-05, + "loss": 0.1694, + "step": 2316 + }, + { + "epoch": 2.602274322616875, + "grad_norm": 0.4563277326138092, + "learning_rate": 1.1021878144407408e-05, + "loss": 0.1672, + "step": 2317 + }, + { + "epoch": 2.603397444896813, + "grad_norm": 0.47107089066828634, + "learning_rate": 1.1014074717703897e-05, + "loss": 0.1749, + "step": 2318 + }, + { + "epoch": 2.6045205671767513, + "grad_norm": 0.4467643283567714, + "learning_rate": 1.1006270667029054e-05, + "loss": 0.1679, + "step": 2319 + }, + { + "epoch": 2.6056436894566897, + "grad_norm": 0.4587123434597116, + "learning_rate": 1.0998465997184798e-05, + "loss": 0.1777, + "step": 2320 + }, + { + "epoch": 2.6067668117366276, + "grad_norm": 0.4983006072884289, + "learning_rate": 1.099066071297342e-05, + "loss": 0.1874, + "step": 2321 + }, + { + "epoch": 2.607889934016566, + "grad_norm": 0.476575665366494, + "learning_rate": 1.0982854819197609e-05, + "loss": 0.1747, + "step": 2322 + }, + { + "epoch": 2.6090130562965044, + "grad_norm": 0.45400404446255577, + "learning_rate": 1.0975048320660408e-05, + "loss": 0.1801, + "step": 2323 + }, + { + "epoch": 2.6101361785764423, + "grad_norm": 0.4332841064106607, + "learning_rate": 1.0967241222165247e-05, + "loss": 0.1632, + "step": 2324 + }, + { + "epoch": 2.6112593008563807, + "grad_norm": 0.4510328447149446, + "learning_rate": 1.0959433528515922e-05, + "loss": 0.1741, + "step": 2325 + }, + { + "epoch": 2.612382423136319, + "grad_norm": 0.454925383272742, + "learning_rate": 1.0951625244516584e-05, + "loss": 0.1727, + "step": 2326 + }, + { + "epoch": 2.613505545416257, + "grad_norm": 0.4371439911786275, + "learning_rate": 1.094381637497176e-05, + "loss": 0.1673, + "step": 2327 + }, + { + "epoch": 2.6146286676961954, + "grad_norm": 0.47013253731307947, + "learning_rate": 1.0936006924686337e-05, + "loss": 0.1708, + "step": 2328 + }, + { + "epoch": 2.615751789976134, + "grad_norm": 0.46539714593918086, + "learning_rate": 1.0928196898465552e-05, + "loss": 0.1801, + "step": 2329 + }, + { + "epoch": 2.6168749122560717, + "grad_norm": 0.45190767645037216, + "learning_rate": 1.0920386301115e-05, + "loss": 0.1744, + "step": 2330 + }, + { + "epoch": 2.61799803453601, + "grad_norm": 0.4538448950274808, + "learning_rate": 1.091257513744063e-05, + "loss": 0.1681, + "step": 2331 + }, + { + "epoch": 2.6191211568159485, + "grad_norm": 0.488178449583916, + "learning_rate": 1.0904763412248736e-05, + "loss": 0.1845, + "step": 2332 + }, + { + "epoch": 2.6202442790958864, + "grad_norm": 0.46938216478064526, + "learning_rate": 1.0896951130345957e-05, + "loss": 0.1767, + "step": 2333 + }, + { + "epoch": 2.621367401375825, + "grad_norm": 0.44361139483028894, + "learning_rate": 1.0889138296539277e-05, + "loss": 0.1719, + "step": 2334 + }, + { + "epoch": 2.622490523655763, + "grad_norm": 0.4571118704858418, + "learning_rate": 1.088132491563602e-05, + "loss": 0.1693, + "step": 2335 + }, + { + "epoch": 2.623613645935701, + "grad_norm": 0.4600308129329723, + "learning_rate": 1.0873510992443841e-05, + "loss": 0.1666, + "step": 2336 + }, + { + "epoch": 2.6247367682156395, + "grad_norm": 0.4609852591114243, + "learning_rate": 1.086569653177074e-05, + "loss": 0.1714, + "step": 2337 + }, + { + "epoch": 2.625859890495578, + "grad_norm": 0.47717459396403794, + "learning_rate": 1.0857881538425032e-05, + "loss": 0.1872, + "step": 2338 + }, + { + "epoch": 2.626983012775516, + "grad_norm": 0.4617597071752641, + "learning_rate": 1.0850066017215375e-05, + "loss": 0.1745, + "step": 2339 + }, + { + "epoch": 2.628106135055454, + "grad_norm": 0.4654760240823605, + "learning_rate": 1.0842249972950743e-05, + "loss": 0.1713, + "step": 2340 + }, + { + "epoch": 2.6292292573353926, + "grad_norm": 0.4698177808893506, + "learning_rate": 1.0834433410440432e-05, + "loss": 0.1716, + "step": 2341 + }, + { + "epoch": 2.6303523796153305, + "grad_norm": 0.47185791755913564, + "learning_rate": 1.0826616334494068e-05, + "loss": 0.177, + "step": 2342 + }, + { + "epoch": 2.631475501895269, + "grad_norm": 0.46380423420344924, + "learning_rate": 1.0818798749921569e-05, + "loss": 0.172, + "step": 2343 + }, + { + "epoch": 2.6325986241752073, + "grad_norm": 0.4790054584186303, + "learning_rate": 1.081098066153319e-05, + "loss": 0.1726, + "step": 2344 + }, + { + "epoch": 2.6337217464551452, + "grad_norm": 0.46882773567793207, + "learning_rate": 1.0803162074139489e-05, + "loss": 0.1699, + "step": 2345 + }, + { + "epoch": 2.6348448687350836, + "grad_norm": 0.45878850668533416, + "learning_rate": 1.0795342992551323e-05, + "loss": 0.1749, + "step": 2346 + }, + { + "epoch": 2.635967991015022, + "grad_norm": 0.46305430397540676, + "learning_rate": 1.0787523421579862e-05, + "loss": 0.1797, + "step": 2347 + }, + { + "epoch": 2.63709111329496, + "grad_norm": 0.4663636966697608, + "learning_rate": 1.0779703366036573e-05, + "loss": 0.1733, + "step": 2348 + }, + { + "epoch": 2.6382142355748983, + "grad_norm": 0.48294894032218394, + "learning_rate": 1.0771882830733223e-05, + "loss": 0.1887, + "step": 2349 + }, + { + "epoch": 2.6393373578548367, + "grad_norm": 0.45323268388669674, + "learning_rate": 1.0764061820481872e-05, + "loss": 0.1717, + "step": 2350 + }, + { + "epoch": 2.6404604801347746, + "grad_norm": 0.4532472744181654, + "learning_rate": 1.0756240340094877e-05, + "loss": 0.1732, + "step": 2351 + }, + { + "epoch": 2.641583602414713, + "grad_norm": 0.4383917984038164, + "learning_rate": 1.0748418394384876e-05, + "loss": 0.1658, + "step": 2352 + }, + { + "epoch": 2.6427067246946514, + "grad_norm": 0.4661107283107099, + "learning_rate": 1.07405959881648e-05, + "loss": 0.1863, + "step": 2353 + }, + { + "epoch": 2.6438298469745893, + "grad_norm": 0.4737303429465663, + "learning_rate": 1.0732773126247867e-05, + "loss": 0.1771, + "step": 2354 + }, + { + "epoch": 2.6449529692545277, + "grad_norm": 0.47017332443037857, + "learning_rate": 1.0724949813447563e-05, + "loss": 0.1724, + "step": 2355 + }, + { + "epoch": 2.6460760915344657, + "grad_norm": 0.49757478615437767, + "learning_rate": 1.071712605457766e-05, + "loss": 0.1824, + "step": 2356 + }, + { + "epoch": 2.647199213814404, + "grad_norm": 0.47195996519974315, + "learning_rate": 1.0709301854452207e-05, + "loss": 0.1778, + "step": 2357 + }, + { + "epoch": 2.6483223360943424, + "grad_norm": 0.48072524454750337, + "learning_rate": 1.0701477217885517e-05, + "loss": 0.1854, + "step": 2358 + }, + { + "epoch": 2.6494454583742804, + "grad_norm": 0.48656148421443146, + "learning_rate": 1.0693652149692175e-05, + "loss": 0.1838, + "step": 2359 + }, + { + "epoch": 2.6505685806542187, + "grad_norm": 0.4565743939469196, + "learning_rate": 1.068582665468703e-05, + "loss": 0.1767, + "step": 2360 + }, + { + "epoch": 2.6516917029341567, + "grad_norm": 0.4785810820290836, + "learning_rate": 1.0678000737685197e-05, + "loss": 0.1738, + "step": 2361 + }, + { + "epoch": 2.652814825214095, + "grad_norm": 0.4574908421572849, + "learning_rate": 1.0670174403502051e-05, + "loss": 0.1765, + "step": 2362 + }, + { + "epoch": 2.6539379474940334, + "grad_norm": 0.465593556602867, + "learning_rate": 1.0662347656953221e-05, + "loss": 0.1679, + "step": 2363 + }, + { + "epoch": 2.6550610697739714, + "grad_norm": 0.46131939499983016, + "learning_rate": 1.0654520502854588e-05, + "loss": 0.1777, + "step": 2364 + }, + { + "epoch": 2.6561841920539098, + "grad_norm": 0.48218612435761826, + "learning_rate": 1.0646692946022285e-05, + "loss": 0.1927, + "step": 2365 + }, + { + "epoch": 2.657307314333848, + "grad_norm": 0.44549729019844936, + "learning_rate": 1.0638864991272698e-05, + "loss": 0.1647, + "step": 2366 + }, + { + "epoch": 2.658430436613786, + "grad_norm": 0.4723488402722565, + "learning_rate": 1.063103664342245e-05, + "loss": 0.1757, + "step": 2367 + }, + { + "epoch": 2.6595535588937245, + "grad_norm": 0.46272844102717686, + "learning_rate": 1.0623207907288409e-05, + "loss": 0.1719, + "step": 2368 + }, + { + "epoch": 2.660676681173663, + "grad_norm": 0.45549848321049613, + "learning_rate": 1.061537878768769e-05, + "loss": 0.1697, + "step": 2369 + }, + { + "epoch": 2.661799803453601, + "grad_norm": 0.45675590525963283, + "learning_rate": 1.0607549289437626e-05, + "loss": 0.174, + "step": 2370 + }, + { + "epoch": 2.662922925733539, + "grad_norm": 0.47841033221623763, + "learning_rate": 1.0599719417355801e-05, + "loss": 0.1778, + "step": 2371 + }, + { + "epoch": 2.6640460480134776, + "grad_norm": 0.4536400418140574, + "learning_rate": 1.0591889176260017e-05, + "loss": 0.1737, + "step": 2372 + }, + { + "epoch": 2.6651691702934155, + "grad_norm": 0.4630804441617724, + "learning_rate": 1.0584058570968312e-05, + "loss": 0.1792, + "step": 2373 + }, + { + "epoch": 2.666292292573354, + "grad_norm": 0.4382451498975878, + "learning_rate": 1.0576227606298937e-05, + "loss": 0.1567, + "step": 2374 + }, + { + "epoch": 2.6674154148532923, + "grad_norm": 0.4584632279254452, + "learning_rate": 1.0568396287070377e-05, + "loss": 0.1751, + "step": 2375 + }, + { + "epoch": 2.66853853713323, + "grad_norm": 0.46530159309615415, + "learning_rate": 1.0560564618101328e-05, + "loss": 0.1796, + "step": 2376 + }, + { + "epoch": 2.6696616594131686, + "grad_norm": 0.4635165893290825, + "learning_rate": 1.0552732604210701e-05, + "loss": 0.1737, + "step": 2377 + }, + { + "epoch": 2.670784781693107, + "grad_norm": 0.4395153429761878, + "learning_rate": 1.0544900250217615e-05, + "loss": 0.1599, + "step": 2378 + }, + { + "epoch": 2.671907903973045, + "grad_norm": 0.45345637640530895, + "learning_rate": 1.0537067560941416e-05, + "loss": 0.1746, + "step": 2379 + }, + { + "epoch": 2.6730310262529833, + "grad_norm": 0.45176101895188636, + "learning_rate": 1.0529234541201631e-05, + "loss": 0.1724, + "step": 2380 + }, + { + "epoch": 2.6741541485329217, + "grad_norm": 0.46146241332136667, + "learning_rate": 1.0521401195818014e-05, + "loss": 0.1748, + "step": 2381 + }, + { + "epoch": 2.6752772708128596, + "grad_norm": 0.4644112669184593, + "learning_rate": 1.0513567529610498e-05, + "loss": 0.1792, + "step": 2382 + }, + { + "epoch": 2.676400393092798, + "grad_norm": 0.474578955286889, + "learning_rate": 1.050573354739923e-05, + "loss": 0.1785, + "step": 2383 + }, + { + "epoch": 2.6775235153727364, + "grad_norm": 0.4361002502010669, + "learning_rate": 1.049789925400455e-05, + "loss": 0.1625, + "step": 2384 + }, + { + "epoch": 2.6786466376526743, + "grad_norm": 0.45846134672913397, + "learning_rate": 1.0490064654246976e-05, + "loss": 0.18, + "step": 2385 + }, + { + "epoch": 2.6797697599326127, + "grad_norm": 0.46526875476056234, + "learning_rate": 1.0482229752947228e-05, + "loss": 0.1712, + "step": 2386 + }, + { + "epoch": 2.680892882212551, + "grad_norm": 0.4661257112898557, + "learning_rate": 1.0474394554926206e-05, + "loss": 0.1815, + "step": 2387 + }, + { + "epoch": 2.682016004492489, + "grad_norm": 0.4648307273335071, + "learning_rate": 1.0466559065004995e-05, + "loss": 0.17, + "step": 2388 + }, + { + "epoch": 2.6831391267724274, + "grad_norm": 0.4673176281630868, + "learning_rate": 1.0458723288004858e-05, + "loss": 0.1689, + "step": 2389 + }, + { + "epoch": 2.6842622490523658, + "grad_norm": 0.4580586103480894, + "learning_rate": 1.0450887228747229e-05, + "loss": 0.1608, + "step": 2390 + }, + { + "epoch": 2.6853853713323037, + "grad_norm": 0.46591951479828025, + "learning_rate": 1.0443050892053733e-05, + "loss": 0.1752, + "step": 2391 + }, + { + "epoch": 2.686508493612242, + "grad_norm": 0.4515547388684425, + "learning_rate": 1.0435214282746142e-05, + "loss": 0.1598, + "step": 2392 + }, + { + "epoch": 2.6876316158921805, + "grad_norm": 0.4951158425551178, + "learning_rate": 1.0427377405646414e-05, + "loss": 0.1933, + "step": 2393 + }, + { + "epoch": 2.6887547381721184, + "grad_norm": 0.47544695446998514, + "learning_rate": 1.0419540265576666e-05, + "loss": 0.1797, + "step": 2394 + }, + { + "epoch": 2.689877860452057, + "grad_norm": 0.45123909591416184, + "learning_rate": 1.041170286735918e-05, + "loss": 0.1724, + "step": 2395 + }, + { + "epoch": 2.691000982731995, + "grad_norm": 0.462230580491664, + "learning_rate": 1.0403865215816382e-05, + "loss": 0.1846, + "step": 2396 + }, + { + "epoch": 2.692124105011933, + "grad_norm": 0.452353253692668, + "learning_rate": 1.0396027315770876e-05, + "loss": 0.1757, + "step": 2397 + }, + { + "epoch": 2.6932472272918715, + "grad_norm": 0.45256519362751757, + "learning_rate": 1.0388189172045407e-05, + "loss": 0.1815, + "step": 2398 + }, + { + "epoch": 2.69437034957181, + "grad_norm": 0.47210582761980324, + "learning_rate": 1.0380350789462865e-05, + "loss": 0.178, + "step": 2399 + }, + { + "epoch": 2.695493471851748, + "grad_norm": 0.4544816107564523, + "learning_rate": 1.0372512172846296e-05, + "loss": 0.1659, + "step": 2400 + }, + { + "epoch": 2.696616594131686, + "grad_norm": 0.47543248227366125, + "learning_rate": 1.0364673327018891e-05, + "loss": 0.1823, + "step": 2401 + }, + { + "epoch": 2.6977397164116246, + "grad_norm": 0.4645680632421113, + "learning_rate": 1.0356834256803974e-05, + "loss": 0.1817, + "step": 2402 + }, + { + "epoch": 2.6988628386915625, + "grad_norm": 0.47229910569982986, + "learning_rate": 1.0348994967025012e-05, + "loss": 0.1705, + "step": 2403 + }, + { + "epoch": 2.699985960971501, + "grad_norm": 0.4709976896475921, + "learning_rate": 1.0341155462505606e-05, + "loss": 0.1816, + "step": 2404 + }, + { + "epoch": 2.701109083251439, + "grad_norm": 0.43745184102209106, + "learning_rate": 1.033331574806949e-05, + "loss": 0.1542, + "step": 2405 + }, + { + "epoch": 2.702232205531377, + "grad_norm": 0.47528735474280753, + "learning_rate": 1.0325475828540524e-05, + "loss": 0.1746, + "step": 2406 + }, + { + "epoch": 2.7033553278113156, + "grad_norm": 0.4864779824250153, + "learning_rate": 1.03176357087427e-05, + "loss": 0.1861, + "step": 2407 + }, + { + "epoch": 2.7044784500912535, + "grad_norm": 0.44098989955704543, + "learning_rate": 1.030979539350013e-05, + "loss": 0.1665, + "step": 2408 + }, + { + "epoch": 2.705601572371192, + "grad_norm": 0.45665136007590384, + "learning_rate": 1.0301954887637045e-05, + "loss": 0.1701, + "step": 2409 + }, + { + "epoch": 2.7067246946511303, + "grad_norm": 0.457919080022194, + "learning_rate": 1.0294114195977796e-05, + "loss": 0.1647, + "step": 2410 + }, + { + "epoch": 2.7078478169310682, + "grad_norm": 0.47134438824824476, + "learning_rate": 1.0286273323346843e-05, + "loss": 0.1811, + "step": 2411 + }, + { + "epoch": 2.7089709392110066, + "grad_norm": 0.48672871636486603, + "learning_rate": 1.0278432274568765e-05, + "loss": 0.1743, + "step": 2412 + }, + { + "epoch": 2.7100940614909446, + "grad_norm": 0.4619105109658203, + "learning_rate": 1.0270591054468244e-05, + "loss": 0.1684, + "step": 2413 + }, + { + "epoch": 2.711217183770883, + "grad_norm": 0.4693014599142233, + "learning_rate": 1.0262749667870071e-05, + "loss": 0.1735, + "step": 2414 + }, + { + "epoch": 2.7123403060508213, + "grad_norm": 0.4536596675091747, + "learning_rate": 1.0254908119599134e-05, + "loss": 0.1663, + "step": 2415 + }, + { + "epoch": 2.7134634283307593, + "grad_norm": 0.4762674495287221, + "learning_rate": 1.0247066414480424e-05, + "loss": 0.1733, + "step": 2416 + }, + { + "epoch": 2.7145865506106976, + "grad_norm": 0.4689655559574979, + "learning_rate": 1.0239224557339035e-05, + "loss": 0.1784, + "step": 2417 + }, + { + "epoch": 2.715709672890636, + "grad_norm": 0.4673344907187584, + "learning_rate": 1.0231382553000143e-05, + "loss": 0.1837, + "step": 2418 + }, + { + "epoch": 2.716832795170574, + "grad_norm": 0.46231195313403367, + "learning_rate": 1.0223540406289017e-05, + "loss": 0.1691, + "step": 2419 + }, + { + "epoch": 2.7179559174505123, + "grad_norm": 0.4722466169309128, + "learning_rate": 1.0215698122031021e-05, + "loss": 0.1765, + "step": 2420 + }, + { + "epoch": 2.7190790397304507, + "grad_norm": 0.4464937713270927, + "learning_rate": 1.0207855705051595e-05, + "loss": 0.1692, + "step": 2421 + }, + { + "epoch": 2.7202021620103887, + "grad_norm": 0.43287348633124384, + "learning_rate": 1.020001316017627e-05, + "loss": 0.161, + "step": 2422 + }, + { + "epoch": 2.721325284290327, + "grad_norm": 0.46989012910634004, + "learning_rate": 1.0192170492230643e-05, + "loss": 0.1809, + "step": 2423 + }, + { + "epoch": 2.7224484065702654, + "grad_norm": 0.4705286884689803, + "learning_rate": 1.0184327706040397e-05, + "loss": 0.174, + "step": 2424 + }, + { + "epoch": 2.7235715288502034, + "grad_norm": 0.43611447014588406, + "learning_rate": 1.0176484806431288e-05, + "loss": 0.1604, + "step": 2425 + }, + { + "epoch": 2.7246946511301418, + "grad_norm": 0.4794627114691746, + "learning_rate": 1.0168641798229133e-05, + "loss": 0.1802, + "step": 2426 + }, + { + "epoch": 2.72581777341008, + "grad_norm": 0.4381810224958243, + "learning_rate": 1.0160798686259825e-05, + "loss": 0.1609, + "step": 2427 + }, + { + "epoch": 2.726940895690018, + "grad_norm": 0.48565778276163146, + "learning_rate": 1.0152955475349316e-05, + "loss": 0.1878, + "step": 2428 + }, + { + "epoch": 2.7280640179699565, + "grad_norm": 0.46551401986940266, + "learning_rate": 1.014511217032362e-05, + "loss": 0.1744, + "step": 2429 + }, + { + "epoch": 2.729187140249895, + "grad_norm": 0.45724136025751433, + "learning_rate": 1.0137268776008809e-05, + "loss": 0.17, + "step": 2430 + }, + { + "epoch": 2.7303102625298328, + "grad_norm": 0.45946275216502946, + "learning_rate": 1.0129425297231005e-05, + "loss": 0.1766, + "step": 2431 + }, + { + "epoch": 2.731433384809771, + "grad_norm": 0.46397417169666205, + "learning_rate": 1.0121581738816397e-05, + "loss": 0.174, + "step": 2432 + }, + { + "epoch": 2.7325565070897095, + "grad_norm": 0.46245651199187193, + "learning_rate": 1.0113738105591203e-05, + "loss": 0.1792, + "step": 2433 + }, + { + "epoch": 2.7336796293696475, + "grad_norm": 0.4524764262786222, + "learning_rate": 1.0105894402381703e-05, + "loss": 0.1694, + "step": 2434 + }, + { + "epoch": 2.734802751649586, + "grad_norm": 0.4529695543319877, + "learning_rate": 1.0098050634014216e-05, + "loss": 0.1753, + "step": 2435 + }, + { + "epoch": 2.7359258739295242, + "grad_norm": 0.45224833550454835, + "learning_rate": 1.0090206805315087e-05, + "loss": 0.1688, + "step": 2436 + }, + { + "epoch": 2.737048996209462, + "grad_norm": 0.4559944373716082, + "learning_rate": 1.0082362921110721e-05, + "loss": 0.1715, + "step": 2437 + }, + { + "epoch": 2.7381721184894006, + "grad_norm": 0.45958405120251117, + "learning_rate": 1.0074518986227546e-05, + "loss": 0.17, + "step": 2438 + }, + { + "epoch": 2.739295240769339, + "grad_norm": 0.4490783585500566, + "learning_rate": 1.0066675005492017e-05, + "loss": 0.1715, + "step": 2439 + }, + { + "epoch": 2.740418363049277, + "grad_norm": 0.448796044847132, + "learning_rate": 1.0058830983730622e-05, + "loss": 0.178, + "step": 2440 + }, + { + "epoch": 2.7415414853292153, + "grad_norm": 0.45884028647372394, + "learning_rate": 1.0050986925769877e-05, + "loss": 0.1731, + "step": 2441 + }, + { + "epoch": 2.7426646076091536, + "grad_norm": 0.4426418856703806, + "learning_rate": 1.0043142836436316e-05, + "loss": 0.16, + "step": 2442 + }, + { + "epoch": 2.7437877298890916, + "grad_norm": 0.4678363877463838, + "learning_rate": 1.0035298720556493e-05, + "loss": 0.1859, + "step": 2443 + }, + { + "epoch": 2.74491085216903, + "grad_norm": 0.4676021495394005, + "learning_rate": 1.002745458295698e-05, + "loss": 0.1706, + "step": 2444 + }, + { + "epoch": 2.7460339744489684, + "grad_norm": 0.4623563788627261, + "learning_rate": 1.0019610428464354e-05, + "loss": 0.1755, + "step": 2445 + }, + { + "epoch": 2.7471570967289063, + "grad_norm": 0.4567998634977174, + "learning_rate": 1.001176626190522e-05, + "loss": 0.1749, + "step": 2446 + }, + { + "epoch": 2.7482802190088447, + "grad_norm": 0.47580917906104647, + "learning_rate": 1.0003922088106178e-05, + "loss": 0.1795, + "step": 2447 + }, + { + "epoch": 2.749403341288783, + "grad_norm": 0.4595058756812876, + "learning_rate": 9.996077911893829e-06, + "loss": 0.1758, + "step": 2448 + }, + { + "epoch": 2.750526463568721, + "grad_norm": 0.45863685177045854, + "learning_rate": 9.988233738094782e-06, + "loss": 0.177, + "step": 2449 + }, + { + "epoch": 2.7516495858486594, + "grad_norm": 0.4547228686150152, + "learning_rate": 9.980389571535647e-06, + "loss": 0.1738, + "step": 2450 + }, + { + "epoch": 2.7527727081285978, + "grad_norm": 0.4425914309842023, + "learning_rate": 9.972545417043024e-06, + "loss": 0.1591, + "step": 2451 + }, + { + "epoch": 2.7538958304085357, + "grad_norm": 0.4739331962858264, + "learning_rate": 9.964701279443509e-06, + "loss": 0.1862, + "step": 2452 + }, + { + "epoch": 2.755018952688474, + "grad_norm": 0.44719582544834424, + "learning_rate": 9.956857163563689e-06, + "loss": 0.1621, + "step": 2453 + }, + { + "epoch": 2.7561420749684125, + "grad_norm": 0.4493034917391583, + "learning_rate": 9.949013074230127e-06, + "loss": 0.1678, + "step": 2454 + }, + { + "epoch": 2.7572651972483504, + "grad_norm": 0.4341676937820874, + "learning_rate": 9.94116901626938e-06, + "loss": 0.1635, + "step": 2455 + }, + { + "epoch": 2.758388319528289, + "grad_norm": 0.44123424149038765, + "learning_rate": 9.933324994507984e-06, + "loss": 0.1617, + "step": 2456 + }, + { + "epoch": 2.7595114418082267, + "grad_norm": 0.4520594389268704, + "learning_rate": 9.925481013772456e-06, + "loss": 0.166, + "step": 2457 + }, + { + "epoch": 2.760634564088165, + "grad_norm": 0.47723497756567185, + "learning_rate": 9.91763707888928e-06, + "loss": 0.1865, + "step": 2458 + }, + { + "epoch": 2.7617576863681035, + "grad_norm": 0.4555339710437883, + "learning_rate": 9.909793194684914e-06, + "loss": 0.1806, + "step": 2459 + }, + { + "epoch": 2.7628808086480414, + "grad_norm": 0.4636447483901148, + "learning_rate": 9.901949365985787e-06, + "loss": 0.1672, + "step": 2460 + }, + { + "epoch": 2.76400393092798, + "grad_norm": 0.468526515916128, + "learning_rate": 9.894105597618297e-06, + "loss": 0.1739, + "step": 2461 + }, + { + "epoch": 2.7651270532079177, + "grad_norm": 0.4436410518618047, + "learning_rate": 9.886261894408798e-06, + "loss": 0.1604, + "step": 2462 + }, + { + "epoch": 2.766250175487856, + "grad_norm": 0.4584305461946224, + "learning_rate": 9.878418261183606e-06, + "loss": 0.1645, + "step": 2463 + }, + { + "epoch": 2.7673732977677945, + "grad_norm": 0.48434226395422153, + "learning_rate": 9.870574702768997e-06, + "loss": 0.1757, + "step": 2464 + }, + { + "epoch": 2.7684964200477324, + "grad_norm": 0.48156835406806114, + "learning_rate": 9.862731223991196e-06, + "loss": 0.1845, + "step": 2465 + }, + { + "epoch": 2.769619542327671, + "grad_norm": 0.44955147765140463, + "learning_rate": 9.854887829676382e-06, + "loss": 0.1654, + "step": 2466 + }, + { + "epoch": 2.770742664607609, + "grad_norm": 0.46769768486111885, + "learning_rate": 9.847044524650689e-06, + "loss": 0.1839, + "step": 2467 + }, + { + "epoch": 2.771865786887547, + "grad_norm": 0.4802433910241491, + "learning_rate": 9.839201313740179e-06, + "loss": 0.1848, + "step": 2468 + }, + { + "epoch": 2.7729889091674855, + "grad_norm": 0.4730216994614374, + "learning_rate": 9.83135820177087e-06, + "loss": 0.1784, + "step": 2469 + }, + { + "epoch": 2.774112031447424, + "grad_norm": 0.49525261555778016, + "learning_rate": 9.823515193568715e-06, + "loss": 0.1855, + "step": 2470 + }, + { + "epoch": 2.775235153727362, + "grad_norm": 0.44336826641873517, + "learning_rate": 9.815672293959605e-06, + "loss": 0.1696, + "step": 2471 + }, + { + "epoch": 2.7763582760073002, + "grad_norm": 0.4157328847494749, + "learning_rate": 9.807829507769362e-06, + "loss": 0.1611, + "step": 2472 + }, + { + "epoch": 2.7774813982872386, + "grad_norm": 0.42886202625170655, + "learning_rate": 9.799986839823736e-06, + "loss": 0.1618, + "step": 2473 + }, + { + "epoch": 2.7786045205671766, + "grad_norm": 0.4677846081299561, + "learning_rate": 9.792144294948408e-06, + "loss": 0.1748, + "step": 2474 + }, + { + "epoch": 2.779727642847115, + "grad_norm": 0.4583869711242806, + "learning_rate": 9.784301877968982e-06, + "loss": 0.1778, + "step": 2475 + }, + { + "epoch": 2.7808507651270533, + "grad_norm": 0.4495391147428708, + "learning_rate": 9.776459593710985e-06, + "loss": 0.1697, + "step": 2476 + }, + { + "epoch": 2.7819738874069913, + "grad_norm": 0.46075629132037005, + "learning_rate": 9.768617446999862e-06, + "loss": 0.1666, + "step": 2477 + }, + { + "epoch": 2.7830970096869296, + "grad_norm": 0.4673513087773124, + "learning_rate": 9.760775442660966e-06, + "loss": 0.1815, + "step": 2478 + }, + { + "epoch": 2.784220131966868, + "grad_norm": 0.4723634910358823, + "learning_rate": 9.752933585519578e-06, + "loss": 0.1796, + "step": 2479 + }, + { + "epoch": 2.785343254246806, + "grad_norm": 0.467537758740658, + "learning_rate": 9.74509188040087e-06, + "loss": 0.1734, + "step": 2480 + }, + { + "epoch": 2.7864663765267443, + "grad_norm": 0.4600429623850505, + "learning_rate": 9.737250332129932e-06, + "loss": 0.1653, + "step": 2481 + }, + { + "epoch": 2.7875894988066827, + "grad_norm": 0.43060840156496566, + "learning_rate": 9.72940894553176e-06, + "loss": 0.1608, + "step": 2482 + }, + { + "epoch": 2.7887126210866207, + "grad_norm": 0.4950921987658919, + "learning_rate": 9.721567725431239e-06, + "loss": 0.1915, + "step": 2483 + }, + { + "epoch": 2.789835743366559, + "grad_norm": 0.44612774116221526, + "learning_rate": 9.71372667665316e-06, + "loss": 0.1669, + "step": 2484 + }, + { + "epoch": 2.7909588656464974, + "grad_norm": 0.4585740919900049, + "learning_rate": 9.705885804022207e-06, + "loss": 0.1733, + "step": 2485 + }, + { + "epoch": 2.7920819879264354, + "grad_norm": 0.47309965981508717, + "learning_rate": 9.698045112362956e-06, + "loss": 0.1752, + "step": 2486 + }, + { + "epoch": 2.7932051102063737, + "grad_norm": 0.4541221114294698, + "learning_rate": 9.690204606499875e-06, + "loss": 0.1665, + "step": 2487 + }, + { + "epoch": 2.794328232486312, + "grad_norm": 0.4816033103876313, + "learning_rate": 9.682364291257304e-06, + "loss": 0.1854, + "step": 2488 + }, + { + "epoch": 2.79545135476625, + "grad_norm": 0.46392038210393416, + "learning_rate": 9.674524171459478e-06, + "loss": 0.1736, + "step": 2489 + }, + { + "epoch": 2.7965744770461884, + "grad_norm": 0.49408949325331897, + "learning_rate": 9.666684251930514e-06, + "loss": 0.187, + "step": 2490 + }, + { + "epoch": 2.797697599326127, + "grad_norm": 0.47463312580832867, + "learning_rate": 9.658844537494396e-06, + "loss": 0.1789, + "step": 2491 + }, + { + "epoch": 2.7988207216060648, + "grad_norm": 0.48036454400289674, + "learning_rate": 9.651005032974994e-06, + "loss": 0.178, + "step": 2492 + }, + { + "epoch": 2.799943843886003, + "grad_norm": 0.4386654951501184, + "learning_rate": 9.64316574319603e-06, + "loss": 0.1611, + "step": 2493 + }, + { + "epoch": 2.8010669661659415, + "grad_norm": 0.46695275500508854, + "learning_rate": 9.63532667298111e-06, + "loss": 0.1771, + "step": 2494 + }, + { + "epoch": 2.8021900884458795, + "grad_norm": 0.45981170005783556, + "learning_rate": 9.627487827153704e-06, + "loss": 0.1818, + "step": 2495 + }, + { + "epoch": 2.803313210725818, + "grad_norm": 0.44791412818213827, + "learning_rate": 9.619649210537136e-06, + "loss": 0.1686, + "step": 2496 + }, + { + "epoch": 2.8044363330057562, + "grad_norm": 0.4694605987355865, + "learning_rate": 9.6118108279546e-06, + "loss": 0.1804, + "step": 2497 + }, + { + "epoch": 2.805559455285694, + "grad_norm": 0.46461819232187124, + "learning_rate": 9.603972684229127e-06, + "loss": 0.1718, + "step": 2498 + }, + { + "epoch": 2.8066825775656326, + "grad_norm": 0.4640887745511831, + "learning_rate": 9.59613478418362e-06, + "loss": 0.1729, + "step": 2499 + }, + { + "epoch": 2.807805699845571, + "grad_norm": 0.46473184965851605, + "learning_rate": 9.588297132640824e-06, + "loss": 0.1799, + "step": 2500 + }, + { + "epoch": 2.808928822125509, + "grad_norm": 0.44859390204733884, + "learning_rate": 9.580459734423334e-06, + "loss": 0.1692, + "step": 2501 + }, + { + "epoch": 2.8100519444054473, + "grad_norm": 0.48966480162143305, + "learning_rate": 9.572622594353589e-06, + "loss": 0.1863, + "step": 2502 + }, + { + "epoch": 2.8111750666853856, + "grad_norm": 0.4707076422301409, + "learning_rate": 9.564785717253862e-06, + "loss": 0.1737, + "step": 2503 + }, + { + "epoch": 2.8122981889653236, + "grad_norm": 0.4798544695330848, + "learning_rate": 9.556949107946272e-06, + "loss": 0.1762, + "step": 2504 + }, + { + "epoch": 2.813421311245262, + "grad_norm": 0.4460329229033776, + "learning_rate": 9.549112771252771e-06, + "loss": 0.1645, + "step": 2505 + }, + { + "epoch": 2.8145444335252, + "grad_norm": 0.46898014918292236, + "learning_rate": 9.541276711995149e-06, + "loss": 0.1773, + "step": 2506 + }, + { + "epoch": 2.8156675558051383, + "grad_norm": 0.45833210070182195, + "learning_rate": 9.53344093499501e-06, + "loss": 0.1764, + "step": 2507 + }, + { + "epoch": 2.8167906780850767, + "grad_norm": 0.4479886546084605, + "learning_rate": 9.525605445073797e-06, + "loss": 0.1648, + "step": 2508 + }, + { + "epoch": 2.8179138003650146, + "grad_norm": 0.4701076235299984, + "learning_rate": 9.517770247052775e-06, + "loss": 0.1798, + "step": 2509 + }, + { + "epoch": 2.819036922644953, + "grad_norm": 0.44716178784872795, + "learning_rate": 9.509935345753026e-06, + "loss": 0.1668, + "step": 2510 + }, + { + "epoch": 2.8201600449248914, + "grad_norm": 0.4575598504302553, + "learning_rate": 9.502100745995456e-06, + "loss": 0.1781, + "step": 2511 + }, + { + "epoch": 2.8212831672048293, + "grad_norm": 0.46438759653279443, + "learning_rate": 9.494266452600771e-06, + "loss": 0.1759, + "step": 2512 + }, + { + "epoch": 2.8224062894847677, + "grad_norm": 0.4463213888843625, + "learning_rate": 9.486432470389505e-06, + "loss": 0.1739, + "step": 2513 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.44363386556020745, + "learning_rate": 9.47859880418199e-06, + "loss": 0.1677, + "step": 2514 + }, + { + "epoch": 2.824652534044644, + "grad_norm": 0.47574851732895207, + "learning_rate": 9.470765458798369e-06, + "loss": 0.1851, + "step": 2515 + }, + { + "epoch": 2.8257756563245824, + "grad_norm": 0.47371348230350735, + "learning_rate": 9.46293243905859e-06, + "loss": 0.1728, + "step": 2516 + }, + { + "epoch": 2.8268987786045203, + "grad_norm": 0.4529641523386059, + "learning_rate": 9.455099749782387e-06, + "loss": 0.171, + "step": 2517 + }, + { + "epoch": 2.8280219008844587, + "grad_norm": 0.46948443592441114, + "learning_rate": 9.447267395789304e-06, + "loss": 0.1727, + "step": 2518 + }, + { + "epoch": 2.829145023164397, + "grad_norm": 0.46792399231713533, + "learning_rate": 9.439435381898674e-06, + "loss": 0.1732, + "step": 2519 + }, + { + "epoch": 2.830268145444335, + "grad_norm": 0.46471285429284986, + "learning_rate": 9.431603712929623e-06, + "loss": 0.1792, + "step": 2520 + }, + { + "epoch": 2.8313912677242734, + "grad_norm": 0.4710096469718015, + "learning_rate": 9.423772393701064e-06, + "loss": 0.1761, + "step": 2521 + }, + { + "epoch": 2.832514390004212, + "grad_norm": 0.45657758770291, + "learning_rate": 9.415941429031693e-06, + "loss": 0.1747, + "step": 2522 + }, + { + "epoch": 2.8336375122841497, + "grad_norm": 0.4380363624369588, + "learning_rate": 9.408110823739985e-06, + "loss": 0.167, + "step": 2523 + }, + { + "epoch": 2.834760634564088, + "grad_norm": 0.4654221971780646, + "learning_rate": 9.400280582644204e-06, + "loss": 0.1704, + "step": 2524 + }, + { + "epoch": 2.8358837568440265, + "grad_norm": 0.4630864998764016, + "learning_rate": 9.392450710562377e-06, + "loss": 0.1747, + "step": 2525 + }, + { + "epoch": 2.8370068791239644, + "grad_norm": 0.47819793126694177, + "learning_rate": 9.384621212312316e-06, + "loss": 0.1806, + "step": 2526 + }, + { + "epoch": 2.838130001403903, + "grad_norm": 0.44989059188440433, + "learning_rate": 9.376792092711593e-06, + "loss": 0.1719, + "step": 2527 + }, + { + "epoch": 2.839253123683841, + "grad_norm": 0.4493159352390759, + "learning_rate": 9.368963356577554e-06, + "loss": 0.1627, + "step": 2528 + }, + { + "epoch": 2.840376245963779, + "grad_norm": 0.462068887667578, + "learning_rate": 9.361135008727304e-06, + "loss": 0.1744, + "step": 2529 + }, + { + "epoch": 2.8414993682437175, + "grad_norm": 0.46328463138927967, + "learning_rate": 9.353307053977717e-06, + "loss": 0.1698, + "step": 2530 + }, + { + "epoch": 2.842622490523656, + "grad_norm": 0.4535786753854389, + "learning_rate": 9.345479497145417e-06, + "loss": 0.1598, + "step": 2531 + }, + { + "epoch": 2.843745612803594, + "grad_norm": 0.4659782442541839, + "learning_rate": 9.337652343046782e-06, + "loss": 0.1829, + "step": 2532 + }, + { + "epoch": 2.844868735083532, + "grad_norm": 0.43264542173054715, + "learning_rate": 9.32982559649795e-06, + "loss": 0.1633, + "step": 2533 + }, + { + "epoch": 2.8459918573634706, + "grad_norm": 0.4467290239010384, + "learning_rate": 9.321999262314803e-06, + "loss": 0.1668, + "step": 2534 + }, + { + "epoch": 2.8471149796434085, + "grad_norm": 0.4546808256348493, + "learning_rate": 9.314173345312972e-06, + "loss": 0.1729, + "step": 2535 + }, + { + "epoch": 2.848238101923347, + "grad_norm": 0.46681169539201894, + "learning_rate": 9.30634785030783e-06, + "loss": 0.1768, + "step": 2536 + }, + { + "epoch": 2.8493612242032853, + "grad_norm": 0.48967009130593026, + "learning_rate": 9.298522782114488e-06, + "loss": 0.1876, + "step": 2537 + }, + { + "epoch": 2.8504843464832232, + "grad_norm": 0.4476752114365759, + "learning_rate": 9.290698145547796e-06, + "loss": 0.1624, + "step": 2538 + }, + { + "epoch": 2.8516074687631616, + "grad_norm": 0.478382573772851, + "learning_rate": 9.282873945422341e-06, + "loss": 0.1843, + "step": 2539 + }, + { + "epoch": 2.8527305910431, + "grad_norm": 0.45741308655522045, + "learning_rate": 9.27505018655244e-06, + "loss": 0.1733, + "step": 2540 + }, + { + "epoch": 2.853853713323038, + "grad_norm": 0.4683894832083663, + "learning_rate": 9.267226873752137e-06, + "loss": 0.1729, + "step": 2541 + }, + { + "epoch": 2.8549768356029763, + "grad_norm": 0.4621198618194314, + "learning_rate": 9.259404011835203e-06, + "loss": 0.1683, + "step": 2542 + }, + { + "epoch": 2.8560999578829147, + "grad_norm": 0.47870289959867185, + "learning_rate": 9.251581605615128e-06, + "loss": 0.1751, + "step": 2543 + }, + { + "epoch": 2.8572230801628526, + "grad_norm": 0.48939618598564266, + "learning_rate": 9.243759659905126e-06, + "loss": 0.1851, + "step": 2544 + }, + { + "epoch": 2.858346202442791, + "grad_norm": 0.48615941707120763, + "learning_rate": 9.235938179518131e-06, + "loss": 0.1825, + "step": 2545 + }, + { + "epoch": 2.8594693247227294, + "grad_norm": 0.46996292093877523, + "learning_rate": 9.228117169266782e-06, + "loss": 0.1706, + "step": 2546 + }, + { + "epoch": 2.8605924470026673, + "grad_norm": 0.4577345045610456, + "learning_rate": 9.22029663396343e-06, + "loss": 0.1731, + "step": 2547 + }, + { + "epoch": 2.8617155692826057, + "grad_norm": 0.4593857700548628, + "learning_rate": 9.21247657842014e-06, + "loss": 0.1711, + "step": 2548 + }, + { + "epoch": 2.862838691562544, + "grad_norm": 0.48560219199017984, + "learning_rate": 9.204657007448678e-06, + "loss": 0.1932, + "step": 2549 + }, + { + "epoch": 2.863961813842482, + "grad_norm": 0.4747666905904359, + "learning_rate": 9.196837925860516e-06, + "loss": 0.1844, + "step": 2550 + }, + { + "epoch": 2.8650849361224204, + "grad_norm": 0.4818639171590592, + "learning_rate": 9.189019338466812e-06, + "loss": 0.1791, + "step": 2551 + }, + { + "epoch": 2.866208058402359, + "grad_norm": 0.4643395199305748, + "learning_rate": 9.181201250078435e-06, + "loss": 0.1704, + "step": 2552 + }, + { + "epoch": 2.8673311806822968, + "grad_norm": 0.4611993802683832, + "learning_rate": 9.173383665505937e-06, + "loss": 0.1705, + "step": 2553 + }, + { + "epoch": 2.868454302962235, + "grad_norm": 0.47676505658180535, + "learning_rate": 9.165566589559568e-06, + "loss": 0.1831, + "step": 2554 + }, + { + "epoch": 2.8695774252421735, + "grad_norm": 0.4674787023765812, + "learning_rate": 9.15775002704926e-06, + "loss": 0.1632, + "step": 2555 + }, + { + "epoch": 2.8707005475221115, + "grad_norm": 0.4905453400165015, + "learning_rate": 9.14993398278463e-06, + "loss": 0.1819, + "step": 2556 + }, + { + "epoch": 2.87182366980205, + "grad_norm": 0.47733632103578205, + "learning_rate": 9.142118461574971e-06, + "loss": 0.1792, + "step": 2557 + }, + { + "epoch": 2.8729467920819878, + "grad_norm": 0.4558817946417188, + "learning_rate": 9.134303468229264e-06, + "loss": 0.1792, + "step": 2558 + }, + { + "epoch": 2.874069914361926, + "grad_norm": 0.47700983129939084, + "learning_rate": 9.12648900755616e-06, + "loss": 0.182, + "step": 2559 + }, + { + "epoch": 2.8751930366418645, + "grad_norm": 0.4553018015995702, + "learning_rate": 9.118675084363986e-06, + "loss": 0.1667, + "step": 2560 + }, + { + "epoch": 2.8763161589218025, + "grad_norm": 0.4593598102751128, + "learning_rate": 9.110861703460727e-06, + "loss": 0.1728, + "step": 2561 + }, + { + "epoch": 2.877439281201741, + "grad_norm": 0.4648771153530072, + "learning_rate": 9.103048869654047e-06, + "loss": 0.1728, + "step": 2562 + }, + { + "epoch": 2.878562403481679, + "grad_norm": 0.436076180749989, + "learning_rate": 9.095236587751267e-06, + "loss": 0.1615, + "step": 2563 + }, + { + "epoch": 2.879685525761617, + "grad_norm": 0.4734747981831071, + "learning_rate": 9.08742486255937e-06, + "loss": 0.1802, + "step": 2564 + }, + { + "epoch": 2.8808086480415556, + "grad_norm": 0.4877652162687277, + "learning_rate": 9.079613698885002e-06, + "loss": 0.1869, + "step": 2565 + }, + { + "epoch": 2.8819317703214935, + "grad_norm": 0.45765721191013503, + "learning_rate": 9.071803101534451e-06, + "loss": 0.1755, + "step": 2566 + }, + { + "epoch": 2.883054892601432, + "grad_norm": 0.45021196688829096, + "learning_rate": 9.063993075313666e-06, + "loss": 0.1727, + "step": 2567 + }, + { + "epoch": 2.8841780148813703, + "grad_norm": 0.4522674069841106, + "learning_rate": 9.056183625028243e-06, + "loss": 0.166, + "step": 2568 + }, + { + "epoch": 2.885301137161308, + "grad_norm": 0.46231761385594666, + "learning_rate": 9.04837475548342e-06, + "loss": 0.1808, + "step": 2569 + }, + { + "epoch": 2.8864242594412466, + "grad_norm": 0.47474127509623437, + "learning_rate": 9.040566471484085e-06, + "loss": 0.183, + "step": 2570 + }, + { + "epoch": 2.887547381721185, + "grad_norm": 0.45900525335603964, + "learning_rate": 9.032758777834754e-06, + "loss": 0.1737, + "step": 2571 + }, + { + "epoch": 2.888670504001123, + "grad_norm": 0.45812880138007356, + "learning_rate": 9.024951679339594e-06, + "loss": 0.1677, + "step": 2572 + }, + { + "epoch": 2.8897936262810613, + "grad_norm": 0.47586663132708334, + "learning_rate": 9.017145180802393e-06, + "loss": 0.1834, + "step": 2573 + }, + { + "epoch": 2.8909167485609997, + "grad_norm": 0.4116685169052402, + "learning_rate": 9.00933928702658e-06, + "loss": 0.1496, + "step": 2574 + }, + { + "epoch": 2.8920398708409376, + "grad_norm": 0.46861344574621616, + "learning_rate": 9.001534002815209e-06, + "loss": 0.1789, + "step": 2575 + }, + { + "epoch": 2.893162993120876, + "grad_norm": 0.46679304982614994, + "learning_rate": 8.993729332970948e-06, + "loss": 0.1742, + "step": 2576 + }, + { + "epoch": 2.8942861154008144, + "grad_norm": 0.4580429287886535, + "learning_rate": 8.985925282296105e-06, + "loss": 0.1838, + "step": 2577 + }, + { + "epoch": 2.8954092376807523, + "grad_norm": 0.4569743853099299, + "learning_rate": 8.978121855592593e-06, + "loss": 0.1761, + "step": 2578 + }, + { + "epoch": 2.8965323599606907, + "grad_norm": 0.4743176355550856, + "learning_rate": 8.970319057661954e-06, + "loss": 0.1831, + "step": 2579 + }, + { + "epoch": 2.897655482240629, + "grad_norm": 0.4553016611576038, + "learning_rate": 8.962516893305324e-06, + "loss": 0.1692, + "step": 2580 + }, + { + "epoch": 2.898778604520567, + "grad_norm": 0.45473502109690267, + "learning_rate": 8.954715367323468e-06, + "loss": 0.1678, + "step": 2581 + }, + { + "epoch": 2.8999017268005054, + "grad_norm": 0.4807942668241952, + "learning_rate": 8.946914484516748e-06, + "loss": 0.1807, + "step": 2582 + }, + { + "epoch": 2.901024849080444, + "grad_norm": 0.429941902019654, + "learning_rate": 8.939114249685135e-06, + "loss": 0.1516, + "step": 2583 + }, + { + "epoch": 2.9021479713603817, + "grad_norm": 0.4653157438770528, + "learning_rate": 8.931314667628201e-06, + "loss": 0.1767, + "step": 2584 + }, + { + "epoch": 2.90327109364032, + "grad_norm": 0.44312625971069464, + "learning_rate": 8.923515743145113e-06, + "loss": 0.1697, + "step": 2585 + }, + { + "epoch": 2.9043942159202585, + "grad_norm": 0.43811033758732193, + "learning_rate": 8.915717481034632e-06, + "loss": 0.1604, + "step": 2586 + }, + { + "epoch": 2.9055173382001964, + "grad_norm": 0.47216339814248476, + "learning_rate": 8.907919886095115e-06, + "loss": 0.1731, + "step": 2587 + }, + { + "epoch": 2.906640460480135, + "grad_norm": 0.4621783530580539, + "learning_rate": 8.900122963124513e-06, + "loss": 0.1759, + "step": 2588 + }, + { + "epoch": 2.907763582760073, + "grad_norm": 0.45497514323875493, + "learning_rate": 8.892326716920356e-06, + "loss": 0.1651, + "step": 2589 + }, + { + "epoch": 2.908886705040011, + "grad_norm": 0.457851254832388, + "learning_rate": 8.884531152279757e-06, + "loss": 0.1729, + "step": 2590 + }, + { + "epoch": 2.9100098273199495, + "grad_norm": 0.45822650406750187, + "learning_rate": 8.876736273999415e-06, + "loss": 0.1693, + "step": 2591 + }, + { + "epoch": 2.911132949599888, + "grad_norm": 0.46856109234379667, + "learning_rate": 8.868942086875605e-06, + "loss": 0.1848, + "step": 2592 + }, + { + "epoch": 2.912256071879826, + "grad_norm": 0.4576565366829723, + "learning_rate": 8.861148595704176e-06, + "loss": 0.1731, + "step": 2593 + }, + { + "epoch": 2.913379194159764, + "grad_norm": 0.4523806979663706, + "learning_rate": 8.853355805280553e-06, + "loss": 0.1744, + "step": 2594 + }, + { + "epoch": 2.9145023164397026, + "grad_norm": 0.45200684428131077, + "learning_rate": 8.845563720399715e-06, + "loss": 0.173, + "step": 2595 + }, + { + "epoch": 2.9156254387196405, + "grad_norm": 0.47579624922930075, + "learning_rate": 8.837772345856226e-06, + "loss": 0.1917, + "step": 2596 + }, + { + "epoch": 2.916748560999579, + "grad_norm": 0.45572922351885753, + "learning_rate": 8.829981686444201e-06, + "loss": 0.174, + "step": 2597 + }, + { + "epoch": 2.9178716832795173, + "grad_norm": 0.4638260376523015, + "learning_rate": 8.822191746957321e-06, + "loss": 0.1868, + "step": 2598 + }, + { + "epoch": 2.9189948055594552, + "grad_norm": 0.45148558403348366, + "learning_rate": 8.814402532188824e-06, + "loss": 0.1717, + "step": 2599 + }, + { + "epoch": 2.9201179278393936, + "grad_norm": 0.4573379490292585, + "learning_rate": 8.806614046931491e-06, + "loss": 0.1713, + "step": 2600 + }, + { + "epoch": 2.921241050119332, + "grad_norm": 0.4767926454940618, + "learning_rate": 8.79882629597767e-06, + "loss": 0.1763, + "step": 2601 + }, + { + "epoch": 2.92236417239927, + "grad_norm": 0.48398291984858927, + "learning_rate": 8.791039284119244e-06, + "loss": 0.1898, + "step": 2602 + }, + { + "epoch": 2.9234872946792083, + "grad_norm": 0.4645986763409629, + "learning_rate": 8.783253016147652e-06, + "loss": 0.1704, + "step": 2603 + }, + { + "epoch": 2.9246104169591467, + "grad_norm": 0.5308394707341152, + "learning_rate": 8.775467496853873e-06, + "loss": 0.191, + "step": 2604 + }, + { + "epoch": 2.9257335392390846, + "grad_norm": 0.4670451326042578, + "learning_rate": 8.767682731028415e-06, + "loss": 0.1711, + "step": 2605 + }, + { + "epoch": 2.926856661519023, + "grad_norm": 0.438698398612858, + "learning_rate": 8.759898723461333e-06, + "loss": 0.1644, + "step": 2606 + }, + { + "epoch": 2.927979783798961, + "grad_norm": 0.44189027854801244, + "learning_rate": 8.752115478942213e-06, + "loss": 0.1649, + "step": 2607 + }, + { + "epoch": 2.9291029060788993, + "grad_norm": 0.4581859363210317, + "learning_rate": 8.744333002260172e-06, + "loss": 0.1708, + "step": 2608 + }, + { + "epoch": 2.9302260283588377, + "grad_norm": 0.45676884931636114, + "learning_rate": 8.736551298203854e-06, + "loss": 0.1742, + "step": 2609 + }, + { + "epoch": 2.9313491506387757, + "grad_norm": 0.4497025780421319, + "learning_rate": 8.728770371561424e-06, + "loss": 0.1651, + "step": 2610 + }, + { + "epoch": 2.932472272918714, + "grad_norm": 0.4781813885457078, + "learning_rate": 8.720990227120575e-06, + "loss": 0.1815, + "step": 2611 + }, + { + "epoch": 2.933595395198652, + "grad_norm": 0.46022287061474587, + "learning_rate": 8.71321086966851e-06, + "loss": 0.1807, + "step": 2612 + }, + { + "epoch": 2.9347185174785904, + "grad_norm": 0.48446443471996553, + "learning_rate": 8.705432303991967e-06, + "loss": 0.1915, + "step": 2613 + }, + { + "epoch": 2.9358416397585287, + "grad_norm": 0.46344897666541895, + "learning_rate": 8.697654534877166e-06, + "loss": 0.1857, + "step": 2614 + }, + { + "epoch": 2.9369647620384667, + "grad_norm": 0.4425948152166924, + "learning_rate": 8.689877567109861e-06, + "loss": 0.1724, + "step": 2615 + }, + { + "epoch": 2.938087884318405, + "grad_norm": 0.44998509556224764, + "learning_rate": 8.682101405475308e-06, + "loss": 0.1718, + "step": 2616 + }, + { + "epoch": 2.9392110065983434, + "grad_norm": 0.4675922727964663, + "learning_rate": 8.674326054758261e-06, + "loss": 0.1771, + "step": 2617 + }, + { + "epoch": 2.9403341288782814, + "grad_norm": 0.47264154820508586, + "learning_rate": 8.666551519742988e-06, + "loss": 0.1816, + "step": 2618 + }, + { + "epoch": 2.9414572511582198, + "grad_norm": 0.46063885171682406, + "learning_rate": 8.658777805213233e-06, + "loss": 0.1744, + "step": 2619 + }, + { + "epoch": 2.942580373438158, + "grad_norm": 0.4568310611912791, + "learning_rate": 8.651004915952252e-06, + "loss": 0.1779, + "step": 2620 + }, + { + "epoch": 2.943703495718096, + "grad_norm": 0.48101099147671783, + "learning_rate": 8.643232856742794e-06, + "loss": 0.1912, + "step": 2621 + }, + { + "epoch": 2.9448266179980345, + "grad_norm": 0.46646815088946425, + "learning_rate": 8.635461632367087e-06, + "loss": 0.1859, + "step": 2622 + }, + { + "epoch": 2.945949740277973, + "grad_norm": 0.4601186273096568, + "learning_rate": 8.627691247606862e-06, + "loss": 0.1649, + "step": 2623 + }, + { + "epoch": 2.947072862557911, + "grad_norm": 0.6344634698370505, + "learning_rate": 8.619921707243308e-06, + "loss": 0.1859, + "step": 2624 + }, + { + "epoch": 2.948195984837849, + "grad_norm": 0.4461101591994598, + "learning_rate": 8.612153016057114e-06, + "loss": 0.1586, + "step": 2625 + }, + { + "epoch": 2.9493191071177876, + "grad_norm": 0.43938035234144796, + "learning_rate": 8.604385178828441e-06, + "loss": 0.165, + "step": 2626 + }, + { + "epoch": 2.9504422293977255, + "grad_norm": 0.44943044279325145, + "learning_rate": 8.596618200336925e-06, + "loss": 0.1764, + "step": 2627 + }, + { + "epoch": 2.951565351677664, + "grad_norm": 0.4746096642980948, + "learning_rate": 8.588852085361679e-06, + "loss": 0.1748, + "step": 2628 + }, + { + "epoch": 2.9526884739576023, + "grad_norm": 0.46112492600966043, + "learning_rate": 8.58108683868127e-06, + "loss": 0.1707, + "step": 2629 + }, + { + "epoch": 2.95381159623754, + "grad_norm": 0.45177789162071297, + "learning_rate": 8.573322465073746e-06, + "loss": 0.1726, + "step": 2630 + }, + { + "epoch": 2.9549347185174786, + "grad_norm": 0.450023349792469, + "learning_rate": 8.565558969316607e-06, + "loss": 0.1728, + "step": 2631 + }, + { + "epoch": 2.956057840797417, + "grad_norm": 0.4770351172578664, + "learning_rate": 8.557796356186818e-06, + "loss": 0.1877, + "step": 2632 + }, + { + "epoch": 2.957180963077355, + "grad_norm": 0.4513830973776379, + "learning_rate": 8.550034630460806e-06, + "loss": 0.1685, + "step": 2633 + }, + { + "epoch": 2.9583040853572933, + "grad_norm": 0.4857445983384114, + "learning_rate": 8.542273796914439e-06, + "loss": 0.1718, + "step": 2634 + }, + { + "epoch": 2.9594272076372317, + "grad_norm": 0.47068053305099244, + "learning_rate": 8.534513860323047e-06, + "loss": 0.1702, + "step": 2635 + }, + { + "epoch": 2.9605503299171696, + "grad_norm": 0.47923791043815955, + "learning_rate": 8.526754825461402e-06, + "loss": 0.1811, + "step": 2636 + }, + { + "epoch": 2.961673452197108, + "grad_norm": 0.4548065001435798, + "learning_rate": 8.518996697103726e-06, + "loss": 0.1756, + "step": 2637 + }, + { + "epoch": 2.9627965744770464, + "grad_norm": 0.4460233301614257, + "learning_rate": 8.511239480023686e-06, + "loss": 0.1625, + "step": 2638 + }, + { + "epoch": 2.9639196967569843, + "grad_norm": 0.4548163435033796, + "learning_rate": 8.50348317899437e-06, + "loss": 0.1658, + "step": 2639 + }, + { + "epoch": 2.9650428190369227, + "grad_norm": 0.4655678091479219, + "learning_rate": 8.495727798788323e-06, + "loss": 0.1683, + "step": 2640 + }, + { + "epoch": 2.966165941316861, + "grad_norm": 0.4680379062633164, + "learning_rate": 8.487973344177517e-06, + "loss": 0.1869, + "step": 2641 + }, + { + "epoch": 2.967289063596799, + "grad_norm": 0.44275214343881863, + "learning_rate": 8.48021981993335e-06, + "loss": 0.1735, + "step": 2642 + }, + { + "epoch": 2.9684121858767374, + "grad_norm": 0.4393359117417517, + "learning_rate": 8.472467230826656e-06, + "loss": 0.1596, + "step": 2643 + }, + { + "epoch": 2.9695353081566758, + "grad_norm": 0.4738828812631789, + "learning_rate": 8.464715581627682e-06, + "loss": 0.176, + "step": 2644 + }, + { + "epoch": 2.9706584304366137, + "grad_norm": 0.4573620252964248, + "learning_rate": 8.456964877106104e-06, + "loss": 0.1727, + "step": 2645 + }, + { + "epoch": 2.971781552716552, + "grad_norm": 0.46158734380298694, + "learning_rate": 8.44921512203102e-06, + "loss": 0.1759, + "step": 2646 + }, + { + "epoch": 2.9729046749964905, + "grad_norm": 0.4627319651256285, + "learning_rate": 8.441466321170935e-06, + "loss": 0.1755, + "step": 2647 + }, + { + "epoch": 2.9740277972764284, + "grad_norm": 0.45445850820324185, + "learning_rate": 8.433718479293777e-06, + "loss": 0.1807, + "step": 2648 + }, + { + "epoch": 2.975150919556367, + "grad_norm": 0.4428450885299262, + "learning_rate": 8.425971601166872e-06, + "loss": 0.1684, + "step": 2649 + }, + { + "epoch": 2.976274041836305, + "grad_norm": 0.4541319747937619, + "learning_rate": 8.418225691556962e-06, + "loss": 0.1805, + "step": 2650 + }, + { + "epoch": 2.977397164116243, + "grad_norm": 0.4573856493833637, + "learning_rate": 8.41048075523019e-06, + "loss": 0.1742, + "step": 2651 + }, + { + "epoch": 2.9785202863961815, + "grad_norm": 0.4574620336290775, + "learning_rate": 8.402736796952104e-06, + "loss": 0.1684, + "step": 2652 + }, + { + "epoch": 2.97964340867612, + "grad_norm": 0.4514989212744961, + "learning_rate": 8.39499382148764e-06, + "loss": 0.1741, + "step": 2653 + }, + { + "epoch": 2.980766530956058, + "grad_norm": 0.4559456519447681, + "learning_rate": 8.387251833601142e-06, + "loss": 0.1761, + "step": 2654 + }, + { + "epoch": 2.981889653235996, + "grad_norm": 0.45131114670300065, + "learning_rate": 8.379510838056338e-06, + "loss": 0.1718, + "step": 2655 + }, + { + "epoch": 2.983012775515934, + "grad_norm": 0.44966905961207143, + "learning_rate": 8.371770839616348e-06, + "loss": 0.1717, + "step": 2656 + }, + { + "epoch": 2.9841358977958725, + "grad_norm": 0.46120801970633646, + "learning_rate": 8.364031843043683e-06, + "loss": 0.1678, + "step": 2657 + }, + { + "epoch": 2.985259020075811, + "grad_norm": 0.4421962165877643, + "learning_rate": 8.356293853100223e-06, + "loss": 0.1606, + "step": 2658 + }, + { + "epoch": 2.986382142355749, + "grad_norm": 0.46999315292829197, + "learning_rate": 8.348556874547242e-06, + "loss": 0.181, + "step": 2659 + }, + { + "epoch": 2.987505264635687, + "grad_norm": 0.4506186892281384, + "learning_rate": 8.340820912145391e-06, + "loss": 0.1712, + "step": 2660 + }, + { + "epoch": 2.9886283869156256, + "grad_norm": 0.4626508396110205, + "learning_rate": 8.333085970654691e-06, + "loss": 0.1716, + "step": 2661 + }, + { + "epoch": 2.9897515091955635, + "grad_norm": 0.45107479921301236, + "learning_rate": 8.325352054834542e-06, + "loss": 0.1762, + "step": 2662 + }, + { + "epoch": 2.990874631475502, + "grad_norm": 0.4595705443642629, + "learning_rate": 8.317619169443696e-06, + "loss": 0.1746, + "step": 2663 + }, + { + "epoch": 2.99199775375544, + "grad_norm": 0.4694949935187253, + "learning_rate": 8.309887319240291e-06, + "loss": 0.1747, + "step": 2664 + }, + { + "epoch": 2.9931208760353782, + "grad_norm": 0.4822701328030329, + "learning_rate": 8.302156508981816e-06, + "loss": 0.177, + "step": 2665 + }, + { + "epoch": 2.9942439983153166, + "grad_norm": 0.4623869423886324, + "learning_rate": 8.294426743425125e-06, + "loss": 0.1653, + "step": 2666 + }, + { + "epoch": 2.9953671205952546, + "grad_norm": 0.48059683012598114, + "learning_rate": 8.286698027326432e-06, + "loss": 0.1763, + "step": 2667 + }, + { + "epoch": 2.996490242875193, + "grad_norm": 0.4600386216074317, + "learning_rate": 8.278970365441292e-06, + "loss": 0.1671, + "step": 2668 + }, + { + "epoch": 2.9976133651551313, + "grad_norm": 0.48001415671174696, + "learning_rate": 8.271243762524627e-06, + "loss": 0.1731, + "step": 2669 + }, + { + "epoch": 2.9987364874350693, + "grad_norm": 0.4566979760209381, + "learning_rate": 8.263518223330698e-06, + "loss": 0.1827, + "step": 2670 + }, + { + "epoch": 2.9998596097150076, + "grad_norm": 0.7596103631516063, + "learning_rate": 8.255793752613115e-06, + "loss": 0.2575, + "step": 2671 + }, + { + "epoch": 3.000982731994946, + "grad_norm": 0.6259154790515346, + "learning_rate": 8.248070355124832e-06, + "loss": 0.1158, + "step": 2672 + }, + { + "epoch": 3.002105854274884, + "grad_norm": 0.49503499354650105, + "learning_rate": 8.240348035618138e-06, + "loss": 0.0951, + "step": 2673 + }, + { + "epoch": 3.0032289765548223, + "grad_norm": 0.4242302786603668, + "learning_rate": 8.232626798844661e-06, + "loss": 0.0806, + "step": 2674 + }, + { + "epoch": 3.0043520988347607, + "grad_norm": 0.4269038054152872, + "learning_rate": 8.224906649555365e-06, + "loss": 0.0878, + "step": 2675 + }, + { + "epoch": 3.0054752211146987, + "grad_norm": 0.4530862959158299, + "learning_rate": 8.21718759250054e-06, + "loss": 0.0855, + "step": 2676 + }, + { + "epoch": 3.006598343394637, + "grad_norm": 0.4886548006885315, + "learning_rate": 8.209469632429811e-06, + "loss": 0.086, + "step": 2677 + }, + { + "epoch": 3.0077214656745754, + "grad_norm": 0.554545304546421, + "learning_rate": 8.201752774092118e-06, + "loss": 0.0994, + "step": 2678 + }, + { + "epoch": 3.0088445879545134, + "grad_norm": 0.5854424397857068, + "learning_rate": 8.194037022235732e-06, + "loss": 0.0839, + "step": 2679 + }, + { + "epoch": 3.0099677102344518, + "grad_norm": 0.5920483383090562, + "learning_rate": 8.18632238160824e-06, + "loss": 0.0873, + "step": 2680 + }, + { + "epoch": 3.01109083251439, + "grad_norm": 0.5324726059263992, + "learning_rate": 8.178608856956547e-06, + "loss": 0.0843, + "step": 2681 + }, + { + "epoch": 3.012213954794328, + "grad_norm": 0.4991909599622551, + "learning_rate": 8.17089645302687e-06, + "loss": 0.087, + "step": 2682 + }, + { + "epoch": 3.0133370770742665, + "grad_norm": 0.4832948640805594, + "learning_rate": 8.163185174564731e-06, + "loss": 0.0857, + "step": 2683 + }, + { + "epoch": 3.014460199354205, + "grad_norm": 0.4362830353220936, + "learning_rate": 8.155475026314966e-06, + "loss": 0.0814, + "step": 2684 + }, + { + "epoch": 3.0155833216341428, + "grad_norm": 0.424359667514001, + "learning_rate": 8.147766013021716e-06, + "loss": 0.0829, + "step": 2685 + }, + { + "epoch": 3.016706443914081, + "grad_norm": 0.41618233800829574, + "learning_rate": 8.140058139428425e-06, + "loss": 0.0832, + "step": 2686 + }, + { + "epoch": 3.0178295661940195, + "grad_norm": 0.40606151522127315, + "learning_rate": 8.132351410277824e-06, + "loss": 0.0858, + "step": 2687 + }, + { + "epoch": 3.0189526884739575, + "grad_norm": 0.42085251727822387, + "learning_rate": 8.124645830311954e-06, + "loss": 0.0868, + "step": 2688 + }, + { + "epoch": 3.020075810753896, + "grad_norm": 0.43662806253277353, + "learning_rate": 8.116941404272142e-06, + "loss": 0.0884, + "step": 2689 + }, + { + "epoch": 3.0211989330338342, + "grad_norm": 0.4174983822990382, + "learning_rate": 8.109238136899004e-06, + "loss": 0.0849, + "step": 2690 + }, + { + "epoch": 3.022322055313772, + "grad_norm": 0.41649346119332137, + "learning_rate": 8.101536032932452e-06, + "loss": 0.0791, + "step": 2691 + }, + { + "epoch": 3.0234451775937106, + "grad_norm": 0.4363115097166906, + "learning_rate": 8.093835097111668e-06, + "loss": 0.0842, + "step": 2692 + }, + { + "epoch": 3.024568299873649, + "grad_norm": 0.4507250434052767, + "learning_rate": 8.086135334175126e-06, + "loss": 0.0809, + "step": 2693 + }, + { + "epoch": 3.025691422153587, + "grad_norm": 0.46412219030100466, + "learning_rate": 8.078436748860572e-06, + "loss": 0.0808, + "step": 2694 + }, + { + "epoch": 3.0268145444335253, + "grad_norm": 0.46817713622059637, + "learning_rate": 8.070739345905032e-06, + "loss": 0.082, + "step": 2695 + }, + { + "epoch": 3.0279376667134636, + "grad_norm": 0.4359811445564712, + "learning_rate": 8.063043130044806e-06, + "loss": 0.0816, + "step": 2696 + }, + { + "epoch": 3.0290607889934016, + "grad_norm": 0.44282045477705045, + "learning_rate": 8.055348106015455e-06, + "loss": 0.0738, + "step": 2697 + }, + { + "epoch": 3.03018391127334, + "grad_norm": 0.4687677335027077, + "learning_rate": 8.047654278551814e-06, + "loss": 0.083, + "step": 2698 + }, + { + "epoch": 3.031307033553278, + "grad_norm": 0.45388563511970487, + "learning_rate": 8.03996165238798e-06, + "loss": 0.0811, + "step": 2699 + }, + { + "epoch": 3.0324301558332163, + "grad_norm": 0.4511628549306528, + "learning_rate": 8.032270232257312e-06, + "loss": 0.0811, + "step": 2700 + }, + { + "epoch": 3.0335532781131547, + "grad_norm": 0.4397177997054855, + "learning_rate": 8.024580022892427e-06, + "loss": 0.0836, + "step": 2701 + }, + { + "epoch": 3.0346764003930926, + "grad_norm": 0.43549667655174495, + "learning_rate": 8.01689102902519e-06, + "loss": 0.0772, + "step": 2702 + }, + { + "epoch": 3.035799522673031, + "grad_norm": 0.4197204578796909, + "learning_rate": 8.009203255386723e-06, + "loss": 0.0783, + "step": 2703 + }, + { + "epoch": 3.0369226449529694, + "grad_norm": 0.42838286805922127, + "learning_rate": 8.001516706707401e-06, + "loss": 0.0821, + "step": 2704 + }, + { + "epoch": 3.0380457672329073, + "grad_norm": 0.404870980751977, + "learning_rate": 7.993831387716844e-06, + "loss": 0.077, + "step": 2705 + }, + { + "epoch": 3.0391688895128457, + "grad_norm": 0.4167100368716409, + "learning_rate": 7.986147303143913e-06, + "loss": 0.0761, + "step": 2706 + }, + { + "epoch": 3.040292011792784, + "grad_norm": 0.40996511641383154, + "learning_rate": 7.978464457716704e-06, + "loss": 0.0802, + "step": 2707 + }, + { + "epoch": 3.041415134072722, + "grad_norm": 0.44035804475350526, + "learning_rate": 7.97078285616256e-06, + "loss": 0.0787, + "step": 2708 + }, + { + "epoch": 3.0425382563526604, + "grad_norm": 0.43789822457853483, + "learning_rate": 7.963102503208058e-06, + "loss": 0.0827, + "step": 2709 + }, + { + "epoch": 3.043661378632599, + "grad_norm": 0.4463729892875018, + "learning_rate": 7.955423403578998e-06, + "loss": 0.081, + "step": 2710 + }, + { + "epoch": 3.0447845009125367, + "grad_norm": 0.4593657665400022, + "learning_rate": 7.947745562000421e-06, + "loss": 0.0839, + "step": 2711 + }, + { + "epoch": 3.045907623192475, + "grad_norm": 0.4426263669678776, + "learning_rate": 7.940068983196581e-06, + "loss": 0.0793, + "step": 2712 + }, + { + "epoch": 3.0470307454724135, + "grad_norm": 0.4546022445193033, + "learning_rate": 7.932393671890965e-06, + "loss": 0.0823, + "step": 2713 + }, + { + "epoch": 3.0481538677523514, + "grad_norm": 0.46071246386796366, + "learning_rate": 7.924719632806274e-06, + "loss": 0.0843, + "step": 2714 + }, + { + "epoch": 3.04927699003229, + "grad_norm": 0.43222793934163595, + "learning_rate": 7.917046870664431e-06, + "loss": 0.0818, + "step": 2715 + }, + { + "epoch": 3.050400112312228, + "grad_norm": 0.4468470825780685, + "learning_rate": 7.909375390186572e-06, + "loss": 0.0812, + "step": 2716 + }, + { + "epoch": 3.051523234592166, + "grad_norm": 0.4288047135096686, + "learning_rate": 7.90170519609304e-06, + "loss": 0.0816, + "step": 2717 + }, + { + "epoch": 3.0526463568721045, + "grad_norm": 0.445145856226943, + "learning_rate": 7.894036293103393e-06, + "loss": 0.0844, + "step": 2718 + }, + { + "epoch": 3.053769479152043, + "grad_norm": 0.4565081459753225, + "learning_rate": 7.88636868593639e-06, + "loss": 0.0864, + "step": 2719 + }, + { + "epoch": 3.054892601431981, + "grad_norm": 0.4515132111689418, + "learning_rate": 7.878702379309992e-06, + "loss": 0.0874, + "step": 2720 + }, + { + "epoch": 3.056015723711919, + "grad_norm": 0.44344069780688466, + "learning_rate": 7.871037377941367e-06, + "loss": 0.0808, + "step": 2721 + }, + { + "epoch": 3.057138845991857, + "grad_norm": 0.448178193139511, + "learning_rate": 7.863373686546868e-06, + "loss": 0.0855, + "step": 2722 + }, + { + "epoch": 3.0582619682717955, + "grad_norm": 0.3940384087377325, + "learning_rate": 7.855711309842054e-06, + "loss": 0.0763, + "step": 2723 + }, + { + "epoch": 3.059385090551734, + "grad_norm": 0.45681289631612565, + "learning_rate": 7.848050252541666e-06, + "loss": 0.0845, + "step": 2724 + }, + { + "epoch": 3.060508212831672, + "grad_norm": 0.4362986797057327, + "learning_rate": 7.840390519359644e-06, + "loss": 0.0772, + "step": 2725 + }, + { + "epoch": 3.0616313351116102, + "grad_norm": 0.43138388625168317, + "learning_rate": 7.832732115009096e-06, + "loss": 0.0817, + "step": 2726 + }, + { + "epoch": 3.0627544573915486, + "grad_norm": 0.4242754319993129, + "learning_rate": 7.825075044202329e-06, + "loss": 0.079, + "step": 2727 + }, + { + "epoch": 3.0638775796714866, + "grad_norm": 0.4371286146302931, + "learning_rate": 7.817419311650819e-06, + "loss": 0.0835, + "step": 2728 + }, + { + "epoch": 3.065000701951425, + "grad_norm": 0.42440135053295946, + "learning_rate": 7.809764922065226e-06, + "loss": 0.0774, + "step": 2729 + }, + { + "epoch": 3.0661238242313633, + "grad_norm": 0.43591296320616574, + "learning_rate": 7.802111880155382e-06, + "loss": 0.0812, + "step": 2730 + }, + { + "epoch": 3.0672469465113013, + "grad_norm": 0.4264395948747793, + "learning_rate": 7.794460190630283e-06, + "loss": 0.0785, + "step": 2731 + }, + { + "epoch": 3.0683700687912396, + "grad_norm": 0.47067358685640387, + "learning_rate": 7.786809858198096e-06, + "loss": 0.0835, + "step": 2732 + }, + { + "epoch": 3.069493191071178, + "grad_norm": 0.45020780231345264, + "learning_rate": 7.779160887566161e-06, + "loss": 0.0842, + "step": 2733 + }, + { + "epoch": 3.070616313351116, + "grad_norm": 0.4598985349986955, + "learning_rate": 7.77151328344097e-06, + "loss": 0.0818, + "step": 2734 + }, + { + "epoch": 3.0717394356310543, + "grad_norm": 0.44764895115040065, + "learning_rate": 7.763867050528184e-06, + "loss": 0.0845, + "step": 2735 + }, + { + "epoch": 3.0728625579109927, + "grad_norm": 0.45919564184903605, + "learning_rate": 7.756222193532606e-06, + "loss": 0.078, + "step": 2736 + }, + { + "epoch": 3.0739856801909307, + "grad_norm": 0.43399382955613974, + "learning_rate": 7.748578717158204e-06, + "loss": 0.0796, + "step": 2737 + }, + { + "epoch": 3.075108802470869, + "grad_norm": 0.4415661509626846, + "learning_rate": 7.74093662610809e-06, + "loss": 0.081, + "step": 2738 + }, + { + "epoch": 3.0762319247508074, + "grad_norm": 0.4623493322073683, + "learning_rate": 7.733295925084534e-06, + "loss": 0.0787, + "step": 2739 + }, + { + "epoch": 3.0773550470307454, + "grad_norm": 0.42863153572886326, + "learning_rate": 7.725656618788938e-06, + "loss": 0.0822, + "step": 2740 + }, + { + "epoch": 3.0784781693106837, + "grad_norm": 0.4471791832208811, + "learning_rate": 7.718018711921852e-06, + "loss": 0.0832, + "step": 2741 + }, + { + "epoch": 3.079601291590622, + "grad_norm": 0.4478220162373831, + "learning_rate": 7.710382209182964e-06, + "loss": 0.0874, + "step": 2742 + }, + { + "epoch": 3.08072441387056, + "grad_norm": 0.4214162588600572, + "learning_rate": 7.702747115271098e-06, + "loss": 0.0826, + "step": 2743 + }, + { + "epoch": 3.0818475361504984, + "grad_norm": 0.4469938818454545, + "learning_rate": 7.695113434884214e-06, + "loss": 0.0804, + "step": 2744 + }, + { + "epoch": 3.082970658430437, + "grad_norm": 0.46167710102896964, + "learning_rate": 7.687481172719402e-06, + "loss": 0.0819, + "step": 2745 + }, + { + "epoch": 3.0840937807103748, + "grad_norm": 0.4605465891709998, + "learning_rate": 7.679850333472867e-06, + "loss": 0.0855, + "step": 2746 + }, + { + "epoch": 3.085216902990313, + "grad_norm": 0.43521315545514894, + "learning_rate": 7.672220921839955e-06, + "loss": 0.0736, + "step": 2747 + }, + { + "epoch": 3.086340025270251, + "grad_norm": 0.4604041931459674, + "learning_rate": 7.664592942515125e-06, + "loss": 0.0819, + "step": 2748 + }, + { + "epoch": 3.0874631475501895, + "grad_norm": 0.4352802112893712, + "learning_rate": 7.656966400191956e-06, + "loss": 0.0767, + "step": 2749 + }, + { + "epoch": 3.088586269830128, + "grad_norm": 0.43183285295275914, + "learning_rate": 7.649341299563151e-06, + "loss": 0.0815, + "step": 2750 + }, + { + "epoch": 3.089709392110066, + "grad_norm": 0.4398374191229837, + "learning_rate": 7.641717645320508e-06, + "loss": 0.0826, + "step": 2751 + }, + { + "epoch": 3.090832514390004, + "grad_norm": 0.44375949908128276, + "learning_rate": 7.634095442154949e-06, + "loss": 0.0796, + "step": 2752 + }, + { + "epoch": 3.0919556366699426, + "grad_norm": 0.4341552574208931, + "learning_rate": 7.626474694756501e-06, + "loss": 0.0798, + "step": 2753 + }, + { + "epoch": 3.0930787589498805, + "grad_norm": 0.45510308680572736, + "learning_rate": 7.6188554078142915e-06, + "loss": 0.0818, + "step": 2754 + }, + { + "epoch": 3.094201881229819, + "grad_norm": 0.4375905218338109, + "learning_rate": 7.611237586016558e-06, + "loss": 0.0811, + "step": 2755 + }, + { + "epoch": 3.0953250035097573, + "grad_norm": 0.4441137124518761, + "learning_rate": 7.60362123405062e-06, + "loss": 0.079, + "step": 2756 + }, + { + "epoch": 3.096448125789695, + "grad_norm": 0.4502413713509533, + "learning_rate": 7.596006356602908e-06, + "loss": 0.0797, + "step": 2757 + }, + { + "epoch": 3.0975712480696336, + "grad_norm": 0.42048708566816295, + "learning_rate": 7.58839295835894e-06, + "loss": 0.0769, + "step": 2758 + }, + { + "epoch": 3.098694370349572, + "grad_norm": 0.4573125387642322, + "learning_rate": 7.580781044003324e-06, + "loss": 0.0799, + "step": 2759 + }, + { + "epoch": 3.09981749262951, + "grad_norm": 0.506317107529294, + "learning_rate": 7.573170618219754e-06, + "loss": 0.0858, + "step": 2760 + }, + { + "epoch": 3.1009406149094483, + "grad_norm": 0.43658684401047776, + "learning_rate": 7.565561685691008e-06, + "loss": 0.0797, + "step": 2761 + }, + { + "epoch": 3.1020637371893867, + "grad_norm": 0.42157837097318585, + "learning_rate": 7.557954251098946e-06, + "loss": 0.0785, + "step": 2762 + }, + { + "epoch": 3.1031868594693246, + "grad_norm": 0.43891360296769083, + "learning_rate": 7.550348319124506e-06, + "loss": 0.0801, + "step": 2763 + }, + { + "epoch": 3.104309981749263, + "grad_norm": 0.4616891997168113, + "learning_rate": 7.5427438944477086e-06, + "loss": 0.0794, + "step": 2764 + }, + { + "epoch": 3.1054331040292014, + "grad_norm": 0.43768658044133474, + "learning_rate": 7.535140981747627e-06, + "loss": 0.0789, + "step": 2765 + }, + { + "epoch": 3.1065562263091393, + "grad_norm": 0.43116002218960375, + "learning_rate": 7.527539585702426e-06, + "loss": 0.0766, + "step": 2766 + }, + { + "epoch": 3.1076793485890777, + "grad_norm": 0.46226193562781304, + "learning_rate": 7.519939710989326e-06, + "loss": 0.0861, + "step": 2767 + }, + { + "epoch": 3.108802470869016, + "grad_norm": 0.4584385515521631, + "learning_rate": 7.512341362284612e-06, + "loss": 0.0824, + "step": 2768 + }, + { + "epoch": 3.109925593148954, + "grad_norm": 0.42541505578878597, + "learning_rate": 7.504744544263639e-06, + "loss": 0.0793, + "step": 2769 + }, + { + "epoch": 3.1110487154288924, + "grad_norm": 0.44149954225470445, + "learning_rate": 7.497149261600803e-06, + "loss": 0.0761, + "step": 2770 + }, + { + "epoch": 3.1121718377088303, + "grad_norm": 0.4367339709220696, + "learning_rate": 7.489555518969568e-06, + "loss": 0.0768, + "step": 2771 + }, + { + "epoch": 3.1132949599887687, + "grad_norm": 0.44918411334128344, + "learning_rate": 7.481963321042449e-06, + "loss": 0.0792, + "step": 2772 + }, + { + "epoch": 3.114418082268707, + "grad_norm": 0.44367325496090004, + "learning_rate": 7.474372672491008e-06, + "loss": 0.084, + "step": 2773 + }, + { + "epoch": 3.115541204548645, + "grad_norm": 0.4398077189907013, + "learning_rate": 7.4667835779858585e-06, + "loss": 0.0823, + "step": 2774 + }, + { + "epoch": 3.1166643268285834, + "grad_norm": 0.4690551629213953, + "learning_rate": 7.459196042196647e-06, + "loss": 0.08, + "step": 2775 + }, + { + "epoch": 3.117787449108522, + "grad_norm": 0.4473577302083683, + "learning_rate": 7.45161006979207e-06, + "loss": 0.0799, + "step": 2776 + }, + { + "epoch": 3.1189105713884597, + "grad_norm": 0.4388023068250219, + "learning_rate": 7.444025665439862e-06, + "loss": 0.0795, + "step": 2777 + }, + { + "epoch": 3.120033693668398, + "grad_norm": 0.4491722860474616, + "learning_rate": 7.43644283380679e-06, + "loss": 0.0777, + "step": 2778 + }, + { + "epoch": 3.1211568159483365, + "grad_norm": 0.45449915733821294, + "learning_rate": 7.428861579558653e-06, + "loss": 0.0792, + "step": 2779 + }, + { + "epoch": 3.1222799382282744, + "grad_norm": 0.44290763583095166, + "learning_rate": 7.42128190736028e-06, + "loss": 0.0767, + "step": 2780 + }, + { + "epoch": 3.123403060508213, + "grad_norm": 0.46051935717803083, + "learning_rate": 7.413703821875526e-06, + "loss": 0.0818, + "step": 2781 + }, + { + "epoch": 3.124526182788151, + "grad_norm": 0.4400112926339292, + "learning_rate": 7.406127327767272e-06, + "loss": 0.0804, + "step": 2782 + }, + { + "epoch": 3.125649305068089, + "grad_norm": 0.4589606315894283, + "learning_rate": 7.398552429697416e-06, + "loss": 0.0829, + "step": 2783 + }, + { + "epoch": 3.1267724273480275, + "grad_norm": 0.43320506570345907, + "learning_rate": 7.390979132326881e-06, + "loss": 0.0803, + "step": 2784 + }, + { + "epoch": 3.127895549627966, + "grad_norm": 0.43767705767402443, + "learning_rate": 7.383407440315595e-06, + "loss": 0.0797, + "step": 2785 + }, + { + "epoch": 3.129018671907904, + "grad_norm": 0.4551489448501023, + "learning_rate": 7.375837358322504e-06, + "loss": 0.0803, + "step": 2786 + }, + { + "epoch": 3.130141794187842, + "grad_norm": 0.4483555435289511, + "learning_rate": 7.368268891005565e-06, + "loss": 0.0851, + "step": 2787 + }, + { + "epoch": 3.1312649164677806, + "grad_norm": 0.44696608573573154, + "learning_rate": 7.360702043021738e-06, + "loss": 0.0824, + "step": 2788 + }, + { + "epoch": 3.1323880387477185, + "grad_norm": 0.4175887163998629, + "learning_rate": 7.353136819026991e-06, + "loss": 0.081, + "step": 2789 + }, + { + "epoch": 3.133511161027657, + "grad_norm": 0.42858708958351066, + "learning_rate": 7.345573223676284e-06, + "loss": 0.0793, + "step": 2790 + }, + { + "epoch": 3.1346342833075953, + "grad_norm": 0.4315573518809542, + "learning_rate": 7.338011261623583e-06, + "loss": 0.0749, + "step": 2791 + }, + { + "epoch": 3.1357574055875332, + "grad_norm": 0.46061705300883715, + "learning_rate": 7.330450937521847e-06, + "loss": 0.0802, + "step": 2792 + }, + { + "epoch": 3.1368805278674716, + "grad_norm": 0.45499828672626436, + "learning_rate": 7.322892256023025e-06, + "loss": 0.082, + "step": 2793 + }, + { + "epoch": 3.13800365014741, + "grad_norm": 0.45703139441971474, + "learning_rate": 7.315335221778064e-06, + "loss": 0.0837, + "step": 2794 + }, + { + "epoch": 3.139126772427348, + "grad_norm": 0.44422721534446546, + "learning_rate": 7.307779839436878e-06, + "loss": 0.0789, + "step": 2795 + }, + { + "epoch": 3.1402498947072863, + "grad_norm": 0.432747277056686, + "learning_rate": 7.300226113648384e-06, + "loss": 0.0814, + "step": 2796 + }, + { + "epoch": 3.1413730169872247, + "grad_norm": 0.4354763866952526, + "learning_rate": 7.292674049060473e-06, + "loss": 0.0809, + "step": 2797 + }, + { + "epoch": 3.1424961392671626, + "grad_norm": 0.42129390641539777, + "learning_rate": 7.285123650320017e-06, + "loss": 0.0759, + "step": 2798 + }, + { + "epoch": 3.143619261547101, + "grad_norm": 0.4299331690685501, + "learning_rate": 7.277574922072847e-06, + "loss": 0.0811, + "step": 2799 + }, + { + "epoch": 3.144742383827039, + "grad_norm": 0.4224270029033599, + "learning_rate": 7.27002786896379e-06, + "loss": 0.076, + "step": 2800 + }, + { + "epoch": 3.1458655061069773, + "grad_norm": 0.42177062287547257, + "learning_rate": 7.262482495636627e-06, + "loss": 0.08, + "step": 2801 + }, + { + "epoch": 3.1469886283869157, + "grad_norm": 0.44438476148551986, + "learning_rate": 7.254938806734108e-06, + "loss": 0.0826, + "step": 2802 + }, + { + "epoch": 3.1481117506668537, + "grad_norm": 0.44860619869928176, + "learning_rate": 7.247396806897953e-06, + "loss": 0.0795, + "step": 2803 + }, + { + "epoch": 3.149234872946792, + "grad_norm": 0.45127852568778315, + "learning_rate": 7.239856500768829e-06, + "loss": 0.084, + "step": 2804 + }, + { + "epoch": 3.1503579952267304, + "grad_norm": 0.4516223814508983, + "learning_rate": 7.232317892986376e-06, + "loss": 0.0781, + "step": 2805 + }, + { + "epoch": 3.1514811175066684, + "grad_norm": 0.45355410301989446, + "learning_rate": 7.2247809881891805e-06, + "loss": 0.0839, + "step": 2806 + }, + { + "epoch": 3.1526042397866068, + "grad_norm": 0.4772117453912696, + "learning_rate": 7.217245791014782e-06, + "loss": 0.0804, + "step": 2807 + }, + { + "epoch": 3.153727362066545, + "grad_norm": 0.4208304719348036, + "learning_rate": 7.2097123060996764e-06, + "loss": 0.0701, + "step": 2808 + }, + { + "epoch": 3.154850484346483, + "grad_norm": 0.44934404159731467, + "learning_rate": 7.20218053807929e-06, + "loss": 0.0809, + "step": 2809 + }, + { + "epoch": 3.1559736066264215, + "grad_norm": 0.4313912104136727, + "learning_rate": 7.194650491588007e-06, + "loss": 0.0771, + "step": 2810 + }, + { + "epoch": 3.15709672890636, + "grad_norm": 0.4758022567163495, + "learning_rate": 7.1871221712591474e-06, + "loss": 0.087, + "step": 2811 + }, + { + "epoch": 3.1582198511862978, + "grad_norm": 0.4145445019750945, + "learning_rate": 7.179595581724971e-06, + "loss": 0.0699, + "step": 2812 + }, + { + "epoch": 3.159342973466236, + "grad_norm": 0.44264907459716, + "learning_rate": 7.1720707276166736e-06, + "loss": 0.0758, + "step": 2813 + }, + { + "epoch": 3.1604660957461745, + "grad_norm": 0.4482812080110177, + "learning_rate": 7.164547613564374e-06, + "loss": 0.0803, + "step": 2814 + }, + { + "epoch": 3.1615892180261125, + "grad_norm": 0.43483597301420784, + "learning_rate": 7.157026244197132e-06, + "loss": 0.0806, + "step": 2815 + }, + { + "epoch": 3.162712340306051, + "grad_norm": 0.4958849116746515, + "learning_rate": 7.149506624142924e-06, + "loss": 0.0844, + "step": 2816 + }, + { + "epoch": 3.1638354625859892, + "grad_norm": 0.4628156948836129, + "learning_rate": 7.1419887580286615e-06, + "loss": 0.0792, + "step": 2817 + }, + { + "epoch": 3.164958584865927, + "grad_norm": 0.44851043959036796, + "learning_rate": 7.13447265048017e-06, + "loss": 0.0815, + "step": 2818 + }, + { + "epoch": 3.1660817071458656, + "grad_norm": 0.4600530891320242, + "learning_rate": 7.126958306122186e-06, + "loss": 0.0847, + "step": 2819 + }, + { + "epoch": 3.1672048294258035, + "grad_norm": 0.43714011172496053, + "learning_rate": 7.119445729578374e-06, + "loss": 0.0779, + "step": 2820 + }, + { + "epoch": 3.168327951705742, + "grad_norm": 0.4453922999565407, + "learning_rate": 7.111934925471302e-06, + "loss": 0.0812, + "step": 2821 + }, + { + "epoch": 3.1694510739856803, + "grad_norm": 0.45599177872389823, + "learning_rate": 7.1044258984224524e-06, + "loss": 0.081, + "step": 2822 + }, + { + "epoch": 3.170574196265618, + "grad_norm": 0.44089126347694846, + "learning_rate": 7.096918653052214e-06, + "loss": 0.0771, + "step": 2823 + }, + { + "epoch": 3.1716973185455566, + "grad_norm": 0.43082375817441887, + "learning_rate": 7.089413193979874e-06, + "loss": 0.0778, + "step": 2824 + }, + { + "epoch": 3.172820440825495, + "grad_norm": 0.4425588712821237, + "learning_rate": 7.081909525823625e-06, + "loss": 0.0821, + "step": 2825 + }, + { + "epoch": 3.173943563105433, + "grad_norm": 0.4382912863146195, + "learning_rate": 7.074407653200559e-06, + "loss": 0.0807, + "step": 2826 + }, + { + "epoch": 3.1750666853853713, + "grad_norm": 0.45424971866156333, + "learning_rate": 7.066907580726656e-06, + "loss": 0.082, + "step": 2827 + }, + { + "epoch": 3.1761898076653097, + "grad_norm": 0.41419138678049927, + "learning_rate": 7.059409313016798e-06, + "loss": 0.072, + "step": 2828 + }, + { + "epoch": 3.1773129299452476, + "grad_norm": 0.44000690960252986, + "learning_rate": 7.051912854684748e-06, + "loss": 0.0808, + "step": 2829 + }, + { + "epoch": 3.178436052225186, + "grad_norm": 0.44970359244435276, + "learning_rate": 7.044418210343161e-06, + "loss": 0.086, + "step": 2830 + }, + { + "epoch": 3.1795591745051244, + "grad_norm": 0.4443948815798674, + "learning_rate": 7.036925384603572e-06, + "loss": 0.079, + "step": 2831 + }, + { + "epoch": 3.1806822967850623, + "grad_norm": 0.4554250454666911, + "learning_rate": 7.029434382076408e-06, + "loss": 0.0836, + "step": 2832 + }, + { + "epoch": 3.1818054190650007, + "grad_norm": 0.4354644467232646, + "learning_rate": 7.021945207370951e-06, + "loss": 0.0763, + "step": 2833 + }, + { + "epoch": 3.182928541344939, + "grad_norm": 0.44780451430095414, + "learning_rate": 7.014457865095382e-06, + "loss": 0.0798, + "step": 2834 + }, + { + "epoch": 3.184051663624877, + "grad_norm": 0.44114833638296336, + "learning_rate": 7.006972359856743e-06, + "loss": 0.0837, + "step": 2835 + }, + { + "epoch": 3.1851747859048154, + "grad_norm": 0.4571651151911437, + "learning_rate": 6.999488696260947e-06, + "loss": 0.0843, + "step": 2836 + }, + { + "epoch": 3.186297908184754, + "grad_norm": 0.45247999536502137, + "learning_rate": 6.99200687891278e-06, + "loss": 0.0864, + "step": 2837 + }, + { + "epoch": 3.1874210304646917, + "grad_norm": 0.4439747981411603, + "learning_rate": 6.984526912415878e-06, + "loss": 0.0815, + "step": 2838 + }, + { + "epoch": 3.18854415274463, + "grad_norm": 0.43498072881051186, + "learning_rate": 6.97704880137275e-06, + "loss": 0.0822, + "step": 2839 + }, + { + "epoch": 3.1896672750245685, + "grad_norm": 0.44156927868282614, + "learning_rate": 6.96957255038476e-06, + "loss": 0.0736, + "step": 2840 + }, + { + "epoch": 3.1907903973045064, + "grad_norm": 0.4575306361920231, + "learning_rate": 6.962098164052129e-06, + "loss": 0.0792, + "step": 2841 + }, + { + "epoch": 3.191913519584445, + "grad_norm": 0.4503884973223931, + "learning_rate": 6.954625646973931e-06, + "loss": 0.0811, + "step": 2842 + }, + { + "epoch": 3.193036641864383, + "grad_norm": 0.4419481555121357, + "learning_rate": 6.947155003748083e-06, + "loss": 0.0812, + "step": 2843 + }, + { + "epoch": 3.194159764144321, + "grad_norm": 0.4517228406602635, + "learning_rate": 6.939686238971356e-06, + "loss": 0.0784, + "step": 2844 + }, + { + "epoch": 3.1952828864242595, + "grad_norm": 0.44544948534956574, + "learning_rate": 6.932219357239362e-06, + "loss": 0.0816, + "step": 2845 + }, + { + "epoch": 3.196406008704198, + "grad_norm": 0.445448430389794, + "learning_rate": 6.924754363146559e-06, + "loss": 0.0897, + "step": 2846 + }, + { + "epoch": 3.197529130984136, + "grad_norm": 0.4576732225943825, + "learning_rate": 6.917291261286239e-06, + "loss": 0.0908, + "step": 2847 + }, + { + "epoch": 3.198652253264074, + "grad_norm": 0.4479613547612505, + "learning_rate": 6.909830056250527e-06, + "loss": 0.0784, + "step": 2848 + }, + { + "epoch": 3.1997753755440126, + "grad_norm": 0.4335107505865122, + "learning_rate": 6.902370752630387e-06, + "loss": 0.0815, + "step": 2849 + }, + { + "epoch": 3.2008984978239505, + "grad_norm": 0.42987674122594727, + "learning_rate": 6.894913355015611e-06, + "loss": 0.0803, + "step": 2850 + }, + { + "epoch": 3.202021620103889, + "grad_norm": 0.4432200727897701, + "learning_rate": 6.887457867994819e-06, + "loss": 0.078, + "step": 2851 + }, + { + "epoch": 3.203144742383827, + "grad_norm": 0.4229390999012076, + "learning_rate": 6.880004296155456e-06, + "loss": 0.077, + "step": 2852 + }, + { + "epoch": 3.2042678646637652, + "grad_norm": 0.4424366303313029, + "learning_rate": 6.872552644083779e-06, + "loss": 0.0838, + "step": 2853 + }, + { + "epoch": 3.2053909869437036, + "grad_norm": 0.4483537337256006, + "learning_rate": 6.865102916364876e-06, + "loss": 0.0813, + "step": 2854 + }, + { + "epoch": 3.2065141092236416, + "grad_norm": 0.4459517380181533, + "learning_rate": 6.857655117582647e-06, + "loss": 0.0856, + "step": 2855 + }, + { + "epoch": 3.20763723150358, + "grad_norm": 0.45366324323089674, + "learning_rate": 6.850209252319804e-06, + "loss": 0.0814, + "step": 2856 + }, + { + "epoch": 3.2087603537835183, + "grad_norm": 0.46685596344158564, + "learning_rate": 6.842765325157874e-06, + "loss": 0.0829, + "step": 2857 + }, + { + "epoch": 3.2098834760634563, + "grad_norm": 0.45685853191410364, + "learning_rate": 6.83532334067718e-06, + "loss": 0.0819, + "step": 2858 + }, + { + "epoch": 3.2110065983433946, + "grad_norm": 0.450175843547502, + "learning_rate": 6.8278833034568595e-06, + "loss": 0.082, + "step": 2859 + }, + { + "epoch": 3.212129720623333, + "grad_norm": 0.45393030619312513, + "learning_rate": 6.820445218074849e-06, + "loss": 0.0838, + "step": 2860 + }, + { + "epoch": 3.213252842903271, + "grad_norm": 0.4349893777321262, + "learning_rate": 6.813009089107887e-06, + "loss": 0.0756, + "step": 2861 + }, + { + "epoch": 3.2143759651832093, + "grad_norm": 0.42814105018993454, + "learning_rate": 6.805574921131506e-06, + "loss": 0.0789, + "step": 2862 + }, + { + "epoch": 3.2154990874631477, + "grad_norm": 0.4538034830964177, + "learning_rate": 6.798142718720027e-06, + "loss": 0.0844, + "step": 2863 + }, + { + "epoch": 3.2166222097430857, + "grad_norm": 0.45315166207455276, + "learning_rate": 6.790712486446567e-06, + "loss": 0.0855, + "step": 2864 + }, + { + "epoch": 3.217745332023024, + "grad_norm": 0.4527713015472594, + "learning_rate": 6.783284228883029e-06, + "loss": 0.0794, + "step": 2865 + }, + { + "epoch": 3.2188684543029624, + "grad_norm": 0.4292889184279728, + "learning_rate": 6.775857950600107e-06, + "loss": 0.0723, + "step": 2866 + }, + { + "epoch": 3.2199915765829004, + "grad_norm": 0.4384777852700717, + "learning_rate": 6.768433656167267e-06, + "loss": 0.0801, + "step": 2867 + }, + { + "epoch": 3.2211146988628387, + "grad_norm": 0.4749472746328939, + "learning_rate": 6.76101135015276e-06, + "loss": 0.0853, + "step": 2868 + }, + { + "epoch": 3.222237821142777, + "grad_norm": 0.447676493402192, + "learning_rate": 6.7535910371236105e-06, + "loss": 0.085, + "step": 2869 + }, + { + "epoch": 3.223360943422715, + "grad_norm": 0.4448626847467645, + "learning_rate": 6.746172721645625e-06, + "loss": 0.0845, + "step": 2870 + }, + { + "epoch": 3.2244840657026534, + "grad_norm": 0.4369101325966336, + "learning_rate": 6.73875640828337e-06, + "loss": 0.0789, + "step": 2871 + }, + { + "epoch": 3.2256071879825914, + "grad_norm": 0.4650100486596405, + "learning_rate": 6.731342101600183e-06, + "loss": 0.0846, + "step": 2872 + }, + { + "epoch": 3.2267303102625298, + "grad_norm": 0.45213980309912594, + "learning_rate": 6.7239298061581716e-06, + "loss": 0.0821, + "step": 2873 + }, + { + "epoch": 3.227853432542468, + "grad_norm": 0.4660318930820804, + "learning_rate": 6.716519526518201e-06, + "loss": 0.0803, + "step": 2874 + }, + { + "epoch": 3.228976554822406, + "grad_norm": 0.43454024938308855, + "learning_rate": 6.7091112672399e-06, + "loss": 0.0792, + "step": 2875 + }, + { + "epoch": 3.2300996771023445, + "grad_norm": 0.44078038350930815, + "learning_rate": 6.701705032881654e-06, + "loss": 0.0798, + "step": 2876 + }, + { + "epoch": 3.231222799382283, + "grad_norm": 0.452985339480439, + "learning_rate": 6.694300828000594e-06, + "loss": 0.0811, + "step": 2877 + }, + { + "epoch": 3.232345921662221, + "grad_norm": 0.4432797452038033, + "learning_rate": 6.686898657152612e-06, + "loss": 0.0768, + "step": 2878 + }, + { + "epoch": 3.233469043942159, + "grad_norm": 0.43857516156481385, + "learning_rate": 6.679498524892345e-06, + "loss": 0.0776, + "step": 2879 + }, + { + "epoch": 3.2345921662220976, + "grad_norm": 0.46446080341677276, + "learning_rate": 6.672100435773176e-06, + "loss": 0.0847, + "step": 2880 + }, + { + "epoch": 3.2357152885020355, + "grad_norm": 0.45625351695476635, + "learning_rate": 6.664704394347235e-06, + "loss": 0.0803, + "step": 2881 + }, + { + "epoch": 3.236838410781974, + "grad_norm": 0.45358260601300854, + "learning_rate": 6.657310405165379e-06, + "loss": 0.0794, + "step": 2882 + }, + { + "epoch": 3.2379615330619123, + "grad_norm": 0.430765669099815, + "learning_rate": 6.649918472777216e-06, + "loss": 0.078, + "step": 2883 + }, + { + "epoch": 3.23908465534185, + "grad_norm": 0.4417554716567722, + "learning_rate": 6.642528601731082e-06, + "loss": 0.0799, + "step": 2884 + }, + { + "epoch": 3.2402077776217886, + "grad_norm": 0.4426439861111362, + "learning_rate": 6.6351407965740465e-06, + "loss": 0.0736, + "step": 2885 + }, + { + "epoch": 3.241330899901727, + "grad_norm": 0.4461344386327623, + "learning_rate": 6.627755061851911e-06, + "loss": 0.0826, + "step": 2886 + }, + { + "epoch": 3.242454022181665, + "grad_norm": 0.4428801843373427, + "learning_rate": 6.620371402109195e-06, + "loss": 0.083, + "step": 2887 + }, + { + "epoch": 3.2435771444616033, + "grad_norm": 0.45311188824038895, + "learning_rate": 6.612989821889144e-06, + "loss": 0.0877, + "step": 2888 + }, + { + "epoch": 3.2447002667415417, + "grad_norm": 0.43522073849830734, + "learning_rate": 6.605610325733728e-06, + "loss": 0.0761, + "step": 2889 + }, + { + "epoch": 3.2458233890214796, + "grad_norm": 0.43937974707906036, + "learning_rate": 6.5982329181836325e-06, + "loss": 0.078, + "step": 2890 + }, + { + "epoch": 3.246946511301418, + "grad_norm": 0.44838131831874334, + "learning_rate": 6.590857603778259e-06, + "loss": 0.0765, + "step": 2891 + }, + { + "epoch": 3.2480696335813564, + "grad_norm": 0.4487567179771303, + "learning_rate": 6.583484387055716e-06, + "loss": 0.0829, + "step": 2892 + }, + { + "epoch": 3.2491927558612943, + "grad_norm": 0.43973073043498445, + "learning_rate": 6.5761132725528265e-06, + "loss": 0.0775, + "step": 2893 + }, + { + "epoch": 3.2503158781412327, + "grad_norm": 0.44761002978377296, + "learning_rate": 6.568744264805118e-06, + "loss": 0.0888, + "step": 2894 + }, + { + "epoch": 3.251439000421171, + "grad_norm": 0.4314843009567722, + "learning_rate": 6.561377368346824e-06, + "loss": 0.0752, + "step": 2895 + }, + { + "epoch": 3.252562122701109, + "grad_norm": 0.4484597216270316, + "learning_rate": 6.554012587710879e-06, + "loss": 0.0786, + "step": 2896 + }, + { + "epoch": 3.2536852449810474, + "grad_norm": 0.4533967440227061, + "learning_rate": 6.546649927428905e-06, + "loss": 0.084, + "step": 2897 + }, + { + "epoch": 3.2548083672609858, + "grad_norm": 0.43537375245345705, + "learning_rate": 6.539289392031234e-06, + "loss": 0.0737, + "step": 2898 + }, + { + "epoch": 3.2559314895409237, + "grad_norm": 0.44392098653435624, + "learning_rate": 6.531930986046884e-06, + "loss": 0.0798, + "step": 2899 + }, + { + "epoch": 3.257054611820862, + "grad_norm": 0.43078694542955465, + "learning_rate": 6.524574714003562e-06, + "loss": 0.0787, + "step": 2900 + }, + { + "epoch": 3.2581777341008005, + "grad_norm": 0.4525304342259154, + "learning_rate": 6.517220580427669e-06, + "loss": 0.0852, + "step": 2901 + }, + { + "epoch": 3.2593008563807384, + "grad_norm": 0.44680111766540903, + "learning_rate": 6.509868589844274e-06, + "loss": 0.0732, + "step": 2902 + }, + { + "epoch": 3.260423978660677, + "grad_norm": 0.44095965893761685, + "learning_rate": 6.502518746777143e-06, + "loss": 0.085, + "step": 2903 + }, + { + "epoch": 3.2615471009406147, + "grad_norm": 0.45173240369816303, + "learning_rate": 6.495171055748714e-06, + "loss": 0.0794, + "step": 2904 + }, + { + "epoch": 3.262670223220553, + "grad_norm": 0.4439061819150142, + "learning_rate": 6.487825521280109e-06, + "loss": 0.0826, + "step": 2905 + }, + { + "epoch": 3.2637933455004915, + "grad_norm": 0.4334545536696232, + "learning_rate": 6.480482147891106e-06, + "loss": 0.0801, + "step": 2906 + }, + { + "epoch": 3.2649164677804294, + "grad_norm": 0.4329477851970838, + "learning_rate": 6.473140940100169e-06, + "loss": 0.0822, + "step": 2907 + }, + { + "epoch": 3.266039590060368, + "grad_norm": 0.44476467976568695, + "learning_rate": 6.4658019024244214e-06, + "loss": 0.0816, + "step": 2908 + }, + { + "epoch": 3.267162712340306, + "grad_norm": 0.4373699550356578, + "learning_rate": 6.458465039379655e-06, + "loss": 0.0749, + "step": 2909 + }, + { + "epoch": 3.268285834620244, + "grad_norm": 0.4289188369981385, + "learning_rate": 6.451130355480326e-06, + "loss": 0.0787, + "step": 2910 + }, + { + "epoch": 3.2694089569001825, + "grad_norm": 0.46145266949341135, + "learning_rate": 6.44379785523954e-06, + "loss": 0.0805, + "step": 2911 + }, + { + "epoch": 3.270532079180121, + "grad_norm": 0.4319621203979465, + "learning_rate": 6.4364675431690684e-06, + "loss": 0.0735, + "step": 2912 + }, + { + "epoch": 3.271655201460059, + "grad_norm": 0.4507565266528019, + "learning_rate": 6.429139423779332e-06, + "loss": 0.0809, + "step": 2913 + }, + { + "epoch": 3.272778323739997, + "grad_norm": 0.44904183335465914, + "learning_rate": 6.421813501579403e-06, + "loss": 0.0818, + "step": 2914 + }, + { + "epoch": 3.2739014460199356, + "grad_norm": 0.44237241058884846, + "learning_rate": 6.414489781077009e-06, + "loss": 0.076, + "step": 2915 + }, + { + "epoch": 3.2750245682998735, + "grad_norm": 0.45079073834235045, + "learning_rate": 6.407168266778503e-06, + "loss": 0.0807, + "step": 2916 + }, + { + "epoch": 3.276147690579812, + "grad_norm": 0.4325942699296943, + "learning_rate": 6.399848963188902e-06, + "loss": 0.0752, + "step": 2917 + }, + { + "epoch": 3.27727081285975, + "grad_norm": 0.4436273348713645, + "learning_rate": 6.392531874811849e-06, + "loss": 0.0759, + "step": 2918 + }, + { + "epoch": 3.2783939351396882, + "grad_norm": 0.4399049256998244, + "learning_rate": 6.385217006149633e-06, + "loss": 0.0768, + "step": 2919 + }, + { + "epoch": 3.2795170574196266, + "grad_norm": 0.45422810539143077, + "learning_rate": 6.3779043617031775e-06, + "loss": 0.081, + "step": 2920 + }, + { + "epoch": 3.2806401796995646, + "grad_norm": 0.4643323939153561, + "learning_rate": 6.370593945972022e-06, + "loss": 0.0824, + "step": 2921 + }, + { + "epoch": 3.281763301979503, + "grad_norm": 0.4384243413962703, + "learning_rate": 6.363285763454352e-06, + "loss": 0.0777, + "step": 2922 + }, + { + "epoch": 3.2828864242594413, + "grad_norm": 0.44988902779488643, + "learning_rate": 6.355979818646972e-06, + "loss": 0.0797, + "step": 2923 + }, + { + "epoch": 3.2840095465393793, + "grad_norm": 0.4533162798690284, + "learning_rate": 6.34867611604531e-06, + "loss": 0.0842, + "step": 2924 + }, + { + "epoch": 3.2851326688193176, + "grad_norm": 0.4470213268981242, + "learning_rate": 6.341374660143419e-06, + "loss": 0.0846, + "step": 2925 + }, + { + "epoch": 3.286255791099256, + "grad_norm": 0.44475456824817117, + "learning_rate": 6.334075455433957e-06, + "loss": 0.0848, + "step": 2926 + }, + { + "epoch": 3.287378913379194, + "grad_norm": 0.4345189413360516, + "learning_rate": 6.326778506408209e-06, + "loss": 0.0763, + "step": 2927 + }, + { + "epoch": 3.2885020356591323, + "grad_norm": 0.4443083844205599, + "learning_rate": 6.319483817556067e-06, + "loss": 0.0836, + "step": 2928 + }, + { + "epoch": 3.2896251579390707, + "grad_norm": 0.44868349523571427, + "learning_rate": 6.312191393366036e-06, + "loss": 0.0838, + "step": 2929 + }, + { + "epoch": 3.2907482802190087, + "grad_norm": 0.43446703024226885, + "learning_rate": 6.304901238325224e-06, + "loss": 0.0802, + "step": 2930 + }, + { + "epoch": 3.291871402498947, + "grad_norm": 0.4482045860075501, + "learning_rate": 6.297613356919341e-06, + "loss": 0.0808, + "step": 2931 + }, + { + "epoch": 3.2929945247788854, + "grad_norm": 0.4427022564744176, + "learning_rate": 6.290327753632705e-06, + "loss": 0.08, + "step": 2932 + }, + { + "epoch": 3.2941176470588234, + "grad_norm": 0.45857038552731477, + "learning_rate": 6.283044432948222e-06, + "loss": 0.0791, + "step": 2933 + }, + { + "epoch": 3.2952407693387618, + "grad_norm": 0.4495217764887851, + "learning_rate": 6.275763399347403e-06, + "loss": 0.0809, + "step": 2934 + }, + { + "epoch": 3.2963638916187, + "grad_norm": 0.4377261844495198, + "learning_rate": 6.268484657310351e-06, + "loss": 0.0771, + "step": 2935 + }, + { + "epoch": 3.297487013898638, + "grad_norm": 0.4693294460359561, + "learning_rate": 6.26120821131575e-06, + "loss": 0.0896, + "step": 2936 + }, + { + "epoch": 3.2986101361785765, + "grad_norm": 0.4548392670428416, + "learning_rate": 6.25393406584088e-06, + "loss": 0.0804, + "step": 2937 + }, + { + "epoch": 3.299733258458515, + "grad_norm": 0.4338443235373701, + "learning_rate": 6.246662225361603e-06, + "loss": 0.0758, + "step": 2938 + }, + { + "epoch": 3.3008563807384528, + "grad_norm": 0.4094192153958026, + "learning_rate": 6.239392694352362e-06, + "loss": 0.0681, + "step": 2939 + }, + { + "epoch": 3.301979503018391, + "grad_norm": 0.46354022190108407, + "learning_rate": 6.232125477286184e-06, + "loss": 0.0901, + "step": 2940 + }, + { + "epoch": 3.3031026252983295, + "grad_norm": 0.43420174617897006, + "learning_rate": 6.224860578634659e-06, + "loss": 0.0784, + "step": 2941 + }, + { + "epoch": 3.3042257475782675, + "grad_norm": 0.4497917258205283, + "learning_rate": 6.217598002867965e-06, + "loss": 0.0847, + "step": 2942 + }, + { + "epoch": 3.305348869858206, + "grad_norm": 0.44585339258016715, + "learning_rate": 6.210337754454842e-06, + "loss": 0.0809, + "step": 2943 + }, + { + "epoch": 3.3064719921381442, + "grad_norm": 0.4134829693581107, + "learning_rate": 6.203079837862607e-06, + "loss": 0.0766, + "step": 2944 + }, + { + "epoch": 3.307595114418082, + "grad_norm": 0.4509752340647484, + "learning_rate": 6.195824257557126e-06, + "loss": 0.0861, + "step": 2945 + }, + { + "epoch": 3.3087182366980206, + "grad_norm": 0.4664138175037376, + "learning_rate": 6.188571018002843e-06, + "loss": 0.0853, + "step": 2946 + }, + { + "epoch": 3.309841358977959, + "grad_norm": 0.4426772347810475, + "learning_rate": 6.181320123662755e-06, + "loss": 0.0755, + "step": 2947 + }, + { + "epoch": 3.310964481257897, + "grad_norm": 0.4456457712399848, + "learning_rate": 6.174071578998419e-06, + "loss": 0.0787, + "step": 2948 + }, + { + "epoch": 3.3120876035378353, + "grad_norm": 0.4375785593556192, + "learning_rate": 6.166825388469946e-06, + "loss": 0.0796, + "step": 2949 + }, + { + "epoch": 3.3132107258177736, + "grad_norm": 0.45268483396403963, + "learning_rate": 6.159581556535989e-06, + "loss": 0.084, + "step": 2950 + }, + { + "epoch": 3.3143338480977116, + "grad_norm": 0.42407864679075813, + "learning_rate": 6.152340087653762e-06, + "loss": 0.0794, + "step": 2951 + }, + { + "epoch": 3.31545697037765, + "grad_norm": 0.4509299352401977, + "learning_rate": 6.145100986279021e-06, + "loss": 0.0861, + "step": 2952 + }, + { + "epoch": 3.316580092657588, + "grad_norm": 0.43447301128730714, + "learning_rate": 6.137864256866065e-06, + "loss": 0.084, + "step": 2953 + }, + { + "epoch": 3.3177032149375263, + "grad_norm": 0.4325020912385428, + "learning_rate": 6.130629903867734e-06, + "loss": 0.0799, + "step": 2954 + }, + { + "epoch": 3.3188263372174647, + "grad_norm": 0.43598451645884106, + "learning_rate": 6.123397931735402e-06, + "loss": 0.0776, + "step": 2955 + }, + { + "epoch": 3.3199494594974026, + "grad_norm": 0.4469736893147317, + "learning_rate": 6.116168344918982e-06, + "loss": 0.0831, + "step": 2956 + }, + { + "epoch": 3.321072581777341, + "grad_norm": 0.4626495409312179, + "learning_rate": 6.10894114786692e-06, + "loss": 0.085, + "step": 2957 + }, + { + "epoch": 3.3221957040572794, + "grad_norm": 0.4474334248967847, + "learning_rate": 6.101716345026189e-06, + "loss": 0.0818, + "step": 2958 + }, + { + "epoch": 3.3233188263372173, + "grad_norm": 0.4512283659979245, + "learning_rate": 6.094493940842293e-06, + "loss": 0.0859, + "step": 2959 + }, + { + "epoch": 3.3244419486171557, + "grad_norm": 0.4505970094609106, + "learning_rate": 6.08727393975925e-06, + "loss": 0.0783, + "step": 2960 + }, + { + "epoch": 3.325565070897094, + "grad_norm": 0.4250274459193349, + "learning_rate": 6.080056346219608e-06, + "loss": 0.0816, + "step": 2961 + }, + { + "epoch": 3.326688193177032, + "grad_norm": 0.47814033258276745, + "learning_rate": 6.072841164664433e-06, + "loss": 0.0821, + "step": 2962 + }, + { + "epoch": 3.3278113154569704, + "grad_norm": 0.45580838414860525, + "learning_rate": 6.065628399533307e-06, + "loss": 0.0825, + "step": 2963 + }, + { + "epoch": 3.328934437736909, + "grad_norm": 0.4281087882366829, + "learning_rate": 6.058418055264328e-06, + "loss": 0.0767, + "step": 2964 + }, + { + "epoch": 3.3300575600168467, + "grad_norm": 0.459821896563383, + "learning_rate": 6.051210136294089e-06, + "loss": 0.0909, + "step": 2965 + }, + { + "epoch": 3.331180682296785, + "grad_norm": 0.43302194673000116, + "learning_rate": 6.044004647057709e-06, + "loss": 0.0772, + "step": 2966 + }, + { + "epoch": 3.3323038045767235, + "grad_norm": 0.44404264162176665, + "learning_rate": 6.036801591988802e-06, + "loss": 0.0793, + "step": 2967 + }, + { + "epoch": 3.3334269268566614, + "grad_norm": 0.4387838908116998, + "learning_rate": 6.0296009755194875e-06, + "loss": 0.0801, + "step": 2968 + }, + { + "epoch": 3.3345500491366, + "grad_norm": 0.4532205199732577, + "learning_rate": 6.022402802080392e-06, + "loss": 0.0835, + "step": 2969 + }, + { + "epoch": 3.3356731714165377, + "grad_norm": 0.4445884667652232, + "learning_rate": 6.015207076100618e-06, + "loss": 0.0841, + "step": 2970 + }, + { + "epoch": 3.336796293696476, + "grad_norm": 0.4396390636961917, + "learning_rate": 6.00801380200778e-06, + "loss": 0.0764, + "step": 2971 + }, + { + "epoch": 3.3379194159764145, + "grad_norm": 0.4140514704401163, + "learning_rate": 6.000822984227981e-06, + "loss": 0.0726, + "step": 2972 + }, + { + "epoch": 3.3390425382563524, + "grad_norm": 0.4482407427874633, + "learning_rate": 5.993634627185807e-06, + "loss": 0.0846, + "step": 2973 + }, + { + "epoch": 3.340165660536291, + "grad_norm": 0.4736302287268389, + "learning_rate": 5.986448735304339e-06, + "loss": 0.08, + "step": 2974 + }, + { + "epoch": 3.341288782816229, + "grad_norm": 0.45919790518048875, + "learning_rate": 5.979265313005128e-06, + "loss": 0.0821, + "step": 2975 + }, + { + "epoch": 3.342411905096167, + "grad_norm": 0.4370562219096713, + "learning_rate": 5.972084364708217e-06, + "loss": 0.082, + "step": 2976 + }, + { + "epoch": 3.3435350273761055, + "grad_norm": 0.4497513684415123, + "learning_rate": 5.9649058948321225e-06, + "loss": 0.0794, + "step": 2977 + }, + { + "epoch": 3.344658149656044, + "grad_norm": 0.45727145539332165, + "learning_rate": 5.957729907793837e-06, + "loss": 0.0749, + "step": 2978 + }, + { + "epoch": 3.345781271935982, + "grad_norm": 0.4601952594639126, + "learning_rate": 5.950556408008818e-06, + "loss": 0.0856, + "step": 2979 + }, + { + "epoch": 3.3469043942159202, + "grad_norm": 0.43940530937369787, + "learning_rate": 5.943385399891004e-06, + "loss": 0.077, + "step": 2980 + }, + { + "epoch": 3.3480275164958586, + "grad_norm": 0.463228542311302, + "learning_rate": 5.9362168878527944e-06, + "loss": 0.0894, + "step": 2981 + }, + { + "epoch": 3.3491506387757966, + "grad_norm": 0.44397605607111024, + "learning_rate": 5.929050876305056e-06, + "loss": 0.075, + "step": 2982 + }, + { + "epoch": 3.350273761055735, + "grad_norm": 0.4474755147706091, + "learning_rate": 5.921887369657113e-06, + "loss": 0.0823, + "step": 2983 + }, + { + "epoch": 3.3513968833356733, + "grad_norm": 0.43075761139504265, + "learning_rate": 5.914726372316747e-06, + "loss": 0.0797, + "step": 2984 + }, + { + "epoch": 3.3525200056156113, + "grad_norm": 0.4551763638672436, + "learning_rate": 5.9075678886902e-06, + "loss": 0.0875, + "step": 2985 + }, + { + "epoch": 3.3536431278955496, + "grad_norm": 0.4372584770340282, + "learning_rate": 5.900411923182166e-06, + "loss": 0.0797, + "step": 2986 + }, + { + "epoch": 3.354766250175488, + "grad_norm": 0.42796756138216163, + "learning_rate": 5.893258480195789e-06, + "loss": 0.0859, + "step": 2987 + }, + { + "epoch": 3.355889372455426, + "grad_norm": 0.4621345317165231, + "learning_rate": 5.886107564132667e-06, + "loss": 0.0861, + "step": 2988 + }, + { + "epoch": 3.3570124947353643, + "grad_norm": 0.4416529130015374, + "learning_rate": 5.878959179392828e-06, + "loss": 0.0821, + "step": 2989 + }, + { + "epoch": 3.3581356170153027, + "grad_norm": 0.44707394897677344, + "learning_rate": 5.871813330374756e-06, + "loss": 0.0764, + "step": 2990 + }, + { + "epoch": 3.3592587392952407, + "grad_norm": 0.4674292247994994, + "learning_rate": 5.86467002147537e-06, + "loss": 0.1027, + "step": 2991 + }, + { + "epoch": 3.360381861575179, + "grad_norm": 0.4473398857345525, + "learning_rate": 5.857529257090027e-06, + "loss": 0.0808, + "step": 2992 + }, + { + "epoch": 3.3615049838551174, + "grad_norm": 0.4474406567869844, + "learning_rate": 5.85039104161252e-06, + "loss": 0.0791, + "step": 2993 + }, + { + "epoch": 3.3626281061350554, + "grad_norm": 0.4284252832694052, + "learning_rate": 5.843255379435069e-06, + "loss": 0.0805, + "step": 2994 + }, + { + "epoch": 3.3637512284149937, + "grad_norm": 0.4470373964616598, + "learning_rate": 5.8361222749483246e-06, + "loss": 0.0811, + "step": 2995 + }, + { + "epoch": 3.364874350694932, + "grad_norm": 0.45169365813301077, + "learning_rate": 5.8289917325413655e-06, + "loss": 0.0809, + "step": 2996 + }, + { + "epoch": 3.36599747297487, + "grad_norm": 0.45414154287290753, + "learning_rate": 5.821863756601694e-06, + "loss": 0.0827, + "step": 2997 + }, + { + "epoch": 3.3671205952548084, + "grad_norm": 0.4542864047987009, + "learning_rate": 5.814738351515234e-06, + "loss": 0.0901, + "step": 2998 + }, + { + "epoch": 3.368243717534747, + "grad_norm": 0.4403191156268337, + "learning_rate": 5.807615521666321e-06, + "loss": 0.0737, + "step": 2999 + }, + { + "epoch": 3.3693668398146848, + "grad_norm": 0.47140768833660335, + "learning_rate": 5.800495271437712e-06, + "loss": 0.0857, + "step": 3000 + }, + { + "epoch": 3.370489962094623, + "grad_norm": 0.44449931199515863, + "learning_rate": 5.793377605210575e-06, + "loss": 0.0763, + "step": 3001 + }, + { + "epoch": 3.371613084374561, + "grad_norm": 0.43375063637654476, + "learning_rate": 5.786262527364489e-06, + "loss": 0.0768, + "step": 3002 + }, + { + "epoch": 3.3727362066544995, + "grad_norm": 0.4634645012272231, + "learning_rate": 5.779150042277445e-06, + "loss": 0.0832, + "step": 3003 + }, + { + "epoch": 3.373859328934438, + "grad_norm": 0.46021930500381747, + "learning_rate": 5.7720401543258245e-06, + "loss": 0.0864, + "step": 3004 + }, + { + "epoch": 3.374982451214376, + "grad_norm": 0.45355421975815485, + "learning_rate": 5.764932867884423e-06, + "loss": 0.0819, + "step": 3005 + }, + { + "epoch": 3.376105573494314, + "grad_norm": 0.4399739477662287, + "learning_rate": 5.757828187326433e-06, + "loss": 0.0783, + "step": 3006 + }, + { + "epoch": 3.3772286957742526, + "grad_norm": 0.43934576191385377, + "learning_rate": 5.750726117023442e-06, + "loss": 0.0769, + "step": 3007 + }, + { + "epoch": 3.3783518180541905, + "grad_norm": 0.4567949271653509, + "learning_rate": 5.743626661345433e-06, + "loss": 0.0824, + "step": 3008 + }, + { + "epoch": 3.379474940334129, + "grad_norm": 0.4424452473412179, + "learning_rate": 5.736529824660778e-06, + "loss": 0.0812, + "step": 3009 + }, + { + "epoch": 3.3805980626140673, + "grad_norm": 0.44503663152596046, + "learning_rate": 5.729435611336239e-06, + "loss": 0.0794, + "step": 3010 + }, + { + "epoch": 3.381721184894005, + "grad_norm": 0.44895576795697817, + "learning_rate": 5.722344025736965e-06, + "loss": 0.0821, + "step": 3011 + }, + { + "epoch": 3.3828443071739436, + "grad_norm": 0.4744276589880692, + "learning_rate": 5.715255072226489e-06, + "loss": 0.0914, + "step": 3012 + }, + { + "epoch": 3.383967429453882, + "grad_norm": 0.45473296934213203, + "learning_rate": 5.708168755166714e-06, + "loss": 0.0762, + "step": 3013 + }, + { + "epoch": 3.38509055173382, + "grad_norm": 0.4669408291943065, + "learning_rate": 5.701085078917934e-06, + "loss": 0.0812, + "step": 3014 + }, + { + "epoch": 3.3862136740137583, + "grad_norm": 0.4985094979807337, + "learning_rate": 5.694004047838812e-06, + "loss": 0.081, + "step": 3015 + }, + { + "epoch": 3.3873367962936967, + "grad_norm": 0.4664501659830471, + "learning_rate": 5.686925666286385e-06, + "loss": 0.0782, + "step": 3016 + }, + { + "epoch": 3.3884599185736346, + "grad_norm": 0.4814941024166765, + "learning_rate": 5.679849938616062e-06, + "loss": 0.0907, + "step": 3017 + }, + { + "epoch": 3.389583040853573, + "grad_norm": 0.4481377810964817, + "learning_rate": 5.672776869181609e-06, + "loss": 0.079, + "step": 3018 + }, + { + "epoch": 3.390706163133511, + "grad_norm": 0.4104819286802798, + "learning_rate": 5.665706462335167e-06, + "loss": 0.0704, + "step": 3019 + }, + { + "epoch": 3.3918292854134493, + "grad_norm": 0.4308997552132428, + "learning_rate": 5.658638722427237e-06, + "loss": 0.08, + "step": 3020 + }, + { + "epoch": 3.3929524076933877, + "grad_norm": 0.44189383865557846, + "learning_rate": 5.651573653806675e-06, + "loss": 0.0785, + "step": 3021 + }, + { + "epoch": 3.3940755299733256, + "grad_norm": 0.45421284153104713, + "learning_rate": 5.6445112608207e-06, + "loss": 0.078, + "step": 3022 + }, + { + "epoch": 3.395198652253264, + "grad_norm": 0.4813721347164326, + "learning_rate": 5.6374515478148714e-06, + "loss": 0.0932, + "step": 3023 + }, + { + "epoch": 3.3963217745332024, + "grad_norm": 0.44235907345540504, + "learning_rate": 5.630394519133114e-06, + "loss": 0.0824, + "step": 3024 + }, + { + "epoch": 3.3974448968131403, + "grad_norm": 0.4338332189073094, + "learning_rate": 5.6233401791176946e-06, + "loss": 0.0786, + "step": 3025 + }, + { + "epoch": 3.3985680190930787, + "grad_norm": 0.4394631553406978, + "learning_rate": 5.616288532109225e-06, + "loss": 0.0794, + "step": 3026 + }, + { + "epoch": 3.399691141373017, + "grad_norm": 0.44813371027964505, + "learning_rate": 5.609239582446666e-06, + "loss": 0.0836, + "step": 3027 + }, + { + "epoch": 3.400814263652955, + "grad_norm": 0.45706449566282054, + "learning_rate": 5.602193334467307e-06, + "loss": 0.0845, + "step": 3028 + }, + { + "epoch": 3.4019373859328934, + "grad_norm": 0.4739118872159749, + "learning_rate": 5.595149792506785e-06, + "loss": 0.0884, + "step": 3029 + }, + { + "epoch": 3.403060508212832, + "grad_norm": 0.4456277523613716, + "learning_rate": 5.588108960899069e-06, + "loss": 0.0781, + "step": 3030 + }, + { + "epoch": 3.4041836304927697, + "grad_norm": 0.45013136112068963, + "learning_rate": 5.58107084397646e-06, + "loss": 0.0792, + "step": 3031 + }, + { + "epoch": 3.405306752772708, + "grad_norm": 0.46808037494992766, + "learning_rate": 5.574035446069593e-06, + "loss": 0.0827, + "step": 3032 + }, + { + "epoch": 3.4064298750526465, + "grad_norm": 0.44644484340636836, + "learning_rate": 5.567002771507416e-06, + "loss": 0.0778, + "step": 3033 + }, + { + "epoch": 3.4075529973325844, + "grad_norm": 0.42953138306404914, + "learning_rate": 5.559972824617217e-06, + "loss": 0.0798, + "step": 3034 + }, + { + "epoch": 3.408676119612523, + "grad_norm": 0.4336870120406191, + "learning_rate": 5.552945609724601e-06, + "loss": 0.0834, + "step": 3035 + }, + { + "epoch": 3.409799241892461, + "grad_norm": 0.4533973433553709, + "learning_rate": 5.545921131153487e-06, + "loss": 0.0876, + "step": 3036 + }, + { + "epoch": 3.410922364172399, + "grad_norm": 0.5136103261731987, + "learning_rate": 5.538899393226122e-06, + "loss": 0.0853, + "step": 3037 + }, + { + "epoch": 3.4120454864523375, + "grad_norm": 0.4320180339485597, + "learning_rate": 5.5318804002630465e-06, + "loss": 0.0818, + "step": 3038 + }, + { + "epoch": 3.413168608732276, + "grad_norm": 0.4378199677787273, + "learning_rate": 5.524864156583132e-06, + "loss": 0.0809, + "step": 3039 + }, + { + "epoch": 3.414291731012214, + "grad_norm": 0.4428589037894112, + "learning_rate": 5.517850666503547e-06, + "loss": 0.0773, + "step": 3040 + }, + { + "epoch": 3.415414853292152, + "grad_norm": 0.43356282741829844, + "learning_rate": 5.510839934339771e-06, + "loss": 0.0788, + "step": 3041 + }, + { + "epoch": 3.4165379755720906, + "grad_norm": 0.4421358222104903, + "learning_rate": 5.503831964405588e-06, + "loss": 0.0813, + "step": 3042 + }, + { + "epoch": 3.4176610978520285, + "grad_norm": 0.45263566082349915, + "learning_rate": 5.4968267610130736e-06, + "loss": 0.0877, + "step": 3043 + }, + { + "epoch": 3.418784220131967, + "grad_norm": 0.42884381173143404, + "learning_rate": 5.489824328472606e-06, + "loss": 0.0783, + "step": 3044 + }, + { + "epoch": 3.4199073424119053, + "grad_norm": 0.45905652234442995, + "learning_rate": 5.482824671092862e-06, + "loss": 0.0829, + "step": 3045 + }, + { + "epoch": 3.4210304646918432, + "grad_norm": 0.43427893805487594, + "learning_rate": 5.475827793180808e-06, + "loss": 0.0762, + "step": 3046 + }, + { + "epoch": 3.4221535869717816, + "grad_norm": 0.4383974688917182, + "learning_rate": 5.468833699041702e-06, + "loss": 0.0787, + "step": 3047 + }, + { + "epoch": 3.42327670925172, + "grad_norm": 0.42636151758099317, + "learning_rate": 5.461842392979081e-06, + "loss": 0.0801, + "step": 3048 + }, + { + "epoch": 3.424399831531658, + "grad_norm": 0.43538975548705666, + "learning_rate": 5.454853879294776e-06, + "loss": 0.0764, + "step": 3049 + }, + { + "epoch": 3.4255229538115963, + "grad_norm": 0.43470614632530913, + "learning_rate": 5.447868162288895e-06, + "loss": 0.0821, + "step": 3050 + }, + { + "epoch": 3.4266460760915347, + "grad_norm": 0.4258384808643859, + "learning_rate": 5.440885246259828e-06, + "loss": 0.0738, + "step": 3051 + }, + { + "epoch": 3.4277691983714726, + "grad_norm": 0.47091027809265673, + "learning_rate": 5.433905135504241e-06, + "loss": 0.0875, + "step": 3052 + }, + { + "epoch": 3.428892320651411, + "grad_norm": 0.42250636073037695, + "learning_rate": 5.42692783431707e-06, + "loss": 0.0776, + "step": 3053 + }, + { + "epoch": 3.430015442931349, + "grad_norm": 0.4418674818368583, + "learning_rate": 5.419953346991529e-06, + "loss": 0.0753, + "step": 3054 + }, + { + "epoch": 3.4311385652112874, + "grad_norm": 0.44643900506814016, + "learning_rate": 5.412981677819094e-06, + "loss": 0.0839, + "step": 3055 + }, + { + "epoch": 3.4322616874912257, + "grad_norm": 0.4509416128414816, + "learning_rate": 5.406012831089514e-06, + "loss": 0.0786, + "step": 3056 + }, + { + "epoch": 3.4333848097711637, + "grad_norm": 0.4526654447129122, + "learning_rate": 5.399046811090789e-06, + "loss": 0.0825, + "step": 3057 + }, + { + "epoch": 3.434507932051102, + "grad_norm": 0.43008105275608804, + "learning_rate": 5.392083622109192e-06, + "loss": 0.0754, + "step": 3058 + }, + { + "epoch": 3.4356310543310404, + "grad_norm": 0.44425939125974356, + "learning_rate": 5.385123268429251e-06, + "loss": 0.0796, + "step": 3059 + }, + { + "epoch": 3.4367541766109784, + "grad_norm": 0.4459296964868783, + "learning_rate": 5.3781657543337484e-06, + "loss": 0.0806, + "step": 3060 + }, + { + "epoch": 3.4378772988909168, + "grad_norm": 0.44459091799716943, + "learning_rate": 5.37121108410372e-06, + "loss": 0.0811, + "step": 3061 + }, + { + "epoch": 3.439000421170855, + "grad_norm": 0.4570923904403929, + "learning_rate": 5.364259262018448e-06, + "loss": 0.082, + "step": 3062 + }, + { + "epoch": 3.440123543450793, + "grad_norm": 0.44006880702846607, + "learning_rate": 5.357310292355463e-06, + "loss": 0.0791, + "step": 3063 + }, + { + "epoch": 3.4412466657307315, + "grad_norm": 0.4606550393242539, + "learning_rate": 5.3503641793905485e-06, + "loss": 0.0825, + "step": 3064 + }, + { + "epoch": 3.44236978801067, + "grad_norm": 0.4254705701825656, + "learning_rate": 5.343420927397718e-06, + "loss": 0.0859, + "step": 3065 + }, + { + "epoch": 3.443492910290608, + "grad_norm": 0.425582782498819, + "learning_rate": 5.33648054064924e-06, + "loss": 0.0803, + "step": 3066 + }, + { + "epoch": 3.444616032570546, + "grad_norm": 0.4400458710522822, + "learning_rate": 5.329543023415602e-06, + "loss": 0.0805, + "step": 3067 + }, + { + "epoch": 3.445739154850484, + "grad_norm": 0.4176064311503563, + "learning_rate": 5.322608379965537e-06, + "loss": 0.075, + "step": 3068 + }, + { + "epoch": 3.4468622771304225, + "grad_norm": 0.45054223519188685, + "learning_rate": 5.315676614566008e-06, + "loss": 0.0799, + "step": 3069 + }, + { + "epoch": 3.447985399410361, + "grad_norm": 0.4423125763067608, + "learning_rate": 5.308747731482207e-06, + "loss": 0.0787, + "step": 3070 + }, + { + "epoch": 3.449108521690299, + "grad_norm": 0.4216181686130632, + "learning_rate": 5.301821734977555e-06, + "loss": 0.0755, + "step": 3071 + }, + { + "epoch": 3.450231643970237, + "grad_norm": 0.4484194253723383, + "learning_rate": 5.2948986293136876e-06, + "loss": 0.0805, + "step": 3072 + }, + { + "epoch": 3.4513547662501756, + "grad_norm": 0.46771531925041326, + "learning_rate": 5.28797841875047e-06, + "loss": 0.0813, + "step": 3073 + }, + { + "epoch": 3.4524778885301135, + "grad_norm": 0.4340387002286317, + "learning_rate": 5.281061107545985e-06, + "loss": 0.0784, + "step": 3074 + }, + { + "epoch": 3.453601010810052, + "grad_norm": 0.4312273423989433, + "learning_rate": 5.274146699956531e-06, + "loss": 0.0784, + "step": 3075 + }, + { + "epoch": 3.4547241330899903, + "grad_norm": 0.45797427049587475, + "learning_rate": 5.26723520023662e-06, + "loss": 0.0841, + "step": 3076 + }, + { + "epoch": 3.455847255369928, + "grad_norm": 0.4478121802861111, + "learning_rate": 5.260326612638971e-06, + "loss": 0.0779, + "step": 3077 + }, + { + "epoch": 3.4569703776498666, + "grad_norm": 0.4352907201834758, + "learning_rate": 5.253420941414513e-06, + "loss": 0.0757, + "step": 3078 + }, + { + "epoch": 3.458093499929805, + "grad_norm": 0.438323892417082, + "learning_rate": 5.246518190812384e-06, + "loss": 0.077, + "step": 3079 + }, + { + "epoch": 3.459216622209743, + "grad_norm": 0.45721881289918465, + "learning_rate": 5.239618365079921e-06, + "loss": 0.086, + "step": 3080 + }, + { + "epoch": 3.4603397444896813, + "grad_norm": 0.45134533377379477, + "learning_rate": 5.232721468462669e-06, + "loss": 0.0837, + "step": 3081 + }, + { + "epoch": 3.4614628667696197, + "grad_norm": 0.4427173473895729, + "learning_rate": 5.225827505204355e-06, + "loss": 0.0743, + "step": 3082 + }, + { + "epoch": 3.4625859890495576, + "grad_norm": 0.4591916370110219, + "learning_rate": 5.218936479546913e-06, + "loss": 0.0832, + "step": 3083 + }, + { + "epoch": 3.463709111329496, + "grad_norm": 0.45693111857381774, + "learning_rate": 5.212048395730469e-06, + "loss": 0.0768, + "step": 3084 + }, + { + "epoch": 3.4648322336094344, + "grad_norm": 0.4522432872093845, + "learning_rate": 5.205163257993341e-06, + "loss": 0.0822, + "step": 3085 + }, + { + "epoch": 3.4659553558893723, + "grad_norm": 0.45600705538504716, + "learning_rate": 5.19828107057202e-06, + "loss": 0.0798, + "step": 3086 + }, + { + "epoch": 3.4670784781693107, + "grad_norm": 0.45123885398109936, + "learning_rate": 5.191401837701197e-06, + "loss": 0.0823, + "step": 3087 + }, + { + "epoch": 3.468201600449249, + "grad_norm": 0.4480694693802906, + "learning_rate": 5.18452556361374e-06, + "loss": 0.0768, + "step": 3088 + }, + { + "epoch": 3.469324722729187, + "grad_norm": 0.443774248075364, + "learning_rate": 5.177652252540697e-06, + "loss": 0.078, + "step": 3089 + }, + { + "epoch": 3.4704478450091254, + "grad_norm": 0.4651048676199529, + "learning_rate": 5.170781908711289e-06, + "loss": 0.0814, + "step": 3090 + }, + { + "epoch": 3.471570967289064, + "grad_norm": 0.4858930041394996, + "learning_rate": 5.163914536352919e-06, + "loss": 0.0831, + "step": 3091 + }, + { + "epoch": 3.4726940895690017, + "grad_norm": 0.45944862905297834, + "learning_rate": 5.157050139691151e-06, + "loss": 0.0847, + "step": 3092 + }, + { + "epoch": 3.47381721184894, + "grad_norm": 0.43905597445283906, + "learning_rate": 5.150188722949725e-06, + "loss": 0.0791, + "step": 3093 + }, + { + "epoch": 3.4749403341288785, + "grad_norm": 0.43904792852959645, + "learning_rate": 5.143330290350548e-06, + "loss": 0.0767, + "step": 3094 + }, + { + "epoch": 3.4760634564088164, + "grad_norm": 0.4487339919699267, + "learning_rate": 5.136474846113688e-06, + "loss": 0.0811, + "step": 3095 + }, + { + "epoch": 3.477186578688755, + "grad_norm": 0.4280955715492865, + "learning_rate": 5.129622394457377e-06, + "loss": 0.074, + "step": 3096 + }, + { + "epoch": 3.478309700968693, + "grad_norm": 0.44114191944550274, + "learning_rate": 5.122772939598003e-06, + "loss": 0.0811, + "step": 3097 + }, + { + "epoch": 3.479432823248631, + "grad_norm": 0.4381981300103117, + "learning_rate": 5.11592648575011e-06, + "loss": 0.0769, + "step": 3098 + }, + { + "epoch": 3.4805559455285695, + "grad_norm": 0.41185510445743156, + "learning_rate": 5.109083037126397e-06, + "loss": 0.0731, + "step": 3099 + }, + { + "epoch": 3.481679067808508, + "grad_norm": 0.4438897661024262, + "learning_rate": 5.1022425979377174e-06, + "loss": 0.0809, + "step": 3100 + }, + { + "epoch": 3.482802190088446, + "grad_norm": 0.43375334791452175, + "learning_rate": 5.095405172393062e-06, + "loss": 0.0828, + "step": 3101 + }, + { + "epoch": 3.483925312368384, + "grad_norm": 0.4326713003014848, + "learning_rate": 5.088570764699574e-06, + "loss": 0.0741, + "step": 3102 + }, + { + "epoch": 3.485048434648322, + "grad_norm": 0.4553368231780253, + "learning_rate": 5.081739379062545e-06, + "loss": 0.0797, + "step": 3103 + }, + { + "epoch": 3.4861715569282605, + "grad_norm": 0.45948155021628384, + "learning_rate": 5.074911019685398e-06, + "loss": 0.079, + "step": 3104 + }, + { + "epoch": 3.487294679208199, + "grad_norm": 0.4435769006877875, + "learning_rate": 5.068085690769702e-06, + "loss": 0.0828, + "step": 3105 + }, + { + "epoch": 3.488417801488137, + "grad_norm": 0.4676805551105927, + "learning_rate": 5.06126339651515e-06, + "loss": 0.0876, + "step": 3106 + }, + { + "epoch": 3.4895409237680752, + "grad_norm": 0.44708470564753106, + "learning_rate": 5.054444141119579e-06, + "loss": 0.0789, + "step": 3107 + }, + { + "epoch": 3.4906640460480136, + "grad_norm": 0.44694690179062085, + "learning_rate": 5.047627928778951e-06, + "loss": 0.0797, + "step": 3108 + }, + { + "epoch": 3.4917871683279516, + "grad_norm": 0.45987872193849977, + "learning_rate": 5.040814763687358e-06, + "loss": 0.0804, + "step": 3109 + }, + { + "epoch": 3.49291029060789, + "grad_norm": 0.46368555836155595, + "learning_rate": 5.034004650037016e-06, + "loss": 0.0838, + "step": 3110 + }, + { + "epoch": 3.4940334128878283, + "grad_norm": 0.4670280413811003, + "learning_rate": 5.02719759201826e-06, + "loss": 0.0854, + "step": 3111 + }, + { + "epoch": 3.4951565351677663, + "grad_norm": 0.4655508082978832, + "learning_rate": 5.020393593819547e-06, + "loss": 0.085, + "step": 3112 + }, + { + "epoch": 3.4962796574477046, + "grad_norm": 0.4506202886596756, + "learning_rate": 5.013592659627454e-06, + "loss": 0.0796, + "step": 3113 + }, + { + "epoch": 3.497402779727643, + "grad_norm": 0.44760914223329473, + "learning_rate": 5.006794793626671e-06, + "loss": 0.0773, + "step": 3114 + }, + { + "epoch": 3.498525902007581, + "grad_norm": 0.44369165321868126, + "learning_rate": 5.000000000000003e-06, + "loss": 0.0787, + "step": 3115 + }, + { + "epoch": 3.4996490242875193, + "grad_norm": 0.42826531752580976, + "learning_rate": 4.9932082829283524e-06, + "loss": 0.0812, + "step": 3116 + }, + { + "epoch": 3.5007721465674573, + "grad_norm": 0.4370975011654157, + "learning_rate": 4.986419646590744e-06, + "loss": 0.0811, + "step": 3117 + }, + { + "epoch": 3.5018952688473957, + "grad_norm": 0.4431924357682965, + "learning_rate": 4.979634095164298e-06, + "loss": 0.0811, + "step": 3118 + }, + { + "epoch": 3.503018391127334, + "grad_norm": 0.43224211139362045, + "learning_rate": 4.972851632824241e-06, + "loss": 0.0797, + "step": 3119 + }, + { + "epoch": 3.504141513407272, + "grad_norm": 0.44424653895974464, + "learning_rate": 4.966072263743899e-06, + "loss": 0.0828, + "step": 3120 + }, + { + "epoch": 3.5052646356872104, + "grad_norm": 0.4572424329181564, + "learning_rate": 4.959295992094685e-06, + "loss": 0.0801, + "step": 3121 + }, + { + "epoch": 3.5063877579671487, + "grad_norm": 0.460448950085812, + "learning_rate": 4.952522822046117e-06, + "loss": 0.0764, + "step": 3122 + }, + { + "epoch": 3.5075108802470867, + "grad_norm": 0.4249602220115872, + "learning_rate": 4.945752757765802e-06, + "loss": 0.0771, + "step": 3123 + }, + { + "epoch": 3.508634002527025, + "grad_norm": 0.4413605494574981, + "learning_rate": 4.93898580341944e-06, + "loss": 0.0822, + "step": 3124 + }, + { + "epoch": 3.5097571248069634, + "grad_norm": 0.4291289570379504, + "learning_rate": 4.932221963170801e-06, + "loss": 0.0797, + "step": 3125 + }, + { + "epoch": 3.5108802470869014, + "grad_norm": 0.4721851004921407, + "learning_rate": 4.925461241181757e-06, + "loss": 0.0843, + "step": 3126 + }, + { + "epoch": 3.5120033693668398, + "grad_norm": 0.4392592949377926, + "learning_rate": 4.918703641612255e-06, + "loss": 0.0786, + "step": 3127 + }, + { + "epoch": 3.513126491646778, + "grad_norm": 0.43903462957651435, + "learning_rate": 4.9119491686203195e-06, + "loss": 0.0804, + "step": 3128 + }, + { + "epoch": 3.514249613926716, + "grad_norm": 0.4430529984909637, + "learning_rate": 4.9051978263620545e-06, + "loss": 0.0766, + "step": 3129 + }, + { + "epoch": 3.5153727362066545, + "grad_norm": 0.4394580449820179, + "learning_rate": 4.89844961899163e-06, + "loss": 0.0762, + "step": 3130 + }, + { + "epoch": 3.516495858486593, + "grad_norm": 0.4686215670552109, + "learning_rate": 4.891704550661294e-06, + "loss": 0.0847, + "step": 3131 + }, + { + "epoch": 3.517618980766531, + "grad_norm": 0.41231062902627313, + "learning_rate": 4.884962625521363e-06, + "loss": 0.0793, + "step": 3132 + }, + { + "epoch": 3.518742103046469, + "grad_norm": 0.4569282712376777, + "learning_rate": 4.878223847720217e-06, + "loss": 0.0876, + "step": 3133 + }, + { + "epoch": 3.5198652253264076, + "grad_norm": 0.4208575740257057, + "learning_rate": 4.8714882214043e-06, + "loss": 0.078, + "step": 3134 + }, + { + "epoch": 3.5209883476063455, + "grad_norm": 0.43267410608868484, + "learning_rate": 4.8647557507181164e-06, + "loss": 0.0814, + "step": 3135 + }, + { + "epoch": 3.522111469886284, + "grad_norm": 0.42634413807778826, + "learning_rate": 4.8580264398042355e-06, + "loss": 0.0745, + "step": 3136 + }, + { + "epoch": 3.5232345921662223, + "grad_norm": 0.45182552642987417, + "learning_rate": 4.851300292803266e-06, + "loss": 0.081, + "step": 3137 + }, + { + "epoch": 3.52435771444616, + "grad_norm": 0.43109856616030523, + "learning_rate": 4.844577313853886e-06, + "loss": 0.0814, + "step": 3138 + }, + { + "epoch": 3.5254808367260986, + "grad_norm": 0.4470250084652514, + "learning_rate": 4.837857507092817e-06, + "loss": 0.0822, + "step": 3139 + }, + { + "epoch": 3.526603959006037, + "grad_norm": 0.44862752921716353, + "learning_rate": 4.831140876654831e-06, + "loss": 0.0802, + "step": 3140 + }, + { + "epoch": 3.527727081285975, + "grad_norm": 0.4769651304639455, + "learning_rate": 4.824427426672743e-06, + "loss": 0.083, + "step": 3141 + }, + { + "epoch": 3.5288502035659133, + "grad_norm": 0.4709080102440193, + "learning_rate": 4.8177171612774155e-06, + "loss": 0.0833, + "step": 3142 + }, + { + "epoch": 3.5299733258458517, + "grad_norm": 0.4393104624450094, + "learning_rate": 4.811010084597747e-06, + "loss": 0.0729, + "step": 3143 + }, + { + "epoch": 3.5310964481257896, + "grad_norm": 0.4600533673879964, + "learning_rate": 4.80430620076068e-06, + "loss": 0.0807, + "step": 3144 + }, + { + "epoch": 3.532219570405728, + "grad_norm": 0.47092645697860963, + "learning_rate": 4.797605513891179e-06, + "loss": 0.0786, + "step": 3145 + }, + { + "epoch": 3.5333426926856664, + "grad_norm": 0.44424095531161306, + "learning_rate": 4.790908028112256e-06, + "loss": 0.0817, + "step": 3146 + }, + { + "epoch": 3.5344658149656043, + "grad_norm": 0.4536227579660774, + "learning_rate": 4.7842137475449444e-06, + "loss": 0.0782, + "step": 3147 + }, + { + "epoch": 3.5355889372455427, + "grad_norm": 0.45564989250792665, + "learning_rate": 4.777522676308314e-06, + "loss": 0.0823, + "step": 3148 + }, + { + "epoch": 3.536712059525481, + "grad_norm": 0.42912926619367286, + "learning_rate": 4.770834818519454e-06, + "loss": 0.0815, + "step": 3149 + }, + { + "epoch": 3.537835181805419, + "grad_norm": 0.44561480322449887, + "learning_rate": 4.764150178293471e-06, + "loss": 0.0783, + "step": 3150 + }, + { + "epoch": 3.5389583040853574, + "grad_norm": 0.4553565188197437, + "learning_rate": 4.757468759743501e-06, + "loss": 0.0837, + "step": 3151 + }, + { + "epoch": 3.5400814263652958, + "grad_norm": 0.46776113922754914, + "learning_rate": 4.750790566980694e-06, + "loss": 0.0809, + "step": 3152 + }, + { + "epoch": 3.5412045486452337, + "grad_norm": 0.43159302458196586, + "learning_rate": 4.744115604114218e-06, + "loss": 0.0775, + "step": 3153 + }, + { + "epoch": 3.542327670925172, + "grad_norm": 0.451930193919335, + "learning_rate": 4.737443875251251e-06, + "loss": 0.076, + "step": 3154 + }, + { + "epoch": 3.5434507932051105, + "grad_norm": 0.46172685056937696, + "learning_rate": 4.730775384496976e-06, + "loss": 0.0796, + "step": 3155 + }, + { + "epoch": 3.5445739154850484, + "grad_norm": 0.4262475873889323, + "learning_rate": 4.724110135954593e-06, + "loss": 0.076, + "step": 3156 + }, + { + "epoch": 3.545697037764987, + "grad_norm": 0.4533319005981512, + "learning_rate": 4.717448133725302e-06, + "loss": 0.0864, + "step": 3157 + }, + { + "epoch": 3.5468201600449247, + "grad_norm": 0.44873241402246483, + "learning_rate": 4.710789381908308e-06, + "loss": 0.0827, + "step": 3158 + }, + { + "epoch": 3.547943282324863, + "grad_norm": 0.4438311385849649, + "learning_rate": 4.704133884600811e-06, + "loss": 0.0807, + "step": 3159 + }, + { + "epoch": 3.5490664046048015, + "grad_norm": 0.42639669279917847, + "learning_rate": 4.697481645898012e-06, + "loss": 0.0789, + "step": 3160 + }, + { + "epoch": 3.5501895268847394, + "grad_norm": 0.410746116785441, + "learning_rate": 4.690832669893108e-06, + "loss": 0.0805, + "step": 3161 + }, + { + "epoch": 3.551312649164678, + "grad_norm": 0.45365920946155613, + "learning_rate": 4.684186960677287e-06, + "loss": 0.0866, + "step": 3162 + }, + { + "epoch": 3.552435771444616, + "grad_norm": 0.46568428236414383, + "learning_rate": 4.6775445223397306e-06, + "loss": 0.084, + "step": 3163 + }, + { + "epoch": 3.553558893724554, + "grad_norm": 0.45425121145084363, + "learning_rate": 4.670905358967598e-06, + "loss": 0.0856, + "step": 3164 + }, + { + "epoch": 3.5546820160044925, + "grad_norm": 0.44337140062483155, + "learning_rate": 4.66426947464604e-06, + "loss": 0.0823, + "step": 3165 + }, + { + "epoch": 3.5558051382844305, + "grad_norm": 0.45052653263910536, + "learning_rate": 4.6576368734581935e-06, + "loss": 0.0829, + "step": 3166 + }, + { + "epoch": 3.556928260564369, + "grad_norm": 0.4421176966358397, + "learning_rate": 4.651007559485168e-06, + "loss": 0.081, + "step": 3167 + }, + { + "epoch": 3.558051382844307, + "grad_norm": 0.4417611457609658, + "learning_rate": 4.644381536806058e-06, + "loss": 0.0798, + "step": 3168 + }, + { + "epoch": 3.559174505124245, + "grad_norm": 0.4362310044034491, + "learning_rate": 4.637758809497919e-06, + "loss": 0.0789, + "step": 3169 + }, + { + "epoch": 3.5602976274041835, + "grad_norm": 0.48187030579374374, + "learning_rate": 4.631139381635795e-06, + "loss": 0.0826, + "step": 3170 + }, + { + "epoch": 3.561420749684122, + "grad_norm": 0.4338280376779694, + "learning_rate": 4.62452325729269e-06, + "loss": 0.0782, + "step": 3171 + }, + { + "epoch": 3.56254387196406, + "grad_norm": 0.44241538181060147, + "learning_rate": 4.61791044053958e-06, + "loss": 0.0813, + "step": 3172 + }, + { + "epoch": 3.5636669942439982, + "grad_norm": 0.4572788174432963, + "learning_rate": 4.611300935445407e-06, + "loss": 0.0822, + "step": 3173 + }, + { + "epoch": 3.5647901165239366, + "grad_norm": 0.4441694387134764, + "learning_rate": 4.604694746077064e-06, + "loss": 0.0838, + "step": 3174 + }, + { + "epoch": 3.5659132388038746, + "grad_norm": 0.45759098365330836, + "learning_rate": 4.598091876499417e-06, + "loss": 0.0799, + "step": 3175 + }, + { + "epoch": 3.567036361083813, + "grad_norm": 0.4343009539002791, + "learning_rate": 4.591492330775283e-06, + "loss": 0.0756, + "step": 3176 + }, + { + "epoch": 3.5681594833637513, + "grad_norm": 0.4473461995617762, + "learning_rate": 4.5848961129654365e-06, + "loss": 0.0812, + "step": 3177 + }, + { + "epoch": 3.5692826056436893, + "grad_norm": 0.4546004165413151, + "learning_rate": 4.578303227128603e-06, + "loss": 0.0833, + "step": 3178 + }, + { + "epoch": 3.5704057279236276, + "grad_norm": 0.46424501547792263, + "learning_rate": 4.571713677321455e-06, + "loss": 0.0812, + "step": 3179 + }, + { + "epoch": 3.571528850203566, + "grad_norm": 0.4361206138265064, + "learning_rate": 4.565127467598619e-06, + "loss": 0.0774, + "step": 3180 + }, + { + "epoch": 3.572651972483504, + "grad_norm": 0.4286236286675256, + "learning_rate": 4.5585446020126634e-06, + "loss": 0.0733, + "step": 3181 + }, + { + "epoch": 3.5737750947634424, + "grad_norm": 0.4617841876606261, + "learning_rate": 4.551965084614089e-06, + "loss": 0.083, + "step": 3182 + }, + { + "epoch": 3.5748982170433807, + "grad_norm": 0.46431636230368917, + "learning_rate": 4.545388919451353e-06, + "loss": 0.0772, + "step": 3183 + }, + { + "epoch": 3.5760213393233187, + "grad_norm": 0.42398130900439096, + "learning_rate": 4.538816110570841e-06, + "loss": 0.0787, + "step": 3184 + }, + { + "epoch": 3.577144461603257, + "grad_norm": 0.4340576396237869, + "learning_rate": 4.532246662016872e-06, + "loss": 0.0792, + "step": 3185 + }, + { + "epoch": 3.5782675838831954, + "grad_norm": 0.4706268135249468, + "learning_rate": 4.5256805778317015e-06, + "loss": 0.0879, + "step": 3186 + }, + { + "epoch": 3.5793907061631334, + "grad_norm": 0.43976864195858995, + "learning_rate": 4.519117862055514e-06, + "loss": 0.0811, + "step": 3187 + }, + { + "epoch": 3.5805138284430718, + "grad_norm": 0.45376420671959067, + "learning_rate": 4.512558518726425e-06, + "loss": 0.0806, + "step": 3188 + }, + { + "epoch": 3.58163695072301, + "grad_norm": 0.468528511091486, + "learning_rate": 4.506002551880462e-06, + "loss": 0.0832, + "step": 3189 + }, + { + "epoch": 3.582760073002948, + "grad_norm": 0.43294967832559633, + "learning_rate": 4.4994499655515865e-06, + "loss": 0.074, + "step": 3190 + }, + { + "epoch": 3.5838831952828865, + "grad_norm": 0.42827553987381783, + "learning_rate": 4.492900763771679e-06, + "loss": 0.0746, + "step": 3191 + }, + { + "epoch": 3.585006317562825, + "grad_norm": 0.4389937609121276, + "learning_rate": 4.486354950570534e-06, + "loss": 0.0768, + "step": 3192 + }, + { + "epoch": 3.586129439842763, + "grad_norm": 0.44454906671504224, + "learning_rate": 4.4798125299758666e-06, + "loss": 0.0741, + "step": 3193 + }, + { + "epoch": 3.587252562122701, + "grad_norm": 0.4423414531424126, + "learning_rate": 4.473273506013294e-06, + "loss": 0.0785, + "step": 3194 + }, + { + "epoch": 3.5883756844026395, + "grad_norm": 0.45927341409394545, + "learning_rate": 4.46673788270635e-06, + "loss": 0.088, + "step": 3195 + }, + { + "epoch": 3.5894988066825775, + "grad_norm": 0.4420335156563938, + "learning_rate": 4.460205664076479e-06, + "loss": 0.0767, + "step": 3196 + }, + { + "epoch": 3.590621928962516, + "grad_norm": 0.46538751797421757, + "learning_rate": 4.453676854143029e-06, + "loss": 0.0848, + "step": 3197 + }, + { + "epoch": 3.5917450512424542, + "grad_norm": 0.45668682965397933, + "learning_rate": 4.447151456923241e-06, + "loss": 0.0815, + "step": 3198 + }, + { + "epoch": 3.592868173522392, + "grad_norm": 0.45937139886413936, + "learning_rate": 4.440629476432268e-06, + "loss": 0.079, + "step": 3199 + }, + { + "epoch": 3.5939912958023306, + "grad_norm": 0.4309123966029538, + "learning_rate": 4.4341109166831565e-06, + "loss": 0.0752, + "step": 3200 + }, + { + "epoch": 3.595114418082269, + "grad_norm": 0.45123899659954364, + "learning_rate": 4.427595781686848e-06, + "loss": 0.0819, + "step": 3201 + }, + { + "epoch": 3.596237540362207, + "grad_norm": 0.4301342537854829, + "learning_rate": 4.42108407545218e-06, + "loss": 0.0792, + "step": 3202 + }, + { + "epoch": 3.5973606626421453, + "grad_norm": 0.4234732951722343, + "learning_rate": 4.41457580198587e-06, + "loss": 0.0759, + "step": 3203 + }, + { + "epoch": 3.5984837849220837, + "grad_norm": 0.4267600293298272, + "learning_rate": 4.408070965292534e-06, + "loss": 0.08, + "step": 3204 + }, + { + "epoch": 3.5996069072020216, + "grad_norm": 0.43857035097482017, + "learning_rate": 4.4015695693746685e-06, + "loss": 0.0782, + "step": 3205 + }, + { + "epoch": 3.60073002948196, + "grad_norm": 0.4662120376628465, + "learning_rate": 4.395071618232656e-06, + "loss": 0.0829, + "step": 3206 + }, + { + "epoch": 3.6018531517618984, + "grad_norm": 0.4353076779897321, + "learning_rate": 4.3885771158647595e-06, + "loss": 0.0766, + "step": 3207 + }, + { + "epoch": 3.6029762740418363, + "grad_norm": 0.420745107025468, + "learning_rate": 4.38208606626711e-06, + "loss": 0.0788, + "step": 3208 + }, + { + "epoch": 3.6040993963217747, + "grad_norm": 0.43420825398499724, + "learning_rate": 4.375598473433727e-06, + "loss": 0.082, + "step": 3209 + }, + { + "epoch": 3.6052225186017126, + "grad_norm": 0.43624668378544307, + "learning_rate": 4.369114341356497e-06, + "loss": 0.0782, + "step": 3210 + }, + { + "epoch": 3.606345640881651, + "grad_norm": 0.4286224207837259, + "learning_rate": 4.362633674025178e-06, + "loss": 0.0752, + "step": 3211 + }, + { + "epoch": 3.6074687631615894, + "grad_norm": 0.45618447258162387, + "learning_rate": 4.3561564754274e-06, + "loss": 0.0821, + "step": 3212 + }, + { + "epoch": 3.6085918854415273, + "grad_norm": 0.4245976031185963, + "learning_rate": 4.349682749548647e-06, + "loss": 0.0782, + "step": 3213 + }, + { + "epoch": 3.6097150077214657, + "grad_norm": 0.4576618374401797, + "learning_rate": 4.3432125003722754e-06, + "loss": 0.0843, + "step": 3214 + }, + { + "epoch": 3.6108381300014036, + "grad_norm": 0.48128056616977466, + "learning_rate": 4.3367457318795034e-06, + "loss": 0.0855, + "step": 3215 + }, + { + "epoch": 3.611961252281342, + "grad_norm": 0.4573589667687167, + "learning_rate": 4.330282448049405e-06, + "loss": 0.0869, + "step": 3216 + }, + { + "epoch": 3.6130843745612804, + "grad_norm": 0.41682956812824573, + "learning_rate": 4.323822652858911e-06, + "loss": 0.079, + "step": 3217 + }, + { + "epoch": 3.6142074968412183, + "grad_norm": 0.44417693491582083, + "learning_rate": 4.3173663502827985e-06, + "loss": 0.0795, + "step": 3218 + }, + { + "epoch": 3.6153306191211567, + "grad_norm": 0.45615707711819087, + "learning_rate": 4.310913544293706e-06, + "loss": 0.0842, + "step": 3219 + }, + { + "epoch": 3.616453741401095, + "grad_norm": 0.46322398924470976, + "learning_rate": 4.304464238862115e-06, + "loss": 0.0813, + "step": 3220 + }, + { + "epoch": 3.617576863681033, + "grad_norm": 0.43509351671563573, + "learning_rate": 4.298018437956354e-06, + "loss": 0.0788, + "step": 3221 + }, + { + "epoch": 3.6186999859609714, + "grad_norm": 0.43286932500602177, + "learning_rate": 4.2915761455425965e-06, + "loss": 0.0739, + "step": 3222 + }, + { + "epoch": 3.61982310824091, + "grad_norm": 0.4596133947427022, + "learning_rate": 4.285137365584854e-06, + "loss": 0.0875, + "step": 3223 + }, + { + "epoch": 3.6209462305208477, + "grad_norm": 0.4481272875661536, + "learning_rate": 4.27870210204498e-06, + "loss": 0.0832, + "step": 3224 + }, + { + "epoch": 3.622069352800786, + "grad_norm": 0.43075573098301184, + "learning_rate": 4.272270358882667e-06, + "loss": 0.0785, + "step": 3225 + }, + { + "epoch": 3.6231924750807245, + "grad_norm": 0.44064062012364946, + "learning_rate": 4.265842140055428e-06, + "loss": 0.0743, + "step": 3226 + }, + { + "epoch": 3.6243155973606624, + "grad_norm": 0.44599199599462813, + "learning_rate": 4.2594174495186225e-06, + "loss": 0.0768, + "step": 3227 + }, + { + "epoch": 3.625438719640601, + "grad_norm": 0.4756212129886129, + "learning_rate": 4.252996291225433e-06, + "loss": 0.0832, + "step": 3228 + }, + { + "epoch": 3.626561841920539, + "grad_norm": 0.44272449482968135, + "learning_rate": 4.2465786691268675e-06, + "loss": 0.0798, + "step": 3229 + }, + { + "epoch": 3.627684964200477, + "grad_norm": 0.44797258468509793, + "learning_rate": 4.240164587171761e-06, + "loss": 0.0745, + "step": 3230 + }, + { + "epoch": 3.6288080864804155, + "grad_norm": 0.44058249923913984, + "learning_rate": 4.233754049306772e-06, + "loss": 0.076, + "step": 3231 + }, + { + "epoch": 3.629931208760354, + "grad_norm": 0.43397062915249096, + "learning_rate": 4.227347059476367e-06, + "loss": 0.078, + "step": 3232 + }, + { + "epoch": 3.631054331040292, + "grad_norm": 0.43034776436202615, + "learning_rate": 4.220943621622841e-06, + "loss": 0.0777, + "step": 3233 + }, + { + "epoch": 3.6321774533202302, + "grad_norm": 0.46052076524797797, + "learning_rate": 4.214543739686302e-06, + "loss": 0.0769, + "step": 3234 + }, + { + "epoch": 3.6333005756001686, + "grad_norm": 0.4761059654293087, + "learning_rate": 4.208147417604665e-06, + "loss": 0.0866, + "step": 3235 + }, + { + "epoch": 3.6344236978801066, + "grad_norm": 0.4582874720771719, + "learning_rate": 4.2017546593136615e-06, + "loss": 0.0816, + "step": 3236 + }, + { + "epoch": 3.635546820160045, + "grad_norm": 0.4596850731102858, + "learning_rate": 4.195365468746821e-06, + "loss": 0.0836, + "step": 3237 + }, + { + "epoch": 3.6366699424399833, + "grad_norm": 0.4451422703440366, + "learning_rate": 4.188979849835483e-06, + "loss": 0.084, + "step": 3238 + }, + { + "epoch": 3.6377930647199213, + "grad_norm": 0.45215038572014205, + "learning_rate": 4.182597806508791e-06, + "loss": 0.0803, + "step": 3239 + }, + { + "epoch": 3.6389161869998596, + "grad_norm": 0.4545896148997309, + "learning_rate": 4.176219342693687e-06, + "loss": 0.0817, + "step": 3240 + }, + { + "epoch": 3.640039309279798, + "grad_norm": 0.4415219136001395, + "learning_rate": 4.1698444623149125e-06, + "loss": 0.0777, + "step": 3241 + }, + { + "epoch": 3.641162431559736, + "grad_norm": 0.4290457562106187, + "learning_rate": 4.163473169294995e-06, + "loss": 0.0756, + "step": 3242 + }, + { + "epoch": 3.6422855538396743, + "grad_norm": 0.4362701926456306, + "learning_rate": 4.1571054675542645e-06, + "loss": 0.0749, + "step": 3243 + }, + { + "epoch": 3.6434086761196127, + "grad_norm": 0.4439307907074262, + "learning_rate": 4.150741361010837e-06, + "loss": 0.0795, + "step": 3244 + }, + { + "epoch": 3.6445317983995507, + "grad_norm": 0.4603100158815367, + "learning_rate": 4.1443808535806195e-06, + "loss": 0.0877, + "step": 3245 + }, + { + "epoch": 3.645654920679489, + "grad_norm": 0.42378544801862755, + "learning_rate": 4.138023949177303e-06, + "loss": 0.073, + "step": 3246 + }, + { + "epoch": 3.6467780429594274, + "grad_norm": 0.4225346697676541, + "learning_rate": 4.131670651712357e-06, + "loss": 0.0748, + "step": 3247 + }, + { + "epoch": 3.6479011652393654, + "grad_norm": 0.4518554843426758, + "learning_rate": 4.125320965095037e-06, + "loss": 0.0785, + "step": 3248 + }, + { + "epoch": 3.6490242875193037, + "grad_norm": 0.470687474826635, + "learning_rate": 4.1189748932323735e-06, + "loss": 0.0827, + "step": 3249 + }, + { + "epoch": 3.650147409799242, + "grad_norm": 0.4506569877630462, + "learning_rate": 4.112632440029176e-06, + "loss": 0.078, + "step": 3250 + }, + { + "epoch": 3.65127053207918, + "grad_norm": 0.44578298786080667, + "learning_rate": 4.10629360938803e-06, + "loss": 0.0823, + "step": 3251 + }, + { + "epoch": 3.6523936543591184, + "grad_norm": 0.44292577788828275, + "learning_rate": 4.099958405209281e-06, + "loss": 0.0796, + "step": 3252 + }, + { + "epoch": 3.653516776639057, + "grad_norm": 0.4464121497898923, + "learning_rate": 4.093626831391051e-06, + "loss": 0.0729, + "step": 3253 + }, + { + "epoch": 3.6546398989189948, + "grad_norm": 0.44330661042864056, + "learning_rate": 4.087298891829229e-06, + "loss": 0.0753, + "step": 3254 + }, + { + "epoch": 3.655763021198933, + "grad_norm": 0.45305772566991315, + "learning_rate": 4.080974590417464e-06, + "loss": 0.0766, + "step": 3255 + }, + { + "epoch": 3.6568861434788715, + "grad_norm": 0.45177947052718287, + "learning_rate": 4.074653931047175e-06, + "loss": 0.0755, + "step": 3256 + }, + { + "epoch": 3.6580092657588095, + "grad_norm": 0.45801600980210305, + "learning_rate": 4.068336917607521e-06, + "loss": 0.0765, + "step": 3257 + }, + { + "epoch": 3.659132388038748, + "grad_norm": 0.4554130421647156, + "learning_rate": 4.0620235539854394e-06, + "loss": 0.0773, + "step": 3258 + }, + { + "epoch": 3.660255510318686, + "grad_norm": 0.4463096526208407, + "learning_rate": 4.0557138440656084e-06, + "loss": 0.0831, + "step": 3259 + }, + { + "epoch": 3.661378632598624, + "grad_norm": 0.45167705223108046, + "learning_rate": 4.049407791730464e-06, + "loss": 0.0774, + "step": 3260 + }, + { + "epoch": 3.6625017548785626, + "grad_norm": 0.45315081343108904, + "learning_rate": 4.043105400860191e-06, + "loss": 0.0804, + "step": 3261 + }, + { + "epoch": 3.6636248771585005, + "grad_norm": 0.43944579518546123, + "learning_rate": 4.036806675332715e-06, + "loss": 0.0771, + "step": 3262 + }, + { + "epoch": 3.664747999438439, + "grad_norm": 0.4617022800989824, + "learning_rate": 4.030511619023714e-06, + "loss": 0.0838, + "step": 3263 + }, + { + "epoch": 3.665871121718377, + "grad_norm": 0.8198470447722894, + "learning_rate": 4.0242202358066026e-06, + "loss": 0.089, + "step": 3264 + }, + { + "epoch": 3.666994243998315, + "grad_norm": 0.4638960398681334, + "learning_rate": 4.017932529552543e-06, + "loss": 0.0725, + "step": 3265 + }, + { + "epoch": 3.6681173662782536, + "grad_norm": 0.4669132218710685, + "learning_rate": 4.011648504130427e-06, + "loss": 0.0883, + "step": 3266 + }, + { + "epoch": 3.6692404885581915, + "grad_norm": 0.4316332260721309, + "learning_rate": 4.0053681634068854e-06, + "loss": 0.0817, + "step": 3267 + }, + { + "epoch": 3.67036361083813, + "grad_norm": 0.4386821038556611, + "learning_rate": 3.9990915112462805e-06, + "loss": 0.0784, + "step": 3268 + }, + { + "epoch": 3.6714867331180683, + "grad_norm": 0.43782337955114087, + "learning_rate": 3.992818551510705e-06, + "loss": 0.0799, + "step": 3269 + }, + { + "epoch": 3.672609855398006, + "grad_norm": 0.46001973641130106, + "learning_rate": 3.986549288059985e-06, + "loss": 0.0814, + "step": 3270 + }, + { + "epoch": 3.6737329776779446, + "grad_norm": 0.45245647722083326, + "learning_rate": 3.98028372475166e-06, + "loss": 0.0813, + "step": 3271 + }, + { + "epoch": 3.674856099957883, + "grad_norm": 0.43678629728053836, + "learning_rate": 3.974021865441003e-06, + "loss": 0.0799, + "step": 3272 + }, + { + "epoch": 3.675979222237821, + "grad_norm": 0.4176324355614767, + "learning_rate": 3.967763713981005e-06, + "loss": 0.0796, + "step": 3273 + }, + { + "epoch": 3.6771023445177593, + "grad_norm": 0.43709690178412436, + "learning_rate": 3.961509274222376e-06, + "loss": 0.0835, + "step": 3274 + }, + { + "epoch": 3.6782254667976977, + "grad_norm": 0.47658335386852513, + "learning_rate": 3.955258550013544e-06, + "loss": 0.0859, + "step": 3275 + }, + { + "epoch": 3.6793485890776356, + "grad_norm": 0.441801085296526, + "learning_rate": 3.949011545200643e-06, + "loss": 0.0792, + "step": 3276 + }, + { + "epoch": 3.680471711357574, + "grad_norm": 0.42725668162684416, + "learning_rate": 3.9427682636275256e-06, + "loss": 0.0757, + "step": 3277 + }, + { + "epoch": 3.6815948336375124, + "grad_norm": 0.4409037890558187, + "learning_rate": 3.936528709135752e-06, + "loss": 0.0764, + "step": 3278 + }, + { + "epoch": 3.6827179559174503, + "grad_norm": 0.4215867066752958, + "learning_rate": 3.93029288556459e-06, + "loss": 0.0821, + "step": 3279 + }, + { + "epoch": 3.6838410781973887, + "grad_norm": 0.4602625173462566, + "learning_rate": 3.924060796751012e-06, + "loss": 0.08, + "step": 3280 + }, + { + "epoch": 3.684964200477327, + "grad_norm": 0.4689840196628956, + "learning_rate": 3.9178324465296854e-06, + "loss": 0.079, + "step": 3281 + }, + { + "epoch": 3.686087322757265, + "grad_norm": 0.45249515865265794, + "learning_rate": 3.911607838732986e-06, + "loss": 0.0861, + "step": 3282 + }, + { + "epoch": 3.6872104450372034, + "grad_norm": 0.4505909397106748, + "learning_rate": 3.9053869771909835e-06, + "loss": 0.0816, + "step": 3283 + }, + { + "epoch": 3.688333567317142, + "grad_norm": 0.4357114391116514, + "learning_rate": 3.899169865731441e-06, + "loss": 0.0793, + "step": 3284 + }, + { + "epoch": 3.6894566895970797, + "grad_norm": 0.45612069756713314, + "learning_rate": 3.892956508179822e-06, + "loss": 0.0817, + "step": 3285 + }, + { + "epoch": 3.690579811877018, + "grad_norm": 0.4559467390418769, + "learning_rate": 3.886746908359264e-06, + "loss": 0.0832, + "step": 3286 + }, + { + "epoch": 3.6917029341569565, + "grad_norm": 0.43075052200607683, + "learning_rate": 3.880541070090607e-06, + "loss": 0.0753, + "step": 3287 + }, + { + "epoch": 3.6928260564368944, + "grad_norm": 0.4635796785903508, + "learning_rate": 3.874338997192371e-06, + "loss": 0.084, + "step": 3288 + }, + { + "epoch": 3.693949178716833, + "grad_norm": 0.4313194092470317, + "learning_rate": 3.8681406934807585e-06, + "loss": 0.0743, + "step": 3289 + }, + { + "epoch": 3.695072300996771, + "grad_norm": 0.42259202051656924, + "learning_rate": 3.8619461627696605e-06, + "loss": 0.0755, + "step": 3290 + }, + { + "epoch": 3.696195423276709, + "grad_norm": 0.4529194564392549, + "learning_rate": 3.855755408870631e-06, + "loss": 0.0792, + "step": 3291 + }, + { + "epoch": 3.6973185455566475, + "grad_norm": 0.46556671089771, + "learning_rate": 3.8495684355929115e-06, + "loss": 0.0853, + "step": 3292 + }, + { + "epoch": 3.698441667836586, + "grad_norm": 0.4448844328456529, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.0786, + "step": 3293 + }, + { + "epoch": 3.699564790116524, + "grad_norm": 0.4432713263856399, + "learning_rate": 3.837205846126731e-06, + "loss": 0.0782, + "step": 3294 + }, + { + "epoch": 3.700687912396462, + "grad_norm": 0.4211879529566886, + "learning_rate": 3.83103023754511e-06, + "loss": 0.0798, + "step": 3295 + }, + { + "epoch": 3.7018110346764006, + "grad_norm": 0.4743647041115318, + "learning_rate": 3.824858424798467e-06, + "loss": 0.0883, + "step": 3296 + }, + { + "epoch": 3.7029341569563385, + "grad_norm": 0.4248995644146303, + "learning_rate": 3.8186904116843895e-06, + "loss": 0.0728, + "step": 3297 + }, + { + "epoch": 3.704057279236277, + "grad_norm": 0.4378423244099125, + "learning_rate": 3.8125262019981224e-06, + "loss": 0.0767, + "step": 3298 + }, + { + "epoch": 3.7051804015162153, + "grad_norm": 0.4518707599907417, + "learning_rate": 3.8063657995325743e-06, + "loss": 0.0856, + "step": 3299 + }, + { + "epoch": 3.7063035237961532, + "grad_norm": 0.45215273429838926, + "learning_rate": 3.8002092080783116e-06, + "loss": 0.0765, + "step": 3300 + }, + { + "epoch": 3.7074266460760916, + "grad_norm": 0.4447025106277428, + "learning_rate": 3.794056431423545e-06, + "loss": 0.0786, + "step": 3301 + }, + { + "epoch": 3.70854976835603, + "grad_norm": 0.4459028238128283, + "learning_rate": 3.787907473354149e-06, + "loss": 0.082, + "step": 3302 + }, + { + "epoch": 3.709672890635968, + "grad_norm": 0.4404022767102967, + "learning_rate": 3.781762337653646e-06, + "loss": 0.0785, + "step": 3303 + }, + { + "epoch": 3.7107960129159063, + "grad_norm": 0.4450961184830428, + "learning_rate": 3.7756210281032092e-06, + "loss": 0.0828, + "step": 3304 + }, + { + "epoch": 3.7119191351958447, + "grad_norm": 0.423474128079963, + "learning_rate": 3.769483548481646e-06, + "loss": 0.0727, + "step": 3305 + }, + { + "epoch": 3.7130422574757826, + "grad_norm": 0.4556402049278725, + "learning_rate": 3.7633499025654186e-06, + "loss": 0.0801, + "step": 3306 + }, + { + "epoch": 3.714165379755721, + "grad_norm": 0.45401931123707706, + "learning_rate": 3.7572200941286284e-06, + "loss": 0.0828, + "step": 3307 + }, + { + "epoch": 3.715288502035659, + "grad_norm": 0.4549636700971652, + "learning_rate": 3.7510941269430124e-06, + "loss": 0.0837, + "step": 3308 + }, + { + "epoch": 3.7164116243155974, + "grad_norm": 0.45729117494453114, + "learning_rate": 3.744972004777947e-06, + "loss": 0.0787, + "step": 3309 + }, + { + "epoch": 3.7175347465955357, + "grad_norm": 0.45161195568530854, + "learning_rate": 3.7388537314004394e-06, + "loss": 0.0768, + "step": 3310 + }, + { + "epoch": 3.7186578688754737, + "grad_norm": 0.4400234197426347, + "learning_rate": 3.732739310575132e-06, + "loss": 0.0745, + "step": 3311 + }, + { + "epoch": 3.719780991155412, + "grad_norm": 0.4343707184077051, + "learning_rate": 3.7266287460642956e-06, + "loss": 0.0817, + "step": 3312 + }, + { + "epoch": 3.7209041134353504, + "grad_norm": 0.4394923766740628, + "learning_rate": 3.7205220416278263e-06, + "loss": 0.0747, + "step": 3313 + }, + { + "epoch": 3.7220272357152884, + "grad_norm": 0.43402411147419445, + "learning_rate": 3.7144192010232515e-06, + "loss": 0.0816, + "step": 3314 + }, + { + "epoch": 3.7231503579952268, + "grad_norm": 0.4560774435649313, + "learning_rate": 3.7083202280057084e-06, + "loss": 0.0788, + "step": 3315 + }, + { + "epoch": 3.7242734802751647, + "grad_norm": 0.4414913360932274, + "learning_rate": 3.702225126327965e-06, + "loss": 0.0742, + "step": 3316 + }, + { + "epoch": 3.725396602555103, + "grad_norm": 0.43653829484825973, + "learning_rate": 3.6961338997404062e-06, + "loss": 0.0766, + "step": 3317 + }, + { + "epoch": 3.7265197248350415, + "grad_norm": 0.4597215590802913, + "learning_rate": 3.6900465519910288e-06, + "loss": 0.086, + "step": 3318 + }, + { + "epoch": 3.7276428471149794, + "grad_norm": 0.45688319712825454, + "learning_rate": 3.6839630868254505e-06, + "loss": 0.086, + "step": 3319 + }, + { + "epoch": 3.728765969394918, + "grad_norm": 0.4613315368292363, + "learning_rate": 3.6778835079868857e-06, + "loss": 0.0794, + "step": 3320 + }, + { + "epoch": 3.729889091674856, + "grad_norm": 0.45526897285581736, + "learning_rate": 3.67180781921617e-06, + "loss": 0.0816, + "step": 3321 + }, + { + "epoch": 3.731012213954794, + "grad_norm": 0.4467162169946555, + "learning_rate": 3.6657360242517413e-06, + "loss": 0.0795, + "step": 3322 + }, + { + "epoch": 3.7321353362347325, + "grad_norm": 0.46971814383232896, + "learning_rate": 3.6596681268296443e-06, + "loss": 0.0985, + "step": 3323 + }, + { + "epoch": 3.733258458514671, + "grad_norm": 0.4354505523936359, + "learning_rate": 3.6536041306835226e-06, + "loss": 0.0784, + "step": 3324 + }, + { + "epoch": 3.734381580794609, + "grad_norm": 0.42196736134846446, + "learning_rate": 3.647544039544615e-06, + "loss": 0.077, + "step": 3325 + }, + { + "epoch": 3.735504703074547, + "grad_norm": 0.4436527706104773, + "learning_rate": 3.6414878571417667e-06, + "loss": 0.0851, + "step": 3326 + }, + { + "epoch": 3.7366278253544856, + "grad_norm": 0.429806201884575, + "learning_rate": 3.6354355872014113e-06, + "loss": 0.0726, + "step": 3327 + }, + { + "epoch": 3.7377509476344235, + "grad_norm": 0.4336025736673827, + "learning_rate": 3.629387233447578e-06, + "loss": 0.0821, + "step": 3328 + }, + { + "epoch": 3.738874069914362, + "grad_norm": 0.4602608383711589, + "learning_rate": 3.623342799601889e-06, + "loss": 0.0804, + "step": 3329 + }, + { + "epoch": 3.7399971921943003, + "grad_norm": 0.4114914340076706, + "learning_rate": 3.617302289383543e-06, + "loss": 0.074, + "step": 3330 + }, + { + "epoch": 3.741120314474238, + "grad_norm": 0.4429172740958033, + "learning_rate": 3.6112657065093382e-06, + "loss": 0.0788, + "step": 3331 + }, + { + "epoch": 3.7422434367541766, + "grad_norm": 0.4504323974328796, + "learning_rate": 3.6052330546936476e-06, + "loss": 0.0791, + "step": 3332 + }, + { + "epoch": 3.743366559034115, + "grad_norm": 0.4112295106510093, + "learning_rate": 3.599204337648431e-06, + "loss": 0.0743, + "step": 3333 + }, + { + "epoch": 3.744489681314053, + "grad_norm": 0.44159456652322426, + "learning_rate": 3.593179559083225e-06, + "loss": 0.0735, + "step": 3334 + }, + { + "epoch": 3.7456128035939913, + "grad_norm": 0.4148152260642522, + "learning_rate": 3.5871587227051385e-06, + "loss": 0.0749, + "step": 3335 + }, + { + "epoch": 3.7467359258739297, + "grad_norm": 0.47005121017013696, + "learning_rate": 3.5811418322188585e-06, + "loss": 0.0823, + "step": 3336 + }, + { + "epoch": 3.7478590481538676, + "grad_norm": 0.4606593735634737, + "learning_rate": 3.575128891326647e-06, + "loss": 0.0849, + "step": 3337 + }, + { + "epoch": 3.748982170433806, + "grad_norm": 0.5019905326101969, + "learning_rate": 3.56911990372833e-06, + "loss": 0.0792, + "step": 3338 + }, + { + "epoch": 3.7501052927137444, + "grad_norm": 0.4497014234992369, + "learning_rate": 3.563114873121308e-06, + "loss": 0.0768, + "step": 3339 + }, + { + "epoch": 3.7512284149936823, + "grad_norm": 0.4280427061355485, + "learning_rate": 3.557113803200537e-06, + "loss": 0.0757, + "step": 3340 + }, + { + "epoch": 3.7523515372736207, + "grad_norm": 0.4591535352490078, + "learning_rate": 3.5511166976585432e-06, + "loss": 0.0836, + "step": 3341 + }, + { + "epoch": 3.753474659553559, + "grad_norm": 0.4659249443970378, + "learning_rate": 3.5451235601854118e-06, + "loss": 0.0847, + "step": 3342 + }, + { + "epoch": 3.754597781833497, + "grad_norm": 0.4465299372843637, + "learning_rate": 3.5391343944687906e-06, + "loss": 0.075, + "step": 3343 + }, + { + "epoch": 3.7557209041134354, + "grad_norm": 0.46314659971337047, + "learning_rate": 3.533149204193871e-06, + "loss": 0.0813, + "step": 3344 + }, + { + "epoch": 3.756844026393374, + "grad_norm": 0.454679241847546, + "learning_rate": 3.527167993043411e-06, + "loss": 0.0779, + "step": 3345 + }, + { + "epoch": 3.7579671486733117, + "grad_norm": 0.432912317003138, + "learning_rate": 3.5211907646977152e-06, + "loss": 0.0772, + "step": 3346 + }, + { + "epoch": 3.75909027095325, + "grad_norm": 0.43149778700891545, + "learning_rate": 3.5152175228346375e-06, + "loss": 0.0754, + "step": 3347 + }, + { + "epoch": 3.7602133932331885, + "grad_norm": 0.4296614839842937, + "learning_rate": 3.5092482711295805e-06, + "loss": 0.0777, + "step": 3348 + }, + { + "epoch": 3.7613365155131264, + "grad_norm": 0.4210158808787179, + "learning_rate": 3.5032830132554928e-06, + "loss": 0.0722, + "step": 3349 + }, + { + "epoch": 3.762459637793065, + "grad_norm": 0.4390732400163971, + "learning_rate": 3.497321752882856e-06, + "loss": 0.0716, + "step": 3350 + }, + { + "epoch": 3.763582760073003, + "grad_norm": 0.44295131957694994, + "learning_rate": 3.4913644936797054e-06, + "loss": 0.0751, + "step": 3351 + }, + { + "epoch": 3.764705882352941, + "grad_norm": 0.4534307190117399, + "learning_rate": 3.4854112393116047e-06, + "loss": 0.0782, + "step": 3352 + }, + { + "epoch": 3.7658290046328795, + "grad_norm": 0.4385591150069758, + "learning_rate": 3.4794619934416586e-06, + "loss": 0.0738, + "step": 3353 + }, + { + "epoch": 3.766952126912818, + "grad_norm": 0.43649347579395936, + "learning_rate": 3.473516759730503e-06, + "loss": 0.0768, + "step": 3354 + }, + { + "epoch": 3.768075249192756, + "grad_norm": 0.437118874941968, + "learning_rate": 3.4675755418363054e-06, + "loss": 0.0769, + "step": 3355 + }, + { + "epoch": 3.769198371472694, + "grad_norm": 0.4299095911687841, + "learning_rate": 3.461638343414764e-06, + "loss": 0.0786, + "step": 3356 + }, + { + "epoch": 3.7703214937526326, + "grad_norm": 0.4697008287502685, + "learning_rate": 3.455705168119101e-06, + "loss": 0.0885, + "step": 3357 + }, + { + "epoch": 3.7714446160325705, + "grad_norm": 0.46789742839025383, + "learning_rate": 3.4497760196000686e-06, + "loss": 0.0846, + "step": 3358 + }, + { + "epoch": 3.772567738312509, + "grad_norm": 0.42160537923493707, + "learning_rate": 3.4438509015059284e-06, + "loss": 0.0742, + "step": 3359 + }, + { + "epoch": 3.773690860592447, + "grad_norm": 0.42954624698813265, + "learning_rate": 3.437929817482477e-06, + "loss": 0.078, + "step": 3360 + }, + { + "epoch": 3.7748139828723852, + "grad_norm": 0.436902871143199, + "learning_rate": 3.432012771173021e-06, + "loss": 0.0786, + "step": 3361 + }, + { + "epoch": 3.7759371051523236, + "grad_norm": 0.4617151544078479, + "learning_rate": 3.4260997662183836e-06, + "loss": 0.0817, + "step": 3362 + }, + { + "epoch": 3.7770602274322616, + "grad_norm": 0.4527611616144744, + "learning_rate": 3.4201908062569066e-06, + "loss": 0.0751, + "step": 3363 + }, + { + "epoch": 3.7781833497122, + "grad_norm": 0.4238491787758921, + "learning_rate": 3.4142858949244305e-06, + "loss": 0.0733, + "step": 3364 + }, + { + "epoch": 3.779306471992138, + "grad_norm": 0.4466064502716907, + "learning_rate": 3.408385035854317e-06, + "loss": 0.0743, + "step": 3365 + }, + { + "epoch": 3.7804295942720763, + "grad_norm": 0.4426815137139256, + "learning_rate": 3.4024882326774266e-06, + "loss": 0.0714, + "step": 3366 + }, + { + "epoch": 3.7815527165520146, + "grad_norm": 0.4699960233638405, + "learning_rate": 3.39659548902213e-06, + "loss": 0.0803, + "step": 3367 + }, + { + "epoch": 3.7826758388319526, + "grad_norm": 0.4482404399293802, + "learning_rate": 3.390706808514299e-06, + "loss": 0.0819, + "step": 3368 + }, + { + "epoch": 3.783798961111891, + "grad_norm": 0.45212437238597697, + "learning_rate": 3.3848221947772976e-06, + "loss": 0.0761, + "step": 3369 + }, + { + "epoch": 3.7849220833918293, + "grad_norm": 0.4533539920505649, + "learning_rate": 3.378941651431996e-06, + "loss": 0.0814, + "step": 3370 + }, + { + "epoch": 3.7860452056717673, + "grad_norm": 0.448203569611718, + "learning_rate": 3.3730651820967588e-06, + "loss": 0.076, + "step": 3371 + }, + { + "epoch": 3.7871683279517057, + "grad_norm": 0.42790870357703314, + "learning_rate": 3.36719279038744e-06, + "loss": 0.0789, + "step": 3372 + }, + { + "epoch": 3.788291450231644, + "grad_norm": 0.42797461979429224, + "learning_rate": 3.361324479917393e-06, + "loss": 0.0734, + "step": 3373 + }, + { + "epoch": 3.789414572511582, + "grad_norm": 0.44621043727284554, + "learning_rate": 3.3554602542974436e-06, + "loss": 0.086, + "step": 3374 + }, + { + "epoch": 3.7905376947915204, + "grad_norm": 0.42240375254920737, + "learning_rate": 3.3496001171359204e-06, + "loss": 0.0768, + "step": 3375 + }, + { + "epoch": 3.7916608170714587, + "grad_norm": 0.4386956304607235, + "learning_rate": 3.3437440720386294e-06, + "loss": 0.076, + "step": 3376 + }, + { + "epoch": 3.7927839393513967, + "grad_norm": 0.45986967497222725, + "learning_rate": 3.3378921226088632e-06, + "loss": 0.0827, + "step": 3377 + }, + { + "epoch": 3.793907061631335, + "grad_norm": 0.44325952459332413, + "learning_rate": 3.3320442724473843e-06, + "loss": 0.0842, + "step": 3378 + }, + { + "epoch": 3.7950301839112734, + "grad_norm": 0.43529759847083, + "learning_rate": 3.326200525152441e-06, + "loss": 0.0812, + "step": 3379 + }, + { + "epoch": 3.7961533061912114, + "grad_norm": 0.4637261053857433, + "learning_rate": 3.3203608843197575e-06, + "loss": 0.0801, + "step": 3380 + }, + { + "epoch": 3.7972764284711498, + "grad_norm": 0.44756524271819015, + "learning_rate": 3.314525353542527e-06, + "loss": 0.0811, + "step": 3381 + }, + { + "epoch": 3.798399550751088, + "grad_norm": 0.4205094044217173, + "learning_rate": 3.308693936411421e-06, + "loss": 0.0826, + "step": 3382 + }, + { + "epoch": 3.799522673031026, + "grad_norm": 0.43291470049655334, + "learning_rate": 3.302866636514567e-06, + "loss": 0.0833, + "step": 3383 + }, + { + "epoch": 3.8006457953109645, + "grad_norm": 0.42670722217149304, + "learning_rate": 3.29704345743757e-06, + "loss": 0.073, + "step": 3384 + }, + { + "epoch": 3.801768917590903, + "grad_norm": 0.4343627394675364, + "learning_rate": 3.2912244027634953e-06, + "loss": 0.0799, + "step": 3385 + }, + { + "epoch": 3.802892039870841, + "grad_norm": 0.4488560245180038, + "learning_rate": 3.285409476072874e-06, + "loss": 0.0779, + "step": 3386 + }, + { + "epoch": 3.804015162150779, + "grad_norm": 0.4503612616723187, + "learning_rate": 3.2795986809436953e-06, + "loss": 0.0808, + "step": 3387 + }, + { + "epoch": 3.8051382844307176, + "grad_norm": 0.4407667707573012, + "learning_rate": 3.2737920209513994e-06, + "loss": 0.0746, + "step": 3388 + }, + { + "epoch": 3.8062614067106555, + "grad_norm": 0.4491010129063716, + "learning_rate": 3.267989499668892e-06, + "loss": 0.0722, + "step": 3389 + }, + { + "epoch": 3.807384528990594, + "grad_norm": 0.4329898689532073, + "learning_rate": 3.262191120666528e-06, + "loss": 0.0774, + "step": 3390 + }, + { + "epoch": 3.8085076512705323, + "grad_norm": 0.4383306886957197, + "learning_rate": 3.2563968875121133e-06, + "loss": 0.0766, + "step": 3391 + }, + { + "epoch": 3.80963077355047, + "grad_norm": 0.4401170472954754, + "learning_rate": 3.250606803770904e-06, + "loss": 0.0791, + "step": 3392 + }, + { + "epoch": 3.8107538958304086, + "grad_norm": 0.4532511703655005, + "learning_rate": 3.2448208730056053e-06, + "loss": 0.0799, + "step": 3393 + }, + { + "epoch": 3.811877018110347, + "grad_norm": 0.42610017647616477, + "learning_rate": 3.2390390987763578e-06, + "loss": 0.0785, + "step": 3394 + }, + { + "epoch": 3.813000140390285, + "grad_norm": 0.4230591818086143, + "learning_rate": 3.233261484640753e-06, + "loss": 0.0667, + "step": 3395 + }, + { + "epoch": 3.8141232626702233, + "grad_norm": 0.4625893901658147, + "learning_rate": 3.2274880341538216e-06, + "loss": 0.0769, + "step": 3396 + }, + { + "epoch": 3.8152463849501617, + "grad_norm": 0.4400378121499265, + "learning_rate": 3.2217187508680314e-06, + "loss": 0.0779, + "step": 3397 + }, + { + "epoch": 3.8163695072300996, + "grad_norm": 0.4490227215560117, + "learning_rate": 3.2159536383332846e-06, + "loss": 0.0779, + "step": 3398 + }, + { + "epoch": 3.817492629510038, + "grad_norm": 0.41925773594382665, + "learning_rate": 3.21019270009692e-06, + "loss": 0.0741, + "step": 3399 + }, + { + "epoch": 3.8186157517899764, + "grad_norm": 0.4415558039457624, + "learning_rate": 3.204435939703705e-06, + "loss": 0.0759, + "step": 3400 + }, + { + "epoch": 3.8197388740699143, + "grad_norm": 0.4451411473808772, + "learning_rate": 3.198683360695839e-06, + "loss": 0.0753, + "step": 3401 + }, + { + "epoch": 3.8208619963498527, + "grad_norm": 0.42552871943963727, + "learning_rate": 3.192934966612948e-06, + "loss": 0.075, + "step": 3402 + }, + { + "epoch": 3.821985118629791, + "grad_norm": 0.4700312235719863, + "learning_rate": 3.187190760992078e-06, + "loss": 0.0828, + "step": 3403 + }, + { + "epoch": 3.823108240909729, + "grad_norm": 0.45339677778386067, + "learning_rate": 3.1814507473677047e-06, + "loss": 0.0789, + "step": 3404 + }, + { + "epoch": 3.8242313631896674, + "grad_norm": 0.4331334874484187, + "learning_rate": 3.1757149292717216e-06, + "loss": 0.0754, + "step": 3405 + }, + { + "epoch": 3.8253544854696058, + "grad_norm": 0.44163707435590865, + "learning_rate": 3.1699833102334397e-06, + "loss": 0.079, + "step": 3406 + }, + { + "epoch": 3.8264776077495437, + "grad_norm": 0.42901426073349475, + "learning_rate": 3.164255893779591e-06, + "loss": 0.0748, + "step": 3407 + }, + { + "epoch": 3.827600730029482, + "grad_norm": 0.4246549205961034, + "learning_rate": 3.1585326834343123e-06, + "loss": 0.0758, + "step": 3408 + }, + { + "epoch": 3.82872385230942, + "grad_norm": 0.43824663282905246, + "learning_rate": 3.152813682719159e-06, + "loss": 0.0817, + "step": 3409 + }, + { + "epoch": 3.8298469745893584, + "grad_norm": 0.4487556162344414, + "learning_rate": 3.147098895153098e-06, + "loss": 0.0808, + "step": 3410 + }, + { + "epoch": 3.830970096869297, + "grad_norm": 0.41728849401494356, + "learning_rate": 3.141388324252499e-06, + "loss": 0.0734, + "step": 3411 + }, + { + "epoch": 3.8320932191492347, + "grad_norm": 0.42655605860029716, + "learning_rate": 3.135681973531144e-06, + "loss": 0.081, + "step": 3412 + }, + { + "epoch": 3.833216341429173, + "grad_norm": 0.4241664691861822, + "learning_rate": 3.129979846500205e-06, + "loss": 0.0779, + "step": 3413 + }, + { + "epoch": 3.8343394637091115, + "grad_norm": 0.4153186762640536, + "learning_rate": 3.1242819466682673e-06, + "loss": 0.0771, + "step": 3414 + }, + { + "epoch": 3.8354625859890494, + "grad_norm": 0.4267033688189136, + "learning_rate": 3.1185882775413123e-06, + "loss": 0.073, + "step": 3415 + }, + { + "epoch": 3.836585708268988, + "grad_norm": 0.4457824337627224, + "learning_rate": 3.1128988426227193e-06, + "loss": 0.0805, + "step": 3416 + }, + { + "epoch": 3.8377088305489258, + "grad_norm": 0.4281819990435502, + "learning_rate": 3.107213645413254e-06, + "loss": 0.0783, + "step": 3417 + }, + { + "epoch": 3.838831952828864, + "grad_norm": 0.42960944794714556, + "learning_rate": 3.101532689411085e-06, + "loss": 0.0758, + "step": 3418 + }, + { + "epoch": 3.8399550751088025, + "grad_norm": 0.4270593518801406, + "learning_rate": 3.095855978111767e-06, + "loss": 0.0763, + "step": 3419 + }, + { + "epoch": 3.8410781973887405, + "grad_norm": 0.445387965169147, + "learning_rate": 3.0901835150082417e-06, + "loss": 0.0755, + "step": 3420 + }, + { + "epoch": 3.842201319668679, + "grad_norm": 0.44342244606556647, + "learning_rate": 3.0845153035908415e-06, + "loss": 0.0711, + "step": 3421 + }, + { + "epoch": 3.843324441948617, + "grad_norm": 0.4222486589242842, + "learning_rate": 3.0788513473472747e-06, + "loss": 0.0778, + "step": 3422 + }, + { + "epoch": 3.844447564228555, + "grad_norm": 0.43608936121729586, + "learning_rate": 3.0731916497626356e-06, + "loss": 0.0798, + "step": 3423 + }, + { + "epoch": 3.8455706865084935, + "grad_norm": 0.4601934819466419, + "learning_rate": 3.067536214319402e-06, + "loss": 0.0834, + "step": 3424 + }, + { + "epoch": 3.846693808788432, + "grad_norm": 0.41891016230414746, + "learning_rate": 3.0618850444974237e-06, + "loss": 0.0703, + "step": 3425 + }, + { + "epoch": 3.84781693106837, + "grad_norm": 0.4572426188664163, + "learning_rate": 3.056238143773932e-06, + "loss": 0.0792, + "step": 3426 + }, + { + "epoch": 3.8489400533483082, + "grad_norm": 0.4518709283094167, + "learning_rate": 3.050595515623519e-06, + "loss": 0.0793, + "step": 3427 + }, + { + "epoch": 3.8500631756282466, + "grad_norm": 0.8261186535001159, + "learning_rate": 3.0449571635181595e-06, + "loss": 0.0906, + "step": 3428 + }, + { + "epoch": 3.8511862979081846, + "grad_norm": 0.43746333274368837, + "learning_rate": 3.0393230909271953e-06, + "loss": 0.0806, + "step": 3429 + }, + { + "epoch": 3.852309420188123, + "grad_norm": 0.4328585063719159, + "learning_rate": 3.0336933013173307e-06, + "loss": 0.0777, + "step": 3430 + }, + { + "epoch": 3.8534325424680613, + "grad_norm": 0.43314046732664285, + "learning_rate": 3.028067798152643e-06, + "loss": 0.0809, + "step": 3431 + }, + { + "epoch": 3.8545556647479993, + "grad_norm": 0.4373077254389021, + "learning_rate": 3.022446584894557e-06, + "loss": 0.0788, + "step": 3432 + }, + { + "epoch": 3.8556787870279376, + "grad_norm": 0.43228751269770593, + "learning_rate": 3.0168296650018736e-06, + "loss": 0.0777, + "step": 3433 + }, + { + "epoch": 3.856801909307876, + "grad_norm": 0.43151574407618987, + "learning_rate": 3.011217041930743e-06, + "loss": 0.0733, + "step": 3434 + }, + { + "epoch": 3.857925031587814, + "grad_norm": 0.4424834168473701, + "learning_rate": 3.005608719134674e-06, + "loss": 0.0756, + "step": 3435 + }, + { + "epoch": 3.8590481538677524, + "grad_norm": 0.4685629925803647, + "learning_rate": 3.0000047000645305e-06, + "loss": 0.0827, + "step": 3436 + }, + { + "epoch": 3.8601712761476907, + "grad_norm": 0.44357998574627705, + "learning_rate": 2.9944049881685265e-06, + "loss": 0.0787, + "step": 3437 + }, + { + "epoch": 3.8612943984276287, + "grad_norm": 0.4253461038308456, + "learning_rate": 2.9888095868922297e-06, + "loss": 0.0726, + "step": 3438 + }, + { + "epoch": 3.862417520707567, + "grad_norm": 0.4260652279185222, + "learning_rate": 2.9832184996785453e-06, + "loss": 0.077, + "step": 3439 + }, + { + "epoch": 3.8635406429875054, + "grad_norm": 0.471295717516851, + "learning_rate": 2.977631729967735e-06, + "loss": 0.0836, + "step": 3440 + }, + { + "epoch": 3.8646637652674434, + "grad_norm": 0.43840929560138525, + "learning_rate": 2.9720492811973977e-06, + "loss": 0.076, + "step": 3441 + }, + { + "epoch": 3.8657868875473818, + "grad_norm": 0.453205839637241, + "learning_rate": 2.966471156802477e-06, + "loss": 0.075, + "step": 3442 + }, + { + "epoch": 3.86691000982732, + "grad_norm": 0.4288959454696245, + "learning_rate": 2.960897360215255e-06, + "loss": 0.0803, + "step": 3443 + }, + { + "epoch": 3.868033132107258, + "grad_norm": 0.43608110599439925, + "learning_rate": 2.9553278948653484e-06, + "loss": 0.0804, + "step": 3444 + }, + { + "epoch": 3.8691562543871965, + "grad_norm": 0.4411598218041702, + "learning_rate": 2.949762764179711e-06, + "loss": 0.0791, + "step": 3445 + }, + { + "epoch": 3.870279376667135, + "grad_norm": 0.45413949768533496, + "learning_rate": 2.9442019715826318e-06, + "loss": 0.0777, + "step": 3446 + }, + { + "epoch": 3.871402498947073, + "grad_norm": 0.4448584359461084, + "learning_rate": 2.938645520495723e-06, + "loss": 0.0769, + "step": 3447 + }, + { + "epoch": 3.872525621227011, + "grad_norm": 0.4551946661458706, + "learning_rate": 2.933093414337932e-06, + "loss": 0.0783, + "step": 3448 + }, + { + "epoch": 3.8736487435069495, + "grad_norm": 0.4458905959273584, + "learning_rate": 2.927545656525531e-06, + "loss": 0.0765, + "step": 3449 + }, + { + "epoch": 3.8747718657868875, + "grad_norm": 0.4281315602250711, + "learning_rate": 2.922002250472119e-06, + "loss": 0.0749, + "step": 3450 + }, + { + "epoch": 3.875894988066826, + "grad_norm": 0.43195709338842564, + "learning_rate": 2.9164631995886095e-06, + "loss": 0.0784, + "step": 3451 + }, + { + "epoch": 3.8770181103467642, + "grad_norm": 0.415873215959344, + "learning_rate": 2.9109285072832437e-06, + "loss": 0.0783, + "step": 3452 + }, + { + "epoch": 3.878141232626702, + "grad_norm": 0.450371175353607, + "learning_rate": 2.9053981769615792e-06, + "loss": 0.0779, + "step": 3453 + }, + { + "epoch": 3.8792643549066406, + "grad_norm": 0.47287548262861223, + "learning_rate": 2.899872212026489e-06, + "loss": 0.089, + "step": 3454 + }, + { + "epoch": 3.880387477186579, + "grad_norm": 0.44671822443338133, + "learning_rate": 2.894350615878163e-06, + "loss": 0.0771, + "step": 3455 + }, + { + "epoch": 3.881510599466517, + "grad_norm": 0.4287761263930842, + "learning_rate": 2.8888333919140954e-06, + "loss": 0.0745, + "step": 3456 + }, + { + "epoch": 3.8826337217464553, + "grad_norm": 0.42522941098200495, + "learning_rate": 2.883320543529096e-06, + "loss": 0.0733, + "step": 3457 + }, + { + "epoch": 3.8837568440263937, + "grad_norm": 0.46005795014191836, + "learning_rate": 2.8778120741152805e-06, + "loss": 0.0813, + "step": 3458 + }, + { + "epoch": 3.8848799663063316, + "grad_norm": 0.43782850129874495, + "learning_rate": 2.872307987062073e-06, + "loss": 0.0809, + "step": 3459 + }, + { + "epoch": 3.88600308858627, + "grad_norm": 0.42888285045593805, + "learning_rate": 2.8668082857562006e-06, + "loss": 0.0759, + "step": 3460 + }, + { + "epoch": 3.887126210866208, + "grad_norm": 0.4536689216993471, + "learning_rate": 2.8613129735816838e-06, + "loss": 0.0819, + "step": 3461 + }, + { + "epoch": 3.8882493331461463, + "grad_norm": 0.43211539049911346, + "learning_rate": 2.855822053919852e-06, + "loss": 0.0855, + "step": 3462 + }, + { + "epoch": 3.8893724554260847, + "grad_norm": 0.42986836248333826, + "learning_rate": 2.8503355301493298e-06, + "loss": 0.0747, + "step": 3463 + }, + { + "epoch": 3.8904955777060226, + "grad_norm": 0.441832461225257, + "learning_rate": 2.8448534056460332e-06, + "loss": 0.0767, + "step": 3464 + }, + { + "epoch": 3.891618699985961, + "grad_norm": 0.4620375105987588, + "learning_rate": 2.839375683783179e-06, + "loss": 0.0832, + "step": 3465 + }, + { + "epoch": 3.892741822265899, + "grad_norm": 0.43964940953553355, + "learning_rate": 2.833902367931262e-06, + "loss": 0.0746, + "step": 3466 + }, + { + "epoch": 3.8938649445458373, + "grad_norm": 0.464163071227373, + "learning_rate": 2.8284334614580767e-06, + "loss": 0.082, + "step": 3467 + }, + { + "epoch": 3.8949880668257757, + "grad_norm": 0.4290775892676479, + "learning_rate": 2.822968967728703e-06, + "loss": 0.0742, + "step": 3468 + }, + { + "epoch": 3.8961111891057136, + "grad_norm": 0.4208068309684712, + "learning_rate": 2.8175088901055026e-06, + "loss": 0.0751, + "step": 3469 + }, + { + "epoch": 3.897234311385652, + "grad_norm": 0.4313279478607281, + "learning_rate": 2.8120532319481255e-06, + "loss": 0.0784, + "step": 3470 + }, + { + "epoch": 3.8983574336655904, + "grad_norm": 0.4576364929353952, + "learning_rate": 2.8066019966134907e-06, + "loss": 0.0752, + "step": 3471 + }, + { + "epoch": 3.8994805559455283, + "grad_norm": 0.45124729742636177, + "learning_rate": 2.801155187455807e-06, + "loss": 0.0812, + "step": 3472 + }, + { + "epoch": 3.9006036782254667, + "grad_norm": 0.4274779842231408, + "learning_rate": 2.7957128078265574e-06, + "loss": 0.076, + "step": 3473 + }, + { + "epoch": 3.901726800505405, + "grad_norm": 0.4378140528997512, + "learning_rate": 2.790274861074497e-06, + "loss": 0.0736, + "step": 3474 + }, + { + "epoch": 3.902849922785343, + "grad_norm": 0.4343770792886928, + "learning_rate": 2.7848413505456564e-06, + "loss": 0.0743, + "step": 3475 + }, + { + "epoch": 3.9039730450652814, + "grad_norm": 0.4269837178553136, + "learning_rate": 2.7794122795833276e-06, + "loss": 0.0785, + "step": 3476 + }, + { + "epoch": 3.90509616734522, + "grad_norm": 0.44957322685953266, + "learning_rate": 2.7739876515280838e-06, + "loss": 0.0744, + "step": 3477 + }, + { + "epoch": 3.9062192896251577, + "grad_norm": 0.4336825511389323, + "learning_rate": 2.7685674697177568e-06, + "loss": 0.0721, + "step": 3478 + }, + { + "epoch": 3.907342411905096, + "grad_norm": 0.44723299851333226, + "learning_rate": 2.7631517374874427e-06, + "loss": 0.0744, + "step": 3479 + }, + { + "epoch": 3.9084655341850345, + "grad_norm": 0.44066946939094515, + "learning_rate": 2.7577404581695035e-06, + "loss": 0.0768, + "step": 3480 + }, + { + "epoch": 3.9095886564649724, + "grad_norm": 0.4500736384140416, + "learning_rate": 2.752333635093558e-06, + "loss": 0.0754, + "step": 3481 + }, + { + "epoch": 3.910711778744911, + "grad_norm": 0.4594645625800169, + "learning_rate": 2.746931271586484e-06, + "loss": 0.0803, + "step": 3482 + }, + { + "epoch": 3.911834901024849, + "grad_norm": 0.42877398862098437, + "learning_rate": 2.7415333709724168e-06, + "loss": 0.0736, + "step": 3483 + }, + { + "epoch": 3.912958023304787, + "grad_norm": 0.42734145184760597, + "learning_rate": 2.7361399365727404e-06, + "loss": 0.0791, + "step": 3484 + }, + { + "epoch": 3.9140811455847255, + "grad_norm": 0.4340445049242047, + "learning_rate": 2.7307509717060954e-06, + "loss": 0.0819, + "step": 3485 + }, + { + "epoch": 3.915204267864664, + "grad_norm": 0.4341246214018538, + "learning_rate": 2.725366479688373e-06, + "loss": 0.0756, + "step": 3486 + }, + { + "epoch": 3.916327390144602, + "grad_norm": 0.4236324146454293, + "learning_rate": 2.719986463832708e-06, + "loss": 0.0757, + "step": 3487 + }, + { + "epoch": 3.9174505124245402, + "grad_norm": 0.46626173343907984, + "learning_rate": 2.714610927449486e-06, + "loss": 0.0853, + "step": 3488 + }, + { + "epoch": 3.9185736347044786, + "grad_norm": 0.4377237420668139, + "learning_rate": 2.7092398738463345e-06, + "loss": 0.0767, + "step": 3489 + }, + { + "epoch": 3.9196967569844166, + "grad_norm": 0.44860358330503586, + "learning_rate": 2.7038733063281177e-06, + "loss": 0.073, + "step": 3490 + }, + { + "epoch": 3.920819879264355, + "grad_norm": 0.43604700182859185, + "learning_rate": 2.698511228196945e-06, + "loss": 0.0763, + "step": 3491 + }, + { + "epoch": 3.9219430015442933, + "grad_norm": 0.4401389309270918, + "learning_rate": 2.6931536427521632e-06, + "loss": 0.0846, + "step": 3492 + }, + { + "epoch": 3.9230661238242313, + "grad_norm": 0.4331666233322056, + "learning_rate": 2.687800553290353e-06, + "loss": 0.0713, + "step": 3493 + }, + { + "epoch": 3.9241892461041696, + "grad_norm": 0.44560495275667983, + "learning_rate": 2.6824519631053324e-06, + "loss": 0.0756, + "step": 3494 + }, + { + "epoch": 3.925312368384108, + "grad_norm": 0.4209276031297463, + "learning_rate": 2.6771078754881417e-06, + "loss": 0.0712, + "step": 3495 + }, + { + "epoch": 3.926435490664046, + "grad_norm": 0.4531187078058862, + "learning_rate": 2.6717682937270605e-06, + "loss": 0.0865, + "step": 3496 + }, + { + "epoch": 3.9275586129439843, + "grad_norm": 0.42791348822861297, + "learning_rate": 2.6664332211075915e-06, + "loss": 0.0734, + "step": 3497 + }, + { + "epoch": 3.9286817352239227, + "grad_norm": 0.42910458284895564, + "learning_rate": 2.6611026609124647e-06, + "loss": 0.0767, + "step": 3498 + }, + { + "epoch": 3.9298048575038607, + "grad_norm": 0.4360003667897708, + "learning_rate": 2.6557766164216334e-06, + "loss": 0.0821, + "step": 3499 + }, + { + "epoch": 3.930927979783799, + "grad_norm": 0.4473465009354422, + "learning_rate": 2.6504550909122674e-06, + "loss": 0.0824, + "step": 3500 + }, + { + "epoch": 3.9320511020637374, + "grad_norm": 0.4559995591973532, + "learning_rate": 2.6451380876587617e-06, + "loss": 0.0773, + "step": 3501 + }, + { + "epoch": 3.9331742243436754, + "grad_norm": 0.44013072714573725, + "learning_rate": 2.639825609932727e-06, + "loss": 0.0773, + "step": 3502 + }, + { + "epoch": 3.9342973466236137, + "grad_norm": 0.4385601175313549, + "learning_rate": 2.63451766100299e-06, + "loss": 0.0766, + "step": 3503 + }, + { + "epoch": 3.935420468903552, + "grad_norm": 0.46222160717340305, + "learning_rate": 2.6292142441355915e-06, + "loss": 0.0834, + "step": 3504 + }, + { + "epoch": 3.93654359118349, + "grad_norm": 0.44847101380558047, + "learning_rate": 2.6239153625937786e-06, + "loss": 0.0777, + "step": 3505 + }, + { + "epoch": 3.9376667134634284, + "grad_norm": 0.4335638035588458, + "learning_rate": 2.6186210196380135e-06, + "loss": 0.0745, + "step": 3506 + }, + { + "epoch": 3.938789835743367, + "grad_norm": 0.45155174333318204, + "learning_rate": 2.613331218525963e-06, + "loss": 0.0813, + "step": 3507 + }, + { + "epoch": 3.9399129580233048, + "grad_norm": 0.44021250781644283, + "learning_rate": 2.6080459625124997e-06, + "loss": 0.0824, + "step": 3508 + }, + { + "epoch": 3.941036080303243, + "grad_norm": 0.42626782736040425, + "learning_rate": 2.602765254849704e-06, + "loss": 0.0759, + "step": 3509 + }, + { + "epoch": 3.942159202583181, + "grad_norm": 0.4502768942497821, + "learning_rate": 2.597489098786847e-06, + "loss": 0.0808, + "step": 3510 + }, + { + "epoch": 3.9432823248631195, + "grad_norm": 0.46342459388260226, + "learning_rate": 2.5922174975704083e-06, + "loss": 0.0765, + "step": 3511 + }, + { + "epoch": 3.944405447143058, + "grad_norm": 0.43079292525854407, + "learning_rate": 2.5869504544440625e-06, + "loss": 0.0804, + "step": 3512 + }, + { + "epoch": 3.945528569422996, + "grad_norm": 0.43315675855851205, + "learning_rate": 2.58168797264868e-06, + "loss": 0.0726, + "step": 3513 + }, + { + "epoch": 3.946651691702934, + "grad_norm": 0.44735874470841014, + "learning_rate": 2.576430055422324e-06, + "loss": 0.0743, + "step": 3514 + }, + { + "epoch": 3.947774813982872, + "grad_norm": 0.4564757971460192, + "learning_rate": 2.5711767060002457e-06, + "loss": 0.0756, + "step": 3515 + }, + { + "epoch": 3.9488979362628105, + "grad_norm": 0.44223205696360607, + "learning_rate": 2.5659279276148896e-06, + "loss": 0.0718, + "step": 3516 + }, + { + "epoch": 3.950021058542749, + "grad_norm": 0.452807039724236, + "learning_rate": 2.5606837234958893e-06, + "loss": 0.0807, + "step": 3517 + }, + { + "epoch": 3.951144180822687, + "grad_norm": 0.44686811295375434, + "learning_rate": 2.5554440968700587e-06, + "loss": 0.0763, + "step": 3518 + }, + { + "epoch": 3.952267303102625, + "grad_norm": 0.43505801163376684, + "learning_rate": 2.550209050961403e-06, + "loss": 0.0706, + "step": 3519 + }, + { + "epoch": 3.9533904253825636, + "grad_norm": 0.4247883961735843, + "learning_rate": 2.544978588991096e-06, + "loss": 0.0744, + "step": 3520 + }, + { + "epoch": 3.9545135476625015, + "grad_norm": 0.45479121928721894, + "learning_rate": 2.5397527141775025e-06, + "loss": 0.0843, + "step": 3521 + }, + { + "epoch": 3.95563666994244, + "grad_norm": 0.4374273400030285, + "learning_rate": 2.534531429736159e-06, + "loss": 0.074, + "step": 3522 + }, + { + "epoch": 3.9567597922223783, + "grad_norm": 0.4149773313963219, + "learning_rate": 2.5293147388797813e-06, + "loss": 0.0708, + "step": 3523 + }, + { + "epoch": 3.957882914502316, + "grad_norm": 0.4295010740541775, + "learning_rate": 2.524102644818256e-06, + "loss": 0.0758, + "step": 3524 + }, + { + "epoch": 3.9590060367822546, + "grad_norm": 0.4486488019169518, + "learning_rate": 2.5188951507586422e-06, + "loss": 0.0808, + "step": 3525 + }, + { + "epoch": 3.960129159062193, + "grad_norm": 0.4462521068537958, + "learning_rate": 2.5136922599051684e-06, + "loss": 0.0733, + "step": 3526 + }, + { + "epoch": 3.961252281342131, + "grad_norm": 0.44838324716484473, + "learning_rate": 2.508493975459232e-06, + "loss": 0.0813, + "step": 3527 + }, + { + "epoch": 3.9623754036220693, + "grad_norm": 0.4313406939987042, + "learning_rate": 2.50330030061939e-06, + "loss": 0.0764, + "step": 3528 + }, + { + "epoch": 3.9634985259020077, + "grad_norm": 0.429785435524192, + "learning_rate": 2.498111238581371e-06, + "loss": 0.0758, + "step": 3529 + }, + { + "epoch": 3.9646216481819456, + "grad_norm": 0.4417273682496011, + "learning_rate": 2.492926792538061e-06, + "loss": 0.0812, + "step": 3530 + }, + { + "epoch": 3.965744770461884, + "grad_norm": 0.42584217521194967, + "learning_rate": 2.487746965679507e-06, + "loss": 0.0844, + "step": 3531 + }, + { + "epoch": 3.9668678927418224, + "grad_norm": 0.4512431805814872, + "learning_rate": 2.4825717611929144e-06, + "loss": 0.0844, + "step": 3532 + }, + { + "epoch": 3.9679910150217603, + "grad_norm": 0.44924315703937545, + "learning_rate": 2.4774011822626455e-06, + "loss": 0.0827, + "step": 3533 + }, + { + "epoch": 3.9691141373016987, + "grad_norm": 0.45652249852342425, + "learning_rate": 2.472235232070208e-06, + "loss": 0.085, + "step": 3534 + }, + { + "epoch": 3.970237259581637, + "grad_norm": 0.4313367900118166, + "learning_rate": 2.4670739137942723e-06, + "loss": 0.0784, + "step": 3535 + }, + { + "epoch": 3.971360381861575, + "grad_norm": 0.43828415959844774, + "learning_rate": 2.4619172306106533e-06, + "loss": 0.0759, + "step": 3536 + }, + { + "epoch": 3.9724835041415134, + "grad_norm": 0.4205420662994824, + "learning_rate": 2.456765185692315e-06, + "loss": 0.0765, + "step": 3537 + }, + { + "epoch": 3.973606626421452, + "grad_norm": 0.4438761928014099, + "learning_rate": 2.451617782209371e-06, + "loss": 0.077, + "step": 3538 + }, + { + "epoch": 3.9747297487013897, + "grad_norm": 0.4164454101954422, + "learning_rate": 2.446475023329068e-06, + "loss": 0.0764, + "step": 3539 + }, + { + "epoch": 3.975852870981328, + "grad_norm": 0.4441939122586264, + "learning_rate": 2.441336912215807e-06, + "loss": 0.0797, + "step": 3540 + }, + { + "epoch": 3.9769759932612665, + "grad_norm": 0.42960608918880705, + "learning_rate": 2.4362034520311216e-06, + "loss": 0.0729, + "step": 3541 + }, + { + "epoch": 3.9780991155412044, + "grad_norm": 0.44957178328850944, + "learning_rate": 2.4310746459336896e-06, + "loss": 0.0802, + "step": 3542 + }, + { + "epoch": 3.979222237821143, + "grad_norm": 0.4329168884237763, + "learning_rate": 2.4259504970793226e-06, + "loss": 0.0727, + "step": 3543 + }, + { + "epoch": 3.980345360101081, + "grad_norm": 0.45775710829454436, + "learning_rate": 2.4208310086209607e-06, + "loss": 0.0765, + "step": 3544 + }, + { + "epoch": 3.981468482381019, + "grad_norm": 0.4377294306740636, + "learning_rate": 2.415716183708684e-06, + "loss": 0.0752, + "step": 3545 + }, + { + "epoch": 3.9825916046609575, + "grad_norm": 0.443470429696325, + "learning_rate": 2.4106060254897002e-06, + "loss": 0.0803, + "step": 3546 + }, + { + "epoch": 3.983714726940896, + "grad_norm": 0.45578066038549137, + "learning_rate": 2.405500537108347e-06, + "loss": 0.0805, + "step": 3547 + }, + { + "epoch": 3.984837849220834, + "grad_norm": 0.45327149698238034, + "learning_rate": 2.4003997217060893e-06, + "loss": 0.0744, + "step": 3548 + }, + { + "epoch": 3.985960971500772, + "grad_norm": 0.4679219665710609, + "learning_rate": 2.395303582421511e-06, + "loss": 0.0817, + "step": 3549 + }, + { + "epoch": 3.9870840937807106, + "grad_norm": 0.4461073700564401, + "learning_rate": 2.390212122390323e-06, + "loss": 0.0795, + "step": 3550 + }, + { + "epoch": 3.9882072160606485, + "grad_norm": 0.44258640272166866, + "learning_rate": 2.385125344745359e-06, + "loss": 0.0764, + "step": 3551 + }, + { + "epoch": 3.989330338340587, + "grad_norm": 0.4287951915573227, + "learning_rate": 2.3800432526165683e-06, + "loss": 0.0819, + "step": 3552 + }, + { + "epoch": 3.9904534606205253, + "grad_norm": 0.4196233297951179, + "learning_rate": 2.37496584913102e-06, + "loss": 0.0768, + "step": 3553 + }, + { + "epoch": 3.9915765829004632, + "grad_norm": 0.43264036028578096, + "learning_rate": 2.369893137412893e-06, + "loss": 0.0827, + "step": 3554 + }, + { + "epoch": 3.9926997051804016, + "grad_norm": 0.44435177476439064, + "learning_rate": 2.3648251205834827e-06, + "loss": 0.0747, + "step": 3555 + }, + { + "epoch": 3.99382282746034, + "grad_norm": 0.4207067931671747, + "learning_rate": 2.3597618017611977e-06, + "loss": 0.0734, + "step": 3556 + }, + { + "epoch": 3.994945949740278, + "grad_norm": 0.42953451358834877, + "learning_rate": 2.3547031840615532e-06, + "loss": 0.0737, + "step": 3557 + }, + { + "epoch": 3.9960690720202163, + "grad_norm": 0.45552658779976596, + "learning_rate": 2.3496492705971753e-06, + "loss": 0.0766, + "step": 3558 + }, + { + "epoch": 3.9971921943001543, + "grad_norm": 0.4389923110529137, + "learning_rate": 2.3446000644777856e-06, + "loss": 0.0765, + "step": 3559 + }, + { + "epoch": 3.9983153165800926, + "grad_norm": 0.44857662285405525, + "learning_rate": 2.339555568810221e-06, + "loss": 0.0774, + "step": 3560 + }, + { + "epoch": 3.999438438860031, + "grad_norm": 0.4393713835259846, + "learning_rate": 2.334515786698415e-06, + "loss": 0.0767, + "step": 3561 + }, + { + "epoch": 4.000561561139969, + "grad_norm": 0.7414061671456206, + "learning_rate": 2.329480721243401e-06, + "loss": 0.0959, + "step": 3562 + }, + { + "epoch": 4.001684683419907, + "grad_norm": 0.35974986114637436, + "learning_rate": 2.3244503755433077e-06, + "loss": 0.0433, + "step": 3563 + }, + { + "epoch": 4.002807805699845, + "grad_norm": 0.35204376720792546, + "learning_rate": 2.3194247526933644e-06, + "loss": 0.0427, + "step": 3564 + }, + { + "epoch": 4.003930927979784, + "grad_norm": 0.30547691019965295, + "learning_rate": 2.3144038557858915e-06, + "loss": 0.037, + "step": 3565 + }, + { + "epoch": 4.005054050259722, + "grad_norm": 0.3503089261671818, + "learning_rate": 2.3093876879103027e-06, + "loss": 0.045, + "step": 3566 + }, + { + "epoch": 4.00617717253966, + "grad_norm": 0.28985834180869824, + "learning_rate": 2.3043762521531e-06, + "loss": 0.0356, + "step": 3567 + }, + { + "epoch": 4.007300294819599, + "grad_norm": 0.3078892585372151, + "learning_rate": 2.2993695515978767e-06, + "loss": 0.041, + "step": 3568 + }, + { + "epoch": 4.008423417099537, + "grad_norm": 0.2793695957969276, + "learning_rate": 2.2943675893253094e-06, + "loss": 0.035, + "step": 3569 + }, + { + "epoch": 4.009546539379475, + "grad_norm": 0.3126139214392695, + "learning_rate": 2.2893703684131608e-06, + "loss": 0.0409, + "step": 3570 + }, + { + "epoch": 4.0106696616594135, + "grad_norm": 0.32235788779278907, + "learning_rate": 2.284377891936277e-06, + "loss": 0.039, + "step": 3571 + }, + { + "epoch": 4.0117927839393515, + "grad_norm": 0.3386774994355639, + "learning_rate": 2.2793901629665847e-06, + "loss": 0.0394, + "step": 3572 + }, + { + "epoch": 4.012915906219289, + "grad_norm": 0.33966921939041494, + "learning_rate": 2.2744071845730843e-06, + "loss": 0.0378, + "step": 3573 + }, + { + "epoch": 4.014039028499228, + "grad_norm": 0.3650876310831837, + "learning_rate": 2.26942895982186e-06, + "loss": 0.0352, + "step": 3574 + }, + { + "epoch": 4.015162150779166, + "grad_norm": 0.39322271754253874, + "learning_rate": 2.2644554917760674e-06, + "loss": 0.0388, + "step": 3575 + }, + { + "epoch": 4.016285273059104, + "grad_norm": 0.3603318333009823, + "learning_rate": 2.2594867834959367e-06, + "loss": 0.0349, + "step": 3576 + }, + { + "epoch": 4.017408395339043, + "grad_norm": 0.39766487225833574, + "learning_rate": 2.2545228380387706e-06, + "loss": 0.0392, + "step": 3577 + }, + { + "epoch": 4.018531517618981, + "grad_norm": 0.3879795346602286, + "learning_rate": 2.2495636584589353e-06, + "loss": 0.0344, + "step": 3578 + }, + { + "epoch": 4.019654639898919, + "grad_norm": 0.38693724167691756, + "learning_rate": 2.2446092478078706e-06, + "loss": 0.0391, + "step": 3579 + }, + { + "epoch": 4.020777762178858, + "grad_norm": 0.3794613149433384, + "learning_rate": 2.2396596091340805e-06, + "loss": 0.0354, + "step": 3580 + }, + { + "epoch": 4.021900884458796, + "grad_norm": 0.3992223472216564, + "learning_rate": 2.2347147454831306e-06, + "loss": 0.0418, + "step": 3581 + }, + { + "epoch": 4.0230240067387335, + "grad_norm": 0.3661442590112595, + "learning_rate": 2.2297746598976545e-06, + "loss": 0.0352, + "step": 3582 + }, + { + "epoch": 4.024147129018672, + "grad_norm": 0.37467677700540847, + "learning_rate": 2.2248393554173344e-06, + "loss": 0.039, + "step": 3583 + }, + { + "epoch": 4.02527025129861, + "grad_norm": 0.3601512962433673, + "learning_rate": 2.219908835078921e-06, + "loss": 0.0358, + "step": 3584 + }, + { + "epoch": 4.026393373578548, + "grad_norm": 0.37668176536008646, + "learning_rate": 2.2149831019162173e-06, + "loss": 0.0418, + "step": 3585 + }, + { + "epoch": 4.027516495858487, + "grad_norm": 0.3258768222612785, + "learning_rate": 2.2100621589600813e-06, + "loss": 0.0334, + "step": 3586 + }, + { + "epoch": 4.028639618138425, + "grad_norm": 0.3370222447649643, + "learning_rate": 2.205146009238426e-06, + "loss": 0.0373, + "step": 3587 + }, + { + "epoch": 4.029762740418363, + "grad_norm": 0.32311730754858564, + "learning_rate": 2.2002346557762068e-06, + "loss": 0.0344, + "step": 3588 + }, + { + "epoch": 4.030885862698301, + "grad_norm": 0.3123818705115929, + "learning_rate": 2.1953281015954364e-06, + "loss": 0.0326, + "step": 3589 + }, + { + "epoch": 4.03200898497824, + "grad_norm": 0.3241220437041912, + "learning_rate": 2.190426349715171e-06, + "loss": 0.0341, + "step": 3590 + }, + { + "epoch": 4.033132107258178, + "grad_norm": 0.33250107457893724, + "learning_rate": 2.185529403151514e-06, + "loss": 0.0373, + "step": 3591 + }, + { + "epoch": 4.0342552295381156, + "grad_norm": 0.3170426069163533, + "learning_rate": 2.1806372649176124e-06, + "loss": 0.0327, + "step": 3592 + }, + { + "epoch": 4.035378351818054, + "grad_norm": 0.32095635332227473, + "learning_rate": 2.175749938023647e-06, + "loss": 0.036, + "step": 3593 + }, + { + "epoch": 4.036501474097992, + "grad_norm": 0.3296174400793843, + "learning_rate": 2.170867425476847e-06, + "loss": 0.0345, + "step": 3594 + }, + { + "epoch": 4.03762459637793, + "grad_norm": 0.32420977388665184, + "learning_rate": 2.165989730281475e-06, + "loss": 0.0358, + "step": 3595 + }, + { + "epoch": 4.038747718657869, + "grad_norm": 0.32486539485385396, + "learning_rate": 2.1611168554388353e-06, + "loss": 0.0375, + "step": 3596 + }, + { + "epoch": 4.039870840937807, + "grad_norm": 0.33856386057317117, + "learning_rate": 2.156248803947254e-06, + "loss": 0.0357, + "step": 3597 + }, + { + "epoch": 4.040993963217745, + "grad_norm": 0.32586631725378973, + "learning_rate": 2.1513855788021e-06, + "loss": 0.0354, + "step": 3598 + }, + { + "epoch": 4.042117085497684, + "grad_norm": 0.3308607327766132, + "learning_rate": 2.14652718299577e-06, + "loss": 0.0347, + "step": 3599 + }, + { + "epoch": 4.043240207777622, + "grad_norm": 0.33048976251932244, + "learning_rate": 2.141673619517687e-06, + "loss": 0.0361, + "step": 3600 + }, + { + "epoch": 4.04436333005756, + "grad_norm": 0.32515150803710624, + "learning_rate": 2.1368248913543065e-06, + "loss": 0.035, + "step": 3601 + }, + { + "epoch": 4.0454864523374985, + "grad_norm": 0.3027989162513895, + "learning_rate": 2.1319810014890972e-06, + "loss": 0.0331, + "step": 3602 + }, + { + "epoch": 4.046609574617436, + "grad_norm": 0.32565678045910307, + "learning_rate": 2.127141952902563e-06, + "loss": 0.0371, + "step": 3603 + }, + { + "epoch": 4.047732696897374, + "grad_norm": 0.30745483238302934, + "learning_rate": 2.12230774857222e-06, + "loss": 0.0322, + "step": 3604 + }, + { + "epoch": 4.048855819177313, + "grad_norm": 0.3418209827856129, + "learning_rate": 2.1174783914726106e-06, + "loss": 0.0357, + "step": 3605 + }, + { + "epoch": 4.049978941457251, + "grad_norm": 0.34150940597932833, + "learning_rate": 2.1126538845752918e-06, + "loss": 0.0361, + "step": 3606 + }, + { + "epoch": 4.051102063737189, + "grad_norm": 0.3319716006832906, + "learning_rate": 2.107834230848833e-06, + "loss": 0.0355, + "step": 3607 + }, + { + "epoch": 4.052225186017128, + "grad_norm": 0.3256155221300326, + "learning_rate": 2.1030194332588203e-06, + "loss": 0.0341, + "step": 3608 + }, + { + "epoch": 4.053348308297066, + "grad_norm": 0.3511915328594416, + "learning_rate": 2.098209494767853e-06, + "loss": 0.0357, + "step": 3609 + }, + { + "epoch": 4.054471430577004, + "grad_norm": 0.3389534143051318, + "learning_rate": 2.0934044183355384e-06, + "loss": 0.0331, + "step": 3610 + }, + { + "epoch": 4.055594552856943, + "grad_norm": 0.3311633004682248, + "learning_rate": 2.088604206918494e-06, + "loss": 0.0297, + "step": 3611 + }, + { + "epoch": 4.0567176751368805, + "grad_norm": 0.37079247885517713, + "learning_rate": 2.0838088634703412e-06, + "loss": 0.0495, + "step": 3612 + }, + { + "epoch": 4.0578407974168185, + "grad_norm": 0.3436254547868691, + "learning_rate": 2.0790183909417096e-06, + "loss": 0.0354, + "step": 3613 + }, + { + "epoch": 4.058963919696757, + "grad_norm": 0.3537084660888286, + "learning_rate": 2.0742327922802285e-06, + "loss": 0.0364, + "step": 3614 + }, + { + "epoch": 4.060087041976695, + "grad_norm": 0.33720406412003606, + "learning_rate": 2.069452070430529e-06, + "loss": 0.0357, + "step": 3615 + }, + { + "epoch": 4.061210164256633, + "grad_norm": 0.361772020239582, + "learning_rate": 2.0646762283342448e-06, + "loss": 0.0406, + "step": 3616 + }, + { + "epoch": 4.062333286536572, + "grad_norm": 0.3305905026607228, + "learning_rate": 2.059905268929999e-06, + "loss": 0.0291, + "step": 3617 + }, + { + "epoch": 4.06345640881651, + "grad_norm": 0.35558319079029205, + "learning_rate": 2.055139195153417e-06, + "loss": 0.0314, + "step": 3618 + }, + { + "epoch": 4.064579531096448, + "grad_norm": 0.347660893552983, + "learning_rate": 2.0503780099371196e-06, + "loss": 0.0357, + "step": 3619 + }, + { + "epoch": 4.065702653376387, + "grad_norm": 0.35259174178160463, + "learning_rate": 2.045621716210713e-06, + "loss": 0.0314, + "step": 3620 + }, + { + "epoch": 4.066825775656325, + "grad_norm": 0.32281631904843283, + "learning_rate": 2.0408703169008015e-06, + "loss": 0.033, + "step": 3621 + }, + { + "epoch": 4.067948897936263, + "grad_norm": 0.34725153445115975, + "learning_rate": 2.036123814930967e-06, + "loss": 0.0375, + "step": 3622 + }, + { + "epoch": 4.069072020216201, + "grad_norm": 0.35654253964781996, + "learning_rate": 2.0313822132217887e-06, + "loss": 0.0328, + "step": 3623 + }, + { + "epoch": 4.070195142496139, + "grad_norm": 0.33133915489525906, + "learning_rate": 2.0266455146908248e-06, + "loss": 0.0345, + "step": 3624 + }, + { + "epoch": 4.071318264776077, + "grad_norm": 0.36110178776774343, + "learning_rate": 2.0219137222526188e-06, + "loss": 0.0396, + "step": 3625 + }, + { + "epoch": 4.072441387056016, + "grad_norm": 0.32077710176540647, + "learning_rate": 2.0171868388186953e-06, + "loss": 0.0358, + "step": 3626 + }, + { + "epoch": 4.073564509335954, + "grad_norm": 0.3444116573250157, + "learning_rate": 2.0124648672975567e-06, + "loss": 0.0355, + "step": 3627 + }, + { + "epoch": 4.074687631615892, + "grad_norm": 0.3515163459356766, + "learning_rate": 2.007747810594682e-06, + "loss": 0.0367, + "step": 3628 + }, + { + "epoch": 4.075810753895831, + "grad_norm": 0.3450125011830339, + "learning_rate": 2.003035671612532e-06, + "loss": 0.036, + "step": 3629 + }, + { + "epoch": 4.076933876175769, + "grad_norm": 0.3547038161654613, + "learning_rate": 1.9983284532505343e-06, + "loss": 0.0372, + "step": 3630 + }, + { + "epoch": 4.078056998455707, + "grad_norm": 0.3417279392843428, + "learning_rate": 1.9936261584050974e-06, + "loss": 0.0374, + "step": 3631 + }, + { + "epoch": 4.0791801207356455, + "grad_norm": 0.34281518207305883, + "learning_rate": 1.9889287899695887e-06, + "loss": 0.0371, + "step": 3632 + }, + { + "epoch": 4.0803032430155834, + "grad_norm": 0.3272532226131082, + "learning_rate": 1.9842363508343532e-06, + "loss": 0.0344, + "step": 3633 + }, + { + "epoch": 4.081426365295521, + "grad_norm": 0.3362181119467591, + "learning_rate": 1.9795488438867005e-06, + "loss": 0.0325, + "step": 3634 + }, + { + "epoch": 4.08254948757546, + "grad_norm": 0.34910761973320087, + "learning_rate": 1.974866272010908e-06, + "loss": 0.0335, + "step": 3635 + }, + { + "epoch": 4.083672609855398, + "grad_norm": 0.3730488558950869, + "learning_rate": 1.9701886380882073e-06, + "loss": 0.0402, + "step": 3636 + }, + { + "epoch": 4.084795732135336, + "grad_norm": 0.3413923732565504, + "learning_rate": 1.965515944996803e-06, + "loss": 0.0361, + "step": 3637 + }, + { + "epoch": 4.085918854415274, + "grad_norm": 0.33096031668167075, + "learning_rate": 1.960848195611853e-06, + "loss": 0.0329, + "step": 3638 + }, + { + "epoch": 4.087041976695213, + "grad_norm": 0.34388985312179227, + "learning_rate": 1.9561853928054753e-06, + "loss": 0.0377, + "step": 3639 + }, + { + "epoch": 4.088165098975151, + "grad_norm": 0.32146386739388727, + "learning_rate": 1.9515275394467446e-06, + "loss": 0.0349, + "step": 3640 + }, + { + "epoch": 4.089288221255089, + "grad_norm": 0.33807086229307787, + "learning_rate": 1.946874638401688e-06, + "loss": 0.0336, + "step": 3641 + }, + { + "epoch": 4.0904113435350276, + "grad_norm": 0.34620877675756956, + "learning_rate": 1.9422266925332857e-06, + "loss": 0.0383, + "step": 3642 + }, + { + "epoch": 4.0915344658149655, + "grad_norm": 0.36100090203902546, + "learning_rate": 1.9375837047014712e-06, + "loss": 0.0393, + "step": 3643 + }, + { + "epoch": 4.092657588094903, + "grad_norm": 0.34073891008207474, + "learning_rate": 1.9329456777631273e-06, + "loss": 0.0342, + "step": 3644 + }, + { + "epoch": 4.093780710374842, + "grad_norm": 0.343909859895942, + "learning_rate": 1.928312614572083e-06, + "loss": 0.0346, + "step": 3645 + }, + { + "epoch": 4.09490383265478, + "grad_norm": 0.3405997491217741, + "learning_rate": 1.92368451797911e-06, + "loss": 0.0333, + "step": 3646 + }, + { + "epoch": 4.096026954934718, + "grad_norm": 0.3532535917803961, + "learning_rate": 1.919061390831929e-06, + "loss": 0.0371, + "step": 3647 + }, + { + "epoch": 4.097150077214657, + "grad_norm": 0.3402279776506247, + "learning_rate": 1.914443235975201e-06, + "loss": 0.0345, + "step": 3648 + }, + { + "epoch": 4.098273199494595, + "grad_norm": 0.33955904857288616, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.0354, + "step": 3649 + }, + { + "epoch": 4.099396321774533, + "grad_norm": 0.3548495987201884, + "learning_rate": 1.9052218544964473e-06, + "loss": 0.0378, + "step": 3650 + }, + { + "epoch": 4.100519444054472, + "grad_norm": 0.3565039554766224, + "learning_rate": 1.9006186335484422e-06, + "loss": 0.0389, + "step": 3651 + }, + { + "epoch": 4.10164256633441, + "grad_norm": 0.3176237453159487, + "learning_rate": 1.89602039623892e-06, + "loss": 0.0329, + "step": 3652 + }, + { + "epoch": 4.1027656886143475, + "grad_norm": 0.34171584057174337, + "learning_rate": 1.8914271453972277e-06, + "loss": 0.0374, + "step": 3653 + }, + { + "epoch": 4.103888810894286, + "grad_norm": 0.3573392279296198, + "learning_rate": 1.8868388838496433e-06, + "loss": 0.0341, + "step": 3654 + }, + { + "epoch": 4.105011933174224, + "grad_norm": 0.3319986984776665, + "learning_rate": 1.882255614419376e-06, + "loss": 0.0343, + "step": 3655 + }, + { + "epoch": 4.106135055454162, + "grad_norm": 0.33666673936394453, + "learning_rate": 1.8776773399265601e-06, + "loss": 0.0354, + "step": 3656 + }, + { + "epoch": 4.107258177734101, + "grad_norm": 0.36043615873431406, + "learning_rate": 1.8731040631882591e-06, + "loss": 0.0354, + "step": 3657 + }, + { + "epoch": 4.108381300014039, + "grad_norm": 0.3719854481200747, + "learning_rate": 1.8685357870184605e-06, + "loss": 0.0384, + "step": 3658 + }, + { + "epoch": 4.109504422293977, + "grad_norm": 0.3351525535966926, + "learning_rate": 1.8639725142280752e-06, + "loss": 0.0336, + "step": 3659 + }, + { + "epoch": 4.110627544573916, + "grad_norm": 0.33593296146741825, + "learning_rate": 1.8594142476249365e-06, + "loss": 0.0318, + "step": 3660 + }, + { + "epoch": 4.111750666853854, + "grad_norm": 0.34070125655000394, + "learning_rate": 1.8548609900137926e-06, + "loss": 0.0381, + "step": 3661 + }, + { + "epoch": 4.112873789133792, + "grad_norm": 0.3438466556073073, + "learning_rate": 1.8503127441963153e-06, + "loss": 0.0317, + "step": 3662 + }, + { + "epoch": 4.1139969114137305, + "grad_norm": 0.3422040619381074, + "learning_rate": 1.8457695129710885e-06, + "loss": 0.0341, + "step": 3663 + }, + { + "epoch": 4.115120033693668, + "grad_norm": 0.3336224223024909, + "learning_rate": 1.8412312991336146e-06, + "loss": 0.0335, + "step": 3664 + }, + { + "epoch": 4.116243155973606, + "grad_norm": 0.3243595593137732, + "learning_rate": 1.8366981054763077e-06, + "loss": 0.034, + "step": 3665 + }, + { + "epoch": 4.117366278253545, + "grad_norm": 0.3408130322069507, + "learning_rate": 1.8321699347884869e-06, + "loss": 0.0359, + "step": 3666 + }, + { + "epoch": 4.118489400533483, + "grad_norm": 0.33585943316941114, + "learning_rate": 1.8276467898563887e-06, + "loss": 0.0369, + "step": 3667 + }, + { + "epoch": 4.119612522813421, + "grad_norm": 0.3440889810814874, + "learning_rate": 1.8231286734631526e-06, + "loss": 0.0323, + "step": 3668 + }, + { + "epoch": 4.12073564509336, + "grad_norm": 0.34877435967999315, + "learning_rate": 1.818615588388829e-06, + "loss": 0.0391, + "step": 3669 + }, + { + "epoch": 4.121858767373298, + "grad_norm": 0.33501771510850326, + "learning_rate": 1.8141075374103634e-06, + "loss": 0.0334, + "step": 3670 + }, + { + "epoch": 4.122981889653236, + "grad_norm": 0.3477079472833074, + "learning_rate": 1.8096045233016123e-06, + "loss": 0.0372, + "step": 3671 + }, + { + "epoch": 4.124105011933175, + "grad_norm": 0.352773874107671, + "learning_rate": 1.8051065488333285e-06, + "loss": 0.039, + "step": 3672 + }, + { + "epoch": 4.1252281342131125, + "grad_norm": 0.3249660660258163, + "learning_rate": 1.8006136167731658e-06, + "loss": 0.0325, + "step": 3673 + }, + { + "epoch": 4.1263512564930505, + "grad_norm": 0.352952838092871, + "learning_rate": 1.7961257298856783e-06, + "loss": 0.0406, + "step": 3674 + }, + { + "epoch": 4.127474378772989, + "grad_norm": 0.36490521962442574, + "learning_rate": 1.7916428909323057e-06, + "loss": 0.0374, + "step": 3675 + }, + { + "epoch": 4.128597501052927, + "grad_norm": 0.33698323334336555, + "learning_rate": 1.787165102671391e-06, + "loss": 0.0336, + "step": 3676 + }, + { + "epoch": 4.129720623332865, + "grad_norm": 0.32813513335813027, + "learning_rate": 1.7826923678581664e-06, + "loss": 0.034, + "step": 3677 + }, + { + "epoch": 4.130843745612804, + "grad_norm": 0.3358051137115379, + "learning_rate": 1.7782246892447564e-06, + "loss": 0.0323, + "step": 3678 + }, + { + "epoch": 4.131966867892742, + "grad_norm": 0.31172849246423406, + "learning_rate": 1.7737620695801737e-06, + "loss": 0.0306, + "step": 3679 + }, + { + "epoch": 4.13308999017268, + "grad_norm": 0.3726210328067485, + "learning_rate": 1.7693045116103125e-06, + "loss": 0.0382, + "step": 3680 + }, + { + "epoch": 4.134213112452619, + "grad_norm": 0.33518709615258124, + "learning_rate": 1.7648520180779605e-06, + "loss": 0.0319, + "step": 3681 + }, + { + "epoch": 4.135336234732557, + "grad_norm": 0.37299298008128623, + "learning_rate": 1.7604045917227852e-06, + "loss": 0.0364, + "step": 3682 + }, + { + "epoch": 4.136459357012495, + "grad_norm": 0.3471679342038857, + "learning_rate": 1.7559622352813366e-06, + "loss": 0.0346, + "step": 3683 + }, + { + "epoch": 4.137582479292433, + "grad_norm": 0.36575324426037753, + "learning_rate": 1.7515249514870504e-06, + "loss": 0.036, + "step": 3684 + }, + { + "epoch": 4.138705601572371, + "grad_norm": 0.33169330081699266, + "learning_rate": 1.7470927430702277e-06, + "loss": 0.0359, + "step": 3685 + }, + { + "epoch": 4.139828723852309, + "grad_norm": 0.34483958264370257, + "learning_rate": 1.7426656127580598e-06, + "loss": 0.0341, + "step": 3686 + }, + { + "epoch": 4.140951846132248, + "grad_norm": 0.3562439460847369, + "learning_rate": 1.7382435632746086e-06, + "loss": 0.0379, + "step": 3687 + }, + { + "epoch": 4.142074968412186, + "grad_norm": 0.31582095689600725, + "learning_rate": 1.7338265973408097e-06, + "loss": 0.0314, + "step": 3688 + }, + { + "epoch": 4.143198090692124, + "grad_norm": 0.3433619286521538, + "learning_rate": 1.7294147176744725e-06, + "loss": 0.0356, + "step": 3689 + }, + { + "epoch": 4.144321212972062, + "grad_norm": 0.3489147400059873, + "learning_rate": 1.7250079269902708e-06, + "loss": 0.0367, + "step": 3690 + }, + { + "epoch": 4.145444335252001, + "grad_norm": 0.3352194034975239, + "learning_rate": 1.7206062279997538e-06, + "loss": 0.0326, + "step": 3691 + }, + { + "epoch": 4.146567457531939, + "grad_norm": 0.355514581203735, + "learning_rate": 1.7162096234113358e-06, + "loss": 0.0372, + "step": 3692 + }, + { + "epoch": 4.147690579811877, + "grad_norm": 0.3359309769139185, + "learning_rate": 1.7118181159302948e-06, + "loss": 0.0353, + "step": 3693 + }, + { + "epoch": 4.148813702091815, + "grad_norm": 0.3380299997227094, + "learning_rate": 1.7074317082587755e-06, + "loss": 0.0375, + "step": 3694 + }, + { + "epoch": 4.149936824371753, + "grad_norm": 0.3179117693436562, + "learning_rate": 1.703050403095783e-06, + "loss": 0.0319, + "step": 3695 + }, + { + "epoch": 4.151059946651691, + "grad_norm": 0.34267472556917217, + "learning_rate": 1.6986742031371794e-06, + "loss": 0.0375, + "step": 3696 + }, + { + "epoch": 4.15218306893163, + "grad_norm": 0.3550491112940439, + "learning_rate": 1.6943031110756902e-06, + "loss": 0.0384, + "step": 3697 + }, + { + "epoch": 4.153306191211568, + "grad_norm": 0.3441009447815663, + "learning_rate": 1.689937129600897e-06, + "loss": 0.036, + "step": 3698 + }, + { + "epoch": 4.154429313491506, + "grad_norm": 0.35632790653268437, + "learning_rate": 1.6855762613992367e-06, + "loss": 0.0383, + "step": 3699 + }, + { + "epoch": 4.155552435771445, + "grad_norm": 0.3380140910478162, + "learning_rate": 1.6812205091539979e-06, + "loss": 0.0366, + "step": 3700 + }, + { + "epoch": 4.156675558051383, + "grad_norm": 0.36642723566904717, + "learning_rate": 1.676869875545324e-06, + "loss": 0.0425, + "step": 3701 + }, + { + "epoch": 4.157798680331321, + "grad_norm": 0.3486015385918872, + "learning_rate": 1.6725243632502074e-06, + "loss": 0.0365, + "step": 3702 + }, + { + "epoch": 4.1589218026112595, + "grad_norm": 0.34087715875690777, + "learning_rate": 1.668183974942491e-06, + "loss": 0.0354, + "step": 3703 + }, + { + "epoch": 4.1600449248911975, + "grad_norm": 0.3301015274275597, + "learning_rate": 1.6638487132928638e-06, + "loss": 0.0337, + "step": 3704 + }, + { + "epoch": 4.161168047171135, + "grad_norm": 0.32631689981447143, + "learning_rate": 1.6595185809688564e-06, + "loss": 0.0359, + "step": 3705 + }, + { + "epoch": 4.162291169451074, + "grad_norm": 0.33677899223356966, + "learning_rate": 1.6551935806348485e-06, + "loss": 0.0352, + "step": 3706 + }, + { + "epoch": 4.163414291731012, + "grad_norm": 0.344706832968584, + "learning_rate": 1.6508737149520615e-06, + "loss": 0.0344, + "step": 3707 + }, + { + "epoch": 4.16453741401095, + "grad_norm": 0.3319801271011935, + "learning_rate": 1.6465589865785581e-06, + "loss": 0.0311, + "step": 3708 + }, + { + "epoch": 4.165660536290889, + "grad_norm": 0.3469710548961725, + "learning_rate": 1.6422493981692333e-06, + "loss": 0.0343, + "step": 3709 + }, + { + "epoch": 4.166783658570827, + "grad_norm": 0.3459491276034882, + "learning_rate": 1.6379449523758262e-06, + "loss": 0.0341, + "step": 3710 + }, + { + "epoch": 4.167906780850765, + "grad_norm": 0.3664777228112351, + "learning_rate": 1.6336456518469112e-06, + "loss": 0.0398, + "step": 3711 + }, + { + "epoch": 4.169029903130704, + "grad_norm": 0.3404376801657948, + "learning_rate": 1.6293514992278935e-06, + "loss": 0.0326, + "step": 3712 + }, + { + "epoch": 4.170153025410642, + "grad_norm": 0.3568606518647232, + "learning_rate": 1.6250624971610152e-06, + "loss": 0.0331, + "step": 3713 + }, + { + "epoch": 4.1712761476905795, + "grad_norm": 0.34901751847554274, + "learning_rate": 1.6207786482853428e-06, + "loss": 0.0376, + "step": 3714 + }, + { + "epoch": 4.172399269970518, + "grad_norm": 0.3399558991517041, + "learning_rate": 1.6164999552367767e-06, + "loss": 0.0314, + "step": 3715 + }, + { + "epoch": 4.173522392250456, + "grad_norm": 0.33061099011882433, + "learning_rate": 1.6122264206480443e-06, + "loss": 0.0329, + "step": 3716 + }, + { + "epoch": 4.174645514530394, + "grad_norm": 0.3710949614187633, + "learning_rate": 1.6079580471486988e-06, + "loss": 0.033, + "step": 3717 + }, + { + "epoch": 4.175768636810333, + "grad_norm": 0.3769411467365772, + "learning_rate": 1.6036948373651195e-06, + "loss": 0.037, + "step": 3718 + }, + { + "epoch": 4.176891759090271, + "grad_norm": 0.3349294639560482, + "learning_rate": 1.5994367939205012e-06, + "loss": 0.0308, + "step": 3719 + }, + { + "epoch": 4.178014881370209, + "grad_norm": 0.322230527234023, + "learning_rate": 1.5951839194348684e-06, + "loss": 0.033, + "step": 3720 + }, + { + "epoch": 4.179138003650148, + "grad_norm": 0.3423969186367366, + "learning_rate": 1.5909362165250609e-06, + "loss": 0.0333, + "step": 3721 + }, + { + "epoch": 4.180261125930086, + "grad_norm": 0.34016184747977096, + "learning_rate": 1.5866936878047368e-06, + "loss": 0.0329, + "step": 3722 + }, + { + "epoch": 4.181384248210024, + "grad_norm": 0.3606283637266968, + "learning_rate": 1.5824563358843725e-06, + "loss": 0.0385, + "step": 3723 + }, + { + "epoch": 4.1825073704899625, + "grad_norm": 0.36699375787006455, + "learning_rate": 1.5782241633712536e-06, + "loss": 0.0323, + "step": 3724 + }, + { + "epoch": 4.1836304927699, + "grad_norm": 0.34062675166929457, + "learning_rate": 1.5739971728694848e-06, + "loss": 0.0323, + "step": 3725 + }, + { + "epoch": 4.184753615049838, + "grad_norm": 0.3585907476688318, + "learning_rate": 1.5697753669799788e-06, + "loss": 0.0352, + "step": 3726 + }, + { + "epoch": 4.185876737329777, + "grad_norm": 0.3627326102547834, + "learning_rate": 1.5655587483004608e-06, + "loss": 0.0352, + "step": 3727 + }, + { + "epoch": 4.186999859609715, + "grad_norm": 0.3202203931611738, + "learning_rate": 1.5613473194254636e-06, + "loss": 0.0323, + "step": 3728 + }, + { + "epoch": 4.188122981889653, + "grad_norm": 0.35906124847404014, + "learning_rate": 1.5571410829463218e-06, + "loss": 0.033, + "step": 3729 + }, + { + "epoch": 4.189246104169592, + "grad_norm": 0.3407268596851762, + "learning_rate": 1.5529400414511809e-06, + "loss": 0.0363, + "step": 3730 + }, + { + "epoch": 4.19036922644953, + "grad_norm": 0.3301264157569526, + "learning_rate": 1.5487441975249885e-06, + "loss": 0.0308, + "step": 3731 + }, + { + "epoch": 4.191492348729468, + "grad_norm": 0.35754773559343533, + "learning_rate": 1.5445535537494926e-06, + "loss": 0.0341, + "step": 3732 + }, + { + "epoch": 4.192615471009407, + "grad_norm": 0.3546319476031264, + "learning_rate": 1.5403681127032466e-06, + "loss": 0.0354, + "step": 3733 + }, + { + "epoch": 4.1937385932893445, + "grad_norm": 0.3334688214498469, + "learning_rate": 1.5361878769615913e-06, + "loss": 0.0363, + "step": 3734 + }, + { + "epoch": 4.1948617155692824, + "grad_norm": 0.359559039318054, + "learning_rate": 1.5320128490966768e-06, + "loss": 0.0402, + "step": 3735 + }, + { + "epoch": 4.19598483784922, + "grad_norm": 0.3559528249651165, + "learning_rate": 1.5278430316774406e-06, + "loss": 0.0336, + "step": 3736 + }, + { + "epoch": 4.197107960129159, + "grad_norm": 0.3389181856407088, + "learning_rate": 1.5236784272696204e-06, + "loss": 0.0363, + "step": 3737 + }, + { + "epoch": 4.198231082409097, + "grad_norm": 0.33514425271731724, + "learning_rate": 1.5195190384357405e-06, + "loss": 0.0315, + "step": 3738 + }, + { + "epoch": 4.199354204689035, + "grad_norm": 0.34519876807313443, + "learning_rate": 1.5153648677351196e-06, + "loss": 0.0355, + "step": 3739 + }, + { + "epoch": 4.200477326968974, + "grad_norm": 0.32555795899970486, + "learning_rate": 1.5112159177238683e-06, + "loss": 0.0322, + "step": 3740 + }, + { + "epoch": 4.201600449248912, + "grad_norm": 0.3499784570691095, + "learning_rate": 1.5070721909548747e-06, + "loss": 0.036, + "step": 3741 + }, + { + "epoch": 4.20272357152885, + "grad_norm": 0.3454531769390773, + "learning_rate": 1.5029336899778224e-06, + "loss": 0.0351, + "step": 3742 + }, + { + "epoch": 4.203846693808789, + "grad_norm": 0.33874193071072906, + "learning_rate": 1.4988004173391769e-06, + "loss": 0.0342, + "step": 3743 + }, + { + "epoch": 4.2049698160887266, + "grad_norm": 0.3427966066704258, + "learning_rate": 1.4946723755821858e-06, + "loss": 0.0361, + "step": 3744 + }, + { + "epoch": 4.2060929383686645, + "grad_norm": 0.33371125691338493, + "learning_rate": 1.4905495672468784e-06, + "loss": 0.0335, + "step": 3745 + }, + { + "epoch": 4.207216060648603, + "grad_norm": 0.348963362736691, + "learning_rate": 1.4864319948700656e-06, + "loss": 0.0355, + "step": 3746 + }, + { + "epoch": 4.208339182928541, + "grad_norm": 0.3440752736809589, + "learning_rate": 1.4823196609853362e-06, + "loss": 0.0306, + "step": 3747 + }, + { + "epoch": 4.209462305208479, + "grad_norm": 0.3531490219546731, + "learning_rate": 1.4782125681230497e-06, + "loss": 0.0315, + "step": 3748 + }, + { + "epoch": 4.210585427488418, + "grad_norm": 0.3507377236203947, + "learning_rate": 1.4741107188103477e-06, + "loss": 0.0337, + "step": 3749 + }, + { + "epoch": 4.211708549768356, + "grad_norm": 0.33761169130704055, + "learning_rate": 1.4700141155711433e-06, + "loss": 0.0344, + "step": 3750 + }, + { + "epoch": 4.212831672048294, + "grad_norm": 0.5661341702283312, + "learning_rate": 1.465922760926123e-06, + "loss": 0.0341, + "step": 3751 + }, + { + "epoch": 4.213954794328233, + "grad_norm": 0.3575202836894078, + "learning_rate": 1.4618366573927423e-06, + "loss": 0.0367, + "step": 3752 + }, + { + "epoch": 4.215077916608171, + "grad_norm": 0.3513549935655655, + "learning_rate": 1.4577558074852228e-06, + "loss": 0.0374, + "step": 3753 + }, + { + "epoch": 4.216201038888109, + "grad_norm": 0.3315282047974937, + "learning_rate": 1.453680213714559e-06, + "loss": 0.0319, + "step": 3754 + }, + { + "epoch": 4.217324161168047, + "grad_norm": 0.3718187160987202, + "learning_rate": 1.449609878588506e-06, + "loss": 0.0375, + "step": 3755 + }, + { + "epoch": 4.218447283447985, + "grad_norm": 0.3375869813485876, + "learning_rate": 1.4455448046115884e-06, + "loss": 0.0336, + "step": 3756 + }, + { + "epoch": 4.219570405727923, + "grad_norm": 0.34005174509994746, + "learning_rate": 1.4414849942850927e-06, + "loss": 0.0341, + "step": 3757 + }, + { + "epoch": 4.220693528007862, + "grad_norm": 0.33034188119181235, + "learning_rate": 1.4374304501070592e-06, + "loss": 0.0329, + "step": 3758 + }, + { + "epoch": 4.2218166502878, + "grad_norm": 0.3356259815303574, + "learning_rate": 1.433381174572297e-06, + "loss": 0.0342, + "step": 3759 + }, + { + "epoch": 4.222939772567738, + "grad_norm": 0.3410011093554527, + "learning_rate": 1.4293371701723701e-06, + "loss": 0.0335, + "step": 3760 + }, + { + "epoch": 4.224062894847677, + "grad_norm": 0.34434008829412244, + "learning_rate": 1.425298439395597e-06, + "loss": 0.0342, + "step": 3761 + }, + { + "epoch": 4.225186017127615, + "grad_norm": 0.32274850049291476, + "learning_rate": 1.4212649847270576e-06, + "loss": 0.0344, + "step": 3762 + }, + { + "epoch": 4.226309139407553, + "grad_norm": 0.36333470622076525, + "learning_rate": 1.4172368086485755e-06, + "loss": 0.0363, + "step": 3763 + }, + { + "epoch": 4.2274322616874915, + "grad_norm": 0.3479702931501896, + "learning_rate": 1.4132139136387334e-06, + "loss": 0.0359, + "step": 3764 + }, + { + "epoch": 4.2285553839674295, + "grad_norm": 0.33884738891643645, + "learning_rate": 1.4091963021728639e-06, + "loss": 0.0331, + "step": 3765 + }, + { + "epoch": 4.229678506247367, + "grad_norm": 0.35343625411719487, + "learning_rate": 1.4051839767230479e-06, + "loss": 0.0352, + "step": 3766 + }, + { + "epoch": 4.230801628527306, + "grad_norm": 0.3370621206104113, + "learning_rate": 1.4011769397581143e-06, + "loss": 0.035, + "step": 3767 + }, + { + "epoch": 4.231924750807244, + "grad_norm": 0.3693570022771069, + "learning_rate": 1.397175193743633e-06, + "loss": 0.0379, + "step": 3768 + }, + { + "epoch": 4.233047873087182, + "grad_norm": 0.34977327665429203, + "learning_rate": 1.3931787411419252e-06, + "loss": 0.0335, + "step": 3769 + }, + { + "epoch": 4.234170995367121, + "grad_norm": 0.35935908738327177, + "learning_rate": 1.3891875844120517e-06, + "loss": 0.0354, + "step": 3770 + }, + { + "epoch": 4.235294117647059, + "grad_norm": 0.33995180258642155, + "learning_rate": 1.385201726009815e-06, + "loss": 0.0319, + "step": 3771 + }, + { + "epoch": 4.236417239926997, + "grad_norm": 0.33986459891753673, + "learning_rate": 1.3812211683877608e-06, + "loss": 0.0322, + "step": 3772 + }, + { + "epoch": 4.237540362206936, + "grad_norm": 0.3418058114878745, + "learning_rate": 1.3772459139951643e-06, + "loss": 0.0351, + "step": 3773 + }, + { + "epoch": 4.238663484486874, + "grad_norm": 0.3556017630987595, + "learning_rate": 1.3732759652780458e-06, + "loss": 0.0384, + "step": 3774 + }, + { + "epoch": 4.2397866067668115, + "grad_norm": 0.3293972750031755, + "learning_rate": 1.369311324679159e-06, + "loss": 0.0337, + "step": 3775 + }, + { + "epoch": 4.24090972904675, + "grad_norm": 0.3212509101961188, + "learning_rate": 1.3653519946379912e-06, + "loss": 0.0345, + "step": 3776 + }, + { + "epoch": 4.242032851326688, + "grad_norm": 0.34182476674452744, + "learning_rate": 1.3613979775907627e-06, + "loss": 0.0366, + "step": 3777 + }, + { + "epoch": 4.243155973606626, + "grad_norm": 0.3369835165724027, + "learning_rate": 1.3574492759704194e-06, + "loss": 0.0336, + "step": 3778 + }, + { + "epoch": 4.244279095886565, + "grad_norm": 0.3377925126914695, + "learning_rate": 1.3535058922066447e-06, + "loss": 0.035, + "step": 3779 + }, + { + "epoch": 4.245402218166503, + "grad_norm": 0.33513094535575855, + "learning_rate": 1.349567828725844e-06, + "loss": 0.033, + "step": 3780 + }, + { + "epoch": 4.246525340446441, + "grad_norm": 0.3396136926844531, + "learning_rate": 1.3456350879511526e-06, + "loss": 0.0343, + "step": 3781 + }, + { + "epoch": 4.24764846272638, + "grad_norm": 0.34445639039011566, + "learning_rate": 1.3417076723024281e-06, + "loss": 0.0337, + "step": 3782 + }, + { + "epoch": 4.248771585006318, + "grad_norm": 0.3451407077304778, + "learning_rate": 1.3377855841962528e-06, + "loss": 0.0321, + "step": 3783 + }, + { + "epoch": 4.249894707286256, + "grad_norm": 0.35973782888492206, + "learning_rate": 1.333868826045932e-06, + "loss": 0.0384, + "step": 3784 + }, + { + "epoch": 4.2510178295661945, + "grad_norm": 0.351803523339287, + "learning_rate": 1.3299574002614901e-06, + "loss": 0.0355, + "step": 3785 + }, + { + "epoch": 4.252140951846132, + "grad_norm": 0.34276230294757215, + "learning_rate": 1.3260513092496674e-06, + "loss": 0.0334, + "step": 3786 + }, + { + "epoch": 4.25326407412607, + "grad_norm": 0.3690545859860941, + "learning_rate": 1.322150555413927e-06, + "loss": 0.0347, + "step": 3787 + }, + { + "epoch": 4.254387196406009, + "grad_norm": 0.3630986947686924, + "learning_rate": 1.3182551411544454e-06, + "loss": 0.0359, + "step": 3788 + }, + { + "epoch": 4.255510318685947, + "grad_norm": 0.3479212681472171, + "learning_rate": 1.314365068868113e-06, + "loss": 0.0313, + "step": 3789 + }, + { + "epoch": 4.256633440965885, + "grad_norm": 0.34790711153559306, + "learning_rate": 1.3104803409485357e-06, + "loss": 0.0364, + "step": 3790 + }, + { + "epoch": 4.257756563245824, + "grad_norm": 0.3306603188465356, + "learning_rate": 1.3066009597860295e-06, + "loss": 0.0356, + "step": 3791 + }, + { + "epoch": 4.258879685525762, + "grad_norm": 0.34564641948043884, + "learning_rate": 1.302726927767618e-06, + "loss": 0.0357, + "step": 3792 + }, + { + "epoch": 4.2600028078057, + "grad_norm": 0.3585307062453091, + "learning_rate": 1.2988582472770372e-06, + "loss": 0.0376, + "step": 3793 + }, + { + "epoch": 4.261125930085638, + "grad_norm": 0.3297599087847977, + "learning_rate": 1.2949949206947276e-06, + "loss": 0.035, + "step": 3794 + }, + { + "epoch": 4.2622490523655765, + "grad_norm": 0.3581042327920125, + "learning_rate": 1.2911369503978389e-06, + "loss": 0.0427, + "step": 3795 + }, + { + "epoch": 4.263372174645514, + "grad_norm": 0.34205869714374937, + "learning_rate": 1.287284338760222e-06, + "loss": 0.0333, + "step": 3796 + }, + { + "epoch": 4.264495296925452, + "grad_norm": 0.37851885751818304, + "learning_rate": 1.2834370881524294e-06, + "loss": 0.0344, + "step": 3797 + }, + { + "epoch": 4.265618419205391, + "grad_norm": 0.3658512028956833, + "learning_rate": 1.2795952009417178e-06, + "loss": 0.0357, + "step": 3798 + }, + { + "epoch": 4.266741541485329, + "grad_norm": 0.35155118073032193, + "learning_rate": 1.275758679492043e-06, + "loss": 0.0323, + "step": 3799 + }, + { + "epoch": 4.267864663765267, + "grad_norm": 0.3634666090752805, + "learning_rate": 1.2719275261640584e-06, + "loss": 0.0368, + "step": 3800 + }, + { + "epoch": 4.268987786045206, + "grad_norm": 0.3434528301477464, + "learning_rate": 1.2681017433151166e-06, + "loss": 0.0332, + "step": 3801 + }, + { + "epoch": 4.270110908325144, + "grad_norm": 0.3551781416800868, + "learning_rate": 1.264281333299261e-06, + "loss": 0.0358, + "step": 3802 + }, + { + "epoch": 4.271234030605082, + "grad_norm": 0.35818119334156057, + "learning_rate": 1.2604662984672333e-06, + "loss": 0.0358, + "step": 3803 + }, + { + "epoch": 4.272357152885021, + "grad_norm": 0.37688093949994683, + "learning_rate": 1.256656641166466e-06, + "loss": 0.0389, + "step": 3804 + }, + { + "epoch": 4.2734802751649585, + "grad_norm": 0.33982096154376207, + "learning_rate": 1.252852363741084e-06, + "loss": 0.0328, + "step": 3805 + }, + { + "epoch": 4.2746033974448965, + "grad_norm": 0.33879902680647445, + "learning_rate": 1.2490534685319022e-06, + "loss": 0.034, + "step": 3806 + }, + { + "epoch": 4.275726519724835, + "grad_norm": 0.341933822830432, + "learning_rate": 1.2452599578764191e-06, + "loss": 0.0359, + "step": 3807 + }, + { + "epoch": 4.276849642004773, + "grad_norm": 0.35541574499799117, + "learning_rate": 1.241471834108825e-06, + "loss": 0.0375, + "step": 3808 + }, + { + "epoch": 4.277972764284711, + "grad_norm": 0.33786723192624557, + "learning_rate": 1.2376890995599955e-06, + "loss": 0.0334, + "step": 3809 + }, + { + "epoch": 4.27909588656465, + "grad_norm": 0.3586483875205426, + "learning_rate": 1.2339117565574877e-06, + "loss": 0.0344, + "step": 3810 + }, + { + "epoch": 4.280219008844588, + "grad_norm": 0.33925970086487844, + "learning_rate": 1.2301398074255444e-06, + "loss": 0.0317, + "step": 3811 + }, + { + "epoch": 4.281342131124526, + "grad_norm": 0.34451634085780747, + "learning_rate": 1.2263732544850826e-06, + "loss": 0.0324, + "step": 3812 + }, + { + "epoch": 4.282465253404465, + "grad_norm": 0.3534467917065109, + "learning_rate": 1.2226121000537082e-06, + "loss": 0.0353, + "step": 3813 + }, + { + "epoch": 4.283588375684403, + "grad_norm": 0.34295732835300846, + "learning_rate": 1.2188563464456993e-06, + "loss": 0.0344, + "step": 3814 + }, + { + "epoch": 4.284711497964341, + "grad_norm": 0.3467884753627846, + "learning_rate": 1.2151059959720136e-06, + "loss": 0.0353, + "step": 3815 + }, + { + "epoch": 4.285834620244279, + "grad_norm": 0.36125586893387135, + "learning_rate": 1.2113610509402806e-06, + "loss": 0.0352, + "step": 3816 + }, + { + "epoch": 4.286957742524217, + "grad_norm": 0.3550225603303854, + "learning_rate": 1.2076215136548076e-06, + "loss": 0.0332, + "step": 3817 + }, + { + "epoch": 4.288080864804155, + "grad_norm": 0.31427786748412406, + "learning_rate": 1.2038873864165734e-06, + "loss": 0.0297, + "step": 3818 + }, + { + "epoch": 4.289203987084094, + "grad_norm": 0.3748675811463692, + "learning_rate": 1.200158671523226e-06, + "loss": 0.0397, + "step": 3819 + }, + { + "epoch": 4.290327109364032, + "grad_norm": 0.33911019219908817, + "learning_rate": 1.196435371269089e-06, + "loss": 0.0335, + "step": 3820 + }, + { + "epoch": 4.29145023164397, + "grad_norm": 0.3578468474443971, + "learning_rate": 1.1927174879451442e-06, + "loss": 0.0385, + "step": 3821 + }, + { + "epoch": 4.292573353923909, + "grad_norm": 0.325444432765532, + "learning_rate": 1.1890050238390493e-06, + "loss": 0.033, + "step": 3822 + }, + { + "epoch": 4.293696476203847, + "grad_norm": 0.38444358306981036, + "learning_rate": 1.185297981235124e-06, + "loss": 0.0414, + "step": 3823 + }, + { + "epoch": 4.294819598483785, + "grad_norm": 0.31738949785516596, + "learning_rate": 1.1815963624143522e-06, + "loss": 0.0309, + "step": 3824 + }, + { + "epoch": 4.2959427207637235, + "grad_norm": 0.3407919871654333, + "learning_rate": 1.1779001696543802e-06, + "loss": 0.0374, + "step": 3825 + }, + { + "epoch": 4.2970658430436615, + "grad_norm": 0.33587739133134814, + "learning_rate": 1.1742094052295172e-06, + "loss": 0.031, + "step": 3826 + }, + { + "epoch": 4.298188965323599, + "grad_norm": 0.36769806503943114, + "learning_rate": 1.1705240714107301e-06, + "loss": 0.0355, + "step": 3827 + }, + { + "epoch": 4.299312087603538, + "grad_norm": 0.35570550048123933, + "learning_rate": 1.1668441704656463e-06, + "loss": 0.033, + "step": 3828 + }, + { + "epoch": 4.300435209883476, + "grad_norm": 0.3535248321200532, + "learning_rate": 1.1631697046585511e-06, + "loss": 0.0345, + "step": 3829 + }, + { + "epoch": 4.301558332163414, + "grad_norm": 0.34749247467018113, + "learning_rate": 1.1595006762503791e-06, + "loss": 0.0346, + "step": 3830 + }, + { + "epoch": 4.302681454443353, + "grad_norm": 0.338129590666788, + "learning_rate": 1.1558370874987268e-06, + "loss": 0.0326, + "step": 3831 + }, + { + "epoch": 4.303804576723291, + "grad_norm": 0.3494915631570644, + "learning_rate": 1.1521789406578399e-06, + "loss": 0.0351, + "step": 3832 + }, + { + "epoch": 4.304927699003229, + "grad_norm": 0.3378744608121184, + "learning_rate": 1.1485262379786166e-06, + "loss": 0.0341, + "step": 3833 + }, + { + "epoch": 4.306050821283167, + "grad_norm": 0.3419544672603244, + "learning_rate": 1.1448789817086048e-06, + "loss": 0.0344, + "step": 3834 + }, + { + "epoch": 4.307173943563106, + "grad_norm": 0.3291924046121765, + "learning_rate": 1.1412371740920036e-06, + "loss": 0.0316, + "step": 3835 + }, + { + "epoch": 4.3082970658430435, + "grad_norm": 0.35966027874793216, + "learning_rate": 1.137600817369654e-06, + "loss": 0.0362, + "step": 3836 + }, + { + "epoch": 4.309420188122981, + "grad_norm": 0.3557579953353349, + "learning_rate": 1.1339699137790483e-06, + "loss": 0.0348, + "step": 3837 + }, + { + "epoch": 4.31054331040292, + "grad_norm": 0.3437370526418855, + "learning_rate": 1.1303444655543206e-06, + "loss": 0.0351, + "step": 3838 + }, + { + "epoch": 4.311666432682858, + "grad_norm": 0.3322992260093866, + "learning_rate": 1.1267244749262496e-06, + "loss": 0.0361, + "step": 3839 + }, + { + "epoch": 4.312789554962796, + "grad_norm": 0.3506274769092776, + "learning_rate": 1.123109944122256e-06, + "loss": 0.0346, + "step": 3840 + }, + { + "epoch": 4.313912677242735, + "grad_norm": 0.3470389405075643, + "learning_rate": 1.1195008753663982e-06, + "loss": 0.0328, + "step": 3841 + }, + { + "epoch": 4.315035799522673, + "grad_norm": 0.3428900746970874, + "learning_rate": 1.115897270879378e-06, + "loss": 0.0358, + "step": 3842 + }, + { + "epoch": 4.316158921802611, + "grad_norm": 0.3381168228309907, + "learning_rate": 1.1122991328785315e-06, + "loss": 0.0318, + "step": 3843 + }, + { + "epoch": 4.31728204408255, + "grad_norm": 0.35370094895864085, + "learning_rate": 1.1087064635778333e-06, + "loss": 0.0364, + "step": 3844 + }, + { + "epoch": 4.318405166362488, + "grad_norm": 0.34626426769086593, + "learning_rate": 1.1051192651878938e-06, + "loss": 0.0348, + "step": 3845 + }, + { + "epoch": 4.3195282886424256, + "grad_norm": 0.34679291214864055, + "learning_rate": 1.1015375399159533e-06, + "loss": 0.0347, + "step": 3846 + }, + { + "epoch": 4.320651410922364, + "grad_norm": 0.3267291332902231, + "learning_rate": 1.0979612899658875e-06, + "loss": 0.0312, + "step": 3847 + }, + { + "epoch": 4.321774533202302, + "grad_norm": 0.33086846753238264, + "learning_rate": 1.0943905175382018e-06, + "loss": 0.0368, + "step": 3848 + }, + { + "epoch": 4.32289765548224, + "grad_norm": 0.34911582216245135, + "learning_rate": 1.0908252248300332e-06, + "loss": 0.0357, + "step": 3849 + }, + { + "epoch": 4.324020777762179, + "grad_norm": 0.330549307989673, + "learning_rate": 1.0872654140351458e-06, + "loss": 0.0328, + "step": 3850 + }, + { + "epoch": 4.325143900042117, + "grad_norm": 0.3377978034641037, + "learning_rate": 1.0837110873439282e-06, + "loss": 0.0323, + "step": 3851 + }, + { + "epoch": 4.326267022322055, + "grad_norm": 0.3641220302640594, + "learning_rate": 1.080162246943398e-06, + "loss": 0.0386, + "step": 3852 + }, + { + "epoch": 4.327390144601994, + "grad_norm": 0.3597585412481393, + "learning_rate": 1.0766188950171952e-06, + "loss": 0.034, + "step": 3853 + }, + { + "epoch": 4.328513266881932, + "grad_norm": 0.34322666886228304, + "learning_rate": 1.0730810337455856e-06, + "loss": 0.0321, + "step": 3854 + }, + { + "epoch": 4.32963638916187, + "grad_norm": 0.3210457881854714, + "learning_rate": 1.069548665305451e-06, + "loss": 0.0295, + "step": 3855 + }, + { + "epoch": 4.3307595114418085, + "grad_norm": 0.36948984897642523, + "learning_rate": 1.0660217918702965e-06, + "loss": 0.0361, + "step": 3856 + }, + { + "epoch": 4.331882633721746, + "grad_norm": 0.4271058260932982, + "learning_rate": 1.0625004156102492e-06, + "loss": 0.0354, + "step": 3857 + }, + { + "epoch": 4.333005756001684, + "grad_norm": 0.3549117455827815, + "learning_rate": 1.0589845386920473e-06, + "loss": 0.0387, + "step": 3858 + }, + { + "epoch": 4.334128878281623, + "grad_norm": 0.35534169762226714, + "learning_rate": 1.0554741632790532e-06, + "loss": 0.0361, + "step": 3859 + }, + { + "epoch": 4.335252000561561, + "grad_norm": 0.3399213392541908, + "learning_rate": 1.051969291531234e-06, + "loss": 0.0346, + "step": 3860 + }, + { + "epoch": 4.336375122841499, + "grad_norm": 0.3706100498973919, + "learning_rate": 1.0484699256051788e-06, + "loss": 0.0353, + "step": 3861 + }, + { + "epoch": 4.337498245121438, + "grad_norm": 0.3578491650225699, + "learning_rate": 1.0449760676540844e-06, + "loss": 0.0342, + "step": 3862 + }, + { + "epoch": 4.338621367401376, + "grad_norm": 0.32948715354128266, + "learning_rate": 1.0414877198277629e-06, + "loss": 0.0329, + "step": 3863 + }, + { + "epoch": 4.339744489681314, + "grad_norm": 0.3651556904940354, + "learning_rate": 1.038004884272632e-06, + "loss": 0.0348, + "step": 3864 + }, + { + "epoch": 4.340867611961253, + "grad_norm": 0.3745318904885555, + "learning_rate": 1.0345275631317165e-06, + "loss": 0.0371, + "step": 3865 + }, + { + "epoch": 4.3419907342411905, + "grad_norm": 0.3408489479974244, + "learning_rate": 1.0310557585446523e-06, + "loss": 0.0343, + "step": 3866 + }, + { + "epoch": 4.3431138565211285, + "grad_norm": 0.3616137231183904, + "learning_rate": 1.0275894726476787e-06, + "loss": 0.0382, + "step": 3867 + }, + { + "epoch": 4.344236978801067, + "grad_norm": 0.35337736664475866, + "learning_rate": 1.0241287075736384e-06, + "loss": 0.0327, + "step": 3868 + }, + { + "epoch": 4.345360101081005, + "grad_norm": 0.34065649357736527, + "learning_rate": 1.0206734654519802e-06, + "loss": 0.0362, + "step": 3869 + }, + { + "epoch": 4.346483223360943, + "grad_norm": 0.3677984704496751, + "learning_rate": 1.0172237484087522e-06, + "loss": 0.0383, + "step": 3870 + }, + { + "epoch": 4.347606345640882, + "grad_norm": 0.3504661828261147, + "learning_rate": 1.0137795585666023e-06, + "loss": 0.0355, + "step": 3871 + }, + { + "epoch": 4.34872946792082, + "grad_norm": 0.3547684524121742, + "learning_rate": 1.0103408980447793e-06, + "loss": 0.0358, + "step": 3872 + }, + { + "epoch": 4.349852590200758, + "grad_norm": 0.33899684366263133, + "learning_rate": 1.0069077689591279e-06, + "loss": 0.0309, + "step": 3873 + }, + { + "epoch": 4.350975712480697, + "grad_norm": 0.3498349592633247, + "learning_rate": 1.0034801734220922e-06, + "loss": 0.035, + "step": 3874 + }, + { + "epoch": 4.352098834760635, + "grad_norm": 0.34975151345498384, + "learning_rate": 1.0000581135427067e-06, + "loss": 0.0365, + "step": 3875 + }, + { + "epoch": 4.353221957040573, + "grad_norm": 0.3369442861209052, + "learning_rate": 9.966415914266049e-07, + "loss": 0.0349, + "step": 3876 + }, + { + "epoch": 4.354345079320511, + "grad_norm": 0.36960971150127886, + "learning_rate": 9.93230609176008e-07, + "loss": 0.0359, + "step": 3877 + }, + { + "epoch": 4.355468201600449, + "grad_norm": 0.3499527159771044, + "learning_rate": 9.898251688897332e-07, + "loss": 0.0357, + "step": 3878 + }, + { + "epoch": 4.356591323880387, + "grad_norm": 0.3676928856512341, + "learning_rate": 9.86425272663185e-07, + "loss": 0.035, + "step": 3879 + }, + { + "epoch": 4.357714446160326, + "grad_norm": 0.35146864143042605, + "learning_rate": 9.830309225883562e-07, + "loss": 0.0348, + "step": 3880 + }, + { + "epoch": 4.358837568440264, + "grad_norm": 0.3506185390903599, + "learning_rate": 9.796421207538265e-07, + "loss": 0.0343, + "step": 3881 + }, + { + "epoch": 4.359960690720202, + "grad_norm": 0.35319022540725176, + "learning_rate": 9.762588692447661e-07, + "loss": 0.0341, + "step": 3882 + }, + { + "epoch": 4.361083813000141, + "grad_norm": 0.335262563237058, + "learning_rate": 9.728811701429242e-07, + "loss": 0.0345, + "step": 3883 + }, + { + "epoch": 4.362206935280079, + "grad_norm": 0.35087168008448233, + "learning_rate": 9.695090255266394e-07, + "loss": 0.0356, + "step": 3884 + }, + { + "epoch": 4.363330057560017, + "grad_norm": 0.32755003839475955, + "learning_rate": 9.66142437470825e-07, + "loss": 0.0306, + "step": 3885 + }, + { + "epoch": 4.3644531798399555, + "grad_norm": 0.36618851591535595, + "learning_rate": 9.627814080469822e-07, + "loss": 0.0382, + "step": 3886 + }, + { + "epoch": 4.3655763021198934, + "grad_norm": 0.3411679485731454, + "learning_rate": 9.594259393231897e-07, + "loss": 0.031, + "step": 3887 + }, + { + "epoch": 4.366699424399831, + "grad_norm": 0.3488732598923187, + "learning_rate": 9.56076033364105e-07, + "loss": 0.0348, + "step": 3888 + }, + { + "epoch": 4.36782254667977, + "grad_norm": 0.351431363191658, + "learning_rate": 9.527316922309593e-07, + "loss": 0.0358, + "step": 3889 + }, + { + "epoch": 4.368945668959708, + "grad_norm": 0.36029895486382335, + "learning_rate": 9.493929179815631e-07, + "loss": 0.0417, + "step": 3890 + }, + { + "epoch": 4.370068791239646, + "grad_norm": 0.33022530950278417, + "learning_rate": 9.46059712670303e-07, + "loss": 0.0335, + "step": 3891 + }, + { + "epoch": 4.371191913519585, + "grad_norm": 0.31602307458227624, + "learning_rate": 9.427320783481353e-07, + "loss": 0.0296, + "step": 3892 + }, + { + "epoch": 4.372315035799523, + "grad_norm": 0.357480507018708, + "learning_rate": 9.394100170625931e-07, + "loss": 0.0383, + "step": 3893 + }, + { + "epoch": 4.373438158079461, + "grad_norm": 0.35949006197745925, + "learning_rate": 9.360935308577723e-07, + "loss": 0.0391, + "step": 3894 + }, + { + "epoch": 4.374561280359399, + "grad_norm": 0.33189753767969865, + "learning_rate": 9.327826217743452e-07, + "loss": 0.0329, + "step": 3895 + }, + { + "epoch": 4.3756844026393376, + "grad_norm": 0.3508226977683308, + "learning_rate": 9.294772918495521e-07, + "loss": 0.0414, + "step": 3896 + }, + { + "epoch": 4.3768075249192755, + "grad_norm": 0.3643064354558861, + "learning_rate": 9.26177543117197e-07, + "loss": 0.0336, + "step": 3897 + }, + { + "epoch": 4.377930647199213, + "grad_norm": 0.3413054142419292, + "learning_rate": 9.228833776076551e-07, + "loss": 0.0329, + "step": 3898 + }, + { + "epoch": 4.379053769479152, + "grad_norm": 0.3346550439678259, + "learning_rate": 9.195947973478592e-07, + "loss": 0.033, + "step": 3899 + }, + { + "epoch": 4.38017689175909, + "grad_norm": 0.3401382520992714, + "learning_rate": 9.163118043613084e-07, + "loss": 0.0339, + "step": 3900 + }, + { + "epoch": 4.381300014039028, + "grad_norm": 0.37105306794446297, + "learning_rate": 9.130344006680658e-07, + "loss": 0.035, + "step": 3901 + }, + { + "epoch": 4.382423136318967, + "grad_norm": 0.35559365927896963, + "learning_rate": 9.09762588284755e-07, + "loss": 0.0308, + "step": 3902 + }, + { + "epoch": 4.383546258598905, + "grad_norm": 0.34778900056099654, + "learning_rate": 9.064963692245588e-07, + "loss": 0.0367, + "step": 3903 + }, + { + "epoch": 4.384669380878843, + "grad_norm": 0.3431968229053822, + "learning_rate": 9.03235745497213e-07, + "loss": 0.0327, + "step": 3904 + }, + { + "epoch": 4.385792503158782, + "grad_norm": 0.32014372957978515, + "learning_rate": 8.999807191090193e-07, + "loss": 0.0309, + "step": 3905 + }, + { + "epoch": 4.38691562543872, + "grad_norm": 0.3299346644592865, + "learning_rate": 8.967312920628312e-07, + "loss": 0.0351, + "step": 3906 + }, + { + "epoch": 4.3880387477186575, + "grad_norm": 0.34173628201013206, + "learning_rate": 8.934874663580551e-07, + "loss": 0.0337, + "step": 3907 + }, + { + "epoch": 4.389161869998596, + "grad_norm": 0.34178452730956427, + "learning_rate": 8.902492439906552e-07, + "loss": 0.031, + "step": 3908 + }, + { + "epoch": 4.390284992278534, + "grad_norm": 0.35561085437557965, + "learning_rate": 8.870166269531421e-07, + "loss": 0.0366, + "step": 3909 + }, + { + "epoch": 4.391408114558472, + "grad_norm": 0.3607348657261128, + "learning_rate": 8.837896172345827e-07, + "loss": 0.0338, + "step": 3910 + }, + { + "epoch": 4.392531236838411, + "grad_norm": 0.3535133481860961, + "learning_rate": 8.805682168205909e-07, + "loss": 0.0336, + "step": 3911 + }, + { + "epoch": 4.393654359118349, + "grad_norm": 0.33450525221475136, + "learning_rate": 8.773524276933299e-07, + "loss": 0.0321, + "step": 3912 + }, + { + "epoch": 4.394777481398287, + "grad_norm": 0.33740549152276617, + "learning_rate": 8.741422518315113e-07, + "loss": 0.0322, + "step": 3913 + }, + { + "epoch": 4.395900603678226, + "grad_norm": 0.34668595275206976, + "learning_rate": 8.709376912103895e-07, + "loss": 0.0343, + "step": 3914 + }, + { + "epoch": 4.397023725958164, + "grad_norm": 0.3720699327652378, + "learning_rate": 8.677387478017673e-07, + "loss": 0.0383, + "step": 3915 + }, + { + "epoch": 4.398146848238102, + "grad_norm": 0.3310372019137898, + "learning_rate": 8.645454235739903e-07, + "loss": 0.0309, + "step": 3916 + }, + { + "epoch": 4.3992699705180405, + "grad_norm": 0.3446218941497761, + "learning_rate": 8.613577204919455e-07, + "loss": 0.0336, + "step": 3917 + }, + { + "epoch": 4.400393092797978, + "grad_norm": 0.3321270678180654, + "learning_rate": 8.581756405170627e-07, + "loss": 0.0326, + "step": 3918 + }, + { + "epoch": 4.401516215077916, + "grad_norm": 0.3429813545514485, + "learning_rate": 8.54999185607307e-07, + "loss": 0.0337, + "step": 3919 + }, + { + "epoch": 4.402639337357855, + "grad_norm": 0.329083241283035, + "learning_rate": 8.518283577171894e-07, + "loss": 0.032, + "step": 3920 + }, + { + "epoch": 4.403762459637793, + "grad_norm": 0.35028547291655737, + "learning_rate": 8.486631587977545e-07, + "loss": 0.0331, + "step": 3921 + }, + { + "epoch": 4.404885581917731, + "grad_norm": 0.36812246717749864, + "learning_rate": 8.455035907965837e-07, + "loss": 0.0322, + "step": 3922 + }, + { + "epoch": 4.40600870419767, + "grad_norm": 0.3354095575220094, + "learning_rate": 8.423496556577959e-07, + "loss": 0.0315, + "step": 3923 + }, + { + "epoch": 4.407131826477608, + "grad_norm": 0.36423209726760586, + "learning_rate": 8.392013553220391e-07, + "loss": 0.0363, + "step": 3924 + }, + { + "epoch": 4.408254948757546, + "grad_norm": 0.33578769827695937, + "learning_rate": 8.360586917264979e-07, + "loss": 0.0327, + "step": 3925 + }, + { + "epoch": 4.409378071037485, + "grad_norm": 0.35083436656255, + "learning_rate": 8.329216668048878e-07, + "loss": 0.0337, + "step": 3926 + }, + { + "epoch": 4.4105011933174225, + "grad_norm": 0.352145989291215, + "learning_rate": 8.297902824874582e-07, + "loss": 0.0362, + "step": 3927 + }, + { + "epoch": 4.4116243155973605, + "grad_norm": 0.35269237221340394, + "learning_rate": 8.266645407009788e-07, + "loss": 0.0326, + "step": 3928 + }, + { + "epoch": 4.412747437877299, + "grad_norm": 0.34801427609667296, + "learning_rate": 8.235444433687556e-07, + "loss": 0.0366, + "step": 3929 + }, + { + "epoch": 4.413870560157237, + "grad_norm": 0.3399050508413452, + "learning_rate": 8.204299924106196e-07, + "loss": 0.0347, + "step": 3930 + }, + { + "epoch": 4.414993682437175, + "grad_norm": 0.3487557779248968, + "learning_rate": 8.173211897429245e-07, + "loss": 0.0342, + "step": 3931 + }, + { + "epoch": 4.416116804717114, + "grad_norm": 0.3667385663949515, + "learning_rate": 8.142180372785547e-07, + "loss": 0.0359, + "step": 3932 + }, + { + "epoch": 4.417239926997052, + "grad_norm": 0.3333295175583706, + "learning_rate": 8.111205369269104e-07, + "loss": 0.0308, + "step": 3933 + }, + { + "epoch": 4.41836304927699, + "grad_norm": 0.33057100673502793, + "learning_rate": 8.080286905939172e-07, + "loss": 0.0322, + "step": 3934 + }, + { + "epoch": 4.419486171556928, + "grad_norm": 0.3420534039639189, + "learning_rate": 8.049425001820255e-07, + "loss": 0.0374, + "step": 3935 + }, + { + "epoch": 4.420609293836867, + "grad_norm": 0.33462359777080614, + "learning_rate": 8.018619675901995e-07, + "loss": 0.034, + "step": 3936 + }, + { + "epoch": 4.421732416116805, + "grad_norm": 0.3634343403035784, + "learning_rate": 7.987870947139276e-07, + "loss": 0.032, + "step": 3937 + }, + { + "epoch": 4.4228555383967425, + "grad_norm": 0.3732288458675281, + "learning_rate": 7.957178834452095e-07, + "loss": 0.036, + "step": 3938 + }, + { + "epoch": 4.423978660676681, + "grad_norm": 0.3481104484204922, + "learning_rate": 7.926543356725658e-07, + "loss": 0.0315, + "step": 3939 + }, + { + "epoch": 4.425101782956619, + "grad_norm": 0.3400656410063691, + "learning_rate": 7.895964532810318e-07, + "loss": 0.0331, + "step": 3940 + }, + { + "epoch": 4.426224905236557, + "grad_norm": 0.32595754035332963, + "learning_rate": 7.86544238152157e-07, + "loss": 0.0327, + "step": 3941 + }, + { + "epoch": 4.427348027516496, + "grad_norm": 0.35250908793584956, + "learning_rate": 7.834976921640025e-07, + "loss": 0.0357, + "step": 3942 + }, + { + "epoch": 4.428471149796434, + "grad_norm": 0.3520988496330288, + "learning_rate": 7.804568171911398e-07, + "loss": 0.0379, + "step": 3943 + }, + { + "epoch": 4.429594272076372, + "grad_norm": 0.36252580809370655, + "learning_rate": 7.774216151046543e-07, + "loss": 0.034, + "step": 3944 + }, + { + "epoch": 4.430717394356311, + "grad_norm": 0.34372024882244934, + "learning_rate": 7.743920877721378e-07, + "loss": 0.0337, + "step": 3945 + }, + { + "epoch": 4.431840516636249, + "grad_norm": 0.3573305329011033, + "learning_rate": 7.713682370576947e-07, + "loss": 0.034, + "step": 3946 + }, + { + "epoch": 4.432963638916187, + "grad_norm": 0.35271907669417196, + "learning_rate": 7.683500648219322e-07, + "loss": 0.0366, + "step": 3947 + }, + { + "epoch": 4.434086761196125, + "grad_norm": 0.3387988216872993, + "learning_rate": 7.653375729219636e-07, + "loss": 0.0327, + "step": 3948 + }, + { + "epoch": 4.435209883476063, + "grad_norm": 0.34034343693680486, + "learning_rate": 7.623307632114085e-07, + "loss": 0.0345, + "step": 3949 + }, + { + "epoch": 4.436333005756001, + "grad_norm": 0.3391577418176705, + "learning_rate": 7.593296375403914e-07, + "loss": 0.034, + "step": 3950 + }, + { + "epoch": 4.43745612803594, + "grad_norm": 0.34486562074073207, + "learning_rate": 7.563341977555372e-07, + "loss": 0.0345, + "step": 3951 + }, + { + "epoch": 4.438579250315878, + "grad_norm": 0.3412224433899792, + "learning_rate": 7.533444456999728e-07, + "loss": 0.0337, + "step": 3952 + }, + { + "epoch": 4.439702372595816, + "grad_norm": 0.3572984260577703, + "learning_rate": 7.503603832133277e-07, + "loss": 0.0362, + "step": 3953 + }, + { + "epoch": 4.440825494875755, + "grad_norm": 0.3378929389056555, + "learning_rate": 7.473820121317243e-07, + "loss": 0.0335, + "step": 3954 + }, + { + "epoch": 4.441948617155693, + "grad_norm": 0.3586567601428513, + "learning_rate": 7.4440933428779e-07, + "loss": 0.0357, + "step": 3955 + }, + { + "epoch": 4.443071739435631, + "grad_norm": 0.35329443808804784, + "learning_rate": 7.41442351510645e-07, + "loss": 0.0317, + "step": 3956 + }, + { + "epoch": 4.4441948617155695, + "grad_norm": 0.3554120702375195, + "learning_rate": 7.384810656259078e-07, + "loss": 0.0351, + "step": 3957 + }, + { + "epoch": 4.4453179839955075, + "grad_norm": 0.32801148615792985, + "learning_rate": 7.355254784556887e-07, + "loss": 0.0323, + "step": 3958 + }, + { + "epoch": 4.446441106275445, + "grad_norm": 0.35260937314651863, + "learning_rate": 7.325755918185928e-07, + "loss": 0.0361, + "step": 3959 + }, + { + "epoch": 4.447564228555384, + "grad_norm": 0.35121127734872754, + "learning_rate": 7.296314075297196e-07, + "loss": 0.0338, + "step": 3960 + }, + { + "epoch": 4.448687350835322, + "grad_norm": 0.3538367483299584, + "learning_rate": 7.266929274006595e-07, + "loss": 0.0363, + "step": 3961 + }, + { + "epoch": 4.44981047311526, + "grad_norm": 0.3518231420814252, + "learning_rate": 7.237601532394866e-07, + "loss": 0.0393, + "step": 3962 + }, + { + "epoch": 4.450933595395199, + "grad_norm": 0.35867753461888296, + "learning_rate": 7.208330868507718e-07, + "loss": 0.0355, + "step": 3963 + }, + { + "epoch": 4.452056717675137, + "grad_norm": 0.3399901393948421, + "learning_rate": 7.17911730035572e-07, + "loss": 0.0323, + "step": 3964 + }, + { + "epoch": 4.453179839955075, + "grad_norm": 0.36423866949072375, + "learning_rate": 7.149960845914294e-07, + "loss": 0.0374, + "step": 3965 + }, + { + "epoch": 4.454302962235014, + "grad_norm": 0.3566191322953024, + "learning_rate": 7.120861523123735e-07, + "loss": 0.0362, + "step": 3966 + }, + { + "epoch": 4.455426084514952, + "grad_norm": 0.35784219880826335, + "learning_rate": 7.091819349889162e-07, + "loss": 0.0362, + "step": 3967 + }, + { + "epoch": 4.4565492067948895, + "grad_norm": 0.3382634880819678, + "learning_rate": 7.062834344080549e-07, + "loss": 0.0315, + "step": 3968 + }, + { + "epoch": 4.457672329074828, + "grad_norm": 0.3569943519463488, + "learning_rate": 7.033906523532697e-07, + "loss": 0.0349, + "step": 3969 + }, + { + "epoch": 4.458795451354766, + "grad_norm": 0.3650433682446708, + "learning_rate": 7.005035906045199e-07, + "loss": 0.0346, + "step": 3970 + }, + { + "epoch": 4.459918573634704, + "grad_norm": 0.33813321468776913, + "learning_rate": 6.976222509382491e-07, + "loss": 0.0317, + "step": 3971 + }, + { + "epoch": 4.461041695914643, + "grad_norm": 0.3619159838292503, + "learning_rate": 6.947466351273735e-07, + "loss": 0.0359, + "step": 3972 + }, + { + "epoch": 4.462164818194581, + "grad_norm": 0.3763835667376468, + "learning_rate": 6.918767449412933e-07, + "loss": 0.0361, + "step": 3973 + }, + { + "epoch": 4.463287940474519, + "grad_norm": 0.3436876392723535, + "learning_rate": 6.890125821458826e-07, + "loss": 0.0336, + "step": 3974 + }, + { + "epoch": 4.464411062754458, + "grad_norm": 0.32509347109822845, + "learning_rate": 6.86154148503494e-07, + "loss": 0.0308, + "step": 3975 + }, + { + "epoch": 4.465534185034396, + "grad_norm": 0.3525907199308526, + "learning_rate": 6.833014457729525e-07, + "loss": 0.0356, + "step": 3976 + }, + { + "epoch": 4.466657307314334, + "grad_norm": 0.35208517679189183, + "learning_rate": 6.804544757095566e-07, + "loss": 0.0328, + "step": 3977 + }, + { + "epoch": 4.4677804295942725, + "grad_norm": 0.3599858916473201, + "learning_rate": 6.776132400650781e-07, + "loss": 0.0352, + "step": 3978 + }, + { + "epoch": 4.46890355187421, + "grad_norm": 0.3641931626113949, + "learning_rate": 6.747777405877609e-07, + "loss": 0.0393, + "step": 3979 + }, + { + "epoch": 4.470026674154148, + "grad_norm": 0.34746423533313237, + "learning_rate": 6.719479790223204e-07, + "loss": 0.0364, + "step": 3980 + }, + { + "epoch": 4.471149796434087, + "grad_norm": 0.3564297638222409, + "learning_rate": 6.691239571099395e-07, + "loss": 0.0365, + "step": 3981 + }, + { + "epoch": 4.472272918714025, + "grad_norm": 0.34573485421602407, + "learning_rate": 6.663056765882692e-07, + "loss": 0.0329, + "step": 3982 + }, + { + "epoch": 4.473396040993963, + "grad_norm": 0.3597984417319236, + "learning_rate": 6.634931391914278e-07, + "loss": 0.037, + "step": 3983 + }, + { + "epoch": 4.474519163273902, + "grad_norm": 0.3378924186798682, + "learning_rate": 6.606863466500013e-07, + "loss": 0.0343, + "step": 3984 + }, + { + "epoch": 4.47564228555384, + "grad_norm": 0.34863970957372714, + "learning_rate": 6.578853006910402e-07, + "loss": 0.0343, + "step": 3985 + }, + { + "epoch": 4.476765407833778, + "grad_norm": 0.33950262649507096, + "learning_rate": 6.550900030380614e-07, + "loss": 0.0378, + "step": 3986 + }, + { + "epoch": 4.477888530113717, + "grad_norm": 0.35574459663793684, + "learning_rate": 6.523004554110379e-07, + "loss": 0.0351, + "step": 3987 + }, + { + "epoch": 4.4790116523936545, + "grad_norm": 0.32764803743478366, + "learning_rate": 6.495166595264102e-07, + "loss": 0.0318, + "step": 3988 + }, + { + "epoch": 4.4801347746735924, + "grad_norm": 0.3648664364754083, + "learning_rate": 6.467386170970802e-07, + "loss": 0.0359, + "step": 3989 + }, + { + "epoch": 4.481257896953531, + "grad_norm": 0.33735699056875545, + "learning_rate": 6.439663298324061e-07, + "loss": 0.0307, + "step": 3990 + }, + { + "epoch": 4.482381019233469, + "grad_norm": 0.36038648251848654, + "learning_rate": 6.411997994382102e-07, + "loss": 0.0366, + "step": 3991 + }, + { + "epoch": 4.483504141513407, + "grad_norm": 0.3582401185410582, + "learning_rate": 6.384390276167651e-07, + "loss": 0.0373, + "step": 3992 + }, + { + "epoch": 4.484627263793345, + "grad_norm": 0.3514649132145566, + "learning_rate": 6.356840160668054e-07, + "loss": 0.0351, + "step": 3993 + }, + { + "epoch": 4.485750386073284, + "grad_norm": 0.3316545137948871, + "learning_rate": 6.329347664835206e-07, + "loss": 0.0335, + "step": 3994 + }, + { + "epoch": 4.486873508353222, + "grad_norm": 0.38390972496268505, + "learning_rate": 6.30191280558553e-07, + "loss": 0.033, + "step": 3995 + }, + { + "epoch": 4.48799663063316, + "grad_norm": 0.34745468335455126, + "learning_rate": 6.274535599800014e-07, + "loss": 0.0339, + "step": 3996 + }, + { + "epoch": 4.489119752913099, + "grad_norm": 0.3469881466439082, + "learning_rate": 6.247216064324158e-07, + "loss": 0.0348, + "step": 3997 + }, + { + "epoch": 4.4902428751930366, + "grad_norm": 0.33480434506463175, + "learning_rate": 6.219954215967949e-07, + "loss": 0.0324, + "step": 3998 + }, + { + "epoch": 4.4913659974729745, + "grad_norm": 0.33430612710636604, + "learning_rate": 6.192750071505904e-07, + "loss": 0.0327, + "step": 3999 + }, + { + "epoch": 4.492489119752913, + "grad_norm": 0.37230150478039675, + "learning_rate": 6.165603647677054e-07, + "loss": 0.0343, + "step": 4000 + }, + { + "epoch": 4.493612242032851, + "grad_norm": 0.3715774781533714, + "learning_rate": 6.138514961184872e-07, + "loss": 0.0388, + "step": 4001 + }, + { + "epoch": 4.494735364312789, + "grad_norm": 0.34588801151813375, + "learning_rate": 6.111484028697334e-07, + "loss": 0.0357, + "step": 4002 + }, + { + "epoch": 4.495858486592728, + "grad_norm": 0.3473978219479411, + "learning_rate": 6.084510866846882e-07, + "loss": 0.0337, + "step": 4003 + }, + { + "epoch": 4.496981608872666, + "grad_norm": 0.3558220793979365, + "learning_rate": 6.057595492230372e-07, + "loss": 0.035, + "step": 4004 + }, + { + "epoch": 4.498104731152604, + "grad_norm": 0.3617212667424299, + "learning_rate": 6.030737921409169e-07, + "loss": 0.0366, + "step": 4005 + }, + { + "epoch": 4.499227853432543, + "grad_norm": 0.372375829726114, + "learning_rate": 6.003938170908985e-07, + "loss": 0.0372, + "step": 4006 + }, + { + "epoch": 4.500350975712481, + "grad_norm": 0.33813041058043286, + "learning_rate": 5.97719625722003e-07, + "loss": 0.0339, + "step": 4007 + }, + { + "epoch": 4.501474097992419, + "grad_norm": 0.3336235673204082, + "learning_rate": 5.950512196796898e-07, + "loss": 0.0337, + "step": 4008 + }, + { + "epoch": 4.502597220272357, + "grad_norm": 0.3582259509306251, + "learning_rate": 5.923886006058566e-07, + "loss": 0.035, + "step": 4009 + }, + { + "epoch": 4.503720342552295, + "grad_norm": 0.34292021535584205, + "learning_rate": 5.897317701388461e-07, + "loss": 0.0325, + "step": 4010 + }, + { + "epoch": 4.504843464832233, + "grad_norm": 0.3345731631163473, + "learning_rate": 5.870807299134307e-07, + "loss": 0.0337, + "step": 4011 + }, + { + "epoch": 4.505966587112172, + "grad_norm": 0.32035494944471327, + "learning_rate": 5.844354815608267e-07, + "loss": 0.0304, + "step": 4012 + }, + { + "epoch": 4.50708970939211, + "grad_norm": 0.3447654015600138, + "learning_rate": 5.817960267086853e-07, + "loss": 0.0329, + "step": 4013 + }, + { + "epoch": 4.508212831672048, + "grad_norm": 0.34544101773876196, + "learning_rate": 5.791623669810908e-07, + "loss": 0.0339, + "step": 4014 + }, + { + "epoch": 4.509335953951987, + "grad_norm": 0.3728532295897952, + "learning_rate": 5.765345039985648e-07, + "loss": 0.0343, + "step": 4015 + }, + { + "epoch": 4.510459076231925, + "grad_norm": 0.35317009146829786, + "learning_rate": 5.739124393780571e-07, + "loss": 0.0335, + "step": 4016 + }, + { + "epoch": 4.511582198511863, + "grad_norm": 0.3545765005968472, + "learning_rate": 5.71296174732956e-07, + "loss": 0.0345, + "step": 4017 + }, + { + "epoch": 4.5127053207918015, + "grad_norm": 0.342843964056811, + "learning_rate": 5.68685711673076e-07, + "loss": 0.0323, + "step": 4018 + }, + { + "epoch": 4.5138284430717395, + "grad_norm": 0.3639774486483642, + "learning_rate": 5.660810518046644e-07, + "loss": 0.0355, + "step": 4019 + }, + { + "epoch": 4.514951565351677, + "grad_norm": 0.3332083193281611, + "learning_rate": 5.634821967303994e-07, + "loss": 0.0337, + "step": 4020 + }, + { + "epoch": 4.516074687631616, + "grad_norm": 0.34776766829793077, + "learning_rate": 5.608891480493816e-07, + "loss": 0.0351, + "step": 4021 + }, + { + "epoch": 4.517197809911554, + "grad_norm": 0.3483906727553061, + "learning_rate": 5.583019073571427e-07, + "loss": 0.034, + "step": 4022 + }, + { + "epoch": 4.518320932191492, + "grad_norm": 0.35109229875700276, + "learning_rate": 5.557204762456425e-07, + "loss": 0.0326, + "step": 4023 + }, + { + "epoch": 4.519444054471431, + "grad_norm": 0.32870715662554173, + "learning_rate": 5.531448563032626e-07, + "loss": 0.035, + "step": 4024 + }, + { + "epoch": 4.520567176751369, + "grad_norm": 0.35831485527315554, + "learning_rate": 5.505750491148121e-07, + "loss": 0.0371, + "step": 4025 + }, + { + "epoch": 4.521690299031307, + "grad_norm": 0.3717078413941628, + "learning_rate": 5.480110562615182e-07, + "loss": 0.0401, + "step": 4026 + }, + { + "epoch": 4.522813421311246, + "grad_norm": 0.34683971898871724, + "learning_rate": 5.454528793210356e-07, + "loss": 0.0308, + "step": 4027 + }, + { + "epoch": 4.523936543591184, + "grad_norm": 0.33865977054207247, + "learning_rate": 5.429005198674398e-07, + "loss": 0.0334, + "step": 4028 + }, + { + "epoch": 4.5250596658711215, + "grad_norm": 0.35261816775158816, + "learning_rate": 5.403539794712243e-07, + "loss": 0.036, + "step": 4029 + }, + { + "epoch": 4.5261827881510595, + "grad_norm": 0.34517842856104286, + "learning_rate": 5.378132596993047e-07, + "loss": 0.0375, + "step": 4030 + }, + { + "epoch": 4.527305910430998, + "grad_norm": 0.35750348534406, + "learning_rate": 5.352783621150126e-07, + "loss": 0.0355, + "step": 4031 + }, + { + "epoch": 4.528429032710936, + "grad_norm": 0.3454480091845214, + "learning_rate": 5.327492882780993e-07, + "loss": 0.0356, + "step": 4032 + }, + { + "epoch": 4.529552154990874, + "grad_norm": 0.32573268708480096, + "learning_rate": 5.3022603974473e-07, + "loss": 0.0308, + "step": 4033 + }, + { + "epoch": 4.530675277270813, + "grad_norm": 0.34576124173105804, + "learning_rate": 5.277086180674906e-07, + "loss": 0.0336, + "step": 4034 + }, + { + "epoch": 4.531798399550751, + "grad_norm": 0.34623159718083096, + "learning_rate": 5.251970247953752e-07, + "loss": 0.036, + "step": 4035 + }, + { + "epoch": 4.532921521830689, + "grad_norm": 0.35168976903132443, + "learning_rate": 5.226912614737956e-07, + "loss": 0.0326, + "step": 4036 + }, + { + "epoch": 4.534044644110628, + "grad_norm": 0.5382472283002754, + "learning_rate": 5.20191329644577e-07, + "loss": 0.0353, + "step": 4037 + }, + { + "epoch": 4.535167766390566, + "grad_norm": 0.37754077895496924, + "learning_rate": 5.176972308459527e-07, + "loss": 0.0365, + "step": 4038 + }, + { + "epoch": 4.536290888670504, + "grad_norm": 0.37416484246623505, + "learning_rate": 5.152089666125704e-07, + "loss": 0.0394, + "step": 4039 + }, + { + "epoch": 4.537414010950442, + "grad_norm": 0.3590048250258893, + "learning_rate": 5.127265384754865e-07, + "loss": 0.0366, + "step": 4040 + }, + { + "epoch": 4.53853713323038, + "grad_norm": 0.3342358936092815, + "learning_rate": 5.102499479621658e-07, + "loss": 0.0335, + "step": 4041 + }, + { + "epoch": 4.539660255510318, + "grad_norm": 0.3493925300990296, + "learning_rate": 5.07779196596484e-07, + "loss": 0.0339, + "step": 4042 + }, + { + "epoch": 4.540783377790257, + "grad_norm": 0.33636877105880414, + "learning_rate": 5.053142858987192e-07, + "loss": 0.0317, + "step": 4043 + }, + { + "epoch": 4.541906500070195, + "grad_norm": 0.34263176501366066, + "learning_rate": 5.028552173855572e-07, + "loss": 0.0336, + "step": 4044 + }, + { + "epoch": 4.543029622350133, + "grad_norm": 0.37257701817237154, + "learning_rate": 5.004019925700921e-07, + "loss": 0.0392, + "step": 4045 + }, + { + "epoch": 4.544152744630072, + "grad_norm": 0.3369340532849318, + "learning_rate": 4.979546129618184e-07, + "loss": 0.0325, + "step": 4046 + }, + { + "epoch": 4.54527586691001, + "grad_norm": 0.33726138907008607, + "learning_rate": 4.955130800666374e-07, + "loss": 0.0329, + "step": 4047 + }, + { + "epoch": 4.546398989189948, + "grad_norm": 0.38713309676317126, + "learning_rate": 4.930773953868506e-07, + "loss": 0.0378, + "step": 4048 + }, + { + "epoch": 4.5475221114698865, + "grad_norm": 0.36438821244482394, + "learning_rate": 4.906475604211624e-07, + "loss": 0.0341, + "step": 4049 + }, + { + "epoch": 4.548645233749824, + "grad_norm": 0.33904817801576353, + "learning_rate": 4.882235766646748e-07, + "loss": 0.0356, + "step": 4050 + }, + { + "epoch": 4.549768356029762, + "grad_norm": 0.35475025219773854, + "learning_rate": 4.858054456088923e-07, + "loss": 0.0338, + "step": 4051 + }, + { + "epoch": 4.550891478309701, + "grad_norm": 0.3458201973740821, + "learning_rate": 4.833931687417182e-07, + "loss": 0.0345, + "step": 4052 + }, + { + "epoch": 4.552014600589639, + "grad_norm": 0.36111440231472997, + "learning_rate": 4.809867475474539e-07, + "loss": 0.0355, + "step": 4053 + }, + { + "epoch": 4.553137722869577, + "grad_norm": 0.3535234629211531, + "learning_rate": 4.785861835067962e-07, + "loss": 0.0343, + "step": 4054 + }, + { + "epoch": 4.554260845149516, + "grad_norm": 0.3528696097512272, + "learning_rate": 4.761914780968369e-07, + "loss": 0.0333, + "step": 4055 + }, + { + "epoch": 4.555383967429454, + "grad_norm": 0.34838927834260514, + "learning_rate": 4.738026327910661e-07, + "loss": 0.0358, + "step": 4056 + }, + { + "epoch": 4.556507089709392, + "grad_norm": 0.3431964796657279, + "learning_rate": 4.7141964905936697e-07, + "loss": 0.0338, + "step": 4057 + }, + { + "epoch": 4.557630211989331, + "grad_norm": 0.349604850461702, + "learning_rate": 4.6904252836801446e-07, + "loss": 0.0351, + "step": 4058 + }, + { + "epoch": 4.5587533342692685, + "grad_norm": 0.3284644735862563, + "learning_rate": 4.6667127217967844e-07, + "loss": 0.0312, + "step": 4059 + }, + { + "epoch": 4.5598764565492065, + "grad_norm": 0.3381032631698376, + "learning_rate": 4.6430588195341853e-07, + "loss": 0.0347, + "step": 4060 + }, + { + "epoch": 4.560999578829145, + "grad_norm": 0.3259962293243492, + "learning_rate": 4.6194635914468377e-07, + "loss": 0.0321, + "step": 4061 + }, + { + "epoch": 4.562122701109083, + "grad_norm": 0.34016741280577595, + "learning_rate": 4.595927052053162e-07, + "loss": 0.0317, + "step": 4062 + }, + { + "epoch": 4.563245823389021, + "grad_norm": 0.3783286672968007, + "learning_rate": 4.5724492158354397e-07, + "loss": 0.0346, + "step": 4063 + }, + { + "epoch": 4.56436894566896, + "grad_norm": 0.3627087896517975, + "learning_rate": 4.5490300972398705e-07, + "loss": 0.0337, + "step": 4064 + }, + { + "epoch": 4.565492067948898, + "grad_norm": 0.362771085697257, + "learning_rate": 4.52566971067645e-07, + "loss": 0.0353, + "step": 4065 + }, + { + "epoch": 4.566615190228836, + "grad_norm": 0.3102609073574198, + "learning_rate": 4.502368070519114e-07, + "loss": 0.0274, + "step": 4066 + }, + { + "epoch": 4.567738312508775, + "grad_norm": 0.33733629821182154, + "learning_rate": 4.4791251911056043e-07, + "loss": 0.0344, + "step": 4067 + }, + { + "epoch": 4.568861434788713, + "grad_norm": 0.34048282314449596, + "learning_rate": 4.4559410867375365e-07, + "loss": 0.0378, + "step": 4068 + }, + { + "epoch": 4.569984557068651, + "grad_norm": 0.34656193172912014, + "learning_rate": 4.432815771680321e-07, + "loss": 0.0346, + "step": 4069 + }, + { + "epoch": 4.571107679348589, + "grad_norm": 0.32758773549200243, + "learning_rate": 4.409749260163232e-07, + "loss": 0.0345, + "step": 4070 + }, + { + "epoch": 4.572230801628527, + "grad_norm": 0.3654433415452595, + "learning_rate": 4.386741566379338e-07, + "loss": 0.0387, + "step": 4071 + }, + { + "epoch": 4.573353923908465, + "grad_norm": 0.3367092688064399, + "learning_rate": 4.3637927044855476e-07, + "loss": 0.0324, + "step": 4072 + }, + { + "epoch": 4.574477046188404, + "grad_norm": 0.3640805465047991, + "learning_rate": 4.340902688602544e-07, + "loss": 0.0339, + "step": 4073 + }, + { + "epoch": 4.575600168468342, + "grad_norm": 0.34452856807576676, + "learning_rate": 4.3180715328147826e-07, + "loss": 0.034, + "step": 4074 + }, + { + "epoch": 4.57672329074828, + "grad_norm": 0.34328210378409857, + "learning_rate": 4.295299251170537e-07, + "loss": 0.0327, + "step": 4075 + }, + { + "epoch": 4.577846413028219, + "grad_norm": 0.35304122276887184, + "learning_rate": 4.272585857681844e-07, + "loss": 0.0344, + "step": 4076 + }, + { + "epoch": 4.578969535308157, + "grad_norm": 0.3253336789166046, + "learning_rate": 4.249931366324511e-07, + "loss": 0.0331, + "step": 4077 + }, + { + "epoch": 4.580092657588095, + "grad_norm": 0.3388987083083182, + "learning_rate": 4.2273357910380896e-07, + "loss": 0.0343, + "step": 4078 + }, + { + "epoch": 4.5812157798680335, + "grad_norm": 0.3543772419693709, + "learning_rate": 4.2047991457258905e-07, + "loss": 0.0374, + "step": 4079 + }, + { + "epoch": 4.5823389021479715, + "grad_norm": 0.3765630360863373, + "learning_rate": 4.182321444254944e-07, + "loss": 0.0354, + "step": 4080 + }, + { + "epoch": 4.583462024427909, + "grad_norm": 0.3540818181461558, + "learning_rate": 4.1599027004560535e-07, + "loss": 0.0428, + "step": 4081 + }, + { + "epoch": 4.584585146707848, + "grad_norm": 0.34719828160408894, + "learning_rate": 4.1375429281236946e-07, + "loss": 0.0366, + "step": 4082 + }, + { + "epoch": 4.585708268987786, + "grad_norm": 0.3264230696168353, + "learning_rate": 4.115242141016085e-07, + "loss": 0.0316, + "step": 4083 + }, + { + "epoch": 4.586831391267724, + "grad_norm": 0.34719908923103476, + "learning_rate": 4.0930003528551587e-07, + "loss": 0.0359, + "step": 4084 + }, + { + "epoch": 4.587954513547663, + "grad_norm": 0.35625052906788557, + "learning_rate": 4.0708175773265246e-07, + "loss": 0.0348, + "step": 4085 + }, + { + "epoch": 4.589077635827601, + "grad_norm": 0.3650173507417219, + "learning_rate": 4.0486938280794754e-07, + "loss": 0.038, + "step": 4086 + }, + { + "epoch": 4.590200758107539, + "grad_norm": 0.33970468745861326, + "learning_rate": 4.0266291187270435e-07, + "loss": 0.033, + "step": 4087 + }, + { + "epoch": 4.591323880387478, + "grad_norm": 0.3841339639866425, + "learning_rate": 4.004623462845825e-07, + "loss": 0.0344, + "step": 4088 + }, + { + "epoch": 4.592447002667416, + "grad_norm": 0.3247187954531285, + "learning_rate": 3.9826768739761765e-07, + "loss": 0.0345, + "step": 4089 + }, + { + "epoch": 4.5935701249473535, + "grad_norm": 0.3431276872602053, + "learning_rate": 3.960789365622075e-07, + "loss": 0.0311, + "step": 4090 + }, + { + "epoch": 4.594693247227292, + "grad_norm": 0.34267005186851196, + "learning_rate": 3.938960951251136e-07, + "loss": 0.0376, + "step": 4091 + }, + { + "epoch": 4.59581636950723, + "grad_norm": 0.34136270274596336, + "learning_rate": 3.917191644294627e-07, + "loss": 0.0316, + "step": 4092 + }, + { + "epoch": 4.596939491787168, + "grad_norm": 0.34219281794077305, + "learning_rate": 3.895481458147454e-07, + "loss": 0.0339, + "step": 4093 + }, + { + "epoch": 4.598062614067107, + "grad_norm": 0.3695903719561843, + "learning_rate": 3.8738304061681107e-07, + "loss": 0.0391, + "step": 4094 + }, + { + "epoch": 4.599185736347045, + "grad_norm": 0.3426126714468979, + "learning_rate": 3.852238501678751e-07, + "loss": 0.0339, + "step": 4095 + }, + { + "epoch": 4.600308858626983, + "grad_norm": 0.3600465378066275, + "learning_rate": 3.830705757965081e-07, + "loss": 0.0345, + "step": 4096 + }, + { + "epoch": 4.601431980906922, + "grad_norm": 0.4555888651642329, + "learning_rate": 3.809232188276468e-07, + "loss": 0.0375, + "step": 4097 + }, + { + "epoch": 4.60255510318686, + "grad_norm": 0.3441834185775792, + "learning_rate": 3.7878178058258217e-07, + "loss": 0.0376, + "step": 4098 + }, + { + "epoch": 4.603678225466798, + "grad_norm": 0.34364984838035073, + "learning_rate": 3.766462623789646e-07, + "loss": 0.0334, + "step": 4099 + }, + { + "epoch": 4.6048013477467356, + "grad_norm": 0.36747528881747177, + "learning_rate": 3.745166655308019e-07, + "loss": 0.0404, + "step": 4100 + }, + { + "epoch": 4.605924470026674, + "grad_norm": 0.3341683012479436, + "learning_rate": 3.723929913484581e-07, + "loss": 0.0301, + "step": 4101 + }, + { + "epoch": 4.607047592306612, + "grad_norm": 0.3505021440089559, + "learning_rate": 3.702752411386534e-07, + "loss": 0.0367, + "step": 4102 + }, + { + "epoch": 4.60817071458655, + "grad_norm": 0.3474780616469266, + "learning_rate": 3.681634162044645e-07, + "loss": 0.0358, + "step": 4103 + }, + { + "epoch": 4.609293836866489, + "grad_norm": 0.33887657295088386, + "learning_rate": 3.6605751784531853e-07, + "loss": 0.0362, + "step": 4104 + }, + { + "epoch": 4.610416959146427, + "grad_norm": 0.34961137504173107, + "learning_rate": 3.6395754735699896e-07, + "loss": 0.0379, + "step": 4105 + }, + { + "epoch": 4.611540081426365, + "grad_norm": 0.3302529939325971, + "learning_rate": 3.6186350603164e-07, + "loss": 0.0334, + "step": 4106 + }, + { + "epoch": 4.612663203706304, + "grad_norm": 0.33644579720443657, + "learning_rate": 3.5977539515772874e-07, + "loss": 0.0345, + "step": 4107 + }, + { + "epoch": 4.613786325986242, + "grad_norm": 0.3278842607379112, + "learning_rate": 3.57693216020103e-07, + "loss": 0.0332, + "step": 4108 + }, + { + "epoch": 4.61490944826618, + "grad_norm": 0.36040005767567596, + "learning_rate": 3.556169698999501e-07, + "loss": 0.035, + "step": 4109 + }, + { + "epoch": 4.6160325705461185, + "grad_norm": 0.35158408144549563, + "learning_rate": 3.535466580748059e-07, + "loss": 0.0345, + "step": 4110 + }, + { + "epoch": 4.617155692826056, + "grad_norm": 0.36439910096515155, + "learning_rate": 3.5148228181855923e-07, + "loss": 0.0373, + "step": 4111 + }, + { + "epoch": 4.618278815105994, + "grad_norm": 0.33513522124762823, + "learning_rate": 3.4942384240144176e-07, + "loss": 0.0347, + "step": 4112 + }, + { + "epoch": 4.619401937385933, + "grad_norm": 0.34260874749603937, + "learning_rate": 3.473713410900326e-07, + "loss": 0.0334, + "step": 4113 + }, + { + "epoch": 4.620525059665871, + "grad_norm": 0.3352507044783814, + "learning_rate": 3.453247791472603e-07, + "loss": 0.0357, + "step": 4114 + }, + { + "epoch": 4.621648181945809, + "grad_norm": 0.34750467435944776, + "learning_rate": 3.4328415783239646e-07, + "loss": 0.033, + "step": 4115 + }, + { + "epoch": 4.622771304225748, + "grad_norm": 0.3428620974144906, + "learning_rate": 3.4124947840105673e-07, + "loss": 0.0314, + "step": 4116 + }, + { + "epoch": 4.623894426505686, + "grad_norm": 0.36245758828541697, + "learning_rate": 3.3922074210520407e-07, + "loss": 0.0349, + "step": 4117 + }, + { + "epoch": 4.625017548785624, + "grad_norm": 0.35741628717665924, + "learning_rate": 3.3719795019313993e-07, + "loss": 0.0319, + "step": 4118 + }, + { + "epoch": 4.626140671065563, + "grad_norm": 0.3571691112754241, + "learning_rate": 3.351811039095121e-07, + "loss": 0.0341, + "step": 4119 + }, + { + "epoch": 4.6272637933455005, + "grad_norm": 0.3361597736689054, + "learning_rate": 3.3317020449530666e-07, + "loss": 0.0337, + "step": 4120 + }, + { + "epoch": 4.6283869156254385, + "grad_norm": 0.352268450177105, + "learning_rate": 3.3116525318785286e-07, + "loss": 0.0315, + "step": 4121 + }, + { + "epoch": 4.629510037905377, + "grad_norm": 0.3496979817415821, + "learning_rate": 3.291662512208216e-07, + "loss": 0.0377, + "step": 4122 + }, + { + "epoch": 4.630633160185315, + "grad_norm": 0.3415382592263034, + "learning_rate": 3.271731998242167e-07, + "loss": 0.0349, + "step": 4123 + }, + { + "epoch": 4.631756282465253, + "grad_norm": 0.3329970180899988, + "learning_rate": 3.2518610022438724e-07, + "loss": 0.0308, + "step": 4124 + }, + { + "epoch": 4.632879404745192, + "grad_norm": 0.3554638729226304, + "learning_rate": 3.2320495364401625e-07, + "loss": 0.0396, + "step": 4125 + }, + { + "epoch": 4.63400252702513, + "grad_norm": 0.3467526836273791, + "learning_rate": 3.2122976130212644e-07, + "loss": 0.0334, + "step": 4126 + }, + { + "epoch": 4.635125649305068, + "grad_norm": 0.34256948945547966, + "learning_rate": 3.192605244140745e-07, + "loss": 0.0329, + "step": 4127 + }, + { + "epoch": 4.636248771585007, + "grad_norm": 0.35204121377326897, + "learning_rate": 3.172972441915523e-07, + "loss": 0.0332, + "step": 4128 + }, + { + "epoch": 4.637371893864945, + "grad_norm": 0.37489357665496786, + "learning_rate": 3.153399218425901e-07, + "loss": 0.0365, + "step": 4129 + }, + { + "epoch": 4.638495016144883, + "grad_norm": 0.32498835574013707, + "learning_rate": 3.13388558571549e-07, + "loss": 0.0307, + "step": 4130 + }, + { + "epoch": 4.6396181384248205, + "grad_norm": 0.33414718587616266, + "learning_rate": 3.114431555791253e-07, + "loss": 0.037, + "step": 4131 + }, + { + "epoch": 4.640741260704759, + "grad_norm": 0.3384428063498482, + "learning_rate": 3.0950371406234357e-07, + "loss": 0.0323, + "step": 4132 + }, + { + "epoch": 4.641864382984697, + "grad_norm": 0.3493656495995995, + "learning_rate": 3.075702352145671e-07, + "loss": 0.0337, + "step": 4133 + }, + { + "epoch": 4.642987505264635, + "grad_norm": 0.34456307900275657, + "learning_rate": 3.0564272022548414e-07, + "loss": 0.0321, + "step": 4134 + }, + { + "epoch": 4.644110627544574, + "grad_norm": 0.35749872404688643, + "learning_rate": 3.0372117028111825e-07, + "loss": 0.0353, + "step": 4135 + }, + { + "epoch": 4.645233749824512, + "grad_norm": 0.3702263663076714, + "learning_rate": 3.0180558656381806e-07, + "loss": 0.0365, + "step": 4136 + }, + { + "epoch": 4.64635687210445, + "grad_norm": 0.32586875886680355, + "learning_rate": 2.9989597025226523e-07, + "loss": 0.032, + "step": 4137 + }, + { + "epoch": 4.647479994384389, + "grad_norm": 0.33538912472499605, + "learning_rate": 2.979923225214665e-07, + "loss": 0.0313, + "step": 4138 + }, + { + "epoch": 4.648603116664327, + "grad_norm": 0.37507751939830075, + "learning_rate": 2.9609464454275707e-07, + "loss": 0.0375, + "step": 4139 + }, + { + "epoch": 4.649726238944265, + "grad_norm": 0.3481121639949164, + "learning_rate": 2.942029374837996e-07, + "loss": 0.0314, + "step": 4140 + }, + { + "epoch": 4.6508493612242034, + "grad_norm": 0.3423593643914144, + "learning_rate": 2.9231720250858296e-07, + "loss": 0.0343, + "step": 4141 + }, + { + "epoch": 4.651972483504141, + "grad_norm": 0.35719566136893616, + "learning_rate": 2.904374407774191e-07, + "loss": 0.0366, + "step": 4142 + }, + { + "epoch": 4.653095605784079, + "grad_norm": 0.3658780078221181, + "learning_rate": 2.8856365344694604e-07, + "loss": 0.0348, + "step": 4143 + }, + { + "epoch": 4.654218728064018, + "grad_norm": 0.34290940357613336, + "learning_rate": 2.866958416701271e-07, + "loss": 0.0335, + "step": 4144 + }, + { + "epoch": 4.655341850343956, + "grad_norm": 0.33735481225106245, + "learning_rate": 2.8483400659624737e-07, + "loss": 0.0354, + "step": 4145 + }, + { + "epoch": 4.656464972623894, + "grad_norm": 0.35331730640988807, + "learning_rate": 2.8297814937091495e-07, + "loss": 0.0363, + "step": 4146 + }, + { + "epoch": 4.657588094903833, + "grad_norm": 0.3582352972813522, + "learning_rate": 2.8112827113605637e-07, + "loss": 0.0321, + "step": 4147 + }, + { + "epoch": 4.658711217183771, + "grad_norm": 0.369439070620047, + "learning_rate": 2.792843730299244e-07, + "loss": 0.0409, + "step": 4148 + }, + { + "epoch": 4.659834339463709, + "grad_norm": 0.36467256732278813, + "learning_rate": 2.774464561870893e-07, + "loss": 0.0331, + "step": 4149 + }, + { + "epoch": 4.660957461743648, + "grad_norm": 0.3578494257105483, + "learning_rate": 2.7561452173844206e-07, + "loss": 0.0348, + "step": 4150 + }, + { + "epoch": 4.6620805840235855, + "grad_norm": 0.33043652998644596, + "learning_rate": 2.7378857081119204e-07, + "loss": 0.0291, + "step": 4151 + }, + { + "epoch": 4.663203706303523, + "grad_norm": 0.32682915997898, + "learning_rate": 2.7196860452886496e-07, + "loss": 0.0295, + "step": 4152 + }, + { + "epoch": 4.664326828583462, + "grad_norm": 0.35842979933675306, + "learning_rate": 2.7015462401130843e-07, + "loss": 0.0347, + "step": 4153 + }, + { + "epoch": 4.6654499508634, + "grad_norm": 0.3608036317898208, + "learning_rate": 2.68346630374684e-07, + "loss": 0.0351, + "step": 4154 + }, + { + "epoch": 4.666573073143338, + "grad_norm": 0.3445352154282416, + "learning_rate": 2.665446247314696e-07, + "loss": 0.0368, + "step": 4155 + }, + { + "epoch": 4.667696195423277, + "grad_norm": 0.36637403299593924, + "learning_rate": 2.6474860819046157e-07, + "loss": 0.0384, + "step": 4156 + }, + { + "epoch": 4.668819317703215, + "grad_norm": 0.3402971768771307, + "learning_rate": 2.629585818567637e-07, + "loss": 0.0336, + "step": 4157 + }, + { + "epoch": 4.669942439983153, + "grad_norm": 0.32819300144957086, + "learning_rate": 2.6117454683180274e-07, + "loss": 0.0312, + "step": 4158 + }, + { + "epoch": 4.671065562263092, + "grad_norm": 0.3449374419407223, + "learning_rate": 2.5939650421331395e-07, + "loss": 0.0334, + "step": 4159 + }, + { + "epoch": 4.67218868454303, + "grad_norm": 0.3596838423275439, + "learning_rate": 2.576244550953466e-07, + "loss": 0.0375, + "step": 4160 + }, + { + "epoch": 4.6733118068229675, + "grad_norm": 0.3685410673567921, + "learning_rate": 2.5585840056826295e-07, + "loss": 0.0381, + "step": 4161 + }, + { + "epoch": 4.674434929102906, + "grad_norm": 0.352525835209393, + "learning_rate": 2.540983417187348e-07, + "loss": 0.033, + "step": 4162 + }, + { + "epoch": 4.675558051382844, + "grad_norm": 0.3406022335981378, + "learning_rate": 2.5234427962974486e-07, + "loss": 0.03, + "step": 4163 + }, + { + "epoch": 4.676681173662782, + "grad_norm": 0.3732219938497303, + "learning_rate": 2.5059621538058743e-07, + "loss": 0.0327, + "step": 4164 + }, + { + "epoch": 4.677804295942721, + "grad_norm": 0.3454388239084282, + "learning_rate": 2.488541500468666e-07, + "loss": 0.0321, + "step": 4165 + }, + { + "epoch": 4.678927418222659, + "grad_norm": 0.3482806759890393, + "learning_rate": 2.47118084700495e-07, + "loss": 0.0334, + "step": 4166 + }, + { + "epoch": 4.680050540502597, + "grad_norm": 0.3423367434752234, + "learning_rate": 2.453880204096892e-07, + "loss": 0.0325, + "step": 4167 + }, + { + "epoch": 4.681173662782536, + "grad_norm": 0.35270956062249814, + "learning_rate": 2.4366395823898104e-07, + "loss": 0.0339, + "step": 4168 + }, + { + "epoch": 4.682296785062474, + "grad_norm": 0.3296367877167982, + "learning_rate": 2.419458992492019e-07, + "loss": 0.0329, + "step": 4169 + }, + { + "epoch": 4.683419907342412, + "grad_norm": 0.33676841186669126, + "learning_rate": 2.402338444974928e-07, + "loss": 0.0304, + "step": 4170 + }, + { + "epoch": 4.6845430296223505, + "grad_norm": 0.3432291676295886, + "learning_rate": 2.3852779503730217e-07, + "loss": 0.0357, + "step": 4171 + }, + { + "epoch": 4.685666151902288, + "grad_norm": 0.3415736112710834, + "learning_rate": 2.3682775191837814e-07, + "loss": 0.0331, + "step": 4172 + }, + { + "epoch": 4.686789274182226, + "grad_norm": 0.3718654445263105, + "learning_rate": 2.3513371618677838e-07, + "loss": 0.036, + "step": 4173 + }, + { + "epoch": 4.687912396462165, + "grad_norm": 0.3659974123825445, + "learning_rate": 2.3344568888485907e-07, + "loss": 0.0361, + "step": 4174 + }, + { + "epoch": 4.689035518742103, + "grad_norm": 0.34842971301416137, + "learning_rate": 2.3176367105128494e-07, + "loss": 0.0353, + "step": 4175 + }, + { + "epoch": 4.690158641022041, + "grad_norm": 0.35992964433081653, + "learning_rate": 2.300876637210181e-07, + "loss": 0.0352, + "step": 4176 + }, + { + "epoch": 4.69128176330198, + "grad_norm": 0.35358447232276063, + "learning_rate": 2.2841766792532472e-07, + "loss": 0.0367, + "step": 4177 + }, + { + "epoch": 4.692404885581918, + "grad_norm": 0.3458896703817227, + "learning_rate": 2.2675368469177171e-07, + "loss": 0.0319, + "step": 4178 + }, + { + "epoch": 4.693528007861856, + "grad_norm": 0.33681153882758513, + "learning_rate": 2.2509571504422678e-07, + "loss": 0.035, + "step": 4179 + }, + { + "epoch": 4.694651130141795, + "grad_norm": 0.34571417197318716, + "learning_rate": 2.2344376000285606e-07, + "loss": 0.035, + "step": 4180 + }, + { + "epoch": 4.6957742524217325, + "grad_norm": 0.33932945848040214, + "learning_rate": 2.2179782058412646e-07, + "loss": 0.0327, + "step": 4181 + }, + { + "epoch": 4.6968973747016705, + "grad_norm": 0.34489622098163436, + "learning_rate": 2.201578978008012e-07, + "loss": 0.0337, + "step": 4182 + }, + { + "epoch": 4.698020496981609, + "grad_norm": 0.3480940006392897, + "learning_rate": 2.1852399266194312e-07, + "loss": 0.0318, + "step": 4183 + }, + { + "epoch": 4.699143619261547, + "grad_norm": 0.33290094072681525, + "learning_rate": 2.1689610617291357e-07, + "loss": 0.0332, + "step": 4184 + }, + { + "epoch": 4.700266741541485, + "grad_norm": 0.3599868420281495, + "learning_rate": 2.1527423933536906e-07, + "loss": 0.0385, + "step": 4185 + }, + { + "epoch": 4.701389863821424, + "grad_norm": 0.3337751304343682, + "learning_rate": 2.1365839314726021e-07, + "loss": 0.03, + "step": 4186 + }, + { + "epoch": 4.702512986101362, + "grad_norm": 0.33760321339006033, + "learning_rate": 2.1204856860283506e-07, + "loss": 0.0353, + "step": 4187 + }, + { + "epoch": 4.7036361083813, + "grad_norm": 0.3357458834520037, + "learning_rate": 2.1044476669263793e-07, + "loss": 0.0316, + "step": 4188 + }, + { + "epoch": 4.704759230661239, + "grad_norm": 0.3494253152613926, + "learning_rate": 2.0884698840350492e-07, + "loss": 0.0352, + "step": 4189 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 0.3313642700196441, + "learning_rate": 2.0725523471856744e-07, + "loss": 0.0344, + "step": 4190 + }, + { + "epoch": 4.707005475221115, + "grad_norm": 0.3515014279713326, + "learning_rate": 2.056695066172476e-07, + "loss": 0.0375, + "step": 4191 + }, + { + "epoch": 4.708128597501053, + "grad_norm": 0.3393284033675587, + "learning_rate": 2.0408980507526267e-07, + "loss": 0.0324, + "step": 4192 + }, + { + "epoch": 4.709251719780991, + "grad_norm": 0.3433377101697511, + "learning_rate": 2.0251613106461955e-07, + "loss": 0.0351, + "step": 4193 + }, + { + "epoch": 4.710374842060929, + "grad_norm": 0.3609914609514239, + "learning_rate": 2.0094848555361702e-07, + "loss": 0.0343, + "step": 4194 + }, + { + "epoch": 4.711497964340868, + "grad_norm": 0.33674994452184703, + "learning_rate": 1.993868695068457e-07, + "loss": 0.0332, + "step": 4195 + }, + { + "epoch": 4.712621086620806, + "grad_norm": 0.3405640938227505, + "learning_rate": 1.978312838851837e-07, + "loss": 0.0338, + "step": 4196 + }, + { + "epoch": 4.713744208900744, + "grad_norm": 0.3518234996906055, + "learning_rate": 1.9628172964580082e-07, + "loss": 0.0323, + "step": 4197 + }, + { + "epoch": 4.714867331180682, + "grad_norm": 0.37220183549793007, + "learning_rate": 1.9473820774215557e-07, + "loss": 0.0342, + "step": 4198 + }, + { + "epoch": 4.715990453460621, + "grad_norm": 0.32965656846842434, + "learning_rate": 1.932007191239915e-07, + "loss": 0.0295, + "step": 4199 + }, + { + "epoch": 4.717113575740559, + "grad_norm": 0.3521785347247238, + "learning_rate": 1.9166926473734636e-07, + "loss": 0.033, + "step": 4200 + }, + { + "epoch": 4.718236698020497, + "grad_norm": 0.35522561854409634, + "learning_rate": 1.9014384552453635e-07, + "loss": 0.0338, + "step": 4201 + }, + { + "epoch": 4.719359820300435, + "grad_norm": 0.3310871063889209, + "learning_rate": 1.8862446242417175e-07, + "loss": 0.0327, + "step": 4202 + }, + { + "epoch": 4.720482942580373, + "grad_norm": 0.3239117817488521, + "learning_rate": 1.8711111637114364e-07, + "loss": 0.0334, + "step": 4203 + }, + { + "epoch": 4.721606064860311, + "grad_norm": 0.33473520840189047, + "learning_rate": 1.856038082966327e-07, + "loss": 0.0315, + "step": 4204 + }, + { + "epoch": 4.72272918714025, + "grad_norm": 0.3546234500423055, + "learning_rate": 1.841025391281015e-07, + "loss": 0.0347, + "step": 4205 + }, + { + "epoch": 4.723852309420188, + "grad_norm": 0.35300752775260824, + "learning_rate": 1.8260730978929664e-07, + "loss": 0.0373, + "step": 4206 + }, + { + "epoch": 4.724975431700126, + "grad_norm": 0.31962674294469384, + "learning_rate": 1.8111812120024884e-07, + "loss": 0.0304, + "step": 4207 + }, + { + "epoch": 4.726098553980065, + "grad_norm": 0.3606715730669359, + "learning_rate": 1.7963497427727294e-07, + "loss": 0.0337, + "step": 4208 + }, + { + "epoch": 4.727221676260003, + "grad_norm": 0.32723932407883416, + "learning_rate": 1.781578699329667e-07, + "loss": 0.032, + "step": 4209 + }, + { + "epoch": 4.728344798539941, + "grad_norm": 0.32660649785881374, + "learning_rate": 1.766868090762075e-07, + "loss": 0.0304, + "step": 4210 + }, + { + "epoch": 4.7294679208198795, + "grad_norm": 0.3834637625549436, + "learning_rate": 1.7522179261215467e-07, + "loss": 0.043, + "step": 4211 + }, + { + "epoch": 4.7305910430998175, + "grad_norm": 0.31299817068786057, + "learning_rate": 1.7376282144224933e-07, + "loss": 0.0302, + "step": 4212 + }, + { + "epoch": 4.731714165379755, + "grad_norm": 0.34864103394231777, + "learning_rate": 1.7230989646421337e-07, + "loss": 0.0329, + "step": 4213 + }, + { + "epoch": 4.732837287659694, + "grad_norm": 0.35855522330101297, + "learning_rate": 1.7086301857204725e-07, + "loss": 0.0327, + "step": 4214 + }, + { + "epoch": 4.733960409939632, + "grad_norm": 0.35553712042165425, + "learning_rate": 1.694221886560299e-07, + "loss": 0.037, + "step": 4215 + }, + { + "epoch": 4.73508353221957, + "grad_norm": 0.3348937621754762, + "learning_rate": 1.6798740760272104e-07, + "loss": 0.0329, + "step": 4216 + }, + { + "epoch": 4.736206654499509, + "grad_norm": 0.3809215375029254, + "learning_rate": 1.665586762949567e-07, + "loss": 0.0366, + "step": 4217 + }, + { + "epoch": 4.737329776779447, + "grad_norm": 0.34204433938299095, + "learning_rate": 1.6513599561185034e-07, + "loss": 0.0328, + "step": 4218 + }, + { + "epoch": 4.738452899059385, + "grad_norm": 0.3550432910580413, + "learning_rate": 1.6371936642879504e-07, + "loss": 0.0321, + "step": 4219 + }, + { + "epoch": 4.739576021339324, + "grad_norm": 0.3318127050500399, + "learning_rate": 1.623087896174558e-07, + "loss": 0.0365, + "step": 4220 + }, + { + "epoch": 4.740699143619262, + "grad_norm": 0.36204671764832996, + "learning_rate": 1.6090426604577714e-07, + "loss": 0.0346, + "step": 4221 + }, + { + "epoch": 4.7418222658991995, + "grad_norm": 0.3387248418153391, + "learning_rate": 1.5950579657797894e-07, + "loss": 0.0306, + "step": 4222 + }, + { + "epoch": 4.742945388179138, + "grad_norm": 0.3427148305083139, + "learning_rate": 1.5811338207455284e-07, + "loss": 0.0345, + "step": 4223 + }, + { + "epoch": 4.744068510459076, + "grad_norm": 0.35411961450361146, + "learning_rate": 1.5672702339226909e-07, + "loss": 0.0371, + "step": 4224 + }, + { + "epoch": 4.745191632739014, + "grad_norm": 0.3721507850238725, + "learning_rate": 1.553467213841664e-07, + "loss": 0.0368, + "step": 4225 + }, + { + "epoch": 4.746314755018953, + "grad_norm": 0.3466898365608997, + "learning_rate": 1.5397247689956318e-07, + "loss": 0.0308, + "step": 4226 + }, + { + "epoch": 4.747437877298891, + "grad_norm": 0.3574266108394466, + "learning_rate": 1.5260429078404416e-07, + "loss": 0.0335, + "step": 4227 + }, + { + "epoch": 4.748560999578829, + "grad_norm": 0.3400286952615133, + "learning_rate": 1.5124216387947143e-07, + "loss": 0.0307, + "step": 4228 + }, + { + "epoch": 4.749684121858768, + "grad_norm": 0.36309544829463203, + "learning_rate": 1.4988609702397683e-07, + "loss": 0.04, + "step": 4229 + }, + { + "epoch": 4.750807244138706, + "grad_norm": 0.3417693922064799, + "learning_rate": 1.4853609105196175e-07, + "loss": 0.0367, + "step": 4230 + }, + { + "epoch": 4.751930366418644, + "grad_norm": 0.3602988292077408, + "learning_rate": 1.4719214679409954e-07, + "loss": 0.0358, + "step": 4231 + }, + { + "epoch": 4.753053488698582, + "grad_norm": 0.3735538962893692, + "learning_rate": 1.4585426507733536e-07, + "loss": 0.0311, + "step": 4232 + }, + { + "epoch": 4.75417661097852, + "grad_norm": 0.39448460405940855, + "learning_rate": 1.445224467248818e-07, + "loss": 0.0385, + "step": 4233 + }, + { + "epoch": 4.755299733258458, + "grad_norm": 0.36862373252888875, + "learning_rate": 1.4319669255622115e-07, + "loss": 0.0378, + "step": 4234 + }, + { + "epoch": 4.756422855538396, + "grad_norm": 0.3606558317899239, + "learning_rate": 1.418770033871053e-07, + "loss": 0.0351, + "step": 4235 + }, + { + "epoch": 4.757545977818335, + "grad_norm": 0.357546164191823, + "learning_rate": 1.405633800295525e-07, + "loss": 0.0377, + "step": 4236 + }, + { + "epoch": 4.758669100098273, + "grad_norm": 0.33523607494626556, + "learning_rate": 1.392558232918506e-07, + "loss": 0.0313, + "step": 4237 + }, + { + "epoch": 4.759792222378211, + "grad_norm": 0.3554788435214293, + "learning_rate": 1.3795433397855274e-07, + "loss": 0.0363, + "step": 4238 + }, + { + "epoch": 4.76091534465815, + "grad_norm": 0.334541232594485, + "learning_rate": 1.366589128904805e-07, + "loss": 0.0331, + "step": 4239 + }, + { + "epoch": 4.762038466938088, + "grad_norm": 0.36173611814235396, + "learning_rate": 1.3536956082472074e-07, + "loss": 0.0369, + "step": 4240 + }, + { + "epoch": 4.763161589218026, + "grad_norm": 0.3250258742570545, + "learning_rate": 1.3408627857462443e-07, + "loss": 0.0318, + "step": 4241 + }, + { + "epoch": 4.7642847114979645, + "grad_norm": 0.3565828415127001, + "learning_rate": 1.32809066929811e-07, + "loss": 0.0387, + "step": 4242 + }, + { + "epoch": 4.7654078337779024, + "grad_norm": 0.3383956134948221, + "learning_rate": 1.3153792667616183e-07, + "loss": 0.0332, + "step": 4243 + }, + { + "epoch": 4.76653095605784, + "grad_norm": 0.3476033007253469, + "learning_rate": 1.302728585958246e-07, + "loss": 0.0365, + "step": 4244 + }, + { + "epoch": 4.767654078337779, + "grad_norm": 0.3516963013877261, + "learning_rate": 1.290138634672089e-07, + "loss": 0.0386, + "step": 4245 + }, + { + "epoch": 4.768777200617717, + "grad_norm": 0.34481467608140326, + "learning_rate": 1.2776094206498834e-07, + "loss": 0.0339, + "step": 4246 + }, + { + "epoch": 4.769900322897655, + "grad_norm": 0.3925266775395916, + "learning_rate": 1.2651409516009848e-07, + "loss": 0.0345, + "step": 4247 + }, + { + "epoch": 4.771023445177594, + "grad_norm": 0.356302010857859, + "learning_rate": 1.2527332351973899e-07, + "loss": 0.0342, + "step": 4248 + }, + { + "epoch": 4.772146567457532, + "grad_norm": 0.3385658467723331, + "learning_rate": 1.2403862790737021e-07, + "loss": 0.0297, + "step": 4249 + }, + { + "epoch": 4.77326968973747, + "grad_norm": 0.34274297140098986, + "learning_rate": 1.2281000908271336e-07, + "loss": 0.0335, + "step": 4250 + }, + { + "epoch": 4.774392812017409, + "grad_norm": 0.32904157582350113, + "learning_rate": 1.2158746780175257e-07, + "loss": 0.0324, + "step": 4251 + }, + { + "epoch": 4.7755159342973466, + "grad_norm": 0.3583962692152138, + "learning_rate": 1.2037100481672836e-07, + "loss": 0.0349, + "step": 4252 + }, + { + "epoch": 4.7766390565772845, + "grad_norm": 0.3411729792940599, + "learning_rate": 1.1916062087614644e-07, + "loss": 0.0344, + "step": 4253 + }, + { + "epoch": 4.777762178857223, + "grad_norm": 0.34168029879821443, + "learning_rate": 1.1795631672476771e-07, + "loss": 0.0364, + "step": 4254 + }, + { + "epoch": 4.778885301137161, + "grad_norm": 0.35060334074323224, + "learning_rate": 1.1675809310361497e-07, + "loss": 0.0466, + "step": 4255 + }, + { + "epoch": 4.780008423417099, + "grad_norm": 0.34336606647480034, + "learning_rate": 1.1556595074996624e-07, + "loss": 0.0331, + "step": 4256 + }, + { + "epoch": 4.781131545697038, + "grad_norm": 0.32382578010358776, + "learning_rate": 1.1437989039736253e-07, + "loss": 0.0325, + "step": 4257 + }, + { + "epoch": 4.782254667976976, + "grad_norm": 0.34851470783798044, + "learning_rate": 1.1319991277559783e-07, + "loss": 0.036, + "step": 4258 + }, + { + "epoch": 4.783377790256914, + "grad_norm": 0.35953669765583024, + "learning_rate": 1.1202601861072693e-07, + "loss": 0.0364, + "step": 4259 + }, + { + "epoch": 4.784500912536853, + "grad_norm": 0.3420834182439802, + "learning_rate": 1.108582086250587e-07, + "loss": 0.0304, + "step": 4260 + }, + { + "epoch": 4.785624034816791, + "grad_norm": 0.3474035202936612, + "learning_rate": 1.0969648353715945e-07, + "loss": 0.0353, + "step": 4261 + }, + { + "epoch": 4.786747157096729, + "grad_norm": 0.33774829169251014, + "learning_rate": 1.0854084406185184e-07, + "loss": 0.0309, + "step": 4262 + }, + { + "epoch": 4.787870279376667, + "grad_norm": 0.3323004162031955, + "learning_rate": 1.0739129091021372e-07, + "loss": 0.0301, + "step": 4263 + }, + { + "epoch": 4.788993401656605, + "grad_norm": 0.35491038528232177, + "learning_rate": 1.0624782478957818e-07, + "loss": 0.0362, + "step": 4264 + }, + { + "epoch": 4.790116523936543, + "grad_norm": 0.3148060898080571, + "learning_rate": 1.051104464035313e-07, + "loss": 0.03, + "step": 4265 + }, + { + "epoch": 4.791239646216482, + "grad_norm": 0.3454633600900965, + "learning_rate": 1.0397915645191437e-07, + "loss": 0.0325, + "step": 4266 + }, + { + "epoch": 4.79236276849642, + "grad_norm": 0.3387657241758695, + "learning_rate": 1.028539556308239e-07, + "loss": 0.0366, + "step": 4267 + }, + { + "epoch": 4.793485890776358, + "grad_norm": 0.343688514356464, + "learning_rate": 1.017348446326083e-07, + "loss": 0.0355, + "step": 4268 + }, + { + "epoch": 4.794609013056297, + "grad_norm": 0.33804452702394205, + "learning_rate": 1.0062182414586786e-07, + "loss": 0.0317, + "step": 4269 + }, + { + "epoch": 4.795732135336235, + "grad_norm": 0.34775190604371375, + "learning_rate": 9.951489485545696e-08, + "loss": 0.0357, + "step": 4270 + }, + { + "epoch": 4.796855257616173, + "grad_norm": 0.34403813226107627, + "learning_rate": 9.841405744248078e-08, + "loss": 0.036, + "step": 4271 + }, + { + "epoch": 4.7979783798961115, + "grad_norm": 0.337690442201065, + "learning_rate": 9.731931258429638e-08, + "loss": 0.0336, + "step": 4272 + }, + { + "epoch": 4.7991015021760495, + "grad_norm": 0.36856828496724703, + "learning_rate": 9.623066095451494e-08, + "loss": 0.04, + "step": 4273 + }, + { + "epoch": 4.800224624455987, + "grad_norm": 0.3454697388417179, + "learning_rate": 9.514810322299283e-08, + "loss": 0.0327, + "step": 4274 + }, + { + "epoch": 4.801347746735926, + "grad_norm": 0.3462964343590165, + "learning_rate": 9.407164005584057e-08, + "loss": 0.0333, + "step": 4275 + }, + { + "epoch": 4.802470869015864, + "grad_norm": 0.3431968345054756, + "learning_rate": 9.300127211541832e-08, + "loss": 0.0314, + "step": 4276 + }, + { + "epoch": 4.803593991295802, + "grad_norm": 0.3651225919722223, + "learning_rate": 9.193700006033368e-08, + "loss": 0.0412, + "step": 4277 + }, + { + "epoch": 4.804717113575741, + "grad_norm": 0.33522720006840734, + "learning_rate": 9.087882454544839e-08, + "loss": 0.0298, + "step": 4278 + }, + { + "epoch": 4.805840235855679, + "grad_norm": 0.3320171412387023, + "learning_rate": 8.982674622186605e-08, + "loss": 0.0309, + "step": 4279 + }, + { + "epoch": 4.806963358135617, + "grad_norm": 0.3506412235588877, + "learning_rate": 8.878076573694328e-08, + "loss": 0.0344, + "step": 4280 + }, + { + "epoch": 4.808086480415556, + "grad_norm": 0.36747870462605653, + "learning_rate": 8.774088373428413e-08, + "loss": 0.0401, + "step": 4281 + }, + { + "epoch": 4.809209602695494, + "grad_norm": 0.3537987285313262, + "learning_rate": 8.67071008537379e-08, + "loss": 0.0342, + "step": 4282 + }, + { + "epoch": 4.8103327249754315, + "grad_norm": 0.33629875351700084, + "learning_rate": 8.567941773140465e-08, + "loss": 0.0332, + "step": 4283 + }, + { + "epoch": 4.81145584725537, + "grad_norm": 0.3534627818646823, + "learning_rate": 8.465783499962633e-08, + "loss": 0.0354, + "step": 4284 + }, + { + "epoch": 4.812578969535308, + "grad_norm": 0.3630589407739355, + "learning_rate": 8.364235328699566e-08, + "loss": 0.0352, + "step": 4285 + }, + { + "epoch": 4.813702091815246, + "grad_norm": 0.33985452597445087, + "learning_rate": 8.263297321835062e-08, + "loss": 0.032, + "step": 4286 + }, + { + "epoch": 4.814825214095185, + "grad_norm": 0.355931019590897, + "learning_rate": 8.162969541477217e-08, + "loss": 0.0323, + "step": 4287 + }, + { + "epoch": 4.815948336375123, + "grad_norm": 0.36125431699041477, + "learning_rate": 8.063252049358983e-08, + "loss": 0.04, + "step": 4288 + }, + { + "epoch": 4.817071458655061, + "grad_norm": 0.3354243066230309, + "learning_rate": 7.96414490683739e-08, + "loss": 0.0334, + "step": 4289 + }, + { + "epoch": 4.818194580935, + "grad_norm": 0.35478986196267864, + "learning_rate": 7.865648174894325e-08, + "loss": 0.0338, + "step": 4290 + }, + { + "epoch": 4.819317703214938, + "grad_norm": 0.34998123448801854, + "learning_rate": 7.767761914135974e-08, + "loss": 0.0339, + "step": 4291 + }, + { + "epoch": 4.820440825494876, + "grad_norm": 0.35949343783541043, + "learning_rate": 7.670486184792713e-08, + "loss": 0.0333, + "step": 4292 + }, + { + "epoch": 4.8215639477748145, + "grad_norm": 0.3651218793634851, + "learning_rate": 7.573821046719332e-08, + "loss": 0.0394, + "step": 4293 + }, + { + "epoch": 4.822687070054752, + "grad_norm": 0.34634749486357275, + "learning_rate": 7.477766559395139e-08, + "loss": 0.0327, + "step": 4294 + }, + { + "epoch": 4.82381019233469, + "grad_norm": 0.3413706560361767, + "learning_rate": 7.382322781923301e-08, + "loss": 0.0367, + "step": 4295 + }, + { + "epoch": 4.824933314614629, + "grad_norm": 0.3475100318642269, + "learning_rate": 7.287489773031508e-08, + "loss": 0.0344, + "step": 4296 + }, + { + "epoch": 4.826056436894567, + "grad_norm": 0.35140910720099416, + "learning_rate": 7.193267591071529e-08, + "loss": 0.0313, + "step": 4297 + }, + { + "epoch": 4.827179559174505, + "grad_norm": 0.30755190051409365, + "learning_rate": 7.09965629401943e-08, + "loss": 0.0299, + "step": 4298 + }, + { + "epoch": 4.828302681454443, + "grad_norm": 0.3208778262419729, + "learning_rate": 7.006655939475248e-08, + "loss": 0.0304, + "step": 4299 + }, + { + "epoch": 4.829425803734382, + "grad_norm": 0.33024767193628707, + "learning_rate": 6.914266584662988e-08, + "loss": 0.0317, + "step": 4300 + }, + { + "epoch": 4.83054892601432, + "grad_norm": 0.3600374450371469, + "learning_rate": 6.82248828643084e-08, + "loss": 0.0358, + "step": 4301 + }, + { + "epoch": 4.831672048294258, + "grad_norm": 0.343893665355897, + "learning_rate": 6.731321101251187e-08, + "loss": 0.0348, + "step": 4302 + }, + { + "epoch": 4.8327951705741965, + "grad_norm": 0.3401993628458368, + "learning_rate": 6.640765085220047e-08, + "loss": 0.0334, + "step": 4303 + }, + { + "epoch": 4.833918292854134, + "grad_norm": 0.37117274391604826, + "learning_rate": 6.550820294057625e-08, + "loss": 0.038, + "step": 4304 + }, + { + "epoch": 4.835041415134072, + "grad_norm": 0.3270123504885918, + "learning_rate": 6.461486783107762e-08, + "loss": 0.0318, + "step": 4305 + }, + { + "epoch": 4.836164537414011, + "grad_norm": 0.35563694733212037, + "learning_rate": 6.3727646073386e-08, + "loss": 0.0354, + "step": 4306 + }, + { + "epoch": 4.837287659693949, + "grad_norm": 0.3312605443855957, + "learning_rate": 6.284653821341691e-08, + "loss": 0.0341, + "step": 4307 + }, + { + "epoch": 4.838410781973887, + "grad_norm": 0.34887322226150547, + "learning_rate": 6.197154479332667e-08, + "loss": 0.0356, + "step": 4308 + }, + { + "epoch": 4.839533904253826, + "grad_norm": 0.34169094625118857, + "learning_rate": 6.110266635150796e-08, + "loss": 0.0332, + "step": 4309 + }, + { + "epoch": 4.840657026533764, + "grad_norm": 0.3460642146322915, + "learning_rate": 6.02399034225909e-08, + "loss": 0.0356, + "step": 4310 + }, + { + "epoch": 4.841780148813702, + "grad_norm": 0.33286836691758975, + "learning_rate": 5.9383256537444144e-08, + "loss": 0.0297, + "step": 4311 + }, + { + "epoch": 4.842903271093641, + "grad_norm": 0.3402965072293783, + "learning_rate": 5.853272622317052e-08, + "loss": 0.0349, + "step": 4312 + }, + { + "epoch": 4.8440263933735785, + "grad_norm": 0.35934649357906984, + "learning_rate": 5.7688313003112506e-08, + "loss": 0.0374, + "step": 4313 + }, + { + "epoch": 4.8451495156535165, + "grad_norm": 0.3536391767479085, + "learning_rate": 5.685001739684448e-08, + "loss": 0.037, + "step": 4314 + }, + { + "epoch": 4.846272637933455, + "grad_norm": 0.41892157440986844, + "learning_rate": 5.6017839920180506e-08, + "loss": 0.0332, + "step": 4315 + }, + { + "epoch": 4.847395760213393, + "grad_norm": 0.362207049616821, + "learning_rate": 5.519178108516765e-08, + "loss": 0.0358, + "step": 4316 + }, + { + "epoch": 4.848518882493331, + "grad_norm": 0.339492102312975, + "learning_rate": 5.437184140009044e-08, + "loss": 0.0309, + "step": 4317 + }, + { + "epoch": 4.84964200477327, + "grad_norm": 0.3487122546387408, + "learning_rate": 5.355802136946531e-08, + "loss": 0.0345, + "step": 4318 + }, + { + "epoch": 4.850765127053208, + "grad_norm": 0.3793793262602845, + "learning_rate": 5.2750321494046133e-08, + "loss": 0.0381, + "step": 4319 + }, + { + "epoch": 4.851888249333146, + "grad_norm": 0.36623706813318585, + "learning_rate": 5.1948742270817584e-08, + "loss": 0.0346, + "step": 4320 + }, + { + "epoch": 4.853011371613085, + "grad_norm": 0.3730438215325291, + "learning_rate": 5.1153284193001803e-08, + "loss": 0.0391, + "step": 4321 + }, + { + "epoch": 4.854134493893023, + "grad_norm": 0.31906053615524266, + "learning_rate": 5.036394775005282e-08, + "loss": 0.0305, + "step": 4322 + }, + { + "epoch": 4.855257616172961, + "grad_norm": 0.3256027247837537, + "learning_rate": 4.958073342765768e-08, + "loss": 0.0334, + "step": 4323 + }, + { + "epoch": 4.856380738452899, + "grad_norm": 0.3507274265615372, + "learning_rate": 4.880364170773533e-08, + "loss": 0.0349, + "step": 4324 + }, + { + "epoch": 4.857503860732837, + "grad_norm": 0.3429380538021881, + "learning_rate": 4.803267306844106e-08, + "loss": 0.0323, + "step": 4325 + }, + { + "epoch": 4.858626983012775, + "grad_norm": 0.3537057379449312, + "learning_rate": 4.726782798415985e-08, + "loss": 0.0374, + "step": 4326 + }, + { + "epoch": 4.859750105292714, + "grad_norm": 0.31967178022973664, + "learning_rate": 4.650910692550858e-08, + "loss": 0.0312, + "step": 4327 + }, + { + "epoch": 4.860873227572652, + "grad_norm": 0.3465360661382547, + "learning_rate": 4.5756510359337145e-08, + "loss": 0.0383, + "step": 4328 + }, + { + "epoch": 4.86199634985259, + "grad_norm": 0.3480958307636944, + "learning_rate": 4.501003874872623e-08, + "loss": 0.0361, + "step": 4329 + }, + { + "epoch": 4.863119472132528, + "grad_norm": 0.351546101719163, + "learning_rate": 4.426969255298841e-08, + "loss": 0.0365, + "step": 4330 + }, + { + "epoch": 4.864242594412467, + "grad_norm": 0.33709962673459676, + "learning_rate": 4.3535472227667075e-08, + "loss": 0.031, + "step": 4331 + }, + { + "epoch": 4.865365716692405, + "grad_norm": 0.3592936899620035, + "learning_rate": 4.280737822453529e-08, + "loss": 0.0402, + "step": 4332 + }, + { + "epoch": 4.866488838972343, + "grad_norm": 0.3400368919719095, + "learning_rate": 4.208541099159691e-08, + "loss": 0.0324, + "step": 4333 + }, + { + "epoch": 4.8676119612522815, + "grad_norm": 0.3289295108551006, + "learning_rate": 4.136957097308769e-08, + "loss": 0.0316, + "step": 4334 + }, + { + "epoch": 4.868735083532219, + "grad_norm": 0.3564469735283429, + "learning_rate": 4.065985860947086e-08, + "loss": 0.0383, + "step": 4335 + }, + { + "epoch": 4.869858205812157, + "grad_norm": 0.3456958920658182, + "learning_rate": 3.9956274337441533e-08, + "loss": 0.0373, + "step": 4336 + }, + { + "epoch": 4.870981328092096, + "grad_norm": 0.3546463192740376, + "learning_rate": 3.92588185899212e-08, + "loss": 0.0351, + "step": 4337 + }, + { + "epoch": 4.872104450372034, + "grad_norm": 0.3634352198807142, + "learning_rate": 3.856749179606323e-08, + "loss": 0.0351, + "step": 4338 + }, + { + "epoch": 4.873227572651972, + "grad_norm": 0.34143099459117354, + "learning_rate": 3.7882294381247355e-08, + "loss": 0.0325, + "step": 4339 + }, + { + "epoch": 4.874350694931911, + "grad_norm": 0.3445137921102463, + "learning_rate": 3.72032267670841e-08, + "loss": 0.0342, + "step": 4340 + }, + { + "epoch": 4.875473817211849, + "grad_norm": 0.34957356832593245, + "learning_rate": 3.6530289371411453e-08, + "loss": 0.0335, + "step": 4341 + }, + { + "epoch": 4.876596939491787, + "grad_norm": 0.3559100988101287, + "learning_rate": 3.586348260829486e-08, + "loss": 0.0341, + "step": 4342 + }, + { + "epoch": 4.877720061771726, + "grad_norm": 0.3582325578381939, + "learning_rate": 3.520280688802724e-08, + "loss": 0.0389, + "step": 4343 + }, + { + "epoch": 4.8788431840516635, + "grad_norm": 0.3434812907253102, + "learning_rate": 3.4548262617131176e-08, + "loss": 0.0313, + "step": 4344 + }, + { + "epoch": 4.8799663063316014, + "grad_norm": 0.32607848224810887, + "learning_rate": 3.38998501983534e-08, + "loss": 0.0306, + "step": 4345 + }, + { + "epoch": 4.88108942861154, + "grad_norm": 0.3701550768586464, + "learning_rate": 3.3257570030670316e-08, + "loss": 0.0361, + "step": 4346 + }, + { + "epoch": 4.882212550891478, + "grad_norm": 0.36030791267577544, + "learning_rate": 3.2621422509282464e-08, + "loss": 0.0366, + "step": 4347 + }, + { + "epoch": 4.883335673171416, + "grad_norm": 0.3451337361364224, + "learning_rate": 3.199140802562006e-08, + "loss": 0.0314, + "step": 4348 + }, + { + "epoch": 4.884458795451355, + "grad_norm": 0.3452767709477446, + "learning_rate": 3.1367526967336356e-08, + "loss": 0.0354, + "step": 4349 + }, + { + "epoch": 4.885581917731293, + "grad_norm": 0.3547638758596286, + "learning_rate": 3.0749779718314273e-08, + "loss": 0.0353, + "step": 4350 + }, + { + "epoch": 4.886705040011231, + "grad_norm": 0.3506982695342505, + "learning_rate": 3.013816665865976e-08, + "loss": 0.0344, + "step": 4351 + }, + { + "epoch": 4.88782816229117, + "grad_norm": 0.3737683890517672, + "learning_rate": 2.9532688164704005e-08, + "loss": 0.0378, + "step": 4352 + }, + { + "epoch": 4.888951284571108, + "grad_norm": 0.35014209109341093, + "learning_rate": 2.8933344609004545e-08, + "loss": 0.0367, + "step": 4353 + }, + { + "epoch": 4.8900744068510456, + "grad_norm": 0.3264875513879221, + "learning_rate": 2.834013636034527e-08, + "loss": 0.0328, + "step": 4354 + }, + { + "epoch": 4.891197529130984, + "grad_norm": 0.32367274530213846, + "learning_rate": 2.7753063783734212e-08, + "loss": 0.0308, + "step": 4355 + }, + { + "epoch": 4.892320651410922, + "grad_norm": 0.32601616170265874, + "learning_rate": 2.7172127240401302e-08, + "loss": 0.0306, + "step": 4356 + }, + { + "epoch": 4.89344377369086, + "grad_norm": 0.3514262599500851, + "learning_rate": 2.6597327087805048e-08, + "loss": 0.034, + "step": 4357 + }, + { + "epoch": 4.894566895970799, + "grad_norm": 0.34413690056428353, + "learning_rate": 2.6028663679625865e-08, + "loss": 0.0358, + "step": 4358 + }, + { + "epoch": 4.895690018250737, + "grad_norm": 0.33388792579998283, + "learning_rate": 2.5466137365768307e-08, + "loss": 0.0331, + "step": 4359 + }, + { + "epoch": 4.896813140530675, + "grad_norm": 0.3558627099214127, + "learning_rate": 2.4909748492362162e-08, + "loss": 0.0335, + "step": 4360 + }, + { + "epoch": 4.897936262810614, + "grad_norm": 0.3568173099350905, + "learning_rate": 2.4359497401758026e-08, + "loss": 0.0335, + "step": 4361 + }, + { + "epoch": 4.899059385090552, + "grad_norm": 0.3576774734027632, + "learning_rate": 2.3815384432531728e-08, + "loss": 0.0352, + "step": 4362 + }, + { + "epoch": 4.90018250737049, + "grad_norm": 0.35519782847619524, + "learning_rate": 2.327740991948213e-08, + "loss": 0.0346, + "step": 4363 + }, + { + "epoch": 4.9013056296504285, + "grad_norm": 0.3450872223409437, + "learning_rate": 2.2745574193632215e-08, + "loss": 0.035, + "step": 4364 + }, + { + "epoch": 4.902428751930366, + "grad_norm": 0.32911920269693273, + "learning_rate": 2.2219877582224657e-08, + "loss": 0.0307, + "step": 4365 + }, + { + "epoch": 4.903551874210304, + "grad_norm": 0.33655879907579667, + "learning_rate": 2.170032040872627e-08, + "loss": 0.0317, + "step": 4366 + }, + { + "epoch": 4.904674996490243, + "grad_norm": 0.34894129051496664, + "learning_rate": 2.1186902992827995e-08, + "loss": 0.0356, + "step": 4367 + }, + { + "epoch": 4.905798118770181, + "grad_norm": 0.3720141793260485, + "learning_rate": 2.067962565043935e-08, + "loss": 0.0343, + "step": 4368 + }, + { + "epoch": 4.906921241050119, + "grad_norm": 0.34874029206243984, + "learning_rate": 2.0178488693695096e-08, + "loss": 0.0361, + "step": 4369 + }, + { + "epoch": 4.908044363330058, + "grad_norm": 0.3474003843287452, + "learning_rate": 1.968349243094969e-08, + "loss": 0.0334, + "step": 4370 + }, + { + "epoch": 4.909167485609996, + "grad_norm": 0.3423160120197686, + "learning_rate": 1.9194637166780606e-08, + "loss": 0.0337, + "step": 4371 + }, + { + "epoch": 4.910290607889934, + "grad_norm": 0.33591485159670115, + "learning_rate": 1.8711923201983895e-08, + "loss": 0.0343, + "step": 4372 + }, + { + "epoch": 4.911413730169873, + "grad_norm": 0.33191680893555436, + "learning_rate": 1.8235350833579745e-08, + "loss": 0.0321, + "step": 4373 + }, + { + "epoch": 4.9125368524498105, + "grad_norm": 0.3316935624017572, + "learning_rate": 1.7764920354809146e-08, + "loss": 0.0337, + "step": 4374 + }, + { + "epoch": 4.9136599747297485, + "grad_norm": 0.35967697736265597, + "learning_rate": 1.730063205513277e-08, + "loss": 0.0365, + "step": 4375 + }, + { + "epoch": 4.914783097009687, + "grad_norm": 0.359094588149611, + "learning_rate": 1.6842486220232102e-08, + "loss": 0.0321, + "step": 4376 + }, + { + "epoch": 4.915906219289625, + "grad_norm": 0.3604389646619046, + "learning_rate": 1.6390483132009415e-08, + "loss": 0.0369, + "step": 4377 + }, + { + "epoch": 4.917029341569563, + "grad_norm": 0.3457113375936627, + "learning_rate": 1.5944623068586683e-08, + "loss": 0.0339, + "step": 4378 + }, + { + "epoch": 4.918152463849502, + "grad_norm": 0.31938400735132355, + "learning_rate": 1.5504906304306677e-08, + "loss": 0.0317, + "step": 4379 + }, + { + "epoch": 4.91927558612944, + "grad_norm": 0.340077310318995, + "learning_rate": 1.5071333109732966e-08, + "loss": 0.0361, + "step": 4380 + }, + { + "epoch": 4.920398708409378, + "grad_norm": 0.35535503166517346, + "learning_rate": 1.4643903751647703e-08, + "loss": 0.0361, + "step": 4381 + }, + { + "epoch": 4.921521830689317, + "grad_norm": 0.3537974935188405, + "learning_rate": 1.422261849305162e-08, + "loss": 0.0342, + "step": 4382 + }, + { + "epoch": 4.922644952969255, + "grad_norm": 0.32843491322571805, + "learning_rate": 1.3807477593166252e-08, + "loss": 0.0319, + "step": 4383 + }, + { + "epoch": 4.923768075249193, + "grad_norm": 0.3642690465675649, + "learning_rate": 1.339848130743393e-08, + "loss": 0.0363, + "step": 4384 + }, + { + "epoch": 4.924891197529131, + "grad_norm": 0.3431037322105767, + "learning_rate": 1.299562988751335e-08, + "loss": 0.0342, + "step": 4385 + }, + { + "epoch": 4.926014319809069, + "grad_norm": 0.37231643890134525, + "learning_rate": 1.2598923581284006e-08, + "loss": 0.0341, + "step": 4386 + }, + { + "epoch": 4.927137442089007, + "grad_norm": 0.3500700062527457, + "learning_rate": 1.2208362632842863e-08, + "loss": 0.0363, + "step": 4387 + }, + { + "epoch": 4.928260564368946, + "grad_norm": 0.33425871580503613, + "learning_rate": 1.1823947282506576e-08, + "loss": 0.0312, + "step": 4388 + }, + { + "epoch": 4.929383686648884, + "grad_norm": 0.3633980633957219, + "learning_rate": 1.144567776681149e-08, + "loss": 0.0376, + "step": 4389 + }, + { + "epoch": 4.930506808928822, + "grad_norm": 0.3349073890426962, + "learning_rate": 1.1073554318509206e-08, + "loss": 0.0326, + "step": 4390 + }, + { + "epoch": 4.931629931208761, + "grad_norm": 0.34098481332204433, + "learning_rate": 1.0707577166572114e-08, + "loss": 0.0341, + "step": 4391 + }, + { + "epoch": 4.932753053488699, + "grad_norm": 0.3285118951852116, + "learning_rate": 1.0347746536191195e-08, + "loss": 0.0329, + "step": 4392 + }, + { + "epoch": 4.933876175768637, + "grad_norm": 0.3451360635835923, + "learning_rate": 9.994062648771563e-09, + "loss": 0.0345, + "step": 4393 + }, + { + "epoch": 4.9349992980485755, + "grad_norm": 0.3451649817059978, + "learning_rate": 9.646525721940247e-09, + "loss": 0.0339, + "step": 4394 + }, + { + "epoch": 4.9361224203285134, + "grad_norm": 0.3329314247830526, + "learning_rate": 9.305135969541746e-09, + "loss": 0.0331, + "step": 4395 + }, + { + "epoch": 4.937245542608451, + "grad_norm": 0.3703275176605861, + "learning_rate": 8.969893601634694e-09, + "loss": 0.0321, + "step": 4396 + }, + { + "epoch": 4.93836866488839, + "grad_norm": 0.38004031161852586, + "learning_rate": 8.64079882449853e-09, + "loss": 0.0343, + "step": 4397 + }, + { + "epoch": 4.939491787168328, + "grad_norm": 0.36158113782294066, + "learning_rate": 8.317851840629055e-09, + "loss": 0.0324, + "step": 4398 + }, + { + "epoch": 4.940614909448266, + "grad_norm": 0.35802038640100103, + "learning_rate": 8.001052848739532e-09, + "loss": 0.0355, + "step": 4399 + }, + { + "epoch": 4.941738031728204, + "grad_norm": 0.3534681414083675, + "learning_rate": 7.690402043758482e-09, + "loss": 0.0342, + "step": 4400 + }, + { + "epoch": 4.942861154008143, + "grad_norm": 0.3409750506973528, + "learning_rate": 7.385899616833003e-09, + "loss": 0.0322, + "step": 4401 + }, + { + "epoch": 4.943984276288081, + "grad_norm": 0.35696111455006185, + "learning_rate": 7.087545755327663e-09, + "loss": 0.0343, + "step": 4402 + }, + { + "epoch": 4.945107398568019, + "grad_norm": 0.3628732180685351, + "learning_rate": 6.795340642823389e-09, + "loss": 0.033, + "step": 4403 + }, + { + "epoch": 4.946230520847958, + "grad_norm": 0.3561531152392209, + "learning_rate": 6.50928445911525e-09, + "loss": 0.0347, + "step": 4404 + }, + { + "epoch": 4.9473536431278955, + "grad_norm": 0.3604397825246946, + "learning_rate": 6.229377380218005e-09, + "loss": 0.0349, + "step": 4405 + }, + { + "epoch": 4.948476765407833, + "grad_norm": 0.3515361148095995, + "learning_rate": 5.95561957836055e-09, + "loss": 0.0349, + "step": 4406 + }, + { + "epoch": 4.949599887687772, + "grad_norm": 0.34835719474984594, + "learning_rate": 5.688011221991474e-09, + "loss": 0.034, + "step": 4407 + }, + { + "epoch": 4.95072300996771, + "grad_norm": 0.36776092832286844, + "learning_rate": 5.426552475770175e-09, + "loss": 0.0405, + "step": 4408 + }, + { + "epoch": 4.951846132247648, + "grad_norm": 0.33964356120947575, + "learning_rate": 5.1712435005768504e-09, + "loss": 0.0314, + "step": 4409 + }, + { + "epoch": 4.952969254527587, + "grad_norm": 0.3576271748655273, + "learning_rate": 4.922084453505838e-09, + "loss": 0.0348, + "step": 4410 + }, + { + "epoch": 4.954092376807525, + "grad_norm": 0.3429879169889465, + "learning_rate": 4.679075487866725e-09, + "loss": 0.0337, + "step": 4411 + }, + { + "epoch": 4.955215499087463, + "grad_norm": 0.3745978618879177, + "learning_rate": 4.4422167531865675e-09, + "loss": 0.0368, + "step": 4412 + }, + { + "epoch": 4.956338621367402, + "grad_norm": 0.32986700838548433, + "learning_rate": 4.211508395206565e-09, + "loss": 0.0327, + "step": 4413 + }, + { + "epoch": 4.95746174364734, + "grad_norm": 0.33311799847039564, + "learning_rate": 3.986950555883162e-09, + "loss": 0.0348, + "step": 4414 + }, + { + "epoch": 4.9585848659272775, + "grad_norm": 0.3851607304135312, + "learning_rate": 3.768543373391387e-09, + "loss": 0.0424, + "step": 4415 + }, + { + "epoch": 4.959707988207216, + "grad_norm": 0.31347433372100486, + "learning_rate": 3.5562869821181843e-09, + "loss": 0.0315, + "step": 4416 + }, + { + "epoch": 4.960831110487154, + "grad_norm": 0.34107319972689115, + "learning_rate": 3.3501815126668613e-09, + "loss": 0.0339, + "step": 4417 + }, + { + "epoch": 4.961954232767092, + "grad_norm": 0.33193990363553355, + "learning_rate": 3.150227091857083e-09, + "loss": 0.0329, + "step": 4418 + }, + { + "epoch": 4.963077355047031, + "grad_norm": 0.3715468555033886, + "learning_rate": 2.9564238427237657e-09, + "loss": 0.038, + "step": 4419 + }, + { + "epoch": 4.964200477326969, + "grad_norm": 0.3372262605486213, + "learning_rate": 2.7687718845148538e-09, + "loss": 0.0314, + "step": 4420 + }, + { + "epoch": 4.965323599606907, + "grad_norm": 0.350019697957, + "learning_rate": 2.587271332694652e-09, + "loss": 0.0347, + "step": 4421 + }, + { + "epoch": 4.966446721886846, + "grad_norm": 0.3446161586717599, + "learning_rate": 2.411922298943825e-09, + "loss": 0.0344, + "step": 4422 + }, + { + "epoch": 4.967569844166784, + "grad_norm": 0.319098731802588, + "learning_rate": 2.242724891156067e-09, + "loss": 0.0323, + "step": 4423 + }, + { + "epoch": 4.968692966446722, + "grad_norm": 0.3801202120943219, + "learning_rate": 2.079679213439212e-09, + "loss": 0.0372, + "step": 4424 + }, + { + "epoch": 4.9698160887266605, + "grad_norm": 0.33750070670739035, + "learning_rate": 1.9227853661174524e-09, + "loss": 0.0352, + "step": 4425 + }, + { + "epoch": 4.970939211006598, + "grad_norm": 0.3519190781169123, + "learning_rate": 1.7720434457302315e-09, + "loss": 0.0373, + "step": 4426 + }, + { + "epoch": 4.972062333286536, + "grad_norm": 0.3437633504102528, + "learning_rate": 1.6274535450311324e-09, + "loss": 0.0325, + "step": 4427 + }, + { + "epoch": 4.973185455566475, + "grad_norm": 0.3677752690761468, + "learning_rate": 1.4890157529856563e-09, + "loss": 0.0402, + "step": 4428 + }, + { + "epoch": 4.974308577846413, + "grad_norm": 0.3683625506073574, + "learning_rate": 1.3567301547778856e-09, + "loss": 0.037, + "step": 4429 + }, + { + "epoch": 4.975431700126351, + "grad_norm": 0.3560600240146814, + "learning_rate": 1.230596831804931e-09, + "loss": 0.0358, + "step": 4430 + }, + { + "epoch": 4.976554822406289, + "grad_norm": 0.3625253632661801, + "learning_rate": 1.1106158616758235e-09, + "loss": 0.0366, + "step": 4431 + }, + { + "epoch": 4.977677944686228, + "grad_norm": 0.3444846000256938, + "learning_rate": 9.96787318218173e-10, + "loss": 0.0324, + "step": 4432 + }, + { + "epoch": 4.978801066966166, + "grad_norm": 0.35833357272901073, + "learning_rate": 8.891112714726203e-10, + "loss": 0.035, + "step": 4433 + }, + { + "epoch": 4.979924189246104, + "grad_norm": 0.3425329647959898, + "learning_rate": 7.875877876906135e-10, + "loss": 0.0331, + "step": 4434 + }, + { + "epoch": 4.9810473115260425, + "grad_norm": 0.35679200669540573, + "learning_rate": 6.922169293421821e-10, + "loss": 0.0364, + "step": 4435 + }, + { + "epoch": 4.9821704338059805, + "grad_norm": 0.3259794017034792, + "learning_rate": 6.029987551103844e-10, + "loss": 0.0319, + "step": 4436 + }, + { + "epoch": 4.983293556085918, + "grad_norm": 0.3480641727719168, + "learning_rate": 5.199333198924183e-10, + "loss": 0.0336, + "step": 4437 + }, + { + "epoch": 4.984416678365857, + "grad_norm": 0.348847347957358, + "learning_rate": 4.4302067479851105e-10, + "loss": 0.0321, + "step": 4438 + }, + { + "epoch": 4.985539800645795, + "grad_norm": 0.3783464710272088, + "learning_rate": 3.7226086715413945e-10, + "loss": 0.0381, + "step": 4439 + }, + { + "epoch": 4.986662922925733, + "grad_norm": 0.35344733158741803, + "learning_rate": 3.076539404989198e-10, + "loss": 0.0309, + "step": 4440 + }, + { + "epoch": 4.987786045205672, + "grad_norm": 0.33442902082308484, + "learning_rate": 2.4919993458549783e-10, + "loss": 0.0343, + "step": 4441 + }, + { + "epoch": 4.98890916748561, + "grad_norm": 0.35360484233382744, + "learning_rate": 1.968988853806586e-10, + "loss": 0.036, + "step": 4442 + }, + { + "epoch": 4.990032289765548, + "grad_norm": 0.3421628593665695, + "learning_rate": 1.5075082506865734e-10, + "loss": 0.0356, + "step": 4443 + }, + { + "epoch": 4.991155412045487, + "grad_norm": 0.33672336609006215, + "learning_rate": 1.1075578204233772e-10, + "loss": 0.0337, + "step": 4444 + }, + { + "epoch": 4.992278534325425, + "grad_norm": 0.3641808930391885, + "learning_rate": 7.691378091090329e-11, + "loss": 0.0328, + "step": 4445 + }, + { + "epoch": 4.9934016566053625, + "grad_norm": 0.32873267052415456, + "learning_rate": 4.9224842499917546e-11, + "loss": 0.0315, + "step": 4446 + }, + { + "epoch": 4.994524778885301, + "grad_norm": 0.36446284662411277, + "learning_rate": 2.768898384464258e-11, + "loss": 0.035, + "step": 4447 + }, + { + "epoch": 4.995647901165239, + "grad_norm": 0.33359437371199696, + "learning_rate": 1.2306218196700414e-11, + "loss": 0.031, + "step": 4448 + }, + { + "epoch": 4.996771023445177, + "grad_norm": 0.3510829003384089, + "learning_rate": 3.0765550229627793e-12, + "loss": 0.0348, + "step": 4449 + }, + { + "epoch": 4.997894145725116, + "grad_norm": 0.3451047076893161, + "learning_rate": 0.0, + "loss": 0.0326, + "step": 4450 + }, + { + "epoch": 4.997894145725116, + "step": 4450, + "total_flos": 2.0283773111783916e+18, + "train_loss": 0.1949592823937033, + "train_runtime": 25560.7624, + "train_samples_per_second": 22.291, + "train_steps_per_second": 0.174 + } + ], + "logging_steps": 1.0, + "max_steps": 4450, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0283773111783916e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}