{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 538240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018579072532699168, "grad_norm": 3.261758327484131, "learning_rate": 4.995355231866826e-05, "loss": 7.9049, "step": 500 }, { "epoch": 0.037158145065398336, "grad_norm": 2.7945966720581055, "learning_rate": 4.99071046373365e-05, "loss": 7.313, "step": 1000 }, { "epoch": 0.0557372175980975, "grad_norm": 3.66495680809021, "learning_rate": 4.986065695600476e-05, "loss": 7.0529, "step": 1500 }, { "epoch": 0.07431629013079667, "grad_norm": 3.696575880050659, "learning_rate": 4.981420927467301e-05, "loss": 6.8319, "step": 2000 }, { "epoch": 0.09289536266349584, "grad_norm": 4.190186977386475, "learning_rate": 4.976776159334126e-05, "loss": 6.5888, "step": 2500 }, { "epoch": 0.111474435196195, "grad_norm": 4.945929050445557, "learning_rate": 4.972131391200951e-05, "loss": 6.3799, "step": 3000 }, { "epoch": 0.13005350772889418, "grad_norm": 4.898506164550781, "learning_rate": 4.967486623067777e-05, "loss": 6.1872, "step": 3500 }, { "epoch": 0.14863258026159334, "grad_norm": 4.006326675415039, "learning_rate": 4.962841854934602e-05, "loss": 6.0257, "step": 4000 }, { "epoch": 0.1672116527942925, "grad_norm": 5.2170209884643555, "learning_rate": 4.958197086801427e-05, "loss": 5.8457, "step": 4500 }, { "epoch": 0.18579072532699167, "grad_norm": 5.515870094299316, "learning_rate": 4.953552318668253e-05, "loss": 5.7231, "step": 5000 }, { "epoch": 0.20436979785969084, "grad_norm": 6.801781177520752, "learning_rate": 4.948907550535077e-05, "loss": 5.5989, "step": 5500 }, { "epoch": 0.22294887039239, "grad_norm": 5.6380205154418945, "learning_rate": 4.944262782401903e-05, "loss": 5.4627, "step": 6000 }, { "epoch": 0.24152794292508917, "grad_norm": 4.960023880004883, "learning_rate": 4.939618014268728e-05, "loss": 5.3395, "step": 6500 }, { "epoch": 0.26010701545778836, "grad_norm": 4.814383506774902, "learning_rate": 4.934973246135553e-05, "loss": 5.2441, "step": 7000 }, { "epoch": 0.2786860879904875, "grad_norm": 5.070895195007324, "learning_rate": 4.930328478002379e-05, "loss": 5.1584, "step": 7500 }, { "epoch": 0.2972651605231867, "grad_norm": 4.370517730712891, "learning_rate": 4.925683709869204e-05, "loss": 5.08, "step": 8000 }, { "epoch": 0.3158442330558858, "grad_norm": 5.01335334777832, "learning_rate": 4.921038941736029e-05, "loss": 5.0119, "step": 8500 }, { "epoch": 0.334423305588585, "grad_norm": 5.798189163208008, "learning_rate": 4.916394173602854e-05, "loss": 4.9469, "step": 9000 }, { "epoch": 0.3530023781212842, "grad_norm": 5.567455291748047, "learning_rate": 4.9117494054696796e-05, "loss": 4.8906, "step": 9500 }, { "epoch": 0.37158145065398335, "grad_norm": 5.719528675079346, "learning_rate": 4.907104637336504e-05, "loss": 4.826, "step": 10000 }, { "epoch": 0.39016052318668254, "grad_norm": 5.381674289703369, "learning_rate": 4.90245986920333e-05, "loss": 4.7627, "step": 10500 }, { "epoch": 0.4087395957193817, "grad_norm": 5.749002933502197, "learning_rate": 4.8978151010701554e-05, "loss": 4.7247, "step": 11000 }, { "epoch": 0.42731866825208087, "grad_norm": 5.553402423858643, "learning_rate": 4.89317033293698e-05, "loss": 4.648, "step": 11500 }, { "epoch": 0.44589774078478, "grad_norm": 5.483209133148193, "learning_rate": 4.8885255648038055e-05, "loss": 4.5921, "step": 12000 }, { "epoch": 0.4644768133174792, "grad_norm": 5.7984747886657715, "learning_rate": 4.8838807966706305e-05, "loss": 4.5489, "step": 12500 }, { "epoch": 0.48305588585017833, "grad_norm": 5.168997287750244, "learning_rate": 4.8792360285374556e-05, "loss": 4.5258, "step": 13000 }, { "epoch": 0.5016349583828775, "grad_norm": 5.243541240692139, "learning_rate": 4.8745912604042806e-05, "loss": 4.4826, "step": 13500 }, { "epoch": 0.5202140309155767, "grad_norm": 5.172119140625, "learning_rate": 4.8699464922711064e-05, "loss": 4.4446, "step": 14000 }, { "epoch": 0.5387931034482759, "grad_norm": 4.617650032043457, "learning_rate": 4.8653017241379314e-05, "loss": 4.4262, "step": 14500 }, { "epoch": 0.557372175980975, "grad_norm": 5.853124618530273, "learning_rate": 4.8606569560047565e-05, "loss": 4.3786, "step": 15000 }, { "epoch": 0.5759512485136742, "grad_norm": 4.6473493576049805, "learning_rate": 4.8560121878715815e-05, "loss": 4.3497, "step": 15500 }, { "epoch": 0.5945303210463734, "grad_norm": 5.469015121459961, "learning_rate": 4.8513674197384066e-05, "loss": 4.2946, "step": 16000 }, { "epoch": 0.6131093935790726, "grad_norm": 4.9087371826171875, "learning_rate": 4.846722651605232e-05, "loss": 4.2847, "step": 16500 }, { "epoch": 0.6316884661117717, "grad_norm": 5.479755878448486, "learning_rate": 4.842077883472057e-05, "loss": 4.2635, "step": 17000 }, { "epoch": 0.6502675386444708, "grad_norm": 5.843453407287598, "learning_rate": 4.8374331153388824e-05, "loss": 4.2108, "step": 17500 }, { "epoch": 0.66884661117717, "grad_norm": 5.5419535636901855, "learning_rate": 4.832788347205708e-05, "loss": 4.2187, "step": 18000 }, { "epoch": 0.6874256837098692, "grad_norm": 5.9003376960754395, "learning_rate": 4.8281435790725325e-05, "loss": 4.16, "step": 18500 }, { "epoch": 0.7060047562425684, "grad_norm": 5.591574668884277, "learning_rate": 4.823498810939358e-05, "loss": 4.1433, "step": 19000 }, { "epoch": 0.7245838287752675, "grad_norm": 5.295060634613037, "learning_rate": 4.818854042806183e-05, "loss": 4.1182, "step": 19500 }, { "epoch": 0.7431629013079667, "grad_norm": 5.101735591888428, "learning_rate": 4.814209274673008e-05, "loss": 4.1155, "step": 20000 }, { "epoch": 0.7617419738406659, "grad_norm": 5.852224349975586, "learning_rate": 4.809564506539834e-05, "loss": 4.0749, "step": 20500 }, { "epoch": 0.7803210463733651, "grad_norm": 5.916712760925293, "learning_rate": 4.804919738406659e-05, "loss": 4.0554, "step": 21000 }, { "epoch": 0.7989001189060642, "grad_norm": 5.017261505126953, "learning_rate": 4.800274970273484e-05, "loss": 4.0196, "step": 21500 }, { "epoch": 0.8174791914387634, "grad_norm": 5.785404682159424, "learning_rate": 4.795630202140309e-05, "loss": 4.0196, "step": 22000 }, { "epoch": 0.8360582639714625, "grad_norm": 5.758474826812744, "learning_rate": 4.790985434007135e-05, "loss": 4.0027, "step": 22500 }, { "epoch": 0.8546373365041617, "grad_norm": 6.120078086853027, "learning_rate": 4.786340665873959e-05, "loss": 3.9577, "step": 23000 }, { "epoch": 0.8732164090368609, "grad_norm": 6.130490779876709, "learning_rate": 4.781695897740785e-05, "loss": 3.9507, "step": 23500 }, { "epoch": 0.89179548156956, "grad_norm": 4.95521354675293, "learning_rate": 4.77705112960761e-05, "loss": 3.9216, "step": 24000 }, { "epoch": 0.9103745541022592, "grad_norm": 5.775145530700684, "learning_rate": 4.772406361474435e-05, "loss": 3.9177, "step": 24500 }, { "epoch": 0.9289536266349584, "grad_norm": 5.804774761199951, "learning_rate": 4.767761593341261e-05, "loss": 3.8925, "step": 25000 }, { "epoch": 0.9475326991676576, "grad_norm": 5.883671760559082, "learning_rate": 4.763116825208086e-05, "loss": 3.8722, "step": 25500 }, { "epoch": 0.9661117717003567, "grad_norm": 5.462569236755371, "learning_rate": 4.758472057074911e-05, "loss": 3.8502, "step": 26000 }, { "epoch": 0.9846908442330559, "grad_norm": 5.68014669418335, "learning_rate": 4.753827288941736e-05, "loss": 3.8339, "step": 26500 }, { "epoch": 1.003269916765755, "grad_norm": 5.658189296722412, "learning_rate": 4.749182520808562e-05, "loss": 3.8221, "step": 27000 }, { "epoch": 1.0218489892984541, "grad_norm": 5.337306976318359, "learning_rate": 4.744537752675387e-05, "loss": 3.8099, "step": 27500 }, { "epoch": 1.0404280618311534, "grad_norm": 5.810146808624268, "learning_rate": 4.739892984542212e-05, "loss": 3.7751, "step": 28000 }, { "epoch": 1.0590071343638525, "grad_norm": 5.76528263092041, "learning_rate": 4.7352482164090375e-05, "loss": 3.7551, "step": 28500 }, { "epoch": 1.0775862068965518, "grad_norm": 6.346560955047607, "learning_rate": 4.730603448275862e-05, "loss": 3.7454, "step": 29000 }, { "epoch": 1.096165279429251, "grad_norm": 5.019473552703857, "learning_rate": 4.7259586801426876e-05, "loss": 3.74, "step": 29500 }, { "epoch": 1.11474435196195, "grad_norm": 5.211459636688232, "learning_rate": 4.721313912009513e-05, "loss": 3.7018, "step": 30000 }, { "epoch": 1.1333234244946493, "grad_norm": 5.713869571685791, "learning_rate": 4.716669143876338e-05, "loss": 3.6834, "step": 30500 }, { "epoch": 1.1519024970273484, "grad_norm": 5.4642744064331055, "learning_rate": 4.7120243757431635e-05, "loss": 3.7053, "step": 31000 }, { "epoch": 1.1704815695600477, "grad_norm": 6.932915687561035, "learning_rate": 4.7073796076099885e-05, "loss": 3.6719, "step": 31500 }, { "epoch": 1.1890606420927468, "grad_norm": 5.861956596374512, "learning_rate": 4.7027348394768136e-05, "loss": 3.6613, "step": 32000 }, { "epoch": 1.2076397146254458, "grad_norm": 5.654363632202148, "learning_rate": 4.6980900713436386e-05, "loss": 3.6413, "step": 32500 }, { "epoch": 1.2262187871581451, "grad_norm": 5.2097392082214355, "learning_rate": 4.693445303210464e-05, "loss": 3.6603, "step": 33000 }, { "epoch": 1.2447978596908442, "grad_norm": 5.455073833465576, "learning_rate": 4.688800535077289e-05, "loss": 3.6455, "step": 33500 }, { "epoch": 1.2633769322235433, "grad_norm": 5.670316219329834, "learning_rate": 4.6841557669441144e-05, "loss": 3.6029, "step": 34000 }, { "epoch": 1.2819560047562426, "grad_norm": 6.064113140106201, "learning_rate": 4.6795109988109395e-05, "loss": 3.5978, "step": 34500 }, { "epoch": 1.3005350772889417, "grad_norm": 5.650447368621826, "learning_rate": 4.6748662306777645e-05, "loss": 3.5817, "step": 35000 }, { "epoch": 1.3191141498216408, "grad_norm": 7.115864276885986, "learning_rate": 4.67022146254459e-05, "loss": 3.5769, "step": 35500 }, { "epoch": 1.33769322235434, "grad_norm": 6.497522354125977, "learning_rate": 4.665576694411415e-05, "loss": 3.5825, "step": 36000 }, { "epoch": 1.3562722948870394, "grad_norm": 5.834658622741699, "learning_rate": 4.6609319262782404e-05, "loss": 3.555, "step": 36500 }, { "epoch": 1.3748513674197385, "grad_norm": 5.968478679656982, "learning_rate": 4.6562871581450654e-05, "loss": 3.5476, "step": 37000 }, { "epoch": 1.3934304399524375, "grad_norm": 5.435029983520508, "learning_rate": 4.651642390011891e-05, "loss": 3.518, "step": 37500 }, { "epoch": 1.4120095124851368, "grad_norm": 5.952866554260254, "learning_rate": 4.646997621878716e-05, "loss": 3.5201, "step": 38000 }, { "epoch": 1.430588585017836, "grad_norm": 6.440069675445557, "learning_rate": 4.642352853745541e-05, "loss": 3.5159, "step": 38500 }, { "epoch": 1.449167657550535, "grad_norm": 5.686422824859619, "learning_rate": 4.637708085612367e-05, "loss": 3.5054, "step": 39000 }, { "epoch": 1.4677467300832343, "grad_norm": 6.039205551147461, "learning_rate": 4.633063317479191e-05, "loss": 3.499, "step": 39500 }, { "epoch": 1.4863258026159334, "grad_norm": 5.929929256439209, "learning_rate": 4.628418549346017e-05, "loss": 3.5125, "step": 40000 }, { "epoch": 1.5049048751486325, "grad_norm": 6.127495288848877, "learning_rate": 4.623773781212842e-05, "loss": 3.4683, "step": 40500 }, { "epoch": 1.5234839476813318, "grad_norm": 5.202524662017822, "learning_rate": 4.619129013079667e-05, "loss": 3.4457, "step": 41000 }, { "epoch": 1.542063020214031, "grad_norm": 6.199319839477539, "learning_rate": 4.614484244946493e-05, "loss": 3.4489, "step": 41500 }, { "epoch": 1.56064209274673, "grad_norm": 5.948836803436279, "learning_rate": 4.609839476813318e-05, "loss": 3.4412, "step": 42000 }, { "epoch": 1.5792211652794292, "grad_norm": 6.036122798919678, "learning_rate": 4.605194708680143e-05, "loss": 3.4259, "step": 42500 }, { "epoch": 1.5978002378121285, "grad_norm": 7.213582992553711, "learning_rate": 4.600549940546968e-05, "loss": 3.425, "step": 43000 }, { "epoch": 1.6163793103448276, "grad_norm": 5.579181671142578, "learning_rate": 4.595905172413794e-05, "loss": 3.4094, "step": 43500 }, { "epoch": 1.6349583828775267, "grad_norm": 6.071746349334717, "learning_rate": 4.591260404280618e-05, "loss": 3.3886, "step": 44000 }, { "epoch": 1.653537455410226, "grad_norm": 6.017687797546387, "learning_rate": 4.586615636147444e-05, "loss": 3.3987, "step": 44500 }, { "epoch": 1.672116527942925, "grad_norm": 6.2989349365234375, "learning_rate": 4.581970868014269e-05, "loss": 3.3987, "step": 45000 }, { "epoch": 1.6906956004756242, "grad_norm": 5.678338527679443, "learning_rate": 4.577326099881094e-05, "loss": 3.3755, "step": 45500 }, { "epoch": 1.7092746730083235, "grad_norm": 6.020495891571045, "learning_rate": 4.57268133174792e-05, "loss": 3.3591, "step": 46000 }, { "epoch": 1.7278537455410226, "grad_norm": 5.941638946533203, "learning_rate": 4.568036563614744e-05, "loss": 3.3574, "step": 46500 }, { "epoch": 1.7464328180737216, "grad_norm": 6.722168922424316, "learning_rate": 4.56339179548157e-05, "loss": 3.3746, "step": 47000 }, { "epoch": 1.765011890606421, "grad_norm": 6.632647514343262, "learning_rate": 4.558747027348395e-05, "loss": 3.3535, "step": 47500 }, { "epoch": 1.7835909631391202, "grad_norm": 6.448876857757568, "learning_rate": 4.55410225921522e-05, "loss": 3.3581, "step": 48000 }, { "epoch": 1.802170035671819, "grad_norm": 5.348858833312988, "learning_rate": 4.5494574910820456e-05, "loss": 3.3432, "step": 48500 }, { "epoch": 1.8207491082045184, "grad_norm": 6.1672186851501465, "learning_rate": 4.5448127229488707e-05, "loss": 3.3422, "step": 49000 }, { "epoch": 1.8393281807372177, "grad_norm": 5.889304161071777, "learning_rate": 4.540167954815696e-05, "loss": 3.3224, "step": 49500 }, { "epoch": 1.8579072532699168, "grad_norm": 6.291742324829102, "learning_rate": 4.535523186682521e-05, "loss": 3.3133, "step": 50000 }, { "epoch": 1.8764863258026159, "grad_norm": 5.806668758392334, "learning_rate": 4.5308784185493465e-05, "loss": 3.3243, "step": 50500 }, { "epoch": 1.8950653983353152, "grad_norm": 6.051152229309082, "learning_rate": 4.5262336504161715e-05, "loss": 3.2947, "step": 51000 }, { "epoch": 1.9136444708680143, "grad_norm": 6.568633079528809, "learning_rate": 4.5215888822829966e-05, "loss": 3.2911, "step": 51500 }, { "epoch": 1.9322235434007133, "grad_norm": 6.369818210601807, "learning_rate": 4.516944114149822e-05, "loss": 3.2676, "step": 52000 }, { "epoch": 1.9508026159334126, "grad_norm": 5.4059014320373535, "learning_rate": 4.512299346016647e-05, "loss": 3.2633, "step": 52500 }, { "epoch": 1.9693816884661117, "grad_norm": 5.883274078369141, "learning_rate": 4.5076545778834724e-05, "loss": 3.2759, "step": 53000 }, { "epoch": 1.9879607609988108, "grad_norm": 6.672674179077148, "learning_rate": 4.5030098097502975e-05, "loss": 3.2794, "step": 53500 }, { "epoch": 2.00653983353151, "grad_norm": 6.007123947143555, "learning_rate": 4.4983650416171225e-05, "loss": 3.2476, "step": 54000 }, { "epoch": 2.0251189060642094, "grad_norm": 5.490503787994385, "learning_rate": 4.4937202734839475e-05, "loss": 3.2269, "step": 54500 }, { "epoch": 2.0436979785969083, "grad_norm": 6.020650386810303, "learning_rate": 4.489075505350773e-05, "loss": 3.2368, "step": 55000 }, { "epoch": 2.0622770511296076, "grad_norm": 6.575464248657227, "learning_rate": 4.484430737217598e-05, "loss": 3.1994, "step": 55500 }, { "epoch": 2.080856123662307, "grad_norm": 5.844559192657471, "learning_rate": 4.4797859690844234e-05, "loss": 3.1938, "step": 56000 }, { "epoch": 2.0994351961950057, "grad_norm": 5.892060279846191, "learning_rate": 4.475141200951249e-05, "loss": 3.205, "step": 56500 }, { "epoch": 2.118014268727705, "grad_norm": 5.993409156799316, "learning_rate": 4.4704964328180735e-05, "loss": 3.212, "step": 57000 }, { "epoch": 2.1365933412604043, "grad_norm": 6.564383029937744, "learning_rate": 4.465851664684899e-05, "loss": 3.1842, "step": 57500 }, { "epoch": 2.1551724137931036, "grad_norm": 5.989982604980469, "learning_rate": 4.461206896551724e-05, "loss": 3.1859, "step": 58000 }, { "epoch": 2.1737514863258025, "grad_norm": 5.895771503448486, "learning_rate": 4.456562128418549e-05, "loss": 3.1839, "step": 58500 }, { "epoch": 2.192330558858502, "grad_norm": 5.832091331481934, "learning_rate": 4.451917360285375e-05, "loss": 3.1728, "step": 59000 }, { "epoch": 2.210909631391201, "grad_norm": 5.480752468109131, "learning_rate": 4.4472725921522e-05, "loss": 3.1711, "step": 59500 }, { "epoch": 2.2294887039239, "grad_norm": 5.683358192443848, "learning_rate": 4.442627824019025e-05, "loss": 3.1669, "step": 60000 }, { "epoch": 2.2480677764565993, "grad_norm": 6.638919830322266, "learning_rate": 4.43798305588585e-05, "loss": 3.1677, "step": 60500 }, { "epoch": 2.2666468489892986, "grad_norm": 5.941629886627197, "learning_rate": 4.433338287752676e-05, "loss": 3.1506, "step": 61000 }, { "epoch": 2.2852259215219974, "grad_norm": 6.227372169494629, "learning_rate": 4.428693519619501e-05, "loss": 3.1607, "step": 61500 }, { "epoch": 2.3038049940546967, "grad_norm": 6.063544750213623, "learning_rate": 4.424048751486326e-05, "loss": 3.1477, "step": 62000 }, { "epoch": 2.322384066587396, "grad_norm": 5.8914618492126465, "learning_rate": 4.419403983353152e-05, "loss": 3.1362, "step": 62500 }, { "epoch": 2.3409631391200953, "grad_norm": 5.964859962463379, "learning_rate": 4.414759215219976e-05, "loss": 3.1282, "step": 63000 }, { "epoch": 2.359542211652794, "grad_norm": 5.622297763824463, "learning_rate": 4.410114447086802e-05, "loss": 3.1386, "step": 63500 }, { "epoch": 2.3781212841854935, "grad_norm": 6.728824138641357, "learning_rate": 4.405469678953627e-05, "loss": 3.1202, "step": 64000 }, { "epoch": 2.396700356718193, "grad_norm": 6.513198375701904, "learning_rate": 4.400824910820452e-05, "loss": 3.1455, "step": 64500 }, { "epoch": 2.4152794292508917, "grad_norm": 6.273243427276611, "learning_rate": 4.396180142687277e-05, "loss": 3.1143, "step": 65000 }, { "epoch": 2.433858501783591, "grad_norm": 5.384542465209961, "learning_rate": 4.391535374554103e-05, "loss": 3.1111, "step": 65500 }, { "epoch": 2.4524375743162903, "grad_norm": 5.742457866668701, "learning_rate": 4.386890606420928e-05, "loss": 3.1146, "step": 66000 }, { "epoch": 2.471016646848989, "grad_norm": 6.236721038818359, "learning_rate": 4.382245838287753e-05, "loss": 3.111, "step": 66500 }, { "epoch": 2.4895957193816884, "grad_norm": 6.027072429656982, "learning_rate": 4.3776010701545785e-05, "loss": 3.088, "step": 67000 }, { "epoch": 2.5081747919143877, "grad_norm": 6.66511869430542, "learning_rate": 4.372956302021403e-05, "loss": 3.1021, "step": 67500 }, { "epoch": 2.5267538644470866, "grad_norm": 5.52970552444458, "learning_rate": 4.3683115338882286e-05, "loss": 3.1001, "step": 68000 }, { "epoch": 2.545332936979786, "grad_norm": 6.7701897621154785, "learning_rate": 4.363666765755054e-05, "loss": 3.0905, "step": 68500 }, { "epoch": 2.563912009512485, "grad_norm": 5.972938537597656, "learning_rate": 4.359021997621879e-05, "loss": 3.0665, "step": 69000 }, { "epoch": 2.582491082045184, "grad_norm": 6.33815860748291, "learning_rate": 4.3543772294887044e-05, "loss": 3.0703, "step": 69500 }, { "epoch": 2.6010701545778834, "grad_norm": 5.83467435836792, "learning_rate": 4.3497324613555295e-05, "loss": 3.0804, "step": 70000 }, { "epoch": 2.6196492271105827, "grad_norm": 6.139744758605957, "learning_rate": 4.3450876932223545e-05, "loss": 3.0668, "step": 70500 }, { "epoch": 2.6382282996432815, "grad_norm": 7.028213977813721, "learning_rate": 4.3404429250891796e-05, "loss": 3.0549, "step": 71000 }, { "epoch": 2.656807372175981, "grad_norm": 5.353559970855713, "learning_rate": 4.335798156956005e-05, "loss": 3.0684, "step": 71500 }, { "epoch": 2.67538644470868, "grad_norm": 6.900554656982422, "learning_rate": 4.3311533888228304e-05, "loss": 3.035, "step": 72000 }, { "epoch": 2.6939655172413794, "grad_norm": 6.68520450592041, "learning_rate": 4.3265086206896554e-05, "loss": 3.0307, "step": 72500 }, { "epoch": 2.7125445897740788, "grad_norm": 6.080930233001709, "learning_rate": 4.321863852556481e-05, "loss": 3.0379, "step": 73000 }, { "epoch": 2.7311236623067776, "grad_norm": 5.922386646270752, "learning_rate": 4.3172190844233055e-05, "loss": 3.0393, "step": 73500 }, { "epoch": 2.749702734839477, "grad_norm": 6.372087001800537, "learning_rate": 4.312574316290131e-05, "loss": 3.0243, "step": 74000 }, { "epoch": 2.768281807372176, "grad_norm": 6.071821689605713, "learning_rate": 4.307929548156956e-05, "loss": 3.0283, "step": 74500 }, { "epoch": 2.786860879904875, "grad_norm": 6.012415409088135, "learning_rate": 4.3032847800237813e-05, "loss": 3.025, "step": 75000 }, { "epoch": 2.8054399524375744, "grad_norm": 6.770437717437744, "learning_rate": 4.2986400118906064e-05, "loss": 3.0242, "step": 75500 }, { "epoch": 2.8240190249702737, "grad_norm": 6.748111724853516, "learning_rate": 4.2939952437574314e-05, "loss": 3.016, "step": 76000 }, { "epoch": 2.8425980975029725, "grad_norm": 6.000352382659912, "learning_rate": 4.289350475624257e-05, "loss": 3.0208, "step": 76500 }, { "epoch": 2.861177170035672, "grad_norm": 6.079233646392822, "learning_rate": 4.284705707491082e-05, "loss": 3.0062, "step": 77000 }, { "epoch": 2.879756242568371, "grad_norm": 5.8158040046691895, "learning_rate": 4.280060939357907e-05, "loss": 3.0162, "step": 77500 }, { "epoch": 2.89833531510107, "grad_norm": 7.081645965576172, "learning_rate": 4.275416171224732e-05, "loss": 2.9933, "step": 78000 }, { "epoch": 2.9169143876337693, "grad_norm": 7.042798042297363, "learning_rate": 4.270771403091558e-05, "loss": 3.0055, "step": 78500 }, { "epoch": 2.9354934601664686, "grad_norm": 6.736599445343018, "learning_rate": 4.266126634958383e-05, "loss": 2.9964, "step": 79000 }, { "epoch": 2.9540725326991675, "grad_norm": 6.28444242477417, "learning_rate": 4.261481866825208e-05, "loss": 2.9943, "step": 79500 }, { "epoch": 2.972651605231867, "grad_norm": 6.56734561920166, "learning_rate": 4.256837098692034e-05, "loss": 2.9807, "step": 80000 }, { "epoch": 2.991230677764566, "grad_norm": 6.312285423278809, "learning_rate": 4.252192330558858e-05, "loss": 2.9975, "step": 80500 }, { "epoch": 3.009809750297265, "grad_norm": 5.604495048522949, "learning_rate": 4.247547562425684e-05, "loss": 2.964, "step": 81000 }, { "epoch": 3.0283888228299642, "grad_norm": 6.2611083984375, "learning_rate": 4.242902794292509e-05, "loss": 2.9411, "step": 81500 }, { "epoch": 3.0469678953626635, "grad_norm": 6.149163246154785, "learning_rate": 4.238258026159334e-05, "loss": 2.9448, "step": 82000 }, { "epoch": 3.065546967895363, "grad_norm": 6.137192249298096, "learning_rate": 4.23361325802616e-05, "loss": 2.9208, "step": 82500 }, { "epoch": 3.0841260404280617, "grad_norm": 5.697031497955322, "learning_rate": 4.228968489892985e-05, "loss": 2.9352, "step": 83000 }, { "epoch": 3.102705112960761, "grad_norm": 6.298037528991699, "learning_rate": 4.22432372175981e-05, "loss": 2.9318, "step": 83500 }, { "epoch": 3.1212841854934603, "grad_norm": 6.293707370758057, "learning_rate": 4.219678953626635e-05, "loss": 2.9484, "step": 84000 }, { "epoch": 3.139863258026159, "grad_norm": 6.098722457885742, "learning_rate": 4.215034185493461e-05, "loss": 2.923, "step": 84500 }, { "epoch": 3.1584423305588585, "grad_norm": 5.604320526123047, "learning_rate": 4.210389417360285e-05, "loss": 2.932, "step": 85000 }, { "epoch": 3.177021403091558, "grad_norm": 6.741579055786133, "learning_rate": 4.205744649227111e-05, "loss": 2.911, "step": 85500 }, { "epoch": 3.1956004756242566, "grad_norm": 6.246683120727539, "learning_rate": 4.201099881093936e-05, "loss": 2.9139, "step": 86000 }, { "epoch": 3.214179548156956, "grad_norm": 6.600460052490234, "learning_rate": 4.196455112960761e-05, "loss": 2.9329, "step": 86500 }, { "epoch": 3.2327586206896552, "grad_norm": 6.846024990081787, "learning_rate": 4.1918103448275866e-05, "loss": 2.9189, "step": 87000 }, { "epoch": 3.2513376932223546, "grad_norm": 6.301860332489014, "learning_rate": 4.1871655766944116e-05, "loss": 2.9191, "step": 87500 }, { "epoch": 3.2699167657550534, "grad_norm": 5.542537689208984, "learning_rate": 4.182520808561237e-05, "loss": 2.8991, "step": 88000 }, { "epoch": 3.2884958382877527, "grad_norm": 6.527828216552734, "learning_rate": 4.177876040428062e-05, "loss": 2.8959, "step": 88500 }, { "epoch": 3.307074910820452, "grad_norm": 6.696499824523926, "learning_rate": 4.1732312722948875e-05, "loss": 2.904, "step": 89000 }, { "epoch": 3.325653983353151, "grad_norm": 6.901641368865967, "learning_rate": 4.1685865041617125e-05, "loss": 2.8918, "step": 89500 }, { "epoch": 3.34423305588585, "grad_norm": 5.950034141540527, "learning_rate": 4.1639417360285376e-05, "loss": 2.8953, "step": 90000 }, { "epoch": 3.3628121284185495, "grad_norm": 6.489218235015869, "learning_rate": 4.159296967895363e-05, "loss": 2.8983, "step": 90500 }, { "epoch": 3.3813912009512483, "grad_norm": 7.06480073928833, "learning_rate": 4.154652199762188e-05, "loss": 2.9144, "step": 91000 }, { "epoch": 3.3999702734839476, "grad_norm": 6.297793388366699, "learning_rate": 4.1500074316290134e-05, "loss": 2.8853, "step": 91500 }, { "epoch": 3.418549346016647, "grad_norm": 6.4150919914245605, "learning_rate": 4.1453626634958384e-05, "loss": 2.8906, "step": 92000 }, { "epoch": 3.437128418549346, "grad_norm": 7.301102638244629, "learning_rate": 4.1407178953626635e-05, "loss": 2.8706, "step": 92500 }, { "epoch": 3.455707491082045, "grad_norm": 6.061220645904541, "learning_rate": 4.136073127229489e-05, "loss": 2.8732, "step": 93000 }, { "epoch": 3.4742865636147444, "grad_norm": 6.419704914093018, "learning_rate": 4.131428359096314e-05, "loss": 2.861, "step": 93500 }, { "epoch": 3.4928656361474433, "grad_norm": 7.272397994995117, "learning_rate": 4.126783590963139e-05, "loss": 2.8942, "step": 94000 }, { "epoch": 3.5114447086801426, "grad_norm": 6.250875949859619, "learning_rate": 4.1221388228299644e-05, "loss": 2.8639, "step": 94500 }, { "epoch": 3.530023781212842, "grad_norm": 6.624760150909424, "learning_rate": 4.11749405469679e-05, "loss": 2.8706, "step": 95000 }, { "epoch": 3.548602853745541, "grad_norm": 7.242002487182617, "learning_rate": 4.1128492865636145e-05, "loss": 2.8549, "step": 95500 }, { "epoch": 3.56718192627824, "grad_norm": 6.070115089416504, "learning_rate": 4.10820451843044e-05, "loss": 2.8564, "step": 96000 }, { "epoch": 3.5857609988109393, "grad_norm": 6.022694110870361, "learning_rate": 4.103559750297266e-05, "loss": 2.8637, "step": 96500 }, { "epoch": 3.6043400713436387, "grad_norm": 5.543400287628174, "learning_rate": 4.09891498216409e-05, "loss": 2.8219, "step": 97000 }, { "epoch": 3.622919143876338, "grad_norm": 6.441455841064453, "learning_rate": 4.094270214030916e-05, "loss": 2.844, "step": 97500 }, { "epoch": 3.641498216409037, "grad_norm": 6.443978786468506, "learning_rate": 4.089625445897741e-05, "loss": 2.8337, "step": 98000 }, { "epoch": 3.660077288941736, "grad_norm": 7.063666820526123, "learning_rate": 4.084980677764566e-05, "loss": 2.869, "step": 98500 }, { "epoch": 3.6786563614744354, "grad_norm": 6.34807825088501, "learning_rate": 4.080335909631391e-05, "loss": 2.8303, "step": 99000 }, { "epoch": 3.6972354340071343, "grad_norm": 6.020463466644287, "learning_rate": 4.075691141498217e-05, "loss": 2.8442, "step": 99500 }, { "epoch": 3.7158145065398336, "grad_norm": 6.808725357055664, "learning_rate": 4.071046373365042e-05, "loss": 2.8432, "step": 100000 }, { "epoch": 3.734393579072533, "grad_norm": 6.207636833190918, "learning_rate": 4.066401605231867e-05, "loss": 2.859, "step": 100500 }, { "epoch": 3.7529726516052317, "grad_norm": 6.0236616134643555, "learning_rate": 4.061756837098693e-05, "loss": 2.847, "step": 101000 }, { "epoch": 3.771551724137931, "grad_norm": 5.8015241622924805, "learning_rate": 4.057112068965517e-05, "loss": 2.8239, "step": 101500 }, { "epoch": 3.7901307966706304, "grad_norm": 6.354222297668457, "learning_rate": 4.052467300832343e-05, "loss": 2.8574, "step": 102000 }, { "epoch": 3.808709869203329, "grad_norm": 6.587215900421143, "learning_rate": 4.047822532699168e-05, "loss": 2.8354, "step": 102500 }, { "epoch": 3.8272889417360285, "grad_norm": 7.283754825592041, "learning_rate": 4.043177764565993e-05, "loss": 2.8218, "step": 103000 }, { "epoch": 3.845868014268728, "grad_norm": 6.165238857269287, "learning_rate": 4.0385329964328186e-05, "loss": 2.841, "step": 103500 }, { "epoch": 3.8644470868014267, "grad_norm": 6.120512008666992, "learning_rate": 4.033888228299644e-05, "loss": 2.8195, "step": 104000 }, { "epoch": 3.883026159334126, "grad_norm": 6.8183698654174805, "learning_rate": 4.029243460166469e-05, "loss": 2.8044, "step": 104500 }, { "epoch": 3.9016052318668253, "grad_norm": 5.847311973571777, "learning_rate": 4.024598692033294e-05, "loss": 2.8056, "step": 105000 }, { "epoch": 3.920184304399524, "grad_norm": 7.423314571380615, "learning_rate": 4.019953923900119e-05, "loss": 2.8164, "step": 105500 }, { "epoch": 3.9387633769322234, "grad_norm": 6.736742973327637, "learning_rate": 4.015309155766944e-05, "loss": 2.8099, "step": 106000 }, { "epoch": 3.9573424494649228, "grad_norm": 6.3364644050598145, "learning_rate": 4.0106643876337696e-05, "loss": 2.8138, "step": 106500 }, { "epoch": 3.9759215219976216, "grad_norm": 6.03089714050293, "learning_rate": 4.006019619500595e-05, "loss": 2.8161, "step": 107000 }, { "epoch": 3.994500594530321, "grad_norm": 7.099618911743164, "learning_rate": 4.00137485136742e-05, "loss": 2.7899, "step": 107500 }, { "epoch": 4.01307966706302, "grad_norm": 6.682999134063721, "learning_rate": 3.9967300832342454e-05, "loss": 2.7953, "step": 108000 }, { "epoch": 4.031658739595719, "grad_norm": 5.30817985534668, "learning_rate": 3.99208531510107e-05, "loss": 2.7662, "step": 108500 }, { "epoch": 4.050237812128419, "grad_norm": 6.152495384216309, "learning_rate": 3.9874405469678955e-05, "loss": 2.7628, "step": 109000 }, { "epoch": 4.068816884661118, "grad_norm": 6.075979232788086, "learning_rate": 3.9827957788347206e-05, "loss": 2.7805, "step": 109500 }, { "epoch": 4.0873959571938165, "grad_norm": 6.708266258239746, "learning_rate": 3.9781510107015456e-05, "loss": 2.7607, "step": 110000 }, { "epoch": 4.105975029726516, "grad_norm": 6.425528049468994, "learning_rate": 3.9735062425683714e-05, "loss": 2.7738, "step": 110500 }, { "epoch": 4.124554102259215, "grad_norm": 6.978008270263672, "learning_rate": 3.9688614744351964e-05, "loss": 2.7654, "step": 111000 }, { "epoch": 4.143133174791914, "grad_norm": 6.780577182769775, "learning_rate": 3.9642167063020215e-05, "loss": 2.7632, "step": 111500 }, { "epoch": 4.161712247324614, "grad_norm": 5.834601879119873, "learning_rate": 3.9595719381688465e-05, "loss": 2.7671, "step": 112000 }, { "epoch": 4.180291319857313, "grad_norm": 7.513933181762695, "learning_rate": 3.954927170035672e-05, "loss": 2.7738, "step": 112500 }, { "epoch": 4.1988703923900115, "grad_norm": 6.303833484649658, "learning_rate": 3.950282401902497e-05, "loss": 2.782, "step": 113000 }, { "epoch": 4.217449464922711, "grad_norm": 5.807947158813477, "learning_rate": 3.945637633769322e-05, "loss": 2.7434, "step": 113500 }, { "epoch": 4.23602853745541, "grad_norm": 6.926473617553711, "learning_rate": 3.940992865636148e-05, "loss": 2.7458, "step": 114000 }, { "epoch": 4.25460760998811, "grad_norm": 6.764691352844238, "learning_rate": 3.9363480975029724e-05, "loss": 2.7357, "step": 114500 }, { "epoch": 4.273186682520809, "grad_norm": 5.976272106170654, "learning_rate": 3.931703329369798e-05, "loss": 2.7473, "step": 115000 }, { "epoch": 4.2917657550535075, "grad_norm": 6.0660810470581055, "learning_rate": 3.927058561236623e-05, "loss": 2.7387, "step": 115500 }, { "epoch": 4.310344827586207, "grad_norm": 6.600549221038818, "learning_rate": 3.922413793103448e-05, "loss": 2.7409, "step": 116000 }, { "epoch": 4.328923900118906, "grad_norm": 7.705731391906738, "learning_rate": 3.917769024970274e-05, "loss": 2.7408, "step": 116500 }, { "epoch": 4.347502972651605, "grad_norm": 6.347229957580566, "learning_rate": 3.913124256837099e-05, "loss": 2.7517, "step": 117000 }, { "epoch": 4.366082045184305, "grad_norm": 7.695369243621826, "learning_rate": 3.908479488703924e-05, "loss": 2.7443, "step": 117500 }, { "epoch": 4.384661117717004, "grad_norm": 6.612791538238525, "learning_rate": 3.903834720570749e-05, "loss": 2.7378, "step": 118000 }, { "epoch": 4.4032401902497025, "grad_norm": 6.125636577606201, "learning_rate": 3.899189952437575e-05, "loss": 2.7224, "step": 118500 }, { "epoch": 4.421819262782402, "grad_norm": 6.215822696685791, "learning_rate": 3.894545184304399e-05, "loss": 2.7311, "step": 119000 }, { "epoch": 4.440398335315101, "grad_norm": 6.436295032501221, "learning_rate": 3.889900416171225e-05, "loss": 2.722, "step": 119500 }, { "epoch": 4.4589774078478, "grad_norm": 6.271787166595459, "learning_rate": 3.88525564803805e-05, "loss": 2.7421, "step": 120000 }, { "epoch": 4.4775564803805, "grad_norm": 5.990880012512207, "learning_rate": 3.880610879904875e-05, "loss": 2.7284, "step": 120500 }, { "epoch": 4.4961355529131986, "grad_norm": 6.567028999328613, "learning_rate": 3.875966111771701e-05, "loss": 2.7244, "step": 121000 }, { "epoch": 4.514714625445897, "grad_norm": 6.399959087371826, "learning_rate": 3.871321343638526e-05, "loss": 2.7139, "step": 121500 }, { "epoch": 4.533293697978597, "grad_norm": 6.813540458679199, "learning_rate": 3.866676575505351e-05, "loss": 2.7177, "step": 122000 }, { "epoch": 4.551872770511296, "grad_norm": 6.263701438903809, "learning_rate": 3.862031807372176e-05, "loss": 2.7245, "step": 122500 }, { "epoch": 4.570451843043995, "grad_norm": 6.281601428985596, "learning_rate": 3.8573870392390017e-05, "loss": 2.728, "step": 123000 }, { "epoch": 4.589030915576695, "grad_norm": 6.198410511016846, "learning_rate": 3.852742271105827e-05, "loss": 2.7187, "step": 123500 }, { "epoch": 4.6076099881093935, "grad_norm": 7.052980899810791, "learning_rate": 3.848097502972652e-05, "loss": 2.7294, "step": 124000 }, { "epoch": 4.626189060642092, "grad_norm": 6.8353776931762695, "learning_rate": 3.8434527348394775e-05, "loss": 2.7366, "step": 124500 }, { "epoch": 4.644768133174792, "grad_norm": 6.245896816253662, "learning_rate": 3.838807966706302e-05, "loss": 2.7061, "step": 125000 }, { "epoch": 4.663347205707491, "grad_norm": 5.742074489593506, "learning_rate": 3.8341631985731276e-05, "loss": 2.7031, "step": 125500 }, { "epoch": 4.681926278240191, "grad_norm": 6.55544376373291, "learning_rate": 3.8295184304399526e-05, "loss": 2.6768, "step": 126000 }, { "epoch": 4.70050535077289, "grad_norm": 5.943970203399658, "learning_rate": 3.824873662306778e-05, "loss": 2.6986, "step": 126500 }, { "epoch": 4.719084423305588, "grad_norm": 7.413682460784912, "learning_rate": 3.8202288941736034e-05, "loss": 2.7229, "step": 127000 }, { "epoch": 4.737663495838287, "grad_norm": 7.258702278137207, "learning_rate": 3.8155841260404285e-05, "loss": 2.7006, "step": 127500 }, { "epoch": 4.756242568370987, "grad_norm": 6.239523887634277, "learning_rate": 3.8109393579072535e-05, "loss": 2.7015, "step": 128000 }, { "epoch": 4.774821640903686, "grad_norm": 6.031528949737549, "learning_rate": 3.8062945897740786e-05, "loss": 2.6794, "step": 128500 }, { "epoch": 4.793400713436386, "grad_norm": 6.504217624664307, "learning_rate": 3.801649821640904e-05, "loss": 2.6756, "step": 129000 }, { "epoch": 4.8119797859690845, "grad_norm": 6.261529922485352, "learning_rate": 3.7970050535077287e-05, "loss": 2.6825, "step": 129500 }, { "epoch": 4.830558858501783, "grad_norm": 5.9492292404174805, "learning_rate": 3.7923602853745544e-05, "loss": 2.6894, "step": 130000 }, { "epoch": 4.849137931034483, "grad_norm": 5.7504706382751465, "learning_rate": 3.7877155172413794e-05, "loss": 2.693, "step": 130500 }, { "epoch": 4.867717003567182, "grad_norm": 6.545624256134033, "learning_rate": 3.7830707491082045e-05, "loss": 2.6888, "step": 131000 }, { "epoch": 4.886296076099881, "grad_norm": 6.274423599243164, "learning_rate": 3.77842598097503e-05, "loss": 2.6884, "step": 131500 }, { "epoch": 4.904875148632581, "grad_norm": 5.632358074188232, "learning_rate": 3.773781212841855e-05, "loss": 2.678, "step": 132000 }, { "epoch": 4.923454221165279, "grad_norm": 6.883337020874023, "learning_rate": 3.76913644470868e-05, "loss": 2.677, "step": 132500 }, { "epoch": 4.942033293697978, "grad_norm": 6.676144123077393, "learning_rate": 3.7644916765755054e-05, "loss": 2.6788, "step": 133000 }, { "epoch": 4.960612366230678, "grad_norm": 8.354021072387695, "learning_rate": 3.759846908442331e-05, "loss": 2.6885, "step": 133500 }, { "epoch": 4.979191438763377, "grad_norm": 6.048637866973877, "learning_rate": 3.755202140309156e-05, "loss": 2.6636, "step": 134000 }, { "epoch": 4.997770511296076, "grad_norm": 5.709485054016113, "learning_rate": 3.750557372175981e-05, "loss": 2.6601, "step": 134500 }, { "epoch": 5.0163495838287755, "grad_norm": 7.082876682281494, "learning_rate": 3.745912604042807e-05, "loss": 2.6477, "step": 135000 }, { "epoch": 5.034928656361474, "grad_norm": 6.736342430114746, "learning_rate": 3.741267835909631e-05, "loss": 2.6357, "step": 135500 }, { "epoch": 5.053507728894173, "grad_norm": 6.7299580574035645, "learning_rate": 3.736623067776457e-05, "loss": 2.6532, "step": 136000 }, { "epoch": 5.072086801426873, "grad_norm": 6.488595008850098, "learning_rate": 3.731978299643282e-05, "loss": 2.6478, "step": 136500 }, { "epoch": 5.090665873959572, "grad_norm": 6.1401262283325195, "learning_rate": 3.727333531510107e-05, "loss": 2.6271, "step": 137000 }, { "epoch": 5.109244946492271, "grad_norm": 6.6415300369262695, "learning_rate": 3.722688763376933e-05, "loss": 2.6347, "step": 137500 }, { "epoch": 5.12782401902497, "grad_norm": 6.715450286865234, "learning_rate": 3.718043995243757e-05, "loss": 2.6377, "step": 138000 }, { "epoch": 5.146403091557669, "grad_norm": 6.399317741394043, "learning_rate": 3.713399227110583e-05, "loss": 2.6348, "step": 138500 }, { "epoch": 5.164982164090369, "grad_norm": 7.233635902404785, "learning_rate": 3.708754458977408e-05, "loss": 2.6411, "step": 139000 }, { "epoch": 5.183561236623068, "grad_norm": 6.6088433265686035, "learning_rate": 3.704109690844233e-05, "loss": 2.62, "step": 139500 }, { "epoch": 5.202140309155767, "grad_norm": 6.2975287437438965, "learning_rate": 3.699464922711058e-05, "loss": 2.6337, "step": 140000 }, { "epoch": 5.2207193816884665, "grad_norm": 5.73189115524292, "learning_rate": 3.694820154577884e-05, "loss": 2.6347, "step": 140500 }, { "epoch": 5.239298454221165, "grad_norm": 6.38447904586792, "learning_rate": 3.690175386444709e-05, "loss": 2.6338, "step": 141000 }, { "epoch": 5.257877526753864, "grad_norm": 6.772334098815918, "learning_rate": 3.685530618311534e-05, "loss": 2.6371, "step": 141500 }, { "epoch": 5.276456599286564, "grad_norm": 6.404881477355957, "learning_rate": 3.6808858501783596e-05, "loss": 2.6223, "step": 142000 }, { "epoch": 5.295035671819263, "grad_norm": 6.889057159423828, "learning_rate": 3.676241082045184e-05, "loss": 2.624, "step": 142500 }, { "epoch": 5.313614744351962, "grad_norm": 6.435732841491699, "learning_rate": 3.67159631391201e-05, "loss": 2.6408, "step": 143000 }, { "epoch": 5.332193816884661, "grad_norm": 6.9687418937683105, "learning_rate": 3.666951545778835e-05, "loss": 2.6239, "step": 143500 }, { "epoch": 5.35077288941736, "grad_norm": 6.787994861602783, "learning_rate": 3.66230677764566e-05, "loss": 2.6285, "step": 144000 }, { "epoch": 5.369351961950059, "grad_norm": 6.9550700187683105, "learning_rate": 3.6576620095124856e-05, "loss": 2.6173, "step": 144500 }, { "epoch": 5.387931034482759, "grad_norm": 6.4159345626831055, "learning_rate": 3.6530172413793106e-05, "loss": 2.6186, "step": 145000 }, { "epoch": 5.406510107015458, "grad_norm": 6.8777995109558105, "learning_rate": 3.6483724732461357e-05, "loss": 2.6239, "step": 145500 }, { "epoch": 5.425089179548157, "grad_norm": 6.115660667419434, "learning_rate": 3.643727705112961e-05, "loss": 2.6243, "step": 146000 }, { "epoch": 5.443668252080856, "grad_norm": 7.484089374542236, "learning_rate": 3.6390829369797864e-05, "loss": 2.6211, "step": 146500 }, { "epoch": 5.462247324613555, "grad_norm": 6.411886692047119, "learning_rate": 3.6344381688466115e-05, "loss": 2.61, "step": 147000 }, { "epoch": 5.480826397146254, "grad_norm": 6.482817649841309, "learning_rate": 3.6297934007134365e-05, "loss": 2.5962, "step": 147500 }, { "epoch": 5.499405469678954, "grad_norm": 5.861370086669922, "learning_rate": 3.625148632580262e-05, "loss": 2.6081, "step": 148000 }, { "epoch": 5.517984542211653, "grad_norm": 7.179725170135498, "learning_rate": 3.6205038644470866e-05, "loss": 2.6138, "step": 148500 }, { "epoch": 5.536563614744352, "grad_norm": 6.607731819152832, "learning_rate": 3.6158590963139124e-05, "loss": 2.6335, "step": 149000 }, { "epoch": 5.555142687277051, "grad_norm": 7.58914041519165, "learning_rate": 3.6112143281807374e-05, "loss": 2.6188, "step": 149500 }, { "epoch": 5.57372175980975, "grad_norm": 6.815672397613525, "learning_rate": 3.6065695600475625e-05, "loss": 2.5999, "step": 150000 }, { "epoch": 5.592300832342449, "grad_norm": 7.304187297821045, "learning_rate": 3.6019247919143875e-05, "loss": 2.61, "step": 150500 }, { "epoch": 5.610879904875149, "grad_norm": 6.256832599639893, "learning_rate": 3.597280023781213e-05, "loss": 2.5896, "step": 151000 }, { "epoch": 5.629458977407848, "grad_norm": 6.603561878204346, "learning_rate": 3.592635255648038e-05, "loss": 2.5937, "step": 151500 }, { "epoch": 5.648038049940547, "grad_norm": 6.757023334503174, "learning_rate": 3.587990487514863e-05, "loss": 2.6102, "step": 152000 }, { "epoch": 5.666617122473246, "grad_norm": 6.520168304443359, "learning_rate": 3.583345719381689e-05, "loss": 2.5915, "step": 152500 }, { "epoch": 5.685196195005945, "grad_norm": 6.486233234405518, "learning_rate": 3.5787009512485134e-05, "loss": 2.5833, "step": 153000 }, { "epoch": 5.703775267538645, "grad_norm": 5.79095458984375, "learning_rate": 3.574056183115339e-05, "loss": 2.5862, "step": 153500 }, { "epoch": 5.722354340071344, "grad_norm": 6.390963077545166, "learning_rate": 3.569411414982164e-05, "loss": 2.5867, "step": 154000 }, { "epoch": 5.7409334126040426, "grad_norm": 6.4793548583984375, "learning_rate": 3.564766646848989e-05, "loss": 2.5983, "step": 154500 }, { "epoch": 5.759512485136742, "grad_norm": 6.3781585693359375, "learning_rate": 3.560121878715815e-05, "loss": 2.6013, "step": 155000 }, { "epoch": 5.778091557669441, "grad_norm": 6.004998207092285, "learning_rate": 3.55547711058264e-05, "loss": 2.5908, "step": 155500 }, { "epoch": 5.79667063020214, "grad_norm": 7.406827926635742, "learning_rate": 3.550832342449465e-05, "loss": 2.6126, "step": 156000 }, { "epoch": 5.81524970273484, "grad_norm": 6.263004302978516, "learning_rate": 3.54618757431629e-05, "loss": 2.5957, "step": 156500 }, { "epoch": 5.833828775267539, "grad_norm": 6.236379623413086, "learning_rate": 3.541542806183116e-05, "loss": 2.5819, "step": 157000 }, { "epoch": 5.8524078478002375, "grad_norm": 7.537994861602783, "learning_rate": 3.536898038049941e-05, "loss": 2.574, "step": 157500 }, { "epoch": 5.870986920332937, "grad_norm": 5.823127269744873, "learning_rate": 3.532253269916766e-05, "loss": 2.5702, "step": 158000 }, { "epoch": 5.889565992865636, "grad_norm": 5.820526123046875, "learning_rate": 3.527608501783592e-05, "loss": 2.5799, "step": 158500 }, { "epoch": 5.908145065398335, "grad_norm": 6.082313060760498, "learning_rate": 3.522963733650416e-05, "loss": 2.5835, "step": 159000 }, { "epoch": 5.926724137931035, "grad_norm": 6.335425853729248, "learning_rate": 3.518318965517242e-05, "loss": 2.5823, "step": 159500 }, { "epoch": 5.945303210463734, "grad_norm": 6.930739879608154, "learning_rate": 3.513674197384067e-05, "loss": 2.5829, "step": 160000 }, { "epoch": 5.963882282996433, "grad_norm": 6.215325832366943, "learning_rate": 3.509029429250892e-05, "loss": 2.5851, "step": 160500 }, { "epoch": 5.982461355529132, "grad_norm": 5.954530239105225, "learning_rate": 3.504384661117717e-05, "loss": 2.5843, "step": 161000 }, { "epoch": 6.001040428061831, "grad_norm": 6.572493076324463, "learning_rate": 3.4997398929845426e-05, "loss": 2.5786, "step": 161500 }, { "epoch": 6.01961950059453, "grad_norm": 6.725315093994141, "learning_rate": 3.495095124851368e-05, "loss": 2.5384, "step": 162000 }, { "epoch": 6.03819857312723, "grad_norm": 6.982800483703613, "learning_rate": 3.490450356718193e-05, "loss": 2.5501, "step": 162500 }, { "epoch": 6.0567776456599285, "grad_norm": 6.240696430206299, "learning_rate": 3.4858055885850185e-05, "loss": 2.5446, "step": 163000 }, { "epoch": 6.075356718192628, "grad_norm": 6.301703453063965, "learning_rate": 3.481160820451843e-05, "loss": 2.5471, "step": 163500 }, { "epoch": 6.093935790725327, "grad_norm": 7.478944778442383, "learning_rate": 3.4765160523186686e-05, "loss": 2.5473, "step": 164000 }, { "epoch": 6.112514863258026, "grad_norm": 6.435521125793457, "learning_rate": 3.4718712841854936e-05, "loss": 2.5417, "step": 164500 }, { "epoch": 6.131093935790726, "grad_norm": 7.630947589874268, "learning_rate": 3.467226516052319e-05, "loss": 2.5365, "step": 165000 }, { "epoch": 6.149673008323425, "grad_norm": 7.021152973175049, "learning_rate": 3.4625817479191444e-05, "loss": 2.5493, "step": 165500 }, { "epoch": 6.168252080856123, "grad_norm": 6.182572841644287, "learning_rate": 3.457936979785969e-05, "loss": 2.5435, "step": 166000 }, { "epoch": 6.186831153388823, "grad_norm": 7.1868767738342285, "learning_rate": 3.4532922116527945e-05, "loss": 2.5461, "step": 166500 }, { "epoch": 6.205410225921522, "grad_norm": 6.992275714874268, "learning_rate": 3.4486474435196195e-05, "loss": 2.5243, "step": 167000 }, { "epoch": 6.223989298454221, "grad_norm": 6.819701194763184, "learning_rate": 3.4440026753864446e-05, "loss": 2.533, "step": 167500 }, { "epoch": 6.242568370986921, "grad_norm": 7.018156051635742, "learning_rate": 3.43935790725327e-05, "loss": 2.5373, "step": 168000 }, { "epoch": 6.2611474435196195, "grad_norm": 6.9675188064575195, "learning_rate": 3.4347131391200954e-05, "loss": 2.5354, "step": 168500 }, { "epoch": 6.279726516052318, "grad_norm": 6.449595928192139, "learning_rate": 3.4300683709869204e-05, "loss": 2.5198, "step": 169000 }, { "epoch": 6.298305588585018, "grad_norm": 6.839005470275879, "learning_rate": 3.4254236028537455e-05, "loss": 2.529, "step": 169500 }, { "epoch": 6.316884661117717, "grad_norm": 6.704039096832275, "learning_rate": 3.420778834720571e-05, "loss": 2.5365, "step": 170000 }, { "epoch": 6.335463733650416, "grad_norm": 6.419273853302002, "learning_rate": 3.4161340665873956e-05, "loss": 2.5441, "step": 170500 }, { "epoch": 6.354042806183116, "grad_norm": 7.052849292755127, "learning_rate": 3.411489298454221e-05, "loss": 2.5277, "step": 171000 }, { "epoch": 6.372621878715814, "grad_norm": 7.161109447479248, "learning_rate": 3.4068445303210463e-05, "loss": 2.522, "step": 171500 }, { "epoch": 6.391200951248513, "grad_norm": 6.348012447357178, "learning_rate": 3.4021997621878714e-05, "loss": 2.524, "step": 172000 }, { "epoch": 6.409780023781213, "grad_norm": 6.336347579956055, "learning_rate": 3.397554994054697e-05, "loss": 2.5222, "step": 172500 }, { "epoch": 6.428359096313912, "grad_norm": 5.670961380004883, "learning_rate": 3.392910225921522e-05, "loss": 2.5421, "step": 173000 }, { "epoch": 6.446938168846611, "grad_norm": 6.4542717933654785, "learning_rate": 3.388265457788347e-05, "loss": 2.5347, "step": 173500 }, { "epoch": 6.4655172413793105, "grad_norm": 6.445559024810791, "learning_rate": 3.383620689655172e-05, "loss": 2.5161, "step": 174000 }, { "epoch": 6.484096313912009, "grad_norm": 6.366390228271484, "learning_rate": 3.378975921521998e-05, "loss": 2.5254, "step": 174500 }, { "epoch": 6.502675386444709, "grad_norm": 5.990453720092773, "learning_rate": 3.374331153388823e-05, "loss": 2.5291, "step": 175000 }, { "epoch": 6.521254458977408, "grad_norm": 7.384641170501709, "learning_rate": 3.369686385255648e-05, "loss": 2.5273, "step": 175500 }, { "epoch": 6.539833531510107, "grad_norm": 7.899537563323975, "learning_rate": 3.365041617122474e-05, "loss": 2.5217, "step": 176000 }, { "epoch": 6.558412604042807, "grad_norm": 7.456251621246338, "learning_rate": 3.360396848989298e-05, "loss": 2.5183, "step": 176500 }, { "epoch": 6.576991676575505, "grad_norm": 7.728662967681885, "learning_rate": 3.355752080856124e-05, "loss": 2.5242, "step": 177000 }, { "epoch": 6.595570749108204, "grad_norm": 6.4795708656311035, "learning_rate": 3.351107312722949e-05, "loss": 2.5148, "step": 177500 }, { "epoch": 6.614149821640904, "grad_norm": 6.569213390350342, "learning_rate": 3.346462544589774e-05, "loss": 2.513, "step": 178000 }, { "epoch": 6.632728894173603, "grad_norm": 5.95412015914917, "learning_rate": 3.3418177764566e-05, "loss": 2.5141, "step": 178500 }, { "epoch": 6.651307966706302, "grad_norm": 6.39993143081665, "learning_rate": 3.337173008323425e-05, "loss": 2.5209, "step": 179000 }, { "epoch": 6.6698870392390015, "grad_norm": 6.558811664581299, "learning_rate": 3.33252824019025e-05, "loss": 2.5168, "step": 179500 }, { "epoch": 6.6884661117717, "grad_norm": 6.443490982055664, "learning_rate": 3.327883472057075e-05, "loss": 2.5122, "step": 180000 }, { "epoch": 6.707045184304399, "grad_norm": 6.475789546966553, "learning_rate": 3.3232387039239006e-05, "loss": 2.5265, "step": 180500 }, { "epoch": 6.725624256837099, "grad_norm": 6.097142219543457, "learning_rate": 3.318593935790725e-05, "loss": 2.4953, "step": 181000 }, { "epoch": 6.744203329369798, "grad_norm": 6.5415849685668945, "learning_rate": 3.313949167657551e-05, "loss": 2.504, "step": 181500 }, { "epoch": 6.762782401902497, "grad_norm": 6.81630277633667, "learning_rate": 3.309304399524376e-05, "loss": 2.515, "step": 182000 }, { "epoch": 6.781361474435196, "grad_norm": 6.129889488220215, "learning_rate": 3.304659631391201e-05, "loss": 2.4993, "step": 182500 }, { "epoch": 6.799940546967895, "grad_norm": 6.960236072540283, "learning_rate": 3.3000148632580265e-05, "loss": 2.5146, "step": 183000 }, { "epoch": 6.818519619500594, "grad_norm": 7.540809154510498, "learning_rate": 3.2953700951248516e-05, "loss": 2.4899, "step": 183500 }, { "epoch": 6.837098692033294, "grad_norm": 6.699360370635986, "learning_rate": 3.2907253269916766e-05, "loss": 2.5032, "step": 184000 }, { "epoch": 6.855677764565993, "grad_norm": 6.967233180999756, "learning_rate": 3.286080558858502e-05, "loss": 2.5079, "step": 184500 }, { "epoch": 6.874256837098692, "grad_norm": 6.475770473480225, "learning_rate": 3.2814357907253274e-05, "loss": 2.5278, "step": 185000 }, { "epoch": 6.892835909631391, "grad_norm": 7.317842483520508, "learning_rate": 3.2767910225921525e-05, "loss": 2.4923, "step": 185500 }, { "epoch": 6.91141498216409, "grad_norm": 6.920095443725586, "learning_rate": 3.2721462544589775e-05, "loss": 2.5094, "step": 186000 }, { "epoch": 6.92999405469679, "grad_norm": 6.825646877288818, "learning_rate": 3.267501486325803e-05, "loss": 2.4832, "step": 186500 }, { "epoch": 6.948573127229489, "grad_norm": 7.122133255004883, "learning_rate": 3.2628567181926276e-05, "loss": 2.5033, "step": 187000 }, { "epoch": 6.967152199762188, "grad_norm": 6.76582145690918, "learning_rate": 3.2582119500594533e-05, "loss": 2.4841, "step": 187500 }, { "epoch": 6.9857312722948866, "grad_norm": 6.667826175689697, "learning_rate": 3.2535671819262784e-05, "loss": 2.5107, "step": 188000 }, { "epoch": 7.004310344827586, "grad_norm": 6.997557640075684, "learning_rate": 3.2489224137931034e-05, "loss": 2.4932, "step": 188500 }, { "epoch": 7.022889417360285, "grad_norm": 7.383888244628906, "learning_rate": 3.244277645659929e-05, "loss": 2.4599, "step": 189000 }, { "epoch": 7.041468489892985, "grad_norm": 6.721455097198486, "learning_rate": 3.239632877526754e-05, "loss": 2.461, "step": 189500 }, { "epoch": 7.060047562425684, "grad_norm": 5.605399131774902, "learning_rate": 3.234988109393579e-05, "loss": 2.4733, "step": 190000 }, { "epoch": 7.078626634958383, "grad_norm": 6.753121852874756, "learning_rate": 3.230343341260404e-05, "loss": 2.4739, "step": 190500 }, { "epoch": 7.097205707491082, "grad_norm": 6.254253387451172, "learning_rate": 3.22569857312723e-05, "loss": 2.4702, "step": 191000 }, { "epoch": 7.115784780023781, "grad_norm": 7.067044258117676, "learning_rate": 3.2210538049940544e-05, "loss": 2.4565, "step": 191500 }, { "epoch": 7.13436385255648, "grad_norm": 6.651601791381836, "learning_rate": 3.21640903686088e-05, "loss": 2.4591, "step": 192000 }, { "epoch": 7.15294292508918, "grad_norm": 7.131402015686035, "learning_rate": 3.211764268727705e-05, "loss": 2.4699, "step": 192500 }, { "epoch": 7.171521997621879, "grad_norm": 6.57224702835083, "learning_rate": 3.20711950059453e-05, "loss": 2.4652, "step": 193000 }, { "epoch": 7.190101070154578, "grad_norm": 6.226948261260986, "learning_rate": 3.202474732461356e-05, "loss": 2.4553, "step": 193500 }, { "epoch": 7.208680142687277, "grad_norm": 6.283173561096191, "learning_rate": 3.197829964328181e-05, "loss": 2.4666, "step": 194000 }, { "epoch": 7.227259215219976, "grad_norm": 6.692994117736816, "learning_rate": 3.193185196195006e-05, "loss": 2.4634, "step": 194500 }, { "epoch": 7.245838287752675, "grad_norm": 5.542157173156738, "learning_rate": 3.188540428061831e-05, "loss": 2.4483, "step": 195000 }, { "epoch": 7.264417360285375, "grad_norm": 7.492745876312256, "learning_rate": 3.183895659928656e-05, "loss": 2.467, "step": 195500 }, { "epoch": 7.282996432818074, "grad_norm": 6.997331619262695, "learning_rate": 3.179250891795482e-05, "loss": 2.4562, "step": 196000 }, { "epoch": 7.3015755053507725, "grad_norm": 7.160475730895996, "learning_rate": 3.174606123662307e-05, "loss": 2.4645, "step": 196500 }, { "epoch": 7.320154577883472, "grad_norm": 6.583847522735596, "learning_rate": 3.169961355529132e-05, "loss": 2.4591, "step": 197000 }, { "epoch": 7.338733650416171, "grad_norm": 7.247707366943359, "learning_rate": 3.165316587395957e-05, "loss": 2.447, "step": 197500 }, { "epoch": 7.357312722948871, "grad_norm": 6.818671226501465, "learning_rate": 3.160671819262783e-05, "loss": 2.4524, "step": 198000 }, { "epoch": 7.37589179548157, "grad_norm": 6.533426284790039, "learning_rate": 3.156027051129608e-05, "loss": 2.4614, "step": 198500 }, { "epoch": 7.394470868014269, "grad_norm": 6.117506504058838, "learning_rate": 3.151382282996433e-05, "loss": 2.4523, "step": 199000 }, { "epoch": 7.413049940546968, "grad_norm": 6.545726776123047, "learning_rate": 3.1467375148632586e-05, "loss": 2.4413, "step": 199500 }, { "epoch": 7.431629013079667, "grad_norm": 6.267510414123535, "learning_rate": 3.142092746730083e-05, "loss": 2.4626, "step": 200000 }, { "epoch": 7.450208085612366, "grad_norm": 6.45424222946167, "learning_rate": 3.137447978596909e-05, "loss": 2.4525, "step": 200500 }, { "epoch": 7.468787158145066, "grad_norm": 6.826467990875244, "learning_rate": 3.132803210463734e-05, "loss": 2.4535, "step": 201000 }, { "epoch": 7.487366230677765, "grad_norm": 6.419857501983643, "learning_rate": 3.128158442330559e-05, "loss": 2.4567, "step": 201500 }, { "epoch": 7.5059453032104635, "grad_norm": 7.912742614746094, "learning_rate": 3.123513674197384e-05, "loss": 2.4352, "step": 202000 }, { "epoch": 7.524524375743163, "grad_norm": 6.15361213684082, "learning_rate": 3.1188689060642096e-05, "loss": 2.4525, "step": 202500 }, { "epoch": 7.543103448275862, "grad_norm": 6.077796936035156, "learning_rate": 3.1142241379310346e-05, "loss": 2.4621, "step": 203000 }, { "epoch": 7.561682520808561, "grad_norm": 6.890556335449219, "learning_rate": 3.10957936979786e-05, "loss": 2.4454, "step": 203500 }, { "epoch": 7.580261593341261, "grad_norm": 7.002103328704834, "learning_rate": 3.1049346016646854e-05, "loss": 2.4737, "step": 204000 }, { "epoch": 7.59884066587396, "grad_norm": 7.24050760269165, "learning_rate": 3.10028983353151e-05, "loss": 2.4369, "step": 204500 }, { "epoch": 7.617419738406658, "grad_norm": 7.357000827789307, "learning_rate": 3.0956450653983355e-05, "loss": 2.4338, "step": 205000 }, { "epoch": 7.635998810939358, "grad_norm": 6.06101131439209, "learning_rate": 3.0910002972651605e-05, "loss": 2.4378, "step": 205500 }, { "epoch": 7.654577883472057, "grad_norm": 7.14568567276001, "learning_rate": 3.0863555291319856e-05, "loss": 2.4448, "step": 206000 }, { "epoch": 7.673156956004756, "grad_norm": 6.747462272644043, "learning_rate": 3.081710760998811e-05, "loss": 2.4604, "step": 206500 }, { "epoch": 7.691736028537456, "grad_norm": 7.445852756500244, "learning_rate": 3.0770659928656364e-05, "loss": 2.4454, "step": 207000 }, { "epoch": 7.7103151010701545, "grad_norm": 6.196556568145752, "learning_rate": 3.0724212247324614e-05, "loss": 2.4378, "step": 207500 }, { "epoch": 7.728894173602853, "grad_norm": 6.7122039794921875, "learning_rate": 3.0677764565992865e-05, "loss": 2.4286, "step": 208000 }, { "epoch": 7.747473246135553, "grad_norm": 7.239169120788574, "learning_rate": 3.063131688466112e-05, "loss": 2.4459, "step": 208500 }, { "epoch": 7.766052318668252, "grad_norm": 5.942273139953613, "learning_rate": 3.058486920332937e-05, "loss": 2.4554, "step": 209000 }, { "epoch": 7.784631391200952, "grad_norm": 6.494337558746338, "learning_rate": 3.053842152199762e-05, "loss": 2.4404, "step": 209500 }, { "epoch": 7.803210463733651, "grad_norm": 7.0354084968566895, "learning_rate": 3.0491973840665877e-05, "loss": 2.4454, "step": 210000 }, { "epoch": 7.821789536266349, "grad_norm": 6.828258037567139, "learning_rate": 3.0445526159334127e-05, "loss": 2.4325, "step": 210500 }, { "epoch": 7.840368608799048, "grad_norm": 7.0825724601745605, "learning_rate": 3.039907847800238e-05, "loss": 2.4289, "step": 211000 }, { "epoch": 7.858947681331748, "grad_norm": 7.346935749053955, "learning_rate": 3.0352630796670635e-05, "loss": 2.4548, "step": 211500 }, { "epoch": 7.877526753864447, "grad_norm": 6.80324125289917, "learning_rate": 3.0306183115338882e-05, "loss": 2.4507, "step": 212000 }, { "epoch": 7.896105826397147, "grad_norm": 6.606076717376709, "learning_rate": 3.0259735434007136e-05, "loss": 2.43, "step": 212500 }, { "epoch": 7.9146848989298455, "grad_norm": 7.006173133850098, "learning_rate": 3.021328775267539e-05, "loss": 2.4376, "step": 213000 }, { "epoch": 7.933263971462544, "grad_norm": 6.683685779571533, "learning_rate": 3.0166840071343637e-05, "loss": 2.4173, "step": 213500 }, { "epoch": 7.951843043995244, "grad_norm": 7.217254161834717, "learning_rate": 3.012039239001189e-05, "loss": 2.4465, "step": 214000 }, { "epoch": 7.970422116527943, "grad_norm": 6.2831573486328125, "learning_rate": 3.0073944708680145e-05, "loss": 2.4347, "step": 214500 }, { "epoch": 7.989001189060642, "grad_norm": 7.447052955627441, "learning_rate": 3.0027497027348395e-05, "loss": 2.4346, "step": 215000 }, { "epoch": 8.007580261593342, "grad_norm": 6.3113884925842285, "learning_rate": 2.998104934601665e-05, "loss": 2.4264, "step": 215500 }, { "epoch": 8.02615933412604, "grad_norm": 7.349926948547363, "learning_rate": 2.9934601664684903e-05, "loss": 2.409, "step": 216000 }, { "epoch": 8.04473840665874, "grad_norm": 7.6226959228515625, "learning_rate": 2.988815398335315e-05, "loss": 2.4047, "step": 216500 }, { "epoch": 8.063317479191438, "grad_norm": 7.297638893127441, "learning_rate": 2.9841706302021404e-05, "loss": 2.4064, "step": 217000 }, { "epoch": 8.081896551724139, "grad_norm": 6.703174114227295, "learning_rate": 2.9795258620689658e-05, "loss": 2.3948, "step": 217500 }, { "epoch": 8.100475624256838, "grad_norm": 7.86271858215332, "learning_rate": 2.974881093935791e-05, "loss": 2.4033, "step": 218000 }, { "epoch": 8.119054696789537, "grad_norm": 6.666792392730713, "learning_rate": 2.9702363258026162e-05, "loss": 2.4189, "step": 218500 }, { "epoch": 8.137633769322235, "grad_norm": 7.112173557281494, "learning_rate": 2.9655915576694416e-05, "loss": 2.4139, "step": 219000 }, { "epoch": 8.156212841854934, "grad_norm": 7.117358684539795, "learning_rate": 2.9609467895362663e-05, "loss": 2.4092, "step": 219500 }, { "epoch": 8.174791914387633, "grad_norm": 5.946983337402344, "learning_rate": 2.9563020214030917e-05, "loss": 2.403, "step": 220000 }, { "epoch": 8.193370986920334, "grad_norm": 6.8523030281066895, "learning_rate": 2.951657253269917e-05, "loss": 2.3906, "step": 220500 }, { "epoch": 8.211950059453033, "grad_norm": 6.419975280761719, "learning_rate": 2.947012485136742e-05, "loss": 2.4058, "step": 221000 }, { "epoch": 8.230529131985731, "grad_norm": 7.008522033691406, "learning_rate": 2.9423677170035675e-05, "loss": 2.3949, "step": 221500 }, { "epoch": 8.24910820451843, "grad_norm": 6.398033618927002, "learning_rate": 2.937722948870393e-05, "loss": 2.3894, "step": 222000 }, { "epoch": 8.26768727705113, "grad_norm": 6.911588668823242, "learning_rate": 2.9330781807372176e-05, "loss": 2.4026, "step": 222500 }, { "epoch": 8.286266349583828, "grad_norm": 6.391911029815674, "learning_rate": 2.928433412604043e-05, "loss": 2.388, "step": 223000 }, { "epoch": 8.304845422116529, "grad_norm": 6.878973484039307, "learning_rate": 2.9237886444708684e-05, "loss": 2.3964, "step": 223500 }, { "epoch": 8.323424494649228, "grad_norm": 6.681369781494141, "learning_rate": 2.919143876337693e-05, "loss": 2.4049, "step": 224000 }, { "epoch": 8.342003567181926, "grad_norm": 6.652570724487305, "learning_rate": 2.9144991082045185e-05, "loss": 2.3935, "step": 224500 }, { "epoch": 8.360582639714625, "grad_norm": 6.757369041442871, "learning_rate": 2.909854340071344e-05, "loss": 2.4024, "step": 225000 }, { "epoch": 8.379161712247324, "grad_norm": 7.730061054229736, "learning_rate": 2.905209571938169e-05, "loss": 2.3815, "step": 225500 }, { "epoch": 8.397740784780023, "grad_norm": 7.044541358947754, "learning_rate": 2.9005648038049943e-05, "loss": 2.3872, "step": 226000 }, { "epoch": 8.416319857312724, "grad_norm": 7.104819297790527, "learning_rate": 2.895920035671819e-05, "loss": 2.3888, "step": 226500 }, { "epoch": 8.434898929845422, "grad_norm": 6.207997798919678, "learning_rate": 2.8912752675386444e-05, "loss": 2.3901, "step": 227000 }, { "epoch": 8.453478002378121, "grad_norm": 6.412841796875, "learning_rate": 2.8866304994054698e-05, "loss": 2.404, "step": 227500 }, { "epoch": 8.47205707491082, "grad_norm": 7.31563663482666, "learning_rate": 2.881985731272295e-05, "loss": 2.3952, "step": 228000 }, { "epoch": 8.490636147443519, "grad_norm": 6.783107757568359, "learning_rate": 2.8773409631391203e-05, "loss": 2.369, "step": 228500 }, { "epoch": 8.50921521997622, "grad_norm": 7.456410884857178, "learning_rate": 2.8726961950059456e-05, "loss": 2.3955, "step": 229000 }, { "epoch": 8.527794292508919, "grad_norm": 6.817208766937256, "learning_rate": 2.8680514268727704e-05, "loss": 2.3777, "step": 229500 }, { "epoch": 8.546373365041617, "grad_norm": 6.829710483551025, "learning_rate": 2.8634066587395957e-05, "loss": 2.3803, "step": 230000 }, { "epoch": 8.564952437574316, "grad_norm": 6.171419620513916, "learning_rate": 2.858761890606421e-05, "loss": 2.3867, "step": 230500 }, { "epoch": 8.583531510107015, "grad_norm": 7.179515361785889, "learning_rate": 2.8541171224732462e-05, "loss": 2.3819, "step": 231000 }, { "epoch": 8.602110582639714, "grad_norm": 7.424422264099121, "learning_rate": 2.8494723543400716e-05, "loss": 2.3892, "step": 231500 }, { "epoch": 8.620689655172415, "grad_norm": 6.56906795501709, "learning_rate": 2.844827586206897e-05, "loss": 2.3875, "step": 232000 }, { "epoch": 8.639268727705113, "grad_norm": 5.986749649047852, "learning_rate": 2.8401828180737217e-05, "loss": 2.3881, "step": 232500 }, { "epoch": 8.657847800237812, "grad_norm": 7.885437965393066, "learning_rate": 2.835538049940547e-05, "loss": 2.3898, "step": 233000 }, { "epoch": 8.676426872770511, "grad_norm": 8.217313766479492, "learning_rate": 2.8308932818073724e-05, "loss": 2.3853, "step": 233500 }, { "epoch": 8.69500594530321, "grad_norm": 7.467879295349121, "learning_rate": 2.826248513674197e-05, "loss": 2.3894, "step": 234000 }, { "epoch": 8.713585017835909, "grad_norm": 6.856407642364502, "learning_rate": 2.8216037455410225e-05, "loss": 2.3884, "step": 234500 }, { "epoch": 8.73216409036861, "grad_norm": 7.717813014984131, "learning_rate": 2.816958977407848e-05, "loss": 2.3735, "step": 235000 }, { "epoch": 8.750743162901308, "grad_norm": 6.215982913970947, "learning_rate": 2.812314209274673e-05, "loss": 2.3704, "step": 235500 }, { "epoch": 8.769322235434007, "grad_norm": 5.821375370025635, "learning_rate": 2.8076694411414984e-05, "loss": 2.3821, "step": 236000 }, { "epoch": 8.787901307966706, "grad_norm": 5.752195358276367, "learning_rate": 2.8030246730083238e-05, "loss": 2.362, "step": 236500 }, { "epoch": 8.806480380499405, "grad_norm": 7.1153693199157715, "learning_rate": 2.7983799048751485e-05, "loss": 2.3804, "step": 237000 }, { "epoch": 8.825059453032104, "grad_norm": 7.165075302124023, "learning_rate": 2.793735136741974e-05, "loss": 2.3749, "step": 237500 }, { "epoch": 8.843638525564804, "grad_norm": 7.609332084655762, "learning_rate": 2.7890903686087992e-05, "loss": 2.3783, "step": 238000 }, { "epoch": 8.862217598097503, "grad_norm": 7.269701957702637, "learning_rate": 2.7844456004756243e-05, "loss": 2.3612, "step": 238500 }, { "epoch": 8.880796670630202, "grad_norm": 6.229999542236328, "learning_rate": 2.7798008323424497e-05, "loss": 2.3674, "step": 239000 }, { "epoch": 8.899375743162901, "grad_norm": 6.712778568267822, "learning_rate": 2.775156064209275e-05, "loss": 2.4045, "step": 239500 }, { "epoch": 8.9179548156956, "grad_norm": 6.752030372619629, "learning_rate": 2.7705112960760998e-05, "loss": 2.3665, "step": 240000 }, { "epoch": 8.9365338882283, "grad_norm": 7.107761383056641, "learning_rate": 2.765866527942925e-05, "loss": 2.3757, "step": 240500 }, { "epoch": 8.955112960761, "grad_norm": 6.4916300773620605, "learning_rate": 2.7612217598097506e-05, "loss": 2.364, "step": 241000 }, { "epoch": 8.973692033293698, "grad_norm": 6.902660846710205, "learning_rate": 2.7565769916765756e-05, "loss": 2.3945, "step": 241500 }, { "epoch": 8.992271105826397, "grad_norm": 6.676261901855469, "learning_rate": 2.751932223543401e-05, "loss": 2.368, "step": 242000 }, { "epoch": 9.010850178359096, "grad_norm": 7.0637125968933105, "learning_rate": 2.7472874554102264e-05, "loss": 2.3526, "step": 242500 }, { "epoch": 9.029429250891795, "grad_norm": 6.886041164398193, "learning_rate": 2.742642687277051e-05, "loss": 2.3409, "step": 243000 }, { "epoch": 9.048008323424495, "grad_norm": 6.17530632019043, "learning_rate": 2.7379979191438765e-05, "loss": 2.3368, "step": 243500 }, { "epoch": 9.066587395957194, "grad_norm": 6.835616588592529, "learning_rate": 2.733353151010702e-05, "loss": 2.3521, "step": 244000 }, { "epoch": 9.085166468489893, "grad_norm": 7.837756156921387, "learning_rate": 2.7287083828775266e-05, "loss": 2.3517, "step": 244500 }, { "epoch": 9.103745541022592, "grad_norm": 7.3295793533325195, "learning_rate": 2.724063614744352e-05, "loss": 2.3351, "step": 245000 }, { "epoch": 9.122324613555291, "grad_norm": 6.278160095214844, "learning_rate": 2.7194188466111774e-05, "loss": 2.3544, "step": 245500 }, { "epoch": 9.14090368608799, "grad_norm": 6.8166823387146, "learning_rate": 2.7147740784780024e-05, "loss": 2.3562, "step": 246000 }, { "epoch": 9.15948275862069, "grad_norm": 6.9190473556518555, "learning_rate": 2.7101293103448278e-05, "loss": 2.3707, "step": 246500 }, { "epoch": 9.17806183115339, "grad_norm": 8.471137046813965, "learning_rate": 2.7054845422116532e-05, "loss": 2.3348, "step": 247000 }, { "epoch": 9.196640903686088, "grad_norm": 7.1549553871154785, "learning_rate": 2.700839774078478e-05, "loss": 2.3482, "step": 247500 }, { "epoch": 9.215219976218787, "grad_norm": 7.972681999206543, "learning_rate": 2.6961950059453033e-05, "loss": 2.3327, "step": 248000 }, { "epoch": 9.233799048751486, "grad_norm": 6.290485858917236, "learning_rate": 2.6915502378121287e-05, "loss": 2.3344, "step": 248500 }, { "epoch": 9.252378121284185, "grad_norm": 7.835150718688965, "learning_rate": 2.6869054696789537e-05, "loss": 2.3523, "step": 249000 }, { "epoch": 9.270957193816885, "grad_norm": 6.171538829803467, "learning_rate": 2.682260701545779e-05, "loss": 2.3439, "step": 249500 }, { "epoch": 9.289536266349584, "grad_norm": 6.854957580566406, "learning_rate": 2.6776159334126045e-05, "loss": 2.348, "step": 250000 }, { "epoch": 9.308115338882283, "grad_norm": 6.949794769287109, "learning_rate": 2.6729711652794292e-05, "loss": 2.3416, "step": 250500 }, { "epoch": 9.326694411414982, "grad_norm": 7.924169540405273, "learning_rate": 2.6683263971462546e-05, "loss": 2.341, "step": 251000 }, { "epoch": 9.34527348394768, "grad_norm": 6.802456378936768, "learning_rate": 2.66368162901308e-05, "loss": 2.3373, "step": 251500 }, { "epoch": 9.363852556480381, "grad_norm": 5.974133491516113, "learning_rate": 2.659036860879905e-05, "loss": 2.3447, "step": 252000 }, { "epoch": 9.38243162901308, "grad_norm": 7.3315277099609375, "learning_rate": 2.6543920927467304e-05, "loss": 2.345, "step": 252500 }, { "epoch": 9.40101070154578, "grad_norm": 7.01455020904541, "learning_rate": 2.6497473246135558e-05, "loss": 2.3354, "step": 253000 }, { "epoch": 9.419589774078478, "grad_norm": 6.553669452667236, "learning_rate": 2.6451025564803805e-05, "loss": 2.3505, "step": 253500 }, { "epoch": 9.438168846611177, "grad_norm": 7.384204387664795, "learning_rate": 2.640457788347206e-05, "loss": 2.3406, "step": 254000 }, { "epoch": 9.456747919143876, "grad_norm": 7.899343490600586, "learning_rate": 2.6358130202140313e-05, "loss": 2.3534, "step": 254500 }, { "epoch": 9.475326991676576, "grad_norm": 6.718962669372559, "learning_rate": 2.631168252080856e-05, "loss": 2.3447, "step": 255000 }, { "epoch": 9.493906064209275, "grad_norm": 7.7100629806518555, "learning_rate": 2.6265234839476814e-05, "loss": 2.3378, "step": 255500 }, { "epoch": 9.512485136741974, "grad_norm": 6.307003974914551, "learning_rate": 2.6218787158145064e-05, "loss": 2.3673, "step": 256000 }, { "epoch": 9.531064209274673, "grad_norm": 6.968733787536621, "learning_rate": 2.6172339476813318e-05, "loss": 2.3502, "step": 256500 }, { "epoch": 9.549643281807372, "grad_norm": 7.223754405975342, "learning_rate": 2.6125891795481572e-05, "loss": 2.3397, "step": 257000 }, { "epoch": 9.56822235434007, "grad_norm": 7.984851360321045, "learning_rate": 2.607944411414982e-05, "loss": 2.3394, "step": 257500 }, { "epoch": 9.586801426872771, "grad_norm": 6.745290279388428, "learning_rate": 2.6032996432818073e-05, "loss": 2.357, "step": 258000 }, { "epoch": 9.60538049940547, "grad_norm": 6.241764068603516, "learning_rate": 2.5986548751486327e-05, "loss": 2.3294, "step": 258500 }, { "epoch": 9.623959571938169, "grad_norm": 6.849953651428223, "learning_rate": 2.5940101070154577e-05, "loss": 2.319, "step": 259000 }, { "epoch": 9.642538644470868, "grad_norm": 6.786033630371094, "learning_rate": 2.589365338882283e-05, "loss": 2.3381, "step": 259500 }, { "epoch": 9.661117717003567, "grad_norm": 6.5294952392578125, "learning_rate": 2.5847205707491085e-05, "loss": 2.3292, "step": 260000 }, { "epoch": 9.679696789536266, "grad_norm": 6.852995872497559, "learning_rate": 2.5800758026159332e-05, "loss": 2.3513, "step": 260500 }, { "epoch": 9.698275862068966, "grad_norm": 7.107331275939941, "learning_rate": 2.5754310344827586e-05, "loss": 2.342, "step": 261000 }, { "epoch": 9.716854934601665, "grad_norm": 6.497838020324707, "learning_rate": 2.570786266349584e-05, "loss": 2.3518, "step": 261500 }, { "epoch": 9.735434007134364, "grad_norm": 7.103449821472168, "learning_rate": 2.566141498216409e-05, "loss": 2.3243, "step": 262000 }, { "epoch": 9.754013079667063, "grad_norm": 6.207728862762451, "learning_rate": 2.5614967300832344e-05, "loss": 2.3295, "step": 262500 }, { "epoch": 9.772592152199762, "grad_norm": 6.938514232635498, "learning_rate": 2.55685196195006e-05, "loss": 2.3378, "step": 263000 }, { "epoch": 9.791171224732462, "grad_norm": 8.32728385925293, "learning_rate": 2.5522071938168845e-05, "loss": 2.3397, "step": 263500 }, { "epoch": 9.809750297265161, "grad_norm": 7.170902729034424, "learning_rate": 2.54756242568371e-05, "loss": 2.3152, "step": 264000 }, { "epoch": 9.82832936979786, "grad_norm": 6.303475856781006, "learning_rate": 2.5429176575505353e-05, "loss": 2.335, "step": 264500 }, { "epoch": 9.846908442330559, "grad_norm": 6.429758548736572, "learning_rate": 2.53827288941736e-05, "loss": 2.3193, "step": 265000 }, { "epoch": 9.865487514863258, "grad_norm": 7.365509986877441, "learning_rate": 2.5336281212841854e-05, "loss": 2.3291, "step": 265500 }, { "epoch": 9.884066587395957, "grad_norm": 6.403247356414795, "learning_rate": 2.528983353151011e-05, "loss": 2.3289, "step": 266000 }, { "epoch": 9.902645659928655, "grad_norm": 6.402617454528809, "learning_rate": 2.524338585017836e-05, "loss": 2.3383, "step": 266500 }, { "epoch": 9.921224732461356, "grad_norm": 8.039521217346191, "learning_rate": 2.5196938168846612e-05, "loss": 2.3535, "step": 267000 }, { "epoch": 9.939803804994055, "grad_norm": 6.797732830047607, "learning_rate": 2.5150490487514866e-05, "loss": 2.3102, "step": 267500 }, { "epoch": 9.958382877526754, "grad_norm": 6.878042221069336, "learning_rate": 2.5104042806183113e-05, "loss": 2.3252, "step": 268000 }, { "epoch": 9.976961950059453, "grad_norm": 7.837581634521484, "learning_rate": 2.5057595124851367e-05, "loss": 2.3259, "step": 268500 }, { "epoch": 9.995541022592151, "grad_norm": 7.878035545349121, "learning_rate": 2.501114744351962e-05, "loss": 2.3206, "step": 269000 }, { "epoch": 10.014120095124852, "grad_norm": 7.06614875793457, "learning_rate": 2.4964699762187875e-05, "loss": 2.3031, "step": 269500 }, { "epoch": 10.032699167657551, "grad_norm": 6.305147647857666, "learning_rate": 2.4918252080856126e-05, "loss": 2.2958, "step": 270000 }, { "epoch": 10.05127824019025, "grad_norm": 7.321694374084473, "learning_rate": 2.4871804399524376e-05, "loss": 2.3102, "step": 270500 }, { "epoch": 10.069857312722949, "grad_norm": 6.2910356521606445, "learning_rate": 2.482535671819263e-05, "loss": 2.3087, "step": 271000 }, { "epoch": 10.088436385255648, "grad_norm": 6.352067470550537, "learning_rate": 2.477890903686088e-05, "loss": 2.2997, "step": 271500 }, { "epoch": 10.107015457788346, "grad_norm": 7.583943843841553, "learning_rate": 2.473246135552913e-05, "loss": 2.2976, "step": 272000 }, { "epoch": 10.125594530321047, "grad_norm": 6.128369331359863, "learning_rate": 2.4686013674197385e-05, "loss": 2.3184, "step": 272500 }, { "epoch": 10.144173602853746, "grad_norm": 7.117658615112305, "learning_rate": 2.463956599286564e-05, "loss": 2.297, "step": 273000 }, { "epoch": 10.162752675386445, "grad_norm": 6.37664270401001, "learning_rate": 2.459311831153389e-05, "loss": 2.3054, "step": 273500 }, { "epoch": 10.181331747919144, "grad_norm": 8.254295349121094, "learning_rate": 2.4546670630202143e-05, "loss": 2.2856, "step": 274000 }, { "epoch": 10.199910820451842, "grad_norm": 7.399996757507324, "learning_rate": 2.4500222948870394e-05, "loss": 2.3191, "step": 274500 }, { "epoch": 10.218489892984541, "grad_norm": 7.4784464836120605, "learning_rate": 2.4453775267538644e-05, "loss": 2.2994, "step": 275000 }, { "epoch": 10.237068965517242, "grad_norm": 7.332183837890625, "learning_rate": 2.4407327586206898e-05, "loss": 2.3022, "step": 275500 }, { "epoch": 10.25564803804994, "grad_norm": 6.316469192504883, "learning_rate": 2.4360879904875152e-05, "loss": 2.306, "step": 276000 }, { "epoch": 10.27422711058264, "grad_norm": 7.272724628448486, "learning_rate": 2.4314432223543402e-05, "loss": 2.293, "step": 276500 }, { "epoch": 10.292806183115339, "grad_norm": 7.283202171325684, "learning_rate": 2.4267984542211656e-05, "loss": 2.3086, "step": 277000 }, { "epoch": 10.311385255648037, "grad_norm": 6.357330799102783, "learning_rate": 2.4221536860879907e-05, "loss": 2.2958, "step": 277500 }, { "epoch": 10.329964328180738, "grad_norm": 6.361136436462402, "learning_rate": 2.4175089179548157e-05, "loss": 2.2856, "step": 278000 }, { "epoch": 10.348543400713437, "grad_norm": 7.32297420501709, "learning_rate": 2.4128641498216408e-05, "loss": 2.2904, "step": 278500 }, { "epoch": 10.367122473246136, "grad_norm": 7.6246161460876465, "learning_rate": 2.408219381688466e-05, "loss": 2.2872, "step": 279000 }, { "epoch": 10.385701545778835, "grad_norm": 6.27332067489624, "learning_rate": 2.4035746135552915e-05, "loss": 2.2829, "step": 279500 }, { "epoch": 10.404280618311534, "grad_norm": 7.062289714813232, "learning_rate": 2.3989298454221166e-05, "loss": 2.2938, "step": 280000 }, { "epoch": 10.422859690844232, "grad_norm": 8.132457733154297, "learning_rate": 2.394285077288942e-05, "loss": 2.2994, "step": 280500 }, { "epoch": 10.441438763376933, "grad_norm": 6.456370830535889, "learning_rate": 2.389640309155767e-05, "loss": 2.2845, "step": 281000 }, { "epoch": 10.460017835909632, "grad_norm": 8.033242225646973, "learning_rate": 2.384995541022592e-05, "loss": 2.2907, "step": 281500 }, { "epoch": 10.47859690844233, "grad_norm": 7.318391799926758, "learning_rate": 2.3803507728894175e-05, "loss": 2.288, "step": 282000 }, { "epoch": 10.49717598097503, "grad_norm": 6.92618465423584, "learning_rate": 2.3757060047562425e-05, "loss": 2.2875, "step": 282500 }, { "epoch": 10.515755053507728, "grad_norm": 6.721688747406006, "learning_rate": 2.371061236623068e-05, "loss": 2.295, "step": 283000 }, { "epoch": 10.534334126040427, "grad_norm": 7.079250335693359, "learning_rate": 2.3664164684898933e-05, "loss": 2.2806, "step": 283500 }, { "epoch": 10.552913198573128, "grad_norm": 7.229697227478027, "learning_rate": 2.3617717003567183e-05, "loss": 2.2828, "step": 284000 }, { "epoch": 10.571492271105827, "grad_norm": 6.85770845413208, "learning_rate": 2.3571269322235434e-05, "loss": 2.3038, "step": 284500 }, { "epoch": 10.590071343638526, "grad_norm": 7.07368803024292, "learning_rate": 2.3524821640903688e-05, "loss": 2.2918, "step": 285000 }, { "epoch": 10.608650416171225, "grad_norm": 7.446401119232178, "learning_rate": 2.3478373959571938e-05, "loss": 2.3097, "step": 285500 }, { "epoch": 10.627229488703923, "grad_norm": 7.388403415679932, "learning_rate": 2.3431926278240192e-05, "loss": 2.2753, "step": 286000 }, { "epoch": 10.645808561236624, "grad_norm": 7.510107517242432, "learning_rate": 2.3385478596908446e-05, "loss": 2.2592, "step": 286500 }, { "epoch": 10.664387633769323, "grad_norm": 6.856348514556885, "learning_rate": 2.3339030915576697e-05, "loss": 2.3018, "step": 287000 }, { "epoch": 10.682966706302022, "grad_norm": 5.952792644500732, "learning_rate": 2.3292583234244947e-05, "loss": 2.293, "step": 287500 }, { "epoch": 10.70154577883472, "grad_norm": 6.156429290771484, "learning_rate": 2.32461355529132e-05, "loss": 2.2794, "step": 288000 }, { "epoch": 10.72012485136742, "grad_norm": 7.464205741882324, "learning_rate": 2.319968787158145e-05, "loss": 2.2819, "step": 288500 }, { "epoch": 10.738703923900118, "grad_norm": 6.248416423797607, "learning_rate": 2.3153240190249702e-05, "loss": 2.2841, "step": 289000 }, { "epoch": 10.757282996432817, "grad_norm": 6.5093183517456055, "learning_rate": 2.3106792508917956e-05, "loss": 2.2974, "step": 289500 }, { "epoch": 10.775862068965518, "grad_norm": 6.669436454772949, "learning_rate": 2.306034482758621e-05, "loss": 2.2823, "step": 290000 }, { "epoch": 10.794441141498217, "grad_norm": 6.547306537628174, "learning_rate": 2.301389714625446e-05, "loss": 2.2783, "step": 290500 }, { "epoch": 10.813020214030916, "grad_norm": 7.420673847198486, "learning_rate": 2.2967449464922714e-05, "loss": 2.2803, "step": 291000 }, { "epoch": 10.831599286563614, "grad_norm": 7.08470344543457, "learning_rate": 2.2921001783590965e-05, "loss": 2.2897, "step": 291500 }, { "epoch": 10.850178359096313, "grad_norm": 7.092275142669678, "learning_rate": 2.2874554102259215e-05, "loss": 2.2842, "step": 292000 }, { "epoch": 10.868757431629014, "grad_norm": 6.814739227294922, "learning_rate": 2.282810642092747e-05, "loss": 2.2637, "step": 292500 }, { "epoch": 10.887336504161713, "grad_norm": 6.778537750244141, "learning_rate": 2.278165873959572e-05, "loss": 2.2802, "step": 293000 }, { "epoch": 10.905915576694412, "grad_norm": 6.529074668884277, "learning_rate": 2.2735211058263973e-05, "loss": 2.282, "step": 293500 }, { "epoch": 10.92449464922711, "grad_norm": 7.486764430999756, "learning_rate": 2.2688763376932224e-05, "loss": 2.2964, "step": 294000 }, { "epoch": 10.94307372175981, "grad_norm": 9.576150894165039, "learning_rate": 2.2642315695600478e-05, "loss": 2.2853, "step": 294500 }, { "epoch": 10.961652794292508, "grad_norm": 7.3996429443359375, "learning_rate": 2.2595868014268728e-05, "loss": 2.278, "step": 295000 }, { "epoch": 10.980231866825209, "grad_norm": 6.478265762329102, "learning_rate": 2.254942033293698e-05, "loss": 2.2857, "step": 295500 }, { "epoch": 10.998810939357908, "grad_norm": 7.264919757843018, "learning_rate": 2.2502972651605233e-05, "loss": 2.2638, "step": 296000 }, { "epoch": 11.017390011890607, "grad_norm": 6.449435234069824, "learning_rate": 2.2456524970273486e-05, "loss": 2.2538, "step": 296500 }, { "epoch": 11.035969084423305, "grad_norm": 8.838685035705566, "learning_rate": 2.2410077288941737e-05, "loss": 2.2658, "step": 297000 }, { "epoch": 11.054548156956004, "grad_norm": 7.12150764465332, "learning_rate": 2.236362960760999e-05, "loss": 2.2582, "step": 297500 }, { "epoch": 11.073127229488703, "grad_norm": 7.337321758270264, "learning_rate": 2.231718192627824e-05, "loss": 2.2531, "step": 298000 }, { "epoch": 11.091706302021404, "grad_norm": 7.290600776672363, "learning_rate": 2.2270734244946492e-05, "loss": 2.2607, "step": 298500 }, { "epoch": 11.110285374554103, "grad_norm": 6.834112644195557, "learning_rate": 2.2224286563614746e-05, "loss": 2.2593, "step": 299000 }, { "epoch": 11.128864447086801, "grad_norm": 7.174058437347412, "learning_rate": 2.2177838882282996e-05, "loss": 2.2584, "step": 299500 }, { "epoch": 11.1474435196195, "grad_norm": 6.08710241317749, "learning_rate": 2.213139120095125e-05, "loss": 2.2572, "step": 300000 }, { "epoch": 11.1660225921522, "grad_norm": 7.66245174407959, "learning_rate": 2.2084943519619504e-05, "loss": 2.2597, "step": 300500 }, { "epoch": 11.1846016646849, "grad_norm": 6.607715606689453, "learning_rate": 2.2038495838287754e-05, "loss": 2.2383, "step": 301000 }, { "epoch": 11.203180737217599, "grad_norm": 6.562816143035889, "learning_rate": 2.1992048156956005e-05, "loss": 2.2497, "step": 301500 }, { "epoch": 11.221759809750298, "grad_norm": 6.655299186706543, "learning_rate": 2.194560047562426e-05, "loss": 2.2628, "step": 302000 }, { "epoch": 11.240338882282996, "grad_norm": 6.629017353057861, "learning_rate": 2.189915279429251e-05, "loss": 2.2568, "step": 302500 }, { "epoch": 11.258917954815695, "grad_norm": 7.567939281463623, "learning_rate": 2.185270511296076e-05, "loss": 2.2732, "step": 303000 }, { "epoch": 11.277497027348394, "grad_norm": 8.384344100952148, "learning_rate": 2.1806257431629014e-05, "loss": 2.2702, "step": 303500 }, { "epoch": 11.296076099881095, "grad_norm": 7.6042914390563965, "learning_rate": 2.1759809750297268e-05, "loss": 2.2671, "step": 304000 }, { "epoch": 11.314655172413794, "grad_norm": 6.45172643661499, "learning_rate": 2.1713362068965518e-05, "loss": 2.2531, "step": 304500 }, { "epoch": 11.333234244946492, "grad_norm": 6.863234519958496, "learning_rate": 2.1666914387633772e-05, "loss": 2.2627, "step": 305000 }, { "epoch": 11.351813317479191, "grad_norm": 8.442804336547852, "learning_rate": 2.1620466706302022e-05, "loss": 2.2551, "step": 305500 }, { "epoch": 11.37039239001189, "grad_norm": 8.2174072265625, "learning_rate": 2.1574019024970273e-05, "loss": 2.2559, "step": 306000 }, { "epoch": 11.388971462544589, "grad_norm": 7.479830265045166, "learning_rate": 2.1527571343638527e-05, "loss": 2.257, "step": 306500 }, { "epoch": 11.40755053507729, "grad_norm": 6.4119744300842285, "learning_rate": 2.148112366230678e-05, "loss": 2.2577, "step": 307000 }, { "epoch": 11.426129607609989, "grad_norm": 7.141465187072754, "learning_rate": 2.143467598097503e-05, "loss": 2.2583, "step": 307500 }, { "epoch": 11.444708680142687, "grad_norm": 7.255865097045898, "learning_rate": 2.138822829964328e-05, "loss": 2.2549, "step": 308000 }, { "epoch": 11.463287752675386, "grad_norm": 6.533185958862305, "learning_rate": 2.1341780618311535e-05, "loss": 2.2563, "step": 308500 }, { "epoch": 11.481866825208085, "grad_norm": 5.948304176330566, "learning_rate": 2.1295332936979786e-05, "loss": 2.2629, "step": 309000 }, { "epoch": 11.500445897740784, "grad_norm": 7.485329627990723, "learning_rate": 2.1248885255648036e-05, "loss": 2.243, "step": 309500 }, { "epoch": 11.519024970273485, "grad_norm": 7.400222301483154, "learning_rate": 2.120243757431629e-05, "loss": 2.2436, "step": 310000 }, { "epoch": 11.537604042806183, "grad_norm": 7.361048221588135, "learning_rate": 2.1155989892984544e-05, "loss": 2.2597, "step": 310500 }, { "epoch": 11.556183115338882, "grad_norm": 7.483823776245117, "learning_rate": 2.1109542211652795e-05, "loss": 2.2643, "step": 311000 }, { "epoch": 11.574762187871581, "grad_norm": 7.027825832366943, "learning_rate": 2.106309453032105e-05, "loss": 2.2455, "step": 311500 }, { "epoch": 11.59334126040428, "grad_norm": 6.856015205383301, "learning_rate": 2.10166468489893e-05, "loss": 2.2351, "step": 312000 }, { "epoch": 11.611920332936979, "grad_norm": 7.20182991027832, "learning_rate": 2.097019916765755e-05, "loss": 2.2472, "step": 312500 }, { "epoch": 11.63049940546968, "grad_norm": 6.145348072052002, "learning_rate": 2.0923751486325803e-05, "loss": 2.2493, "step": 313000 }, { "epoch": 11.649078478002378, "grad_norm": 7.6849517822265625, "learning_rate": 2.0877303804994054e-05, "loss": 2.2468, "step": 313500 }, { "epoch": 11.667657550535077, "grad_norm": 7.373369216918945, "learning_rate": 2.0830856123662308e-05, "loss": 2.252, "step": 314000 }, { "epoch": 11.686236623067776, "grad_norm": 7.262668132781982, "learning_rate": 2.0784408442330562e-05, "loss": 2.2411, "step": 314500 }, { "epoch": 11.704815695600475, "grad_norm": 6.475069999694824, "learning_rate": 2.0737960760998812e-05, "loss": 2.245, "step": 315000 }, { "epoch": 11.723394768133176, "grad_norm": 6.434516906738281, "learning_rate": 2.0691513079667063e-05, "loss": 2.2459, "step": 315500 }, { "epoch": 11.741973840665874, "grad_norm": 7.697376251220703, "learning_rate": 2.0645065398335317e-05, "loss": 2.2556, "step": 316000 }, { "epoch": 11.760552913198573, "grad_norm": 7.839350700378418, "learning_rate": 2.0598617717003567e-05, "loss": 2.2431, "step": 316500 }, { "epoch": 11.779131985731272, "grad_norm": 7.546802997589111, "learning_rate": 2.055217003567182e-05, "loss": 2.2627, "step": 317000 }, { "epoch": 11.797711058263971, "grad_norm": 6.828023910522461, "learning_rate": 2.0505722354340075e-05, "loss": 2.2353, "step": 317500 }, { "epoch": 11.81629013079667, "grad_norm": 6.4239935874938965, "learning_rate": 2.0459274673008325e-05, "loss": 2.2472, "step": 318000 }, { "epoch": 11.83486920332937, "grad_norm": 6.941580772399902, "learning_rate": 2.0412826991676576e-05, "loss": 2.2423, "step": 318500 }, { "epoch": 11.85344827586207, "grad_norm": 7.385081768035889, "learning_rate": 2.036637931034483e-05, "loss": 2.2332, "step": 319000 }, { "epoch": 11.872027348394768, "grad_norm": 7.3545613288879395, "learning_rate": 2.031993162901308e-05, "loss": 2.2592, "step": 319500 }, { "epoch": 11.890606420927467, "grad_norm": 6.4375104904174805, "learning_rate": 2.027348394768133e-05, "loss": 2.2352, "step": 320000 }, { "epoch": 11.909185493460166, "grad_norm": 6.863650798797607, "learning_rate": 2.0227036266349585e-05, "loss": 2.2622, "step": 320500 }, { "epoch": 11.927764565992865, "grad_norm": 6.5175275802612305, "learning_rate": 2.018058858501784e-05, "loss": 2.2421, "step": 321000 }, { "epoch": 11.946343638525565, "grad_norm": 7.415239334106445, "learning_rate": 2.013414090368609e-05, "loss": 2.2483, "step": 321500 }, { "epoch": 11.964922711058264, "grad_norm": 8.416884422302246, "learning_rate": 2.0087693222354343e-05, "loss": 2.245, "step": 322000 }, { "epoch": 11.983501783590963, "grad_norm": 6.286489009857178, "learning_rate": 2.0041245541022593e-05, "loss": 2.2409, "step": 322500 }, { "epoch": 12.002080856123662, "grad_norm": 7.4863080978393555, "learning_rate": 1.9994797859690844e-05, "loss": 2.2337, "step": 323000 }, { "epoch": 12.020659928656361, "grad_norm": 6.175674915313721, "learning_rate": 1.9948350178359094e-05, "loss": 2.2054, "step": 323500 }, { "epoch": 12.03923900118906, "grad_norm": 7.600936412811279, "learning_rate": 1.9901902497027348e-05, "loss": 2.2135, "step": 324000 }, { "epoch": 12.05781807372176, "grad_norm": 7.510547637939453, "learning_rate": 1.9855454815695602e-05, "loss": 2.231, "step": 324500 }, { "epoch": 12.07639714625446, "grad_norm": 6.505836009979248, "learning_rate": 1.9809007134363853e-05, "loss": 2.2123, "step": 325000 }, { "epoch": 12.094976218787158, "grad_norm": 7.495330333709717, "learning_rate": 1.9762559453032106e-05, "loss": 2.2048, "step": 325500 }, { "epoch": 12.113555291319857, "grad_norm": 7.062661170959473, "learning_rate": 1.9716111771700357e-05, "loss": 2.2055, "step": 326000 }, { "epoch": 12.132134363852556, "grad_norm": 7.220265865325928, "learning_rate": 1.9669664090368607e-05, "loss": 2.2333, "step": 326500 }, { "epoch": 12.150713436385256, "grad_norm": 6.432553768157959, "learning_rate": 1.962321640903686e-05, "loss": 2.2274, "step": 327000 }, { "epoch": 12.169292508917955, "grad_norm": 7.610962390899658, "learning_rate": 1.9576768727705115e-05, "loss": 2.2108, "step": 327500 }, { "epoch": 12.187871581450654, "grad_norm": 8.169533729553223, "learning_rate": 1.9530321046373366e-05, "loss": 2.1948, "step": 328000 }, { "epoch": 12.206450653983353, "grad_norm": 6.529592037200928, "learning_rate": 1.948387336504162e-05, "loss": 2.2195, "step": 328500 }, { "epoch": 12.225029726516052, "grad_norm": 7.463806629180908, "learning_rate": 1.943742568370987e-05, "loss": 2.221, "step": 329000 }, { "epoch": 12.24360879904875, "grad_norm": 7.339646816253662, "learning_rate": 1.939097800237812e-05, "loss": 2.2126, "step": 329500 }, { "epoch": 12.262187871581451, "grad_norm": 7.518458366394043, "learning_rate": 1.9344530321046374e-05, "loss": 2.2102, "step": 330000 }, { "epoch": 12.28076694411415, "grad_norm": 7.828365325927734, "learning_rate": 1.9298082639714625e-05, "loss": 2.2335, "step": 330500 }, { "epoch": 12.29934601664685, "grad_norm": 7.198127269744873, "learning_rate": 1.925163495838288e-05, "loss": 2.2214, "step": 331000 }, { "epoch": 12.317925089179548, "grad_norm": 6.6039533615112305, "learning_rate": 1.9205187277051133e-05, "loss": 2.1985, "step": 331500 }, { "epoch": 12.336504161712247, "grad_norm": 7.200562477111816, "learning_rate": 1.9158739595719383e-05, "loss": 2.211, "step": 332000 }, { "epoch": 12.355083234244946, "grad_norm": 7.252729892730713, "learning_rate": 1.9112291914387634e-05, "loss": 2.2302, "step": 332500 }, { "epoch": 12.373662306777646, "grad_norm": 7.972862243652344, "learning_rate": 1.9065844233055888e-05, "loss": 2.2095, "step": 333000 }, { "epoch": 12.392241379310345, "grad_norm": 8.594975471496582, "learning_rate": 1.9019396551724138e-05, "loss": 2.2204, "step": 333500 }, { "epoch": 12.410820451843044, "grad_norm": 7.73285436630249, "learning_rate": 1.897294887039239e-05, "loss": 2.2174, "step": 334000 }, { "epoch": 12.429399524375743, "grad_norm": 6.429736614227295, "learning_rate": 1.8926501189060646e-05, "loss": 2.2236, "step": 334500 }, { "epoch": 12.447978596908442, "grad_norm": 6.68847131729126, "learning_rate": 1.8880053507728896e-05, "loss": 2.2293, "step": 335000 }, { "epoch": 12.46655766944114, "grad_norm": 6.902133464813232, "learning_rate": 1.8833605826397147e-05, "loss": 2.215, "step": 335500 }, { "epoch": 12.485136741973841, "grad_norm": 6.436554908752441, "learning_rate": 1.87871581450654e-05, "loss": 2.2269, "step": 336000 }, { "epoch": 12.50371581450654, "grad_norm": 6.80860710144043, "learning_rate": 1.874071046373365e-05, "loss": 2.2223, "step": 336500 }, { "epoch": 12.522294887039239, "grad_norm": 7.977982044219971, "learning_rate": 1.86942627824019e-05, "loss": 2.2155, "step": 337000 }, { "epoch": 12.540873959571938, "grad_norm": 7.9569478034973145, "learning_rate": 1.8647815101070156e-05, "loss": 2.2305, "step": 337500 }, { "epoch": 12.559453032104637, "grad_norm": 6.445404529571533, "learning_rate": 1.860136741973841e-05, "loss": 2.2075, "step": 338000 }, { "epoch": 12.578032104637337, "grad_norm": 7.153224468231201, "learning_rate": 1.855491973840666e-05, "loss": 2.2246, "step": 338500 }, { "epoch": 12.596611177170036, "grad_norm": 7.287299633026123, "learning_rate": 1.850847205707491e-05, "loss": 2.1913, "step": 339000 }, { "epoch": 12.615190249702735, "grad_norm": 6.6666107177734375, "learning_rate": 1.8462024375743164e-05, "loss": 2.2267, "step": 339500 }, { "epoch": 12.633769322235434, "grad_norm": 7.024231433868408, "learning_rate": 1.8415576694411415e-05, "loss": 2.2106, "step": 340000 }, { "epoch": 12.652348394768133, "grad_norm": 6.549313068389893, "learning_rate": 1.8369129013079665e-05, "loss": 2.2208, "step": 340500 }, { "epoch": 12.670927467300832, "grad_norm": 6.641164302825928, "learning_rate": 1.832268133174792e-05, "loss": 2.2157, "step": 341000 }, { "epoch": 12.689506539833532, "grad_norm": 7.615879535675049, "learning_rate": 1.8276233650416173e-05, "loss": 2.217, "step": 341500 }, { "epoch": 12.708085612366231, "grad_norm": 7.870852470397949, "learning_rate": 1.8229785969084424e-05, "loss": 2.2254, "step": 342000 }, { "epoch": 12.72666468489893, "grad_norm": 5.989630222320557, "learning_rate": 1.8183338287752677e-05, "loss": 2.2129, "step": 342500 }, { "epoch": 12.745243757431629, "grad_norm": 6.8082594871521, "learning_rate": 1.8136890606420928e-05, "loss": 2.2229, "step": 343000 }, { "epoch": 12.763822829964328, "grad_norm": 7.244877338409424, "learning_rate": 1.809044292508918e-05, "loss": 2.2127, "step": 343500 }, { "epoch": 12.782401902497027, "grad_norm": 7.6857008934021, "learning_rate": 1.8043995243757432e-05, "loss": 2.2235, "step": 344000 }, { "epoch": 12.800980975029727, "grad_norm": 7.00359582901001, "learning_rate": 1.7997547562425686e-05, "loss": 2.2211, "step": 344500 }, { "epoch": 12.819560047562426, "grad_norm": 7.0071187019348145, "learning_rate": 1.7951099881093937e-05, "loss": 2.2182, "step": 345000 }, { "epoch": 12.838139120095125, "grad_norm": 6.9319634437561035, "learning_rate": 1.790465219976219e-05, "loss": 2.2043, "step": 345500 }, { "epoch": 12.856718192627824, "grad_norm": 6.487482070922852, "learning_rate": 1.785820451843044e-05, "loss": 2.2139, "step": 346000 }, { "epoch": 12.875297265160523, "grad_norm": 7.508727550506592, "learning_rate": 1.781175683709869e-05, "loss": 2.2243, "step": 346500 }, { "epoch": 12.893876337693222, "grad_norm": 6.555574893951416, "learning_rate": 1.7765309155766945e-05, "loss": 2.2167, "step": 347000 }, { "epoch": 12.912455410225922, "grad_norm": 7.410988807678223, "learning_rate": 1.7718861474435196e-05, "loss": 2.2158, "step": 347500 }, { "epoch": 12.931034482758621, "grad_norm": 8.217428207397461, "learning_rate": 1.767241379310345e-05, "loss": 2.2031, "step": 348000 }, { "epoch": 12.94961355529132, "grad_norm": 6.6040754318237305, "learning_rate": 1.7625966111771704e-05, "loss": 2.1866, "step": 348500 }, { "epoch": 12.968192627824019, "grad_norm": 6.99837064743042, "learning_rate": 1.7579518430439954e-05, "loss": 2.1899, "step": 349000 }, { "epoch": 12.986771700356718, "grad_norm": 6.531412124633789, "learning_rate": 1.7533070749108205e-05, "loss": 2.2244, "step": 349500 }, { "epoch": 13.005350772889418, "grad_norm": 7.704728126525879, "learning_rate": 1.748662306777646e-05, "loss": 2.18, "step": 350000 }, { "epoch": 13.023929845422117, "grad_norm": 6.77532434463501, "learning_rate": 1.744017538644471e-05, "loss": 2.1789, "step": 350500 }, { "epoch": 13.042508917954816, "grad_norm": 6.446128845214844, "learning_rate": 1.739372770511296e-05, "loss": 2.1707, "step": 351000 }, { "epoch": 13.061087990487515, "grad_norm": 7.576733589172363, "learning_rate": 1.7347280023781213e-05, "loss": 2.1768, "step": 351500 }, { "epoch": 13.079667063020214, "grad_norm": 7.239291191101074, "learning_rate": 1.7300832342449467e-05, "loss": 2.2011, "step": 352000 }, { "epoch": 13.098246135552913, "grad_norm": 6.936691761016846, "learning_rate": 1.7254384661117718e-05, "loss": 2.1767, "step": 352500 }, { "epoch": 13.116825208085613, "grad_norm": 7.205715179443359, "learning_rate": 1.7207936979785968e-05, "loss": 2.1581, "step": 353000 }, { "epoch": 13.135404280618312, "grad_norm": 6.61326789855957, "learning_rate": 1.7161489298454222e-05, "loss": 2.185, "step": 353500 }, { "epoch": 13.15398335315101, "grad_norm": 7.715660572052002, "learning_rate": 1.7115041617122473e-05, "loss": 2.1924, "step": 354000 }, { "epoch": 13.17256242568371, "grad_norm": 6.543544769287109, "learning_rate": 1.7068593935790726e-05, "loss": 2.1998, "step": 354500 }, { "epoch": 13.191141498216409, "grad_norm": 8.281086921691895, "learning_rate": 1.702214625445898e-05, "loss": 2.1787, "step": 355000 }, { "epoch": 13.209720570749107, "grad_norm": 6.323915481567383, "learning_rate": 1.697569857312723e-05, "loss": 2.1834, "step": 355500 }, { "epoch": 13.228299643281808, "grad_norm": 8.45340347290039, "learning_rate": 1.692925089179548e-05, "loss": 2.1806, "step": 356000 }, { "epoch": 13.246878715814507, "grad_norm": 8.1563720703125, "learning_rate": 1.6882803210463735e-05, "loss": 2.1708, "step": 356500 }, { "epoch": 13.265457788347206, "grad_norm": 7.083395481109619, "learning_rate": 1.6836355529131986e-05, "loss": 2.1866, "step": 357000 }, { "epoch": 13.284036860879905, "grad_norm": 6.55299186706543, "learning_rate": 1.6789907847800236e-05, "loss": 2.1723, "step": 357500 }, { "epoch": 13.302615933412604, "grad_norm": 6.710261821746826, "learning_rate": 1.674346016646849e-05, "loss": 2.1977, "step": 358000 }, { "epoch": 13.321195005945302, "grad_norm": 7.0249738693237305, "learning_rate": 1.6697012485136744e-05, "loss": 2.2007, "step": 358500 }, { "epoch": 13.339774078478003, "grad_norm": 7.835285663604736, "learning_rate": 1.6650564803804994e-05, "loss": 2.1959, "step": 359000 }, { "epoch": 13.358353151010702, "grad_norm": 8.400995254516602, "learning_rate": 1.660411712247325e-05, "loss": 2.1991, "step": 359500 }, { "epoch": 13.3769322235434, "grad_norm": 6.235854148864746, "learning_rate": 1.65576694411415e-05, "loss": 2.174, "step": 360000 }, { "epoch": 13.3955112960761, "grad_norm": 6.741766929626465, "learning_rate": 1.651122175980975e-05, "loss": 2.1777, "step": 360500 }, { "epoch": 13.414090368608798, "grad_norm": 8.243950843811035, "learning_rate": 1.6464774078478003e-05, "loss": 2.1841, "step": 361000 }, { "epoch": 13.432669441141499, "grad_norm": 6.43676233291626, "learning_rate": 1.6418326397146254e-05, "loss": 2.1769, "step": 361500 }, { "epoch": 13.451248513674198, "grad_norm": 6.800743579864502, "learning_rate": 1.6371878715814508e-05, "loss": 2.2043, "step": 362000 }, { "epoch": 13.469827586206897, "grad_norm": 6.082602500915527, "learning_rate": 1.632543103448276e-05, "loss": 2.167, "step": 362500 }, { "epoch": 13.488406658739596, "grad_norm": 7.768115520477295, "learning_rate": 1.6278983353151012e-05, "loss": 2.1623, "step": 363000 }, { "epoch": 13.506985731272295, "grad_norm": 6.893867492675781, "learning_rate": 1.6232535671819262e-05, "loss": 2.1835, "step": 363500 }, { "epoch": 13.525564803804993, "grad_norm": 6.749509811401367, "learning_rate": 1.6186087990487516e-05, "loss": 2.1659, "step": 364000 }, { "epoch": 13.544143876337694, "grad_norm": 6.05668306350708, "learning_rate": 1.6139640309155767e-05, "loss": 2.1703, "step": 364500 }, { "epoch": 13.562722948870393, "grad_norm": 7.0912251472473145, "learning_rate": 1.609319262782402e-05, "loss": 2.1919, "step": 365000 }, { "epoch": 13.581302021403092, "grad_norm": 6.6050310134887695, "learning_rate": 1.6046744946492275e-05, "loss": 2.1756, "step": 365500 }, { "epoch": 13.59988109393579, "grad_norm": 6.950946807861328, "learning_rate": 1.6000297265160525e-05, "loss": 2.1825, "step": 366000 }, { "epoch": 13.61846016646849, "grad_norm": 7.240453243255615, "learning_rate": 1.5953849583828776e-05, "loss": 2.1837, "step": 366500 }, { "epoch": 13.637039239001188, "grad_norm": 8.0787935256958, "learning_rate": 1.590740190249703e-05, "loss": 2.1747, "step": 367000 }, { "epoch": 13.655618311533889, "grad_norm": 6.953646659851074, "learning_rate": 1.586095422116528e-05, "loss": 2.1821, "step": 367500 }, { "epoch": 13.674197384066588, "grad_norm": 6.981358051300049, "learning_rate": 1.581450653983353e-05, "loss": 2.1751, "step": 368000 }, { "epoch": 13.692776456599287, "grad_norm": 7.580711841583252, "learning_rate": 1.5768058858501784e-05, "loss": 2.1685, "step": 368500 }, { "epoch": 13.711355529131986, "grad_norm": 7.360109806060791, "learning_rate": 1.5721611177170038e-05, "loss": 2.1566, "step": 369000 }, { "epoch": 13.729934601664684, "grad_norm": 6.589022636413574, "learning_rate": 1.567516349583829e-05, "loss": 2.1725, "step": 369500 }, { "epoch": 13.748513674197383, "grad_norm": 7.376802444458008, "learning_rate": 1.562871581450654e-05, "loss": 2.1814, "step": 370000 }, { "epoch": 13.767092746730084, "grad_norm": 7.36546516418457, "learning_rate": 1.5582268133174793e-05, "loss": 2.1824, "step": 370500 }, { "epoch": 13.785671819262783, "grad_norm": 7.832765579223633, "learning_rate": 1.5535820451843044e-05, "loss": 2.1651, "step": 371000 }, { "epoch": 13.804250891795482, "grad_norm": 7.414605617523193, "learning_rate": 1.5489372770511294e-05, "loss": 2.1488, "step": 371500 }, { "epoch": 13.82282996432818, "grad_norm": 7.148501873016357, "learning_rate": 1.5442925089179548e-05, "loss": 2.185, "step": 372000 }, { "epoch": 13.84140903686088, "grad_norm": 6.733073711395264, "learning_rate": 1.5396477407847802e-05, "loss": 2.1645, "step": 372500 }, { "epoch": 13.85998810939358, "grad_norm": 7.812681198120117, "learning_rate": 1.5350029726516052e-05, "loss": 2.1836, "step": 373000 }, { "epoch": 13.878567181926279, "grad_norm": 6.853206634521484, "learning_rate": 1.5303582045184306e-05, "loss": 2.1778, "step": 373500 }, { "epoch": 13.897146254458978, "grad_norm": 7.234543323516846, "learning_rate": 1.5257134363852557e-05, "loss": 2.1759, "step": 374000 }, { "epoch": 13.915725326991677, "grad_norm": 7.433253765106201, "learning_rate": 1.5210686682520809e-05, "loss": 2.1722, "step": 374500 }, { "epoch": 13.934304399524375, "grad_norm": 7.073111534118652, "learning_rate": 1.5164239001189063e-05, "loss": 2.1855, "step": 375000 }, { "epoch": 13.952883472057074, "grad_norm": 7.280003547668457, "learning_rate": 1.5117791319857313e-05, "loss": 2.1819, "step": 375500 }, { "epoch": 13.971462544589775, "grad_norm": 6.7823991775512695, "learning_rate": 1.5071343638525565e-05, "loss": 2.1585, "step": 376000 }, { "epoch": 13.990041617122474, "grad_norm": 7.181284427642822, "learning_rate": 1.502489595719382e-05, "loss": 2.1689, "step": 376500 }, { "epoch": 14.008620689655173, "grad_norm": 6.957113265991211, "learning_rate": 1.497844827586207e-05, "loss": 2.1754, "step": 377000 }, { "epoch": 14.027199762187871, "grad_norm": 7.111293315887451, "learning_rate": 1.493200059453032e-05, "loss": 2.1651, "step": 377500 }, { "epoch": 14.04577883472057, "grad_norm": 7.025313854217529, "learning_rate": 1.4885552913198574e-05, "loss": 2.1458, "step": 378000 }, { "epoch": 14.06435790725327, "grad_norm": 6.963667869567871, "learning_rate": 1.4839105231866826e-05, "loss": 2.1456, "step": 378500 }, { "epoch": 14.08293697978597, "grad_norm": 7.611172199249268, "learning_rate": 1.4792657550535077e-05, "loss": 2.158, "step": 379000 }, { "epoch": 14.101516052318669, "grad_norm": 6.874037265777588, "learning_rate": 1.474620986920333e-05, "loss": 2.1446, "step": 379500 }, { "epoch": 14.120095124851368, "grad_norm": 7.512300491333008, "learning_rate": 1.4699762187871583e-05, "loss": 2.1528, "step": 380000 }, { "epoch": 14.138674197384066, "grad_norm": 6.693312168121338, "learning_rate": 1.4653314506539833e-05, "loss": 2.1563, "step": 380500 }, { "epoch": 14.157253269916765, "grad_norm": 6.6438164710998535, "learning_rate": 1.4606866825208087e-05, "loss": 2.1383, "step": 381000 }, { "epoch": 14.175832342449464, "grad_norm": 7.537757873535156, "learning_rate": 1.456041914387634e-05, "loss": 2.1554, "step": 381500 }, { "epoch": 14.194411414982165, "grad_norm": 8.159100532531738, "learning_rate": 1.451397146254459e-05, "loss": 2.1538, "step": 382000 }, { "epoch": 14.212990487514864, "grad_norm": 7.427910327911377, "learning_rate": 1.4467523781212844e-05, "loss": 2.1623, "step": 382500 }, { "epoch": 14.231569560047562, "grad_norm": 7.805336952209473, "learning_rate": 1.4421076099881094e-05, "loss": 2.1645, "step": 383000 }, { "epoch": 14.250148632580261, "grad_norm": 6.669980525970459, "learning_rate": 1.4374628418549347e-05, "loss": 2.1525, "step": 383500 }, { "epoch": 14.26872770511296, "grad_norm": 7.358639240264893, "learning_rate": 1.4328180737217597e-05, "loss": 2.1542, "step": 384000 }, { "epoch": 14.28730677764566, "grad_norm": 7.103815078735352, "learning_rate": 1.4281733055885851e-05, "loss": 2.1567, "step": 384500 }, { "epoch": 14.30588585017836, "grad_norm": 7.218321800231934, "learning_rate": 1.4235285374554103e-05, "loss": 2.1569, "step": 385000 }, { "epoch": 14.324464922711059, "grad_norm": 7.941781520843506, "learning_rate": 1.4188837693222354e-05, "loss": 2.1554, "step": 385500 }, { "epoch": 14.343043995243757, "grad_norm": 8.86156940460205, "learning_rate": 1.4142390011890607e-05, "loss": 2.1649, "step": 386000 }, { "epoch": 14.361623067776456, "grad_norm": 6.904116153717041, "learning_rate": 1.409594233055886e-05, "loss": 2.1486, "step": 386500 }, { "epoch": 14.380202140309155, "grad_norm": 6.8697943687438965, "learning_rate": 1.404949464922711e-05, "loss": 2.1686, "step": 387000 }, { "epoch": 14.398781212841856, "grad_norm": 7.536423683166504, "learning_rate": 1.4003046967895364e-05, "loss": 2.1534, "step": 387500 }, { "epoch": 14.417360285374555, "grad_norm": 6.2832465171813965, "learning_rate": 1.3956599286563615e-05, "loss": 2.1638, "step": 388000 }, { "epoch": 14.435939357907253, "grad_norm": 7.5254926681518555, "learning_rate": 1.3910151605231867e-05, "loss": 2.1611, "step": 388500 }, { "epoch": 14.454518430439952, "grad_norm": 6.102006912231445, "learning_rate": 1.386370392390012e-05, "loss": 2.1481, "step": 389000 }, { "epoch": 14.473097502972651, "grad_norm": 6.829434871673584, "learning_rate": 1.3817256242568371e-05, "loss": 2.1309, "step": 389500 }, { "epoch": 14.49167657550535, "grad_norm": 7.072176456451416, "learning_rate": 1.3770808561236623e-05, "loss": 2.1493, "step": 390000 }, { "epoch": 14.51025564803805, "grad_norm": 8.32613754272461, "learning_rate": 1.3724360879904877e-05, "loss": 2.128, "step": 390500 }, { "epoch": 14.52883472057075, "grad_norm": 7.587469577789307, "learning_rate": 1.3677913198573128e-05, "loss": 2.1446, "step": 391000 }, { "epoch": 14.547413793103448, "grad_norm": 7.003942966461182, "learning_rate": 1.363146551724138e-05, "loss": 2.1493, "step": 391500 }, { "epoch": 14.565992865636147, "grad_norm": 6.587801456451416, "learning_rate": 1.3585017835909634e-05, "loss": 2.1554, "step": 392000 }, { "epoch": 14.584571938168846, "grad_norm": 6.796844005584717, "learning_rate": 1.3538570154577884e-05, "loss": 2.1699, "step": 392500 }, { "epoch": 14.603151010701545, "grad_norm": 6.230968952178955, "learning_rate": 1.3492122473246135e-05, "loss": 2.1514, "step": 393000 }, { "epoch": 14.621730083234246, "grad_norm": 7.986715793609619, "learning_rate": 1.3445674791914389e-05, "loss": 2.1439, "step": 393500 }, { "epoch": 14.640309155766944, "grad_norm": 6.953087329864502, "learning_rate": 1.339922711058264e-05, "loss": 2.1359, "step": 394000 }, { "epoch": 14.658888228299643, "grad_norm": 6.939476490020752, "learning_rate": 1.3352779429250891e-05, "loss": 2.1438, "step": 394500 }, { "epoch": 14.677467300832342, "grad_norm": 7.4189229011535645, "learning_rate": 1.3306331747919145e-05, "loss": 2.1494, "step": 395000 }, { "epoch": 14.696046373365041, "grad_norm": 6.914766788482666, "learning_rate": 1.3259884066587397e-05, "loss": 2.1336, "step": 395500 }, { "epoch": 14.714625445897742, "grad_norm": 6.602614402770996, "learning_rate": 1.3213436385255648e-05, "loss": 2.154, "step": 396000 }, { "epoch": 14.73320451843044, "grad_norm": 7.446470260620117, "learning_rate": 1.3166988703923902e-05, "loss": 2.1425, "step": 396500 }, { "epoch": 14.75178359096314, "grad_norm": 6.55057430267334, "learning_rate": 1.3120541022592154e-05, "loss": 2.1403, "step": 397000 }, { "epoch": 14.770362663495838, "grad_norm": 6.798906326293945, "learning_rate": 1.3074093341260404e-05, "loss": 2.1396, "step": 397500 }, { "epoch": 14.788941736028537, "grad_norm": 7.93524169921875, "learning_rate": 1.3027645659928655e-05, "loss": 2.1665, "step": 398000 }, { "epoch": 14.807520808561236, "grad_norm": 8.041824340820312, "learning_rate": 1.2981197978596909e-05, "loss": 2.1377, "step": 398500 }, { "epoch": 14.826099881093937, "grad_norm": 6.651689529418945, "learning_rate": 1.2934750297265161e-05, "loss": 2.1461, "step": 399000 }, { "epoch": 14.844678953626635, "grad_norm": 6.821606636047363, "learning_rate": 1.2888302615933411e-05, "loss": 2.158, "step": 399500 }, { "epoch": 14.863258026159334, "grad_norm": 8.040721893310547, "learning_rate": 1.2841854934601665e-05, "loss": 2.1466, "step": 400000 }, { "epoch": 14.881837098692033, "grad_norm": 7.286508083343506, "learning_rate": 1.2795407253269918e-05, "loss": 2.1428, "step": 400500 }, { "epoch": 14.900416171224732, "grad_norm": 8.08362102508545, "learning_rate": 1.2748959571938168e-05, "loss": 2.1515, "step": 401000 }, { "epoch": 14.918995243757431, "grad_norm": 8.438191413879395, "learning_rate": 1.2702511890606422e-05, "loss": 2.1446, "step": 401500 }, { "epoch": 14.937574316290132, "grad_norm": 7.372959136962891, "learning_rate": 1.2656064209274674e-05, "loss": 2.1496, "step": 402000 }, { "epoch": 14.95615338882283, "grad_norm": 7.080979347229004, "learning_rate": 1.2609616527942925e-05, "loss": 2.1579, "step": 402500 }, { "epoch": 14.97473246135553, "grad_norm": 7.254255294799805, "learning_rate": 1.2563168846611178e-05, "loss": 2.1414, "step": 403000 }, { "epoch": 14.993311533888228, "grad_norm": 7.761992931365967, "learning_rate": 1.2516721165279429e-05, "loss": 2.15, "step": 403500 }, { "epoch": 15.011890606420927, "grad_norm": 7.0644049644470215, "learning_rate": 1.2470273483947683e-05, "loss": 2.1474, "step": 404000 }, { "epoch": 15.030469678953626, "grad_norm": 8.067272186279297, "learning_rate": 1.2423825802615933e-05, "loss": 2.1103, "step": 404500 }, { "epoch": 15.049048751486326, "grad_norm": 6.896698474884033, "learning_rate": 1.2377378121284185e-05, "loss": 2.1159, "step": 405000 }, { "epoch": 15.067627824019025, "grad_norm": 6.983173370361328, "learning_rate": 1.233093043995244e-05, "loss": 2.1463, "step": 405500 }, { "epoch": 15.086206896551724, "grad_norm": 8.10067367553711, "learning_rate": 1.228448275862069e-05, "loss": 2.1418, "step": 406000 }, { "epoch": 15.104785969084423, "grad_norm": 7.817485332489014, "learning_rate": 1.2238035077288942e-05, "loss": 2.1433, "step": 406500 }, { "epoch": 15.123365041617122, "grad_norm": 7.6188578605651855, "learning_rate": 1.2191587395957194e-05, "loss": 2.1198, "step": 407000 }, { "epoch": 15.14194411414982, "grad_norm": 7.024149417877197, "learning_rate": 1.2145139714625446e-05, "loss": 2.1157, "step": 407500 }, { "epoch": 15.160523186682521, "grad_norm": 6.95907735824585, "learning_rate": 1.2098692033293699e-05, "loss": 2.134, "step": 408000 }, { "epoch": 15.17910225921522, "grad_norm": 6.850398540496826, "learning_rate": 1.205224435196195e-05, "loss": 2.1178, "step": 408500 }, { "epoch": 15.19768133174792, "grad_norm": 7.054015159606934, "learning_rate": 1.2005796670630203e-05, "loss": 2.1353, "step": 409000 }, { "epoch": 15.216260404280618, "grad_norm": 8.049177169799805, "learning_rate": 1.1959348989298455e-05, "loss": 2.1175, "step": 409500 }, { "epoch": 15.234839476813317, "grad_norm": 7.3112568855285645, "learning_rate": 1.1912901307966706e-05, "loss": 2.1269, "step": 410000 }, { "epoch": 15.253418549346017, "grad_norm": 7.102066516876221, "learning_rate": 1.186645362663496e-05, "loss": 2.121, "step": 410500 }, { "epoch": 15.271997621878716, "grad_norm": 7.103978633880615, "learning_rate": 1.1820005945303212e-05, "loss": 2.123, "step": 411000 }, { "epoch": 15.290576694411415, "grad_norm": 7.16837215423584, "learning_rate": 1.1773558263971462e-05, "loss": 2.1555, "step": 411500 }, { "epoch": 15.309155766944114, "grad_norm": 7.387100696563721, "learning_rate": 1.1727110582639714e-05, "loss": 2.1323, "step": 412000 }, { "epoch": 15.327734839476813, "grad_norm": 7.893144607543945, "learning_rate": 1.1680662901307968e-05, "loss": 2.1358, "step": 412500 }, { "epoch": 15.346313912009512, "grad_norm": 7.737049579620361, "learning_rate": 1.1634215219976219e-05, "loss": 2.122, "step": 413000 }, { "epoch": 15.364892984542212, "grad_norm": 7.758161544799805, "learning_rate": 1.1587767538644471e-05, "loss": 2.1262, "step": 413500 }, { "epoch": 15.383472057074911, "grad_norm": 7.8588666915893555, "learning_rate": 1.1541319857312725e-05, "loss": 2.1202, "step": 414000 }, { "epoch": 15.40205112960761, "grad_norm": 7.353470325469971, "learning_rate": 1.1494872175980975e-05, "loss": 2.1406, "step": 414500 }, { "epoch": 15.420630202140309, "grad_norm": 6.766369819641113, "learning_rate": 1.1448424494649228e-05, "loss": 2.1344, "step": 415000 }, { "epoch": 15.439209274673008, "grad_norm": 7.156630992889404, "learning_rate": 1.140197681331748e-05, "loss": 2.1232, "step": 415500 }, { "epoch": 15.457788347205707, "grad_norm": 7.754790782928467, "learning_rate": 1.1355529131985732e-05, "loss": 2.1154, "step": 416000 }, { "epoch": 15.476367419738407, "grad_norm": 8.716788291931152, "learning_rate": 1.1309081450653984e-05, "loss": 2.1236, "step": 416500 }, { "epoch": 15.494946492271106, "grad_norm": 7.345715522766113, "learning_rate": 1.1262633769322235e-05, "loss": 2.1299, "step": 417000 }, { "epoch": 15.513525564803805, "grad_norm": 7.088531494140625, "learning_rate": 1.1216186087990488e-05, "loss": 2.1168, "step": 417500 }, { "epoch": 15.532104637336504, "grad_norm": 7.417008876800537, "learning_rate": 1.116973840665874e-05, "loss": 2.1247, "step": 418000 }, { "epoch": 15.550683709869203, "grad_norm": 7.3177995681762695, "learning_rate": 1.1123290725326991e-05, "loss": 2.1198, "step": 418500 }, { "epoch": 15.569262782401903, "grad_norm": 6.9706711769104, "learning_rate": 1.1076843043995245e-05, "loss": 2.1228, "step": 419000 }, { "epoch": 15.587841854934602, "grad_norm": 6.97265625, "learning_rate": 1.1030395362663497e-05, "loss": 2.1237, "step": 419500 }, { "epoch": 15.606420927467301, "grad_norm": 6.226667404174805, "learning_rate": 1.0983947681331748e-05, "loss": 2.1017, "step": 420000 }, { "epoch": 15.625, "grad_norm": 7.427140712738037, "learning_rate": 1.09375e-05, "loss": 2.1131, "step": 420500 }, { "epoch": 15.643579072532699, "grad_norm": 8.942204475402832, "learning_rate": 1.0891052318668254e-05, "loss": 2.1294, "step": 421000 }, { "epoch": 15.662158145065398, "grad_norm": 7.123710632324219, "learning_rate": 1.0844604637336504e-05, "loss": 2.1207, "step": 421500 }, { "epoch": 15.680737217598097, "grad_norm": 6.2210798263549805, "learning_rate": 1.0798156956004756e-05, "loss": 2.1222, "step": 422000 }, { "epoch": 15.699316290130797, "grad_norm": 7.38429069519043, "learning_rate": 1.0751709274673009e-05, "loss": 2.114, "step": 422500 }, { "epoch": 15.717895362663496, "grad_norm": 6.752946853637695, "learning_rate": 1.070526159334126e-05, "loss": 2.1105, "step": 423000 }, { "epoch": 15.736474435196195, "grad_norm": 6.8533406257629395, "learning_rate": 1.0658813912009513e-05, "loss": 2.1263, "step": 423500 }, { "epoch": 15.755053507728894, "grad_norm": 8.36920166015625, "learning_rate": 1.0612366230677765e-05, "loss": 2.1082, "step": 424000 }, { "epoch": 15.773632580261593, "grad_norm": 6.900448799133301, "learning_rate": 1.0565918549346017e-05, "loss": 2.1245, "step": 424500 }, { "epoch": 15.792211652794293, "grad_norm": 7.180041313171387, "learning_rate": 1.051947086801427e-05, "loss": 2.1163, "step": 425000 }, { "epoch": 15.810790725326992, "grad_norm": 7.32526159286499, "learning_rate": 1.047302318668252e-05, "loss": 2.1344, "step": 425500 }, { "epoch": 15.829369797859691, "grad_norm": 7.500328540802002, "learning_rate": 1.0426575505350774e-05, "loss": 2.1127, "step": 426000 }, { "epoch": 15.84794887039239, "grad_norm": 7.36287784576416, "learning_rate": 1.0380127824019026e-05, "loss": 2.1104, "step": 426500 }, { "epoch": 15.866527942925089, "grad_norm": 7.004654884338379, "learning_rate": 1.0333680142687277e-05, "loss": 2.13, "step": 427000 }, { "epoch": 15.885107015457788, "grad_norm": 6.9634528160095215, "learning_rate": 1.0287232461355529e-05, "loss": 2.1196, "step": 427500 }, { "epoch": 15.903686087990488, "grad_norm": 7.970580101013184, "learning_rate": 1.0240784780023783e-05, "loss": 2.1144, "step": 428000 }, { "epoch": 15.922265160523187, "grad_norm": 7.777002334594727, "learning_rate": 1.0194337098692033e-05, "loss": 2.1226, "step": 428500 }, { "epoch": 15.940844233055886, "grad_norm": 6.956545352935791, "learning_rate": 1.0147889417360285e-05, "loss": 2.1353, "step": 429000 }, { "epoch": 15.959423305588585, "grad_norm": 7.85087251663208, "learning_rate": 1.010144173602854e-05, "loss": 2.1281, "step": 429500 }, { "epoch": 15.978002378121284, "grad_norm": 8.030372619628906, "learning_rate": 1.005499405469679e-05, "loss": 2.1247, "step": 430000 }, { "epoch": 15.996581450653984, "grad_norm": 7.764926433563232, "learning_rate": 1.0008546373365042e-05, "loss": 2.1291, "step": 430500 }, { "epoch": 16.015160523186683, "grad_norm": 6.365900039672852, "learning_rate": 9.962098692033294e-06, "loss": 2.098, "step": 431000 }, { "epoch": 16.033739595719382, "grad_norm": 7.203670024871826, "learning_rate": 9.915651010701546e-06, "loss": 2.1182, "step": 431500 }, { "epoch": 16.05231866825208, "grad_norm": 7.516459941864014, "learning_rate": 9.869203329369798e-06, "loss": 2.0847, "step": 432000 }, { "epoch": 16.07089774078478, "grad_norm": 6.9018235206604, "learning_rate": 9.822755648038049e-06, "loss": 2.0801, "step": 432500 }, { "epoch": 16.08947681331748, "grad_norm": 7.418632507324219, "learning_rate": 9.776307966706303e-06, "loss": 2.0981, "step": 433000 }, { "epoch": 16.108055885850177, "grad_norm": 7.646805763244629, "learning_rate": 9.729860285374555e-06, "loss": 2.1049, "step": 433500 }, { "epoch": 16.126634958382876, "grad_norm": 6.691248893737793, "learning_rate": 9.683412604042806e-06, "loss": 2.1026, "step": 434000 }, { "epoch": 16.145214030915575, "grad_norm": 8.201228141784668, "learning_rate": 9.63696492271106e-06, "loss": 2.1197, "step": 434500 }, { "epoch": 16.163793103448278, "grad_norm": 6.836193561553955, "learning_rate": 9.590517241379312e-06, "loss": 2.1048, "step": 435000 }, { "epoch": 16.182372175980976, "grad_norm": 6.607935905456543, "learning_rate": 9.544069560047562e-06, "loss": 2.0952, "step": 435500 }, { "epoch": 16.200951248513675, "grad_norm": 7.329438209533691, "learning_rate": 9.497621878715814e-06, "loss": 2.1096, "step": 436000 }, { "epoch": 16.219530321046374, "grad_norm": 7.701877117156982, "learning_rate": 9.451174197384068e-06, "loss": 2.1061, "step": 436500 }, { "epoch": 16.238109393579073, "grad_norm": 6.743167877197266, "learning_rate": 9.404726516052319e-06, "loss": 2.1151, "step": 437000 }, { "epoch": 16.256688466111772, "grad_norm": 7.008676528930664, "learning_rate": 9.35827883472057e-06, "loss": 2.1113, "step": 437500 }, { "epoch": 16.27526753864447, "grad_norm": 7.036728858947754, "learning_rate": 9.311831153388825e-06, "loss": 2.0898, "step": 438000 }, { "epoch": 16.29384661117717, "grad_norm": 7.374510765075684, "learning_rate": 9.265383472057075e-06, "loss": 2.0845, "step": 438500 }, { "epoch": 16.31242568370987, "grad_norm": 7.095835208892822, "learning_rate": 9.218935790725327e-06, "loss": 2.117, "step": 439000 }, { "epoch": 16.331004756242567, "grad_norm": 7.737233638763428, "learning_rate": 9.17248810939358e-06, "loss": 2.0987, "step": 439500 }, { "epoch": 16.349583828775266, "grad_norm": 7.745171546936035, "learning_rate": 9.126040428061832e-06, "loss": 2.0858, "step": 440000 }, { "epoch": 16.368162901307965, "grad_norm": 6.25264835357666, "learning_rate": 9.079592746730084e-06, "loss": 2.1014, "step": 440500 }, { "epoch": 16.386741973840667, "grad_norm": 8.324295043945312, "learning_rate": 9.033145065398334e-06, "loss": 2.1006, "step": 441000 }, { "epoch": 16.405321046373366, "grad_norm": 7.7967352867126465, "learning_rate": 8.986697384066588e-06, "loss": 2.0758, "step": 441500 }, { "epoch": 16.423900118906065, "grad_norm": 7.272579193115234, "learning_rate": 8.94024970273484e-06, "loss": 2.1062, "step": 442000 }, { "epoch": 16.442479191438764, "grad_norm": 7.0281195640563965, "learning_rate": 8.893802021403091e-06, "loss": 2.0979, "step": 442500 }, { "epoch": 16.461058263971463, "grad_norm": 7.969797611236572, "learning_rate": 8.847354340071345e-06, "loss": 2.0782, "step": 443000 }, { "epoch": 16.47963733650416, "grad_norm": 7.431224822998047, "learning_rate": 8.800906658739597e-06, "loss": 2.0986, "step": 443500 }, { "epoch": 16.49821640903686, "grad_norm": 7.004672050476074, "learning_rate": 8.754458977407848e-06, "loss": 2.1162, "step": 444000 }, { "epoch": 16.51679548156956, "grad_norm": 7.328388214111328, "learning_rate": 8.7080112960761e-06, "loss": 2.0903, "step": 444500 }, { "epoch": 16.53537455410226, "grad_norm": 7.997599124908447, "learning_rate": 8.661563614744354e-06, "loss": 2.0926, "step": 445000 }, { "epoch": 16.553953626634957, "grad_norm": 6.598504066467285, "learning_rate": 8.615115933412604e-06, "loss": 2.103, "step": 445500 }, { "epoch": 16.572532699167656, "grad_norm": 8.041633605957031, "learning_rate": 8.568668252080856e-06, "loss": 2.0902, "step": 446000 }, { "epoch": 16.591111771700355, "grad_norm": 6.456114768981934, "learning_rate": 8.522220570749109e-06, "loss": 2.1113, "step": 446500 }, { "epoch": 16.609690844233057, "grad_norm": 8.524587631225586, "learning_rate": 8.47577288941736e-06, "loss": 2.1034, "step": 447000 }, { "epoch": 16.628269916765756, "grad_norm": 7.4559102058410645, "learning_rate": 8.429325208085613e-06, "loss": 2.0911, "step": 447500 }, { "epoch": 16.646848989298455, "grad_norm": 7.678273677825928, "learning_rate": 8.382877526753865e-06, "loss": 2.1009, "step": 448000 }, { "epoch": 16.665428061831154, "grad_norm": 6.468957424163818, "learning_rate": 8.336429845422117e-06, "loss": 2.104, "step": 448500 }, { "epoch": 16.684007134363853, "grad_norm": 7.746886730194092, "learning_rate": 8.28998216409037e-06, "loss": 2.0961, "step": 449000 }, { "epoch": 16.70258620689655, "grad_norm": 6.837747097015381, "learning_rate": 8.24353448275862e-06, "loss": 2.0867, "step": 449500 }, { "epoch": 16.72116527942925, "grad_norm": 7.098623275756836, "learning_rate": 8.197086801426874e-06, "loss": 2.1093, "step": 450000 }, { "epoch": 16.73974435196195, "grad_norm": 6.478063106536865, "learning_rate": 8.150639120095126e-06, "loss": 2.0933, "step": 450500 }, { "epoch": 16.758323424494648, "grad_norm": 7.2032012939453125, "learning_rate": 8.104191438763376e-06, "loss": 2.0801, "step": 451000 }, { "epoch": 16.776902497027347, "grad_norm": 6.382145881652832, "learning_rate": 8.057743757431629e-06, "loss": 2.1063, "step": 451500 }, { "epoch": 16.795481569560046, "grad_norm": 7.381346702575684, "learning_rate": 8.011296076099883e-06, "loss": 2.0992, "step": 452000 }, { "epoch": 16.81406064209275, "grad_norm": 6.544224739074707, "learning_rate": 7.964848394768133e-06, "loss": 2.0993, "step": 452500 }, { "epoch": 16.832639714625447, "grad_norm": 7.141576290130615, "learning_rate": 7.918400713436385e-06, "loss": 2.0919, "step": 453000 }, { "epoch": 16.851218787158146, "grad_norm": 6.41404914855957, "learning_rate": 7.871953032104639e-06, "loss": 2.0961, "step": 453500 }, { "epoch": 16.869797859690845, "grad_norm": 7.792717933654785, "learning_rate": 7.82550535077289e-06, "loss": 2.0875, "step": 454000 }, { "epoch": 16.888376932223544, "grad_norm": 8.0609130859375, "learning_rate": 7.779057669441142e-06, "loss": 2.0905, "step": 454500 }, { "epoch": 16.906956004756243, "grad_norm": 7.00869083404541, "learning_rate": 7.732609988109394e-06, "loss": 2.0877, "step": 455000 }, { "epoch": 16.92553507728894, "grad_norm": 7.780180931091309, "learning_rate": 7.686162306777646e-06, "loss": 2.0956, "step": 455500 }, { "epoch": 16.94411414982164, "grad_norm": 7.056099891662598, "learning_rate": 7.639714625445898e-06, "loss": 2.0762, "step": 456000 }, { "epoch": 16.96269322235434, "grad_norm": 6.861847877502441, "learning_rate": 7.59326694411415e-06, "loss": 2.0946, "step": 456500 }, { "epoch": 16.981272294887038, "grad_norm": 7.449362754821777, "learning_rate": 7.546819262782402e-06, "loss": 2.1104, "step": 457000 }, { "epoch": 16.999851367419737, "grad_norm": 7.2395782470703125, "learning_rate": 7.500371581450655e-06, "loss": 2.1044, "step": 457500 }, { "epoch": 17.01843043995244, "grad_norm": 7.514138221740723, "learning_rate": 7.453923900118906e-06, "loss": 2.0918, "step": 458000 }, { "epoch": 17.037009512485138, "grad_norm": 6.817535877227783, "learning_rate": 7.4074762187871585e-06, "loss": 2.1078, "step": 458500 }, { "epoch": 17.055588585017837, "grad_norm": 7.827926158905029, "learning_rate": 7.3610285374554115e-06, "loss": 2.0747, "step": 459000 }, { "epoch": 17.074167657550536, "grad_norm": 9.247724533081055, "learning_rate": 7.314580856123662e-06, "loss": 2.0964, "step": 459500 }, { "epoch": 17.092746730083235, "grad_norm": 8.57845687866211, "learning_rate": 7.268133174791915e-06, "loss": 2.0876, "step": 460000 }, { "epoch": 17.111325802615934, "grad_norm": 7.123178482055664, "learning_rate": 7.221685493460167e-06, "loss": 2.0672, "step": 460500 }, { "epoch": 17.129904875148632, "grad_norm": 7.820250034332275, "learning_rate": 7.1752378121284185e-06, "loss": 2.0708, "step": 461000 }, { "epoch": 17.14848394768133, "grad_norm": 7.021051406860352, "learning_rate": 7.1287901307966716e-06, "loss": 2.0801, "step": 461500 }, { "epoch": 17.16706302021403, "grad_norm": 8.586702346801758, "learning_rate": 7.082342449464922e-06, "loss": 2.0749, "step": 462000 }, { "epoch": 17.18564209274673, "grad_norm": 6.818421363830566, "learning_rate": 7.035894768133175e-06, "loss": 2.0916, "step": 462500 }, { "epoch": 17.204221165279428, "grad_norm": 7.275014877319336, "learning_rate": 6.989447086801427e-06, "loss": 2.0739, "step": 463000 }, { "epoch": 17.222800237812127, "grad_norm": 6.750241756439209, "learning_rate": 6.942999405469679e-06, "loss": 2.0852, "step": 463500 }, { "epoch": 17.24137931034483, "grad_norm": 7.445390701293945, "learning_rate": 6.896551724137932e-06, "loss": 2.0748, "step": 464000 }, { "epoch": 17.259958382877528, "grad_norm": 8.087651252746582, "learning_rate": 6.850104042806184e-06, "loss": 2.0588, "step": 464500 }, { "epoch": 17.278537455410227, "grad_norm": 7.12742805480957, "learning_rate": 6.803656361474435e-06, "loss": 2.074, "step": 465000 }, { "epoch": 17.297116527942926, "grad_norm": 7.231345176696777, "learning_rate": 6.757208680142687e-06, "loss": 2.0682, "step": 465500 }, { "epoch": 17.315695600475625, "grad_norm": 7.275602340698242, "learning_rate": 6.71076099881094e-06, "loss": 2.0813, "step": 466000 }, { "epoch": 17.334274673008323, "grad_norm": 7.546669006347656, "learning_rate": 6.664313317479192e-06, "loss": 2.0839, "step": 466500 }, { "epoch": 17.352853745541022, "grad_norm": 7.166531085968018, "learning_rate": 6.617865636147444e-06, "loss": 2.064, "step": 467000 }, { "epoch": 17.37143281807372, "grad_norm": 8.803594589233398, "learning_rate": 6.571417954815696e-06, "loss": 2.0847, "step": 467500 }, { "epoch": 17.39001189060642, "grad_norm": 7.301925182342529, "learning_rate": 6.5249702734839475e-06, "loss": 2.0938, "step": 468000 }, { "epoch": 17.40859096313912, "grad_norm": 7.235419273376465, "learning_rate": 6.4785225921522005e-06, "loss": 2.0824, "step": 468500 }, { "epoch": 17.427170035671818, "grad_norm": 9.021172523498535, "learning_rate": 6.432074910820453e-06, "loss": 2.0924, "step": 469000 }, { "epoch": 17.445749108204517, "grad_norm": 7.037625789642334, "learning_rate": 6.385627229488704e-06, "loss": 2.0872, "step": 469500 }, { "epoch": 17.46432818073722, "grad_norm": 8.20162296295166, "learning_rate": 6.339179548156956e-06, "loss": 2.0755, "step": 470000 }, { "epoch": 17.482907253269918, "grad_norm": 7.615068435668945, "learning_rate": 6.2927318668252075e-06, "loss": 2.0826, "step": 470500 }, { "epoch": 17.501486325802617, "grad_norm": 7.641859531402588, "learning_rate": 6.2462841854934606e-06, "loss": 2.087, "step": 471000 }, { "epoch": 17.520065398335316, "grad_norm": 6.667306900024414, "learning_rate": 6.199836504161712e-06, "loss": 2.0842, "step": 471500 }, { "epoch": 17.538644470868014, "grad_norm": 6.990174770355225, "learning_rate": 6.153388822829965e-06, "loss": 2.0861, "step": 472000 }, { "epoch": 17.557223543400713, "grad_norm": 7.540374755859375, "learning_rate": 6.106941141498216e-06, "loss": 2.078, "step": 472500 }, { "epoch": 17.575802615933412, "grad_norm": 6.960676670074463, "learning_rate": 6.0604934601664685e-06, "loss": 2.0877, "step": 473000 }, { "epoch": 17.59438168846611, "grad_norm": 8.197839736938477, "learning_rate": 6.0140457788347215e-06, "loss": 2.0718, "step": 473500 }, { "epoch": 17.61296076099881, "grad_norm": 7.723132610321045, "learning_rate": 5.967598097502973e-06, "loss": 2.0793, "step": 474000 }, { "epoch": 17.63153983353151, "grad_norm": 6.541485786437988, "learning_rate": 5.921150416171225e-06, "loss": 2.0754, "step": 474500 }, { "epoch": 17.650118906064208, "grad_norm": 7.376631736755371, "learning_rate": 5.874702734839476e-06, "loss": 2.0792, "step": 475000 }, { "epoch": 17.66869797859691, "grad_norm": 6.127633094787598, "learning_rate": 5.828255053507729e-06, "loss": 2.0492, "step": 475500 }, { "epoch": 17.68727705112961, "grad_norm": 7.734124183654785, "learning_rate": 5.781807372175982e-06, "loss": 2.0748, "step": 476000 }, { "epoch": 17.705856123662308, "grad_norm": 6.9572601318359375, "learning_rate": 5.735359690844233e-06, "loss": 2.0992, "step": 476500 }, { "epoch": 17.724435196195007, "grad_norm": 6.885385513305664, "learning_rate": 5.688912009512486e-06, "loss": 2.0771, "step": 477000 }, { "epoch": 17.743014268727705, "grad_norm": 7.826180458068848, "learning_rate": 5.642464328180737e-06, "loss": 2.081, "step": 477500 }, { "epoch": 17.761593341260404, "grad_norm": 7.1644439697265625, "learning_rate": 5.5960166468489895e-06, "loss": 2.0847, "step": 478000 }, { "epoch": 17.780172413793103, "grad_norm": 8.081832885742188, "learning_rate": 5.549568965517242e-06, "loss": 2.072, "step": 478500 }, { "epoch": 17.798751486325802, "grad_norm": 6.1492600440979, "learning_rate": 5.503121284185494e-06, "loss": 2.0919, "step": 479000 }, { "epoch": 17.8173305588585, "grad_norm": 6.837408542633057, "learning_rate": 5.456673602853746e-06, "loss": 2.0745, "step": 479500 }, { "epoch": 17.8359096313912, "grad_norm": 6.619295120239258, "learning_rate": 5.410225921521997e-06, "loss": 2.0758, "step": 480000 }, { "epoch": 17.8544887039239, "grad_norm": 7.465627193450928, "learning_rate": 5.3637782401902504e-06, "loss": 2.0705, "step": 480500 }, { "epoch": 17.8730677764566, "grad_norm": 7.469555854797363, "learning_rate": 5.317330558858502e-06, "loss": 2.0675, "step": 481000 }, { "epoch": 17.8916468489893, "grad_norm": 7.39703893661499, "learning_rate": 5.270882877526754e-06, "loss": 2.0869, "step": 481500 }, { "epoch": 17.910225921522, "grad_norm": 6.684396743774414, "learning_rate": 5.224435196195006e-06, "loss": 2.0782, "step": 482000 }, { "epoch": 17.928804994054698, "grad_norm": 8.273653984069824, "learning_rate": 5.177987514863258e-06, "loss": 2.0792, "step": 482500 }, { "epoch": 17.947384066587396, "grad_norm": 7.827981472015381, "learning_rate": 5.1315398335315105e-06, "loss": 2.0862, "step": 483000 }, { "epoch": 17.965963139120095, "grad_norm": 7.737405300140381, "learning_rate": 5.085092152199762e-06, "loss": 2.0632, "step": 483500 }, { "epoch": 17.984542211652794, "grad_norm": 7.617379665374756, "learning_rate": 5.038644470868015e-06, "loss": 2.1037, "step": 484000 }, { "epoch": 18.003121284185493, "grad_norm": 7.147322177886963, "learning_rate": 4.992196789536266e-06, "loss": 2.0701, "step": 484500 }, { "epoch": 18.021700356718192, "grad_norm": 6.316223621368408, "learning_rate": 4.945749108204518e-06, "loss": 2.0536, "step": 485000 }, { "epoch": 18.04027942925089, "grad_norm": 7.639254093170166, "learning_rate": 4.8993014268727714e-06, "loss": 2.0594, "step": 485500 }, { "epoch": 18.05885850178359, "grad_norm": 7.149983882904053, "learning_rate": 4.852853745541023e-06, "loss": 2.0546, "step": 486000 }, { "epoch": 18.07743757431629, "grad_norm": 7.123045921325684, "learning_rate": 4.806406064209275e-06, "loss": 2.0819, "step": 486500 }, { "epoch": 18.09601664684899, "grad_norm": 7.2495293617248535, "learning_rate": 4.759958382877526e-06, "loss": 2.0641, "step": 487000 }, { "epoch": 18.11459571938169, "grad_norm": 7.309257507324219, "learning_rate": 4.713510701545779e-06, "loss": 2.0664, "step": 487500 }, { "epoch": 18.13317479191439, "grad_norm": 6.188238620758057, "learning_rate": 4.6670630202140315e-06, "loss": 2.0801, "step": 488000 }, { "epoch": 18.151753864447087, "grad_norm": 7.894054889678955, "learning_rate": 4.620615338882283e-06, "loss": 2.0576, "step": 488500 }, { "epoch": 18.170332936979786, "grad_norm": 7.271005153656006, "learning_rate": 4.574167657550536e-06, "loss": 2.0477, "step": 489000 }, { "epoch": 18.188912009512485, "grad_norm": 7.505859851837158, "learning_rate": 4.527719976218787e-06, "loss": 2.0726, "step": 489500 }, { "epoch": 18.207491082045184, "grad_norm": 7.29171085357666, "learning_rate": 4.4812722948870394e-06, "loss": 2.0641, "step": 490000 }, { "epoch": 18.226070154577883, "grad_norm": 7.959132671356201, "learning_rate": 4.434824613555292e-06, "loss": 2.0704, "step": 490500 }, { "epoch": 18.244649227110582, "grad_norm": 6.843657493591309, "learning_rate": 4.388376932223544e-06, "loss": 2.0695, "step": 491000 }, { "epoch": 18.26322829964328, "grad_norm": 6.887396812438965, "learning_rate": 4.341929250891796e-06, "loss": 2.0622, "step": 491500 }, { "epoch": 18.28180737217598, "grad_norm": 7.143764019012451, "learning_rate": 4.295481569560047e-06, "loss": 2.0729, "step": 492000 }, { "epoch": 18.30038644470868, "grad_norm": 7.79412841796875, "learning_rate": 4.2490338882283e-06, "loss": 2.054, "step": 492500 }, { "epoch": 18.31896551724138, "grad_norm": 7.328498363494873, "learning_rate": 4.202586206896552e-06, "loss": 2.0582, "step": 493000 }, { "epoch": 18.33754458977408, "grad_norm": 6.897115230560303, "learning_rate": 4.156138525564804e-06, "loss": 2.0548, "step": 493500 }, { "epoch": 18.35612366230678, "grad_norm": 7.248096942901611, "learning_rate": 4.109690844233056e-06, "loss": 2.0668, "step": 494000 }, { "epoch": 18.374702734839477, "grad_norm": 8.02023983001709, "learning_rate": 4.063243162901308e-06, "loss": 2.0527, "step": 494500 }, { "epoch": 18.393281807372176, "grad_norm": 7.436459541320801, "learning_rate": 4.0167954815695605e-06, "loss": 2.0685, "step": 495000 }, { "epoch": 18.411860879904875, "grad_norm": 7.74916410446167, "learning_rate": 3.970347800237812e-06, "loss": 2.0709, "step": 495500 }, { "epoch": 18.430439952437574, "grad_norm": 8.027193069458008, "learning_rate": 3.923900118906065e-06, "loss": 2.0614, "step": 496000 }, { "epoch": 18.449019024970273, "grad_norm": 6.885724067687988, "learning_rate": 3.877452437574316e-06, "loss": 2.0519, "step": 496500 }, { "epoch": 18.46759809750297, "grad_norm": 7.010785102844238, "learning_rate": 3.831004756242568e-06, "loss": 2.068, "step": 497000 }, { "epoch": 18.48617717003567, "grad_norm": 6.670163631439209, "learning_rate": 3.78455707491082e-06, "loss": 2.0695, "step": 497500 }, { "epoch": 18.50475624256837, "grad_norm": 7.944169998168945, "learning_rate": 3.7381093935790727e-06, "loss": 2.0723, "step": 498000 }, { "epoch": 18.523335315101072, "grad_norm": 5.955043792724609, "learning_rate": 3.691661712247325e-06, "loss": 2.0456, "step": 498500 }, { "epoch": 18.54191438763377, "grad_norm": 7.109121322631836, "learning_rate": 3.6452140309155767e-06, "loss": 2.0612, "step": 499000 }, { "epoch": 18.56049346016647, "grad_norm": 6.318941593170166, "learning_rate": 3.5987663495838293e-06, "loss": 2.0702, "step": 499500 }, { "epoch": 18.57907253269917, "grad_norm": 8.715611457824707, "learning_rate": 3.552318668252081e-06, "loss": 2.08, "step": 500000 }, { "epoch": 18.597651605231867, "grad_norm": 9.135302543640137, "learning_rate": 3.505870986920333e-06, "loss": 2.0762, "step": 500500 }, { "epoch": 18.616230677764566, "grad_norm": 7.45161247253418, "learning_rate": 3.4594233055885854e-06, "loss": 2.0533, "step": 501000 }, { "epoch": 18.634809750297265, "grad_norm": 8.179544448852539, "learning_rate": 3.412975624256837e-06, "loss": 2.0548, "step": 501500 }, { "epoch": 18.653388822829964, "grad_norm": 9.695868492126465, "learning_rate": 3.3665279429250894e-06, "loss": 2.0502, "step": 502000 }, { "epoch": 18.671967895362663, "grad_norm": 9.127315521240234, "learning_rate": 3.320080261593341e-06, "loss": 2.0559, "step": 502500 }, { "epoch": 18.69054696789536, "grad_norm": 7.58563232421875, "learning_rate": 3.2736325802615937e-06, "loss": 2.0618, "step": 503000 }, { "epoch": 18.70912604042806, "grad_norm": 6.781043529510498, "learning_rate": 3.2271848989298455e-06, "loss": 2.0654, "step": 503500 }, { "epoch": 18.727705112960763, "grad_norm": 7.929651737213135, "learning_rate": 3.1807372175980973e-06, "loss": 2.0493, "step": 504000 }, { "epoch": 18.74628418549346, "grad_norm": 7.395569324493408, "learning_rate": 3.13428953626635e-06, "loss": 2.042, "step": 504500 }, { "epoch": 18.76486325802616, "grad_norm": 8.050883293151855, "learning_rate": 3.087841854934602e-06, "loss": 2.0625, "step": 505000 }, { "epoch": 18.78344233055886, "grad_norm": 8.531946182250977, "learning_rate": 3.041394173602854e-06, "loss": 2.0636, "step": 505500 }, { "epoch": 18.80202140309156, "grad_norm": 7.74788236618042, "learning_rate": 2.994946492271106e-06, "loss": 2.0758, "step": 506000 }, { "epoch": 18.820600475624257, "grad_norm": 7.532721996307373, "learning_rate": 2.9484988109393578e-06, "loss": 2.0559, "step": 506500 }, { "epoch": 18.839179548156956, "grad_norm": 6.848814487457275, "learning_rate": 2.90205112960761e-06, "loss": 2.0552, "step": 507000 }, { "epoch": 18.857758620689655, "grad_norm": 7.606546401977539, "learning_rate": 2.855603448275862e-06, "loss": 2.0528, "step": 507500 }, { "epoch": 18.876337693222354, "grad_norm": 7.560408592224121, "learning_rate": 2.8091557669441143e-06, "loss": 2.0529, "step": 508000 }, { "epoch": 18.894916765755053, "grad_norm": 8.788424491882324, "learning_rate": 2.7627080856123665e-06, "loss": 2.0564, "step": 508500 }, { "epoch": 18.91349583828775, "grad_norm": 6.34813928604126, "learning_rate": 2.7162604042806183e-06, "loss": 2.0517, "step": 509000 }, { "epoch": 18.93207491082045, "grad_norm": 7.5938005447387695, "learning_rate": 2.6698127229488705e-06, "loss": 2.0615, "step": 509500 }, { "epoch": 18.950653983353153, "grad_norm": 7.773651123046875, "learning_rate": 2.6233650416171222e-06, "loss": 2.0572, "step": 510000 }, { "epoch": 18.96923305588585, "grad_norm": 6.474369049072266, "learning_rate": 2.576917360285375e-06, "loss": 2.0563, "step": 510500 }, { "epoch": 18.98781212841855, "grad_norm": 7.805785179138184, "learning_rate": 2.530469678953627e-06, "loss": 2.0697, "step": 511000 }, { "epoch": 19.00639120095125, "grad_norm": 6.911838054656982, "learning_rate": 2.484021997621879e-06, "loss": 2.066, "step": 511500 }, { "epoch": 19.024970273483948, "grad_norm": 7.869637966156006, "learning_rate": 2.437574316290131e-06, "loss": 2.0383, "step": 512000 }, { "epoch": 19.043549346016647, "grad_norm": 8.383206367492676, "learning_rate": 2.3911266349583828e-06, "loss": 2.0579, "step": 512500 }, { "epoch": 19.062128418549346, "grad_norm": 8.408047676086426, "learning_rate": 2.344678953626635e-06, "loss": 2.0651, "step": 513000 }, { "epoch": 19.080707491082045, "grad_norm": 7.509279251098633, "learning_rate": 2.298231272294887e-06, "loss": 2.0762, "step": 513500 }, { "epoch": 19.099286563614744, "grad_norm": 7.116653919219971, "learning_rate": 2.2517835909631393e-06, "loss": 2.05, "step": 514000 }, { "epoch": 19.117865636147442, "grad_norm": 6.725174427032471, "learning_rate": 2.2053359096313915e-06, "loss": 2.0654, "step": 514500 }, { "epoch": 19.13644470868014, "grad_norm": 6.7676544189453125, "learning_rate": 2.1588882282996433e-06, "loss": 2.0608, "step": 515000 }, { "epoch": 19.15502378121284, "grad_norm": 7.207517147064209, "learning_rate": 2.1124405469678954e-06, "loss": 2.06, "step": 515500 }, { "epoch": 19.173602853745543, "grad_norm": 7.989516735076904, "learning_rate": 2.065992865636147e-06, "loss": 2.034, "step": 516000 }, { "epoch": 19.19218192627824, "grad_norm": 7.003942489624023, "learning_rate": 2.0195451843044e-06, "loss": 2.0518, "step": 516500 }, { "epoch": 19.21076099881094, "grad_norm": 6.7362236976623535, "learning_rate": 1.973097502972652e-06, "loss": 2.0609, "step": 517000 }, { "epoch": 19.22934007134364, "grad_norm": 6.881633758544922, "learning_rate": 1.9266498216409038e-06, "loss": 2.0541, "step": 517500 }, { "epoch": 19.247919143876338, "grad_norm": 7.07053279876709, "learning_rate": 1.880202140309156e-06, "loss": 2.037, "step": 518000 }, { "epoch": 19.266498216409037, "grad_norm": 7.328449249267578, "learning_rate": 1.833754458977408e-06, "loss": 2.0379, "step": 518500 }, { "epoch": 19.285077288941736, "grad_norm": 7.447302341461182, "learning_rate": 1.7873067776456601e-06, "loss": 2.0461, "step": 519000 }, { "epoch": 19.303656361474435, "grad_norm": 6.588597774505615, "learning_rate": 1.7408590963139119e-06, "loss": 2.0395, "step": 519500 }, { "epoch": 19.322235434007133, "grad_norm": 6.768307685852051, "learning_rate": 1.694411414982164e-06, "loss": 2.0747, "step": 520000 }, { "epoch": 19.340814506539832, "grad_norm": 7.688553810119629, "learning_rate": 1.6479637336504165e-06, "loss": 2.0446, "step": 520500 }, { "epoch": 19.35939357907253, "grad_norm": 8.146514892578125, "learning_rate": 1.6015160523186682e-06, "loss": 2.0389, "step": 521000 }, { "epoch": 19.377972651605234, "grad_norm": 8.384140968322754, "learning_rate": 1.5550683709869204e-06, "loss": 2.0367, "step": 521500 }, { "epoch": 19.396551724137932, "grad_norm": 7.63324499130249, "learning_rate": 1.5086206896551726e-06, "loss": 2.0461, "step": 522000 }, { "epoch": 19.41513079667063, "grad_norm": 6.784617900848389, "learning_rate": 1.4621730083234246e-06, "loss": 2.0453, "step": 522500 }, { "epoch": 19.43370986920333, "grad_norm": 6.640545845031738, "learning_rate": 1.4157253269916766e-06, "loss": 2.0388, "step": 523000 }, { "epoch": 19.45228894173603, "grad_norm": 6.723217487335205, "learning_rate": 1.3692776456599287e-06, "loss": 2.0414, "step": 523500 }, { "epoch": 19.470868014268728, "grad_norm": 6.989643573760986, "learning_rate": 1.3228299643281807e-06, "loss": 2.055, "step": 524000 }, { "epoch": 19.489447086801427, "grad_norm": 6.394150257110596, "learning_rate": 1.276382282996433e-06, "loss": 2.0539, "step": 524500 }, { "epoch": 19.508026159334126, "grad_norm": 7.73260498046875, "learning_rate": 1.229934601664685e-06, "loss": 2.0549, "step": 525000 }, { "epoch": 19.526605231866824, "grad_norm": 7.458393096923828, "learning_rate": 1.183486920332937e-06, "loss": 2.0486, "step": 525500 }, { "epoch": 19.545184304399523, "grad_norm": 7.173522472381592, "learning_rate": 1.137039239001189e-06, "loss": 2.0644, "step": 526000 }, { "epoch": 19.563763376932222, "grad_norm": 7.556340217590332, "learning_rate": 1.0905915576694412e-06, "loss": 2.0456, "step": 526500 }, { "epoch": 19.582342449464925, "grad_norm": 8.111367225646973, "learning_rate": 1.0441438763376932e-06, "loss": 2.0397, "step": 527000 }, { "epoch": 19.600921521997623, "grad_norm": 6.623202323913574, "learning_rate": 9.976961950059454e-07, "loss": 2.0488, "step": 527500 }, { "epoch": 19.619500594530322, "grad_norm": 7.327664375305176, "learning_rate": 9.512485136741974e-07, "loss": 2.037, "step": 528000 }, { "epoch": 19.63807966706302, "grad_norm": 7.518941879272461, "learning_rate": 9.048008323424495e-07, "loss": 2.0694, "step": 528500 }, { "epoch": 19.65665873959572, "grad_norm": 7.00496244430542, "learning_rate": 8.583531510107016e-07, "loss": 2.0459, "step": 529000 }, { "epoch": 19.67523781212842, "grad_norm": 7.160311222076416, "learning_rate": 8.119054696789537e-07, "loss": 2.029, "step": 529500 }, { "epoch": 19.693816884661118, "grad_norm": 7.951440811157227, "learning_rate": 7.654577883472057e-07, "loss": 2.0529, "step": 530000 }, { "epoch": 19.712395957193817, "grad_norm": 7.71318244934082, "learning_rate": 7.190101070154579e-07, "loss": 2.057, "step": 530500 }, { "epoch": 19.730975029726515, "grad_norm": 7.5362043380737305, "learning_rate": 6.7256242568371e-07, "loss": 2.0493, "step": 531000 }, { "epoch": 19.749554102259214, "grad_norm": 8.362653732299805, "learning_rate": 6.261147443519619e-07, "loss": 2.0467, "step": 531500 }, { "epoch": 19.768133174791913, "grad_norm": 7.16049337387085, "learning_rate": 5.79667063020214e-07, "loss": 2.0346, "step": 532000 }, { "epoch": 19.786712247324612, "grad_norm": 7.634875297546387, "learning_rate": 5.332193816884662e-07, "loss": 2.035, "step": 532500 }, { "epoch": 19.805291319857314, "grad_norm": 7.416409015655518, "learning_rate": 4.867717003567182e-07, "loss": 2.0513, "step": 533000 }, { "epoch": 19.823870392390013, "grad_norm": 6.575763702392578, "learning_rate": 4.4032401902497025e-07, "loss": 2.0582, "step": 533500 }, { "epoch": 19.842449464922712, "grad_norm": 7.2025909423828125, "learning_rate": 3.938763376932224e-07, "loss": 2.0534, "step": 534000 }, { "epoch": 19.86102853745541, "grad_norm": 7.560851573944092, "learning_rate": 3.4742865636147446e-07, "loss": 2.0566, "step": 534500 }, { "epoch": 19.87960760998811, "grad_norm": 7.525179386138916, "learning_rate": 3.0098097502972654e-07, "loss": 2.0374, "step": 535000 }, { "epoch": 19.89818668252081, "grad_norm": 6.616377830505371, "learning_rate": 2.5453329369797857e-07, "loss": 2.0483, "step": 535500 }, { "epoch": 19.916765755053508, "grad_norm": 7.127399444580078, "learning_rate": 2.0808561236623068e-07, "loss": 2.0612, "step": 536000 }, { "epoch": 19.935344827586206, "grad_norm": 8.101058959960938, "learning_rate": 1.6163793103448276e-07, "loss": 2.046, "step": 536500 }, { "epoch": 19.953923900118905, "grad_norm": 7.135190010070801, "learning_rate": 1.1519024970273484e-07, "loss": 2.0619, "step": 537000 }, { "epoch": 19.972502972651604, "grad_norm": 7.481634616851807, "learning_rate": 6.874256837098692e-08, "loss": 2.0529, "step": 537500 }, { "epoch": 19.991082045184303, "grad_norm": 7.486691474914551, "learning_rate": 2.2294887039239002e-08, "loss": 2.0536, "step": 538000 } ], "logging_steps": 500, "max_steps": 538240, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.258410922006282e+17, "train_batch_size": 46, "trial_name": null, "trial_params": null }